/*
 * CPU detection code, extracted from mmx.h
 * (c)1997-99 by H. Dietz and R. Fisher
 * Converted to C and improved by Fabrice Bellard.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci#include <stdlib.h>
24cabdff1aSopenharmony_ci#include <string.h>
25cabdff1aSopenharmony_ci
26cabdff1aSopenharmony_ci#include "libavutil/x86/asm.h"
27cabdff1aSopenharmony_ci#include "libavutil/x86/cpu.h"
28cabdff1aSopenharmony_ci#include "libavutil/cpu.h"
29cabdff1aSopenharmony_ci#include "libavutil/cpu_internal.h"
30cabdff1aSopenharmony_ci
/*
 * Low-level access to CPUID, XGETBV and EFLAGS.
 *
 * cpuid(index, eax, ebx, ecx, edx)
 *     Execute CPUID for leaf `index` and store the four result registers in
 *     the given lvalues. The arguments must be plain lvalues: the external
 *     assembly variant takes their addresses, the inline variant uses them
 *     as asm output operands.
 * xgetbv(index, eax, edx)
 *     Read the extended control register selected by `index`; the low half
 *     of the result goes to `eax`, the high half to `edx`.
 *
 * When neither HAVE_X86ASM nor HAVE_INLINE_ASM is set, none of these macros
 * is defined; callers test `#ifdef cpuid` to detect that case.
 */
#if HAVE_X86ASM

/* Forward to the external assembly helpers, passing result locations by
 * address. */
#define cpuid(index, eax, ebx, ecx, edx)        \
    ff_cpu_cpuid(index, &eax, &ebx, &ecx, &edx)

#define xgetbv(index, eax, edx)                 \
    ff_cpu_xgetbv(index, &eax, &edx)

#elif HAVE_INLINE_ASM

/* ebx saving is necessary for PIC. gcc seems unable to see it alone.
 * The b register is parked in the S register before CPUID and swapped back
 * afterwards, so the asm hands CPUID's ebx result out through "=S" while
 * the b register itself ends up unchanged. ECX is preloaded with 0 (input
 * operand "2"), EAX with `index` (input operand "0"). */
#define cpuid(index, eax, ebx, ecx, edx)                        \
    __asm__ volatile (                                          \
        "mov    %%"FF_REG_b", %%"FF_REG_S" \n\t"                \
        "cpuid                       \n\t"                      \
        "xchg   %%"FF_REG_b", %%"FF_REG_S                       \
        : "=a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx)        \
        : "0" (index), "2"(0))

/* XGETBV is emitted as raw opcode bytes (0f 01 d0) rather than by mnemonic;
 * ECX carries `index`, the result is returned in EDX:EAX. */
#define xgetbv(index, eax, edx)                                 \
    __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (index))

/* Read EFLAGS into x: push the flags, pop them into a register.
 * Uses the 32-bit pushfl form; in this file it is only expanded from the
 * inline-asm cpuid_test(), which is not compiled on x86-64. */
#define get_eflags(x)                           \
    __asm__ volatile ("pushfl     \n"           \
                      "pop    %0  \n"           \
                      : "=r"(x))

/* Write x to EFLAGS: push the value, pop it into the flags register. */
#define set_eflags(x)                           \
    __asm__ volatile ("push    %0 \n"           \
                      "popfl      \n"           \
                      :: "r"(x))

#endif /* HAVE_INLINE_ASM */
64cabdff1aSopenharmony_ci
/*
 * cpuid_test(): evaluates to non-zero when the CPUID instruction may be used.
 * Which implementation is picked depends on the build configuration.
 */
#if ARCH_X86_64

/* On x86-64 builds no runtime probe is done; the test is hard-wired to 1. */
#define cpuid_test() 1

#elif HAVE_X86ASM

/* Use the external assembly implementation. */
#define cpuid_test ff_cpu_cpuid_test

#elif HAVE_INLINE_ASM

/**
 * Check if CPUID is supported by attempting to toggle the ID bit
 * (mask 0x200000) in the EFLAGS register.
 *
 * Note that the original EFLAGS value is not written back afterwards.
 *
 * @return non-zero if the value read back differs from the original flags
 *         (the bit could be toggled, so CPUID exists), 0 otherwise
 */
static int cpuid_test(void)
{
    x86_reg flags_before, flags_after;

    get_eflags(flags_before);
    set_eflags(flags_before ^ 0x200000);
    get_eflags(flags_after);

    return flags_before != flags_after;
}
#endif
88cabdff1aSopenharmony_ci
/**
 * Detect which multimedia instruction set extensions are usable on the
 * running x86 CPU.
 *
 * The function queries CPUID leaves 0, 1, 7, 0x80000000 and 0x80000001 and,
 * for AVX, XGETBV(0) to check that the OS has enabled the required register
 * state. The results are translated into AV_CPU_FLAG_* bits, then adjusted
 * with vendor/model specific "slow" hints for AMD and Intel parts.
 *
 * Detection of SSE-family, AVX, AVX2 and AVX-512 flags is additionally gated
 * at compile time by HAVE_SSE, HAVE_AVX, HAVE_AVX2, HAVE_AVX512 and
 * HAVE_AVX512ICL.
 *
 * @return bitmask of AV_CPU_FLAG_* values; 0 when the cpuid macro is not
 *         available in this build or when cpuid_test() reports that the
 *         CPUID instruction is unsupported
 */
int ff_get_cpu_flags_x86(void)
{
    int rval = 0;

#ifdef cpuid

    int eax, ebx, ecx, edx;
    int max_std_level, max_ext_level, std_caps = 0, ext_caps = 0;
    int family = 0, model = 0;
    /* The 12-byte vendor string is assembled from three CPUID registers and
     * compared as raw bytes below (it is not NUL-terminated). */
    union { int i[3]; char c[12]; } vendor;
    /* XCR0 stays 0 unless XGETBV is actually executed, so the AVX-512 state
     * check further down fails safely when AVX was not detected. */
    int xcr0_lo = 0, xcr0_hi = 0;

    if (!cpuid_test())
        return 0; /* CPUID not supported */

    /* Leaf 0: highest standard leaf in EAX, vendor string in EBX, EDX, ECX
     * (hence the i[0], i[2], i[1] argument order for ebx, ecx, edx). */
    cpuid(0, max_std_level, vendor.i[0], vendor.i[2], vendor.i[1]);

    if (max_std_level >= 1) {
        /* Leaf 1: signature in EAX, feature bits in ECX and EDX (std_caps). */
        cpuid(1, eax, ebx, ecx, std_caps);
        /* family = base family + extended family;
         * model  = base model  + (extended model << 4). */
        family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
        model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
        if (std_caps & (1 << 15))
            rval |= AV_CPU_FLAG_CMOV;
        if (std_caps & (1 << 23))
            rval |= AV_CPU_FLAG_MMX;
        /* The same EDX bit (25) that reports SSE below also turns on
         * MMXEXT, and does so even when HAVE_SSE is disabled. */
        if (std_caps & (1 << 25))
            rval |= AV_CPU_FLAG_MMXEXT;
#if HAVE_SSE
        if (std_caps & (1 << 25))
            rval |= AV_CPU_FLAG_SSE;
        if (std_caps & (1 << 26))
            rval |= AV_CPU_FLAG_SSE2;
        if (ecx & 1)
            rval |= AV_CPU_FLAG_SSE3;
        if (ecx & 0x00000200)
            rval |= AV_CPU_FLAG_SSSE3;
        if (ecx & 0x00080000)
            rval |= AV_CPU_FLAG_SSE4;
        if (ecx & 0x00100000)
            rval |= AV_CPU_FLAG_SSE42;
        if (ecx & 0x02000000)
            rval |= AV_CPU_FLAG_AESNI;
#if HAVE_AVX
        /* Check OXSAVE and AVX bits */
        if ((ecx & 0x18000000) == 0x18000000) {
            /* Check for OS support: both bits of mask 0x6 must be set in
             * the low half of XCR0. */
            xgetbv(0, xcr0_lo, xcr0_hi);
            if ((xcr0_lo & 0x6) == 0x6) {
                rval |= AV_CPU_FLAG_AVX;
                /* FMA3 is only reported together with usable AVX. */
                if (ecx & 0x00001000)
                    rval |= AV_CPU_FLAG_FMA3;
            }
        }
#endif /* HAVE_AVX */
#endif /* HAVE_SSE */
    }
    if (max_std_level >= 7) {
        /* Leaf 7: structured extended feature bits in EBX / ECX. */
        cpuid(7, eax, ebx, ecx, edx);
#if HAVE_AVX2
        /* AVX2 requires AVX to have been accepted above (incl. OS support). */
        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
            rval |= AV_CPU_FLAG_AVX2;
#if HAVE_AVX512 /* F, CD, BW, DQ, VL */
        if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
            /* All five subsets must be present at once. */
            if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000) {
                rval |= AV_CPU_FLAG_AVX512;
#if HAVE_AVX512ICL
                /* The ICL level additionally needs EBX bit 0x00200000 and
                 * the whole ECX group 0x5f42. */
                if ((ebx & 0xd0200000) == 0xd0200000 && (ecx & 0x5f42) == 0x5f42)
                    rval |= AV_CPU_FLAG_AVX512ICL;
#endif /* HAVE_AVX512ICL */
            }
        }
#endif /* HAVE_AVX512 */
#endif /* HAVE_AVX2 */
        /* BMI1/2 don't need OS support */
        if (ebx & 0x00000008) {
            rval |= AV_CPU_FLAG_BMI1;
            /* BMI2 is only reported when BMI1 is present as well. */
            if (ebx & 0x00000100)
                rval |= AV_CPU_FLAG_BMI2;
        }
    }

    /* Leaf 0x80000000: highest extended leaf in EAX. */
    cpuid(0x80000000, max_ext_level, ebx, ecx, edx);

    /* Extended leaf numbers have the top bit set, so the comparison has to
     * be done on unsigned values; the cast spells out the conversion the
     * compiler would otherwise apply implicitly. */
    if ((unsigned)max_ext_level >= 0x80000001U) {
        /* Leaf 0x80000001: extended feature bits in ECX and EDX (ext_caps). */
        cpuid(0x80000001, eax, ebx, ecx, ext_caps);
        if (ext_caps & (1U << 31))
            rval |= AV_CPU_FLAG_3DNOW;
        if (ext_caps & (1 << 30))
            rval |= AV_CPU_FLAG_3DNOWEXT;
        if (ext_caps & (1 << 23))
            rval |= AV_CPU_FLAG_MMX;
        if (ext_caps & (1 << 22))
            rval |= AV_CPU_FLAG_MMXEXT;

        if (!strncmp(vendor.c, "AuthenticAMD", 12)) {
            /* Allow for selectively disabling SSE2 functions on AMD processors
             * with SSE2 support but not SSE4a. This includes Athlon64, some
             * Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are
             * faster than SSE2 often enough to utilize this special-case flag.
             * AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this
             * case so that SSE2 is used unless explicitly disabled by checking
             * AV_CPU_FLAG_SSE2SLOW. */
            if ((rval & AV_CPU_FLAG_SSE2) && !(ecx & 0x00000040))
                rval |= AV_CPU_FLAG_SSE2SLOW;

            /* Similar to the above but for AVX functions on AMD processors.
             * This is necessary only for functions using YMM registers on
             * Bulldozer and Jaguar based CPUs as they lack 256-bit execution
             * units. SSE/AVX functions using XMM registers are always faster
             * on them.
             * AV_CPU_FLAG_AVX and AV_CPU_FLAG_AVXSLOW are both set so that AVX
             * is used unless explicitly disabled by checking
             * AV_CPU_FLAG_AVXSLOW. */
            if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX))
                rval |= AV_CPU_FLAG_AVXSLOW;

            /* Zen 3 and earlier have slow gather */
            if ((family <= 0x19) && (rval & AV_CPU_FLAG_AVX2))
                rval |= AV_CPU_FLAG_SLOW_GATHER;
        }

        /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
         * used unless the OS has AVX support. */
        if (rval & AV_CPU_FLAG_AVX) {
            if (ecx & 0x00000800)
                rval |= AV_CPU_FLAG_XOP;
            if (ecx & 0x00010000)
                rval |= AV_CPU_FLAG_FMA4;
        }
    }

    if (!strncmp(vendor.c, "GenuineIntel", 12)) {
        if (family == 6 && (model == 9 || model == 13 || model == 14)) {
            /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and
             * 6/14 (core1 "yonah") theoretically support sse2, but it's
             * usually slower than mmx, so let's just pretend they don't.
             * AV_CPU_FLAG_SSE2 is disabled and AV_CPU_FLAG_SSE2SLOW is
             * enabled so that SSE2 is not used unless explicitly enabled
             * by checking AV_CPU_FLAG_SSE2SLOW. The same situation
             * applies for AV_CPU_FLAG_SSE3 and AV_CPU_FLAG_SSE3SLOW.
             *
             * The XOR clears the fast flag (known to be set by the guard)
             * and toggles the SLOW flag, which nothing earlier on the Intel
             * path sets, so it ends up set. */
            if (rval & AV_CPU_FLAG_SSE2)
                rval ^= AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE2;
            if (rval & AV_CPU_FLAG_SSE3)
                rval ^= AV_CPU_FLAG_SSE3SLOW | AV_CPU_FLAG_SSE3;
        }
        /* The Atom processor has SSSE3 support, which is useful in many cases,
         * but sometimes the SSSE3 version is slower than the SSE2 equivalent
         * on the Atom, but is generally faster on other processors supporting
         * SSSE3. This flag allows for selectively disabling certain SSSE3
         * functions on the Atom. */
        if (family == 6 && model == 28)
            rval |= AV_CPU_FLAG_ATOM;

        /* Conroe has a slow shuffle unit. Check the model number to ensure not
         * to include crippled low-end Penryns and Nehalems that lack SSE4. */
        if ((rval & AV_CPU_FLAG_SSSE3) && !(rval & AV_CPU_FLAG_SSE4) &&
            family == 6 && model < 23)
            rval |= AV_CPU_FLAG_SSSE3SLOW;

        /* Haswell has slow gather */
        if ((rval & AV_CPU_FLAG_AVX2) && family == 6 && model < 70)
            rval |= AV_CPU_FLAG_SLOW_GATHER;
    }

#endif /* cpuid */

    return rval;
}
256cabdff1aSopenharmony_ci
257cabdff1aSopenharmony_cisize_t ff_get_cpu_max_align_x86(void)
258cabdff1aSopenharmony_ci{
259cabdff1aSopenharmony_ci    int flags = av_get_cpu_flags();
260cabdff1aSopenharmony_ci
261cabdff1aSopenharmony_ci    if (flags & AV_CPU_FLAG_AVX512)
262cabdff1aSopenharmony_ci        return 64;
263cabdff1aSopenharmony_ci    if (flags & (AV_CPU_FLAG_AVX2      |
264cabdff1aSopenharmony_ci                 AV_CPU_FLAG_AVX       |
265cabdff1aSopenharmony_ci                 AV_CPU_FLAG_XOP       |
266cabdff1aSopenharmony_ci                 AV_CPU_FLAG_FMA4      |
267cabdff1aSopenharmony_ci                 AV_CPU_FLAG_FMA3      |
268cabdff1aSopenharmony_ci                 AV_CPU_FLAG_AVXSLOW))
269cabdff1aSopenharmony_ci        return 32;
270cabdff1aSopenharmony_ci    if (flags & (AV_CPU_FLAG_AESNI     |
271cabdff1aSopenharmony_ci                 AV_CPU_FLAG_SSE42     |
272cabdff1aSopenharmony_ci                 AV_CPU_FLAG_SSE4      |
273cabdff1aSopenharmony_ci                 AV_CPU_FLAG_SSSE3     |
274cabdff1aSopenharmony_ci                 AV_CPU_FLAG_SSE3      |
275cabdff1aSopenharmony_ci                 AV_CPU_FLAG_SSE2      |
276cabdff1aSopenharmony_ci                 AV_CPU_FLAG_SSE       |
277cabdff1aSopenharmony_ci                 AV_CPU_FLAG_ATOM      |
278cabdff1aSopenharmony_ci                 AV_CPU_FLAG_SSSE3SLOW |
279cabdff1aSopenharmony_ci                 AV_CPU_FLAG_SSE3SLOW  |
280cabdff1aSopenharmony_ci                 AV_CPU_FLAG_SSE2SLOW))
281cabdff1aSopenharmony_ci        return 16;
282cabdff1aSopenharmony_ci
283cabdff1aSopenharmony_ci    return 8;
284cabdff1aSopenharmony_ci}
285