1/************************************************************************** 2 * 3 * Copyright 2008 Dennis Smit 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * on the rights to use, copy, modify, merge, publish, distribute, sub 10 * license, and/or sell copies of the Software, and to permit persons to whom 11 * the Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice (including the next 14 * paragraph) shall be included in all copies or substantial portions of the 15 * Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 20 * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 21 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 22 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 23 * USE OR OTHER DEALINGS IN THE SOFTWARE. 24 * 25 **************************************************************************/ 26 27/** 28 * @file 29 * CPU feature detection. 
 *
 * @author Dennis Smit
 * @author Based on the work of Eric Anholt <anholt@FreeBSD.org>
 */

#include "pipe/p_config.h"
#include "pipe/p_compiler.h"

#include "util/u_debug.h"
#include "u_cpu_detect.h"
#include "u_math.h"
#include "c11/threads.h"

#include <stdio.h>
#include <inttypes.h>

#if defined(PIPE_ARCH_PPC)
#if defined(PIPE_OS_APPLE)
#include <sys/sysctl.h>
#else
/* Non-Apple PPC probes AltiVec by executing an instruction and catching
 * SIGILL, hence signal/setjmp. */
#include <signal.h>
#include <setjmp.h>
#endif
#endif

#if defined(PIPE_OS_BSD)
#include <sys/param.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#endif

#if defined(PIPE_OS_FREEBSD)
#if __has_include(<sys/auxv.h>)
#include <sys/auxv.h>
#define HAVE_ELF_AUX_INFO
#endif
#endif

#if defined(PIPE_OS_LINUX)
#include <signal.h>
#include <fcntl.h>
#include <elf.h>
#endif

#ifdef PIPE_OS_UNIX
#include <unistd.h>
#endif

#if defined(HAS_ANDROID_CPUFEATURES)
#include <cpu-features.h>
#endif

#if defined(PIPE_OS_WINDOWS)
#include <windows.h>
#if defined(PIPE_CC_MSVC)
#include <intrin.h>
#endif
#endif

#if defined(HAS_SCHED_H)
#include <sched.h>
#endif

/* GALLIUM_DUMP_CPU=1 makes detection print every detected capability. */
DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", false)


/* Global capability table; zeroed and filled exactly once (see
 * util_cpu_detect() at the bottom of this file). */
struct util_cpu_caps_t util_cpu_caps;

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
static int has_cpuid(void);
#endif


#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE) && !defined(PIPE_OS_BSD) && !defined(PIPE_OS_LINUX)
static jmp_buf __lv_powerpc_jmpbuf;
static volatile sig_atomic_t __lv_powerpc_canjump = 0;

/* SIGILL handler for the brute-force AltiVec/VSX probe below: if the probe
 * is armed (__lv_powerpc_canjump) we longjmp back to the probe site;
 * otherwise the signal is genuine, so re-raise it with the default action.
 */
static void
sigill_handler(int sig)
{
   if (!__lv_powerpc_canjump) {
      signal (sig, SIG_DFL);
      raise (sig);
   }

   __lv_powerpc_canjump = 0;
   longjmp(__lv_powerpc_jmpbuf, 1);
}
#endif

#if defined(PIPE_ARCH_PPC)
/**
 * Detect AltiVec and VSX support on PowerPC.
 *
 * Strategy, in order of preference:
 *  - compile-time __ALTIVEC__/__VSX__ macros,
 *  - sysctl on Apple/NetBSD/OpenBSD,
 *  - elf_aux_info()/sysctlbyname() on FreeBSD,
 *  - AT_HWCAP from /proc/self/auxv on Linux,
 *  - executing a vector instruction under a SIGILL handler elsewhere.
 *
 * Sets util_cpu_caps.has_altivec / has_vsx.
 */
static void
check_os_altivec_support(void)
{
#if defined(__ALTIVEC__)
   util_cpu_caps.has_altivec = 1;
#endif
#if defined(__VSX__)
   util_cpu_caps.has_vsx = 1;
#endif
#if defined(__ALTIVEC__) && defined(__VSX__)
/* Do nothing */
#elif defined(PIPE_OS_APPLE) || defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
#ifdef HW_VECTORUNIT
   int sels[2] = {CTL_HW, HW_VECTORUNIT};
#else
   int sels[2] = {CTL_MACHDEP, CPU_ALTIVEC};
#endif
   int has_vu = 0;
   size_t len = sizeof (has_vu);
   int err;

   err = sysctl(sels, 2, &has_vu, &len, NULL, 0);

   if (err == 0) {
      if (has_vu != 0) {
         util_cpu_caps.has_altivec = 1;
      }
   }
#elif defined(PIPE_OS_FREEBSD) /* !PIPE_OS_APPLE && !PIPE_OS_NETBSD && !PIPE_OS_OPENBSD */
   unsigned long hwcap = 0;
#ifdef HAVE_ELF_AUX_INFO
   elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
#else
   size_t len = sizeof(hwcap);
   sysctlbyname("hw.cpu_features", &hwcap, &len, NULL, 0);
#endif
   if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
      util_cpu_caps.has_altivec = 1;
   if (hwcap & PPC_FEATURE_HAS_VSX)
      util_cpu_caps.has_vsx = 1;
#elif defined(PIPE_OS_LINUX) /* !PIPE_OS_FREEBSD */
#if defined(PIPE_ARCH_PPC_64)
   Elf64_auxv_t aux;
#else
   Elf32_auxv_t aux;
#endif
   int fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
   if (fd >= 0) {
      /* Scan the aux vector for AT_HWCAP; bit 28 = AltiVec, bit 7 = VSX
       * (PPC_FEATURE_* values from the kernel ABI). */
      while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) {
         if (aux.a_type == AT_HWCAP) {
            char *env_vsx = getenv("GALLIVM_VSX");
            uint64_t hwcap = aux.a_un.a_val;
            util_cpu_caps.has_altivec = (hwcap >> 28) & 1;
            if (!env_vsx || env_vsx[0] != '0') {
               util_cpu_caps.has_vsx = (hwcap >> 7) & 1;
            }
            break;
         }
      }
      close(fd);
   }
#else /* !PIPE_OS_APPLE && !PIPE_OS_BSD && !PIPE_OS_LINUX */
   /* not on Apple/Darwin or Linux, do it the brute-force way */
   /* this is borrowed from the libmpeg2 library */
   signal(SIGILL, sigill_handler);
   if (setjmp(__lv_powerpc_jmpbuf)) {
      /* We get here via longjmp from sigill_handler: the instruction
       * faulted, so the feature is absent. */
      signal(SIGILL, SIG_DFL);
   } else {
      boolean enable_altivec = TRUE;    /* Default: enable if available, and if not overridden */
      boolean enable_vsx = TRUE;
#ifdef DEBUG
      /* Disabling Altivec code generation is not the same as disabling VSX code generation,
       * which can be done simply by passing -mattr=-vsx to the LLVM compiler; cf.
       * lp_build_create_jit_compiler_for_module().
       * If you want to disable Altivec code generation, the best place to do it is here.
       */
      char *env_control = getenv("GALLIVM_ALTIVEC");    /* 1=enable (default); 0=disable */
      if (env_control && env_control[0] == '0') {
         enable_altivec = FALSE;
      }
#endif
      /* VSX instructions can be explicitly enabled/disabled via GALLIVM_VSX=1 or 0 */
      char *env_vsx = getenv("GALLIVM_VSX");
      if (env_vsx && env_vsx[0] == '0') {
         enable_vsx = FALSE;
      }
      if (enable_altivec) {
         __lv_powerpc_canjump = 1;

         /* Execute an AltiVec instruction; SIGILL means no AltiVec. */
         __asm __volatile
            ("mtspr 256, %0\n\t"
             "vand %%v0, %%v0, %%v0"
             :
             : "r" (-1));

         util_cpu_caps.has_altivec = 1;

         if (enable_vsx) {
            /* Same trick for VSX. */
            __asm __volatile("xxland %vs0, %vs0, %vs0");
            util_cpu_caps.has_vsx = 1;
         }
         signal(SIGILL, SIG_DFL);
      } else {
         util_cpu_caps.has_altivec = 0;
      }
   }
#endif /* !PIPE_OS_APPLE && !PIPE_OS_LINUX */
}
#endif /* PIPE_ARCH_PPC */


#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64)
/**
 * Return non-zero if the CPUID instruction is available.
 *
 * On 32-bit x86 this toggles EFLAGS.ID (bit 21) and checks whether the
 * change sticks; CPUID is architecturally guaranteed on x86-64.
 */
static int has_cpuid(void)
{
#if defined(PIPE_ARCH_X86)
#if defined(PIPE_OS_GCC)
   int a, c;

   __asm __volatile
      ("pushf\n"
       "popl %0\n"
       "movl %0, %1\n"
       "xorl $0x200000, %0\n"
       "push %0\n"
       "popf\n"
       "pushf\n"
       "popl %0\n"
       : "=a" (a), "=c" (c)
       :
       : "cc");

   return a != c;
#else
   /* FIXME: assumes CPUID exists on non-GCC 32-bit builds */
   return 1;
#endif
#elif defined(PIPE_ARCH_X86_64)
   return 1;
#else
   return 0;
#endif
}


/**
 * @sa cpuid.h included in gcc-4.3 onwards.
 * @sa http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
 */
static inline void
cpuid(uint32_t ax, uint32_t *p)
{
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
   /* The xchgl dance keeps %ebx intact (it may be reserved, e.g. for PIC)
    * and returns the EBX result through %esi instead. */
   __asm __volatile (
     "xchgl %%ebx, %1\n\t"
     "cpuid\n\t"
     "xchgl %%ebx, %1"
     : "=a" (p[0]),
       "=S" (p[1]),
       "=c" (p[2]),
       "=d" (p[3])
     : "0" (ax)
   );
#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64)
   __asm __volatile (
     "cpuid\n\t"
     : "=a" (p[0]),
       "=b" (p[1]),
       "=c" (p[2]),
       "=d" (p[3])
     : "0" (ax)
   );
#elif defined(PIPE_CC_MSVC)
   __cpuid(p, ax);
#else
   /* Unknown compiler: report no features rather than guessing. */
   p[0] = 0;
   p[1] = 0;
   p[2] = 0;
   p[3] = 0;
#endif
}

/**
 * Like cpuid() but also sets the subleaf in ECX (needed by leaves such as
 * 7 and 0x8000001D).
 *
 * @sa cpuid.h included in gcc-4.4 onwards.
 * @sa http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx
 */
static inline void
cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p)
{
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
   /* Same %ebx-preserving trick as cpuid() above. */
   __asm __volatile (
     "xchgl %%ebx, %1\n\t"
     "cpuid\n\t"
     "xchgl %%ebx, %1"
     : "=a" (p[0]),
       "=S" (p[1]),
       "=c" (p[2]),
       "=d" (p[3])
     : "0" (ax), "2" (cx)
   );
#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64)
   __asm __volatile (
     "cpuid\n\t"
     : "=a" (p[0]),
       "=b" (p[1]),
       "=c" (p[2]),
       "=d" (p[3])
     : "0" (ax), "2" (cx)
   );
#elif defined(PIPE_CC_MSVC)
   __cpuidex(p, ax, cx);
#else
   p[0] = 0;
   p[1] = 0;
   p[2] = 0;
   p[3] = 0;
#endif
}


/* Read XCR0 (the XSAVE feature-enable register); only valid to call when
 * CPUID reports OSXSAVE — callers check that first.  Returns 0 when the
 * compiler has no way to emit the instruction. */
static inline uint64_t xgetbv(void)
{
#if defined(PIPE_CC_GCC)
   uint32_t eax, edx;

   __asm __volatile (
     ".byte 0x0f, 0x01, 0xd0" // xgetbv isn't supported on gcc < 4.4
     : "=a"(eax),
       "=d"(edx)
     : "c"(0)
   );

   return ((uint64_t)edx << 32) | eax;
#elif defined(PIPE_CC_MSVC) && defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
   return _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
#else
   return 0;
#endif
}


#if defined(PIPE_ARCH_X86)
362PIPE_ALIGN_STACK static inline boolean sse2_has_daz(void) 363{ 364 alignas(16) struct { 365 uint32_t pad1[7]; 366 uint32_t mxcsr_mask; 367 uint32_t pad2[128-8]; 368 } fxarea; 369 370 fxarea.mxcsr_mask = 0; 371#if defined(PIPE_CC_GCC) 372 __asm __volatile ("fxsave %0" : "+m" (fxarea)); 373#elif defined(PIPE_CC_MSVC) || defined(PIPE_CC_ICL) 374 _fxsave(&fxarea); 375#else 376 fxarea.mxcsr_mask = 0; 377#endif 378 return !!(fxarea.mxcsr_mask & (1 << 6)); 379} 380#endif 381 382#endif /* X86 or X86_64 */ 383 384#if defined(PIPE_ARCH_ARM) 385static void 386check_os_arm_support(void) 387{ 388 /* 389 * On Android, the cpufeatures library is preferred way of checking 390 * CPU capabilities. However, it is not available for standalone Mesa 391 * builds, i.e. when Android build system (Android.mk-based) is not 392 * used. Because of this we cannot use PIPE_OS_ANDROID here, but rather 393 * have a separate macro that only gets enabled from respective Android.mk. 394 */ 395#if defined(__ARM_NEON) || defined(__ARM_NEON__) 396 util_cpu_caps.has_neon = 1; 397#elif defined(PIPE_OS_FREEBSD) && defined(HAVE_ELF_AUX_INFO) 398 unsigned long hwcap = 0; 399 elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); 400 if (hwcap & HWCAP_NEON) 401 util_cpu_caps.has_neon = 1; 402#elif defined(HAS_ANDROID_CPUFEATURES) 403 AndroidCpuFamily cpu_family = android_getCpuFamily(); 404 uint64_t cpu_features = android_getCpuFeatures(); 405 406 if (cpu_family == ANDROID_CPU_FAMILY_ARM) { 407 if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) 408 util_cpu_caps.has_neon = 1; 409 } 410#elif defined(PIPE_OS_LINUX) 411 Elf32_auxv_t aux; 412 int fd; 413 414 fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC); 415 if (fd >= 0) { 416 while (read(fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t)) { 417 if (aux.a_type == AT_HWCAP) { 418 uint32_t hwcap = aux.a_un.a_val; 419 420 util_cpu_caps.has_neon = (hwcap >> 12) & 1; 421 break; 422 } 423 } 424 close (fd); 425 } 426#endif /* PIPE_OS_LINUX */ 427} 428 429#elif 
defined(PIPE_ARCH_AARCH64) 430static void 431check_os_arm_support(void) 432{ 433 util_cpu_caps.has_neon = true; 434} 435#endif /* PIPE_ARCH_ARM || PIPE_ARCH_AARCH64 */ 436 437#if defined(PIPE_ARCH_MIPS64) 438static void 439check_os_mips64_support(void) 440{ 441#if defined(PIPE_OS_LINUX) 442 Elf64_auxv_t aux; 443 int fd; 444 445 fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC); 446 if (fd >= 0) { 447 while (read(fd, &aux, sizeof(Elf64_auxv_t)) == sizeof(Elf64_auxv_t)) { 448 if (aux.a_type == AT_HWCAP) { 449 uint64_t hwcap = aux.a_un.a_val; 450 451 util_cpu_caps.has_msa = (hwcap >> 1) & 1; 452 break; 453 } 454 } 455 close (fd); 456 } 457#endif /* PIPE_OS_LINUX */ 458} 459#endif /* PIPE_ARCH_MIPS64 */ 460 461 462static void 463get_cpu_topology(void) 464{ 465 /* Default. This is OK if L3 is not present or there is only one. */ 466 util_cpu_caps.num_L3_caches = 1; 467 468 memset(util_cpu_caps.cpu_to_L3, 0xff, sizeof(util_cpu_caps.cpu_to_L3)); 469 470#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) 471 /* AMD Zen */ 472 if (util_cpu_caps.family >= CPU_AMD_ZEN1_ZEN2 && 473 util_cpu_caps.family < CPU_AMD_LAST) { 474 uint32_t regs[4]; 475 476 uint32_t saved_mask[UTIL_MAX_CPUS / 32] = {0}; 477 uint32_t mask[UTIL_MAX_CPUS / 32] = {0}; 478 bool saved = false; 479 480 uint32_t L3_found[UTIL_MAX_CPUS] = {0}; 481 uint32_t num_L3_caches = 0; 482 util_affinity_mask *L3_affinity_masks = NULL; 483 484 /* Query APIC IDs from each CPU core. 485 * 486 * An APIC ID is a logical ID of the CPU with respect to the cache 487 * hierarchy, meaning that consecutive APIC IDs are neighbours in 488 * the hierarchy, e.g. sharing the same cache. 489 * 490 * For example, CPU 0 can have APIC ID 0 and CPU 12 can have APIC ID 1, 491 * which means that both CPU 0 and 12 are next to each other. 492 * (e.g. they are 2 threads belonging to 1 SMT2 core) 493 * 494 * We need to find out which CPUs share the same L3 cache and they can 495 * be all over the place. 
496 * 497 * Querying the APIC ID can only be done by pinning the current thread 498 * to each core. The original affinity mask is saved. 499 * 500 * Loop over all possible CPUs even though some may be offline. 501 */ 502 for (int16_t i = 0; i < util_cpu_caps.max_cpus && i < UTIL_MAX_CPUS; i++) { 503 uint32_t cpu_bit = 1u << (i % 32); 504 505 mask[i / 32] = cpu_bit; 506 507 /* The assumption is that trying to bind the thread to a CPU that is 508 * offline will fail. 509 */ 510 if (util_set_current_thread_affinity(mask, 511 !saved ? saved_mask : NULL, 512 util_cpu_caps.num_cpu_mask_bits)) { 513 saved = true; 514 515 /* Query the APIC ID of the current core. */ 516 cpuid(0x00000001, regs); 517 unsigned apic_id = regs[1] >> 24; 518 519 /* Query the total core count for the CPU */ 520 uint32_t core_count = 1; 521 if (regs[3] & (1 << 28)) 522 core_count = (regs[1] >> 16) & 0xff; 523 524 core_count = util_next_power_of_two(core_count); 525 526 /* Query the L3 cache count. */ 527 cpuid_count(0x8000001D, 3, regs); 528 unsigned cache_level = (regs[0] >> 5) & 0x7; 529 unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1; 530 531 if (cache_level != 3) 532 continue; 533 534 unsigned local_core_id = apic_id & (core_count - 1); 535 unsigned phys_id = (apic_id & ~(core_count - 1)) >> util_logbase2(core_count); 536 unsigned local_l3_cache_index = local_core_id / util_next_power_of_two(cores_per_L3); 537#define L3_ID(p, i) (p << 16 | i << 1 | 1); 538 539 unsigned l3_id = L3_ID(phys_id, local_l3_cache_index); 540 int idx = -1; 541 for (unsigned c = 0; c < num_L3_caches; c++) { 542 if (L3_found[c] == l3_id) { 543 idx = c; 544 break; 545 } 546 } 547 if (idx == -1) { 548 idx = num_L3_caches; 549 L3_found[num_L3_caches++] = l3_id; 550 L3_affinity_masks = realloc(L3_affinity_masks, sizeof(util_affinity_mask) * num_L3_caches); 551 if (!L3_affinity_masks) 552 return; 553 memset(&L3_affinity_masks[num_L3_caches - 1], 0, sizeof(util_affinity_mask)); 554 } 555 util_cpu_caps.cpu_to_L3[i] = 
idx; 556 L3_affinity_masks[idx][i / 32] |= cpu_bit; 557 558 } 559 mask[i / 32] = 0; 560 } 561 562 util_cpu_caps.num_L3_caches = num_L3_caches; 563 util_cpu_caps.L3_affinity_mask = L3_affinity_masks; 564 565 if (saved) { 566 if (debug_get_option_dump_cpu()) { 567 fprintf(stderr, "CPU <-> L3 cache mapping:\n"); 568 for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) { 569 fprintf(stderr, " - L3 %u mask = ", i); 570 for (int j = util_cpu_caps.max_cpus - 1; j >= 0; j -= 32) 571 fprintf(stderr, "%08x ", util_cpu_caps.L3_affinity_mask[i][j / 32]); 572 fprintf(stderr, "\n"); 573 } 574 } 575 576 /* Restore the original affinity mask. */ 577 util_set_current_thread_affinity(saved_mask, NULL, 578 util_cpu_caps.num_cpu_mask_bits); 579 } else { 580 if (debug_get_option_dump_cpu()) 581 fprintf(stderr, "Cannot set thread affinity for any thread.\n"); 582 } 583 } 584#endif 585} 586 587static void 588util_cpu_detect_once(void) 589{ 590 int available_cpus = 0; 591 int total_cpus = 0; 592 593 memset(&util_cpu_caps, 0, sizeof util_cpu_caps); 594 595 /* Count the number of CPUs in system */ 596#if defined(PIPE_OS_WINDOWS) 597 { 598 SYSTEM_INFO system_info; 599 GetSystemInfo(&system_info); 600 available_cpus = MAX2(1, system_info.dwNumberOfProcessors); 601 } 602#elif defined(PIPE_OS_UNIX) 603# if defined(HAS_SCHED_GETAFFINITY) 604 { 605 /* sched_setaffinity() can be used to further restrict the number of 606 * CPUs on which the process can run. Use sched_getaffinity() to 607 * determine the true number of available CPUs. 608 * 609 * FIXME: The Linux manual page for sched_getaffinity describes how this 610 * simple implementation will fail with > 1024 CPUs, and we'll fall back 611 * to the _SC_NPROCESSORS_ONLN path. Support for > 1024 CPUs can be 612 * added to this path once someone has such a system for testing. 
613 */ 614 cpu_set_t affin; 615 if (sched_getaffinity(getpid(), sizeof(affin), &affin) == 0) 616 available_cpus = CPU_COUNT(&affin); 617 } 618# endif 619 620 /* Linux, FreeBSD, DragonFly, and Mac OS X should have 621 * _SC_NOPROCESSORS_ONLN. NetBSD and OpenBSD should have HW_NCPUONLINE. 622 * This is what FFmpeg uses on those platforms. 623 */ 624# if defined(PIPE_OS_BSD) && defined(HW_NCPUONLINE) 625 if (available_cpus == 0) { 626 const int mib[] = { CTL_HW, HW_NCPUONLINE }; 627 int ncpu; 628 size_t len = sizeof(ncpu); 629 630 sysctl(mib, 2, &ncpu, &len, NULL, 0); 631 available_cpus = ncpu; 632 } 633# elif defined(_SC_NPROCESSORS_ONLN) 634 if (available_cpus == 0) { 635 available_cpus = sysconf(_SC_NPROCESSORS_ONLN); 636 if (available_cpus == ~0) 637 available_cpus = 1; 638 } 639# elif defined(PIPE_OS_BSD) 640 if (available_cpus == 0) { 641 const int mib[] = { CTL_HW, HW_NCPU }; 642 int ncpu; 643 int len = sizeof(ncpu); 644 645 sysctl(mib, 2, &ncpu, &len, NULL, 0); 646 available_cpus = ncpu; 647 } 648# endif /* defined(PIPE_OS_BSD) */ 649 650 /* Determine the maximum number of CPUs configured in the system. This is 651 * used to properly set num_cpu_mask_bits below. On BSDs that don't have 652 * HW_NCPUONLINE, it was not clear whether HW_NCPU is the number of 653 * configured or the number of online CPUs. For that reason, prefer the 654 * _SC_NPROCESSORS_CONF path on all BSDs. 
655 */ 656# if defined(_SC_NPROCESSORS_CONF) 657 total_cpus = sysconf(_SC_NPROCESSORS_CONF); 658 if (total_cpus == ~0) 659 total_cpus = 1; 660# elif defined(PIPE_OS_BSD) 661 { 662 const int mib[] = { CTL_HW, HW_NCPU }; 663 int ncpu; 664 int len = sizeof(ncpu); 665 666 sysctl(mib, 2, &ncpu, &len, NULL, 0); 667 total_cpus = ncpu; 668 } 669# endif /* defined(PIPE_OS_BSD) */ 670#endif /* defined(PIPE_OS_UNIX) */ 671 672 util_cpu_caps.nr_cpus = MAX2(1, available_cpus); 673 total_cpus = MAX2(total_cpus, util_cpu_caps.nr_cpus); 674 675 util_cpu_caps.max_cpus = total_cpus; 676 util_cpu_caps.num_cpu_mask_bits = align(total_cpus, 32); 677 678 /* Make the fallback cacheline size nonzero so that it can be 679 * safely passed to align(). 680 */ 681 util_cpu_caps.cacheline = sizeof(void *); 682 683#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) 684 if (has_cpuid()) { 685 uint32_t regs[4]; 686 uint32_t regs2[4]; 687 688 util_cpu_caps.cacheline = 32; 689 690 /* Get max cpuid level */ 691 cpuid(0x00000000, regs); 692 693 if (regs[0] >= 0x00000001) { 694 unsigned int cacheline; 695 696 cpuid (0x00000001, regs2); 697 698 util_cpu_caps.x86_cpu_type = (regs2[0] >> 8) & 0xf; 699 /* Add "extended family". 
*/ 700 if (util_cpu_caps.x86_cpu_type == 0xf) 701 util_cpu_caps.x86_cpu_type += ((regs2[0] >> 20) & 0xff); 702 703 switch (util_cpu_caps.x86_cpu_type) { 704 case 0x17: 705 util_cpu_caps.family = CPU_AMD_ZEN1_ZEN2; 706 break; 707 case 0x18: 708 util_cpu_caps.family = CPU_AMD_ZEN_HYGON; 709 break; 710 case 0x19: 711 util_cpu_caps.family = CPU_AMD_ZEN3; 712 break; 713 default: 714 if (util_cpu_caps.x86_cpu_type > 0x19) 715 util_cpu_caps.family = CPU_AMD_ZEN_NEXT; 716 } 717 718 /* general feature flags */ 719 util_cpu_caps.has_tsc = (regs2[3] >> 4) & 1; /* 0x0000010 */ 720 util_cpu_caps.has_mmx = (regs2[3] >> 23) & 1; /* 0x0800000 */ 721 util_cpu_caps.has_sse = (regs2[3] >> 25) & 1; /* 0x2000000 */ 722 util_cpu_caps.has_sse2 = (regs2[3] >> 26) & 1; /* 0x4000000 */ 723 util_cpu_caps.has_sse3 = (regs2[2] >> 0) & 1; /* 0x0000001 */ 724 util_cpu_caps.has_ssse3 = (regs2[2] >> 9) & 1; /* 0x0000020 */ 725 util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1; 726 util_cpu_caps.has_sse4_2 = (regs2[2] >> 20) & 1; 727 util_cpu_caps.has_popcnt = (regs2[2] >> 23) & 1; 728 util_cpu_caps.has_avx = ((regs2[2] >> 28) & 1) && // AVX 729 ((regs2[2] >> 27) & 1) && // OSXSAVE 730 ((xgetbv() & 6) == 6); // XMM & YMM 731 util_cpu_caps.has_f16c = ((regs2[2] >> 29) & 1) && util_cpu_caps.has_avx; 732 util_cpu_caps.has_fma = ((regs2[2] >> 12) & 1) && util_cpu_caps.has_avx; 733 util_cpu_caps.has_mmx2 = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */ 734#if defined(PIPE_ARCH_X86_64) 735 util_cpu_caps.has_daz = 1; 736#else 737 util_cpu_caps.has_daz = util_cpu_caps.has_sse3 || 738 (util_cpu_caps.has_sse2 && sse2_has_daz()); 739#endif 740 741 cacheline = ((regs2[1] >> 8) & 0xFF) * 8; 742 if (cacheline > 0) 743 util_cpu_caps.cacheline = cacheline; 744 } 745 if (util_cpu_caps.has_avx && regs[0] >= 0x00000007) { 746 uint32_t regs7[4]; 747 cpuid_count(0x00000007, 0x00000000, regs7); 748 util_cpu_caps.has_avx2 = (regs7[1] >> 5) & 1; 749 } 750 751 // check for avx512 752 if (((regs2[2] >> 27) & 1) && 
// OSXSAVE 753 (xgetbv() & (0x7 << 5)) && // OPMASK: upper-256 enabled by OS 754 ((xgetbv() & 6) == 6)) { // XMM/YMM enabled by OS 755 uint32_t regs3[4]; 756 cpuid_count(0x00000007, 0x00000000, regs3); 757 util_cpu_caps.has_avx512f = (regs3[1] >> 16) & 1; 758 util_cpu_caps.has_avx512dq = (regs3[1] >> 17) & 1; 759 util_cpu_caps.has_avx512ifma = (regs3[1] >> 21) & 1; 760 util_cpu_caps.has_avx512pf = (regs3[1] >> 26) & 1; 761 util_cpu_caps.has_avx512er = (regs3[1] >> 27) & 1; 762 util_cpu_caps.has_avx512cd = (regs3[1] >> 28) & 1; 763 util_cpu_caps.has_avx512bw = (regs3[1] >> 30) & 1; 764 util_cpu_caps.has_avx512vl = (regs3[1] >> 31) & 1; 765 util_cpu_caps.has_avx512vbmi = (regs3[2] >> 1) & 1; 766 } 767 768 if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) { 769 /* GenuineIntel */ 770 util_cpu_caps.has_intel = 1; 771 } 772 773 cpuid(0x80000000, regs); 774 775 if (regs[0] >= 0x80000001) { 776 777 cpuid(0x80000001, regs2); 778 779 util_cpu_caps.has_mmx |= (regs2[3] >> 23) & 1; 780 util_cpu_caps.has_mmx2 |= (regs2[3] >> 22) & 1; 781 util_cpu_caps.has_3dnow = (regs2[3] >> 31) & 1; 782 util_cpu_caps.has_3dnow_ext = (regs2[3] >> 30) & 1; 783 784 util_cpu_caps.has_xop = util_cpu_caps.has_avx && 785 ((regs2[2] >> 11) & 1); 786 } 787 788 if (regs[0] >= 0x80000006) { 789 /* should we really do this if the clflush size above worked? 
*/ 790 unsigned int cacheline; 791 cpuid(0x80000006, regs2); 792 cacheline = regs2[2] & 0xFF; 793 if (cacheline > 0) 794 util_cpu_caps.cacheline = cacheline; 795 } 796 797 if (!util_cpu_caps.has_sse) { 798 util_cpu_caps.has_sse2 = 0; 799 util_cpu_caps.has_sse3 = 0; 800 util_cpu_caps.has_ssse3 = 0; 801 util_cpu_caps.has_sse4_1 = 0; 802 } 803 } 804#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */ 805 806#if defined(PIPE_ARCH_ARM) || defined(PIPE_ARCH_AARCH64) 807 check_os_arm_support(); 808#endif 809 810#if defined(PIPE_ARCH_PPC) 811 check_os_altivec_support(); 812#endif /* PIPE_ARCH_PPC */ 813 814#if defined(PIPE_ARCH_MIPS64) 815 check_os_mips64_support(); 816#endif /* PIPE_ARCH_MIPS64 */ 817 818#if defined(PIPE_ARCH_S390) 819 util_cpu_caps.family = CPU_S390X; 820#endif 821 822 get_cpu_topology(); 823 824 if (debug_get_option_dump_cpu()) { 825 printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus); 826 827 printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type); 828 printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline); 829 830 printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc); 831 printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx); 832 printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2); 833 printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse); 834 printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2); 835 printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3); 836 printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3); 837 printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1); 838 printf("util_cpu_caps.has_sse4_2 = %u\n", util_cpu_caps.has_sse4_2); 839 printf("util_cpu_caps.has_avx = %u\n", util_cpu_caps.has_avx); 840 printf("util_cpu_caps.has_avx2 = %u\n", util_cpu_caps.has_avx2); 841 printf("util_cpu_caps.has_f16c = %u\n", util_cpu_caps.has_f16c); 842 printf("util_cpu_caps.has_popcnt = %u\n", util_cpu_caps.has_popcnt); 843 
printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow); 844 printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext); 845 printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop); 846 printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec); 847 printf("util_cpu_caps.has_vsx = %u\n", util_cpu_caps.has_vsx); 848 printf("util_cpu_caps.has_neon = %u\n", util_cpu_caps.has_neon); 849 printf("util_cpu_caps.has_msa = %u\n", util_cpu_caps.has_msa); 850 printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz); 851 printf("util_cpu_caps.has_avx512f = %u\n", util_cpu_caps.has_avx512f); 852 printf("util_cpu_caps.has_avx512dq = %u\n", util_cpu_caps.has_avx512dq); 853 printf("util_cpu_caps.has_avx512ifma = %u\n", util_cpu_caps.has_avx512ifma); 854 printf("util_cpu_caps.has_avx512pf = %u\n", util_cpu_caps.has_avx512pf); 855 printf("util_cpu_caps.has_avx512er = %u\n", util_cpu_caps.has_avx512er); 856 printf("util_cpu_caps.has_avx512cd = %u\n", util_cpu_caps.has_avx512cd); 857 printf("util_cpu_caps.has_avx512bw = %u\n", util_cpu_caps.has_avx512bw); 858 printf("util_cpu_caps.has_avx512vl = %u\n", util_cpu_caps.has_avx512vl); 859 printf("util_cpu_caps.has_avx512vbmi = %u\n", util_cpu_caps.has_avx512vbmi); 860 printf("util_cpu_caps.num_L3_caches = %u\n", util_cpu_caps.num_L3_caches); 861 printf("util_cpu_caps.num_cpu_mask_bits = %u\n", util_cpu_caps.num_cpu_mask_bits); 862 } 863 864 /* This must happen at the end as it's used to guard everything else */ 865 p_atomic_set(&util_cpu_caps.detect_done, 1); 866} 867 868static once_flag cpu_once_flag = ONCE_FLAG_INIT; 869 870void 871util_cpu_detect(void) 872{ 873 call_once(&cpu_once_flag, util_cpu_detect_once); 874} 875