1/* 2 * Copyright © 2017 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining 5 * a copy of this software and associated documentation files (the 6 * "Software"), to deal in the Software without restriction, including 7 * without limitation the rights to use, copy, modify, merge, publish, 8 * distribute, sub license, and/or sell copies of the Software, and to 9 * permit persons to whom the Software is furnished to do so, subject to 10 * the following conditions: 11 * 12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 13 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 14 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 15 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS 16 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 18 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 19 * USE OR OTHER DEALINGS IN THE SOFTWARE. 20 * 21 * The above copyright notice and this permission notice (including the 22 * next paragraph) shall be included in all copies or substantial portions 23 * of the Software. 24 */ 25 26#include "ac_gpu_info.h" 27#include "ac_shader_util.h" 28#include "ac_debug.h" 29 30#include "addrlib/src/amdgpu_asic_addr.h" 31#include "sid.h" 32#include "util/macros.h" 33#include "util/u_cpu_detect.h" 34#include "util/u_math.h" 35#include "util/os_misc.h" 36#include "util/bitset.h" 37 38#include <stdio.h> 39#include <ctype.h> 40 41#define AMDGPU_ARCTURUS_RANGE 0x32, 0x3C 42#define AMDGPU_ALDEBARAN_RANGE 0x3C, 0xFF 43 44#define ASICREV_IS_ARCTURUS(r) ASICREV_IS(r, ARCTURUS) 45#define ASICREV_IS_ALDEBARAN(r) ASICREV_IS(r, ALDEBARAN) 46 47#ifdef _WIN32 48#define DRM_CAP_ADDFB2_MODIFIERS 0x10 49#define DRM_CAP_SYNCOBJ 0x13 50#define DRM_CAP_SYNCOBJ_TIMELINE 0x14 51#define AMDGPU_GEM_DOMAIN_GTT 0x2 52#define AMDGPU_GEM_DOMAIN_VRAM 0x4 53#define AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED (1 << 0) 54#define AMDGPU_GEM_CREATE_ENCRYPTED (1 << 10) 55#define AMDGPU_HW_IP_GFX 0 56#define AMDGPU_HW_IP_COMPUTE 1 57#define AMDGPU_HW_IP_DMA 2 58#define AMDGPU_HW_IP_UVD 3 59#define AMDGPU_HW_IP_VCE 4 60#define AMDGPU_HW_IP_UVD_ENC 5 61#define AMDGPU_HW_IP_VCN_DEC 6 62#define AMDGPU_HW_IP_VCN_ENC 7 63#define AMDGPU_HW_IP_VCN_JPEG 8 64#define AMDGPU_IDS_FLAGS_FUSION 0x1 65#define AMDGPU_IDS_FLAGS_PREEMPTION 0x2 66#define AMDGPU_IDS_FLAGS_TMZ 0x4 67#define AMDGPU_INFO_FW_VCE 0x1 68#define AMDGPU_INFO_FW_UVD 0x2 69#define AMDGPU_INFO_FW_GFX_ME 0x04 70#define AMDGPU_INFO_FW_GFX_PFP 0x05 71#define AMDGPU_INFO_FW_GFX_CE 0x06 72#define AMDGPU_INFO_DEV_INFO 0x16 73#define AMDGPU_INFO_MEMORY 0x19 74#define AMDGPU_INFO_VIDEO_CAPS_DECODE 0 75#define AMDGPU_INFO_VIDEO_CAPS_ENCODE 1 76#define AMDGPU_INFO_FW_GFX_MEC 0x08 77 78#define AMDGPU_VRAM_TYPE_UNKNOWN 0 79#define AMDGPU_VRAM_TYPE_GDDR1 1 80#define AMDGPU_VRAM_TYPE_DDR2 2 81#define AMDGPU_VRAM_TYPE_GDDR3 3 82#define AMDGPU_VRAM_TYPE_GDDR4 4 83#define AMDGPU_VRAM_TYPE_GDDR5 5 84#define AMDGPU_VRAM_TYPE_HBM 6 85#define AMDGPU_VRAM_TYPE_DDR3 7 86#define AMDGPU_VRAM_TYPE_DDR4 8 87#define AMDGPU_VRAM_TYPE_GDDR6 9 88#define AMDGPU_VRAM_TYPE_DDR5 10 89 90struct drm_amdgpu_heap_info { 91 uint64_t total_heap_size; 92}; 93struct drm_amdgpu_memory_info { 94 struct drm_amdgpu_heap_info vram; 95 struct drm_amdgpu_heap_info cpu_accessible_vram; 96 struct drm_amdgpu_heap_info gtt; 97}; 98struct drm_amdgpu_info_device { 99 /** PCI Device ID */ 100 uint32_t device_id; 101 /** Internal chip revision: A0, A1, etc.) */ 102 uint32_t chip_rev; 103 uint32_t external_rev; 104 /** Revision id in PCI Config space */ 105 uint32_t pci_rev; 106 uint32_t family; 107 uint32_t num_shader_engines; 108 uint32_t num_shader_arrays_per_engine; 109 /* in KHz */ 110 uint32_t gpu_counter_freq; 111 uint64_t max_engine_clock; 112 uint64_t max_memory_clock; 113 /* cu information */ 114 uint32_t cu_active_number; 115 /* NOTE: cu_ao_mask is INVALID, DON'T use it */ 116 uint32_t cu_ao_mask; 117 uint32_t cu_bitmap[4][4]; 118 /** Render backend pipe mask. One render backend is CB+DB. */ 119 uint32_t enabled_rb_pipes_mask; 120 uint32_t num_rb_pipes; 121 uint32_t num_hw_gfx_contexts; 122 uint32_t _pad; 123 uint64_t ids_flags; 124 /** Starting virtual address for UMDs. */ 125 uint64_t virtual_address_offset; 126 /** The maximum virtual address */ 127 uint64_t virtual_address_max; 128 /** Required alignment of virtual addresses. */ 129 uint32_t virtual_address_alignment; 130 /** Page table entry - fragment size */ 131 uint32_t pte_fragment_size; 132 uint32_t gart_page_size; 133 /** constant engine ram size*/ 134 uint32_t ce_ram_size; 135 /** video memory type info*/ 136 uint32_t vram_type; 137 /** video memory bit width*/ 138 uint32_t vram_bit_width; 139 /* vce harvesting instance */ 140 uint32_t vce_harvest_config; 141 /* gfx double offchip LDS buffers */ 142 uint32_t gc_double_offchip_lds_buf; 143 /* NGG Primitive Buffer */ 144 uint64_t prim_buf_gpu_addr; 145 /* NGG Position Buffer */ 146 uint64_t pos_buf_gpu_addr; 147 /* NGG Control Sideband */ 148 uint64_t cntl_sb_buf_gpu_addr; 149 /* NGG Parameter Cache */ 150 uint64_t param_buf_gpu_addr; 151 uint32_t prim_buf_size; 152 uint32_t pos_buf_size; 153 uint32_t cntl_sb_buf_size; 154 uint32_t param_buf_size; 155 /* wavefront size*/ 156 uint32_t wave_front_size; 157 /* shader visible vgprs*/ 158 uint32_t num_shader_visible_vgprs; 159 /* CU per shader array*/ 160 uint32_t num_cu_per_sh; 161 /* number of tcc blocks*/ 162 uint32_t num_tcc_blocks; 163 /* gs vgt table depth*/ 164 uint32_t gs_vgt_table_depth; 165 /* gs primitive buffer depth*/ 166 uint32_t gs_prim_buffer_depth; 167 /* max gs wavefront per vgt*/ 168 uint32_t max_gs_waves_per_vgt; 169 uint32_t _pad1; 170 /* always on cu bitmap */ 171 uint32_t cu_ao_bitmap[4][4]; 172 /** Starting high virtual address for UMDs. */ 173 uint64_t high_va_offset; 174 /** The maximum high virtual address */ 175 uint64_t high_va_max; 176 /* gfx10 pa_sc_tile_steering_override */ 177 uint32_t pa_sc_tile_steering_override; 178 /* disabled TCCs */ 179 uint64_t tcc_disabled_mask; 180}; 181struct drm_amdgpu_info_hw_ip { 182 uint32_t hw_ip_version_major; 183 uint32_t hw_ip_version_minor; 184 uint32_t ib_start_alignment; 185 uint32_t ib_size_alignment; 186 uint32_t available_rings; 187 uint32_t ip_discovery_version; 188}; 189typedef struct _drmPciBusInfo { 190 uint16_t domain; 191 uint8_t bus; 192 uint8_t dev; 193 uint8_t func; 194} drmPciBusInfo, *drmPciBusInfoPtr; 195typedef struct _drmDevice { 196 union { 197 drmPciBusInfoPtr pci; 198 } businfo; 199} drmDevice, *drmDevicePtr; 200enum amdgpu_sw_info { 201 amdgpu_sw_info_address32_hi = 0, 202}; 203typedef struct amdgpu_device *amdgpu_device_handle; 204typedef struct amdgpu_bo *amdgpu_bo_handle; 205struct amdgpu_bo_alloc_request { 206 uint64_t alloc_size; 207 uint64_t phys_alignment; 208 uint32_t preferred_heap; 209 uint64_t flags; 210}; 211struct amdgpu_gds_resource_info { 212 uint32_t gds_gfx_partition_size; 213 uint32_t gds_total_size; 214}; 215struct amdgpu_buffer_size_alignments { 216 uint64_t size_local; 217 uint64_t size_remote; 218}; 219struct amdgpu_heap_info { 220 uint64_t heap_size; 221}; 222struct amdgpu_gpu_info { 223 uint32_t asic_id; 224 uint32_t chip_external_rev; 225 uint32_t family_id; 226 uint64_t ids_flags; 227 uint64_t max_engine_clk; 228 uint64_t max_memory_clk; 229 uint32_t num_shader_engines; 230 uint32_t num_shader_arrays_per_engine; 231 uint32_t rb_pipes; 232 uint32_t enabled_rb_pipes_mask; 233 uint32_t gpu_counter_freq; 234 uint32_t mc_arb_ramcfg; 235 uint32_t gb_addr_cfg; 236 uint32_t gb_tile_mode[32]; 237 uint32_t gb_macro_tile_mode[16]; 238 uint32_t cu_bitmap[4][4]; 239 uint32_t vram_type; 240 uint32_t vram_bit_width; 241 uint32_t ce_ram_size; 242 uint32_t vce_harvest_config; 243 uint32_t pci_rev_id; 244}; 245static int drmGetCap(int fd, uint64_t capability, uint64_t *value) 246{ 247 return -EINVAL; 248} 249static void drmFreeDevice(drmDevicePtr *device) 250{ 251} 252static int drmGetDevice2(int fd, uint32_t flags, drmDevicePtr *device) 253{ 254 return -ENODEV; 255} 256static int amdgpu_bo_alloc(amdgpu_device_handle dev, 257 struct amdgpu_bo_alloc_request *alloc_buffer, 258 amdgpu_bo_handle *buf_handle) 259{ 260 return -EINVAL; 261} 262static int amdgpu_bo_free(amdgpu_bo_handle buf_handle) 263{ 264 return -EINVAL; 265} 266static int amdgpu_query_buffer_size_alignment(amdgpu_device_handle dev, 267 struct amdgpu_buffer_size_alignments 268 *info) 269{ 270 return -EINVAL; 271} 272static int amdgpu_query_firmware_version(amdgpu_device_handle dev, unsigned fw_type, 273 unsigned ip_instance, unsigned index, 274 uint32_t *version, uint32_t *feature) 275{ 276 return -EINVAL; 277} 278static int amdgpu_query_hw_ip_info(amdgpu_device_handle dev, unsigned type, 279 unsigned ip_instance, 280 struct drm_amdgpu_info_hw_ip *info) 281{ 282 return -EINVAL; 283} 284static int amdgpu_query_heap_info(amdgpu_device_handle dev, uint32_t heap, 285 uint32_t flags, struct amdgpu_heap_info *info) 286{ 287 return -EINVAL; 288} 289static int amdgpu_query_gpu_info(amdgpu_device_handle dev, 290 struct amdgpu_gpu_info *info) 291{ 292 return -EINVAL; 293} 294static int amdgpu_query_info(amdgpu_device_handle dev, unsigned info_id, 295 unsigned size, void *value) 296{ 297 return -EINVAL; 298} 299static int amdgpu_query_sw_info(amdgpu_device_handle dev, enum amdgpu_sw_info info, 300 void *value) 301{ 302 return -EINVAL; 303} 304static int amdgpu_query_gds_info(amdgpu_device_handle dev, 305 struct amdgpu_gds_resource_info *gds_info) 306{ 307 return -EINVAL; 308} 309static int amdgpu_query_video_caps_info(amdgpu_device_handle dev, unsigned cap_type, 310 unsigned size, void *value) 311{ 312 return -EINVAL; 313} 314static const char *amdgpu_get_marketing_name(amdgpu_device_handle dev) 315{ 316 return NULL; 317} 318#else 319#include "drm-uapi/amdgpu_drm.h" 320#include <amdgpu.h> 321#include <xf86drm.h> 322#endif 323 324#define CIK_TILE_MODE_COLOR_2D 14 325 326#define CIK__GB_TILE_MODE__PIPE_CONFIG(x) (((x) >> 6) & 0x1f) 327#define CIK__PIPE_CONFIG__ADDR_SURF_P2 0 328#define CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16 4 329#define CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16 5 330#define CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32 6 331#define CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32 7 332#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16 8 333#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16 9 334#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16 10 335#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16 11 336#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16 12 337#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32 13 338#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32 14 339#define CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16 16 340#define CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16 17 341 342static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info) 343{ 344 unsigned mode2d = info->gb_tile_mode[CIK_TILE_MODE_COLOR_2D]; 345 346 switch (CIK__GB_TILE_MODE__PIPE_CONFIG(mode2d)) { 347 case CIK__PIPE_CONFIG__ADDR_SURF_P2: 348 return 2; 349 case CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16: 350 case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16: 351 case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32: 352 case CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32: 353 return 4; 354 case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16: 355 case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16: 356 case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16: 357 case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16: 358 case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16: 359 case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32: 360 case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32: 361 return 8; 362 case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16: 363 case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16: 364 return 16; 365 default: 366 fprintf(stderr, "Invalid GFX7 pipe configuration, assuming P2\n"); 367 assert(!"this should never occur"); 368 return 2; 369 } 370} 371 372static bool has_syncobj(int fd) 373{ 374 uint64_t value; 375 if (drmGetCap(fd, DRM_CAP_SYNCOBJ, &value)) 376 return false; 377 return value ? true : false; 378} 379 380static bool has_timeline_syncobj(int fd) 381{ 382 uint64_t value; 383 if (drmGetCap(fd, DRM_CAP_SYNCOBJ_TIMELINE, &value)) 384 return false; 385 return value ? true : false; 386} 387 388static bool has_modifiers(int fd) 389{ 390 uint64_t value; 391 if (drmGetCap(fd, DRM_CAP_ADDFB2_MODIFIERS, &value)) 392 return false; 393 return value ? true : false; 394} 395 396static uint64_t fix_vram_size(uint64_t size) 397{ 398 /* The VRAM size is underreported, so we need to fix it, because 399 * it's used to compute the number of memory modules for harvesting. 400 */ 401 return align64(size, 256 * 1024 * 1024); 402} 403 404static bool 405has_tmz_support(amdgpu_device_handle dev, struct radeon_info *info, uint32_t ids_flags) 406{ 407 struct amdgpu_bo_alloc_request request = {0}; 408 int r; 409 amdgpu_bo_handle bo; 410 411 if (ids_flags & AMDGPU_IDS_FLAGS_TMZ) 412 return true; 413 414 /* AMDGPU_IDS_FLAGS_TMZ is supported starting from drm_minor 40 */ 415 if (info->drm_minor >= 40) 416 return false; 417 418 /* Find out ourselves if TMZ is enabled */ 419 if (info->gfx_level < GFX9) 420 return false; 421 422 if (info->drm_minor < 36) 423 return false; 424 425 request.alloc_size = 256; 426 request.phys_alignment = 1024; 427 request.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM; 428 request.flags = AMDGPU_GEM_CREATE_ENCRYPTED; 429 r = amdgpu_bo_alloc(dev, &request, &bo); 430 if (r) 431 return false; 432 amdgpu_bo_free(bo); 433 return true; 434} 435 436static void set_custom_cu_en_mask(struct radeon_info *info) 437{ 438 info->spi_cu_en = ~0; 439 440 const char *cu_env_var = os_get_option("AMD_CU_MASK"); 441 if (!cu_env_var) 442 return; 443 444 int size = strlen(cu_env_var); 445 char *str = alloca(size + 1); 446 memset(str, 0, size + 1); 447 448 size = 0; 449 450 /* Strip whitespace. */ 451 for (unsigned src = 0; cu_env_var[src]; src++) { 452 if (cu_env_var[src] != ' ' && cu_env_var[src] != '\t' && 453 cu_env_var[src] != '\n' && cu_env_var[src] != '\r') { 454 str[size++] = cu_env_var[src]; 455 } 456 } 457 458 /* The following syntax is used, all whitespace is ignored: 459 * ID = [0-9][0-9]* ex. base 10 numbers 460 * ID_list = (ID | ID-ID)[, (ID | ID-ID)]* ex. 0,2-4,7 461 * CU_list = 0x[0-F]* | ID_list ex. 0x337F OR 0,2-4,7 462 * AMD_CU_MASK = CU_list 463 * 464 * It's a CU mask within a shader array. It's applied to all shader arrays. 465 */ 466 bool is_good_form = true; 467 uint32_t spi_cu_en = 0; 468 469 if (size > 2 && str[0] == '0' && (str[1] == 'x' || str[1] == 'X')) { 470 str += 2; 471 size -= 2; 472 473 for (unsigned i = 0; i < size; i++) 474 is_good_form &= isxdigit(str[i]) != 0; 475 476 if (!is_good_form) { 477 fprintf(stderr, "amd: invalid AMD_CU_MASK: ill-formed hex value\n"); 478 } else { 479 spi_cu_en = strtol(str, NULL, 16); 480 } 481 } else { 482 /* Parse ID_list. */ 483 long first = 0, last = -1; 484 485 if (!isdigit(*str)) { 486 is_good_form = false; 487 } else { 488 while (*str) { 489 bool comma = false; 490 491 if (isdigit(*str)) { 492 first = last = strtol(str, &str, 10); 493 } else if (*str == '-') { 494 str++; 495 /* Parse a digit after a dash. */ 496 if (isdigit(*str)) { 497 last = strtol(str, &str, 10); 498 } else { 499 fprintf(stderr, "amd: invalid AMD_CU_MASK: expected a digit after -\n"); 500 is_good_form = false; 501 break; 502 } 503 } else if (*str == ',') { 504 comma = true; 505 str++; 506 if (!isdigit(*str)) { 507 fprintf(stderr, "amd: invalid AMD_CU_MASK: expected a digit after ,\n"); 508 is_good_form = false; 509 break; 510 } 511 } 512 513 if (comma || !*str) { 514 if (first > last) { 515 fprintf(stderr, "amd: invalid AMD_CU_MASK: range not increasing (%li, %li)\n", first, last); 516 is_good_form = false; 517 break; 518 } 519 if (last > 31) { 520 fprintf(stderr, "amd: invalid AMD_CU_MASK: index too large (%li)\n", last); 521 is_good_form = false; 522 break; 523 } 524 525 spi_cu_en |= BITFIELD_RANGE(first, last - first + 1); 526 last = -1; 527 } 528 } 529 } 530 } 531 532 /* The mask is parsed. Now assign bits to CUs. */ 533 if (is_good_form) { 534 bool error = false; 535 536 /* Clear bits that have no effect. */ 537 spi_cu_en &= BITFIELD_MASK(info->max_good_cu_per_sa); 538 539 if (!spi_cu_en) { 540 fprintf(stderr, "amd: invalid AMD_CU_MASK: at least 1 CU in each SA must be enabled\n"); 541 error = true; 542 } 543 544 if (info->has_graphics) { 545 uint32_t min_full_cu_mask = BITFIELD_MASK(info->min_good_cu_per_sa); 546 547 /* The hw ignores all non-compute CU masks if any of them is 0. Disallow that. */ 548 if ((spi_cu_en & min_full_cu_mask) == 0) { 549 fprintf(stderr, "amd: invalid AMD_CU_MASK: at least 1 CU from 0x%x per SA must be " 550 "enabled (SPI limitation)\n", min_full_cu_mask); 551 error = true; 552 } 553 554 /* We usually disable 1 or 2 CUs for VS and GS, which means at last 1 other CU 555 * must be enabled. 556 */ 557 uint32_t cu_mask_ge, unused; 558 ac_compute_late_alloc(info, false, false, false, &unused, &cu_mask_ge); 559 cu_mask_ge &= min_full_cu_mask; 560 561 if ((spi_cu_en & cu_mask_ge) == 0) { 562 fprintf(stderr, "amd: invalid AMD_CU_MASK: at least 1 CU from 0x%x per SA must be " 563 "enabled (late alloc constraint for GE)\n", cu_mask_ge); 564 error = true; 565 } 566 567 if ((min_full_cu_mask & spi_cu_en & ~cu_mask_ge) == 0) { 568 fprintf(stderr, "amd: invalid AMD_CU_MASK: at least 1 CU from 0x%x per SA must be " 569 "enabled (late alloc constraint for PS)\n", 570 min_full_cu_mask & ~cu_mask_ge); 571 error = true; 572 } 573 } 574 575 if (!error) { 576 info->spi_cu_en = spi_cu_en; 577 info->spi_cu_en_has_effect = spi_cu_en & BITFIELD_MASK(info->max_good_cu_per_sa); 578 } 579 } 580} 581 582bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info) 583{ 584 struct amdgpu_gpu_info amdinfo; 585 struct drm_amdgpu_info_device device_info = {0}; 586 struct amdgpu_buffer_size_alignments alignment_info = {0}; 587 uint32_t vce_version = 0, vce_feature = 0, uvd_version = 0, uvd_feature = 0; 588 int r, i, j; 589 amdgpu_device_handle dev = dev_p; 590 drmDevicePtr devinfo; 591 592 STATIC_ASSERT(AMDGPU_HW_IP_GFX == AMD_IP_GFX); 593 STATIC_ASSERT(AMDGPU_HW_IP_COMPUTE == AMD_IP_COMPUTE); 594 STATIC_ASSERT(AMDGPU_HW_IP_DMA == AMD_IP_SDMA); 595 STATIC_ASSERT(AMDGPU_HW_IP_UVD == AMD_IP_UVD); 596 STATIC_ASSERT(AMDGPU_HW_IP_VCE == AMD_IP_VCE); 597 STATIC_ASSERT(AMDGPU_HW_IP_UVD_ENC == AMD_IP_UVD_ENC); 598 STATIC_ASSERT(AMDGPU_HW_IP_VCN_DEC == AMD_IP_VCN_DEC); 599 STATIC_ASSERT(AMDGPU_HW_IP_VCN_ENC == AMD_IP_VCN_ENC); 600 STATIC_ASSERT(AMDGPU_HW_IP_VCN_JPEG == AMD_IP_VCN_JPEG); 601 602 /* Get PCI info. */ 603 r = drmGetDevice2(fd, 0, &devinfo); 604 if (r) { 605 fprintf(stderr, "amdgpu: drmGetDevice2 failed.\n"); 606 return false; 607 } 608 info->pci_domain = devinfo->businfo.pci->domain; 609 info->pci_bus = devinfo->businfo.pci->bus; 610 info->pci_dev = devinfo->businfo.pci->dev; 611 info->pci_func = devinfo->businfo.pci->func; 612 drmFreeDevice(&devinfo); 613 614 assert(info->drm_major == 3); 615 info->is_amdgpu = true; 616 617 if (info->drm_minor < 15) { 618 fprintf(stderr, "amdgpu: DRM version is %u.%u.%u, but this driver is " 619 "only compatible with 3.15.0 (kernel 4.12) or later.\n", 620 info->drm_major, info->drm_minor, info->drm_patchlevel); 621 return false; 622 } 623 624 /* Query hardware and driver information. */ 625 r = amdgpu_query_gpu_info(dev, &amdinfo); 626 if (r) { 627 fprintf(stderr, "amdgpu: amdgpu_query_gpu_info failed.\n"); 628 return false; 629 } 630 631 r = amdgpu_query_info(dev, AMDGPU_INFO_DEV_INFO, sizeof(device_info), &device_info); 632 if (r) { 633 fprintf(stderr, "amdgpu: amdgpu_query_info(dev_info) failed.\n"); 634 return false; 635 } 636 637 r = amdgpu_query_buffer_size_alignment(dev, &alignment_info); 638 if (r) { 639 fprintf(stderr, "amdgpu: amdgpu_query_buffer_size_alignment failed.\n"); 640 return false; 641 } 642 643 for (unsigned ip_type = 0; ip_type < AMD_NUM_IP_TYPES; ip_type++) { 644 struct drm_amdgpu_info_hw_ip ip_info = {0}; 645 646 r = amdgpu_query_hw_ip_info(dev, ip_type, 0, &ip_info); 647 if (r || !ip_info.available_rings) 648 continue; 649 650 /* Gfx6-8 don't set ip_discovery_version. */ 651 if (info->drm_minor >= 48 && ip_info.ip_discovery_version) { 652 info->ip[ip_type].ver_major = (ip_info.ip_discovery_version >> 16) & 0xff; 653 info->ip[ip_type].ver_minor = (ip_info.ip_discovery_version >> 8) & 0xff; 654 } else { 655 info->ip[ip_type].ver_major = ip_info.hw_ip_version_major; 656 info->ip[ip_type].ver_minor = ip_info.hw_ip_version_minor; 657 658 /* Fix incorrect IP versions reported by the kernel. */ 659 if (device_info.family == FAMILY_NV && 660 (ASICREV_IS(device_info.external_rev, NAVI10) || 661 ASICREV_IS(device_info.external_rev, NAVI12) || 662 ASICREV_IS(device_info.external_rev, NAVI14))) 663 info->ip[AMD_IP_GFX].ver_minor = info->ip[AMD_IP_COMPUTE].ver_minor = 1; 664 else if (device_info.family == FAMILY_NV || 665 device_info.family == FAMILY_VGH || 666 device_info.family == FAMILY_RMB || 667 device_info.family == FAMILY_GC_10_3_6 || 668 device_info.family == FAMILY_GC_10_3_7) 669 info->ip[AMD_IP_GFX].ver_minor = info->ip[AMD_IP_COMPUTE].ver_minor = 3; 670 } 671 info->ip[ip_type].num_queues = util_bitcount(ip_info.available_rings); 672 info->ib_alignment = MAX3(info->ib_alignment, ip_info.ib_start_alignment, 673 ip_info.ib_size_alignment); 674 } 675 676 /* Only require gfx or compute. */ 677 if (!info->ip[AMD_IP_GFX].num_queues && !info->ip[AMD_IP_COMPUTE].num_queues) { 678 fprintf(stderr, "amdgpu: failed to find gfx or compute.\n"); 679 return false; 680 } 681 682 assert(util_is_power_of_two_or_zero(info->ip[AMD_IP_COMPUTE].num_queues)); 683 assert(util_is_power_of_two_or_zero(info->ip[AMD_IP_SDMA].num_queues)); 684 685 /* The kernel pads gfx and compute IBs to 256 dwords since: 686 * 66f3b2d527154bd258a57c8815004b5964aa1cf5 687 * Do the same. 688 */ 689 info->ib_alignment = MAX2(info->ib_alignment, 1024); 690 691 r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_ME, 0, 0, &info->me_fw_version, 692 &info->me_fw_feature); 693 if (r) { 694 fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(me) failed.\n"); 695 return false; 696 } 697 698 r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_MEC, 0, 0, &info->mec_fw_version, 699 &info->mec_fw_feature); 700 if (r) { 701 fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(mec) failed.\n"); 702 return false; 703 } 704 705 r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_PFP, 0, 0, &info->pfp_fw_version, 706 &info->pfp_fw_feature); 707 if (r) { 708 fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(pfp) failed.\n"); 709 return false; 710 } 711 712 r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_UVD, 0, 0, &uvd_version, &uvd_feature); 713 if (r) { 714 fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(uvd) failed.\n"); 715 return false; 716 } 717 718 r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_VCE, 0, 0, &vce_version, &vce_feature); 719 if (r) { 720 fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(vce) failed.\n"); 721 return false; 722 } 723 724 r = amdgpu_query_sw_info(dev, amdgpu_sw_info_address32_hi, &info->address32_hi); 725 if (r) { 726 fprintf(stderr, "amdgpu: amdgpu_query_sw_info(address32_hi) failed.\n"); 727 return false; 728 } 729 730 struct drm_amdgpu_memory_info meminfo = {0}; 731 732 r = amdgpu_query_info(dev, AMDGPU_INFO_MEMORY, sizeof(meminfo), &meminfo); 733 if (r) { 734 fprintf(stderr, "amdgpu: amdgpu_query_info(memory) failed.\n"); 735 return false; 736 } 737 738 /* Note: usable_heap_size values can be random and can't be relied on. */ 739 info->gart_size_kb = DIV_ROUND_UP(meminfo.gtt.total_heap_size, 1024); 740 info->vram_size_kb = DIV_ROUND_UP(fix_vram_size(meminfo.vram.total_heap_size), 1024); 741 info->vram_vis_size_kb = DIV_ROUND_UP(meminfo.cpu_accessible_vram.total_heap_size, 1024); 742 743 if (info->drm_minor >= 41) { 744 amdgpu_query_video_caps_info(dev, AMDGPU_INFO_VIDEO_CAPS_DECODE, 745 sizeof(info->dec_caps), &(info->dec_caps)); 746 amdgpu_query_video_caps_info(dev, AMDGPU_INFO_VIDEO_CAPS_ENCODE, 747 sizeof(info->enc_caps), &(info->enc_caps)); 748 } 749 750 /* Add some margin of error, though this shouldn't be needed in theory. */ 751 info->all_vram_visible = info->vram_size_kb * 0.9 < info->vram_vis_size_kb; 752 753 /* Set chip identification. */ 754 info->pci_id = device_info.device_id; 755 info->pci_rev_id = device_info.pci_rev; 756 info->vce_harvest_config = device_info.vce_harvest_config; 757 758#define identify_chip2(asic, chipname) \ 759 if (ASICREV_IS(device_info.external_rev, asic)) { \ 760 info->family = CHIP_##chipname; \ 761 info->name = #chipname; \ 762 } 763#define identify_chip(chipname) identify_chip2(chipname, chipname) 764 765 switch (device_info.family) { 766 case FAMILY_SI: 767 identify_chip(TAHITI); 768 identify_chip(PITCAIRN); 769 identify_chip2(CAPEVERDE, VERDE); 770 identify_chip(OLAND); 771 identify_chip(HAINAN); 772 break; 773 case FAMILY_CI: 774 identify_chip(BONAIRE); 775 identify_chip(HAWAII); 776 break; 777 case FAMILY_KV: 778 identify_chip2(SPECTRE, KAVERI); 779 identify_chip2(SPOOKY, KAVERI); 780 identify_chip2(KALINDI, KABINI); 781 identify_chip2(GODAVARI, KABINI); 782 break; 783 case FAMILY_VI: 784 identify_chip(ICELAND); 785 identify_chip(TONGA); 786 identify_chip(FIJI); 787 identify_chip(POLARIS10); 788 identify_chip(POLARIS11); 789 identify_chip(POLARIS12); 790 identify_chip(VEGAM); 791 break; 792 case FAMILY_CZ: 793 identify_chip(CARRIZO); 794 identify_chip(STONEY); 795 break; 796 case FAMILY_AI: 797 identify_chip(VEGA10); 798 identify_chip(VEGA12); 799 identify_chip(VEGA20); 800 identify_chip(ARCTURUS); 801 identify_chip(ALDEBARAN); 802 break; 803 case FAMILY_RV: 804 identify_chip(RAVEN); 805 identify_chip(RAVEN2); 806 identify_chip(RENOIR); 807 break; 808 case FAMILY_NV: 809 identify_chip(NAVI10); 810 identify_chip(NAVI12); 811 identify_chip(NAVI14); 812 identify_chip(NAVI21); 813 identify_chip(NAVI22); 814 identify_chip(NAVI23); 815 identify_chip(NAVI24); 816 break; 817 case FAMILY_VGH: 818 identify_chip(VANGOGH); 819 break; 820 case FAMILY_RMB: 821 identify_chip(REMBRANDT); 822 break; 823 case FAMILY_GC_10_3_6: 824 identify_chip(GFX1036); 825 break; 826 case FAMILY_GC_10_3_7: 827 identify_chip2(GFX1037, GFX1036); 828 break; 829 case FAMILY_GFX1100: 830 identify_chip(GFX1100); 831 identify_chip(GFX1101); 832 identify_chip(GFX1102); 833 break; 834 case FAMILY_GFX1103: 835 identify_chip(GFX1103); 836 break; 837 } 838 839 if (!info->name) { 840 fprintf(stderr, "amdgpu: unknown (family_id, chip_external_rev): (%u, %u)\n", 841 device_info.family, device_info.external_rev); 842 return false; 843 } 844 845 memset(info->lowercase_name, 0, sizeof(info->lowercase_name)); 846 for (unsigned i = 0; info->name[i] && i < ARRAY_SIZE(info->lowercase_name) - 1; i++) 847 info->lowercase_name[i] = tolower(info->name[i]); 848 849 if (info->ip[AMD_IP_GFX].ver_major == 11) 850 info->gfx_level = GFX11; 851 else if (info->ip[AMD_IP_GFX].ver_major == 10 && info->ip[AMD_IP_GFX].ver_minor == 3) 852 info->gfx_level = GFX10_3; 853 else if (info->ip[AMD_IP_GFX].ver_major == 10 && info->ip[AMD_IP_GFX].ver_minor == 1) 854 info->gfx_level = GFX10; 855 else if (info->ip[AMD_IP_GFX].ver_major == 9 || info->ip[AMD_IP_COMPUTE].ver_major == 9) 856 info->gfx_level = GFX9; 857 else if (info->ip[AMD_IP_GFX].ver_major == 8) 858 info->gfx_level = GFX8; 859 else if (info->ip[AMD_IP_GFX].ver_major == 7) 860 info->gfx_level = GFX7; 861 else if (info->ip[AMD_IP_GFX].ver_major == 6) 862 info->gfx_level = GFX6; 863 else { 864 fprintf(stderr, "amdgpu: Unknown gfx version: %u.%u\n", 865 info->ip[AMD_IP_GFX].ver_major, info->ip[AMD_IP_GFX].ver_minor); 866 return false; 867 } 868 869 info->smart_access_memory = info->all_vram_visible && 870 info->gfx_level >= GFX10_3 && 871 util_get_cpu_caps()->family >= CPU_AMD_ZEN3 && 872 util_get_cpu_caps()->family < CPU_AMD_LAST; 873 874 info->family_id = device_info.family; 875 info->chip_external_rev = device_info.external_rev; 876 info->chip_rev = device_info.chip_rev; 877 info->marketing_name = amdgpu_get_marketing_name(dev); 878 info->is_pro_graphics = info->marketing_name && (strstr(info->marketing_name, "Pro") || 879 strstr(info->marketing_name, "PRO") || 880 strstr(info->marketing_name, "Frontier")); 881 882 /* Set which chips have dedicated VRAM. */ 883 info->has_dedicated_vram = !(device_info.ids_flags & AMDGPU_IDS_FLAGS_FUSION); 884 885 /* The kernel can split large buffers in VRAM but not in GTT, so large 886 * allocations can fail or cause buffer movement failures in the kernel. 887 */ 888 if (info->has_dedicated_vram) 889 info->max_heap_size_kb = info->vram_size_kb; 890 else 891 info->max_heap_size_kb = info->gart_size_kb; 892 893 info->vram_type = device_info.vram_type; 894 info->memory_bus_width = device_info.vram_bit_width; 895 896 /* Set which chips have uncached device memory. */ 897 info->has_l2_uncached = info->gfx_level >= GFX9; 898 899 /* Set hardware information. */ 900 /* convert the shader/memory clocks from KHz to MHz */ 901 info->max_gpu_freq_mhz = device_info.max_engine_clock / 1000; 902 info->memory_freq_mhz_effective = info->memory_freq_mhz = device_info.max_memory_clock / 1000; 903 info->max_tcc_blocks = device_info.num_tcc_blocks; 904 info->max_se = device_info.num_shader_engines; 905 info->max_sa_per_se = device_info.num_shader_arrays_per_engine; 906 info->uvd_fw_version = info->ip[AMD_IP_UVD].num_queues ? uvd_version : 0; 907 info->vce_fw_version = info->ip[AMD_IP_VCE].num_queues ? vce_version : 0; 908 909 /* Based on MemoryOpsPerClockTable from PAL. */ 910 switch (info->vram_type) { 911 case AMDGPU_VRAM_TYPE_DDR2: 912 case AMDGPU_VRAM_TYPE_DDR3: 913 case AMDGPU_VRAM_TYPE_DDR4: /* same for LPDDR4 */ 914 case AMDGPU_VRAM_TYPE_HBM: /* same for HBM2 and HBM3 */ 915 info->memory_freq_mhz_effective *= 2; 916 break; 917 case AMDGPU_VRAM_TYPE_DDR5: /* same for LPDDR5 */ 918 case AMDGPU_VRAM_TYPE_GDDR5: 919 info->memory_freq_mhz_effective *= 4; 920 break; 921 case AMDGPU_VRAM_TYPE_GDDR6: 922 info->memory_freq_mhz_effective *= 16; 923 break; 924 } 925 926 /* unified ring */ 927 info->has_video_hw.vcn_decode 928 = info->family >= CHIP_GFX1100 929 ? info->ip[AMD_IP_VCN_UNIFIED].num_queues != 0 930 : info->ip[AMD_IP_VCN_DEC].num_queues != 0; 931 info->has_userptr = true; 932 info->has_syncobj = has_syncobj(fd); 933 info->has_timeline_syncobj = has_timeline_syncobj(fd); 934 info->has_fence_to_handle = info->has_syncobj && info->drm_minor >= 21; 935 info->has_local_buffers = info->drm_minor >= 20; 936 info->has_bo_metadata = true; 937 info->has_eqaa_surface_allocator = info->gfx_level < GFX11; 938 /* Disable sparse mappings on GFX6 due to VM faults in CP DMA. Enable them once 939 * these faults are mitigated in software. 940 */ 941 info->has_sparse_vm_mappings = info->gfx_level >= GFX7; 942 info->has_scheduled_fence_dependency = info->drm_minor >= 28; 943 info->mid_command_buffer_preemption_enabled = device_info.ids_flags & AMDGPU_IDS_FLAGS_PREEMPTION; 944 info->has_tmz_support = has_tmz_support(dev, info, device_info.ids_flags); 945 info->kernel_has_modifiers = has_modifiers(fd); 946 info->has_graphics = info->ip[AMD_IP_GFX].num_queues > 0; 947 948 info->pa_sc_tile_steering_override = device_info.pa_sc_tile_steering_override; 949 info->max_render_backends = device_info.num_rb_pipes; 950 /* The value returned by the kernel driver was wrong. */ 951 if (info->family == CHIP_KAVERI) 952 info->max_render_backends = 2; 953 954 info->clock_crystal_freq = device_info.gpu_counter_freq; 955 if (!info->clock_crystal_freq) { 956 fprintf(stderr, "amdgpu: clock crystal frequency is 0, timestamps will be wrong\n"); 957 info->clock_crystal_freq = 1; 958 } 959 if (info->gfx_level >= GFX10) { 960 info->tcc_cache_line_size = 128; 961 962 if (info->drm_minor >= 35) { 963 info->num_tcc_blocks = info->max_tcc_blocks - util_bitcount64(device_info.tcc_disabled_mask); 964 } else { 965 /* This is a hack, but it's all we can do without a kernel upgrade. */ 966 info->num_tcc_blocks = info->vram_size_kb / (512 * 1024); 967 if (info->num_tcc_blocks > info->max_tcc_blocks) 968 info->num_tcc_blocks /= 2; 969 } 970 } else { 971 if (!info->has_graphics && info->family >= CHIP_ALDEBARAN) 972 info->tcc_cache_line_size = 128; 973 else 974 info->tcc_cache_line_size = 64; 975 976 info->num_tcc_blocks = info->max_tcc_blocks; 977 } 978 979 info->tcc_rb_non_coherent = !util_is_power_of_two_or_zero(info->num_tcc_blocks); 980 981 switch (info->family) { 982 case CHIP_TAHITI: 983 case CHIP_PITCAIRN: 984 case CHIP_OLAND: 985 case CHIP_HAWAII: 986 case CHIP_KABINI: 987 case CHIP_TONGA: 988 case CHIP_STONEY: 989 case CHIP_RAVEN2: 990 info->l2_cache_size = info->num_tcc_blocks * 64 * 1024; 991 break; 992 case CHIP_VERDE: 993 case CHIP_HAINAN: 994 case CHIP_BONAIRE: 995 case CHIP_KAVERI: 996 case CHIP_ICELAND: 997 case CHIP_CARRIZO: 998 case CHIP_FIJI: 999 case CHIP_POLARIS12: 1000 case CHIP_VEGAM: 1001 info->l2_cache_size = info->num_tcc_blocks * 128 * 1024; 1002 break; 1003 default: 1004 info->l2_cache_size = info->num_tcc_blocks * 256 * 1024; 1005 break; 1006 case CHIP_REMBRANDT: 1007 info->l2_cache_size = info->num_tcc_blocks * 512 * 1024; 1008 break; 1009 } 1010 1011 info->l1_cache_size = 16384; 1012 1013 info->mc_arb_ramcfg = amdinfo.mc_arb_ramcfg; 1014 info->gb_addr_config = amdinfo.gb_addr_cfg; 1015 if (info->gfx_level >= GFX9) { 1016 info->num_tile_pipes = 1 << G_0098F8_NUM_PIPES(info->gb_addr_config); 1017 info->pipe_interleave_bytes = 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config); 1018 } else { 1019 info->num_tile_pipes = cik_get_num_tile_pipes(&amdinfo); 1020 info->pipe_interleave_bytes = 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(info->gb_addr_config); 1021 } 1022 info->r600_has_virtual_memory = true; 1023 1024 /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above 1025 * 16KB makes some SIMDs unoccupied). 1026 * 1027 * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used. 1028 */ 1029 info->lds_size_per_workgroup = info->gfx_level >= GFX10 ? 128 * 1024 : 64 * 1024; 1030 /* lds_encode_granularity is the block size used for encoding registers. 1031 * lds_alloc_granularity is what the hardware will align the LDS size to. 1032 */ 1033 info->lds_encode_granularity = info->gfx_level >= GFX7 ? 128 * 4 : 64 * 4; 1034 info->lds_alloc_granularity = info->gfx_level >= GFX10_3 ? 256 * 4 : info->lds_encode_granularity; 1035 1036 /* This is "align_mask" copied from the kernel, maximums of all IP versions. */ 1037 info->ib_pad_dw_mask[AMD_IP_GFX] = 0xff; 1038 info->ib_pad_dw_mask[AMD_IP_COMPUTE] = 0xff; 1039 info->ib_pad_dw_mask[AMD_IP_SDMA] = 0xf; 1040 info->ib_pad_dw_mask[AMD_IP_UVD] = 0xf; 1041 info->ib_pad_dw_mask[AMD_IP_VCE] = 0x3f; 1042 info->ib_pad_dw_mask[AMD_IP_UVD_ENC] = 0x3f; 1043 info->ib_pad_dw_mask[AMD_IP_VCN_DEC] = 0xf; 1044 info->ib_pad_dw_mask[AMD_IP_VCN_ENC] = 0x3f; 1045 info->ib_pad_dw_mask[AMD_IP_VCN_JPEG] = 0xf; 1046 1047 /* The mere presence of CLEAR_STATE in the IB causes random GPU hangs 1048 * on GFX6. Some CLEAR_STATE cause asic hang on radeon kernel, etc. 1049 * SPI_VS_OUT_CONFIG. So only enable GFX7 CLEAR_STATE on amdgpu kernel. 1050 */ 1051 info->has_clear_state = info->gfx_level >= GFX7; 1052 1053 info->has_distributed_tess = 1054 info->gfx_level >= GFX10 || (info->gfx_level >= GFX8 && info->max_se >= 2); 1055 1056 info->has_dcc_constant_encode = 1057 info->family == CHIP_RAVEN2 || info->family == CHIP_RENOIR || info->gfx_level >= GFX10; 1058 1059 info->has_rbplus = info->family == CHIP_STONEY || info->gfx_level >= GFX9; 1060 1061 /* Some chips have RB+ registers, but don't support RB+. Those must 1062 * always disable it. 1063 */ 1064 info->rbplus_allowed = 1065 info->has_rbplus && 1066 (info->family == CHIP_STONEY || info->family == CHIP_VEGA12 || info->family == CHIP_RAVEN || 1067 info->family == CHIP_RAVEN2 || info->family == CHIP_RENOIR || info->gfx_level >= GFX10_3); 1068 1069 info->has_out_of_order_rast = 1070 info->gfx_level >= GFX8 && info->gfx_level <= GFX9 && info->max_se >= 2; 1071 1072 /* Whether chips support double rate packed math instructions. */ 1073 info->has_packed_math_16bit = info->gfx_level >= GFX9; 1074 1075 /* Whether chips support dot product instructions. A subset of these support a smaller 1076 * instruction encoding which accumulates with the destination. 1077 */ 1078 info->has_accelerated_dot_product = 1079 info->family == CHIP_ARCTURUS || info->family == CHIP_ALDEBARAN || 1080 info->family == CHIP_VEGA20 || info->family >= CHIP_NAVI12; 1081 1082 /* TODO: Figure out how to use LOAD_CONTEXT_REG on GFX6-GFX7. */ 1083 info->has_load_ctx_reg_pkt = 1084 info->gfx_level >= GFX9 || (info->gfx_level >= GFX8 && info->me_fw_feature >= 41); 1085 1086 info->cpdma_prefetch_writes_memory = info->gfx_level <= GFX8; 1087 1088 info->has_gfx9_scissor_bug = info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN; 1089 1090 info->has_tc_compat_zrange_bug = info->gfx_level >= GFX8 && info->gfx_level <= GFX9; 1091 1092 info->has_msaa_sample_loc_bug = 1093 (info->family >= CHIP_POLARIS10 && info->family <= CHIP_POLARIS12) || 1094 info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN; 1095 1096 info->has_ls_vgpr_init_bug = info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN; 1097 1098 /* Drawing from 0-sized index buffers causes hangs on gfx10. */ 1099 info->has_zero_index_buffer_bug = info->gfx_level == GFX10; 1100 1101 /* Whether chips are affected by the image load/sample/gather hw bug when 1102 * DCC is enabled (ie. WRITE_COMPRESS_ENABLE should be 0). 1103 */ 1104 info->has_image_load_dcc_bug = info->family == CHIP_NAVI23 || 1105 info->family == CHIP_VANGOGH || 1106 info->family == CHIP_REMBRANDT; 1107 1108 /* DB has a bug when ITERATE_256 is set to 1 that can cause a hang. The 1109 * workaround is to set DECOMPRESS_ON_Z_PLANES to 2 for 4X MSAA D/S images. 1110 */ 1111 info->has_two_planes_iterate256_bug = info->gfx_level == GFX10; 1112 1113 /* GFX10+Navi21: NGG->legacy transitions require VGT_FLUSH. */ 1114 info->has_vgt_flush_ngg_legacy_bug = info->gfx_level == GFX10 || 1115 info->family == CHIP_NAVI21; 1116 1117 /* HW bug workaround when CS threadgroups > 256 threads and async compute 1118 * isn't used, i.e. only one compute job can run at a time. If async 1119 * compute is possible, the threadgroup size must be limited to 256 threads 1120 * on all queues to avoid the bug. 1121 * Only GFX6 and certain GFX7 chips are affected. 1122 * 1123 * FIXME: RADV doesn't limit the number of threads for async compute. 1124 */ 1125 info->has_cs_regalloc_hang_bug = info->gfx_level == GFX6 || 1126 info->family == CHIP_BONAIRE || 1127 info->family == CHIP_KABINI; 1128 1129 /* Support for GFX10.3 was added with F32_ME_FEATURE_VERSION_31 but the 1130 * feature version wasn't bumped. 1131 */ 1132 info->has_32bit_predication = (info->gfx_level >= GFX10 && 1133 info->me_fw_feature >= 32) || 1134 (info->gfx_level == GFX9 && 1135 info->me_fw_feature >= 52); 1136 1137 info->has_export_conflict_bug = info->gfx_level == GFX11; 1138 1139 /* Get the number of good compute units. */ 1140 info->num_cu = 0; 1141 for (i = 0; i < info->max_se; i++) { 1142 for (j = 0; j < info->max_sa_per_se; j++) { 1143 if (info->gfx_level >= GFX11) { 1144 assert(info->max_sa_per_se <= 2); 1145 info->cu_mask[i][j] = device_info.cu_bitmap[i % 4][(i / 4) * 2 + j]; 1146 } else if (info->family == CHIP_ARCTURUS) { 1147 /* The CU bitmap in amd gpu info structure is 1148 * 4x4 size array, and it's usually suitable for Vega 1149 * ASICs which has 4*2 SE/SA layout. 1150 * But for Arcturus, SE/SA layout is changed to 8*1. 1151 * To mostly reduce the impact, we make it compatible 1152 * with current bitmap array as below: 1153 * SE4 --> cu_bitmap[0][1] 1154 * SE5 --> cu_bitmap[1][1] 1155 * SE6 --> cu_bitmap[2][1] 1156 * SE7 --> cu_bitmap[3][1] 1157 */ 1158 assert(info->max_sa_per_se == 1); 1159 info->cu_mask[i][0] = device_info.cu_bitmap[i % 4][i / 4]; 1160 } else { 1161 info->cu_mask[i][j] = device_info.cu_bitmap[i][j]; 1162 } 1163 info->num_cu += util_bitcount(info->cu_mask[i][j]); 1164 } 1165 } 1166 1167 /* Derive the number of enabled SEs from the CU mask. */ 1168 if (info->gfx_level >= GFX10_3 && info->max_se > 1) { 1169 info->num_se = 0; 1170 1171 for (unsigned se = 0; se < info->max_se; se++) { 1172 for (unsigned sa = 0; sa < info->max_sa_per_se; sa++) { 1173 if (info->cu_mask[se][sa]) { 1174 info->num_se++; 1175 break; 1176 } 1177 } 1178 } 1179 } else { 1180 /* GFX10 and older always enable all SEs because they don't support SE harvesting. */ 1181 info->num_se = info->max_se; 1182 } 1183 1184 /* On GFX10, only whole WGPs (in units of 2 CUs) can be disabled, 1185 * and max - min <= 2. 1186 */ 1187 unsigned cu_group = info->gfx_level >= GFX10 ? 2 : 1; 1188 info->max_good_cu_per_sa = 1189 DIV_ROUND_UP(info->num_cu, (info->num_se * info->max_sa_per_se * cu_group)) * 1190 cu_group; 1191 info->min_good_cu_per_sa = 1192 (info->num_cu / (info->num_se * info->max_sa_per_se * cu_group)) * cu_group; 1193 1194 memcpy(info->si_tile_mode_array, amdinfo.gb_tile_mode, sizeof(amdinfo.gb_tile_mode)); 1195 info->enabled_rb_mask = amdinfo.enabled_rb_pipes_mask; 1196 1197 memcpy(info->cik_macrotile_mode_array, amdinfo.gb_macro_tile_mode, 1198 sizeof(amdinfo.gb_macro_tile_mode)); 1199 1200 info->pte_fragment_size = alignment_info.size_local; 1201 info->gart_page_size = alignment_info.size_remote; 1202 1203 if (info->gfx_level == GFX6) 1204 info->gfx_ib_pad_with_type2 = true; 1205 1206 /* GFX10 and maybe GFX9 need this alignment for cache coherency. */ 1207 if (info->gfx_level >= GFX9) 1208 info->ib_alignment = MAX2(info->ib_alignment, info->tcc_cache_line_size); 1209 1210 if ((info->drm_minor >= 31 && (info->family == CHIP_RAVEN || info->family == CHIP_RAVEN2 || 1211 info->family == CHIP_RENOIR)) || 1212 info->gfx_level >= GFX10_3) { 1213 /* GFX10+ requires retiling in all cases. */ 1214 if (info->max_render_backends == 1 && info->gfx_level == GFX9) 1215 info->use_display_dcc_unaligned = true; 1216 else 1217 info->use_display_dcc_with_retile_blit = true; 1218 } 1219 1220 info->has_stable_pstate = info->drm_minor >= 45; 1221 1222 if (info->gfx_level >= GFX11) { 1223 info->pc_lines = 1024; 1224 info->pbb_max_alloc_count = 255; /* minimum is 2, maximum is 256 */ 1225 } else if (info->gfx_level >= GFX9 && info->has_graphics) { 1226 unsigned pc_lines = 0; 1227 1228 switch (info->family) { 1229 case CHIP_VEGA10: 1230 case CHIP_VEGA12: 1231 case CHIP_VEGA20: 1232 pc_lines = 2048; 1233 break; 1234 case CHIP_RAVEN: 1235 case CHIP_RAVEN2: 1236 case CHIP_RENOIR: 1237 case CHIP_NAVI10: 1238 case CHIP_NAVI12: 1239 case CHIP_NAVI21: 1240 case CHIP_NAVI22: 1241 case CHIP_NAVI23: 1242 pc_lines = 1024; 1243 break; 1244 case CHIP_NAVI14: 1245 case CHIP_NAVI24: 1246 pc_lines = 512; 1247 break; 1248 case CHIP_VANGOGH: 1249 case CHIP_REMBRANDT: 1250 case CHIP_GFX1036: 1251 pc_lines = 256; 1252 break; 1253 default: 1254 assert(0); 1255 } 1256 1257 info->pc_lines = pc_lines; 1258 1259 if (info->gfx_level >= GFX10) { 1260 info->pbb_max_alloc_count = pc_lines / 3; 1261 } else { 1262 info->pbb_max_alloc_count = MIN2(128, pc_lines / (4 * info->max_se)); 1263 } 1264 } 1265 1266 if (info->gfx_level >= GFX10_3) 1267 info->max_wave64_per_simd = 16; 1268 else if (info->gfx_level == GFX10) 1269 info->max_wave64_per_simd = 20; 1270 else if (info->family >= CHIP_POLARIS10 && info->family <= CHIP_VEGAM) 1271 info->max_wave64_per_simd = 8; 1272 else 1273 info->max_wave64_per_simd = 10; 1274 1275 if (info->gfx_level >= GFX10) { 1276 info->num_physical_sgprs_per_simd = 128 * info->max_wave64_per_simd; 1277 info->min_sgpr_alloc = 128; 1278 info->sgpr_alloc_granularity = 128; 1279 } else if (info->gfx_level >= GFX8) { 1280 info->num_physical_sgprs_per_simd = 800; 1281 info->min_sgpr_alloc = 16; 1282 info->sgpr_alloc_granularity = 16; 1283 } else { 1284 info->num_physical_sgprs_per_simd = 512; 1285 info->min_sgpr_alloc = 8; 1286 info->sgpr_alloc_granularity = 8; 1287 } 1288 1289 info->has_3d_cube_border_color_mipmap = info->has_graphics || info->family == CHIP_ARCTURUS; 1290 info->never_stop_sq_perf_counters = info->gfx_level == GFX10 || 1291 info->gfx_level == GFX10_3; 1292 info->never_send_perfcounter_stop = info->gfx_level == GFX11; 1293 info->has_sqtt_rb_harvest_bug = (info->family == CHIP_NAVI23 || 1294 info->family == CHIP_NAVI24 || 1295 info->family == CHIP_REMBRANDT || 1296 info->family == CHIP_VANGOGH) && 1297 util_bitcount(info->enabled_rb_mask) != 1298 info->max_render_backends; 1299 1300 /* On GFX10.3, the polarity of AUTO_FLUSH_MODE is inverted. */ 1301 info->has_sqtt_auto_flush_mode_bug = info->gfx_level == GFX10_3; 1302 1303 info->max_sgpr_alloc = info->family == CHIP_TONGA || info->family == CHIP_ICELAND ? 96 : 104; 1304 1305 if (!info->has_graphics && info->family >= CHIP_ALDEBARAN) { 1306 info->min_wave64_vgpr_alloc = 8; 1307 info->max_vgpr_alloc = 512; 1308 info->wave64_vgpr_alloc_granularity = 8; 1309 } else { 1310 info->min_wave64_vgpr_alloc = 4; 1311 info->max_vgpr_alloc = 256; 1312 info->wave64_vgpr_alloc_granularity = 4; 1313 } 1314 1315 info->num_physical_wave64_vgprs_per_simd = info->gfx_level >= GFX10 ? 512 : 256; 1316 info->num_simd_per_compute_unit = info->gfx_level >= GFX10 ? 2 : 4; 1317 1318 /* BIG_PAGE is supported since gfx10.3 and requires VRAM. VRAM is only guaranteed 1319 * with AMDGPU_GEM_CREATE_DISCARDABLE. DISCARDABLE was added in DRM 3.47.0. 1320 */ 1321 info->discardable_allows_big_page = info->gfx_level >= GFX10_3 && 1322 info->has_dedicated_vram && 1323 info->drm_minor >= 47; 1324 1325 /* The maximum number of scratch waves. The number is only a function of the number of CUs. 1326 * It should be large enough to hold at least 1 threadgroup. Use the minimum per-SA CU count. 1327 * 1328 * We can decrease the number to make it fit into the infinity cache. 1329 */ 1330 const unsigned max_waves_per_tg = 32; /* 1024 threads in Wave32 */ 1331 info->max_scratch_waves = MAX2(32 * info->min_good_cu_per_sa * info->max_sa_per_se * info->num_se, 1332 max_waves_per_tg); 1333 info->num_rb = util_bitcount(info->enabled_rb_mask); 1334 info->max_gflops = info->num_cu * 128 * info->max_gpu_freq_mhz / 1000; 1335 info->memory_bandwidth_gbps = DIV_ROUND_UP(info->memory_freq_mhz_effective * info->memory_bus_width / 8, 1000); 1336 1337 if (info->gfx_level >= GFX10_3 && info->has_dedicated_vram) { 1338 info->l3_cache_size_mb = info->num_tcc_blocks * 1339 (info->family == CHIP_NAVI21 || 1340 info->family == CHIP_NAVI22 ? 8 : 4); 1341 } 1342 1343 set_custom_cu_en_mask(info); 1344 1345 const char *ib_filename = debug_get_option("AMD_PARSE_IB", NULL); 1346 if (ib_filename) { 1347 FILE *f = fopen(ib_filename, "r"); 1348 if (f) { 1349 fseek(f, 0, SEEK_END); 1350 size_t size = ftell(f); 1351 uint32_t *ib = (uint32_t *)malloc(size); 1352 fseek(f, 0, SEEK_SET); 1353 size_t n_read = fread(ib, 1, size, f); 1354 fclose(f); 1355 1356 if (n_read != size) { 1357 fprintf(stderr, "failed to read %zu bytes from '%s'\n", size, ib_filename); 1358 exit(1); 1359 } 1360 1361 ac_parse_ib(stdout, ib, size / 4, NULL, 0, "IB", info->gfx_level, NULL, NULL); 1362 free(ib); 1363 exit(0); 1364 } 1365 } 1366 return true; 1367} 1368 1369void ac_compute_driver_uuid(char *uuid, size_t size) 1370{ 1371 char amd_uuid[] = "AMD-MESA-DRV"; 1372 1373 assert(size >= sizeof(amd_uuid)); 1374 1375 memset(uuid, 0, size); 1376 strncpy(uuid, amd_uuid, size); 1377} 1378 1379void ac_compute_device_uuid(struct radeon_info *info, char *uuid, size_t size) 1380{ 1381 uint32_t *uint_uuid = (uint32_t *)uuid; 1382 1383 assert(size >= sizeof(uint32_t) * 4); 1384 1385 /** 1386 * Use the device info directly instead of using a sha1. GL/VK UUIDs 1387 * are 16 byte vs 20 byte for sha1, and the truncation that would be 1388 * required would get rid of part of the little entropy we have. 1389 * */ 1390 memset(uuid, 0, size); 1391 uint_uuid[0] = info->pci_domain; 1392 uint_uuid[1] = info->pci_bus; 1393 uint_uuid[2] = info->pci_dev; 1394 uint_uuid[3] = info->pci_func; 1395} 1396 1397void ac_print_gpu_info(struct radeon_info *info, FILE *f) 1398{ 1399 fprintf(f, "Device info:\n"); 1400 fprintf(f, " name = %s\n", info->name); 1401 fprintf(f, " marketing_name = %s\n", info->marketing_name); 1402 fprintf(f, " num_se = %i\n", info->num_se); 1403 fprintf(f, " num_rb = %i\n", info->num_rb); 1404 fprintf(f, " num_cu = %i\n", info->num_cu); 1405 fprintf(f, " max_gpu_freq = %i MHz\n", info->max_gpu_freq_mhz); 1406 fprintf(f, " max_gflops = %u GFLOPS\n", info->max_gflops); 1407 1408 if (info->gfx_level >= GFX10) { 1409 fprintf(f, " l0_cache_size = %i KB\n", DIV_ROUND_UP(info->l1_cache_size, 1024)); 1410 fprintf(f, " l1_cache_size = %i KB\n", 128); 1411 } else { 1412 fprintf(f, " l1_cache_size = %i KB\n", DIV_ROUND_UP(info->l1_cache_size, 1024)); 1413 } 1414 1415 fprintf(f, " l2_cache_size = %i KB\n", DIV_ROUND_UP(info->l2_cache_size, 1024)); 1416 1417 if (info->l3_cache_size_mb) 1418 fprintf(f, " l3_cache_size = %i MB\n", info->l3_cache_size_mb); 1419 1420 fprintf(f, " memory_channels = %u (TCC blocks)\n", info->num_tcc_blocks); 1421 fprintf(f, " memory_size = %u GB (%u MB)\n", 1422 DIV_ROUND_UP(info->vram_size_kb, (1024 * 1024)), 1423 DIV_ROUND_UP(info->vram_size_kb, 1024)); 1424 fprintf(f, " memory_freq = %u GHz\n", DIV_ROUND_UP(info->memory_freq_mhz_effective, 1000)); 1425 fprintf(f, " memory_bus_width = %u bits\n", info->memory_bus_width); 1426 fprintf(f, " memory_bandwidth = %u GB/s\n", info->memory_bandwidth_gbps); 1427 fprintf(f, " clock_crystal_freq = %i KHz\n", info->clock_crystal_freq); 1428 1429 const char *ip_string[] = { 1430 [AMD_IP_GFX] = "GFX", 1431 [AMD_IP_COMPUTE] = "COMP", 1432 [AMD_IP_SDMA] = "SDMA", 1433 [AMD_IP_UVD] = "UVD", 1434 [AMD_IP_VCE] = "VCE", 1435 [AMD_IP_UVD_ENC] = "UVD_ENC", 1436 [AMD_IP_VCN_DEC] = "VCN_DEC", 1437 [AMD_IP_VCN_ENC] = info->family >= CHIP_GFX1100 ? "VCN" : "VCN_ENC", 1438 [AMD_IP_VCN_JPEG] = "VCN_JPG", 1439 }; 1440 1441 for (unsigned i = 0; i < AMD_NUM_IP_TYPES; i++) { 1442 if (info->ip[i].num_queues) { 1443 fprintf(f, " IP %-7s %2u.%u \tqueues:%u\n", ip_string[i], 1444 info->ip[i].ver_major, info->ip[i].ver_minor, info->ip[i].num_queues); 1445 } 1446 } 1447 1448 fprintf(f, "Identification:\n"); 1449 fprintf(f, " pci (domain:bus:dev.func): %04x:%02x:%02x.%x\n", info->pci_domain, info->pci_bus, 1450 info->pci_dev, info->pci_func); 1451 fprintf(f, " pci_id = 0x%x\n", info->pci_id); 1452 fprintf(f, " pci_rev_id = 0x%x\n", info->pci_rev_id); 1453 fprintf(f, " family = %i\n", info->family); 1454 fprintf(f, " gfx_level = %i\n", info->gfx_level); 1455 fprintf(f, " family_id = %i\n", info->family_id); 1456 fprintf(f, " chip_external_rev = %i\n", info->chip_external_rev); 1457 fprintf(f, " chip_rev = %i\n", info->chip_rev); 1458 1459 fprintf(f, "Flags:\n"); 1460 fprintf(f, " is_pro_graphics = %u\n", info->is_pro_graphics); 1461 fprintf(f, " has_graphics = %i\n", info->has_graphics); 1462 fprintf(f, " has_clear_state = %u\n", info->has_clear_state); 1463 fprintf(f, " has_distributed_tess = %u\n", info->has_distributed_tess); 1464 fprintf(f, " has_dcc_constant_encode = %u\n", info->has_dcc_constant_encode); 1465 fprintf(f, " has_rbplus = %u\n", info->has_rbplus); 1466 fprintf(f, " rbplus_allowed = %u\n", info->rbplus_allowed); 1467 fprintf(f, " has_load_ctx_reg_pkt = %u\n", info->has_load_ctx_reg_pkt); 1468 fprintf(f, " has_out_of_order_rast = %u\n", info->has_out_of_order_rast); 1469 fprintf(f, " cpdma_prefetch_writes_memory = %u\n", info->cpdma_prefetch_writes_memory); 1470 fprintf(f, " has_gfx9_scissor_bug = %i\n", info->has_gfx9_scissor_bug); 1471 fprintf(f, " has_tc_compat_zrange_bug = %i\n", info->has_tc_compat_zrange_bug); 1472 fprintf(f, " has_msaa_sample_loc_bug = %i\n", info->has_msaa_sample_loc_bug); 1473 fprintf(f, " has_ls_vgpr_init_bug = %i\n", info->has_ls_vgpr_init_bug); 1474 fprintf(f, " has_32bit_predication = %i\n", info->has_32bit_predication); 1475 fprintf(f, " has_3d_cube_border_color_mipmap = %i\n", info->has_3d_cube_border_color_mipmap); 1476 fprintf(f, " never_stop_sq_perf_counters = %i\n", info->never_stop_sq_perf_counters); 1477 fprintf(f, " has_sqtt_rb_harvest_bug = %i\n", info->has_sqtt_rb_harvest_bug); 1478 fprintf(f, " has_sqtt_auto_flush_mode_bug = %i\n", info->has_sqtt_auto_flush_mode_bug); 1479 fprintf(f, " never_send_perfcounter_stop = %i\n", info->never_send_perfcounter_stop); 1480 fprintf(f, " discardable_allows_big_page = %i\n", info->discardable_allows_big_page); 1481 1482 fprintf(f, "Display features:\n"); 1483 fprintf(f, " use_display_dcc_unaligned = %u\n", info->use_display_dcc_unaligned); 1484 fprintf(f, " use_display_dcc_with_retile_blit = %u\n", info->use_display_dcc_with_retile_blit); 1485 1486 fprintf(f, "Memory info:\n"); 1487 fprintf(f, " pte_fragment_size = %u\n", info->pte_fragment_size); 1488 fprintf(f, " gart_page_size = %u\n", info->gart_page_size); 1489 fprintf(f, " gart_size = %i MB\n", (int)DIV_ROUND_UP(info->gart_size_kb, 1024)); 1490 fprintf(f, " vram_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_size_kb, 1024)); 1491 fprintf(f, " vram_vis_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_vis_size_kb, 1024)); 1492 fprintf(f, " vram_type = %i\n", info->vram_type); 1493 fprintf(f, " max_heap_size_kb = %i MB\n", (int)DIV_ROUND_UP(info->max_heap_size_kb, 1024)); 1494 fprintf(f, " min_alloc_size = %u\n", info->min_alloc_size); 1495 fprintf(f, " address32_hi = 0x%x\n", info->address32_hi); 1496 fprintf(f, " has_dedicated_vram = %u\n", info->has_dedicated_vram); 1497 fprintf(f, " all_vram_visible = %u\n", info->all_vram_visible); 1498 fprintf(f, " smart_access_memory = %u\n", info->smart_access_memory); 1499 fprintf(f, " max_tcc_blocks = %i\n", info->max_tcc_blocks); 1500 fprintf(f, " tcc_cache_line_size = %u\n", info->tcc_cache_line_size); 1501 fprintf(f, " tcc_rb_non_coherent = %u\n", info->tcc_rb_non_coherent); 1502 fprintf(f, " pc_lines = %u\n", info->pc_lines); 1503 fprintf(f, " lds_size_per_workgroup = %u\n", info->lds_size_per_workgroup); 1504 fprintf(f, " lds_alloc_granularity = %i\n", info->lds_alloc_granularity); 1505 fprintf(f, " lds_encode_granularity = %i\n", info->lds_encode_granularity); 1506 fprintf(f, " max_memory_clock = %i MHz\n", info->memory_freq_mhz); 1507 1508 fprintf(f, "CP info:\n"); 1509 fprintf(f, " gfx_ib_pad_with_type2 = %i\n", info->gfx_ib_pad_with_type2); 1510 fprintf(f, " ib_alignment = %u\n", info->ib_alignment); 1511 fprintf(f, " me_fw_version = %i\n", info->me_fw_version); 1512 fprintf(f, " me_fw_feature = %i\n", info->me_fw_feature); 1513 fprintf(f, " mec_fw_version = %i\n", info->mec_fw_version); 1514 fprintf(f, " mec_fw_feature = %i\n", info->mec_fw_feature); 1515 fprintf(f, " pfp_fw_version = %i\n", info->pfp_fw_version); 1516 fprintf(f, " pfp_fw_feature = %i\n", info->pfp_fw_feature); 1517 1518 fprintf(f, "Multimedia info:\n"); 1519 fprintf(f, " vce_encode = %u\n", info->ip[AMD_IP_VCE].num_queues); 1520 1521 if (info->family >= CHIP_GFX1100) 1522 fprintf(f, " vcn_unified = %u\n", info->has_video_hw.vcn_decode); 1523 else { 1524 fprintf(f, " vcn_decode = %u\n", info->has_video_hw.vcn_decode); 1525 fprintf(f, " vcn_encode = %u\n", info->ip[AMD_IP_VCN_ENC].num_queues); 1526 } 1527 1528 fprintf(f, " uvd_fw_version = %u\n", info->uvd_fw_version); 1529 fprintf(f, " vce_fw_version = %u\n", info->vce_fw_version); 1530 fprintf(f, " vce_harvest_config = %i\n", info->vce_harvest_config); 1531 1532 fprintf(f, "Kernel & winsys capabilities:\n"); 1533 fprintf(f, " drm = %i.%i.%i\n", info->drm_major, info->drm_minor, info->drm_patchlevel); 1534 fprintf(f, " has_userptr = %i\n", info->has_userptr); 1535 fprintf(f, " has_syncobj = %u\n", info->has_syncobj); 1536 fprintf(f, " has_timeline_syncobj = %u\n", info->has_timeline_syncobj); 1537 fprintf(f, " has_fence_to_handle = %u\n", info->has_fence_to_handle); 1538 fprintf(f, " has_local_buffers = %u\n", info->has_local_buffers); 1539 fprintf(f, " has_bo_metadata = %u\n", info->has_bo_metadata); 1540 fprintf(f, " has_eqaa_surface_allocator = %u\n", info->has_eqaa_surface_allocator); 1541 fprintf(f, " has_sparse_vm_mappings = %u\n", info->has_sparse_vm_mappings); 1542 fprintf(f, " has_stable_pstate = %u\n", info->has_stable_pstate); 1543 fprintf(f, " has_scheduled_fence_dependency = %u\n", info->has_scheduled_fence_dependency); 1544 fprintf(f, " mid_command_buffer_preemption_enabled = %u\n", 1545 info->mid_command_buffer_preemption_enabled); 1546 fprintf(f, " has_tmz_support = %u\n", info->has_tmz_support); 1547 1548 fprintf(f, "Shader core info:\n"); 1549 for (unsigned i = 0; i < info->max_se; i++) { 1550 for (unsigned j = 0; j < info->max_sa_per_se; j++) { 1551 fprintf(f, " cu_mask[SE%u][SA%u] = 0x%x \t(%u)\tCU_EN = 0x%x\n", i, j, 1552 info->cu_mask[i][j], util_bitcount(info->cu_mask[i][j]), 1553 info->spi_cu_en & BITFIELD_MASK(util_bitcount(info->cu_mask[i][j]))); 1554 } 1555 } 1556 fprintf(f, " spi_cu_en_has_effect = %i\n", info->spi_cu_en_has_effect); 1557 fprintf(f, " max_good_cu_per_sa = %i\n", info->max_good_cu_per_sa); 1558 fprintf(f, " min_good_cu_per_sa = %i\n", info->min_good_cu_per_sa); 1559 fprintf(f, " max_se = %i\n", info->max_se); 1560 fprintf(f, " max_sa_per_se = %i\n", info->max_sa_per_se); 1561 fprintf(f, " max_wave64_per_simd = %i\n", info->max_wave64_per_simd); 1562 fprintf(f, " num_physical_sgprs_per_simd = %i\n", info->num_physical_sgprs_per_simd); 1563 fprintf(f, " num_physical_wave64_vgprs_per_simd = %i\n", 1564 info->num_physical_wave64_vgprs_per_simd); 1565 fprintf(f, " num_simd_per_compute_unit = %i\n", info->num_simd_per_compute_unit); 1566 fprintf(f, " min_sgpr_alloc = %i\n", info->min_sgpr_alloc); 1567 fprintf(f, " max_sgpr_alloc = %i\n", info->max_sgpr_alloc); 1568 fprintf(f, " sgpr_alloc_granularity = %i\n", info->sgpr_alloc_granularity); 1569 fprintf(f, " min_wave64_vgpr_alloc = %i\n", info->min_wave64_vgpr_alloc); 1570 fprintf(f, " max_vgpr_alloc = %i\n", info->max_vgpr_alloc); 1571 fprintf(f, " wave64_vgpr_alloc_granularity = %i\n", info->wave64_vgpr_alloc_granularity); 1572 fprintf(f, " max_scratch_waves = %i\n", info->max_scratch_waves); 1573 1574 fprintf(f, "Render backend info:\n"); 1575 fprintf(f, " pa_sc_tile_steering_override = 0x%x\n", info->pa_sc_tile_steering_override); 1576 fprintf(f, " max_render_backends = %i\n", info->max_render_backends); 1577 fprintf(f, " num_tile_pipes = %i\n", info->num_tile_pipes); 1578 fprintf(f, " pipe_interleave_bytes = %i\n", info->pipe_interleave_bytes); 1579 fprintf(f, " enabled_rb_mask = 0x%x\n", info->enabled_rb_mask); 1580 fprintf(f, " max_alignment = %u\n", (unsigned)info->max_alignment); 1581 fprintf(f, " pbb_max_alloc_count = %u\n", info->pbb_max_alloc_count); 1582 1583 fprintf(f, "GB_ADDR_CONFIG: 0x%08x\n", info->gb_addr_config); 1584 if (info->gfx_level >= GFX10) { 1585 fprintf(f, " num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config)); 1586 fprintf(f, " pipe_interleave_size = %u\n", 1587 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config)); 1588 fprintf(f, " max_compressed_frags = %u\n", 1589 1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config)); 1590 if (info->gfx_level >= GFX10_3) 1591 fprintf(f, " num_pkrs = %u\n", 1 << G_0098F8_NUM_PKRS(info->gb_addr_config)); 1592 } else if (info->gfx_level == GFX9) { 1593 fprintf(f, " num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config)); 1594 fprintf(f, " pipe_interleave_size = %u\n", 1595 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config)); 1596 fprintf(f, " max_compressed_frags = %u\n", 1597 1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config)); 1598 fprintf(f, " bank_interleave_size = %u\n", 1599 1 << G_0098F8_BANK_INTERLEAVE_SIZE(info->gb_addr_config)); 1600 fprintf(f, " num_banks = %u\n", 1 << G_0098F8_NUM_BANKS(info->gb_addr_config)); 1601 fprintf(f, " shader_engine_tile_size = %u\n", 1602 16 << G_0098F8_SHADER_ENGINE_TILE_SIZE(info->gb_addr_config)); 1603 fprintf(f, " num_shader_engines = %u\n", 1604 1 << G_0098F8_NUM_SHADER_ENGINES_GFX9(info->gb_addr_config)); 1605 fprintf(f, " num_gpus = %u (raw)\n", G_0098F8_NUM_GPUS_GFX9(info->gb_addr_config)); 1606 fprintf(f, " multi_gpu_tile_size = %u (raw)\n", 1607 G_0098F8_MULTI_GPU_TILE_SIZE(info->gb_addr_config)); 1608 fprintf(f, " num_rb_per_se = %u\n", 1 << G_0098F8_NUM_RB_PER_SE(info->gb_addr_config)); 1609 fprintf(f, " row_size = %u\n", 1024 << G_0098F8_ROW_SIZE(info->gb_addr_config)); 1610 fprintf(f, " num_lower_pipes = %u (raw)\n", G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config)); 1611 fprintf(f, " se_enable = %u (raw)\n", G_0098F8_SE_ENABLE(info->gb_addr_config)); 1612 } else { 1613 fprintf(f, " num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config)); 1614 fprintf(f, " pipe_interleave_size = %u\n", 1615 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(info->gb_addr_config)); 1616 fprintf(f, " bank_interleave_size = %u\n", 1617 1 << G_0098F8_BANK_INTERLEAVE_SIZE(info->gb_addr_config)); 1618 fprintf(f, " num_shader_engines = %u\n", 1619 1 << G_0098F8_NUM_SHADER_ENGINES_GFX6(info->gb_addr_config)); 1620 fprintf(f, " shader_engine_tile_size = %u\n", 1621 16 << G_0098F8_SHADER_ENGINE_TILE_SIZE(info->gb_addr_config)); 1622 fprintf(f, " num_gpus = %u (raw)\n", G_0098F8_NUM_GPUS_GFX6(info->gb_addr_config)); 1623 fprintf(f, " multi_gpu_tile_size = %u (raw)\n", 1624 G_0098F8_MULTI_GPU_TILE_SIZE(info->gb_addr_config)); 1625 fprintf(f, " row_size = %u\n", 1024 << G_0098F8_ROW_SIZE(info->gb_addr_config)); 1626 fprintf(f, " num_lower_pipes = %u (raw)\n", G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config)); 1627 } 1628} 1629 1630int ac_get_gs_table_depth(enum amd_gfx_level gfx_level, enum radeon_family family) 1631{ 1632 if (gfx_level >= GFX9) 1633 return -1; 1634 1635 switch (family) { 1636 case CHIP_OLAND: 1637 case CHIP_HAINAN: 1638 case CHIP_KAVERI: 1639 case CHIP_KABINI: 1640 case CHIP_ICELAND: 1641 case CHIP_CARRIZO: 1642 case CHIP_STONEY: 1643 return 16; 1644 case CHIP_TAHITI: 1645 case CHIP_PITCAIRN: 1646 case CHIP_VERDE: 1647 case CHIP_BONAIRE: 1648 case CHIP_HAWAII: 1649 case CHIP_TONGA: 1650 case CHIP_FIJI: 1651 case CHIP_POLARIS10: 1652 case CHIP_POLARIS11: 1653 case CHIP_POLARIS12: 1654 case CHIP_VEGAM: 1655 return 32; 1656 default: 1657 unreachable("Unknown GPU"); 1658 } 1659} 1660 1661void ac_get_raster_config(struct radeon_info *info, uint32_t *raster_config_p, 1662 uint32_t *raster_config_1_p, uint32_t *se_tile_repeat_p) 1663{ 1664 unsigned raster_config, raster_config_1, se_tile_repeat; 1665 1666 switch (info->family) { 1667 /* 1 SE / 1 RB */ 1668 case CHIP_HAINAN: 1669 case CHIP_KABINI: 1670 case CHIP_STONEY: 1671 raster_config = 0x00000000; 1672 raster_config_1 = 0x00000000; 1673 break; 1674 /* 1 SE / 4 RBs */ 1675 case CHIP_VERDE: 1676 raster_config = 0x0000124a; 1677 raster_config_1 = 0x00000000; 1678 break; 1679 /* 1 SE / 2 RBs (Oland is special) */ 1680 case CHIP_OLAND: 1681 raster_config = 0x00000082; 1682 raster_config_1 = 0x00000000; 1683 break; 1684 /* 1 SE / 2 RBs */ 1685 case CHIP_KAVERI: 1686 case CHIP_ICELAND: 1687 case CHIP_CARRIZO: 1688 raster_config = 0x00000002; 1689 raster_config_1 = 0x00000000; 1690 break; 1691 /* 2 SEs / 4 RBs */ 1692 case CHIP_BONAIRE: 1693 case CHIP_POLARIS11: 1694 case CHIP_POLARIS12: 1695 raster_config = 0x16000012; 1696 raster_config_1 = 0x00000000; 1697 break; 1698 /* 2 SEs / 8 RBs */ 1699 case CHIP_TAHITI: 1700 case CHIP_PITCAIRN: 1701 raster_config = 0x2a00126a; 1702 raster_config_1 = 0x00000000; 1703 break; 1704 /* 4 SEs / 8 RBs */ 1705 case CHIP_TONGA: 1706 case CHIP_POLARIS10: 1707 raster_config = 0x16000012; 1708 raster_config_1 = 0x0000002a; 1709 break; 1710 /* 4 SEs / 16 RBs */ 1711 case CHIP_HAWAII: 1712 case CHIP_FIJI: 1713 case CHIP_VEGAM: 1714 raster_config = 0x3a00161a; 1715 raster_config_1 = 0x0000002e; 1716 break; 1717 default: 1718 fprintf(stderr, "ac: Unknown GPU, using 0 for raster_config\n"); 1719 raster_config = 0x00000000; 1720 raster_config_1 = 0x00000000; 1721 break; 1722 } 1723 1724 /* drm/radeon on Kaveri is buggy, so disable 1 RB to work around it. 1725 * This decreases performance by up to 50% when the RB is the bottleneck. 1726 */ 1727 if (info->family == CHIP_KAVERI && !info->is_amdgpu) 1728 raster_config = 0x00000000; 1729 1730 /* Fiji: Old kernels have incorrect tiling config. This decreases 1731 * RB performance by 25%. (it disables 1 RB in the second packer) 1732 */ 1733 if (info->family == CHIP_FIJI && info->cik_macrotile_mode_array[0] == 0x000000e8) { 1734 raster_config = 0x16000012; 1735 raster_config_1 = 0x0000002a; 1736 } 1737 1738 unsigned se_width = 8 << G_028350_SE_XSEL_GFX6(raster_config); 1739 unsigned se_height = 8 << G_028350_SE_YSEL_GFX6(raster_config); 1740 1741 /* I don't know how to calculate this, though this is probably a good guess. */ 1742 se_tile_repeat = MAX2(se_width, se_height) * info->max_se; 1743 1744 *raster_config_p = raster_config; 1745 *raster_config_1_p = raster_config_1; 1746 if (se_tile_repeat_p) 1747 *se_tile_repeat_p = se_tile_repeat; 1748} 1749 1750void ac_get_harvested_configs(struct radeon_info *info, unsigned raster_config, 1751 unsigned *cik_raster_config_1_p, unsigned *raster_config_se) 1752{ 1753 unsigned sh_per_se = MAX2(info->max_sa_per_se, 1); 1754 unsigned num_se = MAX2(info->max_se, 1); 1755 unsigned rb_mask = info->enabled_rb_mask; 1756 unsigned num_rb = MIN2(info->max_render_backends, 16); 1757 unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2); 1758 unsigned rb_per_se = num_rb / num_se; 1759 unsigned se_mask[4]; 1760 unsigned se; 1761 1762 se_mask[0] = ((1 << rb_per_se) - 1) & rb_mask; 1763 se_mask[1] = (se_mask[0] << rb_per_se) & rb_mask; 1764 se_mask[2] = (se_mask[1] << rb_per_se) & rb_mask; 1765 se_mask[3] = (se_mask[2] << rb_per_se) & rb_mask; 1766 1767 assert(num_se == 1 || num_se == 2 || num_se == 4); 1768 assert(sh_per_se == 1 || sh_per_se == 2); 1769 assert(rb_per_pkr == 1 || rb_per_pkr == 2); 1770 1771 if (info->gfx_level >= GFX7) { 1772 unsigned raster_config_1 = *cik_raster_config_1_p; 1773 if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) || (!se_mask[2] && !se_mask[3]))) { 1774 raster_config_1 &= C_028354_SE_PAIR_MAP; 1775 1776 if (!se_mask[0] && !se_mask[1]) { 1777 raster_config_1 |= S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3); 1778 } else { 1779 raster_config_1 |= S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0); 1780 } 1781 *cik_raster_config_1_p = raster_config_1; 1782 } 1783 } 1784 1785 for (se = 0; se < num_se; se++) { 1786 unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se); 1787 unsigned pkr1_mask = pkr0_mask << rb_per_pkr; 1788 int idx = (se / 2) * 2; 1789 1790 raster_config_se[se] = raster_config; 1791 if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) { 1792 raster_config_se[se] &= C_028350_SE_MAP; 1793 1794 if (!se_mask[idx]) { 1795 raster_config_se[se] |= S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3); 1796 } else { 1797 raster_config_se[se] |= S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0); 1798 } 1799 } 1800 1801 pkr0_mask &= rb_mask; 1802 pkr1_mask &= rb_mask; 1803 if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) { 1804 raster_config_se[se] &= C_028350_PKR_MAP; 1805 1806 if (!pkr0_mask) { 1807 raster_config_se[se] |= S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_3); 1808 } else { 1809 raster_config_se[se] |= S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_0); 1810 } 1811 } 1812 1813 if (rb_per_se >= 2) { 1814 unsigned rb0_mask = 1 << (se * rb_per_se); 1815 unsigned rb1_mask = rb0_mask << 1; 1816 1817 rb0_mask &= rb_mask; 1818 rb1_mask &= rb_mask; 1819 if (!rb0_mask || !rb1_mask) { 1820 raster_config_se[se] &= C_028350_RB_MAP_PKR0; 1821 1822 if (!rb0_mask) { 1823 raster_config_se[se] |= S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_3); 1824 } else { 1825 raster_config_se[se] |= S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_0); 1826 } 1827 } 1828 1829 if (rb_per_se > 2) { 1830 rb0_mask = 1 << (se * rb_per_se + rb_per_pkr); 1831 rb1_mask = rb0_mask << 1; 1832 rb0_mask &= rb_mask; 1833 rb1_mask &= rb_mask; 1834 if (!rb0_mask || !rb1_mask) { 1835 raster_config_se[se] &= C_028350_RB_MAP_PKR1; 1836 1837 if (!rb0_mask) { 1838 raster_config_se[se] |= S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_3); 1839 } else { 1840 raster_config_se[se] |= S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_0); 1841 } 1842 } 1843 } 1844 } 1845 } 1846} 1847 1848unsigned 1849ac_get_compute_resource_limits(const struct radeon_info *info, unsigned waves_per_threadgroup, 1850 unsigned max_waves_per_sh, unsigned threadgroups_per_cu) 1851{ 1852 unsigned compute_resource_limits = S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0); 1853 1854 if (info->gfx_level >= GFX7) { 1855 unsigned num_cu_per_se = info->num_cu / info->num_se; 1856 1857 /* Gfx9 should set the limit to max instead of 0 to fix high priority compute. */ 1858 if (info->gfx_level == GFX9 && !max_waves_per_sh) { 1859 max_waves_per_sh = info->max_good_cu_per_sa * info->num_simd_per_compute_unit * 1860 info->max_wave64_per_simd; 1861 } 1862 1863 /* Force even distribution on all SIMDs in CU if the workgroup 1864 * size is 64. This has shown some good improvements if # of CUs 1865 * per SE is not a multiple of 4. 1866 */ 1867 if (num_cu_per_se % 4 && waves_per_threadgroup == 1) 1868 compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1); 1869 1870 assert(threadgroups_per_cu >= 1 && threadgroups_per_cu <= 8); 1871 compute_resource_limits |= 1872 S_00B854_WAVES_PER_SH(max_waves_per_sh) | S_00B854_CU_GROUP_COUNT(threadgroups_per_cu - 1); 1873 } else { 1874 /* GFX6 */ 1875 if (max_waves_per_sh) { 1876 unsigned limit_div16 = DIV_ROUND_UP(max_waves_per_sh, 16); 1877 compute_resource_limits |= S_00B854_WAVES_PER_SH_GFX6(limit_div16); 1878 } 1879 } 1880 return compute_resource_limits; 1881} 1882 1883void ac_get_hs_info(struct radeon_info *info, 1884 struct ac_hs_info *hs) 1885{ 1886 bool double_offchip_buffers = info->gfx_level >= GFX7 && 1887 info->family != CHIP_CARRIZO && 1888 info->family != CHIP_STONEY; 1889 unsigned max_offchip_buffers_per_se; 1890 unsigned max_offchip_buffers; 1891 unsigned offchip_granularity; 1892 unsigned hs_offchip_param; 1893 1894 hs->tess_offchip_block_dw_size = 1895 info->family == CHIP_HAWAII ? 4096 : 8192; 1896 1897 /* 1898 * Per RadeonSI: 1899 * This must be one less than the maximum number due to a hw limitation. 1900 * Various hardware bugs need this. 1901 * 1902 * Per AMDVLK: 1903 * Vega10 should limit max_offchip_buffers to 508 (4 * 127). 1904 * Gfx7 should limit max_offchip_buffers to 508 1905 * Gfx6 should limit max_offchip_buffers to 126 (2 * 63) 1906 * 1907 * Follow AMDVLK here. 1908 */ 1909 if (info->gfx_level >= GFX11) { 1910 max_offchip_buffers_per_se = 256; /* TODO: we could decrease this to reduce memory/cache usage */ 1911 } else if (info->gfx_level >= GFX10) { 1912 max_offchip_buffers_per_se = 128; 1913 } else if (info->family == CHIP_VEGA12 || info->family == CHIP_VEGA20) { 1914 /* Only certain chips can use the maximum value. */ 1915 max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64; 1916 } else { 1917 max_offchip_buffers_per_se = double_offchip_buffers ? 127 : 63; 1918 } 1919 1920 max_offchip_buffers = max_offchip_buffers_per_se * info->max_se; 1921 1922 /* Hawaii has a bug with offchip buffers > 256 that can be worked 1923 * around by setting 4K granularity. 1924 */ 1925 if (hs->tess_offchip_block_dw_size == 4096) { 1926 assert(info->family == CHIP_HAWAII); 1927 offchip_granularity = V_03093C_X_4K_DWORDS; 1928 } else { 1929 assert(hs->tess_offchip_block_dw_size == 8192); 1930 offchip_granularity = V_03093C_X_8K_DWORDS; 1931 } 1932 1933 switch (info->gfx_level) { 1934 case GFX6: 1935 max_offchip_buffers = MIN2(max_offchip_buffers, 126); 1936 break; 1937 case GFX7: 1938 case GFX8: 1939 case GFX9: 1940 max_offchip_buffers = MIN2(max_offchip_buffers, 508); 1941 break; 1942 case GFX10: 1943 break; 1944 default: 1945 break; 1946 } 1947 1948 hs->max_offchip_buffers = max_offchip_buffers; 1949 1950 if (info->gfx_level >= GFX11) { 1951 /* OFFCHIP_BUFFERING is per SE. */ 1952 hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(max_offchip_buffers_per_se - 1) | 1953 S_03093C_OFFCHIP_GRANULARITY_GFX103(offchip_granularity); 1954 } else if (info->gfx_level >= GFX10_3) { 1955 hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(max_offchip_buffers - 1) | 1956 S_03093C_OFFCHIP_GRANULARITY_GFX103(offchip_granularity); 1957 } else if (info->gfx_level >= GFX7) { 1958 if (info->gfx_level >= GFX8) 1959 --max_offchip_buffers; 1960 hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX7(max_offchip_buffers) | 1961 S_03093C_OFFCHIP_GRANULARITY_GFX7(offchip_granularity); 1962 } else { 1963 hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers); 1964 } 1965 1966 hs->hs_offchip_param = hs_offchip_param; 1967 1968 hs->tess_factor_ring_size = 48 * 1024 * info->max_se; 1969 hs->tess_offchip_ring_offset = align(hs->tess_factor_ring_size, 64 * 1024); 1970 hs->tess_offchip_ring_size = hs->max_offchip_buffers * hs->tess_offchip_block_dw_size * 4; 1971} 1972 1973static uint16_t get_task_num_entries(enum radeon_family fam) 1974{ 1975 /* Number of task shader ring entries. Needs to be a power of two. 1976 * Use a low number on smaller chips so we don't waste space, 1977 * but keep it high on bigger chips so it doesn't inhibit parallelism. 1978 * 1979 * This number is compiled into task/mesh shaders as a constant. 1980 * In order to ensure this works fine with the shader cache, we must 1981 * base this decision on the chip family, not the number of CUs in 1982 * the current GPU. (So, the cache remains consistent for all 1983 * chips in the same family.) 1984 */ 1985 switch (fam) { 1986 case CHIP_VANGOGH: 1987 case CHIP_NAVI24: 1988 case CHIP_REMBRANDT: 1989 return 256; 1990 case CHIP_NAVI21: 1991 case CHIP_NAVI22: 1992 case CHIP_NAVI23: 1993 default: 1994 return 1024; 1995 } 1996} 1997 1998void ac_get_task_info(struct radeon_info *info, 1999 struct ac_task_info *task_info) 2000{ 2001 const uint16_t num_entries = get_task_num_entries(info->family); 2002 const uint32_t draw_ring_bytes = num_entries * AC_TASK_DRAW_ENTRY_BYTES; 2003 const uint32_t payload_ring_bytes = num_entries * AC_TASK_PAYLOAD_ENTRY_BYTES; 2004 2005 /* Ensure that the addresses of each ring are 256 byte aligned. */ 2006 task_info->num_entries = num_entries; 2007 task_info->draw_ring_offset = ALIGN(AC_TASK_CTRLBUF_BYTES, 256); 2008 task_info->payload_ring_offset = ALIGN(task_info->draw_ring_offset + draw_ring_bytes, 256); 2009 task_info->bo_size_bytes = task_info->payload_ring_offset + payload_ring_bytes; 2010} 2011