1/* 2 * Copyright © 2013 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
 */

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <xf86drm.h>

#include "intel_device_info.h"
#include "intel_hwconfig.h"
#include "intel/common/intel_gem.h"
#include "util/bitscan.h"
#include "util/debug.h"
#include "util/log.h"
#include "util/macros.h"
#include "util/os_misc.h"

#include "drm-uapi/i915_drm.h"

/* Short platform aliases mapped to one representative PCI device ID per
 * platform.  Used to resolve a user-supplied device name (e.g. from an
 * environment override) to a concrete PCI ID.
 */
static const struct {
   const char *name;
   int pci_id;
} name_map[] = {
   { "lpt", 0x27a2 },
   { "brw", 0x2a02 },
   { "g4x", 0x2a42 },
   { "ilk", 0x0042 },
   { "snb", 0x0126 },
   { "ivb", 0x016a },
   { "hsw", 0x0d2e },
   { "byt", 0x0f33 },
   { "bdw", 0x162e },
   { "chv", 0x22B3 },
   { "skl", 0x1912 },
   { "bxt", 0x5A85 },
   { "kbl", 0x5912 },
   { "aml", 0x591C },
   { "glk", 0x3185 },
   { "cfl", 0x3E9B },
   { "whl", 0x3EA1 },
   { "cml", 0x9b41 },
   { "icl", 0x8a52 },
   { "ehl", 0x4500 },
   { "jsl", 0x4E71 },
   { "tgl", 0x9a49 },
   { "rkl", 0x4c8a },
   { "dg1", 0x4905 },
   { "adl", 0x4680 },
   { "sg1", 0x4907 },
   { "rpl", 0xa780 },
   { "dg2", 0x5690 },
};

/**
 * Get the PCI ID for the device name.
 *
 * Does a linear search of name_map above; the table is small enough that
 * this is not worth optimizing.
 *
 * Returns -1 if the device is not known.
 */
int
intel_device_name_to_pci_device_id(const char *name)
{
   for (unsigned i = 0; i < ARRAY_SIZE(name_map); i++) {
      if (!strcmp(name_map[i].name, name))
         return name_map[i].pci_id;
   }

   return -1;
}

/* Static per-platform device-info tables.  Values originate from the Intel
 * PRMs / BSpec; fields not listed default to zero/false.  Newer platforms
 * build on older ones via the GFXn_FEATURES macros below.
 */

static const struct intel_device_info intel_device_info_gfx3 = {
   .ver = 3,
   .platform = INTEL_PLATFORM_GFX3,
   .simulator_id = -1,
   .num_slices = 1,
   .num_subslices = { 1, },
   .max_eus_per_subslice = 8,
   .num_thread_per_eu = 4,
   .timestamp_frequency = 12500000,
   .cs_prefetch_size = 512,
};

static const struct intel_device_info intel_device_info_i965 = {
   .ver = 4,
   .platform = INTEL_PLATFORM_I965,
   .has_negative_rhw_bug = true,
   .num_slices = 1,
   .num_subslices = { 1, },
   .max_eus_per_subslice = 8,
   .num_thread_per_eu = 4,
   .max_vs_threads = 16,
   .max_gs_threads = 2,
   .max_wm_threads = 8 * 4,
   .urb = {
      .size = 256,
   },
   .timestamp_frequency = 12500000,
   .simulator_id = -1,
   .cs_prefetch_size = 512,
};

static const struct intel_device_info intel_device_info_g4x = {
   .ver = 4,
   .verx10 = 45,
   .has_pln = true,
   .has_compr4 = true,
   .has_surface_tile_offset = true,
   .platform = INTEL_PLATFORM_G4X,
   .num_slices = 1,
   .num_subslices = { 1, },
   .max_eus_per_subslice = 10,
   .num_thread_per_eu = 5,
   .max_vs_threads = 32,
   .max_gs_threads = 2,
   .max_wm_threads = 10 * 5,
   .urb = {
      .size = 384,
   },
   .timestamp_frequency = 12500000,
   .simulator_id = -1,
   .cs_prefetch_size = 512,
};

static const struct intel_device_info intel_device_info_ilk = {
   .ver = 5,
   .platform = INTEL_PLATFORM_ILK,
   .has_pln = true,
   .has_compr4 = true,
   .has_surface_tile_offset = true,
   .num_slices = 1,
   .num_subslices = { 1, },
   .max_eus_per_subslice = 12,
   .num_thread_per_eu = 6,
   .max_vs_threads = 72,
   .max_gs_threads = 32,
   .max_wm_threads = 12 * 6,
   .urb = {
      .size = 1024,
   },
   .timestamp_frequency = 12500000,
   .simulator_id = -1,
   .cs_prefetch_size = 512,
};

static const struct intel_device_info intel_device_info_snb_gt1 = {
   .ver = 6,
   .gt = 1,
   .platform = INTEL_PLATFORM_SNB,
   .has_hiz_and_separate_stencil = true,
   .has_llc = true,
   .has_pln = true,
   .has_surface_tile_offset = true,
   .needs_unlit_centroid_workaround = true,
   .num_slices = 1,
   .num_subslices = { 1, },
   .max_eus_per_subslice = 6,
   .num_thread_per_eu = 6, /* Not confirmed */
   .max_vs_threads = 24,
   .max_gs_threads = 21, /* conservative; 24 if rendering disabled. */
   .max_wm_threads = 40,
   .urb = {
      .size = 32,
      .min_entries = {
         [MESA_SHADER_VERTEX] = 24,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 256,
         [MESA_SHADER_GEOMETRY] = 256,
      },
   },
   .timestamp_frequency = 12500000,
   .simulator_id = -1,
   .cs_prefetch_size = 512,
};

static const struct intel_device_info intel_device_info_snb_gt2 = {
   .ver = 6,
   .gt = 2,
   .platform = INTEL_PLATFORM_SNB,
   .has_hiz_and_separate_stencil = true,
   .has_llc = true,
   .has_pln = true,
   .has_surface_tile_offset = true,
   .needs_unlit_centroid_workaround = true,
   .num_slices = 1,
   .num_subslices = { 1, },
   .max_eus_per_subslice = 12,
   .num_thread_per_eu = 6, /* Not confirmed */
   .max_vs_threads = 60,
   .max_gs_threads = 60,
   .max_wm_threads = 80,
   .urb = {
      .size = 64,
      .min_entries = {
         [MESA_SHADER_VERTEX] = 24,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 256,
         [MESA_SHADER_GEOMETRY] = 256,
      },
   },
   .timestamp_frequency = 12500000,
   .simulator_id = -1,
   .cs_prefetch_size = 512,
};

/* Common feature baseline for all Gfx7 (Ivybridge-class) parts; individual
 * tables below override per-SKU values.
 */
#define GFX7_FEATURES                               \
   .ver = 7,                                        \
   .has_hiz_and_separate_stencil = true,            \
   .must_use_separate_stencil = true,               \
   .has_llc = true,                                 \
   .has_pln = true,                                 \
   .has_64bit_float = true,                         \
   .has_surface_tile_offset = true,                 \
   .timestamp_frequency = 12500000,                 \
   .max_constant_urb_size_kb = 16,                  \
   .cs_prefetch_size = 512

static const struct intel_device_info intel_device_info_ivb_gt1 = {
   GFX7_FEATURES, .platform = INTEL_PLATFORM_IVB, .gt = 1,
   .num_slices = 1,
   .num_subslices = { 1, },
   .max_eus_per_subslice = 6,
   .num_thread_per_eu = 6,
   .l3_banks = 2,
   .max_vs_threads = 36,
   .max_tcs_threads = 36,
   .max_tes_threads = 36,
   .max_gs_threads = 36,
   .max_wm_threads = 48,
   .max_cs_threads = 36,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX]    = 32,
         [MESA_SHADER_TESS_EVAL] = 10,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX]    = 512,
         [MESA_SHADER_TESS_CTRL] = 32,
         [MESA_SHADER_TESS_EVAL] = 288,
         [MESA_SHADER_GEOMETRY]  = 192,
      },
   },
   .simulator_id = 7,
};

static const struct intel_device_info intel_device_info_ivb_gt2 = {
   GFX7_FEATURES, .platform = INTEL_PLATFORM_IVB, .gt = 2,
   .num_slices = 1,
   .num_subslices = { 1, },
   .max_eus_per_subslice = 12,
   .num_thread_per_eu = 8, /* Not sure why this isn't a multiple of
                            * @max_wm_threads ... */
   .l3_banks = 4,
   .max_vs_threads = 128,
   .max_tcs_threads = 128,
   .max_tes_threads = 128,
   .max_gs_threads = 128,
   .max_wm_threads = 172,
   .max_cs_threads = 64,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX]    = 32,
         [MESA_SHADER_TESS_EVAL] = 10,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX]    = 704,
         [MESA_SHADER_TESS_CTRL] = 64,
         [MESA_SHADER_TESS_EVAL] = 448,
         [MESA_SHADER_GEOMETRY]  = 320,
      },
   },
   .simulator_id = 7,
};

static const struct intel_device_info intel_device_info_byt = {
   GFX7_FEATURES, .platform = INTEL_PLATFORM_BYT, .gt = 1,
   .num_slices = 1,
   .num_subslices = { 1, },
   .max_eus_per_subslice = 4,
   .num_thread_per_eu = 8,
   .l3_banks = 1,
   /* Baytrail is an Atom-class part: no LLC, unlike the rest of Gfx7. */
   .has_llc = false,
   .max_vs_threads = 36,
   .max_tcs_threads = 36,
   .max_tes_threads = 36,
   .max_gs_threads = 36,
   .max_wm_threads = 48,
   .max_cs_threads = 32,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX]    = 32,
         [MESA_SHADER_TESS_EVAL] = 10,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX]    = 512,
         [MESA_SHADER_TESS_CTRL] = 32,
         [MESA_SHADER_TESS_EVAL] = 288,
         [MESA_SHADER_GEOMETRY]  = 192,
      },
   },
   .simulator_id = 10,
};

/* Haswell is Gfx 7.5: Gfx7 plus SIMD16 three-source instructions. */
#define HSW_FEATURES                                \
   GFX7_FEATURES,                                   \
   .platform = INTEL_PLATFORM_HSW,                  \
   .verx10 = 75,                                    \
   .supports_simd16_3src = true

static const struct intel_device_info intel_device_info_hsw_gt1 = {
   HSW_FEATURES, .gt = 1,
   .num_slices = 1,
   .num_subslices = { 1, },
   .max_eus_per_subslice = 10,
   .num_thread_per_eu = 7,
   .l3_banks = 2,
   .max_vs_threads = 70,
   .max_tcs_threads = 70,
   .max_tes_threads = 70,
   .max_gs_threads = 70,
   .max_wm_threads = 102,
   .max_cs_threads = 70,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX]    = 32,
         [MESA_SHADER_TESS_EVAL] = 10,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX]    = 640,
         [MESA_SHADER_TESS_CTRL] = 64,
         [MESA_SHADER_TESS_EVAL] = 384,
         [MESA_SHADER_GEOMETRY]  = 256,
      },
   },
   .simulator_id = 9,
};

static const struct intel_device_info intel_device_info_hsw_gt2 = {
   HSW_FEATURES, .gt = 2,
   .num_slices = 1,
   .num_subslices = { 2, },
   .max_eus_per_subslice = 10,
   .num_thread_per_eu = 7,
   .l3_banks = 4,
   .max_vs_threads = 280,
   .max_tcs_threads = 256,
   .max_tes_threads = 280,
   .max_gs_threads = 256,
   .max_wm_threads = 204,
   .max_cs_threads = 70,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX]    = 64,
         [MESA_SHADER_TESS_EVAL] = 10,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX]    = 1664,
         [MESA_SHADER_TESS_CTRL] = 128,
         [MESA_SHADER_TESS_EVAL] = 960,
         [MESA_SHADER_GEOMETRY]  = 640,
      },
   },
   .simulator_id = 9,
};

static const struct intel_device_info intel_device_info_hsw_gt3 = {
   HSW_FEATURES, .gt = 3,
   .num_slices = 2,
   .num_subslices = { 2, 2, },
   .max_eus_per_subslice = 10,
   .num_thread_per_eu = 7,
   .l3_banks = 8,
   .max_vs_threads = 280,
   .max_tcs_threads = 256,
   .max_tes_threads = 280,
   .max_gs_threads = 256,
   .max_wm_threads = 408,
   .max_cs_threads = 70,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX]    = 64,
         [MESA_SHADER_TESS_EVAL] = 10,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX]    = 1664,
         [MESA_SHADER_TESS_CTRL] = 128,
         [MESA_SHADER_TESS_EVAL] = 960,
         [MESA_SHADER_GEOMETRY]  = 640,
      },
   },
   .max_constant_urb_size_kb = 32,
   .simulator_id = 9,
};

/* It's unclear how well supported sampling from the hiz buffer is on GFX8,
 * so keep things conservative for now and set has_sample_with_hiz = false.
 */
#define GFX8_FEATURES                               \
   .ver = 8,                                        \
   .has_hiz_and_separate_stencil = true,            \
   .must_use_separate_stencil = true,               \
   .has_llc = true,                                 \
   .has_sample_with_hiz = false,                    \
   .has_pln = true,                                 \
   .has_integer_dword_mul = true,                   \
   .has_64bit_float = true,                         \
   .has_64bit_int = true,                           \
   .supports_simd16_3src = true,                    \
   .has_surface_tile_offset = true,                 \
   .num_thread_per_eu = 7,                          \
   .max_vs_threads = 504,                           \
   .max_tcs_threads = 504,                          \
   .max_tes_threads = 504,                          \
   .max_gs_threads = 504,                           \
   .max_wm_threads = 384,                           \
   .max_threads_per_psd = 64,                       \
   .timestamp_frequency = 12500000,                 \
   .max_constant_urb_size_kb = 32,                  \
   .cs_prefetch_size = 512

static const struct intel_device_info intel_device_info_bdw_gt1 = {
   GFX8_FEATURES, .gt = 1,
   .platform = INTEL_PLATFORM_BDW,
   .num_slices = 1,
   .num_subslices = { 2, },
   .max_eus_per_subslice = 6,
   .l3_banks = 2,
   .max_cs_threads = 42,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX]    = 64,
         [MESA_SHADER_TESS_EVAL] = 34,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX]    = 2560,
         [MESA_SHADER_TESS_CTRL] = 504,
         [MESA_SHADER_TESS_EVAL] = 1536,
         /* Reduced from 960, seems to be similar to the bug on Gfx9 GT1.
          */
         [MESA_SHADER_GEOMETRY]  = 690,
      },
   },
   .simulator_id = 11,
};

static const struct intel_device_info intel_device_info_bdw_gt2 = {
   GFX8_FEATURES, .gt = 2,
   .platform = INTEL_PLATFORM_BDW,
   .num_slices = 1,
   .num_subslices = { 3, },
   .max_eus_per_subslice = 8,
   .l3_banks = 4,
   .max_cs_threads = 56,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX]    = 64,
         [MESA_SHADER_TESS_EVAL] = 34,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX]    = 2560,
         [MESA_SHADER_TESS_CTRL] = 504,
         [MESA_SHADER_TESS_EVAL] = 1536,
         [MESA_SHADER_GEOMETRY]  = 960,
      },
   },
   .simulator_id = 11,
};

static const struct intel_device_info intel_device_info_bdw_gt3 = {
   GFX8_FEATURES, .gt = 3,
   .platform = INTEL_PLATFORM_BDW,
   .num_slices = 2,
   .num_subslices = { 3, 3, },
   .max_eus_per_subslice = 8,
   .l3_banks = 8,
   .max_cs_threads = 56,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX]    = 64,
         [MESA_SHADER_TESS_EVAL] = 34,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX]    = 2560,
         [MESA_SHADER_TESS_CTRL] = 504,
         [MESA_SHADER_TESS_EVAL] = 1536,
         [MESA_SHADER_GEOMETRY]  = 960,
      },
   },
   .simulator_id = 11,
};

static const struct intel_device_info intel_device_info_chv = {
   GFX8_FEATURES, .platform = INTEL_PLATFORM_CHV, .gt = 1,
   /* Cherryview is an Atom-class Gfx8: no LLC, no integer dword multiply. */
   .has_llc = false,
   .has_integer_dword_mul = false,
   .num_slices = 1,
   .num_subslices = { 2, },
   .max_eus_per_subslice = 8,
   .l3_banks = 2,
   .max_vs_threads = 80,
   .max_tcs_threads = 80,
   .max_tes_threads = 80,
   .max_gs_threads = 80,
   .max_wm_threads = 128,
   .max_cs_threads = 6 * 7,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX]    = 34,
         [MESA_SHADER_TESS_EVAL] = 34,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX]    = 640,
         [MESA_SHADER_TESS_CTRL] = 80,
         [MESA_SHADER_TESS_EVAL] = 384,
         [MESA_SHADER_GEOMETRY]  = 256,
      },
   },
   .simulator_id = 13,
};

/* Hardware limits common to all Gfx9 (Skylake-class) parts. */
#define GFX9_HW_INFO                                \
   .ver = 9,                                        \
   .max_vs_threads = 336,                           \
   .max_gs_threads = 336,                           \
   .max_tcs_threads = 336,                          \
   .max_tes_threads = 336,                          \
   .max_threads_per_psd = 64,                       \
   .max_cs_threads = 56,                            \
   .timestamp_frequency = 12000000,                 \
   .cs_prefetch_size = 512,                         \
   .urb = {                                         \
      .min_entries = {                              \
         [MESA_SHADER_VERTEX]    = 64,              \
         [MESA_SHADER_TESS_EVAL] = 34,              \
      },                                            \
      .max_entries = {                              \
         [MESA_SHADER_VERTEX]    = 1856,            \
         [MESA_SHADER_TESS_CTRL] = 672,             \
         [MESA_SHADER_TESS_EVAL] = 1120,            \
         [MESA_SHADER_GEOMETRY]  = 640,             \
      },                                            \
   }

/* Gfx9 LP (Atom-class: Broxton/Geminilake).  Overrides the big-core
 * baseline with smaller thread counts, smaller URB, no LLC, and a
 * different (19.2 MHz) timestamp frequency.
 */
#define GFX9_LP_FEATURES                            \
   GFX8_FEATURES,                                   \
   GFX9_HW_INFO,                                    \
   .has_integer_dword_mul = false,                  \
   .gt = 1,                                         \
   .has_llc = false,                                \
   .has_sample_with_hiz = true,                     \
   .num_slices = 1,                                 \
   .num_thread_per_eu = 6,                          \
   .max_vs_threads = 112,                           \
   .max_tcs_threads = 112,                          \
   .max_tes_threads = 112,                          \
   .max_gs_threads = 112,                           \
   .max_cs_threads = 6 * 6,                         \
   .timestamp_frequency = 19200000,                 \
   .urb = {                                         \
      .min_entries = {                              \
         [MESA_SHADER_VERTEX]    = 34,              \
         [MESA_SHADER_TESS_EVAL] = 34,              \
      },                                            \
      .max_entries = {                              \
         [MESA_SHADER_VERTEX]    = 704,             \
         [MESA_SHADER_TESS_CTRL] = 256,             \
         [MESA_SHADER_TESS_EVAL] = 416,             \
         [MESA_SHADER_GEOMETRY]  = 256,             \
      },                                            \
   }

/* 3 subslices x 6 EUs LP configuration. */
#define GFX9_LP_FEATURES_3X6                        \
   GFX9_LP_FEATURES,                                \
   .num_subslices = { 3, },                         \
   .max_eus_per_subslice = 6

/* 2 subslices x 6 EUs LP configuration: halved thread counts and URB. */
#define GFX9_LP_FEATURES_2X6                        \
   GFX9_LP_FEATURES,                                \
   .num_subslices = { 2, },                         \
   .max_eus_per_subslice = 6,                       \
   .max_vs_threads = 56,                            \
   .max_tcs_threads = 56,                           \
   .max_tes_threads = 56,                           \
   .max_gs_threads = 56,                            \
   .max_cs_threads = 6 * 6,                         \
   .urb = {                                         \
      .min_entries = {                              \
         [MESA_SHADER_VERTEX]    = 34,              \
         [MESA_SHADER_TESS_EVAL] = 34,              \
      },                                            \
      .max_entries = {                              \
         [MESA_SHADER_VERTEX]    = 352,             \
         [MESA_SHADER_TESS_CTRL] = 128,             \
         [MESA_SHADER_TESS_EVAL] = 208,             \
         [MESA_SHADER_GEOMETRY]  = 128,             \
      },                                            \
   }

/* Big-core Gfx9 baseline (SKL/KBL/CFL). */
#define GFX9_FEATURES                               \
   GFX8_FEATURES,                                   \
   GFX9_HW_INFO,                                    \
   .has_sample_with_hiz = true

static const struct intel_device_info intel_device_info_skl_gt1 = {
   GFX9_FEATURES, .gt = 1,
   .platform = INTEL_PLATFORM_SKL,
   .num_slices = 1,
   .num_subslices = { 2, },
   .max_eus_per_subslice = 6,
   .l3_banks = 2,
   /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
    * leading to some vertices to go missing if we use too much URB.
    */
   .urb.max_entries[MESA_SHADER_VERTEX] = 928,
   .simulator_id = 12,
};

static const struct intel_device_info intel_device_info_skl_gt2 = {
   GFX9_FEATURES, .gt = 2,
   .platform = INTEL_PLATFORM_SKL,
   .num_slices = 1,
   .num_subslices = { 3, },
   .max_eus_per_subslice = 8,
   .l3_banks = 4,
   .simulator_id = 12,
};

static const struct intel_device_info intel_device_info_skl_gt3 = {
   GFX9_FEATURES, .gt = 3,
   .platform = INTEL_PLATFORM_SKL,
   .num_slices = 2,
   .num_subslices = { 3, 3, },
   .max_eus_per_subslice = 8,
   .l3_banks = 8,
   .simulator_id = 12,
};

static const struct intel_device_info intel_device_info_skl_gt4 = {
   GFX9_FEATURES, .gt = 4,
   .platform = INTEL_PLATFORM_SKL,
   .num_slices = 3,
   .num_subslices = { 3, 3, 3, },
   .max_eus_per_subslice = 8,
   .l3_banks = 12,
   /* From the "L3 Allocation and Programming" documentation:
    *
    * "URB is limited to 1008KB due to programming restrictions. This is not a
    * restriction of the L3 implementation, but of the FF and other clients.
    * Therefore, in a GT4 implementation it is possible for the programmed
    * allocation of the L3 data array to provide 3*384KB=1152KB for URB, but
    * only 1008KB of this will be used."
    */
   .simulator_id = 12,
};

static const struct intel_device_info intel_device_info_bxt = {
   GFX9_LP_FEATURES_3X6,
   .platform = INTEL_PLATFORM_BXT,
   .l3_banks = 2,
   .simulator_id = 14,
};

static const struct intel_device_info intel_device_info_bxt_2x6 = {
   GFX9_LP_FEATURES_2X6,
   .platform = INTEL_PLATFORM_BXT,
   .l3_banks = 1,
   .simulator_id = 14,
};
/*
 * Note: for all KBL SKUs, the PRM says SKL for GS entries, not SKL+.
 * There's no KBL entry. Using the default SKL (GFX9) GS entries value.
 */

static const struct intel_device_info intel_device_info_kbl_gt1 = {
   GFX9_FEATURES,
   .platform = INTEL_PLATFORM_KBL,
   .gt = 1,

   .max_cs_threads = 7 * 6,
   .num_slices = 1,
   .num_subslices = { 2, },
   .max_eus_per_subslice = 6,
   .l3_banks = 2,
   /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
    * leading to some vertices to go missing if we use too much URB.
    */
   .urb.max_entries[MESA_SHADER_VERTEX] = 928,
   .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
   .simulator_id = 16,
};

static const struct intel_device_info intel_device_info_kbl_gt1_5 = {
   GFX9_FEATURES,
   .platform = INTEL_PLATFORM_KBL,
   .gt = 1,

   .max_cs_threads = 7 * 6,
   .num_slices = 1,
   .num_subslices = { 3, },
   .max_eus_per_subslice = 6,
   .l3_banks = 4,
   .simulator_id = 16,
};

static const struct intel_device_info intel_device_info_kbl_gt2 = {
   GFX9_FEATURES,
   .platform = INTEL_PLATFORM_KBL,
   .gt = 2,

   .num_slices = 1,
   .num_subslices = { 3, },
   .max_eus_per_subslice = 8,
   .l3_banks = 4,
   .simulator_id = 16,
};

static const struct intel_device_info intel_device_info_kbl_gt3 = {
   GFX9_FEATURES,
   .platform = INTEL_PLATFORM_KBL,
   .gt = 3,

   .num_slices = 2,
   .num_subslices = { 3, 3, },
   .max_eus_per_subslice = 8,
   .l3_banks = 8,
   .simulator_id = 16,
};

static const struct intel_device_info intel_device_info_kbl_gt4 = {
   GFX9_FEATURES,
   .platform = INTEL_PLATFORM_KBL,
   .gt = 4,

   /*
    * From the "L3 Allocation and Programming" documentation:
    *
    * "URB is limited to 1008KB due to programming restrictions. This
    * is not a restriction of the L3 implementation, but of the FF and
    * other clients. Therefore, in a GT4 implementation it is
    * possible for the programmed allocation of the L3 data array to
    * provide 3*384KB=1152KB for URB, but only 1008KB of this
    * will be used."
    */
   .num_slices = 3,
   .num_subslices = { 3, 3, 3, },
   .max_eus_per_subslice = 8,
   .l3_banks = 12,
   .simulator_id = 16,
};

static const struct intel_device_info intel_device_info_glk = {
   GFX9_LP_FEATURES_3X6,
   .platform = INTEL_PLATFORM_GLK,
   .l3_banks = 2,
   .simulator_id = 17,
};

static const struct intel_device_info intel_device_info_glk_2x6 = {
   GFX9_LP_FEATURES_2X6,
   .platform = INTEL_PLATFORM_GLK,
   .l3_banks = 2,
   .simulator_id = 17,
};

static const struct intel_device_info intel_device_info_cfl_gt1 = {
   GFX9_FEATURES,
   .platform = INTEL_PLATFORM_CFL,
   .gt = 1,

   .num_slices = 1,
   .num_subslices = { 2, },
   .max_eus_per_subslice = 6,
   .l3_banks = 2,
   /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
    * leading to some vertices to go missing if we use too much URB.
    */
   .urb.max_entries[MESA_SHADER_VERTEX] = 928,
   .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
   .simulator_id = 24,
};
static const struct intel_device_info intel_device_info_cfl_gt2 = {
   GFX9_FEATURES,
   .platform = INTEL_PLATFORM_CFL,
   .gt = 2,

   .num_slices = 1,
   .num_subslices = { 3, },
   .max_eus_per_subslice = 8,
   .l3_banks = 4,
   .simulator_id = 24,
};

static const struct intel_device_info intel_device_info_cfl_gt3 = {
   GFX9_FEATURES,
   .platform = INTEL_PLATFORM_CFL,
   .gt = 3,

   .num_slices = 2,
   .num_subslices = { 3, 3, },
   .max_eus_per_subslice = 8,
   .l3_banks = 8,
   .simulator_id = 24,
};

/* Helper so per-slice subslice counts read naturally in GFX11_FEATURES. */
#define subslices(args...) { args, }

#define GFX11_HW_INFO                               \
   .ver = 11,                                       \
   .has_pln = false,                                \
   .max_vs_threads = 364,                           \
   .max_gs_threads = 224,                           \
   .max_tcs_threads = 224,                          \
   .max_tes_threads = 364,                          \
   .max_threads_per_psd = 64,                       \
   .max_cs_threads = 56,                            \
   .cs_prefetch_size = 512

#define GFX11_FEATURES(_gt, _slices, _subslices, _l3, _platform)  \
   GFX8_FEATURES,                                   \
   GFX11_HW_INFO,                                   \
   .platform = _platform,                           \
   .has_64bit_float = false,                        \
   .has_64bit_int = false,                          \
   .has_integer_dword_mul = false,                  \
   .has_sample_with_hiz = false,                    \
   .gt = _gt, .num_slices = _slices, .l3_banks = _l3, \
   .num_subslices = _subslices,                     \
   .max_eus_per_subslice = 8

#define GFX11_URB_MIN_MAX_ENTRIES                   \
   .min_entries = {                                 \
      [MESA_SHADER_VERTEX]    = 64,                 \
      [MESA_SHADER_TESS_EVAL] = 34,                 \
   },                                               \
   .max_entries = {                                 \
      [MESA_SHADER_VERTEX]    = 2384,               \
      [MESA_SHADER_TESS_CTRL] = 1032,               \
      [MESA_SHADER_TESS_EVAL] = 2384,               \
      [MESA_SHADER_GEOMETRY]  = 1032,               \
   }

static const struct intel_device_info intel_device_info_icl_gt2 = {
   GFX11_FEATURES(2, 1, subslices(8), 8, INTEL_PLATFORM_ICL),
   .urb = {
      GFX11_URB_MIN_MAX_ENTRIES,
   },
   .simulator_id = 19,
};

static const struct intel_device_info intel_device_info_icl_gt1_5 = {
   GFX11_FEATURES(1, 1, subslices(6), 6, INTEL_PLATFORM_ICL),
   .urb = {
      GFX11_URB_MIN_MAX_ENTRIES,
   },
   .simulator_id = 19,
};

static const struct intel_device_info intel_device_info_icl_gt1 = {
   GFX11_FEATURES(1, 1, subslices(4), 6, INTEL_PLATFORM_ICL),
   .urb = {
      GFX11_URB_MIN_MAX_ENTRIES,
   },
   .simulator_id = 19,
};

static const struct intel_device_info intel_device_info_icl_gt0_5 = {
   GFX11_FEATURES(1, 1, subslices(1), 6, INTEL_PLATFORM_ICL),
   .urb = {
      GFX11_URB_MIN_MAX_ENTRIES,
   },
   .simulator_id = 19,
};

/* Gfx11 LP (Elkhart Lake / Jasper Lake) additions on top of GFX11_FEATURES. */
#define GFX11_LP_FEATURES                           \
   .urb = {                                         \
      GFX11_URB_MIN_MAX_ENTRIES,                    \
   },                                               \
   .disable_ccs_repack = true,                      \
   .simulator_id = 28

static const struct intel_device_info intel_device_info_ehl_4x8 = {
   GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
   GFX11_LP_FEATURES,
};

static const struct intel_device_info intel_device_info_ehl_4x6 = {
   GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
   GFX11_LP_FEATURES,
   .max_eus_per_subslice = 6,
};

static const struct intel_device_info intel_device_info_ehl_4x5 = {
   GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
   GFX11_LP_FEATURES,
   .max_eus_per_subslice = 5,
};

static const struct intel_device_info intel_device_info_ehl_4x4 = {
   GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
   GFX11_LP_FEATURES,
   .max_eus_per_subslice = 4,
};

static const struct intel_device_info intel_device_info_ehl_2x8 = {
   GFX11_FEATURES(1, 1, subslices(2), 4, INTEL_PLATFORM_EHL),
   GFX11_LP_FEATURES,
};

static const struct intel_device_info intel_device_info_ehl_2x4 = {
   GFX11_FEATURES(1, 1, subslices(2), 4, INTEL_PLATFORM_EHL),
   GFX11_LP_FEATURES,
   .max_eus_per_subslice = 4,
};

#define GFX12_URB_MIN_MAX_ENTRIES                   \
   .min_entries = {                                 \
      [MESA_SHADER_VERTEX]    = 64,                 \
      [MESA_SHADER_TESS_EVAL] = 34,                 \
   },                                               \
   .max_entries = {                                 \
      [MESA_SHADER_VERTEX]    = 3576,               \
      [MESA_SHADER_TESS_CTRL] = 1548,               \
      [MESA_SHADER_TESS_EVAL] = 3576,               \
      /* Wa_14013840143 */                          \
      [MESA_SHADER_GEOMETRY]  = 1536,               \
   }

#define GFX12_HW_INFO                               \
   .ver = 12,                                       \
   .has_pln = false,                                \
   .has_sample_with_hiz = false,                    \
   .has_aux_map = true,                             \
   .max_vs_threads = 546,                           \
   .max_gs_threads = 336,                           \
   .max_tcs_threads = 336,                          \
   .max_tes_threads = 546,                          \
   .max_threads_per_psd = 64,                       \
   .max_cs_threads = 112, /* threads per DSS */     \
   .urb = {                                         \
      GFX12_URB_MIN_MAX_ENTRIES,                    \
   }

#define GFX12_FEATURES(_gt, _slices, _l3)           \
   GFX8_FEATURES,                                   \
   GFX12_HW_INFO,                                   \
   .has_64bit_float = false,                        \
   .has_64bit_int = false,                          \
   .has_integer_dword_mul = false,                  \
   .gt = _gt, .num_slices = _slices, .l3_banks = _l3, \
   .simulator_id = 22,                              \
   .max_eus_per_subslice = 16,                      \
   .cs_prefetch_size = 512

/* Gfx12 groups subslices into dual-subslices (DSS); name the initializer
 * accordingly.
 */
#define dual_subslices(args...) { args, }

#define GFX12_GT05_FEATURES                         \
   GFX12_FEATURES(1, 1, 4),                         \
   .num_subslices = dual_subslices(1)

#define GFX12_GT_FEATURES(_gt)                      \
   GFX12_FEATURES(_gt, 1, _gt == 1 ? 4 : 8),        \
   .num_subslices = dual_subslices(_gt == 1 ? 2 : 6)

static const struct intel_device_info intel_device_info_tgl_gt1 = {
   GFX12_GT_FEATURES(1),
   .platform = INTEL_PLATFORM_TGL,
};

static const struct intel_device_info intel_device_info_tgl_gt2 = {
   GFX12_GT_FEATURES(2),
   .platform = INTEL_PLATFORM_TGL,
};

static const struct intel_device_info intel_device_info_rkl_gt05 = {
   GFX12_GT05_FEATURES,
   .platform = INTEL_PLATFORM_RKL,
};

static const struct intel_device_info intel_device_info_rkl_gt1 = {
   GFX12_GT_FEATURES(1),
   .platform = INTEL_PLATFORM_RKL,
};

static const struct intel_device_info intel_device_info_adl_gt05 = {
   GFX12_GT05_FEATURES,
   .platform = INTEL_PLATFORM_ADL,
   .display_ver = 13,
};

static const struct intel_device_info intel_device_info_adl_gt1 = {
   GFX12_GT_FEATURES(1),
   .platform = INTEL_PLATFORM_ADL,
   .display_ver = 13,
};

static const struct intel_device_info intel_device_info_adl_n = {
   GFX12_GT_FEATURES(1),
   .platform = INTEL_PLATFORM_ADL,
   .display_ver = 13,
};

static const struct intel_device_info intel_device_info_adl_gt2 = {
   GFX12_GT_FEATURES(2),
   .platform = INTEL_PLATFORM_ADL,
   .display_ver = 13,
};

static const struct intel_device_info intel_device_info_rpl = {
   GFX12_FEATURES(1, 1, 4),
   .num_subslices = dual_subslices(2),
   .platform = INTEL_PLATFORM_RPL,
   .display_ver = 13,
};

static const struct intel_device_info intel_device_info_rpl_p = {
   GFX12_GT_FEATURES(2),
   .platform = INTEL_PLATFORM_RPL,
   .display_ver = 13,
};

/* DG1/SG1 are discrete Gfx12 parts: local memory instead of LLC. */
#define GFX12_DG1_SG1_FEATURES                      \
   GFX12_GT_FEATURES(2),                            \
   .platform = INTEL_PLATFORM_DG1,                  \
   .has_llc = false,                                \
   .has_local_mem = true,                           \
   .urb.size = 768,                                 \
   .simulator_id = 30

static const struct intel_device_info intel_device_info_dg1 = {
   GFX12_DG1_SG1_FEATURES,
};

static const struct intel_device_info intel_device_info_sg1 = {
   GFX12_DG1_SG1_FEATURES,
};

#define XEHP_FEATURES(_gt, _slices, _l3)            \
   GFX12_FEATURES(_gt, _slices, _l3),               \
   .num_thread_per_eu = 8 /* BSpec 44472 */,        \
   .verx10 = 125,                                   \
   .has_llc = false,                                \
   .has_local_mem = true,                           \
   .has_aux_map = false,                            \
   .simulator_id = 29,                              \
   .cs_prefetch_size = 1024

#define DG2_FEATURES                                \
   /* (Sub)slice info comes from the kernel topology info */ \
   XEHP_FEATURES(0, 1, 0),                          \
   .display_ver = 13,                               \
   .revision = 4, /* For offline compiler */        \
   .num_subslices = dual_subslices(1),              \
   .has_lsc = true,                                 \
   .apply_hwconfig = true,                          \
   .has_coarse_pixel_primitive_and_cb = true,       \
   .has_mesh_shading = true

static const struct intel_device_info intel_device_info_dg2_g10 = {
   DG2_FEATURES,
   .platform = INTEL_PLATFORM_DG2_G10,
};

static const struct intel_device_info intel_device_info_dg2_g11 = {
   DG2_FEATURES,
   .platform = INTEL_PLATFORM_DG2_G11,
};

static const struct intel_device_info intel_device_info_dg2_g12 = {
   DG2_FEATURES,
   .platform = INTEL_PLATFORM_DG2_G12,
};

/* Zero out all topology (slice/subslice/EU) masks, strides, and counts in
 * devinfo, prior to repopulating them from a fresh topology query.
 */
static void
reset_masks(struct intel_device_info *devinfo)
{
   devinfo->subslice_slice_stride = 0;
   devinfo->eu_subslice_stride = 0;
   devinfo->eu_slice_stride = 0;

   devinfo->num_slices = 0;
   memset(devinfo->num_subslices, 0, sizeof(devinfo->num_subslices));

   memset(&devinfo->slice_masks, 0, sizeof(devinfo->slice_masks));
   memset(devinfo->subslice_masks, 0, sizeof(devinfo->subslice_masks));
   memset(devinfo->eu_masks, 0, sizeof(devinfo->eu_masks));
   memset(devinfo->ppipe_subslices, 0, sizeof(devinfo->ppipe_subslices));
}

/* Recompute slice/subslice counts from the populated topology masks. */
static void
update_slice_subslice_counts(struct intel_device_info *devinfo)
{
   devinfo->num_slices = __builtin_popcount(devinfo->slice_masks);
   devinfo->subslice_total = 0;
   for (int s = 0; s < devinfo->max_slices; s++) {
      if (!intel_device_info_slice_available(devinfo, s))
         continue;

      for (int b = 0; b < devinfo->subslice_slice_stride; b++) {
         devinfo->num_subslices[s] +=
            __builtin_popcount(devinfo->subslice_masks[s * devinfo->subslice_slice_stride + b]);
      }
      devinfo->subslice_total += devinfo->num_subslices[s];
   }
   assert(devinfo->num_slices > 0);
   assert(devinfo->subslice_total > 0);
}

/* Derive the per-pixel-pipe subslice counts from the given subslice masks.
 * No-op before Gfx11, where ppipe_subslices is not used.
 */
static void
update_pixel_pipes(struct intel_device_info *devinfo, uint8_t *subslice_masks)
{
   if (devinfo->ver < 11)
      return;

   /* The kernel only reports one slice on all existing ICL+ platforms, even
    * if multiple slices are present. The slice mask is allowed to have the
    * accurate value greater than 1 on gfx12.5+ platforms though, in order to
    * be tolerant with the behavior of our simulation environment.
    */
   assert(devinfo->slice_masks == 1 || devinfo->verx10 >= 125);

   /* Count the number of subslices on each pixel pipe. Assume that every
    * contiguous group of 4 subslices in the mask belong to the same pixel
    * pipe. However note that on TGL+ the kernel returns a mask of enabled
    * *dual* subslices instead of actual subslices somewhat confusingly, so
    * each pixel pipe only takes 2 bits in the mask even though it's still 4
    * subslices.
    */
   const unsigned ppipe_bits = devinfo->ver >= 12 ? 2 : 4;
   for (unsigned p = 0; p < INTEL_DEVICE_MAX_PIXEL_PIPES; p++) {
      const unsigned offset = p * ppipe_bits;
      const unsigned subslice_idx = offset /
         devinfo->max_subslices_per_slice * devinfo->subslice_slice_stride;
      const unsigned ppipe_mask =
         BITFIELD_RANGE(offset % devinfo->max_subslices_per_slice, ppipe_bits);

      if (subslice_idx < ARRAY_SIZE(devinfo->subslice_masks))
         devinfo->ppipe_subslices[p] =
            __builtin_popcount(subslice_masks[subslice_idx] & ppipe_mask);
      else
         devinfo->ppipe_subslices[p] = 0;
   }
}

/* Pick the L3 bank count for Gfx12/Gfx12.5 parts from the enabled subslice
 * total.  Other generations keep the value from the static device table.
 */
static void
update_l3_banks(struct intel_device_info *devinfo)
{
   if (devinfo->ver != 12)
      return;

   if (devinfo->verx10 >= 125) {
      if (devinfo->subslice_total > 16) {
         assert(devinfo->subslice_total <= 32);
         devinfo->l3_banks = 32;
      } else if (devinfo->subslice_total > 8) {
         devinfo->l3_banks = 16;
      } else {
         devinfo->l3_banks = 8;
      }
   } else {
      assert(devinfo->num_slices == 1);
      if (devinfo->subslice_total >= 6) {
         assert(devinfo->subslice_total == 6);
         devinfo->l3_banks = 8;
      } else if (devinfo->subslice_total > 2) {
         devinfo->l3_banks = 6;
      } else {
         devinfo->l3_banks = 4;
      }
   }
}

/* At some point in time, some people decided to redefine what topology means,
 * from useful HW related information (slice, subslice, etc...), to much less
 * useful generic stuff that no one cares about (a single slice with lots of
 * subslices). Of course all of this was done without asking the people who
 * defined the topology query in the first place, to solve a lack of
 * information Gfx10+. This function is here to workaround the fact it's not
 * possible to change people's mind even before this stuff goes upstream. Sad
 * times...
 */
static void
update_from_single_slice_topology(struct intel_device_info *devinfo,
                                  const struct drm_i915_query_topology_info *topology,
                                  const struct drm_i915_query_topology_info *geom_topology)
{
   /* An array of bit masks of the subslices available for 3D
    * workloads, analogous to intel_device_info::subslice_masks. This
    * may differ from the set of enabled subslices on XeHP+ platforms
    * with compute-only subslices.
    */
   uint8_t geom_subslice_masks[ARRAY_SIZE(devinfo->subslice_masks)] = { 0 };

   assert(devinfo->verx10 >= 125);

   reset_masks(devinfo);

   assert(topology->max_slices == 1);
   assert(topology->max_subslices > 0);
   assert(topology->max_eus_per_subslice > 0);

   /* i915 gives us only one slice so we have to rebuild that out of groups of
    * 4 dualsubslices.
    */
   devinfo->max_subslices_per_slice = 4;
   devinfo->max_eus_per_subslice = 16;
   devinfo->subslice_slice_stride = 1;
   devinfo->eu_slice_stride = DIV_ROUND_UP(16 * 4, 8);
   devinfo->eu_subslice_stride = DIV_ROUND_UP(16, 8);

   for (uint32_t ss_idx = 0; ss_idx < topology->max_subslices; ss_idx++) {
      /* Each group of 4 kernel-reported DSS becomes one rebuilt slice. */
      const uint32_t s = ss_idx / 4;
      const uint32_t ss = ss_idx % 4;

      /* Determine whether ss_idx is enabled (ss_idx_available) and
       * available for 3D workloads (geom_ss_idx_available), which may
       * differ on XeHP+ if ss_idx is a compute-only DSS.
       */
      const bool ss_idx_available =
         (topology->data[topology->subslice_offset + ss_idx / 8] >>
          (ss_idx % 8)) & 1;
      const bool geom_ss_idx_available =
         (geom_topology->data[geom_topology->subslice_offset + ss_idx / 8] >>
          (ss_idx % 8)) & 1;

      if (geom_ss_idx_available) {
         assert(ss_idx_available);
         geom_subslice_masks[s * devinfo->subslice_slice_stride +
                             ss / 8] |= 1u << (ss % 8);
      }

      if (!ss_idx_available)
         continue;

      devinfo->max_slices = MAX2(devinfo->max_slices, s + 1);
      devinfo->slice_masks |= 1u << s;

      devinfo->subslice_masks[s * devinfo->subslice_slice_stride +
                              ss / 8] |= 1u << (ss % 8);

      for (uint32_t eu = 0; eu < devinfo->max_eus_per_subslice; eu++) {
         const bool eu_available =
            (topology->data[topology->eu_offset +
                            ss_idx * topology->eu_stride +
                            eu / 8] >> (eu % 8)) & 1;

         if (!eu_available)
            continue;

         devinfo->eu_masks[s * devinfo->eu_slice_stride +
                           ss * devinfo->eu_subslice_stride +
                           eu / 8] |= 1u << (eu % 8);
      }
   }

   update_slice_subslice_counts(devinfo);
   /* Pixel pipes only care about the 3D-capable (geometry) subslices. */
   update_pixel_pipes(devinfo, geom_subslice_masks);
   update_l3_banks(devinfo);
}

/* Populate devinfo masks/counts straight from a kernel topology query blob
 * (pre-Gfx12.5 layout, where slices are reported accurately).
 */
static void
update_from_topology(struct intel_device_info *devinfo,
                     const struct drm_i915_query_topology_info *topology)
{
   reset_masks(devinfo);

   assert(topology->max_slices > 0);
   assert(topology->max_subslices > 0);
   assert(topology->max_eus_per_subslice > 0);

   devinfo->subslice_slice_stride = topology->subslice_stride;

   devinfo->eu_subslice_stride = DIV_ROUND_UP(topology->max_eus_per_subslice, 8);
   devinfo->eu_slice_stride = topology->max_subslices * devinfo->eu_subslice_stride;

   assert(sizeof(devinfo->slice_masks) >= DIV_ROUND_UP(topology->max_slices, 8));
   memcpy(&devinfo->slice_masks, topology->data, DIV_ROUND_UP(topology->max_slices, 8));
   devinfo->max_slices = topology->max_slices;
   devinfo->max_subslices_per_slice = topology->max_subslices;
   devinfo->max_eus_per_subslice = topology->max_eus_per_subslice;

   uint32_t subslice_mask_len =
      topology->max_slices * topology->subslice_stride;
   assert(sizeof(devinfo->subslice_masks) >= subslice_mask_len);
   memcpy(devinfo->subslice_masks, &topology->data[topology->subslice_offset],
          subslice_mask_len);

   uint32_t eu_mask_len =
      topology->eu_stride * topology->max_subslices * topology->max_slices;
   assert(sizeof(devinfo->eu_masks) >= eu_mask_len);
   memcpy(devinfo->eu_masks, &topology->data[topology->eu_offset], eu_mask_len);

   /* Now that all the masks are in place, update the counts. */
   update_slice_subslice_counts(devinfo);
   update_pixel_pipes(devinfo, devinfo->subslice_masks);
   update_l3_banks(devinfo);
}

/* Generate detailed mask from the I915_PARAM_SLICE_MASK,
 * I915_PARAM_SUBSLICE_MASK & I915_PARAM_EU_TOTAL getparam.
 */
static bool
update_from_masks(struct intel_device_info *devinfo, uint32_t slice_mask,
                  uint32_t subslice_mask, uint32_t n_eus)
{
   struct drm_i915_query_topology_info *topology;

   assert((slice_mask & 0xff) == slice_mask);

   /* Synthesized topology blob payload; 100 bytes covers the largest
    * slice/subslice/EU layout describable by the legacy getparam masks.
    */
   size_t data_length = 100;

   topology = calloc(1, sizeof(*topology) + data_length);
   if (!topology)
      return false;

   topology->max_slices = util_last_bit(slice_mask);
   topology->max_subslices = util_last_bit(subslice_mask);

   topology->subslice_offset = DIV_ROUND_UP(topology->max_slices, 8);
   topology->subslice_stride = DIV_ROUND_UP(topology->max_subslices, 8);

   /* The getparam interface only reports an EU total, so distribute the EUs
    * evenly and assume every subslice exposes the same EU mask.
    */
   uint32_t n_subslices = __builtin_popcount(slice_mask) *
      __builtin_popcount(subslice_mask);
   uint32_t max_eus_per_subslice = DIV_ROUND_UP(n_eus, n_subslices);
   uint32_t eu_mask = (1U << max_eus_per_subslice) - 1;

   topology->max_eus_per_subslice = max_eus_per_subslice;
   topology->eu_offset = topology->subslice_offset +
      topology->max_slices * DIV_ROUND_UP(topology->max_subslices, 8);
   topology->eu_stride = DIV_ROUND_UP(max_eus_per_subslice, 8);

   /* Set slice mask in topology */
   for (int b = 0; b < topology->subslice_offset; b++)
      topology->data[b] = (slice_mask >> (b * 8)) & 0xff;

   for (int s = 0; s < topology->max_slices; s++) {

      /* Set subslice mask in topology */
      for (int b = 0; b < topology->subslice_stride; b++) {
         int subslice_offset = topology->subslice_offset +
            s * topology->subslice_stride + b;

         topology->data[subslice_offset] = (subslice_mask >> (b * 8)) & 0xff;
      }

      /* Set eu mask in topology */
      for (int ss = 0; ss < topology->max_subslices; ss++) {
         for (int b = 0; b < topology->eu_stride; b++) {
            int eu_offset = topology->eu_offset +
               (s * topology->max_subslices + ss) * topology->eu_stride + b;

            topology->data[eu_offset] = (eu_mask >> (b * 8)) & 0xff;
         }
      }
   }

   update_from_topology(devinfo, topology);
   free(topology);

   return true;
}

/* Generate mask from the device data. */
static void
fill_masks(struct intel_device_info *devinfo)
{
   /* All of our internal device descriptions assign the same number of
    * subslices for each slice. Just verify that this is true.
    */
   for (int s = 1; s < devinfo->num_slices; s++)
      assert(devinfo->num_subslices[0] == devinfo->num_subslices[s]);

   update_from_masks(devinfo,
                     (1U << devinfo->num_slices) - 1,
                     (1U << devinfo->num_subslices[0]) - 1,
                     devinfo->num_slices * devinfo->num_subslices[0] *
                     devinfo->max_eus_per_subslice);
}

/* Thin wrapper over DRM_IOCTL_I915_GETPARAM; returns false on ioctl failure
 * and leaves *value untouched in that case.
 */
static bool
getparam(int fd, uint32_t param, int *value)
{
   int tmp;

   struct drm_i915_getparam gp = {
      .param = param,
      .value = &tmp,
   };

   int ret = intel_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
   if (ret != 0)
      return false;

   *value = tmp;
   return true;
}

/* Thin wrapper over DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM for a given context
 * id; returns false on ioctl failure.
 */
static bool
get_context_param(int fd, uint32_t context, uint32_t param, uint64_t *value)
{
   struct drm_i915_gem_context_param gp = {
      .ctx_id = context,
      .param = param,
   };

   int ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &gp);
   if (ret != 0)
      return false;

   *value = gp.value;
   return true;
}

static void
update_cs_workgroup_threads(struct intel_device_info *devinfo)
{
   /* GPGPU_WALKER::ThreadWidthCounterMaximum is U6-1 so the most threads we
    * can program is 64 without going up to a rectangular group. This only
    * impacts Haswell and TGL which have higher thread counts.
    *
    * INTERFACE_DESCRIPTOR_DATA::NumberofThreadsinGPGPUThreadGroup on Xe-HP+
    * is 10 bits so we have no such restrictions.
    */
   devinfo->max_cs_workgroup_threads =
      devinfo->verx10 >= 125 ? devinfo->max_cs_threads :
                               MIN2(devinfo->max_cs_threads, 64);
}

/* Initialize *devinfo from the static per-SKU tables keyed by PCI device id.
 * Returns false (with a warning) for unknown ids.
 */
bool
intel_get_device_info_from_pci_id(int pci_id,
                                  struct intel_device_info *devinfo)
{
   switch (pci_id) {
#undef CHIPSET
#define CHIPSET(id, family, fam_str, name) \
   case id: *devinfo = intel_device_info_##family; break;
#include "pci_ids/crocus_pci_ids.h"
#include "pci_ids/iris_pci_ids.h"

#undef CHIPSET
#define CHIPSET(id, fam_str, name) \
   case id: *devinfo = intel_device_info_gfx3; break;
#include "pci_ids/i915_pci_ids.h"

   default:
      mesa_logw("Driver does not support the 0x%x PCI ID.", pci_id);
      return false;
   }

   /* Second pass: fill in the marketing name string for the device. */
   switch (pci_id) {
#undef CHIPSET
#define CHIPSET(_id, _family, _fam_str, _name) \
   case _id: \
      /* sizeof(str_literal) includes the null */ \
      STATIC_ASSERT(sizeof(_name) + sizeof(_fam_str) + 2 <= \
                    sizeof(devinfo->name)); \
      strncpy(devinfo->name, _name " (" _fam_str ")", sizeof(devinfo->name)); \
      break;
#include "pci_ids/crocus_pci_ids.h"
#include "pci_ids/iris_pci_ids.h"
   default:
      strncpy(devinfo->name, "Intel Unknown", sizeof(devinfo->name));
   }

   fill_masks(devinfo);

   /* From the Skylake PRM, 3DSTATE_PS::Scratch Space Base Pointer:
    *
    * "Scratch Space per slice is computed based on 4 sub-slices. SW must
    *  allocate scratch space enough so that each slice has 4 slices allowed."
    *
    * The equivalent internal documentation says that this programming note
    * applies to all Gfx9+ platforms.
    *
    * The hardware typically calculates the scratch space pointer by taking
    * the base address, and adding per-thread-scratch-space * thread ID.
    * Extra padding can be necessary depending how the thread IDs are
    * calculated for a particular shader stage.
    */

   switch(devinfo->ver) {
   case 9:
      devinfo->max_wm_threads = 64 /* threads-per-PSD */
                              * devinfo->num_slices
                              * 4; /* effective subslices per slice */
      break;
   case 11:
   case 12:
      devinfo->max_wm_threads = 128 /* threads-per-PSD */
                              * devinfo->num_slices
                              * 8; /* subslices per slice */
      break;
   default:
      assert(devinfo->ver < 9);
      break;
   }

   assert(devinfo->num_slices <= ARRAY_SIZE(devinfo->num_subslices));

   if (devinfo->verx10 == 0)
      devinfo->verx10 = devinfo->ver * 10;

   if (devinfo->display_ver == 0)
      devinfo->display_ver = devinfo->ver;

   update_cs_workgroup_threads(devinfo);

   return true;
}

/**
 * for gfx8/gfx9, SLICE_MASK/SUBSLICE_MASK can be used to compute the topology
 * (kernel 4.13+)
 */
static bool
getparam_topology(struct intel_device_info *devinfo, int fd)
{
   int slice_mask = 0;
   if (!getparam(fd, I915_PARAM_SLICE_MASK, &slice_mask))
      goto maybe_warn;

   int n_eus;
   if (!getparam(fd, I915_PARAM_EU_TOTAL, &n_eus))
      goto maybe_warn;

   int subslice_mask = 0;
   if (!getparam(fd, I915_PARAM_SUBSLICE_MASK, &subslice_mask))
      goto maybe_warn;

   return update_from_masks(devinfo, slice_mask, subslice_mask, n_eus);

 maybe_warn:
   /* Only with Gfx8+ are we starting to see devices with fusing that can only
    * be detected at runtime.
    */
   if (devinfo->ver >= 8)
      mesa_logw("Kernel 4.1 required to properly query GPU properties.");

   return false;
}

/**
 * preferred API for updating the topology in devinfo (kernel 4.17+)
 */
static bool
query_topology(struct intel_device_info *devinfo, int fd)
{
   struct drm_i915_query_topology_info *topo_info =
      intel_i915_query_alloc(fd, DRM_I915_QUERY_TOPOLOGY_INFO, NULL);
   if (topo_info == NULL)
      return false;

   if (devinfo->verx10 >= 125) {
      /* Gfx12.5+ additionally needs the geometry-capable subslice set,
       * which can be a subset of the enabled subslices.
       */
      struct drm_i915_query_topology_info *geom_topo_info =
         intel_i915_query_alloc(fd, DRM_I915_QUERY_GEOMETRY_SUBSLICES, NULL);
      if (geom_topo_info == NULL) {
         free(topo_info);
         return false;
      }

      update_from_single_slice_topology(devinfo, topo_info, geom_topo_info);
      free(geom_topo_info);
   } else {
      update_from_topology(devinfo, topo_info);
   }

   free(topo_info);

   return true;

}

/**
 * Reports memory region info, and allows buffers to target system-memory,
 * and/or device local memory.
 */
static bool
query_regions(struct intel_device_info *devinfo, int fd, bool update)
{
   struct drm_i915_query_memory_regions *meminfo =
      intel_i915_query_alloc(fd, DRM_I915_QUERY_MEMORY_REGIONS, NULL);
   if (meminfo == NULL)
      return false;

   for (int i = 0; i < meminfo->num_regions; i++) {
      const struct drm_i915_memory_region_info *mem = &meminfo->regions[i];
      switch (mem->region.memory_class) {
      case I915_MEMORY_CLASS_SYSTEM: {
         if (!update) {
            devinfo->mem.sram.mem_class = mem->region.memory_class;
            devinfo->mem.sram.mem_instance = mem->region.memory_instance;
            devinfo->mem.sram.mappable.size = mem->probed_size;
         } else {
            assert(devinfo->mem.sram.mem_class == mem->region.memory_class);
            assert(devinfo->mem.sram.mem_instance == mem->region.memory_instance);
            assert(devinfo->mem.sram.mappable.size == mem->probed_size);
         }
         /* The kernel uAPI only reports an accurate unallocated_size value
          * for I915_MEMORY_CLASS_DEVICE.
          */
         uint64_t available;
         if (os_get_available_system_memory(&available))
            devinfo->mem.sram.mappable.free = MIN2(available, mem->probed_size);
         break;
      }
      case I915_MEMORY_CLASS_DEVICE:
         if (!update) {
            devinfo->mem.vram.mem_class = mem->region.memory_class;
            devinfo->mem.vram.mem_instance = mem->region.memory_instance;
            if (mem->probed_cpu_visible_size > 0) {
               devinfo->mem.vram.mappable.size = mem->probed_cpu_visible_size;
               devinfo->mem.vram.unmappable.size =
                  mem->probed_size - mem->probed_cpu_visible_size;
            } else {
               /* We are running on an older kernel without support for the
                * small-bar uapi. These kernels only support systems where the
                * entire vram is mappable.
                */
               devinfo->mem.vram.mappable.size = mem->probed_size;
               devinfo->mem.vram.unmappable.size = 0;
            }
         } else {
            assert(devinfo->mem.vram.mem_class == mem->region.memory_class);
            assert(devinfo->mem.vram.mem_instance == mem->region.memory_instance);
            assert((devinfo->mem.vram.mappable.size +
                    devinfo->mem.vram.unmappable.size) == mem->probed_size);
         }
         if (mem->unallocated_cpu_visible_size > 0) {
            if (mem->unallocated_size != -1) {
               devinfo->mem.vram.mappable.free = mem->unallocated_cpu_visible_size;
               devinfo->mem.vram.unmappable.free =
                  mem->unallocated_size - mem->unallocated_cpu_visible_size;
            }
         } else {
            /* We are running on an older kernel without support for the
             * small-bar uapi. These kernels only support systems where the
             * entire vram is mappable.
             */
            if (mem->unallocated_size != -1) {
               devinfo->mem.vram.mappable.free = mem->unallocated_size;
               devinfo->mem.vram.unmappable.free = 0;
            }
         }
         break;
      default:
         break;
      }
   }

   free(meminfo);
   devinfo->mem.use_class_instance = true;
   return true;
}

/* Fallback for kernels without the memory-region query: derive system memory
 * sizes from the OS.  With update set, only refresh the free amount and
 * sanity-check the total.
 */
static bool
compute_system_memory(struct intel_device_info *devinfo, bool update)
{
   uint64_t total_phys;
   if (!os_get_total_physical_memory(&total_phys))
      return false;

   uint64_t available = 0;
   os_get_available_system_memory(&available);

   if (!update)
      devinfo->mem.sram.mappable.size = total_phys;
   else
      assert(devinfo->mem.sram.mappable.size == total_phys);

   devinfo->mem.sram.mappable.free = available;

   return true;
}

/* Query the GTT aperture size; returns the raw ioctl result (0 on success). */
static int
intel_get_aperture_size(int fd, uint64_t *size)
{
   struct drm_i915_gem_get_aperture aperture = { 0 };

   int ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture);
   if (ret == 0 && size)
      *size = aperture.aper_size;

   return ret;
}

static bool
1710has_bit6_swizzle(int fd) 1711{ 1712 struct drm_gem_close close; 1713 int ret; 1714 1715 struct drm_i915_gem_create gem_create = { 1716 .size = 4096, 1717 }; 1718 1719 if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create)) { 1720 unreachable("Failed to create GEM BO"); 1721 return false; 1722 } 1723 1724 bool swizzled = false; 1725 1726 /* set_tiling overwrites the input on the error path, so we have to open 1727 * code intel_ioctl. 1728 */ 1729 do { 1730 struct drm_i915_gem_set_tiling set_tiling = { 1731 .handle = gem_create.handle, 1732 .tiling_mode = I915_TILING_X, 1733 .stride = 512, 1734 }; 1735 1736 ret = ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling); 1737 } while (ret == -1 && (errno == EINTR || errno == EAGAIN)); 1738 1739 if (ret != 0) { 1740 unreachable("Failed to set BO tiling"); 1741 goto close_and_return; 1742 } 1743 1744 struct drm_i915_gem_get_tiling get_tiling = { 1745 .handle = gem_create.handle, 1746 }; 1747 1748 if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling)) { 1749 unreachable("Failed to get BO tiling"); 1750 goto close_and_return; 1751 } 1752 1753 assert(get_tiling.tiling_mode == I915_TILING_X); 1754 swizzled = get_tiling.swizzle_mode != I915_BIT_6_SWIZZLE_NONE; 1755 1756close_and_return: 1757 memset(&close, 0, sizeof(close)); 1758 close.handle = gem_create.handle; 1759 intel_ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close); 1760 1761 return swizzled; 1762} 1763 1764static bool 1765has_get_tiling(int fd) 1766{ 1767 int ret; 1768 1769 struct drm_i915_gem_create gem_create = { 1770 .size = 4096, 1771 }; 1772 1773 if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create)) { 1774 unreachable("Failed to create GEM BO"); 1775 return false; 1776 } 1777 1778 struct drm_i915_gem_get_tiling get_tiling = { 1779 .handle = gem_create.handle, 1780 }; 1781 ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &get_tiling); 1782 1783 struct drm_gem_close close = { 1784 .handle = gem_create.handle, 1785 }; 1786 intel_ioctl(fd, 
DRM_IOCTL_GEM_CLOSE, &close); 1787 1788 return ret == 0; 1789} 1790 1791static void 1792fixup_chv_device_info(struct intel_device_info *devinfo) 1793{ 1794 assert(devinfo->platform == INTEL_PLATFORM_CHV); 1795 1796 /* Cherryview is annoying. The number of EUs is depending on fusing and 1797 * isn't determinable from the PCI ID alone. We default to the minimum 1798 * available for that PCI ID and then compute the real value from the 1799 * subslice information we get from the kernel. 1800 */ 1801 const uint32_t subslice_total = intel_device_info_subslice_total(devinfo); 1802 const uint32_t eu_total = intel_device_info_eu_total(devinfo); 1803 1804 /* Logical CS threads = EUs per subslice * num threads per EU */ 1805 uint32_t max_cs_threads = 1806 eu_total / subslice_total * devinfo->num_thread_per_eu; 1807 1808 /* Fuse configurations may give more threads than expected, never less. */ 1809 if (max_cs_threads > devinfo->max_cs_threads) 1810 devinfo->max_cs_threads = max_cs_threads; 1811 1812 update_cs_workgroup_threads(devinfo); 1813 1814 /* Braswell is even more annoying. Its marketing name isn't determinable 1815 * from the PCI ID and is also dependent on fusing. 1816 */ 1817 if (devinfo->pci_device_id != 0x22B1) 1818 return; 1819 1820 char *bsw_model; 1821 switch (eu_total) { 1822 case 16: bsw_model = "405"; break; 1823 case 12: bsw_model = "400"; break; 1824 default: bsw_model = " "; break; 1825 } 1826 1827 char *needle = strstr(devinfo->name, "XXX"); 1828 assert(needle); 1829 if (needle) 1830 memcpy(needle, bsw_model, 3); 1831} 1832 1833static void 1834init_max_scratch_ids(struct intel_device_info *devinfo) 1835{ 1836 /* Determine the max number of subslices that potentially might be used in 1837 * scratch space ids. 1838 * 1839 * For, Gfx11+, scratch space allocation is based on the number of threads 1840 * in the base configuration. 
    *
    * For Gfx9, devinfo->subslice_total is the TOTAL number of subslices and
    * we wish to view that there are 4 subslices per slice instead of the
    * actual number of subslices per slice. The documentation for 3DSTATE_PS
    * "Scratch Space Base Pointer" says:
    *
    *    "Scratch Space per slice is computed based on 4 sub-slices. SW
    *     must allocate scratch space enough so that each slice has 4
    *     slices allowed."
    *
    * According to the other driver team, this applies to compute shaders
    * as well. This is not currently documented at all.
    *
    * For Gfx8 and older we use devinfo->subslice_total.
    */
   unsigned subslices;
   if (devinfo->verx10 == 125)
      subslices = 32;
   else if (devinfo->ver == 12)
      subslices = (devinfo->platform == INTEL_PLATFORM_DG1 || devinfo->gt == 2 ? 6 : 2);
   else if (devinfo->ver == 11)
      subslices = 8;
   else if (devinfo->ver >= 9 && devinfo->ver < 11)
      subslices = 4 * devinfo->num_slices;
   else
      subslices = devinfo->subslice_total;
   assert(subslices >= devinfo->subslice_total);

   unsigned scratch_ids_per_subslice;
   if (devinfo->ver >= 12) {
      /* Same as ICL below, but with 16 EUs. */
      scratch_ids_per_subslice = 16 * 8;
   } else if (devinfo->ver >= 11) {
      /* The MEDIA_VFE_STATE docs say:
       *
       * "Starting with this configuration, the Maximum Number of
       * Threads must be set to (#EU * 8) for GPGPU dispatches.
       *
       * Although there are only 7 threads per EU in the configuration,
       * the FFTID is calculated as if there are 8 threads per EU,
       * which in turn requires a larger amount of Scratch Space to be
       * allocated by the driver."
       */
      scratch_ids_per_subslice = 8 * 8;
   } else if (devinfo->platform == INTEL_PLATFORM_HSW) {
      /* WaCSScratchSize:hsw
       *
       * Haswell's scratch space address calculation appears to be sparse
       * rather than tightly packed. The Thread ID has bits indicating
       * which subslice, EU within a subslice, and thread within an EU it
       * is. There's a maximum of two slices and two subslices, so these
       * can be stored with a single bit. Even though there are only 10 EUs
       * per subslice, this is stored in 4 bits, so there's an effective
       * maximum value of 16 EUs. Similarly, although there are only 7
       * threads per EU, this is stored in a 3 bit number, giving an
       * effective maximum value of 8 threads per EU.
       *
       * This means that we need to use 16 * 8 instead of 10 * 7 for the
       * number of threads per subslice.
       */
      scratch_ids_per_subslice = 16 * 8;
   } else if (devinfo->platform == INTEL_PLATFORM_CHV) {
      /* Cherryview devices have either 6 or 8 EUs per subslice, and each
       * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
       * as if it had 8 EUs.
       */
      scratch_ids_per_subslice = 8 * 7;
   } else {
      scratch_ids_per_subslice = devinfo->max_cs_threads;
   }

   unsigned max_thread_ids = scratch_ids_per_subslice * subslices;

   if (devinfo->verx10 >= 125) {
      /* On GFX version 12.5, scratch access changed to a surface-based model.
       * Instead of each shader type having its own layout based on IDs passed
       * from the relevant fixed-function unit, all scratch access is based on
       * thread IDs like it always has been for compute.
       */
      for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_STAGES; i++)
         devinfo->max_scratch_ids[i] = max_thread_ids;
   } else {
      unsigned max_scratch_ids[] = {
         [MESA_SHADER_VERTEX] = devinfo->max_vs_threads,
         [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
         [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
         [MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads,
         [MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads,
         [MESA_SHADER_COMPUTE] = max_thread_ids,
      };
      STATIC_ASSERT(sizeof(devinfo->max_scratch_ids) == sizeof(max_scratch_ids));
      memcpy(devinfo->max_scratch_ids, max_scratch_ids,
             sizeof(devinfo->max_scratch_ids));
   }
}

/* Fully initialize *devinfo for the device behind fd: static PCI-id tables
 * first, then kernel queries (hwconfig, topology, memory regions, tiling,
 * GTT size).  Returns false if the device or required kernel uAPI is
 * unsupported.
 */
bool
intel_get_device_info_from_fd(int fd, struct intel_device_info *devinfo)
{
   /* Get PCI info.
    *
    * Some callers may already have a valid drm device which holds values of
    * PCI fields queried here prior to calling this function. But making this
    * query optional leads to a more cumbersome implementation. These callers
    * still need to initialize the fields somewhere out of this function and
    * rely on an ioctl to get PCI device id for the next step when skipping
    * this drm query.
    */
   drmDevicePtr drmdev = NULL;
   if (drmGetDevice2(fd, DRM_DEVICE_GET_PCI_REVISION, &drmdev)) {
      mesa_loge("Failed to query drm device.");
      return false;
   }
   if (!intel_get_device_info_from_pci_id
       (drmdev->deviceinfo.pci->device_id, devinfo)) {
      drmFreeDevice(&drmdev);
      return false;
   }
   devinfo->pci_domain = drmdev->businfo.pci->domain;
   devinfo->pci_bus = drmdev->businfo.pci->bus;
   devinfo->pci_dev = drmdev->businfo.pci->dev;
   devinfo->pci_func = drmdev->businfo.pci->func;
   devinfo->pci_device_id = drmdev->deviceinfo.pci->device_id;
   devinfo->pci_revision_id = drmdev->deviceinfo.pci->revision_id;
   drmFreeDevice(&drmdev);
   devinfo->no_hw = env_var_as_boolean("INTEL_NO_HW", false);

   if (devinfo->ver == 10) {
      mesa_loge("Gfx10 support is redacted.");
      return false;
   }

   /* remaining initializion queries the kernel for device info */
   if (devinfo->no_hw) {
      /* Provide some sensible values for NO_HW. */
      devinfo->gtt_size =
         devinfo->ver >= 8 ? (1ull << 48) : 2ull * 1024 * 1024 * 1024;
      compute_system_memory(devinfo, false);
      return true;
   }

   if (intel_get_and_process_hwconfig_table(fd, devinfo)) {
      /* After applying hwconfig values, some items need to be recalculated. */
      devinfo->max_cs_threads =
         devinfo->max_eus_per_subslice * devinfo->num_thread_per_eu;

      update_cs_workgroup_threads(devinfo);
   }

   int timestamp_frequency;
   if (getparam(fd, I915_PARAM_CS_TIMESTAMP_FREQUENCY,
                &timestamp_frequency))
      devinfo->timestamp_frequency = timestamp_frequency;
   else if (devinfo->ver >= 10) {
      mesa_loge("Kernel 4.15 required to read the CS timestamp frequency.");
      return false;
   }

   if (!getparam(fd, I915_PARAM_REVISION, &devinfo->revision))
      devinfo->revision = 0;

   if (!query_topology(devinfo, fd)) {
      if (devinfo->ver >= 10) {
         /* topology uAPI required for CNL+ (kernel 4.17+) */
         return false;
      }

      /* else use the kernel 4.13+ api for gfx8+. For older kernels, topology
       * will be wrong, affecting GPU metrics. In this case, fail silently.
       */
      getparam_topology(devinfo, fd);
   }

   /* If the memory region uAPI query is not available, try to generate some
    * numbers out of os_* utils for sram only.
    */
   if (!query_regions(devinfo, fd, false))
      compute_system_memory(devinfo, false);

   /* region info is required for lmem support */
   if (devinfo->has_local_mem && !devinfo->mem.use_class_instance) {
      mesa_logw("Could not query local memory size.");
      return false;
   }

   if (devinfo->platform == INTEL_PLATFORM_CHV)
      fixup_chv_device_info(devinfo);

   /* Broadwell PRM says:
    *
    *   "Before Gfx8, there was a historical configuration control field to
    *    swizzle address bit[6] for in X/Y tiling modes. This was set in three
    *    different places: TILECTL[1:0], ARB_MODE[5:4], and
    *    DISP_ARB_CTL[14:13].
    *
    *    For Gfx8 and subsequent generations, the swizzle fields are all
    *    reserved, and the CPU's memory controller performs all address
    *    swizzling modifications."
    */
   devinfo->has_bit6_swizzle = devinfo->ver < 8 && has_bit6_swizzle(fd);

   intel_get_aperture_size(fd, &devinfo->aperture_bytes);
   get_context_param(fd, 0, I915_CONTEXT_PARAM_GTT_SIZE, &devinfo->gtt_size);
   devinfo->has_tiling_uapi = has_get_tiling(fd);

   /* Gfx7 and older do not support EU/Subslice info */
   assert(devinfo->subslice_total >= 1 || devinfo->ver <= 7);
   devinfo->subslice_total = MAX2(devinfo->subslice_total, 1);

   init_max_scratch_ids(devinfo);

   return true;
}

/* Refresh the free/total memory numbers in devinfo; falls back to OS-derived
 * system memory figures when the region query is unavailable.
 */
bool intel_device_info_update_memory_info(struct intel_device_info *devinfo, int fd)
{
   return query_regions(devinfo, fd, true) || compute_system_memory(devinfo, true);
}