1/*
2 * Copyright © 2013 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include <assert.h>
25#include <stdbool.h>
26#include <stdio.h>
27#include <stdlib.h>
28#include <string.h>
29#include <unistd.h>
30
31#include <xf86drm.h>
32
33#include "intel_device_info.h"
34#include "intel_hwconfig.h"
35#include "intel/common/intel_gem.h"
36#include "util/bitscan.h"
37#include "util/debug.h"
38#include "util/log.h"
39#include "util/macros.h"
40#include "util/os_misc.h"
41
42#include "drm-uapi/i915_drm.h"
43
/* Lookup table pairing each short device name with one PCI device ID.
 * intel_device_name_to_pci_device_id() scans it front to back.
 */
static const struct {
   const char *name;
   int pci_id;
} name_map[] = {
   { .name = "lpt", .pci_id = 0x27a2 },
   { .name = "brw", .pci_id = 0x2a02 },
   { .name = "g4x", .pci_id = 0x2a42 },
   { .name = "ilk", .pci_id = 0x0042 },
   { .name = "snb", .pci_id = 0x0126 },
   { .name = "ivb", .pci_id = 0x016a },
   { .name = "hsw", .pci_id = 0x0d2e },
   { .name = "byt", .pci_id = 0x0f33 },
   { .name = "bdw", .pci_id = 0x162e },
   { .name = "chv", .pci_id = 0x22b3 },
   { .name = "skl", .pci_id = 0x1912 },
   { .name = "bxt", .pci_id = 0x5a85 },
   { .name = "kbl", .pci_id = 0x5912 },
   { .name = "aml", .pci_id = 0x591c },
   { .name = "glk", .pci_id = 0x3185 },
   { .name = "cfl", .pci_id = 0x3e9b },
   { .name = "whl", .pci_id = 0x3ea1 },
   { .name = "cml", .pci_id = 0x9b41 },
   { .name = "icl", .pci_id = 0x8a52 },
   { .name = "ehl", .pci_id = 0x4500 },
   { .name = "jsl", .pci_id = 0x4e71 },
   { .name = "tgl", .pci_id = 0x9a49 },
   { .name = "rkl", .pci_id = 0x4c8a },
   { .name = "dg1", .pci_id = 0x4905 },
   { .name = "adl", .pci_id = 0x4680 },
   { .name = "sg1", .pci_id = 0x4907 },
   { .name = "rpl", .pci_id = 0xa780 },
   { .name = "dg2", .pci_id = 0x5690 },
};
77
78/**
79 * Get the PCI ID for the device name.
80 *
81 * Returns -1 if the device is not known.
82 */
83int
84intel_device_name_to_pci_device_id(const char *name)
85{
86   for (unsigned i = 0; i < ARRAY_SIZE(name_map); i++) {
87      if (!strcmp(name_map[i].name, name))
88         return name_map[i].pci_id;
89   }
90
91   return -1;
92}
93
94static const struct intel_device_info intel_device_info_gfx3 = {
95   .ver = 3,
96   .platform = INTEL_PLATFORM_GFX3,
97   .simulator_id = -1,
98   .num_slices = 1,
99   .num_subslices = { 1, },
100   .max_eus_per_subslice = 8,
101   .num_thread_per_eu = 4,
102   .timestamp_frequency = 12500000,
103   .cs_prefetch_size = 512,
104};
105
106static const struct intel_device_info intel_device_info_i965 = {
107   .ver = 4,
108   .platform = INTEL_PLATFORM_I965,
109   .has_negative_rhw_bug = true,
110   .num_slices = 1,
111   .num_subslices = { 1, },
112   .max_eus_per_subslice = 8,
113   .num_thread_per_eu = 4,
114   .max_vs_threads = 16,
115   .max_gs_threads = 2,
116   .max_wm_threads = 8 * 4,
117   .urb = {
118      .size = 256,
119   },
120   .timestamp_frequency = 12500000,
121   .simulator_id = -1,
122   .cs_prefetch_size = 512,
123};
124
125static const struct intel_device_info intel_device_info_g4x = {
126   .ver = 4,
127   .verx10 = 45,
128   .has_pln = true,
129   .has_compr4 = true,
130   .has_surface_tile_offset = true,
131   .platform = INTEL_PLATFORM_G4X,
132   .num_slices = 1,
133   .num_subslices = { 1, },
134   .max_eus_per_subslice = 10,
135   .num_thread_per_eu = 5,
136   .max_vs_threads = 32,
137   .max_gs_threads = 2,
138   .max_wm_threads = 10 * 5,
139   .urb = {
140      .size = 384,
141   },
142   .timestamp_frequency = 12500000,
143   .simulator_id = -1,
144   .cs_prefetch_size = 512,
145};
146
147static const struct intel_device_info intel_device_info_ilk = {
148   .ver = 5,
149   .platform = INTEL_PLATFORM_ILK,
150   .has_pln = true,
151   .has_compr4 = true,
152   .has_surface_tile_offset = true,
153   .num_slices = 1,
154   .num_subslices = { 1, },
155   .max_eus_per_subslice = 12,
156   .num_thread_per_eu = 6,
157   .max_vs_threads = 72,
158   .max_gs_threads = 32,
159   .max_wm_threads = 12 * 6,
160   .urb = {
161      .size = 1024,
162   },
163   .timestamp_frequency = 12500000,
164   .simulator_id = -1,
165   .cs_prefetch_size = 512,
166};
167
168static const struct intel_device_info intel_device_info_snb_gt1 = {
169   .ver = 6,
170   .gt = 1,
171   .platform = INTEL_PLATFORM_SNB,
172   .has_hiz_and_separate_stencil = true,
173   .has_llc = true,
174   .has_pln = true,
175   .has_surface_tile_offset = true,
176   .needs_unlit_centroid_workaround = true,
177   .num_slices = 1,
178   .num_subslices = { 1, },
179   .max_eus_per_subslice = 6,
180   .num_thread_per_eu = 6, /* Not confirmed */
181   .max_vs_threads = 24,
182   .max_gs_threads = 21, /* conservative; 24 if rendering disabled. */
183   .max_wm_threads = 40,
184   .urb = {
185      .size = 32,
186      .min_entries = {
187         [MESA_SHADER_VERTEX]   = 24,
188      },
189      .max_entries = {
190         [MESA_SHADER_VERTEX]   = 256,
191         [MESA_SHADER_GEOMETRY] = 256,
192      },
193   },
194   .timestamp_frequency = 12500000,
195   .simulator_id = -1,
196   .cs_prefetch_size = 512,
197};
198
199static const struct intel_device_info intel_device_info_snb_gt2 = {
200   .ver = 6,
201   .gt = 2,
202   .platform = INTEL_PLATFORM_SNB,
203   .has_hiz_and_separate_stencil = true,
204   .has_llc = true,
205   .has_pln = true,
206   .has_surface_tile_offset = true,
207   .needs_unlit_centroid_workaround = true,
208   .num_slices = 1,
209   .num_subslices = { 1, },
210   .max_eus_per_subslice = 12,
211   .num_thread_per_eu = 6, /* Not confirmed */
212   .max_vs_threads = 60,
213   .max_gs_threads = 60,
214   .max_wm_threads = 80,
215   .urb = {
216      .size = 64,
217      .min_entries = {
218         [MESA_SHADER_VERTEX]   = 24,
219      },
220      .max_entries = {
221         [MESA_SHADER_VERTEX]   = 256,
222         [MESA_SHADER_GEOMETRY] = 256,
223      },
224   },
225   .timestamp_frequency = 12500000,
226   .simulator_id = -1,
227   .cs_prefetch_size = 512,
228};
229
/* Designated initializers shared by every ver 7 device description below.
 * Expand this first in an initializer list so that later designators can
 * override individual fields.
 */
#define GFX7_FEATURES                               \
   .ver = 7,                                        \
   .has_64bit_float = true,                         \
   .has_hiz_and_separate_stencil = true,            \
   .has_llc = true,                                 \
   .has_pln = true,                                 \
   .has_surface_tile_offset = true,                 \
   .must_use_separate_stencil = true,               \
   .cs_prefetch_size = 512,                         \
   .max_constant_urb_size_kb = 16,                  \
   .timestamp_frequency = 12500000
241
242static const struct intel_device_info intel_device_info_ivb_gt1 = {
243   GFX7_FEATURES, .platform = INTEL_PLATFORM_IVB, .gt = 1,
244   .num_slices = 1,
245   .num_subslices = { 1, },
246   .max_eus_per_subslice = 6,
247   .num_thread_per_eu = 6,
248   .l3_banks = 2,
249   .max_vs_threads = 36,
250   .max_tcs_threads = 36,
251   .max_tes_threads = 36,
252   .max_gs_threads = 36,
253   .max_wm_threads = 48,
254   .max_cs_threads = 36,
255   .urb = {
256      .min_entries = {
257         [MESA_SHADER_VERTEX]    = 32,
258         [MESA_SHADER_TESS_EVAL] = 10,
259      },
260      .max_entries = {
261         [MESA_SHADER_VERTEX]    = 512,
262         [MESA_SHADER_TESS_CTRL] = 32,
263         [MESA_SHADER_TESS_EVAL] = 288,
264         [MESA_SHADER_GEOMETRY]  = 192,
265      },
266   },
267   .simulator_id = 7,
268};
269
270static const struct intel_device_info intel_device_info_ivb_gt2 = {
271   GFX7_FEATURES, .platform = INTEL_PLATFORM_IVB, .gt = 2,
272   .num_slices = 1,
273   .num_subslices = { 1, },
274   .max_eus_per_subslice = 12,
275   .num_thread_per_eu = 8, /* Not sure why this isn't a multiple of
276                            * @max_wm_threads ... */
277   .l3_banks = 4,
278   .max_vs_threads = 128,
279   .max_tcs_threads = 128,
280   .max_tes_threads = 128,
281   .max_gs_threads = 128,
282   .max_wm_threads = 172,
283   .max_cs_threads = 64,
284   .urb = {
285      .min_entries = {
286         [MESA_SHADER_VERTEX]    = 32,
287         [MESA_SHADER_TESS_EVAL] = 10,
288      },
289      .max_entries = {
290         [MESA_SHADER_VERTEX]    = 704,
291         [MESA_SHADER_TESS_CTRL] = 64,
292         [MESA_SHADER_TESS_EVAL] = 448,
293         [MESA_SHADER_GEOMETRY]  = 320,
294      },
295   },
296   .simulator_id = 7,
297};
298
299static const struct intel_device_info intel_device_info_byt = {
300   GFX7_FEATURES, .platform = INTEL_PLATFORM_BYT, .gt = 1,
301   .num_slices = 1,
302   .num_subslices = { 1, },
303   .max_eus_per_subslice = 4,
304   .num_thread_per_eu = 8,
305   .l3_banks = 1,
306   .has_llc = false,
307   .max_vs_threads = 36,
308   .max_tcs_threads = 36,
309   .max_tes_threads = 36,
310   .max_gs_threads = 36,
311   .max_wm_threads = 48,
312   .max_cs_threads = 32,
313   .urb = {
314      .min_entries = {
315         [MESA_SHADER_VERTEX]    = 32,
316         [MESA_SHADER_TESS_EVAL] = 10,
317      },
318      .max_entries = {
319         [MESA_SHADER_VERTEX]    = 512,
320         [MESA_SHADER_TESS_CTRL] = 32,
321         [MESA_SHADER_TESS_EVAL] = 288,
322         [MESA_SHADER_GEOMETRY]  = 192,
323      },
324   },
325   .simulator_id = 10,
326};
327
/* Initializers shared by the HSW device descriptions: everything from
 * GFX7_FEATURES plus the HSW platform, verx10 75 and SIMD16 3-src support.
 */
#define HSW_FEATURES                                \
   GFX7_FEATURES,                                   \
   .verx10 = 75,                                    \
   .platform = INTEL_PLATFORM_HSW,                  \
   .supports_simd16_3src = true
333
334static const struct intel_device_info intel_device_info_hsw_gt1 = {
335   HSW_FEATURES, .gt = 1,
336   .num_slices = 1,
337   .num_subslices = { 1, },
338   .max_eus_per_subslice = 10,
339   .num_thread_per_eu = 7,
340   .l3_banks = 2,
341   .max_vs_threads = 70,
342   .max_tcs_threads = 70,
343   .max_tes_threads = 70,
344   .max_gs_threads = 70,
345   .max_wm_threads = 102,
346   .max_cs_threads = 70,
347   .urb = {
348      .min_entries = {
349         [MESA_SHADER_VERTEX]    = 32,
350         [MESA_SHADER_TESS_EVAL] = 10,
351      },
352      .max_entries = {
353         [MESA_SHADER_VERTEX]    = 640,
354         [MESA_SHADER_TESS_CTRL] = 64,
355         [MESA_SHADER_TESS_EVAL] = 384,
356         [MESA_SHADER_GEOMETRY]  = 256,
357      },
358   },
359   .simulator_id = 9,
360};
361
362static const struct intel_device_info intel_device_info_hsw_gt2 = {
363   HSW_FEATURES, .gt = 2,
364   .num_slices = 1,
365   .num_subslices = { 2, },
366   .max_eus_per_subslice = 10,
367   .num_thread_per_eu = 7,
368   .l3_banks = 4,
369   .max_vs_threads = 280,
370   .max_tcs_threads = 256,
371   .max_tes_threads = 280,
372   .max_gs_threads = 256,
373   .max_wm_threads = 204,
374   .max_cs_threads = 70,
375   .urb = {
376      .min_entries = {
377         [MESA_SHADER_VERTEX]    = 64,
378         [MESA_SHADER_TESS_EVAL] = 10,
379      },
380      .max_entries = {
381         [MESA_SHADER_VERTEX]    = 1664,
382         [MESA_SHADER_TESS_CTRL] = 128,
383         [MESA_SHADER_TESS_EVAL] = 960,
384         [MESA_SHADER_GEOMETRY]  = 640,
385      },
386   },
387   .simulator_id = 9,
388};
389
390static const struct intel_device_info intel_device_info_hsw_gt3 = {
391   HSW_FEATURES, .gt = 3,
392   .num_slices = 2,
393   .num_subslices = { 2, 2, },
394   .max_eus_per_subslice = 10,
395   .num_thread_per_eu = 7,
396   .l3_banks = 8,
397   .max_vs_threads = 280,
398   .max_tcs_threads = 256,
399   .max_tes_threads = 280,
400   .max_gs_threads = 256,
401   .max_wm_threads = 408,
402   .max_cs_threads = 70,
403   .urb = {
404      .min_entries = {
405         [MESA_SHADER_VERTEX]    = 64,
406         [MESA_SHADER_TESS_EVAL] = 10,
407      },
408      .max_entries = {
409         [MESA_SHADER_VERTEX]    = 1664,
410         [MESA_SHADER_TESS_CTRL] = 128,
411         [MESA_SHADER_TESS_EVAL] = 960,
412         [MESA_SHADER_GEOMETRY]  = 640,
413      },
414   },
415   .max_constant_urb_size_kb = 32,
416   .simulator_id = 9,
417};
418
/* Designated initializers shared by the ver 8 device descriptions; the
 * later-generation feature macros in this file also start from this list
 * and override individual fields.
 *
 * has_sample_with_hiz stays false here: it is unclear how well sampling
 * from the hiz buffer is supported on GFX8, so keep things conservative.
 */
#define GFX8_FEATURES                               \
   .ver = 8,                                        \
   .has_64bit_float = true,                         \
   .has_64bit_int = true,                           \
   .has_hiz_and_separate_stencil = true,            \
   .has_integer_dword_mul = true,                   \
   .has_llc = true,                                 \
   .has_pln = true,                                 \
   .has_sample_with_hiz = false,                    \
   .has_surface_tile_offset = true,                 \
   .must_use_separate_stencil = true,               \
   .supports_simd16_3src = true,                    \
   .num_thread_per_eu = 7,                          \
   .max_vs_threads = 504,                           \
   .max_tcs_threads = 504,                          \
   .max_tes_threads = 504,                          \
   .max_gs_threads = 504,                           \
   .max_wm_threads = 384,                           \
   .max_threads_per_psd = 64,                       \
   .cs_prefetch_size = 512,                         \
   .max_constant_urb_size_kb = 32,                  \
   .timestamp_frequency = 12500000
444
445static const struct intel_device_info intel_device_info_bdw_gt1 = {
446   GFX8_FEATURES, .gt = 1,
447   .platform = INTEL_PLATFORM_BDW,
448   .num_slices = 1,
449   .num_subslices = { 2, },
450   .max_eus_per_subslice = 6,
451   .l3_banks = 2,
452   .max_cs_threads = 42,
453   .urb = {
454      .min_entries = {
455         [MESA_SHADER_VERTEX]    = 64,
456         [MESA_SHADER_TESS_EVAL] = 34,
457      },
458      .max_entries = {
459         [MESA_SHADER_VERTEX]    = 2560,
460         [MESA_SHADER_TESS_CTRL] = 504,
461         [MESA_SHADER_TESS_EVAL] = 1536,
462         /* Reduced from 960, seems to be similar to the bug on Gfx9 GT1. */
463         [MESA_SHADER_GEOMETRY]  = 690,
464      },
465   },
466   .simulator_id = 11,
467};
468
469static const struct intel_device_info intel_device_info_bdw_gt2 = {
470   GFX8_FEATURES, .gt = 2,
471   .platform = INTEL_PLATFORM_BDW,
472   .num_slices = 1,
473   .num_subslices = { 3, },
474   .max_eus_per_subslice = 8,
475   .l3_banks = 4,
476   .max_cs_threads = 56,
477   .urb = {
478      .min_entries = {
479         [MESA_SHADER_VERTEX]    = 64,
480         [MESA_SHADER_TESS_EVAL] = 34,
481      },
482      .max_entries = {
483         [MESA_SHADER_VERTEX]    = 2560,
484         [MESA_SHADER_TESS_CTRL] = 504,
485         [MESA_SHADER_TESS_EVAL] = 1536,
486         [MESA_SHADER_GEOMETRY]  = 960,
487      },
488   },
489   .simulator_id = 11,
490};
491
492static const struct intel_device_info intel_device_info_bdw_gt3 = {
493   GFX8_FEATURES, .gt = 3,
494   .platform = INTEL_PLATFORM_BDW,
495   .num_slices = 2,
496   .num_subslices = { 3, 3, },
497   .max_eus_per_subslice = 8,
498   .l3_banks = 8,
499   .max_cs_threads = 56,
500   .urb = {
501      .min_entries = {
502         [MESA_SHADER_VERTEX]    = 64,
503         [MESA_SHADER_TESS_EVAL] = 34,
504      },
505      .max_entries = {
506         [MESA_SHADER_VERTEX]    = 2560,
507         [MESA_SHADER_TESS_CTRL] = 504,
508         [MESA_SHADER_TESS_EVAL] = 1536,
509         [MESA_SHADER_GEOMETRY]  = 960,
510      },
511   },
512   .simulator_id = 11,
513};
514
515static const struct intel_device_info intel_device_info_chv = {
516   GFX8_FEATURES, .platform = INTEL_PLATFORM_CHV, .gt = 1,
517   .has_llc = false,
518   .has_integer_dword_mul = false,
519   .num_slices = 1,
520   .num_subslices = { 2, },
521   .max_eus_per_subslice = 8,
522   .l3_banks = 2,
523   .max_vs_threads = 80,
524   .max_tcs_threads = 80,
525   .max_tes_threads = 80,
526   .max_gs_threads = 80,
527   .max_wm_threads = 128,
528   .max_cs_threads = 6 * 7,
529   .urb = {
530      .min_entries = {
531         [MESA_SHADER_VERTEX]    = 34,
532         [MESA_SHADER_TESS_EVAL] = 34,
533      },
534      .max_entries = {
535         [MESA_SHADER_VERTEX]    = 640,
536         [MESA_SHADER_TESS_CTRL] = 80,
537         [MESA_SHADER_TESS_EVAL] = 384,
538         [MESA_SHADER_GEOMETRY]  = 256,
539      },
540   },
541   .simulator_id = 13,
542};
543
/* Ver 9 hardware values: thread limits, timestamp frequency, CS prefetch
 * size and URB entry limits.  It is expanded after GFX8_FEATURES by the
 * GFX9 feature macros, so the values here take precedence.
 */
#define GFX9_HW_INFO                                \
   .ver = 9,                                        \
   .max_vs_threads = 336,                           \
   .max_tcs_threads = 336,                          \
   .max_tes_threads = 336,                          \
   .max_gs_threads = 336,                           \
   .max_cs_threads = 56,                            \
   .max_threads_per_psd = 64,                       \
   .cs_prefetch_size = 512,                         \
   .timestamp_frequency = 12000000,                 \
   .urb = {                                         \
      .max_entries = {                              \
         [MESA_SHADER_VERTEX]    = 1856,            \
         [MESA_SHADER_TESS_CTRL] = 672,             \
         [MESA_SHADER_TESS_EVAL] = 1120,            \
         [MESA_SHADER_GEOMETRY]  = 640,             \
      },                                            \
      .min_entries = {                              \
         [MESA_SHADER_VERTEX]    = 64,              \
         [MESA_SHADER_TESS_EVAL] = 34,              \
      },                                            \
   }
566
/* Initializers shared by the GFX9 LP device descriptions.  GFX8_FEATURES
 * and GFX9_HW_INFO are expanded first; the designators that follow replace
 * some of their values (flags, thread limits, timestamp frequency, URB).
 */
#define GFX9_LP_FEATURES                           \
   GFX8_FEATURES,                                  \
   GFX9_HW_INFO,                                   \
   .gt = 1,                                        \
   .num_slices = 1,                                \
   .has_integer_dword_mul = false,                 \
   .has_llc = false,                               \
   .has_sample_with_hiz = true,                    \
   .num_thread_per_eu = 6,                         \
   .max_vs_threads = 112,                          \
   .max_tcs_threads = 112,                         \
   .max_tes_threads = 112,                         \
   .max_gs_threads = 112,                          \
   .max_cs_threads = 6 * 6,                        \
   .timestamp_frequency = 19200000,                \
   .urb = {                                        \
      .max_entries = {                             \
         [MESA_SHADER_VERTEX]    = 704,            \
         [MESA_SHADER_TESS_CTRL] = 256,            \
         [MESA_SHADER_TESS_EVAL] = 416,            \
         [MESA_SHADER_GEOMETRY]  = 256,            \
      },                                           \
      .min_entries = {                             \
         [MESA_SHADER_VERTEX]    = 34,             \
         [MESA_SHADER_TESS_EVAL] = 34,             \
      },                                           \
   }
594
/* GFX9_LP_FEATURES with 3 subslices and up to 6 EUs per subslice. */
#define GFX9_LP_FEATURES_3X6                       \
   GFX9_LP_FEATURES,                               \
   .max_eus_per_subslice = 6,                      \
   .num_subslices = { 3, }
599
/* GFX9_LP_FEATURES with 2 subslices and up to 6 EUs per subslice.  The
 * designators after GFX9_LP_FEATURES replace its VS/TCS/TES/GS thread
 * limits and its URB entry limits.
 */
#define GFX9_LP_FEATURES_2X6                       \
   GFX9_LP_FEATURES,                               \
   .max_eus_per_subslice = 6,                      \
   .num_subslices = { 2, },                        \
   .max_vs_threads = 56,                           \
   .max_tcs_threads = 56,                          \
   .max_tes_threads = 56,                          \
   .max_gs_threads = 56,                           \
   .max_cs_threads = 6 * 6,                        \
   .urb = {                                        \
      .max_entries = {                             \
         [MESA_SHADER_VERTEX]    = 352,            \
         [MESA_SHADER_TESS_CTRL] = 128,            \
         [MESA_SHADER_TESS_EVAL] = 208,            \
         [MESA_SHADER_GEOMETRY]  = 128,            \
      },                                           \
      .min_entries = {                             \
         [MESA_SHADER_VERTEX]    = 34,             \
         [MESA_SHADER_TESS_EVAL] = 34,             \
      },                                           \
   }
621
/* Initializers shared by the non-LP GFX9 device descriptions:
 * GFX8_FEATURES, then GFX9_HW_INFO, then has_sample_with_hiz switched on.
 */
#define GFX9_FEATURES                               \
   GFX8_FEATURES,                                   \
   GFX9_HW_INFO,                                    \
   .has_sample_with_hiz = true
626
627static const struct intel_device_info intel_device_info_skl_gt1 = {
628   GFX9_FEATURES, .gt = 1,
629   .platform = INTEL_PLATFORM_SKL,
630   .num_slices = 1,
631   .num_subslices = { 2, },
632   .max_eus_per_subslice = 6,
633   .l3_banks = 2,
634   /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
635    * leading to some vertices to go missing if we use too much URB.
636    */
637   .urb.max_entries[MESA_SHADER_VERTEX] = 928,
638   .simulator_id = 12,
639};
640
641static const struct intel_device_info intel_device_info_skl_gt2 = {
642   GFX9_FEATURES, .gt = 2,
643   .platform = INTEL_PLATFORM_SKL,
644   .num_slices = 1,
645   .num_subslices = { 3, },
646   .max_eus_per_subslice = 8,
647   .l3_banks = 4,
648   .simulator_id = 12,
649};
650
651static const struct intel_device_info intel_device_info_skl_gt3 = {
652   GFX9_FEATURES, .gt = 3,
653   .platform = INTEL_PLATFORM_SKL,
654   .num_slices = 2,
655   .num_subslices = { 3, 3, },
656   .max_eus_per_subslice = 8,
657   .l3_banks = 8,
658   .simulator_id = 12,
659};
660
661static const struct intel_device_info intel_device_info_skl_gt4 = {
662   GFX9_FEATURES, .gt = 4,
663   .platform = INTEL_PLATFORM_SKL,
664   .num_slices = 3,
665   .num_subslices = { 3, 3, 3, },
666   .max_eus_per_subslice = 8,
667   .l3_banks = 12,
668   /* From the "L3 Allocation and Programming" documentation:
669    *
670    * "URB is limited to 1008KB due to programming restrictions.  This is not a
671    * restriction of the L3 implementation, but of the FF and other clients.
672    * Therefore, in a GT4 implementation it is possible for the programmed
673    * allocation of the L3 data array to provide 3*384KB=1152KB for URB, but
674    * only 1008KB of this will be used."
675    */
676   .simulator_id = 12,
677};
678
679static const struct intel_device_info intel_device_info_bxt = {
680   GFX9_LP_FEATURES_3X6,
681   .platform = INTEL_PLATFORM_BXT,
682   .l3_banks = 2,
683   .simulator_id = 14,
684};
685
686static const struct intel_device_info intel_device_info_bxt_2x6 = {
687   GFX9_LP_FEATURES_2X6,
688   .platform = INTEL_PLATFORM_BXT,
689   .l3_banks = 1,
690   .simulator_id = 14,
691};
/*
 * Note: for all KBL SKUs, the PRM says SKL for GS entries, not SKL+, and
 * there is no separate KBL entry, so the default SKL (GFX9) GS entries
 * value is used.
 */
696
697static const struct intel_device_info intel_device_info_kbl_gt1 = {
698   GFX9_FEATURES,
699   .platform = INTEL_PLATFORM_KBL,
700   .gt = 1,
701
702   .max_cs_threads = 7 * 6,
703   .num_slices = 1,
704   .num_subslices = { 2, },
705   .max_eus_per_subslice = 6,
706   .l3_banks = 2,
707   /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
708    * leading to some vertices to go missing if we use too much URB.
709    */
710   .urb.max_entries[MESA_SHADER_VERTEX] = 928,
711   .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
712   .simulator_id = 16,
713};
714
715static const struct intel_device_info intel_device_info_kbl_gt1_5 = {
716   GFX9_FEATURES,
717   .platform = INTEL_PLATFORM_KBL,
718   .gt = 1,
719
720   .max_cs_threads = 7 * 6,
721   .num_slices = 1,
722   .num_subslices = { 3, },
723   .max_eus_per_subslice = 6,
724   .l3_banks = 4,
725   .simulator_id = 16,
726};
727
728static const struct intel_device_info intel_device_info_kbl_gt2 = {
729   GFX9_FEATURES,
730   .platform = INTEL_PLATFORM_KBL,
731   .gt = 2,
732
733   .num_slices = 1,
734   .num_subslices = { 3, },
735   .max_eus_per_subslice = 8,
736   .l3_banks = 4,
737   .simulator_id = 16,
738};
739
740static const struct intel_device_info intel_device_info_kbl_gt3 = {
741   GFX9_FEATURES,
742   .platform = INTEL_PLATFORM_KBL,
743   .gt = 3,
744
745   .num_slices = 2,
746   .num_subslices = { 3, 3, },
747   .max_eus_per_subslice = 8,
748   .l3_banks = 8,
749   .simulator_id = 16,
750};
751
752static const struct intel_device_info intel_device_info_kbl_gt4 = {
753   GFX9_FEATURES,
754   .platform = INTEL_PLATFORM_KBL,
755   .gt = 4,
756
757   /*
758    * From the "L3 Allocation and Programming" documentation:
759    *
760    * "URB is limited to 1008KB due to programming restrictions.  This
761    *  is not a restriction of the L3 implementation, but of the FF and
762    *  other clients.  Therefore, in a GT4 implementation it is
763    *  possible for the programmed allocation of the L3 data array to
764    *  provide 3*384KB=1152KB for URB, but only 1008KB of this
765    *  will be used."
766    */
767   .num_slices = 3,
768   .num_subslices = { 3, 3, 3, },
769   .max_eus_per_subslice = 8,
770   .l3_banks = 12,
771   .simulator_id = 16,
772};
773
774static const struct intel_device_info intel_device_info_glk = {
775   GFX9_LP_FEATURES_3X6,
776   .platform = INTEL_PLATFORM_GLK,
777   .l3_banks = 2,
778   .simulator_id = 17,
779};
780
781static const struct intel_device_info intel_device_info_glk_2x6 = {
782   GFX9_LP_FEATURES_2X6,
783   .platform = INTEL_PLATFORM_GLK,
784   .l3_banks = 2,
785   .simulator_id = 17,
786};
787
788static const struct intel_device_info intel_device_info_cfl_gt1 = {
789   GFX9_FEATURES,
790   .platform = INTEL_PLATFORM_CFL,
791   .gt = 1,
792
793   .num_slices = 1,
794   .num_subslices = { 2, },
795   .max_eus_per_subslice = 6,
796   .l3_banks = 2,
797   /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
798    * leading to some vertices to go missing if we use too much URB.
799    */
800   .urb.max_entries[MESA_SHADER_VERTEX] = 928,
801   .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
802   .simulator_id = 24,
803};
804static const struct intel_device_info intel_device_info_cfl_gt2 = {
805   GFX9_FEATURES,
806   .platform = INTEL_PLATFORM_CFL,
807   .gt = 2,
808
809   .num_slices = 1,
810   .num_subslices = { 3, },
811   .max_eus_per_subslice = 8,
812   .l3_banks = 4,
813   .simulator_id = 24,
814};
815
816static const struct intel_device_info intel_device_info_cfl_gt3 = {
817   GFX9_FEATURES,
818   .platform = INTEL_PLATFORM_CFL,
819   .gt = 3,
820
821   .num_slices = 2,
822   .num_subslices = { 3, 3, },
823   .max_eus_per_subslice = 8,
824   .l3_banks = 8,
825   .simulator_id = 24,
826};
827
/* Wraps its arguments in a brace-enclosed initializer list, e.g.
 * subslices(4) expands to { 4, }.  Written with the standard C99 variadic
 * form (__VA_ARGS__) rather than the GNU named-variadic extension; at least
 * one argument is required.
 */
#define subslices(...) { __VA_ARGS__, }
829
/* Ver 11 hardware values: thread limits and CS prefetch size, with has_pln
 * cleared.  GFX11_FEATURES expands it after GFX8_FEATURES, so the values
 * here take precedence.
 */
#define GFX11_HW_INFO                               \
   .ver = 11,                                       \
   .has_pln = false,                                \
   .max_vs_threads = 364,                           \
   .max_tcs_threads = 224,                          \
   .max_tes_threads = 364,                          \
   .max_gs_threads = 224,                           \
   .max_cs_threads = 56,                            \
   .max_threads_per_psd = 64,                       \
   .cs_prefetch_size = 512
840
/* Initializers shared by the ver 11 device descriptions.
 *
 * _gt, _slices, _l3 and _platform are stored in .gt, .num_slices,
 * .l3_banks and .platform; _subslices must be a brace-enclosed initializer
 * for .num_subslices (see subslices()).  GFX8_FEATURES and GFX11_HW_INFO
 * are expanded first and the designators that follow replace some of their
 * flags.
 */
#define GFX11_FEATURES(_gt, _slices, _subslices, _l3, _platform)  \
   GFX8_FEATURES,                                     \
   GFX11_HW_INFO,                                     \
   .platform = _platform,                             \
   .gt = _gt,                                         \
   .num_slices = _slices,                             \
   .num_subslices = _subslices,                       \
   .max_eus_per_subslice = 8,                         \
   .l3_banks = _l3,                                   \
   .has_64bit_float = false,                          \
   .has_64bit_int = false,                            \
   .has_integer_dword_mul = false,                    \
   .has_sample_with_hiz = false
852
/* min_entries / max_entries initializers for ver 11; meant to be expanded
 * inside a `.urb = { ... }` initializer.
 */
#define GFX11_URB_MIN_MAX_ENTRIES                     \
   .max_entries = {                                   \
      [MESA_SHADER_VERTEX]    = 2384,                 \
      [MESA_SHADER_TESS_CTRL] = 1032,                 \
      [MESA_SHADER_TESS_EVAL] = 2384,                 \
      [MESA_SHADER_GEOMETRY]  = 1032,                 \
   },                                                 \
   .min_entries = {                                   \
      [MESA_SHADER_VERTEX]    = 64,                   \
      [MESA_SHADER_TESS_EVAL] = 34,                   \
   }
864
865static const struct intel_device_info intel_device_info_icl_gt2 = {
866   GFX11_FEATURES(2, 1, subslices(8), 8, INTEL_PLATFORM_ICL),
867   .urb = {
868      GFX11_URB_MIN_MAX_ENTRIES,
869   },
870   .simulator_id = 19,
871};
872
873static const struct intel_device_info intel_device_info_icl_gt1_5 = {
874   GFX11_FEATURES(1, 1, subslices(6), 6, INTEL_PLATFORM_ICL),
875   .urb = {
876      GFX11_URB_MIN_MAX_ENTRIES,
877   },
878   .simulator_id = 19,
879};
880
881static const struct intel_device_info intel_device_info_icl_gt1 = {
882   GFX11_FEATURES(1, 1, subslices(4), 6, INTEL_PLATFORM_ICL),
883   .urb = {
884      GFX11_URB_MIN_MAX_ENTRIES,
885   },
886   .simulator_id = 19,
887};
888
889static const struct intel_device_info intel_device_info_icl_gt0_5 = {
890   GFX11_FEATURES(1, 1, subslices(1), 6, INTEL_PLATFORM_ICL),
891   .urb = {
892      GFX11_URB_MIN_MAX_ENTRIES,
893   },
894   .simulator_id = 19,
895};
896
/* Extra initializers shared by the EHL device descriptions: simulator id 28,
 * disable_ccs_repack set, and the ver 11 URB entry limits.
 */
#define GFX11_LP_FEATURES                           \
   .simulator_id = 28,                              \
   .disable_ccs_repack = true,                      \
   .urb = {                                         \
      GFX11_URB_MIN_MAX_ENTRIES,                    \
   }
903
904static const struct intel_device_info intel_device_info_ehl_4x8 = {
905   GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
906   GFX11_LP_FEATURES,
907};
908
909static const struct intel_device_info intel_device_info_ehl_4x6 = {
910   GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
911   GFX11_LP_FEATURES,
912   .max_eus_per_subslice = 6,
913};
914
915static const struct intel_device_info intel_device_info_ehl_4x5 = {
916   GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
917   GFX11_LP_FEATURES,
918   .max_eus_per_subslice = 5,
919};
920
921static const struct intel_device_info intel_device_info_ehl_4x4 = {
922   GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
923   GFX11_LP_FEATURES,
924   .max_eus_per_subslice = 4,
925};
926
927static const struct intel_device_info intel_device_info_ehl_2x8 = {
928   GFX11_FEATURES(1, 1, subslices(2), 4, INTEL_PLATFORM_EHL),
929   GFX11_LP_FEATURES,
930};
931
932static const struct intel_device_info intel_device_info_ehl_2x4 = {
933   GFX11_FEATURES(1, 1, subslices(2), 4, INTEL_PLATFORM_EHL),
934   GFX11_LP_FEATURES,
935   .max_eus_per_subslice = 4,
936};
937
/* min_entries / max_entries initializers for ver 12; meant to be expanded
 * inside a `.urb = { ... }` initializer.  The GS maximum is tagged with
 * workaround Wa_14013840143.
 */
#define GFX12_URB_MIN_MAX_ENTRIES                   \
   .max_entries = {                                 \
      [MESA_SHADER_VERTEX]    = 3576,               \
      [MESA_SHADER_TESS_CTRL] = 1548,               \
      [MESA_SHADER_TESS_EVAL] = 3576,               \
      /* Wa_14013840143 */                          \
      [MESA_SHADER_GEOMETRY]  = 1536,               \
   },                                               \
   .min_entries = {                                 \
      [MESA_SHADER_VERTEX]    = 64,                 \
      [MESA_SHADER_TESS_EVAL] = 34,                 \
   }
950
/* Ver 12 hardware values: feature flags, thread limits and URB entry
 * limits.  GFX12_FEATURES expands it after GFX8_FEATURES, so the values
 * here take precedence.
 */
#define GFX12_HW_INFO                               \
   .ver = 12,                                       \
   .has_aux_map = true,                             \
   .has_pln = false,                                \
   .has_sample_with_hiz = false,                    \
   .max_vs_threads = 546,                           \
   .max_tcs_threads = 336,                          \
   .max_tes_threads = 546,                          \
   .max_gs_threads = 336,                           \
   .max_cs_threads = 112, /* threads per DSS */     \
   .max_threads_per_psd = 64,                       \
   .urb = {                                         \
      GFX12_URB_MIN_MAX_ENTRIES,                    \
   }
965
/* Base initializer list for Gfx12 device descriptions.
 *
 * Starts from GFX8_FEATURES and GFX12_HW_INFO and then sets further fields.
 * Because these are designated initializers, a later entry for a field takes
 * precedence over an earlier one, so users of this macro can override any
 * value by listing the field again after it.
 *
 *   _gt     -> .gt
 *   _slices -> .num_slices
 *   _l3     -> .l3_banks
 */
#define GFX12_FEATURES(_gt, _slices, _l3)                       \
   GFX8_FEATURES,                                               \
   GFX12_HW_INFO,                                               \
   .has_64bit_float = false,                                    \
   .has_64bit_int = false,                                      \
   .has_integer_dword_mul = false,                              \
   .gt = _gt, .num_slices = _slices, .l3_banks = _l3,           \
   .simulator_id = 22,                                          \
   .max_eus_per_subslice = 16,                                   \
   .cs_prefetch_size = 512
976
/* Wraps its arguments in braces to form an initializer for .num_subslices
 * (uses the GNU named-variadic macro syntax). The name indicates that the
 * counts given on Gfx12 are dual subslices.
 */
#define dual_subslices(args...) { args, }

/* "GT0.5" configuration: gt = 1, one slice, 4 L3 banks, 1 dual subslice. */
#define GFX12_GT05_FEATURES                                     \
   GFX12_FEATURES(1, 1, 4),                                     \
   .num_subslices = dual_subslices(1)

/* GT1/GT2 configuration selected by _gt, always with one slice:
 *   _gt == 1  -> 4 L3 banks, 2 dual subslices
 *   otherwise -> 8 L3 banks, 6 dual subslices
 */
#define GFX12_GT_FEATURES(_gt)                                  \
   GFX12_FEATURES(_gt, 1, _gt == 1 ? 4 : 8),                    \
   .num_subslices = dual_subslices(_gt == 1 ? 2 : 6)
986
/* Per-SKU Gfx12 descriptions.
 *
 * Each entry picks one of the GT-level macros above and sets .platform.
 * The ADL and RPL entries additionally set .display_ver to 13; the TGL and
 * RKL entries leave .display_ver unset here.
 */
static const struct intel_device_info intel_device_info_tgl_gt1 = {
   GFX12_GT_FEATURES(1),
   .platform = INTEL_PLATFORM_TGL,
};

static const struct intel_device_info intel_device_info_tgl_gt2 = {
   GFX12_GT_FEATURES(2),
   .platform = INTEL_PLATFORM_TGL,
};

static const struct intel_device_info intel_device_info_rkl_gt05 = {
   GFX12_GT05_FEATURES,
   .platform = INTEL_PLATFORM_RKL,
};

static const struct intel_device_info intel_device_info_rkl_gt1 = {
   GFX12_GT_FEATURES(1),
   .platform = INTEL_PLATFORM_RKL,
};

static const struct intel_device_info intel_device_info_adl_gt05 = {
   GFX12_GT05_FEATURES,
   .platform = INTEL_PLATFORM_ADL,
   .display_ver = 13,
};

static const struct intel_device_info intel_device_info_adl_gt1 = {
   GFX12_GT_FEATURES(1),
   .platform = INTEL_PLATFORM_ADL,
   .display_ver = 13,
};

/* Same initializer values as intel_device_info_adl_gt1. */
static const struct intel_device_info intel_device_info_adl_n = {
   GFX12_GT_FEATURES(1),
   .platform = INTEL_PLATFORM_ADL,
   .display_ver = 13,
};

static const struct intel_device_info intel_device_info_adl_gt2 = {
   GFX12_GT_FEATURES(2),
   .platform = INTEL_PLATFORM_ADL,
   .display_ver = 13,
};

/* Spelled out with GFX12_FEATURES directly; the arguments and the dual
 * subslice count are the same ones GFX12_GT_FEATURES(1) expands to.
 */
static const struct intel_device_info intel_device_info_rpl = {
   GFX12_FEATURES(1, 1, 4),
   .num_subslices = dual_subslices(2),
   .platform = INTEL_PLATFORM_RPL,
   .display_ver = 13,
};

static const struct intel_device_info intel_device_info_rpl_p = {
   GFX12_GT_FEATURES(2),
   .platform = INTEL_PLATFORM_RPL,
   .display_ver = 13,
};
1043
/* Description shared by DG1 and SG1: the GT2 configuration with
 * .platform set to INTEL_PLATFORM_DG1, no LLC, local memory present, an
 * explicit URB size, and .simulator_id 30 replacing the 22 set by
 * GFX12_FEATURES.
 */
#define GFX12_DG1_SG1_FEATURES                  \
   GFX12_GT_FEATURES(2),                        \
   .platform = INTEL_PLATFORM_DG1,              \
   .has_llc = false,                            \
   .has_local_mem = true,                       \
   .urb.size = 768,                             \
   .simulator_id = 30

static const struct intel_device_info intel_device_info_dg1 = {
   GFX12_DG1_SG1_FEATURES,
};

/* SG1 uses exactly the DG1 description, including the DG1 platform enum. */
static const struct intel_device_info intel_device_info_sg1 = {
   GFX12_DG1_SG1_FEATURES,
};
1059
/* Base initializer list for Xe-HP (verx10 = 125) descriptions.
 *
 * Builds on GFX12_FEATURES(_gt, _slices, _l3) and then overrides some of its
 * values: .has_aux_map becomes false (GFX12_HW_INFO sets true),
 * .simulator_id becomes 29 (was 22) and .cs_prefetch_size becomes 1024
 * (was 512). It also sets .verx10, the thread count per EU and the
 * LLC/local-memory flags.
 */
#define XEHP_FEATURES(_gt, _slices, _l3)                        \
   GFX12_FEATURES(_gt, _slices, _l3),                           \
   .num_thread_per_eu = 8 /* BSpec 44472 */,                    \
   .verx10 = 125,                                               \
   .has_llc = false,                                            \
   .has_local_mem = true,                                       \
   .has_aux_map = false,                                        \
   .simulator_id = 29,                                          \
   .cs_prefetch_size = 1024
1069
/* Description shared by all DG2 variants; the per-variant tables below only
 * add .platform.
 *
 * XEHP_FEATURES is invoked with gt = 0, one slice and 0 L3 banks. The L3
 * bank count for verx10 >= 125 is recomputed from the subslice total by
 * update_l3_banks() once a topology is applied.
 */
#define DG2_FEATURES                                            \
   /* (Sub)slice info comes from the kernel topology info */    \
   XEHP_FEATURES(0, 1, 0),                                      \
   .display_ver = 13,                                           \
   .revision = 4, /* For offline compiler */                    \
   .num_subslices = dual_subslices(1),                          \
   .has_lsc = true,                                             \
   .apply_hwconfig = true,                                      \
   .has_coarse_pixel_primitive_and_cb = true,                   \
   .has_mesh_shading = true

static const struct intel_device_info intel_device_info_dg2_g10 = {
   DG2_FEATURES,
   .platform = INTEL_PLATFORM_DG2_G10,
};

static const struct intel_device_info intel_device_info_dg2_g11 = {
   DG2_FEATURES,
   .platform = INTEL_PLATFORM_DG2_G11,
};

static const struct intel_device_info intel_device_info_dg2_g12 = {
   DG2_FEATURES,
   .platform = INTEL_PLATFORM_DG2_G12,
};
1095
1096static void
1097reset_masks(struct intel_device_info *devinfo)
1098{
1099   devinfo->subslice_slice_stride = 0;
1100   devinfo->eu_subslice_stride = 0;
1101   devinfo->eu_slice_stride = 0;
1102
1103   devinfo->num_slices = 0;
1104   memset(devinfo->num_subslices, 0, sizeof(devinfo->num_subslices));
1105
1106   memset(&devinfo->slice_masks, 0, sizeof(devinfo->slice_masks));
1107   memset(devinfo->subslice_masks, 0, sizeof(devinfo->subslice_masks));
1108   memset(devinfo->eu_masks, 0, sizeof(devinfo->eu_masks));
1109   memset(devinfo->ppipe_subslices, 0, sizeof(devinfo->ppipe_subslices));
1110}
1111
1112static void
1113update_slice_subslice_counts(struct intel_device_info *devinfo)
1114{
1115   devinfo->num_slices = __builtin_popcount(devinfo->slice_masks);
1116   devinfo->subslice_total = 0;
1117   for (int s = 0; s < devinfo->max_slices; s++) {
1118      if (!intel_device_info_slice_available(devinfo, s))
1119         continue;
1120
1121      for (int b = 0; b < devinfo->subslice_slice_stride; b++) {
1122         devinfo->num_subslices[s] +=
1123            __builtin_popcount(devinfo->subslice_masks[s * devinfo->subslice_slice_stride + b]);
1124      }
1125      devinfo->subslice_total += devinfo->num_subslices[s];
1126   }
1127   assert(devinfo->num_slices > 0);
1128   assert(devinfo->subslice_total > 0);
1129}
1130
1131static void
1132update_pixel_pipes(struct intel_device_info *devinfo, uint8_t *subslice_masks)
1133{
1134   if (devinfo->ver < 11)
1135      return;
1136
1137   /* The kernel only reports one slice on all existing ICL+ platforms, even
1138    * if multiple slices are present. The slice mask is allowed to have the
1139    * accurate value greater than 1 on gfx12.5+ platforms though, in order to
1140    * be tolerant with the behavior of our simulation environment.
1141    */
1142   assert(devinfo->slice_masks == 1 || devinfo->verx10 >= 125);
1143
1144   /* Count the number of subslices on each pixel pipe. Assume that every
1145    * contiguous group of 4 subslices in the mask belong to the same pixel
1146    * pipe. However note that on TGL+ the kernel returns a mask of enabled
1147    * *dual* subslices instead of actual subslices somewhat confusingly, so
1148    * each pixel pipe only takes 2 bits in the mask even though it's still 4
1149    * subslices.
1150    */
1151   const unsigned ppipe_bits = devinfo->ver >= 12 ? 2 : 4;
1152   for (unsigned p = 0; p < INTEL_DEVICE_MAX_PIXEL_PIPES; p++) {
1153      const unsigned offset = p * ppipe_bits;
1154      const unsigned subslice_idx = offset /
1155         devinfo->max_subslices_per_slice * devinfo->subslice_slice_stride;
1156      const unsigned ppipe_mask =
1157         BITFIELD_RANGE(offset % devinfo->max_subslices_per_slice, ppipe_bits);
1158
1159      if (subslice_idx < ARRAY_SIZE(devinfo->subslice_masks))
1160         devinfo->ppipe_subslices[p] =
1161            __builtin_popcount(subslice_masks[subslice_idx] & ppipe_mask);
1162      else
1163         devinfo->ppipe_subslices[p] = 0;
1164   }
1165}
1166
1167static void
1168update_l3_banks(struct intel_device_info *devinfo)
1169{
1170   if (devinfo->ver != 12)
1171      return;
1172
1173   if (devinfo->verx10 >= 125) {
1174      if (devinfo->subslice_total > 16) {
1175         assert(devinfo->subslice_total <= 32);
1176         devinfo->l3_banks = 32;
1177      } else if (devinfo->subslice_total > 8) {
1178         devinfo->l3_banks = 16;
1179      } else {
1180         devinfo->l3_banks = 8;
1181      }
1182   } else {
1183      assert(devinfo->num_slices == 1);
1184      if (devinfo->subslice_total >= 6) {
1185         assert(devinfo->subslice_total == 6);
1186         devinfo->l3_banks = 8;
1187      } else if (devinfo->subslice_total > 2) {
1188         devinfo->l3_banks = 6;
1189      } else {
1190         devinfo->l3_banks = 4;
1191      }
1192   }
1193}
1194
1195/* At some point in time, some people decided to redefine what topology means,
1196 * from useful HW related information (slice, subslice, etc...), to much less
1197 * useful generic stuff that no one cares about (a single slice with lots of
1198 * subslices). Of course all of this was done without asking the people who
1199 * defined the topology query in the first place, to solve a lack of
1200 * information Gfx10+. This function is here to workaround the fact it's not
1201 * possible to change people's mind even before this stuff goes upstream. Sad
1202 * times...
1203 */
1204static void
1205update_from_single_slice_topology(struct intel_device_info *devinfo,
1206                                  const struct drm_i915_query_topology_info *topology,
1207                                  const struct drm_i915_query_topology_info *geom_topology)
1208{
1209   /* An array of bit masks of the subslices available for 3D
1210    * workloads, analogous to intel_device_info::subslice_masks.  This
1211    * may differ from the set of enabled subslices on XeHP+ platforms
1212    * with compute-only subslices.
1213    */
1214   uint8_t geom_subslice_masks[ARRAY_SIZE(devinfo->subslice_masks)] = { 0 };
1215
1216   assert(devinfo->verx10 >= 125);
1217
1218   reset_masks(devinfo);
1219
1220   assert(topology->max_slices == 1);
1221   assert(topology->max_subslices > 0);
1222   assert(topology->max_eus_per_subslice > 0);
1223
1224   /* i915 gives us only one slice so we have to rebuild that out of groups of
1225    * 4 dualsubslices.
1226    */
1227   devinfo->max_subslices_per_slice = 4;
1228   devinfo->max_eus_per_subslice = 16;
1229   devinfo->subslice_slice_stride = 1;
1230   devinfo->eu_slice_stride = DIV_ROUND_UP(16 * 4, 8);
1231   devinfo->eu_subslice_stride = DIV_ROUND_UP(16, 8);
1232
1233   for (uint32_t ss_idx = 0; ss_idx < topology->max_subslices; ss_idx++) {
1234      const uint32_t s = ss_idx / 4;
1235      const uint32_t ss = ss_idx % 4;
1236
1237      /* Determine whether ss_idx is enabled (ss_idx_available) and
1238       * available for 3D workloads (geom_ss_idx_available), which may
1239       * differ on XeHP+ if ss_idx is a compute-only DSS.
1240       */
1241      const bool ss_idx_available =
1242         (topology->data[topology->subslice_offset + ss_idx / 8] >>
1243          (ss_idx % 8)) & 1;
1244      const bool geom_ss_idx_available =
1245         (geom_topology->data[geom_topology->subslice_offset + ss_idx / 8] >>
1246          (ss_idx % 8)) & 1;
1247
1248      if (geom_ss_idx_available) {
1249         assert(ss_idx_available);
1250         geom_subslice_masks[s * devinfo->subslice_slice_stride +
1251                             ss / 8] |= 1u << (ss % 8);
1252      }
1253
1254      if (!ss_idx_available)
1255         continue;
1256
1257      devinfo->max_slices = MAX2(devinfo->max_slices, s + 1);
1258      devinfo->slice_masks |= 1u << s;
1259
1260      devinfo->subslice_masks[s * devinfo->subslice_slice_stride +
1261                              ss / 8] |= 1u << (ss % 8);
1262
1263      for (uint32_t eu = 0; eu < devinfo->max_eus_per_subslice; eu++) {
1264         const bool eu_available =
1265            (topology->data[topology->eu_offset +
1266                            ss_idx * topology->eu_stride +
1267                            eu / 8] >> (eu % 8)) & 1;
1268
1269         if (!eu_available)
1270            continue;
1271
1272         devinfo->eu_masks[s * devinfo->eu_slice_stride +
1273                           ss * devinfo->eu_subslice_stride +
1274                           eu / 8] |= 1u << (eu % 8);
1275      }
1276   }
1277
1278   update_slice_subslice_counts(devinfo);
1279   update_pixel_pipes(devinfo, geom_subslice_masks);
1280   update_l3_banks(devinfo);
1281}
1282
1283static void
1284update_from_topology(struct intel_device_info *devinfo,
1285                     const struct drm_i915_query_topology_info *topology)
1286{
1287   reset_masks(devinfo);
1288
1289   assert(topology->max_slices > 0);
1290   assert(topology->max_subslices > 0);
1291   assert(topology->max_eus_per_subslice > 0);
1292
1293   devinfo->subslice_slice_stride = topology->subslice_stride;
1294
1295   devinfo->eu_subslice_stride = DIV_ROUND_UP(topology->max_eus_per_subslice, 8);
1296   devinfo->eu_slice_stride = topology->max_subslices * devinfo->eu_subslice_stride;
1297
1298   assert(sizeof(devinfo->slice_masks) >= DIV_ROUND_UP(topology->max_slices, 8));
1299   memcpy(&devinfo->slice_masks, topology->data, DIV_ROUND_UP(topology->max_slices, 8));
1300   devinfo->max_slices = topology->max_slices;
1301   devinfo->max_subslices_per_slice = topology->max_subslices;
1302   devinfo->max_eus_per_subslice = topology->max_eus_per_subslice;
1303
1304   uint32_t subslice_mask_len =
1305      topology->max_slices * topology->subslice_stride;
1306   assert(sizeof(devinfo->subslice_masks) >= subslice_mask_len);
1307   memcpy(devinfo->subslice_masks, &topology->data[topology->subslice_offset],
1308          subslice_mask_len);
1309
1310   uint32_t eu_mask_len =
1311      topology->eu_stride * topology->max_subslices * topology->max_slices;
1312   assert(sizeof(devinfo->eu_masks) >= eu_mask_len);
1313   memcpy(devinfo->eu_masks, &topology->data[topology->eu_offset], eu_mask_len);
1314
1315   /* Now that all the masks are in place, update the counts. */
1316   update_slice_subslice_counts(devinfo);
1317   update_pixel_pipes(devinfo, devinfo->subslice_masks);
1318   update_l3_banks(devinfo);
1319}
1320
1321/* Generate detailed mask from the I915_PARAM_SLICE_MASK,
1322 * I915_PARAM_SUBSLICE_MASK & I915_PARAM_EU_TOTAL getparam.
1323 */
1324static bool
1325update_from_masks(struct intel_device_info *devinfo, uint32_t slice_mask,
1326                  uint32_t subslice_mask, uint32_t n_eus)
1327{
1328   struct drm_i915_query_topology_info *topology;
1329
1330   assert((slice_mask & 0xff) == slice_mask);
1331
1332   size_t data_length = 100;
1333
1334   topology = calloc(1, sizeof(*topology) + data_length);
1335   if (!topology)
1336      return false;
1337
1338   topology->max_slices = util_last_bit(slice_mask);
1339   topology->max_subslices = util_last_bit(subslice_mask);
1340
1341   topology->subslice_offset = DIV_ROUND_UP(topology->max_slices, 8);
1342   topology->subslice_stride = DIV_ROUND_UP(topology->max_subslices, 8);
1343
1344   uint32_t n_subslices = __builtin_popcount(slice_mask) *
1345      __builtin_popcount(subslice_mask);
1346   uint32_t max_eus_per_subslice = DIV_ROUND_UP(n_eus, n_subslices);
1347   uint32_t eu_mask = (1U << max_eus_per_subslice) - 1;
1348
1349   topology->max_eus_per_subslice = max_eus_per_subslice;
1350   topology->eu_offset = topology->subslice_offset +
1351      topology->max_slices * DIV_ROUND_UP(topology->max_subslices, 8);
1352   topology->eu_stride = DIV_ROUND_UP(max_eus_per_subslice, 8);
1353
1354   /* Set slice mask in topology */
1355   for (int b = 0; b < topology->subslice_offset; b++)
1356      topology->data[b] = (slice_mask >> (b * 8)) & 0xff;
1357
1358   for (int s = 0; s < topology->max_slices; s++) {
1359
1360      /* Set subslice mask in topology */
1361      for (int b = 0; b < topology->subslice_stride; b++) {
1362         int subslice_offset = topology->subslice_offset +
1363            s * topology->subslice_stride + b;
1364
1365         topology->data[subslice_offset] = (subslice_mask >> (b * 8)) & 0xff;
1366      }
1367
1368      /* Set eu mask in topology */
1369      for (int ss = 0; ss < topology->max_subslices; ss++) {
1370         for (int b = 0; b < topology->eu_stride; b++) {
1371            int eu_offset = topology->eu_offset +
1372               (s * topology->max_subslices + ss) * topology->eu_stride + b;
1373
1374            topology->data[eu_offset] = (eu_mask >> (b * 8)) & 0xff;
1375         }
1376      }
1377   }
1378
1379   update_from_topology(devinfo, topology);
1380   free(topology);
1381
1382   return true;
1383}
1384
1385/* Generate mask from the device data. */
1386static void
1387fill_masks(struct intel_device_info *devinfo)
1388{
1389   /* All of our internal device descriptions assign the same number of
1390    * subslices for each slice. Just verify that this is true.
1391    */
1392   for (int s = 1; s < devinfo->num_slices; s++)
1393      assert(devinfo->num_subslices[0] == devinfo->num_subslices[s]);
1394
1395   update_from_masks(devinfo,
1396                     (1U << devinfo->num_slices) - 1,
1397                     (1U << devinfo->num_subslices[0]) - 1,
1398                     devinfo->num_slices * devinfo->num_subslices[0] *
1399                     devinfo->max_eus_per_subslice);
1400}
1401
1402static bool
1403getparam(int fd, uint32_t param, int *value)
1404{
1405   int tmp;
1406
1407   struct drm_i915_getparam gp = {
1408      .param = param,
1409      .value = &tmp,
1410   };
1411
1412   int ret = intel_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
1413   if (ret != 0)
1414      return false;
1415
1416   *value = tmp;
1417   return true;
1418}
1419
1420static bool
1421get_context_param(int fd, uint32_t context, uint32_t param, uint64_t *value)
1422{
1423   struct drm_i915_gem_context_param gp = {
1424      .ctx_id = context,
1425      .param = param,
1426   };
1427
1428   int ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &gp);
1429   if (ret != 0)
1430      return false;
1431
1432   *value = gp.value;
1433   return true;
1434}
1435
1436static void
1437update_cs_workgroup_threads(struct intel_device_info *devinfo)
1438{
1439   /* GPGPU_WALKER::ThreadWidthCounterMaximum is U6-1 so the most threads we
1440    * can program is 64 without going up to a rectangular group. This only
1441    * impacts Haswell and TGL which have higher thread counts.
1442    *
1443    * INTERFACE_DESCRIPTOR_DATA::NumberofThreadsinGPGPUThreadGroup on Xe-HP+
1444    * is 10 bits so we have no such restrictions.
1445    */
1446   devinfo->max_cs_workgroup_threads =
1447      devinfo->verx10 >= 125 ? devinfo->max_cs_threads :
1448                               MIN2(devinfo->max_cs_threads, 64);
1449}
1450
1451bool
1452intel_get_device_info_from_pci_id(int pci_id,
1453                                  struct intel_device_info *devinfo)
1454{
1455   switch (pci_id) {
1456#undef CHIPSET
1457#define CHIPSET(id, family, fam_str, name) \
1458      case id: *devinfo = intel_device_info_##family; break;
1459#include "pci_ids/crocus_pci_ids.h"
1460#include "pci_ids/iris_pci_ids.h"
1461
1462#undef CHIPSET
1463#define CHIPSET(id, fam_str, name) \
1464      case id: *devinfo = intel_device_info_gfx3; break;
1465#include "pci_ids/i915_pci_ids.h"
1466
1467   default:
1468      mesa_logw("Driver does not support the 0x%x PCI ID.", pci_id);
1469      return false;
1470   }
1471
1472   switch (pci_id) {
1473#undef CHIPSET
1474#define CHIPSET(_id, _family, _fam_str, _name) \
1475   case _id: \
1476      /* sizeof(str_literal) includes the null */ \
1477      STATIC_ASSERT(sizeof(_name) + sizeof(_fam_str) + 2 <= \
1478                    sizeof(devinfo->name)); \
1479      strncpy(devinfo->name, _name " (" _fam_str ")", sizeof(devinfo->name)); \
1480      break;
1481#include "pci_ids/crocus_pci_ids.h"
1482#include "pci_ids/iris_pci_ids.h"
1483   default:
1484      strncpy(devinfo->name, "Intel Unknown", sizeof(devinfo->name));
1485   }
1486
1487   fill_masks(devinfo);
1488
1489   /* From the Skylake PRM, 3DSTATE_PS::Scratch Space Base Pointer:
1490    *
1491    * "Scratch Space per slice is computed based on 4 sub-slices.  SW must
1492    *  allocate scratch space enough so that each slice has 4 slices allowed."
1493    *
1494    * The equivalent internal documentation says that this programming note
1495    * applies to all Gfx9+ platforms.
1496    *
1497    * The hardware typically calculates the scratch space pointer by taking
1498    * the base address, and adding per-thread-scratch-space * thread ID.
1499    * Extra padding can be necessary depending how the thread IDs are
1500    * calculated for a particular shader stage.
1501    */
1502
1503   switch(devinfo->ver) {
1504   case 9:
1505      devinfo->max_wm_threads = 64 /* threads-per-PSD */
1506                              * devinfo->num_slices
1507                              * 4; /* effective subslices per slice */
1508      break;
1509   case 11:
1510   case 12:
1511      devinfo->max_wm_threads = 128 /* threads-per-PSD */
1512                              * devinfo->num_slices
1513                              * 8; /* subslices per slice */
1514      break;
1515   default:
1516      assert(devinfo->ver < 9);
1517      break;
1518   }
1519
1520   assert(devinfo->num_slices <= ARRAY_SIZE(devinfo->num_subslices));
1521
1522   if (devinfo->verx10 == 0)
1523      devinfo->verx10 = devinfo->ver * 10;
1524
1525   if (devinfo->display_ver == 0)
1526      devinfo->display_ver = devinfo->ver;
1527
1528   update_cs_workgroup_threads(devinfo);
1529
1530   return true;
1531}
1532
1533/**
1534 * for gfx8/gfx9, SLICE_MASK/SUBSLICE_MASK can be used to compute the topology
1535 * (kernel 4.13+)
1536 */
1537static bool
1538getparam_topology(struct intel_device_info *devinfo, int fd)
1539{
1540   int slice_mask = 0;
1541   if (!getparam(fd, I915_PARAM_SLICE_MASK, &slice_mask))
1542      goto maybe_warn;
1543
1544   int n_eus;
1545   if (!getparam(fd, I915_PARAM_EU_TOTAL, &n_eus))
1546      goto maybe_warn;
1547
1548   int subslice_mask = 0;
1549   if (!getparam(fd, I915_PARAM_SUBSLICE_MASK, &subslice_mask))
1550      goto maybe_warn;
1551
1552   return update_from_masks(devinfo, slice_mask, subslice_mask, n_eus);
1553
1554 maybe_warn:
1555   /* Only with Gfx8+ are we starting to see devices with fusing that can only
1556    * be detected at runtime.
1557    */
1558   if (devinfo->ver >= 8)
1559      mesa_logw("Kernel 4.1 required to properly query GPU properties.");
1560
1561   return false;
1562}
1563
1564/**
1565 * preferred API for updating the topology in devinfo (kernel 4.17+)
1566 */
1567static bool
1568query_topology(struct intel_device_info *devinfo, int fd)
1569{
1570   struct drm_i915_query_topology_info *topo_info =
1571      intel_i915_query_alloc(fd, DRM_I915_QUERY_TOPOLOGY_INFO, NULL);
1572   if (topo_info == NULL)
1573      return false;
1574
1575   if (devinfo->verx10 >= 125) {
1576      struct drm_i915_query_topology_info *geom_topo_info =
1577         intel_i915_query_alloc(fd, DRM_I915_QUERY_GEOMETRY_SUBSLICES, NULL);
1578      if (geom_topo_info == NULL) {
1579         free(topo_info);
1580         return false;
1581      }
1582
1583      update_from_single_slice_topology(devinfo, topo_info, geom_topo_info);
1584      free(geom_topo_info);
1585   } else {
1586      update_from_topology(devinfo, topo_info);
1587   }
1588
1589   free(topo_info);
1590
1591   return true;
1592
1593}
1594
1595/**
1596 * Reports memory region info, and allows buffers to target system-memory,
1597 * and/or device local memory.
1598 */
1599static bool
1600query_regions(struct intel_device_info *devinfo, int fd, bool update)
1601{
1602   struct drm_i915_query_memory_regions *meminfo =
1603      intel_i915_query_alloc(fd, DRM_I915_QUERY_MEMORY_REGIONS, NULL);
1604   if (meminfo == NULL)
1605      return false;
1606
1607   for (int i = 0; i < meminfo->num_regions; i++) {
1608      const struct drm_i915_memory_region_info *mem = &meminfo->regions[i];
1609      switch (mem->region.memory_class) {
1610      case I915_MEMORY_CLASS_SYSTEM: {
1611         if (!update) {
1612            devinfo->mem.sram.mem_class = mem->region.memory_class;
1613            devinfo->mem.sram.mem_instance = mem->region.memory_instance;
1614            devinfo->mem.sram.mappable.size = mem->probed_size;
1615         } else {
1616            assert(devinfo->mem.sram.mem_class == mem->region.memory_class);
1617            assert(devinfo->mem.sram.mem_instance == mem->region.memory_instance);
1618            assert(devinfo->mem.sram.mappable.size == mem->probed_size);
1619         }
1620         /* The kernel uAPI only reports an accurate unallocated_size value
1621          * for I915_MEMORY_CLASS_DEVICE.
1622          */
1623         uint64_t available;
1624         if (os_get_available_system_memory(&available))
1625            devinfo->mem.sram.mappable.free = MIN2(available, mem->probed_size);
1626         break;
1627      }
1628      case I915_MEMORY_CLASS_DEVICE:
1629         if (!update) {
1630            devinfo->mem.vram.mem_class = mem->region.memory_class;
1631            devinfo->mem.vram.mem_instance = mem->region.memory_instance;
1632            if (mem->probed_cpu_visible_size > 0) {
1633               devinfo->mem.vram.mappable.size = mem->probed_cpu_visible_size;
1634               devinfo->mem.vram.unmappable.size =
1635                  mem->probed_size - mem->probed_cpu_visible_size;
1636            } else {
1637               /* We are running on an older kernel without support for the
1638                * small-bar uapi. These kernels only support systems where the
1639                * entire vram is mappable.
1640                */
1641               devinfo->mem.vram.mappable.size = mem->probed_size;
1642               devinfo->mem.vram.unmappable.size = 0;
1643            }
1644         } else {
1645            assert(devinfo->mem.vram.mem_class == mem->region.memory_class);
1646            assert(devinfo->mem.vram.mem_instance == mem->region.memory_instance);
1647            assert((devinfo->mem.vram.mappable.size +
1648                    devinfo->mem.vram.unmappable.size) == mem->probed_size);
1649         }
1650         if (mem->unallocated_cpu_visible_size > 0) {
1651            if (mem->unallocated_size != -1) {
1652               devinfo->mem.vram.mappable.free = mem->unallocated_cpu_visible_size;
1653               devinfo->mem.vram.unmappable.free =
1654                  mem->unallocated_size - mem->unallocated_cpu_visible_size;
1655            }
1656         } else {
1657            /* We are running on an older kernel without support for the
1658             * small-bar uapi. These kernels only support systems where the
1659             * entire vram is mappable.
1660             */
1661            if (mem->unallocated_size != -1) {
1662               devinfo->mem.vram.mappable.free = mem->unallocated_size;
1663               devinfo->mem.vram.unmappable.free = 0;
1664            }
1665         }
1666         break;
1667      default:
1668         break;
1669      }
1670   }
1671
1672   free(meminfo);
1673   devinfo->mem.use_class_instance = true;
1674   return true;
1675}
1676
1677static bool
1678compute_system_memory(struct intel_device_info *devinfo, bool update)
1679{
1680   uint64_t total_phys;
1681   if (!os_get_total_physical_memory(&total_phys))
1682      return false;
1683
1684   uint64_t available = 0;
1685   os_get_available_system_memory(&available);
1686
1687   if (!update)
1688      devinfo->mem.sram.mappable.size = total_phys;
1689   else
1690      assert(devinfo->mem.sram.mappable.size == total_phys);
1691
1692   devinfo->mem.sram.mappable.free = available;
1693
1694   return true;
1695}
1696
1697static int
1698intel_get_aperture_size(int fd, uint64_t *size)
1699{
1700   struct drm_i915_gem_get_aperture aperture = { 0 };
1701
1702   int ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture);
1703   if (ret == 0 && size)
1704      *size = aperture.aper_size;
1705
1706   return ret;
1707}
1708
1709static bool
1710has_bit6_swizzle(int fd)
1711{
1712   struct drm_gem_close close;
1713   int ret;
1714
1715   struct drm_i915_gem_create gem_create = {
1716      .size = 4096,
1717   };
1718
1719   if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create)) {
1720      unreachable("Failed to create GEM BO");
1721      return false;
1722   }
1723
1724   bool swizzled = false;
1725
1726   /* set_tiling overwrites the input on the error path, so we have to open
1727    * code intel_ioctl.
1728    */
1729   do {
1730      struct drm_i915_gem_set_tiling set_tiling = {
1731         .handle = gem_create.handle,
1732         .tiling_mode = I915_TILING_X,
1733         .stride = 512,
1734      };
1735
1736      ret = ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling);
1737   } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
1738
1739   if (ret != 0) {
1740      unreachable("Failed to set BO tiling");
1741      goto close_and_return;
1742   }
1743
1744   struct drm_i915_gem_get_tiling get_tiling = {
1745      .handle = gem_create.handle,
1746   };
1747
1748   if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling)) {
1749      unreachable("Failed to get BO tiling");
1750      goto close_and_return;
1751   }
1752
1753   assert(get_tiling.tiling_mode == I915_TILING_X);
1754   swizzled = get_tiling.swizzle_mode != I915_BIT_6_SWIZZLE_NONE;
1755
1756close_and_return:
1757   memset(&close, 0, sizeof(close));
1758   close.handle = gem_create.handle;
1759   intel_ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
1760
1761   return swizzled;
1762}
1763
1764static bool
1765has_get_tiling(int fd)
1766{
1767   int ret;
1768
1769   struct drm_i915_gem_create gem_create = {
1770      .size = 4096,
1771   };
1772
1773   if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create)) {
1774      unreachable("Failed to create GEM BO");
1775      return false;
1776   }
1777
1778   struct drm_i915_gem_get_tiling get_tiling = {
1779      .handle = gem_create.handle,
1780   };
1781   ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &get_tiling);
1782
1783   struct drm_gem_close close = {
1784      .handle = gem_create.handle,
1785   };
1786   intel_ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
1787
1788   return ret == 0;
1789}
1790
1791static void
1792fixup_chv_device_info(struct intel_device_info *devinfo)
1793{
1794   assert(devinfo->platform == INTEL_PLATFORM_CHV);
1795
1796   /* Cherryview is annoying.  The number of EUs is depending on fusing and
1797    * isn't determinable from the PCI ID alone.  We default to the minimum
1798    * available for that PCI ID and then compute the real value from the
1799    * subslice information we get from the kernel.
1800    */
1801   const uint32_t subslice_total = intel_device_info_subslice_total(devinfo);
1802   const uint32_t eu_total = intel_device_info_eu_total(devinfo);
1803
1804   /* Logical CS threads = EUs per subslice * num threads per EU */
1805   uint32_t max_cs_threads =
1806      eu_total / subslice_total * devinfo->num_thread_per_eu;
1807
1808   /* Fuse configurations may give more threads than expected, never less. */
1809   if (max_cs_threads > devinfo->max_cs_threads)
1810      devinfo->max_cs_threads = max_cs_threads;
1811
1812   update_cs_workgroup_threads(devinfo);
1813
1814   /* Braswell is even more annoying.  Its marketing name isn't determinable
1815    * from the PCI ID and is also dependent on fusing.
1816    */
1817   if (devinfo->pci_device_id != 0x22B1)
1818      return;
1819
1820   char *bsw_model;
1821   switch (eu_total) {
1822   case 16: bsw_model = "405"; break;
1823   case 12: bsw_model = "400"; break;
1824   default: bsw_model = "   "; break;
1825   }
1826
1827   char *needle = strstr(devinfo->name, "XXX");
1828   assert(needle);
1829   if (needle)
1830      memcpy(needle, bsw_model, 3);
1831}
1832
1833static void
1834init_max_scratch_ids(struct intel_device_info *devinfo)
1835{
1836   /* Determine the max number of subslices that potentially might be used in
1837    * scratch space ids.
1838    *
1839    * For, Gfx11+, scratch space allocation is based on the number of threads
1840    * in the base configuration.
1841    *
1842    * For Gfx9, devinfo->subslice_total is the TOTAL number of subslices and
1843    * we wish to view that there are 4 subslices per slice instead of the
1844    * actual number of subslices per slice. The documentation for 3DSTATE_PS
1845    * "Scratch Space Base Pointer" says:
1846    *
1847    *    "Scratch Space per slice is computed based on 4 sub-slices.  SW
1848    *     must allocate scratch space enough so that each slice has 4
1849    *     slices allowed."
1850    *
1851    * According to the other driver team, this applies to compute shaders
1852    * as well.  This is not currently documented at all.
1853    *
1854    * For Gfx8 and older we user devinfo->subslice_total.
1855    */
1856   unsigned subslices;
1857   if (devinfo->verx10 == 125)
1858      subslices = 32;
1859   else if (devinfo->ver == 12)
1860      subslices = (devinfo->platform == INTEL_PLATFORM_DG1 || devinfo->gt == 2 ? 6 : 2);
1861   else if (devinfo->ver == 11)
1862      subslices = 8;
1863   else if (devinfo->ver >= 9 && devinfo->ver < 11)
1864      subslices = 4 * devinfo->num_slices;
1865   else
1866      subslices = devinfo->subslice_total;
1867   assert(subslices >= devinfo->subslice_total);
1868
1869   unsigned scratch_ids_per_subslice;
1870   if (devinfo->ver >= 12) {
1871      /* Same as ICL below, but with 16 EUs. */
1872      scratch_ids_per_subslice = 16 * 8;
1873   } else if (devinfo->ver >= 11) {
1874      /* The MEDIA_VFE_STATE docs say:
1875       *
1876       *    "Starting with this configuration, the Maximum Number of
1877       *     Threads must be set to (#EU * 8) for GPGPU dispatches.
1878       *
1879       *     Although there are only 7 threads per EU in the configuration,
1880       *     the FFTID is calculated as if there are 8 threads per EU,
1881       *     which in turn requires a larger amount of Scratch Space to be
1882       *     allocated by the driver."
1883       */
1884      scratch_ids_per_subslice = 8 * 8;
1885   } else if (devinfo->platform == INTEL_PLATFORM_HSW) {
1886      /* WaCSScratchSize:hsw
1887       *
1888       * Haswell's scratch space address calculation appears to be sparse
1889       * rather than tightly packed. The Thread ID has bits indicating
1890       * which subslice, EU within a subslice, and thread within an EU it
1891       * is. There's a maximum of two slices and two subslices, so these
1892       * can be stored with a single bit. Even though there are only 10 EUs
1893       * per subslice, this is stored in 4 bits, so there's an effective
1894       * maximum value of 16 EUs. Similarly, although there are only 7
1895       * threads per EU, this is stored in a 3 bit number, giving an
1896       * effective maximum value of 8 threads per EU.
1897       *
1898       * This means that we need to use 16 * 8 instead of 10 * 7 for the
1899       * number of threads per subslice.
1900       */
1901      scratch_ids_per_subslice = 16 * 8;
1902   } else if (devinfo->platform == INTEL_PLATFORM_CHV) {
1903      /* Cherryview devices have either 6 or 8 EUs per subslice, and each
1904       * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
1905       * as if it had 8 EUs.
1906       */
1907      scratch_ids_per_subslice = 8 * 7;
1908   } else {
1909      scratch_ids_per_subslice = devinfo->max_cs_threads;
1910   }
1911
1912   unsigned max_thread_ids = scratch_ids_per_subslice * subslices;
1913
1914   if (devinfo->verx10 >= 125) {
1915      /* On GFX version 12.5, scratch access changed to a surface-based model.
1916       * Instead of each shader type having its own layout based on IDs passed
1917       * from the relevant fixed-function unit, all scratch access is based on
1918       * thread IDs like it always has been for compute.
1919       */
1920      for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_STAGES; i++)
1921         devinfo->max_scratch_ids[i] = max_thread_ids;
1922   } else {
1923      unsigned max_scratch_ids[] = {
1924         [MESA_SHADER_VERTEX]    = devinfo->max_vs_threads,
1925         [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
1926         [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
1927         [MESA_SHADER_GEOMETRY]  = devinfo->max_gs_threads,
1928         [MESA_SHADER_FRAGMENT]  = devinfo->max_wm_threads,
1929         [MESA_SHADER_COMPUTE]   = max_thread_ids,
1930      };
1931      STATIC_ASSERT(sizeof(devinfo->max_scratch_ids) == sizeof(max_scratch_ids));
1932      memcpy(devinfo->max_scratch_ids, max_scratch_ids,
1933             sizeof(devinfo->max_scratch_ids));
1934   }
1935}
1936
1937bool
1938intel_get_device_info_from_fd(int fd, struct intel_device_info *devinfo)
1939{
1940   /* Get PCI info.
1941    *
1942    * Some callers may already have a valid drm device which holds values of
1943    * PCI fields queried here prior to calling this function. But making this
1944    * query optional leads to a more cumbersome implementation. These callers
1945    * still need to initialize the fields somewhere out of this function and
1946    * rely on an ioctl to get PCI device id for the next step when skipping
1947    * this drm query.
1948    */
1949   drmDevicePtr drmdev = NULL;
1950   if (drmGetDevice2(fd, DRM_DEVICE_GET_PCI_REVISION, &drmdev)) {
1951      mesa_loge("Failed to query drm device.");
1952      return false;
1953   }
1954   if (!intel_get_device_info_from_pci_id
1955       (drmdev->deviceinfo.pci->device_id, devinfo)) {
1956      drmFreeDevice(&drmdev);
1957      return false;
1958   }
1959   devinfo->pci_domain = drmdev->businfo.pci->domain;
1960   devinfo->pci_bus = drmdev->businfo.pci->bus;
1961   devinfo->pci_dev = drmdev->businfo.pci->dev;
1962   devinfo->pci_func = drmdev->businfo.pci->func;
1963   devinfo->pci_device_id = drmdev->deviceinfo.pci->device_id;
1964   devinfo->pci_revision_id = drmdev->deviceinfo.pci->revision_id;
1965   drmFreeDevice(&drmdev);
1966   devinfo->no_hw = env_var_as_boolean("INTEL_NO_HW", false);
1967
1968   if (devinfo->ver == 10) {
1969      mesa_loge("Gfx10 support is redacted.");
1970      return false;
1971   }
1972
1973   /* remaining initializion queries the kernel for device info */
1974   if (devinfo->no_hw) {
1975      /* Provide some sensible values for NO_HW. */
1976      devinfo->gtt_size =
1977         devinfo->ver >= 8 ? (1ull << 48) : 2ull * 1024 * 1024 * 1024;
1978      compute_system_memory(devinfo, false);
1979      return true;
1980   }
1981
1982   if (intel_get_and_process_hwconfig_table(fd, devinfo)) {
1983      /* After applying hwconfig values, some items need to be recalculated. */
1984      devinfo->max_cs_threads =
1985         devinfo->max_eus_per_subslice * devinfo->num_thread_per_eu;
1986
1987      update_cs_workgroup_threads(devinfo);
1988   }
1989
1990   int timestamp_frequency;
1991   if (getparam(fd, I915_PARAM_CS_TIMESTAMP_FREQUENCY,
1992                &timestamp_frequency))
1993      devinfo->timestamp_frequency = timestamp_frequency;
1994   else if (devinfo->ver >= 10) {
1995      mesa_loge("Kernel 4.15 required to read the CS timestamp frequency.");
1996      return false;
1997   }
1998
1999   if (!getparam(fd, I915_PARAM_REVISION, &devinfo->revision))
2000      devinfo->revision = 0;
2001
2002   if (!query_topology(devinfo, fd)) {
2003      if (devinfo->ver >= 10) {
2004         /* topology uAPI required for CNL+ (kernel 4.17+) */
2005         return false;
2006      }
2007
2008      /* else use the kernel 4.13+ api for gfx8+.  For older kernels, topology
2009       * will be wrong, affecting GPU metrics. In this case, fail silently.
2010       */
2011      getparam_topology(devinfo, fd);
2012   }
2013
2014   /* If the memory region uAPI query is not available, try to generate some
2015    * numbers out of os_* utils for sram only.
2016    */
2017   if (!query_regions(devinfo, fd, false))
2018      compute_system_memory(devinfo, false);
2019
2020   /* region info is required for lmem support */
2021   if (devinfo->has_local_mem && !devinfo->mem.use_class_instance) {
2022      mesa_logw("Could not query local memory size.");
2023      return false;
2024   }
2025
2026   if (devinfo->platform == INTEL_PLATFORM_CHV)
2027      fixup_chv_device_info(devinfo);
2028
2029   /* Broadwell PRM says:
2030    *
2031    *   "Before Gfx8, there was a historical configuration control field to
2032    *    swizzle address bit[6] for in X/Y tiling modes. This was set in three
2033    *    different places: TILECTL[1:0], ARB_MODE[5:4], and
2034    *    DISP_ARB_CTL[14:13].
2035    *
2036    *    For Gfx8 and subsequent generations, the swizzle fields are all
2037    *    reserved, and the CPU's memory controller performs all address
2038    *    swizzling modifications."
2039    */
2040   devinfo->has_bit6_swizzle = devinfo->ver < 8 && has_bit6_swizzle(fd);
2041
2042   intel_get_aperture_size(fd, &devinfo->aperture_bytes);
2043   get_context_param(fd, 0, I915_CONTEXT_PARAM_GTT_SIZE, &devinfo->gtt_size);
2044   devinfo->has_tiling_uapi = has_get_tiling(fd);
2045
2046   /* Gfx7 and older do not support EU/Subslice info */
2047   assert(devinfo->subslice_total >= 1 || devinfo->ver <= 7);
2048   devinfo->subslice_total = MAX2(devinfo->subslice_total, 1);
2049
2050   init_max_scratch_ids(devinfo);
2051
2052   return true;
2053}
2054
/* Refresh the memory information stored in devinfo.
 *
 * The memory region query is tried first; only when it fails is the
 * system-memory computation used instead.  Returns true if either of the
 * two succeeded.
 */
bool
intel_device_info_update_memory_info(struct intel_device_info *devinfo, int fd)
{
   if (query_regions(devinfo, fd, true))
      return true;

   return compute_system_memory(devinfo, true);
}
2059