1/*
2 * Copyright © 2020 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "brw_eu.h"
25#include "brw_fs.h"
26#include "brw_vec4.h"
27#include "brw_cfg.h"
28
29using namespace brw;
30
31namespace {
/**
 * Asynchronous hardware units that are able to carry out computations
 * concurrently on behalf of a single shader thread.  The enumerators double
 * as indices into per-unit arrays sized with EU_NUM_UNITS.
 */
enum intel_eu_unit {
   /** Front-end of the EU. */
   EU_UNIT_FE = 0,
   /** FPU0 of the EU (co-issue to FPU1 is not modeled here for now). */
   EU_UNIT_FPU,
   /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
   EU_UNIT_EM,
   /** Shared function: Sampler. */
   EU_UNIT_SAMPLER,
   /** Shared function: Pixel Interpolator. */
   EU_UNIT_PI,
   /** Shared function: Unified Return Buffer. */
   EU_UNIT_URB,
   /** Shared function: Data Port Data Cache. */
   EU_UNIT_DP_DC,
   /** Shared function: Data Port Render Cache. */
   EU_UNIT_DP_RC,
   /** Shared function: Data Port Constant Cache. */
   EU_UNIT_DP_CC,
   /** Shared function: Message Gateway. */
   EU_UNIT_GATEWAY,
   /** Shared function: Thread Spawner. */
   EU_UNIT_SPAWNER,
   /* EU_UNIT_VME, */
   /* EU_UNIT_CRE, */
   /** Count of the asynchronous units tracked at present. */
   EU_NUM_UNITS,
   /**
    * Placeholder unit for instructions that take up no runtime on any of
    * the units listed above.  Shares its value with EU_NUM_UNITS.
    */
   EU_UNIT_NULL = EU_NUM_UNITS
};
66
67   /**
68    * Enumeration representing a computation result another computation can
69    * potentially depend on.
70    */
71   enum intel_eu_dependency_id {
72      /* Register part of the GRF. */
73      EU_DEPENDENCY_ID_GRF0 = 0,
74      /* Register part of the MRF.  Only used on Gfx4-6. */
75      EU_DEPENDENCY_ID_MRF0 = EU_DEPENDENCY_ID_GRF0 + BRW_MAX_GRF,
76      /* Address register part of the ARF. */
77      EU_DEPENDENCY_ID_ADDR0 = EU_DEPENDENCY_ID_MRF0 + 24,
78      /* Accumulator register part of the ARF. */
79      EU_DEPENDENCY_ID_ACCUM0 = EU_DEPENDENCY_ID_ADDR0 + 1,
80      /* Flag register part of the ARF. */
81      EU_DEPENDENCY_ID_FLAG0 = EU_DEPENDENCY_ID_ACCUM0 + 12,
82      /* SBID token write completion.  Only used on Gfx12+. */
83      EU_DEPENDENCY_ID_SBID_WR0 = EU_DEPENDENCY_ID_FLAG0 + 8,
84      /* SBID token read completion.  Only used on Gfx12+. */
85      EU_DEPENDENCY_ID_SBID_RD0 = EU_DEPENDENCY_ID_SBID_WR0 + 16,
86      /* Number of computation dependencies currently tracked. */
87      EU_NUM_DEPENDENCY_IDS = EU_DEPENDENCY_ID_SBID_RD0 + 16
88   };
89
90   /**
91    * State of our modeling of the program execution.
92    */
93   struct state {
94      state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
95      /**
96       * Time at which a given unit will be ready to execute the next
97       * computation, in clock units.
98       */
99      unsigned unit_ready[EU_NUM_UNITS];
100      /**
101       * Time at which an instruction dependent on a given dependency ID will
102       * be ready to execute, in clock units.
103       */
104      unsigned dep_ready[EU_NUM_DEPENDENCY_IDS];
105      /**
106       * Aggregated utilization of a given unit excluding idle cycles,
107       * in clock units.
108       */
109      float unit_busy[EU_NUM_UNITS];
110      /**
111       * Factor of the overhead of a computation accounted for in the
112       * aggregated utilization calculation.
113       */
114      float weight;
115   };
116
117   /**
118    * Information derived from an IR instruction used to compute performance
119    * estimates.  Allows the timing calculation to work on both FS and VEC4
120    * instructions.
121    */
122   struct instruction_info {
123      instruction_info(const struct brw_isa_info *isa, const fs_inst *inst) :
124         isa(isa), devinfo(isa->devinfo), op(inst->opcode),
125         td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
126         tx(get_exec_type(inst)), sx(0), ss(0),
127         sc(has_bank_conflict(isa, inst) ? sd : 0),
128         desc(inst->desc), sfid(inst->sfid)
129      {
130         /* We typically want the maximum source size, except for split send
131          * messages which require the total size.
132          */
133         if (inst->opcode == SHADER_OPCODE_SEND) {
134            ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
135                 DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
136         } else {
137            for (unsigned i = 0; i < inst->sources; i++)
138               ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
139         }
140
141         /* Convert the execution size to GRF units. */
142         sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
143
144         /* 32x32 integer multiplication has half the usual ALU throughput.
145          * Treat it as double-precision.
146          */
147         if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
148             !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
149             type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
150            tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
151      }
152
153      instruction_info(const struct brw_isa_info *isa,
154                       const vec4_instruction *inst) :
155         isa(isa), devinfo(isa->devinfo), op(inst->opcode),
156         td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
157         tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
158         desc(inst->desc), sfid(inst->sfid)
159      {
160         /* Compute the maximum source size. */
161         for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
162            ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
163
164         /* Convert the execution size to GRF units. */
165         sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
166
167         /* 32x32 integer multiplication has half the usual ALU throughput.
168          * Treat it as double-precision.
169          */
170         if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
171             !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
172             type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
173            tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
174      }
175
176      /** ISA encoding information */
177      const struct brw_isa_info *isa;
178      /** Device information. */
179      const struct intel_device_info *devinfo;
180      /** Instruction opcode. */
181      opcode op;
182      /** Destination type. */
183      brw_reg_type td;
184      /** Destination size in GRF units. */
185      unsigned sd;
186      /** Execution type. */
187      brw_reg_type tx;
188      /** Execution size in GRF units. */
189      unsigned sx;
190      /** Source size. */
191      unsigned ss;
192      /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
193      unsigned sc;
194      /** Send message descriptor. */
195      uint32_t desc;
196      /** Send message shared function ID. */
197      uint8_t sfid;
198   };
199
200   /**
201    * Timing information of an instruction used to estimate the performance of
202    * the program.
203    */
204   struct perf_desc {
205      perf_desc(enum intel_eu_unit u, int df, int db,
206                int ls, int ld, int la, int lf) :
207         u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}
208
209      /**
210       * Back-end unit its runtime shall be accounted to, in addition to the
211       * EU front-end which is always assumed to be involved.
212       */
213      enum intel_eu_unit u;
214      /**
215       * Overhead cycles from the time that the EU front-end starts executing
216       * the instruction until it's ready to execute the next instruction.
217       */
218      int df;
219      /**
220       * Overhead cycles from the time that the back-end starts executing the
221       * instruction until it's ready to execute the next instruction.
222       */
223      int db;
224      /**
225       * Latency cycles from the time that the back-end starts executing the
226       * instruction until its sources have been read from the register file.
227       */
228      int ls;
229      /**
230       * Latency cycles from the time that the back-end starts executing the
231       * instruction until its regular destination has been written to the
232       * register file.
233       */
234      int ld;
235      /**
236       * Latency cycles from the time that the back-end starts executing the
237       * instruction until its accumulator destination has been written to the
238       * ARF file.
239       *
240       * Note that this is an approximation of the real behavior of
241       * accumulating instructions in the hardware: Instead of modeling a pair
242       * of back-to-back accumulating instructions as a first computation with
243       * latency equal to ld followed by another computation with a
244       * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
245       * model the stall as if it occurred at the top of the pipeline, with
246       * the latency of the accumulator computation offset accordingly.
247       */
248      int la;
249      /**
250       * Latency cycles from the time that the back-end starts executing the
251       * instruction until its flag destination has been written to the ARF
252       * file.
253       */
254      int lf;
255   };
256
257   /**
258    * Compute the timing information of an instruction based on any relevant
259    * information from the IR and a number of parameters specifying a linear
260    * approximation: Parameter X_Y specifies the derivative of timing X
261    * relative to info field Y, while X_1 specifies the independent term of
262    * the approximation of timing X.
263    */
264   perf_desc
265   calculate_desc(const instruction_info &info, enum intel_eu_unit u,
266                  int df_1, int df_sd, int df_sc,
267                  int db_1, int db_sx,
268                  int ls_1, int ld_1, int la_1, int lf_1,
269                  int l_ss, int l_sd)
270   {
271      return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
272                          db_1 + db_sx * int(info.sx),
273                          ls_1 + l_ss * int(info.ss),
274                          ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
275                          la_1, lf_1);
276   }
277
278   /**
279    * Compute the timing information of an instruction based on any relevant
280    * information from the IR and a number of linear approximation parameters
281    * hard-coded for each IR instruction.
282    *
283    * Most timing parameters are obtained from the multivariate linear
284    * regression of a sample of empirical timings measured using the tm0
285    * register (as can be done today by using the shader_time debugging
286    * option).  The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
287    * "Shared Functions - Extended Math", Section 3.2 "Performance".
288    * Parameters marked XXX shall be considered low-quality, they're possibly
289    * high variance or completely guessed in cases where experimental data was
290    * unavailable.
291    */
292   const perf_desc
293   instruction_desc(const instruction_info &info)
294   {
295      const struct intel_device_info *devinfo = info.devinfo;
296
297      switch (info.op) {
298      case BRW_OPCODE_SYNC:
299      case BRW_OPCODE_SEL:
300      case BRW_OPCODE_NOT:
301      case BRW_OPCODE_AND:
302      case BRW_OPCODE_OR:
303      case BRW_OPCODE_XOR:
304      case BRW_OPCODE_SHR:
305      case BRW_OPCODE_SHL:
306      case BRW_OPCODE_DIM:
307      case BRW_OPCODE_ASR:
308      case BRW_OPCODE_CMPN:
309      case BRW_OPCODE_F16TO32:
310      case BRW_OPCODE_BFREV:
311      case BRW_OPCODE_BFI1:
312      case BRW_OPCODE_AVG:
313      case BRW_OPCODE_FRC:
314      case BRW_OPCODE_RNDU:
315      case BRW_OPCODE_RNDD:
316      case BRW_OPCODE_RNDE:
317      case BRW_OPCODE_RNDZ:
318      case BRW_OPCODE_MAC:
319      case BRW_OPCODE_MACH:
320      case BRW_OPCODE_LZD:
321      case BRW_OPCODE_FBH:
322      case BRW_OPCODE_FBL:
323      case BRW_OPCODE_CBIT:
324      case BRW_OPCODE_ADDC:
325      case BRW_OPCODE_ROR:
326      case BRW_OPCODE_ROL:
327      case BRW_OPCODE_SUBB:
328      case BRW_OPCODE_SAD2:
329      case BRW_OPCODE_SADA2:
330      case BRW_OPCODE_LINE:
331      case BRW_OPCODE_NOP:
332      case SHADER_OPCODE_CLUSTER_BROADCAST:
333      case SHADER_OPCODE_SCRATCH_HEADER:
334      case FS_OPCODE_DDX_COARSE:
335      case FS_OPCODE_DDX_FINE:
336      case FS_OPCODE_DDY_COARSE:
337      case FS_OPCODE_PIXEL_X:
338      case FS_OPCODE_PIXEL_Y:
339      case FS_OPCODE_SET_SAMPLE_ID:
340      case VEC4_OPCODE_MOV_BYTES:
341      case VEC4_OPCODE_UNPACK_UNIFORM:
342      case VEC4_OPCODE_DOUBLE_TO_F32:
343      case VEC4_OPCODE_DOUBLE_TO_D32:
344      case VEC4_OPCODE_DOUBLE_TO_U32:
345      case VEC4_OPCODE_TO_DOUBLE:
346      case VEC4_OPCODE_PICK_LOW_32BIT:
347      case VEC4_OPCODE_PICK_HIGH_32BIT:
348      case VEC4_OPCODE_SET_LOW_32BIT:
349      case VEC4_OPCODE_SET_HIGH_32BIT:
350      case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
351      case GS_OPCODE_SET_DWORD_2:
352      case GS_OPCODE_SET_WRITE_OFFSET:
353      case GS_OPCODE_SET_VERTEX_COUNT:
354      case GS_OPCODE_PREPARE_CHANNEL_MASKS:
355      case GS_OPCODE_SET_CHANNEL_MASKS:
356      case GS_OPCODE_GET_INSTANCE_ID:
357      case GS_OPCODE_SET_PRIMITIVE_ID:
358      case GS_OPCODE_SVB_SET_DST_INDEX:
359      case TCS_OPCODE_SRC0_010_IS_ZERO:
360      case TCS_OPCODE_GET_PRIMITIVE_ID:
361      case TES_OPCODE_GET_PRIMITIVE_ID:
362      case SHADER_OPCODE_READ_SR_REG:
363         if (devinfo->ver >= 11) {
364            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
365                                  0, 10, 6 /* XXX */, 14, 0, 0);
366         } else if (devinfo->ver >= 8) {
367            if (type_sz(info.tx) > 4)
368               return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
369                                     0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
370            else
371               return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
372                                     0, 8, 4, 12, 0, 0);
373         } else if (devinfo->verx10 >= 75) {
374            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
375                                  0, 10, 6 /* XXX */, 16, 0, 0);
376         } else {
377            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
378                                  0, 12, 8 /* XXX */, 18, 0, 0);
379         }
380
381      case BRW_OPCODE_MOV:
382      case BRW_OPCODE_CMP:
383      case BRW_OPCODE_ADD:
384      case BRW_OPCODE_ADD3:
385      case BRW_OPCODE_MUL:
386      case SHADER_OPCODE_MOV_RELOC_IMM:
387      case VEC4_OPCODE_MOV_FOR_SCRATCH:
388         if (devinfo->ver >= 11) {
389            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
390                                  0, 10, 6, 14, 0, 0);
391         } else if (devinfo->ver >= 8) {
392            if (type_sz(info.tx) > 4)
393               return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
394                                     0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
395            else
396               return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
397                                     0, 8, 4, 12, 0, 0);
398         } else if (devinfo->verx10 >= 75) {
399            if (info.tx == BRW_REGISTER_TYPE_F)
400               return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
401                                     0, 12, 8 /* XXX */, 18, 0, 0);
402            else
403               return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
404                                     0, 10, 6 /* XXX */, 16, 0, 0);
405         } else if (devinfo->ver >= 7) {
406            if (info.tx == BRW_REGISTER_TYPE_F)
407               return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
408                                     0, 14, 10 /* XXX */, 20, 0, 0);
409            else
410               return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
411                                     0, 12, 8 /* XXX */, 18, 0, 0);
412         } else {
413            return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
414                                  0, 2 /* XXX */,
415                                  0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
416                                  0, 0);
417         }
418
419      case BRW_OPCODE_BFE:
420      case BRW_OPCODE_BFI2:
421      case BRW_OPCODE_CSEL:
422         if (devinfo->ver >= 11)
423            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
424                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
425         else if (devinfo->ver >= 8)
426            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
427                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
428         else if (devinfo->verx10 >= 75)
429            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
430                                  0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
431         else if (devinfo->ver >= 7)
432            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
433                                  0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
434         else
435            abort();
436
437      case BRW_OPCODE_MAD:
438         if (devinfo->ver >= 11) {
439            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
440                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
441         } else if (devinfo->ver >= 8) {
442            if (type_sz(info.tx) > 4)
443               return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
444                                     0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
445            else
446               return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
447                                     0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
448         } else if (devinfo->verx10 >= 75) {
449            if (info.tx == BRW_REGISTER_TYPE_F)
450               return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
451                                     0, 12, 8 /* XXX */, 18, 0, 0);
452            else
453               return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
454                                     0, 10, 6 /* XXX */, 16, 0, 0);
455         } else if (devinfo->ver >= 7) {
456            if (info.tx == BRW_REGISTER_TYPE_F)
457               return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
458                                     0, 14, 10 /* XXX */, 20, 0, 0);
459            else
460               return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
461                                     0, 12, 8 /* XXX */, 18, 0, 0);
462         } else if (devinfo->ver >= 6) {
463            return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 1 /* XXX */,
464                                  0, 2 /* XXX */,
465                                  0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
466                                  0, 0);
467         } else {
468            abort();
469         }
470
471      case BRW_OPCODE_F32TO16:
472         if (devinfo->ver >= 11)
473            return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
474                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
475         else if (devinfo->ver >= 8)
476            return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
477                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
478         else if (devinfo->verx10 >= 75)
479            return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
480                                  0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
481         else if (devinfo->ver >= 7)
482            return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
483                                  0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
484         else
485            abort();
486
487      case BRW_OPCODE_DP4:
488      case BRW_OPCODE_DPH:
489      case BRW_OPCODE_DP3:
490      case BRW_OPCODE_DP2:
491         if (devinfo->ver >= 8)
492            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
493                                  0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
494         else if (devinfo->verx10 >= 75)
495            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
496                                  0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
497         else
498            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
499                                  0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
500
501      case BRW_OPCODE_DP4A:
502         if (devinfo->ver >= 12)
503            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
504                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
505         else
506            abort();
507
508      case SHADER_OPCODE_RCP:
509      case SHADER_OPCODE_RSQ:
510      case SHADER_OPCODE_SQRT:
511      case SHADER_OPCODE_EXP2:
512      case SHADER_OPCODE_LOG2:
513      case SHADER_OPCODE_SIN:
514      case SHADER_OPCODE_COS:
515      case SHADER_OPCODE_POW:
516      case SHADER_OPCODE_INT_QUOTIENT:
517      case SHADER_OPCODE_INT_REMAINDER:
518         if (devinfo->ver >= 6) {
519            switch (info.op) {
520            case SHADER_OPCODE_RCP:
521            case SHADER_OPCODE_RSQ:
522            case SHADER_OPCODE_SQRT:
523            case SHADER_OPCODE_EXP2:
524            case SHADER_OPCODE_LOG2:
525            case SHADER_OPCODE_SIN:
526            case SHADER_OPCODE_COS:
527               if (devinfo->ver >= 8)
528                  return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 4,
529                                        0, 16, 0, 0, 0, 0);
530               else if (devinfo->verx10 >= 75)
531                  return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2,
532                                        0, 12, 0, 0, 0, 0);
533               else
534                  return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2,
535                                        0, 14, 0, 0, 0, 0);
536
537            case SHADER_OPCODE_POW:
538               if (devinfo->ver >= 8)
539                  return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 8,
540                                        0, 24, 0, 0, 0, 0);
541               else if (devinfo->verx10 >= 75)
542                  return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4,
543                                        0, 20, 0, 0, 0, 0);
544               else
545                  return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4,
546                                        0, 22, 0, 0, 0, 0);
547
548            case SHADER_OPCODE_INT_QUOTIENT:
549            case SHADER_OPCODE_INT_REMAINDER:
550               return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 26, 0,
551                                     0, 28 /* XXX */, 0, 0, 0, 0);
552
553            default:
554               abort();
555            }
556         } else {
557            switch (info.op) {
558            case SHADER_OPCODE_RCP:
559               return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 8,
560                                     0, 22, 0, 0, 0, 8);
561
562            case SHADER_OPCODE_RSQ:
563               return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 16,
564                                     0, 44, 0, 0, 0, 8);
565
566            case SHADER_OPCODE_INT_QUOTIENT:
567            case SHADER_OPCODE_SQRT:
568            case SHADER_OPCODE_LOG2:
569               return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 24,
570                                     0, 66, 0, 0, 0, 8);
571
572            case SHADER_OPCODE_INT_REMAINDER:
573            case SHADER_OPCODE_EXP2:
574               return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 32,
575                                     0, 88, 0, 0, 0, 8);
576
577            case SHADER_OPCODE_SIN:
578            case SHADER_OPCODE_COS:
579               return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 48,
580                                     0, 132, 0, 0, 0, 8);
581
582            case SHADER_OPCODE_POW:
583               return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 64,
584                                     0, 176, 0, 0, 0, 8);
585
586            default:
587               abort();
588            }
589         }
590
591      case BRW_OPCODE_DO:
592         if (devinfo->ver >= 6)
593            return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
594                                  0, 0, 0, 0, 0, 0);
595         else
596            return calculate_desc(info, EU_UNIT_NULL, 2 /* XXX */, 0, 0, 0, 0,
597                                  0, 0, 0, 0, 0, 0);
598
599      case BRW_OPCODE_IF:
600      case BRW_OPCODE_ELSE:
601      case BRW_OPCODE_ENDIF:
602      case BRW_OPCODE_WHILE:
603      case BRW_OPCODE_BREAK:
604      case BRW_OPCODE_CONTINUE:
605      case BRW_OPCODE_HALT:
606         if (devinfo->ver >= 8)
607            return calculate_desc(info, EU_UNIT_NULL, 8, 0, 0, 0, 0,
608                                  0, 0, 0, 0, 0, 0);
609         else if (devinfo->verx10 >= 75)
610            return calculate_desc(info, EU_UNIT_NULL, 6, 0, 0, 0, 0,
611                                  0, 0, 0, 0, 0, 0);
612         else
613            return calculate_desc(info, EU_UNIT_NULL, 2, 0, 0, 0, 0,
614                                  0, 0, 0, 0, 0, 0);
615
616      case FS_OPCODE_LINTERP:
617         if (devinfo->ver >= 8)
618            return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
619                                  0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
620         else if (devinfo->verx10 >= 75)
621            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
622                                  0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
623         else
624            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
625                                  0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
626
627      case BRW_OPCODE_LRP:
628         if (devinfo->ver >= 8)
629            return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
630                                  0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
631         else if (devinfo->verx10 >= 75)
632            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
633                                  0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
634         else if (devinfo->ver >= 6)
635            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
636                                  0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
637         else
638            abort();
639
640      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
641         if (devinfo->ver >= 11)
642            return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
643                                  0, 10 /* XXX */, 6 /* XXX */,
644                                  14 /* XXX */, 0, 0);
645         else if (devinfo->ver >= 8)
646            return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6,
647                                  0, 8 /* XXX */, 4 /* XXX */,
648                                  12 /* XXX */, 0, 0);
649         else if (devinfo->verx10 >= 75)
650            return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
651                                  0, 10 /* XXX */, 6 /* XXX */,
652                                  16 /* XXX */, 0, 0);
653         else if (devinfo->ver >= 7)
654            return calculate_desc(info, EU_UNIT_FPU, 24, 6, 0, 0, 6,
655                                  0, 12 /* XXX */, 8 /* XXX */,
656                                  18 /* XXX */, 0, 0);
657         else
658            abort();
659
660      case SHADER_OPCODE_MOV_INDIRECT:
661         if (devinfo->ver >= 11)
662            return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
663                                  0, 10 /* XXX */, 6 /* XXX */,
664                                  14 /* XXX */, 0, 0);
665         else if (devinfo->ver >= 8)
666            return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
667                                  0, 8 /* XXX */, 4 /* XXX */,
668                                  12 /* XXX */, 0, 0);
669         else if (devinfo->verx10 >= 75)
670            return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
671                                  0, 10 /* XXX */, 6 /* XXX */,
672                                  16 /* XXX */, 0, 0);
673         else
674            return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
675                                  0, 12 /* XXX */, 8 /* XXX */,
676                                  18 /* XXX */, 0, 0);
677
678      case SHADER_OPCODE_BROADCAST:
679         if (devinfo->ver >= 11)
680            return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 4, 0,
681                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
682         else if (devinfo->ver >= 8)
683            return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
684                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
685         else if (devinfo->verx10 >= 75)
686            return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
687                                  0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
688         else if (devinfo->ver >= 7)
689            return calculate_desc(info, EU_UNIT_FPU, 20, 0, 0, 4, 0,
690                                  0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
691         else
692            abort();
693
694      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
695      case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
696         if (devinfo->ver >= 11)
697            return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
698                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
699         else if (devinfo->ver >= 8)
700            return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
701                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
702         else if (devinfo->verx10 >= 75)
703            return calculate_desc(info, EU_UNIT_FPU, 36, 0, 0, 6, 0,
704                                  0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
705         else if (devinfo->ver >= 7)
706            return calculate_desc(info, EU_UNIT_FPU, 40, 0, 0, 6, 0,
707                                  0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
708         else
709            abort();
710
711      case SHADER_OPCODE_RND_MODE:
712      case SHADER_OPCODE_FLOAT_CONTROL_MODE:
713         if (devinfo->ver >= 11)
714            return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
715                                  4 /* XXX */, 0,
716                                  0, 0, 0, 0, 0, 0);
717         else if (devinfo->ver >= 8)
718            return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0,
719                                  4 /* XXX */, 0,
720                                  0, 0, 0, 0, 0, 0);
721         else if (devinfo->verx10 >= 75)
722            return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
723                                  4 /* XXX */, 0,
724                                  0, 0, 0, 0, 0, 0);
725         else if (devinfo->ver >= 6)
726            return calculate_desc(info, EU_UNIT_FPU, 28 /* XXX */, 0, 0,
727                                  4 /* XXX */, 0,
728                                  0, 0, 0, 0, 0, 0);
729         else
730            abort();
731
732      case SHADER_OPCODE_SHUFFLE:
733         if (devinfo->ver >= 11)
734            return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
735                                  44 /* XXX */, 0,
736                                  0, 10 /* XXX */, 6 /* XXX */,
737                                  14 /* XXX */, 0, 0);
738         else if (devinfo->ver >= 8)
739            return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0,
740                                  42 /* XXX */, 0,
741                                  0, 8 /* XXX */, 4 /* XXX */,
742                                  12 /* XXX */, 0, 0);
743         else if (devinfo->verx10 >= 75)
744            return calculate_desc(info, EU_UNIT_FPU, 0, 44 /* XXX */, 0,
745                                  0, 44 /* XXX */,
746                                  0, 10 /* XXX */, 6 /* XXX */,
747                                  16 /* XXX */, 0, 0);
748         else if (devinfo->ver >= 6)
749            return calculate_desc(info, EU_UNIT_FPU, 0, 46 /* XXX */, 0,
750                                  0, 46 /* XXX */,
751                                  0, 12 /* XXX */, 8 /* XXX */,
752                                  18 /* XXX */, 0, 0);
753         else
754            abort();
755
756      case SHADER_OPCODE_SEL_EXEC:
757         if (devinfo->ver >= 11)
758            return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
759                                  0, 4 /* XXX */,
760                                  0, 10 /* XXX */, 6 /* XXX */,
761                                  14 /* XXX */, 0, 0);
762         else if (devinfo->ver >= 8)
763            return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0,
764                                  0, 4 /* XXX */,
765                                  0, 8 /* XXX */, 4 /* XXX */,
766                                  12 /* XXX */, 0, 0);
767         else if (devinfo->verx10 >= 75)
768            return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
769                                  0, 4 /* XXX */,
770                                  0, 10 /* XXX */, 6 /* XXX */,
771                                  16 /* XXX */, 0, 0);
772         else
773            return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 4 /* XXX */, 0,
774                                  0, 4 /* XXX */,
775                                  0, 12 /* XXX */, 8 /* XXX */,
776                                  18 /* XXX */, 0, 0);
777
778      case SHADER_OPCODE_QUAD_SWIZZLE:
779         if (devinfo->ver >= 11)
780            return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
781                                  0, 8 /* XXX */,
782                                  0, 10 /* XXX */, 6 /* XXX */,
783                                  14 /* XXX */, 0, 0);
784         else if (devinfo->ver >= 8)
785            return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
786                                  0, 8 /* XXX */,
787                                  0, 8 /* XXX */, 4 /* XXX */,
788                                  12 /* XXX */, 0, 0);
789         else if (devinfo->verx10 >= 75)
790            return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
791                                  0, 8 /* XXX */,
792                                  0, 10 /* XXX */, 6 /* XXX */,
793                                  16 /* XXX */, 0, 0);
794         else
795            return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
796                                  0, 8 /* XXX */,
797                                  0, 12 /* XXX */, 8 /* XXX */,
798                                  18 /* XXX */, 0, 0);
799
800      case FS_OPCODE_DDY_FINE:
801         if (devinfo->ver >= 11)
802            return calculate_desc(info, EU_UNIT_FPU, 0, 14, 0, 0, 4,
803                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
804         else if (devinfo->ver >= 8)
805            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
806                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
807         else if (devinfo->verx10 >= 75)
808            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
809                                  0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
810         else
811            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
812                                  0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
813
814      case FS_OPCODE_LOAD_LIVE_CHANNELS:
815         if (devinfo->ver >= 11)
816            return calculate_desc(info, EU_UNIT_FPU, 2 /* XXX */, 0, 0,
817                                  2 /* XXX */, 0,
818                                  0, 0, 0, 10 /* XXX */, 0, 0);
819         else if (devinfo->ver >= 8)
820            return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
821                                  0, 2 /* XXX */,
822                                  0, 0, 0, 8 /* XXX */, 0, 0);
823         else
824            abort();
825
826      case VEC4_OPCODE_PACK_BYTES:
827         if (devinfo->ver >= 8)
828            return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
829                                  4 /* XXX */, 0,
830                                  0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
831                                  0, 0);
832         else if (devinfo->verx10 >= 75)
833            return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
834                                  4 /* XXX */, 0,
835                                  0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
836                                  0, 0);
837         else
838            return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
839                                  4 /* XXX */, 0,
840                                  0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
841                                  0, 0);
842
843      case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
844      case TCS_OPCODE_GET_INSTANCE_ID:
845      case VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS:
846      case VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
847      case TES_OPCODE_CREATE_INPUT_READ_HEADER:
848         if (devinfo->ver >= 8)
849            return calculate_desc(info, EU_UNIT_FPU, 22 /* XXX */, 0, 0,
850                                  6 /* XXX */, 0,
851                                  0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
852                                  0, 0);
853         else if (devinfo->verx10 >= 75)
854            return calculate_desc(info, EU_UNIT_FPU, 26 /* XXX */, 0, 0,
855                                  6 /* XXX */, 0,
856                                  0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
857                                  0, 0);
858         else
859            return calculate_desc(info, EU_UNIT_FPU, 30 /* XXX */, 0, 0,
860                                  6 /* XXX */, 0,
861                                  0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
862                                  0, 0);
863
864      case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
865      case TCS_OPCODE_CREATE_BARRIER_HEADER:
866         if (devinfo->ver >= 8)
867            return calculate_desc(info, EU_UNIT_FPU, 32 /* XXX */, 0, 0,
868                                  8 /* XXX */, 0,
869                                  0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
870                                  0, 0);
871         else if (devinfo->verx10 >= 75)
872            return calculate_desc(info, EU_UNIT_FPU, 38 /* XXX */, 0, 0,
873                                  8 /* XXX */, 0,
874                                  0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
875                                  0, 0);
876         else if (devinfo->ver >= 6)
877            return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
878                                  8 /* XXX */, 0,
879                                  0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
880                                  0, 0);
881         else
882            abort();
883
884      case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
885         if (devinfo->ver >= 8)
886            return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 0, 0,
887                                  4 /* XXX */, 0,
888                                  0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
889                                  0, 0);
890         else if (devinfo->verx10 >= 75)
891            return calculate_desc(info, EU_UNIT_FPU, 14 /* XXX */, 0, 0,
892                                  4 /* XXX */, 0,
893                                  0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
894                                  0, 0);
895         else if (devinfo->ver >= 7)
896            return calculate_desc(info, EU_UNIT_FPU, 16 /* XXX */, 0, 0,
897                                  4 /* XXX */, 0,
898                                  0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
899                                  0, 0);
900         else
901            abort();
902
903      case SHADER_OPCODE_TEX:
904      case FS_OPCODE_TXB:
905      case SHADER_OPCODE_TXD:
906      case SHADER_OPCODE_TXF:
907      case SHADER_OPCODE_TXF_LZ:
908      case SHADER_OPCODE_TXL:
909      case SHADER_OPCODE_TXL_LZ:
910      case SHADER_OPCODE_TXF_CMS:
911      case SHADER_OPCODE_TXF_CMS_W:
912      case SHADER_OPCODE_TXF_UMS:
913      case SHADER_OPCODE_TXF_MCS:
914      case SHADER_OPCODE_TXS:
915      case SHADER_OPCODE_LOD:
916      case SHADER_OPCODE_GET_BUFFER_SIZE:
917      case SHADER_OPCODE_TG4:
918      case SHADER_OPCODE_TG4_OFFSET:
919      case SHADER_OPCODE_SAMPLEINFO:
920      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
921         return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16 /* XXX */,
922                               8 /* XXX */, 750 /* XXX */, 0, 0,
923                               2 /* XXX */, 0);
924
925      case VEC4_OPCODE_URB_READ:
926      case VEC4_VS_OPCODE_URB_WRITE:
927      case VEC4_GS_OPCODE_URB_WRITE:
928      case VEC4_GS_OPCODE_URB_WRITE_ALLOCATE:
929      case GS_OPCODE_THREAD_END:
930      case GS_OPCODE_FF_SYNC:
931      case VEC4_TCS_OPCODE_URB_WRITE:
932      case TCS_OPCODE_RELEASE_INPUT:
933      case TCS_OPCODE_THREAD_END:
934         return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
935                               32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
936
937      case SHADER_OPCODE_MEMORY_FENCE:
938      case SHADER_OPCODE_INTERLOCK:
939         switch (info.sfid) {
940         case GFX6_SFID_DATAPORT_RENDER_CACHE:
941            if (devinfo->ver >= 7)
942               return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 30 /* XXX */, 0,
943                                     10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
944            else
945               abort();
946
947         case BRW_SFID_URB:
948         case GFX7_SFID_DATAPORT_DATA_CACHE:
949         case GFX12_SFID_SLM:
950         case GFX12_SFID_TGM:
951         case GFX12_SFID_UGM:
952         case HSW_SFID_DATAPORT_DATA_CACHE_1:
953            if (devinfo->ver >= 7)
954               return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 30 /* XXX */, 0,
955                                     10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
956            else
957               abort();
958
959         default:
960            abort();
961         }
962
963      case SHADER_OPCODE_GFX4_SCRATCH_READ:
964      case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
965      case SHADER_OPCODE_GFX7_SCRATCH_READ:
966         return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 0, 8 /* XXX */,
967                               10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
968
969      case VEC4_OPCODE_UNTYPED_ATOMIC:
970         if (devinfo->ver >= 7)
971            return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
972                                  30 /* XXX */, 400 /* XXX */,
973                                  10 /* XXX */, 100 /* XXX */, 0, 0,
974                                  0, 400 /* XXX */);
975         else
976            abort();
977
978      case VEC4_OPCODE_UNTYPED_SURFACE_READ:
979      case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
980         if (devinfo->ver >= 7)
981            return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
982                                  0, 20 /* XXX */,
983                                  10 /* XXX */, 100 /* XXX */, 0, 0,
984                                  0, 0);
985         else
986            abort();
987
988      case FS_OPCODE_FB_WRITE:
989      case FS_OPCODE_FB_READ:
990      case FS_OPCODE_REP_FB_WRITE:
991         return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 0, 450 /* XXX */,
992                               10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
993
994      case GS_OPCODE_SVB_WRITE:
995         if (devinfo->ver >= 6)
996            return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0,
997                                  0, 450 /* XXX */,
998                                  10 /* XXX */, 300 /* XXX */, 0, 0,
999                                  0, 0);
1000         else
1001            abort();
1002
1003      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1004      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
1005         return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
1006                               10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
1007
1008      case VS_OPCODE_PULL_CONSTANT_LOAD:
1009      case VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
1010         return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
1011                               8, 750, 0, 0, 2, 0);
1012
1013      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1014      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1015      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1016         if (devinfo->ver >= 7)
1017            return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
1018                                  0, 90 /* XXX */, 0, 0, 0, 0);
1019         else
1020            abort();
1021
1022      case SHADER_OPCODE_BARRIER:
1023         if (devinfo->ver >= 7)
1024            return calculate_desc(info, EU_UNIT_GATEWAY, 90 /* XXX */, 0, 0,
1025                                  0 /* XXX */, 0,
1026                                  0, 0, 0, 0, 0, 0);
1027         else
1028            abort();
1029
1030      case CS_OPCODE_CS_TERMINATE:
1031         if (devinfo->ver >= 7)
1032            return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
1033                                  10 /* XXX */, 0, 0, 0, 0, 0);
1034         else
1035            abort();
1036
1037      case SHADER_OPCODE_SEND:
1038         switch (info.sfid) {
1039         case GFX6_SFID_DATAPORT_RENDER_CACHE:
1040            if (devinfo->ver >= 7) {
1041               switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1042               case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
1043                  return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
1044                                        30 /* XXX */, 450 /* XXX */,
1045                                        10 /* XXX */, 100 /* XXX */,
1046                                        0, 0, 0, 400 /* XXX */);
1047               default:
1048                  return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
1049                                        0, 450 /* XXX */,
1050                                        10 /* XXX */, 300 /* XXX */, 0, 0,
1051                                        0, 0);
1052               }
1053            } else if (devinfo->ver >= 6)  {
1054               return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0,
1055                                     0, 450 /* XXX */,
1056                                     10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
1057            } else {
1058               abort();
1059            }
1060         case BRW_SFID_SAMPLER: {
1061            if (devinfo->ver >= 6)
1062               return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
1063                                     8, 750, 0, 0, 2, 0);
1064            else
1065               abort();
1066         }
1067         case GFX7_SFID_DATAPORT_DATA_CACHE:
1068         case HSW_SFID_DATAPORT_DATA_CACHE_1:
1069            if (devinfo->verx10 >= 75) {
1070               switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1071               case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
1072               case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
1073               case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
1074               case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
1075                  return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1076                                        30 /* XXX */, 400 /* XXX */,
1077                                        10 /* XXX */, 100 /* XXX */, 0, 0,
1078                                        0, 400 /* XXX */);
1079
1080               default:
1081                  return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1082                                        0, 20 /* XXX */,
1083                                        10 /* XXX */, 100 /* XXX */, 0, 0,
1084                                        0, 0);
1085               }
1086            } else if (devinfo->ver >= 7) {
1087               switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1088               case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
1089                  return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1090                                        30 /* XXX */, 400 /* XXX */,
1091                                        10 /* XXX */, 100 /* XXX */,
1092                                        0, 0, 0, 400 /* XXX */);
1093               default:
1094                  return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1095                                        0, 20 /* XXX */,
1096                                        10 /* XXX */, 100 /* XXX */, 0, 0,
1097                                        0, 0);
1098               }
1099            } else {
1100               abort();
1101            }
1102
1103         case GFX12_SFID_UGM:
1104         case GFX12_SFID_TGM:
1105         case GFX12_SFID_SLM:
1106            switch (lsc_msg_desc_opcode(devinfo, info.desc)) {
1107            case LSC_OP_LOAD:
1108            case LSC_OP_STORE:
1109            case LSC_OP_LOAD_CMASK:
1110            case LSC_OP_STORE_CMASK:
1111               return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1112                                     0, 20 /* XXX */,
1113                                     10 /* XXX */, 100 /* XXX */, 0, 0,
1114                                     0, 0);
1115
1116            case LSC_OP_FENCE:
1117            case LSC_OP_ATOMIC_INC:
1118            case LSC_OP_ATOMIC_DEC:
1119            case LSC_OP_ATOMIC_LOAD:
1120            case LSC_OP_ATOMIC_STORE:
1121            case LSC_OP_ATOMIC_ADD:
1122            case LSC_OP_ATOMIC_SUB:
1123            case LSC_OP_ATOMIC_MIN:
1124            case LSC_OP_ATOMIC_MAX:
1125            case LSC_OP_ATOMIC_UMIN:
1126            case LSC_OP_ATOMIC_UMAX:
1127            case LSC_OP_ATOMIC_CMPXCHG:
1128            case LSC_OP_ATOMIC_FADD:
1129            case LSC_OP_ATOMIC_FSUB:
1130            case LSC_OP_ATOMIC_FMIN:
1131            case LSC_OP_ATOMIC_FMAX:
1132            case LSC_OP_ATOMIC_FCMPXCHG:
1133            case LSC_OP_ATOMIC_AND:
1134            case LSC_OP_ATOMIC_OR:
1135            case LSC_OP_ATOMIC_XOR:
1136               return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1137                                     30 /* XXX */, 400 /* XXX */,
1138                                     10 /* XXX */, 100 /* XXX */, 0, 0,
1139                                     0, 400 /* XXX */);
1140            default:
1141               abort();
1142            }
1143
1144         case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH:
1145         case GEN_RT_SFID_RAY_TRACE_ACCELERATOR:
1146            return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
1147                                  10 /* XXX */, 0, 0, 0, 0, 0);
1148
1149         case BRW_SFID_URB:
1150            return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
1151                                  32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
1152
1153         default:
1154            abort();
1155         }
1156
1157      case SHADER_OPCODE_UNDEF:
1158      case SHADER_OPCODE_HALT_TARGET:
1159      case FS_OPCODE_SCHEDULING_FENCE:
1160         return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
1161                               0, 0, 0, 0, 0, 0);
1162
1163      default:
1164         abort();
1165      }
1166   }
1167
1168   /**
1169    * Model the performance behavior of a stall on the specified dependency
1170    * ID.
1171    */
1172   void
1173   stall_on_dependency(state &st, enum intel_eu_dependency_id id)
1174   {
1175      if (id < ARRAY_SIZE(st.dep_ready))
1176         st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
1177                                       st.dep_ready[id]);
1178   }
1179
1180   /**
1181    * Model the performance behavior of the front-end and back-end while
1182    * executing an instruction with the specified timing information, assuming
1183    * all dependencies are already clear.
1184    */
1185   void
1186   execute_instruction(state &st, const perf_desc &perf)
1187   {
1188      /* Compute the time at which the front-end will be ready to execute the
1189       * next instruction.
1190       */
1191      st.unit_ready[EU_UNIT_FE] += perf.df;
1192
1193      if (perf.u < EU_NUM_UNITS) {
1194         /* Wait for the back-end to be ready to execute this instruction. */
1195         st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
1196                                       st.unit_ready[perf.u]);
1197
1198         /* Compute the time at which the back-end will be ready to execute
1199          * the next instruction, and update the back-end utilization.
1200          */
1201         st.unit_ready[perf.u] = st.unit_ready[EU_UNIT_FE] + perf.db;
1202         st.unit_busy[perf.u] += perf.db * st.weight;
1203      }
1204   }
1205
1206   /**
1207    * Model the performance behavior of a read dependency provided by an
1208    * instruction.
1209    */
1210   void
1211   mark_read_dependency(state &st, const perf_desc &perf,
1212                        enum intel_eu_dependency_id id)
1213   {
1214      if (id < ARRAY_SIZE(st.dep_ready))
1215         st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ls;
1216   }
1217
1218   /**
1219    * Model the performance behavior of a write dependency provided by an
1220    * instruction.
1221    */
1222   void
1223   mark_write_dependency(state &st, const perf_desc &perf,
1224                         enum intel_eu_dependency_id id)
1225   {
1226      if (id >= EU_DEPENDENCY_ID_ACCUM0 && id < EU_DEPENDENCY_ID_FLAG0)
1227         st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.la;
1228      else if (id >= EU_DEPENDENCY_ID_FLAG0 && id < EU_DEPENDENCY_ID_SBID_WR0)
1229         st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.lf;
1230      else if (id < ARRAY_SIZE(st.dep_ready))
1231         st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ld;
1232   }
1233
1234   /**
1235    * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
1236    */
1237   enum intel_eu_dependency_id
1238   reg_dependency_id(const intel_device_info *devinfo, const backend_reg &r,
1239                     const int delta)
1240   {
1241      if (r.file == VGRF) {
1242         const unsigned i = r.nr + r.offset / REG_SIZE + delta;
1243         assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1244         return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1245
1246      } else if (r.file == FIXED_GRF) {
1247         const unsigned i = r.nr + delta;
1248         assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1249         return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1250
1251      } else if (r.file == MRF && devinfo->ver >= 7) {
1252         const unsigned i = GFX7_MRF_HACK_START +
1253                            r.nr + r.offset / REG_SIZE + delta;
1254         assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1255         return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1256
1257      } else if (r.file == MRF && devinfo->ver < 7) {
1258         const unsigned i = (r.nr & ~BRW_MRF_COMPR4) +
1259                            r.offset / REG_SIZE + delta;
1260         assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_MRF0);
1261         return intel_eu_dependency_id(EU_DEPENDENCY_ID_MRF0 + i);
1262
1263      } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
1264                 r.nr < BRW_ARF_ACCUMULATOR) {
1265         assert(delta == 0);
1266         return EU_DEPENDENCY_ID_ADDR0;
1267
1268      } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
1269                 r.nr < BRW_ARF_FLAG) {
1270         const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
1271         assert(i < EU_DEPENDENCY_ID_FLAG0 - EU_DEPENDENCY_ID_ACCUM0);
1272         return intel_eu_dependency_id(EU_DEPENDENCY_ID_ACCUM0 + i);
1273
1274      } else {
1275         return EU_NUM_DEPENDENCY_IDS;
1276      }
1277   }
1278
1279   /**
1280    * Return the dependency ID of flag register starting at offset \p i.
1281    */
1282   enum intel_eu_dependency_id
1283   flag_dependency_id(unsigned i)
1284   {
1285      assert(i < EU_DEPENDENCY_ID_SBID_WR0 - EU_DEPENDENCY_ID_FLAG0);
1286      return intel_eu_dependency_id(EU_DEPENDENCY_ID_FLAG0 + i);
1287   }
1288
1289   /**
1290    * Return the dependency ID corresponding to the SBID read completion
1291    * condition of a Gfx12+ SWSB.
1292    */
1293   enum intel_eu_dependency_id
1294   tgl_swsb_rd_dependency_id(tgl_swsb swsb)
1295   {
1296      if (swsb.mode) {
1297         assert(swsb.sbid <
1298                EU_NUM_DEPENDENCY_IDS - EU_DEPENDENCY_ID_SBID_RD0);
1299         return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_RD0 + swsb.sbid);
1300      } else {
1301         return EU_NUM_DEPENDENCY_IDS;
1302      }
1303   }
1304
1305   /**
1306    * Return the dependency ID corresponding to the SBID write completion
1307    * condition of a Gfx12+ SWSB.
1308    */
1309   enum intel_eu_dependency_id
1310   tgl_swsb_wr_dependency_id(tgl_swsb swsb)
1311   {
1312      if (swsb.mode) {
1313         assert(swsb.sbid <
1314                EU_DEPENDENCY_ID_SBID_RD0 - EU_DEPENDENCY_ID_SBID_WR0);
1315         return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_WR0 + swsb.sbid);
1316      } else {
1317         return EU_NUM_DEPENDENCY_IDS;
1318      }
1319   }
1320
1321   /**
1322    * Return the implicit accumulator register accessed by channel \p i of the
1323    * instruction.
1324    */
1325   unsigned
1326   accum_reg_of_channel(const intel_device_info *devinfo,
1327                        const backend_instruction *inst,
1328                        brw_reg_type tx, unsigned i)
1329   {
1330      assert(inst->reads_accumulator_implicitly() ||
1331             inst->writes_accumulator_implicitly(devinfo));
1332      const unsigned offset = (inst->group + i) * type_sz(tx) *
1333         (devinfo->ver < 7 || brw_reg_type_is_floating_point(tx) ? 1 : 2);
1334      return offset / REG_SIZE % 2;
1335   }
1336
1337   /**
1338    * Model the performance behavior of an FS back-end instruction.
1339    */
1340   void
1341   issue_fs_inst(state &st, const struct brw_isa_info *isa,
1342                 const backend_instruction *be_inst)
1343   {
1344      const struct intel_device_info *devinfo = isa->devinfo;
1345      const fs_inst *inst = static_cast<const fs_inst *>(be_inst);
1346      const instruction_info info(isa, inst);
1347      const perf_desc perf = instruction_desc(info);
1348
1349      /* Stall on any source dependencies. */
1350      for (unsigned i = 0; i < inst->sources; i++) {
1351         for (unsigned j = 0; j < regs_read(inst, i); j++)
1352            stall_on_dependency(
1353               st, reg_dependency_id(devinfo, inst->src[i], j));
1354      }
1355
1356      if (inst->reads_accumulator_implicitly()) {
1357         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1358              j <= accum_reg_of_channel(devinfo, inst, info.tx,
1359                                        inst->exec_size - 1); j++)
1360            stall_on_dependency(
1361               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1362      }
1363
1364      if (is_send(inst) && inst->base_mrf != -1) {
1365         for (unsigned j = 0; j < inst->mlen; j++)
1366            stall_on_dependency(
1367               st, reg_dependency_id(
1368                  devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1369      }
1370
1371      if (const unsigned mask = inst->flags_read(devinfo)) {
1372         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1373            if (mask & (1 << i))
1374               stall_on_dependency(st, flag_dependency_id(i));
1375         }
1376      }
1377
1378      /* Stall on any write dependencies. */
1379      if (!inst->no_dd_check) {
1380         if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1381            for (unsigned j = 0; j < regs_written(inst); j++)
1382               stall_on_dependency(
1383                  st, reg_dependency_id(devinfo, inst->dst, j));
1384         }
1385
1386         if (inst->writes_accumulator_implicitly(devinfo)) {
1387            for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1388                 j <= accum_reg_of_channel(devinfo, inst, info.tx,
1389                                           inst->exec_size - 1); j++)
1390               stall_on_dependency(
1391                  st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1392         }
1393
1394         if (const unsigned mask = inst->flags_written(devinfo)) {
1395            for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1396               if (mask & (1 << i))
1397                  stall_on_dependency(st, flag_dependency_id(i));
1398            }
1399         }
1400      }
1401
1402      /* Stall on any SBID dependencies. */
1403      if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
1404         stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
1405      else if (inst->sched.mode & TGL_SBID_SRC)
1406         stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));
1407
1408      /* Execute the instruction. */
1409      execute_instruction(st, perf);
1410
1411      /* Mark any source dependencies. */
1412      if (inst->is_send_from_grf()) {
1413         for (unsigned i = 0; i < inst->sources; i++) {
1414            if (inst->is_payload(i)) {
1415               for (unsigned j = 0; j < regs_read(inst, i); j++)
1416                  mark_read_dependency(
1417                     st, perf, reg_dependency_id(devinfo, inst->src[i], j));
1418            }
1419         }
1420      }
1421
1422      if (is_send(inst) && inst->base_mrf != -1) {
1423         for (unsigned j = 0; j < inst->mlen; j++)
1424            mark_read_dependency(st, perf,
1425               reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1426      }
1427
1428      /* Mark any destination dependencies. */
1429      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1430         for (unsigned j = 0; j < regs_written(inst); j++) {
1431            mark_write_dependency(st, perf,
1432                                  reg_dependency_id(devinfo, inst->dst, j));
1433         }
1434      }
1435
1436      if (inst->writes_accumulator_implicitly(devinfo)) {
1437         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1438              j <= accum_reg_of_channel(devinfo, inst, info.tx,
1439                                        inst->exec_size - 1); j++)
1440            mark_write_dependency(st, perf,
1441                                  reg_dependency_id(devinfo, brw_acc_reg(8), j));
1442      }
1443
1444      if (const unsigned mask = inst->flags_written(devinfo)) {
1445         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1446            if (mask & (1 << i))
1447               mark_write_dependency(st, perf, flag_dependency_id(i));
1448         }
1449      }
1450
1451      /* Mark any SBID dependencies. */
1452      if (inst->sched.mode & TGL_SBID_SET) {
1453         mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
1454         mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
1455      }
1456   }
1457
1458   /**
1459    * Model the performance behavior of a VEC4 back-end instruction.
1460    */
1461   void
1462   issue_vec4_instruction(state &st, const struct brw_isa_info *isa,
1463                          const backend_instruction *be_inst)
1464   {
1465      const struct intel_device_info *devinfo = isa->devinfo;
1466      const vec4_instruction *inst =
1467         static_cast<const vec4_instruction *>(be_inst);
1468      const instruction_info info(isa, inst);
1469      const perf_desc perf = instruction_desc(info);
1470
1471      /* Stall on any source dependencies. */
1472      for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
1473         for (unsigned j = 0; j < regs_read(inst, i); j++)
1474            stall_on_dependency(
1475               st, reg_dependency_id(devinfo, inst->src[i], j));
1476      }
1477
1478      if (inst->reads_accumulator_implicitly()) {
1479         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1480              j <= accum_reg_of_channel(devinfo, inst, info.tx,
1481                                        inst->exec_size - 1); j++)
1482            stall_on_dependency(
1483               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1484      }
1485
1486      if (inst->base_mrf != -1) {
1487         for (unsigned j = 0; j < inst->mlen; j++)
1488            stall_on_dependency(
1489               st, reg_dependency_id(
1490                  devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1491      }
1492
1493      if (inst->reads_flag())
1494         stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);
1495
1496      /* Stall on any write dependencies. */
1497      if (!inst->no_dd_check) {
1498         if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1499            for (unsigned j = 0; j < regs_written(inst); j++)
1500               stall_on_dependency(
1501                  st, reg_dependency_id(devinfo, inst->dst, j));
1502         }
1503
1504         if (inst->writes_accumulator_implicitly(devinfo)) {
1505            for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1506                 j <= accum_reg_of_channel(devinfo, inst, info.tx,
1507                                           inst->exec_size - 1); j++)
1508               stall_on_dependency(
1509                  st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1510         }
1511
1512         if (inst->writes_flag(devinfo))
1513            stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);
1514      }
1515
1516      /* Execute the instruction. */
1517      execute_instruction(st, perf);
1518
1519      /* Mark any source dependencies. */
1520      if (inst->is_send_from_grf()) {
1521         for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
1522            for (unsigned j = 0; j < regs_read(inst, i); j++)
1523               mark_read_dependency(
1524                  st, perf, reg_dependency_id(devinfo, inst->src[i], j));
1525         }
1526      }
1527
1528      if (inst->base_mrf != -1) {
1529         for (unsigned j = 0; j < inst->mlen; j++)
1530            mark_read_dependency(st, perf,
1531               reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1532      }
1533
1534      /* Mark any destination dependencies. */
1535      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1536         for (unsigned j = 0; j < regs_written(inst); j++) {
1537            mark_write_dependency(st, perf,
1538                                  reg_dependency_id(devinfo, inst->dst, j));
1539         }
1540      }
1541
1542      if (inst->writes_accumulator_implicitly(devinfo)) {
1543         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1544              j <= accum_reg_of_channel(devinfo, inst, info.tx,
1545                                        inst->exec_size - 1); j++)
1546            mark_write_dependency(st, perf,
1547                                  reg_dependency_id(devinfo, brw_acc_reg(8), j));
1548      }
1549
1550      if (inst->writes_flag(devinfo))
1551         mark_write_dependency(st, perf, EU_DEPENDENCY_ID_FLAG0);
1552   }
1553
1554   /**
1555    * Calculate the maximum possible throughput of the program compatible with
1556    * the cycle-count utilization estimated for each asynchronous unit, in
1557    * threads-per-cycle units.
1558    */
1559   float
1560   calculate_thread_throughput(const state &st, float busy)
1561   {
1562      for (unsigned i = 0; i < EU_NUM_UNITS; i++)
1563         busy = MAX2(busy, st.unit_busy[i]);
1564
1565      return 1.0 / busy;
1566   }
1567
1568   /**
1569    * Estimate the performance of the specified shader.
1570    */
1571   void
1572   calculate_performance(performance &p, const backend_shader *s,
1573                         void (*issue_instruction)(
1574                            state &, const struct brw_isa_info *,
1575                            const backend_instruction *),
1576                         unsigned dispatch_width)
1577   {
1578      /* XXX - Note that the previous version of this code used worst-case
1579       *       scenario estimation of branching divergence for SIMD32 shaders,
1580       *       but this heuristic was removed to improve performance in common
1581       *       scenarios. Wider shader variants are less optimal when divergence
1582       *       is high, e.g. when application renders complex scene on a small
1583       *       surface. It is assumed that such renders are short, so their
1584       *       time doesn't matter and when it comes to the overall performance,
1585       *       they are dominated by more optimal larger renders.
1586       *
1587       *       It's possible that we could do better with divergence analysis
1588       *       by isolating branches which are 100% uniform.
1589       *
1590       *       Plumbing the trip counts from NIR loop analysis would allow us
1591       *       to do a better job regarding the loop weights.
1592       *
1593       *       In the meantime use values that roughly match the control flow
1594       *       weights used elsewhere in the compiler back-end.
1595       *
1596       *       Note that we provide slightly more pessimistic weights on
1597       *       Gfx12+ for SIMD32, since the effective warp size on that
1598       *       platform is 2x the SIMD width due to EU fusion, which increases
1599       *       the likelihood of divergent control flow in comparison to
1600       *       previous generations, giving narrower SIMD modes a performance
1601       *       advantage in several test-cases with non-uniform discard jumps.
1602       */
1603      const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ?
1604                                    1.0 : 0.5);
1605      const float loop_weight = 10;
1606      unsigned halt_count = 0;
1607      unsigned elapsed = 0;
1608      state st;
1609
1610      foreach_block(block, s->cfg) {
1611         const unsigned elapsed0 = elapsed;
1612
1613         foreach_inst_in_block(backend_instruction, inst, block) {
1614            const unsigned clock0 = st.unit_ready[EU_UNIT_FE];
1615
1616            issue_instruction(st, &s->compiler->isa, inst);
1617
1618            if (inst->opcode == SHADER_OPCODE_HALT_TARGET && halt_count)
1619               st.weight /= discard_weight;
1620
1621            elapsed += (st.unit_ready[EU_UNIT_FE] - clock0) * st.weight;
1622
1623            if (inst->opcode == BRW_OPCODE_DO)
1624               st.weight *= loop_weight;
1625            else if (inst->opcode == BRW_OPCODE_WHILE)
1626               st.weight /= loop_weight;
1627            else if (inst->opcode == BRW_OPCODE_HALT && !halt_count++)
1628               st.weight *= discard_weight;
1629         }
1630
1631         p.block_latency[block->num] = elapsed - elapsed0;
1632      }
1633
1634      p.latency = elapsed;
1635      p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
1636   }
1637}
1638
1639brw::performance::performance(const fs_visitor *v) :
1640   block_latency(new unsigned[v->cfg->num_blocks])
1641{
1642   calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
1643}
1644
1645brw::performance::performance(const vec4_visitor *v) :
1646   block_latency(new unsigned[v->cfg->num_blocks])
1647{
1648   calculate_performance(*this, v, issue_vec4_instruction, 8);
1649}
1650
1651brw::performance::~performance()
1652{
1653   delete[] block_latency;
1654}
1655