1/* 2 * Copyright © 2020 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "brw_eu.h" 25#include "brw_fs.h" 26#include "brw_vec4.h" 27#include "brw_cfg.h" 28 29using namespace brw; 30 31namespace { 32 /** 33 * Enumeration representing the various asynchronous units that can run 34 * computations in parallel on behalf of a shader thread. 35 */ 36 enum intel_eu_unit { 37 /** EU front-end. */ 38 EU_UNIT_FE, 39 /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */ 40 EU_UNIT_FPU, 41 /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */ 42 EU_UNIT_EM, 43 /** Sampler shared function. */ 44 EU_UNIT_SAMPLER, 45 /** Pixel Interpolator shared function. */ 46 EU_UNIT_PI, 47 /** Unified Return Buffer shared function. */ 48 EU_UNIT_URB, 49 /** Data Port Data Cache shared function. */ 50 EU_UNIT_DP_DC, 51 /** Data Port Render Cache shared function. */ 52 EU_UNIT_DP_RC, 53 /** Data Port Constant Cache shared function. */ 54 EU_UNIT_DP_CC, 55 /** Message Gateway shared function. */ 56 EU_UNIT_GATEWAY, 57 /** Thread Spawner shared function. */ 58 EU_UNIT_SPAWNER, 59 /* EU_UNIT_VME, */ 60 /* EU_UNIT_CRE, */ 61 /** Number of asynchronous units currently tracked. */ 62 EU_NUM_UNITS, 63 /** Dummy unit for instructions that don't consume runtime from the above. */ 64 EU_UNIT_NULL = EU_NUM_UNITS 65 }; 66 67 /** 68 * Enumeration representing a computation result another computation can 69 * potentially depend on. 70 */ 71 enum intel_eu_dependency_id { 72 /* Register part of the GRF. */ 73 EU_DEPENDENCY_ID_GRF0 = 0, 74 /* Register part of the MRF. Only used on Gfx4-6. */ 75 EU_DEPENDENCY_ID_MRF0 = EU_DEPENDENCY_ID_GRF0 + BRW_MAX_GRF, 76 /* Address register part of the ARF. */ 77 EU_DEPENDENCY_ID_ADDR0 = EU_DEPENDENCY_ID_MRF0 + 24, 78 /* Accumulator register part of the ARF. */ 79 EU_DEPENDENCY_ID_ACCUM0 = EU_DEPENDENCY_ID_ADDR0 + 1, 80 /* Flag register part of the ARF. */ 81 EU_DEPENDENCY_ID_FLAG0 = EU_DEPENDENCY_ID_ACCUM0 + 12, 82 /* SBID token write completion. Only used on Gfx12+. */ 83 EU_DEPENDENCY_ID_SBID_WR0 = EU_DEPENDENCY_ID_FLAG0 + 8, 84 /* SBID token read completion. Only used on Gfx12+. */ 85 EU_DEPENDENCY_ID_SBID_RD0 = EU_DEPENDENCY_ID_SBID_WR0 + 16, 86 /* Number of computation dependencies currently tracked. */ 87 EU_NUM_DEPENDENCY_IDS = EU_DEPENDENCY_ID_SBID_RD0 + 16 88 }; 89 90 /** 91 * State of our modeling of the program execution. 92 */ 93 struct state { 94 state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {} 95 /** 96 * Time at which a given unit will be ready to execute the next 97 * computation, in clock units. 98 */ 99 unsigned unit_ready[EU_NUM_UNITS]; 100 /** 101 * Time at which an instruction dependent on a given dependency ID will 102 * be ready to execute, in clock units. 103 */ 104 unsigned dep_ready[EU_NUM_DEPENDENCY_IDS]; 105 /** 106 * Aggregated utilization of a given unit excluding idle cycles, 107 * in clock units. 108 */ 109 float unit_busy[EU_NUM_UNITS]; 110 /** 111 * Factor of the overhead of a computation accounted for in the 112 * aggregated utilization calculation. 113 */ 114 float weight; 115 }; 116 117 /** 118 * Information derived from an IR instruction used to compute performance 119 * estimates. Allows the timing calculation to work on both FS and VEC4 120 * instructions. 121 */ 122 struct instruction_info { 123 instruction_info(const struct brw_isa_info *isa, const fs_inst *inst) : 124 isa(isa), devinfo(isa->devinfo), op(inst->opcode), 125 td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)), 126 tx(get_exec_type(inst)), sx(0), ss(0), 127 sc(has_bank_conflict(isa, inst) ? sd : 0), 128 desc(inst->desc), sfid(inst->sfid) 129 { 130 /* We typically want the maximum source size, except for split send 131 * messages which require the total size. 132 */ 133 if (inst->opcode == SHADER_OPCODE_SEND) { 134 ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) + 135 DIV_ROUND_UP(inst->size_read(3), REG_SIZE); 136 } else { 137 for (unsigned i = 0; i < inst->sources; i++) 138 ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE)); 139 } 140 141 /* Convert the execution size to GRF units. */ 142 sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE); 143 144 /* 32x32 integer multiplication has half the usual ALU throughput. 145 * Treat it as double-precision. 146 */ 147 if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) && 148 !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 && 149 type_sz(inst->src[0].type) == type_sz(inst->src[1].type)) 150 tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D); 151 } 152 153 instruction_info(const struct brw_isa_info *isa, 154 const vec4_instruction *inst) : 155 isa(isa), devinfo(isa->devinfo), op(inst->opcode), 156 td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)), 157 tx(get_exec_type(inst)), sx(0), ss(0), sc(0), 158 desc(inst->desc), sfid(inst->sfid) 159 { 160 /* Compute the maximum source size. */ 161 for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) 162 ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE)); 163 164 /* Convert the execution size to GRF units. */ 165 sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE); 166 167 /* 32x32 integer multiplication has half the usual ALU throughput. 168 * Treat it as double-precision. 169 */ 170 if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) && 171 !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 && 172 type_sz(inst->src[0].type) == type_sz(inst->src[1].type)) 173 tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D); 174 } 175 176 /** ISA encoding information */ 177 const struct brw_isa_info *isa; 178 /** Device information. */ 179 const struct intel_device_info *devinfo; 180 /** Instruction opcode. */ 181 opcode op; 182 /** Destination type. */ 183 brw_reg_type td; 184 /** Destination size in GRF units. */ 185 unsigned sd; 186 /** Execution type. */ 187 brw_reg_type tx; 188 /** Execution size in GRF units. */ 189 unsigned sx; 190 /** Source size. */ 191 unsigned ss; 192 /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */ 193 unsigned sc; 194 /** Send message descriptor. */ 195 uint32_t desc; 196 /** Send message shared function ID. */ 197 uint8_t sfid; 198 }; 199 200 /** 201 * Timing information of an instruction used to estimate the performance of 202 * the program. 203 */ 204 struct perf_desc { 205 perf_desc(enum intel_eu_unit u, int df, int db, 206 int ls, int ld, int la, int lf) : 207 u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {} 208 209 /** 210 * Back-end unit its runtime shall be accounted to, in addition to the 211 * EU front-end which is always assumed to be involved. 212 */ 213 enum intel_eu_unit u; 214 /** 215 * Overhead cycles from the time that the EU front-end starts executing 216 * the instruction until it's ready to execute the next instruction. 217 */ 218 int df; 219 /** 220 * Overhead cycles from the time that the back-end starts executing the 221 * instruction until it's ready to execute the next instruction. 222 */ 223 int db; 224 /** 225 * Latency cycles from the time that the back-end starts executing the 226 * instruction until its sources have been read from the register file. 227 */ 228 int ls; 229 /** 230 * Latency cycles from the time that the back-end starts executing the 231 * instruction until its regular destination has been written to the 232 * register file. 233 */ 234 int ld; 235 /** 236 * Latency cycles from the time that the back-end starts executing the 237 * instruction until its accumulator destination has been written to the 238 * ARF file. 239 * 240 * Note that this is an approximation of the real behavior of 241 * accumulating instructions in the hardware: Instead of modeling a pair 242 * of back-to-back accumulating instructions as a first computation with 243 * latency equal to ld followed by another computation with a 244 * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we 245 * model the stall as if it occurred at the top of the pipeline, with 246 * the latency of the accumulator computation offset accordingly. 247 */ 248 int la; 249 /** 250 * Latency cycles from the time that the back-end starts executing the 251 * instruction until its flag destination has been written to the ARF 252 * file. 253 */ 254 int lf; 255 }; 256 257 /** 258 * Compute the timing information of an instruction based on any relevant 259 * information from the IR and a number of parameters specifying a linear 260 * approximation: Parameter X_Y specifies the derivative of timing X 261 * relative to info field Y, while X_1 specifies the independent term of 262 * the approximation of timing X. 263 */ 264 perf_desc 265 calculate_desc(const instruction_info &info, enum intel_eu_unit u, 266 int df_1, int df_sd, int df_sc, 267 int db_1, int db_sx, 268 int ls_1, int ld_1, int la_1, int lf_1, 269 int l_ss, int l_sd) 270 { 271 return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc), 272 db_1 + db_sx * int(info.sx), 273 ls_1 + l_ss * int(info.ss), 274 ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd), 275 la_1, lf_1); 276 } 277 278 /** 279 * Compute the timing information of an instruction based on any relevant 280 * information from the IR and a number of linear approximation parameters 281 * hard-coded for each IR instruction. 282 * 283 * Most timing parameters are obtained from the multivariate linear 284 * regression of a sample of empirical timings measured using the tm0 285 * register (as can be done today by using the shader_time debugging 286 * option). The Gfx4-5 math timings are obtained from BSpec Volume 5c.3 287 * "Shared Functions - Extended Math", Section 3.2 "Performance". 288 * Parameters marked XXX shall be considered low-quality, they're possibly 289 * high variance or completely guessed in cases where experimental data was 290 * unavailable. 291 */ 292 const perf_desc 293 instruction_desc(const instruction_info &info) 294 { 295 const struct intel_device_info *devinfo = info.devinfo; 296 297 switch (info.op) { 298 case BRW_OPCODE_SYNC: 299 case BRW_OPCODE_SEL: 300 case BRW_OPCODE_NOT: 301 case BRW_OPCODE_AND: 302 case BRW_OPCODE_OR: 303 case BRW_OPCODE_XOR: 304 case BRW_OPCODE_SHR: 305 case BRW_OPCODE_SHL: 306 case BRW_OPCODE_DIM: 307 case BRW_OPCODE_ASR: 308 case BRW_OPCODE_CMPN: 309 case BRW_OPCODE_F16TO32: 310 case BRW_OPCODE_BFREV: 311 case BRW_OPCODE_BFI1: 312 case BRW_OPCODE_AVG: 313 case BRW_OPCODE_FRC: 314 case BRW_OPCODE_RNDU: 315 case BRW_OPCODE_RNDD: 316 case BRW_OPCODE_RNDE: 317 case BRW_OPCODE_RNDZ: 318 case BRW_OPCODE_MAC: 319 case BRW_OPCODE_MACH: 320 case BRW_OPCODE_LZD: 321 case BRW_OPCODE_FBH: 322 case BRW_OPCODE_FBL: 323 case BRW_OPCODE_CBIT: 324 case BRW_OPCODE_ADDC: 325 case BRW_OPCODE_ROR: 326 case BRW_OPCODE_ROL: 327 case BRW_OPCODE_SUBB: 328 case BRW_OPCODE_SAD2: 329 case BRW_OPCODE_SADA2: 330 case BRW_OPCODE_LINE: 331 case BRW_OPCODE_NOP: 332 case SHADER_OPCODE_CLUSTER_BROADCAST: 333 case SHADER_OPCODE_SCRATCH_HEADER: 334 case FS_OPCODE_DDX_COARSE: 335 case FS_OPCODE_DDX_FINE: 336 case FS_OPCODE_DDY_COARSE: 337 case FS_OPCODE_PIXEL_X: 338 case FS_OPCODE_PIXEL_Y: 339 case FS_OPCODE_SET_SAMPLE_ID: 340 case VEC4_OPCODE_MOV_BYTES: 341 case VEC4_OPCODE_UNPACK_UNIFORM: 342 case VEC4_OPCODE_DOUBLE_TO_F32: 343 case VEC4_OPCODE_DOUBLE_TO_D32: 344 case VEC4_OPCODE_DOUBLE_TO_U32: 345 case VEC4_OPCODE_TO_DOUBLE: 346 case VEC4_OPCODE_PICK_LOW_32BIT: 347 case VEC4_OPCODE_PICK_HIGH_32BIT: 348 case VEC4_OPCODE_SET_LOW_32BIT: 349 case VEC4_OPCODE_SET_HIGH_32BIT: 350 case VEC4_OPCODE_ZERO_OOB_PUSH_REGS: 351 case GS_OPCODE_SET_DWORD_2: 352 case GS_OPCODE_SET_WRITE_OFFSET: 353 case GS_OPCODE_SET_VERTEX_COUNT: 354 case GS_OPCODE_PREPARE_CHANNEL_MASKS: 355 case GS_OPCODE_SET_CHANNEL_MASKS: 356 case GS_OPCODE_GET_INSTANCE_ID: 357 case GS_OPCODE_SET_PRIMITIVE_ID: 358 case GS_OPCODE_SVB_SET_DST_INDEX: 359 case TCS_OPCODE_SRC0_010_IS_ZERO: 360 case TCS_OPCODE_GET_PRIMITIVE_ID: 361 case TES_OPCODE_GET_PRIMITIVE_ID: 362 case SHADER_OPCODE_READ_SR_REG: 363 if (devinfo->ver >= 11) { 364 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 365 0, 10, 6 /* XXX */, 14, 0, 0); 366 } else if (devinfo->ver >= 8) { 367 if (type_sz(info.tx) > 4) 368 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, 369 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); 370 else 371 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 372 0, 8, 4, 12, 0, 0); 373 } else if (devinfo->verx10 >= 75) { 374 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 375 0, 10, 6 /* XXX */, 16, 0, 0); 376 } else { 377 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 378 0, 12, 8 /* XXX */, 18, 0, 0); 379 } 380 381 case BRW_OPCODE_MOV: 382 case BRW_OPCODE_CMP: 383 case BRW_OPCODE_ADD: 384 case BRW_OPCODE_ADD3: 385 case BRW_OPCODE_MUL: 386 case SHADER_OPCODE_MOV_RELOC_IMM: 387 case VEC4_OPCODE_MOV_FOR_SCRATCH: 388 if (devinfo->ver >= 11) { 389 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 390 0, 10, 6, 14, 0, 0); 391 } else if (devinfo->ver >= 8) { 392 if (type_sz(info.tx) > 4) 393 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, 394 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); 395 else 396 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 397 0, 8, 4, 12, 0, 0); 398 } else if (devinfo->verx10 >= 75) { 399 if (info.tx == BRW_REGISTER_TYPE_F) 400 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 401 0, 12, 8 /* XXX */, 18, 0, 0); 402 else 403 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 404 0, 10, 6 /* XXX */, 16, 0, 0); 405 } else if (devinfo->ver >= 7) { 406 if (info.tx == BRW_REGISTER_TYPE_F) 407 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 408 0, 14, 10 /* XXX */, 20, 0, 0); 409 else 410 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 411 0, 12, 8 /* XXX */, 18, 0, 0); 412 } else { 413 return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0, 414 0, 2 /* XXX */, 415 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, 416 0, 0); 417 } 418 419 case BRW_OPCODE_BFE: 420 case BRW_OPCODE_BFI2: 421 case BRW_OPCODE_CSEL: 422 if (devinfo->ver >= 11) 423 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, 424 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); 425 else if (devinfo->ver >= 8) 426 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, 427 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); 428 else if (devinfo->verx10 >= 75) 429 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, 430 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); 431 else if (devinfo->ver >= 7) 432 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, 433 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); 434 else 435 abort(); 436 437 case BRW_OPCODE_MAD: 438 if (devinfo->ver >= 11) { 439 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, 440 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); 441 } else if (devinfo->ver >= 8) { 442 if (type_sz(info.tx) > 4) 443 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4, 444 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); 445 else 446 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, 447 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); 448 } else if (devinfo->verx10 >= 75) { 449 if (info.tx == BRW_REGISTER_TYPE_F) 450 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, 451 0, 12, 8 /* XXX */, 18, 0, 0); 452 else 453 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, 454 0, 10, 6 /* XXX */, 16, 0, 0); 455 } else if (devinfo->ver >= 7) { 456 if (info.tx == BRW_REGISTER_TYPE_F) 457 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, 458 0, 14, 10 /* XXX */, 20, 0, 0); 459 else 460 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, 461 0, 12, 8 /* XXX */, 18, 0, 0); 462 } else if (devinfo->ver >= 6) { 463 return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 1 /* XXX */, 464 0, 2 /* XXX */, 465 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, 466 0, 0); 467 } else { 468 abort(); 469 } 470 471 case BRW_OPCODE_F32TO16: 472 if (devinfo->ver >= 11) 473 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, 474 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); 475 else if (devinfo->ver >= 8) 476 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, 477 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); 478 else if (devinfo->verx10 >= 75) 479 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, 480 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); 481 else if (devinfo->ver >= 7) 482 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, 483 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); 484 else 485 abort(); 486 487 case BRW_OPCODE_DP4: 488 case BRW_OPCODE_DPH: 489 case BRW_OPCODE_DP3: 490 case BRW_OPCODE_DP2: 491 if (devinfo->ver >= 8) 492 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 493 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); 494 else if (devinfo->verx10 >= 75) 495 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 496 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); 497 else 498 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 499 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); 500 501 case BRW_OPCODE_DP4A: 502 if (devinfo->ver >= 12) 503 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, 504 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); 505 else 506 abort(); 507 508 case SHADER_OPCODE_RCP: 509 case SHADER_OPCODE_RSQ: 510 case SHADER_OPCODE_SQRT: 511 case SHADER_OPCODE_EXP2: 512 case SHADER_OPCODE_LOG2: 513 case SHADER_OPCODE_SIN: 514 case SHADER_OPCODE_COS: 515 case SHADER_OPCODE_POW: 516 case SHADER_OPCODE_INT_QUOTIENT: 517 case SHADER_OPCODE_INT_REMAINDER: 518 if (devinfo->ver >= 6) { 519 switch (info.op) { 520 case SHADER_OPCODE_RCP: 521 case SHADER_OPCODE_RSQ: 522 case SHADER_OPCODE_SQRT: 523 case SHADER_OPCODE_EXP2: 524 case SHADER_OPCODE_LOG2: 525 case SHADER_OPCODE_SIN: 526 case SHADER_OPCODE_COS: 527 if (devinfo->ver >= 8) 528 return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 4, 529 0, 16, 0, 0, 0, 0); 530 else if (devinfo->verx10 >= 75) 531 return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2, 532 0, 12, 0, 0, 0, 0); 533 else 534 return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2, 535 0, 14, 0, 0, 0, 0); 536 537 case SHADER_OPCODE_POW: 538 if (devinfo->ver >= 8) 539 return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 8, 540 0, 24, 0, 0, 0, 0); 541 else if (devinfo->verx10 >= 75) 542 return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4, 543 0, 20, 0, 0, 0, 0); 544 else 545 return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4, 546 0, 22, 0, 0, 0, 0); 547 548 case SHADER_OPCODE_INT_QUOTIENT: 549 case SHADER_OPCODE_INT_REMAINDER: 550 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 26, 0, 551 0, 28 /* XXX */, 0, 0, 0, 0); 552 553 default: 554 abort(); 555 } 556 } else { 557 switch (info.op) { 558 case SHADER_OPCODE_RCP: 559 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 8, 560 0, 22, 0, 0, 0, 8); 561 562 case SHADER_OPCODE_RSQ: 563 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 16, 564 0, 44, 0, 0, 0, 8); 565 566 case SHADER_OPCODE_INT_QUOTIENT: 567 case SHADER_OPCODE_SQRT: 568 case SHADER_OPCODE_LOG2: 569 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 24, 570 0, 66, 0, 0, 0, 8); 571 572 case SHADER_OPCODE_INT_REMAINDER: 573 case SHADER_OPCODE_EXP2: 574 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 32, 575 0, 88, 0, 0, 0, 8); 576 577 case SHADER_OPCODE_SIN: 578 case SHADER_OPCODE_COS: 579 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 48, 580 0, 132, 0, 0, 0, 8); 581 582 case SHADER_OPCODE_POW: 583 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 64, 584 0, 176, 0, 0, 0, 8); 585 586 default: 587 abort(); 588 } 589 } 590 591 case BRW_OPCODE_DO: 592 if (devinfo->ver >= 6) 593 return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0, 594 0, 0, 0, 0, 0, 0); 595 else 596 return calculate_desc(info, EU_UNIT_NULL, 2 /* XXX */, 0, 0, 0, 0, 597 0, 0, 0, 0, 0, 0); 598 599 case BRW_OPCODE_IF: 600 case BRW_OPCODE_ELSE: 601 case BRW_OPCODE_ENDIF: 602 case BRW_OPCODE_WHILE: 603 case BRW_OPCODE_BREAK: 604 case BRW_OPCODE_CONTINUE: 605 case BRW_OPCODE_HALT: 606 if (devinfo->ver >= 8) 607 return calculate_desc(info, EU_UNIT_NULL, 8, 0, 0, 0, 0, 608 0, 0, 0, 0, 0, 0); 609 else if (devinfo->verx10 >= 75) 610 return calculate_desc(info, EU_UNIT_NULL, 6, 0, 0, 0, 0, 611 0, 0, 0, 0, 0, 0); 612 else 613 return calculate_desc(info, EU_UNIT_NULL, 2, 0, 0, 0, 0, 614 0, 0, 0, 0, 0, 0); 615 616 case FS_OPCODE_LINTERP: 617 if (devinfo->ver >= 8) 618 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, 619 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); 620 else if (devinfo->verx10 >= 75) 621 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 622 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); 623 else 624 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 625 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); 626 627 case BRW_OPCODE_LRP: 628 if (devinfo->ver >= 8) 629 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4, 630 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); 631 else if (devinfo->verx10 >= 75) 632 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, 633 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); 634 else if (devinfo->ver >= 6) 635 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, 636 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); 637 else 638 abort(); 639 640 case FS_OPCODE_PACK_HALF_2x16_SPLIT: 641 if (devinfo->ver >= 11) 642 return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6, 643 0, 10 /* XXX */, 6 /* XXX */, 644 14 /* XXX */, 0, 0); 645 else if (devinfo->ver >= 8) 646 return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6, 647 0, 8 /* XXX */, 4 /* XXX */, 648 12 /* XXX */, 0, 0); 649 else if (devinfo->verx10 >= 75) 650 return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6, 651 0, 10 /* XXX */, 6 /* XXX */, 652 16 /* XXX */, 0, 0); 653 else if (devinfo->ver >= 7) 654 return calculate_desc(info, EU_UNIT_FPU, 24, 6, 0, 0, 6, 655 0, 12 /* XXX */, 8 /* XXX */, 656 18 /* XXX */, 0, 0); 657 else 658 abort(); 659 660 case SHADER_OPCODE_MOV_INDIRECT: 661 if (devinfo->ver >= 11) 662 return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0, 663 0, 10 /* XXX */, 6 /* XXX */, 664 14 /* XXX */, 0, 0); 665 else if (devinfo->ver >= 8) 666 return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0, 667 0, 8 /* XXX */, 4 /* XXX */, 668 12 /* XXX */, 0, 0); 669 else if (devinfo->verx10 >= 75) 670 return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0, 671 0, 10 /* XXX */, 6 /* XXX */, 672 16 /* XXX */, 0, 0); 673 else 674 return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0, 675 0, 12 /* XXX */, 8 /* XXX */, 676 18 /* XXX */, 0, 0); 677 678 case SHADER_OPCODE_BROADCAST: 679 if (devinfo->ver >= 11) 680 return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 4, 0, 681 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); 682 else if (devinfo->ver >= 8) 683 return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0, 684 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); 685 else if (devinfo->verx10 >= 75) 686 return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0, 687 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); 688 else if (devinfo->ver >= 7) 689 return calculate_desc(info, EU_UNIT_FPU, 20, 0, 0, 4, 0, 690 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); 691 else 692 abort(); 693 694 case SHADER_OPCODE_FIND_LIVE_CHANNEL: 695 case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: 696 if (devinfo->ver >= 11) 697 return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0, 698 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); 699 else if (devinfo->ver >= 8) 700 return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0, 701 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); 702 else if (devinfo->verx10 >= 75) 703 return calculate_desc(info, EU_UNIT_FPU, 36, 0, 0, 6, 0, 704 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); 705 else if (devinfo->ver >= 7) 706 return calculate_desc(info, EU_UNIT_FPU, 40, 0, 0, 6, 0, 707 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); 708 else 709 abort(); 710 711 case SHADER_OPCODE_RND_MODE: 712 case SHADER_OPCODE_FLOAT_CONTROL_MODE: 713 if (devinfo->ver >= 11) 714 return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0, 715 4 /* XXX */, 0, 716 0, 0, 0, 0, 0, 0); 717 else if (devinfo->ver >= 8) 718 return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 719 4 /* XXX */, 0, 720 0, 0, 0, 0, 0, 0); 721 else if (devinfo->verx10 >= 75) 722 return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0, 723 4 /* XXX */, 0, 724 0, 0, 0, 0, 0, 0); 725 else if (devinfo->ver >= 6) 726 return calculate_desc(info, EU_UNIT_FPU, 28 /* XXX */, 0, 0, 727 4 /* XXX */, 0, 728 0, 0, 0, 0, 0, 0); 729 else 730 abort(); 731 732 case SHADER_OPCODE_SHUFFLE: 733 if (devinfo->ver >= 11) 734 return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0, 735 44 /* XXX */, 0, 736 0, 10 /* XXX */, 6 /* XXX */, 737 14 /* XXX */, 0, 0); 738 else if (devinfo->ver >= 8) 739 return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0, 740 42 /* XXX */, 0, 741 0, 8 /* XXX */, 4 /* XXX */, 742 12 /* XXX */, 0, 0); 743 else if (devinfo->verx10 >= 75) 744 return calculate_desc(info, EU_UNIT_FPU, 0, 44 /* XXX */, 0, 745 0, 44 /* XXX */, 746 0, 10 /* XXX */, 6 /* XXX */, 747 16 /* XXX */, 0, 0); 748 else if (devinfo->ver >= 6) 749 return calculate_desc(info, EU_UNIT_FPU, 0, 46 /* XXX */, 0, 750 0, 46 /* XXX */, 751 0, 12 /* XXX */, 8 /* XXX */, 752 18 /* XXX */, 0, 0); 753 else 754 abort(); 755 756 case SHADER_OPCODE_SEL_EXEC: 757 if (devinfo->ver >= 11) 758 return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0, 759 0, 4 /* XXX */, 760 0, 10 /* XXX */, 6 /* XXX */, 761 14 /* XXX */, 0, 0); 762 else if (devinfo->ver >= 8) 763 return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0, 764 0, 4 /* XXX */, 765 0, 8 /* XXX */, 4 /* XXX */, 766 12 /* XXX */, 0, 0); 767 else if (devinfo->verx10 >= 75) 768 return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0, 769 0, 4 /* XXX */, 770 0, 10 /* XXX */, 6 /* XXX */, 771 16 /* XXX */, 0, 0); 772 else 773 return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 4 /* XXX */, 0, 774 0, 4 /* XXX */, 775 0, 12 /* XXX */, 8 /* XXX */, 776 18 /* XXX */, 0, 0); 777 778 case SHADER_OPCODE_QUAD_SWIZZLE: 779 if (devinfo->ver >= 11) 780 return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0, 781 0, 8 /* XXX */, 782 0, 10 /* XXX */, 6 /* XXX */, 783 14 /* XXX */, 0, 0); 784 else if (devinfo->ver >= 8) 785 return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0, 786 0, 8 /* XXX */, 787 0, 8 /* XXX */, 4 /* XXX */, 788 12 /* XXX */, 0, 0); 789 else if (devinfo->verx10 >= 75) 790 return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0, 791 0, 8 /* XXX */, 792 0, 10 /* XXX */, 6 /* XXX */, 793 16 /* XXX */, 0, 0); 794 else 795 return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0, 796 0, 8 /* XXX */, 797 0, 12 /* XXX */, 8 /* XXX */, 798 18 /* XXX */, 0, 0); 799 800 case FS_OPCODE_DDY_FINE: 801 if (devinfo->ver >= 11) 802 return calculate_desc(info, EU_UNIT_FPU, 0, 14, 0, 0, 4, 803 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); 804 else if (devinfo->ver >= 8) 805 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 806 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); 807 else if (devinfo->verx10 >= 75) 808 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 809 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); 810 else 811 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 812 0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0); 813 814 case FS_OPCODE_LOAD_LIVE_CHANNELS: 815 if (devinfo->ver >= 11) 816 return calculate_desc(info, EU_UNIT_FPU, 2 /* XXX */, 0, 0, 817 2 /* XXX */, 0, 818 0, 0, 0, 10 /* XXX */, 0, 0); 819 else if (devinfo->ver >= 8) 820 return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0, 821 0, 2 /* XXX */, 822 0, 0, 0, 8 /* XXX */, 0, 0); 823 else 824 abort(); 825 826 case VEC4_OPCODE_PACK_BYTES: 827 if (devinfo->ver >= 8) 828 return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0, 829 4 /* XXX */, 0, 830 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, 831 0, 0); 832 else if (devinfo->verx10 >= 75) 833 return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0, 834 4 /* XXX */, 0, 835 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, 836 0, 0); 837 else 838 return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0, 839 4 /* XXX */, 0, 840 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, 841 0, 0); 842 843 case VS_OPCODE_UNPACK_FLAGS_SIMD4X2: 844 case TCS_OPCODE_GET_INSTANCE_ID: 845 case VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS: 846 case VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: 847 case TES_OPCODE_CREATE_INPUT_READ_HEADER: 848 if (devinfo->ver >= 8) 849 return calculate_desc(info, EU_UNIT_FPU, 22 /* XXX */, 0, 0, 850 6 /* XXX */, 0, 851 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, 852 0, 0); 853 else if (devinfo->verx10 >= 75) 854 return calculate_desc(info, EU_UNIT_FPU, 26 /* XXX */, 0, 0, 855 6 /* XXX */, 0, 856 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, 857 0, 0); 858 else 859 return calculate_desc(info, EU_UNIT_FPU, 30 /* XXX */, 0, 0, 860 6 /* XXX */, 0, 861 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, 862 0, 0); 863 864 case GS_OPCODE_FF_SYNC_SET_PRIMITIVES: 865 case TCS_OPCODE_CREATE_BARRIER_HEADER: 866 if (devinfo->ver >= 8) 867 return calculate_desc(info, EU_UNIT_FPU, 32 /* XXX */, 0, 0, 868 8 /* XXX */, 0, 869 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, 870 0, 0); 871 else if (devinfo->verx10 >= 75) 872 return calculate_desc(info, EU_UNIT_FPU, 38 /* XXX */, 0, 0, 873 8 /* XXX */, 0, 874 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, 875 0, 0); 876 else if (devinfo->ver >= 6) 877 return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0, 878 8 /* XXX */, 0, 879 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, 880 0, 0); 881 else 882 abort(); 883 884 case TES_OPCODE_ADD_INDIRECT_URB_OFFSET: 885 if (devinfo->ver >= 8) 886 return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 0, 0, 887 4 /* XXX */, 0, 888 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, 889 0, 0); 890 else if (devinfo->verx10 >= 75) 891 return calculate_desc(info, EU_UNIT_FPU, 14 /* XXX */, 0, 0, 892 4 /* XXX */, 0, 893 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, 894 0, 0); 895 else if (devinfo->ver >= 7) 896 return calculate_desc(info, EU_UNIT_FPU, 16 /* XXX */, 0, 0, 897 4 /* XXX */, 0, 898 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, 899 0, 0); 900 else 901 abort(); 902 903 case SHADER_OPCODE_TEX: 904 case FS_OPCODE_TXB: 905 case SHADER_OPCODE_TXD: 906 case SHADER_OPCODE_TXF: 907 case SHADER_OPCODE_TXF_LZ: 908 case SHADER_OPCODE_TXL: 909 case SHADER_OPCODE_TXL_LZ: 910 case SHADER_OPCODE_TXF_CMS: 911 case SHADER_OPCODE_TXF_CMS_W: 912 case SHADER_OPCODE_TXF_UMS: 913 case SHADER_OPCODE_TXF_MCS: 914 case SHADER_OPCODE_TXS: 915 case SHADER_OPCODE_LOD: 916 case SHADER_OPCODE_GET_BUFFER_SIZE: 917 case SHADER_OPCODE_TG4: 918 case SHADER_OPCODE_TG4_OFFSET: 919 case SHADER_OPCODE_SAMPLEINFO: 920 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: 921 return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16 /* XXX */, 922 8 /* XXX */, 750 /* XXX */, 0, 0, 923 2 /* XXX */, 0); 924 925 case VEC4_OPCODE_URB_READ: 926 case VEC4_VS_OPCODE_URB_WRITE: 927 case VEC4_GS_OPCODE_URB_WRITE: 928 case VEC4_GS_OPCODE_URB_WRITE_ALLOCATE: 929 case GS_OPCODE_THREAD_END: 930 case GS_OPCODE_FF_SYNC: 931 case VEC4_TCS_OPCODE_URB_WRITE: 932 case TCS_OPCODE_RELEASE_INPUT: 933 case TCS_OPCODE_THREAD_END: 934 return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */, 935 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0); 936 937 case SHADER_OPCODE_MEMORY_FENCE: 938 case SHADER_OPCODE_INTERLOCK: 939 switch (info.sfid) { 940 case GFX6_SFID_DATAPORT_RENDER_CACHE: 941 if (devinfo->ver >= 7) 942 return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 30 /* XXX */, 0, 943 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0); 944 else 945 abort(); 946 947 case BRW_SFID_URB: 948 case GFX7_SFID_DATAPORT_DATA_CACHE: 949 case GFX12_SFID_SLM: 950 case GFX12_SFID_TGM: 951 case GFX12_SFID_UGM: 952 case HSW_SFID_DATAPORT_DATA_CACHE_1: 953 if (devinfo->ver >= 7) 954 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 30 /* XXX */, 0, 955 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); 956 else 957 abort(); 958 959 default: 960 abort(); 961 } 962 963 case SHADER_OPCODE_GFX4_SCRATCH_READ: 964 case SHADER_OPCODE_GFX4_SCRATCH_WRITE: 965 case SHADER_OPCODE_GFX7_SCRATCH_READ: 966 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 0, 8 /* XXX */, 967 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); 968 969 case VEC4_OPCODE_UNTYPED_ATOMIC: 970 if (devinfo->ver >= 7) 971 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 972 30 /* XXX */, 400 /* XXX */, 973 10 /* XXX */, 100 /* XXX */, 0, 0, 974 0, 400 /* XXX */); 975 else 976 abort(); 977 978 case VEC4_OPCODE_UNTYPED_SURFACE_READ: 979 case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: 980 if (devinfo->ver >= 7) 981 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 982 0, 20 /* XXX */, 983 10 /* XXX */, 100 /* XXX */, 0, 0, 984 0, 0); 985 else 986 abort(); 987 988 case FS_OPCODE_FB_WRITE: 989 case FS_OPCODE_FB_READ: 990 case FS_OPCODE_REP_FB_WRITE: 991 return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 0, 450 /* XXX */, 992 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0); 993 994 case GS_OPCODE_SVB_WRITE: 995 if (devinfo->ver >= 6) 996 return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0, 997 0, 450 /* XXX */, 998 10 /* XXX */, 300 /* XXX */, 0, 0, 999 0, 0); 1000 else 1001 abort(); 1002 1003 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: 1004 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: 1005 return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */, 1006 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); 1007 1008 case VS_OPCODE_PULL_CONSTANT_LOAD: 1009 case VS_OPCODE_PULL_CONSTANT_LOAD_GFX7: 1010 return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16, 1011 8, 750, 0, 0, 2, 0); 1012 1013 case FS_OPCODE_INTERPOLATE_AT_SAMPLE: 1014 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: 1015 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: 1016 if (devinfo->ver >= 7) 1017 return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0, 1018 0, 90 /* XXX */, 0, 0, 0, 0); 1019 else 1020 abort(); 1021 1022 case SHADER_OPCODE_BARRIER: 1023 if (devinfo->ver >= 7) 1024 return calculate_desc(info, EU_UNIT_GATEWAY, 90 /* XXX */, 0, 0, 1025 0 /* XXX */, 0, 1026 0, 0, 0, 0, 0, 0); 1027 else 1028 abort(); 1029 1030 case CS_OPCODE_CS_TERMINATE: 1031 if (devinfo->ver >= 7) 1032 return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0, 1033 10 /* XXX */, 0, 0, 0, 0, 0); 1034 else 1035 abort(); 1036 1037 case SHADER_OPCODE_SEND: 1038 switch (info.sfid) { 1039 case GFX6_SFID_DATAPORT_RENDER_CACHE: 1040 if (devinfo->ver >= 7) { 1041 switch (brw_dp_desc_msg_type(devinfo, info.desc)) { 1042 case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP: 1043 return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 1044 30 /* XXX */, 450 /* XXX */, 1045 10 /* XXX */, 100 /* XXX */, 1046 0, 0, 0, 400 /* XXX */); 1047 default: 1048 return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 1049 0, 450 /* XXX */, 1050 10 /* XXX */, 300 /* XXX */, 0, 0, 1051 0, 0); 1052 } 1053 } else if (devinfo->ver >= 6) { 1054 return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0, 1055 0, 450 /* XXX */, 1056 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0); 1057 } else { 1058 abort(); 1059 } 1060 case BRW_SFID_SAMPLER: { 1061 if (devinfo->ver >= 6) 1062 return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16, 1063 8, 750, 0, 0, 2, 0); 1064 else 1065 abort(); 1066 } 1067 case GFX7_SFID_DATAPORT_DATA_CACHE: 1068 case HSW_SFID_DATAPORT_DATA_CACHE_1: 1069 if (devinfo->verx10 >= 75) { 1070 switch (brw_dp_desc_msg_type(devinfo, info.desc)) { 1071 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP: 1072 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2: 1073 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2: 1074 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP: 1075 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 1076 30 /* XXX */, 400 /* XXX */, 1077 10 /* XXX */, 100 /* XXX */, 0, 0, 1078 0, 400 /* XXX */); 1079 1080 default: 1081 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 1082 0, 20 /* XXX */, 1083 10 /* XXX */, 100 /* XXX */, 0, 0, 1084 0, 0); 1085 } 1086 } else if (devinfo->ver >= 7) { 1087 switch (brw_dp_desc_msg_type(devinfo, info.desc)) { 1088 case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP: 1089 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 1090 30 /* XXX */, 400 /* XXX */, 1091 10 /* XXX */, 100 /* XXX */, 1092 0, 0, 0, 400 /* XXX */); 1093 default: 1094 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 1095 0, 20 /* XXX */, 1096 10 /* XXX */, 100 /* XXX */, 0, 0, 1097 0, 0); 1098 } 1099 } else { 1100 abort(); 1101 } 1102 1103 case GFX12_SFID_UGM: 1104 case GFX12_SFID_TGM: 1105 case GFX12_SFID_SLM: 1106 switch (lsc_msg_desc_opcode(devinfo, info.desc)) { 1107 case LSC_OP_LOAD: 1108 case LSC_OP_STORE: 1109 case LSC_OP_LOAD_CMASK: 1110 case LSC_OP_STORE_CMASK: 1111 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 1112 0, 20 /* XXX */, 1113 10 /* XXX */, 100 /* XXX */, 0, 0, 1114 0, 0); 1115 1116 case LSC_OP_FENCE: 1117 case LSC_OP_ATOMIC_INC: 1118 case LSC_OP_ATOMIC_DEC: 1119 case LSC_OP_ATOMIC_LOAD: 1120 case LSC_OP_ATOMIC_STORE: 1121 case LSC_OP_ATOMIC_ADD: 1122 case LSC_OP_ATOMIC_SUB: 1123 case LSC_OP_ATOMIC_MIN: 1124 case LSC_OP_ATOMIC_MAX: 1125 case LSC_OP_ATOMIC_UMIN: 1126 case LSC_OP_ATOMIC_UMAX: 1127 case LSC_OP_ATOMIC_CMPXCHG: 1128 case LSC_OP_ATOMIC_FADD: 1129 case LSC_OP_ATOMIC_FSUB: 1130 case LSC_OP_ATOMIC_FMIN: 1131 case LSC_OP_ATOMIC_FMAX: 1132 case LSC_OP_ATOMIC_FCMPXCHG: 1133 case LSC_OP_ATOMIC_AND: 1134 case LSC_OP_ATOMIC_OR: 1135 case LSC_OP_ATOMIC_XOR: 1136 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 1137 30 /* XXX */, 400 /* XXX */, 1138 10 /* XXX */, 100 /* XXX */, 0, 0, 1139 0, 400 /* XXX */); 1140 default: 1141 abort(); 1142 } 1143 1144 case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH: 1145 case GEN_RT_SFID_RAY_TRACE_ACCELERATOR: 1146 return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0, 1147 10 /* XXX */, 0, 0, 0, 0, 0); 1148 1149 case BRW_SFID_URB: 1150 return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */, 1151 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0); 1152 1153 default: 1154 abort(); 1155 } 1156 1157 case SHADER_OPCODE_UNDEF: 1158 case SHADER_OPCODE_HALT_TARGET: 1159 case FS_OPCODE_SCHEDULING_FENCE: 1160 return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0, 1161 0, 0, 0, 0, 0, 0); 1162 1163 default: 1164 abort(); 1165 } 1166 } 1167 1168 /** 1169 * Model the performance behavior of a stall on the specified dependency 1170 * ID. 1171 */ 1172 void 1173 stall_on_dependency(state &st, enum intel_eu_dependency_id id) 1174 { 1175 if (id < ARRAY_SIZE(st.dep_ready)) 1176 st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE], 1177 st.dep_ready[id]); 1178 } 1179 1180 /** 1181 * Model the performance behavior of the front-end and back-end while 1182 * executing an instruction with the specified timing information, assuming 1183 * all dependencies are already clear. 1184 */ 1185 void 1186 execute_instruction(state &st, const perf_desc &perf) 1187 { 1188 /* Compute the time at which the front-end will be ready to execute the 1189 * next instruction. 1190 */ 1191 st.unit_ready[EU_UNIT_FE] += perf.df; 1192 1193 if (perf.u < EU_NUM_UNITS) { 1194 /* Wait for the back-end to be ready to execute this instruction. */ 1195 st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE], 1196 st.unit_ready[perf.u]); 1197 1198 /* Compute the time at which the back-end will be ready to execute 1199 * the next instruction, and update the back-end utilization. 1200 */ 1201 st.unit_ready[perf.u] = st.unit_ready[EU_UNIT_FE] + perf.db; 1202 st.unit_busy[perf.u] += perf.db * st.weight; 1203 } 1204 } 1205 1206 /** 1207 * Model the performance behavior of a read dependency provided by an 1208 * instruction. 1209 */ 1210 void 1211 mark_read_dependency(state &st, const perf_desc &perf, 1212 enum intel_eu_dependency_id id) 1213 { 1214 if (id < ARRAY_SIZE(st.dep_ready)) 1215 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ls; 1216 } 1217 1218 /** 1219 * Model the performance behavior of a write dependency provided by an 1220 * instruction. 1221 */ 1222 void 1223 mark_write_dependency(state &st, const perf_desc &perf, 1224 enum intel_eu_dependency_id id) 1225 { 1226 if (id >= EU_DEPENDENCY_ID_ACCUM0 && id < EU_DEPENDENCY_ID_FLAG0) 1227 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.la; 1228 else if (id >= EU_DEPENDENCY_ID_FLAG0 && id < EU_DEPENDENCY_ID_SBID_WR0) 1229 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.lf; 1230 else if (id < ARRAY_SIZE(st.dep_ready)) 1231 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ld; 1232 } 1233 1234 /** 1235 * Return the dependency ID of a backend_reg, offset by \p delta GRFs. 1236 */ 1237 enum intel_eu_dependency_id 1238 reg_dependency_id(const intel_device_info *devinfo, const backend_reg &r, 1239 const int delta) 1240 { 1241 if (r.file == VGRF) { 1242 const unsigned i = r.nr + r.offset / REG_SIZE + delta; 1243 assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0); 1244 return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i); 1245 1246 } else if (r.file == FIXED_GRF) { 1247 const unsigned i = r.nr + delta; 1248 assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0); 1249 return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i); 1250 1251 } else if (r.file == MRF && devinfo->ver >= 7) { 1252 const unsigned i = GFX7_MRF_HACK_START + 1253 r.nr + r.offset / REG_SIZE + delta; 1254 assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0); 1255 return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i); 1256 1257 } else if (r.file == MRF && devinfo->ver < 7) { 1258 const unsigned i = (r.nr & ~BRW_MRF_COMPR4) + 1259 r.offset / REG_SIZE + delta; 1260 assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_MRF0); 1261 return intel_eu_dependency_id(EU_DEPENDENCY_ID_MRF0 + i); 1262 1263 } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS && 1264 r.nr < BRW_ARF_ACCUMULATOR) { 1265 assert(delta == 0); 1266 return EU_DEPENDENCY_ID_ADDR0; 1267 1268 } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR && 1269 r.nr < BRW_ARF_FLAG) { 1270 const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta; 1271 assert(i < EU_DEPENDENCY_ID_FLAG0 - EU_DEPENDENCY_ID_ACCUM0); 1272 return intel_eu_dependency_id(EU_DEPENDENCY_ID_ACCUM0 + i); 1273 1274 } else { 1275 return EU_NUM_DEPENDENCY_IDS; 1276 } 1277 } 1278 1279 /** 1280 * Return the dependency ID of flag register starting at offset \p i. 1281 */ 1282 enum intel_eu_dependency_id 1283 flag_dependency_id(unsigned i) 1284 { 1285 assert(i < EU_DEPENDENCY_ID_SBID_WR0 - EU_DEPENDENCY_ID_FLAG0); 1286 return intel_eu_dependency_id(EU_DEPENDENCY_ID_FLAG0 + i); 1287 } 1288 1289 /** 1290 * Return the dependency ID corresponding to the SBID read completion 1291 * condition of a Gfx12+ SWSB. 1292 */ 1293 enum intel_eu_dependency_id 1294 tgl_swsb_rd_dependency_id(tgl_swsb swsb) 1295 { 1296 if (swsb.mode) { 1297 assert(swsb.sbid < 1298 EU_NUM_DEPENDENCY_IDS - EU_DEPENDENCY_ID_SBID_RD0); 1299 return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_RD0 + swsb.sbid); 1300 } else { 1301 return EU_NUM_DEPENDENCY_IDS; 1302 } 1303 } 1304 1305 /** 1306 * Return the dependency ID corresponding to the SBID write completion 1307 * condition of a Gfx12+ SWSB. 1308 */ 1309 enum intel_eu_dependency_id 1310 tgl_swsb_wr_dependency_id(tgl_swsb swsb) 1311 { 1312 if (swsb.mode) { 1313 assert(swsb.sbid < 1314 EU_DEPENDENCY_ID_SBID_RD0 - EU_DEPENDENCY_ID_SBID_WR0); 1315 return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_WR0 + swsb.sbid); 1316 } else { 1317 return EU_NUM_DEPENDENCY_IDS; 1318 } 1319 } 1320 1321 /** 1322 * Return the implicit accumulator register accessed by channel \p i of the 1323 * instruction. 1324 */ 1325 unsigned 1326 accum_reg_of_channel(const intel_device_info *devinfo, 1327 const backend_instruction *inst, 1328 brw_reg_type tx, unsigned i) 1329 { 1330 assert(inst->reads_accumulator_implicitly() || 1331 inst->writes_accumulator_implicitly(devinfo)); 1332 const unsigned offset = (inst->group + i) * type_sz(tx) * 1333 (devinfo->ver < 7 || brw_reg_type_is_floating_point(tx) ? 1 : 2); 1334 return offset / REG_SIZE % 2; 1335 } 1336 1337 /** 1338 * Model the performance behavior of an FS back-end instruction. 1339 */ 1340 void 1341 issue_fs_inst(state &st, const struct brw_isa_info *isa, 1342 const backend_instruction *be_inst) 1343 { 1344 const struct intel_device_info *devinfo = isa->devinfo; 1345 const fs_inst *inst = static_cast<const fs_inst *>(be_inst); 1346 const instruction_info info(isa, inst); 1347 const perf_desc perf = instruction_desc(info); 1348 1349 /* Stall on any source dependencies. */ 1350 for (unsigned i = 0; i < inst->sources; i++) { 1351 for (unsigned j = 0; j < regs_read(inst, i); j++) 1352 stall_on_dependency( 1353 st, reg_dependency_id(devinfo, inst->src[i], j)); 1354 } 1355 1356 if (inst->reads_accumulator_implicitly()) { 1357 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); 1358 j <= accum_reg_of_channel(devinfo, inst, info.tx, 1359 inst->exec_size - 1); j++) 1360 stall_on_dependency( 1361 st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); 1362 } 1363 1364 if (is_send(inst) && inst->base_mrf != -1) { 1365 for (unsigned j = 0; j < inst->mlen; j++) 1366 stall_on_dependency( 1367 st, reg_dependency_id( 1368 devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); 1369 } 1370 1371 if (const unsigned mask = inst->flags_read(devinfo)) { 1372 for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) { 1373 if (mask & (1 << i)) 1374 stall_on_dependency(st, flag_dependency_id(i)); 1375 } 1376 } 1377 1378 /* Stall on any write dependencies. */ 1379 if (!inst->no_dd_check) { 1380 if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { 1381 for (unsigned j = 0; j < regs_written(inst); j++) 1382 stall_on_dependency( 1383 st, reg_dependency_id(devinfo, inst->dst, j)); 1384 } 1385 1386 if (inst->writes_accumulator_implicitly(devinfo)) { 1387 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); 1388 j <= accum_reg_of_channel(devinfo, inst, info.tx, 1389 inst->exec_size - 1); j++) 1390 stall_on_dependency( 1391 st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); 1392 } 1393 1394 if (const unsigned mask = inst->flags_written(devinfo)) { 1395 for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) { 1396 if (mask & (1 << i)) 1397 stall_on_dependency(st, flag_dependency_id(i)); 1398 } 1399 } 1400 } 1401 1402 /* Stall on any SBID dependencies. */ 1403 if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST)) 1404 stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched)); 1405 else if (inst->sched.mode & TGL_SBID_SRC) 1406 stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched)); 1407 1408 /* Execute the instruction. */ 1409 execute_instruction(st, perf); 1410 1411 /* Mark any source dependencies. */ 1412 if (inst->is_send_from_grf()) { 1413 for (unsigned i = 0; i < inst->sources; i++) { 1414 if (inst->is_payload(i)) { 1415 for (unsigned j = 0; j < regs_read(inst, i); j++) 1416 mark_read_dependency( 1417 st, perf, reg_dependency_id(devinfo, inst->src[i], j)); 1418 } 1419 } 1420 } 1421 1422 if (is_send(inst) && inst->base_mrf != -1) { 1423 for (unsigned j = 0; j < inst->mlen; j++) 1424 mark_read_dependency(st, perf, 1425 reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); 1426 } 1427 1428 /* Mark any destination dependencies. */ 1429 if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { 1430 for (unsigned j = 0; j < regs_written(inst); j++) { 1431 mark_write_dependency(st, perf, 1432 reg_dependency_id(devinfo, inst->dst, j)); 1433 } 1434 } 1435 1436 if (inst->writes_accumulator_implicitly(devinfo)) { 1437 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); 1438 j <= accum_reg_of_channel(devinfo, inst, info.tx, 1439 inst->exec_size - 1); j++) 1440 mark_write_dependency(st, perf, 1441 reg_dependency_id(devinfo, brw_acc_reg(8), j)); 1442 } 1443 1444 if (const unsigned mask = inst->flags_written(devinfo)) { 1445 for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) { 1446 if (mask & (1 << i)) 1447 mark_write_dependency(st, perf, flag_dependency_id(i)); 1448 } 1449 } 1450 1451 /* Mark any SBID dependencies. */ 1452 if (inst->sched.mode & TGL_SBID_SET) { 1453 mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched)); 1454 mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched)); 1455 } 1456 } 1457 1458 /** 1459 * Model the performance behavior of a VEC4 back-end instruction. 1460 */ 1461 void 1462 issue_vec4_instruction(state &st, const struct brw_isa_info *isa, 1463 const backend_instruction *be_inst) 1464 { 1465 const struct intel_device_info *devinfo = isa->devinfo; 1466 const vec4_instruction *inst = 1467 static_cast<const vec4_instruction *>(be_inst); 1468 const instruction_info info(isa, inst); 1469 const perf_desc perf = instruction_desc(info); 1470 1471 /* Stall on any source dependencies. */ 1472 for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) { 1473 for (unsigned j = 0; j < regs_read(inst, i); j++) 1474 stall_on_dependency( 1475 st, reg_dependency_id(devinfo, inst->src[i], j)); 1476 } 1477 1478 if (inst->reads_accumulator_implicitly()) { 1479 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); 1480 j <= accum_reg_of_channel(devinfo, inst, info.tx, 1481 inst->exec_size - 1); j++) 1482 stall_on_dependency( 1483 st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); 1484 } 1485 1486 if (inst->base_mrf != -1) { 1487 for (unsigned j = 0; j < inst->mlen; j++) 1488 stall_on_dependency( 1489 st, reg_dependency_id( 1490 devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); 1491 } 1492 1493 if (inst->reads_flag()) 1494 stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0); 1495 1496 /* Stall on any write dependencies. */ 1497 if (!inst->no_dd_check) { 1498 if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { 1499 for (unsigned j = 0; j < regs_written(inst); j++) 1500 stall_on_dependency( 1501 st, reg_dependency_id(devinfo, inst->dst, j)); 1502 } 1503 1504 if (inst->writes_accumulator_implicitly(devinfo)) { 1505 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); 1506 j <= accum_reg_of_channel(devinfo, inst, info.tx, 1507 inst->exec_size - 1); j++) 1508 stall_on_dependency( 1509 st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); 1510 } 1511 1512 if (inst->writes_flag(devinfo)) 1513 stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0); 1514 } 1515 1516 /* Execute the instruction. */ 1517 execute_instruction(st, perf); 1518 1519 /* Mark any source dependencies. */ 1520 if (inst->is_send_from_grf()) { 1521 for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) { 1522 for (unsigned j = 0; j < regs_read(inst, i); j++) 1523 mark_read_dependency( 1524 st, perf, reg_dependency_id(devinfo, inst->src[i], j)); 1525 } 1526 } 1527 1528 if (inst->base_mrf != -1) { 1529 for (unsigned j = 0; j < inst->mlen; j++) 1530 mark_read_dependency(st, perf, 1531 reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); 1532 } 1533 1534 /* Mark any destination dependencies. */ 1535 if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { 1536 for (unsigned j = 0; j < regs_written(inst); j++) { 1537 mark_write_dependency(st, perf, 1538 reg_dependency_id(devinfo, inst->dst, j)); 1539 } 1540 } 1541 1542 if (inst->writes_accumulator_implicitly(devinfo)) { 1543 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); 1544 j <= accum_reg_of_channel(devinfo, inst, info.tx, 1545 inst->exec_size - 1); j++) 1546 mark_write_dependency(st, perf, 1547 reg_dependency_id(devinfo, brw_acc_reg(8), j)); 1548 } 1549 1550 if (inst->writes_flag(devinfo)) 1551 mark_write_dependency(st, perf, EU_DEPENDENCY_ID_FLAG0); 1552 } 1553 1554 /** 1555 * Calculate the maximum possible throughput of the program compatible with 1556 * the cycle-count utilization estimated for each asynchronous unit, in 1557 * threads-per-cycle units. 1558 */ 1559 float 1560 calculate_thread_throughput(const state &st, float busy) 1561 { 1562 for (unsigned i = 0; i < EU_NUM_UNITS; i++) 1563 busy = MAX2(busy, st.unit_busy[i]); 1564 1565 return 1.0 / busy; 1566 } 1567 1568 /** 1569 * Estimate the performance of the specified shader. 1570 */ 1571 void 1572 calculate_performance(performance &p, const backend_shader *s, 1573 void (*issue_instruction)( 1574 state &, const struct brw_isa_info *, 1575 const backend_instruction *), 1576 unsigned dispatch_width) 1577 { 1578 /* XXX - Note that the previous version of this code used worst-case 1579 * scenario estimation of branching divergence for SIMD32 shaders, 1580 * but this heuristic was removed to improve performance in common 1581 * scenarios. Wider shader variants are less optimal when divergence 1582 * is high, e.g. when application renders complex scene on a small 1583 * surface. It is assumed that such renders are short, so their 1584 * time doesn't matter and when it comes to the overall performance, 1585 * they are dominated by more optimal larger renders. 1586 * 1587 * It's possible that we could do better with divergence analysis 1588 * by isolating branches which are 100% uniform. 1589 * 1590 * Plumbing the trip counts from NIR loop analysis would allow us 1591 * to do a better job regarding the loop weights. 1592 * 1593 * In the meantime use values that roughly match the control flow 1594 * weights used elsewhere in the compiler back-end. 1595 * 1596 * Note that we provide slightly more pessimistic weights on 1597 * Gfx12+ for SIMD32, since the effective warp size on that 1598 * platform is 2x the SIMD width due to EU fusion, which increases 1599 * the likelihood of divergent control flow in comparison to 1600 * previous generations, giving narrower SIMD modes a performance 1601 * advantage in several test-cases with non-uniform discard jumps. 1602 */ 1603 const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ? 1604 1.0 : 0.5); 1605 const float loop_weight = 10; 1606 unsigned halt_count = 0; 1607 unsigned elapsed = 0; 1608 state st; 1609 1610 foreach_block(block, s->cfg) { 1611 const unsigned elapsed0 = elapsed; 1612 1613 foreach_inst_in_block(backend_instruction, inst, block) { 1614 const unsigned clock0 = st.unit_ready[EU_UNIT_FE]; 1615 1616 issue_instruction(st, &s->compiler->isa, inst); 1617 1618 if (inst->opcode == SHADER_OPCODE_HALT_TARGET && halt_count) 1619 st.weight /= discard_weight; 1620 1621 elapsed += (st.unit_ready[EU_UNIT_FE] - clock0) * st.weight; 1622 1623 if (inst->opcode == BRW_OPCODE_DO) 1624 st.weight *= loop_weight; 1625 else if (inst->opcode == BRW_OPCODE_WHILE) 1626 st.weight /= loop_weight; 1627 else if (inst->opcode == BRW_OPCODE_HALT && !halt_count++) 1628 st.weight *= discard_weight; 1629 } 1630 1631 p.block_latency[block->num] = elapsed - elapsed0; 1632 } 1633 1634 p.latency = elapsed; 1635 p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed); 1636 } 1637} 1638 1639brw::performance::performance(const fs_visitor *v) : 1640 block_latency(new unsigned[v->cfg->num_blocks]) 1641{ 1642 calculate_performance(*this, v, issue_fs_inst, v->dispatch_width); 1643} 1644 1645brw::performance::performance(const vec4_visitor *v) : 1646 block_latency(new unsigned[v->cfg->num_blocks]) 1647{ 1648 calculate_performance(*this, v, issue_vec4_instruction, 8); 1649} 1650 1651brw::performance::~performance() 1652{ 1653 delete[] block_latency; 1654} 1655