1/* 2 * Copyright © 2016-2017 Broadcom 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "broadcom/common/v3d_device_info.h" 25#include "v3d_compiler.h" 26#include "util/u_prim.h" 27#include "compiler/nir/nir_schedule.h" 28#include "compiler/nir/nir_builder.h" 29 30int 31vir_get_nsrc(struct qinst *inst) 32{ 33 switch (inst->qpu.type) { 34 case V3D_QPU_INSTR_TYPE_BRANCH: 35 return 0; 36 case V3D_QPU_INSTR_TYPE_ALU: 37 if (inst->qpu.alu.add.op != V3D_QPU_A_NOP) 38 return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op); 39 else 40 return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op); 41 } 42 43 return 0; 44} 45 46/** 47 * Returns whether the instruction has any side effects that must be 48 * preserved. 49 */ 50bool 51vir_has_side_effects(struct v3d_compile *c, struct qinst *inst) 52{ 53 switch (inst->qpu.type) { 54 case V3D_QPU_INSTR_TYPE_BRANCH: 55 return true; 56 case V3D_QPU_INSTR_TYPE_ALU: 57 switch (inst->qpu.alu.add.op) { 58 case V3D_QPU_A_SETREVF: 59 case V3D_QPU_A_SETMSF: 60 case V3D_QPU_A_VPMSETUP: 61 case V3D_QPU_A_STVPMV: 62 case V3D_QPU_A_STVPMD: 63 case V3D_QPU_A_STVPMP: 64 case V3D_QPU_A_VPMWT: 65 case V3D_QPU_A_TMUWT: 66 return true; 67 default: 68 break; 69 } 70 71 switch (inst->qpu.alu.mul.op) { 72 case V3D_QPU_M_MULTOP: 73 return true; 74 default: 75 break; 76 } 77 } 78 79 if (inst->qpu.sig.ldtmu || 80 inst->qpu.sig.ldvary || 81 inst->qpu.sig.ldtlbu || 82 inst->qpu.sig.ldtlb || 83 inst->qpu.sig.wrtmuc || 84 inst->qpu.sig.thrsw) { 85 return true; 86 } 87 88 /* ldunifa works like ldunif: it reads an element and advances the 89 * pointer, so each read has a side effect (we don't care for ldunif 90 * because we reconstruct the uniform stream buffer after compiling 91 * with the surviving uniforms), so allowing DCE to remove 92 * one would break follow-up loads. We could fix this by emiting a 93 * unifa for each ldunifa, but each unifa requires 3 delay slots 94 * before a ldunifa, so that would be quite expensive. 
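 * For example, if a shader issues two back-to-back ldunifa reads to fetch
 * two consecutive 32-bit words, letting DCE drop the first read would make
 * the second one return the first word instead of the second.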
95 */ 96 if (inst->qpu.sig.ldunifa || inst->qpu.sig.ldunifarf) 97 return true; 98 99 return false; 100} 101 102bool 103vir_is_raw_mov(struct qinst *inst) 104{ 105 if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || 106 (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV && 107 inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) { 108 return false; 109 } 110 111 if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE || 112 inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) { 113 return false; 114 } 115 116 if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE || 117 inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE || 118 inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || 119 inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) { 120 return false; 121 } 122 123 if (inst->qpu.flags.ac != V3D_QPU_COND_NONE || 124 inst->qpu.flags.mc != V3D_QPU_COND_NONE) 125 return false; 126 127 return true; 128} 129 130bool 131vir_is_add(struct qinst *inst) 132{ 133 return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && 134 inst->qpu.alu.add.op != V3D_QPU_A_NOP); 135} 136 137bool 138vir_is_mul(struct qinst *inst) 139{ 140 return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && 141 inst->qpu.alu.mul.op != V3D_QPU_M_NOP); 142} 143 144bool 145vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst) 146{ 147 if (inst->dst.file == QFILE_MAGIC) 148 return v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index); 149 150 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && 151 inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) { 152 return true; 153 } 154 155 return false; 156} 157 158bool 159vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst) 160{ 161 for (int i = 0; i < vir_get_nsrc(inst); i++) { 162 switch (inst->src[i].file) { 163 case QFILE_VPM: 164 return true; 165 default: 166 break; 167 } 168 } 169 170 if (devinfo->ver < 41 && (inst->qpu.sig.ldvary || 171 inst->qpu.sig.ldtlb || 172 inst->qpu.sig.ldtlbu || 173 inst->qpu.sig.ldvpm)) { 174 return true; 175 } 176 177 return false; 178} 179 180bool 181vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst) 182{ 183 switch (inst->dst.file) { 184 case QFILE_MAGIC: 185 switch (inst->dst.index) { 186 case V3D_QPU_WADDR_RECIP: 187 case V3D_QPU_WADDR_RSQRT: 188 case V3D_QPU_WADDR_EXP: 189 case V3D_QPU_WADDR_LOG: 190 case V3D_QPU_WADDR_SIN: 191 return true; 192 } 193 break; 194 default: 195 break; 196 } 197 198 if (devinfo->ver < 41 && inst->qpu.sig.ldtmu) 199 return true; 200 201 return false; 202} 203 204void 205vir_set_unpack(struct qinst *inst, int src, 206 enum v3d_qpu_input_unpack unpack) 207{ 208 assert(src == 0 || src == 1); 209 210 if (vir_is_add(inst)) { 211 if (src == 0) 212 inst->qpu.alu.add.a_unpack = unpack; 213 else 214 inst->qpu.alu.add.b_unpack = unpack; 215 } else { 216 assert(vir_is_mul(inst)); 217 if (src == 0) 218 inst->qpu.alu.mul.a_unpack = unpack; 219 else 220 inst->qpu.alu.mul.b_unpack = unpack; 221 } 222} 223 224void 225vir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack) 226{ 227 if (vir_is_add(inst)) { 228 inst->qpu.alu.add.output_pack = pack; 229 } else { 230 assert(vir_is_mul(inst)); 231 inst->qpu.alu.mul.output_pack = pack; 232 } 233} 234 235void 236vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond) 237{ 238 if (vir_is_add(inst)) { 239 inst->qpu.flags.ac = cond; 240 } else { 241 assert(vir_is_mul(inst)); 242 inst->qpu.flags.mc = cond; 243 } 244} 245 246enum v3d_qpu_cond 247vir_get_cond(struct qinst *inst) 248{ 249 assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU); 250 251 if (vir_is_add(inst)) 252 return inst->qpu.flags.ac; 
253 else if (vir_is_mul(inst)) 254 return inst->qpu.flags.mc; 255 else /* NOP */ 256 return V3D_QPU_COND_NONE; 257} 258 259void 260vir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf) 261{ 262 c->flags_temp = -1; 263 if (vir_is_add(inst)) { 264 inst->qpu.flags.apf = pf; 265 } else { 266 assert(vir_is_mul(inst)); 267 inst->qpu.flags.mpf = pf; 268 } 269} 270 271void 272vir_set_uf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_uf uf) 273{ 274 c->flags_temp = -1; 275 if (vir_is_add(inst)) { 276 inst->qpu.flags.auf = uf; 277 } else { 278 assert(vir_is_mul(inst)); 279 inst->qpu.flags.muf = uf; 280 } 281} 282 283#if 0 284uint8_t 285vir_channels_written(struct qinst *inst) 286{ 287 if (vir_is_mul(inst)) { 288 switch (inst->dst.pack) { 289 case QPU_PACK_MUL_NOP: 290 case QPU_PACK_MUL_8888: 291 return 0xf; 292 case QPU_PACK_MUL_8A: 293 return 0x1; 294 case QPU_PACK_MUL_8B: 295 return 0x2; 296 case QPU_PACK_MUL_8C: 297 return 0x4; 298 case QPU_PACK_MUL_8D: 299 return 0x8; 300 } 301 } else { 302 switch (inst->dst.pack) { 303 case QPU_PACK_A_NOP: 304 case QPU_PACK_A_8888: 305 case QPU_PACK_A_8888_SAT: 306 case QPU_PACK_A_32_SAT: 307 return 0xf; 308 case QPU_PACK_A_8A: 309 case QPU_PACK_A_8A_SAT: 310 return 0x1; 311 case QPU_PACK_A_8B: 312 case QPU_PACK_A_8B_SAT: 313 return 0x2; 314 case QPU_PACK_A_8C: 315 case QPU_PACK_A_8C_SAT: 316 return 0x4; 317 case QPU_PACK_A_8D: 318 case QPU_PACK_A_8D_SAT: 319 return 0x8; 320 case QPU_PACK_A_16A: 321 case QPU_PACK_A_16A_SAT: 322 return 0x3; 323 case QPU_PACK_A_16B: 324 case QPU_PACK_A_16B_SAT: 325 return 0xc; 326 } 327 } 328 unreachable("Bad pack field"); 329} 330#endif 331 332struct qreg 333vir_get_temp(struct v3d_compile *c) 334{ 335 struct qreg reg; 336 337 reg.file = QFILE_TEMP; 338 reg.index = c->num_temps++; 339 340 if (c->num_temps > c->defs_array_size) { 341 uint32_t old_size = c->defs_array_size; 342 c->defs_array_size = MAX2(old_size * 2, 16); 343 344 c->defs = reralloc(c, c->defs, struct qinst *, 345 c->defs_array_size); 346 memset(&c->defs[old_size], 0, 347 sizeof(c->defs[0]) * (c->defs_array_size - old_size)); 348 349 c->spillable = reralloc(c, c->spillable, 350 BITSET_WORD, 351 BITSET_WORDS(c->defs_array_size)); 352 for (int i = old_size; i < c->defs_array_size; i++) 353 BITSET_SET(c->spillable, i); 354 } 355 356 return reg; 357} 358 359struct qinst * 360vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1) 361{ 362 struct qinst *inst = calloc(1, sizeof(*inst)); 363 364 inst->qpu = v3d_qpu_nop(); 365 inst->qpu.alu.add.op = op; 366 367 inst->dst = dst; 368 inst->src[0] = src0; 369 inst->src[1] = src1; 370 inst->uniform = ~0; 371 372 inst->ip = -1; 373 374 return inst; 375} 376 377struct qinst * 378vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1) 379{ 380 struct qinst *inst = calloc(1, sizeof(*inst)); 381 382 inst->qpu = v3d_qpu_nop(); 383 inst->qpu.alu.mul.op = op; 384 385 inst->dst = dst; 386 inst->src[0] = src0; 387 inst->src[1] = src1; 388 inst->uniform = ~0; 389 390 inst->ip = -1; 391 392 return inst; 393} 394 395struct qinst * 396vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond) 397{ 398 struct qinst *inst = calloc(1, sizeof(*inst)); 399 400 inst->qpu = v3d_qpu_nop(); 401 inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH; 402 inst->qpu.branch.cond = cond; 403 inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE; 404 inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL; 405 inst->qpu.branch.ub = true; 406 inst->qpu.branch.bdu = 
V3D_QPU_BRANCH_DEST_REL; 407 408 inst->dst = vir_nop_reg(); 409 inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0); 410 411 inst->ip = -1; 412 413 return inst; 414} 415 416static void 417vir_emit(struct v3d_compile *c, struct qinst *inst) 418{ 419 inst->ip = -1; 420 421 switch (c->cursor.mode) { 422 case vir_cursor_add: 423 list_add(&inst->link, c->cursor.link); 424 break; 425 case vir_cursor_addtail: 426 list_addtail(&inst->link, c->cursor.link); 427 break; 428 } 429 430 c->cursor = vir_after_inst(inst); 431 c->live_intervals_valid = false; 432} 433 434/* Updates inst to write to a new temporary, emits it, and notes the def. */ 435struct qreg 436vir_emit_def(struct v3d_compile *c, struct qinst *inst) 437{ 438 assert(inst->dst.file == QFILE_NULL); 439 440 /* If we're emitting an instruction that's a def, it had better be 441 * writing a register. 442 */ 443 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) { 444 assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP || 445 v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op)); 446 assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP || 447 v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op)); 448 } 449 450 inst->dst = vir_get_temp(c); 451 452 if (inst->dst.file == QFILE_TEMP) 453 c->defs[inst->dst.index] = inst; 454 455 vir_emit(c, inst); 456 457 return inst->dst; 458} 459 460struct qinst * 461vir_emit_nondef(struct v3d_compile *c, struct qinst *inst) 462{ 463 if (inst->dst.file == QFILE_TEMP) 464 c->defs[inst->dst.index] = NULL; 465 466 vir_emit(c, inst); 467 468 return inst; 469} 470 471struct qblock * 472vir_new_block(struct v3d_compile *c) 473{ 474 struct qblock *block = rzalloc(c, struct qblock); 475 476 list_inithead(&block->instructions); 477 478 block->predecessors = _mesa_set_create(block, 479 _mesa_hash_pointer, 480 _mesa_key_pointer_equal); 481 482 block->index = c->next_block_index++; 483 484 return block; 485} 486 487void 488vir_set_emit_block(struct v3d_compile *c, struct qblock *block) 489{ 490 c->cur_block = block; 491 c->cursor = vir_after_block(block); 492 list_addtail(&block->link, &c->blocks); 493} 494 495struct qblock * 496vir_entry_block(struct v3d_compile *c) 497{ 498 return list_first_entry(&c->blocks, struct qblock, link); 499} 500 501struct qblock * 502vir_exit_block(struct v3d_compile *c) 503{ 504 return list_last_entry(&c->blocks, struct qblock, link); 505} 506 507void 508vir_link_blocks(struct qblock *predecessor, struct qblock *successor) 509{ 510 _mesa_set_add(successor->predecessors, predecessor); 511 if (predecessor->successors[0]) { 512 assert(!predecessor->successors[1]); 513 predecessor->successors[1] = successor; 514 } else { 515 predecessor->successors[0] = successor; 516 } 517} 518 519const struct v3d_compiler * 520v3d_compiler_init(const struct v3d_device_info *devinfo, 521 uint32_t max_inline_uniform_buffers) 522{ 523 struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler); 524 if (!compiler) 525 return NULL; 526 527 compiler->devinfo = devinfo; 528 compiler->max_inline_uniform_buffers = max_inline_uniform_buffers; 529 530 if (!vir_init_reg_sets(compiler)) { 531 ralloc_free(compiler); 532 return NULL; 533 } 534 535 return compiler; 536} 537 538void 539v3d_compiler_free(const struct v3d_compiler *compiler) 540{ 541 ralloc_free((void *)compiler); 542} 543 544static struct v3d_compile * 545vir_compile_init(const struct v3d_compiler *compiler, 546 struct v3d_key *key, 547 nir_shader *s, 548 void (*debug_output)(const char *msg, 549 void *debug_output_data), 550 void *debug_output_data, 551 int program_id, int 
variant_id, 552 uint32_t compile_strategy_idx, 553 uint32_t max_threads, 554 uint32_t min_threads_for_reg_alloc, 555 uint32_t max_tmu_spills, 556 bool disable_general_tmu_sched, 557 bool disable_loop_unrolling, 558 bool disable_constant_ubo_load_sorting, 559 bool disable_tmu_pipelining, 560 bool fallback_scheduler) 561{ 562 struct v3d_compile *c = rzalloc(NULL, struct v3d_compile); 563 564 c->compiler = compiler; 565 c->devinfo = compiler->devinfo; 566 c->key = key; 567 c->program_id = program_id; 568 c->variant_id = variant_id; 569 c->compile_strategy_idx = compile_strategy_idx; 570 c->threads = max_threads; 571 c->debug_output = debug_output; 572 c->debug_output_data = debug_output_data; 573 c->compilation_result = V3D_COMPILATION_SUCCEEDED; 574 c->min_threads_for_reg_alloc = min_threads_for_reg_alloc; 575 c->max_tmu_spills = max_tmu_spills; 576 c->fallback_scheduler = fallback_scheduler; 577 c->disable_general_tmu_sched = disable_general_tmu_sched; 578 c->disable_tmu_pipelining = disable_tmu_pipelining; 579 c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting; 580 c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL 581 ? true : disable_loop_unrolling; 582 583 s = nir_shader_clone(c, s); 584 c->s = s; 585 586 list_inithead(&c->blocks); 587 vir_set_emit_block(c, vir_new_block(c)); 588 589 c->output_position_index = -1; 590 c->output_sample_mask_index = -1; 591 592 c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer, 593 _mesa_key_pointer_equal); 594 595 c->tmu.outstanding_regs = _mesa_pointer_set_create(c); 596 c->flags_temp = -1; 597 598 return c; 599} 600 601static int 602type_size_vec4(const struct glsl_type *type, bool bindless) 603{ 604 return glsl_count_attribute_slots(type, false); 605} 606 607static void 608v3d_lower_nir(struct v3d_compile *c) 609{ 610 struct nir_lower_tex_options tex_options = { 611 .lower_txd = true, 612 .lower_tg4_broadcom_swizzle = true, 613 614 .lower_rect = false, /* XXX: Use this on V3D 3.x */ 615 .lower_txp = ~0, 616 /* Apply swizzles to all samplers. */ 617 .swizzle_result = ~0, 618 .lower_invalid_implicit_lod = true, 619 }; 620 621 /* Lower the format swizzle and (for 32-bit returns) 622 * ARB_texture_swizzle-style swizzle. 623 */ 624 assert(c->key->num_tex_used <= ARRAY_SIZE(c->key->tex)); 625 for (int i = 0; i < c->key->num_tex_used; i++) { 626 for (int j = 0; j < 4; j++) 627 tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j]; 628 } 629 630 assert(c->key->num_samplers_used <= ARRAY_SIZE(c->key->sampler)); 631 for (int i = 0; i < c->key->num_samplers_used; i++) { 632 if (c->key->sampler[i].return_size == 16) { 633 tex_options.lower_tex_packing[i] = 634 nir_lower_tex_packing_16; 635 } 636 } 637 638 /* CS textures may not have return_size reflecting the shadow state. 
*/ 639 nir_foreach_uniform_variable(var, c->s) { 640 const struct glsl_type *type = glsl_without_array(var->type); 641 unsigned array_len = MAX2(glsl_get_length(var->type), 1); 642 643 if (!glsl_type_is_sampler(type) || 644 !glsl_sampler_type_is_shadow(type)) 645 continue; 646 647 for (int i = 0; i < array_len; i++) { 648 tex_options.lower_tex_packing[var->data.binding + i] = 649 nir_lower_tex_packing_16; 650 } 651 } 652 653 NIR_PASS(_, c->s, nir_lower_tex, &tex_options); 654 NIR_PASS(_, c->s, nir_lower_system_values); 655 NIR_PASS(_, c->s, nir_lower_compute_system_values, NULL); 656 657 NIR_PASS(_, c->s, nir_lower_vars_to_scratch, 658 nir_var_function_temp, 659 0, 660 glsl_get_natural_size_align_bytes); 661 NIR_PASS(_, c->s, v3d_nir_lower_scratch); 662} 663 664static void 665v3d_set_prog_data_uniforms(struct v3d_compile *c, 666 struct v3d_prog_data *prog_data) 667{ 668 int count = c->num_uniforms; 669 struct v3d_uniform_list *ulist = &prog_data->uniforms; 670 671 ulist->count = count; 672 ulist->data = ralloc_array(prog_data, uint32_t, count); 673 memcpy(ulist->data, c->uniform_data, 674 count * sizeof(*ulist->data)); 675 ulist->contents = ralloc_array(prog_data, enum quniform_contents, count); 676 memcpy(ulist->contents, c->uniform_contents, 677 count * sizeof(*ulist->contents)); 678} 679 680static void 681v3d_vs_set_prog_data(struct v3d_compile *c, 682 struct v3d_vs_prog_data *prog_data) 683{ 684 /* The vertex data gets format converted by the VPM so that 685 * each attribute channel takes up a VPM column. Precompute 686 * the sizes for the shader record. 687 */ 688 for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) { 689 prog_data->vattr_sizes[i] = c->vattr_sizes[i]; 690 prog_data->vpm_input_size += c->vattr_sizes[i]; 691 } 692 693 memset(prog_data->driver_location_map, -1, 694 sizeof(prog_data->driver_location_map)); 695 696 nir_foreach_shader_in_variable(var, c->s) { 697 prog_data->driver_location_map[var->data.location] = 698 var->data.driver_location; 699 } 700 701 prog_data->uses_vid = BITSET_TEST(c->s->info.system_values_read, 702 SYSTEM_VALUE_VERTEX_ID) || 703 BITSET_TEST(c->s->info.system_values_read, 704 SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); 705 706 prog_data->uses_biid = BITSET_TEST(c->s->info.system_values_read, 707 SYSTEM_VALUE_BASE_INSTANCE); 708 709 prog_data->uses_iid = BITSET_TEST(c->s->info.system_values_read, 710 SYSTEM_VALUE_INSTANCE_ID) || 711 BITSET_TEST(c->s->info.system_values_read, 712 SYSTEM_VALUE_INSTANCE_INDEX); 713 714 if (prog_data->uses_vid) 715 prog_data->vpm_input_size++; 716 if (prog_data->uses_biid) 717 prog_data->vpm_input_size++; 718 if (prog_data->uses_iid) 719 prog_data->vpm_input_size++; 720 721 /* Input/output segment size are in sectors (8 rows of 32 bits per 722 * channel). 723 */ 724 prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8; 725 prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8; 726 727 /* Set us up for shared input/output segments. This is apparently 728 * necessary for our VCM setup to avoid varying corruption. 729 */ 730 prog_data->separate_segments = false; 731 prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size, 732 prog_data->vpm_input_size); 733 prog_data->vpm_input_size = 0; 734 735 /* Compute VCM cache size. We set up our program to take up less than 736 * half of the VPM, so that any set of bin and render programs won't 737 * run out of space. 
We need space for at least one input segment, 738 * and then allocate the rest to output segments (one for the current 739 * program, the rest to VCM). The valid range of the VCM cache size 740 * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4 741 * batches. 742 */ 743 assert(c->devinfo->vpm_size); 744 int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8; 745 int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size; 746 int half_vpm = vpm_size_in_sectors / 2; 747 int vpm_output_sectors = half_vpm - prog_data->vpm_input_size; 748 int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size; 749 assert(vpm_output_batches >= 2); 750 prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4); 751} 752 753static void 754v3d_gs_set_prog_data(struct v3d_compile *c, 755 struct v3d_gs_prog_data *prog_data) 756{ 757 prog_data->num_inputs = c->num_inputs; 758 memcpy(prog_data->input_slots, c->input_slots, 759 c->num_inputs * sizeof(*c->input_slots)); 760 761 /* gl_PrimitiveIdIn is written by the GBG into the first word of the 762 * VPM output header automatically and the shader will overwrite 763 * it after reading it if necessary, so it doesn't add to the VPM 764 * size requirements. 765 */ 766 prog_data->uses_pid = BITSET_TEST(c->s->info.system_values_read, 767 SYSTEM_VALUE_PRIMITIVE_ID); 768 769 /* Output segment size is in sectors (8 rows of 32 bits per channel) */ 770 prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8; 771 772 /* Compute SIMD dispatch width and update VPM output size accordingly 773 * to ensure we can fit our program in memory. Available widths are 774 * 16, 8, 4, 1. 775 * 776 * Notice that at draw time we will have to consider VPM memory 777 * requirements from other stages and choose a smaller dispatch 778 * width if needed to fit the program in VPM memory. 
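 * For example, if the output segment needs 20 sectors at width 16 it does
 * not fit the 16-sector limit enforced below, so we drop to width 8, which
 * halves the segment to 10 sectors and fits. A width of 2 is never used
 * since it is not a valid dispatch width, hence the extra loop condition.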
779 */ 780 prog_data->simd_width = 16; 781 while ((prog_data->simd_width > 1 && prog_data->vpm_output_size > 16) || 782 prog_data->simd_width == 2) { 783 prog_data->simd_width >>= 1; 784 prog_data->vpm_output_size = 785 align(prog_data->vpm_output_size, 2) / 2; 786 } 787 assert(prog_data->vpm_output_size <= 16); 788 assert(prog_data->simd_width != 2); 789 790 prog_data->out_prim_type = c->s->info.gs.output_primitive; 791 prog_data->num_invocations = c->s->info.gs.invocations; 792 793 prog_data->writes_psiz = 794 c->s->info.outputs_written & (1 << VARYING_SLOT_PSIZ); 795} 796 797static void 798v3d_set_fs_prog_data_inputs(struct v3d_compile *c, 799 struct v3d_fs_prog_data *prog_data) 800{ 801 prog_data->num_inputs = c->num_inputs; 802 memcpy(prog_data->input_slots, c->input_slots, 803 c->num_inputs * sizeof(*c->input_slots)); 804 805 STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) > 806 (V3D_MAX_FS_INPUTS - 1) / 24); 807 for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) { 808 if (BITSET_TEST(c->flat_shade_flags, i)) 809 prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24); 810 811 if (BITSET_TEST(c->noperspective_flags, i)) 812 prog_data->noperspective_flags[i / 24] |= 1 << (i % 24); 813 814 if (BITSET_TEST(c->centroid_flags, i)) 815 prog_data->centroid_flags[i / 24] |= 1 << (i % 24); 816 } 817} 818 819static void 820v3d_fs_set_prog_data(struct v3d_compile *c, 821 struct v3d_fs_prog_data *prog_data) 822{ 823 v3d_set_fs_prog_data_inputs(c, prog_data); 824 prog_data->writes_z = c->writes_z; 825 prog_data->writes_z_from_fep = c->writes_z_from_fep; 826 prog_data->disable_ez = !c->s->info.fs.early_fragment_tests; 827 prog_data->uses_center_w = c->uses_center_w; 828 prog_data->uses_implicit_point_line_varyings = 829 c->uses_implicit_point_line_varyings; 830 prog_data->lock_scoreboard_on_first_thrsw = 831 c->lock_scoreboard_on_first_thrsw; 832 prog_data->force_per_sample_msaa = c->force_per_sample_msaa; 833 prog_data->uses_pid = c->fs_uses_primitive_id; 834} 835 836static void 837v3d_cs_set_prog_data(struct v3d_compile *c, 838 struct v3d_compute_prog_data *prog_data) 839{ 840 prog_data->shared_size = c->s->info.shared_size; 841 842 prog_data->local_size[0] = c->s->info.workgroup_size[0]; 843 prog_data->local_size[1] = c->s->info.workgroup_size[1]; 844 prog_data->local_size[2] = c->s->info.workgroup_size[2]; 845 846 prog_data->has_subgroups = c->has_subgroups; 847} 848 849static void 850v3d_set_prog_data(struct v3d_compile *c, 851 struct v3d_prog_data *prog_data) 852{ 853 prog_data->threads = c->threads; 854 prog_data->single_seg = !c->last_thrsw; 855 prog_data->spill_size = c->spill_size; 856 prog_data->tmu_spills = c->spills; 857 prog_data->tmu_fills = c->fills; 858 prog_data->qpu_read_stalls = c->qpu_inst_stalled_count; 859 prog_data->compile_strategy_idx = c->compile_strategy_idx; 860 prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl; 861 prog_data->has_control_barrier = c->s->info.uses_control_barrier; 862 prog_data->has_global_address = c->has_global_address; 863 864 v3d_set_prog_data_uniforms(c, prog_data); 865 866 switch (c->s->info.stage) { 867 case MESA_SHADER_VERTEX: 868 v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data); 869 break; 870 case MESA_SHADER_GEOMETRY: 871 v3d_gs_set_prog_data(c, (struct v3d_gs_prog_data *)prog_data); 872 break; 873 case MESA_SHADER_FRAGMENT: 874 v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data); 875 break; 876 case MESA_SHADER_COMPUTE: 877 v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data); 878 break; 879 default: 880 
unreachable("unsupported shader stage"); 881 } 882} 883 884static uint64_t * 885v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size) 886{ 887 *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t); 888 889 uint64_t *qpu_insts = malloc(*final_assembly_size); 890 if (!qpu_insts) 891 return NULL; 892 893 memcpy(qpu_insts, c->qpu_insts, *final_assembly_size); 894 895 vir_compile_destroy(c); 896 897 return qpu_insts; 898} 899 900static void 901v3d_nir_lower_vs_early(struct v3d_compile *c) 902{ 903 /* Split our I/O vars and dead code eliminate the unused 904 * components. 905 */ 906 NIR_PASS(_, c->s, nir_lower_io_to_scalar_early, 907 nir_var_shader_in | nir_var_shader_out); 908 uint64_t used_outputs[4] = {0}; 909 for (int i = 0; i < c->vs_key->num_used_outputs; i++) { 910 int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]); 911 int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]); 912 used_outputs[comp] |= 1ull << slot; 913 } 914 NIR_PASS(_, c->s, nir_remove_unused_io_vars, 915 nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ 916 NIR_PASS(_, c->s, nir_lower_global_vars_to_local); 917 v3d_optimize_nir(c, c->s); 918 NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); 919 920 /* This must go before nir_lower_io */ 921 if (c->vs_key->per_vertex_point_size) 922 NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f); 923 924 NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, 925 type_size_vec4, 926 (nir_lower_io_options)0); 927 /* clean up nir_lower_io's deref_var remains and do a constant folding pass 928 * on the code it generated. 929 */ 930 NIR_PASS(_, c->s, nir_opt_dce); 931 NIR_PASS(_, c->s, nir_opt_constant_folding); 932} 933 934static void 935v3d_nir_lower_gs_early(struct v3d_compile *c) 936{ 937 /* Split our I/O vars and dead code eliminate the unused 938 * components. 939 */ 940 NIR_PASS(_, c->s, nir_lower_io_to_scalar_early, 941 nir_var_shader_in | nir_var_shader_out); 942 uint64_t used_outputs[4] = {0}; 943 for (int i = 0; i < c->gs_key->num_used_outputs; i++) { 944 int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]); 945 int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]); 946 used_outputs[comp] |= 1ull << slot; 947 } 948 NIR_PASS(_, c->s, nir_remove_unused_io_vars, 949 nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ 950 NIR_PASS(_, c->s, nir_lower_global_vars_to_local); 951 v3d_optimize_nir(c, c->s); 952 NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); 953 954 /* This must go before nir_lower_io */ 955 if (c->gs_key->per_vertex_point_size) 956 NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f); 957 958 NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, 959 type_size_vec4, 960 (nir_lower_io_options)0); 961 /* clean up nir_lower_io's deref_var remains and do a constant folding pass 962 * on the code it generated. 
 */
        NIR_PASS(_, c->s, nir_opt_dce);
        NIR_PASS(_, c->s, nir_opt_constant_folding);
}

static void
v3d_fixup_fs_output_types(struct v3d_compile *c)
{
        nir_foreach_shader_out_variable(var, c->s) {
                uint32_t mask = 0;

                switch (var->data.location) {
                case FRAG_RESULT_COLOR:
                        mask = ~0;
                        break;
                case FRAG_RESULT_DATA0:
                case FRAG_RESULT_DATA1:
                case FRAG_RESULT_DATA2:
                case FRAG_RESULT_DATA3:
                        mask = 1 << (var->data.location - FRAG_RESULT_DATA0);
                        break;
                }

                if (c->fs_key->int_color_rb & mask) {
                        var->type =
                                glsl_vector_type(GLSL_TYPE_INT,
                                                 glsl_get_components(var->type));
                } else if (c->fs_key->uint_color_rb & mask) {
                        var->type =
                                glsl_vector_type(GLSL_TYPE_UINT,
                                                 glsl_get_components(var->type));
                }
        }
}

static void
v3d_nir_lower_fs_early(struct v3d_compile *c)
{
        if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
                v3d_fixup_fs_output_types(c);

        NIR_PASS(_, c->s, v3d_nir_lower_logic_ops, c);

        if (c->fs_key->line_smoothing) {
                NIR_PASS(_, c->s, v3d_nir_lower_line_smooth);
                NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
                /* The lowering pass can introduce new sysval reads */
                nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s));
        }
}

static void
v3d_nir_lower_gs_late(struct v3d_compile *c)
{
        if (c->key->ucp_enables) {
                NIR_PASS(_, c->s, nir_lower_clip_gs, c->key->ucp_enables,
                         false, NULL);
        }

        /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
}

static void
v3d_nir_lower_vs_late(struct v3d_compile *c)
{
        if (c->key->ucp_enables) {
                NIR_PASS(_, c->s, nir_lower_clip_vs, c->key->ucp_enables,
                         false, false, NULL);
                NIR_PASS_V(c->s, nir_lower_io_to_scalar,
                           nir_var_shader_out);
        }

        /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
}

static void
v3d_nir_lower_fs_late(struct v3d_compile *c)
{
        /* In OpenGL the fragment shader can't read gl_ClipDistance[], but
         * Vulkan allows it, in which case the SPIR-V compiler will declare
         * VARYING_SLOT_CLIP_DIST0 as a compact array variable. Pass true as
         * the last parameter to always operate with a compact array in both
         * OpenGL and Vulkan so we don't have to care about the API we
         * are using.
1049 */ 1050 if (c->key->ucp_enables) 1051 NIR_PASS(_, c->s, nir_lower_clip_fs, c->key->ucp_enables, true); 1052 1053 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in); 1054} 1055 1056static uint32_t 1057vir_get_max_temps(struct v3d_compile *c) 1058{ 1059 int max_ip = 0; 1060 vir_for_each_inst_inorder(inst, c) 1061 max_ip++; 1062 1063 uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip); 1064 1065 for (int t = 0; t < c->num_temps; t++) { 1066 for (int i = c->temp_start[t]; (i < c->temp_end[t] && 1067 i < max_ip); i++) { 1068 if (i > max_ip) 1069 break; 1070 pressure[i]++; 1071 } 1072 } 1073 1074 uint32_t max_temps = 0; 1075 for (int i = 0; i < max_ip; i++) 1076 max_temps = MAX2(max_temps, pressure[i]); 1077 1078 ralloc_free(pressure); 1079 1080 return max_temps; 1081} 1082 1083enum v3d_dependency_class { 1084 V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0 1085}; 1086 1087static bool 1088v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr, 1089 nir_schedule_dependency *dep, 1090 void *user_data) 1091{ 1092 struct v3d_compile *c = user_data; 1093 1094 switch (intr->intrinsic) { 1095 case nir_intrinsic_store_output: 1096 /* Writing to location 0 overwrites the value passed in for 1097 * gl_PrimitiveID on geometry shaders 1098 */ 1099 if (c->s->info.stage != MESA_SHADER_GEOMETRY || 1100 nir_intrinsic_base(intr) != 0) 1101 break; 1102 1103 nir_const_value *const_value = 1104 nir_src_as_const_value(intr->src[1]); 1105 1106 if (const_value == NULL) 1107 break; 1108 1109 uint64_t offset = 1110 nir_const_value_as_uint(*const_value, 1111 nir_src_bit_size(intr->src[1])); 1112 if (offset != 0) 1113 break; 1114 1115 dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0; 1116 dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY; 1117 return true; 1118 1119 case nir_intrinsic_load_primitive_id: 1120 if (c->s->info.stage != MESA_SHADER_GEOMETRY) 1121 break; 1122 1123 dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0; 1124 dep->type = NIR_SCHEDULE_READ_DEPENDENCY; 1125 return true; 1126 1127 default: 1128 break; 1129 } 1130 1131 return false; 1132} 1133 1134static unsigned 1135v3d_instr_delay_cb(nir_instr *instr, void *data) 1136{ 1137 struct v3d_compile *c = (struct v3d_compile *) data; 1138 1139 switch (instr->type) { 1140 case nir_instr_type_ssa_undef: 1141 case nir_instr_type_load_const: 1142 case nir_instr_type_alu: 1143 case nir_instr_type_deref: 1144 case nir_instr_type_jump: 1145 case nir_instr_type_parallel_copy: 1146 case nir_instr_type_call: 1147 case nir_instr_type_phi: 1148 return 1; 1149 1150 /* We should not use very large delays for TMU instructions. Typically, 1151 * thread switches will be sufficient to hide all or most of the latency, 1152 * so we typically only need a little bit of extra room. If we over-estimate 1153 * the latency here we may end up unnecesarily delaying the critical path in 1154 * the shader, which would have a negative effect in performance, so here 1155 * we are trying to strike a balance based on empirical testing. 
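 * The fixed delays below reflect that: most instructions get a delay of 1,
 * general TMU accesses (SSBO, scratch, shared and image loads, plus divergent
 * UBO loads, which can't use the unifa path) get 3 when general TMU
 * scheduling is enabled, and texture operations get 5.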
1156 */ 1157 case nir_instr_type_intrinsic: { 1158 if (!c->disable_general_tmu_sched) { 1159 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 1160 switch (intr->intrinsic) { 1161 case nir_intrinsic_load_ssbo: 1162 case nir_intrinsic_load_scratch: 1163 case nir_intrinsic_load_shared: 1164 case nir_intrinsic_image_load: 1165 return 3; 1166 case nir_intrinsic_load_ubo: 1167 if (nir_src_is_divergent(intr->src[1])) 1168 return 3; 1169 FALLTHROUGH; 1170 default: 1171 return 1; 1172 } 1173 } else { 1174 return 1; 1175 } 1176 break; 1177 } 1178 1179 case nir_instr_type_tex: 1180 return 5; 1181 } 1182 1183 return 0; 1184} 1185 1186static bool 1187should_split_wrmask(const nir_instr *instr, const void *data) 1188{ 1189 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 1190 switch (intr->intrinsic) { 1191 case nir_intrinsic_store_ssbo: 1192 case nir_intrinsic_store_shared: 1193 case nir_intrinsic_store_global: 1194 case nir_intrinsic_store_scratch: 1195 return true; 1196 default: 1197 return false; 1198 } 1199} 1200 1201static nir_intrinsic_instr * 1202nir_instr_as_constant_ubo_load(nir_instr *inst) 1203{ 1204 if (inst->type != nir_instr_type_intrinsic) 1205 return NULL; 1206 1207 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst); 1208 if (intr->intrinsic != nir_intrinsic_load_ubo) 1209 return NULL; 1210 1211 assert(nir_src_is_const(intr->src[0])); 1212 if (!nir_src_is_const(intr->src[1])) 1213 return NULL; 1214 1215 return intr; 1216} 1217 1218static bool 1219v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref) 1220{ 1221 bool progress = false; 1222 1223 nir_instr *ref_inst = &ref->instr; 1224 uint32_t ref_offset = nir_src_as_uint(ref->src[1]); 1225 uint32_t ref_index = nir_src_as_uint(ref->src[0]); 1226 1227 /* Go through all instructions after ref searching for constant UBO 1228 * loads for the same UBO index. 1229 */ 1230 bool seq_break = false; 1231 nir_instr *inst = &ref->instr; 1232 nir_instr *next_inst = NULL; 1233 while (true) { 1234 inst = next_inst ? next_inst : nir_instr_next(inst); 1235 if (!inst) 1236 break; 1237 1238 next_inst = NULL; 1239 1240 if (inst->type != nir_instr_type_intrinsic) 1241 continue; 1242 1243 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst); 1244 if (intr->intrinsic != nir_intrinsic_load_ubo) 1245 continue; 1246 1247 /* We only produce unifa sequences for non-divergent loads */ 1248 if (nir_src_is_divergent(intr->src[1])) 1249 continue; 1250 1251 /* If there are any UBO loads that are not constant or that 1252 * use a different UBO index in between the reference load and 1253 * any other constant load for the same index, they would break 1254 * the unifa sequence. We will flag that so we can then move 1255 * all constant UBO loads for the reference index before these 1256 * and not just the ones that are not ordered to avoid breaking 1257 * the sequence and reduce unifa writes. 1258 */ 1259 if (!nir_src_is_const(intr->src[1])) { 1260 seq_break = true; 1261 continue; 1262 } 1263 uint32_t offset = nir_src_as_uint(intr->src[1]); 1264 1265 assert(nir_src_is_const(intr->src[0])); 1266 uint32_t index = nir_src_as_uint(intr->src[0]); 1267 if (index != ref_index) { 1268 seq_break = true; 1269 continue; 1270 } 1271 1272 /* Only move loads with an offset that is close enough to the 1273 * reference offset, since otherwise we would not be able to 1274 * skip the unifa write for them. See ntq_emit_load_ubo_unifa. 
1275 */ 1276 if (abs((int)(ref_offset - offset)) > MAX_UNIFA_SKIP_DISTANCE) 1277 continue; 1278 1279 /* We will move this load if its offset is smaller than ref's 1280 * (in which case we will move it before ref) or if the offset 1281 * is larger than ref's but there are sequence breakers in 1282 * in between (in which case we will move it after ref and 1283 * before the sequence breakers). 1284 */ 1285 if (!seq_break && offset >= ref_offset) 1286 continue; 1287 1288 /* Find where exactly we want to move this load: 1289 * 1290 * If we are moving it before ref, we want to check any other 1291 * UBO loads we placed before ref and make sure we insert this 1292 * one properly ordered with them. Likewise, if we are moving 1293 * it after ref. 1294 */ 1295 nir_instr *pos = ref_inst; 1296 nir_instr *tmp = pos; 1297 do { 1298 if (offset < ref_offset) 1299 tmp = nir_instr_prev(tmp); 1300 else 1301 tmp = nir_instr_next(tmp); 1302 1303 if (!tmp || tmp == inst) 1304 break; 1305 1306 /* Ignore non-unifa UBO loads */ 1307 if (tmp->type != nir_instr_type_intrinsic) 1308 continue; 1309 1310 nir_intrinsic_instr *tmp_intr = 1311 nir_instr_as_intrinsic(tmp); 1312 if (tmp_intr->intrinsic != nir_intrinsic_load_ubo) 1313 continue; 1314 1315 if (nir_src_is_divergent(tmp_intr->src[1])) 1316 continue; 1317 1318 /* Stop if we find a unifa UBO load that breaks the 1319 * sequence. 1320 */ 1321 if (!nir_src_is_const(tmp_intr->src[1])) 1322 break; 1323 1324 if (nir_src_as_uint(tmp_intr->src[0]) != index) 1325 break; 1326 1327 uint32_t tmp_offset = nir_src_as_uint(tmp_intr->src[1]); 1328 if (offset < ref_offset) { 1329 if (tmp_offset < offset || 1330 tmp_offset >= ref_offset) { 1331 break; 1332 } else { 1333 pos = tmp; 1334 } 1335 } else { 1336 if (tmp_offset > offset || 1337 tmp_offset <= ref_offset) { 1338 break; 1339 } else { 1340 pos = tmp; 1341 } 1342 } 1343 } while (true); 1344 1345 /* We can't move the UBO load before the instruction that 1346 * defines its constant offset. If that instruction is placed 1347 * in between the new location (pos) and the current location 1348 * of this load, we will have to move that instruction too. 1349 * 1350 * We don't care about the UBO index definition because that 1351 * is optimized to be reused by all UBO loads for the same 1352 * index and therefore is certain to be defined before the 1353 * first UBO load that uses it. 1354 */ 1355 nir_instr *offset_inst = NULL; 1356 tmp = inst; 1357 while ((tmp = nir_instr_prev(tmp)) != NULL) { 1358 if (pos == tmp) { 1359 /* We reached the target location without 1360 * finding the instruction that defines the 1361 * offset, so that instruction must be before 1362 * the new position and we don't have to fix it. 1363 */ 1364 break; 1365 } 1366 if (intr->src[1].ssa->parent_instr == tmp) { 1367 offset_inst = tmp; 1368 break; 1369 } 1370 } 1371 1372 if (offset_inst) { 1373 exec_node_remove(&offset_inst->node); 1374 exec_node_insert_node_before(&pos->node, 1375 &offset_inst->node); 1376 } 1377 1378 /* Since we are moving the instruction before its current 1379 * location, grab its successor before the move so that 1380 * we can continue the next iteration of the main loop from 1381 * that instruction. 
1382 */ 1383 next_inst = nir_instr_next(inst); 1384 1385 /* Move this load to the selected location */ 1386 exec_node_remove(&inst->node); 1387 if (offset < ref_offset) 1388 exec_node_insert_node_before(&pos->node, &inst->node); 1389 else 1390 exec_node_insert_after(&pos->node, &inst->node); 1391 1392 progress = true; 1393 } 1394 1395 return progress; 1396} 1397 1398static bool 1399v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c, 1400 nir_block *block) 1401{ 1402 bool progress = false; 1403 bool local_progress; 1404 do { 1405 local_progress = false; 1406 nir_foreach_instr_safe(inst, block) { 1407 nir_intrinsic_instr *intr = 1408 nir_instr_as_constant_ubo_load(inst); 1409 if (intr) { 1410 local_progress |= 1411 v3d_nir_sort_constant_ubo_load(block, intr); 1412 } 1413 } 1414 progress |= local_progress; 1415 } while (local_progress); 1416 1417 return progress; 1418} 1419 1420/** 1421 * Sorts constant UBO loads in each block by offset to maximize chances of 1422 * skipping unifa writes when converting to VIR. This can increase register 1423 * pressure. 1424 */ 1425static bool 1426v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c) 1427{ 1428 nir_foreach_function(function, s) { 1429 if (function->impl) { 1430 nir_foreach_block(block, function->impl) { 1431 c->sorted_any_ubo_loads |= 1432 v3d_nir_sort_constant_ubo_loads_block(c, block); 1433 } 1434 nir_metadata_preserve(function->impl, 1435 nir_metadata_block_index | 1436 nir_metadata_dominance); 1437 } 1438 } 1439 return c->sorted_any_ubo_loads; 1440} 1441 1442static void 1443lower_load_num_subgroups(struct v3d_compile *c, 1444 nir_builder *b, 1445 nir_intrinsic_instr *intr) 1446{ 1447 assert(c->s->info.stage == MESA_SHADER_COMPUTE); 1448 assert(intr->intrinsic == nir_intrinsic_load_num_subgroups); 1449 1450 b->cursor = nir_after_instr(&intr->instr); 1451 uint32_t num_subgroups = 1452 DIV_ROUND_UP(c->s->info.workgroup_size[0] * 1453 c->s->info.workgroup_size[1] * 1454 c->s->info.workgroup_size[2], V3D_CHANNELS); 1455 nir_ssa_def *result = nir_imm_int(b, num_subgroups); 1456 nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); 1457 nir_instr_remove(&intr->instr); 1458} 1459 1460static bool 1461lower_subgroup_intrinsics(struct v3d_compile *c, 1462 nir_block *block, nir_builder *b) 1463{ 1464 bool progress = false; 1465 nir_foreach_instr_safe(inst, block) { 1466 if (inst->type != nir_instr_type_intrinsic) 1467 continue;; 1468 1469 nir_intrinsic_instr *intr = 1470 nir_instr_as_intrinsic(inst); 1471 if (!intr) 1472 continue; 1473 1474 switch (intr->intrinsic) { 1475 case nir_intrinsic_load_num_subgroups: 1476 lower_load_num_subgroups(c, b, intr); 1477 progress = true; 1478 FALLTHROUGH; 1479 case nir_intrinsic_load_subgroup_id: 1480 case nir_intrinsic_load_subgroup_size: 1481 case nir_intrinsic_load_subgroup_invocation: 1482 case nir_intrinsic_elect: 1483 c->has_subgroups = true; 1484 break; 1485 default: 1486 break; 1487 } 1488 } 1489 1490 return progress; 1491} 1492 1493static bool 1494v3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c) 1495{ 1496 bool progress = false; 1497 nir_foreach_function(function, s) { 1498 if (function->impl) { 1499 nir_builder b; 1500 nir_builder_init(&b, function->impl); 1501 1502 nir_foreach_block(block, function->impl) 1503 progress |= lower_subgroup_intrinsics(c, block, &b); 1504 1505 nir_metadata_preserve(function->impl, 1506 nir_metadata_block_index | 1507 nir_metadata_dominance); 1508 } 1509 } 1510 return progress; 1511} 1512 1513static void 
1514v3d_attempt_compile(struct v3d_compile *c) 1515{ 1516 switch (c->s->info.stage) { 1517 case MESA_SHADER_VERTEX: 1518 c->vs_key = (struct v3d_vs_key *) c->key; 1519 break; 1520 case MESA_SHADER_GEOMETRY: 1521 c->gs_key = (struct v3d_gs_key *) c->key; 1522 break; 1523 case MESA_SHADER_FRAGMENT: 1524 c->fs_key = (struct v3d_fs_key *) c->key; 1525 break; 1526 case MESA_SHADER_COMPUTE: 1527 break; 1528 default: 1529 unreachable("unsupported shader stage"); 1530 } 1531 1532 switch (c->s->info.stage) { 1533 case MESA_SHADER_VERTEX: 1534 v3d_nir_lower_vs_early(c); 1535 break; 1536 case MESA_SHADER_GEOMETRY: 1537 v3d_nir_lower_gs_early(c); 1538 break; 1539 case MESA_SHADER_FRAGMENT: 1540 v3d_nir_lower_fs_early(c); 1541 break; 1542 default: 1543 break; 1544 } 1545 1546 v3d_lower_nir(c); 1547 1548 switch (c->s->info.stage) { 1549 case MESA_SHADER_VERTEX: 1550 v3d_nir_lower_vs_late(c); 1551 break; 1552 case MESA_SHADER_GEOMETRY: 1553 v3d_nir_lower_gs_late(c); 1554 break; 1555 case MESA_SHADER_FRAGMENT: 1556 v3d_nir_lower_fs_late(c); 1557 break; 1558 default: 1559 break; 1560 } 1561 1562 NIR_PASS(_, c->s, v3d_nir_lower_io, c); 1563 NIR_PASS(_, c->s, v3d_nir_lower_txf_ms, c); 1564 NIR_PASS(_, c->s, v3d_nir_lower_image_load_store); 1565 nir_lower_idiv_options idiv_options = { 1566 .imprecise_32bit_lowering = true, 1567 .allow_fp16 = true, 1568 }; 1569 NIR_PASS(_, c->s, nir_lower_idiv, &idiv_options); 1570 1571 if (c->key->robust_buffer_access) { 1572 /* v3d_nir_lower_robust_buffer_access assumes constant buffer 1573 * indices on ubo/ssbo intrinsics so run copy propagation and 1574 * constant folding passes before we run the lowering to warrant 1575 * this. We also want to run the lowering before v3d_optimize to 1576 * clean-up redundant get_buffer_size calls produced in the pass. 1577 */ 1578 NIR_PASS(_, c->s, nir_copy_prop); 1579 NIR_PASS(_, c->s, nir_opt_constant_folding); 1580 NIR_PASS(_, c->s, v3d_nir_lower_robust_buffer_access, c); 1581 } 1582 1583 NIR_PASS(_, c->s, nir_lower_wrmasks, should_split_wrmask, c->s); 1584 1585 NIR_PASS(_, c->s, v3d_nir_lower_load_store_bitsize, c); 1586 1587 NIR_PASS(_, c->s, v3d_nir_lower_subgroup_intrinsics, c); 1588 1589 v3d_optimize_nir(c, c->s); 1590 1591 /* Do late algebraic optimization to turn add(a, neg(b)) back into 1592 * subs, then the mandatory cleanup after algebraic. Note that it may 1593 * produce fnegs, and if so then we need to keep running to squash 1594 * fneg(fneg(a)). 1595 */ 1596 bool more_late_algebraic = true; 1597 while (more_late_algebraic) { 1598 more_late_algebraic = false; 1599 NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late); 1600 NIR_PASS(_, c->s, nir_opt_constant_folding); 1601 NIR_PASS(_, c->s, nir_copy_prop); 1602 NIR_PASS(_, c->s, nir_opt_dce); 1603 NIR_PASS(_, c->s, nir_opt_cse); 1604 } 1605 1606 NIR_PASS(_, c->s, nir_lower_bool_to_int32); 1607 NIR_PASS(_, c->s, nir_convert_to_lcssa, true, true); 1608 NIR_PASS_V(c->s, nir_divergence_analysis); 1609 NIR_PASS(_, c->s, nir_convert_from_ssa, true); 1610 1611 struct nir_schedule_options schedule_options = { 1612 /* Schedule for about half our register space, to enable more 1613 * shaders to hit 4 threads. 1614 */ 1615 .threshold = c->threads == 4 ? 24 : 48, 1616 1617 /* Vertex shaders share the same memory for inputs and outputs, 1618 * fragement and geometry shaders do not. 
1619 */ 1620 .stages_with_shared_io_memory = 1621 (((1 << MESA_ALL_SHADER_STAGES) - 1) & 1622 ~((1 << MESA_SHADER_FRAGMENT) | 1623 (1 << MESA_SHADER_GEOMETRY))), 1624 1625 .fallback = c->fallback_scheduler, 1626 1627 .intrinsic_cb = v3d_intrinsic_dependency_cb, 1628 .intrinsic_cb_data = c, 1629 1630 .instr_delay_cb = v3d_instr_delay_cb, 1631 .instr_delay_cb_data = c, 1632 }; 1633 NIR_PASS_V(c->s, nir_schedule, &schedule_options); 1634 1635 if (!c->disable_constant_ubo_load_sorting) 1636 NIR_PASS(_, c->s, v3d_nir_sort_constant_ubo_loads, c); 1637 1638 NIR_PASS(_, c->s, nir_opt_move, nir_move_load_uniform | 1639 nir_move_const_undef); 1640 1641 v3d_nir_to_vir(c); 1642} 1643 1644uint32_t 1645v3d_prog_data_size(gl_shader_stage stage) 1646{ 1647 static const int prog_data_size[] = { 1648 [MESA_SHADER_VERTEX] = sizeof(struct v3d_vs_prog_data), 1649 [MESA_SHADER_GEOMETRY] = sizeof(struct v3d_gs_prog_data), 1650 [MESA_SHADER_FRAGMENT] = sizeof(struct v3d_fs_prog_data), 1651 [MESA_SHADER_COMPUTE] = sizeof(struct v3d_compute_prog_data), 1652 }; 1653 1654 assert(stage >= 0 && 1655 stage < ARRAY_SIZE(prog_data_size) && 1656 prog_data_size[stage]); 1657 1658 return prog_data_size[stage]; 1659} 1660 1661int v3d_shaderdb_dump(struct v3d_compile *c, 1662 char **shaderdb_str) 1663{ 1664 if (c == NULL || c->compilation_result != V3D_COMPILATION_SUCCEEDED) 1665 return -1; 1666 1667 return asprintf(shaderdb_str, 1668 "%s shader: %d inst, %d threads, %d loops, " 1669 "%d uniforms, %d max-temps, %d:%d spills:fills, " 1670 "%d sfu-stalls, %d inst-and-stalls, %d nops", 1671 vir_get_stage_name(c), 1672 c->qpu_inst_count, 1673 c->threads, 1674 c->loops, 1675 c->num_uniforms, 1676 vir_get_max_temps(c), 1677 c->spills, 1678 c->fills, 1679 c->qpu_inst_stalled_count, 1680 c->qpu_inst_count + c->qpu_inst_stalled_count, 1681 c->nop_count); 1682} 1683 1684/* This is a list of incremental changes to the compilation strategy 1685 * that will be used to try to compile the shader successfully. The 1686 * default strategy is to enable all optimizations which will have 1687 * the highest register pressure but is expected to produce most 1688 * optimal code. Following strategies incrementally disable specific 1689 * optimizations that are known to contribute to register pressure 1690 * in order to be able to compile the shader successfully while meeting 1691 * thread count requirements. 1692 * 1693 * V3D 4.1+ has a min thread count of 2, but we can use 1 here to also 1694 * cover previous hardware as well (meaning that we are not limiting 1695 * register allocation to any particular thread count). This is fine 1696 * because v3d_nir_to_vir will cap this to the actual minimum. 
 */
struct v3d_compiler_strategy {
        const char *name;
        uint32_t max_threads;
        uint32_t min_threads;
        bool disable_general_tmu_sched;
        bool disable_loop_unrolling;
        bool disable_ubo_load_sorting;
        bool disable_tmu_pipelining;
        uint32_t max_tmu_spills;
} static const strategies[] = {
        /*0*/  { "default", 4, 4, false, false, false, false, 0 },
        /*1*/  { "disable general TMU sched", 4, 4, true, false, false, false, 0 },
        /*2*/  { "disable loop unrolling", 4, 4, true, true, false, false, 0 },
        /*3*/  { "disable UBO load sorting", 4, 4, true, true, true, false, 0 },
        /*4*/  { "disable TMU pipelining", 4, 4, true, true, true, true, 0 },
        /*5*/  { "lower thread count", 2, 1, false, false, false, false, -1 },
        /*6*/  { "disable general TMU sched (2t)", 2, 1, true, false, false, false, -1 },
        /*7*/  { "disable loop unrolling (2t)", 2, 1, true, true, false, false, -1 },
        /*8*/  { "disable UBO load sorting (2t)", 2, 1, true, true, true, false, -1 },
        /*9*/  { "disable TMU pipelining (2t)", 2, 1, true, true, true, true, -1 },
        /*10*/ { "fallback scheduler", 2, 1, true, true, true, true, -1 }
};

/**
 * If a particular optimization didn't make any progress during a compile
 * attempt, disabling it alone won't allow us to compile the shader
 * successfully, since we'll end up with the same code. Detect these scenarios
 * so we can avoid wasting time with useless compiles. We should also consider
 * whether the strategy changes other aspects of the compilation process, like
 * spilling, and not skip it in that case.
 */
static bool
skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
{
        /* We decide if we can skip a strategy based on the optimizations that
         * were active in the previous strategy, so we should only be calling
         * this for strategies after the first.
1735 */ 1736 assert(idx > 0); 1737 1738 /* Don't skip a strategy that changes spilling behavior */ 1739 if (strategies[idx].max_tmu_spills != 1740 strategies[idx - 1].max_tmu_spills) { 1741 return false; 1742 } 1743 1744 switch (idx) { 1745 /* General TMU sched.: skip if we didn't emit any TMU loads */ 1746 case 1: 1747 case 6: 1748 return !c->has_general_tmu_load; 1749 /* Loop unrolling: skip if we didn't unroll any loops */ 1750 case 2: 1751 case 7: 1752 return !c->unrolled_any_loops; 1753 /* UBO load sorting: skip if we didn't sort any loads */ 1754 case 3: 1755 case 8: 1756 return !c->sorted_any_ubo_loads; 1757 /* TMU pipelining: skip if we didn't pipeline any TMU ops */ 1758 case 4: 1759 case 9: 1760 return !c->pipelined_any_tmu; 1761 /* Lower thread count: skip if we already tried less that 4 threads */ 1762 case 5: 1763 return c->threads < 4; 1764 default: 1765 return false; 1766 }; 1767} 1768uint64_t *v3d_compile(const struct v3d_compiler *compiler, 1769 struct v3d_key *key, 1770 struct v3d_prog_data **out_prog_data, 1771 nir_shader *s, 1772 void (*debug_output)(const char *msg, 1773 void *debug_output_data), 1774 void *debug_output_data, 1775 int program_id, int variant_id, 1776 uint32_t *final_assembly_size) 1777{ 1778 struct v3d_compile *c = NULL; 1779 1780 uint32_t best_spill_fill_count = UINT32_MAX; 1781 struct v3d_compile *best_c = NULL; 1782 for (int32_t strat = 0; strat < ARRAY_SIZE(strategies); strat++) { 1783 /* Fallback strategy */ 1784 if (strat > 0) { 1785 assert(c); 1786 if (skip_compile_strategy(c, strat)) 1787 continue; 1788 1789 char *debug_msg; 1790 int ret = asprintf(&debug_msg, 1791 "Falling back to strategy '%s' " 1792 "for %s prog %d/%d", 1793 strategies[strat].name, 1794 vir_get_stage_name(c), 1795 c->program_id, c->variant_id); 1796 1797 if (ret >= 0) { 1798 if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF)) 1799 fprintf(stderr, "%s\n", debug_msg); 1800 1801 c->debug_output(debug_msg, c->debug_output_data); 1802 free(debug_msg); 1803 } 1804 1805 if (c != best_c) 1806 vir_compile_destroy(c); 1807 } 1808 1809 c = vir_compile_init(compiler, key, s, 1810 debug_output, debug_output_data, 1811 program_id, variant_id, 1812 strat, 1813 strategies[strat].max_threads, 1814 strategies[strat].min_threads, 1815 strategies[strat].max_tmu_spills, 1816 strategies[strat].disable_general_tmu_sched, 1817 strategies[strat].disable_loop_unrolling, 1818 strategies[strat].disable_ubo_load_sorting, 1819 strategies[strat].disable_tmu_pipelining, 1820 strat == ARRAY_SIZE(strategies) - 1); 1821 1822 v3d_attempt_compile(c); 1823 1824 /* Broken shader or driver bug */ 1825 if (c->compilation_result == V3D_COMPILATION_FAILED) 1826 break; 1827 1828 /* If we compiled without spills, choose this. 1829 * Otherwise if this is a 4-thread compile, choose this (these 1830 * have a very low cap on the allowed TMU spills so we assume 1831 * it will be better than a 2-thread compile without spills). 1832 * Otherwise, keep going while tracking the strategy with the 1833 * lowest spill count. 1834 */ 1835 if (c->compilation_result == V3D_COMPILATION_SUCCEEDED) { 1836 if (c->spills == 0 || 1837 strategies[strat].min_threads == 4) { 1838 best_c = c; 1839 break; 1840 } else if (c->spills + c->fills < 1841 best_spill_fill_count) { 1842 best_c = c; 1843 best_spill_fill_count = c->spills + c->fills; 1844 } 1845 1846 if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF)) { 1847 char *debug_msg; 1848 int ret = asprintf(&debug_msg, 1849 "Compiled %s prog %d/%d with %d " 1850 "spills and %d fills. 
Will try " 1851 "more strategies.", 1852 vir_get_stage_name(c), 1853 c->program_id, c->variant_id, 1854 c->spills, c->fills); 1855 if (ret >= 0) { 1856 fprintf(stderr, "%s\n", debug_msg); 1857 c->debug_output(debug_msg, c->debug_output_data); 1858 free(debug_msg); 1859 } 1860 } 1861 } 1862 1863 /* Only try next streategy if we failed to register allocate 1864 * or we had to spill. 1865 */ 1866 assert(c->compilation_result == 1867 V3D_COMPILATION_FAILED_REGISTER_ALLOCATION || 1868 c->spills > 0); 1869 } 1870 1871 /* If the best strategy was not the last, choose that */ 1872 if (best_c && c != best_c) { 1873 vir_compile_destroy(c); 1874 c = best_c; 1875 } 1876 1877 if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF) && 1878 c->compilation_result != 1879 V3D_COMPILATION_FAILED_REGISTER_ALLOCATION && 1880 c->spills > 0) { 1881 char *debug_msg; 1882 int ret = asprintf(&debug_msg, 1883 "Compiled %s prog %d/%d with %d " 1884 "spills and %d fills", 1885 vir_get_stage_name(c), 1886 c->program_id, c->variant_id, 1887 c->spills, c->fills); 1888 fprintf(stderr, "%s\n", debug_msg); 1889 1890 if (ret >= 0) { 1891 c->debug_output(debug_msg, c->debug_output_data); 1892 free(debug_msg); 1893 } 1894 } 1895 1896 if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) { 1897 fprintf(stderr, "Failed to compile %s prog %d/%d " 1898 "with any strategy.\n", 1899 vir_get_stage_name(c), c->program_id, c->variant_id); 1900 } 1901 1902 struct v3d_prog_data *prog_data; 1903 1904 prog_data = rzalloc_size(NULL, v3d_prog_data_size(c->s->info.stage)); 1905 1906 v3d_set_prog_data(c, prog_data); 1907 1908 *out_prog_data = prog_data; 1909 1910 char *shaderdb; 1911 int ret = v3d_shaderdb_dump(c, &shaderdb); 1912 if (ret >= 0) { 1913 if (V3D_DEBUG & V3D_DEBUG_SHADERDB) 1914 fprintf(stderr, "SHADER-DB-%s - %s\n", s->info.name, shaderdb); 1915 1916 c->debug_output(shaderdb, c->debug_output_data); 1917 free(shaderdb); 1918 } 1919 1920 return v3d_return_qpu_insts(c, final_assembly_size); 1921} 1922 1923void 1924vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst) 1925{ 1926 if (qinst->dst.file == QFILE_TEMP) 1927 c->defs[qinst->dst.index] = NULL; 1928 1929 assert(&qinst->link != c->cursor.link); 1930 1931 list_del(&qinst->link); 1932 free(qinst); 1933 1934 c->live_intervals_valid = false; 1935} 1936 1937struct qreg 1938vir_follow_movs(struct v3d_compile *c, struct qreg reg) 1939{ 1940 /* XXX 1941 int pack = reg.pack; 1942 1943 while (reg.file == QFILE_TEMP && 1944 c->defs[reg.index] && 1945 (c->defs[reg.index]->op == QOP_MOV || 1946 c->defs[reg.index]->op == QOP_FMOV) && 1947 !c->defs[reg.index]->dst.pack && 1948 !c->defs[reg.index]->src[0].pack) { 1949 reg = c->defs[reg.index]->src[0]; 1950 } 1951 1952 reg.pack = pack; 1953 */ 1954 return reg; 1955} 1956 1957void 1958vir_compile_destroy(struct v3d_compile *c) 1959{ 1960 /* Defuse the assert that we aren't removing the cursor's instruction. 
void
vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst)
{
        if (qinst->dst.file == QFILE_TEMP)
                c->defs[qinst->dst.index] = NULL;

        assert(&qinst->link != c->cursor.link);

        list_del(&qinst->link);
        free(qinst);

        c->live_intervals_valid = false;
}

struct qreg
vir_follow_movs(struct v3d_compile *c, struct qreg reg)
{
        /* XXX
        int pack = reg.pack;

        while (reg.file == QFILE_TEMP &&
               c->defs[reg.index] &&
               (c->defs[reg.index]->op == QOP_MOV ||
                c->defs[reg.index]->op == QOP_FMOV) &&
               !c->defs[reg.index]->dst.pack &&
               !c->defs[reg.index]->src[0].pack) {
                reg = c->defs[reg.index]->src[0];
        }

        reg.pack = pack;
        */
        return reg;
}

void
vir_compile_destroy(struct v3d_compile *c)
{
        /* Defuse the assert that we aren't removing the cursor's
         * instruction.
         */
        c->cursor.link = NULL;

        vir_for_each_block(block, c) {
                while (!list_is_empty(&block->instructions)) {
                        struct qinst *qinst =
                                list_first_entry(&block->instructions,
                                                 struct qinst, link);
                        vir_remove_instruction(c, qinst);
                }
        }

        ralloc_free(c);
}

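/* Returns the index in the uniform stream for the given (contents, data)
 * pair, reusing an existing entry if one matches and otherwise appending a
 * new one, growing the uniform arrays as needed.
 */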
uint32_t
vir_get_uniform_index(struct v3d_compile *c,
                      enum quniform_contents contents,
                      uint32_t data)
{
        for (int i = 0; i < c->num_uniforms; i++) {
                if (c->uniform_contents[i] == contents &&
                    c->uniform_data[i] == data) {
                        return i;
                }
        }

        uint32_t uniform = c->num_uniforms++;

        if (uniform >= c->uniform_array_size) {
                c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
                                             c->uniform_array_size * 2);

                c->uniform_data = reralloc(c, c->uniform_data,
                                           uint32_t,
                                           c->uniform_array_size);
                c->uniform_contents = reralloc(c, c->uniform_contents,
                                               enum quniform_contents,
                                               c->uniform_array_size);
        }

        c->uniform_contents[uniform] = contents;
        c->uniform_data[uniform] = data;

        return uniform;
}

/* Looks back into the current block to find the ldunif that wrote the uniform
 * at the requested index. If it finds it, it returns true and writes the
 * destination register of the ldunif instruction to 'unif'.
 *
 * This can increase register pressure and end up producing worse code, so we
 * limit the number of instructions we are willing to look back through to
 * strike a good balance.
 */
static bool
try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
{
        uint32_t count = 20;
        struct qinst *prev_inst = NULL;
        assert(c->cur_block);

#ifdef DEBUG
        /* We can only reuse a uniform if it was emitted in the same block,
         * so callers must make sure the current instruction is being emitted
         * in the current block.
         */
        bool found = false;
        vir_for_each_inst(inst, c->cur_block) {
                if (&inst->link == c->cursor.link) {
                        found = true;
                        break;
                }
        }

        assert(found || &c->cur_block->instructions == c->cursor.link);
#endif

        list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev,
                                     &c->cur_block->instructions, link) {
                if ((inst->qpu.sig.ldunif || inst->qpu.sig.ldunifrf) &&
                    inst->uniform == index) {
                        prev_inst = inst;
                        break;
                }

                if (--count == 0)
                        break;
        }

        if (!prev_inst)
                return false;

        /* Only reuse the ldunif result if its destination has not been
         * overwritten since.
         */
        list_for_each_entry_from(struct qinst, inst, prev_inst->link.next,
                                 &c->cur_block->instructions, link) {
                if (inst->dst.file == prev_inst->dst.file &&
                    inst->dst.index == prev_inst->dst.index) {
                        return false;
                }
        }

        *unif = prev_inst->dst;
        return true;
}

struct qreg
vir_uniform(struct v3d_compile *c,
            enum quniform_contents contents,
            uint32_t data)
{
        const int num_uniforms = c->num_uniforms;
        const int index = vir_get_uniform_index(c, contents, data);

        /* If this is not the first time we see this uniform, try to reuse
         * the result of the last ldunif that loaded it.
         */
        const bool is_new_uniform = num_uniforms != c->num_uniforms;
        if (!is_new_uniform && !c->disable_ldunif_opt) {
                struct qreg ldunif_dst;
                if (try_opt_ldunif(c, index, &ldunif_dst))
                        return ldunif_dst;
        }

        struct qinst *inst = vir_NOP(c);
        inst->qpu.sig.ldunif = true;
        inst->uniform = index;
        inst->dst = vir_get_temp(c);
        c->defs[inst->dst.index] = inst;
        return inst->dst;
}

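/* Runs an optimization pass on the compile context and records whether it
 * made progress, printing per-pass debug output when enabled.
 */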
#define OPTPASS(func)                                                  \
        do {                                                           \
                bool stage_progress = func(c);                         \
                if (stage_progress) {                                  \
                        progress = true;                               \
                        if (print_opt_debug) {                         \
                                fprintf(stderr,                        \
                                        "VIR opt pass %2d: %s progress\n", \
                                        pass, #func);                  \
                        }                                              \
                        /*XXX vir_validate(c);*/                       \
                }                                                      \
        } while (0)

void
vir_optimize(struct v3d_compile *c)
{
        bool print_opt_debug = false;
        int pass = 1;

        while (true) {
                bool progress = false;

                OPTPASS(vir_opt_copy_propagate);
                OPTPASS(vir_opt_redundant_flags);
                OPTPASS(vir_opt_dead_code);
                OPTPASS(vir_opt_small_immediates);
                OPTPASS(vir_opt_constant_alu);

                if (!progress)
                        break;

                pass++;
        }
}

const char *
vir_get_stage_name(struct v3d_compile *c)
{
        if (c->vs_key && c->vs_key->is_coord)
                return "MESA_SHADER_VERTEX_BIN";
        else if (c->gs_key && c->gs_key->is_coord)
                return "MESA_SHADER_GEOMETRY_BIN";
        else
                return gl_shader_stage_name(c->s->info.stage);
}

static inline uint32_t
compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo)
{
        assert(devinfo->vpm_size > 0);
        const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
        return devinfo->vpm_size / sector_size;
}

/* Computes various parameters affecting VPM memory configuration for programs
 * involving geometry shaders to ensure the program fits in memory and honors
 * the requirements described in the "VPM usage" section of the programming
 * manual.
 */
static bool
compute_vpm_config_gs(struct v3d_device_info *devinfo,
                      struct v3d_vs_prog_data *vs,
                      struct v3d_gs_prog_data *gs,
                      struct vpm_config *vpm_cfg_out)
{
        const uint32_t A = vs->separate_segments ? 1 : 0;
        const uint32_t Ad = vs->vpm_input_size;
        const uint32_t Vd = vs->vpm_output_size;

        const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo);

        /* Try to fit the program into our VPM memory budget by adjusting
         * configurable parameters iteratively. We do this in two phases:
         * the first phase tries to fit the program into the total available
         * VPM memory. If we succeed at that, then the second phase attempts
         * to fit the program into half of that budget so we can run bin and
         * render programs in parallel.
         */
        struct vpm_config vpm_cfg[2];
        struct vpm_config *final_vpm_cfg = NULL;
        uint32_t phase = 0;

        vpm_cfg[phase].As = 1;
        vpm_cfg[phase].Gs = 1;
        vpm_cfg[phase].Gd = gs->vpm_output_size;
        vpm_cfg[phase].gs_width = gs->simd_width;

        /* While there is a requirement that Vc >= [Vn / 16], this is
         * always the case when tessellation is not present because in that
         * case Vn can only be 6 at most (when the input primitive is
         * triangles with adjacency).
         *
         * We always choose Vc=2. We can't go lower than this due to GFXH-1744,
         * and Broadcom has not found it worth it to increase it beyond this
         * in general. Increasing Vc also increases VPM memory pressure, which
         * can turn out to be detrimental to performance in some scenarios.
         */
        vpm_cfg[phase].Vc = 2;

        /* Gv is a constraint on the hardware to not exceed the
         * specified number of vertex segments per GS batch. If adding a
         * new primitive to a GS batch would result in a range of more
         * than Gv vertex segments being referenced by the batch, then
         * the hardware will flush the batch and start a new one. This
         * means that we can choose any value we want, we just need to
         * be aware that larger values improve GS batch utilization
         * at the expense of more VPM memory pressure (which can affect
         * other performance aspects, such as GS dispatch width).
         * We start with the largest value, and will reduce it if we
         * find that total memory pressure is too high.
         */
        vpm_cfg[phase].Gv = 3;
        do {
                /* When a GS is present in the absence of TES, we need to
                 * satisfy Ve >= Gv. We go with the smallest value of Ve to
                 * avoid increasing memory pressure.
                 */
                vpm_cfg[phase].Ve = vpm_cfg[phase].Gv;

                uint32_t vpm_sectors =
                        A * vpm_cfg[phase].As * Ad +
                        (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd +
                        vpm_cfg[phase].Gs * vpm_cfg[phase].Gd;

                /* Ideally we want to use no more than half of the available
                 * memory so we can execute a bin and render program in
                 * parallel without stalls. If we achieved that then we are
                 * done.
                 */
                if (vpm_sectors <= vpm_size / 2) {
                        final_vpm_cfg = &vpm_cfg[phase];
                        break;
                }

                /* At the very least, we should not allocate more than the
                 * total available VPM memory. If we have a configuration that
                 * succeeds at this we save it and continue to see if we can
                 * meet the half-memory-use criteria too.
                 */
                if (phase == 0 && vpm_sectors <= vpm_size) {
                        vpm_cfg[1] = vpm_cfg[0];
                        phase = 1;
                }

                /* Try lowering Gv */
                if (vpm_cfg[phase].Gv > 0) {
                        vpm_cfg[phase].Gv--;
                        continue;
                }

                /* Try lowering GS dispatch width */
                if (vpm_cfg[phase].gs_width > 1) {
                        do {
                                vpm_cfg[phase].gs_width >>= 1;
                                vpm_cfg[phase].Gd = align(vpm_cfg[phase].Gd, 2) / 2;
                        } while (vpm_cfg[phase].gs_width == 2);

                        /* Reset Gv to max after dropping dispatch width */
                        vpm_cfg[phase].Gv = 3;
                        continue;
                }

                /* We ran out of options to reduce memory pressure. If we
                 * are at phase 1 we have at least a valid configuration, so
                 * we use that.
                 */
                if (phase == 1)
                        final_vpm_cfg = &vpm_cfg[0];
                break;
        } while (true);

        if (!final_vpm_cfg)
                return false;

        assert(final_vpm_cfg);
        assert(final_vpm_cfg->Gd <= 16);
        assert(final_vpm_cfg->Gv < 4);
        assert(final_vpm_cfg->Ve < 4);
        assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4);
        assert(final_vpm_cfg->gs_width == 1 ||
               final_vpm_cfg->gs_width == 4 ||
               final_vpm_cfg->gs_width == 8 ||
               final_vpm_cfg->gs_width == 16);

        *vpm_cfg_out = *final_vpm_cfg;
        return true;
}

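/* Computes the VPM configurations for the bin and render programs. When no
 * geometry shader is present the fixed VS-only configuration is used;
 * otherwise the configuration for each program is derived with
 * compute_vpm_config_gs(). Returns false if no valid configuration fits in
 * VPM memory.
 */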
bool
v3d_compute_vpm_config(struct v3d_device_info *devinfo,
                       struct v3d_vs_prog_data *vs_bin,
                       struct v3d_vs_prog_data *vs,
                       struct v3d_gs_prog_data *gs_bin,
                       struct v3d_gs_prog_data *gs,
                       struct vpm_config *vpm_cfg_bin,
                       struct vpm_config *vpm_cfg)
{
        assert(vs && vs_bin);
        assert((gs != NULL) == (gs_bin != NULL));

        if (!gs) {
                vpm_cfg_bin->As = 1;
                vpm_cfg_bin->Ve = 0;
                vpm_cfg_bin->Vc = vs_bin->vcm_cache_size;

                vpm_cfg->As = 1;
                vpm_cfg->Ve = 0;
                vpm_cfg->Vc = vs->vcm_cache_size;
        } else {
                if (!compute_vpm_config_gs(devinfo, vs_bin, gs_bin, vpm_cfg_bin))
                        return false;

                if (!compute_vpm_config_gs(devinfo, vs, gs, vpm_cfg))
                        return false;
        }

        return true;
}