/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/ralloc.h"
#include "util/register_allocate.h"
#include "common/v3d_device_info.h"
#include "v3d_compiler.h"

#define ACC_INDEX  0
#define ACC_COUNT  6
#define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
#define PHYS_COUNT 64

#define CLASS_BITS_PHYS (1 << 0)
#define CLASS_BITS_ACC  (1 << 1)
#define CLASS_BITS_R5   (1 << 4)
#define CLASS_BITS_ANY  (CLASS_BITS_PHYS | \
                         CLASS_BITS_ACC | \
                         CLASS_BITS_R5)

static inline uint32_t
temp_to_node(uint32_t temp)
{
        return temp + ACC_COUNT;
}

static inline uint32_t
node_to_temp(uint32_t node)
{
        assert(node >= ACC_COUNT);
        return node - ACC_COUNT;
}
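
/* Nodes 0..ACC_COUNT-1 in the interference graph are fixed nodes for the
 * accumulators r0..r5 (see v3d_register_allocate() below), so temp T maps to
 * node T + ACC_COUNT. The helpers above convert between the two numbering
 * schemes.
 */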

static inline uint8_t
get_temp_class_bits(struct v3d_ra_node_info *nodes,
                    uint32_t temp)
{
        return nodes->info[temp_to_node(temp)].class_bits;
}

static inline void
set_temp_class_bits(struct v3d_ra_node_info *nodes,
                    uint32_t temp, uint8_t class_bits)
{
        nodes->info[temp_to_node(temp)].class_bits = class_bits;
}

static struct ra_class *
choose_reg_class(struct v3d_compile *c, uint8_t class_bits)
{
        if (class_bits == CLASS_BITS_PHYS) {
                return c->compiler->reg_class_phys[c->thread_index];
        } else if (class_bits == CLASS_BITS_R5) {
                return c->compiler->reg_class_r5[c->thread_index];
        } else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) {
                return c->compiler->reg_class_phys_or_acc[c->thread_index];
        } else {
                assert(class_bits == CLASS_BITS_ANY);
                return c->compiler->reg_class_any[c->thread_index];
        }
}

static inline struct ra_class *
choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp)
{
        assert(temp < c->num_temps && temp < c->nodes.alloc_count);
        return choose_reg_class(c, get_temp_class_bits(&c->nodes, temp));
}

static inline bool
qinst_writes_tmu(const struct v3d_device_info *devinfo,
                 struct qinst *inst)
{
        return (inst->dst.file == QFILE_MAGIC &&
                v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index)) ||
                inst->qpu.sig.wrtmuc;
}

static bool
is_end_of_tmu_sequence(const struct v3d_device_info *devinfo,
                       struct qinst *inst, struct qblock *block)
{
        /* Only tmuwt and ldtmu can finish TMU sequences */
        bool is_tmuwt = inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                        inst->qpu.alu.add.op == V3D_QPU_A_TMUWT;
        bool is_ldtmu = inst->qpu.sig.ldtmu;
        if (!is_tmuwt && !is_ldtmu)
                return false;

        /* Check if this is the last tmuwt or ldtmu in the sequence */
        list_for_each_entry_from(struct qinst, scan_inst, inst->link.next,
                                 &block->instructions, link) {
                is_tmuwt = scan_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                           scan_inst->qpu.alu.add.op == V3D_QPU_A_TMUWT;
                is_ldtmu = scan_inst->qpu.sig.ldtmu;

                if (is_tmuwt || is_ldtmu)
                        return false;

                if (qinst_writes_tmu(devinfo, scan_inst))
                        return true;
        }

        return true;
}

static bool
vir_is_mov_uniform(struct v3d_compile *c, int temp)
{
        struct qinst *def = c->defs[temp];

        return def && def->qpu.sig.ldunif;
}

static bool
can_reconstruct_inst(struct qinst *inst)
{
        assert(inst);

        if (vir_is_add(inst)) {
                switch (inst->qpu.alu.add.op) {
                case V3D_QPU_A_FXCD:
                case V3D_QPU_A_FYCD:
                case V3D_QPU_A_XCD:
                case V3D_QPU_A_YCD:
                case V3D_QPU_A_IID:
                case V3D_QPU_A_EIDX:
                case V3D_QPU_A_TIDX:
                case V3D_QPU_A_SAMPID:
                        /* No need to check input unpacks because none of these
                         * opcodes read sources. FXCD, FYCD have pack variants.
                         */
                        return inst->qpu.flags.ac == V3D_QPU_COND_NONE &&
                               inst->qpu.flags.auf == V3D_QPU_UF_NONE &&
                               inst->qpu.flags.apf == V3D_QPU_PF_NONE &&
                               inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE;
                default:
                        return false;
                }
        }

        return false;
}

static bool
can_reconstruct_temp(struct v3d_compile *c, int temp)
{
        struct qinst *def = c->defs[temp];
        return def && can_reconstruct_inst(def);
}

static struct qreg
reconstruct_temp(struct v3d_compile *c, enum v3d_qpu_add_op op)
{
        struct qreg dest;
        switch (op) {
        case V3D_QPU_A_FXCD:
                dest = vir_FXCD(c);
                break;
        case V3D_QPU_A_FYCD:
                dest = vir_FYCD(c);
                break;
        case V3D_QPU_A_XCD:
                dest = vir_XCD(c);
                break;
        case V3D_QPU_A_YCD:
                dest = vir_YCD(c);
                break;
        case V3D_QPU_A_IID:
                dest = vir_IID(c);
                break;
        case V3D_QPU_A_EIDX:
                dest = vir_EIDX(c);
                break;
        case V3D_QPU_A_TIDX:
                dest = vir_TIDX(c);
                break;
        case V3D_QPU_A_SAMPID:
                dest = vir_SAMPID(c);
                break;
        default:
                unreachable("Unexpected opcode for reconstruction");
        }

        return dest;
}

enum temp_spill_type {
        SPILL_TYPE_UNIFORM,
        SPILL_TYPE_RECONSTRUCT,
        SPILL_TYPE_TMU
};

static enum temp_spill_type
get_spill_type_for_temp(struct v3d_compile *c, int temp)
{
        if (vir_is_mov_uniform(c, temp))
                return SPILL_TYPE_UNIFORM;

        if (can_reconstruct_temp(c, temp))
                return SPILL_TYPE_RECONSTRUCT;

        return SPILL_TYPE_TMU;
}

static int
v3d_choose_spill_node(struct v3d_compile *c)
{
        const float tmu_scale = 10;
        float block_scale = 1.0;
        float spill_costs[c->num_temps];
        bool in_tmu_operation = false;
        bool started_last_seg = false;

        for (unsigned i = 0; i < c->num_temps; i++)
                spill_costs[i] = 0.0;

        /* XXX: Scale the cost up when inside of a loop. */
        vir_for_each_block(block, c) {
                vir_for_each_inst(inst, block) {
                        /* We can't insert new thread switches after
                         * starting output writes.
                         */
                        bool no_spilling =
                                (c->threads > 1 && started_last_seg) ||
                                (c->max_tmu_spills == 0);

                        /* Discourage spilling of TMU operations */
                        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                                if (inst->src[i].file != QFILE_TEMP)
                                        continue;

                                int temp = inst->src[i].index;
                                enum temp_spill_type spill_type =
                                        get_spill_type_for_temp(c, temp);

                                if (spill_type != SPILL_TYPE_TMU) {
                                        spill_costs[temp] += block_scale;
                                } else if (!no_spilling) {
                                        float tmu_op_scale = in_tmu_operation ?
                                                3.0 : 1.0;
                                        spill_costs[temp] += (block_scale *
                                                              tmu_scale *
                                                              tmu_op_scale);
                                } else {
                                        BITSET_CLEAR(c->spillable, temp);
                                }
                        }

                        if (inst->dst.file == QFILE_TEMP) {
                                int temp = inst->dst.index;
                                enum temp_spill_type spill_type =
                                        get_spill_type_for_temp(c, temp);

                                if (spill_type != SPILL_TYPE_TMU) {
                                        /* We just rematerialize it later */
                                } else if (!no_spilling) {
                                        spill_costs[temp] += (block_scale *
                                                              tmu_scale);
                                } else {
                                        BITSET_CLEAR(c->spillable, temp);
                                }
                        }

                        /* Refuse to spill a ldvary's dst, because that means
                         * that ldvary's r5 would end up being used across a
                         * thrsw.
                         */
                        if (inst->qpu.sig.ldvary) {
                                assert(inst->dst.file == QFILE_TEMP);
                                BITSET_CLEAR(c->spillable, inst->dst.index);
                        }

                        if (inst->is_last_thrsw)
                                started_last_seg = true;

                        if (v3d_qpu_writes_vpm(&inst->qpu) ||
                            v3d_qpu_uses_tlb(&inst->qpu))
                                started_last_seg = true;

                        /* Track when we're in between a TMU setup and the
                         * final LDTMU or TMUWT from that TMU setup. We
                         * penalize spills during that time.
                         */
                        if (is_end_of_tmu_sequence(c->devinfo, inst, block))
                                in_tmu_operation = false;

                        if (qinst_writes_tmu(c->devinfo, inst))
                                in_tmu_operation = true;
                }
        }

        for (unsigned i = 0; i < c->num_temps; i++) {
                if (BITSET_TEST(c->spillable, i)) {
                        ra_set_node_spill_cost(c->g, temp_to_node(i),
                                               spill_costs[i]);
                }
        }

        return ra_get_best_spill_node(c->g);
}

static void
ensure_nodes(struct v3d_compile *c)
{
        if (c->num_temps < c->nodes.alloc_count)
                return;

        c->nodes.alloc_count *= 2;
        c->nodes.info = reralloc_array_size(c,
                                            c->nodes.info,
                                            sizeof(c->nodes.info[0]),
                                            c->nodes.alloc_count + ACC_COUNT);
}

/* Creates the interference node for a new temp. We use this to keep the node
 * list updated during the spilling process, which generates new temps/nodes.
 */
static void
add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
{
        ensure_nodes(c);

        int node = ra_add_node(c->g, choose_reg_class(c, class_bits));
        assert(node == temp + ACC_COUNT);

        /* We fill the node priority after we are done inserting spills */
        c->nodes.info[node].class_bits = class_bits;
        c->nodes.info[node].priority = 0;
}

/* The spill offset for this thread takes a bit of setup, so do it once at
 * program start.
 */
void
v3d_setup_spill_base(struct v3d_compile *c)
{
        /* Setting up the spill base is done in the entry block; so change
         * both the current block to emit and the cursor.
         */
        struct qblock *current_block = c->cur_block;
        c->cur_block = vir_entry_block(c);
        c->cursor = vir_before_block(c->cur_block);

        int start_num_temps = c->num_temps;

        /* Each thread wants to be in a separate region of the scratch space
         * so that the QPUs aren't fighting over cache lines. We have the
         * driver keep a single global spill BO rather than
         * per-spilling-program BOs, so we need a uniform from the driver for
         * what the per-thread scale is.
         */
        struct qreg thread_offset =
                vir_UMUL(c,
                         vir_TIDX(c),
                         vir_uniform(c, QUNIFORM_SPILL_SIZE_PER_THREAD, 0));

        /* Each channel in a reg is 4 bytes, so scale them up by that. */
        struct qreg element_offset = vir_SHL(c, vir_EIDX(c),
                                             vir_uniform_ui(c, 2));

        c->spill_base = vir_ADD(c,
                                vir_ADD(c, thread_offset, element_offset),
                                vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));

        /* Make sure that we don't spill the spilling setup instructions. */
        for (int i = start_num_temps; i < c->num_temps; i++) {
                BITSET_CLEAR(c->spillable, i);

                /* If we are spilling, update the RA map with the temps added
                 * by the spill setup. Our spill_base register can never be an
                 * accumulator because it is used for TMU spill/fill and thus
                 * needs to persist across thread switches.
                 */
                if (c->spilling) {
                        int temp_class = CLASS_BITS_PHYS;
                        if (i != c->spill_base.index)
                                temp_class |= CLASS_BITS_ACC;
                        add_node(c, i, temp_class);
                }
        }

        /* Restore the current block. */
        c->cur_block = current_block;
        c->cursor = vir_after_block(c->cur_block);
}
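
/* Putting the pieces together, the address a spill/fill of a temp ends up
 * using is roughly:
 *
 *   addr = spill_offset_uniform          (QUNIFORM_SPILL_OFFSET)
 *        + tidx * spill_size_per_thread  (QUNIFORM_SPILL_SIZE_PER_THREAD)
 *        + eidx * 4
 *        + per-temp spill offset         (added in v3d_emit_spill_tmua() below)
 */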

/**
 * Computes the address for a spill/fill sequence and completes the spill/fill
 * sequence by emitting the following code:
 *
 *   ldunif.spill_offset
 *   add tmua spill_base spill_offset
 *   thrsw
 *
 * If the sequence is for a spill, then it will emit a tmuwt after the thrsw,
 * otherwise it will emit an ldtmu to load the fill result into 'fill_dst'.
 *
 * The parameter 'ip' represents the ip at which the spill/fill is happening.
 * This is used to disallow accumulators on temps that cross this ip boundary
 * due to the new thrsw introduced in the sequence above.
 */
static void
v3d_emit_spill_tmua(struct v3d_compile *c,
                    uint32_t spill_offset,
                    enum v3d_qpu_cond cond,
                    int32_t ip,
                    struct qreg *fill_dst)
{
        assert(ip >= 0);

        /* Load a uniform with the spill offset and add it to the spill base
         * to obtain the TMUA address. It can be of class ANY because we know
         * we are consuming it immediately without thrsw in between.
         */
        assert(c->disable_ldunif_opt);
        struct qreg offset = vir_uniform_ui(c, spill_offset);
        add_node(c, offset.index, CLASS_BITS_ANY);

        /* We always enable per-quad on spills/fills to ensure we spill
         * any channels involved with helper invocations.
         */
        struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
        struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset);
        inst->qpu.flags.ac = cond;
        inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
                                              0xffffff7f); /* per-quad */

        vir_emit_thrsw(c);

        /* If this is for a spill, emit a TMUWT, otherwise an LDTMU to load the
         * result of the fill. The TMUWT temp is not really read, and the ldtmu
         * temp will be used immediately, so just like the uniform above we
         * can allow accumulators.
         */
        if (!fill_dst) {
                struct qreg dst = vir_TMUWT(c);
                assert(dst.file == QFILE_TEMP);
                add_node(c, dst.index, CLASS_BITS_PHYS | CLASS_BITS_ACC);
        } else {
                *fill_dst = vir_LDTMU(c);
                assert(fill_dst->file == QFILE_TEMP);
                add_node(c, fill_dst->index, CLASS_BITS_PHYS | CLASS_BITS_ACC);
        }

        /* Temps across the thread switch we injected can't be assigned to
         * accumulators.
         *
         * Fills inject code before ip, so anything that starts at ip or later
         * is not affected by the thrsw. Something that ends at ip will be
         * affected though.
         *
         * Spills inject code after ip, so anything that starts strictly later
         * than ip is not affected (the temp starting at ip is usually the
         * spilled temp except for postponed spills). Something that ends at ip
         * won't be affected either.
         */
        for (int i = 0; i < c->spill_start_num_temps; i++) {
                bool thrsw_cross = fill_dst ?
                        c->temp_start[i] < ip && c->temp_end[i] >= ip :
                        c->temp_start[i] <= ip && c->temp_end[i] > ip;
                if (thrsw_cross) {
                        ra_set_node_class(c->g, temp_to_node(i),
                                          choose_reg_class(c, CLASS_BITS_PHYS));
                }
        }
}

static void
v3d_emit_tmu_spill(struct v3d_compile *c,
                   struct qinst *inst,
                   struct qreg spill_temp,
                   struct qinst *position,
                   uint32_t ip,
                   uint32_t spill_offset)
{
        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
        assert(inst->dst.file == QFILE_TEMP);

        c->cursor = vir_after_inst(position);

        enum v3d_qpu_cond cond = vir_get_cond(inst);

        /* If inst and position don't match, this is a postponed spill,
         * in which case we have already allocated the temp for the spill
         * and we should use that, otherwise create a new temp with the
         * same register class bits as the original.
         */
        if (inst == position) {
                uint8_t class_bits = get_temp_class_bits(&c->nodes,
                                                         inst->dst.index);
                inst->dst = vir_get_temp(c);
                add_node(c, inst->dst.index, class_bits);
        } else {
                inst->dst = spill_temp;

                /* If this is a postponed spill the register being spilled may
                 * have been written more than once including conditional
                 * writes, so ignore predication on the spill instruction and
                 * always spill the full register.
                 */
                cond = V3D_QPU_COND_NONE;
        }

        struct qinst *tmp =
                vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
                             inst->dst);
        tmp->qpu.flags.mc = cond;

        v3d_emit_spill_tmua(c, spill_offset, cond, ip, NULL);

        c->spills++;
        c->tmu_dirty_rcl = true;
}
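
/* interferes() below treats liveness ranges as half-open intervals
 * [start, end): two temps interfere only when those ranges overlap.
 */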
static inline bool
interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end)
{
        return !(t0_start >= t1_end || t1_start >= t0_end);
}

static void
v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
{
        c->spill_start_num_temps = c->num_temps;
        c->spilling = true;

        enum temp_spill_type spill_type = get_spill_type_for_temp(c, spill_temp);

        uint32_t spill_offset = 0;
        if (spill_type == SPILL_TYPE_TMU) {
                spill_offset = c->spill_size;
                c->spill_size += V3D_CHANNELS * sizeof(uint32_t);

                if (spill_offset == 0)
                        v3d_setup_spill_base(c);
        }

        struct qinst *last_thrsw = c->last_thrsw;
        assert(last_thrsw && last_thrsw->is_last_thrsw);

        int uniform_index = ~0;
        if (spill_type == SPILL_TYPE_UNIFORM) {
                struct qinst *orig_unif = c->defs[spill_temp];
                uniform_index = orig_unif->uniform;
        }

        enum v3d_qpu_add_op reconstruct_op = V3D_QPU_A_NOP;
        if (spill_type == SPILL_TYPE_RECONSTRUCT) {
                struct qinst *orig_def = c->defs[spill_temp];
                assert(vir_is_add(orig_def));
                reconstruct_op = orig_def->qpu.alu.add.op;
        }

        uint32_t spill_node = temp_to_node(spill_temp);

        /* We must disable the ldunif optimization if we are spilling uniforms */
        bool had_disable_ldunif_opt = c->disable_ldunif_opt;
        c->disable_ldunif_opt = true;

        struct qinst *start_of_tmu_sequence = NULL;
        struct qinst *postponed_spill = NULL;
        struct qreg postponed_spill_temp = { 0 };
        vir_for_each_block(block, c) {
                vir_for_each_inst_safe(inst, block) {
                        int32_t ip = inst->ip;

                        /* Track when we're in between a TMU setup and the final
                         * LDTMU or TMUWT from that TMU setup. We can't spill/fill
                         * any temps during that time, because that involves
                         * inserting a new TMU setup/LDTMU sequence, so we postpone
                         * the spill or move the fill up to not intrude in the
                         * middle of the TMU sequence.
                         */
                        if (is_end_of_tmu_sequence(c->devinfo, inst, block)) {
                                if (postponed_spill) {
                                        v3d_emit_tmu_spill(c, postponed_spill,
                                                           postponed_spill_temp,
                                                           inst, ip, spill_offset);
                                }

                                start_of_tmu_sequence = NULL;
                                postponed_spill = NULL;
                        }

                        if (!start_of_tmu_sequence &&
                            qinst_writes_tmu(c->devinfo, inst)) {
                                start_of_tmu_sequence = inst;
                        }

                        /* fills */
                        int filled_src = -1;
                        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                                if (inst->src[i].file != QFILE_TEMP ||
                                    inst->src[i].index != spill_temp) {
                                        continue;
                                }

                                if (filled_src >= 0) {
                                        inst->src[i] = inst->src[filled_src];
                                        continue;
                                }

                                c->cursor = vir_before_inst(inst);

                                if (spill_type == SPILL_TYPE_UNIFORM) {
                                        struct qreg unif =
                                                vir_uniform(c,
                                                            c->uniform_contents[uniform_index],
                                                            c->uniform_data[uniform_index]);
                                        inst->src[i] = unif;
                                        /* We are using the uniform in the
                                         * instruction immediately after, so
                                         * we can use any register class for it.
                                         */
                                        add_node(c, unif.index, CLASS_BITS_ANY);
                                } else if (spill_type == SPILL_TYPE_RECONSTRUCT) {
                                        struct qreg temp =
                                                reconstruct_temp(c, reconstruct_op);
                                        inst->src[i] = temp;
                                        /* We are using the temp in the
                                         * instruction immediately after so we
                                         * can use ACC.
                                         */
                                        add_node(c, temp.index, CLASS_BITS_PHYS |
                                                                CLASS_BITS_ACC);
                                } else {
                                        /* If we have a postponed spill, we
                                         * don't need a fill as the temp would
                                         * not have been spilled yet, however,
                                         * we need to update the temp index.
                                         */
                                        if (postponed_spill) {
                                                inst->src[i] =
                                                        postponed_spill_temp;
                                        } else {
                                                int32_t fill_ip = ip;
                                                if (start_of_tmu_sequence) {
                                                        c->cursor = vir_before_inst(start_of_tmu_sequence);
                                                        fill_ip = start_of_tmu_sequence->ip;
                                                }

                                                v3d_emit_spill_tmua(c, spill_offset,
                                                                    V3D_QPU_COND_NONE,
                                                                    fill_ip, &inst->src[i]);
                                                c->fills++;
                                        }
                                }

                                filled_src = i;
                        }

                        /* spills */
                        if (inst->dst.file == QFILE_TEMP &&
                            inst->dst.index == spill_temp) {
                                if (spill_type != SPILL_TYPE_TMU) {
                                        c->cursor.link = NULL;
                                        vir_remove_instruction(c, inst);
                                } else {
                                        /* If we are in the middle of a TMU
                                         * sequence, we postpone the actual
                                         * spill until we have finished it. We
                                         * still need to replace the spill temp
                                         * with a new temp, though.
                                         */
                                        if (start_of_tmu_sequence) {
                                                if (postponed_spill) {
                                                        postponed_spill->dst =
                                                                postponed_spill_temp;
                                                }
                                                if (!postponed_spill ||
                                                    vir_get_cond(inst) == V3D_QPU_COND_NONE) {
                                                        postponed_spill_temp =
                                                                vir_get_temp(c);
                                                        add_node(c,
                                                                 postponed_spill_temp.index,
                                                                 c->nodes.info[spill_node].class_bits);
                                                }
                                                postponed_spill = inst;
                                        } else {
                                                v3d_emit_tmu_spill(c, inst,
                                                                   postponed_spill_temp,
                                                                   inst, ip,
                                                                   spill_offset);
                                        }
                                }
                        }
                }
        }

        /* Make sure c->last_thrsw is the actual last thrsw, not just one we
         * inserted in our most recent unspill.
         */
        c->last_thrsw = last_thrsw;

        /* Don't allow spilling of our spilling instructions. There's no way
         * they can help get things colored.
         */
        for (int i = c->spill_start_num_temps; i < c->num_temps; i++)
                BITSET_CLEAR(c->spillable, i);

        /* Reset interference for spilled node */
        ra_set_node_spill_cost(c->g, spill_node, 0);
        ra_reset_node_interference(c->g, spill_node);
        BITSET_CLEAR(c->spillable, spill_temp);

        /* Rebuild program ips */
        int32_t ip = 0;
        vir_for_each_inst_inorder(inst, c)
                inst->ip = ip++;

        /* Rebuild liveness */
        vir_calculate_live_intervals(c);

        /* Add interferences for the new spilled temps and update interferences
         * for c->spill_base (since we may have modified its liveness). Also,
         * update node priorities based on new liveness data.
         */
        uint32_t sb_temp = c->spill_base.index;
        uint32_t sb_node = temp_to_node(sb_temp);
        for (uint32_t i = 0; i < c->num_temps; i++) {
                if (c->temp_end[i] == -1)
                        continue;

                uint32_t node_i = temp_to_node(i);
                c->nodes.info[node_i].priority =
                        c->temp_end[i] - c->temp_start[i];

                for (uint32_t j = MAX2(i + 1, c->spill_start_num_temps);
                     j < c->num_temps; j++) {
                        if (interferes(c->temp_start[i], c->temp_end[i],
                                       c->temp_start[j], c->temp_end[j])) {
                                uint32_t node_j = temp_to_node(j);
                                ra_add_node_interference(c->g, node_i, node_j);
                        }
                }

                if (spill_type == SPILL_TYPE_TMU) {
                        if (i != sb_temp &&
                            interferes(c->temp_start[i], c->temp_end[i],
                                       c->temp_start[sb_temp], c->temp_end[sb_temp])) {
                                ra_add_node_interference(c->g, node_i, sb_node);
                        }
                }
        }

        c->disable_ldunif_opt = had_disable_ldunif_opt;
        c->spilling = false;
}

struct v3d_ra_select_callback_data {
        uint32_t next_acc;
        uint32_t next_phys;
        struct v3d_ra_node_info *nodes;
};

/* Choosing accumulators improves chances of merging QPU instructions, since
 * merging requires that at most 2 rf registers are used by the add and mul
 * instructions.
 */
static bool
v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
                   BITSET_WORD *regs,
                   int priority)
{
        /* Favor accumulators if we have less than this number of physical
         * registers. Accumulators have more restrictions (like being
         * invalidated through thrsw), so running out of physical registers
         * even if we have accumulators available can lead to register
         * allocation failures.
         */
        static const int available_rf_threshold = 5;
        int available_rf = 0;
        for (int i = 0; i < PHYS_COUNT; i++) {
                if (BITSET_TEST(regs, PHYS_INDEX + i))
                        available_rf++;
                if (available_rf >= available_rf_threshold)
                        break;
        }
        if (available_rf < available_rf_threshold)
                return true;

        /* Favor accumulators for short-lived temps (our priority represents
         * liveness), to prevent long-lived temps from grabbing accumulators
         * and preventing follow-up instructions from using them, potentially
         * leading to large portions of the shader being unable to use
         * accumulators and therefore merge instructions successfully.
         */
        static const int priority_threshold = 20;
        if (priority <= priority_threshold)
                return true;

        return false;
}

static bool
v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
                    BITSET_WORD *regs,
                    unsigned int *out)
{
        /* Choose r5 for our ldunifs if possible (nobody else can load to that
         * reg, and it keeps the QPU cond field free from being occupied by
         * ldunifrf).
         */
        int r5 = ACC_INDEX + 5;
        if (BITSET_TEST(regs, r5)) {
                *out = r5;
                return true;
        }

        /* Round-robin through our accumulators to give post-RA instruction
         * selection more options.
         */
        for (int i = 0; i < ACC_COUNT; i++) {
                int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT;
                int acc = ACC_INDEX + acc_off;

                if (BITSET_TEST(regs, acc)) {
                        v3d_ra->next_acc = acc_off + 1;
                        *out = acc;
                        return true;
                }
        }

        return false;
}

static bool
v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
                 BITSET_WORD *regs,
                 unsigned int *out)
{
        for (int i = 0; i < PHYS_COUNT; i++) {
                int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
                int phys = PHYS_INDEX + phys_off;

                if (BITSET_TEST(regs, phys)) {
                        v3d_ra->next_phys = phys_off + 1;
                        *out = phys;
                        return true;
                }
        }

        return false;
}

static unsigned int
v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
{
        struct v3d_ra_select_callback_data *v3d_ra = data;

        unsigned int reg;
        if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->nodes->info[n].priority) &&
            v3d_ra_select_accum(v3d_ra, regs, &reg)) {
                return reg;
        }

        if (v3d_ra_select_rf(v3d_ra, regs, &reg))
                return reg;

        /* If we ran out of physical registers try to assign an accumulator
         * if we didn't favor that option earlier.
         */
        if (v3d_ra_select_accum(v3d_ra, regs, &reg))
                return reg;

        unreachable("RA must pass us at least one possible reg.");
}

bool
vir_init_reg_sets(struct v3d_compiler *compiler)
{
        /* Allocate up to 3 regfile classes, for the ways the physical
         * register file can be divided up for fragment shader threading.
         */
        int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);

        compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
                                          false);
        if (!compiler->regs)
                return false;

        for (int threads = 0; threads < max_thread_index; threads++) {
                compiler->reg_class_any[threads] =
                        ra_alloc_contig_reg_class(compiler->regs, 1);
                compiler->reg_class_r5[threads] =
                        ra_alloc_contig_reg_class(compiler->regs, 1);
                compiler->reg_class_phys_or_acc[threads] =
                        ra_alloc_contig_reg_class(compiler->regs, 1);
                compiler->reg_class_phys[threads] =
                        ra_alloc_contig_reg_class(compiler->regs, 1);

                for (int i = PHYS_INDEX;
                     i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
                        ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
                        ra_class_add_reg(compiler->reg_class_phys[threads], i);
                        ra_class_add_reg(compiler->reg_class_any[threads], i);
                }

                for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
                        ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
                        ra_class_add_reg(compiler->reg_class_any[threads], i);
                }
                /* r5 can only store a single 32-bit value, so not much can
                 * use it.
                 */
                ra_class_add_reg(compiler->reg_class_r5[threads],
                                 ACC_INDEX + 5);
                ra_class_add_reg(compiler->reg_class_any[threads],
                                 ACC_INDEX + 5);
        }

        ra_set_finalize(compiler->regs, NULL);

        return true;
}

static inline bool
tmu_spilling_allowed(struct v3d_compile *c)
{
        return c->spills + c->fills < c->max_tmu_spills;
}

static void
update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
                                      struct qinst *inst)
{
        int32_t ip = inst->ip;
        assert(ip >= 0);

        /* If the instruction writes r3/r4 (and optionally moves its
         * result to a temp), nothing else can be stored in r3/r4 across
         * it.
         */
        if (vir_writes_r3(c->devinfo, inst)) {
                for (int i = 0; i < c->num_temps; i++) {
                        if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
                                ra_add_node_interference(c->g,
                                                         temp_to_node(i),
                                                         acc_nodes[3]);
                        }
                }
        }

        if (vir_writes_r4(c->devinfo, inst)) {
                for (int i = 0; i < c->num_temps; i++) {
                        if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
                                ra_add_node_interference(c->g,
                                                         temp_to_node(i),
                                                         acc_nodes[4]);
                        }
                }
        }

        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
                switch (inst->qpu.alu.add.op) {
                case V3D_QPU_A_LDVPMV_IN:
                case V3D_QPU_A_LDVPMV_OUT:
                case V3D_QPU_A_LDVPMD_IN:
                case V3D_QPU_A_LDVPMD_OUT:
                case V3D_QPU_A_LDVPMP:
                case V3D_QPU_A_LDVPMG_IN:
                case V3D_QPU_A_LDVPMG_OUT: {
                        /* LDVPMs only store to temps (the MA flag
                         * decides whether the LDVPM is in or out)
                         */
                        assert(inst->dst.file == QFILE_TEMP);
                        set_temp_class_bits(&c->nodes, inst->dst.index,
                                            CLASS_BITS_PHYS);
                        break;
                }

                case V3D_QPU_A_RECIP:
                case V3D_QPU_A_RSQRT:
                case V3D_QPU_A_EXP:
                case V3D_QPU_A_LOG:
                case V3D_QPU_A_SIN:
                case V3D_QPU_A_RSQRT2: {
                        /* The SFU instructions write directly to the
                         * phys regfile.
                         */
                        assert(inst->dst.file == QFILE_TEMP);
                        set_temp_class_bits(&c->nodes, inst->dst.index,
                                            CLASS_BITS_PHYS);
                        break;
                }

                default:
                        break;
                }
        }

        if (inst->src[0].file == QFILE_REG) {
                switch (inst->src[0].index) {
                case 0:
                case 1:
                case 2:
                case 3: {
                        /* Payload setup instructions: Force allocate
                         * the dst to the given register (so the MOV
                         * will disappear).
                         */
                        assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
                        assert(inst->dst.file == QFILE_TEMP);
                        uint32_t node = temp_to_node(inst->dst.index);
                        ra_set_node_reg(c->g, node,
                                        PHYS_INDEX + inst->src[0].index);
                        break;
                }
                }
        }

        if (inst->dst.file == QFILE_TEMP) {
                /* Only a ldunif gets to write to R5, which only has a
                 * single 32-bit channel of storage.
                 *
                 * NOTE: ldunifa is subject to the same restriction; however,
                 * going by shader-db it is best to keep r5 exclusive to
                 * ldunif, probably because ldunif usually has a shorter
                 * lifespan, allowing for more accumulator reuse and QPU
                 * merges.
                 */
                if (!inst->qpu.sig.ldunif) {
                        uint8_t class_bits =
                                get_temp_class_bits(&c->nodes, inst->dst.index) &
                                ~CLASS_BITS_R5;
                        set_temp_class_bits(&c->nodes, inst->dst.index,
                                            class_bits);
                } else {
                        /* Until V3D 4.x, we could only load a uniform
                         * to r5, so we'll need to spill if uniform
                         * loads interfere with each other.
                         */
                        if (c->devinfo->ver < 40) {
                                set_temp_class_bits(&c->nodes, inst->dst.index,
                                                    CLASS_BITS_R5);
                        }
                }
        }

        /* All accumulators are invalidated across a thread switch. */
        if (inst->qpu.sig.thrsw) {
                for (int i = 0; i < c->num_temps; i++) {
                        if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
                                set_temp_class_bits(&c->nodes, i,
                                                    CLASS_BITS_PHYS);
                        }
                }
        }
}

/**
 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
 *
 * The return value should be freed by the caller.
 */
struct qpu_reg *
v3d_register_allocate(struct v3d_compile *c)
{
        int acc_nodes[ACC_COUNT];
        c->nodes = (struct v3d_ra_node_info) {
                .alloc_count = c->num_temps,
                .info = ralloc_array_size(c, sizeof(c->nodes.info[0]),
                                          c->num_temps + ACC_COUNT),
        };

        struct v3d_ra_select_callback_data callback_data = {
                .next_acc = 0,
                /* Start at RF3, to try to keep the TLB writes from using
                 * RF0-2.
                 */
                .next_phys = 3,
                .nodes = &c->nodes,
        };

        vir_calculate_live_intervals(c);

        /* Convert 1, 2, 4 threads to 0, 1, 2 index.
         *
         * V3D 4.x has double the physical register space, so 64 physical regs
         * are available at both 1x and 2x threading, and 4x has 32.
         */
        c->thread_index = ffs(c->threads) - 1;
        if (c->devinfo->ver >= 40) {
                if (c->thread_index >= 1)
                        c->thread_index--;
        }

        c->g = ra_alloc_interference_graph(c->compiler->regs,
                                           c->num_temps + ARRAY_SIZE(acc_nodes));
        ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data);

        /* Make some fixed nodes for the accumulators, which we will need to
         * interfere with when ops have implied r3/r4 writes or for the thread
         * switches. We could represent these as classes for the nodes to
         * live in, but the classes take up a lot of memory to set up, so we
         * don't want to make too many.
         */
        for (uint32_t i = 0; i < ACC_COUNT + c->num_temps; i++) {
                if (i < ACC_COUNT) {
                        acc_nodes[i] = i;
                        ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
                        c->nodes.info[i].priority = 0;
                        c->nodes.info[i].class_bits = 0;
                } else {
                        uint32_t t = node_to_temp(i);
                        c->nodes.info[i].priority =
                                c->temp_end[t] - c->temp_start[t];
                        c->nodes.info[i].class_bits = CLASS_BITS_ANY;
                }
        }

        /* Walk the instructions adding register class restrictions and
         * interferences.
         */
        int ip = 0;
        vir_for_each_inst_inorder(inst, c) {
                inst->ip = ip++;
                update_graph_and_reg_classes_for_inst(c, acc_nodes, inst);
        }

        /* Set the register classes for all our temporaries in the graph */
        for (uint32_t i = 0; i < c->num_temps; i++) {
                ra_set_node_class(c->g, temp_to_node(i),
                                  choose_reg_class_for_temp(c, i));
        }

        /* Add register interferences based on liveness data */
        for (uint32_t i = 0; i < c->num_temps; i++) {
                for (uint32_t j = i + 1; j < c->num_temps; j++) {
                        if (interferes(c->temp_start[i], c->temp_end[i],
                                       c->temp_start[j], c->temp_end[j])) {
                                ra_add_node_interference(c->g,
                                                         temp_to_node(i),
                                                         temp_to_node(j));
                        }
                }
        }

        /* Debug option to force a bit of TMU spilling, for running
         * across conformance tests to make sure that spilling works.
         */
        const int force_register_spills = 0;
        if (force_register_spills > 0)
                c->max_tmu_spills = UINT32_MAX;

        struct qpu_reg *temp_registers = NULL;
        while (true) {
                if (c->spill_size <
                    V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
                        int node = v3d_choose_spill_node(c);
                        if (node != -1) {
                                uint32_t temp = node_to_temp(node);
                                v3d_spill_reg(c, acc_nodes, temp);
                                continue;
                        }
                }

                if (ra_allocate(c->g))
                        break;

                /* Failed allocation, try to spill */
                int node = v3d_choose_spill_node(c);
                if (node == -1)
                        goto spill_fail;

                uint32_t temp = node_to_temp(node);
                enum temp_spill_type spill_type =
                        get_spill_type_for_temp(c, temp);
                if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) {
                        v3d_spill_reg(c, acc_nodes, temp);
                        if (c->spills + c->fills > c->max_tmu_spills)
                                goto spill_fail;
                } else {
                        goto spill_fail;
                }
        }

        /* Allocation was successful, build the 'temp -> reg' map */
        temp_registers = calloc(c->num_temps, sizeof(*temp_registers));
        for (uint32_t i = 0; i < c->num_temps; i++) {
                int ra_reg = ra_get_node_reg(c->g, temp_to_node(i));
                if (ra_reg < PHYS_INDEX) {
                        temp_registers[i].magic = true;
                        temp_registers[i].index = (V3D_QPU_WADDR_R0 +
                                                   ra_reg - ACC_INDEX);
                } else {
                        temp_registers[i].magic = false;
                        temp_registers[i].index = ra_reg - PHYS_INDEX;
                }
        }

spill_fail:
        ralloc_free(c->nodes.info);
        c->nodes.info = NULL;
        c->nodes.alloc_count = 0;
        ralloc_free(c->g);
        c->g = NULL;
        return temp_registers;
}