1/* 2 * Copyright © 2010 Intel Corporation 3 * Copyright © 2014-2017 Broadcom 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22 * IN THE SOFTWARE. 23 */ 24 25/** 26 * @file 27 * 28 * The basic model of the list scheduler is to take a basic block, compute a 29 * DAG of the dependencies, and make a list of the DAG heads. Heuristically 30 * pick a DAG head, then put all the children that are now DAG heads into the 31 * list of things to schedule. 32 * 33 * The goal of scheduling here is to pack pairs of operations together in a 34 * single QPU instruction. 35 */ 36 37#include "qpu/qpu_disasm.h" 38#include "v3d_compiler.h" 39#include "util/ralloc.h" 40#include "util/dag.h" 41 42static bool debug; 43 44struct schedule_node_child; 45 46struct schedule_node { 47 struct dag_node dag; 48 struct list_head link; 49 struct qinst *inst; 50 51 /* Longest cycles + instruction_latency() of any parent of this node. */ 52 uint32_t unblocked_time; 53 54 /** 55 * Minimum number of cycles from scheduling this instruction until the 56 * end of the program, based on the slowest dependency chain through 57 * the children. 58 */ 59 uint32_t delay; 60 61 /** 62 * cycles between this instruction being scheduled and when its result 63 * can be consumed. 64 */ 65 uint32_t latency; 66}; 67 68/* When walking the instructions in reverse, we need to swap before/after in 69 * add_dep(). 70 */ 71enum direction { F, R }; 72 73struct schedule_state { 74 const struct v3d_device_info *devinfo; 75 struct dag *dag; 76 struct schedule_node *last_r[6]; 77 struct schedule_node *last_rf[64]; 78 struct schedule_node *last_sf; 79 struct schedule_node *last_vpm_read; 80 struct schedule_node *last_tmu_write; 81 struct schedule_node *last_tmu_config; 82 struct schedule_node *last_tmu_read; 83 struct schedule_node *last_tlb; 84 struct schedule_node *last_vpm; 85 struct schedule_node *last_unif; 86 struct schedule_node *last_rtop; 87 struct schedule_node *last_unifa; 88 enum direction dir; 89 /* Estimated cycle when the current instruction would start. 
*/ 90 uint32_t time; 91}; 92 93static void 94add_dep(struct schedule_state *state, 95 struct schedule_node *before, 96 struct schedule_node *after, 97 bool write) 98{ 99 bool write_after_read = !write && state->dir == R; 100 uintptr_t edge_data = write_after_read; 101 102 if (!before || !after) 103 return; 104 105 assert(before != after); 106 107 if (state->dir == F) 108 dag_add_edge(&before->dag, &after->dag, edge_data); 109 else 110 dag_add_edge(&after->dag, &before->dag, edge_data); 111} 112 113static void 114add_read_dep(struct schedule_state *state, 115 struct schedule_node *before, 116 struct schedule_node *after) 117{ 118 add_dep(state, before, after, false); 119} 120 121static void 122add_write_dep(struct schedule_state *state, 123 struct schedule_node **before, 124 struct schedule_node *after) 125{ 126 add_dep(state, *before, after, true); 127 *before = after; 128} 129 130static bool 131qpu_inst_is_tlb(const struct v3d_qpu_instr *inst) 132{ 133 if (inst->sig.ldtlb || inst->sig.ldtlbu) 134 return true; 135 136 if (inst->type != V3D_QPU_INSTR_TYPE_ALU) 137 return false; 138 139 if (inst->alu.add.magic_write && 140 (inst->alu.add.waddr == V3D_QPU_WADDR_TLB || 141 inst->alu.add.waddr == V3D_QPU_WADDR_TLBU)) 142 return true; 143 144 if (inst->alu.mul.magic_write && 145 (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB || 146 inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU)) 147 return true; 148 149 return false; 150} 151 152static void 153process_mux_deps(struct schedule_state *state, struct schedule_node *n, 154 enum v3d_qpu_mux mux) 155{ 156 switch (mux) { 157 case V3D_QPU_MUX_A: 158 add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); 159 break; 160 case V3D_QPU_MUX_B: 161 if (!n->inst->qpu.sig.small_imm) { 162 add_read_dep(state, 163 state->last_rf[n->inst->qpu.raddr_b], n); 164 } 165 break; 166 default: 167 add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n); 168 break; 169 } 170} 171 172static bool 173tmu_write_is_sequence_terminator(uint32_t waddr) 174{ 175 switch (waddr) { 176 case V3D_QPU_WADDR_TMUS: 177 case V3D_QPU_WADDR_TMUSCM: 178 case V3D_QPU_WADDR_TMUSF: 179 case V3D_QPU_WADDR_TMUSLOD: 180 case V3D_QPU_WADDR_TMUA: 181 case V3D_QPU_WADDR_TMUAU: 182 return true; 183 default: 184 return false; 185 } 186} 187 188static bool 189can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr) 190{ 191 if (devinfo->ver < 40) 192 return false; 193 194 if (tmu_write_is_sequence_terminator(waddr)) 195 return false; 196 197 if (waddr == V3D_QPU_WADDR_TMUD) 198 return false; 199 200 return true; 201} 202 203static void 204process_waddr_deps(struct schedule_state *state, struct schedule_node *n, 205 uint32_t waddr, bool magic) 206{ 207 if (!magic) { 208 add_write_dep(state, &state->last_rf[waddr], n); 209 } else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) { 210 if (can_reorder_tmu_write(state->devinfo, waddr)) 211 add_read_dep(state, state->last_tmu_write, n); 212 else 213 add_write_dep(state, &state->last_tmu_write, n); 214 215 if (tmu_write_is_sequence_terminator(waddr)) 216 add_write_dep(state, &state->last_tmu_config, n); 217 } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) { 218 /* Handled by v3d_qpu_writes_r4() check. */ 219 } else { 220 switch (waddr) { 221 case V3D_QPU_WADDR_R0: 222 case V3D_QPU_WADDR_R1: 223 case V3D_QPU_WADDR_R2: 224 add_write_dep(state, 225 &state->last_r[waddr - V3D_QPU_WADDR_R0], 226 n); 227 break; 228 case V3D_QPU_WADDR_R3: 229 case V3D_QPU_WADDR_R4: 230 case V3D_QPU_WADDR_R5: 231 /* Handled by v3d_qpu_writes_r*() checks below. 
*/ 232 break; 233 234 case V3D_QPU_WADDR_VPM: 235 case V3D_QPU_WADDR_VPMU: 236 add_write_dep(state, &state->last_vpm, n); 237 break; 238 239 case V3D_QPU_WADDR_TLB: 240 case V3D_QPU_WADDR_TLBU: 241 add_write_dep(state, &state->last_tlb, n); 242 break; 243 244 case V3D_QPU_WADDR_SYNC: 245 case V3D_QPU_WADDR_SYNCB: 246 case V3D_QPU_WADDR_SYNCU: 247 /* For CS barrier(): Sync against any other memory 248 * accesses. There doesn't appear to be any need for 249 * barriers to affect ALU operations. 250 */ 251 add_write_dep(state, &state->last_tmu_write, n); 252 add_write_dep(state, &state->last_tmu_read, n); 253 break; 254 255 case V3D_QPU_WADDR_UNIFA: 256 if (state->devinfo->ver >= 40) 257 add_write_dep(state, &state->last_unifa, n); 258 break; 259 260 case V3D_QPU_WADDR_NOP: 261 break; 262 263 default: 264 fprintf(stderr, "Unknown waddr %d\n", waddr); 265 abort(); 266 } 267 } 268} 269 270/** 271 * Common code for dependencies that need to be tracked both forward and 272 * backward. 273 * 274 * This is for things like "all reads of r4 have to happen between the r4 275 * writes that surround them". 276 */ 277static void 278calculate_deps(struct schedule_state *state, struct schedule_node *n) 279{ 280 const struct v3d_device_info *devinfo = state->devinfo; 281 struct qinst *qinst = n->inst; 282 struct v3d_qpu_instr *inst = &qinst->qpu; 283 /* If the input and output segments are shared, then all VPM reads to 284 * a location need to happen before all writes. We handle this by 285 * serializing all VPM operations for now. 286 */ 287 bool separate_vpm_segment = false; 288 289 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { 290 if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) 291 add_read_dep(state, state->last_sf, n); 292 293 /* XXX: BDI */ 294 /* XXX: BDU */ 295 /* XXX: ub */ 296 /* XXX: raddr_a */ 297 298 add_write_dep(state, &state->last_unif, n); 299 return; 300 } 301 302 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 303 304 /* XXX: LOAD_IMM */ 305 306 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) 307 process_mux_deps(state, n, inst->alu.add.a); 308 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) 309 process_mux_deps(state, n, inst->alu.add.b); 310 311 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) 312 process_mux_deps(state, n, inst->alu.mul.a); 313 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) 314 process_mux_deps(state, n, inst->alu.mul.b); 315 316 switch (inst->alu.add.op) { 317 case V3D_QPU_A_VPMSETUP: 318 /* Could distinguish read/write by unpacking the uniform. */ 319 add_write_dep(state, &state->last_vpm, n); 320 add_write_dep(state, &state->last_vpm_read, n); 321 break; 322 323 case V3D_QPU_A_STVPMV: 324 case V3D_QPU_A_STVPMD: 325 case V3D_QPU_A_STVPMP: 326 add_write_dep(state, &state->last_vpm, n); 327 break; 328 329 case V3D_QPU_A_LDVPMV_IN: 330 case V3D_QPU_A_LDVPMD_IN: 331 case V3D_QPU_A_LDVPMG_IN: 332 case V3D_QPU_A_LDVPMP: 333 if (!separate_vpm_segment) 334 add_write_dep(state, &state->last_vpm, n); 335 break; 336 337 case V3D_QPU_A_VPMWT: 338 add_read_dep(state, state->last_vpm, n); 339 break; 340 341 case V3D_QPU_A_MSF: 342 add_read_dep(state, state->last_tlb, n); 343 break; 344 345 case V3D_QPU_A_SETMSF: 346 case V3D_QPU_A_SETREVF: 347 add_write_dep(state, &state->last_tlb, n); 348 break; 349 350 default: 351 break; 352 } 353 354 switch (inst->alu.mul.op) { 355 case V3D_QPU_M_MULTOP: 356 case V3D_QPU_M_UMUL24: 357 /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and 358 * resets it to 0. 
We could possibly reorder umul24s relative 359 * to each other, but for now just keep all the MUL parts in 360 * order. 361 */ 362 add_write_dep(state, &state->last_rtop, n); 363 break; 364 default: 365 break; 366 } 367 368 if (inst->alu.add.op != V3D_QPU_A_NOP) { 369 process_waddr_deps(state, n, inst->alu.add.waddr, 370 inst->alu.add.magic_write); 371 } 372 if (inst->alu.mul.op != V3D_QPU_M_NOP) { 373 process_waddr_deps(state, n, inst->alu.mul.waddr, 374 inst->alu.mul.magic_write); 375 } 376 if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) { 377 process_waddr_deps(state, n, inst->sig_addr, 378 inst->sig_magic); 379 } 380 381 if (v3d_qpu_writes_r3(devinfo, inst)) 382 add_write_dep(state, &state->last_r[3], n); 383 if (v3d_qpu_writes_r4(devinfo, inst)) 384 add_write_dep(state, &state->last_r[4], n); 385 if (v3d_qpu_writes_r5(devinfo, inst)) 386 add_write_dep(state, &state->last_r[5], n); 387 388 /* If we add any more dependencies here we should consider whether we 389 * also need to update qpu_inst_after_thrsw_valid_in_delay_slot. 390 */ 391 if (inst->sig.thrsw) { 392 /* All accumulator contents and flags are undefined after the 393 * switch. 394 */ 395 for (int i = 0; i < ARRAY_SIZE(state->last_r); i++) 396 add_write_dep(state, &state->last_r[i], n); 397 add_write_dep(state, &state->last_sf, n); 398 add_write_dep(state, &state->last_rtop, n); 399 400 /* Scoreboard-locking operations have to stay after the last 401 * thread switch. 402 */ 403 add_write_dep(state, &state->last_tlb, n); 404 405 add_write_dep(state, &state->last_tmu_write, n); 406 add_write_dep(state, &state->last_tmu_config, n); 407 } 408 409 if (v3d_qpu_waits_on_tmu(inst)) { 410 /* TMU loads are coming from a FIFO, so ordering is important. 411 */ 412 add_write_dep(state, &state->last_tmu_read, n); 413 /* Keep TMU loads after their TMU lookup terminator */ 414 add_read_dep(state, state->last_tmu_config, n); 415 } 416 417 /* Allow wrtmuc to be reordered with other instructions in the 418 * same TMU sequence by using a read dependency on the last TMU 419 * sequence terminator. 420 */ 421 if (inst->sig.wrtmuc) 422 add_read_dep(state, state->last_tmu_config, n); 423 424 if (inst->sig.ldtlb | inst->sig.ldtlbu) 425 add_write_dep(state, &state->last_tlb, n); 426 427 if (inst->sig.ldvpm) { 428 add_write_dep(state, &state->last_vpm_read, n); 429 430 /* At least for now, we're doing shared I/O segments, so queue 431 * all writes after all reads. 
432 */ 433 if (!separate_vpm_segment) 434 add_write_dep(state, &state->last_vpm, n); 435 } 436 437 /* inst->sig.ldunif or sideband uniform read */ 438 if (vir_has_uniform(qinst)) 439 add_write_dep(state, &state->last_unif, n); 440 441 /* Both unifa and ldunifa must preserve ordering */ 442 if (inst->sig.ldunifa || inst->sig.ldunifarf) 443 add_write_dep(state, &state->last_unifa, n); 444 445 if (v3d_qpu_reads_flags(inst)) 446 add_read_dep(state, state->last_sf, n); 447 if (v3d_qpu_writes_flags(inst)) 448 add_write_dep(state, &state->last_sf, n); 449} 450 451static void 452calculate_forward_deps(struct v3d_compile *c, struct dag *dag, 453 struct list_head *schedule_list) 454{ 455 struct schedule_state state; 456 457 memset(&state, 0, sizeof(state)); 458 state.dag = dag; 459 state.devinfo = c->devinfo; 460 state.dir = F; 461 462 list_for_each_entry(struct schedule_node, node, schedule_list, link) 463 calculate_deps(&state, node); 464} 465 466static void 467calculate_reverse_deps(struct v3d_compile *c, struct dag *dag, 468 struct list_head *schedule_list) 469{ 470 struct schedule_state state; 471 472 memset(&state, 0, sizeof(state)); 473 state.dag = dag; 474 state.devinfo = c->devinfo; 475 state.dir = R; 476 477 list_for_each_entry_rev(struct schedule_node, node, schedule_list, 478 link) { 479 calculate_deps(&state, (struct schedule_node *)node); 480 } 481} 482 483struct choose_scoreboard { 484 struct dag *dag; 485 int tick; 486 int last_magic_sfu_write_tick; 487 int last_stallable_sfu_reg; 488 int last_stallable_sfu_tick; 489 int last_ldvary_tick; 490 int last_unifa_write_tick; 491 int last_uniforms_reset_tick; 492 int last_thrsw_tick; 493 int last_branch_tick; 494 int last_setmsf_tick; 495 bool first_thrsw_emitted; 496 bool last_thrsw_emitted; 497 bool fixup_ldvary; 498 int ldvary_count; 499}; 500 501static bool 502mux_reads_too_soon(struct choose_scoreboard *scoreboard, 503 const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux) 504{ 505 switch (mux) { 506 case V3D_QPU_MUX_R4: 507 if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2) 508 return true; 509 break; 510 511 case V3D_QPU_MUX_R5: 512 if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) 513 return true; 514 break; 515 default: 516 break; 517 } 518 519 return false; 520} 521 522static bool 523reads_too_soon_after_write(struct choose_scoreboard *scoreboard, 524 struct qinst *qinst) 525{ 526 const struct v3d_qpu_instr *inst = &qinst->qpu; 527 528 /* XXX: Branching off of raddr. 
 */
        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                return false;

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
                        return true;
                }
                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
                        return true;
                }
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
                        return true;
                }
                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
                        return true;
                }
        }

        /* XXX: imm */

        return false;
}

static bool
writes_too_soon_after_write(const struct v3d_device_info *devinfo,
                            struct choose_scoreboard *scoreboard,
                            struct qinst *qinst)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* Don't schedule any other r4 write too soon after an SFU write.
         * This would normally be prevented by dependency tracking, but might
         * occur if a dead SFU computation makes it to scheduling.
         */
        if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 &&
            v3d_qpu_writes_r4(devinfo, inst))
                return true;

        return false;
}

static bool
scoreboard_is_locked(struct choose_scoreboard *scoreboard,
                     bool lock_scoreboard_on_first_thrsw)
{
        if (lock_scoreboard_on_first_thrsw) {
                return scoreboard->first_thrsw_emitted &&
                       scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
        }

        return scoreboard->last_thrsw_emitted &&
               scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
}

static bool
pixel_scoreboard_too_soon(struct v3d_compile *c,
                          struct choose_scoreboard *scoreboard,
                          const struct v3d_qpu_instr *inst)
{
        return qpu_inst_is_tlb(inst) &&
               !scoreboard_is_locked(scoreboard,
                                     c->lock_scoreboard_on_first_thrsw);
}

static bool
qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
                        uint32_t waddr) {

        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
            inst->raddr_a == waddr)
                return true;

        if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
            !inst->sig.small_imm && (inst->raddr_b == waddr))
                return true;

        return false;
}

static bool
mux_read_stalls(struct choose_scoreboard *scoreboard,
                const struct v3d_qpu_instr *inst)
{
        return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
               qpu_instruction_uses_rf(inst,
                                       scoreboard->last_stallable_sfu_reg);
}

/* We define a max schedule priority to allow negative priorities as a result
 * of subtracting this max when an instruction stalls. So instructions that
 * stall have lower priority than regular instructions. */
#define MAX_SCHEDULE_PRIORITY 16

static int
get_instruction_priority(const struct v3d_device_info *devinfo,
                         const struct v3d_qpu_instr *inst)
{
        uint32_t baseline_score;
        uint32_t next_score = 0;

        /* Schedule TLB operations as late as possible, to get more
         * parallelism between shaders.
643 */ 644 if (qpu_inst_is_tlb(inst)) 645 return next_score; 646 next_score++; 647 648 /* Empirical testing shows that using priorities to hide latency of 649 * TMU operations when scheduling QPU leads to slightly worse 650 * performance, even at 2 threads. We think this is because the thread 651 * switching is already quite effective at hiding latency and NIR 652 * scheduling (and possibly TMU pipelining too) are sufficient to hide 653 * TMU latency, so piling up on that here doesn't provide any benefits 654 * and instead may cause us to postpone critical paths that depend on 655 * the TMU results. 656 */ 657#if 0 658 /* Schedule texture read results collection late to hide latency. */ 659 if (v3d_qpu_waits_on_tmu(inst)) 660 return next_score; 661 next_score++; 662#endif 663 664 /* Default score for things that aren't otherwise special. */ 665 baseline_score = next_score; 666 next_score++; 667 668#if 0 669 /* Schedule texture read setup early to hide their latency better. */ 670 if (v3d_qpu_writes_tmu(devinfo, inst)) 671 return next_score; 672 next_score++; 673#endif 674 675 /* We should increase the maximum if we assert here */ 676 assert(next_score < MAX_SCHEDULE_PRIORITY); 677 678 return baseline_score; 679} 680 681enum { 682 V3D_PERIPHERAL_VPM_READ = (1 << 0), 683 V3D_PERIPHERAL_VPM_WRITE = (1 << 1), 684 V3D_PERIPHERAL_VPM_WAIT = (1 << 2), 685 V3D_PERIPHERAL_SFU = (1 << 3), 686 V3D_PERIPHERAL_TMU_WRITE = (1 << 4), 687 V3D_PERIPHERAL_TMU_READ = (1 << 5), 688 V3D_PERIPHERAL_TMU_WAIT = (1 << 6), 689 V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7), 690 V3D_PERIPHERAL_TSY = (1 << 8), 691 V3D_PERIPHERAL_TLB = (1 << 9), 692}; 693 694static uint32_t 695qpu_peripherals(const struct v3d_device_info *devinfo, 696 const struct v3d_qpu_instr *inst) 697{ 698 uint32_t result = 0; 699 if (v3d_qpu_reads_vpm(inst)) 700 result |= V3D_PERIPHERAL_VPM_READ; 701 if (v3d_qpu_writes_vpm(inst)) 702 result |= V3D_PERIPHERAL_VPM_WRITE; 703 if (v3d_qpu_waits_vpm(inst)) 704 result |= V3D_PERIPHERAL_VPM_WAIT; 705 706 if (v3d_qpu_writes_tmu(devinfo, inst)) 707 result |= V3D_PERIPHERAL_TMU_WRITE; 708 if (inst->sig.ldtmu) 709 result |= V3D_PERIPHERAL_TMU_READ; 710 if (inst->sig.wrtmuc) 711 result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG; 712 713 if (v3d_qpu_uses_sfu(inst)) 714 result |= V3D_PERIPHERAL_SFU; 715 716 if (v3d_qpu_uses_tlb(inst)) 717 result |= V3D_PERIPHERAL_TLB; 718 719 if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { 720 if (inst->alu.add.op != V3D_QPU_A_NOP && 721 inst->alu.add.magic_write && 722 v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) { 723 result |= V3D_PERIPHERAL_TSY; 724 } 725 726 if (inst->alu.add.op == V3D_QPU_A_TMUWT) 727 result |= V3D_PERIPHERAL_TMU_WAIT; 728 } 729 730 return result; 731} 732 733static bool 734qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo, 735 const struct v3d_qpu_instr *a, 736 const struct v3d_qpu_instr *b) 737{ 738 const uint32_t a_peripherals = qpu_peripherals(devinfo, a); 739 const uint32_t b_peripherals = qpu_peripherals(devinfo, b); 740 741 /* We can always do one peripheral access per instruction. */ 742 if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1) 743 return true; 744 745 if (devinfo->ver < 41) 746 return false; 747 748 /* V3D 4.1+ allow WRTMUC signal with TMU register write (other than 749 * tmuc). 
750 */ 751 if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && 752 b_peripherals == V3D_PERIPHERAL_TMU_WRITE) { 753 return v3d_qpu_writes_tmu_not_tmuc(devinfo, b); 754 } 755 756 if (a_peripherals == V3D_PERIPHERAL_TMU_WRITE && 757 b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG) { 758 return v3d_qpu_writes_tmu_not_tmuc(devinfo, a); 759 } 760 761 /* V3D 4.1+ allows TMU read with VPM read/write. */ 762 if (a_peripherals == V3D_PERIPHERAL_TMU_READ && 763 (b_peripherals == V3D_PERIPHERAL_VPM_READ || 764 b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { 765 return true; 766 } 767 if (b_peripherals == V3D_PERIPHERAL_TMU_READ && 768 (a_peripherals == V3D_PERIPHERAL_VPM_READ || 769 a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { 770 return true; 771 } 772 773 return false; 774} 775 776/* Compute a bitmask of which rf registers are used between 777 * the two instructions. 778 */ 779static uint64_t 780qpu_raddrs_used(const struct v3d_qpu_instr *a, 781 const struct v3d_qpu_instr *b) 782{ 783 assert(a->type == V3D_QPU_INSTR_TYPE_ALU); 784 assert(b->type == V3D_QPU_INSTR_TYPE_ALU); 785 786 uint64_t raddrs_used = 0; 787 if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A)) 788 raddrs_used |= (1ll << a->raddr_a); 789 if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) 790 raddrs_used |= (1ll << a->raddr_b); 791 if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) 792 raddrs_used |= (1ll << b->raddr_a); 793 if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) 794 raddrs_used |= (1ll << b->raddr_b); 795 796 return raddrs_used; 797} 798 799/* Take two instructions and attempt to merge their raddr fields 800 * into one merged instruction. Returns false if the two instructions 801 * access more than two different rf registers between them, or more 802 * than one rf register and one small immediate. 803 */ 804static bool 805qpu_merge_raddrs(struct v3d_qpu_instr *result, 806 const struct v3d_qpu_instr *add_instr, 807 const struct v3d_qpu_instr *mul_instr) 808{ 809 uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr); 810 int naddrs = util_bitcount64(raddrs_used); 811 812 if (naddrs > 2) 813 return false; 814 815 if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) { 816 if (naddrs > 1) 817 return false; 818 819 if (add_instr->sig.small_imm && mul_instr->sig.small_imm) 820 if (add_instr->raddr_b != mul_instr->raddr_b) 821 return false; 822 823 result->sig.small_imm = true; 824 result->raddr_b = add_instr->sig.small_imm ? 
825 add_instr->raddr_b : mul_instr->raddr_b; 826 } 827 828 if (naddrs == 0) 829 return true; 830 831 int raddr_a = ffsll(raddrs_used) - 1; 832 raddrs_used &= ~(1ll << raddr_a); 833 result->raddr_a = raddr_a; 834 835 if (!result->sig.small_imm) { 836 if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) && 837 raddr_a == add_instr->raddr_b) { 838 if (add_instr->alu.add.a == V3D_QPU_MUX_B) 839 result->alu.add.a = V3D_QPU_MUX_A; 840 if (add_instr->alu.add.b == V3D_QPU_MUX_B && 841 v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { 842 result->alu.add.b = V3D_QPU_MUX_A; 843 } 844 } 845 if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) && 846 raddr_a == mul_instr->raddr_b) { 847 if (mul_instr->alu.mul.a == V3D_QPU_MUX_B) 848 result->alu.mul.a = V3D_QPU_MUX_A; 849 if (mul_instr->alu.mul.b == V3D_QPU_MUX_B && 850 v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { 851 result->alu.mul.b = V3D_QPU_MUX_A; 852 } 853 } 854 } 855 if (!raddrs_used) 856 return true; 857 858 int raddr_b = ffsll(raddrs_used) - 1; 859 result->raddr_b = raddr_b; 860 if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) && 861 raddr_b == add_instr->raddr_a) { 862 if (add_instr->alu.add.a == V3D_QPU_MUX_A) 863 result->alu.add.a = V3D_QPU_MUX_B; 864 if (add_instr->alu.add.b == V3D_QPU_MUX_A && 865 v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { 866 result->alu.add.b = V3D_QPU_MUX_B; 867 } 868 } 869 if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) && 870 raddr_b == mul_instr->raddr_a) { 871 if (mul_instr->alu.mul.a == V3D_QPU_MUX_A) 872 result->alu.mul.a = V3D_QPU_MUX_B; 873 if (mul_instr->alu.mul.b == V3D_QPU_MUX_A && 874 v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { 875 result->alu.mul.b = V3D_QPU_MUX_B; 876 } 877 } 878 879 return true; 880} 881 882static bool 883can_do_add_as_mul(enum v3d_qpu_add_op op) 884{ 885 switch (op) { 886 case V3D_QPU_A_ADD: 887 case V3D_QPU_A_SUB: 888 return true; 889 default: 890 return false; 891 } 892} 893 894static enum v3d_qpu_mul_op 895add_op_as_mul_op(enum v3d_qpu_add_op op) 896{ 897 switch (op) { 898 case V3D_QPU_A_ADD: 899 return V3D_QPU_M_ADD; 900 case V3D_QPU_A_SUB: 901 return V3D_QPU_M_SUB; 902 default: 903 unreachable("unexpected add opcode"); 904 } 905} 906 907static void 908qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) 909{ 910 STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add)); 911 assert(inst->alu.add.op != V3D_QPU_A_NOP); 912 assert(inst->alu.mul.op == V3D_QPU_M_NOP); 913 914 memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul)); 915 inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op); 916 inst->alu.add.op = V3D_QPU_A_NOP; 917 918 inst->flags.mc = inst->flags.ac; 919 inst->flags.mpf = inst->flags.apf; 920 inst->flags.muf = inst->flags.auf; 921 inst->flags.ac = V3D_QPU_COND_NONE; 922 inst->flags.apf = V3D_QPU_PF_NONE; 923 inst->flags.auf = V3D_QPU_UF_NONE; 924 925 inst->alu.mul.output_pack = inst->alu.add.output_pack; 926 inst->alu.mul.a_unpack = inst->alu.add.a_unpack; 927 inst->alu.mul.b_unpack = inst->alu.add.b_unpack; 928 inst->alu.add.output_pack = V3D_QPU_PACK_NONE; 929 inst->alu.add.a_unpack = V3D_QPU_UNPACK_NONE; 930 inst->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; 931} 932 933static bool 934qpu_merge_inst(const struct v3d_device_info *devinfo, 935 struct v3d_qpu_instr *result, 936 const struct v3d_qpu_instr *a, 937 const struct v3d_qpu_instr *b) 938{ 939 if (a->type != V3D_QPU_INSTR_TYPE_ALU || 940 b->type != V3D_QPU_INSTR_TYPE_ALU) { 941 return false; 942 } 943 944 if (!qpu_compatible_peripheral_access(devinfo, a, b)) 945 return false; 946 947 struct 
v3d_qpu_instr merge = *a; 948 const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL; 949 950 struct v3d_qpu_instr mul_inst; 951 if (b->alu.add.op != V3D_QPU_A_NOP) { 952 if (a->alu.add.op == V3D_QPU_A_NOP) { 953 merge.alu.add = b->alu.add; 954 955 merge.flags.ac = b->flags.ac; 956 merge.flags.apf = b->flags.apf; 957 merge.flags.auf = b->flags.auf; 958 959 add_instr = b; 960 mul_instr = a; 961 } 962 /* If a's add op is used but its mul op is not, then see if we 963 * can convert either a's add op or b's add op to a mul op 964 * so we can merge. 965 */ 966 else if (a->alu.mul.op == V3D_QPU_M_NOP && 967 can_do_add_as_mul(b->alu.add.op)) { 968 mul_inst = *b; 969 qpu_convert_add_to_mul(&mul_inst); 970 971 merge.alu.mul = mul_inst.alu.mul; 972 973 merge.flags.mc = b->flags.ac; 974 merge.flags.mpf = b->flags.apf; 975 merge.flags.muf = b->flags.auf; 976 977 add_instr = a; 978 mul_instr = &mul_inst; 979 } else if (a->alu.mul.op == V3D_QPU_M_NOP && 980 can_do_add_as_mul(a->alu.add.op)) { 981 mul_inst = *a; 982 qpu_convert_add_to_mul(&mul_inst); 983 984 merge = mul_inst; 985 merge.alu.add = b->alu.add; 986 987 merge.flags.ac = b->flags.ac; 988 merge.flags.apf = b->flags.apf; 989 merge.flags.auf = b->flags.auf; 990 991 add_instr = b; 992 mul_instr = &mul_inst; 993 } else { 994 return false; 995 } 996 } 997 998 if (b->alu.mul.op != V3D_QPU_M_NOP) { 999 if (a->alu.mul.op != V3D_QPU_M_NOP) 1000 return false; 1001 merge.alu.mul = b->alu.mul; 1002 1003 merge.flags.mc = b->flags.mc; 1004 merge.flags.mpf = b->flags.mpf; 1005 merge.flags.muf = b->flags.muf; 1006 1007 mul_instr = b; 1008 add_instr = a; 1009 } 1010 1011 if (add_instr && mul_instr && 1012 !qpu_merge_raddrs(&merge, add_instr, mul_instr)) { 1013 return false; 1014 } 1015 1016 merge.sig.thrsw |= b->sig.thrsw; 1017 merge.sig.ldunif |= b->sig.ldunif; 1018 merge.sig.ldunifrf |= b->sig.ldunifrf; 1019 merge.sig.ldunifa |= b->sig.ldunifa; 1020 merge.sig.ldunifarf |= b->sig.ldunifarf; 1021 merge.sig.ldtmu |= b->sig.ldtmu; 1022 merge.sig.ldvary |= b->sig.ldvary; 1023 merge.sig.ldvpm |= b->sig.ldvpm; 1024 merge.sig.small_imm |= b->sig.small_imm; 1025 merge.sig.ldtlb |= b->sig.ldtlb; 1026 merge.sig.ldtlbu |= b->sig.ldtlbu; 1027 merge.sig.ucb |= b->sig.ucb; 1028 merge.sig.rotate |= b->sig.rotate; 1029 merge.sig.wrtmuc |= b->sig.wrtmuc; 1030 1031 if (v3d_qpu_sig_writes_address(devinfo, &a->sig) && 1032 v3d_qpu_sig_writes_address(devinfo, &b->sig)) 1033 return false; 1034 merge.sig_addr |= b->sig_addr; 1035 merge.sig_magic |= b->sig_magic; 1036 1037 uint64_t packed; 1038 bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed); 1039 1040 *result = merge; 1041 /* No modifying the real instructions on failure. */ 1042 assert(ok || (a != result && b != result)); 1043 1044 return ok; 1045} 1046 1047static inline bool 1048try_skip_for_ldvary_pipelining(const struct v3d_qpu_instr *inst) 1049{ 1050 return inst->sig.ldunif || inst->sig.ldunifrf; 1051} 1052 1053static bool 1054qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, 1055 struct choose_scoreboard *scoreboard, 1056 const struct qinst *qinst); 1057 1058static struct schedule_node * 1059choose_instruction_to_schedule(struct v3d_compile *c, 1060 struct choose_scoreboard *scoreboard, 1061 struct schedule_node *prev_inst) 1062{ 1063 struct schedule_node *chosen = NULL; 1064 int chosen_prio = 0; 1065 1066 /* Don't pair up anything with a thread switch signal -- emit_thrsw() 1067 * will handle pairing it along with filling the delay slots. 
1068 */ 1069 if (prev_inst) { 1070 if (prev_inst->inst->qpu.sig.thrsw) 1071 return NULL; 1072 } 1073 1074 bool ldvary_pipelining = c->s->info.stage == MESA_SHADER_FRAGMENT && 1075 scoreboard->ldvary_count < c->num_inputs; 1076 bool skipped_insts_for_ldvary_pipelining = false; 1077retry: 1078 list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads, 1079 dag.link) { 1080 const struct v3d_qpu_instr *inst = &n->inst->qpu; 1081 1082 if (ldvary_pipelining && try_skip_for_ldvary_pipelining(inst)) { 1083 skipped_insts_for_ldvary_pipelining = true; 1084 continue; 1085 } 1086 1087 /* Don't choose the branch instruction until it's the last one 1088 * left. We'll move it up to fit its delay slots after we 1089 * choose it. 1090 */ 1091 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && 1092 !list_is_singular(&scoreboard->dag->heads)) { 1093 continue; 1094 } 1095 1096 /* We need to have 3 delay slots between a write to unifa and 1097 * a follow-up ldunifa. 1098 */ 1099 if ((inst->sig.ldunifa || inst->sig.ldunifarf) && 1100 scoreboard->tick - scoreboard->last_unifa_write_tick <= 3) 1101 continue; 1102 1103 /* "An instruction must not read from a location in physical 1104 * regfile A or B that was written to by the previous 1105 * instruction." 1106 */ 1107 if (reads_too_soon_after_write(scoreboard, n->inst)) 1108 continue; 1109 1110 if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst)) 1111 continue; 1112 1113 /* "Before doing a TLB access a scoreboard wait must have been 1114 * done. This happens either on the first or last thread 1115 * switch, depending on a setting (scb_wait_on_first_thrsw) in 1116 * the shader state." 1117 */ 1118 if (pixel_scoreboard_too_soon(c, scoreboard, inst)) 1119 continue; 1120 1121 /* ldunif and ldvary both write r5, but ldunif does so a tick 1122 * sooner. If the ldvary's r5 wasn't used, then ldunif might 1123 * otherwise get scheduled so ldunif and ldvary try to update 1124 * r5 in the same tick. 1125 */ 1126 if ((inst->sig.ldunif || inst->sig.ldunifa) && 1127 scoreboard->tick == scoreboard->last_ldvary_tick + 1) { 1128 continue; 1129 } 1130 1131 /* If we are in a thrsw delay slot check that this instruction 1132 * is valid for that. 1133 */ 1134 if (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick && 1135 !qpu_inst_after_thrsw_valid_in_delay_slot(c, scoreboard, 1136 n->inst)) { 1137 continue; 1138 } 1139 1140 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { 1141 /* Don't try to put a branch in the delay slots of another 1142 * branch or a unifa write. 1143 */ 1144 if (scoreboard->last_branch_tick + 3 >= scoreboard->tick) 1145 continue; 1146 if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick) 1147 continue; 1148 1149 /* No branch with cond != 0,2,3 and msfign != 0 after 1150 * setmsf. 1151 */ 1152 if (scoreboard->last_setmsf_tick == scoreboard->tick - 1 && 1153 inst->branch.msfign != V3D_QPU_MSFIGN_NONE && 1154 inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS && 1155 inst->branch.cond != V3D_QPU_BRANCH_COND_A0 && 1156 inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) { 1157 continue; 1158 } 1159 } 1160 1161 /* If we're trying to pair with another instruction, check 1162 * that they're compatible. 1163 */ 1164 if (prev_inst) { 1165 /* Don't pair up a thread switch signal -- we'll 1166 * handle pairing it when we pick it on its own. 
                         */
                        if (inst->sig.thrsw)
                                continue;

                        if (prev_inst->inst->uniform != -1 &&
                            n->inst->uniform != -1)
                                continue;

                        /* Simulator complains if we have two uniforms loaded
                         * in the same instruction, which could happen if we
                         * have a ldunif or sideband uniform and we pair that
                         * with ldunifa.
                         */
                        if (vir_has_uniform(prev_inst->inst) &&
                            (inst->sig.ldunifa || inst->sig.ldunifarf)) {
                                continue;
                        }

                        if ((prev_inst->inst->qpu.sig.ldunifa ||
                             prev_inst->inst->qpu.sig.ldunifarf) &&
                            vir_has_uniform(n->inst)) {
                                continue;
                        }

                        /* Don't merge TLB instructions before we have acquired
                         * the scoreboard lock.
                         */
                        if (pixel_scoreboard_too_soon(c, scoreboard, inst))
                                continue;

                        /* When we successfully pair up an ldvary we then try
                         * to merge it into the previous instruction if
                         * possible to improve pipelining. Don't pick up the
                         * ldvary now if the follow-up fixup would place
                         * it in the delay slots of a thrsw, which is not
                         * allowed and would prevent the fixup from being
                         * successful.
                         */
                        if (inst->sig.ldvary &&
                            scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
                                continue;
                        }

                        struct v3d_qpu_instr merged_inst;
                        if (!qpu_merge_inst(c->devinfo, &merged_inst,
                                            &prev_inst->inst->qpu, inst)) {
                                continue;
                        }
                }

                int prio = get_instruction_priority(c->devinfo, inst);

                if (mux_read_stalls(scoreboard, inst)) {
                        /* Don't merge an instruction that stalls */
                        if (prev_inst)
                                continue;
                        else {
                                /* Any instruction that doesn't stall will have
                                 * higher scheduling priority */
                                prio -= MAX_SCHEDULE_PRIORITY;
                                assert(prio < 0);
                        }
                }

                /* Found a valid instruction. If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        /* If we did not find any instruction to schedule but we discarded
         * some of them to prioritize ldvary pipelining, try again.
         */
        if (!chosen && !prev_inst && skipped_insts_for_ldvary_pipelining) {
                skipped_insts_for_ldvary_pipelining = false;
                ldvary_pipelining = false;
                goto retry;
        }

        if (chosen && chosen->inst->qpu.sig.ldvary) {
                scoreboard->ldvary_count++;
                /* If we are pairing an ldvary, flag it so we can fix it up for
                 * optimal pipelining of ldvary sequences.
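                 * (see fixup_pipelined_ldvary() below).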
1268 */ 1269 if (prev_inst) 1270 scoreboard->fixup_ldvary = true; 1271 } 1272 1273 return chosen; 1274} 1275 1276static void 1277update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard, 1278 enum v3d_qpu_waddr waddr, 1279 const struct v3d_device_info *devinfo) 1280{ 1281 if (v3d_qpu_magic_waddr_is_sfu(waddr)) 1282 scoreboard->last_magic_sfu_write_tick = scoreboard->tick; 1283 else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA) 1284 scoreboard->last_unifa_write_tick = scoreboard->tick; 1285} 1286 1287static void 1288update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard, 1289 const struct v3d_qpu_instr *inst) 1290{ 1291 if (v3d_qpu_instr_is_sfu(inst)) { 1292 scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr; 1293 scoreboard->last_stallable_sfu_tick = scoreboard->tick; 1294 } 1295} 1296 1297static void 1298update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, 1299 const struct v3d_qpu_instr *inst, 1300 const struct v3d_device_info *devinfo) 1301{ 1302 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) 1303 return; 1304 1305 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 1306 1307 if (inst->alu.add.op != V3D_QPU_A_NOP) { 1308 if (inst->alu.add.magic_write) { 1309 update_scoreboard_for_magic_waddr(scoreboard, 1310 inst->alu.add.waddr, 1311 devinfo); 1312 } else { 1313 update_scoreboard_for_sfu_stall_waddr(scoreboard, 1314 inst); 1315 } 1316 1317 if (inst->alu.add.op == V3D_QPU_A_SETMSF) 1318 scoreboard->last_setmsf_tick = scoreboard->tick; 1319 } 1320 1321 if (inst->alu.mul.op != V3D_QPU_M_NOP) { 1322 if (inst->alu.mul.magic_write) { 1323 update_scoreboard_for_magic_waddr(scoreboard, 1324 inst->alu.mul.waddr, 1325 devinfo); 1326 } 1327 } 1328 1329 if (inst->sig.ldvary) 1330 scoreboard->last_ldvary_tick = scoreboard->tick; 1331} 1332 1333static void 1334dump_state(const struct v3d_device_info *devinfo, struct dag *dag) 1335{ 1336 list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) { 1337 fprintf(stderr, " t=%4d: ", n->unblocked_time); 1338 v3d_qpu_dump(devinfo, &n->inst->qpu); 1339 fprintf(stderr, "\n"); 1340 1341 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 1342 struct schedule_node *child = 1343 (struct schedule_node *)edge->child; 1344 if (!child) 1345 continue; 1346 1347 fprintf(stderr, " - "); 1348 v3d_qpu_dump(devinfo, &child->inst->qpu); 1349 fprintf(stderr, " (%d parents, %c)\n", 1350 child->dag.parent_count, 1351 edge->data ? 'w' : 'r'); 1352 } 1353 } 1354} 1355 1356static uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo, 1357 enum v3d_qpu_waddr waddr, 1358 const struct v3d_qpu_instr *after) 1359{ 1360 /* Apply some huge latency between texture fetch requests and getting 1361 * their results back. 1362 * 1363 * FIXME: This is actually pretty bogus. If we do: 1364 * 1365 * mov tmu0_s, a 1366 * <a bit of math> 1367 * mov tmu0_s, b 1368 * load_tmu0 1369 * <more math> 1370 * load_tmu0 1371 * 1372 * we count that as worse than 1373 * 1374 * mov tmu0_s, a 1375 * mov tmu0_s, b 1376 * <lots of math> 1377 * load_tmu0 1378 * <more math> 1379 * load_tmu0 1380 * 1381 * because we associate the first load_tmu0 with the *second* tmu0_s. 1382 */ 1383 if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) && 1384 v3d_qpu_waits_on_tmu(after)) { 1385 return 100; 1386 } 1387 1388 /* Assume that anything depending on us is consuming the SFU result. 
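         * Magic SFU writes return their result in r4 a few cycles later (see
         * the r4 handling in mux_reads_too_soon() and v3d_qpu_writes_r4()).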
*/ 1389 if (v3d_qpu_magic_waddr_is_sfu(waddr)) 1390 return 3; 1391 1392 return 1; 1393} 1394 1395static uint32_t 1396instruction_latency(const struct v3d_device_info *devinfo, 1397 struct schedule_node *before, struct schedule_node *after) 1398{ 1399 const struct v3d_qpu_instr *before_inst = &before->inst->qpu; 1400 const struct v3d_qpu_instr *after_inst = &after->inst->qpu; 1401 uint32_t latency = 1; 1402 1403 if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU || 1404 after_inst->type != V3D_QPU_INSTR_TYPE_ALU) 1405 return latency; 1406 1407 if (before_inst->alu.add.magic_write) { 1408 latency = MAX2(latency, 1409 magic_waddr_latency(devinfo, 1410 before_inst->alu.add.waddr, 1411 after_inst)); 1412 } 1413 1414 if (before_inst->alu.mul.magic_write) { 1415 latency = MAX2(latency, 1416 magic_waddr_latency(devinfo, 1417 before_inst->alu.mul.waddr, 1418 after_inst)); 1419 } 1420 1421 if (v3d_qpu_instr_is_sfu(before_inst)) 1422 return 2; 1423 1424 return latency; 1425} 1426 1427/** Recursive computation of the delay member of a node. */ 1428static void 1429compute_delay(struct dag_node *node, void *state) 1430{ 1431 struct schedule_node *n = (struct schedule_node *)node; 1432 struct v3d_compile *c = (struct v3d_compile *) state; 1433 1434 n->delay = 1; 1435 1436 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 1437 struct schedule_node *child = 1438 (struct schedule_node *)edge->child; 1439 1440 n->delay = MAX2(n->delay, (child->delay + 1441 instruction_latency(c->devinfo, n, 1442 child))); 1443 } 1444} 1445 1446/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head() 1447 * should be called on it later to finish pruning the other edges). 1448 */ 1449static void 1450pre_remove_head(struct dag *dag, struct schedule_node *n) 1451{ 1452 list_delinit(&n->dag.link); 1453 1454 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 1455 if (edge->data) 1456 dag_remove_edge(dag, edge); 1457 } 1458} 1459 1460static void 1461mark_instruction_scheduled(const struct v3d_device_info *devinfo, 1462 struct dag *dag, 1463 uint32_t time, 1464 struct schedule_node *node) 1465{ 1466 if (!node) 1467 return; 1468 1469 util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) { 1470 struct schedule_node *child = 1471 (struct schedule_node *)edge->child; 1472 1473 if (!child) 1474 continue; 1475 1476 uint32_t latency = instruction_latency(devinfo, node, child); 1477 1478 child->unblocked_time = MAX2(child->unblocked_time, 1479 time + latency); 1480 } 1481 dag_prune_head(dag, &node->dag); 1482} 1483 1484static void 1485insert_scheduled_instruction(struct v3d_compile *c, 1486 struct qblock *block, 1487 struct choose_scoreboard *scoreboard, 1488 struct qinst *inst) 1489{ 1490 list_addtail(&inst->link, &block->instructions); 1491 1492 update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo); 1493 c->qpu_inst_count++; 1494 scoreboard->tick++; 1495} 1496 1497static struct qinst * 1498vir_nop() 1499{ 1500 struct qreg undef = vir_nop_reg(); 1501 struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef); 1502 1503 return qinst; 1504} 1505 1506static void 1507emit_nop(struct v3d_compile *c, struct qblock *block, 1508 struct choose_scoreboard *scoreboard) 1509{ 1510 insert_scheduled_instruction(c, block, scoreboard, vir_nop()); 1511} 1512 1513static bool 1514qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, 1515 const struct qinst *qinst, int slot) 1516{ 1517 const struct v3d_qpu_instr *inst = &qinst->qpu; 1518 1519 if (slot == 2 && qinst->is_tlb_z_write) 1520 
return false; 1521 1522 if (slot > 0 && qinst->uniform != ~0) 1523 return false; 1524 1525 if (v3d_qpu_waits_vpm(inst)) 1526 return false; 1527 1528 if (inst->sig.ldvary) 1529 return false; 1530 1531 if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { 1532 /* GFXH-1625: TMUWT not allowed in the final instruction. */ 1533 if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT) 1534 return false; 1535 1536 /* No writing physical registers at the end. */ 1537 if (!inst->alu.add.magic_write || 1538 !inst->alu.mul.magic_write) { 1539 return false; 1540 } 1541 1542 if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) && 1543 !inst->sig_magic) { 1544 return false; 1545 } 1546 1547 if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF) 1548 return false; 1549 1550 /* RF0-2 might be overwritten during the delay slots by 1551 * fragment shader setup. 1552 */ 1553 if (inst->raddr_a < 3 && 1554 (inst->alu.add.a == V3D_QPU_MUX_A || 1555 inst->alu.add.b == V3D_QPU_MUX_A || 1556 inst->alu.mul.a == V3D_QPU_MUX_A || 1557 inst->alu.mul.b == V3D_QPU_MUX_A)) { 1558 return false; 1559 } 1560 1561 if (inst->raddr_b < 3 && 1562 !inst->sig.small_imm && 1563 (inst->alu.add.a == V3D_QPU_MUX_B || 1564 inst->alu.add.b == V3D_QPU_MUX_B || 1565 inst->alu.mul.a == V3D_QPU_MUX_B || 1566 inst->alu.mul.b == V3D_QPU_MUX_B)) { 1567 return false; 1568 } 1569 } 1570 1571 return true; 1572} 1573 1574/** 1575 * This is called when trying to merge a thrsw back into the instruction stream 1576 * of instructions that were scheduled *before* the thrsw signal to fill its 1577 * delay slots. Because the actual execution of the thrsw happens after the 1578 * delay slots, it is usually safe to do this, but there are some cases that 1579 * need special care. 1580 */ 1581static bool 1582qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, 1583 const struct qinst *qinst, 1584 uint32_t slot) 1585{ 1586 /* No scheduling SFU when the result would land in the other 1587 * thread. The simulator complains for safety, though it 1588 * would only occur for dead code in our case. 1589 */ 1590 if (slot > 0 && 1591 qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && 1592 (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) || 1593 v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) { 1594 return false; 1595 } 1596 1597 if (slot > 0 && qinst->qpu.sig.ldvary) 1598 return false; 1599 1600 /* unifa and the following 3 instructions can't overlap a 1601 * thread switch/end. The docs further clarify that this means 1602 * the cycle at which the actual thread switch/end happens 1603 * and not when the thrsw instruction is processed, which would 1604 * be after the 2 delay slots following the thrsw instruction. 1605 * This means that we can move up a thrsw up to the instruction 1606 * right after unifa: 1607 * 1608 * unifa, r5 1609 * thrsw 1610 * delay slot 1 1611 * delay slot 2 1612 * Thread switch happens here, 4 instructions away from unifa 1613 */ 1614 if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu)) 1615 return false; 1616 1617 return true; 1618} 1619 1620/** 1621 * This is called for instructions scheduled *after* a thrsw signal that may 1622 * land in the delay slots of the thrsw. Because these instructions were 1623 * scheduled after the thrsw, we need to be careful when placing them into 1624 * the delay slots, since that means that we are moving them ahead of the 1625 * thread switch and we need to ensure that is not a problem. 
 */
static bool
qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
                                         struct choose_scoreboard *scoreboard,
                                         const struct qinst *qinst)
{
        const uint32_t slot = scoreboard->tick - scoreboard->last_thrsw_tick;
        assert(slot <= 2);

        /* We merge thrsw instructions back into the instruction stream
         * manually, so any instructions scheduled after a thrsw should be
         * in the actual delay slots and not in the same slot as the thrsw.
         */
        assert(slot >= 1);

        /* No emitting a thrsw while the previous thrsw hasn't happened yet. */
        if (qinst->qpu.sig.thrsw)
                return false;

        /* The restrictions for instructions scheduled before the thrsw
         * also apply to instructions scheduled after the thrsw that we want
         * to place in its delay slots.
         */
        if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
                return false;

        /* TLB access is disallowed until the scoreboard wait is executed,
         * which we do on the last thread switch.
         */
        if (qpu_inst_is_tlb(&qinst->qpu))
                return false;

        /* Instruction sequence restrictions: Branch is not allowed in delay
         * slots of a thrsw.
         */
        if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
                return false;

        /* Miscellaneous restrictions: At the point of a thrsw we need to have
         * at least one outstanding lookup or TSY wait.
         *
         * So avoid placing TMU instructions scheduled after the thrsw into
         * its delay slots or we may be compromising the integrity of our TMU
         * sequences. Also, notice that if we moved these instructions into
         * the delay slots of a previous thrsw we could overflow our TMU output
         * fifo, since we could be effectively pipelining a lookup scheduled
         * after the thrsw into the sequence before the thrsw.
         */
        if (v3d_qpu_writes_tmu(c->devinfo, &qinst->qpu) ||
            qinst->qpu.sig.wrtmuc) {
                return false;
        }

        /* Don't move instructions that wait on the TMU before the thread
         * switch happens, since that would make the current thread stall
         * before the switch, which is exactly what we want to avoid with the
         * thrsw instruction.
         */
        if (v3d_qpu_waits_on_tmu(&qinst->qpu))
                return false;

        /* A thread switch invalidates all accumulators, so don't place any
         * instructions that write accumulators into the delay slots.
         */
        if (v3d_qpu_writes_accum(c->devinfo, &qinst->qpu))
                return false;

        /* Multop has an implicit write to the rtop register, which is a
         * specialized accumulator that is only used with this instruction.
         */
        if (qinst->qpu.alu.mul.op == V3D_QPU_M_MULTOP)
                return false;

        /* Flags are invalidated across a thread switch, so don't place
         * instructions that write flags into delay slots.
         */
        if (v3d_qpu_writes_flags(&qinst->qpu))
                return false;

        /* TSY sync ops materialize at the point of the next thread switch,
         * therefore, if we have a TSY sync right after a thread switch, we
         * cannot place it in its delay slots, or we would be moving the sync
         * to the thrsw before it instead.
1709 */ 1710 if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID) 1711 return false; 1712 1713 return true; 1714} 1715 1716static bool 1717valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard, 1718 struct qinst *qinst, int instructions_in_sequence, 1719 bool is_thrend) 1720{ 1721 /* No emitting our thrsw while the previous thrsw hasn't happened yet. */ 1722 if (scoreboard->last_thrsw_tick + 3 > 1723 scoreboard->tick - instructions_in_sequence) { 1724 return false; 1725 } 1726 1727 for (int slot = 0; slot < instructions_in_sequence; slot++) { 1728 if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) 1729 return false; 1730 1731 if (is_thrend && 1732 !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) { 1733 return false; 1734 } 1735 1736 /* Note that the list is circular, so we can only do this up 1737 * to instructions_in_sequence. 1738 */ 1739 qinst = (struct qinst *)qinst->link.next; 1740 } 1741 1742 return true; 1743} 1744 1745/** 1746 * Emits a THRSW signal in the stream, trying to move it up to pair with 1747 * another instruction. 1748 */ 1749static int 1750emit_thrsw(struct v3d_compile *c, 1751 struct qblock *block, 1752 struct choose_scoreboard *scoreboard, 1753 struct qinst *inst, 1754 bool is_thrend) 1755{ 1756 int time = 0; 1757 1758 /* There should be nothing in a thrsw inst being scheduled other than 1759 * the signal bits. 1760 */ 1761 assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU); 1762 assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP); 1763 assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP); 1764 1765 /* Don't try to emit a thrsw in the delay slots of a previous thrsw 1766 * or branch. 1767 */ 1768 while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) { 1769 emit_nop(c, block, scoreboard); 1770 time++; 1771 } 1772 while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) { 1773 emit_nop(c, block, scoreboard); 1774 time++; 1775 } 1776 1777 /* Find how far back into previous instructions we can put the THRSW. */ 1778 int slots_filled = 0; 1779 int invalid_sig_count = 0; 1780 bool last_thrsw_after_invalid_ok = false; 1781 struct qinst *merge_inst = NULL; 1782 vir_for_each_inst_rev(prev_inst, block) { 1783 if (!valid_thrsw_sequence(c, scoreboard, 1784 prev_inst, slots_filled + 1, 1785 is_thrend)) { 1786 break; 1787 } 1788 1789 struct v3d_qpu_sig sig = prev_inst->qpu.sig; 1790 sig.thrsw = true; 1791 uint32_t packed_sig; 1792 if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) { 1793 /* If we can't merge the thrsw here because of signal 1794 * incompatibility, keep going, we might be able to 1795 * merge it in an earlier instruction. 1796 */ 1797 invalid_sig_count++; 1798 goto cont_block; 1799 } 1800 1801 /* For last thrsw we need 2 consecutive slots that are 1802 * thrsw compatible, so if we have previously jumped over 1803 * an incompatible signal, flag that we have found the first 1804 * valid slot here and keep going. 1805 */ 1806 if (inst->is_last_thrsw && invalid_sig_count > 0 && 1807 !last_thrsw_after_invalid_ok) { 1808 last_thrsw_after_invalid_ok = true; 1809 invalid_sig_count++; 1810 goto cont_block; 1811 } 1812 1813 last_thrsw_after_invalid_ok = false; 1814 invalid_sig_count = 0; 1815 merge_inst = prev_inst; 1816 1817cont_block: 1818 if (++slots_filled == 3) 1819 break; 1820 } 1821 1822 /* If we jumped over a signal incompatibility and did not manage to 1823 * merge the thrsw in the end, we need to adjust slots filled to match 1824 * the last valid merge point. 
1825 */ 1826 assert(invalid_sig_count == 0 || slots_filled >= invalid_sig_count); 1827 if (invalid_sig_count > 0) 1828 slots_filled -= invalid_sig_count; 1829 1830 bool needs_free = false; 1831 if (merge_inst) { 1832 merge_inst->qpu.sig.thrsw = true; 1833 needs_free = true; 1834 scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled; 1835 } else { 1836 scoreboard->last_thrsw_tick = scoreboard->tick; 1837 insert_scheduled_instruction(c, block, scoreboard, inst); 1838 time++; 1839 slots_filled++; 1840 merge_inst = inst; 1841 } 1842 1843 scoreboard->first_thrsw_emitted = true; 1844 1845 /* If we're emitting the last THRSW (other than program end), then 1846 * signal that to the HW by emitting two THRSWs in a row. 1847 */ 1848 if (inst->is_last_thrsw) { 1849 if (slots_filled <= 1) { 1850 emit_nop(c, block, scoreboard); 1851 time++; 1852 } 1853 struct qinst *second_inst = 1854 (struct qinst *)merge_inst->link.next; 1855 second_inst->qpu.sig.thrsw = true; 1856 scoreboard->last_thrsw_emitted = true; 1857 } 1858 1859 /* Make sure the thread end executes within the program lifespan */ 1860 if (is_thrend) { 1861 for (int i = 0; i < 3 - slots_filled; i++) { 1862 emit_nop(c, block, scoreboard); 1863 time++; 1864 } 1865 } 1866 1867 /* If we put our THRSW into another instruction, free up the 1868 * instruction that didn't end up scheduled into the list. 1869 */ 1870 if (needs_free) 1871 free(inst); 1872 1873 return time; 1874} 1875 1876static bool 1877qpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst) 1878{ 1879 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) 1880 return false; 1881 1882 if (inst->qpu.sig.thrsw) 1883 return false; 1884 1885 if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu)) 1886 return false; 1887 1888 if (vir_has_uniform(inst)) 1889 return false; 1890 1891 return true; 1892} 1893 1894static void 1895emit_branch(struct v3d_compile *c, 1896 struct qblock *block, 1897 struct choose_scoreboard *scoreboard, 1898 struct qinst *inst) 1899{ 1900 assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH); 1901 1902 /* We should've not picked up a branch for the delay slots of a previous 1903 * thrsw, branch or unifa write instruction. 1904 */ 1905 int branch_tick = scoreboard->tick; 1906 assert(scoreboard->last_thrsw_tick + 2 < branch_tick); 1907 assert(scoreboard->last_branch_tick + 3 < branch_tick); 1908 assert(scoreboard->last_unifa_write_tick + 3 < branch_tick); 1909 1910 /* Can't place a branch with msfign != 0 and cond != 0,2,3 after 1911 * setmsf. 1912 */ 1913 bool is_safe_msf_branch = 1914 inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE || 1915 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS || 1916 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 || 1917 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_NA0; 1918 assert(scoreboard->last_setmsf_tick != branch_tick - 1 || 1919 is_safe_msf_branch); 1920 1921 /* Insert the branch instruction */ 1922 insert_scheduled_instruction(c, block, scoreboard, inst); 1923 1924 /* Now see if we can move the branch instruction back into the 1925 * instruction stream to fill its delay slots 1926 */ 1927 int slots_filled = 0; 1928 while (slots_filled < 3 && block->instructions.next != &inst->link) { 1929 struct qinst *prev_inst = (struct qinst *) inst->link.prev; 1930 assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH); 1931 1932 /* Can't move the branch instruction if that would place it 1933 * in the delay slots of other instructions. 
                 */
                if (scoreboard->last_branch_tick + 3 >=
                    branch_tick - slots_filled - 1) {
                        break;
                }

                if (scoreboard->last_thrsw_tick + 2 >=
                    branch_tick - slots_filled - 1) {
                        break;
                }

                if (scoreboard->last_unifa_write_tick + 3 >=
                    branch_tick - slots_filled - 1) {
                        break;
                }

                /* Do not move up a branch if it can disrupt an ldvary sequence
                 * as that can cause stomping of the r5 register.
                 */
                if (scoreboard->last_ldvary_tick + 2 >=
                    branch_tick - slots_filled) {
                        break;
                }

                /* Can't move a conditional branch before the instruction
                 * that writes the flags for its condition.
                 */
                if (v3d_qpu_writes_flags(&prev_inst->qpu) &&
                    inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) {
                        break;
                }

                if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst))
                        break;

                if (!is_safe_msf_branch) {
                        struct qinst *prev_prev_inst =
                                (struct qinst *) prev_inst->link.prev;
                        if (prev_prev_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                            prev_prev_inst->qpu.alu.add.op == V3D_QPU_A_SETMSF) {
                                break;
                        }
                }

                list_del(&prev_inst->link);
                list_add(&prev_inst->link, &inst->link);
                slots_filled++;
        }

        block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled;
        scoreboard->last_branch_tick = branch_tick - slots_filled;

        /* Fill any remaining delay slots.
         *
         * For unconditional branches we'll try to fill these with the
         * first instructions in the successor block after scheduling
         * all blocks when setting up branch targets.
         */
        for (int i = 0; i < 3 - slots_filled; i++)
                emit_nop(c, block, scoreboard);
}

static bool
alu_reads_register(struct v3d_qpu_instr *inst,
                   bool add, bool magic, uint32_t index)
{
        uint32_t num_src;
        enum v3d_qpu_mux mux_a, mux_b;

        if (add) {
                num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
                mux_a = inst->alu.add.a;
                mux_b = inst->alu.add.b;
        } else {
                num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
                mux_a = inst->alu.mul.a;
                mux_b = inst->alu.mul.b;
        }

        for (int i = 0; i < num_src; i++) {
                if (magic) {
                        if (i == 0 && mux_a == index)
                                return true;
                        if (i == 1 && mux_b == index)
                                return true;
                } else {
                        if (i == 0 && mux_a == V3D_QPU_MUX_A &&
                            inst->raddr_a == index) {
                                return true;
                        }
                        if (i == 0 && mux_a == V3D_QPU_MUX_B &&
                            inst->raddr_b == index) {
                                return true;
                        }
                        if (i == 1 && mux_b == V3D_QPU_MUX_A &&
                            inst->raddr_a == index) {
                                return true;
                        }
                        if (i == 1 && mux_b == V3D_QPU_MUX_B &&
                            inst->raddr_b == index) {
                                return true;
                        }
                }
        }

        return false;
}

/**
 * This takes an ldvary signal merged into 'inst' and tries to move it up to
 * the previous instruction to get good pipelining of ldvary sequences,
 * transforming this:
 *
 *    nop                  ; nop               ; ldvary.r4
 *    nop                  ; fmul r0, r4, rf0  ;
 *    fadd rf13, r0, r5    ; nop;              ; ldvary.r1  <-- inst
 *
 * into:
 *
 *    nop                  ; nop               ; ldvary.r4
 *    nop                  ; fmul r0, r4, rf0  ; ldvary.r1
 *    fadd rf13, r0, r5    ; nop;              ;            <-- inst
 *
 * If we manage to do this successfully (we return true here), then flagging
 * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
 * we will be able to pick up to merge into 'inst', leading to code like this:
 *
 *    nop                  ; nop               ; ldvary.r4
 *    nop                  ; fmul r0, r4, rf0  ; ldvary.r1
 *    fadd rf13, r0, r5    ; fmul r2, r1, rf0  ;            <-- inst
 */
static bool
fixup_pipelined_ldvary(struct v3d_compile *c,
                       struct choose_scoreboard *scoreboard,
                       struct qblock *block,
                       struct v3d_qpu_instr *inst)
{
        /* We only call this if we have successfully merged an ldvary into a
         * previous instruction.
         */
        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
        assert(inst->sig.ldvary);
        uint32_t ldvary_magic = inst->sig_magic;
        uint32_t ldvary_index = inst->sig_addr;

        /* The instruction in which we merged the ldvary cannot read
         * the ldvary destination; if it does, moving the ldvary before
         * it would overwrite it.
         */
        if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
                return false;
        if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
                return false;

        /* The implicit ldvary destination may not be written to by a signal
         * in the instruction following ldvary. Since we are planning to move
         * ldvary to the previous instruction, this means we need to check if
         * the current instruction has any other signal that could create this
         * conflict. The only other signal that can write to the implicit
         * ldvary destination that is compatible with ldvary in the same
         * instruction is ldunif.
         */
        if (inst->sig.ldunif)
                return false;

        /* The previous instruction can't write to the same destination as the
         * ldvary.
         */
        struct qinst *prev = (struct qinst *) block->instructions.prev;
        if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) {
                if (prev->qpu.alu.add.magic_write == ldvary_magic &&
                    prev->qpu.alu.add.waddr == ldvary_index) {
                        return false;
                }
        }

        if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) {
                if (prev->qpu.alu.mul.magic_write == ldvary_magic &&
                    prev->qpu.alu.mul.waddr == ldvary_index) {
                        return false;
                }
        }

        /* The previous instruction cannot have a conflicting signal */
        if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
                return false;

        uint32_t sig;
        struct v3d_qpu_sig new_sig = prev->qpu.sig;
        new_sig.ldvary = true;
        if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig))
                return false;

        /* The previous instruction cannot use flags since ldvary uses the
         * 'cond' instruction field to store the destination.
         */
        if (v3d_qpu_writes_flags(&prev->qpu))
                return false;
        if (v3d_qpu_reads_flags(&prev->qpu))
                return false;

        /* We can't put an ldvary in the delay slots of a thrsw. We should've
         * prevented this when pairing up the ldvary with another instruction
         * and flagging it for a fixup.
         */
        assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);

        /* Move the ldvary to the previous instruction and remove it from the
         * current one.
         */
        prev->qpu.sig.ldvary = true;
        prev->qpu.sig_magic = ldvary_magic;
        prev->qpu.sig_addr = ldvary_index;
        scoreboard->last_ldvary_tick = scoreboard->tick - 1;

        inst->sig.ldvary = false;
        inst->sig_magic = false;
        inst->sig_addr = 0;

        /* By moving ldvary to the previous instruction we make it update
         * r5 in the current one, so nothing else in it should write r5.
         * This should've been prevented by our dependency tracking, which
         * would not allow ldvary to be paired up with an instruction that
         * writes r5 (since our dependency tracking doesn't know that the
         * ldvary write to r5 happens in the next instruction).
         */
        assert(!v3d_qpu_writes_r5(c->devinfo, inst));

        return true;
}

static uint32_t
schedule_instructions(struct v3d_compile *c,
                      struct choose_scoreboard *scoreboard,
                      struct qblock *block,
                      enum quniform_contents *orig_uniform_contents,
                      uint32_t *orig_uniform_data,
                      uint32_t *next_uniform)
{
        const struct v3d_device_info *devinfo = c->devinfo;
        uint32_t time = 0;

        while (!list_is_empty(&scoreboard->dag->heads)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(c, scoreboard, NULL);
                struct schedule_node *merge = NULL;

                /* If there are no valid instructions to schedule, drop a NOP
                 * in.
                 */
                struct qinst *qinst = chosen ? chosen->inst : vir_nop();
                struct v3d_qpu_instr *inst = &qinst->qpu;

                if (debug) {
                        fprintf(stderr, "t=%4d: current list:\n",
                                time);
                        dump_state(devinfo, scoreboard->dag);
                        fprintf(stderr, "t=%4d: chose: ", time);
                        v3d_qpu_dump(devinfo, inst);
                        fprintf(stderr, "\n");
                }

                /* We can't mark_instruction_scheduled() the chosen inst until
                 * we're done identifying instructions to merge, so put the
                 * merged instructions on a list for a moment.
                 */
                struct list_head merged_list;
                list_inithead(&merged_list);

                /* Schedule this instruction onto the QPU list. Also try to
                 * find an instruction to pair with it.
                 */
                if (chosen) {
                        time = MAX2(chosen->unblocked_time, time);
                        pre_remove_head(scoreboard->dag, chosen);

                        while ((merge =
                                choose_instruction_to_schedule(c, scoreboard,
                                                               chosen))) {
                                time = MAX2(merge->unblocked_time, time);
                                pre_remove_head(scoreboard->dag, merge);
                                list_addtail(&merge->link, &merged_list);
                                (void)qpu_merge_inst(devinfo, inst,
                                                     inst, &merge->inst->qpu);
                                if (merge->inst->uniform != -1) {
                                        chosen->inst->uniform =
                                                merge->inst->uniform;
                                }

                                if (debug) {
                                        fprintf(stderr, "t=%4d: merging: ",
                                                time);
                                        v3d_qpu_dump(devinfo, &merge->inst->qpu);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, " result: ");
                                        v3d_qpu_dump(devinfo, inst);
                                        fprintf(stderr, "\n");
                                }

                                if (scoreboard->fixup_ldvary) {
                                        scoreboard->fixup_ldvary = false;
                                        if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
                                                /* Flag the ldvary as scheduled
                                                 * now so we can try to merge the
                                                 * follow-up instruction in the
                                                 * ldvary sequence into the
                                                 * current instruction.
                                                 */
                                                mark_instruction_scheduled(
                                                        devinfo, scoreboard->dag,
                                                        time, merge);
                                        }
                                }
                        }
                        if (mux_read_stalls(scoreboard, inst))
                                c->qpu_inst_stalled_count++;
                }

                /* Update the uniform index for the rewritten location --
                 * branch target updating will still need to change
                 * c->uniform_data[] using this index.
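                 *
                 * The uniform stream is consumed in instruction order, so as
                 * instructions are reordered the stream has to be rebuilt to
                 * match: *next_uniform is the next slot in the rebuilt
                 * c->uniform_data[]/c->uniform_contents[] arrays.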
                 */
                if (qinst->uniform != -1) {
                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                                block->branch_uniform = *next_uniform;

                        c->uniform_data[*next_uniform] =
                                orig_uniform_data[qinst->uniform];
                        c->uniform_contents[*next_uniform] =
                                orig_uniform_contents[qinst->uniform];
                        qinst->uniform = *next_uniform;
                        (*next_uniform)++;
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready to
                 * be scheduled. Update the children's unblocked time for this
                 * DAG edge as we do so.
                 */
                mark_instruction_scheduled(devinfo, scoreboard->dag, time, chosen);
                list_for_each_entry(struct schedule_node, merge, &merged_list,
                                    link) {
                        mark_instruction_scheduled(devinfo, scoreboard->dag, time, merge);

                        /* The merged VIR instruction doesn't get re-added to the
                         * block, so free it now.
                         */
                        free(merge->inst);
                }

                if (inst->sig.thrsw) {
                        time += emit_thrsw(c, block, scoreboard, qinst, false);
                } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                        emit_branch(c, block, scoreboard, qinst);
                } else {
                        insert_scheduled_instruction(c, block,
                                                     scoreboard, qinst);
                }
        }

        return time;
}

static uint32_t
qpu_schedule_instructions_block(struct v3d_compile *c,
                                struct choose_scoreboard *scoreboard,
                                struct qblock *block,
                                enum quniform_contents *orig_uniform_contents,
                                uint32_t *orig_uniform_data,
                                uint32_t *next_uniform)
{
        void *mem_ctx = ralloc_context(NULL);
        scoreboard->dag = dag_create(mem_ctx);
        struct list_head setup_list;

        list_inithead(&setup_list);

        /* Wrap each instruction in a scheduler structure. */
        while (!list_is_empty(&block->instructions)) {
                struct qinst *qinst = (struct qinst *)block->instructions.next;
                struct schedule_node *n =
                        rzalloc(mem_ctx, struct schedule_node);

                dag_init_node(scoreboard->dag, &n->dag);
                n->inst = qinst;

                list_del(&qinst->link);
                list_addtail(&n->link, &setup_list);
        }

        calculate_forward_deps(c, scoreboard->dag, &setup_list);
        calculate_reverse_deps(c, scoreboard->dag, &setup_list);

        dag_traverse_bottom_up(scoreboard->dag, compute_delay, c);

        uint32_t cycles = schedule_instructions(c, scoreboard, block,
                                                orig_uniform_contents,
                                                orig_uniform_data,
                                                next_uniform);

        ralloc_free(mem_ctx);
        scoreboard->dag = NULL;

        return cycles;
}

static void
qpu_set_branch_targets(struct v3d_compile *c)
{
        vir_for_each_block(block, c) {
                /* The end block of the program has no branch. */
                if (!block->successors[0])
                        continue;

                /* If there was no branch instruction, then the successor
                 * block must follow immediately after this one.
                 */
                if (block->branch_qpu_ip == ~0) {
                        assert(block->end_qpu_ip + 1 ==
                               block->successors[0]->start_qpu_ip);
                        continue;
                }

                /* Walk back through the delay slots to find the branch
                 * instr.
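                 *
                 * A scheduled branch is always followed by three delay slot
                 * instructions (see emit_branch), so it should be at most
                 * three instructions from the end of the block. We also note
                 * which of those slots still hold nops so they can be filled
                 * from the successor block further below.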
                 */
                struct qinst *branch = NULL;
                struct list_head *entry = block->instructions.prev;
                int32_t delay_slot_count = -1;
                struct qinst *delay_slots_start = NULL;
                for (int i = 0; i < 3; i++) {
                        entry = entry->prev;
                        struct qinst *inst =
                                container_of(entry, struct qinst, link);

                        if (delay_slot_count == -1) {
                                if (!v3d_qpu_is_nop(&inst->qpu))
                                        delay_slot_count = i;
                                else
                                        delay_slots_start = inst;
                        }

                        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
                                branch = inst;
                                break;
                        }
                }
                assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
                assert(delay_slot_count >= 0 && delay_slot_count <= 3);
                assert(delay_slot_count == 0 || delay_slots_start != NULL);

                /* Make sure that the if-we-don't-jump
                 * successor was scheduled just after the
                 * delay slots.
                 */
                assert(!block->successors[1] ||
                       block->successors[1]->start_qpu_ip ==
                       block->branch_qpu_ip + 4);

                branch->qpu.branch.offset =
                        ((block->successors[0]->start_qpu_ip -
                          (block->branch_qpu_ip + 4)) *
                         sizeof(uint64_t));

                /* Set up the relative offset to jump in the
                 * uniform stream.
                 *
                 * Use a temporary here, because
                 * uniform_data[inst->uniform] may be shared
                 * between multiple instructions.
                 */
                assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
                c->uniform_data[branch->uniform] =
                        (block->successors[0]->start_uniform -
                         (block->branch_uniform + 1)) * 4;

                /* If this is an unconditional branch, try to fill any remaining
                 * delay slots with the initial instructions of the successor
                 * block.
                 *
                 * FIXME: we can do the same for conditional branches if we
                 * predicate the instructions to match the branch condition.
                 */
                if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) {
                        struct list_head *successor_insts =
                                &block->successors[0]->instructions;
                        delay_slot_count = MIN2(delay_slot_count,
                                                list_length(successor_insts));
                        struct qinst *s_inst =
                                (struct qinst *) successor_insts->next;
                        struct qinst *slot = delay_slots_start;
                        int slots_filled = 0;
                        while (slots_filled < delay_slot_count &&
                               qpu_inst_valid_in_branch_delay_slot(c, s_inst)) {
                                memcpy(&slot->qpu, &s_inst->qpu,
                                       sizeof(slot->qpu));
                                s_inst = (struct qinst *) s_inst->link.next;
                                slot = (struct qinst *) slot->link.next;
                                slots_filled++;
                        }
                        branch->qpu.branch.offset +=
                                slots_filled * sizeof(uint64_t);
                }
        }
}

uint32_t
v3d_qpu_schedule_instructions(struct v3d_compile *c)
{
        const struct v3d_device_info *devinfo = c->devinfo;
        struct qblock *end_block = list_last_entry(&c->blocks,
                                                   struct qblock, link);

        /* We reorder the uniforms as we schedule instructions, so save the
         * old data off and replace it.
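         *
         * As each block is scheduled, schedule_instructions() copies the
         * uniforms it references from these saved arrays back into
         * c->uniform_data[]/c->uniform_contents[] in the new instruction
         * order.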
         */
        uint32_t *uniform_data = c->uniform_data;
        enum quniform_contents *uniform_contents = c->uniform_contents;
        c->uniform_contents = ralloc_array(c, enum quniform_contents,
                                           c->num_uniforms);
        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
        c->uniform_array_size = c->num_uniforms;
        uint32_t next_uniform = 0;

        struct choose_scoreboard scoreboard;
        memset(&scoreboard, 0, sizeof(scoreboard));
        scoreboard.last_ldvary_tick = -10;
        scoreboard.last_unifa_write_tick = -10;
        scoreboard.last_magic_sfu_write_tick = -10;
        scoreboard.last_uniforms_reset_tick = -10;
        scoreboard.last_thrsw_tick = -10;
        scoreboard.last_branch_tick = -10;
        scoreboard.last_setmsf_tick = -10;
        scoreboard.last_stallable_sfu_tick = -10;

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                vir_for_each_block(block, c) {
                        fprintf(stderr, "BLOCK %d\n", block->index);
                        list_for_each_entry(struct qinst, qinst,
                                            &block->instructions, link) {
                                v3d_qpu_dump(devinfo, &qinst->qpu);
                                fprintf(stderr, "\n");
                        }
                }
                fprintf(stderr, "\n");
        }

        uint32_t cycles = 0;
        vir_for_each_block(block, c) {
                block->start_qpu_ip = c->qpu_inst_count;
                block->branch_qpu_ip = ~0;
                block->start_uniform = next_uniform;

                cycles += qpu_schedule_instructions_block(c,
                                                          &scoreboard,
                                                          block,
                                                          uniform_contents,
                                                          uniform_data,
                                                          &next_uniform);

                block->end_qpu_ip = c->qpu_inst_count - 1;
        }

        /* Emit the program-end THRSW instruction. */
        struct qinst *thrsw = vir_nop();
        thrsw->qpu.sig.thrsw = true;
        emit_thrsw(c, end_block, &scoreboard, thrsw, true);

        qpu_set_branch_targets(c);

        assert(next_uniform == c->num_uniforms);

        return cycles;
}