1/* 2 * Copyright © 2010 Intel Corporation 3 * Copyright © 2014 Broadcom 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22 * IN THE SOFTWARE. 23 */ 24 25/** 26 * @file vc4_qpu_schedule.c 27 * 28 * The basic model of the list scheduler is to take a basic block, compute a 29 * DAG of the dependencies, and make a list of the DAG heads. Heuristically 30 * pick a DAG head, then put all the children that are now DAG heads into the 31 * list of things to schedule. 32 * 33 * The goal of scheduling here is to pack pairs of operations together in a 34 * single QPU instruction. 35 */ 36 37#include "vc4_qir.h" 38#include "vc4_qpu.h" 39#include "util/ralloc.h" 40#include "util/dag.h" 41 42static bool debug; 43 44struct schedule_node_child; 45 46struct schedule_node { 47 struct dag_node dag; 48 struct list_head link; 49 struct queued_qpu_inst *inst; 50 51 /* Longest cycles + instruction_latency() of any parent of this node. */ 52 uint32_t unblocked_time; 53 54 /** 55 * Minimum number of cycles from scheduling this instruction until the 56 * end of the program, based on the slowest dependency chain through 57 * the children. 58 */ 59 uint32_t delay; 60 61 /** 62 * cycles between this instruction being scheduled and when its result 63 * can be consumed. 64 */ 65 uint32_t latency; 66 67 /** 68 * Which uniform from uniform_data[] this instruction read, or -1 if 69 * not reading a uniform. 70 */ 71 int uniform; 72}; 73 74/* When walking the instructions in reverse, we need to swap before/after in 75 * add_dep(). 76 */ 77enum direction { F, R }; 78 79struct schedule_state { 80 struct dag *dag; 81 struct schedule_node *last_r[6]; 82 struct schedule_node *last_ra[32]; 83 struct schedule_node *last_rb[32]; 84 struct schedule_node *last_sf; 85 struct schedule_node *last_vpm_read; 86 struct schedule_node *last_tmu_write; 87 struct schedule_node *last_tlb; 88 struct schedule_node *last_vpm; 89 struct schedule_node *last_uniforms_reset; 90 enum direction dir; 91 /* Estimated cycle when the current instruction would start. */ 92 uint32_t time; 93}; 94 95static void 96add_dep(struct schedule_state *state, 97 struct schedule_node *before, 98 struct schedule_node *after, 99 bool write) 100{ 101 bool write_after_read = !write && state->dir == R; 102 uintptr_t edge_data = write_after_read; 103 104 if (!before || !after) 105 return; 106 107 assert(before != after); 108 109 if (state->dir == F) 110 dag_add_edge(&before->dag, &after->dag, edge_data); 111 else 112 dag_add_edge(&after->dag, &before->dag, edge_data); 113} 114 115static void 116add_read_dep(struct schedule_state *state, 117 struct schedule_node *before, 118 struct schedule_node *after) 119{ 120 add_dep(state, before, after, false); 121} 122 123static void 124add_write_dep(struct schedule_state *state, 125 struct schedule_node **before, 126 struct schedule_node *after) 127{ 128 add_dep(state, *before, after, true); 129 *before = after; 130} 131 132static bool 133qpu_writes_r4(uint64_t inst) 134{ 135 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); 136 137 switch(sig) { 138 case QPU_SIG_COLOR_LOAD: 139 case QPU_SIG_LOAD_TMU0: 140 case QPU_SIG_LOAD_TMU1: 141 case QPU_SIG_ALPHA_MASK_LOAD: 142 return true; 143 default: 144 return false; 145 } 146} 147 148static void 149process_raddr_deps(struct schedule_state *state, struct schedule_node *n, 150 uint32_t raddr, bool is_a) 151{ 152 switch (raddr) { 153 case QPU_R_VARY: 154 add_write_dep(state, &state->last_r[5], n); 155 break; 156 157 case QPU_R_VPM: 158 add_write_dep(state, &state->last_vpm_read, n); 159 break; 160 161 case QPU_R_UNIF: 162 add_read_dep(state, state->last_uniforms_reset, n); 163 break; 164 165 case QPU_R_NOP: 166 case QPU_R_ELEM_QPU: 167 case QPU_R_XY_PIXEL_COORD: 168 case QPU_R_MS_REV_FLAGS: 169 break; 170 171 default: 172 if (raddr < 32) { 173 if (is_a) 174 add_read_dep(state, state->last_ra[raddr], n); 175 else 176 add_read_dep(state, state->last_rb[raddr], n); 177 } else { 178 fprintf(stderr, "unknown raddr %d\n", raddr); 179 abort(); 180 } 181 break; 182 } 183} 184 185static bool 186is_tmu_write(uint32_t waddr) 187{ 188 switch (waddr) { 189 case QPU_W_TMU0_S: 190 case QPU_W_TMU0_T: 191 case QPU_W_TMU0_R: 192 case QPU_W_TMU0_B: 193 case QPU_W_TMU1_S: 194 case QPU_W_TMU1_T: 195 case QPU_W_TMU1_R: 196 case QPU_W_TMU1_B: 197 return true; 198 default: 199 return false; 200 } 201} 202 203static bool 204reads_uniform(uint64_t inst) 205{ 206 if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM) 207 return false; 208 209 return (QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_UNIF || 210 (QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF && 211 QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM) || 212 is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_ADD)) || 213 is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_MUL))); 214} 215 216static void 217process_mux_deps(struct schedule_state *state, struct schedule_node *n, 218 uint32_t mux) 219{ 220 if (mux != QPU_MUX_A && mux != QPU_MUX_B) 221 add_read_dep(state, state->last_r[mux], n); 222} 223 224 225static void 226process_waddr_deps(struct schedule_state *state, struct schedule_node *n, 227 uint32_t waddr, bool is_add) 228{ 229 uint64_t inst = n->inst->inst; 230 bool is_a = is_add ^ ((inst & QPU_WS) != 0); 231 232 if (waddr < 32) { 233 if (is_a) { 234 add_write_dep(state, &state->last_ra[waddr], n); 235 } else { 236 add_write_dep(state, &state->last_rb[waddr], n); 237 } 238 } else if (is_tmu_write(waddr)) { 239 add_write_dep(state, &state->last_tmu_write, n); 240 add_read_dep(state, state->last_uniforms_reset, n); 241 } else if (qpu_waddr_is_tlb(waddr) || 242 waddr == QPU_W_MS_FLAGS) { 243 add_write_dep(state, &state->last_tlb, n); 244 } else { 245 switch (waddr) { 246 case QPU_W_ACC0: 247 case QPU_W_ACC1: 248 case QPU_W_ACC2: 249 case QPU_W_ACC3: 250 case QPU_W_ACC5: 251 add_write_dep(state, &state->last_r[waddr - QPU_W_ACC0], 252 n); 253 break; 254 255 case QPU_W_VPM: 256 add_write_dep(state, &state->last_vpm, n); 257 break; 258 259 case QPU_W_VPMVCD_SETUP: 260 if (is_a) 261 add_write_dep(state, &state->last_vpm_read, n); 262 else 263 add_write_dep(state, &state->last_vpm, n); 264 break; 265 266 case QPU_W_SFU_RECIP: 267 case QPU_W_SFU_RECIPSQRT: 268 case QPU_W_SFU_EXP: 269 case QPU_W_SFU_LOG: 270 add_write_dep(state, &state->last_r[4], n); 271 break; 272 273 case QPU_W_TLB_STENCIL_SETUP: 274 /* This isn't a TLB operation that does things like 275 * implicitly lock the scoreboard, but it does have to 276 * appear before TLB_Z, and each of the TLB_STENCILs 277 * have to schedule in the same order relative to each 278 * other. 279 */ 280 add_write_dep(state, &state->last_tlb, n); 281 break; 282 283 case QPU_W_MS_FLAGS: 284 add_write_dep(state, &state->last_tlb, n); 285 break; 286 287 case QPU_W_UNIFORMS_ADDRESS: 288 add_write_dep(state, &state->last_uniforms_reset, n); 289 break; 290 291 case QPU_W_NOP: 292 break; 293 294 default: 295 fprintf(stderr, "Unknown waddr %d\n", waddr); 296 abort(); 297 } 298 } 299} 300 301static void 302process_cond_deps(struct schedule_state *state, struct schedule_node *n, 303 uint32_t cond) 304{ 305 switch (cond) { 306 case QPU_COND_NEVER: 307 case QPU_COND_ALWAYS: 308 break; 309 default: 310 add_read_dep(state, state->last_sf, n); 311 break; 312 } 313} 314 315/** 316 * Common code for dependencies that need to be tracked both forward and 317 * backward. 318 * 319 * This is for things like "all reads of r4 have to happen between the r4 320 * writes that surround them". 321 */ 322static void 323calculate_deps(struct schedule_state *state, struct schedule_node *n) 324{ 325 uint64_t inst = n->inst->inst; 326 uint32_t add_op = QPU_GET_FIELD(inst, QPU_OP_ADD); 327 uint32_t mul_op = QPU_GET_FIELD(inst, QPU_OP_MUL); 328 uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); 329 uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); 330 uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A); 331 uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B); 332 uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A); 333 uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B); 334 uint32_t mul_a = QPU_GET_FIELD(inst, QPU_MUL_A); 335 uint32_t mul_b = QPU_GET_FIELD(inst, QPU_MUL_B); 336 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); 337 338 if (sig != QPU_SIG_LOAD_IMM) { 339 process_raddr_deps(state, n, raddr_a, true); 340 if (sig != QPU_SIG_SMALL_IMM && 341 sig != QPU_SIG_BRANCH) 342 process_raddr_deps(state, n, raddr_b, false); 343 } 344 345 if (add_op != QPU_A_NOP) { 346 process_mux_deps(state, n, add_a); 347 process_mux_deps(state, n, add_b); 348 } 349 if (mul_op != QPU_M_NOP) { 350 process_mux_deps(state, n, mul_a); 351 process_mux_deps(state, n, mul_b); 352 } 353 354 process_waddr_deps(state, n, waddr_add, true); 355 process_waddr_deps(state, n, waddr_mul, false); 356 if (qpu_writes_r4(inst)) 357 add_write_dep(state, &state->last_r[4], n); 358 359 switch (sig) { 360 case QPU_SIG_SW_BREAKPOINT: 361 case QPU_SIG_NONE: 362 case QPU_SIG_SMALL_IMM: 363 case QPU_SIG_LOAD_IMM: 364 break; 365 366 case QPU_SIG_THREAD_SWITCH: 367 case QPU_SIG_LAST_THREAD_SWITCH: 368 /* All accumulator contents and flags are undefined after the 369 * switch. 370 */ 371 for (int i = 0; i < ARRAY_SIZE(state->last_r); i++) 372 add_write_dep(state, &state->last_r[i], n); 373 add_write_dep(state, &state->last_sf, n); 374 375 /* Scoreboard-locking operations have to stay after the last 376 * thread switch. 377 */ 378 add_write_dep(state, &state->last_tlb, n); 379 380 add_write_dep(state, &state->last_tmu_write, n); 381 break; 382 383 case QPU_SIG_LOAD_TMU0: 384 case QPU_SIG_LOAD_TMU1: 385 /* TMU loads are coming from a FIFO, so ordering is important. 386 */ 387 add_write_dep(state, &state->last_tmu_write, n); 388 break; 389 390 case QPU_SIG_COLOR_LOAD: 391 add_read_dep(state, state->last_tlb, n); 392 break; 393 394 case QPU_SIG_BRANCH: 395 add_read_dep(state, state->last_sf, n); 396 break; 397 398 case QPU_SIG_PROG_END: 399 case QPU_SIG_WAIT_FOR_SCOREBOARD: 400 case QPU_SIG_SCOREBOARD_UNLOCK: 401 case QPU_SIG_COVERAGE_LOAD: 402 case QPU_SIG_COLOR_LOAD_END: 403 case QPU_SIG_ALPHA_MASK_LOAD: 404 fprintf(stderr, "Unhandled signal bits %d\n", sig); 405 abort(); 406 } 407 408 process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD)); 409 process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL)); 410 if ((inst & QPU_SF) && sig != QPU_SIG_BRANCH) 411 add_write_dep(state, &state->last_sf, n); 412} 413 414static void 415calculate_forward_deps(struct vc4_compile *c, struct dag *dag, 416 struct list_head *schedule_list) 417{ 418 struct schedule_state state; 419 420 memset(&state, 0, sizeof(state)); 421 state.dag = dag; 422 state.dir = F; 423 424 list_for_each_entry(struct schedule_node, node, schedule_list, link) 425 calculate_deps(&state, node); 426} 427 428static void 429calculate_reverse_deps(struct vc4_compile *c, struct dag *dag, 430 struct list_head *schedule_list) 431{ 432 struct schedule_state state; 433 434 memset(&state, 0, sizeof(state)); 435 state.dag = dag; 436 state.dir = R; 437 438 list_for_each_entry_rev(struct schedule_node, node, schedule_list, 439 link) { 440 calculate_deps(&state, (struct schedule_node *)node); 441 } 442} 443 444struct choose_scoreboard { 445 struct dag *dag; 446 int tick; 447 int last_sfu_write_tick; 448 int last_uniforms_reset_tick; 449 uint32_t last_waddr_a, last_waddr_b; 450 bool tlb_locked; 451}; 452 453static bool 454reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst) 455{ 456 uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A); 457 uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B); 458 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); 459 460 /* Full immediate loads don't read any registers. */ 461 if (sig == QPU_SIG_LOAD_IMM) 462 return false; 463 464 uint32_t src_muxes[] = { 465 QPU_GET_FIELD(inst, QPU_ADD_A), 466 QPU_GET_FIELD(inst, QPU_ADD_B), 467 QPU_GET_FIELD(inst, QPU_MUL_A), 468 QPU_GET_FIELD(inst, QPU_MUL_B), 469 }; 470 for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) { 471 if ((src_muxes[i] == QPU_MUX_A && 472 raddr_a < 32 && 473 scoreboard->last_waddr_a == raddr_a) || 474 (src_muxes[i] == QPU_MUX_B && 475 sig != QPU_SIG_SMALL_IMM && 476 raddr_b < 32 && 477 scoreboard->last_waddr_b == raddr_b)) { 478 return true; 479 } 480 481 if (src_muxes[i] == QPU_MUX_R4) { 482 if (scoreboard->tick - 483 scoreboard->last_sfu_write_tick <= 2) { 484 return true; 485 } 486 } 487 } 488 489 if (sig == QPU_SIG_SMALL_IMM && 490 QPU_GET_FIELD(inst, QPU_SMALL_IMM) >= QPU_SMALL_IMM_MUL_ROT) { 491 uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A); 492 uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B); 493 494 if (scoreboard->last_waddr_a == mux_a + QPU_W_ACC0 || 495 scoreboard->last_waddr_a == mux_b + QPU_W_ACC0 || 496 scoreboard->last_waddr_b == mux_a + QPU_W_ACC0 || 497 scoreboard->last_waddr_b == mux_b + QPU_W_ACC0) { 498 return true; 499 } 500 } 501 502 if (reads_uniform(inst) && 503 scoreboard->tick - scoreboard->last_uniforms_reset_tick <= 2) { 504 return true; 505 } 506 507 return false; 508} 509 510static bool 511pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, uint64_t inst) 512{ 513 return (scoreboard->tick < 2 && qpu_inst_is_tlb(inst)); 514} 515 516static int 517get_instruction_priority(uint64_t inst) 518{ 519 uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); 520 uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); 521 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); 522 uint32_t baseline_score; 523 uint32_t next_score = 0; 524 525 /* Schedule TLB operations as late as possible, to get more 526 * parallelism between shaders. 527 */ 528 if (qpu_inst_is_tlb(inst)) 529 return next_score; 530 next_score++; 531 532 /* Schedule texture read results collection late to hide latency. */ 533 if (sig == QPU_SIG_LOAD_TMU0 || sig == QPU_SIG_LOAD_TMU1) 534 return next_score; 535 next_score++; 536 537 /* Default score for things that aren't otherwise special. */ 538 baseline_score = next_score; 539 next_score++; 540 541 /* Schedule texture read setup early to hide their latency better. */ 542 if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul)) 543 return next_score; 544 next_score++; 545 546 return baseline_score; 547} 548 549static struct schedule_node * 550choose_instruction_to_schedule(struct choose_scoreboard *scoreboard, 551 struct list_head *schedule_list, 552 struct schedule_node *prev_inst) 553{ 554 struct schedule_node *chosen = NULL; 555 int chosen_prio = 0; 556 557 /* Don't pair up anything with a thread switch signal -- emit_thrsw() 558 * will handle pairing it along with filling the delay slots. 559 */ 560 if (prev_inst) { 561 uint32_t prev_sig = QPU_GET_FIELD(prev_inst->inst->inst, 562 QPU_SIG); 563 if (prev_sig == QPU_SIG_THREAD_SWITCH || 564 prev_sig == QPU_SIG_LAST_THREAD_SWITCH) { 565 return NULL; 566 } 567 } 568 569 list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads, 570 dag.link) { 571 uint64_t inst = n->inst->inst; 572 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); 573 574 /* Don't choose the branch instruction until it's the last one 575 * left. XXX: We could potentially choose it before it's the 576 * last one, if the remaining instructions fit in the delay 577 * slots. 578 */ 579 if (sig == QPU_SIG_BRANCH && 580 !list_is_singular(&scoreboard->dag->heads)) { 581 continue; 582 } 583 584 /* "An instruction must not read from a location in physical 585 * regfile A or B that was written to by the previous 586 * instruction." 587 */ 588 if (reads_too_soon_after_write(scoreboard, inst)) 589 continue; 590 591 /* "A scoreboard wait must not occur in the first two 592 * instructions of a fragment shader. This is either the 593 * explicit Wait for Scoreboard signal or an implicit wait 594 * with the first tile-buffer read or write instruction." 595 */ 596 if (pixel_scoreboard_too_soon(scoreboard, inst)) 597 continue; 598 599 /* If we're trying to pair with another instruction, check 600 * that they're compatible. 601 */ 602 if (prev_inst) { 603 /* Don't pair up a thread switch signal -- we'll 604 * handle pairing it when we pick it on its own. 605 */ 606 if (sig == QPU_SIG_THREAD_SWITCH || 607 sig == QPU_SIG_LAST_THREAD_SWITCH) { 608 continue; 609 } 610 611 if (prev_inst->uniform != -1 && n->uniform != -1) 612 continue; 613 614 /* Don't merge in something that will lock the TLB. 615 * Hopefully what we have in inst will release some 616 * other instructions, allowing us to delay the 617 * TLB-locking instruction until later. 618 */ 619 if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst)) 620 continue; 621 622 inst = qpu_merge_inst(prev_inst->inst->inst, inst); 623 if (!inst) 624 continue; 625 } 626 627 int prio = get_instruction_priority(inst); 628 629 /* Found a valid instruction. If nothing better comes along, 630 * this one works. 631 */ 632 if (!chosen) { 633 chosen = n; 634 chosen_prio = prio; 635 continue; 636 } 637 638 if (prio > chosen_prio) { 639 chosen = n; 640 chosen_prio = prio; 641 } else if (prio < chosen_prio) { 642 continue; 643 } 644 645 if (n->delay > chosen->delay) { 646 chosen = n; 647 chosen_prio = prio; 648 } else if (n->delay < chosen->delay) { 649 continue; 650 } 651 } 652 653 return chosen; 654} 655 656static void 657update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, 658 uint64_t inst) 659{ 660 uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); 661 uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); 662 663 if (!(inst & QPU_WS)) { 664 scoreboard->last_waddr_a = waddr_add; 665 scoreboard->last_waddr_b = waddr_mul; 666 } else { 667 scoreboard->last_waddr_b = waddr_add; 668 scoreboard->last_waddr_a = waddr_mul; 669 } 670 671 if ((waddr_add >= QPU_W_SFU_RECIP && waddr_add <= QPU_W_SFU_LOG) || 672 (waddr_mul >= QPU_W_SFU_RECIP && waddr_mul <= QPU_W_SFU_LOG)) { 673 scoreboard->last_sfu_write_tick = scoreboard->tick; 674 } 675 676 if (waddr_add == QPU_W_UNIFORMS_ADDRESS || 677 waddr_mul == QPU_W_UNIFORMS_ADDRESS) { 678 scoreboard->last_uniforms_reset_tick = scoreboard->tick; 679 } 680 681 if (qpu_inst_is_tlb(inst)) 682 scoreboard->tlb_locked = true; 683} 684 685static void 686dump_state(struct dag *dag) 687{ 688 list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) { 689 fprintf(stderr, " t=%4d: ", n->unblocked_time); 690 vc4_qpu_disasm(&n->inst->inst, 1); 691 fprintf(stderr, "\n"); 692 693 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 694 struct schedule_node *child = 695 (struct schedule_node *)edge->child; 696 if (!child) 697 continue; 698 699 fprintf(stderr, " - "); 700 vc4_qpu_disasm(&child->inst->inst, 1); 701 fprintf(stderr, " (%d parents, %c)\n", 702 child->dag.parent_count, 703 edge->data ? 'w' : 'r'); 704 } 705 } 706} 707 708static uint32_t waddr_latency(uint32_t waddr, uint64_t after) 709{ 710 if (waddr < 32) 711 return 2; 712 713 /* Apply some huge latency between texture fetch requests and getting 714 * their results back. 715 * 716 * FIXME: This is actually pretty bogus. If we do: 717 * 718 * mov tmu0_s, a 719 * <a bit of math> 720 * mov tmu0_s, b 721 * load_tmu0 722 * <more math> 723 * load_tmu0 724 * 725 * we count that as worse than 726 * 727 * mov tmu0_s, a 728 * mov tmu0_s, b 729 * <lots of math> 730 * load_tmu0 731 * <more math> 732 * load_tmu0 733 * 734 * because we associate the first load_tmu0 with the *second* tmu0_s. 735 */ 736 if (waddr == QPU_W_TMU0_S) { 737 if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0) 738 return 100; 739 } 740 if (waddr == QPU_W_TMU1_S) { 741 if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU1) 742 return 100; 743 } 744 745 switch(waddr) { 746 case QPU_W_SFU_RECIP: 747 case QPU_W_SFU_RECIPSQRT: 748 case QPU_W_SFU_EXP: 749 case QPU_W_SFU_LOG: 750 return 3; 751 default: 752 return 1; 753 } 754} 755 756static uint32_t 757instruction_latency(struct schedule_node *before, struct schedule_node *after) 758{ 759 uint64_t before_inst = before->inst->inst; 760 uint64_t after_inst = after->inst->inst; 761 762 return MAX2(waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_ADD), 763 after_inst), 764 waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_MUL), 765 after_inst)); 766} 767 768/** Recursive computation of the delay member of a node. */ 769static void 770compute_delay(struct dag_node *node, void *state) 771{ 772 struct schedule_node *n = (struct schedule_node *)node; 773 774 n->delay = 1; 775 776 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 777 struct schedule_node *child = 778 (struct schedule_node *)edge->child; 779 n->delay = MAX2(n->delay, (child->delay + 780 instruction_latency(n, child))); 781 } 782} 783 784/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head() 785 * should be called on it later to finish pruning the other edges). 786 */ 787static void 788pre_remove_head(struct dag *dag, struct schedule_node *n) 789{ 790 list_delinit(&n->dag.link); 791 792 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 793 if (edge->data) 794 dag_remove_edge(dag, edge); 795 } 796} 797 798static void 799mark_instruction_scheduled(struct dag *dag, 800 uint32_t time, 801 struct schedule_node *node) 802{ 803 if (!node) 804 return; 805 806 util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) { 807 struct schedule_node *child = 808 (struct schedule_node *)edge->child; 809 810 if (!child) 811 continue; 812 813 uint32_t latency = instruction_latency(node, child); 814 815 child->unblocked_time = MAX2(child->unblocked_time, 816 time + latency); 817 } 818 dag_prune_head(dag, &node->dag); 819} 820 821/** 822 * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair 823 * with another instruction. 824 */ 825static void 826emit_thrsw(struct vc4_compile *c, 827 struct choose_scoreboard *scoreboard, 828 uint64_t inst) 829{ 830 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); 831 832 /* There should be nothing in a thrsw inst being scheduled other than 833 * the signal bits. 834 */ 835 assert(QPU_GET_FIELD(inst, QPU_OP_ADD) == QPU_A_NOP); 836 assert(QPU_GET_FIELD(inst, QPU_OP_MUL) == QPU_M_NOP); 837 838 /* Try to find an earlier scheduled instruction that we can merge the 839 * thrsw into. 840 */ 841 int thrsw_ip = c->qpu_inst_count; 842 for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) { 843 uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i]; 844 uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG); 845 846 if (prev_sig == QPU_SIG_NONE) 847 thrsw_ip = c->qpu_inst_count - i; 848 } 849 850 if (thrsw_ip != c->qpu_inst_count) { 851 /* Merge the thrsw into the existing instruction. */ 852 c->qpu_insts[thrsw_ip] = 853 QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG); 854 } else { 855 qpu_serialize_one_inst(c, inst); 856 update_scoreboard_for_chosen(scoreboard, inst); 857 } 858 859 /* Fill the delay slots. */ 860 while (c->qpu_inst_count < thrsw_ip + 3) { 861 update_scoreboard_for_chosen(scoreboard, qpu_NOP()); 862 qpu_serialize_one_inst(c, qpu_NOP()); 863 } 864} 865 866static uint32_t 867schedule_instructions(struct vc4_compile *c, 868 struct choose_scoreboard *scoreboard, 869 struct qblock *block, 870 struct list_head *schedule_list, 871 enum quniform_contents *orig_uniform_contents, 872 uint32_t *orig_uniform_data, 873 uint32_t *next_uniform) 874{ 875 uint32_t time = 0; 876 877 while (!list_is_empty(&scoreboard->dag->heads)) { 878 struct schedule_node *chosen = 879 choose_instruction_to_schedule(scoreboard, 880 schedule_list, 881 NULL); 882 struct schedule_node *merge = NULL; 883 884 /* If there are no valid instructions to schedule, drop a NOP 885 * in. 886 */ 887 uint64_t inst = chosen ? chosen->inst->inst : qpu_NOP(); 888 889 if (debug) { 890 fprintf(stderr, "t=%4d: current list:\n", 891 time); 892 dump_state(scoreboard->dag); 893 fprintf(stderr, "t=%4d: chose: ", time); 894 vc4_qpu_disasm(&inst, 1); 895 fprintf(stderr, "\n"); 896 } 897 898 /* Schedule this instruction onto the QPU list. Also try to 899 * find an instruction to pair with it. 900 */ 901 if (chosen) { 902 time = MAX2(chosen->unblocked_time, time); 903 pre_remove_head(scoreboard->dag, chosen); 904 if (chosen->uniform != -1) { 905 c->uniform_data[*next_uniform] = 906 orig_uniform_data[chosen->uniform]; 907 c->uniform_contents[*next_uniform] = 908 orig_uniform_contents[chosen->uniform]; 909 (*next_uniform)++; 910 } 911 912 merge = choose_instruction_to_schedule(scoreboard, 913 schedule_list, 914 chosen); 915 if (merge) { 916 time = MAX2(merge->unblocked_time, time); 917 inst = qpu_merge_inst(inst, merge->inst->inst); 918 assert(inst != 0); 919 if (merge->uniform != -1) { 920 c->uniform_data[*next_uniform] = 921 orig_uniform_data[merge->uniform]; 922 c->uniform_contents[*next_uniform] = 923 orig_uniform_contents[merge->uniform]; 924 (*next_uniform)++; 925 } 926 927 if (debug) { 928 fprintf(stderr, "t=%4d: merging: ", 929 time); 930 vc4_qpu_disasm(&merge->inst->inst, 1); 931 fprintf(stderr, "\n"); 932 fprintf(stderr, " resulting in: "); 933 vc4_qpu_disasm(&inst, 1); 934 fprintf(stderr, "\n"); 935 } 936 } 937 } 938 939 if (debug) { 940 fprintf(stderr, "\n"); 941 } 942 943 /* Now that we've scheduled a new instruction, some of its 944 * children can be promoted to the list of instructions ready to 945 * be scheduled. Update the children's unblocked time for this 946 * DAG edge as we do so. 947 */ 948 mark_instruction_scheduled(scoreboard->dag, time, chosen); 949 mark_instruction_scheduled(scoreboard->dag, time, merge); 950 951 if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH || 952 QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) { 953 emit_thrsw(c, scoreboard, inst); 954 } else { 955 qpu_serialize_one_inst(c, inst); 956 update_scoreboard_for_chosen(scoreboard, inst); 957 } 958 959 scoreboard->tick++; 960 time++; 961 962 if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) { 963 block->branch_qpu_ip = c->qpu_inst_count - 1; 964 /* Fill the delay slots. 965 * 966 * We should fill these with actual instructions, 967 * instead, but that will probably need to be done 968 * after this, once we know what the leading 969 * instructions of the successors are (so we can 970 * handle A/B register file write latency) 971 */ 972 inst = qpu_NOP(); 973 update_scoreboard_for_chosen(scoreboard, inst); 974 qpu_serialize_one_inst(c, inst); 975 qpu_serialize_one_inst(c, inst); 976 qpu_serialize_one_inst(c, inst); 977 } 978 } 979 980 return time; 981} 982 983static uint32_t 984qpu_schedule_instructions_block(struct vc4_compile *c, 985 struct choose_scoreboard *scoreboard, 986 struct qblock *block, 987 enum quniform_contents *orig_uniform_contents, 988 uint32_t *orig_uniform_data, 989 uint32_t *next_uniform) 990{ 991 scoreboard->dag = dag_create(NULL); 992 struct list_head setup_list; 993 994 list_inithead(&setup_list); 995 996 /* Wrap each instruction in a scheduler structure. */ 997 uint32_t next_sched_uniform = *next_uniform; 998 while (!list_is_empty(&block->qpu_inst_list)) { 999 struct queued_qpu_inst *inst = 1000 (struct queued_qpu_inst *)block->qpu_inst_list.next; 1001 struct schedule_node *n = rzalloc(scoreboard->dag, 1002 struct schedule_node); 1003 1004 dag_init_node(scoreboard->dag, &n->dag); 1005 n->inst = inst; 1006 1007 if (reads_uniform(inst->inst)) { 1008 n->uniform = next_sched_uniform++; 1009 } else { 1010 n->uniform = -1; 1011 } 1012 list_del(&inst->link); 1013 list_addtail(&n->link, &setup_list); 1014 } 1015 1016 calculate_forward_deps(c, scoreboard->dag, &setup_list); 1017 calculate_reverse_deps(c, scoreboard->dag, &setup_list); 1018 1019 dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL); 1020 1021 uint32_t cycles = schedule_instructions(c, scoreboard, block, 1022 &setup_list, 1023 orig_uniform_contents, 1024 orig_uniform_data, 1025 next_uniform); 1026 1027 ralloc_free(scoreboard->dag); 1028 scoreboard->dag = NULL; 1029 1030 return cycles; 1031} 1032 1033static void 1034qpu_set_branch_targets(struct vc4_compile *c) 1035{ 1036 qir_for_each_block(block, c) { 1037 /* The end block of the program has no branch. */ 1038 if (!block->successors[0]) 1039 continue; 1040 1041 /* If there was no branch instruction, then the successor 1042 * block must follow immediately after this one. 1043 */ 1044 if (block->branch_qpu_ip == ~0) { 1045 assert(block->end_qpu_ip + 1 == 1046 block->successors[0]->start_qpu_ip); 1047 continue; 1048 } 1049 1050 /* Set the branch target for the block that doesn't follow 1051 * immediately after ours. 1052 */ 1053 uint64_t *branch_inst = &c->qpu_insts[block->branch_qpu_ip]; 1054 assert(QPU_GET_FIELD(*branch_inst, QPU_SIG) == QPU_SIG_BRANCH); 1055 assert(QPU_GET_FIELD(*branch_inst, QPU_BRANCH_TARGET) == 0); 1056 1057 uint32_t branch_target = 1058 (block->successors[0]->start_qpu_ip - 1059 (block->branch_qpu_ip + 4)) * sizeof(uint64_t); 1060 *branch_inst = (*branch_inst | 1061 QPU_SET_FIELD(branch_target, QPU_BRANCH_TARGET)); 1062 1063 /* Make sure that the if-we-don't-jump successor was scheduled 1064 * just after the delay slots. 1065 */ 1066 if (block->successors[1]) { 1067 assert(block->successors[1]->start_qpu_ip == 1068 block->branch_qpu_ip + 4); 1069 } 1070 } 1071} 1072 1073uint32_t 1074qpu_schedule_instructions(struct vc4_compile *c) 1075{ 1076 /* We reorder the uniforms as we schedule instructions, so save the 1077 * old data off and replace it. 1078 */ 1079 uint32_t *uniform_data = c->uniform_data; 1080 enum quniform_contents *uniform_contents = c->uniform_contents; 1081 c->uniform_contents = ralloc_array(c, enum quniform_contents, 1082 c->num_uniforms); 1083 c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms); 1084 c->uniform_array_size = c->num_uniforms; 1085 uint32_t next_uniform = 0; 1086 1087 struct choose_scoreboard scoreboard; 1088 memset(&scoreboard, 0, sizeof(scoreboard)); 1089 scoreboard.last_waddr_a = ~0; 1090 scoreboard.last_waddr_b = ~0; 1091 scoreboard.last_sfu_write_tick = -10; 1092 scoreboard.last_uniforms_reset_tick = -10; 1093 1094 if (debug) { 1095 fprintf(stderr, "Pre-schedule instructions\n"); 1096 qir_for_each_block(block, c) { 1097 fprintf(stderr, "BLOCK %d\n", block->index); 1098 list_for_each_entry(struct queued_qpu_inst, q, 1099 &block->qpu_inst_list, link) { 1100 vc4_qpu_disasm(&q->inst, 1); 1101 fprintf(stderr, "\n"); 1102 } 1103 } 1104 fprintf(stderr, "\n"); 1105 } 1106 1107 uint32_t cycles = 0; 1108 qir_for_each_block(block, c) { 1109 block->start_qpu_ip = c->qpu_inst_count; 1110 block->branch_qpu_ip = ~0; 1111 1112 cycles += qpu_schedule_instructions_block(c, 1113 &scoreboard, 1114 block, 1115 uniform_contents, 1116 uniform_data, 1117 &next_uniform); 1118 1119 block->end_qpu_ip = c->qpu_inst_count - 1; 1120 } 1121 1122 qpu_set_branch_targets(c); 1123 1124 assert(next_uniform == c->num_uniforms); 1125 1126 if (debug) { 1127 fprintf(stderr, "Post-schedule instructions\n"); 1128 vc4_qpu_disasm(c->qpu_insts, c->qpu_inst_count); 1129 fprintf(stderr, "\n"); 1130 } 1131 1132 return cycles; 1133} 1134