1/* 2 * Copyright © 2010 Intel Corporation 3 * Copyright © 2014-2015 Broadcom 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22 * IN THE SOFTWARE. 23 */ 24 25/** 26 * @file vc4_qir_schedule.c 27 * 28 * The basic model of the list scheduler is to take a basic block, compute a 29 * DAG of the dependencies from the bottom up, and make a list of the DAG 30 * heads. Heuristically pick a DAG head and schedule (remove) it, then put 31 * all the parents that are now DAG heads into the list of things to 32 * schedule. 33 * 34 * The goal of scheduling here, before register allocation and conversion to 35 * QPU instructions, is to reduce register pressure by reordering instructions 36 * to consume values when possible. 37 */ 38 39#include "vc4_qir.h" 40#include "util/dag.h" 41 42static bool debug; 43 44struct schedule_node { 45 struct dag_node dag; 46 struct list_head link; 47 struct qinst *inst; 48 49 /* Length of the longest (latency) chain from a DAG head to the this 50 * instruction. 51 */ 52 uint32_t delay; 53 54 /* Longest time + latency_between(parent, this) of any parent of this 55 * node. 56 */ 57 uint32_t unblocked_time; 58}; 59 60struct schedule_state { 61 struct dag *dag; 62 63 uint32_t time; 64 65 uint32_t *temp_writes; 66 67 BITSET_WORD *temp_live; 68}; 69 70/* When walking the instructions in reverse, we need to swap before/after in 71 * add_dep(). 72 */ 73enum direction { F, R }; 74 75/** 76 * Marks a dependency between two intructions, that \p after must appear after 77 * \p before. 78 * 79 * Our dependencies are tracked as a DAG. Since we're scheduling bottom-up, 80 * the latest instructions with nothing left to schedule are the DAG heads, 81 * and their inputs are their children. 82 */ 83static void 84add_dep(enum direction dir, 85 struct schedule_node *before, 86 struct schedule_node *after) 87{ 88 if (!before || !after) 89 return; 90 91 assert(before != after); 92 93 if (dir == R) { 94 struct schedule_node *t = before; 95 before = after; 96 after = t; 97 } 98 99 dag_add_edge(&after->dag, &before->dag, 0); 100} 101 102static void 103add_write_dep(enum direction dir, 104 struct schedule_node **before, 105 struct schedule_node *after) 106{ 107 add_dep(dir, *before, after); 108 *before = after; 109} 110 111struct schedule_setup_state { 112 struct schedule_node **last_temp_write; 113 struct schedule_node *last_sf; 114 struct schedule_node *last_vary_read; 115 struct schedule_node *last_vpm_read; 116 struct schedule_node *last_vpm_write; 117 struct schedule_node *last_tex_coord; 118 struct schedule_node *last_tex_result; 119 struct schedule_node *last_tlb; 120 struct schedule_node *last_uniforms_reset; 121 enum direction dir; 122 123 /** 124 * Texture FIFO tracking. This is done top-to-bottom, and is used to 125 * track the QOP_TEX_RESULTs and add dependencies on previous ones 126 * when trying to submit texture coords with TFREQ full or new texture 127 * fetches with TXRCV full. 128 */ 129 struct { 130 struct schedule_node *node; 131 int coords; 132 } tex_fifo[8]; 133 int tfreq_count; /**< Number of texture coords outstanding. */ 134 int tfrcv_count; /**< Number of texture results outstanding. */ 135 int tex_fifo_pos; 136}; 137 138static void 139block_until_tex_result(struct schedule_setup_state *state, struct schedule_node *n) 140{ 141 add_dep(state->dir, state->tex_fifo[0].node, n); 142 143 state->tfreq_count -= state->tex_fifo[0].coords; 144 state->tfrcv_count--; 145 146 memmove(&state->tex_fifo[0], 147 &state->tex_fifo[1], 148 state->tex_fifo_pos * sizeof(state->tex_fifo[0])); 149 state->tex_fifo_pos--; 150} 151 152/** 153 * Common code for dependencies that need to be tracked both forward and 154 * backward. 155 * 156 * This is for things like "all VPM reads have to happen in order." 157 */ 158static void 159calculate_deps(struct schedule_setup_state *state, struct schedule_node *n) 160{ 161 struct qinst *inst = n->inst; 162 enum direction dir = state->dir; 163 164 165 /* Add deps for temp registers and varyings accesses. Note that we 166 * ignore uniforms accesses, because qir_reorder_uniforms() happens 167 * after this. 168 */ 169 for (int i = 0; i < qir_get_nsrc(inst); i++) { 170 switch (inst->src[i].file) { 171 case QFILE_TEMP: 172 add_dep(dir, 173 state->last_temp_write[inst->src[i].index], n); 174 break; 175 176 case QFILE_VARY: 177 add_write_dep(dir, &state->last_vary_read, n); 178 break; 179 180 case QFILE_VPM: 181 add_write_dep(dir, &state->last_vpm_read, n); 182 break; 183 184 default: 185 break; 186 } 187 } 188 189 switch (inst->op) { 190 case QOP_VARY_ADD_C: 191 add_dep(dir, state->last_vary_read, n); 192 break; 193 194 case QOP_TEX_RESULT: 195 /* Results have to be fetched in order. */ 196 add_write_dep(dir, &state->last_tex_result, n); 197 break; 198 199 case QOP_THRSW: 200 /* After a new THRSW, one must collect all texture samples 201 * queued since the previous THRSW/program start. For now, we 202 * have one THRSW in between each texture setup and its 203 * results collection as our input, and we just make sure that 204 * that ordering is maintained. 205 */ 206 add_write_dep(dir, &state->last_tex_coord, n); 207 add_write_dep(dir, &state->last_tex_result, n); 208 209 /* accumulators and flags are lost across thread switches. */ 210 add_write_dep(dir, &state->last_sf, n); 211 212 /* Setup, like the varyings, will need to be drained before we 213 * thread switch. 214 */ 215 add_write_dep(dir, &state->last_vary_read, n); 216 217 /* The TLB-locking operations have to stay after the last 218 * thread switch. 219 */ 220 add_write_dep(dir, &state->last_tlb, n); 221 break; 222 223 case QOP_TLB_COLOR_READ: 224 case QOP_MS_MASK: 225 add_write_dep(dir, &state->last_tlb, n); 226 break; 227 228 default: 229 break; 230 } 231 232 switch (inst->dst.file) { 233 case QFILE_VPM: 234 add_write_dep(dir, &state->last_vpm_write, n); 235 break; 236 237 case QFILE_TEMP: 238 add_write_dep(dir, &state->last_temp_write[inst->dst.index], n); 239 break; 240 241 case QFILE_TLB_COLOR_WRITE: 242 case QFILE_TLB_COLOR_WRITE_MS: 243 case QFILE_TLB_Z_WRITE: 244 case QFILE_TLB_STENCIL_SETUP: 245 add_write_dep(dir, &state->last_tlb, n); 246 break; 247 248 case QFILE_TEX_S_DIRECT: 249 case QFILE_TEX_S: 250 case QFILE_TEX_T: 251 case QFILE_TEX_R: 252 case QFILE_TEX_B: 253 /* Texturing setup gets scheduled in order, because 254 * the uniforms referenced by them have to land in a 255 * specific order. 256 */ 257 add_write_dep(dir, &state->last_tex_coord, n); 258 break; 259 260 default: 261 break; 262 } 263 264 if (qir_depends_on_flags(inst)) 265 add_dep(dir, state->last_sf, n); 266 267 if (inst->sf) 268 add_write_dep(dir, &state->last_sf, n); 269} 270 271static void 272calculate_forward_deps(struct vc4_compile *c, void *mem_ctx, 273 struct list_head *schedule_list) 274{ 275 struct schedule_setup_state state; 276 277 memset(&state, 0, sizeof(state)); 278 state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *, 279 c->num_temps); 280 state.dir = F; 281 282 list_for_each_entry(struct schedule_node, n, schedule_list, link) { 283 struct qinst *inst = n->inst; 284 285 calculate_deps(&state, n); 286 287 for (int i = 0; i < qir_get_nsrc(inst); i++) { 288 switch (inst->src[i].file) { 289 case QFILE_UNIF: 290 add_dep(state.dir, state.last_uniforms_reset, n); 291 break; 292 default: 293 break; 294 } 295 } 296 297 switch (inst->dst.file) { 298 case QFILE_TEX_S_DIRECT: 299 case QFILE_TEX_S: 300 case QFILE_TEX_T: 301 case QFILE_TEX_R: 302 case QFILE_TEX_B: 303 /* From the VC4 spec: 304 * 305 * "The TFREQ input FIFO holds two full lots of s, 306 * t, r, b data, plus associated setup data, per 307 * QPU, that is, there are eight data slots. For 308 * each texture request, slots are only consumed 309 * for the components of s, t, r, and b actually 310 * written. Thus the FIFO can hold four requests 311 * of just (s, t) data, or eight requests of just 312 * s data (for direct addressed data lookups). 313 * 314 * Note that there is one FIFO per QPU, and the 315 * FIFO has no concept of threads - that is, 316 * multi-threaded shaders must be careful to use 317 * only 1/2 the FIFO depth before reading 318 * back. Multi-threaded programs must also 319 * therefore always thread switch on texture 320 * fetch as the other thread may have data 321 * waiting in the FIFO." 322 * 323 * If the texture coordinate fifo is full, block this 324 * on the last QOP_TEX_RESULT. 325 */ 326 if (state.tfreq_count == (c->fs_threaded ? 4 : 8)) { 327 block_until_tex_result(&state, n); 328 } 329 330 /* From the VC4 spec: 331 * 332 * "Since the maximum number of texture requests 333 * in the input (TFREQ) FIFO is four lots of (s, 334 * t) data, the output (TFRCV) FIFO is sized to 335 * holds four lots of max-size color data per 336 * QPU. For non-float color, reads are packed 337 * RGBA8888 data (one read per pixel). For 16-bit 338 * float color, two reads are necessary per 339 * pixel, with reads packed as RG1616 then 340 * BA1616. So per QPU there are eight color slots 341 * in the TFRCV FIFO." 342 * 343 * If the texture result fifo is full, block adding 344 * any more to it until the last QOP_TEX_RESULT. 345 */ 346 if (inst->dst.file == QFILE_TEX_S || 347 inst->dst.file == QFILE_TEX_S_DIRECT) { 348 if (state.tfrcv_count == 349 (c->fs_threaded ? 2 : 4)) 350 block_until_tex_result(&state, n); 351 state.tfrcv_count++; 352 } 353 354 state.tex_fifo[state.tex_fifo_pos].coords++; 355 state.tfreq_count++; 356 break; 357 358 default: 359 break; 360 } 361 362 switch (inst->op) { 363 case QOP_TEX_RESULT: 364 /* Results have to be fetched after the 365 * coordinate setup. Note that we're assuming 366 * here that our input shader has the texture 367 * coord setup and result fetch in order, 368 * which is true initially but not of our 369 * instruction stream after this pass. 370 */ 371 add_dep(state.dir, state.last_tex_coord, n); 372 373 state.tex_fifo[state.tex_fifo_pos].node = n; 374 375 state.tex_fifo_pos++; 376 memset(&state.tex_fifo[state.tex_fifo_pos], 0, 377 sizeof(state.tex_fifo[0])); 378 break; 379 380 case QOP_UNIFORMS_RESET: 381 add_write_dep(state.dir, &state.last_uniforms_reset, n); 382 break; 383 384 default: 385 break; 386 } 387 } 388} 389 390static void 391calculate_reverse_deps(struct vc4_compile *c, void *mem_ctx, 392 struct list_head *schedule_list) 393{ 394 struct schedule_setup_state state; 395 396 memset(&state, 0, sizeof(state)); 397 state.dir = R; 398 state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *, 399 c->num_temps); 400 401 list_for_each_entry_rev(struct schedule_node, n, schedule_list, link) { 402 calculate_deps(&state, n); 403 } 404} 405 406static int 407get_register_pressure_cost(struct schedule_state *state, struct qinst *inst) 408{ 409 int cost = 0; 410 411 if (inst->dst.file == QFILE_TEMP && 412 state->temp_writes[inst->dst.index] == 1) 413 cost--; 414 415 for (int i = 0; i < qir_get_nsrc(inst); i++) { 416 if (inst->src[i].file != QFILE_TEMP || 417 BITSET_TEST(state->temp_live, inst->src[i].index)) { 418 continue; 419 } 420 421 bool already_counted = false; 422 for (int j = 0; j < i; j++) { 423 if (inst->src[i].file == inst->src[j].file && 424 inst->src[i].index == inst->src[j].index) { 425 already_counted = true; 426 } 427 } 428 if (!already_counted) 429 cost++; 430 } 431 432 return cost; 433} 434 435static bool 436locks_scoreboard(struct qinst *inst) 437{ 438 if (inst->op == QOP_TLB_COLOR_READ) 439 return true; 440 441 switch (inst->dst.file) { 442 case QFILE_TLB_Z_WRITE: 443 case QFILE_TLB_COLOR_WRITE: 444 case QFILE_TLB_COLOR_WRITE_MS: 445 return true; 446 default: 447 return false; 448 } 449} 450 451static struct schedule_node * 452choose_instruction(struct schedule_state *state) 453{ 454 struct schedule_node *chosen = NULL; 455 456 list_for_each_entry(struct schedule_node, n, &state->dag->heads, 457 dag.link) { 458 /* The branches aren't being tracked as dependencies. Make 459 * sure that they stay scheduled as the last instruction of 460 * the block, which is to say the first one we choose to 461 * schedule. 462 */ 463 if (n->inst->op == QOP_BRANCH) 464 return n; 465 466 if (!chosen) { 467 chosen = n; 468 continue; 469 } 470 471 /* Prefer scheduling things that lock the scoreboard, so that 472 * they appear late in the program and we get more parallelism 473 * between shaders on multiple QPUs hitting the same fragment. 474 */ 475 if (locks_scoreboard(n->inst) && 476 !locks_scoreboard(chosen->inst)) { 477 chosen = n; 478 continue; 479 } else if (!locks_scoreboard(n->inst) && 480 locks_scoreboard(chosen->inst)) { 481 continue; 482 } 483 484 /* If we would block on the previously chosen node, but would 485 * block less on this one, then prefer it. 486 */ 487 if (chosen->unblocked_time > state->time && 488 n->unblocked_time < chosen->unblocked_time) { 489 chosen = n; 490 continue; 491 } else if (n->unblocked_time > state->time && 492 n->unblocked_time > chosen->unblocked_time) { 493 continue; 494 } 495 496 /* If we can definitely reduce register pressure, do so 497 * immediately. 498 */ 499 int register_pressure_cost = 500 get_register_pressure_cost(state, n->inst); 501 int chosen_register_pressure_cost = 502 get_register_pressure_cost(state, chosen->inst); 503 504 if (register_pressure_cost < chosen_register_pressure_cost) { 505 chosen = n; 506 continue; 507 } else if (register_pressure_cost > 508 chosen_register_pressure_cost) { 509 continue; 510 } 511 512 /* Otherwise, prefer instructions with the deepest chain to 513 * the end of the program. This avoids the problem of 514 * "everything generates a temp, nothing finishes freeing one, 515 * guess I'll just keep emitting varying mul/adds". 516 */ 517 if (n->delay > chosen->delay) { 518 chosen = n; 519 continue; 520 } else if (n->delay < chosen->delay) { 521 continue; 522 } 523 } 524 525 return chosen; 526} 527 528static void 529dump_state(struct vc4_compile *c, struct schedule_state *state) 530{ 531 uint32_t i = 0; 532 list_for_each_entry(struct schedule_node, n, &state->dag->heads, 533 dag.link) { 534 fprintf(stderr, "%3d: ", i++); 535 qir_dump_inst(c, n->inst); 536 fprintf(stderr, " (%d cost)\n", 537 get_register_pressure_cost(state, n->inst)); 538 539 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 540 struct schedule_node *child = 541 (struct schedule_node *)edge->child; 542 fprintf(stderr, " - "); 543 qir_dump_inst(c, child->inst); 544 fprintf(stderr, " (%d parents)\n", 545 child->dag.parent_count); 546 } 547 } 548} 549 550/* Estimate of how many instructions we should schedule between operations. 551 * 552 * These aren't in real cycle counts, because we're just estimating cycle 553 * times anyway. QIR instructions will get paired up when turned into QPU 554 * instructions, or extra NOP delays will have to be added due to register 555 * allocation choices. 556 */ 557static uint32_t 558latency_between(struct schedule_node *before, struct schedule_node *after) 559{ 560 if ((before->inst->dst.file == QFILE_TEX_S || 561 before->inst->dst.file == QFILE_TEX_S_DIRECT) && 562 after->inst->op == QOP_TEX_RESULT) 563 return 100; 564 565 switch (before->inst->op) { 566 case QOP_RCP: 567 case QOP_RSQ: 568 case QOP_EXP2: 569 case QOP_LOG2: 570 for (int i = 0; i < qir_get_nsrc(after->inst); i++) { 571 if (after->inst->src[i].file == 572 before->inst->dst.file && 573 after->inst->src[i].index == 574 before->inst->dst.index) { 575 /* There are two QPU delay slots before we can 576 * read a math result, which could be up to 4 577 * QIR instructions if they packed well. 578 */ 579 return 4; 580 } 581 } 582 break; 583 default: 584 break; 585 } 586 587 return 1; 588} 589 590/** Recursive computation of the delay member of a node. */ 591static void 592compute_delay(struct dag_node *node, void *state) 593{ 594 struct schedule_node *n = (struct schedule_node *)node; 595 596 /* The color read needs to be scheduled late, to avoid locking the 597 * scoreboard early. This is our best tool for encouraging that. The 598 * other scoreboard locking ops will have this happen by default, 599 * since they are generally the DAG heads or close to them. 600 */ 601 if (n->inst->op == QOP_TLB_COLOR_READ) 602 n->delay = 1000; 603 else 604 n->delay = 1; 605 606 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 607 struct schedule_node *child = 608 (struct schedule_node *)edge->child; 609 610 n->delay = MAX2(n->delay, (child->delay + 611 latency_between(child, n))); 612 } 613} 614 615static void 616schedule_instructions(struct vc4_compile *c, 617 struct qblock *block, struct schedule_state *state) 618{ 619 if (debug) { 620 fprintf(stderr, "initial deps:\n"); 621 dump_state(c, state); 622 } 623 624 state->time = 0; 625 while (!list_is_empty(&state->dag->heads)) { 626 struct schedule_node *chosen = choose_instruction(state); 627 struct qinst *inst = chosen->inst; 628 629 if (debug) { 630 fprintf(stderr, "current list:\n"); 631 dump_state(c, state); 632 fprintf(stderr, "chose: "); 633 qir_dump_inst(c, inst); 634 fprintf(stderr, " (%d cost)\n", 635 get_register_pressure_cost(state, inst)); 636 } 637 638 state->time = MAX2(state->time, chosen->unblocked_time); 639 640 /* Schedule this instruction back onto the QIR list. */ 641 list_add(&inst->link, &block->instructions); 642 643 /* Now that we've scheduled a new instruction, some of its 644 * children can be promoted to the list of instructions ready to 645 * be scheduled. Update the children's unblocked time for this 646 * DAG edge as we do so. 647 */ 648 util_dynarray_foreach(&chosen->dag.edges, 649 struct dag_edge, edge) { 650 struct schedule_node *child = 651 (struct schedule_node *)edge->child; 652 653 child->unblocked_time = MAX2(child->unblocked_time, 654 state->time + 655 latency_between(child, 656 chosen)); 657 } 658 dag_prune_head(state->dag, &chosen->dag); 659 660 /* Update our tracking of register pressure. */ 661 for (int i = 0; i < qir_get_nsrc(inst); i++) { 662 if (inst->src[i].file == QFILE_TEMP) 663 BITSET_SET(state->temp_live, inst->src[i].index); 664 } 665 if (inst->dst.file == QFILE_TEMP) { 666 state->temp_writes[inst->dst.index]--; 667 if (state->temp_writes[inst->dst.index] == 0) 668 BITSET_CLEAR(state->temp_live, inst->dst.index); 669 } 670 671 state->time++; 672 } 673} 674 675static void 676qir_schedule_instructions_block(struct vc4_compile *c, 677 struct qblock *block) 678{ 679 struct schedule_state *state = rzalloc(NULL, struct schedule_state); 680 681 state->temp_writes = rzalloc_array(state, uint32_t, c->num_temps); 682 state->temp_live = rzalloc_array(state, BITSET_WORD, 683 BITSET_WORDS(c->num_temps)); 684 state->dag = dag_create(state); 685 686 struct list_head setup_list; 687 list_inithead(&setup_list); 688 689 /* Wrap each instruction in a scheduler structure. */ 690 qir_for_each_inst_safe(inst, block) { 691 struct schedule_node *n = rzalloc(state, struct schedule_node); 692 693 n->inst = inst; 694 list_del(&inst->link); 695 list_addtail(&n->link, &setup_list); 696 dag_init_node(state->dag, &n->dag); 697 698 if (inst->dst.file == QFILE_TEMP) 699 state->temp_writes[inst->dst.index]++; 700 } 701 702 /* Dependencies tracked top-to-bottom. */ 703 calculate_forward_deps(c, state, &setup_list); 704 /* Dependencies tracked bottom-to-top. */ 705 calculate_reverse_deps(c, state, &setup_list); 706 707 dag_traverse_bottom_up(state->dag, compute_delay, NULL); 708 709 schedule_instructions(c, block, state); 710 711 ralloc_free(state); 712} 713 714void 715qir_schedule_instructions(struct vc4_compile *c) 716{ 717 718 if (debug) { 719 fprintf(stderr, "Pre-schedule instructions\n"); 720 qir_dump(c); 721 } 722 723 qir_for_each_block(block, c) 724 qir_schedule_instructions_block(c, block); 725 726 if (debug) { 727 fprintf(stderr, "Post-schedule instructions\n"); 728 qir_dump(c); 729 } 730} 731