/*
 * Copyright (C) 2022 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "va_compiler.h"
#include "valhall_enums.h"
#include "bi_builder.h"

/*
 * Insert flow control into a scheduled and register allocated shader. This
 * pass runs after scheduling and register allocation. It only inserts NOPs
 * with the appropriate flow control modifiers. It should be followed by a
 * cleanup pass that merges flow control modifiers onto adjacent instructions,
 * eliminating the NOPs. This decouples optimization from correctness,
 * simplifying both passes.
 *
 * This pass is responsible for calculating dependencies according to the
 * following rules:
 *
 * 1. An instruction that depends on the results of a previous asynchronous
 *    instruction must first wait for that instruction's slot, unless all
 *    reaching code paths already depended on it.
 * 2. More generally, all dependencies must be encoded. This includes
 *    Write-After-Write and Write-After-Read hazards with LOAD/STORE to memory.
 * 3. The shader must wait on slot #6 before running BLEND, ATEST.
 * 4. The shader must wait on slot #7 before running BLEND, ST_TILE.
 * 5. BARRIER must wait on every active slot.
 *
 * Unlike Bifrost, it is not necessary to worry about outbound staging
 * registers, as the hardware stalls reading staging registers when issuing
 * asynchronous instructions. So we don't track reads in our model of the
 * hardware scoreboard. This makes things a bit simpler.
 *
 * We may reuse slots for multiple asynchronous instructions, though there may
 * be a performance penalty.
 */

#define BI_NUM_REGISTERS 64
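
/*
 * Illustrative example (hypothetical instruction sequence, not literal output
 * of this pass): suppose a varying load is issued on slot #1 and a later ALU
 * instruction reads its staging destination. Rule 1 requires a wait on slot
 * #1 before the read, which this pass expresses as a NOP carrying the wait:
 *
 *    LD_VAR.slot1 @r2:r3, ...
 *    ...
 *    NOP.wait1
 *    FADD.f32 r4, r2, r5
 *
 * The follow-up cleanup pass may then fold the .wait1 modifier onto the FADD
 * itself and delete the NOP.
 */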

/*
 * Insert a NOP instruction with given flow control.
 */
static void
bi_flow(bi_context *ctx, bi_cursor cursor, enum va_flow flow)
{
   bi_builder b = bi_init_builder(ctx, cursor);

   bi_nop(&b)->flow = flow;
}

static uint64_t
bi_read_mask(bi_instr *I)
{
   uint64_t mask = 0;

   bi_foreach_src(I, s) {
      if (I->src[s].type == BI_INDEX_REGISTER) {
         unsigned reg = I->src[s].value;
         unsigned count = bi_count_read_registers(I, s);

         mask |= (BITFIELD64_MASK(count) << reg);
      }
   }

   return mask;
}

static uint64_t
bi_write_mask(bi_instr *I)
{
   uint64_t mask = 0;

   bi_foreach_dest(I, d) {
      if (bi_is_null(I->dest[d])) continue;

      assert(I->dest[d].type == BI_INDEX_REGISTER);

      unsigned reg = I->dest[d].value;
      unsigned count = bi_count_write_registers(I, d);

      mask |= (BITFIELD64_MASK(count) << reg);
   }

   return mask;
}

static bool
bi_ld_vary_writes_hidden_register(const bi_instr *I)
{
   /* Only varying loads can write the hidden register */
   if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_VARYING)
      return false;

   /* They only write in some update modes */
   return (I->update == BI_UPDATE_STORE) || (I->update == BI_UPDATE_CLOBBER);
}

static bool
bi_is_memory_access(const bi_instr *I)
{
   /* On the attribute unit but functionally a general memory load */
   if (I->op == BI_OPCODE_LD_ATTR_TEX)
      return true;

   /* UBOs are read-only so there are no ordering constraints */
   if (I->seg == BI_SEG_UBO)
      return false;

   switch (bi_opcode_props[I->op].message) {
   case BIFROST_MESSAGE_LOAD:
   case BIFROST_MESSAGE_STORE:
   case BIFROST_MESSAGE_ATOMIC:
      return true;
   default:
      return false;
   }
}

/* Update the scoreboard model to assign an instruction to a given slot */

static void
bi_push_instr(struct bi_scoreboard_state *st, bi_instr *I)
{
   if (bi_opcode_props[I->op].sr_write)
      st->write[I->slot] |= bi_write_mask(I);

   if (bi_is_memory_access(I))
      st->memory |= BITFIELD_BIT(I->slot);

   if (bi_opcode_props[I->op].message == BIFROST_MESSAGE_VARYING)
      st->varying |= BITFIELD_BIT(I->slot);
}

static uint8_t MUST_CHECK
bi_pop_slot(struct bi_scoreboard_state *st, unsigned slot)
{
   st->write[slot] = 0;
   st->varying &= ~BITFIELD_BIT(slot);
   st->memory &= ~BITFIELD_BIT(slot);

   return BITFIELD_BIT(slot);
}

/* Adds a dependency on each slot writing any specified register */

static uint8_t MUST_CHECK
bi_depend_on_writers(struct bi_scoreboard_state *st, uint64_t regmask)
{
   uint8_t slots = 0;

   for (unsigned slot = 0; slot < ARRAY_SIZE(st->write); ++slot) {
      if (st->write[slot] & regmask)
         slots |= bi_pop_slot(st, slot);
   }

   return slots;
}
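
/*
 * Worked example with hypothetical values: suppose slot #0 has an outstanding
 * write mask of 0x00f0 (r4-r7 pending) and slot #2 has 0x0300 (r8-r9 pending).
 * An instruction reading r5 passes a regmask with bit 5 set, which intersects
 * only slot #0's writes. bi_depend_on_writers therefore pops slot #0, clearing
 * its pending state, and returns the wait mask BITFIELD_BIT(0) = 0x1.
 */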

/* Sets the dependencies for a given instruction, updating the model */

static void
bi_set_dependencies(bi_block *block, bi_instr *I, struct bi_scoreboard_state *st)
{
   /* Depend on writers to handle read-after-write and write-after-write
    * dependencies. Write-after-read dependencies are handled in the hardware
    * where necessary, so we don't worry about them.
    */
   I->flow |= bi_depend_on_writers(st, bi_read_mask(I) | bi_write_mask(I));

   /* Handle write-after-write and write-after-read dependencies for the
    * varying hidden registers. Read-after-write dependencies are handled in
    * hardware.
    */
   if (bi_ld_vary_writes_hidden_register(I)) {
      u_foreach_bit(slot, st->varying)
         I->flow |= bi_pop_slot(st, slot);
   }

   /* For now, serialize all memory access */
   if (bi_is_memory_access(I)) {
      u_foreach_bit(slot, st->memory)
         I->flow |= bi_pop_slot(st, slot);
   }

   /* We need to wait for all general slots before a barrier. The reason is
    * unknown. In theory, this is redundant, since the BARRIER instruction will
    * be followed immediately by .wait which waits for all slots. However, that
    * doesn't seem to work properly in practice.
    *
    * The DDK is observed to use the same workaround, going so far as
    * introducing a NOP before a BARRIER at the beginning of a basic block when
    * there are outstanding stores.
    *
    *    NOP.wait12
    *    BARRIER.slot7.wait
    *
    * Luckily, this situation is pretty rare. The wait introduced here can
    * usually be merged into the preceding instruction.
    *
    * We also use the same workaround to serialize all async instructions when
    * debugging this pass with the BIFROST_MESA_DEBUG=nosb option.
    */
   if (I->op == BI_OPCODE_BARRIER || (bifrost_debug & BIFROST_DBG_NOSB)) {
      for (unsigned i = 0; i < VA_NUM_GENERAL_SLOTS; ++i) {
         if (st->write[i] || ((st->varying | st->memory) & BITFIELD_BIT(i)))
            I->flow |= bi_pop_slot(st, i);
      }
   }
}
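
/*
 * Hypothetical CFG illustrating why a fixed point is needed below: in a loop
 *
 *    A -> B -> B (back edge) -> C
 *
 * the scoreboard state flowing out of B feeds back into B itself via the back
 * edge. scoreboard_block_update must therefore be re-run on B until its
 * scoreboard_out stops changing; the worklist in va_assign_scoreboard
 * re-queues successors whenever a block's outgoing state changes.
 */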

static bool
scoreboard_block_update(bi_context *ctx, bi_block *blk)
{
   bool progress = false;

   /* pending_in[s] = sum { p in pred[s] } ( pending_out[p] ) */
   bi_foreach_predecessor(blk, pred) {
      for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) {
         blk->scoreboard_in.read[i] |= (*pred)->scoreboard_out.read[i];
         blk->scoreboard_in.write[i] |= (*pred)->scoreboard_out.write[i];
         blk->scoreboard_in.varying |= (*pred)->scoreboard_out.varying;
         blk->scoreboard_in.memory |= (*pred)->scoreboard_out.memory;
      }
   }

   struct bi_scoreboard_state state = blk->scoreboard_in;

   /* Assign locally */

   bi_foreach_instr_in_block(blk, I) {
      bi_set_dependencies(blk, I, &state);
      bi_push_instr(&state, I);
   }

   /* Insert a wait for varyings at the end of the block.
    *
    * A varying load with .store has to wait for all other varying loads
    * in the quad to complete. The bad case looks like:
    *
    *    if (dynamic) {
    *       x = ld_var()
    *    } else {
    *       x = ld_var()
    *    }
    *
    * Logically, a given thread executes only a single ld_var instruction. But
    * if the quad diverges, the second ld_var has to wait for the first ld_var.
    * For correct handling, we would need to maintain a physical control flow
    * graph and do the dataflow analysis on that instead of the logical control
    * flow graph. However, this probably doesn't matter much in practice. This
    * seems like a decent compromise for now.
    *
    * TODO: Consider optimizing this case.
    */
   if (state.varying) {
      uint8_t flow = 0;

      u_foreach_bit(slot, state.varying)
         flow |= bi_pop_slot(&state, slot);

      bi_flow(ctx, bi_after_block(blk), flow);
   }

   /* To figure out progress, diff scoreboard_out */
   progress = !!memcmp(&state, &blk->scoreboard_out, sizeof(state));

   blk->scoreboard_out = state;

   return progress;
}

static void
va_assign_scoreboard(bi_context *ctx)
{
   u_worklist worklist;
   bi_worklist_init(ctx, &worklist);

   bi_foreach_block(ctx, block) {
      bi_worklist_push_tail(&worklist, block);
   }

   /* Perform forward data flow analysis to calculate dependencies */
   while (!u_worklist_is_empty(&worklist)) {
      /* Pop from the front for forward analysis */
      bi_block *blk = bi_worklist_pop_head(&worklist);

      if (scoreboard_block_update(ctx, blk)) {
         bi_foreach_successor(blk, succ)
            bi_worklist_push_tail(&worklist, succ);
      }
   }

   u_worklist_fini(&worklist);
}

/*
 * Determine if execution should terminate after a given block. Execution
 * cannot terminate within a basic block.
 */
static bool
va_should_end(bi_block *block)
{
   /* Don't end execution if the block has any successors */
   for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
      bi_block *succ = block->successors[i];

      if (succ)
         return false;
   }

   return true;
}

/*
 * We should discard helper invocations as soon as they die, that is, after
 * their last use. Either they die after an instruction using helper
 * invocations, or they die along a control flow edge. The former is handled by
 * discarding appropriately after instructions. The latter is handled by
 * inserting a discard at the _start_ of some blocks:
 *
 * Lemma: If a non-critical edge discards helpers, it is the only edge that
 * enters its destination.
 *
 * Proof: An edge discards helpers if helpers are live at the end of the source
 * block and dead at the start of the destination block. By definition, helpers
 * are live at the end of a block iff they are live at the start of some
 * successor of the block. The source block therefore has a successor with
 * helpers live at the start and a successor with helpers dead at the start.
 * As the source block has at least two successors, the edge is NOT the only
 * edge exiting its source. Hence it is the only edge entering the destination,
 * otherwise the edge would be critical.
 *
 * As a corollary, we may handle discards on control flow edges by discarding
 * at the start of blocks with a single predecessor.
 *
 * This routine tests if a block should discard helper invocations at its start.
 */
static bool
va_discard_before_block(bi_block *block)
{
   /* Do not discard if the block requires helpers at the start */
   if (block->pass_flags)
      return false;

   /* By the lemma, if we need to discard, there is a unique predecessor */
   if (bi_num_predecessors(block) != 1)
      return false;

   bi_block *pred = *util_dynarray_element(&block->predecessors, bi_block *, 0);

   /* Discard if helpers are live at the end of the predecessor, due to helpers
    * live at the start of some (other) successor.
    */
   bi_foreach_successor(pred, succ) {
      if (succ->pass_flags)
         return true;
   }

   return false;
}
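
/*
 * Hypothetical CFG illustrating the lemma above: block A branches to blocks B
 * and C, helpers are live at the start of B but dead at the start of C.
 * Helpers are then live at the end of A, so the A->C edge discards helpers.
 * Since A has two successors, the edge is non-critical only if C has a single
 * predecessor (A), in which case va_discard_before_block places the discard
 * at the start of C rather than on the edge itself.
 */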

/*
 * Test if a program is empty, in the sense of having zero instructions. Empty
 * shaders get special handling.
 */
static bool
bi_is_empty(bi_context *ctx)
{
   bi_foreach_instr_global(ctx, _)
      return false;

   return true;
}

/*
 * Given a program with no flow control modifiers, insert NOPs signaling the
 * required flow control. Not much optimization happens here.
 */
void
va_insert_flow_control_nops(bi_context *ctx)
{
   /* Special case: if a program is empty, leave it empty. In particular, do not
    * insert NOP.end. There is special handling in the driver for skipping empty
    * shaders per shader stage. The .end is not necessary and disrupts
    * optimizations.
    */
   if (bi_is_empty(ctx))
      return;

   /* First do dataflow analysis for the scoreboard. This populates I->flow with
    * a bitmap of slots to wait on.
    *
    * Also do dataflow analysis for helper invocations in fragment shaders. This
    * populates block->pass_flags with helper invocation information.
    */
   va_assign_scoreboard(ctx);
   bi_analyze_helper_terminate(ctx);

   bi_foreach_block(ctx, block) {
      /* Handle discards along control flow edges */
      if (va_discard_before_block(block))
         bi_flow(ctx, bi_before_block(block), VA_FLOW_DISCARD);

      bi_foreach_instr_in_block_safe(block, I) {
         switch (I->op) {
         /* Signal barriers immediately */
         case BI_OPCODE_BARRIER:
            bi_flow(ctx, bi_after_instr(I), VA_FLOW_WAIT);
            break;

         /* Insert waits for tilebuffer and depth/stencil instructions. These
          * only happen in regular fragment shaders, as the required waits are
          * assumed to already have happened in blend shaders.
          *
          * For discarded thread handling, ATEST must be serialized against all
          * other asynchronous instructions and should be serialized against all
          * instructions. Wait for slot 0 immediately after the ATEST.
          */
         case BI_OPCODE_BLEND:
         case BI_OPCODE_LD_TILE:
         case BI_OPCODE_ST_TILE:
            if (!ctx->inputs->is_blend)
               bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT);
            break;
         case BI_OPCODE_ATEST:
            bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT0126);
            bi_flow(ctx, bi_after_instr(I), VA_FLOW_WAIT0);
            break;
         case BI_OPCODE_ZS_EMIT:
            if (!ctx->inputs->is_blend)
               bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT0126);
            break;

         default:
            break;
         }

         if (I->flow && I->op != BI_OPCODE_NOP) {
            /* Wait on the results of asynchronous instructions.
             *
             * The bitmap of general slots lines up with the encoding of va_flow
             * for waits on general slots. The dataflow analysis should be
             * ignoring the special slots #6 and #7, which are handled
             * separately.
             */
            assert((I->flow & ~BITFIELD_MASK(VA_NUM_GENERAL_SLOTS)) == 0);

            bi_flow(ctx, bi_before_instr(I), I->flow);
            I->flow = 0;
         }
      }
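
      /*
       * Illustrative result at this point (hypothetical disassembly, exact
       * mnemonics may differ): an ATEST in the block is now bracketed by
       * explicit waits,
       *
       *    NOP.wait0126
       *    ATEST ...
       *    NOP.wait0
       *
       * which the later merge pass can fold onto neighbouring instructions,
       * eliminating the NOPs.
       */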

      /* Terminate helpers after the last use */
      if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend &&
          block->pass_flags && bi_block_terminates_helpers(block)) {

         bi_foreach_instr_in_block_safe_rev(block, I) {
            if (bi_instr_uses_helpers(I)) {
               bi_flow(ctx, bi_after_instr(I), VA_FLOW_DISCARD);
               break;
            }
         }
      }

      /* End execution at the end of the block if needed, or reconverge if we
       * continue but don't need to end execution.
       */
      if (va_should_end(block) || block->needs_nop) {
         /* Don't bother adding a NOP into an unreachable block */
         if (block == bi_start_block(&ctx->blocks) || bi_num_predecessors(block))
            bi_flow(ctx, bi_after_block(block), VA_FLOW_END);
      } else if (bi_reconverge_branches(block)) {
         /* TODO: Do we ever need to reconverge from an empty block? */
         if (!list_is_empty(&block->instructions))
            bi_flow(ctx, bi_after_block(block), VA_FLOW_RECONVERGE);
      }
   }

   /* If helpers are not used anywhere, they are not used at the start, so we
    * terminate at the start. Else, helpers are used somewhere in the shader and
    * are terminated after the last use.
    */
   bi_block *start = bi_start_block(&ctx->blocks);
   bool frag = (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend);

   if (frag && !start->pass_flags)
      bi_flow(ctx, bi_before_block(start), VA_FLOW_DISCARD);
}

/*
 * Assign slots to all asynchronous instructions. A few special instructions
 * require specific slots. For the rest, we assign slots in a round-robin
 * fashion to reduce false dependencies when encoding waits.
 *
 * This should be called before va_insert_flow_control_nops.
 */
void
va_assign_slots(bi_context *ctx)
{
   unsigned counter = 0;

   bi_foreach_instr_global(ctx, I) {
      if (I->op == BI_OPCODE_BARRIER) {
         I->slot = 7;
      } else if (I->op == BI_OPCODE_ZS_EMIT || I->op == BI_OPCODE_ATEST) {
         I->slot = 0;
      } else if (bi_opcode_props[I->op].message) {
         I->slot = counter++;

         if (counter == 3)
            counter = 0;
      }
   }
}