1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright (C) 2022 Collabora Ltd. 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20bf215546Sopenharmony_ci * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21bf215546Sopenharmony_ci * SOFTWARE. 22bf215546Sopenharmony_ci */ 23bf215546Sopenharmony_ci 24bf215546Sopenharmony_ci#include "va_compiler.h" 25bf215546Sopenharmony_ci#include "valhall_enums.h" 26bf215546Sopenharmony_ci#include "bi_builder.h" 27bf215546Sopenharmony_ci 28bf215546Sopenharmony_ci/* 29bf215546Sopenharmony_ci * Insert flow control into a scheduled and register allocated shader. This 30bf215546Sopenharmony_ci * pass runs after scheduling and register allocation. This pass only 31bf215546Sopenharmony_ci * inserts NOPs with the appropriate flow control modifiers. It should be 32bf215546Sopenharmony_ci * followed by a cleanup pass to merge flow control modifiers on adjacent 33bf215546Sopenharmony_ci * instructions, eliminating the NOPs. This decouples optimization from 34bf215546Sopenharmony_ci * correctness, simplifying both passes. 35bf215546Sopenharmony_ci * 36bf215546Sopenharmony_ci * This pass is responsible for calculating dependencies, according to the 37bf215546Sopenharmony_ci * rules: 38bf215546Sopenharmony_ci * 39bf215546Sopenharmony_ci * 1. An instruction that depends on the results of a previous asyncronous 40bf215546Sopenharmony_ci * must first wait for that instruction's slot, unless all 41bf215546Sopenharmony_ci * reaching code paths already depended on it. 42bf215546Sopenharmony_ci * 2. More generally, any dependencies must be encoded. This includes 43bf215546Sopenharmony_ci * Write-After-Write and Write-After-Read hazards with LOAD/STORE to memory. 44bf215546Sopenharmony_ci * 3. The shader must wait on slot #6 before running BLEND, ATEST 45bf215546Sopenharmony_ci * 4. The shader must wait on slot #7 before running BLEND, ST_TILE 46bf215546Sopenharmony_ci * 6. BARRIER must wait on every active slot. 47bf215546Sopenharmony_ci * 48bf215546Sopenharmony_ci * Unlike Bifrost, it is not necessary to worry about outbound staging 49bf215546Sopenharmony_ci * registers, as the hardware stalls reading staging registers when issuing 50bf215546Sopenharmony_ci * asynchronous instructions. So we don't track reads in our model of the 51bf215546Sopenharmony_ci * hardware scoreboard. This makes things a bit simpler. 52bf215546Sopenharmony_ci * 53bf215546Sopenharmony_ci * We may reuse slots for multiple asynchronous instructions, though there may 54bf215546Sopenharmony_ci * be a performance penalty. 55bf215546Sopenharmony_ci */ 56bf215546Sopenharmony_ci 57bf215546Sopenharmony_ci#define BI_NUM_REGISTERS 64 58bf215546Sopenharmony_ci 59bf215546Sopenharmony_ci/* 60bf215546Sopenharmony_ci * Insert a NOP instruction with given flow control. 61bf215546Sopenharmony_ci */ 62bf215546Sopenharmony_cistatic void 63bf215546Sopenharmony_cibi_flow(bi_context *ctx, bi_cursor cursor, enum va_flow flow) 64bf215546Sopenharmony_ci{ 65bf215546Sopenharmony_ci bi_builder b = bi_init_builder(ctx, cursor); 66bf215546Sopenharmony_ci 67bf215546Sopenharmony_ci bi_nop(&b)->flow = flow; 68bf215546Sopenharmony_ci} 69bf215546Sopenharmony_ci 70bf215546Sopenharmony_cistatic uint64_t 71bf215546Sopenharmony_cibi_read_mask(bi_instr *I) 72bf215546Sopenharmony_ci{ 73bf215546Sopenharmony_ci uint64_t mask = 0; 74bf215546Sopenharmony_ci 75bf215546Sopenharmony_ci bi_foreach_src(I, s) { 76bf215546Sopenharmony_ci if (I->src[s].type == BI_INDEX_REGISTER) { 77bf215546Sopenharmony_ci unsigned reg = I->src[s].value; 78bf215546Sopenharmony_ci unsigned count = bi_count_read_registers(I, s); 79bf215546Sopenharmony_ci 80bf215546Sopenharmony_ci mask |= (BITFIELD64_MASK(count) << reg); 81bf215546Sopenharmony_ci } 82bf215546Sopenharmony_ci } 83bf215546Sopenharmony_ci 84bf215546Sopenharmony_ci return mask; 85bf215546Sopenharmony_ci} 86bf215546Sopenharmony_ci 87bf215546Sopenharmony_cistatic uint64_t 88bf215546Sopenharmony_cibi_write_mask(bi_instr *I) 89bf215546Sopenharmony_ci{ 90bf215546Sopenharmony_ci uint64_t mask = 0; 91bf215546Sopenharmony_ci 92bf215546Sopenharmony_ci bi_foreach_dest(I, d) { 93bf215546Sopenharmony_ci if (bi_is_null(I->dest[d])) continue; 94bf215546Sopenharmony_ci 95bf215546Sopenharmony_ci assert(I->dest[d].type == BI_INDEX_REGISTER); 96bf215546Sopenharmony_ci 97bf215546Sopenharmony_ci unsigned reg = I->dest[d].value; 98bf215546Sopenharmony_ci unsigned count = bi_count_write_registers(I, d); 99bf215546Sopenharmony_ci 100bf215546Sopenharmony_ci mask |= (BITFIELD64_MASK(count) << reg); 101bf215546Sopenharmony_ci } 102bf215546Sopenharmony_ci 103bf215546Sopenharmony_ci return mask; 104bf215546Sopenharmony_ci} 105bf215546Sopenharmony_ci 106bf215546Sopenharmony_cistatic bool 107bf215546Sopenharmony_cibi_ld_vary_writes_hidden_register(const bi_instr *I) 108bf215546Sopenharmony_ci{ 109bf215546Sopenharmony_ci /* Only varying loads can write the hidden register */ 110bf215546Sopenharmony_ci if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_VARYING) 111bf215546Sopenharmony_ci return false; 112bf215546Sopenharmony_ci 113bf215546Sopenharmony_ci /* They only write in some update modes */ 114bf215546Sopenharmony_ci return (I->update == BI_UPDATE_STORE) || (I->update == BI_UPDATE_CLOBBER); 115bf215546Sopenharmony_ci} 116bf215546Sopenharmony_ci 117bf215546Sopenharmony_cistatic bool 118bf215546Sopenharmony_cibi_is_memory_access(const bi_instr *I) 119bf215546Sopenharmony_ci{ 120bf215546Sopenharmony_ci /* On the attribute unit but functionally a general memory load */ 121bf215546Sopenharmony_ci if (I->op == BI_OPCODE_LD_ATTR_TEX) 122bf215546Sopenharmony_ci return true; 123bf215546Sopenharmony_ci 124bf215546Sopenharmony_ci /* UBOs are read-only so there are no ordering constriants */ 125bf215546Sopenharmony_ci if (I->seg == BI_SEG_UBO) 126bf215546Sopenharmony_ci return false; 127bf215546Sopenharmony_ci 128bf215546Sopenharmony_ci switch (bi_opcode_props[I->op].message) { 129bf215546Sopenharmony_ci case BIFROST_MESSAGE_LOAD: 130bf215546Sopenharmony_ci case BIFROST_MESSAGE_STORE: 131bf215546Sopenharmony_ci case BIFROST_MESSAGE_ATOMIC: 132bf215546Sopenharmony_ci return true; 133bf215546Sopenharmony_ci default: 134bf215546Sopenharmony_ci return false; 135bf215546Sopenharmony_ci } 136bf215546Sopenharmony_ci} 137bf215546Sopenharmony_ci 138bf215546Sopenharmony_ci/* Update the scoreboard model to assign an instruction to a given slot */ 139bf215546Sopenharmony_ci 140bf215546Sopenharmony_cistatic void 141bf215546Sopenharmony_cibi_push_instr(struct bi_scoreboard_state *st, bi_instr *I) 142bf215546Sopenharmony_ci{ 143bf215546Sopenharmony_ci if (bi_opcode_props[I->op].sr_write) 144bf215546Sopenharmony_ci st->write[I->slot] |= bi_write_mask(I); 145bf215546Sopenharmony_ci 146bf215546Sopenharmony_ci if (bi_is_memory_access(I)) 147bf215546Sopenharmony_ci st->memory |= BITFIELD_BIT(I->slot); 148bf215546Sopenharmony_ci 149bf215546Sopenharmony_ci if (bi_opcode_props[I->op].message == BIFROST_MESSAGE_VARYING) 150bf215546Sopenharmony_ci st->varying |= BITFIELD_BIT(I->slot); 151bf215546Sopenharmony_ci} 152bf215546Sopenharmony_ci 153bf215546Sopenharmony_cistatic uint8_t MUST_CHECK 154bf215546Sopenharmony_cibi_pop_slot(struct bi_scoreboard_state *st, unsigned slot) 155bf215546Sopenharmony_ci{ 156bf215546Sopenharmony_ci st->write[slot] = 0; 157bf215546Sopenharmony_ci st->varying &= ~BITFIELD_BIT(slot); 158bf215546Sopenharmony_ci st->memory &= ~BITFIELD_BIT(slot); 159bf215546Sopenharmony_ci 160bf215546Sopenharmony_ci return BITFIELD_BIT(slot); 161bf215546Sopenharmony_ci} 162bf215546Sopenharmony_ci 163bf215546Sopenharmony_ci/* Adds a dependency on each slot writing any specified register */ 164bf215546Sopenharmony_ci 165bf215546Sopenharmony_cistatic uint8_t MUST_CHECK 166bf215546Sopenharmony_cibi_depend_on_writers(struct bi_scoreboard_state *st, uint64_t regmask) 167bf215546Sopenharmony_ci{ 168bf215546Sopenharmony_ci uint8_t slots = 0; 169bf215546Sopenharmony_ci 170bf215546Sopenharmony_ci for (unsigned slot = 0; slot < ARRAY_SIZE(st->write); ++slot) { 171bf215546Sopenharmony_ci if (st->write[slot] & regmask) 172bf215546Sopenharmony_ci slots |= bi_pop_slot(st, slot); 173bf215546Sopenharmony_ci } 174bf215546Sopenharmony_ci 175bf215546Sopenharmony_ci return slots; 176bf215546Sopenharmony_ci} 177bf215546Sopenharmony_ci 178bf215546Sopenharmony_ci/* Sets the dependencies for a given clause, updating the model */ 179bf215546Sopenharmony_ci 180bf215546Sopenharmony_cistatic void 181bf215546Sopenharmony_cibi_set_dependencies(bi_block *block, bi_instr *I, struct bi_scoreboard_state *st) 182bf215546Sopenharmony_ci{ 183bf215546Sopenharmony_ci /* Depend on writers to handle read-after-write and write-after-write 184bf215546Sopenharmony_ci * dependencies. Write-after-read dependencies are handled in the hardware 185bf215546Sopenharmony_ci * where necessary, so we don't worry about them. 186bf215546Sopenharmony_ci */ 187bf215546Sopenharmony_ci I->flow |= bi_depend_on_writers(st, bi_read_mask(I) | bi_write_mask(I)); 188bf215546Sopenharmony_ci 189bf215546Sopenharmony_ci /* Handle write-after-write and write-after-read dependencies for the varying 190bf215546Sopenharmony_ci * hidden registers. Read-after-write dependencies handled in hardware. 191bf215546Sopenharmony_ci */ 192bf215546Sopenharmony_ci if (bi_ld_vary_writes_hidden_register(I)) { 193bf215546Sopenharmony_ci u_foreach_bit(slot, st->varying) 194bf215546Sopenharmony_ci I->flow |= bi_pop_slot(st, slot); 195bf215546Sopenharmony_ci } 196bf215546Sopenharmony_ci 197bf215546Sopenharmony_ci /* For now, serialize all memory access */ 198bf215546Sopenharmony_ci if (bi_is_memory_access(I)) { 199bf215546Sopenharmony_ci u_foreach_bit(slot, st->memory) 200bf215546Sopenharmony_ci I->flow |= bi_pop_slot(st, slot); 201bf215546Sopenharmony_ci } 202bf215546Sopenharmony_ci 203bf215546Sopenharmony_ci /* We need to wait for all general slots before a barrier. The reason is 204bf215546Sopenharmony_ci * unknown. In theory, this is redundant, since the BARRIER instruction will 205bf215546Sopenharmony_ci * be followed immediately by .wait which waits for all slots. However, that 206bf215546Sopenharmony_ci * doesn't seem to work properly in practice. 207bf215546Sopenharmony_ci * 208bf215546Sopenharmony_ci * The DDK is observed to use the same workaround, going so far as 209bf215546Sopenharmony_ci * introducing a NOP before a BARRIER at the beginning of a basic block when 210bf215546Sopenharmony_ci * there are outstanding stores. 211bf215546Sopenharmony_ci * 212bf215546Sopenharmony_ci * NOP.wait12 213bf215546Sopenharmony_ci * BARRIER.slot7.wait 214bf215546Sopenharmony_ci * 215bf215546Sopenharmony_ci * Luckily, this situation is pretty rare. The wait introduced here can 216bf215546Sopenharmony_ci * usually be merged into the preceding instruction. 217bf215546Sopenharmony_ci * 218bf215546Sopenharmony_ci * We also use the same workaround to serialize all async instructions when 219bf215546Sopenharmony_ci * debugging this pass with the BIFROST_MESA_DEBUG=nosb option. 220bf215546Sopenharmony_ci */ 221bf215546Sopenharmony_ci if (I->op == BI_OPCODE_BARRIER || (bifrost_debug & BIFROST_DBG_NOSB)) { 222bf215546Sopenharmony_ci for (unsigned i = 0; i < VA_NUM_GENERAL_SLOTS; ++i) { 223bf215546Sopenharmony_ci if (st->write[i] || ((st->varying | st->memory) & BITFIELD_BIT(i))) 224bf215546Sopenharmony_ci I->flow |= bi_pop_slot(st, i); 225bf215546Sopenharmony_ci } 226bf215546Sopenharmony_ci } 227bf215546Sopenharmony_ci} 228bf215546Sopenharmony_ci 229bf215546Sopenharmony_cistatic bool 230bf215546Sopenharmony_ciscoreboard_block_update(bi_context *ctx, bi_block *blk) 231bf215546Sopenharmony_ci{ 232bf215546Sopenharmony_ci bool progress = false; 233bf215546Sopenharmony_ci 234bf215546Sopenharmony_ci /* pending_in[s] = sum { p in pred[s] } ( pending_out[p] ) */ 235bf215546Sopenharmony_ci bi_foreach_predecessor(blk, pred) { 236bf215546Sopenharmony_ci for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) { 237bf215546Sopenharmony_ci blk->scoreboard_in.read[i] |= (*pred)->scoreboard_out.read[i]; 238bf215546Sopenharmony_ci blk->scoreboard_in.write[i] |= (*pred)->scoreboard_out.write[i]; 239bf215546Sopenharmony_ci blk->scoreboard_in.varying |= (*pred)->scoreboard_out.varying; 240bf215546Sopenharmony_ci blk->scoreboard_in.memory |= (*pred)->scoreboard_out.memory; 241bf215546Sopenharmony_ci } 242bf215546Sopenharmony_ci } 243bf215546Sopenharmony_ci 244bf215546Sopenharmony_ci struct bi_scoreboard_state state = blk->scoreboard_in; 245bf215546Sopenharmony_ci 246bf215546Sopenharmony_ci /* Assign locally */ 247bf215546Sopenharmony_ci 248bf215546Sopenharmony_ci bi_foreach_instr_in_block(blk, I) { 249bf215546Sopenharmony_ci bi_set_dependencies(blk, I, &state); 250bf215546Sopenharmony_ci bi_push_instr(&state, I); 251bf215546Sopenharmony_ci } 252bf215546Sopenharmony_ci 253bf215546Sopenharmony_ci /* Insert a wait for varyings at the end of the block. 254bf215546Sopenharmony_ci * 255bf215546Sopenharmony_ci * A varying load with .store has to wait for all other varying loads 256bf215546Sopenharmony_ci * in the quad to complete. The bad case looks like: 257bf215546Sopenharmony_ci * 258bf215546Sopenharmony_ci * if (dynamic) { 259bf215546Sopenharmony_ci * x = ld_var() 260bf215546Sopenharmony_ci * } else { 261bf215546Sopenharmony_ci * x = ld_var() 262bf215546Sopenharmony_ci * } 263bf215546Sopenharmony_ci * 264bf215546Sopenharmony_ci * Logically, a given thread executes only a single ld_var instruction. But 265bf215546Sopenharmony_ci * if the quad diverges, the second ld_var has to wait for the first ld_var. 266bf215546Sopenharmony_ci * For correct handling, we need to maintain a physical control flow graph 267bf215546Sopenharmony_ci * and do the dataflow analysis on that instead of the logical control flow 268bf215546Sopenharmony_ci * graph. However, this probably doesn't matter much in practice. This seems 269bf215546Sopenharmony_ci * like a decent compromise for now. 270bf215546Sopenharmony_ci * 271bf215546Sopenharmony_ci * TODO: Consider optimizing this case. 272bf215546Sopenharmony_ci */ 273bf215546Sopenharmony_ci if (state.varying) { 274bf215546Sopenharmony_ci uint8_t flow = 0; 275bf215546Sopenharmony_ci 276bf215546Sopenharmony_ci u_foreach_bit(slot, state.varying) 277bf215546Sopenharmony_ci flow |= bi_pop_slot(&state, slot); 278bf215546Sopenharmony_ci 279bf215546Sopenharmony_ci bi_flow(ctx, bi_after_block(blk), flow); 280bf215546Sopenharmony_ci } 281bf215546Sopenharmony_ci 282bf215546Sopenharmony_ci /* To figure out progress, diff scoreboard_out */ 283bf215546Sopenharmony_ci progress = !!memcmp(&state, &blk->scoreboard_out, sizeof(state)); 284bf215546Sopenharmony_ci 285bf215546Sopenharmony_ci blk->scoreboard_out = state; 286bf215546Sopenharmony_ci 287bf215546Sopenharmony_ci return progress; 288bf215546Sopenharmony_ci} 289bf215546Sopenharmony_ci 290bf215546Sopenharmony_cistatic void 291bf215546Sopenharmony_civa_assign_scoreboard(bi_context *ctx) 292bf215546Sopenharmony_ci{ 293bf215546Sopenharmony_ci u_worklist worklist; 294bf215546Sopenharmony_ci bi_worklist_init(ctx, &worklist); 295bf215546Sopenharmony_ci 296bf215546Sopenharmony_ci bi_foreach_block(ctx, block) { 297bf215546Sopenharmony_ci bi_worklist_push_tail(&worklist, block); 298bf215546Sopenharmony_ci } 299bf215546Sopenharmony_ci 300bf215546Sopenharmony_ci /* Perform forward data flow analysis to calculate dependencies */ 301bf215546Sopenharmony_ci while (!u_worklist_is_empty(&worklist)) { 302bf215546Sopenharmony_ci /* Pop from the front for forward analysis */ 303bf215546Sopenharmony_ci bi_block *blk = bi_worklist_pop_head(&worklist); 304bf215546Sopenharmony_ci 305bf215546Sopenharmony_ci if (scoreboard_block_update(ctx, blk)) { 306bf215546Sopenharmony_ci bi_foreach_successor(blk, succ) 307bf215546Sopenharmony_ci bi_worklist_push_tail(&worklist, succ); 308bf215546Sopenharmony_ci } 309bf215546Sopenharmony_ci } 310bf215546Sopenharmony_ci 311bf215546Sopenharmony_ci u_worklist_fini(&worklist); 312bf215546Sopenharmony_ci} 313bf215546Sopenharmony_ci 314bf215546Sopenharmony_ci/* 315bf215546Sopenharmony_ci * Determine if execution should terminate after a given block. Execution cannot 316bf215546Sopenharmony_ci * terminate within a basic block. 317bf215546Sopenharmony_ci */ 318bf215546Sopenharmony_cistatic bool 319bf215546Sopenharmony_civa_should_end(bi_block *block) 320bf215546Sopenharmony_ci{ 321bf215546Sopenharmony_ci /* Don't return if we're succeeded by instructions */ 322bf215546Sopenharmony_ci for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) { 323bf215546Sopenharmony_ci bi_block *succ = block->successors[i]; 324bf215546Sopenharmony_ci 325bf215546Sopenharmony_ci if (succ) 326bf215546Sopenharmony_ci return false; 327bf215546Sopenharmony_ci } 328bf215546Sopenharmony_ci 329bf215546Sopenharmony_ci return true; 330bf215546Sopenharmony_ci} 331bf215546Sopenharmony_ci 332bf215546Sopenharmony_ci/* 333bf215546Sopenharmony_ci * We should discard helper invocations as soon as helper invocations die after 334bf215546Sopenharmony_ci * their last use. Either they die after an instruction using helper 335bf215546Sopenharmony_ci * invocations, or they die along a control flow edge. The former is handled by 336bf215546Sopenharmony_ci * discarding appropriately after instructions. The latter is handled by 337bf215546Sopenharmony_ci * inserting a discard at the _start_ of some blocks: 338bf215546Sopenharmony_ci * 339bf215546Sopenharmony_ci * Lemma: If a non-critical edge discards helpers, it is the only edge that 340bf215546Sopenharmony_ci * enters its destination. 341bf215546Sopenharmony_ci * 342bf215546Sopenharmony_ci * Proof: An edge discards helpers if helpers are live at the end of the source 343bf215546Sopenharmony_ci * block and dead at the start of the destination block. By definition, helpers 344bf215546Sopenharmony_ci * are live at the end of a block iff they are live at the start of some 345bf215546Sopenharmony_ci * successor of a block. The source block therefore has a successor with helpers 346bf215546Sopenharmony_ci * live at the start and a successor with helpers dead at the start. As the 347bf215546Sopenharmony_ci * source block has at least two successors, the edge is NOT the only edge 348bf215546Sopenharmony_ci * exiting its source. Hence it is the only edge entering the destination, 349bf215546Sopenharmony_ci * otherwise the edge would be critical. 350bf215546Sopenharmony_ci * 351bf215546Sopenharmony_ci * By corrollary, we may handle discards on control flow edges by discarding at 352bf215546Sopenharmony_ci * the start of blocks with a single predecessor. 353bf215546Sopenharmony_ci * 354bf215546Sopenharmony_ci * This routine tests if a block should discard helper invocations at its start. 355bf215546Sopenharmony_ci */ 356bf215546Sopenharmony_cistatic bool 357bf215546Sopenharmony_civa_discard_before_block(bi_block *block) 358bf215546Sopenharmony_ci{ 359bf215546Sopenharmony_ci /* Do not discard if the block requires helpers at the start */ 360bf215546Sopenharmony_ci if (block->pass_flags) 361bf215546Sopenharmony_ci return false; 362bf215546Sopenharmony_ci 363bf215546Sopenharmony_ci /* By the lemma, if we need to discard, there is a unique predecessor */ 364bf215546Sopenharmony_ci if (bi_num_predecessors(block) != 1) 365bf215546Sopenharmony_ci return false; 366bf215546Sopenharmony_ci 367bf215546Sopenharmony_ci bi_block *pred = *util_dynarray_element(&block->predecessors, bi_block *, 0); 368bf215546Sopenharmony_ci 369bf215546Sopenharmony_ci /* Discard if helpers are live at the end of the predecessor, due to helpers 370bf215546Sopenharmony_ci * live at the start of some (other) successor. 371bf215546Sopenharmony_ci */ 372bf215546Sopenharmony_ci bi_foreach_successor(pred, succ) { 373bf215546Sopenharmony_ci if (succ->pass_flags) 374bf215546Sopenharmony_ci return true; 375bf215546Sopenharmony_ci } 376bf215546Sopenharmony_ci 377bf215546Sopenharmony_ci return false; 378bf215546Sopenharmony_ci} 379bf215546Sopenharmony_ci 380bf215546Sopenharmony_ci/* 381bf215546Sopenharmony_ci * Test if a program is empty, in the sense of having zero instructions. Empty 382bf215546Sopenharmony_ci * shaders get special handling. 383bf215546Sopenharmony_ci */ 384bf215546Sopenharmony_cistatic bool 385bf215546Sopenharmony_cibi_is_empty(bi_context *ctx) 386bf215546Sopenharmony_ci{ 387bf215546Sopenharmony_ci bi_foreach_instr_global(ctx, _) 388bf215546Sopenharmony_ci return false; 389bf215546Sopenharmony_ci 390bf215546Sopenharmony_ci return true; 391bf215546Sopenharmony_ci} 392bf215546Sopenharmony_ci 393bf215546Sopenharmony_ci/* 394bf215546Sopenharmony_ci * Given a program with no flow control modifiers, insert NOPs signaling the 395bf215546Sopenharmony_ci * required flow control. Not much optimization happens here. 396bf215546Sopenharmony_ci */ 397bf215546Sopenharmony_civoid 398bf215546Sopenharmony_civa_insert_flow_control_nops(bi_context *ctx) 399bf215546Sopenharmony_ci{ 400bf215546Sopenharmony_ci /* Special case: if a program is empty, leave it empty. In particular, do not 401bf215546Sopenharmony_ci * insert NOP.end. There is special handling in the driver for skipping empty 402bf215546Sopenharmony_ci * shaders for shader stage. The .end is not necessary and disrupts 403bf215546Sopenharmony_ci * optimizations. 404bf215546Sopenharmony_ci */ 405bf215546Sopenharmony_ci if (bi_is_empty(ctx)) 406bf215546Sopenharmony_ci return; 407bf215546Sopenharmony_ci 408bf215546Sopenharmony_ci /* First do dataflow analysis for the scoreboard. This populates I->flow with 409bf215546Sopenharmony_ci * a bitmap of slots to wait on. 410bf215546Sopenharmony_ci * 411bf215546Sopenharmony_ci * Also do dataflow analysis for helper invocations in fragment shaders. This 412bf215546Sopenharmony_ci * populates block->pass_flags with helper invocation information. 413bf215546Sopenharmony_ci */ 414bf215546Sopenharmony_ci va_assign_scoreboard(ctx); 415bf215546Sopenharmony_ci bi_analyze_helper_terminate(ctx); 416bf215546Sopenharmony_ci 417bf215546Sopenharmony_ci bi_foreach_block(ctx, block) { 418bf215546Sopenharmony_ci /* Handle discards along control flow edges */ 419bf215546Sopenharmony_ci if (va_discard_before_block(block)) 420bf215546Sopenharmony_ci bi_flow(ctx, bi_before_block(block), VA_FLOW_DISCARD); 421bf215546Sopenharmony_ci 422bf215546Sopenharmony_ci bi_foreach_instr_in_block_safe(block, I) { 423bf215546Sopenharmony_ci switch (I->op) { 424bf215546Sopenharmony_ci /* Signal barriers immediately */ 425bf215546Sopenharmony_ci case BI_OPCODE_BARRIER: 426bf215546Sopenharmony_ci bi_flow(ctx, bi_after_instr(I), VA_FLOW_WAIT); 427bf215546Sopenharmony_ci break; 428bf215546Sopenharmony_ci 429bf215546Sopenharmony_ci /* Insert waits for tilebuffer and depth/stencil instructions. These 430bf215546Sopenharmony_ci * only happen in regular fragment shaders, as the required waits are 431bf215546Sopenharmony_ci * assumed to already have happened in blend shaders. 432bf215546Sopenharmony_ci * 433bf215546Sopenharmony_ci * For discarded thread handling, ATEST must be serialized against all 434bf215546Sopenharmony_ci * other asynchronous instructions and should be serialized against all 435bf215546Sopenharmony_ci * instructions. Wait for slot 0 immediately after the ATEST. 436bf215546Sopenharmony_ci */ 437bf215546Sopenharmony_ci case BI_OPCODE_BLEND: 438bf215546Sopenharmony_ci case BI_OPCODE_LD_TILE: 439bf215546Sopenharmony_ci case BI_OPCODE_ST_TILE: 440bf215546Sopenharmony_ci if (!ctx->inputs->is_blend) 441bf215546Sopenharmony_ci bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT); 442bf215546Sopenharmony_ci break; 443bf215546Sopenharmony_ci case BI_OPCODE_ATEST: 444bf215546Sopenharmony_ci bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT0126); 445bf215546Sopenharmony_ci bi_flow(ctx, bi_after_instr(I), VA_FLOW_WAIT0); 446bf215546Sopenharmony_ci break; 447bf215546Sopenharmony_ci case BI_OPCODE_ZS_EMIT: 448bf215546Sopenharmony_ci if (!ctx->inputs->is_blend) 449bf215546Sopenharmony_ci bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT0126); 450bf215546Sopenharmony_ci break; 451bf215546Sopenharmony_ci 452bf215546Sopenharmony_ci default: 453bf215546Sopenharmony_ci break; 454bf215546Sopenharmony_ci } 455bf215546Sopenharmony_ci 456bf215546Sopenharmony_ci if (I->flow && I->op != BI_OPCODE_NOP) { 457bf215546Sopenharmony_ci /* Wait on the results of asynchronous instructions 458bf215546Sopenharmony_ci * 459bf215546Sopenharmony_ci * Bitmap of general slots lines up with the encoding of va_flow for 460bf215546Sopenharmony_ci * waits on general slots. The dataflow analysis should be ignoring 461bf215546Sopenharmony_ci * the special slots #6 and #7, which are handled separately. 462bf215546Sopenharmony_ci */ 463bf215546Sopenharmony_ci assert((I->flow & ~BITFIELD_MASK(VA_NUM_GENERAL_SLOTS)) == 0); 464bf215546Sopenharmony_ci 465bf215546Sopenharmony_ci bi_flow(ctx, bi_before_instr(I), I->flow); 466bf215546Sopenharmony_ci I->flow = 0; 467bf215546Sopenharmony_ci } 468bf215546Sopenharmony_ci } 469bf215546Sopenharmony_ci 470bf215546Sopenharmony_ci /* Terminate helpers after the last use */ 471bf215546Sopenharmony_ci if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend && 472bf215546Sopenharmony_ci block->pass_flags && bi_block_terminates_helpers(block)) { 473bf215546Sopenharmony_ci 474bf215546Sopenharmony_ci bi_foreach_instr_in_block_safe_rev(block, I) { 475bf215546Sopenharmony_ci if (bi_instr_uses_helpers(I)) { 476bf215546Sopenharmony_ci bi_flow(ctx, bi_after_instr(I), VA_FLOW_DISCARD); 477bf215546Sopenharmony_ci break; 478bf215546Sopenharmony_ci } 479bf215546Sopenharmony_ci } 480bf215546Sopenharmony_ci } 481bf215546Sopenharmony_ci 482bf215546Sopenharmony_ci /* End exeuction at the end of the block if needed, or reconverge if we 483bf215546Sopenharmony_ci * continue but we don't need to end execution. 484bf215546Sopenharmony_ci */ 485bf215546Sopenharmony_ci if (va_should_end(block) || block->needs_nop) { 486bf215546Sopenharmony_ci /* Don't bother adding a NOP into an unreachable block */ 487bf215546Sopenharmony_ci if (block == bi_start_block(&ctx->blocks) || bi_num_predecessors(block)) 488bf215546Sopenharmony_ci bi_flow(ctx, bi_after_block(block), VA_FLOW_END); 489bf215546Sopenharmony_ci } else if (bi_reconverge_branches(block)) { 490bf215546Sopenharmony_ci /* TODO: Do we have ever need to reconverge from an empty block? */ 491bf215546Sopenharmony_ci if (!list_is_empty(&block->instructions)) 492bf215546Sopenharmony_ci bi_flow(ctx, bi_after_block(block), VA_FLOW_RECONVERGE); 493bf215546Sopenharmony_ci } 494bf215546Sopenharmony_ci } 495bf215546Sopenharmony_ci 496bf215546Sopenharmony_ci /* If helpers are not used anywhere, they are not used at the start, so we 497bf215546Sopenharmony_ci * terminate at the start. Else, helpers are used somewhere in the shader and 498bf215546Sopenharmony_ci * are terminated after last use. 499bf215546Sopenharmony_ci */ 500bf215546Sopenharmony_ci bi_block *start = bi_start_block(&ctx->blocks); 501bf215546Sopenharmony_ci bool frag = (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend); 502bf215546Sopenharmony_ci 503bf215546Sopenharmony_ci if (frag && !start->pass_flags) 504bf215546Sopenharmony_ci bi_flow(ctx, bi_before_block(start), VA_FLOW_DISCARD); 505bf215546Sopenharmony_ci} 506bf215546Sopenharmony_ci 507bf215546Sopenharmony_ci/* 508bf215546Sopenharmony_ci * Assign slots to all asynchronous instructions. A few special instructions 509bf215546Sopenharmony_ci * require specific slots. For the rest, we assign slots in a round-robin 510bf215546Sopenharmony_ci * fashion to reduce false dependencies when encoding waits. 511bf215546Sopenharmony_ci * 512bf215546Sopenharmony_ci * This should be called before va_insert_flow_control_nops. 513bf215546Sopenharmony_ci */ 514bf215546Sopenharmony_civoid 515bf215546Sopenharmony_civa_assign_slots(bi_context *ctx) 516bf215546Sopenharmony_ci{ 517bf215546Sopenharmony_ci unsigned counter = 0; 518bf215546Sopenharmony_ci 519bf215546Sopenharmony_ci bi_foreach_instr_global(ctx, I) { 520bf215546Sopenharmony_ci if (I->op == BI_OPCODE_BARRIER) { 521bf215546Sopenharmony_ci I->slot = 7; 522bf215546Sopenharmony_ci } else if (I->op == BI_OPCODE_ZS_EMIT || I->op == BI_OPCODE_ATEST) { 523bf215546Sopenharmony_ci I->slot = 0; 524bf215546Sopenharmony_ci } else if (bi_opcode_props[I->op].message) { 525bf215546Sopenharmony_ci I->slot = counter++; 526bf215546Sopenharmony_ci 527bf215546Sopenharmony_ci if (counter == 3) 528bf215546Sopenharmony_ci counter = 0; 529bf215546Sopenharmony_ci } 530bf215546Sopenharmony_ci } 531bf215546Sopenharmony_ci} 532