1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright (C) 2022 Collabora Ltd.
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
13bf215546Sopenharmony_ci * Software.
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20bf215546Sopenharmony_ci * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21bf215546Sopenharmony_ci * SOFTWARE.
22bf215546Sopenharmony_ci */
23bf215546Sopenharmony_ci
24bf215546Sopenharmony_ci#include "va_compiler.h"
25bf215546Sopenharmony_ci#include "valhall_enums.h"
26bf215546Sopenharmony_ci#include "bi_builder.h"
27bf215546Sopenharmony_ci
28bf215546Sopenharmony_ci/*
29bf215546Sopenharmony_ci * Insert flow control into a scheduled and register allocated shader.  This
30bf215546Sopenharmony_ci * pass runs after scheduling and register allocation. This pass only
31bf215546Sopenharmony_ci * inserts NOPs with the appropriate flow control modifiers. It should be
32bf215546Sopenharmony_ci * followed by a cleanup pass to merge flow control modifiers on adjacent
33bf215546Sopenharmony_ci * instructions, eliminating the NOPs. This decouples optimization from
34bf215546Sopenharmony_ci * correctness, simplifying both passes.
35bf215546Sopenharmony_ci *
 * This pass is responsible for calculating dependencies, according to the
 * rules:
 *
 * 1. An instruction that depends on the results of a previous asynchronous
 *    instruction must first wait for that instruction's slot, unless all
 *    reaching code paths already depended on it.
 * 2. More generally, any dependencies must be encoded. This includes
 *    Write-After-Write and Write-After-Read hazards with LOAD/STORE to memory.
 * 3. The shader must wait on slot #6 before running BLEND, ATEST
 * 4. The shader must wait on slot #7 before running BLEND, ST_TILE
 * 5. BARRIER must wait on every active slot.
47bf215546Sopenharmony_ci *
48bf215546Sopenharmony_ci * Unlike Bifrost, it is not necessary to worry about outbound staging
49bf215546Sopenharmony_ci * registers, as the hardware stalls reading staging registers when issuing
50bf215546Sopenharmony_ci * asynchronous instructions. So we don't track reads in our model of the
51bf215546Sopenharmony_ci * hardware scoreboard. This makes things a bit simpler.
52bf215546Sopenharmony_ci *
53bf215546Sopenharmony_ci * We may reuse slots for multiple asynchronous instructions, though there may
54bf215546Sopenharmony_ci * be a performance penalty.
55bf215546Sopenharmony_ci */
56bf215546Sopenharmony_ci
57bf215546Sopenharmony_ci#define BI_NUM_REGISTERS 64
58bf215546Sopenharmony_ci
59bf215546Sopenharmony_ci/*
60bf215546Sopenharmony_ci * Insert a NOP instruction with given flow control.
61bf215546Sopenharmony_ci */
62bf215546Sopenharmony_cistatic void
63bf215546Sopenharmony_cibi_flow(bi_context *ctx, bi_cursor cursor, enum va_flow flow)
64bf215546Sopenharmony_ci{
65bf215546Sopenharmony_ci   bi_builder b = bi_init_builder(ctx, cursor);
66bf215546Sopenharmony_ci
67bf215546Sopenharmony_ci   bi_nop(&b)->flow = flow;
68bf215546Sopenharmony_ci}
69bf215546Sopenharmony_ci
70bf215546Sopenharmony_cistatic uint64_t
71bf215546Sopenharmony_cibi_read_mask(bi_instr *I)
72bf215546Sopenharmony_ci{
73bf215546Sopenharmony_ci   uint64_t mask = 0;
74bf215546Sopenharmony_ci
75bf215546Sopenharmony_ci   bi_foreach_src(I, s) {
76bf215546Sopenharmony_ci      if (I->src[s].type == BI_INDEX_REGISTER) {
77bf215546Sopenharmony_ci         unsigned reg = I->src[s].value;
78bf215546Sopenharmony_ci         unsigned count = bi_count_read_registers(I, s);
79bf215546Sopenharmony_ci
80bf215546Sopenharmony_ci         mask |= (BITFIELD64_MASK(count) << reg);
81bf215546Sopenharmony_ci      }
82bf215546Sopenharmony_ci   }
83bf215546Sopenharmony_ci
84bf215546Sopenharmony_ci   return mask;
85bf215546Sopenharmony_ci}
86bf215546Sopenharmony_ci
87bf215546Sopenharmony_cistatic uint64_t
88bf215546Sopenharmony_cibi_write_mask(bi_instr *I)
89bf215546Sopenharmony_ci{
90bf215546Sopenharmony_ci   uint64_t mask = 0;
91bf215546Sopenharmony_ci
92bf215546Sopenharmony_ci   bi_foreach_dest(I, d) {
93bf215546Sopenharmony_ci      if (bi_is_null(I->dest[d])) continue;
94bf215546Sopenharmony_ci
95bf215546Sopenharmony_ci      assert(I->dest[d].type == BI_INDEX_REGISTER);
96bf215546Sopenharmony_ci
97bf215546Sopenharmony_ci      unsigned reg = I->dest[d].value;
98bf215546Sopenharmony_ci      unsigned count = bi_count_write_registers(I, d);
99bf215546Sopenharmony_ci
100bf215546Sopenharmony_ci      mask |= (BITFIELD64_MASK(count) << reg);
101bf215546Sopenharmony_ci   }
102bf215546Sopenharmony_ci
103bf215546Sopenharmony_ci   return mask;
104bf215546Sopenharmony_ci}
105bf215546Sopenharmony_ci
106bf215546Sopenharmony_cistatic bool
107bf215546Sopenharmony_cibi_ld_vary_writes_hidden_register(const bi_instr *I)
108bf215546Sopenharmony_ci{
109bf215546Sopenharmony_ci   /* Only varying loads can write the hidden register */
110bf215546Sopenharmony_ci   if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_VARYING)
111bf215546Sopenharmony_ci      return false;
112bf215546Sopenharmony_ci
113bf215546Sopenharmony_ci   /* They only write in some update modes */
114bf215546Sopenharmony_ci   return (I->update == BI_UPDATE_STORE) || (I->update == BI_UPDATE_CLOBBER);
115bf215546Sopenharmony_ci}
116bf215546Sopenharmony_ci
117bf215546Sopenharmony_cistatic bool
118bf215546Sopenharmony_cibi_is_memory_access(const bi_instr *I)
119bf215546Sopenharmony_ci{
120bf215546Sopenharmony_ci   /* On the attribute unit but functionally a general memory load */
121bf215546Sopenharmony_ci   if (I->op == BI_OPCODE_LD_ATTR_TEX)
122bf215546Sopenharmony_ci      return true;
123bf215546Sopenharmony_ci
124bf215546Sopenharmony_ci   /* UBOs are read-only so there are no ordering constriants */
125bf215546Sopenharmony_ci   if (I->seg == BI_SEG_UBO)
126bf215546Sopenharmony_ci      return false;
127bf215546Sopenharmony_ci
128bf215546Sopenharmony_ci   switch (bi_opcode_props[I->op].message) {
129bf215546Sopenharmony_ci   case BIFROST_MESSAGE_LOAD:
130bf215546Sopenharmony_ci   case BIFROST_MESSAGE_STORE:
131bf215546Sopenharmony_ci   case BIFROST_MESSAGE_ATOMIC:
132bf215546Sopenharmony_ci      return true;
133bf215546Sopenharmony_ci   default:
134bf215546Sopenharmony_ci      return false;
135bf215546Sopenharmony_ci   }
136bf215546Sopenharmony_ci}
137bf215546Sopenharmony_ci
138bf215546Sopenharmony_ci/* Update the scoreboard model to assign an instruction to a given slot */
139bf215546Sopenharmony_ci
140bf215546Sopenharmony_cistatic void
141bf215546Sopenharmony_cibi_push_instr(struct bi_scoreboard_state *st, bi_instr *I)
142bf215546Sopenharmony_ci{
143bf215546Sopenharmony_ci   if (bi_opcode_props[I->op].sr_write)
144bf215546Sopenharmony_ci      st->write[I->slot] |= bi_write_mask(I);
145bf215546Sopenharmony_ci
146bf215546Sopenharmony_ci   if (bi_is_memory_access(I))
147bf215546Sopenharmony_ci      st->memory |= BITFIELD_BIT(I->slot);
148bf215546Sopenharmony_ci
149bf215546Sopenharmony_ci   if (bi_opcode_props[I->op].message == BIFROST_MESSAGE_VARYING)
150bf215546Sopenharmony_ci      st->varying |= BITFIELD_BIT(I->slot);
151bf215546Sopenharmony_ci}
152bf215546Sopenharmony_ci
153bf215546Sopenharmony_cistatic uint8_t MUST_CHECK
154bf215546Sopenharmony_cibi_pop_slot(struct bi_scoreboard_state *st, unsigned slot)
155bf215546Sopenharmony_ci{
156bf215546Sopenharmony_ci   st->write[slot] = 0;
157bf215546Sopenharmony_ci   st->varying &= ~BITFIELD_BIT(slot);
158bf215546Sopenharmony_ci   st->memory &= ~BITFIELD_BIT(slot);
159bf215546Sopenharmony_ci
160bf215546Sopenharmony_ci   return BITFIELD_BIT(slot);
161bf215546Sopenharmony_ci}
162bf215546Sopenharmony_ci
163bf215546Sopenharmony_ci/* Adds a dependency on each slot writing any specified register */
164bf215546Sopenharmony_ci
165bf215546Sopenharmony_cistatic uint8_t MUST_CHECK
166bf215546Sopenharmony_cibi_depend_on_writers(struct bi_scoreboard_state *st, uint64_t regmask)
167bf215546Sopenharmony_ci{
168bf215546Sopenharmony_ci   uint8_t slots = 0;
169bf215546Sopenharmony_ci
170bf215546Sopenharmony_ci   for (unsigned slot = 0; slot < ARRAY_SIZE(st->write); ++slot) {
171bf215546Sopenharmony_ci      if (st->write[slot] & regmask)
172bf215546Sopenharmony_ci         slots |= bi_pop_slot(st, slot);
173bf215546Sopenharmony_ci   }
174bf215546Sopenharmony_ci
175bf215546Sopenharmony_ci   return slots;
176bf215546Sopenharmony_ci}
177bf215546Sopenharmony_ci
178bf215546Sopenharmony_ci/* Sets the dependencies for a given clause, updating the model */
179bf215546Sopenharmony_ci
180bf215546Sopenharmony_cistatic void
181bf215546Sopenharmony_cibi_set_dependencies(bi_block *block, bi_instr *I, struct bi_scoreboard_state *st)
182bf215546Sopenharmony_ci{
183bf215546Sopenharmony_ci   /* Depend on writers to handle read-after-write and write-after-write
184bf215546Sopenharmony_ci    * dependencies. Write-after-read dependencies are handled in the hardware
185bf215546Sopenharmony_ci    * where necessary, so we don't worry about them.
186bf215546Sopenharmony_ci    */
187bf215546Sopenharmony_ci   I->flow |= bi_depend_on_writers(st, bi_read_mask(I) | bi_write_mask(I));
188bf215546Sopenharmony_ci
189bf215546Sopenharmony_ci   /* Handle write-after-write and write-after-read dependencies for the varying
190bf215546Sopenharmony_ci    * hidden registers. Read-after-write dependencies handled in hardware.
191bf215546Sopenharmony_ci    */
192bf215546Sopenharmony_ci   if (bi_ld_vary_writes_hidden_register(I)) {
193bf215546Sopenharmony_ci      u_foreach_bit(slot, st->varying)
194bf215546Sopenharmony_ci         I->flow |= bi_pop_slot(st, slot);
195bf215546Sopenharmony_ci   }
196bf215546Sopenharmony_ci
197bf215546Sopenharmony_ci   /* For now, serialize all memory access */
198bf215546Sopenharmony_ci   if (bi_is_memory_access(I)) {
199bf215546Sopenharmony_ci      u_foreach_bit(slot, st->memory)
200bf215546Sopenharmony_ci         I->flow |= bi_pop_slot(st, slot);
201bf215546Sopenharmony_ci   }
202bf215546Sopenharmony_ci
203bf215546Sopenharmony_ci   /* We need to wait for all general slots before a barrier. The reason is
204bf215546Sopenharmony_ci    * unknown. In theory, this is redundant, since the BARRIER instruction will
205bf215546Sopenharmony_ci    * be followed immediately by .wait which waits for all slots. However, that
206bf215546Sopenharmony_ci    * doesn't seem to work properly in practice.
207bf215546Sopenharmony_ci    *
208bf215546Sopenharmony_ci    * The DDK is observed to use the same workaround, going so far as
209bf215546Sopenharmony_ci    * introducing a NOP before a BARRIER at the beginning of a basic block when
210bf215546Sopenharmony_ci    * there are outstanding stores.
211bf215546Sopenharmony_ci    *
212bf215546Sopenharmony_ci    *     NOP.wait12
213bf215546Sopenharmony_ci    *     BARRIER.slot7.wait
214bf215546Sopenharmony_ci    *
215bf215546Sopenharmony_ci    * Luckily, this situation is pretty rare. The wait introduced here can
216bf215546Sopenharmony_ci    * usually be merged into the preceding instruction.
217bf215546Sopenharmony_ci    *
218bf215546Sopenharmony_ci    * We also use the same workaround to serialize all async instructions when
219bf215546Sopenharmony_ci    * debugging this pass with the BIFROST_MESA_DEBUG=nosb option.
220bf215546Sopenharmony_ci    */
221bf215546Sopenharmony_ci   if (I->op == BI_OPCODE_BARRIER || (bifrost_debug & BIFROST_DBG_NOSB)) {
222bf215546Sopenharmony_ci      for (unsigned i = 0; i < VA_NUM_GENERAL_SLOTS; ++i) {
223bf215546Sopenharmony_ci         if (st->write[i] || ((st->varying | st->memory) & BITFIELD_BIT(i)))
224bf215546Sopenharmony_ci            I->flow |= bi_pop_slot(st, i);
225bf215546Sopenharmony_ci      }
226bf215546Sopenharmony_ci   }
227bf215546Sopenharmony_ci}
228bf215546Sopenharmony_ci
229bf215546Sopenharmony_cistatic bool
230bf215546Sopenharmony_ciscoreboard_block_update(bi_context *ctx, bi_block *blk)
231bf215546Sopenharmony_ci{
232bf215546Sopenharmony_ci   bool progress = false;
233bf215546Sopenharmony_ci
234bf215546Sopenharmony_ci   /* pending_in[s] = sum { p in pred[s] } ( pending_out[p] ) */
235bf215546Sopenharmony_ci   bi_foreach_predecessor(blk, pred) {
236bf215546Sopenharmony_ci      for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) {
237bf215546Sopenharmony_ci         blk->scoreboard_in.read[i] |= (*pred)->scoreboard_out.read[i];
238bf215546Sopenharmony_ci         blk->scoreboard_in.write[i] |= (*pred)->scoreboard_out.write[i];
239bf215546Sopenharmony_ci         blk->scoreboard_in.varying |= (*pred)->scoreboard_out.varying;
240bf215546Sopenharmony_ci         blk->scoreboard_in.memory |= (*pred)->scoreboard_out.memory;
241bf215546Sopenharmony_ci      }
242bf215546Sopenharmony_ci   }
243bf215546Sopenharmony_ci
244bf215546Sopenharmony_ci   struct bi_scoreboard_state state = blk->scoreboard_in;
245bf215546Sopenharmony_ci
246bf215546Sopenharmony_ci   /* Assign locally */
247bf215546Sopenharmony_ci
248bf215546Sopenharmony_ci   bi_foreach_instr_in_block(blk, I) {
249bf215546Sopenharmony_ci      bi_set_dependencies(blk, I, &state);
250bf215546Sopenharmony_ci      bi_push_instr(&state, I);
251bf215546Sopenharmony_ci   }
252bf215546Sopenharmony_ci
253bf215546Sopenharmony_ci   /* Insert a wait for varyings at the end of the block.
254bf215546Sopenharmony_ci    *
255bf215546Sopenharmony_ci    * A varying load with .store has to wait for all other varying loads
256bf215546Sopenharmony_ci    * in the quad to complete. The bad case looks like:
257bf215546Sopenharmony_ci    *
258bf215546Sopenharmony_ci    *    if (dynamic) {
259bf215546Sopenharmony_ci    *        x = ld_var()
260bf215546Sopenharmony_ci    *    } else {
261bf215546Sopenharmony_ci    *       x = ld_var()
262bf215546Sopenharmony_ci    *    }
263bf215546Sopenharmony_ci    *
264bf215546Sopenharmony_ci    * Logically, a given thread executes only a single ld_var instruction. But
265bf215546Sopenharmony_ci    * if the quad diverges, the second ld_var has to wait for the first ld_var.
266bf215546Sopenharmony_ci    * For correct handling, we need to maintain a physical control flow graph
267bf215546Sopenharmony_ci    * and do the dataflow analysis on that instead of the logical control flow
268bf215546Sopenharmony_ci    * graph. However, this probably doesn't matter much in practice. This seems
269bf215546Sopenharmony_ci    * like a decent compromise for now.
270bf215546Sopenharmony_ci    *
271bf215546Sopenharmony_ci    * TODO: Consider optimizing this case.
272bf215546Sopenharmony_ci    */
273bf215546Sopenharmony_ci   if (state.varying) {
274bf215546Sopenharmony_ci      uint8_t flow = 0;
275bf215546Sopenharmony_ci
276bf215546Sopenharmony_ci      u_foreach_bit(slot, state.varying)
277bf215546Sopenharmony_ci         flow |= bi_pop_slot(&state, slot);
278bf215546Sopenharmony_ci
279bf215546Sopenharmony_ci      bi_flow(ctx, bi_after_block(blk), flow);
280bf215546Sopenharmony_ci   }
281bf215546Sopenharmony_ci
282bf215546Sopenharmony_ci   /* To figure out progress, diff scoreboard_out */
283bf215546Sopenharmony_ci   progress = !!memcmp(&state, &blk->scoreboard_out, sizeof(state));
284bf215546Sopenharmony_ci
285bf215546Sopenharmony_ci   blk->scoreboard_out = state;
286bf215546Sopenharmony_ci
287bf215546Sopenharmony_ci   return progress;
288bf215546Sopenharmony_ci}
289bf215546Sopenharmony_ci
290bf215546Sopenharmony_cistatic void
291bf215546Sopenharmony_civa_assign_scoreboard(bi_context *ctx)
292bf215546Sopenharmony_ci{
293bf215546Sopenharmony_ci   u_worklist worklist;
294bf215546Sopenharmony_ci   bi_worklist_init(ctx, &worklist);
295bf215546Sopenharmony_ci
296bf215546Sopenharmony_ci   bi_foreach_block(ctx, block) {
297bf215546Sopenharmony_ci      bi_worklist_push_tail(&worklist, block);
298bf215546Sopenharmony_ci   }
299bf215546Sopenharmony_ci
300bf215546Sopenharmony_ci   /* Perform forward data flow analysis to calculate dependencies */
301bf215546Sopenharmony_ci   while (!u_worklist_is_empty(&worklist)) {
302bf215546Sopenharmony_ci      /* Pop from the front for forward analysis */
303bf215546Sopenharmony_ci      bi_block *blk = bi_worklist_pop_head(&worklist);
304bf215546Sopenharmony_ci
305bf215546Sopenharmony_ci      if (scoreboard_block_update(ctx, blk)) {
306bf215546Sopenharmony_ci         bi_foreach_successor(blk, succ)
307bf215546Sopenharmony_ci            bi_worklist_push_tail(&worklist, succ);
308bf215546Sopenharmony_ci      }
309bf215546Sopenharmony_ci   }
310bf215546Sopenharmony_ci
311bf215546Sopenharmony_ci   u_worklist_fini(&worklist);
312bf215546Sopenharmony_ci}
313bf215546Sopenharmony_ci
314bf215546Sopenharmony_ci/*
315bf215546Sopenharmony_ci * Determine if execution should terminate after a given block. Execution cannot
316bf215546Sopenharmony_ci * terminate within a basic block.
317bf215546Sopenharmony_ci */
318bf215546Sopenharmony_cistatic bool
319bf215546Sopenharmony_civa_should_end(bi_block *block)
320bf215546Sopenharmony_ci{
321bf215546Sopenharmony_ci   /* Don't return if we're succeeded by instructions */
322bf215546Sopenharmony_ci   for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
323bf215546Sopenharmony_ci      bi_block *succ = block->successors[i];
324bf215546Sopenharmony_ci
325bf215546Sopenharmony_ci      if (succ)
326bf215546Sopenharmony_ci         return false;
327bf215546Sopenharmony_ci   }
328bf215546Sopenharmony_ci
329bf215546Sopenharmony_ci   return true;
330bf215546Sopenharmony_ci}
331bf215546Sopenharmony_ci
332bf215546Sopenharmony_ci/*
333bf215546Sopenharmony_ci * We should discard helper invocations as soon as helper invocations die after
334bf215546Sopenharmony_ci * their last use. Either they die after an instruction using helper
335bf215546Sopenharmony_ci * invocations, or they die along a control flow edge. The former is handled by
336bf215546Sopenharmony_ci * discarding appropriately after instructions. The latter is handled by
337bf215546Sopenharmony_ci * inserting a discard at the _start_ of some blocks:
338bf215546Sopenharmony_ci *
339bf215546Sopenharmony_ci * Lemma: If a non-critical edge discards helpers, it is the only edge that
340bf215546Sopenharmony_ci * enters its destination.
341bf215546Sopenharmony_ci *
342bf215546Sopenharmony_ci * Proof: An edge discards helpers if helpers are live at the end of the source
343bf215546Sopenharmony_ci * block and dead at the start of the destination block. By definition, helpers
344bf215546Sopenharmony_ci * are live at the end of a block iff they are live at the start of some
345bf215546Sopenharmony_ci * successor of a block. The source block therefore has a successor with helpers
346bf215546Sopenharmony_ci * live at the start and a successor with helpers dead at the start. As the
347bf215546Sopenharmony_ci * source block has at least two successors, the edge is NOT the only edge
348bf215546Sopenharmony_ci * exiting its source. Hence it is the only edge entering the destination,
349bf215546Sopenharmony_ci * otherwise the edge would be critical.
350bf215546Sopenharmony_ci *
351bf215546Sopenharmony_ci * By corrollary, we may handle discards on control flow edges by discarding at
352bf215546Sopenharmony_ci * the start of blocks with a single predecessor.
353bf215546Sopenharmony_ci *
354bf215546Sopenharmony_ci * This routine tests if a block should discard helper invocations at its start.
355bf215546Sopenharmony_ci */
356bf215546Sopenharmony_cistatic bool
357bf215546Sopenharmony_civa_discard_before_block(bi_block *block)
358bf215546Sopenharmony_ci{
359bf215546Sopenharmony_ci   /* Do not discard if the block requires helpers at the start */
360bf215546Sopenharmony_ci   if (block->pass_flags)
361bf215546Sopenharmony_ci      return false;
362bf215546Sopenharmony_ci
363bf215546Sopenharmony_ci   /* By the lemma, if we need to discard, there is a unique predecessor */
364bf215546Sopenharmony_ci   if (bi_num_predecessors(block) != 1)
365bf215546Sopenharmony_ci      return false;
366bf215546Sopenharmony_ci
367bf215546Sopenharmony_ci   bi_block *pred = *util_dynarray_element(&block->predecessors, bi_block *, 0);
368bf215546Sopenharmony_ci
369bf215546Sopenharmony_ci   /* Discard if helpers are live at the end of the predecessor, due to helpers
370bf215546Sopenharmony_ci    * live at the start of some (other) successor.
371bf215546Sopenharmony_ci    */
372bf215546Sopenharmony_ci   bi_foreach_successor(pred, succ) {
373bf215546Sopenharmony_ci      if (succ->pass_flags)
374bf215546Sopenharmony_ci         return true;
375bf215546Sopenharmony_ci   }
376bf215546Sopenharmony_ci
377bf215546Sopenharmony_ci   return false;
378bf215546Sopenharmony_ci}
379bf215546Sopenharmony_ci
380bf215546Sopenharmony_ci/*
381bf215546Sopenharmony_ci * Test if a program is empty, in the sense of having zero instructions. Empty
382bf215546Sopenharmony_ci * shaders get special handling.
383bf215546Sopenharmony_ci */
384bf215546Sopenharmony_cistatic bool
385bf215546Sopenharmony_cibi_is_empty(bi_context *ctx)
386bf215546Sopenharmony_ci{
387bf215546Sopenharmony_ci   bi_foreach_instr_global(ctx, _)
388bf215546Sopenharmony_ci      return false;
389bf215546Sopenharmony_ci
390bf215546Sopenharmony_ci   return true;
391bf215546Sopenharmony_ci}
392bf215546Sopenharmony_ci
393bf215546Sopenharmony_ci/*
394bf215546Sopenharmony_ci * Given a program with no flow control modifiers, insert NOPs signaling the
395bf215546Sopenharmony_ci * required flow control. Not much optimization happens here.
396bf215546Sopenharmony_ci */
397bf215546Sopenharmony_civoid
398bf215546Sopenharmony_civa_insert_flow_control_nops(bi_context *ctx)
399bf215546Sopenharmony_ci{
400bf215546Sopenharmony_ci   /* Special case: if a program is empty, leave it empty. In particular, do not
401bf215546Sopenharmony_ci    * insert NOP.end. There is special handling in the driver for skipping empty
402bf215546Sopenharmony_ci    * shaders for shader stage. The .end is not necessary and disrupts
403bf215546Sopenharmony_ci    * optimizations.
404bf215546Sopenharmony_ci    */
405bf215546Sopenharmony_ci   if (bi_is_empty(ctx))
406bf215546Sopenharmony_ci      return;
407bf215546Sopenharmony_ci
408bf215546Sopenharmony_ci   /* First do dataflow analysis for the scoreboard. This populates I->flow with
409bf215546Sopenharmony_ci    * a bitmap of slots to wait on.
410bf215546Sopenharmony_ci    *
411bf215546Sopenharmony_ci    * Also do dataflow analysis for helper invocations in fragment shaders. This
412bf215546Sopenharmony_ci    * populates block->pass_flags with helper invocation information.
413bf215546Sopenharmony_ci    */
414bf215546Sopenharmony_ci   va_assign_scoreboard(ctx);
415bf215546Sopenharmony_ci   bi_analyze_helper_terminate(ctx);
416bf215546Sopenharmony_ci
417bf215546Sopenharmony_ci   bi_foreach_block(ctx, block) {
418bf215546Sopenharmony_ci      /* Handle discards along control flow edges */
419bf215546Sopenharmony_ci      if (va_discard_before_block(block))
420bf215546Sopenharmony_ci         bi_flow(ctx, bi_before_block(block), VA_FLOW_DISCARD);
421bf215546Sopenharmony_ci
422bf215546Sopenharmony_ci      bi_foreach_instr_in_block_safe(block, I) {
423bf215546Sopenharmony_ci         switch (I->op) {
424bf215546Sopenharmony_ci         /* Signal barriers immediately */
425bf215546Sopenharmony_ci         case BI_OPCODE_BARRIER:
426bf215546Sopenharmony_ci            bi_flow(ctx, bi_after_instr(I), VA_FLOW_WAIT);
427bf215546Sopenharmony_ci            break;
428bf215546Sopenharmony_ci
429bf215546Sopenharmony_ci         /* Insert waits for tilebuffer and depth/stencil instructions. These
430bf215546Sopenharmony_ci          * only happen in regular fragment shaders, as the required waits are
431bf215546Sopenharmony_ci          * assumed to already have happened in blend shaders.
432bf215546Sopenharmony_ci          *
433bf215546Sopenharmony_ci          * For discarded thread handling, ATEST must be serialized against all
434bf215546Sopenharmony_ci          * other asynchronous instructions and should be serialized against all
435bf215546Sopenharmony_ci          * instructions. Wait for slot 0 immediately after the ATEST.
436bf215546Sopenharmony_ci          */
437bf215546Sopenharmony_ci         case BI_OPCODE_BLEND:
438bf215546Sopenharmony_ci         case BI_OPCODE_LD_TILE:
439bf215546Sopenharmony_ci         case BI_OPCODE_ST_TILE:
440bf215546Sopenharmony_ci            if (!ctx->inputs->is_blend)
441bf215546Sopenharmony_ci               bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT);
442bf215546Sopenharmony_ci            break;
443bf215546Sopenharmony_ci         case BI_OPCODE_ATEST:
444bf215546Sopenharmony_ci            bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT0126);
445bf215546Sopenharmony_ci            bi_flow(ctx, bi_after_instr(I), VA_FLOW_WAIT0);
446bf215546Sopenharmony_ci            break;
447bf215546Sopenharmony_ci         case BI_OPCODE_ZS_EMIT:
448bf215546Sopenharmony_ci            if (!ctx->inputs->is_blend)
449bf215546Sopenharmony_ci               bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT0126);
450bf215546Sopenharmony_ci            break;
451bf215546Sopenharmony_ci
452bf215546Sopenharmony_ci         default:
453bf215546Sopenharmony_ci            break;
454bf215546Sopenharmony_ci         }
455bf215546Sopenharmony_ci
456bf215546Sopenharmony_ci         if (I->flow && I->op != BI_OPCODE_NOP) {
457bf215546Sopenharmony_ci            /* Wait on the results of asynchronous instructions
458bf215546Sopenharmony_ci             *
459bf215546Sopenharmony_ci             * Bitmap of general slots lines up with the encoding of va_flow for
460bf215546Sopenharmony_ci             * waits on general slots. The dataflow analysis should be ignoring
461bf215546Sopenharmony_ci             * the special slots #6 and #7, which are handled separately.
462bf215546Sopenharmony_ci             */
463bf215546Sopenharmony_ci            assert((I->flow & ~BITFIELD_MASK(VA_NUM_GENERAL_SLOTS)) == 0);
464bf215546Sopenharmony_ci
465bf215546Sopenharmony_ci            bi_flow(ctx, bi_before_instr(I), I->flow);
466bf215546Sopenharmony_ci            I->flow = 0;
467bf215546Sopenharmony_ci         }
468bf215546Sopenharmony_ci      }
469bf215546Sopenharmony_ci
470bf215546Sopenharmony_ci      /* Terminate helpers after the last use */
471bf215546Sopenharmony_ci      if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend &&
472bf215546Sopenharmony_ci          block->pass_flags && bi_block_terminates_helpers(block)) {
473bf215546Sopenharmony_ci
474bf215546Sopenharmony_ci         bi_foreach_instr_in_block_safe_rev(block, I) {
475bf215546Sopenharmony_ci            if (bi_instr_uses_helpers(I)) {
476bf215546Sopenharmony_ci               bi_flow(ctx, bi_after_instr(I), VA_FLOW_DISCARD);
477bf215546Sopenharmony_ci               break;
478bf215546Sopenharmony_ci            }
479bf215546Sopenharmony_ci         }
480bf215546Sopenharmony_ci      }
481bf215546Sopenharmony_ci
482bf215546Sopenharmony_ci      /* End exeuction at the end of the block if needed, or reconverge if we
483bf215546Sopenharmony_ci       * continue but we don't need to end execution.
484bf215546Sopenharmony_ci       */
485bf215546Sopenharmony_ci      if (va_should_end(block) || block->needs_nop) {
486bf215546Sopenharmony_ci         /* Don't bother adding a NOP into an unreachable block */
487bf215546Sopenharmony_ci         if (block == bi_start_block(&ctx->blocks) || bi_num_predecessors(block))
488bf215546Sopenharmony_ci            bi_flow(ctx, bi_after_block(block), VA_FLOW_END);
489bf215546Sopenharmony_ci      } else if (bi_reconverge_branches(block)) {
490bf215546Sopenharmony_ci         /* TODO: Do we have ever need to reconverge from an empty block? */
491bf215546Sopenharmony_ci         if (!list_is_empty(&block->instructions))
492bf215546Sopenharmony_ci            bi_flow(ctx, bi_after_block(block), VA_FLOW_RECONVERGE);
493bf215546Sopenharmony_ci      }
494bf215546Sopenharmony_ci   }
495bf215546Sopenharmony_ci
496bf215546Sopenharmony_ci   /* If helpers are not used anywhere, they are not used at the start, so we
497bf215546Sopenharmony_ci    * terminate at the start. Else, helpers are used somewhere in the shader and
498bf215546Sopenharmony_ci    * are terminated after last use.
499bf215546Sopenharmony_ci    */
500bf215546Sopenharmony_ci   bi_block *start = bi_start_block(&ctx->blocks);
501bf215546Sopenharmony_ci   bool frag = (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend);
502bf215546Sopenharmony_ci
503bf215546Sopenharmony_ci   if (frag && !start->pass_flags)
504bf215546Sopenharmony_ci      bi_flow(ctx, bi_before_block(start), VA_FLOW_DISCARD);
505bf215546Sopenharmony_ci}
506bf215546Sopenharmony_ci
507bf215546Sopenharmony_ci/*
508bf215546Sopenharmony_ci * Assign slots to all asynchronous instructions. A few special instructions
509bf215546Sopenharmony_ci * require specific slots. For the rest, we assign slots in a round-robin
510bf215546Sopenharmony_ci * fashion to reduce false dependencies when encoding waits.
511bf215546Sopenharmony_ci *
512bf215546Sopenharmony_ci * This should be called before va_insert_flow_control_nops.
513bf215546Sopenharmony_ci */
514bf215546Sopenharmony_civoid
515bf215546Sopenharmony_civa_assign_slots(bi_context *ctx)
516bf215546Sopenharmony_ci{
517bf215546Sopenharmony_ci   unsigned counter = 0;
518bf215546Sopenharmony_ci
519bf215546Sopenharmony_ci   bi_foreach_instr_global(ctx, I) {
520bf215546Sopenharmony_ci      if (I->op == BI_OPCODE_BARRIER) {
521bf215546Sopenharmony_ci         I->slot = 7;
522bf215546Sopenharmony_ci      } else if (I->op == BI_OPCODE_ZS_EMIT || I->op == BI_OPCODE_ATEST) {
523bf215546Sopenharmony_ci         I->slot = 0;
524bf215546Sopenharmony_ci      } else if (bi_opcode_props[I->op].message) {
525bf215546Sopenharmony_ci         I->slot = counter++;
526bf215546Sopenharmony_ci
527bf215546Sopenharmony_ci         if (counter == 3)
528bf215546Sopenharmony_ci            counter = 0;
529bf215546Sopenharmony_ci      }
530bf215546Sopenharmony_ci   }
531bf215546Sopenharmony_ci}
532