/*
 * Copyright (C) 2022 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "va_compiler.h"
#include "valhall_enums.h"
#include "bi_builder.h"

/*
 * Insert flow control into a scheduled and register allocated shader. This
 * pass runs after scheduling and register allocation, and it only inserts
 * NOPs with the appropriate flow control modifiers. It should be followed by
 * a cleanup pass that merges flow control modifiers into adjacent
 * instructions, eliminating the NOPs. This decouples optimization from
 * correctness, simplifying both passes.
 *
 * This pass is responsible for calculating dependencies, according to the
 * rules:
 *
 * 1. An instruction that depends on the results of a previous asynchronous
 *    instruction must first wait for that instruction's slot, unless all
 *    reaching code paths already depended on it.
 * 2. More generally, any dependencies must be encoded. This includes
 *    Write-After-Write and Write-After-Read hazards with LOAD/STORE to memory.
 * 3. The shader must wait on slot #6 before running BLEND, ATEST.
 * 4. The shader must wait on slot #7 before running BLEND, ST_TILE.
 * 5. BARRIER must wait on every active slot.
 *
 * Unlike Bifrost, it is not necessary to worry about outbound staging
 * registers, as the hardware stalls reading staging registers when issuing
 * asynchronous instructions. So we don't track reads in our model of the
 * hardware scoreboard. This makes things a bit simpler.
 *
 * We may reuse slots for multiple asynchronous instructions, though there may
 * be a performance penalty.
 */
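/*
 * As an illustrative sketch (not actual compiler output), rule 1 means a
 * consumer of an asynchronous result gets a wait on the producer's slot
 * before it executes:
 *
 *     LOAD.i32.slot0 r0, ...
 *     ...
 *     NOP.wait0              <- inserted by this pass
 *     IADD.i32 r1, r0, r2    <- reads r0, written asynchronously by the LOAD
 *
 * The cleanup pass mentioned above would then fold the .wait0 into the IADD
 * itself and delete the NOP.
 */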

#define BI_NUM_REGISTERS 64

/*
 * Insert a NOP instruction with given flow control.
 */
static void
bi_flow(bi_context *ctx, bi_cursor cursor, enum va_flow flow)
{
   bi_builder b = bi_init_builder(ctx, cursor);

   bi_nop(&b)->flow = flow;
}

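/*
 * Calculate the mask of hardware registers read by an instruction. As an
 * example of the arithmetic: a source at register r4 reading two consecutive
 * registers contributes BITFIELD64_MASK(2) << 4, i.e. bits 4 and 5.
 */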
static uint64_t
bi_read_mask(bi_instr *I)
{
   uint64_t mask = 0;

   bi_foreach_src(I, s) {
      if (I->src[s].type == BI_INDEX_REGISTER) {
         unsigned reg = I->src[s].value;
         unsigned count = bi_count_read_registers(I, s);

         mask |= (BITFIELD64_MASK(count) << reg);
      }
   }

   return mask;
}

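/*
 * Likewise, calculate the mask of hardware registers written by an
 * instruction. After register allocation, every non-null destination is
 * expected to be a register, as the assert below checks.
 */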
static uint64_t
bi_write_mask(bi_instr *I)
{
   uint64_t mask = 0;

   bi_foreach_dest(I, d) {
      if (bi_is_null(I->dest[d])) continue;

      assert(I->dest[d].type == BI_INDEX_REGISTER);

      unsigned reg = I->dest[d].value;
      unsigned count = bi_count_write_registers(I, d);

      mask |= (BITFIELD64_MASK(count) << reg);
   }

   return mask;
}

static bool
bi_ld_vary_writes_hidden_register(const bi_instr *I)
{
   /* Only varying loads can write the hidden register */
   if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_VARYING)
      return false;

   /* They only write in some update modes */
   return (I->update == BI_UPDATE_STORE) || (I->update == BI_UPDATE_CLOBBER);
}

static bool
bi_is_memory_access(const bi_instr *I)
{
   /* On the attribute unit but functionally a general memory load */
   if (I->op == BI_OPCODE_LD_ATTR_TEX)
      return true;

   /* UBOs are read-only so there are no ordering constraints */
   if (I->seg == BI_SEG_UBO)
      return false;

   switch (bi_opcode_props[I->op].message) {
   case BIFROST_MESSAGE_LOAD:
   case BIFROST_MESSAGE_STORE:
   case BIFROST_MESSAGE_ATOMIC:
      return true;
   default:
      return false;
   }
}

/* Update the scoreboard model to assign an instruction to a given slot */

static void
bi_push_instr(struct bi_scoreboard_state *st, bi_instr *I)
{
   if (bi_opcode_props[I->op].sr_write)
      st->write[I->slot] |= bi_write_mask(I);

   if (bi_is_memory_access(I))
      st->memory |= BITFIELD_BIT(I->slot);

   if (bi_opcode_props[I->op].message == BIFROST_MESSAGE_VARYING)
      st->varying |= BITFIELD_BIT(I->slot);
}

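/*
 * Clear a slot from the scoreboard model, returning the single-bit wait mask
 * for that slot. Callers OR the result into an instruction's flow field to
 * encode the wait.
 */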
static uint8_t MUST_CHECK
bi_pop_slot(struct bi_scoreboard_state *st, unsigned slot)
{
   st->write[slot] = 0;
   st->varying &= ~BITFIELD_BIT(slot);
   st->memory &= ~BITFIELD_BIT(slot);

   return BITFIELD_BIT(slot);
}

/* Adds a dependency on each slot writing any specified register */

static uint8_t MUST_CHECK
bi_depend_on_writers(struct bi_scoreboard_state *st, uint64_t regmask)
{
   uint8_t slots = 0;

   for (unsigned slot = 0; slot < ARRAY_SIZE(st->write); ++slot) {
      if (st->write[slot] & regmask)
         slots |= bi_pop_slot(st, slot);
   }

   return slots;
}

/* Sets the dependencies for a given instruction, updating the model */

static void
bi_set_dependencies(bi_block *block, bi_instr *I, struct bi_scoreboard_state *st)
{
   /* Depend on writers to handle read-after-write and write-after-write
    * dependencies. Write-after-read dependencies are handled in the hardware
    * where necessary, so we don't worry about them.
    */
   I->flow |= bi_depend_on_writers(st, bi_read_mask(I) | bi_write_mask(I));

   /* Handle write-after-write and write-after-read dependencies for the varying
    * hidden registers. Read-after-write dependencies are handled in hardware.
    */
   if (bi_ld_vary_writes_hidden_register(I)) {
      u_foreach_bit(slot, st->varying)
         I->flow |= bi_pop_slot(st, slot);
   }

   /* For now, serialize all memory access */
   if (bi_is_memory_access(I)) {
      u_foreach_bit(slot, st->memory)
         I->flow |= bi_pop_slot(st, slot);
   }

   /* We need to wait for all general slots before a barrier. The reason is
    * unknown. In theory, this is redundant, since the BARRIER instruction will
    * be followed immediately by .wait which waits for all slots. However, that
    * doesn't seem to work properly in practice.
    *
    * The DDK is observed to use the same workaround, going so far as
    * introducing a NOP before a BARRIER at the beginning of a basic block when
    * there are outstanding stores.
    *
    *     NOP.wait12
    *     BARRIER.slot7.wait
    *
    * Luckily, this situation is pretty rare. The wait introduced here can
    * usually be merged into the preceding instruction.
    *
    * We also use the same workaround to serialize all async instructions when
    * debugging this pass with the BIFROST_MESA_DEBUG=nosb option.
    */
   if (I->op == BI_OPCODE_BARRIER || (bifrost_debug & BIFROST_DBG_NOSB)) {
      for (unsigned i = 0; i < VA_NUM_GENERAL_SLOTS; ++i) {
         if (st->write[i] || ((st->varying | st->memory) & BITFIELD_BIT(i)))
            I->flow |= bi_pop_slot(st, i);
      }
   }
}

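/*
 * Compute dependencies for a single block: merge the scoreboard state flowing
 * in from predecessors, walk the block assigning per-instruction waits, and
 * report whether the outgoing state changed so the worklist converges to a
 * fixed point.
 */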
static bool
scoreboard_block_update(bi_context *ctx, bi_block *blk)
{
   bool progress = false;

   /* pending_in[s] = sum { p in pred[s] } ( pending_out[p] ) */
   bi_foreach_predecessor(blk, pred) {
      for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) {
         blk->scoreboard_in.read[i] |= (*pred)->scoreboard_out.read[i];
         blk->scoreboard_in.write[i] |= (*pred)->scoreboard_out.write[i];
         blk->scoreboard_in.varying |= (*pred)->scoreboard_out.varying;
         blk->scoreboard_in.memory |= (*pred)->scoreboard_out.memory;
      }
   }

   struct bi_scoreboard_state state = blk->scoreboard_in;

   /* Assign locally */

   bi_foreach_instr_in_block(blk, I) {
      bi_set_dependencies(blk, I, &state);
      bi_push_instr(&state, I);
   }

   /* Insert a wait for varyings at the end of the block.
    *
    * A varying load with .store has to wait for all other varying loads
    * in the quad to complete. The bad case looks like:
    *
    *    if (dynamic) {
    *       x = ld_var()
    *    } else {
    *       x = ld_var()
    *    }
    *
    * Logically, a given thread executes only a single ld_var instruction. But
    * if the quad diverges, the second ld_var has to wait for the first ld_var.
    * For correct handling, we would need to maintain a physical control flow
    * graph and do the dataflow analysis on that instead of the logical control
    * flow graph. However, this probably doesn't matter much in practice, so
    * waiting at the end of the block seems like a decent compromise for now.
    *
    * TODO: Consider optimizing this case.
    */
   if (state.varying) {
      uint8_t flow = 0;

      u_foreach_bit(slot, state.varying)
         flow |= bi_pop_slot(&state, slot);

      bi_flow(ctx, bi_after_block(blk), flow);
   }

   /* To figure out progress, diff scoreboard_out */
   progress = !!memcmp(&state, &blk->scoreboard_out, sizeof(state));

   blk->scoreboard_out = state;

   return progress;
}

static void
va_assign_scoreboard(bi_context *ctx)
{
   u_worklist worklist;
   bi_worklist_init(ctx, &worklist);

   bi_foreach_block(ctx, block) {
      bi_worklist_push_tail(&worklist, block);
   }

   /* Perform forward data flow analysis to calculate dependencies */
   while (!u_worklist_is_empty(&worklist)) {
      /* Pop from the front for forward analysis */
      bi_block *blk = bi_worklist_pop_head(&worklist);

      if (scoreboard_block_update(ctx, blk)) {
         bi_foreach_successor(blk, succ)
            bi_worklist_push_tail(&worklist, succ);
      }
   }

   u_worklist_fini(&worklist);
}

/*
 * Determine if execution should terminate after a given block. Execution cannot
 * terminate within a basic block.
 */
static bool
va_should_end(bi_block *block)
{
   /* Don't end execution if a successor block still needs to run */
   for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
      bi_block *succ = block->successors[i];

      if (succ)
         return false;
   }

   return true;
}

/*
 * We should discard helper invocations as soon as they die, which is after
 * their last use. Either they die after an instruction using helper
 * invocations, or they die along a control flow edge. The former is handled by
 * discarding appropriately after instructions. The latter is handled by
 * inserting a discard at the _start_ of some blocks:
 *
 * Lemma: If a non-critical edge discards helpers, it is the only edge that
 * enters its destination.
 *
 * Proof: An edge discards helpers if helpers are live at the end of the source
 * block and dead at the start of the destination block. By definition, helpers
 * are live at the end of a block iff they are live at the start of some
 * successor of a block. The source block therefore has a successor with helpers
 * live at the start and a successor with helpers dead at the start. As the
 * source block has at least two successors, the edge is NOT the only edge
 * exiting its source. Hence it is the only edge entering the destination,
 * otherwise the edge would be critical (a critical edge being one whose source
 * has multiple successors and whose destination has multiple predecessors).
 *
 * By corollary, we may handle discards on control flow edges by discarding at
 * the start of blocks with a single predecessor.
 *
 * This routine tests if a block should discard helper invocations at its start.
 */
static bool
va_discard_before_block(bi_block *block)
{
   /* Do not discard if the block requires helpers at the start */
   if (block->pass_flags)
      return false;

   /* By the lemma, if we need to discard, there is a unique predecessor */
   if (bi_num_predecessors(block) != 1)
      return false;

   bi_block *pred = *util_dynarray_element(&block->predecessors, bi_block *, 0);

   /* Discard if helpers are live at the end of the predecessor, due to helpers
    * live at the start of some (other) successor.
    */
   bi_foreach_successor(pred, succ) {
      if (succ->pass_flags)
         return true;
   }

   return false;
}

/*
 * Test if a program is empty, in the sense of having zero instructions. Empty
 * shaders get special handling.
 */
static bool
bi_is_empty(bi_context *ctx)
{
   bi_foreach_instr_global(ctx, _)
      return false;

   return true;
}

/*
 * Given a program with no flow control modifiers, insert NOPs signaling the
 * required flow control. Not much optimization happens here.
 */
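/*
 * As a rough sketch (not actual compiler output), the tail of a simple
 * fragment shader might pick up flow control NOPs along these lines:
 *
 *     NOP.wait0126
 *     ATEST.slot0 ...
 *     NOP.wait0
 *     NOP.wait
 *     BLEND ...
 *     NOP.end
 *
 * The cleanup pass described at the top of this file is expected to fold
 * these modifiers into the adjacent instructions.
 */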
void
va_insert_flow_control_nops(bi_context *ctx)
{
   /* Special case: if a program is empty, leave it empty. In particular, do not
    * insert NOP.end. There is special handling in the driver for skipping empty
    * shaders by shader stage. The .end is not necessary and disrupts
    * optimizations.
    */
   if (bi_is_empty(ctx))
      return;

   /* First do dataflow analysis for the scoreboard. This populates I->flow with
    * a bitmap of slots to wait on.
    *
    * Also do dataflow analysis for helper invocations in fragment shaders. This
    * populates block->pass_flags with helper invocation information.
    */
   va_assign_scoreboard(ctx);
   bi_analyze_helper_terminate(ctx);

   bi_foreach_block(ctx, block) {
      /* Handle discards along control flow edges */
      if (va_discard_before_block(block))
         bi_flow(ctx, bi_before_block(block), VA_FLOW_DISCARD);

      bi_foreach_instr_in_block_safe(block, I) {
         switch (I->op) {
         /* Signal barriers immediately */
         case BI_OPCODE_BARRIER:
            bi_flow(ctx, bi_after_instr(I), VA_FLOW_WAIT);
            break;

         /* Insert waits for tilebuffer and depth/stencil instructions. These
          * only happen in regular fragment shaders, as the required waits are
          * assumed to have already happened in blend shaders.
          *
          * For discarded thread handling, ATEST must be serialized against all
          * other asynchronous instructions and should be serialized against all
          * instructions. Wait for slot #0 immediately after the ATEST.
          */
         case BI_OPCODE_BLEND:
         case BI_OPCODE_LD_TILE:
         case BI_OPCODE_ST_TILE:
            if (!ctx->inputs->is_blend)
               bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT);
            break;
         case BI_OPCODE_ATEST:
            bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT0126);
            bi_flow(ctx, bi_after_instr(I), VA_FLOW_WAIT0);
            break;
         case BI_OPCODE_ZS_EMIT:
            if (!ctx->inputs->is_blend)
               bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT0126);
            break;

         default:
            break;
         }

         if (I->flow && I->op != BI_OPCODE_NOP) {
            /* Wait on the results of asynchronous instructions.
             *
             * The bitmap of general slots lines up with the encoding of va_flow
             * for waits on general slots. The dataflow analysis should ignore
             * the special slots #6 and #7, which are handled separately.
             */
            assert((I->flow & ~BITFIELD_MASK(VA_NUM_GENERAL_SLOTS)) == 0);

            bi_flow(ctx, bi_before_instr(I), I->flow);
            I->flow = 0;
         }
      }

      /* Terminate helpers after the last use */
      if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend &&
          block->pass_flags && bi_block_terminates_helpers(block)) {

         bi_foreach_instr_in_block_safe_rev(block, I) {
            if (bi_instr_uses_helpers(I)) {
               bi_flow(ctx, bi_after_instr(I), VA_FLOW_DISCARD);
               break;
            }
         }
      }

      /* End execution at the end of the block if needed, or reconverge if we
       * continue but we don't need to end execution.
       */
      if (va_should_end(block) || block->needs_nop) {
         /* Don't bother adding a NOP into an unreachable block */
         if (block == bi_start_block(&ctx->blocks) || bi_num_predecessors(block))
            bi_flow(ctx, bi_after_block(block), VA_FLOW_END);
      } else if (bi_reconverge_branches(block)) {
         /* TODO: Do we ever need to reconverge from an empty block? */
         if (!list_is_empty(&block->instructions))
            bi_flow(ctx, bi_after_block(block), VA_FLOW_RECONVERGE);
      }
   }

   /* If helpers are not used anywhere, they are not used at the start, so we
    * terminate at the start. Else, helpers are used somewhere in the shader and
    * are terminated after the last use.
    */
   bi_block *start = bi_start_block(&ctx->blocks);
   bool frag = (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend);

   if (frag && !start->pass_flags)
      bi_flow(ctx, bi_before_block(start), VA_FLOW_DISCARD);
}

/*
 * Assign slots to all asynchronous instructions. A few special instructions
 * require specific slots. For the rest, we assign slots in a round-robin
 * fashion to reduce false dependencies when encoding waits.
 *
 * This should be called before va_insert_flow_control_nops.
 */
void
va_assign_slots(bi_context *ctx)
{
   unsigned counter = 0;

   bi_foreach_instr_global(ctx, I) {
      if (I->op == BI_OPCODE_BARRIER) {
         I->slot = 7;
      } else if (I->op == BI_OPCODE_ZS_EMIT || I->op == BI_OPCODE_ATEST) {
         I->slot = 0;
      } else if (bi_opcode_props[I->op].message) {
         I->slot = counter++;

         if (counter == 3)
            counter = 0;
      }
   }
}