1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright (C) 2022 Collabora Ltd.
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
13bf215546Sopenharmony_ci * Software.
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20bf215546Sopenharmony_ci * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21bf215546Sopenharmony_ci * SOFTWARE.
22bf215546Sopenharmony_ci */
23bf215546Sopenharmony_ci
24bf215546Sopenharmony_ci#include "va_compiler.h"
25bf215546Sopenharmony_ci#include "valhall_enums.h"
26bf215546Sopenharmony_ci#include "bi_builder.h"
27bf215546Sopenharmony_ci
28bf215546Sopenharmony_ci/*
29bf215546Sopenharmony_ci * Merge NOPs with flow control with nearby instructions to eliminate the NOPs,
30bf215546Sopenharmony_ci * according to the following rules:
31bf215546Sopenharmony_ci *
32bf215546Sopenharmony_ci * 1. Waits may be combined by waiting on a union of the slots.
33bf215546Sopenharmony_ci * 2. Waits may be moved up as far as the first (last) asynchronous instruction
34bf215546Sopenharmony_ci *    writing to a slot waited on, at a performance cost.
35bf215546Sopenharmony_ci * 3. Discard may be moved down, at a performance cost.
36bf215546Sopenharmony_ci * 4. Reconverge must be on the last instruction of the block.
37bf215546Sopenharmony_ci * 5. End must be on the last instruction of the block.
38bf215546Sopenharmony_ci *
39bf215546Sopenharmony_ci * For simplicity, this pass only merges within a single basic block. This
40bf215546Sopenharmony_ci * should be sufficient. The algorithm works as follows.
41bf215546Sopenharmony_ci *
42bf215546Sopenharmony_ci * Because reconverge/end must be on the last instruction, there may be only
43bf215546Sopenharmony_ci * one of these in a block. First check for a NOP at the end, and if it has
44bf215546Sopenharmony_ci * reconverge/end flow control, merge with the penultimate instruction. Now we
45bf215546Sopenharmony_ci * only have to worry about waits and discard.
46bf215546Sopenharmony_ci *
47bf215546Sopenharmony_ci * The cost of moving a wait up is assumed to be greater than the cost of moving
48bf215546Sopenharmony_ci * a discard down, so we next move waits while we have more freedom. For each
49bf215546Sopenharmony_ci * wait, merge with an instruction above that either is free or has another
50bf215546Sopenharmony_ci * wait, and merge the flow control, aborting if we hit a message signaling the
51bf215546Sopenharmony_ci * relevant slot. By maintaining the candidate instruction at all steps, this is
52bf215546Sopenharmony_ci * implemented in a linear time walk.
53bf215546Sopenharmony_ci *
54bf215546Sopenharmony_ci * Finally, all that's left are discards. Move discards down to merge with a
55bf215546Sopenharmony_ci * free instruction with a similar linear time walk.
56bf215546Sopenharmony_ci *
57bf215546Sopenharmony_ci * Since discarding helper threads is an optimization (not required for
58bf215546Sopenharmony_ci * correctness), we may remove discards if we believe them harmful to the
59bf215546Sopenharmony_ci * performance of the shader. Keeping with the local structure of the pass, we
60bf215546Sopenharmony_ci * remove discards only if they are at the end of the last block and cannot be
61bf215546Sopenharmony_ci * merged with any other instruction.  Such a condition means every following
62bf215546Sopenharmony_ci * instruction has incompatible flow control (wait and end). In practice, this
63bf215546Sopenharmony_ci * means BLEND and ATEST run for helper threads in certain shaders to save a
64bf215546Sopenharmony_ci * NOP, but BLEND is a no-op for helper threads anyway.
65bf215546Sopenharmony_ci *
66bf215546Sopenharmony_ci * One case we don't handle is merging "foo, bar, wait, reconverge" to
67bf215546Sopenharmony_ci * "foo.wait, bar.reconverge". This sequence is rarely generated by the dataflow
68bf215546Sopenharmony_ci * analysis, so we don't care for the complexity.
69bf215546Sopenharmony_ci */
70bf215546Sopenharmony_ci
71bf215546Sopenharmony_ci/*
72bf215546Sopenharmony_ci * If the last instruction has reconverge or end, try to merge with the
73bf215546Sopenharmony_ci * second to last instruction.
74bf215546Sopenharmony_ci *
75bf215546Sopenharmony_ci * Precondition: block has at least 2 instructions.
76bf215546Sopenharmony_ci */
77bf215546Sopenharmony_cistatic void
78bf215546Sopenharmony_cimerge_end_reconverge(bi_block *block)
79bf215546Sopenharmony_ci{
80bf215546Sopenharmony_ci   bi_instr *last = list_last_entry(&block->instructions, bi_instr, link);
81bf215546Sopenharmony_ci   bi_instr *penult = bi_prev_op(last);
82bf215546Sopenharmony_ci
83bf215546Sopenharmony_ci   if (last->op != BI_OPCODE_NOP) return;
84bf215546Sopenharmony_ci   if (last->flow != VA_FLOW_RECONVERGE && last->flow != VA_FLOW_END) return;
85bf215546Sopenharmony_ci
86bf215546Sopenharmony_ci   /* End implies all other flow control except for waiting on barriers (slot
87bf215546Sopenharmony_ci    * #7, with VA_FLOW_WAIT), so remove blocking flow control.
88bf215546Sopenharmony_ci    */
89bf215546Sopenharmony_ci   if (last->flow == VA_FLOW_END) {
90bf215546Sopenharmony_ci      while (penult->op == BI_OPCODE_NOP && penult->flow != VA_FLOW_WAIT) {
91bf215546Sopenharmony_ci         bi_remove_instruction(penult);
92bf215546Sopenharmony_ci
93bf215546Sopenharmony_ci         /* There may be nothing left */
94bf215546Sopenharmony_ci         if (list_is_singular(&block->instructions))
95bf215546Sopenharmony_ci            return;
96bf215546Sopenharmony_ci
97bf215546Sopenharmony_ci         penult = bi_prev_op(last);
98bf215546Sopenharmony_ci      }
99bf215546Sopenharmony_ci   }
100bf215546Sopenharmony_ci
101bf215546Sopenharmony_ci   /* If there is blocking flow control, we can't merge */
102bf215546Sopenharmony_ci   if (penult->flow != VA_FLOW_NONE) return;
103bf215546Sopenharmony_ci
104bf215546Sopenharmony_ci   /* Else, merge */
105bf215546Sopenharmony_ci   penult->flow = last->flow;
106bf215546Sopenharmony_ci   bi_remove_instruction(last);
107bf215546Sopenharmony_ci}
108bf215546Sopenharmony_ci
109bf215546Sopenharmony_ci/*
110bf215546Sopenharmony_ci * Calculate the union of two waits. We may wait on any combination of slots #0,
111bf215546Sopenharmony_ci * #1, #2 or the entirety of 0126 and 01267. If we wait on the entirety, the
112bf215546Sopenharmony_ci * union is trivial. If we do not wait on slot #6 (by extension slot #7), we
113bf215546Sopenharmony_ci * wait only on slots #0, #1, and #2, for which the waits are encoded as a
114bf215546Sopenharmony_ci * bitset and the union is just a bitwise OR.
115bf215546Sopenharmony_ci */
116bf215546Sopenharmony_cistatic enum va_flow
117bf215546Sopenharmony_ciunion_waits(enum va_flow x, enum va_flow y)
118bf215546Sopenharmony_ci{
119bf215546Sopenharmony_ci   assert(va_flow_is_wait_or_none(x) && va_flow_is_wait_or_none(y));
120bf215546Sopenharmony_ci
121bf215546Sopenharmony_ci   if ((x == VA_FLOW_WAIT) || (y == VA_FLOW_WAIT))
122bf215546Sopenharmony_ci      return VA_FLOW_WAIT;
123bf215546Sopenharmony_ci   else if ((x == VA_FLOW_WAIT0126) || (y == VA_FLOW_WAIT0126))
124bf215546Sopenharmony_ci      return VA_FLOW_WAIT0126;
125bf215546Sopenharmony_ci   else
126bf215546Sopenharmony_ci      return x | y;
127bf215546Sopenharmony_ci}
128bf215546Sopenharmony_ci
129bf215546Sopenharmony_cistatic void
130bf215546Sopenharmony_cimerge_waits(bi_block *block)
131bf215546Sopenharmony_ci{
132bf215546Sopenharmony_ci   /* Most recent instruction with which we can merge, or NULL if none */
133bf215546Sopenharmony_ci   bi_instr *last_free = NULL;
134bf215546Sopenharmony_ci
135bf215546Sopenharmony_ci   bi_foreach_instr_in_block_safe(block, I) {
136bf215546Sopenharmony_ci      if (last_free != NULL &&
137bf215546Sopenharmony_ci          I->op == BI_OPCODE_NOP && va_flow_is_wait_or_none(I->flow)) {
138bf215546Sopenharmony_ci
139bf215546Sopenharmony_ci         /* Merge waits with compatible instructions */
140bf215546Sopenharmony_ci         last_free->flow = union_waits(last_free->flow, I->flow);
141bf215546Sopenharmony_ci         bi_remove_instruction(I);
142bf215546Sopenharmony_ci         continue;
143bf215546Sopenharmony_ci      }
144bf215546Sopenharmony_ci
145bf215546Sopenharmony_ci      /* Don't move waits past async instructions, since they might be what
146bf215546Sopenharmony_ci       * we're waiting for. If we wanted to optimize this case, we could check
147bf215546Sopenharmony_ci       * the signaled slots.
148bf215546Sopenharmony_ci       */
149bf215546Sopenharmony_ci      if (bi_opcode_props[I->op].message)
150bf215546Sopenharmony_ci         last_free = NULL;
151bf215546Sopenharmony_ci
152bf215546Sopenharmony_ci      /* We can only merge with instructions whose flow control is a wait.
153bf215546Sopenharmony_ci       * This includes such an instruction after merging in a wait. It also
154bf215546Sopenharmony_ci       * includes async instructions.
155bf215546Sopenharmony_ci       */
156bf215546Sopenharmony_ci      if (va_flow_is_wait_or_none(I->flow))
157bf215546Sopenharmony_ci         last_free = I;
158bf215546Sopenharmony_ci   }
159bf215546Sopenharmony_ci}
160bf215546Sopenharmony_ci
161bf215546Sopenharmony_cistatic bool
162bf215546Sopenharmony_cibi_is_first_instr(bi_block *block, bi_instr *I)
163bf215546Sopenharmony_ci{
164bf215546Sopenharmony_ci   return block->instructions.next == &I->link;
165bf215546Sopenharmony_ci}
166bf215546Sopenharmony_ci
167bf215546Sopenharmony_cistatic void
168bf215546Sopenharmony_cimerge_discard(bi_block *block)
169bf215546Sopenharmony_ci{
170bf215546Sopenharmony_ci   bi_instr *last_free = NULL;
171bf215546Sopenharmony_ci
172bf215546Sopenharmony_ci   bi_foreach_instr_in_block_safe_rev(block, I) {
173bf215546Sopenharmony_ci      if ((I->op == BI_OPCODE_NOP) && (I->flow == VA_FLOW_DISCARD)) {
174bf215546Sopenharmony_ci         /* Try to merge with the instruction *preceding* discard, because
175bf215546Sopenharmony_ci          * because flow control happens at the end of an instruction and
176bf215546Sopenharmony_ci          * discard is a NOP.
177bf215546Sopenharmony_ci          */
178bf215546Sopenharmony_ci         if (!bi_is_first_instr(block, I)) {
179bf215546Sopenharmony_ci            bi_instr *prev = bi_prev_op(I);
180bf215546Sopenharmony_ci
181bf215546Sopenharmony_ci            if (prev->flow == VA_FLOW_NONE) {
182bf215546Sopenharmony_ci               prev->flow = VA_FLOW_DISCARD;
183bf215546Sopenharmony_ci               bi_remove_instruction(I);
184bf215546Sopenharmony_ci               continue;
185bf215546Sopenharmony_ci            }
186bf215546Sopenharmony_ci         }
187bf215546Sopenharmony_ci
188bf215546Sopenharmony_ci         /* Or try to merge with the next instruction with no flow control */
189bf215546Sopenharmony_ci         if (last_free != NULL) {
190bf215546Sopenharmony_ci            last_free->flow = VA_FLOW_DISCARD;
191bf215546Sopenharmony_ci            bi_remove_instruction(I);
192bf215546Sopenharmony_ci            continue;
193bf215546Sopenharmony_ci         }
194bf215546Sopenharmony_ci
195bf215546Sopenharmony_ci         /* If there's nowhere to merge and this is the end of the shader, just
196bf215546Sopenharmony_ci          * remove the discard.
197bf215546Sopenharmony_ci          */
198bf215546Sopenharmony_ci         if (!block->successors[0] && !block->successors[1]) {
199bf215546Sopenharmony_ci            bi_remove_instruction(I);
200bf215546Sopenharmony_ci            continue;
201bf215546Sopenharmony_ci         }
202bf215546Sopenharmony_ci      }
203bf215546Sopenharmony_ci
204bf215546Sopenharmony_ci      /* Allow merging into free instructions */
205bf215546Sopenharmony_ci      if (I->flow == VA_FLOW_NONE)
206bf215546Sopenharmony_ci         last_free = I;
207bf215546Sopenharmony_ci   }
208bf215546Sopenharmony_ci}
209bf215546Sopenharmony_ci
210bf215546Sopenharmony_civoid
211bf215546Sopenharmony_civa_merge_flow(bi_context *ctx)
212bf215546Sopenharmony_ci{
213bf215546Sopenharmony_ci   bi_foreach_block(ctx, block) {
214bf215546Sopenharmony_ci      /* If there are less than 2 instructions, there's nothing to merge */
215bf215546Sopenharmony_ci      if (list_is_empty(&block->instructions)) continue;
216bf215546Sopenharmony_ci      if (list_is_singular(&block->instructions)) continue;
217bf215546Sopenharmony_ci
218bf215546Sopenharmony_ci      merge_end_reconverge(block);
219bf215546Sopenharmony_ci      merge_waits(block);
220bf215546Sopenharmony_ci
221bf215546Sopenharmony_ci      if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend)
222bf215546Sopenharmony_ci         merge_discard(block);
223bf215546Sopenharmony_ci   }
224bf215546Sopenharmony_ci}
225