1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright (C) 2019 Google, Inc.
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
13bf215546Sopenharmony_ci * Software.
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20bf215546Sopenharmony_ci * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21bf215546Sopenharmony_ci * SOFTWARE.
22bf215546Sopenharmony_ci *
23bf215546Sopenharmony_ci * Authors:
24bf215546Sopenharmony_ci *    Rob Clark <robclark@freedesktop.org>
25bf215546Sopenharmony_ci */
26bf215546Sopenharmony_ci
27bf215546Sopenharmony_ci#include "ir3.h"
28bf215546Sopenharmony_ci
29bf215546Sopenharmony_ci/* The maximum number of nop's we may need to insert between two instructions.
30bf215546Sopenharmony_ci */
31bf215546Sopenharmony_ci#define MAX_NOPS 6
32bf215546Sopenharmony_ci
33bf215546Sopenharmony_ci/*
34bf215546Sopenharmony_ci * Helpers to figure out the necessary delay slots between instructions.  Used
35bf215546Sopenharmony_ci * both in scheduling pass(es) and the final pass to insert any required nop's
36bf215546Sopenharmony_ci * so that the shader program is valid.
37bf215546Sopenharmony_ci *
38bf215546Sopenharmony_ci * Note that this needs to work both pre and post RA, so we can't assume ssa
39bf215546Sopenharmony_ci * src iterators work.
40bf215546Sopenharmony_ci */
41bf215546Sopenharmony_ci
42bf215546Sopenharmony_ci/* calculate required # of delay slots between the instruction that
43bf215546Sopenharmony_ci * assigns a value and the one that consumes
44bf215546Sopenharmony_ci */
45bf215546Sopenharmony_ciint
46bf215546Sopenharmony_ciir3_delayslots(struct ir3_instruction *assigner,
47bf215546Sopenharmony_ci               struct ir3_instruction *consumer, unsigned n, bool soft)
48bf215546Sopenharmony_ci{
49bf215546Sopenharmony_ci   /* generally don't count false dependencies, since this can just be
50bf215546Sopenharmony_ci    * something like a barrier, or SSBO store.
51bf215546Sopenharmony_ci    */
52bf215546Sopenharmony_ci   if (__is_false_dep(consumer, n))
53bf215546Sopenharmony_ci      return 0;
54bf215546Sopenharmony_ci
55bf215546Sopenharmony_ci   /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
56bf215546Sopenharmony_ci    * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
57bf215546Sopenharmony_ci    * handled with sync bits
58bf215546Sopenharmony_ci    */
59bf215546Sopenharmony_ci
60bf215546Sopenharmony_ci   if (is_meta(assigner) || is_meta(consumer))
61bf215546Sopenharmony_ci      return 0;
62bf215546Sopenharmony_ci
63bf215546Sopenharmony_ci   if (writes_addr0(assigner) || writes_addr1(assigner))
64bf215546Sopenharmony_ci      return 6;
65bf215546Sopenharmony_ci
66bf215546Sopenharmony_ci   if (soft && is_ss_producer(assigner))
67bf215546Sopenharmony_ci      return soft_ss_delay(assigner);
68bf215546Sopenharmony_ci
69bf215546Sopenharmony_ci   /* handled via sync flags: */
70bf215546Sopenharmony_ci   if (is_ss_producer(assigner) || is_sy_producer(assigner))
71bf215546Sopenharmony_ci      return 0;
72bf215546Sopenharmony_ci
73bf215546Sopenharmony_ci   /* As far as we know, shader outputs don't need any delay. */
74bf215546Sopenharmony_ci   if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
75bf215546Sopenharmony_ci      return 0;
76bf215546Sopenharmony_ci
77bf215546Sopenharmony_ci   /* assigner must be alu: */
78bf215546Sopenharmony_ci   if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
79bf215546Sopenharmony_ci       is_mem(consumer)) {
80bf215546Sopenharmony_ci      return 6;
81bf215546Sopenharmony_ci   } else {
82bf215546Sopenharmony_ci      /* In mergedregs mode, there is an extra 2-cycle penalty when half of
83bf215546Sopenharmony_ci       * a full-reg is read as a half-reg or when a half-reg is read as a
84bf215546Sopenharmony_ci       * full-reg.
85bf215546Sopenharmony_ci       */
86bf215546Sopenharmony_ci      bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) !=
87bf215546Sopenharmony_ci                             (consumer->srcs[n]->flags & IR3_REG_HALF);
88bf215546Sopenharmony_ci      unsigned penalty = mismatched_half ? 3 : 0;
89bf215546Sopenharmony_ci      if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && (n == 2)) {
90bf215546Sopenharmony_ci         /* special case, 3rd src to cat3 not required on first cycle */
91bf215546Sopenharmony_ci         return 1 + penalty;
92bf215546Sopenharmony_ci      } else {
93bf215546Sopenharmony_ci         return 3 + penalty;
94bf215546Sopenharmony_ci      }
95bf215546Sopenharmony_ci   }
96bf215546Sopenharmony_ci}
97bf215546Sopenharmony_ci
98bf215546Sopenharmony_cistatic bool
99bf215546Sopenharmony_cicount_instruction(struct ir3_instruction *n)
100bf215546Sopenharmony_ci{
101bf215546Sopenharmony_ci   /* NOTE: don't count branch/jump since we don't know yet if they will
102bf215546Sopenharmony_ci    * be eliminated later in resolve_jumps().. really should do that
103bf215546Sopenharmony_ci    * earlier so we don't have this constraint.
104bf215546Sopenharmony_ci    */
105bf215546Sopenharmony_ci   return is_alu(n) ||
106bf215546Sopenharmony_ci          (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
107bf215546Sopenharmony_ci}
108bf215546Sopenharmony_ci
109bf215546Sopenharmony_ci/* Post-RA, we don't have arrays any more, so we have to be a bit careful here
110bf215546Sopenharmony_ci * and have to handle relative accesses specially.
111bf215546Sopenharmony_ci */
112bf215546Sopenharmony_ci
113bf215546Sopenharmony_cistatic unsigned
114bf215546Sopenharmony_cipost_ra_reg_elems(struct ir3_register *reg)
115bf215546Sopenharmony_ci{
116bf215546Sopenharmony_ci   if (reg->flags & IR3_REG_RELATIV)
117bf215546Sopenharmony_ci      return reg->size;
118bf215546Sopenharmony_ci   return reg_elems(reg);
119bf215546Sopenharmony_ci}
120bf215546Sopenharmony_ci
121bf215546Sopenharmony_cistatic unsigned
122bf215546Sopenharmony_cipost_ra_reg_num(struct ir3_register *reg)
123bf215546Sopenharmony_ci{
124bf215546Sopenharmony_ci   if (reg->flags & IR3_REG_RELATIV)
125bf215546Sopenharmony_ci      return reg->array.base;
126bf215546Sopenharmony_ci   return reg->num;
127bf215546Sopenharmony_ci}
128bf215546Sopenharmony_ci
129bf215546Sopenharmony_ciunsigned
130bf215546Sopenharmony_ciir3_delayslots_with_repeat(struct ir3_instruction *assigner,
131bf215546Sopenharmony_ci                           struct ir3_instruction *consumer,
132bf215546Sopenharmony_ci                           unsigned assigner_n, unsigned consumer_n)
133bf215546Sopenharmony_ci{
134bf215546Sopenharmony_ci   unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, false);
135bf215546Sopenharmony_ci
136bf215546Sopenharmony_ci   struct ir3_register *src = consumer->srcs[consumer_n];
137bf215546Sopenharmony_ci   struct ir3_register *dst = assigner->dsts[assigner_n];
138bf215546Sopenharmony_ci
139bf215546Sopenharmony_ci   if (assigner->repeat == 0 && consumer->repeat == 0)
140bf215546Sopenharmony_ci      return delay;
141bf215546Sopenharmony_ci
142bf215546Sopenharmony_ci   unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
143bf215546Sopenharmony_ci   unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
144bf215546Sopenharmony_ci
145bf215546Sopenharmony_ci   /* If either side is a relative access, we can't really apply most of the
146bf215546Sopenharmony_ci    * reasoning below because we don't know which component aliases which.
147bf215546Sopenharmony_ci    * Just bail in this case.
148bf215546Sopenharmony_ci    */
149bf215546Sopenharmony_ci   if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
150bf215546Sopenharmony_ci      return delay;
151bf215546Sopenharmony_ci
152bf215546Sopenharmony_ci   /* MOVMSK seems to require that all users wait until the entire
153bf215546Sopenharmony_ci    * instruction is finished, so just bail here.
154bf215546Sopenharmony_ci    */
155bf215546Sopenharmony_ci   if (assigner->opc == OPC_MOVMSK)
156bf215546Sopenharmony_ci      return delay;
157bf215546Sopenharmony_ci
158bf215546Sopenharmony_ci   bool mismatched_half =
159bf215546Sopenharmony_ci      (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
160bf215546Sopenharmony_ci
161bf215546Sopenharmony_ci   /* TODO: Handle the combination of (rpt) and different component sizes
162bf215546Sopenharmony_ci    * better like below. This complicates things significantly because the
163bf215546Sopenharmony_ci    * components don't line up.
164bf215546Sopenharmony_ci    */
165bf215546Sopenharmony_ci   if (mismatched_half)
166bf215546Sopenharmony_ci      return delay;
167bf215546Sopenharmony_ci
168bf215546Sopenharmony_ci   /* If an instruction has a (rpt), then it acts as a sequence of
169bf215546Sopenharmony_ci    * instructions, reading its non-(r) sources at each cycle. First, get the
170bf215546Sopenharmony_ci    * register num for the first instruction where they interfere:
171bf215546Sopenharmony_ci    */
172bf215546Sopenharmony_ci
173bf215546Sopenharmony_ci   unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);
174bf215546Sopenharmony_ci
175bf215546Sopenharmony_ci   /* Now, for that first conflicting half/full register, figure out the
176bf215546Sopenharmony_ci    * sub-instruction within assigner/consumer it corresponds to. For (r)
177bf215546Sopenharmony_ci    * sources, this should already return the correct answer of 0. However we
178bf215546Sopenharmony_ci    * have to special-case the multi-mov instructions, where the
179bf215546Sopenharmony_ci    * sub-instructions sometimes come from the src/dst indices instead.
180bf215546Sopenharmony_ci    */
181bf215546Sopenharmony_ci   unsigned first_src_instr;
182bf215546Sopenharmony_ci   if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT)
183bf215546Sopenharmony_ci      first_src_instr = consumer_n;
184bf215546Sopenharmony_ci   else
185bf215546Sopenharmony_ci      first_src_instr = first_num - src->num;
186bf215546Sopenharmony_ci
187bf215546Sopenharmony_ci   unsigned first_dst_instr;
188bf215546Sopenharmony_ci   if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT)
189bf215546Sopenharmony_ci      first_dst_instr = assigner_n;
190bf215546Sopenharmony_ci   else
191bf215546Sopenharmony_ci      first_dst_instr = first_num - dst->num;
192bf215546Sopenharmony_ci
193bf215546Sopenharmony_ci   /* The delay we return is relative to the *end* of assigner and the
194bf215546Sopenharmony_ci    * *beginning* of consumer, because it's the number of nops (or other
195bf215546Sopenharmony_ci    * things) needed between them. Any instructions after first_dst_instr
196bf215546Sopenharmony_ci    * subtract from the delay, and so do any instructions before
197bf215546Sopenharmony_ci    * first_src_instr. Calculate an offset to subtract from the non-rpt-aware
198bf215546Sopenharmony_ci    * delay to account for that.
199bf215546Sopenharmony_ci    *
200bf215546Sopenharmony_ci    * Now, a priori, we need to go through this process for every
201bf215546Sopenharmony_ci    * conflicting regnum and take the minimum of the offsets to make sure
202bf215546Sopenharmony_ci    * that the appropriate number of nop's is inserted for every conflicting
203bf215546Sopenharmony_ci    * pair of sub-instructions. However, as we go to the next conflicting
204bf215546Sopenharmony_ci    * regnum (if any), the number of instructions after first_dst_instr
205bf215546Sopenharmony_ci    * decreases by 1 and the number of source instructions before
206bf215546Sopenharmony_ci    * first_src_instr correspondingly increases by 1, so the offset stays the
207bf215546Sopenharmony_ci    * same for all conflicting registers.
208bf215546Sopenharmony_ci    */
209bf215546Sopenharmony_ci   unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
210bf215546Sopenharmony_ci   return offset > delay ? 0 : delay - offset;
211bf215546Sopenharmony_ci}
212bf215546Sopenharmony_ci
213bf215546Sopenharmony_cistatic unsigned
214bf215546Sopenharmony_cidelay_calc_srcn(struct ir3_instruction *assigner,
215bf215546Sopenharmony_ci                struct ir3_instruction *consumer, unsigned assigner_n,
216bf215546Sopenharmony_ci                unsigned consumer_n, bool mergedregs)
217bf215546Sopenharmony_ci{
218bf215546Sopenharmony_ci   struct ir3_register *src = consumer->srcs[consumer_n];
219bf215546Sopenharmony_ci   struct ir3_register *dst = assigner->dsts[assigner_n];
220bf215546Sopenharmony_ci   bool mismatched_half =
221bf215546Sopenharmony_ci      (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
222bf215546Sopenharmony_ci
223bf215546Sopenharmony_ci   /* In the mergedregs case or when the register is a special register,
224bf215546Sopenharmony_ci    * half-registers do not alias with full registers.
225bf215546Sopenharmony_ci    */
226bf215546Sopenharmony_ci   if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
227bf215546Sopenharmony_ci       mismatched_half)
228bf215546Sopenharmony_ci      return 0;
229bf215546Sopenharmony_ci
230bf215546Sopenharmony_ci   unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
231bf215546Sopenharmony_ci   unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
232bf215546Sopenharmony_ci   unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
233bf215546Sopenharmony_ci   unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
234bf215546Sopenharmony_ci
235bf215546Sopenharmony_ci   if (dst_start >= src_end || src_start >= dst_end)
236bf215546Sopenharmony_ci      return 0;
237bf215546Sopenharmony_ci
238bf215546Sopenharmony_ci   return ir3_delayslots_with_repeat(assigner, consumer, assigner_n, consumer_n);
239bf215546Sopenharmony_ci}
240bf215546Sopenharmony_ci
241bf215546Sopenharmony_cistatic unsigned
242bf215546Sopenharmony_cidelay_calc(struct ir3_block *block, struct ir3_instruction *start,
243bf215546Sopenharmony_ci           struct ir3_instruction *consumer, unsigned distance,
244bf215546Sopenharmony_ci           regmask_t *in_mask, bool mergedregs)
245bf215546Sopenharmony_ci{
246bf215546Sopenharmony_ci   regmask_t mask;
247bf215546Sopenharmony_ci   memcpy(&mask, in_mask, sizeof(mask));
248bf215546Sopenharmony_ci
249bf215546Sopenharmony_ci   unsigned delay = 0;
250bf215546Sopenharmony_ci   /* Search backwards starting at the instruction before start, unless it's
251bf215546Sopenharmony_ci    * NULL then search backwards from the block end.
252bf215546Sopenharmony_ci    */
253bf215546Sopenharmony_ci   struct list_head *start_list =
254bf215546Sopenharmony_ci      start ? start->node.prev : block->instr_list.prev;
255bf215546Sopenharmony_ci   list_for_each_entry_from_rev (struct ir3_instruction, assigner, start_list,
256bf215546Sopenharmony_ci                                 &block->instr_list, node) {
257bf215546Sopenharmony_ci      if (count_instruction(assigner))
258bf215546Sopenharmony_ci         distance += assigner->nop;
259bf215546Sopenharmony_ci
260bf215546Sopenharmony_ci      if (distance + delay >= MAX_NOPS)
261bf215546Sopenharmony_ci         return delay;
262bf215546Sopenharmony_ci
263bf215546Sopenharmony_ci      if (is_meta(assigner))
264bf215546Sopenharmony_ci         continue;
265bf215546Sopenharmony_ci
266bf215546Sopenharmony_ci      unsigned new_delay = 0;
267bf215546Sopenharmony_ci
268bf215546Sopenharmony_ci      foreach_dst_n (dst, dst_n, assigner) {
269bf215546Sopenharmony_ci         if (dst->wrmask == 0)
270bf215546Sopenharmony_ci            continue;
271bf215546Sopenharmony_ci         if (!regmask_get(&mask, dst))
272bf215546Sopenharmony_ci            continue;
273bf215546Sopenharmony_ci         foreach_src_n (src, src_n, consumer) {
274bf215546Sopenharmony_ci            if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
275bf215546Sopenharmony_ci               continue;
276bf215546Sopenharmony_ci
277bf215546Sopenharmony_ci            unsigned src_delay = delay_calc_srcn(
278bf215546Sopenharmony_ci               assigner, consumer, dst_n, src_n, mergedregs);
279bf215546Sopenharmony_ci            new_delay = MAX2(new_delay, src_delay);
280bf215546Sopenharmony_ci         }
281bf215546Sopenharmony_ci         regmask_clear(&mask, dst);
282bf215546Sopenharmony_ci      }
283bf215546Sopenharmony_ci
284bf215546Sopenharmony_ci      new_delay = new_delay > distance ? new_delay - distance : 0;
285bf215546Sopenharmony_ci      delay = MAX2(delay, new_delay);
286bf215546Sopenharmony_ci
287bf215546Sopenharmony_ci      if (count_instruction(assigner))
288bf215546Sopenharmony_ci         distance += 1 + assigner->repeat;
289bf215546Sopenharmony_ci   }
290bf215546Sopenharmony_ci
291bf215546Sopenharmony_ci   /* Note: this allows recursion into "block" if it has already been
292bf215546Sopenharmony_ci    * visited, but *not* recursion into its predecessors. We may have to
293bf215546Sopenharmony_ci    * visit the original block twice, for the loop case where we have to
294bf215546Sopenharmony_ci    * consider definititons in an earlier iterations of the same loop:
295bf215546Sopenharmony_ci    *
296bf215546Sopenharmony_ci    * while (...) {
297bf215546Sopenharmony_ci    *		mov.u32u32 ..., r0.x
298bf215546Sopenharmony_ci    *		...
299bf215546Sopenharmony_ci    *		mov.u32u32 r0.x, ...
300bf215546Sopenharmony_ci    * }
301bf215546Sopenharmony_ci    *
302bf215546Sopenharmony_ci    * However any other recursion would be unnecessary.
303bf215546Sopenharmony_ci    */
304bf215546Sopenharmony_ci
305bf215546Sopenharmony_ci   if (block->data != block) {
306bf215546Sopenharmony_ci      block->data = block;
307bf215546Sopenharmony_ci
308bf215546Sopenharmony_ci      for (unsigned i = 0; i < block->predecessors_count; i++) {
309bf215546Sopenharmony_ci         struct ir3_block *pred = block->predecessors[i];
310bf215546Sopenharmony_ci         unsigned pred_delay = delay_calc(pred, NULL, consumer, distance,
311bf215546Sopenharmony_ci                                          &mask, mergedregs);
312bf215546Sopenharmony_ci         delay = MAX2(delay, pred_delay);
313bf215546Sopenharmony_ci      }
314bf215546Sopenharmony_ci
315bf215546Sopenharmony_ci      block->data = NULL;
316bf215546Sopenharmony_ci   }
317bf215546Sopenharmony_ci
318bf215546Sopenharmony_ci   return delay;
319bf215546Sopenharmony_ci}
320bf215546Sopenharmony_ci
321bf215546Sopenharmony_ci/**
322bf215546Sopenharmony_ci * Calculate delay for nop insertion. This must exactly match hardware
323bf215546Sopenharmony_ci * requirements, including recursing into predecessor blocks.
324bf215546Sopenharmony_ci */
325bf215546Sopenharmony_ciunsigned
326bf215546Sopenharmony_ciir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
327bf215546Sopenharmony_ci               bool mergedregs)
328bf215546Sopenharmony_ci{
329bf215546Sopenharmony_ci   regmask_t mask;
330bf215546Sopenharmony_ci   regmask_init(&mask, mergedregs);
331bf215546Sopenharmony_ci   foreach_src (src, instr) {
332bf215546Sopenharmony_ci      if (!(src->flags & (IR3_REG_IMMED | IR3_REG_CONST)))
333bf215546Sopenharmony_ci         regmask_set(&mask, src);
334bf215546Sopenharmony_ci   }
335bf215546Sopenharmony_ci
336bf215546Sopenharmony_ci   return delay_calc(block, NULL, instr, 0, &mask, mergedregs);
337bf215546Sopenharmony_ci}
338