1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright © 2021 Advanced Micro Devices, Inc.
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
13bf215546Sopenharmony_ci * Software.
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21bf215546Sopenharmony_ci * IN THE SOFTWARE.
22bf215546Sopenharmony_ci */
23bf215546Sopenharmony_ci
24bf215546Sopenharmony_ci/* This is a new block-level load instruction scheduler where loads are grouped
25bf215546Sopenharmony_ci * according to their indirection level within a basic block. An indirection
26bf215546Sopenharmony_ci * is when a result of one load is used as a source of another load. The result
27bf215546Sopenharmony_ci * is that disjoint ALU opcode groups and load (texture) opcode groups are
28bf215546Sopenharmony_ci * created where each next load group is the next level of indirection.
29bf215546Sopenharmony_ci * It's done by finding the first and last load with the same indirection
30bf215546Sopenharmony_ci * level, and moving all unrelated instructions between them after the last
31bf215546Sopenharmony_ci * load except for load sources, which are moved before the first load.
32bf215546Sopenharmony_ci * It naturally suits hardware that has limits on texture indirections, but
33bf215546Sopenharmony_ci * other hardware can benefit too. Only texture, image, and SSBO load and
34bf215546Sopenharmony_ci * atomic instructions are grouped.
35bf215546Sopenharmony_ci *
36bf215546Sopenharmony_ci * There is an option to group only those loads that use the same resource
37bf215546Sopenharmony_ci * variable. This increases the chance to get more cache hits than if the loads
38bf215546Sopenharmony_ci * were spread out.
39bf215546Sopenharmony_ci *
 * The increased register usage is offset by the increase in observed memory
 * bandwidth due to more cache hits (dependent on hw behavior), which in turn
 * decreases the subgroup lifetime and allows registers to be deallocated
 * and reused sooner. In some bandwidth-bound cases, low register usage doesn't
 * benefit at all. Doubling the register usage and using those registers to
 * amplify observed bandwidth can improve performance a lot.
46bf215546Sopenharmony_ci *
47bf215546Sopenharmony_ci * It's recommended to run a hw-specific instruction scheduler after this to
48bf215546Sopenharmony_ci * prevent spilling.
49bf215546Sopenharmony_ci */
50bf215546Sopenharmony_ci
51bf215546Sopenharmony_ci#include "nir.h"
52bf215546Sopenharmony_ci
53bf215546Sopenharmony_cistatic bool
54bf215546Sopenharmony_ciis_memory_load(nir_instr *instr)
55bf215546Sopenharmony_ci{
56bf215546Sopenharmony_ci   /* Count texture_size too because it has the same latency as cache hits. */
57bf215546Sopenharmony_ci   if (instr->type == nir_instr_type_tex)
58bf215546Sopenharmony_ci      return true;
59bf215546Sopenharmony_ci
60bf215546Sopenharmony_ci   if (instr->type == nir_instr_type_intrinsic) {
61bf215546Sopenharmony_ci      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
62bf215546Sopenharmony_ci      const char *name = nir_intrinsic_infos[intr->intrinsic].name;
63bf215546Sopenharmony_ci
64bf215546Sopenharmony_ci      /* TODO: nir_intrinsics.py could do this */
65bf215546Sopenharmony_ci      /* load_ubo is ignored because it's usually cheap. */
66bf215546Sopenharmony_ci      if (!nir_intrinsic_writes_external_memory(intr) &&
67bf215546Sopenharmony_ci          !strstr(name, "shared") &&
68bf215546Sopenharmony_ci          (strstr(name, "ssbo") || strstr(name, "image")))
69bf215546Sopenharmony_ci         return true;
70bf215546Sopenharmony_ci   }
71bf215546Sopenharmony_ci
72bf215546Sopenharmony_ci   return false;
73bf215546Sopenharmony_ci}
74bf215546Sopenharmony_ci
75bf215546Sopenharmony_cistatic nir_instr *
76bf215546Sopenharmony_ciget_intrinsic_resource(nir_intrinsic_instr *intr)
77bf215546Sopenharmony_ci{
78bf215546Sopenharmony_ci   /* This is also the list of intrinsics that are grouped. */
79bf215546Sopenharmony_ci   /* load_ubo is ignored because it's usually cheap. */
80bf215546Sopenharmony_ci   switch (intr->intrinsic) {
81bf215546Sopenharmony_ci   case nir_intrinsic_image_load:
82bf215546Sopenharmony_ci   case nir_intrinsic_image_deref_load:
83bf215546Sopenharmony_ci   case nir_intrinsic_image_sparse_load:
84bf215546Sopenharmony_ci   case nir_intrinsic_image_deref_sparse_load:
85bf215546Sopenharmony_ci   /* Group image_size too because it has the same latency as cache hits. */
86bf215546Sopenharmony_ci   case nir_intrinsic_image_size:
87bf215546Sopenharmony_ci   case nir_intrinsic_image_deref_size:
88bf215546Sopenharmony_ci   case nir_intrinsic_bindless_image_load:
89bf215546Sopenharmony_ci   case nir_intrinsic_bindless_image_sparse_load:
90bf215546Sopenharmony_ci   case nir_intrinsic_load_ssbo:
91bf215546Sopenharmony_ci      return intr->src[0].ssa->parent_instr;
92bf215546Sopenharmony_ci   default:
93bf215546Sopenharmony_ci      return NULL;
94bf215546Sopenharmony_ci   }
95bf215546Sopenharmony_ci}
96bf215546Sopenharmony_ci
97bf215546Sopenharmony_ci/* Track only those that we want to group. */
98bf215546Sopenharmony_cistatic bool
99bf215546Sopenharmony_ciis_grouped_load(nir_instr *instr)
100bf215546Sopenharmony_ci{
101bf215546Sopenharmony_ci   /* Count texture_size too because it has the same latency as cache hits. */
102bf215546Sopenharmony_ci   if (instr->type == nir_instr_type_tex)
103bf215546Sopenharmony_ci      return true;
104bf215546Sopenharmony_ci
105bf215546Sopenharmony_ci   if (instr->type == nir_instr_type_intrinsic)
106bf215546Sopenharmony_ci      return get_intrinsic_resource(nir_instr_as_intrinsic(instr)) != NULL;
107bf215546Sopenharmony_ci
108bf215546Sopenharmony_ci   return false;
109bf215546Sopenharmony_ci}
110bf215546Sopenharmony_ci
111bf215546Sopenharmony_cistatic bool
112bf215546Sopenharmony_cican_move(nir_instr *instr, uint8_t current_indirection_level)
113bf215546Sopenharmony_ci{
114bf215546Sopenharmony_ci   /* Grouping is done by moving everything else out of the first/last
115bf215546Sopenharmony_ci    * instruction range of the indirection level.
116bf215546Sopenharmony_ci    */
117bf215546Sopenharmony_ci   if (is_grouped_load(instr) && instr->pass_flags == current_indirection_level)
118bf215546Sopenharmony_ci      return false;
119bf215546Sopenharmony_ci
120bf215546Sopenharmony_ci   if (instr->type == nir_instr_type_alu ||
121bf215546Sopenharmony_ci       instr->type == nir_instr_type_deref ||
122bf215546Sopenharmony_ci       instr->type == nir_instr_type_tex ||
123bf215546Sopenharmony_ci       instr->type == nir_instr_type_load_const ||
124bf215546Sopenharmony_ci       instr->type == nir_instr_type_ssa_undef)
125bf215546Sopenharmony_ci      return true;
126bf215546Sopenharmony_ci
127bf215546Sopenharmony_ci   if (instr->type == nir_instr_type_intrinsic &&
128bf215546Sopenharmony_ci       nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr)))
129bf215546Sopenharmony_ci      return true;
130bf215546Sopenharmony_ci
131bf215546Sopenharmony_ci   return false;
132bf215546Sopenharmony_ci}
133bf215546Sopenharmony_ci
134bf215546Sopenharmony_cistatic nir_instr *
135bf215546Sopenharmony_ciget_uniform_inst_resource(nir_instr *instr)
136bf215546Sopenharmony_ci{
137bf215546Sopenharmony_ci   if (instr->type == nir_instr_type_tex) {
138bf215546Sopenharmony_ci      nir_tex_instr *tex = nir_instr_as_tex(instr);
139bf215546Sopenharmony_ci
140bf215546Sopenharmony_ci      if (tex->texture_non_uniform)
141bf215546Sopenharmony_ci         return NULL;
142bf215546Sopenharmony_ci
143bf215546Sopenharmony_ci      for (unsigned i = 0; i < tex->num_srcs; i++) {
144bf215546Sopenharmony_ci         switch (tex->src[i].src_type) {
145bf215546Sopenharmony_ci         case nir_tex_src_texture_deref:
146bf215546Sopenharmony_ci         case nir_tex_src_texture_handle:
147bf215546Sopenharmony_ci            return tex->src[i].src.ssa->parent_instr;
148bf215546Sopenharmony_ci         default:
149bf215546Sopenharmony_ci            break;
150bf215546Sopenharmony_ci         }
151bf215546Sopenharmony_ci      }
152bf215546Sopenharmony_ci      return NULL;
153bf215546Sopenharmony_ci   }
154bf215546Sopenharmony_ci
155bf215546Sopenharmony_ci   if (instr->type == nir_instr_type_intrinsic)
156bf215546Sopenharmony_ci      return get_intrinsic_resource(nir_instr_as_intrinsic(instr));
157bf215546Sopenharmony_ci
158bf215546Sopenharmony_ci   return NULL;
159bf215546Sopenharmony_ci}
160bf215546Sopenharmony_ci
161bf215546Sopenharmony_cistruct check_sources_state
162bf215546Sopenharmony_ci{
163bf215546Sopenharmony_ci   nir_block *block;
164bf215546Sopenharmony_ci   uint32_t first_index;
165bf215546Sopenharmony_ci};
166bf215546Sopenharmony_ci
167bf215546Sopenharmony_cistatic bool
168bf215546Sopenharmony_cihas_only_sources_less_than(nir_src *src, void *data)
169bf215546Sopenharmony_ci{
170bf215546Sopenharmony_ci   struct check_sources_state *state = (struct check_sources_state *)data;
171bf215546Sopenharmony_ci
172bf215546Sopenharmony_ci   /* true if nir_foreach_src should keep going */
173bf215546Sopenharmony_ci   return state->block != src->ssa->parent_instr->block ||
174bf215546Sopenharmony_ci          src->ssa->parent_instr->index < state->first_index;
175bf215546Sopenharmony_ci}
176bf215546Sopenharmony_ci
177bf215546Sopenharmony_cistatic void
178bf215546Sopenharmony_cigroup_loads(nir_instr *first, nir_instr *last)
179bf215546Sopenharmony_ci{
180bf215546Sopenharmony_ci   /* Walk the instruction range between the first and last backward, and
181bf215546Sopenharmony_ci    * move those that have no uses within the range after the last one.
182bf215546Sopenharmony_ci    */
183bf215546Sopenharmony_ci   for (nir_instr *instr = exec_node_data_backward(nir_instr,
184bf215546Sopenharmony_ci                                                   last->node.prev, node);
185bf215546Sopenharmony_ci        instr != first;
186bf215546Sopenharmony_ci        instr = exec_node_data_backward(nir_instr, instr->node.prev, node)) {
187bf215546Sopenharmony_ci      /* Only move instructions without side effects. */
188bf215546Sopenharmony_ci      if (!can_move(instr, first->pass_flags))
189bf215546Sopenharmony_ci         continue;
190bf215546Sopenharmony_ci
191bf215546Sopenharmony_ci      nir_ssa_def *def = nir_instr_ssa_def(instr);
192bf215546Sopenharmony_ci      if (def) {
193bf215546Sopenharmony_ci         bool all_uses_after_last = true;
194bf215546Sopenharmony_ci
195bf215546Sopenharmony_ci         nir_foreach_use(use, def) {
196bf215546Sopenharmony_ci            if (use->parent_instr->block == instr->block &&
197bf215546Sopenharmony_ci                use->parent_instr->index <= last->index) {
198bf215546Sopenharmony_ci               all_uses_after_last = false;
199bf215546Sopenharmony_ci               break;
200bf215546Sopenharmony_ci            }
201bf215546Sopenharmony_ci         }
202bf215546Sopenharmony_ci
203bf215546Sopenharmony_ci         if (all_uses_after_last) {
204bf215546Sopenharmony_ci            nir_instr *move_instr = instr;
205bf215546Sopenharmony_ci            /* Set the last instruction because we'll delete the current one. */
206bf215546Sopenharmony_ci            instr = exec_node_data_forward(nir_instr, instr->node.next, node);
207bf215546Sopenharmony_ci
208bf215546Sopenharmony_ci            /* Move the instruction after the last and update its index
209bf215546Sopenharmony_ci             * to indicate that it's after it.
210bf215546Sopenharmony_ci             */
211bf215546Sopenharmony_ci            nir_instr_move(nir_after_instr(last), move_instr);
212bf215546Sopenharmony_ci            move_instr->index = last->index + 1;
213bf215546Sopenharmony_ci         }
214bf215546Sopenharmony_ci      }
215bf215546Sopenharmony_ci   }
216bf215546Sopenharmony_ci
217bf215546Sopenharmony_ci   struct check_sources_state state;
218bf215546Sopenharmony_ci   state.block = first->block;
219bf215546Sopenharmony_ci   state.first_index = first->index;
220bf215546Sopenharmony_ci
221bf215546Sopenharmony_ci   /* Walk the instruction range between the first and last forward, and move
222bf215546Sopenharmony_ci    * those that have no sources within the range before the first one.
223bf215546Sopenharmony_ci    */
224bf215546Sopenharmony_ci   for (nir_instr *instr = exec_node_data_forward(nir_instr,
225bf215546Sopenharmony_ci                                                  first->node.next, node);
226bf215546Sopenharmony_ci        instr != last;
227bf215546Sopenharmony_ci        instr = exec_node_data_forward(nir_instr, instr->node.next, node)) {
228bf215546Sopenharmony_ci      /* Only move instructions without side effects. */
229bf215546Sopenharmony_ci      if (!can_move(instr, first->pass_flags))
230bf215546Sopenharmony_ci         continue;
231bf215546Sopenharmony_ci
232bf215546Sopenharmony_ci      if (nir_foreach_src(instr, has_only_sources_less_than, &state)) {
233bf215546Sopenharmony_ci         nir_instr *move_instr = instr;
234bf215546Sopenharmony_ci         /* Set the last instruction because we'll delete the current one. */
235bf215546Sopenharmony_ci         instr = exec_node_data_backward(nir_instr, instr->node.prev, node);
236bf215546Sopenharmony_ci
237bf215546Sopenharmony_ci         /* Move the instruction before the first and update its index
238bf215546Sopenharmony_ci          * to indicate that it's before it.
239bf215546Sopenharmony_ci          */
240bf215546Sopenharmony_ci         nir_instr_move(nir_before_instr(first), move_instr);
241bf215546Sopenharmony_ci         move_instr->index = first->index - 1;
242bf215546Sopenharmony_ci      }
243bf215546Sopenharmony_ci   }
244bf215546Sopenharmony_ci}
245bf215546Sopenharmony_ci
246bf215546Sopenharmony_cistatic bool
247bf215546Sopenharmony_ciis_pseudo_inst(nir_instr *instr)
248bf215546Sopenharmony_ci{
249bf215546Sopenharmony_ci   /* Other instructions do not usually contribute to the shader binary size. */
250bf215546Sopenharmony_ci   return instr->type != nir_instr_type_alu &&
251bf215546Sopenharmony_ci          instr->type != nir_instr_type_call &&
252bf215546Sopenharmony_ci          instr->type != nir_instr_type_tex &&
253bf215546Sopenharmony_ci          instr->type != nir_instr_type_intrinsic;
254bf215546Sopenharmony_ci}
255bf215546Sopenharmony_ci
256bf215546Sopenharmony_cistatic void
257bf215546Sopenharmony_ciset_instr_indices(nir_block *block)
258bf215546Sopenharmony_ci{
259bf215546Sopenharmony_ci   /* Start with 1 because we'll move instruction before the first one
260bf215546Sopenharmony_ci    * and will want to label it 0.
261bf215546Sopenharmony_ci    */
262bf215546Sopenharmony_ci   unsigned counter = 1;
263bf215546Sopenharmony_ci   nir_instr *last = NULL;
264bf215546Sopenharmony_ci
265bf215546Sopenharmony_ci   nir_foreach_instr(instr, block) {
266bf215546Sopenharmony_ci      /* Make sure grouped instructions don't have the same index as pseudo
267bf215546Sopenharmony_ci       * instructions.
268bf215546Sopenharmony_ci       */
269bf215546Sopenharmony_ci      if (last && is_pseudo_inst(last) && is_grouped_load(instr))
270bf215546Sopenharmony_ci          counter++;
271bf215546Sopenharmony_ci
272bf215546Sopenharmony_ci      /* Set each instruction's index within the block. */
273bf215546Sopenharmony_ci      instr->index = counter;
274bf215546Sopenharmony_ci
275bf215546Sopenharmony_ci      /* Only count non-pseudo instructions. */
276bf215546Sopenharmony_ci      if (!is_pseudo_inst(instr))
277bf215546Sopenharmony_ci         counter++;
278bf215546Sopenharmony_ci
279bf215546Sopenharmony_ci      last = instr;
280bf215546Sopenharmony_ci   }
281bf215546Sopenharmony_ci}
282bf215546Sopenharmony_ci
283bf215546Sopenharmony_cistatic void
284bf215546Sopenharmony_cihandle_load_range(nir_instr **first, nir_instr **last,
285bf215546Sopenharmony_ci                  nir_instr *current, unsigned max_distance)
286bf215546Sopenharmony_ci{
287bf215546Sopenharmony_ci   if (*first && *last &&
288bf215546Sopenharmony_ci       (!current || current->index > (*first)->index + max_distance)) {
289bf215546Sopenharmony_ci      assert(*first != *last);
290bf215546Sopenharmony_ci      group_loads(*first, *last);
291bf215546Sopenharmony_ci      set_instr_indices((*first)->block);
292bf215546Sopenharmony_ci      *first = NULL;
293bf215546Sopenharmony_ci      *last = NULL;
294bf215546Sopenharmony_ci   }
295bf215546Sopenharmony_ci}
296bf215546Sopenharmony_ci
297bf215546Sopenharmony_cistatic bool
298bf215546Sopenharmony_ciis_barrier(nir_instr *instr)
299bf215546Sopenharmony_ci{
300bf215546Sopenharmony_ci   if (instr->type == nir_instr_type_intrinsic) {
301bf215546Sopenharmony_ci      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
302bf215546Sopenharmony_ci      const char *name = nir_intrinsic_infos[intr->intrinsic].name;
303bf215546Sopenharmony_ci
304bf215546Sopenharmony_ci
305bf215546Sopenharmony_ci      if (intr->intrinsic == nir_intrinsic_discard ||
306bf215546Sopenharmony_ci          intr->intrinsic == nir_intrinsic_discard_if ||
307bf215546Sopenharmony_ci          intr->intrinsic == nir_intrinsic_terminate ||
308bf215546Sopenharmony_ci          intr->intrinsic == nir_intrinsic_terminate_if ||
309bf215546Sopenharmony_ci          /* TODO: nir_intrinsics.py could do this */
310bf215546Sopenharmony_ci          strstr(name, "barrier"))
311bf215546Sopenharmony_ci         return true;
312bf215546Sopenharmony_ci   }
313bf215546Sopenharmony_ci
314bf215546Sopenharmony_ci   return false;
315bf215546Sopenharmony_ci}
316bf215546Sopenharmony_ci
317bf215546Sopenharmony_cistruct indirection_state
318bf215546Sopenharmony_ci{
319bf215546Sopenharmony_ci   nir_block *block;
320bf215546Sopenharmony_ci   unsigned indirections;
321bf215546Sopenharmony_ci};
322bf215546Sopenharmony_ci
323bf215546Sopenharmony_cistatic unsigned
324bf215546Sopenharmony_ciget_num_indirections(nir_instr *instr);
325bf215546Sopenharmony_ci
326bf215546Sopenharmony_cistatic bool
327bf215546Sopenharmony_cigather_indirections(nir_src *src, void *data)
328bf215546Sopenharmony_ci{
329bf215546Sopenharmony_ci   struct indirection_state *state = (struct indirection_state *)data;
330bf215546Sopenharmony_ci   nir_instr *instr = src->ssa->parent_instr;
331bf215546Sopenharmony_ci
332bf215546Sopenharmony_ci   /* We only count indirections within the same block. */
333bf215546Sopenharmony_ci   if (instr->block == state->block) {
334bf215546Sopenharmony_ci      unsigned indirections = get_num_indirections(src->ssa->parent_instr);
335bf215546Sopenharmony_ci
336bf215546Sopenharmony_ci      if (instr->type == nir_instr_type_tex || is_memory_load(instr))
337bf215546Sopenharmony_ci         indirections++;
338bf215546Sopenharmony_ci
339bf215546Sopenharmony_ci      state->indirections = MAX2(state->indirections, indirections);
340bf215546Sopenharmony_ci   }
341bf215546Sopenharmony_ci
342bf215546Sopenharmony_ci   return true; /* whether nir_foreach_src should keep going */
343bf215546Sopenharmony_ci}
344bf215546Sopenharmony_ci
345bf215546Sopenharmony_ci/* Return the number of load indirections within the block. */
346bf215546Sopenharmony_cistatic unsigned
347bf215546Sopenharmony_ciget_num_indirections(nir_instr *instr)
348bf215546Sopenharmony_ci{
349bf215546Sopenharmony_ci   /* Don't traverse phis because we could end up in an infinite recursion
350bf215546Sopenharmony_ci    * if the phi points to the current block (such as a loop body).
351bf215546Sopenharmony_ci    */
352bf215546Sopenharmony_ci   if (instr->type == nir_instr_type_phi)
353bf215546Sopenharmony_ci      return 0;
354bf215546Sopenharmony_ci
355bf215546Sopenharmony_ci   if (instr->index != UINT32_MAX)
356bf215546Sopenharmony_ci      return instr->index; /* we've visited this instruction before */
357bf215546Sopenharmony_ci
358bf215546Sopenharmony_ci   struct indirection_state state;
359bf215546Sopenharmony_ci   state.block = instr->block;
360bf215546Sopenharmony_ci   state.indirections = 0;
361bf215546Sopenharmony_ci
362bf215546Sopenharmony_ci   nir_foreach_src(instr, gather_indirections, &state);
363bf215546Sopenharmony_ci
364bf215546Sopenharmony_ci   instr->index = state.indirections;
365bf215546Sopenharmony_ci   return state.indirections;
366bf215546Sopenharmony_ci}
367bf215546Sopenharmony_ci
368bf215546Sopenharmony_cistatic void
369bf215546Sopenharmony_ciprocess_block(nir_block *block, nir_load_grouping grouping,
370bf215546Sopenharmony_ci              unsigned max_distance)
371bf215546Sopenharmony_ci{
372bf215546Sopenharmony_ci   int max_indirection = -1;
373bf215546Sopenharmony_ci   unsigned num_inst_per_level[256] = {0};
374bf215546Sopenharmony_ci
375bf215546Sopenharmony_ci   /* UINT32_MAX means the instruction has not been visited. Once
376bf215546Sopenharmony_ci    * an instruction has been visited and its indirection level has been
377bf215546Sopenharmony_ci    * determined, we'll store the indirection level in the index. The next
378bf215546Sopenharmony_ci    * instruction that visits it will use the index instead of recomputing
379bf215546Sopenharmony_ci    * the indirection level, which would result in an exponetial time
380bf215546Sopenharmony_ci    * complexity.
381bf215546Sopenharmony_ci    */
382bf215546Sopenharmony_ci   nir_foreach_instr(instr, block) {
383bf215546Sopenharmony_ci      instr->index = UINT32_MAX; /* unknown */
384bf215546Sopenharmony_ci   }
385bf215546Sopenharmony_ci
386bf215546Sopenharmony_ci   /* Count the number of load indirections for each load instruction
387bf215546Sopenharmony_ci    * within this block. Store it in pass_flags.
388bf215546Sopenharmony_ci    */
389bf215546Sopenharmony_ci   nir_foreach_instr(instr, block) {
390bf215546Sopenharmony_ci      if (is_grouped_load(instr)) {
391bf215546Sopenharmony_ci         unsigned indirections = get_num_indirections(instr);
392bf215546Sopenharmony_ci
393bf215546Sopenharmony_ci         /* pass_flags has only 8 bits */
394bf215546Sopenharmony_ci         indirections = MIN2(indirections, 255);
395bf215546Sopenharmony_ci         num_inst_per_level[indirections]++;
396bf215546Sopenharmony_ci         instr->pass_flags = indirections;
397bf215546Sopenharmony_ci
398bf215546Sopenharmony_ci         max_indirection = MAX2(max_indirection, (int)indirections);
399bf215546Sopenharmony_ci      }
400bf215546Sopenharmony_ci   }
401bf215546Sopenharmony_ci
402bf215546Sopenharmony_ci   /* 255 contains all indirection levels >= 255, so ignore them. */
403bf215546Sopenharmony_ci   max_indirection = MIN2(max_indirection, 254);
404bf215546Sopenharmony_ci
405bf215546Sopenharmony_ci   /* Each indirection level is grouped. */
406bf215546Sopenharmony_ci   for (int level = 0; level <= max_indirection; level++) {
407bf215546Sopenharmony_ci      if (num_inst_per_level[level] <= 1)
408bf215546Sopenharmony_ci         continue;
409bf215546Sopenharmony_ci
410bf215546Sopenharmony_ci      set_instr_indices(block);
411bf215546Sopenharmony_ci
412bf215546Sopenharmony_ci      nir_instr *resource = NULL;
413bf215546Sopenharmony_ci      nir_instr *first_load = NULL, *last_load = NULL;
414bf215546Sopenharmony_ci
415bf215546Sopenharmony_ci      /* Find the first and last instruction that use the same
416bf215546Sopenharmony_ci       * resource and are within a certain distance of each other.
417bf215546Sopenharmony_ci       * If found, group them by moving all movable instructions
418bf215546Sopenharmony_ci       * between them out.
419bf215546Sopenharmony_ci       */
420bf215546Sopenharmony_ci      nir_foreach_instr(current, block) {
421bf215546Sopenharmony_ci         /* Don't group across barriers. */
422bf215546Sopenharmony_ci         if (is_barrier(current)) {
423bf215546Sopenharmony_ci            /* Group unconditionally.  */
424bf215546Sopenharmony_ci            handle_load_range(&first_load, &last_load, NULL, 0);
425bf215546Sopenharmony_ci            first_load = NULL;
426bf215546Sopenharmony_ci            last_load = NULL;
427bf215546Sopenharmony_ci            continue;
428bf215546Sopenharmony_ci         }
429bf215546Sopenharmony_ci
430bf215546Sopenharmony_ci         /* Only group load instructions with the same indirection level. */
431bf215546Sopenharmony_ci         if (is_grouped_load(current) && current->pass_flags == level) {
432bf215546Sopenharmony_ci            nir_instr *current_resource;
433bf215546Sopenharmony_ci
434bf215546Sopenharmony_ci            switch (grouping) {
435bf215546Sopenharmony_ci            case nir_group_all:
436bf215546Sopenharmony_ci               if (!first_load)
437bf215546Sopenharmony_ci                  first_load = current;
438bf215546Sopenharmony_ci               else
439bf215546Sopenharmony_ci                  last_load = current;
440bf215546Sopenharmony_ci               break;
441bf215546Sopenharmony_ci
442bf215546Sopenharmony_ci            case nir_group_same_resource_only:
443bf215546Sopenharmony_ci               current_resource = get_uniform_inst_resource(current);
444bf215546Sopenharmony_ci
445bf215546Sopenharmony_ci               if (current_resource) {
446bf215546Sopenharmony_ci                  if (!first_load) {
447bf215546Sopenharmony_ci                     first_load = current;
448bf215546Sopenharmony_ci                     resource = current_resource;
449bf215546Sopenharmony_ci                  } else if (current_resource == resource) {
450bf215546Sopenharmony_ci                     last_load = current;
451bf215546Sopenharmony_ci                  }
452bf215546Sopenharmony_ci               }
453bf215546Sopenharmony_ci            }
454bf215546Sopenharmony_ci         }
455bf215546Sopenharmony_ci
456bf215546Sopenharmony_ci         /* Group only if we exceeded the maximum distance. */
457bf215546Sopenharmony_ci         handle_load_range(&first_load, &last_load, current, max_distance);
458bf215546Sopenharmony_ci      }
459bf215546Sopenharmony_ci
460bf215546Sopenharmony_ci      /* Group unconditionally.  */
461bf215546Sopenharmony_ci      handle_load_range(&first_load, &last_load, NULL, 0);
462bf215546Sopenharmony_ci   }
463bf215546Sopenharmony_ci}
464bf215546Sopenharmony_ci
465bf215546Sopenharmony_ci/* max_distance is the maximum distance between the first and last instruction
466bf215546Sopenharmony_ci * in a group.
467bf215546Sopenharmony_ci */
468bf215546Sopenharmony_civoid
469bf215546Sopenharmony_cinir_group_loads(nir_shader *shader, nir_load_grouping grouping,
470bf215546Sopenharmony_ci                unsigned max_distance)
471bf215546Sopenharmony_ci{
472bf215546Sopenharmony_ci   nir_foreach_function(function, shader) {
473bf215546Sopenharmony_ci      if (function->impl) {
474bf215546Sopenharmony_ci         nir_foreach_block(block, function->impl) {
475bf215546Sopenharmony_ci            process_block(block, grouping, max_distance);
476bf215546Sopenharmony_ci         }
477bf215546Sopenharmony_ci
478bf215546Sopenharmony_ci         nir_metadata_preserve(function->impl,
479bf215546Sopenharmony_ci                               nir_metadata_block_index |
480bf215546Sopenharmony_ci                               nir_metadata_dominance |
481bf215546Sopenharmony_ci                               nir_metadata_loop_analysis);
482bf215546Sopenharmony_ci      }
483bf215546Sopenharmony_ci   }
484bf215546Sopenharmony_ci}
485