/*
 * Copyright © 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* This is a new block-level load instruction scheduler where loads are grouped
 * according to their indirection level within a basic block. An indirection
 * is when a result of one load is used as a source of another load. The result
 * is that disjoint ALU opcode groups and load (texture) opcode groups are
 * created, where each successive load group is the next level of indirection.
 * This is done by finding the first and last load with the same indirection
 * level, and moving all unrelated instructions between them to after the last
 * load, except for load sources, which are moved to before the first load.
 * It naturally suits hardware that has limits on texture indirections, but
 * other hardware can benefit too. Only texture, image, and SSBO load
 * instructions are grouped.
 *
 * There is an option to group only those loads that use the same resource
 * variable. This increases the chance to get more cache hits than if the loads
 * were spread out.
 *
 * The increased register usage is offset by the increase in observed memory
 * bandwidth due to more cache hits (dependent on hw behavior), which decreases
 * the subgroup lifetime and thus allows registers to be deallocated and reused
 * sooner. In some bandwidth-bound cases, low register usage doesn't help
 * performance at all. Doubling the register usage and using those registers
 * to amplify observed bandwidth can improve performance a lot.
 *
 * It's recommended to run a hw-specific instruction scheduler after this pass
 * to prevent spilling.
 */
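
/* A simplified, made-up illustration of the transformation for one
 * indirection level (not actual NIR output):
 *
 *    Before grouping:                  After grouping:
 *       a = load_ssbo ...   (level 0)     a = load_ssbo ...   (level 0)
 *       b = fadd a, 1.0                   c = load_ssbo ...   (level 0)
 *       c = load_ssbo ...   (level 0)     b = fadd a, 1.0
 *       d = fmul b, c                     d = fmul b, c
 *       e = load_ssbo ... b (level 1)     e = load_ssbo ... b (level 1)
 *
 * The two level-0 loads become adjacent, and the ALU work that only consumes
 * their results is pushed below the last of them; "e" sources a value derived
 * from a level-0 load, so it belongs to the next indirection level and is
 * handled when that level is processed.
 */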

#include "nir.h"

static bool
is_memory_load(nir_instr *instr)
{
   /* Count texture_size too because it has the same latency as cache hits. */
   if (instr->type == nir_instr_type_tex)
      return true;

   if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
      const char *name = nir_intrinsic_infos[intr->intrinsic].name;

      /* TODO: nir_intrinsics.py could do this */
      /* load_ubo is ignored because it's usually cheap. */
      if (!nir_intrinsic_writes_external_memory(intr) &&
          !strstr(name, "shared") &&
          (strstr(name, "ssbo") || strstr(name, "image")))
         return true;
   }

   return false;
}

static nir_instr *
get_intrinsic_resource(nir_intrinsic_instr *intr)
{
   /* This is also the list of intrinsics that are grouped. */
   /* load_ubo is ignored because it's usually cheap. */
   switch (intr->intrinsic) {
   case nir_intrinsic_image_load:
   case nir_intrinsic_image_deref_load:
   case nir_intrinsic_image_sparse_load:
   case nir_intrinsic_image_deref_sparse_load:
   /* Group image_size too because it has the same latency as cache hits. */
   case nir_intrinsic_image_size:
   case nir_intrinsic_image_deref_size:
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_bindless_image_sparse_load:
   case nir_intrinsic_load_ssbo:
      return intr->src[0].ssa->parent_instr;
   default:
      return NULL;
   }
}

/* Track only those that we want to group. */
static bool
is_grouped_load(nir_instr *instr)
{
   /* Count texture_size too because it has the same latency as cache hits. */
   if (instr->type == nir_instr_type_tex)
      return true;

   if (instr->type == nir_instr_type_intrinsic)
      return get_intrinsic_resource(nir_instr_as_intrinsic(instr)) != NULL;

   return false;
}

static bool
can_move(nir_instr *instr, uint8_t current_indirection_level)
{
   /* Grouping is done by moving everything else out of the first/last
    * instruction range of the indirection level.
    */
   if (is_grouped_load(instr) && instr->pass_flags == current_indirection_level)
      return false;

   if (instr->type == nir_instr_type_alu ||
       instr->type == nir_instr_type_deref ||
       instr->type == nir_instr_type_tex ||
       instr->type == nir_instr_type_load_const ||
       instr->type == nir_instr_type_ssa_undef)
      return true;

   if (instr->type == nir_instr_type_intrinsic &&
       nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr)))
      return true;

   return false;
}

static nir_instr *
get_uniform_inst_resource(nir_instr *instr)
{
   if (instr->type == nir_instr_type_tex) {
      nir_tex_instr *tex = nir_instr_as_tex(instr);

      if (tex->texture_non_uniform)
         return NULL;

      for (unsigned i = 0; i < tex->num_srcs; i++) {
         switch (tex->src[i].src_type) {
         case nir_tex_src_texture_deref:
         case nir_tex_src_texture_handle:
            return tex->src[i].src.ssa->parent_instr;
         default:
            break;
         }
      }
      return NULL;
   }

   if (instr->type == nir_instr_type_intrinsic)
      return get_intrinsic_resource(nir_instr_as_intrinsic(instr));

   return NULL;
}

struct check_sources_state
{
   nir_block *block;
   uint32_t first_index;
};

static bool
has_only_sources_less_than(nir_src *src, void *data)
{
   struct check_sources_state *state = (struct check_sources_state *)data;

   /* true if nir_foreach_src should keep going */
   return state->block != src->ssa->parent_instr->block ||
          src->ssa->parent_instr->index < state->first_index;
}

static void
group_loads(nir_instr *first, nir_instr *last)
{
   /* Walk the instruction range between the first and last backward, and
    * move those that have no uses within the range after the last one.
    */
   for (nir_instr *instr = exec_node_data_backward(nir_instr,
                                                   last->node.prev, node);
        instr != first;
        instr = exec_node_data_backward(nir_instr, instr->node.prev, node)) {
      /* Only move instructions without side effects. */
      if (!can_move(instr, first->pass_flags))
         continue;

      nir_ssa_def *def = nir_instr_ssa_def(instr);
      if (def) {
         bool all_uses_after_last = true;

         nir_foreach_use(use, def) {
            if (use->parent_instr->block == instr->block &&
                use->parent_instr->index <= last->index) {
               all_uses_after_last = false;
               break;
            }
         }

         if (all_uses_after_last) {
            nir_instr *move_instr = instr;
            /* Step to the next instruction first because moving the current
             * one would break the backward iteration.
             */
            instr = exec_node_data_forward(nir_instr, instr->node.next, node);

            /* Move the instruction after the last and update its index
             * to indicate that it's after it.
             */
            nir_instr_move(nir_after_instr(last), move_instr);
            move_instr->index = last->index + 1;
         }
      }
   }

   struct check_sources_state state;
   state.block = first->block;
   state.first_index = first->index;

   /* Walk the instruction range between the first and last forward, and move
    * those that have no sources within the range before the first one.
    */
   for (nir_instr *instr = exec_node_data_forward(nir_instr,
                                                  first->node.next, node);
        instr != last;
        instr = exec_node_data_forward(nir_instr, instr->node.next, node)) {
      /* Only move instructions without side effects. */
      if (!can_move(instr, first->pass_flags))
         continue;

      if (nir_foreach_src(instr, has_only_sources_less_than, &state)) {
         nir_instr *move_instr = instr;
         /* Step to the previous instruction first because moving the current
          * one would break the forward iteration.
          */
         instr = exec_node_data_backward(nir_instr, instr->node.prev, node);

         /* Move the instruction before the first and update its index
          * to indicate that it's before it.
          */
         nir_instr_move(nir_before_instr(first), move_instr);
         move_instr->index = first->index - 1;
      }
   }
}

static bool
is_pseudo_inst(nir_instr *instr)
{
   /* Instruction types other than ALU, call, tex, and intrinsic usually
    * don't contribute to the shader binary size.
    */
   return instr->type != nir_instr_type_alu &&
          instr->type != nir_instr_type_call &&
          instr->type != nir_instr_type_tex &&
          instr->type != nir_instr_type_intrinsic;
}

static void
set_instr_indices(nir_block *block)
{
   /* Start at 1 because we may move instructions before the first one
    * and will want to label them 0.
    */
   unsigned counter = 1;
   nir_instr *last = NULL;

   nir_foreach_instr(instr, block) {
      /* Make sure grouped instructions don't have the same index as pseudo
       * instructions.
       */
      if (last && is_pseudo_inst(last) && is_grouped_load(instr))
         counter++;

      /* Set each instruction's index within the block. */
      instr->index = counter;

      /* Only count non-pseudo instructions. */
      if (!is_pseudo_inst(instr))
         counter++;

      last = instr;
   }
}

static void
handle_load_range(nir_instr **first, nir_instr **last,
                  nir_instr *current, unsigned max_distance)
{
   if (*first && *last &&
       (!current || current->index > (*first)->index + max_distance)) {
      assert(*first != *last);
      group_loads(*first, *last);
      set_instr_indices((*first)->block);
      *first = NULL;
      *last = NULL;
   }
}

static bool
is_barrier(nir_instr *instr)
{
   if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
      const char *name = nir_intrinsic_infos[intr->intrinsic].name;

      if (intr->intrinsic == nir_intrinsic_discard ||
          intr->intrinsic == nir_intrinsic_discard_if ||
          intr->intrinsic == nir_intrinsic_terminate ||
          intr->intrinsic == nir_intrinsic_terminate_if ||
          /* TODO: nir_intrinsics.py could do this */
          strstr(name, "barrier"))
         return true;
   }

   return false;
}

struct indirection_state
{
   nir_block *block;
   unsigned indirections;
};

static unsigned
get_num_indirections(nir_instr *instr);

static bool
gather_indirections(nir_src *src, void *data)
{
   struct indirection_state *state = (struct indirection_state *)data;
   nir_instr *instr = src->ssa->parent_instr;

   /* We only count indirections within the same block. */
   if (instr->block == state->block) {
      unsigned indirections = get_num_indirections(src->ssa->parent_instr);

      if (instr->type == nir_instr_type_tex || is_memory_load(instr))
         indirections++;

      state->indirections = MAX2(state->indirections, indirections);
   }

   return true; /* whether nir_foreach_src should keep going */
}

/* Return the number of load indirections within the block. */
static unsigned
get_num_indirections(nir_instr *instr)
{
   /* Don't traverse phis because we could end up in an infinite recursion
    * if the phi points to the current block (such as a loop body).
    */
   if (instr->type == nir_instr_type_phi)
      return 0;

   if (instr->index != UINT32_MAX)
      return instr->index; /* we've visited this instruction before */

   struct indirection_state state;
   state.block = instr->block;
   state.indirections = 0;

   nir_foreach_src(instr, gather_indirections, &state);

   instr->index = state.indirections;
   return state.indirections;
}

static void
process_block(nir_block *block, nir_load_grouping grouping,
              unsigned max_distance)
{
   int max_indirection = -1;
   unsigned num_inst_per_level[256] = {0};

   /* UINT32_MAX means the instruction has not been visited. Once
    * an instruction has been visited and its indirection level has been
    * determined, we'll store the indirection level in the index. The next
    * instruction that visits it will use the index instead of recomputing
    * the indirection level, which would result in an exponential time
    * complexity.
    */
   nir_foreach_instr(instr, block) {
      instr->index = UINT32_MAX; /* unknown */
   }

   /* Count the number of load indirections for each load instruction
    * within this block. Store it in pass_flags.
    */
   nir_foreach_instr(instr, block) {
      if (is_grouped_load(instr)) {
         unsigned indirections = get_num_indirections(instr);

         /* pass_flags has only 8 bits */
         indirections = MIN2(indirections, 255);
         num_inst_per_level[indirections]++;
         instr->pass_flags = indirections;

         max_indirection = MAX2(max_indirection, (int)indirections);
      }
   }

   /* Level 255 contains all indirection levels >= 255, so don't group it. */
   max_indirection = MIN2(max_indirection, 254);

   /* Each indirection level is grouped. */
   for (int level = 0; level <= max_indirection; level++) {
      if (num_inst_per_level[level] <= 1)
         continue;

      set_instr_indices(block);

      nir_instr *resource = NULL;
      nir_instr *first_load = NULL, *last_load = NULL;

      /* Find the first and last instruction that use the same
       * resource and are within a certain distance of each other.
       * If found, group them by moving all movable instructions
       * between them out.
       */
      nir_foreach_instr(current, block) {
         /* Don't group across barriers. */
         if (is_barrier(current)) {
            /* Group the pending loads unconditionally. */
            handle_load_range(&first_load, &last_load, NULL, 0);
            first_load = NULL;
            last_load = NULL;
            continue;
         }

         /* Only group load instructions with the same indirection level. */
         if (is_grouped_load(current) && current->pass_flags == level) {
            nir_instr *current_resource;

            switch (grouping) {
            case nir_group_all:
               if (!first_load)
                  first_load = current;
               else
                  last_load = current;
               break;

            case nir_group_same_resource_only:
               current_resource = get_uniform_inst_resource(current);

               if (current_resource) {
                  if (!first_load) {
                     first_load = current;
                     resource = current_resource;
                  } else if (current_resource == resource) {
                     last_load = current;
                  }
               }
            }
         }

         /* Group the pending loads if we have exceeded the maximum distance. */
         handle_load_range(&first_load, &last_load, current, max_distance);
      }

      /* Group the pending loads unconditionally. */
      handle_load_range(&first_load, &last_load, NULL, 0);
   }
}

/* max_distance is the maximum distance between the first and last instruction
 * in a group.
 */
void
nir_group_loads(nir_shader *shader, nir_load_grouping grouping,
                unsigned max_distance)
{
   nir_foreach_function(function, shader) {
      if (function->impl) {
         nir_foreach_block(block, function->impl) {
            process_block(block, grouping, max_distance);
         }

         nir_metadata_preserve(function->impl,
                               nir_metadata_block_index |
                               nir_metadata_dominance |
                               nir_metadata_loop_analysis);
      }
   }
}
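
/* Typical usage from a driver might look like this (illustrative only; the
 * grouping mode and max_distance value are driver-specific choices, not
 * requirements of this pass):
 *
 *    NIR_PASS_V(nir, nir_group_loads, nir_group_same_resource_only, 32);
 *
 * ideally followed by the backend's own instruction scheduler so that the
 * extra register pressure created by the grouped loads doesn't lead to
 * spilling.
 */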