/*
 * Copyright © 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "nir.h"
#include "nir_vla.h"
/* Lowering for amul instructions, for drivers that support imul24.
 * This pass analyzes indirect derefs and converts the corresponding
 * amul instructions to either imul or imul24, depending on the
 * required range:
 *
 * 1) Analyze the uniform variables and build a table of UBOs and SSBOs
 *    that are either too large, or might be too large (unknown size),
 *    for imul24.
 *
 * 2) Loop through all the intrinsics, finding dereferences of large
 *    variables, and recursively replacing all amul instructions
 *    involved with imul.
 *
 * 3) Finally loop through all instructions again, replacing any
 *    remaining amul with imul24.  At this point any remaining amul
 *    instructions are not involved in calculating an offset into a
 *    large variable, thanks to the 2nd step, so they can safely be
 *    replaced with imul24.
 *
 * Using two passes over all the instructions lets us handle the case
 * where, due to CSE, an amul is used to calculate an offset into both
 * a large and a small variable.
 */
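
/* As an illustrative sketch (not real compiler output): given a large SSBO,
 * an offset calculation like
 *
 *    ssa_2 = amul ssa_0, ssa_1
 *    intrinsic store_ssbo (ssa_3, ssa_4, ssa_2) ...
 *
 * has its amul rewritten to imul, while an amul whose result only ever
 * feeds offsets into small buffers is rewritten to imul24.
 */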

typedef struct {
   nir_shader *shader;

   int (*type_size)(const struct glsl_type *, bool);

   /* Tables of UBOs and SSBOs mapping driver_location/base to whether
    * they are too large to use imul24:
    */
   bool *large_ubos;
   bool *large_ssbos;

   /* For cases where we cannot determine the UBO/SSBO index, track if
    * *any* UBO/SSBO is too large for imul24:
    */
   bool has_large_ubo;
   bool has_large_ssbo;

   unsigned max_slot;

   bool progress;
} lower_state;

/* Lower 'amul's in the offset src of large variables to 'imul': */
static bool
lower_large_src(nir_src *src, void *s)
{
   lower_state *state = s;

   assert(src->is_ssa);

   nir_instr *parent = src->ssa->parent_instr;

   /* No need to visit instructions we've already visited; this also
    * avoids infinite recursion when phis are involved:
    */
   if (parent->pass_flags)
      return false;

   nir_foreach_src(parent, lower_large_src, state);

   if (parent->type == nir_instr_type_alu) {
      nir_alu_instr *alu = nir_instr_as_alu(parent);
      if (alu->op == nir_op_amul) {
         alu->op = nir_op_imul;
         state->progress = true;
      }
   }

   parent->pass_flags = 1;

   return true;
}

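/* Helpers to decide whether the buffer referenced by an index src might be
 * too large for imul24.  When the index is not a compile-time constant we
 * can't know which buffer is accessed, so fall back to the conservative
 * has_large_ubo/has_large_ssbo flags:
 */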
static bool
large_ubo(lower_state *state, nir_src src)
{
   if (!nir_src_is_const(src))
      return state->has_large_ubo;
   unsigned idx = nir_src_as_uint(src);
   assert(idx < state->shader->info.num_ubos);
   return state->large_ubos[idx];
}

static bool
large_ssbo(lower_state *state, nir_src src)
{
   if (!nir_src_is_const(src))
      return state->has_large_ssbo;
   unsigned idx = nir_src_as_uint(src);
   assert(idx < state->shader->info.num_ssbos);
   return state->large_ssbos[idx];
}

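/* For each intrinsic that accesses a UBO/SSBO/global address, lower the
 * amuls feeding its offset src if the buffer is (or might be) too large
 * for imul24:
 */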
static void
lower_intrinsic(lower_state *state, nir_intrinsic_instr *intr)
{
   switch (intr->intrinsic) {
   case nir_intrinsic_load_ubo:
      //# src[] = { buffer_index, offset }.
      if (large_ubo(state, intr->src[0]))
         lower_large_src(&intr->src[1], state);
      return;

   case nir_intrinsic_load_ssbo:
      //# src[] = { buffer_index, offset }.
      if (large_ssbo(state, intr->src[0]))
         lower_large_src(&intr->src[1], state);
      return;

   case nir_intrinsic_store_ssbo:
      //# src[] = { value, block_index, offset }
      if (large_ssbo(state, intr->src[1]))
         lower_large_src(&intr->src[2], state);
      return;

   case nir_intrinsic_ssbo_atomic_add:
   case nir_intrinsic_ssbo_atomic_imin:
   case nir_intrinsic_ssbo_atomic_umin:
   case nir_intrinsic_ssbo_atomic_imax:
   case nir_intrinsic_ssbo_atomic_umax:
   case nir_intrinsic_ssbo_atomic_and:
   case nir_intrinsic_ssbo_atomic_or:
   case nir_intrinsic_ssbo_atomic_xor:
   case nir_intrinsic_ssbo_atomic_exchange:
   case nir_intrinsic_ssbo_atomic_comp_swap:
   case nir_intrinsic_ssbo_atomic_fadd:
   case nir_intrinsic_ssbo_atomic_fmin:
   case nir_intrinsic_ssbo_atomic_fmax:
   case nir_intrinsic_ssbo_atomic_fcomp_swap:
      /* 0: SSBO index
       * 1: offset
       */
      if (large_ssbo(state, intr->src[0]))
         lower_large_src(&intr->src[1], state);
      return;

   case nir_intrinsic_global_atomic_add:
   case nir_intrinsic_global_atomic_imin:
   case nir_intrinsic_global_atomic_umin:
   case nir_intrinsic_global_atomic_imax:
   case nir_intrinsic_global_atomic_umax:
   case nir_intrinsic_global_atomic_and:
   case nir_intrinsic_global_atomic_or:
   case nir_intrinsic_global_atomic_xor:
   case nir_intrinsic_global_atomic_exchange:
   case nir_intrinsic_global_atomic_comp_swap:
   case nir_intrinsic_global_atomic_fadd:
   case nir_intrinsic_global_atomic_fmin:
   case nir_intrinsic_global_atomic_fmax:
   case nir_intrinsic_global_atomic_fcomp_swap:
   case nir_intrinsic_load_global_constant:
   case nir_intrinsic_load_global:
      /* Just assume that 24b is not sufficient: */
      lower_large_src(&intr->src[0], state);
      return;

   case nir_intrinsic_store_global:
      /* Just assume that 24b is not sufficient: */
      lower_large_src(&intr->src[1], state);
      return;

   /* These should all be small enough to unconditionally use imul24: */
   case nir_intrinsic_shared_atomic_add:
   case nir_intrinsic_shared_atomic_imin:
   case nir_intrinsic_shared_atomic_umin:
   case nir_intrinsic_shared_atomic_imax:
   case nir_intrinsic_shared_atomic_umax:
   case nir_intrinsic_shared_atomic_and:
   case nir_intrinsic_shared_atomic_or:
   case nir_intrinsic_shared_atomic_xor:
   case nir_intrinsic_shared_atomic_exchange:
   case nir_intrinsic_shared_atomic_comp_swap:
   case nir_intrinsic_shared_atomic_fadd:
   case nir_intrinsic_shared_atomic_fmin:
   case nir_intrinsic_shared_atomic_fmax:
   case nir_intrinsic_shared_atomic_fcomp_swap:
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_output:
   case nir_intrinsic_store_output:
   default:
      return;
   }
}

static void
lower_instr(lower_state *state, nir_instr *instr)
{
   if (instr->type == nir_instr_type_intrinsic) {
      lower_intrinsic(state, nir_instr_as_intrinsic(instr));
   }
}

static bool
is_large(lower_state *state, nir_variable *var)
{
   const struct glsl_type *type = glsl_without_array(var->type);
   unsigned size = state->type_size(type, false);

   /* If size is not known (i.e. VLA) then assume the worst: */
   if (!size)
      return true;

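   /* imul24 sign-extends its sources from 24 bits, so (assuming
    * non-negative offsets) anything that must index past 2^23 needs
    * a full imul:
    */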
   return size >= (1 << 23);
}

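/* Entry point.  The type_size callback has the same signature as the one
 * passed to nir_lower_io(), and is used here to compute variable sizes.
 * A driver with imul24 support might invoke the pass along the lines of
 * (hypothetical callback name):
 *
 *    NIR_PASS(progress, nir, nir_lower_amul, my_glsl_type_size);
 */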
bool
nir_lower_amul(nir_shader *shader,
               int (*type_size)(const struct glsl_type *, bool))
{
   assert(shader->options->has_imul24);
   assert(type_size);

   NIR_VLA_FILL(bool, large_ubos, shader->info.num_ubos, 0);
   NIR_VLA_FILL(bool, large_ssbos, shader->info.num_ssbos, 0);

   lower_state state = {
      .shader = shader,
      .type_size = type_size,
      .large_ubos = large_ubos,
      .large_ssbos = large_ssbos,
   };

   /* Figure out which UBOs or SSBOs are large enough to be
    * disqualified from imul24:
    */
   nir_foreach_variable_in_shader (var, shader) {
      if (var->data.mode == nir_var_mem_ubo) {
         if (is_large(&state, var)) {
            state.has_large_ubo = true;
            unsigned size = MAX2(1, glsl_array_size(var->type));
            for (unsigned i = 0; i < size; i++)
               state.large_ubos[var->data.binding + i] = true;
         }
      } else if (var->data.mode == nir_var_mem_ssbo) {
         if (is_large(&state, var)) {
            state.has_large_ssbo = true;
            unsigned size = MAX2(1, glsl_array_size(var->type));
            for (unsigned i = 0; i < size; i++)
               state.large_ssbos[var->data.binding + i] = true;
         }
      }
   }

   /* Clear pass flags: */
   nir_foreach_function(function, shader) {
      nir_function_impl *impl = function->impl;
      if (!impl)
         continue;

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            instr->pass_flags = 0;
         }
      }
   }

   nir_foreach_function(function, shader) {
      nir_function_impl *impl = function->impl;

      if (!impl)
         continue;

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            lower_instr(&state, instr);
         }
      }
   }

   /* At this point, all 'amul's used in calculating an offset into
    * a large variable have been replaced with 'imul'.  So the remaining
    * 'amul's can be replaced with 'imul24'.
    *
    * Note the exception for 64b results (such as load/store_global,
    * where the address size is 64b): imul24 cannot have a 64b bit size,
    * so those fall back to imul.
    */
   nir_foreach_function(function, shader) {
      nir_function_impl *impl = function->impl;

      if (!impl)
         continue;

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_alu)
               continue;

            nir_alu_instr *alu = nir_instr_as_alu(instr);
            if (alu->op != nir_op_amul)
               continue;

            if (nir_dest_bit_size(alu->dest.dest) <= 32)
               alu->op = nir_op_imul24;
            else
               alu->op = nir_op_imul;

            state.progress = true;
         }
      }

      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   }

   return state.progress;
}