1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright © 2021 Advanced Micro Devices, Inc.
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
13bf215546Sopenharmony_ci * Software.
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21bf215546Sopenharmony_ci * IN THE SOFTWARE.
22bf215546Sopenharmony_ci */
23bf215546Sopenharmony_ci
24bf215546Sopenharmony_ci/* This helps separate shaders because the next shader doesn't have to be known.
25bf215546Sopenharmony_ci *
26bf215546Sopenharmony_ci * It optimizes VS and TES outputs before FS as follows:
27bf215546Sopenharmony_ci * - Eliminate and merge equal outputs, and treat undef as equal to everything, e.g.
28bf215546Sopenharmony_ci *   (x,y,undef,undef) == (undef,y,z,undef) --> (x,y,z,undef) regardless of the interpolation
29bf215546Sopenharmony_ci *   qualifier (AMD can map 1 output to multiple PS inputs and interpolate each differently).
30bf215546Sopenharmony_ci * - Remove constant outputs that match AMD DEFAULT_VAL options, e.g. (0,0,0,1),
31bf215546Sopenharmony_ci *   treat undef as whatever.
32bf215546Sopenharmony_ci *
33bf215546Sopenharmony_ci * It requires that there is no indirect indexing and all output stores must be scalar.
34bf215546Sopenharmony_ci */
35bf215546Sopenharmony_ci
36bf215546Sopenharmony_ci#include "ac_nir.h"
37bf215546Sopenharmony_ci#include "nir_builder.h"
38bf215546Sopenharmony_ci
39bf215546Sopenharmony_cistruct ac_chan_info {
40bf215546Sopenharmony_ci   nir_instr *value;
41bf215546Sopenharmony_ci   nir_intrinsic_instr *store_intr; /* The intrinsic writing the value. */
42bf215546Sopenharmony_ci};
43bf215546Sopenharmony_ci
44bf215546Sopenharmony_cistruct ac_out_info {
45bf215546Sopenharmony_ci   unsigned base; /* nir_intrinsic_base */
46bf215546Sopenharmony_ci   nir_alu_type types;
47bf215546Sopenharmony_ci   bool duplicated;
48bf215546Sopenharmony_ci   bool constant;
49bf215546Sopenharmony_ci
50bf215546Sopenharmony_ci   /* Channels 0-3 are 32-bit channels or low bits of 16-bit channels.
51bf215546Sopenharmony_ci    * Channels 4-7 are high bits of 16-bit channels.
52bf215546Sopenharmony_ci    */
53bf215546Sopenharmony_ci   struct ac_chan_info chan[8];
54bf215546Sopenharmony_ci};
55bf215546Sopenharmony_ci
56bf215546Sopenharmony_cistatic void ac_remove_varying(struct ac_out_info *out)
57bf215546Sopenharmony_ci{
58bf215546Sopenharmony_ci   /* Remove the output. (all channels) */
59bf215546Sopenharmony_ci   for (unsigned i = 0; i < ARRAY_SIZE(out->chan); i++) {
60bf215546Sopenharmony_ci      if (out->chan[i].store_intr) {
61bf215546Sopenharmony_ci         nir_remove_varying(out->chan[i].store_intr);
62bf215546Sopenharmony_ci         out->chan[i].store_intr = NULL;
63bf215546Sopenharmony_ci         out->chan[i].value = NULL;
64bf215546Sopenharmony_ci      }
65bf215546Sopenharmony_ci   }
66bf215546Sopenharmony_ci}
67bf215546Sopenharmony_ci
68bf215546Sopenharmony_ci/* Return true if the output matches DEFAULT_VAL and has been eliminated. */
69bf215546Sopenharmony_cistatic bool ac_eliminate_const_output(struct ac_out_info *out,
70bf215546Sopenharmony_ci                                      gl_varying_slot semantic,
71bf215546Sopenharmony_ci                                      uint8_t *param_export_index)
72bf215546Sopenharmony_ci{
73bf215546Sopenharmony_ci   if (!(out->types & 32))
74bf215546Sopenharmony_ci      return false;
75bf215546Sopenharmony_ci
76bf215546Sopenharmony_ci   bool is_zero[4] = {0}, is_one[4] = {0};
77bf215546Sopenharmony_ci
78bf215546Sopenharmony_ci   for (unsigned i = 0; i < 4; i++) {
79bf215546Sopenharmony_ci      /* NULL means undef. */
80bf215546Sopenharmony_ci      if (!out->chan[i].value) {
81bf215546Sopenharmony_ci         is_zero[i] = true;
82bf215546Sopenharmony_ci         is_one[i] = true;
83bf215546Sopenharmony_ci      } else if (out->chan[i].value->type == nir_instr_type_load_const) {
84bf215546Sopenharmony_ci         if (nir_instr_as_load_const(out->chan[i].value)->value[0].f32 == 0)
85bf215546Sopenharmony_ci            is_zero[i] = true;
86bf215546Sopenharmony_ci         else if (nir_instr_as_load_const(out->chan[i].value)->value[0].f32 == 1)
87bf215546Sopenharmony_ci            is_one[i] = true;
88bf215546Sopenharmony_ci         else
89bf215546Sopenharmony_ci            return false; /* other constant */
90bf215546Sopenharmony_ci      } else
91bf215546Sopenharmony_ci         return false;
92bf215546Sopenharmony_ci   }
93bf215546Sopenharmony_ci
94bf215546Sopenharmony_ci   /* Only certain combinations of 0 and 1 are supported. */
95bf215546Sopenharmony_ci   unsigned default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
96bf215546Sopenharmony_ci
97bf215546Sopenharmony_ci   if (is_zero[0] && is_zero[1] && is_zero[2]) {
98bf215546Sopenharmony_ci      if (is_zero[3])
99bf215546Sopenharmony_ci         default_val = AC_EXP_PARAM_DEFAULT_VAL_0000;
100bf215546Sopenharmony_ci      else if (is_one[3])
101bf215546Sopenharmony_ci         default_val = AC_EXP_PARAM_DEFAULT_VAL_0001;
102bf215546Sopenharmony_ci      else
103bf215546Sopenharmony_ci         return false;
104bf215546Sopenharmony_ci   } else if (is_one[0] && is_one[1] && is_one[2]) {
105bf215546Sopenharmony_ci      if (is_zero[3])
106bf215546Sopenharmony_ci         default_val = AC_EXP_PARAM_DEFAULT_VAL_1110;
107bf215546Sopenharmony_ci      else if (is_one[3])
108bf215546Sopenharmony_ci         default_val = AC_EXP_PARAM_DEFAULT_VAL_1111;
109bf215546Sopenharmony_ci      else
110bf215546Sopenharmony_ci         return false;
111bf215546Sopenharmony_ci   } else {
112bf215546Sopenharmony_ci      return false;
113bf215546Sopenharmony_ci   }
114bf215546Sopenharmony_ci
115bf215546Sopenharmony_ci   /* Change OFFSET to DEFAULT_VAL. */
116bf215546Sopenharmony_ci   param_export_index[semantic] = default_val;
117bf215546Sopenharmony_ci   out->constant = true;
118bf215546Sopenharmony_ci   ac_remove_varying(out);
119bf215546Sopenharmony_ci   return true;
120bf215546Sopenharmony_ci}
121bf215546Sopenharmony_ci
122bf215546Sopenharmony_cistatic bool ac_eliminate_duplicated_output(struct ac_out_info *outputs,
123bf215546Sopenharmony_ci                                           BITSET_DECLARE(outputs_optimized, NUM_TOTAL_VARYING_SLOTS),
124bf215546Sopenharmony_ci                                           gl_varying_slot current, struct nir_builder *b,
125bf215546Sopenharmony_ci                                           int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS])
126bf215546Sopenharmony_ci{
127bf215546Sopenharmony_ci   struct ac_out_info *cur = &outputs[current];
128bf215546Sopenharmony_ci   unsigned p, copy_back_channels = 0;
129bf215546Sopenharmony_ci
130bf215546Sopenharmony_ci   /* Check all outputs before current. */
131bf215546Sopenharmony_ci   BITSET_FOREACH_SET(p, outputs_optimized, current) {
132bf215546Sopenharmony_ci      struct ac_out_info *prev = &outputs[p];
133bf215546Sopenharmony_ci
134bf215546Sopenharmony_ci      /* Only compare with real outputs. */
135bf215546Sopenharmony_ci      if (prev->constant || prev->duplicated)
136bf215546Sopenharmony_ci         continue;
137bf215546Sopenharmony_ci
138bf215546Sopenharmony_ci      /* The types must match (only 16-bit and 32-bit types are allowed). */
139bf215546Sopenharmony_ci      if ((prev->types & 16) != (cur->types & 16))
140bf215546Sopenharmony_ci         continue;
141bf215546Sopenharmony_ci
142bf215546Sopenharmony_ci      bool different = false;
143bf215546Sopenharmony_ci
144bf215546Sopenharmony_ci      /* Iterate over all channels, including 16-bit channels in chan_hi. */
145bf215546Sopenharmony_ci      for (unsigned j = 0; j < 8; j++) {
146bf215546Sopenharmony_ci         nir_instr *prev_chan = prev->chan[j].value;
147bf215546Sopenharmony_ci         nir_instr *cur_chan = cur->chan[j].value;
148bf215546Sopenharmony_ci
149bf215546Sopenharmony_ci         /* Treat undef as a match. */
150bf215546Sopenharmony_ci         if (!cur_chan)
151bf215546Sopenharmony_ci            continue;
152bf215546Sopenharmony_ci
153bf215546Sopenharmony_ci         /* If prev is undef but cur isn't, we can merge the outputs
154bf215546Sopenharmony_ci          * and consider the output duplicated.
155bf215546Sopenharmony_ci          */
156bf215546Sopenharmony_ci         if (!prev_chan) {
157bf215546Sopenharmony_ci            copy_back_channels |= 1 << j;
158bf215546Sopenharmony_ci            continue;
159bf215546Sopenharmony_ci         }
160bf215546Sopenharmony_ci
161bf215546Sopenharmony_ci         /* Test whether the values are different. */
162bf215546Sopenharmony_ci         if (prev_chan != cur_chan &&
163bf215546Sopenharmony_ci             (prev_chan->type != nir_instr_type_load_const ||
164bf215546Sopenharmony_ci              cur_chan->type != nir_instr_type_load_const ||
165bf215546Sopenharmony_ci              nir_instr_as_load_const(prev_chan)->value[0].u32 !=
166bf215546Sopenharmony_ci              nir_instr_as_load_const(cur_chan)->value[0].u32)) {
167bf215546Sopenharmony_ci            different = true;
168bf215546Sopenharmony_ci            break;
169bf215546Sopenharmony_ci         }
170bf215546Sopenharmony_ci      }
171bf215546Sopenharmony_ci      if (!different)
172bf215546Sopenharmony_ci         break;
173bf215546Sopenharmony_ci
174bf215546Sopenharmony_ci      copy_back_channels = 0;
175bf215546Sopenharmony_ci   }
176bf215546Sopenharmony_ci   if (p == current)
177bf215546Sopenharmony_ci      return false;
178bf215546Sopenharmony_ci
179bf215546Sopenharmony_ci   /* An equal output already exists. Make FS use the existing one instead.
180bf215546Sopenharmony_ci    * This effectively disables the current output and the param export shouldn't
181bf215546Sopenharmony_ci    * be generated.
182bf215546Sopenharmony_ci    */
183bf215546Sopenharmony_ci   cur->duplicated = true;
184bf215546Sopenharmony_ci
185bf215546Sopenharmony_ci   /* p is gl_varying_slot in addition to being an index into outputs. */
186bf215546Sopenharmony_ci   slot_remap[current] = p;
187bf215546Sopenharmony_ci
188bf215546Sopenharmony_ci   /* If the matching preceding output has undef where the current one has a proper value,
189bf215546Sopenharmony_ci    * move the value to the preceding output.
190bf215546Sopenharmony_ci    */
191bf215546Sopenharmony_ci   struct ac_out_info *prev = &outputs[p];
192bf215546Sopenharmony_ci
193bf215546Sopenharmony_ci   while (copy_back_channels) {
194bf215546Sopenharmony_ci      unsigned i = u_bit_scan(&copy_back_channels);
195bf215546Sopenharmony_ci      struct ac_chan_info *prev_chan = &prev->chan[i];
196bf215546Sopenharmony_ci      struct ac_chan_info *cur_chan = &cur->chan[i];
197bf215546Sopenharmony_ci
198bf215546Sopenharmony_ci      b->cursor = nir_after_instr(&cur_chan->store_intr->instr);
199bf215546Sopenharmony_ci
200bf215546Sopenharmony_ci      /* The store intrinsic doesn't exist for this channel. Create a new one. */
201bf215546Sopenharmony_ci      nir_alu_type src_type = nir_intrinsic_src_type(cur_chan->store_intr);
202bf215546Sopenharmony_ci      struct nir_io_semantics sem = nir_intrinsic_io_semantics(cur_chan->store_intr);
203bf215546Sopenharmony_ci      struct nir_io_xfb xfb = nir_intrinsic_io_xfb(cur_chan->store_intr);
204bf215546Sopenharmony_ci      struct nir_io_xfb xfb2 = nir_intrinsic_io_xfb2(cur_chan->store_intr);
205bf215546Sopenharmony_ci
206bf215546Sopenharmony_ci      /* p is gl_varying_slot in addition to being an index into outputs. */
207bf215546Sopenharmony_ci      sem.location = p;
208bf215546Sopenharmony_ci      assert(sem.high_16bits == i / 4);
209bf215546Sopenharmony_ci
210bf215546Sopenharmony_ci      /* If it's a sysval output (such as CLIPDIST), we move the varying portion but keep
211bf215546Sopenharmony_ci       * the system value output. This is just the varying portion.
212bf215546Sopenharmony_ci       */
213bf215546Sopenharmony_ci      sem.no_sysval_output = 1;
214bf215546Sopenharmony_ci
215bf215546Sopenharmony_ci      /* Write just one component. */
216bf215546Sopenharmony_ci      prev_chan->store_intr = nir_store_output(b, nir_instr_ssa_def(cur_chan->value),
217bf215546Sopenharmony_ci                                               nir_imm_int(b, 0),
218bf215546Sopenharmony_ci                                               .base = prev->base,
219bf215546Sopenharmony_ci                                               .component = i % 4,
220bf215546Sopenharmony_ci                                               .io_semantics = sem,
221bf215546Sopenharmony_ci                                               .src_type = src_type,
222bf215546Sopenharmony_ci                                               .write_mask = 0x1,
223bf215546Sopenharmony_ci                                               .io_xfb = xfb,
224bf215546Sopenharmony_ci                                               .io_xfb2 = xfb2);
225bf215546Sopenharmony_ci
226bf215546Sopenharmony_ci      /* Update the undef channels in the output info. */
227bf215546Sopenharmony_ci      assert(!prev_chan->value);
228bf215546Sopenharmony_ci      prev_chan->value = cur_chan->value;
229bf215546Sopenharmony_ci
230bf215546Sopenharmony_ci      /* Remove transform feedback info from the current instruction because
231bf215546Sopenharmony_ci       * we moved it too. The instruction might not be removed if it's a system
232bf215546Sopenharmony_ci       * value output.
233bf215546Sopenharmony_ci       */
234bf215546Sopenharmony_ci      static struct nir_io_xfb zero_xfb;
235bf215546Sopenharmony_ci      nir_intrinsic_set_io_xfb(cur->chan[i].store_intr, zero_xfb);
236bf215546Sopenharmony_ci      nir_intrinsic_set_io_xfb2(cur->chan[i].store_intr, zero_xfb);
237bf215546Sopenharmony_ci   }
238bf215546Sopenharmony_ci
239bf215546Sopenharmony_ci   ac_remove_varying(cur);
240bf215546Sopenharmony_ci   return true;
241bf215546Sopenharmony_ci}
242bf215546Sopenharmony_ci
243bf215546Sopenharmony_cibool ac_nir_optimize_outputs(nir_shader *nir, bool sprite_tex_disallowed,
244bf215546Sopenharmony_ci                             int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS],
245bf215546Sopenharmony_ci                             uint8_t param_export_index[NUM_TOTAL_VARYING_SLOTS])
246bf215546Sopenharmony_ci{
247bf215546Sopenharmony_ci   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
248bf215546Sopenharmony_ci   assert(impl);
249bf215546Sopenharmony_ci
250bf215546Sopenharmony_ci   if (nir->info.stage != MESA_SHADER_VERTEX &&
251bf215546Sopenharmony_ci       nir->info.stage != MESA_SHADER_TESS_EVAL) {
252bf215546Sopenharmony_ci      nir_metadata_preserve(impl, nir_metadata_all);
253bf215546Sopenharmony_ci      return false;
254bf215546Sopenharmony_ci   }
255bf215546Sopenharmony_ci
256bf215546Sopenharmony_ci   struct ac_out_info outputs[NUM_TOTAL_VARYING_SLOTS] = { 0 };
257bf215546Sopenharmony_ci
258bf215546Sopenharmony_ci   BITSET_DECLARE(outputs_optimized, NUM_TOTAL_VARYING_SLOTS);
259bf215546Sopenharmony_ci   BITSET_ZERO(outputs_optimized);
260bf215546Sopenharmony_ci
261bf215546Sopenharmony_ci   /* Gather outputs. */
262bf215546Sopenharmony_ci   nir_foreach_block(block, impl) {
263bf215546Sopenharmony_ci      nir_foreach_instr_safe(instr, block) {
264bf215546Sopenharmony_ci         if (instr->type != nir_instr_type_intrinsic)
265bf215546Sopenharmony_ci            continue;
266bf215546Sopenharmony_ci
267bf215546Sopenharmony_ci         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
268bf215546Sopenharmony_ci         if (intr->intrinsic != nir_intrinsic_store_output)
269bf215546Sopenharmony_ci            continue;
270bf215546Sopenharmony_ci
271bf215546Sopenharmony_ci         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
272bf215546Sopenharmony_ci
273bf215546Sopenharmony_ci         /* Only process varyings that appear as param exports. */
274bf215546Sopenharmony_ci         if (!nir_slot_is_varying(sem.location) || sem.no_varying)
275bf215546Sopenharmony_ci            continue;
276bf215546Sopenharmony_ci
277bf215546Sopenharmony_ci         /* We can't optimize texture coordinates if sprite_coord_enable can override them. */
278bf215546Sopenharmony_ci         if (sem.location >= VARYING_SLOT_TEX0 && sem.location <= VARYING_SLOT_TEX7 &&
279bf215546Sopenharmony_ci             !sprite_tex_disallowed)
280bf215546Sopenharmony_ci            continue;
281bf215546Sopenharmony_ci
282bf215546Sopenharmony_ci         BITSET_SET(outputs_optimized, sem.location);
283bf215546Sopenharmony_ci
284bf215546Sopenharmony_ci         /* No indirect indexing allowed. */
285bf215546Sopenharmony_ci         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
286bf215546Sopenharmony_ci         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);
287bf215546Sopenharmony_ci
288bf215546Sopenharmony_ci         /* nir_lower_io_to_scalar is required before this */
289bf215546Sopenharmony_ci         assert(intr->src[0].ssa->num_components == 1);
290bf215546Sopenharmony_ci         /* No intrinsic should store undef. */
291bf215546Sopenharmony_ci         assert(intr->src[0].ssa->parent_instr->type != nir_instr_type_ssa_undef);
292bf215546Sopenharmony_ci
293bf215546Sopenharmony_ci         /* Gather the output. */
294bf215546Sopenharmony_ci         struct ac_out_info *out_info = &outputs[sem.location];
295bf215546Sopenharmony_ci         if (!out_info->types)
296bf215546Sopenharmony_ci            out_info->base = nir_intrinsic_base(intr);
297bf215546Sopenharmony_ci         else
298bf215546Sopenharmony_ci            assert(out_info->base == nir_intrinsic_base(intr));
299bf215546Sopenharmony_ci
300bf215546Sopenharmony_ci         out_info->types |= nir_intrinsic_src_type(intr);
301bf215546Sopenharmony_ci
302bf215546Sopenharmony_ci         unsigned chan = sem.high_16bits * 4 + nir_intrinsic_component(intr);
303bf215546Sopenharmony_ci         out_info->chan[chan].store_intr = intr;
304bf215546Sopenharmony_ci         out_info->chan[chan].value = intr->src[0].ssa->parent_instr;
305bf215546Sopenharmony_ci      }
306bf215546Sopenharmony_ci   }
307bf215546Sopenharmony_ci
308bf215546Sopenharmony_ci   unsigned i;
309bf215546Sopenharmony_ci   bool progress = false;
310bf215546Sopenharmony_ci
311bf215546Sopenharmony_ci   struct nir_builder b;
312bf215546Sopenharmony_ci   nir_builder_init(&b, impl);
313bf215546Sopenharmony_ci
314bf215546Sopenharmony_ci   /* Optimize outputs. */
315bf215546Sopenharmony_ci   BITSET_FOREACH_SET(i, outputs_optimized, NUM_TOTAL_VARYING_SLOTS) {
316bf215546Sopenharmony_ci      progress |=
317bf215546Sopenharmony_ci         ac_eliminate_const_output(&outputs[i], i, param_export_index) ||
318bf215546Sopenharmony_ci         ac_eliminate_duplicated_output(outputs, outputs_optimized, i, &b, slot_remap);
319bf215546Sopenharmony_ci   }
320bf215546Sopenharmony_ci
321bf215546Sopenharmony_ci   if (progress) {
322bf215546Sopenharmony_ci      nir_metadata_preserve(impl, nir_metadata_dominance |
323bf215546Sopenharmony_ci                                  nir_metadata_block_index);
324bf215546Sopenharmony_ci   } else {
325bf215546Sopenharmony_ci      nir_metadata_preserve(impl, nir_metadata_all);
326bf215546Sopenharmony_ci   }
327bf215546Sopenharmony_ci   return progress;
328bf215546Sopenharmony_ci}
329