1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright © 2018 Intel Corporation
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
13bf215546Sopenharmony_ci * Software.
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21bf215546Sopenharmony_ci * IN THE SOFTWARE.
22bf215546Sopenharmony_ci */
23bf215546Sopenharmony_ci
24bf215546Sopenharmony_ci#include "nir_xfb_info.h"
25bf215546Sopenharmony_ci
26bf215546Sopenharmony_ci#include "util/u_dynarray.h"
27bf215546Sopenharmony_ci#include <util/u_math.h>
28bf215546Sopenharmony_ci
29bf215546Sopenharmony_cistatic void
30bf215546Sopenharmony_ciadd_var_xfb_varying(nir_xfb_info *xfb,
31bf215546Sopenharmony_ci                    nir_xfb_varyings_info *varyings,
32bf215546Sopenharmony_ci                    unsigned buffer,
33bf215546Sopenharmony_ci                    unsigned offset,
34bf215546Sopenharmony_ci                    const struct glsl_type *type)
35bf215546Sopenharmony_ci{
36bf215546Sopenharmony_ci   if (varyings == NULL)
37bf215546Sopenharmony_ci      return;
38bf215546Sopenharmony_ci
39bf215546Sopenharmony_ci   nir_xfb_varying_info *varying = &varyings->varyings[varyings->varying_count++];
40bf215546Sopenharmony_ci
41bf215546Sopenharmony_ci   varying->type = type;
42bf215546Sopenharmony_ci   varying->buffer = buffer;
43bf215546Sopenharmony_ci   varying->offset = offset;
44bf215546Sopenharmony_ci   xfb->buffers[buffer].varying_count++;
45bf215546Sopenharmony_ci}
46bf215546Sopenharmony_ci
47bf215546Sopenharmony_ci
48bf215546Sopenharmony_cistatic nir_xfb_info *
49bf215546Sopenharmony_cinir_xfb_info_create(void *mem_ctx, uint16_t output_count)
50bf215546Sopenharmony_ci{
51bf215546Sopenharmony_ci   return rzalloc_size(mem_ctx, nir_xfb_info_size(output_count));
52bf215546Sopenharmony_ci}
53bf215546Sopenharmony_ci
54bf215546Sopenharmony_cistatic size_t
55bf215546Sopenharmony_cinir_xfb_varyings_info_size(uint16_t varying_count)
56bf215546Sopenharmony_ci{
57bf215546Sopenharmony_ci   return sizeof(nir_xfb_info) + sizeof(nir_xfb_varying_info) * varying_count;
58bf215546Sopenharmony_ci}
59bf215546Sopenharmony_ci
60bf215546Sopenharmony_cistatic nir_xfb_varyings_info *
61bf215546Sopenharmony_cinir_xfb_varyings_info_create(void *mem_ctx, uint16_t varying_count)
62bf215546Sopenharmony_ci{
63bf215546Sopenharmony_ci   return rzalloc_size(mem_ctx, nir_xfb_varyings_info_size(varying_count));
64bf215546Sopenharmony_ci}
65bf215546Sopenharmony_ci
66bf215546Sopenharmony_cistatic void
67bf215546Sopenharmony_ciadd_var_xfb_outputs(nir_xfb_info *xfb,
68bf215546Sopenharmony_ci                    nir_xfb_varyings_info *varyings,
69bf215546Sopenharmony_ci                    nir_variable *var,
70bf215546Sopenharmony_ci                    unsigned buffer,
71bf215546Sopenharmony_ci                    unsigned *location,
72bf215546Sopenharmony_ci                    unsigned *offset,
73bf215546Sopenharmony_ci                    const struct glsl_type *type,
74bf215546Sopenharmony_ci                    bool varying_added)
75bf215546Sopenharmony_ci{
76bf215546Sopenharmony_ci   /* If this type contains a 64-bit value, align to 8 bytes */
77bf215546Sopenharmony_ci   if (glsl_type_contains_64bit(type))
78bf215546Sopenharmony_ci      *offset = ALIGN_POT(*offset, 8);
79bf215546Sopenharmony_ci
80bf215546Sopenharmony_ci   if (glsl_type_is_array_or_matrix(type) && !var->data.compact) {
81bf215546Sopenharmony_ci      unsigned length = glsl_get_length(type);
82bf215546Sopenharmony_ci
83bf215546Sopenharmony_ci      const struct glsl_type *child_type = glsl_get_array_element(type);
84bf215546Sopenharmony_ci      if (!glsl_type_is_array(child_type) &&
85bf215546Sopenharmony_ci          !glsl_type_is_struct(child_type)) {
86bf215546Sopenharmony_ci
87bf215546Sopenharmony_ci         add_var_xfb_varying(xfb, varyings, buffer, *offset, type);
88bf215546Sopenharmony_ci         varying_added = true;
89bf215546Sopenharmony_ci      }
90bf215546Sopenharmony_ci
91bf215546Sopenharmony_ci      for (unsigned i = 0; i < length; i++)
92bf215546Sopenharmony_ci         add_var_xfb_outputs(xfb, varyings, var, buffer, location, offset,
93bf215546Sopenharmony_ci                             child_type, varying_added);
94bf215546Sopenharmony_ci   } else if (glsl_type_is_struct_or_ifc(type)) {
95bf215546Sopenharmony_ci      unsigned length = glsl_get_length(type);
96bf215546Sopenharmony_ci      for (unsigned i = 0; i < length; i++) {
97bf215546Sopenharmony_ci         const struct glsl_type *child_type = glsl_get_struct_field(type, i);
98bf215546Sopenharmony_ci         add_var_xfb_outputs(xfb, varyings, var, buffer, location, offset,
99bf215546Sopenharmony_ci                             child_type, varying_added);
100bf215546Sopenharmony_ci      }
101bf215546Sopenharmony_ci   } else {
102bf215546Sopenharmony_ci      assert(buffer < NIR_MAX_XFB_BUFFERS);
103bf215546Sopenharmony_ci      if (xfb->buffers_written & (1 << buffer)) {
104bf215546Sopenharmony_ci         assert(xfb->buffers[buffer].stride == var->data.xfb.stride);
105bf215546Sopenharmony_ci         assert(xfb->buffer_to_stream[buffer] == var->data.stream);
106bf215546Sopenharmony_ci      } else {
107bf215546Sopenharmony_ci         xfb->buffers_written |= (1 << buffer);
108bf215546Sopenharmony_ci         xfb->buffers[buffer].stride = var->data.xfb.stride;
109bf215546Sopenharmony_ci         xfb->buffer_to_stream[buffer] = var->data.stream;
110bf215546Sopenharmony_ci      }
111bf215546Sopenharmony_ci
112bf215546Sopenharmony_ci      assert(var->data.stream < NIR_MAX_XFB_STREAMS);
113bf215546Sopenharmony_ci      xfb->streams_written |= (1 << var->data.stream);
114bf215546Sopenharmony_ci
115bf215546Sopenharmony_ci      unsigned comp_slots;
116bf215546Sopenharmony_ci      if (var->data.compact) {
117bf215546Sopenharmony_ci         /* This only happens for clip/cull which are float arrays */
118bf215546Sopenharmony_ci         assert(glsl_without_array(type) == glsl_float_type());
119bf215546Sopenharmony_ci         assert(var->data.location == VARYING_SLOT_CLIP_DIST0 ||
120bf215546Sopenharmony_ci                var->data.location == VARYING_SLOT_CLIP_DIST1);
121bf215546Sopenharmony_ci         comp_slots = glsl_get_length(type);
122bf215546Sopenharmony_ci      } else {
123bf215546Sopenharmony_ci         comp_slots = glsl_get_component_slots(type);
124bf215546Sopenharmony_ci
125bf215546Sopenharmony_ci         UNUSED unsigned attrib_slots = DIV_ROUND_UP(comp_slots, 4);
126bf215546Sopenharmony_ci         assert(attrib_slots == glsl_count_attribute_slots(type, false));
127bf215546Sopenharmony_ci
128bf215546Sopenharmony_ci         /* Ensure that we don't have, for instance, a dvec2 with a
129bf215546Sopenharmony_ci          * location_frac of 2 which would make it crass a location boundary
130bf215546Sopenharmony_ci          * even though it fits in a single slot.  However, you can have a
131bf215546Sopenharmony_ci          * dvec3 which crosses the slot boundary with a location_frac of 2.
132bf215546Sopenharmony_ci          */
133bf215546Sopenharmony_ci         assert(DIV_ROUND_UP(var->data.location_frac + comp_slots, 4) ==
134bf215546Sopenharmony_ci                attrib_slots);
135bf215546Sopenharmony_ci      }
136bf215546Sopenharmony_ci
137bf215546Sopenharmony_ci      assert(var->data.location_frac + comp_slots <= 8);
138bf215546Sopenharmony_ci      uint8_t comp_mask = ((1 << comp_slots) - 1) << var->data.location_frac;
139bf215546Sopenharmony_ci      unsigned comp_offset = var->data.location_frac;
140bf215546Sopenharmony_ci
141bf215546Sopenharmony_ci      if (!varying_added) {
142bf215546Sopenharmony_ci         add_var_xfb_varying(xfb, varyings, buffer, *offset, type);
143bf215546Sopenharmony_ci      }
144bf215546Sopenharmony_ci
145bf215546Sopenharmony_ci      while (comp_mask) {
146bf215546Sopenharmony_ci         nir_xfb_output_info *output = &xfb->outputs[xfb->output_count++];
147bf215546Sopenharmony_ci
148bf215546Sopenharmony_ci         output->buffer = buffer;
149bf215546Sopenharmony_ci         output->offset = *offset;
150bf215546Sopenharmony_ci         output->location = *location;
151bf215546Sopenharmony_ci         output->component_mask = comp_mask & 0xf;
152bf215546Sopenharmony_ci         output->component_offset = comp_offset;
153bf215546Sopenharmony_ci
154bf215546Sopenharmony_ci         *offset += util_bitcount(output->component_mask) * 4;
155bf215546Sopenharmony_ci         (*location)++;
156bf215546Sopenharmony_ci         comp_mask >>= 4;
157bf215546Sopenharmony_ci         comp_offset = 0;
158bf215546Sopenharmony_ci      }
159bf215546Sopenharmony_ci   }
160bf215546Sopenharmony_ci}
161bf215546Sopenharmony_ci
162bf215546Sopenharmony_cistatic int
163bf215546Sopenharmony_cicompare_xfb_varying_offsets(const void *_a, const void *_b)
164bf215546Sopenharmony_ci{
165bf215546Sopenharmony_ci   const nir_xfb_varying_info *a = _a, *b = _b;
166bf215546Sopenharmony_ci
167bf215546Sopenharmony_ci   if (a->buffer != b->buffer)
168bf215546Sopenharmony_ci      return a->buffer - b->buffer;
169bf215546Sopenharmony_ci
170bf215546Sopenharmony_ci   return a->offset - b->offset;
171bf215546Sopenharmony_ci}
172bf215546Sopenharmony_ci
173bf215546Sopenharmony_cistatic int
174bf215546Sopenharmony_cicompare_xfb_output_offsets(const void *_a, const void *_b)
175bf215546Sopenharmony_ci{
176bf215546Sopenharmony_ci   const nir_xfb_output_info *a = _a, *b = _b;
177bf215546Sopenharmony_ci
178bf215546Sopenharmony_ci   return a->offset - b->offset;
179bf215546Sopenharmony_ci}
180bf215546Sopenharmony_ci
181bf215546Sopenharmony_civoid
182bf215546Sopenharmony_cinir_shader_gather_xfb_info(nir_shader *shader)
183bf215546Sopenharmony_ci{
184bf215546Sopenharmony_ci   nir_gather_xfb_info_with_varyings(shader, NULL, NULL);
185bf215546Sopenharmony_ci}
186bf215546Sopenharmony_ci
187bf215546Sopenharmony_civoid
188bf215546Sopenharmony_cinir_gather_xfb_info_with_varyings(nir_shader *shader,
189bf215546Sopenharmony_ci                                  void *mem_ctx,
190bf215546Sopenharmony_ci                                  nir_xfb_varyings_info **varyings_info_out)
191bf215546Sopenharmony_ci{
192bf215546Sopenharmony_ci   assert(shader->info.stage == MESA_SHADER_VERTEX ||
193bf215546Sopenharmony_ci          shader->info.stage == MESA_SHADER_TESS_EVAL ||
194bf215546Sopenharmony_ci          shader->info.stage == MESA_SHADER_GEOMETRY);
195bf215546Sopenharmony_ci
196bf215546Sopenharmony_ci   /* Compute the number of outputs we have.  This is simply the number of
197bf215546Sopenharmony_ci    * cumulative locations consumed by all the variables.  If a location is
198bf215546Sopenharmony_ci    * represented by multiple variables, then they each count separately in
199bf215546Sopenharmony_ci    * number of outputs.  This is only an estimate as some variables may have
200bf215546Sopenharmony_ci    * an xfb_buffer but not an output so it may end up larger than we need but
201bf215546Sopenharmony_ci    * it should be good enough for allocation.
202bf215546Sopenharmony_ci    */
203bf215546Sopenharmony_ci   unsigned num_outputs = 0;
204bf215546Sopenharmony_ci   unsigned num_varyings = 0;
205bf215546Sopenharmony_ci   nir_xfb_varyings_info *varyings_info = NULL;
206bf215546Sopenharmony_ci   nir_foreach_shader_out_variable(var, shader) {
207bf215546Sopenharmony_ci      if (var->data.explicit_xfb_buffer) {
208bf215546Sopenharmony_ci         num_outputs += glsl_count_attribute_slots(var->type, false);
209bf215546Sopenharmony_ci         num_varyings += glsl_varying_count(var->type);
210bf215546Sopenharmony_ci      }
211bf215546Sopenharmony_ci   }
212bf215546Sopenharmony_ci   if (num_outputs == 0 || num_varyings == 0)
213bf215546Sopenharmony_ci      return;
214bf215546Sopenharmony_ci
215bf215546Sopenharmony_ci   nir_xfb_info *xfb = nir_xfb_info_create(shader, num_outputs);
216bf215546Sopenharmony_ci   if (varyings_info_out != NULL) {
217bf215546Sopenharmony_ci      *varyings_info_out = nir_xfb_varyings_info_create(mem_ctx, num_varyings);
218bf215546Sopenharmony_ci      varyings_info = *varyings_info_out;
219bf215546Sopenharmony_ci   }
220bf215546Sopenharmony_ci
221bf215546Sopenharmony_ci   /* Walk the list of outputs and add them to the array */
222bf215546Sopenharmony_ci   nir_foreach_shader_out_variable(var, shader) {
223bf215546Sopenharmony_ci      if (!var->data.explicit_xfb_buffer)
224bf215546Sopenharmony_ci         continue;
225bf215546Sopenharmony_ci
226bf215546Sopenharmony_ci      unsigned location = var->data.location;
227bf215546Sopenharmony_ci
228bf215546Sopenharmony_ci      /* In order to know if we have a array of blocks can't be done just by
229bf215546Sopenharmony_ci       * checking if we have an interface type and is an array, because due
230bf215546Sopenharmony_ci       * splitting we could end on a case were we received a split struct
231bf215546Sopenharmony_ci       * that contains an array.
232bf215546Sopenharmony_ci       */
233bf215546Sopenharmony_ci      bool is_array_block = var->interface_type != NULL &&
234bf215546Sopenharmony_ci         glsl_type_is_array(var->type) &&
235bf215546Sopenharmony_ci         glsl_without_array(var->type) == var->interface_type;
236bf215546Sopenharmony_ci
237bf215546Sopenharmony_ci      if (var->data.explicit_offset && !is_array_block) {
238bf215546Sopenharmony_ci         unsigned offset = var->data.offset;
239bf215546Sopenharmony_ci         add_var_xfb_outputs(xfb, varyings_info, var, var->data.xfb.buffer,
240bf215546Sopenharmony_ci                             &location, &offset, var->type, false);
241bf215546Sopenharmony_ci      } else if (is_array_block) {
242bf215546Sopenharmony_ci         assert(glsl_type_is_struct_or_ifc(var->interface_type));
243bf215546Sopenharmony_ci
244bf215546Sopenharmony_ci         unsigned aoa_size = glsl_get_aoa_size(var->type);
245bf215546Sopenharmony_ci         const struct glsl_type *itype = var->interface_type;
246bf215546Sopenharmony_ci         unsigned nfields = glsl_get_length(itype);
247bf215546Sopenharmony_ci         for (unsigned b = 0; b < aoa_size; b++) {
248bf215546Sopenharmony_ci            for (unsigned f = 0; f < nfields; f++) {
249bf215546Sopenharmony_ci               int foffset = glsl_get_struct_field_offset(itype, f);
250bf215546Sopenharmony_ci               const struct glsl_type *ftype = glsl_get_struct_field(itype, f);
251bf215546Sopenharmony_ci               if (foffset < 0) {
252bf215546Sopenharmony_ci                  location += glsl_count_attribute_slots(ftype, false);
253bf215546Sopenharmony_ci                  continue;
254bf215546Sopenharmony_ci               }
255bf215546Sopenharmony_ci
256bf215546Sopenharmony_ci               unsigned offset = foffset;
257bf215546Sopenharmony_ci               add_var_xfb_outputs(xfb, varyings_info, var, var->data.xfb.buffer + b,
258bf215546Sopenharmony_ci                                   &location, &offset, ftype, false);
259bf215546Sopenharmony_ci            }
260bf215546Sopenharmony_ci         }
261bf215546Sopenharmony_ci      }
262bf215546Sopenharmony_ci   }
263bf215546Sopenharmony_ci
264bf215546Sopenharmony_ci   /* Everything is easier in the state setup code if outputs and varyings are
265bf215546Sopenharmony_ci    * sorted in order of output offset (and buffer for varyings).
266bf215546Sopenharmony_ci    */
267bf215546Sopenharmony_ci   qsort(xfb->outputs, xfb->output_count, sizeof(xfb->outputs[0]),
268bf215546Sopenharmony_ci         compare_xfb_output_offsets);
269bf215546Sopenharmony_ci
270bf215546Sopenharmony_ci   if (varyings_info != NULL) {
271bf215546Sopenharmony_ci      qsort(varyings_info->varyings, varyings_info->varying_count,
272bf215546Sopenharmony_ci            sizeof(varyings_info->varyings[0]),
273bf215546Sopenharmony_ci            compare_xfb_varying_offsets);
274bf215546Sopenharmony_ci   }
275bf215546Sopenharmony_ci
276bf215546Sopenharmony_ci#ifndef NDEBUG
277bf215546Sopenharmony_ci   /* Finally, do a sanity check */
278bf215546Sopenharmony_ci   unsigned max_offset[NIR_MAX_XFB_BUFFERS] = {0};
279bf215546Sopenharmony_ci   for (unsigned i = 0; i < xfb->output_count; i++) {
280bf215546Sopenharmony_ci      assert(xfb->outputs[i].offset >= max_offset[xfb->outputs[i].buffer]);
281bf215546Sopenharmony_ci      assert(xfb->outputs[i].component_mask != 0);
282bf215546Sopenharmony_ci      unsigned slots = util_bitcount(xfb->outputs[i].component_mask);
283bf215546Sopenharmony_ci      max_offset[xfb->outputs[i].buffer] = xfb->outputs[i].offset + slots * 4;
284bf215546Sopenharmony_ci   }
285bf215546Sopenharmony_ci#endif
286bf215546Sopenharmony_ci
287bf215546Sopenharmony_ci   ralloc_free(shader->xfb_info);
288bf215546Sopenharmony_ci   shader->xfb_info = xfb;
289bf215546Sopenharmony_ci}
290bf215546Sopenharmony_ci
291bf215546Sopenharmony_cistatic int
292bf215546Sopenharmony_ciget_xfb_out_sort_index(const nir_xfb_output_info *a)
293bf215546Sopenharmony_ci{
294bf215546Sopenharmony_ci   /* Return the maximum number to put dummy components at the end. */
295bf215546Sopenharmony_ci   if (!a->component_mask)
296bf215546Sopenharmony_ci      return MAX_XFB_BUFFERS << 26;
297bf215546Sopenharmony_ci
298bf215546Sopenharmony_ci   return ((uint32_t)a->buffer << 26) | /* 2 bits for the buffer */
299bf215546Sopenharmony_ci          /* 10 bits for the component location (256 * 4) */
300bf215546Sopenharmony_ci          (((uint32_t)a->location * 4 + a->component_offset) << 16) |
301bf215546Sopenharmony_ci          /* 16 bits for the offset */
302bf215546Sopenharmony_ci          a->offset;
303bf215546Sopenharmony_ci}
304bf215546Sopenharmony_ci
/**
 * qsort() comparator for nir_xfb_output_info based on the keys produced by
 * get_xfb_out_sort_index().
 */
static int
compare_xfb_out(const void *pa, const void *pb)
{
   const int key_a = get_xfb_out_sort_index(pa);
   const int key_b = get_xfb_out_sort_index(pb);

   return key_a - key_b;
}
313bf215546Sopenharmony_ci
314bf215546Sopenharmony_ci/**
315bf215546Sopenharmony_ci * Gather transform feedback info from lowered IO intrinsics.
316bf215546Sopenharmony_ci *
317bf215546Sopenharmony_ci * Optionally return slot_to_register, an optional table to translate
318bf215546Sopenharmony_ci * gl_varying_slot to "base" indices.
319bf215546Sopenharmony_ci */
320bf215546Sopenharmony_cinir_xfb_info *
321bf215546Sopenharmony_cinir_gather_xfb_info_from_intrinsics(nir_shader *nir,
322bf215546Sopenharmony_ci                                    int slot_to_register[NUM_TOTAL_VARYING_SLOTS])
323bf215546Sopenharmony_ci{
324bf215546Sopenharmony_ci   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
325bf215546Sopenharmony_ci   uint8_t buffer_to_stream[MAX_XFB_BUFFERS] = {0};
326bf215546Sopenharmony_ci   uint8_t buffer_mask = 0;
327bf215546Sopenharmony_ci   uint8_t stream_mask = 0;
328bf215546Sopenharmony_ci
329bf215546Sopenharmony_ci   if (slot_to_register) {
330bf215546Sopenharmony_ci      memset(slot_to_register, -1,
331bf215546Sopenharmony_ci             sizeof(slot_to_register[0] * NUM_TOTAL_VARYING_SLOTS));
332bf215546Sopenharmony_ci   }
333bf215546Sopenharmony_ci
334bf215546Sopenharmony_ci   /* Gather xfb outputs. */
335bf215546Sopenharmony_ci   struct util_dynarray array = {0};
336bf215546Sopenharmony_ci
337bf215546Sopenharmony_ci   nir_foreach_block(block, impl) {
338bf215546Sopenharmony_ci      nir_foreach_instr(instr, block) {
339bf215546Sopenharmony_ci         if (instr->type != nir_instr_type_intrinsic ||
340bf215546Sopenharmony_ci             !nir_instr_xfb_write_mask(nir_instr_as_intrinsic(instr)))
341bf215546Sopenharmony_ci            continue;
342bf215546Sopenharmony_ci
343bf215546Sopenharmony_ci         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
344bf215546Sopenharmony_ci
345bf215546Sopenharmony_ci         unsigned wr_mask = nir_intrinsic_write_mask(intr);
346bf215546Sopenharmony_ci
347bf215546Sopenharmony_ci         while (wr_mask) {
348bf215546Sopenharmony_ci            unsigned i = u_bit_scan(&wr_mask);
349bf215546Sopenharmony_ci            unsigned index = nir_intrinsic_component(intr) + i;
350bf215546Sopenharmony_ci            nir_io_xfb xfb = index < 2 ? nir_intrinsic_io_xfb(intr) :
351bf215546Sopenharmony_ci                                         nir_intrinsic_io_xfb2(intr);
352bf215546Sopenharmony_ci
353bf215546Sopenharmony_ci            if (xfb.out[index % 2].num_components) {
354bf215546Sopenharmony_ci               nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
355bf215546Sopenharmony_ci               nir_xfb_output_info out;
356bf215546Sopenharmony_ci
357bf215546Sopenharmony_ci               out.component_offset = index;
358bf215546Sopenharmony_ci               out.component_mask =
359bf215546Sopenharmony_ci                  BITFIELD_RANGE(index, xfb.out[index % 2].num_components);
360bf215546Sopenharmony_ci               out.location = sem.location;
361bf215546Sopenharmony_ci               out.buffer = xfb.out[index % 2].buffer;
362bf215546Sopenharmony_ci               out.offset = (uint32_t)xfb.out[index % 2].offset * 4;
363bf215546Sopenharmony_ci               util_dynarray_append(&array, nir_xfb_output_info, out);
364bf215546Sopenharmony_ci
365bf215546Sopenharmony_ci               uint8_t stream = (sem.gs_streams >> (i * 2)) & 0x3;
366bf215546Sopenharmony_ci               buffer_to_stream[out.buffer] = stream;
367bf215546Sopenharmony_ci               buffer_mask |= BITFIELD_BIT(out.buffer);
368bf215546Sopenharmony_ci               stream_mask |= BITFIELD_BIT(stream);
369bf215546Sopenharmony_ci
370bf215546Sopenharmony_ci               if (slot_to_register)
371bf215546Sopenharmony_ci                  slot_to_register[sem.location] = nir_intrinsic_base(intr);
372bf215546Sopenharmony_ci
373bf215546Sopenharmony_ci               /* No elements before component_offset are allowed to be set. */
374bf215546Sopenharmony_ci               assert(!(out.component_mask & BITFIELD_MASK(out.component_offset)));
375bf215546Sopenharmony_ci            }
376bf215546Sopenharmony_ci         }
377bf215546Sopenharmony_ci      }
378bf215546Sopenharmony_ci   }
379bf215546Sopenharmony_ci
380bf215546Sopenharmony_ci   nir_xfb_output_info *outputs = (nir_xfb_output_info *)array.data;
381bf215546Sopenharmony_ci   int count = util_dynarray_num_elements(&array, nir_xfb_output_info);
382bf215546Sopenharmony_ci
383bf215546Sopenharmony_ci   if (!count)
384bf215546Sopenharmony_ci      return NULL;
385bf215546Sopenharmony_ci
386bf215546Sopenharmony_ci   if (count > 1) {
387bf215546Sopenharmony_ci      /* Sort outputs by buffer, location, and component. */
388bf215546Sopenharmony_ci      qsort(outputs, count, sizeof(nir_xfb_output_info), compare_xfb_out);
389bf215546Sopenharmony_ci
390bf215546Sopenharmony_ci      /* Merge outputs referencing the same slot. */
391bf215546Sopenharmony_ci      for (int i = 0; i < count - 1; i++) {
392bf215546Sopenharmony_ci         nir_xfb_output_info *cur = &outputs[i];
393bf215546Sopenharmony_ci
394bf215546Sopenharmony_ci         if (!cur->component_mask)
395bf215546Sopenharmony_ci            continue;
396bf215546Sopenharmony_ci
397bf215546Sopenharmony_ci         /* Outputs referencing the same buffer and location are contiguous. */
398bf215546Sopenharmony_ci         for (int j = i + 1;
399bf215546Sopenharmony_ci              j < count &&
400bf215546Sopenharmony_ci              cur->buffer == outputs[j].buffer &&
401bf215546Sopenharmony_ci              cur->location == outputs[j].location; j++) {
402bf215546Sopenharmony_ci            if (outputs[j].component_mask &&
403bf215546Sopenharmony_ci                outputs[j].offset - outputs[j].component_offset * 4 ==
404bf215546Sopenharmony_ci                cur->offset - cur->component_offset * 4) {
405bf215546Sopenharmony_ci               unsigned merged_offset = MIN2(cur->component_offset,
406bf215546Sopenharmony_ci                                             outputs[j].component_offset);
407bf215546Sopenharmony_ci               /* component_mask is relative to 0, not component_offset */
408bf215546Sopenharmony_ci               unsigned merged_mask = cur->component_mask | outputs[j].component_mask;
409bf215546Sopenharmony_ci
410bf215546Sopenharmony_ci               /* The component mask should have no holes after merging. */
411bf215546Sopenharmony_ci               if (util_is_power_of_two_nonzero((merged_mask >> merged_offset) + 1)) {
412bf215546Sopenharmony_ci                  /* Merge outputs. */
413bf215546Sopenharmony_ci                  cur->component_offset = merged_offset;
414bf215546Sopenharmony_ci                  cur->component_mask = merged_mask;
415bf215546Sopenharmony_ci                  cur->offset = (uint32_t)cur->offset -
416bf215546Sopenharmony_ci                                (uint32_t)cur->component_offset * 4 +
417bf215546Sopenharmony_ci                                (uint32_t)merged_offset * 4;
418bf215546Sopenharmony_ci                  /* Disable the other output. */
419bf215546Sopenharmony_ci                  outputs[j].component_mask = 0;
420bf215546Sopenharmony_ci               }
421bf215546Sopenharmony_ci            }
422bf215546Sopenharmony_ci         }
423bf215546Sopenharmony_ci      }
424bf215546Sopenharmony_ci
425bf215546Sopenharmony_ci      /* Sort outputs again to put disabled outputs at the end. */
426bf215546Sopenharmony_ci      qsort(outputs, count, sizeof(nir_xfb_output_info), compare_xfb_out);
427bf215546Sopenharmony_ci
428bf215546Sopenharmony_ci      /* Remove disabled outputs. */
429bf215546Sopenharmony_ci      for (int i = count - 1; i >= 0 && !outputs[i].component_mask; i--)
430bf215546Sopenharmony_ci         count = i;
431bf215546Sopenharmony_ci   }
432bf215546Sopenharmony_ci
433bf215546Sopenharmony_ci   for (unsigned i = 0; i < count; i++)
434bf215546Sopenharmony_ci      assert(outputs[i].component_mask);
435bf215546Sopenharmony_ci
436bf215546Sopenharmony_ci   /* Create nir_xfb_info. */
437bf215546Sopenharmony_ci   nir_xfb_info *info = calloc(1, nir_xfb_info_size(count));
438bf215546Sopenharmony_ci   if (!info) {
439bf215546Sopenharmony_ci      util_dynarray_fini(&array);
440bf215546Sopenharmony_ci      return NULL;
441bf215546Sopenharmony_ci   }
442bf215546Sopenharmony_ci
443bf215546Sopenharmony_ci   /* Fill nir_xfb_info. */
444bf215546Sopenharmony_ci   info->buffers_written = buffer_mask;
445bf215546Sopenharmony_ci   info->streams_written = stream_mask;
446bf215546Sopenharmony_ci   memcpy(info->buffer_to_stream, buffer_to_stream, sizeof(buffer_to_stream));
447bf215546Sopenharmony_ci   info->output_count = count;
448bf215546Sopenharmony_ci   memcpy(info->outputs, outputs, count * sizeof(outputs[0]));
449bf215546Sopenharmony_ci
450bf215546Sopenharmony_ci   /* Set strides. */
451bf215546Sopenharmony_ci   for (unsigned i = 0; i < MAX_XFB_BUFFERS; i++) {
452bf215546Sopenharmony_ci      if (buffer_mask & BITFIELD_BIT(i))
453bf215546Sopenharmony_ci         info->buffers[i].stride = nir->info.xfb_stride[i];
454bf215546Sopenharmony_ci   }
455bf215546Sopenharmony_ci
456bf215546Sopenharmony_ci   /* Set varying_count. */
457bf215546Sopenharmony_ci   for (unsigned i = 0; i < count; i++)
458bf215546Sopenharmony_ci      info->buffers[outputs[i].buffer].varying_count++;
459bf215546Sopenharmony_ci
460bf215546Sopenharmony_ci   util_dynarray_fini(&array);
461bf215546Sopenharmony_ci   return info;
462bf215546Sopenharmony_ci}
463bf215546Sopenharmony_ci
464bf215546Sopenharmony_civoid
465bf215546Sopenharmony_cinir_print_xfb_info(nir_xfb_info *info, FILE *fp)
466bf215546Sopenharmony_ci{
467bf215546Sopenharmony_ci   fprintf(fp, "buffers_written: 0x%x\n", info->buffers_written);
468bf215546Sopenharmony_ci   fprintf(fp, "streams_written: 0x%x\n", info->streams_written);
469bf215546Sopenharmony_ci
470bf215546Sopenharmony_ci   for (unsigned i = 0; i < NIR_MAX_XFB_BUFFERS; i++) {
471bf215546Sopenharmony_ci      if (BITFIELD_BIT(i) & info->buffers_written) {
472bf215546Sopenharmony_ci         fprintf(fp, "buffer%u: stride=%u varying_count=%u stream=%u\n", i,
473bf215546Sopenharmony_ci                 info->buffers[i].stride,
474bf215546Sopenharmony_ci                 info->buffers[i].varying_count,
475bf215546Sopenharmony_ci                 info->buffer_to_stream[i]);
476bf215546Sopenharmony_ci      }
477bf215546Sopenharmony_ci   }
478bf215546Sopenharmony_ci
479bf215546Sopenharmony_ci   fprintf(fp, "output_count: %u\n", info->output_count);
480bf215546Sopenharmony_ci
481bf215546Sopenharmony_ci   for (unsigned i = 0; i < info->output_count; i++) {
482bf215546Sopenharmony_ci      fprintf(fp, "output%u: buffer=%u, offset=%u, location=%u, "
483bf215546Sopenharmony_ci                  "component_offset=%u, component_mask=0x%x\n",
484bf215546Sopenharmony_ci              i, info->outputs[i].buffer,
485bf215546Sopenharmony_ci              info->outputs[i].offset,
486bf215546Sopenharmony_ci              info->outputs[i].location,
487bf215546Sopenharmony_ci              info->outputs[i].component_offset,
488bf215546Sopenharmony_ci              info->outputs[i].component_mask);
489bf215546Sopenharmony_ci   }
490bf215546Sopenharmony_ci}
491