1/*
2 * Copyright © 2018 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "nir_xfb_info.h"
25
26#include "util/u_dynarray.h"
27#include <util/u_math.h>
28
29static void
30add_var_xfb_varying(nir_xfb_info *xfb,
31                    nir_xfb_varyings_info *varyings,
32                    unsigned buffer,
33                    unsigned offset,
34                    const struct glsl_type *type)
35{
36   if (varyings == NULL)
37      return;
38
39   nir_xfb_varying_info *varying = &varyings->varyings[varyings->varying_count++];
40
41   varying->type = type;
42   varying->buffer = buffer;
43   varying->offset = offset;
44   xfb->buffers[buffer].varying_count++;
45}
46
47
48static nir_xfb_info *
49nir_xfb_info_create(void *mem_ctx, uint16_t output_count)
50{
51   return rzalloc_size(mem_ctx, nir_xfb_info_size(output_count));
52}
53
54static size_t
55nir_xfb_varyings_info_size(uint16_t varying_count)
56{
57   return sizeof(nir_xfb_info) + sizeof(nir_xfb_varying_info) * varying_count;
58}
59
60static nir_xfb_varyings_info *
61nir_xfb_varyings_info_create(void *mem_ctx, uint16_t varying_count)
62{
63   return rzalloc_size(mem_ctx, nir_xfb_varyings_info_size(varying_count));
64}
65
66static void
67add_var_xfb_outputs(nir_xfb_info *xfb,
68                    nir_xfb_varyings_info *varyings,
69                    nir_variable *var,
70                    unsigned buffer,
71                    unsigned *location,
72                    unsigned *offset,
73                    const struct glsl_type *type,
74                    bool varying_added)
75{
76   /* If this type contains a 64-bit value, align to 8 bytes */
77   if (glsl_type_contains_64bit(type))
78      *offset = ALIGN_POT(*offset, 8);
79
80   if (glsl_type_is_array_or_matrix(type) && !var->data.compact) {
81      unsigned length = glsl_get_length(type);
82
83      const struct glsl_type *child_type = glsl_get_array_element(type);
84      if (!glsl_type_is_array(child_type) &&
85          !glsl_type_is_struct(child_type)) {
86
87         add_var_xfb_varying(xfb, varyings, buffer, *offset, type);
88         varying_added = true;
89      }
90
91      for (unsigned i = 0; i < length; i++)
92         add_var_xfb_outputs(xfb, varyings, var, buffer, location, offset,
93                             child_type, varying_added);
94   } else if (glsl_type_is_struct_or_ifc(type)) {
95      unsigned length = glsl_get_length(type);
96      for (unsigned i = 0; i < length; i++) {
97         const struct glsl_type *child_type = glsl_get_struct_field(type, i);
98         add_var_xfb_outputs(xfb, varyings, var, buffer, location, offset,
99                             child_type, varying_added);
100      }
101   } else {
102      assert(buffer < NIR_MAX_XFB_BUFFERS);
103      if (xfb->buffers_written & (1 << buffer)) {
104         assert(xfb->buffers[buffer].stride == var->data.xfb.stride);
105         assert(xfb->buffer_to_stream[buffer] == var->data.stream);
106      } else {
107         xfb->buffers_written |= (1 << buffer);
108         xfb->buffers[buffer].stride = var->data.xfb.stride;
109         xfb->buffer_to_stream[buffer] = var->data.stream;
110      }
111
112      assert(var->data.stream < NIR_MAX_XFB_STREAMS);
113      xfb->streams_written |= (1 << var->data.stream);
114
115      unsigned comp_slots;
116      if (var->data.compact) {
117         /* This only happens for clip/cull which are float arrays */
118         assert(glsl_without_array(type) == glsl_float_type());
119         assert(var->data.location == VARYING_SLOT_CLIP_DIST0 ||
120                var->data.location == VARYING_SLOT_CLIP_DIST1);
121         comp_slots = glsl_get_length(type);
122      } else {
123         comp_slots = glsl_get_component_slots(type);
124
125         UNUSED unsigned attrib_slots = DIV_ROUND_UP(comp_slots, 4);
126         assert(attrib_slots == glsl_count_attribute_slots(type, false));
127
128         /* Ensure that we don't have, for instance, a dvec2 with a
129          * location_frac of 2 which would make it crass a location boundary
130          * even though it fits in a single slot.  However, you can have a
131          * dvec3 which crosses the slot boundary with a location_frac of 2.
132          */
133         assert(DIV_ROUND_UP(var->data.location_frac + comp_slots, 4) ==
134                attrib_slots);
135      }
136
137      assert(var->data.location_frac + comp_slots <= 8);
138      uint8_t comp_mask = ((1 << comp_slots) - 1) << var->data.location_frac;
139      unsigned comp_offset = var->data.location_frac;
140
141      if (!varying_added) {
142         add_var_xfb_varying(xfb, varyings, buffer, *offset, type);
143      }
144
145      while (comp_mask) {
146         nir_xfb_output_info *output = &xfb->outputs[xfb->output_count++];
147
148         output->buffer = buffer;
149         output->offset = *offset;
150         output->location = *location;
151         output->component_mask = comp_mask & 0xf;
152         output->component_offset = comp_offset;
153
154         *offset += util_bitcount(output->component_mask) * 4;
155         (*location)++;
156         comp_mask >>= 4;
157         comp_offset = 0;
158      }
159   }
160}
161
162static int
163compare_xfb_varying_offsets(const void *_a, const void *_b)
164{
165   const nir_xfb_varying_info *a = _a, *b = _b;
166
167   if (a->buffer != b->buffer)
168      return a->buffer - b->buffer;
169
170   return a->offset - b->offset;
171}
172
173static int
174compare_xfb_output_offsets(const void *_a, const void *_b)
175{
176   const nir_xfb_output_info *a = _a, *b = _b;
177
178   return a->offset - b->offset;
179}
180
181void
182nir_shader_gather_xfb_info(nir_shader *shader)
183{
184   nir_gather_xfb_info_with_varyings(shader, NULL, NULL);
185}
186
187void
188nir_gather_xfb_info_with_varyings(nir_shader *shader,
189                                  void *mem_ctx,
190                                  nir_xfb_varyings_info **varyings_info_out)
191{
192   assert(shader->info.stage == MESA_SHADER_VERTEX ||
193          shader->info.stage == MESA_SHADER_TESS_EVAL ||
194          shader->info.stage == MESA_SHADER_GEOMETRY);
195
196   /* Compute the number of outputs we have.  This is simply the number of
197    * cumulative locations consumed by all the variables.  If a location is
198    * represented by multiple variables, then they each count separately in
199    * number of outputs.  This is only an estimate as some variables may have
200    * an xfb_buffer but not an output so it may end up larger than we need but
201    * it should be good enough for allocation.
202    */
203   unsigned num_outputs = 0;
204   unsigned num_varyings = 0;
205   nir_xfb_varyings_info *varyings_info = NULL;
206   nir_foreach_shader_out_variable(var, shader) {
207      if (var->data.explicit_xfb_buffer) {
208         num_outputs += glsl_count_attribute_slots(var->type, false);
209         num_varyings += glsl_varying_count(var->type);
210      }
211   }
212   if (num_outputs == 0 || num_varyings == 0)
213      return;
214
215   nir_xfb_info *xfb = nir_xfb_info_create(shader, num_outputs);
216   if (varyings_info_out != NULL) {
217      *varyings_info_out = nir_xfb_varyings_info_create(mem_ctx, num_varyings);
218      varyings_info = *varyings_info_out;
219   }
220
221   /* Walk the list of outputs and add them to the array */
222   nir_foreach_shader_out_variable(var, shader) {
223      if (!var->data.explicit_xfb_buffer)
224         continue;
225
226      unsigned location = var->data.location;
227
228      /* In order to know if we have a array of blocks can't be done just by
229       * checking if we have an interface type and is an array, because due
230       * splitting we could end on a case were we received a split struct
231       * that contains an array.
232       */
233      bool is_array_block = var->interface_type != NULL &&
234         glsl_type_is_array(var->type) &&
235         glsl_without_array(var->type) == var->interface_type;
236
237      if (var->data.explicit_offset && !is_array_block) {
238         unsigned offset = var->data.offset;
239         add_var_xfb_outputs(xfb, varyings_info, var, var->data.xfb.buffer,
240                             &location, &offset, var->type, false);
241      } else if (is_array_block) {
242         assert(glsl_type_is_struct_or_ifc(var->interface_type));
243
244         unsigned aoa_size = glsl_get_aoa_size(var->type);
245         const struct glsl_type *itype = var->interface_type;
246         unsigned nfields = glsl_get_length(itype);
247         for (unsigned b = 0; b < aoa_size; b++) {
248            for (unsigned f = 0; f < nfields; f++) {
249               int foffset = glsl_get_struct_field_offset(itype, f);
250               const struct glsl_type *ftype = glsl_get_struct_field(itype, f);
251               if (foffset < 0) {
252                  location += glsl_count_attribute_slots(ftype, false);
253                  continue;
254               }
255
256               unsigned offset = foffset;
257               add_var_xfb_outputs(xfb, varyings_info, var, var->data.xfb.buffer + b,
258                                   &location, &offset, ftype, false);
259            }
260         }
261      }
262   }
263
264   /* Everything is easier in the state setup code if outputs and varyings are
265    * sorted in order of output offset (and buffer for varyings).
266    */
267   qsort(xfb->outputs, xfb->output_count, sizeof(xfb->outputs[0]),
268         compare_xfb_output_offsets);
269
270   if (varyings_info != NULL) {
271      qsort(varyings_info->varyings, varyings_info->varying_count,
272            sizeof(varyings_info->varyings[0]),
273            compare_xfb_varying_offsets);
274   }
275
276#ifndef NDEBUG
277   /* Finally, do a sanity check */
278   unsigned max_offset[NIR_MAX_XFB_BUFFERS] = {0};
279   for (unsigned i = 0; i < xfb->output_count; i++) {
280      assert(xfb->outputs[i].offset >= max_offset[xfb->outputs[i].buffer]);
281      assert(xfb->outputs[i].component_mask != 0);
282      unsigned slots = util_bitcount(xfb->outputs[i].component_mask);
283      max_offset[xfb->outputs[i].buffer] = xfb->outputs[i].offset + slots * 4;
284   }
285#endif
286
287   ralloc_free(shader->xfb_info);
288   shader->xfb_info = xfb;
289}
290
291static int
292get_xfb_out_sort_index(const nir_xfb_output_info *a)
293{
294   /* Return the maximum number to put dummy components at the end. */
295   if (!a->component_mask)
296      return MAX_XFB_BUFFERS << 26;
297
298   return ((uint32_t)a->buffer << 26) | /* 2 bits for the buffer */
299          /* 10 bits for the component location (256 * 4) */
300          (((uint32_t)a->location * 4 + a->component_offset) << 16) |
301          /* 16 bits for the offset */
302          a->offset;
303}
304
305static int
306compare_xfb_out(const void *pa, const void *pb)
307{
308   const nir_xfb_output_info *a = (const nir_xfb_output_info *)pa;
309   const nir_xfb_output_info *b = (const nir_xfb_output_info *)pb;
310
311   return get_xfb_out_sort_index(a) - get_xfb_out_sort_index(b);
312}
313
314/**
315 * Gather transform feedback info from lowered IO intrinsics.
316 *
317 * Optionally return slot_to_register, an optional table to translate
318 * gl_varying_slot to "base" indices.
319 */
320nir_xfb_info *
321nir_gather_xfb_info_from_intrinsics(nir_shader *nir,
322                                    int slot_to_register[NUM_TOTAL_VARYING_SLOTS])
323{
324   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
325   uint8_t buffer_to_stream[MAX_XFB_BUFFERS] = {0};
326   uint8_t buffer_mask = 0;
327   uint8_t stream_mask = 0;
328
329   if (slot_to_register) {
330      memset(slot_to_register, -1,
331             sizeof(slot_to_register[0] * NUM_TOTAL_VARYING_SLOTS));
332   }
333
334   /* Gather xfb outputs. */
335   struct util_dynarray array = {0};
336
337   nir_foreach_block(block, impl) {
338      nir_foreach_instr(instr, block) {
339         if (instr->type != nir_instr_type_intrinsic ||
340             !nir_instr_xfb_write_mask(nir_instr_as_intrinsic(instr)))
341            continue;
342
343         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
344
345         unsigned wr_mask = nir_intrinsic_write_mask(intr);
346
347         while (wr_mask) {
348            unsigned i = u_bit_scan(&wr_mask);
349            unsigned index = nir_intrinsic_component(intr) + i;
350            nir_io_xfb xfb = index < 2 ? nir_intrinsic_io_xfb(intr) :
351                                         nir_intrinsic_io_xfb2(intr);
352
353            if (xfb.out[index % 2].num_components) {
354               nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
355               nir_xfb_output_info out;
356
357               out.component_offset = index;
358               out.component_mask =
359                  BITFIELD_RANGE(index, xfb.out[index % 2].num_components);
360               out.location = sem.location;
361               out.buffer = xfb.out[index % 2].buffer;
362               out.offset = (uint32_t)xfb.out[index % 2].offset * 4;
363               util_dynarray_append(&array, nir_xfb_output_info, out);
364
365               uint8_t stream = (sem.gs_streams >> (i * 2)) & 0x3;
366               buffer_to_stream[out.buffer] = stream;
367               buffer_mask |= BITFIELD_BIT(out.buffer);
368               stream_mask |= BITFIELD_BIT(stream);
369
370               if (slot_to_register)
371                  slot_to_register[sem.location] = nir_intrinsic_base(intr);
372
373               /* No elements before component_offset are allowed to be set. */
374               assert(!(out.component_mask & BITFIELD_MASK(out.component_offset)));
375            }
376         }
377      }
378   }
379
380   nir_xfb_output_info *outputs = (nir_xfb_output_info *)array.data;
381   int count = util_dynarray_num_elements(&array, nir_xfb_output_info);
382
383   if (!count)
384      return NULL;
385
386   if (count > 1) {
387      /* Sort outputs by buffer, location, and component. */
388      qsort(outputs, count, sizeof(nir_xfb_output_info), compare_xfb_out);
389
390      /* Merge outputs referencing the same slot. */
391      for (int i = 0; i < count - 1; i++) {
392         nir_xfb_output_info *cur = &outputs[i];
393
394         if (!cur->component_mask)
395            continue;
396
397         /* Outputs referencing the same buffer and location are contiguous. */
398         for (int j = i + 1;
399              j < count &&
400              cur->buffer == outputs[j].buffer &&
401              cur->location == outputs[j].location; j++) {
402            if (outputs[j].component_mask &&
403                outputs[j].offset - outputs[j].component_offset * 4 ==
404                cur->offset - cur->component_offset * 4) {
405               unsigned merged_offset = MIN2(cur->component_offset,
406                                             outputs[j].component_offset);
407               /* component_mask is relative to 0, not component_offset */
408               unsigned merged_mask = cur->component_mask | outputs[j].component_mask;
409
410               /* The component mask should have no holes after merging. */
411               if (util_is_power_of_two_nonzero((merged_mask >> merged_offset) + 1)) {
412                  /* Merge outputs. */
413                  cur->component_offset = merged_offset;
414                  cur->component_mask = merged_mask;
415                  cur->offset = (uint32_t)cur->offset -
416                                (uint32_t)cur->component_offset * 4 +
417                                (uint32_t)merged_offset * 4;
418                  /* Disable the other output. */
419                  outputs[j].component_mask = 0;
420               }
421            }
422         }
423      }
424
425      /* Sort outputs again to put disabled outputs at the end. */
426      qsort(outputs, count, sizeof(nir_xfb_output_info), compare_xfb_out);
427
428      /* Remove disabled outputs. */
429      for (int i = count - 1; i >= 0 && !outputs[i].component_mask; i--)
430         count = i;
431   }
432
433   for (unsigned i = 0; i < count; i++)
434      assert(outputs[i].component_mask);
435
436   /* Create nir_xfb_info. */
437   nir_xfb_info *info = calloc(1, nir_xfb_info_size(count));
438   if (!info) {
439      util_dynarray_fini(&array);
440      return NULL;
441   }
442
443   /* Fill nir_xfb_info. */
444   info->buffers_written = buffer_mask;
445   info->streams_written = stream_mask;
446   memcpy(info->buffer_to_stream, buffer_to_stream, sizeof(buffer_to_stream));
447   info->output_count = count;
448   memcpy(info->outputs, outputs, count * sizeof(outputs[0]));
449
450   /* Set strides. */
451   for (unsigned i = 0; i < MAX_XFB_BUFFERS; i++) {
452      if (buffer_mask & BITFIELD_BIT(i))
453         info->buffers[i].stride = nir->info.xfb_stride[i];
454   }
455
456   /* Set varying_count. */
457   for (unsigned i = 0; i < count; i++)
458      info->buffers[outputs[i].buffer].varying_count++;
459
460   util_dynarray_fini(&array);
461   return info;
462}
463
464void
465nir_print_xfb_info(nir_xfb_info *info, FILE *fp)
466{
467   fprintf(fp, "buffers_written: 0x%x\n", info->buffers_written);
468   fprintf(fp, "streams_written: 0x%x\n", info->streams_written);
469
470   for (unsigned i = 0; i < NIR_MAX_XFB_BUFFERS; i++) {
471      if (BITFIELD_BIT(i) & info->buffers_written) {
472         fprintf(fp, "buffer%u: stride=%u varying_count=%u stream=%u\n", i,
473                 info->buffers[i].stride,
474                 info->buffers[i].varying_count,
475                 info->buffer_to_stream[i]);
476      }
477   }
478
479   fprintf(fp, "output_count: %u\n", info->output_count);
480
481   for (unsigned i = 0; i < info->output_count; i++) {
482      fprintf(fp, "output%u: buffer=%u, offset=%u, location=%u, "
483                  "component_offset=%u, component_mask=0x%x\n",
484              i, info->outputs[i].buffer,
485              info->outputs[i].offset,
486              info->outputs[i].location,
487              info->outputs[i].component_offset,
488              info->outputs[i].component_mask);
489   }
490}
491