/*
 * Copyright © 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler/nir/nir_builder.h"
#include "ir3_compiler.h"
#include "ir3_nir.h"

struct state {
   uint32_t topology;

   struct primitive_map {
      /* +POSITION, +PSIZE, ... - see shader_io_get_unique_index */
      unsigned loc[12 + 32];
      unsigned stride;
   } map;

   nir_ssa_def *header;

   nir_variable *vertex_count_var;
   nir_variable *emitted_vertex_var;
   nir_variable *vertex_flags_out;

   struct exec_list old_outputs;
   struct exec_list new_outputs;
   struct exec_list emit_outputs;

   /* tess ctrl shader on a650 gets the local primitive id at different bits: */
   unsigned local_primitive_id_start;
};

static nir_ssa_def *
bitfield_extract(nir_builder *b, nir_ssa_def *v, uint32_t start, uint32_t mask)
{
   return nir_iand(b, nir_ushr(b, v, nir_imm_int(b, start)),
                   nir_imm_int(b, mask));
}
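
/* Quick reference for the packed header word decoded by the helpers below,
 * as implied by the (start, mask) pairs used in this file (derived from the
 * code here rather than from a hardware doc):
 *
 *    bits [5:0]    local primitive id (bits [21:16] instead when
 *                  local_primitive_id_start == 16, see
 *                  ir3_nir_lower_to_explicit_input())
 *    bits [10:6]   vertex id
 *    bits [15:11]  invocation id
 *    bits [25:16]  local thread id (GS header, see local_thread_id())
 */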

static nir_ssa_def *
build_invocation_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, 11, 31);
}

static nir_ssa_def *
build_vertex_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, 6, 31);
}

static nir_ssa_def *
build_local_primitive_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, state->local_primitive_id_start,
                           63);
}

static bool
is_tess_levels(gl_varying_slot slot)
{
   return (slot == VARYING_SLOT_PRIMITIVE_ID ||
           slot == VARYING_SLOT_TESS_LEVEL_OUTER ||
           slot == VARYING_SLOT_TESS_LEVEL_INNER);
}

/* Return a deterministic index for varyings. We can't rely on driver_location
 * to be correct without linking the different stages first, so we create
 * "primitive maps" where the producer decides on the location of each varying
 * slot and then exports a per-slot array to the consumer. This compacts the
 * gl_varying_slot space down a bit so that the primitive maps aren't too
 * large.
 *
 * Note: per-patch varyings are currently handled separately, without any
 * compacting.
 *
 * TODO: We could probably use the driver_locations directly in the non-SSO
 * (Vulkan) case.
 */

static unsigned
shader_io_get_unique_index(gl_varying_slot slot)
{
   switch (slot) {
   case VARYING_SLOT_POS:         return 0;
   case VARYING_SLOT_PSIZ:        return 1;
   case VARYING_SLOT_COL0:        return 2;
   case VARYING_SLOT_COL1:        return 3;
   case VARYING_SLOT_BFC0:        return 4;
   case VARYING_SLOT_BFC1:        return 5;
   case VARYING_SLOT_FOGC:        return 6;
   case VARYING_SLOT_CLIP_DIST0:  return 7;
   case VARYING_SLOT_CLIP_DIST1:  return 8;
   case VARYING_SLOT_CLIP_VERTEX: return 9;
   case VARYING_SLOT_LAYER:       return 10;
   case VARYING_SLOT_VIEWPORT:    return 11;
   case VARYING_SLOT_VAR0 ... VARYING_SLOT_VAR31: {
      struct state state = {};
      STATIC_ASSERT(ARRAY_SIZE(state.map.loc) - 1 ==
                    (12 + VARYING_SLOT_VAR31 - VARYING_SLOT_VAR0));
      struct ir3_shader_variant v = {};
      STATIC_ASSERT(ARRAY_SIZE(v.output_loc) - 1 ==
                    (12 + VARYING_SLOT_VAR31 - VARYING_SLOT_VAR0));
      return 12 + (slot - VARYING_SLOT_VAR0);
   }
   default:
      unreachable("illegal slot in get unique index\n");
   }
}

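/* Computes a byte offset into the shared (local) memory used to pass
 * per-vertex data between stages (the stlw/ldlw path, see the comment in
 * build_primitive_map()).  A rough sketch of the expression built below,
 * where "offset" is the intrinsic's indirect offset source in vec4 units:
 *
 *    local_primitive_id * primitive_stride   (per-primitive block)
 *      + vertex * vertex_stride              (per-vertex block)
 *      + slot_base + comp * 4                (attr_offset)
 *      + offset * 16                         (indirect vec4 offset)
 */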
static nir_ssa_def *
build_local_offset(nir_builder *b, struct state *state, nir_ssa_def *vertex,
                   uint32_t location, uint32_t comp, nir_ssa_def *offset)
{
   nir_ssa_def *primitive_stride = nir_load_vs_primitive_stride_ir3(b);
   nir_ssa_def *primitive_offset =
      nir_imul24(b, build_local_primitive_id(b, state), primitive_stride);
   nir_ssa_def *attr_offset;
   nir_ssa_def *vertex_stride;
   unsigned index = shader_io_get_unique_index(location);

   switch (b->shader->info.stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      vertex_stride = nir_imm_int(b, state->map.stride * 4);
      attr_offset = nir_imm_int(b, state->map.loc[index] + 4 * comp);
      break;
   case MESA_SHADER_TESS_CTRL:
   case MESA_SHADER_GEOMETRY:
      vertex_stride = nir_load_vs_vertex_stride_ir3(b);
      attr_offset = nir_iadd(b, nir_load_primitive_location_ir3(b, index),
                             nir_imm_int(b, comp * 4));
      break;
   default:
      unreachable("bad shader stage");
   }

   nir_ssa_def *vertex_offset = nir_imul24(b, vertex, vertex_stride);

   return nir_iadd(
      b, nir_iadd(b, primitive_offset, vertex_offset),
      nir_iadd(b, attr_offset, nir_ishl(b, offset, nir_imm_int(b, 4))));
}

static nir_intrinsic_instr *
replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
                  nir_intrinsic_op op, nir_ssa_def *src0, nir_ssa_def *src1,
                  nir_ssa_def *src2)
{
   nir_intrinsic_instr *new_intr = nir_intrinsic_instr_create(b->shader, op);

   new_intr->src[0] = nir_src_for_ssa(src0);
   if (src1)
      new_intr->src[1] = nir_src_for_ssa(src1);
   if (src2)
      new_intr->src[2] = nir_src_for_ssa(src2);

   new_intr->num_components = intr->num_components;

   if (nir_intrinsic_infos[op].has_dest)
      nir_ssa_dest_init(&new_intr->instr, &new_intr->dest, intr->num_components,
                        intr->dest.ssa.bit_size, NULL);

   nir_builder_instr_insert(b, &new_intr->instr);

   if (nir_intrinsic_infos[op].has_dest)
      nir_ssa_def_rewrite_uses(&intr->dest.ssa, &new_intr->dest.ssa);

   nir_instr_remove(&intr->instr);

   return new_intr;
}

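/* Example of the resulting map (a sketch, assuming a VS that writes only
 * POS and VAR0, feeding a non-TCS consumer): loc[0] = 0 and loc[12] = 16,
 * i.e. one 16-byte vec4 slot per written output, and map->stride ends up
 * as 8 dwords (32 bytes / 4).  For a TCS producer each slot instead covers
 * all tcs_vertices_out vertices and the units stay in dwords.
 */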
static void
build_primitive_map(nir_shader *shader, struct primitive_map *map)
{
   /* All interfaces except the TCS <-> TES interface use ldlw, which takes
    * an offset in bytes, so each vec4 slot is 16 bytes. TCS <-> TES uses
    * ldg, which takes an offset in dwords, but each per-vertex slot has
    * space for every vertex, and there's space at the beginning for
    * per-patch varyings.
    */
   unsigned slot_size = 16, start = 0;
   if (shader->info.stage == MESA_SHADER_TESS_CTRL) {
      slot_size = shader->info.tess.tcs_vertices_out * 4;
      start = util_last_bit(shader->info.patch_outputs_written) * 4;
   }

   uint64_t mask = shader->info.outputs_written;
   unsigned loc = start;
   while (mask) {
      int location = u_bit_scan64(&mask);
      if (is_tess_levels(location))
         continue;

      unsigned index = shader_io_get_unique_index(location);
      map->loc[index] = loc;
      loc += slot_size;
   }

   map->stride = loc;
   /* Use units of dwords for the stride. */
   if (shader->info.stage != MESA_SHADER_TESS_CTRL)
      map->stride /= 4;
}

/* For shader stages that receive a primitive map, calculate how big it should
 * be.
 */

static unsigned
calc_primitive_map_size(nir_shader *shader)
{
   uint64_t mask = shader->info.inputs_read;
   unsigned max_index = 0;
   while (mask) {
      int location = u_bit_scan64(&mask);

      if (is_tess_levels(location))
         continue;

      unsigned index = shader_io_get_unique_index(location);
      max_index = MAX2(max_index, index + 1);
   }

   return max_index;
}

static void
lower_block_to_explicit_output(nir_block *block, nir_builder *b,
                               struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* nir_lower_io_to_temporaries replaces all access to output
          * variables with temp variables and then emits a nir_copy_var at
          * the end of the shader.  Thus, we should always get a full wrmask
          * here.
          */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         b->cursor = nir_instr_remove(&intr->instr);

         nir_ssa_def *vertex_id = build_vertex_id(b, state);
         nir_ssa_def *offset = build_local_offset(
            b, state, vertex_id, nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         nir_store_shared_ir3(b, intr->src[0].ssa, offset);
         break;
      }

      default:
         break;
      }
   }
}

static nir_ssa_def *
local_thread_id(nir_builder *b)
{
   return bitfield_extract(b, nir_load_gs_header_ir3(b), 16, 1023);
}

void
ir3_nir_lower_to_explicit_output(nir_shader *shader,
                                 struct ir3_shader_variant *v,
                                 unsigned topology)
{
   struct state state = {};

   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   if (v->type == MESA_SHADER_VERTEX && topology != IR3_TESS_NONE)
      state.header = nir_load_tcs_header_ir3(&b);
   else
      state.header = nir_load_gs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_output(block, &b, &state);

   nir_metadata_preserve(impl,
                         nir_metadata_block_index | nir_metadata_dominance);

   v->output_size = state.map.stride;
}

static void
lower_block_to_explicit_input(nir_block *block, nir_builder *b,
                              struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *offset = build_local_offset(
            b, state,
            intr->src[0].ssa, // this is typically gl_InvocationID
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL,
                           NULL);
         break;
      }

      case nir_intrinsic_load_invocation_id: {
         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *iid = build_invocation_id(b, state);
         nir_ssa_def_rewrite_uses(&intr->dest.ssa, iid);
         nir_instr_remove(&intr->instr);
         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_to_explicit_input(nir_shader *shader,
                                struct ir3_shader_variant *v)
{
   struct state state = {};

   /* when using stl/ldl (instead of stlw/ldlw) for linking VS and HS,
    * HS uses a different primitive id, which starts at bit 16 in the header
    */
   if (shader->info.stage == MESA_SHADER_TESS_CTRL &&
       v->compiler->tess_use_shared)
      state.local_primitive_id_start = 16;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   if (shader->info.stage == MESA_SHADER_GEOMETRY)
      state.header = nir_load_gs_header_ir3(&b);
   else
      state.header = nir_load_tcs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_input(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);
}

static nir_ssa_def *
build_tcs_out_vertices(nir_builder *b)
{
   if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
      return nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
   else
      return nir_load_patch_vertices_in(b);
}

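/* Builds a dword offset into the tess param BO (load/store_global_ir3 takes
 * a dword offset here).  A sketch of the per-patch layout this implies, with
 * patches hs_patch_stride dwords apart:
 *
 *    [ per-patch slots: 4 dwords each, indexed by (location - PATCH0) ]
 *    [ per-vertex slots: 4 * tcs_vertices_out dwords each, at the compacted
 *      primitive-map locations (which start after the per-patch area);
 *      within a slot, vertex v begins at dword 4 * v ]
 */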
static nir_ssa_def *
build_per_vertex_offset(nir_builder *b, struct state *state,
                        nir_ssa_def *vertex, uint32_t location, uint32_t comp,
                        nir_ssa_def *offset)
{
   nir_ssa_def *patch_id = nir_load_rel_patch_id_ir3(b);
   nir_ssa_def *patch_stride = nir_load_hs_patch_stride_ir3(b);
   nir_ssa_def *patch_offset = nir_imul24(b, patch_id, patch_stride);
   nir_ssa_def *attr_offset;

   if (nir_src_is_const(nir_src_for_ssa(offset))) {
      location += nir_src_as_uint(nir_src_for_ssa(offset));
      offset = nir_imm_int(b, 0);
   } else {
      /* Offset is in vec4's, but we need it in units of components for the
       * load/store_global_ir3 offset.
       */
      offset = nir_ishl(b, offset, nir_imm_int(b, 2));
   }

   nir_ssa_def *vertex_offset;
   if (vertex) {
      unsigned index = shader_io_get_unique_index(location);
      switch (b->shader->info.stage) {
      case MESA_SHADER_TESS_CTRL:
         attr_offset = nir_imm_int(b, state->map.loc[index] + comp);
         break;
      case MESA_SHADER_TESS_EVAL:
         attr_offset = nir_iadd(b, nir_load_primitive_location_ir3(b, index),
                                nir_imm_int(b, comp));
         break;
      default:
         unreachable("bad shader stage");
      }

      attr_offset = nir_iadd(b, attr_offset,
                             nir_imul24(b, offset, build_tcs_out_vertices(b)));
      vertex_offset = nir_ishl(b, vertex, nir_imm_int(b, 2));
   } else {
      assert(location >= VARYING_SLOT_PATCH0 &&
             location <= VARYING_SLOT_TESS_MAX);
      unsigned index = location - VARYING_SLOT_PATCH0;
      attr_offset = nir_iadd(b, nir_imm_int(b, index * 4 + comp), offset);
      vertex_offset = nir_imm_int(b, 0);
   }

   return nir_iadd(b, nir_iadd(b, patch_offset, attr_offset), vertex_offset);
}

static nir_ssa_def *
build_patch_offset(nir_builder *b, struct state *state, uint32_t base,
                   uint32_t comp, nir_ssa_def *offset)
{
   return build_per_vertex_offset(b, state, NULL, base, comp, offset);
}

static void
tess_level_components(struct state *state, uint32_t *inner, uint32_t *outer)
{
   switch (state->topology) {
   case IR3_TESS_TRIANGLES:
      *inner = 1;
      *outer = 3;
      break;
   case IR3_TESS_QUADS:
      *inner = 2;
      *outer = 4;
      break;
   case IR3_TESS_ISOLINES:
      *inner = 0;
      *outer = 2;
      break;
   default:
      unreachable("bad");
   }
}

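/* Per-patch layout of the tess factor BO implied by the offsets below,
 * with patch_stride = 1 + outer_levels + inner_levels dwords:
 *
 *    dword 0                     gl_PrimitiveID
 *    dwords 1 .. outer           gl_TessLevelOuter[]
 *    dwords 1 + outer .. end     gl_TessLevelInner[]
 */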
static nir_ssa_def *
build_tessfactor_base(nir_builder *b, gl_varying_slot slot, uint32_t comp,
                      struct state *state)
{
   uint32_t inner_levels, outer_levels;
   tess_level_components(state, &inner_levels, &outer_levels);

   const uint32_t patch_stride = 1 + inner_levels + outer_levels;

   nir_ssa_def *patch_id = nir_load_rel_patch_id_ir3(b);

   nir_ssa_def *patch_offset =
      nir_imul24(b, patch_id, nir_imm_int(b, patch_stride));

   uint32_t offset;
   switch (slot) {
   case VARYING_SLOT_PRIMITIVE_ID:
      offset = 0;
      break;
   case VARYING_SLOT_TESS_LEVEL_OUTER:
      offset = 1;
      break;
   case VARYING_SLOT_TESS_LEVEL_INNER:
      offset = 1 + outer_levels;
      break;
   default:
      unreachable("bad");
   }

   return nir_iadd(b, patch_offset, nir_imm_int(b, offset + comp));
}

static void
lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_output: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
         nir_ssa_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_per_vertex_output: {
         // src[] = { value, vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         nir_ssa_def *value = intr->src[0].ssa;
         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
         nir_ssa_def *offset = build_per_vertex_offset(
            b, state, intr->src[1].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[2].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value,
                           address, offset);

         break;
      }

      case nir_intrinsic_load_output: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            assert(intr->dest.ssa.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* write patch output to bo */

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            uint32_t inner_levels, outer_levels, levels;
            tess_level_components(state, &inner_levels, &outer_levels);

            assert(intr->src[0].ssa->num_components == 1);

            nir_if *nif = NULL;
            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               /* The tess levels are declared as float[4] and float[2], but
                * the tess factor BO has smaller sizes for tris/isolines, so
                * we have to discard any writes beyond the number of
                * components for the inner/outer levels.
                */
               if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
                  levels = outer_levels;
               else
                  levels = inner_levels;

               nir_ssa_def *offset = nir_iadd_imm(
                  b, intr->src[1].ssa, nir_intrinsic_component(intr));
               nif = nir_push_if(b, nir_ult(b, offset, nir_imm_int(b, levels)));
            }

            nir_ssa_def *offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa,
                              nir_load_tess_factor_base_ir3(b),
                              nir_iadd(b, intr->src[1].ssa, offset));

            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               nir_pop_if(b, nif);
            }
         } else {
            nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
            nir_ssa_def *offset = build_patch_offset(
               b, state, location, nir_intrinsic_component(intr),
               intr->src[1].ssa);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa, address, offset);
         }
         break;
      }

      default:
         break;
      }
   }
}

static void
emit_tess_epilogue(nir_builder *b, struct state *state)
{
   /* Insert endpatch instruction:
    *
    * TODO we should re-work this to use normal flow control.
    */

   nir_end_patch_ir3(b);
}

void
ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
   v->output_size = state.map.stride;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   state.header = nir_load_tcs_header_ir3(&b);

   /* If required, store gl_PrimitiveID. */
   if (v->key.tcs_store_primid) {
      b.cursor = nir_after_cf_list(&impl->body);

      nir_store_output(&b, nir_load_primitive_id(&b), nir_imm_int(&b, 0),
                       .io_semantics = {
                           .location = VARYING_SLOT_PRIMITIVE_ID,
                           .num_slots = 1
                        });

      b.cursor = nir_before_cf_list(&impl->body);
   }

   nir_foreach_block_safe (block, impl)
      lower_tess_ctrl_block(block, &b, &state);

   /* Now move the body of the TCS into a conditional:
    *
    *   if (gl_InvocationID < num_vertices)
    *     // body
    *
    */

   nir_cf_list body;
   nir_cf_extract(&body, nir_before_cf_list(&impl->body),
                  nir_after_cf_list(&impl->body));

   b.cursor = nir_after_cf_list(&impl->body);

   /* Re-emit the header, since the old one got moved into the if branch */
   state.header = nir_load_tcs_header_ir3(&b);
   nir_ssa_def *iid = build_invocation_id(&b, &state);

   const uint32_t nvertices = shader->info.tess.tcs_vertices_out;
   nir_ssa_def *cond = nir_ult(&b, iid, nir_imm_int(&b, nvertices));

   nir_if *nif = nir_push_if(&b, cond);

   nir_cf_reinsert(&body, b.cursor);

   b.cursor = nir_after_cf_list(&nif->then_list);

   /* Insert conditional exit for threads with invocation id != 0 */
   nir_ssa_def *iid0_cond = nir_ieq_imm(&b, iid, 0);
   nir_cond_end_ir3(&b, iid0_cond);

   emit_tess_epilogue(&b, &state);

   nir_pop_if(&b, nif);

   nir_metadata_preserve(impl, nir_metadata_none);
}

static void
lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_tess_coord: {
         b->cursor = nir_after_instr(&intr->instr);
         nir_ssa_def *x = nir_channel(b, &intr->dest.ssa, 0);
         nir_ssa_def *y = nir_channel(b, &intr->dest.ssa, 1);
         nir_ssa_def *z;

         if (state->topology == IR3_TESS_TRIANGLES)
            z = nir_fsub(b, nir_fsub(b, nir_imm_float(b, 1.0f), y), x);
         else
            z = nir_imm_float(b, 0.0f);

         nir_ssa_def *coord = nir_vec3(b, x, y, z);

         nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, coord,
                                        b->cursor.instr);
         break;
      }

      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
         nir_ssa_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_load_input: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            assert(intr->dest.ssa.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);

   nir_foreach_block_safe (block, impl)
      lower_tess_eval_block(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);

   nir_metadata_preserve(impl, nir_metadata_none);
}

static void
copy_vars(nir_builder *b, struct exec_list *dests, struct exec_list *srcs)
{
   foreach_two_lists (dest_node, dests, src_node, srcs) {
      nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
      nir_variable *src = exec_node_data(nir_variable, src_node, node);
      nir_copy_var(b, dest, src);
   }
}

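/* A rough sketch of what the lowering below does to the GS intrinsics:
 *
 *    end_primitive          ->  vertex_flags_out.x = 4   (cut)
 *
 *    emit_vertex (stream)   ->  if (vertex_count == local_thread_id()) {
 *                                  vertex_flags_out.x |= stream;
 *                                  emit_outputs = old_outputs;
 *                                  emitted_vertex++;
 *                               }
 *                               vertex_count++;
 *                               vertex_flags_out.x = 0;
 */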
static void
lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_end_primitive: {
         /* Note: This ignores the stream, which seems to match the blob
          * behavior. I'm guessing the HW ignores any extraneous cut
          * signals from an EndPrimitive() that doesn't correspond to the
          * rasterized stream.
          */
         b->cursor = nir_before_instr(&intr->instr);
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 4), 0x1);
         nir_instr_remove(&intr->instr);
         break;
      }

      case nir_intrinsic_emit_vertex: {
         /* Load the vertex count */
         b->cursor = nir_before_instr(&intr->instr);
         nir_ssa_def *count = nir_load_var(b, state->vertex_count_var);

         nir_push_if(b, nir_ieq(b, count, local_thread_id(b)));

         unsigned stream = nir_intrinsic_stream_id(intr);
         /* vertex_flags_out |= stream */
         nir_store_var(b, state->vertex_flags_out,
                       nir_ior(b, nir_load_var(b, state->vertex_flags_out),
                               nir_imm_int(b, stream)),
                       0x1 /* .x */);

         copy_vars(b, &state->emit_outputs, &state->old_outputs);

         nir_instr_remove(&intr->instr);

         nir_store_var(b, state->emitted_vertex_var,
                       nir_iadd(b, nir_load_var(b, state->emitted_vertex_var),
                                nir_imm_int(b, 1)),
                       0x1);

         nir_pop_if(b, NULL);

         /* Increment the vertex count by 1 */
         nir_store_var(b, state->vertex_count_var,
                       nir_iadd(b, count, nir_imm_int(b, 1)), 0x1); /* .x */
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 0), 0x1);

         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_gs(nir_shader *shader)
{
   struct state state = {};

   /* Don't lower multiple times: */
   nir_foreach_shader_out_variable (var, shader)
      if (var->data.location == VARYING_SLOT_GS_VERTEX_FLAGS_IR3)
         return;

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (before gs lowering):");
      nir_log_shaderi(shader);
   }

   /* Create an output var for vertex_flags. This will be shadowed below,
    * the same way regular outputs get shadowed, and this variable will
    * become a temporary.
    */
   state.vertex_flags_out = nir_variable_create(
      shader, nir_var_shader_out, glsl_uint_type(), "vertex_flags");
   state.vertex_flags_out->data.driver_location = shader->num_outputs++;
   state.vertex_flags_out->data.location = VARYING_SLOT_GS_VERTEX_FLAGS_IR3;
   state.vertex_flags_out->data.interpolation = INTERP_MODE_NONE;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   state.header = nir_load_gs_header_ir3(&b);

   /* Generate two sets of shadow vars for the output variables.  The first
    * set replaces the real outputs and the second set (emit_outputs) we'll
    * assign in the emit_vertex conditionals.  Then at the end of the shader
    * we copy the emit_outputs to the real outputs, so that we get
    * store_output in uniform control flow.
    */
   exec_list_make_empty(&state.old_outputs);
   nir_foreach_shader_out_variable_safe (var, shader) {
      exec_node_remove(&var->node);
      exec_list_push_tail(&state.old_outputs, &var->node);
   }
   exec_list_make_empty(&state.new_outputs);
   exec_list_make_empty(&state.emit_outputs);
   nir_foreach_variable_in_list (var, &state.old_outputs) {
      /* Create a new output var by cloning the original output var and
       * stealing the name.
       */
      nir_variable *output = nir_variable_clone(var, shader);
      exec_list_push_tail(&state.new_outputs, &output->node);

      /* Rewrite the original output to be a shadow variable. */
      var->name = ralloc_asprintf(var, "%s@gs-temp", output->name);
      var->data.mode = nir_var_shader_temp;

      /* Clone the shadow variable to create the emit shadow variable that
       * we'll assign in the emit conditionals.
       */
      nir_variable *emit_output = nir_variable_clone(var, shader);
      emit_output->name = ralloc_asprintf(var, "%s@emit-temp", output->name);
      exec_list_push_tail(&state.emit_outputs, &emit_output->node);
   }

   /* During the shader we'll keep track of which vertex we're currently
    * emitting for the EmitVertex test, and of how many vertices we emitted so
    * that we know to discard if we didn't emit any.  In most simple shaders,
    * this can all be statically determined and gets optimized away.
    */
   state.vertex_count_var =
      nir_local_variable_create(impl, glsl_uint_type(), "vertex_count");
   state.emitted_vertex_var =
      nir_local_variable_create(impl, glsl_uint_type(), "emitted_vertex");

   /* Initialize to 0. */
   b.cursor = nir_before_cf_list(&impl->body);
   nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.emitted_vertex_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);

   nir_foreach_block_safe (block, impl)
      lower_gs_block(block, &b, &state);

   /* Note: returns are lowered, so there should be only one block before the
    * end block.  If we had real returns, we would probably want to redirect
    * them to this new if statement, rather than emitting this code at every
    * return statement.
    */
   assert(impl->end_block->predecessors->entries == 1);
   nir_block *block = nir_impl_last_block(impl);
   b.cursor = nir_after_block_before_jump(block);

   /* If we haven't emitted any vertex we need to copy the shadow (old)
    * outputs to emit outputs here.
    *
    * Also some piglit GS tests[1] don't have EndPrimitive() so throw
    * in an extra vertex_flags write for good measure.  If unneeded it
    * will be optimized out.
    *
    * [1] ex, tests/spec/glsl-1.50/execution/compatibility/clipping/gs-clip-vertex-const-accept.shader_test
    */
   nir_ssa_def *cond =
      nir_ieq_imm(&b, nir_load_var(&b, state.emitted_vertex_var), 0);
   nir_push_if(&b, cond);
   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);
   copy_vars(&b, &state.emit_outputs, &state.old_outputs);
   nir_pop_if(&b, NULL);

   nir_discard_if(&b, cond);

   copy_vars(&b, &state.new_outputs, &state.emit_outputs);

   exec_list_append(&shader->variables, &state.old_outputs);
   exec_list_append(&shader->variables, &state.emit_outputs);
   exec_list_append(&shader->variables, &state.new_outputs);

   nir_metadata_preserve(impl, nir_metadata_none);

   nir_lower_global_vars_to_local(shader);
   nir_split_var_copies(shader);
   nir_lower_var_copies(shader);

   nir_fixup_deref_modes(shader);

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (after gs lowering):");
      nir_log_shaderi(shader);
   }
}