1/*
2 * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 *    Rob Clark <robclark@freedesktop.org>
25 */
26
27#include <stdarg.h>
28
29#include "util/u_math.h"
30#include "util/u_memory.h"
31#include "util/u_string.h"
32
33#include "ir3_compiler.h"
34#include "ir3_image.h"
35#include "ir3_nir.h"
36#include "ir3_shader.h"
37
38#include "instr-a3xx.h"
39#include "ir3.h"
40#include "ir3_context.h"
41
42void
43ir3_handle_nonuniform(struct ir3_instruction *instr,
44                      nir_intrinsic_instr *intrin)
45{
46   if (nir_intrinsic_has_access(intrin) &&
47       (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)) {
48      instr->flags |= IR3_INSTR_NONUNIF;
49   }
50}
51
52void
53ir3_handle_bindless_cat6(struct ir3_instruction *instr, nir_src rsrc)
54{
55   nir_intrinsic_instr *intrin = ir3_bindless_resource(rsrc);
56   if (!intrin)
57      return;
58
59   instr->flags |= IR3_INSTR_B;
60   instr->cat6.base = nir_intrinsic_desc_set(intrin);
61}
62
63static struct ir3_instruction *
64create_input(struct ir3_context *ctx, unsigned compmask)
65{
66   struct ir3_instruction *in;
67
68   in = ir3_instr_create(ctx->in_block, OPC_META_INPUT, 1, 0);
69   in->input.sysval = ~0;
70   __ssa_dst(in)->wrmask = compmask;
71
72   array_insert(ctx->ir, ctx->ir->inputs, in);
73
74   return in;
75}
76
77static struct ir3_instruction *
78create_frag_input(struct ir3_context *ctx, struct ir3_instruction *coord,
79                  unsigned n)
80{
81   struct ir3_block *block = ctx->block;
82   struct ir3_instruction *instr;
83   /* packed inloc is fixed up later: */
84   struct ir3_instruction *inloc = create_immed(block, n);
85
86   if (coord) {
87      instr = ir3_BARY_F(block, inloc, 0, coord, 0);
88   } else if (ctx->compiler->flat_bypass) {
89      if (ctx->compiler->gen >= 6) {
90         instr = ir3_FLAT_B(block, inloc, 0, inloc, 0);
91      } else {
92         instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
93         instr->cat6.type = TYPE_U32;
94         instr->cat6.iim_val = 1;
95      }
96   } else {
97      instr = ir3_BARY_F(block, inloc, 0, ctx->ij[IJ_PERSP_PIXEL], 0);
98      instr->srcs[1]->wrmask = 0x3;
99   }
100
101   return instr;
102}
103
104static struct ir3_instruction *
105create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp)
106{
107   /* first four vec4 sysval's reserved for UBOs: */
108   /* NOTE: dp is in scalar, but there can be >4 dp components: */
109   struct ir3_const_state *const_state = ir3_const_state(ctx->so);
110   unsigned n = const_state->offsets.driver_param;
111   unsigned r = regid(n + dp / 4, dp % 4);
112   return create_uniform(ctx->block, r);
113}
114
115/*
116 * Adreno's comparisons produce a 1 for true and 0 for false, in either 16 or
117 * 32-bit registers.  We use NIR's 1-bit integers to represent bools, and
118 * trust that we will only see and/or/xor on those 1-bit values, so we can
119 * safely store NIR i1s in a 32-bit reg while always containing either a 1 or
120 * 0.
121 */
122
123/*
124 * alu/sfu instructions:
125 */
126
127static struct ir3_instruction *
128create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
129           unsigned src_bitsize, nir_op op)
130{
131   type_t src_type, dst_type;
132
133   switch (op) {
134   case nir_op_f2f32:
135   case nir_op_f2f16_rtne:
136   case nir_op_f2f16_rtz:
137   case nir_op_f2f16:
138   case nir_op_f2i32:
139   case nir_op_f2i16:
140   case nir_op_f2i8:
141   case nir_op_f2u32:
142   case nir_op_f2u16:
143   case nir_op_f2u8:
144      switch (src_bitsize) {
145      case 32:
146         src_type = TYPE_F32;
147         break;
148      case 16:
149         src_type = TYPE_F16;
150         break;
151      default:
152         ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
153      }
154      break;
155
156   case nir_op_i2f32:
157   case nir_op_i2f16:
158   case nir_op_i2i32:
159   case nir_op_i2i16:
160   case nir_op_i2i8:
161      switch (src_bitsize) {
162      case 32:
163         src_type = TYPE_S32;
164         break;
165      case 16:
166         src_type = TYPE_S16;
167         break;
168      case 8:
169         src_type = TYPE_S8;
170         break;
171      default:
172         ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
173      }
174      break;
175
176   case nir_op_u2f32:
177   case nir_op_u2f16:
178   case nir_op_u2u32:
179   case nir_op_u2u16:
180   case nir_op_u2u8:
181      switch (src_bitsize) {
182      case 32:
183         src_type = TYPE_U32;
184         break;
185      case 16:
186         src_type = TYPE_U16;
187         break;
188      case 8:
189         src_type = TYPE_U8;
190         break;
191      default:
192         ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
193      }
194      break;
195
196   case nir_op_b2f16:
197   case nir_op_b2f32:
198   case nir_op_b2i8:
199   case nir_op_b2i16:
200   case nir_op_b2i32:
201      src_type = ctx->compiler->bool_type;
202      break;
203
204   default:
205      ir3_context_error(ctx, "invalid conversion op: %u", op);
206   }
207
208   switch (op) {
209   case nir_op_f2f32:
210   case nir_op_i2f32:
211   case nir_op_u2f32:
212   case nir_op_b2f32:
213      dst_type = TYPE_F32;
214      break;
215
216   case nir_op_f2f16_rtne:
217   case nir_op_f2f16_rtz:
218   case nir_op_f2f16:
219   case nir_op_i2f16:
220   case nir_op_u2f16:
221   case nir_op_b2f16:
222      dst_type = TYPE_F16;
223      break;
224
225   case nir_op_f2i32:
226   case nir_op_i2i32:
227   case nir_op_b2i32:
228      dst_type = TYPE_S32;
229      break;
230
231   case nir_op_f2i16:
232   case nir_op_i2i16:
233   case nir_op_b2i16:
234      dst_type = TYPE_S16;
235      break;
236
237   case nir_op_f2i8:
238   case nir_op_i2i8:
239   case nir_op_b2i8:
240      dst_type = TYPE_S8;
241      break;
242
243   case nir_op_f2u32:
244   case nir_op_u2u32:
245      dst_type = TYPE_U32;
246      break;
247
248   case nir_op_f2u16:
249   case nir_op_u2u16:
250      dst_type = TYPE_U16;
251      break;
252
253   case nir_op_f2u8:
254   case nir_op_u2u8:
255      dst_type = TYPE_U8;
256      break;
257
258   default:
259      ir3_context_error(ctx, "invalid conversion op: %u", op);
260   }
261
262   if (src_type == dst_type)
263      return src;
264
265   struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, dst_type);
266
267   if (op == nir_op_f2f16_rtne) {
268      cov->cat1.round = ROUND_EVEN;
269   } else if (op == nir_op_f2f16) {
270      unsigned execution_mode = ctx->s->info.float_controls_execution_mode;
271      nir_rounding_mode rounding_mode =
272         nir_get_rounding_mode_from_float_controls(execution_mode,
273                                                   nir_type_float16);
274      if (rounding_mode == nir_rounding_mode_rtne)
275         cov->cat1.round = ROUND_EVEN;
276   }
277
278   return cov;
279}
280
281/* For shift instructions NIR always has shift amount as 32 bit integer */
282static struct ir3_instruction *
283resize_shift_amount(struct ir3_context *ctx, struct ir3_instruction *src,
284                    unsigned bs)
285{
286   if (bs != 16)
287      return src;
288
289   return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U16);
290}
291
292static void
293emit_alu_dot_4x8_as_dp4acc(struct ir3_context *ctx, nir_alu_instr *alu,
294                           struct ir3_instruction **dst,
295                           struct ir3_instruction **src)
296{
297   struct ir3_instruction *accumulator = NULL;
298   if (alu->op == nir_op_udot_4x8_uadd_sat) {
299      accumulator = create_immed(ctx->block, 0);
300   } else {
301      accumulator = src[2];
302   }
303
304   dst[0] = ir3_DP4ACC(ctx->block, src[0], 0, src[1], 0, accumulator, 0);
305
306   if (alu->op == nir_op_udot_4x8_uadd ||
307       alu->op == nir_op_udot_4x8_uadd_sat) {
308      dst[0]->cat3.signedness = IR3_SRC_UNSIGNED;
309   } else {
310      dst[0]->cat3.signedness = IR3_SRC_MIXED;
311   }
312
313   /* For some reason (sat) doesn't work in unsigned case so
314    * we have to emulate it.
315    */
316   if (alu->op == nir_op_udot_4x8_uadd_sat) {
317      dst[0] = ir3_ADD_U(ctx->block, dst[0], 0, src[2], 0);
318      dst[0]->flags |= IR3_INSTR_SAT;
319   } else if (alu->op == nir_op_sudot_4x8_iadd_sat) {
320      dst[0]->flags |= IR3_INSTR_SAT;
321   }
322}
323
324static void
325emit_alu_dot_4x8_as_dp2acc(struct ir3_context *ctx, nir_alu_instr *alu,
326                           struct ir3_instruction **dst,
327                           struct ir3_instruction **src)
328{
329   int signedness;
330   if (alu->op == nir_op_udot_4x8_uadd ||
331       alu->op == nir_op_udot_4x8_uadd_sat) {
332      signedness = IR3_SRC_UNSIGNED;
333   } else {
334      signedness = IR3_SRC_MIXED;
335   }
336
337   struct ir3_instruction *accumulator = NULL;
338   if (alu->op == nir_op_udot_4x8_uadd_sat ||
339       alu->op == nir_op_sudot_4x8_iadd_sat) {
340      accumulator = create_immed(ctx->block, 0);
341   } else {
342      accumulator = src[2];
343   }
344
345   dst[0] = ir3_DP2ACC(ctx->block, src[0], 0, src[1], 0, accumulator, 0);
346   dst[0]->cat3.packed = IR3_SRC_PACKED_LOW;
347   dst[0]->cat3.signedness = signedness;
348
349   dst[0] = ir3_DP2ACC(ctx->block, src[0], 0, src[1], 0, dst[0], 0);
350   dst[0]->cat3.packed = IR3_SRC_PACKED_HIGH;
351   dst[0]->cat3.signedness = signedness;
352
353   if (alu->op == nir_op_udot_4x8_uadd_sat) {
354      dst[0] = ir3_ADD_U(ctx->block, dst[0], 0, src[2], 0);
355      dst[0]->flags |= IR3_INSTR_SAT;
356   } else if (alu->op == nir_op_sudot_4x8_iadd_sat) {
357      dst[0] = ir3_ADD_S(ctx->block, dst[0], 0, src[2], 0);
358      dst[0]->flags |= IR3_INSTR_SAT;
359   }
360}
361
362static void
363emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
364{
365   const nir_op_info *info = &nir_op_infos[alu->op];
366   struct ir3_instruction **dst, *src[info->num_inputs];
367   unsigned bs[info->num_inputs]; /* bit size */
368   struct ir3_block *b = ctx->block;
369   unsigned dst_sz, wrmask;
370   type_t dst_type = type_uint_size(nir_dest_bit_size(alu->dest.dest));
371
372   if (alu->dest.dest.is_ssa) {
373      dst_sz = alu->dest.dest.ssa.num_components;
374      wrmask = (1 << dst_sz) - 1;
375   } else {
376      dst_sz = alu->dest.dest.reg.reg->num_components;
377      wrmask = alu->dest.write_mask;
378   }
379
380   dst = ir3_get_dst(ctx, &alu->dest.dest, dst_sz);
381
382   /* Vectors are special in that they have non-scalarized writemasks,
383    * and just take the first swizzle channel for each argument in
384    * order into each writemask channel.
385    */
386   if ((alu->op == nir_op_vec2) || (alu->op == nir_op_vec3) ||
387       (alu->op == nir_op_vec4) || (alu->op == nir_op_vec8) ||
388       (alu->op == nir_op_vec16)) {
389
390      for (int i = 0; i < info->num_inputs; i++) {
391         nir_alu_src *asrc = &alu->src[i];
392
393         compile_assert(ctx, !asrc->abs);
394         compile_assert(ctx, !asrc->negate);
395
396         src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[0]];
397         if (!src[i])
398            src[i] = create_immed_typed(ctx->block, 0, dst_type);
399         dst[i] = ir3_MOV(b, src[i], dst_type);
400      }
401
402      ir3_put_dst(ctx, &alu->dest.dest);
403      return;
404   }
405
406   /* We also get mov's with more than one component for mov's so
407    * handle those specially:
408    */
409   if (alu->op == nir_op_mov) {
410      nir_alu_src *asrc = &alu->src[0];
411      struct ir3_instruction *const *src0 = ir3_get_src(ctx, &asrc->src);
412
413      for (unsigned i = 0; i < dst_sz; i++) {
414         if (wrmask & (1 << i)) {
415            dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], dst_type);
416         } else {
417            dst[i] = NULL;
418         }
419      }
420
421      ir3_put_dst(ctx, &alu->dest.dest);
422      return;
423   }
424
425   /* General case: We can just grab the one used channel per src. */
426   for (int i = 0; i < info->num_inputs; i++) {
427      unsigned chan = ffs(alu->dest.write_mask) - 1;
428      nir_alu_src *asrc = &alu->src[i];
429
430      compile_assert(ctx, !asrc->abs);
431      compile_assert(ctx, !asrc->negate);
432
433      src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
434      bs[i] = nir_src_bit_size(asrc->src);
435
436      compile_assert(ctx, src[i]);
437   }
438
439   switch (alu->op) {
440   case nir_op_f2f32:
441   case nir_op_f2f16_rtne:
442   case nir_op_f2f16_rtz:
443   case nir_op_f2f16:
444   case nir_op_f2i32:
445   case nir_op_f2i16:
446   case nir_op_f2i8:
447   case nir_op_f2u32:
448   case nir_op_f2u16:
449   case nir_op_f2u8:
450   case nir_op_i2f32:
451   case nir_op_i2f16:
452   case nir_op_i2i32:
453   case nir_op_i2i16:
454   case nir_op_i2i8:
455   case nir_op_u2f32:
456   case nir_op_u2f16:
457   case nir_op_u2u32:
458   case nir_op_u2u16:
459   case nir_op_u2u8:
460   case nir_op_b2f16:
461   case nir_op_b2f32:
462   case nir_op_b2i8:
463   case nir_op_b2i16:
464   case nir_op_b2i32:
465      dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
466      break;
467
468   case nir_op_fquantize2f16:
469      dst[0] = create_cov(ctx, create_cov(ctx, src[0], 32, nir_op_f2f16_rtne),
470                          16, nir_op_f2f32);
471      break;
472   case nir_op_f2b1:
473      dst[0] = ir3_CMPS_F(
474         b, src[0], 0,
475         create_immed_typed(b, 0, type_float_size(bs[0])), 0);
476      dst[0]->cat2.condition = IR3_COND_NE;
477      break;
478
479   case nir_op_i2b1:
480      /* i2b1 will appear when translating from nir_load_ubo or
481       * nir_intrinsic_load_ssbo, where any non-zero value is true.
482       */
483      dst[0] = ir3_CMPS_S(
484         b, src[0], 0,
485         create_immed_typed(b, 0, type_uint_size(bs[0])), 0);
486      dst[0]->cat2.condition = IR3_COND_NE;
487      break;
488
489   case nir_op_b2b1:
490      /* b2b1 will appear when translating from
491       *
492       * - nir_intrinsic_load_shared of a 32-bit 0/~0 value.
493       * - nir_intrinsic_load_constant of a 32-bit 0/~0 value
494       *
495       * A negate can turn those into a 1 or 0 for us.
496       */
497      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
498      break;
499
500   case nir_op_b2b32:
501      /* b2b32 will appear when converting our 1-bit bools to a store_shared
502       * argument.
503       *
504       * A negate can turn those into a ~0 for us.
505       */
506      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
507      break;
508
509   case nir_op_fneg:
510      dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
511      break;
512   case nir_op_fabs:
513      dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
514      break;
515   case nir_op_fmax:
516      dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
517      break;
518   case nir_op_fmin:
519      dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
520      break;
521   case nir_op_fsat:
522      /* if there is just a single use of the src, and it supports
523       * (sat) bit, we can just fold the (sat) flag back to the
524       * src instruction and create a mov.  This is easier for cp
525       * to eliminate.
526       */
527      if (alu->src[0].src.is_ssa && is_sat_compatible(src[0]->opc) &&
528          (list_length(&alu->src[0].src.ssa->uses) == 1)) {
529         src[0]->flags |= IR3_INSTR_SAT;
530         dst[0] = ir3_MOV(b, src[0], dst_type);
531      } else {
532         /* otherwise generate a max.f that saturates.. blob does
533          * similar (generating a cat2 mov using max.f)
534          */
535         dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0);
536         dst[0]->flags |= IR3_INSTR_SAT;
537      }
538      break;
539   case nir_op_fmul:
540      dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
541      break;
542   case nir_op_fadd:
543      dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
544      break;
545   case nir_op_fsub:
546      dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
547      break;
548   case nir_op_ffma:
549      dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
550      break;
551   case nir_op_fddx:
552   case nir_op_fddx_coarse:
553      dst[0] = ir3_DSX(b, src[0], 0);
554      dst[0]->cat5.type = TYPE_F32;
555      break;
556   case nir_op_fddx_fine:
557      dst[0] = ir3_DSXPP_MACRO(b, src[0], 0);
558      dst[0]->cat5.type = TYPE_F32;
559      break;
560   case nir_op_fddy:
561   case nir_op_fddy_coarse:
562      dst[0] = ir3_DSY(b, src[0], 0);
563      dst[0]->cat5.type = TYPE_F32;
564      break;
565      break;
566   case nir_op_fddy_fine:
567      dst[0] = ir3_DSYPP_MACRO(b, src[0], 0);
568      dst[0]->cat5.type = TYPE_F32;
569      break;
570   case nir_op_flt:
571      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
572      dst[0]->cat2.condition = IR3_COND_LT;
573      break;
574   case nir_op_fge:
575      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
576      dst[0]->cat2.condition = IR3_COND_GE;
577      break;
578   case nir_op_feq:
579      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
580      dst[0]->cat2.condition = IR3_COND_EQ;
581      break;
582   case nir_op_fneu:
583      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
584      dst[0]->cat2.condition = IR3_COND_NE;
585      break;
586   case nir_op_fceil:
587      dst[0] = ir3_CEIL_F(b, src[0], 0);
588      break;
589   case nir_op_ffloor:
590      dst[0] = ir3_FLOOR_F(b, src[0], 0);
591      break;
592   case nir_op_ftrunc:
593      dst[0] = ir3_TRUNC_F(b, src[0], 0);
594      break;
595   case nir_op_fround_even:
596      dst[0] = ir3_RNDNE_F(b, src[0], 0);
597      break;
598   case nir_op_fsign:
599      dst[0] = ir3_SIGN_F(b, src[0], 0);
600      break;
601
602   case nir_op_fsin:
603      dst[0] = ir3_SIN(b, src[0], 0);
604      break;
605   case nir_op_fcos:
606      dst[0] = ir3_COS(b, src[0], 0);
607      break;
608   case nir_op_frsq:
609      dst[0] = ir3_RSQ(b, src[0], 0);
610      break;
611   case nir_op_frcp:
612      dst[0] = ir3_RCP(b, src[0], 0);
613      break;
614   case nir_op_flog2:
615      dst[0] = ir3_LOG2(b, src[0], 0);
616      break;
617   case nir_op_fexp2:
618      dst[0] = ir3_EXP2(b, src[0], 0);
619      break;
620   case nir_op_fsqrt:
621      dst[0] = ir3_SQRT(b, src[0], 0);
622      break;
623
624   case nir_op_iabs:
625      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
626      break;
627   case nir_op_iadd:
628      dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
629      break;
630   case nir_op_ihadd:
631      dst[0] = ir3_ADD_S(b, src[0], 0, src[1], 0);
632      dst[0]->dsts[0]->flags |= IR3_REG_EI;
633      break;
634   case nir_op_uhadd:
635      dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
636      dst[0]->dsts[0]->flags |= IR3_REG_EI;
637      break;
638   case nir_op_iand:
639      dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
640      break;
641   case nir_op_imax:
642      dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
643      break;
644   case nir_op_umax:
645      dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
646      break;
647   case nir_op_imin:
648      dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
649      break;
650   case nir_op_umin:
651      dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
652      break;
653   case nir_op_umul_low:
654      dst[0] = ir3_MULL_U(b, src[0], 0, src[1], 0);
655      break;
656   case nir_op_imadsh_mix16:
657      dst[0] = ir3_MADSH_M16(b, src[0], 0, src[1], 0, src[2], 0);
658      break;
659   case nir_op_imad24_ir3:
660      dst[0] = ir3_MAD_S24(b, src[0], 0, src[1], 0, src[2], 0);
661      break;
662   case nir_op_imul:
663      compile_assert(ctx, nir_dest_bit_size(alu->dest.dest) == 16);
664      dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
665      break;
666   case nir_op_imul24:
667      dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
668      break;
669   case nir_op_ineg:
670      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
671      break;
672   case nir_op_inot:
673      if (bs[0] == 1) {
674         struct ir3_instruction *one =
675               create_immed_typed(ctx->block, 1, ctx->compiler->bool_type);
676         dst[0] = ir3_SUB_U(b, one, 0, src[0], 0);
677      } else {
678         dst[0] = ir3_NOT_B(b, src[0], 0);
679      }
680      break;
681   case nir_op_ior:
682      dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
683      break;
684   case nir_op_ishl:
685      dst[0] =
686         ir3_SHL_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
687      break;
688   case nir_op_ishr:
689      dst[0] =
690         ir3_ASHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
691      break;
692   case nir_op_isub:
693      dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
694      break;
695   case nir_op_ixor:
696      dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
697      break;
698   case nir_op_ushr:
699      dst[0] =
700         ir3_SHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
701      break;
702   case nir_op_ilt:
703      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
704      dst[0]->cat2.condition = IR3_COND_LT;
705      break;
706   case nir_op_ige:
707      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
708      dst[0]->cat2.condition = IR3_COND_GE;
709      break;
710   case nir_op_ieq:
711      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
712      dst[0]->cat2.condition = IR3_COND_EQ;
713      break;
714   case nir_op_ine:
715      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
716      dst[0]->cat2.condition = IR3_COND_NE;
717      break;
718   case nir_op_ult:
719      dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
720      dst[0]->cat2.condition = IR3_COND_LT;
721      break;
722   case nir_op_uge:
723      dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
724      dst[0]->cat2.condition = IR3_COND_GE;
725      break;
726
727   case nir_op_bcsel: {
728      struct ir3_instruction *cond = src[0];
729
730      /* If src[0] is a negation (likely as a result of an ir3_b2n(cond)),
731       * we can ignore that and use original cond, since the nonzero-ness of
732       * cond stays the same.
733       */
734      if (cond->opc == OPC_ABSNEG_S && cond->flags == 0 &&
735          (cond->srcs[0]->flags & (IR3_REG_SNEG | IR3_REG_SABS)) ==
736             IR3_REG_SNEG) {
737         cond = cond->srcs[0]->def->instr;
738      }
739
740      compile_assert(ctx, bs[1] == bs[2]);
741
742      /* The condition's size has to match the other two arguments' size, so
743       * convert down if necessary.
744       *
745       * Single hashtable is fine, because the conversion will either be
746       * 16->32 or 32->16, but never both
747       */
748      if (is_half(src[1]) != is_half(cond)) {
749         struct hash_entry *prev_entry =
750            _mesa_hash_table_search(ctx->sel_cond_conversions, src[0]);
751         if (prev_entry) {
752            cond = prev_entry->data;
753         } else {
754            if (is_half(cond)) {
755               cond = ir3_COV(b, cond, TYPE_U16, TYPE_U32);
756            } else {
757               cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
758            }
759            _mesa_hash_table_insert(ctx->sel_cond_conversions, src[0], cond);
760         }
761      }
762
763      if (is_half(src[1])) {
764         dst[0] = ir3_SEL_B16(b, src[1], 0, cond, 0, src[2], 0);
765      } else {
766         dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
767      }
768
769      break;
770   }
771   case nir_op_bit_count: {
772      if (ctx->compiler->gen < 5 || (src[0]->dsts[0]->flags & IR3_REG_HALF)) {
773         dst[0] = ir3_CBITS_B(b, src[0], 0);
774         break;
775      }
776
777      // We need to do this 16b at a time on a5xx+a6xx.  Once half-precision
778      // support is in place, this should probably move to a NIR lowering pass:
779      struct ir3_instruction *hi, *lo;
780
781      hi = ir3_COV(b, ir3_SHR_B(b, src[0], 0, create_immed(b, 16), 0), TYPE_U32,
782                   TYPE_U16);
783      lo = ir3_COV(b, src[0], TYPE_U32, TYPE_U16);
784
785      hi = ir3_CBITS_B(b, hi, 0);
786      lo = ir3_CBITS_B(b, lo, 0);
787
788      // TODO maybe the builders should default to making dst half-precision
789      // if the src's were half precision, to make this less awkward.. otoh
790      // we should probably just do this lowering in NIR.
791      hi->dsts[0]->flags |= IR3_REG_HALF;
792      lo->dsts[0]->flags |= IR3_REG_HALF;
793
794      dst[0] = ir3_ADD_S(b, hi, 0, lo, 0);
795      dst[0]->dsts[0]->flags |= IR3_REG_HALF;
796      dst[0] = ir3_COV(b, dst[0], TYPE_U16, TYPE_U32);
797      break;
798   }
799   case nir_op_ifind_msb: {
800      struct ir3_instruction *cmp;
801      dst[0] = ir3_CLZ_S(b, src[0], 0);
802      cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
803      cmp->cat2.condition = IR3_COND_GE;
804      dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0),
805                           0, cmp, 0, dst[0], 0);
806      break;
807   }
808   case nir_op_ufind_msb:
809      dst[0] = ir3_CLZ_B(b, src[0], 0);
810      dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0),
811                           0, src[0], 0, dst[0], 0);
812      break;
813   case nir_op_find_lsb:
814      dst[0] = ir3_BFREV_B(b, src[0], 0);
815      dst[0] = ir3_CLZ_B(b, dst[0], 0);
816      break;
817   case nir_op_bitfield_reverse:
818      dst[0] = ir3_BFREV_B(b, src[0], 0);
819      break;
820
821   case nir_op_uadd_sat:
822      dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
823      dst[0]->flags |= IR3_INSTR_SAT;
824      break;
825   case nir_op_iadd_sat:
826      dst[0] = ir3_ADD_S(b, src[0], 0, src[1], 0);
827      dst[0]->flags |= IR3_INSTR_SAT;
828      break;
829   case nir_op_usub_sat:
830      dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
831      dst[0]->flags |= IR3_INSTR_SAT;
832      break;
833   case nir_op_isub_sat:
834      dst[0] = ir3_SUB_S(b, src[0], 0, src[1], 0);
835      dst[0]->flags |= IR3_INSTR_SAT;
836      break;
837
838   case nir_op_udot_4x8_uadd:
839   case nir_op_udot_4x8_uadd_sat:
840   case nir_op_sudot_4x8_iadd:
841   case nir_op_sudot_4x8_iadd_sat: {
842      if (ctx->compiler->has_dp4acc) {
843         emit_alu_dot_4x8_as_dp4acc(ctx, alu, dst, src);
844      } else if (ctx->compiler->has_dp2acc) {
845         emit_alu_dot_4x8_as_dp2acc(ctx, alu, dst, src);
846      } else {
847         ir3_context_error(ctx, "ALU op should have been lowered: %s\n",
848                           nir_op_infos[alu->op].name);
849      }
850
851      break;
852   }
853
854   default:
855      ir3_context_error(ctx, "Unhandled ALU op: %s\n",
856                        nir_op_infos[alu->op].name);
857      break;
858   }
859
860   if (nir_alu_type_get_base_type(info->output_type) == nir_type_bool) {
861      assert(nir_dest_bit_size(alu->dest.dest) == 1 || alu->op == nir_op_b2b32);
862      assert(dst_sz == 1);
863   } else {
864      /* 1-bit values stored in 32-bit registers are only valid for certain
865       * ALU ops.
866       */
867      switch (alu->op) {
868      case nir_op_iand:
869      case nir_op_ior:
870      case nir_op_ixor:
871      case nir_op_inot:
872      case nir_op_bcsel:
873         break;
874      default:
875         compile_assert(ctx, nir_dest_bit_size(alu->dest.dest) != 1);
876      }
877   }
878
879   ir3_put_dst(ctx, &alu->dest.dest);
880}
881
882static void
883emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr,
884                            struct ir3_instruction **dst)
885{
886   struct ir3_block *b = ctx->block;
887
888   /* This is only generated for us by nir_lower_ubo_vec4, which leaves base =
889    * 0.
890    */
891   assert(nir_intrinsic_base(intr) == 0);
892
893   unsigned ncomp = intr->num_components;
894   struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
895   struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
896   struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
897   ldc->dsts[0]->wrmask = MASK(ncomp);
898   ldc->cat6.iim_val = ncomp;
899   ldc->cat6.d = nir_intrinsic_component(intr);
900   ldc->cat6.type = TYPE_U32;
901
902   ir3_handle_bindless_cat6(ldc, intr->src[0]);
903   if (ldc->flags & IR3_INSTR_B)
904      ctx->so->bindless_ubo = true;
905   ir3_handle_nonuniform(ldc, intr);
906
907   ir3_split_dest(b, dst, ldc, 0, ncomp);
908}
909
910static void
911emit_intrinsic_copy_ubo_to_uniform(struct ir3_context *ctx,
912                                   nir_intrinsic_instr *intr)
913{
914   struct ir3_block *b = ctx->block;
915
916   unsigned base = nir_intrinsic_base(intr);
917   unsigned size = nir_intrinsic_range(intr);
918
919   struct ir3_instruction *addr1 = ir3_get_addr1(ctx, base);
920
921   struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
922   struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
923   struct ir3_instruction *ldc = ir3_LDC_K(b, idx, 0, offset, 0);
924   ldc->cat6.iim_val = size;
925   ldc->barrier_class = ldc->barrier_conflict = IR3_BARRIER_CONST_W;
926
927   ir3_handle_bindless_cat6(ldc, intr->src[0]);
928   if (ldc->flags & IR3_INSTR_B)
929      ctx->so->bindless_ubo = true;
930
931   ir3_instr_set_address(ldc, addr1);
932
933   array_insert(b, b->keeps, ldc);
934}
935
936/* handles direct/indirect UBO reads: */
937static void
938emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
939                        struct ir3_instruction **dst)
940{
941   struct ir3_block *b = ctx->block;
942   struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
943   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
944   unsigned ubo = regid(const_state->offsets.ubo, 0);
945   const unsigned ptrsz = ir3_pointer_size(ctx->compiler);
946
947   int off = 0;
948
949   /* First src is ubo index, which could either be an immed or not: */
950   src0 = ir3_get_src(ctx, &intr->src[0])[0];
951   if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) {
952      base_lo = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz));
953      base_hi = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz) + 1);
954   } else {
955      base_lo = create_uniform_indirect(b, ubo, TYPE_U32,
956                                        ir3_get_addr0(ctx, src0, ptrsz));
957      base_hi = create_uniform_indirect(b, ubo + 1, TYPE_U32,
958                                        ir3_get_addr0(ctx, src0, ptrsz));
959
960      /* NOTE: since relative addressing is used, make sure constlen is
961       * at least big enough to cover all the UBO addresses, since the
962       * assembler won't know what the max address reg is.
963       */
964      ctx->so->constlen =
965         MAX2(ctx->so->constlen,
966              const_state->offsets.ubo + (ctx->s->info.num_ubos * ptrsz));
967   }
968
969   /* note: on 32bit gpu's base_hi is ignored and DCE'd */
970   addr = base_lo;
971
972   if (nir_src_is_const(intr->src[1])) {
973      off += nir_src_as_uint(intr->src[1]);
974   } else {
975      /* For load_ubo_indirect, second src is indirect offset: */
976      src1 = ir3_get_src(ctx, &intr->src[1])[0];
977
978      /* and add offset to addr: */
979      addr = ir3_ADD_S(b, addr, 0, src1, 0);
980   }
981
982   /* if offset is to large to encode in the ldg, split it out: */
983   if ((off + (intr->num_components * 4)) > 1024) {
984      /* split out the minimal amount to improve the odds that
985       * cp can fit the immediate in the add.s instruction:
986       */
987      unsigned off2 = off + (intr->num_components * 4) - 1024;
988      addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
989      off -= off2;
990   }
991
992   if (ptrsz == 2) {
993      struct ir3_instruction *carry;
994
995      /* handle 32b rollover, ie:
996       *   if (addr < base_lo)
997       *      base_hi++
998       */
999      carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
1000      carry->cat2.condition = IR3_COND_LT;
1001      base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);
1002
1003      addr = ir3_collect(b, addr, base_hi);
1004   }
1005
1006   for (int i = 0; i < intr->num_components; i++) {
1007      struct ir3_instruction *load =
1008         ir3_LDG(b, addr, 0, create_immed(b, off + i * 4), 0,
1009                 create_immed(b, 1), 0); /* num components */
1010      load->cat6.type = TYPE_U32;
1011      dst[i] = load;
1012   }
1013}
1014
1015/* Load a kernel param: src[] = { address }. */
1016static void
1017emit_intrinsic_load_kernel_input(struct ir3_context *ctx,
1018                                 nir_intrinsic_instr *intr,
1019                                 struct ir3_instruction **dst)
1020{
1021   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
1022   struct ir3_block *b = ctx->block;
1023   unsigned offset = nir_intrinsic_base(intr);
1024   unsigned p = regid(const_state->offsets.kernel_params, 0);
1025
1026   struct ir3_instruction *src0 = ir3_get_src(ctx, &intr->src[0])[0];
1027
1028   if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) {
1029      offset += src0->srcs[0]->iim_val;
1030
1031      /* kernel param position is in bytes, but constant space is 32b registers: */
1032      compile_assert(ctx, !(offset & 0x3));
1033
1034      dst[0] = create_uniform(b, p + (offset / 4));
1035   } else {
1036      /* kernel param position is in bytes, but constant space is 32b registers: */
1037      compile_assert(ctx, !(offset & 0x3));
1038
1039      /* TODO we should probably be lowering this in nir, and also handling
1040       * non-32b inputs.. Also we probably don't want to be using
1041       * SP_MODE_CONTROL.CONSTANT_DEMOTION_ENABLE for KERNEL shaders..
1042       */
1043      src0 = ir3_SHR_B(b, src0, 0, create_immed(b, 2), 0);
1044
1045      dst[0] = create_uniform_indirect(b, offset / 4, TYPE_U32,
1046                                       ir3_get_addr0(ctx, src0, 1));
1047   }
1048}
1049
1050/* src[] = { block_index } */
1051static void
1052emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1053                         struct ir3_instruction **dst)
1054{
1055   struct ir3_block *b = ctx->block;
1056   struct ir3_instruction *ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
1057   struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);
1058   resinfo->cat6.iim_val = 1;
1059   resinfo->cat6.d = ctx->compiler->gen >= 6 ? 1 : 2;
1060   resinfo->cat6.type = TYPE_U32;
1061   resinfo->cat6.typed = false;
1062   /* resinfo has no writemask and always writes out 3 components */
1063   resinfo->dsts[0]->wrmask = MASK(3);
1064   ir3_handle_bindless_cat6(resinfo, intr->src[0]);
1065   ir3_handle_nonuniform(resinfo, intr);
1066
1067   if (ctx->compiler->gen >= 6) {
1068      ir3_split_dest(b, dst, resinfo, 0, 1);
1069   } else {
1070      /* On a5xx, resinfo returns the low 16 bits of ssbo size in .x and the high 16 bits in .y */
1071      struct ir3_instruction *resinfo_dst[2];
1072      ir3_split_dest(b, resinfo_dst, resinfo, 0, 2);
1073      *dst = ir3_ADD_U(b, ir3_SHL_B(b, resinfo_dst[1], 0, create_immed(b, 16), 0), 0, resinfo_dst[0], 0);
1074   }
1075}
1076
1077/* src[] = { offset }. const_index[] = { base } */
1078static void
1079emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1080                           struct ir3_instruction **dst)
1081{
1082   struct ir3_block *b = ctx->block;
1083   struct ir3_instruction *ldl, *offset;
1084   unsigned base;
1085
1086   offset = ir3_get_src(ctx, &intr->src[0])[0];
1087   base = nir_intrinsic_base(intr);
1088
1089   ldl = ir3_LDL(b, offset, 0, create_immed(b, base), 0,
1090                 create_immed(b, intr->num_components), 0);
1091
1092   ldl->cat6.type = utype_dst(intr->dest);
1093   ldl->dsts[0]->wrmask = MASK(intr->num_components);
1094
1095   ldl->barrier_class = IR3_BARRIER_SHARED_R;
1096   ldl->barrier_conflict = IR3_BARRIER_SHARED_W;
1097
1098   ir3_split_dest(b, dst, ldl, 0, intr->num_components);
1099}
1100
1101/* src[] = { value, offset }. const_index[] = { base, write_mask } */
1102static void
1103emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1104{
1105   struct ir3_block *b = ctx->block;
1106   struct ir3_instruction *stl, *offset;
1107   struct ir3_instruction *const *value;
1108   unsigned base, wrmask, ncomp;
1109
1110   value = ir3_get_src(ctx, &intr->src[0]);
1111   offset = ir3_get_src(ctx, &intr->src[1])[0];
1112
1113   base = nir_intrinsic_base(intr);
1114   wrmask = nir_intrinsic_write_mask(intr);
1115   ncomp = ffs(~wrmask) - 1;
1116
1117   assert(wrmask == BITFIELD_MASK(intr->num_components));
1118
1119   stl = ir3_STL(b, offset, 0, ir3_create_collect(b, value, ncomp), 0,
1120                 create_immed(b, ncomp), 0);
1121   stl->cat6.dst_offset = base;
1122   stl->cat6.type = utype_src(intr->src[0]);
1123   stl->barrier_class = IR3_BARRIER_SHARED_W;
1124   stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1125
1126   array_insert(b, b->keeps, stl);
1127}
1128
1129/* src[] = { offset }. const_index[] = { base } */
1130static void
1131emit_intrinsic_load_shared_ir3(struct ir3_context *ctx,
1132                               nir_intrinsic_instr *intr,
1133                               struct ir3_instruction **dst)
1134{
1135   struct ir3_block *b = ctx->block;
1136   struct ir3_instruction *load, *offset;
1137   unsigned base;
1138
1139   offset = ir3_get_src(ctx, &intr->src[0])[0];
1140   base = nir_intrinsic_base(intr);
1141
1142   load = ir3_LDLW(b, offset, 0, create_immed(b, base), 0,
1143                   create_immed(b, intr->num_components), 0);
1144
1145   /* for a650, use LDL for tess ctrl inputs: */
1146   if (ctx->so->type == MESA_SHADER_TESS_CTRL && ctx->compiler->tess_use_shared)
1147      load->opc = OPC_LDL;
1148
1149   load->cat6.type = utype_dst(intr->dest);
1150   load->dsts[0]->wrmask = MASK(intr->num_components);
1151
1152   load->barrier_class = IR3_BARRIER_SHARED_R;
1153   load->barrier_conflict = IR3_BARRIER_SHARED_W;
1154
1155   ir3_split_dest(b, dst, load, 0, intr->num_components);
1156}
1157
1158/* src[] = { value, offset }. const_index[] = { base } */
1159static void
1160emit_intrinsic_store_shared_ir3(struct ir3_context *ctx,
1161                                nir_intrinsic_instr *intr)
1162{
1163   struct ir3_block *b = ctx->block;
1164   struct ir3_instruction *store, *offset;
1165   struct ir3_instruction *const *value;
1166
1167   value = ir3_get_src(ctx, &intr->src[0]);
1168   offset = ir3_get_src(ctx, &intr->src[1])[0];
1169
1170   store = ir3_STLW(b, offset, 0,
1171                    ir3_create_collect(b, value, intr->num_components), 0,
1172                    create_immed(b, intr->num_components), 0);
1173
1174   /* for a650, use STL for vertex outputs used by tess ctrl shader: */
1175   if (ctx->so->type == MESA_SHADER_VERTEX && ctx->so->key.tessellation &&
1176       ctx->compiler->tess_use_shared)
1177      store->opc = OPC_STL;
1178
1179   store->cat6.dst_offset = nir_intrinsic_base(intr);
1180   store->cat6.type = utype_src(intr->src[0]);
1181   store->barrier_class = IR3_BARRIER_SHARED_W;
1182   store->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1183
1184   array_insert(b, b->keeps, store);
1185}
1186
1187/*
1188 * CS shared variable atomic intrinsics
1189 *
1190 * All of the shared variable atomic memory operations read a value from
1191 * memory, compute a new value using one of the operations below, write the
1192 * new value to memory, and return the original value read.
1193 *
1194 * All operations take 2 sources except CompSwap that takes 3. These
1195 * sources represent:
1196 *
1197 * 0: The offset into the shared variable storage region that the atomic
1198 *    operation will operate on.
1199 * 1: The data parameter to the atomic function (i.e. the value to add
1200 *    in shared_atomic_add, etc).
1201 * 2: For CompSwap only: the second data parameter.
1202 */
1203static struct ir3_instruction *
1204emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1205{
1206   struct ir3_block *b = ctx->block;
1207   struct ir3_instruction *atomic, *src0, *src1;
1208   type_t type = TYPE_U32;
1209
1210   src0 = ir3_get_src(ctx, &intr->src[0])[0]; /* offset */
1211   src1 = ir3_get_src(ctx, &intr->src[1])[0]; /* value */
1212
1213   switch (intr->intrinsic) {
1214   case nir_intrinsic_shared_atomic_add:
1215      atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
1216      break;
1217   case nir_intrinsic_shared_atomic_imin:
1218      atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
1219      type = TYPE_S32;
1220      break;
1221   case nir_intrinsic_shared_atomic_umin:
1222      atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
1223      break;
1224   case nir_intrinsic_shared_atomic_imax:
1225      atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
1226      type = TYPE_S32;
1227      break;
1228   case nir_intrinsic_shared_atomic_umax:
1229      atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
1230      break;
1231   case nir_intrinsic_shared_atomic_and:
1232      atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
1233      break;
1234   case nir_intrinsic_shared_atomic_or:
1235      atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
1236      break;
1237   case nir_intrinsic_shared_atomic_xor:
1238      atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
1239      break;
1240   case nir_intrinsic_shared_atomic_exchange:
1241      atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
1242      break;
1243   case nir_intrinsic_shared_atomic_comp_swap:
1244      /* for cmpxchg, src1 is [ui]vec2(data, compare): */
1245      src1 = ir3_collect(b, ir3_get_src(ctx, &intr->src[2])[0], src1);
1246      atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
1247      break;
1248   default:
1249      unreachable("boo");
1250   }
1251
1252   atomic->cat6.iim_val = 1;
1253   atomic->cat6.d = 1;
1254   atomic->cat6.type = type;
1255   atomic->barrier_class = IR3_BARRIER_SHARED_W;
1256   atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1257
1258   /* even if nothing consume the result, we can't DCE the instruction: */
1259   array_insert(b, b->keeps, atomic);
1260
1261   return atomic;
1262}
1263
1264static void
1265stp_ldp_offset(struct ir3_context *ctx, nir_src *src,
1266               struct ir3_instruction **offset, int32_t *base)
1267{
1268   struct ir3_block *b = ctx->block;
1269
1270   if (nir_src_is_const(*src)) {
1271      unsigned src_offset = nir_src_as_uint(*src);
1272      /* The base offset field is only 13 bits, and it's signed. Try to make the
1273       * offset constant whenever the original offsets are similar, to avoid
1274       * creating too many constants in the final shader.
1275       */
1276      *base = ((int32_t) src_offset << (32 - 13)) >> (32 - 13);
1277      uint32_t offset_val = src_offset - *base;
1278      *offset = create_immed(b, offset_val);
1279   } else {
1280      /* TODO: match on nir_iadd with a constant that fits */
1281      *base = 0;
1282      *offset = ir3_get_src(ctx, src)[0];
1283   }
1284}
1285
1286/* src[] = { offset }. */
1287static void
1288emit_intrinsic_load_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1289                            struct ir3_instruction **dst)
1290{
1291   struct ir3_block *b = ctx->block;
1292   struct ir3_instruction *ldp, *offset;
1293   int32_t base;
1294
1295   stp_ldp_offset(ctx, &intr->src[0], &offset, &base);
1296
1297   ldp = ir3_LDP(b, offset, 0, create_immed(b, base), 0,
1298                 create_immed(b, intr->num_components), 0);
1299
1300   ldp->cat6.type = utype_dst(intr->dest);
1301   ldp->dsts[0]->wrmask = MASK(intr->num_components);
1302
1303   ldp->barrier_class = IR3_BARRIER_PRIVATE_R;
1304   ldp->barrier_conflict = IR3_BARRIER_PRIVATE_W;
1305
1306   ir3_split_dest(b, dst, ldp, 0, intr->num_components);
1307}
1308
1309/* src[] = { value, offset }. const_index[] = { write_mask } */
1310static void
1311emit_intrinsic_store_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1312{
1313   struct ir3_block *b = ctx->block;
1314   struct ir3_instruction *stp, *offset;
1315   struct ir3_instruction *const *value;
1316   unsigned wrmask, ncomp;
1317   int32_t base;
1318
1319   value = ir3_get_src(ctx, &intr->src[0]);
1320
1321   stp_ldp_offset(ctx, &intr->src[1], &offset, &base);
1322
1323   wrmask = nir_intrinsic_write_mask(intr);
1324   ncomp = ffs(~wrmask) - 1;
1325
1326   assert(wrmask == BITFIELD_MASK(intr->num_components));
1327
1328   stp = ir3_STP(b, offset, 0, ir3_create_collect(b, value, ncomp), 0,
1329                 create_immed(b, ncomp), 0);
1330   stp->cat6.dst_offset = base;
1331   stp->cat6.type = utype_src(intr->src[0]);
1332   stp->barrier_class = IR3_BARRIER_PRIVATE_W;
1333   stp->barrier_conflict = IR3_BARRIER_PRIVATE_R | IR3_BARRIER_PRIVATE_W;
1334
1335   array_insert(b, b->keeps, stp);
1336}
1337
1338struct tex_src_info {
1339   /* For prefetch */
1340   unsigned tex_base, samp_base, tex_idx, samp_idx;
1341   /* For normal tex instructions */
1342   unsigned base, a1_val, flags;
1343   struct ir3_instruction *samp_tex;
1344};
1345
1346/* TODO handle actual indirect/dynamic case.. which is going to be weird
1347 * to handle with the image_mapping table..
1348 */
1349static struct tex_src_info
1350get_image_ssbo_samp_tex_src(struct ir3_context *ctx, nir_src *src)
1351{
1352   struct ir3_block *b = ctx->block;
1353   struct tex_src_info info = {0};
1354   nir_intrinsic_instr *bindless_tex = ir3_bindless_resource(*src);
1355
1356   if (bindless_tex) {
1357      /* Bindless case */
1358      ctx->so->bindless_tex = true;
1359      info.flags |= IR3_INSTR_B;
1360
1361      /* Gather information required to determine which encoding to
1362       * choose as well as for prefetch.
1363       */
1364      info.tex_base = nir_intrinsic_desc_set(bindless_tex);
1365      bool tex_const = nir_src_is_const(bindless_tex->src[0]);
1366      if (tex_const)
1367         info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
1368      info.samp_idx = 0;
1369
1370      /* Choose encoding. */
1371      if (tex_const && info.tex_idx < 256) {
1372         if (info.tex_idx < 16) {
1373            /* Everything fits within the instruction */
1374            info.base = info.tex_base;
1375         } else {
1376            info.base = info.tex_base;
1377            info.a1_val = info.tex_idx << 3;
1378            info.flags |= IR3_INSTR_A1EN;
1379         }
1380         info.samp_tex = NULL;
1381      } else {
1382         info.flags |= IR3_INSTR_S2EN;
1383         info.base = info.tex_base;
1384
1385         /* Note: the indirect source is now a vec2 instead of hvec2 */
1386         struct ir3_instruction *texture, *sampler;
1387
1388         texture = ir3_get_src(ctx, src)[0];
1389         sampler = create_immed(b, 0);
1390         info.samp_tex = ir3_collect(b, texture, sampler);
1391      }
1392   } else {
1393      info.flags |= IR3_INSTR_S2EN;
1394      unsigned slot = nir_src_as_uint(*src);
1395      unsigned tex_idx = ir3_image_to_tex(&ctx->so->image_mapping, slot);
1396      struct ir3_instruction *texture, *sampler;
1397
1398      texture = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
1399      sampler = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
1400
1401      info.samp_tex = ir3_collect(b, sampler, texture);
1402   }
1403
1404   return info;
1405}
1406
1407static struct ir3_instruction *
1408emit_sam(struct ir3_context *ctx, opc_t opc, struct tex_src_info info,
1409         type_t type, unsigned wrmask, struct ir3_instruction *src0,
1410         struct ir3_instruction *src1)
1411{
1412   struct ir3_instruction *sam, *addr;
1413   if (info.flags & IR3_INSTR_A1EN) {
1414      addr = ir3_get_addr1(ctx, info.a1_val);
1415   }
1416   sam = ir3_SAM(ctx->block, opc, type, wrmask, info.flags, info.samp_tex, src0,
1417                 src1);
1418   if (info.flags & IR3_INSTR_A1EN) {
1419      ir3_instr_set_address(sam, addr);
1420   }
1421   if (info.flags & IR3_INSTR_B) {
1422      sam->cat5.tex_base = info.base;
1423      sam->cat5.samp = info.samp_idx;
1424      sam->cat5.tex  = info.tex_idx;
1425   }
1426   return sam;
1427}
1428
1429/* src[] = { deref, coord, sample_index }. const_index[] = {} */
1430static void
1431emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1432                          struct ir3_instruction **dst)
1433{
1434   /* If the image can be written, must use LDIB to retrieve data, rather than
1435    * through ISAM (which uses the texture cache and won't get previous writes).
1436    */
1437   if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER)) {
1438      ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
1439      return;
1440   }
1441
1442   /* The sparse set of texture descriptors for non-coherent load_images means we can't do indirection, so
1443    * fall back to coherent load.
1444    */
1445   if (ctx->compiler->gen >= 5 &&
1446       !ir3_bindless_resource(intr->src[0]) &&
1447       !nir_src_is_const(intr->src[0])) {
1448      ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
1449      return;
1450   }
1451
1452   struct ir3_block *b = ctx->block;
1453   struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0]);
1454   struct ir3_instruction *sam;
1455   struct ir3_instruction *const *src0 = ir3_get_src(ctx, &intr->src[1]);
1456   struct ir3_instruction *coords[4];
1457   unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
1458   type_t type = ir3_get_type_for_image_intrinsic(intr);
1459
1460   info.flags |= flags;
1461
1462   /* hw doesn't do 1d, so we treat it as 2d with height of 1, and patch up the
1463    * y coord. Note that the array index must come after the fake y coord.
1464    */
1465   enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr);
1466   if (dim == GLSL_SAMPLER_DIM_1D || dim == GLSL_SAMPLER_DIM_BUF) {
1467      coords[0] = src0[0];
1468      coords[1] = create_immed(b, 0);
1469      for (unsigned i = 1; i < ncoords; i++)
1470         coords[i + 1] = src0[i];
1471      ncoords++;
1472   } else {
1473      for (unsigned i = 0; i < ncoords; i++)
1474         coords[i] = src0[i];
1475   }
1476
1477   sam = emit_sam(ctx, OPC_ISAM, info, type, 0b1111,
1478                  ir3_create_collect(b, coords, ncoords), NULL);
1479
1480   ir3_handle_nonuniform(sam, intr);
1481
1482   sam->barrier_class = IR3_BARRIER_IMAGE_R;
1483   sam->barrier_conflict = IR3_BARRIER_IMAGE_W;
1484
1485   ir3_split_dest(b, dst, sam, 0, 4);
1486}
1487
1488/* A4xx version of image_size, see ir3_a6xx.c for newer resinfo version. */
1489void
1490emit_intrinsic_image_size_tex(struct ir3_context *ctx,
1491                              nir_intrinsic_instr *intr,
1492                              struct ir3_instruction **dst)
1493{
1494   struct ir3_block *b = ctx->block;
1495   struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0]);
1496   struct ir3_instruction *sam, *lod;
1497   unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
1498   type_t dst_type = nir_dest_bit_size(intr->dest) == 16 ? TYPE_U16 : TYPE_U32;
1499
1500   info.flags |= flags;
1501   assert(nir_src_as_uint(intr->src[1]) == 0);
1502   lod = create_immed(b, 0);
1503   sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
1504
1505   /* Array size actually ends up in .w rather than .z. This doesn't
1506    * matter for miplevel 0, but for higher mips the value in z is
1507    * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
1508    * returned, which means that we have to add 1 to it for arrays for
1509    * a3xx.
1510    *
1511    * Note use a temporary dst and then copy, since the size of the dst
1512    * array that is passed in is based on nir's understanding of the
1513    * result size, not the hardware's
1514    */
1515   struct ir3_instruction *tmp[4];
1516
1517   ir3_split_dest(b, tmp, sam, 0, 4);
1518
1519   for (unsigned i = 0; i < ncoords; i++)
1520      dst[i] = tmp[i];
1521
1522   if (flags & IR3_INSTR_A) {
1523      if (ctx->compiler->levels_add_one) {
1524         dst[ncoords - 1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0);
1525      } else {
1526         dst[ncoords - 1] = ir3_MOV(b, tmp[3], TYPE_U32);
1527      }
1528   }
1529}
1530
1531/* src[] = { buffer_index, offset }. No const_index */
1532static void
1533emit_intrinsic_load_ssbo(struct ir3_context *ctx,
1534                         nir_intrinsic_instr *intr,
1535                         struct ir3_instruction **dst)
1536{
1537   /* Note: isam currently can't handle vectorized loads/stores */
1538   if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER) ||
1539       !ir3_bindless_resource(intr->src[0]) ||
1540       intr->dest.ssa.num_components > 1) {
1541      ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
1542      return;
1543   }
1544
1545   struct ir3_block *b = ctx->block;
1546   struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[2])[0];
1547   struct ir3_instruction *coords = ir3_collect(b, offset, create_immed(b, 0));
1548   struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0]);
1549
1550   unsigned num_components = intr->dest.ssa.num_components;
1551   struct ir3_instruction *sam =
1552      emit_sam(ctx, OPC_ISAM, info, utype_for_size(intr->dest.ssa.bit_size),
1553               MASK(num_components), coords, NULL);
1554
1555   ir3_handle_nonuniform(sam, intr);
1556
1557   sam->barrier_class = IR3_BARRIER_BUFFER_R;
1558   sam->barrier_conflict = IR3_BARRIER_BUFFER_W;
1559
1560   ir3_split_dest(b, dst, sam, 0, num_components);
1561}
1562
1563static void
1564emit_control_barrier(struct ir3_context *ctx)
1565{
1566   /* Hull shaders dispatch 32 wide so an entire patch will always
1567    * fit in a single warp and execute in lock-step. Consequently,
1568    * we don't need to do anything for TCS barriers. Emitting
1569    * barrier instruction will deadlock.
1570    */
1571   if (ctx->so->type == MESA_SHADER_TESS_CTRL)
1572      return;
1573
1574   struct ir3_block *b = ctx->block;
1575   struct ir3_instruction *barrier = ir3_BAR(b);
1576   barrier->cat7.g = true;
1577   if (ctx->compiler->gen < 6)
1578      barrier->cat7.l = true;
1579   barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
1580   barrier->barrier_class = IR3_BARRIER_EVERYTHING;
1581   array_insert(b, b->keeps, barrier);
1582
1583   ctx->so->has_barrier = true;
1584}
1585
1586static void
1587emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1588{
1589   struct ir3_block *b = ctx->block;
1590   struct ir3_instruction *barrier;
1591
1592   /* TODO: find out why there is a major difference of .l usage
1593    * between a5xx and a6xx,
1594    */
1595
1596   switch (intr->intrinsic) {
1597   case nir_intrinsic_control_barrier:
1598      emit_control_barrier(ctx);
1599      return;
1600   case nir_intrinsic_scoped_barrier: {
1601      nir_scope exec_scope = nir_intrinsic_execution_scope(intr);
1602      nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
1603      /* loads/stores are always cache-coherent so we can filter out
1604       * available/visible.
1605       */
1606      nir_memory_semantics semantics =
1607         nir_intrinsic_memory_semantics(intr) & (NIR_MEMORY_ACQUIRE |
1608                                                 NIR_MEMORY_RELEASE);
1609
1610      if (ctx->so->type == MESA_SHADER_TESS_CTRL) {
1611         /* Remove mode corresponding to nir_intrinsic_memory_barrier_tcs_patch,
1612          * because hull shaders dispatch 32 wide so an entire patch will
1613          * always fit in a single warp and execute in lock-step.
1614          *
1615          * TODO: memory barrier also tells us not to reorder stores, this
1616          * information is lost here (backend doesn't reorder stores so we
1617          * are safe for now).
1618          */
1619         modes &= ~nir_var_shader_out;
1620      }
1621
1622      assert(!(modes & nir_var_shader_out));
1623
1624      if ((modes &
1625           (nir_var_mem_shared | nir_var_mem_ssbo | nir_var_mem_global |
1626            nir_var_image)) && semantics) {
1627         barrier = ir3_FENCE(b);
1628         barrier->cat7.r = true;
1629         barrier->cat7.w = true;
1630
1631         if (modes & (nir_var_mem_ssbo | nir_var_image | nir_var_mem_global)) {
1632            barrier->cat7.g = true;
1633         }
1634
1635         if (ctx->compiler->gen >= 6) {
1636            if (modes & (nir_var_mem_ssbo | nir_var_image)) {
1637               barrier->cat7.l = true;
1638            }
1639         } else {
1640            if (modes & (nir_var_mem_shared | nir_var_mem_ssbo | nir_var_image)) {
1641               barrier->cat7.l = true;
1642            }
1643         }
1644
1645         barrier->barrier_class = 0;
1646         barrier->barrier_conflict = 0;
1647
1648         if (modes & nir_var_mem_shared) {
1649            barrier->barrier_class |= IR3_BARRIER_SHARED_W;
1650            barrier->barrier_conflict |=
1651               IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1652         }
1653
1654         if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
1655            barrier->barrier_class |= IR3_BARRIER_BUFFER_W;
1656            barrier->barrier_conflict |=
1657               IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
1658         }
1659
1660         if (modes & nir_var_image) {
1661            barrier->barrier_class |= IR3_BARRIER_IMAGE_W;
1662            barrier->barrier_conflict |=
1663               IR3_BARRIER_IMAGE_W | IR3_BARRIER_IMAGE_R;
1664         }
1665         array_insert(b, b->keeps, barrier);
1666      }
1667
1668      if (exec_scope >= NIR_SCOPE_WORKGROUP) {
1669         emit_control_barrier(ctx);
1670      }
1671
1672      return;
1673   }
1674   case nir_intrinsic_memory_barrier_tcs_patch:
1675      /* Not applicable, see explanation for scoped_barrier + shader_out */
1676      return;
1677   case nir_intrinsic_memory_barrier_buffer:
1678      barrier = ir3_FENCE(b);
1679      barrier->cat7.g = true;
1680      if (ctx->compiler->gen >= 6)
1681         barrier->cat7.l = true;
1682      barrier->cat7.r = true;
1683      barrier->cat7.w = true;
1684      barrier->barrier_class = IR3_BARRIER_BUFFER_W;
1685      barrier->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
1686      break;
1687   case nir_intrinsic_memory_barrier_image:
1688      barrier = ir3_FENCE(b);
1689      barrier->cat7.g = true;
1690      barrier->cat7.l = true;
1691      barrier->cat7.r = true;
1692      barrier->cat7.w = true;
1693      barrier->barrier_class = IR3_BARRIER_IMAGE_W;
1694      barrier->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
1695      break;
1696   case nir_intrinsic_memory_barrier_shared:
1697      barrier = ir3_FENCE(b);
1698      if (ctx->compiler->gen < 6)
1699         barrier->cat7.l = true;
1700      barrier->cat7.r = true;
1701      barrier->cat7.w = true;
1702      barrier->barrier_class = IR3_BARRIER_SHARED_W;
1703      barrier->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1704      break;
1705   case nir_intrinsic_memory_barrier:
1706   case nir_intrinsic_group_memory_barrier:
1707      barrier = ir3_FENCE(b);
1708      barrier->cat7.g = true;
1709      barrier->cat7.l = true;
1710      barrier->cat7.r = true;
1711      barrier->cat7.w = true;
1712      barrier->barrier_class =
1713         IR3_BARRIER_SHARED_W | IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
1714      barrier->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W |
1715                                  IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
1716                                  IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
1717      break;
1718   default:
1719      unreachable("boo");
1720   }
1721
1722   /* make sure barrier doesn't get DCE'd */
1723   array_insert(b, b->keeps, barrier);
1724}
1725
1726static void
1727add_sysval_input_compmask(struct ir3_context *ctx, gl_system_value slot,
1728                          unsigned compmask, struct ir3_instruction *instr)
1729{
1730   struct ir3_shader_variant *so = ctx->so;
1731   unsigned n = so->inputs_count++;
1732
1733   assert(instr->opc == OPC_META_INPUT);
1734   instr->input.inidx = n;
1735   instr->input.sysval = slot;
1736
1737   so->inputs[n].sysval = true;
1738   so->inputs[n].slot = slot;
1739   so->inputs[n].compmask = compmask;
1740   so->total_in++;
1741
1742   so->sysval_in += util_last_bit(compmask);
1743}
1744
1745static struct ir3_instruction *
1746create_sysval_input(struct ir3_context *ctx, gl_system_value slot,
1747                    unsigned compmask)
1748{
1749   assert(compmask);
1750   struct ir3_instruction *sysval = create_input(ctx, compmask);
1751   add_sysval_input_compmask(ctx, slot, compmask, sysval);
1752   return sysval;
1753}
1754
1755static struct ir3_instruction *
1756get_barycentric(struct ir3_context *ctx, enum ir3_bary bary)
1757{
1758   STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_PIXEL ==
1759                 SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL);
1760   STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_SAMPLE ==
1761                 SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE);
1762   STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_CENTROID ==
1763                 SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID);
1764   STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_CENTER_RHW ==
1765                 SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTER_RHW);
1766   STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_LINEAR_PIXEL ==
1767                 SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL);
1768   STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_LINEAR_CENTROID ==
1769                 SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID);
1770   STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_LINEAR_SAMPLE ==
1771                 SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE);
1772
1773   if (!ctx->ij[bary]) {
1774      struct ir3_instruction *xy[2];
1775      struct ir3_instruction *ij;
1776
1777      ij = create_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL +
1778                               bary, 0x3);
1779      ir3_split_dest(ctx->in_block, xy, ij, 0, 2);
1780
1781      ctx->ij[bary] = ir3_create_collect(ctx->in_block, xy, 2);
1782   }
1783
1784   return ctx->ij[bary];
1785}
1786
1787/* TODO: make this a common NIR helper?
1788 * there is a nir_system_value_from_intrinsic but it takes nir_intrinsic_op so
1789 * it can't be extended to work with this
1790 */
1791static gl_system_value
1792nir_intrinsic_barycentric_sysval(nir_intrinsic_instr *intr)
1793{
1794   enum glsl_interp_mode interp_mode = nir_intrinsic_interp_mode(intr);
1795   gl_system_value sysval;
1796
1797   switch (intr->intrinsic) {
1798   case nir_intrinsic_load_barycentric_pixel:
1799      if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
1800         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
1801      else
1802         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1803      break;
1804   case nir_intrinsic_load_barycentric_centroid:
1805      if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
1806         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID;
1807      else
1808         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID;
1809      break;
1810   case nir_intrinsic_load_barycentric_sample:
1811      if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
1812         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE;
1813      else
1814         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE;
1815      break;
1816   default:
1817      unreachable("invalid barycentric intrinsic");
1818   }
1819
1820   return sysval;
1821}
1822
1823static void
1824emit_intrinsic_barycentric(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1825                           struct ir3_instruction **dst)
1826{
1827   gl_system_value sysval = nir_intrinsic_barycentric_sysval(intr);
1828
1829   if (!ctx->so->key.msaa) {
1830      switch (sysval) {
1831      case SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE:
1832         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1833         break;
1834      case SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID:
1835         if (ctx->compiler->gen < 6)
1836            sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1837         break;
1838      case SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE:
1839         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
1840         break;
1841      case SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID:
1842         if (ctx->compiler->gen < 6)
1843            sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
1844         break;
1845      default:
1846         break;
1847      }
1848   }
1849
1850   enum ir3_bary bary = sysval - SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1851
1852   struct ir3_instruction *ij = get_barycentric(ctx, bary);
1853   ir3_split_dest(ctx->block, dst, ij, 0, 2);
1854}
1855
1856static struct ir3_instruction *
1857get_frag_coord(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1858{
1859   if (!ctx->frag_coord) {
1860      struct ir3_block *b = ir3_after_preamble(ctx->ir);
1861      struct ir3_instruction *xyzw[4];
1862      struct ir3_instruction *hw_frag_coord;
1863
1864      hw_frag_coord = create_sysval_input(ctx, SYSTEM_VALUE_FRAG_COORD, 0xf);
1865      ir3_split_dest(b, xyzw, hw_frag_coord, 0, 4);
1866
1867      /* for frag_coord.xy, we get unsigned values.. we need
1868       * to subtract (integer) 8 and divide by 16 (right-
1869       * shift by 4) then convert to float:
1870       *
1871       *    sub.s tmp, src, 8
1872       *    shr.b tmp, tmp, 4
1873       *    mov.u32f32 dst, tmp
1874       *
1875       */
1876      for (int i = 0; i < 2; i++) {
1877         xyzw[i] = ir3_COV(b, xyzw[i], TYPE_U32, TYPE_F32);
1878         xyzw[i] =
1879            ir3_MUL_F(b, xyzw[i], 0, create_immed(b, fui(1.0 / 16.0)), 0);
1880      }
1881
1882      ctx->frag_coord = ir3_create_collect(b, xyzw, 4);
1883   }
1884
1885   ctx->so->fragcoord_compmask |= nir_ssa_def_components_read(&intr->dest.ssa);
1886
1887   return ctx->frag_coord;
1888}
1889
1890/* This is a bit of a hack until ir3_context is converted to store SSA values
1891 * as ir3_register's instead of ir3_instruction's. Pick out a given destination
1892 * of an instruction with multiple destinations using a mov that will get folded
1893 * away by ir3_cp.
1894 */
1895static struct ir3_instruction *
1896create_multidst_mov(struct ir3_block *block, struct ir3_register *dst)
1897{
1898   struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
1899   unsigned dst_flags = dst->flags & IR3_REG_HALF;
1900   unsigned src_flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
1901
1902   __ssa_dst(mov)->flags |= dst_flags;
1903   struct ir3_register *src =
1904      ir3_src_create(mov, INVALID_REG, IR3_REG_SSA | src_flags);
1905   src->wrmask = dst->wrmask;
1906   src->def = dst;
1907   assert(!(dst->flags & IR3_REG_RELATIV));
1908   mov->cat1.src_type = mov->cat1.dst_type =
1909      (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
1910   return mov;
1911}
1912
1913static reduce_op_t
1914get_reduce_op(nir_op opc)
1915{
1916   switch (opc) {
1917   case nir_op_iadd: return REDUCE_OP_ADD_U;
1918   case nir_op_fadd: return REDUCE_OP_ADD_F;
1919   case nir_op_imul: return REDUCE_OP_MUL_U;
1920   case nir_op_fmul: return REDUCE_OP_MUL_F;
1921   case nir_op_umin: return REDUCE_OP_MIN_U;
1922   case nir_op_imin: return REDUCE_OP_MIN_S;
1923   case nir_op_fmin: return REDUCE_OP_MIN_F;
1924   case nir_op_umax: return REDUCE_OP_MAX_U;
1925   case nir_op_imax: return REDUCE_OP_MAX_S;
1926   case nir_op_fmax: return REDUCE_OP_MAX_F;
1927   case nir_op_iand: return REDUCE_OP_AND_B;
1928   case nir_op_ior:  return REDUCE_OP_OR_B;
1929   case nir_op_ixor: return REDUCE_OP_XOR_B;
1930   default:
1931      unreachable("unknown NIR reduce op");
1932   }
1933}
1934
1935static uint32_t
1936get_reduce_identity(nir_op opc, unsigned size)
1937{
1938   switch (opc) {
1939   case nir_op_iadd:
1940      return 0;
1941   case nir_op_fadd:
1942      return size == 32 ? fui(0.0f) : _mesa_float_to_half(0.0f);
1943   case nir_op_imul:
1944      return 1;
1945   case nir_op_fmul:
1946      return size == 32 ? fui(1.0f) : _mesa_float_to_half(1.0f);
1947   case nir_op_umax:
1948      return 0;
1949   case nir_op_imax:
1950      return size == 32 ? INT32_MIN : (uint32_t)INT16_MIN;
1951   case nir_op_fmax:
1952      return size == 32 ? fui(-INFINITY) : _mesa_float_to_half(-INFINITY);
1953   case nir_op_umin:
1954      return size == 32 ? UINT32_MAX : UINT16_MAX;
1955   case nir_op_imin:
1956      return size == 32 ? INT32_MAX : (uint32_t)INT16_MAX;
1957   case nir_op_fmin:
1958      return size == 32 ? fui(INFINITY) : _mesa_float_to_half(INFINITY);
1959   case nir_op_iand:
1960      return size == 32 ? ~0 : (size == 16 ? (uint32_t)(uint16_t)~0 : 1);
1961   case nir_op_ior:
1962      return 0;
1963   case nir_op_ixor:
1964      return 0;
1965   default:
1966      unreachable("unknown NIR reduce op");
1967   }
1968}
1969
1970static struct ir3_instruction *
1971emit_intrinsic_reduce(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1972{
1973   struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
1974   nir_op nir_reduce_op = (nir_op) nir_intrinsic_reduction_op(intr);
1975   reduce_op_t reduce_op = get_reduce_op(nir_reduce_op);
1976   unsigned dst_size = nir_dest_bit_size(intr->dest);
1977   unsigned flags = (ir3_bitsize(ctx, dst_size) == 16) ? IR3_REG_HALF : 0;
1978
1979   /* Note: the shared reg is initialized to the identity, so we need it to
1980    * always be 32-bit even when the source isn't because half shared regs are
1981    * not supported.
1982    */
1983   struct ir3_instruction *identity =
1984      create_immed(ctx->block, get_reduce_identity(nir_reduce_op, dst_size));
1985   identity = ir3_READ_FIRST_MACRO(ctx->block, identity, 0);
1986   identity->dsts[0]->flags |= IR3_REG_SHARED;
1987
1988   /* OPC_SCAN_MACRO has the following destinations:
1989    * - Exclusive scan result (interferes with source)
1990    * - Inclusive scan result
1991    * - Shared reg reduction result, must be initialized to the identity
1992    *
1993    * The loop computes all three results at the same time, we just have to
1994    * choose which destination to return.
1995    */
1996   struct ir3_instruction *scan =
1997      ir3_instr_create(ctx->block, OPC_SCAN_MACRO, 3, 2);
1998   scan->cat1.reduce_op = reduce_op;
1999
2000   struct ir3_register *exclusive = __ssa_dst(scan);
2001   exclusive->flags |= flags | IR3_REG_EARLY_CLOBBER;
2002   struct ir3_register *inclusive = __ssa_dst(scan);
2003   inclusive->flags |= flags;
2004   struct ir3_register *reduce = __ssa_dst(scan);
2005   reduce->flags |= IR3_REG_SHARED;
2006
2007   /* The 32-bit multiply macro reads its sources after writing a partial result
2008    * to the destination, therefore inclusive also interferes with the source.
2009    */
2010   if (reduce_op == REDUCE_OP_MUL_U && dst_size == 32)
2011      inclusive->flags |= IR3_REG_EARLY_CLOBBER;
2012
2013   /* Normal source */
2014   __ssa_src(scan, src, 0);
2015
2016   /* shared reg tied source */
2017   struct ir3_register *reduce_init = __ssa_src(scan, identity, IR3_REG_SHARED);
2018   ir3_reg_tie(reduce, reduce_init);
2019
2020   struct ir3_register *dst;
2021   switch (intr->intrinsic) {
2022   case nir_intrinsic_reduce: dst = reduce; break;
2023   case nir_intrinsic_inclusive_scan: dst = inclusive; break;
2024   case nir_intrinsic_exclusive_scan: dst = exclusive; break;
2025   default:
2026      unreachable("unknown reduce intrinsic");
2027   }
2028
2029   return create_multidst_mov(ctx->block, dst);
2030}
2031
2032static void setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr);
2033static void setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr);
2034
2035static void
2036emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2037{
2038   const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
2039   struct ir3_instruction **dst;
2040   struct ir3_instruction *const *src;
2041   struct ir3_block *b = ctx->block;
2042   unsigned dest_components = nir_intrinsic_dest_components(intr);
2043   int idx;
2044
2045   if (info->has_dest) {
2046      dst = ir3_get_dst(ctx, &intr->dest, dest_components);
2047   } else {
2048      dst = NULL;
2049   }
2050
2051   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
2052   const unsigned primitive_param = const_state->offsets.primitive_param * 4;
2053   const unsigned primitive_map = const_state->offsets.primitive_map * 4;
2054
2055   switch (intr->intrinsic) {
2056   case nir_intrinsic_load_uniform:
2057      idx = nir_intrinsic_base(intr);
2058      if (nir_src_is_const(intr->src[0])) {
2059         idx += nir_src_as_uint(intr->src[0]);
2060         for (int i = 0; i < dest_components; i++) {
2061            dst[i] = create_uniform_typed(
2062               b, idx + i,
2063               nir_dest_bit_size(intr->dest) == 16 ? TYPE_F16 : TYPE_F32);
2064         }
2065      } else {
2066         src = ir3_get_src(ctx, &intr->src[0]);
2067         for (int i = 0; i < dest_components; i++) {
2068            dst[i] = create_uniform_indirect(
2069               b, idx + i,
2070               nir_dest_bit_size(intr->dest) == 16 ? TYPE_F16 : TYPE_F32,
2071               ir3_get_addr0(ctx, src[0], 1));
2072         }
2073         /* NOTE: if relative addressing is used, we set
2074          * constlen in the compiler (to worst-case value)
2075          * since we don't know in the assembler what the max
2076          * addr reg value can be:
2077          */
2078         ctx->so->constlen =
2079            MAX2(ctx->so->constlen,
2080                 ctx->so->num_reserved_user_consts +
2081                 const_state->ubo_state.size / 16);
2082      }
2083      break;
2084
2085   case nir_intrinsic_load_vs_primitive_stride_ir3:
2086      dst[0] = create_uniform(b, primitive_param + 0);
2087      break;
2088   case nir_intrinsic_load_vs_vertex_stride_ir3:
2089      dst[0] = create_uniform(b, primitive_param + 1);
2090      break;
2091   case nir_intrinsic_load_hs_patch_stride_ir3:
2092      dst[0] = create_uniform(b, primitive_param + 2);
2093      break;
2094   case nir_intrinsic_load_patch_vertices_in:
2095      dst[0] = create_uniform(b, primitive_param + 3);
2096      break;
2097   case nir_intrinsic_load_tess_param_base_ir3:
2098      dst[0] = create_uniform(b, primitive_param + 4);
2099      dst[1] = create_uniform(b, primitive_param + 5);
2100      break;
2101   case nir_intrinsic_load_tess_factor_base_ir3:
2102      dst[0] = create_uniform(b, primitive_param + 6);
2103      dst[1] = create_uniform(b, primitive_param + 7);
2104      break;
2105
2106   case nir_intrinsic_load_primitive_location_ir3:
2107      idx = nir_intrinsic_driver_location(intr);
2108      dst[0] = create_uniform(b, primitive_map + idx);
2109      break;
2110
2111   case nir_intrinsic_load_gs_header_ir3:
2112      dst[0] = ctx->gs_header;
2113      break;
2114   case nir_intrinsic_load_tcs_header_ir3:
2115      dst[0] = ctx->tcs_header;
2116      break;
2117
2118   case nir_intrinsic_load_rel_patch_id_ir3:
2119      dst[0] = ctx->rel_patch_id;
2120      break;
2121
2122   case nir_intrinsic_load_primitive_id:
2123      if (!ctx->primitive_id) {
2124         ctx->primitive_id =
2125            create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
2126      }
2127      dst[0] = ctx->primitive_id;
2128      break;
2129
2130   case nir_intrinsic_load_tess_coord:
2131      if (!ctx->tess_coord) {
2132         ctx->tess_coord =
2133            create_sysval_input(ctx, SYSTEM_VALUE_TESS_COORD, 0x3);
2134      }
2135      ir3_split_dest(b, dst, ctx->tess_coord, 0, 2);
2136
2137      /* Unused, but ir3_put_dst() below wants to free something */
2138      dst[2] = create_immed(b, 0);
2139      break;
2140
2141   case nir_intrinsic_end_patch_ir3:
2142      assert(ctx->so->type == MESA_SHADER_TESS_CTRL);
2143      struct ir3_instruction *end = ir3_PREDE(b);
2144      array_insert(b, b->keeps, end);
2145
2146      end->barrier_class = IR3_BARRIER_EVERYTHING;
2147      end->barrier_conflict = IR3_BARRIER_EVERYTHING;
2148      break;
2149
2150   case nir_intrinsic_store_global_ir3:
2151      ctx->funcs->emit_intrinsic_store_global_ir3(ctx, intr);
2152      break;
2153   case nir_intrinsic_load_global_ir3:
2154      ctx->funcs->emit_intrinsic_load_global_ir3(ctx, intr, dst);
2155      break;
2156
2157   case nir_intrinsic_load_ubo:
2158      emit_intrinsic_load_ubo(ctx, intr, dst);
2159      break;
2160   case nir_intrinsic_load_ubo_vec4:
2161      emit_intrinsic_load_ubo_ldc(ctx, intr, dst);
2162      break;
2163   case nir_intrinsic_copy_ubo_to_uniform_ir3:
2164      emit_intrinsic_copy_ubo_to_uniform(ctx, intr);
2165      break;
2166   case nir_intrinsic_load_frag_coord:
2167      ir3_split_dest(b, dst, get_frag_coord(ctx, intr), 0, 4);
2168      break;
2169   case nir_intrinsic_load_sample_pos_from_id: {
2170      /* NOTE: blob seems to always use TYPE_F16 and then cov.f16f32,
2171       * but that doesn't seem necessary.
2172       */
2173      struct ir3_instruction *offset =
2174         ir3_RGETPOS(b, ir3_get_src(ctx, &intr->src[0])[0], 0);
2175      offset->dsts[0]->wrmask = 0x3;
2176      offset->cat5.type = TYPE_F32;
2177
2178      ir3_split_dest(b, dst, offset, 0, 2);
2179
2180      break;
2181   }
2182   case nir_intrinsic_load_persp_center_rhw_ir3:
2183      if (!ctx->ij[IJ_PERSP_CENTER_RHW]) {
2184         ctx->ij[IJ_PERSP_CENTER_RHW] =
2185            create_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTER_RHW, 0x1);
2186      }
2187      dst[0] = ctx->ij[IJ_PERSP_CENTER_RHW];
2188      break;
2189   case nir_intrinsic_load_barycentric_centroid:
2190   case nir_intrinsic_load_barycentric_sample:
2191   case nir_intrinsic_load_barycentric_pixel:
2192      emit_intrinsic_barycentric(ctx, intr, dst);
2193      break;
2194   case nir_intrinsic_load_interpolated_input:
2195   case nir_intrinsic_load_input:
2196      setup_input(ctx, intr);
2197      break;
2198   case nir_intrinsic_load_kernel_input:
2199      emit_intrinsic_load_kernel_input(ctx, intr, dst);
2200      break;
2201   /* All SSBO intrinsics should have been lowered by 'lower_io_offsets'
2202    * pass and replaced by an ir3-specifc version that adds the
2203    * dword-offset in the last source.
2204    */
2205   case nir_intrinsic_load_ssbo_ir3:
2206      emit_intrinsic_load_ssbo(ctx, intr, dst);
2207      break;
2208   case nir_intrinsic_store_ssbo_ir3:
2209      ctx->funcs->emit_intrinsic_store_ssbo(ctx, intr);
2210      break;
2211   case nir_intrinsic_get_ssbo_size:
2212      emit_intrinsic_ssbo_size(ctx, intr, dst);
2213      break;
2214   case nir_intrinsic_ssbo_atomic_add_ir3:
2215   case nir_intrinsic_ssbo_atomic_imin_ir3:
2216   case nir_intrinsic_ssbo_atomic_umin_ir3:
2217   case nir_intrinsic_ssbo_atomic_imax_ir3:
2218   case nir_intrinsic_ssbo_atomic_umax_ir3:
2219   case nir_intrinsic_ssbo_atomic_and_ir3:
2220   case nir_intrinsic_ssbo_atomic_or_ir3:
2221   case nir_intrinsic_ssbo_atomic_xor_ir3:
2222   case nir_intrinsic_ssbo_atomic_exchange_ir3:
2223   case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
2224      dst[0] = ctx->funcs->emit_intrinsic_atomic_ssbo(ctx, intr);
2225      break;
2226   case nir_intrinsic_load_shared:
2227      emit_intrinsic_load_shared(ctx, intr, dst);
2228      break;
2229   case nir_intrinsic_store_shared:
2230      emit_intrinsic_store_shared(ctx, intr);
2231      break;
2232   case nir_intrinsic_shared_atomic_add:
2233   case nir_intrinsic_shared_atomic_imin:
2234   case nir_intrinsic_shared_atomic_umin:
2235   case nir_intrinsic_shared_atomic_imax:
2236   case nir_intrinsic_shared_atomic_umax:
2237   case nir_intrinsic_shared_atomic_and:
2238   case nir_intrinsic_shared_atomic_or:
2239   case nir_intrinsic_shared_atomic_xor:
2240   case nir_intrinsic_shared_atomic_exchange:
2241   case nir_intrinsic_shared_atomic_comp_swap:
2242      dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
2243      break;
2244   case nir_intrinsic_load_scratch:
2245      emit_intrinsic_load_scratch(ctx, intr, dst);
2246      break;
2247   case nir_intrinsic_store_scratch:
2248      emit_intrinsic_store_scratch(ctx, intr);
2249      break;
2250   case nir_intrinsic_image_load:
2251   case nir_intrinsic_bindless_image_load:
2252      emit_intrinsic_load_image(ctx, intr, dst);
2253      break;
2254   case nir_intrinsic_image_store:
2255   case nir_intrinsic_bindless_image_store:
2256      ctx->funcs->emit_intrinsic_store_image(ctx, intr);
2257      break;
2258   case nir_intrinsic_image_size:
2259   case nir_intrinsic_bindless_image_size:
2260      ctx->funcs->emit_intrinsic_image_size(ctx, intr, dst);
2261      break;
2262   case nir_intrinsic_image_atomic_add:
2263   case nir_intrinsic_bindless_image_atomic_add:
2264   case nir_intrinsic_image_atomic_imin:
2265   case nir_intrinsic_bindless_image_atomic_imin:
2266   case nir_intrinsic_image_atomic_umin:
2267   case nir_intrinsic_bindless_image_atomic_umin:
2268   case nir_intrinsic_image_atomic_imax:
2269   case nir_intrinsic_bindless_image_atomic_imax:
2270   case nir_intrinsic_image_atomic_umax:
2271   case nir_intrinsic_bindless_image_atomic_umax:
2272   case nir_intrinsic_image_atomic_and:
2273   case nir_intrinsic_bindless_image_atomic_and:
2274   case nir_intrinsic_image_atomic_or:
2275   case nir_intrinsic_bindless_image_atomic_or:
2276   case nir_intrinsic_image_atomic_xor:
2277   case nir_intrinsic_bindless_image_atomic_xor:
2278   case nir_intrinsic_image_atomic_exchange:
2279   case nir_intrinsic_bindless_image_atomic_exchange:
2280   case nir_intrinsic_image_atomic_comp_swap:
2281   case nir_intrinsic_bindless_image_atomic_comp_swap:
2282      dst[0] = ctx->funcs->emit_intrinsic_atomic_image(ctx, intr);
2283      break;
2284   case nir_intrinsic_scoped_barrier:
2285   case nir_intrinsic_control_barrier:
2286   case nir_intrinsic_memory_barrier:
2287   case nir_intrinsic_group_memory_barrier:
2288   case nir_intrinsic_memory_barrier_buffer:
2289   case nir_intrinsic_memory_barrier_image:
2290   case nir_intrinsic_memory_barrier_shared:
2291   case nir_intrinsic_memory_barrier_tcs_patch:
2292      emit_intrinsic_barrier(ctx, intr);
2293      /* note that blk ptr no longer valid, make that obvious: */
2294      b = NULL;
2295      break;
2296   case nir_intrinsic_store_output:
2297      setup_output(ctx, intr);
2298      break;
2299   case nir_intrinsic_load_base_vertex:
2300   case nir_intrinsic_load_first_vertex:
2301      if (!ctx->basevertex) {
2302         ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
2303      }
2304      dst[0] = ctx->basevertex;
2305      break;
2306   case nir_intrinsic_load_draw_id:
2307      if (!ctx->draw_id) {
2308         ctx->draw_id = create_driver_param(ctx, IR3_DP_DRAWID);
2309      }
2310      dst[0] = ctx->draw_id;
2311      break;
2312   case nir_intrinsic_load_base_instance:
2313      if (!ctx->base_instance) {
2314         ctx->base_instance = create_driver_param(ctx, IR3_DP_INSTID_BASE);
2315      }
2316      dst[0] = ctx->base_instance;
2317      break;
2318   case nir_intrinsic_load_view_index:
2319      if (!ctx->view_index) {
2320         ctx->view_index =
2321            create_sysval_input(ctx, SYSTEM_VALUE_VIEW_INDEX, 0x1);
2322      }
2323      dst[0] = ctx->view_index;
2324      break;
2325   case nir_intrinsic_load_vertex_id_zero_base:
2326   case nir_intrinsic_load_vertex_id:
2327      if (!ctx->vertex_id) {
2328         gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id)
2329                                 ? SYSTEM_VALUE_VERTEX_ID
2330                                 : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
2331         ctx->vertex_id = create_sysval_input(ctx, sv, 0x1);
2332      }
2333      dst[0] = ctx->vertex_id;
2334      break;
2335   case nir_intrinsic_load_instance_id:
2336      if (!ctx->instance_id) {
2337         ctx->instance_id =
2338            create_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID, 0x1);
2339      }
2340      dst[0] = ctx->instance_id;
2341      break;
2342   case nir_intrinsic_load_sample_id:
2343      ctx->so->per_samp = true;
2344      FALLTHROUGH;
2345   case nir_intrinsic_load_sample_id_no_per_sample:
2346      if (!ctx->samp_id) {
2347         ctx->samp_id = create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID, 0x1);
2348         ctx->samp_id->dsts[0]->flags |= IR3_REG_HALF;
2349      }
2350      dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32);
2351      break;
2352   case nir_intrinsic_load_sample_mask_in:
2353      if (!ctx->samp_mask_in) {
2354         ctx->samp_mask_in =
2355            create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN, 0x1);
2356      }
2357      dst[0] = ctx->samp_mask_in;
2358      break;
2359   case nir_intrinsic_load_user_clip_plane:
2360      idx = nir_intrinsic_ucp_id(intr);
2361      for (int i = 0; i < dest_components; i++) {
2362         unsigned n = idx * 4 + i;
2363         dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
2364      }
2365      break;
2366   case nir_intrinsic_load_front_face:
2367      if (!ctx->frag_face) {
2368         ctx->so->frag_face = true;
2369         ctx->frag_face =
2370            create_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, 0x1);
2371         ctx->frag_face->dsts[0]->flags |= IR3_REG_HALF;
2372      }
2373      /* for fragface, we get -1 for back and 0 for front. However this is
2374       * the inverse of what nir expects (where ~0 is true).
2375       */
2376      dst[0] = ir3_CMPS_S(b, ctx->frag_face, 0,
2377                          create_immed_typed(b, 0, TYPE_U16), 0);
2378      dst[0]->cat2.condition = IR3_COND_EQ;
2379      break;
2380   case nir_intrinsic_load_local_invocation_id:
2381      if (!ctx->local_invocation_id) {
2382         ctx->local_invocation_id =
2383            create_sysval_input(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID, 0x7);
2384      }
2385      ir3_split_dest(b, dst, ctx->local_invocation_id, 0, 3);
2386      break;
2387   case nir_intrinsic_load_workgroup_id:
2388   case nir_intrinsic_load_workgroup_id_zero_base:
2389      if (ctx->compiler->has_shared_regfile) {
2390         if (!ctx->work_group_id) {
2391            ctx->work_group_id =
2392               create_sysval_input(ctx, SYSTEM_VALUE_WORKGROUP_ID, 0x7);
2393            ctx->work_group_id->dsts[0]->flags |= IR3_REG_SHARED;
2394         }
2395         ir3_split_dest(b, dst, ctx->work_group_id, 0, 3);
2396      } else {
2397         /* For a3xx/a4xx, this comes in via const injection by the hw */
2398         for (int i = 0; i < dest_components; i++) {
2399            dst[i] = create_driver_param(ctx, IR3_DP_WORKGROUP_ID_X + i);
2400         }
2401      }
2402      break;
2403   case nir_intrinsic_load_base_workgroup_id:
2404      for (int i = 0; i < dest_components; i++) {
2405         dst[i] = create_driver_param(ctx, IR3_DP_BASE_GROUP_X + i);
2406      }
2407      break;
2408   case nir_intrinsic_load_num_workgroups:
2409      for (int i = 0; i < dest_components; i++) {
2410         dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
2411      }
2412      break;
2413   case nir_intrinsic_load_workgroup_size:
2414      for (int i = 0; i < dest_components; i++) {
2415         dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
2416      }
2417      break;
2418   case nir_intrinsic_load_subgroup_size: {
2419      assert(ctx->so->type == MESA_SHADER_COMPUTE ||
2420             ctx->so->type == MESA_SHADER_FRAGMENT);
2421      enum ir3_driver_param size = ctx->so->type == MESA_SHADER_COMPUTE ?
2422         IR3_DP_CS_SUBGROUP_SIZE : IR3_DP_FS_SUBGROUP_SIZE;
2423      dst[0] = create_driver_param(ctx, size);
2424      break;
2425   }
2426   case nir_intrinsic_load_subgroup_id_shift_ir3:
2427      dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_ID_SHIFT);
2428      break;
2429   case nir_intrinsic_load_work_dim:
2430      dst[0] = create_driver_param(ctx, IR3_DP_WORK_DIM);
2431      break;
2432   case nir_intrinsic_load_subgroup_invocation:
2433      assert(ctx->compiler->has_getfiberid);
2434      dst[0] = ir3_GETFIBERID(b);
2435      dst[0]->cat6.type = TYPE_U32;
2436      __ssa_dst(dst[0]);
2437      break;
2438   case nir_intrinsic_discard_if:
2439   case nir_intrinsic_discard:
2440   case nir_intrinsic_demote:
2441   case nir_intrinsic_demote_if:
2442   case nir_intrinsic_terminate:
2443   case nir_intrinsic_terminate_if: {
2444      struct ir3_instruction *cond, *kill;
2445
2446      if (intr->intrinsic == nir_intrinsic_discard_if ||
2447          intr->intrinsic == nir_intrinsic_demote_if ||
2448          intr->intrinsic == nir_intrinsic_terminate_if) {
2449         /* conditional discard: */
2450         src = ir3_get_src(ctx, &intr->src[0]);
2451         cond = src[0];
2452      } else {
2453         /* unconditional discard: */
2454         cond = create_immed_typed(b, 1, ctx->compiler->bool_type);
2455      }
2456
2457      /* NOTE: only cmps.*.* can write p0.x: */
2458      struct ir3_instruction *zero =
2459            create_immed_typed(b, 0, is_half(cond) ? TYPE_U16 : TYPE_U32);
2460      cond = ir3_CMPS_S(b, cond, 0, zero, 0);
2461      cond->cat2.condition = IR3_COND_NE;
2462
2463      /* condition always goes in predicate register: */
2464      cond->dsts[0]->num = regid(REG_P0, 0);
2465      cond->dsts[0]->flags &= ~IR3_REG_SSA;
2466
2467      if (intr->intrinsic == nir_intrinsic_demote ||
2468          intr->intrinsic == nir_intrinsic_demote_if) {
2469         kill = ir3_DEMOTE(b, cond, 0);
2470      } else {
2471         kill = ir3_KILL(b, cond, 0);
2472      }
2473
2474      /* - Side-effects should not be moved on a different side of the kill
2475       * - Instructions that depend on active fibers should not be reordered
2476       */
2477      kill->barrier_class = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W |
2478                            IR3_BARRIER_ACTIVE_FIBERS_W;
2479      kill->barrier_conflict = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W |
2480                               IR3_BARRIER_ACTIVE_FIBERS_R;
2481      kill->srcs[0]->num = regid(REG_P0, 0);
2482      array_insert(ctx->ir, ctx->ir->predicates, kill);
2483
2484      array_insert(b, b->keeps, kill);
2485      ctx->so->has_kill = true;
2486
2487      break;
2488   }
2489
2490   case nir_intrinsic_cond_end_ir3: {
2491      struct ir3_instruction *cond, *kill;
2492
2493      src = ir3_get_src(ctx, &intr->src[0]);
2494      cond = src[0];
2495
2496      /* NOTE: only cmps.*.* can write p0.x: */
2497      struct ir3_instruction *zero =
2498            create_immed_typed(b, 0, is_half(cond) ? TYPE_U16 : TYPE_U32);
2499      cond = ir3_CMPS_S(b, cond, 0, zero, 0);
2500      cond->cat2.condition = IR3_COND_NE;
2501
2502      /* condition always goes in predicate register: */
2503      cond->dsts[0]->num = regid(REG_P0, 0);
2504
2505      kill = ir3_PREDT(b, cond, 0);
2506
2507      kill->barrier_class = IR3_BARRIER_EVERYTHING;
2508      kill->barrier_conflict = IR3_BARRIER_EVERYTHING;
2509
2510      array_insert(ctx->ir, ctx->ir->predicates, kill);
2511      array_insert(b, b->keeps, kill);
2512      break;
2513   }
2514
2515   case nir_intrinsic_vote_any:
2516   case nir_intrinsic_vote_all: {
2517      struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2518      struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
2519      if (intr->intrinsic == nir_intrinsic_vote_any)
2520         dst[0] = ir3_ANY_MACRO(ctx->block, pred, 0);
2521      else
2522         dst[0] = ir3_ALL_MACRO(ctx->block, pred, 0);
2523      dst[0]->srcs[0]->num = regid(REG_P0, 0);
2524      array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
2525      break;
2526   }
2527   case nir_intrinsic_elect:
2528      dst[0] = ir3_ELECT_MACRO(ctx->block);
2529      /* This may expand to a divergent if/then, so allocate stack space for
2530       * it.
2531       */
2532      ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
2533      break;
2534   case nir_intrinsic_preamble_start_ir3:
2535      dst[0] = ir3_SHPS_MACRO(ctx->block);
2536      ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
2537      break;
2538
2539   case nir_intrinsic_read_invocation_cond_ir3: {
2540      struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2541      struct ir3_instruction *cond = ir3_get_src(ctx, &intr->src[1])[0];
2542      dst[0] = ir3_READ_COND_MACRO(ctx->block, ir3_get_predicate(ctx, cond), 0,
2543                                   src, 0);
2544      dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
2545      dst[0]->srcs[0]->num = regid(REG_P0, 0);
2546      array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
2547      ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
2548      break;
2549   }
2550
2551   case nir_intrinsic_read_first_invocation: {
2552      struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2553      dst[0] = ir3_READ_FIRST_MACRO(ctx->block, src, 0);
2554      dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
2555      ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
2556      break;
2557   }
2558
2559   case nir_intrinsic_ballot: {
2560      struct ir3_instruction *ballot;
2561      unsigned components = intr->dest.ssa.num_components;
2562      if (nir_src_is_const(intr->src[0]) && nir_src_as_bool(intr->src[0])) {
2563         /* ballot(true) is just MOVMSK */
2564         ballot = ir3_MOVMSK(ctx->block, components);
2565      } else {
2566         struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2567         struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
2568         ballot = ir3_BALLOT_MACRO(ctx->block, pred, components);
2569         ballot->srcs[0]->num = regid(REG_P0, 0);
2570         array_insert(ctx->ir, ctx->ir->predicates, ballot);
2571         ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
2572      }
2573
2574      ballot->barrier_class = IR3_BARRIER_ACTIVE_FIBERS_R;
2575      ballot->barrier_conflict = IR3_BARRIER_ACTIVE_FIBERS_W;
2576
2577      ir3_split_dest(ctx->block, dst, ballot, 0, components);
2578      break;
2579   }
2580
2581   case nir_intrinsic_quad_broadcast: {
2582      struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2583      struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[1])[0];
2584
2585      type_t dst_type = type_uint_size(nir_dest_bit_size(intr->dest));
2586
2587      if (dst_type != TYPE_U32)
2588         idx = ir3_COV(ctx->block, idx, TYPE_U32, dst_type);
2589
2590      dst[0] = ir3_QUAD_SHUFFLE_BRCST(ctx->block, src, 0, idx, 0);
2591      dst[0]->cat5.type = dst_type;
2592      break;
2593   }
2594
2595   case nir_intrinsic_quad_swap_horizontal: {
2596      struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2597      dst[0] = ir3_QUAD_SHUFFLE_HORIZ(ctx->block, src, 0);
2598      dst[0]->cat5.type = type_uint_size(nir_dest_bit_size(intr->dest));
2599      break;
2600   }
2601
2602   case nir_intrinsic_quad_swap_vertical: {
2603      struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2604      dst[0] = ir3_QUAD_SHUFFLE_VERT(ctx->block, src, 0);
2605      dst[0]->cat5.type = type_uint_size(nir_dest_bit_size(intr->dest));
2606      break;
2607   }
2608
2609   case nir_intrinsic_quad_swap_diagonal: {
2610      struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2611      dst[0] = ir3_QUAD_SHUFFLE_DIAG(ctx->block, src, 0);
2612      dst[0]->cat5.type = type_uint_size(nir_dest_bit_size(intr->dest));
2613      break;
2614   }
2615
2616   case nir_intrinsic_load_shared_ir3:
2617      emit_intrinsic_load_shared_ir3(ctx, intr, dst);
2618      break;
2619   case nir_intrinsic_store_shared_ir3:
2620      emit_intrinsic_store_shared_ir3(ctx, intr);
2621      break;
2622   case nir_intrinsic_bindless_resource_ir3:
2623      dst[0] = ir3_get_src(ctx, &intr->src[0])[0];
2624      break;
2625   case nir_intrinsic_global_atomic_add_ir3:
2626   case nir_intrinsic_global_atomic_imin_ir3:
2627   case nir_intrinsic_global_atomic_umin_ir3:
2628   case nir_intrinsic_global_atomic_imax_ir3:
2629   case nir_intrinsic_global_atomic_umax_ir3:
2630   case nir_intrinsic_global_atomic_and_ir3:
2631   case nir_intrinsic_global_atomic_or_ir3:
2632   case nir_intrinsic_global_atomic_xor_ir3:
2633   case nir_intrinsic_global_atomic_exchange_ir3:
2634   case nir_intrinsic_global_atomic_comp_swap_ir3: {
2635      dst[0] = ctx->funcs->emit_intrinsic_atomic_global(ctx, intr);
2636      break;
2637   }
2638
2639   case nir_intrinsic_reduce:
2640   case nir_intrinsic_inclusive_scan:
2641   case nir_intrinsic_exclusive_scan:
2642      dst[0] = emit_intrinsic_reduce(ctx, intr);
2643      break;
2644
2645   case nir_intrinsic_preamble_end_ir3: {
2646      struct ir3_instruction *instr = ir3_SHPE(ctx->block);
2647      instr->barrier_class = instr->barrier_conflict = IR3_BARRIER_CONST_W;
2648      array_insert(b, b->keeps, instr);
2649      break;
2650   }
2651   case nir_intrinsic_store_uniform_ir3: {
2652      unsigned components = nir_src_num_components(intr->src[0]);
2653      unsigned dst = nir_intrinsic_base(intr);
2654      unsigned dst_lo = dst & 0xff;
2655      unsigned dst_hi = dst >> 8;
2656
2657      struct ir3_instruction *src =
2658         ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), components);
2659      struct ir3_instruction *a1 = NULL;
2660      if (dst_hi) {
2661         /* Encode only the high part of the destination in a1.x to increase the
2662          * chance that we can reuse the a1.x value in subsequent stc
2663          * instructions.
2664          */
2665         a1 = ir3_get_addr1(ctx, dst_hi << 8);
2666      }
2667
2668      struct ir3_instruction *stc =
2669         ir3_STC(ctx->block, create_immed(b, dst_lo),  0, src, 0);
2670      stc->cat6.iim_val = components;
2671      stc->cat6.type = TYPE_U32;
2672      stc->barrier_conflict = IR3_BARRIER_CONST_W;
2673      if (a1) {
2674         ir3_instr_set_address(stc, a1);
2675         stc->flags |= IR3_INSTR_A1EN;
2676      }
2677      array_insert(b, b->keeps, stc);
2678      break;
2679   }
2680   default:
2681      ir3_context_error(ctx, "Unhandled intrinsic type: %s\n",
2682                        nir_intrinsic_infos[intr->intrinsic].name);
2683      break;
2684   }
2685
2686   if (info->has_dest)
2687      ir3_put_dst(ctx, &intr->dest);
2688}
2689
2690static void
2691emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr)
2692{
2693   struct ir3_instruction **dst =
2694      ir3_get_dst_ssa(ctx, &instr->def, instr->def.num_components);
2695   unsigned bit_size = ir3_bitsize(ctx, instr->def.bit_size);
2696
2697   if (bit_size <= 8) {
2698      for (int i = 0; i < instr->def.num_components; i++)
2699         dst[i] = create_immed_typed(ctx->block, instr->value[i].u8, TYPE_U8);
2700   } else if (bit_size <= 16) {
2701      for (int i = 0; i < instr->def.num_components; i++)
2702         dst[i] = create_immed_typed(ctx->block, instr->value[i].u16, TYPE_U16);
2703   } else {
2704      for (int i = 0; i < instr->def.num_components; i++)
2705         dst[i] = create_immed_typed(ctx->block, instr->value[i].u32, TYPE_U32);
2706   }
2707}
2708
2709static void
2710emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef)
2711{
2712   struct ir3_instruction **dst =
2713      ir3_get_dst_ssa(ctx, &undef->def, undef->def.num_components);
2714   type_t type = utype_for_size(ir3_bitsize(ctx, undef->def.bit_size));
2715
2716   /* backend doesn't want undefined instructions, so just plug
2717    * in 0.0..
2718    */
2719   for (int i = 0; i < undef->def.num_components; i++)
2720      dst[i] = create_immed_typed(ctx->block, fui(0.0), type);
2721}
2722
2723/*
2724 * texture fetch/sample instructions:
2725 */
2726
2727static type_t
2728get_tex_dest_type(nir_tex_instr *tex)
2729{
2730   type_t type;
2731
2732   switch (tex->dest_type) {
2733   case nir_type_float32:
2734      return TYPE_F32;
2735   case nir_type_float16:
2736      return TYPE_F16;
2737   case nir_type_int32:
2738      return TYPE_S32;
2739   case nir_type_int16:
2740      return TYPE_S16;
2741   case nir_type_bool32:
2742   case nir_type_uint32:
2743      return TYPE_U32;
2744   case nir_type_bool16:
2745   case nir_type_uint16:
2746      return TYPE_U16;
2747   case nir_type_invalid:
2748   default:
2749      unreachable("bad dest_type");
2750   }
2751
2752   return type;
2753}
2754
2755static void
2756tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
2757{
2758   unsigned coords =
2759      glsl_get_sampler_dim_coordinate_components(tex->sampler_dim);
2760   unsigned flags = 0;
2761
2762   /* note: would use tex->coord_components.. except txs.. also,
2763    * since array index goes after shadow ref, we don't want to
2764    * count it:
2765    */
2766   if (coords == 3)
2767      flags |= IR3_INSTR_3D;
2768
2769   if (tex->is_shadow && tex->op != nir_texop_lod)
2770      flags |= IR3_INSTR_S;
2771
2772   if (tex->is_array && tex->op != nir_texop_lod)
2773      flags |= IR3_INSTR_A;
2774
2775   *flagsp = flags;
2776   *coordsp = coords;
2777}
2778
2779/* Gets the sampler/texture idx as a hvec2.  Which could either be dynamic
2780 * or immediate (in which case it will get lowered later to a non .s2en
2781 * version of the tex instruction which encode tex/samp as immediates:
2782 */
2783static struct tex_src_info
2784get_tex_samp_tex_src(struct ir3_context *ctx, nir_tex_instr *tex)
2785{
2786   struct ir3_block *b = ctx->block;
2787   struct tex_src_info info = {0};
2788   int texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
2789   int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
2790   struct ir3_instruction *texture, *sampler;
2791
2792   if (texture_idx >= 0 || sampler_idx >= 0) {
2793      /* Bindless case */
2794      info.flags |= IR3_INSTR_B;
2795
2796      if (tex->texture_non_uniform || tex->sampler_non_uniform)
2797         info.flags |= IR3_INSTR_NONUNIF;
2798
2799      /* Gather information required to determine which encoding to
2800       * choose as well as for prefetch.
2801       */
2802      nir_intrinsic_instr *bindless_tex = NULL;
2803      bool tex_const;
2804      if (texture_idx >= 0) {
2805         ctx->so->bindless_tex = true;
2806         bindless_tex = ir3_bindless_resource(tex->src[texture_idx].src);
2807         assert(bindless_tex);
2808         info.tex_base = nir_intrinsic_desc_set(bindless_tex);
2809         tex_const = nir_src_is_const(bindless_tex->src[0]);
2810         if (tex_const)
2811            info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
2812      } else {
2813         /* To simplify some of the logic below, assume the index is
2814          * constant 0 when it's not enabled.
2815          */
2816         tex_const = true;
2817         info.tex_idx = 0;
2818      }
2819      nir_intrinsic_instr *bindless_samp = NULL;
2820      bool samp_const;
2821      if (sampler_idx >= 0) {
2822         ctx->so->bindless_samp = true;
2823         bindless_samp = ir3_bindless_resource(tex->src[sampler_idx].src);
2824         assert(bindless_samp);
2825         info.samp_base = nir_intrinsic_desc_set(bindless_samp);
2826         samp_const = nir_src_is_const(bindless_samp->src[0]);
2827         if (samp_const)
2828            info.samp_idx = nir_src_as_uint(bindless_samp->src[0]);
2829      } else {
2830         samp_const = true;
2831         info.samp_idx = 0;
2832      }
2833
2834      /* Choose encoding. */
2835      if (tex_const && samp_const && info.tex_idx < 256 &&
2836          info.samp_idx < 256) {
2837         if (info.tex_idx < 16 && info.samp_idx < 16 &&
2838             (!bindless_tex || !bindless_samp ||
2839              info.tex_base == info.samp_base)) {
2840            /* Everything fits within the instruction */
2841            info.base = info.tex_base;
2842         } else {
2843            info.base = info.tex_base;
2844            info.a1_val = info.tex_idx << 3 | info.samp_base;
2845            info.flags |= IR3_INSTR_A1EN;
2846         }
2847         info.samp_tex = NULL;
2848      } else {
2849         info.flags |= IR3_INSTR_S2EN;
2850         /* In the indirect case, we only use a1.x to store the sampler
2851          * base if it differs from the texture base.
2852          */
2853         if (!bindless_tex || !bindless_samp ||
2854             info.tex_base == info.samp_base) {
2855            info.base = info.tex_base;
2856         } else {
2857            info.base = info.tex_base;
2858            info.a1_val = info.samp_base;
2859            info.flags |= IR3_INSTR_A1EN;
2860         }
2861
2862         /* Note: the indirect source is now a vec2 instead of hvec2, and
2863          * for some reason the texture and sampler are swapped.
2864          */
2865         struct ir3_instruction *texture, *sampler;
2866
2867         if (bindless_tex) {
2868            texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
2869         } else {
2870            texture = create_immed(b, 0);
2871         }
2872
2873         if (bindless_samp) {
2874            sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
2875         } else {
2876            sampler = create_immed(b, 0);
2877         }
2878         info.samp_tex = ir3_collect(b, texture, sampler);
2879      }
2880   } else {
2881      info.flags |= IR3_INSTR_S2EN;
2882      texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset);
2883      sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset);
2884      if (texture_idx >= 0) {
2885         texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
2886         texture = ir3_COV(ctx->block, texture, TYPE_U32, TYPE_U16);
2887      } else {
2888         /* TODO what to do for dynamic case? I guess we only need the
2889          * max index for astc srgb workaround so maybe not a problem
2890          * to worry about if we don't enable indirect samplers for
2891          * a4xx?
2892          */
2893         ctx->max_texture_index =
2894            MAX2(ctx->max_texture_index, tex->texture_index);
2895         texture = create_immed_typed(ctx->block, tex->texture_index, TYPE_U16);
2896         info.tex_idx = tex->texture_index;
2897      }
2898
2899      if (sampler_idx >= 0) {
2900         sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
2901         sampler = ir3_COV(ctx->block, sampler, TYPE_U32, TYPE_U16);
2902      } else {
2903         sampler = create_immed_typed(ctx->block, tex->sampler_index, TYPE_U16);
2904         info.samp_idx = tex->texture_index;
2905      }
2906
2907      info.samp_tex = ir3_collect(b, sampler, texture);
2908   }
2909
2910   return info;
2911}
2912
2913static void
2914emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
2915{
2916   struct ir3_block *b = ctx->block;
2917   struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
2918   struct ir3_instruction *const *coord, *const *off, *const *ddx, *const *ddy;
2919   struct ir3_instruction *lod, *compare, *proj, *sample_index;
2920   struct tex_src_info info = {0};
2921   bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
2922   unsigned i, coords, flags, ncomp;
2923   unsigned nsrc0 = 0, nsrc1 = 0;
2924   type_t type;
2925   opc_t opc = 0;
2926
2927   ncomp = nir_dest_num_components(tex->dest);
2928
2929   coord = off = ddx = ddy = NULL;
2930   lod = proj = compare = sample_index = NULL;
2931
2932   dst = ir3_get_dst(ctx, &tex->dest, ncomp);
2933
2934   for (unsigned i = 0; i < tex->num_srcs; i++) {
2935      switch (tex->src[i].src_type) {
2936      case nir_tex_src_coord:
2937         coord = ir3_get_src(ctx, &tex->src[i].src);
2938         break;
2939      case nir_tex_src_bias:
2940         lod = ir3_get_src(ctx, &tex->src[i].src)[0];
2941         has_bias = true;
2942         break;
2943      case nir_tex_src_lod:
2944         lod = ir3_get_src(ctx, &tex->src[i].src)[0];
2945         has_lod = true;
2946         break;
2947      case nir_tex_src_comparator: /* shadow comparator */
2948         compare = ir3_get_src(ctx, &tex->src[i].src)[0];
2949         break;
2950      case nir_tex_src_projector:
2951         proj = ir3_get_src(ctx, &tex->src[i].src)[0];
2952         has_proj = true;
2953         break;
2954      case nir_tex_src_offset:
2955         off = ir3_get_src(ctx, &tex->src[i].src);
2956         has_off = true;
2957         break;
2958      case nir_tex_src_ddx:
2959         ddx = ir3_get_src(ctx, &tex->src[i].src);
2960         break;
2961      case nir_tex_src_ddy:
2962         ddy = ir3_get_src(ctx, &tex->src[i].src);
2963         break;
2964      case nir_tex_src_ms_index:
2965         sample_index = ir3_get_src(ctx, &tex->src[i].src)[0];
2966         break;
2967      case nir_tex_src_texture_offset:
2968      case nir_tex_src_sampler_offset:
2969      case nir_tex_src_texture_handle:
2970      case nir_tex_src_sampler_handle:
2971         /* handled in get_tex_samp_src() */
2972         break;
2973      default:
2974         ir3_context_error(ctx, "Unhandled NIR tex src type: %d\n",
2975                           tex->src[i].src_type);
2976         return;
2977      }
2978   }
2979
2980   switch (tex->op) {
2981   case nir_texop_tex_prefetch:
2982      compile_assert(ctx, !has_bias);
2983      compile_assert(ctx, !has_lod);
2984      compile_assert(ctx, !compare);
2985      compile_assert(ctx, !has_proj);
2986      compile_assert(ctx, !has_off);
2987      compile_assert(ctx, !ddx);
2988      compile_assert(ctx, !ddy);
2989      compile_assert(ctx, !sample_index);
2990      compile_assert(
2991         ctx, nir_tex_instr_src_index(tex, nir_tex_src_texture_offset) < 0);
2992      compile_assert(
2993         ctx, nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset) < 0);
2994
2995      if (ctx->so->num_sampler_prefetch < ctx->prefetch_limit) {
2996         opc = OPC_META_TEX_PREFETCH;
2997         ctx->so->num_sampler_prefetch++;
2998         break;
2999      }
3000      FALLTHROUGH;
3001   case nir_texop_tex:
3002      opc = has_lod ? OPC_SAML : OPC_SAM;
3003      break;
3004   case nir_texop_txb:
3005      opc = OPC_SAMB;
3006      break;
3007   case nir_texop_txl:
3008      opc = OPC_SAML;
3009      break;
3010   case nir_texop_txd:
3011      opc = OPC_SAMGQ;
3012      break;
3013   case nir_texop_txf:
3014      opc = OPC_ISAML;
3015      break;
3016   case nir_texop_lod:
3017      opc = OPC_GETLOD;
3018      break;
3019   case nir_texop_tg4:
3020      switch (tex->component) {
3021      case 0:
3022         opc = OPC_GATHER4R;
3023         break;
3024      case 1:
3025         opc = OPC_GATHER4G;
3026         break;
3027      case 2:
3028         opc = OPC_GATHER4B;
3029         break;
3030      case 3:
3031         opc = OPC_GATHER4A;
3032         break;
3033      }
3034      break;
3035   case nir_texop_txf_ms_fb:
3036   case nir_texop_txf_ms:
3037      opc = OPC_ISAMM;
3038      break;
3039   default:
3040      ir3_context_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
3041      return;
3042   }
3043
3044   tex_info(tex, &flags, &coords);
3045
3046   /*
3047    * lay out the first argument in the proper order:
3048    *  - actual coordinates first
3049    *  - shadow reference
3050    *  - array index
3051    *  - projection w
3052    *  - starting at offset 4, dpdx.xy, dpdy.xy
3053    *
3054    * bias/lod go into the second arg
3055    */
3056
3057   /* insert tex coords: */
3058   for (i = 0; i < coords; i++)
3059      src0[i] = coord[i];
3060
3061   nsrc0 = i;
3062
3063   type_t coord_pad_type = is_half(coord[0]) ? TYPE_U16 : TYPE_U32;
3064   /* scale up integer coords for TXF based on the LOD */
3065   if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) {
3066      assert(has_lod);
3067      for (i = 0; i < coords; i++)
3068         src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
3069   }
3070
3071   if (coords == 1) {
3072      /* hw doesn't do 1d, so we treat it as 2d with
3073       * height of 1, and patch up the y coord.
3074       */
3075      if (is_isam(opc)) {
3076         src0[nsrc0++] = create_immed_typed(b, 0, coord_pad_type);
3077      } else if (is_half(coord[0])) {
3078         src0[nsrc0++] = create_immed_typed(b, _mesa_float_to_half(0.5), coord_pad_type);
3079      } else {
3080         src0[nsrc0++] = create_immed_typed(b, fui(0.5), coord_pad_type);
3081      }
3082   }
3083
3084   if (tex->is_shadow && tex->op != nir_texop_lod)
3085      src0[nsrc0++] = compare;
3086
3087   if (tex->is_array && tex->op != nir_texop_lod)
3088      src0[nsrc0++] = coord[coords];
3089
3090   if (has_proj) {
3091      src0[nsrc0++] = proj;
3092      flags |= IR3_INSTR_P;
3093   }
3094
3095   /* pad to 4, then ddx/ddy: */
3096   if (tex->op == nir_texop_txd) {
3097      while (nsrc0 < 4)
3098         src0[nsrc0++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3099      for (i = 0; i < coords; i++)
3100         src0[nsrc0++] = ddx[i];
3101      if (coords < 2)
3102         src0[nsrc0++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3103      for (i = 0; i < coords; i++)
3104         src0[nsrc0++] = ddy[i];
3105      if (coords < 2)
3106         src0[nsrc0++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3107   }
3108
3109   /* NOTE a3xx (and possibly a4xx?) might be different, using isaml
3110    * with scaled x coord according to requested sample:
3111    */
3112   if (opc == OPC_ISAMM) {
3113      if (ctx->compiler->txf_ms_with_isaml) {
3114         /* the samples are laid out in x dimension as
3115          *     0 1 2 3
3116          * x_ms = (x << ms) + sample_index;
3117          */
3118         struct ir3_instruction *ms;
3119         ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3);
3120
3121         src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0);
3122         src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0);
3123
3124         opc = OPC_ISAML;
3125      } else {
3126         src0[nsrc0++] = sample_index;
3127      }
3128   }
3129
3130   /*
3131    * second argument (if applicable):
3132    *  - offsets
3133    *  - lod
3134    *  - bias
3135    */
3136   if (has_off | has_lod | has_bias) {
3137      if (has_off) {
3138         unsigned off_coords = coords;
3139         if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
3140            off_coords--;
3141         for (i = 0; i < off_coords; i++)
3142            src1[nsrc1++] = off[i];
3143         if (off_coords < 2)
3144            src1[nsrc1++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3145         flags |= IR3_INSTR_O;
3146      }
3147
3148      if (has_lod | has_bias)
3149         src1[nsrc1++] = lod;
3150   }
3151
3152   type = get_tex_dest_type(tex);
3153
3154   if (opc == OPC_GETLOD)
3155      type = TYPE_S32;
3156
3157   if (tex->op == nir_texop_txf_ms_fb) {
3158      /* only expect a single txf_ms_fb per shader: */
3159      compile_assert(ctx, !ctx->so->fb_read);
3160      compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT);
3161
3162      ctx->so->fb_read = true;
3163      info.samp_tex = ir3_collect(
3164         b, create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16),
3165         create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16));
3166      info.flags = IR3_INSTR_S2EN;
3167
3168      ctx->so->num_samp++;
3169   } else {
3170      info = get_tex_samp_tex_src(ctx, tex);
3171   }
3172
3173   bool tg4_swizzle_fixup = false;
3174   if (tex->op == nir_texop_tg4 && ctx->compiler->gen == 4 &&
3175         ctx->sampler_swizzles[tex->texture_index] != 0x688 /* rgba */) {
3176      uint16_t swizzles = ctx->sampler_swizzles[tex->texture_index];
3177      uint16_t swizzle = (swizzles >> (tex->component * 3)) & 7;
3178      if (swizzle > 3) {
3179         /* this would mean that we can just return 0 / 1, no texturing
3180          * necessary
3181          */
3182         struct ir3_instruction *imm = create_immed(b,
3183               type_float(type) ? fui(swizzle - 4) : (swizzle - 4));
3184         for (int i = 0; i < 4; i++)
3185            dst[i] = imm;
3186         ir3_put_dst(ctx, &tex->dest);
3187         return;
3188      }
3189      opc = OPC_GATHER4R + swizzle;
3190      tg4_swizzle_fixup = true;
3191   }
3192
3193   struct ir3_instruction *col0 = ir3_create_collect(b, src0, nsrc0);
3194   struct ir3_instruction *col1 = ir3_create_collect(b, src1, nsrc1);
3195
3196   if (opc == OPC_META_TEX_PREFETCH) {
3197      int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
3198
3199      compile_assert(ctx, tex->src[idx].src.is_ssa);
3200
3201      sam = ir3_SAM(ctx->in_block, opc, type, MASK(ncomp), 0, NULL,
3202                    get_barycentric(ctx, IJ_PERSP_PIXEL), 0);
3203      sam->prefetch.input_offset = ir3_nir_coord_offset(tex->src[idx].src.ssa);
3204      /* make sure not to add irrelevant flags like S2EN */
3205      sam->flags = flags | (info.flags & IR3_INSTR_B);
3206      sam->prefetch.tex = info.tex_idx;
3207      sam->prefetch.samp = info.samp_idx;
3208      sam->prefetch.tex_base = info.tex_base;
3209      sam->prefetch.samp_base = info.samp_base;
3210   } else {
3211      info.flags |= flags;
3212      sam = emit_sam(ctx, opc, info, type, MASK(ncomp), col0, col1);
3213   }
3214
3215   if (tg4_swizzle_fixup) {
3216      /* TODO: fix-up for ASTC when alpha is selected? */
3217      array_insert(ctx->ir, ctx->ir->tg4, sam);
3218
3219      ir3_split_dest(b, dst, sam, 0, 4);
3220
3221      uint8_t tex_bits = ctx->sampler_swizzles[tex->texture_index] >> 12;
3222      if (!type_float(type) && tex_bits != 3 /* 32bpp */ &&
3223            tex_bits != 0 /* key unset */) {
3224         uint8_t bits = 0;
3225         switch (tex_bits) {
3226         case 1: /* 8bpp */
3227            bits = 8;
3228            break;
3229         case 2: /* 16bpp */
3230            bits = 16;
3231            break;
3232         case 4: /* 10bpp or 2bpp for alpha */
3233            if (opc == OPC_GATHER4A)
3234               bits = 2;
3235            else
3236               bits = 10;
3237            break;
3238         default:
3239            assert(0);
3240         }
3241
3242         sam->cat5.type = TYPE_F32;
3243         for (int i = 0; i < 4; i++) {
3244            /* scale and offset the unorm data */
3245            dst[i] = ir3_MAD_F32(b, dst[i], 0, create_immed(b, fui((1 << bits) - 1)), 0, create_immed(b, fui(0.5f)), 0);
3246            /* convert the scaled value to integer */
3247            dst[i] = ir3_COV(b, dst[i], TYPE_F32, TYPE_U32);
3248            /* sign extend for signed values */
3249            if (type == TYPE_S32) {
3250               dst[i] = ir3_SHL_B(b, dst[i], 0, create_immed(b, 32 - bits), 0);
3251               dst[i] = ir3_ASHR_B(b, dst[i], 0, create_immed(b, 32 - bits), 0);
3252            }
3253         }
3254      }
3255   } else if ((ctx->astc_srgb & (1 << tex->texture_index)) &&
3256       tex->op != nir_texop_tg4 && /* leave out tg4, unless it's on alpha? */
3257       !nir_tex_instr_is_query(tex)) {
3258      assert(opc != OPC_META_TEX_PREFETCH);
3259
3260      /* only need first 3 components: */
3261      sam->dsts[0]->wrmask = 0x7;
3262      ir3_split_dest(b, dst, sam, 0, 3);
3263
3264      /* we need to sample the alpha separately with a non-SRGB
3265       * texture state:
3266       */
3267      sam = ir3_SAM(b, opc, type, 0b1000, flags | info.flags, info.samp_tex,
3268                    col0, col1);
3269
3270      array_insert(ctx->ir, ctx->ir->astc_srgb, sam);
3271
3272      /* fixup .w component: */
3273      ir3_split_dest(b, &dst[3], sam, 3, 1);
3274   } else {
3275      /* normal (non-workaround) case: */
3276      ir3_split_dest(b, dst, sam, 0, ncomp);
3277   }
3278
3279   /* GETLOD returns results in 4.8 fixed point */
3280   if (opc == OPC_GETLOD) {
3281      bool half = nir_dest_bit_size(tex->dest) == 16;
3282      struct ir3_instruction *factor =
3283         half ? create_immed_typed(b, _mesa_float_to_half(1.0 / 256), TYPE_F16)
3284              : create_immed(b, fui(1.0 / 256));
3285
3286      for (i = 0; i < 2; i++) {
3287         dst[i] = ir3_MUL_F(
3288            b, ir3_COV(b, dst[i], TYPE_S32, half ? TYPE_F16 : TYPE_F32), 0,
3289            factor, 0);
3290      }
3291   }
3292
3293   ir3_put_dst(ctx, &tex->dest);
3294}
3295
3296static void
3297emit_tex_info(struct ir3_context *ctx, nir_tex_instr *tex, unsigned idx)
3298{
3299   struct ir3_block *b = ctx->block;
3300   struct ir3_instruction **dst, *sam;
3301   type_t dst_type = get_tex_dest_type(tex);
3302   struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
3303
3304   dst = ir3_get_dst(ctx, &tex->dest, 1);
3305
3306   sam = emit_sam(ctx, OPC_GETINFO, info, dst_type, 1 << idx, NULL, NULL);
3307
3308   /* even though there is only one component, since it ends
3309    * up in .y/.z/.w rather than .x, we need a split_dest()
3310    */
3311   ir3_split_dest(b, dst, sam, idx, 1);
3312
3313   /* The # of levels comes from getinfo.z. We need to add 1 to it, since
3314    * the value in TEX_CONST_0 is zero-based.
3315    */
3316   if (ctx->compiler->levels_add_one)
3317      dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
3318
3319   ir3_put_dst(ctx, &tex->dest);
3320}
3321
3322static void
3323emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex)
3324{
3325   struct ir3_block *b = ctx->block;
3326   struct ir3_instruction **dst, *sam;
3327   struct ir3_instruction *lod;
3328   unsigned flags, coords;
3329   type_t dst_type = get_tex_dest_type(tex);
3330   struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
3331
3332   tex_info(tex, &flags, &coords);
3333   info.flags |= flags;
3334
3335   /* Actually we want the number of dimensions, not coordinates. This
3336    * distinction only matters for cubes.
3337    */
3338   if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
3339      coords = 2;
3340
3341   dst = ir3_get_dst(ctx, &tex->dest, 4);
3342
3343   int lod_idx = nir_tex_instr_src_index(tex, nir_tex_src_lod);
3344   compile_assert(ctx, lod_idx >= 0);
3345
3346   lod = ir3_get_src(ctx, &tex->src[lod_idx].src)[0];
3347
3348   if (tex->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
3349      sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
3350   } else {
3351      /*
3352       * The maximum value which OPC_GETSIZE could return for one dimension
3353       * is 0x007ff0, however sampler buffer could be much bigger.
3354       * Blob uses OPC_GETBUF for them.
3355       */
3356      sam = emit_sam(ctx, OPC_GETBUF, info, dst_type, 0b1111, NULL, NULL);
3357   }
3358
3359   ir3_split_dest(b, dst, sam, 0, 4);
3360
3361   /* Array size actually ends up in .w rather than .z. This doesn't
3362    * matter for miplevel 0, but for higher mips the value in z is
3363    * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
3364    * returned, which means that we have to add 1 to it for arrays.
3365    */
3366   if (tex->is_array) {
3367      if (ctx->compiler->levels_add_one) {
3368         dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
3369      } else {
3370         dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
3371      }
3372   }
3373
3374   ir3_put_dst(ctx, &tex->dest);
3375}
3376
3377/* phi instructions are left partially constructed.  We don't resolve
3378 * their srcs until the end of the shader, since (eg. loops) one of
3379 * the phi's srcs might be defined after the phi due to back edges in
3380 * the CFG.
3381 */
3382static void
3383emit_phi(struct ir3_context *ctx, nir_phi_instr *nphi)
3384{
3385   struct ir3_instruction *phi, **dst;
3386
3387   /* NOTE: phi's should be lowered to scalar at this point */
3388   compile_assert(ctx, nphi->dest.ssa.num_components == 1);
3389
3390   dst = ir3_get_dst(ctx, &nphi->dest, 1);
3391
3392   phi = ir3_instr_create(ctx->block, OPC_META_PHI, 1,
3393                          exec_list_length(&nphi->srcs));
3394   __ssa_dst(phi);
3395   phi->phi.nphi = nphi;
3396
3397   dst[0] = phi;
3398
3399   ir3_put_dst(ctx, &nphi->dest);
3400}
3401
3402static struct ir3_block *get_block(struct ir3_context *ctx,
3403                                   const nir_block *nblock);
3404
3405static struct ir3_instruction *
3406read_phi_src(struct ir3_context *ctx, struct ir3_block *blk,
3407             struct ir3_instruction *phi, nir_phi_instr *nphi)
3408{
3409   if (!blk->nblock) {
3410      struct ir3_instruction *continue_phi =
3411         ir3_instr_create(blk, OPC_META_PHI, 1, blk->predecessors_count);
3412      __ssa_dst(continue_phi)->flags = phi->dsts[0]->flags;
3413
3414      for (unsigned i = 0; i < blk->predecessors_count; i++) {
3415         struct ir3_instruction *src =
3416            read_phi_src(ctx, blk->predecessors[i], phi, nphi);
3417         if (src)
3418            __ssa_src(continue_phi, src, 0);
3419         else
3420            ir3_src_create(continue_phi, INVALID_REG, phi->dsts[0]->flags);
3421      }
3422
3423      return continue_phi;
3424   }
3425
3426   nir_foreach_phi_src (nsrc, nphi) {
3427      if (blk->nblock == nsrc->pred) {
3428         if (nsrc->src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
3429            /* Create an ir3 undef */
3430            return NULL;
3431         } else {
3432            return ir3_get_src(ctx, &nsrc->src)[0];
3433         }
3434      }
3435   }
3436
3437   unreachable("couldn't find phi node ir3 block");
3438   return NULL;
3439}
3440
3441static void
3442resolve_phis(struct ir3_context *ctx, struct ir3_block *block)
3443{
3444   foreach_instr (phi, &block->instr_list) {
3445      if (phi->opc != OPC_META_PHI)
3446         break;
3447
3448      nir_phi_instr *nphi = phi->phi.nphi;
3449
3450      if (!nphi) /* skip continue phis created above */
3451         continue;
3452
3453      for (unsigned i = 0; i < block->predecessors_count; i++) {
3454         struct ir3_block *pred = block->predecessors[i];
3455         struct ir3_instruction *src = read_phi_src(ctx, pred, phi, nphi);
3456         if (src) {
3457            __ssa_src(phi, src, 0);
3458         } else {
3459            /* Create an ir3 undef */
3460            ir3_src_create(phi, INVALID_REG, phi->dsts[0]->flags);
3461         }
3462      }
3463   }
3464}
3465
3466static void
3467emit_jump(struct ir3_context *ctx, nir_jump_instr *jump)
3468{
3469   switch (jump->type) {
3470   case nir_jump_break:
3471   case nir_jump_continue:
3472   case nir_jump_return:
3473      /* I *think* we can simply just ignore this, and use the
3474       * successor block link to figure out where we need to
3475       * jump to for break/continue
3476       */
3477      break;
3478   default:
3479      ir3_context_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
3480      break;
3481   }
3482}
3483
3484static void
3485emit_instr(struct ir3_context *ctx, nir_instr *instr)
3486{
3487   switch (instr->type) {
3488   case nir_instr_type_alu:
3489      emit_alu(ctx, nir_instr_as_alu(instr));
3490      break;
3491   case nir_instr_type_deref:
3492      /* ignored, handled as part of the intrinsic they are src to */
3493      break;
3494   case nir_instr_type_intrinsic:
3495      emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
3496      break;
3497   case nir_instr_type_load_const:
3498      emit_load_const(ctx, nir_instr_as_load_const(instr));
3499      break;
3500   case nir_instr_type_ssa_undef:
3501      emit_undef(ctx, nir_instr_as_ssa_undef(instr));
3502      break;
3503   case nir_instr_type_tex: {
3504      nir_tex_instr *tex = nir_instr_as_tex(instr);
3505      /* couple tex instructions get special-cased:
3506       */
3507      switch (tex->op) {
3508      case nir_texop_txs:
3509         emit_tex_txs(ctx, tex);
3510         break;
3511      case nir_texop_query_levels:
3512         emit_tex_info(ctx, tex, 2);
3513         break;
3514      case nir_texop_texture_samples:
3515         emit_tex_info(ctx, tex, 3);
3516         break;
3517      default:
3518         emit_tex(ctx, tex);
3519         break;
3520      }
3521      break;
3522   }
3523   case nir_instr_type_jump:
3524      emit_jump(ctx, nir_instr_as_jump(instr));
3525      break;
3526   case nir_instr_type_phi:
3527      emit_phi(ctx, nir_instr_as_phi(instr));
3528      break;
3529   case nir_instr_type_call:
3530   case nir_instr_type_parallel_copy:
3531      ir3_context_error(ctx, "Unhandled NIR instruction type: %d\n",
3532                        instr->type);
3533      break;
3534   }
3535}
3536
3537static struct ir3_block *
3538get_block(struct ir3_context *ctx, const nir_block *nblock)
3539{
3540   struct ir3_block *block;
3541   struct hash_entry *hentry;
3542
3543   hentry = _mesa_hash_table_search(ctx->block_ht, nblock);
3544   if (hentry)
3545      return hentry->data;
3546
3547   block = ir3_block_create(ctx->ir);
3548   block->nblock = nblock;
3549   _mesa_hash_table_insert(ctx->block_ht, nblock, block);
3550
3551   return block;
3552}
3553
3554static struct ir3_block *
3555get_block_or_continue(struct ir3_context *ctx, const nir_block *nblock)
3556{
3557   struct hash_entry *hentry;
3558
3559   hentry = _mesa_hash_table_search(ctx->continue_block_ht, nblock);
3560   if (hentry)
3561      return hentry->data;
3562
3563   return get_block(ctx, nblock);
3564}
3565
3566static struct ir3_block *
3567create_continue_block(struct ir3_context *ctx, const nir_block *nblock)
3568{
3569   struct ir3_block *block = ir3_block_create(ctx->ir);
3570   block->nblock = NULL;
3571   _mesa_hash_table_insert(ctx->continue_block_ht, nblock, block);
3572   return block;
3573}
3574
3575static void
3576emit_block(struct ir3_context *ctx, nir_block *nblock)
3577{
3578   ctx->block = get_block(ctx, nblock);
3579
3580   list_addtail(&ctx->block->node, &ctx->ir->block_list);
3581
3582   ctx->block->loop_id = ctx->loop_id;
3583   ctx->block->loop_depth = ctx->loop_depth;
3584
3585   /* re-emit addr register in each block if needed: */
3586   for (int i = 0; i < ARRAY_SIZE(ctx->addr0_ht); i++) {
3587      _mesa_hash_table_destroy(ctx->addr0_ht[i], NULL);
3588      ctx->addr0_ht[i] = NULL;
3589   }
3590
3591   _mesa_hash_table_u64_destroy(ctx->addr1_ht);
3592   ctx->addr1_ht = NULL;
3593
3594   nir_foreach_instr (instr, nblock) {
3595      ctx->cur_instr = instr;
3596      emit_instr(ctx, instr);
3597      ctx->cur_instr = NULL;
3598      if (ctx->error)
3599         return;
3600   }
3601
3602   for (int i = 0; i < ARRAY_SIZE(ctx->block->successors); i++) {
3603      if (nblock->successors[i]) {
3604         ctx->block->successors[i] =
3605            get_block_or_continue(ctx, nblock->successors[i]);
3606         ctx->block->physical_successors[i] = ctx->block->successors[i];
3607      }
3608   }
3609
3610   _mesa_hash_table_clear(ctx->sel_cond_conversions, NULL);
3611}
3612
3613static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
3614
3615static void
3616emit_if(struct ir3_context *ctx, nir_if *nif)
3617{
3618   struct ir3_instruction *condition = ir3_get_src(ctx, &nif->condition)[0];
3619
3620   if (condition->opc == OPC_ANY_MACRO && condition->block == ctx->block) {
3621      ctx->block->condition = ssa(condition->srcs[0]);
3622      ctx->block->brtype = IR3_BRANCH_ANY;
3623   } else if (condition->opc == OPC_ALL_MACRO &&
3624              condition->block == ctx->block) {
3625      ctx->block->condition = ssa(condition->srcs[0]);
3626      ctx->block->brtype = IR3_BRANCH_ALL;
3627   } else if (condition->opc == OPC_ELECT_MACRO &&
3628              condition->block == ctx->block) {
3629      ctx->block->condition = NULL;
3630      ctx->block->brtype = IR3_BRANCH_GETONE;
3631   } else if (condition->opc == OPC_SHPS_MACRO &&
3632              condition->block == ctx->block) {
3633      /* TODO: technically this only works if the block is the only user of the
3634       * shps, but we only use it in very constrained scenarios so this should
3635       * be ok.
3636       */
3637      ctx->block->condition = NULL;
3638      ctx->block->brtype = IR3_BRANCH_SHPS;
3639   } else {
3640      ctx->block->condition = ir3_get_predicate(ctx, condition);
3641      ctx->block->brtype = IR3_BRANCH_COND;
3642   }
3643
3644   emit_cf_list(ctx, &nif->then_list);
3645   emit_cf_list(ctx, &nif->else_list);
3646
3647   struct ir3_block *last_then = get_block(ctx, nir_if_last_then_block(nif));
3648   struct ir3_block *first_else = get_block(ctx, nir_if_first_else_block(nif));
3649   assert(last_then->physical_successors[0] &&
3650          !last_then->physical_successors[1]);
3651   last_then->physical_successors[1] = first_else;
3652
3653   struct ir3_block *last_else = get_block(ctx, nir_if_last_else_block(nif));
3654   struct ir3_block *after_if =
3655      get_block(ctx, nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node)));
3656   assert(last_else->physical_successors[0] &&
3657          !last_else->physical_successors[1]);
3658   if (after_if != last_else->physical_successors[0])
3659      last_else->physical_successors[1] = after_if;
3660}
3661
3662static void
3663emit_loop(struct ir3_context *ctx, nir_loop *nloop)
3664{
3665   unsigned old_loop_id = ctx->loop_id;
3666   ctx->loop_id = ctx->so->loops + 1;
3667   ctx->loop_depth++;
3668
3669   struct nir_block *nstart = nir_loop_first_block(nloop);
3670   struct ir3_block *continue_blk = NULL;
3671
3672   /* There's always one incoming edge from outside the loop, and if there
3673    * are more than two backedges from inside the loop (so more than 2 total
3674    * edges) then we need to create a continue block after the loop to ensure
3675    * that control reconverges at the end of each loop iteration.
3676    */
3677   if (nstart->predecessors->entries > 2) {
3678      continue_blk = create_continue_block(ctx, nstart);
3679   }
3680
3681   emit_cf_list(ctx, &nloop->body);
3682
3683   if (continue_blk) {
3684      struct ir3_block *start = get_block(ctx, nstart);
3685      continue_blk->successors[0] = start;
3686      continue_blk->physical_successors[0] = start;
3687      continue_blk->loop_id = ctx->loop_id;
3688      continue_blk->loop_depth = ctx->loop_depth;
3689      list_addtail(&continue_blk->node, &ctx->ir->block_list);
3690   }
3691
3692   ctx->so->loops++;
3693   ctx->loop_depth--;
3694   ctx->loop_id = old_loop_id;
3695}
3696
3697static void
3698stack_push(struct ir3_context *ctx)
3699{
3700   ctx->stack++;
3701   ctx->max_stack = MAX2(ctx->max_stack, ctx->stack);
3702}
3703
3704static void
3705stack_pop(struct ir3_context *ctx)
3706{
3707   compile_assert(ctx, ctx->stack > 0);
3708   ctx->stack--;
3709}
3710
3711static void
3712emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
3713{
3714   foreach_list_typed (nir_cf_node, node, node, list) {
3715      switch (node->type) {
3716      case nir_cf_node_block:
3717         emit_block(ctx, nir_cf_node_as_block(node));
3718         break;
3719      case nir_cf_node_if:
3720         stack_push(ctx);
3721         emit_if(ctx, nir_cf_node_as_if(node));
3722         stack_pop(ctx);
3723         break;
3724      case nir_cf_node_loop:
3725         stack_push(ctx);
3726         emit_loop(ctx, nir_cf_node_as_loop(node));
3727         stack_pop(ctx);
3728         break;
3729      case nir_cf_node_function:
3730         ir3_context_error(ctx, "TODO\n");
3731         break;
3732      }
3733   }
3734}
3735
3736/* emit stream-out code.  At this point, the current block is the original
3737 * (nir) end block, and nir ensures that all flow control paths terminate
3738 * into the end block.  We re-purpose the original end block to generate
3739 * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
3740 * block holding stream-out write instructions, followed by the new end
3741 * block:
3742 *
3743 *   blockOrigEnd {
3744 *      p0.x = (vtxcnt < maxvtxcnt)
3745 *      // succs: blockStreamOut, blockNewEnd
3746 *   }
3747 *   blockStreamOut {
3748 *      // preds: blockOrigEnd
3749 *      ... stream-out instructions ...
3750 *      // succs: blockNewEnd
3751 *   }
3752 *   blockNewEnd {
3753 *      // preds: blockOrigEnd, blockStreamOut
3754 *   }
3755 */
3756static void
3757emit_stream_out(struct ir3_context *ctx)
3758{
3759   struct ir3 *ir = ctx->ir;
3760   struct ir3_stream_output_info *strmout = &ctx->so->stream_output;
3761   struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
3762   struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
3763   struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS];
3764
3765   /* create vtxcnt input in input block at top of shader,
3766    * so that it is seen as live over the entire duration
3767    * of the shader:
3768    */
3769   vtxcnt = create_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, 0x1);
3770   maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
3771
3772   /* at this point, we are at the original 'end' block,
3773    * re-purpose this block to stream-out condition, then
3774    * append stream-out block and new-end block
3775    */
3776   orig_end_block = ctx->block;
3777
3778   // maybe w/ store_global intrinsic, we could do this
3779   // stuff in nir->nir pass
3780
3781   stream_out_block = ir3_block_create(ir);
3782   list_addtail(&stream_out_block->node, &ir->block_list);
3783
3784   new_end_block = ir3_block_create(ir);
3785   list_addtail(&new_end_block->node, &ir->block_list);
3786
3787   orig_end_block->successors[0] = stream_out_block;
3788   orig_end_block->successors[1] = new_end_block;
3789
3790   orig_end_block->physical_successors[0] = stream_out_block;
3791   orig_end_block->physical_successors[1] = new_end_block;
3792
3793   stream_out_block->successors[0] = new_end_block;
3794
3795   stream_out_block->physical_successors[0] = new_end_block;
3796
3797   /* setup 'if (vtxcnt < maxvtxcnt)' condition: */
3798   cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
3799   cond->dsts[0]->num = regid(REG_P0, 0);
3800   cond->dsts[0]->flags &= ~IR3_REG_SSA;
3801   cond->cat2.condition = IR3_COND_LT;
3802
3803   /* condition goes on previous block to the conditional,
3804    * since it is used to pick which of the two successor
3805    * paths to take:
3806    */
3807   orig_end_block->condition = cond;
3808
3809   /* switch to stream_out_block to generate the stream-out
3810    * instructions:
3811    */
3812   ctx->block = stream_out_block;
3813
3814   /* Calculate base addresses based on vtxcnt.  Instructions
3815    * generated for bases not used in following loop will be
3816    * stripped out in the backend.
3817    */
3818   for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
3819      const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
3820      unsigned stride = strmout->stride[i];
3821      struct ir3_instruction *base, *off;
3822
3823      base = create_uniform(ctx->block, regid(const_state->offsets.tfbo, i));
3824
3825      /* 24-bit should be enough: */
3826      off = ir3_MUL_U24(ctx->block, vtxcnt, 0,
3827                        create_immed(ctx->block, stride * 4), 0);
3828
3829      bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
3830   }
3831
3832   /* Generate the per-output store instructions: */
3833   for (unsigned i = 0; i < strmout->num_outputs; i++) {
3834      for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
3835         unsigned c = j + strmout->output[i].start_component;
3836         struct ir3_instruction *base, *out, *stg;
3837
3838         base = bases[strmout->output[i].output_buffer];
3839         out = ctx->outputs[regid(strmout->output[i].register_index, c)];
3840
3841         stg = ir3_STG(
3842            ctx->block, base, 0,
3843            create_immed(ctx->block, (strmout->output[i].dst_offset + j) * 4),
3844            0, out, 0, create_immed(ctx->block, 1), 0);
3845         stg->cat6.type = TYPE_U32;
3846
3847         array_insert(ctx->block, ctx->block->keeps, stg);
3848      }
3849   }
3850
3851   /* and finally switch to the new_end_block: */
3852   ctx->block = new_end_block;
3853}
3854
3855static void
3856setup_predecessors(struct ir3 *ir)
3857{
3858   foreach_block (block, &ir->block_list) {
3859      for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
3860         if (block->successors[i])
3861            ir3_block_add_predecessor(block->successors[i], block);
3862         if (block->physical_successors[i])
3863            ir3_block_add_physical_predecessor(block->physical_successors[i],
3864                                               block);
3865      }
3866   }
3867}
3868
3869static void
3870emit_function(struct ir3_context *ctx, nir_function_impl *impl)
3871{
3872   nir_metadata_require(impl, nir_metadata_block_index);
3873
3874   compile_assert(ctx, ctx->stack == 0);
3875
3876   emit_cf_list(ctx, &impl->body);
3877   emit_block(ctx, impl->end_block);
3878
3879   compile_assert(ctx, ctx->stack == 0);
3880
3881   /* at this point, we should have a single empty block,
3882    * into which we emit the 'end' instruction.
3883    */
3884   compile_assert(ctx, list_is_empty(&ctx->block->instr_list));
3885
3886   /* If stream-out (aka transform-feedback) enabled, emit the
3887    * stream-out instructions, followed by a new empty block (into
3888    * which the 'end' instruction lands).
3889    *
3890    * NOTE: it is done in this order, rather than inserting before
3891    * we emit end_block, because NIR guarantees that all blocks
3892    * flow into end_block, and that end_block has no successors.
3893    * So by re-purposing end_block as the first block of stream-
3894    * out, we guarantee that all exit paths flow into the stream-
3895    * out instructions.
3896    */
3897   if ((ctx->compiler->gen < 5) &&
3898       (ctx->so->stream_output.num_outputs > 0) &&
3899       !ctx->so->binning_pass) {
3900      assert(ctx->so->type == MESA_SHADER_VERTEX);
3901      emit_stream_out(ctx);
3902   }
3903
3904   setup_predecessors(ctx->ir);
3905   foreach_block (block, &ctx->ir->block_list) {
3906      resolve_phis(ctx, block);
3907   }
3908}
3909
3910static void
3911setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr)
3912{
3913   struct ir3_shader_variant *so = ctx->so;
3914   struct ir3_instruction *coord = NULL;
3915
3916   if (intr->intrinsic == nir_intrinsic_load_interpolated_input)
3917      coord = ir3_create_collect(ctx->block, ir3_get_src(ctx, &intr->src[0]), 2);
3918
3919   compile_assert(ctx, nir_src_is_const(intr->src[coord ? 1 : 0]));
3920
3921   unsigned frac = nir_intrinsic_component(intr);
3922   unsigned offset = nir_src_as_uint(intr->src[coord ? 1 : 0]);
3923   unsigned ncomp = nir_intrinsic_dest_components(intr);
3924   unsigned n = nir_intrinsic_base(intr) + offset;
3925   unsigned slot = nir_intrinsic_io_semantics(intr).location + offset;
3926   unsigned compmask;
3927
3928   /* Inputs are loaded using ldlw or ldg for other stages. */
3929   compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT ||
3930                          ctx->so->type == MESA_SHADER_VERTEX);
3931
3932   if (ctx->so->type == MESA_SHADER_FRAGMENT)
3933      compmask = BITFIELD_MASK(ncomp) << frac;
3934   else
3935      compmask = BITFIELD_MASK(ncomp + frac);
3936
3937   /* for a4xx+ rasterflat */
3938   if (so->inputs[n].rasterflat && ctx->so->key.rasterflat)
3939      coord = NULL;
3940
3941   so->total_in += util_bitcount(compmask & ~so->inputs[n].compmask);
3942
3943   so->inputs[n].slot = slot;
3944   so->inputs[n].compmask |= compmask;
3945   so->inputs_count = MAX2(so->inputs_count, n + 1);
3946   compile_assert(ctx, so->inputs_count < ARRAY_SIZE(so->inputs));
3947   so->inputs[n].flat = !coord;
3948
3949   if (ctx->so->type == MESA_SHADER_FRAGMENT) {
3950      compile_assert(ctx, slot != VARYING_SLOT_POS);
3951
3952      so->inputs[n].bary = true;
3953
3954      for (int i = 0; i < ncomp; i++) {
3955         unsigned idx = (n * 4) + i + frac;
3956         ctx->last_dst[i] = create_frag_input(ctx, coord, idx);
3957      }
3958   } else {
3959      struct ir3_instruction *input = NULL;
3960
3961      foreach_input (in, ctx->ir) {
3962         if (in->input.inidx == n) {
3963            input = in;
3964            break;
3965         }
3966      }
3967
3968      if (!input) {
3969         input = create_input(ctx, compmask);
3970         input->input.inidx = n;
3971      } else {
3972         /* For aliased inputs, just append to the wrmask.. ie. if we
3973          * first see a vec2 index at slot N, and then later a vec4,
3974          * the wrmask of the resulting overlapped vec2 and vec4 is 0xf
3975          */
3976         input->dsts[0]->wrmask |= compmask;
3977      }
3978
3979      for (int i = 0; i < ncomp + frac; i++) {
3980         unsigned idx = (n * 4) + i;
3981         compile_assert(ctx, idx < ctx->ninputs);
3982
3983         /* fixup the src wrmask to avoid validation fail */
3984         if (ctx->inputs[idx] && (ctx->inputs[idx] != input)) {
3985            ctx->inputs[idx]->srcs[0]->wrmask = input->dsts[0]->wrmask;
3986            continue;
3987         }
3988
3989         ir3_split_dest(ctx->block, &ctx->inputs[idx], input, i, 1);
3990      }
3991
3992      for (int i = 0; i < ncomp; i++) {
3993         unsigned idx = (n * 4) + i + frac;
3994         ctx->last_dst[i] = ctx->inputs[idx];
3995      }
3996   }
3997}
3998
3999/* Initially we assign non-packed inloc's for varyings, as we don't really
4000 * know up-front which components will be unused.  After all the compilation
4001 * stages we scan the shader to see which components are actually used, and
4002 * re-pack the inlocs to eliminate unneeded varyings.
4003 */
4004static void
4005pack_inlocs(struct ir3_context *ctx)
4006{
4007   struct ir3_shader_variant *so = ctx->so;
4008   uint8_t used_components[so->inputs_count];
4009
4010   memset(used_components, 0, sizeof(used_components));
4011
4012   /*
4013    * First Step: scan shader to find which bary.f/ldlv remain:
4014    */
4015
4016   foreach_block (block, &ctx->ir->block_list) {
4017      foreach_instr (instr, &block->instr_list) {
4018         if (is_input(instr)) {
4019            unsigned inloc = instr->srcs[0]->iim_val;
4020            unsigned i = inloc / 4;
4021            unsigned j = inloc % 4;
4022
4023            compile_assert(ctx, instr->srcs[0]->flags & IR3_REG_IMMED);
4024            compile_assert(ctx, i < so->inputs_count);
4025
4026            used_components[i] |= 1 << j;
4027         } else if (instr->opc == OPC_META_TEX_PREFETCH) {
4028            for (int n = 0; n < 2; n++) {
4029               unsigned inloc = instr->prefetch.input_offset + n;
4030               unsigned i = inloc / 4;
4031               unsigned j = inloc % 4;
4032
4033               compile_assert(ctx, i < so->inputs_count);
4034
4035               used_components[i] |= 1 << j;
4036            }
4037         }
4038      }
4039   }
4040
4041   /*
4042    * Second Step: reassign varying inloc/slots:
4043    */
4044
4045   unsigned inloc = 0;
4046
4047   /* for clip+cull distances, unused components can't be eliminated because
4048    * they're read by fixed-function, even if there's a hole.  Note that
4049    * clip/cull distance arrays must be declared in the FS, so we can just
4050    * use the NIR clip/cull distances to avoid reading ucp_enables in the
4051    * shader key.
4052    */
4053   unsigned clip_cull_mask = so->clip_mask | so->cull_mask;
4054
4055   for (unsigned i = 0; i < so->inputs_count; i++) {
4056      unsigned compmask = 0, maxcomp = 0;
4057
4058      so->inputs[i].inloc = inloc;
4059      so->inputs[i].bary = false;
4060
4061      if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0 ||
4062          so->inputs[i].slot == VARYING_SLOT_CLIP_DIST1) {
4063         if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0)
4064            compmask = clip_cull_mask & 0xf;
4065         else
4066            compmask = clip_cull_mask >> 4;
4067         used_components[i] = compmask;
4068      }
4069
4070      for (unsigned j = 0; j < 4; j++) {
4071         if (!(used_components[i] & (1 << j)))
4072            continue;
4073
4074         compmask |= (1 << j);
4075         maxcomp = j + 1;
4076
4077         /* at this point, since used_components[i] mask is only
4078          * considering varyings (ie. not sysvals) we know this
4079          * is a varying:
4080          */
4081         so->inputs[i].bary = true;
4082      }
4083
4084      if (so->inputs[i].bary) {
4085         so->varying_in++;
4086         so->inputs[i].compmask = (1 << maxcomp) - 1;
4087         inloc += maxcomp;
4088      }
4089   }
4090
4091   /*
4092    * Third Step: reassign packed inloc's:
4093    */
4094
4095   foreach_block (block, &ctx->ir->block_list) {
4096      foreach_instr (instr, &block->instr_list) {
4097         if (is_input(instr)) {
4098            unsigned inloc = instr->srcs[0]->iim_val;
4099            unsigned i = inloc / 4;
4100            unsigned j = inloc % 4;
4101
4102            instr->srcs[0]->iim_val = so->inputs[i].inloc + j;
4103         } else if (instr->opc == OPC_META_TEX_PREFETCH) {
4104            unsigned i = instr->prefetch.input_offset / 4;
4105            unsigned j = instr->prefetch.input_offset % 4;
4106            instr->prefetch.input_offset = so->inputs[i].inloc + j;
4107         }
4108      }
4109   }
4110}
4111
4112static void
4113setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr)
4114{
4115   struct ir3_shader_variant *so = ctx->so;
4116   nir_io_semantics io = nir_intrinsic_io_semantics(intr);
4117
4118   compile_assert(ctx, nir_src_is_const(intr->src[1]));
4119
4120   unsigned offset = nir_src_as_uint(intr->src[1]);
4121   unsigned n = nir_intrinsic_base(intr) + offset;
4122   unsigned frac = nir_intrinsic_component(intr);
4123   unsigned ncomp = nir_intrinsic_src_components(intr, 0);
4124
4125   /* For per-view variables, each user-facing slot corresponds to multiple
4126    * views, each with a corresponding driver_location, and the offset is for
4127    * the driver_location. To properly figure out of the slot, we'd need to
4128    * plumb through the number of views. However, for now we only use
4129    * per-view with gl_Position, so we assume that the variable is not an
4130    * array or matrix (so there are no indirect accesses to the variable
4131    * itself) and the indirect offset corresponds to the view.
4132    */
4133   unsigned slot = io.location + (io.per_view ? 0 : offset);
4134
4135   if (ctx->so->type == MESA_SHADER_FRAGMENT) {
4136      switch (slot) {
4137      case FRAG_RESULT_DEPTH:
4138         so->writes_pos = true;
4139         break;
4140      case FRAG_RESULT_COLOR:
4141         if (!ctx->s->info.fs.color_is_dual_source) {
4142            so->color0_mrt = 1;
4143         } else {
4144            slot = FRAG_RESULT_DATA0 + io.dual_source_blend_index;
4145         }
4146         break;
4147      case FRAG_RESULT_SAMPLE_MASK:
4148         so->writes_smask = true;
4149         break;
4150      case FRAG_RESULT_STENCIL:
4151         so->writes_stencilref = true;
4152         break;
4153      default:
4154         slot += io.dual_source_blend_index; /* For dual-src blend */
4155         if (slot >= FRAG_RESULT_DATA0)
4156            break;
4157         ir3_context_error(ctx, "unknown FS output name: %s\n",
4158                           gl_frag_result_name(slot));
4159      }
4160   } else if (ctx->so->type == MESA_SHADER_VERTEX ||
4161              ctx->so->type == MESA_SHADER_TESS_EVAL ||
4162              ctx->so->type == MESA_SHADER_GEOMETRY) {
4163      switch (slot) {
4164      case VARYING_SLOT_POS:
4165         so->writes_pos = true;
4166         break;
4167      case VARYING_SLOT_PSIZ:
4168         so->writes_psize = true;
4169         break;
4170      case VARYING_SLOT_PRIMITIVE_ID:
4171      case VARYING_SLOT_GS_VERTEX_FLAGS_IR3:
4172         assert(ctx->so->type == MESA_SHADER_GEOMETRY);
4173         FALLTHROUGH;
4174      case VARYING_SLOT_COL0:
4175      case VARYING_SLOT_COL1:
4176      case VARYING_SLOT_BFC0:
4177      case VARYING_SLOT_BFC1:
4178      case VARYING_SLOT_FOGC:
4179      case VARYING_SLOT_CLIP_DIST0:
4180      case VARYING_SLOT_CLIP_DIST1:
4181      case VARYING_SLOT_CLIP_VERTEX:
4182      case VARYING_SLOT_LAYER:
4183      case VARYING_SLOT_VIEWPORT:
4184         break;
4185      default:
4186         if (slot >= VARYING_SLOT_VAR0)
4187            break;
4188         if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
4189            break;
4190         ir3_context_error(ctx, "unknown %s shader output name: %s\n",
4191                           _mesa_shader_stage_to_string(ctx->so->type),
4192                           gl_varying_slot_name_for_stage(slot, ctx->so->type));
4193      }
4194   } else {
4195      ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type);
4196   }
4197
4198   so->outputs_count = MAX2(so->outputs_count, n + 1);
4199   compile_assert(ctx, so->outputs_count <= ARRAY_SIZE(so->outputs));
4200
4201   so->outputs[n].slot = slot;
4202   if (io.per_view)
4203      so->outputs[n].view = offset;
4204
4205   for (int i = 0; i < ncomp; i++) {
4206      unsigned idx = (n * 4) + i + frac;
4207      compile_assert(ctx, idx < ctx->noutputs);
4208      ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
4209   }
4210
4211   /* if varying packing doesn't happen, we could end up in a situation
4212    * with "holes" in the output, and since the per-generation code that
4213    * sets up varying linkage registers doesn't expect to have more than
4214    * one varying per vec4 slot, pad the holes.
4215    *
4216    * Note that this should probably generate a performance warning of
4217    * some sort.
4218    */
4219   for (int i = 0; i < frac; i++) {
4220      unsigned idx = (n * 4) + i;
4221      if (!ctx->outputs[idx]) {
4222         ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
4223      }
4224   }
4225
4226   struct ir3_instruction *const *src = ir3_get_src(ctx, &intr->src[0]);
4227   for (int i = 0; i < ncomp; i++) {
4228      unsigned idx = (n * 4) + i + frac;
4229      ctx->outputs[idx] = src[i];
4230   }
4231}
4232
4233static bool
4234uses_load_input(struct ir3_shader_variant *so)
4235{
4236   return so->type == MESA_SHADER_VERTEX || so->type == MESA_SHADER_FRAGMENT;
4237}
4238
4239static bool
4240uses_store_output(struct ir3_shader_variant *so)
4241{
4242   switch (so->type) {
4243   case MESA_SHADER_VERTEX:
4244      return !so->key.has_gs && !so->key.tessellation;
4245   case MESA_SHADER_TESS_EVAL:
4246      return !so->key.has_gs;
4247   case MESA_SHADER_GEOMETRY:
4248   case MESA_SHADER_FRAGMENT:
4249      return true;
4250   case MESA_SHADER_TESS_CTRL:
4251   case MESA_SHADER_COMPUTE:
4252   case MESA_SHADER_KERNEL:
4253      return false;
4254   default:
4255      unreachable("unknown stage");
4256   }
4257}
4258
4259static void
4260emit_instructions(struct ir3_context *ctx)
4261{
4262   nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
4263
4264   /* some varying setup which can't be done in setup_input(): */
4265   if (ctx->so->type == MESA_SHADER_FRAGMENT) {
4266      nir_foreach_shader_in_variable (var, ctx->s) {
4267         /* if any varyings have 'sample' qualifer, that triggers us
4268          * to run in per-sample mode:
4269          */
4270         if (var->data.sample)
4271            ctx->so->per_samp = true;
4272
4273         /* set rasterflat flag for front/back color */
4274         if (var->data.interpolation == INTERP_MODE_NONE) {
4275            switch (var->data.location) {
4276            case VARYING_SLOT_COL0:
4277            case VARYING_SLOT_COL1:
4278            case VARYING_SLOT_BFC0:
4279            case VARYING_SLOT_BFC1:
4280               ctx->so->inputs[var->data.driver_location].rasterflat = true;
4281               break;
4282            default:
4283               break;
4284            }
4285         }
4286      }
4287   }
4288
4289   if (uses_load_input(ctx->so)) {
4290      ctx->so->inputs_count = ctx->s->num_inputs;
4291      compile_assert(ctx, ctx->so->inputs_count < ARRAY_SIZE(ctx->so->inputs));
4292      ctx->ninputs = ctx->s->num_inputs * 4;
4293      ctx->inputs = rzalloc_array(ctx, struct ir3_instruction *, ctx->ninputs);
4294   } else {
4295      ctx->ninputs = 0;
4296      ctx->so->inputs_count = 0;
4297   }
4298
4299   if (uses_store_output(ctx->so)) {
4300      ctx->noutputs = ctx->s->num_outputs * 4;
4301      ctx->outputs =
4302         rzalloc_array(ctx, struct ir3_instruction *, ctx->noutputs);
4303   } else {
4304      ctx->noutputs = 0;
4305   }
4306
4307   ctx->ir = ir3_create(ctx->compiler, ctx->so);
4308
4309   /* Create inputs in first block: */
4310   ctx->block = get_block(ctx, nir_start_block(fxn));
4311   ctx->in_block = ctx->block;
4312
4313   /* for fragment shader, the vcoord input register is used as the
4314    * base for bary.f varying fetch instrs:
4315    *
4316    * TODO defer creating ctx->ij_pixel and corresponding sysvals
4317    * until emit_intrinsic when we know they are actually needed.
4318    * For now, we defer creating ctx->ij_centroid, etc, since we
4319    * only need ij_pixel for "old style" varying inputs (ie.
4320    * tgsi_to_nir)
4321    */
4322   if (ctx->so->type == MESA_SHADER_FRAGMENT) {
4323      ctx->ij[IJ_PERSP_PIXEL] = create_input(ctx, 0x3);
4324   }
4325
4326   /* Defer add_sysval_input() stuff until after setup_inputs(),
4327    * because sysvals need to be appended after varyings:
4328    */
4329   if (ctx->ij[IJ_PERSP_PIXEL]) {
4330      add_sysval_input_compmask(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL, 0x3,
4331                                ctx->ij[IJ_PERSP_PIXEL]);
4332   }
4333
4334   /* Tesselation shaders always need primitive ID for indexing the
4335    * BO. Geometry shaders don't always need it but when they do it has be
4336    * delivered and unclobbered in the VS. To make things easy, we always
4337    * make room for it in VS/DS.
4338    */
4339   bool has_tess = ctx->so->key.tessellation != IR3_TESS_NONE;
4340   bool has_gs = ctx->so->key.has_gs;
4341   switch (ctx->so->type) {
4342   case MESA_SHADER_VERTEX:
4343      if (has_tess) {
4344         ctx->tcs_header =
4345            create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
4346         ctx->rel_patch_id =
4347            create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
4348         ctx->primitive_id =
4349            create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
4350      } else if (has_gs) {
4351         ctx->gs_header =
4352            create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
4353         ctx->primitive_id =
4354            create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
4355      }
4356      break;
4357   case MESA_SHADER_TESS_CTRL:
4358      ctx->tcs_header =
4359         create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
4360      ctx->rel_patch_id =
4361         create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
4362      break;
4363   case MESA_SHADER_TESS_EVAL:
4364      if (has_gs) {
4365         ctx->gs_header =
4366            create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
4367         ctx->primitive_id =
4368            create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
4369      }
4370      ctx->rel_patch_id =
4371         create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
4372      break;
4373   case MESA_SHADER_GEOMETRY:
4374      ctx->gs_header =
4375         create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
4376      break;
4377   default:
4378      break;
4379   }
4380
4381   /* Find # of samplers. Just assume that we'll be reading from images.. if
4382    * it is write-only we don't have to count it, but after lowering derefs
4383    * is too late to compact indices for that.
4384    */
4385   ctx->so->num_samp =
4386      BITSET_LAST_BIT(ctx->s->info.textures_used) + ctx->s->info.num_images;
4387
4388   /* Save off clip+cull information. Note that in OpenGL clip planes may
4389    * be individually enabled/disabled, and some gens handle lowering in
4390    * backend, so we also need to consider the shader key:
4391    */
4392   ctx->so->clip_mask = ctx->so->key.ucp_enables |
4393                        MASK(ctx->s->info.clip_distance_array_size);
4394   ctx->so->cull_mask = MASK(ctx->s->info.cull_distance_array_size)
4395                        << ctx->s->info.clip_distance_array_size;
4396
4397   ctx->so->pvtmem_size = ctx->s->scratch_size;
4398   ctx->so->shared_size = ctx->s->info.shared_size;
4399
4400   /* NOTE: need to do something more clever when we support >1 fxn */
4401   nir_foreach_register (reg, &fxn->registers) {
4402      ir3_declare_array(ctx, reg);
4403   }
4404
4405   if (ctx->so->type == MESA_SHADER_TESS_CTRL &&
4406       ctx->compiler->tess_use_shared) {
4407      struct ir3_instruction *barrier = ir3_BAR(ctx->block);
4408      barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
4409      barrier->barrier_class = IR3_BARRIER_EVERYTHING;
4410      array_insert(ctx->block, ctx->block->keeps, barrier);
4411      ctx->so->has_barrier = true;
4412   }
4413
4414   /* And emit the body: */
4415   ctx->impl = fxn;
4416   emit_function(ctx, fxn);
4417}
4418
4419/* Fixup tex sampler state for astc/srgb workaround instructions.  We
4420 * need to assign the tex state indexes for these after we know the
4421 * max tex index.
4422 */
4423static void
4424fixup_astc_srgb(struct ir3_context *ctx)
4425{
4426   struct ir3_shader_variant *so = ctx->so;
4427   /* indexed by original tex idx, value is newly assigned alpha sampler
4428    * state tex idx.  Zero is invalid since there is at least one sampler
4429    * if we get here.
4430    */
4431   unsigned alt_tex_state[16] = {0};
4432   unsigned tex_idx = ctx->max_texture_index + 1;
4433   unsigned idx = 0;
4434
4435   so->astc_srgb.base = tex_idx;
4436
4437   for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
4438      struct ir3_instruction *sam = ctx->ir->astc_srgb[i];
4439
4440      compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
4441
4442      if (alt_tex_state[sam->cat5.tex] == 0) {
4443         /* assign new alternate/alpha tex state slot: */
4444         alt_tex_state[sam->cat5.tex] = tex_idx++;
4445         so->astc_srgb.orig_idx[idx++] = sam->cat5.tex;
4446         so->astc_srgb.count++;
4447      }
4448
4449      sam->cat5.tex = alt_tex_state[sam->cat5.tex];
4450   }
4451}
4452
4453/* Fixup tex sampler state for tg4 workaround instructions.  We
4454 * need to assign the tex state indexes for these after we know the
4455 * max tex index.
4456 */
4457static void
4458fixup_tg4(struct ir3_context *ctx)
4459{
4460   struct ir3_shader_variant *so = ctx->so;
4461   /* indexed by original tex idx, value is newly assigned alpha sampler
4462    * state tex idx.  Zero is invalid since there is at least one sampler
4463    * if we get here.
4464    */
4465   unsigned alt_tex_state[16] = {0};
4466   unsigned tex_idx = ctx->max_texture_index + so->astc_srgb.count + 1;
4467   unsigned idx = 0;
4468
4469   so->tg4.base = tex_idx;
4470
4471   for (unsigned i = 0; i < ctx->ir->tg4_count; i++) {
4472      struct ir3_instruction *sam = ctx->ir->tg4[i];
4473
4474      compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
4475
4476      if (alt_tex_state[sam->cat5.tex] == 0) {
4477         /* assign new alternate/alpha tex state slot: */
4478         alt_tex_state[sam->cat5.tex] = tex_idx++;
4479         so->tg4.orig_idx[idx++] = sam->cat5.tex;
4480         so->tg4.count++;
4481      }
4482
4483      sam->cat5.tex = alt_tex_state[sam->cat5.tex];
4484   }
4485}
4486
4487static bool
4488output_slot_used_for_binning(gl_varying_slot slot)
4489{
4490   return slot == VARYING_SLOT_POS || slot == VARYING_SLOT_PSIZ ||
4491          slot == VARYING_SLOT_CLIP_DIST0 || slot == VARYING_SLOT_CLIP_DIST1 ||
4492          slot == VARYING_SLOT_VIEWPORT;
4493}
4494
4495static struct ir3_instruction *
4496find_end(struct ir3 *ir)
4497{
4498   foreach_block_rev (block, &ir->block_list) {
4499      foreach_instr_rev (instr, &block->instr_list) {
4500         if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
4501            return instr;
4502      }
4503   }
4504   unreachable("couldn't find end instruction");
4505}
4506
4507static void
4508fixup_binning_pass(struct ir3_context *ctx, struct ir3_instruction *end)
4509{
4510   struct ir3_shader_variant *so = ctx->so;
4511   unsigned i, j;
4512
4513   /* first pass, remove unused outputs from the IR level outputs: */
4514   for (i = 0, j = 0; i < end->srcs_count; i++) {
4515      unsigned outidx = end->end.outidxs[i];
4516      unsigned slot = so->outputs[outidx].slot;
4517
4518      if (output_slot_used_for_binning(slot)) {
4519         end->srcs[j] = end->srcs[i];
4520         end->end.outidxs[j] = end->end.outidxs[i];
4521         j++;
4522      }
4523   }
4524   end->srcs_count = j;
4525
4526   /* second pass, cleanup the unused slots in ir3_shader_variant::outputs
4527    * table:
4528    */
4529   for (i = 0, j = 0; i < so->outputs_count; i++) {
4530      unsigned slot = so->outputs[i].slot;
4531
4532      if (output_slot_used_for_binning(slot)) {
4533         so->outputs[j] = so->outputs[i];
4534
4535         /* fixup outidx to point to new output table entry: */
4536         for (unsigned k = 0; k < end->srcs_count; k++) {
4537            if (end->end.outidxs[k] == i) {
4538               end->end.outidxs[k] = j;
4539               break;
4540            }
4541         }
4542
4543         j++;
4544      }
4545   }
4546   so->outputs_count = j;
4547}
4548
4549static void
4550collect_tex_prefetches(struct ir3_context *ctx, struct ir3 *ir)
4551{
4552   unsigned idx = 0;
4553
4554   /* Collect sampling instructions eligible for pre-dispatch. */
4555   foreach_block (block, &ir->block_list) {
4556      foreach_instr_safe (instr, &block->instr_list) {
4557         if (instr->opc == OPC_META_TEX_PREFETCH) {
4558            assert(idx < ARRAY_SIZE(ctx->so->sampler_prefetch));
4559            struct ir3_sampler_prefetch *fetch =
4560               &ctx->so->sampler_prefetch[idx];
4561            idx++;
4562
4563            if (instr->flags & IR3_INSTR_B) {
4564               fetch->cmd = IR3_SAMPLER_BINDLESS_PREFETCH_CMD;
4565               /* In bindless mode, the index is actually the base */
4566               fetch->tex_id = instr->prefetch.tex_base;
4567               fetch->samp_id = instr->prefetch.samp_base;
4568               fetch->tex_bindless_id = instr->prefetch.tex;
4569               fetch->samp_bindless_id = instr->prefetch.samp;
4570            } else {
4571               fetch->cmd = IR3_SAMPLER_PREFETCH_CMD;
4572               fetch->tex_id = instr->prefetch.tex;
4573               fetch->samp_id = instr->prefetch.samp;
4574            }
4575            fetch->wrmask = instr->dsts[0]->wrmask;
4576            fetch->dst = instr->dsts[0]->num;
4577            fetch->src = instr->prefetch.input_offset;
4578
4579            /* These are the limits on a5xx/a6xx, we might need to
4580             * revisit if SP_FS_PREFETCH[n] changes on later gens:
4581             */
4582            assert(fetch->dst <= 0x3f);
4583            assert(fetch->tex_id <= 0x1f);
4584            assert(fetch->samp_id <= 0xf);
4585
4586            ctx->so->total_in =
4587               MAX2(ctx->so->total_in, instr->prefetch.input_offset + 2);
4588
4589            fetch->half_precision = !!(instr->dsts[0]->flags & IR3_REG_HALF);
4590
4591            /* Remove the prefetch placeholder instruction: */
4592            list_delinit(&instr->node);
4593         }
4594      }
4595   }
4596}
4597
4598int
4599ir3_compile_shader_nir(struct ir3_compiler *compiler,
4600                       struct ir3_shader *shader,
4601                       struct ir3_shader_variant *so)
4602{
4603   struct ir3_context *ctx;
4604   struct ir3 *ir;
4605   int ret = 0, max_bary;
4606   bool progress;
4607
4608   assert(!so->ir);
4609
4610   ctx = ir3_context_init(compiler, shader, so);
4611   if (!ctx) {
4612      DBG("INIT failed!");
4613      ret = -1;
4614      goto out;
4615   }
4616
4617   emit_instructions(ctx);
4618
4619   if (ctx->error) {
4620      DBG("EMIT failed!");
4621      ret = -1;
4622      goto out;
4623   }
4624
4625   ir = so->ir = ctx->ir;
4626
4627   if (gl_shader_stage_is_compute(so->type)) {
4628      so->local_size[0] = ctx->s->info.workgroup_size[0];
4629      so->local_size[1] = ctx->s->info.workgroup_size[1];
4630      so->local_size[2] = ctx->s->info.workgroup_size[2];
4631      so->local_size_variable = ctx->s->info.workgroup_size_variable;
4632   }
4633
4634   /* Vertex shaders in a tessellation or geometry pipeline treat END as a
4635    * NOP and has an epilogue that writes the VS outputs to local storage, to
4636    * be read by the HS.  Then it resets execution mask (chmask) and chains
4637    * to the next shader (chsh). There are also a few output values which we
4638    * must send to the next stage via registers, and in order for both stages
4639    * to agree on the register used we must force these to be in specific
4640    * registers.
4641    */
4642   if ((so->type == MESA_SHADER_VERTEX &&
4643        (so->key.has_gs || so->key.tessellation)) ||
4644       (so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) {
4645      struct ir3_instruction *outputs[3];
4646      unsigned outidxs[3];
4647      unsigned regids[3];
4648      unsigned outputs_count = 0;
4649
4650      if (ctx->primitive_id) {
4651         unsigned n = so->outputs_count++;
4652         so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID;
4653
4654         struct ir3_instruction *out = ir3_collect(ctx->block, ctx->primitive_id);
4655         outputs[outputs_count] = out;
4656         outidxs[outputs_count] = n;
4657         if (so->type == MESA_SHADER_VERTEX && ctx->rel_patch_id)
4658            regids[outputs_count] = regid(0, 2);
4659         else
4660            regids[outputs_count] = regid(0, 1);
4661         outputs_count++;
4662      }
4663
4664      if (so->type == MESA_SHADER_VERTEX && ctx->rel_patch_id) {
4665         unsigned n = so->outputs_count++;
4666         so->outputs[n].slot = VARYING_SLOT_REL_PATCH_ID_IR3;
4667         struct ir3_instruction *out = ir3_collect(ctx->block, ctx->rel_patch_id);
4668         outputs[outputs_count] = out;
4669         outidxs[outputs_count] = n;
4670         regids[outputs_count] = regid(0, 1);
4671         outputs_count++;
4672      }
4673
4674      if (ctx->gs_header) {
4675         unsigned n = so->outputs_count++;
4676         so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3;
4677         struct ir3_instruction *out = ir3_collect(ctx->block, ctx->gs_header);
4678         outputs[outputs_count] = out;
4679         outidxs[outputs_count] = n;
4680         regids[outputs_count] = regid(0, 0);
4681         outputs_count++;
4682      }
4683
4684      if (ctx->tcs_header) {
4685         unsigned n = so->outputs_count++;
4686         so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3;
4687         struct ir3_instruction *out = ir3_collect(ctx->block, ctx->tcs_header);
4688         outputs[outputs_count] = out;
4689         outidxs[outputs_count] = n;
4690         regids[outputs_count] = regid(0, 0);
4691         outputs_count++;
4692      }
4693
4694      struct ir3_instruction *chmask =
4695         ir3_instr_create(ctx->block, OPC_CHMASK, 0, outputs_count);
4696      chmask->barrier_class = IR3_BARRIER_EVERYTHING;
4697      chmask->barrier_conflict = IR3_BARRIER_EVERYTHING;
4698
4699      for (unsigned i = 0; i < outputs_count; i++)
4700         __ssa_src(chmask, outputs[i], 0)->num = regids[i];
4701
4702      chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count);
4703      memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
4704
4705      array_insert(ctx->block, ctx->block->keeps, chmask);
4706
4707      struct ir3_instruction *chsh = ir3_CHSH(ctx->block);
4708      chsh->barrier_class = IR3_BARRIER_EVERYTHING;
4709      chsh->barrier_conflict = IR3_BARRIER_EVERYTHING;
4710   } else {
4711      assert((ctx->noutputs % 4) == 0);
4712      unsigned outidxs[ctx->noutputs / 4];
4713      struct ir3_instruction *outputs[ctx->noutputs / 4];
4714      unsigned outputs_count = 0;
4715
4716      struct ir3_block *b = ctx->block;
4717      /* Insert these collect's in the block before the end-block if
4718       * possible, so that any moves they generate can be shuffled around to
4719       * reduce nop's:
4720       */
4721      if (ctx->block->predecessors_count == 1)
4722         b = ctx->block->predecessors[0];
4723
4724      /* Setup IR level outputs, which are "collects" that gather
4725       * the scalar components of outputs.
4726       */
4727      for (unsigned i = 0; i < ctx->noutputs; i += 4) {
4728         unsigned ncomp = 0;
4729         /* figure out the # of components written:
4730          *
4731          * TODO do we need to handle holes, ie. if .x and .z
4732          * components written, but .y component not written?
4733          */
4734         for (unsigned j = 0; j < 4; j++) {
4735            if (!ctx->outputs[i + j])
4736               break;
4737            ncomp++;
4738         }
4739
4740         /* Note that in some stages, like TCS, store_output is
4741          * lowered to memory writes, so no components of the
4742          * are "written" from the PoV of traditional store-
4743          * output instructions:
4744          */
4745         if (!ncomp)
4746            continue;
4747
4748         struct ir3_instruction *out =
4749            ir3_create_collect(b, &ctx->outputs[i], ncomp);
4750
4751         int outidx = i / 4;
4752         assert(outidx < so->outputs_count);
4753
4754         outidxs[outputs_count] = outidx;
4755         outputs[outputs_count] = out;
4756         outputs_count++;
4757      }
4758
4759      /* for a6xx+, binning and draw pass VS use same VBO state, so we
4760       * need to make sure not to remove any inputs that are used by
4761       * the nonbinning VS.
4762       */
4763      if (ctx->compiler->gen >= 6 && so->binning_pass &&
4764          so->type == MESA_SHADER_VERTEX) {
4765         for (int i = 0; i < ctx->ninputs; i++) {
4766            struct ir3_instruction *in = ctx->inputs[i];
4767
4768            if (!in)
4769               continue;
4770
4771            unsigned n = i / 4;
4772            unsigned c = i % 4;
4773
4774            assert(n < so->nonbinning->inputs_count);
4775
4776            if (so->nonbinning->inputs[n].sysval)
4777               continue;
4778
4779            /* be sure to keep inputs, even if only used in VS */
4780            if (so->nonbinning->inputs[n].compmask & (1 << c))
4781               array_insert(in->block, in->block->keeps, in);
4782         }
4783      }
4784
4785      struct ir3_instruction *end =
4786         ir3_instr_create(ctx->block, OPC_END, 0, outputs_count);
4787
4788      for (unsigned i = 0; i < outputs_count; i++) {
4789         __ssa_src(end, outputs[i], 0);
4790      }
4791
4792      end->end.outidxs = ralloc_array(end, unsigned, outputs_count);
4793      memcpy(end->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
4794
4795      array_insert(ctx->block, ctx->block->keeps, end);
4796
4797      /* at this point, for binning pass, throw away unneeded outputs: */
4798      if (so->binning_pass && (ctx->compiler->gen < 6))
4799         fixup_binning_pass(ctx, end);
4800   }
4801
4802   ir3_debug_print(ir, "AFTER: nir->ir3");
4803   ir3_validate(ir);
4804
4805   IR3_PASS(ir, ir3_remove_unreachable);
4806
4807   IR3_PASS(ir, ir3_array_to_ssa);
4808
4809   do {
4810      progress = false;
4811
4812      /* the folding doesn't seem to work reliably on a4xx */
4813      if (ctx->compiler->gen != 4)
4814         progress |= IR3_PASS(ir, ir3_cf);
4815      progress |= IR3_PASS(ir, ir3_cp, so);
4816      progress |= IR3_PASS(ir, ir3_cse);
4817      progress |= IR3_PASS(ir, ir3_dce, so);
4818   } while (progress);
4819
4820   /* at this point, for binning pass, throw away unneeded outputs:
4821    * Note that for a6xx and later, we do this after ir3_cp to ensure
4822    * that the uniform/constant layout for BS and VS matches, so that
4823    * we can re-use same VS_CONST state group.
4824    */
4825   if (so->binning_pass && (ctx->compiler->gen >= 6)) {
4826      fixup_binning_pass(ctx, find_end(ctx->so->ir));
4827      /* cleanup the result of removing unneeded outputs: */
4828      while (IR3_PASS(ir, ir3_dce, so)) {
4829      }
4830   }
4831
4832   IR3_PASS(ir, ir3_sched_add_deps);
4833
4834   /* At this point, all the dead code should be long gone: */
4835   assert(!IR3_PASS(ir, ir3_dce, so));
4836
4837   ret = ir3_sched(ir);
4838   if (ret) {
4839      DBG("SCHED failed!");
4840      goto out;
4841   }
4842
4843   ir3_debug_print(ir, "AFTER: ir3_sched");
4844
4845   /* Pre-assign VS inputs on a6xx+ binning pass shader, to align
4846    * with draw pass VS, so binning and draw pass can both use the
4847    * same VBO state.
4848    *
4849    * Note that VS inputs are expected to be full precision.
4850    */
4851   bool pre_assign_inputs = (ir->compiler->gen >= 6) &&
4852                            (ir->type == MESA_SHADER_VERTEX) &&
4853                            so->binning_pass;
4854
4855   if (pre_assign_inputs) {
4856      foreach_input (in, ir) {
4857         assert(in->opc == OPC_META_INPUT);
4858         unsigned inidx = in->input.inidx;
4859
4860         in->dsts[0]->num = so->nonbinning->inputs[inidx].regid;
4861      }
4862   } else if (ctx->tcs_header) {
4863      /* We need to have these values in the same registers between VS and TCS
4864       * since the VS chains to TCS and doesn't get the sysvals redelivered.
4865       */
4866
4867      ctx->tcs_header->dsts[0]->num = regid(0, 0);
4868      ctx->rel_patch_id->dsts[0]->num = regid(0, 1);
4869      if (ctx->primitive_id)
4870         ctx->primitive_id->dsts[0]->num = regid(0, 2);
4871   } else if (ctx->gs_header) {
4872      /* We need to have these values in the same registers between producer
4873       * (VS or DS) and GS since the producer chains to GS and doesn't get
4874       * the sysvals redelivered.
4875       */
4876
4877      ctx->gs_header->dsts[0]->num = regid(0, 0);
4878      if (ctx->primitive_id)
4879         ctx->primitive_id->dsts[0]->num = regid(0, 1);
4880   } else if (so->num_sampler_prefetch) {
4881      assert(so->type == MESA_SHADER_FRAGMENT);
4882      int idx = 0;
4883
4884      foreach_input (instr, ir) {
4885         if (instr->input.sysval != SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL)
4886            continue;
4887
4888         assert(idx < 2);
4889         instr->dsts[0]->num = idx;
4890         idx++;
4891      }
4892   }
4893
4894   ret = ir3_ra(so);
4895
4896   if (ret) {
4897      mesa_loge("ir3_ra() failed!");
4898      goto out;
4899   }
4900
4901   IR3_PASS(ir, ir3_postsched, so);
4902
4903   IR3_PASS(ir, ir3_legalize_relative);
4904   IR3_PASS(ir, ir3_lower_subgroups);
4905
4906   if (so->type == MESA_SHADER_FRAGMENT)
4907      pack_inlocs(ctx);
4908
4909   /*
4910    * Fixup inputs/outputs to point to the actual registers assigned:
4911    *
4912    * 1) initialize to r63.x (invalid/unused)
4913    * 2) iterate IR level inputs/outputs and update the variants
4914    *    inputs/outputs table based on the assigned registers for
4915    *    the remaining inputs/outputs.
4916    */
4917
4918   for (unsigned i = 0; i < so->inputs_count; i++)
4919      so->inputs[i].regid = INVALID_REG;
4920   for (unsigned i = 0; i < so->outputs_count; i++)
4921      so->outputs[i].regid = INVALID_REG;
4922
4923   struct ir3_instruction *end = find_end(so->ir);
4924
4925   for (unsigned i = 0; i < end->srcs_count; i++) {
4926      unsigned outidx = end->end.outidxs[i];
4927      struct ir3_register *reg = end->srcs[i];
4928
4929      so->outputs[outidx].regid = reg->num;
4930      so->outputs[outidx].half = !!(reg->flags & IR3_REG_HALF);
4931   }
4932
4933   foreach_input (in, ir) {
4934      assert(in->opc == OPC_META_INPUT);
4935      unsigned inidx = in->input.inidx;
4936
4937      if (pre_assign_inputs && !so->inputs[inidx].sysval) {
4938         if (VALIDREG(so->nonbinning->inputs[inidx].regid)) {
4939            compile_assert(
4940               ctx, in->dsts[0]->num == so->nonbinning->inputs[inidx].regid);
4941            compile_assert(ctx, !!(in->dsts[0]->flags & IR3_REG_HALF) ==
4942                                   so->nonbinning->inputs[inidx].half);
4943         }
4944         so->inputs[inidx].regid = so->nonbinning->inputs[inidx].regid;
4945         so->inputs[inidx].half = so->nonbinning->inputs[inidx].half;
4946      } else {
4947         so->inputs[inidx].regid = in->dsts[0]->num;
4948         so->inputs[inidx].half = !!(in->dsts[0]->flags & IR3_REG_HALF);
4949      }
4950   }
4951
4952   if (ctx->astc_srgb)
4953      fixup_astc_srgb(ctx);
4954
4955   if (ctx->compiler->gen == 4 && ctx->s->info.uses_texture_gather)
4956      fixup_tg4(ctx);
4957
4958   /* We need to do legalize after (for frag shader's) the "bary.f"
4959    * offsets (inloc) have been assigned.
4960    */
4961   IR3_PASS(ir, ir3_legalize, so, &max_bary);
4962
4963   /* Set (ss)(sy) on first TCS and GEOMETRY instructions, since we don't
4964    * know what we might have to wait on when coming in from VS chsh.
4965    */
4966   if (so->type == MESA_SHADER_TESS_CTRL || so->type == MESA_SHADER_GEOMETRY) {
4967      foreach_block (block, &ir->block_list) {
4968         foreach_instr (instr, &block->instr_list) {
4969            instr->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
4970            break;
4971         }
4972      }
4973   }
4974
4975   so->branchstack = ctx->max_stack;
4976
4977   /* Note that max_bary counts inputs that are not bary.f'd for FS: */
4978   if (so->type == MESA_SHADER_FRAGMENT)
4979      so->total_in = max_bary + 1;
4980
4981   /* Collect sampling instructions eligible for pre-dispatch. */
4982   collect_tex_prefetches(ctx, ir);
4983
4984   if (so->type == MESA_SHADER_FRAGMENT &&
4985       ctx->s->info.fs.needs_quad_helper_invocations)
4986      so->need_pixlod = true;
4987
4988   if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
4989       !ctx->s->info.fs.early_fragment_tests)
4990      ctx->so->no_earlyz |= ctx->s->info.writes_memory;
4991
4992out:
4993   if (ret) {
4994      if (so->ir)
4995         ir3_destroy(so->ir);
4996      so->ir = NULL;
4997   }
4998   ir3_context_free(ctx);
4999
5000   return ret;
5001}
5002