1/*
2 * Copyright © 2014-2015 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "compiler/nir/nir.h"
25#include "compiler/nir/nir_deref.h"
26#include "compiler/nir/nir_worklist.h"
27#include "nir/nir_to_tgsi.h"
28#include "pipe/p_screen.h"
29#include "pipe/p_state.h"
30#include "tgsi/tgsi_dump.h"
31#include "tgsi/tgsi_from_mesa.h"
32#include "tgsi/tgsi_info.h"
33#include "tgsi/tgsi_parse.h"
34#include "tgsi/tgsi_ureg.h"
35#include "tgsi/tgsi_util.h"
36#include "util/debug.h"
37#include "util/u_math.h"
38#include "util/u_memory.h"
39#include "util/u_dynarray.h"
40
41struct ntt_insn {
42   enum tgsi_opcode opcode;
43   struct ureg_dst dst[2];
44   struct ureg_src src[4];
45   enum tgsi_texture_type tex_target;
46   enum tgsi_return_type tex_return_type;
47   struct tgsi_texture_offset tex_offset;
48
49   unsigned mem_qualifier;
50   enum pipe_format mem_format;
51
52   bool is_tex : 1;
53   bool is_mem : 1;
54   bool precise : 1;
55};
56
57struct ntt_block {
58   /* Array of struct ntt_insn */
59   struct util_dynarray insns;
60   int start_ip;
61   int end_ip;
62};
63
64struct ntt_reg_interval {
65   uint32_t start, end;
66};
67
68struct ntt_compile {
69   nir_shader *s;
70   nir_function_impl *impl;
71   const struct nir_to_tgsi_options *options;
72   struct pipe_screen *screen;
73   struct ureg_program *ureg;
74
75   bool needs_texcoord_semantic;
76   bool native_integers;
77   bool has_txf_lz;
78
79   bool addr_declared[3];
80   struct ureg_dst addr_reg[3];
81
82   /* if condition set up at the end of a block, for ntt_emit_if(). */
83   struct ureg_src if_cond;
84
85   /* TGSI temps for our NIR SSA and register values. */
86   struct ureg_dst *reg_temp;
87   struct ureg_src *ssa_temp;
88
89   struct ntt_reg_interval *liveness;
90
91   /* Map from nir_block to ntt_block */
92   struct hash_table *blocks;
93   struct ntt_block *cur_block;
94   unsigned current_if_else;
95   unsigned cf_label;
96
97   /* Whether we're currently emitting instructiosn for a precise NIR instruction. */
98   bool precise;
99
100   unsigned num_temps;
101   unsigned first_non_array_temp;
102
103   /* Mappings from driver_location to TGSI input/output number.
104    *
105    * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
106    * their numbers assigned incrementally, unlike inputs or constants.
107    */
108   struct ureg_src *input_index_map;
109   uint64_t centroid_inputs;
110
111   uint32_t first_ubo;
112
113   struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
114};
115
116static struct ureg_dst
117ntt_temp(struct ntt_compile *c)
118{
119   return ureg_dst_register(TGSI_FILE_TEMPORARY, c->num_temps++);
120}
121
122static struct ntt_block *
123ntt_block_from_nir(struct ntt_compile *c, struct nir_block *block)
124{
125   struct hash_entry *entry = _mesa_hash_table_search(c->blocks, block);
126   return entry->data;
127}
128
129static void ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list);
130static void ntt_emit_cf_list_ureg(struct ntt_compile *c, struct exec_list *list);
131
132static struct ntt_insn *
133ntt_insn(struct ntt_compile *c, enum tgsi_opcode opcode,
134         struct ureg_dst dst,
135         struct ureg_src src0, struct ureg_src src1,
136         struct ureg_src src2, struct ureg_src src3)
137{
138   struct ntt_insn insn = {
139      .opcode = opcode,
140      .dst = { dst, ureg_dst_undef() },
141      .src = { src0, src1, src2, src3 },
142      .precise = c->precise,
143   };
144   util_dynarray_append(&c->cur_block->insns, struct ntt_insn, insn);
145   return util_dynarray_top_ptr(&c->cur_block->insns, struct ntt_insn);
146}
147
148#define OP00( op )                                                                     \
149static inline void ntt_##op(struct ntt_compile *c)                                     \
150{                                                                                      \
151   ntt_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
152}
153
154#define OP01( op )                                                                     \
155static inline void ntt_##op(struct ntt_compile *c,                                     \
156                     struct ureg_src src0)                                             \
157{                                                                                      \
158   ntt_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
159}
160
161
162#define OP10( op )                                                                     \
163static inline void ntt_##op(struct ntt_compile *c,                                     \
164                     struct ureg_dst dst)                                              \
165{                                                                                      \
166   ntt_insn(c, TGSI_OPCODE_##op, dst, ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
167}
168
169#define OP11( op )                                                                     \
170static inline void ntt_##op(struct ntt_compile *c,                                     \
171                     struct ureg_dst dst,                                              \
172                     struct ureg_src src0)                                             \
173{                                                                                      \
174   ntt_insn(c, TGSI_OPCODE_##op, dst, src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
175}
176
177#define OP12( op )                                                                     \
178static inline void ntt_##op(struct ntt_compile *c,                                     \
179                     struct ureg_dst dst,                                              \
180                     struct ureg_src src0,                                             \
181                     struct ureg_src src1)                                             \
182{                                                                                      \
183   ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, ureg_src_undef(), ureg_src_undef()); \
184}
185
186#define OP13( op )                                                                     \
187static inline void ntt_##op(struct ntt_compile *c,                                     \
188                     struct ureg_dst dst,                                              \
189                     struct ureg_src src0,                                             \
190                     struct ureg_src src1,                                             \
191                     struct ureg_src src2)                                             \
192{                                                                                      \
193   ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, ureg_src_undef());             \
194}
195
196#define OP14( op )                                                                     \
197static inline void ntt_##op(struct ntt_compile *c,                                     \
198                     struct ureg_dst dst,                                              \
199                     struct ureg_src src0,                                             \
200                     struct ureg_src src1,                                             \
201                     struct ureg_src src2,                                             \
202                     struct ureg_src src3)                                             \
203{                                                                                      \
204   ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, src3);                         \
205}
206
207/* We hand-craft our tex instructions */
208#define OP12_TEX(op)
209#define OP14_TEX(op)
210
211/* Use a template include to generate a correctly-typed ntt_OP()
212 * function for each TGSI opcode:
213 */
214#include "gallium/auxiliary/tgsi/tgsi_opcode_tmp.h"
215
216/**
217 * Interprets a nir_load_const used as a NIR src as a uint.
218 *
219 * For non-native-integers drivers, nir_load_const_instrs used by an integer ALU
220 * instruction (or in a phi-web used by an integer ALU instruction) were
221 * converted to floats and the ALU instruction swapped to the float equivalent.
222 * However, this means that integer load_consts used by intrinsics (which don't
223 * normally get that conversion) may have been reformatted to be floats.  Given
224 * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
225 * we can just look and see if they look like floats and convert them back to
226 * ints.
227 */
228static uint32_t
229ntt_src_as_uint(struct ntt_compile *c, nir_src src)
230{
231   uint32_t val = nir_src_as_uint(src);
232   if (!c->native_integers && val >= fui(1.0))
233      val = (uint32_t)uif(val);
234   return val;
235}
236
237static unsigned
238ntt_64bit_write_mask(unsigned write_mask)
239{
240   return ((write_mask & 1) ? 0x3 : 0) | ((write_mask & 2) ? 0xc : 0);
241}
242
243static struct ureg_src
244ntt_64bit_1f(struct ntt_compile *c)
245{
246   return ureg_imm4u(c->ureg,
247                     0x00000000, 0x3ff00000,
248                     0x00000000, 0x3ff00000);
249}
250
251/* Per-channel masks of def/use within the block, and the per-channel
252 * livein/liveout for the block as a whole.
253 */
254struct ntt_live_reg_block_state {
255   uint8_t *def, *use, *livein, *liveout, *defin, *defout;
256};
257
258struct ntt_live_reg_state {
259   unsigned bitset_words;
260
261   struct ntt_reg_interval *regs;
262
263   /* Used in propagate_across_edge() */
264   BITSET_WORD *tmp_live;
265
266   struct ntt_live_reg_block_state *blocks;
267
268   nir_block_worklist worklist;
269};
270
271static void
272ntt_live_reg_mark_use(struct ntt_compile *c, struct ntt_live_reg_block_state *bs,
273                      int ip, unsigned index, unsigned used_mask)
274{
275   bs->use[index] |= used_mask & ~bs->def[index];
276
277   c->liveness[index].start = MIN2(c->liveness[index].start, ip);
278   c->liveness[index].end = MAX2(c->liveness[index].end, ip);
279
280}
281static void
282ntt_live_reg_setup_def_use(struct ntt_compile *c, nir_function_impl *impl, struct ntt_live_reg_state *state)
283{
284   for (int i = 0; i < impl->num_blocks; i++) {
285      state->blocks[i].def = rzalloc_array(state->blocks, uint8_t, c->num_temps);
286      state->blocks[i].defin = rzalloc_array(state->blocks, uint8_t, c->num_temps);
287      state->blocks[i].defout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
288      state->blocks[i].use = rzalloc_array(state->blocks, uint8_t, c->num_temps);
289      state->blocks[i].livein = rzalloc_array(state->blocks, uint8_t, c->num_temps);
290      state->blocks[i].liveout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
291   }
292
293   int ip = 0;
294   nir_foreach_block(block, impl) {
295      struct ntt_live_reg_block_state *bs = &state->blocks[block->index];
296      struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
297
298      ntt_block->start_ip = ip;
299
300      util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
301         const struct tgsi_opcode_info *opcode_info =
302            tgsi_get_opcode_info(insn->opcode);
303
304         /* Set up use[] for the srcs.
305          *
306          * Uses are the channels of the reg read in the block that don't have a
307          * preceding def to screen them off.  Note that we don't do per-element
308          * tracking of array regs, so they're never screened off.
309          */
310         for (int i = 0; i < opcode_info->num_src; i++) {
311            if (insn->src[i].File != TGSI_FILE_TEMPORARY)
312               continue;
313            int index = insn->src[i].Index;
314
315            uint32_t used_mask = tgsi_util_get_src_usage_mask(insn->opcode, i,
316                                                              insn->dst->WriteMask,
317                                                              insn->src[i].SwizzleX,
318                                                              insn->src[i].SwizzleY,
319                                                              insn->src[i].SwizzleZ,
320                                                              insn->src[i].SwizzleW,
321                                                              insn->tex_target,
322                                                              insn->tex_target);
323
324            assert(!insn->src[i].Indirect || index < c->first_non_array_temp);
325            ntt_live_reg_mark_use(c, bs, ip, index, used_mask);
326         }
327
328         if (insn->is_tex && insn->tex_offset.File == TGSI_FILE_TEMPORARY)
329            ntt_live_reg_mark_use(c, bs, ip, insn->tex_offset.Index, 0xf);
330
331         /* Set up def[] for the srcs.
332          *
333          * Defs are the unconditionally-written (not R/M/W) channels of the reg in
334          * the block that don't have a preceding use.
335          */
336         for (int i = 0; i < opcode_info->num_dst; i++) {
337            if (insn->dst[i].File != TGSI_FILE_TEMPORARY)
338               continue;
339            int index = insn->dst[i].Index;
340            uint32_t writemask = insn->dst[i].WriteMask;
341
342            bs->def[index] |= writemask & ~bs->use[index];
343            bs->defout[index] |= writemask;
344
345            assert(!insn->dst[i].Indirect || index < c->first_non_array_temp);
346            c->liveness[index].start = MIN2(c->liveness[index].start, ip);
347            c->liveness[index].end = MAX2(c->liveness[index].end, ip);
348         }
349         ip++;
350      }
351
352      ntt_block->end_ip = ip;
353   }
354}
355
356static void
357ntt_live_regs(struct ntt_compile *c, nir_function_impl *impl)
358{
359   nir_metadata_require(impl, nir_metadata_block_index);
360
361   c->liveness = rzalloc_array(c, struct ntt_reg_interval, c->num_temps);
362
363   struct ntt_live_reg_state state = {
364       .blocks = rzalloc_array(impl, struct ntt_live_reg_block_state, impl->num_blocks),
365   };
366
367   /* The intervals start out with start > end (indicating unused) */
368   for (int i = 0; i < c->num_temps; i++)
369      c->liveness[i].start = ~0;
370
371   ntt_live_reg_setup_def_use(c, impl, &state);
372
373   /* Make a forward-order worklist of all the blocks. */
374   nir_block_worklist_init(&state.worklist, impl->num_blocks, NULL);
375   nir_foreach_block(block, impl) {
376      nir_block_worklist_push_tail(&state.worklist, block);
377   }
378
379   /* Propagate defin/defout down the CFG to calculate the live variables
380    * potentially defined along any possible control flow path.  We'll use this
381    * to keep things like conditional defs of the reg (or array regs where we
382    * don't track defs!) from making the reg's live range extend back to the
383    * start of the program.
384    */
385   while (!nir_block_worklist_is_empty(&state.worklist)) {
386      nir_block *block = nir_block_worklist_pop_head(&state.worklist);
387      for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
388         nir_block *succ = block->successors[j];
389         if (!succ || succ->index == impl->num_blocks)
390            continue;
391
392         for (int i = 0; i < c->num_temps; i++) {
393            uint8_t new_def = state.blocks[block->index].defout[i] & ~state.blocks[succ->index].defin[i];
394
395            if (new_def) {
396               state.blocks[succ->index].defin[i] |= new_def;
397               state.blocks[succ->index].defout[i] |= new_def;
398               nir_block_worklist_push_tail(&state.worklist, succ);
399            }
400         }
401      }
402   }
403
404   /* Make a reverse-order worklist of all the blocks. */
405   nir_foreach_block(block, impl) {
406      nir_block_worklist_push_head(&state.worklist, block);
407   }
408
409   /* We're now ready to work through the worklist and update the liveness sets
410    * of each of the blocks.  As long as we keep the worklist up-to-date as we
411    * go, everything will get covered.
412    */
413   while (!nir_block_worklist_is_empty(&state.worklist)) {
414      /* We pop them off in the reverse order we pushed them on.  This way
415       * the first walk of the instructions is backwards so we only walk
416       * once in the case of no control flow.
417       */
418      nir_block *block = nir_block_worklist_pop_head(&state.worklist);
419      struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
420      struct ntt_live_reg_block_state *bs = &state.blocks[block->index];
421
422      for (int i = 0; i < c->num_temps; i++) {
423         /* Collect livein from our successors to include in our liveout. */
424         for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
425            nir_block *succ = block->successors[j];
426            if (!succ || succ->index == impl->num_blocks)
427               continue;
428            struct ntt_live_reg_block_state *sbs = &state.blocks[succ->index];
429
430            uint8_t new_liveout = sbs->livein[i] & ~bs->liveout[i];
431            if (new_liveout) {
432               if (state.blocks[block->index].defout[i])
433                  c->liveness[i].end = MAX2(c->liveness[i].end, ntt_block->end_ip);
434               bs->liveout[i] |= sbs->livein[i];
435            }
436         }
437
438         /* Propagate use requests from either our block's uses or our
439          * non-screened-off liveout up to our predecessors.
440          */
441         uint8_t new_livein = ((bs->use[i] | (bs->liveout[i] & ~bs->def[i])) &
442                               ~bs->livein[i]);
443         if (new_livein) {
444            bs->livein[i] |= new_livein;
445            set_foreach(block->predecessors, entry) {
446               nir_block *pred = (void *)entry->key;
447               nir_block_worklist_push_tail(&state.worklist, pred);
448            }
449
450            if (new_livein & state.blocks[block->index].defin[i])
451               c->liveness[i].start = MIN2(c->liveness[i].start, ntt_block->start_ip);
452         }
453      }
454   }
455
456   ralloc_free(state.blocks);
457   nir_block_worklist_fini(&state.worklist);
458}
459
460static void
461ntt_ra_check(struct ntt_compile *c, unsigned *ra_map, BITSET_WORD *released, int ip, unsigned index)
462{
463   if (index < c->first_non_array_temp)
464      return;
465
466   if (c->liveness[index].start == ip && ra_map[index] == ~0)
467      ra_map[index] = ureg_DECL_temporary(c->ureg).Index;
468
469   if (c->liveness[index].end == ip && !BITSET_TEST(released, index)) {
470      ureg_release_temporary(c->ureg, ureg_dst_register(TGSI_FILE_TEMPORARY, ra_map[index]));
471      BITSET_SET(released, index);
472   }
473}
474
475static void
476ntt_allocate_regs(struct ntt_compile *c, nir_function_impl *impl)
477{
478   ntt_live_regs(c, impl);
479
480   unsigned *ra_map = ralloc_array(c, unsigned, c->num_temps);
481   unsigned *released = rzalloc_array(c, BITSET_WORD, BITSET_WORDS(c->num_temps));
482
483   /* No RA on NIR array regs */
484   for (int i = 0; i < c->first_non_array_temp; i++)
485      ra_map[i] = i;
486
487   for (int i = c->first_non_array_temp; i < c->num_temps; i++)
488      ra_map[i] = ~0;
489
490   int ip = 0;
491   nir_foreach_block(block, impl) {
492      struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
493
494      for (int i = 0; i < c->num_temps; i++)
495         ntt_ra_check(c, ra_map, released, ip, i);
496
497      util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
498         const struct tgsi_opcode_info *opcode_info =
499            tgsi_get_opcode_info(insn->opcode);
500
501         for (int i = 0; i < opcode_info->num_src; i++) {
502            if (insn->src[i].File == TGSI_FILE_TEMPORARY) {
503               ntt_ra_check(c, ra_map, released, ip, insn->src[i].Index);
504               insn->src[i].Index = ra_map[insn->src[i].Index];
505            }
506         }
507
508         if (insn->is_tex && insn->tex_offset.File == TGSI_FILE_TEMPORARY) {
509            ntt_ra_check(c, ra_map, released, ip, insn->tex_offset.Index);
510            insn->tex_offset.Index = ra_map[insn->tex_offset.Index];
511         }
512
513         for (int i = 0; i < opcode_info->num_dst; i++) {
514            if (insn->dst[i].File == TGSI_FILE_TEMPORARY) {
515               ntt_ra_check(c, ra_map, released, ip, insn->dst[i].Index);
516               insn->dst[i].Index = ra_map[insn->dst[i].Index];
517            }
518         }
519         ip++;
520      }
521
522      for (int i = 0; i < c->num_temps; i++)
523         ntt_ra_check(c, ra_map, released, ip, i);
524   }
525}
526
527static void
528ntt_allocate_regs_unoptimized(struct ntt_compile *c, nir_function_impl *impl)
529{
530   for (int i = c->first_non_array_temp; i < c->num_temps; i++)
531      ureg_DECL_temporary(c->ureg);
532}
533
534
535/**
536 * Try to find an iadd of a constant value with a non-constant value in the
537 * nir_src's first component, returning the constant offset and replacing *src
538 * with the non-constant component.
539 */
540static const uint32_t
541ntt_extract_const_src_offset(nir_src *src)
542{
543   if (!src->is_ssa)
544      return 0;
545
546   nir_ssa_scalar s = nir_get_ssa_scalar(src->ssa, 0);
547
548   while (nir_ssa_scalar_is_alu(s)) {
549      nir_alu_instr *alu = nir_instr_as_alu(s.def->parent_instr);
550
551      for (int i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
552         if (!alu->src[i].src.is_ssa)
553            return 0;
554      }
555
556      if (alu->op == nir_op_iadd) {
557         for (int i = 0; i < 2; i++) {
558            nir_const_value *v = nir_src_as_const_value(alu->src[i].src);
559            if (v && !alu->src[i].negate && !alu->src[i].abs) {
560               *src = alu->src[1 - i].src;
561               return v[alu->src[i].swizzle[s.comp]].u32;
562            }
563         }
564
565         return 0;
566      }
567
568      /* We'd like to reuse nir_ssa_scalar_chase_movs(), but it assumes SSA and that
569       * seems reasonable for something used in inner loops of the compiler.
570       */
571      if (!nir_alu_instr_is_copy(alu))
572         return 0;
573
574      if (alu->op == nir_op_mov) {
575         s.def = alu->src[0].src.ssa;
576         s.comp = alu->src[0].swizzle[s.comp];
577      } else if (nir_op_is_vec(alu->op)) {
578         s.def = alu->src[s.comp].src.ssa;
579         s.comp = alu->src[s.comp].swizzle[0];
580      } else {
581         return 0;
582      }
583   }
584
585   return 0;
586}
587
588static const struct glsl_type *
589ntt_shader_input_type(struct ntt_compile *c,
590                      struct nir_variable *var)
591{
592   switch (c->s->info.stage) {
593   case MESA_SHADER_GEOMETRY:
594   case MESA_SHADER_TESS_EVAL:
595   case MESA_SHADER_TESS_CTRL:
596      if (glsl_type_is_array(var->type))
597         return glsl_get_array_element(var->type);
598      else
599         return var->type;
600   default:
601      return var->type;
602   }
603}
604
605static void
606ntt_get_gl_varying_semantic(struct ntt_compile *c, unsigned location,
607                            unsigned *semantic_name, unsigned *semantic_index)
608{
609   /* We want to use most of tgsi_get_gl_varying_semantic(), but the
610    * !texcoord shifting has already been applied, so avoid that.
611    */
612   if (!c->needs_texcoord_semantic &&
613       (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_PATCH0)) {
614      *semantic_name = TGSI_SEMANTIC_GENERIC;
615      *semantic_index = location - VARYING_SLOT_VAR0;
616      return;
617   }
618
619   tgsi_get_gl_varying_semantic(location, true,
620                                semantic_name, semantic_index);
621}
622
623/* TGSI varying declarations have a component usage mask associated (used by
624 * r600 and svga).
625 */
626static uint32_t
627ntt_tgsi_usage_mask(unsigned start_component, unsigned num_components,
628                    bool is_64)
629{
630   uint32_t usage_mask =
631      u_bit_consecutive(start_component, num_components);
632
633   if (is_64) {
634      if (start_component >= 2)
635         usage_mask >>= 2;
636
637      uint32_t tgsi_usage_mask = 0;
638
639      if (usage_mask & TGSI_WRITEMASK_X)
640         tgsi_usage_mask |= TGSI_WRITEMASK_XY;
641      if (usage_mask & TGSI_WRITEMASK_Y)
642         tgsi_usage_mask |= TGSI_WRITEMASK_ZW;
643
644      return tgsi_usage_mask;
645   } else {
646      return usage_mask;
647   }
648}
649
650/* TGSI varying declarations have a component usage mask associated (used by
651 * r600 and svga).
652 */
653static uint32_t
654ntt_tgsi_var_usage_mask(const struct nir_variable *var)
655{
656   const struct glsl_type *type_without_array =
657      glsl_without_array(var->type);
658   unsigned num_components = glsl_get_vector_elements(type_without_array);
659   if (num_components == 0) /* structs */
660      num_components = 4;
661
662   return ntt_tgsi_usage_mask(var->data.location_frac, num_components,
663                              glsl_type_is_64bit(type_without_array));
664}
665
666static struct ureg_dst
667ntt_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
668{
669   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
670   int base = nir_intrinsic_base(instr);
671   *frac = nir_intrinsic_component(instr);
672   bool is_64 = nir_src_bit_size(instr->src[0]) == 64;
673
674   struct ureg_dst out;
675   if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
676      unsigned semantic_name, semantic_index;
677      tgsi_get_gl_frag_result_semantic(semantics.location,
678                                       &semantic_name, &semantic_index);
679      semantic_index += semantics.dual_source_blend_index;
680
681      switch (semantics.location) {
682      case FRAG_RESULT_DEPTH:
683         *frac = 2; /* z write is the to the .z channel in TGSI */
684         break;
685      case FRAG_RESULT_STENCIL:
686         *frac = 1;
687         break;
688      default:
689         break;
690      }
691
692      out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
693   } else {
694      unsigned semantic_name, semantic_index;
695
696      ntt_get_gl_varying_semantic(c, semantics.location,
697                                  &semantic_name, &semantic_index);
698
699      uint32_t usage_mask = ntt_tgsi_usage_mask(*frac,
700                                                instr->num_components,
701                                                is_64);
702      uint32_t gs_streams = semantics.gs_streams;
703      for (int i = 0; i < 4; i++) {
704         if (!(usage_mask & (1 << i)))
705            gs_streams &= ~(0x3 << 2 * i);
706      }
707
708      /* No driver appears to use array_id of outputs. */
709      unsigned array_id = 0;
710
711      /* This bit is lost in the i/o semantics, but it's unused in in-tree
712       * drivers.
713       */
714      bool invariant = semantics.invariant;
715
716      out = ureg_DECL_output_layout(c->ureg,
717                                    semantic_name, semantic_index,
718                                    gs_streams,
719                                    base,
720                                    usage_mask,
721                                    array_id,
722                                    semantics.num_slots,
723                                    invariant);
724   }
725
726   unsigned write_mask;
727   if (nir_intrinsic_has_write_mask(instr))
728      write_mask = nir_intrinsic_write_mask(instr);
729   else
730      write_mask = ((1 << instr->num_components) - 1) << *frac;
731
732   if (is_64) {
733      write_mask = ntt_64bit_write_mask(write_mask);
734      if (*frac >= 2)
735         write_mask = write_mask << 2;
736   } else {
737      write_mask = write_mask << *frac;
738   }
739   return ureg_writemask(out, write_mask);
740}
741
742/* If this reg or SSA def is used only for storing an output, then in the simple
743 * cases we can write directly to the TGSI output instead of having store_output
744 * emit its own MOV.
745 */
746static bool
747ntt_try_store_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
748                             struct list_head *uses, struct list_head *if_uses)
749{
750   *dst = ureg_dst_undef();
751
752   switch (c->s->info.stage) {
753   case MESA_SHADER_FRAGMENT:
754   case MESA_SHADER_VERTEX:
755      break;
756   default:
757      /* tgsi_exec (at least) requires that output stores happen per vertex
758       * emitted, you don't get to reuse a previous output value for the next
759       * vertex.
760       */
761      return false;
762   }
763
764   if (!list_is_empty(if_uses) || !list_is_singular(uses))
765      return false;
766
767   nir_src *src = list_first_entry(uses, nir_src, use_link);
768
769   if (src->parent_instr->type != nir_instr_type_intrinsic)
770      return false;
771
772   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(src->parent_instr);
773   if (intr->intrinsic != nir_intrinsic_store_output ||
774       !nir_src_is_const(intr->src[1])) {
775      return false;
776   }
777
778   uint32_t frac;
779   *dst = ntt_output_decl(c, intr, &frac);
780   dst->Index += ntt_src_as_uint(c, intr->src[1]);
781
782   return frac == 0;
783}
784
785static void
786ntt_setup_inputs(struct ntt_compile *c)
787{
788   if (c->s->info.stage != MESA_SHADER_FRAGMENT)
789      return;
790
791   unsigned num_inputs = 0;
792   int num_input_arrays = 0;
793
794   nir_foreach_shader_in_variable(var, c->s) {
795      const struct glsl_type *type = ntt_shader_input_type(c, var);
796      unsigned array_len =
797         glsl_count_attribute_slots(type, false);
798
799      num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
800   }
801
802   c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);
803
804   nir_foreach_shader_in_variable(var, c->s) {
805      const struct glsl_type *type = ntt_shader_input_type(c, var);
806      unsigned array_len =
807         glsl_count_attribute_slots(type, false);
808
809      unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
810      unsigned sample_loc;
811      struct ureg_src decl;
812
813      if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
814         interpolation =
815            tgsi_get_interp_mode(var->data.interpolation,
816                                 var->data.location == VARYING_SLOT_COL0 ||
817                                 var->data.location == VARYING_SLOT_COL1);
818
819         if (var->data.location == VARYING_SLOT_POS)
820            interpolation = TGSI_INTERPOLATE_LINEAR;
821      }
822
823      unsigned semantic_name, semantic_index;
824      ntt_get_gl_varying_semantic(c, var->data.location,
825                                  &semantic_name, &semantic_index);
826
827      if (var->data.sample) {
828         sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
829      } else if (var->data.centroid) {
830         sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
831         c->centroid_inputs |= (BITSET_MASK(array_len) <<
832                                var->data.driver_location);
833      } else {
834         sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
835      }
836
837      unsigned array_id = 0;
838      if (glsl_type_is_array(type))
839         array_id = ++num_input_arrays;
840
841      uint32_t usage_mask = ntt_tgsi_var_usage_mask(var);
842
843      decl = ureg_DECL_fs_input_centroid_layout(c->ureg,
844                                                semantic_name,
845                                                semantic_index,
846                                                interpolation,
847                                                sample_loc,
848                                                var->data.driver_location,
849                                                usage_mask,
850                                                array_id, array_len);
851
852      if (semantic_name == TGSI_SEMANTIC_FACE) {
853         struct ureg_dst temp = ntt_temp(c);
854         if (c->native_integers) {
855            /* NIR is ~0 front and 0 back, while TGSI is +1 front */
856            ntt_SGE(c, temp, decl, ureg_imm1f(c->ureg, 0));
857         } else {
858            /* tgsi docs say that floating point FACE will be positive for
859             * frontface and negative for backface, but realistically
860             * GLSL-to-TGSI had been doing MOV_SAT to turn it into 0.0 vs 1.0.
861             * Copy that behavior, since some drivers (r300) have been doing a
862             * 0.0 vs 1.0 backface (and I don't think anybody has a non-1.0
863             * front face).
864             */
865            temp.Saturate = true;
866            ntt_MOV(c, temp, decl);
867
868         }
869         decl = ureg_src(temp);
870      }
871
872      for (unsigned i = 0; i < array_len; i++) {
873         c->input_index_map[var->data.driver_location + i] = decl;
874         c->input_index_map[var->data.driver_location + i].Index += i;
875      }
876   }
877}
878
879static int
880ntt_sort_by_location(const nir_variable *a, const nir_variable *b)
881{
882   return a->data.location - b->data.location;
883}
884
885/**
886 * Workaround for virglrenderer requiring that TGSI FS output color variables
887 * are declared in order.  Besides, it's a lot nicer to read the TGSI this way.
888 */
889static void
890ntt_setup_outputs(struct ntt_compile *c)
891{
892   if (c->s->info.stage != MESA_SHADER_FRAGMENT)
893      return;
894
895   nir_sort_variables_with_modes(c->s, ntt_sort_by_location, nir_var_shader_out);
896
897   nir_foreach_shader_out_variable(var, c->s) {
898      if (var->data.location == FRAG_RESULT_COLOR)
899         ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
900
901      unsigned semantic_name, semantic_index;
902      tgsi_get_gl_frag_result_semantic(var->data.location,
903                                       &semantic_name, &semantic_index);
904
905      (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index);
906   }
907}
908
909static enum tgsi_texture_type
910tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow)
911{
912   switch (dim) {
913   case GLSL_SAMPLER_DIM_1D:
914      if (is_shadow)
915         return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D;
916      else
917         return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
918   case GLSL_SAMPLER_DIM_2D:
919   case GLSL_SAMPLER_DIM_EXTERNAL:
920      if (is_shadow)
921         return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D;
922      else
923         return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
924   case GLSL_SAMPLER_DIM_3D:
925      return TGSI_TEXTURE_3D;
926   case GLSL_SAMPLER_DIM_CUBE:
927      if (is_shadow)
928         return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE;
929      else
930         return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
931   case GLSL_SAMPLER_DIM_RECT:
932      if (is_shadow)
933         return TGSI_TEXTURE_SHADOWRECT;
934      else
935         return TGSI_TEXTURE_RECT;
936   case GLSL_SAMPLER_DIM_MS:
937      return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
938   case GLSL_SAMPLER_DIM_BUF:
939      return TGSI_TEXTURE_BUFFER;
940   default:
941      unreachable("unknown sampler dim");
942   }
943}
944
945static enum tgsi_return_type
946tgsi_return_type_from_base_type(enum glsl_base_type type)
947{
948   switch (type) {
949   case GLSL_TYPE_INT:
950      return TGSI_RETURN_TYPE_SINT;
951   case GLSL_TYPE_UINT:
952      return TGSI_RETURN_TYPE_UINT;
953   case GLSL_TYPE_FLOAT:
954     return TGSI_RETURN_TYPE_FLOAT;
955   default:
956      unreachable("unexpected texture type");
957   }
958}
959
960static void
961ntt_setup_uniforms(struct ntt_compile *c)
962{
963   nir_foreach_uniform_variable(var, c->s) {
964      if (glsl_type_is_sampler(glsl_without_array(var->type)) ||
965          glsl_type_is_texture(glsl_without_array(var->type))) {
966         /* Don't use this size for the check for samplers -- arrays of structs
967          * containing samplers should be ignored, and just the separate lowered
968          * sampler uniform decl used.
969          */
970         int size = glsl_type_get_sampler_count(var->type) +
971                    glsl_type_get_texture_count(var->type);
972
973         const struct glsl_type *stype = glsl_without_array(var->type);
974         enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype),
975                                                                            glsl_sampler_type_is_array(stype),
976                                                                            glsl_sampler_type_is_shadow(stype));
977         enum tgsi_return_type ret_type = tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype));
978         for (int i = 0; i < size; i++) {
979            ureg_DECL_sampler_view(c->ureg, var->data.binding + i,
980               target, ret_type, ret_type, ret_type, ret_type);
981            ureg_DECL_sampler(c->ureg, var->data.binding + i);
982         }
983      } else if (glsl_contains_atomic(var->type)) {
984         uint32_t offset = var->data.offset / 4;
985         uint32_t size = glsl_atomic_size(var->type) / 4;
986         ureg_DECL_hw_atomic(c->ureg, offset, offset + size - 1, var->data.binding, 0);
987      }
988
989      /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0
990       * size declaration happens with other UBOs below.
991       */
992   }
993
994   nir_foreach_image_variable(var, c->s) {
995      int image_count = glsl_type_get_image_count(var->type);
996      const struct glsl_type *itype = glsl_without_array(var->type);
997      enum tgsi_texture_type tex_type =
998            tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(itype),
999                                               glsl_sampler_type_is_array(itype), false);
1000
1001      for (int i = 0; i < image_count; i++) {
1002         c->images[var->data.binding] = ureg_DECL_image(c->ureg,
1003                                                        var->data.binding + i,
1004                                                        tex_type,
1005                                                        var->data.image.format,
1006                                                        !(var->data.access & ACCESS_NON_WRITEABLE),
1007                                                        false);
1008      }
1009   }
1010
1011   c->first_ubo = ~0;
1012
1013   unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0};
1014   nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
1015      int ubo = var->data.driver_location;
1016      if (ubo == -1)
1017         continue;
1018
1019      if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo))
1020         c->first_ubo = MIN2(c->first_ubo, ubo);
1021
1022      unsigned size = glsl_get_explicit_size(var->interface_type, false);
1023
1024      int array_size = 1;
1025      if (glsl_type_is_interface(glsl_without_array(var->type)))
1026         array_size = MAX2(1, glsl_get_aoa_size(var->type));
1027
1028      for (int i = 0; i < array_size; i++) {
1029         /* Even if multiple NIR variables are in the same uniform block, their
1030          * explicit size is the size of the block.
1031          */
1032         if (ubo_sizes[ubo + i])
1033            assert(ubo_sizes[ubo + i] == size);
1034
1035         ubo_sizes[ubo + i] = size;
1036      }
1037   }
1038
1039   for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) {
1040      if (ubo_sizes[i])
1041         ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i);
1042   }
1043
1044   for (int i = 0; i < c->s->info.num_ssbos; i++) {
1045      /* XXX: nv50 uses the atomic flag to set caching for (lowered) atomic
1046       * counters
1047       */
1048      bool atomic = false;
1049      ureg_DECL_buffer(c->ureg, i, atomic);
1050   }
1051}
1052
1053static void
1054ntt_setup_registers(struct ntt_compile *c, struct exec_list *list)
1055{
1056   assert(c->num_temps == 0);
1057   /* Permanently allocate all the array regs at the start. */
1058   foreach_list_typed(nir_register, nir_reg, node, list) {
1059      if (nir_reg->num_array_elems != 0) {
1060         struct ureg_dst decl = ureg_DECL_array_temporary(c->ureg, nir_reg->num_array_elems, true);
1061         c->reg_temp[nir_reg->index] = decl;
1062         assert(c->num_temps == decl.Index);
1063         c->num_temps += nir_reg->num_array_elems;
1064      }
1065   }
1066   c->first_non_array_temp = c->num_temps;
1067
1068   /* After that, allocate non-array regs in our virtual space that we'll
1069    * register-allocate before ureg emit.
1070    */
1071   foreach_list_typed(nir_register, nir_reg, node, list) {
1072      if (nir_reg->num_array_elems == 0) {
1073         struct ureg_dst decl;
1074         uint32_t write_mask = BITFIELD_MASK(nir_reg->num_components);
1075         if (!ntt_try_store_in_tgsi_output(c, &decl, &nir_reg->uses, &nir_reg->if_uses)) {
1076            if (nir_reg->bit_size == 64) {
1077               if (nir_reg->num_components > 2) {
1078                  fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
1079                        nir_reg->num_components, nir_reg->index);
1080               }
1081
1082               write_mask = ntt_64bit_write_mask(write_mask);
1083            }
1084
1085            decl = ureg_writemask(ntt_temp(c), write_mask);
1086         }
1087         c->reg_temp[nir_reg->index] = decl;
1088      }
1089   }
1090}
1091
1092static struct ureg_src
1093ntt_get_load_const_src(struct ntt_compile *c, nir_load_const_instr *instr)
1094{
1095   int num_components = instr->def.num_components;
1096
1097   if (!c->native_integers) {
1098      float values[4];
1099      assert(instr->def.bit_size == 32);
1100      for (int i = 0; i < num_components; i++)
1101         values[i] = uif(instr->value[i].u32);
1102
1103      return ureg_DECL_immediate(c->ureg, values, num_components);
1104   } else {
1105      uint32_t values[4];
1106
1107      if (instr->def.bit_size == 32) {
1108         for (int i = 0; i < num_components; i++)
1109            values[i] = instr->value[i].u32;
1110      } else {
1111         assert(num_components <= 2);
1112         for (int i = 0; i < num_components; i++) {
1113            values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff;
1114            values[i * 2 + 1] = instr->value[i].u64 >> 32;
1115         }
1116         num_components *= 2;
1117      }
1118
1119      return ureg_DECL_immediate_uint(c->ureg, values, num_components);
1120   }
1121}
1122
1123static struct ureg_src
1124ntt_reladdr(struct ntt_compile *c, struct ureg_src addr, int addr_index)
1125{
1126   assert(addr_index < ARRAY_SIZE(c->addr_reg));
1127
1128   for (int i = 0; i <= addr_index; i++) {
1129      if (!c->addr_declared[i]) {
1130         c->addr_reg[i] = ureg_writemask(ureg_DECL_address(c->ureg),
1131                                             TGSI_WRITEMASK_X);
1132         c->addr_declared[i] = true;
1133      }
1134   }
1135
1136   if (c->native_integers)
1137      ntt_UARL(c, c->addr_reg[addr_index], addr);
1138   else
1139      ntt_ARL(c, c->addr_reg[addr_index], addr);
1140   return ureg_scalar(ureg_src(c->addr_reg[addr_index]), 0);
1141}
1142
1143static struct ureg_src
1144ntt_get_src(struct ntt_compile *c, nir_src src)
1145{
1146   if (src.is_ssa) {
1147      if (src.ssa->parent_instr->type == nir_instr_type_load_const)
1148         return ntt_get_load_const_src(c, nir_instr_as_load_const(src.ssa->parent_instr));
1149
1150      return c->ssa_temp[src.ssa->index];
1151   } else {
1152      nir_register *reg = src.reg.reg;
1153      struct ureg_dst reg_temp = c->reg_temp[reg->index];
1154      reg_temp.Index += src.reg.base_offset;
1155
1156      if (src.reg.indirect) {
1157         struct ureg_src offset = ntt_get_src(c, *src.reg.indirect);
1158         return ureg_src_indirect(ureg_src(reg_temp),
1159                                  ntt_reladdr(c, offset, 0));
1160      } else {
1161         return ureg_src(reg_temp);
1162      }
1163   }
1164}
1165
1166static struct ureg_src
1167ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i)
1168{
1169   nir_alu_src src = instr->src[i];
1170   struct ureg_src usrc = ntt_get_src(c, src.src);
1171
1172   /* Expand double/dvec2 src references to TGSI swizzles using a pair of 32-bit
1173    * channels.  We skip this for undefs, as those don't get split to vec2s (but
1174    * the specific swizzles from an undef don't matter)
1175    */
1176   if (nir_src_bit_size(src.src) == 64 &&
1177      !(src.src.is_ssa && src.src.ssa->parent_instr->type == nir_instr_type_ssa_undef)) {
1178      int chan0 = 0, chan1 = 1;
1179      if (nir_op_infos[instr->op].input_sizes[i] == 0) {
1180         chan0 = ffs(instr->dest.write_mask) - 1;
1181         chan1 = ffs(instr->dest.write_mask & ~(1 << chan0)) - 1;
1182         if (chan1 == -1)
1183            chan1 = chan0;
1184      }
1185      usrc = ureg_swizzle(usrc,
1186                          src.swizzle[chan0] * 2,
1187                          src.swizzle[chan0] * 2 + 1,
1188                          src.swizzle[chan1] * 2,
1189                          src.swizzle[chan1] * 2 + 1);
1190   } else {
1191      usrc = ureg_swizzle(usrc,
1192                          src.swizzle[0],
1193                          src.swizzle[1],
1194                          src.swizzle[2],
1195                          src.swizzle[3]);
1196   }
1197
1198   if (src.abs)
1199      usrc = ureg_abs(usrc);
1200   if (src.negate)
1201      usrc = ureg_negate(usrc);
1202
1203   return usrc;
1204}
1205
1206/* Reswizzles a source so that the unset channels in the write mask still refer
1207 * to one of the channels present in the write mask.
1208 */
1209static struct ureg_src
1210ntt_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
1211{
1212   assert(write_mask);
1213   int first_chan = ffs(write_mask) - 1;
1214   return ureg_swizzle(src,
1215                       (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan,
1216                       (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan,
1217                       (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan,
1218                       (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
1219}
1220
1221static struct ureg_dst
1222ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa)
1223{
1224   uint32_t writemask = BITSET_MASK(ssa->num_components);
1225   if (ssa->bit_size == 64)
1226      writemask = ntt_64bit_write_mask(writemask);
1227
1228   struct ureg_dst dst;
1229   if (!ntt_try_store_in_tgsi_output(c, &dst, &ssa->uses, &ssa->if_uses))
1230      dst = ntt_temp(c);
1231
1232   c->ssa_temp[ssa->index] = ntt_swizzle_for_write_mask(ureg_src(dst), writemask);
1233
1234   return ureg_writemask(dst, writemask);
1235}
1236
1237static struct ureg_dst
1238ntt_get_dest_decl(struct ntt_compile *c, nir_dest *dest)
1239{
1240   if (dest->is_ssa)
1241      return ntt_get_ssa_def_decl(c, &dest->ssa);
1242   else
1243      return c->reg_temp[dest->reg.reg->index];
1244}
1245
1246static struct ureg_dst
1247ntt_get_dest(struct ntt_compile *c, nir_dest *dest)
1248{
1249   struct ureg_dst dst = ntt_get_dest_decl(c, dest);
1250
1251   if (!dest->is_ssa) {
1252      dst.Index += dest->reg.base_offset;
1253
1254      if (dest->reg.indirect) {
1255         struct ureg_src offset = ntt_get_src(c, *dest->reg.indirect);
1256         dst = ureg_dst_indirect(dst, ntt_reladdr(c, offset, 0));
1257      }
1258   }
1259
1260   return dst;
1261}
1262
1263/* For an SSA dest being populated by a constant src, replace the storage with
1264 * a copy of the ureg_src.
1265 */
1266static void
1267ntt_store_def(struct ntt_compile *c, nir_ssa_def *def, struct ureg_src src)
1268{
1269   if (!src.Indirect && !src.DimIndirect) {
1270      switch (src.File) {
1271      case TGSI_FILE_IMMEDIATE:
1272      case TGSI_FILE_INPUT:
1273      case TGSI_FILE_CONSTANT:
1274      case TGSI_FILE_SYSTEM_VALUE:
1275         c->ssa_temp[def->index] = src;
1276         return;
1277      }
1278   }
1279
1280   ntt_MOV(c, ntt_get_ssa_def_decl(c, def), src);
1281}
1282
1283static void
1284ntt_store(struct ntt_compile *c, nir_dest *dest, struct ureg_src src)
1285{
1286   if (dest->is_ssa)
1287      ntt_store_def(c, &dest->ssa, src);
1288   else {
1289      struct ureg_dst dst = ntt_get_dest(c, dest);
1290      ntt_MOV(c, dst, src);
1291   }
1292}
1293
1294static void
1295ntt_emit_scalar(struct ntt_compile *c, unsigned tgsi_op,
1296                struct ureg_dst dst,
1297                struct ureg_src src0,
1298                struct ureg_src src1)
1299{
1300   unsigned i;
1301
1302   /* POW is the only 2-operand scalar op. */
1303   if (tgsi_op != TGSI_OPCODE_POW)
1304      src1 = src0;
1305
1306   for (i = 0; i < 4; i++) {
1307      if (dst.WriteMask & (1 << i)) {
1308         ntt_insn(c, tgsi_op,
1309                  ureg_writemask(dst, 1 << i),
1310                  ureg_scalar(src0, i),
1311                  ureg_scalar(src1, i),
1312                  ureg_src_undef(), ureg_src_undef());
1313      }
1314   }
1315}
1316
1317static void
1318ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
1319{
1320   struct ureg_src src[4];
1321   struct ureg_dst dst;
1322   unsigned i;
1323   int dst_64 = nir_dest_bit_size(instr->dest.dest) == 64;
1324   int src_64 = nir_src_bit_size(instr->src[0].src) == 64;
1325   int num_srcs = nir_op_infos[instr->op].num_inputs;
1326
1327   c->precise = instr->exact;
1328
1329   assert(num_srcs <= ARRAY_SIZE(src));
1330   for (i = 0; i < num_srcs; i++)
1331      src[i] = ntt_get_alu_src(c, instr, i);
1332   for (; i < ARRAY_SIZE(src); i++)
1333      src[i] = ureg_src_undef();
1334
1335   dst = ntt_get_dest(c, &instr->dest.dest);
1336
1337   if (instr->dest.saturate)
1338      dst.Saturate = true;
1339
1340   if (dst_64)
1341      dst = ureg_writemask(dst, ntt_64bit_write_mask(instr->dest.write_mask));
1342   else
1343      dst = ureg_writemask(dst, instr->dest.write_mask);
1344
1345   static enum tgsi_opcode op_map[][2] = {
1346      [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV },
1347
1348      /* fabs/fneg 32-bit are special-cased below. */
1349      [nir_op_fabs] = { 0, TGSI_OPCODE_DABS },
1350      [nir_op_fneg] = { 0, TGSI_OPCODE_DNEG },
1351
1352      [nir_op_fdot2] = { TGSI_OPCODE_DP2 },
1353      [nir_op_fdot3] = { TGSI_OPCODE_DP3 },
1354      [nir_op_fdot4] = { TGSI_OPCODE_DP4 },
1355      [nir_op_fdot2_replicated] = { TGSI_OPCODE_DP2 },
1356      [nir_op_fdot3_replicated] = { TGSI_OPCODE_DP3 },
1357      [nir_op_fdot4_replicated] = { TGSI_OPCODE_DP4 },
1358      [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR },
1359      [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC },
1360      [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL },
1361      [nir_op_fround_even] = { TGSI_OPCODE_ROUND, TGSI_OPCODE_DROUND },
1362      [nir_op_fdiv] = { TGSI_OPCODE_DIV, TGSI_OPCODE_DDIV },
1363      [nir_op_idiv] = { TGSI_OPCODE_IDIV, TGSI_OPCODE_I64DIV },
1364      [nir_op_udiv] = { TGSI_OPCODE_UDIV, TGSI_OPCODE_U64DIV },
1365
1366      [nir_op_frcp] = { 0, TGSI_OPCODE_DRCP },
1367      [nir_op_frsq] = { 0, TGSI_OPCODE_DRSQ },
1368      [nir_op_fsqrt] = { 0, TGSI_OPCODE_DSQRT },
1369
1370      /* The conversions will have one combination of src and dst bitsize. */
1371      [nir_op_f2f32] = { 0, TGSI_OPCODE_D2F },
1372      [nir_op_f2f64] = { TGSI_OPCODE_F2D },
1373      [nir_op_i2i64] = { TGSI_OPCODE_I2I64 },
1374
1375      [nir_op_f2i32] = { TGSI_OPCODE_F2I, TGSI_OPCODE_D2I },
1376      [nir_op_f2i64] = { TGSI_OPCODE_F2I64, TGSI_OPCODE_D2I64 },
1377      [nir_op_f2u32] = { TGSI_OPCODE_F2U, TGSI_OPCODE_D2U },
1378      [nir_op_f2u64] = { TGSI_OPCODE_F2U64, TGSI_OPCODE_D2U64 },
1379      [nir_op_i2f32] = { TGSI_OPCODE_I2F, TGSI_OPCODE_I642F },
1380      [nir_op_i2f64] = { TGSI_OPCODE_I2D, TGSI_OPCODE_I642D },
1381      [nir_op_u2f32] = { TGSI_OPCODE_U2F, TGSI_OPCODE_U642F },
1382      [nir_op_u2f64] = { TGSI_OPCODE_U2D, TGSI_OPCODE_U642D },
1383
1384      [nir_op_slt] = { TGSI_OPCODE_SLT },
1385      [nir_op_sge] = { TGSI_OPCODE_SGE },
1386      [nir_op_seq] = { TGSI_OPCODE_SEQ },
1387      [nir_op_sne] = { TGSI_OPCODE_SNE },
1388
1389      [nir_op_flt32] = { TGSI_OPCODE_FSLT, TGSI_OPCODE_DSLT },
1390      [nir_op_fge32] = { TGSI_OPCODE_FSGE, TGSI_OPCODE_DSGE },
1391      [nir_op_feq32] = { TGSI_OPCODE_FSEQ, TGSI_OPCODE_DSEQ },
1392      [nir_op_fneu32] = { TGSI_OPCODE_FSNE, TGSI_OPCODE_DSNE },
1393
1394      [nir_op_ilt32] = { TGSI_OPCODE_ISLT, TGSI_OPCODE_I64SLT },
1395      [nir_op_ige32] = { TGSI_OPCODE_ISGE, TGSI_OPCODE_I64SGE },
1396      [nir_op_ieq32] = { TGSI_OPCODE_USEQ, TGSI_OPCODE_U64SEQ },
1397      [nir_op_ine32] = { TGSI_OPCODE_USNE, TGSI_OPCODE_U64SNE },
1398
1399      [nir_op_ult32] = { TGSI_OPCODE_USLT, TGSI_OPCODE_U64SLT },
1400      [nir_op_uge32] = { TGSI_OPCODE_USGE, TGSI_OPCODE_U64SGE },
1401
1402      [nir_op_iabs] = { TGSI_OPCODE_IABS, TGSI_OPCODE_I64ABS },
1403      [nir_op_ineg] = { TGSI_OPCODE_INEG, TGSI_OPCODE_I64NEG },
1404      [nir_op_fsign] = { TGSI_OPCODE_SSG },
1405      [nir_op_isign] = { TGSI_OPCODE_ISSG },
1406      [nir_op_ftrunc] = { TGSI_OPCODE_TRUNC, TGSI_OPCODE_DTRUNC },
1407      [nir_op_fddx] = { TGSI_OPCODE_DDX },
1408      [nir_op_fddy] = { TGSI_OPCODE_DDY },
1409      [nir_op_fddx_coarse] = { TGSI_OPCODE_DDX },
1410      [nir_op_fddy_coarse] = { TGSI_OPCODE_DDY },
1411      [nir_op_fddx_fine] = { TGSI_OPCODE_DDX_FINE },
1412      [nir_op_fddy_fine] = { TGSI_OPCODE_DDY_FINE },
1413      [nir_op_pack_half_2x16] = { TGSI_OPCODE_PK2H },
1414      [nir_op_unpack_half_2x16] = { TGSI_OPCODE_UP2H },
1415      [nir_op_ibitfield_extract] = { TGSI_OPCODE_IBFE },
1416      [nir_op_ubitfield_extract] = { TGSI_OPCODE_UBFE },
1417      [nir_op_bitfield_insert] = { TGSI_OPCODE_BFI },
1418      [nir_op_bitfield_reverse] = { TGSI_OPCODE_BREV },
1419      [nir_op_bit_count] = { TGSI_OPCODE_POPC },
1420      [nir_op_ifind_msb] = { TGSI_OPCODE_IMSB },
1421      [nir_op_ufind_msb] = { TGSI_OPCODE_UMSB },
1422      [nir_op_find_lsb] = { TGSI_OPCODE_LSB },
1423      [nir_op_fadd] = { TGSI_OPCODE_ADD, TGSI_OPCODE_DADD },
1424      [nir_op_iadd] = { TGSI_OPCODE_UADD, TGSI_OPCODE_U64ADD },
1425      [nir_op_fmul] = { TGSI_OPCODE_MUL, TGSI_OPCODE_DMUL },
1426      [nir_op_imul] = { TGSI_OPCODE_UMUL, TGSI_OPCODE_U64MUL },
1427      [nir_op_imod] = { TGSI_OPCODE_MOD, TGSI_OPCODE_I64MOD },
1428      [nir_op_umod] = { TGSI_OPCODE_UMOD, TGSI_OPCODE_U64MOD },
1429      [nir_op_imul_high] = { TGSI_OPCODE_IMUL_HI },
1430      [nir_op_umul_high] = { TGSI_OPCODE_UMUL_HI },
1431      [nir_op_ishl] = { TGSI_OPCODE_SHL, TGSI_OPCODE_U64SHL },
1432      [nir_op_ishr] = { TGSI_OPCODE_ISHR, TGSI_OPCODE_I64SHR },
1433      [nir_op_ushr] = { TGSI_OPCODE_USHR, TGSI_OPCODE_U64SHR },
1434
1435      /* These bitwise ops don't care about 32 vs 64 types, so they have the
1436       * same TGSI op.
1437       */
1438      [nir_op_inot] = { TGSI_OPCODE_NOT, TGSI_OPCODE_NOT },
1439      [nir_op_iand] = { TGSI_OPCODE_AND, TGSI_OPCODE_AND },
1440      [nir_op_ior] = { TGSI_OPCODE_OR, TGSI_OPCODE_OR },
1441      [nir_op_ixor] = { TGSI_OPCODE_XOR, TGSI_OPCODE_XOR },
1442
1443      [nir_op_fmin] = { TGSI_OPCODE_MIN, TGSI_OPCODE_DMIN },
1444      [nir_op_imin] = { TGSI_OPCODE_IMIN, TGSI_OPCODE_I64MIN },
1445      [nir_op_umin] = { TGSI_OPCODE_UMIN, TGSI_OPCODE_U64MIN },
1446      [nir_op_fmax] = { TGSI_OPCODE_MAX, TGSI_OPCODE_DMAX },
1447      [nir_op_imax] = { TGSI_OPCODE_IMAX, TGSI_OPCODE_I64MAX },
1448      [nir_op_umax] = { TGSI_OPCODE_UMAX, TGSI_OPCODE_U64MAX },
1449      [nir_op_ffma] = { TGSI_OPCODE_MAD, TGSI_OPCODE_DMAD },
1450      [nir_op_ldexp] = { TGSI_OPCODE_LDEXP, 0 },
1451   };
1452
1453   if (src_64 && !dst_64) {
1454      if (num_srcs == 2 || nir_op_infos[instr->op].output_type == nir_type_bool32) {
1455         /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz instead
1456         * of .xy.
1457         */
1458         assert(!(dst.WriteMask & TGSI_WRITEMASK_YW));
1459      } else {
1460         /* TGSI 64bit-to-32-bit conversions only generate results in the .xy
1461         * channels and will need to get fixed up.
1462         */
1463        assert(!(dst.WriteMask & TGSI_WRITEMASK_ZW));
1464      }
1465   }
1466
1467   bool table_op64 = src_64;
1468   if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op][table_op64] != 0) {
1469      /* The normal path for NIR to TGSI ALU op translation */
1470      ntt_insn(c, op_map[instr->op][table_op64],
1471                dst, src[0], src[1], src[2], src[3]);
1472   } else {
1473      /* Special cases for NIR to TGSI ALU op translation. */
1474
1475      /* TODO: Use something like the ntt_store() path for the MOV calls so we
1476       * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
1477       */
1478
1479      switch (instr->op) {
1480      case nir_op_u2u64:
1481         ntt_AND(c, dst, ureg_swizzle(src[0],
1482                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1483                                             TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1484                  ureg_imm4u(c->ureg, ~0, 0, ~0, 0));
1485         break;
1486
1487      case nir_op_i2i32:
1488      case nir_op_u2u32:
1489         assert(src_64);
1490         ntt_MOV(c, dst, ureg_swizzle(src[0],
1491                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
1492                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_X));
1493         break;
1494
1495      case nir_op_fabs:
1496         if (c->options->lower_fabs)
1497            ntt_MAX(c, dst, src[0], ureg_negate(src[0]));
1498         else
1499            ntt_MOV(c, dst, ureg_abs(src[0]));
1500         break;
1501
1502      case nir_op_fsat:
1503         if (dst_64) {
1504            ntt_MIN(c, dst, src[0], ntt_64bit_1f(c));
1505            ntt_MAX(c, dst, ureg_src(dst), ureg_imm1u(c->ureg, 0));
1506         } else {
1507            ntt_MOV(c, ureg_saturate(dst), src[0]);
1508         }
1509         break;
1510
1511      case nir_op_fneg:
1512         ntt_MOV(c, dst, ureg_negate(src[0]));
1513         break;
1514
1515         /* NOTE: TGSI 32-bit math ops have the old "one source channel
1516          * replicated to all dst channels" behavior, while 64 is normal mapping
1517          * of src channels to dst.
1518          */
1519      case nir_op_frcp:
1520         assert(!dst_64);
1521         ntt_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], ureg_src_undef());
1522         break;
1523
1524      case nir_op_frsq:
1525         assert(!dst_64);
1526         ntt_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], ureg_src_undef());
1527         break;
1528
1529      case nir_op_fsqrt:
1530         assert(!dst_64);
1531         ntt_emit_scalar(c, TGSI_OPCODE_SQRT, dst, src[0], ureg_src_undef());
1532         break;
1533
1534      case nir_op_fexp2:
1535         assert(!dst_64);
1536         ntt_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], ureg_src_undef());
1537         break;
1538
1539      case nir_op_flog2:
1540         assert(!dst_64);
1541         ntt_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], ureg_src_undef());
1542         break;
1543
1544      case nir_op_b2f32:
1545         ntt_AND(c, dst, src[0], ureg_imm1f(c->ureg, 1.0));
1546         break;
1547
1548      case nir_op_b2f64:
1549         ntt_AND(c, dst,
1550                  ureg_swizzle(src[0],
1551                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1552                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1553                  ntt_64bit_1f(c));
1554         break;
1555
1556      case nir_op_f2b32:
1557         if (src_64)
1558            ntt_DSNE(c, dst, src[0], ureg_imm1f(c->ureg, 0));
1559         else
1560            ntt_FSNE(c, dst, src[0], ureg_imm1f(c->ureg, 0));
1561         break;
1562
1563      case nir_op_i2b32:
1564         if (src_64) {
1565            ntt_U64SNE(c, dst, src[0], ureg_imm1u(c->ureg, 0));
1566         } else
1567            ntt_USNE(c, dst, src[0], ureg_imm1u(c->ureg, 0));
1568         break;
1569
1570      case nir_op_b2i32:
1571         ntt_AND(c, dst, src[0], ureg_imm1u(c->ureg, 1));
1572         break;
1573
1574      case nir_op_b2i64:
1575         ntt_AND(c, dst,
1576                  ureg_swizzle(src[0],
1577                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1578                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1579                  ureg_imm4u(c->ureg, 1, 0, 1, 0));
1580         break;
1581
1582      case nir_op_fsin:
1583         ntt_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], ureg_src_undef());
1584         break;
1585
1586      case nir_op_fcos:
1587         ntt_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], ureg_src_undef());
1588         break;
1589
1590      case nir_op_fsub:
1591         assert(!dst_64);
1592         ntt_ADD(c, dst, src[0], ureg_negate(src[1]));
1593         break;
1594
1595      case nir_op_isub:
1596         assert(!dst_64);
1597         ntt_UADD(c, dst, src[0], ureg_negate(src[1]));
1598         break;
1599
1600      case nir_op_fmod:
1601         unreachable("should be handled by .lower_fmod = true");
1602         break;
1603
1604      case nir_op_fpow:
1605         ntt_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
1606         break;
1607
1608      case nir_op_flrp:
1609         ntt_LRP(c, dst, src[2], src[1], src[0]);
1610         break;
1611
1612      case nir_op_pack_64_2x32_split:
1613         ntt_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_XZ),
1614                  ureg_swizzle(src[0],
1615                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1616                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1617         ntt_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_YW),
1618                  ureg_swizzle(src[1],
1619                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1620                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1621         break;
1622
1623      case nir_op_unpack_64_2x32_split_x:
1624         ntt_MOV(c, dst, ureg_swizzle(src[0],
1625                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
1626                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z));
1627         break;
1628
1629      case nir_op_unpack_64_2x32_split_y:
1630         ntt_MOV(c, dst, ureg_swizzle(src[0],
1631                                             TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W,
1632                                             TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W));
1633         break;
1634
1635      case nir_op_b32csel:
1636         if (nir_src_bit_size(instr->src[1].src) == 64) {
1637            ntt_UCMP(c, dst, ureg_swizzle(src[0],
1638                                                 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1639                                                 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1640                      src[1], src[2]);
1641         } else {
1642            ntt_UCMP(c, dst, src[0], src[1], src[2]);
1643         }
1644         break;
1645
1646      case nir_op_fcsel:
1647         /* NIR fcsel is src0 != 0 ? src1 : src2.
1648          * TGSI CMP is src0 < 0 ? src1 : src2.
1649          *
1650          * However, fcsel so far as I can find only appears on bools-as-floats
1651          * (1.0 or 0.0), so we can just negate it for the TGSI op.  It's
1652          * important to not have an abs here, as i915g has to make extra
1653          * instructions to do the abs.
1654          */
1655         if (c->options->lower_cmp) {
1656            /* If the HW doesn't support TGSI CMP (r300 VS), then lower it to a
1657             * LRP on the boolean 1.0/0.0 value, instead of requiring the
1658             * backend to turn the src0 into 1.0/0.0 first.
1659             *
1660             * We don't use this in general because some hardware (i915 FS) the
1661             * LRP gets expanded to MUL/MAD.
1662             */
1663            ntt_LRP(c, dst, src[0], src[1], src[2]);
1664         } else {
1665            ntt_CMP(c, dst, ureg_negate(src[0]), src[1], src[2]);
1666         }
1667         break;
1668
1669         /* It would be nice if we could get this left as scalar in NIR, since
1670          * the TGSI op is scalar.
1671          */
1672      case nir_op_frexp_sig:
1673      case nir_op_frexp_exp: {
1674         assert(src_64);
1675         struct ureg_dst temp = ntt_temp(c);
1676
1677         for (int chan = 0; chan < 2; chan++) {
1678            int wm = 1 << chan;
1679
1680            if (!(instr->dest.write_mask & wm))
1681               continue;
1682
1683            struct ureg_dst dsts[2] = { temp, temp };
1684            if (instr->op == nir_op_frexp_sig) {
1685               dsts[0] = ureg_writemask(dst, ntt_64bit_write_mask(wm));
1686            } else {
1687               dsts[1] = ureg_writemask(dst, wm);
1688            }
1689
1690            struct ureg_src chan_src = ureg_swizzle(src[0],
1691                                                    chan * 2, chan * 2 + 1,
1692                                                    chan * 2, chan * 2 + 1);
1693
1694            struct ntt_insn *insn = ntt_insn(c, TGSI_OPCODE_DFRACEXP,
1695                                             dsts[0], chan_src,
1696                                             ureg_src_undef(),
1697                                             ureg_src_undef(),
1698                                             ureg_src_undef());
1699            insn->dst[1] = dsts[1];
1700         }
1701         break;
1702      }
1703
1704      case nir_op_ldexp:
1705         assert(dst_64); /* 32bit handled in table. */
1706         ntt_DLDEXP(c, dst, src[0],
1707                     ureg_swizzle(src[1],
1708                                  TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1709                                  TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1710         break;
1711
1712      case nir_op_vec4:
1713      case nir_op_vec3:
1714      case nir_op_vec2:
1715         unreachable("covered by nir_lower_vec_to_movs()");
1716
1717      default:
1718         fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
1719         unreachable("Unknown NIR opcode");
1720      }
1721   }
1722
1723   c->precise = false;
1724}
1725
1726static struct ureg_src
1727ntt_ureg_src_indirect(struct ntt_compile *c, struct ureg_src usrc,
1728                      nir_src src, int addr_reg)
1729{
1730   if (nir_src_is_const(src)) {
1731      usrc.Index += ntt_src_as_uint(c, src);
1732      return usrc;
1733   } else {
1734      return ureg_src_indirect(usrc, ntt_reladdr(c, ntt_get_src(c, src), addr_reg));
1735   }
1736}
1737
1738static struct ureg_dst
1739ntt_ureg_dst_indirect(struct ntt_compile *c, struct ureg_dst dst,
1740                      nir_src src)
1741{
1742   if (nir_src_is_const(src)) {
1743      dst.Index += ntt_src_as_uint(c, src);
1744      return dst;
1745   } else {
1746      return ureg_dst_indirect(dst, ntt_reladdr(c, ntt_get_src(c, src), 0));
1747   }
1748}
1749
1750static struct ureg_src
1751ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc,
1752                         nir_src src)
1753{
1754   if (nir_src_is_const(src)) {
1755      return ureg_src_dimension(usrc, ntt_src_as_uint(c, src));
1756   }
1757   else
1758   {
1759      return ureg_src_dimension_indirect(usrc,
1760                                         ntt_reladdr(c, ntt_get_src(c, src), 1),
1761                                         0);
1762   }
1763}
1764
1765static struct ureg_dst
1766ntt_ureg_dst_dimension_indirect(struct ntt_compile *c, struct ureg_dst udst,
1767                                nir_src src)
1768{
1769   if (nir_src_is_const(src)) {
1770      return ureg_dst_dimension(udst, ntt_src_as_uint(c, src));
1771   } else {
1772      return ureg_dst_dimension_indirect(udst,
1773                                         ntt_reladdr(c, ntt_get_src(c, src), 1),
1774                                         0);
1775   }
1776}
1777/* Some load operations in NIR will have a fractional offset that we need to
1778 * swizzle down before storing to the result register.
1779 */
1780static struct ureg_src
1781ntt_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
1782{
1783   return ureg_swizzle(src,
1784                       frac,
1785                       frac + MIN2(num_components - 1, 1),
1786                       frac + MIN2(num_components - 1, 2),
1787                       frac + MIN2(num_components - 1, 3));
1788}
1789
1790
1791static void
1792ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
1793{
1794   int bit_size = nir_dest_bit_size(instr->dest);
1795   assert(bit_size == 32 || instr->num_components <= 2);
1796
1797   struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);
1798
1799   struct ureg_dst addr_temp = ureg_dst_undef();
1800
1801   if (nir_src_is_const(instr->src[0])) {
1802      src = ureg_src_dimension(src, ntt_src_as_uint(c, instr->src[0]));
1803   } else {
1804      /* virglrenderer requires that indirect UBO references have the UBO
1805       * array's base index in the Index field, not added to the indrect
1806       * address.
1807       *
1808       * Many nir intrinsics have a base address const value for the start of
1809       * their array indirection, but load_ubo doesn't.  We fake it by
1810       * subtracting it off here.
1811       */
1812      addr_temp = ntt_temp(c);
1813      ntt_UADD(c, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
1814      src = ureg_src_dimension_indirect(src,
1815                                         ntt_reladdr(c, ureg_src(addr_temp), 1),
1816                                         c->first_ubo);
1817   }
1818
1819   if (instr->intrinsic == nir_intrinsic_load_ubo_vec4) {
1820      /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
1821       * file.
1822       */
1823      src.Index = nir_intrinsic_base(instr);
1824
1825      if (nir_src_is_const(instr->src[1])) {
1826         src.Index += ntt_src_as_uint(c, instr->src[1]);
1827      } else {
1828         src = ureg_src_indirect(src, ntt_reladdr(c, ntt_get_src(c, instr->src[1]), 0));
1829      }
1830
1831      int start_component = nir_intrinsic_component(instr);
1832      if (bit_size == 64)
1833         start_component *= 2;
1834
1835      src = ntt_shift_by_frac(src, start_component,
1836                              instr->num_components * bit_size / 32);
1837
1838      ntt_store(c, &instr->dest, src);
1839   } else {
1840      /* PIPE_CAP_LOAD_CONSTBUF: Not necessarily vec4 aligned, emit a
1841       * TGSI_OPCODE_LOAD instruction from the const file.
1842       */
1843      struct ntt_insn *insn =
1844         ntt_insn(c, TGSI_OPCODE_LOAD,
1845                  ntt_get_dest(c, &instr->dest),
1846                  src, ntt_get_src(c, instr->src[1]),
1847                  ureg_src_undef(), ureg_src_undef());
1848      insn->is_mem = true;
1849      insn->tex_target = 0;
1850      insn->mem_qualifier = 0;
1851      insn->mem_format = 0; /* unused */
1852   }
1853}
1854
1855static unsigned
1856ntt_get_access_qualifier(nir_intrinsic_instr *instr)
1857{
1858   enum gl_access_qualifier access = nir_intrinsic_access(instr);
1859   unsigned qualifier = 0;
1860
1861   if (access & ACCESS_COHERENT)
1862      qualifier |= TGSI_MEMORY_COHERENT;
1863   if (access & ACCESS_VOLATILE)
1864      qualifier |= TGSI_MEMORY_VOLATILE;
1865   if (access & ACCESS_RESTRICT)
1866      qualifier |= TGSI_MEMORY_RESTRICT;
1867
1868   return qualifier;
1869}
1870
1871static void
1872ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
1873             nir_variable_mode mode)
1874{
1875   bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
1876                    instr->intrinsic == nir_intrinsic_store_shared);
1877   bool is_load = (instr->intrinsic == nir_intrinsic_atomic_counter_read ||
1878                    instr->intrinsic == nir_intrinsic_load_ssbo ||
1879                    instr->intrinsic == nir_intrinsic_load_shared);
1880   unsigned opcode;
1881   struct ureg_src src[4];
1882   int num_src = 0;
1883   int next_src;
1884   struct ureg_dst addr_temp = ureg_dst_undef();
1885
1886   struct ureg_src memory;
1887   switch (mode) {
1888   case nir_var_mem_ssbo:
1889      memory = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_BUFFER, 0),
1890                                     instr->src[is_store ? 1 : 0], 2);
1891      next_src = 1;
1892      break;
1893   case nir_var_mem_shared:
1894      memory = ureg_src_register(TGSI_FILE_MEMORY, 0);
1895      next_src = 0;
1896      break;
1897   case nir_var_uniform: { /* HW atomic buffers */
1898      nir_src src = instr->src[0];
1899      uint32_t offset = ntt_extract_const_src_offset(&src) / 4;
1900      memory = ureg_src_register(TGSI_FILE_HW_ATOMIC, offset);
1901      /* ntt_ureg_src_indirect, except dividing by 4 */
1902      if (nir_src_is_const(src)) {
1903         memory.Index += nir_src_as_uint(src) / 4;
1904      } else {
1905         addr_temp = ntt_temp(c);
1906         ntt_USHR(c, addr_temp, ntt_get_src(c, src), ureg_imm1i(c->ureg, 2));
1907         memory = ureg_src_indirect(memory, ntt_reladdr(c, ureg_src(addr_temp), 2));
1908      }
1909      memory = ureg_src_dimension(memory, nir_intrinsic_base(instr));
1910      next_src = 0;
1911      break;
1912   }
1913
1914   default:
1915      unreachable("unknown memory type");
1916   }
1917
1918   if (is_store) {
1919      src[num_src++] = ntt_get_src(c, instr->src[next_src + 1]); /* offset */
1920      src[num_src++] = ntt_get_src(c, instr->src[0]); /* value */
1921   } else {
1922      src[num_src++] = memory;
1923      if (instr->intrinsic != nir_intrinsic_get_ssbo_size) {
1924         src[num_src++] = ntt_get_src(c, instr->src[next_src++]); /* offset */
1925         switch (instr->intrinsic) {
1926         case nir_intrinsic_atomic_counter_inc:
1927            src[num_src++] = ureg_imm1i(c->ureg, 1);
1928            break;
1929         case nir_intrinsic_atomic_counter_post_dec:
1930            src[num_src++] = ureg_imm1i(c->ureg, -1);
1931            break;
1932         default:
1933            if (!is_load)
1934               src[num_src++] = ntt_get_src(c, instr->src[next_src++]); /* value */
1935            break;
1936         }
1937      }
1938   }
1939
1940
1941   switch (instr->intrinsic) {
1942   case nir_intrinsic_atomic_counter_add:
1943   case nir_intrinsic_atomic_counter_inc:
1944   case nir_intrinsic_atomic_counter_post_dec:
1945   case nir_intrinsic_ssbo_atomic_add:
1946   case nir_intrinsic_shared_atomic_add:
1947      opcode = TGSI_OPCODE_ATOMUADD;
1948      break;
1949   case nir_intrinsic_ssbo_atomic_fadd:
1950   case nir_intrinsic_shared_atomic_fadd:
1951      opcode = TGSI_OPCODE_ATOMFADD;
1952      break;
1953   case nir_intrinsic_atomic_counter_min:
1954   case nir_intrinsic_ssbo_atomic_imin:
1955   case nir_intrinsic_shared_atomic_imin:
1956      opcode = TGSI_OPCODE_ATOMIMIN;
1957      break;
1958   case nir_intrinsic_atomic_counter_max:
1959   case nir_intrinsic_ssbo_atomic_imax:
1960   case nir_intrinsic_shared_atomic_imax:
1961      opcode = TGSI_OPCODE_ATOMIMAX;
1962      break;
1963   case nir_intrinsic_ssbo_atomic_umin:
1964   case nir_intrinsic_shared_atomic_umin:
1965      opcode = TGSI_OPCODE_ATOMUMIN;
1966      break;
1967   case nir_intrinsic_ssbo_atomic_umax:
1968   case nir_intrinsic_shared_atomic_umax:
1969      opcode = TGSI_OPCODE_ATOMUMAX;
1970      break;
1971   case nir_intrinsic_atomic_counter_and:
1972   case nir_intrinsic_ssbo_atomic_and:
1973   case nir_intrinsic_shared_atomic_and:
1974      opcode = TGSI_OPCODE_ATOMAND;
1975      break;
1976   case nir_intrinsic_atomic_counter_or:
1977   case nir_intrinsic_ssbo_atomic_or:
1978   case nir_intrinsic_shared_atomic_or:
1979      opcode = TGSI_OPCODE_ATOMOR;
1980      break;
1981   case nir_intrinsic_atomic_counter_xor:
1982   case nir_intrinsic_ssbo_atomic_xor:
1983   case nir_intrinsic_shared_atomic_xor:
1984      opcode = TGSI_OPCODE_ATOMXOR;
1985      break;
1986   case nir_intrinsic_atomic_counter_exchange:
1987   case nir_intrinsic_ssbo_atomic_exchange:
1988   case nir_intrinsic_shared_atomic_exchange:
1989      opcode = TGSI_OPCODE_ATOMXCHG;
1990      break;
1991   case nir_intrinsic_atomic_counter_comp_swap:
1992   case nir_intrinsic_ssbo_atomic_comp_swap:
1993   case nir_intrinsic_shared_atomic_comp_swap:
1994      opcode = TGSI_OPCODE_ATOMCAS;
1995      src[num_src++] = ntt_get_src(c, instr->src[next_src++]);
1996      break;
1997   case nir_intrinsic_atomic_counter_read:
1998   case nir_intrinsic_load_ssbo:
1999   case nir_intrinsic_load_shared:
2000      opcode = TGSI_OPCODE_LOAD;
2001      break;
2002   case nir_intrinsic_store_ssbo:
2003   case nir_intrinsic_store_shared:
2004      opcode = TGSI_OPCODE_STORE;
2005      break;
2006   case nir_intrinsic_get_ssbo_size:
2007      opcode = TGSI_OPCODE_RESQ;
2008      break;
2009   default:
2010      unreachable("unknown memory op");
2011   }
2012
2013   unsigned qualifier = 0;
2014   if (mode == nir_var_mem_ssbo &&
2015       instr->intrinsic != nir_intrinsic_get_ssbo_size) {
2016      qualifier = ntt_get_access_qualifier(instr);
2017   }
2018
2019   struct ureg_dst dst;
2020   if (is_store) {
2021      dst = ureg_dst(memory);
2022
2023      unsigned write_mask = nir_intrinsic_write_mask(instr);
2024      if (nir_src_bit_size(instr->src[0]) == 64)
2025         write_mask = ntt_64bit_write_mask(write_mask);
2026      dst = ureg_writemask(dst, write_mask);
2027   } else {
2028      dst = ntt_get_dest(c, &instr->dest);
2029   }
2030
2031   struct ntt_insn *insn = ntt_insn(c, opcode, dst, src[0], src[1], src[2], src[3]);
2032   insn->tex_target = TGSI_TEXTURE_BUFFER;
2033   insn->mem_qualifier = qualifier;
2034   insn->mem_format = 0; /* unused */
2035   insn->is_mem = true;
2036}
2037
2038static void
2039ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr)
2040{
2041   unsigned op;
2042   struct ureg_src srcs[4];
2043   int num_src = 0;
2044   enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
2045   bool is_array = nir_intrinsic_image_array(instr);
2046
2047   struct ureg_dst temp = ureg_dst_undef();
2048
2049   enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(dim, is_array, false);
2050
2051   struct ureg_src resource =
2052      ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0),
2053                            instr->src[0], 2);
2054
2055   struct ureg_dst dst;
2056   if (instr->intrinsic == nir_intrinsic_image_store) {
2057      dst = ureg_dst(resource);
2058   } else {
2059      srcs[num_src++] = resource;
2060      dst = ntt_get_dest(c, &instr->dest);
2061   }
2062   struct ureg_dst opcode_dst = dst;
2063
2064   if (instr->intrinsic != nir_intrinsic_image_size && instr->intrinsic != nir_intrinsic_image_samples) {
2065      struct ureg_src coord = ntt_get_src(c, instr->src[1]);
2066
2067      if (dim == GLSL_SAMPLER_DIM_MS) {
2068         temp = ntt_temp(c);
2069         ntt_MOV(c, temp, coord);
2070         ntt_MOV(c, ureg_writemask(temp, TGSI_WRITEMASK_W),
2071                  ureg_scalar(ntt_get_src(c, instr->src[2]), TGSI_SWIZZLE_X));
2072         coord = ureg_src(temp);
2073      }
2074      srcs[num_src++] = coord;
2075
2076      if (instr->intrinsic != nir_intrinsic_image_load) {
2077         srcs[num_src++] = ntt_get_src(c, instr->src[3]); /* data */
2078         if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap)
2079            srcs[num_src++] = ntt_get_src(c, instr->src[4]); /* data2 */
2080      }
2081   }
2082
2083   switch (instr->intrinsic) {
2084   case nir_intrinsic_image_load:
2085      op = TGSI_OPCODE_LOAD;
2086      break;
2087   case nir_intrinsic_image_store:
2088      op = TGSI_OPCODE_STORE;
2089      break;
2090   case nir_intrinsic_image_size:
2091      op = TGSI_OPCODE_RESQ;
2092      break;
2093   case nir_intrinsic_image_samples:
2094      op = TGSI_OPCODE_RESQ;
2095      opcode_dst = ureg_writemask(ntt_temp(c), TGSI_WRITEMASK_W);
2096      break;
2097   case nir_intrinsic_image_atomic_add:
2098      op = TGSI_OPCODE_ATOMUADD;
2099      break;
2100   case nir_intrinsic_image_atomic_fadd:
2101      op = TGSI_OPCODE_ATOMFADD;
2102      break;
2103   case nir_intrinsic_image_atomic_imin:
2104      op = TGSI_OPCODE_ATOMIMIN;
2105      break;
2106   case nir_intrinsic_image_atomic_umin:
2107      op = TGSI_OPCODE_ATOMUMIN;
2108      break;
2109   case nir_intrinsic_image_atomic_imax:
2110      op = TGSI_OPCODE_ATOMIMAX;
2111      break;
2112   case nir_intrinsic_image_atomic_umax:
2113      op = TGSI_OPCODE_ATOMUMAX;
2114      break;
2115   case nir_intrinsic_image_atomic_and:
2116      op = TGSI_OPCODE_ATOMAND;
2117      break;
2118   case nir_intrinsic_image_atomic_or:
2119      op = TGSI_OPCODE_ATOMOR;
2120      break;
2121   case nir_intrinsic_image_atomic_xor:
2122      op = TGSI_OPCODE_ATOMXOR;
2123      break;
2124   case nir_intrinsic_image_atomic_exchange:
2125      op = TGSI_OPCODE_ATOMXCHG;
2126      break;
2127   case nir_intrinsic_image_atomic_comp_swap:
2128      op = TGSI_OPCODE_ATOMCAS;
2129      break;
2130   default:
2131      unreachable("bad op");
2132   }
2133
2134   struct ntt_insn *insn = ntt_insn(c, op, opcode_dst, srcs[0], srcs[1], srcs[2], srcs[3]);
2135   insn->tex_target = target;
2136   insn->mem_qualifier = ntt_get_access_qualifier(instr);
2137   insn->mem_format = nir_intrinsic_format(instr);
2138   insn->is_mem = true;
2139
2140   if (instr->intrinsic == nir_intrinsic_image_samples)
2141      ntt_MOV(c, dst, ureg_scalar(ureg_src(opcode_dst), 3));
2142}
2143
2144static void
2145ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
2146{
2147   uint32_t frac = nir_intrinsic_component(instr);
2148   uint32_t num_components = instr->num_components;
2149   unsigned base = nir_intrinsic_base(instr);
2150   struct ureg_src input;
2151   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
2152   bool is_64 = nir_dest_bit_size(instr->dest) == 64;
2153
2154   if (c->s->info.stage == MESA_SHADER_VERTEX) {
2155      input = ureg_DECL_vs_input(c->ureg, base);
2156      for (int i = 1; i < semantics.num_slots; i++)
2157         ureg_DECL_vs_input(c->ureg, base + i);
2158   } else if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
2159      unsigned semantic_name, semantic_index;
2160      ntt_get_gl_varying_semantic(c, semantics.location,
2161                                  &semantic_name, &semantic_index);
2162
2163      /* XXX: ArrayID is used in r600 gs inputs */
2164      uint32_t array_id = 0;
2165
2166      input = ureg_DECL_input_layout(c->ureg,
2167                                     semantic_name,
2168                                     semantic_index,
2169                                     base,
2170                                     ntt_tgsi_usage_mask(frac,
2171                                                         instr->num_components,
2172                                                         is_64),
2173                                     array_id,
2174                                     semantics.num_slots);
2175   } else {
2176      input = c->input_index_map[base];
2177   }
2178
2179   if (is_64)
2180      num_components *= 2;
2181
2182   input = ntt_shift_by_frac(input, frac, num_components);
2183
2184   switch (instr->intrinsic) {
2185   case nir_intrinsic_load_input:
2186      input = ntt_ureg_src_indirect(c, input, instr->src[0], 0);
2187      ntt_store(c, &instr->dest, input);
2188      break;
2189
2190   case nir_intrinsic_load_per_vertex_input:
2191      input = ntt_ureg_src_indirect(c, input, instr->src[1], 0);
2192      input = ntt_ureg_src_dimension_indirect(c, input, instr->src[0]);
2193      ntt_store(c, &instr->dest, input);
2194      break;
2195
2196   case nir_intrinsic_load_interpolated_input: {
2197      input = ntt_ureg_src_indirect(c, input, instr->src[1], 0);
2198
2199      nir_intrinsic_instr *bary_instr =
2200         nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
2201
2202      switch (bary_instr->intrinsic) {
2203      case nir_intrinsic_load_barycentric_pixel:
2204      case nir_intrinsic_load_barycentric_sample:
2205         /* For these, we know that the barycentric load matches the
2206          * interpolation on the input declaration, so we can use it directly.
2207          */
2208         ntt_store(c, &instr->dest, input);
2209         break;
2210
2211      case nir_intrinsic_load_barycentric_centroid:
2212         /* If the input was declared centroid, then there's no need to
2213          * emit the extra TGSI interp instruction, we can just read the
2214          * input.
2215          */
2216         if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
2217            ntt_store(c, &instr->dest, input);
2218         } else {
2219            ntt_INTERP_CENTROID(c, ntt_get_dest(c, &instr->dest), input);
2220         }
2221         break;
2222
2223      case nir_intrinsic_load_barycentric_at_sample:
2224         /* We stored the sample in the fake "bary" dest. */
2225         ntt_INTERP_SAMPLE(c, ntt_get_dest(c, &instr->dest), input,
2226                            ntt_get_src(c, instr->src[0]));
2227         break;
2228
2229      case nir_intrinsic_load_barycentric_at_offset:
2230         /* We stored the offset in the fake "bary" dest. */
2231         ntt_INTERP_OFFSET(c, ntt_get_dest(c, &instr->dest), input,
2232                            ntt_get_src(c, instr->src[0]));
2233         break;
2234
2235      default:
2236         unreachable("bad barycentric interp intrinsic\n");
2237      }
2238      break;
2239   }
2240
2241   default:
2242      unreachable("bad load input intrinsic\n");
2243   }
2244}
2245
2246static void
2247ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
2248{
2249   struct ureg_src src = ntt_get_src(c, instr->src[0]);
2250
2251   if (src.File == TGSI_FILE_OUTPUT) {
2252      /* If our src is the output file, that's an indication that we were able
2253       * to emit the output stores in the generating instructions and we have
2254       * nothing to do here.
2255       */
2256      return;
2257   }
2258
2259   uint32_t frac;
2260   struct ureg_dst out = ntt_output_decl(c, instr, &frac);
2261
2262   if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
2263      out = ntt_ureg_dst_indirect(c, out, instr->src[2]);
2264      out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[1]);
2265   } else {
2266      out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
2267   }
2268
2269   uint8_t swizzle[4] = { 0, 0, 0, 0 };
2270   for (int i = frac; i <= 4; i++) {
2271      if (out.WriteMask & (1 << i))
2272         swizzle[i] = i - frac;
2273   }
2274
2275   src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2276
2277   ntt_MOV(c, out, src);
2278}
2279
2280static void
2281ntt_emit_load_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
2282{
2283   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
2284
2285   /* ntt_try_store_in_tgsi_output() optimization is not valid if normal
2286    * load_output is present.
2287    */
2288   assert(c->s->info.stage != MESA_SHADER_VERTEX &&
2289          (c->s->info.stage != MESA_SHADER_FRAGMENT || semantics.fb_fetch_output));
2290
2291   uint32_t frac;
2292   struct ureg_dst out = ntt_output_decl(c, instr, &frac);
2293
2294   if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) {
2295      out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
2296      out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[0]);
2297   } else {
2298      out = ntt_ureg_dst_indirect(c, out, instr->src[0]);
2299   }
2300
2301   struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
2302   struct ureg_src out_src = ureg_src(out);
2303
2304   /* Don't swizzling unavailable channels of the output in the writemasked-out
2305    * components. Avoids compile failures in virglrenderer with
2306    * TESS_LEVEL_INNER.
2307    */
2308   int fill_channel = ffs(dst.WriteMask) - 1;
2309   uint8_t swizzles[4] = { 0, 1, 2, 3 };
2310   for (int i = 0; i < 4; i++)
2311      if (!(dst.WriteMask & (1 << i)))
2312         swizzles[i] = fill_channel;
2313   out_src = ureg_swizzle(out_src, swizzles[0], swizzles[1], swizzles[2], swizzles[3]);
2314
2315   if (semantics.fb_fetch_output)
2316      ntt_FBFETCH(c, dst, out_src);
2317   else
2318      ntt_MOV(c, dst, out_src);
2319}
2320
2321static void
2322ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
2323{
2324   gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
2325   enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
2326   struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);
2327
2328   /* virglrenderer doesn't like references to channels of the sysval that
2329    * aren't defined, even if they aren't really read.  (GLSL compile fails on
2330    * gl_NumWorkGroups.w, for example).
2331    */
2332   uint32_t write_mask = BITSET_MASK(nir_dest_num_components(instr->dest));
2333   sv = ntt_swizzle_for_write_mask(sv, write_mask);
2334
2335   /* TGSI and NIR define these intrinsics as always loading ints, but they can
2336    * still appear on hardware with non-native-integers fragment shaders using
2337    * the draw path (i915g).  In that case, having called nir_lower_int_to_float
2338    * means that we actually want floats instead.
2339    */
2340   if (!c->native_integers) {
2341      switch (instr->intrinsic) {
2342      case nir_intrinsic_load_vertex_id:
2343      case nir_intrinsic_load_instance_id:
2344         ntt_U2F(c, ntt_get_dest(c, &instr->dest), sv);
2345         return;
2346
2347      default:
2348         break;
2349      }
2350   }
2351
2352   ntt_store(c, &instr->dest, sv);
2353}
2354
2355static void
2356ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
2357{
2358   switch (instr->intrinsic) {
2359   case nir_intrinsic_load_ubo:
2360   case nir_intrinsic_load_ubo_vec4:
2361      ntt_emit_load_ubo(c, instr);
2362      break;
2363
2364      /* Vertex */
2365   case nir_intrinsic_load_vertex_id:
2366   case nir_intrinsic_load_vertex_id_zero_base:
2367   case nir_intrinsic_load_base_vertex:
2368   case nir_intrinsic_load_base_instance:
2369   case nir_intrinsic_load_instance_id:
2370   case nir_intrinsic_load_draw_id:
2371   case nir_intrinsic_load_invocation_id:
2372   case nir_intrinsic_load_frag_coord:
2373   case nir_intrinsic_load_point_coord:
2374   case nir_intrinsic_load_front_face:
2375   case nir_intrinsic_load_sample_id:
2376   case nir_intrinsic_load_sample_pos:
2377   case nir_intrinsic_load_sample_mask_in:
2378   case nir_intrinsic_load_helper_invocation:
2379   case nir_intrinsic_load_tess_coord:
2380   case nir_intrinsic_load_patch_vertices_in:
2381   case nir_intrinsic_load_primitive_id:
2382   case nir_intrinsic_load_tess_level_outer:
2383   case nir_intrinsic_load_tess_level_inner:
2384   case nir_intrinsic_load_local_invocation_id:
2385   case nir_intrinsic_load_workgroup_id:
2386   case nir_intrinsic_load_num_workgroups:
2387   case nir_intrinsic_load_workgroup_size:
2388   case nir_intrinsic_load_subgroup_size:
2389   case nir_intrinsic_load_subgroup_invocation:
2390   case nir_intrinsic_load_subgroup_eq_mask:
2391   case nir_intrinsic_load_subgroup_ge_mask:
2392   case nir_intrinsic_load_subgroup_gt_mask:
2393   case nir_intrinsic_load_subgroup_lt_mask:
2394      ntt_emit_load_sysval(c, instr);
2395      break;
2396
2397   case nir_intrinsic_load_input:
2398   case nir_intrinsic_load_per_vertex_input:
2399   case nir_intrinsic_load_interpolated_input:
2400      ntt_emit_load_input(c, instr);
2401      break;
2402
2403   case nir_intrinsic_store_output:
2404   case nir_intrinsic_store_per_vertex_output:
2405      ntt_emit_store_output(c, instr);
2406      break;
2407
2408   case nir_intrinsic_load_output:
2409   case nir_intrinsic_load_per_vertex_output:
2410      ntt_emit_load_output(c, instr);
2411      break;
2412
2413   case nir_intrinsic_discard:
2414      ntt_KILL(c);
2415      break;
2416
2417   case nir_intrinsic_discard_if: {
2418      struct ureg_src cond = ureg_scalar(ntt_get_src(c, instr->src[0]), 0);
2419
2420      if (c->native_integers) {
2421         struct ureg_dst temp = ureg_writemask(ntt_temp(c), 1);
2422         ntt_AND(c, temp, cond, ureg_imm1f(c->ureg, 1.0));
2423         ntt_KILL_IF(c, ureg_scalar(ureg_negate(ureg_src(temp)), 0));
2424      } else {
2425         /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
2426         ntt_KILL_IF(c, ureg_negate(cond));
2427      }
2428      break;
2429   }
2430
2431   case nir_intrinsic_load_ssbo:
2432   case nir_intrinsic_store_ssbo:
2433   case nir_intrinsic_ssbo_atomic_add:
2434   case nir_intrinsic_ssbo_atomic_fadd:
2435   case nir_intrinsic_ssbo_atomic_imin:
2436   case nir_intrinsic_ssbo_atomic_imax:
2437   case nir_intrinsic_ssbo_atomic_umin:
2438   case nir_intrinsic_ssbo_atomic_umax:
2439   case nir_intrinsic_ssbo_atomic_and:
2440   case nir_intrinsic_ssbo_atomic_or:
2441   case nir_intrinsic_ssbo_atomic_xor:
2442   case nir_intrinsic_ssbo_atomic_exchange:
2443   case nir_intrinsic_ssbo_atomic_comp_swap:
2444   case nir_intrinsic_get_ssbo_size:
2445      ntt_emit_mem(c, instr, nir_var_mem_ssbo);
2446      break;
2447
2448   case nir_intrinsic_load_shared:
2449   case nir_intrinsic_store_shared:
2450   case nir_intrinsic_shared_atomic_add:
2451   case nir_intrinsic_shared_atomic_fadd:
2452   case nir_intrinsic_shared_atomic_imin:
2453   case nir_intrinsic_shared_atomic_imax:
2454   case nir_intrinsic_shared_atomic_umin:
2455   case nir_intrinsic_shared_atomic_umax:
2456   case nir_intrinsic_shared_atomic_and:
2457   case nir_intrinsic_shared_atomic_or:
2458   case nir_intrinsic_shared_atomic_xor:
2459   case nir_intrinsic_shared_atomic_exchange:
2460   case nir_intrinsic_shared_atomic_comp_swap:
2461      ntt_emit_mem(c, instr, nir_var_mem_shared);
2462      break;
2463
2464   case nir_intrinsic_atomic_counter_read:
2465   case nir_intrinsic_atomic_counter_add:
2466   case nir_intrinsic_atomic_counter_inc:
2467   case nir_intrinsic_atomic_counter_post_dec:
2468   case nir_intrinsic_atomic_counter_min:
2469   case nir_intrinsic_atomic_counter_max:
2470   case nir_intrinsic_atomic_counter_and:
2471   case nir_intrinsic_atomic_counter_or:
2472   case nir_intrinsic_atomic_counter_xor:
2473   case nir_intrinsic_atomic_counter_exchange:
2474   case nir_intrinsic_atomic_counter_comp_swap:
2475      ntt_emit_mem(c, instr, nir_var_uniform);
2476      break;
2477   case nir_intrinsic_atomic_counter_pre_dec:
2478      unreachable("Should be lowered by ntt_lower_atomic_pre_dec()");
2479      break;
2480
2481   case nir_intrinsic_image_load:
2482   case nir_intrinsic_image_store:
2483   case nir_intrinsic_image_size:
2484   case nir_intrinsic_image_samples:
2485   case nir_intrinsic_image_atomic_add:
2486   case nir_intrinsic_image_atomic_fadd:
2487   case nir_intrinsic_image_atomic_imin:
2488   case nir_intrinsic_image_atomic_umin:
2489   case nir_intrinsic_image_atomic_imax:
2490   case nir_intrinsic_image_atomic_umax:
2491   case nir_intrinsic_image_atomic_and:
2492   case nir_intrinsic_image_atomic_or:
2493   case nir_intrinsic_image_atomic_xor:
2494   case nir_intrinsic_image_atomic_exchange:
2495   case nir_intrinsic_image_atomic_comp_swap:
2496      ntt_emit_image_load_store(c, instr);
2497      break;
2498
2499   case nir_intrinsic_control_barrier:
2500   case nir_intrinsic_memory_barrier_tcs_patch:
2501      ntt_BARRIER(c);
2502      break;
2503
2504   case nir_intrinsic_memory_barrier:
2505      ntt_MEMBAR(c, ureg_imm1u(c->ureg,
2506                               TGSI_MEMBAR_SHADER_BUFFER |
2507                               TGSI_MEMBAR_ATOMIC_BUFFER |
2508                               TGSI_MEMBAR_SHADER_IMAGE |
2509                               TGSI_MEMBAR_SHARED));
2510      break;
2511
2512   case nir_intrinsic_memory_barrier_atomic_counter:
2513      ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_ATOMIC_BUFFER));
2514      break;
2515
2516   case nir_intrinsic_memory_barrier_buffer:
2517      ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_BUFFER));
2518      break;
2519
2520   case nir_intrinsic_memory_barrier_image:
2521      ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_IMAGE));
2522      break;
2523
2524   case nir_intrinsic_memory_barrier_shared:
2525      ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHARED));
2526      break;
2527
2528   case nir_intrinsic_group_memory_barrier:
2529      ntt_MEMBAR(c, ureg_imm1u(c->ureg,
2530                               TGSI_MEMBAR_SHADER_BUFFER |
2531                               TGSI_MEMBAR_ATOMIC_BUFFER |
2532                               TGSI_MEMBAR_SHADER_IMAGE |
2533                               TGSI_MEMBAR_SHARED |
2534                               TGSI_MEMBAR_THREAD_GROUP));
2535      break;
2536
2537   case nir_intrinsic_end_primitive:
2538      ntt_ENDPRIM(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
2539      break;
2540
2541   case nir_intrinsic_emit_vertex:
2542      ntt_EMIT(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
2543      break;
2544
2545      /* In TGSI we don't actually generate the barycentric coords, and emit
2546       * interp intrinsics later.  However, we do need to store the
2547       * load_barycentric_at_* argument so that we can use it at that point.
2548       */
2549   case nir_intrinsic_load_barycentric_pixel:
2550   case nir_intrinsic_load_barycentric_centroid:
2551   case nir_intrinsic_load_barycentric_sample:
2552      break;
2553   case nir_intrinsic_load_barycentric_at_sample:
2554   case nir_intrinsic_load_barycentric_at_offset:
2555      ntt_store(c, &instr->dest, ntt_get_src(c, instr->src[0]));
2556      break;
2557
2558   case nir_intrinsic_shader_clock:
2559      ntt_CLOCK(c, ntt_get_dest(c, &instr->dest));
2560      break;
2561
2562   default:
2563      fprintf(stderr, "Unknown intrinsic: ");
2564      nir_print_instr(&instr->instr, stderr);
2565      fprintf(stderr, "\n");
2566      break;
2567   }
2568}
2569
2570struct ntt_tex_operand_state {
2571   struct ureg_src srcs[4];
2572   unsigned i;
2573};
2574
2575static void
2576ntt_push_tex_arg(struct ntt_compile *c,
2577                 nir_tex_instr *instr,
2578                 nir_tex_src_type tex_src_type,
2579                 struct ntt_tex_operand_state *s)
2580{
2581   int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
2582   if (tex_src < 0)
2583      return;
2584
2585   nir_src *src = &instr->src[tex_src].src;
2586
2587   /* virglrenderer workaround that's hard to do in tgsi_translate: Make sure
2588    * that TG4's immediate offset arg is float-typed.
2589    */
2590   if (instr->op == nir_texop_tg4 && tex_src_type == nir_tex_src_backend2 &&
2591       nir_src_is_const(*src)) {
2592      nir_const_value *consts = nir_src_as_const_value(*src);
2593      s->srcs[s->i++] = ureg_imm4f(c->ureg,
2594                                   consts[0].f32,
2595                                   consts[1].f32,
2596                                   consts[2].f32,
2597                                   consts[3].f32);
2598      return;
2599   }
2600
2601   s->srcs[s->i++] = ntt_get_src(c, *src);
2602}
2603
2604static void
2605ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
2606{
2607   struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
2608   enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow);
2609   unsigned tex_opcode;
2610
2611   struct ureg_src sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
2612   int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
2613   if (sampler_src >= 0) {
2614      struct ureg_src reladdr = ntt_get_src(c, instr->src[sampler_src].src);
2615      sampler = ureg_src_indirect(sampler, ntt_reladdr(c, reladdr, 2));
2616   }
2617
2618   switch (instr->op) {
2619   case nir_texop_tex:
2620      if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) >
2621         MAX2(instr->coord_components, 2) + instr->is_shadow)
2622         tex_opcode = TGSI_OPCODE_TXP;
2623      else
2624         tex_opcode = TGSI_OPCODE_TEX;
2625      break;
2626   case nir_texop_txf:
2627   case nir_texop_txf_ms:
2628      tex_opcode = TGSI_OPCODE_TXF;
2629
2630      if (c->has_txf_lz) {
2631         int lod_src = nir_tex_instr_src_index(instr, nir_tex_src_lod);
2632         if (lod_src >= 0 &&
2633             nir_src_is_const(instr->src[lod_src].src) &&
2634             ntt_src_as_uint(c, instr->src[lod_src].src) == 0) {
2635            tex_opcode = TGSI_OPCODE_TXF_LZ;
2636         }
2637      }
2638      break;
2639   case nir_texop_txl:
2640      tex_opcode = TGSI_OPCODE_TXL;
2641      break;
2642   case nir_texop_txb:
2643      tex_opcode = TGSI_OPCODE_TXB;
2644      break;
2645   case nir_texop_txd:
2646      tex_opcode = TGSI_OPCODE_TXD;
2647      break;
2648   case nir_texop_txs:
2649      tex_opcode = TGSI_OPCODE_TXQ;
2650      break;
2651   case nir_texop_tg4:
2652      tex_opcode = TGSI_OPCODE_TG4;
2653      break;
2654   case nir_texop_query_levels:
2655      tex_opcode = TGSI_OPCODE_TXQ;
2656      break;
2657   case nir_texop_lod:
2658      tex_opcode = TGSI_OPCODE_LODQ;
2659      break;
2660   case nir_texop_texture_samples:
2661      tex_opcode = TGSI_OPCODE_TXQS;
2662      break;
2663   default:
2664      unreachable("unsupported tex op");
2665   }
2666
2667   struct ntt_tex_operand_state s = { .i = 0 };
2668   ntt_push_tex_arg(c, instr, nir_tex_src_backend1, &s);
2669   ntt_push_tex_arg(c, instr, nir_tex_src_backend2, &s);
2670
2671   /* non-coord arg for TXQ */
2672   if (tex_opcode == TGSI_OPCODE_TXQ) {
2673      ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s);
2674      /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's
2675       * scalar
2676       */
2677      s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0);
2678   }
2679
2680   if (s.i > 1) {
2681      if (tex_opcode == TGSI_OPCODE_TEX)
2682         tex_opcode = TGSI_OPCODE_TEX2;
2683      if (tex_opcode == TGSI_OPCODE_TXB)
2684         tex_opcode = TGSI_OPCODE_TXB2;
2685      if (tex_opcode == TGSI_OPCODE_TXL)
2686         tex_opcode = TGSI_OPCODE_TXL2;
2687   }
2688
2689   if (instr->op == nir_texop_txd) {
2690      /* Derivs appear in their own src args */
2691      int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
2692      int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
2693      s.srcs[s.i++] = ntt_get_src(c, instr->src[ddx].src);
2694      s.srcs[s.i++] = ntt_get_src(c, instr->src[ddy].src);
2695   }
2696
2697   if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
2698      if (c->screen->get_param(c->screen,
2699                               PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
2700         sampler = ureg_scalar(sampler, instr->component);
2701         s.srcs[s.i++] = ureg_src_undef();
2702      } else {
2703         s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
2704      }
2705   }
2706
2707   s.srcs[s.i++] = sampler;
2708
2709   enum tgsi_return_type tex_type;
2710   switch (instr->dest_type) {
2711   case nir_type_float32:
2712      tex_type = TGSI_RETURN_TYPE_FLOAT;
2713      break;
2714   case nir_type_int32:
2715      tex_type = TGSI_RETURN_TYPE_SINT;
2716      break;
2717   case nir_type_uint32:
2718      tex_type = TGSI_RETURN_TYPE_UINT;
2719      break;
2720   default:
2721      unreachable("unknown texture type");
2722   }
2723
2724   struct tgsi_texture_offset tex_offset = {
2725      .File = TGSI_FILE_NULL
2726   };
2727   int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
2728   if (tex_offset_src >= 0) {
2729      struct ureg_src offset = ntt_get_src(c, instr->src[tex_offset_src].src);
2730
2731      tex_offset.File = offset.File;
2732      tex_offset.Index = offset.Index;
2733      tex_offset.SwizzleX = offset.SwizzleX;
2734      tex_offset.SwizzleY = offset.SwizzleY;
2735      tex_offset.SwizzleZ = offset.SwizzleZ;
2736      tex_offset.Padding = 0;
2737   }
2738
2739   struct ureg_dst tex_dst;
2740   if (instr->op == nir_texop_query_levels)
2741      tex_dst = ureg_writemask(ntt_temp(c), TGSI_WRITEMASK_W);
2742   else
2743      tex_dst = dst;
2744
2745   while (s.i < 4)
2746      s.srcs[s.i++] = ureg_src_undef();
2747
2748   struct ntt_insn *insn = ntt_insn(c, tex_opcode, tex_dst, s.srcs[0], s.srcs[1], s.srcs[2], s.srcs[3]);
2749   insn->tex_target = target;
2750   insn->tex_return_type = tex_type;
2751   insn->tex_offset = tex_offset;
2752   insn->is_tex = true;
2753
2754   if (instr->op == nir_texop_query_levels)
2755      ntt_MOV(c, dst, ureg_scalar(ureg_src(tex_dst), 3));
2756}
2757
2758static void
2759ntt_emit_jump(struct ntt_compile *c, nir_jump_instr *jump)
2760{
2761   switch (jump->type) {
2762   case nir_jump_break:
2763      ntt_BRK(c);
2764      break;
2765
2766   case nir_jump_continue:
2767      ntt_CONT(c);
2768      break;
2769
2770   default:
2771      fprintf(stderr, "Unknown jump instruction: ");
2772      nir_print_instr(&jump->instr, stderr);
2773      fprintf(stderr, "\n");
2774      abort();
2775   }
2776}
2777
2778static void
2779ntt_emit_ssa_undef(struct ntt_compile *c, nir_ssa_undef_instr *instr)
2780{
2781   /* Nothing to do but make sure that we have some storage to deref. */
2782   (void)ntt_get_ssa_def_decl(c, &instr->def);
2783}
2784
2785static void
2786ntt_emit_instr(struct ntt_compile *c, nir_instr *instr)
2787{
2788   switch (instr->type) {
2789   case nir_instr_type_deref:
2790      /* ignored, will be walked by nir_intrinsic_image_*_deref. */
2791      break;
2792
2793   case nir_instr_type_alu:
2794      ntt_emit_alu(c, nir_instr_as_alu(instr));
2795      break;
2796
2797   case nir_instr_type_intrinsic:
2798      ntt_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
2799      break;
2800
2801   case nir_instr_type_load_const:
2802      /* Nothing to do here, as load consts are done directly from
2803       * ntt_get_src() (since many constant NIR srcs will often get folded
2804       * directly into a register file index instead of as a TGSI src).
2805       */
2806      break;
2807
2808   case nir_instr_type_tex:
2809      ntt_emit_texture(c, nir_instr_as_tex(instr));
2810      break;
2811
2812   case nir_instr_type_jump:
2813      ntt_emit_jump(c, nir_instr_as_jump(instr));
2814      break;
2815
2816   case nir_instr_type_ssa_undef:
2817      ntt_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
2818      break;
2819
2820   default:
2821      fprintf(stderr, "Unknown NIR instr type: ");
2822      nir_print_instr(instr, stderr);
2823      fprintf(stderr, "\n");
2824      abort();
2825   }
2826}
2827
2828static void
2829ntt_emit_if(struct ntt_compile *c, nir_if *if_stmt)
2830{
2831   if (c->native_integers)
2832      ntt_UIF(c, c->if_cond);
2833   else
2834      ntt_IF(c, c->if_cond);
2835
2836   ntt_emit_cf_list(c, &if_stmt->then_list);
2837
2838   if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
2839      ntt_ELSE(c);
2840      ntt_emit_cf_list(c, &if_stmt->else_list);
2841   }
2842
2843   ntt_ENDIF(c);
2844}
2845
2846static void
2847ntt_emit_loop(struct ntt_compile *c, nir_loop *loop)
2848{
2849   ntt_BGNLOOP(c);
2850   ntt_emit_cf_list(c, &loop->body);
2851   ntt_ENDLOOP(c);
2852}
2853
2854static void
2855ntt_emit_block(struct ntt_compile *c, nir_block *block)
2856{
2857   struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
2858   c->cur_block = ntt_block;
2859
2860   nir_foreach_instr(instr, block) {
2861      ntt_emit_instr(c, instr);
2862
2863      /* Sanity check that we didn't accidentally ureg_OPCODE() instead of ntt_OPCODE(). */
2864      if (ureg_get_instruction_number(c->ureg) != 0) {
2865         fprintf(stderr, "Emitted ureg insn during: ");
2866         nir_print_instr(instr, stderr);
2867         fprintf(stderr, "\n");
2868         unreachable("emitted ureg insn");
2869      }
2870   }
2871
2872   /* Set up the if condition for ntt_emit_if(), which we have to do before
2873    * freeing up the temps (the "if" is treated as inside the block for liveness
2874    * purposes, despite not being an instruction)
2875    *
2876    * Note that, while IF and UIF are supposed to look at only .x, virglrenderer
2877    * looks at all of .xyzw.  No harm in working around the bug.
2878    */
2879   nir_if *nif = nir_block_get_following_if(block);
2880   if (nif)
2881      c->if_cond = ureg_scalar(ntt_get_src(c, nif->condition), TGSI_SWIZZLE_X);
2882}
2883
2884static void
2885ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list)
2886{
2887   foreach_list_typed(nir_cf_node, node, node, list) {
2888      switch (node->type) {
2889      case nir_cf_node_block:
2890         ntt_emit_block(c, nir_cf_node_as_block(node));
2891         break;
2892
2893      case nir_cf_node_if:
2894         ntt_emit_if(c, nir_cf_node_as_if(node));
2895         break;
2896
2897      case nir_cf_node_loop:
2898         ntt_emit_loop(c, nir_cf_node_as_loop(node));
2899         break;
2900
2901      default:
2902         unreachable("unknown CF type");
2903      }
2904   }
2905}
2906
2907static void
2908ntt_emit_block_ureg(struct ntt_compile *c, struct nir_block *block)
2909{
2910   struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
2911
2912   /* Emit the ntt insns to tgsi_ureg. */
2913   util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
2914      const struct tgsi_opcode_info *opcode_info =
2915         tgsi_get_opcode_info(insn->opcode);
2916
2917      switch (insn->opcode) {
2918      case TGSI_OPCODE_UIF:
2919         ureg_UIF(c->ureg, insn->src[0], &c->cf_label);
2920         break;
2921
2922      case TGSI_OPCODE_IF:
2923         ureg_IF(c->ureg, insn->src[0], &c->cf_label);
2924         break;
2925
2926      case TGSI_OPCODE_ELSE:
2927         ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
2928         ureg_ELSE(c->ureg, &c->cf_label);
2929         c->current_if_else = c->cf_label;
2930         break;
2931
2932      case TGSI_OPCODE_ENDIF:
2933         ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
2934         ureg_ENDIF(c->ureg);
2935         break;
2936
2937      case TGSI_OPCODE_BGNLOOP:
2938         /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
2939          * does reference BGNLOOP's.  Follow the former behavior unless something comes up
2940          * with a need.
2941          */
2942         ureg_BGNLOOP(c->ureg, &c->cf_label);
2943         break;
2944
2945      case TGSI_OPCODE_ENDLOOP:
2946         ureg_ENDLOOP(c->ureg, &c->cf_label);
2947         break;
2948
2949      default:
2950         if (insn->is_tex) {
2951            ureg_tex_insn(c->ureg, insn->opcode,
2952                          insn->dst, opcode_info->num_dst,
2953                          insn->tex_target, insn->tex_return_type,
2954                          &insn->tex_offset,
2955                          insn->tex_offset.File != TGSI_FILE_NULL ? 1 : 0,
2956                          insn->src, opcode_info->num_src);
2957         } else if (insn->is_mem) {
2958            ureg_memory_insn(c->ureg, insn->opcode,
2959                             insn->dst, opcode_info->num_dst,
2960                             insn->src, opcode_info->num_src,
2961                             insn->mem_qualifier,
2962                             insn->tex_target,
2963                             insn->mem_format);
2964         } else {
2965            ureg_insn(c->ureg, insn->opcode,
2966                     insn->dst, opcode_info->num_dst,
2967                     insn->src, opcode_info->num_src,
2968                     insn->precise);
2969         }
2970      }
2971   }
2972}
2973
2974static void
2975ntt_emit_if_ureg(struct ntt_compile *c, nir_if *if_stmt)
2976{
2977   /* Note: the last block emitted our IF opcode. */
2978
2979   int if_stack = c->current_if_else;
2980   c->current_if_else = c->cf_label;
2981
2982   /* Either the then or else block includes the ENDIF, which will fix up the
2983    * IF(/ELSE)'s label for jumping
2984    */
2985   ntt_emit_cf_list_ureg(c, &if_stmt->then_list);
2986   ntt_emit_cf_list_ureg(c, &if_stmt->else_list);
2987
2988   c->current_if_else = if_stack;
2989}
2990
2991static void
2992ntt_emit_cf_list_ureg(struct ntt_compile *c, struct exec_list *list)
2993{
2994   foreach_list_typed(nir_cf_node, node, node, list) {
2995      switch (node->type) {
2996      case nir_cf_node_block:
2997         ntt_emit_block_ureg(c, nir_cf_node_as_block(node));
2998         break;
2999
3000      case nir_cf_node_if:
3001         ntt_emit_if_ureg(c, nir_cf_node_as_if(node));
3002         break;
3003
3004      case nir_cf_node_loop:
3005         /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
3006          * does reference BGNLOOP's.  Follow the former behavior unless something comes up
3007          * with a need.
3008          */
3009         ntt_emit_cf_list_ureg(c, &nir_cf_node_as_loop(node)->body);
3010         break;
3011
3012      default:
3013         unreachable("unknown CF type");
3014      }
3015   }
3016}
3017
3018static void
3019ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl)
3020{
3021   c->impl = impl;
3022
3023   c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc);
3024   c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->reg_alloc);
3025
3026   /* Set up the struct ntt_blocks to put insns in */
3027   c->blocks = _mesa_pointer_hash_table_create(c);
3028   nir_foreach_block(block, impl) {
3029      struct ntt_block *ntt_block = rzalloc(c->blocks, struct ntt_block);
3030      util_dynarray_init(&ntt_block->insns, ntt_block);
3031      _mesa_hash_table_insert(c->blocks, block, ntt_block);
3032   }
3033
3034
3035   ntt_setup_registers(c, &impl->registers);
3036
3037   c->cur_block = ntt_block_from_nir(c, nir_start_block(impl));
3038   ntt_setup_inputs(c);
3039   ntt_setup_outputs(c);
3040   ntt_setup_uniforms(c);
3041
3042   /* Emit the ntt insns */
3043   ntt_emit_cf_list(c, &impl->body);
3044
3045   /* Don't do optimized RA if the driver requests it, unless the number of
3046    * temps is too large to be covered by the 16 bit signed int that TGSI
3047    * allocates for the register index */
3048   if (!c->options->unoptimized_ra || c->num_temps > 0x7fff)
3049      ntt_allocate_regs(c, impl);
3050   else
3051      ntt_allocate_regs_unoptimized(c, impl);
3052
3053   /* Turn the ntt insns into actual TGSI tokens */
3054   ntt_emit_cf_list_ureg(c, &impl->body);
3055
3056   ralloc_free(c->liveness);
3057   c->liveness = NULL;
3058
3059}
3060
3061static int
3062type_size(const struct glsl_type *type, bool bindless)
3063{
3064   return glsl_count_attribute_slots(type, false);
3065}
3066
3067/* Allow vectorizing of ALU instructions, but avoid vectorizing past what we
3068 * can handle for 64-bit values in TGSI.
3069 */
3070static uint8_t
3071ntt_should_vectorize_instr(const nir_instr *instr, const void *data)
3072{
3073   if (instr->type != nir_instr_type_alu)
3074      return 0;
3075
3076   nir_alu_instr *alu = nir_instr_as_alu(instr);
3077
3078   switch (alu->op) {
3079   case nir_op_ibitfield_extract:
3080   case nir_op_ubitfield_extract:
3081   case nir_op_bitfield_insert:
3082      /* virglrenderer only looks at the .x channel of the offset/bits operands
3083       * when translating to GLSL.  tgsi.rst doesn't seem to require scalar
3084       * offset/bits operands.
3085       *
3086       * https://gitlab.freedesktop.org/virgl/virglrenderer/-/issues/195
3087       */
3088      return 1;
3089
3090   default:
3091      break;
3092   }
3093
3094   int src_bit_size = nir_src_bit_size(alu->src[0].src);
3095   int dst_bit_size = nir_dest_bit_size(alu->dest.dest);
3096
3097   if (src_bit_size == 64 || dst_bit_size == 64) {
3098      /* Avoid vectorizing 64-bit instructions at all.  Despite tgsi.rst
3099       * claiming support, virglrenderer generates bad shaders on the host when
3100       * presented with them.  Maybe we can make virgl avoid tickling the
3101       * virglrenderer bugs, but given that glsl-to-TGSI didn't generate vector
3102       * 64-bit instrs in the first place, I don't see much reason to care about
3103       * this.
3104       */
3105      return 1;
3106   }
3107
3108   return 4;
3109}
3110
3111static bool
3112ntt_should_vectorize_io(unsigned align, unsigned bit_size,
3113                        unsigned num_components, unsigned high_offset,
3114                        nir_intrinsic_instr *low, nir_intrinsic_instr *high,
3115                        void *data)
3116{
3117   if (bit_size != 32)
3118      return false;
3119
3120   /* Our offset alignment should aways be at least 4 bytes */
3121   if (align < 4)
3122      return false;
3123
3124   /* No wrapping off the end of a TGSI reg.  We could do a bit better by
3125    * looking at low's actual offset.  XXX: With LOAD_CONSTBUF maybe we don't
3126    * need this restriction.
3127    */
3128   unsigned worst_start_component = align == 4 ? 3 : align / 4;
3129   if (worst_start_component + num_components > 4)
3130      return false;
3131
3132   return true;
3133}
3134
3135static nir_variable_mode
3136ntt_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
3137{
3138   unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
3139   unsigned indirect_mask = 0;
3140
3141   if (!screen->get_shader_param(screen, pipe_stage,
3142                                 PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
3143      indirect_mask |= nir_var_shader_in;
3144   }
3145
3146   if (!screen->get_shader_param(screen, pipe_stage,
3147                                 PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
3148      indirect_mask |= nir_var_shader_out;
3149   }
3150
3151   if (!screen->get_shader_param(screen, pipe_stage,
3152                                 PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
3153      indirect_mask |= nir_var_function_temp;
3154   }
3155
3156   return indirect_mask;
3157}
3158
3159static void
3160ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
3161{
3162   bool progress;
3163   unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
3164   unsigned control_flow_depth =
3165      screen->get_shader_param(screen, pipe_stage,
3166                               PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
3167   do {
3168      progress = false;
3169
3170      NIR_PASS_V(s, nir_lower_vars_to_ssa);
3171      NIR_PASS_V(s, nir_split_64bit_vec3_and_vec4);
3172
3173      NIR_PASS(progress, s, nir_copy_prop);
3174      NIR_PASS(progress, s, nir_opt_algebraic);
3175      NIR_PASS(progress, s, nir_opt_constant_folding);
3176      NIR_PASS(progress, s, nir_opt_remove_phis);
3177      NIR_PASS(progress, s, nir_opt_conditional_discard);
3178      NIR_PASS(progress, s, nir_opt_dce);
3179      NIR_PASS(progress, s, nir_opt_dead_cf);
3180      NIR_PASS(progress, s, nir_opt_cse);
3181      NIR_PASS(progress, s, nir_opt_find_array_copies);
3182      NIR_PASS(progress, s, nir_opt_copy_prop_vars);
3183      NIR_PASS(progress, s, nir_opt_dead_write_vars);
3184
3185      NIR_PASS(progress, s, nir_opt_if, nir_opt_if_aggressive_last_continue | nir_opt_if_optimize_phi_true_false);
3186      NIR_PASS(progress, s, nir_opt_peephole_select,
3187               control_flow_depth == 0 ? ~0 : 8, true, true);
3188      NIR_PASS(progress, s, nir_opt_algebraic);
3189      NIR_PASS(progress, s, nir_opt_constant_folding);
3190      nir_load_store_vectorize_options vectorize_opts = {
3191         .modes = nir_var_mem_ubo,
3192         .callback = ntt_should_vectorize_io,
3193         .robust_modes = 0,
3194      };
3195      NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
3196      NIR_PASS(progress, s, nir_opt_shrink_stores, true);
3197      NIR_PASS(progress, s, nir_opt_shrink_vectors);
3198      NIR_PASS(progress, s, nir_opt_trivial_continues);
3199      NIR_PASS(progress, s, nir_opt_vectorize, ntt_should_vectorize_instr, NULL);
3200      NIR_PASS(progress, s, nir_opt_undef);
3201      NIR_PASS(progress, s, nir_opt_loop_unroll);
3202
3203      /* Try to fold addressing math into ubo_vec4's base to avoid load_consts
3204       * and ALU ops for it.
3205       */
3206      static const nir_opt_offsets_options offset_options = {
3207         .ubo_vec4_max = ~0,
3208
3209         /* No const offset in TGSI for shared accesses. */
3210         .shared_max = 0,
3211
3212         /* unused intrinsics */
3213         .uniform_max = 0,
3214         .buffer_max = 0,
3215      };
3216      NIR_PASS(progress, s, nir_opt_offsets, &offset_options);
3217   } while (progress);
3218
3219   NIR_PASS_V(s, nir_lower_var_copies);
3220}
3221
3222/* Scalarizes all 64-bit ALU ops.  Note that we only actually need to
3223 * scalarize vec3/vec4s, should probably fix that.
3224 */
3225static bool
3226scalarize_64bit(const nir_instr *instr, const void *data)
3227{
3228   const nir_alu_instr *alu = nir_instr_as_alu(instr);
3229
3230   return (nir_dest_bit_size(alu->dest.dest) == 64 ||
3231           nir_src_bit_size(alu->src[0].src) == 64);
3232}
3233
3234static bool
3235nir_to_tgsi_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
3236{
3237   b->cursor = nir_after_instr(&instr->instr);
3238
3239   switch (instr->intrinsic) {
3240   case nir_intrinsic_load_ubo:
3241   case nir_intrinsic_load_ubo_vec4:
3242   case nir_intrinsic_load_ssbo:
3243   case nir_intrinsic_load_input:
3244   case nir_intrinsic_load_interpolated_input:
3245   case nir_intrinsic_load_per_vertex_input:
3246   case nir_intrinsic_store_output:
3247   case nir_intrinsic_store_per_vertex_output:
3248   case nir_intrinsic_store_ssbo:
3249      break;
3250   default:
3251      return false;
3252   }
3253
3254   if (instr->num_components <= 2)
3255      return false;
3256
3257   bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
3258   if (has_dest) {
3259      if (nir_dest_bit_size(instr->dest) != 64)
3260         return false;
3261   } else  {
3262      if (nir_src_bit_size(instr->src[0]) != 64)
3263          return false;
3264   }
3265
3266   nir_intrinsic_instr *first =
3267      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
3268   nir_intrinsic_instr *second =
3269      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
3270
3271   switch (instr->intrinsic) {
3272   case nir_intrinsic_load_ubo:
3273   case nir_intrinsic_load_ubo_vec4:
3274   case nir_intrinsic_load_ssbo:
3275   case nir_intrinsic_store_ssbo:
3276      break;
3277
3278   default: {
3279      nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
3280      semantics.location++;
3281      semantics.num_slots--;
3282      nir_intrinsic_set_io_semantics(second, semantics);
3283
3284      nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
3285      break;
3286   }
3287   }
3288
3289   first->num_components = 2;
3290   second->num_components -= 2;
3291   if (has_dest) {
3292      first->dest.ssa.num_components = 2;
3293      second->dest.ssa.num_components -= 2;
3294   }
3295
3296   nir_builder_instr_insert(b, &first->instr);
3297   nir_builder_instr_insert(b, &second->instr);
3298
3299   if (has_dest) {
3300      /* Merge the two loads' results back into a vector. */
3301      nir_ssa_scalar channels[4] = {
3302         nir_get_ssa_scalar(&first->dest.ssa, 0),
3303         nir_get_ssa_scalar(&first->dest.ssa, 1),
3304         nir_get_ssa_scalar(&second->dest.ssa, 0),
3305         nir_get_ssa_scalar(&second->dest.ssa, second->num_components > 1 ? 1 : 0),
3306      };
3307      nir_ssa_def *new = nir_vec_scalars(b, channels, instr->num_components);
3308      nir_ssa_def_rewrite_uses(&instr->dest.ssa, new);
3309   } else {
3310      /* Split the src value across the two stores. */
3311      b->cursor = nir_before_instr(&instr->instr);
3312
3313      nir_ssa_def *src0 = instr->src[0].ssa;
3314      nir_ssa_scalar channels[4] = { 0 };
3315      for (int i = 0; i < instr->num_components; i++)
3316         channels[i] = nir_get_ssa_scalar(src0, i);
3317
3318      nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
3319      nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);
3320
3321      nir_instr_rewrite_src(&first->instr, &first->src[0],
3322                            nir_src_for_ssa(nir_vec_scalars(b, channels, 2)));
3323      nir_instr_rewrite_src(&second->instr, &second->src[0],
3324                            nir_src_for_ssa(nir_vec_scalars(b, &channels[2],
3325                                                           second->num_components)));
3326   }
3327
3328   int offset_src = -1;
3329   uint32_t offset_amount = 16;
3330
3331   switch (instr->intrinsic) {
3332   case nir_intrinsic_load_ssbo:
3333   case nir_intrinsic_load_ubo:
3334      offset_src = 1;
3335      break;
3336   case nir_intrinsic_load_ubo_vec4:
3337      offset_src = 1;
3338      offset_amount = 1;
3339      break;
3340   case nir_intrinsic_store_ssbo:
3341      offset_src = 2;
3342      break;
3343   default:
3344      break;
3345   }
3346   if (offset_src != -1) {
3347      b->cursor = nir_before_instr(&second->instr);
3348      nir_ssa_def *second_offset =
3349         nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
3350      nir_instr_rewrite_src(&second->instr, &second->src[offset_src],
3351                            nir_src_for_ssa(second_offset));
3352   }
3353
3354   /* DCE stores we generated with no writemask (nothing else does this
3355    * currently).
3356    */
3357   if (!has_dest) {
3358      if (nir_intrinsic_write_mask(first) == 0)
3359         nir_instr_remove(&first->instr);
3360      if (nir_intrinsic_write_mask(second) == 0)
3361         nir_instr_remove(&second->instr);
3362   }
3363
3364   nir_instr_remove(&instr->instr);
3365
3366   return true;
3367}
3368
3369static bool
3370nir_to_tgsi_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
3371{
3372   int num_components = instr->def.num_components;
3373
3374   if (instr->def.bit_size != 64 || num_components <= 2)
3375      return false;
3376
3377   b->cursor = nir_before_instr(&instr->instr);
3378
3379   nir_load_const_instr *first =
3380      nir_load_const_instr_create(b->shader, 2, 64);
3381   nir_load_const_instr *second =
3382      nir_load_const_instr_create(b->shader, num_components - 2, 64);
3383
3384   first->value[0] = instr->value[0];
3385   first->value[1] = instr->value[1];
3386   second->value[0] = instr->value[2];
3387   if (num_components == 4)
3388      second->value[1] = instr->value[3];
3389
3390   nir_builder_instr_insert(b, &first->instr);
3391   nir_builder_instr_insert(b, &second->instr);
3392
3393   nir_ssa_def *channels[4] = {
3394      nir_channel(b, &first->def, 0),
3395      nir_channel(b, &first->def, 1),
3396      nir_channel(b, &second->def, 0),
3397      num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
3398   };
3399   nir_ssa_def *new = nir_vec(b, channels, num_components);
3400   nir_ssa_def_rewrite_uses(&instr->def, new);
3401   nir_instr_remove(&instr->instr);
3402
3403   return true;
3404}
3405
3406static bool
3407nir_to_tgsi_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr,
3408                                      void *data)
3409{
3410   switch (instr->type) {
3411   case nir_instr_type_load_const:
3412      return nir_to_tgsi_lower_64bit_load_const(b, nir_instr_as_load_const(instr));
3413
3414   case nir_instr_type_intrinsic:
3415      return nir_to_tgsi_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
3416   default:
3417      return false;
3418   }
3419}
3420
3421static bool
3422nir_to_tgsi_lower_64bit_to_vec2(nir_shader *s)
3423{
3424   return nir_shader_instructions_pass(s,
3425                                       nir_to_tgsi_lower_64bit_to_vec2_instr,
3426                                       nir_metadata_block_index |
3427                                       nir_metadata_dominance,
3428                                       NULL);
3429}
3430
3431struct ntt_lower_tex_state {
3432   nir_ssa_scalar channels[8];
3433   unsigned i;
3434};
3435
3436static void
3437nir_to_tgsi_lower_tex_instr_arg(nir_builder *b,
3438                                nir_tex_instr *instr,
3439                                nir_tex_src_type tex_src_type,
3440                                struct ntt_lower_tex_state *s)
3441{
3442   int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
3443   if (tex_src < 0)
3444      return;
3445
3446   assert(instr->src[tex_src].src.is_ssa);
3447
3448   nir_ssa_def *def = instr->src[tex_src].src.ssa;
3449   for (int i = 0; i < def->num_components; i++) {
3450      s->channels[s->i++] = nir_get_ssa_scalar(def, i);
3451   }
3452
3453   nir_tex_instr_remove_src(instr, tex_src);
3454}
3455
3456/**
3457 * Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex
3458 * src.  This lets NIR handle the coalescing of the vec4 rather than trying to
3459 * manage it on our own, and may lead to more vectorization.
3460 */
3461static bool
3462nir_to_tgsi_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data)
3463{
3464   if (instr->type != nir_instr_type_tex)
3465      return false;
3466
3467   nir_tex_instr *tex = nir_instr_as_tex(instr);
3468
3469   if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0)
3470      return false;
3471
3472   b->cursor = nir_before_instr(instr);
3473
3474   struct ntt_lower_tex_state s = {0};
3475
3476   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s);
3477   /* We always have at least two slots for the coordinate, even on 1D. */
3478   s.i = MAX2(s.i, 2);
3479
3480   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s);
3481   s.i = MAX2(s.i, 3);
3482
3483   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s);
3484
3485   /* XXX: LZ */
3486   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s);
3487   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s);
3488   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s);
3489
3490   /* No need to pack undefs in unused channels of the tex instr */
3491   while (!s.channels[s.i - 1].def)
3492      s.i--;
3493
3494   /* Instead of putting undefs in the unused slots of the vecs, just put in
3495    * another used channel.  Otherwise, we'll get unnecessary moves into
3496    * registers.
3497    */
3498   assert(s.channels[0].def != NULL);
3499   for (int i = 1; i < s.i; i++) {
3500      if (!s.channels[i].def)
3501         s.channels[i] = s.channels[0];
3502   }
3503
3504   nir_tex_instr_add_src(tex, nir_tex_src_backend1, nir_src_for_ssa(nir_vec_scalars(b, s.channels, MIN2(s.i, 4))));
3505   if (s.i > 4)
3506      nir_tex_instr_add_src(tex, nir_tex_src_backend2, nir_src_for_ssa(nir_vec_scalars(b, &s.channels[4], s.i - 4)));
3507
3508   return true;
3509}
3510
3511static bool
3512nir_to_tgsi_lower_tex(nir_shader *s)
3513{
3514   return nir_shader_instructions_pass(s,
3515                                       nir_to_tgsi_lower_tex_instr,
3516                                       nir_metadata_block_index |
3517                                       nir_metadata_dominance,
3518                                       NULL);
3519}
3520
3521static void
3522ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s,
3523                    const struct nir_to_tgsi_options *ntt_options)
3524{
3525   const struct nir_shader_compiler_options *options = s->options;
3526   bool lower_fsqrt =
3527      !screen->get_shader_param(screen, pipe_shader_type_from_mesa(s->info.stage),
3528                                PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED);
3529
3530   bool force_indirect_unrolling_sampler =
3531      screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL) < 400;
3532
3533   nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
3534
3535   if (!options->lower_extract_byte ||
3536       !options->lower_extract_word ||
3537       !options->lower_insert_byte ||
3538       !options->lower_insert_word ||
3539       !options->lower_fdph ||
3540       !options->lower_flrp64 ||
3541       !options->lower_fmod ||
3542       !options->lower_rotate ||
3543       !options->lower_uadd_sat ||
3544       !options->lower_usub_sat ||
3545       !options->lower_uniforms_to_ubo ||
3546       !options->lower_vector_cmp ||
3547       options->lower_fsqrt != lower_fsqrt ||
3548       options->force_indirect_unrolling != no_indirects_mask ||
3549       force_indirect_unrolling_sampler) {
3550      nir_shader_compiler_options *new_options = ralloc(s, nir_shader_compiler_options);
3551      *new_options = *s->options;
3552
3553      new_options->lower_extract_byte = true;
3554      new_options->lower_extract_word = true;
3555      new_options->lower_insert_byte = true;
3556      new_options->lower_insert_word = true;
3557      new_options->lower_fdph = true;
3558      new_options->lower_flrp64 = true;
3559      new_options->lower_fmod = true;
3560      new_options->lower_rotate = true;
3561      new_options->lower_uadd_sat = true;
3562      new_options->lower_usub_sat = true;
3563      new_options->lower_uniforms_to_ubo = true;
3564      new_options->lower_vector_cmp = true;
3565      new_options->lower_fsqrt = lower_fsqrt;
3566      new_options->force_indirect_unrolling = no_indirects_mask;
3567      new_options->force_indirect_unrolling_sampler = force_indirect_unrolling_sampler;
3568
3569      s->options = new_options;
3570   }
3571}
3572
3573static bool
3574ntt_lower_atomic_pre_dec_filter(const nir_instr *instr, const void *_data)
3575{
3576   return (instr->type == nir_instr_type_intrinsic &&
3577           nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_atomic_counter_pre_dec);
3578}
3579
3580static nir_ssa_def *
3581ntt_lower_atomic_pre_dec_lower(nir_builder *b, nir_instr *instr, void *_data)
3582{
3583   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3584
3585   nir_ssa_def *old_result = &intr->dest.ssa;
3586   intr->intrinsic = nir_intrinsic_atomic_counter_post_dec;
3587
3588   return nir_iadd_imm(b, old_result, -1);
3589}
3590
3591static bool
3592ntt_lower_atomic_pre_dec(nir_shader *s)
3593{
3594   return nir_shader_lower_instructions(s,
3595                                        ntt_lower_atomic_pre_dec_filter,
3596                                        ntt_lower_atomic_pre_dec_lower, NULL);
3597}
3598
3599/* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. */
3600static void
3601nir_to_tgsi_lower_txp(nir_shader *s)
3602{
3603   nir_lower_tex_options lower_tex_options = {
3604       .lower_txp = 0,
3605   };
3606
3607   nir_foreach_block(block, nir_shader_get_entrypoint(s)) {
3608      nir_foreach_instr(instr, block) {
3609         if (instr->type != nir_instr_type_tex)
3610            continue;
3611         nir_tex_instr *tex = nir_instr_as_tex(instr);
3612
3613         if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0)
3614            continue;
3615
3616         bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0;
3617         bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 || s->info.stage != MESA_SHADER_FRAGMENT;
3618         bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0;
3619
3620         /* We can do TXP for any tex (not txg) where we can fit all the
3621          * coordinates and comparator and projector in one vec4 without any
3622          * other modifiers to add on.
3623          *
3624          * nir_lower_tex() only handles the lowering on a sampler-dim basis, so
3625          * if we get any funny projectors then we just blow them all away.
3626          */
3627         if (tex->op != nir_texop_tex || has_lod || has_offset || (tex->coord_components >= 3 && has_compare))
3628            lower_tex_options.lower_txp |= 1 << tex->sampler_dim;
3629      }
3630   }
3631
3632   /* nir_lower_tex must be run even if no options are set, because we need the
3633    * LOD to be set for query_levels and for non-fragment shaders.
3634    */
3635   NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
3636}
3637
3638static bool
3639nir_lower_primid_sysval_to_input_filter(const nir_instr *instr, const void *_data)
3640{
3641   return (instr->type == nir_instr_type_intrinsic &&
3642           nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_primitive_id);
3643}
3644
3645static nir_ssa_def *
3646nir_lower_primid_sysval_to_input_lower(nir_builder *b, nir_instr *instr, void *data)
3647{
3648   nir_variable *var = *(nir_variable **)data;
3649   if (!var) {
3650      var = nir_variable_create(b->shader, nir_var_shader_in, glsl_uint_type(), "gl_PrimitiveID");
3651      var->data.location = VARYING_SLOT_PRIMITIVE_ID;
3652      b->shader->info.inputs_read |= VARYING_BIT_PRIMITIVE_ID;
3653      var->data.driver_location = b->shader->num_inputs++;
3654
3655      *(nir_variable **)data = var;
3656   }
3657
3658   nir_io_semantics semantics = {
3659      .location = var->data.location,
3660       .num_slots = 1
3661   };
3662   return nir_load_input(b, 1, 32, nir_imm_int(b, 0),
3663                         .base = var->data.driver_location,
3664                         .io_semantics = semantics);
3665}
3666
3667static bool
3668nir_lower_primid_sysval_to_input(nir_shader *s)
3669{
3670   nir_variable *input = NULL;
3671
3672   return nir_shader_lower_instructions(s,
3673                                        nir_lower_primid_sysval_to_input_filter,
3674                                        nir_lower_primid_sysval_to_input_lower, &input);
3675}
3676
3677const void *
3678nir_to_tgsi(struct nir_shader *s,
3679            struct pipe_screen *screen)
3680{
3681   static const struct nir_to_tgsi_options default_ntt_options = {0};
3682   return nir_to_tgsi_options(s, screen, &default_ntt_options);
3683}
3684
3685/* Prevent lower_vec_to_mov from coalescing 64-to-32 conversions and comparisons
3686 * into unsupported channels of registers.
3687 */
3688static bool
3689ntt_vec_to_mov_writemask_cb(const nir_instr *instr, unsigned writemask, UNUSED const void *_data)
3690{
3691   if (instr->type != nir_instr_type_alu)
3692      return false;
3693
3694   nir_alu_instr *alu = nir_instr_as_alu(instr);
3695   int dst_32 = nir_dest_bit_size(alu->dest.dest) == 32;
3696   int src_64 = nir_src_bit_size(alu->src[0].src) == 64;
3697
3698   if (src_64 && dst_32) {
3699      int num_srcs = nir_op_infos[alu->op].num_inputs;
3700
3701      if (num_srcs == 2 || nir_op_infos[alu->op].output_type == nir_type_bool32) {
3702         /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz
3703          * instead of .xy.  Just support scalar compares storing to .x,
3704          * GLSL-to-TGSI only ever emitted scalar ops anyway.
3705          */
3706        if (writemask != TGSI_WRITEMASK_X)
3707           return false;
3708      } else {
3709         /* TGSI's 64-to-32-bit conversions can only store to .xy (since a TGSI
3710          * register can only store a dvec2).  Don't try to coalesce to write to
3711          * .zw.
3712          */
3713         if (writemask & ~(TGSI_WRITEMASK_XY))
3714            return false;
3715      }
3716   }
3717
3718   return true;
3719}
3720
3721/**
3722 * Translates the NIR shader to TGSI.
3723 *
3724 * This requires some lowering of the NIR shader to prepare it for translation.
3725 * We take ownership of the NIR shader passed, returning a reference to the new
3726 * TGSI tokens instead.  If you need to keep the NIR, then pass us a clone.
3727 */
3728const void *nir_to_tgsi_options(struct nir_shader *s,
3729                                struct pipe_screen *screen,
3730                                const struct nir_to_tgsi_options *options)
3731{
3732   struct ntt_compile *c;
3733   const void *tgsi_tokens;
3734   nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
3735   bool native_integers = screen->get_shader_param(screen,
3736                                                   pipe_shader_type_from_mesa(s->info.stage),
3737                                                   PIPE_SHADER_CAP_INTEGERS);
3738   const struct nir_shader_compiler_options *original_options = s->options;
3739
3740   ntt_fix_nir_options(screen, s, options);
3741
3742   /* Lower array indexing on FS inputs.  Since we don't set
3743    * ureg->supports_any_inout_decl_range, the TGSI input decls will be split to
3744    * elements by ureg, and so dynamically indexing them would be invalid.
3745    * Ideally we would set that ureg flag based on
3746    * PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE, but can't due to mesa/st
3747    * splitting NIR VS outputs to elements even if the FS doesn't get the
3748    * corresponding splitting, and virgl depends on TGSI across link boundaries
3749    * having matching declarations.
3750    */
3751   if (s->info.stage == MESA_SHADER_FRAGMENT) {
3752      NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
3753      NIR_PASS_V(s, nir_remove_dead_variables, nir_var_shader_in, NULL);
3754   }
3755
3756   NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
3757              type_size, (nir_lower_io_options)0);
3758   NIR_PASS_V(s, nir_lower_regs_to_ssa);
3759
3760   nir_to_tgsi_lower_txp(s);
3761   NIR_PASS_V(s, nir_to_tgsi_lower_tex);
3762
3763   /* While TGSI can represent PRIMID as either an input or a system value,
3764    * glsl-to-tgsi had the GS (not TCS or TES) primid as an input, and drivers
3765    * depend on that.
3766    */
3767   if (s->info.stage == MESA_SHADER_GEOMETRY)
3768      NIR_PASS_V(s, nir_lower_primid_sysval_to_input);
3769
3770   if (s->info.num_abos)
3771      NIR_PASS_V(s, ntt_lower_atomic_pre_dec);
3772
3773   if (!original_options->lower_uniforms_to_ubo) {
3774      NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
3775                 screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
3776                 !native_integers);
3777   }
3778
3779   /* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
3780    * TGSI stores up to a vec2 in each slot, so to avoid a whole bunch of op
3781    * duplication logic we just make it so that we only see vec2s.
3782    */
3783   NIR_PASS_V(s, nir_lower_alu_to_scalar, scalarize_64bit, NULL);
3784   NIR_PASS_V(s, nir_to_tgsi_lower_64bit_to_vec2);
3785
3786   if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
3787      NIR_PASS_V(s, nir_lower_ubo_vec4);
3788
3789   ntt_optimize_nir(s, screen);
3790
3791   NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);
3792
3793   bool progress;
3794   do {
3795      progress = false;
3796      NIR_PASS(progress, s, nir_opt_algebraic_late);
3797      if (progress) {
3798         NIR_PASS_V(s, nir_copy_prop);
3799         NIR_PASS_V(s, nir_opt_dce);
3800         NIR_PASS_V(s, nir_opt_cse);
3801      }
3802   } while (progress);
3803
3804   if (screen->get_shader_param(screen,
3805                                pipe_shader_type_from_mesa(s->info.stage),
3806                                PIPE_SHADER_CAP_INTEGERS)) {
3807      NIR_PASS_V(s, nir_lower_bool_to_int32);
3808   } else {
3809      NIR_PASS_V(s, nir_lower_int_to_float);
3810      NIR_PASS_V(s, nir_lower_bool_to_float);
3811      /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
3812      NIR_PASS_V(s, nir_copy_prop);
3813      NIR_PASS_V(s, nir_opt_dce);
3814   }
3815
3816   nir_move_options move_all =
3817       nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
3818       nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
3819
3820   NIR_PASS_V(s, nir_opt_move, move_all);
3821
3822   /* Only lower 32-bit floats.  The only other modifier type officially
3823    * supported by TGSI is 32-bit integer negates, but even those are broken on
3824    * virglrenderer, so skip lowering all integer and f64 float mods.
3825    *
3826    * The options->lower_fabs requests that we not have native source modifiers
3827    * for fabs, and instead emit MAX(a,-a) for nir_op_fabs.
3828    */
3829   nir_lower_to_source_mods_flags source_mods = nir_lower_fneg_source_mods;
3830   if (!options->lower_fabs)
3831      source_mods |= nir_lower_fabs_source_mods;
3832   NIR_PASS_V(s, nir_lower_to_source_mods, source_mods);
3833
3834   NIR_PASS_V(s, nir_convert_from_ssa, true);
3835   NIR_PASS_V(s, nir_lower_vec_to_movs, ntt_vec_to_mov_writemask_cb, NULL);
3836
3837   /* locals_to_regs will leave dead derefs that are good to clean up. */
3838   NIR_PASS_V(s, nir_lower_locals_to_regs);
3839   NIR_PASS_V(s, nir_opt_dce);
3840
3841   if (NIR_DEBUG(TGSI)) {
3842      fprintf(stderr, "NIR before translation to TGSI:\n");
3843      nir_print_shader(s, stderr);
3844   }
3845
3846   c = rzalloc(NULL, struct ntt_compile);
3847   c->screen = screen;
3848   c->options = options;
3849
3850   c->needs_texcoord_semantic =
3851      screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
3852   c->has_txf_lz =
3853      screen->get_param(screen, PIPE_CAP_TGSI_TEX_TXF_LZ);
3854
3855   c->s = s;
3856   c->native_integers = native_integers;
3857   c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
3858   ureg_setup_shader_info(c->ureg, &s->info);
3859   if (s->info.use_legacy_math_rules && screen->get_param(screen, PIPE_CAP_LEGACY_MATH_RULES))
3860      ureg_property(c->ureg, TGSI_PROPERTY_LEGACY_MATH_RULES, 1);
3861
3862   if (s->info.stage == MESA_SHADER_FRAGMENT) {
3863      /* The draw module's polygon stipple layer doesn't respect the chosen
3864       * coordinate mode, so leave it as unspecified unless we're actually
3865       * reading the position in the shader already.  See
3866       * gl-2.1-polygon-stipple-fs on softpipe.
3867       */
3868      if ((s->info.inputs_read & VARYING_BIT_POS) ||
3869          BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
3870         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
3871                       s->info.fs.origin_upper_left ?
3872                       TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
3873                       TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
3874
3875         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
3876                       s->info.fs.pixel_center_integer ?
3877                       TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
3878                       TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
3879      }
3880   }
3881   /* Emit the main function */
3882   nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
3883   ntt_emit_impl(c, impl);
3884   ureg_END(c->ureg);
3885
3886   tgsi_tokens = ureg_get_tokens(c->ureg, NULL);
3887
3888   if (NIR_DEBUG(TGSI)) {
3889      fprintf(stderr, "TGSI after translation from NIR:\n");
3890      tgsi_dump(tgsi_tokens, 0);
3891   }
3892
3893   ureg_destroy(c->ureg);
3894
3895   ralloc_free(c);
3896   ralloc_free(s);
3897
3898   return tgsi_tokens;
3899}
3900
3901static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
3902   .fdot_replicates = true,
3903   .fuse_ffma32 = true,
3904   .fuse_ffma64 = true,
3905   .lower_extract_byte = true,
3906   .lower_extract_word = true,
3907   .lower_insert_byte = true,
3908   .lower_insert_word = true,
3909   .lower_fdph = true,
3910   .lower_flrp64 = true,
3911   .lower_fmod = true,
3912   .lower_rotate = true,
3913   .lower_uniforms_to_ubo = true,
3914   .lower_uadd_sat = true,
3915   .lower_usub_sat = true,
3916   .lower_vector_cmp = true,
3917   .lower_int64_options = nir_lower_imul_2x32_64,
3918   .use_interpolated_input_intrinsics = true,
3919};
3920
3921/* Returns a default compiler options for drivers with only nir-to-tgsi-based
3922 * NIR support.
3923 */
3924const void *
3925nir_to_tgsi_get_compiler_options(struct pipe_screen *pscreen,
3926                                 enum pipe_shader_ir ir,
3927                                 unsigned shader)
3928{
3929   assert(ir == PIPE_SHADER_IR_NIR);
3930   return &nir_to_tgsi_compiler_options;
3931}
3932
3933/** Helper for getting TGSI tokens to store for a pipe_shader_state CSO. */
3934const void *
3935pipe_shader_state_to_tgsi_tokens(struct pipe_screen *screen,
3936                                 const struct pipe_shader_state *cso)
3937{
3938   if (cso->type == PIPE_SHADER_IR_NIR) {
3939      return nir_to_tgsi((nir_shader *)cso->ir.nir, screen);
3940   } else {
3941      assert(cso->type == PIPE_SHADER_IR_TGSI);
3942      /* we need to keep a local copy of the tokens */
3943      return tgsi_dup_tokens(cso->tokens);
3944   }
3945}
3946