1/*
2 * Copyright (C) 2020 Collabora Ltd.
3 * Copyright (C) 2022 Alyssa Rosenzweig <alyssa@rosenzweig.io>
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 *
24 * Authors (Collabora):
25 *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
26 */
27
28#include "compiler/glsl/glsl_to_nir.h"
29#include "compiler/nir_types.h"
30#include "compiler/nir/nir_builder.h"
31#include "compiler/nir/nir_schedule.h"
32#include "util/u_debug.h"
33
34#include "disassemble.h"
35#include "valhall/va_compiler.h"
36#include "valhall/disassemble.h"
37#include "bifrost_compile.h"
38#include "compiler.h"
39#include "valhall/va_compiler.h"
40#include "bi_quirks.h"
41#include "bi_builder.h"
42#include "bifrost_nir.h"
43
44static const struct debug_named_value bifrost_debug_options[] = {
45        {"msgs",      BIFROST_DBG_MSGS,		"Print debug messages"},
46        {"shaders",   BIFROST_DBG_SHADERS,	"Dump shaders in NIR and MIR"},
47        {"shaderdb",  BIFROST_DBG_SHADERDB,	"Print statistics"},
48        {"verbose",   BIFROST_DBG_VERBOSE,	"Disassemble verbosely"},
49        {"internal",  BIFROST_DBG_INTERNAL,	"Dump even internal shaders"},
50        {"nosched",   BIFROST_DBG_NOSCHED, 	"Force trivial bundling"},
51        {"nopsched",  BIFROST_DBG_NOPSCHED,     "Disable scheduling for pressure"},
52        {"inorder",   BIFROST_DBG_INORDER, 	"Force in-order bundling"},
53        {"novalidate",BIFROST_DBG_NOVALIDATE,   "Skip IR validation"},
54        {"noopt",     BIFROST_DBG_NOOPT,        "Skip optimization passes"},
55        {"noidvs",    BIFROST_DBG_NOIDVS,       "Disable IDVS"},
56        {"nosb",      BIFROST_DBG_NOSB,         "Disable scoreboarding"},
57        {"nopreload", BIFROST_DBG_NOPRELOAD,    "Disable message preloading"},
58        {"spill",     BIFROST_DBG_SPILL,        "Test register spilling"},
59        DEBUG_NAMED_VALUE_END
60};
61
62DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG", bifrost_debug_options, 0)
63
64/* How many bytes are prefetched by the Bifrost shader core. From the final
65 * clause of the shader, this range must be valid instructions or zero. */
66#define BIFROST_SHADER_PREFETCH 128
67
68int bifrost_debug = 0;
69
70#define DBG(fmt, ...) \
71		do { if (bifrost_debug & BIFROST_DBG_MSGS) \
72			fprintf(stderr, "%s:%d: "fmt, \
73				__FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
74
75static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);
76
77static bi_index
78bi_preload(bi_builder *b, unsigned reg)
79{
80        if (bi_is_null(b->shader->preloaded[reg])) {
81                /* Insert at the beginning of the shader */
82                bi_builder b_ = *b;
83                b_.cursor = bi_before_block(bi_start_block(&b->shader->blocks));
84
85                /* Cache the result */
86                b->shader->preloaded[reg] = bi_mov_i32(&b_, bi_register(reg));
87        }
88
89        return b->shader->preloaded[reg];
90}
91
92static bi_index
93bi_coverage(bi_builder *b)
94{
95        if (bi_is_null(b->shader->coverage))
96                b->shader->coverage = bi_preload(b, 60);
97
98        return b->shader->coverage;
99}
100
101/*
102 * Vertex ID and Instance ID are preloaded registers. Where they are preloaded
103 * changed from Bifrost to Valhall. Provide helpers that smooth over the
104 * architectural difference.
105 */
106static inline bi_index
107bi_vertex_id(bi_builder *b)
108{
109        return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61);
110}
111
112static inline bi_index
113bi_instance_id(bi_builder *b)
114{
115        return bi_preload(b, (b->shader->arch >= 9) ? 61 : 62);
116}
117
118static void
119bi_emit_jump(bi_builder *b, nir_jump_instr *instr)
120{
121        bi_instr *branch = bi_jump(b, bi_zero());
122
123        switch (instr->type) {
124        case nir_jump_break:
125                branch->branch_target = b->shader->break_block;
126                break;
127        case nir_jump_continue:
128                branch->branch_target = b->shader->continue_block;
129                break;
130        default:
131                unreachable("Unhandled jump type");
132        }
133
134        bi_block_add_successor(b->shader->current_block, branch->branch_target);
135        b->shader->current_block->unconditional_jumps = true;
136}
137
138/* Builds a 64-bit hash table key for an index */
139static uint64_t
140bi_index_to_key(bi_index idx)
141{
142        static_assert(sizeof(idx) <= sizeof(uint64_t), "too much padding");
143
144        uint64_t key = 0;
145        memcpy(&key, &idx, sizeof(idx));
146        return key;
147}
148
149/*
150 * Extract a single channel out of a vector source. We split vectors with SPLIT
151 * so we can use the split components directly, without emitting an extract.
152 * This has advantages of RA, as the split can usually be optimized away.
153 */
154static bi_index
155bi_extract(bi_builder *b, bi_index vec, unsigned channel)
156{
157        /* Extract caching relies on SSA form. It is incorrect for nir_register.
158         * Bypass the cache and emit an explicit split for registers.
159         */
160        if (vec.reg) {
161                bi_instr *I = bi_split_i32_to(b, bi_null(), vec);
162                I->nr_dests = channel + 1;
163                I->dest[channel] = bi_temp(b->shader);
164                return I->dest[channel];
165        }
166
167        bi_index *components =
168                _mesa_hash_table_u64_search(b->shader->allocated_vec,
169                                            bi_index_to_key(vec));
170
171        /* No extract needed for scalars.
172         *
173         * This is a bit imprecise, but actual bugs (missing splits for vectors)
174         * should be caught by the following assertion. It is too difficult to
175         * ensure bi_extract is only called for real vectors.
176         */
177        if (components == NULL && channel == 0)
178                return vec;
179
180        assert(components != NULL && "missing bi_cache_collect()");
181        return components[channel];
182}
183
184static void
185bi_cache_collect(bi_builder *b, bi_index dst, bi_index *s, unsigned n)
186{
187        /* Lifetime of a hash table entry has to be at least as long as the table */
188        bi_index *channels = ralloc_array(b->shader, bi_index, n);
189        memcpy(channels, s, sizeof(bi_index) * n);
190
191        _mesa_hash_table_u64_insert(b->shader->allocated_vec,
192                                    bi_index_to_key(dst), channels);
193}
194
195/*
196 * Splits an n-component vector (vec) into n scalar destinations (dests) using a
197 * split pseudo-instruction.
198 *
199 * Pre-condition: dests is filled with bi_null().
200 */
201static void
202bi_emit_split_i32(bi_builder *b, bi_index dests[4], bi_index vec, unsigned n)
203{
204        /* Setup the destinations */
205        for (unsigned i = 0; i < n; ++i) {
206                dests[i] = bi_temp(b->shader);
207        }
208
209        /* Emit the split */
210        if (n == 1) {
211                bi_mov_i32_to(b, dests[0], vec);
212        } else {
213                bi_instr *I = bi_split_i32_to(b, dests[0], vec);
214                I->nr_dests = n;
215
216                for (unsigned j = 1; j < n; ++j)
217                        I->dest[j] = dests[j];
218        }
219}
220
221static void
222bi_emit_cached_split_i32(bi_builder *b, bi_index vec, unsigned n)
223{
224        bi_index dests[4] = { bi_null(), bi_null(), bi_null(), bi_null() };
225        bi_emit_split_i32(b, dests, vec, n);
226        bi_cache_collect(b, vec, dests, n);
227}
228
229/*
230 * Emit and cache a split for a vector of a given bitsize. The vector may not be
231 * composed of 32-bit words, but it will be split at 32-bit word boundaries.
232 */
233static void
234bi_emit_cached_split(bi_builder *b, bi_index vec, unsigned bits)
235{
236        bi_emit_cached_split_i32(b, vec, DIV_ROUND_UP(bits, 32));
237}
238
239static void
240bi_split_dest(bi_builder *b, nir_dest dest)
241{
242        bi_emit_cached_split(b, bi_dest_index(&dest),
243                                nir_dest_bit_size(dest) *
244                                nir_dest_num_components(dest));
245}
246
247static bi_instr *
248bi_emit_collect_to(bi_builder *b, bi_index dst, bi_index *chan, unsigned n)
249{
250        /* Special case: COLLECT of a single value is a scalar move */
251        if (n == 1)
252                return bi_mov_i32_to(b, dst, chan[0]);
253
254        bi_instr *I = bi_collect_i32_to(b, dst);
255        I->nr_srcs = n;
256
257        for (unsigned i = 0; i < n; ++i)
258                I->src[i] = chan[i];
259
260        bi_cache_collect(b, dst, chan, n);
261        return I;
262}
263
264static bi_instr *
265bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1)
266{
267        return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1 }, 2);
268}
269
270static bi_instr *
271bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1, bi_index s2)
272{
273        return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1, s2 }, 3);
274}
275
276static bi_index
277bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1)
278{
279        bi_index dst = bi_temp(b->shader);
280        bi_collect_v2i32_to(b, dst, s0, s1);
281        return dst;
282}
283
284static bi_index
285bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
286{
287        switch (intr->intrinsic) {
288        case nir_intrinsic_load_barycentric_centroid:
289        case nir_intrinsic_load_barycentric_sample:
290                return bi_preload(b, 61);
291
292        /* Need to put the sample ID in the top 16-bits */
293        case nir_intrinsic_load_barycentric_at_sample:
294                return bi_mkvec_v2i16(b, bi_half(bi_dontcare(b), false),
295                                bi_half(bi_src_index(&intr->src[0]), false));
296
297        /* Interpret as 8:8 signed fixed point positions in pixels along X and
298         * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0)
299         * is the center of the pixel so we first fixup and then convert. For
300         * fp16 input:
301         *
302         * f2i16(((x, y) + (0.5, 0.5)) * 2**8) =
303         * f2i16((256 * (x, y)) + (128, 128)) =
304         * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128))
305         *
306         * For fp32 input, that lacks enough precision for MSAA 16x, but the
307         * idea is the same. FIXME: still doesn't pass
308         */
309        case nir_intrinsic_load_barycentric_at_offset: {
310                bi_index offset = bi_src_index(&intr->src[0]);
311                bi_index f16 = bi_null();
312                unsigned sz = nir_src_bit_size(intr->src[0]);
313
314                if (sz == 16) {
315                        f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0),
316                                        bi_imm_f16(128.0));
317                } else {
318                        assert(sz == 32);
319                        bi_index f[2];
320                        for (unsigned i = 0; i < 2; ++i) {
321                                f[i] = bi_fadd_rscale_f32(b,
322                                                bi_extract(b, offset, i),
323                                                bi_imm_f32(0.5), bi_imm_u32(8),
324                                                BI_SPECIAL_NONE);
325                        }
326
327                        f16 = bi_v2f32_to_v2f16(b, f[0], f[1]);
328                }
329
330                return bi_v2f16_to_v2s16(b, f16);
331        }
332
333        case nir_intrinsic_load_barycentric_pixel:
334        default:
335                return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b);
336        }
337}
338
339static enum bi_sample
340bi_interp_for_intrinsic(nir_intrinsic_op op)
341{
342        switch (op) {
343        case nir_intrinsic_load_barycentric_centroid:
344                return BI_SAMPLE_CENTROID;
345        case nir_intrinsic_load_barycentric_sample:
346        case nir_intrinsic_load_barycentric_at_sample:
347                return BI_SAMPLE_SAMPLE;
348        case nir_intrinsic_load_barycentric_at_offset:
349                return BI_SAMPLE_EXPLICIT;
350        case nir_intrinsic_load_barycentric_pixel:
351        default:
352                return BI_SAMPLE_CENTER;
353        }
354}
355
356/* auto, 64-bit omitted */
357static enum bi_register_format
358bi_reg_fmt_for_nir(nir_alu_type T)
359{
360        switch (T) {
361        case nir_type_float16: return BI_REGISTER_FORMAT_F16;
362        case nir_type_float32: return BI_REGISTER_FORMAT_F32;
363        case nir_type_int16:   return BI_REGISTER_FORMAT_S16;
364        case nir_type_uint16:  return BI_REGISTER_FORMAT_U16;
365        case nir_type_int32:   return BI_REGISTER_FORMAT_S32;
366        case nir_type_uint32:  return BI_REGISTER_FORMAT_U32;
367        default: unreachable("Invalid type for register format");
368        }
369}
370
371/* Checks if the _IMM variant of an intrinsic can be used, returning in imm the
372 * immediate to be used (which applies even if _IMM can't be used) */
373
374static bool
375bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate, unsigned max)
376{
377        nir_src *offset = nir_get_io_offset_src(instr);
378
379        if (!nir_src_is_const(*offset))
380                return false;
381
382        *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
383        return (*immediate) < max;
384}
385
386static void
387bi_make_vec_to(bi_builder *b, bi_index final_dst,
388                bi_index *src,
389                unsigned *channel,
390                unsigned count,
391                unsigned bitsize);
392
393/* Bifrost's load instructions lack a component offset despite operating in
394 * terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
395 * but they may be unavoidable with separate shaders in use. To solve this, we
396 * lower to a larger load and an explicit copy of the desired components. */
397
398static void
399bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp)
400{
401        unsigned component = nir_intrinsic_component(instr);
402        unsigned nr = instr->num_components;
403        unsigned total = nr + component;
404        unsigned bitsize = nir_dest_bit_size(instr->dest);
405
406        assert(total <= 4 && "should be vec4");
407        bi_emit_cached_split(b, tmp, total * bitsize);
408
409        if (component == 0)
410                return;
411
412        bi_index srcs[] = { tmp, tmp, tmp };
413        unsigned channels[] = { component, component + 1, component + 2 };
414
415        bi_make_vec_to(b, bi_dest_index(&instr->dest),
416                       srcs, channels, nr, nir_dest_bit_size(instr->dest));
417}
418
419static void
420bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
421{
422        nir_alu_type T = nir_intrinsic_dest_type(instr);
423        enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
424        nir_src *offset = nir_get_io_offset_src(instr);
425        unsigned component = nir_intrinsic_component(instr);
426        enum bi_vecsize vecsize = (instr->num_components + component - 1);
427        unsigned imm_index = 0;
428        unsigned base = nir_intrinsic_base(instr);
429        bool constant = nir_src_is_const(*offset);
430        bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
431        bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);
432        bi_instr *I;
433
434        if (immediate) {
435                I = bi_ld_attr_imm_to(b, dest, bi_vertex_id(b),
436                                      bi_instance_id(b), regfmt, vecsize,
437                                      imm_index);
438        } else {
439                bi_index idx = bi_src_index(&instr->src[0]);
440
441                if (constant)
442                        idx = bi_imm_u32(imm_index);
443                else if (base != 0)
444                        idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
445
446                I = bi_ld_attr_to(b, dest, bi_vertex_id(b), bi_instance_id(b),
447                                  idx, regfmt, vecsize);
448        }
449
450        if (b->shader->arch >= 9)
451                I->table = PAN_TABLE_ATTRIBUTE;
452
453        bi_copy_component(b, instr, dest);
454}
455
456/*
457 * ABI: Special (desktop GL) slots come first, tightly packed. General varyings
458 * come later, sparsely packed. This handles both linked and separable shaders
459 * with a common code path, with minimal keying only for desktop GL. Each slot
460 * consumes 16 bytes (TODO: fp16, partial vectors).
461 */
462static unsigned
463bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr)
464{
465        nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
466        uint32_t mask = ctx->inputs->fixed_varying_mask;
467
468        if (sem.location >= VARYING_SLOT_VAR0) {
469                unsigned nr_special = util_bitcount(mask);
470                unsigned general_index = (sem.location - VARYING_SLOT_VAR0);
471
472                return 16 * (nr_special + general_index);
473        } else {
474                return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location)));
475        }
476}
477
478/*
479 * Compute the offset in bytes of a varying with an immediate offset, adding the
480 * offset to the base computed above. Convenience method.
481 */
482static unsigned
483bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr)
484{
485        nir_src *src = nir_get_io_offset_src(intr);
486        assert(nir_src_is_const(*src) && "assumes immediate offset");
487
488        return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16);
489}
490
491static void
492bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
493{
494        enum bi_sample sample = BI_SAMPLE_CENTER;
495        enum bi_update update = BI_UPDATE_STORE;
496        enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
497        bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input;
498        bi_index src0 = bi_null();
499
500        unsigned component = nir_intrinsic_component(instr);
501        enum bi_vecsize vecsize = (instr->num_components + component - 1);
502        bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);
503
504        unsigned sz = nir_dest_bit_size(instr->dest);
505
506        if (smooth) {
507                nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
508                assert(parent);
509
510                sample = bi_interp_for_intrinsic(parent->intrinsic);
511                src0 = bi_varying_src0_for_barycentric(b, parent);
512
513                assert(sz == 16 || sz == 32);
514                regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16
515                        : BI_REGISTER_FORMAT_F32;
516        } else {
517                assert(sz == 32);
518                regfmt = BI_REGISTER_FORMAT_U32;
519
520                /* Valhall can't have bi_null() here, although the source is
521                 * logically unused for flat varyings
522                 */
523                if (b->shader->arch >= 9)
524                        src0 = bi_preload(b, 61);
525
526                /* Gather info as we go */
527                b->shader->info.bifrost->uses_flat_shading = true;
528        }
529
530        enum bi_source_format source_format =
531                smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;
532
533        nir_src *offset = nir_get_io_offset_src(instr);
534        unsigned imm_index = 0;
535        bool immediate = bi_is_intr_immediate(instr, &imm_index, 20);
536        bi_instr *I = NULL;
537
538        if (b->shader->malloc_idvs && immediate) {
539                /* Immediate index given in bytes. */
540                bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt,
541                                     sample, source_format, update, vecsize,
542                                     bi_varying_offset(b->shader, instr));
543        } else if (immediate && smooth) {
544                I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update,
545                                     vecsize, imm_index);
546        } else if (immediate && !smooth) {
547                I = bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt,
548                                          vecsize, imm_index);
549        } else {
550                bi_index idx = bi_src_index(offset);
551                unsigned base = nir_intrinsic_base(instr);
552
553                if (b->shader->malloc_idvs) {
554                        /* Index needs to be in bytes, but NIR gives the index
555                         * in slots. For now assume 16 bytes per element.
556                         */
557                        bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
558                        unsigned vbase = bi_varying_base_bytes(b->shader, instr);
559
560                        if (vbase != 0)
561                                idx_bytes = bi_iadd_u32(b, idx, bi_imm_u32(vbase), false);
562
563                        bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt,
564                                         sample, source_format, update,
565                                         vecsize);
566                } else if (smooth) {
567                        if (base != 0)
568                                idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
569
570                        I = bi_ld_var_to(b, dest, src0, idx, regfmt, sample,
571                                         update, vecsize);
572                } else {
573                        if (base != 0)
574                                idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
575
576                        I = bi_ld_var_flat_to(b, dest, idx,
577                                              BI_FUNCTION_NONE, regfmt,
578                                              vecsize);
579                }
580        }
581
582        /* Valhall usually uses machine-allocated IDVS. If this is disabled, use
583         * a simple Midgard-style ABI.
584         */
585        if (b->shader->arch >= 9 && I != NULL)
586                I->table = PAN_TABLE_ATTRIBUTE;
587
588        bi_copy_component(b, instr, dest);
589}
590
591static void
592bi_make_vec16_to(bi_builder *b, bi_index dst, bi_index *src,
593                unsigned *channel, unsigned count)
594{
595        bi_index srcs[BI_MAX_VEC];
596
597        for (unsigned i = 0; i < count; i += 2) {
598                bool next = (i + 1) < count;
599
600                unsigned chan = channel ? channel[i] : 0;
601                unsigned nextc = next && channel ? channel[i + 1] : 0;
602
603                bi_index w0 = bi_extract(b, src[i], chan >> 1);
604                bi_index w1 = next ? bi_extract(b, src[i + 1], nextc >> 1) : bi_zero();
605
606                bi_index h0 = bi_half(w0, chan & 1);
607                bi_index h1 = bi_half(w1, nextc & 1);
608
609                if (bi_is_word_equiv(w0, w1) && (chan & 1) == 0 && ((nextc & 1) == 1))
610                        srcs[i >> 1] = bi_mov_i32(b, w0);
611                else if (bi_is_word_equiv(w0, w1))
612                        srcs[i >> 1] = bi_swz_v2i16(b, bi_swz_16(w0, chan & 1, nextc & 1));
613                else
614                        srcs[i >> 1] = bi_mkvec_v2i16(b, h0, h1);
615        }
616
617        bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, 2));
618}
619
620static void
621bi_make_vec_to(bi_builder *b, bi_index dst,
622                bi_index *src,
623                unsigned *channel,
624                unsigned count,
625                unsigned bitsize)
626{
627        if (bitsize == 32) {
628                bi_index srcs[BI_MAX_VEC];
629
630                for (unsigned i = 0; i < count; ++i)
631                        srcs[i] = bi_extract(b, src[i], channel ? channel[i] : 0);
632
633                bi_emit_collect_to(b, dst, srcs, count);
634        } else if (bitsize == 16) {
635                bi_make_vec16_to(b, dst, src, channel, count);
636        } else if (bitsize == 8 && count == 1) {
637                bi_swz_v4i8_to(b, dst, bi_byte(
638                                        bi_extract(b, src[0], channel[0] >> 2),
639                                        channel[0] & 3));
640        } else {
641                unreachable("8-bit mkvec not yet supported");
642        }
643}
644
645static inline bi_instr *
646bi_load_ubo_to(bi_builder *b, unsigned bitsize, bi_index dest0, bi_index src0,
647                bi_index src1)
648{
649        bi_instr *I;
650
651        if (b->shader->arch >= 9) {
652                I = bi_ld_buffer_to(b, bitsize, dest0, src0, src1);
653                I->seg = BI_SEG_UBO;
654        } else {
655                I = bi_load_to(b, bitsize, dest0, src0, src1, BI_SEG_UBO, 0);
656        }
657
658        bi_emit_cached_split(b, dest0, bitsize);
659        return I;
660}
661
662static bi_instr *
663bi_load_sysval_to(bi_builder *b, bi_index dest, int sysval,
664                unsigned nr_components, unsigned offset)
665{
666        unsigned sysval_ubo = b->shader->inputs->fixed_sysval_ubo >= 0 ?
667                              b->shader->inputs->fixed_sysval_ubo :
668                              b->shader->nir->info.num_ubos;
669        unsigned uniform =
670                pan_lookup_sysval(b->shader->sysval_to_id,
671                                  b->shader->info.sysvals,
672                                  sysval);
673        unsigned idx = (uniform * 16) + offset;
674
675        return bi_load_ubo_to(b, nr_components * 32, dest,
676                              bi_imm_u32(idx), bi_imm_u32(sysval_ubo));
677}
678
679static void
680bi_load_sysval_nir(bi_builder *b, nir_intrinsic_instr *intr,
681                unsigned nr_components, unsigned offset)
682{
683        bi_load_sysval_to(b, bi_dest_index(&intr->dest),
684                        panfrost_sysval_for_instr(&intr->instr, NULL),
685                        nr_components, offset);
686}
687
688static bi_index
689bi_load_sysval(bi_builder *b, int sysval,
690                unsigned nr_components, unsigned offset)
691{
692        bi_index tmp = bi_temp(b->shader);
693        bi_load_sysval_to(b, tmp, sysval, nr_components, offset);
694        return tmp;
695}
696
697static void
698bi_load_sample_id_to(bi_builder *b, bi_index dst)
699{
700        /* r61[16:23] contains the sampleID, mask it out. Upper bits
701         * seem to read garbage (despite being architecturally defined
702         * as zero), so use a 5-bit mask instead of 8-bits */
703
704        bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f),
705                                bi_imm_u8(16), false);
706}
707
708static bi_index
709bi_load_sample_id(bi_builder *b)
710{
711        bi_index sample_id = bi_temp(b->shader);
712        bi_load_sample_id_to(b, sample_id);
713        return sample_id;
714}
715
716static bi_index
717bi_pixel_indices(bi_builder *b, unsigned rt)
718{
719        /* We want to load the current pixel. */
720        struct bifrost_pixel_indices pix = {
721                .y = BIFROST_CURRENT_PIXEL,
722                .rt = rt
723        };
724
725        uint32_t indices_u32 = 0;
726        memcpy(&indices_u32, &pix, sizeof(indices_u32));
727        bi_index indices = bi_imm_u32(indices_u32);
728
729        /* Sample index above is left as zero. For multisampling, we need to
730         * fill in the actual sample ID in the lower byte */
731
732        if (b->shader->inputs->blend.nr_samples > 1)
733                indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false);
734
735        return indices;
736}
737
738/* Source color is passed through r0-r3, or r4-r7 for the second source when
739 * dual-source blending. Preload the corresponding vector.
740 */
741static void
742bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
743{
744        nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
745        unsigned base = (sem.location == VARYING_SLOT_VAR0) ? 4 : 0;
746        unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr));
747        assert(size == 16 || size == 32);
748
749        bi_index srcs[] = {
750                bi_preload(b, base + 0), bi_preload(b, base + 1),
751                bi_preload(b, base + 2), bi_preload(b, base + 3)
752        };
753
754        bi_emit_collect_to(b, bi_dest_index(&instr->dest), srcs, size == 32 ? 4 : 2);
755}
756
757static void
758bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T,
759                 bi_index rgba2, nir_alu_type T2, unsigned rt)
760{
761        /* On Valhall, BLEND does not encode the return address */
762        bool bifrost = b->shader->arch <= 8;
763
764        /* Reads 2 or 4 staging registers to cover the input */
765        unsigned size = nir_alu_type_get_type_size(T);
766        unsigned size_2 = nir_alu_type_get_type_size(T2);
767        unsigned sr_count = (size <= 16) ? 2 : 4;
768        unsigned sr_count_2 = (size_2 <= 16) ? 2 : 4;
769        const struct panfrost_compile_inputs *inputs = b->shader->inputs;
770        uint64_t blend_desc = inputs->blend.bifrost_blend_desc;
771        enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
772
773        if (inputs->is_blend && inputs->blend.nr_samples > 1) {
774                /* Conversion descriptor comes from the compile inputs, pixel
775                 * indices derived at run time based on sample ID */
776                bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_coverage(b),
777                                bi_imm_u32(blend_desc >> 32),
778                                regfmt, BI_VECSIZE_V4);
779        } else if (b->shader->inputs->is_blend) {
780                uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc;
781
782                /* Blend descriptor comes from the compile inputs */
783                /* Put the result in r0 */
784
785                bi_blend_to(b, bifrost ? bi_temp(b->shader) : bi_null(), rgba,
786                                bi_coverage(b),
787                                bi_imm_u32(blend_desc),
788                                bi_imm_u32(blend_desc >> 32),
789                                bi_null(), regfmt, sr_count, 0);
790        } else {
791                /* Blend descriptor comes from the FAU RAM. By convention, the
792                 * return address on Bifrost is stored in r48 and will be used
793                 * by the blend shader to jump back to the fragment shader */
794
795                bi_blend_to(b, bifrost ? bi_temp(b->shader) : bi_null(), rgba,
796                                bi_coverage(b),
797                                bi_fau(BIR_FAU_BLEND_0 + rt, false),
798                                bi_fau(BIR_FAU_BLEND_0 + rt, true),
799                                rgba2, regfmt, sr_count, sr_count_2);
800        }
801
802        assert(rt < 8);
803        b->shader->info.bifrost->blend[rt].type = T;
804
805        if (T2)
806                b->shader->info.bifrost->blend_src1_type = T2;
807}
808
809/* Blend shaders do not need to run ATEST since they are dependent on a
810 * fragment shader that runs it. Blit shaders may not need to run ATEST, since
811 * ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and
812 * there are no writes to the coverage mask. The latter two are satisfied for
813 * all blit shaders, so we just care about early-z, which blit shaders force
814 * iff they do not write depth or stencil */
815
816static bool
817bi_skip_atest(bi_context *ctx, bool emit_zs)
818{
819        return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend;
820}
821
822static void
823bi_emit_atest(bi_builder *b, bi_index alpha)
824{
825        bi_instr *atest = bi_atest_to(b, bi_temp(b->shader), bi_coverage(b), alpha);
826        b->shader->emitted_atest = true;
827        b->shader->coverage = atest->dest[0];
828
829        /* Pseudo-source to encode in the tuple */
830        atest->src[2] = bi_fau(BIR_FAU_ATEST_PARAM, false);
831}
832
833static void
834bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
835{
836        bool combined = instr->intrinsic ==
837                nir_intrinsic_store_combined_output_pan;
838
839        unsigned writeout = combined ? nir_intrinsic_component(instr) :
840                PAN_WRITEOUT_C;
841
842        bool emit_blend = writeout & (PAN_WRITEOUT_C);
843        bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S);
844
845        const nir_variable *var =
846                nir_find_variable_with_driver_location(b->shader->nir,
847                                                       nir_var_shader_out, nir_intrinsic_base(instr));
848
849        unsigned loc = var ? var->data.location : 0;
850
851        bi_index src0 = bi_src_index(&instr->src[0]);
852
853        /* By ISA convention, the coverage mask is stored in R60. The store
854         * itself will be handled by a subsequent ATEST instruction */
855        if (loc == FRAG_RESULT_SAMPLE_MASK) {
856                bi_index orig = bi_coverage(b);
857                bi_index msaa = bi_load_sysval(b, PAN_SYSVAL_MULTISAMPLED, 1, 0);
858                bi_index new = bi_lshift_and_i32(b, orig, bi_extract(b, src0, 0), bi_imm_u8(0));
859
860                b->shader->coverage =
861                        bi_mux_i32(b, orig, new, msaa, BI_MUX_INT_ZERO);
862                return;
863        }
864
865        /* Emit ATEST if we have to, note ATEST requires a floating-point alpha
866         * value, but render target #0 might not be floating point. However the
867         * alpha value is only used for alpha-to-coverage, a stage which is
868         * skipped for pure integer framebuffers, so the issue is moot. */
869
870        if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) {
871                nir_alu_type T = nir_intrinsic_src_type(instr);
872
873                bi_index rgba = bi_src_index(&instr->src[0]);
874                bi_index alpha =
875                        (T == nir_type_float16) ? bi_half(bi_extract(b, rgba, 1), true) :
876                        (T == nir_type_float32) ? bi_extract(b, rgba, 3) :
877                        bi_dontcare(b);
878
879                /* Don't read out-of-bounds */
880                if (nir_src_num_components(instr->src[0]) < 4)
881                        alpha = bi_imm_f32(1.0);
882
883                bi_emit_atest(b, alpha);
884        }
885
886        if (emit_zs) {
887                bi_index z = bi_dontcare(b), s = bi_dontcare(b);
888
889                if (writeout & PAN_WRITEOUT_Z)
890                        z = bi_src_index(&instr->src[2]);
891
892                if (writeout & PAN_WRITEOUT_S)
893                        s = bi_src_index(&instr->src[3]);
894
895                b->shader->coverage = bi_zs_emit(b, z, s, bi_coverage(b),
896                                                 writeout & PAN_WRITEOUT_S,
897                                                 writeout & PAN_WRITEOUT_Z);
898        }
899
900        if (emit_blend) {
901                unsigned rt = loc ? (loc - FRAG_RESULT_DATA0) : 0;
902                bool dual = (writeout & PAN_WRITEOUT_2);
903                bi_index color = bi_src_index(&instr->src[0]);
904                bi_index color2 = dual ? bi_src_index(&instr->src[4]) : bi_null();
905                nir_alu_type T2 = dual ? nir_intrinsic_dest_type(instr) : 0;
906
907                /* Explicit copy since BLEND inputs are precoloured to R0-R3,
908                 * TODO: maybe schedule around this or implement in RA as a
909                 * spill */
910                bool has_mrt = false;
911
912                nir_foreach_shader_out_variable(var, b->shader->nir)
913                        has_mrt |= (var->data.location > FRAG_RESULT_DATA0);
914
915                if (has_mrt) {
916                        bi_index srcs[4] = { color, color, color, color };
917                        unsigned channels[4] = { 0, 1, 2, 3 };
918                        color = bi_temp(b->shader);
919                        bi_make_vec_to(b, color, srcs, channels,
920                                       nir_src_num_components(instr->src[0]),
921                                       nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)));
922                }
923
924                bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr),
925                                    color2, T2, rt);
926        }
927
928        if (b->shader->inputs->is_blend) {
929                /* Jump back to the fragment shader, return address is stored
930                 * in r48 (see above). On Valhall, only jump if the address is
931                 * nonzero. The check is free there and it implements the "jump
932                 * to 0 terminates the blend shader" that's automatic on
933                 * Bifrost.
934                 */
935                if (b->shader->arch >= 8)
936                        bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE);
937                else
938                        bi_jump(b, bi_preload(b, 48));
939        }
940}
941
942/**
943 * In a vertex shader, is the specified variable a position output? These kinds
944 * of outputs are written from position shaders when IDVS is enabled. All other
945 * outputs are written from the varying shader.
946 */
947static bool
948bi_should_remove_store(nir_intrinsic_instr *intr, enum bi_idvs_mode idvs)
949{
950        nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
951
952        switch (sem.location) {
953        case VARYING_SLOT_POS:
954        case VARYING_SLOT_PSIZ:
955                return idvs == BI_IDVS_VARYING;
956        default:
957                return idvs == BI_IDVS_POSITION;
958        }
959}
960
961static bool
962bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data)
963{
964        enum bi_idvs_mode *idvs = data;
965
966        if (instr->type != nir_instr_type_intrinsic)
967                return false;
968
969        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
970
971        if (intr->intrinsic != nir_intrinsic_store_output)
972                return false;
973
974        if (bi_should_remove_store(intr, *idvs)) {
975                nir_instr_remove(instr);
976                return  true;
977        }
978
979        return false;
980}
981
982static void
983bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
984{
985        /* In principle we can do better for 16-bit. At the moment we require
986         * 32-bit to permit the use of .auto, in order to force .u32 for flat
987         * varyings, to handle internal TGSI shaders that set flat in the VS
988         * but smooth in the FS */
989
990        ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
991        ASSERTED unsigned T_size = nir_alu_type_get_type_size(T);
992        assert(T_size == 32 || (b->shader->arch >= 9 && T_size == 16));
993        enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
994
995        unsigned imm_index = 0;
996        bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
997
998        /* Only look at the total components needed. In effect, we fill in all
999         * the intermediate "holes" in the write mask, since we can't mask off
1000         * stores. Since nir_lower_io_to_temporaries ensures each varying is
1001         * written at most once, anything that's masked out is undefined, so it
1002         * doesn't matter what we write there. So we may as well do the
1003         * simplest thing possible. */
1004        unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
1005        assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));
1006
1007        bi_index data = bi_src_index(&instr->src[0]);
1008
1009        /* To keep the vector dimensions consistent, we need to drop some
1010         * components. This should be coalesced.
1011         *
1012         * TODO: This is ugly and maybe inefficient. Would we rather
1013         * introduce a TRIM.i32 pseudoinstruction?
1014         */
1015        if (nr < nir_intrinsic_src_components(instr, 0)) {
1016                assert(T_size == 32 && "todo: 16-bit trim");
1017
1018                bi_instr *split = bi_split_i32_to(b, bi_null(), data);
1019                split->nr_dests = nir_intrinsic_src_components(instr, 0);
1020
1021                bi_index tmp = bi_temp(b->shader);
1022                bi_instr *collect = bi_collect_i32_to(b, tmp);
1023                collect->nr_srcs = nr;
1024
1025                for (unsigned w = 0; w < nr; ++w) {
1026                        split->dest[w] = bi_temp(b->shader);
1027                        collect->src[w] = split->dest[w];
1028                }
1029
1030                data = tmp;
1031        }
1032
1033        bool psiz = (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PSIZ);
1034
1035        bi_index a[4] = { bi_null() };
1036
1037        if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) {
1038                /* Bifrost position shaders have a fast path */
1039                assert(T == nir_type_float16 || T == nir_type_float32);
1040                unsigned regfmt = (T == nir_type_float16) ? 0 : 1;
1041                unsigned identity = (b->shader->arch == 6) ? 0x688 : 0;
1042                unsigned snap4 = 0x5E;
1043                uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
1044
1045                bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59),
1046                          bi_imm_u32(format), regfmt, nr - 1);
1047        } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) {
1048                bi_index index = bi_preload(b, 59);
1049
1050                if (psiz) {
1051                        assert(T_size == 16 && "should've been lowered");
1052                        index = bi_iadd_imm_i32(b, index, 4);
1053                }
1054
1055                bi_index address = bi_lea_buf_imm(b, index);
1056                bi_emit_split_i32(b, a, address, 2);
1057
1058                bool varying = (b->shader->idvs == BI_IDVS_VARYING);
1059
1060                bi_store(b, nr * nir_src_bit_size(instr->src[0]),
1061                         data, a[0], a[1],
1062                         varying ? BI_SEG_VARY : BI_SEG_POS,
1063                         varying ? bi_varying_offset(b->shader, instr) : 0);
1064        } else if (immediate) {
1065                bi_index address = bi_lea_attr_imm(b,
1066                                          bi_vertex_id(b), bi_instance_id(b),
1067                                          regfmt, imm_index);
1068                bi_emit_split_i32(b, a, address, 3);
1069
1070                bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
1071        } else {
1072                bi_index idx =
1073                        bi_iadd_u32(b,
1074                                    bi_src_index(nir_get_io_offset_src(instr)),
1075                                    bi_imm_u32(nir_intrinsic_base(instr)),
1076                                    false);
1077                bi_index address = bi_lea_attr(b,
1078                                      bi_vertex_id(b), bi_instance_id(b),
1079                                      idx, regfmt);
1080                bi_emit_split_i32(b, a, address, 3);
1081
1082                bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
1083        }
1084}
1085
1086static void
1087bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
1088{
1089        nir_src *offset = nir_get_io_offset_src(instr);
1090
1091        bool offset_is_const = nir_src_is_const(*offset);
1092        bi_index dyn_offset = bi_src_index(offset);
1093        uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0;
1094        bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input);
1095
1096        bi_load_ubo_to(b, instr->num_components * nir_dest_bit_size(instr->dest),
1097                        bi_dest_index(&instr->dest), offset_is_const ?
1098                        bi_imm_u32(const_offset) : dyn_offset,
1099                        kernel_input ? bi_zero() : bi_src_index(&instr->src[0]));
1100}
1101
1102static void
1103bi_emit_load_push_constant(bi_builder *b, nir_intrinsic_instr *instr)
1104{
1105        assert(b->shader->inputs->no_ubo_to_push && "can't mix push constant forms");
1106
1107        nir_src *offset = &instr->src[0];
1108        assert(nir_src_is_const(*offset) && "no indirect push constants");
1109        uint32_t base = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
1110        assert((base & 3) == 0 && "unaligned push constants");
1111
1112        unsigned bits = nir_dest_bit_size(instr->dest) *
1113                        nir_dest_num_components(instr->dest);
1114
1115        unsigned n = DIV_ROUND_UP(bits, 32);
1116        assert(n <= 4);
1117        bi_index channels[4] = { bi_null() };
1118
1119        for (unsigned i = 0; i < n; ++i) {
1120                unsigned word = (base >> 2) + i;
1121
1122                channels[i] = bi_fau(BIR_FAU_UNIFORM | (word >> 1), word & 1);
1123        }
1124
1125        bi_emit_collect_to(b, bi_dest_index(&instr->dest), channels, n);
1126}
1127
1128static bi_index
1129bi_addr_high(bi_builder *b, nir_src *src)
1130{
1131	return (nir_src_bit_size(*src) == 64) ?
1132		bi_extract(b, bi_src_index(src), 1) : bi_zero();
1133}
1134
1135static void
1136bi_handle_segment(bi_builder *b, bi_index *addr_lo, bi_index *addr_hi, enum bi_seg seg, int16_t *offset)
1137{
1138        /* Not needed on Bifrost or for global accesses */
1139        if (b->shader->arch < 9 || seg == BI_SEG_NONE)
1140                return;
1141
1142        /* There is no segment modifier on Valhall. Instead, we need to
1143         * emit the arithmetic ourselves. We do have an offset
1144         * available, which saves an instruction for constant offsets.
1145         */
1146        bool wls = (seg == BI_SEG_WLS);
1147        assert(wls || (seg == BI_SEG_TL));
1148
1149        enum bir_fau fau = wls ? BIR_FAU_WLS_PTR : BIR_FAU_TLS_PTR;
1150
1151        bi_index base_lo = bi_fau(fau, false);
1152
1153        if (offset && addr_lo->type == BI_INDEX_CONSTANT && addr_lo->value == (int16_t) addr_lo->value) {
1154                *offset = addr_lo->value;
1155                *addr_lo = base_lo;
1156        } else {
1157                *addr_lo = bi_iadd_u32(b, base_lo, *addr_lo, false);
1158        }
1159
1160        /* Do not allow overflow for WLS or TLS */
1161        *addr_hi = bi_fau(fau, true);
1162}
1163
1164static void
1165bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
1166{
1167        int16_t offset = 0;
1168        unsigned bits = instr->num_components * nir_dest_bit_size(instr->dest);
1169        bi_index dest = bi_dest_index(&instr->dest);
1170        bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[0]), 0);
1171        bi_index addr_hi = bi_addr_high(b, &instr->src[0]);
1172
1173        bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
1174
1175        bi_load_to(b, bits, dest, addr_lo, addr_hi, seg, offset);
1176        bi_emit_cached_split(b, dest, bits);
1177}
1178
1179static void
1180bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
1181{
1182        /* Require contiguous masks, gauranteed by nir_lower_wrmasks */
1183        assert(nir_intrinsic_write_mask(instr) ==
1184                        BITFIELD_MASK(instr->num_components));
1185
1186        int16_t offset = 0;
1187        bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[1]), 0);
1188        bi_index addr_hi = bi_addr_high(b, &instr->src[1]);
1189
1190        bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
1191
1192        bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]),
1193                 bi_src_index(&instr->src[0]),
1194                 addr_lo, addr_hi, seg, offset);
1195}
1196
1197/* Exchanges the staging register with memory */
1198
1199static void
1200bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg, enum bi_seg seg)
1201{
1202        assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
1203
1204        unsigned sz = nir_src_bit_size(*arg);
1205        assert(sz == 32 || sz == 64);
1206
1207        bi_index data = bi_src_index(arg);
1208
1209        bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
1210
1211        if (b->shader->arch >= 9)
1212                bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
1213        else if (seg == BI_SEG_WLS)
1214                addr_hi = bi_zero();
1215
1216        bi_axchg_to(b, sz, dst, data, bi_extract(b, addr, 0), addr_hi, seg);
1217}
1218
1219/* Exchanges the second staging register with memory if comparison with first
1220 * staging register passes */
1221
1222static void
1223bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1, nir_src *arg_2, enum bi_seg seg)
1224{
1225        assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
1226
1227        /* hardware is swapped from NIR */
1228        bi_index src0 = bi_src_index(arg_2);
1229        bi_index src1 = bi_src_index(arg_1);
1230
1231        unsigned sz = nir_src_bit_size(*arg_1);
1232        assert(sz == 32 || sz == 64);
1233
1234        bi_index data_words[] = {
1235                bi_extract(b, src0, 0),
1236                sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src0, 1),
1237
1238                /* 64-bit */
1239                bi_extract(b, src1, 0),
1240                sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src1, 1),
1241        };
1242
1243        bi_index in = bi_temp(b->shader);
1244        bi_emit_collect_to(b, in, data_words, 2 * (sz / 32));
1245        bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
1246
1247        if (b->shader->arch >= 9)
1248                bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
1249        else if (seg == BI_SEG_WLS)
1250                addr_hi = bi_zero();
1251
1252        bi_index out = bi_acmpxchg(b, sz, in, bi_extract(b, addr, 0), addr_hi, seg);
1253        bi_emit_cached_split(b, out, sz);
1254
1255        bi_index inout_words[] = {
1256                bi_extract(b, out, 0),
1257                sz == 64 ? bi_extract(b, out, 1) : bi_null()
1258        };
1259
1260        bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
1261}
1262
1263/* Extracts an atomic opcode */
1264
1265static enum bi_atom_opc
1266bi_atom_opc_for_nir(nir_intrinsic_op op)
1267{
1268        switch (op) {
1269        case nir_intrinsic_global_atomic_add:
1270        case nir_intrinsic_shared_atomic_add:
1271        case nir_intrinsic_image_atomic_add:
1272                return BI_ATOM_OPC_AADD;
1273
1274        case nir_intrinsic_global_atomic_imin:
1275        case nir_intrinsic_shared_atomic_imin:
1276        case nir_intrinsic_image_atomic_imin:
1277                return BI_ATOM_OPC_ASMIN;
1278
1279        case nir_intrinsic_global_atomic_umin:
1280        case nir_intrinsic_shared_atomic_umin:
1281        case nir_intrinsic_image_atomic_umin:
1282                return BI_ATOM_OPC_AUMIN;
1283
1284        case nir_intrinsic_global_atomic_imax:
1285        case nir_intrinsic_shared_atomic_imax:
1286        case nir_intrinsic_image_atomic_imax:
1287                return BI_ATOM_OPC_ASMAX;
1288
1289        case nir_intrinsic_global_atomic_umax:
1290        case nir_intrinsic_shared_atomic_umax:
1291        case nir_intrinsic_image_atomic_umax:
1292                return BI_ATOM_OPC_AUMAX;
1293
1294        case nir_intrinsic_global_atomic_and:
1295        case nir_intrinsic_shared_atomic_and:
1296        case nir_intrinsic_image_atomic_and:
1297                return BI_ATOM_OPC_AAND;
1298
1299        case nir_intrinsic_global_atomic_or:
1300        case nir_intrinsic_shared_atomic_or:
1301        case nir_intrinsic_image_atomic_or:
1302                return BI_ATOM_OPC_AOR;
1303
1304        case nir_intrinsic_global_atomic_xor:
1305        case nir_intrinsic_shared_atomic_xor:
1306        case nir_intrinsic_image_atomic_xor:
1307                return BI_ATOM_OPC_AXOR;
1308
1309        default:
1310                unreachable("Unexpected computational atomic");
1311        }
1312}
1313
1314/* Optimized unary atomics are available with an implied #1 argument */
1315
1316static bool
1317bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out)
1318{
1319        /* Check we have a compatible constant */
1320        if (arg.type != BI_INDEX_CONSTANT)
1321                return false;
1322
1323        if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD)))
1324                return false;
1325
1326        /* Check for a compatible operation */
1327        switch (op) {
1328        case BI_ATOM_OPC_AADD:
1329                *out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC;
1330                return true;
1331        case BI_ATOM_OPC_ASMAX:
1332                *out = BI_ATOM_OPC_ASMAX1;
1333                return true;
1334        case BI_ATOM_OPC_AUMAX:
1335                *out = BI_ATOM_OPC_AUMAX1;
1336                return true;
1337        case BI_ATOM_OPC_AOR:
1338                *out = BI_ATOM_OPC_AOR1;
1339                return true;
1340        default:
1341                return false;
1342        }
1343}
1344
1345/*
1346 * Coordinates are 16-bit integers in Bifrost but 32-bit in NIR. We need to
1347 * translate between these forms (with MKVEC.v2i16).
1348 *
1349 * Aditionally on Valhall, cube maps in the attribute pipe are treated as 2D
1350 * arrays.  For uniform handling, we also treat 3D textures like 2D arrays.
1351 *
1352 * Our indexing needs to reflects this.
1353 */
1354static bi_index
1355bi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx,
1356                    unsigned coord_comps, bool is_array)
1357{
1358        assert(coord_comps > 0 && coord_comps <= 3);
1359
1360        if (src_idx == 0) {
1361                if (coord_comps == 1 || (coord_comps == 2 && is_array))
1362                        return bi_extract(b, coord, 0);
1363                else
1364                        return bi_mkvec_v2i16(b,
1365                                              bi_half(bi_extract(b, coord, 0), false),
1366                                              bi_half(bi_extract(b, coord, 1), false));
1367        } else {
1368                if (coord_comps == 3 && b->shader->arch >= 9)
1369                        return bi_mkvec_v2i16(b, bi_imm_u16(0),
1370                                              bi_half(bi_extract(b, coord, 2), false));
1371                else if (coord_comps == 3)
1372                        return bi_extract(b, coord, 2);
1373                else if (coord_comps == 2 && is_array)
1374                        return bi_extract(b, coord, 1);
1375                else
1376                        return bi_zero();
1377        }
1378}
1379
1380static bi_index
1381bi_emit_image_index(bi_builder *b, nir_intrinsic_instr *instr)
1382{
1383        nir_src src = instr->src[0];
1384        bi_index index = bi_src_index(&src);
1385        bi_context *ctx = b->shader;
1386
1387        /* Images come after vertex attributes, so handle an explicit offset */
1388        unsigned offset = (ctx->stage == MESA_SHADER_VERTEX) ?
1389                util_bitcount64(ctx->nir->info.inputs_read) : 0;
1390
1391        if (offset == 0)
1392                return index;
1393        else if (nir_src_is_const(src))
1394                return bi_imm_u32(nir_src_as_uint(src) + offset);
1395        else
1396                return bi_iadd_u32(b, index, bi_imm_u32(offset), false);
1397}
1398
1399static void
1400bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr)
1401{
1402        enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1403        unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
1404        bool array = nir_intrinsic_image_array(instr);
1405        ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);
1406
1407        bi_index coords = bi_src_index(&instr->src[1]);
1408        bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array);
1409        bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array);
1410        bi_index dest = bi_dest_index(&instr->dest);
1411        enum bi_register_format regfmt = bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr));
1412        enum bi_vecsize vecsize = instr->num_components - 1;
1413
1414        /* TODO: MSAA */
1415        assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");
1416
1417        if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
1418                bi_instr *I = bi_ld_tex_imm_to(b, dest, xy, zw, regfmt, vecsize,
1419                                                nir_src_as_uint(instr->src[0]));
1420
1421                I->table = PAN_TABLE_IMAGE;
1422        } else if (b->shader->arch >= 9) {
1423                unreachable("Indirect images on Valhall not yet supported");
1424        } else {
1425                bi_ld_attr_tex_to(b, dest, xy, zw,
1426                                  bi_emit_image_index(b, instr), regfmt,
1427                                  vecsize);
1428        }
1429
1430        bi_split_dest(b, instr->dest);
1431}
1432
1433static bi_index
1434bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr)
1435{
1436        enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1437        bool array = nir_intrinsic_image_array(instr);
1438        ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);
1439        unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
1440
1441        /* TODO: MSAA */
1442        assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");
1443
1444        enum bi_register_format type = (instr->intrinsic == nir_intrinsic_image_store) ?
1445                bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)) :
1446                BI_REGISTER_FORMAT_AUTO;
1447
1448        bi_index coords = bi_src_index(&instr->src[1]);
1449        bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array);
1450        bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array);
1451        bi_index dest = bi_temp(b->shader);
1452
1453        if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
1454                bi_instr *I = bi_lea_tex_imm_to(b, dest, xy, zw, false,
1455                                                nir_src_as_uint(instr->src[0]));
1456
1457                I->table = PAN_TABLE_IMAGE;
1458        } else if (b->shader->arch >= 9) {
1459                unreachable("Indirect images on Valhall not yet supported");
1460        } else {
1461                bi_instr *I = bi_lea_attr_tex_to(b, dest, xy, zw,
1462                                bi_emit_image_index(b, instr), type);
1463
1464                /* LEA_ATTR_TEX defaults to the secondary attribute table, but
1465                 * our ABI has all images in the primary attribute table
1466                 */
1467                I->table = BI_TABLE_ATTRIBUTE_1;
1468        }
1469
1470        bi_emit_cached_split(b, dest, 3 * 32);
1471        return dest;
1472}
1473
1474static void
1475bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr)
1476{
1477        bi_index a[4] = { bi_null() };
1478        bi_emit_split_i32(b, a, bi_emit_lea_image(b, instr), 3);
1479
1480        bi_st_cvt(b, bi_src_index(&instr->src[3]), a[0], a[1], a[2],
1481                     bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)),
1482                     instr->num_components - 1);
1483}
1484
1485static void
1486bi_emit_atomic_i32_to(bi_builder *b, bi_index dst,
1487                bi_index addr, bi_index arg, nir_intrinsic_op intrinsic)
1488{
1489        enum bi_atom_opc opc = bi_atom_opc_for_nir(intrinsic);
1490        enum bi_atom_opc post_opc = opc;
1491        bool bifrost = b->shader->arch <= 8;
1492
1493        /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't
1494         * take any vector but can still output in RETURN mode */
1495        bi_index tmp_dest = bifrost ? bi_temp(b->shader) : dst;
1496        unsigned sr_count = bifrost ? 2 : 1;
1497
1498        /* Generate either ATOM or ATOM1 as required */
1499        if (bi_promote_atom_c1(opc, arg, &opc)) {
1500                bi_atom1_return_i32_to(b, tmp_dest, bi_extract(b, addr, 0),
1501                                       bi_extract(b, addr, 1), opc, sr_count);
1502        } else {
1503                bi_atom_return_i32_to(b, tmp_dest, arg, bi_extract(b, addr, 0),
1504                                      bi_extract(b, addr, 1), opc, sr_count);
1505        }
1506
1507        if (bifrost) {
1508                /* Post-process it */
1509                bi_emit_cached_split_i32(b, tmp_dest, 2);
1510                bi_atom_post_i32_to(b, dst, bi_extract(b, tmp_dest, 0), bi_extract(b, tmp_dest, 1), post_opc);
1511        }
1512}
1513
1514/* gl_FragCoord.xy = u16_to_f32(R59.xy) + 0.5
1515 * gl_FragCoord.z = ld_vary(fragz)
1516 * gl_FragCoord.w = ld_vary(fragw)
1517 */
1518
1519static void
1520bi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr)
1521{
1522        bi_index src[4] = {};
1523
1524        for (unsigned i = 0; i < 2; ++i) {
1525                src[i] = bi_fadd_f32(b,
1526                                bi_u16_to_f32(b, bi_half(bi_preload(b, 59), i)),
1527                                bi_imm_f32(0.5f));
1528        }
1529
1530        for (unsigned i = 0; i < 2; ++i) {
1531                src[2 + i] = bi_ld_var_special(b, bi_zero(),
1532                                BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER,
1533                                BI_UPDATE_CLOBBER,
1534                                (i == 0) ? BI_VARYING_NAME_FRAG_Z :
1535                                        BI_VARYING_NAME_FRAG_W,
1536                                BI_VECSIZE_NONE);
1537        }
1538
1539        bi_make_vec_to(b, bi_dest_index(&instr->dest), src, NULL, 4, 32);
1540}
1541
1542static void
1543bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
1544{
1545        bi_index dest = bi_dest_index(&instr->dest);
1546        nir_alu_type T = nir_intrinsic_dest_type(instr);
1547        enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
1548        unsigned rt = b->shader->inputs->blend.rt;
1549        unsigned size = nir_dest_bit_size(instr->dest);
1550        unsigned nr = instr->num_components;
1551
1552        /* Get the render target */
1553        if (!b->shader->inputs->is_blend) {
1554                const nir_variable *var =
1555                        nir_find_variable_with_driver_location(b->shader->nir,
1556                                        nir_var_shader_out, nir_intrinsic_base(instr));
1557                unsigned loc = var->data.location;
1558                assert(loc >= FRAG_RESULT_DATA0);
1559                rt = (loc - FRAG_RESULT_DATA0);
1560        }
1561
1562        bi_index desc = b->shader->inputs->is_blend ?
1563                bi_imm_u32(b->shader->inputs->blend.bifrost_blend_desc >> 32) :
1564                b->shader->inputs->bifrost.static_rt_conv ?
1565                bi_imm_u32(b->shader->inputs->bifrost.rt_conv[rt]) :
1566                bi_load_sysval(b, PAN_SYSVAL(RT_CONVERSION, rt | (size << 4)), 1, 0);
1567
1568        bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_coverage(b), desc,
1569                      regfmt, nr - 1);
1570        bi_emit_cached_split(b, dest, size * nr);
1571}
1572
1573static void
1574bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
1575{
1576        bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?
1577                bi_dest_index(&instr->dest) : bi_null();
1578        gl_shader_stage stage = b->shader->stage;
1579
1580        switch (instr->intrinsic) {
1581        case nir_intrinsic_load_barycentric_pixel:
1582        case nir_intrinsic_load_barycentric_centroid:
1583        case nir_intrinsic_load_barycentric_sample:
1584        case nir_intrinsic_load_barycentric_at_sample:
1585        case nir_intrinsic_load_barycentric_at_offset:
1586                /* handled later via load_vary */
1587                break;
1588        case nir_intrinsic_load_interpolated_input:
1589        case nir_intrinsic_load_input:
1590                if (b->shader->inputs->is_blend)
1591                        bi_emit_load_blend_input(b, instr);
1592                else if (stage == MESA_SHADER_FRAGMENT)
1593                        bi_emit_load_vary(b, instr);
1594                else if (stage == MESA_SHADER_VERTEX)
1595                        bi_emit_load_attr(b, instr);
1596                else
1597                        unreachable("Unsupported shader stage");
1598                break;
1599
1600        case nir_intrinsic_store_output:
1601                if (stage == MESA_SHADER_FRAGMENT)
1602                        bi_emit_fragment_out(b, instr);
1603                else if (stage == MESA_SHADER_VERTEX)
1604                        bi_emit_store_vary(b, instr);
1605                else
1606                        unreachable("Unsupported shader stage");
1607                break;
1608
1609        case nir_intrinsic_store_combined_output_pan:
1610                assert(stage == MESA_SHADER_FRAGMENT);
1611                bi_emit_fragment_out(b, instr);
1612                break;
1613
1614        case nir_intrinsic_load_ubo:
1615        case nir_intrinsic_load_kernel_input:
1616                bi_emit_load_ubo(b, instr);
1617                break;
1618
1619        case nir_intrinsic_load_push_constant:
1620                bi_emit_load_push_constant(b, instr);
1621                break;
1622
1623        case nir_intrinsic_load_global:
1624        case nir_intrinsic_load_global_constant:
1625                bi_emit_load(b, instr, BI_SEG_NONE);
1626                break;
1627
1628        case nir_intrinsic_store_global:
1629                bi_emit_store(b, instr, BI_SEG_NONE);
1630                break;
1631
1632        case nir_intrinsic_load_scratch:
1633                bi_emit_load(b, instr, BI_SEG_TL);
1634                break;
1635
1636        case nir_intrinsic_store_scratch:
1637                bi_emit_store(b, instr, BI_SEG_TL);
1638                break;
1639
1640        case nir_intrinsic_load_shared:
1641                bi_emit_load(b, instr, BI_SEG_WLS);
1642                break;
1643
1644        case nir_intrinsic_store_shared:
1645                bi_emit_store(b, instr, BI_SEG_WLS);
1646                break;
1647
1648        /* Blob doesn't seem to do anything for memory barriers, note +BARRIER
1649         * is illegal in fragment shaders */
1650        case nir_intrinsic_memory_barrier:
1651        case nir_intrinsic_memory_barrier_buffer:
1652        case nir_intrinsic_memory_barrier_image:
1653        case nir_intrinsic_memory_barrier_shared:
1654        case nir_intrinsic_group_memory_barrier:
1655                break;
1656
1657        case nir_intrinsic_control_barrier:
1658                assert(b->shader->stage != MESA_SHADER_FRAGMENT);
1659                bi_barrier(b);
1660                break;
1661
1662        case nir_intrinsic_shared_atomic_add:
1663        case nir_intrinsic_shared_atomic_imin:
1664        case nir_intrinsic_shared_atomic_umin:
1665        case nir_intrinsic_shared_atomic_imax:
1666        case nir_intrinsic_shared_atomic_umax:
1667        case nir_intrinsic_shared_atomic_and:
1668        case nir_intrinsic_shared_atomic_or:
1669        case nir_intrinsic_shared_atomic_xor: {
1670                assert(nir_src_bit_size(instr->src[1]) == 32);
1671
1672                bi_index addr = bi_src_index(&instr->src[0]);
1673                bi_index addr_hi;
1674
1675                if (b->shader->arch >= 9) {
1676                        bi_handle_segment(b, &addr, &addr_hi, BI_SEG_WLS, NULL);
1677                        addr = bi_collect_v2i32(b, addr, addr_hi);
1678                } else {
1679                        addr = bi_seg_add_i64(b, addr, bi_zero(), false, BI_SEG_WLS);
1680                        bi_emit_cached_split(b, addr, 64);
1681                }
1682
1683                bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]),
1684                                instr->intrinsic);
1685                bi_split_dest(b, instr->dest);
1686                break;
1687        }
1688
1689        case nir_intrinsic_image_atomic_add:
1690        case nir_intrinsic_image_atomic_imin:
1691        case nir_intrinsic_image_atomic_umin:
1692        case nir_intrinsic_image_atomic_imax:
1693        case nir_intrinsic_image_atomic_umax:
1694        case nir_intrinsic_image_atomic_and:
1695        case nir_intrinsic_image_atomic_or:
1696        case nir_intrinsic_image_atomic_xor:
1697                assert(nir_src_bit_size(instr->src[3]) == 32);
1698
1699                bi_emit_atomic_i32_to(b, dst,
1700                                bi_emit_lea_image(b, instr),
1701                                bi_src_index(&instr->src[3]),
1702                                instr->intrinsic);
1703                bi_split_dest(b, instr->dest);
1704                break;
1705
1706        case nir_intrinsic_global_atomic_add:
1707        case nir_intrinsic_global_atomic_imin:
1708        case nir_intrinsic_global_atomic_umin:
1709        case nir_intrinsic_global_atomic_imax:
1710        case nir_intrinsic_global_atomic_umax:
1711        case nir_intrinsic_global_atomic_and:
1712        case nir_intrinsic_global_atomic_or:
1713        case nir_intrinsic_global_atomic_xor:
1714                assert(nir_src_bit_size(instr->src[1]) == 32);
1715
1716                bi_emit_atomic_i32_to(b, dst,
1717                                bi_src_index(&instr->src[0]),
1718                                bi_src_index(&instr->src[1]),
1719                                instr->intrinsic);
1720
1721                bi_split_dest(b, instr->dest);
1722                break;
1723
1724        case nir_intrinsic_image_load:
1725                bi_emit_image_load(b, instr);
1726                break;
1727
1728        case nir_intrinsic_image_store:
1729                bi_emit_image_store(b, instr);
1730                break;
1731
1732        case nir_intrinsic_global_atomic_exchange:
1733                bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),
1734                                &instr->src[1], BI_SEG_NONE);
1735                bi_split_dest(b, instr->dest);
1736                break;
1737
1738        case nir_intrinsic_image_atomic_exchange:
1739                bi_emit_axchg_to(b, dst, bi_emit_lea_image(b, instr),
1740                                &instr->src[3], BI_SEG_NONE);
1741                bi_split_dest(b, instr->dest);
1742                break;
1743
1744        case nir_intrinsic_shared_atomic_exchange:
1745                bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),
1746                                &instr->src[1], BI_SEG_WLS);
1747                bi_split_dest(b, instr->dest);
1748                break;
1749
1750        case nir_intrinsic_global_atomic_comp_swap:
1751                bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),
1752                                &instr->src[1], &instr->src[2], BI_SEG_NONE);
1753                bi_split_dest(b, instr->dest);
1754                break;
1755
1756        case nir_intrinsic_image_atomic_comp_swap:
1757                bi_emit_acmpxchg_to(b, dst, bi_emit_lea_image(b, instr),
1758                                &instr->src[3], &instr->src[4], BI_SEG_NONE);
1759                bi_split_dest(b, instr->dest);
1760                break;
1761
1762        case nir_intrinsic_shared_atomic_comp_swap:
1763                bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),
1764                                &instr->src[1], &instr->src[2], BI_SEG_WLS);
1765                bi_split_dest(b, instr->dest);
1766                break;
1767
1768        case nir_intrinsic_load_frag_coord:
1769                bi_emit_load_frag_coord(b, instr);
1770                break;
1771
1772        case nir_intrinsic_load_output:
1773                bi_emit_ld_tile(b, instr);
1774                break;
1775
1776        case nir_intrinsic_discard_if:
1777                bi_discard_b32(b, bi_src_index(&instr->src[0]));
1778                break;
1779
1780        case nir_intrinsic_discard:
1781                bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ);
1782                break;
1783
1784        case nir_intrinsic_load_ssbo_address:
1785        case nir_intrinsic_load_xfb_address:
1786                bi_load_sysval_nir(b, instr, 2, 0);
1787                break;
1788
1789        case nir_intrinsic_load_work_dim:
1790        case nir_intrinsic_load_num_vertices:
1791                bi_load_sysval_nir(b, instr, 1, 0);
1792                break;
1793
1794        case nir_intrinsic_load_first_vertex:
1795                bi_load_sysval_nir(b, instr, 1, 0);
1796                break;
1797
1798        case nir_intrinsic_load_base_vertex:
1799                bi_load_sysval_nir(b, instr, 1, 4);
1800                break;
1801
1802        case nir_intrinsic_load_base_instance:
1803                bi_load_sysval_nir(b, instr, 1, 8);
1804                break;
1805
1806        case nir_intrinsic_load_draw_id:
1807                bi_load_sysval_nir(b, instr, 1, 0);
1808                break;
1809
1810        case nir_intrinsic_get_ssbo_size:
1811                bi_load_sysval_nir(b, instr, 1, 8);
1812                break;
1813
1814        case nir_intrinsic_load_viewport_scale:
1815        case nir_intrinsic_load_viewport_offset:
1816        case nir_intrinsic_load_num_workgroups:
1817        case nir_intrinsic_load_workgroup_size:
1818                bi_load_sysval_nir(b, instr, 3, 0);
1819                break;
1820
1821        case nir_intrinsic_image_size:
1822                bi_load_sysval_nir(b, instr,
1823                                nir_dest_num_components(instr->dest), 0);
1824                break;
1825
1826        case nir_intrinsic_load_blend_const_color_rgba:
1827                bi_load_sysval_nir(b, instr,
1828                                   nir_dest_num_components(instr->dest), 0);
1829                break;
1830
1831	case nir_intrinsic_load_sample_positions_pan:
1832                bi_collect_v2i32_to(b, dst,
1833                                    bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false),
1834                                    bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true));
1835                break;
1836
1837	case nir_intrinsic_load_sample_mask_in:
1838                /* r61[0:15] contains the coverage bitmap */
1839                bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false));
1840                break;
1841
1842        case nir_intrinsic_load_sample_id:
1843                bi_load_sample_id_to(b, dst);
1844                break;
1845
1846	case nir_intrinsic_load_front_face:
1847                /* r58 == 0 means primitive is front facing */
1848                bi_icmp_i32_to(b, dst, bi_preload(b, 58), bi_zero(), BI_CMPF_EQ,
1849                                BI_RESULT_TYPE_M1);
1850                break;
1851
1852        case nir_intrinsic_load_point_coord:
1853                bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32,
1854                                BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER,
1855                                BI_VARYING_NAME_POINT, BI_VECSIZE_V2);
1856                bi_emit_cached_split_i32(b, dst, 2);
1857                break;
1858
1859        /* It appears vertex_id is zero-based with Bifrost geometry flows, but
1860         * not with Valhall's memory-allocation IDVS geometry flow. Ostensibly
1861         * we support the legacy geometry flow even on Valhall, so
1862         * vertex_id_zero_based isn't a machine property for us. Don't set it,
1863         * and lower here if needed.
1864         */
1865        case nir_intrinsic_load_vertex_id:
1866                if (b->shader->malloc_idvs) {
1867                        bi_mov_i32_to(b, dst, bi_vertex_id(b));
1868                } else {
1869                        bi_index first = bi_load_sysval(b,
1870                                                        PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS,
1871                                                        1, 0);
1872
1873                        bi_iadd_u32_to(b, dst, bi_vertex_id(b), first, false);
1874                }
1875
1876                break;
1877
1878        /* We only use in our transform feedback lowering */
1879        case nir_intrinsic_load_vertex_id_zero_base:
1880                assert(b->shader->nir->info.has_transform_feedback_varyings);
1881                bi_mov_i32_to(b, dst, bi_vertex_id(b));
1882                break;
1883
1884        case nir_intrinsic_load_instance_id:
1885                bi_mov_i32_to(b, dst, bi_instance_id(b));
1886                break;
1887
1888        case nir_intrinsic_load_subgroup_invocation:
1889                bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false));
1890                break;
1891
1892        case nir_intrinsic_load_local_invocation_id:
1893                bi_collect_v3i32_to(b, dst,
1894                                    bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)),
1895                                    bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)),
1896                                    bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0)));
1897                break;
1898
1899        case nir_intrinsic_load_workgroup_id:
1900                bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58),
1901                                    bi_preload(b, 59));
1902                break;
1903
1904        case nir_intrinsic_load_global_invocation_id:
1905        case nir_intrinsic_load_global_invocation_id_zero_base:
1906                bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61),
1907                                    bi_preload(b, 62));
1908                break;
1909
1910        case nir_intrinsic_shader_clock:
1911                bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER);
1912                bi_split_dest(b, instr->dest);
1913                break;
1914
1915        default:
1916                fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
1917                assert(0);
1918        }
1919}
1920
1921static void
1922bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr)
1923{
1924        /* Make sure we've been lowered */
1925        assert(instr->def.num_components <= (32 / instr->def.bit_size));
1926
1927        /* Accumulate all the channels of the constant, as if we did an
1928         * implicit SEL over them */
1929        uint32_t acc = 0;
1930
1931        for (unsigned i = 0; i < instr->def.num_components; ++i) {
1932                unsigned v = nir_const_value_as_uint(instr->value[i], instr->def.bit_size);
1933                acc |= (v << (i * instr->def.bit_size));
1934        }
1935
1936        bi_mov_i32_to(b, bi_get_index(instr->def.index, false, 0), bi_imm_u32(acc));
1937}
1938
1939static bi_index
1940bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps)
1941{
1942        /* we don't lower modifiers until the backend */
1943        assert(!(src.negate || src.abs));
1944
1945        unsigned bitsize = nir_src_bit_size(src.src);
1946
1947        /* the bi_index carries the 32-bit (word) offset separate from the
1948         * subword swizzle, first handle the offset */
1949
1950        unsigned offset = 0;
1951
1952        assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
1953        unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
1954
1955        for (unsigned i = 0; i < comps; ++i) {
1956                unsigned new_offset = (src.swizzle[i] >> subword_shift);
1957
1958                if (i > 0)
1959                        assert(offset == new_offset && "wrong vectorization");
1960
1961                offset = new_offset;
1962        }
1963
1964        bi_index idx = bi_extract(b, bi_src_index(&src.src), offset);
1965
1966        /* Compose the subword swizzle with existing (identity) swizzle */
1967        assert(idx.swizzle == BI_SWIZZLE_H01);
1968
1969        /* Bigger vectors should have been lowered */
1970        assert(comps <= (1 << subword_shift));
1971
1972        if (bitsize == 16) {
1973                unsigned c0 = src.swizzle[0] & 1;
1974                unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0;
1975                idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1);
1976        } else if (bitsize == 8) {
1977                /* 8-bit vectors not yet supported */
1978                assert(comps == 1 && "8-bit vectors not supported");
1979                assert(src.swizzle[0] < 4 && "8-bit vectors not supported");
1980                idx.swizzle = BI_SWIZZLE_B0000 + src.swizzle[0];
1981        }
1982
1983        return idx;
1984}
1985
1986static enum bi_round
1987bi_nir_round(nir_op op)
1988{
1989        switch (op) {
1990        case nir_op_fround_even: return BI_ROUND_NONE;
1991        case nir_op_ftrunc: return BI_ROUND_RTZ;
1992        case nir_op_fceil: return BI_ROUND_RTP;
1993        case nir_op_ffloor: return BI_ROUND_RTN;
1994        default: unreachable("invalid nir round op");
1995        }
1996}
1997
1998/* Convenience for lowered transcendentals */
1999
2000static bi_index
2001bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)
2002{
2003        return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f));
2004}
2005
2006/* Approximate with FRCP_APPROX.f32 and apply a single iteration of
2007 * Newton-Raphson to improve precision */
2008
2009static void
2010bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)
2011{
2012        bi_index x1 = bi_frcp_approx_f32(b, s0);
2013        bi_index m  = bi_frexpm_f32(b, s0, false, false);
2014        bi_index e  = bi_frexpe_f32(b, bi_neg(s0), false, false);
2015        bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0),
2016                        bi_zero(), BI_SPECIAL_N);
2017        bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, BI_SPECIAL_NONE);
2018}
2019
2020static void
2021bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)
2022{
2023        bi_index x1 = bi_frsq_approx_f32(b, s0);
2024        bi_index m  = bi_frexpm_f32(b, s0, false, true);
2025        bi_index e  = bi_frexpe_f32(b, bi_neg(s0), false, true);
2026        bi_index t1 = bi_fmul_f32(b, x1, x1);
2027        bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),
2028                        bi_imm_u32(-1), BI_SPECIAL_N);
2029        bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, BI_SPECIAL_N);
2030}
2031
2032/* More complex transcendentals, see
2033 * https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc
2034 * for documentation */
2035
2036static void
2037bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)
2038{
2039        bi_index t1 = bi_temp(b->shader);
2040        bi_instr *t1_instr = bi_fadd_f32_to(b, t1, s0, bi_imm_u32(0x49400000));
2041        t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;
2042
2043        bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000));
2044
2045        bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), s0, bi_neg(t2));
2046        a2->clamp = BI_CLAMP_CLAMP_M1_1;
2047
2048        bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);
2049        bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);
2050        bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));
2051        bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),
2052                        bi_imm_u32(0x3e75fffa));
2053        bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], bi_imm_u32(0x3f317218));
2054        bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);
2055        bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader),
2056                        p3, a1t, a1t, a1i, BI_SPECIAL_NONE);
2057        x->clamp = BI_CLAMP_CLAMP_0_INF;
2058
2059        bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);
2060        max->sem = BI_SEM_NAN_PROPAGATE;
2061}
2062
2063static void
2064bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)
2065{
2066        /* Scale by base, Multiply by 2*24 and convert to integer to get a 8:24
2067         * fixed-point input */
2068        bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),
2069                        bi_imm_u32(24), BI_SPECIAL_NONE);
2070        bi_instr *fixed_pt = bi_f32_to_s32_to(b, bi_temp(b->shader), scale);
2071        fixed_pt->round = BI_ROUND_NONE; // XXX
2072
2073        /* Compute the result for the fixed-point input, but pass along
2074         * the floating-point scale for correct NaN propagation */
2075        bi_fexp_f32_to(b, dst, fixed_pt->dest[0], scale);
2076}
2077
2078static void
2079bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
2080{
2081        /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */
2082        bi_index a1 = bi_frexpm_f32(b, s0, true, false);
2083        bi_index ei = bi_frexpe_f32(b, s0, true, false);
2084        bi_index ef = bi_s32_to_f32(b, ei);
2085
2086        /* xt estimates -log(r1), a coarse approximation of log(a1) */
2087        bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);
2088        bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE);
2089
2090        /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -
2091         * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),
2092         * and then log(s0) = x1 + x2 */
2093        bi_index x1 = bi_fadd_f32(b, ef, xt);
2094
2095        /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by
2096         * polynomial approximation around 1. The series is expressed around
2097         * 1, so set y = (a1 * r1) - 1.0 */
2098        bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0));
2099
2100        /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate
2101         * log_e(1 + y) by the Taylor series (lower precision than the blob):
2102         * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */
2103        bi_index loge = bi_fmul_f32(b, y,
2104                bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0)));
2105
2106        bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));
2107
2108        /* log(s0) = x1 + x2 */
2109        bi_fadd_f32_to(b, dst, x1, x2);
2110}
2111
2112static void
2113bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
2114{
2115        bi_index frexp = bi_frexpe_f32(b, s0, true, false);
2116        bi_index frexpi = bi_s32_to_f32(b, frexp);
2117        bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
2118        bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi);
2119}
2120
2121static void
2122bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
2123{
2124        bi_index log2_base = bi_null();
2125
2126        if (base.type == BI_INDEX_CONSTANT) {
2127                log2_base = bi_imm_f32(log2f(uif(base.value)));
2128        } else {
2129                log2_base = bi_temp(b->shader);
2130                bi_lower_flog2_32(b, log2_base, base);
2131        }
2132
2133        return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base));
2134}
2135
2136static void
2137bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
2138{
2139        bi_index log2_base = bi_null();
2140
2141        if (base.type == BI_INDEX_CONSTANT) {
2142                log2_base = bi_imm_f32(log2f(uif(base.value)));
2143        } else {
2144                log2_base = bi_temp(b->shader);
2145                bi_flog2_32(b, log2_base, base);
2146        }
2147
2148        return bi_fexp_32(b, dst, exp, log2_base);
2149}
2150
2151/* Bifrost has extremely coarse tables for approximating sin/cos, accessible as
2152 * FSIN/COS_TABLE.u6, which multiplies the bottom 6-bits by pi/32 and
2153 * calculates the results. We use them to calculate sin/cos via a Taylor
2154 * approximation:
2155 *
2156 * f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x)
2157 * sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x)
2158 * cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x)
2159 */
2160
2161#define TWO_OVER_PI  bi_imm_f32(2.0f / 3.14159f)
2162#define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0)
2163#define SINCOS_BIAS  bi_imm_u32(0x49400000)
2164
2165static void
2166bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
2167{
2168        /* bottom 6-bits of result times pi/32 approximately s0 mod 2pi */
2169        bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS);
2170
2171        /* Approximate domain error (small) */
2172        bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS)),
2173                        MPI_OVER_TWO, s0);
2174
2175        /* Lookup sin(x), cos(x) */
2176        bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
2177        bi_index cosx = bi_fcos_table_u6(b, x_u6, false);
2178
2179        /* e^2 / 2 */
2180        bi_index e2_over_2 = bi_fma_rscale_f32(b, e, e, bi_negzero(),
2181                        bi_imm_u32(-1), BI_SPECIAL_NONE);
2182
2183        /* (-e^2)/2 f''(x) */
2184        bi_index quadratic = bi_fma_f32(b, bi_neg(e2_over_2),
2185                        cos ? cosx : sinx,
2186                        bi_negzero());
2187
2188        /* e f'(x) - (e^2/2) f''(x) */
2189        bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
2190                        cos ? bi_neg(sinx) : cosx,
2191                        quadratic);
2192        I->clamp = BI_CLAMP_CLAMP_M1_1;
2193
2194        /* f(x) + e f'(x) - (e^2/2) f''(x) */
2195        bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx);
2196}
2197
2198/*
2199 * The XOR lane op is useful for derivative calculations, but not all Bifrost
2200 * implementations have it. Add a safe helper that uses the hardware
2201 * functionality when available and lowers where unavailable.
2202 */
2203static bi_index
2204bi_clper_xor(bi_builder *b, bi_index s0, bi_index s1)
2205{
2206        if (!(b->shader->quirks & BIFROST_LIMITED_CLPER)) {
2207                return bi_clper_i32(b, s0, s1,
2208                                BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_XOR,
2209                                BI_SUBGROUP_SUBGROUP4);
2210        }
2211
2212        bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false);
2213        bi_index lane = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0));
2214        return bi_clper_old_i32(b, s0, lane);
2215}
2216
2217static enum bi_cmpf
2218bi_translate_cmpf(nir_op op)
2219{
2220        switch (op) {
2221        case nir_op_ieq8:
2222        case nir_op_ieq16:
2223        case nir_op_ieq32:
2224        case nir_op_feq16:
2225        case nir_op_feq32:
2226                return BI_CMPF_EQ;
2227
2228        case nir_op_ine8:
2229        case nir_op_ine16:
2230        case nir_op_ine32:
2231        case nir_op_fneu16:
2232        case nir_op_fneu32:
2233                return BI_CMPF_NE;
2234
2235        case nir_op_ilt8:
2236        case nir_op_ilt16:
2237        case nir_op_ilt32:
2238        case nir_op_flt16:
2239        case nir_op_flt32:
2240        case nir_op_ult8:
2241        case nir_op_ult16:
2242        case nir_op_ult32:
2243                return BI_CMPF_LT;
2244
2245        case nir_op_ige8:
2246        case nir_op_ige16:
2247        case nir_op_ige32:
2248        case nir_op_fge16:
2249        case nir_op_fge32:
2250        case nir_op_uge8:
2251        case nir_op_uge16:
2252        case nir_op_uge32:
2253                return BI_CMPF_GE;
2254
2255        default:
2256                unreachable("invalid comparison");
2257        }
2258}
2259
2260static bool
2261bi_nir_is_replicated(nir_alu_src *src)
2262{
2263        for (unsigned i = 1; i < nir_src_num_components(src->src); ++i) {
2264                if (src->swizzle[0] == src->swizzle[i])
2265                        return false;
2266        }
2267
2268        return true;
2269}
2270
2271static void
2272bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
2273{
2274        bi_index dst = bi_dest_index(&instr->dest.dest);
2275        unsigned srcs = nir_op_infos[instr->op].num_inputs;
2276        unsigned sz = nir_dest_bit_size(instr->dest.dest);
2277        unsigned comps = nir_dest_num_components(instr->dest.dest);
2278        unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0;
2279
2280        /* Indicate scalarness */
2281        if (sz == 16 && comps == 1)
2282                dst.swizzle = BI_SWIZZLE_H00;
2283
2284        if (!instr->dest.dest.is_ssa) {
2285                for (unsigned i = 0; i < comps; ++i)
2286                        assert(instr->dest.write_mask);
2287        }
2288
2289        /* First, match against the various moves in NIR. These are
2290         * special-cased because they can operate on vectors even after
2291         * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the
2292         * instruction is no "bigger" than SIMD-within-a-register. These moves
2293         * are the exceptions that need to handle swizzles specially. */
2294
2295        switch (instr->op) {
2296        case nir_op_vec2:
2297        case nir_op_vec3:
2298        case nir_op_vec4: {
2299                bi_index unoffset_srcs[4] = {
2300                        srcs > 0 ? bi_src_index(&instr->src[0].src) : bi_null(),
2301                        srcs > 1 ? bi_src_index(&instr->src[1].src) : bi_null(),
2302                        srcs > 2 ? bi_src_index(&instr->src[2].src) : bi_null(),
2303                        srcs > 3 ? bi_src_index(&instr->src[3].src) : bi_null(),
2304                };
2305
2306                unsigned channels[4] = {
2307                        instr->src[0].swizzle[0],
2308                        instr->src[1].swizzle[0],
2309                        srcs > 2 ? instr->src[2].swizzle[0] : 0,
2310                        srcs > 3 ? instr->src[3].swizzle[0] : 0,
2311                };
2312
2313                bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz);
2314                return;
2315        }
2316
2317        case nir_op_vec8:
2318        case nir_op_vec16:
2319                unreachable("should've been lowered");
2320
2321        case nir_op_unpack_32_2x16: {
2322                /* Should have been scalarized */
2323                assert(comps == 2 && sz == 16);
2324
2325                bi_index vec = bi_src_index(&instr->src[0].src);
2326                unsigned chan = instr->src[0].swizzle[0];
2327
2328                bi_mov_i32_to(b, dst, bi_extract(b, vec, chan));
2329                return;
2330        }
2331
2332        case nir_op_unpack_64_2x32_split_x:
2333                bi_mov_i32_to(b, dst, bi_extract(b, bi_src_index(&instr->src[0].src), 0));
2334                return;
2335
2336        case nir_op_unpack_64_2x32_split_y:
2337                bi_mov_i32_to(b, dst, bi_extract(b, bi_src_index(&instr->src[0].src), 1));
2338                return;
2339
2340        case nir_op_pack_64_2x32_split:
2341                bi_collect_v2i32_to(b, dst,
2342                                    bi_extract(b, bi_src_index(&instr->src[0].src), instr->src[0].swizzle[0]),
2343                                    bi_extract(b, bi_src_index(&instr->src[1].src), instr->src[1].swizzle[0]));
2344                return;
2345
2346        case nir_op_pack_64_2x32:
2347                bi_collect_v2i32_to(b, dst,
2348                                    bi_extract(b, bi_src_index(&instr->src[0].src), 0),
2349                                    bi_extract(b, bi_src_index(&instr->src[0].src), 1));
2350                return;
2351
2352        case nir_op_pack_uvec2_to_uint: {
2353                bi_index src = bi_src_index(&instr->src[0].src);
2354
2355                assert(sz == 32 && src_sz == 32);
2356                bi_mkvec_v2i16_to(b, dst, bi_half(bi_extract(b, src, 0), false),
2357                                          bi_half(bi_extract(b, src, 1), false));
2358                return;
2359        }
2360
2361        case nir_op_pack_uvec4_to_uint: {
2362                bi_index src = bi_src_index(&instr->src[0].src);
2363
2364                assert(sz == 32 && src_sz == 32);
2365                bi_mkvec_v4i8_to(b, dst, bi_byte(bi_extract(b, src, 0), 0),
2366                                         bi_byte(bi_extract(b, src, 1), 0),
2367                                         bi_byte(bi_extract(b, src, 2), 0),
2368                                         bi_byte(bi_extract(b, src, 3), 0));
2369                return;
2370        }
2371
2372        case nir_op_mov: {
2373                bi_index idx = bi_src_index(&instr->src[0].src);
2374                bi_index unoffset_srcs[4] = { idx, idx, idx, idx };
2375
2376                unsigned channels[4] = {
2377                        comps > 0 ? instr->src[0].swizzle[0] : 0,
2378                        comps > 1 ? instr->src[0].swizzle[1] : 0,
2379                        comps > 2 ? instr->src[0].swizzle[2] : 0,
2380                        comps > 3 ? instr->src[0].swizzle[3] : 0,
2381                };
2382
2383                bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, src_sz);
2384                return;
2385        }
2386
2387        case nir_op_pack_32_2x16: {
2388                assert(nir_src_num_components(instr->src[0].src) == 2);
2389                assert(comps == 1);
2390
2391                bi_index idx = bi_src_index(&instr->src[0].src);
2392                bi_index unoffset_srcs[4] = { idx, idx, idx, idx };
2393
2394                unsigned channels[2] = {
2395                        instr->src[0].swizzle[0],
2396                        instr->src[0].swizzle[1]
2397                };
2398
2399                bi_make_vec_to(b, dst, unoffset_srcs, channels, 2, 16);
2400                return;
2401        }
2402
2403        case nir_op_f2f16:
2404        case nir_op_f2f16_rtz:
2405        case nir_op_f2f16_rtne: {
2406                assert(src_sz == 32);
2407                bi_index idx = bi_src_index(&instr->src[0].src);
2408                bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2409                bi_index s1 = comps > 1 ?
2410                        bi_extract(b, idx, instr->src[0].swizzle[1]) : s0;
2411
2412                bi_instr *I = bi_v2f32_to_v2f16_to(b, dst, s0, s1);
2413
2414                /* Override rounding if explicitly requested. Otherwise, the
2415                 * default rounding mode is selected by the builder. Depending
2416                 * on the float controls required by the shader, the default
2417                 * mode may not be nearest-even.
2418                 */
2419                if (instr->op == nir_op_f2f16_rtz)
2420                        I->round = BI_ROUND_RTZ;
2421                else if (instr->op == nir_op_f2f16_rtne)
2422                        I->round = BI_ROUND_NONE; /* Nearest even */
2423
2424                return;
2425        }
2426
2427        /* Vectorized downcasts */
2428        case nir_op_u2u16:
2429        case nir_op_i2i16: {
2430                if (!(src_sz == 32 && comps == 2))
2431                        break;
2432
2433                bi_index idx = bi_src_index(&instr->src[0].src);
2434                bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2435                bi_index s1 = bi_extract(b, idx, instr->src[0].swizzle[1]);
2436
2437                bi_mkvec_v2i16_to(b, dst,
2438                                bi_half(s0, false), bi_half(s1, false));
2439                return;
2440        }
2441
2442        /* While we do not have a direct V2U32_TO_V2F16 instruction, lowering to
2443         * MKVEC.v2i16 + V2U16_TO_V2F16 is more efficient on Bifrost than
2444         * scalarizing due to scheduling (equal cost on Valhall). Additionally
2445         * if the source is replicated the MKVEC.v2i16 can be optimized out.
2446         */
2447        case nir_op_u2f16:
2448        case nir_op_i2f16: {
2449                if (!(src_sz == 32 && comps == 2))
2450                        break;
2451
2452                nir_alu_src *src = &instr->src[0];
2453                bi_index idx = bi_src_index(&src->src);
2454                bi_index s0 = bi_extract(b, idx, src->swizzle[0]);
2455                bi_index s1 = bi_extract(b, idx, src->swizzle[1]);
2456
2457                bi_index t = (src->swizzle[0] == src->swizzle[1]) ?
2458                        bi_half(s0, false) :
2459                        bi_mkvec_v2i16(b, bi_half(s0, false),
2460                                          bi_half(s1, false));
2461
2462                if (instr->op == nir_op_u2f16)
2463                        bi_v2u16_to_v2f16_to(b, dst, t);
2464                else
2465                        bi_v2s16_to_v2f16_to(b, dst, t);
2466
2467                return;
2468        }
2469
2470        case nir_op_i2i8:
2471        case nir_op_u2u8:
2472        {
2473                /* Acts like an 8-bit swizzle */
2474                bi_index idx = bi_src_index(&instr->src[0].src);
2475                unsigned factor = src_sz / 8;
2476                unsigned chan[4] = { 0 };
2477
2478                for (unsigned i = 0; i < comps; ++i)
2479                        chan[i] = instr->src[0].swizzle[i] * factor;
2480
2481                bi_make_vec_to(b, dst, &idx, chan, comps, 8);
2482                return;
2483        }
2484
2485        case nir_op_b32csel:
2486        {
2487                if (sz != 16)
2488                        break;
2489
2490                /* We allow vectorizing b32csel(cond, A, B) which can be
2491                 * translated as MUX.v2i16, even though cond is a 32-bit vector.
2492                 *
2493                 * If the source condition vector is replicated, we can use
2494                 * MUX.v2i16 directly, letting each component use the
2495                 * corresponding half of the 32-bit source. NIR uses 0/~0
2496                 * booleans so that's guaranteed to work (that is, 32-bit NIR
2497                 * booleans are 16-bit replicated).
2498                 *
2499                 * If we're not replicated, we use the same trick but must
2500                 * insert a MKVEC.v2i16 first to convert down to 16-bit.
2501                 */
2502                bi_index idx = bi_src_index(&instr->src[0].src);
2503                bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2504                bi_index s1 = bi_alu_src_index(b, instr->src[1], comps);
2505                bi_index s2 = bi_alu_src_index(b, instr->src[2], comps);
2506
2507                if (!bi_nir_is_replicated(&instr->src[0])) {
2508                        s0 = bi_mkvec_v2i16(b, bi_half(s0, false),
2509                                            bi_half(bi_extract(b, idx, instr->src[0].swizzle[1]), false));
2510                }
2511
2512                bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2513                return;
2514        }
2515
2516        default:
2517                break;
2518        }
2519
2520        bi_index s0 = srcs > 0 ? bi_alu_src_index(b, instr->src[0], comps) : bi_null();
2521        bi_index s1 = srcs > 1 ? bi_alu_src_index(b, instr->src[1], comps) : bi_null();
2522        bi_index s2 = srcs > 2 ? bi_alu_src_index(b, instr->src[2], comps) : bi_null();
2523
2524        switch (instr->op) {
2525        case nir_op_ffma:
2526                bi_fma_to(b, sz, dst, s0, s1, s2);
2527                break;
2528
2529        case nir_op_fmul:
2530                bi_fma_to(b, sz, dst, s0, s1, bi_negzero());
2531                break;
2532
2533        case nir_op_fsub:
2534                s1 = bi_neg(s1);
2535                FALLTHROUGH;
2536        case nir_op_fadd:
2537                bi_fadd_to(b, sz, dst, s0, s1);
2538                break;
2539
2540        case nir_op_fsat: {
2541                bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2542                I->clamp = BI_CLAMP_CLAMP_0_1;
2543                break;
2544        }
2545
2546        case nir_op_fsat_signed_mali: {
2547                bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2548                I->clamp = BI_CLAMP_CLAMP_M1_1;
2549                break;
2550        }
2551
2552        case nir_op_fclamp_pos_mali: {
2553                bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2554                I->clamp = BI_CLAMP_CLAMP_0_INF;
2555                break;
2556        }
2557
2558        case nir_op_fneg:
2559                bi_fabsneg_to(b, sz, dst, bi_neg(s0));
2560                break;
2561
2562        case nir_op_fabs:
2563                bi_fabsneg_to(b, sz, dst, bi_abs(s0));
2564                break;
2565
2566        case nir_op_fsin:
2567                bi_lower_fsincos_32(b, dst, s0, false);
2568                break;
2569
2570        case nir_op_fcos:
2571                bi_lower_fsincos_32(b, dst, s0, true);
2572                break;
2573
2574        case nir_op_fexp2:
2575                assert(sz == 32); /* should've been lowered */
2576
2577                if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2578                        bi_lower_fexp2_32(b, dst, s0);
2579                else
2580                        bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f));
2581
2582                break;
2583
2584        case nir_op_flog2:
2585                assert(sz == 32); /* should've been lowered */
2586
2587                if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2588                        bi_lower_flog2_32(b, dst, s0);
2589                else
2590                        bi_flog2_32(b, dst, s0);
2591
2592                break;
2593
2594        case nir_op_fpow:
2595                assert(sz == 32); /* should've been lowered */
2596
2597                if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2598                        bi_lower_fpow_32(b, dst, s0, s1);
2599                else
2600                        bi_fpow_32(b, dst, s0, s1);
2601
2602                break;
2603
2604        case nir_op_frexp_exp:
2605                bi_frexpe_to(b, sz, dst, s0, false, false);
2606                break;
2607
2608        case nir_op_frexp_sig:
2609                bi_frexpm_to(b, sz, dst, s0, false, false);
2610                break;
2611
2612        case nir_op_ldexp:
2613                bi_ldexp_to(b, sz, dst, s0, s1);
2614                break;
2615
2616        case nir_op_b8csel:
2617                bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2618                break;
2619
2620        case nir_op_b16csel:
2621                bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2622                break;
2623
2624        case nir_op_b32csel:
2625                bi_mux_i32_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2626                break;
2627
2628        case nir_op_ishl:
2629                bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
2630                break;
2631        case nir_op_ushr:
2632                bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), false);
2633                break;
2634
2635        case nir_op_ishr:
2636                if (b->shader->arch >= 9)
2637                        bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), true);
2638                else
2639                        bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0));
2640                break;
2641
2642        case nir_op_imin:
2643        case nir_op_umin:
2644                bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst,
2645                                s0, s1, s0, s1, BI_CMPF_LT);
2646                break;
2647
2648        case nir_op_imax:
2649        case nir_op_umax:
2650                bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst,
2651                                s0, s1, s0, s1, BI_CMPF_GT);
2652                break;
2653
2654        case nir_op_fddx_must_abs_mali:
2655        case nir_op_fddy_must_abs_mali: {
2656                bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 1 : 2);
2657                bi_index adjacent = bi_clper_xor(b, s0, bit);
2658                bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0));
2659                break;
2660        }
2661
2662        case nir_op_fddx:
2663        case nir_op_fddy:
2664        case nir_op_fddx_coarse:
2665        case nir_op_fddy_coarse:
2666        case nir_op_fddx_fine:
2667        case nir_op_fddy_fine: {
2668                unsigned axis;
2669                switch (instr->op) {
2670                case nir_op_fddx:
2671                case nir_op_fddx_coarse:
2672                case nir_op_fddx_fine:
2673                        axis = 1;
2674                        break;
2675                case nir_op_fddy:
2676                case nir_op_fddy_coarse:
2677                case nir_op_fddy_fine:
2678                        axis = 2;
2679                        break;
2680                default:
2681                        unreachable("Invalid derivative op");
2682                }
2683
2684                bi_index lane1, lane2;
2685                switch (instr->op) {
2686                case nir_op_fddx:
2687                case nir_op_fddx_fine:
2688                case nir_op_fddy:
2689                case nir_op_fddy_fine:
2690                        lane1 = bi_lshift_and_i32(b,
2691                                bi_fau(BIR_FAU_LANE_ID, false),
2692                                bi_imm_u32(0x3 & ~axis),
2693                                bi_imm_u8(0));
2694
2695                        lane2 = bi_iadd_u32(b, lane1,
2696                                bi_imm_u32(axis),
2697                                false);
2698                        break;
2699                case nir_op_fddx_coarse:
2700                case nir_op_fddy_coarse:
2701                        lane1 = bi_imm_u32(0);
2702                        lane2 = bi_imm_u32(axis);
2703                        break;
2704                default:
2705                        unreachable("Invalid derivative op");
2706                }
2707
2708                bi_index left, right;
2709
2710                if (b->shader->quirks & BIFROST_LIMITED_CLPER) {
2711                        left = bi_clper_old_i32(b, s0, lane1);
2712                        right = bi_clper_old_i32(b, s0, lane2);
2713                } else {
2714                        left = bi_clper_i32(b, s0, lane1,
2715                                        BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,
2716                                        BI_SUBGROUP_SUBGROUP4);
2717
2718                        right = bi_clper_i32(b, s0, lane2,
2719                                        BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,
2720                                        BI_SUBGROUP_SUBGROUP4);
2721                }
2722
2723                bi_fadd_to(b, sz, dst, right, bi_neg(left));
2724                break;
2725        }
2726
2727        case nir_op_f2f32:
2728                bi_f16_to_f32_to(b, dst, s0);
2729                break;
2730
2731        case nir_op_fquantize2f16:
2732        {
2733                bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0);
2734                bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false));
2735
2736                f16->ftz = f32->ftz = true;
2737                break;
2738        }
2739
2740        case nir_op_f2i32:
2741                if (src_sz == 32)
2742                        bi_f32_to_s32_to(b, dst, s0);
2743                else
2744                        bi_f16_to_s32_to(b, dst, s0);
2745                break;
2746
2747        /* Note 32-bit sources => no vectorization, so 32-bit works */
2748        case nir_op_f2u16:
2749                if (src_sz == 32)
2750                        bi_f32_to_u32_to(b, dst, s0);
2751                else
2752                        bi_v2f16_to_v2u16_to(b, dst, s0);
2753                break;
2754
2755        case nir_op_f2i16:
2756                if (src_sz == 32)
2757                        bi_f32_to_s32_to(b, dst, s0);
2758                else
2759                        bi_v2f16_to_v2s16_to(b, dst, s0);
2760                break;
2761
2762        case nir_op_f2u32:
2763                if (src_sz == 32)
2764                        bi_f32_to_u32_to(b, dst, s0);
2765                else
2766                        bi_f16_to_u32_to(b, dst, s0);
2767                break;
2768
2769        case nir_op_u2f16:
2770                if (src_sz == 32)
2771                        bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false));
2772                else if (src_sz == 16)
2773                        bi_v2u16_to_v2f16_to(b, dst, s0);
2774                else if (src_sz == 8)
2775                        bi_v2u8_to_v2f16_to(b, dst, s0);
2776                break;
2777
2778        case nir_op_u2f32:
2779                if (src_sz == 32)
2780                        bi_u32_to_f32_to(b, dst, s0);
2781                else if (src_sz == 16)
2782                        bi_u16_to_f32_to(b, dst, s0);
2783                else
2784                        bi_u8_to_f32_to(b, dst, s0);
2785                break;
2786
2787        case nir_op_i2f16:
2788                if (src_sz == 32)
2789                        bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false));
2790                else if (src_sz == 16)
2791                        bi_v2s16_to_v2f16_to(b, dst, s0);
2792                else if (src_sz == 8)
2793                        bi_v2s8_to_v2f16_to(b, dst, s0);
2794                break;
2795
2796        case nir_op_i2f32:
2797                assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
2798
2799                if (src_sz == 32)
2800                        bi_s32_to_f32_to(b, dst, s0);
2801                else if (src_sz == 16)
2802                        bi_s16_to_f32_to(b, dst, s0);
2803                else if (src_sz == 8)
2804                        bi_s8_to_f32_to(b, dst, s0);
2805                break;
2806
2807        case nir_op_i2i32:
2808                assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
2809
2810                if (src_sz == 32)
2811                        bi_mov_i32_to(b, dst, s0);
2812                else if (src_sz == 16)
2813                        bi_s16_to_s32_to(b, dst, s0);
2814                else if (src_sz == 8)
2815                        bi_s8_to_s32_to(b, dst, s0);
2816                break;
2817
2818        case nir_op_u2u32:
2819                assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
2820
2821                if (src_sz == 32)
2822                        bi_mov_i32_to(b, dst, s0);
2823                else if (src_sz == 16)
2824                        bi_u16_to_u32_to(b, dst, s0);
2825                else if (src_sz == 8)
2826                        bi_u8_to_u32_to(b, dst, s0);
2827
2828                break;
2829
2830        case nir_op_i2i16:
2831                assert(src_sz == 8 || src_sz == 32);
2832
2833                if (src_sz == 8)
2834                        bi_v2s8_to_v2s16_to(b, dst, s0);
2835                else
2836                        bi_mov_i32_to(b, dst, s0);
2837                break;
2838
2839        case nir_op_u2u16:
2840                assert(src_sz == 8 || src_sz == 32);
2841
2842                if (src_sz == 8)
2843                        bi_v2u8_to_v2u16_to(b, dst, s0);
2844                else
2845                        bi_mov_i32_to(b, dst, s0);
2846                break;
2847
2848        case nir_op_b2i8:
2849        case nir_op_b2i16:
2850        case nir_op_b2i32:
2851                bi_mux_to(b, sz, dst, bi_imm_u8(0), bi_imm_uintN(1, sz), s0, BI_MUX_INT_ZERO);
2852                break;
2853
2854        case nir_op_f2b16:
2855                bi_mux_v2i16_to(b, dst, bi_imm_u16(0), bi_imm_u16(~0), s0, BI_MUX_FP_ZERO);
2856                break;
2857        case nir_op_f2b32:
2858                bi_mux_i32_to(b, dst, bi_imm_u32(0), bi_imm_u32(~0), s0, BI_MUX_FP_ZERO);
2859                break;
2860
2861        case nir_op_i2b8:
2862                bi_mux_v4i8_to(b, dst, bi_imm_u8(0), bi_imm_u8(~0), s0, BI_MUX_INT_ZERO);
2863                break;
2864        case nir_op_i2b16:
2865                bi_mux_v2i16_to(b, dst, bi_imm_u16(0), bi_imm_u16(~0), s0, BI_MUX_INT_ZERO);
2866                break;
2867        case nir_op_i2b32:
2868                bi_mux_i32_to(b, dst, bi_imm_u32(0), bi_imm_u32(~0), s0, BI_MUX_INT_ZERO);
2869                break;
2870
2871        case nir_op_ieq8:
2872        case nir_op_ine8:
2873        case nir_op_ilt8:
2874        case nir_op_ige8:
2875        case nir_op_ieq16:
2876        case nir_op_ine16:
2877        case nir_op_ilt16:
2878        case nir_op_ige16:
2879        case nir_op_ieq32:
2880        case nir_op_ine32:
2881        case nir_op_ilt32:
2882        case nir_op_ige32:
2883                bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1);
2884                break;
2885
2886        case nir_op_ult8:
2887        case nir_op_uge8:
2888        case nir_op_ult16:
2889        case nir_op_uge16:
2890        case nir_op_ult32:
2891        case nir_op_uge32:
2892                bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1);
2893                break;
2894
2895        case nir_op_feq32:
2896        case nir_op_feq16:
2897        case nir_op_flt32:
2898        case nir_op_flt16:
2899        case nir_op_fge32:
2900        case nir_op_fge16:
2901        case nir_op_fneu32:
2902        case nir_op_fneu16:
2903                bi_fcmp_to(b, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1);
2904                break;
2905
2906        case nir_op_fround_even:
2907        case nir_op_fceil:
2908        case nir_op_ffloor:
2909        case nir_op_ftrunc:
2910                bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op));
2911                break;
2912
2913        case nir_op_fmin:
2914                bi_fmin_to(b, sz, dst, s0, s1);
2915                break;
2916
2917        case nir_op_fmax:
2918                bi_fmax_to(b, sz, dst, s0, s1);
2919                break;
2920
2921        case nir_op_iadd:
2922                bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);
2923                break;
2924
2925        case nir_op_iadd_sat:
2926                bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true);
2927                break;
2928
2929        case nir_op_uadd_sat:
2930                bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true);
2931                break;
2932
2933        case nir_op_ihadd:
2934                bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN);
2935                break;
2936
2937        case nir_op_irhadd:
2938                bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP);
2939                break;
2940
2941        case nir_op_ineg:
2942                bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false);
2943                break;
2944
2945        case nir_op_isub:
2946                bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false);
2947                break;
2948
2949        case nir_op_isub_sat:
2950                bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true);
2951                break;
2952
2953        case nir_op_usub_sat:
2954                bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true);
2955                break;
2956
2957        case nir_op_imul:
2958                bi_imul_to(b, sz, dst, s0, s1);
2959                break;
2960
2961        case nir_op_iabs:
2962                bi_iabs_to(b, sz, dst, s0);
2963                break;
2964
2965        case nir_op_iand:
2966                bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
2967                break;
2968
2969        case nir_op_ior:
2970                bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
2971                break;
2972
2973        case nir_op_ixor:
2974                bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
2975                break;
2976
2977        case nir_op_inot:
2978                bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));
2979                break;
2980
2981        case nir_op_frsq:
2982                if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2983                        bi_lower_frsq_32(b, dst, s0);
2984                else
2985                        bi_frsq_to(b, sz, dst, s0);
2986                break;
2987
2988        case nir_op_frcp:
2989                if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2990                        bi_lower_frcp_32(b, dst, s0);
2991                else
2992                        bi_frcp_to(b, sz, dst, s0);
2993                break;
2994
2995        case nir_op_uclz:
2996                bi_clz_to(b, sz, dst, s0, false);
2997                break;
2998
2999        case nir_op_bit_count:
3000                bi_popcount_i32_to(b, dst, s0);
3001                break;
3002
3003        case nir_op_bitfield_reverse:
3004                bi_bitrev_i32_to(b, dst, s0);
3005                break;
3006
3007        case nir_op_ufind_msb: {
3008                bi_index clz = bi_clz(b, src_sz, s0, false);
3009
3010                if (sz == 8)
3011                        clz = bi_byte(clz, 0);
3012                else if (sz == 16)
3013                        clz = bi_half(clz, false);
3014
3015                bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false);
3016                break;
3017        }
3018
3019        default:
3020                fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
3021                unreachable("Unknown ALU op");
3022        }
3023}
3024
3025/* Returns dimension with 0 special casing cubemaps. Shamelessly copied from Midgard */
3026static unsigned
3027bifrost_tex_format(enum glsl_sampler_dim dim)
3028{
3029        switch (dim) {
3030        case GLSL_SAMPLER_DIM_1D:
3031        case GLSL_SAMPLER_DIM_BUF:
3032                return 1;
3033
3034        case GLSL_SAMPLER_DIM_2D:
3035        case GLSL_SAMPLER_DIM_MS:
3036        case GLSL_SAMPLER_DIM_EXTERNAL:
3037        case GLSL_SAMPLER_DIM_RECT:
3038                return 2;
3039
3040        case GLSL_SAMPLER_DIM_3D:
3041                return 3;
3042
3043        case GLSL_SAMPLER_DIM_CUBE:
3044                return 0;
3045
3046        default:
3047                DBG("Unknown sampler dim type\n");
3048                assert(0);
3049                return 0;
3050        }
3051}
3052
3053static enum bi_dimension
3054valhall_tex_dimension(enum glsl_sampler_dim dim)
3055{
3056        switch (dim) {
3057        case GLSL_SAMPLER_DIM_1D:
3058        case GLSL_SAMPLER_DIM_BUF:
3059                return BI_DIMENSION_1D;
3060
3061        case GLSL_SAMPLER_DIM_2D:
3062        case GLSL_SAMPLER_DIM_MS:
3063        case GLSL_SAMPLER_DIM_EXTERNAL:
3064        case GLSL_SAMPLER_DIM_RECT:
3065                return BI_DIMENSION_2D;
3066
3067        case GLSL_SAMPLER_DIM_3D:
3068                return BI_DIMENSION_3D;
3069
3070        case GLSL_SAMPLER_DIM_CUBE:
3071                return BI_DIMENSION_CUBE;
3072
3073        default:
3074                unreachable("Unknown sampler dim type");
3075        }
3076}
3077
3078static enum bifrost_texture_format_full
3079bi_texture_format(nir_alu_type T, enum bi_clamp clamp)
3080{
3081        switch (T) {
3082        case nir_type_float16: return BIFROST_TEXTURE_FORMAT_F16 + clamp;
3083        case nir_type_float32: return BIFROST_TEXTURE_FORMAT_F32 + clamp;
3084        case nir_type_uint16:  return BIFROST_TEXTURE_FORMAT_U16;
3085        case nir_type_int16:   return BIFROST_TEXTURE_FORMAT_S16;
3086        case nir_type_uint32:  return BIFROST_TEXTURE_FORMAT_U32;
3087        case nir_type_int32:   return BIFROST_TEXTURE_FORMAT_S32;
3088        default:              unreachable("Invalid type for texturing");
3089        }
3090}
3091
3092/* Array indices are specified as 32-bit uints, need to convert. In .z component from NIR */
3093static bi_index
3094bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T)
3095{
3096        /* For (u)int we can just passthrough */
3097        nir_alu_type base = nir_alu_type_get_base_type(T);
3098        if (base == nir_type_int || base == nir_type_uint)
3099                return idx;
3100
3101        /* Otherwise we convert */
3102        assert(T == nir_type_float32);
3103
3104        /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and
3105         * Texel Selection") defines the layer to be taken from clamp(RNE(r),
3106         * 0, dt - 1). So we use round RTE, clamping is handled at the data
3107         * structure level */
3108
3109        bi_instr *I = bi_f32_to_u32_to(b, bi_temp(b->shader), idx);
3110        I->round = BI_ROUND_NONE;
3111        return I->dest[0];
3112}
3113
3114/* TEXC's explicit and bias LOD modes requires the LOD to be transformed to a
3115 * 16-bit 8:8 fixed-point format. We lower as:
3116 *
3117 * F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF =
3118 * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0)
3119 */
3120
3121static bi_index
3122bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16)
3123{
3124        /* Precompute for constant LODs to avoid general constant folding */
3125        if (lod.type == BI_INDEX_CONSTANT) {
3126                uint32_t raw = lod.value;
3127                float x = fp16 ? _mesa_half_to_float(raw) : uif(raw);
3128                int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f;
3129                return bi_imm_u32(s32 & 0xFFFF);
3130        }
3131
3132        /* Sort of arbitrary. Must be less than 128.0, greater than or equal to
3133         * the max LOD (16 since we cap at 2^16 texture dimensions), and
3134         * preferably small to minimize precision loss */
3135        const float max_lod = 16.0;
3136
3137        bi_instr *fsat = bi_fma_f32_to(b, bi_temp(b->shader),
3138                        fp16 ? bi_half(lod, false) : lod,
3139                        bi_imm_f32(1.0f / max_lod), bi_negzero());
3140
3141        fsat->clamp = BI_CLAMP_CLAMP_M1_1;
3142
3143        bi_index fmul = bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f),
3144                        bi_negzero());
3145
3146        return bi_mkvec_v2i16(b,
3147                        bi_half(bi_f32_to_s32(b, fmul), false), bi_imm_u16(0));
3148}
3149
3150/* FETCH takes a 32-bit staging register containing the LOD as an integer in
3151 * the bottom 16-bits and (if present) the cube face index in the top 16-bits.
3152 * TODO: Cube face.
3153 */
3154
3155static bi_index
3156bi_emit_texc_lod_cube(bi_builder *b, bi_index lod)
3157{
3158        return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8));
3159}
3160
3161/* The hardware specifies texel offsets and multisample indices together as a
3162 * u8vec4 <offset, ms index>. By default all are zero, so if have either a
3163 * nonzero texel offset or a nonzero multisample index, we build a u8vec4 with
3164 * the bits we need and return that to be passed as a staging register. Else we
3165 * return 0 to avoid allocating a data register when everything is zero. */
3166
3167static bi_index
3168bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr)
3169{
3170        bi_index dest = bi_zero();
3171
3172        int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
3173        if (offs_idx >= 0 &&
3174            (!nir_src_is_const(instr->src[offs_idx].src) ||
3175             nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
3176                unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
3177                bi_index idx = bi_src_index(&instr->src[offs_idx].src);
3178                dest = bi_mkvec_v4i8(b,
3179                                (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
3180                                (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0),
3181                                (nr > 2) ? bi_byte(bi_extract(b, idx, 2), 0) : bi_imm_u8(0),
3182                                bi_imm_u8(0));
3183        }
3184
3185        int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
3186        if (ms_idx >= 0 &&
3187            (!nir_src_is_const(instr->src[ms_idx].src) ||
3188             nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
3189                dest = bi_lshift_or_i32(b,
3190                                bi_src_index(&instr->src[ms_idx].src), dest,
3191                                bi_imm_u8(24));
3192        }
3193
3194        return dest;
3195}
3196
3197/*
3198 * Valhall specifies specifies texel offsets, multisample indices, and (for
3199 * fetches) LOD together as a u8vec4 <offset.xyz, LOD>, where the third
3200 * component is either offset.z or multisample index depending on context. Build
3201 * this register.
3202 */
3203static bi_index
3204bi_emit_valhall_offsets(bi_builder *b, nir_tex_instr *instr)
3205{
3206        bi_index dest = bi_zero();
3207
3208        int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
3209        int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
3210        int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
3211
3212        /* Components 0-2: offsets */
3213        if (offs_idx >= 0 &&
3214            (!nir_src_is_const(instr->src[offs_idx].src) ||
3215             nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
3216                unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
3217                bi_index idx = bi_src_index(&instr->src[offs_idx].src);
3218
3219                /* No multisample index with 3D */
3220                assert((nr <= 2) || (ms_idx < 0));
3221
3222                /* Zero extend the Z byte so we can use it with MKVEC.v2i8 */
3223                bi_index z = (nr > 2) ?
3224                             bi_mkvec_v2i8(b, bi_byte(bi_extract(b, idx, 2), 0),
3225                                              bi_imm_u8(0), bi_zero()) :
3226                             bi_zero();
3227
3228                dest = bi_mkvec_v2i8(b,
3229                                (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
3230                                (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0),
3231                                z);
3232        }
3233
3234        /* Component 2: multisample index */
3235        if (ms_idx >= 0 &&
3236            (!nir_src_is_const(instr->src[ms_idx].src) ||
3237             nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
3238                dest = bi_mkvec_v2i16(b, dest,
3239                                bi_src_index(&instr->src[ms_idx].src));
3240        }
3241
3242        /* Component 3: 8-bit LOD */
3243        if (lod_idx >= 0 &&
3244            (!nir_src_is_const(instr->src[lod_idx].src) ||
3245             nir_src_as_uint(instr->src[lod_idx].src) != 0) &&
3246            nir_tex_instr_src_type(instr, lod_idx) != nir_type_float) {
3247                dest = bi_lshift_or_i32(b,
3248                                bi_src_index(&instr->src[lod_idx].src), dest,
3249                                bi_imm_u8(24));
3250        }
3251
3252        return dest;
3253}
3254
3255static void
3256bi_emit_cube_coord(bi_builder *b, bi_index coord,
3257                    bi_index *face, bi_index *s, bi_index *t)
3258{
3259        /* Compute max { |x|, |y|, |z| } */
3260        bi_index maxxyz = bi_temp(b->shader);
3261        *face = bi_temp(b->shader);
3262
3263        bi_index cx = bi_extract(b, coord, 0),
3264                 cy = bi_extract(b, coord, 1),
3265                 cz = bi_extract(b, coord, 2);
3266
3267        /* Use a pseudo op on Bifrost due to tuple restrictions */
3268        if (b->shader->arch <= 8) {
3269                bi_cubeface_to(b, maxxyz, *face, cx, cy, cz);
3270        } else {
3271                bi_cubeface1_to(b, maxxyz, cx, cy, cz);
3272                bi_cubeface2_v9_to(b, *face, cx, cy, cz);
3273        }
3274
3275        /* Select coordinates */
3276        bi_index ssel = bi_cube_ssel(b, bi_extract(b, coord, 2), bi_extract(b, coord, 0), *face);
3277        bi_index tsel = bi_cube_tsel(b, bi_extract(b, coord, 1), bi_extract(b, coord, 2),
3278                        *face);
3279
3280        /* The OpenGL ES specification requires us to transform an input vector
3281         * (x, y, z) to the coordinate, given the selected S/T:
3282         *
3283         * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1))
3284         *
3285         * We implement (s shown, t similar) in a form friendlier to FMA
3286         * instructions, and clamp coordinates at the end for correct
3287         * NaN/infinity handling:
3288         *
3289         * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5)
3290         *
3291         * Take the reciprocal of max{x, y, z}
3292         */
3293        bi_index rcp = bi_frcp_f32(b, maxxyz);
3294
3295        /* Calculate 0.5 * (1.0 / max{x, y, z}) */
3296        bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero());
3297
3298        /* Transform the coordinates */
3299        *s = bi_temp(b->shader);
3300        *t = bi_temp(b->shader);
3301
3302        bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f));
3303        bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f));
3304
3305        S->clamp = BI_CLAMP_CLAMP_0_1;
3306        T->clamp = BI_CLAMP_CLAMP_0_1;
3307}
3308
3309/* Emits a cube map descriptor, returning lower 32-bits and putting upper
3310 * 32-bits in passed pointer t. The packing of the face with the S coordinate
3311 * exploits the redundancy of floating points with the range restriction of
3312 * CUBEFACE output.
3313 *
3314 *     struct cube_map_descriptor {
3315 *         float s : 29;
3316 *         unsigned face : 3;
3317 *         float t : 32;
3318 *     }
3319 *
3320 * Since the cube face index is preshifted, this is easy to pack with a bitwise
3321 * MUX.i32 and a fixed mask, selecting the lower bits 29 from s and the upper 3
3322 * bits from face.
3323 */
3324
3325static bi_index
3326bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t)
3327{
3328        bi_index face, s;
3329        bi_emit_cube_coord(b, coord, &face, &s, t);
3330        bi_index mask = bi_imm_u32(BITFIELD_MASK(29));
3331        return bi_mux_i32(b, s, face, mask, BI_MUX_BIT);
3332}
3333
3334/* Map to the main texture op used. Some of these (txd in particular) will
3335 * lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in
3336 * sequence). We assume that lowering is handled elsewhere.
3337 */
3338
3339static enum bifrost_tex_op
3340bi_tex_op(nir_texop op)
3341{
3342        switch (op) {
3343        case nir_texop_tex:
3344        case nir_texop_txb:
3345        case nir_texop_txl:
3346        case nir_texop_txd:
3347        case nir_texop_tex_prefetch:
3348                return BIFROST_TEX_OP_TEX;
3349        case nir_texop_txf:
3350        case nir_texop_txf_ms:
3351        case nir_texop_txf_ms_fb:
3352        case nir_texop_tg4:
3353                return BIFROST_TEX_OP_FETCH;
3354        case nir_texop_txs:
3355        case nir_texop_lod:
3356        case nir_texop_query_levels:
3357        case nir_texop_texture_samples:
3358        case nir_texop_samples_identical:
3359                unreachable("should've been lowered");
3360        default:
3361                unreachable("unsupported tex op");
3362        }
3363}
3364
3365/* Data registers required by texturing in the order they appear. All are
3366 * optional, the texture operation descriptor determines which are present.
3367 * Note since 3D arrays are not permitted at an API level, Z_COORD and
3368 * ARRAY/SHADOW are exlusive, so TEXC in practice reads at most 8 registers */
3369
3370enum bifrost_tex_dreg {
3371        BIFROST_TEX_DREG_Z_COORD = 0,
3372        BIFROST_TEX_DREG_Y_DELTAS = 1,
3373        BIFROST_TEX_DREG_LOD = 2,
3374        BIFROST_TEX_DREG_GRDESC_HI = 3,
3375        BIFROST_TEX_DREG_SHADOW = 4,
3376        BIFROST_TEX_DREG_ARRAY = 5,
3377        BIFROST_TEX_DREG_OFFSETMS = 6,
3378        BIFROST_TEX_DREG_SAMPLER = 7,
3379        BIFROST_TEX_DREG_TEXTURE = 8,
3380        BIFROST_TEX_DREG_COUNT,
3381};
3382
3383static void
3384bi_emit_texc(bi_builder *b, nir_tex_instr *instr)
3385{
3386        struct bifrost_texture_operation desc = {
3387                .op = bi_tex_op(instr->op),
3388                .offset_or_bias_disable = false, /* TODO */
3389                .shadow_or_clamp_disable = instr->is_shadow,
3390                .array = instr->is_array,
3391                .dimension = bifrost_tex_format(instr->sampler_dim),
3392                .format = bi_texture_format(instr->dest_type | nir_dest_bit_size(instr->dest), BI_CLAMP_NONE), /* TODO */
3393                .mask = 0xF,
3394        };
3395
3396        switch (desc.op) {
3397        case BIFROST_TEX_OP_TEX:
3398                desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE;
3399                break;
3400        case BIFROST_TEX_OP_FETCH:
3401                desc.lod_or_fetch = (enum bifrost_lod_mode)
3402                   (instr->op == nir_texop_tg4 ?
3403                        BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component :
3404                        BIFROST_TEXTURE_FETCH_TEXEL);
3405                break;
3406        default:
3407                unreachable("texture op unsupported");
3408        }
3409
3410        /* 32-bit indices to be allocated as consecutive staging registers */
3411        bi_index dregs[BIFROST_TEX_DREG_COUNT] = { };
3412        bi_index cx = bi_null(), cy = bi_null();
3413
3414        for (unsigned i = 0; i < instr->num_srcs; ++i) {
3415                bi_index index = bi_src_index(&instr->src[i].src);
3416                unsigned sz = nir_src_bit_size(instr->src[i].src);
3417                unsigned components = nir_src_num_components(instr->src[i].src);
3418                ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i);
3419                nir_alu_type T = base | sz;
3420
3421                switch (instr->src[i].src_type) {
3422                case nir_tex_src_coord:
3423                        if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3424                                cx = bi_emit_texc_cube_coord(b, index, &cy);
3425			} else {
3426                                /* Copy XY (for 2D+) or XX (for 1D) */
3427                                cx = bi_extract(b, index, 0);
3428                                cy = bi_extract(b, index, MIN2(1, components - 1));
3429
3430                                assert(components >= 1 && components <= 3);
3431
3432                                if (components == 3 && !desc.array) {
3433                                        /* 3D */
3434                                        dregs[BIFROST_TEX_DREG_Z_COORD] =
3435                                                bi_extract(b, index, 2);
3436                                }
3437                        }
3438
3439                        if (desc.array) {
3440                                dregs[BIFROST_TEX_DREG_ARRAY] =
3441                                                bi_emit_texc_array_index(b,
3442                                                                bi_extract(b, index, components - 1), T);
3443                        }
3444
3445                        break;
3446
3447                case nir_tex_src_lod:
3448                        if (desc.op == BIFROST_TEX_OP_TEX &&
3449                            nir_src_is_const(instr->src[i].src) &&
3450                            nir_src_as_uint(instr->src[i].src) == 0) {
3451                                desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO;
3452                        } else if (desc.op == BIFROST_TEX_OP_TEX) {
3453                                assert(base == nir_type_float);
3454
3455                                assert(sz == 16 || sz == 32);
3456                                dregs[BIFROST_TEX_DREG_LOD] =
3457                                        bi_emit_texc_lod_88(b, index, sz == 16);
3458                                desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;
3459                        } else {
3460                                assert(desc.op == BIFROST_TEX_OP_FETCH);
3461                                assert(base == nir_type_uint || base == nir_type_int);
3462                                assert(sz == 16 || sz == 32);
3463
3464                                dregs[BIFROST_TEX_DREG_LOD] =
3465                                        bi_emit_texc_lod_cube(b, index);
3466                        }
3467
3468                        break;
3469
3470                case nir_tex_src_bias:
3471                        /* Upper 16-bits interpreted as a clamp, leave zero */
3472                        assert(desc.op == BIFROST_TEX_OP_TEX);
3473                        assert(base == nir_type_float);
3474                        assert(sz == 16 || sz == 32);
3475                        dregs[BIFROST_TEX_DREG_LOD] =
3476                                bi_emit_texc_lod_88(b, index, sz == 16);
3477                        desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS;
3478                        break;
3479
3480                case nir_tex_src_ms_index:
3481                case nir_tex_src_offset:
3482                        if (desc.offset_or_bias_disable)
3483                                break;
3484
3485                        dregs[BIFROST_TEX_DREG_OFFSETMS] =
3486	                        bi_emit_texc_offset_ms_index(b, instr);
3487                        if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero()))
3488                                desc.offset_or_bias_disable = true;
3489                        break;
3490
3491                case nir_tex_src_comparator:
3492                        dregs[BIFROST_TEX_DREG_SHADOW] = index;
3493                        break;
3494
3495                case nir_tex_src_texture_offset:
3496                        if (instr->texture_index)
3497                                index = bi_iadd_u32(b, index, bi_imm_u32(instr->texture_index), false);
3498
3499                        dregs[BIFROST_TEX_DREG_TEXTURE] = index;
3500
3501                        break;
3502
3503                case nir_tex_src_sampler_offset:
3504                        if (instr->sampler_index)
3505                                index = bi_iadd_u32(b, index, bi_imm_u32(instr->sampler_index), false);
3506
3507                        dregs[BIFROST_TEX_DREG_SAMPLER] = index;
3508                        break;
3509
3510                default:
3511                        unreachable("Unhandled src type in texc emit");
3512                }
3513        }
3514
3515        if (desc.op == BIFROST_TEX_OP_FETCH && bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) {
3516                dregs[BIFROST_TEX_DREG_LOD] =
3517                        bi_emit_texc_lod_cube(b, bi_zero());
3518        }
3519
3520        /* Choose an index mode */
3521
3522        bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]);
3523        bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]);
3524        bool direct = direct_tex && direct_samp;
3525
3526        desc.immediate_indices = direct && (instr->sampler_index < 16);
3527
3528        if (desc.immediate_indices) {
3529                desc.sampler_index_or_mode = instr->sampler_index;
3530                desc.index = instr->texture_index;
3531        } else {
3532                unsigned mode = 0;
3533
3534                if (direct && instr->sampler_index == instr->texture_index) {
3535                        mode = BIFROST_INDEX_IMMEDIATE_SHARED;
3536                        desc.index = instr->texture_index;
3537                } else if (direct) {
3538                        mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
3539                        desc.index = instr->sampler_index;
3540                        dregs[BIFROST_TEX_DREG_TEXTURE] = bi_mov_i32(b,
3541                                        bi_imm_u32(instr->texture_index));
3542                } else if (direct_tex) {
3543                        assert(!direct_samp);
3544                        mode = BIFROST_INDEX_IMMEDIATE_TEXTURE;
3545                        desc.index = instr->texture_index;
3546                } else if (direct_samp) {
3547                        assert(!direct_tex);
3548                        mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
3549                        desc.index = instr->sampler_index;
3550                } else {
3551                        mode = BIFROST_INDEX_REGISTER;
3552                }
3553
3554                mode |= (BIFROST_TEXTURE_OPERATION_SINGLE << 2);
3555                desc.sampler_index_or_mode = mode;
3556        }
3557
3558        /* Allocate staging registers contiguously by compacting the array. */
3559        unsigned sr_count = 0;
3560
3561        for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) {
3562                if (!bi_is_null(dregs[i]))
3563                        dregs[sr_count++] = dregs[i];
3564        }
3565
3566        unsigned res_size = nir_dest_bit_size(instr->dest) == 16 ? 2 : 4;
3567
3568        bi_index sr = sr_count ? bi_temp(b->shader) : bi_null();
3569        bi_index dst = bi_temp(b->shader);
3570
3571        if (sr_count)
3572                bi_emit_collect_to(b, sr, dregs, sr_count);
3573
3574        uint32_t desc_u = 0;
3575        memcpy(&desc_u, &desc, sizeof(desc_u));
3576        bi_instr *I =
3577                bi_texc_to(b, dst, bi_null(), sr, cx, cy,
3578                   bi_imm_u32(desc_u),
3579                   !nir_tex_instr_has_implicit_derivative(instr), sr_count, 0);
3580        I->register_format = bi_reg_fmt_for_nir(instr->dest_type);
3581
3582        bi_index w[4] = { bi_null(), bi_null(), bi_null(), bi_null() };
3583        bi_emit_split_i32(b, w, dst, res_size);
3584        bi_emit_collect_to(b, bi_dest_index(&instr->dest), w,
3585                        DIV_ROUND_UP(nir_dest_num_components(instr->dest) * res_size, 4));
3586}
3587
3588/* Staging registers required by texturing in the order they appear (Valhall) */
3589
3590enum valhall_tex_sreg {
3591        VALHALL_TEX_SREG_X_COORD = 0,
3592        VALHALL_TEX_SREG_Y_COORD = 1,
3593        VALHALL_TEX_SREG_Z_COORD = 2,
3594        VALHALL_TEX_SREG_Y_DELTAS = 3,
3595        VALHALL_TEX_SREG_ARRAY = 4,
3596        VALHALL_TEX_SREG_SHADOW = 5,
3597        VALHALL_TEX_SREG_OFFSETMS = 6,
3598        VALHALL_TEX_SREG_LOD = 7,
3599        VALHALL_TEX_SREG_GRDESC = 8,
3600        VALHALL_TEX_SREG_COUNT,
3601};
3602
3603static void
3604bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr)
3605{
3606        bool explicit_offset = false;
3607        enum bi_va_lod_mode lod_mode = BI_VA_LOD_MODE_COMPUTED_LOD;
3608
3609        bool has_lod_mode =
3610                (instr->op == nir_texop_tex) ||
3611                (instr->op == nir_texop_txl) ||
3612                (instr->op == nir_texop_txb);
3613
3614        /* 32-bit indices to be allocated as consecutive staging registers */
3615        bi_index sregs[VALHALL_TEX_SREG_COUNT] = { };
3616
3617        bi_index sampler = bi_imm_u32(instr->sampler_index);
3618        bi_index texture = bi_imm_u32(instr->texture_index);
3619        uint32_t tables = (PAN_TABLE_SAMPLER << 11) | (PAN_TABLE_TEXTURE << 27);
3620
3621        for (unsigned i = 0; i < instr->num_srcs; ++i) {
3622                bi_index index = bi_src_index(&instr->src[i].src);
3623                unsigned sz = nir_src_bit_size(instr->src[i].src);
3624                unsigned components = nir_src_num_components(instr->src[i].src);
3625
3626                switch (instr->src[i].src_type) {
3627                case nir_tex_src_coord:
3628                        if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3629                                sregs[VALHALL_TEX_SREG_X_COORD] =
3630                                        bi_emit_texc_cube_coord(b, index,
3631                                                &sregs[VALHALL_TEX_SREG_Y_COORD]);
3632			} else {
3633                                assert(components >= 1 && components <= 3);
3634
3635                                /* Copy XY (for 2D+) or XX (for 1D) */
3636                                sregs[VALHALL_TEX_SREG_X_COORD] = index;
3637
3638                                if (components >= 2)
3639                                        sregs[VALHALL_TEX_SREG_Y_COORD] = bi_extract(b, index, 1);
3640
3641                                if (components == 3 && !instr->is_array) {
3642                                        sregs[VALHALL_TEX_SREG_Z_COORD] =
3643                                                bi_extract(b, index, 2);
3644                                }
3645                        }
3646
3647                        if (instr->is_array) {
3648                                sregs[VALHALL_TEX_SREG_ARRAY] =
3649                                        bi_extract(b, index, components - 1);
3650                        }
3651
3652                        break;
3653
3654                case nir_tex_src_lod:
3655                        if (nir_src_is_const(instr->src[i].src) &&
3656                            nir_src_as_uint(instr->src[i].src) == 0) {
3657                                lod_mode = BI_VA_LOD_MODE_ZERO_LOD;
3658                        } else if (has_lod_mode) {
3659                                lod_mode = BI_VA_LOD_MODE_EXPLICIT;
3660
3661                                assert(sz == 16 || sz == 32);
3662                                sregs[VALHALL_TEX_SREG_LOD] =
3663                                        bi_emit_texc_lod_88(b, index, sz == 16);
3664                        }
3665                        break;
3666
3667                case nir_tex_src_bias:
3668                        /* Upper 16-bits interpreted as a clamp, leave zero */
3669                        assert(sz == 16 || sz == 32);
3670                        sregs[VALHALL_TEX_SREG_LOD] =
3671                                bi_emit_texc_lod_88(b, index, sz == 16);
3672
3673                        lod_mode = BI_VA_LOD_MODE_COMPUTED_BIAS;
3674                        break;
3675                case nir_tex_src_ms_index:
3676                case nir_tex_src_offset:
3677                        /* Handled below */
3678                        break;
3679
3680                case nir_tex_src_comparator:
3681                        sregs[VALHALL_TEX_SREG_SHADOW] = index;
3682                        break;
3683
3684                case nir_tex_src_texture_offset:
3685                        assert(instr->texture_index == 0);
3686                        texture = index;
3687                        break;
3688
3689                case nir_tex_src_sampler_offset:
3690                        assert(instr->sampler_index == 0);
3691                        sampler = index;
3692                        break;
3693
3694                default:
3695                        unreachable("Unhandled src type in tex emit");
3696                }
3697        }
3698
3699        /* Generate packed offset + ms index + LOD register. These default to
3700         * zero so we only need to encode if these features are actually in use.
3701         */
3702        bi_index offsets = bi_emit_valhall_offsets(b, instr);
3703
3704        if (!bi_is_equiv(offsets, bi_zero())) {
3705                sregs[VALHALL_TEX_SREG_OFFSETMS] = offsets;
3706                explicit_offset = true;
3707        }
3708
3709        /* Allocate staging registers contiguously by compacting the array. */
3710        unsigned sr_count = 0;
3711
3712        for (unsigned i = 0; i < ARRAY_SIZE(sregs); ++i) {
3713                if (!bi_is_null(sregs[i]))
3714                        sregs[sr_count++] = sregs[i];
3715        }
3716
3717        bi_index idx = sr_count ? bi_temp(b->shader) : bi_null();
3718
3719        if (sr_count)
3720                bi_make_vec_to(b, idx, sregs, NULL, sr_count, 32);
3721
3722        bi_index image_src = bi_imm_u32(tables);
3723        image_src = bi_lshift_or_i32(b, sampler, image_src, bi_imm_u8(0));
3724        image_src = bi_lshift_or_i32(b, texture, image_src, bi_imm_u8(16));
3725
3726        unsigned mask = BI_WRITE_MASK_RGBA;
3727        unsigned res_size = nir_dest_bit_size(instr->dest) == 16 ? 2 : 4;
3728        enum bi_register_format regfmt = bi_reg_fmt_for_nir(instr->dest_type);
3729        enum bi_dimension dim = valhall_tex_dimension(instr->sampler_dim);
3730        bi_index dest = bi_temp(b->shader);
3731
3732        switch (instr->op) {
3733        case nir_texop_tex:
3734        case nir_texop_txl:
3735        case nir_texop_txb:
3736                bi_tex_single_to(b, dest, idx, image_src, bi_zero(),
3737                                 instr->is_array, dim, regfmt, instr->is_shadow,
3738                                 explicit_offset, lod_mode, mask, sr_count);
3739                break;
3740        case nir_texop_txf:
3741        case nir_texop_txf_ms:
3742                bi_tex_fetch_to(b, dest, idx, image_src, bi_zero(),
3743                                instr->is_array, dim, regfmt, explicit_offset,
3744                                mask, sr_count);
3745                break;
3746        case nir_texop_tg4:
3747                bi_tex_gather_to(b, dest, idx, image_src, bi_zero(),
3748                                 instr->is_array, dim, instr->component, false,
3749                                 regfmt, instr->is_shadow, explicit_offset,
3750                                 mask, sr_count);
3751                break;
3752        default:
3753                unreachable("Unhandled Valhall texture op");
3754        }
3755
3756        bi_index w[4] = { bi_null(), bi_null(), bi_null(), bi_null() };
3757        bi_emit_split_i32(b, w, dest, res_size);
3758        bi_emit_collect_to(b, bi_dest_index(&instr->dest), w,
3759                        DIV_ROUND_UP(nir_dest_num_components(instr->dest) * res_size, 4));
3760}
3761
3762/* Simple textures ops correspond to NIR tex or txl with LOD = 0 on 2D/cube
3763 * textures with sufficiently small immediate indices. Anything else
3764 * needs a complete texture op. */
3765
3766static void
3767bi_emit_texs(bi_builder *b, nir_tex_instr *instr)
3768{
3769        int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
3770        assert(coord_idx >= 0);
3771        bi_index coords = bi_src_index(&instr->src[coord_idx].src);
3772
3773        if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3774                bi_index face, s, t;
3775                bi_emit_cube_coord(b, coords, &face, &s, &t);
3776
3777                bi_texs_cube_to(b, nir_dest_bit_size(instr->dest),
3778                                bi_dest_index(&instr->dest),
3779                                s, t, face,
3780                                instr->sampler_index, instr->texture_index);
3781        } else {
3782                bi_texs_2d_to(b, nir_dest_bit_size(instr->dest),
3783                                bi_dest_index(&instr->dest),
3784                                bi_extract(b, coords, 0),
3785                                bi_extract(b, coords, 1),
3786                                instr->op != nir_texop_tex, /* zero LOD */
3787                                instr->sampler_index, instr->texture_index);
3788        }
3789
3790        bi_split_dest(b, instr->dest);
3791}
3792
3793static bool
3794bi_is_simple_tex(nir_tex_instr *instr)
3795{
3796        if (instr->op != nir_texop_tex && instr->op != nir_texop_txl)
3797                return false;
3798
3799        if (instr->dest_type != nir_type_float32 &&
3800            instr->dest_type != nir_type_float16)
3801                return false;
3802
3803        if (instr->is_shadow || instr->is_array)
3804                return false;
3805
3806        switch (instr->sampler_dim) {
3807        case GLSL_SAMPLER_DIM_2D:
3808        case GLSL_SAMPLER_DIM_EXTERNAL:
3809        case GLSL_SAMPLER_DIM_RECT:
3810                break;
3811
3812        case GLSL_SAMPLER_DIM_CUBE:
3813                /* LOD can't be specified with TEXS_CUBE */
3814                if (instr->op == nir_texop_txl)
3815                        return false;
3816                break;
3817
3818        default:
3819                return false;
3820        }
3821
3822        for (unsigned i = 0; i < instr->num_srcs; ++i) {
3823                if (instr->src[i].src_type != nir_tex_src_lod &&
3824                    instr->src[i].src_type != nir_tex_src_coord)
3825                        return false;
3826        }
3827
3828        /* Indices need to fit in provided bits */
3829        unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 2 : 3;
3830        if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits))
3831                return false;
3832
3833        int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
3834        if (lod_idx < 0)
3835                return true;
3836
3837        nir_src lod = instr->src[lod_idx].src;
3838        return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0;
3839}
3840
3841static void
3842bi_emit_tex(bi_builder *b, nir_tex_instr *instr)
3843{
3844        switch (instr->op) {
3845        case nir_texop_txs:
3846                bi_load_sysval_to(b, bi_dest_index(&instr->dest),
3847                                panfrost_sysval_for_instr(&instr->instr, NULL),
3848                                nir_dest_num_components(instr->dest), 0);
3849                return;
3850        case nir_texop_tex:
3851        case nir_texop_txl:
3852        case nir_texop_txb:
3853        case nir_texop_txf:
3854        case nir_texop_txf_ms:
3855        case nir_texop_tg4:
3856                break;
3857        default:
3858                unreachable("Invalid texture operation");
3859        }
3860
3861        if (b->shader->arch >= 9)
3862                bi_emit_tex_valhall(b, instr);
3863        else if (bi_is_simple_tex(instr))
3864                bi_emit_texs(b, instr);
3865        else
3866                bi_emit_texc(b, instr);
3867}
3868
3869static void
3870bi_emit_instr(bi_builder *b, struct nir_instr *instr)
3871{
3872        switch (instr->type) {
3873        case nir_instr_type_load_const:
3874                bi_emit_load_const(b, nir_instr_as_load_const(instr));
3875                break;
3876
3877        case nir_instr_type_intrinsic:
3878                bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
3879                break;
3880
3881        case nir_instr_type_alu:
3882                bi_emit_alu(b, nir_instr_as_alu(instr));
3883                break;
3884
3885        case nir_instr_type_tex:
3886                bi_emit_tex(b, nir_instr_as_tex(instr));
3887                break;
3888
3889        case nir_instr_type_jump:
3890                bi_emit_jump(b, nir_instr_as_jump(instr));
3891                break;
3892
3893        default:
3894                unreachable("should've been lowered");
3895        }
3896}
3897
3898static bi_block *
3899create_empty_block(bi_context *ctx)
3900{
3901        bi_block *blk = rzalloc(ctx, bi_block);
3902
3903        util_dynarray_init(&blk->predecessors, blk);
3904
3905        return blk;
3906}
3907
3908static bi_block *
3909emit_block(bi_context *ctx, nir_block *block)
3910{
3911        if (ctx->after_block) {
3912                ctx->current_block = ctx->after_block;
3913                ctx->after_block = NULL;
3914        } else {
3915                ctx->current_block = create_empty_block(ctx);
3916        }
3917
3918        list_addtail(&ctx->current_block->link, &ctx->blocks);
3919        list_inithead(&ctx->current_block->instructions);
3920
3921        bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
3922
3923        nir_foreach_instr(instr, block) {
3924                bi_emit_instr(&_b, instr);
3925                ++ctx->instruction_count;
3926        }
3927
3928        return ctx->current_block;
3929}
3930
3931static void
3932emit_if(bi_context *ctx, nir_if *nif)
3933{
3934        bi_block *before_block = ctx->current_block;
3935
3936        /* Speculatively emit the branch, but we can't fill it in until later */
3937        bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
3938        bi_instr *then_branch = bi_branchz_i16(&_b,
3939                        bi_half(bi_src_index(&nif->condition), false),
3940                        bi_zero(), BI_CMPF_EQ);
3941
3942        /* Emit the two subblocks. */
3943        bi_block *then_block = emit_cf_list(ctx, &nif->then_list);
3944        bi_block *end_then_block = ctx->current_block;
3945
3946        /* Emit second block, and check if it's empty */
3947
3948        int count_in = ctx->instruction_count;
3949        bi_block *else_block = emit_cf_list(ctx, &nif->else_list);
3950        bi_block *end_else_block = ctx->current_block;
3951        ctx->after_block = create_empty_block(ctx);
3952
3953        /* Now that we have the subblocks emitted, fix up the branches */
3954
3955        assert(then_block);
3956        assert(else_block);
3957
3958        if (ctx->instruction_count == count_in) {
3959                then_branch->branch_target = ctx->after_block;
3960                bi_block_add_successor(end_then_block, ctx->after_block); /* fallthrough */
3961        } else {
3962                then_branch->branch_target = else_block;
3963
3964                /* Emit a jump from the end of the then block to the end of the else */
3965                _b.cursor = bi_after_block(end_then_block);
3966                bi_instr *then_exit = bi_jump(&_b, bi_zero());
3967                then_exit->branch_target = ctx->after_block;
3968
3969                bi_block_add_successor(end_then_block, then_exit->branch_target);
3970                bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */
3971        }
3972
3973        bi_block_add_successor(before_block, then_branch->branch_target); /* then_branch */
3974        bi_block_add_successor(before_block, then_block); /* fallthrough */
3975}
3976
3977static void
3978emit_loop(bi_context *ctx, nir_loop *nloop)
3979{
3980        /* Remember where we are */
3981        bi_block *start_block = ctx->current_block;
3982
3983        bi_block *saved_break = ctx->break_block;
3984        bi_block *saved_continue = ctx->continue_block;
3985
3986        ctx->continue_block = create_empty_block(ctx);
3987        ctx->break_block = create_empty_block(ctx);
3988        ctx->after_block = ctx->continue_block;
3989
3990        /* Emit the body itself */
3991        emit_cf_list(ctx, &nloop->body);
3992
3993        /* Branch back to loop back */
3994        bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
3995        bi_instr *I = bi_jump(&_b, bi_zero());
3996        I->branch_target = ctx->continue_block;
3997        bi_block_add_successor(start_block, ctx->continue_block);
3998        bi_block_add_successor(ctx->current_block, ctx->continue_block);
3999
4000        ctx->after_block = ctx->break_block;
4001
4002        /* Pop off */
4003        ctx->break_block = saved_break;
4004        ctx->continue_block = saved_continue;
4005        ++ctx->loop_count;
4006}
4007
4008static bi_block *
4009emit_cf_list(bi_context *ctx, struct exec_list *list)
4010{
4011        bi_block *start_block = NULL;
4012
4013        foreach_list_typed(nir_cf_node, node, node, list) {
4014                switch (node->type) {
4015                case nir_cf_node_block: {
4016                        bi_block *block = emit_block(ctx, nir_cf_node_as_block(node));
4017
4018                        if (!start_block)
4019                                start_block = block;
4020
4021                        break;
4022                }
4023
4024                case nir_cf_node_if:
4025                        emit_if(ctx, nir_cf_node_as_if(node));
4026                        break;
4027
4028                case nir_cf_node_loop:
4029                        emit_loop(ctx, nir_cf_node_as_loop(node));
4030                        break;
4031
4032                default:
4033                        unreachable("Unknown control flow");
4034                }
4035        }
4036
4037        return start_block;
4038}
4039
4040/* shader-db stuff */
4041
4042struct bi_stats {
4043        unsigned nr_clauses, nr_tuples, nr_ins;
4044        unsigned nr_arith, nr_texture, nr_varying, nr_ldst;
4045};
4046
4047static void
4048bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats)
4049{
4050        /* Count instructions */
4051        stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 1 : 0);
4052
4053        /* Non-message passing tuples are always arithmetic */
4054        if (tuple->add != clause->message) {
4055                stats->nr_arith++;
4056                return;
4057        }
4058
4059        /* Message + FMA we'll count as arithmetic _and_ message */
4060        if (tuple->fma)
4061                stats->nr_arith++;
4062
4063        switch (clause->message_type) {
4064        case BIFROST_MESSAGE_VARYING:
4065                /* Check components interpolated */
4066                stats->nr_varying += (clause->message->vecsize + 1) *
4067                        (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2);
4068                break;
4069
4070        case BIFROST_MESSAGE_VARTEX:
4071                /* 2 coordinates, fp32 each */
4072                stats->nr_varying += (2 * 2);
4073                FALLTHROUGH;
4074        case BIFROST_MESSAGE_TEX:
4075                stats->nr_texture++;
4076                break;
4077
4078        case BIFROST_MESSAGE_ATTRIBUTE:
4079        case BIFROST_MESSAGE_LOAD:
4080        case BIFROST_MESSAGE_STORE:
4081        case BIFROST_MESSAGE_ATOMIC:
4082                stats->nr_ldst++;
4083                break;
4084
4085        case BIFROST_MESSAGE_NONE:
4086        case BIFROST_MESSAGE_BARRIER:
4087        case BIFROST_MESSAGE_BLEND:
4088        case BIFROST_MESSAGE_TILE:
4089        case BIFROST_MESSAGE_Z_STENCIL:
4090        case BIFROST_MESSAGE_ATEST:
4091        case BIFROST_MESSAGE_JOB:
4092        case BIFROST_MESSAGE_64BIT:
4093                /* Nothing to do */
4094                break;
4095        };
4096
4097}
4098
4099/*
4100 * v7 allows preloading LD_VAR or VAR_TEX messages that must complete before the
4101 * shader completes. These costs are not accounted for in the general cycle
4102 * counts, so this function calculates the effective cost of these messages, as
4103 * if they were executed by shader code.
4104 */
4105static unsigned
4106bi_count_preload_cost(bi_context *ctx)
4107{
4108        /* Units: 1/16 of a normalized cycle, assuming that we may interpolate
4109         * 16 fp16 varying components per cycle or fetch two texels per cycle.
4110         */
4111        unsigned cost = 0;
4112
4113        for (unsigned i = 0; i < ARRAY_SIZE(ctx->info.bifrost->messages); ++i) {
4114                struct bifrost_message_preload msg = ctx->info.bifrost->messages[i];
4115
4116                if (msg.enabled && msg.texture) {
4117                        /* 2 coordinate, 2 half-words each, plus texture */
4118                        cost += 12;
4119                } else if (msg.enabled) {
4120                        cost += (msg.num_components * (msg.fp16 ? 1 : 2));
4121                }
4122        }
4123
4124        return cost;
4125}
4126
4127static const char *
4128bi_shader_stage_name(bi_context *ctx)
4129{
4130        if (ctx->idvs == BI_IDVS_VARYING)
4131                return "MESA_SHADER_VARYING";
4132        else if (ctx->idvs == BI_IDVS_POSITION)
4133                return "MESA_SHADER_POSITION";
4134        else if (ctx->inputs->is_blend)
4135                return "MESA_SHADER_BLEND";
4136        else
4137                return gl_shader_stage_name(ctx->stage);
4138}
4139
4140static void
4141bi_print_stats(bi_context *ctx, unsigned size, FILE *fp)
4142{
4143        struct bi_stats stats = { 0 };
4144
4145        /* Count instructions, clauses, and tuples. Also attempt to construct
4146         * normalized execution engine cycle counts, using the following ratio:
4147         *
4148         * 24 arith tuples/cycle
4149         * 2 texture messages/cycle
4150         * 16 x 16-bit varying channels interpolated/cycle
4151         * 1 load store message/cycle
4152         *
4153         * These numbers seem to match Arm Mobile Studio's heuristic. The real
4154         * cycle counts are surely more complicated.
4155         */
4156
4157        bi_foreach_block(ctx, block) {
4158                bi_foreach_clause_in_block(block, clause) {
4159                        stats.nr_clauses++;
4160                        stats.nr_tuples += clause->tuple_count;
4161
4162                        for (unsigned i = 0; i < clause->tuple_count; ++i)
4163                                bi_count_tuple_stats(clause, &clause->tuples[i], &stats);
4164                }
4165        }
4166
4167        float cycles_arith = ((float) stats.nr_arith) / 24.0;
4168        float cycles_texture = ((float) stats.nr_texture) / 2.0;
4169        float cycles_varying = ((float) stats.nr_varying) / 16.0;
4170        float cycles_ldst = ((float) stats.nr_ldst) / 1.0;
4171
4172        float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst);
4173        float cycles_bound = MAX2(cycles_arith, cycles_message);
4174
4175        /* Thread count and register pressure are traded off only on v7 */
4176        bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32);
4177        unsigned nr_threads = full_threads ? 2 : 1;
4178
4179        /* Dump stats */
4180        char *str = ralloc_asprintf(NULL, "%s - %s shader: "
4181                        "%u inst, %u tuples, %u clauses, "
4182                        "%f cycles, %f arith, %f texture, %f vary, %f ldst, "
4183                        "%u quadwords, %u threads",
4184                        ctx->nir->info.label ?: "",
4185                        bi_shader_stage_name(ctx),
4186                        stats.nr_ins, stats.nr_tuples, stats.nr_clauses,
4187                        cycles_bound, cycles_arith, cycles_texture,
4188                        cycles_varying, cycles_ldst,
4189                        size / 16, nr_threads);
4190
4191        if (ctx->arch == 7) {
4192                ralloc_asprintf_append(&str, ", %u preloads", bi_count_preload_cost(ctx));
4193        }
4194
4195        ralloc_asprintf_append(&str, ", %u loops, %u:%u spills:fills\n",
4196                        ctx->loop_count, ctx->spills, ctx->fills);
4197
4198        fputs(str, stderr);
4199        ralloc_free(str);
4200}
4201
4202static void
4203va_print_stats(bi_context *ctx, unsigned size, FILE *fp)
4204{
4205        unsigned nr_ins = 0;
4206        struct va_stats stats = { 0 };
4207
4208        /* Count instructions */
4209        bi_foreach_instr_global(ctx, I) {
4210                nr_ins++;
4211                va_count_instr_stats(I, &stats);
4212        }
4213
4214        /* Mali G78 peak performance:
4215         *
4216         * 64 FMA instructions per cycle
4217         * 64 CVT instructions per cycle
4218         * 16 SFU instructions per cycle
4219         * 8 x 32-bit varying channels interpolated per cycle
4220         * 4 texture instructions per cycle
4221         * 1 load/store operation per cycle
4222         */
4223
4224        float cycles_fma = ((float) stats.fma) / 64.0;
4225        float cycles_cvt = ((float) stats.cvt) / 64.0;
4226        float cycles_sfu = ((float) stats.sfu) / 16.0;
4227        float cycles_v = ((float) stats.v) / 16.0;
4228        float cycles_t = ((float) stats.t) / 4.0;
4229        float cycles_ls = ((float) stats.ls) / 1.0;
4230
4231        /* Calculate the bound */
4232        float cycles = MAX2(
4233                        MAX3(cycles_fma, cycles_cvt, cycles_sfu),
4234                        MAX3(cycles_v,   cycles_t,   cycles_ls));
4235
4236
4237        /* Thread count and register pressure are traded off */
4238        unsigned nr_threads = (ctx->info.work_reg_count <= 32) ? 2 : 1;
4239
4240        /* Dump stats */
4241        fprintf(stderr, "%s - %s shader: "
4242                        "%u inst, %f cycles, %f fma, %f cvt, %f sfu, %f v, "
4243                        "%f t, %f ls, %u quadwords, %u threads, %u loops, "
4244                        "%u:%u spills:fills\n",
4245                        ctx->nir->info.label ?: "",
4246                        bi_shader_stage_name(ctx),
4247                        nr_ins, cycles, cycles_fma, cycles_cvt, cycles_sfu,
4248                        cycles_v, cycles_t, cycles_ls, size / 16, nr_threads,
4249                        ctx->loop_count, ctx->spills, ctx->fills);
4250}
4251
4252static int
4253glsl_type_size(const struct glsl_type *type, bool bindless)
4254{
4255        return glsl_count_attribute_slots(type, false);
4256}
4257
4258/* Split stores to memory. We don't split stores to vertex outputs, since
4259 * nir_lower_io_to_temporaries will ensure there's only a single write.
4260 */
4261
4262static bool
4263should_split_wrmask(const nir_instr *instr, UNUSED const void *data)
4264{
4265        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4266
4267        switch (intr->intrinsic) {
4268        case nir_intrinsic_store_ssbo:
4269        case nir_intrinsic_store_shared:
4270        case nir_intrinsic_store_global:
4271        case nir_intrinsic_store_scratch:
4272                return true;
4273        default:
4274                return false;
4275        }
4276}
4277
4278/* Bifrost wants transcendentals as FP32 */
4279
4280static unsigned
4281bi_lower_bit_size(const nir_instr *instr, UNUSED void *data)
4282{
4283        if (instr->type != nir_instr_type_alu)
4284                return 0;
4285
4286        nir_alu_instr *alu = nir_instr_as_alu(instr);
4287
4288        switch (alu->op) {
4289        case nir_op_fexp2:
4290        case nir_op_flog2:
4291        case nir_op_fpow:
4292        case nir_op_fsin:
4293        case nir_op_fcos:
4294                return (nir_dest_bit_size(alu->dest.dest) == 32) ? 0 : 32;
4295        default:
4296                return 0;
4297        }
4298}
4299
4300/* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4,
4301 * transcendentals are an exception. Also shifts because of lane size mismatch
4302 * (8-bit in Bifrost, 32-bit in NIR TODO - workaround!). Some conversions need
4303 * to be scalarized due to type size. */
4304
4305static uint8_t
4306bi_vectorize_filter(const nir_instr *instr, const void *data)
4307{
4308        /* Defaults work for everything else */
4309        if (instr->type != nir_instr_type_alu)
4310                return 0;
4311
4312        const nir_alu_instr *alu = nir_instr_as_alu(instr);
4313
4314        switch (alu->op) {
4315        case nir_op_frcp:
4316        case nir_op_frsq:
4317        case nir_op_ishl:
4318        case nir_op_ishr:
4319        case nir_op_ushr:
4320        case nir_op_f2i16:
4321        case nir_op_f2u16:
4322                return 1;
4323        default:
4324                break;
4325        }
4326
4327        /* Vectorized instructions cannot write more than 32-bit */
4328        int dst_bit_size = nir_dest_bit_size(alu->dest.dest);
4329        if (dst_bit_size == 16)
4330                return 2;
4331        else
4332                return 1;
4333}
4334
4335static bool
4336bi_scalarize_filter(const nir_instr *instr, const void *data)
4337{
4338        if (instr->type != nir_instr_type_alu)
4339                return false;
4340
4341        const nir_alu_instr *alu = nir_instr_as_alu(instr);
4342
4343        switch (alu->op) {
4344        case nir_op_pack_uvec2_to_uint:
4345        case nir_op_pack_uvec4_to_uint:
4346                return false;
4347        default:
4348                return true;
4349        }
4350}
4351
4352/* XXX: This is a kludge to workaround NIR's lack of divergence metadata. If we
4353 * keep divergence info around after we consume it for indirect lowering,
4354 * nir_convert_from_ssa will regress code quality since it will avoid
4355 * coalescing divergent with non-divergent nodes. */
4356
4357static bool
4358nir_invalidate_divergence_ssa(nir_ssa_def *ssa, UNUSED void *data)
4359{
4360        ssa->divergent = false;
4361        return true;
4362}
4363
4364static bool
4365nir_invalidate_divergence(struct nir_builder *b, nir_instr *instr,
4366                UNUSED void *data)
4367{
4368        return nir_foreach_ssa_def(instr, nir_invalidate_divergence_ssa, NULL);
4369}
4370
4371/* Ensure we write exactly 4 components */
4372static nir_ssa_def *
4373bifrost_nir_valid_channel(nir_builder *b, nir_ssa_def *in,
4374                          unsigned channel, unsigned first, unsigned mask)
4375{
4376        if (!(mask & BITFIELD_BIT(channel)))
4377                channel = first;
4378
4379        return nir_channel(b, in, channel);
4380}
4381
4382/* Lower fragment store_output instructions to always write 4 components,
4383 * matching the hardware semantic. This may require additional moves. Skipping
4384 * these moves is possible in theory, but invokes undefined behaviour in the
4385 * compiler. The DDK inserts these moves, so we will as well. */
4386
4387static bool
4388bifrost_nir_lower_blend_components(struct nir_builder *b,
4389                                   nir_instr *instr, void *data)
4390{
4391        if (instr->type != nir_instr_type_intrinsic)
4392                return false;
4393
4394        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4395
4396        if (intr->intrinsic != nir_intrinsic_store_output)
4397                return false;
4398
4399        nir_ssa_def *in = intr->src[0].ssa;
4400        unsigned first = nir_intrinsic_component(intr);
4401        unsigned mask = nir_intrinsic_write_mask(intr);
4402
4403        assert(first == 0 && "shouldn't get nonzero components");
4404
4405        /* Nothing to do */
4406        if (mask == BITFIELD_MASK(4))
4407                return false;
4408
4409        b->cursor = nir_before_instr(&intr->instr);
4410
4411        /* Replicate the first valid component instead */
4412        nir_ssa_def *replicated =
4413                nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask),
4414                            bifrost_nir_valid_channel(b, in, 1, first, mask),
4415                            bifrost_nir_valid_channel(b, in, 2, first, mask),
4416                            bifrost_nir_valid_channel(b, in, 3, first, mask));
4417
4418        /* Rewrite to use our replicated version */
4419        nir_instr_rewrite_src_ssa(instr, &intr->src[0], replicated);
4420        nir_intrinsic_set_component(intr, 0);
4421        nir_intrinsic_set_write_mask(intr, 0xF);
4422        intr->num_components = 4;
4423
4424        return true;
4425}
4426
4427static void
4428bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
4429{
4430        bool progress;
4431        unsigned lower_flrp = 16 | 32 | 64;
4432
4433        NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
4434
4435        nir_lower_tex_options lower_tex_options = {
4436                .lower_txs_lod = true,
4437                .lower_txp = ~0,
4438                .lower_tg4_broadcom_swizzle = true,
4439                .lower_txd = true,
4440                .lower_invalid_implicit_lod = true,
4441        };
4442
4443        NIR_PASS(progress, nir, pan_nir_lower_64bit_intrin);
4444        NIR_PASS(progress, nir, pan_lower_helper_invocation);
4445
4446        NIR_PASS(progress, nir, nir_lower_int64);
4447
4448        nir_lower_idiv_options idiv_options = {
4449                .imprecise_32bit_lowering = true,
4450                .allow_fp16 = true,
4451        };
4452        NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);
4453
4454        NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);
4455        NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
4456        NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
4457
4458        do {
4459                progress = false;
4460
4461                NIR_PASS(progress, nir, nir_lower_var_copies);
4462                NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
4463                NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL);
4464
4465                NIR_PASS(progress, nir, nir_copy_prop);
4466                NIR_PASS(progress, nir, nir_opt_remove_phis);
4467                NIR_PASS(progress, nir, nir_opt_dce);
4468                NIR_PASS(progress, nir, nir_opt_dead_cf);
4469                NIR_PASS(progress, nir, nir_opt_cse);
4470                NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
4471                NIR_PASS(progress, nir, nir_opt_algebraic);
4472                NIR_PASS(progress, nir, nir_opt_constant_folding);
4473
4474                NIR_PASS(progress, nir, nir_lower_alu);
4475
4476                if (lower_flrp != 0) {
4477                        bool lower_flrp_progress = false;
4478                        NIR_PASS(lower_flrp_progress,
4479                                 nir,
4480                                 nir_lower_flrp,
4481                                 lower_flrp,
4482                                 false /* always_precise */);
4483                        if (lower_flrp_progress) {
4484                                NIR_PASS(progress, nir,
4485                                         nir_opt_constant_folding);
4486                                progress = true;
4487                        }
4488
4489                        /* Nothing should rematerialize any flrps, so we only
4490                         * need to do this lowering once.
4491                         */
4492                        lower_flrp = 0;
4493                }
4494
4495                NIR_PASS(progress, nir, nir_opt_undef);
4496                NIR_PASS(progress, nir, nir_lower_undef_to_zero);
4497
4498                NIR_PASS(progress, nir, nir_opt_shrink_vectors);
4499                NIR_PASS(progress, nir, nir_opt_loop_unroll);
4500        } while (progress);
4501
4502        /* TODO: Why is 64-bit getting rematerialized?
4503         * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */
4504        NIR_PASS(progress, nir, nir_lower_int64);
4505
4506        /* We need to cleanup after each iteration of late algebraic
4507         * optimizations, since otherwise NIR can produce weird edge cases
4508         * (like fneg of a constant) which we don't handle */
4509        bool late_algebraic = true;
4510        while (late_algebraic) {
4511                late_algebraic = false;
4512                NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
4513                NIR_PASS(progress, nir, nir_opt_constant_folding);
4514                NIR_PASS(progress, nir, nir_copy_prop);
4515                NIR_PASS(progress, nir, nir_opt_dce);
4516                NIR_PASS(progress, nir, nir_opt_cse);
4517        }
4518
4519        NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
4520        NIR_PASS(progress, nir, nir_lower_phis_to_scalar, true);
4521        NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL);
4522        NIR_PASS(progress, nir, nir_lower_bool_to_bitsize);
4523
4524        /* Prepass to simplify instruction selection */
4525        late_algebraic = false;
4526        NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late);
4527
4528        while (late_algebraic) {
4529                late_algebraic = false;
4530                NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
4531                NIR_PASS(progress, nir, nir_opt_constant_folding);
4532                NIR_PASS(progress, nir, nir_copy_prop);
4533                NIR_PASS(progress, nir, nir_opt_dce);
4534                NIR_PASS(progress, nir, nir_opt_cse);
4535        }
4536
4537        NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
4538        NIR_PASS(progress, nir, nir_opt_dce);
4539
4540        if (nir->info.stage == MESA_SHADER_FRAGMENT) {
4541                NIR_PASS_V(nir, nir_shader_instructions_pass,
4542                           bifrost_nir_lower_blend_components,
4543                           nir_metadata_block_index | nir_metadata_dominance,
4544                           NULL);
4545        }
4546
4547        /* Backend scheduler is purely local, so do some global optimizations
4548         * to reduce register pressure. */
4549        nir_move_options move_all =
4550                nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
4551                nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
4552
4553        NIR_PASS_V(nir, nir_opt_sink, move_all);
4554        NIR_PASS_V(nir, nir_opt_move, move_all);
4555
4556        /* We might lower attribute, varying, and image indirects. Use the
4557         * gathered info to skip the extra analysis in the happy path. */
4558        bool any_indirects =
4559                nir->info.inputs_read_indirectly ||
4560                nir->info.outputs_accessed_indirectly ||
4561                nir->info.patch_inputs_read_indirectly ||
4562                nir->info.patch_outputs_accessed_indirectly ||
4563                nir->info.images_used[0];
4564
4565        if (any_indirects) {
4566                nir_convert_to_lcssa(nir, true, true);
4567                NIR_PASS_V(nir, nir_divergence_analysis);
4568                NIR_PASS_V(nir, bi_lower_divergent_indirects,
4569                                pan_subgroup_size(gpu_id >> 12));
4570                NIR_PASS_V(nir, nir_shader_instructions_pass,
4571                        nir_invalidate_divergence, nir_metadata_all, NULL);
4572        }
4573}
4574
4575/* The cmdstream lowers 8-bit fragment output as 16-bit, so we need to do the
4576 * same lowering here to zero-extend correctly */
4577
4578static bool
4579bifrost_nir_lower_i8_fragout_impl(struct nir_builder *b,
4580                nir_intrinsic_instr *intr, UNUSED void *data)
4581{
4582        if (nir_src_bit_size(intr->src[0]) != 8)
4583                return false;
4584
4585        nir_alu_type type =
4586                nir_alu_type_get_base_type(nir_intrinsic_src_type(intr));
4587
4588        assert(type == nir_type_int || type == nir_type_uint);
4589
4590        b->cursor = nir_before_instr(&intr->instr);
4591        nir_ssa_def *cast = nir_convert_to_bit_size(b, intr->src[0].ssa, type, 16);
4592
4593        nir_intrinsic_set_src_type(intr, type | 16);
4594        nir_instr_rewrite_src_ssa(&intr->instr, &intr->src[0], cast);
4595        return true;
4596}
4597
4598static bool
4599bifrost_nir_lower_i8_fragin_impl(struct nir_builder *b,
4600                nir_intrinsic_instr *intr, UNUSED void *data)
4601{
4602        if (nir_dest_bit_size(intr->dest) != 8)
4603                return false;
4604
4605        nir_alu_type type =
4606                nir_alu_type_get_base_type(nir_intrinsic_dest_type(intr));
4607
4608        assert(type == nir_type_int || type == nir_type_uint);
4609
4610        b->cursor = nir_before_instr(&intr->instr);
4611        nir_ssa_def *out =
4612                nir_load_output(b, intr->num_components, 16, intr->src[0].ssa,
4613                        .base = nir_intrinsic_base(intr),
4614                        .component = nir_intrinsic_component(intr),
4615                        .dest_type = type | 16,
4616                        .io_semantics = nir_intrinsic_io_semantics(intr));
4617
4618        nir_ssa_def *cast = nir_convert_to_bit_size(b, out, type, 8);
4619        nir_ssa_def_rewrite_uses(&intr->dest.ssa, cast);
4620        return true;
4621}
4622
4623static bool
4624bifrost_nir_lower_i8_frag(struct nir_builder *b,
4625                nir_instr *instr, UNUSED void *data)
4626{
4627        if (instr->type != nir_instr_type_intrinsic)
4628                return false;
4629
4630        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4631        if (intr->intrinsic == nir_intrinsic_load_output)
4632                return bifrost_nir_lower_i8_fragin_impl(b, intr, data);
4633        else if (intr->intrinsic == nir_intrinsic_store_output)
4634                return bifrost_nir_lower_i8_fragout_impl(b, intr, data);
4635        else
4636                return false;
4637}
4638
4639static void
4640bi_opt_post_ra(bi_context *ctx)
4641{
4642        bi_foreach_instr_global_safe(ctx, ins) {
4643                if (ins->op == BI_OPCODE_MOV_I32 && bi_is_equiv(ins->dest[0], ins->src[0]))
4644                        bi_remove_instruction(ins);
4645        }
4646}
4647
4648/* If the shader packs multiple varyings into the same location with different
4649 * location_frac, we'll need to lower to a single varying store that collects
4650 * all of the channels together.
4651 */
4652static bool
4653bifrost_nir_lower_store_component(struct nir_builder *b,
4654                nir_instr *instr, void *data)
4655{
4656        if (instr->type != nir_instr_type_intrinsic)
4657                return false;
4658
4659        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4660
4661        if (intr->intrinsic != nir_intrinsic_store_output)
4662                return false;
4663
4664        struct hash_table_u64 *slots = data;
4665        unsigned component = nir_intrinsic_component(intr);
4666        nir_src *slot_src = nir_get_io_offset_src(intr);
4667        uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr);
4668
4669        nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot);
4670        unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0);
4671
4672        nir_ssa_def *value = intr->src[0].ssa;
4673        b->cursor = nir_before_instr(&intr->instr);
4674
4675        nir_ssa_def *undef = nir_ssa_undef(b, 1, value->bit_size);
4676        nir_ssa_def *channels[4] = { undef, undef, undef, undef };
4677
4678        /* Copy old */
4679        u_foreach_bit(i, mask) {
4680                assert(prev != NULL);
4681                nir_ssa_def *prev_ssa = prev->src[0].ssa;
4682                channels[i] = nir_channel(b, prev_ssa, i);
4683        }
4684
4685        /* Copy new */
4686        unsigned new_mask = nir_intrinsic_write_mask(intr);
4687        mask |= (new_mask << component);
4688
4689        u_foreach_bit(i, new_mask) {
4690                assert(component + i < 4);
4691                channels[component + i] = nir_channel(b, value, i);
4692        }
4693
4694        intr->num_components = util_last_bit(mask);
4695        nir_instr_rewrite_src_ssa(instr, &intr->src[0],
4696                        nir_vec(b, channels, intr->num_components));
4697
4698        nir_intrinsic_set_component(intr, 0);
4699        nir_intrinsic_set_write_mask(intr, mask);
4700
4701        if (prev) {
4702                _mesa_hash_table_u64_remove(slots, slot);
4703                nir_instr_remove(&prev->instr);
4704        }
4705
4706        _mesa_hash_table_u64_insert(slots, slot, intr);
4707        return false;
4708}
4709
4710/* Dead code elimination for branches at the end of a block - only one branch
4711 * per block is legal semantically, but unreachable jumps can be generated.
4712 * Likewise on Bifrost we can generate jumps to the terminal block which need
4713 * to be lowered away to a jump to #0x0, which induces successful termination.
4714 * That trick doesn't work on Valhall, which needs a NOP inserted in the
4715 * terminal block instead.
4716 */
4717static void
4718bi_lower_branch(bi_context *ctx, bi_block *block)
4719{
4720        bool cull_terminal = (ctx->arch <= 8);
4721        bool branched = false;
4722        ASSERTED bool was_jump = false;
4723
4724        bi_foreach_instr_in_block_safe(block, ins) {
4725                if (!ins->branch_target) continue;
4726
4727                if (branched) {
4728                        assert(was_jump && (ins->op == BI_OPCODE_JUMP));
4729                        bi_remove_instruction(ins);
4730                        continue;
4731                }
4732
4733                branched = true;
4734                was_jump = ins->op == BI_OPCODE_JUMP;
4735
4736                if (!bi_is_terminal_block(ins->branch_target))
4737                        continue;
4738
4739                if (cull_terminal)
4740                        ins->branch_target = NULL;
4741                else if (ins->branch_target)
4742                        ins->branch_target->needs_nop = true;
4743        }
4744}
4745
4746static void
4747bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset)
4748{
4749        unsigned final_clause = bi_pack(ctx, binary);
4750
4751        /* If we need to wait for ATEST or BLEND in the first clause, pass the
4752         * corresponding bits through to the renderer state descriptor */
4753        bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
4754        bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL);
4755
4756        unsigned first_deps = first_clause ? first_clause->dependencies : 0;
4757        ctx->info.bifrost->wait_6 = (first_deps & (1 << 6));
4758        ctx->info.bifrost->wait_7 = (first_deps & (1 << 7));
4759
4760        /* Pad the shader with enough zero bytes to trick the prefetcher,
4761         * unless we're compiling an empty shader (in which case we don't pad
4762         * so the size remains 0) */
4763        unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause;
4764
4765        if (binary->size - offset) {
4766                memset(util_dynarray_grow(binary, uint8_t, prefetch_size),
4767                       0, prefetch_size);
4768        }
4769}
4770
4771/*
4772 * Build a bit mask of varyings (by location) that are flatshaded. This
4773 * information is needed by lower_mediump_io, as we don't yet support 16-bit
4774 * flat varyings.
4775 *
4776 * Also varyings that are used as texture coordinates should be kept at fp32 so
4777 * the texture instruction may be promoted to VAR_TEX. In general this is a good
4778 * idea, as fp16 texture coordinates are not supported by the hardware and are
4779 * usually inappropriate. (There are both relevant CTS bugs here, even.)
4780 *
4781 * TODO: If we compacted the varyings with some fixup code in the vertex shader,
4782 * we could implement 16-bit flat varyings. Consider if this case matters.
4783 *
4784 * TODO: The texture coordinate handling could be less heavyhanded.
4785 */
4786static bool
4787bi_gather_texcoords(nir_builder *b, nir_instr *instr, void *data)
4788{
4789        uint64_t *mask = data;
4790
4791        if (instr->type != nir_instr_type_tex)
4792                return false;
4793
4794        nir_tex_instr *tex = nir_instr_as_tex(instr);
4795
4796        int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
4797        if (coord_idx < 0)
4798                return false;
4799
4800        nir_src src = tex->src[coord_idx].src;
4801        assert(src.is_ssa);
4802
4803        nir_ssa_scalar x = nir_ssa_scalar_resolved(src.ssa, 0);
4804        nir_ssa_scalar y = nir_ssa_scalar_resolved(src.ssa, 1);
4805
4806        if (x.def != y.def)
4807                return false;
4808
4809        nir_instr *parent = x.def->parent_instr;
4810
4811        if (parent->type != nir_instr_type_intrinsic)
4812                return false;
4813
4814        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
4815
4816        if (intr->intrinsic != nir_intrinsic_load_interpolated_input)
4817                return false;
4818
4819        nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
4820        *mask |= BITFIELD64_BIT(sem.location);
4821        return false;
4822}
4823
4824static uint64_t
4825bi_fp32_varying_mask(nir_shader *nir)
4826{
4827        uint64_t mask = 0;
4828
4829        assert(nir->info.stage == MESA_SHADER_FRAGMENT);
4830
4831        nir_foreach_shader_in_variable(var, nir) {
4832                if (var->data.interpolation == INTERP_MODE_FLAT)
4833                        mask |= BITFIELD64_BIT(var->data.location);
4834        }
4835
4836        nir_shader_instructions_pass(nir, bi_gather_texcoords, nir_metadata_all, &mask);
4837
4838        return mask;
4839}
4840
4841static void
4842bi_finalize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
4843{
4844        /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
4845         * (so we don't accidentally duplicate the epilogue since mesa/st has
4846         * messed with our I/O quite a bit already) */
4847
4848        NIR_PASS_V(nir, nir_lower_vars_to_ssa);
4849
4850        if (nir->info.stage == MESA_SHADER_VERTEX) {
4851                NIR_PASS_V(nir, nir_lower_viewport_transform);
4852                NIR_PASS_V(nir, nir_lower_point_size, 1.0, 0.0);
4853
4854                nir_variable *psiz = nir_find_variable_with_location(nir,
4855                                                                     nir_var_shader_out,
4856                                                                     VARYING_SLOT_PSIZ);
4857                if (psiz != NULL)
4858                        psiz->data.precision = GLSL_PRECISION_MEDIUM;
4859        }
4860
4861        /* Get rid of any global vars before we lower to scratch. */
4862        NIR_PASS_V(nir, nir_lower_global_vars_to_local);
4863
4864        /* Valhall introduces packed thread local storage, which improves cache
4865         * locality of TLS access. However, access to packed TLS cannot
4866         * straddle 16-byte boundaries. As such, when packed TLS is in use
4867         * (currently unconditional for Valhall), we force vec4 alignment for
4868         * scratch access.
4869         */
4870        bool packed_tls = (gpu_id >= 0x9000);
4871
4872        /* Lower large arrays to scratch and small arrays to bcsel */
4873        NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
4874                        packed_tls ?
4875                        glsl_get_vec4_size_align_bytes :
4876                        glsl_get_natural_size_align_bytes);
4877        NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
4878
4879        NIR_PASS_V(nir, nir_split_var_copies);
4880        NIR_PASS_V(nir, nir_lower_var_copies);
4881        NIR_PASS_V(nir, nir_lower_vars_to_ssa);
4882        NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
4883                        glsl_type_size, 0);
4884
4885        /* nir_lower[_explicit]_io is lazy and emits mul+add chains even for
4886         * offsets it could figure out are constant.  Do some constant folding
4887         * before bifrost_nir_lower_store_component below.
4888         */
4889        NIR_PASS_V(nir, nir_opt_constant_folding);
4890
4891        if (nir->info.stage == MESA_SHADER_FRAGMENT) {
4892                NIR_PASS_V(nir, nir_lower_mediump_io,
4893                           nir_var_shader_in | nir_var_shader_out,
4894                           ~bi_fp32_varying_mask(nir), false);
4895        } else {
4896                if (gpu_id >= 0x9000) {
4897                        NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,
4898                                        BITFIELD64_BIT(VARYING_SLOT_PSIZ), false);
4899                }
4900
4901                struct hash_table_u64 *stores = _mesa_hash_table_u64_create(NULL);
4902                NIR_PASS_V(nir, nir_shader_instructions_pass,
4903                                bifrost_nir_lower_store_component,
4904                                nir_metadata_block_index |
4905                                nir_metadata_dominance, stores);
4906                _mesa_hash_table_u64_destroy(stores);
4907        }
4908
4909        NIR_PASS_V(nir, nir_lower_ssbo);
4910        NIR_PASS_V(nir, pan_nir_lower_zs_store);
4911        NIR_PASS_V(nir, pan_lower_sample_pos);
4912        NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL);
4913
4914        if (nir->info.stage == MESA_SHADER_FRAGMENT) {
4915                NIR_PASS_V(nir, nir_shader_instructions_pass,
4916                                bifrost_nir_lower_i8_frag,
4917                                nir_metadata_block_index | nir_metadata_dominance,
4918                                NULL);
4919        }
4920
4921        if (nir->xfb_info != NULL && nir->info.has_transform_feedback_varyings) {
4922                NIR_PASS_V(nir, nir_io_add_const_offset_to_base,
4923                           nir_var_shader_in | nir_var_shader_out);
4924                NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info);
4925                NIR_PASS_V(nir, pan_lower_xfb);
4926        }
4927
4928        bi_optimize_nir(nir, gpu_id, is_blend);
4929}
4930
4931static bi_context *
4932bi_compile_variant_nir(nir_shader *nir,
4933                       const struct panfrost_compile_inputs *inputs,
4934                       struct util_dynarray *binary,
4935                       struct hash_table_u64 *sysval_to_id,
4936                       struct bi_shader_info info,
4937                       enum bi_idvs_mode idvs)
4938{
4939        bi_context *ctx = rzalloc(NULL, bi_context);
4940
4941        /* There may be another program in the dynarray, start at the end */
4942        unsigned offset = binary->size;
4943
4944        ctx->sysval_to_id = sysval_to_id;
4945        ctx->inputs = inputs;
4946        ctx->nir = nir;
4947        ctx->stage = nir->info.stage;
4948        ctx->quirks = bifrost_get_quirks(inputs->gpu_id);
4949        ctx->arch = inputs->gpu_id >> 12;
4950        ctx->info = info;
4951        ctx->idvs = idvs;
4952        ctx->malloc_idvs = (ctx->arch >= 9) && !inputs->no_idvs;
4953
4954        if (idvs != BI_IDVS_NONE) {
4955                /* Specializing shaders for IDVS is destructive, so we need to
4956                 * clone. However, the last (second) IDVS shader does not need
4957                 * to be preserved so we can skip cloning that one.
4958                 */
4959                if (offset == 0)
4960                        ctx->nir = nir = nir_shader_clone(ctx, nir);
4961
4962                NIR_PASS_V(nir, nir_shader_instructions_pass,
4963                           bifrost_nir_specialize_idvs,
4964                           nir_metadata_block_index | nir_metadata_dominance,
4965                           &idvs);
4966
4967                /* After specializing, clean up the mess */
4968                bool progress = true;
4969
4970                while (progress) {
4971                        progress = false;
4972
4973                        NIR_PASS(progress, nir, nir_opt_dce);
4974                        NIR_PASS(progress, nir, nir_opt_dead_cf);
4975                }
4976        }
4977
4978        /* We can only go out-of-SSA after speciailizing IDVS, as opt_dead_cf
4979         * doesn't know how to deal with nir_register.
4980         */
4981        NIR_PASS_V(nir, nir_convert_from_ssa, true);
4982
4983        /* If nothing is pushed, all UBOs need to be uploaded */
4984        ctx->ubo_mask = ~0;
4985
4986        list_inithead(&ctx->blocks);
4987
4988        bool skip_internal = nir->info.internal;
4989        skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL);
4990
4991        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
4992                nir_print_shader(nir, stdout);
4993        }
4994
4995        ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
4996
4997        nir_foreach_function(func, nir) {
4998                if (!func->impl)
4999                        continue;
5000
5001                ctx->ssa_alloc += func->impl->ssa_alloc;
5002                ctx->reg_alloc += func->impl->reg_alloc;
5003
5004                emit_cf_list(ctx, &func->impl->body);
5005                break; /* TODO: Multi-function shaders */
5006        }
5007
5008        /* Index blocks now that we're done emitting */
5009        bi_foreach_block(ctx, block) {
5010                block->index = ctx->num_blocks++;
5011        }
5012
5013        bi_validate(ctx, "NIR -> BIR");
5014
5015        /* If the shader doesn't write any colour or depth outputs, it may
5016         * still need an ATEST at the very end! */
5017        bool need_dummy_atest =
5018                (ctx->stage == MESA_SHADER_FRAGMENT) &&
5019                !ctx->emitted_atest &&
5020                !bi_skip_atest(ctx, false);
5021
5022        if (need_dummy_atest) {
5023                bi_block *end = list_last_entry(&ctx->blocks, bi_block, link);
5024                bi_builder b = bi_init_builder(ctx, bi_after_block(end));
5025                bi_emit_atest(&b, bi_zero());
5026        }
5027
5028        bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT);
5029
5030        /* Runs before constant folding */
5031        bi_lower_swizzle(ctx);
5032        bi_validate(ctx, "Early lowering");
5033
5034        /* Runs before copy prop */
5035        if (optimize && !ctx->inputs->no_ubo_to_push) {
5036                bi_opt_push_ubo(ctx);
5037        }
5038
5039        if (likely(optimize)) {
5040                bi_opt_copy_prop(ctx);
5041
5042                while (bi_opt_constant_fold(ctx))
5043                        bi_opt_copy_prop(ctx);
5044
5045                bi_opt_mod_prop_forward(ctx);
5046                bi_opt_mod_prop_backward(ctx);
5047
5048                /* Push LD_VAR_IMM/VAR_TEX instructions. Must run after
5049                 * mod_prop_backward to fuse VAR_TEX */
5050                if (ctx->arch == 7 && ctx->stage == MESA_SHADER_FRAGMENT &&
5051                    !(bifrost_debug & BIFROST_DBG_NOPRELOAD)) {
5052                        bi_opt_dead_code_eliminate(ctx);
5053                        bi_opt_message_preload(ctx);
5054                        bi_opt_copy_prop(ctx);
5055                }
5056
5057                bi_opt_dead_code_eliminate(ctx);
5058                bi_opt_cse(ctx);
5059                bi_opt_dead_code_eliminate(ctx);
5060                if (!ctx->inputs->no_ubo_to_push)
5061                        bi_opt_reorder_push(ctx);
5062                bi_validate(ctx, "Optimization passes");
5063        }
5064
5065        bi_foreach_instr_global(ctx, I) {
5066                bi_lower_opt_instruction(I);
5067        }
5068
5069        if (ctx->arch >= 9) {
5070                va_optimize(ctx);
5071
5072                bi_foreach_instr_global_safe(ctx, I) {
5073                        va_lower_isel(I);
5074                        va_lower_constants(ctx, I);
5075
5076                        bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
5077                        va_repair_fau(&b, I);
5078                }
5079
5080                /* We need to clean up after constant lowering */
5081                if (likely(optimize)) {
5082                        bi_opt_cse(ctx);
5083                        bi_opt_dead_code_eliminate(ctx);
5084                }
5085
5086                bi_validate(ctx, "Valhall passes");
5087        }
5088
5089        bi_foreach_block(ctx, block) {
5090                bi_lower_branch(ctx, block);
5091        }
5092
5093        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
5094                bi_print_shader(ctx, stdout);
5095
5096        /* Analyze before register allocation to avoid false dependencies. The
5097         * skip bit is a function of only the data flow graph and is invariant
5098         * under valid scheduling. Helpers are only defined for fragment
5099         * shaders, so this analysis is only required in fragment shaders.
5100         */
5101        if (ctx->stage == MESA_SHADER_FRAGMENT)
5102                bi_analyze_helper_requirements(ctx);
5103
5104        /* Fuse TEXC after analyzing helper requirements so the analysis
5105         * doesn't have to know about dual textures */
5106        if (likely(optimize)) {
5107                bi_opt_fuse_dual_texture(ctx);
5108        }
5109
5110        /* Lower FAU after fusing dual texture, because fusing dual texture
5111         * creates new immediates that themselves may need lowering.
5112         */
5113        if (ctx->arch <= 8) {
5114                bi_lower_fau(ctx);
5115        }
5116
5117        /* Lowering FAU can create redundant moves. Run CSE+DCE to clean up. */
5118        if (likely(optimize)) {
5119                bi_opt_cse(ctx);
5120                bi_opt_dead_code_eliminate(ctx);
5121        }
5122
5123        if (likely(!(bifrost_debug & BIFROST_DBG_NOPSCHED)))
5124                bi_pressure_schedule(ctx);
5125
5126        bi_validate(ctx, "Late lowering");
5127
5128        bi_register_allocate(ctx);
5129
5130        if (likely(optimize))
5131                bi_opt_post_ra(ctx);
5132
5133        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
5134                bi_print_shader(ctx, stdout);
5135
5136        if (ctx->arch >= 9) {
5137                va_assign_slots(ctx);
5138                va_insert_flow_control_nops(ctx);
5139                va_merge_flow(ctx);
5140                va_mark_last(ctx);
5141        } else {
5142                bi_schedule(ctx);
5143                bi_assign_scoreboard(ctx);
5144
5145                /* Analyze after scheduling since we depend on instruction
5146                 * order. Valhall calls as part of va_insert_flow_control_nops,
5147                 * as the handling for clauses differs from instructions.
5148                 */
5149                bi_analyze_helper_terminate(ctx);
5150                bi_mark_clauses_td(ctx);
5151        }
5152
5153        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
5154                bi_print_shader(ctx, stdout);
5155
5156        if (ctx->arch <= 8) {
5157                bi_pack_clauses(ctx, binary, offset);
5158        } else {
5159                bi_pack_valhall(ctx, binary);
5160        }
5161
5162        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
5163                if (ctx->arch <= 8) {
5164                        disassemble_bifrost(stdout, binary->data + offset,
5165                                            binary->size - offset,
5166                                            bifrost_debug & BIFROST_DBG_VERBOSE);
5167                } else {
5168                        disassemble_valhall(stdout, binary->data + offset,
5169                                            binary->size - offset,
5170                                            bifrost_debug & BIFROST_DBG_VERBOSE);
5171                }
5172
5173                fflush(stdout);
5174        }
5175
5176        if ((bifrost_debug & BIFROST_DBG_SHADERDB || inputs->shaderdb) &&
5177            !skip_internal) {
5178                if (ctx->arch >= 9) {
5179                        va_print_stats(ctx, binary->size - offset, stderr);
5180                } else {
5181                        bi_print_stats(ctx, binary->size - offset, stderr);
5182                }
5183        }
5184
5185        return ctx;
5186}
5187
5188static void
5189bi_compile_variant(nir_shader *nir,
5190                   const struct panfrost_compile_inputs *inputs,
5191                   struct util_dynarray *binary,
5192                   struct hash_table_u64 *sysval_to_id,
5193                   struct pan_shader_info *info,
5194                   enum bi_idvs_mode idvs)
5195{
5196        struct bi_shader_info local_info = {
5197                .push = &info->push,
5198                .bifrost = &info->bifrost,
5199                .tls_size = info->tls_size,
5200                .sysvals = &info->sysvals,
5201                .push_offset = info->push.count
5202        };
5203
5204        unsigned offset = binary->size;
5205
5206        /* If there is no position shader (gl_Position is not written), then
5207         * there is no need to build a varying shader either. This case is hit
5208         * for transform feedback only vertex shaders which only make sense with
5209         * rasterizer discard.
5210         */
5211        if ((offset == 0) && (idvs == BI_IDVS_VARYING))
5212                return;
5213
5214        /* Software invariant: Only a secondary shader can appear at a nonzero
5215         * offset, to keep the ABI simple. */
5216        assert((offset == 0) ^ (idvs == BI_IDVS_VARYING));
5217
5218        bi_context *ctx = bi_compile_variant_nir(nir, inputs, binary, sysval_to_id, local_info, idvs);
5219
5220        /* A register is preloaded <==> it is live before the first block */
5221        bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
5222        uint64_t preload = first_block->reg_live_in;
5223
5224        /* If multisampling is used with a blend shader, the blend shader needs
5225         * to access the sample coverage mask in r60 and the sample ID in r61.
5226         * Blend shaders run in the same context as fragment shaders, so if a
5227         * blend shader could run, we need to preload these registers
5228         * conservatively. There is believed to be little cost to doing so, so
5229         * do so always to avoid variants of the preload descriptor.
5230         *
5231         * We only do this on Valhall, as Bifrost has to update the RSD for
5232         * multisampling w/ blend shader anyway, so this is handled in the
5233         * driver. We could unify the paths if the cost is acceptable.
5234         */
5235        if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9)
5236                preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61);
5237
5238        info->ubo_mask |= ctx->ubo_mask;
5239        info->tls_size = MAX2(info->tls_size, ctx->info.tls_size);
5240
5241        if (idvs == BI_IDVS_VARYING) {
5242                info->vs.secondary_enable = (binary->size > offset);
5243                info->vs.secondary_offset = offset;
5244                info->vs.secondary_preload = preload;
5245                info->vs.secondary_work_reg_count = ctx->info.work_reg_count;
5246        } else {
5247                info->preload = preload;
5248                info->work_reg_count = ctx->info.work_reg_count;
5249        }
5250
5251        if (idvs == BI_IDVS_POSITION &&
5252            !nir->info.internal &&
5253            nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) {
5254                /* Find the psiz write */
5255                bi_instr *write = NULL;
5256
5257                bi_foreach_instr_global(ctx, I) {
5258                        if (I->op == BI_OPCODE_STORE_I16 && I->seg == BI_SEG_POS) {
5259                                write = I;
5260                                break;
5261                        }
5262                }
5263
5264                assert(write != NULL);
5265
5266                /* NOP it out, preserving its flow control. TODO: maybe DCE */
5267                if (write->flow) {
5268                        bi_builder b = bi_init_builder(ctx, bi_before_instr(write));
5269                        bi_instr *nop = bi_nop(&b);
5270                        nop->flow = write->flow;
5271                }
5272
5273                bi_remove_instruction(write);
5274
5275                info->vs.no_psiz_offset = binary->size;
5276                bi_pack_valhall(ctx, binary);
5277        }
5278
5279        ralloc_free(ctx);
5280}
5281
5282/* Decide if Index-Driven Vertex Shading should be used for a given shader */
5283static bool
5284bi_should_idvs(nir_shader *nir, const struct panfrost_compile_inputs *inputs)
5285{
5286        /* Opt-out */
5287        if (inputs->no_idvs || bifrost_debug & BIFROST_DBG_NOIDVS)
5288                return false;
5289
5290        /* IDVS splits up vertex shaders, not defined on other shader stages */
5291        if (nir->info.stage != MESA_SHADER_VERTEX)
5292                return false;
5293
5294        /* Bifrost cannot write gl_PointSize during IDVS */
5295        if ((inputs->gpu_id < 0x9000) &&
5296            nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ))
5297                return false;
5298
5299        /* Otherwise, IDVS is usually better */
5300        return true;
5301}
5302
5303void
5304bifrost_compile_shader_nir(nir_shader *nir,
5305                           const struct panfrost_compile_inputs *inputs,
5306                           struct util_dynarray *binary,
5307                           struct pan_shader_info *info)
5308{
5309        bifrost_debug = debug_get_option_bifrost_debug();
5310
5311        bi_finalize_nir(nir, inputs->gpu_id, inputs->is_blend);
5312        struct hash_table_u64 *sysval_to_id =
5313                panfrost_init_sysvals(&info->sysvals,
5314                                      inputs->fixed_sysval_layout,
5315                                      NULL);
5316
5317        info->tls_size = nir->scratch_size;
5318        info->vs.idvs = bi_should_idvs(nir, inputs);
5319
5320        if (info->vs.idvs) {
5321                bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_POSITION);
5322                bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_VARYING);
5323        } else {
5324                bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_NONE);
5325        }
5326
5327        if (gl_shader_stage_is_compute(nir->info.stage)) {
5328                /* Workgroups may be merged if the structure of the workgroup is
5329                 * not software visible. This is true if neither shared memory
5330                 * nor barriers are used. The hardware may be able to optimize
5331                 * compute shaders that set this flag.
5332                 */
5333                info->cs.allow_merging_workgroups =
5334                        (nir->info.shared_size == 0) &&
5335                        !nir->info.uses_control_barrier &&
5336                        !nir->info.uses_memory_barrier;
5337        }
5338
5339        info->ubo_mask &= (1 << nir->info.num_ubos) - 1;
5340
5341        _mesa_hash_table_u64_destroy(sysval_to_id);
5342}
5343