1/*
2 * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
3 * Copyright (C) 2019-2020 Collabora, Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include <sys/types.h>
26#include <sys/stat.h>
27#include <sys/mman.h>
28#include <fcntl.h>
29#include <stdint.h>
30#include <stdlib.h>
31#include <stdio.h>
32#include <err.h>
33
34#include "compiler/glsl/glsl_to_nir.h"
35#include "compiler/nir_types.h"
36#include "compiler/nir/nir_builder.h"
37#include "util/half_float.h"
38#include "util/u_math.h"
39#include "util/u_debug.h"
40#include "util/u_dynarray.h"
41#include "util/list.h"
42
43#include "midgard.h"
44#include "midgard_nir.h"
45#include "midgard_compile.h"
46#include "midgard_ops.h"
47#include "helpers.h"
48#include "compiler.h"
49#include "midgard_quirks.h"
50#include "panfrost/util/pan_lower_framebuffer.h"
51
52#include "disassemble.h"
53
54static const struct debug_named_value midgard_debug_options[] = {
55        {"msgs",      MIDGARD_DBG_MSGS,		"Print debug messages"},
56        {"shaders",   MIDGARD_DBG_SHADERS,	"Dump shaders in NIR and MIR"},
57        {"shaderdb",  MIDGARD_DBG_SHADERDB,     "Prints shader-db statistics"},
58        {"inorder",   MIDGARD_DBG_INORDER,      "Disables out-of-order scheduling"},
59        {"verbose",   MIDGARD_DBG_VERBOSE,      "Dump shaders verbosely"},
60        {"internal",  MIDGARD_DBG_INTERNAL,     "Dump internal shaders"},
61        DEBUG_NAMED_VALUE_END
62};
63
64DEBUG_GET_ONCE_FLAGS_OPTION(midgard_debug, "MIDGARD_MESA_DEBUG", midgard_debug_options, 0)
65
66int midgard_debug = 0;
67
68#define DBG(fmt, ...) \
69		do { if (midgard_debug & MIDGARD_DBG_MSGS) \
70			fprintf(stderr, "%s:%d: "fmt, \
71				__FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
72static midgard_block *
73create_empty_block(compiler_context *ctx)
74{
75        midgard_block *blk = rzalloc(ctx, midgard_block);
76
77        blk->base.predecessors = _mesa_set_create(blk,
78                        _mesa_hash_pointer,
79                        _mesa_key_pointer_equal);
80
81        blk->base.name = ctx->block_source_count++;
82
83        return blk;
84}
85
86static void
87schedule_barrier(compiler_context *ctx)
88{
89        midgard_block *temp = ctx->after_block;
90        ctx->after_block = create_empty_block(ctx);
91        ctx->block_count++;
92        list_addtail(&ctx->after_block->base.link, &ctx->blocks);
93        list_inithead(&ctx->after_block->base.instructions);
94        pan_block_add_successor(&ctx->current_block->base, &ctx->after_block->base);
95        ctx->current_block = ctx->after_block;
96        ctx->after_block = temp;
97}
98
99/* Helpers to generate midgard_instruction's using macro magic, since every
100 * driver seems to do it that way */
101
102#define EMIT(op, ...) emit_mir_instruction(ctx, v_##op(__VA_ARGS__));
103
104#define M_LOAD_STORE(name, store, T) \
105	static midgard_instruction m_##name(unsigned ssa, unsigned address) { \
106		midgard_instruction i = { \
107			.type = TAG_LOAD_STORE_4, \
108                        .mask = 0xF, \
109                        .dest = ~0, \
110                        .src = { ~0, ~0, ~0, ~0 }, \
111                        .swizzle = SWIZZLE_IDENTITY_4, \
112                        .op = midgard_op_##name, \
113			.load_store = { \
114				.signed_offset = address \
115			} \
116		}; \
117                \
118                if (store) { \
119                        i.src[0] = ssa; \
120                        i.src_types[0] = T; \
121                        i.dest_type = T; \
122                } else { \
123                        i.dest = ssa; \
124                        i.dest_type = T; \
125                } \
126		return i; \
127	}
128
129#define M_LOAD(name, T) M_LOAD_STORE(name, false, T)
130#define M_STORE(name, T) M_LOAD_STORE(name, true, T)
131
132M_LOAD(ld_attr_32, nir_type_uint32);
133M_LOAD(ld_vary_32, nir_type_uint32);
134M_LOAD(ld_ubo_u8, nir_type_uint32); /* mandatory extension to 32-bit */
135M_LOAD(ld_ubo_u16, nir_type_uint32);
136M_LOAD(ld_ubo_32, nir_type_uint32);
137M_LOAD(ld_ubo_64, nir_type_uint32);
138M_LOAD(ld_ubo_128, nir_type_uint32);
139M_LOAD(ld_u8, nir_type_uint8);
140M_LOAD(ld_u16, nir_type_uint16);
141M_LOAD(ld_32, nir_type_uint32);
142M_LOAD(ld_64, nir_type_uint32);
143M_LOAD(ld_128, nir_type_uint32);
144M_STORE(st_u8, nir_type_uint8);
145M_STORE(st_u16, nir_type_uint16);
146M_STORE(st_32, nir_type_uint32);
147M_STORE(st_64, nir_type_uint32);
148M_STORE(st_128, nir_type_uint32);
149M_LOAD(ld_tilebuffer_raw, nir_type_uint32);
150M_LOAD(ld_tilebuffer_16f, nir_type_float16);
151M_LOAD(ld_tilebuffer_32f, nir_type_float32);
152M_STORE(st_vary_32, nir_type_uint32);
153M_LOAD(ld_cubemap_coords, nir_type_uint32);
154M_LOAD(ldst_mov, nir_type_uint32);
155M_LOAD(ld_image_32f, nir_type_float32);
156M_LOAD(ld_image_16f, nir_type_float16);
157M_LOAD(ld_image_32u, nir_type_uint32);
158M_LOAD(ld_image_32i, nir_type_int32);
159M_STORE(st_image_32f, nir_type_float32);
160M_STORE(st_image_16f, nir_type_float16);
161M_STORE(st_image_32u, nir_type_uint32);
162M_STORE(st_image_32i, nir_type_int32);
163M_LOAD(lea_image, nir_type_uint64);
164
165#define M_IMAGE(op) \
166static midgard_instruction \
167op ## _image(nir_alu_type type, unsigned val, unsigned address) \
168{ \
169        switch (type) { \
170        case nir_type_float32: \
171                 return m_ ## op ## _image_32f(val, address); \
172        case nir_type_float16: \
173                 return m_ ## op ## _image_16f(val, address); \
174        case nir_type_uint32: \
175                 return m_ ## op ## _image_32u(val, address); \
176        case nir_type_int32: \
177                 return m_ ## op ## _image_32i(val, address); \
178        default: \
179                 unreachable("Invalid image type"); \
180        } \
181}
182
183M_IMAGE(ld);
184M_IMAGE(st);
185
186static midgard_instruction
187v_branch(bool conditional, bool invert)
188{
189        midgard_instruction ins = {
190                .type = TAG_ALU_4,
191                .unit = ALU_ENAB_BRANCH,
192                .compact_branch = true,
193                .branch = {
194                        .conditional = conditional,
195                        .invert_conditional = invert
196                },
197                .dest = ~0,
198                .src = { ~0, ~0, ~0, ~0 },
199        };
200
201        return ins;
202}
203
204static void
205attach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name)
206{
207        ins->has_constants = true;
208        memcpy(&ins->constants, constants, 16);
209}
210
211static int
212glsl_type_size(const struct glsl_type *type, bool bindless)
213{
214        return glsl_count_attribute_slots(type, false);
215}
216
217static bool
218midgard_nir_lower_global_load_instr(nir_builder *b, nir_instr *instr, void *data)
219{
220        if (instr->type != nir_instr_type_intrinsic)
221                return false;
222
223        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
224        if (intr->intrinsic != nir_intrinsic_load_global &&
225            intr->intrinsic != nir_intrinsic_load_shared)
226                return false;
227
228        unsigned compsz = nir_dest_bit_size(intr->dest);
229        unsigned totalsz = compsz * nir_dest_num_components(intr->dest);
230        /* 8, 16, 32, 64 and 128 bit loads don't need to be lowered */
231        if (util_bitcount(totalsz) < 2 && totalsz <= 128)
232                return false;
233
234        b->cursor = nir_before_instr(instr);
235
236        assert(intr->src[0].is_ssa);
237        nir_ssa_def *addr = intr->src[0].ssa;
238
239        nir_ssa_def *comps[MIR_VEC_COMPONENTS];
240        unsigned ncomps = 0;
241
242        while (totalsz) {
243                unsigned loadsz = MIN2(1 << (util_last_bit(totalsz) - 1), 128);
244                unsigned loadncomps = loadsz / compsz;
245
246                nir_ssa_def *load;
247                if (intr->intrinsic == nir_intrinsic_load_global) {
248                        load = nir_load_global(b, addr, compsz / 8, loadncomps, compsz);
249                } else {
250                        assert(intr->intrinsic == nir_intrinsic_load_shared);
251                        nir_intrinsic_instr *shared_load =
252                                nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_shared);
253                        shared_load->num_components = loadncomps;
254                        shared_load->src[0] = nir_src_for_ssa(addr);
255                        nir_intrinsic_set_align(shared_load, compsz / 8, 0);
256                        nir_intrinsic_set_base(shared_load, nir_intrinsic_base(intr));
257                        nir_ssa_dest_init(&shared_load->instr, &shared_load->dest,
258                                          shared_load->num_components, compsz, NULL);
259                        nir_builder_instr_insert(b, &shared_load->instr);
260                        load = &shared_load->dest.ssa;
261                }
262
263                for (unsigned i = 0; i < loadncomps; i++)
264                        comps[ncomps++] = nir_channel(b, load, i);
265
266                totalsz -= loadsz;
267                addr = nir_iadd(b, addr, nir_imm_intN_t(b, loadsz / 8, addr->bit_size));
268        }
269
270        assert(ncomps == nir_dest_num_components(intr->dest));
271        nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_vec(b, comps, ncomps));
272
273        return true;
274}
275
276static bool
277midgard_nir_lower_global_load(nir_shader *shader)
278{
279        return nir_shader_instructions_pass(shader,
280                                            midgard_nir_lower_global_load_instr,
281                                            nir_metadata_block_index | nir_metadata_dominance,
282                                            NULL);
283}
284
285static bool
286mdg_should_scalarize(const nir_instr *instr, const void *_unused)
287{
288        const nir_alu_instr *alu = nir_instr_as_alu(instr);
289
290        if (nir_src_bit_size(alu->src[0].src) == 64)
291                return true;
292
293        if (nir_dest_bit_size(alu->dest.dest) == 64)
294                return true;
295
296        switch (alu->op) {
297        case nir_op_fdot2:
298        case nir_op_umul_high:
299        case nir_op_imul_high:
300        case nir_op_pack_half_2x16:
301        case nir_op_unpack_half_2x16:
302                return true;
303        default:
304                return false;
305        }
306}
307
308/* Only vectorize int64 up to vec2 */
309static uint8_t
310midgard_vectorize_filter(const nir_instr *instr, const void *data)
311{
312        if (instr->type != nir_instr_type_alu)
313                return 0;
314
315        const nir_alu_instr *alu = nir_instr_as_alu(instr);
316        int src_bit_size = nir_src_bit_size(alu->src[0].src);
317        int dst_bit_size = nir_dest_bit_size(alu->dest.dest);
318
319        if (src_bit_size == 64 || dst_bit_size == 64)
320                return 2;
321
322        return 4;
323}
324
325static void
326optimise_nir(nir_shader *nir, unsigned quirks, bool is_blend, bool is_blit)
327{
328        bool progress;
329        unsigned lower_flrp =
330                (nir->options->lower_flrp16 ? 16 : 0) |
331                (nir->options->lower_flrp32 ? 32 : 0) |
332                (nir->options->lower_flrp64 ? 64 : 0);
333
334        NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
335        nir_lower_idiv_options idiv_options = {
336                .imprecise_32bit_lowering = true,
337                .allow_fp16 = true,
338        };
339        NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);
340
341        nir_lower_tex_options lower_tex_options = {
342                .lower_txs_lod = true,
343                .lower_txp = ~0,
344                .lower_tg4_broadcom_swizzle = true,
345                /* TODO: we have native gradient.. */
346                .lower_txd = true,
347                .lower_invalid_implicit_lod = true,
348        };
349
350        NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);
351
352
353        /* TEX_GRAD fails to apply sampler descriptor settings on some
354         * implementations, requiring a lowering. However, blit shaders do not
355         * use the affected settings and should skip the workaround.
356         */
357        if ((quirks & MIDGARD_BROKEN_LOD) && !is_blit)
358                NIR_PASS_V(nir, midgard_nir_lod_errata);
359
360        /* Midgard image ops coordinates are 16-bit instead of 32-bit */
361        NIR_PASS(progress, nir, midgard_nir_lower_image_bitsize);
362        NIR_PASS(progress, nir, midgard_nir_lower_helper_writes);
363        NIR_PASS(progress, nir, pan_lower_helper_invocation);
364        NIR_PASS(progress, nir, pan_lower_sample_pos);
365
366        if (nir->xfb_info != NULL && nir->info.has_transform_feedback_varyings) {
367                NIR_PASS_V(nir, nir_io_add_const_offset_to_base,
368                           nir_var_shader_in | nir_var_shader_out);
369                NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info);
370                NIR_PASS_V(nir, pan_lower_xfb);
371        }
372
373        NIR_PASS(progress, nir, midgard_nir_lower_algebraic_early);
374        NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_should_scalarize, NULL);
375
376        do {
377                progress = false;
378
379                NIR_PASS(progress, nir, nir_lower_var_copies);
380                NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
381
382                NIR_PASS(progress, nir, nir_copy_prop);
383                NIR_PASS(progress, nir, nir_opt_remove_phis);
384                NIR_PASS(progress, nir, nir_opt_dce);
385                NIR_PASS(progress, nir, nir_opt_dead_cf);
386                NIR_PASS(progress, nir, nir_opt_cse);
387                NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
388                NIR_PASS(progress, nir, nir_opt_algebraic);
389                NIR_PASS(progress, nir, nir_opt_constant_folding);
390
391                if (lower_flrp != 0) {
392                        bool lower_flrp_progress = false;
393                        NIR_PASS(lower_flrp_progress,
394                                 nir,
395                                 nir_lower_flrp,
396                                 lower_flrp,
397                                 false /* always_precise */);
398                        if (lower_flrp_progress) {
399                                NIR_PASS(progress, nir,
400                                         nir_opt_constant_folding);
401                                progress = true;
402                        }
403
404                        /* Nothing should rematerialize any flrps, so we only
405                         * need to do this lowering once.
406                         */
407                        lower_flrp = 0;
408                }
409
410                NIR_PASS(progress, nir, nir_opt_undef);
411                NIR_PASS(progress, nir, nir_lower_undef_to_zero);
412
413                NIR_PASS(progress, nir, nir_opt_loop_unroll);
414
415                NIR_PASS(progress, nir, nir_opt_vectorize,
416                         midgard_vectorize_filter, NULL);
417        } while (progress);
418
419        NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_should_scalarize, NULL);
420
421        /* Run after opts so it can hit more */
422        if (!is_blend)
423                NIR_PASS(progress, nir, nir_fuse_io_16);
424
425        /* Must be run at the end to prevent creation of fsin/fcos ops */
426        NIR_PASS(progress, nir, midgard_nir_scale_trig);
427
428        do {
429                progress = false;
430
431                NIR_PASS(progress, nir, nir_opt_dce);
432                NIR_PASS(progress, nir, nir_opt_algebraic);
433                NIR_PASS(progress, nir, nir_opt_constant_folding);
434                NIR_PASS(progress, nir, nir_copy_prop);
435        } while (progress);
436
437        NIR_PASS(progress, nir, nir_opt_algebraic_late);
438        NIR_PASS(progress, nir, nir_opt_algebraic_distribute_src_mods);
439
440        /* We implement booleans as 32-bit 0/~0 */
441        NIR_PASS(progress, nir, nir_lower_bool_to_int32);
442
443        /* Now that booleans are lowered, we can run out late opts */
444        NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late);
445        NIR_PASS(progress, nir, midgard_nir_cancel_inot);
446
447        NIR_PASS(progress, nir, nir_copy_prop);
448        NIR_PASS(progress, nir, nir_opt_dce);
449
450        /* Backend scheduler is purely local, so do some global optimizations
451         * to reduce register pressure. */
452        nir_move_options move_all =
453                nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
454                nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
455
456        NIR_PASS_V(nir, nir_opt_sink, move_all);
457        NIR_PASS_V(nir, nir_opt_move, move_all);
458
459        /* Take us out of SSA */
460        NIR_PASS(progress, nir, nir_lower_locals_to_regs);
461        NIR_PASS(progress, nir, nir_convert_from_ssa, true);
462
463        /* We are a vector architecture; write combine where possible */
464        NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest);
465        NIR_PASS(progress, nir, nir_lower_vec_to_movs, NULL, NULL);
466
467        NIR_PASS(progress, nir, nir_opt_dce);
468}
469
470/* Do not actually emit a load; instead, cache the constant for inlining */
471
472static void
473emit_load_const(compiler_context *ctx, nir_load_const_instr *instr)
474{
475        nir_ssa_def def = instr->def;
476
477        midgard_constants *consts = rzalloc(ctx, midgard_constants);
478
479        assert(instr->def.num_components * instr->def.bit_size <= sizeof(*consts) * 8);
480
481#define RAW_CONST_COPY(bits)                                         \
482        nir_const_value_to_array(consts->u##bits, instr->value,      \
483                                 instr->def.num_components, u##bits)
484
485        switch (instr->def.bit_size) {
486        case 64:
487                RAW_CONST_COPY(64);
488                break;
489        case 32:
490                RAW_CONST_COPY(32);
491                break;
492        case 16:
493                RAW_CONST_COPY(16);
494                break;
495        case 8:
496                RAW_CONST_COPY(8);
497                break;
498        default:
499                unreachable("Invalid bit_size for load_const instruction\n");
500        }
501
502        /* Shifted for SSA, +1 for off-by-one */
503        _mesa_hash_table_u64_insert(ctx->ssa_constants, (def.index << 1) + 1, consts);
504}
505
506/* Normally constants are embedded implicitly, but for I/O and such we have to
507 * explicitly emit a move with the constant source */
508
509static void
510emit_explicit_constant(compiler_context *ctx, unsigned node, unsigned to)
511{
512        void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, node + 1);
513
514        if (constant_value) {
515                midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), to);
516                attach_constants(ctx, &ins, constant_value, node + 1);
517                emit_mir_instruction(ctx, ins);
518        }
519}
520
521static bool
522nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components)
523{
524        unsigned comp = src->swizzle[0];
525
526        for (unsigned c = 1; c < nr_components; ++c) {
527                if (src->swizzle[c] != comp)
528                        return true;
529        }
530
531        return false;
532}
533
534#define ATOMIC_CASE_IMPL(ctx, instr, nir, op, is_shared) \
535        case nir_intrinsic_##nir: \
536                emit_atomic(ctx, instr, is_shared, midgard_op_##op, ~0); \
537                break;
538
539#define ATOMIC_CASE(ctx, instr, nir, op) \
540        ATOMIC_CASE_IMPL(ctx, instr, shared_atomic_##nir, atomic_##op, true); \
541        ATOMIC_CASE_IMPL(ctx, instr, global_atomic_##nir, atomic_##op, false);
542
543#define IMAGE_ATOMIC_CASE(ctx, instr, nir, op) \
544        case nir_intrinsic_image_atomic_##nir: { \
545                midgard_instruction ins = emit_image_op(ctx, instr, true); \
546                emit_atomic(ctx, instr, false, midgard_op_atomic_##op, ins.dest); \
547                break; \
548        }
549
550#define ALU_CASE(nir, _op) \
551	case nir_op_##nir: \
552		op = midgard_alu_op_##_op; \
553                assert(src_bitsize == dst_bitsize); \
554		break;
555
556#define ALU_CASE_RTZ(nir, _op) \
557	case nir_op_##nir: \
558		op = midgard_alu_op_##_op; \
559                roundmode = MIDGARD_RTZ; \
560		break;
561
562#define ALU_CHECK_CMP() \
563                assert(src_bitsize == 16 || src_bitsize == 32 || src_bitsize == 64); \
564                assert(dst_bitsize == 16 || dst_bitsize == 32); \
565
566#define ALU_CASE_BCAST(nir, _op, count) \
567        case nir_op_##nir: \
568                op = midgard_alu_op_##_op; \
569                broadcast_swizzle = count; \
570                ALU_CHECK_CMP(); \
571                break;
572
573#define ALU_CASE_CMP(nir, _op) \
574	case nir_op_##nir: \
575		op = midgard_alu_op_##_op; \
576                ALU_CHECK_CMP(); \
577                break;
578
579/* Compare mir_lower_invert */
580static bool
581nir_accepts_inot(nir_op op, unsigned src)
582{
583        switch (op) {
584        case nir_op_ior:
585        case nir_op_iand: /* TODO: b2f16 */
586        case nir_op_ixor:
587                return true;
588        case nir_op_b32csel:
589                /* Only the condition */
590                return (src == 0);
591        default:
592                return false;
593        }
594}
595
596static bool
597mir_accept_dest_mod(compiler_context *ctx, nir_dest **dest, nir_op op)
598{
599        if (pan_has_dest_mod(dest, op)) {
600                assert((*dest)->is_ssa);
601                BITSET_SET(ctx->already_emitted, (*dest)->ssa.index);
602                return true;
603        }
604
605        return false;
606}
607
608/* Look for floating point mods. We have the mods clamp_m1_1, clamp_0_1,
609 * and clamp_0_inf. We also have the relations (note 3 * 2 = 6 cases):
610 *
611 * clamp_0_1(clamp_0_inf(x))  = clamp_m1_1(x)
612 * clamp_0_1(clamp_m1_1(x))   = clamp_m1_1(x)
613 * clamp_0_inf(clamp_0_1(x))  = clamp_m1_1(x)
614 * clamp_0_inf(clamp_m1_1(x)) = clamp_m1_1(x)
615 * clamp_m1_1(clamp_0_1(x))   = clamp_m1_1(x)
616 * clamp_m1_1(clamp_0_inf(x)) = clamp_m1_1(x)
617 *
618 * So by cases any composition of output modifiers is equivalent to
619 * clamp_m1_1 alone.
620 */
621static unsigned
622mir_determine_float_outmod(compiler_context *ctx, nir_dest **dest, unsigned prior_outmod)
623{
624        bool clamp_0_inf = mir_accept_dest_mod(ctx, dest, nir_op_fclamp_pos_mali);
625        bool clamp_0_1 = mir_accept_dest_mod(ctx, dest, nir_op_fsat);
626        bool clamp_m1_1 = mir_accept_dest_mod(ctx, dest, nir_op_fsat_signed_mali);
627        bool prior = (prior_outmod != midgard_outmod_none);
628        int count = (int) prior + (int) clamp_0_inf + (int) clamp_0_1 + (int) clamp_m1_1;
629
630        return ((count > 1) || clamp_0_1) ?  midgard_outmod_clamp_0_1 :
631                                clamp_0_inf ? midgard_outmod_clamp_0_inf :
632                                clamp_m1_1 ?   midgard_outmod_clamp_m1_1 :
633                                prior_outmod;
634}
635
636static void
637mir_copy_src(midgard_instruction *ins, nir_alu_instr *instr, unsigned i, unsigned to, bool *abs, bool *neg, bool *not, enum midgard_roundmode *roundmode, bool is_int, unsigned bcast_count)
638{
639        nir_alu_src src = instr->src[i];
640
641        if (!is_int) {
642                if (pan_has_source_mod(&src, nir_op_fneg))
643                        *neg = !(*neg);
644
645                if (pan_has_source_mod(&src, nir_op_fabs))
646                        *abs = true;
647        }
648
649        if (nir_accepts_inot(instr->op, i) && pan_has_source_mod(&src, nir_op_inot))
650                *not = true;
651
652        if (roundmode) {
653                if (pan_has_source_mod(&src, nir_op_fround_even))
654                        *roundmode = MIDGARD_RTE;
655
656                if (pan_has_source_mod(&src, nir_op_ftrunc))
657                        *roundmode = MIDGARD_RTZ;
658
659                if (pan_has_source_mod(&src, nir_op_ffloor))
660                        *roundmode = MIDGARD_RTN;
661
662                if (pan_has_source_mod(&src, nir_op_fceil))
663                        *roundmode = MIDGARD_RTP;
664        }
665
666        unsigned bits = nir_src_bit_size(src.src);
667
668        ins->src[to] = nir_src_index(NULL, &src.src);
669        ins->src_types[to] = nir_op_infos[instr->op].input_types[i] | bits;
670
671        for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c) {
672                ins->swizzle[to][c] = src.swizzle[
673                        (!bcast_count || c < bcast_count) ? c :
674                                (bcast_count - 1)];
675        }
676}
677
678/* Midgard features both fcsel and icsel, depending on whether you want int or
679 * float modifiers. NIR's csel is typeless, so we want a heuristic to guess if
680 * we should emit an int or float csel depending on what modifiers could be
681 * placed. In the absense of modifiers, this is probably arbitrary. */
682
683static bool
684mir_is_bcsel_float(nir_alu_instr *instr)
685{
686        nir_op intmods[] = {
687                nir_op_i2i8, nir_op_i2i16,
688                nir_op_i2i32, nir_op_i2i64
689        };
690
691        nir_op floatmods[] = {
692                nir_op_fabs, nir_op_fneg,
693                nir_op_f2f16, nir_op_f2f32,
694                nir_op_f2f64
695        };
696
697        nir_op floatdestmods[] = {
698                nir_op_fsat, nir_op_fsat_signed_mali, nir_op_fclamp_pos_mali,
699                nir_op_f2f16, nir_op_f2f32
700        };
701
702        signed score = 0;
703
704        for (unsigned i = 1; i < 3; ++i) {
705                nir_alu_src s = instr->src[i];
706                for (unsigned q = 0; q < ARRAY_SIZE(intmods); ++q) {
707                        if (pan_has_source_mod(&s, intmods[q]))
708                                score--;
709                }
710        }
711
712        for (unsigned i = 1; i < 3; ++i) {
713                nir_alu_src s = instr->src[i];
714                for (unsigned q = 0; q < ARRAY_SIZE(floatmods); ++q) {
715                        if (pan_has_source_mod(&s, floatmods[q]))
716                                score++;
717                }
718        }
719
720        for (unsigned q = 0; q < ARRAY_SIZE(floatdestmods); ++q) {
721                nir_dest *dest = &instr->dest.dest;
722                if (pan_has_dest_mod(&dest, floatdestmods[q]))
723                        score++;
724        }
725
726        return (score > 0);
727}
728
729static void
730emit_alu(compiler_context *ctx, nir_alu_instr *instr)
731{
732        nir_dest *dest = &instr->dest.dest;
733
734        if (dest->is_ssa && BITSET_TEST(ctx->already_emitted, dest->ssa.index))
735                return;
736
737        /* Derivatives end up emitted on the texture pipe, not the ALUs. This
738         * is handled elsewhere */
739
740        if (instr->op == nir_op_fddx || instr->op == nir_op_fddy) {
741                midgard_emit_derivatives(ctx, instr);
742                return;
743        }
744
745        bool is_ssa = dest->is_ssa;
746
747        unsigned nr_components = nir_dest_num_components(*dest);
748        unsigned nr_inputs = nir_op_infos[instr->op].num_inputs;
749        unsigned op = 0;
750
751        /* Number of components valid to check for the instruction (the rest
752         * will be forced to the last), or 0 to use as-is. Relevant as
753         * ball-type instructions have a channel count in NIR but are all vec4
754         * in Midgard */
755
756        unsigned broadcast_swizzle = 0;
757
758        /* Should we swap arguments? */
759        bool flip_src12 = false;
760
761        ASSERTED unsigned src_bitsize = nir_src_bit_size(instr->src[0].src);
762        ASSERTED unsigned dst_bitsize = nir_dest_bit_size(*dest);
763
764        enum midgard_roundmode roundmode = MIDGARD_RTE;
765
766        switch (instr->op) {
767                ALU_CASE(fadd, fadd);
768                ALU_CASE(fmul, fmul);
769                ALU_CASE(fmin, fmin);
770                ALU_CASE(fmax, fmax);
771                ALU_CASE(imin, imin);
772                ALU_CASE(imax, imax);
773                ALU_CASE(umin, umin);
774                ALU_CASE(umax, umax);
775                ALU_CASE(ffloor, ffloor);
776                ALU_CASE(fround_even, froundeven);
777                ALU_CASE(ftrunc, ftrunc);
778                ALU_CASE(fceil, fceil);
779                ALU_CASE(fdot3, fdot3);
780                ALU_CASE(fdot4, fdot4);
781                ALU_CASE(iadd, iadd);
782                ALU_CASE(isub, isub);
783                ALU_CASE(iadd_sat, iaddsat);
784                ALU_CASE(isub_sat, isubsat);
785                ALU_CASE(uadd_sat, uaddsat);
786                ALU_CASE(usub_sat, usubsat);
787                ALU_CASE(imul, imul);
788                ALU_CASE(imul_high, imul);
789                ALU_CASE(umul_high, imul);
790                ALU_CASE(uclz, iclz);
791
792                /* Zero shoved as second-arg */
793                ALU_CASE(iabs, iabsdiff);
794
795                ALU_CASE(uabs_isub, iabsdiff);
796                ALU_CASE(uabs_usub, uabsdiff);
797
798                ALU_CASE(mov, imov);
799
800                ALU_CASE_CMP(feq32, feq);
801                ALU_CASE_CMP(fneu32, fne);
802                ALU_CASE_CMP(flt32, flt);
803                ALU_CASE_CMP(ieq32, ieq);
804                ALU_CASE_CMP(ine32, ine);
805                ALU_CASE_CMP(ilt32, ilt);
806                ALU_CASE_CMP(ult32, ult);
807
808                /* We don't have a native b2f32 instruction. Instead, like many
809                 * GPUs, we exploit booleans as 0/~0 for false/true, and
810                 * correspondingly AND
811                 * by 1.0 to do the type conversion. For the moment, prime us
812                 * to emit:
813                 *
814                 * iand [whatever], #0
815                 *
816                 * At the end of emit_alu (as MIR), we'll fix-up the constant
817                 */
818
819                ALU_CASE_CMP(b2f32, iand);
820                ALU_CASE_CMP(b2f16, iand);
821                ALU_CASE_CMP(b2i32, iand);
822
823                /* Likewise, we don't have a dedicated f2b32 instruction, but
824                 * we can do a "not equal to 0.0" test. */
825
826                ALU_CASE_CMP(f2b32, fne);
827                ALU_CASE_CMP(i2b32, ine);
828
829                ALU_CASE(frcp, frcp);
830                ALU_CASE(frsq, frsqrt);
831                ALU_CASE(fsqrt, fsqrt);
832                ALU_CASE(fexp2, fexp2);
833                ALU_CASE(flog2, flog2);
834
835                ALU_CASE_RTZ(f2i64, f2i_rte);
836                ALU_CASE_RTZ(f2u64, f2u_rte);
837                ALU_CASE_RTZ(i2f64, i2f_rte);
838                ALU_CASE_RTZ(u2f64, u2f_rte);
839
840                ALU_CASE_RTZ(f2i32, f2i_rte);
841                ALU_CASE_RTZ(f2u32, f2u_rte);
842                ALU_CASE_RTZ(i2f32, i2f_rte);
843                ALU_CASE_RTZ(u2f32, u2f_rte);
844
845                ALU_CASE_RTZ(f2i8, f2i_rte);
846                ALU_CASE_RTZ(f2u8, f2u_rte);
847
848                ALU_CASE_RTZ(f2i16, f2i_rte);
849                ALU_CASE_RTZ(f2u16, f2u_rte);
850                ALU_CASE_RTZ(i2f16, i2f_rte);
851                ALU_CASE_RTZ(u2f16, u2f_rte);
852
853                ALU_CASE(fsin, fsinpi);
854                ALU_CASE(fcos, fcospi);
855
856                /* We'll get 0 in the second arg, so:
857                 * ~a = ~(a | 0) = nor(a, 0) */
858                ALU_CASE(inot, inor);
859                ALU_CASE(iand, iand);
860                ALU_CASE(ior, ior);
861                ALU_CASE(ixor, ixor);
862                ALU_CASE(ishl, ishl);
863                ALU_CASE(ishr, iasr);
864                ALU_CASE(ushr, ilsr);
865
866                ALU_CASE_BCAST(b32all_fequal2, fball_eq, 2);
867                ALU_CASE_BCAST(b32all_fequal3, fball_eq, 3);
868                ALU_CASE_CMP(b32all_fequal4, fball_eq);
869
870                ALU_CASE_BCAST(b32any_fnequal2, fbany_neq, 2);
871                ALU_CASE_BCAST(b32any_fnequal3, fbany_neq, 3);
872                ALU_CASE_CMP(b32any_fnequal4, fbany_neq);
873
874                ALU_CASE_BCAST(b32all_iequal2, iball_eq, 2);
875                ALU_CASE_BCAST(b32all_iequal3, iball_eq, 3);
876                ALU_CASE_CMP(b32all_iequal4, iball_eq);
877
878                ALU_CASE_BCAST(b32any_inequal2, ibany_neq, 2);
879                ALU_CASE_BCAST(b32any_inequal3, ibany_neq, 3);
880                ALU_CASE_CMP(b32any_inequal4, ibany_neq);
881
882                /* Source mods will be shoved in later */
883                ALU_CASE(fabs, fmov);
884                ALU_CASE(fneg, fmov);
885                ALU_CASE(fsat, fmov);
886                ALU_CASE(fsat_signed_mali, fmov);
887                ALU_CASE(fclamp_pos_mali, fmov);
888
889        /* For size conversion, we use a move. Ideally though we would squash
890         * these ops together; maybe that has to happen after in NIR as part of
891         * propagation...? An earlier algebraic pass ensured we step down by
892         * only / exactly one size. If stepping down, we use a dest override to
893         * reduce the size; if stepping up, we use a larger-sized move with a
894         * half source and a sign/zero-extension modifier */
895
896        case nir_op_i2i8:
897        case nir_op_i2i16:
898        case nir_op_i2i32:
899        case nir_op_i2i64:
900        case nir_op_u2u8:
901        case nir_op_u2u16:
902        case nir_op_u2u32:
903        case nir_op_u2u64:
904        case nir_op_f2f16:
905        case nir_op_f2f32:
906        case nir_op_f2f64: {
907                if (instr->op == nir_op_f2f16 || instr->op == nir_op_f2f32 ||
908                    instr->op == nir_op_f2f64)
909                        op = midgard_alu_op_fmov;
910                else
911                        op = midgard_alu_op_imov;
912
913                break;
914        }
915
916        /* For greater-or-equal, we lower to less-or-equal and flip the
917         * arguments */
918
919        case nir_op_fge:
920        case nir_op_fge32:
921        case nir_op_ige32:
922        case nir_op_uge32: {
923                op =
924                        instr->op == nir_op_fge   ? midgard_alu_op_fle :
925                        instr->op == nir_op_fge32 ? midgard_alu_op_fle :
926                        instr->op == nir_op_ige32 ? midgard_alu_op_ile :
927                        instr->op == nir_op_uge32 ? midgard_alu_op_ule :
928                        0;
929
930                flip_src12 = true;
931                ALU_CHECK_CMP();
932                break;
933        }
934
935        case nir_op_b32csel: {
936                bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components);
937                bool is_float = mir_is_bcsel_float(instr);
938                op = is_float ?
939                        (mixed ? midgard_alu_op_fcsel_v : midgard_alu_op_fcsel) :
940                        (mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel);
941
942                break;
943        }
944
945        case nir_op_unpack_32_2x16:
946        case nir_op_unpack_32_4x8:
947        case nir_op_pack_32_2x16:
948        case nir_op_pack_32_4x8: {
949                op = midgard_alu_op_imov;
950                break;
951        }
952
953        default:
954                mesa_loge("Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
955                assert(0);
956                return;
957        }
958
959        /* Promote imov to fmov if it might help inline a constant */
960        if (op == midgard_alu_op_imov && nir_src_is_const(instr->src[0].src)
961                        && nir_src_bit_size(instr->src[0].src) == 32
962                        && nir_is_same_comp_swizzle(instr->src[0].swizzle,
963                                nir_src_num_components(instr->src[0].src))) {
964                op = midgard_alu_op_fmov;
965        }
966
967        /* Midgard can perform certain modifiers on output of an ALU op */
968
969        unsigned outmod = 0;
970        bool is_int = midgard_is_integer_op(op);
971
972        if (instr->op == nir_op_umul_high || instr->op == nir_op_imul_high) {
973                outmod = midgard_outmod_keephi;
974        } else if (midgard_is_integer_out_op(op)) {
975                outmod = midgard_outmod_keeplo;
976        } else if (instr->op == nir_op_fsat) {
977                outmod = midgard_outmod_clamp_0_1;
978        } else if (instr->op == nir_op_fsat_signed_mali) {
979                outmod = midgard_outmod_clamp_m1_1;
980        } else if (instr->op == nir_op_fclamp_pos_mali) {
981                outmod = midgard_outmod_clamp_0_inf;
982        }
983
984        /* Fetch unit, quirks, etc information */
985        unsigned opcode_props = alu_opcode_props[op].props;
986        bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24;
987
988        if (!midgard_is_integer_out_op(op)) {
989                outmod = mir_determine_float_outmod(ctx, &dest, outmod);
990        }
991
992        midgard_instruction ins = {
993                .type = TAG_ALU_4,
994                .dest = nir_dest_index(dest),
995                .dest_type = nir_op_infos[instr->op].output_type
996                        | nir_dest_bit_size(*dest),
997                .roundmode = roundmode,
998        };
999
1000        enum midgard_roundmode *roundptr = (opcode_props & MIDGARD_ROUNDS) ?
1001                &ins.roundmode : NULL;
1002
1003        for (unsigned i = nr_inputs; i < ARRAY_SIZE(ins.src); ++i)
1004                ins.src[i] = ~0;
1005
1006        if (quirk_flipped_r24) {
1007                ins.src[0] = ~0;
1008                mir_copy_src(&ins, instr, 0, 1, &ins.src_abs[1], &ins.src_neg[1], &ins.src_invert[1], roundptr, is_int, broadcast_swizzle);
1009        } else {
1010                for (unsigned i = 0; i < nr_inputs; ++i) {
1011                        unsigned to = i;
1012
1013                        if (instr->op == nir_op_b32csel) {
1014                                /* The condition is the first argument; move
1015                                 * the other arguments up one to be a binary
1016                                 * instruction for Midgard with the condition
1017                                 * last */
1018
1019                                if (i == 0)
1020                                        to = 2;
1021                                else if (flip_src12)
1022                                        to = 2 - i;
1023                                else
1024                                        to = i - 1;
1025                        } else if (flip_src12) {
1026                                to = 1 - to;
1027                        }
1028
1029                        mir_copy_src(&ins, instr, i, to, &ins.src_abs[to], &ins.src_neg[to], &ins.src_invert[to], roundptr, is_int, broadcast_swizzle);
1030
1031                        /* (!c) ? a : b = c ? b : a */
1032                        if (instr->op == nir_op_b32csel && ins.src_invert[2]) {
1033                                ins.src_invert[2] = false;
1034                                flip_src12 ^= true;
1035                        }
1036                }
1037        }
1038
1039        if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) {
1040                /* Lowered to move */
1041                if (instr->op == nir_op_fneg)
1042                        ins.src_neg[1] ^= true;
1043
1044                if (instr->op == nir_op_fabs)
1045                        ins.src_abs[1] = true;
1046        }
1047
1048        ins.mask = mask_of(nr_components);
1049
1050        /* Apply writemask if non-SSA, keeping in mind that we can't write to
1051         * components that don't exist. Note modifier => SSA => !reg => no
1052         * writemask, so we don't have to worry about writemasks here.*/
1053
1054        if (!is_ssa)
1055                ins.mask &= instr->dest.write_mask;
1056
1057        ins.op = op;
1058        ins.outmod = outmod;
1059
1060        /* Late fixup for emulated instructions */
1061
1062        if (instr->op == nir_op_b2f32 || instr->op == nir_op_b2i32) {
1063                /* Presently, our second argument is an inline #0 constant.
1064                 * Switch over to an embedded 1.0 constant (that can't fit
1065                 * inline, since we're 32-bit, not 16-bit like the inline
1066                 * constants) */
1067
1068                ins.has_inline_constant = false;
1069                ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
1070                ins.src_types[1] = nir_type_float32;
1071                ins.has_constants = true;
1072
1073                if (instr->op == nir_op_b2f32)
1074                        ins.constants.f32[0] = 1.0f;
1075                else
1076                        ins.constants.i32[0] = 1;
1077
1078                for (unsigned c = 0; c < 16; ++c)
1079                        ins.swizzle[1][c] = 0;
1080        } else if (instr->op == nir_op_b2f16) {
1081                ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
1082                ins.src_types[1] = nir_type_float16;
1083                ins.has_constants = true;
1084                ins.constants.i16[0] = _mesa_float_to_half(1.0);
1085
1086                for (unsigned c = 0; c < 16; ++c)
1087                        ins.swizzle[1][c] = 0;
1088        } else if (nr_inputs == 1 && !quirk_flipped_r24) {
1089                /* Lots of instructions need a 0 plonked in */
1090                ins.has_inline_constant = false;
1091                ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
1092                ins.src_types[1] = ins.src_types[0];
1093                ins.has_constants = true;
1094                ins.constants.u32[0] = 0;
1095
1096                for (unsigned c = 0; c < 16; ++c)
1097                        ins.swizzle[1][c] = 0;
1098        } else if (instr->op == nir_op_pack_32_2x16) {
1099                ins.dest_type = nir_type_uint16;
1100                ins.mask = mask_of(nr_components * 2);
1101                ins.is_pack = true;
1102        } else if (instr->op == nir_op_pack_32_4x8) {
1103                ins.dest_type = nir_type_uint8;
1104                ins.mask = mask_of(nr_components * 4);
1105                ins.is_pack = true;
1106        } else if (instr->op == nir_op_unpack_32_2x16) {
1107                ins.dest_type = nir_type_uint32;
1108                ins.mask = mask_of(nr_components >> 1);
1109                ins.is_pack = true;
1110        } else if (instr->op == nir_op_unpack_32_4x8) {
1111                ins.dest_type = nir_type_uint32;
1112                ins.mask = mask_of(nr_components >> 2);
1113                ins.is_pack = true;
1114        }
1115
1116        if ((opcode_props & UNITS_ALL) == UNIT_VLUT) {
1117                /* To avoid duplicating the lookup tables (probably), true LUT
1118                 * instructions can only operate as if they were scalars. Lower
1119                 * them here by changing the component. */
1120
1121                unsigned orig_mask = ins.mask;
1122
1123                unsigned swizzle_back[MIR_VEC_COMPONENTS];
1124                memcpy(&swizzle_back, ins.swizzle[0], sizeof(swizzle_back));
1125
1126                midgard_instruction ins_split[MIR_VEC_COMPONENTS];
1127                unsigned ins_count = 0;
1128
1129                for (int i = 0; i < nr_components; ++i) {
1130                        /* Mask the associated component, dropping the
1131                         * instruction if needed */
1132
1133                        ins.mask = 1 << i;
1134                        ins.mask &= orig_mask;
1135
1136                        for (unsigned j = 0; j < ins_count; ++j) {
1137                                if (swizzle_back[i] == ins_split[j].swizzle[0][0]) {
1138                                        ins_split[j].mask |= ins.mask;
1139                                        ins.mask = 0;
1140                                        break;
1141                                }
1142                        }
1143
1144                        if (!ins.mask)
1145                                continue;
1146
1147                        for (unsigned j = 0; j < MIR_VEC_COMPONENTS; ++j)
1148                                ins.swizzle[0][j] = swizzle_back[i]; /* Pull from the correct component */
1149
1150                        ins_split[ins_count] = ins;
1151
1152                        ++ins_count;
1153                }
1154
1155                for (unsigned i = 0; i < ins_count; ++i) {
1156                        emit_mir_instruction(ctx, ins_split[i]);
1157                }
1158        } else {
1159                emit_mir_instruction(ctx, ins);
1160        }
1161}
1162
1163#undef ALU_CASE
1164
1165static void
1166mir_set_intr_mask(nir_instr *instr, midgard_instruction *ins, bool is_read)
1167{
1168        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1169        unsigned nir_mask = 0;
1170        unsigned dsize = 0;
1171
1172        if (is_read) {
1173                nir_mask = mask_of(nir_intrinsic_dest_components(intr));
1174
1175                /* Extension is mandatory for 8/16-bit loads */
1176                dsize = nir_dest_bit_size(intr->dest) == 64 ? 64 : 32;
1177        } else {
1178                nir_mask = nir_intrinsic_write_mask(intr);
1179                dsize = OP_IS_COMMON_STORE(ins->op) ?
1180                        nir_src_bit_size(intr->src[0]) : 32;
1181        }
1182
1183        /* Once we have the NIR mask, we need to normalize to work in 32-bit space */
1184        unsigned bytemask = pan_to_bytemask(dsize, nir_mask);
1185        ins->dest_type = nir_type_uint | dsize;
1186        mir_set_bytemask(ins, bytemask);
1187}
1188
1189/* Uniforms and UBOs use a shared code path, as uniforms are just (slightly
1190 * optimized) versions of UBO #0 */
1191
1192static midgard_instruction *
1193emit_ubo_read(
1194        compiler_context *ctx,
1195        nir_instr *instr,
1196        unsigned dest,
1197        unsigned offset,
1198        nir_src *indirect_offset,
1199        unsigned indirect_shift,
1200        unsigned index,
1201        unsigned nr_comps)
1202{
1203        midgard_instruction ins;
1204
1205        unsigned dest_size = (instr->type == nir_instr_type_intrinsic) ?
1206                nir_dest_bit_size(nir_instr_as_intrinsic(instr)->dest) : 32;
1207
1208        unsigned bitsize = dest_size * nr_comps;
1209
1210        /* Pick the smallest intrinsic to avoid out-of-bounds reads */
1211        if (bitsize <= 8)
1212                ins = m_ld_ubo_u8(dest, 0);
1213        else if (bitsize <= 16)
1214                ins = m_ld_ubo_u16(dest, 0);
1215        else if (bitsize <= 32)
1216                ins = m_ld_ubo_32(dest, 0);
1217        else if (bitsize <= 64)
1218                ins = m_ld_ubo_64(dest, 0);
1219        else if (bitsize <= 128)
1220                ins = m_ld_ubo_128(dest, 0);
1221        else
1222                unreachable("Invalid UBO read size");
1223
1224        ins.constants.u32[0] = offset;
1225
1226        if (instr->type == nir_instr_type_intrinsic)
1227                mir_set_intr_mask(instr, &ins, true);
1228
1229        if (indirect_offset) {
1230                ins.src[2] = nir_src_index(ctx, indirect_offset);
1231                ins.src_types[2] = nir_type_uint32;
1232                ins.load_store.index_shift = indirect_shift;
1233
1234                /* X component for the whole swizzle to prevent register
1235                 * pressure from ballooning from the extra components */
1236                for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[2]); ++i)
1237                        ins.swizzle[2][i] = 0;
1238        } else {
1239                ins.load_store.index_reg = REGISTER_LDST_ZERO;
1240        }
1241
1242        if (indirect_offset && indirect_offset->is_ssa && !indirect_shift)
1243                mir_set_ubo_offset(&ins, indirect_offset, offset);
1244
1245        midgard_pack_ubo_index_imm(&ins.load_store, index);
1246
1247        return emit_mir_instruction(ctx, ins);
1248}
1249
1250/* Globals are like UBOs if you squint. And shared memory is like globals if
1251 * you squint even harder */
1252
1253static void
1254emit_global(
1255        compiler_context *ctx,
1256        nir_instr *instr,
1257        bool is_read,
1258        unsigned srcdest,
1259        nir_src *offset,
1260        unsigned seg)
1261{
1262        midgard_instruction ins;
1263
1264        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1265        if (is_read) {
1266                unsigned bitsize = nir_dest_bit_size(intr->dest) *
1267                        nir_dest_num_components(intr->dest);
1268
1269                switch (bitsize) {
1270                case 8: ins = m_ld_u8(srcdest, 0); break;
1271                case 16: ins = m_ld_u16(srcdest, 0); break;
1272                case 32: ins = m_ld_32(srcdest, 0); break;
1273                case 64: ins = m_ld_64(srcdest, 0); break;
1274                case 128: ins = m_ld_128(srcdest, 0); break;
1275                default: unreachable("Invalid global read size");
1276                }
1277
1278                mir_set_intr_mask(instr, &ins, is_read);
1279
1280                /* For anything not aligned on 32bit, make sure we write full
1281                 * 32 bits registers. */
1282                if (bitsize & 31) {
1283                        unsigned comps_per_32b = 32 / nir_dest_bit_size(intr->dest);
1284
1285                        for (unsigned c = 0; c < 4 * comps_per_32b; c += comps_per_32b) {
1286                                if (!(ins.mask & BITFIELD_RANGE(c, comps_per_32b)))
1287                                        continue;
1288
1289                                unsigned base = ~0;
1290                                for (unsigned i = 0; i < comps_per_32b; i++) {
1291                                        if (ins.mask & BITFIELD_BIT(c + i)) {
1292                                                base = ins.swizzle[0][c + i];
1293                                                break;
1294                                        }
1295                                }
1296
1297                                assert(base != ~0);
1298
1299                                for (unsigned i = 0; i < comps_per_32b; i++) {
1300                                        if (!(ins.mask & BITFIELD_BIT(c + i))) {
1301                                                ins.swizzle[0][c + i] = base + i;
1302                                                ins.mask |= BITFIELD_BIT(c + i);
1303                                        }
1304                                        assert(ins.swizzle[0][c + i] == base + i);
1305                                }
1306                        }
1307
1308                }
1309        } else {
1310                unsigned bitsize = nir_src_bit_size(intr->src[0]) *
1311                        nir_src_num_components(intr->src[0]);
1312
1313                if (bitsize == 8)
1314                        ins = m_st_u8(srcdest, 0);
1315                else if (bitsize == 16)
1316                        ins = m_st_u16(srcdest, 0);
1317                else if (bitsize <= 32)
1318                        ins = m_st_32(srcdest, 0);
1319                else if (bitsize <= 64)
1320                        ins = m_st_64(srcdest, 0);
1321                else if (bitsize <= 128)
1322                        ins = m_st_128(srcdest, 0);
1323                else
1324                        unreachable("Invalid global store size");
1325
1326                mir_set_intr_mask(instr, &ins, is_read);
1327        }
1328
1329        mir_set_offset(ctx, &ins, offset, seg);
1330
1331        /* Set a valid swizzle for masked out components */
1332        assert(ins.mask);
1333        unsigned first_component = __builtin_ffs(ins.mask) - 1;
1334
1335        for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i) {
1336                if (!(ins.mask & (1 << i)))
1337                        ins.swizzle[0][i] = first_component;
1338        }
1339
1340        emit_mir_instruction(ctx, ins);
1341}
1342
1343/* If is_shared is off, the only other possible value are globals, since
1344 * SSBO's are being lowered to globals through a NIR pass.
1345 * `image_direct_address` should be ~0 when instr is not an image_atomic
1346 * and the destination register of a lea_image op when it is an image_atomic. */
1347static void
1348emit_atomic(
1349        compiler_context *ctx,
1350        nir_intrinsic_instr *instr,
1351        bool is_shared,
1352        midgard_load_store_op op,
1353        unsigned image_direct_address)
1354{
1355        nir_alu_type type =
1356                (op == midgard_op_atomic_imin || op == midgard_op_atomic_imax) ?
1357                nir_type_int : nir_type_uint;
1358
1359        bool is_image = image_direct_address != ~0;
1360
1361        unsigned dest = nir_dest_index(&instr->dest);
1362        unsigned val_src = is_image ? 3 : 1;
1363        unsigned val = nir_src_index(ctx, &instr->src[val_src]);
1364        unsigned bitsize = nir_src_bit_size(instr->src[val_src]);
1365        emit_explicit_constant(ctx, val, val);
1366
1367        midgard_instruction ins = {
1368                .type = TAG_LOAD_STORE_4,
1369                .mask = 0xF,
1370                .dest = dest,
1371                .src = { ~0, ~0, ~0, val },
1372                .src_types = { 0, 0, 0, type | bitsize },
1373                .op = op
1374        };
1375
1376        nir_src *src_offset = nir_get_io_offset_src(instr);
1377
1378        if (op == midgard_op_atomic_cmpxchg) {
1379                unsigned xchg_val_src = is_image ? 4 : 2;
1380                unsigned xchg_val = nir_src_index(ctx, &instr->src[xchg_val_src]);
1381                emit_explicit_constant(ctx, xchg_val, xchg_val);
1382
1383                ins.src[2] = val;
1384                ins.src_types[2] = type | bitsize;
1385                ins.src[3] = xchg_val;
1386
1387                if (is_shared) {
1388                        ins.load_store.arg_reg = REGISTER_LDST_LOCAL_STORAGE_PTR;
1389                        ins.load_store.arg_comp = COMPONENT_Z;
1390                        ins.load_store.bitsize_toggle = true;
1391                } else {
1392                        for(unsigned i = 0; i < 2; ++i)
1393                                ins.swizzle[1][i] = i;
1394
1395                        ins.src[1] = is_image ? image_direct_address :
1396                                                nir_src_index(ctx, src_offset);
1397                        ins.src_types[1] = nir_type_uint64;
1398                }
1399        } else if (is_image) {
1400                for(unsigned i = 0; i < 2; ++i)
1401                        ins.swizzle[2][i] = i;
1402
1403                ins.src[2] = image_direct_address;
1404                ins.src_types[2] = nir_type_uint64;
1405
1406                ins.load_store.arg_reg = REGISTER_LDST_ZERO;
1407                ins.load_store.bitsize_toggle = true;
1408                ins.load_store.index_format = midgard_index_address_u64;
1409        } else
1410                mir_set_offset(ctx, &ins, src_offset, is_shared ? LDST_SHARED : LDST_GLOBAL);
1411
1412        mir_set_intr_mask(&instr->instr, &ins, true);
1413
1414        emit_mir_instruction(ctx, ins);
1415}
1416
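/* Emits a load of a (possibly indirectly addressed) varying. The opcode is
 * picked from the destination type, and float16 masks are padded to an even
 * component count so the result can be packed later. */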
1417static void
1418emit_varying_read(
1419        compiler_context *ctx,
1420        unsigned dest, unsigned offset,
1421        unsigned nr_comp, unsigned component,
1422        nir_src *indirect_offset, nir_alu_type type, bool flat)
1423{
1424        /* XXX: Half-floats? */
1425        /* TODO: swizzle, mask */
1426
1427        midgard_instruction ins = m_ld_vary_32(dest, PACK_LDST_ATTRIB_OFS(offset));
1428        ins.mask = mask_of(nr_comp);
1429        ins.dest_type = type;
1430
1431        if (type == nir_type_float16) {
1432                /* Ensure we are aligned so we can pack it later */
1433                ins.mask = mask_of(ALIGN_POT(nr_comp, 2));
1434        }
1435
1436        for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i)
1437                ins.swizzle[0][i] = MIN2(i + component, COMPONENT_W);
1438
1440        midgard_varying_params p = {
1441                .flat_shading = flat,
1442                .perspective_correction = 1,
1443                .interpolate_sample = true,
1444        };
1445        midgard_pack_varying_params(&ins.load_store, p);
1446
1447        if (indirect_offset) {
1448                ins.src[2] = nir_src_index(ctx, indirect_offset);
1449                ins.src_types[2] = nir_type_uint32;
1450        } else
1451                ins.load_store.index_reg = REGISTER_LDST_ZERO;
1452
1453        ins.load_store.arg_reg = REGISTER_LDST_ZERO;
1454        ins.load_store.index_format = midgard_index_address_u32;
1455
1456        /* Use the type appropriate load */
1457        switch (type) {
1458        case nir_type_uint32:
1459        case nir_type_bool32:
1460                ins.op = midgard_op_ld_vary_32u;
1461                break;
1462        case nir_type_int32:
1463                ins.op = midgard_op_ld_vary_32i;
1464                break;
1465        case nir_type_float32:
1466                ins.op = midgard_op_ld_vary_32;
1467                break;
1468        case nir_type_float16:
1469                ins.op = midgard_op_ld_vary_16;
1470                break;
1471        default:
1472                unreachable("Attempted to load unknown type");
1473                break;
1474        }
1475
1476        emit_mir_instruction(ctx, ins);
1477}
1478
/* If `is_atomic` is true, we emit a `lea_image`, since Midgard doesn't have
 * dedicated image_atomic opcodes. The caller can then use that address to
 * emit a normal atomic opcode. */
1482static midgard_instruction
1483emit_image_op(compiler_context *ctx, nir_intrinsic_instr *instr, bool is_atomic)
1484{
1485        enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1486        unsigned nr_attr = ctx->stage == MESA_SHADER_VERTEX ?
1487                util_bitcount64(ctx->nir->info.inputs_read) : 0;
1488        unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);
1489        bool is_array = nir_intrinsic_image_array(instr);
1490        bool is_store = instr->intrinsic == nir_intrinsic_image_store;
1491
1492        /* TODO: MSAA */
1493        assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");
1494
1495        unsigned coord_reg = nir_src_index(ctx, &instr->src[1]);
1496        emit_explicit_constant(ctx, coord_reg, coord_reg);
1497
1498        nir_src *index = &instr->src[0];
1499        bool is_direct = nir_src_is_const(*index);
1500
1501        /* For image opcodes, address is used as an index into the attribute descriptor */
1502        unsigned address = nr_attr;
1503        if (is_direct)
1504                address += nir_src_as_uint(*index);
1505
1506        midgard_instruction ins;
1507        if (is_store) { /* emit st_image_* */
1508                unsigned val = nir_src_index(ctx, &instr->src[3]);
1509                emit_explicit_constant(ctx, val, val);
1510
1511                nir_alu_type type = nir_intrinsic_src_type(instr);
1512                ins = st_image(type, val, PACK_LDST_ATTRIB_OFS(address));
1513                nir_alu_type base_type = nir_alu_type_get_base_type(type);
1514                ins.src_types[0] = base_type | nir_src_bit_size(instr->src[3]);
1515        } else if (is_atomic) { /* emit lea_image */
1516                unsigned dest = make_compiler_temp_reg(ctx);
1517                ins = m_lea_image(dest, PACK_LDST_ATTRIB_OFS(address));
1518                ins.mask = mask_of(2); /* 64-bit memory address */
1519        } else { /* emit ld_image_* */
1520                nir_alu_type type = nir_intrinsic_dest_type(instr);
1521                ins = ld_image(type, nir_dest_index(&instr->dest), PACK_LDST_ATTRIB_OFS(address));
1522                ins.mask = mask_of(nir_intrinsic_dest_components(instr));
1523                ins.dest_type = type;
1524        }
1525
1526        /* Coord reg */
1527        ins.src[1] = coord_reg;
1528        ins.src_types[1] = nir_type_uint16;
1529        if (nr_dim == 3 || is_array) {
1530                ins.load_store.bitsize_toggle = true;
1531        }
1532
1533        /* Image index reg */
1534        if (!is_direct) {
1535                ins.src[2] = nir_src_index(ctx, index);
1536                ins.src_types[2] = nir_type_uint32;
1537        } else
1538                ins.load_store.index_reg = REGISTER_LDST_ZERO;
1539
1540        emit_mir_instruction(ctx, ins);
1541
1542        return ins;
1543}
1544
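/* Emits a direct attribute load, selecting the opcode from the base type of
 * the attribute (int/uint/float). */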
1545static void
1546emit_attr_read(
1547        compiler_context *ctx,
1548        unsigned dest, unsigned offset,
1549        unsigned nr_comp, nir_alu_type t)
1550{
1551        midgard_instruction ins = m_ld_attr_32(dest, PACK_LDST_ATTRIB_OFS(offset));
1552        ins.load_store.arg_reg = REGISTER_LDST_ZERO;
1553        ins.load_store.index_reg = REGISTER_LDST_ZERO;
1554        ins.mask = mask_of(nr_comp);
1555
1556        /* Use the type appropriate load */
1557        switch (t) {
1558        case nir_type_uint:
1559        case nir_type_bool:
1560                ins.op = midgard_op_ld_attr_32u;
1561                break;
1562        case nir_type_int:
1563                ins.op = midgard_op_ld_attr_32i;
1564                break;
1565        case nir_type_float:
1566                ins.op = midgard_op_ld_attr_32;
1567                break;
1568        default:
1569                unreachable("Attempted to load unknown type");
1570                break;
1571        }
1572
1573        emit_mir_instruction(ctx, ins);
1574}
1575
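/* Sysvals are read like uniforms: look up the sysval's slot and emit a UBO
 * read from the dedicated sysval UBO into the NIR destination. These reads
 * are never indirect. */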
1576static void
1577emit_sysval_read(compiler_context *ctx, nir_instr *instr,
1578                unsigned nr_components, unsigned offset)
1579{
1580        nir_dest nir_dest;
1581
1582        /* Figure out which uniform this is */
1583        unsigned sysval_ubo = ctx->inputs->fixed_sysval_ubo >= 0 ?
1584                              ctx->inputs->fixed_sysval_ubo :
1585                              ctx->nir->info.num_ubos;
1586        int sysval = panfrost_sysval_for_instr(instr, &nir_dest);
1587        unsigned dest = nir_dest_index(&nir_dest);
1588        unsigned uniform =
1589                pan_lookup_sysval(ctx->sysval_to_id, &ctx->info->sysvals, sysval);
1590
1591        /* Emit the read itself -- this is never indirect */
1592        midgard_instruction *ins =
1593                emit_ubo_read(ctx, instr, dest, (uniform * 16) + offset, NULL, 0,
1594                              sysval_ubo, nr_components);
1595
1596        ins->mask = mask_of(nr_components);
1597}
1598
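/* Maps a compute builtin intrinsic to the ld/st argument register used to
 * source it in emit_compute_builtin(). */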
1599static unsigned
1600compute_builtin_arg(nir_intrinsic_op op)
1601{
1602        switch (op) {
1603        case nir_intrinsic_load_workgroup_id:
1604                return REGISTER_LDST_GROUP_ID;
1605        case nir_intrinsic_load_local_invocation_id:
1606                return REGISTER_LDST_LOCAL_THREAD_ID;
1607        case nir_intrinsic_load_global_invocation_id:
1608        case nir_intrinsic_load_global_invocation_id_zero_base:
1609                return REGISTER_LDST_GLOBAL_THREAD_ID;
1610        default:
                unreachable("Invalid compute parameter loaded");
1612        }
1613}
1614
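/* Fragment writeout is expressed as a branch carrying the colour (and
 * optionally depth/stencil) sources. At most one writeout branch is emitted
 * per render target and sample iteration, hence the assert below. */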
1615static void
1616emit_fragment_store(compiler_context *ctx, unsigned src, unsigned src_z, unsigned src_s,
1617                    enum midgard_rt_id rt, unsigned sample_iter)
1618{
1619        assert(rt < ARRAY_SIZE(ctx->writeout_branch));
1620        assert(sample_iter < ARRAY_SIZE(ctx->writeout_branch[0]));
1621
1622        midgard_instruction *br = ctx->writeout_branch[rt][sample_iter];
1623
1624        assert(!br);
1625
1626        emit_explicit_constant(ctx, src, src);
1627
1628        struct midgard_instruction ins =
1629                v_branch(false, false);
1630
1631        bool depth_only = (rt == MIDGARD_ZS_RT);
1632
1633        ins.writeout = depth_only ? 0 : PAN_WRITEOUT_C;
1634
1635        /* Add dependencies */
1636        ins.src[0] = src;
1637        ins.src_types[0] = nir_type_uint32;
1638
1639        if (depth_only)
1640                ins.constants.u32[0] = 0xFF;
1641        else
1642                ins.constants.u32[0] = ((rt - MIDGARD_COLOR_RT0) << 8) | sample_iter;
1643
1644        for (int i = 0; i < 4; ++i)
1645                ins.swizzle[0][i] = i;
1646
1647        if (~src_z) {
1648                emit_explicit_constant(ctx, src_z, src_z);
1649                ins.src[2] = src_z;
1650                ins.src_types[2] = nir_type_uint32;
1651                ins.writeout |= PAN_WRITEOUT_Z;
1652        }
1653        if (~src_s) {
1654                emit_explicit_constant(ctx, src_s, src_s);
1655                ins.src[3] = src_s;
1656                ins.src_types[3] = nir_type_uint32;
1657                ins.writeout |= PAN_WRITEOUT_S;
1658        }
1659
1660        /* Emit the branch */
1661        br = emit_mir_instruction(ctx, ins);
1662        schedule_barrier(ctx);
1663        ctx->writeout_branch[rt][sample_iter] = br;
1664
1665        /* Push our current location = current block count - 1 = where we'll
1666         * jump to. Maybe a bit too clever for my own good */
1667
1668        br->branch.target_block = ctx->block_count - 1;
1669}
1670
1671static void
1672emit_compute_builtin(compiler_context *ctx, nir_intrinsic_instr *instr)
1673{
1674        unsigned reg = nir_dest_index(&instr->dest);
1675        midgard_instruction ins = m_ldst_mov(reg, 0);
1676        ins.mask = mask_of(3);
1677        ins.swizzle[0][3] = COMPONENT_X; /* xyzx */
1678        ins.load_store.arg_reg = compute_builtin_arg(instr->intrinsic);
1679        emit_mir_instruction(ctx, ins);
1680}
1681
1682static unsigned
1683vertex_builtin_arg(nir_intrinsic_op op)
1684{
1685        switch (op) {
1686        case nir_intrinsic_load_vertex_id_zero_base:
1687                return PAN_VERTEX_ID;
1688        case nir_intrinsic_load_instance_id:
1689                return PAN_INSTANCE_ID;
1690        default:
1691                unreachable("Invalid vertex builtin");
1692        }
1693}
1694
1695static void
1696emit_vertex_builtin(compiler_context *ctx, nir_intrinsic_instr *instr)
1697{
1698        unsigned reg = nir_dest_index(&instr->dest);
1699        emit_attr_read(ctx, reg, vertex_builtin_arg(instr->intrinsic), 1, nir_type_int);
1700}
1701
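/* Loads a "special" value (e.g. the sample mask or sample ID) selected by
 * index via ld_special, broadcasting the result across all components. */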
1702static void
1703emit_special(compiler_context *ctx, nir_intrinsic_instr *instr, unsigned idx)
1704{
1705        unsigned reg = nir_dest_index(&instr->dest);
1706
1707        midgard_instruction ld = m_ld_tilebuffer_raw(reg, 0);
1708        ld.op = midgard_op_ld_special_32u;
1709        ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(idx);
1710        ld.load_store.index_reg = REGISTER_LDST_ZERO;
1711
1712        for (int i = 0; i < 4; ++i)
1713                ld.swizzle[0][i] = COMPONENT_X;
1714
1715        emit_mir_instruction(ctx, ld);
1716}
1717
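/* Control barriers are encoded as a barrier op on the texture pipe, with no
 * sources or destination. */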
1718static void
1719emit_control_barrier(compiler_context *ctx)
1720{
1721        midgard_instruction ins = {
1722                .type = TAG_TEXTURE_4,
1723                .dest = ~0,
1724                .src = { ~0, ~0, ~0, ~0 },
1725                .op = midgard_tex_op_barrier,
1726        };
1727
1728        emit_mir_instruction(ctx, ins);
1729}
1730
1731static unsigned
1732mir_get_branch_cond(nir_src *src, bool *invert)
1733{
1734        /* Wrap it. No swizzle since it's a scalar */
1735
1736        nir_alu_src alu = {
1737                .src = *src
1738        };
1739
1740        *invert = pan_has_source_mod(&alu, nir_op_inot);
1741        return nir_src_index(NULL, &alu.src);
1742}
1743
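/* Returns the render target address to load from: blend shaders use the
 * fixed blend RT, colour outputs map from FRAG_RESULT_DATA0, and depth and
 * stencil use the special 0x1F/0x1E encodings. */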
1744static uint8_t
1745output_load_rt_addr(compiler_context *ctx, nir_intrinsic_instr *instr)
1746{
1747        if (ctx->inputs->is_blend)
1748                return MIDGARD_COLOR_RT0 + ctx->inputs->blend.rt;
1749
1750        const nir_variable *var;
1751        var = nir_find_variable_with_driver_location(ctx->nir, nir_var_shader_out, nir_intrinsic_base(instr));
1752        assert(var);
1753
1754        unsigned loc = var->data.location;
1755
1756        if (loc >= FRAG_RESULT_DATA0)
1757                return loc - FRAG_RESULT_DATA0;
1758
1759        if (loc == FRAG_RESULT_DEPTH)
1760                return 0x1F;
1761        if (loc == FRAG_RESULT_STENCIL)
1762                return 0x1E;
1763
1764        unreachable("Invalid RT to load from");
1765}
1766
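/* Main dispatch for NIR intrinsics, lowering each to the corresponding MIR
 * sequence (loads/stores, sysval reads, builtins, barriers, atomics, ...). */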
1767static void
1768emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
1769{
1770        unsigned offset = 0, reg;
1771
1772        switch (instr->intrinsic) {
1773        case nir_intrinsic_discard_if:
1774        case nir_intrinsic_discard: {
1775                bool conditional = instr->intrinsic == nir_intrinsic_discard_if;
1776                struct midgard_instruction discard = v_branch(conditional, false);
1777                discard.branch.target_type = TARGET_DISCARD;
1778
1779                if (conditional) {
1780                        discard.src[0] = mir_get_branch_cond(&instr->src[0],
1781                                        &discard.branch.invert_conditional);
1782                        discard.src_types[0] = nir_type_uint32;
1783                }
1784
1785                emit_mir_instruction(ctx, discard);
1786                schedule_barrier(ctx);
1787
1788                break;
1789        }
1790
1791        case nir_intrinsic_image_load:
1792        case nir_intrinsic_image_store:
1793                emit_image_op(ctx, instr, false);
1794                break;
1795
1796        case nir_intrinsic_image_size: {
1797                unsigned nr_comp = nir_intrinsic_dest_components(instr);
1798                emit_sysval_read(ctx, &instr->instr, nr_comp, 0);
1799                break;
1800        }
1801
1802        case nir_intrinsic_load_ubo:
1803        case nir_intrinsic_load_global:
1804        case nir_intrinsic_load_global_constant:
1805        case nir_intrinsic_load_shared:
1806        case nir_intrinsic_load_scratch:
1807        case nir_intrinsic_load_input:
1808        case nir_intrinsic_load_kernel_input:
1809        case nir_intrinsic_load_interpolated_input: {
1810                bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo;
1811                bool is_global = instr->intrinsic == nir_intrinsic_load_global ||
1812                        instr->intrinsic == nir_intrinsic_load_global_constant;
1813                bool is_shared = instr->intrinsic == nir_intrinsic_load_shared;
1814                bool is_scratch = instr->intrinsic == nir_intrinsic_load_scratch;
1815                bool is_flat = instr->intrinsic == nir_intrinsic_load_input;
1816                bool is_kernel = instr->intrinsic == nir_intrinsic_load_kernel_input;
1817                bool is_interp = instr->intrinsic == nir_intrinsic_load_interpolated_input;
1818
1819                /* Get the base type of the intrinsic */
1820                /* TODO: Infer type? Does it matter? */
1821                nir_alu_type t =
1822                        (is_interp) ? nir_type_float :
1823                        (is_flat) ? nir_intrinsic_dest_type(instr) :
1824                        nir_type_uint;
1825
1826                t = nir_alu_type_get_base_type(t);
1827
1828                if (!(is_ubo || is_global || is_scratch)) {
1829                        offset = nir_intrinsic_base(instr);
1830                }
1831
1832                unsigned nr_comp = nir_intrinsic_dest_components(instr);
1833
1834                nir_src *src_offset = nir_get_io_offset_src(instr);
1835
1836                bool direct = nir_src_is_const(*src_offset);
1837                nir_src *indirect_offset = direct ? NULL : src_offset;
1838
1839                if (direct)
1840                        offset += nir_src_as_uint(*src_offset);
1841
1842                /* We may need to apply a fractional offset */
1843                int component = (is_flat || is_interp) ?
1844                                nir_intrinsic_component(instr) : 0;
1845                reg = nir_dest_index(&instr->dest);
1846
1847                if (is_kernel) {
1848                        emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, 0, nr_comp);
1849                } else if (is_ubo) {
1850                        nir_src index = instr->src[0];
1851
1852                        /* TODO: Is indirect block number possible? */
1853                        assert(nir_src_is_const(index));
1854
1855                        uint32_t uindex = nir_src_as_uint(index);
1856                        emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, uindex, nr_comp);
1857                } else if (is_global || is_shared || is_scratch) {
1858                        unsigned seg = is_global ? LDST_GLOBAL : (is_shared ? LDST_SHARED : LDST_SCRATCH);
1859                        emit_global(ctx, &instr->instr, true, reg, src_offset, seg);
1860                } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend) {
1861                        emit_varying_read(ctx, reg, offset, nr_comp, component, indirect_offset, t | nir_dest_bit_size(instr->dest), is_flat);
1862                } else if (ctx->inputs->is_blend) {
1863                        /* ctx->blend_input will be precoloured to r0/r2, where
1864                         * the input is preloaded */
1865
1866                        unsigned *input = offset ? &ctx->blend_src1 : &ctx->blend_input;
1867
1868                        if (*input == ~0)
1869                                *input = reg;
1870                        else
1871                                emit_mir_instruction(ctx, v_mov(*input, reg));
1872                } else if (ctx->stage == MESA_SHADER_VERTEX) {
1873                        emit_attr_read(ctx, reg, offset, nr_comp, t);
1874                } else {
1875                        DBG("Unknown load\n");
1876                        assert(0);
1877                }
1878
1879                break;
1880        }
1881
1882        /* Handled together with load_interpolated_input */
1883        case nir_intrinsic_load_barycentric_pixel:
1884        case nir_intrinsic_load_barycentric_centroid:
1885        case nir_intrinsic_load_barycentric_sample:
1886                break;
1887
1888        /* Reads 128-bit value raw off the tilebuffer during blending, tasty */
1889
1890        case nir_intrinsic_load_raw_output_pan: {
1891                reg = nir_dest_index(&instr->dest);
1892
1893                /* T720 and below use different blend opcodes with slightly
1894                 * different semantics than T760 and up */
1895
1896                midgard_instruction ld = m_ld_tilebuffer_raw(reg, 0);
1897
1898                unsigned target = output_load_rt_addr(ctx, instr);
1899                ld.load_store.index_comp = target & 0x3;
1900                ld.load_store.index_reg = target >> 2;
1901
1902                if (nir_src_is_const(instr->src[0])) {
1903                        unsigned sample = nir_src_as_uint(instr->src[0]);
1904                        ld.load_store.arg_comp = sample & 0x3;
1905                        ld.load_store.arg_reg = sample >> 2;
1906                } else {
1907                        /* Enable sample index via register. */
1908                        ld.load_store.signed_offset |= 1;
1909                        ld.src[1] = nir_src_index(ctx, &instr->src[0]);
1910                        ld.src_types[1] = nir_type_int32;
1911                }
1912
1913                if (ctx->quirks & MIDGARD_OLD_BLEND) {
1914                        ld.op = midgard_op_ld_special_32u;
1915                        ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(16);
1916                        ld.load_store.index_reg = REGISTER_LDST_ZERO;
1917                }
1918
1919                emit_mir_instruction(ctx, ld);
1920                break;
1921        }
1922
1923        case nir_intrinsic_load_output: {
1924                reg = nir_dest_index(&instr->dest);
1925
1926                unsigned bits = nir_dest_bit_size(instr->dest);
1927
1928                midgard_instruction ld;
1929                if (bits == 16)
1930                        ld = m_ld_tilebuffer_16f(reg, 0);
1931                else
1932                        ld = m_ld_tilebuffer_32f(reg, 0);
1933
1934                unsigned index = output_load_rt_addr(ctx, instr);
1935                ld.load_store.index_comp = index & 0x3;
1936                ld.load_store.index_reg = index >> 2;
1937
1938                for (unsigned c = 4; c < 16; ++c)
1939                        ld.swizzle[0][c] = 0;
1940
1941                if (ctx->quirks & MIDGARD_OLD_BLEND) {
1942                        if (bits == 16)
1943                                ld.op = midgard_op_ld_special_16f;
1944                        else
1945                                ld.op = midgard_op_ld_special_32f;
1946                        ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(1);
1947                        ld.load_store.index_reg = REGISTER_LDST_ZERO;
1948                }
1949
1950                emit_mir_instruction(ctx, ld);
1951                break;
1952        }
1953
1954        case nir_intrinsic_store_output:
1955        case nir_intrinsic_store_combined_output_pan:
1956                assert(nir_src_is_const(instr->src[1]) && "no indirect outputs");
1957
1958                reg = nir_src_index(ctx, &instr->src[0]);
1959
1960                if (ctx->stage == MESA_SHADER_FRAGMENT) {
1961                        bool combined = instr->intrinsic ==
1962                                nir_intrinsic_store_combined_output_pan;
1963
1964                        enum midgard_rt_id rt;
1965
1966                        unsigned reg_z = ~0, reg_s = ~0, reg_2 = ~0;
1967                        unsigned writeout = PAN_WRITEOUT_C;
1968                        if (combined) {
1969                                writeout = nir_intrinsic_component(instr);
1970                                if (writeout & PAN_WRITEOUT_Z)
1971                                        reg_z = nir_src_index(ctx, &instr->src[2]);
1972                                if (writeout & PAN_WRITEOUT_S)
1973                                        reg_s = nir_src_index(ctx, &instr->src[3]);
1974                                if (writeout & PAN_WRITEOUT_2)
1975                                        reg_2 = nir_src_index(ctx, &instr->src[4]);
1976                        }
1977
1978                        if (writeout & PAN_WRITEOUT_C) {
1979                                const nir_variable *var =
1980                                        nir_find_variable_with_driver_location(ctx->nir, nir_var_shader_out,
1981                                                 nir_intrinsic_base(instr));
1982
1983                                assert(var != NULL);
1984                                assert(var->data.location >= FRAG_RESULT_DATA0);
1985
1986                                rt = MIDGARD_COLOR_RT0 + var->data.location -
1987                                     FRAG_RESULT_DATA0;
1988                        } else {
1989                                rt = MIDGARD_ZS_RT;
1990                        }
1991
1992                        /* Dual-source blend writeout is done by leaving the
1993                         * value in r2 for the blend shader to use. */
1994                        if (~reg_2) {
1995                                if (instr->src[4].is_ssa) {
1996                                        emit_explicit_constant(ctx, reg_2, reg_2);
1997
1998                                        unsigned out = make_compiler_temp(ctx);
1999
2000                                        midgard_instruction ins = v_mov(reg_2, out);
2001                                        emit_mir_instruction(ctx, ins);
2002
2003                                        ctx->blend_src1 = out;
2004                                } else {
2005                                        ctx->blend_src1 = reg_2;
2006                                }
2007                        }
2008
2009                        emit_fragment_store(ctx, reg, reg_z, reg_s, rt, 0);
2010                } else if (ctx->stage == MESA_SHADER_VERTEX) {
2011                        assert(instr->intrinsic == nir_intrinsic_store_output);
2012
2013                        /* We should have been vectorized, though we don't
2014                         * currently check that st_vary is emitted only once
2015                         * per slot (this is relevant, since there's not a mask
2016                         * parameter available on the store [set to 0 by the
2017                         * blob]). We do respect the component by adjusting the
2018                         * swizzle. If this is a constant source, we'll need to
2019                         * emit that explicitly. */
2020
2021                        emit_explicit_constant(ctx, reg, reg);
2022
2023                        offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[1]);
2024
2025                        unsigned dst_component = nir_intrinsic_component(instr);
2026                        unsigned nr_comp = nir_src_num_components(instr->src[0]);
2027
2028                        midgard_instruction st = m_st_vary_32(reg, PACK_LDST_ATTRIB_OFS(offset));
2029                        st.load_store.arg_reg = REGISTER_LDST_ZERO;
2030                        st.load_store.index_format = midgard_index_address_u32;
2031                        st.load_store.index_reg = REGISTER_LDST_ZERO;
2032
2033                        switch (nir_alu_type_get_base_type(nir_intrinsic_src_type(instr))) {
2034                        case nir_type_uint:
2035                        case nir_type_bool:
2036                                st.op = midgard_op_st_vary_32u;
2037                                break;
2038                        case nir_type_int:
2039                                st.op = midgard_op_st_vary_32i;
2040                                break;
2041                        case nir_type_float:
2042                                st.op = midgard_op_st_vary_32;
2043                                break;
2044                        default:
2045                                unreachable("Attempted to store unknown type");
2046                                break;
2047                        }
2048
2049                        /* nir_intrinsic_component(store_intr) encodes the
2050                         * destination component start. Source component offset
2051                         * adjustment is taken care of in
2052                         * install_registers_instr(), when offset_swizzle() is
2053                         * called.
2054                         */
2055                        unsigned src_component = COMPONENT_X;
2056
2057                        assert(nr_comp > 0);
2058                        for (unsigned i = 0; i < ARRAY_SIZE(st.swizzle); ++i) {
2059                                st.swizzle[0][i] = src_component;
2060                                if (i >= dst_component && i < dst_component + nr_comp - 1)
2061                                        src_component++;
2062                        }
2063
2064                        emit_mir_instruction(ctx, st);
2065                } else {
2066                        DBG("Unknown store\n");
2067                        assert(0);
2068                }
2069
2070                break;
2071
2072        /* Special case of store_output for lowered blend shaders */
2073        case nir_intrinsic_store_raw_output_pan:
                assert(ctx->stage == MESA_SHADER_FRAGMENT);
2075                reg = nir_src_index(ctx, &instr->src[0]);
2076                for (unsigned s = 0; s < ctx->blend_sample_iterations; s++)
2077                        emit_fragment_store(ctx, reg, ~0, ~0,
2078                                            ctx->inputs->blend.rt + MIDGARD_COLOR_RT0,
2079                                            s);
2080                break;
2081
2082        case nir_intrinsic_store_global:
2083        case nir_intrinsic_store_shared:
2084        case nir_intrinsic_store_scratch:
2085                reg = nir_src_index(ctx, &instr->src[0]);
2086                emit_explicit_constant(ctx, reg, reg);
2087
2088                unsigned seg;
2089                if (instr->intrinsic == nir_intrinsic_store_global)
2090                        seg = LDST_GLOBAL;
2091                else if (instr->intrinsic == nir_intrinsic_store_shared)
2092                        seg = LDST_SHARED;
2093                else
2094                        seg = LDST_SCRATCH;
2095
2096                emit_global(ctx, &instr->instr, false, reg, &instr->src[1], seg);
2097                break;
2098
2099        case nir_intrinsic_load_ssbo_address:
2100        case nir_intrinsic_load_xfb_address:
2101                emit_sysval_read(ctx, &instr->instr, 2, 0);
2102                break;
2103
2104        case nir_intrinsic_load_first_vertex:
2105        case nir_intrinsic_load_work_dim:
2106        case nir_intrinsic_load_num_vertices:
2107                emit_sysval_read(ctx, &instr->instr, 1, 0);
2108                break;
2109
2110        case nir_intrinsic_load_base_vertex:
2111                emit_sysval_read(ctx, &instr->instr, 1, 4);
2112                break;
2113
2114        case nir_intrinsic_load_base_instance:
2115        case nir_intrinsic_get_ssbo_size:
2116                emit_sysval_read(ctx, &instr->instr, 1, 8);
2117                break;
2118
2119        case nir_intrinsic_load_sample_positions_pan:
2120                emit_sysval_read(ctx, &instr->instr, 2, 0);
2121                break;
2122
2123        case nir_intrinsic_load_viewport_scale:
2124        case nir_intrinsic_load_viewport_offset:
2125        case nir_intrinsic_load_num_workgroups:
2126        case nir_intrinsic_load_sampler_lod_parameters_pan:
2127        case nir_intrinsic_load_workgroup_size:
2128                emit_sysval_read(ctx, &instr->instr, 3, 0);
2129                break;
2130
2131        case nir_intrinsic_load_blend_const_color_rgba:
2132                emit_sysval_read(ctx, &instr->instr, 4, 0);
2133                break;
2134
2135        case nir_intrinsic_load_workgroup_id:
2136        case nir_intrinsic_load_local_invocation_id:
2137        case nir_intrinsic_load_global_invocation_id:
2138        case nir_intrinsic_load_global_invocation_id_zero_base:
2139                emit_compute_builtin(ctx, instr);
2140                break;
2141
2142        case nir_intrinsic_load_vertex_id_zero_base:
2143        case nir_intrinsic_load_instance_id:
2144                emit_vertex_builtin(ctx, instr);
2145                break;
2146
2147        case nir_intrinsic_load_sample_mask_in:
2148                emit_special(ctx, instr, 96);
2149                break;
2150
2151        case nir_intrinsic_load_sample_id:
2152                emit_special(ctx, instr, 97);
2153                break;
2154
2155        /* Midgard doesn't seem to want special handling */
2156        case nir_intrinsic_memory_barrier:
2157        case nir_intrinsic_memory_barrier_buffer:
2158        case nir_intrinsic_memory_barrier_image:
2159        case nir_intrinsic_memory_barrier_shared:
2160        case nir_intrinsic_group_memory_barrier:
2161                break;
2162
2163        case nir_intrinsic_control_barrier:
2164                schedule_barrier(ctx);
2165                emit_control_barrier(ctx);
2166                schedule_barrier(ctx);
2167                break;
2168
2169        ATOMIC_CASE(ctx, instr, add, add);
2170        ATOMIC_CASE(ctx, instr, and, and);
2171        ATOMIC_CASE(ctx, instr, comp_swap, cmpxchg);
2172        ATOMIC_CASE(ctx, instr, exchange, xchg);
2173        ATOMIC_CASE(ctx, instr, imax, imax);
2174        ATOMIC_CASE(ctx, instr, imin, imin);
2175        ATOMIC_CASE(ctx, instr, or, or);
2176        ATOMIC_CASE(ctx, instr, umax, umax);
2177        ATOMIC_CASE(ctx, instr, umin, umin);
2178        ATOMIC_CASE(ctx, instr, xor, xor);
2179
2180        IMAGE_ATOMIC_CASE(ctx, instr, add, add);
2181        IMAGE_ATOMIC_CASE(ctx, instr, and, and);
2182        IMAGE_ATOMIC_CASE(ctx, instr, comp_swap, cmpxchg);
2183        IMAGE_ATOMIC_CASE(ctx, instr, exchange, xchg);
2184        IMAGE_ATOMIC_CASE(ctx, instr, imax, imax);
2185        IMAGE_ATOMIC_CASE(ctx, instr, imin, imin);
2186        IMAGE_ATOMIC_CASE(ctx, instr, or, or);
2187        IMAGE_ATOMIC_CASE(ctx, instr, umax, umax);
2188        IMAGE_ATOMIC_CASE(ctx, instr, umin, umin);
2189        IMAGE_ATOMIC_CASE(ctx, instr, xor, xor);
2190
2191        default:
2192                fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
2193                assert(0);
2194                break;
2195        }
2196}
2197
/* Returns the texture dimension, with 0 special-casing cubemaps */
2199static unsigned
2200midgard_tex_format(enum glsl_sampler_dim dim)
2201{
2202        switch (dim) {
2203        case GLSL_SAMPLER_DIM_1D:
2204        case GLSL_SAMPLER_DIM_BUF:
2205                return 1;
2206
2207        case GLSL_SAMPLER_DIM_2D:
2208        case GLSL_SAMPLER_DIM_MS:
2209        case GLSL_SAMPLER_DIM_EXTERNAL:
2210        case GLSL_SAMPLER_DIM_RECT:
2211                return 2;
2212
2213        case GLSL_SAMPLER_DIM_3D:
2214                return 3;
2215
2216        case GLSL_SAMPLER_DIM_CUBE:
2217                return 0;
2218
2219        default:
2220                DBG("Unknown sampler dim type\n");
2221                assert(0);
2222                return 0;
2223        }
2224}
2225
2226/* Tries to attach an explicit LOD or bias as a constant. Returns whether this
2227 * was successful */
2228
2229static bool
2230pan_attach_constant_bias(
2231        compiler_context *ctx,
2232        nir_src lod,
2233        midgard_texture_word *word)
2234{
2235        /* To attach as constant, it has to *be* constant */
2236
2237        if (!nir_src_is_const(lod))
2238                return false;
2239
2240        float f = nir_src_as_float(lod);
2241
2242        /* Break into fixed-point */
2243        signed lod_int = f;
2244        float lod_frac = f - lod_int;
2245
2246        /* Carry over negative fractions */
2247        if (lod_frac < 0.0) {
2248                lod_int--;
2249                lod_frac += 1.0;
2250        }
2251
2252        /* Encode */
2253        word->bias = float_to_ubyte(lod_frac);
2254        word->bias_int = lod_int;
2255
2256        return true;
2257}
2258
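/* Selects the texture mode from the NIR instruction: gathers (with or without
 * shadow), plain shadow comparisons, or normal sampling. */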
2259static enum mali_texture_mode
2260mdg_texture_mode(nir_tex_instr *instr)
2261{
2262        if (instr->op == nir_texop_tg4 && instr->is_shadow)
2263                return TEXTURE_GATHER_SHADOW;
2264        else if (instr->op == nir_texop_tg4)
2265                return TEXTURE_GATHER_X + instr->component;
2266        else if (instr->is_shadow)
2267                return TEXTURE_SHADOW;
2268        else
2269                return TEXTURE_NORMAL;
2270}
2271
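/* Assembles the coordinate source for a texture instruction. Midgard expects
 * the shadow reference in coord.z and the array index in coord.w; cubemaps go
 * through a ld_cubemap_coords pre-op, and texel fetches need their unused
 * components zeroed, so we may have to build the coordinate in a temporary
 * register with moves and swizzles. */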
2272static void
2273set_tex_coord(compiler_context *ctx, nir_tex_instr *instr,
2274              midgard_instruction *ins)
2275{
2276        int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
2277
2278        assert(coord_idx >= 0);
2279
2280        int comparator_idx = nir_tex_instr_src_index(instr, nir_tex_src_comparator);
2281        int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
2282        assert(comparator_idx < 0 || ms_idx < 0);
2283        int ms_or_comparator_idx = ms_idx >= 0 ? ms_idx : comparator_idx;
2284
2285        unsigned coords = nir_src_index(ctx, &instr->src[coord_idx].src);
2286
2287        emit_explicit_constant(ctx, coords, coords);
2288
2289        ins->src_types[1] = nir_tex_instr_src_type(instr, coord_idx) |
2290                            nir_src_bit_size(instr->src[coord_idx].src);
2291
2292        unsigned nr_comps = instr->coord_components;
2293        unsigned written_mask = 0, write_mask = 0;
2294
2295        /* Initialize all components to coord.x which is expected to always be
2296         * present. Swizzle is updated below based on the texture dimension
2297         * and extra attributes that are packed in the coordinate argument.
2298         */
2299        for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++)
2300                ins->swizzle[1][c] = COMPONENT_X;
2301
        /* The shadow ref value is part of the coordinates if there's no
         * comparator source; in that case it's always placed in the last
         * component. Midgard wants the ref value in coord.z.
         */
2306        if (instr->is_shadow && comparator_idx < 0) {
2307                ins->swizzle[1][COMPONENT_Z] = --nr_comps;
2308                write_mask |= 1 << COMPONENT_Z;
2309        }
2310
2311        /* The array index is the last component if there's no shadow ref value
2312         * or second last if there's one. We already decremented the number of
2313         * components to account for the shadow ref value above.
2314         * Midgard wants the array index in coord.w.
2315         */
2316        if (instr->is_array) {
2317                ins->swizzle[1][COMPONENT_W] = --nr_comps;
2318                write_mask |= 1 << COMPONENT_W;
2319        }
2320
2321        if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
2322                /* texelFetch is undefined on samplerCube */
2323                assert(ins->op != midgard_tex_op_fetch);
2324
2325                ins->src[1] = make_compiler_temp_reg(ctx);
2326
2327                /* For cubemaps, we use a special ld/st op to select the face
2328                 * and copy the xy into the texture register
2329                 */
2330                midgard_instruction ld = m_ld_cubemap_coords(ins->src[1], 0);
2331                ld.src[1] = coords;
2332                ld.src_types[1] = ins->src_types[1];
2333                ld.mask = 0x3; /* xy */
2334                ld.load_store.bitsize_toggle = true;
2335                ld.swizzle[1][3] = COMPONENT_X;
2336                emit_mir_instruction(ctx, ld);
2337
                /* We packed the cube coordinates (X,Y,Z) into (X,Y); update
                 * the written mask accordingly and decrement the number of
                 * components.
                 */
2342                nr_comps--;
2343                written_mask |= 3;
2344        }
2345
2346        /* Now flag tex coord components that have not been written yet */
2347        write_mask |= mask_of(nr_comps) & ~written_mask;
2348        for (unsigned c = 0; c < nr_comps; c++)
2349                ins->swizzle[1][c] = c;
2350
2351        /* Sample index and shadow ref are expected in coord.z */
2352        if (ms_or_comparator_idx >= 0) {
2353                assert(!((write_mask | written_mask) & (1 << COMPONENT_Z)));
2354
2355                unsigned sample_or_ref =
2356                        nir_src_index(ctx, &instr->src[ms_or_comparator_idx].src);
2357
2358                emit_explicit_constant(ctx, sample_or_ref, sample_or_ref);
2359
2360                if (ins->src[1] == ~0)
2361                        ins->src[1] = make_compiler_temp_reg(ctx);
2362
2363                midgard_instruction mov = v_mov(sample_or_ref, ins->src[1]);
2364
2365                for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++)
2366                        mov.swizzle[1][c] = COMPONENT_X;
2367
2368                mov.mask = 1 << COMPONENT_Z;
2369                written_mask |= 1 << COMPONENT_Z;
2370                ins->swizzle[1][COMPONENT_Z] = COMPONENT_Z;
2371                emit_mir_instruction(ctx, mov);
2372        }
2373
        /* texelFetch coordinates use all four elements (xyz/index) regardless
         * of texture dimensionality, which means it's necessary to zero the
         * unused components to keep everything happy.
         */
2378        if (ins->op == midgard_tex_op_fetch &&
2379            (written_mask | write_mask) != 0xF) {
2380                if (ins->src[1] == ~0)
2381                        ins->src[1] = make_compiler_temp_reg(ctx);
2382
2383                /* mov index.zw, #0, or generalized */
2384                midgard_instruction mov =
2385                        v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), ins->src[1]);
2386                mov.has_constants = true;
2387                mov.mask = (written_mask | write_mask) ^ 0xF;
2388                emit_mir_instruction(ctx, mov);
2389                for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) {
2390                        if (mov.mask & (1 << c))
2391                                ins->swizzle[1][c] = c;
2392                }
2393        }
2394
2395        if (ins->src[1] == ~0) {
2396                /* No temporary reg created, use the src coords directly */
2397                ins->src[1] = coords;
        } else if (write_mask) {
2399                /* Move the remaining coordinates to the temporary reg */
2400                midgard_instruction mov = v_mov(coords, ins->src[1]);
2401
2402                for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) {
2403                        if ((1 << c) & write_mask) {
2404                                mov.swizzle[1][c] = ins->swizzle[1][c];
2405                                ins->swizzle[1][c] = c;
2406                        } else {
2407                                mov.swizzle[1][c] = COMPONENT_X;
2408                        }
2409                }
2410
2411                mov.mask = write_mask;
2412                emit_mir_instruction(ctx, mov);
2413        }
2414}
2415
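/* Emits a native Midgard texture instruction, gathering the coordinates,
 * LOD/bias (attached as a constant where legal) and offsets from the NIR
 * sources. */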
2416static void
2417emit_texop_native(compiler_context *ctx, nir_tex_instr *instr,
2418                  unsigned midgard_texop)
2419{
2420        nir_dest *dest = &instr->dest;
2421
2422        int texture_index = instr->texture_index;
2423        int sampler_index = instr->sampler_index;
2424
2425        nir_alu_type dest_base = nir_alu_type_get_base_type(instr->dest_type);
2426
2427        /* texture instructions support float outmods */
2428        unsigned outmod = midgard_outmod_none;
2429        if (dest_base == nir_type_float) {
2430                outmod = mir_determine_float_outmod(ctx, &dest, 0);
2431        }
2432
2433        midgard_instruction ins = {
2434                .type = TAG_TEXTURE_4,
2435                .mask = 0xF,
2436                .dest = nir_dest_index(dest),
2437                .src = { ~0, ~0, ~0, ~0 },
2438                .dest_type = instr->dest_type,
2439                .swizzle = SWIZZLE_IDENTITY_4,
2440                .outmod = outmod,
2441                .op = midgard_texop,
2442                .texture = {
2443                        .format = midgard_tex_format(instr->sampler_dim),
2444                        .texture_handle = texture_index,
2445                        .sampler_handle = sampler_index,
2446                        .mode = mdg_texture_mode(instr)
2447                }
2448        };
2449
2450        if (instr->is_shadow && !instr->is_new_style_shadow && instr->op != nir_texop_tg4)
2451           for (int i = 0; i < 4; ++i)
2452              ins.swizzle[0][i] = COMPONENT_X;
2453
2454        for (unsigned i = 0; i < instr->num_srcs; ++i) {
2455                int index = nir_src_index(ctx, &instr->src[i].src);
2456                unsigned sz = nir_src_bit_size(instr->src[i].src);
2457                nir_alu_type T = nir_tex_instr_src_type(instr, i) | sz;
2458
2459                switch (instr->src[i].src_type) {
2460                case nir_tex_src_coord:
2461                        set_tex_coord(ctx, instr, &ins);
2462                        break;
2463
2464                case nir_tex_src_bias:
2465                case nir_tex_src_lod: {
2466                        /* Try as a constant if we can */
2467
2468                        bool is_txf = midgard_texop == midgard_tex_op_fetch;
2469                        if (!is_txf && pan_attach_constant_bias(ctx, instr->src[i].src, &ins.texture))
2470                                break;
2471
2472                        ins.texture.lod_register = true;
2473                        ins.src[2] = index;
2474                        ins.src_types[2] = T;
2475
2476                        for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c)
2477                                ins.swizzle[2][c] = COMPONENT_X;
2478
2479                        emit_explicit_constant(ctx, index, index);
2480
2481                        break;
2482                };
2483
2484                case nir_tex_src_offset: {
2485                        ins.texture.offset_register = true;
2486                        ins.src[3] = index;
2487                        ins.src_types[3] = T;
2488
2489                        for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c)
2490                                ins.swizzle[3][c] = (c > COMPONENT_Z) ? 0 : c;
2491
2492                        emit_explicit_constant(ctx, index, index);
2493                        break;
2494                };
2495
2496                case nir_tex_src_comparator:
2497                case nir_tex_src_ms_index:
2498                        /* Nothing to do, handled in set_tex_coord() */
2499                        break;
2500
2501                default: {
2502                        fprintf(stderr, "Unknown texture source type: %d\n", instr->src[i].src_type);
2503                        assert(0);
2504                }
2505                }
2506        }
2507
2508        emit_mir_instruction(ctx, ins);
2509}
2510
2511static void
2512emit_tex(compiler_context *ctx, nir_tex_instr *instr)
2513{
2514        switch (instr->op) {
2515        case nir_texop_tex:
2516        case nir_texop_txb:
2517                emit_texop_native(ctx, instr, midgard_tex_op_normal);
2518                break;
2519        case nir_texop_txl:
2520        case nir_texop_tg4:
2521                emit_texop_native(ctx, instr, midgard_tex_op_gradient);
2522                break;
2523        case nir_texop_txf:
2524        case nir_texop_txf_ms:
2525                emit_texop_native(ctx, instr, midgard_tex_op_fetch);
2526                break;
2527        case nir_texop_txs:
2528                emit_sysval_read(ctx, &instr->instr, 4, 0);
2529                break;
2530        default: {
2531                fprintf(stderr, "Unhandled texture op: %d\n", instr->op);
2532                assert(0);
2533        }
2534        }
2535}
2536
2537static void
2538emit_jump(compiler_context *ctx, nir_jump_instr *instr)
2539{
2540        switch (instr->type) {
2541        case nir_jump_break: {
2542                /* Emit a branch out of the loop */
2543                struct midgard_instruction br = v_branch(false, false);
2544                br.branch.target_type = TARGET_BREAK;
2545                br.branch.target_break = ctx->current_loop_depth;
2546                emit_mir_instruction(ctx, br);
2547                break;
2548        }
2549
2550        default:
2551                unreachable("Unhandled jump");
2552        }
2553}
2554
2555static void
2556emit_instr(compiler_context *ctx, struct nir_instr *instr)
2557{
2558        switch (instr->type) {
2559        case nir_instr_type_load_const:
2560                emit_load_const(ctx, nir_instr_as_load_const(instr));
2561                break;
2562
2563        case nir_instr_type_intrinsic:
2564                emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
2565                break;
2566
2567        case nir_instr_type_alu:
2568                emit_alu(ctx, nir_instr_as_alu(instr));
2569                break;
2570
2571        case nir_instr_type_tex:
2572                emit_tex(ctx, nir_instr_as_tex(instr));
2573                break;
2574
2575        case nir_instr_type_jump:
2576                emit_jump(ctx, nir_instr_as_jump(instr));
2577                break;
2578
2579        case nir_instr_type_ssa_undef:
2580                /* Spurious */
2581                break;
2582
2583        default:
2584                DBG("Unhandled instruction type\n");
2585                break;
2586        }
2587}
2588
2590/* ALU instructions can inline or embed constants, which decreases register
2591 * pressure and saves space. */
2592
2593#define CONDITIONAL_ATTACH(idx) { \
2594	void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[idx] + 1); \
2595\
2596	if (entry) { \
2597		attach_constants(ctx, alu, entry, alu->src[idx] + 1); \
2598		alu->src[idx] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \
2599	} \
2600}
2601
2602static void
2603inline_alu_constants(compiler_context *ctx, midgard_block *block)
2604{
2605        mir_foreach_instr_in_block(block, alu) {
2606                /* Other instructions cannot inline constants */
2607                if (alu->type != TAG_ALU_4) continue;
2608                if (alu->compact_branch) continue;
2609
2610                /* If there is already a constant here, we can do nothing */
2611                if (alu->has_constants) continue;
2612
2613                CONDITIONAL_ATTACH(0);
2614
2615                if (!alu->has_constants) {
2616                        CONDITIONAL_ATTACH(1)
2617                } else if (!alu->inline_constant) {
                        /* Corner case: _two_ vec4 constants, for instance with a
                         * csel. For this case, we can only use a constant
                         * register for one; we'll have to emit a move for the
                         * other. */
2622
2623                        void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[1] + 1);
2624                        unsigned scratch = make_compiler_temp(ctx);
2625
2626                        if (entry) {
2627                                midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), scratch);
2628                                attach_constants(ctx, &ins, entry, alu->src[1] + 1);
2629
2630                                /* Set the source */
2631                                alu->src[1] = scratch;
2632
2633                                /* Inject us -before- the last instruction which set r31 */
2634                                mir_insert_instruction_before(ctx, mir_prev_op(alu), ins);
2635                        }
2636                }
2637        }
2638}
2639
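/* Computes the widest type involved in an ALU instruction, accounting for
 * LUT ops that must execute at 32-bit and for keephi output modifiers that
 * double the computation width. */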
2640unsigned
2641max_bitsize_for_alu(midgard_instruction *ins)
2642{
2643        unsigned max_bitsize = 0;
2644        for (int i = 0; i < MIR_SRC_COUNT; i++) {
2645                if (ins->src[i] == ~0) continue;
2646                unsigned src_bitsize = nir_alu_type_get_type_size(ins->src_types[i]);
2647                max_bitsize = MAX2(src_bitsize, max_bitsize);
2648        }
2649        unsigned dst_bitsize = nir_alu_type_get_type_size(ins->dest_type);
2650        max_bitsize = MAX2(dst_bitsize, max_bitsize);
2651
2652        /* We don't have fp16 LUTs, so we'll want to emit code like:
2653         *
2654         *      vlut.fsinr hr0, hr0
2655         *
2656         * where both input and output are 16-bit but the operation is carried
2657         * out in 32-bit
2658         */
2659
2660        switch (ins->op) {
2661        case midgard_alu_op_fsqrt:
2662        case midgard_alu_op_frcp:
2663        case midgard_alu_op_frsqrt:
2664        case midgard_alu_op_fsinpi:
2665        case midgard_alu_op_fcospi:
2666        case midgard_alu_op_fexp2:
2667        case midgard_alu_op_flog2:
2668                max_bitsize = MAX2(max_bitsize, 32);
2669                break;
2670
2671        default:
2672                break;
2673        }
2674
        /* "High" implies computing at a higher bitsize, e.g. umul_high of
         * 32-bit requires computing at 64-bit */
2677        if (midgard_is_integer_out_op(ins->op) && ins->outmod == midgard_outmod_keephi) {
2678                max_bitsize *= 2;
2679                assert(max_bitsize <= 64);
2680        }
2681
2682        return max_bitsize;
2683}
2684
2685midgard_reg_mode
2686reg_mode_for_bitsize(unsigned bitsize)
2687{
2688        switch (bitsize) {
                /* Use the 16-bit pipe for 8-bit since we don't support vec16 yet */
2690        case 8:
2691        case 16:
2692                return midgard_reg_mode_16;
2693        case 32:
2694                return midgard_reg_mode_32;
2695        case 64:
2696                return midgard_reg_mode_64;
2697        default:
2698                unreachable("invalid bit size");
2699        }
2700}
2701
2702/* Midgard supports two types of constants, embedded constants (128-bit) and
2703 * inline constants (16-bit). Sometimes, especially with scalar ops, embedded
2704 * constants can be demoted to inline constants, for space savings and
2705 * sometimes a performance boost */
2706
2707static void
2708embedded_to_inline_constant(compiler_context *ctx, midgard_block *block)
2709{
2710        mir_foreach_instr_in_block(block, ins) {
2711                if (!ins->has_constants) continue;
2712                if (ins->has_inline_constant) continue;
2713
2714                unsigned max_bitsize = max_bitsize_for_alu(ins);
2715
2716                /* We can inline 32-bit (sometimes) or 16-bit (usually) */
2717                bool is_16 = max_bitsize == 16;
2718                bool is_32 = max_bitsize == 32;
2719
2720                if (!(is_16 || is_32))
2721                        continue;
2722
2723                /* src1 cannot be an inline constant due to encoding
2724                 * restrictions. So, if possible we try to flip the arguments
2725                 * in that case */
2726
2727                int op = ins->op;
2728
2729                if (ins->src[0] == SSA_FIXED_REGISTER(REGISTER_CONSTANT) &&
2730                                alu_opcode_props[op].props & OP_COMMUTES) {
2731                        mir_flip(ins);
2732                }
2733
2734                if (ins->src[1] == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) {
2735                        /* Component is from the swizzle. Take a nonzero component */
2736                        assert(ins->mask);
2737                        unsigned first_comp = ffs(ins->mask) - 1;
2738                        unsigned component = ins->swizzle[1][first_comp];
2739
2740                        /* Scale constant appropriately, if we can legally */
2741                        int16_t scaled_constant = 0;
2742
2743                        if (is_16) {
2744                                scaled_constant = ins->constants.u16[component];
2745                        } else if (midgard_is_integer_op(op)) {
2746                                scaled_constant = ins->constants.u32[component];
2747
2748                                /* Constant overflow after resize */
2749                                if (scaled_constant != ins->constants.u32[component])
2750                                        continue;
2751                        } else {
2752                                float original = ins->constants.f32[component];
2753                                scaled_constant = _mesa_float_to_half(original);
2754
2755                                /* Check for loss of precision. If this is
2756                                 * mediump, we don't care, but for a highp
2757                                 * shader, we need to pay attention. NIR
2758                                 * doesn't yet tell us which mode we're in!
2759                                 * Practically this prevents most constants
2760                                 * from being inlined, sadly. */
2761
2762                                float fp32 = _mesa_half_to_float(scaled_constant);
2763
2764                                if (fp32 != original)
2765                                        continue;
2766                        }
2767
2768                        /* Should've been const folded */
2769                        if (ins->src_abs[1] || ins->src_neg[1])
2770                                continue;
2771
2772                        /* Make sure that the constant is not itself a vector
2773                         * by checking if all accessed values are the same. */

                        const midgard_constants *cons = &ins->constants;
                        uint32_t value = is_16 ? cons->u16[component] : cons->u32[component];

                        bool is_vector = false;
                        unsigned mask = effective_writemask(ins->op, ins->mask);

                        for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) {
                                /* We only care if this component is actually used */
                                if (!(mask & (1 << c)))
                                        continue;

                                uint32_t test = is_16 ?
                                                cons->u16[ins->swizzle[1][c]] :
                                                cons->u32[ins->swizzle[1][c]];

                                if (test != value) {
                                        is_vector = true;
                                        break;
                                }
                        }

                        if (is_vector)
                                continue;

                        /* Get rid of the embedded constant */
                        ins->has_constants = false;
                        ins->src[1] = ~0;
                        ins->has_inline_constant = true;
                        ins->inline_constant = scaled_constant;
                }
        }
}

/* Dead code elimination for branches at the end of a block - only one branch
 * per block is legal semantically */

static void
midgard_cull_dead_branch(compiler_context *ctx, midgard_block *block)
{
        bool branched = false;

        mir_foreach_instr_in_block_safe(block, ins) {
                if (!midgard_is_branch_unit(ins->unit)) continue;

                if (branched)
                        mir_remove_instruction(ins);

                branched = true;
        }
}

/* We want to force the invert on AND/OR to the second slot to legalize into
 * iandnot/iornot. The relevant patterns are for AND (and OR respectively)
 *
 *   ~a & #b = ~a & ~(#~b)
 *   ~a & b = b & ~a
 */
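/* A quick check of the constant identity with 8-bit values: ~a & 0x0F equals
 * ~a & ~0xF0, so negating the inline constant and setting src_invert[1]
 * moves the invert to the second slot without changing the result. */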

static void
midgard_legalize_invert(compiler_context *ctx, midgard_block *block)
{
        mir_foreach_instr_in_block(block, ins) {
                if (ins->type != TAG_ALU_4) continue;

                if (ins->op != midgard_alu_op_iand &&
                    ins->op != midgard_alu_op_ior) continue;

                if (ins->src_invert[1] || !ins->src_invert[0]) continue;

                if (ins->has_inline_constant) {
                        /* ~(#~a) = ~(~#a) = a, so valid, and forces both
                         * inverts on */
                        ins->inline_constant = ~ins->inline_constant;
                        ins->src_invert[1] = true;
                } else {
                        /* Flip to the right invert order. Note
                         * has_inline_constant false by assumption on the
                         * branch, so flipping makes sense. */
                        mir_flip(ins);
                }
        }
}

static unsigned
emit_fragment_epilogue(compiler_context *ctx, unsigned rt, unsigned sample_iter)
{
        /* Loop to ourselves */
        midgard_instruction *br = ctx->writeout_branch[rt][sample_iter];
        struct midgard_instruction ins = v_branch(false, false);
        ins.writeout = br->writeout;
        ins.branch.target_block = ctx->block_count - 1;
        ins.constants.u32[0] = br->constants.u32[0];
        memcpy(&ins.src_types, &br->src_types, sizeof(ins.src_types));
        emit_mir_instruction(ctx, ins);

        ctx->current_block->epilogue = true;
        schedule_barrier(ctx);
        return ins.branch.target_block;
}

static midgard_block *
emit_block_init(compiler_context *ctx)
{
        midgard_block *this_block = ctx->after_block;
        ctx->after_block = NULL;

        if (!this_block)
                this_block = create_empty_block(ctx);

        list_addtail(&this_block->base.link, &ctx->blocks);

        this_block->scheduled = false;
        ++ctx->block_count;

        /* Set up current block */
        list_inithead(&this_block->base.instructions);
        ctx->current_block = this_block;

        return this_block;
}

static midgard_block *
emit_block(compiler_context *ctx, nir_block *block)
{
        midgard_block *this_block = emit_block_init(ctx);

        nir_foreach_instr(instr, block) {
                emit_instr(ctx, instr);
                ++ctx->instruction_count;
        }

        return this_block;
}

static midgard_block *emit_cf_list(struct compiler_context *ctx, struct exec_list *list);

static void
emit_if(struct compiler_context *ctx, nir_if *nif)
{
        midgard_block *before_block = ctx->current_block;

        /* Speculatively emit the branch, but we can't fill it in until later */
        bool inv = false;
        EMIT(branch, true, true);
        midgard_instruction *then_branch = mir_last_in_block(ctx->current_block);
        then_branch->src[0] = mir_get_branch_cond(&nif->condition, &inv);
        then_branch->src_types[0] = nir_type_uint32;
        then_branch->branch.invert_conditional = !inv;

        /* Emit the two subblocks. */
        midgard_block *then_block = emit_cf_list(ctx, &nif->then_list);
        midgard_block *end_then_block = ctx->current_block;

        /* Emit a jump from the end of the then block to the end of the else */
        EMIT(branch, false, false);
        midgard_instruction *then_exit = mir_last_in_block(ctx->current_block);

        /* Emit second block, and check if it's empty */

        int else_idx = ctx->block_count;
        int count_in = ctx->instruction_count;
        midgard_block *else_block = emit_cf_list(ctx, &nif->else_list);
        midgard_block *end_else_block = ctx->current_block;
        int after_else_idx = ctx->block_count;

        /* Now that we have the subblocks emitted, fix up the branches */

        assert(then_block);
        assert(else_block);

        if (ctx->instruction_count == count_in) {
                /* The else block is empty, so don't emit an exit jump */
                mir_remove_instruction(then_exit);
                then_branch->branch.target_block = after_else_idx;
        } else {
                then_branch->branch.target_block = else_idx;
                then_exit->branch.target_block = after_else_idx;
        }

        /* Wire up the successors */

        ctx->after_block = create_empty_block(ctx);

        pan_block_add_successor(&before_block->base, &then_block->base);
        pan_block_add_successor(&before_block->base, &else_block->base);

        pan_block_add_successor(&end_then_block->base, &ctx->after_block->base);
        pan_block_add_successor(&end_else_block->base, &ctx->after_block->base);
}

static void
emit_loop(struct compiler_context *ctx, nir_loop *nloop)
{
        /* Remember where we are */
        midgard_block *start_block = ctx->current_block;

        /* Allocate a loop number, growing the current inner loop depth */
        int loop_idx = ++ctx->current_loop_depth;

        /* Get index from before the body so we can loop back later */
        int start_idx = ctx->block_count;

        /* Emit the body itself */
        midgard_block *loop_block = emit_cf_list(ctx, &nloop->body);

        /* Branch back to the top of the loop */
        struct midgard_instruction br_back = v_branch(false, false);
        br_back.branch.target_block = start_idx;
        emit_mir_instruction(ctx, br_back);

        /* Mark down that branch in the graph. */
        pan_block_add_successor(&start_block->base, &loop_block->base);
        pan_block_add_successor(&ctx->current_block->base, &loop_block->base);

        /* Find the index of the block about to follow us (note: we don't add
         * one; blocks are 0-indexed so we get a fencepost problem) */
        int break_block_idx = ctx->block_count;

        /* Fix up the break statements we emitted to point to the right place,
         * now that we can allocate a block number for them */
        ctx->after_block = create_empty_block(ctx);

        mir_foreach_block_from(ctx, start_block, _block) {
                mir_foreach_instr_in_block(((midgard_block *) _block), ins) {
                        if (ins->type != TAG_ALU_4) continue;
                        if (!ins->compact_branch) continue;

                        /* We found a branch -- check the type to see if we need to do anything */
                        if (ins->branch.target_type != TARGET_BREAK) continue;

                        /* It's a break! Check if it's our break */
                        if (ins->branch.target_break != loop_idx) continue;

                        /* Okay, cool, we're breaking out of this loop.
                         * Rewrite from a break to a goto */

                        ins->branch.target_type = TARGET_GOTO;
                        ins->branch.target_block = break_block_idx;

                        pan_block_add_successor(_block, &ctx->after_block->base);
                }
        }

        /* Now that we've finished emitting the loop, free up the depth again
         * so we play nice with recursion amid nested loops */
        --ctx->current_loop_depth;

        /* Count the loop for the stats dump */
        ++ctx->loop_count;
}

static midgard_block *
emit_cf_list(struct compiler_context *ctx, struct exec_list *list)
{
        midgard_block *start_block = NULL;

        foreach_list_typed(nir_cf_node, node, node, list) {
                switch (node->type) {
                case nir_cf_node_block: {
                        midgard_block *block = emit_block(ctx, nir_cf_node_as_block(node));

                        if (!start_block)
                                start_block = block;

                        break;
                }

                case nir_cf_node_if:
                        emit_if(ctx, nir_cf_node_as_if(node));
                        break;

                case nir_cf_node_loop:
                        emit_loop(ctx, nir_cf_node_as_loop(node));
                        break;

                case nir_cf_node_function:
                        assert(0);
                        break;
                }
        }

        return start_block;
}

/* Due to lookahead, we need to report the first tag executed in the command
 * stream and in branch targets. An initial block might be empty, so iterate
 * until we find one that 'works' */

unsigned
midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx)
{
        midgard_block *initial_block = mir_get_block(ctx, block_idx);

        mir_foreach_block_from(ctx, initial_block, _v) {
                midgard_block *v = (midgard_block *) _v;
                if (v->quadword_count) {
                        midgard_bundle *initial_bundle =
                                util_dynarray_element(&v->bundles, midgard_bundle, 0);

                        return initial_bundle->tag;
                }
        }

        /* Default to a tag 1 which will break from the shader, in case we jump
         * to the exit block (i.e. `return` in a compute shader) */

        return 1;
}

/* For each fragment writeout instruction, generate a writeout loop to
 * associate with it */

static void
mir_add_writeout_loops(compiler_context *ctx)
{
        for (unsigned rt = 0; rt < ARRAY_SIZE(ctx->writeout_branch); ++rt) {
                for (unsigned s = 0; s < MIDGARD_MAX_SAMPLE_ITER; ++s) {
                        midgard_instruction *br = ctx->writeout_branch[rt][s];
                        if (!br) continue;

                        unsigned popped = br->branch.target_block;
                        pan_block_add_successor(&(mir_get_block(ctx, popped - 1)->base),
                                                &ctx->current_block->base);
                        br->branch.target_block = emit_fragment_epilogue(ctx, rt, s);
                        br->branch.target_type = TARGET_GOTO;

                        /* If we have more RTs, we'll need to restore back after our
                         * loop terminates */
                        midgard_instruction *next_br = NULL;

                        if ((s + 1) < MIDGARD_MAX_SAMPLE_ITER)
                                next_br = ctx->writeout_branch[rt][s + 1];

                        if (!next_br && (rt + 1) < ARRAY_SIZE(ctx->writeout_branch))
                                next_br = ctx->writeout_branch[rt + 1][0];

                        if (next_br) {
                                midgard_instruction uncond = v_branch(false, false);
                                uncond.branch.target_block = popped;
                                uncond.branch.target_type = TARGET_GOTO;
                                emit_mir_instruction(ctx, uncond);
                                pan_block_add_successor(&ctx->current_block->base,
                                                        &(mir_get_block(ctx, popped)->base));
                                schedule_barrier(ctx);
                        } else {
                                /* We're last, so we can terminate here */
                                br->last_writeout = true;
                        }
                }
        }
}

void
midgard_compile_shader_nir(nir_shader *nir,
                           const struct panfrost_compile_inputs *inputs,
                           struct util_dynarray *binary,
                           struct pan_shader_info *info)
{
        midgard_debug = debug_get_option_midgard_debug();

        /* TODO: Bound against what? */
        compiler_context *ctx = rzalloc(NULL, compiler_context);
        ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals,
                                                  inputs->fixed_sysval_layout,
                                                  ctx);

        ctx->inputs = inputs;
        ctx->nir = nir;
        ctx->info = info;
        ctx->stage = nir->info.stage;

        if (inputs->is_blend) {
                unsigned nr_samples = MAX2(inputs->blend.nr_samples, 1);
                const struct util_format_description *desc =
                        util_format_description(inputs->rt_formats[inputs->blend.rt]);

                /* We have to split writeout in 128 bit chunks */
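                /* e.g. an RGBA32F target (128 bits per pixel) at 4x MSAA needs
                 * DIV_ROUND_UP(128 * 4, 128) = 4 iterations, while RGBA8 at a
                 * single sample needs just one. */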
                ctx->blend_sample_iterations =
                        DIV_ROUND_UP(desc->block.bits * nr_samples, 128);
        }
        ctx->blend_input = ~0;
        ctx->blend_src1 = ~0;
        ctx->quirks = midgard_get_quirks(inputs->gpu_id);

        /* Initialize hash tables at a global (not per-block) level */

        ctx->ssa_constants = _mesa_hash_table_u64_create(ctx);

        /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
         * (so we don't accidentally duplicate the epilogue since mesa/st has
         * messed with our I/O quite a bit already) */

        NIR_PASS_V(nir, nir_lower_vars_to_ssa);

        if (ctx->stage == MESA_SHADER_VERTEX) {
                NIR_PASS_V(nir, nir_lower_viewport_transform);
                NIR_PASS_V(nir, nir_lower_point_size, 1.0, 0.0);
        }

        NIR_PASS_V(nir, nir_lower_var_copies);
        NIR_PASS_V(nir, nir_lower_vars_to_ssa);
        NIR_PASS_V(nir, nir_split_var_copies);
        NIR_PASS_V(nir, nir_lower_var_copies);
        NIR_PASS_V(nir, nir_lower_global_vars_to_local);
        NIR_PASS_V(nir, nir_lower_var_copies);
        NIR_PASS_V(nir, nir_lower_vars_to_ssa);

        NIR_PASS_V(nir, pan_lower_framebuffer,
                   inputs->rt_formats, inputs->raw_fmt_mask,
                   inputs->is_blend, ctx->quirks & MIDGARD_BROKEN_BLEND_LOADS);

        NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
                        glsl_type_size, 0);
        NIR_PASS_V(nir, nir_lower_ssbo);
        NIR_PASS_V(nir, pan_nir_lower_zs_store);

        NIR_PASS_V(nir, pan_nir_lower_64bit_intrin);

        NIR_PASS_V(nir, midgard_nir_lower_global_load);

        /* Optimisation passes */

        optimise_nir(nir, ctx->quirks, inputs->is_blend, inputs->is_blit);

        bool skip_internal = nir->info.internal;
        skip_internal &= !(midgard_debug & MIDGARD_DBG_INTERNAL);

        if (midgard_debug & MIDGARD_DBG_SHADERS && !skip_internal)
                nir_print_shader(nir, stdout);

        info->tls_size = nir->scratch_size;

        nir_foreach_function(func, nir) {
                if (!func->impl)
                        continue;

                list_inithead(&ctx->blocks);
                ctx->block_count = 0;
                ctx->func = func;
                ctx->already_emitted = calloc(BITSET_WORDS(func->impl->ssa_alloc), sizeof(BITSET_WORD));

                if (nir->info.outputs_read && !inputs->is_blend) {
                        emit_block_init(ctx);

                        struct midgard_instruction wait = v_branch(false, false);
                        wait.branch.target_type = TARGET_TILEBUF_WAIT;

                        emit_mir_instruction(ctx, wait);

                        ++ctx->instruction_count;
                }

                emit_cf_list(ctx, &func->impl->body);
                free(ctx->already_emitted);
                break; /* TODO: Multi-function shaders */
        }

        /* Per-block lowering before opts */

        mir_foreach_block(ctx, _block) {
                midgard_block *block = (midgard_block *) _block;
                inline_alu_constants(ctx, block);
                embedded_to_inline_constant(ctx, block);
        }
        /* MIR-level optimizations */

        bool progress = false;

        do {
                progress = false;
                progress |= midgard_opt_dead_code_eliminate(ctx);

                mir_foreach_block(ctx, _block) {
                        midgard_block *block = (midgard_block *) _block;
                        progress |= midgard_opt_copy_prop(ctx, block);
                        progress |= midgard_opt_combine_projection(ctx, block);
                        progress |= midgard_opt_varying_projection(ctx, block);
                }
        } while (progress);

        mir_foreach_block(ctx, _block) {
                midgard_block *block = (midgard_block *) _block;
                midgard_lower_derivatives(ctx, block);
                midgard_legalize_invert(ctx, block);
                midgard_cull_dead_branch(ctx, block);
        }

        if (ctx->stage == MESA_SHADER_FRAGMENT)
                mir_add_writeout_loops(ctx);

        /* Analyze now that the code is known but before scheduling creates
         * pipeline registers which are harder to track */
        mir_analyze_helper_requirements(ctx);

        if (midgard_debug & MIDGARD_DBG_SHADERS && !skip_internal)
                mir_print_shader(ctx);

        /* Schedule! */
        midgard_schedule_program(ctx);
        mir_ra(ctx);

        if (midgard_debug & MIDGARD_DBG_SHADERS && !skip_internal)
                mir_print_shader(ctx);

        /* Analyze after scheduling since this is order-dependent */
        mir_analyze_helper_terminate(ctx);

        /* Emit flat binary from the instruction arrays. Iterate each block in
         * sequence. Save instruction boundaries such that lookahead tags can
         * be assigned easily */

        /* Cache _all_ bundles in source order for lookahead across failed branches */

        int bundle_count = 0;
        mir_foreach_block(ctx, _block) {
                midgard_block *block = (midgard_block *) _block;
                bundle_count += block->bundles.size / sizeof(midgard_bundle);
        }
        midgard_bundle **source_order_bundles = malloc(sizeof(midgard_bundle *) * bundle_count);
        int bundle_idx = 0;
        mir_foreach_block(ctx, _block) {
                midgard_block *block = (midgard_block *) _block;
                util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
                        source_order_bundles[bundle_idx++] = bundle;
                }
        }

        int current_bundle = 0;

        /* Midgard prefetches instruction types, so during emission we
         * need to look ahead. Unless this is the last instruction, in
         * which case we use 1. */
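        /* Concretely, each bundle is emitted with the tag of the bundle that
         * follows it in source order so the hardware can prefetch the right
         * unit; the final bundle (or a last writeout) gets a lookahead of 1,
         * matching the break-from-shader default tag used above. */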

        mir_foreach_block(ctx, _block) {
                midgard_block *block = (midgard_block *) _block;
                mir_foreach_bundle_in_block(block, bundle) {
                        int lookahead = 1;

                        if (!bundle->last_writeout && (current_bundle + 1 < bundle_count))
                                lookahead = source_order_bundles[current_bundle + 1]->tag;

                        emit_binary_bundle(ctx, block, bundle, binary, lookahead);
                        ++current_bundle;
                }

                /* TODO: Free deeper */
                //util_dynarray_fini(&block->instructions);
        }

        free(source_order_bundles);

        /* Report the very first tag executed */
        info->midgard.first_tag = midgard_get_first_tag_from_block(ctx, 0);

        info->ubo_mask = ctx->ubo_mask & ((1 << ctx->nir->info.num_ubos) - 1);

        if (midgard_debug & MIDGARD_DBG_SHADERS && !skip_internal) {
                disassemble_midgard(stdout, binary->data,
                                    binary->size, inputs->gpu_id,
                                    midgard_debug & MIDGARD_DBG_VERBOSE);
                fflush(stdout);
        }

        /* A shader ending on a 16MB boundary causes INSTR_INVALID_PC faults,
         * workaround by adding some padding to the end of the shader. (The
         * kernel makes sure shader BOs can't cross 16MB boundaries.) */
        if (binary->size)
                memset(util_dynarray_grow(binary, uint8_t, 16), 0, 16);

        if ((midgard_debug & MIDGARD_DBG_SHADERDB || inputs->shaderdb) &&
            !nir->info.internal) {
                unsigned nr_bundles = 0, nr_ins = 0;

                /* Count instructions and bundles */

                mir_foreach_block(ctx, _block) {
                        midgard_block *block = (midgard_block *) _block;
                        nr_bundles += util_dynarray_num_elements(
                                              &block->bundles, midgard_bundle);

                        mir_foreach_bundle_in_block(block, bun)
                                nr_ins += bun->instruction_count;
                }

                /* Calculate thread count. There are certain cutoffs by
                 * register count for thread count */
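                /* e.g. a shader needing 6 work registers falls into the <= 8
                 * bucket below and runs 2 threads; more than 8 drops to 1. */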

                unsigned nr_registers = info->work_reg_count;

                unsigned nr_threads =
                        (nr_registers <= 4) ? 4 :
                        (nr_registers <= 8) ? 2 :
                        1;

                /* Dump stats */

                fprintf(stderr, "%s - %s shader: "
                        "%u inst, %u bundles, %u quadwords, "
                        "%u registers, %u threads, %u loops, "
                        "%u:%u spills:fills\n",
                        ctx->nir->info.label ?: "",
                        ctx->inputs->is_blend ? "PAN_SHADER_BLEND" :
                        gl_shader_stage_name(ctx->stage),
                        nr_ins, nr_bundles, ctx->quadword_count,
                        nr_registers, nr_threads,
                        ctx->loop_count,
                        ctx->spills, ctx->fills);
        }

        _mesa_hash_table_u64_destroy(ctx->ssa_constants);
        _mesa_hash_table_u64_destroy(ctx->sysval_to_id);

        ralloc_free(ctx);
}
