1/*
2 * Copyright © 2015 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "compiler/v3d_compiler.h"
25#include "compiler/nir/nir_builder.h"
26
27#include "util/u_helpers.h"
28
29/**
30 * Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io
31 * intrinsics into something amenable to the V3D architecture.
32 *
33 * Most of the work is turning the VS's store_output intrinsics from working
34 * on a base representing the gallium-level vec4 driver_location to an offset
35 * within the VPM, and emitting the header that's read by the fixed function
36 * hardware between the VS and FS.
37 *
38 * We also adjust the offsets on uniform loads to be in bytes, since that's
39 * what we need for indirect addressing with general TMU access.
40 */
41
42struct v3d_nir_lower_io_state {
43        int pos_vpm_offset;
44        int vp_vpm_offset;
45        int zs_vpm_offset;
46        int rcp_wc_vpm_offset;
47        int psiz_vpm_offset;
48        int varyings_vpm_offset;
49
50        /* Geometry shader state */
51        struct {
52                /* VPM offset for the current vertex data output */
53                nir_variable *output_offset_var;
54                /* VPM offset for the current vertex header */
55                nir_variable *header_offset_var;
56                /* VPM header for the current vertex */
57                nir_variable *header_var;
58
59                /* Size of the complete VPM output header */
60                uint32_t output_header_size;
61                /* Size of the output data for a single vertex */
62                uint32_t output_vertex_data_size;
63        } gs;
64
65        BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_ANY_STAGE_INPUTS)];
66
67        nir_ssa_def *pos[4];
68};
69
70static void
71v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
72                            struct v3d_nir_lower_io_state *state);
73
74static void
75v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset,
76                     nir_ssa_def *chan)
77{
78        if (offset) {
79                /* When generating the VIR instruction, the base and the offset
80                 * are just going to get added together with an ADD instruction
81                 * so we might as well do the add here at the NIR level instead
82                 * and let the constant folding do its magic.
83                 */
84                offset = nir_iadd_imm(b, offset, base);
85                base = 0;
86        } else {
87                offset = nir_imm_int(b, 0);
88        }
89
90        nir_store_output(b, chan, offset, .base = base, .write_mask = 0x1, .component = 0);
91}
92
93/* Convert the uniform offset to bytes.  If it happens to be a constant,
94 * constant-folding will clean up the shift for us.
95 */
96static void
97v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b,
98                      nir_intrinsic_instr *intr)
99{
100        /* On SPIR-V/Vulkan we are already getting our offsets in
101         * bytes.
102         */
103        if (c->key->environment == V3D_ENVIRONMENT_VULKAN)
104                return;
105
106        b->cursor = nir_before_instr(&intr->instr);
107
108        nir_intrinsic_set_base(intr, nir_intrinsic_base(intr) * 16);
109
110        nir_instr_rewrite_src(&intr->instr,
111                              &intr->src[0],
112                              nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa,
113                                                       nir_imm_int(b, 4))));
114}
115
116static int
117v3d_varying_slot_vpm_offset(struct v3d_compile *c, unsigned location, unsigned component)
118{
119        uint32_t num_used_outputs = 0;
120        struct v3d_varying_slot *used_outputs = NULL;
121        switch (c->s->info.stage) {
122        case MESA_SHADER_VERTEX:
123                num_used_outputs = c->vs_key->num_used_outputs;
124                used_outputs = c->vs_key->used_outputs;
125                break;
126        case MESA_SHADER_GEOMETRY:
127                num_used_outputs = c->gs_key->num_used_outputs;
128                used_outputs = c->gs_key->used_outputs;
129                break;
130        default:
131                unreachable("Unsupported shader stage");
132        }
133
134        for (int i = 0; i < num_used_outputs; i++) {
135                struct v3d_varying_slot slot = used_outputs[i];
136
137                if (v3d_slot_get_slot(slot) == location &&
138                    v3d_slot_get_component(slot) == component) {
139                        return i;
140                }
141        }
142
143        return -1;
144}
145
146/* Lowers a store_output(gallium driver location) to a series of store_outputs
147 * with a driver_location equal to the offset in the VPM.
148 *
149 * For geometry shaders we need to emit multiple vertices so the VPM offsets
150 * need to be computed in the shader code based on the current vertex index.
151 */
152static void
153v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
154                         nir_intrinsic_instr *intr,
155                         struct v3d_nir_lower_io_state *state)
156{
157        b->cursor = nir_before_instr(&intr->instr);
158
159        /* If this is a geometry shader we need to emit our outputs
160         * to the current vertex offset in the VPM.
161         */
162        nir_ssa_def *offset_reg =
163                c->s->info.stage == MESA_SHADER_GEOMETRY ?
164                        nir_load_var(b, state->gs.output_offset_var) : NULL;
165
166        int start_comp = nir_intrinsic_component(intr);
167        unsigned location = nir_intrinsic_io_semantics(intr).location;
168        nir_ssa_def *src = nir_ssa_for_src(b, intr->src[0],
169                                           intr->num_components);
170        /* Save off the components of the position for the setup of VPM inputs
171         * read by fixed function HW.
172         */
173        if (location == VARYING_SLOT_POS) {
174                for (int i = 0; i < intr->num_components; i++) {
175                        state->pos[start_comp + i] = nir_channel(b, src, i);
176                }
177        }
178
179        /* Just psiz to the position in the FF header right now. */
180        if (location == VARYING_SLOT_PSIZ &&
181            state->psiz_vpm_offset != -1) {
182                v3d_nir_store_output(b, state->psiz_vpm_offset, offset_reg, src);
183        }
184
185        if (location == VARYING_SLOT_LAYER) {
186                assert(c->s->info.stage == MESA_SHADER_GEOMETRY);
187                nir_ssa_def *header = nir_load_var(b, state->gs.header_var);
188                header = nir_iand(b, header, nir_imm_int(b, 0xff00ffff));
189
190                /* From the GLES 3.2 spec:
191                 *
192                 *    "When fragments are written to a layered framebuffer, the
193                 *     fragment’s layer number selects an image from the array
194                 *     of images at each attachment (...). If the fragment’s
195                 *     layer number is negative, or greater than or equal to
196                 *     the minimum number of layers of any attachment, the
197                 *     effects of the fragment on the framebuffer contents are
198                 *     undefined."
199                 *
200                 * This suggests we can just ignore that situation, however,
201                 * for V3D an out-of-bounds layer index means that the binner
202                 * might do out-of-bounds writes access to the tile state. The
203                 * simulator has an assert to catch this, so we play safe here
204                 * and we make sure that doesn't happen by setting gl_Layer
205                 * to 0 in that case (we always allocate tile state for at
206                 * least one layer).
207                 */
208                nir_ssa_def *fb_layers = nir_load_fb_layers_v3d(b, 32);
209                nir_ssa_def *cond = nir_ige(b, src, fb_layers);
210                nir_ssa_def *layer_id =
211                        nir_bcsel(b, cond,
212                                  nir_imm_int(b, 0),
213                                  nir_ishl(b, src, nir_imm_int(b, 16)));
214                header = nir_ior(b, header, layer_id);
215                nir_store_var(b, state->gs.header_var, header, 0x1);
216        }
217
218        /* Scalarize outputs if it hasn't happened already, since we want to
219         * schedule each VPM write individually.  We can skip any outut
220         * components not read by the FS.
221         */
222        for (int i = 0; i < intr->num_components; i++) {
223                int vpm_offset =
224                        v3d_varying_slot_vpm_offset(c, location, start_comp + i);
225
226
227                if (vpm_offset == -1)
228                        continue;
229
230                if (nir_src_is_const(intr->src[1]))
231                    vpm_offset += nir_src_as_uint(intr->src[1]) * 4;
232
233                BITSET_SET(state->varyings_stored, vpm_offset);
234
235                v3d_nir_store_output(b, state->varyings_vpm_offset + vpm_offset,
236                                     offset_reg, nir_channel(b, src, i));
237        }
238
239        nir_instr_remove(&intr->instr);
240}
241
242static inline void
243reset_gs_header(nir_builder *b, struct v3d_nir_lower_io_state *state)
244{
245        const uint8_t NEW_PRIMITIVE_OFFSET = 0;
246        const uint8_t VERTEX_DATA_LENGTH_OFFSET = 8;
247
248        uint32_t vertex_data_size = state->gs.output_vertex_data_size;
249        assert((vertex_data_size & 0xffffff00) == 0);
250
251        uint32_t header;
252        header  = 1 << NEW_PRIMITIVE_OFFSET;
253        header |= vertex_data_size << VERTEX_DATA_LENGTH_OFFSET;
254        nir_store_var(b, state->gs.header_var, nir_imm_int(b, header), 0x1);
255}
256
257static void
258v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b,
259                          nir_intrinsic_instr *instr,
260                          struct v3d_nir_lower_io_state *state)
261{
262        b->cursor = nir_before_instr(&instr->instr);
263
264        nir_ssa_def *header = nir_load_var(b, state->gs.header_var);
265        nir_ssa_def *header_offset = nir_load_var(b, state->gs.header_offset_var);
266        nir_ssa_def *output_offset = nir_load_var(b, state->gs.output_offset_var);
267
268        /* Emit fixed function outputs */
269        v3d_nir_emit_ff_vpm_outputs(c, b, state);
270
271        /* Emit vertex header */
272        v3d_nir_store_output(b, 0, header_offset, header);
273
274        /* Update VPM offset for next vertex output data and header */
275        output_offset =
276                nir_iadd(b, output_offset,
277                            nir_imm_int(b, state->gs.output_vertex_data_size));
278
279        header_offset = nir_iadd(b, header_offset, nir_imm_int(b, 1));
280
281        /* Reset the New Primitive bit */
282        header = nir_iand(b, header, nir_imm_int(b, 0xfffffffe));
283
284        nir_store_var(b, state->gs.output_offset_var, output_offset, 0x1);
285        nir_store_var(b, state->gs.header_offset_var, header_offset, 0x1);
286        nir_store_var(b, state->gs.header_var, header, 0x1);
287
288        nir_instr_remove(&instr->instr);
289}
290
291static void
292v3d_nir_lower_end_primitive(struct v3d_compile *c, nir_builder *b,
293                            nir_intrinsic_instr *instr,
294                            struct v3d_nir_lower_io_state *state)
295{
296        assert(state->gs.header_var);
297        b->cursor = nir_before_instr(&instr->instr);
298        reset_gs_header(b, state);
299
300        nir_instr_remove(&instr->instr);
301}
302
303/* Some vertex attribute formats may require to apply a swizzle but the hardware
304 * doesn't provide means to do that, so we need to apply the swizzle in the
305 * vertex shader.
306 *
307 * This is required at least in Vulkan to support madatory vertex attribute
308 * format VK_FORMAT_B8G8R8A8_UNORM.
309 */
310static void
311v3d_nir_lower_vertex_input(struct v3d_compile *c, nir_builder *b,
312                           nir_intrinsic_instr *instr)
313{
314        assert(c->s->info.stage == MESA_SHADER_VERTEX);
315
316        if (!c->vs_key->va_swap_rb_mask)
317                return;
318
319        const uint32_t location = nir_intrinsic_io_semantics(instr).location;
320
321        if (!(c->vs_key->va_swap_rb_mask & (1 << location)))
322                return;
323
324        assert(instr->num_components == 1);
325        const uint32_t comp = nir_intrinsic_component(instr);
326        if (comp == 0 || comp == 2)
327                nir_intrinsic_set_component(instr, (comp + 2) % 4);
328}
329
330/* Sometimes the origin of gl_PointCoord is in the upper left rather than the
331 * lower left so we need to flip it.
332 *
333 * This is needed for Vulkan, Gallium uses lower_wpos_pntc.
334 */
335static void
336v3d_nir_lower_fragment_input(struct v3d_compile *c, nir_builder *b,
337                             nir_intrinsic_instr *intr)
338{
339        assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
340
341        /* Gallium uses lower_wpos_pntc */
342        if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
343                return;
344
345        b->cursor = nir_after_instr(&intr->instr);
346
347        int comp = nir_intrinsic_component(intr);
348
349        nir_variable *input_var =
350                nir_find_variable_with_driver_location(c->s,
351                                                       nir_var_shader_in,
352                                                       nir_intrinsic_base(intr));
353
354        if (input_var && util_varying_is_point_coord(input_var->data.location,
355                                                     c->fs_key->point_sprite_mask)) {
356                assert(intr->num_components == 1);
357
358                nir_ssa_def *result = &intr->dest.ssa;
359
360                switch (comp) {
361                case 0:
362                case 1:
363                        if (!c->fs_key->is_points)
364                                result = nir_imm_float(b, 0.0);
365                        break;
366                case 2:
367                        result = nir_imm_float(b, 0.0);
368                        break;
369                case 3:
370                        result = nir_imm_float(b, 1.0);
371                        break;
372                }
373                if (c->fs_key->point_coord_upper_left && comp == 1)
374                        result = nir_fsub(b, nir_imm_float(b, 1.0), result);
375                if (result != &intr->dest.ssa) {
376                        nir_ssa_def_rewrite_uses_after(&intr->dest.ssa,
377                                                       result,
378                                                       result->parent_instr);
379                }
380        }
381}
382
383static void
384v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
385                       struct nir_instr *instr,
386                       struct v3d_nir_lower_io_state *state)
387{
388        if (instr->type != nir_instr_type_intrinsic)
389                return;
390        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
391
392        switch (intr->intrinsic) {
393        case nir_intrinsic_load_input:
394                if (c->s->info.stage == MESA_SHADER_VERTEX)
395                        v3d_nir_lower_vertex_input(c, b, intr);
396                else if (c->s->info.stage == MESA_SHADER_FRAGMENT)
397                        v3d_nir_lower_fragment_input(c, b, intr);
398                break;
399
400        case nir_intrinsic_load_uniform:
401                v3d_nir_lower_uniform(c, b, intr);
402                break;
403
404        case nir_intrinsic_store_output:
405                if (c->s->info.stage == MESA_SHADER_VERTEX ||
406                    c->s->info.stage == MESA_SHADER_GEOMETRY) {
407                        v3d_nir_lower_vpm_output(c, b, intr, state);
408                }
409                break;
410
411        case nir_intrinsic_emit_vertex:
412                v3d_nir_lower_emit_vertex(c, b, intr, state);
413                break;
414
415        case nir_intrinsic_end_primitive:
416                v3d_nir_lower_end_primitive(c, b, intr, state);
417                break;
418
419        default:
420                break;
421        }
422}
423
424/* Remap the output var's .driver_location.  This is purely for
425 * nir_print_shader() so that store_output can map back to a variable name.
426 */
427static void
428v3d_nir_lower_io_update_output_var_base(struct v3d_compile *c,
429                                        struct v3d_nir_lower_io_state *state)
430{
431        nir_foreach_shader_out_variable_safe(var, c->s) {
432                if (var->data.location == VARYING_SLOT_POS &&
433                    state->pos_vpm_offset != -1) {
434                        var->data.driver_location = state->pos_vpm_offset;
435                        continue;
436                }
437
438                if (var->data.location == VARYING_SLOT_PSIZ &&
439                    state->psiz_vpm_offset != -1) {
440                        var->data.driver_location = state->psiz_vpm_offset;
441                        continue;
442                }
443
444                int vpm_offset =
445                        v3d_varying_slot_vpm_offset(c,
446                                                    var->data.location,
447                                                    var->data.location_frac);
448                if (vpm_offset != -1) {
449                        var->data.driver_location =
450                                state->varyings_vpm_offset + vpm_offset;
451                } else {
452                        /* If we couldn't find a mapping for the var, delete
453                         * it so that its old .driver_location doesn't confuse
454                         * nir_print_shader().
455                         */
456                        exec_node_remove(&var->node);
457                }
458        }
459}
460
461static void
462v3d_nir_setup_vpm_layout_vs(struct v3d_compile *c,
463                            struct v3d_nir_lower_io_state *state)
464{
465        uint32_t vpm_offset = 0;
466
467        state->pos_vpm_offset = -1;
468        state->vp_vpm_offset = -1;
469        state->zs_vpm_offset = -1;
470        state->rcp_wc_vpm_offset = -1;
471        state->psiz_vpm_offset = -1;
472
473        bool needs_ff_outputs = c->vs_key->base.is_last_geometry_stage;
474        if (needs_ff_outputs) {
475                if (c->vs_key->is_coord) {
476                        state->pos_vpm_offset = vpm_offset;
477                        vpm_offset += 4;
478                }
479
480                state->vp_vpm_offset = vpm_offset;
481                vpm_offset += 2;
482
483                if (!c->vs_key->is_coord) {
484                        state->zs_vpm_offset = vpm_offset++;
485                        state->rcp_wc_vpm_offset = vpm_offset++;
486                }
487
488                if (c->vs_key->per_vertex_point_size)
489                        state->psiz_vpm_offset = vpm_offset++;
490        }
491
492        state->varyings_vpm_offset = vpm_offset;
493
494        c->vpm_output_size = MAX2(1, vpm_offset + c->vs_key->num_used_outputs);
495}
496
497static void
498v3d_nir_setup_vpm_layout_gs(struct v3d_compile *c,
499                            struct v3d_nir_lower_io_state *state)
500{
501        /* 1 header slot for number of output vertices */
502        uint32_t vpm_offset = 1;
503
504        /* 1 header slot per output vertex */
505        const uint32_t num_vertices = c->s->info.gs.vertices_out;
506        vpm_offset += num_vertices;
507
508        state->gs.output_header_size = vpm_offset;
509
510        /* Vertex data: here we only compute offsets into a generic vertex data
511         * elements. When it is time to actually write a particular vertex to
512         * the VPM, we will add the offset for that vertex into the VPM output
513         * to these offsets.
514         *
515         * If geometry shaders are present, they are always the last shader
516         * stage before rasterization, so we always emit fixed function outputs.
517         */
518        vpm_offset = 0;
519        if (c->gs_key->is_coord) {
520                state->pos_vpm_offset = vpm_offset;
521                vpm_offset += 4;
522        } else {
523                state->pos_vpm_offset = -1;
524        }
525
526        state->vp_vpm_offset = vpm_offset;
527        vpm_offset += 2;
528
529        if (!c->gs_key->is_coord) {
530                state->zs_vpm_offset = vpm_offset++;
531                state->rcp_wc_vpm_offset = vpm_offset++;
532        } else {
533                state->zs_vpm_offset = -1;
534                state->rcp_wc_vpm_offset = -1;
535        }
536
537        /* Mesa enables OES_geometry_shader_point_size automatically with
538         * OES_geometry_shader so we always need to handle point size
539         * writes if present.
540         */
541        if (c->gs_key->per_vertex_point_size)
542                state->psiz_vpm_offset = vpm_offset++;
543
544        state->varyings_vpm_offset = vpm_offset;
545
546        state->gs.output_vertex_data_size =
547                state->varyings_vpm_offset + c->gs_key->num_used_outputs;
548
549        c->vpm_output_size =
550                state->gs.output_header_size +
551                state->gs.output_vertex_data_size * num_vertices;
552}
553
554static void
555v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
556                            struct v3d_nir_lower_io_state *state)
557{
558        /* If this is a geometry shader we need to emit our fixed function
559         * outputs to the current vertex offset in the VPM.
560         */
561        nir_ssa_def *offset_reg =
562                c->s->info.stage == MESA_SHADER_GEOMETRY ?
563                        nir_load_var(b, state->gs.output_offset_var) : NULL;
564
565        for (int i = 0; i < 4; i++) {
566                if (!state->pos[i])
567                        state->pos[i] = nir_ssa_undef(b, 1, 32);
568        }
569
570        nir_ssa_def *rcp_wc = nir_frcp(b, state->pos[3]);
571
572        if (state->pos_vpm_offset != -1) {
573                for (int i = 0; i < 4; i++) {
574                        v3d_nir_store_output(b, state->pos_vpm_offset + i,
575                                             offset_reg, state->pos[i]);
576                }
577        }
578
579        if (state->vp_vpm_offset != -1) {
580                for (int i = 0; i < 2; i++) {
581                        nir_ssa_def *pos;
582                        nir_ssa_def *scale;
583                        pos = state->pos[i];
584                        if (i == 0)
585                                scale = nir_load_viewport_x_scale(b);
586                        else
587                                scale = nir_load_viewport_y_scale(b);
588                        pos = nir_fmul(b, pos, scale);
589                        pos = nir_fmul(b, pos, rcp_wc);
590                        /* Pre-V3D 4.3 hardware has a quirk where it expects XY
591                         * coordinates in .8 fixed-point format, but then it
592                         * will internally round it to .6 fixed-point,
593                         * introducing a double rounding. The double rounding
594                         * can cause very slight differences in triangle
595                         * raterization coverage that can actually be noticed by
596                         * some CTS tests.
597                         *
598                         * The correct fix for this as recommended by Broadcom
599                         * is to convert to .8 fixed-point with ffloor().
600                         */
601                        pos = nir_f2i32(b, nir_ffloor(b, pos));
602                        v3d_nir_store_output(b, state->vp_vpm_offset + i,
603                                             offset_reg, pos);
604                }
605        }
606
607        if (state->zs_vpm_offset != -1) {
608                nir_ssa_def *z = state->pos[2];
609                z = nir_fmul(b, z, nir_load_viewport_z_scale(b));
610                z = nir_fmul(b, z, rcp_wc);
611                z = nir_fadd(b, z, nir_load_viewport_z_offset(b));
612                v3d_nir_store_output(b, state->zs_vpm_offset, offset_reg, z);
613        }
614
615        if (state->rcp_wc_vpm_offset != -1) {
616                v3d_nir_store_output(b, state->rcp_wc_vpm_offset,
617                                     offset_reg, rcp_wc);
618        }
619
620        /* Store 0 to varyings requested by the FS but not stored by the
621         * previous stage. This should be undefined behavior, but
622         * glsl-routing seems to rely on it.
623         */
624        uint32_t num_used_outputs;
625        switch (c->s->info.stage) {
626        case MESA_SHADER_VERTEX:
627                num_used_outputs = c->vs_key->num_used_outputs;
628                break;
629        case MESA_SHADER_GEOMETRY:
630                num_used_outputs = c->gs_key->num_used_outputs;
631                break;
632        default:
633                unreachable("Unsupported shader stage");
634        }
635
636        for (int i = 0; i < num_used_outputs; i++) {
637                if (!BITSET_TEST(state->varyings_stored, i)) {
638                        v3d_nir_store_output(b, state->varyings_vpm_offset + i,
639                                             offset_reg, nir_imm_int(b, 0));
640                }
641        }
642}
643
644static void
645emit_gs_prolog(struct v3d_compile *c, nir_builder *b,
646               nir_function_impl *impl,
647               struct v3d_nir_lower_io_state *state)
648{
649        nir_block *first = nir_start_block(impl);
650        b->cursor = nir_before_block(first);
651
652        const struct glsl_type *uint_type = glsl_uint_type();
653
654        assert(!state->gs.output_offset_var);
655        state->gs.output_offset_var =
656                nir_local_variable_create(impl, uint_type, "output_offset");
657        nir_store_var(b, state->gs.output_offset_var,
658                      nir_imm_int(b, state->gs.output_header_size), 0x1);
659
660        assert(!state->gs.header_offset_var);
661        state->gs.header_offset_var =
662                nir_local_variable_create(impl, uint_type, "header_offset");
663        nir_store_var(b, state->gs.header_offset_var, nir_imm_int(b, 1), 0x1);
664
665        assert(!state->gs.header_var);
666        state->gs.header_var =
667                nir_local_variable_create(impl, uint_type, "header");
668        reset_gs_header(b, state);
669}
670
671static void
672emit_gs_vpm_output_header_prolog(struct v3d_compile *c, nir_builder *b,
673                                 struct v3d_nir_lower_io_state *state)
674{
675        const uint8_t VERTEX_COUNT_OFFSET = 16;
676
677        /* Our GS header has 1 generic header slot (at VPM offset 0) and then
678         * one slot per output vertex after it. This means we don't need to
679         * have a variable just to keep track of the number of vertices we
680         * emitted and instead we can just compute it here from the header
681         * offset variable by removing the one generic header slot that always
682         * goes at the begining of out header.
683         */
684        nir_ssa_def *header_offset =
685                nir_load_var(b, state->gs.header_offset_var);
686        nir_ssa_def *vertex_count =
687                nir_isub(b, header_offset, nir_imm_int(b, 1));
688        nir_ssa_def *header =
689                nir_ior(b, nir_imm_int(b, state->gs.output_header_size),
690                           nir_ishl(b, vertex_count,
691                                    nir_imm_int(b, VERTEX_COUNT_OFFSET)));
692
693        v3d_nir_store_output(b, 0, NULL, header);
694}
695
696bool
697v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
698{
699        struct v3d_nir_lower_io_state state = { 0 };
700
701        /* Set up the layout of the VPM outputs. */
702        switch (s->info.stage) {
703        case MESA_SHADER_VERTEX:
704                v3d_nir_setup_vpm_layout_vs(c, &state);
705                break;
706        case MESA_SHADER_GEOMETRY:
707                v3d_nir_setup_vpm_layout_gs(c, &state);
708                break;
709        case MESA_SHADER_FRAGMENT:
710        case MESA_SHADER_COMPUTE:
711                break;
712        default:
713                unreachable("Unsupported shader stage");
714        }
715
716        nir_foreach_function(function, s) {
717                if (function->impl) {
718                        nir_builder b;
719                        nir_builder_init(&b, function->impl);
720
721                        if (c->s->info.stage == MESA_SHADER_GEOMETRY)
722                                emit_gs_prolog(c, &b, function->impl, &state);
723
724                        nir_foreach_block(block, function->impl) {
725                                nir_foreach_instr_safe(instr, block)
726                                        v3d_nir_lower_io_instr(c, &b, instr,
727                                                               &state);
728                        }
729
730                        nir_block *last = nir_impl_last_block(function->impl);
731                        b.cursor = nir_after_block(last);
732                        if (s->info.stage == MESA_SHADER_VERTEX) {
733                                v3d_nir_emit_ff_vpm_outputs(c, &b, &state);
734                        } else if (s->info.stage == MESA_SHADER_GEOMETRY) {
735                                emit_gs_vpm_output_header_prolog(c, &b, &state);
736                        }
737
738                        nir_metadata_preserve(function->impl,
739                                              nir_metadata_block_index |
740                                              nir_metadata_dominance);
741                }
742        }
743
744        if (s->info.stage == MESA_SHADER_VERTEX ||
745            s->info.stage == MESA_SHADER_GEOMETRY) {
746                v3d_nir_lower_io_update_output_var_base(c, &state);
747        }
748
749        /* It is really unlikely that we don't get progress here, and fully
750         * filtering when not would make code more complex, but we are still
751         * interested on getting this lowering going through NIR_PASS
752         */
753        return true;
754}
755