/*
 * Copyright (C) 2020 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef __PAN_IR_H
#define __PAN_IR_H

#include <stdint.h>
#include "compiler/nir/nir.h"
#include "util/u_dynarray.h"
#include "util/hash_table.h"

/* On Valhall, the driver gives the hardware a table of resource tables.
 * Resources are addressed as the index of the table together with the index of
 * the resource within the table. For simplicity, we put one type of resource
 * in each table and fix the numbering of the tables.
 *
 * This numbering is arbitrary. It is a software ABI between the
 * Gallium driver and the Valhall compiler.
 */
enum pan_resource_table {
        PAN_TABLE_UBO = 0,
        PAN_TABLE_ATTRIBUTE,
        PAN_TABLE_ATTRIBUTE_BUFFER,
        PAN_TABLE_SAMPLER,
        PAN_TABLE_TEXTURE,
        PAN_TABLE_IMAGE,

        PAN_NUM_RESOURCE_TABLES
};

/* Indices for named (non-XFB) varyings that are present. These are packed
 * tightly so they correspond to a bitfield of present varyings (P) indexed by
 * (1 << PAN_VARY_*). This has the nice property that you can look up the
 * buffer index of a given special field for a shift S by:
 *
 *      idx = popcount(P & ((1 << S) - 1))
 *
 * That is, count the present varyings that come earlier; that count is the
 * buffer index. Likewise, the total number of special buffers required is
 * simply popcount(P).
 */

enum pan_special_varying {
        PAN_VARY_GENERAL = 0,
        PAN_VARY_POSITION = 1,
        PAN_VARY_PSIZ = 2,
        PAN_VARY_PNTCOORD = 3,
        PAN_VARY_FACE = 4,
        PAN_VARY_FRAGCOORD = 5,

        /* Keep last */
        PAN_VARY_MAX,
};
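
/*
 * Illustrative sketch (not part of this header's API) of the popcount lookup
 * described above, using Mesa's util_bitcount as the popcount:
 *
 *      uint32_t present = (1 << PAN_VARY_GENERAL) |
 *                         (1 << PAN_VARY_POSITION) |
 *                         (1 << PAN_VARY_PSIZ);
 *
 *      // Buffer index of PAN_VARY_PSIZ: count the present varyings before it.
 *      unsigned idx = util_bitcount(present & ((1 << PAN_VARY_PSIZ) - 1));
 *      // idx == 2, and util_bitcount(present) == 3 buffers are needed in total.
 */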

/* Maximum number of attribute descriptors required for varyings. These include
 * up to MAX_VARYING source-level varyings plus a descriptor for each
 * non-GENERAL special varying */
#define PAN_MAX_VARYINGS (MAX_VARYING + PAN_VARY_MAX - 1)

/* Define the general compiler entry point */

#define MAX_SYSVAL_COUNT 32

/* Sysval IDs are 2D: the low 16 bits hold the sysval type and the high 16 bits
 * hold a per-type index, so nonparametric sysvals compare equal to their
 * class */

#define PAN_SYSVAL(type, no) (((no) << 16) | PAN_SYSVAL_##type)
#define PAN_SYSVAL_TYPE(sysval) ((sysval) & 0xffff)
#define PAN_SYSVAL_ID(sysval) ((sysval) >> 16)

/* Define some common types. We start at one for easy indexing of hash
 * tables internal to the compiler */

enum {
        PAN_SYSVAL_VIEWPORT_SCALE = 1,
        PAN_SYSVAL_VIEWPORT_OFFSET = 2,
        PAN_SYSVAL_TEXTURE_SIZE = 3,
        PAN_SYSVAL_SSBO = 4,
        PAN_SYSVAL_NUM_WORK_GROUPS = 5,
        PAN_SYSVAL_SAMPLER = 7,
        PAN_SYSVAL_LOCAL_GROUP_SIZE = 8,
        PAN_SYSVAL_WORK_DIM = 9,
        PAN_SYSVAL_IMAGE_SIZE = 10,
        PAN_SYSVAL_SAMPLE_POSITIONS = 11,
        PAN_SYSVAL_MULTISAMPLED = 12,
        PAN_SYSVAL_RT_CONVERSION = 13,
        PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS = 14,
        PAN_SYSVAL_DRAWID = 15,
        PAN_SYSVAL_BLEND_CONSTANTS = 16,
        PAN_SYSVAL_XFB = 17,
        PAN_SYSVAL_NUM_VERTICES = 18,
};

#define PAN_TXS_SYSVAL_ID(texidx, dim, is_array)          \
        ((texidx) | ((dim) << 7) | ((is_array) ? (1 << 9) : 0))

#define PAN_SYSVAL_ID_TO_TXS_TEX_IDX(id)        ((id) & 0x7f)
#define PAN_SYSVAL_ID_TO_TXS_DIM(id)            (((id) >> 7) & 0x3)
#define PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(id)       !!((id) & (1 << 9))
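
/*
 * Worked example (illustrative only; the interpretation of "dim" is whatever
 * the backend packs into it): a texture-size sysval for texture index 5 with a
 * two-dimensional result and is_array set could be built and decoded as
 *
 *      unsigned id = PAN_TXS_SYSVAL_ID(5, 2, true);
 *      unsigned sysval = PAN_SYSVAL(TEXTURE_SIZE, id);
 *
 *      PAN_SYSVAL_TYPE(sysval) == PAN_SYSVAL_TEXTURE_SIZE
 *      PAN_SYSVAL_ID_TO_TXS_TEX_IDX(PAN_SYSVAL_ID(sysval)) == 5
 *      PAN_SYSVAL_ID_TO_TXS_DIM(PAN_SYSVAL_ID(sysval)) == 2
 *      PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(PAN_SYSVAL_ID(sysval)) == true
 */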

/* Special attribute slots for vertex builtins. Sort of arbitrary, but let's be
 * consistent with the blob so we can compare traces more easily. */

enum {
        PAN_VERTEX_ID   = 16,
        PAN_INSTANCE_ID = 17,
        PAN_MAX_ATTRIBUTE
};

struct panfrost_sysvals {
        /* The mapping of sysvals to uniforms and the count */
        unsigned sysvals[MAX_SYSVAL_COUNT];
        unsigned sysval_count;
};

/* Architecturally, Bifrost/Valhall can address 128 FAU slots of 64 bits each.
 * In practice, the maximum number of FAU slots is limited by the
 * implementation: all known Bifrost and Valhall devices limit it to 64 FAU
 * slots. Therefore the maximum number of 32-bit words is 128, since there are
 * 2 words per FAU slot.
 *
 * Midgard can push at most 92 words, so this bound suffices. The Midgard
 * compiler pushes less than this, as Midgard uses register-mapped uniforms
 * instead of FAU, which prevents pushing large numbers of uniforms for
 * nontrivial programs.
 */
#define PAN_MAX_PUSH 128

/* Architectural invariants (Midgard and Bifrost): a UBO must be <= 2^16 bytes,
 * so the offset of a word within a UBO fits in 16 bits. There are fewer than
 * 2^8 UBOs */

struct panfrost_ubo_word {
        uint16_t ubo;
        uint16_t offset;
};

struct panfrost_ubo_push {
        unsigned count;
        struct panfrost_ubo_word words[PAN_MAX_PUSH];
};

/* Helper for searching the above. Note this is O(N) in the number of pushed
 * constants, so do not call it in the draw-call hot path */

unsigned
pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs);
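
/*
 * A minimal sketch of the linear search that the O(N) note above implies (the
 * real implementation lives in the compiler sources; this only illustrates the
 * data structure):
 *
 *      for (unsigned i = 0; i < push->count; ++i) {
 *              if (push->words[i].ubo == ubo && push->words[i].offset == offs)
 *                      return i;
 *      }
 *      // Callers are expected to only look up words that were actually pushed.
 */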

struct hash_table_u64 *
panfrost_init_sysvals(struct panfrost_sysvals *sysvals,
                      struct panfrost_sysvals *fixed_sysvals,
                      void *memctx);

unsigned
pan_lookup_sysval(struct hash_table_u64 *sysval_to_id,
                  struct panfrost_sysvals *sysvals,
                  int sysval);

int
panfrost_sysval_for_instr(nir_instr *instr, nir_dest *dest);

struct panfrost_compile_inputs {
        unsigned gpu_id;
        bool is_blend, is_blit;
        struct {
                unsigned rt;
                unsigned nr_samples;
                uint64_t bifrost_blend_desc;
        } blend;
        int fixed_sysval_ubo;
        struct panfrost_sysvals *fixed_sysval_layout;
        bool shaderdb;
        bool no_idvs;
        bool no_ubo_to_push;

        enum pipe_format rt_formats[8];
        uint8_t raw_fmt_mask;
        unsigned nr_cbufs;

        /* Used on Valhall.
         *
         * Bit mask of special desktop-only varyings (e.g. VARYING_SLOT_TEX0)
         * written by the previous stage (when compiling a fragment shader) or
         * written by this stage (when compiling a vertex shader). Bits are
         * slots from gl_varying_slot.
         *
         * For modern APIs (GLES or VK), this should be 0.
         */
        uint32_t fixed_varying_mask;

        union {
                struct {
                        bool static_rt_conv;
                        uint32_t rt_conv[8];
                } bifrost;
        };
};
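
/*
 * Hypothetical example of filling out compile inputs for an ordinary
 * (non-blend, non-blit) fragment shader with a single RGBA8 render target. The
 * field names are the ones above; the values are illustrative only, and
 * fixed_sysval_ubo = -1 is assumed here to mean "no fixed sysval UBO":
 *
 *      struct panfrost_compile_inputs inputs = {
 *              .gpu_id = gpu_id, // from the probed device
 *              .fixed_sysval_ubo = -1,
 *              .nr_cbufs = 1,
 *              .rt_formats = { PIPE_FORMAT_R8G8B8A8_UNORM },
 *              .fixed_varying_mask = 0, // modern APIs: no fixed varyings
 *      };
 */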

struct pan_shader_varying {
        gl_varying_slot location;
        enum pipe_format format;
};

struct bifrost_shader_blend_info {
        nir_alu_type type;
        uint32_t return_offset;

        /* mali_bifrost_register_file_format corresponding to nir_alu_type */
        unsigned format;
};

/*
 * Unpacked form of a v7 message preload descriptor, produced by the compiler's
 * message preload optimization. By splitting out this struct, the compiler does
 * not need to know about data structure packing, avoiding a dependency on
 * GenXML.
 */
struct bifrost_message_preload {
        /* Whether to preload this message */
        bool enabled;

        /* Varying to load from */
        unsigned varying_index;

        /* Register type: FP16 if set, FP32 otherwise */
        bool fp16;

        /* Number of components, ignored if texturing */
        unsigned num_components;

        /* If texture is set, performs a texture instruction according to
         * texture_index, skip, and zero_lod. If texture is unset, only the
         * varying load is performed.
         */
        bool texture, skip, zero_lod;
        unsigned texture_index;
};

struct bifrost_shader_info {
        struct bifrost_shader_blend_info blend[8];
        nir_alu_type blend_src1_type;
        bool wait_6, wait_7;
        struct bifrost_message_preload messages[2];

        /* Whether any flat varyings are loaded. This may disable optimizations
         * that change the provoking vertex, since that would load incorrect
         * values for flat varyings.
         */
        bool uses_flat_shading;
};

struct midgard_shader_info {
        unsigned first_tag;
};

struct pan_shader_info {
        gl_shader_stage stage;
        unsigned work_reg_count;
        unsigned tls_size;
        unsigned wls_size;

        /* Bit mask of preloaded registers */
        uint64_t preload;

        union {
                struct {
                        bool reads_frag_coord;
                        bool reads_point_coord;
                        bool reads_face;
                        bool can_discard;
                        bool writes_depth;
                        bool writes_stencil;
                        bool writes_coverage;
                        bool sidefx;
                        bool sample_shading;
                        bool early_fragment_tests;
                        bool can_early_z, can_fpk;
                        BITSET_WORD outputs_read;
                        BITSET_WORD outputs_written;
                } fs;

                struct {
                        bool writes_point_size;

                        /* If the primary shader writes point size, the Valhall
                         * driver may need a variant that does not write point
                         * size. Offset to such a shader in the program binary.
                         *
                         * Zero if no such variant is required.
                         *
                         * Only used with IDVS on Valhall.
                         */
                        unsigned no_psiz_offset;

                        /* Set if Index-Driven Vertex Shading is in use */
                        bool idvs;

                        /* If IDVS is used, whether a varying shader is used */
                        bool secondary_enable;

                        /* If a varying shader is used, the varying shader's
                         * offset in the program binary
                         */
                        unsigned secondary_offset;

                        /* If IDVS is in use, number of work registers used by
                         * the varying shader
                         */
                        unsigned secondary_work_reg_count;

                        /* If IDVS is in use, bit mask of preloaded registers
                         * used by the varying shader
                         */
                        uint64_t secondary_preload;
                } vs;

                struct {
                        /* Is it legal to merge workgroups? This is true if the
                         * shader uses neither barriers nor shared memory.
                         *
                         * Used by the Valhall hardware.
                         */
                        bool allow_merging_workgroups;
                } cs;
        };

        /* Does the shader contain a barrier? Or (for fragment shaders) does it
         * require helper invocations, which demand the same ordering guarantees
         * from the hardware? These notions are unified in the hardware, so we
         * unify them here as well.
         */
        bool contains_barrier;
        bool separable;
        bool writes_global;
        uint64_t outputs_written;

        unsigned sampler_count;
        unsigned texture_count;
        unsigned ubo_count;
        unsigned attributes_read_count;
        unsigned attribute_count;
        unsigned attributes_read;

        struct {
                unsigned input_count;
                struct pan_shader_varying input[PAN_MAX_VARYINGS];
                unsigned output_count;
                struct pan_shader_varying output[PAN_MAX_VARYINGS];
        } varyings;

        struct panfrost_sysvals sysvals;

        /* UBOs to push to Register Mapped Uniforms (Midgard) or Fast Access
         * Uniforms (Bifrost) */
        struct panfrost_ubo_push push;

        uint32_t ubo_mask;

        union {
                struct bifrost_shader_info bifrost;
                struct midgard_shader_info midgard;
        };
};

typedef struct pan_block {
        /* Link to next block. Must be first for mir_get_block */
        struct list_head link;

        /* List of instructions emitted for the current block */
        struct list_head instructions;

        /* Index of the block in source order */
        unsigned name;

        /* Control flow graph */
        struct pan_block *successors[2];
        struct set *predecessors;
        bool unconditional_jumps;

        /* In liveness analysis, these are per-component live masks for the
         * indices used in the block. Scalar compilers have the luxury of using
         * simple bit fields, but for us, liveness is a vector idea. */
        uint16_t *live_in;
        uint16_t *live_out;
} pan_block;

struct pan_instruction {
        struct list_head link;
};

#define pan_foreach_instr_in_block_rev(block, v) \
        list_for_each_entry_rev(struct pan_instruction, v, &block->instructions, link)

#define pan_foreach_successor(blk, v) \
        pan_block *v; \
        pan_block **_v; \
        for (_v = (pan_block **) &blk->successors[0], \
                v = *_v; \
                v != NULL && _v < (pan_block **) &blk->successors[2]; \
                _v++, v = *_v) \

#define pan_foreach_predecessor(blk, v) \
        struct set_entry *_entry_##v; \
        struct pan_block *v; \
        for (_entry_##v = _mesa_set_next_entry(blk->predecessors, NULL), \
                v = (struct pan_block *) (_entry_##v ? _entry_##v->key : NULL);  \
                _entry_##v != NULL; \
                _entry_##v = _mesa_set_next_entry(blk->predecessors, _entry_##v), \
                v = (struct pan_block *) (_entry_##v ? _entry_##v->key : NULL))
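
/*
 * Example use of the CFG iterators above (illustrative only):
 *
 *      pan_foreach_successor(blk, succ) {
 *              // succ is a pan_block *, visited for each non-NULL successor
 *      }
 *
 *      pan_foreach_predecessor(blk, pred) {
 *              // pred is a struct pan_block *, one per entry in blk->predecessors
 *      }
 *
 * Note the macros declare their iteration variables, so each expansion should
 * live in its own scope (or use distinct variable names).
 */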

static inline pan_block *
pan_exit_block(struct list_head *blocks)
{
        pan_block *last = list_last_entry(blocks, pan_block, link);
        assert(!last->successors[0] && !last->successors[1]);
        return last;
}

typedef void (*pan_liveness_update)(uint16_t *, void *, unsigned max);

void pan_liveness_gen(uint16_t *live, unsigned node, unsigned max, uint16_t mask);
void pan_liveness_kill(uint16_t *live, unsigned node, unsigned max, uint16_t mask);
bool pan_liveness_get(uint16_t *live, unsigned node, uint16_t max);

void pan_compute_liveness(struct list_head *blocks,
                unsigned temp_count,
                pan_liveness_update callback);

void pan_free_liveness(struct list_head *blocks);
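
/*
 * Sketch of a pan_liveness_update callback for a hypothetical backend
 * instruction with one destination and two sources (real backends implement
 * this against their own IR, with per-component masks):
 *
 *      static void
 *      my_liveness_update(uint16_t *live, void *_ins, unsigned max)
 *      {
 *              struct my_ins *ins = _ins;
 *
 *              // Kill the destination, then mark the sources as live.
 *              pan_liveness_kill(live, ins->dest, max, ins->write_mask);
 *              for (unsigned i = 0; i < 2; ++i)
 *                      pan_liveness_gen(live, ins->src[i], max, ins->read_mask[i]);
 *      }
 *
 * which would then be passed as the callback to pan_compute_liveness().
 */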

uint16_t
pan_to_bytemask(unsigned bytes, unsigned mask);

void pan_block_add_successor(pan_block *block, pan_block *successor);

/* IR indexing */
#define PAN_IS_REG (1)

static inline unsigned
pan_ssa_index(nir_ssa_def *ssa)
{
        /* Off-by-one ensures BIR_NO_ARG is skipped */
        return ((ssa->index + 1) << 1) | 0;
}

static inline unsigned
pan_src_index(nir_src *src)
{
        if (src->is_ssa)
                return pan_ssa_index(src->ssa);
        else {
                assert(!src->reg.indirect);
                return (src->reg.reg->index << 1) | PAN_IS_REG;
        }
}

static inline unsigned
pan_dest_index(nir_dest *dst)
{
        if (dst->is_ssa)
                return pan_ssa_index(&dst->ssa);
        else {
                assert(!dst->reg.indirect);
                return (dst->reg.reg->index << 1) | PAN_IS_REG;
        }
}
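
/*
 * Decoding the packed indices above (illustrative): bit 0 distinguishes NIR
 * registers from SSA values, and the remaining bits hold the index (off by one
 * for SSA values).
 *
 *      bool is_reg = idx & PAN_IS_REG;
 *      unsigned num = idx >> 1;
 *      // For SSA values, num == ssa->index + 1; for registers, num == reg->index.
 */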

/* IR printing helpers */
void pan_print_alu_type(nir_alu_type t, FILE *fp);

/* Until it can be upstreamed.. */
bool pan_has_source_mod(nir_alu_src *src, nir_op op);
bool pan_has_dest_mod(nir_dest **dest, nir_op op);

/* NIR passes to do some backend-specific lowering */

#define PAN_WRITEOUT_C 1
#define PAN_WRITEOUT_Z 2
#define PAN_WRITEOUT_S 4
#define PAN_WRITEOUT_2 8

bool pan_nir_lower_zs_store(nir_shader *nir);

bool pan_nir_lower_64bit_intrin(nir_shader *shader);

bool pan_lower_helper_invocation(nir_shader *shader);
bool pan_lower_sample_pos(nir_shader *shader);
bool pan_lower_xfb(nir_shader *nir);

/*
 * Helper returning the subgroup size. Generally, this is equal to the number of
 * threads in a warp. For Midgard (including warping models), this returns 1, as
 * subgroups are not supported.
 */
static inline unsigned
pan_subgroup_size(unsigned arch)
{
        if (arch >= 9)
                return 16;
        else if (arch >= 7)
                return 8;
        else if (arch >= 6)
                return 4;
        else
                return 1;
}

#endif