1/*
2 * Copyright (C) 2021 Collabora, Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 */
24
25#include <stdio.h>
26#include "pan_bo.h"
27#include "pan_shader.h"
28#include "pan_scoreboard.h"
29#include "pan_encoder.h"
30#include "pan_indirect_draw.h"
31#include "pan_pool.h"
32#include "pan_util.h"
33#include "compiler/nir/nir_builder.h"
34#include "util/u_memory.h"
35#include "util/macros.h"
36
/* Byte offset of 32-bit word x within a descriptor. */
#define WORD(x) ((x) * 4)

/* Structured control-flow helpers operating on the nir_builder `b` in
 * scope. LOOP opens a NIR loop that is popped when the for-scope ends;
 * BREAK/CONTINUE emit the matching jump instructions.
 */
#define LOOP \
        for (nir_loop *l = nir_push_loop(b); l != NULL; \
             nir_pop_loop(b, l), l = NULL)
#define BREAK nir_jump(b, nir_jump_break)
#define CONTINUE nir_jump(b, nir_jump_continue)

/* IF/ELSE/ENDIF wrap nir_push_if()/nir_push_else()/nir_pop_if() and are
 * used with braces: IF (cond) { ... } ELSE { ... } ENDIF
 */
#define IF(cond) nir_push_if(b, cond);
#define ELSE nir_push_else(b, NULL);
#define ENDIF nir_pop_if(b, NULL);

/* NOTE(review): not referenced in this part of the file — presumably the
 * number of compute jobs used to parallelize the min/max index search;
 * confirm against the rest of the file. */
#define MIN_MAX_JOBS 128
50
/* SSA values describing the draw currently being processed. Loaded from
 * the FAU-backed inputs (and the indirect draw buffer) by the generated
 * shader. */
struct draw_data {
        nir_ssa_def *draw_buf;          /* indirect draw record array pointer */
        nir_ssa_def *draw_buf_stride;   /* stride between draw records */
        nir_ssa_def *index_buf;         /* index buffer pointer (indexed draws only) */
        nir_ssa_def *restart_index;     /* primitive restart index, when enabled */
        nir_ssa_def *vertex_count;
        nir_ssa_def *start_instance;
        nir_ssa_def *instance_count;
        nir_ssa_def *vertex_start;
        nir_ssa_def *index_bias;
        nir_ssa_def *draw_ctx;          /* struct indirect_draw_context pointer */
        nir_ssa_def *min_max_ctx;       /* struct min_max_context pointer */
};
64
/* Per-instance vertex count in its three forms: the raw count, the count
 * padded to a HW-friendly value, and the packed exponent/mantissa
 * encoding written into descriptors. */
struct instance_size {
        nir_ssa_def *raw;
        nir_ssa_def *padded;
        nir_ssa_def *packed;
};
70
/* SSA values used to patch the vertex/tiler job descriptors. */
struct jobs_data {
        nir_ssa_def *vertex_job;        /* vertex job descriptor pointer */
        nir_ssa_def *tiler_job;         /* tiler job descriptor pointer */
        nir_ssa_def *base_vertex_offset;
        nir_ssa_def *first_vertex_sysval;
        nir_ssa_def *base_vertex_sysval;
        nir_ssa_def *base_instance_sysval;
        nir_ssa_def *offset_start;
        nir_ssa_def *invocation;        /* packed invocation words, see get_invocation() */
};
81
/* SSA values used to patch the varying buffer descriptors. */
struct varyings_data {
        nir_ssa_def *varying_bufs;      /* varying buffer descriptor array pointer */
        nir_ssa_def *pos_ptr;           /* position varying allocation */
        nir_ssa_def *psiz_ptr;          /* point-size varying allocation */
        /* Local shader variable tracking the current top of the varying heap. */
        nir_variable *mem_ptr;
};
88
/* SSA values used to patch the vertex attribute descriptors. */
struct attribs_data {
        nir_ssa_def *attrib_count;      /* number of attribute descriptors */
        nir_ssa_def *attrib_bufs;       /* attribute buffer descriptor array pointer */
        nir_ssa_def *attribs;           /* attribute descriptor array pointer */
};
94
/* Context used while building one indirect-draw patching shader (or one
 * min/max index search shader). */
struct indirect_draw_shader_builder {
        nir_builder b;
        const struct panfrost_device *dev;
        unsigned flags;                 /* mask of PAN_INDIRECT_DRAW_* flags */
        bool index_min_max_search;      /* true: building the min/max search variant */
        unsigned index_size;            /* index size in bytes, 0 for non-indexed */
        struct draw_data draw;
        struct instance_size instance_size;
        struct jobs_data jobs;
        struct varyings_data varyings;
        struct attribs_data attribs;
};
107
108/* Describes an indirect draw (see glDrawArraysIndirect()) */
109
struct indirect_draw_info {
        uint32_t count;                 /* vertices per instance */
        uint32_t instance_count;        /* number of instances */
        uint32_t start;                 /* first vertex */
        uint32_t start_instance;        /* first instance */
};
116
/* Describes an indirect indexed draw (see glDrawElementsIndirect()) */

struct indirect_indexed_draw_info {
        uint32_t count;                 /* indices per instance */
        uint32_t instance_count;        /* number of instances */
        uint32_t start;                 /* first index */
        int32_t index_bias;             /* signed value added to each index */
        uint32_t start_instance;        /* first instance */
};
124
125/* Store the min/max index in a separate context. This is not supported yet, but
126 * the DDK seems to put all min/max search jobs at the beginning of the job chain
127 * when multiple indirect draws are issued to avoid the serialization caused by
128 * the draw patching jobs which have the suppress_prefetch flag set. Merging the
129 * min/max and draw contexts would prevent such optimizations (draw contexts are
130 * shared by all indirect draw in a batch).
131 */
132
struct min_max_context {
        uint32_t min;   /* running minimum index, updated with atomic umin */
        uint32_t max;   /* running maximum index, updated with atomic umax */
};
137
138/* Per-batch context shared by all indirect draws queued to a given batch. */
139
struct indirect_draw_context {
        /* Pointer to the top of the varying heap. Read when the patching
         * shader starts and written back once all varying buffers have
         * been allocated. */
        mali_ptr varying_mem;
};
144
145/* Indirect draw shader inputs. Those are stored in FAU. */
146
/* PACKED: field offsets are consumed via offsetof() by get_input_field(),
 * so the C layout must match the buffer laid out by the CPU side. */
struct indirect_draw_inputs {
        /* indirect_draw_context pointer */
        mali_ptr draw_ctx;

        /* min_max_context pointer */
        mali_ptr min_max_ctx;

        /* Pointer to an array of indirect_draw_info objects */
        mali_ptr draw_buf;

        /* Pointer to an uint32_t containing the number of draws to issue */
        mali_ptr draw_count_ptr;

        /* index buffer */
        mali_ptr index_buf;

        /* {base,first}_{vertex,instance} sysvals */
        mali_ptr first_vertex_sysval;
        mali_ptr base_vertex_sysval;
        mali_ptr base_instance_sysval;

        /* Pointers to various cmdstream structs that need to be patched */
        mali_ptr vertex_job;
        mali_ptr tiler_job;
        mali_ptr attrib_bufs;
        mali_ptr attribs;
        mali_ptr varying_bufs;

        /* Immediate values */
        uint32_t draw_count;
        uint32_t draw_buf_stride;
        uint32_t restart_index;
        uint32_t attrib_count;
} PACKED;
179
/* Load one indirect_draw_inputs field as a scalar push constant. The bit
 * size is derived from the C field type, the offset from the struct
 * layout. */
#define get_input_field(b, name) \
        nir_load_push_constant(b, \
               1, sizeof(((struct indirect_draw_inputs *)0)->name) * 8, \
               nir_imm_int(b, 0), \
               .base = offsetof(struct indirect_draw_inputs, name))
185
186static nir_ssa_def *
187get_address(nir_builder *b, nir_ssa_def *base, nir_ssa_def *offset)
188{
189        return nir_iadd(b, base, nir_u2u64(b, offset));
190}
191
192static nir_ssa_def *
193get_address_imm(nir_builder *b, nir_ssa_def *base, unsigned offset)
194{
195        return get_address(b, base, nir_imm_int(b, offset));
196}
197
/* Load ncomps components of bit_size bits each from global memory at
 * addr, with 4-byte alignment. */
static nir_ssa_def *
load_global(nir_builder *b, nir_ssa_def *addr, unsigned ncomps, unsigned bit_size)
{
        return nir_load_global(b, addr, 4, ncomps, bit_size);
}
203
/* Store an ncomps-component value to global memory at addr, with 4-byte
 * alignment. The writemask covers all ncomps components. */
static void
store_global(nir_builder *b, nir_ssa_def *addr,
             nir_ssa_def *value, unsigned ncomps)
{
        nir_store_global(b, addr, 4, value, (1 << ncomps) - 1);
}
210
211static nir_ssa_def *
212get_draw_ctx_data(struct indirect_draw_shader_builder *builder,
213                  unsigned offset, unsigned size)
214{
215        nir_builder *b = &builder->b;
216        return load_global(b,
217                           get_address_imm(b, builder->draw.draw_ctx, offset),
218                           1, size);
219}
220
/* Store a scalar value at `offset` inside the draw context.
 * NOTE(review): the `size` parameter is unused — the store always writes
 * one component whose bit size comes from `value` itself; presumably kept
 * for symmetry with get_draw_ctx_data() / the set_draw_ctx_field() macro. */
static void
set_draw_ctx_data(struct indirect_draw_shader_builder *builder,
                  unsigned offset, nir_ssa_def *value, unsigned size)
{
        nir_builder *b = &builder->b;
        store_global(b,
                     get_address_imm(b, builder->draw.draw_ctx, offset),
                     value, 1);
}
230
/* Typed accessors for indirect_draw_context fields: byte offset and bit
 * size are derived from the struct definition. */
#define get_draw_ctx_field(builder, name) \
        get_draw_ctx_data(builder, \
                          offsetof(struct indirect_draw_context, name), \
                          sizeof(((struct indirect_draw_context *)0)->name) * 8)

#define set_draw_ctx_field(builder, name, val) \
        set_draw_ctx_data(builder, \
                          offsetof(struct indirect_draw_context, name), \
                          val, \
                          sizeof(((struct indirect_draw_context *)0)->name) * 8)
241
242static nir_ssa_def *
243get_min_max_ctx_data(struct indirect_draw_shader_builder *builder,
244                     unsigned offset, unsigned size)
245{
246        nir_builder *b = &builder->b;
247        return load_global(b,
248                           get_address_imm(b, builder->draw.min_max_ctx, offset),
249                           1, size);
250}
251
/* Typed accessor for min_max_context fields: byte offset and bit size are
 * derived from the struct definition. */
#define get_min_max_ctx_field(builder, name) \
        get_min_max_ctx_data(builder, \
                             offsetof(struct min_max_context, name), \
                             sizeof(((struct min_max_context *)0)->name) * 8)
256
257static void
258update_min(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
259{
260        nir_builder *b = &builder->b;
261        nir_ssa_def *addr =
262                get_address_imm(b,
263                                builder->draw.min_max_ctx,
264                                offsetof(struct min_max_context, min));
265        nir_global_atomic_umin(b, 32, addr, val);
266}
267
268static void
269update_max(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
270{
271        nir_builder *b = &builder->b;
272        nir_ssa_def *addr =
273                get_address_imm(b,
274                                builder->draw.min_max_ctx,
275                                offsetof(struct min_max_context, max));
276        nir_global_atomic_umax(b, 32, addr, val);
277}
278
/* Load one field of the indirect_draw_info (resp.
 * indirect_indexed_draw_info) record pointed to by draw_ptr; offset and
 * bit size are derived from the struct definition. */
#define get_draw_field(b, draw_ptr, field) \
        load_global(b, \
                    get_address_imm(b, draw_ptr, \
                                    offsetof(struct indirect_draw_info, field)), \
                    1, sizeof(((struct indirect_draw_info *)0)->field) * 8)

#define get_indexed_draw_field(b, draw_ptr, field) \
        load_global(b, \
                    get_address_imm(b, draw_ptr, \
                                    offsetof(struct indirect_indexed_draw_info, field)), \
                    1, sizeof(((struct indirect_indexed_draw_info *)0)->field) * 8)
290
/* Load the shader inputs (FAU-backed push constants) into the builder.
 * Both shader variants need the draw context/buffer pointers; indexed
 * draws additionally need the index buffer, min/max context and (when
 * primitive restart is on) the restart index. The min/max search variant
 * stops there; the draw patching variant also loads the job, attribute
 * and varying pointers and seeds the varying heap pointer variable. */
static void
extract_inputs(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;

        builder->draw.draw_ctx = get_input_field(b, draw_ctx);
        builder->draw.draw_buf = get_input_field(b, draw_buf);
        builder->draw.draw_buf_stride = get_input_field(b, draw_buf_stride);

        if (builder->index_size) {
                builder->draw.index_buf = get_input_field(b, index_buf);
                builder->draw.min_max_ctx = get_input_field(b, min_max_ctx);
                if (builder->flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) {
                        builder->draw.restart_index =
                                get_input_field(b, restart_index);
                }
        }

        /* The min/max search shader doesn't patch any descriptor. */
        if (builder->index_min_max_search)
                return;

        builder->jobs.first_vertex_sysval = get_input_field(b, first_vertex_sysval);
        builder->jobs.base_vertex_sysval = get_input_field(b, base_vertex_sysval);
        builder->jobs.base_instance_sysval = get_input_field(b, base_instance_sysval);
        builder->jobs.vertex_job = get_input_field(b, vertex_job);
        builder->jobs.tiler_job = get_input_field(b, tiler_job);
        builder->attribs.attrib_bufs = get_input_field(b, attrib_bufs);
        builder->attribs.attribs = get_input_field(b, attribs);
        builder->attribs.attrib_count = get_input_field(b, attrib_count);
        builder->varyings.varying_bufs = get_input_field(b, varying_bufs);
        /* var_mem_ptr tracks the varying heap top across the
         * update_varying_buf() calls; start it at the context's value. */
        builder->varyings.mem_ptr =
                nir_local_variable_create(b->impl,
                                          glsl_uint64_t_type(),
                                          "var_mem_ptr");
        /* NOTE(review): writemask 3 looks over-wide for a 1-component
         * 64-bit variable — presumably trimmed by nir_store_var; confirm. */
        nir_store_var(b, builder->varyings.mem_ptr,
                      get_draw_ctx_field(builder, varying_mem), 3);
}
328
/* Initialize the builder for one shader variant and load its inputs.
 *
 * flags: mask of PAN_INDIRECT_DRAW_* flags.
 * index_size: index size in bytes (0 for non-indexed draws).
 * index_min_max_search: build the min/max index search shader instead of
 * the draw patching shader. The shader name encodes the variant so cached
 * shaders are distinguishable. */
static void
init_shader_builder(struct indirect_draw_shader_builder *builder,
                    const struct panfrost_device *dev,
                    unsigned flags, unsigned index_size,
                    bool index_min_max_search)
{
        memset(builder, 0, sizeof(*builder));
        builder->dev = dev;
        builder->flags = flags;
        builder->index_size = index_size;

        builder->index_min_max_search = index_min_max_search;

        if (index_min_max_search) {
                builder->b =
                        nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                       GENX(pan_shader_get_compiler_options)(),
                                                       "indirect_draw_min_max_index(index_size=%d)",
                                                       builder->index_size);
        } else {
                builder->b =
                        nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                       GENX(pan_shader_get_compiler_options)(),
                                                       "indirect_draw(index_size=%d%s%s%s%s)",
                                                       builder->index_size,
                                                       flags & PAN_INDIRECT_DRAW_HAS_PSIZ ?
                                                       ",psiz" : "",
                                                       flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART ?
                                                       ",primitive_restart" : "",
                                                       flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE ?
                                                       ",update_primitive_size" : "",
                                                       flags & PAN_INDIRECT_DRAW_IDVS ?
                                                       ",idvs" : "");
        }

        extract_inputs(builder);
}
366
/* Patch the DRAW section of a job descriptor at draw_offset:
 * the low 16 bits of word 0 are preserved, its high 16 bits receive the
 * packed instance size (forced to 0 when there is at most one instance),
 * and word 1 is replaced with the computed offset_start. */
static void
update_dcd(struct indirect_draw_shader_builder *builder,
           nir_ssa_def *job_ptr,
           unsigned draw_offset)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *draw_w01 =
                load_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)), 2, 32);
        nir_ssa_def *draw_w0 = nir_channel(b, draw_w01, 0);

        /* Update DRAW.{instance_size,offset_start} */
        nir_ssa_def *instance_size =
                nir_bcsel(b,
                          nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2)),
                          nir_imm_int(b, 0), builder->instance_size.packed);
        draw_w01 = nir_vec2(b,
                            nir_ior(b, nir_iand_imm(b, draw_w0, 0xffff),
                                    nir_ishl(b, instance_size, nir_imm_int(b, 16))),
                            builder->jobs.offset_start);
        store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)),
                     draw_w01, 2);
}
389
/* Patch a vertex or tiler job descriptor once the vertex/instance counts
 * are known. Both job types get their invocation words refreshed; tiler
 * jobs additionally get PRIMITIVE (base_vertex_offset, index count, index
 * buffer pointer rebased by vertex_start), PRIMITIVE_SIZE (when point
 * size is a varying that must be patched) and DRAW.position updated. With
 * IDVS, the vertex DRAW section embedded in the combined job is patched
 * as well. */
static void
update_job(struct indirect_draw_shader_builder *builder, enum mali_job_type type)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *job_ptr =
                type == MALI_JOB_TYPE_VERTEX ?
                builder->jobs.vertex_job : builder->jobs.tiler_job;

        /* Update the invocation words. */
        store_global(b, get_address_imm(b, job_ptr, WORD(8)),
                     builder->jobs.invocation, 2);

        unsigned draw_offset =
                type == MALI_JOB_TYPE_VERTEX ?
                pan_section_offset(COMPUTE_JOB, DRAW) :
                pan_section_offset(TILER_JOB, DRAW);
        unsigned prim_offset = pan_section_offset(TILER_JOB, PRIMITIVE);
        unsigned psiz_offset = pan_section_offset(TILER_JOB, PRIMITIVE_SIZE);
        unsigned index_size = builder->index_size;

        if (type == MALI_JOB_TYPE_TILER) {
                /* Update PRIMITIVE.{base_vertex_offset,count} */
                store_global(b,
                             get_address_imm(b, job_ptr, prim_offset + WORD(1)),
                             builder->jobs.base_vertex_offset, 1);
                /* The count field stores count - 1. */
                store_global(b,
                             get_address_imm(b, job_ptr, prim_offset + WORD(3)),
                             nir_iadd_imm(b, builder->draw.vertex_count, -1), 1);

                if (index_size) {
                        /* Advance the index buffer pointer past the first
                         * vertex_start indices. */
                        nir_ssa_def *addr =
                                get_address_imm(b, job_ptr, prim_offset + WORD(4));
                        nir_ssa_def *indices = load_global(b, addr, 1, 64);
                        nir_ssa_def *offset =
                                nir_imul_imm(b, builder->draw.vertex_start, index_size);

                        indices = get_address(b, indices, offset);
                        store_global(b, addr, indices, 2);
                }

                /* Update PRIMITIVE_SIZE.size_array */
                if ((builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) &&
                    (builder->flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE)) {
                        store_global(b,
                                     get_address_imm(b, job_ptr, psiz_offset + WORD(0)),
                                     builder->varyings.psiz_ptr, 2);
                }

                /* Update DRAW.position */
                store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(4)),
                             builder->varyings.pos_ptr, 2);
        }

        update_dcd(builder, job_ptr, draw_offset);

        if (builder->flags & PAN_INDIRECT_DRAW_IDVS) {
                assert(type == MALI_JOB_TYPE_TILER);

                update_dcd(builder, job_ptr,
                           pan_section_offset(INDEXED_VERTEX_JOB, VERTEX_DRAW));
        }
}
452
/* Split an integer divisor into the fields expected by Mali NPOT instance
 * divisor descriptors: *r_e receives the shift amount r (MSB position of
 * div) OR'd with an "extra" flag (bit 5) set when the fixed-point
 * remainder exceeds div/2, and *d receives the 31-bit mantissa of the
 * rounded fixed-point reciprocal f = (2^(32+r) + div/2) / div. */
static void
split_div(nir_builder *b, nir_ssa_def *div, nir_ssa_def **r_e, nir_ssa_def **d)
{
        /* TODO: Lower this 64bit div to something GPU-friendly */
        nir_ssa_def *r = nir_imax(b, nir_ufind_msb(b, div), nir_imm_int(b, 0));
        nir_ssa_def *div64 = nir_u2u64(b, div);
        nir_ssa_def *half_div64 = nir_u2u64(b, nir_ushr_imm(b, div, 1));
        nir_ssa_def *f0 = nir_iadd(b,
                                   nir_ishl(b, nir_imm_int64(b, 1),
                                            nir_iadd_imm(b, r, 32)),
                                   half_div64);
        nir_ssa_def *fi = nir_idiv(b, f0, div64);
        nir_ssa_def *ff = nir_isub(b, f0, nir_imul(b, fi, div64));
        nir_ssa_def *e = nir_bcsel(b, nir_ult(b, half_div64, ff),
                                   nir_imm_int(b, 1 << 5), nir_imm_int(b, 0));
        *d = nir_iand_imm(b, nir_u2u32(b, fi), ~(1 << 31));
        *r_e = nir_ior(b, r, e);
}
471
472static void
473update_vertex_attrib_buf(struct indirect_draw_shader_builder *builder,
474                         nir_ssa_def *attrib_buf_ptr,
475                         enum mali_attribute_type type,
476                         nir_ssa_def *div1,
477                         nir_ssa_def *div2)
478{
479        nir_builder *b = &builder->b;
480        unsigned type_mask = BITFIELD_MASK(6);
481        nir_ssa_def *w01 = load_global(b, attrib_buf_ptr, 2, 32);
482        nir_ssa_def *w0 = nir_channel(b, w01, 0);
483        nir_ssa_def *w1 = nir_channel(b, w01, 1);
484
485        /* Word 0 and 1 of the attribute descriptor contain the type,
486         * pointer and the the divisor exponent.
487         */
488        w0 = nir_iand_imm(b, nir_channel(b, w01, 0), ~type_mask);
489        w0 = nir_ior(b, w0, nir_imm_int(b, type));
490        w1 = nir_ior(b, w1, nir_ishl(b, div1, nir_imm_int(b, 24)));
491
492        store_global(b, attrib_buf_ptr, nir_vec2(b, w0, w1), 2);
493
494        if (type == MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR) {
495                /* If the divisor is not a power of two, the divisor numerator
496                 * is passed in word 1 of the continuation attribute (word 5
497                 * if we consider the attribute and its continuation as a
498                 * single attribute).
499                 */
500                assert(div2);
501                store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(5)),
502                             div2, 1);
503        }
504}
505
506static void
507zero_attrib_buf_stride(struct indirect_draw_shader_builder *builder,
508                       nir_ssa_def *attrib_buf_ptr)
509{
510        /* Stride is an unadorned 32-bit uint at word 2 */
511        nir_builder *b = &builder->b;
512        store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)),
513                        nir_imm_int(b, 0), 1);
514}
515
/* Adjust an instanced attribute's byte offset: add the data skipped by a
 * non-zero start_instance (stride * start_instance / instance_div), and
 * subtract stride * offset_start when offset_start is non-zero with
 * multiple instances (that offset is re-applied via DRAW.offset_start —
 * presumably; confirm against update_dcd()). */
static void
adjust_attrib_offset(struct indirect_draw_shader_builder *builder,
                     nir_ssa_def *attrib_ptr, nir_ssa_def *attrib_buf_ptr,
                     nir_ssa_def *instance_div)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *zero = nir_imm_int(b, 0);
        nir_ssa_def *two = nir_imm_int(b, 2);
        nir_ssa_def *sub_cur_offset =
                nir_iand(b, nir_ine(b, builder->jobs.offset_start, zero),
                         nir_uge(b, builder->draw.instance_count, two));

        nir_ssa_def *add_base_inst_offset =
                nir_iand(b, nir_ine(b, builder->draw.start_instance, zero),
                         nir_ine(b, instance_div, zero));

        IF (nir_ior(b, sub_cur_offset, add_base_inst_offset)) {
                nir_ssa_def *offset =
                        load_global(b, get_address_imm(b, attrib_ptr, WORD(1)), 1, 32);
                nir_ssa_def *stride =
                        load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)), 1, 32);

                /* Per-instance data needs to be offset in response to a
                 * delayed start in an indexed draw.
                 */

                IF (add_base_inst_offset) {
                        offset = nir_iadd(b, offset,
                                          nir_idiv(b,
                                                   nir_imul(b, stride,
                                                            builder->draw.start_instance),
                                                   instance_div));
                } ENDIF

                IF (sub_cur_offset) {
                        offset = nir_isub(b, offset,
                                          nir_imul(b, stride,
                                                   builder->jobs.offset_start));
                } ENDIF

                store_global(b, get_address_imm(b, attrib_ptr, WORD(1)),
                             offset, 1);
        } ENDIF
}
560
561/* x is power of two or zero <===> x has 0 (zero) or 1 (POT) bits set */
562
static nir_ssa_def *
nir_is_power_of_two_or_zero(nir_builder *b, nir_ssa_def *x)
{
        /* popcount(x) < 2  <=>  x == 0 || x is a power of two */
        return nir_ult(b, nir_bit_count(b, x), nir_imm_int(b, 2));
}
568
569/* Based on panfrost_emit_vertex_data() */
570
/* Walk every attribute descriptor and patch it for the now-known instance
 * count and padded instance size. On PAN_ARCH <= 5 the special vertex-ID
 * and instance-ID attributes have their divisor fields written directly;
 * regular instanced attributes are switched to the POT-divisor,
 * NPOT-divisor or modulus addressing mode and their offsets adjusted. */
static void
update_vertex_attribs(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_variable *attrib_idx_var =
                nir_local_variable_create(b->impl, glsl_uint_type(),
                                          "attrib_idx");
        nir_store_var(b, attrib_idx_var, nir_imm_int(b, 0), 1);

#if PAN_ARCH <= 5
        nir_ssa_def *single_instance =
                nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2));
#endif

        LOOP {
                nir_ssa_def *attrib_idx = nir_load_var(b, attrib_idx_var);
                IF (nir_uge(b, attrib_idx, builder->attribs.attrib_count))
                        BREAK;
                ENDIF

                /* Each attribute buffer slot is two descriptors wide (main +
                 * continuation). */
                nir_ssa_def *attrib_buf_ptr =
                         get_address(b, builder->attribs.attrib_bufs,
                                     nir_imul_imm(b, attrib_idx,
                                                  2 * pan_size(ATTRIBUTE_BUFFER)));
                nir_ssa_def *attrib_ptr =
                         get_address(b, builder->attribs.attribs,
                                     nir_imul_imm(b, attrib_idx,
                                                  pan_size(ATTRIBUTE)));

                nir_ssa_def *r_e, *d;

#if PAN_ARCH <= 5
                IF (nir_ieq_imm(b, attrib_idx, PAN_VERTEX_ID)) {
                        nir_ssa_def *r_p =
                                nir_bcsel(b, single_instance,
                                          nir_imm_int(b, 0x9f),
                                          builder->instance_size.packed);

                        store_global(b,
                                     get_address_imm(b, attrib_buf_ptr, WORD(4)),
                                     nir_ishl(b, r_p, nir_imm_int(b, 24)), 1);

                        nir_store_var(b, attrib_idx_var,
                                      nir_iadd_imm(b, attrib_idx, 1), 1);
                        CONTINUE;
                } ENDIF

                IF (nir_ieq_imm(b, attrib_idx, PAN_INSTANCE_ID)) {
                        split_div(b, builder->instance_size.padded,
                                  &r_e, &d);
                        nir_ssa_def *default_div =
                                nir_ior(b, single_instance,
                                        nir_ult(b,
                                                builder->instance_size.padded,
                                                nir_imm_int(b, 2)));
                        r_e = nir_bcsel(b, default_div,
                                        nir_imm_int(b, 0x3f), r_e);
                        d = nir_bcsel(b, default_div,
                                      nir_imm_int(b, (1u << 31) - 1), d);
                        store_global(b,
                                     get_address_imm(b, attrib_buf_ptr, WORD(1)),
                                     nir_vec2(b, nir_ishl(b, r_e, nir_imm_int(b, 24)), d),
                                     2);
                        nir_store_var(b, attrib_idx_var,
                                      nir_iadd_imm(b, attrib_idx, 1), 1);
                        CONTINUE;
                } ENDIF
#endif

                /* Divisor for this attribute, pre-scaled by the padded
                 * instance size; 0 means per-vertex data. */
                nir_ssa_def *instance_div =
                        load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(7)), 1, 32);

                nir_ssa_def *div = nir_imul(b, instance_div, builder->instance_size.padded);

                nir_ssa_def *multi_instance =
                        nir_uge(b, builder->draw.instance_count, nir_imm_int(b, 2));

                IF (nir_ine(b, div, nir_imm_int(b, 0))) {
                        IF (multi_instance) {
                                IF (nir_is_power_of_two_or_zero(b, div)) {
                                        nir_ssa_def *exp =
                                                nir_imax(b, nir_ufind_msb(b, div),
                                                         nir_imm_int(b, 0));
                                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                                 MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR,
                                                                 exp, NULL);
                                } ELSE {
                                        split_div(b, div, &r_e, &d);
                                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                                 MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR,
                                                                 r_e, d);
                                } ENDIF
                        } ELSE {
                                /* Single instance with a non-0 divisor: all
                                 * accesses should point to attribute 0 */
                                zero_attrib_buf_stride(builder, attrib_buf_ptr);
                        } ENDIF

                        adjust_attrib_offset(builder, attrib_ptr, attrib_buf_ptr, instance_div);
                } ELSE IF (multi_instance) {
                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                        MALI_ATTRIBUTE_TYPE_1D_MODULUS,
                                        builder->instance_size.packed, NULL);
                } ENDIF ENDIF

                nir_store_var(b, attrib_idx_var, nir_iadd_imm(b, attrib_idx, 1), 1);
        }
}
679
/* Allocate stride * vertex_count bytes (rounded up to 64) from the
 * varying heap for one varying buffer, point the descriptor at the
 * allocation as a 1D attribute buffer, bump the heap pointer, and return
 * the allocation's base address. */
static nir_ssa_def *
update_varying_buf(struct indirect_draw_shader_builder *builder,
                   nir_ssa_def *varying_buf_ptr,
                   nir_ssa_def *vertex_count)
{
        nir_builder *b = &builder->b;

        nir_ssa_def *stride =
                load_global(b, get_address_imm(b, varying_buf_ptr, WORD(2)), 1, 32);
        nir_ssa_def *size = nir_imul(b, stride, vertex_count);
        nir_ssa_def *aligned_size =
                nir_iand_imm(b, nir_iadd_imm(b, size, 63), ~63);
        nir_ssa_def *var_mem_ptr =
                nir_load_var(b, builder->varyings.mem_ptr);
        /* Word 0 packs the low address bits with the buffer type. */
        nir_ssa_def *w0 =
                nir_ior(b, nir_unpack_64_2x32_split_x(b, var_mem_ptr),
                        nir_imm_int(b, MALI_ATTRIBUTE_TYPE_1D));
        nir_ssa_def *w1 = nir_unpack_64_2x32_split_y(b, var_mem_ptr);
        store_global(b, get_address_imm(b, varying_buf_ptr, WORD(0)),
                     nir_vec4(b, w0, w1, stride, size), 4);

        nir_store_var(b, builder->varyings.mem_ptr,
                      get_address(b, var_mem_ptr, aligned_size), 3);

        return var_mem_ptr;
}
706
707/* Based on panfrost_emit_varying_descriptor() */
708
/* Size and allocate the general, position and (when present) point-size
 * varying buffers for the total padded vertex count, then write the new
 * heap top back to the draw context so the next draw allocates after
 * this one. */
static void
update_varyings(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *vertex_count =
                nir_imul(b, builder->instance_size.padded,
                         builder->draw.instance_count);
        nir_ssa_def *buf_ptr =
                get_address_imm(b, builder->varyings.varying_bufs,
                                PAN_VARY_GENERAL *
                                pan_size(ATTRIBUTE_BUFFER));
        update_varying_buf(builder, buf_ptr, vertex_count);

        buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
                                  PAN_VARY_POSITION *
                                  pan_size(ATTRIBUTE_BUFFER));
        builder->varyings.pos_ptr =
                update_varying_buf(builder, buf_ptr, vertex_count);

        if (builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) {
                buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
                                          PAN_VARY_PSIZ *
                                          pan_size(ATTRIBUTE_BUFFER));
                builder->varyings.psiz_ptr =
                        update_varying_buf(builder, buf_ptr, vertex_count);
        }

        set_draw_ctx_field(builder, varying_mem,
                           nir_load_var(b, builder->varyings.mem_ptr));
}
739
740/* Based on panfrost_pack_work_groups_compute() */
741
/* Compute the packed invocation words for a vertex x instance job: each
 * dimension is stored as (count - 1), with the instance max shifted past
 * the vertex max by a split position derived from the vertex max's MSB
 * (32 when there is a single instance). The second word packs the split
 * and a fixed 2 << 28 term. */
static void
get_invocation(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *one = nir_imm_int(b, 1);
        nir_ssa_def *max_vertex =
                nir_usub_sat(b, builder->instance_size.raw, one);
        nir_ssa_def *max_instance =
                nir_usub_sat(b, builder->draw.instance_count, one);
        nir_ssa_def *split =
                nir_bcsel(b, nir_ieq_imm(b, max_instance, 0),
                          nir_imm_int(b, 32),
                          nir_iadd_imm(b, nir_ufind_msb(b, max_vertex), 1));

        builder->jobs.invocation =
                nir_vec2(b,
                         nir_ior(b, max_vertex,
                                 nir_ishl(b, max_instance, split)),
                         nir_ior(b, nir_ishl(b, split, nir_imm_int(b, 22)),
                                 nir_imm_int(b, 2 << 28)));
}
763
764static nir_ssa_def *
765nir_align_pot(nir_builder *b, nir_ssa_def *val, unsigned pot)
766{
767        assert(pot != 0 && util_is_power_of_two_or_zero(pot));
768
769        return nir_iand_imm(b, nir_iadd_imm(b, val, pot - 1), ~(pot - 1));
770}
771
772/* Based on panfrost_padded_vertex_count() */
773
static nir_ssa_def *
get_padded_count(nir_builder *b, nir_ssa_def *val, nir_ssa_def **packed)
{
        /* Round val up to the next value expressible as base << exp with a
         * small odd base, mirroring the CPU-side
         * panfrost_padded_vertex_count(). Returns the padded count and
         * stores the hardware encoding in *packed: exp in the low 5 bits,
         * (base >> 1) above them.
         */
        nir_ssa_def *one = nir_imm_int(b, 1);
        nir_ssa_def *zero = nir_imm_int(b, 0);
        nir_ssa_def *eleven = nir_imm_int(b, 11);
        nir_ssa_def *four = nir_imm_int(b, 4);

        /* Start with exp = msb(val) - 4 (saturated at 0) so base keeps at
         * most 5 significant bits. imax clamps ufind_msb's -1 for val == 0. */
        nir_ssa_def *exp =
                nir_usub_sat(b, nir_imax(b, nir_ufind_msb(b, val), zero), four);
        nir_ssa_def *base = nir_ushr(b, val, exp);

        /* Round base up if the right shift dropped any set bits. */
        base = nir_iadd(b, base,
                        nir_bcsel(b, nir_ine(b, val, nir_ishl(b, base, exp)), one, zero));

        /* Fold trailing zero bits of base into exp. */
        nir_ssa_def *rshift = nir_imax(b, nir_find_lsb(b, base), zero);
        exp = nir_iadd(b, exp, rshift);
        base = nir_ushr(b, base, rshift);
        /* Bases >= 11 are bumped to the next even value so the second
         * trailing-zero strip below brings them back into range.
         * NOTE(review): valid base range assumed to match
         * panfrost_padded_vertex_count() — confirm there. */
        base = nir_iadd(b, base, nir_bcsel(b, nir_uge(b, base, eleven), one, zero));
        rshift = nir_imax(b, nir_find_lsb(b, base), zero);
        exp = nir_iadd(b, exp, rshift);
        base = nir_ushr(b, base, rshift);

        *packed = nir_ior(b, exp,
                          nir_ishl(b, nir_ushr_imm(b, base, 1), nir_imm_int(b, 5)));
        return nir_ishl(b, base, exp);
}
801
802static void
803update_jobs(struct indirect_draw_shader_builder *builder)
804{
805        get_invocation(builder);
806
807        if (!(builder->flags & PAN_INDIRECT_DRAW_IDVS))
808                update_job(builder, MALI_JOB_TYPE_VERTEX);
809
810        update_job(builder, MALI_JOB_TYPE_TILER);
811}
812
813
814static void
815set_null_job(struct indirect_draw_shader_builder *builder,
816             nir_ssa_def *job_ptr)
817{
818        nir_builder *b = &builder->b;
819        nir_ssa_def *w4 = get_address_imm(b, job_ptr, WORD(4));
820        nir_ssa_def *val = load_global(b, w4, 1, 32);
821
822        /* Set job type to NULL (AKA NOOP) */
823        val = nir_ior(b, nir_iand_imm(b, val, 0xffffff01),
824                      nir_imm_int(b, MALI_JOB_TYPE_NULL << 1));
825        store_global(b, w4, val, 1);
826}
827
static void
get_instance_size(struct indirect_draw_shader_builder *builder)
{
        /* Derive the per-instance vertex range from the draw parameters.
         * Non-indexed draws use the vertex_start/vertex_count fields
         * directly; indexed draws use the min/max produced by the index
         * min/max search job, fixed up here for any unaligned head/tail of
         * the index buffer that the search job skipped.
         * Outputs: jobs.base_vertex_offset, jobs.offset_start and
         * instance_size.raw.
         */
        nir_builder *b = &builder->b;

        if (!builder->index_size) {
                builder->jobs.base_vertex_offset = nir_imm_int(b, 0);
                builder->jobs.offset_start = builder->draw.vertex_start;
                builder->instance_size.raw = builder->draw.vertex_count;
                return;
        }

        unsigned index_size = builder->index_size;
        nir_ssa_def *min = get_min_max_ctx_field(builder, min);
        nir_ssa_def *max = get_min_max_ctx_field(builder, max);

        /* We handle unaligned indices here to avoid the extra complexity in
         * the min/max search job.
         */
        if (builder->index_size < 4) {
                nir_variable *min_var =
                        nir_local_variable_create(b->impl, glsl_uint_type(), "min");
                nir_store_var(b, min_var, min, 1);
                nir_variable *max_var =
                        nir_local_variable_create(b->impl, glsl_uint_type(), "max");
                nir_store_var(b, max_var, max, 1);

                /* First index of the draw inside the index buffer. */
                nir_ssa_def *base =
                        get_address(b, builder->draw.index_buf,
                                    nir_imul_imm(b, builder->draw.vertex_start, index_size));
                /* Byte offset of the start within its 32-bit word. */
                nir_ssa_def *offset = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);
                nir_ssa_def *end =
                        nir_iadd(b, offset,
                                 nir_imul_imm(b, builder->draw.vertex_count, index_size));
                nir_ssa_def *aligned_end = nir_iand_imm(b, end, ~3);
                unsigned shift = index_size * 8;
                unsigned mask = (1 << shift) - 1;

                /* Word-align the base address; in-word offsets are handled
                 * by the masking below. */
                base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));

                /* Unaligned start offset, we need to ignore any data that's
                 * outside the requested range. We also handle ranges that are
                 * covering less than 2 words here.
                 */
                IF (nir_ior(b, nir_ine(b, offset, nir_imm_int(b, 0)), nir_ieq(b, aligned_end, nir_imm_int(b, 0)))) {
                        min = nir_load_var(b, min_var);
                        max = nir_load_var(b, max_var);

                        nir_ssa_def *val = load_global(b, base, 1, 32);
                        for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
                                /* Indices before offset or past end belong to
                                 * other draws; neutralize them so they can't
                                 * affect min (UINT32_MAX) or max (0). */
                                nir_ssa_def *oob =
                                        nir_ior(b,
                                                nir_ult(b, nir_imm_int(b, i), offset),
                                                nir_uge(b, nir_imm_int(b, i), end));
                                nir_ssa_def *data = nir_iand_imm(b, val, mask);

                                min = nir_umin(b, min,
                                               nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));
                                max = nir_umax(b, max,
                                               nir_bcsel(b, oob, nir_imm_int(b, 0), data));
                                val = nir_ushr_imm(b, val, shift);
                        }

                        nir_store_var(b, min_var, min, 1);
                        nir_store_var(b, max_var, max, 1);
                } ENDIF

                nir_ssa_def *remaining = nir_isub(b, end, aligned_end);

                /* The last word contains less than 4bytes of data, we need to
                 * discard anything falling outside the requested range.
                 */
                IF (nir_iand(b, nir_ine(b, end, aligned_end), nir_ine(b, aligned_end, nir_imm_int(b, 0)))) {
                        min = nir_load_var(b, min_var);
                        max = nir_load_var(b, max_var);

                        nir_ssa_def *val = load_global(b, get_address(b, base, aligned_end), 1, 32);
                        for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
                                nir_ssa_def *oob = nir_uge(b, nir_imm_int(b, i), remaining);
                                nir_ssa_def *data = nir_iand_imm(b, val, mask);

                                min = nir_umin(b, min,
                                               nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));
                                max = nir_umax(b, max,
                                               nir_bcsel(b, oob, nir_imm_int(b, 0), data));
                                val = nir_ushr_imm(b, val, shift);
                        }

                        nir_store_var(b, min_var, min, 1);
                        nir_store_var(b, max_var, max, 1);
                } ENDIF

                min = nir_load_var(b, min_var);
                max = nir_load_var(b, max_var);
        }

        /* Attributes are fetched relative to min, so shift them back. */
        builder->jobs.base_vertex_offset = nir_ineg(b, min);
        builder->jobs.offset_start = nir_iadd(b, min, builder->draw.index_bias);
        /* usub_sat keeps the size at 1 when max < min (empty search). */
        builder->instance_size.raw = nir_iadd_imm(b, nir_usub_sat(b, max, min), 1);
}
928
929/* Patch a draw sequence */
930
static void
patch(struct indirect_draw_shader_builder *builder)
{
        /* Main body of the indirect draw shader: read the draw parameters
         * from the indirect draw buffer and patch the pre-allocated
         * vertex/tiler jobs, attribute and varying descriptors
         * accordingly. Empty draws turn the jobs into NULL (no-op) jobs.
         */
        unsigned index_size = builder->index_size;
        nir_builder *b = &builder->b;

        nir_ssa_def *draw_ptr = builder->draw.draw_buf;

        /* Indexed and non-indexed indirect draw buffers have different
         * layouts (the indexed one carries an extra index_bias field). */
        if (index_size) {
                builder->draw.vertex_count = get_indexed_draw_field(b, draw_ptr, count);
                builder->draw.start_instance = get_indexed_draw_field(b, draw_ptr, start_instance);
                builder->draw.instance_count =
                        get_indexed_draw_field(b, draw_ptr, instance_count);
                builder->draw.vertex_start = get_indexed_draw_field(b, draw_ptr, start);
                builder->draw.index_bias = get_indexed_draw_field(b, draw_ptr, index_bias);
        } else {
                builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
                builder->draw.start_instance = get_draw_field(b, draw_ptr, start_instance);
                builder->draw.instance_count = get_draw_field(b, draw_ptr, instance_count);
                builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);
        }

        /* NOTE(review): num_components of a valid SSA def is always >= 1,
         * so this assert looks vacuous — possibly meant as a sanity check
         * that the field loads above produced a def at all. */
        assert(builder->draw.vertex_count->num_components);

        nir_ssa_def *num_vertices =
                nir_imul(b, builder->draw.vertex_count, builder->draw.instance_count);

        IF (nir_ieq(b, num_vertices, nir_imm_int(b, 0))) {
                /* If there's nothing to draw, turn the vertex/tiler jobs into
                 * null jobs.
                 */
                if (!(builder->flags & PAN_INDIRECT_DRAW_IDVS))
                        set_null_job(builder, builder->jobs.vertex_job);

                set_null_job(builder, builder->jobs.tiler_job);
        } ELSE {
                get_instance_size(builder);

                nir_ssa_def *count = builder->instance_size.raw;

                /* IDVS requires padding to a multiple of 4 */
                if (builder->flags & PAN_INDIRECT_DRAW_IDVS)
                        count = nir_align_pot(b, count, 4);

                builder->instance_size.padded =
                        get_padded_count(b, count,
                                         &builder->instance_size.packed);

                update_varyings(builder);
                update_jobs(builder);
                update_vertex_attribs(builder);

                /* Sysval addresses are NULL when the shaders don't use
                 * them; only patch the ones that exist. */
                IF (nir_ine(b, builder->jobs.first_vertex_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.first_vertex_sysval,
                                     builder->jobs.offset_start, 1);
                } ENDIF

                IF (nir_ine(b, builder->jobs.base_vertex_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.base_vertex_sysval,
                                     index_size ?
                                     builder->draw.index_bias :
                                     nir_imm_int(b, 0),
                                     1);
                } ENDIF

                IF (nir_ine(b, builder->jobs.base_instance_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.base_instance_sysval,
                                     builder->draw.start_instance, 1);
                } ENDIF
        } ENDIF
}
1002
1003/* Search the min/max index in the range covered by the indirect draw call */
1004
static void
get_index_min_max(struct indirect_draw_shader_builder *builder)
{
        /* Body of the min/max search shader: MIN_MAX_JOBS threads stride
         * through the 4-byte-aligned portion of the index range, each
         * accumulating a local min/max that is merged into the shared
         * min_max_context via update_min()/update_max(). Unaligned head
         * and tail bytes are handled later by get_instance_size().
         */
        nir_ssa_def *restart_index = builder->draw.restart_index;
        unsigned index_size = builder->index_size;
        nir_builder *b = &builder->b;

        nir_ssa_def *draw_ptr = builder->draw.draw_buf;

        builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
        builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);

        nir_ssa_def *thread_id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
        /* Identity values for min/max accumulation. */
        nir_variable *min_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "min");
        nir_store_var(b, min_var, nir_imm_int(b, UINT32_MAX), 1);
        nir_variable *max_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "max");
        nir_store_var(b, max_var, nir_imm_int(b, 0), 1);

        nir_ssa_def *base =
                get_address(b, builder->draw.index_buf,
                            nir_imul_imm(b, builder->draw.vertex_start, index_size));


        /* Byte offset of the first index within its 32-bit word. */
        nir_ssa_def *start = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);
        nir_ssa_def *end =
                nir_iadd(b, start, nir_imul_imm(b, builder->draw.vertex_count, index_size));

        base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));

        /* Align on 4 bytes, non-aligned indices are handled in the indirect draw job. */
        start = nir_iand_imm(b, nir_iadd_imm(b, start, 3), ~3);
        end = nir_iand_imm(b, end, ~3);

        /* Add the job offset. */
        start = nir_iadd(b, start, nir_imul_imm(b, thread_id, sizeof(uint32_t)));

        nir_variable *offset_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "offset");
        nir_store_var(b, offset_var, start, 1);

        /* Each thread reads one 32-bit word per iteration, then jumps
         * ahead by MIN_MAX_JOBS words. */
        LOOP {
                nir_ssa_def *offset = nir_load_var(b, offset_var);
                IF (nir_uge(b, offset, end))
                        BREAK;
                ENDIF

                nir_ssa_def *val = load_global(b, get_address(b, base, offset), 1, 32);
                nir_ssa_def *old_min = nir_load_var(b, min_var);
                nir_ssa_def *old_max = nir_load_var(b, max_var);
                /* Always written: the unrolled loop below runs at least once
                 * since index_size <= sizeof(uint32_t). */
                nir_ssa_def *new_min;
                nir_ssa_def *new_max;

                /* TODO: use 8/16 bit arithmetic when index_size < 4. */
                for (unsigned i = 0; i < 4; i += index_size) {
                        nir_ssa_def *data = nir_ushr_imm(b, val, i * 8);
                        data = nir_iand_imm(b, data, (1ULL << (index_size * 8)) - 1);
                        new_min = nir_umin(b, old_min, data);
                        new_max = nir_umax(b, old_max, data);
                        /* Primitive-restart indices are not real vertices:
                         * keep the previous min/max when one is seen. */
                        if (restart_index) {
                                new_min = nir_bcsel(b, nir_ine(b, restart_index, data), new_min, old_min);
                                new_max = nir_bcsel(b, nir_ine(b, restart_index, data), new_max, old_max);
                        }
                        old_min = new_min;
                        old_max = new_max;
                }

                nir_store_var(b, min_var, new_min, 1);
                nir_store_var(b, max_var, new_max, 1);
                nir_store_var(b, offset_var,
                              nir_iadd_imm(b, offset, MIN_MAX_JOBS * sizeof(uint32_t)), 1);
        }

        /* Threads whose stride starts past the range processed nothing;
         * skip the merge so the identity values don't pollute the result. */
        IF (nir_ult(b, start, end))
                update_min(builder, nir_load_var(b, min_var));
                update_max(builder, nir_load_var(b, max_var));
        ENDIF
}
1084
1085static unsigned
1086get_shader_id(unsigned flags, unsigned index_size, bool index_min_max_search)
1087{
1088        if (!index_min_max_search) {
1089                flags &= PAN_INDIRECT_DRAW_FLAGS_MASK;
1090                flags &= ~PAN_INDIRECT_DRAW_INDEX_SIZE_MASK;
1091                if (index_size)
1092                        flags |= (util_logbase2(index_size) + 1);
1093                return flags;
1094        }
1095
1096        return ((flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) ?
1097                PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX_PRIM_RESTART :
1098                PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX) +
1099               util_logbase2(index_size);
1100}
1101
1102static void
1103create_indirect_draw_shader(struct panfrost_device *dev,
1104                            unsigned flags, unsigned index_size,
1105                            bool index_min_max_search)
1106{
1107        assert(flags < PAN_INDIRECT_DRAW_NUM_SHADERS);
1108        struct indirect_draw_shader_builder builder;
1109        init_shader_builder(&builder, dev, flags, index_size, index_min_max_search);
1110
1111        nir_builder *b = &builder.b;
1112
1113        if (index_min_max_search)
1114                get_index_min_max(&builder);
1115        else
1116                patch(&builder);
1117
1118        struct panfrost_compile_inputs inputs = {
1119                .gpu_id = dev->gpu_id,
1120                .fixed_sysval_ubo = -1,
1121                .no_ubo_to_push = true,
1122        };
1123        struct pan_shader_info shader_info;
1124        struct util_dynarray binary;
1125
1126        util_dynarray_init(&binary, NULL);
1127        GENX(pan_shader_compile)(b->shader, &inputs, &binary, &shader_info);
1128
1129        assert(!shader_info.tls_size);
1130        assert(!shader_info.wls_size);
1131        assert(!shader_info.sysvals.sysval_count);
1132
1133        shader_info.push.count =
1134                DIV_ROUND_UP(sizeof(struct indirect_draw_inputs), 4);
1135
1136        unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
1137        struct pan_indirect_draw_shader *draw_shader =
1138                &dev->indirect_draw_shaders.shaders[shader_id];
1139        void *state = dev->indirect_draw_shaders.states->ptr.cpu +
1140                      (shader_id * pan_size(RENDERER_STATE));
1141
1142        pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
1143        if (!draw_shader->rsd) {
1144                mali_ptr address =
1145                        pan_pool_upload_aligned(dev->indirect_draw_shaders.bin_pool,
1146                                                binary.data, binary.size,
1147                                                PAN_ARCH >= 6 ? 128 : 64);
1148
1149                util_dynarray_fini(&binary);
1150
1151                pan_pack(state, RENDERER_STATE, cfg) {
1152                        pan_shader_prepare_rsd(&shader_info, address, &cfg);
1153                }
1154
1155                draw_shader->push = shader_info.push;
1156                draw_shader->rsd = dev->indirect_draw_shaders.states->ptr.gpu +
1157                                   (shader_id * pan_size(RENDERER_STATE));
1158        }
1159        pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);
1160
1161        ralloc_free(b->shader);
1162}
1163
1164static mali_ptr
1165get_renderer_state(struct panfrost_device *dev, unsigned flags,
1166                   unsigned index_size, bool index_min_max_search)
1167{
1168        unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
1169        struct pan_indirect_draw_shader *info =
1170                &dev->indirect_draw_shaders.shaders[shader_id];
1171
1172        if (!info->rsd) {
1173                create_indirect_draw_shader(dev, flags, index_size,
1174                                            index_min_max_search);
1175                assert(info->rsd);
1176        }
1177
1178        return info->rsd;
1179}
1180
1181static mali_ptr
1182get_tls(const struct panfrost_device *dev)
1183{
1184        return dev->indirect_draw_shaders.states->ptr.gpu +
1185               (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
1186}
1187
1188static void
1189panfrost_indirect_draw_alloc_deps(struct panfrost_device *dev)
1190{
1191        pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
1192        if (dev->indirect_draw_shaders.states)
1193                goto out;
1194
1195        unsigned state_bo_size = (PAN_INDIRECT_DRAW_NUM_SHADERS *
1196                                  pan_size(RENDERER_STATE)) +
1197                                 pan_size(LOCAL_STORAGE);
1198
1199        dev->indirect_draw_shaders.states =
1200                panfrost_bo_create(dev, state_bo_size, 0, "Indirect draw states");
1201
1202        /* Prepare the thread storage descriptor now since it's invariant. */
1203        void *tsd = dev->indirect_draw_shaders.states->ptr.cpu +
1204                    (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
1205        pan_pack(tsd, LOCAL_STORAGE, ls) {
1206                ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
1207        };
1208
1209        /* FIXME: Currently allocating 512M of growable memory, meaning that we
1210         * only allocate what we really use, the problem is:
1211         * - allocation happens 2M at a time, which might be more than we
1212         *   actually need
1213         * - the memory is attached to the device to speed up subsequent
1214         *   indirect draws, but that also means it's never shrinked
1215         */
1216        dev->indirect_draw_shaders.varying_heap =
1217                panfrost_bo_create(dev, 512 * 1024 * 1024,
1218                                   PAN_BO_INVISIBLE | PAN_BO_GROWABLE,
1219                                   "Indirect draw varying heap");
1220
1221out:
1222        pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);
1223}
1224
1225static unsigned
1226panfrost_emit_index_min_max_search(struct pan_pool *pool,
1227                                   struct pan_scoreboard *scoreboard,
1228                                   const struct pan_indirect_draw_info *draw_info,
1229                                   const struct indirect_draw_inputs *inputs,
1230                                   struct indirect_draw_context *draw_ctx)
1231{
1232        struct panfrost_device *dev = pool->dev;
1233        unsigned index_size = draw_info->index_size;
1234
1235        if (!index_size)
1236                return 0;
1237
1238        mali_ptr rsd =
1239                get_renderer_state(dev, draw_info->flags,
1240                                   draw_info->index_size, true);
1241        struct panfrost_ptr job =
1242                pan_pool_alloc_desc(pool, COMPUTE_JOB);
1243        void *invocation =
1244                pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
1245        panfrost_pack_work_groups_compute(invocation,
1246                                          1, 1, 1, MIN_MAX_JOBS, 1, 1,
1247                                          false, false);
1248
1249        pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
1250                cfg.job_task_split = 7;
1251        }
1252
1253        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
1254                cfg.state = rsd;
1255                cfg.thread_storage = get_tls(pool->dev);
1256                cfg.push_uniforms =
1257                        pan_pool_upload_aligned(pool, inputs, sizeof(*inputs), 16);
1258        }
1259
1260        return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
1261                                false, false, 0, 0, &job, false);
1262}
1263
1264unsigned
1265GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
1266                                  struct pan_scoreboard *scoreboard,
1267                                  const struct pan_indirect_draw_info *draw_info,
1268                                  struct panfrost_ptr *ctx)
1269{
1270        struct panfrost_device *dev = pool->dev;
1271
1272        /* Currently only tested on Bifrost, but the logic should be the same
1273         * on Midgard.
1274         */
1275        assert(pan_is_bifrost(dev));
1276
1277        panfrost_indirect_draw_alloc_deps(dev);
1278
1279        struct panfrost_ptr job =
1280                pan_pool_alloc_desc(pool, COMPUTE_JOB);
1281        mali_ptr rsd =
1282                get_renderer_state(dev, draw_info->flags,
1283                                   draw_info->index_size, false);
1284
1285        struct indirect_draw_context draw_ctx = {
1286                .varying_mem = dev->indirect_draw_shaders.varying_heap->ptr.gpu,
1287        };
1288
1289        struct panfrost_ptr draw_ctx_ptr = *ctx;
1290        if (!draw_ctx_ptr.cpu) {
1291                draw_ctx_ptr = pan_pool_alloc_aligned(pool,
1292                                                      sizeof(draw_ctx),
1293                                                      sizeof(mali_ptr));
1294        }
1295
1296        struct indirect_draw_inputs inputs = {
1297                .draw_ctx = draw_ctx_ptr.gpu,
1298                .draw_buf = draw_info->draw_buf,
1299                .index_buf = draw_info->index_buf,
1300                .first_vertex_sysval = draw_info->first_vertex_sysval,
1301                .base_vertex_sysval = draw_info->base_vertex_sysval,
1302                .base_instance_sysval = draw_info->base_instance_sysval,
1303                .vertex_job = draw_info->vertex_job,
1304                .tiler_job = draw_info->tiler_job,
1305                .attrib_bufs = draw_info->attrib_bufs,
1306                .attribs = draw_info->attribs,
1307                .varying_bufs = draw_info->varying_bufs,
1308                .attrib_count = draw_info->attrib_count,
1309        };
1310
1311        if (draw_info->index_size) {
1312                inputs.restart_index = draw_info->restart_index;
1313
1314                struct panfrost_ptr min_max_ctx_ptr =
1315                        pan_pool_alloc_aligned(pool,
1316                                               sizeof(struct min_max_context),
1317                                               4);
1318                struct min_max_context *ctx = min_max_ctx_ptr.cpu;
1319
1320                ctx->min = UINT32_MAX;
1321                ctx->max = 0;
1322                inputs.min_max_ctx = min_max_ctx_ptr.gpu;
1323        }
1324
1325        void *invocation =
1326                pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
1327        panfrost_pack_work_groups_compute(invocation,
1328                                          1, 1, 1, 1, 1, 1,
1329                                          false, false);
1330
1331        pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
1332                cfg.job_task_split = 2;
1333        }
1334
1335        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
1336                cfg.state = rsd;
1337                cfg.thread_storage = get_tls(pool->dev);
1338                cfg.push_uniforms =
1339                        pan_pool_upload_aligned(pool, &inputs, sizeof(inputs), 16);
1340        }
1341
1342        unsigned global_dep = draw_info->last_indirect_draw;
1343        unsigned local_dep =
1344                panfrost_emit_index_min_max_search(pool, scoreboard, draw_info,
1345                                                   &inputs, &draw_ctx);
1346
1347        if (!ctx->cpu) {
1348                *ctx = draw_ctx_ptr;
1349                memcpy(ctx->cpu, &draw_ctx, sizeof(draw_ctx));
1350        }
1351
1352        return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
1353                                false, true, local_dep, global_dep,
1354                                &job, false);
1355}
1356
1357void
1358GENX(panfrost_init_indirect_draw_shaders)(struct panfrost_device *dev,
1359                                          struct pan_pool *bin_pool)
1360{
1361        /* We allocate the states and varying_heap BO lazily to avoid
1362         * reserving memory when indirect draws are not used.
1363         */
1364        pthread_mutex_init(&dev->indirect_draw_shaders.lock, NULL);
1365        dev->indirect_draw_shaders.bin_pool = bin_pool;
1366}
1367
1368void
1369GENX(panfrost_cleanup_indirect_draw_shaders)(struct panfrost_device *dev)
1370{
1371        panfrost_bo_unreference(dev->indirect_draw_shaders.states);
1372        panfrost_bo_unreference(dev->indirect_draw_shaders.varying_heap);
1373        pthread_mutex_destroy(&dev->indirect_draw_shaders.lock);
1374}
1375