/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

#include "compiler.h"
#include "bi_builder.h"

/* Arguments common to worklist, passed by value for convenience */

struct bi_worklist {
        /* # of instructions in the block */
        unsigned count;

        /* Instructions in the block */
        bi_instr **instructions;

        /* Bitset of instructions in the block ready for scheduling */
        BITSET_WORD *worklist;

        /* The backwards dependency graph. dep_counts[i] is the number of
         * unscheduled instructions that must still be scheduled after (before)
         * instruction i. dependents[i] is the set of instructions that must be
         * scheduled before (after) instruction i. */
        unsigned *dep_counts;
        BITSET_WORD **dependents;
};

/* State of a single tuple and clause under construction */

struct bi_reg_state {
        /* Number of register writes */
        unsigned nr_writes;

        /* Register reads, expressed as (equivalence classes of)
         * sources. Only 3 reads are allowed, but up to 2 may spill as
         * "forced" for the next scheduled tuple, provided such a tuple
         * can be constructed */
        bi_index reads[5];
        unsigned nr_reads;

        /* The previous tuple scheduled (= the next tuple executed in the
         * program) may require certain writes, in order to bypass the register
         * file and use a temporary passthrough for the value. Up to 2 such
         * constraints are architecturally satisfiable */
        unsigned forced_count;
        bi_index forceds[2];
};

struct bi_tuple_state {
        /* Is this the last tuple in the clause */
        bool last;

        /* Scheduled ADD instruction, or null if none */
        bi_instr *add;

        /* Reads for previous (succeeding) tuple */
        bi_index prev_reads[5];
        unsigned nr_prev_reads;
        bi_tuple *prev;

        /* Register slot state for current tuple */
        struct bi_reg_state reg;

        /* Constants are shared in the tuple. If constant_count is nonzero, it
         * is the number of embedded constants in use. Otherwise, fau is the
         * slot read from FAU, or zero if none is assigned. Ordinarily FAU slot
         * 0 reads zero, but within a tuple, that should be encoded as
         * constant_count != 0 and constants[0] = constants[1] = 0 */
        unsigned constant_count;

        union {
                uint32_t constants[2];
                enum bir_fau fau;
        };

        unsigned pcrel_idx;
};

struct bi_const_state {
        unsigned constant_count;
        bool pcrel; /* applies to first const */
        uint32_t constants[2];

        /* Index of the constant into the clause */
        unsigned word_idx;
};

enum bi_ftz_state {
        /* No flush-to-zero state assigned yet */
        BI_FTZ_STATE_NONE,

        /* Never flush-to-zero */
        BI_FTZ_STATE_DISABLE,

        /* Always flush-to-zero */
        BI_FTZ_STATE_ENABLE,
};

struct bi_clause_state {
        /* Has a message-passing instruction already been assigned? */
        bool message;

        /* Indices already accessed; tracked to avoid hazards around
         * message-passing instructions */
        unsigned access_count;
        bi_index accesses[(BI_MAX_SRCS + BI_MAX_DESTS) * 16];

        unsigned tuple_count;
        struct bi_const_state consts[8];

        /* Numerical state of the clause */
        enum bi_ftz_state ftz;
};

/* Determines the message type by checking the table and a few special cases.
 * The only missing case is tilebuffer instructions that access depth/stencil,
 * which require a Z_STENCIL message (to implement
 * ARM_shader_framebuffer_fetch_depth_stencil) */

static enum bifrost_message_type
bi_message_type_for_instr(bi_instr *ins)
{
        enum bifrost_message_type msg = bi_opcode_props[ins->op].message;
        bool ld_var_special = (ins->op == BI_OPCODE_LD_VAR_SPECIAL);

        if (ld_var_special && ins->varying_name == BI_VARYING_NAME_FRAG_Z)
                return BIFROST_MESSAGE_Z_STENCIL;

        if (msg == BIFROST_MESSAGE_LOAD && ins->seg == BI_SEG_UBO)
                return BIFROST_MESSAGE_ATTRIBUTE;

        return msg;
}

/* Attribute, texture, and UBO load (attribute message) instructions support
 * bindless, so just check the message type */

ASSERTED static bool
bi_supports_dtsel(bi_instr *ins)
{
        switch (bi_message_type_for_instr(ins)) {
        case BIFROST_MESSAGE_ATTRIBUTE:
                return ins->op != BI_OPCODE_LD_GCLK_U64;
        case BIFROST_MESSAGE_TEX:
                return true;
        default:
                return false;
        }
}

/* Adds an edge to the dependency graph */

static void
bi_push_dependency(unsigned parent, unsigned child,
                BITSET_WORD **dependents, unsigned *dep_counts)
{
        if (!BITSET_TEST(dependents[parent], child)) {
                BITSET_SET(dependents[parent], child);
                dep_counts[child]++;
        }
}

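/* Adds edges from every instruction recorded in the access table entry for a
 * given register to the child instruction */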
static void
add_dependency(struct util_dynarray *table, unsigned index, unsigned child,
                BITSET_WORD **dependents, unsigned *dep_counts)
{
        assert(index < 64);
        util_dynarray_foreach(table + index, unsigned, parent)
                bi_push_dependency(*parent, child, dependents, dep_counts);
}

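/* Records instruction `parent` in the access table entry for a given register */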
static void
mark_access(struct util_dynarray *table, unsigned index, unsigned parent)
{
        assert(index < 64);
        util_dynarray_append(&table[index], unsigned, parent);
}

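/* Scheduling barriers: instructions that must not be reordered with respect to
 * anything else in the block, so every other instruction is serialized against
 * them when building the dependency graph */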
static bool
bi_is_sched_barrier(bi_instr *I)
{
        switch (I->op) {
        case BI_OPCODE_BARRIER:
        case BI_OPCODE_DISCARD_F32:
                return true;
        default:
                return false;
        }
}

static void
bi_create_dependency_graph(struct bi_worklist st, bool inorder, bool is_blend)
{
        struct util_dynarray last_read[64], last_write[64];

        for (unsigned i = 0; i < 64; ++i) {
                util_dynarray_init(&last_read[i], NULL);
                util_dynarray_init(&last_write[i], NULL);
        }

        /* Initialize dependency graph */
        for (unsigned i = 0; i < st.count; ++i) {
                st.dependents[i] =
                        calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));

                st.dep_counts[i] = 0;
        }

        unsigned prev_msg = ~0;

        /* Populate dependency graph */
        for (signed i = st.count - 1; i >= 0; --i) {
                bi_instr *ins = st.instructions[i];

                bi_foreach_src(ins, s) {
                        if (ins->src[s].type != BI_INDEX_REGISTER) continue;
                        unsigned count = bi_count_read_registers(ins, s);

                        for (unsigned c = 0; c < count; ++c)
                                add_dependency(last_write, ins->src[s].value + c, i, st.dependents, st.dep_counts);
                }

                /* Keep message-passing ops in order. (This pass only cares
                 * about bundling; reordering of message-passing instructions
                 * happens during earlier scheduling.) */

                if (bi_message_type_for_instr(ins)) {
                        if (prev_msg != ~0)
                                bi_push_dependency(prev_msg, i, st.dependents, st.dep_counts);

                        prev_msg = i;
                }

                /* Handle schedule barriers, adding all the deps */
                if (inorder || bi_is_sched_barrier(ins)) {
                        for (unsigned j = 0; j < st.count; ++j) {
                                if (i == j) continue;

                                bi_push_dependency(MAX2(i, j), MIN2(i, j),
                                                st.dependents, st.dep_counts);
                        }
                }

                bi_foreach_dest(ins, d) {
                        if (ins->dest[d].type != BI_INDEX_REGISTER) continue;
                        unsigned dest = ins->dest[d].value;

                        unsigned count = bi_count_write_registers(ins, d);

                        for (unsigned c = 0; c < count; ++c) {
                                add_dependency(last_read, dest + c, i, st.dependents, st.dep_counts);
                                add_dependency(last_write, dest + c, i, st.dependents, st.dep_counts);
                                mark_access(last_write, dest + c, i);
                        }
                }

                /* Blend shaders are allowed to clobber R0-R15. Treat these
                 * registers like extra destinations for scheduling purposes.
                 */
                if (ins->op == BI_OPCODE_BLEND && !is_blend) {
                        for (unsigned c = 0; c < 16; ++c) {
                                add_dependency(last_read, c, i, st.dependents, st.dep_counts);
                                add_dependency(last_write, c, i, st.dependents, st.dep_counts);
                                mark_access(last_write, c, i);
                        }
                }

                bi_foreach_src(ins, s) {
                        if (ins->src[s].type != BI_INDEX_REGISTER) continue;

                        unsigned count = bi_count_read_registers(ins, s);

                        for (unsigned c = 0; c < count; ++c)
                                mark_access(last_read, ins->src[s].value + c, i);
                }
        }

        /* If there is a branch, all instructions depend on it, as interblock
         * execution must be purely in-order */

        bi_instr *last = st.instructions[st.count - 1];
        if (last->branch_target || last->op == BI_OPCODE_JUMP) {
                for (signed i = st.count - 2; i >= 0; --i)
                        bi_push_dependency(st.count - 1, i, st.dependents, st.dep_counts);
        }

        /* Free the intermediate structures */
        for (unsigned i = 0; i < 64; ++i) {
                util_dynarray_fini(&last_read[i]);
                util_dynarray_fini(&last_write[i]);
        }
}

/* Scheduler pseudoinstruction lowerings to enable instruction pairings.
 * Currently only CUBEFACE -> *CUBEFACE1/+CUBEFACE2 is supported.
 */

static bi_instr *
bi_lower_cubeface(bi_context *ctx,
                struct bi_clause_state *clause, struct bi_tuple_state *tuple)
{
        bi_instr *pinstr = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
        bi_instr *cubeface1 = bi_cubeface1_to(&b, pinstr->dest[0],
                        pinstr->src[0], pinstr->src[1], pinstr->src[2]);

        pinstr->op = BI_OPCODE_CUBEFACE2;
        pinstr->dest[0] = pinstr->dest[1];
        pinstr->dest[1] = bi_null();
        pinstr->src[0] = cubeface1->dest[0];
        pinstr->src[1] = bi_null();
        pinstr->src[2] = bi_null();

        return cubeface1;
}

/* Pseudo arguments are (rbase, address lo, address hi). We need *ATOM_C.i32 to
 * have the arguments (address lo, address hi, rbase), and +ATOM_CX to have the
 * arguments (rbase, address lo, address hi, rbase) */

static bi_instr *
bi_lower_atom_c(bi_context *ctx, struct bi_clause_state *clause, struct
                bi_tuple_state *tuple)
{
        bi_instr *pinstr = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
        bi_instr *atom_c = bi_atom_c_return_i32(&b,
                        pinstr->src[1], pinstr->src[2], pinstr->src[0],
                        pinstr->atom_opc);

        if (bi_is_null(pinstr->dest[0]))
                atom_c->op = BI_OPCODE_ATOM_C_I32;

        pinstr->op = BI_OPCODE_ATOM_CX;
        pinstr->src[3] = atom_c->src[2];

        return atom_c;
}

static bi_instr *
bi_lower_atom_c1(bi_context *ctx, struct bi_clause_state *clause, struct
                bi_tuple_state *tuple)
{
        bi_instr *pinstr = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
        bi_instr *atom_c = bi_atom_c1_return_i32(&b,
                        pinstr->src[0], pinstr->src[1], pinstr->atom_opc);

        if (bi_is_null(pinstr->dest[0]))
                atom_c->op = BI_OPCODE_ATOM_C1_I32;

        pinstr->op = BI_OPCODE_ATOM_CX;
        pinstr->src[2] = pinstr->src[1];
        pinstr->src[1] = pinstr->src[0];
        pinstr->src[3] = bi_dontcare(&b);
        pinstr->src[0] = bi_null();

        return atom_c;
}

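/* Lowers the 64-bit SEG_ADD pseudoinstruction to a *SEG_ADD/+SEG_ADD pair: the
 * new FMA instruction keeps src[0] and dest[0], while the remaining ADD takes
 * src[1] and writes the following register */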
static bi_instr *
bi_lower_seg_add(bi_context *ctx,
                struct bi_clause_state *clause, struct bi_tuple_state *tuple)
{
        bi_instr *pinstr = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));

        bi_instr *fma = bi_seg_add_to(&b, pinstr->dest[0], pinstr->src[0],
                        pinstr->preserve_null, pinstr->seg);

        pinstr->op = BI_OPCODE_SEG_ADD;
        pinstr->src[0] = pinstr->src[1];
        pinstr->src[1] = bi_null();

        assert(pinstr->dest[0].type == BI_INDEX_REGISTER);
        pinstr->dest[0].value += 1;

        return fma;
}

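/* Pairs a message-passing instruction that needs a descriptor table select
 * with a *DTSEL_IMM feeding the selected descriptor through a temporary */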
static bi_instr *
bi_lower_dtsel(bi_context *ctx,
                struct bi_clause_state *clause, struct bi_tuple_state *tuple)
{
        bi_instr *add = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(add));

        bi_instr *dtsel = bi_dtsel_imm_to(&b, bi_temp(b.shader),
                        add->src[0], add->table);
        add->src[0] = dtsel->dest[0];

        assert(bi_supports_dtsel(add));
        return dtsel;
}

/* Flatten linked list to array for O(1) indexing */

static bi_instr **
bi_flatten_block(bi_block *block, unsigned *len)
{
        if (list_is_empty(&block->instructions))
                return NULL;

        *len = list_length(&block->instructions);
        bi_instr **instructions = malloc(sizeof(bi_instr *) * (*len));

        unsigned i = 0;

        bi_foreach_instr_in_block(block, ins)
                instructions[i++] = ins;

        return instructions;
}

/* The worklist tracks instructions without outstanding dependencies. For
 * debug, in-order scheduling can be forced by serializing all instructions in
 * the dependency graph.
 */

static struct bi_worklist
bi_initialize_worklist(bi_block *block, bool inorder, bool is_blend)
{
        struct bi_worklist st = { };
        st.instructions = bi_flatten_block(block, &st.count);

        if (!st.count)
                return st;

        st.dependents = calloc(st.count, sizeof(st.dependents[0]));
        st.dep_counts = calloc(st.count, sizeof(st.dep_counts[0]));

        bi_create_dependency_graph(st, inorder, is_blend);
        st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));

        for (unsigned i = 0; i < st.count; ++i) {
                if (st.dep_counts[i] == 0)
                        BITSET_SET(st.worklist, i);
        }

        return st;
}

static void
bi_free_worklist(struct bi_worklist st)
{
        free(st.dep_counts);
        free(st.dependents);
        free(st.instructions);
        free(st.worklist);
}

static void
bi_update_worklist(struct bi_worklist st, unsigned idx)
{
        assert(st.dep_counts[idx] == 0);

        if (!st.dependents[idx])
                return;

        /* Iterate each dependent to remove one dependency (`done`),
         * adding dependents to the worklist where possible. */

        unsigned i;
        BITSET_FOREACH_SET(i, st.dependents[idx], st.count) {
                assert(st.dep_counts[i] != 0);
                unsigned new_deps = --st.dep_counts[i];

                if (new_deps == 0)
                        BITSET_SET(st.worklist, i);
        }

        free(st.dependents[idx]);
}

/* Scheduler predicates */

/* IADDC.i32 can implement IADD.u32 if no saturation or swizzling is in use */
static bool
bi_can_iaddc(bi_instr *ins)
{
        return (ins->op == BI_OPCODE_IADD_U32 && !ins->saturate &&
                ins->src[0].swizzle == BI_SWIZZLE_H01 &&
                ins->src[1].swizzle == BI_SWIZZLE_H01);
}

/*
 * The encoding of *FADD.v2f16 only specifies a single abs flag. All abs
 * encodings are permitted by swapping operands; however, this scheme fails if
 * both operands are equal. Test for this case.
 */
static bool
bi_impacted_abs(bi_instr *I)
{
        return I->src[0].abs && I->src[1].abs &&
               bi_is_word_equiv(I->src[0], I->src[1]);
}

bool
bi_can_fma(bi_instr *ins)
{
        /* +IADD.i32 -> *IADDC.i32 */
        if (bi_can_iaddc(ins))
                return true;

        /* +MUX -> *CSEL */
        if (bi_can_replace_with_csel(ins))
                return true;

        /* *FADD.v2f16 has restricted abs modifiers, use +FADD.v2f16 instead */
        if (ins->op == BI_OPCODE_FADD_V2F16 && bi_impacted_abs(ins))
                return false;

        /* TODO: some additional fp16 constraints */
        return bi_opcode_props[ins->op].fma;
}

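/* +FADD.f32 only supports a restricted set of widening swizzles; the impacted
 * combinations must be scheduled to *FADD.f32 instead (see bi_can_add) */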
static bool
bi_impacted_fadd_widens(bi_instr *I)
{
        enum bi_swizzle swz0 = I->src[0].swizzle;
        enum bi_swizzle swz1 = I->src[1].swizzle;

        return (swz0 == BI_SWIZZLE_H00 && swz1 == BI_SWIZZLE_H11) ||
                (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H11) ||
                (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H00);
}

bool
bi_can_add(bi_instr *ins)
{
        /* +FADD.v2f16 lacks clamp modifier, use *FADD.v2f16 instead */
        if (ins->op == BI_OPCODE_FADD_V2F16 && ins->clamp)
                return false;

        /* +FCMP.v2f16 lacks abs modifier, use *FCMP.v2f16 instead */
        if (ins->op == BI_OPCODE_FCMP_V2F16 && (ins->src[0].abs || ins->src[1].abs))
                return false;

        /* +FADD.f32 has restricted widens, use *FADD.f32 for the full set */
        if (ins->op == BI_OPCODE_FADD_F32 && bi_impacted_fadd_widens(ins))
                return false;

        /* TODO: some additional fp16 constraints */
        return bi_opcode_props[ins->op].add;
}

/* Architecturally, no single instruction has a "not last" constraint. However,
 * pseudoinstructions writing multiple destinations (expanding to multiple
 * paired instructions) can run afoul of the "no two writes in the last tuple
 * of a clause" constraint, so we check for that here.
 *
 * Exception to the exception: TEXC, which writes to multiple sets of staging
 * registers. Staging registers bypass the usual register write mechanism so
 * this restriction does not apply.
 */

static bool
bi_must_not_last(bi_instr *ins)
{
        return !bi_is_null(ins->dest[0]) && !bi_is_null(ins->dest[1]) &&
               (ins->op != BI_OPCODE_TEXC);
}

/* Check for a message-passing instruction. +DISCARD.f32 is special-cased; we
 * treat it as a message-passing instruction for the purpose of scheduling
 * despite passing no logical message. Otherwise invalid encoding faults may
 * be raised for unknown reasons (possibly an errata).
 */

bool
bi_must_message(bi_instr *ins)
{
        return (bi_opcode_props[ins->op].message != BIFROST_MESSAGE_NONE) ||
                (ins->op == BI_OPCODE_DISCARD_F32);
}

static bool
bi_fma_atomic(enum bi_opcode op)
{
        switch (op) {
        case BI_OPCODE_ATOM_C_I32:
        case BI_OPCODE_ATOM_C_I64:
        case BI_OPCODE_ATOM_C1_I32:
        case BI_OPCODE_ATOM_C1_I64:
        case BI_OPCODE_ATOM_C1_RETURN_I32:
        case BI_OPCODE_ATOM_C1_RETURN_I64:
        case BI_OPCODE_ATOM_C_RETURN_I32:
        case BI_OPCODE_ATOM_C_RETURN_I64:
        case BI_OPCODE_ATOM_POST_I32:
        case BI_OPCODE_ATOM_POST_I64:
        case BI_OPCODE_ATOM_PRE_I64:
                return true;
        default:
                return false;
        }
}

bool
bi_reads_zero(bi_instr *ins)
{
        return !(bi_fma_atomic(ins->op) || ins->op == BI_OPCODE_IMULD);
}

bool
bi_reads_temps(bi_instr *ins, unsigned src)
{
        switch (ins->op) {
        /* Cannot permute a temporary */
        case BI_OPCODE_CLPER_I32:
        case BI_OPCODE_CLPER_OLD_I32:
                return src != 0;

        /* ATEST isn't supposed to be restricted, but in practice it always
         * wants to source its coverage mask input (source 0) from register 60,
         * which won't work properly if we put the input in a temp. This
         * requires workarounds in both RA and clause scheduling.
         */
        case BI_OPCODE_ATEST:
                return src != 0;

        case BI_OPCODE_IMULD:
                return false;
        default:
                return true;
        }
}

static bool
bi_impacted_t_modifiers(bi_instr *I, unsigned src)
{
        enum bi_swizzle swizzle = I->src[src].swizzle;

        switch (I->op) {
        case BI_OPCODE_F16_TO_F32:
        case BI_OPCODE_F16_TO_S32:
        case BI_OPCODE_F16_TO_U32:
        case BI_OPCODE_MKVEC_V2I16:
        case BI_OPCODE_S16_TO_F32:
        case BI_OPCODE_S16_TO_S32:
        case BI_OPCODE_U16_TO_F32:
        case BI_OPCODE_U16_TO_U32:
                return (swizzle != BI_SWIZZLE_H00);

        case BI_OPCODE_BRANCH_F32:
        case BI_OPCODE_LOGB_F32:
        case BI_OPCODE_ILOGB_F32:
        case BI_OPCODE_FADD_F32:
        case BI_OPCODE_FCMP_F32:
        case BI_OPCODE_FREXPE_F32:
        case BI_OPCODE_FREXPM_F32:
        case BI_OPCODE_FROUND_F32:
                return (swizzle != BI_SWIZZLE_H01);

        case BI_OPCODE_IADD_S32:
        case BI_OPCODE_IADD_U32:
        case BI_OPCODE_ISUB_S32:
        case BI_OPCODE_ISUB_U32:
        case BI_OPCODE_IADD_V4S8:
        case BI_OPCODE_IADD_V4U8:
        case BI_OPCODE_ISUB_V4S8:
        case BI_OPCODE_ISUB_V4U8:
                return (src == 1) && (swizzle != BI_SWIZZLE_H01);

        case BI_OPCODE_S8_TO_F32:
        case BI_OPCODE_S8_TO_S32:
        case BI_OPCODE_U8_TO_F32:
        case BI_OPCODE_U8_TO_U32:
                return (swizzle != BI_SWIZZLE_B0000);

        case BI_OPCODE_V2S8_TO_V2F16:
        case BI_OPCODE_V2S8_TO_V2S16:
        case BI_OPCODE_V2U8_TO_V2F16:
        case BI_OPCODE_V2U8_TO_V2U16:
                return (swizzle != BI_SWIZZLE_B0022);

        case BI_OPCODE_IADD_V2S16:
        case BI_OPCODE_IADD_V2U16:
        case BI_OPCODE_ISUB_V2S16:
        case BI_OPCODE_ISUB_V2U16:
                return (src == 1) && (swizzle >= BI_SWIZZLE_H11);

#if 0
        /* Restriction on IADD in 64-bit clauses on G72 */
        case BI_OPCODE_IADD_S64:
        case BI_OPCODE_IADD_U64:
                return (src == 1) && (swizzle != BI_SWIZZLE_D0);
#endif

        default:
                return false;
        }
}
714
715bool
716bi_reads_t(bi_instr *ins, unsigned src)
717{
718        /* Branch offset cannot come from passthrough */
719        if (bi_opcode_props[ins->op].branch)
720                return src != 2;
721
722        /* Table can never read passthrough */
723        if (bi_opcode_props[ins->op].table)
724                return false;
725
726        /* Staging register reads may happen before the succeeding register
727         * block encodes a write, so effectively there is no passthrough */
728        if (bi_is_staging_src(ins, src))
729                return false;
730
731        /* Bifrost cores newer than Mali G71 have restrictions on swizzles on
732         * same-cycle temporaries. Check the list for these hazards. */
733        if (bi_impacted_t_modifiers(ins, src))
734                return false;
735
736        /* Descriptor must not come from a passthrough */
737        switch (ins->op) {
738        case BI_OPCODE_LD_CVT:
739        case BI_OPCODE_LD_TILE:
740        case BI_OPCODE_ST_CVT:
741        case BI_OPCODE_ST_TILE:
742        case BI_OPCODE_TEXC:
743                return src != 2;
744        case BI_OPCODE_BLEND:
745                return src != 2 && src != 3;
746
747        /* +JUMP can't read the offset from T */
748        case BI_OPCODE_JUMP:
749                return false;
750
751        /* Else, just check if we can read any temps */
752        default:
753                return bi_reads_temps(ins, src);
754        }
755}
756
757/* Counts the number of 64-bit constants required by a clause. TODO: We
758 * might want to account for merging, right now we overestimate, but
759 * that's probably fine most of the time */

static unsigned
bi_nconstants(struct bi_clause_state *clause)
{
        unsigned count_32 = 0;

        for (unsigned i = 0; i < ARRAY_SIZE(clause->consts); ++i)
                count_32 += clause->consts[i].constant_count;

        return DIV_ROUND_UP(count_32, 2);
}

/* Would there be space for constants if we added one tuple? */

static bool
bi_space_for_more_constants(struct bi_clause_state *clause)
{
        return (bi_nconstants(clause) < 13 - (clause->tuple_count + 1));
}

/* Updates the FAU assignment for a tuple. A valid FAU assignment must be
 * possible (as a precondition), though not necessarily on the selected unit;
 * this is guaranteed per-instruction by bi_lower_fau and per-tuple by
 * bi_instr_schedulable */

static bool
bi_update_fau(struct bi_clause_state *clause,
                struct bi_tuple_state *tuple,
                bi_instr *instr, bool fma, bool destructive)
{
        /* Maintain our own constants, for nondestructive mode */
        uint32_t copied_constants[2], copied_count;
        unsigned *constant_count = &tuple->constant_count;
        uint32_t *constants = tuple->constants;
        enum bir_fau fau = tuple->fau;

        if (!destructive) {
                memcpy(copied_constants, tuple->constants,
                                (*constant_count) * sizeof(constants[0]));
                copied_count = tuple->constant_count;

                constant_count = &copied_count;
                constants = copied_constants;
        }

        bi_foreach_src(instr, s) {
                bi_index src = instr->src[s];

                if (src.type == BI_INDEX_FAU) {
                        bool no_constants = *constant_count == 0;
                        bool no_other_fau = (fau == src.value) || !fau;
                        bool mergable = no_constants && no_other_fau;

                        if (destructive) {
                                assert(mergable);
                                tuple->fau = src.value;
                        } else if (!mergable) {
                                return false;
                        }

                        fau = src.value;
                } else if (src.type == BI_INDEX_CONSTANT) {
                        /* No need to reserve space if we have a fast 0 */
                        if (src.value == 0 && fma && bi_reads_zero(instr))
                                continue;

                        /* If there is a branch target, #0 by convention is the
                         * PC-relative offset to the target */
                        bool pcrel = instr->branch_target && src.value == 0;
                        bool found = false;

                        for (unsigned i = 0; i < *constant_count; ++i) {
                                found |= (constants[i] == src.value) &&
                                        (i != tuple->pcrel_idx);
                        }

                        /* pcrel constants are unique, so don't match */
                        if (found && !pcrel)
                                continue;

                        bool no_fau = (*constant_count > 0) || !fau;
                        bool mergable = no_fau && ((*constant_count) < 2);

                        if (destructive) {
                                assert(mergable);

                                if (pcrel)
                                        tuple->pcrel_idx = *constant_count;
                        } else if (!mergable)
                                return false;

                        constants[(*constant_count)++] = src.value;
                }
        }

        /* Constants per clause may be limited by tuple count */
        bool room_for_constants = (*constant_count == 0) ||
                bi_space_for_more_constants(clause);

        if (destructive)
                assert(room_for_constants);
        else if (!room_for_constants)
                return false;

        return true;
}

/* Given an in-progress tuple, a candidate new instruction to add to the tuple,
 * and a source (index) from that candidate, determine whether this source is
 * "new", in the sense of requiring an additional read slot. That is, checks
 * whether the specified source reads from the register file via a read slot
 * (determined by its type and placement) and whether the source was already
 * specified by a prior read slot (to avoid double counting) */

static bool
bi_tuple_is_new_src(bi_instr *instr, struct bi_reg_state *reg, unsigned src_idx)
{
        bi_index src = instr->src[src_idx];

        /* Only consider sources which come from the register file */
        if (!(src.type == BI_INDEX_NORMAL || src.type == BI_INDEX_REGISTER))
                return false;

        /* Staging register reads bypass the usual register file mechanism */
        if (bi_is_staging_src(instr, src_idx))
                return false;

        /* If a source is already read in the tuple, it is already counted */
        for (unsigned t = 0; t < reg->nr_reads; ++t)
                if (bi_is_word_equiv(src, reg->reads[t]))
                        return false;

        /* If a source is read in _this instruction_, it is already counted */
        for (unsigned t = 0; t < src_idx; ++t)
                if (bi_is_word_equiv(src, instr->src[t]))
                        return false;

        return true;
}

/* Given two tuples in source order, count the number of register reads of the
 * successor, determined as the number of unique words accessed that aren't
 * written by the predecessor (since those are tempable).
 */

static unsigned
bi_count_succ_reads(bi_index t0, bi_index t1,
                bi_index *succ_reads, unsigned nr_succ_reads)
{
        unsigned reads = 0;

        for (unsigned i = 0; i < nr_succ_reads; ++i) {
                bool unique = true;

                for (unsigned j = 0; j < i; ++j)
                        if (bi_is_word_equiv(succ_reads[i], succ_reads[j]))
                                unique = false;

                if (!unique)
                        continue;

                if (bi_is_word_equiv(succ_reads[i], t0))
                        continue;

                if (bi_is_word_equiv(succ_reads[i], t1))
                        continue;

                reads++;
        }

        return reads;
}

/* Not all instructions can read from the staging passthrough (as determined by
 * reads_t); check if a given pair of instructions has such a restriction. Note
 * we also use this mechanism to prevent data races around staging register
 * reads, so we allow the input source to potentially be vector-valued */

static bool
bi_has_staging_passthrough_hazard(bi_index fma, bi_instr *add)
{
        bi_foreach_src(add, s) {
                bi_index src = add->src[s];

                if (src.type != BI_INDEX_REGISTER)
                        continue;

                unsigned count = bi_count_read_registers(add, s);
                bool read = false;

                for (unsigned d = 0; d < count; ++d)
                        read |= bi_is_equiv(fma, bi_register(src.value + d));

                if (read && !bi_reads_t(add, s))
                        return true;
        }

        return false;
}

/* Likewise for cross-tuple passthrough (reads_temps) */

static bool
bi_has_cross_passthrough_hazard(bi_tuple *succ, bi_instr *ins)
{
        bi_foreach_instr_in_tuple(succ, pins) {
                bi_foreach_src(pins, s) {
                        if (bi_is_word_equiv(ins->dest[0], pins->src[s]) &&
                                        !bi_reads_temps(pins, s))
                                return true;
                }
        }

        return false;
}

/* Is a register written other than the staging mechanism? ATEST is special,
 * writing to both a staging register and a regular register (fixed packing).
 * BLEND is special since it has to write r48 the normal way even if it never
 * gets read. This depends on liveness analysis, as a register is not needed
 * for a write that will be discarded after one tuple. */

static unsigned
bi_write_count(bi_instr *instr, uint64_t live_after_temp)
{
        if (instr->op == BI_OPCODE_ATEST || instr->op == BI_OPCODE_BLEND)
                return 1;

        unsigned count = 0;

        bi_foreach_dest(instr, d) {
                if (d == 0 && bi_opcode_props[instr->op].sr_write)
                        continue;

                if (bi_is_null(instr->dest[d]))
                        continue;

                assert(instr->dest[d].type == BI_INDEX_REGISTER);
                if (live_after_temp & BITFIELD64_BIT(instr->dest[d].value))
                        count++;
        }

        return count;
}

/*
 * Test if an instruction requires flush-to-zero mode. Currently only supported
 * for f16<-->f32 conversions to implement fquantize16
 */
static bool
bi_needs_ftz(bi_instr *I)
{
        return (I->op == BI_OPCODE_F16_TO_F32 ||
                I->op == BI_OPCODE_V2F32_TO_V2F16) && I->ftz;
}

/*
 * Test if an instruction would be numerically incompatible with the clause. At
 * present we only consider flush-to-zero modes.
 */
static bool
bi_numerically_incompatible(struct bi_clause_state *clause, bi_instr *instr)
{
        return (clause->ftz != BI_FTZ_STATE_NONE) &&
               ((clause->ftz == BI_FTZ_STATE_ENABLE) != bi_needs_ftz(instr));
}

/* Instruction placement entails two questions: what subset of instructions in
 * the block can legally be scheduled? and of those which is the best? That is,
 * we seek to maximize a cost function on a subset of the worklist satisfying a
 * particular predicate. The necessary predicate is determined entirely by
 * Bifrost's architectural limitations and is described in the accompanying
 * whitepaper. The cost function is a heuristic. */

static bool
bi_instr_schedulable(bi_instr *instr,
                struct bi_clause_state *clause,
                struct bi_tuple_state *tuple,
                uint64_t live_after_temp,
                bool fma)
{
        /* The units must match */
        if ((fma && !bi_can_fma(instr)) || (!fma && !bi_can_add(instr)))
                return false;

        /* There can only be one message-passing instruction per clause */
        if (bi_must_message(instr) && clause->message)
                return false;

        /* Some instructions have placement requirements */
        if (bi_opcode_props[instr->op].last && !tuple->last)
                return false;

        if (bi_must_not_last(instr) && tuple->last)
                return false;

        /* Numerical properties must be compatible with the clause */
        if (bi_numerically_incompatible(clause, instr))
                return false;

        /* Message-passing instructions are not guaranteed to write within the
         * same clause (most likely they will not), so if a later instruction
         * in the clause accesses the destination, the message-passing
         * instruction can't be scheduled */
        if (bi_opcode_props[instr->op].sr_write) {
                bi_foreach_dest(instr, d) {
                        if (bi_is_null(instr->dest[d]))
                                continue;

                        unsigned nr = bi_count_write_registers(instr, d);
                        assert(instr->dest[d].type == BI_INDEX_REGISTER);
                        unsigned reg = instr->dest[d].value;

                        for (unsigned i = 0; i < clause->access_count; ++i) {
                                bi_index idx = clause->accesses[i];
                                for (unsigned d = 0; d < nr; ++d) {
                                        if (bi_is_equiv(bi_register(reg + d), idx))
                                                return false;
                                }
                        }
                }
        }

        if (bi_opcode_props[instr->op].sr_read && !bi_is_null(instr->src[0])) {
                unsigned nr = bi_count_read_registers(instr, 0);
                assert(instr->src[0].type == BI_INDEX_REGISTER);
                unsigned reg = instr->src[0].value;

                for (unsigned i = 0; i < clause->access_count; ++i) {
                        bi_index idx = clause->accesses[i];
                        for (unsigned d = 0; d < nr; ++d) {
                                if (bi_is_equiv(bi_register(reg + d), idx))
                                        return false;
                        }
                }
        }

        /* If FAU is already assigned, we may not disrupt that. Do a
         * non-disruptive test update */
        if (!bi_update_fau(clause, tuple, instr, fma, false))
                return false;

        /* If this choice of FMA would force a staging passthrough, the ADD
         * instruction must support such a passthrough */
        if (tuple->add && bi_has_staging_passthrough_hazard(instr->dest[0], tuple->add))
                return false;

        /* If this choice of destination would force a cross-tuple passthrough,
         * the next tuple must support that */
        if (tuple->prev && bi_has_cross_passthrough_hazard(tuple->prev, instr))
                return false;

        /* Register file writes are limited */
        unsigned total_writes = tuple->reg.nr_writes;
        total_writes += bi_write_count(instr, live_after_temp);

        /* Last tuple in a clause can only write a single value */
        if (tuple->last && total_writes > 1)
                return false;

        /* Register file reads are limited, so count unique */

        unsigned unique_new_srcs = 0;

        bi_foreach_src(instr, s) {
                if (bi_tuple_is_new_src(instr, &tuple->reg, s))
                        unique_new_srcs++;
        }

        unsigned total_srcs = tuple->reg.nr_reads + unique_new_srcs;

        bool can_spill_to_moves = (!tuple->add);
        can_spill_to_moves &= (bi_nconstants(clause) < 13 - (clause->tuple_count + 2));
        can_spill_to_moves &= (clause->tuple_count < 7);

        /* However, we can get an extra 1 or 2 sources by inserting moves */
        if (total_srcs > (can_spill_to_moves ? 4 : 3))
                return false;

        /* Count effective reads for the successor */
        unsigned succ_reads = bi_count_succ_reads(instr->dest[0],
                        tuple->add ? tuple->add->dest[0] : bi_null(),
                        tuple->prev_reads, tuple->nr_prev_reads);

        /* Successor must satisfy R+W <= 4, so we require W <= 4-R */
        if ((signed) total_writes > (4 - (signed) succ_reads))
                return false;

        return true;
}

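/* Heuristic cost of scheduling an instruction now; lower costs are preferred
 * when choosing the next instruction for a unit */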
static signed
bi_instr_cost(bi_instr *instr, struct bi_tuple_state *tuple)
{
        signed cost = 0;

        /* Instructions that can schedule to either FMA or to ADD should be
         * deprioritized since they're easier to reschedule elsewhere */
        if (bi_can_fma(instr) && bi_can_add(instr))
                cost++;

        /* Message-passing instructions impose constraints on the registers
         * later in the clause, so schedule them as late within a clause as
         * possible (<==> prioritize them since we're backwards <==> decrease
         * cost) */
        if (bi_must_message(instr))
                cost--;

        /* Last instructions are big constraints (XXX: no effect on shader-db) */
        if (bi_opcode_props[instr->op].last)
                cost -= 2;

        return cost;
}

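/* Selects the lowest-cost schedulable instruction from the worklist for the
 * given unit, returning ~0 if nothing can be scheduled */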
static unsigned
bi_choose_index(struct bi_worklist st,
                struct bi_clause_state *clause,
                struct bi_tuple_state *tuple,
                uint64_t live_after_temp,
                bool fma)
{
        unsigned i, best_idx = ~0;
        signed best_cost = INT_MAX;

        BITSET_FOREACH_SET(i, st.worklist, st.count) {
                bi_instr *instr = st.instructions[i];

                if (!bi_instr_schedulable(instr, clause, tuple, live_after_temp, fma))
                        continue;

                signed cost = bi_instr_cost(instr, tuple);

                /* Tie break in favour of later instructions, under the
                 * assumption this promotes temporary usage (reducing pressure
                 * on the register file). This is a side effect of a prepass
                 * scheduling for pressure. */

                if (cost <= best_cost) {
                        best_idx = i;
                        best_cost = cost;
                }
        }

        return best_idx;
}

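/* Updates clause and tuple state to reflect scheduling an instruction: the
 * FAU/constant assignment, the tracked register accesses, the read/write
 * counts, and the clause's flush-to-zero mode */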
static void
bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
                bi_instr *instr, uint64_t live_after_temp, bool fma)
{
        bi_update_fau(clause, tuple, instr, fma, true);

        /* TODO: maybe opt a bit? or maybe doesn't matter */
        assert(clause->access_count + BI_MAX_SRCS + BI_MAX_DESTS <= ARRAY_SIZE(clause->accesses));
        memcpy(clause->accesses + clause->access_count, instr->src, sizeof(instr->src));
        clause->access_count += BI_MAX_SRCS;
        memcpy(clause->accesses + clause->access_count, instr->dest, sizeof(instr->dest));
        clause->access_count += BI_MAX_DESTS;
        tuple->reg.nr_writes += bi_write_count(instr, live_after_temp);

        bi_foreach_src(instr, s) {
                if (bi_tuple_is_new_src(instr, &tuple->reg, s))
                        tuple->reg.reads[tuple->reg.nr_reads++] = instr->src[s];
        }

        /* This could be optimized to allow pairing integer instructions with
         * special flush-to-zero instructions, but punting on this until we have
         * a workload that cares.
         */
        clause->ftz = bi_needs_ftz(instr) ? BI_FTZ_STATE_ENABLE :
                                            BI_FTZ_STATE_DISABLE;
}

/* Choose the best instruction and pop it off the worklist. Returns NULL if no
 * instruction is available. This function is destructive. */

static bi_instr *
bi_take_instr(bi_context *ctx, struct bi_worklist st,
                struct bi_clause_state *clause,
                struct bi_tuple_state *tuple,
                uint64_t live_after_temp,
                bool fma)
{
        if (tuple->add && tuple->add->op == BI_OPCODE_CUBEFACE)
                return bi_lower_cubeface(ctx, clause, tuple);
        else if (tuple->add && tuple->add->op == BI_OPCODE_ATOM_RETURN_I32)
                return bi_lower_atom_c(ctx, clause, tuple);
        else if (tuple->add && tuple->add->op == BI_OPCODE_ATOM1_RETURN_I32)
                return bi_lower_atom_c1(ctx, clause, tuple);
        else if (tuple->add && tuple->add->op == BI_OPCODE_SEG_ADD_I64)
                return bi_lower_seg_add(ctx, clause, tuple);
        else if (tuple->add && tuple->add->table)
                return bi_lower_dtsel(ctx, clause, tuple);

        /* TODO: Optimize these moves */
        if (!fma && tuple->nr_prev_reads > 3) {
                /* Only spill by one source for now */
                assert(tuple->nr_prev_reads == 4);

                /* Pick a source to spill */
                bi_index src = tuple->prev_reads[0];

                /* Schedule the spill */
                bi_builder b = bi_init_builder(ctx, bi_before_tuple(tuple->prev));
                bi_instr *mov = bi_mov_i32_to(&b, src, src);
                bi_pop_instr(clause, tuple, mov, live_after_temp, fma);
                return mov;
        }

#ifndef NDEBUG
        /* Don't pair instructions if debugging */
        if ((bifrost_debug & BIFROST_DBG_NOSCHED) && tuple->add)
                return NULL;
#endif

        unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma);

        if (idx >= st.count)
                return NULL;

        /* Update state to reflect taking the instruction */
        bi_instr *instr = st.instructions[idx];

        BITSET_CLEAR(st.worklist, idx);
        bi_update_worklist(st, idx);
        bi_pop_instr(clause, tuple, instr, live_after_temp, fma);

        /* Fixups */
        if (instr->op == BI_OPCODE_IADD_U32 && fma) {
                assert(bi_can_iaddc(instr));
                instr->op = BI_OPCODE_IADDC_I32;
                instr->src[2] = bi_zero();
        } else if (fma && bi_can_replace_with_csel(instr)) {
                bi_replace_mux_with_csel(instr, false);
        }

        return instr;
}

/* Variant of bi_rewrite_index_src_single that uses word-equivalence, rewriting
 * to a passthrough register. If except_sr is true, the staging sources are
 * skipped, so staging register reads are not accidentally encoded as
 * passthrough (which is impossible) */

static void
bi_use_passthrough(bi_instr *ins, bi_index old,
                enum bifrost_packed_src new,
                bool except_sr)
{
        /* Optional for convenience */
        if (!ins || bi_is_null(old))
                return;

        bi_foreach_src(ins, i) {
                if ((i == 0 || i == 4) && except_sr)
                        continue;

                if (bi_is_word_equiv(ins->src[i], old)) {
                        ins->src[i].type = BI_INDEX_PASS;
                        ins->src[i].value = new;
                        ins->src[i].reg = false;
                        ins->src[i].offset = 0;
                }
        }
}

/* Rewrites an adjacent pair of tuples _prec_eding and _succ_eding to use
 * intertuple passthroughs where necessary. Passthroughs are allowed as a
 * post-condition of scheduling. Note we rewrite ADD first, FMA second --
 * opposite the order of execution. This is deliberate -- if both FMA and ADD
 * write to the same logical register, the next executed tuple will get the
 * latter result. There's no interference issue under the assumption of correct
 * register allocation. */

static void
bi_rewrite_passthrough(bi_tuple prec, bi_tuple succ)
{
        bool sr_read = succ.add ? bi_opcode_props[succ.add->op].sr_read : false;

        if (prec.add) {
                bi_use_passthrough(succ.fma, prec.add->dest[0], BIFROST_SRC_PASS_ADD, false);
                bi_use_passthrough(succ.add, prec.add->dest[0], BIFROST_SRC_PASS_ADD, sr_read);
        }

        if (prec.fma) {
                bi_use_passthrough(succ.fma, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, false);
                bi_use_passthrough(succ.add, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, sr_read);
        }
}

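/* Rewrites FAU sources to the FAU_LO/FAU_HI passthroughs, with the source
 * offset selecting the high half of the slot */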
static void
bi_rewrite_fau_to_pass(bi_tuple *tuple)
{
        bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
                if (ins->src[s].type != BI_INDEX_FAU) continue;

                bi_index pass = bi_passthrough(ins->src[s].offset ?
                                BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO);

                ins->src[s] = bi_replace_index(ins->src[s], pass);
        }
}

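/* Rewrites #0 sources to the zero passthrough: the staging passthrough for
 * FMA, FAU_LO for ADD */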
static void
bi_rewrite_zero(bi_instr *ins, bool fma)
{
        bi_index zero = bi_passthrough(fma ? BIFROST_SRC_STAGE : BIFROST_SRC_FAU_LO);

        bi_foreach_src(ins, s) {
                bi_index src = ins->src[s];

                if (src.type == BI_INDEX_CONSTANT && src.value == 0)
                        ins->src[s] = bi_replace_index(src, zero);
        }
}

/* Assumes #0 to {T, FAU} rewrite has already occurred */

static void
bi_rewrite_constants_to_pass(bi_tuple *tuple, uint64_t constant, bool pcrel)
{
        bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
                if (ins->src[s].type != BI_INDEX_CONSTANT) continue;

                uint32_t cons = ins->src[s].value;

                ASSERTED bool lo = (cons == (constant & 0xffffffff));
                bool hi = (cons == (constant >> 32ull));

                /* PC offsets always live in the upper half, set to zero by
                 * convention before pack time. (This is safe, since if you
                 * wanted to compare against zero, you would use a BRANCHZ
                 * instruction instead.) */
                if (cons == 0 && ins->branch_target != NULL) {
                        assert(pcrel);
                        hi = true;
                        lo = false;
                } else if (pcrel) {
                        hi = false;
                }

                assert(lo || hi);

                ins->src[s] = bi_replace_index(ins->src[s],
                                bi_passthrough(hi ?  BIFROST_SRC_FAU_HI :
                                        BIFROST_SRC_FAU_LO));
        }
}

/* Constructs a constant state given a tuple state. This has the
 * postcondition that pcrel applies to the first constant by convention,
 * and PC-relative constants will be #0 by convention here, so swap to
 * match if needed */

static struct bi_const_state
bi_get_const_state(struct bi_tuple_state *tuple)
{
        struct bi_const_state consts = {
                .constant_count = tuple->constant_count,
                .constants[0] = tuple->constants[0],
                .constants[1] = tuple->constants[1],
                .pcrel = tuple->add && tuple->add->branch_target,
        };

        /* pcrel applies to the first constant by convention, and
         * PC-relative constants will be #0 by convention here, so swap
         * to match if needed */
        if (consts.pcrel && consts.constants[0]) {
                assert(consts.constant_count == 2);
                assert(consts.constants[1] == 0);

                consts.constants[1] = consts.constants[0];
                consts.constants[0] = 0;
        }

        return consts;
}

/* Merges constants in a clause, satisfying the following rules, assuming no
 * more than one tuple has pcrel:
 *
 * 1. If a tuple has two constants, they must be packed together. If one is
 * pcrel, it must be the high constant to use the M1=4 modification [sx64(E0) +
 * (PC << 32)]. Otherwise choose an arbitrary order.
 *
 * 2. If a tuple has one constant, it may be shared with an existing
 * pair that already contains that constant, or it may be combined with another
 * (distinct) tuple of a single constant.
 *
 * This guarantees a packing is possible. The next routine handles modification
 * related swapping, to satisfy format 12 and the lack of modification for
 * tuple count 5/8 in EC0.
 */

static uint64_t
bi_merge_u32(uint32_t c0, uint32_t c1, bool pcrel)
{
        /* At this point in the constant merge algorithm, pcrel constants are
         * treated as zero, so pcrel implies at least one constant is zero */
        assert(!pcrel || (c0 == 0 || c1 == 0));

        /* Order: pcrel, maximum non-pcrel, minimum non-pcrel */
        uint32_t hi = pcrel ? 0 : MAX2(c0, c1);
        uint32_t lo = (c0 == hi) ? c1 : c0;

        /* Merge in the selected order */
        return lo | (((uint64_t) hi) << 32ull);
}
1468
1469static unsigned
1470bi_merge_pairs(struct bi_const_state *consts, unsigned tuple_count,
1471                uint64_t *merged, unsigned *pcrel_pair)
1472{
1473        unsigned merge_count = 0;
1474
1475        for (unsigned t = 0; t < tuple_count; ++t) {
1476                if (consts[t].constant_count != 2) continue;
1477
1478                unsigned idx = ~0;
1479                uint64_t val = bi_merge_u32(consts[t].constants[0],
1480                                consts[t].constants[1], consts[t].pcrel);
1481
1482                /* Skip the pcrel pair if assigned, because if one is assigned,
1483                 * this one is not pcrel by uniqueness so it's a mismatch */
1484                for (unsigned s = 0; s < merge_count; ++s) {
1485                        if (merged[s] == val && (*pcrel_pair) != s) {
1486                                idx = s;
1487                                break;
1488                        }
1489                }
1490
1491                if (idx == ~0) {
1492                        idx = merge_count++;
1493                        merged[idx] = val;
1494
1495                        if (consts[t].pcrel)
1496                                (*pcrel_pair) = idx;
1497                }
1498
1499                consts[t].word_idx = idx;
1500        }
1501
1502        return merge_count;
1503}
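
/* For example (illustrative only): if two different tuples both carry the
 * pair { 0x10, 0xABCD } and neither is pcrel, they share a single merged
 * word 0x0000ABCD00000010 and the same word_idx; a pcrel pair is never
 * shared this way because of the *pcrel_pair check above. */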
1504
1505static unsigned
1506bi_merge_singles(struct bi_const_state *consts, unsigned tuple_count,
1507                uint64_t *pairs, unsigned pair_count, unsigned *pcrel_pair)
1508{
1509        bool pending = false, pending_pcrel = false;
1510        uint32_t pending_single = 0;
1511
1512        for (unsigned t = 0; t < tuple_count; ++t) {
1513                if (consts[t].constant_count != 1) continue;
1514
1515                uint32_t val = consts[t].constants[0];
1516                unsigned idx = ~0;
1517
1518                /* Try to match, but don't match pcrel with non-pcrel, even
1519                 * though we can merge a pcrel with a non-pcrel single */
1520                for (unsigned i = 0; i < pair_count; ++i) {
1521                        bool lo = ((pairs[i] & 0xffffffff) == val);
1522                        bool hi = ((pairs[i] >> 32) == val);
1523                        bool match = (lo || hi);
1524                        match &= ((*pcrel_pair) != i);
1525                        if (match && !consts[t].pcrel) {
1526                                idx = i;
1527                                break;
1528                        }
1529                }
1530
1531                if (idx == ~0) {
1532                        idx = pair_count;
1533
1534                        if (pending && pending_single != val) {
1535                                assert(!(pending_pcrel && consts[t].pcrel));
1536                                bool pcrel = pending_pcrel || consts[t].pcrel;
1537
1538                                if (pcrel)
1539                                        *pcrel_pair = idx;
1540
1541                                pairs[pair_count++] = bi_merge_u32(pending_single, val, pcrel);
1542
1543                                pending = pending_pcrel = false;
1544                        } else {
1545                                pending = true;
1546                                pending_pcrel = consts[t].pcrel;
1547                                pending_single = val;
1548                        }
1549                }
1550
1551                consts[t].word_idx = idx;
1552        }
1553
1554        /* Shift so it works whether pending_pcrel is set or not */
1555        if (pending) {
1556                if (pending_pcrel)
1557                        *pcrel_pair = pair_count;
1558
1559                pairs[pair_count++] = ((uint64_t) pending_single) << 32ull;
1560        }
1561
1562        return pair_count;
1563}
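
/* For example (illustrative only): two single-constant tuples carrying 0x11
 * and 0x22 (neither pcrel, with no match among the existing pairs) are
 * combined into one new word 0x0000002200000011 and share a word_idx, while
 * a single left pending at the end is emitted alone, shifted into the high
 * half. */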
1564
1565static unsigned
1566bi_merge_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned *pcrel_idx)
1567{
1568        unsigned pair_count = bi_merge_pairs(consts, 8, pairs, pcrel_idx);
1569        return bi_merge_singles(consts, 8, pairs, pair_count, pcrel_idx);
1570}
1571
1572/* Swap two constants at word i and i+1 by swapping their actual positions and
1573 * swapping all references so the meaning of the clause is preserved */
1574
1575static void
1576bi_swap_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned i)
1577{
1578        uint64_t tmp_pair = pairs[i + 0];
1579        pairs[i + 0] = pairs[i + 1];
1580        pairs[i + 1] = tmp_pair;
1581
1582        for (unsigned t = 0; t < 8; ++t) {
1583                if (consts[t].word_idx == i)
1584                        consts[t].word_idx = (i + 1);
1585                else if (consts[t].word_idx == (i + 1))
1586                        consts[t].word_idx = i;
1587        }
1588}
1589
1590/* Given merged constants, one of which might be PC-relative, fix up the M
1591 * values so the PC-relative constant (if it exists) has the M1=4 modification
1592 * and other constants are used as-is (which might require swapping) */
1593
1594static unsigned
1595bi_apply_constant_modifiers(struct bi_const_state *consts,
1596                uint64_t *pairs, unsigned *pcrel_idx,
1597                unsigned tuple_count, unsigned constant_count)
1598{
1599        unsigned start = bi_ec0_packed(tuple_count) ? 1 : 0;
1600
1601        /* Clauses with these tuple counts lack an M field for the packed EC0,
1602         * so EC0 cannot be PC-relative, which might require swapping (and
1603         * possibly adding an unused constant) to fit */
1604
1605        if (*pcrel_idx == 0 && (tuple_count == 5 || tuple_count == 8)) {
1606                constant_count = MAX2(constant_count, 2);
1607                *pcrel_idx = 1;
1608                bi_swap_constants(consts, pairs, 0);
1609        }
1610
1611        /* EC0 might be packed for free; after that, constants are packed in
1612         * pairs (with clause format 12), with M1 values computed from the pair */
1613
1614        for (unsigned i = start; i < constant_count; i += 2) {
1615                bool swap = false;
1616                bool last = (i + 1) == constant_count;
1617
1618                unsigned A1 = (pairs[i] >> 60);
1619                unsigned B1 = (pairs[i + 1] >> 60);
1620
1621                if (*pcrel_idx == i || *pcrel_idx == (i + 1)) {
1622                        /* PC-relative constant must be E0, not E1 */
1623                        swap = (*pcrel_idx == (i + 1));
1624
1625                        /* Set M1 = 4 by noting (A - B) mod 16 = 4 is
1626                         * equivalent to A = (B + 4) mod 16 and that we can
1627                         * control A */
1628                        unsigned B = swap ? A1 : B1;
1629                        unsigned A = (B + 4) & 0xF;
1630                        pairs[*pcrel_idx] |= ((uint64_t) A) << 60;
1631
1632                        /* Swapped if swap set, identity if swap not set */
1633                        *pcrel_idx = i;
1634                } else {
1635                        /* Compute M1 value if we don't swap */
1636                        unsigned M1 = (16 + A1 - B1) & 0xF;
1637
1638                        /* For M1 = 0 or M1 >= 8, the constants are used
1639                         * unchanged. Otherwise 0 < (A1 - B1) % 16 < 8, which
1640                         * implies (B1 - A1) % 16 >= 8, so swapping will let
1641                         * them be used unchanged */
1642                        swap = (M1 != 0) && (M1 < 8);
1643
1644                        /* However, we can't swap the last constant, so we
1645                         * force M1 = 0 instead for this case */
1646                        if (last && swap) {
1647                                pairs[i + 1] |= pairs[i] & (0xfull << 60);
1648                                swap = false;
1649                        }
1650                }
1651
1652                if (swap) {
1653                        assert(!last);
1654                        bi_swap_constants(consts, pairs, i);
1655                }
1656        }
1657
1658        return constant_count;
1659}
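
/* Worked example (illustrative only): for adjacent words with top nibbles
 * A1 = 0x9 and B1 = 0x3, M1 = (16 + 9 - 3) & 0xF = 6 lies strictly between
 * 0 and 8, so the pair is swapped; afterwards (16 + 3 - 9) & 0xF = 10 >= 8
 * and the constants are used unchanged. */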
1660
1661/* Schedule a single clause. If no instructions remain, return NULL. */
1662
1663static bi_clause *
1664bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint64_t *live)
1665{
1666        struct bi_clause_state clause_state = { 0 };
1667        bi_clause *clause = rzalloc(ctx, bi_clause);
1668        bi_tuple *tuple = NULL;
1669
1670        const unsigned max_tuples = ARRAY_SIZE(clause->tuples);
1671
1672        /* TODO: Decide flow control better */
1673        clause->flow_control = BIFROST_FLOW_NBTB;
1674
1675        /* The last clause can only write one instruction, so initialize that */
1676        struct bi_reg_state reg_state = {};
1677        bi_index prev_reads[5] = { bi_null() };
1678        unsigned nr_prev_reads = 0;
1679
1680        /* We need to track future liveness. The main *live set tracks what is
1681         * live at the current point in the program we are scheduling, but to
1682         * determine temp eligibility, we instead want what will be live after
1683         * the next tuple in the program. If you scheduled forwards, you'd need
1684         * a crystal ball for this. Luckily we schedule backwards, so we just
1685         * delay updates to the live_after_temp by an extra tuple. */
1686        uint64_t live_after_temp = *live;
1687        uint64_t live_next_tuple = live_after_temp;
1688
1689        do {
1690                struct bi_tuple_state tuple_state = {
1691                        .last = (clause->tuple_count == 0),
1692                        .reg = reg_state,
1693                        .nr_prev_reads = nr_prev_reads,
1694                        .prev = tuple,
1695                        .pcrel_idx = ~0,
1696                };
1697
1698                assert(nr_prev_reads < ARRAY_SIZE(prev_reads));
1699                memcpy(tuple_state.prev_reads, prev_reads, sizeof(prev_reads));
1700
1701                unsigned idx = max_tuples - clause->tuple_count - 1;
1702
1703                tuple = &clause->tuples[idx];
1704
1705                if (clause->message && bi_opcode_props[clause->message->op].sr_read && !bi_is_null(clause->message->src[0])) {
1706                        unsigned nr = bi_count_read_registers(clause->message, 0);
1707                        live_after_temp |= (BITFIELD64_MASK(nr) << clause->message->src[0].value);
1708                }
1709
1710                /* Since we schedule backwards, we schedule ADD first */
1711                tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, false);
1712                tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, true);
1713                tuple->add = tuple_state.add;
1714
1715                /* Update liveness from the new instructions */
1716                if (tuple->add)
1717                        *live = bi_postra_liveness_ins(*live, tuple->add);
1718
1719                if (tuple->fma)
1720                        *live = bi_postra_liveness_ins(*live, tuple->fma);
1721
1722                /* Rotate in the new per-tuple liveness */
1723                live_after_temp = live_next_tuple;
1724                live_next_tuple = *live;
1725
1726                /* We may have a message, but only one per clause */
1727                if (tuple->add && bi_must_message(tuple->add)) {
1728                        assert(!clause_state.message);
1729                        clause_state.message = true;
1730
1731                        clause->message_type =
1732                                bi_message_type_for_instr(tuple->add);
1733                        clause->message = tuple->add;
1734
1735                        /* We don't need to set dependencies for blend shaders
1736                         * because the BLEND instruction in the fragment
1737                         * shader should have already done the wait */
1738                        if (!ctx->inputs->is_blend) {
1739                                switch (tuple->add->op) {
1740                                case BI_OPCODE_ATEST:
1741                                        clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
1742                                        break;
1743                                case BI_OPCODE_LD_TILE:
1744                                case BI_OPCODE_ST_TILE:
1745                                        clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
1746                                        break;
1747                                case BI_OPCODE_BLEND:
1748                                        clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
1749                                        clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
1750                                        break;
1751                                default:
1752                                        break;
1753                                }
1754                        }
1755                }
1756
1757                clause_state.consts[idx] = bi_get_const_state(&tuple_state);
1758
1759                /* Before merging constants, eliminate zeroes, otherwise the
1760                 * merging will fight over the #0 that never gets read (and is
1761                 * never marked as read by update_fau) */
1762                if (tuple->fma && bi_reads_zero(tuple->fma))
1763                        bi_rewrite_zero(tuple->fma, true);
1764
1765                /* Rewrite away FAU, constant write is deferred */
1766                if (!tuple_state.constant_count) {
1767                        tuple->fau_idx = tuple_state.fau;
1768                        bi_rewrite_fau_to_pass(tuple);
1769                }
1770
1771                /* Use passthrough register for cross-stage accesses. Since
1772                 * there are just FMA and ADD stages, that means we rewrite to
1773                 * passthrough the sources of the ADD that read from the
1774                 * destination of the FMA */
1775
1776                if (tuple->fma) {
1777                        bi_use_passthrough(tuple->add, tuple->fma->dest[0],
1778                                        BIFROST_SRC_STAGE, false);
1779                }
1780
1781                /* Don't add an empty tuple, unless the worklist has nothing
1782                 * but a (pseudo)instruction failing to schedule due to a "not
1783                 * last instruction" constraint */
1784
1785                int some_instruction = __bitset_ffs(st.worklist, BITSET_WORDS(st.count));
1786                bool not_last = (some_instruction > 0) &&
1787                        bi_must_not_last(st.instructions[some_instruction - 1]);
1788
1789                bool insert_empty = tuple_state.last && not_last;
1790
1791                if (!(tuple->fma || tuple->add || insert_empty))
1792                        break;
1793
1794                clause->tuple_count++;
1795
1796                /* Adding enough tuples might overflow constants */
1797                if (!bi_space_for_more_constants(&clause_state))
1798                        break;
1799
1800#ifndef NDEBUG
1801                /* Don't schedule more than 1 tuple if debugging */
1802                if ((bifrost_debug & BIFROST_DBG_NOSCHED) && !insert_empty)
1803                        break;
1804#endif
1805
1806                /* Link through the register state */
1807                STATIC_ASSERT(sizeof(prev_reads) == sizeof(tuple_state.reg.reads));
1808                memcpy(prev_reads, tuple_state.reg.reads, sizeof(prev_reads));
1809                nr_prev_reads = tuple_state.reg.nr_reads;
1810                clause_state.tuple_count++;
1811        } while(clause->tuple_count < 8);
1812
1813        /* Don't schedule an empty clause */
1814        if (!clause->tuple_count)
1815                return NULL;
1816
1817        /* Before merging, rewrite away any tuples that read only zero */
1818        for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
1819                bi_tuple *tuple = &clause->tuples[i];
1820                struct bi_const_state *st = &clause_state.consts[i];
1821
1822                if (st->constant_count == 0 || st->constants[0] || st->constants[1] || st->pcrel)
1823                        continue;
1824
1825                bi_foreach_instr_in_tuple(tuple, ins)
1826                        bi_rewrite_zero(ins, false);
1827
1828                /* Constant has been demoted to FAU, so don't pack it separately */
1829                st->constant_count = 0;
1830
1831                /* Default */
1832                assert(tuple->fau_idx == BIR_FAU_ZERO);
1833        }
1834
1835        uint64_t constant_pairs[8] = { 0 };
1836        unsigned pcrel_idx = ~0;
1837        unsigned constant_words =
1838                bi_merge_constants(clause_state.consts, constant_pairs, &pcrel_idx);
1839
1840        constant_words = bi_apply_constant_modifiers(clause_state.consts,
1841                        constant_pairs, &pcrel_idx, clause->tuple_count,
1842                        constant_words);
1843
1844        clause->pcrel_idx = pcrel_idx;
1845
1846        for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
1847                bi_tuple *tuple = &clause->tuples[i];
1848
1849                /* If no constants, leave FAU as it is, possibly defaulting to 0 */
1850                if (clause_state.consts[i].constant_count == 0)
1851                        continue;
1852
1853                /* FAU is already handled */
1854                assert(!tuple->fau_idx);
1855
1856                unsigned word_idx = clause_state.consts[i].word_idx;
1857                assert(word_idx <= 8);
1858
1859                /* We could try to merge regardless of bottom bits as well, but
1860                 * that's probably diminishing returns */
1861                uint64_t pair = constant_pairs[word_idx];
1862                unsigned lo = pair & 0xF;
1863
1864                tuple->fau_idx = bi_constant_field(word_idx) | lo;
1865                bi_rewrite_constants_to_pass(tuple, pair, word_idx == pcrel_idx);
1866        }
1867
1868        clause->constant_count = constant_words;
1869        memcpy(clause->constants, constant_pairs, sizeof(constant_pairs));
1870
1871        /* Branches must be last, so this can be factored out */
1872        bi_instr *last = clause->tuples[max_tuples - 1].add;
1873        clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP);
1874        clause->block = block;
1875
1876        clause->ftz = (clause_state.ftz == BI_FTZ_STATE_ENABLE);
1877
1878        /* We emit in reverse and emitted to the back of the tuples array, so
1879         * move it up front for easy indexing */
1880        memmove(clause->tuples,
1881                       clause->tuples + (max_tuples - clause->tuple_count),
1882                       clause->tuple_count * sizeof(clause->tuples[0]));
1883
1884        /* Use passthrough register for cross-tuple accesses. Note this is
1885         * after the memmove, so this is forwards. Skip the first tuple since
1886         * there is nothing before it to passthrough */
1887
1888        for (unsigned t = 1; t < clause->tuple_count; ++t)
1889                bi_rewrite_passthrough(clause->tuples[t - 1], clause->tuples[t]);
1890
1891        return clause;
1892}
1893
1894static void
1895bi_schedule_block(bi_context *ctx, bi_block *block)
1896{
1897        list_inithead(&block->clauses);
1898
1899        /* Copy list to dynamic array */
1900        struct bi_worklist st = bi_initialize_worklist(block,
1901                        bifrost_debug & BIFROST_DBG_INORDER,
1902                        ctx->inputs->is_blend);
1903
1904        if (!st.count) {
1905                bi_free_worklist(st);
1906                return;
1907        }
1908
1909        /* We need to track liveness during scheduling in order to determine whether we can use temporary (passthrough) registers */
1910        uint64_t live = block->reg_live_out;
1911
1912        /* Schedule as many clauses as needed to fill the block */
1913        bi_clause *u = NULL;
1914        while((u = bi_schedule_clause(ctx, block, st, &live)))
1915                list_add(&u->link, &block->clauses);
1916
1917        /* Back-to-back bit affects only the last clause of a block,
1918         * the rest are implicitly true */
1919        if (!list_is_empty(&block->clauses)) {
1920                bi_clause *last_clause = list_last_entry(&block->clauses, bi_clause, link);
1921                if (bi_reconverge_branches(block))
1922                        last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL;
1923        }
1924
1925        /* Reorder instructions to match the new schedule. First remove
1926         * existing instructions and then recreate the list */
1927
1928        bi_foreach_instr_in_block_safe(block, ins) {
1929                list_del(&ins->link);
1930        }
1931
1932        bi_foreach_clause_in_block(block, clause) {
1933                for (unsigned i = 0; i < clause->tuple_count; ++i)  {
1934                        bi_foreach_instr_in_tuple(&clause->tuples[i], ins) {
1935                                list_addtail(&ins->link, &block->instructions);
1936                        }
1937                }
1938        }
1939
1940        block->scheduled = true;
1941
1942#ifndef NDEBUG
1943        unsigned i;
1944        bool incomplete = false;
1945
1946        BITSET_FOREACH_SET(i, st.worklist, st.count) {
1947                bi_print_instr(st.instructions[i], stderr);
1948                incomplete = true;
1949        }
1950
1951        if (incomplete)
1952                unreachable("The above instructions failed to schedule.");
1953#endif
1954
1955        bi_free_worklist(st);
1956}
1957
1958static bool
1959bi_check_fau_src(bi_instr *ins, unsigned s, uint32_t *constants, unsigned *cwords, bi_index *fau)
1960{
1961        bi_index src = ins->src[s];
1962
1963        /* Staging registers can't have FAU accesses */
1964        if (bi_is_staging_src(ins, s))
1965                return (src.type != BI_INDEX_CONSTANT) && (src.type != BI_INDEX_FAU);
1966
1967        if (src.type == BI_INDEX_CONSTANT) {
1968                /* Allow fast zero */
1969                if (src.value == 0 && bi_opcode_props[ins->op].fma && bi_reads_zero(ins))
1970                        return true;
1971
1972                if (!bi_is_null(*fau))
1973                        return false;
1974
1975                /* Else, try to inline a constant */
1976                for (unsigned i = 0; i < *cwords; ++i) {
1977                        if (src.value == constants[i])
1978                                return true;
1979                }
1980
1981                if (*cwords >= 2)
1982                        return false;
1983
1984                constants[(*cwords)++] = src.value;
1985        } else if (src.type == BI_INDEX_FAU) {
1986                if (*cwords != 0)
1987                        return false;
1988
1989                /* Can only read from one pair of FAU words */
1990                if (!bi_is_null(*fau) && (src.value != fau->value))
1991                        return false;
1992
1993                /* If there is a target, we'll need a PC-relative constant */
1994                if (ins->branch_target)
1995                        return false;
1996
1997                *fau = src;
1998        }
1999
2000        return true;
2001}
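
/* Hypothetical example (assembly syntax for illustration only): an
 * instruction that reads three distinct constants, such as
 *
 *    FMA.f32 d, #0x3f800000, #0x40000000, #0x40400000
 *
 * can inline at most two of them under the checks above, so bi_lower_fau
 * copies one through a register first:
 *
 *    MOV.i32 t, #0x40400000
 *    FMA.f32 d, #0x3f800000, #0x40000000, t
 */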
2002
2003void
2004bi_lower_fau(bi_context *ctx)
2005{
2006        bi_foreach_instr_global_safe(ctx, ins) {
2007                bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));
2008
2009                uint32_t constants[2];
2010                unsigned cwords = 0;
2011                bi_index fau = bi_null();
2012
2013                /* ATEST must have the ATEST datum encoded, not any other
2014                 * uniform. See to it this is the case. */
2015                if (ins->op == BI_OPCODE_ATEST)
2016                        fau = ins->src[2];
2017
2018                /* Dual texturing requires the texture operation descriptor
2019                 * encoded as an immediate so we can fix it up later.
2020                 */
2021                if (ins->op == BI_OPCODE_TEXC) {
2022                        assert(ins->src[3].type == BI_INDEX_CONSTANT);
2023                        constants[cwords++] = ins->src[3].value;
2024                }
2025
2026                bi_foreach_src(ins, s) {
2027                        if (bi_check_fau_src(ins, s, constants, &cwords, &fau)) continue;
2028
2029                        bi_index copy = bi_mov_i32(&b, ins->src[s]);
2030                        ins->src[s] = bi_replace_index(ins->src[s], copy);
2031                }
2032        }
2033}
2034
2035/* Only v7 allows specifying a dependency on the tilebuffer for the first
2036 * clause of a shader. v6 requires adding a NOP clause with the dependency. */
2037
2038static void
2039bi_add_nop_for_atest(bi_context *ctx)
2040{
2041        /* Only needed on v6 */
2042        if (ctx->arch >= 7)
2043                return;
2044
2045        if (list_is_empty(&ctx->blocks))
2046                return;
2047
2048        /* Fetch the first clause of the shader */
2049        bi_block *block = list_first_entry(&ctx->blocks, bi_block, link);
2050        bi_clause *clause = bi_next_clause(ctx, block, NULL);
2051
2052        if (!clause || !(clause->dependencies & ((1 << BIFROST_SLOT_ELDEST_DEPTH) |
2053                                                 (1 << BIFROST_SLOT_ELDEST_COLOUR))))
2054                return;
2055
2056        /* Add a NOP so we can wait for the dependencies required by the first
2057         * clause */
2058
2059        bi_instr *I = rzalloc(ctx, bi_instr);
2060        I->op = BI_OPCODE_NOP;
2061        I->dest[0] = bi_null();
2062
2063        bi_clause *new_clause = ralloc(ctx, bi_clause);
2064        *new_clause = (bi_clause) {
2065                .flow_control = BIFROST_FLOW_NBTB,
2066                .next_clause_prefetch = true,
2067                .block = clause->block,
2068
2069                .tuple_count = 1,
2070                .tuples[0] = { .fma = I, },
2071        };
2072
2073        list_add(&new_clause->link, &clause->block->clauses);
2074}
2075
2076void
2077bi_schedule(bi_context *ctx)
2078{
2079        /* Fed into both scheduling and DCE */
2080        bi_postra_liveness(ctx);
2081
2082        bi_foreach_block(ctx, block) {
2083                bi_schedule_block(ctx, block);
2084        }
2085
2086        bi_opt_dce_post_ra(ctx);
2087        bi_add_nop_for_atest(ctx);
2088}
2089