1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright (C) 2020 Collabora Ltd.
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
13bf215546Sopenharmony_ci * Software.
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20bf215546Sopenharmony_ci * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21bf215546Sopenharmony_ci * SOFTWARE.
22bf215546Sopenharmony_ci *
23bf215546Sopenharmony_ci * Authors (Collabora):
24bf215546Sopenharmony_ci *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
25bf215546Sopenharmony_ci */
26bf215546Sopenharmony_ci
27bf215546Sopenharmony_ci#include "compiler.h"
28bf215546Sopenharmony_ci#include "nodearray.h"
29bf215546Sopenharmony_ci#include "bi_builder.h"
30bf215546Sopenharmony_ci#include "util/u_memory.h"
31bf215546Sopenharmony_ci
32bf215546Sopenharmony_cistruct lcra_state {
33bf215546Sopenharmony_ci        unsigned node_count;
34bf215546Sopenharmony_ci        uint64_t *affinity;
35bf215546Sopenharmony_ci
36bf215546Sopenharmony_ci        /* Linear constraints imposed. For each node there there is a
37bf215546Sopenharmony_ci         * 'nodearray' structure, which changes between a sparse and dense
38bf215546Sopenharmony_ci         * array depending on the number of elements.
39bf215546Sopenharmony_ci         *
40bf215546Sopenharmony_ci         * Each element is itself a bit field denoting whether (c_j - c_i) bias
41bf215546Sopenharmony_ci         * is present or not, including negative biases.
42bf215546Sopenharmony_ci         *
43bf215546Sopenharmony_ci         * We support up to 8 components so the bias is in range
44bf215546Sopenharmony_ci         * [-7, 7] encoded by a 16-bit field
45bf215546Sopenharmony_ci         */
46bf215546Sopenharmony_ci        nodearray *linear;
47bf215546Sopenharmony_ci
48bf215546Sopenharmony_ci        /* Before solving, forced registers; after solving, solutions. */
49bf215546Sopenharmony_ci        unsigned *solutions;
50bf215546Sopenharmony_ci
51bf215546Sopenharmony_ci        /** Node which caused register allocation to fail */
52bf215546Sopenharmony_ci        unsigned spill_node;
53bf215546Sopenharmony_ci};
54bf215546Sopenharmony_ci
55bf215546Sopenharmony_ci/* This module is an implementation of "Linearly Constrained
56bf215546Sopenharmony_ci * Register Allocation". The paper is available in PDF form
57bf215546Sopenharmony_ci * (https://people.collabora.com/~alyssa/LCRA.pdf) as well as Markdown+LaTeX
58bf215546Sopenharmony_ci * (https://gitlab.freedesktop.org/alyssa/lcra/blob/master/LCRA.md)
59bf215546Sopenharmony_ci */
60bf215546Sopenharmony_ci
61bf215546Sopenharmony_cistatic struct lcra_state *
62bf215546Sopenharmony_cilcra_alloc_equations(unsigned node_count)
63bf215546Sopenharmony_ci{
64bf215546Sopenharmony_ci        struct lcra_state *l = calloc(1, sizeof(*l));
65bf215546Sopenharmony_ci
66bf215546Sopenharmony_ci        l->node_count = node_count;
67bf215546Sopenharmony_ci
68bf215546Sopenharmony_ci        l->linear = calloc(sizeof(l->linear[0]), node_count);
69bf215546Sopenharmony_ci        l->solutions = calloc(sizeof(l->solutions[0]), node_count);
70bf215546Sopenharmony_ci        l->affinity = calloc(sizeof(l->affinity[0]), node_count);
71bf215546Sopenharmony_ci
72bf215546Sopenharmony_ci        memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count);
73bf215546Sopenharmony_ci
74bf215546Sopenharmony_ci        return l;
75bf215546Sopenharmony_ci}
76bf215546Sopenharmony_ci
77bf215546Sopenharmony_cistatic void
78bf215546Sopenharmony_cilcra_free(struct lcra_state *l)
79bf215546Sopenharmony_ci{
80bf215546Sopenharmony_ci        for (unsigned i = 0; i < l->node_count; ++i)
81bf215546Sopenharmony_ci                nodearray_reset(&l->linear[i]);
82bf215546Sopenharmony_ci
83bf215546Sopenharmony_ci        free(l->linear);
84bf215546Sopenharmony_ci        free(l->affinity);
85bf215546Sopenharmony_ci        free(l->solutions);
86bf215546Sopenharmony_ci        free(l);
87bf215546Sopenharmony_ci}
88bf215546Sopenharmony_ci
89bf215546Sopenharmony_cistatic void
90bf215546Sopenharmony_cilcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, unsigned j, unsigned cmask_j)
91bf215546Sopenharmony_ci{
92bf215546Sopenharmony_ci        if (i == j)
93bf215546Sopenharmony_ci                return;
94bf215546Sopenharmony_ci
95bf215546Sopenharmony_ci        nodearray_value constraint_fw = 0;
96bf215546Sopenharmony_ci        nodearray_value constraint_bw = 0;
97bf215546Sopenharmony_ci
98bf215546Sopenharmony_ci        /* The constraint bits are reversed from lcra.c so that register
99bf215546Sopenharmony_ci         * allocation can be done in parallel for every possible solution,
100bf215546Sopenharmony_ci         * with lower-order bits representing smaller registers. */
101bf215546Sopenharmony_ci
102bf215546Sopenharmony_ci        for (unsigned D = 0; D < 8; ++D) {
103bf215546Sopenharmony_ci                if (cmask_i & (cmask_j << D)) {
104bf215546Sopenharmony_ci                        constraint_fw |= (1 << (7 + D));
105bf215546Sopenharmony_ci                        constraint_bw |= (1 << (7 - D));
106bf215546Sopenharmony_ci                }
107bf215546Sopenharmony_ci
108bf215546Sopenharmony_ci                if (cmask_i & (cmask_j >> D)) {
109bf215546Sopenharmony_ci                        constraint_bw |= (1 << (7 + D));
110bf215546Sopenharmony_ci                        constraint_fw |= (1 << (7 - D));
111bf215546Sopenharmony_ci                }
112bf215546Sopenharmony_ci        }
113bf215546Sopenharmony_ci
114bf215546Sopenharmony_ci        /* Use dense arrays after adding 256 elements */
115bf215546Sopenharmony_ci        nodearray_orr(&l->linear[j], i, constraint_fw, 256, l->node_count);
116bf215546Sopenharmony_ci        nodearray_orr(&l->linear[i], j, constraint_bw, 256, l->node_count);
117bf215546Sopenharmony_ci}
118bf215546Sopenharmony_ci
119bf215546Sopenharmony_cistatic bool
120bf215546Sopenharmony_cilcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i)
121bf215546Sopenharmony_ci{
122bf215546Sopenharmony_ci        signed constant = solutions[i];
123bf215546Sopenharmony_ci
124bf215546Sopenharmony_ci        if (nodearray_is_sparse(&l->linear[i])) {
125bf215546Sopenharmony_ci                nodearray_sparse_foreach(&l->linear[i], elem) {
126bf215546Sopenharmony_ci                        unsigned j = nodearray_sparse_key(elem);
127bf215546Sopenharmony_ci                        nodearray_value constraint = nodearray_sparse_value(elem);
128bf215546Sopenharmony_ci
129bf215546Sopenharmony_ci                        if (solutions[j] == ~0) continue;
130bf215546Sopenharmony_ci
131bf215546Sopenharmony_ci                        signed lhs = constant - solutions[j];
132bf215546Sopenharmony_ci
133bf215546Sopenharmony_ci                        if (lhs < -7 || lhs > 7)
134bf215546Sopenharmony_ci                                continue;
135bf215546Sopenharmony_ci
136bf215546Sopenharmony_ci                        if (constraint & (1 << (lhs + 7)))
137bf215546Sopenharmony_ci                                return false;
138bf215546Sopenharmony_ci                }
139bf215546Sopenharmony_ci
140bf215546Sopenharmony_ci                return true;
141bf215546Sopenharmony_ci        }
142bf215546Sopenharmony_ci
143bf215546Sopenharmony_ci        nodearray_value *row = l->linear[i].dense;
144bf215546Sopenharmony_ci
145bf215546Sopenharmony_ci        for (unsigned j = 0; j < l->node_count; ++j) {
146bf215546Sopenharmony_ci                if (solutions[j] == ~0) continue;
147bf215546Sopenharmony_ci
148bf215546Sopenharmony_ci                signed lhs = constant - solutions[j];
149bf215546Sopenharmony_ci
150bf215546Sopenharmony_ci                if (lhs < -7 || lhs > 7)
151bf215546Sopenharmony_ci                        continue;
152bf215546Sopenharmony_ci
153bf215546Sopenharmony_ci                if (row[j] & (1 << (lhs + 7)))
154bf215546Sopenharmony_ci                        return false;
155bf215546Sopenharmony_ci        }
156bf215546Sopenharmony_ci
157bf215546Sopenharmony_ci        return true;
158bf215546Sopenharmony_ci}
159bf215546Sopenharmony_ci
160bf215546Sopenharmony_cistatic bool
161bf215546Sopenharmony_cilcra_solve(struct lcra_state *l)
162bf215546Sopenharmony_ci{
163bf215546Sopenharmony_ci        for (unsigned step = 0; step < l->node_count; ++step) {
164bf215546Sopenharmony_ci                if (l->solutions[step] != ~0) continue;
165bf215546Sopenharmony_ci                if (l->affinity[step] == 0) continue;
166bf215546Sopenharmony_ci
167bf215546Sopenharmony_ci                bool succ = false;
168bf215546Sopenharmony_ci
169bf215546Sopenharmony_ci                u_foreach_bit64(r, l->affinity[step]) {
170bf215546Sopenharmony_ci                        l->solutions[step] = r;
171bf215546Sopenharmony_ci
172bf215546Sopenharmony_ci                        if (lcra_test_linear(l, l->solutions, step)) {
173bf215546Sopenharmony_ci                                succ = true;
174bf215546Sopenharmony_ci                                break;
175bf215546Sopenharmony_ci                        }
176bf215546Sopenharmony_ci                }
177bf215546Sopenharmony_ci
178bf215546Sopenharmony_ci                /* Out of registers - prepare to spill */
179bf215546Sopenharmony_ci                if (!succ) {
180bf215546Sopenharmony_ci                        l->spill_node = step;
181bf215546Sopenharmony_ci                        return false;
182bf215546Sopenharmony_ci                }
183bf215546Sopenharmony_ci        }
184bf215546Sopenharmony_ci
185bf215546Sopenharmony_ci        return true;
186bf215546Sopenharmony_ci}
187bf215546Sopenharmony_ci
188bf215546Sopenharmony_ci/* Register spilling is implemented with a cost-benefit system. Costs are set
189bf215546Sopenharmony_ci * by the user. Benefits are calculated from the constraints. */
190bf215546Sopenharmony_ci
191bf215546Sopenharmony_cistatic unsigned
192bf215546Sopenharmony_cilcra_count_constraints(struct lcra_state *l, unsigned i)
193bf215546Sopenharmony_ci{
194bf215546Sopenharmony_ci        unsigned count = 0;
195bf215546Sopenharmony_ci        nodearray *constraints = &l->linear[i];
196bf215546Sopenharmony_ci
197bf215546Sopenharmony_ci        if (nodearray_is_sparse(constraints)) {
198bf215546Sopenharmony_ci                nodearray_sparse_foreach(constraints, elem)
199bf215546Sopenharmony_ci                        count += util_bitcount(nodearray_sparse_value(elem));
200bf215546Sopenharmony_ci        } else {
201bf215546Sopenharmony_ci                nodearray_dense_foreach_64(constraints, elem)
202bf215546Sopenharmony_ci                        count += util_bitcount64(*elem);
203bf215546Sopenharmony_ci        }
204bf215546Sopenharmony_ci
205bf215546Sopenharmony_ci        return count;
206bf215546Sopenharmony_ci}
207bf215546Sopenharmony_ci
208bf215546Sopenharmony_ci/* Construct an affinity mask such that the vector with `count` elements does
209bf215546Sopenharmony_ci * not intersect any of the registers in the bitset `clobber`. In other words,
210bf215546Sopenharmony_ci * an allocated register r needs to satisfy for each i < count: a + i != b.
211bf215546Sopenharmony_ci * Equivalently that's a != b - i, so we need a \ne { b - i : i < n }. For the
212bf215546Sopenharmony_ci * entire clobber set B, we need a \ne union b \in B { b - i : i < n }, where
213bf215546Sopenharmony_ci * that union is the desired clobber set. That may be written equivalently as
214bf215546Sopenharmony_ci * the union over i < n of (B - i), where subtraction is defined elementwise
215bf215546Sopenharmony_ci * and corresponds to a shift of the entire bitset.
216bf215546Sopenharmony_ci *
217bf215546Sopenharmony_ci * EVEN_BITS_MASK is an affinity mask for aligned register pairs. Interpreted
218bf215546Sopenharmony_ci * as a bit set, it is { x : 0 <= x < 64 if x is even }
219bf215546Sopenharmony_ci */
220bf215546Sopenharmony_ci
221bf215546Sopenharmony_ci#define EVEN_BITS_MASK (0x5555555555555555ull)
222bf215546Sopenharmony_ci
223bf215546Sopenharmony_cistatic uint64_t
224bf215546Sopenharmony_cibi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
225bf215546Sopenharmony_ci{
226bf215546Sopenharmony_ci        uint64_t clobbered = 0;
227bf215546Sopenharmony_ci
228bf215546Sopenharmony_ci        for (unsigned i = 0; i < count; ++i)
229bf215546Sopenharmony_ci                clobbered |= (clobber >> i);
230bf215546Sopenharmony_ci
231bf215546Sopenharmony_ci        /* Don't allocate past the end of the register file */
232bf215546Sopenharmony_ci        if (count > 1) {
233bf215546Sopenharmony_ci                unsigned excess = count - 1;
234bf215546Sopenharmony_ci                uint64_t mask = BITFIELD_MASK(excess);
235bf215546Sopenharmony_ci                clobbered |= mask << (64 - excess);
236bf215546Sopenharmony_ci
237bf215546Sopenharmony_ci                if (split_file)
238bf215546Sopenharmony_ci                        clobbered |= mask << (16 - excess);
239bf215546Sopenharmony_ci        }
240bf215546Sopenharmony_ci
241bf215546Sopenharmony_ci        /* Don't allocate the middle if we split out the middle */
242bf215546Sopenharmony_ci        if (split_file)
243bf215546Sopenharmony_ci                clobbered |= BITFIELD64_MASK(32) << 16;
244bf215546Sopenharmony_ci
245bf215546Sopenharmony_ci        /* We can use a register iff it's not clobberred */
246bf215546Sopenharmony_ci        return ~clobbered;
247bf215546Sopenharmony_ci}
248bf215546Sopenharmony_ci
249bf215546Sopenharmony_cistatic void
250bf215546Sopenharmony_cibi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live, uint64_t preload_live, unsigned node_count, bool is_blend, bool split_file, bool aligned_sr)
251bf215546Sopenharmony_ci{
252bf215546Sopenharmony_ci        bi_foreach_instr_in_block_rev(block, ins) {
253bf215546Sopenharmony_ci                /* Mark all registers live after the instruction as
254bf215546Sopenharmony_ci                 * interfering with the destination */
255bf215546Sopenharmony_ci
256bf215546Sopenharmony_ci                bi_foreach_dest(ins, d) {
257bf215546Sopenharmony_ci                        unsigned node = bi_get_node(ins->dest[d]);
258bf215546Sopenharmony_ci
259bf215546Sopenharmony_ci                        if (node >= node_count)
260bf215546Sopenharmony_ci                                continue;
261bf215546Sopenharmony_ci
262bf215546Sopenharmony_ci                        /* Don't allocate to anything that's read later as a
263bf215546Sopenharmony_ci                         * preloaded register. The affinity is the intersection
264bf215546Sopenharmony_ci                         * of affinity masks for each write. Since writes have
265bf215546Sopenharmony_ci                         * offsets, but the affinity is for the whole node, we
266bf215546Sopenharmony_ci                         * need to offset the affinity opposite the write
267bf215546Sopenharmony_ci                         * offset, so we shift right. */
268bf215546Sopenharmony_ci                        unsigned count = bi_count_write_registers(ins, d);
269bf215546Sopenharmony_ci                        unsigned offset = ins->dest[d].offset;
270bf215546Sopenharmony_ci                        uint64_t affinity = bi_make_affinity(preload_live, count, split_file) >> offset;
271bf215546Sopenharmony_ci                        /* Valhall needs >= 64-bit staging writes to be pair-aligned */
272bf215546Sopenharmony_ci                        if (aligned_sr && (count >= 2 || offset))
273bf215546Sopenharmony_ci                                affinity &= EVEN_BITS_MASK;
274bf215546Sopenharmony_ci
275bf215546Sopenharmony_ci                        l->affinity[node] &= affinity;
276bf215546Sopenharmony_ci
277bf215546Sopenharmony_ci                        for (unsigned i = 0; i < node_count; ++i) {
278bf215546Sopenharmony_ci                                uint8_t r = live[i];
279bf215546Sopenharmony_ci
280bf215546Sopenharmony_ci                                /* Nodes only interfere if they occupy
281bf215546Sopenharmony_ci                                 * /different values/ at the same time
282bf215546Sopenharmony_ci                                 * (Boissinot). In particular, sources of
283bf215546Sopenharmony_ci                                 * moves do not interfere with their
284bf215546Sopenharmony_ci                                 * destinations. This enables a limited form of
285bf215546Sopenharmony_ci                                 * coalescing.
286bf215546Sopenharmony_ci                                 */
287bf215546Sopenharmony_ci                                if (ins->op == BI_OPCODE_MOV_I32 &&
288bf215546Sopenharmony_ci                                    i == bi_get_node(ins->src[0])) {
289bf215546Sopenharmony_ci
290bf215546Sopenharmony_ci                                        r &= ~BITFIELD_BIT(ins->src[0].offset);
291bf215546Sopenharmony_ci                                }
292bf215546Sopenharmony_ci
293bf215546Sopenharmony_ci                                if (r) {
294bf215546Sopenharmony_ci                                        lcra_add_node_interference(l, node,
295bf215546Sopenharmony_ci                                                        bi_writemask(ins, d), i, r);
296bf215546Sopenharmony_ci                                }
297bf215546Sopenharmony_ci                        }
298bf215546Sopenharmony_ci
299bf215546Sopenharmony_ci                        unsigned node_first = bi_get_node(ins->dest[0]);
300bf215546Sopenharmony_ci                        if (d == 1 && node_first < node_count) {
301bf215546Sopenharmony_ci                                lcra_add_node_interference(l, node, bi_writemask(ins, 1),
302bf215546Sopenharmony_ci                                                           node_first, bi_writemask(ins, 0));
303bf215546Sopenharmony_ci                        }
304bf215546Sopenharmony_ci                }
305bf215546Sopenharmony_ci
306bf215546Sopenharmony_ci                /* Valhall needs >= 64-bit reads to be pair-aligned */
307bf215546Sopenharmony_ci                if (aligned_sr) {
308bf215546Sopenharmony_ci                        bi_foreach_src(ins, s) {
309bf215546Sopenharmony_ci                                if (bi_count_read_registers(ins, s) >= 2) {
310bf215546Sopenharmony_ci                                        unsigned node = bi_get_node(ins->src[s]);
311bf215546Sopenharmony_ci
312bf215546Sopenharmony_ci                                        if (node < node_count)
313bf215546Sopenharmony_ci                                                l->affinity[node] &= EVEN_BITS_MASK;
314bf215546Sopenharmony_ci                                }
315bf215546Sopenharmony_ci                        }
316bf215546Sopenharmony_ci                }
317bf215546Sopenharmony_ci
318bf215546Sopenharmony_ci                if (!is_blend && ins->op == BI_OPCODE_BLEND) {
319bf215546Sopenharmony_ci                        /* Blend shaders might clobber r0-r15, r48. */
320bf215546Sopenharmony_ci                        uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48);
321bf215546Sopenharmony_ci
322bf215546Sopenharmony_ci                        for (unsigned i = 0; i < node_count; ++i) {
323bf215546Sopenharmony_ci                                if (live[i])
324bf215546Sopenharmony_ci                                        l->affinity[i] &= ~clobber;
325bf215546Sopenharmony_ci                        }
326bf215546Sopenharmony_ci                }
327bf215546Sopenharmony_ci
328bf215546Sopenharmony_ci                /* Update live_in */
329bf215546Sopenharmony_ci                preload_live = bi_postra_liveness_ins(preload_live, ins);
330bf215546Sopenharmony_ci                bi_liveness_ins_update(live, ins, node_count);
331bf215546Sopenharmony_ci        }
332bf215546Sopenharmony_ci
333bf215546Sopenharmony_ci        block->reg_live_in = preload_live;
334bf215546Sopenharmony_ci}
335bf215546Sopenharmony_ci
336bf215546Sopenharmony_cistatic void
337bf215546Sopenharmony_cibi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs)
338bf215546Sopenharmony_ci{
339bf215546Sopenharmony_ci        unsigned node_count = bi_max_temp(ctx);
340bf215546Sopenharmony_ci
341bf215546Sopenharmony_ci        bi_compute_liveness(ctx);
342bf215546Sopenharmony_ci        bi_postra_liveness(ctx);
343bf215546Sopenharmony_ci
344bf215546Sopenharmony_ci        bi_foreach_block_rev(ctx, blk) {
345bf215546Sopenharmony_ci                uint8_t *live = mem_dup(blk->live_out, node_count);
346bf215546Sopenharmony_ci
347bf215546Sopenharmony_ci                bi_mark_interference(blk, l, live, blk->reg_live_out,
348bf215546Sopenharmony_ci                                node_count, ctx->inputs->is_blend, !full_regs,
349bf215546Sopenharmony_ci                                ctx->arch >= 9);
350bf215546Sopenharmony_ci
351bf215546Sopenharmony_ci                free(live);
352bf215546Sopenharmony_ci        }
353bf215546Sopenharmony_ci}
354bf215546Sopenharmony_ci
355bf215546Sopenharmony_cistatic struct lcra_state *
356bf215546Sopenharmony_cibi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
357bf215546Sopenharmony_ci{
358bf215546Sopenharmony_ci        unsigned node_count = bi_max_temp(ctx);
359bf215546Sopenharmony_ci        struct lcra_state *l = lcra_alloc_equations(node_count);
360bf215546Sopenharmony_ci
361bf215546Sopenharmony_ci        /* Blend shaders are restricted to R0-R15. Other shaders at full
362bf215546Sopenharmony_ci         * occupancy also can access R48-R63. At half occupancy they can access
363bf215546Sopenharmony_ci         * the whole file. */
364bf215546Sopenharmony_ci
365bf215546Sopenharmony_ci        uint64_t default_affinity =
366bf215546Sopenharmony_ci                ctx->inputs->is_blend ? BITFIELD64_MASK(16) :
367bf215546Sopenharmony_ci                full_regs ? BITFIELD64_MASK(64) :
368bf215546Sopenharmony_ci                (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));
369bf215546Sopenharmony_ci
370bf215546Sopenharmony_ci        /* To test spilling, mimic a small register file */
371bf215546Sopenharmony_ci        if (bifrost_debug & BIFROST_DBG_SPILL && !ctx->inputs->is_blend)
372bf215546Sopenharmony_ci                default_affinity &= BITFIELD64_MASK(48) << 8;
373bf215546Sopenharmony_ci
374bf215546Sopenharmony_ci        bi_foreach_instr_global(ctx, ins) {
375bf215546Sopenharmony_ci                bi_foreach_dest(ins, d) {
376bf215546Sopenharmony_ci                        unsigned dest = bi_get_node(ins->dest[d]);
377bf215546Sopenharmony_ci
378bf215546Sopenharmony_ci                        if (dest < node_count)
379bf215546Sopenharmony_ci                                l->affinity[dest] = default_affinity;
380bf215546Sopenharmony_ci                }
381bf215546Sopenharmony_ci
382bf215546Sopenharmony_ci                /* Blend shaders expect the src colour to be in r0-r3 */
383bf215546Sopenharmony_ci                if (ins->op == BI_OPCODE_BLEND &&
384bf215546Sopenharmony_ci                    !ctx->inputs->is_blend) {
385bf215546Sopenharmony_ci                        unsigned node = bi_get_node(ins->src[0]);
386bf215546Sopenharmony_ci                        assert(node < node_count);
387bf215546Sopenharmony_ci                        l->solutions[node] = 0;
388bf215546Sopenharmony_ci
389bf215546Sopenharmony_ci                        /* Dual source blend input in r4-r7 */
390bf215546Sopenharmony_ci                        node = bi_get_node(ins->src[4]);
391bf215546Sopenharmony_ci                        if (node < node_count)
392bf215546Sopenharmony_ci                                l->solutions[node] = 4;
393bf215546Sopenharmony_ci
394bf215546Sopenharmony_ci                        /* Writes to R48 */
395bf215546Sopenharmony_ci                        node = bi_get_node(ins->dest[0]);
396bf215546Sopenharmony_ci                        if (!bi_is_null(ins->dest[0])) {
397bf215546Sopenharmony_ci                                assert(node < node_count);
398bf215546Sopenharmony_ci                                l->solutions[node] = 48;
399bf215546Sopenharmony_ci                        }
400bf215546Sopenharmony_ci                }
401bf215546Sopenharmony_ci
402bf215546Sopenharmony_ci                /* Coverage mask writes stay in R60 */
403bf215546Sopenharmony_ci                if ((ins->op == BI_OPCODE_ATEST ||
404bf215546Sopenharmony_ci                     ins->op == BI_OPCODE_ZS_EMIT) &&
405bf215546Sopenharmony_ci                    !bi_is_null(ins->dest[0])) {
406bf215546Sopenharmony_ci                        unsigned node = bi_get_node(ins->dest[0]);
407bf215546Sopenharmony_ci                        assert(node < node_count);
408bf215546Sopenharmony_ci                        l->solutions[node] = 60;
409bf215546Sopenharmony_ci                }
410bf215546Sopenharmony_ci
411bf215546Sopenharmony_ci                /* Experimentally, it seems coverage masks inputs to ATEST must
412bf215546Sopenharmony_ci                 * be in R60. Otherwise coverage mask writes do not work with
413bf215546Sopenharmony_ci                 * early-ZS with pixel-frequency-shading (this combination of
414bf215546Sopenharmony_ci                 * settings is legal if depth/stencil writes are disabled).
415bf215546Sopenharmony_ci                 */
416bf215546Sopenharmony_ci                if (ins->op == BI_OPCODE_ATEST) {
417bf215546Sopenharmony_ci                        unsigned node = bi_get_node(ins->src[0]);
418bf215546Sopenharmony_ci                        assert(node < node_count);
419bf215546Sopenharmony_ci                        l->solutions[node] = 60;
420bf215546Sopenharmony_ci                }
421bf215546Sopenharmony_ci        }
422bf215546Sopenharmony_ci
423bf215546Sopenharmony_ci        bi_compute_interference(ctx, l, full_regs);
424bf215546Sopenharmony_ci
425bf215546Sopenharmony_ci        /* Coalesce register moves if we're allowed. We need to be careful due
426bf215546Sopenharmony_ci         * to the restricted affinity induced by the blend shader ABI.
427bf215546Sopenharmony_ci         */
428bf215546Sopenharmony_ci        bi_foreach_instr_global(ctx, I) {
429bf215546Sopenharmony_ci                if (I->op != BI_OPCODE_MOV_I32) continue;
430bf215546Sopenharmony_ci                if (I->src[0].type != BI_INDEX_REGISTER) continue;
431bf215546Sopenharmony_ci
432bf215546Sopenharmony_ci                unsigned reg = I->src[0].value;
433bf215546Sopenharmony_ci                unsigned node = bi_get_node(I->dest[0]);
434bf215546Sopenharmony_ci                assert(node < node_count);
435bf215546Sopenharmony_ci
436bf215546Sopenharmony_ci                if (l->solutions[node] != ~0) continue;
437bf215546Sopenharmony_ci
438bf215546Sopenharmony_ci                uint64_t affinity = l->affinity[node];
439bf215546Sopenharmony_ci
440bf215546Sopenharmony_ci                if (ctx->inputs->is_blend) {
441bf215546Sopenharmony_ci                        /* We're allowed to coalesce the moves to these */
442bf215546Sopenharmony_ci                        affinity |= BITFIELD64_BIT(48);
443bf215546Sopenharmony_ci                        affinity |= BITFIELD64_BIT(60);
444bf215546Sopenharmony_ci                }
445bf215546Sopenharmony_ci
446bf215546Sopenharmony_ci                /* Try to coalesce */
447bf215546Sopenharmony_ci                if (affinity & BITFIELD64_BIT(reg)) {
448bf215546Sopenharmony_ci                        l->solutions[node] = reg;
449bf215546Sopenharmony_ci
450bf215546Sopenharmony_ci                        if (!lcra_test_linear(l, l->solutions, node))
451bf215546Sopenharmony_ci                                l->solutions[node] = ~0;
452bf215546Sopenharmony_ci                }
453bf215546Sopenharmony_ci        }
454bf215546Sopenharmony_ci
455bf215546Sopenharmony_ci        *success = lcra_solve(l);
456bf215546Sopenharmony_ci
457bf215546Sopenharmony_ci        return l;
458bf215546Sopenharmony_ci}
459bf215546Sopenharmony_ci
460bf215546Sopenharmony_cistatic bi_index
461bf215546Sopenharmony_cibi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index)
462bf215546Sopenharmony_ci{
463bf215546Sopenharmony_ci        /* Offsets can only be applied when we register allocated an index, or
464bf215546Sopenharmony_ci         * alternatively for FAU's encoding */
465bf215546Sopenharmony_ci
466bf215546Sopenharmony_ci        ASSERTED bool is_offset = (index.offset > 0) &&
467bf215546Sopenharmony_ci                (index.type != BI_INDEX_FAU);
468bf215546Sopenharmony_ci        unsigned node_count = bi_max_temp(ctx);
469bf215546Sopenharmony_ci
470bf215546Sopenharmony_ci        /* Did we run RA for this index at all */
471bf215546Sopenharmony_ci        if (bi_get_node(index) >= node_count) {
472bf215546Sopenharmony_ci                assert(!is_offset);
473bf215546Sopenharmony_ci                return index;
474bf215546Sopenharmony_ci        }
475bf215546Sopenharmony_ci
476bf215546Sopenharmony_ci        /* LCRA didn't bother solving this index (how lazy!) */
477bf215546Sopenharmony_ci        signed solution = l->solutions[bi_get_node(index)];
478bf215546Sopenharmony_ci        if (solution < 0) {
479bf215546Sopenharmony_ci                assert(!is_offset);
480bf215546Sopenharmony_ci                return index;
481bf215546Sopenharmony_ci        }
482bf215546Sopenharmony_ci
483bf215546Sopenharmony_ci        /* todo: do we want to compose with the subword swizzle? */
484bf215546Sopenharmony_ci        bi_index new_index = bi_register(solution + index.offset);
485bf215546Sopenharmony_ci        new_index.swizzle = index.swizzle;
486bf215546Sopenharmony_ci        new_index.abs = index.abs;
487bf215546Sopenharmony_ci        new_index.neg = index.neg;
488bf215546Sopenharmony_ci        return new_index;
489bf215546Sopenharmony_ci}
490bf215546Sopenharmony_ci
491bf215546Sopenharmony_ci/* Dual texture instructions write to two sets of staging registers, modeled as
492bf215546Sopenharmony_ci * two destinations in the IR. The first set is communicated with the usual
493bf215546Sopenharmony_ci * staging register mechanism. The second set is encoded in the texture
494bf215546Sopenharmony_ci * operation descriptor. This is quite unusual, and requires the following late
495bf215546Sopenharmony_ci * fixup.
496bf215546Sopenharmony_ci */
497bf215546Sopenharmony_cistatic void
498bf215546Sopenharmony_cibi_fixup_dual_tex_register(bi_instr *I)
499bf215546Sopenharmony_ci{
500bf215546Sopenharmony_ci        assert(I->dest[1].type == BI_INDEX_REGISTER);
501bf215546Sopenharmony_ci        assert(I->src[3].type == BI_INDEX_CONSTANT);
502bf215546Sopenharmony_ci
503bf215546Sopenharmony_ci        struct bifrost_dual_texture_operation desc = {
504bf215546Sopenharmony_ci                .secondary_register = I->dest[1].value
505bf215546Sopenharmony_ci        };
506bf215546Sopenharmony_ci
507bf215546Sopenharmony_ci        I->src[3].value |= bi_dual_tex_as_u32(desc);
508bf215546Sopenharmony_ci}
509bf215546Sopenharmony_ci
510bf215546Sopenharmony_cistatic void
511bf215546Sopenharmony_cibi_install_registers(bi_context *ctx, struct lcra_state *l)
512bf215546Sopenharmony_ci{
513bf215546Sopenharmony_ci        bi_foreach_instr_global(ctx, ins) {
514bf215546Sopenharmony_ci                bi_foreach_dest(ins, d)
515bf215546Sopenharmony_ci                        ins->dest[d] = bi_reg_from_index(ctx, l, ins->dest[d]);
516bf215546Sopenharmony_ci
517bf215546Sopenharmony_ci                bi_foreach_src(ins, s)
518bf215546Sopenharmony_ci                        ins->src[s] = bi_reg_from_index(ctx, l, ins->src[s]);
519bf215546Sopenharmony_ci
520bf215546Sopenharmony_ci                if (ins->op == BI_OPCODE_TEXC && !bi_is_null(ins->dest[1]))
521bf215546Sopenharmony_ci                        bi_fixup_dual_tex_register(ins);
522bf215546Sopenharmony_ci        }
523bf215546Sopenharmony_ci}
524bf215546Sopenharmony_ci
525bf215546Sopenharmony_cistatic void
526bf215546Sopenharmony_cibi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new)
527bf215546Sopenharmony_ci{
528bf215546Sopenharmony_ci        bi_foreach_src(ins, i) {
529bf215546Sopenharmony_ci                if (bi_is_equiv(ins->src[i], old)) {
530bf215546Sopenharmony_ci                        ins->src[i].type = new.type;
531bf215546Sopenharmony_ci                        ins->src[i].reg = new.reg;
532bf215546Sopenharmony_ci                        ins->src[i].value = new.value;
533bf215546Sopenharmony_ci                }
534bf215546Sopenharmony_ci        }
535bf215546Sopenharmony_ci}
536bf215546Sopenharmony_ci
537bf215546Sopenharmony_ci/* If register allocation fails, find the best spill node */
538bf215546Sopenharmony_ci
539bf215546Sopenharmony_cistatic signed
540bf215546Sopenharmony_cibi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
541bf215546Sopenharmony_ci{
542bf215546Sopenharmony_ci        /* Pick a node satisfying bi_spill_register's preconditions */
543bf215546Sopenharmony_ci        BITSET_WORD *no_spill = calloc(sizeof(BITSET_WORD), BITSET_WORDS(l->node_count));
544bf215546Sopenharmony_ci
545bf215546Sopenharmony_ci        bi_foreach_instr_global(ctx, ins) {
546bf215546Sopenharmony_ci                bi_foreach_dest(ins, d) {
547bf215546Sopenharmony_ci                        unsigned node = bi_get_node(ins->dest[d]);
548bf215546Sopenharmony_ci
549bf215546Sopenharmony_ci                        if (node >= l->node_count)
550bf215546Sopenharmony_ci                                continue;
551bf215546Sopenharmony_ci
552bf215546Sopenharmony_ci                        /* Don't allow spilling coverage mask writes because the
553bf215546Sopenharmony_ci                         * register preload logic assumes it will stay in R60.
554bf215546Sopenharmony_ci                         * This could be optimized.
555bf215546Sopenharmony_ci                         */
556bf215546Sopenharmony_ci                        if (ins->no_spill ||
557bf215546Sopenharmony_ci                            ins->op == BI_OPCODE_ATEST ||
558bf215546Sopenharmony_ci                            ins->op == BI_OPCODE_ZS_EMIT ||
559bf215546Sopenharmony_ci                            (ins->op == BI_OPCODE_MOV_I32 &&
560bf215546Sopenharmony_ci                             ins->src[0].type == BI_INDEX_REGISTER &&
561bf215546Sopenharmony_ci                             ins->src[0].value == 60)) {
562bf215546Sopenharmony_ci                                BITSET_SET(no_spill, node);
563bf215546Sopenharmony_ci                        }
564bf215546Sopenharmony_ci                }
565bf215546Sopenharmony_ci        }
566bf215546Sopenharmony_ci
567bf215546Sopenharmony_ci        unsigned best_benefit = 0.0;
568bf215546Sopenharmony_ci        signed best_node = -1;
569bf215546Sopenharmony_ci
570bf215546Sopenharmony_ci        if (nodearray_is_sparse(&l->linear[l->spill_node])) {
571bf215546Sopenharmony_ci                nodearray_sparse_foreach(&l->linear[l->spill_node], elem) {
572bf215546Sopenharmony_ci                        unsigned i = nodearray_sparse_key(elem);
573bf215546Sopenharmony_ci                        unsigned constraint = nodearray_sparse_value(elem);
574bf215546Sopenharmony_ci
575bf215546Sopenharmony_ci                        /* Only spill nodes that interfere with the node failing
576bf215546Sopenharmony_ci                         * register allocation. It's pointless to spill anything else */
577bf215546Sopenharmony_ci                        if (!constraint) continue;
578bf215546Sopenharmony_ci
579bf215546Sopenharmony_ci                        if (BITSET_TEST(no_spill, i)) continue;
580bf215546Sopenharmony_ci
581bf215546Sopenharmony_ci                        unsigned benefit = lcra_count_constraints(l, i);
582bf215546Sopenharmony_ci
583bf215546Sopenharmony_ci                        if (benefit > best_benefit) {
584bf215546Sopenharmony_ci                                best_benefit = benefit;
585bf215546Sopenharmony_ci                                best_node = i;
586bf215546Sopenharmony_ci                        }
587bf215546Sopenharmony_ci                }
588bf215546Sopenharmony_ci        } else {
589bf215546Sopenharmony_ci                nodearray_value *row = l->linear[l->spill_node].dense;
590bf215546Sopenharmony_ci
591bf215546Sopenharmony_ci                for (unsigned i = 0; i < l->node_count; ++i) {
592bf215546Sopenharmony_ci                        /* Only spill nodes that interfere with the node failing
593bf215546Sopenharmony_ci                         * register allocation. It's pointless to spill anything else */
594bf215546Sopenharmony_ci                        if (!row[i]) continue;
595bf215546Sopenharmony_ci
596bf215546Sopenharmony_ci                        if (BITSET_TEST(no_spill, i)) continue;
597bf215546Sopenharmony_ci
598bf215546Sopenharmony_ci                        unsigned benefit = lcra_count_constraints(l, i);
599bf215546Sopenharmony_ci
600bf215546Sopenharmony_ci                        if (benefit > best_benefit) {
601bf215546Sopenharmony_ci                                best_benefit = benefit;
602bf215546Sopenharmony_ci                                best_node = i;
603bf215546Sopenharmony_ci                        }
604bf215546Sopenharmony_ci                }
605bf215546Sopenharmony_ci        }
606bf215546Sopenharmony_ci
607bf215546Sopenharmony_ci        free(no_spill);
608bf215546Sopenharmony_ci        return best_node;
609bf215546Sopenharmony_ci}
610bf215546Sopenharmony_ci
611bf215546Sopenharmony_cistatic unsigned
612bf215546Sopenharmony_cibi_count_read_index(bi_instr *I, bi_index index)
613bf215546Sopenharmony_ci{
614bf215546Sopenharmony_ci        unsigned max = 0;
615bf215546Sopenharmony_ci
616bf215546Sopenharmony_ci        bi_foreach_src(I, s) {
617bf215546Sopenharmony_ci                if (bi_is_equiv(I->src[s], index)) {
618bf215546Sopenharmony_ci                        unsigned count = bi_count_read_registers(I, s);
619bf215546Sopenharmony_ci                        max = MAX2(max, count + I->src[s].offset);
620bf215546Sopenharmony_ci                }
621bf215546Sopenharmony_ci        }
622bf215546Sopenharmony_ci
623bf215546Sopenharmony_ci        return max;
624bf215546Sopenharmony_ci}
625bf215546Sopenharmony_ci
626bf215546Sopenharmony_ci/*
627bf215546Sopenharmony_ci * Wrappers to emit loads/stores to thread-local storage in an appropriate way
628bf215546Sopenharmony_ci * for the target, so the spill/fill code becomes architecture-independent.
629bf215546Sopenharmony_ci */
630bf215546Sopenharmony_ci
631bf215546Sopenharmony_cistatic bi_index
632bf215546Sopenharmony_cibi_tls_ptr(bool hi)
633bf215546Sopenharmony_ci{
634bf215546Sopenharmony_ci        return bi_fau(BIR_FAU_TLS_PTR, hi);
635bf215546Sopenharmony_ci}
636bf215546Sopenharmony_ci
637bf215546Sopenharmony_cistatic bi_instr *
638bf215546Sopenharmony_cibi_load_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset)
639bf215546Sopenharmony_ci{
640bf215546Sopenharmony_ci        if (b->shader->arch >= 9) {
641bf215546Sopenharmony_ci                return bi_load_to(b, bits, src, bi_tls_ptr(false),
642bf215546Sopenharmony_ci                                  bi_tls_ptr(true), BI_SEG_TL, offset);
643bf215546Sopenharmony_ci        } else {
644bf215546Sopenharmony_ci                return bi_load_to(b, bits, src, bi_imm_u32(offset), bi_zero(),
645bf215546Sopenharmony_ci                                  BI_SEG_TL, 0);
646bf215546Sopenharmony_ci        }
647bf215546Sopenharmony_ci}
648bf215546Sopenharmony_ci
649bf215546Sopenharmony_cistatic void
650bf215546Sopenharmony_cibi_store_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset)
651bf215546Sopenharmony_ci{
652bf215546Sopenharmony_ci        if (b->shader->arch >= 9) {
653bf215546Sopenharmony_ci                bi_store(b, bits, src, bi_tls_ptr(false), bi_tls_ptr(true), BI_SEG_TL, offset);
654bf215546Sopenharmony_ci        } else {
655bf215546Sopenharmony_ci                bi_store(b, bits, src, bi_imm_u32(offset), bi_zero(), BI_SEG_TL, 0);
656bf215546Sopenharmony_ci        }
657bf215546Sopenharmony_ci}
658bf215546Sopenharmony_ci
659bf215546Sopenharmony_ci/* Once we've chosen a spill node, spill it and returns bytes spilled */
660bf215546Sopenharmony_ci
661bf215546Sopenharmony_cistatic unsigned
662bf215546Sopenharmony_cibi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)
663bf215546Sopenharmony_ci{
664bf215546Sopenharmony_ci        bi_builder b = { .shader = ctx };
665bf215546Sopenharmony_ci        unsigned channels = 0;
666bf215546Sopenharmony_ci
667bf215546Sopenharmony_ci        /* Spill after every store, fill before every load */
668bf215546Sopenharmony_ci        bi_foreach_instr_global_safe(ctx, I) {
669bf215546Sopenharmony_ci                bi_foreach_dest(I, d) {
670bf215546Sopenharmony_ci                        if (!bi_is_equiv(I->dest[d], index)) continue;
671bf215546Sopenharmony_ci
672bf215546Sopenharmony_ci                        unsigned extra = I->dest[d].offset;
673bf215546Sopenharmony_ci                        bi_index tmp = bi_temp(ctx);
674bf215546Sopenharmony_ci
675bf215546Sopenharmony_ci                        I->dest[d] = bi_replace_index(I->dest[d], tmp);
676bf215546Sopenharmony_ci                        I->no_spill = true;
677bf215546Sopenharmony_ci
678bf215546Sopenharmony_ci                        unsigned count = bi_count_write_registers(I, d);
679bf215546Sopenharmony_ci                        unsigned bits = count * 32;
680bf215546Sopenharmony_ci
681bf215546Sopenharmony_ci                        b.cursor = bi_after_instr(I);
682bf215546Sopenharmony_ci                        bi_store_tl(&b, bits, tmp, offset + 4 * extra);
683bf215546Sopenharmony_ci
684bf215546Sopenharmony_ci                        ctx->spills++;
685bf215546Sopenharmony_ci                        channels = MAX2(channels, extra + count);
686bf215546Sopenharmony_ci                }
687bf215546Sopenharmony_ci
688bf215546Sopenharmony_ci                if (bi_has_arg(I, index)) {
689bf215546Sopenharmony_ci                        b.cursor = bi_before_instr(I);
690bf215546Sopenharmony_ci                        bi_index tmp = bi_temp(ctx);
691bf215546Sopenharmony_ci
692bf215546Sopenharmony_ci                        unsigned bits = bi_count_read_index(I, index) * 32;
693bf215546Sopenharmony_ci                        bi_rewrite_index_src_single(I, index, tmp);
694bf215546Sopenharmony_ci
695bf215546Sopenharmony_ci                        bi_instr *ld = bi_load_tl(&b, bits, tmp, offset);
696bf215546Sopenharmony_ci                        ld->no_spill = true;
697bf215546Sopenharmony_ci                        ctx->fills++;
698bf215546Sopenharmony_ci                }
699bf215546Sopenharmony_ci        }
700bf215546Sopenharmony_ci
701bf215546Sopenharmony_ci        return (channels * 4);
702bf215546Sopenharmony_ci}
703bf215546Sopenharmony_ci
704bf215546Sopenharmony_ci/*
705bf215546Sopenharmony_ci * For transition, lower collects and splits before RA, rather than after RA.
706bf215546Sopenharmony_ci * LCRA knows how to deal with offsets (broken SSA), but not how to coalesce
707bf215546Sopenharmony_ci * these vector moves.
708bf215546Sopenharmony_ci */
709bf215546Sopenharmony_cistatic void
710bf215546Sopenharmony_cibi_lower_vector(bi_context *ctx)
711bf215546Sopenharmony_ci{
712bf215546Sopenharmony_ci        bi_index *remap = calloc(ctx->ssa_alloc, sizeof(bi_index));
713bf215546Sopenharmony_ci
714bf215546Sopenharmony_ci        bi_foreach_instr_global_safe(ctx, I) {
715bf215546Sopenharmony_ci                bi_builder b = bi_init_builder(ctx, bi_after_instr(I));
716bf215546Sopenharmony_ci
717bf215546Sopenharmony_ci                if (I->op == BI_OPCODE_SPLIT_I32) {
718bf215546Sopenharmony_ci                        bi_index src = I->src[0];
719bf215546Sopenharmony_ci                        assert(src.offset == 0);
720bf215546Sopenharmony_ci
721bf215546Sopenharmony_ci                        for (unsigned i = 0; i < I->nr_dests; ++i) {
722bf215546Sopenharmony_ci                                if (bi_is_null(I->dest[i]))
723bf215546Sopenharmony_ci                                        continue;
724bf215546Sopenharmony_ci
725bf215546Sopenharmony_ci                                src.offset = i;
726bf215546Sopenharmony_ci                                bi_mov_i32_to(&b, I->dest[i], src);
727bf215546Sopenharmony_ci
728bf215546Sopenharmony_ci                                if (bi_is_ssa(I->dest[i]))
729bf215546Sopenharmony_ci                                        remap[I->dest[i].value] = src;
730bf215546Sopenharmony_ci                        }
731bf215546Sopenharmony_ci
732bf215546Sopenharmony_ci                        bi_remove_instruction(I);
733bf215546Sopenharmony_ci                } else if (I->op == BI_OPCODE_COLLECT_I32) {
734bf215546Sopenharmony_ci                        bi_index dest = I->dest[0];
735bf215546Sopenharmony_ci                        assert(dest.offset == 0);
736bf215546Sopenharmony_ci                        assert((bi_is_ssa(dest) || I->nr_srcs == 1) && "nir_lower_phis_to_scalar");
737bf215546Sopenharmony_ci
738bf215546Sopenharmony_ci                        for (unsigned i = 0; i < I->nr_srcs; ++i) {
739bf215546Sopenharmony_ci                                if (bi_is_null(I->src[i]))
740bf215546Sopenharmony_ci                                        continue;
741bf215546Sopenharmony_ci
742bf215546Sopenharmony_ci                                dest.offset = i;
743bf215546Sopenharmony_ci                                bi_mov_i32_to(&b, dest, I->src[i]);
744bf215546Sopenharmony_ci                        }
745bf215546Sopenharmony_ci
746bf215546Sopenharmony_ci                        bi_remove_instruction(I);
747bf215546Sopenharmony_ci                }
748bf215546Sopenharmony_ci        }
749bf215546Sopenharmony_ci
750bf215546Sopenharmony_ci        bi_foreach_instr_global(ctx, I) {
751bf215546Sopenharmony_ci                bi_foreach_src(I, s) {
752bf215546Sopenharmony_ci                        if (bi_is_ssa(I->src[s]) && !bi_is_null(remap[I->src[s].value]))
753bf215546Sopenharmony_ci                                I->src[s] = bi_replace_index(I->src[s], remap[I->src[s].value]);
754bf215546Sopenharmony_ci                }
755bf215546Sopenharmony_ci        }
756bf215546Sopenharmony_ci
757bf215546Sopenharmony_ci        free(remap);
758bf215546Sopenharmony_ci
759bf215546Sopenharmony_ci        /* After generating a pile of moves, clean up */
760bf215546Sopenharmony_ci        bi_opt_dead_code_eliminate(ctx);
761bf215546Sopenharmony_ci}
762bf215546Sopenharmony_ci
763bf215546Sopenharmony_ci/*
764bf215546Sopenharmony_ci * Check if the instruction requires a "tied" operand. Such instructions MUST
765bf215546Sopenharmony_ci * allocate their source and destination to the same register. This is a
766bf215546Sopenharmony_ci * constraint on RA, and may require extra moves.
767bf215546Sopenharmony_ci *
768bf215546Sopenharmony_ci * In particular, this is the case for Bifrost instructions that both read and
769bf215546Sopenharmony_ci * write with the staging register mechanism.
770bf215546Sopenharmony_ci */
771bf215546Sopenharmony_cistatic bool
772bf215546Sopenharmony_cibi_is_tied(const bi_instr *I)
773bf215546Sopenharmony_ci{
774bf215546Sopenharmony_ci        if (bi_is_null(I->src[0]))
775bf215546Sopenharmony_ci                return false;
776bf215546Sopenharmony_ci
777bf215546Sopenharmony_ci        return (I->op == BI_OPCODE_TEXC ||
778bf215546Sopenharmony_ci                I->op == BI_OPCODE_ATOM_RETURN_I32 ||
779bf215546Sopenharmony_ci                I->op == BI_OPCODE_AXCHG_I32 ||
780bf215546Sopenharmony_ci                I->op == BI_OPCODE_ACMPXCHG_I32);
781bf215546Sopenharmony_ci}
782bf215546Sopenharmony_ci
783bf215546Sopenharmony_ci/*
784bf215546Sopenharmony_ci * For transition, coalesce tied operands together, as LCRA knows how to handle
785bf215546Sopenharmony_ci * non-SSA operands but doesn't know about tied operands.
786bf215546Sopenharmony_ci *
787bf215546Sopenharmony_ci * This breaks the SSA form of the program, but that doesn't matter for LCRA.
788bf215546Sopenharmony_ci */
789bf215546Sopenharmony_cistatic void
790bf215546Sopenharmony_cibi_coalesce_tied(bi_context *ctx)
791bf215546Sopenharmony_ci{
792bf215546Sopenharmony_ci        bi_foreach_instr_global(ctx, I) {
793bf215546Sopenharmony_ci                if (!bi_is_tied(I)) continue;
794bf215546Sopenharmony_ci
795bf215546Sopenharmony_ci                bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
796bf215546Sopenharmony_ci                unsigned n = bi_count_read_registers(I, 0);
797bf215546Sopenharmony_ci
798bf215546Sopenharmony_ci                for (unsigned i = 0; i < n; ++i) {
799bf215546Sopenharmony_ci                        bi_index dst = I->dest[0], src = I->src[0];
800bf215546Sopenharmony_ci
801bf215546Sopenharmony_ci                        assert(dst.offset == 0 && src.offset == 0);
802bf215546Sopenharmony_ci                        dst.offset = src.offset = i;
803bf215546Sopenharmony_ci
804bf215546Sopenharmony_ci                        bi_mov_i32_to(&b, dst, src);
805bf215546Sopenharmony_ci                }
806bf215546Sopenharmony_ci
807bf215546Sopenharmony_ci                I->src[0] = bi_replace_index(I->src[0], I->dest[0]);
808bf215546Sopenharmony_ci        }
809bf215546Sopenharmony_ci}
810bf215546Sopenharmony_ci
811bf215546Sopenharmony_cistatic unsigned
812bf215546Sopenharmony_cifind_or_allocate_temp(unsigned *map, unsigned value, unsigned *alloc)
813bf215546Sopenharmony_ci{
814bf215546Sopenharmony_ci        if (!map[value])
815bf215546Sopenharmony_ci                map[value] = ++(*alloc);
816bf215546Sopenharmony_ci
817bf215546Sopenharmony_ci        assert(map[value]);
818bf215546Sopenharmony_ci        return map[value] - 1;
819bf215546Sopenharmony_ci}
820bf215546Sopenharmony_ci
821bf215546Sopenharmony_ci/* Reassigns numbering to get rid of gaps in the indices and to prioritize
822bf215546Sopenharmony_ci * smaller register classes */
823bf215546Sopenharmony_ci
824bf215546Sopenharmony_cistatic void
825bf215546Sopenharmony_cisqueeze_index(bi_context *ctx)
826bf215546Sopenharmony_ci{
827bf215546Sopenharmony_ci        unsigned *map = rzalloc_array(ctx, unsigned, ctx->ssa_alloc);
828bf215546Sopenharmony_ci        ctx->ssa_alloc = 0;
829bf215546Sopenharmony_ci
830bf215546Sopenharmony_ci        bi_foreach_instr_global(ctx, I) {
831bf215546Sopenharmony_ci                bi_foreach_dest(I, d) {
832bf215546Sopenharmony_ci                        if (I->dest[d].type == BI_INDEX_NORMAL)
833bf215546Sopenharmony_ci                                I->dest[d].value = find_or_allocate_temp(map, I->dest[d].value, &ctx->ssa_alloc);
834bf215546Sopenharmony_ci                }
835bf215546Sopenharmony_ci
836bf215546Sopenharmony_ci                bi_foreach_src(I, s) {
837bf215546Sopenharmony_ci                        if (I->src[s].type == BI_INDEX_NORMAL)
838bf215546Sopenharmony_ci                                I->src[s].value = find_or_allocate_temp(map, I->src[s].value, &ctx->ssa_alloc);
839bf215546Sopenharmony_ci                }
840bf215546Sopenharmony_ci        }
841bf215546Sopenharmony_ci
842bf215546Sopenharmony_ci        ralloc_free(map);
843bf215546Sopenharmony_ci}
844bf215546Sopenharmony_ci
845bf215546Sopenharmony_civoid
846bf215546Sopenharmony_cibi_register_allocate(bi_context *ctx)
847bf215546Sopenharmony_ci{
848bf215546Sopenharmony_ci        struct lcra_state *l = NULL;
849bf215546Sopenharmony_ci        bool success = false;
850bf215546Sopenharmony_ci
851bf215546Sopenharmony_ci        unsigned iter_count = 1000; /* max iterations */
852bf215546Sopenharmony_ci
853bf215546Sopenharmony_ci        /* Number of bytes of memory we've spilled into */
854bf215546Sopenharmony_ci        unsigned spill_count = ctx->info.tls_size;
855bf215546Sopenharmony_ci
856bf215546Sopenharmony_ci        if (ctx->arch >= 9)
857bf215546Sopenharmony_ci                va_lower_split_64bit(ctx);
858bf215546Sopenharmony_ci
859bf215546Sopenharmony_ci        bi_lower_vector(ctx);
860bf215546Sopenharmony_ci
861bf215546Sopenharmony_ci        /* Lower tied operands. SSA is broken from here on. */
862bf215546Sopenharmony_ci        bi_coalesce_tied(ctx);
863bf215546Sopenharmony_ci        squeeze_index(ctx);
864bf215546Sopenharmony_ci
865bf215546Sopenharmony_ci        /* Try with reduced register pressure to improve thread count */
866bf215546Sopenharmony_ci        if (ctx->arch >= 7) {
867bf215546Sopenharmony_ci                l = bi_allocate_registers(ctx, &success, false);
868bf215546Sopenharmony_ci
869bf215546Sopenharmony_ci                if (success) {
870bf215546Sopenharmony_ci                        ctx->info.work_reg_count = 32;
871bf215546Sopenharmony_ci                } else {
872bf215546Sopenharmony_ci                        lcra_free(l);
873bf215546Sopenharmony_ci                        l = NULL;
874bf215546Sopenharmony_ci                }
875bf215546Sopenharmony_ci        }
876bf215546Sopenharmony_ci
877bf215546Sopenharmony_ci        /* Otherwise, use the register file and spill until we succeed */
878bf215546Sopenharmony_ci        while (!success && ((iter_count--) > 0)) {
879bf215546Sopenharmony_ci                l = bi_allocate_registers(ctx, &success, true);
880bf215546Sopenharmony_ci
881bf215546Sopenharmony_ci                if (success) {
882bf215546Sopenharmony_ci                        ctx->info.work_reg_count = 64;
883bf215546Sopenharmony_ci                } else {
884bf215546Sopenharmony_ci                        signed spill_node = bi_choose_spill_node(ctx, l);
885bf215546Sopenharmony_ci                        lcra_free(l);
886bf215546Sopenharmony_ci                        l = NULL;
887bf215546Sopenharmony_ci
888bf215546Sopenharmony_ci                        if (spill_node == -1)
889bf215546Sopenharmony_ci                                unreachable("Failed to choose spill node\n");
890bf215546Sopenharmony_ci
891bf215546Sopenharmony_ci                        if (ctx->inputs->is_blend)
892bf215546Sopenharmony_ci                                unreachable("Blend shaders may not spill");
893bf215546Sopenharmony_ci
894bf215546Sopenharmony_ci                        /* By default, we use packed TLS addressing on Valhall.
895bf215546Sopenharmony_ci                         * We cannot cross 16 byte boundaries with packed TLS
896bf215546Sopenharmony_ci                         * addressing. Align to ensure this doesn't happen. This
897bf215546Sopenharmony_ci                         * could be optimized a bit.
898bf215546Sopenharmony_ci                         */
899bf215546Sopenharmony_ci                        if (ctx->arch >= 9)
900bf215546Sopenharmony_ci                                spill_count = ALIGN_POT(spill_count, 16);
901bf215546Sopenharmony_ci
902bf215546Sopenharmony_ci                        spill_count += bi_spill_register(ctx,
903bf215546Sopenharmony_ci                                        bi_node_to_index(spill_node, bi_max_temp(ctx)),
904bf215546Sopenharmony_ci                                        spill_count);
905bf215546Sopenharmony_ci
906bf215546Sopenharmony_ci                        /* In case the spill affected an instruction with tied
907bf215546Sopenharmony_ci                         * operands, we need to fix up.
908bf215546Sopenharmony_ci                         */
909bf215546Sopenharmony_ci                        bi_coalesce_tied(ctx);
910bf215546Sopenharmony_ci                }
911bf215546Sopenharmony_ci        }
912bf215546Sopenharmony_ci
913bf215546Sopenharmony_ci        assert(success);
914bf215546Sopenharmony_ci        assert(l != NULL);
915bf215546Sopenharmony_ci
916bf215546Sopenharmony_ci        ctx->info.tls_size = spill_count;
917bf215546Sopenharmony_ci        bi_install_registers(ctx, l);
918bf215546Sopenharmony_ci
919bf215546Sopenharmony_ci        lcra_free(l);
920bf215546Sopenharmony_ci}
921