1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright (C) 2020 Collabora Ltd. 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20bf215546Sopenharmony_ci * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21bf215546Sopenharmony_ci * SOFTWARE. 22bf215546Sopenharmony_ci * 23bf215546Sopenharmony_ci * Authors (Collabora): 24bf215546Sopenharmony_ci * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> 25bf215546Sopenharmony_ci */ 26bf215546Sopenharmony_ci 27bf215546Sopenharmony_ci#include "compiler.h" 28bf215546Sopenharmony_ci#include "nodearray.h" 29bf215546Sopenharmony_ci#include "bi_builder.h" 30bf215546Sopenharmony_ci#include "util/u_memory.h" 31bf215546Sopenharmony_ci 32bf215546Sopenharmony_cistruct lcra_state { 33bf215546Sopenharmony_ci unsigned node_count; 34bf215546Sopenharmony_ci uint64_t *affinity; 35bf215546Sopenharmony_ci 36bf215546Sopenharmony_ci /* Linear constraints imposed. For each node there there is a 37bf215546Sopenharmony_ci * 'nodearray' structure, which changes between a sparse and dense 38bf215546Sopenharmony_ci * array depending on the number of elements. 39bf215546Sopenharmony_ci * 40bf215546Sopenharmony_ci * Each element is itself a bit field denoting whether (c_j - c_i) bias 41bf215546Sopenharmony_ci * is present or not, including negative biases. 42bf215546Sopenharmony_ci * 43bf215546Sopenharmony_ci * We support up to 8 components so the bias is in range 44bf215546Sopenharmony_ci * [-7, 7] encoded by a 16-bit field 45bf215546Sopenharmony_ci */ 46bf215546Sopenharmony_ci nodearray *linear; 47bf215546Sopenharmony_ci 48bf215546Sopenharmony_ci /* Before solving, forced registers; after solving, solutions. */ 49bf215546Sopenharmony_ci unsigned *solutions; 50bf215546Sopenharmony_ci 51bf215546Sopenharmony_ci /** Node which caused register allocation to fail */ 52bf215546Sopenharmony_ci unsigned spill_node; 53bf215546Sopenharmony_ci}; 54bf215546Sopenharmony_ci 55bf215546Sopenharmony_ci/* This module is an implementation of "Linearly Constrained 56bf215546Sopenharmony_ci * Register Allocation". The paper is available in PDF form 57bf215546Sopenharmony_ci * (https://people.collabora.com/~alyssa/LCRA.pdf) as well as Markdown+LaTeX 58bf215546Sopenharmony_ci * (https://gitlab.freedesktop.org/alyssa/lcra/blob/master/LCRA.md) 59bf215546Sopenharmony_ci */ 60bf215546Sopenharmony_ci 61bf215546Sopenharmony_cistatic struct lcra_state * 62bf215546Sopenharmony_cilcra_alloc_equations(unsigned node_count) 63bf215546Sopenharmony_ci{ 64bf215546Sopenharmony_ci struct lcra_state *l = calloc(1, sizeof(*l)); 65bf215546Sopenharmony_ci 66bf215546Sopenharmony_ci l->node_count = node_count; 67bf215546Sopenharmony_ci 68bf215546Sopenharmony_ci l->linear = calloc(sizeof(l->linear[0]), node_count); 69bf215546Sopenharmony_ci l->solutions = calloc(sizeof(l->solutions[0]), node_count); 70bf215546Sopenharmony_ci l->affinity = calloc(sizeof(l->affinity[0]), node_count); 71bf215546Sopenharmony_ci 72bf215546Sopenharmony_ci memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count); 73bf215546Sopenharmony_ci 74bf215546Sopenharmony_ci return l; 75bf215546Sopenharmony_ci} 76bf215546Sopenharmony_ci 77bf215546Sopenharmony_cistatic void 78bf215546Sopenharmony_cilcra_free(struct lcra_state *l) 79bf215546Sopenharmony_ci{ 80bf215546Sopenharmony_ci for (unsigned i = 0; i < l->node_count; ++i) 81bf215546Sopenharmony_ci nodearray_reset(&l->linear[i]); 82bf215546Sopenharmony_ci 83bf215546Sopenharmony_ci free(l->linear); 84bf215546Sopenharmony_ci free(l->affinity); 85bf215546Sopenharmony_ci free(l->solutions); 86bf215546Sopenharmony_ci free(l); 87bf215546Sopenharmony_ci} 88bf215546Sopenharmony_ci 89bf215546Sopenharmony_cistatic void 90bf215546Sopenharmony_cilcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, unsigned j, unsigned cmask_j) 91bf215546Sopenharmony_ci{ 92bf215546Sopenharmony_ci if (i == j) 93bf215546Sopenharmony_ci return; 94bf215546Sopenharmony_ci 95bf215546Sopenharmony_ci nodearray_value constraint_fw = 0; 96bf215546Sopenharmony_ci nodearray_value constraint_bw = 0; 97bf215546Sopenharmony_ci 98bf215546Sopenharmony_ci /* The constraint bits are reversed from lcra.c so that register 99bf215546Sopenharmony_ci * allocation can be done in parallel for every possible solution, 100bf215546Sopenharmony_ci * with lower-order bits representing smaller registers. */ 101bf215546Sopenharmony_ci 102bf215546Sopenharmony_ci for (unsigned D = 0; D < 8; ++D) { 103bf215546Sopenharmony_ci if (cmask_i & (cmask_j << D)) { 104bf215546Sopenharmony_ci constraint_fw |= (1 << (7 + D)); 105bf215546Sopenharmony_ci constraint_bw |= (1 << (7 - D)); 106bf215546Sopenharmony_ci } 107bf215546Sopenharmony_ci 108bf215546Sopenharmony_ci if (cmask_i & (cmask_j >> D)) { 109bf215546Sopenharmony_ci constraint_bw |= (1 << (7 + D)); 110bf215546Sopenharmony_ci constraint_fw |= (1 << (7 - D)); 111bf215546Sopenharmony_ci } 112bf215546Sopenharmony_ci } 113bf215546Sopenharmony_ci 114bf215546Sopenharmony_ci /* Use dense arrays after adding 256 elements */ 115bf215546Sopenharmony_ci nodearray_orr(&l->linear[j], i, constraint_fw, 256, l->node_count); 116bf215546Sopenharmony_ci nodearray_orr(&l->linear[i], j, constraint_bw, 256, l->node_count); 117bf215546Sopenharmony_ci} 118bf215546Sopenharmony_ci 119bf215546Sopenharmony_cistatic bool 120bf215546Sopenharmony_cilcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i) 121bf215546Sopenharmony_ci{ 122bf215546Sopenharmony_ci signed constant = solutions[i]; 123bf215546Sopenharmony_ci 124bf215546Sopenharmony_ci if (nodearray_is_sparse(&l->linear[i])) { 125bf215546Sopenharmony_ci nodearray_sparse_foreach(&l->linear[i], elem) { 126bf215546Sopenharmony_ci unsigned j = nodearray_sparse_key(elem); 127bf215546Sopenharmony_ci nodearray_value constraint = nodearray_sparse_value(elem); 128bf215546Sopenharmony_ci 129bf215546Sopenharmony_ci if (solutions[j] == ~0) continue; 130bf215546Sopenharmony_ci 131bf215546Sopenharmony_ci signed lhs = constant - solutions[j]; 132bf215546Sopenharmony_ci 133bf215546Sopenharmony_ci if (lhs < -7 || lhs > 7) 134bf215546Sopenharmony_ci continue; 135bf215546Sopenharmony_ci 136bf215546Sopenharmony_ci if (constraint & (1 << (lhs + 7))) 137bf215546Sopenharmony_ci return false; 138bf215546Sopenharmony_ci } 139bf215546Sopenharmony_ci 140bf215546Sopenharmony_ci return true; 141bf215546Sopenharmony_ci } 142bf215546Sopenharmony_ci 143bf215546Sopenharmony_ci nodearray_value *row = l->linear[i].dense; 144bf215546Sopenharmony_ci 145bf215546Sopenharmony_ci for (unsigned j = 0; j < l->node_count; ++j) { 146bf215546Sopenharmony_ci if (solutions[j] == ~0) continue; 147bf215546Sopenharmony_ci 148bf215546Sopenharmony_ci signed lhs = constant - solutions[j]; 149bf215546Sopenharmony_ci 150bf215546Sopenharmony_ci if (lhs < -7 || lhs > 7) 151bf215546Sopenharmony_ci continue; 152bf215546Sopenharmony_ci 153bf215546Sopenharmony_ci if (row[j] & (1 << (lhs + 7))) 154bf215546Sopenharmony_ci return false; 155bf215546Sopenharmony_ci } 156bf215546Sopenharmony_ci 157bf215546Sopenharmony_ci return true; 158bf215546Sopenharmony_ci} 159bf215546Sopenharmony_ci 160bf215546Sopenharmony_cistatic bool 161bf215546Sopenharmony_cilcra_solve(struct lcra_state *l) 162bf215546Sopenharmony_ci{ 163bf215546Sopenharmony_ci for (unsigned step = 0; step < l->node_count; ++step) { 164bf215546Sopenharmony_ci if (l->solutions[step] != ~0) continue; 165bf215546Sopenharmony_ci if (l->affinity[step] == 0) continue; 166bf215546Sopenharmony_ci 167bf215546Sopenharmony_ci bool succ = false; 168bf215546Sopenharmony_ci 169bf215546Sopenharmony_ci u_foreach_bit64(r, l->affinity[step]) { 170bf215546Sopenharmony_ci l->solutions[step] = r; 171bf215546Sopenharmony_ci 172bf215546Sopenharmony_ci if (lcra_test_linear(l, l->solutions, step)) { 173bf215546Sopenharmony_ci succ = true; 174bf215546Sopenharmony_ci break; 175bf215546Sopenharmony_ci } 176bf215546Sopenharmony_ci } 177bf215546Sopenharmony_ci 178bf215546Sopenharmony_ci /* Out of registers - prepare to spill */ 179bf215546Sopenharmony_ci if (!succ) { 180bf215546Sopenharmony_ci l->spill_node = step; 181bf215546Sopenharmony_ci return false; 182bf215546Sopenharmony_ci } 183bf215546Sopenharmony_ci } 184bf215546Sopenharmony_ci 185bf215546Sopenharmony_ci return true; 186bf215546Sopenharmony_ci} 187bf215546Sopenharmony_ci 188bf215546Sopenharmony_ci/* Register spilling is implemented with a cost-benefit system. Costs are set 189bf215546Sopenharmony_ci * by the user. Benefits are calculated from the constraints. */ 190bf215546Sopenharmony_ci 191bf215546Sopenharmony_cistatic unsigned 192bf215546Sopenharmony_cilcra_count_constraints(struct lcra_state *l, unsigned i) 193bf215546Sopenharmony_ci{ 194bf215546Sopenharmony_ci unsigned count = 0; 195bf215546Sopenharmony_ci nodearray *constraints = &l->linear[i]; 196bf215546Sopenharmony_ci 197bf215546Sopenharmony_ci if (nodearray_is_sparse(constraints)) { 198bf215546Sopenharmony_ci nodearray_sparse_foreach(constraints, elem) 199bf215546Sopenharmony_ci count += util_bitcount(nodearray_sparse_value(elem)); 200bf215546Sopenharmony_ci } else { 201bf215546Sopenharmony_ci nodearray_dense_foreach_64(constraints, elem) 202bf215546Sopenharmony_ci count += util_bitcount64(*elem); 203bf215546Sopenharmony_ci } 204bf215546Sopenharmony_ci 205bf215546Sopenharmony_ci return count; 206bf215546Sopenharmony_ci} 207bf215546Sopenharmony_ci 208bf215546Sopenharmony_ci/* Construct an affinity mask such that the vector with `count` elements does 209bf215546Sopenharmony_ci * not intersect any of the registers in the bitset `clobber`. In other words, 210bf215546Sopenharmony_ci * an allocated register r needs to satisfy for each i < count: a + i != b. 211bf215546Sopenharmony_ci * Equivalently that's a != b - i, so we need a \ne { b - i : i < n }. For the 212bf215546Sopenharmony_ci * entire clobber set B, we need a \ne union b \in B { b - i : i < n }, where 213bf215546Sopenharmony_ci * that union is the desired clobber set. That may be written equivalently as 214bf215546Sopenharmony_ci * the union over i < n of (B - i), where subtraction is defined elementwise 215bf215546Sopenharmony_ci * and corresponds to a shift of the entire bitset. 216bf215546Sopenharmony_ci * 217bf215546Sopenharmony_ci * EVEN_BITS_MASK is an affinity mask for aligned register pairs. Interpreted 218bf215546Sopenharmony_ci * as a bit set, it is { x : 0 <= x < 64 if x is even } 219bf215546Sopenharmony_ci */ 220bf215546Sopenharmony_ci 221bf215546Sopenharmony_ci#define EVEN_BITS_MASK (0x5555555555555555ull) 222bf215546Sopenharmony_ci 223bf215546Sopenharmony_cistatic uint64_t 224bf215546Sopenharmony_cibi_make_affinity(uint64_t clobber, unsigned count, bool split_file) 225bf215546Sopenharmony_ci{ 226bf215546Sopenharmony_ci uint64_t clobbered = 0; 227bf215546Sopenharmony_ci 228bf215546Sopenharmony_ci for (unsigned i = 0; i < count; ++i) 229bf215546Sopenharmony_ci clobbered |= (clobber >> i); 230bf215546Sopenharmony_ci 231bf215546Sopenharmony_ci /* Don't allocate past the end of the register file */ 232bf215546Sopenharmony_ci if (count > 1) { 233bf215546Sopenharmony_ci unsigned excess = count - 1; 234bf215546Sopenharmony_ci uint64_t mask = BITFIELD_MASK(excess); 235bf215546Sopenharmony_ci clobbered |= mask << (64 - excess); 236bf215546Sopenharmony_ci 237bf215546Sopenharmony_ci if (split_file) 238bf215546Sopenharmony_ci clobbered |= mask << (16 - excess); 239bf215546Sopenharmony_ci } 240bf215546Sopenharmony_ci 241bf215546Sopenharmony_ci /* Don't allocate the middle if we split out the middle */ 242bf215546Sopenharmony_ci if (split_file) 243bf215546Sopenharmony_ci clobbered |= BITFIELD64_MASK(32) << 16; 244bf215546Sopenharmony_ci 245bf215546Sopenharmony_ci /* We can use a register iff it's not clobberred */ 246bf215546Sopenharmony_ci return ~clobbered; 247bf215546Sopenharmony_ci} 248bf215546Sopenharmony_ci 249bf215546Sopenharmony_cistatic void 250bf215546Sopenharmony_cibi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live, uint64_t preload_live, unsigned node_count, bool is_blend, bool split_file, bool aligned_sr) 251bf215546Sopenharmony_ci{ 252bf215546Sopenharmony_ci bi_foreach_instr_in_block_rev(block, ins) { 253bf215546Sopenharmony_ci /* Mark all registers live after the instruction as 254bf215546Sopenharmony_ci * interfering with the destination */ 255bf215546Sopenharmony_ci 256bf215546Sopenharmony_ci bi_foreach_dest(ins, d) { 257bf215546Sopenharmony_ci unsigned node = bi_get_node(ins->dest[d]); 258bf215546Sopenharmony_ci 259bf215546Sopenharmony_ci if (node >= node_count) 260bf215546Sopenharmony_ci continue; 261bf215546Sopenharmony_ci 262bf215546Sopenharmony_ci /* Don't allocate to anything that's read later as a 263bf215546Sopenharmony_ci * preloaded register. The affinity is the intersection 264bf215546Sopenharmony_ci * of affinity masks for each write. Since writes have 265bf215546Sopenharmony_ci * offsets, but the affinity is for the whole node, we 266bf215546Sopenharmony_ci * need to offset the affinity opposite the write 267bf215546Sopenharmony_ci * offset, so we shift right. */ 268bf215546Sopenharmony_ci unsigned count = bi_count_write_registers(ins, d); 269bf215546Sopenharmony_ci unsigned offset = ins->dest[d].offset; 270bf215546Sopenharmony_ci uint64_t affinity = bi_make_affinity(preload_live, count, split_file) >> offset; 271bf215546Sopenharmony_ci /* Valhall needs >= 64-bit staging writes to be pair-aligned */ 272bf215546Sopenharmony_ci if (aligned_sr && (count >= 2 || offset)) 273bf215546Sopenharmony_ci affinity &= EVEN_BITS_MASK; 274bf215546Sopenharmony_ci 275bf215546Sopenharmony_ci l->affinity[node] &= affinity; 276bf215546Sopenharmony_ci 277bf215546Sopenharmony_ci for (unsigned i = 0; i < node_count; ++i) { 278bf215546Sopenharmony_ci uint8_t r = live[i]; 279bf215546Sopenharmony_ci 280bf215546Sopenharmony_ci /* Nodes only interfere if they occupy 281bf215546Sopenharmony_ci * /different values/ at the same time 282bf215546Sopenharmony_ci * (Boissinot). In particular, sources of 283bf215546Sopenharmony_ci * moves do not interfere with their 284bf215546Sopenharmony_ci * destinations. This enables a limited form of 285bf215546Sopenharmony_ci * coalescing. 286bf215546Sopenharmony_ci */ 287bf215546Sopenharmony_ci if (ins->op == BI_OPCODE_MOV_I32 && 288bf215546Sopenharmony_ci i == bi_get_node(ins->src[0])) { 289bf215546Sopenharmony_ci 290bf215546Sopenharmony_ci r &= ~BITFIELD_BIT(ins->src[0].offset); 291bf215546Sopenharmony_ci } 292bf215546Sopenharmony_ci 293bf215546Sopenharmony_ci if (r) { 294bf215546Sopenharmony_ci lcra_add_node_interference(l, node, 295bf215546Sopenharmony_ci bi_writemask(ins, d), i, r); 296bf215546Sopenharmony_ci } 297bf215546Sopenharmony_ci } 298bf215546Sopenharmony_ci 299bf215546Sopenharmony_ci unsigned node_first = bi_get_node(ins->dest[0]); 300bf215546Sopenharmony_ci if (d == 1 && node_first < node_count) { 301bf215546Sopenharmony_ci lcra_add_node_interference(l, node, bi_writemask(ins, 1), 302bf215546Sopenharmony_ci node_first, bi_writemask(ins, 0)); 303bf215546Sopenharmony_ci } 304bf215546Sopenharmony_ci } 305bf215546Sopenharmony_ci 306bf215546Sopenharmony_ci /* Valhall needs >= 64-bit reads to be pair-aligned */ 307bf215546Sopenharmony_ci if (aligned_sr) { 308bf215546Sopenharmony_ci bi_foreach_src(ins, s) { 309bf215546Sopenharmony_ci if (bi_count_read_registers(ins, s) >= 2) { 310bf215546Sopenharmony_ci unsigned node = bi_get_node(ins->src[s]); 311bf215546Sopenharmony_ci 312bf215546Sopenharmony_ci if (node < node_count) 313bf215546Sopenharmony_ci l->affinity[node] &= EVEN_BITS_MASK; 314bf215546Sopenharmony_ci } 315bf215546Sopenharmony_ci } 316bf215546Sopenharmony_ci } 317bf215546Sopenharmony_ci 318bf215546Sopenharmony_ci if (!is_blend && ins->op == BI_OPCODE_BLEND) { 319bf215546Sopenharmony_ci /* Blend shaders might clobber r0-r15, r48. */ 320bf215546Sopenharmony_ci uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48); 321bf215546Sopenharmony_ci 322bf215546Sopenharmony_ci for (unsigned i = 0; i < node_count; ++i) { 323bf215546Sopenharmony_ci if (live[i]) 324bf215546Sopenharmony_ci l->affinity[i] &= ~clobber; 325bf215546Sopenharmony_ci } 326bf215546Sopenharmony_ci } 327bf215546Sopenharmony_ci 328bf215546Sopenharmony_ci /* Update live_in */ 329bf215546Sopenharmony_ci preload_live = bi_postra_liveness_ins(preload_live, ins); 330bf215546Sopenharmony_ci bi_liveness_ins_update(live, ins, node_count); 331bf215546Sopenharmony_ci } 332bf215546Sopenharmony_ci 333bf215546Sopenharmony_ci block->reg_live_in = preload_live; 334bf215546Sopenharmony_ci} 335bf215546Sopenharmony_ci 336bf215546Sopenharmony_cistatic void 337bf215546Sopenharmony_cibi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs) 338bf215546Sopenharmony_ci{ 339bf215546Sopenharmony_ci unsigned node_count = bi_max_temp(ctx); 340bf215546Sopenharmony_ci 341bf215546Sopenharmony_ci bi_compute_liveness(ctx); 342bf215546Sopenharmony_ci bi_postra_liveness(ctx); 343bf215546Sopenharmony_ci 344bf215546Sopenharmony_ci bi_foreach_block_rev(ctx, blk) { 345bf215546Sopenharmony_ci uint8_t *live = mem_dup(blk->live_out, node_count); 346bf215546Sopenharmony_ci 347bf215546Sopenharmony_ci bi_mark_interference(blk, l, live, blk->reg_live_out, 348bf215546Sopenharmony_ci node_count, ctx->inputs->is_blend, !full_regs, 349bf215546Sopenharmony_ci ctx->arch >= 9); 350bf215546Sopenharmony_ci 351bf215546Sopenharmony_ci free(live); 352bf215546Sopenharmony_ci } 353bf215546Sopenharmony_ci} 354bf215546Sopenharmony_ci 355bf215546Sopenharmony_cistatic struct lcra_state * 356bf215546Sopenharmony_cibi_allocate_registers(bi_context *ctx, bool *success, bool full_regs) 357bf215546Sopenharmony_ci{ 358bf215546Sopenharmony_ci unsigned node_count = bi_max_temp(ctx); 359bf215546Sopenharmony_ci struct lcra_state *l = lcra_alloc_equations(node_count); 360bf215546Sopenharmony_ci 361bf215546Sopenharmony_ci /* Blend shaders are restricted to R0-R15. Other shaders at full 362bf215546Sopenharmony_ci * occupancy also can access R48-R63. At half occupancy they can access 363bf215546Sopenharmony_ci * the whole file. */ 364bf215546Sopenharmony_ci 365bf215546Sopenharmony_ci uint64_t default_affinity = 366bf215546Sopenharmony_ci ctx->inputs->is_blend ? BITFIELD64_MASK(16) : 367bf215546Sopenharmony_ci full_regs ? BITFIELD64_MASK(64) : 368bf215546Sopenharmony_ci (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48)); 369bf215546Sopenharmony_ci 370bf215546Sopenharmony_ci /* To test spilling, mimic a small register file */ 371bf215546Sopenharmony_ci if (bifrost_debug & BIFROST_DBG_SPILL && !ctx->inputs->is_blend) 372bf215546Sopenharmony_ci default_affinity &= BITFIELD64_MASK(48) << 8; 373bf215546Sopenharmony_ci 374bf215546Sopenharmony_ci bi_foreach_instr_global(ctx, ins) { 375bf215546Sopenharmony_ci bi_foreach_dest(ins, d) { 376bf215546Sopenharmony_ci unsigned dest = bi_get_node(ins->dest[d]); 377bf215546Sopenharmony_ci 378bf215546Sopenharmony_ci if (dest < node_count) 379bf215546Sopenharmony_ci l->affinity[dest] = default_affinity; 380bf215546Sopenharmony_ci } 381bf215546Sopenharmony_ci 382bf215546Sopenharmony_ci /* Blend shaders expect the src colour to be in r0-r3 */ 383bf215546Sopenharmony_ci if (ins->op == BI_OPCODE_BLEND && 384bf215546Sopenharmony_ci !ctx->inputs->is_blend) { 385bf215546Sopenharmony_ci unsigned node = bi_get_node(ins->src[0]); 386bf215546Sopenharmony_ci assert(node < node_count); 387bf215546Sopenharmony_ci l->solutions[node] = 0; 388bf215546Sopenharmony_ci 389bf215546Sopenharmony_ci /* Dual source blend input in r4-r7 */ 390bf215546Sopenharmony_ci node = bi_get_node(ins->src[4]); 391bf215546Sopenharmony_ci if (node < node_count) 392bf215546Sopenharmony_ci l->solutions[node] = 4; 393bf215546Sopenharmony_ci 394bf215546Sopenharmony_ci /* Writes to R48 */ 395bf215546Sopenharmony_ci node = bi_get_node(ins->dest[0]); 396bf215546Sopenharmony_ci if (!bi_is_null(ins->dest[0])) { 397bf215546Sopenharmony_ci assert(node < node_count); 398bf215546Sopenharmony_ci l->solutions[node] = 48; 399bf215546Sopenharmony_ci } 400bf215546Sopenharmony_ci } 401bf215546Sopenharmony_ci 402bf215546Sopenharmony_ci /* Coverage mask writes stay in R60 */ 403bf215546Sopenharmony_ci if ((ins->op == BI_OPCODE_ATEST || 404bf215546Sopenharmony_ci ins->op == BI_OPCODE_ZS_EMIT) && 405bf215546Sopenharmony_ci !bi_is_null(ins->dest[0])) { 406bf215546Sopenharmony_ci unsigned node = bi_get_node(ins->dest[0]); 407bf215546Sopenharmony_ci assert(node < node_count); 408bf215546Sopenharmony_ci l->solutions[node] = 60; 409bf215546Sopenharmony_ci } 410bf215546Sopenharmony_ci 411bf215546Sopenharmony_ci /* Experimentally, it seems coverage masks inputs to ATEST must 412bf215546Sopenharmony_ci * be in R60. Otherwise coverage mask writes do not work with 413bf215546Sopenharmony_ci * early-ZS with pixel-frequency-shading (this combination of 414bf215546Sopenharmony_ci * settings is legal if depth/stencil writes are disabled). 415bf215546Sopenharmony_ci */ 416bf215546Sopenharmony_ci if (ins->op == BI_OPCODE_ATEST) { 417bf215546Sopenharmony_ci unsigned node = bi_get_node(ins->src[0]); 418bf215546Sopenharmony_ci assert(node < node_count); 419bf215546Sopenharmony_ci l->solutions[node] = 60; 420bf215546Sopenharmony_ci } 421bf215546Sopenharmony_ci } 422bf215546Sopenharmony_ci 423bf215546Sopenharmony_ci bi_compute_interference(ctx, l, full_regs); 424bf215546Sopenharmony_ci 425bf215546Sopenharmony_ci /* Coalesce register moves if we're allowed. We need to be careful due 426bf215546Sopenharmony_ci * to the restricted affinity induced by the blend shader ABI. 427bf215546Sopenharmony_ci */ 428bf215546Sopenharmony_ci bi_foreach_instr_global(ctx, I) { 429bf215546Sopenharmony_ci if (I->op != BI_OPCODE_MOV_I32) continue; 430bf215546Sopenharmony_ci if (I->src[0].type != BI_INDEX_REGISTER) continue; 431bf215546Sopenharmony_ci 432bf215546Sopenharmony_ci unsigned reg = I->src[0].value; 433bf215546Sopenharmony_ci unsigned node = bi_get_node(I->dest[0]); 434bf215546Sopenharmony_ci assert(node < node_count); 435bf215546Sopenharmony_ci 436bf215546Sopenharmony_ci if (l->solutions[node] != ~0) continue; 437bf215546Sopenharmony_ci 438bf215546Sopenharmony_ci uint64_t affinity = l->affinity[node]; 439bf215546Sopenharmony_ci 440bf215546Sopenharmony_ci if (ctx->inputs->is_blend) { 441bf215546Sopenharmony_ci /* We're allowed to coalesce the moves to these */ 442bf215546Sopenharmony_ci affinity |= BITFIELD64_BIT(48); 443bf215546Sopenharmony_ci affinity |= BITFIELD64_BIT(60); 444bf215546Sopenharmony_ci } 445bf215546Sopenharmony_ci 446bf215546Sopenharmony_ci /* Try to coalesce */ 447bf215546Sopenharmony_ci if (affinity & BITFIELD64_BIT(reg)) { 448bf215546Sopenharmony_ci l->solutions[node] = reg; 449bf215546Sopenharmony_ci 450bf215546Sopenharmony_ci if (!lcra_test_linear(l, l->solutions, node)) 451bf215546Sopenharmony_ci l->solutions[node] = ~0; 452bf215546Sopenharmony_ci } 453bf215546Sopenharmony_ci } 454bf215546Sopenharmony_ci 455bf215546Sopenharmony_ci *success = lcra_solve(l); 456bf215546Sopenharmony_ci 457bf215546Sopenharmony_ci return l; 458bf215546Sopenharmony_ci} 459bf215546Sopenharmony_ci 460bf215546Sopenharmony_cistatic bi_index 461bf215546Sopenharmony_cibi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index) 462bf215546Sopenharmony_ci{ 463bf215546Sopenharmony_ci /* Offsets can only be applied when we register allocated an index, or 464bf215546Sopenharmony_ci * alternatively for FAU's encoding */ 465bf215546Sopenharmony_ci 466bf215546Sopenharmony_ci ASSERTED bool is_offset = (index.offset > 0) && 467bf215546Sopenharmony_ci (index.type != BI_INDEX_FAU); 468bf215546Sopenharmony_ci unsigned node_count = bi_max_temp(ctx); 469bf215546Sopenharmony_ci 470bf215546Sopenharmony_ci /* Did we run RA for this index at all */ 471bf215546Sopenharmony_ci if (bi_get_node(index) >= node_count) { 472bf215546Sopenharmony_ci assert(!is_offset); 473bf215546Sopenharmony_ci return index; 474bf215546Sopenharmony_ci } 475bf215546Sopenharmony_ci 476bf215546Sopenharmony_ci /* LCRA didn't bother solving this index (how lazy!) */ 477bf215546Sopenharmony_ci signed solution = l->solutions[bi_get_node(index)]; 478bf215546Sopenharmony_ci if (solution < 0) { 479bf215546Sopenharmony_ci assert(!is_offset); 480bf215546Sopenharmony_ci return index; 481bf215546Sopenharmony_ci } 482bf215546Sopenharmony_ci 483bf215546Sopenharmony_ci /* todo: do we want to compose with the subword swizzle? */ 484bf215546Sopenharmony_ci bi_index new_index = bi_register(solution + index.offset); 485bf215546Sopenharmony_ci new_index.swizzle = index.swizzle; 486bf215546Sopenharmony_ci new_index.abs = index.abs; 487bf215546Sopenharmony_ci new_index.neg = index.neg; 488bf215546Sopenharmony_ci return new_index; 489bf215546Sopenharmony_ci} 490bf215546Sopenharmony_ci 491bf215546Sopenharmony_ci/* Dual texture instructions write to two sets of staging registers, modeled as 492bf215546Sopenharmony_ci * two destinations in the IR. The first set is communicated with the usual 493bf215546Sopenharmony_ci * staging register mechanism. The second set is encoded in the texture 494bf215546Sopenharmony_ci * operation descriptor. This is quite unusual, and requires the following late 495bf215546Sopenharmony_ci * fixup. 496bf215546Sopenharmony_ci */ 497bf215546Sopenharmony_cistatic void 498bf215546Sopenharmony_cibi_fixup_dual_tex_register(bi_instr *I) 499bf215546Sopenharmony_ci{ 500bf215546Sopenharmony_ci assert(I->dest[1].type == BI_INDEX_REGISTER); 501bf215546Sopenharmony_ci assert(I->src[3].type == BI_INDEX_CONSTANT); 502bf215546Sopenharmony_ci 503bf215546Sopenharmony_ci struct bifrost_dual_texture_operation desc = { 504bf215546Sopenharmony_ci .secondary_register = I->dest[1].value 505bf215546Sopenharmony_ci }; 506bf215546Sopenharmony_ci 507bf215546Sopenharmony_ci I->src[3].value |= bi_dual_tex_as_u32(desc); 508bf215546Sopenharmony_ci} 509bf215546Sopenharmony_ci 510bf215546Sopenharmony_cistatic void 511bf215546Sopenharmony_cibi_install_registers(bi_context *ctx, struct lcra_state *l) 512bf215546Sopenharmony_ci{ 513bf215546Sopenharmony_ci bi_foreach_instr_global(ctx, ins) { 514bf215546Sopenharmony_ci bi_foreach_dest(ins, d) 515bf215546Sopenharmony_ci ins->dest[d] = bi_reg_from_index(ctx, l, ins->dest[d]); 516bf215546Sopenharmony_ci 517bf215546Sopenharmony_ci bi_foreach_src(ins, s) 518bf215546Sopenharmony_ci ins->src[s] = bi_reg_from_index(ctx, l, ins->src[s]); 519bf215546Sopenharmony_ci 520bf215546Sopenharmony_ci if (ins->op == BI_OPCODE_TEXC && !bi_is_null(ins->dest[1])) 521bf215546Sopenharmony_ci bi_fixup_dual_tex_register(ins); 522bf215546Sopenharmony_ci } 523bf215546Sopenharmony_ci} 524bf215546Sopenharmony_ci 525bf215546Sopenharmony_cistatic void 526bf215546Sopenharmony_cibi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new) 527bf215546Sopenharmony_ci{ 528bf215546Sopenharmony_ci bi_foreach_src(ins, i) { 529bf215546Sopenharmony_ci if (bi_is_equiv(ins->src[i], old)) { 530bf215546Sopenharmony_ci ins->src[i].type = new.type; 531bf215546Sopenharmony_ci ins->src[i].reg = new.reg; 532bf215546Sopenharmony_ci ins->src[i].value = new.value; 533bf215546Sopenharmony_ci } 534bf215546Sopenharmony_ci } 535bf215546Sopenharmony_ci} 536bf215546Sopenharmony_ci 537bf215546Sopenharmony_ci/* If register allocation fails, find the best spill node */ 538bf215546Sopenharmony_ci 539bf215546Sopenharmony_cistatic signed 540bf215546Sopenharmony_cibi_choose_spill_node(bi_context *ctx, struct lcra_state *l) 541bf215546Sopenharmony_ci{ 542bf215546Sopenharmony_ci /* Pick a node satisfying bi_spill_register's preconditions */ 543bf215546Sopenharmony_ci BITSET_WORD *no_spill = calloc(sizeof(BITSET_WORD), BITSET_WORDS(l->node_count)); 544bf215546Sopenharmony_ci 545bf215546Sopenharmony_ci bi_foreach_instr_global(ctx, ins) { 546bf215546Sopenharmony_ci bi_foreach_dest(ins, d) { 547bf215546Sopenharmony_ci unsigned node = bi_get_node(ins->dest[d]); 548bf215546Sopenharmony_ci 549bf215546Sopenharmony_ci if (node >= l->node_count) 550bf215546Sopenharmony_ci continue; 551bf215546Sopenharmony_ci 552bf215546Sopenharmony_ci /* Don't allow spilling coverage mask writes because the 553bf215546Sopenharmony_ci * register preload logic assumes it will stay in R60. 554bf215546Sopenharmony_ci * This could be optimized. 555bf215546Sopenharmony_ci */ 556bf215546Sopenharmony_ci if (ins->no_spill || 557bf215546Sopenharmony_ci ins->op == BI_OPCODE_ATEST || 558bf215546Sopenharmony_ci ins->op == BI_OPCODE_ZS_EMIT || 559bf215546Sopenharmony_ci (ins->op == BI_OPCODE_MOV_I32 && 560bf215546Sopenharmony_ci ins->src[0].type == BI_INDEX_REGISTER && 561bf215546Sopenharmony_ci ins->src[0].value == 60)) { 562bf215546Sopenharmony_ci BITSET_SET(no_spill, node); 563bf215546Sopenharmony_ci } 564bf215546Sopenharmony_ci } 565bf215546Sopenharmony_ci } 566bf215546Sopenharmony_ci 567bf215546Sopenharmony_ci unsigned best_benefit = 0.0; 568bf215546Sopenharmony_ci signed best_node = -1; 569bf215546Sopenharmony_ci 570bf215546Sopenharmony_ci if (nodearray_is_sparse(&l->linear[l->spill_node])) { 571bf215546Sopenharmony_ci nodearray_sparse_foreach(&l->linear[l->spill_node], elem) { 572bf215546Sopenharmony_ci unsigned i = nodearray_sparse_key(elem); 573bf215546Sopenharmony_ci unsigned constraint = nodearray_sparse_value(elem); 574bf215546Sopenharmony_ci 575bf215546Sopenharmony_ci /* Only spill nodes that interfere with the node failing 576bf215546Sopenharmony_ci * register allocation. It's pointless to spill anything else */ 577bf215546Sopenharmony_ci if (!constraint) continue; 578bf215546Sopenharmony_ci 579bf215546Sopenharmony_ci if (BITSET_TEST(no_spill, i)) continue; 580bf215546Sopenharmony_ci 581bf215546Sopenharmony_ci unsigned benefit = lcra_count_constraints(l, i); 582bf215546Sopenharmony_ci 583bf215546Sopenharmony_ci if (benefit > best_benefit) { 584bf215546Sopenharmony_ci best_benefit = benefit; 585bf215546Sopenharmony_ci best_node = i; 586bf215546Sopenharmony_ci } 587bf215546Sopenharmony_ci } 588bf215546Sopenharmony_ci } else { 589bf215546Sopenharmony_ci nodearray_value *row = l->linear[l->spill_node].dense; 590bf215546Sopenharmony_ci 591bf215546Sopenharmony_ci for (unsigned i = 0; i < l->node_count; ++i) { 592bf215546Sopenharmony_ci /* Only spill nodes that interfere with the node failing 593bf215546Sopenharmony_ci * register allocation. It's pointless to spill anything else */ 594bf215546Sopenharmony_ci if (!row[i]) continue; 595bf215546Sopenharmony_ci 596bf215546Sopenharmony_ci if (BITSET_TEST(no_spill, i)) continue; 597bf215546Sopenharmony_ci 598bf215546Sopenharmony_ci unsigned benefit = lcra_count_constraints(l, i); 599bf215546Sopenharmony_ci 600bf215546Sopenharmony_ci if (benefit > best_benefit) { 601bf215546Sopenharmony_ci best_benefit = benefit; 602bf215546Sopenharmony_ci best_node = i; 603bf215546Sopenharmony_ci } 604bf215546Sopenharmony_ci } 605bf215546Sopenharmony_ci } 606bf215546Sopenharmony_ci 607bf215546Sopenharmony_ci free(no_spill); 608bf215546Sopenharmony_ci return best_node; 609bf215546Sopenharmony_ci} 610bf215546Sopenharmony_ci 611bf215546Sopenharmony_cistatic unsigned 612bf215546Sopenharmony_cibi_count_read_index(bi_instr *I, bi_index index) 613bf215546Sopenharmony_ci{ 614bf215546Sopenharmony_ci unsigned max = 0; 615bf215546Sopenharmony_ci 616bf215546Sopenharmony_ci bi_foreach_src(I, s) { 617bf215546Sopenharmony_ci if (bi_is_equiv(I->src[s], index)) { 618bf215546Sopenharmony_ci unsigned count = bi_count_read_registers(I, s); 619bf215546Sopenharmony_ci max = MAX2(max, count + I->src[s].offset); 620bf215546Sopenharmony_ci } 621bf215546Sopenharmony_ci } 622bf215546Sopenharmony_ci 623bf215546Sopenharmony_ci return max; 624bf215546Sopenharmony_ci} 625bf215546Sopenharmony_ci 626bf215546Sopenharmony_ci/* 627bf215546Sopenharmony_ci * Wrappers to emit loads/stores to thread-local storage in an appropriate way 628bf215546Sopenharmony_ci * for the target, so the spill/fill code becomes architecture-independent. 629bf215546Sopenharmony_ci */ 630bf215546Sopenharmony_ci 631bf215546Sopenharmony_cistatic bi_index 632bf215546Sopenharmony_cibi_tls_ptr(bool hi) 633bf215546Sopenharmony_ci{ 634bf215546Sopenharmony_ci return bi_fau(BIR_FAU_TLS_PTR, hi); 635bf215546Sopenharmony_ci} 636bf215546Sopenharmony_ci 637bf215546Sopenharmony_cistatic bi_instr * 638bf215546Sopenharmony_cibi_load_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset) 639bf215546Sopenharmony_ci{ 640bf215546Sopenharmony_ci if (b->shader->arch >= 9) { 641bf215546Sopenharmony_ci return bi_load_to(b, bits, src, bi_tls_ptr(false), 642bf215546Sopenharmony_ci bi_tls_ptr(true), BI_SEG_TL, offset); 643bf215546Sopenharmony_ci } else { 644bf215546Sopenharmony_ci return bi_load_to(b, bits, src, bi_imm_u32(offset), bi_zero(), 645bf215546Sopenharmony_ci BI_SEG_TL, 0); 646bf215546Sopenharmony_ci } 647bf215546Sopenharmony_ci} 648bf215546Sopenharmony_ci 649bf215546Sopenharmony_cistatic void 650bf215546Sopenharmony_cibi_store_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset) 651bf215546Sopenharmony_ci{ 652bf215546Sopenharmony_ci if (b->shader->arch >= 9) { 653bf215546Sopenharmony_ci bi_store(b, bits, src, bi_tls_ptr(false), bi_tls_ptr(true), BI_SEG_TL, offset); 654bf215546Sopenharmony_ci } else { 655bf215546Sopenharmony_ci bi_store(b, bits, src, bi_imm_u32(offset), bi_zero(), BI_SEG_TL, 0); 656bf215546Sopenharmony_ci } 657bf215546Sopenharmony_ci} 658bf215546Sopenharmony_ci 659bf215546Sopenharmony_ci/* Once we've chosen a spill node, spill it and returns bytes spilled */ 660bf215546Sopenharmony_ci 661bf215546Sopenharmony_cistatic unsigned 662bf215546Sopenharmony_cibi_spill_register(bi_context *ctx, bi_index index, uint32_t offset) 663bf215546Sopenharmony_ci{ 664bf215546Sopenharmony_ci bi_builder b = { .shader = ctx }; 665bf215546Sopenharmony_ci unsigned channels = 0; 666bf215546Sopenharmony_ci 667bf215546Sopenharmony_ci /* Spill after every store, fill before every load */ 668bf215546Sopenharmony_ci bi_foreach_instr_global_safe(ctx, I) { 669bf215546Sopenharmony_ci bi_foreach_dest(I, d) { 670bf215546Sopenharmony_ci if (!bi_is_equiv(I->dest[d], index)) continue; 671bf215546Sopenharmony_ci 672bf215546Sopenharmony_ci unsigned extra = I->dest[d].offset; 673bf215546Sopenharmony_ci bi_index tmp = bi_temp(ctx); 674bf215546Sopenharmony_ci 675bf215546Sopenharmony_ci I->dest[d] = bi_replace_index(I->dest[d], tmp); 676bf215546Sopenharmony_ci I->no_spill = true; 677bf215546Sopenharmony_ci 678bf215546Sopenharmony_ci unsigned count = bi_count_write_registers(I, d); 679bf215546Sopenharmony_ci unsigned bits = count * 32; 680bf215546Sopenharmony_ci 681bf215546Sopenharmony_ci b.cursor = bi_after_instr(I); 682bf215546Sopenharmony_ci bi_store_tl(&b, bits, tmp, offset + 4 * extra); 683bf215546Sopenharmony_ci 684bf215546Sopenharmony_ci ctx->spills++; 685bf215546Sopenharmony_ci channels = MAX2(channels, extra + count); 686bf215546Sopenharmony_ci } 687bf215546Sopenharmony_ci 688bf215546Sopenharmony_ci if (bi_has_arg(I, index)) { 689bf215546Sopenharmony_ci b.cursor = bi_before_instr(I); 690bf215546Sopenharmony_ci bi_index tmp = bi_temp(ctx); 691bf215546Sopenharmony_ci 692bf215546Sopenharmony_ci unsigned bits = bi_count_read_index(I, index) * 32; 693bf215546Sopenharmony_ci bi_rewrite_index_src_single(I, index, tmp); 694bf215546Sopenharmony_ci 695bf215546Sopenharmony_ci bi_instr *ld = bi_load_tl(&b, bits, tmp, offset); 696bf215546Sopenharmony_ci ld->no_spill = true; 697bf215546Sopenharmony_ci ctx->fills++; 698bf215546Sopenharmony_ci } 699bf215546Sopenharmony_ci } 700bf215546Sopenharmony_ci 701bf215546Sopenharmony_ci return (channels * 4); 702bf215546Sopenharmony_ci} 703bf215546Sopenharmony_ci 704bf215546Sopenharmony_ci/* 705bf215546Sopenharmony_ci * For transition, lower collects and splits before RA, rather than after RA. 706bf215546Sopenharmony_ci * LCRA knows how to deal with offsets (broken SSA), but not how to coalesce 707bf215546Sopenharmony_ci * these vector moves. 708bf215546Sopenharmony_ci */ 709bf215546Sopenharmony_cistatic void 710bf215546Sopenharmony_cibi_lower_vector(bi_context *ctx) 711bf215546Sopenharmony_ci{ 712bf215546Sopenharmony_ci bi_index *remap = calloc(ctx->ssa_alloc, sizeof(bi_index)); 713bf215546Sopenharmony_ci 714bf215546Sopenharmony_ci bi_foreach_instr_global_safe(ctx, I) { 715bf215546Sopenharmony_ci bi_builder b = bi_init_builder(ctx, bi_after_instr(I)); 716bf215546Sopenharmony_ci 717bf215546Sopenharmony_ci if (I->op == BI_OPCODE_SPLIT_I32) { 718bf215546Sopenharmony_ci bi_index src = I->src[0]; 719bf215546Sopenharmony_ci assert(src.offset == 0); 720bf215546Sopenharmony_ci 721bf215546Sopenharmony_ci for (unsigned i = 0; i < I->nr_dests; ++i) { 722bf215546Sopenharmony_ci if (bi_is_null(I->dest[i])) 723bf215546Sopenharmony_ci continue; 724bf215546Sopenharmony_ci 725bf215546Sopenharmony_ci src.offset = i; 726bf215546Sopenharmony_ci bi_mov_i32_to(&b, I->dest[i], src); 727bf215546Sopenharmony_ci 728bf215546Sopenharmony_ci if (bi_is_ssa(I->dest[i])) 729bf215546Sopenharmony_ci remap[I->dest[i].value] = src; 730bf215546Sopenharmony_ci } 731bf215546Sopenharmony_ci 732bf215546Sopenharmony_ci bi_remove_instruction(I); 733bf215546Sopenharmony_ci } else if (I->op == BI_OPCODE_COLLECT_I32) { 734bf215546Sopenharmony_ci bi_index dest = I->dest[0]; 735bf215546Sopenharmony_ci assert(dest.offset == 0); 736bf215546Sopenharmony_ci assert((bi_is_ssa(dest) || I->nr_srcs == 1) && "nir_lower_phis_to_scalar"); 737bf215546Sopenharmony_ci 738bf215546Sopenharmony_ci for (unsigned i = 0; i < I->nr_srcs; ++i) { 739bf215546Sopenharmony_ci if (bi_is_null(I->src[i])) 740bf215546Sopenharmony_ci continue; 741bf215546Sopenharmony_ci 742bf215546Sopenharmony_ci dest.offset = i; 743bf215546Sopenharmony_ci bi_mov_i32_to(&b, dest, I->src[i]); 744bf215546Sopenharmony_ci } 745bf215546Sopenharmony_ci 746bf215546Sopenharmony_ci bi_remove_instruction(I); 747bf215546Sopenharmony_ci } 748bf215546Sopenharmony_ci } 749bf215546Sopenharmony_ci 750bf215546Sopenharmony_ci bi_foreach_instr_global(ctx, I) { 751bf215546Sopenharmony_ci bi_foreach_src(I, s) { 752bf215546Sopenharmony_ci if (bi_is_ssa(I->src[s]) && !bi_is_null(remap[I->src[s].value])) 753bf215546Sopenharmony_ci I->src[s] = bi_replace_index(I->src[s], remap[I->src[s].value]); 754bf215546Sopenharmony_ci } 755bf215546Sopenharmony_ci } 756bf215546Sopenharmony_ci 757bf215546Sopenharmony_ci free(remap); 758bf215546Sopenharmony_ci 759bf215546Sopenharmony_ci /* After generating a pile of moves, clean up */ 760bf215546Sopenharmony_ci bi_opt_dead_code_eliminate(ctx); 761bf215546Sopenharmony_ci} 762bf215546Sopenharmony_ci 763bf215546Sopenharmony_ci/* 764bf215546Sopenharmony_ci * Check if the instruction requires a "tied" operand. Such instructions MUST 765bf215546Sopenharmony_ci * allocate their source and destination to the same register. This is a 766bf215546Sopenharmony_ci * constraint on RA, and may require extra moves. 767bf215546Sopenharmony_ci * 768bf215546Sopenharmony_ci * In particular, this is the case for Bifrost instructions that both read and 769bf215546Sopenharmony_ci * write with the staging register mechanism. 770bf215546Sopenharmony_ci */ 771bf215546Sopenharmony_cistatic bool 772bf215546Sopenharmony_cibi_is_tied(const bi_instr *I) 773bf215546Sopenharmony_ci{ 774bf215546Sopenharmony_ci if (bi_is_null(I->src[0])) 775bf215546Sopenharmony_ci return false; 776bf215546Sopenharmony_ci 777bf215546Sopenharmony_ci return (I->op == BI_OPCODE_TEXC || 778bf215546Sopenharmony_ci I->op == BI_OPCODE_ATOM_RETURN_I32 || 779bf215546Sopenharmony_ci I->op == BI_OPCODE_AXCHG_I32 || 780bf215546Sopenharmony_ci I->op == BI_OPCODE_ACMPXCHG_I32); 781bf215546Sopenharmony_ci} 782bf215546Sopenharmony_ci 783bf215546Sopenharmony_ci/* 784bf215546Sopenharmony_ci * For transition, coalesce tied operands together, as LCRA knows how to handle 785bf215546Sopenharmony_ci * non-SSA operands but doesn't know about tied operands. 786bf215546Sopenharmony_ci * 787bf215546Sopenharmony_ci * This breaks the SSA form of the program, but that doesn't matter for LCRA. 788bf215546Sopenharmony_ci */ 789bf215546Sopenharmony_cistatic void 790bf215546Sopenharmony_cibi_coalesce_tied(bi_context *ctx) 791bf215546Sopenharmony_ci{ 792bf215546Sopenharmony_ci bi_foreach_instr_global(ctx, I) { 793bf215546Sopenharmony_ci if (!bi_is_tied(I)) continue; 794bf215546Sopenharmony_ci 795bf215546Sopenharmony_ci bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); 796bf215546Sopenharmony_ci unsigned n = bi_count_read_registers(I, 0); 797bf215546Sopenharmony_ci 798bf215546Sopenharmony_ci for (unsigned i = 0; i < n; ++i) { 799bf215546Sopenharmony_ci bi_index dst = I->dest[0], src = I->src[0]; 800bf215546Sopenharmony_ci 801bf215546Sopenharmony_ci assert(dst.offset == 0 && src.offset == 0); 802bf215546Sopenharmony_ci dst.offset = src.offset = i; 803bf215546Sopenharmony_ci 804bf215546Sopenharmony_ci bi_mov_i32_to(&b, dst, src); 805bf215546Sopenharmony_ci } 806bf215546Sopenharmony_ci 807bf215546Sopenharmony_ci I->src[0] = bi_replace_index(I->src[0], I->dest[0]); 808bf215546Sopenharmony_ci } 809bf215546Sopenharmony_ci} 810bf215546Sopenharmony_ci 811bf215546Sopenharmony_cistatic unsigned 812bf215546Sopenharmony_cifind_or_allocate_temp(unsigned *map, unsigned value, unsigned *alloc) 813bf215546Sopenharmony_ci{ 814bf215546Sopenharmony_ci if (!map[value]) 815bf215546Sopenharmony_ci map[value] = ++(*alloc); 816bf215546Sopenharmony_ci 817bf215546Sopenharmony_ci assert(map[value]); 818bf215546Sopenharmony_ci return map[value] - 1; 819bf215546Sopenharmony_ci} 820bf215546Sopenharmony_ci 821bf215546Sopenharmony_ci/* Reassigns numbering to get rid of gaps in the indices and to prioritize 822bf215546Sopenharmony_ci * smaller register classes */ 823bf215546Sopenharmony_ci 824bf215546Sopenharmony_cistatic void 825bf215546Sopenharmony_cisqueeze_index(bi_context *ctx) 826bf215546Sopenharmony_ci{ 827bf215546Sopenharmony_ci unsigned *map = rzalloc_array(ctx, unsigned, ctx->ssa_alloc); 828bf215546Sopenharmony_ci ctx->ssa_alloc = 0; 829bf215546Sopenharmony_ci 830bf215546Sopenharmony_ci bi_foreach_instr_global(ctx, I) { 831bf215546Sopenharmony_ci bi_foreach_dest(I, d) { 832bf215546Sopenharmony_ci if (I->dest[d].type == BI_INDEX_NORMAL) 833bf215546Sopenharmony_ci I->dest[d].value = find_or_allocate_temp(map, I->dest[d].value, &ctx->ssa_alloc); 834bf215546Sopenharmony_ci } 835bf215546Sopenharmony_ci 836bf215546Sopenharmony_ci bi_foreach_src(I, s) { 837bf215546Sopenharmony_ci if (I->src[s].type == BI_INDEX_NORMAL) 838bf215546Sopenharmony_ci I->src[s].value = find_or_allocate_temp(map, I->src[s].value, &ctx->ssa_alloc); 839bf215546Sopenharmony_ci } 840bf215546Sopenharmony_ci } 841bf215546Sopenharmony_ci 842bf215546Sopenharmony_ci ralloc_free(map); 843bf215546Sopenharmony_ci} 844bf215546Sopenharmony_ci 845bf215546Sopenharmony_civoid 846bf215546Sopenharmony_cibi_register_allocate(bi_context *ctx) 847bf215546Sopenharmony_ci{ 848bf215546Sopenharmony_ci struct lcra_state *l = NULL; 849bf215546Sopenharmony_ci bool success = false; 850bf215546Sopenharmony_ci 851bf215546Sopenharmony_ci unsigned iter_count = 1000; /* max iterations */ 852bf215546Sopenharmony_ci 853bf215546Sopenharmony_ci /* Number of bytes of memory we've spilled into */ 854bf215546Sopenharmony_ci unsigned spill_count = ctx->info.tls_size; 855bf215546Sopenharmony_ci 856bf215546Sopenharmony_ci if (ctx->arch >= 9) 857bf215546Sopenharmony_ci va_lower_split_64bit(ctx); 858bf215546Sopenharmony_ci 859bf215546Sopenharmony_ci bi_lower_vector(ctx); 860bf215546Sopenharmony_ci 861bf215546Sopenharmony_ci /* Lower tied operands. SSA is broken from here on. */ 862bf215546Sopenharmony_ci bi_coalesce_tied(ctx); 863bf215546Sopenharmony_ci squeeze_index(ctx); 864bf215546Sopenharmony_ci 865bf215546Sopenharmony_ci /* Try with reduced register pressure to improve thread count */ 866bf215546Sopenharmony_ci if (ctx->arch >= 7) { 867bf215546Sopenharmony_ci l = bi_allocate_registers(ctx, &success, false); 868bf215546Sopenharmony_ci 869bf215546Sopenharmony_ci if (success) { 870bf215546Sopenharmony_ci ctx->info.work_reg_count = 32; 871bf215546Sopenharmony_ci } else { 872bf215546Sopenharmony_ci lcra_free(l); 873bf215546Sopenharmony_ci l = NULL; 874bf215546Sopenharmony_ci } 875bf215546Sopenharmony_ci } 876bf215546Sopenharmony_ci 877bf215546Sopenharmony_ci /* Otherwise, use the register file and spill until we succeed */ 878bf215546Sopenharmony_ci while (!success && ((iter_count--) > 0)) { 879bf215546Sopenharmony_ci l = bi_allocate_registers(ctx, &success, true); 880bf215546Sopenharmony_ci 881bf215546Sopenharmony_ci if (success) { 882bf215546Sopenharmony_ci ctx->info.work_reg_count = 64; 883bf215546Sopenharmony_ci } else { 884bf215546Sopenharmony_ci signed spill_node = bi_choose_spill_node(ctx, l); 885bf215546Sopenharmony_ci lcra_free(l); 886bf215546Sopenharmony_ci l = NULL; 887bf215546Sopenharmony_ci 888bf215546Sopenharmony_ci if (spill_node == -1) 889bf215546Sopenharmony_ci unreachable("Failed to choose spill node\n"); 890bf215546Sopenharmony_ci 891bf215546Sopenharmony_ci if (ctx->inputs->is_blend) 892bf215546Sopenharmony_ci unreachable("Blend shaders may not spill"); 893bf215546Sopenharmony_ci 894bf215546Sopenharmony_ci /* By default, we use packed TLS addressing on Valhall. 895bf215546Sopenharmony_ci * We cannot cross 16 byte boundaries with packed TLS 896bf215546Sopenharmony_ci * addressing. Align to ensure this doesn't happen. This 897bf215546Sopenharmony_ci * could be optimized a bit. 898bf215546Sopenharmony_ci */ 899bf215546Sopenharmony_ci if (ctx->arch >= 9) 900bf215546Sopenharmony_ci spill_count = ALIGN_POT(spill_count, 16); 901bf215546Sopenharmony_ci 902bf215546Sopenharmony_ci spill_count += bi_spill_register(ctx, 903bf215546Sopenharmony_ci bi_node_to_index(spill_node, bi_max_temp(ctx)), 904bf215546Sopenharmony_ci spill_count); 905bf215546Sopenharmony_ci 906bf215546Sopenharmony_ci /* In case the spill affected an instruction with tied 907bf215546Sopenharmony_ci * operands, we need to fix up. 908bf215546Sopenharmony_ci */ 909bf215546Sopenharmony_ci bi_coalesce_tied(ctx); 910bf215546Sopenharmony_ci } 911bf215546Sopenharmony_ci } 912bf215546Sopenharmony_ci 913bf215546Sopenharmony_ci assert(success); 914bf215546Sopenharmony_ci assert(l != NULL); 915bf215546Sopenharmony_ci 916bf215546Sopenharmony_ci ctx->info.tls_size = spill_count; 917bf215546Sopenharmony_ci bi_install_registers(ctx, l); 918bf215546Sopenharmony_ci 919bf215546Sopenharmony_ci lcra_free(l); 920bf215546Sopenharmony_ci} 921