1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright (C) 2021 Collabora, Ltd. 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20bf215546Sopenharmony_ci * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21bf215546Sopenharmony_ci * SOFTWARE. 22bf215546Sopenharmony_ci */ 23bf215546Sopenharmony_ci 24bf215546Sopenharmony_ci#include "compiler.h" 25bf215546Sopenharmony_ci#include "bi_builder.h" 26bf215546Sopenharmony_ci 27bf215546Sopenharmony_ci/* This optimization pass, intended to run once after code emission but before 28bf215546Sopenharmony_ci * copy propagation, analyzes direct word-aligned UBO reads and promotes a 29bf215546Sopenharmony_ci * subset to moves from FAU. It is the sole populator of the UBO push data 30bf215546Sopenharmony_ci * structure returned back to the command stream. */ 31bf215546Sopenharmony_ci 32bf215546Sopenharmony_cistatic bool 33bf215546Sopenharmony_cibi_is_ubo(bi_instr *ins) 34bf215546Sopenharmony_ci{ 35bf215546Sopenharmony_ci return (bi_opcode_props[ins->op].message == BIFROST_MESSAGE_LOAD) && 36bf215546Sopenharmony_ci (ins->seg == BI_SEG_UBO); 37bf215546Sopenharmony_ci} 38bf215546Sopenharmony_ci 39bf215546Sopenharmony_cistatic bool 40bf215546Sopenharmony_cibi_is_direct_aligned_ubo(bi_instr *ins) 41bf215546Sopenharmony_ci{ 42bf215546Sopenharmony_ci return bi_is_ubo(ins) && 43bf215546Sopenharmony_ci (ins->src[0].type == BI_INDEX_CONSTANT) && 44bf215546Sopenharmony_ci (ins->src[1].type == BI_INDEX_CONSTANT) && 45bf215546Sopenharmony_ci ((ins->src[0].value & 0x3) == 0); 46bf215546Sopenharmony_ci} 47bf215546Sopenharmony_ci 48bf215546Sopenharmony_ci/* Represents use data for a single UBO */ 49bf215546Sopenharmony_ci 50bf215546Sopenharmony_ci#define MAX_UBO_WORDS (65536 / 16) 51bf215546Sopenharmony_ci 52bf215546Sopenharmony_cistruct bi_ubo_block { 53bf215546Sopenharmony_ci BITSET_DECLARE(pushed, MAX_UBO_WORDS); 54bf215546Sopenharmony_ci uint8_t range[MAX_UBO_WORDS]; 55bf215546Sopenharmony_ci}; 56bf215546Sopenharmony_ci 57bf215546Sopenharmony_cistruct bi_ubo_analysis { 58bf215546Sopenharmony_ci /* Per block analysis */ 59bf215546Sopenharmony_ci unsigned nr_blocks; 60bf215546Sopenharmony_ci struct bi_ubo_block *blocks; 61bf215546Sopenharmony_ci}; 62bf215546Sopenharmony_ci 63bf215546Sopenharmony_cistatic struct bi_ubo_analysis 64bf215546Sopenharmony_cibi_analyze_ranges(bi_context *ctx) 65bf215546Sopenharmony_ci{ 66bf215546Sopenharmony_ci struct bi_ubo_analysis res = { 67bf215546Sopenharmony_ci .nr_blocks = ctx->nir->info.num_ubos + 1, 68bf215546Sopenharmony_ci }; 69bf215546Sopenharmony_ci 70bf215546Sopenharmony_ci res.blocks = calloc(res.nr_blocks, sizeof(struct bi_ubo_block)); 71bf215546Sopenharmony_ci 72bf215546Sopenharmony_ci bi_foreach_instr_global(ctx, ins) { 73bf215546Sopenharmony_ci if (!bi_is_direct_aligned_ubo(ins)) continue; 74bf215546Sopenharmony_ci 75bf215546Sopenharmony_ci unsigned ubo = ins->src[1].value; 76bf215546Sopenharmony_ci unsigned word = ins->src[0].value / 4; 77bf215546Sopenharmony_ci unsigned channels = bi_opcode_props[ins->op].sr_count; 78bf215546Sopenharmony_ci 79bf215546Sopenharmony_ci assert(ubo < res.nr_blocks); 80bf215546Sopenharmony_ci assert(channels > 0 && channels <= 4); 81bf215546Sopenharmony_ci 82bf215546Sopenharmony_ci if (word >= MAX_UBO_WORDS) continue; 83bf215546Sopenharmony_ci 84bf215546Sopenharmony_ci /* Must use max if the same base is read with different channel 85bf215546Sopenharmony_ci * counts, which is possible with nir_opt_shrink_vectors */ 86bf215546Sopenharmony_ci uint8_t *range = res.blocks[ubo].range; 87bf215546Sopenharmony_ci range[word] = MAX2(range[word], channels); 88bf215546Sopenharmony_ci } 89bf215546Sopenharmony_ci 90bf215546Sopenharmony_ci return res; 91bf215546Sopenharmony_ci} 92bf215546Sopenharmony_ci 93bf215546Sopenharmony_ci/* Select UBO words to push. A sophisticated implementation would consider the 94bf215546Sopenharmony_ci * number of uses and perhaps the control flow to estimate benefit. This is not 95bf215546Sopenharmony_ci * sophisticated. Select from the last UBO first to prioritize sysvals. */ 96bf215546Sopenharmony_ci 97bf215546Sopenharmony_cistatic void 98bf215546Sopenharmony_cibi_pick_ubo(struct panfrost_ubo_push *push, struct bi_ubo_analysis *analysis) 99bf215546Sopenharmony_ci{ 100bf215546Sopenharmony_ci for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) { 101bf215546Sopenharmony_ci struct bi_ubo_block *block = &analysis->blocks[ubo]; 102bf215546Sopenharmony_ci 103bf215546Sopenharmony_ci for (unsigned r = 0; r < MAX_UBO_WORDS; ++r) { 104bf215546Sopenharmony_ci unsigned range = block->range[r]; 105bf215546Sopenharmony_ci 106bf215546Sopenharmony_ci /* Don't push something we don't access */ 107bf215546Sopenharmony_ci if (range == 0) continue; 108bf215546Sopenharmony_ci 109bf215546Sopenharmony_ci /* Don't push more than possible */ 110bf215546Sopenharmony_ci if (push->count > PAN_MAX_PUSH - range) 111bf215546Sopenharmony_ci return; 112bf215546Sopenharmony_ci 113bf215546Sopenharmony_ci for (unsigned offs = 0; offs < range; ++offs) { 114bf215546Sopenharmony_ci struct panfrost_ubo_word word = { 115bf215546Sopenharmony_ci .ubo = ubo, 116bf215546Sopenharmony_ci .offset = (r + offs) * 4 117bf215546Sopenharmony_ci }; 118bf215546Sopenharmony_ci 119bf215546Sopenharmony_ci push->words[push->count++] = word; 120bf215546Sopenharmony_ci } 121bf215546Sopenharmony_ci 122bf215546Sopenharmony_ci /* Mark it as pushed so we can rewrite */ 123bf215546Sopenharmony_ci BITSET_SET(block->pushed, r); 124bf215546Sopenharmony_ci } 125bf215546Sopenharmony_ci } 126bf215546Sopenharmony_ci} 127bf215546Sopenharmony_ci 128bf215546Sopenharmony_civoid 129bf215546Sopenharmony_cibi_opt_push_ubo(bi_context *ctx) 130bf215546Sopenharmony_ci{ 131bf215546Sopenharmony_ci struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx); 132bf215546Sopenharmony_ci bi_pick_ubo(ctx->info.push, &analysis); 133bf215546Sopenharmony_ci 134bf215546Sopenharmony_ci ctx->ubo_mask = 0; 135bf215546Sopenharmony_ci 136bf215546Sopenharmony_ci bi_foreach_instr_global_safe(ctx, ins) { 137bf215546Sopenharmony_ci if (!bi_is_ubo(ins)) continue; 138bf215546Sopenharmony_ci 139bf215546Sopenharmony_ci unsigned ubo = ins->src[1].value; 140bf215546Sopenharmony_ci unsigned offset = ins->src[0].value; 141bf215546Sopenharmony_ci 142bf215546Sopenharmony_ci if (!bi_is_direct_aligned_ubo(ins)) { 143bf215546Sopenharmony_ci /* The load can't be pushed, so this UBO needs to be 144bf215546Sopenharmony_ci * uploaded conventionally */ 145bf215546Sopenharmony_ci if (ins->src[1].type == BI_INDEX_CONSTANT) 146bf215546Sopenharmony_ci ctx->ubo_mask |= BITSET_BIT(ubo); 147bf215546Sopenharmony_ci else 148bf215546Sopenharmony_ci ctx->ubo_mask = ~0; 149bf215546Sopenharmony_ci 150bf215546Sopenharmony_ci continue; 151bf215546Sopenharmony_ci } 152bf215546Sopenharmony_ci 153bf215546Sopenharmony_ci /* Check if we decided to push this */ 154bf215546Sopenharmony_ci assert(ubo < analysis.nr_blocks); 155bf215546Sopenharmony_ci if (!BITSET_TEST(analysis.blocks[ubo].pushed, offset / 4)) { 156bf215546Sopenharmony_ci ctx->ubo_mask |= BITSET_BIT(ubo); 157bf215546Sopenharmony_ci continue; 158bf215546Sopenharmony_ci } 159bf215546Sopenharmony_ci 160bf215546Sopenharmony_ci /* Replace the UBO load with moves from FAU */ 161bf215546Sopenharmony_ci bi_builder b = bi_init_builder(ctx, bi_after_instr(ins)); 162bf215546Sopenharmony_ci 163bf215546Sopenharmony_ci bi_instr *vec = bi_collect_i32_to(&b, ins->dest[0]); 164bf215546Sopenharmony_ci vec->nr_srcs = bi_opcode_props[ins->op].sr_count; 165bf215546Sopenharmony_ci 166bf215546Sopenharmony_ci for (unsigned w = 0; w < vec->nr_srcs; ++w) { 167bf215546Sopenharmony_ci /* FAU is grouped in pairs (2 x 4-byte) */ 168bf215546Sopenharmony_ci unsigned base = 169bf215546Sopenharmony_ci pan_lookup_pushed_ubo(ctx->info.push, ubo, 170bf215546Sopenharmony_ci (offset + 4 * w)); 171bf215546Sopenharmony_ci 172bf215546Sopenharmony_ci unsigned fau_idx = (base >> 1); 173bf215546Sopenharmony_ci unsigned fau_hi = (base & 1); 174bf215546Sopenharmony_ci 175bf215546Sopenharmony_ci vec->src[w] = bi_fau(BIR_FAU_UNIFORM | fau_idx, fau_hi); 176bf215546Sopenharmony_ci } 177bf215546Sopenharmony_ci 178bf215546Sopenharmony_ci bi_remove_instruction(ins); 179bf215546Sopenharmony_ci } 180bf215546Sopenharmony_ci 181bf215546Sopenharmony_ci free(analysis.blocks); 182bf215546Sopenharmony_ci} 183bf215546Sopenharmony_ci 184bf215546Sopenharmony_citypedef struct { 185bf215546Sopenharmony_ci BITSET_DECLARE(row, PAN_MAX_PUSH); 186bf215546Sopenharmony_ci} adjacency_row; 187bf215546Sopenharmony_ci 188bf215546Sopenharmony_ci/* Find the connected component containing `node` with depth-first search */ 189bf215546Sopenharmony_cistatic void 190bf215546Sopenharmony_cibi_find_component(adjacency_row *adjacency, BITSET_WORD *visited, 191bf215546Sopenharmony_ci unsigned *component, unsigned *size, unsigned node) 192bf215546Sopenharmony_ci{ 193bf215546Sopenharmony_ci unsigned neighbour; 194bf215546Sopenharmony_ci 195bf215546Sopenharmony_ci BITSET_SET(visited, node); 196bf215546Sopenharmony_ci component[(*size)++] = node; 197bf215546Sopenharmony_ci 198bf215546Sopenharmony_ci BITSET_FOREACH_SET(neighbour, adjacency[node].row, PAN_MAX_PUSH) { 199bf215546Sopenharmony_ci if (!BITSET_TEST(visited, neighbour)) { 200bf215546Sopenharmony_ci bi_find_component(adjacency, visited, component, size, 201bf215546Sopenharmony_ci neighbour); 202bf215546Sopenharmony_ci } 203bf215546Sopenharmony_ci } 204bf215546Sopenharmony_ci} 205bf215546Sopenharmony_ci 206bf215546Sopenharmony_cistatic bool 207bf215546Sopenharmony_cibi_is_uniform(bi_index idx) 208bf215546Sopenharmony_ci{ 209bf215546Sopenharmony_ci return (idx.type == BI_INDEX_FAU) && (idx.value & BIR_FAU_UNIFORM); 210bf215546Sopenharmony_ci} 211bf215546Sopenharmony_ci 212bf215546Sopenharmony_ci/* Get the index of a uniform in 32-bit words from the start of FAU-RAM */ 213bf215546Sopenharmony_cistatic unsigned 214bf215546Sopenharmony_cibi_uniform_word(bi_index idx) 215bf215546Sopenharmony_ci{ 216bf215546Sopenharmony_ci assert(bi_is_uniform(idx)); 217bf215546Sopenharmony_ci assert(idx.offset <= 1); 218bf215546Sopenharmony_ci 219bf215546Sopenharmony_ci return ((idx.value & ~BIR_FAU_UNIFORM) << 1) | idx.offset; 220bf215546Sopenharmony_ci} 221bf215546Sopenharmony_ci 222bf215546Sopenharmony_ci/* 223bf215546Sopenharmony_ci * Create an undirected graph where nodes are 32-bit uniform indices and edges 224bf215546Sopenharmony_ci * represent that two nodes are used in the same instruction. 225bf215546Sopenharmony_ci * 226bf215546Sopenharmony_ci * The graph is constructed as an adjacency matrix stored in adjacency. 227bf215546Sopenharmony_ci */ 228bf215546Sopenharmony_cistatic void 229bf215546Sopenharmony_cibi_create_fau_interference_graph(bi_context *ctx, adjacency_row *adjacency) 230bf215546Sopenharmony_ci{ 231bf215546Sopenharmony_ci bi_foreach_instr_global(ctx, I) { 232bf215546Sopenharmony_ci unsigned nodes[BI_MAX_SRCS] = {}; 233bf215546Sopenharmony_ci unsigned node_count = 0; 234bf215546Sopenharmony_ci 235bf215546Sopenharmony_ci /* Set nodes[] to 32-bit uniforms accessed */ 236bf215546Sopenharmony_ci bi_foreach_src(I, s) { 237bf215546Sopenharmony_ci if (bi_is_uniform(I->src[s])) { 238bf215546Sopenharmony_ci unsigned word = bi_uniform_word(I->src[s]); 239bf215546Sopenharmony_ci 240bf215546Sopenharmony_ci if (word >= ctx->info.push_offset) 241bf215546Sopenharmony_ci nodes[node_count++] = word; 242bf215546Sopenharmony_ci } 243bf215546Sopenharmony_ci } 244bf215546Sopenharmony_ci 245bf215546Sopenharmony_ci /* Create clique connecting nodes[] */ 246bf215546Sopenharmony_ci for (unsigned i = 0; i < node_count; ++i) { 247bf215546Sopenharmony_ci for (unsigned j = 0; j < node_count; ++j) { 248bf215546Sopenharmony_ci if (i == j) 249bf215546Sopenharmony_ci continue; 250bf215546Sopenharmony_ci 251bf215546Sopenharmony_ci unsigned x = nodes[i], y = nodes[j]; 252bf215546Sopenharmony_ci assert(MAX2(x, y) < ctx->info.push->count); 253bf215546Sopenharmony_ci 254bf215546Sopenharmony_ci /* Add undirected edge between the nodes */ 255bf215546Sopenharmony_ci BITSET_SET(adjacency[x].row, y); 256bf215546Sopenharmony_ci BITSET_SET(adjacency[y].row, x); 257bf215546Sopenharmony_ci } 258bf215546Sopenharmony_ci } 259bf215546Sopenharmony_ci } 260bf215546Sopenharmony_ci} 261bf215546Sopenharmony_ci 262bf215546Sopenharmony_ci/* 263bf215546Sopenharmony_ci * Optimization pass to reorder uniforms. The goal is to reduce the number of 264bf215546Sopenharmony_ci * moves we emit when lowering FAU. The pass groups uniforms used by the same 265bf215546Sopenharmony_ci * instruction. 266bf215546Sopenharmony_ci * 267bf215546Sopenharmony_ci * The pass works by creating a graph of pushed uniforms, where edges denote the 268bf215546Sopenharmony_ci * "both 32-bit uniforms required by the same instruction" relationship. We 269bf215546Sopenharmony_ci * perform depth-first search on this graph to find the connected components, 270bf215546Sopenharmony_ci * where each connected component is a cluster of uniforms that are used 271bf215546Sopenharmony_ci * together. We then select pairs of uniforms from each connected component. 272bf215546Sopenharmony_ci * The remaining unpaired uniforms (from components of odd sizes) are paired 273bf215546Sopenharmony_ci * together arbitrarily. 274bf215546Sopenharmony_ci * 275bf215546Sopenharmony_ci * After a new ordering is selected, pushed uniforms in the program and the 276bf215546Sopenharmony_ci * panfrost_ubo_push data structure must be remapped to use the new ordering. 277bf215546Sopenharmony_ci */ 278bf215546Sopenharmony_civoid 279bf215546Sopenharmony_cibi_opt_reorder_push(bi_context *ctx) 280bf215546Sopenharmony_ci{ 281bf215546Sopenharmony_ci adjacency_row adjacency[PAN_MAX_PUSH] = { 0 }; 282bf215546Sopenharmony_ci BITSET_DECLARE(visited, PAN_MAX_PUSH) = { 0 }; 283bf215546Sopenharmony_ci 284bf215546Sopenharmony_ci unsigned ordering[PAN_MAX_PUSH] = { 0 }; 285bf215546Sopenharmony_ci unsigned unpaired[PAN_MAX_PUSH] = { 0 }; 286bf215546Sopenharmony_ci unsigned pushed = 0, unpaired_count = 0; 287bf215546Sopenharmony_ci 288bf215546Sopenharmony_ci struct panfrost_ubo_push *push = ctx->info.push; 289bf215546Sopenharmony_ci unsigned push_offset = ctx->info.push_offset; 290bf215546Sopenharmony_ci 291bf215546Sopenharmony_ci bi_create_fau_interference_graph(ctx, adjacency); 292bf215546Sopenharmony_ci 293bf215546Sopenharmony_ci for (unsigned i = push_offset; i < push->count; ++i) { 294bf215546Sopenharmony_ci if (BITSET_TEST(visited, i)) continue; 295bf215546Sopenharmony_ci 296bf215546Sopenharmony_ci unsigned component[PAN_MAX_PUSH] = { 0 }; 297bf215546Sopenharmony_ci unsigned size = 0; 298bf215546Sopenharmony_ci bi_find_component(adjacency, visited, component, &size, i); 299bf215546Sopenharmony_ci 300bf215546Sopenharmony_ci /* If there is an odd number of uses, at least one use must be 301bf215546Sopenharmony_ci * unpaired. Arbitrarily take the last one. 302bf215546Sopenharmony_ci */ 303bf215546Sopenharmony_ci if (size % 2) 304bf215546Sopenharmony_ci unpaired[unpaired_count++] = component[--size]; 305bf215546Sopenharmony_ci 306bf215546Sopenharmony_ci /* The rest of uses are paired */ 307bf215546Sopenharmony_ci assert((size % 2) == 0); 308bf215546Sopenharmony_ci 309bf215546Sopenharmony_ci /* Push the paired uses */ 310bf215546Sopenharmony_ci memcpy(ordering + pushed, component, sizeof(unsigned) * size); 311bf215546Sopenharmony_ci pushed += size; 312bf215546Sopenharmony_ci } 313bf215546Sopenharmony_ci 314bf215546Sopenharmony_ci /* Push unpaired nodes at the end */ 315bf215546Sopenharmony_ci memcpy(ordering + pushed, unpaired, sizeof(unsigned) * unpaired_count); 316bf215546Sopenharmony_ci pushed += unpaired_count; 317bf215546Sopenharmony_ci 318bf215546Sopenharmony_ci /* Ordering is a permutation. Invert it for O(1) lookup. */ 319bf215546Sopenharmony_ci unsigned old_to_new[PAN_MAX_PUSH] = { 0 }; 320bf215546Sopenharmony_ci 321bf215546Sopenharmony_ci for (unsigned i = 0; i < push_offset; ++i) { 322bf215546Sopenharmony_ci old_to_new[i] = i; 323bf215546Sopenharmony_ci } 324bf215546Sopenharmony_ci 325bf215546Sopenharmony_ci for (unsigned i = 0; i < pushed; ++i) { 326bf215546Sopenharmony_ci assert(ordering[i] >= push_offset); 327bf215546Sopenharmony_ci old_to_new[ordering[i]] = push_offset + i; 328bf215546Sopenharmony_ci } 329bf215546Sopenharmony_ci 330bf215546Sopenharmony_ci /* Use new ordering throughout the program */ 331bf215546Sopenharmony_ci bi_foreach_instr_global(ctx, I) { 332bf215546Sopenharmony_ci bi_foreach_src(I, s) { 333bf215546Sopenharmony_ci if (bi_is_uniform(I->src[s])) { 334bf215546Sopenharmony_ci unsigned node = bi_uniform_word(I->src[s]); 335bf215546Sopenharmony_ci unsigned new_node = old_to_new[node]; 336bf215546Sopenharmony_ci I->src[s].value = BIR_FAU_UNIFORM | (new_node >> 1); 337bf215546Sopenharmony_ci I->src[s].offset = new_node & 1; 338bf215546Sopenharmony_ci } 339bf215546Sopenharmony_ci } 340bf215546Sopenharmony_ci } 341bf215546Sopenharmony_ci 342bf215546Sopenharmony_ci /* Use new ordering for push */ 343bf215546Sopenharmony_ci struct panfrost_ubo_push old = *push; 344bf215546Sopenharmony_ci for (unsigned i = 0; i < pushed; ++i) 345bf215546Sopenharmony_ci push->words[push_offset + i] = old.words[ordering[i]]; 346bf215546Sopenharmony_ci 347bf215546Sopenharmony_ci push->count = push_offset + pushed; 348bf215546Sopenharmony_ci} 349