1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright (C) 2022 Collabora Ltd. 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20bf215546Sopenharmony_ci * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21bf215546Sopenharmony_ci * SOFTWARE. 22bf215546Sopenharmony_ci * 23bf215546Sopenharmony_ci * Authors (Collabora): 24bf215546Sopenharmony_ci * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> 25bf215546Sopenharmony_ci */ 26bf215546Sopenharmony_ci 27bf215546Sopenharmony_ci/* Bottom-up local scheduler to reduce register pressure */ 28bf215546Sopenharmony_ci 29bf215546Sopenharmony_ci#include "compiler.h" 30bf215546Sopenharmony_ci#include "util/dag.h" 31bf215546Sopenharmony_ci 32bf215546Sopenharmony_cistruct sched_ctx { 33bf215546Sopenharmony_ci /* Dependency graph */ 34bf215546Sopenharmony_ci struct dag *dag; 35bf215546Sopenharmony_ci 36bf215546Sopenharmony_ci /* Live set */ 37bf215546Sopenharmony_ci uint8_t *live; 38bf215546Sopenharmony_ci 39bf215546Sopenharmony_ci /* Size of the live set */ 40bf215546Sopenharmony_ci unsigned max; 41bf215546Sopenharmony_ci}; 42bf215546Sopenharmony_ci 43bf215546Sopenharmony_cistruct sched_node { 44bf215546Sopenharmony_ci struct dag_node dag; 45bf215546Sopenharmony_ci 46bf215546Sopenharmony_ci /* Instruction this node represents */ 47bf215546Sopenharmony_ci bi_instr *instr; 48bf215546Sopenharmony_ci}; 49bf215546Sopenharmony_ci 50bf215546Sopenharmony_cistatic unsigned 51bf215546Sopenharmony_cilabel_index(bi_context *ctx, bi_index idx) 52bf215546Sopenharmony_ci{ 53bf215546Sopenharmony_ci if (idx.reg) { 54bf215546Sopenharmony_ci assert(idx.value < ctx->reg_alloc); 55bf215546Sopenharmony_ci return idx.value + ctx->ssa_alloc; 56bf215546Sopenharmony_ci } else { 57bf215546Sopenharmony_ci assert(idx.value < ctx->ssa_alloc); 58bf215546Sopenharmony_ci return idx.value; 59bf215546Sopenharmony_ci } 60bf215546Sopenharmony_ci} 61bf215546Sopenharmony_ci 62bf215546Sopenharmony_cistatic void 63bf215546Sopenharmony_ciadd_dep(struct sched_node *a, struct sched_node *b) 64bf215546Sopenharmony_ci{ 65bf215546Sopenharmony_ci if (a && b) 66bf215546Sopenharmony_ci dag_add_edge(&a->dag, &b->dag, 0); 67bf215546Sopenharmony_ci} 68bf215546Sopenharmony_ci 69bf215546Sopenharmony_cistatic struct dag * 70bf215546Sopenharmony_cicreate_dag(bi_context *ctx, bi_block *block, void *memctx) 71bf215546Sopenharmony_ci{ 72bf215546Sopenharmony_ci struct dag *dag = dag_create(ctx); 73bf215546Sopenharmony_ci 74bf215546Sopenharmony_ci unsigned count = ctx->ssa_alloc + ctx->reg_alloc; 75bf215546Sopenharmony_ci struct sched_node **last_read = 76bf215546Sopenharmony_ci calloc(count, sizeof(struct sched_node *)); 77bf215546Sopenharmony_ci struct sched_node **last_write = 78bf215546Sopenharmony_ci calloc(count, sizeof(struct sched_node *)); 79bf215546Sopenharmony_ci struct sched_node *coverage = NULL; 80bf215546Sopenharmony_ci struct sched_node *preload = NULL; 81bf215546Sopenharmony_ci 82bf215546Sopenharmony_ci /* Last memory load, to serialize stores against */ 83bf215546Sopenharmony_ci struct sched_node *memory_load = NULL; 84bf215546Sopenharmony_ci 85bf215546Sopenharmony_ci /* Last memory store, to serialize loads and stores against */ 86bf215546Sopenharmony_ci struct sched_node *memory_store = NULL; 87bf215546Sopenharmony_ci 88bf215546Sopenharmony_ci bi_foreach_instr_in_block(block, I) { 89bf215546Sopenharmony_ci /* Leave branches at the end */ 90bf215546Sopenharmony_ci if (I->op == BI_OPCODE_JUMP || bi_opcode_props[I->op].branch) 91bf215546Sopenharmony_ci break; 92bf215546Sopenharmony_ci 93bf215546Sopenharmony_ci assert(I->branch_target == NULL); 94bf215546Sopenharmony_ci 95bf215546Sopenharmony_ci struct sched_node *node = rzalloc(memctx, struct sched_node); 96bf215546Sopenharmony_ci node->instr = I; 97bf215546Sopenharmony_ci dag_init_node(dag, &node->dag); 98bf215546Sopenharmony_ci 99bf215546Sopenharmony_ci /* Reads depend on writes */ 100bf215546Sopenharmony_ci bi_foreach_src(I, s) { 101bf215546Sopenharmony_ci bi_index src = I->src[s]; 102bf215546Sopenharmony_ci 103bf215546Sopenharmony_ci if (src.type == BI_INDEX_NORMAL) { 104bf215546Sopenharmony_ci add_dep(node, last_write[label_index(ctx, src)]); 105bf215546Sopenharmony_ci 106bf215546Sopenharmony_ci /* Serialize access to nir_register for 107bf215546Sopenharmony_ci * simplicity. We could do better. 108bf215546Sopenharmony_ci */ 109bf215546Sopenharmony_ci if (src.reg) 110bf215546Sopenharmony_ci add_dep(node, last_read[label_index(ctx, src)]); 111bf215546Sopenharmony_ci } 112bf215546Sopenharmony_ci } 113bf215546Sopenharmony_ci 114bf215546Sopenharmony_ci /* Writes depend on reads and writes */ 115bf215546Sopenharmony_ci bi_foreach_dest(I, s) { 116bf215546Sopenharmony_ci bi_index dest = I->dest[s]; 117bf215546Sopenharmony_ci 118bf215546Sopenharmony_ci if (dest.type == BI_INDEX_NORMAL) { 119bf215546Sopenharmony_ci add_dep(node, last_read[label_index(ctx, dest)]); 120bf215546Sopenharmony_ci add_dep(node, last_write[label_index(ctx, dest)]); 121bf215546Sopenharmony_ci 122bf215546Sopenharmony_ci last_write[label_index(ctx, dest)] = node; 123bf215546Sopenharmony_ci } 124bf215546Sopenharmony_ci } 125bf215546Sopenharmony_ci 126bf215546Sopenharmony_ci bi_foreach_src(I, s) { 127bf215546Sopenharmony_ci bi_index src = I->src[s]; 128bf215546Sopenharmony_ci 129bf215546Sopenharmony_ci if (src.type == BI_INDEX_NORMAL) { 130bf215546Sopenharmony_ci last_read[label_index(ctx, src)] = node; 131bf215546Sopenharmony_ci } 132bf215546Sopenharmony_ci } 133bf215546Sopenharmony_ci 134bf215546Sopenharmony_ci switch (bi_opcode_props[I->op].message) { 135bf215546Sopenharmony_ci case BIFROST_MESSAGE_LOAD: 136bf215546Sopenharmony_ci /* Regular memory loads needs to be serialized against 137bf215546Sopenharmony_ci * other memory access. However, UBO memory is read-only 138bf215546Sopenharmony_ci * so it can be moved around freely. 139bf215546Sopenharmony_ci */ 140bf215546Sopenharmony_ci if (I->seg != BI_SEG_UBO) { 141bf215546Sopenharmony_ci add_dep(node, memory_store); 142bf215546Sopenharmony_ci memory_load = node; 143bf215546Sopenharmony_ci } 144bf215546Sopenharmony_ci 145bf215546Sopenharmony_ci break; 146bf215546Sopenharmony_ci 147bf215546Sopenharmony_ci case BIFROST_MESSAGE_ATTRIBUTE: 148bf215546Sopenharmony_ci /* Regular attribute loads can be reordered, but 149bf215546Sopenharmony_ci * writeable attributes can't be. Our one use of 150bf215546Sopenharmony_ci * writeable attributes are images. 151bf215546Sopenharmony_ci */ 152bf215546Sopenharmony_ci if ((I->op == BI_OPCODE_LD_TEX) || 153bf215546Sopenharmony_ci (I->op == BI_OPCODE_LD_TEX_IMM) || 154bf215546Sopenharmony_ci (I->op == BI_OPCODE_LD_ATTR_TEX)) { 155bf215546Sopenharmony_ci add_dep(node, memory_store); 156bf215546Sopenharmony_ci memory_load = node; 157bf215546Sopenharmony_ci } 158bf215546Sopenharmony_ci 159bf215546Sopenharmony_ci break; 160bf215546Sopenharmony_ci 161bf215546Sopenharmony_ci case BIFROST_MESSAGE_STORE: 162bf215546Sopenharmony_ci assert(I->seg != BI_SEG_UBO); 163bf215546Sopenharmony_ci add_dep(node, memory_load); 164bf215546Sopenharmony_ci add_dep(node, memory_store); 165bf215546Sopenharmony_ci memory_store = node; 166bf215546Sopenharmony_ci break; 167bf215546Sopenharmony_ci 168bf215546Sopenharmony_ci case BIFROST_MESSAGE_ATOMIC: 169bf215546Sopenharmony_ci case BIFROST_MESSAGE_BARRIER: 170bf215546Sopenharmony_ci add_dep(node, memory_load); 171bf215546Sopenharmony_ci add_dep(node, memory_store); 172bf215546Sopenharmony_ci memory_load = node; 173bf215546Sopenharmony_ci memory_store = node; 174bf215546Sopenharmony_ci break; 175bf215546Sopenharmony_ci 176bf215546Sopenharmony_ci case BIFROST_MESSAGE_BLEND: 177bf215546Sopenharmony_ci case BIFROST_MESSAGE_Z_STENCIL: 178bf215546Sopenharmony_ci case BIFROST_MESSAGE_TILE: 179bf215546Sopenharmony_ci add_dep(node, coverage); 180bf215546Sopenharmony_ci coverage = node; 181bf215546Sopenharmony_ci break; 182bf215546Sopenharmony_ci 183bf215546Sopenharmony_ci case BIFROST_MESSAGE_ATEST: 184bf215546Sopenharmony_ci /* ATEST signals the end of shader side effects */ 185bf215546Sopenharmony_ci add_dep(node, memory_store); 186bf215546Sopenharmony_ci memory_store = node; 187bf215546Sopenharmony_ci 188bf215546Sopenharmony_ci /* ATEST also updates coverage */ 189bf215546Sopenharmony_ci add_dep(node, coverage); 190bf215546Sopenharmony_ci coverage = node; 191bf215546Sopenharmony_ci break; 192bf215546Sopenharmony_ci default: 193bf215546Sopenharmony_ci break; 194bf215546Sopenharmony_ci } 195bf215546Sopenharmony_ci 196bf215546Sopenharmony_ci add_dep(node, preload); 197bf215546Sopenharmony_ci 198bf215546Sopenharmony_ci if (I->op == BI_OPCODE_DISCARD_F32) { 199bf215546Sopenharmony_ci /* Serialize against ATEST */ 200bf215546Sopenharmony_ci add_dep(node, coverage); 201bf215546Sopenharmony_ci coverage = node; 202bf215546Sopenharmony_ci 203bf215546Sopenharmony_ci /* Also serialize against memory and barriers */ 204bf215546Sopenharmony_ci add_dep(node, memory_load); 205bf215546Sopenharmony_ci add_dep(node, memory_store); 206bf215546Sopenharmony_ci memory_load = node; 207bf215546Sopenharmony_ci memory_store = node; 208bf215546Sopenharmony_ci } else if (I->op == BI_OPCODE_MOV_I32 && I->src[0].type == BI_INDEX_REGISTER) { 209bf215546Sopenharmony_ci preload = node; 210bf215546Sopenharmony_ci } 211bf215546Sopenharmony_ci } 212bf215546Sopenharmony_ci 213bf215546Sopenharmony_ci free(last_read); 214bf215546Sopenharmony_ci free(last_write); 215bf215546Sopenharmony_ci 216bf215546Sopenharmony_ci return dag; 217bf215546Sopenharmony_ci} 218bf215546Sopenharmony_ci 219bf215546Sopenharmony_ci/* 220bf215546Sopenharmony_ci * Calculate the change in register pressure from scheduling a given 221bf215546Sopenharmony_ci * instruction. Equivalently, calculate the difference in the number of live 222bf215546Sopenharmony_ci * registers before and after the instruction, given the live set after the 223bf215546Sopenharmony_ci * instruction. This calculation follows immediately from the dataflow 224bf215546Sopenharmony_ci * definition of liveness: 225bf215546Sopenharmony_ci * 226bf215546Sopenharmony_ci * live_in = (live_out - KILL) + GEN 227bf215546Sopenharmony_ci */ 228bf215546Sopenharmony_cistatic signed 229bf215546Sopenharmony_cicalculate_pressure_delta(bi_instr *I, uint8_t *live, unsigned max) 230bf215546Sopenharmony_ci{ 231bf215546Sopenharmony_ci signed delta = 0; 232bf215546Sopenharmony_ci 233bf215546Sopenharmony_ci /* Destinations must be unique */ 234bf215546Sopenharmony_ci bi_foreach_dest(I, d) { 235bf215546Sopenharmony_ci unsigned node = bi_get_node(I->dest[d]); 236bf215546Sopenharmony_ci 237bf215546Sopenharmony_ci if (node < max && live[node]) 238bf215546Sopenharmony_ci delta -= bi_count_write_registers(I, d); 239bf215546Sopenharmony_ci } 240bf215546Sopenharmony_ci 241bf215546Sopenharmony_ci bi_foreach_src(I, src) { 242bf215546Sopenharmony_ci unsigned node = bi_get_node(I->src[src]); 243bf215546Sopenharmony_ci if (node >= max) 244bf215546Sopenharmony_ci continue; 245bf215546Sopenharmony_ci 246bf215546Sopenharmony_ci /* Filter duplicates */ 247bf215546Sopenharmony_ci bool dupe = false; 248bf215546Sopenharmony_ci 249bf215546Sopenharmony_ci for (unsigned i = 0; i < src; ++i) { 250bf215546Sopenharmony_ci if (bi_get_node(I->src[i]) == node) { 251bf215546Sopenharmony_ci dupe = true; 252bf215546Sopenharmony_ci break; 253bf215546Sopenharmony_ci } 254bf215546Sopenharmony_ci } 255bf215546Sopenharmony_ci 256bf215546Sopenharmony_ci if (!dupe && !live[node]) 257bf215546Sopenharmony_ci delta += bi_count_read_registers(I, src); 258bf215546Sopenharmony_ci } 259bf215546Sopenharmony_ci 260bf215546Sopenharmony_ci return delta; 261bf215546Sopenharmony_ci} 262bf215546Sopenharmony_ci 263bf215546Sopenharmony_ci/* 264bf215546Sopenharmony_ci * Choose the next instruction, bottom-up. For now we use a simple greedy 265bf215546Sopenharmony_ci * heuristic: choose the instruction that has the best effect on liveness. 266bf215546Sopenharmony_ci */ 267bf215546Sopenharmony_cistatic struct sched_node * 268bf215546Sopenharmony_cichoose_instr(struct sched_ctx *s) 269bf215546Sopenharmony_ci{ 270bf215546Sopenharmony_ci int32_t min_delta = INT32_MAX; 271bf215546Sopenharmony_ci struct sched_node *best = NULL; 272bf215546Sopenharmony_ci 273bf215546Sopenharmony_ci list_for_each_entry(struct sched_node, n, &s->dag->heads, dag.link) { 274bf215546Sopenharmony_ci int32_t delta = calculate_pressure_delta(n->instr, s->live, s->max); 275bf215546Sopenharmony_ci 276bf215546Sopenharmony_ci if (delta < min_delta) { 277bf215546Sopenharmony_ci best = n; 278bf215546Sopenharmony_ci min_delta = delta; 279bf215546Sopenharmony_ci } 280bf215546Sopenharmony_ci } 281bf215546Sopenharmony_ci 282bf215546Sopenharmony_ci return best; 283bf215546Sopenharmony_ci} 284bf215546Sopenharmony_ci 285bf215546Sopenharmony_cistatic void 286bf215546Sopenharmony_cipressure_schedule_block(bi_context *ctx, bi_block *block, struct sched_ctx *s) 287bf215546Sopenharmony_ci{ 288bf215546Sopenharmony_ci /* off by a constant, that's ok */ 289bf215546Sopenharmony_ci signed pressure = 0; 290bf215546Sopenharmony_ci signed orig_max_pressure = 0; 291bf215546Sopenharmony_ci unsigned nr_ins = 0; 292bf215546Sopenharmony_ci 293bf215546Sopenharmony_ci memcpy(s->live, block->live_out, s->max); 294bf215546Sopenharmony_ci 295bf215546Sopenharmony_ci bi_foreach_instr_in_block_rev(block, I) { 296bf215546Sopenharmony_ci pressure += calculate_pressure_delta(I, s->live, s->max); 297bf215546Sopenharmony_ci orig_max_pressure = MAX2(pressure, orig_max_pressure); 298bf215546Sopenharmony_ci bi_liveness_ins_update(s->live, I, s->max); 299bf215546Sopenharmony_ci nr_ins++; 300bf215546Sopenharmony_ci } 301bf215546Sopenharmony_ci 302bf215546Sopenharmony_ci memcpy(s->live, block->live_out, s->max); 303bf215546Sopenharmony_ci 304bf215546Sopenharmony_ci /* off by a constant, that's ok */ 305bf215546Sopenharmony_ci signed max_pressure = 0; 306bf215546Sopenharmony_ci pressure = 0; 307bf215546Sopenharmony_ci 308bf215546Sopenharmony_ci struct sched_node **schedule = calloc(nr_ins, sizeof(struct sched_node *)); 309bf215546Sopenharmony_ci nr_ins = 0; 310bf215546Sopenharmony_ci 311bf215546Sopenharmony_ci while (!list_is_empty(&s->dag->heads)) { 312bf215546Sopenharmony_ci struct sched_node *node = choose_instr(s); 313bf215546Sopenharmony_ci pressure += calculate_pressure_delta(node->instr, s->live, s->max); 314bf215546Sopenharmony_ci max_pressure = MAX2(pressure, max_pressure); 315bf215546Sopenharmony_ci dag_prune_head(s->dag, &node->dag); 316bf215546Sopenharmony_ci 317bf215546Sopenharmony_ci schedule[nr_ins++] = node; 318bf215546Sopenharmony_ci bi_liveness_ins_update(s->live, node->instr, s->max); 319bf215546Sopenharmony_ci } 320bf215546Sopenharmony_ci 321bf215546Sopenharmony_ci /* Bail if it looks like it's worse */ 322bf215546Sopenharmony_ci if (max_pressure >= orig_max_pressure) { 323bf215546Sopenharmony_ci free(schedule); 324bf215546Sopenharmony_ci return; 325bf215546Sopenharmony_ci } 326bf215546Sopenharmony_ci 327bf215546Sopenharmony_ci /* Apply the schedule */ 328bf215546Sopenharmony_ci for (unsigned i = 0; i < nr_ins; ++i) { 329bf215546Sopenharmony_ci bi_remove_instruction(schedule[i]->instr); 330bf215546Sopenharmony_ci list_add(&schedule[i]->instr->link, &block->instructions); 331bf215546Sopenharmony_ci } 332bf215546Sopenharmony_ci 333bf215546Sopenharmony_ci free(schedule); 334bf215546Sopenharmony_ci} 335bf215546Sopenharmony_ci 336bf215546Sopenharmony_civoid 337bf215546Sopenharmony_cibi_pressure_schedule(bi_context *ctx) 338bf215546Sopenharmony_ci{ 339bf215546Sopenharmony_ci bi_compute_liveness(ctx); 340bf215546Sopenharmony_ci unsigned temp_count = bi_max_temp(ctx); 341bf215546Sopenharmony_ci void *memctx = ralloc_context(ctx); 342bf215546Sopenharmony_ci uint8_t *live = ralloc_array(memctx, uint8_t, temp_count); 343bf215546Sopenharmony_ci 344bf215546Sopenharmony_ci bi_foreach_block(ctx, block) { 345bf215546Sopenharmony_ci struct sched_ctx sctx = { 346bf215546Sopenharmony_ci .dag = create_dag(ctx, block, memctx), 347bf215546Sopenharmony_ci .max = temp_count, 348bf215546Sopenharmony_ci .live = live 349bf215546Sopenharmony_ci }; 350bf215546Sopenharmony_ci 351bf215546Sopenharmony_ci pressure_schedule_block(ctx, block, &sctx); 352bf215546Sopenharmony_ci } 353bf215546Sopenharmony_ci 354bf215546Sopenharmony_ci ralloc_free(memctx); 355bf215546Sopenharmony_ci} 356