/*
 * Copyright (C) 2022 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *    Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

/* Bottom-up local scheduler to reduce register pressure */

#include "compiler.h"
#include "util/dag.h"

struct sched_ctx {
   /* Dependency graph */
   struct dag *dag;

   /* Live set */
   uint8_t *live;

   /* Size of the live set */
   unsigned max;
};

struct sched_node {
   struct dag_node dag;

   /* Instruction this node represents */
   bi_instr *instr;
};

static unsigned
label_index(bi_context *ctx, bi_index idx)
{
   if (idx.reg) {
      assert(idx.value < ctx->reg_alloc);
      return idx.value + ctx->ssa_alloc;
   } else {
      assert(idx.value < ctx->ssa_alloc);
      return idx.value;
   }
}

static void
add_dep(struct sched_node *a, struct sched_node *b)
{
   if (a && b)
      dag_add_edge(&a->dag, &b->dag, 0);
}
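
/*
 * Build the dependency graph for a block: one node per instruction (branches
 * are excluded and stay at the end of the block), with edges for
 * read-after-write, write-after-read, and write-after-write hazards on SSA
 * values and registers, plus edges serializing side effects (memory access,
 * coverage/blend messages, discards, and register preloads) that must not be
 * reordered across one another.
 */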
static struct dag *
create_dag(bi_context *ctx, bi_block *block, void *memctx)
{
   struct dag *dag = dag_create(ctx);

   unsigned count = ctx->ssa_alloc + ctx->reg_alloc;
   struct sched_node **last_read =
      calloc(count, sizeof(struct sched_node *));
   struct sched_node **last_write =
      calloc(count, sizeof(struct sched_node *));
   struct sched_node *coverage = NULL;
   struct sched_node *preload = NULL;

   /* Last memory load, to serialize stores against */
   struct sched_node *memory_load = NULL;

   /* Last memory store, to serialize loads and stores against */
   struct sched_node *memory_store = NULL;

   bi_foreach_instr_in_block(block, I) {
      /* Leave branches at the end */
      if (I->op == BI_OPCODE_JUMP || bi_opcode_props[I->op].branch)
         break;

      assert(I->branch_target == NULL);

      struct sched_node *node = rzalloc(memctx, struct sched_node);
      node->instr = I;
      dag_init_node(dag, &node->dag);

      /* Reads depend on writes */
      bi_foreach_src(I, s) {
         bi_index src = I->src[s];

         if (src.type == BI_INDEX_NORMAL) {
            add_dep(node, last_write[label_index(ctx, src)]);

            /* Serialize access to nir_register for
             * simplicity. We could do better.
             */
            if (src.reg)
               add_dep(node, last_read[label_index(ctx, src)]);
         }
      }

      /* Writes depend on reads and writes */
      bi_foreach_dest(I, s) {
         bi_index dest = I->dest[s];

         if (dest.type == BI_INDEX_NORMAL) {
            add_dep(node, last_read[label_index(ctx, dest)]);
            add_dep(node, last_write[label_index(ctx, dest)]);

            last_write[label_index(ctx, dest)] = node;
         }
      }

      bi_foreach_src(I, s) {
         bi_index src = I->src[s];

         if (src.type == BI_INDEX_NORMAL) {
            last_read[label_index(ctx, src)] = node;
         }
      }

      switch (bi_opcode_props[I->op].message) {
      case BIFROST_MESSAGE_LOAD:
         /* Regular memory loads need to be serialized against
          * other memory access. However, UBO memory is read-only
          * so it can be moved around freely.
          */
         if (I->seg != BI_SEG_UBO) {
            add_dep(node, memory_store);
            memory_load = node;
         }

         break;

      case BIFROST_MESSAGE_ATTRIBUTE:
         /* Regular attribute loads can be reordered, but
          * writeable attributes can't be. Our one use of
          * writeable attributes is images.
          */
         if ((I->op == BI_OPCODE_LD_TEX) ||
             (I->op == BI_OPCODE_LD_TEX_IMM) ||
             (I->op == BI_OPCODE_LD_ATTR_TEX)) {
            add_dep(node, memory_store);
            memory_load = node;
         }

         break;

      case BIFROST_MESSAGE_STORE:
         assert(I->seg != BI_SEG_UBO);
         add_dep(node, memory_load);
         add_dep(node, memory_store);
         memory_store = node;
         break;

      case BIFROST_MESSAGE_ATOMIC:
      case BIFROST_MESSAGE_BARRIER:
         add_dep(node, memory_load);
         add_dep(node, memory_store);
         memory_load = node;
         memory_store = node;
         break;

      case BIFROST_MESSAGE_BLEND:
      case BIFROST_MESSAGE_Z_STENCIL:
      case BIFROST_MESSAGE_TILE:
         add_dep(node, coverage);
         coverage = node;
         break;

      case BIFROST_MESSAGE_ATEST:
         /* ATEST signals the end of shader side effects */
         add_dep(node, memory_store);
         memory_store = node;

         /* ATEST also updates coverage */
         add_dep(node, coverage);
         coverage = node;
         break;

      default:
         break;
      }

      add_dep(node, preload);

      if (I->op == BI_OPCODE_DISCARD_F32) {
         /* Serialize against ATEST */
         add_dep(node, coverage);
         coverage = node;

         /* Also serialize against memory and barriers */
         add_dep(node, memory_load);
         add_dep(node, memory_store);
         memory_load = node;
         memory_store = node;
      } else if (I->op == BI_OPCODE_MOV_I32 &&
                 I->src[0].type == BI_INDEX_REGISTER) {
         preload = node;
      }
   }

   free(last_read);
   free(last_write);

   return dag;
}

/*
 * Calculate the change in register pressure from scheduling a given
 * instruction. Equivalently, calculate the difference in the number of live
 * registers before and after the instruction, given the live set after the
 * instruction. This calculation follows immediately from the dataflow
 * definition of liveness:
 *
 *      live_in = (live_out - KILL) + GEN
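 *
 * For instance, take FADD.f32 x, y, z where x is live after the instruction
 * but y and z are not (a purely illustrative case; assuming 32-bit values,
 * each operand counts as a single register). The write to x drops one
 * register from the live set (KILL) and the reads of y and z add two (GEN),
 * so the delta is -1 + 2 = +1: one more register is live before the
 * instruction than after it.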
 */
static signed
calculate_pressure_delta(bi_instr *I, uint8_t *live, unsigned max)
{
   signed delta = 0;

   /* Destinations must be unique */
   bi_foreach_dest(I, d) {
      unsigned node = bi_get_node(I->dest[d]);

      if (node < max && live[node])
         delta -= bi_count_write_registers(I, d);
   }

   bi_foreach_src(I, src) {
      unsigned node = bi_get_node(I->src[src]);
      if (node >= max)
         continue;

      /* Filter duplicates */
      bool dupe = false;

      for (unsigned i = 0; i < src; ++i) {
         if (bi_get_node(I->src[i]) == node) {
            dupe = true;
            break;
         }
      }

      if (!dupe && !live[node])
         delta += bi_count_read_registers(I, src);
   }

   return delta;
}

/*
 * Choose the next instruction, bottom-up. For now we use a simple greedy
 * heuristic: choose the instruction that has the best effect on liveness.
 */
static struct sched_node *
choose_instr(struct sched_ctx *s)
{
   int32_t min_delta = INT32_MAX;
   struct sched_node *best = NULL;

   list_for_each_entry(struct sched_node, n, &s->dag->heads, dag.link) {
      int32_t delta = calculate_pressure_delta(n->instr, s->live, s->max);

      if (delta < min_delta) {
         best = n;
         min_delta = delta;
      }
   }

   return best;
}

static void
pressure_schedule_block(bi_context *ctx, bi_block *block, struct sched_ctx *s)
{
   /* off by a constant, that's ok */
   signed pressure = 0;
   signed orig_max_pressure = 0;
   unsigned nr_ins = 0;

   memcpy(s->live, block->live_out, s->max);

   bi_foreach_instr_in_block_rev(block, I) {
      pressure += calculate_pressure_delta(I, s->live, s->max);
      orig_max_pressure = MAX2(pressure, orig_max_pressure);
      bi_liveness_ins_update(s->live, I, s->max);
      nr_ins++;
   }

   memcpy(s->live, block->live_out, s->max);

   /* off by a constant, that's ok */
   signed max_pressure = 0;
   pressure = 0;

   struct sched_node **schedule = calloc(nr_ins, sizeof(struct sched_node *));
   nr_ins = 0;

   while (!list_is_empty(&s->dag->heads)) {
      struct sched_node *node = choose_instr(s);
      pressure += calculate_pressure_delta(node->instr, s->live, s->max);
      max_pressure = MAX2(pressure, max_pressure);
      dag_prune_head(s->dag, &node->dag);

      schedule[nr_ins++] = node;
      bi_liveness_ins_update(s->live, node->instr, s->max);
   }

   /* Bail if it looks like it's worse */
   if (max_pressure >= orig_max_pressure) {
      free(schedule);
      return;
   }

   /* Apply the schedule */
   for (unsigned i = 0; i < nr_ins; ++i) {
      bi_remove_instruction(schedule[i]->instr);
      list_add(&schedule[i]->instr->link, &block->instructions);
   }

   free(schedule);
}

void
bi_pressure_schedule(bi_context *ctx)
{
   bi_compute_liveness(ctx);
   unsigned temp_count = bi_max_temp(ctx);
   void *memctx = ralloc_context(ctx);
   uint8_t *live = ralloc_array(memctx, uint8_t, temp_count);

   bi_foreach_block(ctx, block) {
      struct sched_ctx sctx = {
         .dag = create_dag(ctx, block, memctx),
         .max = temp_count,
         .live = live
      };

      pressure_schedule_block(ctx, block, &sctx);
   }

   ralloc_free(memctx);
}