/*
 * Copyright (c) 2017 Lima Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#include "gpir.h"
#include "util/u_dynarray.h"

/* Per-register information */

struct reg_info {
   BITSET_WORD *conflicts;
   struct util_dynarray conflict_list;

   unsigned num_conflicts;

   int assigned_color;

   bool visited;
};

struct regalloc_ctx {
   unsigned bitset_words;
   struct reg_info *registers;

   /* Reusable scratch liveness array */
   BITSET_WORD *live;

   unsigned *worklist;
   unsigned worklist_start, worklist_end;

   unsigned *stack;
   unsigned stack_size;

   gpir_compiler *comp;
   void *mem_ctx;
};

/* Liveness analysis */

static void propagate_liveness_node(gpir_node *node, BITSET_WORD *live,
                                    gpir_compiler *comp)
{
   /* KILL */
   if (node->type == gpir_node_type_store) {
      if (node->op == gpir_op_store_reg) {
         gpir_store_node *store = gpir_node_to_store(node);
         BITSET_CLEAR(live, store->reg->index);
      }
   }

   /* GEN */
   if (node->type == gpir_node_type_load) {
      if (node->op == gpir_op_load_reg) {
         gpir_load_node *load = gpir_node_to_load(node);
         BITSET_SET(live, load->reg->index);
      }
   }
}

static bool propagate_liveness_block(gpir_block *block, struct regalloc_ctx *ctx)
{
   for (unsigned i = 0; i < 2; i++) {
      if (block->successors[i]) {
         for (unsigned j = 0; j < ctx->bitset_words; j++)
            block->live_out[j] |= block->successors[i]->live_in[j];
      }
   }

   memcpy(ctx->live, block->live_out, ctx->bitset_words * sizeof(BITSET_WORD));

   list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
      propagate_liveness_node(node, ctx->live, block->comp);
   }

   bool changed = false;
   for (unsigned i = 0; i < ctx->bitset_words; i++) {
      changed |= (block->live_in[i] != ctx->live[i]);
      block->live_in[i] = ctx->live[i];
   }
   return changed;
}

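/* Record which registers are definitely written somewhere in this block.
 * calc_liveness() below propagates these def_out sets forward to successors,
 * so that calc_interference() can mask possibly-undefined registers out of a
 * block's live-out set.
 */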
static void calc_def_block(gpir_block *block)
{
   list_for_each_entry(gpir_node, node, &block->node_list, list) {
      if (node->op == gpir_op_store_reg) {
         gpir_store_node *store = gpir_node_to_store(node);
         BITSET_SET(block->def_out, store->reg->index);
      }
   }
}

static void calc_liveness(struct regalloc_ctx *ctx)
{
   bool changed = true;
   while (changed) {
      changed = false;
      list_for_each_entry_rev(gpir_block, block, &ctx->comp->block_list, list) {
         changed |= propagate_liveness_block(block, ctx);
      }
   }

   list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
      calc_def_block(block);
   }

   changed = true;
   while (changed) {
      changed = false;
      list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
         for (unsigned i = 0; i < 2; i++) {
            gpir_block *succ = block->successors[i];
            if (!succ)
               continue;

            for (unsigned j = 0; j < ctx->bitset_words; j++) {
               BITSET_WORD new = block->def_out[j] & ~succ->def_out[j];
               changed |= (new != 0);
               succ->def_out[j] |= block->def_out[j];
            }
         }
      }
   }
}

/* Interference calculation */

static void add_interference(struct regalloc_ctx *ctx, unsigned i, unsigned j)
{
   if (i == j)
      return;

   struct reg_info *a = &ctx->registers[i];
   struct reg_info *b = &ctx->registers[j];

   if (BITSET_TEST(a->conflicts, j))
      return;

   BITSET_SET(a->conflicts, j);
   BITSET_SET(b->conflicts, i);

   a->num_conflicts++;
   b->num_conflicts++;
   util_dynarray_append(&a->conflict_list, unsigned, j);
   util_dynarray_append(&b->conflict_list, unsigned, i);
}

/* Make the register or node "i" interfere with all the other live registers
 * and nodes.
 */
static void add_all_interferences(struct regalloc_ctx *ctx,
                                  unsigned i,
                                  BITSET_WORD *live_regs)
{
   int live_reg;
   BITSET_FOREACH_SET(live_reg, live_regs, ctx->comp->cur_reg) {
      add_interference(ctx, i, live_reg);
   }
}

static void print_liveness(struct regalloc_ctx *ctx,
                           BITSET_WORD *live_reg)
{
   if (!(lima_debug & LIMA_DEBUG_GP))
      return;

   int live_idx;
   BITSET_FOREACH_SET(live_idx, live_reg, ctx->comp->cur_reg) {
      printf("reg%d ", live_idx);
   }
   printf("\n");
}

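/* Build the interference graph: walk each block backwards with a running set
 * of live registers, and make every register that is written conflict with
 * everything that is live at the point of the write.
 */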
static void calc_interference(struct regalloc_ctx *ctx)
{
   list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
      /* Initialize liveness at the end of the block, but exclude values that
       * definitely aren't defined by the end. This helps out with
       * partially-defined registers, like:
       *
       * if (condition) {
       *    foo = ...;
       * }
       * if (condition) {
       *    ... = foo;
       * }
       *
       * If we naively propagated liveness backwards, foo would be live from
       * the beginning of the program, but if we're not inside a loop then
       * its value is undefined before the first if and we don't have to
       * consider it live. Mask out registers like foo here.
       */
      for (unsigned i = 0; i < ctx->bitset_words; i++) {
         ctx->live[i] = block->live_out[i] & block->def_out[i];
      }

      list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
         gpir_debug("processing node %d\n", node->index);
         print_liveness(ctx, ctx->live);
         if (node->op == gpir_op_store_reg) {
            gpir_store_node *store = gpir_node_to_store(node);
            add_all_interferences(ctx, store->reg->index, ctx->live);

            /* KILL */
            BITSET_CLEAR(ctx->live, store->reg->index);
         } else if (node->op == gpir_op_load_reg) {
            /* GEN */
            gpir_load_node *load = gpir_node_to_load(node);
            BITSET_SET(ctx->live, load->reg->index);
         }
      }
   }
}

/* Register allocation */

static bool can_simplify(struct regalloc_ctx *ctx, unsigned i)
{
   struct reg_info *info = &ctx->registers[i];
   return info->num_conflicts < GPIR_PHYSICAL_REG_NUM;
}

static void push_stack(struct regalloc_ctx *ctx, unsigned i)
{
   ctx->stack[ctx->stack_size++] = i;
   gpir_debug("pushing reg%u\n", i);

   struct reg_info *info = &ctx->registers[i];
   assert(info->visited);

   util_dynarray_foreach(&info->conflict_list, unsigned, conflict) {
      struct reg_info *conflict_info = &ctx->registers[*conflict];
      assert(conflict_info->num_conflicts > 0);
      conflict_info->num_conflicts--;
      if (!ctx->registers[*conflict].visited && can_simplify(ctx, *conflict)) {
         ctx->worklist[ctx->worklist_end++] = *conflict;
         ctx->registers[*conflict].visited = true;
      }
   }
}

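/* Classic simplify/select graph coloring with optimistic coloring: registers
 * with fewer than GPIR_PHYSICAL_REG_NUM conflicts are pushed first, since they
 * are guaranteed to get a color. When nothing is trivially simplifiable, the
 * least-constrained remaining register is pushed in the hope that it colors
 * anyway. Colors are then assigned while popping the stack.
 */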
static bool do_regalloc(struct regalloc_ctx *ctx)
{
   ctx->worklist_start = 0;
   ctx->worklist_end = 0;
   ctx->stack_size = 0;

   /* Step 1: find the initially simplifiable registers */
   for (int i = 0; i < ctx->comp->cur_reg; i++) {
      if (can_simplify(ctx, i)) {
         ctx->worklist[ctx->worklist_end++] = i;
         ctx->registers[i].visited = true;
      }
   }

   while (true) {
      /* Step 2: push onto the stack whatever we can */
      while (ctx->worklist_start != ctx->worklist_end) {
         push_stack(ctx, ctx->worklist[ctx->worklist_start++]);
      }

      if (ctx->stack_size < ctx->comp->cur_reg) {
         /* Step 3: if there are still unsimplifiable nodes left, we need to
          * optimistically push a node onto the stack. Choose the one with
          * the smallest number of current neighbors, since that's the most
          * likely to succeed.
          */
         unsigned min_conflicts = UINT_MAX;
         unsigned best_reg = 0;
         for (int reg = 0; reg < ctx->comp->cur_reg; reg++) {
            struct reg_info *info = &ctx->registers[reg];
            if (info->visited)
               continue;
            if (info->num_conflicts < min_conflicts) {
               best_reg = reg;
               min_conflicts = info->num_conflicts;
            }
         }
         gpir_debug("optimistic triggered\n");
         ctx->registers[best_reg].visited = true;
         push_stack(ctx, best_reg);
      } else {
         break;
      }
   }

   /* Step 4: pop off the stack and assign colors */
   for (int i = ctx->comp->cur_reg - 1; i >= 0; i--) {
      unsigned idx = ctx->stack[i];
      struct reg_info *reg = &ctx->registers[idx];

      bool found = false;
      unsigned start = i % GPIR_PHYSICAL_REG_NUM;
      for (unsigned j = 0; j < GPIR_PHYSICAL_REG_NUM; j++) {
         unsigned candidate = (j + start) % GPIR_PHYSICAL_REG_NUM;
         bool available = true;
         util_dynarray_foreach(&reg->conflict_list, unsigned, conflict_idx) {
            struct reg_info *conflict = &ctx->registers[*conflict_idx];
            if (conflict->assigned_color >= 0 &&
                conflict->assigned_color == (int) candidate) {
               available = false;
               break;
            }
         }

         if (available) {
            reg->assigned_color = candidate;
            found = true;
            break;
         }
      }

      /* TODO: spilling */
      if (!found) {
         gpir_error("Failed to allocate registers\n");
         return false;
      }
   }

   return true;
}

static void assign_regs(struct regalloc_ctx *ctx)
{
   list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
      list_for_each_entry(gpir_node, node, &block->node_list, list) {
         if (node->op == gpir_op_load_reg) {
            gpir_load_node *load = gpir_node_to_load(node);
            unsigned color = ctx->registers[load->reg->index].assigned_color;
            load->index = color / 4;
            load->component = color % 4;
         }

         if (node->op == gpir_op_store_reg) {
            gpir_store_node *store = gpir_node_to_store(node);
            unsigned color = ctx->registers[store->reg->index].assigned_color;
            store->index = color / 4;
            store->component = color % 4;
            node->value_reg = color;
         }
      }

      block->live_out_phys = 0;

      int reg_idx;
      BITSET_FOREACH_SET(reg_idx, block->live_out, ctx->comp->cur_reg) {
         if (BITSET_TEST(block->def_out, reg_idx)) {
            block->live_out_phys |= (1ull << ctx->registers[reg_idx].assigned_color);
         }
      }
   }
}

/* Value register allocation */

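/* This second, per-block pass runs after physical register allocation. It
 * assigns each value a "value register" while it is live, but the assignment
 * itself is thrown away afterwards: the only lasting output of the pass is
 * the set of fake write-after-read dependencies it adds, which constrain the
 * post-RA scheduler enough that it never has to spill values to temporaries.
 */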

/* Define a special token for when the register is occupied by a preallocated
 * physical register (i.e. load_reg/store_reg). Normally entries in the "live"
 * array point to the definition of the value, but there may be multiple
 * definitions in this case, and they will certainly come from other basic
 * blocks, so it doesn't make sense to do that here.
 */
static gpir_node __physreg_live;
#define PHYSREG_LIVE (&__physreg_live)

struct value_regalloc_ctx {
   gpir_node *last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM];
   gpir_node *complex1_last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM];
   gpir_node *live[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM];
   gpir_node *last_complex1;
   unsigned alloc_start;
};

static unsigned find_free_value_reg(struct value_regalloc_ctx *ctx)
{
   /* Implement round-robin allocation */
   unsigned reg_offset = ctx->alloc_start++;
   if (ctx->alloc_start == GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM)
      ctx->alloc_start = 0;

   unsigned reg = UINT_MAX;
   for (unsigned reg_base = 0;
        reg_base < GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM;
        reg_base++) {
      unsigned cur_reg = (reg_base + reg_offset) %
                         (GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM);
      if (!ctx->live[cur_reg]) {
         reg = cur_reg;
         break;
      }
   }

   return reg;
}

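/* Reading "src" means its value register must stay intact until this node has
 * consumed it. Record that as a fake write-after-read dependency: the node
 * that later overwrites src's value register (last_written[]) is forced to be
 * scheduled after this read, so the post-RA scheduler cannot clobber the
 * value while it still has readers.
 */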
static void add_fake_dep(gpir_node *node, gpir_node *src,
                         struct value_regalloc_ctx *ctx)
{
   assert(src->value_reg >= 0);
   if (ctx->last_written[src->value_reg] &&
       ctx->last_written[src->value_reg] != node) {
      gpir_node_add_dep(ctx->last_written[src->value_reg], node,
                        GPIR_DEP_WRITE_AFTER_READ);
   }

   /* For a sequence of schedule_first nodes right before a complex1
    * node, add any extra fake dependencies necessary so that the
    * schedule_first nodes can be scheduled right after the complex1 is
    * scheduled. We have to save the last_written before complex1 happens to
    * avoid adding dependencies to children of the complex1 node which would
    * create a cycle.
    */
   if (gpir_op_infos[node->op].schedule_first &&
       ctx->last_complex1 &&
       ctx->complex1_last_written[src->value_reg]) {
      gpir_node_add_dep(ctx->complex1_last_written[src->value_reg],
                        ctx->last_complex1,
                        GPIR_DEP_WRITE_AFTER_READ);
   }
}

static bool handle_value_read(gpir_node *node, gpir_node *src,
                              struct value_regalloc_ctx *ctx)
{
   /* If already allocated, don't allocate it */
   if (src->value_reg < 0) {
      unsigned reg = find_free_value_reg(ctx);
      if (reg == UINT_MAX)
         return false;

      src->value_reg = reg;
      ctx->live[reg] = src;
   }

   /* Add any fake dependencies. Note: this is the actual result of value
    * register allocation. We throw away node->value_reg afterwards, since
    * it's really the fake dependencies which constrain the post-RA scheduler
    * enough to make sure it never needs to spill to temporaries.
    */
   add_fake_dep(node, src, ctx);

   return true;
}

static bool handle_reg_read(gpir_load_node *load,
                            struct value_regalloc_ctx *ctx)
{
   unsigned idx = load->index * 4 + load->component;
   if (!ctx->live[idx]) {
      ctx->live[idx] = PHYSREG_LIVE;
   } else if (ctx->live[idx] != PHYSREG_LIVE) {
      /* This slot is occupied by some other value register, so we need to
       * evict it. This effectively splits the live range of the value
       * register. NB: since we create fake dependencies on the fly, and the
       * fake dependencies are the only output of this pass, we don't actually
       * have to record where the split happened or that this value was
       * assigned to two different registers. Any actual live range splitting
       * happens in the post-RA scheduler, which moves the value to and from
       * the register file. This will just cause some reads of the value
       * register to have different fake dependencies.
       */
      unsigned new_reg = find_free_value_reg(ctx);
      if (new_reg == UINT_MAX)
         return false;
      ctx->live[new_reg] = ctx->live[idx];
      ctx->live[new_reg]->value_reg = new_reg;
      ctx->live[idx] = PHYSREG_LIVE;
   }

   if (ctx->last_written[idx]) {
      gpir_node_add_dep(ctx->last_written[idx], &load->node,
                        GPIR_DEP_WRITE_AFTER_READ);
   }

   return true;
}

static void handle_reg_write(gpir_store_node *store,
                             struct value_regalloc_ctx *ctx)
{
   unsigned idx = store->index * 4 + store->component;
   store->node.value_reg = idx;
   ctx->last_written[idx] = &store->node;
   ctx->live[idx] = NULL;
}

static void handle_value_write(gpir_node *node,
                               struct value_regalloc_ctx *ctx)
{
   /* TODO: why does an uninitialized node->value_reg
    * sometimes end up here? */
   if (node->value_reg < 0)
      return;

   ctx->last_written[node->value_reg] = node;
   ctx->live[node->value_reg] = NULL;
}

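/* Walk the block from bottom to top. A value gets a value register the first
 * time we see it read (i.e. at its last use in program order) and the
 * register is freed again at its definition, so live[] always tracks what is
 * live below the current node. Only the fake dependencies added along the way
 * survive this pass.
 */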
static bool regalloc_value_regs(gpir_block *block)
{
   struct value_regalloc_ctx ctx = { { 0 } };

   list_for_each_entry(gpir_node, node, &block->node_list, list) {
      node->value_reg = -1;
   }

   list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
      if (node->op == gpir_op_complex1) {
         ctx.last_complex1 = node;
         memcpy(ctx.complex1_last_written, ctx.last_written,
                sizeof(ctx.complex1_last_written));
      }

      if (node->type != gpir_node_type_store &&
          node->type != gpir_node_type_branch) {
         handle_value_write(node, &ctx);
      } else if (node->op == gpir_op_store_reg) {
         handle_reg_write(gpir_node_to_store(node), &ctx);
      }

      if (node->type == gpir_node_type_store) {
         gpir_store_node *store = gpir_node_to_store(node);
         if (!handle_value_read(&store->node, store->child, &ctx))
            return false;
      } else if (node->type == gpir_node_type_alu) {
         gpir_alu_node *alu = gpir_node_to_alu(node);
         for (int i = 0; i < alu->num_child; i++) {
            if (!handle_value_read(&alu->node, alu->children[i], &ctx))
               return false;
         }
      } else if (node->type == gpir_node_type_branch) {
         /* At the end of a block the top 11 values are always free, so
          * branches should always succeed.
          */
         gpir_branch_node *branch = gpir_node_to_branch(node);
         ASSERTED bool result = handle_value_read(&branch->node,
                                                  branch->cond, &ctx);
         assert(result);
      } else if (node->op == gpir_op_load_reg) {
         gpir_load_node *load = gpir_node_to_load(node);
         if (!handle_reg_read(load, &ctx))
            return false;
      }
   }

   return true;
}

static void regalloc_print_result(gpir_compiler *comp)
{
   if (!(lima_debug & LIMA_DEBUG_GP))
      return;

   int index = 0;
   printf("======== regalloc ========\n");
   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      list_for_each_entry(gpir_node, node, &block->node_list, list) {
         printf("%03d: %d/%d %s ", index++, node->index, node->value_reg,
                gpir_op_infos[node->op].name);
         gpir_node_foreach_pred(node, dep) {
            gpir_node *pred = dep->pred;
            printf(" %d/%d", pred->index, pred->value_reg);
         }
         if (node->op == gpir_op_load_reg) {
            gpir_load_node *load = gpir_node_to_load(node);
            printf(" -/%d", 4 * load->index + load->component);
            printf(" (%d)", load->reg->index);
         } else if (node->op == gpir_op_store_reg) {
            gpir_store_node *store = gpir_node_to_store(node);
            printf(" (%d)", store->reg->index);
         }
         printf("\n");
      }
      printf("----------------------------\n");
   }
}

bool gpir_regalloc_prog(gpir_compiler *comp)
{
   struct regalloc_ctx ctx;

   ctx.mem_ctx = ralloc_context(NULL);
   ctx.bitset_words = BITSET_WORDS(comp->cur_reg);
   ctx.live = ralloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
   ctx.worklist = ralloc_array(ctx.mem_ctx, unsigned, comp->cur_reg);
   ctx.stack = ralloc_array(ctx.mem_ctx, unsigned, comp->cur_reg);
   ctx.comp = comp;

   ctx.registers = rzalloc_array(ctx.mem_ctx, struct reg_info, comp->cur_reg);
   for (int i = 0; i < comp->cur_reg; i++) {
      ctx.registers[i].conflicts = rzalloc_array(ctx.mem_ctx, BITSET_WORD,
                                                 ctx.bitset_words);
      util_dynarray_init(&ctx.registers[i].conflict_list, ctx.mem_ctx);
      /* No color assigned yet; do_regalloc() checks for >= 0 */
      ctx.registers[i].assigned_color = -1;
   }

   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      block->live_out = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
      block->live_in = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
      block->def_out = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
   }

   calc_liveness(&ctx);
   calc_interference(&ctx);
   if (!do_regalloc(&ctx)) {
      ralloc_free(ctx.mem_ctx);
      return false;
   }
   assign_regs(&ctx);

   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      if (!regalloc_value_regs(block)) {
         ralloc_free(ctx.mem_ctx);
         return false;
      }
   }

   regalloc_print_result(comp);
   ralloc_free(ctx.mem_ctx);
   return true;
}