/*
 * Copyright (C) 2021 Valve Corporation
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3_ra.h"
#include "util/rb_tree.h"
#include "util/u_math.h"
#include "ir3_shader.h"

/* This file implements an SSA-based register allocator. Unlike other
 * SSA-based allocators, it handles vector split/collect "smartly," meaning
 * that multiple values may share the same register interval. From the
 * perspective of the allocator itself, only the top-level intervals matter,
 * and the allocator is only concerned with allocating top-level intervals,
 * which may mean moving other top-level intervals around. Other intervals,
 * like the destination of a split instruction or the source of a collect
 * instruction, are "locked" to their parent interval. The details of this are
 * mostly handled by ir3_merge_regs and ir3_reg_ctx.
 *
 * We currently don't do any backtracking, but we do use the merge sets as a
 * form of affinity to try to avoid moves from phis/splits/collects. Each
 * merge set is what a more "classic" graph-coloring or live-range based
 * allocator would consider a single register, but here we use it as merely a
 * hint, except when multiple overlapping values are live at the same time.
 * Each merge set has a "preferred" register, and we try to honor that when
 * allocating values in the merge set.
 */
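
/* For example (an illustrative snippet, not taken from a real shader):
 *
 *    ssa_1 = collect ssa_2, ssa_3, ssa_4, ssa_5
 *
 * allocates a single top-level interval for ssa_1, and ssa_2..ssa_5 become
 * child intervals locked to fixed offsets inside it, so they never need
 * registers of their own. The merge set containing all five values records a
 * preferred register once any member is allocated, and later defs in the same
 * set try to reuse it.
 */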
/* ir3_reg_ctx implementation. */

static int
ir3_reg_interval_cmp(const struct rb_node *node, const void *data)
{
   unsigned reg = *(const unsigned *)data;
   const struct ir3_reg_interval *interval =
      ir3_rb_node_to_interval_const(node);
   if (interval->reg->interval_start > reg)
      return -1;
   else if (interval->reg->interval_end <= reg)
      return 1;
   else
      return 0;
}

static struct ir3_reg_interval *
ir3_reg_interval_search(struct rb_tree *tree, unsigned offset)
{
   struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp);
   return node ? ir3_rb_node_to_interval(node) : NULL;
}

static struct ir3_reg_interval *
ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset)
{
   struct rb_node *node =
      rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp);
   return node ? ir3_rb_node_to_interval(node) : NULL;
}

/* Get the interval covering the reg, or the closest to the right if it
 * doesn't exist.
 */
static struct ir3_reg_interval *
ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset)
{
   struct ir3_reg_interval *interval =
      ir3_reg_interval_search_sloppy(tree, offset);
   if (!interval) {
      return NULL;
   } else if (interval->reg->interval_end > offset) {
      return interval;
   } else {
      /* There is no interval covering reg, and
       * ir3_reg_interval_search_sloppy() returned the closest range to the
       * left, so the next interval to the right should be the closest to the
       * right.
       */
      return ir3_reg_interval_next_or_null(interval);
   }
}

static int
ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
{
   const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a);
   const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b);
   return b->reg->interval_start - a->reg->interval_start;
}

static void
interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree,
                struct ir3_reg_interval *interval)
{
   struct ir3_reg_interval *right =
      ir3_reg_interval_search_right(tree, interval->reg->interval_start);
   if (right && right->reg->interval_start < interval->reg->interval_end) {
      /* We disallow trees where different members have different half-ness.
       * This means that we can't treat bitcasts as copies like normal
       * split/collect, so something like this would require an extra copy
       * in mergedregs mode, and count as 4 half-units of register pressure
       * instead of 2:
       *
       * f16vec2 foo = unpackFloat2x16(bar)
       * ... = foo.x
       * ... = bar
       *
       * However, relaxing this rule would open a huge can of worms. What
       * happens when there's a vector of 16 things, and the fifth element
       * has been bitcasted as a half-reg? Would that element alone have to
       * be small enough to be used as a half-reg source? Let's keep that
       * can of worms firmly shut for now.
       */
      assert((interval->reg->flags & IR3_REG_HALF) ==
             (right->reg->flags & IR3_REG_HALF));

      if (right->reg->interval_end <= interval->reg->interval_end &&
          right->reg->interval_start >= interval->reg->interval_start) {
         /* Check if we're inserting something that's already inserted */
         assert(interval != right);

         /* "right" is contained in "interval" and must become a child of
          * it. There may be further children too.
          */
         for (struct ir3_reg_interval *next = ir3_reg_interval_next(right);
              right && right->reg->interval_start < interval->reg->interval_end;
              right = next, next = ir3_reg_interval_next_or_null(next)) {
            /* "right" must be contained in "interval." */
            assert(right->reg->interval_end <= interval->reg->interval_end);
            assert((interval->reg->flags & IR3_REG_HALF) ==
                   (right->reg->flags & IR3_REG_HALF));
            if (!right->parent)
               ctx->interval_delete(ctx, right);
            right->parent = interval;
            rb_tree_remove(tree, &right->node);
            rb_tree_insert(&interval->children, &right->node,
                           ir3_reg_interval_insert_cmp);
         }
      } else {
         /* "right" must contain "interval," since intervals must form a
          * tree.
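          * In that case we descend into "right"'s children below to insert
          * "interval" at the correct level of the tree.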
          */
         assert(right->reg->interval_start <= interval->reg->interval_start);
         interval->parent = right;
         interval_insert(ctx, &right->children, interval);
         return;
      }
   }

   if (!interval->parent)
      ctx->interval_add(ctx, interval);
   rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp);
   interval->inserted = true;
}

void
ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,
                        struct ir3_reg_interval *interval)
{
   rb_tree_init(&interval->children);
   interval->parent = NULL;
   interval_insert(ctx, &ctx->intervals, interval);
}

/* Call after ir3_reg_interval_remove_temp() to reinsert the interval */
static void
ir3_reg_interval_reinsert(struct ir3_reg_ctx *ctx,
                          struct ir3_reg_interval *interval)
{
   interval->parent = NULL;
   interval_insert(ctx, &ctx->intervals, interval);
}

void
ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
                        struct ir3_reg_interval *interval)
{
   if (interval->parent) {
      rb_tree_remove(&interval->parent->children, &interval->node);
   } else {
      ctx->interval_delete(ctx, interval);
      rb_tree_remove(&ctx->intervals, &interval->node);
   }

   rb_tree_foreach_safe (struct ir3_reg_interval, child, &interval->children,
                         node) {
      rb_tree_remove(&interval->children, &child->node);
      child->parent = interval->parent;

      if (interval->parent) {
         rb_tree_insert(&child->parent->children, &child->node,
                        ir3_reg_interval_insert_cmp);
      } else {
         ctx->interval_readd(ctx, interval, child);
         rb_tree_insert(&ctx->intervals, &child->node,
                        ir3_reg_interval_insert_cmp);
      }
   }

   interval->inserted = false;
}

static void
_mark_free(struct ir3_reg_interval *interval)
{
   interval->inserted = false;
   rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
      _mark_free(child);
   }
}

/* Remove an interval and all its children from the tree. */
void
ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,
                            struct ir3_reg_interval *interval)
{
   assert(!interval->parent);

   ctx->interval_delete(ctx, interval);
   rb_tree_remove(&ctx->intervals, &interval->node);
   _mark_free(interval);
}

/* Used when popping an interval to be shuffled around. Don't disturb children
 * so that it can be later reinserted.
 */
static void
ir3_reg_interval_remove_temp(struct ir3_reg_ctx *ctx,
                             struct ir3_reg_interval *interval)
{
   assert(!interval->parent);

   ctx->interval_delete(ctx, interval);
   rb_tree_remove(&ctx->intervals, &interval->node);
}

static void
interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval,
              unsigned indent)
{
   for (unsigned i = 0; i < indent; i++)
      mesa_log_stream_printf(stream, "\t");
   mesa_log_stream_printf(stream, "reg %u start %u\n", interval->reg->name,
                          interval->reg->interval_start);

   rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
      interval_dump(stream, child, indent + 1);
   }

   for (unsigned i = 0; i < indent; i++)
      mesa_log_stream_printf(stream, "\t");
   mesa_log_stream_printf(stream, "reg %u end %u\n", interval->reg->name,
                          interval->reg->interval_end);
}

void
ir3_reg_interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval)
{
   interval_dump(stream, interval, 0);
}
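
/* A minimal usage sketch of the ir3_reg_ctx machinery (hypothetical client,
 * not code from this file): embed a struct ir3_reg_ctx, fill in the
 * interval_add/interval_delete/interval_readd callbacks, then call
 * ir3_reg_interval_insert() when a value becomes live and
 * ir3_reg_interval_remove() (or _remove_all()) when it dies. The callbacks
 * only fire for top-level intervals, which is what lets child intervals from
 * splits/collects share their parent's storage for free. ra_file_init()
 * below follows exactly this pattern.
 */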

/* These are the core data structures used by the register allocator. First
 * ra_interval and ra_file, which are used for intra-block tracking and use
 * the ir3_reg_ctx infrastructure:
 */

struct ra_interval {
   struct ir3_reg_interval interval;

   struct rb_node physreg_node;
   physreg_t physreg_start, physreg_end;

   /* True if this is a source of the current instruction which is entirely
    * killed. This means we can allocate the dest over it, but we can't break
    * it up.
    */
   bool is_killed;

   /* True if this interval cannot be moved from its position. This is only
    * used for precolored inputs to ensure that other inputs don't get
    * allocated on top of them.
    */
   bool frozen;
};

struct ra_file {
   struct ir3_reg_ctx reg_ctx;

   BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
   BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);

   struct rb_tree physreg_intervals;

   unsigned size;
   unsigned start;
};

/* State for inter-block tracking. When we split a live range to make space
 * for a vector, we may need to insert fixup code when a block has multiple
 * predecessors that have moved the same live value to different registers.
 * This keeps track of state required to do that.
 */

struct ra_block_state {
   /* Map of defining ir3_register -> physreg it was allocated to at the end
    * of the block.
    */
   struct hash_table *renames;

   /* For loops, we need to process a block before all its predecessors have
    * been processed. In particular, we need to pick registers for values
    * without knowing if all the predecessors have been renamed. This keeps
    * track of the registers we chose so that when we visit the back-edge we
    * can move them appropriately. If all predecessors have been visited
    * before this block is visited then we don't need to fill this out. This
    * is a map from ir3_register -> physreg.
    */
   struct hash_table *entry_regs;

   /* True if the block has been visited and "renames" is complete.
    */
   bool visited;
};

struct ra_parallel_copy {
   struct ra_interval *interval;
   physreg_t src;
};

/* The main context: */

struct ra_ctx {
   /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom
    * half of this file too.
    */
   struct ra_file full;

   /* hr0.x - hr63.w, only used without merged-regs. */
   struct ra_file half;

   /* Shared regs. */
   struct ra_file shared;

   struct ir3_liveness *live;

   struct ir3_block *block;

   const struct ir3_compiler *compiler;
   gl_shader_stage stage;

   /* Pending moves of top-level intervals that will be emitted once we're
    * finished:
    */
   DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies);

   struct ra_interval *intervals;
   struct ra_block_state *blocks;

   bool merged_regs;
};

#define foreach_interval(interval, file)                                       \
   rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals,  \
                    physreg_node)
#define foreach_interval_rev(interval, file)                                   \
   rb_tree_foreach_rev (struct ra_interval, interval,                          \
                        &(file)->physreg_intervals, physreg_node)
#define foreach_interval_safe(interval, file)                                  \
   rb_tree_foreach_safe (struct ra_interval, interval,                         \
                         &(file)->physreg_intervals, physreg_node)
#define foreach_interval_rev_safe(interval, file)                              \
   rb_tree_foreach_rev_safe(struct ra_interval, interval,                      \
                            &(file)->physreg_intervals, physreg_node)

static struct ra_interval *
rb_node_to_interval(struct rb_node *node)
{
   return rb_node_data(struct ra_interval, node, physreg_node);
}

static const struct ra_interval *
rb_node_to_interval_const(const struct rb_node *node)
{
   return rb_node_data(struct ra_interval, node, physreg_node);
}

static struct ra_interval *
ra_interval_next(struct ra_interval *interval)
{
   struct rb_node *next = rb_node_next(&interval->physreg_node);
   return next ? rb_node_to_interval(next) : NULL;
}

static struct ra_interval *
ra_interval_next_or_null(struct ra_interval *interval)
{
   return interval ? ra_interval_next(interval) : NULL;
}

static int
ra_interval_cmp(const struct rb_node *node, const void *data)
{
   physreg_t reg = *(const physreg_t *)data;
   const struct ra_interval *interval = rb_node_to_interval_const(node);
   if (interval->physreg_start > reg)
      return -1;
   else if (interval->physreg_end <= reg)
      return 1;
   else
      return 0;
}

static struct ra_interval *
ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
{
   struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
   return node ? rb_node_to_interval(node) : NULL;
}

/* Get the interval covering the reg, or the closest to the right if it
 * doesn't exist.
 */
static struct ra_interval *
ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
{
   struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
   if (!interval) {
      return NULL;
   } else if (interval->physreg_end > reg) {
      return interval;
   } else {
      /* There is no interval covering reg, and ra_interval_search_sloppy()
       * returned the closest range to the left, so the next interval to the
       * right should be the closest to the right.
       */
      return ra_interval_next_or_null(interval);
   }
}

static struct ra_interval *
ra_file_search_right(struct ra_file *file, physreg_t reg)
{
   return ra_interval_search_right(&file->physreg_intervals, reg);
}

static int
ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
{
   const struct ra_interval *a = rb_node_to_interval_const(_a);
   const struct ra_interval *b = rb_node_to_interval_const(_b);
   return b->physreg_start - a->physreg_start;
}

static struct ra_interval *
ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
{
   return rb_node_data(struct ra_interval, interval, interval);
}

static struct ra_file *
ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx)
{
   return rb_node_data(struct ra_file, ctx, reg_ctx);
}

static void
interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
{
   struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
   struct ra_file *file = ir3_reg_ctx_to_file(ctx);

   /* We can assume in this case that physreg_start/physreg_end is already
    * initialized.
    */
   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
      BITSET_CLEAR(file->available, i);
      BITSET_CLEAR(file->available_to_evict, i);
   }

   rb_tree_insert(&file->physreg_intervals, &interval->physreg_node,
                  ra_interval_insert_cmp);
}

static void
interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
{
   struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
   struct ra_file *file = ir3_reg_ctx_to_file(ctx);

   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
      BITSET_SET(file->available, i);
      BITSET_SET(file->available_to_evict, i);
   }

   rb_tree_remove(&file->physreg_intervals, &interval->physreg_node);
}

static void
interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
               struct ir3_reg_interval *_child)
{
   struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
   struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);

   child->physreg_start =
      parent->physreg_start + (child->interval.reg->interval_start -
                               parent->interval.reg->interval_start);
   child->physreg_end =
      child->physreg_start +
      (child->interval.reg->interval_end - child->interval.reg->interval_start);

   interval_add(ctx, _child);
}

static void
ra_file_init(struct ra_file *file)
{
   for (unsigned i = 0; i < file->size; i++) {
      BITSET_SET(file->available, i);
      BITSET_SET(file->available_to_evict, i);
   }

   rb_tree_init(&file->reg_ctx.intervals);
   rb_tree_init(&file->physreg_intervals);

   file->reg_ctx.interval_add = interval_add;
   file->reg_ctx.interval_delete = interval_delete;
   file->reg_ctx.interval_readd = interval_readd;
}

static void
ra_file_insert(struct ra_file *file, struct ra_interval *interval)
{
   assert(interval->physreg_start < interval->physreg_end);
   assert(interval->physreg_end <= file->size);
   if (interval->interval.reg->flags & IR3_REG_HALF)
      assert(interval->physreg_end <= RA_HALF_SIZE);

   ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
}

static void
ra_file_remove(struct ra_file *file, struct ra_interval *interval)
{
   ir3_reg_interval_remove(&file->reg_ctx, &interval->interval);
}
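
/* Killed sources stay in the interval tree (so their physreg is still known
 * and they can later be swapped or un-killed), but their registers are marked
 * available so that a destination of the current instruction may be allocated
 * on top of them.
 */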
static void
ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval)
{
   assert(!interval->interval.parent);

   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
      BITSET_SET(file->available, i);
   }

   interval->is_killed = true;
}

static void
ra_file_unmark_killed(struct ra_file *file, struct ra_interval *interval)
{
   assert(!interval->interval.parent);

   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
      BITSET_CLEAR(file->available, i);
   }

   interval->is_killed = false;
}

static physreg_t
ra_interval_get_physreg(const struct ra_interval *interval)
{
   unsigned child_start = interval->interval.reg->interval_start;

   while (interval->interval.parent) {
      interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
   }

   return interval->physreg_start +
          (child_start - interval->interval.reg->interval_start);
}

static unsigned
ra_interval_get_num(const struct ra_interval *interval)
{
   return ra_physreg_to_num(ra_interval_get_physreg(interval),
                            interval->interval.reg->flags);
}

static void
ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
{
   ir3_reg_interval_init(&interval->interval, reg);
   interval->is_killed = false;
   interval->frozen = false;
}

static void
ra_interval_dump(struct log_stream *stream, struct ra_interval *interval)
{
   mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start);

   ir3_reg_interval_dump(stream, &interval->interval);
}

static void
ra_file_dump(struct log_stream *stream, struct ra_file *file)
{
   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
                    physreg_node) {
      ra_interval_dump(stream, interval);
   }

   unsigned start, end;
   mesa_log_stream_printf(stream, "available:\n");
   BITSET_FOREACH_RANGE (start, end, file->available, file->size) {
      mesa_log_stream_printf(stream, "%u-%u ", start, end);
   }
   mesa_log_stream_printf(stream, "\n");

   mesa_log_stream_printf(stream, "available to evict:\n");
   BITSET_FOREACH_RANGE (start, end, file->available_to_evict, file->size) {
      mesa_log_stream_printf(stream, "%u-%u ", start, end);
   }
   mesa_log_stream_printf(stream, "\n");
   mesa_log_stream_printf(stream, "start: %u\n", file->start);
}

static void
ra_ctx_dump(struct ra_ctx *ctx)
{
   struct log_stream *stream = mesa_log_streami();
   mesa_log_stream_printf(stream, "full:\n");
   ra_file_dump(stream, &ctx->full);
   mesa_log_stream_printf(stream, "half:\n");
   ra_file_dump(stream, &ctx->half);
   mesa_log_stream_printf(stream, "shared:");
   ra_file_dump(stream, &ctx->shared);
   mesa_log_stream_destroy(stream);
}

static unsigned
reg_file_size(struct ra_file *file, struct ir3_register *reg)
{
   /* Half-regs can only take up the first half of the combined regfile */
   if (reg->flags & IR3_REG_HALF)
      return MIN2(file->size, RA_HALF_SIZE);
   else
      return file->size;
}

/* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple
 * top-level intervals at once. Pop multiple intervals, then push them back in
 * any order.
 */
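
/* For example, the swap fallback in try_evict_regs() below exchanges a killed
 * interval with a conflicting one by popping both and then pushing each back
 * at the other's old start position.
 */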
struct ra_removed_interval {
   struct ra_interval *interval;
   unsigned size;
};

static struct ra_removed_interval
ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,
                struct ra_interval *interval)
{
   assert(!interval->interval.parent);

   /* Check if we've already moved this reg before */
   unsigned pcopy_index;
   for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count;
        pcopy_index++) {
      if (ctx->parallel_copies[pcopy_index].interval == interval)
         break;
   }

   if (pcopy_index == ctx->parallel_copies_count) {
      array_insert(ctx, ctx->parallel_copies,
                   (struct ra_parallel_copy){
                      .interval = interval,
                      .src = interval->physreg_start,
                   });
   }

   ir3_reg_interval_remove_temp(&file->reg_ctx, &interval->interval);

   return (struct ra_removed_interval){
      .interval = interval,
      .size = interval->physreg_end - interval->physreg_start,
   };
}

static void
ra_push_interval(struct ra_ctx *ctx, struct ra_file *file,
                 const struct ra_removed_interval *removed, physreg_t dst)
{
   struct ra_interval *interval = removed->interval;

   interval->physreg_start = dst;
   interval->physreg_end = dst + removed->size;

   assert(interval->physreg_end <= file->size);
   if (interval->interval.reg->flags & IR3_REG_HALF)
      assert(interval->physreg_end <= RA_HALF_SIZE);

   ir3_reg_interval_reinsert(&file->reg_ctx, &interval->interval);
}

/* Pick up the interval and place it at "dst". */
static void
ra_move_interval(struct ra_ctx *ctx, struct ra_file *file,
                 struct ra_interval *interval, physreg_t dst)
{
   struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval);
   ra_push_interval(ctx, file, &temp, dst);
}

static struct ra_file *
ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
{
   if (reg->flags & IR3_REG_SHARED)
      return &ctx->shared;
   else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
      return &ctx->full;
   else
      return &ctx->half;
}

/* Returns true if the proposed spot for "dst" or a killed source overlaps a
 * destination that's been allocated.
 */
static bool
check_dst_overlap(struct ra_ctx *ctx, struct ra_file *file,
                  struct ir3_register *dst, physreg_t start,
                  physreg_t end)
{
   struct ir3_instruction *instr = dst->instr;

   ra_foreach_dst (other_dst, instr) {
      /* We assume only destinations before the current one have been
       * allocated.
       */
      if (other_dst == dst)
         break;

      if (ra_get_file(ctx, other_dst) != file)
         continue;

      struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
      assert(!other_interval->interval.parent);
      physreg_t other_start = other_interval->physreg_start;
      physreg_t other_end = other_interval->physreg_end;

      if (other_end > start && end > other_start)
         return true;
   }

   return false;
}

/* True if the destination is "early-clobber," meaning that it cannot be
 * allocated over killed sources. Some destinations always require it, but it
 * also is implicitly true for tied destinations whose source is live-through.
 * If the source is killed, then we skip allocating a register for the
 * destination altogether so we don't need to worry about that case here.
 */
static bool
is_early_clobber(struct ir3_register *reg)
{
   return (reg->flags & IR3_REG_EARLY_CLOBBER) || reg->tied;
}

static bool
get_reg_specified(struct ra_ctx *ctx, struct ra_file *file,
                  struct ir3_register *reg, physreg_t physreg, bool is_source)
{
   for (unsigned i = 0; i < reg_size(reg); i++) {
      if (!BITSET_TEST(is_early_clobber(reg) || is_source ?
                          file->available_to_evict : file->available,
                       physreg + i))
         return false;
   }

   if (!is_source &&
       check_dst_overlap(ctx, file, reg, physreg, physreg + reg_size(reg)))
      return false;

   return true;
}

/* Try to evict any registers conflicting with the proposed spot "physreg" for
 * "reg". That is, move them to other places so that we can allocate "physreg"
 * here.
 */

static bool
try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
               struct ir3_register *reg, physreg_t physreg,
               unsigned *_eviction_count, bool is_source, bool speculative)
{
   BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
   memcpy(available_to_evict, file->available_to_evict,
          sizeof(available_to_evict));

   BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
   memcpy(available, file->available, sizeof(available));

   for (unsigned i = 0; i < reg_size(reg); i++) {
      BITSET_CLEAR(available_to_evict, physreg + i);
      BITSET_CLEAR(available, physreg + i);
   }

   unsigned eviction_count = 0;
   /* Iterate over each range conflicting with physreg */
   for (struct ra_interval *conflicting = ra_file_search_right(file, physreg),
                           *next = ra_interval_next_or_null(conflicting);
        conflicting != NULL &&
        conflicting->physreg_start < physreg + reg_size(reg);
        conflicting = next, next = ra_interval_next_or_null(next)) {
      if (!is_early_clobber(reg) && !is_source && conflicting->is_killed)
         continue;

      if (conflicting->frozen) {
         assert(speculative);
         return false;
      }

      unsigned conflicting_file_size =
         reg_file_size(file, conflicting->interval.reg);
      unsigned avail_start, avail_end;
      bool evicted = false;
      BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict,
                            conflicting_file_size) {
         unsigned size = avail_end - avail_start;

         /* non-half registers must be aligned */
         if (!(conflicting->interval.reg->flags & IR3_REG_HALF) &&
             avail_start % 2 == 1) {
            avail_start++;
            size--;
         }

         unsigned conflicting_size =
            conflicting->physreg_end - conflicting->physreg_start;
         if (size >= conflicting_size &&
             !check_dst_overlap(ctx, file, reg, avail_start,
                                avail_start + conflicting_size)) {
            for (unsigned i = 0;
                 i < conflicting->physreg_end - conflicting->physreg_start; i++)
               BITSET_CLEAR(available_to_evict, avail_start + i);
            eviction_count +=
               conflicting->physreg_end - conflicting->physreg_start;
            if (!speculative)
               ra_move_interval(ctx, file, conflicting, avail_start);
            evicted = true;
            break;
         }
      }

      if (evicted)
         continue;

      /* If we couldn't evict this range, we may be able to swap it with a
       * killed range to achieve the same effect.
       */
      foreach_interval (killed, file) {
         if (!killed->is_killed)
            continue;

         if (killed->physreg_end - killed->physreg_start !=
             conflicting->physreg_end - conflicting->physreg_start)
            continue;

         if (killed->physreg_end > conflicting_file_size ||
             conflicting->physreg_end > reg_file_size(file, killed->interval.reg))
            continue;

         /* We can't swap the killed range if it partially/fully overlaps the
          * space we're trying to allocate or (in speculative mode) if it's
          * already been swapped and will overlap when we actually evict.
          */
         bool killed_available = true;
         for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
            if (!BITSET_TEST(available, i)) {
               killed_available = false;
               break;
            }
         }

         if (!killed_available)
            continue;

         if (check_dst_overlap(ctx, file, reg, killed->physreg_start,
                               killed->physreg_end))
            continue;

         /* Check for alignment if one is a full reg */
         if ((!(killed->interval.reg->flags & IR3_REG_HALF) ||
              !(conflicting->interval.reg->flags & IR3_REG_HALF)) &&
             (killed->physreg_start % 2 != 0 ||
              conflicting->physreg_start % 2 != 0))
            continue;

         for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
            BITSET_CLEAR(available, i);
         }
         /* Because this will generate swaps instead of moves, multiply the
          * cost by 2.
          */
         eviction_count += (killed->physreg_end - killed->physreg_start) * 2;
         if (!speculative) {
            physreg_t killed_start = killed->physreg_start,
                      conflicting_start = conflicting->physreg_start;
            struct ra_removed_interval killed_removed =
               ra_pop_interval(ctx, file, killed);
            struct ra_removed_interval conflicting_removed =
               ra_pop_interval(ctx, file, conflicting);
            ra_push_interval(ctx, file, &killed_removed, conflicting_start);
            ra_push_interval(ctx, file, &conflicting_removed, killed_start);
         }

         evicted = true;
         break;
      }

      if (!evicted)
         return false;
   }

   *_eviction_count = eviction_count;
   return true;
}

static int
removed_interval_cmp(const void *_i1, const void *_i2)
{
   const struct ra_removed_interval *i1 = _i1;
   const struct ra_removed_interval *i2 = _i2;

   /* We sort the registers as follows:
    *
    * |------------------------------------------------------------------------------------------|
    * |              |                    |        |        |                    |               |
    * |     Half     | Half early-clobber |  Half  |  Full  | Full early-clobber |      Full     |
    * | live-through |     destination    | killed | killed |     destination    |  live-through |
    * |              |                    |        |        |                    |               |
    * |------------------------------------------------------------------------------------------|
    *                                     |                 |
    *                                     |   Destination   |
    *                                     |                 |
    *                                     |-----------------|
    *
    * Half-registers have to be first so that they stay in the low half of
    * the register file. Then half and full killed must stay together so that
    * there's a contiguous range where we can put the register. With this
    * structure we should be able to accommodate any collection of intervals
    * such that the total number of half components is within the half limit
    * and the combined components are within the full limit.
    */

   unsigned i1_align = reg_elem_size(i1->interval->interval.reg);
   unsigned i2_align = reg_elem_size(i2->interval->interval.reg);
   if (i1_align > i2_align)
      return 1;
   if (i1_align < i2_align)
      return -1;

   if (i1_align == 1) {
      if (i2->interval->is_killed)
         return -1;
      if (i1->interval->is_killed)
         return 1;
   } else {
      if (i2->interval->is_killed)
         return 1;
      if (i1->interval->is_killed)
         return -1;
   }

   return 0;
}

static int
dsts_cmp(const void *_i1, const void *_i2)
{
   struct ir3_register *i1 = *(struct ir3_register *const *) _i1;
   struct ir3_register *i2 = *(struct ir3_register *const *) _i2;

   /* Treat tied destinations as-if they are live-through sources, and normal
    * destinations as killed sources.
    */
   unsigned i1_align = reg_elem_size(i1);
   unsigned i2_align = reg_elem_size(i2);
   if (i1_align > i2_align)
      return 1;
   if (i1_align < i2_align)
      return -1;

   if (i1_align == 1) {
      if (!is_early_clobber(i2))
         return -1;
      if (!is_early_clobber(i1))
         return 1;
   } else {
      if (!is_early_clobber(i2))
         return 1;
      if (!is_early_clobber(i1))
         return -1;
   }

   return 0;
}

/* "Compress" all the live intervals so that there is enough space for the
 * destination register. As there can be gaps when a more-aligned interval
 * follows a less-aligned interval, this also sorts them to remove such
 * "padding", which may be required when space is very tight. This isn't
 * amazing, but should be used only as a last resort in case the register file
 * is almost full and badly fragmented.
 *
 * Return the physreg to use.
 */
static physreg_t
compress_regs_left(struct ra_ctx *ctx, struct ra_file *file,
                   struct ir3_register *reg)
{
   unsigned align = reg_elem_size(reg);
   DECLARE_ARRAY(struct ra_removed_interval, intervals);
   intervals_count = intervals_sz = 0;
   intervals = NULL;

   DECLARE_ARRAY(struct ir3_register *, dsts);
   dsts_count = dsts_sz = 0;
   dsts = NULL;
   array_insert(ctx, dsts, reg);
   bool dst_inserted[reg->instr->dsts_count];

   unsigned dst_size = reg->tied ? 0 : reg_size(reg);
   unsigned ec_dst_size = is_early_clobber(reg) ? reg_size(reg) : 0;
   unsigned half_dst_size = 0, ec_half_dst_size = 0;
   if (align == 1) {
      half_dst_size = dst_size;
      ec_half_dst_size = ec_dst_size;
   }

   unsigned removed_size = 0, removed_half_size = 0;
   unsigned removed_killed_size = 0, removed_killed_half_size = 0;
   unsigned file_size =
      align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;
   physreg_t start_reg = 0;

   foreach_interval_rev_safe (interval, file) {
      /* We'll check if we can compact the intervals starting here. */
      physreg_t candidate_start = interval->physreg_end;

      /* Check if there are any other destinations we need to compact. */
      ra_foreach_dst_n (other_dst, n, reg->instr) {
         if (other_dst == reg)
            break;
         if (ra_get_file(ctx, other_dst) != file)
            continue;
         if (dst_inserted[n])
            continue;

         struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
         /* if the destination partially overlaps this interval, we need to
          * extend candidate_start to the end.
          */
         if (other_interval->physreg_start < candidate_start) {
            candidate_start = MAX2(candidate_start,
                                   other_interval->physreg_end);
            continue;
         }

         dst_inserted[n] = true;

         /* dst intervals with a tied killed source are considered attached to
          * that source. Don't actually insert them. This means we have to
          * update them below if their tied source moves.
          */
         if (other_dst->tied) {
            struct ra_interval *tied_interval =
               &ctx->intervals[other_dst->tied->def->name];
            if (tied_interval->is_killed)
               continue;
         }

         d("popping destination %u physreg %u\n",
           other_interval->interval.reg->name,
           other_interval->physreg_start);

         array_insert(ctx, dsts, other_dst);
         unsigned interval_size = reg_size(other_dst);
         if (is_early_clobber(other_dst)) {
            ec_dst_size += interval_size;
            if (other_interval->interval.reg->flags & IR3_REG_HALF)
               ec_half_dst_size += interval_size;
         } else {
            dst_size += interval_size;
            if (other_interval->interval.reg->flags & IR3_REG_HALF)
               half_dst_size += interval_size;
         }
      }

      /* Check if we can sort the intervals *after* this one and have enough
       * space leftover to accommodate all intervals, keeping in mind that
       * killed sources overlap non-tied destinations. Also check that we have
       * enough space leftover for half-registers, if we're inserting a
       * half-register (otherwise we only shift any half-registers down so they
       * should be safe).
       */
      if (candidate_start + removed_size + ec_dst_size +
             MAX2(removed_killed_size, dst_size) <= file->size &&
          (align != 1 ||
           candidate_start + removed_half_size + ec_half_dst_size +
              MAX2(removed_killed_half_size, half_dst_size) <= file_size)) {
         start_reg = candidate_start;
         break;
      }

      /* We assume that all frozen intervals are at the start and that we
       * can avoid popping them.
       */
      assert(!interval->frozen);

      /* Killed sources are different because they go at the end and can
       * overlap the register we're trying to add.
       */
      unsigned interval_size = interval->physreg_end - interval->physreg_start;
      if (interval->is_killed) {
         removed_killed_size += interval_size;
         if (interval->interval.reg->flags & IR3_REG_HALF)
            removed_killed_half_size += interval_size;
      } else {
         removed_size += interval_size;
         if (interval->interval.reg->flags & IR3_REG_HALF)
            removed_half_size += interval_size;
      }

      /* Now that we've done the accounting, pop this off */
      d("popping interval %u physreg %u%s\n", interval->interval.reg->name,
        interval->physreg_start, interval->is_killed ? ", killed" : "");
      array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));
   }

   /* TODO: In addition to skipping registers at the beginning that are
    * well-packed, we should try to skip registers at the end.
    */

   qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);
   qsort(dsts, dsts_count, sizeof(*dsts), dsts_cmp);

   physreg_t live_reg = start_reg;
   physreg_t dst_reg = (physreg_t)~0;
   physreg_t ret_reg = (physreg_t)~0;
   unsigned dst_index = 0;
   unsigned live_index = 0;

   /* We have two lists of intervals to process, live intervals and destination
    * intervals. Process them in the order of the diagram in
    * removed_interval_cmp().
    */
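   /* Non-early-clobber destinations may overlap killed sources, so they are
    * placed with their own cursor (dst_reg), which starts wherever the live
    * cursor (live_reg) happens to be when the first such destination is
    * processed. Live-through intervals placed afterwards must not overlap
    * those destinations, hence the MAX2() below.
    */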
   while (live_index < intervals_count || dst_index < dsts_count) {
      bool process_dst;
      if (live_index == intervals_count) {
         process_dst = true;
      } else if (dst_index == dsts_count) {
         process_dst = false;
      } else {
         struct ir3_register *dst = dsts[dst_index];
         struct ra_interval *live_interval = intervals[live_index].interval;

         bool live_half = live_interval->interval.reg->flags & IR3_REG_HALF;
         bool live_killed = live_interval->is_killed;
         bool dst_half = dst->flags & IR3_REG_HALF;
         bool dst_early_clobber = is_early_clobber(dst);

         if (live_half && !live_killed) {
            /* far-left of diagram. */
            process_dst = false;
         } else if (dst_half && dst_early_clobber) {
            /* mid-left of diagram. */
            process_dst = true;
         } else if (!dst_early_clobber) {
            /* bottom of diagram. */
            process_dst = true;
         } else if (live_killed) {
            /* middle of diagram. */
            process_dst = false;
         } else if (!dst_half && dst_early_clobber) {
            /* mid-right of diagram. */
            process_dst = true;
         } else {
            /* far right of diagram. */
            assert(!live_killed && !live_half);
            process_dst = false;
         }
      }

      struct ir3_register *cur_reg =
         process_dst ? dsts[dst_index] :
                       intervals[live_index].interval->interval.reg;

      physreg_t physreg;
      if (process_dst && !is_early_clobber(cur_reg)) {
         if (dst_reg == (physreg_t)~0)
            dst_reg = live_reg;
         physreg = dst_reg;
      } else {
         physreg = live_reg;
         struct ra_interval *live_interval = intervals[live_index].interval;
         bool live_killed = live_interval->is_killed;
         /* If this is live-through and we've processed the destinations, we
          * need to make sure we take into account any overlapping destinations.
          */
         if (!live_killed && dst_reg != (physreg_t)~0)
            physreg = MAX2(physreg, dst_reg);
      }

      if (!(cur_reg->flags & IR3_REG_HALF))
         physreg = ALIGN(physreg, 2);

      d("pushing reg %u physreg %u\n", cur_reg->name, physreg);

      unsigned interval_size = reg_size(cur_reg);
      if (physreg + interval_size > reg_file_size(file, cur_reg)) {
         d("ran out of room for interval %u!\n", cur_reg->name);
         unreachable("reg pressure calculation was wrong!");
         return 0;
      }

      if (process_dst) {
         if (cur_reg == reg) {
            ret_reg = physreg;
         } else {
            struct ra_interval *interval = &ctx->intervals[cur_reg->name];
            interval->physreg_start = physreg;
            interval->physreg_end = physreg + interval_size;
         }
         dst_index++;
      } else {
         ra_push_interval(ctx, file, &intervals[live_index], physreg);
         live_index++;
      }

      physreg += interval_size;

      if (process_dst && !is_early_clobber(cur_reg)) {
         dst_reg = physreg;
      } else {
         live_reg = physreg;
      }
   }

   /* If we shuffled around a tied source that is killed, we may have to update
    * its corresponding destination since we didn't insert it above.
    */
   ra_foreach_dst (dst, reg->instr) {
      if (dst == reg)
         break;

      struct ir3_register *tied = dst->tied;
      if (!tied)
         continue;

      struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
      if (!tied_interval->is_killed)
         continue;

      struct ra_interval *dst_interval = &ctx->intervals[dst->name];
      unsigned dst_size = reg_size(dst);
      dst_interval->physreg_start = ra_interval_get_physreg(tied_interval);
      dst_interval->physreg_end = dst_interval->physreg_start + dst_size;
   }

   return ret_reg;
}

static void
update_affinity(struct ra_file *file, struct ir3_register *reg,
                physreg_t physreg)
{
   if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0)
      return;

   if (physreg < reg->merge_set_offset)
      return;

   if ((physreg - reg->merge_set_offset + reg->merge_set->size) > file->size)
      return;

   reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;
}

/* Try to find free space for a register without shuffling anything. This uses
 * a round-robin algorithm to reduce false dependencies.
 */
static physreg_t
find_best_gap(struct ra_ctx *ctx, struct ra_file *file,
              struct ir3_register *dst, unsigned file_size, unsigned size,
              unsigned align)
{
   /* This can happen if we create a very large merge set. Just bail out in that
    * case.
    */
   if (size > file_size)
      return (physreg_t) ~0;

   BITSET_WORD *available =
      is_early_clobber(dst) ? file->available_to_evict : file->available;

   unsigned start = ALIGN(file->start, align) % (file_size - size + align);
   unsigned candidate = start;
   do {
      bool is_available = true;
      for (unsigned i = 0; i < size; i++) {
         if (!BITSET_TEST(available, candidate + i)) {
            is_available = false;
            break;
         }
      }

      if (is_available) {
         is_available =
            !check_dst_overlap(ctx, file, dst, candidate, candidate + size);
      }

      if (is_available) {
         file->start = (candidate + size) % file_size;
         return candidate;
      }

      candidate += align;
      if (candidate + size > file_size)
         candidate = 0;
   } while (candidate != start);

   return (physreg_t)~0;
}

/* This is the main entrypoint for picking a register. Pick a free register
 * for "reg", shuffling around sources if necessary. In the normal case where
 * "is_source" is false, this register can overlap with killed sources
 * (intervals with "is_killed == true"). If "is_source" is true, then
 * is_killed is ignored and the register returned must not overlap with killed
 * sources. This must be used for tied registers, because we're actually
 * allocating the destination and the tied source at the same time.
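 *
 * The overall strategy, in order: honor the merge set's preferred register if
 * one has been picked, otherwise try to reserve a gap for the entire merge
 * set, then try to reuse a source register for ALU/SFU instructions, then
 * look for a free gap round-robin, then evict or swap conflicting intervals,
 * and finally fall back to compress_regs_left().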
 */

static physreg_t
get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg)
{
   unsigned file_size = reg_file_size(file, reg);
   if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
      physreg_t preferred_reg =
         reg->merge_set->preferred_reg + reg->merge_set_offset;
      if (preferred_reg + reg_size(reg) <= file_size &&
          preferred_reg % reg_elem_size(reg) == 0 &&
          get_reg_specified(ctx, file, reg, preferred_reg, false))
         return preferred_reg;
   }

   /* If this register is a subset of a merge set which we have not picked a
    * register for, first try to allocate enough space for the entire merge
    * set.
    */
   unsigned size = reg_size(reg);
   if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
       size < reg->merge_set->size) {
      physreg_t best_reg = find_best_gap(ctx, file, reg, file_size,
                                         reg->merge_set->size,
                                         reg->merge_set->alignment);
      if (best_reg != (physreg_t)~0u) {
         best_reg += reg->merge_set_offset;
         return best_reg;
      }
   }

   /* For ALU and SFU instructions, if the src reg is available, prefer to
    * reuse it: this doesn't introduce unnecessary dependencies, and it
    * potentially avoids needing (ss) syncs for write-after-read hazards on
    * SFU instructions:
    */
   if (is_sfu(reg->instr) || is_alu(reg->instr)) {
      for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
         struct ir3_register *src = reg->instr->srcs[i];
         if (!ra_reg_is_src(src))
            continue;
         if (ra_get_file(ctx, src) == file && reg_size(src) >= size) {
            struct ra_interval *src_interval = &ctx->intervals[src->def->name];
            physreg_t src_physreg = ra_interval_get_physreg(src_interval);
            if (src_physreg % reg_elem_size(reg) == 0 &&
                src_physreg + size <= file_size &&
                get_reg_specified(ctx, file, reg, src_physreg, false))
               return src_physreg;
         }
      }
   }

   physreg_t best_reg =
      find_best_gap(ctx, file, reg, file_size, size, reg_elem_size(reg));
   if (best_reg != (physreg_t)~0u) {
      return best_reg;
   }

   /* Ok, we couldn't find anything that fits. Here is where we have to start
    * moving things around to make stuff fit. First try solely evicting
    * registers in the way.
    */
   unsigned best_eviction_count = ~0;
   for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {
      unsigned eviction_count;
      if (try_evict_regs(ctx, file, reg, i, &eviction_count, false, true)) {
         if (eviction_count < best_eviction_count) {
            best_eviction_count = eviction_count;
            best_reg = i;
         }
      }
   }

   if (best_eviction_count != ~0) {
      ASSERTED bool result = try_evict_regs(
         ctx, file, reg, best_reg, &best_eviction_count, false, false);
      assert(result);
      return best_reg;
   }

   /* Use the dumb fallback only if try_evict_regs() fails. */
   return compress_regs_left(ctx, file, reg);
}

static void
assign_reg(struct ir3_instruction *instr, struct ir3_register *reg,
           unsigned num)
{
   if (reg->flags & IR3_REG_ARRAY) {
      reg->array.base = num;
      if (reg->flags & IR3_REG_RELATIV)
         reg->array.offset += num;
      else
         reg->num = num + reg->array.offset;
   } else {
      reg->num = num;
   }
}

static void
mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src)
{
   struct ra_interval *interval = &ctx->intervals[src->def->name];

   if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed ||
       interval->interval.parent ||
       !rb_tree_is_empty(&interval->interval.children))
      return;

   ra_file_mark_killed(ra_get_file(ctx, src), interval);
}

static void
insert_dst(struct ra_ctx *ctx, struct ir3_register *dst)
{
   struct ra_file *file = ra_get_file(ctx, dst);
   struct ra_interval *interval = &ctx->intervals[dst->name];

   d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval));

   if (!(dst->flags & IR3_REG_UNUSED))
      ra_file_insert(file, interval);

   assign_reg(dst->instr, dst, ra_interval_get_num(interval));
}

static void
allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst,
                   physreg_t physreg)
{
   struct ra_file *file = ra_get_file(ctx, dst);
   struct ra_interval *interval = &ctx->intervals[dst->name];
   update_affinity(file, dst, physreg);

   ra_interval_init(interval, dst);
   interval->physreg_start = physreg;
   interval->physreg_end = physreg + reg_size(dst);
}

/* If a tied destination interferes with its source register, we have to insert
 * a copy beforehand to copy the source to the destination. Because we are using
 * the parallel_copies array and not creating a separate copy, this copy will
 * happen in parallel with any shuffling around of the tied source, so we have
 * to copy the source *as it exists before it is shuffled around*. We do this by
 * inserting the copy early, before any other copies are inserted. We don't
 * actually know the destination of the copy, but that's ok because the
 * dst_interval will be filled out later.
 */
static void
insert_tied_dst_copy(struct ra_ctx *ctx, struct ir3_register *dst)
{
   struct ir3_register *tied = dst->tied;

   if (!tied)
      return;

   struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
   struct ra_interval *dst_interval = &ctx->intervals[dst->name];

   if (tied_interval->is_killed)
      return;

   physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);

   array_insert(ctx, ctx->parallel_copies,
                (struct ra_parallel_copy){
                   .interval = dst_interval,
                   .src = tied_physreg,
                });
}

static void
allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
{
   struct ra_file *file = ra_get_file(ctx, dst);

   struct ir3_register *tied = dst->tied;
   if (tied) {
      struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
      if (tied_interval->is_killed) {
         /* The easy case: the source is killed, so we can just reuse it
          * for the destination.
          */
         allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));
         return;
      }
   }

   /* All the hard work is done by get_reg here. */
   physreg_t physreg = get_reg(ctx, file, dst);

   allocate_dst_fixed(ctx, dst, physreg);
}

static void
assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
           struct ir3_register *src)
{
   struct ra_interval *interval = &ctx->intervals[src->def->name];
   struct ra_file *file = ra_get_file(ctx, src);

   struct ir3_register *tied = src->tied;
   physreg_t physreg;
   if (tied) {
      struct ra_interval *tied_interval = &ctx->intervals[tied->name];
      physreg = ra_interval_get_physreg(tied_interval);
   } else {
      physreg = ra_interval_get_physreg(interval);
   }

   assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags));

   if (src->flags & IR3_REG_FIRST_KILL)
      ra_file_remove(file, interval);
}

/* Insert a parallel copy instruction before the instruction with the parallel
 * copy entries we've built up.
 */
static void
insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   if (ctx->parallel_copies_count == 0)
      return;

   struct ir3_instruction *pcopy =
      ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY,
                       ctx->parallel_copies_count, ctx->parallel_copies_count);

   for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
      struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
      struct ir3_register *reg =
         ir3_dst_create(pcopy, INVALID_REG,
                        entry->interval->interval.reg->flags &
                           (IR3_REG_HALF | IR3_REG_ARRAY));
      reg->size = entry->interval->interval.reg->size;
      reg->wrmask = entry->interval->interval.reg->wrmask;
      assign_reg(pcopy, reg, ra_interval_get_num(entry->interval));
   }

   for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
      struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
      struct ir3_register *reg =
         ir3_src_create(pcopy, INVALID_REG,
                        entry->interval->interval.reg->flags &
                           (IR3_REG_HALF | IR3_REG_ARRAY));
      reg->size = entry->interval->interval.reg->size;
      reg->wrmask = entry->interval->interval.reg->wrmask;
      assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags));
   }

   list_del(&pcopy->node);
   list_addtail(&pcopy->node, &instr->node);
   ctx->parallel_copies_count = 0;
}

static void
handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   /* First, mark sources as going-to-be-killed while allocating the dest. */
   ra_foreach_src (src, instr) {
      mark_src_killed(ctx, src);
   }

   /* Pre-insert tied dst copies. */
   ra_foreach_dst (dst, instr) {
      insert_tied_dst_copy(ctx, dst);
   }

   /* Allocate the destination. */
   ra_foreach_dst (dst, instr) {
      allocate_dst(ctx, dst);
   }

   /* Now handle sources. Go backward so that in case there are multiple
    * sources with the same def and that def is killed we only remove it at
    * the end.
    */
   ra_foreach_src_rev (src, instr) {
      assign_src(ctx, instr, src);
   }

   /* Now finally insert the destination into the map. */
   ra_foreach_dst (dst, instr) {
      insert_dst(ctx, dst);
   }

   insert_parallel_copy_instr(ctx, instr);
}

static void
handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_register *dst = instr->dsts[0];
   struct ir3_register *src = instr->srcs[0];

   if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
      handle_normal_instr(ctx, instr);
      return;
   }

   struct ra_interval *src_interval = &ctx->intervals[src->def->name];

   physreg_t physreg = ra_interval_get_physreg(src_interval);
   assign_src(ctx, instr, src);

   allocate_dst_fixed(
      ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset);
   insert_dst(ctx, dst);
}

static void
handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;
   unsigned dst_offset = instr->dsts[0]->merge_set_offset;

   if (!dst_set || dst_set->regs_count == 1) {
      handle_normal_instr(ctx, instr);
      return;
   }

   /* We need to check if any of the sources are contained in an interval
    * that is at least as large as the vector. In this case, we should put
    * the vector inside that larger interval. (There should be one
    * unambiguous place to put it, because values sharing the same merge set
    * should be allocated together.) This can happen in a case like:
    *
    * ssa_1 (wrmask=0xf) = ...
    * ssa_2 = split ssa_1 off:0
    * ssa_3 = split ssa_1 off:1
    * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3
    * ... = (kill)ssa_1
    * ... = (kill)ssa_4
    *
    * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it.
    */
   physreg_t dst_fixed = (physreg_t)~0u;

   ra_foreach_src (src, instr) {
      if (src->flags & IR3_REG_FIRST_KILL) {
         mark_src_killed(ctx, src);
      }

      struct ra_interval *interval = &ctx->intervals[src->def->name];

      if (src->def->merge_set != dst_set || interval->is_killed)
         continue;
      while (interval->interval.parent != NULL) {
         interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
      }
      if (reg_size(interval->interval.reg) >= reg_size(instr->dsts[0])) {
         dst_fixed = interval->physreg_start -
                     interval->interval.reg->merge_set_offset + dst_offset;
      } else {
         /* For sources whose root interval is smaller than the
          * destination (i.e. the normal case), we will shuffle them
          * around after allocating the destination. Mark them killed so
          * that the destination can be allocated over them, even if they
          * aren't actually killed.
1728 */ 1729 ra_file_mark_killed(ra_get_file(ctx, src), interval); 1730 } 1731 } 1732 1733 if (dst_fixed != (physreg_t)~0u) 1734 allocate_dst_fixed(ctx, instr->dsts[0], dst_fixed); 1735 else 1736 allocate_dst(ctx, instr->dsts[0]); 1737 1738 /* Remove the temporary is_killed we added */ 1739 ra_foreach_src (src, instr) { 1740 struct ra_interval *interval = &ctx->intervals[src->def->name]; 1741 while (interval->interval.parent != NULL) { 1742 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); 1743 } 1744 1745 /* Filter out cases where it actually should be killed */ 1746 if (interval != &ctx->intervals[src->def->name] || 1747 !(src->flags & IR3_REG_KILL)) { 1748 ra_file_unmark_killed(ra_get_file(ctx, src), interval); 1749 } 1750 } 1751 1752 ra_foreach_src_rev (src, instr) { 1753 assign_src(ctx, instr, src); 1754 } 1755 1756 /* We need to do this before insert_dst(), so that children of the 1757 * destination which got marked as killed and then shuffled around to make 1758 * space for the destination have the correct pcopy destination that 1759 * matches what we assign the source of the collect to in assign_src(). 1760 * 1761 * TODO: In this case we'll wind up copying the value in the pcopy and 1762 * then again in the collect. We could avoid one of those by updating the 1763 * pcopy destination to match up with the final location of the source 1764 * after the collect and making the collect a no-op. However this doesn't 1765 * seem to happen often. 1766 */ 1767 insert_parallel_copy_instr(ctx, instr); 1768 1769 /* Note: insert_dst will automatically shuffle around any intervals that 1770 * are a child of the collect by making them children of the collect. 1771 */ 1772 1773 insert_dst(ctx, instr->dsts[0]); 1774} 1775 1776/* Parallel copies before RA should only be at the end of the block, for 1777 * phi's. For these we only need to fill in the sources, and then we fill in 1778 * the destinations in the successor block. 1779 */ 1780static void 1781handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr) 1782{ 1783 ra_foreach_src_rev (src, instr) { 1784 assign_src(ctx, instr, src); 1785 } 1786} 1787 1788/* Some inputs may need to be precolored. We need to handle those first, so 1789 * that other non-precolored inputs don't accidentally get allocated over 1790 * them. Inputs are the very first thing in the shader, so it shouldn't be a 1791 * problem to allocate them to a specific physreg. 
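 * For example (register chosen purely for illustration), an input the hardware hands us in r0.x shows up as an OPC_META_INPUT whose dsts[0]->num is already set; handle_precolored_input() below pins its interval at that physreg and freezes it until assign_input() runs.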
1792 */ 1793 1794static void 1795handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr) 1796{ 1797 if (instr->dsts[0]->num == INVALID_REG) 1798 return; 1799 1800 struct ra_file *file = ra_get_file(ctx, instr->dsts[0]); 1801 struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name]; 1802 physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]); 1803 allocate_dst_fixed(ctx, instr->dsts[0], physreg); 1804 1805 d("insert precolored dst %u physreg %u", instr->dsts[0]->name, 1806 ra_interval_get_physreg(interval)); 1807 1808 ra_file_insert(file, interval); 1809 interval->frozen = true; 1810} 1811 1812static void 1813handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr) 1814{ 1815 if (instr->dsts[0]->num != INVALID_REG) 1816 return; 1817 1818 allocate_dst(ctx, instr->dsts[0]); 1819 1820 struct ra_file *file = ra_get_file(ctx, instr->dsts[0]); 1821 struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name]; 1822 ra_file_insert(file, interval); 1823} 1824 1825static void 1826assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr) 1827{ 1828 struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name]; 1829 struct ra_file *file = ra_get_file(ctx, instr->dsts[0]); 1830 1831 if (instr->dsts[0]->num == INVALID_REG) { 1832 assign_reg(instr, instr->dsts[0], ra_interval_get_num(interval)); 1833 } else { 1834 interval->frozen = false; 1835 } 1836 1837 if (instr->dsts[0]->flags & IR3_REG_UNUSED) 1838 ra_file_remove(file, interval); 1839 1840 ra_foreach_src_rev (src, instr) 1841 assign_src(ctx, instr, src); 1842} 1843 1844/* chmask is a bit weird, because it has pre-colored sources due to the need 1845 * to pass some registers to the next stage. Fortunately there are only at 1846 * most two, and there should be no other live values by the time we get to 1847 * this instruction, so we only have to do the minimum and don't need any 1848 * fancy fallbacks. 1849 * 1850 * TODO: Add more complete handling of precolored sources, e.g. for function 1851 * argument handling. We'd need a way to mark sources as fixed so that they 1852 * don't get moved around when placing other sources in the fallback case, and 1853 * a duplication of much of the logic in get_reg(). This also opens another 1854 * can of worms, e.g. what if the precolored source is a split of a vector 1855 * which is still live -- this breaks our assumption that splits don't incur 1856 * any "extra" register requirements and we'd have to break it out of the 1857 * parent ra_interval. 1858 */ 1859 1860static void 1861handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src) 1862{ 1863 struct ra_file *file = ra_get_file(ctx, src); 1864 struct ra_interval *interval = &ctx->intervals[src->def->name]; 1865 physreg_t physreg = ra_reg_get_physreg(src); 1866 1867 if (ra_interval_get_num(interval) == src->num) 1868 return; 1869 1870 /* Try evicting stuff in our way if it isn't free. This won't move 1871 * anything unless it overlaps with our precolored physreg, so we don't 1872 * have to worry about evicting other precolored sources. 
*/ 1874 if (!get_reg_specified(ctx, file, src, physreg, true)) { 1875 unsigned eviction_count; 1876 if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true, 1877 false)) { 1878 unreachable("failed to evict for precolored source!"); 1879 return; 1880 } 1881 } 1882 1883 ra_move_interval(ctx, file, interval, physreg); 1884} 1885 1886static void 1887handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr) 1888{ 1889 /* Note: we purposely don't mark sources as killed, so that we can reuse 1890 * some of the get_reg() machinery as if the source were a destination. 1891 * Marking it as killed would mean that e.g. get_reg_specified() wouldn't 1892 * work correctly. 1893 */ 1894 ra_foreach_src (src, instr) { 1895 assert(src->num != INVALID_REG); 1896 handle_precolored_source(ctx, src); 1897 } 1898 1899 ra_foreach_src (src, instr) { 1900 struct ra_file *file = ra_get_file(ctx, src); 1901 struct ra_interval *interval = &ctx->intervals[src->def->name]; 1902 if (src->flags & IR3_REG_FIRST_KILL) 1903 ra_file_remove(file, interval); 1904 } 1905 1906 insert_parallel_copy_instr(ctx, instr); 1907} 1908 1909static physreg_t 1910read_register(struct ra_ctx *ctx, struct ir3_block *block, 1911 struct ir3_register *def) 1912{ 1913 struct ra_block_state *state = &ctx->blocks[block->index]; 1914 if (state->renames) { 1915 struct hash_entry *entry = _mesa_hash_table_search(state->renames, def); 1916 if (entry) { 1917 return (physreg_t)(uintptr_t)entry->data; 1918 } 1919 } 1920 1921 return ra_reg_get_physreg(def); 1922} 1923 1924static void 1925handle_live_in(struct ra_ctx *ctx, struct ir3_register *def) 1926{ 1927 physreg_t physreg = ~0; 1928 for (unsigned i = 0; i < ctx->block->predecessors_count; i++) { 1929 struct ir3_block *pred = ctx->block->predecessors[i]; 1930 struct ra_block_state *pred_state = &ctx->blocks[pred->index]; 1931 1932 if (!pred_state->visited) 1933 continue; 1934 1935 physreg = read_register(ctx, pred, def); 1936 break; 1937 } 1938 1939 assert(physreg != (physreg_t)~0); 1940 1941 struct ra_interval *interval = &ctx->intervals[def->name]; 1942 struct ra_file *file = ra_get_file(ctx, def); 1943 ra_interval_init(interval, def); 1944 interval->physreg_start = physreg; 1945 interval->physreg_end = physreg + reg_size(def); 1946 ra_file_insert(file, interval); 1947} 1948 1949static void 1950handle_live_out(struct ra_ctx *ctx, struct ir3_register *def) 1951{ 1952 /* Skip parallel copies which in the original program are only used as phi 1953 * arguments. Even though phi arguments are live out, they are only 1954 * assigned when the phi is. 1955 */ 1956 if (def->instr->opc == OPC_META_PARALLEL_COPY) 1957 return; 1958 1959 struct ra_block_state *state = &ctx->blocks[ctx->block->index]; 1960 struct ra_interval *interval = &ctx->intervals[def->name]; 1961 physreg_t physreg = ra_interval_get_physreg(interval); 1962 if (physreg != ra_reg_get_physreg(def)) { 1963 if (!state->renames) 1964 state->renames = _mesa_pointer_hash_table_create(ctx); 1965 _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg); 1966 } 1967} 1968 1969static void 1970handle_phi(struct ra_ctx *ctx, struct ir3_register *def) 1971{ 1972 struct ra_file *file = ra_get_file(ctx, def); 1973 struct ra_interval *interval = &ctx->intervals[def->name]; 1974 1975 /* phis are always scalar, so they should already be the smallest possible 1976 * size. However they may be coalesced with other live-in values/phi 1977 * nodes, so check for that here.
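 * For example, if a phi was coalesced with a live-in vector so that both ended up in the same merge set, the live-in's interval has already been inserted by handle_live_in(), and the search below finds it as a parent; the phi is then placed at its offset inside that interval instead of being given a fresh register.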
1978 */ 1979 struct ir3_reg_interval *parent_ir3 = 1980 ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start); 1981 physreg_t physreg; 1982 if (parent_ir3) { 1983 struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3); 1984 physreg = ra_interval_get_physreg(parent) + 1985 (def->interval_start - parent_ir3->reg->interval_start); 1986 } else { 1987 physreg = get_reg(ctx, file, def); 1988 } 1989 1990 allocate_dst_fixed(ctx, def, physreg); 1991 1992 ra_file_insert(file, interval); 1993} 1994 1995static void 1996assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi) 1997{ 1998 struct ra_file *file = ra_get_file(ctx, phi->dsts[0]); 1999 struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name]; 2000 assert(!interval->interval.parent); 2001 unsigned num = ra_interval_get_num(interval); 2002 assign_reg(phi, phi->dsts[0], num); 2003 2004 /* Assign the parallelcopy sources of this phi */ 2005 for (unsigned i = 0; i < phi->srcs_count; i++) { 2006 if (phi->srcs[i]->def) { 2007 assign_reg(phi, phi->srcs[i], num); 2008 assign_reg(phi, phi->srcs[i]->def, num); 2009 } 2010 } 2011 2012 if (phi->dsts[0]->flags & IR3_REG_UNUSED) 2013 ra_file_remove(file, interval); 2014} 2015 2016/* When we split a live range, we sometimes need to emit fixup code at the end 2017 * of a block. For example, something like: 2018 * 2019 * a = ... 2020 * if (...) { 2021 * ... 2022 * a' = a 2023 * b = ... // a evicted to make room for b 2024 * ... 2025 * } 2026 * ... = a 2027 * 2028 * When we insert the copy to a' in insert_parallel_copy_instr(), this forces 2029 * us to insert another copy "a = a'" at the end of the if. Normally this would 2030 * also entail adding a phi node, but since we're about to go out of SSA 2031 * anyway we just insert an extra move. Note, however, that "b" might be used 2032 * in a phi node at the end of the if and share registers with "a", so we 2033 * have to be careful to extend any preexisting parallelcopy instruction 2034 * instead of creating our own in order to guarantee that they properly get 2035 * swapped. 2036 */ 2037 2038static void 2039insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src, 2040 struct ir3_register *reg) 2041{ 2042 struct ir3_instruction *old_pcopy = NULL; 2043 if (!list_is_empty(&block->instr_list)) { 2044 struct ir3_instruction *last = 2045 list_entry(block->instr_list.prev, struct ir3_instruction, node); 2046 if (last->opc == OPC_META_PARALLEL_COPY) 2047 old_pcopy = last; 2048 } 2049 2050 unsigned old_pcopy_srcs = old_pcopy ?
old_pcopy->srcs_count : 0; 2051 struct ir3_instruction *pcopy = ir3_instr_create( 2052 block, OPC_META_PARALLEL_COPY, old_pcopy_srcs + 1, old_pcopy_srcs + 1); 2053 2054 for (unsigned i = 0; i < old_pcopy_srcs; i++) { 2055 old_pcopy->dsts[i]->instr = pcopy; 2056 pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i]; 2057 } 2058 2059 unsigned flags = reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY); 2060 2061 struct ir3_register *dst_reg = ir3_dst_create(pcopy, INVALID_REG, flags); 2062 dst_reg->wrmask = reg->wrmask; 2063 dst_reg->size = reg->size; 2064 assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags)); 2065 2066 for (unsigned i = 0; i < old_pcopy_srcs; i++) { 2067 pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i]; 2068 } 2069 2070 struct ir3_register *src_reg = ir3_src_create(pcopy, INVALID_REG, flags); 2071 src_reg->wrmask = reg->wrmask; 2072 src_reg->size = reg->size; 2073 assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags)); 2074 2075 if (old_pcopy) 2076 list_del(&old_pcopy->node); 2077} 2078 2079static void 2080insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval) 2081{ 2082 physreg_t physreg = ra_interval_get_physreg(interval); 2083 2084 bool shared = interval->interval.reg->flags & IR3_REG_SHARED; 2085 struct ir3_block **predecessors = 2086 shared ? ctx->block->physical_predecessors : ctx->block->predecessors; 2087 unsigned predecessors_count = shared 2088 ? ctx->block->physical_predecessors_count 2089 : ctx->block->predecessors_count; 2090 2091 for (unsigned i = 0; i < predecessors_count; i++) { 2092 struct ir3_block *pred = predecessors[i]; 2093 struct ra_block_state *pred_state = &ctx->blocks[pred->index]; 2094 2095 if (!pred_state->visited) 2096 continue; 2097 2098 physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg); 2099 if (pred_reg != physreg) { 2100 insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg); 2101 2102 /* This is a bit tricky, but when visiting the destination of a 2103 * physical-only edge, we have two predecessors (the if and the 2104 * header block) and both have multiple successors. We pick the 2105 * register for all live-ins from the normal edge, which should 2106 * guarantee that there's no need for shuffling things around in 2107 * the normal predecessor as long as there are no phi nodes, but 2108 * we still may need to insert fixup code in the physical 2109 * predecessor (i.e. the last block of the if) and that has 2110 * another successor (the block after the if) so we need to update 2111 * the renames state for when we process the other successor. This 2112 * crucially depends on the other successor getting processed 2113 * after this. 2114 * 2115 * For normal (non-physical) edges we disallow critical edges so 2116 * that hacks like this aren't necessary. 2117 */ 2118 if (!pred_state->renames) 2119 pred_state->renames = _mesa_pointer_hash_table_create(ctx); 2120 _mesa_hash_table_insert(pred_state->renames, interval->interval.reg, 2121 (void *)(uintptr_t)physreg); 2122 } 2123 } 2124} 2125 2126static void 2127insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file) 2128{ 2129 BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index]; 2130 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals, 2131 physreg_node) { 2132 /* Skip phi nodes. This needs to happen after phi nodes are allocated, 2133 * because we may have to move live-ins around to make space for phi 2134 * nodes, but we shouldn't be handling phi nodes here. 
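 * The live_in bitset test below takes care of that: phi destinations are defined by this block's phis rather than being live into the block, so they are not in live_in and get skipped.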
2135 */ 2136 if (BITSET_TEST(live_in, interval->interval.reg->name)) 2137 insert_live_in_move(ctx, interval); 2138 } 2139} 2140 2141static void 2142insert_entry_regs(struct ra_block_state *state, struct ra_file *file) 2143{ 2144 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals, 2145 physreg_node) { 2146 _mesa_hash_table_insert(state->entry_regs, interval->interval.reg, 2147 (void *)(uintptr_t)interval->physreg_start); 2148 } 2149} 2150 2151static void 2152insert_live_in_moves(struct ra_ctx *ctx) 2153{ 2154 insert_file_live_in_moves(ctx, &ctx->full); 2155 insert_file_live_in_moves(ctx, &ctx->half); 2156 insert_file_live_in_moves(ctx, &ctx->shared); 2157 2158 /* If not all predecessors are visited, insert live-in regs so that 2159 * insert_live_out_moves() will work. 2160 */ 2161 bool all_preds_visited = true; 2162 for (unsigned i = 0; i < ctx->block->predecessors_count; i++) { 2163 if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) { 2164 all_preds_visited = false; 2165 break; 2166 } 2167 } 2168 2169 if (!all_preds_visited) { 2170 struct ra_block_state *state = &ctx->blocks[ctx->block->index]; 2171 state->entry_regs = _mesa_pointer_hash_table_create(ctx); 2172 2173 insert_entry_regs(state, &ctx->full); 2174 insert_entry_regs(state, &ctx->half); 2175 insert_entry_regs(state, &ctx->shared); 2176 } 2177} 2178 2179static void 2180insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval) 2181{ 2182 for (unsigned i = 0; i < 2; i++) { 2183 if (!ctx->block->successors[i]) 2184 continue; 2185 2186 struct ir3_block *succ = ctx->block->successors[i]; 2187 struct ra_block_state *succ_state = &ctx->blocks[succ->index]; 2188 2189 if (!succ_state->visited) 2190 continue; 2191 2192 struct hash_entry *entry = _mesa_hash_table_search( 2193 succ_state->entry_regs, interval->interval.reg); 2194 if (!entry) 2195 continue; 2196 2197 physreg_t new_reg = (physreg_t)(uintptr_t)entry->data; 2198 if (new_reg != interval->physreg_start) { 2199 insert_liveout_copy(ctx->block, new_reg, interval->physreg_start, 2200 interval->interval.reg); 2201 } 2202 } 2203} 2204 2205static void 2206insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file) 2207{ 2208 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals, 2209 physreg_node) { 2210 insert_live_out_move(ctx, interval); 2211 } 2212} 2213 2214static void 2215insert_live_out_moves(struct ra_ctx *ctx) 2216{ 2217 insert_file_live_out_moves(ctx, &ctx->full); 2218 insert_file_live_out_moves(ctx, &ctx->half); 2219 insert_file_live_out_moves(ctx, &ctx->shared); 2220} 2221 2222static void 2223handle_block(struct ra_ctx *ctx, struct ir3_block *block) 2224{ 2225 ctx->block = block; 2226 2227 /* Reset the register files from the last block */ 2228 ra_file_init(&ctx->full); 2229 ra_file_init(&ctx->half); 2230 ra_file_init(&ctx->shared); 2231 2232 /* Handle live-ins, phis, and input meta-instructions. These all appear 2233 * live at the beginning of the block, and interfere with each other 2234 * therefore need to be allocated "in parallel". This means that we 2235 * have to allocate all of them, inserting them into the file, and then 2236 * delay updating the IR until all of them are allocated. 2237 * 2238 * Handle precolored inputs first, because we need to make sure that other 2239 * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes 2240 * and inputs at the same time, because the first block doesn't have 2241 * predecessors. 
Therefore handle_live_in doesn't have to worry about 2242 * them. 2243 */ 2244 2245 foreach_instr (instr, &block->instr_list) { 2246 if (instr->opc == OPC_META_INPUT) 2247 handle_precolored_input(ctx, instr); 2248 else 2249 break; 2250 } 2251 2252 unsigned name; 2253 BITSET_FOREACH_SET (name, ctx->live->live_in[block->index], 2254 ctx->live->definitions_count) { 2255 struct ir3_register *reg = ctx->live->definitions[name]; 2256 handle_live_in(ctx, reg); 2257 } 2258 2259 foreach_instr (instr, &block->instr_list) { 2260 if (instr->opc == OPC_META_PHI) 2261 handle_phi(ctx, instr->dsts[0]); 2262 else if (instr->opc == OPC_META_INPUT || 2263 instr->opc == OPC_META_TEX_PREFETCH) 2264 handle_input(ctx, instr); 2265 else 2266 break; 2267 } 2268 2269 /* After this point, every live-in/phi/input has an interval assigned to 2270 * it. We delay actually assigning values until everything has been 2271 * allocated, so we can simply ignore any parallel copy entries created 2272 * when shuffling them around. 2273 */ 2274 ctx->parallel_copies_count = 0; 2275 2276 insert_live_in_moves(ctx); 2277 2278 if (RA_DEBUG) { 2279 d("after live-in block %u:\n", block->index); 2280 ra_ctx_dump(ctx); 2281 } 2282 2283 /* Now we're done with processing live-ins, and can handle the body of the 2284 * block. 2285 */ 2286 foreach_instr (instr, &block->instr_list) { 2287 di(instr, "processing"); 2288 2289 if (instr->opc == OPC_META_PHI) 2290 assign_phi(ctx, instr); 2291 else if (instr->opc == OPC_META_INPUT || 2292 instr->opc == OPC_META_TEX_PREFETCH) 2293 assign_input(ctx, instr); 2294 else if (instr->opc == OPC_META_SPLIT) 2295 handle_split(ctx, instr); 2296 else if (instr->opc == OPC_META_COLLECT) 2297 handle_collect(ctx, instr); 2298 else if (instr->opc == OPC_META_PARALLEL_COPY) 2299 handle_pcopy(ctx, instr); 2300 else if (instr->opc == OPC_CHMASK) 2301 handle_chmask(ctx, instr); 2302 else 2303 handle_normal_instr(ctx, instr); 2304 2305 if (RA_DEBUG) 2306 ra_ctx_dump(ctx); 2307 } 2308 2309 insert_live_out_moves(ctx); 2310 2311 BITSET_FOREACH_SET (name, ctx->live->live_out[block->index], 2312 ctx->live->definitions_count) { 2313 struct ir3_register *reg = ctx->live->definitions[name]; 2314 handle_live_out(ctx, reg); 2315 } 2316 2317 ctx->blocks[block->index].visited = true; 2318} 2319 2320static unsigned 2321calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure) 2322{ 2323 /* Registers are allocated in units of vec4, so switch from units of 2324 * half-regs to vec4. 
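 * For example (numbers are illustrative only), a pressure of 37 half-reg units becomes DIV_ROUND_UP(37, 2 * 4) = 5 vec4s below, and the result is converted back to half-reg units at the end via (target - 1) * 2 * 4.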
2325 */ 2326 unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4); 2327 2328 bool double_threadsize = ir3_should_double_threadsize(v, reg_count); 2329 2330 unsigned target = reg_count; 2331 unsigned reg_independent_max_waves = 2332 ir3_get_reg_independent_max_waves(v, double_threadsize); 2333 unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves( 2334 v->compiler, reg_count, double_threadsize); 2335 unsigned target_waves = 2336 MIN2(reg_independent_max_waves, reg_dependent_max_waves); 2337 2338 while (target <= RA_FULL_SIZE / (2 * 4) && 2339 ir3_should_double_threadsize(v, target) == double_threadsize && 2340 ir3_get_reg_dependent_max_waves(v->compiler, target, 2341 double_threadsize) >= target_waves) 2342 target++; 2343 2344 return (target - 1) * 2 * 4; 2345} 2346 2347static void 2348add_pressure(struct ir3_pressure *pressure, struct ir3_register *reg, 2349 bool merged_regs) 2350{ 2351 unsigned size = reg_size(reg); 2352 if (reg->flags & IR3_REG_HALF) 2353 pressure->half += size; 2354 if (!(reg->flags & IR3_REG_HALF) || merged_regs) 2355 pressure->full += size; 2356} 2357 2358static void 2359dummy_interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) 2360{ 2361} 2362 2363static void 2364dummy_interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) 2365{ 2366} 2367 2368static void 2369dummy_interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *parent, 2370 struct ir3_reg_interval *child) 2371{ 2372} 2373 2374/* Calculate the minimum possible limit on register pressure so that spilling 2375 * still succeeds. Used to implement IR3_SHADER_DEBUG=spillall. 2376 */ 2377 2378static void 2379calc_min_limit_pressure(struct ir3_shader_variant *v, 2380 struct ir3_liveness *live, 2381 struct ir3_pressure *limit) 2382{ 2383 struct ir3_block *start = ir3_start_block(v->ir); 2384 struct ir3_reg_ctx *ctx = ralloc(NULL, struct ir3_reg_ctx); 2385 struct ir3_reg_interval *intervals = 2386 rzalloc_array(ctx, struct ir3_reg_interval, live->definitions_count); 2387 2388 ctx->interval_add = dummy_interval_add; 2389 ctx->interval_delete = dummy_interval_delete; 2390 ctx->interval_readd = dummy_interval_readd; 2391 2392 limit->full = limit->half = 0; 2393 2394 struct ir3_pressure cur_pressure = {0}; 2395 foreach_instr (input, &start->instr_list) { 2396 if (input->opc != OPC_META_INPUT && 2397 input->opc != OPC_META_TEX_PREFETCH) 2398 break; 2399 2400 add_pressure(&cur_pressure, input->dsts[0], v->mergedregs); 2401 } 2402 2403 limit->full = MAX2(limit->full, cur_pressure.full); 2404 limit->half = MAX2(limit->half, cur_pressure.half); 2405 2406 foreach_instr (input, &start->instr_list) { 2407 if (input->opc != OPC_META_INPUT && 2408 input->opc != OPC_META_TEX_PREFETCH) 2409 break; 2410 2411 /* pre-colored inputs may have holes, which increases the pressure. 
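 * For example (precoloring chosen purely for illustration), an input pinned to r1.x makes the limit cover everything up to and including r1.x even if r0 is never used, which is why the bound below is ra_reg_get_physreg(dst) + reg_size(dst) rather than a sum of the input sizes.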
*/ 2412 struct ir3_register *dst = input->dsts[0]; 2413 if (dst->num != INVALID_REG) { 2414 unsigned physreg = ra_reg_get_physreg(dst) + reg_size(dst); 2415 if (dst->flags & IR3_REG_HALF) 2416 limit->half = MAX2(limit->half, physreg); 2417 if (!(dst->flags & IR3_REG_HALF) || v->mergedregs) 2418 limit->full = MAX2(limit->full, physreg); 2419 } 2420 } 2421 2422 foreach_block (block, &v->ir->block_list) { 2423 rb_tree_init(&ctx->intervals); 2424 2425 unsigned name; 2426 BITSET_FOREACH_SET (name, live->live_in[block->index], 2427 live->definitions_count) { 2428 struct ir3_register *reg = live->definitions[name]; 2429 ir3_reg_interval_init(&intervals[reg->name], reg); 2430 ir3_reg_interval_insert(ctx, &intervals[reg->name]); 2431 } 2432 2433 foreach_instr (instr, &block->instr_list) { 2434 ra_foreach_dst (dst, instr) { 2435 ir3_reg_interval_init(&intervals[dst->name], dst); 2436 } 2437 /* phis and parallel copies can be deleted via spilling */ 2438 2439 if (instr->opc == OPC_META_PHI) { 2440 ir3_reg_interval_insert(ctx, &intervals[instr->dsts[0]->name]); 2441 continue; 2442 } 2443 2444 if (instr->opc == OPC_META_PARALLEL_COPY) 2445 continue; 2446 2447 cur_pressure = (struct ir3_pressure) {0}; 2448 2449 ra_foreach_dst (dst, instr) { 2450 if (dst->tied && !(dst->tied->flags & IR3_REG_KILL)) 2451 add_pressure(&cur_pressure, dst, v->mergedregs); 2452 } 2453 2454 ra_foreach_src_rev (src, instr) { 2455 /* We currently don't support spilling the parent of a source when 2456 * making space for sources, so we have to keep track of the 2457 * intervals and figure out the root of the tree to figure out how 2458 * much space we need. 2459 * 2460 * TODO: We should probably support this in the spiller. 2461 */ 2462 struct ir3_reg_interval *interval = &intervals[src->def->name]; 2463 while (interval->parent) 2464 interval = interval->parent; 2465 add_pressure(&cur_pressure, interval->reg, v->mergedregs); 2466 2467 if (src->flags & IR3_REG_FIRST_KILL) 2468 ir3_reg_interval_remove(ctx, &intervals[src->def->name]); 2469 } 2470 2471 limit->full = MAX2(limit->full, cur_pressure.full); 2472 limit->half = MAX2(limit->half, cur_pressure.half); 2473 2474 cur_pressure = (struct ir3_pressure) {0}; 2475 2476 ra_foreach_dst (dst, instr) { 2477 ir3_reg_interval_init(&intervals[dst->name], dst); 2478 ir3_reg_interval_insert(ctx, &intervals[dst->name]); 2479 add_pressure(&cur_pressure, dst, v->mergedregs); 2480 } 2481 2482 limit->full = MAX2(limit->full, cur_pressure.full); 2483 limit->half = MAX2(limit->half, cur_pressure.half); 2484 } 2485 } 2486 2487 /* Account for the base register, which needs to be available everywhere. */ 2488 limit->full += 2; 2489 2490 ralloc_free(ctx); 2491} 2492 2493/* 2494 * If barriers are used, it must be possible for all waves in the workgroup 2495 * to execute concurrently. Thus we may have to reduce the registers limit. 2496 */ 2497static void 2498calc_limit_pressure_for_cs_with_barrier(struct ir3_shader_variant *v, 2499 struct ir3_pressure *limit_pressure) 2500{ 2501 const struct ir3_compiler *compiler = v->compiler; 2502 2503 unsigned threads_per_wg; 2504 if (v->local_size_variable) { 2505 /* We have to expect the worst case. */ 2506 threads_per_wg = compiler->max_variable_workgroup_size; 2507 } else { 2508 threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2]; 2509 } 2510 2511 /* The register file is grouped into reg_size_vec4 number of parts. 2512 * Each part has enough registers to add a single vec4 register to 2513 * each thread of a single-sized wave-pair. 
With double threadsize 2514 * each wave-pair would consume two parts of the register file to get 2515 * a single vec4 for a thread. The more active wave-pairs there are, the 2516 * fewer parts each one can get. 2517 */ 2518 2519 bool double_threadsize = ir3_should_double_threadsize(v, 0); 2520 unsigned waves_per_wg = DIV_ROUND_UP( 2521 threads_per_wg, compiler->threadsize_base * (double_threadsize ? 2 : 1) * 2522 compiler->wave_granularity); 2523 2524 uint32_t vec4_regs_per_thread = 2525 compiler->reg_size_vec4 / (waves_per_wg * (double_threadsize ? 2 : 1)); 2526 assert(vec4_regs_per_thread > 0); 2527 2528 uint32_t half_regs_per_thread = vec4_regs_per_thread * 4 * 2; 2529 2530 if (limit_pressure->full > half_regs_per_thread) { 2531 if (v->mergedregs) { 2532 limit_pressure->full = half_regs_per_thread; 2533 } else { 2534 /* TODO: Handle !mergedregs case, probably we would have to do this 2535 * after the first register pressure pass. 2536 */ 2537 } 2538 } 2539} 2540 2541int 2542ir3_ra(struct ir3_shader_variant *v) 2543{ 2544 ir3_calc_dominance(v->ir); 2545 2546 ir3_create_parallel_copies(v->ir); 2547 2548 struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx); 2549 2550 ctx->merged_regs = v->mergedregs; 2551 ctx->compiler = v->compiler; 2552 ctx->stage = v->type; 2553 2554 struct ir3_liveness *live = ir3_calc_liveness(ctx, v->ir); 2555 2556 ir3_debug_print(v->ir, "AFTER: create_parallel_copies"); 2557 2558 ir3_merge_regs(live, v->ir); 2559 2560 struct ir3_pressure max_pressure; 2561 ir3_calc_pressure(v, live, &max_pressure); 2562 d("max pressure:"); 2563 d("\tfull: %u", max_pressure.full); 2564 d("\thalf: %u", max_pressure.half); 2565 d("\tshared: %u", max_pressure.shared); 2566 2567 struct ir3_pressure limit_pressure; 2568 limit_pressure.full = RA_FULL_SIZE; 2569 limit_pressure.half = RA_HALF_SIZE; 2570 limit_pressure.shared = RA_SHARED_SIZE; 2571 2572 if (gl_shader_stage_is_compute(v->type) && v->has_barrier) { 2573 calc_limit_pressure_for_cs_with_barrier(v, &limit_pressure); 2574 } 2575 2576 /* If the user forces a doubled threadsize, we may have to lower the limit 2577 * because on some gens the register file is not big enough to hold a 2578 * double-size wave with all 48 registers in use. 2579 */ 2580 if (v->real_wavesize == IR3_DOUBLE_ONLY) { 2581 limit_pressure.full = 2582 MIN2(limit_pressure.full, ctx->compiler->reg_size_vec4 / 2 * 16); 2583 } 2584 2585 /* If requested, lower the limit so that spilling happens more often.
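 * (e.g. when running with IR3_SHADER_DEBUG=spillall, in which case calc_min_limit_pressure() above computes the smallest limit for which spilling can still succeed)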
*/ 2586 if (ir3_shader_debug & IR3_DBG_SPILLALL) 2587 calc_min_limit_pressure(v, live, &limit_pressure); 2588 2589 if (max_pressure.shared > limit_pressure.shared) { 2590 /* TODO shared reg -> normal reg spilling */ 2591 d("shared max pressure exceeded!"); 2592 goto fail; 2593 } 2594 2595 bool spilled = false; 2596 if (max_pressure.full > limit_pressure.full || 2597 max_pressure.half > limit_pressure.half) { 2598 if (!v->compiler->has_pvtmem) { 2599 d("max pressure exceeded!"); 2600 goto fail; 2601 } 2602 d("max pressure exceeded, spilling!"); 2603 IR3_PASS(v->ir, ir3_spill, v, &live, &limit_pressure); 2604 ir3_calc_pressure(v, live, &max_pressure); 2605 assert(max_pressure.full <= limit_pressure.full && 2606 max_pressure.half <= limit_pressure.half); 2607 spilled = true; 2608 } 2609 2610 ctx->live = live; 2611 ctx->intervals = 2612 rzalloc_array(ctx, struct ra_interval, live->definitions_count); 2613 ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count); 2614 2615 ctx->full.size = calc_target_full_pressure(v, max_pressure.full); 2616 d("full size: %u", ctx->full.size); 2617 2618 if (!v->mergedregs) 2619 ctx->half.size = RA_HALF_SIZE; 2620 2621 ctx->shared.size = RA_SHARED_SIZE; 2622 2623 ctx->full.start = ctx->half.start = ctx->shared.start = 0; 2624 2625 foreach_block (block, &v->ir->block_list) 2626 handle_block(ctx, block); 2627 2628 ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count); 2629 2630 /* Strip array-ness and SSA-ness at the end, because various helpers still 2631 * need to work even on definitions that have already been assigned. For 2632 * example, we need to preserve array-ness so that array live-ins have the 2633 * right size. 2634 */ 2635 foreach_block (block, &v->ir->block_list) { 2636 foreach_instr (instr, &block->instr_list) { 2637 for (unsigned i = 0; i < instr->dsts_count; i++) { 2638 instr->dsts[i]->flags &= ~IR3_REG_SSA; 2639 2640 /* Parallel copies of array registers copy the whole register, and 2641 * we need some way to let the parallel copy code know that this was 2642 * an array whose size is determined by reg->size. So keep the array 2643 * flag on those. spill/reload also need to work on the entire 2644 * array. 2645 */ 2646 if (!is_meta(instr) && instr->opc != OPC_RELOAD_MACRO) 2647 instr->dsts[i]->flags &= ~IR3_REG_ARRAY; 2648 } 2649 2650 for (unsigned i = 0; i < instr->srcs_count; i++) { 2651 instr->srcs[i]->flags &= ~IR3_REG_SSA; 2652 2653 if (!is_meta(instr) && instr->opc != OPC_SPILL_MACRO) 2654 instr->srcs[i]->flags &= ~IR3_REG_ARRAY; 2655 } 2656 } 2657 } 2658 2659 ir3_debug_print(v->ir, "AFTER: register allocation"); 2660 2661 if (spilled) { 2662 IR3_PASS(v->ir, ir3_lower_spill); 2663 } 2664 2665 ir3_lower_copies(v); 2666 2667 ir3_debug_print(v->ir, "AFTER: ir3_lower_copies"); 2668 2669 ralloc_free(ctx); 2670 2671 return 0; 2672fail: 2673 ralloc_free(ctx); 2674 return -1; 2675} 2676