xref: /third_party/mesa3d/src/freedreno/ir3/ir3_ra.c (revision bf215546)
1/*
2 * Copyright (C) 2021 Valve Corporation
3 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "ir3_ra.h"
26#include "util/rb_tree.h"
27#include "util/u_math.h"
28#include "ir3_shader.h"
29
30/* This file implements an SSA-based register allocator. Unlike other
31 * SSA-based allocators, it handles vector split/collect "smartly," meaning
32 * that multiple values may share the same register interval. From the
33 * perspective of the allocator itself, only the top-level intervals matter,
34 * and the allocator is only concerned with allocating top-level intervals,
35 * which may mean moving other top-level intervals around. Other intervals,
36 * like the destination of a split instruction or the source of a collect
37 * instruction, are "locked" to their parent interval. The details of this are
38 * mostly handled by ir3_merge_regs and ir3_reg_ctx.
39 *
40 * We currently don't do any backtracking, but we do use the merge sets as a
41 * form of affinity to try to avoid moves from phis/splits/collects. Each
42 * merge set is what a more "classic" graph-coloring or live-range based
43 * allocator would consider a single register, but here we use it as merely a
44 * hint, except when multiple overlapping values are live at the same time.
45 * Each merge set has a "preferred" register, and we try to honor that when
46 * allocating values in the merge set.
47 */
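
/* As an illustrative sketch (hypothetical SSA values, not taken from a real
 * shader): for
 *
 *    ssa_3 = collect ssa_1, ssa_2
 *
 * ssa_1 and ssa_2 become child intervals locked inside ssa_3's top-level
 * interval, so once ssa_3 is assigned to, say, r0.x, ssa_1 implicitly lands
 * in r0.x and ssa_2 in r0.y without being allocated separately.
 */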
48
49/* ir3_reg_ctx implementation. */
50
51static int
52ir3_reg_interval_cmp(const struct rb_node *node, const void *data)
53{
54   unsigned reg = *(const unsigned *)data;
55   const struct ir3_reg_interval *interval =
56      ir3_rb_node_to_interval_const(node);
57   if (interval->reg->interval_start > reg)
58      return -1;
59   else if (interval->reg->interval_end <= reg)
60      return 1;
61   else
62      return 0;
63}
64
65static struct ir3_reg_interval *
66ir3_reg_interval_search(struct rb_tree *tree, unsigned offset)
67{
68   struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp);
69   return node ? ir3_rb_node_to_interval(node) : NULL;
70}
71
72static struct ir3_reg_interval *
73ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset)
74{
75   struct rb_node *node =
76      rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp);
77   return node ? ir3_rb_node_to_interval(node) : NULL;
78}
79
80/* Get the interval covering the reg, or the closest to the right if it
81 * doesn't exist.
82 */
83static struct ir3_reg_interval *
84ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset)
85{
86   struct ir3_reg_interval *interval =
87      ir3_reg_interval_search_sloppy(tree, offset);
88   if (!interval) {
89      return NULL;
90   } else if (interval->reg->interval_end > offset) {
91      return interval;
92   } else {
93      /* There is no interval covering reg, and ir3_reg_interval_search_sloppy()
94       * returned the closest range to the left, so the next interval to the
95       * right should be the closest to the right.
96       */
97      return ir3_reg_interval_next_or_null(interval);
98   }
99}
100
101static int
102ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
103{
104   const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a);
105   const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b);
106   return b->reg->interval_start - a->reg->interval_start;
107}
108
109static void
110interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree,
111                struct ir3_reg_interval *interval)
112{
113   struct ir3_reg_interval *right =
114      ir3_reg_interval_search_right(tree, interval->reg->interval_start);
115   if (right && right->reg->interval_start < interval->reg->interval_end) {
116      /* We disallow trees where different members have different half-ness.
117       * This means that we can't treat bitcasts as copies like normal
118       * split/collect, so something like this would require an extra copy
119       * in mergedregs mode, and count as 4 half-units of register pressure
120       * instead of 2:
121       *
122       * f16vec2 foo = unpackFloat2x16(bar)
123       * ... = foo.x
124       * ... = bar
125       *
126       * However, relaxing this rule would open a huge can of worms. What
127       * happens when there's a vector of 16 things, and the fifth element
128       * has been bitcasted as a half-reg? Would that element alone have to
129       * be small enough to be used as a half-reg source? Let's keep that
130       * can of worms firmly shut for now.
131       */
132      assert((interval->reg->flags & IR3_REG_HALF) ==
133             (right->reg->flags & IR3_REG_HALF));
134
135      if (right->reg->interval_end <= interval->reg->interval_end &&
136          right->reg->interval_start >= interval->reg->interval_start) {
137         /* Check if we're inserting something that's already inserted */
138         assert(interval != right);
139
140         /* "right" is contained in "interval" and must become a child of
141          * it. There may be further children too.
142          */
143         for (struct ir3_reg_interval *next = ir3_reg_interval_next(right);
144              right && right->reg->interval_start < interval->reg->interval_end;
145              right = next, next = ir3_reg_interval_next_or_null(next)) {
146            /* "right" must be contained in "interval." */
147            assert(right->reg->interval_end <= interval->reg->interval_end);
148            assert((interval->reg->flags & IR3_REG_HALF) ==
149                   (right->reg->flags & IR3_REG_HALF));
150            if (!right->parent)
151               ctx->interval_delete(ctx, right);
152            right->parent = interval;
153            rb_tree_remove(tree, &right->node);
154            rb_tree_insert(&interval->children, &right->node,
155                           ir3_reg_interval_insert_cmp);
156         }
157      } else {
158         /* "right" must contain "interval," since intervals must form a
159          * tree.
160          */
161         assert(right->reg->interval_start <= interval->reg->interval_start);
162         interval->parent = right;
163         interval_insert(ctx, &right->children, interval);
164         return;
165      }
166   }
167
168   if (!interval->parent)
169      ctx->interval_add(ctx, interval);
170   rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp);
171   interval->inserted = true;
172}
173
174void
175ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,
176                        struct ir3_reg_interval *interval)
177{
178   rb_tree_init(&interval->children);
179   interval->parent = NULL;
180   interval_insert(ctx, &ctx->intervals, interval);
181}
182
183/* Call after ir3_reg_interval_remove_temp() to reinsert the interval */
184static void
185ir3_reg_interval_reinsert(struct ir3_reg_ctx *ctx,
186                          struct ir3_reg_interval *interval)
187{
188   interval->parent = NULL;
189   interval_insert(ctx, &ctx->intervals, interval);
190}
191
192void
193ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
194                        struct ir3_reg_interval *interval)
195{
196   if (interval->parent) {
197      rb_tree_remove(&interval->parent->children, &interval->node);
198   } else {
199      ctx->interval_delete(ctx, interval);
200      rb_tree_remove(&ctx->intervals, &interval->node);
201   }
202
203   rb_tree_foreach_safe (struct ir3_reg_interval, child, &interval->children,
204                         node) {
205      rb_tree_remove(&interval->children, &child->node);
206      child->parent = interval->parent;
207
208      if (interval->parent) {
209         rb_tree_insert(&child->parent->children, &child->node,
210                        ir3_reg_interval_insert_cmp);
211      } else {
212         ctx->interval_readd(ctx, interval, child);
213         rb_tree_insert(&ctx->intervals, &child->node,
214                        ir3_reg_interval_insert_cmp);
215      }
216   }
217
218   interval->inserted = false;
219}
220
221static void
222_mark_free(struct ir3_reg_interval *interval)
223{
224   interval->inserted = false;
225   rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
226      _mark_free(child);
227   }
228}
229
230/* Remove an interval and all its children from the tree. */
231void
232ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,
233                            struct ir3_reg_interval *interval)
234{
235   assert(!interval->parent);
236
237   ctx->interval_delete(ctx, interval);
238   rb_tree_remove(&ctx->intervals, &interval->node);
239   _mark_free(interval);
240}
241
242/* Used when popping an interval to be shuffled around. Don't disturb children
243 * so that it can be later reinserted.
244 */
245static void
246ir3_reg_interval_remove_temp(struct ir3_reg_ctx *ctx,
247                             struct ir3_reg_interval *interval)
248{
249   assert(!interval->parent);
250
251   ctx->interval_delete(ctx, interval);
252   rb_tree_remove(&ctx->intervals, &interval->node);
253}
254
255static void
256interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval,
257              unsigned indent)
258{
259   for (unsigned i = 0; i < indent; i++)
260      mesa_log_stream_printf(stream, "\t");
261   mesa_log_stream_printf(stream, "reg %u start %u\n", interval->reg->name,
262                          interval->reg->interval_start);
263
264   rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
265      interval_dump(stream, child, indent + 1);
266   }
267
268   for (unsigned i = 0; i < indent; i++)
269      mesa_log_stream_printf(stream, "\t");
270   mesa_log_stream_printf(stream, "reg %u end %u\n", interval->reg->name,
271                          interval->reg->interval_end);
272}
273
274void
275ir3_reg_interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval)
276{
277   interval_dump(stream, interval, 0);
278}
279
280/* These are the core data structures used by the register allocator. First
281 * come ra_interval and ra_file, which are used for intra-block tracking and
282 * use the ir3_reg_ctx infrastructure:
283 */
284
285struct ra_interval {
286   struct ir3_reg_interval interval;
287
288   struct rb_node physreg_node;
289   physreg_t physreg_start, physreg_end;
290
291   /* True if this is a source of the current instruction which is entirely
292    * killed. This means we can allocate the dest over it, but we can't break
293    * it up.
294    */
295   bool is_killed;
296
297   /* True if this interval cannot be moved from its position. This is only
298    * used for precolored inputs to ensure that other inputs don't get
299    * allocated on top of them.
300    */
301   bool frozen;
302};
303
304struct ra_file {
305   struct ir3_reg_ctx reg_ctx;
306
307   BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
308   BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
309
310   struct rb_tree physreg_intervals;
311
312   unsigned size;
313   unsigned start;
314};
315
316/* State for inter-block tracking. When we split a live range to make space
317 * for a vector, a block's predecessors may end up holding the same live
318 * value in different registers, and we have to insert fixup code to resolve
319 * the mismatch. This keeps track of the state required to do that.
320 */
321
322struct ra_block_state {
323   /* Map of defining ir3_register -> physreg it was allocated to at the end
324    * of the block.
325    */
326   struct hash_table *renames;
327
328   /* For loops, we need to process a block before all its predecessors have
329    * been processed. In particular, we need to pick registers for values
330    * without knowing if all the predecessors have been renamed. This keeps
331    * track of the registers we chose so that when we visit the back-edge we
332    * can move them appropriately. If all predecessors have been visited
333    * before this block is visited then we don't need to fill this out. This
334    * is a map from ir3_register -> physreg.
335    */
336   struct hash_table *entry_regs;
337
338   /* True if the block has been visited and "renames" is complete.
339    */
340   bool visited;
341};
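
/* For example (an illustrative scenario, not taken from a specific shader):
 * if block B has predecessors P1 and P2, and a value v was left in r2.x at
 * the end of P1 but shuffled to r4.x in P2, then the "renames" tables of P1
 * and P2 record those two locations so that fixup moves can be inserted and
 * B sees v in a single register.
 */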
342
343struct ra_parallel_copy {
344   struct ra_interval *interval;
345   physreg_t src;
346};
347
348/* The main context: */
349
350struct ra_ctx {
351   /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom
352    * half of this file too.
353    */
354   struct ra_file full;
355
356   /* hr0.x - hr63.w, only used without merged-regs. */
357   struct ra_file half;
358
359   /* Shared regs. */
360   struct ra_file shared;
361
362   struct ir3_liveness *live;
363
364   struct ir3_block *block;
365
366   const struct ir3_compiler *compiler;
367   gl_shader_stage stage;
368
369   /* Pending moves of top-level intervals that will be emitted once we're
370    * finished:
371    */
372   DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies);
373
374   struct ra_interval *intervals;
375   struct ra_block_state *blocks;
376
377   bool merged_regs;
378};
379
380#define foreach_interval(interval, file)                                       \
381   rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals,  \
382                    physreg_node)
383#define foreach_interval_rev(interval, file)                                   \
384   rb_tree_foreach_rev (struct ra_interval, interval,                          \
385                        &(file)->physreg_intervals, physreg_node)
386#define foreach_interval_safe(interval, file)                                  \
387   rb_tree_foreach_safe (struct ra_interval, interval,                         \
388                         &(file)->physreg_intervals, physreg_node)
389#define foreach_interval_rev_safe(interval, file)                              \
390   rb_tree_foreach_rev_safe(struct ra_interval, interval,                      \
391                            &(file)->physreg_intervals, physreg_node)
392
393static struct ra_interval *
394rb_node_to_interval(struct rb_node *node)
395{
396   return rb_node_data(struct ra_interval, node, physreg_node);
397}
398
399static const struct ra_interval *
400rb_node_to_interval_const(const struct rb_node *node)
401{
402   return rb_node_data(struct ra_interval, node, physreg_node);
403}
404
405static struct ra_interval *
406ra_interval_next(struct ra_interval *interval)
407{
408   struct rb_node *next = rb_node_next(&interval->physreg_node);
409   return next ? rb_node_to_interval(next) : NULL;
410}
411
412static struct ra_interval *
413ra_interval_next_or_null(struct ra_interval *interval)
414{
415   return interval ? ra_interval_next(interval) : NULL;
416}
417
418static int
419ra_interval_cmp(const struct rb_node *node, const void *data)
420{
421   physreg_t reg = *(const physreg_t *)data;
422   const struct ra_interval *interval = rb_node_to_interval_const(node);
423   if (interval->physreg_start > reg)
424      return -1;
425   else if (interval->physreg_end <= reg)
426      return 1;
427   else
428      return 0;
429}
430
431static struct ra_interval *
432ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
433{
434   struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
435   return node ? rb_node_to_interval(node) : NULL;
436}
437
438/* Get the interval covering the reg, or the closest to the right if it
439 * doesn't exist.
440 */
441static struct ra_interval *
442ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
443{
444   struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
445   if (!interval) {
446      return NULL;
447   } else if (interval->physreg_end > reg) {
448      return interval;
449   } else {
450      /* There is no interval covering reg, and ra_interval_search_sloppy()
451       * returned the closest range to the left, so the next interval to the
452       * right should be the closest to the right.
453       */
454      return ra_interval_next_or_null(interval);
455   }
456}
457
458static struct ra_interval *
459ra_file_search_right(struct ra_file *file, physreg_t reg)
460{
461   return ra_interval_search_right(&file->physreg_intervals, reg);
462}
463
464static int
465ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
466{
467   const struct ra_interval *a = rb_node_to_interval_const(_a);
468   const struct ra_interval *b = rb_node_to_interval_const(_b);
469   return b->physreg_start - a->physreg_start;
470}
471
472static struct ra_interval *
473ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
474{
475   return rb_node_data(struct ra_interval, interval, interval);
476}
477
478static struct ra_file *
479ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx)
480{
481   return rb_node_data(struct ra_file, ctx, reg_ctx);
482}
483
484static void
485interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
486{
487   struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
488   struct ra_file *file = ir3_reg_ctx_to_file(ctx);
489
490   /* We can assume in this case that physreg_start/physreg_end is already
491    * initialized.
492    */
493   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
494      BITSET_CLEAR(file->available, i);
495      BITSET_CLEAR(file->available_to_evict, i);
496   }
497
498   rb_tree_insert(&file->physreg_intervals, &interval->physreg_node,
499                  ra_interval_insert_cmp);
500}
501
502static void
503interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
504{
505   struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
506   struct ra_file *file = ir3_reg_ctx_to_file(ctx);
507
508   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
509      BITSET_SET(file->available, i);
510      BITSET_SET(file->available_to_evict, i);
511   }
512
513   rb_tree_remove(&file->physreg_intervals, &interval->physreg_node);
514}
515
516static void
517interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
518               struct ir3_reg_interval *_child)
519{
520   struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
521   struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);
522
523   child->physreg_start =
524      parent->physreg_start + (child->interval.reg->interval_start -
525                               parent->interval.reg->interval_start);
526   child->physreg_end =
527      child->physreg_start +
528      (child->interval.reg->interval_end - child->interval.reg->interval_start);
529
530   interval_add(ctx, _child);
531}
532
533static void
534ra_file_init(struct ra_file *file)
535{
536   for (unsigned i = 0; i < file->size; i++) {
537      BITSET_SET(file->available, i);
538      BITSET_SET(file->available_to_evict, i);
539   }
540
541   rb_tree_init(&file->reg_ctx.intervals);
542   rb_tree_init(&file->physreg_intervals);
543
544   file->reg_ctx.interval_add = interval_add;
545   file->reg_ctx.interval_delete = interval_delete;
546   file->reg_ctx.interval_readd = interval_readd;
547}
548
549static void
550ra_file_insert(struct ra_file *file, struct ra_interval *interval)
551{
552   assert(interval->physreg_start < interval->physreg_end);
553   assert(interval->physreg_end <= file->size);
554   if (interval->interval.reg->flags & IR3_REG_HALF)
555      assert(interval->physreg_end <= RA_HALF_SIZE);
556
557   ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
558}
559
560static void
561ra_file_remove(struct ra_file *file, struct ra_interval *interval)
562{
563   ir3_reg_interval_remove(&file->reg_ctx, &interval->interval);
564}
565
566static void
567ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval)
568{
569   assert(!interval->interval.parent);
570
571   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
572      BITSET_SET(file->available, i);
573   }
574
575   interval->is_killed = true;
576}
577
578static void
579ra_file_unmark_killed(struct ra_file *file, struct ra_interval *interval)
580{
581   assert(!interval->interval.parent);
582
583   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
584      BITSET_CLEAR(file->available, i);
585   }
586
587   interval->is_killed = false;
588}
589
590static physreg_t
591ra_interval_get_physreg(const struct ra_interval *interval)
592{
593   unsigned child_start = interval->interval.reg->interval_start;
594
595   while (interval->interval.parent) {
596      interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
597   }
598
599   return interval->physreg_start +
600          (child_start - interval->interval.reg->interval_start);
601}
602
603static unsigned
604ra_interval_get_num(const struct ra_interval *interval)
605{
606   return ra_physreg_to_num(ra_interval_get_physreg(interval),
607                            interval->interval.reg->flags);
608}
609
610static void
611ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
612{
613   ir3_reg_interval_init(&interval->interval, reg);
614   interval->is_killed = false;
615   interval->frozen = false;
616}
617
618static void
619ra_interval_dump(struct log_stream *stream, struct ra_interval *interval)
620{
621   mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start);
622
623   ir3_reg_interval_dump(stream, &interval->interval);
624}
625
626static void
627ra_file_dump(struct log_stream *stream, struct ra_file *file)
628{
629   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
630                    physreg_node) {
631      ra_interval_dump(stream, interval);
632   }
633
634   unsigned start, end;
635   mesa_log_stream_printf(stream, "available:\n");
636   BITSET_FOREACH_RANGE (start, end, file->available, file->size) {
637      mesa_log_stream_printf(stream, "%u-%u ", start, end);
638   }
639   mesa_log_stream_printf(stream, "\n");
640
641   mesa_log_stream_printf(stream, "available to evict:\n");
642   BITSET_FOREACH_RANGE (start, end, file->available_to_evict, file->size) {
643      mesa_log_stream_printf(stream, "%u-%u ", start, end);
644   }
645   mesa_log_stream_printf(stream, "\n");
646   mesa_log_stream_printf(stream, "start: %u\n", file->start);
647}
648
649static void
650ra_ctx_dump(struct ra_ctx *ctx)
651{
652   struct log_stream *stream = mesa_log_streami();
653   mesa_log_stream_printf(stream, "full:\n");
654   ra_file_dump(stream, &ctx->full);
655   mesa_log_stream_printf(stream, "half:\n");
656   ra_file_dump(stream, &ctx->half);
657   mesa_log_stream_printf(stream, "shared:");
658   ra_file_dump(stream, &ctx->shared);
659   mesa_log_stream_destroy(stream);
660}
661
662static unsigned
663reg_file_size(struct ra_file *file, struct ir3_register *reg)
664{
665   /* Half-regs can only take up the first half of the combined regfile */
666   if (reg->flags & IR3_REG_HALF)
667      return MIN2(file->size, RA_HALF_SIZE);
668   else
669      return file->size;
670}
671
672/* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple
673 * top-level intervals at once. Pop multiple intervals, then push them back in
674 * any order.
675 */
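
/* For example, swapping two top-level intervals "a" and "b" looks like this
 * (a minimal sketch; it mirrors the killed/conflicting swap performed in
 * try_evict_regs() below):
 *
 *    physreg_t a_start = a->physreg_start, b_start = b->physreg_start;
 *    struct ra_removed_interval ra = ra_pop_interval(ctx, file, a);
 *    struct ra_removed_interval rb = ra_pop_interval(ctx, file, b);
 *    ra_push_interval(ctx, file, &ra, b_start);
 *    ra_push_interval(ctx, file, &rb, a_start);
 */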
676
677struct ra_removed_interval {
678   struct ra_interval *interval;
679   unsigned size;
680};
681
682static struct ra_removed_interval
683ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,
684                struct ra_interval *interval)
685{
686   assert(!interval->interval.parent);
687
688   /* Check if we've already moved this reg before */
689   unsigned pcopy_index;
690   for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count;
691        pcopy_index++) {
692      if (ctx->parallel_copies[pcopy_index].interval == interval)
693         break;
694   }
695
696   if (pcopy_index == ctx->parallel_copies_count) {
697      array_insert(ctx, ctx->parallel_copies,
698                   (struct ra_parallel_copy){
699                      .interval = interval,
700                      .src = interval->physreg_start,
701                   });
702   }
703
704   ir3_reg_interval_remove_temp(&file->reg_ctx, &interval->interval);
705
706   return (struct ra_removed_interval){
707      .interval = interval,
708      .size = interval->physreg_end - interval->physreg_start,
709   };
710}
711
712static void
713ra_push_interval(struct ra_ctx *ctx, struct ra_file *file,
714                 const struct ra_removed_interval *removed, physreg_t dst)
715{
716   struct ra_interval *interval = removed->interval;
717
718   interval->physreg_start = dst;
719   interval->physreg_end = dst + removed->size;
720
721   assert(interval->physreg_end <= file->size);
722   if (interval->interval.reg->flags & IR3_REG_HALF)
723      assert(interval->physreg_end <= RA_HALF_SIZE);
724
725   ir3_reg_interval_reinsert(&file->reg_ctx, &interval->interval);
726}
727
728/* Pick up the interval and place it at "dst". */
729static void
730ra_move_interval(struct ra_ctx *ctx, struct ra_file *file,
731                 struct ra_interval *interval, physreg_t dst)
732{
733   struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval);
734   ra_push_interval(ctx, file, &temp, dst);
735}
736
737static struct ra_file *
738ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
739{
740   if (reg->flags & IR3_REG_SHARED)
741      return &ctx->shared;
742   else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
743      return &ctx->full;
744   else
745      return &ctx->half;
746}
747
748
749/* Returns true if the proposed spot for "dst" or a killed source overlaps a
750 * destination that's been allocated.
751 */
752static bool
753check_dst_overlap(struct ra_ctx *ctx, struct ra_file *file,
754                  struct ir3_register *dst, physreg_t start,
755                  physreg_t end)
756{
757   struct ir3_instruction *instr = dst->instr;
758
759   ra_foreach_dst (other_dst, instr) {
760      /* We assume only destinations before the current one have been allocated.
761       */
762      if (other_dst == dst)
763         break;
764
765      if (ra_get_file(ctx, other_dst) != file)
766         continue;
767
768      struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
769      assert(!other_interval->interval.parent);
770      physreg_t other_start = other_interval->physreg_start;
771      physreg_t other_end = other_interval->physreg_end;
772
773      if (other_end > start && end > other_start)
774         return true;
775   }
776
777   return false;
778}
779
780/* True if the destination is "early-clobber," meaning that it cannot be
781 * allocated over killed sources. Some destinations always require it, but it
782 * also is implicitly true for tied destinations whose source is live-through.
783 * If the source is killed, then we skip allocating a register for the
784 * destination altogether so we don't need to worry about that case here.
785 */
786static bool
787is_early_clobber(struct ir3_register *reg)
788{
789   return (reg->flags & IR3_REG_EARLY_CLOBBER) || reg->tied;
790}
791
792static bool
793get_reg_specified(struct ra_ctx *ctx, struct ra_file *file,
794                  struct ir3_register *reg, physreg_t physreg, bool is_source)
795{
796   for (unsigned i = 0; i < reg_size(reg); i++) {
797      if (!BITSET_TEST(is_early_clobber(reg) || is_source ?
798                           file->available_to_evict : file->available,
799                       physreg + i))
800         return false;
801   }
802
803   if (!is_source &&
804       check_dst_overlap(ctx, file, reg, physreg, physreg + reg_size(reg)))
805      return false;
806
807   return true;
808}
809
810/* Try to evict any registers conflicting with the proposed spot "physreg" for
811 * "reg". That is, move them to other places so that we can allocate "physreg"
812 * here.
813 */
814
815static bool
816try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
817               struct ir3_register *reg, physreg_t physreg,
818               unsigned *_eviction_count, bool is_source, bool speculative)
819{
820   BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
821   memcpy(available_to_evict, file->available_to_evict,
822          sizeof(available_to_evict));
823
824   BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
825   memcpy(available, file->available, sizeof(available));
826
827   for (unsigned i = 0; i < reg_size(reg); i++) {
828      BITSET_CLEAR(available_to_evict, physreg + i);
829      BITSET_CLEAR(available, physreg + i);
830   }
831
832   unsigned eviction_count = 0;
833   /* Iterate over each range conflicting with physreg */
834   for (struct ra_interval *conflicting = ra_file_search_right(file, physreg),
835                           *next = ra_interval_next_or_null(conflicting);
836        conflicting != NULL &&
837        conflicting->physreg_start < physreg + reg_size(reg);
838        conflicting = next, next = ra_interval_next_or_null(next)) {
839      if (!is_early_clobber(reg) && !is_source && conflicting->is_killed)
840         continue;
841
842      if (conflicting->frozen) {
843         assert(speculative);
844         return false;
845      }
846
847      unsigned conflicting_file_size =
848         reg_file_size(file, conflicting->interval.reg);
849      unsigned avail_start, avail_end;
850      bool evicted = false;
851      BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict,
852                            conflicting_file_size) {
853         unsigned size = avail_end - avail_start;
854
855         /* non-half registers must be aligned */
856         if (!(conflicting->interval.reg->flags & IR3_REG_HALF) &&
857             avail_start % 2 == 1) {
858            avail_start++;
859            size--;
860         }
861
862         unsigned conflicting_size =
863            conflicting->physreg_end - conflicting->physreg_start;
864         if (size >= conflicting_size &&
865             !check_dst_overlap(ctx, file, reg, avail_start, avail_start +
866                                conflicting_size)) {
867            for (unsigned i = 0;
868                 i < conflicting->physreg_end - conflicting->physreg_start; i++)
869               BITSET_CLEAR(available_to_evict, avail_start + i);
870            eviction_count +=
871               conflicting->physreg_end - conflicting->physreg_start;
872            if (!speculative)
873               ra_move_interval(ctx, file, conflicting, avail_start);
874            evicted = true;
875            break;
876         }
877      }
878
879      if (evicted)
880         continue;
881
882      /* If we couldn't evict this range, we may be able to swap it with a
883       * killed range to achieve the same effect.
884       */
885      foreach_interval (killed, file) {
886         if (!killed->is_killed)
887            continue;
888
889         if (killed->physreg_end - killed->physreg_start !=
890             conflicting->physreg_end - conflicting->physreg_start)
891            continue;
892
893         if (killed->physreg_end > conflicting_file_size ||
894             conflicting->physreg_end > reg_file_size(file, killed->interval.reg))
895            continue;
896
897         /* We can't swap the killed range if it partially/fully overlaps the
898          * space we're trying to allocate or (in speculative mode) if it's
899          * already been swapped and will overlap when we actually evict.
900          */
901         bool killed_available = true;
902         for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
903            if (!BITSET_TEST(available, i)) {
904               killed_available = false;
905               break;
906            }
907         }
908
909         if (!killed_available)
910            continue;
911
912         if (check_dst_overlap(ctx, file, reg, killed->physreg_start,
913                               killed->physreg_end))
914            continue;
915
916         /* Check for alignment if one is a full reg */
917         if ((!(killed->interval.reg->flags & IR3_REG_HALF) ||
918              !(conflicting->interval.reg->flags & IR3_REG_HALF)) &&
919             (killed->physreg_start % 2 != 0 ||
920              conflicting->physreg_start % 2 != 0))
921            continue;
922
923         for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
924            BITSET_CLEAR(available, i);
925         }
926         /* Because this will generate swaps instead of moves, multiply the
927          * cost by 2.
928          */
929         eviction_count += (killed->physreg_end - killed->physreg_start) * 2;
930         if (!speculative) {
931            physreg_t killed_start = killed->physreg_start,
932                      conflicting_start = conflicting->physreg_start;
933            struct ra_removed_interval killed_removed =
934               ra_pop_interval(ctx, file, killed);
935            struct ra_removed_interval conflicting_removed =
936               ra_pop_interval(ctx, file, conflicting);
937            ra_push_interval(ctx, file, &killed_removed, conflicting_start);
938            ra_push_interval(ctx, file, &conflicting_removed, killed_start);
939         }
940
941         evicted = true;
942         break;
943      }
944
945      if (!evicted)
946         return false;
947   }
948
949   *_eviction_count = eviction_count;
950   return true;
951}
952
953static int
954removed_interval_cmp(const void *_i1, const void *_i2)
955{
956   const struct ra_removed_interval *i1 = _i1;
957   const struct ra_removed_interval *i2 = _i2;
958
959   /* We sort the registers as follows:
960    *
961    * |------------------------------------------------------------------------------------------|
962    * |               |                    |        |        |                    |              |
963    * |  Half         | Half early-clobber | Half   | Full   | Full early-clobber | Full         |
964    * |  live-through | destination        | killed | killed | destination        | live-through |
965    * |               |                    |        |        |                    |              |
966    * |------------------------------------------------------------------------------------------|
967    *                                      |                 |
968    *                                      |   Destination   |
969    *                                      |                 |
970    *                                      |-----------------|
971    *
972    * Half-registers have to be first so that they stay in the low half of
973    * the register file. Then half and full killed must stay together so that
974    * there's a contiguous range where we can put the register. With this
975    * structure we should be able to accommodate any collection of intervals
976    * such that the total number of half components is within the half limit
977    * and the combined components are within the full limit.
978    */
979
980   unsigned i1_align = reg_elem_size(i1->interval->interval.reg);
981   unsigned i2_align = reg_elem_size(i2->interval->interval.reg);
982   if (i1_align > i2_align)
983      return 1;
984   if (i1_align < i2_align)
985      return -1;
986
987   if (i1_align == 1) {
988      if (i2->interval->is_killed)
989         return -1;
990      if (i1->interval->is_killed)
991         return 1;
992   } else {
993      if (i2->interval->is_killed)
994         return 1;
995      if (i1->interval->is_killed)
996         return -1;
997   }
998
999   return 0;
1000}
1001
1002static int
1003dsts_cmp(const void *_i1, const void *_i2)
1004{
1005   struct ir3_register *i1 = *(struct ir3_register *const *) _i1;
1006   struct ir3_register *i2 = *(struct ir3_register *const *) _i2;
1007
1008   /* Treat tied destinations as if they are live-through sources, and normal
1009    * destinations as killed sources.
1010    */
1011   unsigned i1_align = reg_elem_size(i1);
1012   unsigned i2_align = reg_elem_size(i2);
1013   if (i1_align > i2_align)
1014      return 1;
1015   if (i1_align < i2_align)
1016      return -1;
1017
1018   if (i1_align == 1) {
1019      if (!is_early_clobber(i2))
1020         return -1;
1021      if (!is_early_clobber(i1))
1022         return 1;
1023   } else {
1024      if (!is_early_clobber(i2))
1025         return 1;
1026      if (!is_early_clobber(i1))
1027         return -1;
1028   }
1029
1030   return 0;
1031}
1032
1033/* "Compress" all the live intervals so that there is enough space for the
1034 * destination register. As there can be gaps when a more-aligned interval
1035 * follows a less-aligned interval, this also sorts them to remove such
1036 * "padding", which may be required when space is very tight.  This isn't
1037 * amazing, but should be used only as a last resort in case the register file
1038 * is almost full and badly fragmented.
1039 *
1040 * Return the physreg to use.
1041 */
1042static physreg_t
1043compress_regs_left(struct ra_ctx *ctx, struct ra_file *file,
1044                   struct ir3_register *reg)
1045{
1046   unsigned align = reg_elem_size(reg);
1047   DECLARE_ARRAY(struct ra_removed_interval, intervals);
1048   intervals_count = intervals_sz = 0;
1049   intervals = NULL;
1050
1051   DECLARE_ARRAY(struct ir3_register *, dsts);
1052   dsts_count = dsts_sz = 0;
1053   dsts = NULL;
1054   array_insert(ctx, dsts, reg);
1055   bool dst_inserted[reg->instr->dsts_count];
   /* Zero-initialize; entries are tested in the loop below before being set. */
   memset(dst_inserted, 0, sizeof(dst_inserted));
1056
1057   unsigned dst_size = reg->tied ? 0 : reg_size(reg);
1058   unsigned ec_dst_size = is_early_clobber(reg) ? reg_size(reg) : 0;
1059   unsigned half_dst_size = 0, ec_half_dst_size = 0;
1060   if (align == 1) {
1061      half_dst_size = dst_size;
1062      ec_half_dst_size = ec_dst_size;
1063   }
1064
1065   unsigned removed_size = 0, removed_half_size = 0;
1066   unsigned removed_killed_size = 0, removed_killed_half_size = 0;
1067   unsigned file_size =
1068      align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;
1069   physreg_t start_reg = 0;
1070
1071   foreach_interval_rev_safe (interval, file) {
1072      /* We'll check if we can compact the intervals starting here. */
1073      physreg_t candidate_start = interval->physreg_end;
1074
1075      /* Check if there are any other destinations we need to compact. */
1076      ra_foreach_dst_n (other_dst, n, reg->instr) {
1077         if (other_dst == reg)
1078            break;
1079         if (ra_get_file(ctx, other_dst) != file)
1080            continue;
1081         if (dst_inserted[n])
1082            continue;
1083
1084         struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
1085         /* if the destination partially overlaps this interval, we need to
1086          * extend candidate_start to the end.
1087          */
1088         if (other_interval->physreg_start < candidate_start) {
1089            candidate_start = MAX2(candidate_start,
1090                                   other_interval->physreg_end);
1091            continue;
1092         }
1093
1094         dst_inserted[n] = true;
1095
1096         /* dst intervals with a tied killed source are considered attached to
1097          * that source. Don't actually insert them. This means we have to
1098          * update them below if their tied source moves.
1099          */
1100         if (other_dst->tied) {
1101            struct ra_interval *tied_interval =
1102               &ctx->intervals[other_dst->tied->def->name];
1103            if (tied_interval->is_killed)
1104               continue;
1105         }
1106
1107         d("popping destination %u physreg %u\n",
1108           other_interval->interval.reg->name,
1109           other_interval->physreg_start);
1110
1111         array_insert(ctx, dsts, other_dst);
1112         unsigned interval_size = reg_size(other_dst);
1113         if (is_early_clobber(other_dst)) {
1114            ec_dst_size += interval_size;
1115            if (other_interval->interval.reg->flags & IR3_REG_HALF)
1116               ec_half_dst_size += interval_size;
1117         } else {
1118            dst_size += interval_size;
1119            if (other_interval->interval.reg->flags & IR3_REG_HALF)
1120               half_dst_size += interval_size;
1121         }
1122      }
1123
1124      /* Check if we can sort the intervals *after* this one and have enough
1125       * space leftover to accommodate all intervals, keeping in mind that killed
1126       * sources overlap non-tied destinations. Also check that we have enough
1127       * space leftover for half-registers, if we're inserting a half-register
1128       * (otherwise we only shift any half-registers down so they should be
1129       * safe).
1130       */
1131      if (candidate_start + removed_size + ec_dst_size +
1132          MAX2(removed_killed_size, dst_size) <= file->size &&
1133          (align != 1 ||
1134           candidate_start + removed_half_size + ec_half_dst_size +
1135           MAX2(removed_killed_half_size, half_dst_size) <= file_size)) {
1136         start_reg = candidate_start;
1137         break;
1138      }
1139
1140      /* We assume that all frozen intervals are at the start and that we
1141       * can avoid popping them.
1142       */
1143      assert(!interval->frozen);
1144
1145      /* Killed sources are different because they go at the end and can
1146       * overlap the register we're trying to add.
1147       */
1148      unsigned interval_size = interval->physreg_end - interval->physreg_start;
1149      if (interval->is_killed) {
1150         removed_killed_size += interval_size;
1151         if (interval->interval.reg->flags & IR3_REG_HALF)
1152            removed_killed_half_size += interval_size;
1153      } else {
1154         removed_size += interval_size;
1155         if (interval->interval.reg->flags & IR3_REG_HALF)
1156            removed_half_size += interval_size;
1157      }
1158
1159      /* Now that we've done the accounting, pop this off */
1160      d("popping interval %u physreg %u%s\n", interval->interval.reg->name,
1161        interval->physreg_start, interval->is_killed ? ", killed" : "");
1162      array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));
1163   }
1164
1165   /* TODO: In addition to skipping registers at the beginning that are
1166    * well-packed, we should try to skip registers at the end.
1167    */
1168
1169   qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);
1170   qsort(dsts, dsts_count, sizeof(*dsts), dsts_cmp);
1171
1172   physreg_t live_reg = start_reg;
1173   physreg_t dst_reg = (physreg_t)~0;
1174   physreg_t ret_reg = (physreg_t)~0;
1175   unsigned dst_index = 0;
1176   unsigned live_index = 0;
1177
1178   /* We have two lists of intervals to process: live intervals and destination
1179    * intervals. Process them in the order of the diagram in removed_interval_cmp().
1180    */
1181   while (live_index < intervals_count || dst_index < dsts_count) {
1182      bool process_dst;
1183      if (live_index == intervals_count) {
1184         process_dst = true;
1185      } else if (dst_index == dsts_count) {
1186         process_dst = false;
1187      } else {
1188         struct ir3_register *dst = dsts[dst_index];
1189         struct ra_interval *live_interval = intervals[live_index].interval;
1190
1191         bool live_half = live_interval->interval.reg->flags & IR3_REG_HALF;
1192         bool live_killed = live_interval->is_killed;
1193         bool dst_half = dst->flags & IR3_REG_HALF;
1194         bool dst_early_clobber = is_early_clobber(dst);
1195
1196         if (live_half && !live_killed) {
1197            /* far-left of diagram. */
1198            process_dst = false;
1199         } else if (dst_half && dst_early_clobber) {
1200            /* mid-left of diagram. */
1201            process_dst = true;
1202         } else if (!dst_early_clobber) {
1203            /* bottom of diagram. */
1204            process_dst = true;
1205         } else if (live_killed) {
1206            /* middle of diagram. */
1207            process_dst = false;
1208         } else if (!dst_half && dst_early_clobber) {
1209            /* mid-right of diagram. */
1210            process_dst = true;
1211         } else {
1212            /* far right of diagram. */
1213            assert(!live_killed && !live_half);
1214            process_dst = false;
1215         }
1216      }
1217
1218      struct ir3_register *cur_reg =
1219         process_dst ? dsts[dst_index] :
1220         intervals[live_index].interval->interval.reg;
1221
1222      physreg_t physreg;
1223      if (process_dst && !is_early_clobber(cur_reg)) {
1224         if (dst_reg == (physreg_t)~0)
1225            dst_reg = live_reg;
1226         physreg = dst_reg;
1227      } else {
1228         physreg = live_reg;
1229         struct ra_interval *live_interval = intervals[live_index].interval;
1230         bool live_killed = live_interval->is_killed;
1231         /* If this is live-through and we've processed the destinations, we
1232          * need to make sure we take into account any overlapping destinations.
1233          */
1234         if (!live_killed && dst_reg != (physreg_t)~0)
1235            physreg = MAX2(physreg, dst_reg);
1236      }
1237
1238      if (!(cur_reg->flags & IR3_REG_HALF))
1239         physreg = ALIGN(physreg, 2);
1240
1241      d("pushing reg %u physreg %u\n", cur_reg->name, physreg);
1242
1243      unsigned interval_size = reg_size(cur_reg);
1244      if (physreg + interval_size >
1245          reg_file_size(file, cur_reg)) {
1246         d("ran out of room for interval %u!\n",
1247           cur_reg->name);
1248         unreachable("reg pressure calculation was wrong!");
1249         return 0;
1250      }
1251
1252      if (process_dst) {
1253         if (cur_reg == reg) {
1254            ret_reg = physreg;
1255         } else {
1256            struct ra_interval *interval = &ctx->intervals[cur_reg->name];
1257            interval->physreg_start = physreg;
1258            interval->physreg_end = physreg + interval_size;
1259         }
1260         dst_index++;
1261      } else {
1262         ra_push_interval(ctx, file, &intervals[live_index], physreg);
1263         live_index++;
1264      }
1265
1266      physreg += interval_size;
1267
1268      if (process_dst && !is_early_clobber(cur_reg)) {
1269         dst_reg = physreg;
1270      } else {
1271         live_reg = physreg;
1272      }
1273   }
1274
1275   /* If we shuffled around a tied source that is killed, we may have to update
1276    * its corresponding destination since we didn't insert it above.
1277    */
1278   ra_foreach_dst (dst, reg->instr) {
1279      if (dst == reg)
1280         break;
1281
1282      struct ir3_register *tied = dst->tied;
1283      if (!tied)
1284         continue;
1285
1286      struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1287      if (!tied_interval->is_killed)
1288         continue;
1289
1290      struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1291      unsigned dst_size = reg_size(dst);
1292      dst_interval->physreg_start = ra_interval_get_physreg(tied_interval);
1293      dst_interval->physreg_end = dst_interval->physreg_start + dst_size;
1294   }
1295
1296   return ret_reg;
1297}
1298
1299static void
1300update_affinity(struct ra_file *file, struct ir3_register *reg,
1301                physreg_t physreg)
1302{
1303   if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0)
1304      return;
1305
1306   if (physreg < reg->merge_set_offset)
1307      return;
1308
1309   if ((physreg - reg->merge_set_offset + reg->merge_set->size) > file->size)
1310      return;
1311
1312   reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;
1313}
1314
1315/* Try to find free space for a register without shuffling anything. This uses
1316 * a round-robin algorithm to reduce false dependencies.
1317 */
1318static physreg_t
1319find_best_gap(struct ra_ctx *ctx, struct ra_file *file,
1320              struct ir3_register *dst, unsigned file_size, unsigned size,
1321              unsigned align)
1322{
1323   /* This can happen if we create a very large merge set. Just bail out in that
1324    * case.
1325    */
1326   if (size > file_size)
1327      return (physreg_t) ~0;
1328
1329   BITSET_WORD *available =
1330      is_early_clobber(dst) ? file->available_to_evict : file->available;
1331
1332   unsigned start = ALIGN(file->start, align) % (file_size - size + align);
1333   unsigned candidate = start;
1334   do {
1335      bool is_available = true;
1336      for (unsigned i = 0; i < size; i++) {
1337         if (!BITSET_TEST(available, candidate + i)) {
1338            is_available = false;
1339            break;
1340         }
1341      }
1342
1343      if (is_available) {
1344         is_available =
1345            !check_dst_overlap(ctx, file, dst, candidate, candidate + size);
1346      }
1347
1348      if (is_available) {
1349         file->start = (candidate + size) % file_size;
1350         return candidate;
1351      }
1352
1353      candidate += align;
1354      if (candidate + size > file_size)
1355         candidate = 0;
1356   } while (candidate != start);
1357
1358   return (physreg_t)~0;
1359}
1360
1361/* This is the main entrypoint for picking a register. Pick a free register
1362 * for "reg", shuffling around sources if necessary. In the normal case where
1363 * "is_source" is false, this register can overlap with killed sources
1364 * (intervals with "is_killed == true"). If "is_source" is true, then
1365 * is_killed is ignored and the register returned must not overlap with killed
1366 * sources. This must be used for tied registers, because we're actually
1367 * allocating the destination and the tied source at the same time.
1368 */
1369
1370static physreg_t
1371get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg)
1372{
1373   unsigned file_size = reg_file_size(file, reg);
1374   if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
1375      physreg_t preferred_reg =
1376         reg->merge_set->preferred_reg + reg->merge_set_offset;
1377      if (preferred_reg + reg_size(reg) <= file_size &&
1378          preferred_reg % reg_elem_size(reg) == 0 &&
1379          get_reg_specified(ctx, file, reg, preferred_reg, false))
1380         return preferred_reg;
1381   }
1382
1383   /* If this register is a subset of a merge set which we have not picked a
1384    * register for, first try to allocate enough space for the entire merge
1385    * set.
1386    */
1387   unsigned size = reg_size(reg);
1388   if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
1389       size < reg->merge_set->size) {
1390      physreg_t best_reg = find_best_gap(ctx, file, reg, file_size,
1391                                         reg->merge_set->size,
1392                                         reg->merge_set->alignment);
1393      if (best_reg != (physreg_t)~0u) {
1394         best_reg += reg->merge_set_offset;
1395         return best_reg;
1396      }
1397   }
1398
1399   /* For ALU and SFU instructions, if a src reg is available to pick, use it.
1400    * This doesn't introduce unnecessary dependencies, and it potentially
1401    * avoids needing (ss) syncs for write-after-read hazards on SFU
1402    * instructions:
1403    */
1404   if (is_sfu(reg->instr) || is_alu(reg->instr)) {
1405      for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
1406         struct ir3_register *src = reg->instr->srcs[i];
1407         if (!ra_reg_is_src(src))
1408            continue;
1409         if (ra_get_file(ctx, src) == file && reg_size(src) >= size) {
1410            struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1411            physreg_t src_physreg = ra_interval_get_physreg(src_interval);
1412            if (src_physreg % reg_elem_size(reg) == 0 &&
1413                src_physreg + size <= file_size &&
1414                get_reg_specified(ctx, file, reg, src_physreg, false))
1415               return src_physreg;
1416         }
1417      }
1418   }
1419
1420   physreg_t best_reg =
1421      find_best_gap(ctx, file, reg, file_size, size, reg_elem_size(reg));
1422   if (best_reg != (physreg_t)~0u) {
1423      return best_reg;
1424   }
1425
1426   /* Ok, we couldn't find anything that fits. Here is where we have to start
1427    * moving things around to make stuff fit. First try solely evicting
1428    * registers in the way.
1429    */
1430   unsigned best_eviction_count = ~0;
1431   for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {
1432      unsigned eviction_count;
1433      if (try_evict_regs(ctx, file, reg, i, &eviction_count, false, true)) {
1434         if (eviction_count < best_eviction_count) {
1435            best_eviction_count = eviction_count;
1436            best_reg = i;
1437         }
1438      }
1439   }
1440
1441   if (best_eviction_count != ~0) {
1442      ASSERTED bool result = try_evict_regs(
1443         ctx, file, reg, best_reg, &best_eviction_count, false, false);
1444      assert(result);
1445      return best_reg;
1446   }
1447
1448   /* Use the dumb fallback only if try_evict_regs() fails. */
1449   return compress_regs_left(ctx, file, reg);
1450}
1451
1452static void
1453assign_reg(struct ir3_instruction *instr, struct ir3_register *reg,
1454           unsigned num)
1455{
1456   if (reg->flags & IR3_REG_ARRAY) {
1457      reg->array.base = num;
1458      if (reg->flags & IR3_REG_RELATIV)
1459         reg->array.offset += num;
1460      else
1461         reg->num = num + reg->array.offset;
1462   } else {
1463      reg->num = num;
1464   }
1465}
1466
1467static void
1468mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src)
1469{
1470   struct ra_interval *interval = &ctx->intervals[src->def->name];
1471
1472   if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed ||
1473       interval->interval.parent ||
1474       !rb_tree_is_empty(&interval->interval.children))
1475      return;
1476
1477   ra_file_mark_killed(ra_get_file(ctx, src), interval);
1478}
1479
1480static void
1481insert_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1482{
1483   struct ra_file *file = ra_get_file(ctx, dst);
1484   struct ra_interval *interval = &ctx->intervals[dst->name];
1485
1486   d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval));
1487
1488   if (!(dst->flags & IR3_REG_UNUSED))
1489      ra_file_insert(file, interval);
1490
1491   assign_reg(dst->instr, dst, ra_interval_get_num(interval));
1492}
1493
1494static void
1495allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst,
1496                   physreg_t physreg)
1497{
1498   struct ra_file *file = ra_get_file(ctx, dst);
1499   struct ra_interval *interval = &ctx->intervals[dst->name];
1500   update_affinity(file, dst, physreg);
1501
1502   ra_interval_init(interval, dst);
1503   interval->physreg_start = physreg;
1504   interval->physreg_end = physreg + reg_size(dst);
1505}
1506
1507/* If a tied destination interferes with its source register, we have to insert
1508 * a copy beforehand to copy the source to the destination. Because we are using
1509 * the parallel_copies array and not creating a separate copy, this copy will
1510 * happen in parallel with any shuffling around of the tied source, so we have
1511 * to copy the source *as it exists before it is shuffled around*. We do this by
1512 * inserting the copy early, before any other copies are inserted. We don't
1513 * actually know the destination of the copy, but that's ok because the
1514 * dst_interval will be filled out later.
1515 */
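/* Concretely (an illustrative scenario): if the tied source currently lives in
 * r3.x but a later parallel-copy entry will shuffle it to r5.x, the entry
 * recorded here still reads r3.x, which is what we want because every entry of
 * the parallel copy reads its source before any destination is written.
 */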
1516static void
1517insert_tied_dst_copy(struct ra_ctx *ctx, struct ir3_register *dst)
1518{
1519   struct ir3_register *tied = dst->tied;
1520
1521   if (!tied)
1522      return;
1523
1524   struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1525   struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1526
1527   if (tied_interval->is_killed)
1528      return;
1529
1530   physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
1531
1532   array_insert(ctx, ctx->parallel_copies,
1533                (struct ra_parallel_copy){
1534                   .interval = dst_interval,
1535                   .src = tied_physreg,
1536                });
1537}
1538
1539static void
1540allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1541{
1542   struct ra_file *file = ra_get_file(ctx, dst);
1543
1544   struct ir3_register *tied = dst->tied;
1545   if (tied) {
1546      struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1547      if (tied_interval->is_killed) {
1548         /* The easy case: the source is killed, so we can just reuse it
1549          * for the destination.
1550          */
1551         allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));
1552         return;
1553      }
1554   }
1555
1556   /* All the hard work is done by get_reg here. */
1557   physreg_t physreg = get_reg(ctx, file, dst);
1558
1559   allocate_dst_fixed(ctx, dst, physreg);
1560}
1561
1562static void
1563assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
1564           struct ir3_register *src)
1565{
1566   struct ra_interval *interval = &ctx->intervals[src->def->name];
1567   struct ra_file *file = ra_get_file(ctx, src);
1568
1569   struct ir3_register *tied = src->tied;
1570   physreg_t physreg;
1571   if (tied) {
1572      struct ra_interval *tied_interval = &ctx->intervals[tied->name];
1573      physreg = ra_interval_get_physreg(tied_interval);
1574   } else {
1575      physreg = ra_interval_get_physreg(interval);
1576   }
1577
1578   assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags));
1579
1580   if (src->flags & IR3_REG_FIRST_KILL)
1581      ra_file_remove(file, interval);
1582}
1583
1584/* Insert a parallel copy instruction before the instruction with the parallel
1585 * copy entries we've built up.
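 *
 * The result is a single OPC_META_PARALLEL_COPY whose i-th destination is
 * the i-th entry's interval (at its final location) and whose i-th source is
 * that entry's recorded physreg, so all the copies happen "at once":
 *
 *    dst0, dst1, ... = parallel_copy src0, src1, ...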
1586 */
1587static void
1588insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1589{
1590   if (ctx->parallel_copies_count == 0)
1591      return;
1592
1593   struct ir3_instruction *pcopy =
1594      ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY,
1595                       ctx->parallel_copies_count, ctx->parallel_copies_count);
1596
1597   for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1598      struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1599      struct ir3_register *reg =
1600         ir3_dst_create(pcopy, INVALID_REG,
1601                        entry->interval->interval.reg->flags &
1602                        (IR3_REG_HALF | IR3_REG_ARRAY));
1603      reg->size = entry->interval->interval.reg->size;
1604      reg->wrmask = entry->interval->interval.reg->wrmask;
1605      assign_reg(pcopy, reg, ra_interval_get_num(entry->interval));
1606   }
1607
1608   for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1609      struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1610      struct ir3_register *reg =
1611         ir3_src_create(pcopy, INVALID_REG,
1612                        entry->interval->interval.reg->flags &
1613                        (IR3_REG_HALF | IR3_REG_ARRAY));
1614      reg->size = entry->interval->interval.reg->size;
1615      reg->wrmask = entry->interval->interval.reg->wrmask;
1616      assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags));
1617   }
1618
1619   list_del(&pcopy->node);
1620   list_addtail(&pcopy->node, &instr->node);
1621   ctx->parallel_copies_count = 0;
1622}
1623
1624static void
1625handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1626{
1627   /* First, mark sources as going-to-be-killed while allocating the dest. */
1628   ra_foreach_src (src, instr) {
1629      mark_src_killed(ctx, src);
1630   }
1631
1632   /* Pre-insert tied dst copies. */
1633   ra_foreach_dst (dst, instr) {
1634      insert_tied_dst_copy(ctx, dst);
1635   }
1636
1637   /* Allocate the destination. */
1638   ra_foreach_dst (dst, instr) {
1639      allocate_dst(ctx, dst);
1640   }
1641
1642   /* Now handle sources. Go backwards so that if there are multiple sources
1643    * with the same def, and that def is killed, we only remove it at the
1644    * end.
1645    */
1646   ra_foreach_src_rev (src, instr) {
1647      assign_src(ctx, instr, src);
1648   }
1649
1650   /* Now finally insert the destination into the map. */
1651   ra_foreach_dst (dst, instr) {
1652      insert_dst(ctx, dst);
1653   }
1654
1655   insert_parallel_copy_instr(ctx, instr);
1656}
1657
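/* Split destinations that share a merge set with their source are not
 * allocated independently; they are pinned at the corresponding offset
 * inside the source's already-allocated interval, so no move is needed.
 */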
1658static void
1659handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
1660{
1661   struct ir3_register *dst = instr->dsts[0];
1662   struct ir3_register *src = instr->srcs[0];
1663
1664   if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
1665      handle_normal_instr(ctx, instr);
1666      return;
1667   }
1668
1669   struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1670
1671   physreg_t physreg = ra_interval_get_physreg(src_interval);
1672   assign_src(ctx, instr, src);
1673
1674   allocate_dst_fixed(
1675      ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset);
1676   insert_dst(ctx, dst);
1677}
1678
1679static void
1680handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
1681{
1682   struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;
1683   unsigned dst_offset = instr->dsts[0]->merge_set_offset;
1684
1685   if (!dst_set || dst_set->regs_count == 1) {
1686      handle_normal_instr(ctx, instr);
1687      return;
1688   }
1689
1690   /* We need to check if any of the sources are contained in an interval
1691    * that is at least as large as the vector. In this case, we should put
1692    * the vector inside that larger interval. (There should be one
1693    * unambiguous place to put it, because values sharing the same merge set
1694    * should be allocated together.) This can happen in a case like:
1695    *
1696    * ssa_1 (wrmask=0xf) = ...
1697    * ssa_2 = split ssa_1 off:0
1698    * ssa_3 = split ssa_1 off:1
1699    * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3
1700    * ... = (kill)ssa_1
1701    * ... = (kill)ssa_4
1702    *
1703    * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it.
1704    */
1705   physreg_t dst_fixed = (physreg_t)~0u;
1706
1707   ra_foreach_src (src, instr) {
1708      if (src->flags & IR3_REG_FIRST_KILL) {
1709         mark_src_killed(ctx, src);
1710      }
1711
1712      struct ra_interval *interval = &ctx->intervals[src->def->name];
1713
1714      if (src->def->merge_set != dst_set || interval->is_killed)
1715         continue;
1716      while (interval->interval.parent != NULL) {
1717         interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1718      }
1719      if (reg_size(interval->interval.reg) >= reg_size(instr->dsts[0])) {
1720         dst_fixed = interval->physreg_start -
1721                     interval->interval.reg->merge_set_offset + dst_offset;
1722      } else {
1723         /* For sources whose root interval is smaller than the
1724          * destination (i.e. the normal case), we will shuffle them
1725          * around after allocating the destination. Mark them killed so
1726          * that the destination can be allocated over them, even if they
1727          * aren't actually killed.
1728          */
1729         ra_file_mark_killed(ra_get_file(ctx, src), interval);
1730      }
1731   }
1732
1733   if (dst_fixed != (physreg_t)~0u)
1734      allocate_dst_fixed(ctx, instr->dsts[0], dst_fixed);
1735   else
1736      allocate_dst(ctx, instr->dsts[0]);
1737
1738   /* Remove the temporary is_killed we added */
1739   ra_foreach_src (src, instr) {
1740      struct ra_interval *interval = &ctx->intervals[src->def->name];
1741      while (interval->interval.parent != NULL) {
1742         interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1743      }
1744
1745      /* Filter out cases where it actually should be killed */
1746      if (interval != &ctx->intervals[src->def->name] ||
1747          !(src->flags & IR3_REG_KILL)) {
1748         ra_file_unmark_killed(ra_get_file(ctx, src), interval);
1749      }
1750   }
1751
1752   ra_foreach_src_rev (src, instr) {
1753      assign_src(ctx, instr, src);
1754   }
1755
1756   /* We need to do this before insert_dst(), so that children of the
1757    * destination which got marked as killed and then shuffled around to make
1758    * space for the destination have the correct pcopy destination that
1759    * matches what we assign the source of the collect to in assign_src().
1760    *
1761    * TODO: In this case we'll wind up copying the value in the pcopy and
1762    * then again in the collect. We could avoid one of those by updating the
1763    * pcopy destination to match up with the final location of the source
1764    * after the collect and making the collect a no-op. However this doesn't
1765    * seem to happen often.
1766    */
1767   insert_parallel_copy_instr(ctx, instr);
1768
1769   /* Note: insert_dst will automatically shuffle around any intervals that
1770    * fall inside the collect's destination by making them children of it.
1771    */
1772
1773   insert_dst(ctx, instr->dsts[0]);
1774}
1775
1776/* Parallel copies before RA should only be at the end of the block, for
1777 * phis. For these we only need to fill in the sources, and then we fill in
1778 * the destinations in the successor block.
1779 */
1780static void
1781handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)
1782{
1783   ra_foreach_src_rev (src, instr) {
1784      assign_src(ctx, instr, src);
1785   }
1786}
1787
1788/* Some inputs may need to be precolored. We need to handle those first, so
1789 * that other non-precolored inputs don't accidentally get allocated over
1790 * them. Inputs are the very first thing in the shader, so it shouldn't be a
1791 * problem to allocate them to a specific physreg.
1792 */
1793
1794static void
1795handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1796{
1797   if (instr->dsts[0]->num == INVALID_REG)
1798      return;
1799
1800   struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1801   struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1802   physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]);
1803   allocate_dst_fixed(ctx, instr->dsts[0], physreg);
1804
1805   d("insert precolored dst %u physreg %u", instr->dsts[0]->name,
1806     ra_interval_get_physreg(interval));
1807
1808   ra_file_insert(file, interval);
1809   interval->frozen = true;
1810}
1811
1812static void
1813handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1814{
1815   if (instr->dsts[0]->num != INVALID_REG)
1816      return;
1817
1818   allocate_dst(ctx, instr->dsts[0]);
1819
1820   struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1821   struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1822   ra_file_insert(file, interval);
1823}
1824
1825static void
1826assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1827{
1828   struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1829   struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1830
1831   if (instr->dsts[0]->num == INVALID_REG) {
1832      assign_reg(instr, instr->dsts[0], ra_interval_get_num(interval));
1833   } else {
1834      interval->frozen = false;
1835   }
1836
1837   if (instr->dsts[0]->flags & IR3_REG_UNUSED)
1838      ra_file_remove(file, interval);
1839
1840   ra_foreach_src_rev (src, instr)
1841      assign_src(ctx, instr, src);
1842}
1843
1844/* chmask is a bit weird, because it has pre-colored sources due to the need
1845 * to pass some registers to the next stage. Fortunately there are only at
1846 * most two, and there should be no other live values by the time we get to
1847 * this instruction, so we only have to do the minimum and don't need any
1848 * fancy fallbacks.
1849 *
1850 * TODO: Add more complete handling of precolored sources, e.g. for function
1851 * argument handling. We'd need a way to mark sources as fixed so that they
1852 * don't get moved around when placing other sources in the fallback case, and
1853 * a duplication of much of the logic in get_reg(). This also opens another
1854 * can of worms, e.g. what if the precolored source is a split of a vector
1855 * which is still live -- this breaks our assumption that splits don't incur
1856 * any "extra" register requirements and we'd have to break it out of the
1857 * parent ra_interval.
1858 */
1859
1860static void
1861handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src)
1862{
1863   struct ra_file *file = ra_get_file(ctx, src);
1864   struct ra_interval *interval = &ctx->intervals[src->def->name];
1865   physreg_t physreg = ra_reg_get_physreg(src);
1866
1867   if (ra_interval_get_num(interval) == src->num)
1868      return;
1869
1870   /* Try evicting stuff in our way if it isn't free. This won't move
1871    * anything unless it overlaps with our precolored physreg, so we don't
1872    * have to worry about evicting other precolored sources.
1873    */
1874   if (!get_reg_specified(ctx, file, src, physreg, true)) {
1875      unsigned eviction_count;
1876      if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true,
1877                          false)) {
1878         unreachable("failed to evict for precolored source!");
1879         return;
1880      }
1881   }
1882
1883   ra_move_interval(ctx, file, interval, physreg);
1884}
1885
1886static void
1887handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr)
1888{
1889   /* Note: we purposely don't mark sources as killed, so that we can reuse
1890    * some of the get_reg() machinery as if the source were a destination.
1891    * Marking it as killed would mean that e.g. get_reg_specified() wouldn't
1892    * work correctly.
1893    */
1894   ra_foreach_src (src, instr) {
1895      assert(src->num != INVALID_REG);
1896      handle_precolored_source(ctx, src);
1897   }
1898
1899   ra_foreach_src (src, instr) {
1900      struct ra_file *file = ra_get_file(ctx, src);
1901      struct ra_interval *interval = &ctx->intervals[src->def->name];
1902      if (src->flags & IR3_REG_FIRST_KILL)
1903         ra_file_remove(file, interval);
1904   }
1905
1906   insert_parallel_copy_instr(ctx, instr);
1907}
1908
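/* Return the physreg that "def" lives in at the end of "block", taking into
 * account any rename recorded there when its live range was split.
 */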
1909static physreg_t
1910read_register(struct ra_ctx *ctx, struct ir3_block *block,
1911              struct ir3_register *def)
1912{
1913   struct ra_block_state *state = &ctx->blocks[block->index];
1914   if (state->renames) {
1915      struct hash_entry *entry = _mesa_hash_table_search(state->renames, def);
1916      if (entry) {
1917         return (physreg_t)(uintptr_t)entry->data;
1918      }
1919   }
1920
1921   return ra_reg_get_physreg(def);
1922}
1923
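/* A live-in value keeps the physreg it had in the first already-visited
 * predecessor; insert_live_in_moves() then inserts fixup copies in any
 * already-visited predecessor where it lives somewhere else.
 */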
1924static void
1925handle_live_in(struct ra_ctx *ctx, struct ir3_register *def)
1926{
1927   physreg_t physreg = ~0;
1928   for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
1929      struct ir3_block *pred = ctx->block->predecessors[i];
1930      struct ra_block_state *pred_state = &ctx->blocks[pred->index];
1931
1932      if (!pred_state->visited)
1933         continue;
1934
1935      physreg = read_register(ctx, pred, def);
1936      break;
1937   }
1938
1939   assert(physreg != (physreg_t)~0);
1940
1941   struct ra_interval *interval = &ctx->intervals[def->name];
1942   struct ra_file *file = ra_get_file(ctx, def);
1943   ra_interval_init(interval, def);
1944   interval->physreg_start = physreg;
1945   interval->physreg_end = physreg + reg_size(def);
1946   ra_file_insert(file, interval);
1947}
1948
1949static void
1950handle_live_out(struct ra_ctx *ctx, struct ir3_register *def)
1951{
1952   /* Skip parallelcopies which in the original program are only used as phi
1953    * arguments. Even though phi arguments are live out, they are only
1954    * assigned when the phi is.
1955    */
1956   if (def->instr->opc == OPC_META_PARALLEL_COPY)
1957      return;
1958
1959   struct ra_block_state *state = &ctx->blocks[ctx->block->index];
1960   struct ra_interval *interval = &ctx->intervals[def->name];
1961   physreg_t physreg = ra_interval_get_physreg(interval);
1962   if (physreg != ra_reg_get_physreg(def)) {
1963      if (!state->renames)
1964         state->renames = _mesa_pointer_hash_table_create(ctx);
1965      _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg);
1966   }
1967}
1968
1969static void
1970handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
1971{
1972   struct ra_file *file = ra_get_file(ctx, def);
1973   struct ra_interval *interval = &ctx->intervals[def->name];
1974
1975   /* phis are always scalar, so they should already be the smallest possible
1976    * size. However they may be coalesced with other live-in values/phi
1977    * nodes, so check for that here.
1978    */
1979   struct ir3_reg_interval *parent_ir3 =
1980      ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start);
1981   physreg_t physreg;
1982   if (parent_ir3) {
1983      struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3);
1984      physreg = ra_interval_get_physreg(parent) +
1985                (def->interval_start - parent_ir3->reg->interval_start);
1986   } else {
1987      physreg = get_reg(ctx, file, def);
1988   }
1989
1990   allocate_dst_fixed(ctx, def, physreg);
1991
1992   ra_file_insert(file, interval);
1993}
1994
1995static void
1996assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
1997{
1998   struct ra_file *file = ra_get_file(ctx, phi->dsts[0]);
1999   struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name];
2000   assert(!interval->interval.parent);
2001   unsigned num = ra_interval_get_num(interval);
2002   assign_reg(phi, phi->dsts[0], num);
2003
2004   /* Assign the parallelcopy sources of this phi */
2005   for (unsigned i = 0; i < phi->srcs_count; i++) {
2006      if (phi->srcs[i]->def) {
2007         assign_reg(phi, phi->srcs[i], num);
2008         assign_reg(phi, phi->srcs[i]->def, num);
2009      }
2010   }
2011
2012   if (phi->dsts[0]->flags & IR3_REG_UNUSED)
2013      ra_file_remove(file, interval);
2014}
2015
2016/* When we split a live range, we sometimes need to emit fixup code at the end
2017 * of a block. For example, something like:
2018 *
2019 * a = ...
2020 * if (...) {
2021 *    ...
2022 *    a' = a
2023 *    b = ... // a evicted to make room for b
2024 *    ...
2025 * }
2026 * ... = a
2027 *
2028 * When we insert the copy to a' in insert_parallel_copy_instr(), this forces
2029 * When we insert the copy to a' in insert_parallel_copy_instr(), this forces
2030 * us to insert another copy "a = a'" at the end of the if. Normally this would
2031 * anyway we just insert an extra move. Note, however, that "b" might be used
2032 * in a phi node at the end of the if and share registers with "a", so we
2033 * have to be careful to extend any preexisting parallelcopy instruction
2034 * instead of creating our own in order to guarantee that they properly get
2035 * swapped.
2036 */
2037
2038static void
2039insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
2040                    struct ir3_register *reg)
2041{
2042   struct ir3_instruction *old_pcopy = NULL;
2043   if (!list_is_empty(&block->instr_list)) {
2044      struct ir3_instruction *last =
2045         list_entry(block->instr_list.prev, struct ir3_instruction, node);
2046      if (last->opc == OPC_META_PARALLEL_COPY)
2047         old_pcopy = last;
2048   }
2049
2050   unsigned old_pcopy_srcs = old_pcopy ? old_pcopy->srcs_count : 0;
2051   struct ir3_instruction *pcopy = ir3_instr_create(
2052      block, OPC_META_PARALLEL_COPY, old_pcopy_srcs + 1, old_pcopy_srcs + 1);
2053
2054   for (unsigned i = 0; i < old_pcopy_srcs; i++) {
2055      old_pcopy->dsts[i]->instr = pcopy;
2056      pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i];
2057   }
2058
2059   unsigned flags = reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
2060
2061   struct ir3_register *dst_reg = ir3_dst_create(pcopy, INVALID_REG, flags);
2062   dst_reg->wrmask = reg->wrmask;
2063   dst_reg->size = reg->size;
2064   assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags));
2065
2066   for (unsigned i = 0; i < old_pcopy_srcs; i++) {
2067      pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i];
2068   }
2069
2070   struct ir3_register *src_reg = ir3_src_create(pcopy, INVALID_REG, flags);
2071   src_reg->wrmask = reg->wrmask;
2072   src_reg->size = reg->size;
2073   assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags));
2074
2075   if (old_pcopy)
2076      list_del(&old_pcopy->node);
2077}
2078
2079static void
2080insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
2081{
2082   physreg_t physreg = ra_interval_get_physreg(interval);
2083
2084   bool shared = interval->interval.reg->flags & IR3_REG_SHARED;
2085   struct ir3_block **predecessors =
2086      shared ? ctx->block->physical_predecessors : ctx->block->predecessors;
2087   unsigned predecessors_count = shared
2088                                    ? ctx->block->physical_predecessors_count
2089                                    : ctx->block->predecessors_count;
2090
2091   for (unsigned i = 0; i < predecessors_count; i++) {
2092      struct ir3_block *pred = predecessors[i];
2093      struct ra_block_state *pred_state = &ctx->blocks[pred->index];
2094
2095      if (!pred_state->visited)
2096         continue;
2097
2098      physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);
2099      if (pred_reg != physreg) {
2100         insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);
2101
2102         /* This is a bit tricky, but when visiting the destination of a
2103          * physical-only edge, we have two predecessors (the if and the
2104          * header block) and both have multiple successors. We pick the
2105          * register for all live-ins from the normal edge, which should
2106          * guarantee that there's no need for shuffling things around in
2107          * the normal predecessor as long as there are no phi nodes, but
2108          * we still may need to insert fixup code in the physical
2109          * predecessor (i.e. the last block of the if) and that has
2110          * another successor (the block after the if) so we need to update
2111          * the renames state for when we process the other successor. This
2112          * crucially depends on the other successor getting processed
2113          * after this.
2114          *
2115          * For normal (non-physical) edges we disallow critical edges so
2116          * that hacks like this aren't necessary.
2117          */
2118         if (!pred_state->renames)
2119            pred_state->renames = _mesa_pointer_hash_table_create(ctx);
2120         _mesa_hash_table_insert(pred_state->renames, interval->interval.reg,
2121                                 (void *)(uintptr_t)physreg);
2122      }
2123   }
2124}
2125
2126static void
2127insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file)
2128{
2129   BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index];
2130   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2131                    physreg_node) {
2132      /* Skip phi nodes. This needs to happen after phi nodes are allocated,
2133       * because we may have to move live-ins around to make space for phi
2134       * nodes, but we shouldn't be handling phi nodes here.
2135       */
2136      if (BITSET_TEST(live_in, interval->interval.reg->name))
2137         insert_live_in_move(ctx, interval);
2138   }
2139}
2140
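/* Record where every value in this file lives at the start of the block, so
 * that predecessors visited later can insert fixup copies (see
 * insert_live_out_move()).
 */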
2141static void
2142insert_entry_regs(struct ra_block_state *state, struct ra_file *file)
2143{
2144   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2145                    physreg_node) {
2146      _mesa_hash_table_insert(state->entry_regs, interval->interval.reg,
2147                              (void *)(uintptr_t)interval->physreg_start);
2148   }
2149}
2150
2151static void
2152insert_live_in_moves(struct ra_ctx *ctx)
2153{
2154   insert_file_live_in_moves(ctx, &ctx->full);
2155   insert_file_live_in_moves(ctx, &ctx->half);
2156   insert_file_live_in_moves(ctx, &ctx->shared);
2157
2158   /* If not all predecessors have been visited yet, record our entry registers
2159    * so that insert_live_out_moves() in those predecessors can fix things up.
2160    */
2161   bool all_preds_visited = true;
2162   for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
2163      if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) {
2164         all_preds_visited = false;
2165         break;
2166      }
2167   }
2168
2169   if (!all_preds_visited) {
2170      struct ra_block_state *state = &ctx->blocks[ctx->block->index];
2171      state->entry_regs = _mesa_pointer_hash_table_create(ctx);
2172
2173      insert_entry_regs(state, &ctx->full);
2174      insert_entry_regs(state, &ctx->half);
2175      insert_entry_regs(state, &ctx->shared);
2176   }
2177}
2178
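/* For each already-visited successor that recorded its entry registers, emit
 * a copy for any live value that the successor expects in a different
 * physreg than the one it currently occupies.
 */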
2179static void
2180insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval)
2181{
2182   for (unsigned i = 0; i < 2; i++) {
2183      if (!ctx->block->successors[i])
2184         continue;
2185
2186      struct ir3_block *succ = ctx->block->successors[i];
2187      struct ra_block_state *succ_state = &ctx->blocks[succ->index];
2188
2189      if (!succ_state->visited)
2190         continue;
2191
2192      struct hash_entry *entry = _mesa_hash_table_search(
2193         succ_state->entry_regs, interval->interval.reg);
2194      if (!entry)
2195         continue;
2196
2197      physreg_t new_reg = (physreg_t)(uintptr_t)entry->data;
2198      if (new_reg != interval->physreg_start) {
2199         insert_liveout_copy(ctx->block, new_reg, interval->physreg_start,
2200                             interval->interval.reg);
2201      }
2202   }
2203}
2204
2205static void
2206insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file)
2207{
2208   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2209                    physreg_node) {
2210      insert_live_out_move(ctx, interval);
2211   }
2212}
2213
2214static void
2215insert_live_out_moves(struct ra_ctx *ctx)
2216{
2217   insert_file_live_out_moves(ctx, &ctx->full);
2218   insert_file_live_out_moves(ctx, &ctx->half);
2219   insert_file_live_out_moves(ctx, &ctx->shared);
2220}
2221
2222static void
2223handle_block(struct ra_ctx *ctx, struct ir3_block *block)
2224{
2225   ctx->block = block;
2226
2227   /* Reset the register files from the last block */
2228   ra_file_init(&ctx->full);
2229   ra_file_init(&ctx->half);
2230   ra_file_init(&ctx->shared);
2231
2232   /* Handle live-ins, phis, and input meta-instructions. These all appear
2233    * live at the beginning of the block, interfere with each other, and
2234    * therefore need to be allocated "in parallel". This means that we
2235    * have to allocate all of them, inserting them into the file, and then
2236    * delay updating the IR until all of them are allocated.
2237    *
2238    * Handle precolored inputs first, because we need to make sure that other
2239    * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes
2240    * and inputs at the same time, because the first block doesn't have
2241    * predecessors. Therefore handle_live_in doesn't have to worry about
2242    * them.
2243    */
2244
2245   foreach_instr (instr, &block->instr_list) {
2246      if (instr->opc == OPC_META_INPUT)
2247         handle_precolored_input(ctx, instr);
2248      else
2249         break;
2250   }
2251
2252   unsigned name;
2253   BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
2254                       ctx->live->definitions_count) {
2255      struct ir3_register *reg = ctx->live->definitions[name];
2256      handle_live_in(ctx, reg);
2257   }
2258
2259   foreach_instr (instr, &block->instr_list) {
2260      if (instr->opc == OPC_META_PHI)
2261         handle_phi(ctx, instr->dsts[0]);
2262      else if (instr->opc == OPC_META_INPUT ||
2263               instr->opc == OPC_META_TEX_PREFETCH)
2264         handle_input(ctx, instr);
2265      else
2266         break;
2267   }
2268
2269   /* After this point, every live-in/phi/input has an interval assigned to
2270    * it. We delay actually assigning values until everything has been
2271    * allocated, so we can simply ignore any parallel copy entries created
2272    * when shuffling them around.
2273    */
2274   ctx->parallel_copies_count = 0;
2275
2276   insert_live_in_moves(ctx);
2277
2278   if (RA_DEBUG) {
2279      d("after live-in block %u:\n", block->index);
2280      ra_ctx_dump(ctx);
2281   }
2282
2283   /* Now we're done with processing live-ins, and can handle the body of the
2284    * block.
2285    */
2286   foreach_instr (instr, &block->instr_list) {
2287      di(instr, "processing");
2288
2289      if (instr->opc == OPC_META_PHI)
2290         assign_phi(ctx, instr);
2291      else if (instr->opc == OPC_META_INPUT ||
2292               instr->opc == OPC_META_TEX_PREFETCH)
2293         assign_input(ctx, instr);
2294      else if (instr->opc == OPC_META_SPLIT)
2295         handle_split(ctx, instr);
2296      else if (instr->opc == OPC_META_COLLECT)
2297         handle_collect(ctx, instr);
2298      else if (instr->opc == OPC_META_PARALLEL_COPY)
2299         handle_pcopy(ctx, instr);
2300      else if (instr->opc == OPC_CHMASK)
2301         handle_chmask(ctx, instr);
2302      else
2303         handle_normal_instr(ctx, instr);
2304
2305      if (RA_DEBUG)
2306         ra_ctx_dump(ctx);
2307   }
2308
2309   insert_live_out_moves(ctx);
2310
2311   BITSET_FOREACH_SET (name, ctx->live->live_out[block->index],
2312                       ctx->live->definitions_count) {
2313      struct ir3_register *reg = ctx->live->definitions[name];
2314      handle_live_out(ctx, reg);
2315   }
2316
2317   ctx->blocks[block->index].visited = true;
2318}
2319
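/* Pick how much of the full register file RA may use. Start from the minimum
 * register count implied by the peak pressure, then grow the target as long
 * as doing so neither changes the threadsize decision nor reduces the
 * achievable wave count, so RA gets as much room as possible without hurting
 * occupancy.
 */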
2320static unsigned
2321calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure)
2322{
2323   /* Registers are allocated in units of vec4, so switch from units of
2324    * half-regs to vec4.
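    * For example (with an arbitrary pressure value), a peak pressure of 21
    * half-regs becomes DIV_ROUND_UP(21, 2 * 4) = 3 vec4 registers.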
2325    */
2326   unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4);
2327
2328   bool double_threadsize = ir3_should_double_threadsize(v, reg_count);
2329
2330   unsigned target = reg_count;
2331   unsigned reg_independent_max_waves =
2332      ir3_get_reg_independent_max_waves(v, double_threadsize);
2333   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
2334      v->compiler, reg_count, double_threadsize);
2335   unsigned target_waves =
2336      MIN2(reg_independent_max_waves, reg_dependent_max_waves);
2337
2338   while (target <= RA_FULL_SIZE / (2 * 4) &&
2339          ir3_should_double_threadsize(v, target) == double_threadsize &&
2340          ir3_get_reg_dependent_max_waves(v->compiler, target,
2341                                          double_threadsize) >= target_waves)
2342      target++;
2343
2344   return (target - 1) * 2 * 4;
2345}
2346
2347static void
2348add_pressure(struct ir3_pressure *pressure, struct ir3_register *reg,
2349             bool merged_regs)
2350{
2351   unsigned size = reg_size(reg);
2352   if (reg->flags & IR3_REG_HALF)
2353      pressure->half += size;
2354   if (!(reg->flags & IR3_REG_HALF) || merged_regs)
2355      pressure->full += size;
2356}
2357
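/* calc_min_limit_pressure() below only needs the interval-tree bookkeeping,
 * not the add/delete/readd callbacks, so stub them out.
 */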
2358static void
2359dummy_interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
2360{
2361}
2362
2363static void
2364dummy_interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
2365{
2366}
2367
2368static void
2369dummy_interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *parent,
2370                     struct ir3_reg_interval *child)
2371{
2372}
2373
2374/* Calculate the minimum possible limit on register pressure so that spilling
2375 * still succeeds. Used to implement IR3_SHADER_DEBUG=spillall.
2376 */
2377
2378static void
2379calc_min_limit_pressure(struct ir3_shader_variant *v,
2380                        struct ir3_liveness *live,
2381                        struct ir3_pressure *limit)
2382{
2383   struct ir3_block *start = ir3_start_block(v->ir);
2384   struct ir3_reg_ctx *ctx = ralloc(NULL, struct ir3_reg_ctx);
2385   struct ir3_reg_interval *intervals =
2386      rzalloc_array(ctx, struct ir3_reg_interval, live->definitions_count);
2387
2388   ctx->interval_add = dummy_interval_add;
2389   ctx->interval_delete = dummy_interval_delete;
2390   ctx->interval_readd = dummy_interval_readd;
2391
2392   limit->full = limit->half = 0;
2393
2394   struct ir3_pressure cur_pressure = {0};
2395   foreach_instr (input, &start->instr_list) {
2396      if (input->opc != OPC_META_INPUT &&
2397          input->opc != OPC_META_TEX_PREFETCH)
2398         break;
2399
2400      add_pressure(&cur_pressure, input->dsts[0], v->mergedregs);
2401   }
2402
2403   limit->full = MAX2(limit->full, cur_pressure.full);
2404   limit->half = MAX2(limit->half, cur_pressure.half);
2405
2406   foreach_instr (input, &start->instr_list) {
2407      if (input->opc != OPC_META_INPUT &&
2408          input->opc != OPC_META_TEX_PREFETCH)
2409         break;
2410
2411      /* Pre-colored inputs may have holes, which increase the pressure. */
2412      struct ir3_register *dst = input->dsts[0];
2413      if (dst->num != INVALID_REG) {
2414         unsigned physreg = ra_reg_get_physreg(dst) + reg_size(dst);
2415         if (dst->flags & IR3_REG_HALF)
2416            limit->half = MAX2(limit->half, physreg);
2417         if (!(dst->flags & IR3_REG_HALF) || v->mergedregs)
2418            limit->full = MAX2(limit->full, physreg);
2419      }
2420   }
2421
2422   foreach_block (block, &v->ir->block_list) {
2423      rb_tree_init(&ctx->intervals);
2424
2425      unsigned name;
2426      BITSET_FOREACH_SET (name, live->live_in[block->index],
2427                          live->definitions_count) {
2428         struct ir3_register *reg = live->definitions[name];
2429         ir3_reg_interval_init(&intervals[reg->name], reg);
2430         ir3_reg_interval_insert(ctx, &intervals[reg->name]);
2431      }
2432
2433      foreach_instr (instr, &block->instr_list) {
2434         ra_foreach_dst (dst, instr) {
2435            ir3_reg_interval_init(&intervals[dst->name], dst);
2436         }
2437         /* phis and parallel copies can be deleted via spilling */
2438
2439         if (instr->opc == OPC_META_PHI) {
2440            ir3_reg_interval_insert(ctx, &intervals[instr->dsts[0]->name]);
2441            continue;
2442         }
2443
2444         if (instr->opc == OPC_META_PARALLEL_COPY)
2445            continue;
2446
2447         cur_pressure = (struct ir3_pressure) {0};
2448
2449         ra_foreach_dst (dst, instr) {
2450            if (dst->tied && !(dst->tied->flags & IR3_REG_KILL))
2451               add_pressure(&cur_pressure, dst, v->mergedregs);
2452         }
2453
2454         ra_foreach_src_rev (src, instr) {
2455            /* We currently don't support spilling the parent of a source when
2456             * making space for sources, so we have to keep track of the
2457             * intervals and walk up to the root of the tree to figure out how
2458             * much space we need.
2459             *
2460             * TODO: We should probably support this in the spiller.
2461             */
2462            struct ir3_reg_interval *interval = &intervals[src->def->name];
2463            while (interval->parent)
2464               interval = interval->parent;
2465            add_pressure(&cur_pressure, interval->reg, v->mergedregs);
2466
2467            if (src->flags & IR3_REG_FIRST_KILL)
2468               ir3_reg_interval_remove(ctx, &intervals[src->def->name]);
2469         }
2470
2471         limit->full = MAX2(limit->full, cur_pressure.full);
2472         limit->half = MAX2(limit->half, cur_pressure.half);
2473
2474         cur_pressure = (struct ir3_pressure) {0};
2475
2476         ra_foreach_dst (dst, instr) {
2477            ir3_reg_interval_init(&intervals[dst->name], dst);
2478            ir3_reg_interval_insert(ctx, &intervals[dst->name]);
2479            add_pressure(&cur_pressure, dst, v->mergedregs);
2480         }
2481
2482         limit->full = MAX2(limit->full, cur_pressure.full);
2483         limit->half = MAX2(limit->half, cur_pressure.half);
2484      }
2485   }
2486
2487   /* Account for the base register, which needs to be available everywhere. */
2488   limit->full += 2;
2489
2490   ralloc_free(ctx);
2491}
2492
2493/*
2494 * If barriers are used, it must be possible for all waves in the workgroup
2495 * to execute concurrently. Thus we may have to reduce the register limit.
2496 */
2497static void
2498calc_limit_pressure_for_cs_with_barrier(struct ir3_shader_variant *v,
2499                                        struct ir3_pressure *limit_pressure)
2500{
2501   const struct ir3_compiler *compiler = v->compiler;
2502
2503   unsigned threads_per_wg;
2504   if (v->local_size_variable) {
2505      /* We have to expect the worst case. */
2506      threads_per_wg = compiler->max_variable_workgroup_size;
2507   } else {
2508      threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
2509   }
2510
2511   /* The register file is divided into reg_size_vec4 parts. Each part has
2512    * enough registers to give a single vec4 register to each thread of a
2513    * single-sized wave-pair. With double threadsize, each wave-pair
2514    * consumes two parts of the register file to get a single vec4 per
2515    * thread. The more active wave-pairs there are, the fewer parts each
2516    * one could get.
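    *
    * A rough worked example (the numbers are illustrative, not taken from any
    * particular gen): with reg_size_vec4 = 96, threadsize_base = 64,
    * wave_granularity = 2, single threadsize and 1024 threads per workgroup,
    * waves_per_wg = DIV_ROUND_UP(1024, 64 * 2) = 8, so each thread can get
    * 96 / 8 = 12 vec4s, i.e. 12 * 4 * 2 = 96 half-regs.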
2517    */
2518
2519   bool double_threadsize = ir3_should_double_threadsize(v, 0);
2520   unsigned waves_per_wg = DIV_ROUND_UP(
2521      threads_per_wg, compiler->threadsize_base * (double_threadsize ? 2 : 1) *
2522                         compiler->wave_granularity);
2523
2524   uint32_t vec4_regs_per_thread =
2525      compiler->reg_size_vec4 / (waves_per_wg * (double_threadsize ? 2 : 1));
2526   assert(vec4_regs_per_thread > 0);
2527
2528   uint32_t half_regs_per_thread = vec4_regs_per_thread * 4 * 2;
2529
2530   if (limit_pressure->full > half_regs_per_thread) {
2531      if (v->mergedregs) {
2532         limit_pressure->full = half_regs_per_thread;
2533      } else {
2534         /* TODO: Handle !mergedregs case, probably we would have to do this
2535          * after the first register pressure pass.
2536          */
2537      }
2538   }
2539}
2540
2541int
2542ir3_ra(struct ir3_shader_variant *v)
2543{
2544   ir3_calc_dominance(v->ir);
2545
2546   ir3_create_parallel_copies(v->ir);
2547
2548   struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);
2549
2550   ctx->merged_regs = v->mergedregs;
2551   ctx->compiler = v->compiler;
2552   ctx->stage = v->type;
2553
2554   struct ir3_liveness *live = ir3_calc_liveness(ctx, v->ir);
2555
2556   ir3_debug_print(v->ir, "AFTER: create_parallel_copies");
2557
2558   ir3_merge_regs(live, v->ir);
2559
2560   struct ir3_pressure max_pressure;
2561   ir3_calc_pressure(v, live, &max_pressure);
2562   d("max pressure:");
2563   d("\tfull: %u", max_pressure.full);
2564   d("\thalf: %u", max_pressure.half);
2565   d("\tshared: %u", max_pressure.shared);
2566
2567   struct ir3_pressure limit_pressure;
2568   limit_pressure.full = RA_FULL_SIZE;
2569   limit_pressure.half = RA_HALF_SIZE;
2570   limit_pressure.shared = RA_SHARED_SIZE;
2571
2572   if (gl_shader_stage_is_compute(v->type) && v->has_barrier) {
2573      calc_limit_pressure_for_cs_with_barrier(v, &limit_pressure);
2574   }
2575
2576   /* If the user forces a doubled threadsize, we may have to lower the limit
2577    * because on some gens the register file is not big enough to hold a
2578    * double-size wave with all 48 registers in use.
2579    */
2580   if (v->real_wavesize == IR3_DOUBLE_ONLY) {
2581      limit_pressure.full =
2582         MAX2(limit_pressure.full, ctx->compiler->reg_size_vec4 / 2 * 16);
2583   }
2584
2585   /* If requested, lower the limit so that spilling happens more often. */
2586   if (ir3_shader_debug & IR3_DBG_SPILLALL)
2587      calc_min_limit_pressure(v, live, &limit_pressure);
2588
2589   if (max_pressure.shared > limit_pressure.shared) {
2590      /* TODO shared reg -> normal reg spilling */
2591      d("shared max pressure exceeded!");
2592      goto fail;
2593   }
2594
2595   bool spilled = false;
2596   if (max_pressure.full > limit_pressure.full ||
2597       max_pressure.half > limit_pressure.half) {
2598      if (!v->compiler->has_pvtmem) {
2599         d("max pressure exceeded!");
2600         goto fail;
2601      }
2602      d("max pressure exceeded, spilling!");
2603      IR3_PASS(v->ir, ir3_spill, v, &live, &limit_pressure);
2604      ir3_calc_pressure(v, live, &max_pressure);
2605      assert(max_pressure.full <= limit_pressure.full &&
2606             max_pressure.half <= limit_pressure.half);
2607      spilled = true;
2608   }
2609
2610   ctx->live = live;
2611   ctx->intervals =
2612      rzalloc_array(ctx, struct ra_interval, live->definitions_count);
2613   ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count);
2614
2615   ctx->full.size = calc_target_full_pressure(v, max_pressure.full);
2616   d("full size: %u", ctx->full.size);
2617
2618   if (!v->mergedregs)
2619      ctx->half.size = RA_HALF_SIZE;
2620
2621   ctx->shared.size = RA_SHARED_SIZE;
2622
2623   ctx->full.start = ctx->half.start = ctx->shared.start = 0;
2624
2625   foreach_block (block, &v->ir->block_list)
2626      handle_block(ctx, block);
2627
2628   ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count);
2629
2630   /* Strip array-ness and SSA-ness at the end, because various helpers still
2631    * need to work even on definitions that have already been assigned. For
2632    * example, we need to preserve array-ness so that array live-ins have the
2633    * right size.
2634    */
2635   foreach_block (block, &v->ir->block_list) {
2636      foreach_instr (instr, &block->instr_list) {
2637         for (unsigned i = 0; i < instr->dsts_count; i++) {
2638            instr->dsts[i]->flags &= ~IR3_REG_SSA;
2639
2640            /* Parallel copies of array registers copy the whole register, and
2641             * we need some way to let the parallel copy code know that this was
2642             * an array whose size is determined by reg->size. So keep the array
2643             * flag on those. spill/reload also need to work on the entire
2644             * array.
2645             */
2646            if (!is_meta(instr) && instr->opc != OPC_RELOAD_MACRO)
2647               instr->dsts[i]->flags &= ~IR3_REG_ARRAY;
2648         }
2649
2650         for (unsigned i = 0; i < instr->srcs_count; i++) {
2651            instr->srcs[i]->flags &= ~IR3_REG_SSA;
2652
2653            if (!is_meta(instr) && instr->opc != OPC_SPILL_MACRO)
2654               instr->srcs[i]->flags &= ~IR3_REG_ARRAY;
2655         }
2656      }
2657   }
2658
2659   ir3_debug_print(v->ir, "AFTER: register allocation");
2660
2661   if (spilled) {
2662      IR3_PASS(v->ir, ir3_lower_spill);
2663   }
2664
2665   ir3_lower_copies(v);
2666
2667   ir3_debug_print(v->ir, "AFTER: ir3_lower_copies");
2668
2669   ralloc_free(ctx);
2670
2671   return 0;
2672fail:
2673   ralloc_free(ctx);
2674   return -1;
2675}
2676