/*
 * Copyright (c) 2017 Lima Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#include "gpir.h"
#include "util/u_dynarray.h"

/* Per-register information */

struct reg_info {
   BITSET_WORD *conflicts;
   struct util_dynarray conflict_list;

   unsigned num_conflicts;

   int assigned_color;

   bool visited;
};

struct regalloc_ctx {
   unsigned bitset_words;
   struct reg_info *registers;

   /* Reusable scratch liveness array */
   BITSET_WORD *live;

   unsigned *worklist;
   unsigned worklist_start, worklist_end;

   unsigned *stack;
   unsigned stack_size;

   gpir_compiler *comp;
   void *mem_ctx;
};

/* Liveness analysis */
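/* Liveness is a standard backward dataflow problem: each register read
 * (load_reg) adds the register to the live set (GEN) and each register
 * write (store_reg) removes it (KILL). Blocks are visited in reverse order
 * and the analysis is iterated until no live_in set changes.
 */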

static void propagate_liveness_node(gpir_node *node, BITSET_WORD *live,
                                    gpir_compiler *comp)
{
   /* KILL */
   if (node->type == gpir_node_type_store) {
      if (node->op == gpir_op_store_reg) {
         gpir_store_node *store = gpir_node_to_store(node);
         BITSET_CLEAR(live, store->reg->index);
      }
   }

   /* GEN */
   if (node->type == gpir_node_type_load) {
      if (node->op == gpir_op_load_reg) {
         gpir_load_node *load = gpir_node_to_load(node);
         BITSET_SET(live, load->reg->index);
      }
   }
}

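/* Recompute live_in for a block: union the live_in sets of its successors
 * into live_out, then walk the nodes bottom-up applying GEN/KILL. Returns
 * true if the block's live_in set changed.
 */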
static bool propagate_liveness_block(gpir_block *block, struct regalloc_ctx *ctx)
{
   for (unsigned i = 0; i < 2; i++) {
      if (block->successors[i]) {
         for (unsigned j = 0; j < ctx->bitset_words; j++)
            block->live_out[j] |= block->successors[i]->live_in[j];
      }
   }

   memcpy(ctx->live, block->live_out, ctx->bitset_words * sizeof(BITSET_WORD));

   list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
      propagate_liveness_node(node, ctx->live, block->comp);
   }

   bool changed = false;
   for (unsigned i = 0; i < ctx->bitset_words; i++) {
      changed |= (block->live_in[i] != ctx->live[i]);
      block->live_in[i] = ctx->live[i];
   }
   return changed;
}

static void calc_def_block(gpir_block *block)
{
   list_for_each_entry(gpir_node, node, &block->node_list, list) {
      if (node->op == gpir_op_store_reg) {
         gpir_store_node *store = gpir_node_to_store(node);
         BITSET_SET(block->def_out, store->reg->index);
      }
   }
}

static void calc_liveness(struct regalloc_ctx *ctx)
{
   bool changed = true;
   while (changed) {
      changed = false;
      list_for_each_entry_rev(gpir_block, block, &ctx->comp->block_list, list) {
         changed |= propagate_liveness_block(block, ctx);
      }
   }

   list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
      calc_def_block(block);
   }

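   /* Forward-propagate definitions: after this fixpoint loop, def_out for a
    * block contains every register written in the block itself or in any
    * block that can reach it. calc_interference() uses this to ignore
    * registers that are live-out but never actually defined on any path,
    * i.e. undefined values.
    */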
   changed = true;
   while (changed) {
      changed = false;
      list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
         for (unsigned i = 0; i < 2; i++) {
            gpir_block *succ = block->successors[i];
            if (!succ)
               continue;

            for (unsigned j = 0; j < ctx->bitset_words; j++) {
               BITSET_WORD new = block->def_out[j] & ~succ->def_out[j];
               changed |= (new != 0);
               succ->def_out[j] |= block->def_out[j];
            }
         }
      }
   }
}

/* Interference calculation */

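/* Record a symmetric interference edge between registers i and j, both in
 * the conflict bitsets (for O(1) membership tests) and in the per-register
 * conflict lists (for cheap iteration over neighbours).
 */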
static void add_interference(struct regalloc_ctx *ctx, unsigned i, unsigned j)
{
   if (i == j)
      return;

   struct reg_info *a = &ctx->registers[i];
   struct reg_info *b = &ctx->registers[j];

   if (BITSET_TEST(a->conflicts, j))
      return;

   BITSET_SET(a->conflicts, j);
   BITSET_SET(b->conflicts, i);

   a->num_conflicts++;
   b->num_conflicts++;
   util_dynarray_append(&a->conflict_list, unsigned, j);
   util_dynarray_append(&b->conflict_list, unsigned, i);
}

/* Make register "i" interfere with all the currently live registers. */
static void add_all_interferences(struct regalloc_ctx *ctx,
                                  unsigned i,
                                  BITSET_WORD *live_regs)
{
   int live_reg;
   BITSET_FOREACH_SET(live_reg, live_regs, ctx->comp->cur_reg) {
      add_interference(ctx, i, live_reg);
   }
}

static void print_liveness(struct regalloc_ctx *ctx,
                           BITSET_WORD *live_reg)
{
   if (!(lima_debug & LIMA_DEBUG_GP))
      return;

   int live_idx;
   BITSET_FOREACH_SET(live_idx, live_reg, ctx->comp->cur_reg) {
      printf("reg%d ", live_idx);
   }
   printf("\n");
}

static void calc_interference(struct regalloc_ctx *ctx)
{
   list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
      /* Initialize liveness at the end of the block, but exclude values that
       * definitely aren't defined by the end. This helps out with
       * partially-defined registers, like:
       *
       * if (condition) {
       *    foo = ...;
       * }
       * if (condition) {
       *    ... = foo;
       * }
       *
       * If we naively propagated liveness backwards, foo would be live from
       * the beginning of the program, but if we're not inside a loop then
       * its value is undefined before the first if and we don't have to
       * consider it live. Mask out registers like foo here.
       */
      for (unsigned i = 0; i < ctx->bitset_words; i++) {
         ctx->live[i] = block->live_out[i] & block->def_out[i];
      }

      list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
         gpir_debug("processing node %d\n", node->index);
         print_liveness(ctx, ctx->live);
         if (node->op == gpir_op_store_reg) {
            gpir_store_node *store = gpir_node_to_store(node);
            add_all_interferences(ctx, store->reg->index, ctx->live);

            /* KILL */
            BITSET_CLEAR(ctx->live, store->reg->index);
         } else if (node->op == gpir_op_load_reg) {
            /* GEN */
            gpir_load_node *load = gpir_node_to_load(node);
            BITSET_SET(ctx->live, load->reg->index);
         }
      }
   }
}

/* Register allocation */
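/* This is a straightforward graph-coloring allocator in the Chaitin/Briggs
 * style: registers with fewer neighbours in the interference graph than
 * there are colors (GPIR_PHYSICAL_REG_NUM) are repeatedly "simplified" onto
 * a stack; when nothing is simplifiable, one register is pushed
 * optimistically. Colors are then assigned while popping the stack. There is
 * no spilling yet, so coloring can fail.
 */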

static bool can_simplify(struct regalloc_ctx *ctx, unsigned i)
{
   struct reg_info *info = &ctx->registers[i];
   return info->num_conflicts < GPIR_PHYSICAL_REG_NUM;
}

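/* Push register "i" onto the simplification stack and decrement the degree
 * of each of its neighbours; any neighbour that becomes simplifiable as a
 * result is appended to the worklist.
 */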
static void push_stack(struct regalloc_ctx *ctx, unsigned i)
{
   ctx->stack[ctx->stack_size++] = i;
   gpir_debug("pushing reg%u\n", i);

   struct reg_info *info = &ctx->registers[i];
   assert(info->visited);

   util_dynarray_foreach(&info->conflict_list, unsigned, conflict) {
      struct reg_info *conflict_info = &ctx->registers[*conflict];
      assert(conflict_info->num_conflicts > 0);
      conflict_info->num_conflicts--;
      if (!ctx->registers[*conflict].visited && can_simplify(ctx, *conflict)) {
         ctx->worklist[ctx->worklist_end++] = *conflict;
         ctx->registers[*conflict].visited = true;
      }
   }
}

static bool do_regalloc(struct regalloc_ctx *ctx)
{
   ctx->worklist_start = 0;
   ctx->worklist_end = 0;
   ctx->stack_size = 0;

   /* Step 1: find the initially simplifiable registers */
   for (int i = 0; i < ctx->comp->cur_reg; i++) {
      if (can_simplify(ctx, i)) {
         ctx->worklist[ctx->worklist_end++] = i;
         ctx->registers[i].visited = true;
      }
   }

   while (true) {
      /* Step 2: push onto the stack whatever we can */
      while (ctx->worklist_start != ctx->worklist_end) {
         push_stack(ctx, ctx->worklist[ctx->worklist_start++]);
      }

      if (ctx->stack_size < ctx->comp->cur_reg) {
         /* Step 3: if there are still unsimplifiable registers left, we need
          * to optimistically push one onto the stack. Choose the one with
          * the smallest number of current neighbors, since that's the most
          * likely to succeed when we later try to color it.
          */
         unsigned min_conflicts = UINT_MAX;
         unsigned best_reg = 0;
         for (int reg = 0; reg < ctx->comp->cur_reg; reg++) {
            struct reg_info *info = &ctx->registers[reg];
            if (info->visited)
               continue;
            if (info->num_conflicts < min_conflicts) {
               best_reg = reg;
               min_conflicts = info->num_conflicts;
            }
         }
         gpir_debug("optimistic triggered\n");
         ctx->registers[best_reg].visited = true;
         push_stack(ctx, best_reg);
      } else {
         break;
      }
   }

   /* Step 4: pop off the stack and assign colors */
   for (int i = ctx->comp->cur_reg - 1; i >= 0; i--) {
      unsigned idx = ctx->stack[i];
      struct reg_info *reg = &ctx->registers[idx];

      bool found = false;
      unsigned start = i % GPIR_PHYSICAL_REG_NUM;
      for (unsigned j = 0; j < GPIR_PHYSICAL_REG_NUM; j++) {
         unsigned candidate = (j + start) % GPIR_PHYSICAL_REG_NUM;
         bool available = true;
         util_dynarray_foreach(&reg->conflict_list, unsigned, conflict_idx) {
            struct reg_info *conflict = &ctx->registers[*conflict_idx];
            if (conflict->assigned_color >= 0 &&
                conflict->assigned_color == (int) candidate) {
               available = false;
               break;
            }
         }

         if (available) {
            reg->assigned_color = candidate;
            found = true;
            break;
         }
      }

      /* TODO: spilling */
      if (!found) {
         gpir_error("Failed to allocate registers\n");
         return false;
      }
   }

   return true;
}

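/* Rewrite each load_reg/store_reg to use its assigned color. A color is a
 * single physical register component: color / 4 selects the register and
 * color % 4 selects the component within it. Also record which physical
 * components are live out of each block (live_out_phys).
 */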
static void assign_regs(struct regalloc_ctx *ctx)
{
   list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
      list_for_each_entry(gpir_node, node, &block->node_list, list) {
         if (node->op == gpir_op_load_reg) {
            gpir_load_node *load = gpir_node_to_load(node);
            unsigned color = ctx->registers[load->reg->index].assigned_color;
            load->index = color / 4;
            load->component = color % 4;
         }

         if (node->op == gpir_op_store_reg) {
            gpir_store_node *store = gpir_node_to_store(node);
            unsigned color = ctx->registers[store->reg->index].assigned_color;
            store->index = color / 4;
            store->component = color % 4;
            node->value_reg = color;
         }
      }

      block->live_out_phys = 0;

      int reg_idx;
      BITSET_FOREACH_SET(reg_idx, block->live_out, ctx->comp->cur_reg) {
         if (BITSET_TEST(block->def_out, reg_idx)) {
            block->live_out_phys |=
               (1ull << ctx->registers[reg_idx].assigned_color);
         }
      }
   }
}

/* Value register allocation */
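/* This pass runs per-block, in reverse, after physical registers have been
 * assigned. Values are assigned slots out of the
 * GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM available "value registers" as
 * their uses are encountered, and fake write-after-read dependencies are
 * added between each read of a slot and the next node that overwrites it.
 * Those dependencies are the real output of the pass: they constrain the
 * post-RA scheduler enough that it never needs to spill to temporaries.
 */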

/* Define a special token for when the register is occupied by a preallocated
 * physical register (i.e. load_reg/store_reg). Normally entries in the "live"
 * array point to the definition of the value, but there may be multiple
 * definitions in this case, and they will certainly come from other basic
 * blocks, so it doesn't make sense to do that here.
 */
static gpir_node __physreg_live;
#define PHYSREG_LIVE (&__physreg_live)

struct value_regalloc_ctx {
   gpir_node *last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM];
   gpir_node *complex1_last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM];
   gpir_node *live[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM];
   gpir_node *last_complex1;
   unsigned alloc_start;
};

static unsigned find_free_value_reg(struct value_regalloc_ctx *ctx)
{
   /* Implement round-robin allocation */
   unsigned reg_offset = ctx->alloc_start++;
   if (ctx->alloc_start == GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM)
      ctx->alloc_start = 0;

   unsigned reg = UINT_MAX;
   for (unsigned reg_base = 0;
        reg_base < GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM;
        reg_base++) {
      unsigned cur_reg = (reg_base + reg_offset) %
         (GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM);
      if (!ctx->live[cur_reg]) {
         reg = cur_reg;
         break;
      }
   }

   return reg;
}

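/* Add the fake dependencies that order this read of "src" before the next
 * node (in program order) that overwrites src's value register. Because the
 * block is walked in reverse, last_written[] holds exactly that next writer.
 */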
static void add_fake_dep(gpir_node *node, gpir_node *src,
                         struct value_regalloc_ctx *ctx)
{
   assert(src->value_reg >= 0);
   if (ctx->last_written[src->value_reg] &&
       ctx->last_written[src->value_reg] != node) {
      gpir_node_add_dep(ctx->last_written[src->value_reg], node,
                        GPIR_DEP_WRITE_AFTER_READ);
   }

   /* For a sequence of schedule_first nodes right before a complex1
    * node, add any extra fake dependencies necessary so that the
    * schedule_first nodes can be scheduled right after the complex1 is
    * scheduled. We have to save the last_written before complex1 happens to
    * avoid adding dependencies to children of the complex1 node which would
    * create a cycle.
    */
   if (gpir_op_infos[node->op].schedule_first &&
       ctx->last_complex1 &&
       ctx->complex1_last_written[src->value_reg]) {
      gpir_node_add_dep(ctx->complex1_last_written[src->value_reg],
                        ctx->last_complex1,
                        GPIR_DEP_WRITE_AFTER_READ);
   }
}

static bool handle_value_read(gpir_node *node, gpir_node *src,
                              struct value_regalloc_ctx *ctx)
{
   /* If the source doesn't have a value register yet, allocate one */
   if (src->value_reg < 0) {
      unsigned reg = find_free_value_reg(ctx);
      if (reg == UINT_MAX)
         return false;

      src->value_reg = reg;
      ctx->live[reg] = src;
   }

   /* Add any fake dependencies. Note: this is the actual result of value
    * register allocation. We throw away node->value_reg afterwards, since
    * it's really the fake dependencies which constrain the post-RA scheduler
    * enough to make sure it never needs to spill to temporaries.
    */
   add_fake_dep(node, src, ctx);

   return true;
}

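/* A load_reg pins the value register slot that corresponds to its physical
 * register component. If some other value currently occupies that slot, it
 * is moved to a free slot, which implicitly splits its live range.
 */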
static bool handle_reg_read(gpir_load_node *load,
                            struct value_regalloc_ctx *ctx)
{
   unsigned idx = load->index * 4 + load->component;
   if (!ctx->live[idx]) {
      ctx->live[idx] = PHYSREG_LIVE;
   } else if (ctx->live[idx] != PHYSREG_LIVE) {
      /* This slot is occupied by some other value register, so we need to
       * evict it. This effectively splits the live range of the value
       * register. NB: since we create fake dependencies on the fly, and the
       * fake dependencies are the only output of this pass, we don't actually
       * have to record where the split happened or that this value was
       * assigned to two different registers. Any actual live range splitting
       * happens in the post-RA scheduler, which moves the value to and from
       * the register file. This will just cause some reads of the value
       * register to have different fake dependencies.
       */
      unsigned new_reg = find_free_value_reg(ctx);
      if (new_reg == UINT_MAX)
         return false;
      ctx->live[new_reg] = ctx->live[idx];
      ctx->live[new_reg]->value_reg = new_reg;
      ctx->live[idx] = PHYSREG_LIVE;
   }

   if (ctx->last_written[idx]) {
      gpir_node_add_dep(ctx->last_written[idx], &load->node,
                        GPIR_DEP_WRITE_AFTER_READ);
   }

   return true;
}

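/* A store_reg writes directly to a physical register slot. Walking
 * backwards, the slot no longer holds a live value above the store, and the
 * store is remembered as the slot's most recent writer so that earlier reads
 * of the slot get a write-after-read dependency on it.
 */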
static void handle_reg_write(gpir_store_node *store,
                             struct value_regalloc_ctx *ctx)
{
   unsigned idx = store->index * 4 + store->component;
   store->node.value_reg = idx;
   ctx->last_written[idx] = &store->node;
   ctx->live[idx] = NULL;
}

static void handle_value_write(gpir_node *node,
                               struct value_regalloc_ctx *ctx)
{
   /* TODO: why does an uninitialized node->value_reg
    * sometimes end up here? */
   if (node->value_reg < 0)
      return;

   ctx->last_written[node->value_reg] = node;
   ctx->live[node->value_reg] = NULL;
}

static bool regalloc_value_regs(gpir_block *block)
{
   struct value_regalloc_ctx ctx = { { 0 } };

   list_for_each_entry(gpir_node, node, &block->node_list, list) {
      node->value_reg = -1;
   }

   list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
      if (node->op == gpir_op_complex1) {
         ctx.last_complex1 = node;
         memcpy(ctx.complex1_last_written, ctx.last_written,
                sizeof(ctx.complex1_last_written));
      }

      if (node->type != gpir_node_type_store &&
          node->type != gpir_node_type_branch) {
         handle_value_write(node, &ctx);
      } else if (node->op == gpir_op_store_reg) {
         handle_reg_write(gpir_node_to_store(node), &ctx);
      }

      if (node->type == gpir_node_type_store) {
         gpir_store_node *store = gpir_node_to_store(node);
         if (!handle_value_read(&store->node, store->child, &ctx))
            return false;
      } else if (node->type == gpir_node_type_alu) {
         gpir_alu_node *alu = gpir_node_to_alu(node);
         for (int i = 0; i < alu->num_child; i++) {
            if (!handle_value_read(&alu->node, alu->children[i], &ctx))
               return false;
         }
      } else if (node->type == gpir_node_type_branch) {
         /* At the end of a block the top 11 values are always free, so
          * branches should always succeed.
          */
         gpir_branch_node *branch = gpir_node_to_branch(node);
         ASSERTED bool result = handle_value_read(&branch->node,
                                                  branch->cond, &ctx);
         assert(result);
      } else if (node->op == gpir_op_load_reg) {
         gpir_load_node *load = gpir_node_to_load(node);
         if (!handle_reg_read(load, &ctx))
            return false;
      }
   }

   return true;
}

static void regalloc_print_result(gpir_compiler *comp)
{
   if (!(lima_debug & LIMA_DEBUG_GP))
      return;

   int index = 0;
   printf("======== regalloc ========\n");
   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      list_for_each_entry(gpir_node, node, &block->node_list, list) {
         printf("%03d: %d/%d %s ", index++, node->index, node->value_reg,
                gpir_op_infos[node->op].name);
         gpir_node_foreach_pred(node, dep) {
            gpir_node *pred = dep->pred;
            printf(" %d/%d", pred->index, pred->value_reg);
         }
         if (node->op == gpir_op_load_reg) {
            gpir_load_node *load = gpir_node_to_load(node);
            printf(" -/%d", 4 * load->index + load->component);
            printf(" (%d)", load->reg->index);
         } else if (node->op == gpir_op_store_reg) {
            gpir_store_node *store = gpir_node_to_store(node);
            printf(" (%d)", store->reg->index);
         }
         printf("\n");
      }
      printf("----------------------------\n");
   }
}

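/* Top-level entry point: compute liveness and interference, color the
 * cross-block registers, rewrite load_reg/store_reg with the chosen
 * register/component, and then run value register allocation on each block
 * to add the fake dependencies needed by the post-RA scheduler.
 */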
bool gpir_regalloc_prog(gpir_compiler *comp)
{
   struct regalloc_ctx ctx;

   ctx.mem_ctx = ralloc_context(NULL);
   ctx.bitset_words = BITSET_WORDS(comp->cur_reg);
   ctx.live = ralloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
   ctx.worklist = ralloc_array(ctx.mem_ctx, unsigned, comp->cur_reg);
   ctx.stack = ralloc_array(ctx.mem_ctx, unsigned, comp->cur_reg);
   ctx.comp = comp;

   ctx.registers = rzalloc_array(ctx.mem_ctx, struct reg_info, comp->cur_reg);
   for (int i = 0; i < comp->cur_reg; i++) {
      ctx.registers[i].conflicts = rzalloc_array(ctx.mem_ctx, BITSET_WORD,
                                                 ctx.bitset_words);
      util_dynarray_init(&ctx.registers[i].conflict_list, ctx.mem_ctx);
   }

   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      block->live_out = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
      block->live_in = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
      block->def_out = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
   }

   calc_liveness(&ctx);
   calc_interference(&ctx);
   if (!do_regalloc(&ctx)) {
      ralloc_free(ctx.mem_ctx);
      return false;
   }
   assign_regs(&ctx);

   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      if (!regalloc_value_regs(block)) {
         ralloc_free(ctx.mem_ctx);
         return false;
      }
   }

   regalloc_print_result(comp);
   ralloc_free(ctx.mem_ctx);
   return true;
}