/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/ralloc.h"
#include "util/register_allocate.h"
#include "common/v3d_device_info.h"
#include "v3d_compiler.h"

#define ACC_INDEX     0
#define ACC_COUNT     6
#define PHYS_INDEX    (ACC_INDEX + ACC_COUNT)
#define PHYS_COUNT    64

#define CLASS_BITS_PHYS   (1 << 0)
#define CLASS_BITS_ACC    (1 << 1)
#define CLASS_BITS_R5     (1 << 4)
#define CLASS_BITS_ANY    (CLASS_BITS_PHYS | \
                           CLASS_BITS_ACC | \
                           CLASS_BITS_R5)

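/* Interference graph node numbering: the first ACC_COUNT nodes are fixed to
 * the accumulators (r0-r5), and temp 't' maps to node 't + ACC_COUNT'.
 */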
static inline uint32_t
temp_to_node(uint32_t temp)
{
        return temp + ACC_COUNT;
}

static inline uint32_t
node_to_temp(uint32_t node)
{
        assert(node >= ACC_COUNT);
        return node - ACC_COUNT;
}

static inline uint8_t
get_temp_class_bits(struct v3d_ra_node_info *nodes,
                    uint32_t temp)
{
        return nodes->info[temp_to_node(temp)].class_bits;
}

static inline void
set_temp_class_bits(struct v3d_ra_node_info *nodes,
                    uint32_t temp, uint8_t class_bits)
{
        nodes->info[temp_to_node(temp)].class_bits = class_bits;
}

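/* Maps a set of CLASS_BITS_* flags to the matching pre-built register class
 * for the current thread count.
 */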
static struct ra_class *
choose_reg_class(struct v3d_compile *c, uint8_t class_bits)
{
        if (class_bits == CLASS_BITS_PHYS) {
                return c->compiler->reg_class_phys[c->thread_index];
        } else if (class_bits == (CLASS_BITS_R5)) {
                return c->compiler->reg_class_r5[c->thread_index];
        } else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) {
                return c->compiler->reg_class_phys_or_acc[c->thread_index];
        } else {
                assert(class_bits == CLASS_BITS_ANY);
                return c->compiler->reg_class_any[c->thread_index];
        }
}

static inline struct ra_class *
choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp)
{
        assert(temp < c->num_temps && temp < c->nodes.alloc_count);
        return choose_reg_class(c, get_temp_class_bits(&c->nodes, temp));
}

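/* Returns true if the instruction writes to the TMU, either through a TMU
 * magic register write or through the wrtmuc signal.
 */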
static inline bool
qinst_writes_tmu(const struct v3d_device_info *devinfo,
                 struct qinst *inst)
{
        return (inst->dst.file == QFILE_MAGIC &&
                v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index)) ||
                inst->qpu.sig.wrtmuc;
}

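/* Returns true if 'inst' is the last ldtmu/tmuwt of its TMU sequence, that
 * is, no other ldtmu or tmuwt follows it before the next TMU write starts.
 */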
static bool
is_end_of_tmu_sequence(const struct v3d_device_info *devinfo,
                       struct qinst *inst, struct qblock *block)
{
        /* Only tmuwt and ldtmu can finish TMU sequences */
        bool is_tmuwt = inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                        inst->qpu.alu.add.op == V3D_QPU_A_TMUWT;
        bool is_ldtmu = inst->qpu.sig.ldtmu;
        if (!is_tmuwt && !is_ldtmu)
                return false;

        /* Check if this is the last tmuwt or ldtmu in the sequence */
        list_for_each_entry_from(struct qinst, scan_inst, inst->link.next,
                                 &block->instructions, link) {
                is_tmuwt = scan_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                           scan_inst->qpu.alu.add.op == V3D_QPU_A_TMUWT;
                is_ldtmu = scan_inst->qpu.sig.ldtmu;

                if (is_tmuwt || is_ldtmu)
                        return false;

                if (qinst_writes_tmu(devinfo, scan_inst))
                        return true;
        }

        return true;
}

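/* Returns true if the temp is defined by a plain ldunif, in which case
 * "spilling" it just means re-emitting the uniform load at each use.
 */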
static bool
vir_is_mov_uniform(struct v3d_compile *c, int temp)
{
        struct qinst *def = c->defs[temp];

        return def && def->qpu.sig.ldunif;
}

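/* Returns true if the temp's defining instruction can simply be re-emitted at
 * each use instead of spilling its result to memory: these opcodes read no
 * sources, so their value can be reconstructed wherever it is needed.
 */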
static bool
can_reconstruct_inst(struct qinst *inst)
{
        assert(inst);

        if (vir_is_add(inst)) {
                switch (inst->qpu.alu.add.op) {
                case V3D_QPU_A_FXCD:
                case V3D_QPU_A_FYCD:
                case V3D_QPU_A_XCD:
                case V3D_QPU_A_YCD:
                case V3D_QPU_A_IID:
                case V3D_QPU_A_EIDX:
                case V3D_QPU_A_TIDX:
                case V3D_QPU_A_SAMPID:
                        /* No need to check input unpacks because none of these
                         * opcodes read sources. FXCD, FYCD have pack variants.
                         */
                        return inst->qpu.flags.ac == V3D_QPU_COND_NONE &&
                               inst->qpu.flags.auf == V3D_QPU_UF_NONE &&
                               inst->qpu.flags.apf == V3D_QPU_PF_NONE &&
                               inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE;
                default:
                        return false;
                }
        }

        return false;
}

static bool
can_reconstruct_temp(struct v3d_compile *c, int temp)
{
        struct qinst *def = c->defs[temp];
        return def && can_reconstruct_inst(def);
}

static struct qreg
reconstruct_temp(struct v3d_compile *c, enum v3d_qpu_add_op op)
{
        struct qreg dest;
        switch (op) {
        case V3D_QPU_A_FXCD:
                dest = vir_FXCD(c);
                break;
        case V3D_QPU_A_FYCD:
                dest = vir_FYCD(c);
                break;
        case V3D_QPU_A_XCD:
                dest = vir_XCD(c);
                break;
        case V3D_QPU_A_YCD:
                dest = vir_YCD(c);
                break;
        case V3D_QPU_A_IID:
                dest = vir_IID(c);
                break;
        case V3D_QPU_A_EIDX:
                dest = vir_EIDX(c);
                break;
        case V3D_QPU_A_TIDX:
                dest = vir_TIDX(c);
                break;
        case V3D_QPU_A_SAMPID:
                dest = vir_SAMPID(c);
                break;
        default:
                unreachable("Unexpected opcode for reconstruction");
        }

        return dest;
}

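/* How a temp can be spilled: by re-emitting its uniform load, by
 * reconstructing (recomputing) its value, or by storing it to scratch memory
 * through the TMU.
 */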
enum temp_spill_type {
        SPILL_TYPE_UNIFORM,
        SPILL_TYPE_RECONSTRUCT,
        SPILL_TYPE_TMU
};

static enum temp_spill_type
get_spill_type_for_temp(struct v3d_compile *c, int temp)
{
        if (vir_is_mov_uniform(c, temp))
                return SPILL_TYPE_UNIFORM;

        if (can_reconstruct_temp(c, temp))
                return SPILL_TYPE_RECONSTRUCT;

        return SPILL_TYPE_TMU;
}

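/* Assigns a spill cost to every spillable temp based on how often it is used
 * and how expensive it would be to spill (TMU spills are penalized, and
 * disallowed entirely once we can no longer add thread switches), then asks
 * the RA core for the best candidate. Returns -1 if no candidate is found.
 */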
static int
v3d_choose_spill_node(struct v3d_compile *c)
{
        const float tmu_scale = 10;
        float block_scale = 1.0;
        float spill_costs[c->num_temps];
        bool in_tmu_operation = false;
        bool started_last_seg = false;

        for (unsigned i = 0; i < c->num_temps; i++)
                spill_costs[i] = 0.0;

        /* XXX: Scale the cost up when inside of a loop. */
        vir_for_each_block(block, c) {
                vir_for_each_inst(inst, block) {
                        /* We can't insert new thread switches after
                         * starting output writes.
                         */
                        bool no_spilling =
                                (c->threads > 1 && started_last_seg) ||
                                (c->max_tmu_spills == 0);

                        /* Discourage spilling of TMU operations */
                        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                                if (inst->src[i].file != QFILE_TEMP)
                                        continue;

                                int temp = inst->src[i].index;
                                enum temp_spill_type spill_type =
                                        get_spill_type_for_temp(c, temp);

                                if (spill_type != SPILL_TYPE_TMU) {
                                        spill_costs[temp] += block_scale;
                                } else if (!no_spilling) {
                                        float tmu_op_scale = in_tmu_operation ?
                                                3.0 : 1.0;
                                        spill_costs[temp] += (block_scale *
                                                              tmu_scale *
                                                              tmu_op_scale);
                                } else {
                                        BITSET_CLEAR(c->spillable, temp);
                                }
                        }

                        if (inst->dst.file == QFILE_TEMP) {
                                int temp = inst->dst.index;
                                enum temp_spill_type spill_type =
                                        get_spill_type_for_temp(c, temp);

                                if (spill_type != SPILL_TYPE_TMU) {
                                        /* We just rematerialize it later */
                                } else if (!no_spilling) {
                                        spill_costs[temp] += (block_scale *
                                                              tmu_scale);
                                } else {
                                        BITSET_CLEAR(c->spillable, temp);
                                }
                        }

                        /* Refuse to spill a ldvary's dst, because that means
                         * that ldvary's r5 would end up being used across a
                         * thrsw.
                         */
                        if (inst->qpu.sig.ldvary) {
                                assert(inst->dst.file == QFILE_TEMP);
                                BITSET_CLEAR(c->spillable, inst->dst.index);
                        }

                        if (inst->is_last_thrsw)
                                started_last_seg = true;

                        if (v3d_qpu_writes_vpm(&inst->qpu) ||
                            v3d_qpu_uses_tlb(&inst->qpu))
                                started_last_seg = true;

                        /* Track when we're in between a TMU setup and the
                         * final LDTMU or TMUWT from that TMU setup.  We
                         * penalize spills during that time.
                         */
                        if (is_end_of_tmu_sequence(c->devinfo, inst, block))
                                in_tmu_operation = false;

                        if (qinst_writes_tmu(c->devinfo, inst))
                                in_tmu_operation = true;
                }
        }

        for (unsigned i = 0; i < c->num_temps; i++) {
                if (BITSET_TEST(c->spillable, i)) {
                        ra_set_node_spill_cost(c->g, temp_to_node(i),
                                               spill_costs[i]);
                }
        }

        return ra_get_best_spill_node(c->g);
}

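/* Grows the node info array if spilling has created more temps than we have
 * room for.
 */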
static void
ensure_nodes(struct v3d_compile *c)
{
        if (c->num_temps < c->nodes.alloc_count)
                return;

        c->nodes.alloc_count *= 2;
        c->nodes.info = reralloc_array_size(c,
                                            c->nodes.info,
                                            sizeof(c->nodes.info[0]),
                                            c->nodes.alloc_count + ACC_COUNT);
}

/* Creates the interference node for a new temp. We use this to keep the node
 * list updated during the spilling process, which generates new temps/nodes.
 */
static void
add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
{
        ensure_nodes(c);

        int node = ra_add_node(c->g, choose_reg_class(c, class_bits));
        assert(node == temp + ACC_COUNT);

        /* We fill the node priority after we are done inserting spills */
        c->nodes.info[node].class_bits = class_bits;
        c->nodes.info[node].priority = 0;
}

/* The spill offset for this thread takes a bit of setup, so do it once at
 * program start.
 */
void
v3d_setup_spill_base(struct v3d_compile *c)
{
        /* Setting up the spill base is done in the entry block, so change
         * both the current block to emit and the cursor.
         */
        struct qblock *current_block = c->cur_block;
        c->cur_block = vir_entry_block(c);
        c->cursor = vir_before_block(c->cur_block);

        int start_num_temps = c->num_temps;

        /* Each thread wants to be in a separate region of the scratch space
         * so that the QPUs aren't fighting over cache lines.  We have the
         * driver keep a single global spill BO rather than
         * per-spilling-program BOs, so we need a uniform from the driver for
         * what the per-thread scale is.
         */
        struct qreg thread_offset =
                vir_UMUL(c,
                         vir_TIDX(c),
                         vir_uniform(c, QUNIFORM_SPILL_SIZE_PER_THREAD, 0));

        /* Each channel in a reg is 4 bytes, so scale them up by that. */
        struct qreg element_offset = vir_SHL(c, vir_EIDX(c),
                                             vir_uniform_ui(c, 2));

        c->spill_base = vir_ADD(c,
                                vir_ADD(c, thread_offset, element_offset),
                                vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));

        /* Make sure that we don't spill the spilling setup instructions. */
        for (int i = start_num_temps; i < c->num_temps; i++) {
                BITSET_CLEAR(c->spillable, i);

                /* If we are spilling, update the RA map with the temps added
                 * by the spill setup. Our spill_base register can never be an
                 * accumulator because it is used for TMU spill/fill and thus
                 * needs to persist across thread switches.
                 */
                if (c->spilling) {
                        int temp_class = CLASS_BITS_PHYS;
                        if (i != c->spill_base.index)
                                temp_class |= CLASS_BITS_ACC;
                        add_node(c, i, temp_class);
                }
        }

        /* Restore the current block. */
        c->cur_block = current_block;
        c->cursor = vir_after_block(c->cur_block);
}

/**
 * Computes the address for a spill/fill sequence and completes the spill/fill
 * sequence by emitting the following code:
 *
 * ldunif.spill_offset
 * add tmua spill_base spill_offset
 * thrsw
 *
 * If the sequence is for a spill, then it will emit a tmuwt after the thrsw,
 * otherwise it will emit an ldtmu to load the fill result into 'fill_dst'.
 *
 * The parameter 'ip' represents the ip at which the spill/fill is happening.
 * This is used to disallow accumulators on temps that cross this ip boundary
 * due to the new thrsw introduced in the sequence above.
 */
static void
v3d_emit_spill_tmua(struct v3d_compile *c,
                    uint32_t spill_offset,
                    enum v3d_qpu_cond cond,
                    int32_t ip,
                    struct qreg *fill_dst)
{
        assert(ip >= 0);

        /* Load a uniform with the spill offset and add it to the spill base
         * to obtain the TMUA address. It can be of class ANY because we know
         * we are consuming it immediately without thrsw in between.
         */
        assert(c->disable_ldunif_opt);
        struct qreg offset = vir_uniform_ui(c, spill_offset);
        add_node(c, offset.index, CLASS_BITS_ANY);

        /* We always enable per-quad on spills/fills to ensure we spill
         * any channels involved with helper invocations.
         */
        struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
        struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset);
        inst->qpu.flags.ac = cond;
        inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
                                              0xffffff7f); /* per-quad */

        vir_emit_thrsw(c);

        /* If this is for a spill, emit a TMUWT, otherwise an LDTMU to load
         * the result of the fill. The TMUWT temp is not really read, and the
         * ldtmu temp will be used immediately, so just like the uniform above
         * we can allow accumulators.
         */
        if (!fill_dst) {
                struct qreg dst = vir_TMUWT(c);
                assert(dst.file == QFILE_TEMP);
                add_node(c, dst.index, CLASS_BITS_PHYS | CLASS_BITS_ACC);
        } else {
                *fill_dst = vir_LDTMU(c);
                assert(fill_dst->file == QFILE_TEMP);
                add_node(c, fill_dst->index, CLASS_BITS_PHYS | CLASS_BITS_ACC);
        }

        /* Temps across the thread switch we injected can't be assigned to
         * accumulators.
         *
         * Fills inject code before ip, so anything that starts at ip or later
         * is not affected by the thrsw. Something that ends at ip will be
         * affected though.
         *
         * Spills inject code after ip, so anything that starts strictly later
         * than ip is not affected (the temp starting at ip is usually the
         * spilled temp except for postponed spills). Something that ends at ip
         * won't be affected either.
         */
        for (int i = 0; i < c->spill_start_num_temps; i++) {
                bool thrsw_cross = fill_dst ?
                        c->temp_start[i] < ip && c->temp_end[i] >= ip :
                        c->temp_start[i] <= ip && c->temp_end[i] > ip;
                if (thrsw_cross) {
                        ra_set_node_class(c->g, temp_to_node(i),
                                          choose_reg_class(c, CLASS_BITS_PHYS));
                }
        }
}

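/* Emits the TMU store that spills the destination of 'inst' to scratch
 * memory: the value is moved to TMUD and then v3d_emit_spill_tmua() writes
 * the address and completes the sequence.
 */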
static void
v3d_emit_tmu_spill(struct v3d_compile *c,
                   struct qinst *inst,
                   struct qreg spill_temp,
                   struct qinst *position,
                   uint32_t ip,
                   uint32_t spill_offset)
{
        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
        assert(inst->dst.file == QFILE_TEMP);

        c->cursor = vir_after_inst(position);

        enum v3d_qpu_cond cond = vir_get_cond(inst);

        /* If inst and position don't match, this is a postponed spill,
         * in which case we have already allocated the temp for the spill
         * and we should use that, otherwise create a new temp with the
         * same register class bits as the original.
         */
        if (inst == position) {
                uint8_t class_bits = get_temp_class_bits(&c->nodes,
                                                         inst->dst.index);
                inst->dst = vir_get_temp(c);
                add_node(c, inst->dst.index, class_bits);
        } else {
                inst->dst = spill_temp;

                /* If this is a postponed spill the register being spilled may
                 * have been written more than once including conditional
                 * writes, so ignore predication on the spill instruction and
                 * always spill the full register.
                 */
                cond = V3D_QPU_COND_NONE;
        }

        struct qinst *tmp =
                vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
                             inst->dst);
        tmp->qpu.flags.mc = cond;

        v3d_emit_spill_tmua(c, spill_offset, cond, ip, NULL);

        c->spills++;
        c->tmu_dirty_rcl = true;
}

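/* Live ranges are half-open ([start, end)), so two temps interfere only when
 * their ranges actually overlap.
 */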
static inline bool
interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end)
{
        return !(t0_start >= t1_end || t1_start >= t0_end);
}

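/* Rewrites the program so that spill_temp no longer needs a register: uniform
 * loads are re-emitted at each use, reconstructible values are recomputed,
 * and anything else is stored to and filled from scratch memory through the
 * TMU. Liveness and the interference graph are then updated for the temps
 * this introduces.
 */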
static void
v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
{
        c->spill_start_num_temps = c->num_temps;
        c->spilling = true;

        enum temp_spill_type spill_type = get_spill_type_for_temp(c, spill_temp);

        uint32_t spill_offset = 0;
        if (spill_type == SPILL_TYPE_TMU) {
                spill_offset = c->spill_size;
                c->spill_size += V3D_CHANNELS * sizeof(uint32_t);

                if (spill_offset == 0)
                        v3d_setup_spill_base(c);
        }

        struct qinst *last_thrsw = c->last_thrsw;
        assert(last_thrsw && last_thrsw->is_last_thrsw);

        int uniform_index = ~0;
        if (spill_type == SPILL_TYPE_UNIFORM) {
                struct qinst *orig_unif = c->defs[spill_temp];
                uniform_index = orig_unif->uniform;
        }

        enum v3d_qpu_add_op reconstruct_op = V3D_QPU_A_NOP;
        if (spill_type == SPILL_TYPE_RECONSTRUCT) {
                struct qinst *orig_def = c->defs[spill_temp];
                assert(vir_is_add(orig_def));
                reconstruct_op = orig_def->qpu.alu.add.op;
        }

        uint32_t spill_node = temp_to_node(spill_temp);

        /* We must disable the ldunif optimization if we are spilling uniforms */
        bool had_disable_ldunif_opt = c->disable_ldunif_opt;
        c->disable_ldunif_opt = true;

        struct qinst *start_of_tmu_sequence = NULL;
        struct qinst *postponed_spill = NULL;
        struct qreg postponed_spill_temp = { 0 };
        vir_for_each_block(block, c) {
                vir_for_each_inst_safe(inst, block) {
                        int32_t ip = inst->ip;

                        /* Track when we're in between a TMU setup and the final
                         * LDTMU or TMUWT from that TMU setup. We can't spill/fill any
                         * temps during that time, because that involves inserting a
                         * new TMU setup/LDTMU sequence, so we postpone the spill or
                         * move the fill up to not intrude in the middle of the TMU
                         * sequence.
                         */
                        if (is_end_of_tmu_sequence(c->devinfo, inst, block)) {
                                if (postponed_spill) {
                                        v3d_emit_tmu_spill(c, postponed_spill,
                                                           postponed_spill_temp,
                                                           inst, ip, spill_offset);
                                }

                                start_of_tmu_sequence = NULL;
                                postponed_spill = NULL;
                        }

                        if (!start_of_tmu_sequence &&
                            qinst_writes_tmu(c->devinfo, inst)) {
                                start_of_tmu_sequence = inst;
                        }

                        /* fills */
                        int filled_src = -1;
                        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                                if (inst->src[i].file != QFILE_TEMP ||
                                    inst->src[i].index != spill_temp) {
                                        continue;
                                }

                                if (filled_src >= 0) {
                                        inst->src[i] = inst->src[filled_src];
                                        continue;
                                }

                                c->cursor = vir_before_inst(inst);

                                if (spill_type == SPILL_TYPE_UNIFORM) {
                                        struct qreg unif =
                                                vir_uniform(c,
                                                            c->uniform_contents[uniform_index],
                                                            c->uniform_data[uniform_index]);
                                        inst->src[i] = unif;
                                        /* We are using the uniform in the
                                         * instruction immediately after, so
                                         * we can use any register class for it.
                                         */
                                        add_node(c, unif.index, CLASS_BITS_ANY);
                                } else if (spill_type == SPILL_TYPE_RECONSTRUCT) {
                                        struct qreg temp =
                                                reconstruct_temp(c, reconstruct_op);
                                        inst->src[i] = temp;
                                        /* We are using the temp in the
                                         * instruction immediately after so we
                                         * can use ACC.
                                         */
                                        add_node(c, temp.index, CLASS_BITS_PHYS |
                                                                CLASS_BITS_ACC);
                                } else {
                                        /* If we have a postponed spill, we
                                         * don't need a fill as the temp would
                                         * not have been spilled yet, however,
                                         * we need to update the temp index.
                                         */
                                        if (postponed_spill) {
                                                inst->src[i] =
                                                        postponed_spill_temp;
                                        } else {
                                                int32_t fill_ip = ip;
                                                if (start_of_tmu_sequence) {
                                                        c->cursor = vir_before_inst(start_of_tmu_sequence);
                                                        fill_ip = start_of_tmu_sequence->ip;
                                                }

                                                v3d_emit_spill_tmua(c, spill_offset,
                                                                    V3D_QPU_COND_NONE,
                                                                    fill_ip, &inst->src[i]);
                                                c->fills++;
                                        }
                                }

                                filled_src = i;
                        }

                        /* spills */
                        if (inst->dst.file == QFILE_TEMP &&
                            inst->dst.index == spill_temp) {
                                if (spill_type != SPILL_TYPE_TMU) {
                                        c->cursor.link = NULL;
                                        vir_remove_instruction(c, inst);
                                } else {
                                        /* If we are in the middle of a TMU
                                         * sequence, we postpone the actual
                                         * spill until we have finished it. We
                                         * still need to replace the spill temp
                                         * with a new temp, though.
                                         */
                                        if (start_of_tmu_sequence) {
                                                if (postponed_spill) {
                                                        postponed_spill->dst =
                                                                postponed_spill_temp;
                                                }
                                                if (!postponed_spill ||
                                                    vir_get_cond(inst) == V3D_QPU_COND_NONE) {
                                                        postponed_spill_temp =
                                                                vir_get_temp(c);
                                                        add_node(c,
                                                                 postponed_spill_temp.index,
                                                                 c->nodes.info[spill_node].class_bits);
                                                }
                                                postponed_spill = inst;
                                        } else {
                                                v3d_emit_tmu_spill(c, inst,
                                                                   postponed_spill_temp,
                                                                   inst, ip,
                                                                   spill_offset);
                                        }
                                }
                        }
                }
        }

        /* Make sure c->last_thrsw is the actual last thrsw, not just one we
         * inserted in our most recent unspill.
         */
        c->last_thrsw = last_thrsw;

        /* Don't allow spilling of our spilling instructions.  There's no way
         * they can help get things colored.
         */
        for (int i = c->spill_start_num_temps; i < c->num_temps; i++)
                BITSET_CLEAR(c->spillable, i);

        /* Reset interference for spilled node */
        ra_set_node_spill_cost(c->g, spill_node, 0);
        ra_reset_node_interference(c->g, spill_node);
        BITSET_CLEAR(c->spillable, spill_temp);

        /* Rebuild program ips */
        int32_t ip = 0;
        vir_for_each_inst_inorder(inst, c)
                inst->ip = ip++;

        /* Rebuild liveness */
        vir_calculate_live_intervals(c);

        /* Add interferences for the new spilled temps and update interferences
         * for c->spill_base (since we may have modified its liveness). Also,
         * update node priorities based on the new liveness data.
         */
        uint32_t sb_temp = c->spill_base.index;
        uint32_t sb_node = temp_to_node(sb_temp);
        for (uint32_t i = 0; i < c->num_temps; i++) {
                if (c->temp_end[i] == -1)
                        continue;

                uint32_t node_i = temp_to_node(i);
                c->nodes.info[node_i].priority =
                        c->temp_end[i] - c->temp_start[i];

                for (uint32_t j = MAX2(i + 1, c->spill_start_num_temps);
                     j < c->num_temps; j++) {
                        if (interferes(c->temp_start[i], c->temp_end[i],
                                       c->temp_start[j], c->temp_end[j])) {
                                uint32_t node_j = temp_to_node(j);
                                ra_add_node_interference(c->g, node_i, node_j);
                        }
                }

                if (spill_type == SPILL_TYPE_TMU) {
                        if (i != sb_temp &&
                            interferes(c->temp_start[i], c->temp_end[i],
                                       c->temp_start[sb_temp], c->temp_end[sb_temp])) {
                                ra_add_node_interference(c->g, node_i, sb_node);
                        }
                }
        }

        c->disable_ldunif_opt = had_disable_ldunif_opt;
        c->spilling = false;
}

struct v3d_ra_select_callback_data {
        uint32_t next_acc;
        uint32_t next_phys;
        struct v3d_ra_node_info *nodes;
};

/* Choosing accumulators improves chances of merging QPU instructions
 * due to these merges requiring that at most 2 rf registers are used
 * by the add and mul instructions.
 */
static bool
v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
                   BITSET_WORD *regs,
                   int priority)
{
        /* Favor accumulators if we have fewer than this number of physical
         * registers. Accumulators have more restrictions (like being
         * invalidated through thrsw), so running out of physical registers
         * even if we have accumulators available can lead to register
         * allocation failures.
         */
        static const int available_rf_threshold = 5;
        int available_rf = 0;
        for (int i = 0; i < PHYS_COUNT; i++) {
                if (BITSET_TEST(regs, PHYS_INDEX + i))
                        available_rf++;
                if (available_rf >= available_rf_threshold)
                        break;
        }
        if (available_rf < available_rf_threshold)
                return true;

        /* Favor accumulators for short-lived temps (our priority represents
         * liveness), to prevent long-lived temps from grabbing accumulators
         * and preventing follow-up instructions from using them, potentially
         * leading to large portions of the shader being unable to use
         * accumulators and therefore merge instructions successfully.
         */
        static const int priority_threshold = 20;
        if (priority <= priority_threshold)
                return true;

        return false;
}

static bool
v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
                    BITSET_WORD *regs,
                    unsigned int *out)
{
        /* Choose r5 for our ldunifs if possible (nobody else can load to that
         * reg, and it keeps the QPU cond field free from being occupied by
         * ldunifrf).
         */
        int r5 = ACC_INDEX + 5;
        if (BITSET_TEST(regs, r5)) {
                *out = r5;
                return true;
        }

        /* Round-robin through our accumulators to give post-RA instruction
         * selection more options.
         */
        for (int i = 0; i < ACC_COUNT; i++) {
                int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT;
                int acc = ACC_INDEX + acc_off;

                if (BITSET_TEST(regs, acc)) {
                        v3d_ra->next_acc = acc_off + 1;
                        *out = acc;
                        return true;
                }
        }

        return false;
}

static bool
v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
                 BITSET_WORD *regs,
                 unsigned int *out)
{
        for (int i = 0; i < PHYS_COUNT; i++) {
                int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
                int phys = PHYS_INDEX + phys_off;

                if (BITSET_TEST(regs, phys)) {
                        v3d_ra->next_phys = phys_off + 1;
                        *out = phys;
                        return true;
                }
        }

        return false;
}

static unsigned int
v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
{
        struct v3d_ra_select_callback_data *v3d_ra = data;

        unsigned int reg;
        if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->nodes->info[n].priority) &&
            v3d_ra_select_accum(v3d_ra, regs, &reg)) {
                return reg;
        }

        if (v3d_ra_select_rf(v3d_ra, regs, &reg))
                return reg;

        /* If we ran out of physical registers try to assign an accumulator
         * if we didn't favor that option earlier.
         */
        if (v3d_ra_select_accum(v3d_ra, regs, &reg))
                return reg;

        unreachable("RA must pass us at least one possible reg.");
}

bool
vir_init_reg_sets(struct v3d_compiler *compiler)
{
        /* Allocate up to 3 regfile classes, for the ways the physical
         * register file can be divided up for fragment shader threading.
         */
        int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);

        compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
                                          false);
        if (!compiler->regs)
                return false;

        for (int threads = 0; threads < max_thread_index; threads++) {
                compiler->reg_class_any[threads] =
                        ra_alloc_contig_reg_class(compiler->regs, 1);
                compiler->reg_class_r5[threads] =
                        ra_alloc_contig_reg_class(compiler->regs, 1);
                compiler->reg_class_phys_or_acc[threads] =
                        ra_alloc_contig_reg_class(compiler->regs, 1);
                compiler->reg_class_phys[threads] =
                        ra_alloc_contig_reg_class(compiler->regs, 1);

                for (int i = PHYS_INDEX;
                     i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
                        ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
                        ra_class_add_reg(compiler->reg_class_phys[threads], i);
                        ra_class_add_reg(compiler->reg_class_any[threads], i);
                }

                for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
                        ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
                        ra_class_add_reg(compiler->reg_class_any[threads], i);
                }
                /* r5 can only store a single 32-bit value, so not much can
                 * use it.
                 */
                ra_class_add_reg(compiler->reg_class_r5[threads],
                                 ACC_INDEX + 5);
                ra_class_add_reg(compiler->reg_class_any[threads],
                                 ACC_INDEX + 5);
        }

        ra_set_finalize(compiler->regs, NULL);

        return true;
}

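/* Returns true while we are still under the compile's TMU spill budget. */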
static inline bool
tmu_spilling_allowed(struct v3d_compile *c)
{
        return c->spills + c->fills < c->max_tmu_spills;
}

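/* Adds the implicit interferences and register class restrictions that a
 * single instruction imposes: implied r3/r4 writes, opcodes that can only
 * write to the physical register file, payload registers that must land in
 * specific rf registers, the r5-only rules for ldunif, and the loss of
 * accumulators across thread switches.
 */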
static void
update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
                                      struct qinst *inst)
{
        int32_t ip = inst->ip;
        assert(ip >= 0);

        /* If the instruction writes r3/r4 (and optionally moves its
         * result to a temp), nothing else can be stored in r3/r4 across
         * it.
         */
        if (vir_writes_r3(c->devinfo, inst)) {
                for (int i = 0; i < c->num_temps; i++) {
                        if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
                                ra_add_node_interference(c->g,
                                                         temp_to_node(i),
                                                         acc_nodes[3]);
                        }
                }
        }

        if (vir_writes_r4(c->devinfo, inst)) {
                for (int i = 0; i < c->num_temps; i++) {
                        if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
                                ra_add_node_interference(c->g,
                                                         temp_to_node(i),
                                                         acc_nodes[4]);
                        }
                }
        }

        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
                switch (inst->qpu.alu.add.op) {
                case V3D_QPU_A_LDVPMV_IN:
                case V3D_QPU_A_LDVPMV_OUT:
                case V3D_QPU_A_LDVPMD_IN:
                case V3D_QPU_A_LDVPMD_OUT:
                case V3D_QPU_A_LDVPMP:
                case V3D_QPU_A_LDVPMG_IN:
                case V3D_QPU_A_LDVPMG_OUT: {
                        /* LDVPMs only store to temps (the MA flag
                         * decides whether the LDVPM is in or out)
                         */
                        assert(inst->dst.file == QFILE_TEMP);
                        set_temp_class_bits(&c->nodes, inst->dst.index,
                                            CLASS_BITS_PHYS);
                        break;
                }

                case V3D_QPU_A_RECIP:
                case V3D_QPU_A_RSQRT:
                case V3D_QPU_A_EXP:
                case V3D_QPU_A_LOG:
                case V3D_QPU_A_SIN:
                case V3D_QPU_A_RSQRT2: {
                        /* The SFU instructions write directly to the
                         * phys regfile.
                         */
                        assert(inst->dst.file == QFILE_TEMP);
                        set_temp_class_bits(&c->nodes, inst->dst.index,
                                            CLASS_BITS_PHYS);
                        break;
                }

                default:
                        break;
                }
        }

        if (inst->src[0].file == QFILE_REG) {
                switch (inst->src[0].index) {
                case 0:
                case 1:
                case 2:
                case 3: {
                        /* Payload setup instructions: Force allocate
                         * the dst to the given register (so the MOV
                         * will disappear).
                         */
                        assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
                        assert(inst->dst.file == QFILE_TEMP);
                        uint32_t node = temp_to_node(inst->dst.index);
                        ra_set_node_reg(c->g, node,
                                        PHYS_INDEX + inst->src[0].index);
                        break;
                }
                }
        }

        if (inst->dst.file == QFILE_TEMP) {
                /* Only a ldunif gets to write to R5, which only has a
                 * single 32-bit channel of storage.
                 *
                 * NOTE: ldunifa is subject to the same restriction; however,
                 * going by shader-db it is best to keep r5 exclusive to
                 * ldunif, probably because ldunif usually has a shorter
                 * lifespan, allowing for more accumulator reuse and QPU
                 * merges.
                 */
                if (!inst->qpu.sig.ldunif) {
                        uint8_t class_bits =
                                get_temp_class_bits(&c->nodes, inst->dst.index) &
                                ~CLASS_BITS_R5;
                        set_temp_class_bits(&c->nodes, inst->dst.index,
                                            class_bits);

                } else {
                        /* Until V3D 4.x, we could only load a uniform
                         * to r5, so we'll need to spill if uniform
                         * loads interfere with each other.
                         */
                        if (c->devinfo->ver < 40) {
                                set_temp_class_bits(&c->nodes, inst->dst.index,
                                                    CLASS_BITS_R5);
                        }
                }
        }

        /* All accumulators are invalidated across a thread switch. */
        if (inst->qpu.sig.thrsw) {
                for (int i = 0; i < c->num_temps; i++) {
                        if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
                                set_temp_class_bits(&c->nodes, i,
                                                    CLASS_BITS_PHYS);
                        }
                }
        }
}

/**
 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
 *
 * The return value should be freed by the caller.
 */
struct qpu_reg *
v3d_register_allocate(struct v3d_compile *c)
{
        int acc_nodes[ACC_COUNT];
        c->nodes = (struct v3d_ra_node_info) {
                .alloc_count = c->num_temps,
                .info = ralloc_array_size(c, sizeof(c->nodes.info[0]),
                                          c->num_temps + ACC_COUNT),
        };

        struct v3d_ra_select_callback_data callback_data = {
                .next_acc = 0,
                /* Start at RF3, to try to keep the TLB writes from using
                 * RF0-2.
                 */
                .next_phys = 3,
                .nodes = &c->nodes,
        };

        vir_calculate_live_intervals(c);

        /* Convert 1, 2, 4 threads to 0, 1, 2 index.
         *
         * V3D 4.x has double the physical register space, so 64 physical regs
         * are available at both 1x and 2x threading, and 4x has 32.
         */
        c->thread_index = ffs(c->threads) - 1;
        if (c->devinfo->ver >= 40) {
                if (c->thread_index >= 1)
                        c->thread_index--;
        }

        c->g = ra_alloc_interference_graph(c->compiler->regs,
                                           c->num_temps + ARRAY_SIZE(acc_nodes));
        ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data);

        /* Make some fixed nodes for the accumulators, which we will need to
         * interfere with when ops have implied r3/r4 writes or for the thread
         * switches.  We could represent these as classes for the nodes to
         * live in, but the classes take up a lot of memory to set up, so we
         * don't want to make too many.
         */
        for (uint32_t i = 0; i < ACC_COUNT + c->num_temps; i++) {
                if (i < ACC_COUNT) {
                        acc_nodes[i] = i;
                        ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
                        c->nodes.info[i].priority = 0;
                        c->nodes.info[i].class_bits = 0;
                } else {
                        uint32_t t = node_to_temp(i);
                        c->nodes.info[i].priority =
                                c->temp_end[t] - c->temp_start[t];
                        c->nodes.info[i].class_bits = CLASS_BITS_ANY;
                }
        }

        /* Walk the instructions adding register class restrictions and
         * interferences.
         */
        int ip = 0;
        vir_for_each_inst_inorder(inst, c) {
                inst->ip = ip++;
                update_graph_and_reg_classes_for_inst(c, acc_nodes, inst);
        }

        /* Set the register classes for all our temporaries in the graph */
        for (uint32_t i = 0; i < c->num_temps; i++) {
                ra_set_node_class(c->g, temp_to_node(i),
                                  choose_reg_class_for_temp(c, i));
        }

        /* Add register interferences based on liveness data */
        for (uint32_t i = 0; i < c->num_temps; i++) {
                for (uint32_t j = i + 1; j < c->num_temps; j++) {
                        if (interferes(c->temp_start[i], c->temp_end[i],
                                       c->temp_start[j], c->temp_end[j])) {
                                ra_add_node_interference(c->g,
                                                         temp_to_node(i),
                                                         temp_to_node(j));
                        }
                }
        }

        /* Debug option to force a bit of TMU spilling, for running
         * across conformance tests to make sure that spilling works.
         */
        const int force_register_spills = 0;
        if (force_register_spills > 0)
                c->max_tmu_spills = UINT32_MAX;

        struct qpu_reg *temp_registers = NULL;
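        /* Try to allocate, spilling the best candidate and retrying on
         * failure until we either succeed or run out of things we are
         * allowed to spill. The force_register_spills path above keeps
         * spilling until the requested amount of scratch has been used.
         */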
        while (true) {
                if (c->spill_size <
                    V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
                        int node = v3d_choose_spill_node(c);
                        uint32_t temp = node_to_temp(node);
                        if (node != -1) {
                                v3d_spill_reg(c, acc_nodes, temp);
                                continue;
                        }
                }

                if (ra_allocate(c->g))
                        break;

                /* Failed allocation, try to spill */
                int node = v3d_choose_spill_node(c);
                if (node == -1)
                        goto spill_fail;

                uint32_t temp = node_to_temp(node);
                enum temp_spill_type spill_type =
                        get_spill_type_for_temp(c, temp);
                if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) {
                        v3d_spill_reg(c, acc_nodes, temp);
                        if (c->spills + c->fills > c->max_tmu_spills)
                                goto spill_fail;
                } else {
                        goto spill_fail;
                }
        }

        /* Allocation was successful, build the 'temp -> reg' map */
        temp_registers = calloc(c->num_temps, sizeof(*temp_registers));
        for (uint32_t i = 0; i < c->num_temps; i++) {
                int ra_reg = ra_get_node_reg(c->g, temp_to_node(i));
                if (ra_reg < PHYS_INDEX) {
                        temp_registers[i].magic = true;
                        temp_registers[i].index = (V3D_QPU_WADDR_R0 +
                                                   ra_reg - ACC_INDEX);
                } else {
                        temp_registers[i].magic = false;
                        temp_registers[i].index = ra_reg - PHYS_INDEX;
                }
        }

spill_fail:
        ralloc_free(c->nodes.info);
        c->nodes.info = NULL;
        c->nodes.alloc_count = 0;
        ralloc_free(c->g);
        c->g = NULL;
        return temp_registers;
}