1/*
2 * Copyright (C) 2021 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24#include "ir3_ra.h"
25
26/* The spilling pass leaves out a few details required to successfully operate
27 * ldp/stp:
28 *
29 * 1. ldp/stp can only load/store 4 components at a time, but spilling ignores
30 *    that and just spills/restores entire values, including arrays and values
31 *    created for texture setup which can be more than 4 components.
32 * 2. The immediate offset only has 13 bits and is signed, so if we spill a lot
33 *    or have very large arrays before spilling then we could run out.
34 * 3. The spiller doesn't add barrier dependencies needed for post-RA
35 *    scheduling.
36 *
37 * The first one, in particular, is much easier to handle after RA because
38 * arrays and normal values can be treated the same way. Therefore this pass
39 * runs after RA, and handles all three issues. This keeps the complexity out of
40 * the spiller.
41 */
42
43static unsigned
44component_bytes(struct ir3_register *src)
45{
46   return (src->flags & IR3_REG_HALF) ? 2 : 4;
47}
48
49/* Note: this won't work if the base register is anything other than 0!
50 * Dynamic bases, which we'll need for "real" function call support, will
51 * probably be a lot harder to handle and may require reserving another
52 * register.
53 */
54static void
55set_base_reg(struct ir3_instruction *mem, unsigned val)
56{
57   struct ir3_instruction *mov = ir3_instr_create(mem->block, OPC_MOV, 1, 1);
58   ir3_dst_create(mov, mem->srcs[0]->num, mem->srcs[0]->flags);
59   ir3_src_create(mov, INVALID_REG, IR3_REG_IMMED)->uim_val = val;
60   mov->cat1.dst_type = mov->cat1.src_type = TYPE_U32;
61
62   ir3_instr_move_before(mov, mem);
63}
64
65static void
66reset_base_reg(struct ir3_instruction *mem)
67{
68   /* If the base register is killed, then we don't need to clobber it and it
69    * may be reused as a destination so we can't always clobber it after the
70    * instruction anyway.
71    */
72   struct ir3_register *base = mem->srcs[0];
73   if (base->flags & IR3_REG_KILL)
74      return;
75
76   struct ir3_instruction *mov = ir3_instr_create(mem->block, OPC_MOV, 1, 1);
77   ir3_dst_create(mov, base->num, base->flags);
78   ir3_src_create(mov, INVALID_REG, IR3_REG_IMMED)->uim_val = 0;
79   mov->cat1.dst_type = mov->cat1.src_type = TYPE_U32;
80
81   ir3_instr_move_after(mov, mem);
82}
83
84/* There are 13 bits, but 1 << 12 will be sign-extended into a negative offset
85 * so it can't be used directly. Therefore only offsets under 1 << 12 can be
86 * used without any adjustments.
87 */
88#define MAX_CAT6_SIZE (1u << 12)
89
90static void
91handle_oob_offset_spill(struct ir3_instruction *spill)
92{
93   unsigned components = spill->srcs[2]->uim_val;
94
95   if (spill->cat6.dst_offset + components * component_bytes(spill->srcs[1]) < MAX_CAT6_SIZE)
96      return;
97
98   set_base_reg(spill, spill->cat6.dst_offset);
99   reset_base_reg(spill);
100   spill->cat6.dst_offset = 0;
101}
102
103static void
104handle_oob_offset_reload(struct ir3_instruction *reload)
105{
106   unsigned components = reload->srcs[2]->uim_val;
107   unsigned offset = reload->srcs[1]->uim_val;
108   if (offset + components * component_bytes(reload->dsts[0]) < MAX_CAT6_SIZE)
109      return;
110
111   set_base_reg(reload, offset);
112   reset_base_reg(reload);
113   reload->srcs[1]->uim_val = 0;
114}
115
116static void
117split_spill(struct ir3_instruction *spill)
118{
119   unsigned orig_components = spill->srcs[2]->uim_val;
120
121   /* We don't handle splitting dependencies. */
122   assert(spill->deps_count == 0);
123
124   if (orig_components <= 4) {
125      if (spill->srcs[1]->flags & IR3_REG_ARRAY) {
126         spill->srcs[1]->wrmask = MASK(orig_components);
127         spill->srcs[1]->num = spill->srcs[1]->array.base;
128         spill->srcs[1]->flags &= ~IR3_REG_ARRAY;
129      }
130      return;
131   }
132
133   for (unsigned comp = 0; comp < orig_components; comp += 4) {
134      unsigned components = MIN2(orig_components - comp, 4);
135      struct ir3_instruction *clone = ir3_instr_clone(spill);
136      ir3_instr_move_before(clone, spill);
137
138      clone->srcs[1]->wrmask = MASK(components);
139      if (clone->srcs[1]->flags & IR3_REG_ARRAY) {
140         clone->srcs[1]->num = clone->srcs[1]->array.base + comp;
141         clone->srcs[1]->flags &= ~IR3_REG_ARRAY;
142      }
143
144      clone->srcs[2]->uim_val = components;
145      clone->cat6.dst_offset += comp * component_bytes(spill->srcs[1]);
146   }
147
148   list_delinit(&spill->node);
149}
150
151static void
152split_reload(struct ir3_instruction *reload)
153{
154   unsigned orig_components = reload->srcs[2]->uim_val;
155
156   assert(reload->deps_count == 0);
157
158   if (orig_components <= 4) {
159      if (reload->dsts[0]->flags & IR3_REG_ARRAY) {
160         reload->dsts[0]->wrmask = MASK(orig_components);
161         reload->dsts[0]->num = reload->dsts[0]->array.base;
162         reload->dsts[0]->flags &= ~IR3_REG_ARRAY;
163      }
164      return;
165   }
166
167   for (unsigned comp = 0; comp < orig_components; comp += 4) {
168      unsigned components = MIN2(orig_components - comp, 4);
169      struct ir3_instruction *clone = ir3_instr_clone(reload);
170      ir3_instr_move_before(clone, reload);
171
172      clone->dsts[0]->wrmask = MASK(components);
173      if (clone->dsts[0]->flags & IR3_REG_ARRAY) {
174         clone->dsts[0]->num = clone->dsts[0]->array.base + comp;
175         clone->dsts[0]->flags &= ~IR3_REG_ARRAY;
176      }
177
178      clone->srcs[2]->uim_val = components;
179      clone->srcs[1]->uim_val += comp * component_bytes(reload->dsts[0]);
180   }
181
182   list_delinit(&reload->node);
183}
184
185static void
186add_spill_reload_deps(struct ir3_block *block)
187{
188   struct ir3_instruction *last_spill = NULL;
189
190   foreach_instr (instr, &block->instr_list) {
191      if ((instr->opc == OPC_SPILL_MACRO || instr->opc == OPC_RELOAD_MACRO) &&
192          last_spill) {
193         ir3_instr_add_dep(instr, last_spill);
194      }
195
196      if (instr->opc == OPC_SPILL_MACRO)
197         last_spill = instr;
198   }
199
200
201   last_spill = NULL;
202
203   foreach_instr_rev (instr, &block->instr_list) {
204      if ((instr->opc == OPC_SPILL_MACRO || instr->opc == OPC_RELOAD_MACRO) &&
205          last_spill) {
206         ir3_instr_add_dep(last_spill, instr);
207      }
208
209      if (instr->opc == OPC_SPILL_MACRO)
210         last_spill = instr;
211   }
212}
213
214bool
215ir3_lower_spill(struct ir3 *ir)
216{
217   foreach_block (block, &ir->block_list) {
218      foreach_instr_safe (instr, &block->instr_list) {
219         if (instr->opc == OPC_SPILL_MACRO) {
220            handle_oob_offset_spill(instr);
221            split_spill(instr);
222         } else if (instr->opc == OPC_RELOAD_MACRO) {
223            handle_oob_offset_reload(instr);
224            split_reload(instr);
225         }
226      }
227
228      add_spill_reload_deps(block);
229
230      foreach_instr (instr, &block->instr_list) {
231         if (instr->opc == OPC_SPILL_MACRO)
232            instr->opc = OPC_STP;
233         else if (instr->opc == OPC_RELOAD_MACRO)
234            instr->opc = OPC_LDP;
235      }
236   }
237
238   return true;
239}
240