/*
 * Copyright (C) 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3_ra.h"

/* The spilling pass leaves out a few details required to successfully operate
 * ldp/stp:
 *
 * 1. ldp/stp can only load/store 4 components at a time, but spilling ignores
 *    that and just spills/restores entire values, including arrays and values
 *    created for texture setup which can be more than 4 components.
 * 2. The immediate offset only has 13 bits and is signed, so if we spill a lot
 *    or have very large arrays before spilling then we could run out.
 * 3. The spiller doesn't add barrier dependencies needed for post-RA
 *    scheduling.
 *
 * The first one, in particular, is much easier to handle after RA because
 * arrays and normal values can be treated the same way. Therefore this pass
 * runs after RA, and handles all three issues. This keeps the complexity out
 * of the spiller.
 */

static unsigned
component_bytes(struct ir3_register *src)
{
   return (src->flags & IR3_REG_HALF) ? 2 : 4;
}

/* Note: this won't work if the base register is anything other than 0!
 * Dynamic bases, which we'll need for "real" function call support, will
 * probably be a lot harder to handle and may require reserving another
 * register.
 */
static void
set_base_reg(struct ir3_instruction *mem, unsigned val)
{
   struct ir3_instruction *mov = ir3_instr_create(mem->block, OPC_MOV, 1, 1);
   ir3_dst_create(mov, mem->srcs[0]->num, mem->srcs[0]->flags);
   ir3_src_create(mov, INVALID_REG, IR3_REG_IMMED)->uim_val = val;
   mov->cat1.dst_type = mov->cat1.src_type = TYPE_U32;

   ir3_instr_move_before(mov, mem);
}

static void
reset_base_reg(struct ir3_instruction *mem)
{
   /* If the base register is killed, then we don't need to clobber it, and it
    * may be reused as a destination, so we can't always clobber it after the
    * instruction anyway.
    */
   struct ir3_register *base = mem->srcs[0];
   if (base->flags & IR3_REG_KILL)
      return;

   struct ir3_instruction *mov = ir3_instr_create(mem->block, OPC_MOV, 1, 1);
   ir3_dst_create(mov, base->num, base->flags);
   ir3_src_create(mov, INVALID_REG, IR3_REG_IMMED)->uim_val = 0;
   mov->cat1.dst_type = mov->cat1.src_type = TYPE_U32;

   ir3_instr_move_after(mov, mem);
}
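
/* Together, set_base_reg() and reset_base_reg() let the handlers below fold
 * an out-of-range immediate offset into the base register. As a sketch (the
 * disassembly here is illustrative, not exact ir3 output), a spill at byte
 * offset 4096, which would be sign-extended into a negative immediate,
 * conceptually goes from:
 *
 *    stp p[base + 4096], src, 4
 *
 * to:
 *
 *    mov base, 4096
 *    stp p[base + 0], src, 4
 *    mov base, 0
 *
 * where the final mov restoring the base is omitted when the memory
 * instruction kills the base register.
 */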

/* There are 13 bits, but 1 << 12 will be sign-extended into a negative offset
 * so it can't be used directly. Therefore only offsets under 1 << 12 can be
 * used without any adjustments.
 */
#define MAX_CAT6_SIZE (1u << 12)

static void
handle_oob_offset_spill(struct ir3_instruction *spill)
{
   unsigned components = spill->srcs[2]->uim_val;

   if (spill->cat6.dst_offset + components * component_bytes(spill->srcs[1]) <
       MAX_CAT6_SIZE)
      return;

   set_base_reg(spill, spill->cat6.dst_offset);
   reset_base_reg(spill);
   spill->cat6.dst_offset = 0;
}

static void
handle_oob_offset_reload(struct ir3_instruction *reload)
{
   unsigned components = reload->srcs[2]->uim_val;
   unsigned offset = reload->srcs[1]->uim_val;
   if (offset + components * component_bytes(reload->dsts[0]) < MAX_CAT6_SIZE)
      return;

   set_base_reg(reload, offset);
   reset_base_reg(reload);
   reload->srcs[1]->uim_val = 0;
}

static void
split_spill(struct ir3_instruction *spill)
{
   unsigned orig_components = spill->srcs[2]->uim_val;

   /* We don't handle splitting dependencies. */
   assert(spill->deps_count == 0);

   if (orig_components <= 4) {
      if (spill->srcs[1]->flags & IR3_REG_ARRAY) {
         spill->srcs[1]->wrmask = MASK(orig_components);
         spill->srcs[1]->num = spill->srcs[1]->array.base;
         spill->srcs[1]->flags &= ~IR3_REG_ARRAY;
      }
      return;
   }

   for (unsigned comp = 0; comp < orig_components; comp += 4) {
      unsigned components = MIN2(orig_components - comp, 4);
      struct ir3_instruction *clone = ir3_instr_clone(spill);
      ir3_instr_move_before(clone, spill);

      clone->srcs[1]->wrmask = MASK(components);
      if (clone->srcs[1]->flags & IR3_REG_ARRAY) {
         clone->srcs[1]->num = clone->srcs[1]->array.base + comp;
         clone->srcs[1]->flags &= ~IR3_REG_ARRAY;
      }

      clone->srcs[2]->uim_val = components;
      clone->cat6.dst_offset += comp * component_bytes(spill->srcs[1]);
   }

   list_delinit(&spill->node);
}

static void
split_reload(struct ir3_instruction *reload)
{
   unsigned orig_components = reload->srcs[2]->uim_val;

   /* We don't handle splitting dependencies. */
   assert(reload->deps_count == 0);

   if (orig_components <= 4) {
      if (reload->dsts[0]->flags & IR3_REG_ARRAY) {
         reload->dsts[0]->wrmask = MASK(orig_components);
         reload->dsts[0]->num = reload->dsts[0]->array.base;
         reload->dsts[0]->flags &= ~IR3_REG_ARRAY;
      }
      return;
   }

   for (unsigned comp = 0; comp < orig_components; comp += 4) {
      unsigned components = MIN2(orig_components - comp, 4);
      struct ir3_instruction *clone = ir3_instr_clone(reload);
      ir3_instr_move_before(clone, reload);

      clone->dsts[0]->wrmask = MASK(components);
      if (clone->dsts[0]->flags & IR3_REG_ARRAY) {
         clone->dsts[0]->num = clone->dsts[0]->array.base + comp;
         clone->dsts[0]->flags &= ~IR3_REG_ARRAY;
      }

      clone->srcs[2]->uim_val = components;
      clone->srcs[1]->uim_val += comp * component_bytes(reload->dsts[0]);
   }

   list_delinit(&reload->node);
}
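
/* As a sketch of the splitting above, assuming full (32-bit) registers: a
 * 6-component SPILL_MACRO at dst_offset 0 becomes a 4-component store at
 * byte offset 0 followed by a 2-component store at byte offset 16, and the
 * original macro is deleted. For half registers the second offset would be 8.
 */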

/* The post-RA scheduler doesn't understand the spill area and would otherwise
 * be free to reorder spills and reloads, so add barrier dependencies keeping
 * every spill ordered against all surrounding spills and reloads. Reloads are
 * left unordered relative to each other, since two loads from the spill area
 * can't conflict.
 */
static void
add_spill_reload_deps(struct ir3_block *block)
{
   struct ir3_instruction *last_spill = NULL;

   /* Forward pass: each spill/reload must execute after the last spill
    * preceding it.
    */
   foreach_instr (instr, &block->instr_list) {
      if ((instr->opc == OPC_SPILL_MACRO || instr->opc == OPC_RELOAD_MACRO) &&
          last_spill) {
         ir3_instr_add_dep(instr, last_spill);
      }

      if (instr->opc == OPC_SPILL_MACRO)
         last_spill = instr;
   }

   last_spill = NULL;

   /* Backward pass: each spill must execute after every spill/reload
    * preceding it.
    */
   foreach_instr_rev (instr, &block->instr_list) {
      if ((instr->opc == OPC_SPILL_MACRO || instr->opc == OPC_RELOAD_MACRO) &&
          last_spill) {
         ir3_instr_add_dep(last_spill, instr);
      }

      if (instr->opc == OPC_SPILL_MACRO)
         last_spill = instr;
   }
}

bool
ir3_lower_spill(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->opc == OPC_SPILL_MACRO) {
            handle_oob_offset_spill(instr);
            split_spill(instr);
         } else if (instr->opc == OPC_RELOAD_MACRO) {
            handle_oob_offset_reload(instr);
            split_reload(instr);
         }
      }

      add_spill_reload_deps(block);

      /* With offsets in range and sizes at most 4 components, the macros can
       * now become the real instructions.
       */
      foreach_instr (instr, &block->instr_list) {
         if (instr->opc == OPC_SPILL_MACRO)
            instr->opc = OPC_STP;
         else if (instr->opc == OPC_RELOAD_MACRO)
            instr->opc = OPC_LDP;
      }
   }

   return true;
}