1/* 2 * Copyright © 2021 Google, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 */ 23 24#include "ir3_nir.h" 25 26/* 27 * Lowering for 64b intrinsics generated with OpenCL or with 28 * VK_KHR_buffer_device_address. All our intrinsics from a hw 29 * standpoint are 32b, so we just need to combine in zero for 30 * the upper 32bits and let the other nir passes clean up the mess. 31 */ 32 33static bool 34lower_64b_intrinsics_filter(const nir_instr *instr, const void *unused) 35{ 36 (void)unused; 37 38 if (instr->type != nir_instr_type_intrinsic) 39 return false; 40 41 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 42 43 if (intr->intrinsic == nir_intrinsic_load_deref || 44 intr->intrinsic == nir_intrinsic_store_deref) 45 return false; 46 47 if (is_intrinsic_store(intr->intrinsic)) 48 return nir_src_bit_size(intr->src[0]) == 64; 49 50 if (nir_intrinsic_dest_components(intr) == 0) 51 return false; 52 53 return nir_dest_bit_size(intr->dest) == 64; 54} 55 56static nir_ssa_def * 57lower_64b_intrinsics(nir_builder *b, nir_instr *instr, void *unused) 58{ 59 (void)unused; 60 61 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 62 63 /* We could be *slightly* more clever and, for ex, turn a 64b vec4 64 * load into two 32b vec4 loads, rather than 4 32b vec2 loads. 65 */ 66 67 if (is_intrinsic_store(intr->intrinsic)) { 68 unsigned offset_src_idx; 69 switch (intr->intrinsic) { 70 case nir_intrinsic_store_ssbo: 71 case nir_intrinsic_store_global_ir3: 72 offset_src_idx = 2; 73 break; 74 default: 75 offset_src_idx = 1; 76 } 77 78 unsigned num_comp = nir_intrinsic_src_components(intr, 0); 79 unsigned wrmask = nir_intrinsic_has_write_mask(intr) ? 80 nir_intrinsic_write_mask(intr) : BITSET_MASK(num_comp); 81 nir_ssa_def *val = nir_ssa_for_src(b, intr->src[0], num_comp); 82 nir_ssa_def *off = nir_ssa_for_src(b, intr->src[offset_src_idx], 1); 83 84 for (unsigned i = 0; i < num_comp; i++) { 85 if (!(wrmask & BITFIELD_BIT(i))) 86 continue; 87 88 nir_ssa_def *c64 = nir_channel(b, val, i); 89 nir_ssa_def *c32 = nir_unpack_64_2x32(b, c64); 90 91 nir_intrinsic_instr *store = 92 nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr)); 93 store->num_components = 2; 94 store->src[0] = nir_src_for_ssa(c32); 95 store->src[offset_src_idx] = nir_src_for_ssa(off); 96 97 if (nir_intrinsic_has_write_mask(intr)) 98 nir_intrinsic_set_write_mask(store, 0x3); 99 nir_builder_instr_insert(b, &store->instr); 100 101 off = nir_iadd(b, off, nir_imm_intN_t(b, 8, off->bit_size)); 102 } 103 104 return NIR_LOWER_INSTR_PROGRESS_REPLACE; 105 } 106 107 unsigned num_comp = nir_intrinsic_dest_components(intr); 108 109 nir_ssa_def *def = &intr->dest.ssa; 110 def->bit_size = 32; 111 112 /* load_kernel_input is handled specially, lowering to two 32b inputs: 113 */ 114 if (intr->intrinsic == nir_intrinsic_load_kernel_input) { 115 assert(num_comp == 1); 116 117 nir_ssa_def *offset = nir_iadd(b, 118 nir_ssa_for_src(b, intr->src[0], 1), 119 nir_imm_int(b, 4)); 120 121 nir_ssa_def *upper = nir_build_load_kernel_input( 122 b, 1, 32, offset); 123 124 return nir_pack_64_2x32_split(b, def, upper); 125 } 126 127 nir_ssa_def *components[num_comp]; 128 129 if (is_intrinsic_load(intr->intrinsic)) { 130 unsigned offset_src_idx; 131 switch(intr->intrinsic) { 132 case nir_intrinsic_load_ssbo: 133 case nir_intrinsic_load_ubo: 134 case nir_intrinsic_load_global_ir3: 135 offset_src_idx = 1; 136 break; 137 default: 138 offset_src_idx = 0; 139 } 140 141 nir_ssa_def *off = nir_ssa_for_src(b, intr->src[offset_src_idx], 1); 142 143 for (unsigned i = 0; i < num_comp; i++) { 144 nir_intrinsic_instr *load = 145 nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr)); 146 load->num_components = 2; 147 load->src[offset_src_idx] = nir_src_for_ssa(off); 148 149 nir_ssa_dest_init(&load->instr, &load->dest, 2, 32, NULL); 150 nir_builder_instr_insert(b, &load->instr); 151 152 components[i] = nir_pack_64_2x32(b, &load->dest.ssa); 153 154 off = nir_iadd(b, off, nir_imm_intN_t(b, 8, off->bit_size)); 155 } 156 } else { 157 /* The remaining (non load/store) intrinsics just get zero- 158 * extended from 32b to 64b: 159 */ 160 for (unsigned i = 0; i < num_comp; i++) { 161 nir_ssa_def *c = nir_channel(b, def, i); 162 components[i] = nir_pack_64_2x32_split(b, c, nir_imm_zero(b, 1, 32)); 163 } 164 } 165 166 return nir_build_alu_src_arr(b, nir_op_vec(num_comp), components); 167} 168 169bool 170ir3_nir_lower_64b_intrinsics(nir_shader *shader) 171{ 172 return nir_shader_lower_instructions( 173 shader, lower_64b_intrinsics_filter, 174 lower_64b_intrinsics, NULL); 175} 176 177/* 178 * Lowering for 64b undef instructions, splitting into a two 32b undefs 179 */ 180 181static nir_ssa_def * 182lower_64b_undef(nir_builder *b, nir_instr *instr, void *unused) 183{ 184 (void)unused; 185 186 nir_ssa_undef_instr *undef = nir_instr_as_ssa_undef(instr); 187 unsigned num_comp = undef->def.num_components; 188 nir_ssa_def *components[num_comp]; 189 190 for (unsigned i = 0; i < num_comp; i++) { 191 nir_ssa_def *lowered = nir_ssa_undef(b, 2, 32); 192 193 components[i] = nir_pack_64_2x32_split(b, 194 nir_channel(b, lowered, 0), 195 nir_channel(b, lowered, 1)); 196 } 197 198 return nir_build_alu_src_arr(b, nir_op_vec(num_comp), components); 199} 200 201static bool 202lower_64b_undef_filter(const nir_instr *instr, const void *unused) 203{ 204 (void)unused; 205 206 return instr->type == nir_instr_type_ssa_undef && 207 nir_instr_as_ssa_undef(instr)->def.bit_size == 64; 208} 209 210bool 211ir3_nir_lower_64b_undef(nir_shader *shader) 212{ 213 return nir_shader_lower_instructions( 214 shader, lower_64b_undef_filter, 215 lower_64b_undef, NULL); 216} 217 218/* 219 * Lowering for load_global/store_global with 64b addresses to ir3 220 * variants, which instead take a uvec2_32 221 */ 222 223static bool 224lower_64b_global_filter(const nir_instr *instr, const void *unused) 225{ 226 (void)unused; 227 228 if (instr->type != nir_instr_type_intrinsic) 229 return false; 230 231 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 232 switch (intr->intrinsic) { 233 case nir_intrinsic_load_global: 234 case nir_intrinsic_load_global_constant: 235 case nir_intrinsic_store_global: 236 case nir_intrinsic_global_atomic_add: 237 case nir_intrinsic_global_atomic_imin: 238 case nir_intrinsic_global_atomic_umin: 239 case nir_intrinsic_global_atomic_imax: 240 case nir_intrinsic_global_atomic_umax: 241 case nir_intrinsic_global_atomic_and: 242 case nir_intrinsic_global_atomic_or: 243 case nir_intrinsic_global_atomic_xor: 244 case nir_intrinsic_global_atomic_exchange: 245 case nir_intrinsic_global_atomic_comp_swap: 246 return true; 247 default: 248 return false; 249 } 250} 251 252static nir_ssa_def * 253lower_64b_global(nir_builder *b, nir_instr *instr, void *unused) 254{ 255 (void)unused; 256 257 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 258 bool load = intr->intrinsic != nir_intrinsic_store_global; 259 260 nir_ssa_def *addr64 = nir_ssa_for_src(b, intr->src[load ? 0 : 1], 1); 261 nir_ssa_def *addr = nir_unpack_64_2x32(b, addr64); 262 263 /* 264 * Note that we can get vec8/vec16 with OpenCL.. we need to split 265 * those up into max 4 components per load/store. 266 */ 267 268#define GLOBAL_IR3_2SRC(name) \ 269 case nir_intrinsic_##name: { \ 270 return nir_build_##name##_ir3(b, nir_dest_bit_size(intr->dest), addr, \ 271 nir_ssa_for_src(b, intr->src[1], 1)); \ 272 } 273 274 switch (intr->intrinsic) { 275 GLOBAL_IR3_2SRC(global_atomic_add) 276 GLOBAL_IR3_2SRC(global_atomic_imin) 277 GLOBAL_IR3_2SRC(global_atomic_umin) 278 GLOBAL_IR3_2SRC(global_atomic_imax) 279 GLOBAL_IR3_2SRC(global_atomic_umax) 280 GLOBAL_IR3_2SRC(global_atomic_and) 281 GLOBAL_IR3_2SRC(global_atomic_or) 282 GLOBAL_IR3_2SRC(global_atomic_xor) 283 GLOBAL_IR3_2SRC(global_atomic_exchange) 284 case nir_intrinsic_global_atomic_comp_swap: 285 return nir_build_global_atomic_comp_swap_ir3( 286 b, nir_dest_bit_size(intr->dest), addr, 287 nir_ssa_for_src(b, intr->src[1], 1), 288 nir_ssa_for_src(b, intr->src[2], 1)); 289 default: 290 break; 291 } 292#undef GLOBAL_IR3_2SRC 293 294 if (load) { 295 unsigned num_comp = nir_intrinsic_dest_components(intr); 296 nir_ssa_def *components[num_comp]; 297 for (unsigned off = 0; off < num_comp;) { 298 unsigned c = MIN2(num_comp - off, 4); 299 nir_ssa_def *val = nir_build_load_global_ir3( 300 b, c, nir_dest_bit_size(intr->dest), 301 addr, nir_imm_int(b, off)); 302 for (unsigned i = 0; i < c; i++) { 303 components[off++] = nir_channel(b, val, i); 304 } 305 } 306 return nir_build_alu_src_arr(b, nir_op_vec(num_comp), components); 307 } else { 308 unsigned num_comp = nir_intrinsic_src_components(intr, 0); 309 nir_ssa_def *value = nir_ssa_for_src(b, intr->src[0], num_comp); 310 for (unsigned off = 0; off < num_comp; off += 4) { 311 unsigned c = MIN2(num_comp - off, 4); 312 nir_ssa_def *v = nir_channels(b, value, BITFIELD_MASK(c) << off); 313 nir_build_store_global_ir3(b, v, addr, nir_imm_int(b, off)); 314 } 315 return NIR_LOWER_INSTR_PROGRESS_REPLACE; 316 } 317} 318 319bool 320ir3_nir_lower_64b_global(nir_shader *shader) 321{ 322 return nir_shader_lower_instructions( 323 shader, lower_64b_global_filter, 324 lower_64b_global, NULL); 325} 326