/*
 * Copyright © 2020 Google LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* Lowers nir_intrinsic_load_ubo() to nir_intrinsic_load_ubo_vec4() taking an
 * offset in vec4 units. This is a fairly common mode of UBO addressing for
 * hardware to have, and it gives NIR a chance to optimize the addressing math
 * and CSE the loads.
 *
 * This pass handles lowering for loads that straddle a vec4 alignment
 * boundary. We try to minimize the extra loads we generate for that case,
 * and are ensured non-straddling loads with:
 *
 * - std140 (GLSL 1.40, GLSL ES)
 * - Vulkan "Extended Layout" (the baseline for UBOs)
 *
 * but not:
 *
 * - GLSL 4.30's new packed mode (enabled by PIPE_CAP_LOAD_CONSTBUF) where
 *   vec3 arrays are packed tightly.
 *
 * - PackedDriverUniformStorage in GL (enabled by PIPE_CAP_PACKED_UNIFORMS)
 *   combined with nir_lower_uniforms_to_ubo, where values in the default
 *   uniform block are packed tightly.
 *
 * - Vulkan's scalarBlockLayout optional feature:
 *
 *   "A member is defined to improperly straddle if either of the following are
 *    true:
 *
 *    • It is a vector with total size less than or equal to 16 bytes, and has
 *      Offset decorations placing its first byte at F and its last byte at L
 *      where floor(F / 16) != floor(L / 16).
 *    • It is a vector with total size greater than 16 bytes and has its Offset
 *      decorations placing its first byte at a non-integer multiple of 16.
 *
 *    [...]
 *
 *    Unless the scalarBlockLayout feature is enabled on the device:
 *
 *    • Vectors must not improperly straddle, as defined above."
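 *
 * As an illustrative example (not taken from the specs quoted above): a
 * 32-bit vec2 load_ubo at constant byte offset 24 does not straddle, so it
 * lowers to a single load_ubo_vec4 at vec4 offset 1 (24 >> 4) with its
 * component index set to 2 (byte 8 within that vec4). Straddling loads,
 * which only arise with the packed layouts above, instead take the
 * multi-load paths handled later in this file.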
 */

#include "nir.h"
#include "nir_builder.h"

static bool
nir_lower_ubo_vec4_filter(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   return nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_ubo;
}

static nir_intrinsic_instr *
create_load(nir_builder *b, nir_ssa_def *block, nir_ssa_def *offset,
            unsigned bit_size, unsigned num_components)
{
   nir_ssa_def *def = nir_load_ubo_vec4(b, num_components, bit_size, block, offset);
   return nir_instr_as_intrinsic(def->parent_instr);
}

static nir_ssa_def *
nir_lower_ubo_vec4_lower(nir_builder *b, nir_instr *instr, void *data)
{
   b->cursor = nir_before_instr(instr);

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

   nir_ssa_def *byte_offset = nir_ssa_for_src(b, intr->src[1], 1);
   nir_ssa_def *vec4_offset = nir_ushr_imm(b, byte_offset, 4);

   unsigned align_mul = nir_intrinsic_align_mul(intr);
   unsigned align_offset = nir_intrinsic_align_offset(intr);

   int chan_size_bytes = intr->dest.ssa.bit_size / 8;
   int chans_per_vec4 = 16 / chan_size_bytes;

   /* We don't care if someone figured out that things are aligned beyond
    * vec4.
    */
   align_mul = MIN2(align_mul, 16);
   align_offset &= 15;
   assert(align_offset % chan_size_bytes == 0);

   unsigned num_components = intr->num_components;
   bool aligned_mul = (align_mul == 16 &&
                       align_offset + chan_size_bytes * num_components <= 16);
   if (!aligned_mul)
      num_components = chans_per_vec4;

   nir_intrinsic_instr *load = create_load(b, intr->src[0].ssa, vec4_offset,
                                           intr->dest.ssa.bit_size,
                                           num_components);

   nir_intrinsic_set_access(load, nir_intrinsic_access(intr));

   nir_ssa_def *result = &load->dest.ssa;

   int align_chan_offset = align_offset / chan_size_bytes;
   if (aligned_mul) {
      /* For an aligned load, just ask the backend to load from the known
       * offset's component.
       */
      nir_intrinsic_set_component(load, align_chan_offset);
   } else if (intr->num_components == 1) {
      /* If we're loading a single component, that component alone won't
       * straddle a vec4 boundary so we can do this with a single UBO load.
       */
      nir_ssa_def *component =
         nir_iand_imm(b,
                      nir_udiv_imm(b, byte_offset, chan_size_bytes),
                      chans_per_vec4 - 1);

      result = nir_vector_extract(b, result, component);
   } else if (align_mul == 8 &&
              align_offset + chan_size_bytes * intr->num_components <= 8) {
      /* Special case: Loading small vectors from offset % 8 == 0 can be done
       * with just one load and one bcsel.
       */
      nir_component_mask_t low_channels =
         BITSET_MASK(intr->num_components) << (align_chan_offset);
      nir_component_mask_t high_channels =
         low_channels << (8 / chan_size_bytes);
      result = nir_bcsel(b, nir_test_mask(b, byte_offset, 8),
                         nir_channels(b, result, high_channels),
                         nir_channels(b, result, low_channels));
   } else {
      /* General fallback case: Per-result-channel bcsel-based extraction
       * from two separate vec4 loads.
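       *
       * As an illustrative example (assuming one of the packed layouts from
       * the comment at the top of this file): a 32-bit vec4 load at byte
       * offset 8 takes its first two channels from components 2 and 3 of the
       * vec4 at offset 0 and its last two channels from components 0 and 1
       * of the vec4 at offset 16, so each result channel picks its source
       * load with a bcsel on that channel's own vec4 offset.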
       */
      assert(num_components == 4);
      nir_ssa_def *next_vec4_offset = nir_iadd_imm(b, vec4_offset, 1);
      nir_intrinsic_instr *next_load = create_load(b, intr->src[0].ssa, next_vec4_offset,
                                                   intr->dest.ssa.bit_size,
                                                   num_components);

      nir_ssa_def *channels[NIR_MAX_VEC_COMPONENTS];
      for (unsigned i = 0; i < intr->num_components; i++) {
         nir_ssa_def *chan_byte_offset = nir_iadd_imm(b, byte_offset, i * chan_size_bytes);

         nir_ssa_def *chan_vec4_offset = nir_ushr_imm(b, chan_byte_offset, 4);

         nir_ssa_def *component =
            nir_iand_imm(b,
                         nir_udiv_imm(b, chan_byte_offset, chan_size_bytes),
                         chans_per_vec4 - 1);

         channels[i] = nir_vector_extract(b,
                                          nir_bcsel(b,
                                                    nir_ieq(b,
                                                            chan_vec4_offset,
                                                            vec4_offset),
                                                    &load->dest.ssa,
                                                    &next_load->dest.ssa),
                                          component);
      }

      result = nir_vec(b, channels, intr->num_components);
   }

   return result;
}

bool
nir_lower_ubo_vec4(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader,
                                        nir_lower_ubo_vec4_filter,
                                        nir_lower_ubo_vec4_lower,
                                        NULL);
}
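
/* A minimal usage sketch, not part of this pass: a driver whose hardware
 * addresses UBOs in vec4 slots would typically run this late in its NIR
 * pipeline and clean up the generated addressing math afterwards, e.g.
 *
 *    NIR_PASS_V(shader, nir_lower_ubo_vec4);
 *    NIR_PASS_V(shader, nir_opt_algebraic);
 *    NIR_PASS_V(shader, nir_opt_dce);
 *
 * The exact placement and surrounding passes are driver-specific.
 */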