/*
 * Copyright © 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* This helps separate shaders because the next shader doesn't have to be known.
 *
 * It optimizes VS and TES outputs before FS as follows:
 * - Eliminate and merge equal outputs, treating undef as equal to everything, e.g.
 *   (x,y,undef,undef) == (undef,y,z,undef) --> (x,y,z,undef) regardless of the interpolation
 *   qualifier (AMD can map 1 output to multiple PS inputs and interpolate each differently).
 * - Remove constant outputs that match the AMD DEFAULT_VAL options, e.g. (0,0,0,1),
 *   treating undef as matching any value.
 *
 * It requires that there is no indirect indexing and that all output stores are scalar.
 */

#include "ac_nir.h"
#include "nir_builder.h"

/* Per-channel bookkeeping for one output slot. */
struct ac_chan_info {
   /* The instruction producing the stored value; NULL means the channel is
    * undef / never written.
    */
   nir_instr *value;
   nir_intrinsic_instr *store_intr; /* The intrinsic writing the value. */
};

/* Accumulated information about all scalar stores to one varying slot. */
struct ac_out_info {
   unsigned base; /* nir_intrinsic_base */
   /* OR of the nir_alu_type of every store to this slot; nir_alu_type keeps
    * the bit size in its low bits, so (types & 16) / (types & 32) test for
    * the presence of 16-bit / 32-bit stores.
    */
   nir_alu_type types;
   bool duplicated; /* Set when this output was merged into an equal earlier output. */
   bool constant;   /* Set when this output was replaced by a DEFAULT_VAL param export. */

   /* Channels 0-3 are 32-bit channels or low bits of 16-bit channels.
    * Channels 4-7 are high bits of 16-bit channels.
    */
   struct ac_chan_info chan[8];
};

/* Remove all channel stores of "out" as varyings and clear the bookkeeping.
 * NOTE(review): nir_remove_varying may keep a store alive as a system-value
 * output (it only drops the varying portion) — confirm against its definition.
 */
static void ac_remove_varying(struct ac_out_info *out)
{
   /* Remove the output. (all channels) */
   for (unsigned i = 0; i < ARRAY_SIZE(out->chan); i++) {
      if (out->chan[i].store_intr) {
         nir_remove_varying(out->chan[i].store_intr);
         out->chan[i].store_intr = NULL;
         out->chan[i].value = NULL;
      }
   }
}

/* Return true if the output matches DEFAULT_VAL and has been eliminated.
 * On success, param_export_index[semantic] is overwritten with the
 * AC_EXP_PARAM_DEFAULT_VAL_* encoding and all stores of the output are removed.
 */
static bool ac_eliminate_const_output(struct ac_out_info *out,
                                      gl_varying_slot semantic,
                                      uint8_t *param_export_index)
{
   /* Only 32-bit outputs can be replaced by DEFAULT_VAL (the "32" is the
    * bit-size encoded in nir_alu_type).
    */
   if (!(out->types & 32))
      return false;

   bool is_zero[4] = {0}, is_one[4] = {0};

   /* Classify each of the 4 channels as 0, 1, undef (both), or "other". */
   for (unsigned i = 0; i < 4; i++) {
      /* NULL means undef. */
      if (!out->chan[i].value) {
         /* Undef matches both 0 and 1. */
         is_zero[i] = true;
         is_one[i] = true;
      } else if (out->chan[i].value->type == nir_instr_type_load_const) {
         if (nir_instr_as_load_const(out->chan[i].value)->value[0].f32 == 0)
            is_zero[i] = true;
         else if (nir_instr_as_load_const(out->chan[i].value)->value[0].f32 == 1)
            is_one[i] = true;
         else
            return false; /* other constant */
      } else
         return false; /* not a constant */
   }

   /* Only certain combinations of 0 and 1 are supported: the hardware offers
    * exactly the 4 DEFAULT_VAL encodings selected below.
    */
   unsigned default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */

   if (is_zero[0] && is_zero[1] && is_zero[2]) {
      if (is_zero[3])
         default_val = AC_EXP_PARAM_DEFAULT_VAL_0000;
      else if (is_one[3])
         default_val = AC_EXP_PARAM_DEFAULT_VAL_0001;
      else
         return false;
   } else if (is_one[0] && is_one[1] && is_one[2]) {
      if (is_zero[3])
         default_val = AC_EXP_PARAM_DEFAULT_VAL_1110;
      else if (is_one[3])
         default_val = AC_EXP_PARAM_DEFAULT_VAL_1111;
      else
         return false;
   } else {
      return false;
   }

   /* Change OFFSET to DEFAULT_VAL. */
   param_export_index[semantic] = default_val;
   out->constant = true;
   ac_remove_varying(out);
   return true;
}

/* Look for an earlier output equal to "current" (undef channels match
 * anything). If one is found, move any channels that are only defined in
 * "current" into the earlier output, record the remap in slot_remap, and
 * remove "current". Returns true if the current output has been eliminated.
 */
static bool ac_eliminate_duplicated_output(struct ac_out_info *outputs,
                                           BITSET_DECLARE(outputs_optimized, NUM_TOTAL_VARYING_SLOTS),
                                           gl_varying_slot current, struct nir_builder *b,
                                           int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS])
{
   struct ac_out_info *cur = &outputs[current];
   /* Bitmask of channels where cur has a value but the matching previous
    * output has undef; those values get copied back into the previous output.
    */
   unsigned p, copy_back_channels = 0;

   /* Check all outputs before current. */
   BITSET_FOREACH_SET(p, outputs_optimized, current) {
      struct ac_out_info *prev = &outputs[p];

      /* Only compare with real outputs. */
      if (prev->constant || prev->duplicated)
         continue;

      /* The types must match (only 16-bit and 32-bit types are allowed). */
      if ((prev->types & 16) != (cur->types & 16))
         continue;

      bool different = false;

      /* Iterate over all channels, including 16-bit channels in chan_hi. */
      for (unsigned j = 0; j < 8; j++) {
         nir_instr *prev_chan = prev->chan[j].value;
         nir_instr *cur_chan = cur->chan[j].value;

         /* Treat undef as a match. */
         if (!cur_chan)
            continue;

         /* If prev is undef but cur isn't, we can merge the outputs
          * and consider the output duplicated.
          */
         if (!prev_chan) {
            copy_back_channels |= 1 << j;
            continue;
         }

         /* Test whether the values are different. Two channels are equal when
          * they are the same SSA producer, or both are load_consts with the
          * same 32-bit value.
          */
         if (prev_chan != cur_chan &&
             (prev_chan->type != nir_instr_type_load_const ||
              cur_chan->type != nir_instr_type_load_const ||
              nir_instr_as_load_const(prev_chan)->value[0].u32 !=
              nir_instr_as_load_const(cur_chan)->value[0].u32)) {
            different = true;
            break;
         }
      }
      if (!different)
         break; /* Found an equal output; p holds its slot. */

      /* This candidate didn't match; discard its merge mask before trying the next. */
      copy_back_channels = 0;
   }
   /* Relies on BITSET_FOREACH_SET leaving p == current (the iteration bound)
    * when the loop ran to completion without a break, i.e. no match was found.
    */
   if (p == current)
      return false;

   /* An equal output already exists. Make FS use the existing one instead.
    * This effectively disables the current output and the param export shouldn't
    * be generated.
    */
   cur->duplicated = true;

   /* p is gl_varying_slot in addition to being an index into outputs. */
   slot_remap[current] = p;

   /* If the matching preceding output has undef where the current one has a proper value,
    * move the value to the preceding output.
    */
   struct ac_out_info *prev = &outputs[p];

   while (copy_back_channels) {
      unsigned i = u_bit_scan(&copy_back_channels);
      struct ac_chan_info *prev_chan = &prev->chan[i];
      struct ac_chan_info *cur_chan = &cur->chan[i];

      /* Insert right after cur's store; the stored value is that store's
       * source, so it's defined at this point.
       */
      b->cursor = nir_after_instr(&cur_chan->store_intr->instr);

      /* The store intrinsic doesn't exist for this channel in prev. Create a
       * new one, cloning the metadata of cur's store.
       */
      nir_alu_type src_type = nir_intrinsic_src_type(cur_chan->store_intr);
      struct nir_io_semantics sem = nir_intrinsic_io_semantics(cur_chan->store_intr);
      struct nir_io_xfb xfb = nir_intrinsic_io_xfb(cur_chan->store_intr);
      struct nir_io_xfb xfb2 = nir_intrinsic_io_xfb2(cur_chan->store_intr);

      /* p is gl_varying_slot in addition to being an index into outputs. */
      sem.location = p;
      /* Channels 4-7 are the high halves of 16-bit channels. */
      assert(sem.high_16bits == i / 4);

      /* If it's a sysval output (such as CLIPDIST), we move the varying portion but keep
       * the system value output. This is just the varying portion.
       */
      sem.no_sysval_output = 1;

      /* Write just one component. */
      prev_chan->store_intr = nir_store_output(b, nir_instr_ssa_def(cur_chan->value),
                                               nir_imm_int(b, 0),
                                               .base = prev->base,
                                               .component = i % 4,
                                               .io_semantics = sem,
                                               .src_type = src_type,
                                               .write_mask = 0x1,
                                               .io_xfb = xfb,
                                               .io_xfb2 = xfb2);

      /* Update the undef channels in the output info. */
      assert(!prev_chan->value);
      prev_chan->value = cur_chan->value;

      /* Remove transform feedback info from the current instruction because
       * we moved it too. The instruction might not be removed if it's a system
       * value output.
       */
      static struct nir_io_xfb zero_xfb;
      nir_intrinsic_set_io_xfb(cur->chan[i].store_intr, zero_xfb);
      nir_intrinsic_set_io_xfb2(cur->chan[i].store_intr, zero_xfb);
   }

   ac_remove_varying(cur);
   return true;
}

/* Optimize VS/TES varying outputs before FS (see the file comment):
 * remove constant outputs matching a DEFAULT_VAL encoding and deduplicate
 * equal outputs.
 *
 * sprite_tex_disallowed: when false, TEX0..TEX7 are skipped because
 *    sprite_coord_enable could override them.
 * slot_remap: slot_remap[slot] is set to the earlier slot an output was
 *    merged into (only written for deduplicated slots).
 * param_export_index: param_export_index[slot] receives the DEFAULT_VAL
 *    encoding for eliminated constant outputs.
 *
 * Requires scalar output stores with constant offset 0 (asserted below).
 * Returns true if any output was optimized.
 */
bool ac_nir_optimize_outputs(nir_shader *nir, bool sprite_tex_disallowed,
                             int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS],
                             uint8_t param_export_index[NUM_TOTAL_VARYING_SLOTS])
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);

   /* Only the last pre-rasterization VS/TES stage is handled. */
   if (nir->info.stage != MESA_SHADER_VERTEX &&
       nir->info.stage != MESA_SHADER_TESS_EVAL) {
      nir_metadata_preserve(impl, nir_metadata_all);
      return false;
   }

   struct ac_out_info outputs[NUM_TOTAL_VARYING_SLOTS] = { 0 };

   BITSET_DECLARE(outputs_optimized, NUM_TOTAL_VARYING_SLOTS);
   BITSET_ZERO(outputs_optimized);

   /* Gather outputs. */
   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

         /* Only process varyings that appear as param exports. */
         if (!nir_slot_is_varying(sem.location) || sem.no_varying)
            continue;

         /* We can't optimize texture coordinates if sprite_coord_enable can override them. */
         if (sem.location >= VARYING_SLOT_TEX0 && sem.location <= VARYING_SLOT_TEX7 &&
             !sprite_tex_disallowed)
            continue;

         BITSET_SET(outputs_optimized, sem.location);

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         /* nir_lower_io_to_scalar is required before this */
         assert(intr->src[0].ssa->num_components == 1);
         /* No intrinsic should store undef. */
         assert(intr->src[0].ssa->parent_instr->type != nir_instr_type_ssa_undef);

         /* Gather the output. */
         struct ac_out_info *out_info = &outputs[sem.location];
         /* types == 0 means this is the first store seen for the slot;
          * all stores to one slot must agree on nir_intrinsic_base.
          */
         if (!out_info->types)
            out_info->base = nir_intrinsic_base(intr);
         else
            assert(out_info->base == nir_intrinsic_base(intr));

         out_info->types |= nir_intrinsic_src_type(intr);

         /* high_16bits selects channels 4-7 (see struct ac_out_info). */
         unsigned chan = sem.high_16bits * 4 + nir_intrinsic_component(intr);
         out_info->chan[chan].store_intr = intr;
         out_info->chan[chan].value = intr->src[0].ssa->parent_instr;
      }
   }

   unsigned i;
   bool progress = false;

   struct nir_builder b;
   nir_builder_init(&b, impl);

   /* Optimize outputs. Constant elimination is tried first; only if it fails
    * is the output considered for deduplication (short-circuit ||).
    */
   BITSET_FOREACH_SET(i, outputs_optimized, NUM_TOTAL_VARYING_SLOTS) {
      progress |=
         ac_eliminate_const_output(&outputs[i], i, param_export_index) ||
         ac_eliminate_duplicated_output(outputs, outputs_optimized, i, &b, slot_remap);
   }

   if (progress) {
      /* Instructions were added/removed but the CFG is unchanged. */
      nir_metadata_preserve(impl, nir_metadata_dominance |
                            nir_metadata_block_index);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }
   return progress;
}