1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright © 2021 Valve Corporation 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21bf215546Sopenharmony_ci * IN THE SOFTWARE. 22bf215546Sopenharmony_ci * 23bf215546Sopenharmony_ci * Authors: 24bf215546Sopenharmony_ci * Timur Kristóf 25bf215546Sopenharmony_ci * 26bf215546Sopenharmony_ci */ 27bf215546Sopenharmony_ci 28bf215546Sopenharmony_ci#include "nir.h" 29bf215546Sopenharmony_ci#include "nir_builder.h" 30bf215546Sopenharmony_ci 31bf215546Sopenharmony_citypedef struct 32bf215546Sopenharmony_ci{ 33bf215546Sopenharmony_ci struct hash_table *range_ht; 34bf215546Sopenharmony_ci const nir_opt_offsets_options *options; 35bf215546Sopenharmony_ci} opt_offsets_state; 36bf215546Sopenharmony_ci 37bf215546Sopenharmony_cistatic nir_ssa_scalar 38bf215546Sopenharmony_citry_extract_const_addition(nir_builder *b, nir_ssa_scalar val, opt_offsets_state *state, unsigned *out_const, uint32_t max) 39bf215546Sopenharmony_ci{ 40bf215546Sopenharmony_ci val = nir_ssa_scalar_chase_movs(val); 41bf215546Sopenharmony_ci 42bf215546Sopenharmony_ci if (!nir_ssa_scalar_is_alu(val)) 43bf215546Sopenharmony_ci return val; 44bf215546Sopenharmony_ci 45bf215546Sopenharmony_ci nir_alu_instr *alu = nir_instr_as_alu(val.def->parent_instr); 46bf215546Sopenharmony_ci if (alu->op != nir_op_iadd || 47bf215546Sopenharmony_ci !alu->src[0].src.is_ssa || 48bf215546Sopenharmony_ci !alu->src[1].src.is_ssa || 49bf215546Sopenharmony_ci alu->src[0].negate || alu->src[0].abs || 50bf215546Sopenharmony_ci alu->src[1].negate || alu->src[1].abs) 51bf215546Sopenharmony_ci return val; 52bf215546Sopenharmony_ci 53bf215546Sopenharmony_ci nir_ssa_scalar src[2] = { 54bf215546Sopenharmony_ci {alu->src[0].src.ssa, alu->src[0].swizzle[val.comp]}, 55bf215546Sopenharmony_ci {alu->src[1].src.ssa, alu->src[1].swizzle[val.comp]}, 56bf215546Sopenharmony_ci }; 57bf215546Sopenharmony_ci 58bf215546Sopenharmony_ci /* Make sure that we aren't taking out an addition that could trigger 59bf215546Sopenharmony_ci * unsigned wrapping in a way that would change the semantics of the load. 60bf215546Sopenharmony_ci * Ignored for ints-as-floats (lower_bitops is a proxy for that), where 61bf215546Sopenharmony_ci * unsigned wrapping doesn't make sense. 62bf215546Sopenharmony_ci */ 63bf215546Sopenharmony_ci if (!alu->no_unsigned_wrap && !b->shader->options->lower_bitops) { 64bf215546Sopenharmony_ci if (!state->range_ht) { 65bf215546Sopenharmony_ci /* Cache for nir_unsigned_upper_bound */ 66bf215546Sopenharmony_ci state->range_ht = _mesa_pointer_hash_table_create(NULL); 67bf215546Sopenharmony_ci } 68bf215546Sopenharmony_ci 69bf215546Sopenharmony_ci /* Check if there can really be an unsigned wrap. */ 70bf215546Sopenharmony_ci uint32_t ub0 = nir_unsigned_upper_bound(b->shader, state->range_ht, src[0], NULL); 71bf215546Sopenharmony_ci uint32_t ub1 = nir_unsigned_upper_bound(b->shader, state->range_ht, src[1], NULL); 72bf215546Sopenharmony_ci 73bf215546Sopenharmony_ci if ((UINT32_MAX - ub0) < ub1) 74bf215546Sopenharmony_ci return val; 75bf215546Sopenharmony_ci 76bf215546Sopenharmony_ci /* We proved that unsigned wrap won't be possible, so we can set the flag too. */ 77bf215546Sopenharmony_ci alu->no_unsigned_wrap = true; 78bf215546Sopenharmony_ci } 79bf215546Sopenharmony_ci 80bf215546Sopenharmony_ci for (unsigned i = 0; i < 2; ++i) { 81bf215546Sopenharmony_ci src[i] = nir_ssa_scalar_chase_movs(src[i]); 82bf215546Sopenharmony_ci if (nir_ssa_scalar_is_const(src[i])) { 83bf215546Sopenharmony_ci uint32_t offset = nir_ssa_scalar_as_uint(src[i]); 84bf215546Sopenharmony_ci if (offset + *out_const <= max) { 85bf215546Sopenharmony_ci *out_const += offset; 86bf215546Sopenharmony_ci return try_extract_const_addition(b, src[1 - i], state, out_const, max); 87bf215546Sopenharmony_ci } 88bf215546Sopenharmony_ci } 89bf215546Sopenharmony_ci } 90bf215546Sopenharmony_ci 91bf215546Sopenharmony_ci uint32_t orig_offset = *out_const; 92bf215546Sopenharmony_ci src[0] = try_extract_const_addition(b, src[0], state, out_const, max); 93bf215546Sopenharmony_ci src[1] = try_extract_const_addition(b, src[1], state, out_const, max); 94bf215546Sopenharmony_ci if (*out_const == orig_offset) 95bf215546Sopenharmony_ci return val; 96bf215546Sopenharmony_ci 97bf215546Sopenharmony_ci b->cursor = nir_before_instr(&alu->instr); 98bf215546Sopenharmony_ci nir_ssa_def *r = 99bf215546Sopenharmony_ci nir_iadd(b, nir_channel(b, src[0].def, src[0].comp), 100bf215546Sopenharmony_ci nir_channel(b, src[1].def, src[1].comp)); 101bf215546Sopenharmony_ci return nir_get_ssa_scalar(r, 0); 102bf215546Sopenharmony_ci} 103bf215546Sopenharmony_ci 104bf215546Sopenharmony_cistatic bool 105bf215546Sopenharmony_citry_fold_load_store(nir_builder *b, 106bf215546Sopenharmony_ci nir_intrinsic_instr *intrin, 107bf215546Sopenharmony_ci opt_offsets_state *state, 108bf215546Sopenharmony_ci unsigned offset_src_idx, 109bf215546Sopenharmony_ci uint32_t max) 110bf215546Sopenharmony_ci{ 111bf215546Sopenharmony_ci /* Assume that BASE is the constant offset of a load/store. 112bf215546Sopenharmony_ci * Try to constant-fold additions to the offset source 113bf215546Sopenharmony_ci * into the actual const offset of the instruction. 114bf215546Sopenharmony_ci */ 115bf215546Sopenharmony_ci 116bf215546Sopenharmony_ci unsigned off_const = nir_intrinsic_base(intrin); 117bf215546Sopenharmony_ci nir_src *off_src = &intrin->src[offset_src_idx]; 118bf215546Sopenharmony_ci nir_ssa_def *replace_src = NULL; 119bf215546Sopenharmony_ci 120bf215546Sopenharmony_ci if (!off_src->is_ssa || off_src->ssa->bit_size != 32) 121bf215546Sopenharmony_ci return false; 122bf215546Sopenharmony_ci 123bf215546Sopenharmony_ci if (!nir_src_is_const(*off_src)) { 124bf215546Sopenharmony_ci uint32_t add_offset = 0; 125bf215546Sopenharmony_ci nir_ssa_scalar val = {.def = off_src->ssa, .comp = 0}; 126bf215546Sopenharmony_ci val = try_extract_const_addition(b, val, state, &add_offset, max); 127bf215546Sopenharmony_ci if (add_offset == 0) 128bf215546Sopenharmony_ci return false; 129bf215546Sopenharmony_ci off_const += add_offset; 130bf215546Sopenharmony_ci b->cursor = nir_before_instr(&intrin->instr); 131bf215546Sopenharmony_ci replace_src = nir_channel(b, val.def, val.comp); 132bf215546Sopenharmony_ci } else if (nir_src_as_uint(*off_src) && off_const + nir_src_as_uint(*off_src) <= max) { 133bf215546Sopenharmony_ci off_const += nir_src_as_uint(*off_src); 134bf215546Sopenharmony_ci b->cursor = nir_before_instr(&intrin->instr); 135bf215546Sopenharmony_ci replace_src = nir_imm_zero(b, off_src->ssa->num_components, off_src->ssa->bit_size); 136bf215546Sopenharmony_ci } 137bf215546Sopenharmony_ci 138bf215546Sopenharmony_ci if (!replace_src) 139bf215546Sopenharmony_ci return false; 140bf215546Sopenharmony_ci 141bf215546Sopenharmony_ci nir_instr_rewrite_src(&intrin->instr, &intrin->src[offset_src_idx], nir_src_for_ssa(replace_src)); 142bf215546Sopenharmony_ci nir_intrinsic_set_base(intrin, off_const); 143bf215546Sopenharmony_ci return true; 144bf215546Sopenharmony_ci} 145bf215546Sopenharmony_ci 146bf215546Sopenharmony_cistatic bool 147bf215546Sopenharmony_citry_fold_shared2(nir_builder *b, 148bf215546Sopenharmony_ci nir_intrinsic_instr *intrin, 149bf215546Sopenharmony_ci opt_offsets_state *state, 150bf215546Sopenharmony_ci unsigned offset_src_idx) 151bf215546Sopenharmony_ci{ 152bf215546Sopenharmony_ci unsigned comp_size = (intrin->intrinsic == nir_intrinsic_load_shared2_amd ? 153bf215546Sopenharmony_ci intrin->dest.ssa.bit_size : intrin->src[0].ssa->bit_size) / 8; 154bf215546Sopenharmony_ci unsigned stride = (nir_intrinsic_st64(intrin) ? 64 : 1) * comp_size; 155bf215546Sopenharmony_ci unsigned offset0 = nir_intrinsic_offset0(intrin) * stride; 156bf215546Sopenharmony_ci unsigned offset1 = nir_intrinsic_offset1(intrin) * stride; 157bf215546Sopenharmony_ci nir_src *off_src = &intrin->src[offset_src_idx]; 158bf215546Sopenharmony_ci 159bf215546Sopenharmony_ci if (!nir_src_is_const(*off_src)) 160bf215546Sopenharmony_ci return false; 161bf215546Sopenharmony_ci 162bf215546Sopenharmony_ci unsigned const_offset = nir_src_as_uint(*off_src); 163bf215546Sopenharmony_ci offset0 += const_offset; 164bf215546Sopenharmony_ci offset1 += const_offset; 165bf215546Sopenharmony_ci bool st64 = offset0 % (64 * comp_size) == 0 && offset1 % (64 * comp_size) == 0; 166bf215546Sopenharmony_ci stride = (st64 ? 64 : 1) * comp_size; 167bf215546Sopenharmony_ci if (const_offset % stride || offset0 > 255 * stride || offset1 > 255 * stride) 168bf215546Sopenharmony_ci return false; 169bf215546Sopenharmony_ci 170bf215546Sopenharmony_ci b->cursor = nir_before_instr(&intrin->instr); 171bf215546Sopenharmony_ci nir_instr_rewrite_src(&intrin->instr, off_src, nir_src_for_ssa(nir_imm_zero(b, 1, 32))); 172bf215546Sopenharmony_ci nir_intrinsic_set_offset0(intrin, offset0 / stride); 173bf215546Sopenharmony_ci nir_intrinsic_set_offset1(intrin, offset1 / stride); 174bf215546Sopenharmony_ci nir_intrinsic_set_st64(intrin, st64); 175bf215546Sopenharmony_ci 176bf215546Sopenharmony_ci return true; 177bf215546Sopenharmony_ci} 178bf215546Sopenharmony_ci 179bf215546Sopenharmony_cistatic bool 180bf215546Sopenharmony_ciprocess_instr(nir_builder *b, nir_instr *instr, void *s) 181bf215546Sopenharmony_ci{ 182bf215546Sopenharmony_ci if (instr->type != nir_instr_type_intrinsic) 183bf215546Sopenharmony_ci return false; 184bf215546Sopenharmony_ci 185bf215546Sopenharmony_ci opt_offsets_state *state = (opt_offsets_state *) s; 186bf215546Sopenharmony_ci nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); 187bf215546Sopenharmony_ci 188bf215546Sopenharmony_ci switch (intrin->intrinsic) { 189bf215546Sopenharmony_ci case nir_intrinsic_load_uniform: 190bf215546Sopenharmony_ci return try_fold_load_store(b, intrin, state, 0, state->options->uniform_max); 191bf215546Sopenharmony_ci case nir_intrinsic_load_ubo_vec4: 192bf215546Sopenharmony_ci return try_fold_load_store(b, intrin, state, 1, state->options->ubo_vec4_max); 193bf215546Sopenharmony_ci case nir_intrinsic_load_shared: 194bf215546Sopenharmony_ci case nir_intrinsic_load_shared_ir3: 195bf215546Sopenharmony_ci return try_fold_load_store(b, intrin, state, 0, state->options->shared_max); 196bf215546Sopenharmony_ci case nir_intrinsic_store_shared: 197bf215546Sopenharmony_ci case nir_intrinsic_store_shared_ir3: 198bf215546Sopenharmony_ci return try_fold_load_store(b, intrin, state, 1, state->options->shared_max); 199bf215546Sopenharmony_ci case nir_intrinsic_load_shared2_amd: 200bf215546Sopenharmony_ci return try_fold_shared2(b, intrin, state, 0); 201bf215546Sopenharmony_ci case nir_intrinsic_store_shared2_amd: 202bf215546Sopenharmony_ci return try_fold_shared2(b, intrin, state, 1); 203bf215546Sopenharmony_ci case nir_intrinsic_load_buffer_amd: 204bf215546Sopenharmony_ci return try_fold_load_store(b, intrin, state, 1, state->options->buffer_max); 205bf215546Sopenharmony_ci case nir_intrinsic_store_buffer_amd: 206bf215546Sopenharmony_ci return try_fold_load_store(b, intrin, state, 2, state->options->buffer_max); 207bf215546Sopenharmony_ci default: 208bf215546Sopenharmony_ci return false; 209bf215546Sopenharmony_ci } 210bf215546Sopenharmony_ci 211bf215546Sopenharmony_ci unreachable("Can't reach here."); 212bf215546Sopenharmony_ci} 213bf215546Sopenharmony_ci 214bf215546Sopenharmony_cibool 215bf215546Sopenharmony_cinir_opt_offsets(nir_shader *shader, const nir_opt_offsets_options *options) 216bf215546Sopenharmony_ci{ 217bf215546Sopenharmony_ci opt_offsets_state state; 218bf215546Sopenharmony_ci state.range_ht = NULL; 219bf215546Sopenharmony_ci state.options = options; 220bf215546Sopenharmony_ci 221bf215546Sopenharmony_ci bool p = nir_shader_instructions_pass(shader, process_instr, 222bf215546Sopenharmony_ci nir_metadata_block_index | 223bf215546Sopenharmony_ci nir_metadata_dominance, 224bf215546Sopenharmony_ci &state); 225bf215546Sopenharmony_ci 226bf215546Sopenharmony_ci if (state.range_ht) 227bf215546Sopenharmony_ci _mesa_hash_table_destroy(state.range_ht, NULL); 228bf215546Sopenharmony_ci 229bf215546Sopenharmony_ci 230bf215546Sopenharmony_ci return p; 231bf215546Sopenharmony_ci} 232