1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright © 2014-2015 Broadcom 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21bf215546Sopenharmony_ci * IN THE SOFTWARE. 22bf215546Sopenharmony_ci */ 23bf215546Sopenharmony_ci 24bf215546Sopenharmony_ci#include "nir.h" 25bf215546Sopenharmony_ci#include "nir_builder.h" 26bf215546Sopenharmony_ci 27bf215546Sopenharmony_cistruct alu_width_data { 28bf215546Sopenharmony_ci nir_vectorize_cb cb; 29bf215546Sopenharmony_ci const void *data; 30bf215546Sopenharmony_ci}; 31bf215546Sopenharmony_ci 32bf215546Sopenharmony_ci/** @file nir_lower_alu_width.c 33bf215546Sopenharmony_ci * 34bf215546Sopenharmony_ci * Replaces nir_alu_instr operations with more than one channel used in the 35bf215546Sopenharmony_ci * arguments with individual per-channel operations. 36bf215546Sopenharmony_ci * 37bf215546Sopenharmony_ci * Optionally, a callback function which returns the max vectorization width 38bf215546Sopenharmony_ci * per instruction can be provided. 39bf215546Sopenharmony_ci * 40bf215546Sopenharmony_ci * The max vectorization width must be a power of 2. 41bf215546Sopenharmony_ci */ 42bf215546Sopenharmony_ci 43bf215546Sopenharmony_cistatic bool 44bf215546Sopenharmony_ciinst_is_vector_alu(const nir_instr *instr, const void *_state) 45bf215546Sopenharmony_ci{ 46bf215546Sopenharmony_ci if (instr->type != nir_instr_type_alu) 47bf215546Sopenharmony_ci return false; 48bf215546Sopenharmony_ci 49bf215546Sopenharmony_ci nir_alu_instr *alu = nir_instr_as_alu(instr); 50bf215546Sopenharmony_ci 51bf215546Sopenharmony_ci /* There is no ALU instruction which has a scalar destination, scalar 52bf215546Sopenharmony_ci * src[0], and some other vector source. 53bf215546Sopenharmony_ci */ 54bf215546Sopenharmony_ci assert(alu->dest.dest.is_ssa); 55bf215546Sopenharmony_ci assert(alu->src[0].src.is_ssa); 56bf215546Sopenharmony_ci return alu->dest.dest.ssa.num_components > 1 || 57bf215546Sopenharmony_ci nir_op_infos[alu->op].input_sizes[0] > 1; 58bf215546Sopenharmony_ci} 59bf215546Sopenharmony_ci 60bf215546Sopenharmony_ci/* Checks whether all operands of an ALU instruction are swizzled 61bf215546Sopenharmony_ci * within the targeted vectorization width. 62bf215546Sopenharmony_ci * 63bf215546Sopenharmony_ci * The assumption here is that a vecN instruction can only swizzle 64bf215546Sopenharmony_ci * within the first N channels of the values it consumes, irrespective 65bf215546Sopenharmony_ci * of the capabilities of the instruction which produced those values. 66bf215546Sopenharmony_ci * If we assume values are packed consistently (i.e., they always start 67bf215546Sopenharmony_ci * at the beginning of a hardware register), we can actually access any 68bf215546Sopenharmony_ci * aligned group of N channels so long as we stay within the group. 69bf215546Sopenharmony_ci * This means for a vectorization width of 4 that only swizzles from 70bf215546Sopenharmony_ci * either [xyzw] or [abcd] etc are allowed. For a width of 2 these are 71bf215546Sopenharmony_ci * swizzles from either [xy] or [zw] etc. 72bf215546Sopenharmony_ci */ 73bf215546Sopenharmony_cistatic bool 74bf215546Sopenharmony_cialu_is_swizzled_in_bounds(const nir_alu_instr *alu, unsigned width) 75bf215546Sopenharmony_ci{ 76bf215546Sopenharmony_ci for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { 77bf215546Sopenharmony_ci if (nir_op_infos[alu->op].input_sizes[i] == 1) 78bf215546Sopenharmony_ci continue; 79bf215546Sopenharmony_ci 80bf215546Sopenharmony_ci unsigned mask = ~(width - 1); 81bf215546Sopenharmony_ci for (unsigned j = 1; j < alu->dest.dest.ssa.num_components; j++) { 82bf215546Sopenharmony_ci if ((alu->src[i].swizzle[0] & mask) != (alu->src[i].swizzle[j] & mask)) 83bf215546Sopenharmony_ci return false; 84bf215546Sopenharmony_ci } 85bf215546Sopenharmony_ci } 86bf215546Sopenharmony_ci 87bf215546Sopenharmony_ci return true; 88bf215546Sopenharmony_ci} 89bf215546Sopenharmony_ci 90bf215546Sopenharmony_cistatic void 91bf215546Sopenharmony_cinir_alu_ssa_dest_init(nir_alu_instr *alu, unsigned num_components, 92bf215546Sopenharmony_ci unsigned bit_size) 93bf215546Sopenharmony_ci{ 94bf215546Sopenharmony_ci nir_ssa_dest_init(&alu->instr, &alu->dest.dest, num_components, 95bf215546Sopenharmony_ci bit_size, NULL); 96bf215546Sopenharmony_ci alu->dest.write_mask = (1 << num_components) - 1; 97bf215546Sopenharmony_ci} 98bf215546Sopenharmony_ci 99bf215546Sopenharmony_cistatic nir_ssa_def * 100bf215546Sopenharmony_cilower_reduction(nir_alu_instr *alu, nir_op chan_op, nir_op merge_op, 101bf215546Sopenharmony_ci nir_builder *builder) 102bf215546Sopenharmony_ci{ 103bf215546Sopenharmony_ci unsigned num_components = nir_op_infos[alu->op].input_sizes[0]; 104bf215546Sopenharmony_ci 105bf215546Sopenharmony_ci nir_ssa_def *last = NULL; 106bf215546Sopenharmony_ci for (int i = num_components - 1; i >= 0; i--) { 107bf215546Sopenharmony_ci nir_alu_instr *chan = nir_alu_instr_create(builder->shader, chan_op); 108bf215546Sopenharmony_ci nir_alu_ssa_dest_init(chan, 1, alu->dest.dest.ssa.bit_size); 109bf215546Sopenharmony_ci nir_alu_src_copy(&chan->src[0], &alu->src[0]); 110bf215546Sopenharmony_ci chan->src[0].swizzle[0] = chan->src[0].swizzle[i]; 111bf215546Sopenharmony_ci if (nir_op_infos[chan_op].num_inputs > 1) { 112bf215546Sopenharmony_ci assert(nir_op_infos[chan_op].num_inputs == 2); 113bf215546Sopenharmony_ci nir_alu_src_copy(&chan->src[1], &alu->src[1]); 114bf215546Sopenharmony_ci chan->src[1].swizzle[0] = chan->src[1].swizzle[i]; 115bf215546Sopenharmony_ci } 116bf215546Sopenharmony_ci chan->exact = alu->exact; 117bf215546Sopenharmony_ci 118bf215546Sopenharmony_ci nir_builder_instr_insert(builder, &chan->instr); 119bf215546Sopenharmony_ci 120bf215546Sopenharmony_ci if (i == num_components - 1) { 121bf215546Sopenharmony_ci last = &chan->dest.dest.ssa; 122bf215546Sopenharmony_ci } else { 123bf215546Sopenharmony_ci last = nir_build_alu(builder, merge_op, 124bf215546Sopenharmony_ci last, &chan->dest.dest.ssa, NULL, NULL); 125bf215546Sopenharmony_ci } 126bf215546Sopenharmony_ci } 127bf215546Sopenharmony_ci 128bf215546Sopenharmony_ci return last; 129bf215546Sopenharmony_ci} 130bf215546Sopenharmony_ci 131bf215546Sopenharmony_cistatic inline bool 132bf215546Sopenharmony_ciwill_lower_ffma(nir_shader *shader, unsigned bit_size) 133bf215546Sopenharmony_ci{ 134bf215546Sopenharmony_ci switch (bit_size) { 135bf215546Sopenharmony_ci case 16: 136bf215546Sopenharmony_ci return shader->options->lower_ffma16; 137bf215546Sopenharmony_ci case 32: 138bf215546Sopenharmony_ci return shader->options->lower_ffma32; 139bf215546Sopenharmony_ci case 64: 140bf215546Sopenharmony_ci return shader->options->lower_ffma64; 141bf215546Sopenharmony_ci } 142bf215546Sopenharmony_ci unreachable("bad bit size"); 143bf215546Sopenharmony_ci} 144bf215546Sopenharmony_ci 145bf215546Sopenharmony_cistatic nir_ssa_def * 146bf215546Sopenharmony_cilower_fdot(nir_alu_instr *alu, nir_builder *builder) 147bf215546Sopenharmony_ci{ 148bf215546Sopenharmony_ci /* If we don't want to lower ffma, create several ffma instead of fmul+fadd 149bf215546Sopenharmony_ci * and fusing later because fusing is not possible for exact fdot instructions. 150bf215546Sopenharmony_ci */ 151bf215546Sopenharmony_ci if (will_lower_ffma(builder->shader, alu->dest.dest.ssa.bit_size)) 152bf215546Sopenharmony_ci return lower_reduction(alu, nir_op_fmul, nir_op_fadd, builder); 153bf215546Sopenharmony_ci 154bf215546Sopenharmony_ci unsigned num_components = nir_op_infos[alu->op].input_sizes[0]; 155bf215546Sopenharmony_ci 156bf215546Sopenharmony_ci nir_ssa_def *prev = NULL; 157bf215546Sopenharmony_ci for (int i = num_components - 1; i >= 0; i--) { 158bf215546Sopenharmony_ci nir_alu_instr *instr = nir_alu_instr_create( 159bf215546Sopenharmony_ci builder->shader, prev ? nir_op_ffma : nir_op_fmul); 160bf215546Sopenharmony_ci nir_alu_ssa_dest_init(instr, 1, alu->dest.dest.ssa.bit_size); 161bf215546Sopenharmony_ci for (unsigned j = 0; j < 2; j++) { 162bf215546Sopenharmony_ci nir_alu_src_copy(&instr->src[j], &alu->src[j]); 163bf215546Sopenharmony_ci instr->src[j].swizzle[0] = alu->src[j].swizzle[i]; 164bf215546Sopenharmony_ci } 165bf215546Sopenharmony_ci if (i != num_components - 1) 166bf215546Sopenharmony_ci instr->src[2].src = nir_src_for_ssa(prev); 167bf215546Sopenharmony_ci instr->exact = builder->exact; 168bf215546Sopenharmony_ci 169bf215546Sopenharmony_ci nir_builder_instr_insert(builder, &instr->instr); 170bf215546Sopenharmony_ci 171bf215546Sopenharmony_ci prev = &instr->dest.dest.ssa; 172bf215546Sopenharmony_ci } 173bf215546Sopenharmony_ci 174bf215546Sopenharmony_ci return prev; 175bf215546Sopenharmony_ci} 176bf215546Sopenharmony_ci 177bf215546Sopenharmony_cistatic nir_ssa_def * 178bf215546Sopenharmony_cilower_alu_instr_width(nir_builder *b, nir_instr *instr, void *_data) 179bf215546Sopenharmony_ci{ 180bf215546Sopenharmony_ci struct alu_width_data *data = _data; 181bf215546Sopenharmony_ci nir_alu_instr *alu = nir_instr_as_alu(instr); 182bf215546Sopenharmony_ci unsigned num_src = nir_op_infos[alu->op].num_inputs; 183bf215546Sopenharmony_ci unsigned i, chan; 184bf215546Sopenharmony_ci 185bf215546Sopenharmony_ci assert(alu->dest.dest.is_ssa); 186bf215546Sopenharmony_ci assert(alu->dest.write_mask != 0); 187bf215546Sopenharmony_ci 188bf215546Sopenharmony_ci b->exact = alu->exact; 189bf215546Sopenharmony_ci 190bf215546Sopenharmony_ci unsigned num_components = alu->dest.dest.ssa.num_components; 191bf215546Sopenharmony_ci unsigned target_width = 1; 192bf215546Sopenharmony_ci 193bf215546Sopenharmony_ci if (data->cb) { 194bf215546Sopenharmony_ci target_width = data->cb(instr, data->data); 195bf215546Sopenharmony_ci assert(util_is_power_of_two_or_zero(target_width)); 196bf215546Sopenharmony_ci if (target_width == 0) 197bf215546Sopenharmony_ci return NULL; 198bf215546Sopenharmony_ci } 199bf215546Sopenharmony_ci 200bf215546Sopenharmony_ci#define LOWER_REDUCTION(name, chan, merge) \ 201bf215546Sopenharmony_ci case name##2: \ 202bf215546Sopenharmony_ci case name##3: \ 203bf215546Sopenharmony_ci case name##4: \ 204bf215546Sopenharmony_ci case name##8: \ 205bf215546Sopenharmony_ci case name##16: \ 206bf215546Sopenharmony_ci return lower_reduction(alu, chan, merge, b); \ 207bf215546Sopenharmony_ci 208bf215546Sopenharmony_ci switch (alu->op) { 209bf215546Sopenharmony_ci case nir_op_vec16: 210bf215546Sopenharmony_ci case nir_op_vec8: 211bf215546Sopenharmony_ci case nir_op_vec5: 212bf215546Sopenharmony_ci case nir_op_vec4: 213bf215546Sopenharmony_ci case nir_op_vec3: 214bf215546Sopenharmony_ci case nir_op_vec2: 215bf215546Sopenharmony_ci case nir_op_cube_face_coord_amd: 216bf215546Sopenharmony_ci case nir_op_cube_face_index_amd: 217bf215546Sopenharmony_ci /* We don't need to scalarize these ops, they're the ones generated to 218bf215546Sopenharmony_ci * group up outputs into a value that can be SSAed. 219bf215546Sopenharmony_ci */ 220bf215546Sopenharmony_ci return NULL; 221bf215546Sopenharmony_ci 222bf215546Sopenharmony_ci case nir_op_pack_half_2x16: { 223bf215546Sopenharmony_ci if (!b->shader->options->lower_pack_half_2x16) 224bf215546Sopenharmony_ci return NULL; 225bf215546Sopenharmony_ci 226bf215546Sopenharmony_ci nir_ssa_def *src_vec2 = nir_ssa_for_alu_src(b, alu, 0); 227bf215546Sopenharmony_ci return nir_pack_half_2x16_split(b, nir_channel(b, src_vec2, 0), 228bf215546Sopenharmony_ci nir_channel(b, src_vec2, 1)); 229bf215546Sopenharmony_ci } 230bf215546Sopenharmony_ci 231bf215546Sopenharmony_ci case nir_op_unpack_unorm_4x8: 232bf215546Sopenharmony_ci case nir_op_unpack_snorm_4x8: 233bf215546Sopenharmony_ci case nir_op_unpack_unorm_2x16: 234bf215546Sopenharmony_ci case nir_op_unpack_snorm_2x16: 235bf215546Sopenharmony_ci /* There is no scalar version of these ops, unless we were to break it 236bf215546Sopenharmony_ci * down to bitshifts and math (which is definitely not intended). 237bf215546Sopenharmony_ci */ 238bf215546Sopenharmony_ci return NULL; 239bf215546Sopenharmony_ci 240bf215546Sopenharmony_ci case nir_op_unpack_half_2x16_flush_to_zero: 241bf215546Sopenharmony_ci case nir_op_unpack_half_2x16: { 242bf215546Sopenharmony_ci if (!b->shader->options->lower_unpack_half_2x16) 243bf215546Sopenharmony_ci return NULL; 244bf215546Sopenharmony_ci 245bf215546Sopenharmony_ci nir_ssa_def *packed = nir_ssa_for_alu_src(b, alu, 0); 246bf215546Sopenharmony_ci if (alu->op == nir_op_unpack_half_2x16_flush_to_zero) { 247bf215546Sopenharmony_ci return nir_vec2(b, 248bf215546Sopenharmony_ci nir_unpack_half_2x16_split_x_flush_to_zero(b, 249bf215546Sopenharmony_ci packed), 250bf215546Sopenharmony_ci nir_unpack_half_2x16_split_y_flush_to_zero(b, 251bf215546Sopenharmony_ci packed)); 252bf215546Sopenharmony_ci } else { 253bf215546Sopenharmony_ci return nir_vec2(b, 254bf215546Sopenharmony_ci nir_unpack_half_2x16_split_x(b, packed), 255bf215546Sopenharmony_ci nir_unpack_half_2x16_split_y(b, packed)); 256bf215546Sopenharmony_ci } 257bf215546Sopenharmony_ci } 258bf215546Sopenharmony_ci 259bf215546Sopenharmony_ci case nir_op_pack_uvec2_to_uint: { 260bf215546Sopenharmony_ci assert(b->shader->options->lower_pack_snorm_2x16 || 261bf215546Sopenharmony_ci b->shader->options->lower_pack_unorm_2x16); 262bf215546Sopenharmony_ci 263bf215546Sopenharmony_ci nir_ssa_def *word = nir_extract_u16(b, nir_ssa_for_alu_src(b, alu, 0), 264bf215546Sopenharmony_ci nir_imm_int(b, 0)); 265bf215546Sopenharmony_ci return nir_ior(b, nir_ishl(b, nir_channel(b, word, 1), 266bf215546Sopenharmony_ci nir_imm_int(b, 16)), 267bf215546Sopenharmony_ci nir_channel(b, word, 0)); 268bf215546Sopenharmony_ci } 269bf215546Sopenharmony_ci 270bf215546Sopenharmony_ci case nir_op_pack_uvec4_to_uint: { 271bf215546Sopenharmony_ci assert(b->shader->options->lower_pack_snorm_4x8 || 272bf215546Sopenharmony_ci b->shader->options->lower_pack_unorm_4x8); 273bf215546Sopenharmony_ci 274bf215546Sopenharmony_ci nir_ssa_def *byte = nir_extract_u8(b, nir_ssa_for_alu_src(b, alu, 0), 275bf215546Sopenharmony_ci nir_imm_int(b, 0)); 276bf215546Sopenharmony_ci return nir_ior(b, nir_ior(b, nir_ishl(b, nir_channel(b, byte, 3), 277bf215546Sopenharmony_ci nir_imm_int(b, 24)), 278bf215546Sopenharmony_ci nir_ishl(b, nir_channel(b, byte, 2), 279bf215546Sopenharmony_ci nir_imm_int(b, 16))), 280bf215546Sopenharmony_ci nir_ior(b, nir_ishl(b, nir_channel(b, byte, 1), 281bf215546Sopenharmony_ci nir_imm_int(b, 8)), 282bf215546Sopenharmony_ci nir_channel(b, byte, 0))); 283bf215546Sopenharmony_ci } 284bf215546Sopenharmony_ci 285bf215546Sopenharmony_ci case nir_op_fdph: { 286bf215546Sopenharmony_ci nir_ssa_def *src0_vec = nir_ssa_for_alu_src(b, alu, 0); 287bf215546Sopenharmony_ci nir_ssa_def *src1_vec = nir_ssa_for_alu_src(b, alu, 1); 288bf215546Sopenharmony_ci 289bf215546Sopenharmony_ci nir_ssa_def *sum[4]; 290bf215546Sopenharmony_ci for (unsigned i = 0; i < 3; i++) { 291bf215546Sopenharmony_ci sum[i] = nir_fmul(b, nir_channel(b, src0_vec, i), 292bf215546Sopenharmony_ci nir_channel(b, src1_vec, i)); 293bf215546Sopenharmony_ci } 294bf215546Sopenharmony_ci sum[3] = nir_channel(b, src1_vec, 3); 295bf215546Sopenharmony_ci 296bf215546Sopenharmony_ci return nir_fadd(b, nir_fadd(b, sum[0], sum[1]), 297bf215546Sopenharmony_ci nir_fadd(b, sum[2], sum[3])); 298bf215546Sopenharmony_ci } 299bf215546Sopenharmony_ci 300bf215546Sopenharmony_ci case nir_op_pack_64_2x32: { 301bf215546Sopenharmony_ci if (!b->shader->options->lower_pack_64_2x32) 302bf215546Sopenharmony_ci return NULL; 303bf215546Sopenharmony_ci 304bf215546Sopenharmony_ci nir_ssa_def *src_vec2 = nir_ssa_for_alu_src(b, alu, 0); 305bf215546Sopenharmony_ci return nir_pack_64_2x32_split(b, nir_channel(b, src_vec2, 0), 306bf215546Sopenharmony_ci nir_channel(b, src_vec2, 1)); 307bf215546Sopenharmony_ci } 308bf215546Sopenharmony_ci case nir_op_pack_64_4x16: { 309bf215546Sopenharmony_ci if (!b->shader->options->lower_pack_64_4x16) 310bf215546Sopenharmony_ci return NULL; 311bf215546Sopenharmony_ci 312bf215546Sopenharmony_ci nir_ssa_def *src_vec4 = nir_ssa_for_alu_src(b, alu, 0); 313bf215546Sopenharmony_ci nir_ssa_def *xy = nir_pack_32_2x16_split(b, nir_channel(b, src_vec4, 0), 314bf215546Sopenharmony_ci nir_channel(b, src_vec4, 1)); 315bf215546Sopenharmony_ci nir_ssa_def *zw = nir_pack_32_2x16_split(b, nir_channel(b, src_vec4, 2), 316bf215546Sopenharmony_ci nir_channel(b, src_vec4, 3)); 317bf215546Sopenharmony_ci 318bf215546Sopenharmony_ci return nir_pack_64_2x32_split(b, xy, zw); 319bf215546Sopenharmony_ci } 320bf215546Sopenharmony_ci case nir_op_pack_32_2x16: { 321bf215546Sopenharmony_ci if (!b->shader->options->lower_pack_32_2x16) 322bf215546Sopenharmony_ci return NULL; 323bf215546Sopenharmony_ci 324bf215546Sopenharmony_ci nir_ssa_def *src_vec2 = nir_ssa_for_alu_src(b, alu, 0); 325bf215546Sopenharmony_ci return nir_pack_32_2x16_split(b, nir_channel(b, src_vec2, 0), 326bf215546Sopenharmony_ci nir_channel(b, src_vec2, 1)); 327bf215546Sopenharmony_ci } 328bf215546Sopenharmony_ci case nir_op_unpack_64_2x32: 329bf215546Sopenharmony_ci case nir_op_unpack_64_4x16: 330bf215546Sopenharmony_ci case nir_op_unpack_32_2x16: 331bf215546Sopenharmony_ci case nir_op_unpack_double_2x32_dxil: 332bf215546Sopenharmony_ci return NULL; 333bf215546Sopenharmony_ci 334bf215546Sopenharmony_ci case nir_op_fdot2: 335bf215546Sopenharmony_ci case nir_op_fdot3: 336bf215546Sopenharmony_ci case nir_op_fdot4: 337bf215546Sopenharmony_ci case nir_op_fdot8: 338bf215546Sopenharmony_ci case nir_op_fdot16: 339bf215546Sopenharmony_ci return lower_fdot(alu, b); 340bf215546Sopenharmony_ci 341bf215546Sopenharmony_ci LOWER_REDUCTION(nir_op_ball_fequal, nir_op_feq, nir_op_iand); 342bf215546Sopenharmony_ci LOWER_REDUCTION(nir_op_ball_iequal, nir_op_ieq, nir_op_iand); 343bf215546Sopenharmony_ci LOWER_REDUCTION(nir_op_bany_fnequal, nir_op_fneu, nir_op_ior); 344bf215546Sopenharmony_ci LOWER_REDUCTION(nir_op_bany_inequal, nir_op_ine, nir_op_ior); 345bf215546Sopenharmony_ci LOWER_REDUCTION(nir_op_b8all_fequal, nir_op_feq8, nir_op_iand); 346bf215546Sopenharmony_ci LOWER_REDUCTION(nir_op_b8all_iequal, nir_op_ieq8, nir_op_iand); 347bf215546Sopenharmony_ci LOWER_REDUCTION(nir_op_b8any_fnequal, nir_op_fneu8, nir_op_ior); 348bf215546Sopenharmony_ci LOWER_REDUCTION(nir_op_b8any_inequal, nir_op_ine8, nir_op_ior); 349bf215546Sopenharmony_ci LOWER_REDUCTION(nir_op_b16all_fequal, nir_op_feq16, nir_op_iand); 350bf215546Sopenharmony_ci LOWER_REDUCTION(nir_op_b16all_iequal, nir_op_ieq16, nir_op_iand); 351bf215546Sopenharmony_ci LOWER_REDUCTION(nir_op_b16any_fnequal, nir_op_fneu16, nir_op_ior); 352bf215546Sopenharmony_ci LOWER_REDUCTION(nir_op_b16any_inequal, nir_op_ine16, nir_op_ior); 353bf215546Sopenharmony_ci LOWER_REDUCTION(nir_op_b32all_fequal, nir_op_feq32, nir_op_iand); 354bf215546Sopenharmony_ci LOWER_REDUCTION(nir_op_b32all_iequal, nir_op_ieq32, nir_op_iand); 355bf215546Sopenharmony_ci LOWER_REDUCTION(nir_op_b32any_fnequal, nir_op_fneu32, nir_op_ior); 356bf215546Sopenharmony_ci LOWER_REDUCTION(nir_op_b32any_inequal, nir_op_ine32, nir_op_ior); 357bf215546Sopenharmony_ci LOWER_REDUCTION(nir_op_fall_equal, nir_op_seq, nir_op_fmin); 358bf215546Sopenharmony_ci LOWER_REDUCTION(nir_op_fany_nequal, nir_op_sne, nir_op_fmax); 359bf215546Sopenharmony_ci 360bf215546Sopenharmony_ci default: 361bf215546Sopenharmony_ci break; 362bf215546Sopenharmony_ci } 363bf215546Sopenharmony_ci 364bf215546Sopenharmony_ci if (num_components == 1) 365bf215546Sopenharmony_ci return NULL; 366bf215546Sopenharmony_ci 367bf215546Sopenharmony_ci if (num_components <= target_width) { 368bf215546Sopenharmony_ci /* If the ALU instr is swizzled outside the target width, 369bf215546Sopenharmony_ci * reduce the target width. 370bf215546Sopenharmony_ci */ 371bf215546Sopenharmony_ci if (alu_is_swizzled_in_bounds(alu, target_width)) 372bf215546Sopenharmony_ci return NULL; 373bf215546Sopenharmony_ci else 374bf215546Sopenharmony_ci target_width = DIV_ROUND_UP(num_components, 2); 375bf215546Sopenharmony_ci } 376bf215546Sopenharmony_ci 377bf215546Sopenharmony_ci nir_alu_instr *vec = nir_alu_instr_create(b->shader, nir_op_vec(num_components)); 378bf215546Sopenharmony_ci 379bf215546Sopenharmony_ci for (chan = 0; chan < num_components; chan += target_width) { 380bf215546Sopenharmony_ci unsigned components = MIN2(target_width, num_components - chan); 381bf215546Sopenharmony_ci nir_alu_instr *lower = nir_alu_instr_create(b->shader, alu->op); 382bf215546Sopenharmony_ci 383bf215546Sopenharmony_ci for (i = 0; i < num_src; i++) { 384bf215546Sopenharmony_ci nir_alu_src_copy(&lower->src[i], &alu->src[i]); 385bf215546Sopenharmony_ci 386bf215546Sopenharmony_ci /* We only handle same-size-as-dest (input_sizes[] == 0) or scalar 387bf215546Sopenharmony_ci * args (input_sizes[] == 1). 388bf215546Sopenharmony_ci */ 389bf215546Sopenharmony_ci assert(nir_op_infos[alu->op].input_sizes[i] < 2); 390bf215546Sopenharmony_ci for (int j = 0; j < components; j++) { 391bf215546Sopenharmony_ci unsigned src_chan = nir_op_infos[alu->op].input_sizes[i] == 1 ? 0 : chan + j; 392bf215546Sopenharmony_ci lower->src[i].swizzle[j] = alu->src[i].swizzle[src_chan]; 393bf215546Sopenharmony_ci } 394bf215546Sopenharmony_ci } 395bf215546Sopenharmony_ci 396bf215546Sopenharmony_ci nir_alu_ssa_dest_init(lower, components, alu->dest.dest.ssa.bit_size); 397bf215546Sopenharmony_ci lower->dest.saturate = alu->dest.saturate; 398bf215546Sopenharmony_ci lower->exact = alu->exact; 399bf215546Sopenharmony_ci 400bf215546Sopenharmony_ci for (i = 0; i < components; i++) { 401bf215546Sopenharmony_ci vec->src[chan + i].src = nir_src_for_ssa(&lower->dest.dest.ssa); 402bf215546Sopenharmony_ci vec->src[chan + i].swizzle[0] = i; 403bf215546Sopenharmony_ci } 404bf215546Sopenharmony_ci 405bf215546Sopenharmony_ci nir_builder_instr_insert(b, &lower->instr); 406bf215546Sopenharmony_ci } 407bf215546Sopenharmony_ci 408bf215546Sopenharmony_ci return nir_builder_alu_instr_finish_and_insert(b, vec); 409bf215546Sopenharmony_ci} 410bf215546Sopenharmony_ci 411bf215546Sopenharmony_cibool 412bf215546Sopenharmony_cinir_lower_alu_width(nir_shader *shader, nir_vectorize_cb cb, const void *_data) 413bf215546Sopenharmony_ci{ 414bf215546Sopenharmony_ci struct alu_width_data data = { 415bf215546Sopenharmony_ci .cb = cb, 416bf215546Sopenharmony_ci .data = _data, 417bf215546Sopenharmony_ci }; 418bf215546Sopenharmony_ci 419bf215546Sopenharmony_ci return nir_shader_lower_instructions(shader, 420bf215546Sopenharmony_ci inst_is_vector_alu, 421bf215546Sopenharmony_ci lower_alu_instr_width, 422bf215546Sopenharmony_ci &data); 423bf215546Sopenharmony_ci} 424bf215546Sopenharmony_ci 425bf215546Sopenharmony_cistruct alu_to_scalar_data { 426bf215546Sopenharmony_ci nir_instr_filter_cb cb; 427bf215546Sopenharmony_ci const void *data; 428bf215546Sopenharmony_ci}; 429bf215546Sopenharmony_ci 430bf215546Sopenharmony_cistatic uint8_t 431bf215546Sopenharmony_ciscalar_cb(const nir_instr *instr, const void *data) 432bf215546Sopenharmony_ci{ 433bf215546Sopenharmony_ci /* return vectorization-width = 1 for filtered instructions */ 434bf215546Sopenharmony_ci const struct alu_to_scalar_data *filter = data; 435bf215546Sopenharmony_ci return filter->cb(instr, filter->data) ? 1 : 0; 436bf215546Sopenharmony_ci} 437bf215546Sopenharmony_ci 438bf215546Sopenharmony_cibool 439bf215546Sopenharmony_cinir_lower_alu_to_scalar(nir_shader *shader, nir_instr_filter_cb cb, const void *_data) 440bf215546Sopenharmony_ci{ 441bf215546Sopenharmony_ci struct alu_to_scalar_data data = { 442bf215546Sopenharmony_ci .cb = cb, 443bf215546Sopenharmony_ci .data = _data, 444bf215546Sopenharmony_ci }; 445bf215546Sopenharmony_ci 446bf215546Sopenharmony_ci return nir_lower_alu_width(shader, cb ? scalar_cb : NULL, &data); 447bf215546Sopenharmony_ci} 448bf215546Sopenharmony_ci 449