1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright © 2017 Intel Corporation 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21bf215546Sopenharmony_ci * IN THE SOFTWARE. 22bf215546Sopenharmony_ci */ 23bf215546Sopenharmony_ci 24bf215546Sopenharmony_ci#include "nir.h" 25bf215546Sopenharmony_ci#include "nir_builder.h" 26bf215546Sopenharmony_ci#include "util/u_math.h" 27bf215546Sopenharmony_ci 28bf215546Sopenharmony_ci/** 29bf215546Sopenharmony_ci * \file nir_opt_intrinsics.c 30bf215546Sopenharmony_ci */ 31bf215546Sopenharmony_ci 32bf215546Sopenharmony_cistatic nir_intrinsic_instr * 33bf215546Sopenharmony_cilower_subgroups_64bit_split_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin, 34bf215546Sopenharmony_ci unsigned int component) 35bf215546Sopenharmony_ci{ 36bf215546Sopenharmony_ci nir_ssa_def *comp; 37bf215546Sopenharmony_ci if (component == 0) 38bf215546Sopenharmony_ci comp = nir_unpack_64_2x32_split_x(b, intrin->src[0].ssa); 39bf215546Sopenharmony_ci else 40bf215546Sopenharmony_ci comp = nir_unpack_64_2x32_split_y(b, intrin->src[0].ssa); 41bf215546Sopenharmony_ci 42bf215546Sopenharmony_ci nir_intrinsic_instr *intr = nir_intrinsic_instr_create(b->shader, intrin->intrinsic); 43bf215546Sopenharmony_ci nir_ssa_dest_init(&intr->instr, &intr->dest, 1, 32, NULL); 44bf215546Sopenharmony_ci intr->const_index[0] = intrin->const_index[0]; 45bf215546Sopenharmony_ci intr->const_index[1] = intrin->const_index[1]; 46bf215546Sopenharmony_ci intr->src[0] = nir_src_for_ssa(comp); 47bf215546Sopenharmony_ci if (nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2) 48bf215546Sopenharmony_ci nir_src_copy(&intr->src[1], &intrin->src[1]); 49bf215546Sopenharmony_ci 50bf215546Sopenharmony_ci intr->num_components = 1; 51bf215546Sopenharmony_ci nir_builder_instr_insert(b, &intr->instr); 52bf215546Sopenharmony_ci return intr; 53bf215546Sopenharmony_ci} 54bf215546Sopenharmony_ci 55bf215546Sopenharmony_cistatic nir_ssa_def * 56bf215546Sopenharmony_cilower_subgroup_op_to_32bit(nir_builder *b, nir_intrinsic_instr *intrin) 57bf215546Sopenharmony_ci{ 58bf215546Sopenharmony_ci assert(intrin->src[0].ssa->bit_size == 64); 59bf215546Sopenharmony_ci nir_intrinsic_instr *intr_x = lower_subgroups_64bit_split_intrinsic(b, intrin, 0); 60bf215546Sopenharmony_ci nir_intrinsic_instr *intr_y = lower_subgroups_64bit_split_intrinsic(b, intrin, 1); 61bf215546Sopenharmony_ci return nir_pack_64_2x32_split(b, &intr_x->dest.ssa, &intr_y->dest.ssa); 62bf215546Sopenharmony_ci} 63bf215546Sopenharmony_ci 64bf215546Sopenharmony_cistatic nir_ssa_def * 65bf215546Sopenharmony_ciballot_type_to_uint(nir_builder *b, nir_ssa_def *value, 66bf215546Sopenharmony_ci const nir_lower_subgroups_options *options) 67bf215546Sopenharmony_ci{ 68bf215546Sopenharmony_ci /* Only the new-style SPIR-V subgroup instructions take a ballot result as 69bf215546Sopenharmony_ci * an argument, so we only use this on uvec4 types. 70bf215546Sopenharmony_ci */ 71bf215546Sopenharmony_ci assert(value->num_components == 4 && value->bit_size == 32); 72bf215546Sopenharmony_ci 73bf215546Sopenharmony_ci return nir_extract_bits(b, &value, 1, 0, options->ballot_components, 74bf215546Sopenharmony_ci options->ballot_bit_size); 75bf215546Sopenharmony_ci} 76bf215546Sopenharmony_ci 77bf215546Sopenharmony_cistatic nir_ssa_def * 78bf215546Sopenharmony_ciuint_to_ballot_type(nir_builder *b, nir_ssa_def *value, 79bf215546Sopenharmony_ci unsigned num_components, unsigned bit_size) 80bf215546Sopenharmony_ci{ 81bf215546Sopenharmony_ci assert(util_is_power_of_two_nonzero(num_components)); 82bf215546Sopenharmony_ci assert(util_is_power_of_two_nonzero(value->num_components)); 83bf215546Sopenharmony_ci 84bf215546Sopenharmony_ci unsigned total_bits = bit_size * num_components; 85bf215546Sopenharmony_ci 86bf215546Sopenharmony_ci /* If the source doesn't have enough bits, zero-pad */ 87bf215546Sopenharmony_ci if (total_bits > value->bit_size * value->num_components) 88bf215546Sopenharmony_ci value = nir_pad_vector_imm_int(b, value, 0, total_bits / value->bit_size); 89bf215546Sopenharmony_ci 90bf215546Sopenharmony_ci value = nir_bitcast_vector(b, value, bit_size); 91bf215546Sopenharmony_ci 92bf215546Sopenharmony_ci /* If the source has too many components, truncate. This can happen if, 93bf215546Sopenharmony_ci * for instance, we're implementing GL_ARB_shader_ballot or 94bf215546Sopenharmony_ci * VK_EXT_shader_subgroup_ballot which have 64-bit ballot values on an 95bf215546Sopenharmony_ci * architecture with a native 128-bit uvec4 ballot. This comes up in Zink 96bf215546Sopenharmony_ci * for OpenGL on Vulkan. It's the job of the driver calling this lowering 97bf215546Sopenharmony_ci * pass to ensure that it's restricted subgroup sizes sufficiently that we 98bf215546Sopenharmony_ci * have enough ballot bits. 99bf215546Sopenharmony_ci */ 100bf215546Sopenharmony_ci if (value->num_components > num_components) 101bf215546Sopenharmony_ci value = nir_trim_vector(b, value, num_components); 102bf215546Sopenharmony_ci 103bf215546Sopenharmony_ci return value; 104bf215546Sopenharmony_ci} 105bf215546Sopenharmony_ci 106bf215546Sopenharmony_cistatic nir_ssa_def * 107bf215546Sopenharmony_cilower_subgroup_op_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin, 108bf215546Sopenharmony_ci bool lower_to_32bit) 109bf215546Sopenharmony_ci{ 110bf215546Sopenharmony_ci /* This is safe to call on scalar things but it would be silly */ 111bf215546Sopenharmony_ci assert(intrin->dest.ssa.num_components > 1); 112bf215546Sopenharmony_ci 113bf215546Sopenharmony_ci nir_ssa_def *value = nir_ssa_for_src(b, intrin->src[0], 114bf215546Sopenharmony_ci intrin->num_components); 115bf215546Sopenharmony_ci nir_ssa_def *reads[NIR_MAX_VEC_COMPONENTS]; 116bf215546Sopenharmony_ci 117bf215546Sopenharmony_ci for (unsigned i = 0; i < intrin->num_components; i++) { 118bf215546Sopenharmony_ci nir_intrinsic_instr *chan_intrin = 119bf215546Sopenharmony_ci nir_intrinsic_instr_create(b->shader, intrin->intrinsic); 120bf215546Sopenharmony_ci nir_ssa_dest_init(&chan_intrin->instr, &chan_intrin->dest, 121bf215546Sopenharmony_ci 1, intrin->dest.ssa.bit_size, NULL); 122bf215546Sopenharmony_ci chan_intrin->num_components = 1; 123bf215546Sopenharmony_ci 124bf215546Sopenharmony_ci /* value */ 125bf215546Sopenharmony_ci chan_intrin->src[0] = nir_src_for_ssa(nir_channel(b, value, i)); 126bf215546Sopenharmony_ci /* invocation */ 127bf215546Sopenharmony_ci if (nir_intrinsic_infos[intrin->intrinsic].num_srcs > 1) { 128bf215546Sopenharmony_ci assert(nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2); 129bf215546Sopenharmony_ci nir_src_copy(&chan_intrin->src[1], &intrin->src[1]); 130bf215546Sopenharmony_ci } 131bf215546Sopenharmony_ci 132bf215546Sopenharmony_ci chan_intrin->const_index[0] = intrin->const_index[0]; 133bf215546Sopenharmony_ci chan_intrin->const_index[1] = intrin->const_index[1]; 134bf215546Sopenharmony_ci 135bf215546Sopenharmony_ci if (lower_to_32bit && chan_intrin->src[0].ssa->bit_size == 64) { 136bf215546Sopenharmony_ci reads[i] = lower_subgroup_op_to_32bit(b, chan_intrin); 137bf215546Sopenharmony_ci } else { 138bf215546Sopenharmony_ci nir_builder_instr_insert(b, &chan_intrin->instr); 139bf215546Sopenharmony_ci reads[i] = &chan_intrin->dest.ssa; 140bf215546Sopenharmony_ci } 141bf215546Sopenharmony_ci } 142bf215546Sopenharmony_ci 143bf215546Sopenharmony_ci return nir_vec(b, reads, intrin->num_components); 144bf215546Sopenharmony_ci} 145bf215546Sopenharmony_ci 146bf215546Sopenharmony_cistatic nir_ssa_def * 147bf215546Sopenharmony_cilower_vote_eq_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin) 148bf215546Sopenharmony_ci{ 149bf215546Sopenharmony_ci assert(intrin->src[0].is_ssa); 150bf215546Sopenharmony_ci nir_ssa_def *value = intrin->src[0].ssa; 151bf215546Sopenharmony_ci 152bf215546Sopenharmony_ci nir_ssa_def *result = NULL; 153bf215546Sopenharmony_ci for (unsigned i = 0; i < intrin->num_components; i++) { 154bf215546Sopenharmony_ci nir_intrinsic_instr *chan_intrin = 155bf215546Sopenharmony_ci nir_intrinsic_instr_create(b->shader, intrin->intrinsic); 156bf215546Sopenharmony_ci nir_ssa_dest_init(&chan_intrin->instr, &chan_intrin->dest, 157bf215546Sopenharmony_ci 1, intrin->dest.ssa.bit_size, NULL); 158bf215546Sopenharmony_ci chan_intrin->num_components = 1; 159bf215546Sopenharmony_ci chan_intrin->src[0] = nir_src_for_ssa(nir_channel(b, value, i)); 160bf215546Sopenharmony_ci nir_builder_instr_insert(b, &chan_intrin->instr); 161bf215546Sopenharmony_ci 162bf215546Sopenharmony_ci if (result) { 163bf215546Sopenharmony_ci result = nir_iand(b, result, &chan_intrin->dest.ssa); 164bf215546Sopenharmony_ci } else { 165bf215546Sopenharmony_ci result = &chan_intrin->dest.ssa; 166bf215546Sopenharmony_ci } 167bf215546Sopenharmony_ci } 168bf215546Sopenharmony_ci 169bf215546Sopenharmony_ci return result; 170bf215546Sopenharmony_ci} 171bf215546Sopenharmony_ci 172bf215546Sopenharmony_cistatic nir_ssa_def * 173bf215546Sopenharmony_cilower_vote_eq(nir_builder *b, nir_intrinsic_instr *intrin) 174bf215546Sopenharmony_ci{ 175bf215546Sopenharmony_ci assert(intrin->src[0].is_ssa); 176bf215546Sopenharmony_ci nir_ssa_def *value = intrin->src[0].ssa; 177bf215546Sopenharmony_ci 178bf215546Sopenharmony_ci /* We have to implicitly lower to scalar */ 179bf215546Sopenharmony_ci nir_ssa_def *all_eq = NULL; 180bf215546Sopenharmony_ci for (unsigned i = 0; i < intrin->num_components; i++) { 181bf215546Sopenharmony_ci nir_ssa_def *rfi = nir_read_first_invocation(b, nir_channel(b, value, i)); 182bf215546Sopenharmony_ci 183bf215546Sopenharmony_ci nir_ssa_def *is_eq; 184bf215546Sopenharmony_ci if (intrin->intrinsic == nir_intrinsic_vote_feq) { 185bf215546Sopenharmony_ci is_eq = nir_feq(b, rfi, nir_channel(b, value, i)); 186bf215546Sopenharmony_ci } else { 187bf215546Sopenharmony_ci is_eq = nir_ieq(b, rfi, nir_channel(b, value, i)); 188bf215546Sopenharmony_ci } 189bf215546Sopenharmony_ci 190bf215546Sopenharmony_ci if (all_eq == NULL) { 191bf215546Sopenharmony_ci all_eq = is_eq; 192bf215546Sopenharmony_ci } else { 193bf215546Sopenharmony_ci all_eq = nir_iand(b, all_eq, is_eq); 194bf215546Sopenharmony_ci } 195bf215546Sopenharmony_ci } 196bf215546Sopenharmony_ci 197bf215546Sopenharmony_ci return nir_vote_all(b, 1, all_eq); 198bf215546Sopenharmony_ci} 199bf215546Sopenharmony_ci 200bf215546Sopenharmony_cistatic nir_ssa_def * 201bf215546Sopenharmony_cilower_shuffle_to_swizzle(nir_builder *b, nir_intrinsic_instr *intrin, 202bf215546Sopenharmony_ci const nir_lower_subgroups_options *options) 203bf215546Sopenharmony_ci{ 204bf215546Sopenharmony_ci unsigned mask = nir_src_as_uint(intrin->src[1]); 205bf215546Sopenharmony_ci 206bf215546Sopenharmony_ci if (mask >= 32) 207bf215546Sopenharmony_ci return NULL; 208bf215546Sopenharmony_ci 209bf215546Sopenharmony_ci nir_intrinsic_instr *swizzle = nir_intrinsic_instr_create( 210bf215546Sopenharmony_ci b->shader, nir_intrinsic_masked_swizzle_amd); 211bf215546Sopenharmony_ci swizzle->num_components = intrin->num_components; 212bf215546Sopenharmony_ci nir_src_copy(&swizzle->src[0], &intrin->src[0]); 213bf215546Sopenharmony_ci nir_intrinsic_set_swizzle_mask(swizzle, (mask << 10) | 0x1f); 214bf215546Sopenharmony_ci nir_ssa_dest_init(&swizzle->instr, &swizzle->dest, 215bf215546Sopenharmony_ci intrin->dest.ssa.num_components, 216bf215546Sopenharmony_ci intrin->dest.ssa.bit_size, NULL); 217bf215546Sopenharmony_ci 218bf215546Sopenharmony_ci if (options->lower_to_scalar && swizzle->num_components > 1) { 219bf215546Sopenharmony_ci return lower_subgroup_op_to_scalar(b, swizzle, options->lower_shuffle_to_32bit); 220bf215546Sopenharmony_ci } else if (options->lower_shuffle_to_32bit && swizzle->src[0].ssa->bit_size == 64) { 221bf215546Sopenharmony_ci return lower_subgroup_op_to_32bit(b, swizzle); 222bf215546Sopenharmony_ci } else { 223bf215546Sopenharmony_ci nir_builder_instr_insert(b, &swizzle->instr); 224bf215546Sopenharmony_ci return &swizzle->dest.ssa; 225bf215546Sopenharmony_ci } 226bf215546Sopenharmony_ci} 227bf215546Sopenharmony_ci 228bf215546Sopenharmony_ci/* Lowers "specialized" shuffles to a generic nir_intrinsic_shuffle. */ 229bf215546Sopenharmony_ci 230bf215546Sopenharmony_cistatic nir_ssa_def * 231bf215546Sopenharmony_cilower_to_shuffle(nir_builder *b, nir_intrinsic_instr *intrin, 232bf215546Sopenharmony_ci const nir_lower_subgroups_options *options) 233bf215546Sopenharmony_ci{ 234bf215546Sopenharmony_ci if (intrin->intrinsic == nir_intrinsic_shuffle_xor && 235bf215546Sopenharmony_ci options->lower_shuffle_to_swizzle_amd && 236bf215546Sopenharmony_ci nir_src_is_const(intrin->src[1])) { 237bf215546Sopenharmony_ci nir_ssa_def *result = 238bf215546Sopenharmony_ci lower_shuffle_to_swizzle(b, intrin, options); 239bf215546Sopenharmony_ci if (result) 240bf215546Sopenharmony_ci return result; 241bf215546Sopenharmony_ci } 242bf215546Sopenharmony_ci 243bf215546Sopenharmony_ci nir_ssa_def *index = nir_load_subgroup_invocation(b); 244bf215546Sopenharmony_ci bool is_shuffle = false; 245bf215546Sopenharmony_ci switch (intrin->intrinsic) { 246bf215546Sopenharmony_ci case nir_intrinsic_shuffle_xor: 247bf215546Sopenharmony_ci assert(intrin->src[1].is_ssa); 248bf215546Sopenharmony_ci index = nir_ixor(b, index, intrin->src[1].ssa); 249bf215546Sopenharmony_ci is_shuffle = true; 250bf215546Sopenharmony_ci break; 251bf215546Sopenharmony_ci case nir_intrinsic_shuffle_up: 252bf215546Sopenharmony_ci assert(intrin->src[1].is_ssa); 253bf215546Sopenharmony_ci index = nir_isub(b, index, intrin->src[1].ssa); 254bf215546Sopenharmony_ci is_shuffle = true; 255bf215546Sopenharmony_ci break; 256bf215546Sopenharmony_ci case nir_intrinsic_shuffle_down: 257bf215546Sopenharmony_ci assert(intrin->src[1].is_ssa); 258bf215546Sopenharmony_ci index = nir_iadd(b, index, intrin->src[1].ssa); 259bf215546Sopenharmony_ci is_shuffle = true; 260bf215546Sopenharmony_ci break; 261bf215546Sopenharmony_ci case nir_intrinsic_quad_broadcast: 262bf215546Sopenharmony_ci assert(intrin->src[1].is_ssa); 263bf215546Sopenharmony_ci index = nir_ior(b, nir_iand(b, index, nir_imm_int(b, ~0x3)), 264bf215546Sopenharmony_ci intrin->src[1].ssa); 265bf215546Sopenharmony_ci break; 266bf215546Sopenharmony_ci case nir_intrinsic_quad_swap_horizontal: 267bf215546Sopenharmony_ci /* For Quad operations, subgroups are divided into quads where 268bf215546Sopenharmony_ci * (invocation % 4) is the index to a square arranged as follows: 269bf215546Sopenharmony_ci * 270bf215546Sopenharmony_ci * +---+---+ 271bf215546Sopenharmony_ci * | 0 | 1 | 272bf215546Sopenharmony_ci * +---+---+ 273bf215546Sopenharmony_ci * | 2 | 3 | 274bf215546Sopenharmony_ci * +---+---+ 275bf215546Sopenharmony_ci */ 276bf215546Sopenharmony_ci index = nir_ixor(b, index, nir_imm_int(b, 0x1)); 277bf215546Sopenharmony_ci break; 278bf215546Sopenharmony_ci case nir_intrinsic_quad_swap_vertical: 279bf215546Sopenharmony_ci index = nir_ixor(b, index, nir_imm_int(b, 0x2)); 280bf215546Sopenharmony_ci break; 281bf215546Sopenharmony_ci case nir_intrinsic_quad_swap_diagonal: 282bf215546Sopenharmony_ci index = nir_ixor(b, index, nir_imm_int(b, 0x3)); 283bf215546Sopenharmony_ci break; 284bf215546Sopenharmony_ci default: 285bf215546Sopenharmony_ci unreachable("Invalid intrinsic"); 286bf215546Sopenharmony_ci } 287bf215546Sopenharmony_ci 288bf215546Sopenharmony_ci nir_intrinsic_instr *shuffle = 289bf215546Sopenharmony_ci nir_intrinsic_instr_create(b->shader, nir_intrinsic_shuffle); 290bf215546Sopenharmony_ci shuffle->num_components = intrin->num_components; 291bf215546Sopenharmony_ci nir_src_copy(&shuffle->src[0], &intrin->src[0]); 292bf215546Sopenharmony_ci shuffle->src[1] = nir_src_for_ssa(index); 293bf215546Sopenharmony_ci nir_ssa_dest_init(&shuffle->instr, &shuffle->dest, 294bf215546Sopenharmony_ci intrin->dest.ssa.num_components, 295bf215546Sopenharmony_ci intrin->dest.ssa.bit_size, NULL); 296bf215546Sopenharmony_ci 297bf215546Sopenharmony_ci bool lower_to_32bit = options->lower_shuffle_to_32bit && is_shuffle; 298bf215546Sopenharmony_ci if (options->lower_to_scalar && shuffle->num_components > 1) { 299bf215546Sopenharmony_ci return lower_subgroup_op_to_scalar(b, shuffle, lower_to_32bit); 300bf215546Sopenharmony_ci } else if (lower_to_32bit && shuffle->src[0].ssa->bit_size == 64) { 301bf215546Sopenharmony_ci return lower_subgroup_op_to_32bit(b, shuffle); 302bf215546Sopenharmony_ci } else { 303bf215546Sopenharmony_ci nir_builder_instr_insert(b, &shuffle->instr); 304bf215546Sopenharmony_ci return &shuffle->dest.ssa; 305bf215546Sopenharmony_ci } 306bf215546Sopenharmony_ci} 307bf215546Sopenharmony_ci 308bf215546Sopenharmony_cistatic const struct glsl_type * 309bf215546Sopenharmony_ciglsl_type_for_ssa(nir_ssa_def *def) 310bf215546Sopenharmony_ci{ 311bf215546Sopenharmony_ci const struct glsl_type *comp_type = def->bit_size == 1 ? glsl_bool_type() : 312bf215546Sopenharmony_ci glsl_uintN_t_type(def->bit_size); 313bf215546Sopenharmony_ci return glsl_replace_vector_type(comp_type, def->num_components); 314bf215546Sopenharmony_ci} 315bf215546Sopenharmony_ci 316bf215546Sopenharmony_ci/* Lower nir_intrinsic_shuffle to a waterfall loop + nir_read_invocation. 317bf215546Sopenharmony_ci */ 318bf215546Sopenharmony_cistatic nir_ssa_def * 319bf215546Sopenharmony_cilower_shuffle(nir_builder *b, nir_intrinsic_instr *intrin) 320bf215546Sopenharmony_ci{ 321bf215546Sopenharmony_ci assert(intrin->src[0].is_ssa); 322bf215546Sopenharmony_ci assert(intrin->src[1].is_ssa); 323bf215546Sopenharmony_ci nir_ssa_def *val = intrin->src[0].ssa; 324bf215546Sopenharmony_ci nir_ssa_def *id = intrin->src[1].ssa; 325bf215546Sopenharmony_ci 326bf215546Sopenharmony_ci /* The loop is something like: 327bf215546Sopenharmony_ci * 328bf215546Sopenharmony_ci * while (true) { 329bf215546Sopenharmony_ci * first_id = readFirstInvocation(gl_SubgroupInvocationID); 330bf215546Sopenharmony_ci * first_val = readFirstInvocation(val); 331bf215546Sopenharmony_ci * first_result = readInvocation(val, readFirstInvocation(id)); 332bf215546Sopenharmony_ci * if (id == first_id) 333bf215546Sopenharmony_ci * result = first_val; 334bf215546Sopenharmony_ci * if (elect()) { 335bf215546Sopenharmony_ci * if (id > gl_SubgroupInvocationID) { 336bf215546Sopenharmony_ci * result = first_result; 337bf215546Sopenharmony_ci * } 338bf215546Sopenharmony_ci * break; 339bf215546Sopenharmony_ci * } 340bf215546Sopenharmony_ci * } 341bf215546Sopenharmony_ci * 342bf215546Sopenharmony_ci * The idea is to guarantee, on each iteration of the loop, that anything 343bf215546Sopenharmony_ci * reading from first_id gets the correct value, so that we can then kill 344bf215546Sopenharmony_ci * it off by breaking out of the loop. Before doing that we also have to 345bf215546Sopenharmony_ci * ensure that first_id invocation gets the correct value. It only won't be 346bf215546Sopenharmony_ci * assigned the correct value already if the invocation it's reading from 347bf215546Sopenharmony_ci * isn't already killed off, that is, if it's later than its own ID. 348bf215546Sopenharmony_ci * Invocations where id <= gl_SubgroupInvocationID will be assigned their 349bf215546Sopenharmony_ci * result in the first if, and invocations where id > 350bf215546Sopenharmony_ci * gl_SubgroupInvocationID will be assigned their result in the second if. 351bf215546Sopenharmony_ci * 352bf215546Sopenharmony_ci * We do this more complicated loop rather than looping over all id's 353bf215546Sopenharmony_ci * explicitly because at this point we don't know the "actual" subgroup 354bf215546Sopenharmony_ci * size and at the moment there's no way to get at it, which means we may 355bf215546Sopenharmony_ci * loop over always-inactive invocations. 356bf215546Sopenharmony_ci */ 357bf215546Sopenharmony_ci 358bf215546Sopenharmony_ci nir_ssa_def *subgroup_id = nir_load_subgroup_invocation(b); 359bf215546Sopenharmony_ci 360bf215546Sopenharmony_ci nir_variable *result = 361bf215546Sopenharmony_ci nir_local_variable_create(b->impl, glsl_type_for_ssa(val), "result"); 362bf215546Sopenharmony_ci 363bf215546Sopenharmony_ci nir_loop *loop = nir_push_loop(b); { 364bf215546Sopenharmony_ci nir_ssa_def *first_id = nir_read_first_invocation(b, subgroup_id); 365bf215546Sopenharmony_ci nir_ssa_def *first_val = nir_read_first_invocation(b, val); 366bf215546Sopenharmony_ci nir_ssa_def *first_result = 367bf215546Sopenharmony_ci nir_read_invocation(b, val, nir_read_first_invocation(b, id)); 368bf215546Sopenharmony_ci 369bf215546Sopenharmony_ci nir_if *nif = nir_push_if(b, nir_ieq(b, id, first_id)); { 370bf215546Sopenharmony_ci nir_store_var(b, result, first_val, BITFIELD_MASK(val->num_components)); 371bf215546Sopenharmony_ci } nir_pop_if(b, nif); 372bf215546Sopenharmony_ci 373bf215546Sopenharmony_ci nir_if *nif2 = nir_push_if(b, nir_elect(b, 1)); { 374bf215546Sopenharmony_ci nir_if *nif3 = nir_push_if(b, nir_ult(b, subgroup_id, id)); { 375bf215546Sopenharmony_ci nir_store_var(b, result, first_result, BITFIELD_MASK(val->num_components)); 376bf215546Sopenharmony_ci } nir_pop_if(b, nif3); 377bf215546Sopenharmony_ci 378bf215546Sopenharmony_ci nir_jump(b, nir_jump_break); 379bf215546Sopenharmony_ci } nir_pop_if(b, nif2); 380bf215546Sopenharmony_ci } nir_pop_loop(b, loop); 381bf215546Sopenharmony_ci 382bf215546Sopenharmony_ci return nir_load_var(b, result); 383bf215546Sopenharmony_ci} 384bf215546Sopenharmony_ci 385bf215546Sopenharmony_cistatic bool 386bf215546Sopenharmony_cilower_subgroups_filter(const nir_instr *instr, const void *_options) 387bf215546Sopenharmony_ci{ 388bf215546Sopenharmony_ci return instr->type == nir_instr_type_intrinsic; 389bf215546Sopenharmony_ci} 390bf215546Sopenharmony_ci 391bf215546Sopenharmony_ci/* Return a ballot-mask-sized value which represents "val" sign-extended and 392bf215546Sopenharmony_ci * then shifted left by "shift". Only particular values for "val" are 393bf215546Sopenharmony_ci * supported, see below. 394bf215546Sopenharmony_ci */ 395bf215546Sopenharmony_cistatic nir_ssa_def * 396bf215546Sopenharmony_cibuild_ballot_imm_ishl(nir_builder *b, int64_t val, nir_ssa_def *shift, 397bf215546Sopenharmony_ci const nir_lower_subgroups_options *options) 398bf215546Sopenharmony_ci{ 399bf215546Sopenharmony_ci /* This only works if all the high bits are the same as bit 1. */ 400bf215546Sopenharmony_ci assert((val >> 2) == (val & 0x2 ? -1 : 0)); 401bf215546Sopenharmony_ci 402bf215546Sopenharmony_ci /* First compute the result assuming one ballot component. */ 403bf215546Sopenharmony_ci nir_ssa_def *result = 404bf215546Sopenharmony_ci nir_ishl(b, nir_imm_intN_t(b, val, options->ballot_bit_size), shift); 405bf215546Sopenharmony_ci 406bf215546Sopenharmony_ci if (options->ballot_components == 1) 407bf215546Sopenharmony_ci return result; 408bf215546Sopenharmony_ci 409bf215546Sopenharmony_ci /* Fix up the result when there is > 1 component. The idea is that nir_ishl 410bf215546Sopenharmony_ci * masks out the high bits of the shift value already, so in case there's 411bf215546Sopenharmony_ci * more than one component the component which 1 would be shifted into 412bf215546Sopenharmony_ci * already has the right value and all we have to do is fixup the other 413bf215546Sopenharmony_ci * components. Components below it should always be 0, and components above 414bf215546Sopenharmony_ci * it must be either 0 or ~0 because of the assert above. For example, if 415bf215546Sopenharmony_ci * the target ballot size is 2 x uint32, and we're shifting 1 by 33, then 416bf215546Sopenharmony_ci * we'll feed 33 into ishl, which will mask it off to get 1, so we'll 417bf215546Sopenharmony_ci * compute a single-component result of 2, which is correct for the second 418bf215546Sopenharmony_ci * component, but the first component needs to be 0, which we get by 419bf215546Sopenharmony_ci * comparing the high bits of the shift with 0 and selecting the original 420bf215546Sopenharmony_ci * answer or 0 for the first component (and something similar with the 421bf215546Sopenharmony_ci * second component). This idea is generalized here for any component count 422bf215546Sopenharmony_ci */ 423bf215546Sopenharmony_ci nir_const_value min_shift[4] = { 0 }; 424bf215546Sopenharmony_ci for (unsigned i = 0; i < options->ballot_components; i++) 425bf215546Sopenharmony_ci min_shift[i].i32 = i * options->ballot_bit_size; 426bf215546Sopenharmony_ci nir_ssa_def *min_shift_val = nir_build_imm(b, options->ballot_components, 32, min_shift); 427bf215546Sopenharmony_ci 428bf215546Sopenharmony_ci nir_const_value max_shift[4] = { 0 }; 429bf215546Sopenharmony_ci for (unsigned i = 0; i < options->ballot_components; i++) 430bf215546Sopenharmony_ci max_shift[i].i32 = (i + 1) * options->ballot_bit_size; 431bf215546Sopenharmony_ci nir_ssa_def *max_shift_val = nir_build_imm(b, options->ballot_components, 32, max_shift); 432bf215546Sopenharmony_ci 433bf215546Sopenharmony_ci return nir_bcsel(b, nir_ult(b, shift, max_shift_val), 434bf215546Sopenharmony_ci nir_bcsel(b, nir_ult(b, shift, min_shift_val), 435bf215546Sopenharmony_ci nir_imm_intN_t(b, val >> 63, result->bit_size), 436bf215546Sopenharmony_ci result), 437bf215546Sopenharmony_ci nir_imm_intN_t(b, 0, result->bit_size)); 438bf215546Sopenharmony_ci} 439bf215546Sopenharmony_ci 440bf215546Sopenharmony_cistatic nir_ssa_def * 441bf215546Sopenharmony_cibuild_subgroup_eq_mask(nir_builder *b, 442bf215546Sopenharmony_ci const nir_lower_subgroups_options *options) 443bf215546Sopenharmony_ci{ 444bf215546Sopenharmony_ci nir_ssa_def *subgroup_idx = nir_load_subgroup_invocation(b); 445bf215546Sopenharmony_ci 446bf215546Sopenharmony_ci return build_ballot_imm_ishl(b, 1, subgroup_idx, options); 447bf215546Sopenharmony_ci} 448bf215546Sopenharmony_ci 449bf215546Sopenharmony_cistatic nir_ssa_def * 450bf215546Sopenharmony_cibuild_subgroup_ge_mask(nir_builder *b, 451bf215546Sopenharmony_ci const nir_lower_subgroups_options *options) 452bf215546Sopenharmony_ci{ 453bf215546Sopenharmony_ci nir_ssa_def *subgroup_idx = nir_load_subgroup_invocation(b); 454bf215546Sopenharmony_ci 455bf215546Sopenharmony_ci return build_ballot_imm_ishl(b, ~0ull, subgroup_idx, options); 456bf215546Sopenharmony_ci} 457bf215546Sopenharmony_ci 458bf215546Sopenharmony_cistatic nir_ssa_def * 459bf215546Sopenharmony_cibuild_subgroup_gt_mask(nir_builder *b, 460bf215546Sopenharmony_ci const nir_lower_subgroups_options *options) 461bf215546Sopenharmony_ci{ 462bf215546Sopenharmony_ci nir_ssa_def *subgroup_idx = nir_load_subgroup_invocation(b); 463bf215546Sopenharmony_ci 464bf215546Sopenharmony_ci return build_ballot_imm_ishl(b, ~1ull, subgroup_idx, options); 465bf215546Sopenharmony_ci} 466bf215546Sopenharmony_ci 467bf215546Sopenharmony_ci/* Return a mask which is 1 for threads up to the run-time subgroup size, i.e. 468bf215546Sopenharmony_ci * 1 for the entire subgroup. SPIR-V requires us to return 0 for indices at or 469bf215546Sopenharmony_ci * above the subgroup size for the masks, but gt_mask and ge_mask make them 1 470bf215546Sopenharmony_ci * so we have to "and" with this mask. 471bf215546Sopenharmony_ci */ 472bf215546Sopenharmony_cistatic nir_ssa_def * 473bf215546Sopenharmony_cibuild_subgroup_mask(nir_builder *b, 474bf215546Sopenharmony_ci const nir_lower_subgroups_options *options) 475bf215546Sopenharmony_ci{ 476bf215546Sopenharmony_ci nir_ssa_def *subgroup_size = nir_load_subgroup_size(b); 477bf215546Sopenharmony_ci 478bf215546Sopenharmony_ci /* First compute the result assuming one ballot component. */ 479bf215546Sopenharmony_ci nir_ssa_def *result = 480bf215546Sopenharmony_ci nir_ushr(b, nir_imm_intN_t(b, ~0ull, options->ballot_bit_size), 481bf215546Sopenharmony_ci nir_isub_imm(b, options->ballot_bit_size, 482bf215546Sopenharmony_ci subgroup_size)); 483bf215546Sopenharmony_ci 484bf215546Sopenharmony_ci /* Since the subgroup size and ballot bitsize are both powers of two, there 485bf215546Sopenharmony_ci * are two possible cases to consider: 486bf215546Sopenharmony_ci * 487bf215546Sopenharmony_ci * (1) The subgroup size is less than the ballot bitsize. We need to return 488bf215546Sopenharmony_ci * "result" in the first component and 0 in every other component. 489bf215546Sopenharmony_ci * (2) The subgroup size is a multiple of the ballot bitsize. We need to 490bf215546Sopenharmony_ci * return ~0 if the subgroup size divided by the ballot bitsize is less 491bf215546Sopenharmony_ci * than or equal to the index in the vector and 0 otherwise. For example, 492bf215546Sopenharmony_ci * with a target ballot type of 4 x uint32 and subgroup_size = 64 we'd need 493bf215546Sopenharmony_ci * to return { ~0, ~0, 0, 0 }. 494bf215546Sopenharmony_ci * 495bf215546Sopenharmony_ci * In case (2) it turns out that "result" will be ~0, because 496bf215546Sopenharmony_ci * "ballot_bit_size - subgroup_size" is also a multiple of 497bf215546Sopenharmony_ci * "ballot_bit_size" and since nir_ushr masks the shift value it will 498bf215546Sopenharmony_ci * shifted by 0. This means that the first component can just be "result" 499bf215546Sopenharmony_ci * in all cases. The other components will also get the correct value in 500bf215546Sopenharmony_ci * case (1) if we just use the rule in case (2), so we'll get the correct 501bf215546Sopenharmony_ci * result if we just follow (2) and then replace the first component with 502bf215546Sopenharmony_ci * "result". 503bf215546Sopenharmony_ci */ 504bf215546Sopenharmony_ci nir_const_value min_idx[4] = { 0 }; 505bf215546Sopenharmony_ci for (unsigned i = 0; i < options->ballot_components; i++) 506bf215546Sopenharmony_ci min_idx[i].i32 = i * options->ballot_bit_size; 507bf215546Sopenharmony_ci nir_ssa_def *min_idx_val = nir_build_imm(b, options->ballot_components, 32, min_idx); 508bf215546Sopenharmony_ci 509bf215546Sopenharmony_ci nir_ssa_def *result_extended = 510bf215546Sopenharmony_ci nir_pad_vector_imm_int(b, result, ~0ull, options->ballot_components); 511bf215546Sopenharmony_ci 512bf215546Sopenharmony_ci return nir_bcsel(b, nir_ult(b, min_idx_val, subgroup_size), 513bf215546Sopenharmony_ci result_extended, nir_imm_intN_t(b, 0, options->ballot_bit_size)); 514bf215546Sopenharmony_ci} 515bf215546Sopenharmony_ci 516bf215546Sopenharmony_cistatic nir_ssa_def * 517bf215546Sopenharmony_civec_bit_count(nir_builder *b, nir_ssa_def *value) 518bf215546Sopenharmony_ci{ 519bf215546Sopenharmony_ci nir_ssa_def *vec_result = nir_bit_count(b, value); 520bf215546Sopenharmony_ci nir_ssa_def *result = nir_channel(b, vec_result, 0); 521bf215546Sopenharmony_ci for (unsigned i = 1; i < value->num_components; i++) 522bf215546Sopenharmony_ci result = nir_iadd(b, result, nir_channel(b, vec_result, i)); 523bf215546Sopenharmony_ci return result; 524bf215546Sopenharmony_ci} 525bf215546Sopenharmony_ci 526bf215546Sopenharmony_cistatic nir_ssa_def * 527bf215546Sopenharmony_civec_find_lsb(nir_builder *b, nir_ssa_def *value) 528bf215546Sopenharmony_ci{ 529bf215546Sopenharmony_ci nir_ssa_def *vec_result = nir_find_lsb(b, value); 530bf215546Sopenharmony_ci nir_ssa_def *result = nir_imm_int(b, -1); 531bf215546Sopenharmony_ci for (int i = value->num_components - 1; i >= 0; i--) { 532bf215546Sopenharmony_ci nir_ssa_def *channel = nir_channel(b, vec_result, i); 533bf215546Sopenharmony_ci /* result = channel >= 0 ? (i * bitsize + channel) : result */ 534bf215546Sopenharmony_ci result = nir_bcsel(b, nir_ige(b, channel, nir_imm_int(b, 0)), 535bf215546Sopenharmony_ci nir_iadd_imm(b, channel, i * value->bit_size), 536bf215546Sopenharmony_ci result); 537bf215546Sopenharmony_ci } 538bf215546Sopenharmony_ci return result; 539bf215546Sopenharmony_ci} 540bf215546Sopenharmony_ci 541bf215546Sopenharmony_cistatic nir_ssa_def * 542bf215546Sopenharmony_civec_find_msb(nir_builder *b, nir_ssa_def *value) 543bf215546Sopenharmony_ci{ 544bf215546Sopenharmony_ci nir_ssa_def *vec_result = nir_ufind_msb(b, value); 545bf215546Sopenharmony_ci nir_ssa_def *result = nir_imm_int(b, -1); 546bf215546Sopenharmony_ci for (unsigned i = 0; i < value->num_components; i++) { 547bf215546Sopenharmony_ci nir_ssa_def *channel = nir_channel(b, vec_result, i); 548bf215546Sopenharmony_ci /* result = channel >= 0 ? (i * bitsize + channel) : result */ 549bf215546Sopenharmony_ci result = nir_bcsel(b, nir_ige(b, channel, nir_imm_int(b, 0)), 550bf215546Sopenharmony_ci nir_iadd_imm(b, channel, i * value->bit_size), 551bf215546Sopenharmony_ci result); 552bf215546Sopenharmony_ci } 553bf215546Sopenharmony_ci return result; 554bf215546Sopenharmony_ci} 555bf215546Sopenharmony_ci 556bf215546Sopenharmony_cistatic nir_ssa_def * 557bf215546Sopenharmony_cilower_dynamic_quad_broadcast(nir_builder *b, nir_intrinsic_instr *intrin, 558bf215546Sopenharmony_ci const nir_lower_subgroups_options *options) 559bf215546Sopenharmony_ci{ 560bf215546Sopenharmony_ci if (!options->lower_quad_broadcast_dynamic_to_const) 561bf215546Sopenharmony_ci return lower_to_shuffle(b, intrin, options); 562bf215546Sopenharmony_ci 563bf215546Sopenharmony_ci nir_ssa_def *dst = NULL; 564bf215546Sopenharmony_ci 565bf215546Sopenharmony_ci for (unsigned i = 0; i < 4; ++i) { 566bf215546Sopenharmony_ci nir_intrinsic_instr *qbcst = 567bf215546Sopenharmony_ci nir_intrinsic_instr_create(b->shader, nir_intrinsic_quad_broadcast); 568bf215546Sopenharmony_ci 569bf215546Sopenharmony_ci qbcst->num_components = intrin->num_components; 570bf215546Sopenharmony_ci qbcst->src[1] = nir_src_for_ssa(nir_imm_int(b, i)); 571bf215546Sopenharmony_ci nir_src_copy(&qbcst->src[0], &intrin->src[0]); 572bf215546Sopenharmony_ci nir_ssa_dest_init(&qbcst->instr, &qbcst->dest, 573bf215546Sopenharmony_ci intrin->dest.ssa.num_components, 574bf215546Sopenharmony_ci intrin->dest.ssa.bit_size, NULL); 575bf215546Sopenharmony_ci 576bf215546Sopenharmony_ci nir_ssa_def *qbcst_dst = NULL; 577bf215546Sopenharmony_ci 578bf215546Sopenharmony_ci if (options->lower_to_scalar && qbcst->num_components > 1) { 579bf215546Sopenharmony_ci qbcst_dst = lower_subgroup_op_to_scalar(b, qbcst, false); 580bf215546Sopenharmony_ci } else { 581bf215546Sopenharmony_ci nir_builder_instr_insert(b, &qbcst->instr); 582bf215546Sopenharmony_ci qbcst_dst = &qbcst->dest.ssa; 583bf215546Sopenharmony_ci } 584bf215546Sopenharmony_ci 585bf215546Sopenharmony_ci if (i) 586bf215546Sopenharmony_ci dst = nir_bcsel(b, nir_ieq(b, intrin->src[1].ssa, 587bf215546Sopenharmony_ci nir_src_for_ssa(nir_imm_int(b, i)).ssa), 588bf215546Sopenharmony_ci qbcst_dst, dst); 589bf215546Sopenharmony_ci else 590bf215546Sopenharmony_ci dst = qbcst_dst; 591bf215546Sopenharmony_ci } 592bf215546Sopenharmony_ci 593bf215546Sopenharmony_ci return dst; 594bf215546Sopenharmony_ci} 595bf215546Sopenharmony_ci 596bf215546Sopenharmony_cistatic nir_ssa_def * 597bf215546Sopenharmony_cilower_read_invocation_to_cond(nir_builder *b, nir_intrinsic_instr *intrin) 598bf215546Sopenharmony_ci{ 599bf215546Sopenharmony_ci return nir_read_invocation_cond_ir3(b, intrin->dest.ssa.bit_size, 600bf215546Sopenharmony_ci intrin->src[0].ssa, 601bf215546Sopenharmony_ci nir_ieq(b, intrin->src[1].ssa, 602bf215546Sopenharmony_ci nir_load_subgroup_invocation(b))); 603bf215546Sopenharmony_ci} 604bf215546Sopenharmony_ci 605bf215546Sopenharmony_cistatic nir_ssa_def * 606bf215546Sopenharmony_cilower_subgroups_instr(nir_builder *b, nir_instr *instr, void *_options) 607bf215546Sopenharmony_ci{ 608bf215546Sopenharmony_ci const nir_lower_subgroups_options *options = _options; 609bf215546Sopenharmony_ci 610bf215546Sopenharmony_ci nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); 611bf215546Sopenharmony_ci switch (intrin->intrinsic) { 612bf215546Sopenharmony_ci case nir_intrinsic_vote_any: 613bf215546Sopenharmony_ci case nir_intrinsic_vote_all: 614bf215546Sopenharmony_ci if (options->lower_vote_trivial) 615bf215546Sopenharmony_ci return nir_ssa_for_src(b, intrin->src[0], 1); 616bf215546Sopenharmony_ci break; 617bf215546Sopenharmony_ci 618bf215546Sopenharmony_ci case nir_intrinsic_vote_feq: 619bf215546Sopenharmony_ci case nir_intrinsic_vote_ieq: 620bf215546Sopenharmony_ci if (options->lower_vote_trivial) 621bf215546Sopenharmony_ci return nir_imm_true(b); 622bf215546Sopenharmony_ci 623bf215546Sopenharmony_ci if (options->lower_vote_eq) 624bf215546Sopenharmony_ci return lower_vote_eq(b, intrin); 625bf215546Sopenharmony_ci 626bf215546Sopenharmony_ci if (options->lower_to_scalar && intrin->num_components > 1) 627bf215546Sopenharmony_ci return lower_vote_eq_to_scalar(b, intrin); 628bf215546Sopenharmony_ci break; 629bf215546Sopenharmony_ci 630bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_size: 631bf215546Sopenharmony_ci if (options->subgroup_size) 632bf215546Sopenharmony_ci return nir_imm_int(b, options->subgroup_size); 633bf215546Sopenharmony_ci break; 634bf215546Sopenharmony_ci 635bf215546Sopenharmony_ci case nir_intrinsic_read_invocation: 636bf215546Sopenharmony_ci if (options->lower_to_scalar && intrin->num_components > 1) 637bf215546Sopenharmony_ci return lower_subgroup_op_to_scalar(b, intrin, false); 638bf215546Sopenharmony_ci 639bf215546Sopenharmony_ci if (options->lower_read_invocation_to_cond) 640bf215546Sopenharmony_ci return lower_read_invocation_to_cond(b, intrin); 641bf215546Sopenharmony_ci 642bf215546Sopenharmony_ci break; 643bf215546Sopenharmony_ci 644bf215546Sopenharmony_ci case nir_intrinsic_read_first_invocation: 645bf215546Sopenharmony_ci if (options->lower_to_scalar && intrin->num_components > 1) 646bf215546Sopenharmony_ci return lower_subgroup_op_to_scalar(b, intrin, false); 647bf215546Sopenharmony_ci break; 648bf215546Sopenharmony_ci 649bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_eq_mask: 650bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_ge_mask: 651bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_gt_mask: 652bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_le_mask: 653bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_lt_mask: { 654bf215546Sopenharmony_ci if (!options->lower_subgroup_masks) 655bf215546Sopenharmony_ci return NULL; 656bf215546Sopenharmony_ci 657bf215546Sopenharmony_ci nir_ssa_def *val; 658bf215546Sopenharmony_ci switch (intrin->intrinsic) { 659bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_eq_mask: 660bf215546Sopenharmony_ci val = build_subgroup_eq_mask(b, options); 661bf215546Sopenharmony_ci break; 662bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_ge_mask: 663bf215546Sopenharmony_ci val = nir_iand(b, build_subgroup_ge_mask(b, options), 664bf215546Sopenharmony_ci build_subgroup_mask(b, options)); 665bf215546Sopenharmony_ci break; 666bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_gt_mask: 667bf215546Sopenharmony_ci val = nir_iand(b, build_subgroup_gt_mask(b, options), 668bf215546Sopenharmony_ci build_subgroup_mask(b, options)); 669bf215546Sopenharmony_ci break; 670bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_le_mask: 671bf215546Sopenharmony_ci val = nir_inot(b, build_subgroup_gt_mask(b, options)); 672bf215546Sopenharmony_ci break; 673bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_lt_mask: 674bf215546Sopenharmony_ci val = nir_inot(b, build_subgroup_ge_mask(b, options)); 675bf215546Sopenharmony_ci break; 676bf215546Sopenharmony_ci default: 677bf215546Sopenharmony_ci unreachable("you seriously can't tell this is unreachable?"); 678bf215546Sopenharmony_ci } 679bf215546Sopenharmony_ci 680bf215546Sopenharmony_ci return uint_to_ballot_type(b, val, 681bf215546Sopenharmony_ci intrin->dest.ssa.num_components, 682bf215546Sopenharmony_ci intrin->dest.ssa.bit_size); 683bf215546Sopenharmony_ci } 684bf215546Sopenharmony_ci 685bf215546Sopenharmony_ci case nir_intrinsic_ballot: { 686bf215546Sopenharmony_ci if (intrin->dest.ssa.num_components == options->ballot_components && 687bf215546Sopenharmony_ci intrin->dest.ssa.bit_size == options->ballot_bit_size) 688bf215546Sopenharmony_ci return NULL; 689bf215546Sopenharmony_ci 690bf215546Sopenharmony_ci nir_ssa_def *ballot = 691bf215546Sopenharmony_ci nir_ballot(b, options->ballot_components, options->ballot_bit_size, 692bf215546Sopenharmony_ci intrin->src[0].ssa); 693bf215546Sopenharmony_ci 694bf215546Sopenharmony_ci return uint_to_ballot_type(b, ballot, 695bf215546Sopenharmony_ci intrin->dest.ssa.num_components, 696bf215546Sopenharmony_ci intrin->dest.ssa.bit_size); 697bf215546Sopenharmony_ci } 698bf215546Sopenharmony_ci 699bf215546Sopenharmony_ci case nir_intrinsic_ballot_bitfield_extract: 700bf215546Sopenharmony_ci case nir_intrinsic_ballot_bit_count_reduce: 701bf215546Sopenharmony_ci case nir_intrinsic_ballot_find_lsb: 702bf215546Sopenharmony_ci case nir_intrinsic_ballot_find_msb: { 703bf215546Sopenharmony_ci assert(intrin->src[0].is_ssa); 704bf215546Sopenharmony_ci nir_ssa_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa, 705bf215546Sopenharmony_ci options); 706bf215546Sopenharmony_ci 707bf215546Sopenharmony_ci if (intrin->intrinsic != nir_intrinsic_ballot_bitfield_extract && 708bf215546Sopenharmony_ci intrin->intrinsic != nir_intrinsic_ballot_find_lsb) { 709bf215546Sopenharmony_ci /* For OpGroupNonUniformBallotFindMSB, the SPIR-V Spec says: 710bf215546Sopenharmony_ci * 711bf215546Sopenharmony_ci * "Find the most significant bit set to 1 in Value, considering 712bf215546Sopenharmony_ci * only the bits in Value required to represent all bits of the 713bf215546Sopenharmony_ci * group’s invocations. If none of the considered bits is set to 714bf215546Sopenharmony_ci * 1, the result is undefined." 715bf215546Sopenharmony_ci * 716bf215546Sopenharmony_ci * It has similar text for the other three. This means that, in case 717bf215546Sopenharmony_ci * the subgroup size is less than 32, we have to mask off the unused 718bf215546Sopenharmony_ci * bits. If the subgroup size is fixed and greater than or equal to 719bf215546Sopenharmony_ci * 32, the mask will be 0xffffffff and nir_opt_algebraic will delete 720bf215546Sopenharmony_ci * the iand. 721bf215546Sopenharmony_ci * 722bf215546Sopenharmony_ci * We only have to worry about this for BitCount and FindMSB because 723bf215546Sopenharmony_ci * FindLSB counts from the bottom and BitfieldExtract selects 724bf215546Sopenharmony_ci * individual bits. In either case, if run outside the range of 725bf215546Sopenharmony_ci * valid bits, we hit the undefined results case and we can return 726bf215546Sopenharmony_ci * anything we want. 727bf215546Sopenharmony_ci */ 728bf215546Sopenharmony_ci int_val = nir_iand(b, int_val, build_subgroup_mask(b, options)); 729bf215546Sopenharmony_ci } 730bf215546Sopenharmony_ci 731bf215546Sopenharmony_ci switch (intrin->intrinsic) { 732bf215546Sopenharmony_ci case nir_intrinsic_ballot_bitfield_extract: { 733bf215546Sopenharmony_ci assert(intrin->src[1].is_ssa); 734bf215546Sopenharmony_ci nir_ssa_def *idx = intrin->src[1].ssa; 735bf215546Sopenharmony_ci if (int_val->num_components > 1) { 736bf215546Sopenharmony_ci /* idx will be truncated by nir_ushr, so we just need to select 737bf215546Sopenharmony_ci * the right component using the bits of idx that are truncated in 738bf215546Sopenharmony_ci * the shift. 739bf215546Sopenharmony_ci */ 740bf215546Sopenharmony_ci int_val = 741bf215546Sopenharmony_ci nir_vector_extract(b, int_val, 742bf215546Sopenharmony_ci nir_udiv_imm(b, idx, int_val->bit_size)); 743bf215546Sopenharmony_ci } 744bf215546Sopenharmony_ci 745bf215546Sopenharmony_ci return nir_test_mask(b, nir_ushr(b, int_val, idx), 1); 746bf215546Sopenharmony_ci } 747bf215546Sopenharmony_ci case nir_intrinsic_ballot_bit_count_reduce: 748bf215546Sopenharmony_ci return vec_bit_count(b, int_val); 749bf215546Sopenharmony_ci case nir_intrinsic_ballot_find_lsb: 750bf215546Sopenharmony_ci return vec_find_lsb(b, int_val); 751bf215546Sopenharmony_ci case nir_intrinsic_ballot_find_msb: 752bf215546Sopenharmony_ci return vec_find_msb(b, int_val); 753bf215546Sopenharmony_ci default: 754bf215546Sopenharmony_ci unreachable("you seriously can't tell this is unreachable?"); 755bf215546Sopenharmony_ci } 756bf215546Sopenharmony_ci } 757bf215546Sopenharmony_ci 758bf215546Sopenharmony_ci case nir_intrinsic_ballot_bit_count_exclusive: 759bf215546Sopenharmony_ci case nir_intrinsic_ballot_bit_count_inclusive: { 760bf215546Sopenharmony_ci nir_ssa_def *mask; 761bf215546Sopenharmony_ci if (intrin->intrinsic == nir_intrinsic_ballot_bit_count_inclusive) { 762bf215546Sopenharmony_ci mask = nir_inot(b, build_subgroup_gt_mask(b, options)); 763bf215546Sopenharmony_ci } else { 764bf215546Sopenharmony_ci mask = nir_inot(b, build_subgroup_ge_mask(b, options)); 765bf215546Sopenharmony_ci } 766bf215546Sopenharmony_ci 767bf215546Sopenharmony_ci assert(intrin->src[0].is_ssa); 768bf215546Sopenharmony_ci nir_ssa_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa, 769bf215546Sopenharmony_ci options); 770bf215546Sopenharmony_ci 771bf215546Sopenharmony_ci return vec_bit_count(b, nir_iand(b, int_val, mask)); 772bf215546Sopenharmony_ci } 773bf215546Sopenharmony_ci 774bf215546Sopenharmony_ci case nir_intrinsic_elect: { 775bf215546Sopenharmony_ci if (!options->lower_elect) 776bf215546Sopenharmony_ci return NULL; 777bf215546Sopenharmony_ci 778bf215546Sopenharmony_ci return nir_ieq(b, nir_load_subgroup_invocation(b), nir_first_invocation(b)); 779bf215546Sopenharmony_ci } 780bf215546Sopenharmony_ci 781bf215546Sopenharmony_ci case nir_intrinsic_shuffle: 782bf215546Sopenharmony_ci if (options->lower_shuffle) 783bf215546Sopenharmony_ci return lower_shuffle(b, intrin); 784bf215546Sopenharmony_ci else if (options->lower_to_scalar && intrin->num_components > 1) 785bf215546Sopenharmony_ci return lower_subgroup_op_to_scalar(b, intrin, options->lower_shuffle_to_32bit); 786bf215546Sopenharmony_ci else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64) 787bf215546Sopenharmony_ci return lower_subgroup_op_to_32bit(b, intrin); 788bf215546Sopenharmony_ci break; 789bf215546Sopenharmony_ci case nir_intrinsic_shuffle_xor: 790bf215546Sopenharmony_ci case nir_intrinsic_shuffle_up: 791bf215546Sopenharmony_ci case nir_intrinsic_shuffle_down: 792bf215546Sopenharmony_ci if (options->lower_relative_shuffle) 793bf215546Sopenharmony_ci return lower_to_shuffle(b, intrin, options); 794bf215546Sopenharmony_ci else if (options->lower_to_scalar && intrin->num_components > 1) 795bf215546Sopenharmony_ci return lower_subgroup_op_to_scalar(b, intrin, options->lower_shuffle_to_32bit); 796bf215546Sopenharmony_ci else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64) 797bf215546Sopenharmony_ci return lower_subgroup_op_to_32bit(b, intrin); 798bf215546Sopenharmony_ci break; 799bf215546Sopenharmony_ci 800bf215546Sopenharmony_ci case nir_intrinsic_quad_broadcast: 801bf215546Sopenharmony_ci case nir_intrinsic_quad_swap_horizontal: 802bf215546Sopenharmony_ci case nir_intrinsic_quad_swap_vertical: 803bf215546Sopenharmony_ci case nir_intrinsic_quad_swap_diagonal: 804bf215546Sopenharmony_ci if (options->lower_quad || 805bf215546Sopenharmony_ci (options->lower_quad_broadcast_dynamic && 806bf215546Sopenharmony_ci intrin->intrinsic == nir_intrinsic_quad_broadcast && 807bf215546Sopenharmony_ci !nir_src_is_const(intrin->src[1]))) 808bf215546Sopenharmony_ci return lower_dynamic_quad_broadcast(b, intrin, options); 809bf215546Sopenharmony_ci else if (options->lower_to_scalar && intrin->num_components > 1) 810bf215546Sopenharmony_ci return lower_subgroup_op_to_scalar(b, intrin, false); 811bf215546Sopenharmony_ci break; 812bf215546Sopenharmony_ci 813bf215546Sopenharmony_ci case nir_intrinsic_reduce: { 814bf215546Sopenharmony_ci nir_ssa_def *ret = NULL; 815bf215546Sopenharmony_ci /* A cluster size greater than the subgroup size is implemention defined */ 816bf215546Sopenharmony_ci if (options->subgroup_size && 817bf215546Sopenharmony_ci nir_intrinsic_cluster_size(intrin) >= options->subgroup_size) { 818bf215546Sopenharmony_ci nir_intrinsic_set_cluster_size(intrin, 0); 819bf215546Sopenharmony_ci ret = NIR_LOWER_INSTR_PROGRESS; 820bf215546Sopenharmony_ci } 821bf215546Sopenharmony_ci if (options->lower_to_scalar && intrin->num_components > 1) 822bf215546Sopenharmony_ci ret = lower_subgroup_op_to_scalar(b, intrin, false); 823bf215546Sopenharmony_ci return ret; 824bf215546Sopenharmony_ci } 825bf215546Sopenharmony_ci case nir_intrinsic_inclusive_scan: 826bf215546Sopenharmony_ci case nir_intrinsic_exclusive_scan: 827bf215546Sopenharmony_ci if (options->lower_to_scalar && intrin->num_components > 1) 828bf215546Sopenharmony_ci return lower_subgroup_op_to_scalar(b, intrin, false); 829bf215546Sopenharmony_ci break; 830bf215546Sopenharmony_ci 831bf215546Sopenharmony_ci default: 832bf215546Sopenharmony_ci break; 833bf215546Sopenharmony_ci } 834bf215546Sopenharmony_ci 835bf215546Sopenharmony_ci return NULL; 836bf215546Sopenharmony_ci} 837bf215546Sopenharmony_ci 838bf215546Sopenharmony_cibool 839bf215546Sopenharmony_cinir_lower_subgroups(nir_shader *shader, 840bf215546Sopenharmony_ci const nir_lower_subgroups_options *options) 841bf215546Sopenharmony_ci{ 842bf215546Sopenharmony_ci return nir_shader_lower_instructions(shader, 843bf215546Sopenharmony_ci lower_subgroups_filter, 844bf215546Sopenharmony_ci lower_subgroups_instr, 845bf215546Sopenharmony_ci (void *)options); 846bf215546Sopenharmony_ci} 847