/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"
#include "util/u_math.h"

/**
 * \file nir_lower_subgroups.c
 */
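
/* Helpers for splitting a subgroup operation on a 64-bit value into two
 * 32-bit operations: lower_subgroups_64bit_split_intrinsic() emits the same
 * intrinsic on either the low or the high half of src[0], and
 * lower_subgroup_op_to_32bit() packs the two halves back into a 64-bit
 * result.
 */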
static nir_intrinsic_instr *
lower_subgroups_64bit_split_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
                                      unsigned int component)
{
   nir_ssa_def *comp;
   if (component == 0)
      comp = nir_unpack_64_2x32_split_x(b, intrin->src[0].ssa);
   else
      comp = nir_unpack_64_2x32_split_y(b, intrin->src[0].ssa);

   nir_intrinsic_instr *intr = nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
   nir_ssa_dest_init(&intr->instr, &intr->dest, 1, 32, NULL);
   intr->const_index[0] = intrin->const_index[0];
   intr->const_index[1] = intrin->const_index[1];
   intr->src[0] = nir_src_for_ssa(comp);
   if (nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2)
      nir_src_copy(&intr->src[1], &intrin->src[1]);

   intr->num_components = 1;
   nir_builder_instr_insert(b, &intr->instr);
   return intr;
}

static nir_ssa_def *
lower_subgroup_op_to_32bit(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].ssa->bit_size == 64);
   nir_intrinsic_instr *intr_x = lower_subgroups_64bit_split_intrinsic(b, intrin, 0);
   nir_intrinsic_instr *intr_y = lower_subgroups_64bit_split_intrinsic(b, intrin, 1);
   return nir_pack_64_2x32_split(b, &intr_x->dest.ssa, &intr_y->dest.ssa);
}

static nir_ssa_def *
ballot_type_to_uint(nir_builder *b, nir_ssa_def *value,
                    const nir_lower_subgroups_options *options)
{
   /* Only the new-style SPIR-V subgroup instructions take a ballot result as
    * an argument, so we only use this on uvec4 types.
    */
   assert(value->num_components == 4 && value->bit_size == 32);

   return nir_extract_bits(b, &value, 1, 0, options->ballot_components,
                           options->ballot_bit_size);
}

static nir_ssa_def *
uint_to_ballot_type(nir_builder *b, nir_ssa_def *value,
                    unsigned num_components, unsigned bit_size)
{
   assert(util_is_power_of_two_nonzero(num_components));
   assert(util_is_power_of_two_nonzero(value->num_components));

   unsigned total_bits = bit_size * num_components;

   /* If the source doesn't have enough bits, zero-pad */
   if (total_bits > value->bit_size * value->num_components)
      value = nir_pad_vector_imm_int(b, value, 0, total_bits / value->bit_size);

   value = nir_bitcast_vector(b, value, bit_size);

   /* If the source has too many components, truncate. This can happen if,
    * for instance, we're implementing GL_ARB_shader_ballot or
    * VK_EXT_shader_subgroup_ballot which have 64-bit ballot values on an
    * architecture with a native 128-bit uvec4 ballot. This comes up in Zink
    * for OpenGL on Vulkan. It's the job of the driver calling this lowering
    * pass to ensure that it has restricted subgroup sizes sufficiently that
    * we have enough ballot bits.
    */
   if (value->num_components > num_components)
      value = nir_trim_vector(b, value, num_components);

   return value;
}
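
/* Splits a vector subgroup operation into a separate single-component
 * operation for each channel and recombines the per-channel results with a
 * vec. When lower_to_32bit is set, 64-bit channels are further split into
 * two 32-bit operations.
 */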
static nir_ssa_def *
lower_subgroup_op_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin,
                            bool lower_to_32bit)
{
   /* This is safe to call on scalar things but it would be silly */
   assert(intrin->dest.ssa.num_components > 1);

   nir_ssa_def *value = nir_ssa_for_src(b, intrin->src[0],
                                        intrin->num_components);
   nir_ssa_def *reads[NIR_MAX_VEC_COMPONENTS];

   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_intrinsic_instr *chan_intrin =
         nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
      nir_ssa_dest_init(&chan_intrin->instr, &chan_intrin->dest,
                        1, intrin->dest.ssa.bit_size, NULL);
      chan_intrin->num_components = 1;

      /* value */
      chan_intrin->src[0] = nir_src_for_ssa(nir_channel(b, value, i));
      /* invocation */
      if (nir_intrinsic_infos[intrin->intrinsic].num_srcs > 1) {
         assert(nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2);
         nir_src_copy(&chan_intrin->src[1], &intrin->src[1]);
      }

      chan_intrin->const_index[0] = intrin->const_index[0];
      chan_intrin->const_index[1] = intrin->const_index[1];

      if (lower_to_32bit && chan_intrin->src[0].ssa->bit_size == 64) {
         reads[i] = lower_subgroup_op_to_32bit(b, chan_intrin);
      } else {
         nir_builder_instr_insert(b, &chan_intrin->instr);
         reads[i] = &chan_intrin->dest.ssa;
      }
   }

   return nir_vec(b, reads, intrin->num_components);
}

static nir_ssa_def *
lower_vote_eq_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].is_ssa);
   nir_ssa_def *value = intrin->src[0].ssa;

   nir_ssa_def *result = NULL;
   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_intrinsic_instr *chan_intrin =
         nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
      nir_ssa_dest_init(&chan_intrin->instr, &chan_intrin->dest,
                        1, intrin->dest.ssa.bit_size, NULL);
      chan_intrin->num_components = 1;
      chan_intrin->src[0] = nir_src_for_ssa(nir_channel(b, value, i));
      nir_builder_instr_insert(b, &chan_intrin->instr);

      if (result) {
         result = nir_iand(b, result, &chan_intrin->dest.ssa);
      } else {
         result = &chan_intrin->dest.ssa;
      }
   }

   return result;
}

static nir_ssa_def *
lower_vote_eq(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].is_ssa);
   nir_ssa_def *value = intrin->src[0].ssa;

   /* We have to implicitly lower to scalar */
   nir_ssa_def *all_eq = NULL;
   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_ssa_def *rfi = nir_read_first_invocation(b, nir_channel(b, value, i));

      nir_ssa_def *is_eq;
      if (intrin->intrinsic == nir_intrinsic_vote_feq) {
         is_eq = nir_feq(b, rfi, nir_channel(b, value, i));
      } else {
         is_eq = nir_ieq(b, rfi, nir_channel(b, value, i));
      }

      if (all_eq == NULL) {
         all_eq = is_eq;
      } else {
         all_eq = nir_iand(b, all_eq, is_eq);
      }
   }

   return nir_vote_all(b, 1, all_eq);
}
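
/* Lowers shuffle_xor with a small constant mask to AMD's masked swizzle.
 * The encoding below assumes the masked_swizzle_amd layout where bits 4:0
 * are the AND mask, bits 9:5 the OR mask and bits 14:10 the XOR mask, so
 * (mask << 10) | 0x1f computes ((id & 0x1f) | 0) ^ mask; only masks below 32
 * can be expressed this way.
 */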
static nir_ssa_def *
lower_shuffle_to_swizzle(nir_builder *b, nir_intrinsic_instr *intrin,
                         const nir_lower_subgroups_options *options)
{
   unsigned mask = nir_src_as_uint(intrin->src[1]);

   if (mask >= 32)
      return NULL;

   nir_intrinsic_instr *swizzle = nir_intrinsic_instr_create(
      b->shader, nir_intrinsic_masked_swizzle_amd);
   swizzle->num_components = intrin->num_components;
   nir_src_copy(&swizzle->src[0], &intrin->src[0]);
   nir_intrinsic_set_swizzle_mask(swizzle, (mask << 10) | 0x1f);
   nir_ssa_dest_init(&swizzle->instr, &swizzle->dest,
                     intrin->dest.ssa.num_components,
                     intrin->dest.ssa.bit_size, NULL);

   if (options->lower_to_scalar && swizzle->num_components > 1) {
      return lower_subgroup_op_to_scalar(b, swizzle, options->lower_shuffle_to_32bit);
   } else if (options->lower_shuffle_to_32bit && swizzle->src[0].ssa->bit_size == 64) {
      return lower_subgroup_op_to_32bit(b, swizzle);
   } else {
      nir_builder_instr_insert(b, &swizzle->instr);
      return &swizzle->dest.ssa;
   }
}

/* Lowers "specialized" shuffles to a generic nir_intrinsic_shuffle. */

static nir_ssa_def *
lower_to_shuffle(nir_builder *b, nir_intrinsic_instr *intrin,
                 const nir_lower_subgroups_options *options)
{
   if (intrin->intrinsic == nir_intrinsic_shuffle_xor &&
       options->lower_shuffle_to_swizzle_amd &&
       nir_src_is_const(intrin->src[1])) {
      nir_ssa_def *result =
         lower_shuffle_to_swizzle(b, intrin, options);
      if (result)
         return result;
   }

   nir_ssa_def *index = nir_load_subgroup_invocation(b);
   bool is_shuffle = false;
   switch (intrin->intrinsic) {
   case nir_intrinsic_shuffle_xor:
      assert(intrin->src[1].is_ssa);
      index = nir_ixor(b, index, intrin->src[1].ssa);
      is_shuffle = true;
      break;
   case nir_intrinsic_shuffle_up:
      assert(intrin->src[1].is_ssa);
      index = nir_isub(b, index, intrin->src[1].ssa);
      is_shuffle = true;
      break;
   case nir_intrinsic_shuffle_down:
      assert(intrin->src[1].is_ssa);
      index = nir_iadd(b, index, intrin->src[1].ssa);
      is_shuffle = true;
      break;
   case nir_intrinsic_quad_broadcast:
      assert(intrin->src[1].is_ssa);
      index = nir_ior(b, nir_iand(b, index, nir_imm_int(b, ~0x3)),
                      intrin->src[1].ssa);
      break;
   case nir_intrinsic_quad_swap_horizontal:
      /* For Quad operations, subgroups are divided into quads where
       * (invocation % 4) is the index to a square arranged as follows:
       *
       *    +---+---+
       *    | 0 | 1 |
       *    +---+---+
       *    | 2 | 3 |
       *    +---+---+
       */
      index = nir_ixor(b, index, nir_imm_int(b, 0x1));
      break;
   case nir_intrinsic_quad_swap_vertical:
      index = nir_ixor(b, index, nir_imm_int(b, 0x2));
      break;
   case nir_intrinsic_quad_swap_diagonal:
      index = nir_ixor(b, index, nir_imm_int(b, 0x3));
      break;
   default:
      unreachable("Invalid intrinsic");
   }

   nir_intrinsic_instr *shuffle =
      nir_intrinsic_instr_create(b->shader, nir_intrinsic_shuffle);
   shuffle->num_components = intrin->num_components;
   nir_src_copy(&shuffle->src[0], &intrin->src[0]);
   shuffle->src[1] = nir_src_for_ssa(index);
   nir_ssa_dest_init(&shuffle->instr, &shuffle->dest,
                     intrin->dest.ssa.num_components,
                     intrin->dest.ssa.bit_size, NULL);

   bool lower_to_32bit = options->lower_shuffle_to_32bit && is_shuffle;
   if (options->lower_to_scalar && shuffle->num_components > 1) {
      return lower_subgroup_op_to_scalar(b, shuffle, lower_to_32bit);
   } else if (lower_to_32bit && shuffle->src[0].ssa->bit_size == 64) {
      return lower_subgroup_op_to_32bit(b, shuffle);
   } else {
      nir_builder_instr_insert(b, &shuffle->instr);
      return &shuffle->dest.ssa;
   }
}
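
/* Builds a GLSL scalar or vector type with the same shape as the given SSA
 * def; lower_shuffle() uses it to create a local variable that can hold the
 * shuffled value.
 */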
static const struct glsl_type *
glsl_type_for_ssa(nir_ssa_def *def)
{
   const struct glsl_type *comp_type = def->bit_size == 1 ? glsl_bool_type() :
                                       glsl_uintN_t_type(def->bit_size);
   return glsl_replace_vector_type(comp_type, def->num_components);
}

/* Lower nir_intrinsic_shuffle to a waterfall loop + nir_read_invocation.
 */
static nir_ssa_def *
lower_shuffle(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].is_ssa);
   assert(intrin->src[1].is_ssa);
   nir_ssa_def *val = intrin->src[0].ssa;
   nir_ssa_def *id = intrin->src[1].ssa;

   /* The loop is something like:
    *
    * while (true) {
    *    first_id = readFirstInvocation(gl_SubgroupInvocationID);
    *    first_val = readFirstInvocation(val);
    *    first_result = readInvocation(val, readFirstInvocation(id));
    *    if (id == first_id)
    *       result = first_val;
    *    if (elect()) {
    *       if (id > gl_SubgroupInvocationID) {
    *          result = first_result;
    *       }
    *       break;
    *    }
    * }
    *
    * The idea is to guarantee, on each iteration of the loop, that anything
    * reading from first_id gets the correct value, so that we can then kill
    * it off by breaking out of the loop. Before doing that we also have to
    * ensure that the first_id invocation gets the correct value. The only
    * case where it won't already have been assigned the correct value is if
    * the invocation it's reading from isn't already killed off, that is, if
    * it's later than its own ID. Invocations where id <=
    * gl_SubgroupInvocationID will be assigned their result in the first if,
    * and invocations where id > gl_SubgroupInvocationID will be assigned
    * their result in the second if.
    *
    * We do this more complicated loop rather than looping over all ids
    * explicitly because at this point we don't know the "actual" subgroup
    * size and at the moment there's no way to get at it, which means we may
    * loop over always-inactive invocations.
    */

   nir_ssa_def *subgroup_id = nir_load_subgroup_invocation(b);

   nir_variable *result =
      nir_local_variable_create(b->impl, glsl_type_for_ssa(val), "result");

   nir_loop *loop = nir_push_loop(b); {
      nir_ssa_def *first_id = nir_read_first_invocation(b, subgroup_id);
      nir_ssa_def *first_val = nir_read_first_invocation(b, val);
      nir_ssa_def *first_result =
         nir_read_invocation(b, val, nir_read_first_invocation(b, id));

      nir_if *nif = nir_push_if(b, nir_ieq(b, id, first_id)); {
         nir_store_var(b, result, first_val, BITFIELD_MASK(val->num_components));
      } nir_pop_if(b, nif);

      nir_if *nif2 = nir_push_if(b, nir_elect(b, 1)); {
         nir_if *nif3 = nir_push_if(b, nir_ult(b, subgroup_id, id)); {
            nir_store_var(b, result, first_result, BITFIELD_MASK(val->num_components));
         } nir_pop_if(b, nif3);

         nir_jump(b, nir_jump_break);
      } nir_pop_if(b, nif2);
   } nir_pop_loop(b, loop);

   return nir_load_var(b, result);
}

static bool
lower_subgroups_filter(const nir_instr *instr, const void *_options)
{
   return instr->type == nir_instr_type_intrinsic;
}

/* Return a ballot-mask-sized value which represents "val" sign-extended and
 * then shifted left by "shift". Only particular values for "val" are
 * supported, see below.
 */
static nir_ssa_def *
build_ballot_imm_ishl(nir_builder *b, int64_t val, nir_ssa_def *shift,
                      const nir_lower_subgroups_options *options)
{
   /* This only works if all the high bits are the same as bit 1. */
   assert((val >> 2) == (val & 0x2 ? -1 : 0));

   /* First compute the result assuming one ballot component. */
   nir_ssa_def *result =
      nir_ishl(b, nir_imm_intN_t(b, val, options->ballot_bit_size), shift);

   if (options->ballot_components == 1)
      return result;

   /* Fix up the result when there is > 1 component. The idea is that nir_ishl
    * masks out the high bits of the shift value already, so when there's
    * more than one component, the component which the 1 would be shifted into
    * already has the right value and all we have to do is fix up the other
    * components. Components below it should always be 0, and components above
    * it must be either 0 or ~0 because of the assert above. For example, if
    * the target ballot size is 2 x uint32, and we're shifting 1 by 33, then
    * we'll feed 33 into ishl, which will mask it off to get 1, so we'll
    * compute a single-component result of 2, which is correct for the second
    * component, but the first component needs to be 0, which we get by
    * comparing the high bits of the shift with 0 and selecting the original
    * answer or 0 for the first component (and something similar with the
    * second component). This idea is generalized here for any component count.
    */
   nir_const_value min_shift[4] = { 0 };
   for (unsigned i = 0; i < options->ballot_components; i++)
      min_shift[i].i32 = i * options->ballot_bit_size;
   nir_ssa_def *min_shift_val = nir_build_imm(b, options->ballot_components, 32, min_shift);

   nir_const_value max_shift[4] = { 0 };
   for (unsigned i = 0; i < options->ballot_components; i++)
      max_shift[i].i32 = (i + 1) * options->ballot_bit_size;
   nir_ssa_def *max_shift_val = nir_build_imm(b, options->ballot_components, 32, max_shift);

   return nir_bcsel(b, nir_ult(b, shift, max_shift_val),
                    nir_bcsel(b, nir_ult(b, shift, min_shift_val),
                              nir_imm_intN_t(b, val >> 63, result->bit_size),
                              result),
                    nir_imm_intN_t(b, 0, result->bit_size));
}
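
/* The three helpers below build the SubgroupEqMask, SubgroupGeMask and
 * SubgroupGtMask values as ballot-sized integers: 1 << id, ~0 << id and
 * ~1 << id respectively, where id is gl_SubgroupInvocationID.
 */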
static nir_ssa_def *
build_subgroup_eq_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_ssa_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, 1, subgroup_idx, options);
}

static nir_ssa_def *
build_subgroup_ge_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_ssa_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, ~0ull, subgroup_idx, options);
}

static nir_ssa_def *
build_subgroup_gt_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_ssa_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, ~1ull, subgroup_idx, options);
}

/* Return a mask which is 1 for threads up to the run-time subgroup size, i.e.
 * 1 for the entire subgroup. SPIR-V requires us to return 0 for indices at or
 * above the subgroup size for the masks, but gt_mask and ge_mask make them 1
 * so we have to "and" with this mask.
 */
static nir_ssa_def *
build_subgroup_mask(nir_builder *b,
                    const nir_lower_subgroups_options *options)
{
   nir_ssa_def *subgroup_size = nir_load_subgroup_size(b);

   /* First compute the result assuming one ballot component. */
   nir_ssa_def *result =
      nir_ushr(b, nir_imm_intN_t(b, ~0ull, options->ballot_bit_size),
                  nir_isub_imm(b, options->ballot_bit_size,
                               subgroup_size));

   /* Since the subgroup size and ballot bitsize are both powers of two, there
    * are two possible cases to consider:
    *
    * (1) The subgroup size is less than the ballot bitsize. We need to return
    * "result" in the first component and 0 in every other component.
    * (2) The subgroup size is a multiple of the ballot bitsize. We need to
    * return ~0 in every component whose index is less than the subgroup size
    * divided by the ballot bitsize, and 0 in the others. For example, with a
    * target ballot type of 4 x uint32 and subgroup_size = 64 we'd need to
    * return { ~0, ~0, 0, 0 }.
    *
    * In case (2) it turns out that "result" will be ~0, because
    * "ballot_bit_size - subgroup_size" is also a multiple of
    * "ballot_bit_size" and since nir_ushr masks the shift value it will be
    * shifted by 0. This means that the first component can just be "result"
    * in all cases. The other components will also get the correct value in
    * case (1) if we just use the rule in case (2), so we'll get the correct
    * result if we just follow (2) and then replace the first component with
    * "result".
    */
   nir_const_value min_idx[4] = { 0 };
   for (unsigned i = 0; i < options->ballot_components; i++)
      min_idx[i].i32 = i * options->ballot_bit_size;
   nir_ssa_def *min_idx_val = nir_build_imm(b, options->ballot_components, 32, min_idx);

   nir_ssa_def *result_extended =
      nir_pad_vector_imm_int(b, result, ~0ull, options->ballot_components);

   return nir_bcsel(b, nir_ult(b, min_idx_val, subgroup_size),
                    result_extended, nir_imm_intN_t(b, 0, options->ballot_bit_size));
}
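
/* The next three helpers operate on a multi-component ballot value:
 * vec_bit_count() adds up the per-component popcounts, while vec_find_lsb()
 * and vec_find_msb() return the index of the lowest/highest set bit across
 * the whole vector, or -1 if no bit is set.
 */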
static nir_ssa_def *
vec_bit_count(nir_builder *b, nir_ssa_def *value)
{
   nir_ssa_def *vec_result = nir_bit_count(b, value);
   nir_ssa_def *result = nir_channel(b, vec_result, 0);
   for (unsigned i = 1; i < value->num_components; i++)
      result = nir_iadd(b, result, nir_channel(b, vec_result, i));
   return result;
}

static nir_ssa_def *
vec_find_lsb(nir_builder *b, nir_ssa_def *value)
{
   nir_ssa_def *vec_result = nir_find_lsb(b, value);
   nir_ssa_def *result = nir_imm_int(b, -1);
   for (int i = value->num_components - 1; i >= 0; i--) {
      nir_ssa_def *channel = nir_channel(b, vec_result, i);
      /* result = channel >= 0 ? (i * bitsize + channel) : result */
      result = nir_bcsel(b, nir_ige(b, channel, nir_imm_int(b, 0)),
                         nir_iadd_imm(b, channel, i * value->bit_size),
                         result);
   }
   return result;
}

static nir_ssa_def *
vec_find_msb(nir_builder *b, nir_ssa_def *value)
{
   nir_ssa_def *vec_result = nir_ufind_msb(b, value);
   nir_ssa_def *result = nir_imm_int(b, -1);
   for (unsigned i = 0; i < value->num_components; i++) {
      nir_ssa_def *channel = nir_channel(b, vec_result, i);
      /* result = channel >= 0 ? (i * bitsize + channel) : result */
      result = nir_bcsel(b, nir_ige(b, channel, nir_imm_int(b, 0)),
                         nir_iadd_imm(b, channel, i * value->bit_size),
                         result);
   }
   return result;
}

static nir_ssa_def *
lower_dynamic_quad_broadcast(nir_builder *b, nir_intrinsic_instr *intrin,
                             const nir_lower_subgroups_options *options)
{
   if (!options->lower_quad_broadcast_dynamic_to_const)
      return lower_to_shuffle(b, intrin, options);

   nir_ssa_def *dst = NULL;

   for (unsigned i = 0; i < 4; ++i) {
      nir_intrinsic_instr *qbcst =
         nir_intrinsic_instr_create(b->shader, nir_intrinsic_quad_broadcast);

      qbcst->num_components = intrin->num_components;
      qbcst->src[1] = nir_src_for_ssa(nir_imm_int(b, i));
      nir_src_copy(&qbcst->src[0], &intrin->src[0]);
      nir_ssa_dest_init(&qbcst->instr, &qbcst->dest,
                        intrin->dest.ssa.num_components,
                        intrin->dest.ssa.bit_size, NULL);

      nir_ssa_def *qbcst_dst = NULL;

      if (options->lower_to_scalar && qbcst->num_components > 1) {
         qbcst_dst = lower_subgroup_op_to_scalar(b, qbcst, false);
      } else {
         nir_builder_instr_insert(b, &qbcst->instr);
         qbcst_dst = &qbcst->dest.ssa;
      }

      if (i)
         dst = nir_bcsel(b, nir_ieq(b, intrin->src[1].ssa,
                                    nir_src_for_ssa(nir_imm_int(b, i)).ssa),
                         qbcst_dst, dst);
      else
         dst = qbcst_dst;
   }

   return dst;
}

static nir_ssa_def *
lower_read_invocation_to_cond(nir_builder *b, nir_intrinsic_instr *intrin)
{
   return nir_read_invocation_cond_ir3(b, intrin->dest.ssa.bit_size,
                                       intrin->src[0].ssa,
                                       nir_ieq(b, intrin->src[1].ssa,
                                               nir_load_subgroup_invocation(b)));
}
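
/* Per-instruction callback for nir_shader_lower_instructions(): returns the
 * SSA value that replaces the intrinsic's destination, NIR_LOWER_INSTR_PROGRESS
 * if the instruction was modified in place, or NULL if it was left alone.
 */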
static nir_ssa_def *
lower_subgroups_instr(nir_builder *b, nir_instr *instr, void *_options)
{
   const nir_lower_subgroups_options *options = _options;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   switch (intrin->intrinsic) {
   case nir_intrinsic_vote_any:
   case nir_intrinsic_vote_all:
      if (options->lower_vote_trivial)
         return nir_ssa_for_src(b, intrin->src[0], 1);
      break;

   case nir_intrinsic_vote_feq:
   case nir_intrinsic_vote_ieq:
      if (options->lower_vote_trivial)
         return nir_imm_true(b);

      if (options->lower_vote_eq)
         return lower_vote_eq(b, intrin);

      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_vote_eq_to_scalar(b, intrin);
      break;

   case nir_intrinsic_load_subgroup_size:
      if (options->subgroup_size)
         return nir_imm_int(b, options->subgroup_size);
      break;

   case nir_intrinsic_read_invocation:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, false);

      if (options->lower_read_invocation_to_cond)
         return lower_read_invocation_to_cond(b, intrin);

      break;

   case nir_intrinsic_read_first_invocation:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, false);
      break;

   case nir_intrinsic_load_subgroup_eq_mask:
   case nir_intrinsic_load_subgroup_ge_mask:
   case nir_intrinsic_load_subgroup_gt_mask:
   case nir_intrinsic_load_subgroup_le_mask:
   case nir_intrinsic_load_subgroup_lt_mask: {
      if (!options->lower_subgroup_masks)
         return NULL;

      nir_ssa_def *val;
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_subgroup_eq_mask:
         val = build_subgroup_eq_mask(b, options);
         break;
      case nir_intrinsic_load_subgroup_ge_mask:
         val = nir_iand(b, build_subgroup_ge_mask(b, options),
                        build_subgroup_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_gt_mask:
         val = nir_iand(b, build_subgroup_gt_mask(b, options),
                        build_subgroup_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_le_mask:
         val = nir_inot(b, build_subgroup_gt_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_lt_mask:
         val = nir_inot(b, build_subgroup_ge_mask(b, options));
         break;
      default:
         unreachable("you seriously can't tell this is unreachable?");
      }

      return uint_to_ballot_type(b, val,
                                 intrin->dest.ssa.num_components,
                                 intrin->dest.ssa.bit_size);
   }
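
   /* If the ballot's destination already matches the driver's native ballot
    * shape there is nothing to do; otherwise emit a native-shaped ballot and
    * convert it to the requested vector type.
    */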
   case nir_intrinsic_ballot: {
      if (intrin->dest.ssa.num_components == options->ballot_components &&
          intrin->dest.ssa.bit_size == options->ballot_bit_size)
         return NULL;

      nir_ssa_def *ballot =
         nir_ballot(b, options->ballot_components, options->ballot_bit_size,
                    intrin->src[0].ssa);

      return uint_to_ballot_type(b, ballot,
                                 intrin->dest.ssa.num_components,
                                 intrin->dest.ssa.bit_size);
   }

   case nir_intrinsic_ballot_bitfield_extract:
   case nir_intrinsic_ballot_bit_count_reduce:
   case nir_intrinsic_ballot_find_lsb:
   case nir_intrinsic_ballot_find_msb: {
      assert(intrin->src[0].is_ssa);
      nir_ssa_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa,
                                                 options);

      if (intrin->intrinsic != nir_intrinsic_ballot_bitfield_extract &&
          intrin->intrinsic != nir_intrinsic_ballot_find_lsb) {
         /* For OpGroupNonUniformBallotFindMSB, the SPIR-V Spec says:
          *
          *    "Find the most significant bit set to 1 in Value, considering
          *    only the bits in Value required to represent all bits of the
          *    group’s invocations. If none of the considered bits is set to
          *    1, the result is undefined."
          *
          * It has similar text for the other three. This means that, in case
          * the subgroup size is less than 32, we have to mask off the unused
          * bits. If the subgroup size is fixed and greater than or equal to
          * 32, the mask will be 0xffffffff and nir_opt_algebraic will delete
          * the iand.
          *
          * We only have to worry about this for BitCount and FindMSB because
          * FindLSB counts from the bottom and BitfieldExtract selects
          * individual bits. In either case, if run outside the range of
          * valid bits, we hit the undefined results case and we can return
          * anything we want.
          */
         int_val = nir_iand(b, int_val, build_subgroup_mask(b, options));
      }

      switch (intrin->intrinsic) {
      case nir_intrinsic_ballot_bitfield_extract: {
         assert(intrin->src[1].is_ssa);
         nir_ssa_def *idx = intrin->src[1].ssa;
         if (int_val->num_components > 1) {
            /* idx will be truncated by nir_ushr, so we just need to select
             * the right component using the bits of idx that are truncated in
             * the shift.
             */
            int_val =
               nir_vector_extract(b, int_val,
                                  nir_udiv_imm(b, idx, int_val->bit_size));
         }

         return nir_test_mask(b, nir_ushr(b, int_val, idx), 1);
      }
      case nir_intrinsic_ballot_bit_count_reduce:
         return vec_bit_count(b, int_val);
      case nir_intrinsic_ballot_find_lsb:
         return vec_find_lsb(b, int_val);
      case nir_intrinsic_ballot_find_msb:
         return vec_find_msb(b, int_val);
      default:
         unreachable("you seriously can't tell this is unreachable?");
      }
   }

   case nir_intrinsic_ballot_bit_count_exclusive:
   case nir_intrinsic_ballot_bit_count_inclusive: {
      nir_ssa_def *mask;
      if (intrin->intrinsic == nir_intrinsic_ballot_bit_count_inclusive) {
         mask = nir_inot(b, build_subgroup_gt_mask(b, options));
      } else {
         mask = nir_inot(b, build_subgroup_ge_mask(b, options));
      }

      assert(intrin->src[0].is_ssa);
      nir_ssa_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa,
                                                 options);

      return vec_bit_count(b, nir_iand(b, int_val, mask));
   }

   case nir_intrinsic_elect: {
      if (!options->lower_elect)
         return NULL;

      return nir_ieq(b, nir_load_subgroup_invocation(b), nir_first_invocation(b));
   }
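
   /* The plain and relative shuffles below can each be handled in one of
    * three ways: rewritten in terms of other intrinsics (the waterfall loop
    * in lower_shuffle() or the generic shuffle built by lower_to_shuffle()),
    * split channel by channel, or split into 32-bit halves for 64-bit values.
    */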
   case nir_intrinsic_shuffle:
      if (options->lower_shuffle)
         return lower_shuffle(b, intrin);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, options->lower_shuffle_to_32bit);
      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
         return lower_subgroup_op_to_32bit(b, intrin);
      break;
   case nir_intrinsic_shuffle_xor:
   case nir_intrinsic_shuffle_up:
   case nir_intrinsic_shuffle_down:
      if (options->lower_relative_shuffle)
         return lower_to_shuffle(b, intrin, options);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, options->lower_shuffle_to_32bit);
      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
         return lower_subgroup_op_to_32bit(b, intrin);
      break;

   case nir_intrinsic_quad_broadcast:
   case nir_intrinsic_quad_swap_horizontal:
   case nir_intrinsic_quad_swap_vertical:
   case nir_intrinsic_quad_swap_diagonal:
      if (options->lower_quad ||
          (options->lower_quad_broadcast_dynamic &&
           intrin->intrinsic == nir_intrinsic_quad_broadcast &&
           !nir_src_is_const(intrin->src[1])))
         return lower_dynamic_quad_broadcast(b, intrin, options);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, false);
      break;

   case nir_intrinsic_reduce: {
      nir_ssa_def *ret = NULL;
      /* A cluster size greater than the subgroup size is implementation-defined */
      if (options->subgroup_size &&
          nir_intrinsic_cluster_size(intrin) >= options->subgroup_size) {
         nir_intrinsic_set_cluster_size(intrin, 0);
         ret = NIR_LOWER_INSTR_PROGRESS;
      }
      if (options->lower_to_scalar && intrin->num_components > 1)
         ret = lower_subgroup_op_to_scalar(b, intrin, false);
      return ret;
   }
   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_exclusive_scan:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, false);
      break;

   default:
      break;
   }

   return NULL;
}
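
/* Lowers subgroup and ballot intrinsics according to the given options and
 * returns true if anything changed. A minimal sketch of how a driver might
 * invoke this pass (the option values below are illustrative, not taken from
 * any particular driver):
 *
 *    const nir_lower_subgroups_options opts = {
 *       .subgroup_size = 64,
 *       .ballot_bit_size = 64,
 *       .ballot_components = 1,
 *       .lower_to_scalar = true,
 *       .lower_subgroup_masks = true,
 *    };
 *    bool progress = false;
 *    NIR_PASS(progress, shader, nir_lower_subgroups, &opts);
 */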
bool
nir_lower_subgroups(nir_shader *shader,
                    const nir_lower_subgroups_options *options)
{
   return nir_shader_lower_instructions(shader,
                                        lower_subgroups_filter,
                                        lower_subgroups_instr,
                                        (void *)options);
}