1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright © 2017 Intel Corporation
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
13bf215546Sopenharmony_ci * Software.
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21bf215546Sopenharmony_ci * IN THE SOFTWARE.
22bf215546Sopenharmony_ci */
23bf215546Sopenharmony_ci
24bf215546Sopenharmony_ci#include "nir.h"
25bf215546Sopenharmony_ci#include "nir_builder.h"
26bf215546Sopenharmony_ci#include "util/u_math.h"
27bf215546Sopenharmony_ci
/**
 * \file nir_lower_subgroups.c
 */
31bf215546Sopenharmony_ci
32bf215546Sopenharmony_cistatic nir_intrinsic_instr *
33bf215546Sopenharmony_cilower_subgroups_64bit_split_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
34bf215546Sopenharmony_ci                                      unsigned int component)
35bf215546Sopenharmony_ci{
36bf215546Sopenharmony_ci   nir_ssa_def *comp;
37bf215546Sopenharmony_ci   if (component == 0)
38bf215546Sopenharmony_ci      comp = nir_unpack_64_2x32_split_x(b, intrin->src[0].ssa);
39bf215546Sopenharmony_ci   else
40bf215546Sopenharmony_ci      comp = nir_unpack_64_2x32_split_y(b, intrin->src[0].ssa);
41bf215546Sopenharmony_ci
42bf215546Sopenharmony_ci   nir_intrinsic_instr *intr = nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
43bf215546Sopenharmony_ci   nir_ssa_dest_init(&intr->instr, &intr->dest, 1, 32, NULL);
44bf215546Sopenharmony_ci   intr->const_index[0] = intrin->const_index[0];
45bf215546Sopenharmony_ci   intr->const_index[1] = intrin->const_index[1];
46bf215546Sopenharmony_ci   intr->src[0] = nir_src_for_ssa(comp);
47bf215546Sopenharmony_ci   if (nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2)
48bf215546Sopenharmony_ci      nir_src_copy(&intr->src[1], &intrin->src[1]);
49bf215546Sopenharmony_ci
50bf215546Sopenharmony_ci   intr->num_components = 1;
51bf215546Sopenharmony_ci   nir_builder_instr_insert(b, &intr->instr);
52bf215546Sopenharmony_ci   return intr;
53bf215546Sopenharmony_ci}
54bf215546Sopenharmony_ci
55bf215546Sopenharmony_cistatic nir_ssa_def *
56bf215546Sopenharmony_cilower_subgroup_op_to_32bit(nir_builder *b, nir_intrinsic_instr *intrin)
57bf215546Sopenharmony_ci{
58bf215546Sopenharmony_ci   assert(intrin->src[0].ssa->bit_size == 64);
59bf215546Sopenharmony_ci   nir_intrinsic_instr *intr_x = lower_subgroups_64bit_split_intrinsic(b, intrin, 0);
60bf215546Sopenharmony_ci   nir_intrinsic_instr *intr_y = lower_subgroups_64bit_split_intrinsic(b, intrin, 1);
61bf215546Sopenharmony_ci   return nir_pack_64_2x32_split(b, &intr_x->dest.ssa, &intr_y->dest.ssa);
62bf215546Sopenharmony_ci}
63bf215546Sopenharmony_ci
64bf215546Sopenharmony_cistatic nir_ssa_def *
65bf215546Sopenharmony_ciballot_type_to_uint(nir_builder *b, nir_ssa_def *value,
66bf215546Sopenharmony_ci                    const nir_lower_subgroups_options *options)
67bf215546Sopenharmony_ci{
68bf215546Sopenharmony_ci   /* Only the new-style SPIR-V subgroup instructions take a ballot result as
69bf215546Sopenharmony_ci    * an argument, so we only use this on uvec4 types.
70bf215546Sopenharmony_ci    */
71bf215546Sopenharmony_ci   assert(value->num_components == 4 && value->bit_size == 32);
72bf215546Sopenharmony_ci
73bf215546Sopenharmony_ci   return nir_extract_bits(b, &value, 1, 0, options->ballot_components,
74bf215546Sopenharmony_ci                           options->ballot_bit_size);
75bf215546Sopenharmony_ci}
76bf215546Sopenharmony_ci
77bf215546Sopenharmony_cistatic nir_ssa_def *
78bf215546Sopenharmony_ciuint_to_ballot_type(nir_builder *b, nir_ssa_def *value,
79bf215546Sopenharmony_ci                    unsigned num_components, unsigned bit_size)
80bf215546Sopenharmony_ci{
81bf215546Sopenharmony_ci   assert(util_is_power_of_two_nonzero(num_components));
82bf215546Sopenharmony_ci   assert(util_is_power_of_two_nonzero(value->num_components));
83bf215546Sopenharmony_ci
84bf215546Sopenharmony_ci   unsigned total_bits = bit_size * num_components;
85bf215546Sopenharmony_ci
86bf215546Sopenharmony_ci   /* If the source doesn't have enough bits, zero-pad */
87bf215546Sopenharmony_ci   if (total_bits > value->bit_size * value->num_components)
88bf215546Sopenharmony_ci      value = nir_pad_vector_imm_int(b, value, 0, total_bits / value->bit_size);
89bf215546Sopenharmony_ci
90bf215546Sopenharmony_ci   value = nir_bitcast_vector(b, value, bit_size);
91bf215546Sopenharmony_ci
92bf215546Sopenharmony_ci   /* If the source has too many components, truncate.  This can happen if,
93bf215546Sopenharmony_ci    * for instance, we're implementing GL_ARB_shader_ballot or
94bf215546Sopenharmony_ci    * VK_EXT_shader_subgroup_ballot which have 64-bit ballot values on an
95bf215546Sopenharmony_ci    * architecture with a native 128-bit uvec4 ballot.  This comes up in Zink
96bf215546Sopenharmony_ci    * for OpenGL on Vulkan.  It's the job of the driver calling this lowering
97bf215546Sopenharmony_ci    * pass to ensure that it's restricted subgroup sizes sufficiently that we
98bf215546Sopenharmony_ci    * have enough ballot bits.
99bf215546Sopenharmony_ci    */
100bf215546Sopenharmony_ci   if (value->num_components > num_components)
101bf215546Sopenharmony_ci      value = nir_trim_vector(b, value, num_components);
102bf215546Sopenharmony_ci
103bf215546Sopenharmony_ci   return value;
104bf215546Sopenharmony_ci}
105bf215546Sopenharmony_ci
106bf215546Sopenharmony_cistatic nir_ssa_def *
107bf215546Sopenharmony_cilower_subgroup_op_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin,
108bf215546Sopenharmony_ci                            bool lower_to_32bit)
109bf215546Sopenharmony_ci{
110bf215546Sopenharmony_ci   /* This is safe to call on scalar things but it would be silly */
111bf215546Sopenharmony_ci   assert(intrin->dest.ssa.num_components > 1);
112bf215546Sopenharmony_ci
113bf215546Sopenharmony_ci   nir_ssa_def *value = nir_ssa_for_src(b, intrin->src[0],
114bf215546Sopenharmony_ci                                           intrin->num_components);
115bf215546Sopenharmony_ci   nir_ssa_def *reads[NIR_MAX_VEC_COMPONENTS];
116bf215546Sopenharmony_ci
117bf215546Sopenharmony_ci   for (unsigned i = 0; i < intrin->num_components; i++) {
118bf215546Sopenharmony_ci      nir_intrinsic_instr *chan_intrin =
119bf215546Sopenharmony_ci         nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
120bf215546Sopenharmony_ci      nir_ssa_dest_init(&chan_intrin->instr, &chan_intrin->dest,
121bf215546Sopenharmony_ci                        1, intrin->dest.ssa.bit_size, NULL);
122bf215546Sopenharmony_ci      chan_intrin->num_components = 1;
123bf215546Sopenharmony_ci
124bf215546Sopenharmony_ci      /* value */
125bf215546Sopenharmony_ci      chan_intrin->src[0] = nir_src_for_ssa(nir_channel(b, value, i));
126bf215546Sopenharmony_ci      /* invocation */
127bf215546Sopenharmony_ci      if (nir_intrinsic_infos[intrin->intrinsic].num_srcs > 1) {
128bf215546Sopenharmony_ci         assert(nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2);
129bf215546Sopenharmony_ci         nir_src_copy(&chan_intrin->src[1], &intrin->src[1]);
130bf215546Sopenharmony_ci      }
131bf215546Sopenharmony_ci
132bf215546Sopenharmony_ci      chan_intrin->const_index[0] = intrin->const_index[0];
133bf215546Sopenharmony_ci      chan_intrin->const_index[1] = intrin->const_index[1];
134bf215546Sopenharmony_ci
135bf215546Sopenharmony_ci      if (lower_to_32bit && chan_intrin->src[0].ssa->bit_size == 64) {
136bf215546Sopenharmony_ci         reads[i] = lower_subgroup_op_to_32bit(b, chan_intrin);
137bf215546Sopenharmony_ci      } else {
138bf215546Sopenharmony_ci         nir_builder_instr_insert(b, &chan_intrin->instr);
139bf215546Sopenharmony_ci         reads[i] = &chan_intrin->dest.ssa;
140bf215546Sopenharmony_ci      }
141bf215546Sopenharmony_ci   }
142bf215546Sopenharmony_ci
143bf215546Sopenharmony_ci   return nir_vec(b, reads, intrin->num_components);
144bf215546Sopenharmony_ci}
145bf215546Sopenharmony_ci
146bf215546Sopenharmony_cistatic nir_ssa_def *
147bf215546Sopenharmony_cilower_vote_eq_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin)
148bf215546Sopenharmony_ci{
149bf215546Sopenharmony_ci   assert(intrin->src[0].is_ssa);
150bf215546Sopenharmony_ci   nir_ssa_def *value = intrin->src[0].ssa;
151bf215546Sopenharmony_ci
152bf215546Sopenharmony_ci   nir_ssa_def *result = NULL;
153bf215546Sopenharmony_ci   for (unsigned i = 0; i < intrin->num_components; i++) {
154bf215546Sopenharmony_ci      nir_intrinsic_instr *chan_intrin =
155bf215546Sopenharmony_ci         nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
156bf215546Sopenharmony_ci      nir_ssa_dest_init(&chan_intrin->instr, &chan_intrin->dest,
157bf215546Sopenharmony_ci                        1, intrin->dest.ssa.bit_size, NULL);
158bf215546Sopenharmony_ci      chan_intrin->num_components = 1;
159bf215546Sopenharmony_ci      chan_intrin->src[0] = nir_src_for_ssa(nir_channel(b, value, i));
160bf215546Sopenharmony_ci      nir_builder_instr_insert(b, &chan_intrin->instr);
161bf215546Sopenharmony_ci
162bf215546Sopenharmony_ci      if (result) {
163bf215546Sopenharmony_ci         result = nir_iand(b, result, &chan_intrin->dest.ssa);
164bf215546Sopenharmony_ci      } else {
165bf215546Sopenharmony_ci         result = &chan_intrin->dest.ssa;
166bf215546Sopenharmony_ci      }
167bf215546Sopenharmony_ci   }
168bf215546Sopenharmony_ci
169bf215546Sopenharmony_ci   return result;
170bf215546Sopenharmony_ci}
171bf215546Sopenharmony_ci
172bf215546Sopenharmony_cistatic nir_ssa_def *
173bf215546Sopenharmony_cilower_vote_eq(nir_builder *b, nir_intrinsic_instr *intrin)
174bf215546Sopenharmony_ci{
175bf215546Sopenharmony_ci   assert(intrin->src[0].is_ssa);
176bf215546Sopenharmony_ci   nir_ssa_def *value = intrin->src[0].ssa;
177bf215546Sopenharmony_ci
178bf215546Sopenharmony_ci   /* We have to implicitly lower to scalar */
179bf215546Sopenharmony_ci   nir_ssa_def *all_eq = NULL;
180bf215546Sopenharmony_ci   for (unsigned i = 0; i < intrin->num_components; i++) {
181bf215546Sopenharmony_ci      nir_ssa_def *rfi = nir_read_first_invocation(b, nir_channel(b, value, i));
182bf215546Sopenharmony_ci
183bf215546Sopenharmony_ci      nir_ssa_def *is_eq;
184bf215546Sopenharmony_ci      if (intrin->intrinsic == nir_intrinsic_vote_feq) {
185bf215546Sopenharmony_ci         is_eq = nir_feq(b, rfi, nir_channel(b, value, i));
186bf215546Sopenharmony_ci      } else {
187bf215546Sopenharmony_ci         is_eq = nir_ieq(b, rfi, nir_channel(b, value, i));
188bf215546Sopenharmony_ci      }
189bf215546Sopenharmony_ci
190bf215546Sopenharmony_ci      if (all_eq == NULL) {
191bf215546Sopenharmony_ci         all_eq = is_eq;
192bf215546Sopenharmony_ci      } else {
193bf215546Sopenharmony_ci         all_eq = nir_iand(b, all_eq, is_eq);
194bf215546Sopenharmony_ci      }
195bf215546Sopenharmony_ci   }
196bf215546Sopenharmony_ci
197bf215546Sopenharmony_ci   return nir_vote_all(b, 1, all_eq);
198bf215546Sopenharmony_ci}
199bf215546Sopenharmony_ci
200bf215546Sopenharmony_cistatic nir_ssa_def *
201bf215546Sopenharmony_cilower_shuffle_to_swizzle(nir_builder *b, nir_intrinsic_instr *intrin,
202bf215546Sopenharmony_ci                         const nir_lower_subgroups_options *options)
203bf215546Sopenharmony_ci{
204bf215546Sopenharmony_ci   unsigned mask = nir_src_as_uint(intrin->src[1]);
205bf215546Sopenharmony_ci
206bf215546Sopenharmony_ci   if (mask >= 32)
207bf215546Sopenharmony_ci      return NULL;
208bf215546Sopenharmony_ci
209bf215546Sopenharmony_ci   nir_intrinsic_instr *swizzle = nir_intrinsic_instr_create(
210bf215546Sopenharmony_ci      b->shader, nir_intrinsic_masked_swizzle_amd);
211bf215546Sopenharmony_ci   swizzle->num_components = intrin->num_components;
212bf215546Sopenharmony_ci   nir_src_copy(&swizzle->src[0], &intrin->src[0]);
213bf215546Sopenharmony_ci   nir_intrinsic_set_swizzle_mask(swizzle, (mask << 10) | 0x1f);
214bf215546Sopenharmony_ci   nir_ssa_dest_init(&swizzle->instr, &swizzle->dest,
215bf215546Sopenharmony_ci                     intrin->dest.ssa.num_components,
216bf215546Sopenharmony_ci                     intrin->dest.ssa.bit_size, NULL);
217bf215546Sopenharmony_ci
218bf215546Sopenharmony_ci   if (options->lower_to_scalar && swizzle->num_components > 1) {
219bf215546Sopenharmony_ci      return lower_subgroup_op_to_scalar(b, swizzle, options->lower_shuffle_to_32bit);
220bf215546Sopenharmony_ci   } else if (options->lower_shuffle_to_32bit && swizzle->src[0].ssa->bit_size == 64) {
221bf215546Sopenharmony_ci      return lower_subgroup_op_to_32bit(b, swizzle);
222bf215546Sopenharmony_ci   } else {
223bf215546Sopenharmony_ci      nir_builder_instr_insert(b, &swizzle->instr);
224bf215546Sopenharmony_ci      return &swizzle->dest.ssa;
225bf215546Sopenharmony_ci   }
226bf215546Sopenharmony_ci}
227bf215546Sopenharmony_ci
228bf215546Sopenharmony_ci/* Lowers "specialized" shuffles to a generic nir_intrinsic_shuffle. */
229bf215546Sopenharmony_ci
230bf215546Sopenharmony_cistatic nir_ssa_def *
231bf215546Sopenharmony_cilower_to_shuffle(nir_builder *b, nir_intrinsic_instr *intrin,
232bf215546Sopenharmony_ci                 const nir_lower_subgroups_options *options)
233bf215546Sopenharmony_ci{
234bf215546Sopenharmony_ci   if (intrin->intrinsic == nir_intrinsic_shuffle_xor &&
235bf215546Sopenharmony_ci       options->lower_shuffle_to_swizzle_amd &&
236bf215546Sopenharmony_ci       nir_src_is_const(intrin->src[1])) {
237bf215546Sopenharmony_ci      nir_ssa_def *result =
238bf215546Sopenharmony_ci         lower_shuffle_to_swizzle(b, intrin, options);
239bf215546Sopenharmony_ci      if (result)
240bf215546Sopenharmony_ci         return result;
241bf215546Sopenharmony_ci   }
242bf215546Sopenharmony_ci
243bf215546Sopenharmony_ci   nir_ssa_def *index = nir_load_subgroup_invocation(b);
244bf215546Sopenharmony_ci   bool is_shuffle = false;
245bf215546Sopenharmony_ci   switch (intrin->intrinsic) {
246bf215546Sopenharmony_ci   case nir_intrinsic_shuffle_xor:
247bf215546Sopenharmony_ci      assert(intrin->src[1].is_ssa);
248bf215546Sopenharmony_ci      index = nir_ixor(b, index, intrin->src[1].ssa);
249bf215546Sopenharmony_ci      is_shuffle = true;
250bf215546Sopenharmony_ci      break;
251bf215546Sopenharmony_ci   case nir_intrinsic_shuffle_up:
252bf215546Sopenharmony_ci      assert(intrin->src[1].is_ssa);
253bf215546Sopenharmony_ci      index = nir_isub(b, index, intrin->src[1].ssa);
254bf215546Sopenharmony_ci      is_shuffle = true;
255bf215546Sopenharmony_ci      break;
256bf215546Sopenharmony_ci   case nir_intrinsic_shuffle_down:
257bf215546Sopenharmony_ci      assert(intrin->src[1].is_ssa);
258bf215546Sopenharmony_ci      index = nir_iadd(b, index, intrin->src[1].ssa);
259bf215546Sopenharmony_ci      is_shuffle = true;
260bf215546Sopenharmony_ci      break;
261bf215546Sopenharmony_ci   case nir_intrinsic_quad_broadcast:
262bf215546Sopenharmony_ci      assert(intrin->src[1].is_ssa);
263bf215546Sopenharmony_ci      index = nir_ior(b, nir_iand(b, index, nir_imm_int(b, ~0x3)),
264bf215546Sopenharmony_ci                         intrin->src[1].ssa);
265bf215546Sopenharmony_ci      break;
266bf215546Sopenharmony_ci   case nir_intrinsic_quad_swap_horizontal:
267bf215546Sopenharmony_ci      /* For Quad operations, subgroups are divided into quads where
268bf215546Sopenharmony_ci       * (invocation % 4) is the index to a square arranged as follows:
269bf215546Sopenharmony_ci       *
270bf215546Sopenharmony_ci       *    +---+---+
271bf215546Sopenharmony_ci       *    | 0 | 1 |
272bf215546Sopenharmony_ci       *    +---+---+
273bf215546Sopenharmony_ci       *    | 2 | 3 |
274bf215546Sopenharmony_ci       *    +---+---+
275bf215546Sopenharmony_ci       */
276bf215546Sopenharmony_ci      index = nir_ixor(b, index, nir_imm_int(b, 0x1));
277bf215546Sopenharmony_ci      break;
278bf215546Sopenharmony_ci   case nir_intrinsic_quad_swap_vertical:
279bf215546Sopenharmony_ci      index = nir_ixor(b, index, nir_imm_int(b, 0x2));
280bf215546Sopenharmony_ci      break;
281bf215546Sopenharmony_ci   case nir_intrinsic_quad_swap_diagonal:
282bf215546Sopenharmony_ci      index = nir_ixor(b, index, nir_imm_int(b, 0x3));
283bf215546Sopenharmony_ci      break;
284bf215546Sopenharmony_ci   default:
285bf215546Sopenharmony_ci      unreachable("Invalid intrinsic");
286bf215546Sopenharmony_ci   }
287bf215546Sopenharmony_ci
288bf215546Sopenharmony_ci   nir_intrinsic_instr *shuffle =
289bf215546Sopenharmony_ci      nir_intrinsic_instr_create(b->shader, nir_intrinsic_shuffle);
290bf215546Sopenharmony_ci   shuffle->num_components = intrin->num_components;
291bf215546Sopenharmony_ci   nir_src_copy(&shuffle->src[0], &intrin->src[0]);
292bf215546Sopenharmony_ci   shuffle->src[1] = nir_src_for_ssa(index);
293bf215546Sopenharmony_ci   nir_ssa_dest_init(&shuffle->instr, &shuffle->dest,
294bf215546Sopenharmony_ci                     intrin->dest.ssa.num_components,
295bf215546Sopenharmony_ci                     intrin->dest.ssa.bit_size, NULL);
296bf215546Sopenharmony_ci
297bf215546Sopenharmony_ci   bool lower_to_32bit = options->lower_shuffle_to_32bit && is_shuffle;
298bf215546Sopenharmony_ci   if (options->lower_to_scalar && shuffle->num_components > 1) {
299bf215546Sopenharmony_ci      return lower_subgroup_op_to_scalar(b, shuffle, lower_to_32bit);
300bf215546Sopenharmony_ci   } else if (lower_to_32bit && shuffle->src[0].ssa->bit_size == 64) {
301bf215546Sopenharmony_ci      return lower_subgroup_op_to_32bit(b, shuffle);
302bf215546Sopenharmony_ci   } else {
303bf215546Sopenharmony_ci      nir_builder_instr_insert(b, &shuffle->instr);
304bf215546Sopenharmony_ci      return &shuffle->dest.ssa;
305bf215546Sopenharmony_ci   }
306bf215546Sopenharmony_ci}
307bf215546Sopenharmony_ci
308bf215546Sopenharmony_cistatic const struct glsl_type *
309bf215546Sopenharmony_ciglsl_type_for_ssa(nir_ssa_def *def)
310bf215546Sopenharmony_ci{
311bf215546Sopenharmony_ci   const struct glsl_type *comp_type = def->bit_size == 1 ? glsl_bool_type() :
312bf215546Sopenharmony_ci      glsl_uintN_t_type(def->bit_size);
313bf215546Sopenharmony_ci   return glsl_replace_vector_type(comp_type, def->num_components);
314bf215546Sopenharmony_ci}
315bf215546Sopenharmony_ci
316bf215546Sopenharmony_ci/* Lower nir_intrinsic_shuffle to a waterfall loop + nir_read_invocation.
317bf215546Sopenharmony_ci */
318bf215546Sopenharmony_cistatic nir_ssa_def *
319bf215546Sopenharmony_cilower_shuffle(nir_builder *b, nir_intrinsic_instr *intrin)
320bf215546Sopenharmony_ci{
321bf215546Sopenharmony_ci   assert(intrin->src[0].is_ssa);
322bf215546Sopenharmony_ci   assert(intrin->src[1].is_ssa);
323bf215546Sopenharmony_ci   nir_ssa_def *val = intrin->src[0].ssa;
324bf215546Sopenharmony_ci   nir_ssa_def *id = intrin->src[1].ssa;
325bf215546Sopenharmony_ci
326bf215546Sopenharmony_ci   /* The loop is something like:
327bf215546Sopenharmony_ci    *
328bf215546Sopenharmony_ci    * while (true) {
329bf215546Sopenharmony_ci    *    first_id = readFirstInvocation(gl_SubgroupInvocationID);
330bf215546Sopenharmony_ci    *    first_val = readFirstInvocation(val);
331bf215546Sopenharmony_ci    *    first_result = readInvocation(val, readFirstInvocation(id));
332bf215546Sopenharmony_ci    *    if (id == first_id)
333bf215546Sopenharmony_ci    *       result = first_val;
334bf215546Sopenharmony_ci    *    if (elect()) {
335bf215546Sopenharmony_ci    *       if (id > gl_SubgroupInvocationID) {
336bf215546Sopenharmony_ci    *          result = first_result;
337bf215546Sopenharmony_ci    *       }
338bf215546Sopenharmony_ci    *       break;
339bf215546Sopenharmony_ci    *    }
340bf215546Sopenharmony_ci    * }
341bf215546Sopenharmony_ci    *
342bf215546Sopenharmony_ci    * The idea is to guarantee, on each iteration of the loop, that anything
343bf215546Sopenharmony_ci    * reading from first_id gets the correct value, so that we can then kill
344bf215546Sopenharmony_ci    * it off by breaking out of the loop. Before doing that we also have to
345bf215546Sopenharmony_ci    * ensure that first_id invocation gets the correct value. It only won't be
346bf215546Sopenharmony_ci    * assigned the correct value already if the invocation it's reading from
347bf215546Sopenharmony_ci    * isn't already killed off, that is, if it's later than its own ID.
348bf215546Sopenharmony_ci    * Invocations where id <= gl_SubgroupInvocationID will be assigned their
349bf215546Sopenharmony_ci    * result in the first if, and invocations where id >
350bf215546Sopenharmony_ci    * gl_SubgroupInvocationID will be assigned their result in the second if.
351bf215546Sopenharmony_ci    *
352bf215546Sopenharmony_ci    * We do this more complicated loop rather than looping over all id's
353bf215546Sopenharmony_ci    * explicitly because at this point we don't know the "actual" subgroup
354bf215546Sopenharmony_ci    * size and at the moment there's no way to get at it, which means we may
355bf215546Sopenharmony_ci    * loop over always-inactive invocations.
356bf215546Sopenharmony_ci    */
357bf215546Sopenharmony_ci
358bf215546Sopenharmony_ci   nir_ssa_def *subgroup_id = nir_load_subgroup_invocation(b);
359bf215546Sopenharmony_ci
360bf215546Sopenharmony_ci   nir_variable *result =
361bf215546Sopenharmony_ci      nir_local_variable_create(b->impl, glsl_type_for_ssa(val), "result");
362bf215546Sopenharmony_ci
363bf215546Sopenharmony_ci   nir_loop *loop = nir_push_loop(b); {
364bf215546Sopenharmony_ci      nir_ssa_def *first_id = nir_read_first_invocation(b, subgroup_id);
365bf215546Sopenharmony_ci      nir_ssa_def *first_val = nir_read_first_invocation(b, val);
366bf215546Sopenharmony_ci      nir_ssa_def *first_result =
367bf215546Sopenharmony_ci         nir_read_invocation(b, val, nir_read_first_invocation(b, id));
368bf215546Sopenharmony_ci
369bf215546Sopenharmony_ci      nir_if *nif = nir_push_if(b, nir_ieq(b, id, first_id)); {
370bf215546Sopenharmony_ci         nir_store_var(b, result, first_val, BITFIELD_MASK(val->num_components));
371bf215546Sopenharmony_ci      } nir_pop_if(b, nif);
372bf215546Sopenharmony_ci
373bf215546Sopenharmony_ci      nir_if *nif2 = nir_push_if(b, nir_elect(b, 1)); {
374bf215546Sopenharmony_ci         nir_if *nif3 = nir_push_if(b, nir_ult(b, subgroup_id, id)); {
375bf215546Sopenharmony_ci            nir_store_var(b, result, first_result, BITFIELD_MASK(val->num_components));
376bf215546Sopenharmony_ci         } nir_pop_if(b, nif3);
377bf215546Sopenharmony_ci
378bf215546Sopenharmony_ci         nir_jump(b, nir_jump_break);
379bf215546Sopenharmony_ci      } nir_pop_if(b, nif2);
380bf215546Sopenharmony_ci   } nir_pop_loop(b, loop);
381bf215546Sopenharmony_ci
382bf215546Sopenharmony_ci   return nir_load_var(b, result);
383bf215546Sopenharmony_ci}
384bf215546Sopenharmony_ci
385bf215546Sopenharmony_cistatic bool
386bf215546Sopenharmony_cilower_subgroups_filter(const nir_instr *instr, const void *_options)
387bf215546Sopenharmony_ci{
388bf215546Sopenharmony_ci   return instr->type == nir_instr_type_intrinsic;
389bf215546Sopenharmony_ci}
390bf215546Sopenharmony_ci
391bf215546Sopenharmony_ci/* Return a ballot-mask-sized value which represents "val" sign-extended and
392bf215546Sopenharmony_ci * then shifted left by "shift". Only particular values for "val" are
393bf215546Sopenharmony_ci * supported, see below.
394bf215546Sopenharmony_ci */
395bf215546Sopenharmony_cistatic nir_ssa_def *
396bf215546Sopenharmony_cibuild_ballot_imm_ishl(nir_builder *b, int64_t val, nir_ssa_def *shift,
397bf215546Sopenharmony_ci                      const nir_lower_subgroups_options *options)
398bf215546Sopenharmony_ci{
399bf215546Sopenharmony_ci   /* This only works if all the high bits are the same as bit 1. */
400bf215546Sopenharmony_ci   assert((val >> 2) == (val & 0x2 ? -1 : 0));
401bf215546Sopenharmony_ci
402bf215546Sopenharmony_ci   /* First compute the result assuming one ballot component. */
403bf215546Sopenharmony_ci   nir_ssa_def *result =
404bf215546Sopenharmony_ci      nir_ishl(b, nir_imm_intN_t(b, val, options->ballot_bit_size), shift);
405bf215546Sopenharmony_ci
406bf215546Sopenharmony_ci   if (options->ballot_components == 1)
407bf215546Sopenharmony_ci      return result;
408bf215546Sopenharmony_ci
409bf215546Sopenharmony_ci   /* Fix up the result when there is > 1 component. The idea is that nir_ishl
410bf215546Sopenharmony_ci    * masks out the high bits of the shift value already, so in case there's
411bf215546Sopenharmony_ci    * more than one component the component which 1 would be shifted into
412bf215546Sopenharmony_ci    * already has the right value and all we have to do is fixup the other
413bf215546Sopenharmony_ci    * components. Components below it should always be 0, and components above
414bf215546Sopenharmony_ci    * it must be either 0 or ~0 because of the assert above. For example, if
415bf215546Sopenharmony_ci    * the target ballot size is 2 x uint32, and we're shifting 1 by 33, then
416bf215546Sopenharmony_ci    * we'll feed 33 into ishl, which will mask it off to get 1, so we'll
417bf215546Sopenharmony_ci    * compute a single-component result of 2, which is correct for the second
418bf215546Sopenharmony_ci    * component, but the first component needs to be 0, which we get by
419bf215546Sopenharmony_ci    * comparing the high bits of the shift with 0 and selecting the original
420bf215546Sopenharmony_ci    * answer or 0 for the first component (and something similar with the
421bf215546Sopenharmony_ci    * second component). This idea is generalized here for any component count
422bf215546Sopenharmony_ci    */
423bf215546Sopenharmony_ci   nir_const_value min_shift[4] = { 0 };
424bf215546Sopenharmony_ci   for (unsigned i = 0; i < options->ballot_components; i++)
425bf215546Sopenharmony_ci      min_shift[i].i32 = i * options->ballot_bit_size;
426bf215546Sopenharmony_ci   nir_ssa_def *min_shift_val = nir_build_imm(b, options->ballot_components, 32, min_shift);
427bf215546Sopenharmony_ci
428bf215546Sopenharmony_ci   nir_const_value max_shift[4] = { 0 };
429bf215546Sopenharmony_ci   for (unsigned i = 0; i < options->ballot_components; i++)
430bf215546Sopenharmony_ci      max_shift[i].i32 = (i + 1) * options->ballot_bit_size;
431bf215546Sopenharmony_ci   nir_ssa_def *max_shift_val = nir_build_imm(b, options->ballot_components, 32, max_shift);
432bf215546Sopenharmony_ci
433bf215546Sopenharmony_ci   return nir_bcsel(b, nir_ult(b, shift, max_shift_val),
434bf215546Sopenharmony_ci                    nir_bcsel(b, nir_ult(b, shift, min_shift_val),
435bf215546Sopenharmony_ci                              nir_imm_intN_t(b, val >> 63, result->bit_size),
436bf215546Sopenharmony_ci                              result),
437bf215546Sopenharmony_ci                    nir_imm_intN_t(b, 0, result->bit_size));
438bf215546Sopenharmony_ci}
439bf215546Sopenharmony_ci
440bf215546Sopenharmony_cistatic nir_ssa_def *
441bf215546Sopenharmony_cibuild_subgroup_eq_mask(nir_builder *b,
442bf215546Sopenharmony_ci                       const nir_lower_subgroups_options *options)
443bf215546Sopenharmony_ci{
444bf215546Sopenharmony_ci   nir_ssa_def *subgroup_idx = nir_load_subgroup_invocation(b);
445bf215546Sopenharmony_ci
446bf215546Sopenharmony_ci   return build_ballot_imm_ishl(b, 1, subgroup_idx, options);
447bf215546Sopenharmony_ci}
448bf215546Sopenharmony_ci
449bf215546Sopenharmony_cistatic nir_ssa_def *
450bf215546Sopenharmony_cibuild_subgroup_ge_mask(nir_builder *b,
451bf215546Sopenharmony_ci                       const nir_lower_subgroups_options *options)
452bf215546Sopenharmony_ci{
453bf215546Sopenharmony_ci   nir_ssa_def *subgroup_idx = nir_load_subgroup_invocation(b);
454bf215546Sopenharmony_ci
455bf215546Sopenharmony_ci   return build_ballot_imm_ishl(b, ~0ull, subgroup_idx, options);
456bf215546Sopenharmony_ci}
457bf215546Sopenharmony_ci
458bf215546Sopenharmony_cistatic nir_ssa_def *
459bf215546Sopenharmony_cibuild_subgroup_gt_mask(nir_builder *b,
460bf215546Sopenharmony_ci                       const nir_lower_subgroups_options *options)
461bf215546Sopenharmony_ci{
462bf215546Sopenharmony_ci   nir_ssa_def *subgroup_idx = nir_load_subgroup_invocation(b);
463bf215546Sopenharmony_ci
464bf215546Sopenharmony_ci   return build_ballot_imm_ishl(b, ~1ull, subgroup_idx, options);
465bf215546Sopenharmony_ci}
466bf215546Sopenharmony_ci
467bf215546Sopenharmony_ci/* Return a mask which is 1 for threads up to the run-time subgroup size, i.e.
468bf215546Sopenharmony_ci * 1 for the entire subgroup. SPIR-V requires us to return 0 for indices at or
469bf215546Sopenharmony_ci * above the subgroup size for the masks, but gt_mask and ge_mask make them 1
470bf215546Sopenharmony_ci * so we have to "and" with this mask.
471bf215546Sopenharmony_ci */
472bf215546Sopenharmony_cistatic nir_ssa_def *
473bf215546Sopenharmony_cibuild_subgroup_mask(nir_builder *b,
474bf215546Sopenharmony_ci                    const nir_lower_subgroups_options *options)
475bf215546Sopenharmony_ci{
476bf215546Sopenharmony_ci   nir_ssa_def *subgroup_size = nir_load_subgroup_size(b);
477bf215546Sopenharmony_ci
478bf215546Sopenharmony_ci   /* First compute the result assuming one ballot component. */
479bf215546Sopenharmony_ci   nir_ssa_def *result =
480bf215546Sopenharmony_ci      nir_ushr(b, nir_imm_intN_t(b, ~0ull, options->ballot_bit_size),
481bf215546Sopenharmony_ci                  nir_isub_imm(b, options->ballot_bit_size,
482bf215546Sopenharmony_ci                               subgroup_size));
483bf215546Sopenharmony_ci
484bf215546Sopenharmony_ci   /* Since the subgroup size and ballot bitsize are both powers of two, there
485bf215546Sopenharmony_ci    * are two possible cases to consider:
486bf215546Sopenharmony_ci    *
487bf215546Sopenharmony_ci    * (1) The subgroup size is less than the ballot bitsize. We need to return
488bf215546Sopenharmony_ci    * "result" in the first component and 0 in every other component.
489bf215546Sopenharmony_ci    * (2) The subgroup size is a multiple of the ballot bitsize. We need to
490bf215546Sopenharmony_ci    * return ~0 if the subgroup size divided by the ballot bitsize is less
491bf215546Sopenharmony_ci    * than or equal to the index in the vector and 0 otherwise. For example,
492bf215546Sopenharmony_ci    * with a target ballot type of 4 x uint32 and subgroup_size = 64 we'd need
493bf215546Sopenharmony_ci    * to return { ~0, ~0, 0, 0 }.
494bf215546Sopenharmony_ci    *
495bf215546Sopenharmony_ci    * In case (2) it turns out that "result" will be ~0, because
496bf215546Sopenharmony_ci    * "ballot_bit_size - subgroup_size" is also a multiple of
497bf215546Sopenharmony_ci    * "ballot_bit_size" and since nir_ushr masks the shift value it will
498bf215546Sopenharmony_ci    * shifted by 0. This means that the first component can just be "result"
499bf215546Sopenharmony_ci    * in all cases.  The other components will also get the correct value in
500bf215546Sopenharmony_ci    * case (1) if we just use the rule in case (2), so we'll get the correct
501bf215546Sopenharmony_ci    * result if we just follow (2) and then replace the first component with
502bf215546Sopenharmony_ci    * "result".
503bf215546Sopenharmony_ci    */
504bf215546Sopenharmony_ci   nir_const_value min_idx[4] = { 0 };
505bf215546Sopenharmony_ci   for (unsigned i = 0; i < options->ballot_components; i++)
506bf215546Sopenharmony_ci      min_idx[i].i32 = i * options->ballot_bit_size;
507bf215546Sopenharmony_ci   nir_ssa_def *min_idx_val = nir_build_imm(b, options->ballot_components, 32, min_idx);
508bf215546Sopenharmony_ci
509bf215546Sopenharmony_ci   nir_ssa_def *result_extended =
510bf215546Sopenharmony_ci      nir_pad_vector_imm_int(b, result, ~0ull, options->ballot_components);
511bf215546Sopenharmony_ci
512bf215546Sopenharmony_ci   return nir_bcsel(b, nir_ult(b, min_idx_val, subgroup_size),
513bf215546Sopenharmony_ci                    result_extended, nir_imm_intN_t(b, 0, options->ballot_bit_size));
514bf215546Sopenharmony_ci}
515bf215546Sopenharmony_ci
516bf215546Sopenharmony_cistatic nir_ssa_def *
517bf215546Sopenharmony_civec_bit_count(nir_builder *b, nir_ssa_def *value)
518bf215546Sopenharmony_ci{
519bf215546Sopenharmony_ci   nir_ssa_def *vec_result = nir_bit_count(b, value);
520bf215546Sopenharmony_ci   nir_ssa_def *result = nir_channel(b, vec_result, 0);
521bf215546Sopenharmony_ci   for (unsigned i = 1; i < value->num_components; i++)
522bf215546Sopenharmony_ci      result = nir_iadd(b, result, nir_channel(b, vec_result, i));
523bf215546Sopenharmony_ci   return result;
524bf215546Sopenharmony_ci}
525bf215546Sopenharmony_ci
526bf215546Sopenharmony_cistatic nir_ssa_def *
527bf215546Sopenharmony_civec_find_lsb(nir_builder *b, nir_ssa_def *value)
528bf215546Sopenharmony_ci{
529bf215546Sopenharmony_ci   nir_ssa_def *vec_result = nir_find_lsb(b, value);
530bf215546Sopenharmony_ci   nir_ssa_def *result = nir_imm_int(b, -1);
531bf215546Sopenharmony_ci   for (int i = value->num_components - 1; i >= 0; i--) {
532bf215546Sopenharmony_ci      nir_ssa_def *channel = nir_channel(b, vec_result, i);
533bf215546Sopenharmony_ci      /* result = channel >= 0 ? (i * bitsize + channel) : result */
534bf215546Sopenharmony_ci      result = nir_bcsel(b, nir_ige(b, channel, nir_imm_int(b, 0)),
535bf215546Sopenharmony_ci                         nir_iadd_imm(b, channel, i * value->bit_size),
536bf215546Sopenharmony_ci                         result);
537bf215546Sopenharmony_ci   }
538bf215546Sopenharmony_ci   return result;
539bf215546Sopenharmony_ci}
540bf215546Sopenharmony_ci
541bf215546Sopenharmony_cistatic nir_ssa_def *
542bf215546Sopenharmony_civec_find_msb(nir_builder *b, nir_ssa_def *value)
543bf215546Sopenharmony_ci{
544bf215546Sopenharmony_ci   nir_ssa_def *vec_result = nir_ufind_msb(b, value);
545bf215546Sopenharmony_ci   nir_ssa_def *result = nir_imm_int(b, -1);
546bf215546Sopenharmony_ci   for (unsigned i = 0; i < value->num_components; i++) {
547bf215546Sopenharmony_ci      nir_ssa_def *channel = nir_channel(b, vec_result, i);
548bf215546Sopenharmony_ci      /* result = channel >= 0 ? (i * bitsize + channel) : result */
549bf215546Sopenharmony_ci      result = nir_bcsel(b, nir_ige(b, channel, nir_imm_int(b, 0)),
550bf215546Sopenharmony_ci                         nir_iadd_imm(b, channel, i * value->bit_size),
551bf215546Sopenharmony_ci                         result);
552bf215546Sopenharmony_ci   }
553bf215546Sopenharmony_ci   return result;
554bf215546Sopenharmony_ci}
555bf215546Sopenharmony_ci
556bf215546Sopenharmony_cistatic nir_ssa_def *
557bf215546Sopenharmony_cilower_dynamic_quad_broadcast(nir_builder *b, nir_intrinsic_instr *intrin,
558bf215546Sopenharmony_ci                             const nir_lower_subgroups_options *options)
559bf215546Sopenharmony_ci{
560bf215546Sopenharmony_ci   if (!options->lower_quad_broadcast_dynamic_to_const)
561bf215546Sopenharmony_ci      return lower_to_shuffle(b, intrin, options);
562bf215546Sopenharmony_ci
563bf215546Sopenharmony_ci   nir_ssa_def *dst = NULL;
564bf215546Sopenharmony_ci
565bf215546Sopenharmony_ci   for (unsigned i = 0; i < 4; ++i) {
566bf215546Sopenharmony_ci      nir_intrinsic_instr *qbcst =
567bf215546Sopenharmony_ci         nir_intrinsic_instr_create(b->shader, nir_intrinsic_quad_broadcast);
568bf215546Sopenharmony_ci
569bf215546Sopenharmony_ci      qbcst->num_components = intrin->num_components;
570bf215546Sopenharmony_ci      qbcst->src[1] = nir_src_for_ssa(nir_imm_int(b, i));
571bf215546Sopenharmony_ci      nir_src_copy(&qbcst->src[0], &intrin->src[0]);
572bf215546Sopenharmony_ci      nir_ssa_dest_init(&qbcst->instr, &qbcst->dest,
573bf215546Sopenharmony_ci                        intrin->dest.ssa.num_components,
574bf215546Sopenharmony_ci                        intrin->dest.ssa.bit_size, NULL);
575bf215546Sopenharmony_ci
576bf215546Sopenharmony_ci      nir_ssa_def *qbcst_dst = NULL;
577bf215546Sopenharmony_ci
578bf215546Sopenharmony_ci      if (options->lower_to_scalar && qbcst->num_components > 1) {
579bf215546Sopenharmony_ci         qbcst_dst = lower_subgroup_op_to_scalar(b, qbcst, false);
580bf215546Sopenharmony_ci      } else {
581bf215546Sopenharmony_ci         nir_builder_instr_insert(b, &qbcst->instr);
582bf215546Sopenharmony_ci         qbcst_dst = &qbcst->dest.ssa;
583bf215546Sopenharmony_ci      }
584bf215546Sopenharmony_ci
585bf215546Sopenharmony_ci      if (i)
586bf215546Sopenharmony_ci         dst = nir_bcsel(b, nir_ieq(b, intrin->src[1].ssa,
587bf215546Sopenharmony_ci                                    nir_src_for_ssa(nir_imm_int(b, i)).ssa),
588bf215546Sopenharmony_ci                         qbcst_dst, dst);
589bf215546Sopenharmony_ci      else
590bf215546Sopenharmony_ci         dst = qbcst_dst;
591bf215546Sopenharmony_ci   }
592bf215546Sopenharmony_ci
593bf215546Sopenharmony_ci   return dst;
594bf215546Sopenharmony_ci}
595bf215546Sopenharmony_ci
596bf215546Sopenharmony_cistatic nir_ssa_def *
597bf215546Sopenharmony_cilower_read_invocation_to_cond(nir_builder *b, nir_intrinsic_instr *intrin)
598bf215546Sopenharmony_ci{
599bf215546Sopenharmony_ci   return nir_read_invocation_cond_ir3(b, intrin->dest.ssa.bit_size,
600bf215546Sopenharmony_ci                                       intrin->src[0].ssa,
601bf215546Sopenharmony_ci                                       nir_ieq(b, intrin->src[1].ssa,
602bf215546Sopenharmony_ci                                               nir_load_subgroup_invocation(b)));
603bf215546Sopenharmony_ci}
604bf215546Sopenharmony_ci
605bf215546Sopenharmony_cistatic nir_ssa_def *
606bf215546Sopenharmony_cilower_subgroups_instr(nir_builder *b, nir_instr *instr, void *_options)
607bf215546Sopenharmony_ci{
608bf215546Sopenharmony_ci   const nir_lower_subgroups_options *options = _options;
609bf215546Sopenharmony_ci
610bf215546Sopenharmony_ci   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
611bf215546Sopenharmony_ci   switch (intrin->intrinsic) {
612bf215546Sopenharmony_ci   case nir_intrinsic_vote_any:
613bf215546Sopenharmony_ci   case nir_intrinsic_vote_all:
614bf215546Sopenharmony_ci      if (options->lower_vote_trivial)
615bf215546Sopenharmony_ci         return nir_ssa_for_src(b, intrin->src[0], 1);
616bf215546Sopenharmony_ci      break;
617bf215546Sopenharmony_ci
618bf215546Sopenharmony_ci   case nir_intrinsic_vote_feq:
619bf215546Sopenharmony_ci   case nir_intrinsic_vote_ieq:
620bf215546Sopenharmony_ci      if (options->lower_vote_trivial)
621bf215546Sopenharmony_ci         return nir_imm_true(b);
622bf215546Sopenharmony_ci
623bf215546Sopenharmony_ci      if (options->lower_vote_eq)
624bf215546Sopenharmony_ci         return lower_vote_eq(b, intrin);
625bf215546Sopenharmony_ci
626bf215546Sopenharmony_ci      if (options->lower_to_scalar && intrin->num_components > 1)
627bf215546Sopenharmony_ci         return lower_vote_eq_to_scalar(b, intrin);
628bf215546Sopenharmony_ci      break;
629bf215546Sopenharmony_ci
630bf215546Sopenharmony_ci   case nir_intrinsic_load_subgroup_size:
631bf215546Sopenharmony_ci      if (options->subgroup_size)
632bf215546Sopenharmony_ci         return nir_imm_int(b, options->subgroup_size);
633bf215546Sopenharmony_ci      break;
634bf215546Sopenharmony_ci
635bf215546Sopenharmony_ci   case nir_intrinsic_read_invocation:
636bf215546Sopenharmony_ci      if (options->lower_to_scalar && intrin->num_components > 1)
637bf215546Sopenharmony_ci         return lower_subgroup_op_to_scalar(b, intrin, false);
638bf215546Sopenharmony_ci
639bf215546Sopenharmony_ci      if (options->lower_read_invocation_to_cond)
640bf215546Sopenharmony_ci         return lower_read_invocation_to_cond(b, intrin);
641bf215546Sopenharmony_ci
642bf215546Sopenharmony_ci      break;
643bf215546Sopenharmony_ci
644bf215546Sopenharmony_ci   case nir_intrinsic_read_first_invocation:
645bf215546Sopenharmony_ci      if (options->lower_to_scalar && intrin->num_components > 1)
646bf215546Sopenharmony_ci         return lower_subgroup_op_to_scalar(b, intrin, false);
647bf215546Sopenharmony_ci      break;
648bf215546Sopenharmony_ci
649bf215546Sopenharmony_ci   case nir_intrinsic_load_subgroup_eq_mask:
650bf215546Sopenharmony_ci   case nir_intrinsic_load_subgroup_ge_mask:
651bf215546Sopenharmony_ci   case nir_intrinsic_load_subgroup_gt_mask:
652bf215546Sopenharmony_ci   case nir_intrinsic_load_subgroup_le_mask:
653bf215546Sopenharmony_ci   case nir_intrinsic_load_subgroup_lt_mask: {
654bf215546Sopenharmony_ci      if (!options->lower_subgroup_masks)
655bf215546Sopenharmony_ci         return NULL;
656bf215546Sopenharmony_ci
657bf215546Sopenharmony_ci      nir_ssa_def *val;
658bf215546Sopenharmony_ci      switch (intrin->intrinsic) {
659bf215546Sopenharmony_ci      case nir_intrinsic_load_subgroup_eq_mask:
660bf215546Sopenharmony_ci         val = build_subgroup_eq_mask(b, options);
661bf215546Sopenharmony_ci         break;
662bf215546Sopenharmony_ci      case nir_intrinsic_load_subgroup_ge_mask:
663bf215546Sopenharmony_ci         val = nir_iand(b, build_subgroup_ge_mask(b, options),
664bf215546Sopenharmony_ci                           build_subgroup_mask(b, options));
665bf215546Sopenharmony_ci         break;
666bf215546Sopenharmony_ci      case nir_intrinsic_load_subgroup_gt_mask:
667bf215546Sopenharmony_ci         val = nir_iand(b, build_subgroup_gt_mask(b, options),
668bf215546Sopenharmony_ci                           build_subgroup_mask(b, options));
669bf215546Sopenharmony_ci         break;
670bf215546Sopenharmony_ci      case nir_intrinsic_load_subgroup_le_mask:
671bf215546Sopenharmony_ci         val = nir_inot(b, build_subgroup_gt_mask(b, options));
672bf215546Sopenharmony_ci         break;
673bf215546Sopenharmony_ci      case nir_intrinsic_load_subgroup_lt_mask:
674bf215546Sopenharmony_ci         val = nir_inot(b, build_subgroup_ge_mask(b, options));
675bf215546Sopenharmony_ci         break;
676bf215546Sopenharmony_ci      default:
677bf215546Sopenharmony_ci         unreachable("you seriously can't tell this is unreachable?");
678bf215546Sopenharmony_ci      }
679bf215546Sopenharmony_ci
680bf215546Sopenharmony_ci      return uint_to_ballot_type(b, val,
681bf215546Sopenharmony_ci                                 intrin->dest.ssa.num_components,
682bf215546Sopenharmony_ci                                 intrin->dest.ssa.bit_size);
683bf215546Sopenharmony_ci   }
684bf215546Sopenharmony_ci
685bf215546Sopenharmony_ci   case nir_intrinsic_ballot: {
686bf215546Sopenharmony_ci      if (intrin->dest.ssa.num_components == options->ballot_components &&
687bf215546Sopenharmony_ci          intrin->dest.ssa.bit_size == options->ballot_bit_size)
688bf215546Sopenharmony_ci         return NULL;
689bf215546Sopenharmony_ci
690bf215546Sopenharmony_ci      nir_ssa_def *ballot =
691bf215546Sopenharmony_ci         nir_ballot(b, options->ballot_components, options->ballot_bit_size,
692bf215546Sopenharmony_ci                    intrin->src[0].ssa);
693bf215546Sopenharmony_ci
694bf215546Sopenharmony_ci      return uint_to_ballot_type(b, ballot,
695bf215546Sopenharmony_ci                                 intrin->dest.ssa.num_components,
696bf215546Sopenharmony_ci                                 intrin->dest.ssa.bit_size);
697bf215546Sopenharmony_ci   }
698bf215546Sopenharmony_ci
699bf215546Sopenharmony_ci   case nir_intrinsic_ballot_bitfield_extract:
700bf215546Sopenharmony_ci   case nir_intrinsic_ballot_bit_count_reduce:
701bf215546Sopenharmony_ci   case nir_intrinsic_ballot_find_lsb:
702bf215546Sopenharmony_ci   case nir_intrinsic_ballot_find_msb: {
703bf215546Sopenharmony_ci      assert(intrin->src[0].is_ssa);
704bf215546Sopenharmony_ci      nir_ssa_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa,
705bf215546Sopenharmony_ci                                                 options);
706bf215546Sopenharmony_ci
707bf215546Sopenharmony_ci      if (intrin->intrinsic != nir_intrinsic_ballot_bitfield_extract &&
708bf215546Sopenharmony_ci          intrin->intrinsic != nir_intrinsic_ballot_find_lsb) {
709bf215546Sopenharmony_ci         /* For OpGroupNonUniformBallotFindMSB, the SPIR-V Spec says:
710bf215546Sopenharmony_ci          *
711bf215546Sopenharmony_ci          *    "Find the most significant bit set to 1 in Value, considering
712bf215546Sopenharmony_ci          *    only the bits in Value required to represent all bits of the
713bf215546Sopenharmony_ci          *    group’s invocations.  If none of the considered bits is set to
714bf215546Sopenharmony_ci          *    1, the result is undefined."
715bf215546Sopenharmony_ci          *
716bf215546Sopenharmony_ci          * It has similar text for the other three.  This means that, in case
717bf215546Sopenharmony_ci          * the subgroup size is less than 32, we have to mask off the unused
718bf215546Sopenharmony_ci          * bits.  If the subgroup size is fixed and greater than or equal to
719bf215546Sopenharmony_ci          * 32, the mask will be 0xffffffff and nir_opt_algebraic will delete
720bf215546Sopenharmony_ci          * the iand.
721bf215546Sopenharmony_ci          *
722bf215546Sopenharmony_ci          * We only have to worry about this for BitCount and FindMSB because
723bf215546Sopenharmony_ci          * FindLSB counts from the bottom and BitfieldExtract selects
724bf215546Sopenharmony_ci          * individual bits.  In either case, if run outside the range of
725bf215546Sopenharmony_ci          * valid bits, we hit the undefined results case and we can return
726bf215546Sopenharmony_ci          * anything we want.
727bf215546Sopenharmony_ci          */
728bf215546Sopenharmony_ci         int_val = nir_iand(b, int_val, build_subgroup_mask(b, options));
729bf215546Sopenharmony_ci      }
730bf215546Sopenharmony_ci
731bf215546Sopenharmony_ci      switch (intrin->intrinsic) {
732bf215546Sopenharmony_ci      case nir_intrinsic_ballot_bitfield_extract: {
733bf215546Sopenharmony_ci         assert(intrin->src[1].is_ssa);
734bf215546Sopenharmony_ci         nir_ssa_def *idx = intrin->src[1].ssa;
735bf215546Sopenharmony_ci         if (int_val->num_components > 1) {
736bf215546Sopenharmony_ci            /* idx will be truncated by nir_ushr, so we just need to select
737bf215546Sopenharmony_ci             * the right component using the bits of idx that are truncated in
738bf215546Sopenharmony_ci             * the shift.
739bf215546Sopenharmony_ci             */
740bf215546Sopenharmony_ci            int_val =
741bf215546Sopenharmony_ci               nir_vector_extract(b, int_val,
742bf215546Sopenharmony_ci                                  nir_udiv_imm(b, idx, int_val->bit_size));
743bf215546Sopenharmony_ci         }
744bf215546Sopenharmony_ci
745bf215546Sopenharmony_ci         return nir_test_mask(b, nir_ushr(b, int_val, idx), 1);
746bf215546Sopenharmony_ci      }
747bf215546Sopenharmony_ci      case nir_intrinsic_ballot_bit_count_reduce:
748bf215546Sopenharmony_ci         return vec_bit_count(b, int_val);
749bf215546Sopenharmony_ci      case nir_intrinsic_ballot_find_lsb:
750bf215546Sopenharmony_ci         return vec_find_lsb(b, int_val);
751bf215546Sopenharmony_ci      case nir_intrinsic_ballot_find_msb:
752bf215546Sopenharmony_ci         return vec_find_msb(b, int_val);
753bf215546Sopenharmony_ci      default:
754bf215546Sopenharmony_ci         unreachable("you seriously can't tell this is unreachable?");
755bf215546Sopenharmony_ci      }
756bf215546Sopenharmony_ci   }
757bf215546Sopenharmony_ci
758bf215546Sopenharmony_ci   case nir_intrinsic_ballot_bit_count_exclusive:
759bf215546Sopenharmony_ci   case nir_intrinsic_ballot_bit_count_inclusive: {
760bf215546Sopenharmony_ci      nir_ssa_def *mask;
761bf215546Sopenharmony_ci      if (intrin->intrinsic == nir_intrinsic_ballot_bit_count_inclusive) {
762bf215546Sopenharmony_ci         mask = nir_inot(b, build_subgroup_gt_mask(b, options));
763bf215546Sopenharmony_ci      } else {
764bf215546Sopenharmony_ci         mask = nir_inot(b, build_subgroup_ge_mask(b, options));
765bf215546Sopenharmony_ci      }
766bf215546Sopenharmony_ci
767bf215546Sopenharmony_ci      assert(intrin->src[0].is_ssa);
768bf215546Sopenharmony_ci      nir_ssa_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa,
769bf215546Sopenharmony_ci                                                 options);
770bf215546Sopenharmony_ci
771bf215546Sopenharmony_ci      return vec_bit_count(b, nir_iand(b, int_val, mask));
772bf215546Sopenharmony_ci   }
773bf215546Sopenharmony_ci
774bf215546Sopenharmony_ci   case nir_intrinsic_elect: {
775bf215546Sopenharmony_ci      if (!options->lower_elect)
776bf215546Sopenharmony_ci         return NULL;
777bf215546Sopenharmony_ci
778bf215546Sopenharmony_ci      return nir_ieq(b, nir_load_subgroup_invocation(b), nir_first_invocation(b));
779bf215546Sopenharmony_ci   }
780bf215546Sopenharmony_ci
781bf215546Sopenharmony_ci   case nir_intrinsic_shuffle:
782bf215546Sopenharmony_ci      if (options->lower_shuffle)
783bf215546Sopenharmony_ci         return lower_shuffle(b, intrin);
784bf215546Sopenharmony_ci      else if (options->lower_to_scalar && intrin->num_components > 1)
785bf215546Sopenharmony_ci         return lower_subgroup_op_to_scalar(b, intrin, options->lower_shuffle_to_32bit);
786bf215546Sopenharmony_ci      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
787bf215546Sopenharmony_ci         return lower_subgroup_op_to_32bit(b, intrin);
788bf215546Sopenharmony_ci      break;
789bf215546Sopenharmony_ci   case nir_intrinsic_shuffle_xor:
790bf215546Sopenharmony_ci   case nir_intrinsic_shuffle_up:
791bf215546Sopenharmony_ci   case nir_intrinsic_shuffle_down:
792bf215546Sopenharmony_ci      if (options->lower_relative_shuffle)
793bf215546Sopenharmony_ci         return lower_to_shuffle(b, intrin, options);
794bf215546Sopenharmony_ci      else if (options->lower_to_scalar && intrin->num_components > 1)
795bf215546Sopenharmony_ci         return lower_subgroup_op_to_scalar(b, intrin, options->lower_shuffle_to_32bit);
796bf215546Sopenharmony_ci      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
797bf215546Sopenharmony_ci         return lower_subgroup_op_to_32bit(b, intrin);
798bf215546Sopenharmony_ci      break;
799bf215546Sopenharmony_ci
800bf215546Sopenharmony_ci   case nir_intrinsic_quad_broadcast:
801bf215546Sopenharmony_ci   case nir_intrinsic_quad_swap_horizontal:
802bf215546Sopenharmony_ci   case nir_intrinsic_quad_swap_vertical:
803bf215546Sopenharmony_ci   case nir_intrinsic_quad_swap_diagonal:
804bf215546Sopenharmony_ci      if (options->lower_quad ||
805bf215546Sopenharmony_ci          (options->lower_quad_broadcast_dynamic &&
806bf215546Sopenharmony_ci           intrin->intrinsic == nir_intrinsic_quad_broadcast &&
807bf215546Sopenharmony_ci           !nir_src_is_const(intrin->src[1])))
808bf215546Sopenharmony_ci         return lower_dynamic_quad_broadcast(b, intrin, options);
809bf215546Sopenharmony_ci      else if (options->lower_to_scalar && intrin->num_components > 1)
810bf215546Sopenharmony_ci         return lower_subgroup_op_to_scalar(b, intrin, false);
811bf215546Sopenharmony_ci      break;
812bf215546Sopenharmony_ci
813bf215546Sopenharmony_ci   case nir_intrinsic_reduce: {
814bf215546Sopenharmony_ci      nir_ssa_def *ret = NULL;
815bf215546Sopenharmony_ci      /* A cluster size greater than the subgroup size is implemention defined */
816bf215546Sopenharmony_ci      if (options->subgroup_size &&
817bf215546Sopenharmony_ci          nir_intrinsic_cluster_size(intrin) >= options->subgroup_size) {
818bf215546Sopenharmony_ci         nir_intrinsic_set_cluster_size(intrin, 0);
819bf215546Sopenharmony_ci         ret = NIR_LOWER_INSTR_PROGRESS;
820bf215546Sopenharmony_ci      }
821bf215546Sopenharmony_ci      if (options->lower_to_scalar && intrin->num_components > 1)
822bf215546Sopenharmony_ci         ret = lower_subgroup_op_to_scalar(b, intrin, false);
823bf215546Sopenharmony_ci      return ret;
824bf215546Sopenharmony_ci   }
825bf215546Sopenharmony_ci   case nir_intrinsic_inclusive_scan:
826bf215546Sopenharmony_ci   case nir_intrinsic_exclusive_scan:
827bf215546Sopenharmony_ci      if (options->lower_to_scalar && intrin->num_components > 1)
828bf215546Sopenharmony_ci         return lower_subgroup_op_to_scalar(b, intrin, false);
829bf215546Sopenharmony_ci      break;
830bf215546Sopenharmony_ci
831bf215546Sopenharmony_ci   default:
832bf215546Sopenharmony_ci      break;
833bf215546Sopenharmony_ci   }
834bf215546Sopenharmony_ci
835bf215546Sopenharmony_ci   return NULL;
836bf215546Sopenharmony_ci}
837bf215546Sopenharmony_ci
838bf215546Sopenharmony_cibool
839bf215546Sopenharmony_cinir_lower_subgroups(nir_shader *shader,
840bf215546Sopenharmony_ci                    const nir_lower_subgroups_options *options)
841bf215546Sopenharmony_ci{
842bf215546Sopenharmony_ci   return nir_shader_lower_instructions(shader,
843bf215546Sopenharmony_ci                                        lower_subgroups_filter,
844bf215546Sopenharmony_ci                                        lower_subgroups_instr,
845bf215546Sopenharmony_ci                                        (void *)options);
846bf215546Sopenharmony_ci}
847