1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright © 2020 Intel Corporation
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
13bf215546Sopenharmony_ci * Software.
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21bf215546Sopenharmony_ci * IN THE SOFTWARE.
22bf215546Sopenharmony_ci */
23bf215546Sopenharmony_ci
24bf215546Sopenharmony_ci#include "nir.h"
25bf215546Sopenharmony_ci#include "nir_builder.h"
26bf215546Sopenharmony_ci#include "nir_phi_builder.h"
27bf215546Sopenharmony_ci#include "util/u_math.h"
28bf215546Sopenharmony_ci
29bf215546Sopenharmony_cistatic bool
30bf215546Sopenharmony_cimove_system_values_to_top(nir_shader *shader)
31bf215546Sopenharmony_ci{
32bf215546Sopenharmony_ci   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
33bf215546Sopenharmony_ci
34bf215546Sopenharmony_ci   bool progress = false;
35bf215546Sopenharmony_ci   nir_foreach_block(block, impl) {
36bf215546Sopenharmony_ci      nir_foreach_instr_safe(instr, block) {
37bf215546Sopenharmony_ci         if (instr->type != nir_instr_type_intrinsic)
38bf215546Sopenharmony_ci            continue;
39bf215546Sopenharmony_ci
40bf215546Sopenharmony_ci         /* These intrinsics not only can't be re-materialized but aren't
41bf215546Sopenharmony_ci          * preserved when moving to the continuation shader.  We have to move
42bf215546Sopenharmony_ci          * them to the top to ensure they get spilled as needed.
43bf215546Sopenharmony_ci          */
44bf215546Sopenharmony_ci         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
45bf215546Sopenharmony_ci         switch (intrin->intrinsic) {
46bf215546Sopenharmony_ci         case nir_intrinsic_load_shader_record_ptr:
47bf215546Sopenharmony_ci         case nir_intrinsic_load_btd_local_arg_addr_intel:
48bf215546Sopenharmony_ci            nir_instr_remove(instr);
49bf215546Sopenharmony_ci            nir_instr_insert(nir_before_cf_list(&impl->body), instr);
50bf215546Sopenharmony_ci            progress = true;
51bf215546Sopenharmony_ci            break;
52bf215546Sopenharmony_ci
53bf215546Sopenharmony_ci         default:
54bf215546Sopenharmony_ci            break;
55bf215546Sopenharmony_ci         }
56bf215546Sopenharmony_ci      }
57bf215546Sopenharmony_ci   }
58bf215546Sopenharmony_ci
59bf215546Sopenharmony_ci   if (progress) {
60bf215546Sopenharmony_ci      nir_metadata_preserve(impl, nir_metadata_block_index |
61bf215546Sopenharmony_ci                                  nir_metadata_dominance);
62bf215546Sopenharmony_ci   } else {
63bf215546Sopenharmony_ci      nir_metadata_preserve(impl, nir_metadata_all);
64bf215546Sopenharmony_ci   }
65bf215546Sopenharmony_ci
66bf215546Sopenharmony_ci   return progress;
67bf215546Sopenharmony_ci}
68bf215546Sopenharmony_ci
69bf215546Sopenharmony_cistatic bool
70bf215546Sopenharmony_ciinstr_is_shader_call(nir_instr *instr)
71bf215546Sopenharmony_ci{
72bf215546Sopenharmony_ci   if (instr->type != nir_instr_type_intrinsic)
73bf215546Sopenharmony_ci      return false;
74bf215546Sopenharmony_ci
75bf215546Sopenharmony_ci   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
76bf215546Sopenharmony_ci   return intrin->intrinsic == nir_intrinsic_trace_ray ||
77bf215546Sopenharmony_ci          intrin->intrinsic == nir_intrinsic_report_ray_intersection ||
78bf215546Sopenharmony_ci          intrin->intrinsic == nir_intrinsic_execute_callable;
79bf215546Sopenharmony_ci}
80bf215546Sopenharmony_ci
/* Previously named bitset, it had to be renamed as FreeBSD defines a struct
 * named bitset in sys/_bitset.h required by pthread_np.h which is included
 * from src/util/u_thread.h that is indirectly included by this file.
 *
 * A bit array paired with the number of bits it was created for.  In this
 * file it is indexed by SSA def index; the helpers that read or write it
 * treat any index >= size as not being a member.
 */
struct brw_bitset {
   /* Backing words; bitset_create() allocates BITSET_WORDS(size) of them. */
   BITSET_WORD *set;
   /* Number of bits the set was created for. */
   unsigned size;
};
89bf215546Sopenharmony_ci
90bf215546Sopenharmony_cistatic struct brw_bitset
91bf215546Sopenharmony_cibitset_create(void *mem_ctx, unsigned size)
92bf215546Sopenharmony_ci{
93bf215546Sopenharmony_ci   return (struct brw_bitset) {
94bf215546Sopenharmony_ci      .set = rzalloc_array(mem_ctx, BITSET_WORD, BITSET_WORDS(size)),
95bf215546Sopenharmony_ci      .size = size,
96bf215546Sopenharmony_ci   };
97bf215546Sopenharmony_ci}
98bf215546Sopenharmony_ci
99bf215546Sopenharmony_cistatic bool
100bf215546Sopenharmony_cisrc_is_in_bitset(nir_src *src, void *_set)
101bf215546Sopenharmony_ci{
102bf215546Sopenharmony_ci   struct brw_bitset *set = _set;
103bf215546Sopenharmony_ci   assert(src->is_ssa);
104bf215546Sopenharmony_ci
105bf215546Sopenharmony_ci   /* Any SSA values which were added after we generated liveness information
106bf215546Sopenharmony_ci    * are things generated by this pass and, while most of it is arithmetic
107bf215546Sopenharmony_ci    * which we could re-materialize, we don't need to because it's only used
108bf215546Sopenharmony_ci    * for a single load/store and so shouldn't cross any shader calls.
109bf215546Sopenharmony_ci    */
110bf215546Sopenharmony_ci   if (src->ssa->index >= set->size)
111bf215546Sopenharmony_ci      return false;
112bf215546Sopenharmony_ci
113bf215546Sopenharmony_ci   return BITSET_TEST(set->set, src->ssa->index);
114bf215546Sopenharmony_ci}
115bf215546Sopenharmony_ci
116bf215546Sopenharmony_cistatic void
117bf215546Sopenharmony_ciadd_ssa_def_to_bitset(nir_ssa_def *def, struct brw_bitset *set)
118bf215546Sopenharmony_ci{
119bf215546Sopenharmony_ci   if (def->index >= set->size)
120bf215546Sopenharmony_ci      return;
121bf215546Sopenharmony_ci
122bf215546Sopenharmony_ci   BITSET_SET(set->set, def->index);
123bf215546Sopenharmony_ci}
124bf215546Sopenharmony_ci
125bf215546Sopenharmony_cistatic bool
126bf215546Sopenharmony_cican_remat_instr(nir_instr *instr, struct brw_bitset *remat)
127bf215546Sopenharmony_ci{
128bf215546Sopenharmony_ci   /* Set of all values which are trivially re-materializable and we shouldn't
129bf215546Sopenharmony_ci    * ever spill them.  This includes:
130bf215546Sopenharmony_ci    *
131bf215546Sopenharmony_ci    *   - Undef values
132bf215546Sopenharmony_ci    *   - Constants
133bf215546Sopenharmony_ci    *   - Uniforms (UBO or push constant)
134bf215546Sopenharmony_ci    *   - ALU combinations of any of the above
135bf215546Sopenharmony_ci    *   - Derefs which are either complete or casts of any of the above
136bf215546Sopenharmony_ci    *
137bf215546Sopenharmony_ci    * Because this pass rewrites things in-order and phis are always turned
138bf215546Sopenharmony_ci    * into register writes, We can use "is it SSA?" to answer the question
139bf215546Sopenharmony_ci    * "can my source be re-materialized?".
140bf215546Sopenharmony_ci    */
141bf215546Sopenharmony_ci   switch (instr->type) {
142bf215546Sopenharmony_ci   case nir_instr_type_alu:
143bf215546Sopenharmony_ci      if (!nir_instr_as_alu(instr)->dest.dest.is_ssa)
144bf215546Sopenharmony_ci         return false;
145bf215546Sopenharmony_ci
146bf215546Sopenharmony_ci      return nir_foreach_src(instr, src_is_in_bitset, remat);
147bf215546Sopenharmony_ci
148bf215546Sopenharmony_ci   case nir_instr_type_deref:
149bf215546Sopenharmony_ci      return nir_foreach_src(instr, src_is_in_bitset, remat);
150bf215546Sopenharmony_ci
151bf215546Sopenharmony_ci   case nir_instr_type_intrinsic: {
152bf215546Sopenharmony_ci      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
153bf215546Sopenharmony_ci      switch (intrin->intrinsic) {
154bf215546Sopenharmony_ci      case nir_intrinsic_load_ubo:
155bf215546Sopenharmony_ci      case nir_intrinsic_vulkan_resource_index:
156bf215546Sopenharmony_ci      case nir_intrinsic_vulkan_resource_reindex:
157bf215546Sopenharmony_ci      case nir_intrinsic_load_vulkan_descriptor:
158bf215546Sopenharmony_ci      case nir_intrinsic_load_push_constant:
159bf215546Sopenharmony_ci         /* These intrinsics don't need to be spilled as long as they don't
160bf215546Sopenharmony_ci          * depend on any spilled values.
161bf215546Sopenharmony_ci          */
162bf215546Sopenharmony_ci         return nir_foreach_src(instr, src_is_in_bitset, remat);
163bf215546Sopenharmony_ci
164bf215546Sopenharmony_ci      case nir_intrinsic_load_scratch_base_ptr:
165bf215546Sopenharmony_ci      case nir_intrinsic_load_ray_launch_id:
166bf215546Sopenharmony_ci      case nir_intrinsic_load_topology_id_intel:
167bf215546Sopenharmony_ci      case nir_intrinsic_load_btd_global_arg_addr_intel:
168bf215546Sopenharmony_ci      case nir_intrinsic_load_btd_resume_sbt_addr_intel:
169bf215546Sopenharmony_ci      case nir_intrinsic_load_ray_base_mem_addr_intel:
170bf215546Sopenharmony_ci      case nir_intrinsic_load_ray_hw_stack_size_intel:
171bf215546Sopenharmony_ci      case nir_intrinsic_load_ray_sw_stack_size_intel:
172bf215546Sopenharmony_ci      case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
173bf215546Sopenharmony_ci      case nir_intrinsic_load_ray_hit_sbt_addr_intel:
174bf215546Sopenharmony_ci      case nir_intrinsic_load_ray_hit_sbt_stride_intel:
175bf215546Sopenharmony_ci      case nir_intrinsic_load_ray_miss_sbt_addr_intel:
176bf215546Sopenharmony_ci      case nir_intrinsic_load_ray_miss_sbt_stride_intel:
177bf215546Sopenharmony_ci      case nir_intrinsic_load_callable_sbt_addr_intel:
178bf215546Sopenharmony_ci      case nir_intrinsic_load_callable_sbt_stride_intel:
179bf215546Sopenharmony_ci      case nir_intrinsic_load_reloc_const_intel:
180bf215546Sopenharmony_ci      case nir_intrinsic_load_ray_query_global_intel:
181bf215546Sopenharmony_ci         /* Notably missing from the above list is btd_local_arg_addr_intel.
182bf215546Sopenharmony_ci          * This is because the resume shader will have a different local
183bf215546Sopenharmony_ci          * argument pointer because it has a different BSR.  Any access of
184bf215546Sopenharmony_ci          * the original shader's local arguments needs to be preserved so
185bf215546Sopenharmony_ci          * that pointer has to be saved on the stack.
186bf215546Sopenharmony_ci          *
187bf215546Sopenharmony_ci          * TODO: There may be some system values we want to avoid
188bf215546Sopenharmony_ci          *       re-materializing as well but we have to be very careful
189bf215546Sopenharmony_ci          *       to ensure that it's a system value which cannot change
190bf215546Sopenharmony_ci          *       across a shader call.
191bf215546Sopenharmony_ci          */
192bf215546Sopenharmony_ci         return true;
193bf215546Sopenharmony_ci
194bf215546Sopenharmony_ci      default:
195bf215546Sopenharmony_ci         return false;
196bf215546Sopenharmony_ci      }
197bf215546Sopenharmony_ci   }
198bf215546Sopenharmony_ci
199bf215546Sopenharmony_ci   case nir_instr_type_ssa_undef:
200bf215546Sopenharmony_ci   case nir_instr_type_load_const:
201bf215546Sopenharmony_ci      return true;
202bf215546Sopenharmony_ci
203bf215546Sopenharmony_ci   default:
204bf215546Sopenharmony_ci      return false;
205bf215546Sopenharmony_ci   }
206bf215546Sopenharmony_ci}
207bf215546Sopenharmony_ci
208bf215546Sopenharmony_cistatic bool
209bf215546Sopenharmony_cican_remat_ssa_def(nir_ssa_def *def, struct brw_bitset *remat)
210bf215546Sopenharmony_ci{
211bf215546Sopenharmony_ci   return can_remat_instr(def->parent_instr, remat);
212bf215546Sopenharmony_ci}
213bf215546Sopenharmony_ci
214bf215546Sopenharmony_cistatic nir_ssa_def *
215bf215546Sopenharmony_ciremat_ssa_def(nir_builder *b, nir_ssa_def *def)
216bf215546Sopenharmony_ci{
217bf215546Sopenharmony_ci   nir_instr *clone = nir_instr_clone(b->shader, def->parent_instr);
218bf215546Sopenharmony_ci   nir_builder_instr_insert(b, clone);
219bf215546Sopenharmony_ci   return nir_instr_ssa_def(clone);
220bf215546Sopenharmony_ci}
221bf215546Sopenharmony_ci
/* Bounds-checked table of phi-builder values, looked up by SSA def index
 * (see get_phi_builder_value_for_def()).
 */
struct pbv_array {
   /* One slot per SSA def index.  A NULL slot means there is no phi-builder
    * value for that def, and sources referring to it are left untouched.
    */
   struct nir_phi_builder_value **arr;
   /* Number of slots in arr; indices >= len are treated as absent. */
   unsigned len;
};
226bf215546Sopenharmony_ci
227bf215546Sopenharmony_cistatic struct nir_phi_builder_value *
228bf215546Sopenharmony_ciget_phi_builder_value_for_def(nir_ssa_def *def,
229bf215546Sopenharmony_ci                              struct pbv_array *pbv_arr)
230bf215546Sopenharmony_ci{
231bf215546Sopenharmony_ci   if (def->index >= pbv_arr->len)
232bf215546Sopenharmony_ci      return NULL;
233bf215546Sopenharmony_ci
234bf215546Sopenharmony_ci   return pbv_arr->arr[def->index];
235bf215546Sopenharmony_ci}
236bf215546Sopenharmony_ci
237bf215546Sopenharmony_cistatic nir_ssa_def *
238bf215546Sopenharmony_ciget_phi_builder_def_for_src(nir_src *src, struct pbv_array *pbv_arr,
239bf215546Sopenharmony_ci                            nir_block *block)
240bf215546Sopenharmony_ci{
241bf215546Sopenharmony_ci   assert(src->is_ssa);
242bf215546Sopenharmony_ci
243bf215546Sopenharmony_ci   struct nir_phi_builder_value *pbv =
244bf215546Sopenharmony_ci      get_phi_builder_value_for_def(src->ssa, pbv_arr);
245bf215546Sopenharmony_ci   if (pbv == NULL)
246bf215546Sopenharmony_ci      return NULL;
247bf215546Sopenharmony_ci
248bf215546Sopenharmony_ci   return nir_phi_builder_value_get_block_def(pbv, block);
249bf215546Sopenharmony_ci}
250bf215546Sopenharmony_ci
251bf215546Sopenharmony_cistatic bool
252bf215546Sopenharmony_cirewrite_instr_src_from_phi_builder(nir_src *src, void *_pbv_arr)
253bf215546Sopenharmony_ci{
254bf215546Sopenharmony_ci   nir_block *block;
255bf215546Sopenharmony_ci   if (src->parent_instr->type == nir_instr_type_phi) {
256bf215546Sopenharmony_ci      nir_phi_src *phi_src = exec_node_data(nir_phi_src, src, src);
257bf215546Sopenharmony_ci      block = phi_src->pred;
258bf215546Sopenharmony_ci   } else {
259bf215546Sopenharmony_ci      block = src->parent_instr->block;
260bf215546Sopenharmony_ci   }
261bf215546Sopenharmony_ci
262bf215546Sopenharmony_ci   nir_ssa_def *new_def = get_phi_builder_def_for_src(src, _pbv_arr, block);
263bf215546Sopenharmony_ci   if (new_def != NULL)
264bf215546Sopenharmony_ci      nir_instr_rewrite_src(src->parent_instr, src, nir_src_for_ssa(new_def));
265bf215546Sopenharmony_ci   return true;
266bf215546Sopenharmony_ci}
267bf215546Sopenharmony_ci
268bf215546Sopenharmony_cistatic nir_ssa_def *
269bf215546Sopenharmony_cispill_fill(nir_builder *before, nir_builder *after, nir_ssa_def *def, unsigned offset,
270bf215546Sopenharmony_ci           nir_address_format address_format, unsigned stack_alignment)
271bf215546Sopenharmony_ci{
272bf215546Sopenharmony_ci   const unsigned comp_size = def->bit_size / 8;
273bf215546Sopenharmony_ci
274bf215546Sopenharmony_ci   switch(address_format) {
275bf215546Sopenharmony_ci   case nir_address_format_32bit_offset:
276bf215546Sopenharmony_ci      nir_store_scratch(before, def, nir_imm_int(before, offset),
277bf215546Sopenharmony_ci                        .align_mul = MIN2(comp_size, stack_alignment),
278bf215546Sopenharmony_ci                        .write_mask = BITFIELD_MASK(def->num_components));
279bf215546Sopenharmony_ci      def = nir_load_scratch(after, def->num_components, def->bit_size,
280bf215546Sopenharmony_ci                             nir_imm_int(after, offset), .align_mul = MIN2(comp_size, stack_alignment));
281bf215546Sopenharmony_ci      break;
282bf215546Sopenharmony_ci   case nir_address_format_64bit_global: {
283bf215546Sopenharmony_ci      nir_ssa_def *addr = nir_iadd_imm(before, nir_load_scratch_base_ptr(before, 1, 64, 1), offset);
284bf215546Sopenharmony_ci      nir_store_global(before, addr, MIN2(comp_size, stack_alignment), def, ~0);
285bf215546Sopenharmony_ci      addr = nir_iadd_imm(after, nir_load_scratch_base_ptr(after, 1, 64, 1), offset);
286bf215546Sopenharmony_ci      def = nir_load_global(after, addr, MIN2(comp_size, stack_alignment),
287bf215546Sopenharmony_ci                            def->num_components, def->bit_size);
288bf215546Sopenharmony_ci      break;
289bf215546Sopenharmony_ci   }
290bf215546Sopenharmony_ci   default:
291bf215546Sopenharmony_ci      unreachable("Unimplemented address format");
292bf215546Sopenharmony_ci   }
293bf215546Sopenharmony_ci   return def;
294bf215546Sopenharmony_ci}
295bf215546Sopenharmony_ci
296bf215546Sopenharmony_cistatic void
297bf215546Sopenharmony_cispill_ssa_defs_and_lower_shader_calls(nir_shader *shader, uint32_t num_calls,
298bf215546Sopenharmony_ci                                      nir_address_format address_format,
299bf215546Sopenharmony_ci                                      unsigned stack_alignment)
300bf215546Sopenharmony_ci{
301bf215546Sopenharmony_ci   /* TODO: If a SSA def is filled more than once, we probably want to just
302bf215546Sopenharmony_ci    *       spill it at the LCM of the fill sites so we avoid unnecessary
303bf215546Sopenharmony_ci    *       extra spills
304bf215546Sopenharmony_ci    *
305bf215546Sopenharmony_ci    * TODO: If a SSA def is defined outside a loop but live through some call
306bf215546Sopenharmony_ci    *       inside the loop, we probably want to spill outside the loop.  We
307bf215546Sopenharmony_ci    *       may also want to fill outside the loop if it's not used in the
308bf215546Sopenharmony_ci    *       loop.
309bf215546Sopenharmony_ci    *
310bf215546Sopenharmony_ci    * TODO: Right now, we only re-materialize things if their immediate
311bf215546Sopenharmony_ci    *       sources are things which we filled.  We probably want to expand
312bf215546Sopenharmony_ci    *       that to re-materialize things whose sources are things we can
313bf215546Sopenharmony_ci    *       re-materialize from things we filled.  We may want some DAG depth
314bf215546Sopenharmony_ci    *       heuristic on this.
315bf215546Sopenharmony_ci    */
316bf215546Sopenharmony_ci
317bf215546Sopenharmony_ci   /* This happens per-shader rather than per-impl because we mess with
318bf215546Sopenharmony_ci    * nir_shader::scratch_size.
319bf215546Sopenharmony_ci    */
320bf215546Sopenharmony_ci   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
321bf215546Sopenharmony_ci
322bf215546Sopenharmony_ci   nir_metadata_require(impl, nir_metadata_live_ssa_defs |
323bf215546Sopenharmony_ci                              nir_metadata_dominance |
324bf215546Sopenharmony_ci                              nir_metadata_block_index);
325bf215546Sopenharmony_ci
326bf215546Sopenharmony_ci   void *mem_ctx = ralloc_context(shader);
327bf215546Sopenharmony_ci
328bf215546Sopenharmony_ci   const unsigned num_ssa_defs = impl->ssa_alloc;
329bf215546Sopenharmony_ci   const unsigned live_words = BITSET_WORDS(num_ssa_defs);
330bf215546Sopenharmony_ci   struct brw_bitset trivial_remat = bitset_create(mem_ctx, num_ssa_defs);
331bf215546Sopenharmony_ci
332bf215546Sopenharmony_ci   /* Array of all live SSA defs which are spill candidates */
333bf215546Sopenharmony_ci   nir_ssa_def **spill_defs =
334bf215546Sopenharmony_ci      rzalloc_array(mem_ctx, nir_ssa_def *, num_ssa_defs);
335bf215546Sopenharmony_ci
336bf215546Sopenharmony_ci   /* For each spill candidate, an array of every time it's defined by a fill,
337bf215546Sopenharmony_ci    * indexed by call instruction index.
338bf215546Sopenharmony_ci    */
339bf215546Sopenharmony_ci   nir_ssa_def ***fill_defs =
340bf215546Sopenharmony_ci      rzalloc_array(mem_ctx, nir_ssa_def **, num_ssa_defs);
341bf215546Sopenharmony_ci
342bf215546Sopenharmony_ci   /* For each call instruction, the liveness set at the call */
343bf215546Sopenharmony_ci   const BITSET_WORD **call_live =
344bf215546Sopenharmony_ci      rzalloc_array(mem_ctx, const BITSET_WORD *, num_calls);
345bf215546Sopenharmony_ci
346bf215546Sopenharmony_ci   /* For each call instruction, the block index of the block it lives in */
347bf215546Sopenharmony_ci   uint32_t *call_block_indices = rzalloc_array(mem_ctx, uint32_t, num_calls);
348bf215546Sopenharmony_ci
349bf215546Sopenharmony_ci   /* Walk the call instructions and fetch the liveness set and block index
350bf215546Sopenharmony_ci    * for each one.  We need to do this before we start modifying the shader
351bf215546Sopenharmony_ci    * so that liveness doesn't complain that it's been invalidated.  Don't
352bf215546Sopenharmony_ci    * worry, we'll be very careful with our live sets. :-)
353bf215546Sopenharmony_ci    */
354bf215546Sopenharmony_ci   unsigned call_idx = 0;
355bf215546Sopenharmony_ci   nir_foreach_block(block, impl) {
356bf215546Sopenharmony_ci      nir_foreach_instr(instr, block) {
357bf215546Sopenharmony_ci         if (!instr_is_shader_call(instr))
358bf215546Sopenharmony_ci            continue;
359bf215546Sopenharmony_ci
360bf215546Sopenharmony_ci         call_block_indices[call_idx] = block->index;
361bf215546Sopenharmony_ci
362bf215546Sopenharmony_ci         /* The objective here is to preserve values around shader call
363bf215546Sopenharmony_ci          * instructions.  Therefore, we use the live set after the
364bf215546Sopenharmony_ci          * instruction as the set of things we want to preserve.  Because
365bf215546Sopenharmony_ci          * none of our shader call intrinsics return anything, we don't have
366bf215546Sopenharmony_ci          * to worry about spilling over a return value.
367bf215546Sopenharmony_ci          *
368bf215546Sopenharmony_ci          * TODO: This isn't quite true for report_intersection.
369bf215546Sopenharmony_ci          */
370bf215546Sopenharmony_ci         call_live[call_idx] =
371bf215546Sopenharmony_ci            nir_get_live_ssa_defs(nir_after_instr(instr), mem_ctx);
372bf215546Sopenharmony_ci
373bf215546Sopenharmony_ci         call_idx++;
374bf215546Sopenharmony_ci      }
375bf215546Sopenharmony_ci   }
376bf215546Sopenharmony_ci
377bf215546Sopenharmony_ci   nir_builder before, after;
378bf215546Sopenharmony_ci   nir_builder_init(&before, impl);
379bf215546Sopenharmony_ci   nir_builder_init(&after, impl);
380bf215546Sopenharmony_ci
381bf215546Sopenharmony_ci   call_idx = 0;
382bf215546Sopenharmony_ci   unsigned max_scratch_size = shader->scratch_size;
383bf215546Sopenharmony_ci   nir_foreach_block(block, impl) {
384bf215546Sopenharmony_ci      nir_foreach_instr_safe(instr, block) {
385bf215546Sopenharmony_ci         nir_ssa_def *def = nir_instr_ssa_def(instr);
386bf215546Sopenharmony_ci         if (def != NULL) {
387bf215546Sopenharmony_ci            if (can_remat_ssa_def(def, &trivial_remat)) {
388bf215546Sopenharmony_ci               add_ssa_def_to_bitset(def, &trivial_remat);
389bf215546Sopenharmony_ci            } else {
390bf215546Sopenharmony_ci               spill_defs[def->index] = def;
391bf215546Sopenharmony_ci            }
392bf215546Sopenharmony_ci         }
393bf215546Sopenharmony_ci
394bf215546Sopenharmony_ci         if (!instr_is_shader_call(instr))
395bf215546Sopenharmony_ci            continue;
396bf215546Sopenharmony_ci
397bf215546Sopenharmony_ci         const BITSET_WORD *live = call_live[call_idx];
398bf215546Sopenharmony_ci
399bf215546Sopenharmony_ci         /* Make a copy of trivial_remat that we'll update as we crawl through
400bf215546Sopenharmony_ci          * the live SSA defs and unspill them.
401bf215546Sopenharmony_ci          */
402bf215546Sopenharmony_ci         struct brw_bitset remat = bitset_create(mem_ctx, num_ssa_defs);
403bf215546Sopenharmony_ci         memcpy(remat.set, trivial_remat.set, live_words * sizeof(BITSET_WORD));
404bf215546Sopenharmony_ci
         /* Because the two builders are always separated by the call
          * instruction, it won't break anything to have two of them.
          */
408bf215546Sopenharmony_ci         before.cursor = nir_before_instr(instr);
409bf215546Sopenharmony_ci         after.cursor = nir_after_instr(instr);
410bf215546Sopenharmony_ci
411bf215546Sopenharmony_ci         unsigned offset = shader->scratch_size;
412bf215546Sopenharmony_ci         for (unsigned w = 0; w < live_words; w++) {
413bf215546Sopenharmony_ci            BITSET_WORD spill_mask = live[w] & ~trivial_remat.set[w];
414bf215546Sopenharmony_ci            while (spill_mask) {
415bf215546Sopenharmony_ci               int i = u_bit_scan(&spill_mask);
416bf215546Sopenharmony_ci               assert(i >= 0);
417bf215546Sopenharmony_ci               unsigned index = w * BITSET_WORDBITS + i;
418bf215546Sopenharmony_ci               assert(index < num_ssa_defs);
419bf215546Sopenharmony_ci
420bf215546Sopenharmony_ci               nir_ssa_def *def = spill_defs[index];
421bf215546Sopenharmony_ci               if (can_remat_ssa_def(def, &remat)) {
422bf215546Sopenharmony_ci                  /* If this SSA def is re-materializable or based on other
423bf215546Sopenharmony_ci                   * things we've already spilled, re-materialize it rather
424bf215546Sopenharmony_ci                   * than spilling and filling.  Anything which is trivially
425bf215546Sopenharmony_ci                   * re-materializable won't even get here because we take
426bf215546Sopenharmony_ci                   * those into account in spill_mask above.
427bf215546Sopenharmony_ci                   */
428bf215546Sopenharmony_ci                  def = remat_ssa_def(&after, def);
429bf215546Sopenharmony_ci               } else {
430bf215546Sopenharmony_ci                  bool is_bool = def->bit_size == 1;
431bf215546Sopenharmony_ci                  if (is_bool)
432bf215546Sopenharmony_ci                     def = nir_b2b32(&before, def);
433bf215546Sopenharmony_ci
434bf215546Sopenharmony_ci                  const unsigned comp_size = def->bit_size / 8;
435bf215546Sopenharmony_ci                  offset = ALIGN(offset, comp_size);
436bf215546Sopenharmony_ci
437bf215546Sopenharmony_ci                  def = spill_fill(&before, &after, def, offset,
438bf215546Sopenharmony_ci                                   address_format,stack_alignment);
439bf215546Sopenharmony_ci
440bf215546Sopenharmony_ci                  if (is_bool)
441bf215546Sopenharmony_ci                     def = nir_b2b1(&after, def);
442bf215546Sopenharmony_ci
443bf215546Sopenharmony_ci                  offset += def->num_components * comp_size;
444bf215546Sopenharmony_ci               }
445bf215546Sopenharmony_ci
446bf215546Sopenharmony_ci               /* Mark this SSA def as available in the remat set so that, if
447bf215546Sopenharmony_ci                * some other SSA def we need is computed based on it, we can
448bf215546Sopenharmony_ci                * just re-compute instead of fetching from memory.
449bf215546Sopenharmony_ci                */
450bf215546Sopenharmony_ci               BITSET_SET(remat.set, index);
451bf215546Sopenharmony_ci
452bf215546Sopenharmony_ci               /* For now, we just make a note of this new SSA def.  We'll
453bf215546Sopenharmony_ci                * fix things up with the phi builder as a second pass.
454bf215546Sopenharmony_ci                */
455bf215546Sopenharmony_ci               if (fill_defs[index] == NULL) {
456bf215546Sopenharmony_ci                  fill_defs[index] =
457bf215546Sopenharmony_ci                     rzalloc_array(mem_ctx, nir_ssa_def *, num_calls);
458bf215546Sopenharmony_ci               }
459bf215546Sopenharmony_ci               fill_defs[index][call_idx] = def;
460bf215546Sopenharmony_ci            }
461bf215546Sopenharmony_ci         }
462bf215546Sopenharmony_ci
463bf215546Sopenharmony_ci         nir_builder *b = &before;
464bf215546Sopenharmony_ci
465bf215546Sopenharmony_ci         offset = ALIGN(offset, stack_alignment);
466bf215546Sopenharmony_ci         max_scratch_size = MAX2(max_scratch_size, offset);
467bf215546Sopenharmony_ci
468bf215546Sopenharmony_ci         /* First thing on the called shader's stack is the resume address
469bf215546Sopenharmony_ci          * followed by a pointer to the payload.
470bf215546Sopenharmony_ci          */
471bf215546Sopenharmony_ci         nir_intrinsic_instr *call = nir_instr_as_intrinsic(instr);
472bf215546Sopenharmony_ci
473bf215546Sopenharmony_ci         /* Lower to generic intrinsics with information about the stack & resume shader. */
474bf215546Sopenharmony_ci         switch (call->intrinsic) {
475bf215546Sopenharmony_ci         case nir_intrinsic_trace_ray: {
476bf215546Sopenharmony_ci            nir_rt_trace_ray(b, call->src[0].ssa, call->src[1].ssa,
477bf215546Sopenharmony_ci                              call->src[2].ssa, call->src[3].ssa,
478bf215546Sopenharmony_ci                              call->src[4].ssa, call->src[5].ssa,
479bf215546Sopenharmony_ci                              call->src[6].ssa, call->src[7].ssa,
480bf215546Sopenharmony_ci                              call->src[8].ssa, call->src[9].ssa,
481bf215546Sopenharmony_ci                              call->src[10].ssa,
482bf215546Sopenharmony_ci                              .call_idx = call_idx, .stack_size = offset);
483bf215546Sopenharmony_ci            break;
484bf215546Sopenharmony_ci         }
485bf215546Sopenharmony_ci
486bf215546Sopenharmony_ci         case nir_intrinsic_report_ray_intersection:
487bf215546Sopenharmony_ci            unreachable("Any-hit shaders must be inlined");
488bf215546Sopenharmony_ci
489bf215546Sopenharmony_ci         case nir_intrinsic_execute_callable: {
490bf215546Sopenharmony_ci            nir_rt_execute_callable(b, call->src[0].ssa, call->src[1].ssa, .call_idx = call_idx, .stack_size = offset);
491bf215546Sopenharmony_ci            break;
492bf215546Sopenharmony_ci         }
493bf215546Sopenharmony_ci
494bf215546Sopenharmony_ci         default:
495bf215546Sopenharmony_ci            unreachable("Invalid shader call instruction");
496bf215546Sopenharmony_ci         }
497bf215546Sopenharmony_ci
498bf215546Sopenharmony_ci         nir_rt_resume(b, .call_idx = call_idx, .stack_size = offset);
499bf215546Sopenharmony_ci
500bf215546Sopenharmony_ci         nir_instr_remove(&call->instr);
501bf215546Sopenharmony_ci
502bf215546Sopenharmony_ci         call_idx++;
503bf215546Sopenharmony_ci      }
504bf215546Sopenharmony_ci   }
505bf215546Sopenharmony_ci   assert(call_idx == num_calls);
506bf215546Sopenharmony_ci   shader->scratch_size = max_scratch_size;
507bf215546Sopenharmony_ci
508bf215546Sopenharmony_ci   struct nir_phi_builder *pb = nir_phi_builder_create(impl);
509bf215546Sopenharmony_ci   struct pbv_array pbv_arr = {
510bf215546Sopenharmony_ci      .arr = rzalloc_array(mem_ctx, struct nir_phi_builder_value *,
511bf215546Sopenharmony_ci                           num_ssa_defs),
512bf215546Sopenharmony_ci      .len = num_ssa_defs,
513bf215546Sopenharmony_ci   };
514bf215546Sopenharmony_ci
515bf215546Sopenharmony_ci   const unsigned block_words = BITSET_WORDS(impl->num_blocks);
516bf215546Sopenharmony_ci   BITSET_WORD *def_blocks = ralloc_array(mem_ctx, BITSET_WORD, block_words);
517bf215546Sopenharmony_ci
518bf215546Sopenharmony_ci   /* Go through and set up phi builder values for each spillable value which
519bf215546Sopenharmony_ci    * we ever needed to spill at any point.
520bf215546Sopenharmony_ci    */
521bf215546Sopenharmony_ci   for (unsigned index = 0; index < num_ssa_defs; index++) {
522bf215546Sopenharmony_ci      if (fill_defs[index] == NULL)
523bf215546Sopenharmony_ci         continue;
524bf215546Sopenharmony_ci
525bf215546Sopenharmony_ci      nir_ssa_def *def = spill_defs[index];
526bf215546Sopenharmony_ci
527bf215546Sopenharmony_ci      memset(def_blocks, 0, block_words * sizeof(BITSET_WORD));
528bf215546Sopenharmony_ci      BITSET_SET(def_blocks, def->parent_instr->block->index);
529bf215546Sopenharmony_ci      for (unsigned call_idx = 0; call_idx < num_calls; call_idx++) {
530bf215546Sopenharmony_ci         if (fill_defs[index][call_idx] != NULL)
531bf215546Sopenharmony_ci            BITSET_SET(def_blocks, call_block_indices[call_idx]);
532bf215546Sopenharmony_ci      }
533bf215546Sopenharmony_ci
534bf215546Sopenharmony_ci      pbv_arr.arr[index] = nir_phi_builder_add_value(pb, def->num_components,
535bf215546Sopenharmony_ci                                                     def->bit_size, def_blocks);
536bf215546Sopenharmony_ci   }
537bf215546Sopenharmony_ci
538bf215546Sopenharmony_ci   /* Walk the shader one more time and rewrite SSA defs as needed using the
539bf215546Sopenharmony_ci    * phi builder.
540bf215546Sopenharmony_ci    */
541bf215546Sopenharmony_ci   nir_foreach_block(block, impl) {
542bf215546Sopenharmony_ci      nir_foreach_instr_safe(instr, block) {
543bf215546Sopenharmony_ci         nir_ssa_def *def = nir_instr_ssa_def(instr);
544bf215546Sopenharmony_ci         if (def != NULL) {
545bf215546Sopenharmony_ci            struct nir_phi_builder_value *pbv =
546bf215546Sopenharmony_ci               get_phi_builder_value_for_def(def, &pbv_arr);
547bf215546Sopenharmony_ci            if (pbv != NULL)
548bf215546Sopenharmony_ci               nir_phi_builder_value_set_block_def(pbv, block, def);
549bf215546Sopenharmony_ci         }
550bf215546Sopenharmony_ci
551bf215546Sopenharmony_ci         if (instr->type == nir_instr_type_phi)
552bf215546Sopenharmony_ci            continue;
553bf215546Sopenharmony_ci
554bf215546Sopenharmony_ci         nir_foreach_src(instr, rewrite_instr_src_from_phi_builder, &pbv_arr);
555bf215546Sopenharmony_ci
556bf215546Sopenharmony_ci         if (instr->type != nir_instr_type_intrinsic)
557bf215546Sopenharmony_ci            continue;
558bf215546Sopenharmony_ci
559bf215546Sopenharmony_ci         nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr);
560bf215546Sopenharmony_ci         if (resume->intrinsic != nir_intrinsic_rt_resume)
561bf215546Sopenharmony_ci            continue;
562bf215546Sopenharmony_ci
563bf215546Sopenharmony_ci         call_idx = nir_intrinsic_call_idx(resume);
564bf215546Sopenharmony_ci
565bf215546Sopenharmony_ci         /* Technically, this is the wrong place to add the fill defs to the
566bf215546Sopenharmony_ci          * phi builder values because we haven't seen any of the load_scratch
567bf215546Sopenharmony_ci          * instructions for this call yet.  However, we know based on how we
568bf215546Sopenharmony_ci          * emitted them that no value ever gets used until after the load
569bf215546Sopenharmony_ci          * instruction has been emitted so this should be safe.  If we ever
          * fail validation due to this, it likely means a bug in our spilling
571bf215546Sopenharmony_ci          * code and not the phi re-construction code here.
572bf215546Sopenharmony_ci          */
573bf215546Sopenharmony_ci         for (unsigned index = 0; index < num_ssa_defs; index++) {
574bf215546Sopenharmony_ci            if (fill_defs[index] && fill_defs[index][call_idx]) {
575bf215546Sopenharmony_ci               nir_phi_builder_value_set_block_def(pbv_arr.arr[index], block,
576bf215546Sopenharmony_ci                                                   fill_defs[index][call_idx]);
577bf215546Sopenharmony_ci            }
578bf215546Sopenharmony_ci         }
579bf215546Sopenharmony_ci      }
580bf215546Sopenharmony_ci
581bf215546Sopenharmony_ci      nir_if *following_if = nir_block_get_following_if(block);
582bf215546Sopenharmony_ci      if (following_if) {
583bf215546Sopenharmony_ci         nir_ssa_def *new_def =
584bf215546Sopenharmony_ci            get_phi_builder_def_for_src(&following_if->condition,
585bf215546Sopenharmony_ci                                        &pbv_arr, block);
586bf215546Sopenharmony_ci         if (new_def != NULL)
587bf215546Sopenharmony_ci            nir_if_rewrite_condition(following_if, nir_src_for_ssa(new_def));
588bf215546Sopenharmony_ci      }
589bf215546Sopenharmony_ci
590bf215546Sopenharmony_ci      /* Handle phi sources that source from this block.  We have to do this
591bf215546Sopenharmony_ci       * as a separate pass because the phi builder assumes that uses and
592bf215546Sopenharmony_ci       * defs are processed in an order that respects dominance.  When we have
593bf215546Sopenharmony_ci       * loops, a phi source may be a back-edge so we have to handle it as if
594bf215546Sopenharmony_ci       * it were one of the last instructions in the predecessor block.
595bf215546Sopenharmony_ci       */
596bf215546Sopenharmony_ci      nir_foreach_phi_src_leaving_block(block,
597bf215546Sopenharmony_ci                                        rewrite_instr_src_from_phi_builder,
598bf215546Sopenharmony_ci                                        &pbv_arr);
599bf215546Sopenharmony_ci   }
600bf215546Sopenharmony_ci
601bf215546Sopenharmony_ci   nir_phi_builder_finish(pb);
602bf215546Sopenharmony_ci
603bf215546Sopenharmony_ci   ralloc_free(mem_ctx);
604bf215546Sopenharmony_ci
605bf215546Sopenharmony_ci   nir_metadata_preserve(impl, nir_metadata_block_index |
606bf215546Sopenharmony_ci                               nir_metadata_dominance);
607bf215546Sopenharmony_ci}
608bf215546Sopenharmony_ci
609bf215546Sopenharmony_cistatic nir_instr *
610bf215546Sopenharmony_cifind_resume_instr(nir_function_impl *impl, unsigned call_idx)
611bf215546Sopenharmony_ci{
612bf215546Sopenharmony_ci   nir_foreach_block(block, impl) {
613bf215546Sopenharmony_ci      nir_foreach_instr(instr, block) {
614bf215546Sopenharmony_ci         if (instr->type != nir_instr_type_intrinsic)
615bf215546Sopenharmony_ci            continue;
616bf215546Sopenharmony_ci
617bf215546Sopenharmony_ci         nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr);
618bf215546Sopenharmony_ci         if (resume->intrinsic != nir_intrinsic_rt_resume)
619bf215546Sopenharmony_ci            continue;
620bf215546Sopenharmony_ci
621bf215546Sopenharmony_ci         if (nir_intrinsic_call_idx(resume) == call_idx)
622bf215546Sopenharmony_ci            return &resume->instr;
623bf215546Sopenharmony_ci      }
624bf215546Sopenharmony_ci   }
625bf215546Sopenharmony_ci   unreachable("Couldn't find resume instruction");
626bf215546Sopenharmony_ci}
627bf215546Sopenharmony_ci
628bf215546Sopenharmony_ci/* Walk the CF tree and duplicate the contents of every loop, one half runs on
629bf215546Sopenharmony_ci * resume and the other half is for any post-resume loop iterations.  We are
630bf215546Sopenharmony_ci * careful in our duplication to ensure that resume_instr is in the resume
631bf215546Sopenharmony_ci * half of the loop though a copy of resume_instr will remain in the other
632bf215546Sopenharmony_ci * half as well in case the same shader call happens twice.
633bf215546Sopenharmony_ci */
634bf215546Sopenharmony_cistatic bool
635bf215546Sopenharmony_ciduplicate_loop_bodies(nir_function_impl *impl, nir_instr *resume_instr)
636bf215546Sopenharmony_ci{
637bf215546Sopenharmony_ci   nir_register *resume_reg = NULL;
638bf215546Sopenharmony_ci   for (nir_cf_node *node = resume_instr->block->cf_node.parent;
639bf215546Sopenharmony_ci        node->type != nir_cf_node_function; node = node->parent) {
640bf215546Sopenharmony_ci      if (node->type != nir_cf_node_loop)
641bf215546Sopenharmony_ci         continue;
642bf215546Sopenharmony_ci
643bf215546Sopenharmony_ci      nir_loop *loop = nir_cf_node_as_loop(node);
644bf215546Sopenharmony_ci
645bf215546Sopenharmony_ci      if (resume_reg == NULL) {
646bf215546Sopenharmony_ci         /* We only create resume_reg if we encounter a loop.  This way we can
647bf215546Sopenharmony_ci          * avoid re-validating the shader and calling ssa_to_regs in the case
648bf215546Sopenharmony_ci          * where it's just if-ladders.
649bf215546Sopenharmony_ci          */
650bf215546Sopenharmony_ci         resume_reg = nir_local_reg_create(impl);
651bf215546Sopenharmony_ci         resume_reg->num_components = 1;
652bf215546Sopenharmony_ci         resume_reg->bit_size = 1;
653bf215546Sopenharmony_ci
654bf215546Sopenharmony_ci         nir_builder b;
655bf215546Sopenharmony_ci         nir_builder_init(&b, impl);
656bf215546Sopenharmony_ci
657bf215546Sopenharmony_ci         /* Initialize resume to true */
658bf215546Sopenharmony_ci         b.cursor = nir_before_cf_list(&impl->body);
659bf215546Sopenharmony_ci         nir_store_reg(&b, resume_reg, nir_imm_true(&b), 1);
660bf215546Sopenharmony_ci
661bf215546Sopenharmony_ci         /* Set resume to false right after the resume instruction */
662bf215546Sopenharmony_ci         b.cursor = nir_after_instr(resume_instr);
663bf215546Sopenharmony_ci         nir_store_reg(&b, resume_reg, nir_imm_false(&b), 1);
664bf215546Sopenharmony_ci      }
665bf215546Sopenharmony_ci
666bf215546Sopenharmony_ci      /* Before we go any further, make sure that everything which exits the
667bf215546Sopenharmony_ci       * loop or continues around to the top of the loop does so through
668bf215546Sopenharmony_ci       * registers.  We're about to duplicate the loop body and we'll have
669bf215546Sopenharmony_ci       * serious trouble if we don't do this.
670bf215546Sopenharmony_ci       */
671bf215546Sopenharmony_ci      nir_convert_loop_to_lcssa(loop);
672bf215546Sopenharmony_ci      nir_lower_phis_to_regs_block(nir_loop_first_block(loop));
673bf215546Sopenharmony_ci      nir_lower_phis_to_regs_block(
674bf215546Sopenharmony_ci         nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node)));
675bf215546Sopenharmony_ci
676bf215546Sopenharmony_ci      nir_cf_list cf_list;
677bf215546Sopenharmony_ci      nir_cf_list_extract(&cf_list, &loop->body);
678bf215546Sopenharmony_ci
679bf215546Sopenharmony_ci      nir_if *_if = nir_if_create(impl->function->shader);
680bf215546Sopenharmony_ci      _if->condition = nir_src_for_reg(resume_reg);
681bf215546Sopenharmony_ci      nir_cf_node_insert(nir_after_cf_list(&loop->body), &_if->cf_node);
682bf215546Sopenharmony_ci
683bf215546Sopenharmony_ci      nir_cf_list clone;
684bf215546Sopenharmony_ci      nir_cf_list_clone(&clone, &cf_list, &loop->cf_node, NULL);
685bf215546Sopenharmony_ci
686bf215546Sopenharmony_ci      /* Insert the clone in the else and the original in the then so that
687bf215546Sopenharmony_ci       * the resume_instr remains valid even after the duplication.
688bf215546Sopenharmony_ci       */
689bf215546Sopenharmony_ci      nir_cf_reinsert(&cf_list, nir_before_cf_list(&_if->then_list));
690bf215546Sopenharmony_ci      nir_cf_reinsert(&clone, nir_before_cf_list(&_if->else_list));
691bf215546Sopenharmony_ci   }
692bf215546Sopenharmony_ci
693bf215546Sopenharmony_ci   if (resume_reg != NULL)
694bf215546Sopenharmony_ci      nir_metadata_preserve(impl, nir_metadata_none);
695bf215546Sopenharmony_ci
696bf215546Sopenharmony_ci   return resume_reg != NULL;
697bf215546Sopenharmony_ci}
698bf215546Sopenharmony_ci
699bf215546Sopenharmony_cistatic bool
700bf215546Sopenharmony_cicf_node_contains_block(nir_cf_node *node, nir_block *block)
701bf215546Sopenharmony_ci{
702bf215546Sopenharmony_ci   for (nir_cf_node *n = &block->cf_node; n != NULL; n = n->parent) {
703bf215546Sopenharmony_ci      if (n == node)
704bf215546Sopenharmony_ci         return true;
705bf215546Sopenharmony_ci   }
706bf215546Sopenharmony_ci
707bf215546Sopenharmony_ci   return false;
708bf215546Sopenharmony_ci}
709bf215546Sopenharmony_ci
710bf215546Sopenharmony_cistatic void
711bf215546Sopenharmony_cirewrite_phis_to_pred(nir_block *block, nir_block *pred)
712bf215546Sopenharmony_ci{
713bf215546Sopenharmony_ci   nir_foreach_instr(instr, block) {
714bf215546Sopenharmony_ci      if (instr->type != nir_instr_type_phi)
715bf215546Sopenharmony_ci         break;
716bf215546Sopenharmony_ci
717bf215546Sopenharmony_ci      nir_phi_instr *phi = nir_instr_as_phi(instr);
718bf215546Sopenharmony_ci
719bf215546Sopenharmony_ci      ASSERTED bool found = false;
720bf215546Sopenharmony_ci      nir_foreach_phi_src(phi_src, phi) {
721bf215546Sopenharmony_ci         if (phi_src->pred == pred) {
722bf215546Sopenharmony_ci            found = true;
723bf215546Sopenharmony_ci            assert(phi_src->src.is_ssa);
724bf215546Sopenharmony_ci            nir_ssa_def_rewrite_uses(&phi->dest.ssa, phi_src->src.ssa);
725bf215546Sopenharmony_ci            break;
726bf215546Sopenharmony_ci         }
727bf215546Sopenharmony_ci      }
728bf215546Sopenharmony_ci      assert(found);
729bf215546Sopenharmony_ci   }
730bf215546Sopenharmony_ci}
731bf215546Sopenharmony_ci
732bf215546Sopenharmony_cistatic bool
733bf215546Sopenharmony_cicursor_is_after_jump(nir_cursor cursor)
734bf215546Sopenharmony_ci{
735bf215546Sopenharmony_ci   switch (cursor.option) {
736bf215546Sopenharmony_ci   case nir_cursor_before_instr:
737bf215546Sopenharmony_ci   case nir_cursor_before_block:
738bf215546Sopenharmony_ci      return false;
739bf215546Sopenharmony_ci   case nir_cursor_after_instr:
740bf215546Sopenharmony_ci      return cursor.instr->type == nir_instr_type_jump;
741bf215546Sopenharmony_ci   case nir_cursor_after_block:
742bf215546Sopenharmony_ci      return nir_block_ends_in_jump(cursor.block);;
743bf215546Sopenharmony_ci   }
744bf215546Sopenharmony_ci   unreachable("Invalid cursor option");
745bf215546Sopenharmony_ci}
746bf215546Sopenharmony_ci
747bf215546Sopenharmony_ci/** Flattens if ladders leading up to a resume
748bf215546Sopenharmony_ci *
749bf215546Sopenharmony_ci * Given a resume_instr, this function flattens any if ladders leading to the
750bf215546Sopenharmony_ci * resume instruction and deletes any code that cannot be encountered on a
751bf215546Sopenharmony_ci * direct path to the resume instruction.  This way we get, for the most part,
752bf215546Sopenharmony_ci * straight-line control-flow up to the resume instruction.
753bf215546Sopenharmony_ci *
754bf215546Sopenharmony_ci * While we do this flattening, we also move any code which is in the remat
755bf215546Sopenharmony_ci * set up to the top of the function or to the top of the resume portion of
756bf215546Sopenharmony_ci * the current loop.  We don't worry about control-flow as we do this because
757bf215546Sopenharmony_ci * phis will never be in the remat set (see can_remat_instr) and so nothing
758bf215546Sopenharmony_ci * control-dependent will ever need to be re-materialized.  It is possible
759bf215546Sopenharmony_ci * that this algorithm will preserve too many instructions by moving them to
760bf215546Sopenharmony_ci * the top but we leave that for DCE to clean up.  Any code not in the remat
761bf215546Sopenharmony_ci * set is deleted because it's either unused in the continuation or else
762bf215546Sopenharmony_ci * unspilled from a previous continuation and the unspill code is after the
763bf215546Sopenharmony_ci * resume instruction.
764bf215546Sopenharmony_ci *
765bf215546Sopenharmony_ci * If, for instance, we have something like this:
766bf215546Sopenharmony_ci *
767bf215546Sopenharmony_ci *    // block 0
768bf215546Sopenharmony_ci *    if (cond1) {
769bf215546Sopenharmony_ci *       // block 1
770bf215546Sopenharmony_ci *    } else {
771bf215546Sopenharmony_ci *       // block 2
772bf215546Sopenharmony_ci *       if (cond2) {
773bf215546Sopenharmony_ci *          // block 3
774bf215546Sopenharmony_ci *          resume;
775bf215546Sopenharmony_ci *          if (cond3) {
776bf215546Sopenharmony_ci *             // block 4
777bf215546Sopenharmony_ci *          }
778bf215546Sopenharmony_ci *       } else {
779bf215546Sopenharmony_ci *          // block 5
780bf215546Sopenharmony_ci *       }
781bf215546Sopenharmony_ci *    }
782bf215546Sopenharmony_ci *
783bf215546Sopenharmony_ci * then we know, because we know the resume instruction had to be encoutered,
784bf215546Sopenharmony_ci * that cond1 = false and cond2 = true and we lower as follows:
785bf215546Sopenharmony_ci *
786bf215546Sopenharmony_ci *    // block 0
787bf215546Sopenharmony_ci *    // block 2
788bf215546Sopenharmony_ci *    // block 3
789bf215546Sopenharmony_ci *    resume;
790bf215546Sopenharmony_ci *    if (cond3) {
791bf215546Sopenharmony_ci *       // block 4
792bf215546Sopenharmony_ci *    }
793bf215546Sopenharmony_ci *
794bf215546Sopenharmony_ci * As you can see, the code in blocks 1 and 5 was removed because there is no
795bf215546Sopenharmony_ci * path from the start of the shader to the resume instruction which execute
796bf215546Sopenharmony_ci * blocks 1 or 5.  Any remat code from blocks 0, 2, and 3 is preserved and
797bf215546Sopenharmony_ci * moved to the top.  If the resume instruction is inside a loop then we know
798bf215546Sopenharmony_ci * a priori that it is of the form
799bf215546Sopenharmony_ci *
800bf215546Sopenharmony_ci *    loop {
801bf215546Sopenharmony_ci *       if (resume) {
802bf215546Sopenharmony_ci *          // Contents containing resume_instr
803bf215546Sopenharmony_ci *       } else {
804bf215546Sopenharmony_ci *          // Second copy of contents
805bf215546Sopenharmony_ci *       }
806bf215546Sopenharmony_ci *    }
807bf215546Sopenharmony_ci *
808bf215546Sopenharmony_ci * In this case, we only descend into the first half of the loop.  The second
809bf215546Sopenharmony_ci * half is left alone as that portion is only ever executed after the resume
810bf215546Sopenharmony_ci * instruction.
811bf215546Sopenharmony_ci */
812bf215546Sopenharmony_cistatic bool
813bf215546Sopenharmony_ciflatten_resume_if_ladder(nir_builder *b,
814bf215546Sopenharmony_ci                         nir_cf_node *parent_node,
815bf215546Sopenharmony_ci                         struct exec_list *child_list,
816bf215546Sopenharmony_ci                         bool child_list_contains_cursor,
817bf215546Sopenharmony_ci                         nir_instr *resume_instr,
818bf215546Sopenharmony_ci                         struct brw_bitset *remat)
819bf215546Sopenharmony_ci{
820bf215546Sopenharmony_ci   nir_cf_list cf_list;
821bf215546Sopenharmony_ci
822bf215546Sopenharmony_ci   /* If our child list contains the cursor instruction then we start out
823bf215546Sopenharmony_ci    * before the cursor instruction.  We need to know this so that we can skip
824bf215546Sopenharmony_ci    * moving instructions which are already before the cursor.
825bf215546Sopenharmony_ci    */
826bf215546Sopenharmony_ci   bool before_cursor = child_list_contains_cursor;
827bf215546Sopenharmony_ci
828bf215546Sopenharmony_ci   nir_cf_node *resume_node = NULL;
829bf215546Sopenharmony_ci   foreach_list_typed_safe(nir_cf_node, child, node, child_list) {
830bf215546Sopenharmony_ci      switch (child->type) {
831bf215546Sopenharmony_ci      case nir_cf_node_block: {
832bf215546Sopenharmony_ci         nir_block *block = nir_cf_node_as_block(child);
833bf215546Sopenharmony_ci         if (b->cursor.option == nir_cursor_before_block &&
834bf215546Sopenharmony_ci             b->cursor.block == block) {
835bf215546Sopenharmony_ci            assert(before_cursor);
836bf215546Sopenharmony_ci            before_cursor = false;
837bf215546Sopenharmony_ci         }
838bf215546Sopenharmony_ci         nir_foreach_instr_safe(instr, block) {
839bf215546Sopenharmony_ci            if ((b->cursor.option == nir_cursor_before_instr ||
840bf215546Sopenharmony_ci                 b->cursor.option == nir_cursor_after_instr) &&
841bf215546Sopenharmony_ci                b->cursor.instr == instr) {
842bf215546Sopenharmony_ci               assert(nir_cf_node_is_first(&block->cf_node));
843bf215546Sopenharmony_ci               assert(before_cursor);
844bf215546Sopenharmony_ci               before_cursor = false;
845bf215546Sopenharmony_ci               continue;
846bf215546Sopenharmony_ci            }
847bf215546Sopenharmony_ci
848bf215546Sopenharmony_ci            if (instr == resume_instr)
849bf215546Sopenharmony_ci               goto found_resume;
850bf215546Sopenharmony_ci
851bf215546Sopenharmony_ci            if (!before_cursor && can_remat_instr(instr, remat)) {
852bf215546Sopenharmony_ci               nir_instr_remove(instr);
853bf215546Sopenharmony_ci               nir_instr_insert(b->cursor, instr);
854bf215546Sopenharmony_ci               b->cursor = nir_after_instr(instr);
855bf215546Sopenharmony_ci
856bf215546Sopenharmony_ci               nir_ssa_def *def = nir_instr_ssa_def(instr);
857bf215546Sopenharmony_ci               BITSET_SET(remat->set, def->index);
858bf215546Sopenharmony_ci            }
859bf215546Sopenharmony_ci         }
860bf215546Sopenharmony_ci         if (b->cursor.option == nir_cursor_after_block &&
861bf215546Sopenharmony_ci             b->cursor.block == block) {
862bf215546Sopenharmony_ci            assert(before_cursor);
863bf215546Sopenharmony_ci            before_cursor = false;
864bf215546Sopenharmony_ci         }
865bf215546Sopenharmony_ci         break;
866bf215546Sopenharmony_ci      }
867bf215546Sopenharmony_ci
868bf215546Sopenharmony_ci      case nir_cf_node_if: {
869bf215546Sopenharmony_ci         nir_if *_if = nir_cf_node_as_if(child);
870bf215546Sopenharmony_ci
871bf215546Sopenharmony_ci         /* Because of the dummy blocks inserted in the first if block of the
872bf215546Sopenharmony_ci          * loops, it's possible we find an empty if block that contains our
873bf215546Sopenharmony_ci          * cursor. At this point, the block should still be empty and we can
874bf215546Sopenharmony_ci          * just skip it and consider we're after the cursor.
875bf215546Sopenharmony_ci          */
876bf215546Sopenharmony_ci         if (cf_node_contains_block(&_if->cf_node,
877bf215546Sopenharmony_ci                                    nir_cursor_current_block(b->cursor))) {
878bf215546Sopenharmony_ci            /* Some sanity checks to verify this is actually a dummy block */
879bf215546Sopenharmony_ci            assert(nir_src_as_bool(_if->condition) == true);
880bf215546Sopenharmony_ci            assert(nir_cf_list_is_empty_block(&_if->then_list));
881bf215546Sopenharmony_ci            assert(nir_cf_list_is_empty_block(&_if->else_list));
882bf215546Sopenharmony_ci            before_cursor = false;
883bf215546Sopenharmony_ci            break;
884bf215546Sopenharmony_ci         }
885bf215546Sopenharmony_ci         assert(!before_cursor);
886bf215546Sopenharmony_ci
887bf215546Sopenharmony_ci         if (flatten_resume_if_ladder(b, &_if->cf_node, &_if->then_list,
888bf215546Sopenharmony_ci                                      false, resume_instr, remat)) {
889bf215546Sopenharmony_ci            resume_node = child;
890bf215546Sopenharmony_ci            rewrite_phis_to_pred(nir_cf_node_as_block(nir_cf_node_next(child)),
891bf215546Sopenharmony_ci                                 nir_if_last_then_block(_if));
892bf215546Sopenharmony_ci            goto found_resume;
893bf215546Sopenharmony_ci         }
894bf215546Sopenharmony_ci
895bf215546Sopenharmony_ci         if (flatten_resume_if_ladder(b, &_if->cf_node, &_if->else_list,
896bf215546Sopenharmony_ci                                      false, resume_instr, remat)) {
897bf215546Sopenharmony_ci            resume_node = child;
898bf215546Sopenharmony_ci            rewrite_phis_to_pred(nir_cf_node_as_block(nir_cf_node_next(child)),
899bf215546Sopenharmony_ci                                 nir_if_last_else_block(_if));
900bf215546Sopenharmony_ci            goto found_resume;
901bf215546Sopenharmony_ci         }
902bf215546Sopenharmony_ci         break;
903bf215546Sopenharmony_ci      }
904bf215546Sopenharmony_ci
905bf215546Sopenharmony_ci      case nir_cf_node_loop: {
906bf215546Sopenharmony_ci         assert(!before_cursor);
907bf215546Sopenharmony_ci         nir_loop *loop = nir_cf_node_as_loop(child);
908bf215546Sopenharmony_ci
909bf215546Sopenharmony_ci         if (cf_node_contains_block(&loop->cf_node, resume_instr->block)) {
910bf215546Sopenharmony_ci            /* Thanks to our loop body duplication pass, every level of loop
911bf215546Sopenharmony_ci             * containing the resume instruction contains exactly three nodes:
912bf215546Sopenharmony_ci             * two blocks and an if.  We don't want to lower away this if
913bf215546Sopenharmony_ci             * because it's the resume selection if.  The resume half is
914bf215546Sopenharmony_ci             * always the then_list so that's what we want to flatten.
915bf215546Sopenharmony_ci             */
916bf215546Sopenharmony_ci            nir_block *header = nir_loop_first_block(loop);
917bf215546Sopenharmony_ci            nir_if *_if = nir_cf_node_as_if(nir_cf_node_next(&header->cf_node));
918bf215546Sopenharmony_ci
919bf215546Sopenharmony_ci            nir_builder bl;
920bf215546Sopenharmony_ci            nir_builder_init(&bl, b->impl);
921bf215546Sopenharmony_ci            bl.cursor = nir_before_cf_list(&_if->then_list);
922bf215546Sopenharmony_ci            /* We want to place anything re-materialized from inside the loop
923bf215546Sopenharmony_ci             * at the top of the resume half of the loop.
924bf215546Sopenharmony_ci             *
925bf215546Sopenharmony_ci             * Because we're inside a loop, we might run into a break/continue
926bf215546Sopenharmony_ci             * instructions. We can't place those within a block of
927bf215546Sopenharmony_ci             * instructions, they need to be at the end of a block. So we
928bf215546Sopenharmony_ci             * build our own dummy block to place them.
929bf215546Sopenharmony_ci             */
930bf215546Sopenharmony_ci            nir_push_if(&bl, nir_imm_true(&bl));
931bf215546Sopenharmony_ci            {
932bf215546Sopenharmony_ci               ASSERTED bool found =
933bf215546Sopenharmony_ci                  flatten_resume_if_ladder(&bl, &_if->cf_node, &_if->then_list,
934bf215546Sopenharmony_ci                                           true, resume_instr, remat);
935bf215546Sopenharmony_ci               assert(found);
936bf215546Sopenharmony_ci            }
937bf215546Sopenharmony_ci            nir_pop_if(&bl, NULL);
938bf215546Sopenharmony_ci
939bf215546Sopenharmony_ci            resume_node = child;
940bf215546Sopenharmony_ci            goto found_resume;
941bf215546Sopenharmony_ci         } else {
942bf215546Sopenharmony_ci            ASSERTED bool found =
943bf215546Sopenharmony_ci               flatten_resume_if_ladder(b, &loop->cf_node, &loop->body,
944bf215546Sopenharmony_ci                                        false, resume_instr, remat);
945bf215546Sopenharmony_ci            assert(!found);
946bf215546Sopenharmony_ci         }
947bf215546Sopenharmony_ci         break;
948bf215546Sopenharmony_ci      }
949bf215546Sopenharmony_ci
950bf215546Sopenharmony_ci      case nir_cf_node_function:
951bf215546Sopenharmony_ci         unreachable("Unsupported CF node type");
952bf215546Sopenharmony_ci      }
953bf215546Sopenharmony_ci   }
954bf215546Sopenharmony_ci   assert(!before_cursor);
955bf215546Sopenharmony_ci
956bf215546Sopenharmony_ci   /* If we got here, we didn't find the resume node or instruction. */
957bf215546Sopenharmony_ci   return false;
958bf215546Sopenharmony_ci
959bf215546Sopenharmony_cifound_resume:
960bf215546Sopenharmony_ci   /* If we got here then we found either the resume node or the resume
961bf215546Sopenharmony_ci    * instruction in this CF list.
962bf215546Sopenharmony_ci    */
963bf215546Sopenharmony_ci   if (resume_node) {
964bf215546Sopenharmony_ci      /* If the resume instruction is buried in side one of our children CF
965bf215546Sopenharmony_ci       * nodes, resume_node now points to that child.
966bf215546Sopenharmony_ci       */
967bf215546Sopenharmony_ci      if (resume_node->type == nir_cf_node_if) {
968bf215546Sopenharmony_ci         /* Thanks to the recursive call, all of the interesting contents of
969bf215546Sopenharmony_ci          * resume_node have been copied before the cursor.  We just need to
970bf215546Sopenharmony_ci          * copy the stuff after resume_node.
971bf215546Sopenharmony_ci          */
972bf215546Sopenharmony_ci         nir_cf_extract(&cf_list, nir_after_cf_node(resume_node),
973bf215546Sopenharmony_ci                                  nir_after_cf_list(child_list));
974bf215546Sopenharmony_ci      } else {
975bf215546Sopenharmony_ci         /* The loop contains its own cursor and still has useful stuff in it.
976bf215546Sopenharmony_ci          * We want to move everything after and including the loop to before
977bf215546Sopenharmony_ci          * the cursor.
978bf215546Sopenharmony_ci          */
979bf215546Sopenharmony_ci         assert(resume_node->type == nir_cf_node_loop);
980bf215546Sopenharmony_ci         nir_cf_extract(&cf_list, nir_before_cf_node(resume_node),
981bf215546Sopenharmony_ci                                  nir_after_cf_list(child_list));
982bf215546Sopenharmony_ci      }
983bf215546Sopenharmony_ci   } else {
984bf215546Sopenharmony_ci      /* If we found the resume instruction in one of our blocks, grab
985bf215546Sopenharmony_ci       * everything after it in the entire list (not just the one block), and
986bf215546Sopenharmony_ci       * place it before the cursor instr.
987bf215546Sopenharmony_ci       */
988bf215546Sopenharmony_ci      nir_cf_extract(&cf_list, nir_after_instr(resume_instr),
989bf215546Sopenharmony_ci                               nir_after_cf_list(child_list));
990bf215546Sopenharmony_ci   }
991bf215546Sopenharmony_ci
992bf215546Sopenharmony_ci   if (cursor_is_after_jump(b->cursor)) {
993bf215546Sopenharmony_ci      /* If the resume instruction is in a loop, it's possible cf_list ends
994bf215546Sopenharmony_ci       * in a break or continue instruction, in which case we don't want to
995bf215546Sopenharmony_ci       * insert anything.  It's also possible we have an early return if
996bf215546Sopenharmony_ci       * someone hasn't lowered those yet.  In either case, nothing after that
997bf215546Sopenharmony_ci       * point executes in this context so we can delete it.
998bf215546Sopenharmony_ci       */
999bf215546Sopenharmony_ci      nir_cf_delete(&cf_list);
1000bf215546Sopenharmony_ci   } else {
1001bf215546Sopenharmony_ci      b->cursor = nir_cf_reinsert(&cf_list, b->cursor);
1002bf215546Sopenharmony_ci   }
1003bf215546Sopenharmony_ci
1004bf215546Sopenharmony_ci   if (!resume_node) {
1005bf215546Sopenharmony_ci      /* We want the resume to be the first "interesting" instruction */
1006bf215546Sopenharmony_ci      nir_instr_remove(resume_instr);
1007bf215546Sopenharmony_ci      nir_instr_insert(nir_before_cf_list(&b->impl->body), resume_instr);
1008bf215546Sopenharmony_ci   }
1009bf215546Sopenharmony_ci
1010bf215546Sopenharmony_ci   /* We've copied everything interesting out of this CF list to before the
1011bf215546Sopenharmony_ci    * cursor.  Delete everything else.
1012bf215546Sopenharmony_ci    */
1013bf215546Sopenharmony_ci   if (child_list_contains_cursor) {
1014bf215546Sopenharmony_ci      /* If the cursor is in child_list, then we're either a loop or function
1015bf215546Sopenharmony_ci       * that contains the cursor. Cursors are always placed in a wrapper if
1016bf215546Sopenharmony_ci       * (true) to deal with break/continue and early returns. We've already
1017bf215546Sopenharmony_ci       * moved everything interesting inside the wrapper if and we want to
1018bf215546Sopenharmony_ci       * remove whatever is left after it.
1019bf215546Sopenharmony_ci       */
1020bf215546Sopenharmony_ci      nir_block *cursor_block = nir_cursor_current_block(b->cursor);
1021bf215546Sopenharmony_ci      nir_if *wrapper_if = nir_cf_node_as_if(cursor_block->cf_node.parent);
1022bf215546Sopenharmony_ci      assert(wrapper_if->cf_node.parent == parent_node);
1023bf215546Sopenharmony_ci      /* The wrapper if blocks are either put into the body of the main
1024bf215546Sopenharmony_ci       * function, or within the resume if block of the loops.
1025bf215546Sopenharmony_ci       */
1026bf215546Sopenharmony_ci      assert(parent_node->type == nir_cf_node_function ||
1027bf215546Sopenharmony_ci             (parent_node->type == nir_cf_node_if &&
1028bf215546Sopenharmony_ci              parent_node->parent->type == nir_cf_node_loop));
1029bf215546Sopenharmony_ci      nir_cf_extract(&cf_list, nir_after_cf_node(&wrapper_if->cf_node),
1030bf215546Sopenharmony_ci                     nir_after_cf_list(child_list));
1031bf215546Sopenharmony_ci   } else {
1032bf215546Sopenharmony_ci      nir_cf_list_extract(&cf_list, child_list);
1033bf215546Sopenharmony_ci   }
1034bf215546Sopenharmony_ci   nir_cf_delete(&cf_list);
1035bf215546Sopenharmony_ci
1036bf215546Sopenharmony_ci   return true;
1037bf215546Sopenharmony_ci}
1038bf215546Sopenharmony_ci
1039bf215546Sopenharmony_cistatic nir_instr *
1040bf215546Sopenharmony_cilower_resume(nir_shader *shader, int call_idx)
1041bf215546Sopenharmony_ci{
1042bf215546Sopenharmony_ci   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
1043bf215546Sopenharmony_ci
1044bf215546Sopenharmony_ci   nir_instr *resume_instr = find_resume_instr(impl, call_idx);
1045bf215546Sopenharmony_ci
1046bf215546Sopenharmony_ci   if (duplicate_loop_bodies(impl, resume_instr)) {
1047bf215546Sopenharmony_ci      nir_validate_shader(shader, "after duplicate_loop_bodies in "
1048bf215546Sopenharmony_ci                                  "brw_nir_lower_shader_calls");
1049bf215546Sopenharmony_ci      /* If we duplicated the bodies of any loops, run regs_to_ssa to get rid
1050bf215546Sopenharmony_ci       * of all those pesky registers we just added.
1051bf215546Sopenharmony_ci       */
1052bf215546Sopenharmony_ci      NIR_PASS_V(shader, nir_lower_regs_to_ssa);
1053bf215546Sopenharmony_ci   }
1054bf215546Sopenharmony_ci
1055bf215546Sopenharmony_ci   /* Re-index nir_ssa_def::index.  We don't care about actual liveness in
1056bf215546Sopenharmony_ci    * this pass but, so we can use the same helpers as the spilling pass, we
1057bf215546Sopenharmony_ci    * need to make sure that live_index is something sane.  It's used
1058bf215546Sopenharmony_ci    * constantly for determining if an SSA value has been added since the
1059bf215546Sopenharmony_ci    * start of the pass.
1060bf215546Sopenharmony_ci    */
1061bf215546Sopenharmony_ci   nir_index_ssa_defs(impl);
1062bf215546Sopenharmony_ci
1063bf215546Sopenharmony_ci   void *mem_ctx = ralloc_context(shader);
1064bf215546Sopenharmony_ci
1065bf215546Sopenharmony_ci   /* Used to track which things may have been assumed to be re-materialized
1066bf215546Sopenharmony_ci    * by the spilling pass and which we shouldn't delete.
1067bf215546Sopenharmony_ci    */
1068bf215546Sopenharmony_ci   struct brw_bitset remat = bitset_create(mem_ctx, impl->ssa_alloc);
1069bf215546Sopenharmony_ci
1070bf215546Sopenharmony_ci   /* Create a nop instruction to use as a cursor as we extract and re-insert
1071bf215546Sopenharmony_ci    * stuff into the CFG.
1072bf215546Sopenharmony_ci    */
1073bf215546Sopenharmony_ci   nir_builder b;
1074bf215546Sopenharmony_ci   nir_builder_init(&b, impl);
1075bf215546Sopenharmony_ci   b.cursor = nir_before_cf_list(&impl->body);
1076bf215546Sopenharmony_ci
1077bf215546Sopenharmony_ci   nir_push_if(&b, nir_imm_true(&b));
1078bf215546Sopenharmony_ci   {
1079bf215546Sopenharmony_ci      ASSERTED bool found =
1080bf215546Sopenharmony_ci         flatten_resume_if_ladder(&b, &impl->cf_node, &impl->body,
1081bf215546Sopenharmony_ci                                  true, resume_instr, &remat);
1082bf215546Sopenharmony_ci      assert(found);
1083bf215546Sopenharmony_ci   }
1084bf215546Sopenharmony_ci   nir_pop_if(&b, NULL);
1085bf215546Sopenharmony_ci
1086bf215546Sopenharmony_ci   ralloc_free(mem_ctx);
1087bf215546Sopenharmony_ci
1088bf215546Sopenharmony_ci   nir_validate_shader(shader, "after flatten_resume_if_ladder in "
1089bf215546Sopenharmony_ci                               "brw_nir_lower_shader_calls");
1090bf215546Sopenharmony_ci
1091bf215546Sopenharmony_ci   nir_metadata_preserve(impl, nir_metadata_none);
1092bf215546Sopenharmony_ci
1093bf215546Sopenharmony_ci   return resume_instr;
1094bf215546Sopenharmony_ci}
1095bf215546Sopenharmony_ci
1096bf215546Sopenharmony_cistatic void
1097bf215546Sopenharmony_cireplace_resume_with_halt(nir_shader *shader, nir_instr *keep)
1098bf215546Sopenharmony_ci{
1099bf215546Sopenharmony_ci   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
1100bf215546Sopenharmony_ci
1101bf215546Sopenharmony_ci   nir_builder b;
1102bf215546Sopenharmony_ci   nir_builder_init(&b, impl);
1103bf215546Sopenharmony_ci
1104bf215546Sopenharmony_ci   nir_foreach_block_safe(block, impl) {
1105bf215546Sopenharmony_ci      nir_foreach_instr_safe(instr, block) {
1106bf215546Sopenharmony_ci         if (instr == keep)
1107bf215546Sopenharmony_ci            continue;
1108bf215546Sopenharmony_ci
1109bf215546Sopenharmony_ci         if (instr->type != nir_instr_type_intrinsic)
1110bf215546Sopenharmony_ci            continue;
1111bf215546Sopenharmony_ci
1112bf215546Sopenharmony_ci         nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr);
1113bf215546Sopenharmony_ci         if (resume->intrinsic != nir_intrinsic_rt_resume)
1114bf215546Sopenharmony_ci            continue;
1115bf215546Sopenharmony_ci
1116bf215546Sopenharmony_ci         /* If this is some other resume, then we've kicked off a ray or
1117bf215546Sopenharmony_ci          * bindless thread and we don't want to go any further in this
1118bf215546Sopenharmony_ci          * shader.  Insert a halt so that NIR will delete any instructions
1119bf215546Sopenharmony_ci          * dominated by this call instruction including the scratch_load
1120bf215546Sopenharmony_ci          * instructions we inserted.
1121bf215546Sopenharmony_ci          */
1122bf215546Sopenharmony_ci         nir_cf_list cf_list;
1123bf215546Sopenharmony_ci         nir_cf_extract(&cf_list, nir_after_instr(&resume->instr),
1124bf215546Sopenharmony_ci                                  nir_after_block(block));
1125bf215546Sopenharmony_ci         nir_cf_delete(&cf_list);
1126bf215546Sopenharmony_ci         b.cursor = nir_instr_remove(&resume->instr);
1127bf215546Sopenharmony_ci         nir_jump(&b, nir_jump_halt);
1128bf215546Sopenharmony_ci         break;
1129bf215546Sopenharmony_ci      }
1130bf215546Sopenharmony_ci   }
1131bf215546Sopenharmony_ci}
1132bf215546Sopenharmony_ci
1133bf215546Sopenharmony_ci/** Lower shader call instructions to split shaders.
1134bf215546Sopenharmony_ci *
1135bf215546Sopenharmony_ci * Shader calls can be split into an initial shader and a series of "resume"
1136bf215546Sopenharmony_ci * shaders.   When the shader is first invoked, it is the initial shader which
1137bf215546Sopenharmony_ci * is executed.  At any point in the initial shader or any one of the resume
1138bf215546Sopenharmony_ci * shaders, a shader call operation may be performed.  The possible shader call
1139bf215546Sopenharmony_ci * operations are:
1140bf215546Sopenharmony_ci *
1141bf215546Sopenharmony_ci *  - trace_ray
1142bf215546Sopenharmony_ci *  - report_ray_intersection
1143bf215546Sopenharmony_ci *  - execute_callable
1144bf215546Sopenharmony_ci *
1145bf215546Sopenharmony_ci * When a shader call operation is performed, we push all live values to the
1146bf215546Sopenharmony_ci * stack,call rt_trace_ray/rt_execute_callable and then kill the shader. Once
1147bf215546Sopenharmony_ci * the operation we invoked is complete, a callee shader will return execution
1148bf215546Sopenharmony_ci * to the respective resume shader. The resume shader pops the contents off
1149bf215546Sopenharmony_ci * the stack and picks up where the calling shader left off.
1150bf215546Sopenharmony_ci *
1151bf215546Sopenharmony_ci * Stack management is assumed to be done after this pass. Call
1152bf215546Sopenharmony_ci * instructions and their resumes get annotated with stack information that
1153bf215546Sopenharmony_ci * should be enough for the backend to implement proper stack management.
1154bf215546Sopenharmony_ci */
1155bf215546Sopenharmony_cibool
1156bf215546Sopenharmony_cinir_lower_shader_calls(nir_shader *shader,
1157bf215546Sopenharmony_ci                       nir_address_format address_format,
1158bf215546Sopenharmony_ci                       unsigned stack_alignment,
1159bf215546Sopenharmony_ci                       nir_shader ***resume_shaders_out,
1160bf215546Sopenharmony_ci                       uint32_t *num_resume_shaders_out,
1161bf215546Sopenharmony_ci                       void *mem_ctx)
1162bf215546Sopenharmony_ci{
1163bf215546Sopenharmony_ci   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
1164bf215546Sopenharmony_ci
1165bf215546Sopenharmony_ci   nir_builder b;
1166bf215546Sopenharmony_ci   nir_builder_init(&b, impl);
1167bf215546Sopenharmony_ci
1168bf215546Sopenharmony_ci   int num_calls = 0;
1169bf215546Sopenharmony_ci   nir_foreach_block(block, impl) {
1170bf215546Sopenharmony_ci      nir_foreach_instr_safe(instr, block) {
1171bf215546Sopenharmony_ci         if (instr_is_shader_call(instr))
1172bf215546Sopenharmony_ci            num_calls++;
1173bf215546Sopenharmony_ci      }
1174bf215546Sopenharmony_ci   }
1175bf215546Sopenharmony_ci
1176bf215546Sopenharmony_ci   if (num_calls == 0) {
1177bf215546Sopenharmony_ci      nir_shader_preserve_all_metadata(shader);
1178bf215546Sopenharmony_ci      *num_resume_shaders_out = 0;
1179bf215546Sopenharmony_ci      return false;
1180bf215546Sopenharmony_ci   }
1181bf215546Sopenharmony_ci
1182bf215546Sopenharmony_ci   /* Some intrinsics not only can't be re-materialized but aren't preserved
1183bf215546Sopenharmony_ci    * when moving to the continuation shader.  We have to move them to the top
1184bf215546Sopenharmony_ci    * to ensure they get spilled as needed.
1185bf215546Sopenharmony_ci    */
1186bf215546Sopenharmony_ci   {
1187bf215546Sopenharmony_ci      bool progress = false;
1188bf215546Sopenharmony_ci      NIR_PASS(progress, shader, move_system_values_to_top);
1189bf215546Sopenharmony_ci      if (progress)
1190bf215546Sopenharmony_ci         NIR_PASS(progress, shader, nir_opt_cse);
1191bf215546Sopenharmony_ci   }
1192bf215546Sopenharmony_ci
1193bf215546Sopenharmony_ci   NIR_PASS_V(shader, spill_ssa_defs_and_lower_shader_calls,
1194bf215546Sopenharmony_ci              num_calls, address_format, stack_alignment);
1195bf215546Sopenharmony_ci
1196bf215546Sopenharmony_ci   nir_opt_remove_phis(shader);
1197bf215546Sopenharmony_ci
1198bf215546Sopenharmony_ci   /* Make N copies of our shader */
1199bf215546Sopenharmony_ci   nir_shader **resume_shaders = ralloc_array(mem_ctx, nir_shader *, num_calls);
1200bf215546Sopenharmony_ci   for (unsigned i = 0; i < num_calls; i++) {
1201bf215546Sopenharmony_ci      resume_shaders[i] = nir_shader_clone(mem_ctx, shader);
1202bf215546Sopenharmony_ci
1203bf215546Sopenharmony_ci      /* Give them a recognizable name */
1204bf215546Sopenharmony_ci      resume_shaders[i]->info.name =
1205bf215546Sopenharmony_ci         ralloc_asprintf(mem_ctx, "%s%sresume_%u",
1206bf215546Sopenharmony_ci                         shader->info.name ? shader->info.name : "",
1207bf215546Sopenharmony_ci                         shader->info.name ? "-" : "",
1208bf215546Sopenharmony_ci                         i);
1209bf215546Sopenharmony_ci   }
1210bf215546Sopenharmony_ci
1211bf215546Sopenharmony_ci   replace_resume_with_halt(shader, NULL);
1212bf215546Sopenharmony_ci   for (unsigned i = 0; i < num_calls; i++) {
1213bf215546Sopenharmony_ci      nir_instr *resume_instr = lower_resume(resume_shaders[i], i);
1214bf215546Sopenharmony_ci      replace_resume_with_halt(resume_shaders[i], resume_instr);
1215bf215546Sopenharmony_ci      nir_opt_remove_phis(resume_shaders[i]);
1216bf215546Sopenharmony_ci      /* Remove the dummy blocks added by flatten_resume_if_ladder() */
1217bf215546Sopenharmony_ci      nir_opt_if(resume_shaders[i], nir_opt_if_optimize_phi_true_false);
1218bf215546Sopenharmony_ci   }
1219bf215546Sopenharmony_ci
1220bf215546Sopenharmony_ci   *resume_shaders_out = resume_shaders;
1221bf215546Sopenharmony_ci   *num_resume_shaders_out = num_calls;
1222bf215546Sopenharmony_ci
1223bf215546Sopenharmony_ci   return true;
1224bf215546Sopenharmony_ci}
1225