/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include <math.h>
#include "nir.h"
#include "nir_builder.h"
#include "util/u_vector.h"

/**
 * Lower flrp instructions.
 *
 * Unlike the lowerings that are possible in nir_opt_algebraic, this pass can
 * examine more global information to determine a possibly more efficient
 * lowering for each flrp.
 */

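/**
 * Remember a lowered flrp instruction for later removal.
 *
 * The instructions are removed in bulk at the end of the pass; see the
 * comments in the replace_with_* helpers below for why removal is deferred.
 */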
static void
append_flrp_to_dead_list(struct u_vector *dead_flrp, struct nir_alu_instr *alu)
{
   struct nir_alu_instr **tail = u_vector_add(dead_flrp);
   *tail = alu;
}

/**
 * Replace flrp(a, b, c) with ffma(b, c, ffma(-a, c, a)).
 */
static void
replace_with_strict_ffma(struct nir_builder *bld, struct u_vector *dead_flrp,
                         struct nir_alu_instr *alu)
{
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);

   nir_ssa_def *const neg_a = nir_fneg(bld, a);
   nir_instr_as_alu(neg_a->parent_instr)->exact = alu->exact;

   nir_ssa_def *const inner_ffma = nir_ffma(bld, neg_a, c, a);
   nir_instr_as_alu(inner_ffma->parent_instr)->exact = alu->exact;

   nir_ssa_def *const outer_ffma = nir_ffma(bld, b, c, inner_ffma);
   nir_instr_as_alu(outer_ffma->parent_instr)->exact = alu->exact;

   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, outer_ffma);

   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
    * based on other uses of the sources.  Removing the flrp may cause the
    * last flrp in a sequence to make a different, incorrect choice.
    */
   append_flrp_to_dead_list(dead_flrp, alu);
}

/**
 * Replace flrp(a, b, c) with ffma(a, (1 - c), b*c).
 */
static void
replace_with_single_ffma(struct nir_builder *bld, struct u_vector *dead_flrp,
                         struct nir_alu_instr *alu)
{
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);

   nir_ssa_def *const neg_c = nir_fneg(bld, c);
   nir_instr_as_alu(neg_c->parent_instr)->exact = alu->exact;

   nir_ssa_def *const one_minus_c =
      nir_fadd(bld, nir_imm_floatN_t(bld, 1.0f, c->bit_size), neg_c);
   nir_instr_as_alu(one_minus_c->parent_instr)->exact = alu->exact;

   nir_ssa_def *const b_times_c = nir_fmul(bld, b, c);
   nir_instr_as_alu(b_times_c->parent_instr)->exact = alu->exact;

   nir_ssa_def *const final_ffma = nir_ffma(bld, a, one_minus_c, b_times_c);
   nir_instr_as_alu(final_ffma->parent_instr)->exact = alu->exact;

   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, final_ffma);

   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
    * based on other uses of the sources.  Removing the flrp may cause the
    * last flrp in a sequence to make a different, incorrect choice.
    */
   append_flrp_to_dead_list(dead_flrp, alu);
}

/**
 * Replace flrp(a, b, c) with a(1 - c) + bc.
 */
static void
replace_with_strict(struct nir_builder *bld, struct u_vector *dead_flrp,
                    struct nir_alu_instr *alu)
{
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);

   nir_ssa_def *const neg_c = nir_fneg(bld, c);
   nir_instr_as_alu(neg_c->parent_instr)->exact = alu->exact;

   nir_ssa_def *const one_minus_c =
      nir_fadd(bld, nir_imm_floatN_t(bld, 1.0f, c->bit_size), neg_c);
   nir_instr_as_alu(one_minus_c->parent_instr)->exact = alu->exact;

   nir_ssa_def *const first_product = nir_fmul(bld, a, one_minus_c);
   nir_instr_as_alu(first_product->parent_instr)->exact = alu->exact;

   nir_ssa_def *const second_product = nir_fmul(bld, b, c);
   nir_instr_as_alu(second_product->parent_instr)->exact = alu->exact;

   nir_ssa_def *const sum = nir_fadd(bld, first_product, second_product);
   nir_instr_as_alu(sum->parent_instr)->exact = alu->exact;

   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, sum);

   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
    * based on other uses of the sources.  Removing the flrp may cause the
    * last flrp in a sequence to make a different, incorrect choice.
    */
   append_flrp_to_dead_list(dead_flrp, alu);
}

/**
 * Replace flrp(a, b, c) with a + c(b - a).
 */
static void
replace_with_fast(struct nir_builder *bld, struct u_vector *dead_flrp,
                  struct nir_alu_instr *alu)
{
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);

   nir_ssa_def *const neg_a = nir_fneg(bld, a);
   nir_instr_as_alu(neg_a->parent_instr)->exact = alu->exact;

   nir_ssa_def *const b_minus_a = nir_fadd(bld, b, neg_a);
   nir_instr_as_alu(b_minus_a->parent_instr)->exact = alu->exact;

   nir_ssa_def *const product = nir_fmul(bld, c, b_minus_a);
   nir_instr_as_alu(product->parent_instr)->exact = alu->exact;

   nir_ssa_def *const sum = nir_fadd(bld, a, product);
   nir_instr_as_alu(sum->parent_instr)->exact = alu->exact;

   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, sum);

   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
    * based on other uses of the sources.  Removing the flrp may cause the
    * last flrp in a sequence to make a different, incorrect choice.
    */
   append_flrp_to_dead_list(dead_flrp, alu);
}

/**
 * Replace flrp(a, b, c) with (b*c ± c) + a => b*c + (a ± c).
 *
 * \note This only works if a = ±1.
 */
static void
replace_with_expanded_ffma_and_add(struct nir_builder *bld,
                                   struct u_vector *dead_flrp,
                                   struct nir_alu_instr *alu, bool subtract_c)
{
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);

   nir_ssa_def *const b_times_c = nir_fmul(bld, b, c);
   nir_instr_as_alu(b_times_c->parent_instr)->exact = alu->exact;

   nir_ssa_def *inner_sum;

   if (subtract_c) {
      nir_ssa_def *const neg_c = nir_fneg(bld, c);
      nir_instr_as_alu(neg_c->parent_instr)->exact = alu->exact;

      inner_sum = nir_fadd(bld, a, neg_c);
   } else {
      inner_sum = nir_fadd(bld, a, c);
   }

   nir_instr_as_alu(inner_sum->parent_instr)->exact = alu->exact;

   nir_ssa_def *const outer_sum = nir_fadd(bld, inner_sum, b_times_c);
   nir_instr_as_alu(outer_sum->parent_instr)->exact = alu->exact;

   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, outer_sum);

   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
    * based on other uses of the sources.  Removing the flrp may cause the
    * last flrp in a sequence to make a different, incorrect choice.
    */
   append_flrp_to_dead_list(dead_flrp, alu);
}

/**
 * Determines whether a swizzled source is constant with all components the
 * same.
 *
 * The value of the constant is stored in \c result.
 *
 * \return
 * True if all components of the swizzled source are the same constant.
 * Otherwise false is returned.
 */
static bool
all_same_constant(const nir_alu_instr *instr, unsigned src, double *result)
{
   nir_const_value *val = nir_src_as_const_value(instr->src[src].src);

   if (!val)
      return false;

   const uint8_t *const swizzle = instr->src[src].swizzle;
   const unsigned num_components = nir_dest_num_components(instr->dest.dest);

   if (instr->dest.dest.ssa.bit_size == 32) {
      const float first = val[swizzle[0]].f32;

      for (unsigned i = 1; i < num_components; i++) {
         if (val[swizzle[i]].f32 != first)
            return false;
      }

      *result = first;
   } else {
      const double first = val[swizzle[0]].f64;

      for (unsigned i = 1; i < num_components; i++) {
         if (val[swizzle[i]].f64 != first)
            return false;
      }

      *result = first;
   }

   return true;
}

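/**
 * Determines whether sources 0 and 1 are constants with similar magnitudes.
 *
 * "Similar" means the per-component exponents are close enough that taking
 * the difference of the two values does not lose a significant amount of
 * precision.
 */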
static bool
sources_are_constants_with_similar_magnitudes(const nir_alu_instr *instr)
{
   nir_const_value *val0 = nir_src_as_const_value(instr->src[0].src);
   nir_const_value *val1 = nir_src_as_const_value(instr->src[1].src);

   if (val0 == NULL || val1 == NULL)
      return false;

   const uint8_t *const swizzle0 = instr->src[0].swizzle;
   const uint8_t *const swizzle1 = instr->src[1].swizzle;
   const unsigned num_components = nir_dest_num_components(instr->dest.dest);

   if (instr->dest.dest.ssa.bit_size == 32) {
      for (unsigned i = 0; i < num_components; i++) {
         int exp0;
         int exp1;

         frexpf(val0[swizzle0[i]].f32, &exp0);
         frexpf(val1[swizzle1[i]].f32, &exp1);

         /* If the difference between the exponents is >= 24, then A+B will
          * always have the value of whichever of A and B has the larger
          * magnitude.  So, [0, 23] is the valid range.  The smaller the
          * limit value, the more precision will be maintained at a potential
          * performance cost.  Somewhat arbitrarily split the range in half.
          */
         if (abs(exp0 - exp1) > (23 / 2))
            return false;
      }
   } else {
      for (unsigned i = 0; i < num_components; i++) {
         int exp0;
         int exp1;

         frexp(val0[swizzle0[i]].f64, &exp0);
         frexp(val1[swizzle1[i]].f64, &exp1);

         /* If the difference between the exponents is >= 53, then A+B will
          * always have the value of whichever of A and B has the larger
          * magnitude.  So, [0, 52] is the valid range.  The smaller the
          * limit value, the more precision will be maintained at a potential
          * performance cost.  Somewhat arbitrarily split the range in half.
          */
         if (abs(exp0 - exp1) > (52 / 2))
            return false;
      }
   }

   return true;
}

/**
 * Counts of similar types of nir_op_flrp instructions
 *
 * If a similar instruction fits into more than one category, it will only be
 * counted once.  The assumption is that no other instruction will have all
 * sources the same, or CSE would have removed one of the instructions.
 */
struct similar_flrp_stats {
   unsigned src2;
   unsigned src0_and_src2;
   unsigned src1_and_src2;
};

/**
 * Collect counts of similar FLRP instructions.
 *
 * This function only cares about similar instructions that have src2 in
 * common.
 */
static void
get_similar_flrp_stats(nir_alu_instr *alu, struct similar_flrp_stats *st)
{
   memset(st, 0, sizeof(*st));

   nir_foreach_use(other_use, alu->src[2].src.ssa) {
      /* Is the use also a flrp? */
      nir_instr *const other_instr = other_use->parent_instr;
      if (other_instr->type != nir_instr_type_alu)
         continue;

      /* Ahem... don't match the instruction with itself. */
      if (other_instr == &alu->instr)
         continue;

      nir_alu_instr *const other_alu = nir_instr_as_alu(other_instr);
      if (other_alu->op != nir_op_flrp)
         continue;

      /* Does the other flrp use source 2 from the first flrp as its source 2
       * as well?
       */
      if (!nir_alu_srcs_equal(alu, other_alu, 2, 2))
         continue;

      if (nir_alu_srcs_equal(alu, other_alu, 0, 0))
         st->src0_and_src2++;
      else if (nir_alu_srcs_equal(alu, other_alu, 1, 1))
         st->src1_and_src2++;
      else
         st->src2++;
   }
}

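/**
 * Lower a single flrp instruction.
 *
 * The replacement is chosen based on the precision requirements of the
 * instruction, whether ffma is available, and the potential for sharing
 * intermediate calculations with other flrp instructions that use the same
 * sources.
 */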
static void
convert_flrp_instruction(nir_builder *bld,
                         struct u_vector *dead_flrp,
                         nir_alu_instr *alu,
                         bool always_precise)
{
   bool have_ffma = false;
   unsigned bit_size = nir_dest_bit_size(alu->dest.dest);

   if (bit_size == 16)
      have_ffma = !bld->shader->options->lower_ffma16;
   else if (bit_size == 32)
      have_ffma = !bld->shader->options->lower_ffma32;
   else if (bit_size == 64)
      have_ffma = !bld->shader->options->lower_ffma64;
   else
      unreachable("invalid bit_size");

   bld->cursor = nir_before_instr(&alu->instr);

   /* There are two methods to implement flrp(x, y, t).  The strictly correct
    * implementation according to the GLSL spec is:
    *
    *    x(1 - t) + yt
    *
    * This can also be implemented using two chained FMAs
    *
    *    fma(y, t, fma(-x, t, x))
    *
    * This method, using either formulation, has better precision when the
    * difference between x and y is very large.  It guarantees that
    * flrp(x, y, 1) = y.  For example, flrp(1e38, 1.0, 1.0) is 1.0.  This is
    * correct.
    *
    * The other possible implementation is:
    *
    *    x + t(y - x)
    *
    * This can also be formulated as an FMA:
    *
    *    fma(y - x, t, x)
    *
    * For this implementation, flrp(1e38, 1.0, 1.0) is 0.0.  Since 1.0 was
    * expected, that's a pretty significant error.
    *
    * The choice made for lowering depends on a number of factors.
    *
    * - If the flrp is marked precise and FMA is supported:
    *
    *        fma(y, t, fma(-x, t, x))
    *
    *   This is strictly correct (maybe?), and the cost is two FMA
    *   instructions.  It at least maintains the flrp(x, y, 1.0) == y
    *   condition.
    *
    * - If the flrp is marked precise and FMA is not supported:
    *
    *        x(1 - t) + yt
    *
    *   This is strictly correct, and the cost is 4 instructions.  If FMA is
    *   supported, this may or may not be reduced to 3 instructions (a
    *   subtract, a multiply, and an FMA)... but in that case the other
    *   formulation should have been used.
    */
   if (alu->exact) {
      if (have_ffma)
         replace_with_strict_ffma(bld, dead_flrp, alu);
      else
         replace_with_strict(bld, dead_flrp, alu);

      return;
   }

   /*
    * - If x and y are both immediates and the relative magnitude of the
    *   values is similar (such that y-x does not lose too much precision):
    *
    *        x + t(y - x)
    *
    *   We rely on constant folding to eliminate y-x, and we rely on
    *   nir_opt_algebraic to possibly generate an FMA.  The cost is either one
    *   FMA or two instructions.
    */
   if (sources_are_constants_with_similar_magnitudes(alu)) {
      replace_with_fast(bld, dead_flrp, alu);
      return;
   }

   /*
    * - If x = 1:
    *
    *        (yt + -t) + 1
    *
    * - If x = -1:
    *
    *        (yt + t) - 1
    *
    *   In both cases, x is used in place of ±1 for simplicity.  Both forms
    *   lend themselves to ffma generation on platforms that support ffma.
    */
   double src0_as_constant;
   if (all_same_constant(alu, 0, &src0_as_constant)) {
      if (src0_as_constant == 1.0) {
         replace_with_expanded_ffma_and_add(bld, dead_flrp, alu,
                                            true /* subtract t */);
         return;
      } else if (src0_as_constant == -1.0) {
         replace_with_expanded_ffma_and_add(bld, dead_flrp, alu,
                                            false /* add t */);
         return;
      }
   }

   /*
    * - If y = ±1:
    *
    *        x(1 - t) + yt
    *
    *   In this case the multiply in yt will be eliminated by
    *   nir_opt_algebraic.  If FMA is supported, this results in
    *   fma(x, (1 - t), ±t) for two instructions.  If FMA is not supported,
    *   then the cost is 3 instructions.  We rely on nir_opt_algebraic to
    *   generate the FMA instructions as well.
    *
    *   Another possible replacement is
    *
    *        -xt + x ± t
    *
    *   Some groupings of this may be better on some platforms in some
    *   circumstances, but it is probably dependent on scheduling.  Further
    *   investigation may be required.
    */
   double src1_as_constant;
   if ((all_same_constant(alu, 1, &src1_as_constant) &&
        (src1_as_constant == -1.0 || src1_as_constant == 1.0))) {
      replace_with_strict(bld, dead_flrp, alu);
      return;
   }

   if (have_ffma) {
      if (always_precise) {
         replace_with_strict_ffma(bld, dead_flrp, alu);
         return;
      }

      /*
       * - If FMA is supported and another flrp(x, _, t) exists:
       *
       *        fma(y, t, fma(-x, t, x))
       *
       *   The hope is that the inner FMA calculation will be shared with the
       *   other lowered flrp.  This results in two FMA instructions for the
       *   first flrp and one FMA instruction for each additional flrp.  It
       *   also means that the live range for x might be complete after the
       *   inner ffma instead of after the last flrp.
       */
      struct similar_flrp_stats st;

      get_similar_flrp_stats(alu, &st);
      if (st.src0_and_src2 > 0) {
         replace_with_strict_ffma(bld, dead_flrp, alu);
         return;
      }

      /*
       * - If FMA is supported and another flrp(_, y, t) exists:
       *
       *        fma(x, (1 - t), yt)
       *
       *   The hope is that the (1 - t) and the yt will be shared with the
       *   other lowered flrp.  This results in 3 instructions for the first
       *   flrp and 1 for each additional flrp.
       */
      if (st.src1_and_src2 > 0) {
         replace_with_single_ffma(bld, dead_flrp, alu);
         return;
      }
   } else {
      if (always_precise) {
         replace_with_strict(bld, dead_flrp, alu);
         return;
      }

      /*
       * - If FMA is not supported and another flrp(x, _, t) exists:
       *
       *        x(1 - t) + yt
       *
       *   The hope is that the x(1 - t) will be shared with the other lowered
       *   flrp.  This results in 4 instructions for the first flrp and 2 for
       *   each additional flrp.
       *
       * - If FMA is not supported and another flrp(_, y, t) exists:
       *
       *        x(1 - t) + yt
       *
       *   The hope is that the (1 - t) and the yt will be shared with the
       *   other lowered flrp.  This results in 4 instructions for the first
       *   flrp and 2 for each additional flrp.
       */
      struct similar_flrp_stats st;

      get_similar_flrp_stats(alu, &st);
      if (st.src0_and_src2 > 0 || st.src1_and_src2 > 0) {
         replace_with_strict(bld, dead_flrp, alu);
         return;
      }
   }

   /*
    * - If t is constant:
    *
    *        x(1 - t) + yt
    *
    *   The cost is three instructions without FMA or two instructions with
    *   FMA.  This is the same cost as the imprecise lowering, but it gives
    *   the instruction scheduler a little more freedom.
    *
    *   There is no need to handle t = 0.5 specially.  nir_opt_algebraic
    *   already has optimizations to convert 0.5x + 0.5y to 0.5(x + y).
    */
   if (alu->src[2].src.ssa->parent_instr->type == nir_instr_type_load_const) {
      replace_with_strict(bld, dead_flrp, alu);
      return;
   }

   /*
    * - Otherwise
    *
    *        x + t(y - x)
    */
   replace_with_fast(bld, dead_flrp, alu);
}

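/**
 * Lower every flrp in \c impl whose bit size is included in
 * \c lowering_mask.
 */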
static void
lower_flrp_impl(nir_function_impl *impl,
                struct u_vector *dead_flrp,
                unsigned lowering_mask,
                bool always_precise)
{
   nir_builder b;
   nir_builder_init(&b, impl);

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type == nir_instr_type_alu) {
            nir_alu_instr *const alu = nir_instr_as_alu(instr);

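            /* bit_size is exactly 16, 32, or 64, so it can be tested
             * directly against the bitwise-or of bit sizes in lowering_mask.
             */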
            if (alu->op == nir_op_flrp &&
                (alu->dest.dest.ssa.bit_size & lowering_mask)) {
               convert_flrp_instruction(&b, dead_flrp, alu, always_precise);
            }
         }
      }
   }

   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance);
}

/**
 * \param lowering_mask - Bitwise-or of the bit sizes that need to be lowered
 *                        (e.g., 16 | 64 if only 16-bit and 64-bit flrp need
 *                        lowering).
 * \param always_precise - Always use one of the strictly correct lowerings,
 *                         even for flrp instructions that are not marked
 *                         precise.  Whether ffma is used is still determined
 *                         by the shader's lower_ffma* options.
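 *
 * A typical call site (hypothetical; the mask and the flag depend on the
 * backend) might be:
 *
 *    progress |= nir_lower_flrp(shader, 16 | 32 | 64, false);
 */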
bool
nir_lower_flrp(nir_shader *shader,
               unsigned lowering_mask,
               bool always_precise)
{
   struct u_vector dead_flrp;

   if (!u_vector_init_pow2(&dead_flrp, 8, sizeof(struct nir_alu_instr *)))
      return false;

   nir_foreach_function(function, shader) {
      if (function->impl) {
         lower_flrp_impl(function->impl, &dead_flrp, lowering_mask,
                         always_precise);
      }
   }

   /* Progress was made if the dead list is not empty.  Remove all the
    * instructions from the dead list.
    */
   const bool progress = u_vector_length(&dead_flrp) != 0;

   struct nir_alu_instr **instr;
   u_vector_foreach(instr, &dead_flrp)
      nir_instr_remove(&(*instr)->instr);

   u_vector_finish(&dead_flrp);

   return progress;
}