/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir_instr_set.h"
#include "nir_search_helpers.h"
#include "nir_builder.h"
#include "util/u_vector.h"

/* Partial redundancy elimination of compares
 *
 * Searches for comparisons of the form 'a cmp b' that dominate arithmetic
 * instructions like 'b - a'.  The comparison is replaced by the arithmetic
 * instruction, and the result is compared with zero.  For example,
 *
 *       vec1 32 ssa_111 = flt 0.37, ssa_110.w
 *       if ssa_111 {
 *               block block_1:
 *              vec1 32 ssa_112 = fadd ssa_110.w, -0.37
 *              ...
 *
 * becomes
 *
 *       vec1 32 ssa_111 = fadd ssa_110.w, -0.37
 *       vec1 32 ssa_112 = flt 0.0, ssa_111
 *       if ssa_112 {
 *               block block_1:
 *              ...
 */

struct block_queue {
   /**
    * Stack of blocks from the current location in the CFG to the entry point
    * of the function.
    *
    * This is sort of a poor man's dominator tree.
    */
   struct exec_list blocks;

   /** List of freed block_instructions structures that can be reused. */
   struct exec_list reusable_blocks;
};

struct block_instructions {
   struct exec_node node;

   /**
    * Set of comparison instructions from the block that are candidates for
    * being replaced by add instructions.
    */
   struct u_vector instructions;
};

static void
block_queue_init(struct block_queue *bq)
{
   exec_list_make_empty(&bq->blocks);
   exec_list_make_empty(&bq->reusable_blocks);
}

static void
block_queue_finish(struct block_queue *bq)
{
   struct block_instructions *n;

   while ((n = (struct block_instructions *) exec_list_pop_head(&bq->blocks)) != NULL) {
      u_vector_finish(&n->instructions);
      free(n);
   }

   while ((n = (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks)) != NULL) {
      free(n);
   }
}

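/**
 * Get a block_instructions structure, reusing one from the free list when
 * possible, initialize its candidate vector, and push it onto the tail of
 * the block queue.  Returns NULL on allocation failure.
 */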
static struct block_instructions *
push_block(struct block_queue *bq)
{
   struct block_instructions *bi =
      (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks);

   if (bi == NULL) {
      bi = calloc(1, sizeof(struct block_instructions));

      if (bi == NULL)
         return NULL;
   }

   if (!u_vector_init_pow2(&bi->instructions, 8, sizeof(nir_alu_instr *))) {
      free(bi);
      return NULL;
   }

   exec_list_push_tail(&bq->blocks, &bi->node);

   return bi;
}

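/**
 * Remove a block_instructions structure from the block queue and stash it on
 * the free list for later reuse.
 */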
static void
pop_block(struct block_queue *bq, struct block_instructions *bi)
{
   u_vector_finish(&bi->instructions);
   exec_node_remove(&bi->node);
   exec_list_push_head(&bq->reusable_blocks, &bi->node);
}

static void
add_instruction_for_block(struct block_instructions *bi,
                          nir_alu_instr *alu)
{
   nir_alu_instr **data =
      u_vector_add(&bi->instructions);

   *data = alu;
}

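/**
 * Rewrite a dominating comparison as a comparison of the add result with
 * zero.
 *
 * The new fadd and compare are built at the location of the original
 * compare.  Uses of both original instructions are rewritten to the new
 * values, and the originals are removed.
 */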
static void
rewrite_compare_instruction(nir_builder *bld, nir_alu_instr *orig_cmp,
                            nir_alu_instr *orig_add, bool zero_on_left)
{
   bld->cursor = nir_before_instr(&orig_cmp->instr);

   /* This is somewhat tricky.  The compare instruction may be something like
    * (fcmp, a, b) while the add instruction is something like (fadd, fneg(a),
    * b).  This is problematic because the SSA value for the fneg(a) may not
    * exist yet at the compare instruction.
    *
    * We fabricate the operands of the new add.  This is done using
    * information provided by zero_on_left.  If zero_on_left is true, we know
    * the resulting compare instruction is (fcmp, 0.0, (fadd, x, y)).  If the
    * original compare instruction was (fcmp, a, b), x = b and y = -a.  If
    * zero_on_left is false, the resulting compare instruction is (fcmp,
    * (fadd, x, y), 0.0) and x = a and y = -b.
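    *
    * For example, a compare (flt, a, b) with zero_on_left = false becomes
    * (flt, (fadd, a, -b), 0.0), and with zero_on_left = true it becomes
    * (flt, 0.0, (fadd, b, -a)).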
    */
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, orig_cmp, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, orig_cmp, 1);

   nir_ssa_def *const fadd = zero_on_left
      ? nir_fadd(bld, b, nir_fneg(bld, a))
      : nir_fadd(bld, a, nir_fneg(bld, b));

   nir_ssa_def *const zero =
      nir_imm_floatN_t(bld, 0.0, orig_add->dest.dest.ssa.bit_size);

   nir_ssa_def *const cmp = zero_on_left
      ? nir_build_alu(bld, orig_cmp->op, zero, fadd, NULL, NULL)
      : nir_build_alu(bld, orig_cmp->op, fadd, zero, NULL, NULL);

   /* Generating extra moves of the results is the easy way to make sure the
    * writemasks match the original instructions.  Later optimization passes
    * will clean these up.  This is similar to nir_replace_instr (in
    * nir_search.c).
    */
   nir_alu_instr *mov_add = nir_alu_instr_create(bld->shader, nir_op_mov);
   mov_add->dest.write_mask = orig_add->dest.write_mask;
   nir_ssa_dest_init(&mov_add->instr, &mov_add->dest.dest,
                     orig_add->dest.dest.ssa.num_components,
                     orig_add->dest.dest.ssa.bit_size, NULL);
   mov_add->src[0].src = nir_src_for_ssa(fadd);

   nir_builder_instr_insert(bld, &mov_add->instr);

   nir_alu_instr *mov_cmp = nir_alu_instr_create(bld->shader, nir_op_mov);
   mov_cmp->dest.write_mask = orig_cmp->dest.write_mask;
   nir_ssa_dest_init(&mov_cmp->instr, &mov_cmp->dest.dest,
                     orig_cmp->dest.dest.ssa.num_components,
                     orig_cmp->dest.dest.ssa.bit_size, NULL);
   mov_cmp->src[0].src = nir_src_for_ssa(cmp);

   nir_builder_instr_insert(bld, &mov_cmp->instr);

   nir_ssa_def_rewrite_uses(&orig_cmp->dest.dest.ssa,
                            &mov_cmp->dest.dest.ssa);
   nir_ssa_def_rewrite_uses(&orig_add->dest.dest.ssa,
                            &mov_add->dest.dest.ssa);

   /* We know these have no more uses because we just rewrote them all, so we
    * can remove them.
    */
   nir_instr_remove(&orig_cmp->instr);
   nir_instr_remove(&orig_add->instr);
}

static bool
comparison_pre_block(nir_block *block, struct block_queue *bq, nir_builder *bld)
{
   bool progress = false;

   struct block_instructions *bi = push_block(bq);
   if (bi == NULL)
      return false;

   /* Starting with the current block, examine each instruction.  If the
    * instruction is a comparison that matches the '±a cmp ±b' pattern, add it
    * to the block_instructions::instructions set.  If the instruction is an
    * add instruction, walk up the block queue looking at the stored
    * instructions.  If a matching comparison is found, move the addition and
    * replace the comparison with a different comparison based on the result
    * of the addition.  All of the blocks in the queue are guaranteed to be
    * dominators of the current block.
    *
    * After processing the current block, recurse into the blocks dominated by
    * the current block.
    */
   nir_foreach_instr_safe(instr, block) {
      if (instr->type != nir_instr_type_alu)
         continue;

      nir_alu_instr *const alu = nir_instr_as_alu(instr);

      if (alu->dest.dest.ssa.num_components != 1)
         continue;

      if (alu->dest.saturate)
         continue;

      static const uint8_t swizzle[NIR_MAX_VEC_COMPONENTS] = {0};

      switch (alu->op) {
      case nir_op_fadd: {
         /* If the instruction is fadd, check it against comparison
          * instructions that dominate it.
          */
         struct block_instructions *b =
            (struct block_instructions *) exec_list_get_head_raw(&bq->blocks);

         while (b->node.next != NULL) {
            nir_alu_instr **a;
            bool rewrote_compare = false;

            u_vector_foreach(a, &b->instructions) {
               nir_alu_instr *const cmp = *a;

               if (cmp == NULL)
                  continue;

               /* The operands of both instructions are, with some liberty,
                * commutative.  Check all four permutations.  The third and
                * fourth permutations are negations of the first two.
                */
               if ((nir_alu_srcs_equal(cmp, alu, 0, 0) &&
                    nir_alu_srcs_negative_equal(cmp, alu, 1, 1)) ||
                   (nir_alu_srcs_equal(cmp, alu, 0, 1) &&
                    nir_alu_srcs_negative_equal(cmp, alu, 1, 0))) {
                  /* These are the cases where (A cmp B) matches either (A +
                   * -B) or (-B + A)
                   *
                   *    A cmp B <=> A + -B cmp 0
                   */
                  rewrite_compare_instruction(bld, cmp, alu, false);

                  *a = NULL;
                  rewrote_compare = true;
                  break;
               } else if ((nir_alu_srcs_equal(cmp, alu, 1, 0) &&
                           nir_alu_srcs_negative_equal(cmp, alu, 0, 1)) ||
                          (nir_alu_srcs_equal(cmp, alu, 1, 1) &&
                           nir_alu_srcs_negative_equal(cmp, alu, 0, 0))) {
                  /* This is the case where (A cmp B) matches (B + -A) or (-A
                   * + B).
                   *
                   *    A cmp B <=> 0 cmp B + -A
                   */
                  rewrite_compare_instruction(bld, cmp, alu, true);

                  *a = NULL;
                  rewrote_compare = true;
                  break;
               }
            }

            /* Bail after a compare in the most dominating block is found.
             * This is necessary because 'alu' has been removed from the
             * instruction stream.  Should there be a matching compare in
             * another block, calling rewrite_compare_instruction again will
             * try to operate on a node that is not in the list as if it were
             * in the list.
             *
             * FINISHME: There may be opportunity for additional optimization
             * here.  I discovered this problem due to a shader in Guacamelee.
             * It may be possible to rewrite the matching compares that are
             * encountered later to reuse the result from the compare that was
             * first rewritten.  It's also possible that this is just taken
             * care of by calling the optimization pass repeatedly.
             */
            if (rewrote_compare) {
               progress = true;
               break;
            }

            b = (struct block_instructions *) b->node.next;
         }

         break;
      }

      case nir_op_flt:
      case nir_op_fge:
      case nir_op_fneu:
      case nir_op_feq:
         /* If the instruction is a comparison that is used by an if-statement
          * and neither operand is immediate value 0, add it to the set.
          */
         if (is_used_by_if(alu) &&
             is_not_const_zero(NULL, alu, 0, 1, swizzle) &&
             is_not_const_zero(NULL, alu, 1, 1, swizzle))
            add_instruction_for_block(bi, alu);

         break;

      default:
         break;
      }
   }

   for (unsigned i = 0; i < block->num_dom_children; i++) {
      nir_block *child = block->dom_children[i];

      if (comparison_pre_block(child, bq, bld))
         progress = true;
   }

   pop_block(bq, bi);

   return progress;
}
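/**
 * Run comparison partial redundancy elimination on a single function
 * implementation.  Dominance metadata is required and is requested by the
 * pass itself.
 */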
bool
nir_opt_comparison_pre_impl(nir_function_impl *impl)
{
   struct block_queue bq;
   nir_builder bld;

   block_queue_init(&bq);
   nir_builder_init(&bld, impl);

   nir_metadata_require(impl, nir_metadata_dominance);

   const bool progress =
      comparison_pre_block(nir_start_block(impl), &bq, &bld);

   block_queue_finish(&bq);

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}

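/**
 * Shader-level entry point: runs nir_opt_comparison_pre_impl on every
 * function that has an implementation.
 *
 * A typical call site (illustrative only; drivers differ) is a NIR_PASS
 * invocation inside an optimization loop that repeats while progress is
 * made:
 *
 *    NIR_PASS(progress, shader, nir_opt_comparison_pre);
 */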
bool
nir_opt_comparison_pre(nir_shader *shader)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= nir_opt_comparison_pre_impl(function->impl);
   }

   return progress;
}