1/*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "brw_fs.h"
25#include "brw_cfg.h"
26#include "brw_eu.h"
27
28/** @file brw_fs_cmod_propagation.cpp
29 *
30 * Implements a pass that propagates the conditional modifier from a CMP x 0.0
31 * instruction into the instruction that generated x. For instance, in this
32 * sequence
33 *
34 *    add(8)          g70<1>F    g69<8,8,1>F    4096F
35 *    cmp.ge.f0(8)    null       g70<8,8,1>F    0F
36 *
37 * we can do the comparison as part of the ADD instruction directly:
38 *
39 *    add.ge.f0(8)    g70<1>F    g69<8,8,1>F    4096F
40 *
41 * If there had been a use of the flag register and another CMP using g70
42 *
43 *    add.ge.f0(8)    g70<1>F    g69<8,8,1>F    4096F
44 *    (+f0) sel(8)    g71<F>     g72<8,8,1>F    g73<8,8,1>F
45 *    cmp.ge.f0(8)    null       g70<8,8,1>F    0F
46 *
47 * we can recognize that the CMP is generating the flag value that already
48 * exists and therefore remove the instruction.
49 */
50
51using namespace brw;
52
53static bool
54cmod_propagate_cmp_to_add(const intel_device_info *devinfo, bblock_t *block,
55                          fs_inst *inst)
56{
57   bool read_flag = false;
58   const unsigned flags_written = inst->flags_written(devinfo);
59
60   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
61      if (scan_inst->opcode == BRW_OPCODE_ADD &&
62          !scan_inst->is_partial_write() &&
63          scan_inst->exec_size == inst->exec_size) {
64         bool negate;
65
66         /* A CMP is basically a subtraction.  The result of the
67          * subtraction must be the same as the result of the addition.
68          * This means that one of the operands must be negated.  So (a +
69          * b) vs (a == -b) or (a + -b) vs (a == b).
70          */
71         if ((inst->src[0].equals(scan_inst->src[0]) &&
72              inst->src[1].negative_equals(scan_inst->src[1])) ||
73             (inst->src[0].equals(scan_inst->src[1]) &&
74              inst->src[1].negative_equals(scan_inst->src[0]))) {
75            negate = false;
76         } else if ((inst->src[0].negative_equals(scan_inst->src[0]) &&
77                     inst->src[1].equals(scan_inst->src[1])) ||
78                    (inst->src[0].negative_equals(scan_inst->src[1]) &&
79                     inst->src[1].equals(scan_inst->src[0]))) {
80            negate = true;
81         } else {
82            goto not_match;
83         }
84
85         /* If the scan instruction writes a different flag register than the
86          * instruction we're trying to propagate from, bail.
87          *
88          * FINISHME: The second part of the condition may be too strong.
89          * Perhaps (scan_inst->flags_written() & flags_written) !=
90          * flags_written?
91          */
92         if (scan_inst->flags_written(devinfo) != 0 &&
93             scan_inst->flags_written(devinfo) != flags_written)
94            goto not_match;
95
96         /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
97          *
98          *    * Note that the [post condition signal] bits generated at
99          *      the output of a compute are before the .sat.
100          *
101          * Paragraph about post_zero does not mention saturation, but
102          * testing it on actual GPUs shows that conditional modifiers
103          * are applied after saturation.
104          *
105          *    * post_zero bit: This bit reflects whether the final
106          *      result is zero after all the clamping, normalizing,
107          *      or format conversion logic.
108          *
109          * For signed types we don't care about saturation: it won't
110          * change the result of conditional modifier.
111          *
112          * For floating and unsigned types there two special cases,
113          * when we can remove inst even if scan_inst is saturated: G
114          * and LE. Since conditional modifiers are just comparisons
115          * against zero, saturating positive values to the upper
116          * limit never changes the result of comparison.
117          *
118          * For negative values:
119          * (sat(x) >  0) == (x >  0) --- false
120          * (sat(x) <= 0) == (x <= 0) --- true
121          */
122         const enum brw_conditional_mod cond =
123            negate ? brw_swap_cmod(inst->conditional_mod)
124            : inst->conditional_mod;
125
126         if (scan_inst->saturate &&
127             (brw_reg_type_is_floating_point(scan_inst->dst.type) ||
128              brw_reg_type_is_unsigned_integer(scan_inst->dst.type)) &&
129             (cond != BRW_CONDITIONAL_G &&
130              cond != BRW_CONDITIONAL_LE))
131            goto not_match;
132
133         /* Otherwise, try propagating the conditional. */
134         if (scan_inst->can_do_cmod() &&
135             ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
136              scan_inst->conditional_mod == cond)) {
137            scan_inst->conditional_mod = cond;
138            scan_inst->flag_subreg = inst->flag_subreg;
139            inst->remove(block, true);
140            return true;
141         }
142         break;
143      }
144
145   not_match:
146      if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
147         break;
148
149      read_flag = read_flag ||
150                  (scan_inst->flags_read(devinfo) & flags_written) != 0;
151   }
152
153   return false;
154}
155
156/**
157 * Propagate conditional modifiers from NOT instructions
158 *
159 * Attempt to convert sequences like
160 *
161 *    or(8)           g78<8,8,1>      g76<8,8,1>UD    g77<8,8,1>UD
162 *    ...
163 *    not.nz.f0(8)    null            g78<8,8,1>UD
164 *
165 * into
166 *
167 *    or.z.f0(8)      g78<8,8,1>      g76<8,8,1>UD    g77<8,8,1>UD
168 */
169static bool
170cmod_propagate_not(const intel_device_info *devinfo, bblock_t *block,
171                   fs_inst *inst)
172{
173   const enum brw_conditional_mod cond = brw_negate_cmod(inst->conditional_mod);
174   bool read_flag = false;
175   const unsigned flags_written = inst->flags_written(devinfo);
176
177   if (cond != BRW_CONDITIONAL_Z && cond != BRW_CONDITIONAL_NZ)
178      return false;
179
180   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
181      if (regions_overlap(scan_inst->dst, scan_inst->size_written,
182                          inst->src[0], inst->size_read(0))) {
183         if (scan_inst->opcode != BRW_OPCODE_OR &&
184             scan_inst->opcode != BRW_OPCODE_AND)
185            break;
186
187         if (scan_inst->is_partial_write() ||
188             scan_inst->dst.offset != inst->src[0].offset ||
189             scan_inst->exec_size != inst->exec_size)
190            break;
191
192         /* If the scan instruction writes a different flag register than the
193          * instruction we're trying to propagate from, bail.
194          *
195          * FINISHME: The second part of the condition may be too strong.
196          * Perhaps (scan_inst->flags_written() & flags_written) !=
197          * flags_written?
198          */
199         if (scan_inst->flags_written(devinfo) != 0 &&
200             scan_inst->flags_written(devinfo) != flags_written)
201            break;
202
203         if (scan_inst->can_do_cmod() &&
204             ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
205              scan_inst->conditional_mod == cond)) {
206            scan_inst->conditional_mod = cond;
207            scan_inst->flag_subreg = inst->flag_subreg;
208            inst->remove(block, true);
209            return true;
210         }
211         break;
212      }
213
214      if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
215         break;
216
217      read_flag = read_flag ||
218                  (scan_inst->flags_read(devinfo) & flags_written) != 0;
219   }
220
221   return false;
222}
223
224static bool
225opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
226{
227   bool progress = false;
228   int ip = block->end_ip + 1;
229
230   foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
231      ip--;
232
233      if ((inst->opcode != BRW_OPCODE_AND &&
234           inst->opcode != BRW_OPCODE_CMP &&
235           inst->opcode != BRW_OPCODE_MOV &&
236           inst->opcode != BRW_OPCODE_NOT) ||
237          inst->predicate != BRW_PREDICATE_NONE ||
238          !inst->dst.is_null() ||
239          (inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
240           inst->src[0].file != UNIFORM))
241         continue;
242
243      /* An ABS source modifier can only be handled when processing a compare
244       * with a value other than zero.
245       */
246      if (inst->src[0].abs &&
247          (inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero()))
248         continue;
249
250      /* Only an AND.NZ can be propagated.  Many AND.Z instructions are
251       * generated (for ir_unop_not in fs_visitor::emit_bool_to_cond_code).
252       * Propagating those would require inverting the condition on the CMP.
253       * This changes both the flag value and the register destination of the
254       * CMP.  That result may be used elsewhere, so we can't change its value
255       * on a whim.
256       */
257      if (inst->opcode == BRW_OPCODE_AND &&
258          !(inst->src[1].is_one() &&
259            inst->conditional_mod == BRW_CONDITIONAL_NZ &&
260            !inst->src[0].negate))
261         continue;
262
263      /* A CMP with a second source of zero can match with anything.  A CMP
264       * with a second source that is not zero can only match with an ADD
265       * instruction.
266       *
267       * Only apply this optimization to float-point sources.  It can fail for
268       * integers.  For inputs a = 0x80000000, b = 4, int(0x80000000) < 4, but
269       * int(0x80000000) - 4 overflows and results in 0x7ffffffc.  that's not
270       * less than zero, so the flags get set differently than for (a < b).
271       */
272      if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
273         if (brw_reg_type_is_floating_point(inst->src[0].type) &&
274             cmod_propagate_cmp_to_add(devinfo, block, inst))
275            progress = true;
276
277         continue;
278      }
279
280      if (inst->opcode == BRW_OPCODE_NOT) {
281         progress = cmod_propagate_not(devinfo, block, inst) || progress;
282         continue;
283      }
284
285      bool read_flag = false;
286      const unsigned flags_written = inst->flags_written(devinfo);
287      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
288         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
289                             inst->src[0], inst->size_read(0))) {
290            /* If the scan instruction writes a different flag register than
291             * the instruction we're trying to propagate from, bail.
292             *
293             * FINISHME: The second part of the condition may be too strong.
294             * Perhaps (scan_inst->flags_written() & flags_written) !=
295             * flags_written?
296             */
297            if (scan_inst->flags_written(devinfo) != 0 &&
298                scan_inst->flags_written(devinfo) != flags_written)
299               break;
300
301            if (scan_inst->is_partial_write() ||
302                scan_inst->dst.offset != inst->src[0].offset ||
303                scan_inst->exec_size != inst->exec_size)
304               break;
305
306            /* CMP's result is the same regardless of dest type. */
307            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
308                scan_inst->opcode == BRW_OPCODE_CMP &&
309                brw_reg_type_is_integer(inst->dst.type)) {
310               inst->remove(block, true);
311               progress = true;
312               break;
313            }
314
315            /* If the AND wasn't handled by the previous case, it isn't safe
316             * to remove it.
317             */
318            if (inst->opcode == BRW_OPCODE_AND)
319               break;
320
321            if (inst->opcode == BRW_OPCODE_MOV) {
322               if (brw_reg_type_is_floating_point(scan_inst->dst.type)) {
323                  /* If the destination type of scan_inst is floating-point,
324                   * then:
325                   *
326                   * - The source of the MOV instruction must be the same
327                   *   type.
328                   *
329                   * - The destination of the MOV instruction must be float
330                   *   point with a size at least as large as the destination
331                   *   of inst.  Size-reducing f2f conversions could cause
332                   *   non-zero values to become zero, etc.
333                   */
334                  if (scan_inst->dst.type != inst->src[0].type)
335                     break;
336
337                  if (!brw_reg_type_is_floating_point(inst->dst.type))
338                     break;
339
340                  if (type_sz(scan_inst->dst.type) > type_sz(inst->dst.type))
341                     break;
342               } else {
343                  /* If the destination type of scan_inst is integer, then:
344                   *
345                   * - The source of the MOV instruction must be integer with
346                   *   the same size.
347                   *
348                   * - If the conditional modifier is Z or NZ, then the
349                   *   destination type of inst must either be floating point
350                   *   (of any size) or integer with a size at least as large
351                   *   as the destination of inst.
352                   *
353                   * - If the conditional modifier is neither Z nor NZ, then the
354                   *   destination type of inst must either be floating point
355                   *   (of any size) or integer with a size at least as large
356                   *   as the destination of inst and the same signedness.
357                   */
358                  if (!brw_reg_type_is_integer(inst->src[0].type) ||
359                      type_sz(scan_inst->dst.type) != type_sz(inst->src[0].type))
360                     break;
361
362                  if (brw_reg_type_is_integer(inst->dst.type)) {
363                     if (type_sz(inst->dst.type) < type_sz(scan_inst->dst.type))
364                        break;
365
366                     if (inst->conditional_mod != BRW_CONDITIONAL_Z &&
367                         inst->conditional_mod != BRW_CONDITIONAL_NZ &&
368                         brw_reg_type_is_unsigned_integer(inst->dst.type) !=
369                         brw_reg_type_is_unsigned_integer(scan_inst->dst.type))
370                        break;
371                  }
372               }
373            } else {
374               /* Not safe to use inequality operators if the types are
375                * different.
376                */
377               if (scan_inst->dst.type != inst->src[0].type &&
378                   inst->conditional_mod != BRW_CONDITIONAL_Z &&
379                   inst->conditional_mod != BRW_CONDITIONAL_NZ)
380                  break;
381
382               /* Comparisons operate differently for ints and floats */
383               if (scan_inst->dst.type != inst->dst.type) {
384                  /* Comparison result may be altered if the bit-size changes
385                   * since that affects range, denorms, etc
386                   */
387                  if (type_sz(scan_inst->dst.type) != type_sz(inst->dst.type))
388                     break;
389
390                  if (brw_reg_type_is_floating_point(scan_inst->dst.type) !=
391                      brw_reg_type_is_floating_point(inst->dst.type))
392                     break;
393               }
394            }
395
396            /* Knowing following:
397             * - CMP writes to flag register the result of
398             *   applying cmod to the `src0 - src1`.
399             *   After that it stores the same value to dst.
400             *   Other instructions first store their result to
401             *   dst, and then store cmod(dst) to the flag
402             *   register.
403             * - inst is either CMP or MOV
404             * - inst->dst is null
405             * - inst->src[0] overlaps with scan_inst->dst
406             * - inst->src[1] is zero
407             * - scan_inst wrote to a flag register
408             *
409             * There can be three possible paths:
410             *
411             * - scan_inst is CMP:
412             *
413             *   Considering that src0 is either 0x0 (false),
414             *   or 0xffffffff (true), and src1 is 0x0:
415             *
416             *   - If inst's cmod is NZ, we can always remove
417             *     scan_inst: NZ is invariant for false and true. This
418             *     holds even if src0 is NaN: .nz is the only cmod,
419             *     that returns true for NaN.
420             *
421             *   - .g is invariant if src0 has a UD type
422             *
423             *   - .l is invariant if src0 has a D type
424             *
425             * - scan_inst and inst have the same cmod:
426             *
427             *   If scan_inst is anything than CMP, it already
428             *   wrote the appropriate value to the flag register.
429             *
430             * - else:
431             *
432             *   We can change cmod of scan_inst to that of inst,
433             *   and remove inst. It is valid as long as we make
434             *   sure that no instruction uses the flag register
435             *   between scan_inst and inst.
436             */
437            if (!inst->src[0].negate &&
438                scan_inst->flags_written(devinfo)) {
439               if (scan_inst->opcode == BRW_OPCODE_CMP) {
440                  if ((inst->conditional_mod == BRW_CONDITIONAL_NZ) ||
441                      (inst->conditional_mod == BRW_CONDITIONAL_G &&
442                       inst->src[0].type == BRW_REGISTER_TYPE_UD) ||
443                      (inst->conditional_mod == BRW_CONDITIONAL_L &&
444                       inst->src[0].type == BRW_REGISTER_TYPE_D)) {
445                     inst->remove(block, true);
446                     progress = true;
447                     break;
448                  }
449               } else if (scan_inst->conditional_mod == inst->conditional_mod) {
450                  /* On Gfx4 and Gfx5 sel.cond will dirty the flags, but the
451                   * flags value is not based on the result stored in the
452                   * destination.  On all other platforms sel.cond will not
453                   * write the flags, so execution will not get to this point.
454                   */
455                  if (scan_inst->opcode == BRW_OPCODE_SEL) {
456                     assert(devinfo->ver <= 5);
457                  } else {
458                     inst->remove(block, true);
459                     progress = true;
460                  }
461
462                  break;
463               } else if (!read_flag && scan_inst->can_do_cmod()) {
464                  scan_inst->conditional_mod = inst->conditional_mod;
465                  scan_inst->flag_subreg = inst->flag_subreg;
466                  inst->remove(block, true);
467                  progress = true;
468                  break;
469               }
470            }
471
472            /* The conditional mod of the CMP/CMPN instructions behaves
473             * specially because the flag output is not calculated from the
474             * result of the instruction, but the other way around, which
475             * means that even if the condmod to propagate and the condmod
476             * from the CMP instruction are the same they will in general give
477             * different results because they are evaluated based on different
478             * inputs.
479             */
480            if (scan_inst->opcode == BRW_OPCODE_CMP ||
481                scan_inst->opcode == BRW_OPCODE_CMPN)
482               break;
483
484            /* From the Sky Lake PRM, Vol 2a, "Multiply":
485             *
486             *    "When multiplying integer data types, if one of the sources
487             *     is a DW, the resulting full precision data is stored in
488             *     the accumulator. However, if the destination data type is
489             *     either W or DW, the low bits of the result are written to
490             *     the destination register and the remaining high bits are
491             *     discarded. This results in undefined Overflow and Sign
492             *     flags. Therefore, conditional modifiers and saturation
493             *     (.sat) cannot be used in this case."
494             *
495             * We just disallow cmod propagation on all integer multiplies.
496             */
497            if (!brw_reg_type_is_floating_point(scan_inst->dst.type) &&
498                scan_inst->opcode == BRW_OPCODE_MUL)
499               break;
500
501            enum brw_conditional_mod cond =
502               inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
503                                   : inst->conditional_mod;
504
505            /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
506             *
507             *    * Note that the [post condition signal] bits generated at
508             *      the output of a compute are before the .sat.
509             *
510             * Paragraph about post_zero does not mention saturation, but
511             * testing it on actual GPUs shows that conditional modifiers are
512             * applied after saturation.
513             *
514             *    * post_zero bit: This bit reflects whether the final
515             *      result is zero after all the clamping, normalizing,
516             *      or format conversion logic.
517             *
518             * For this reason, no additional restrictions are necessary on
519             * instructions with saturate.
520             */
521
522            /* Otherwise, try propagating the conditional. */
523            if (scan_inst->can_do_cmod() &&
524                ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
525                 scan_inst->conditional_mod == cond)) {
526               scan_inst->conditional_mod = cond;
527               scan_inst->flag_subreg = inst->flag_subreg;
528               inst->remove(block, true);
529               progress = true;
530            }
531            break;
532         }
533
534         if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
535            break;
536
537         read_flag = read_flag ||
538                     (scan_inst->flags_read(devinfo) & flags_written) != 0;
539      }
540   }
541
542   /* There is progress if and only if instructions were removed. */
543   assert(progress == (block->end_ip_delta != 0));
544
545   return progress;
546}
547
548bool
549fs_visitor::opt_cmod_propagation()
550{
551   bool progress = false;
552
553   foreach_block_reverse(block, cfg) {
554      progress = opt_cmod_propagation_local(devinfo, block) || progress;
555   }
556
557   if (progress) {
558      cfg->adjust_block_ips();
559
560      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
561   }
562
563   return progress;
564}
565