1/*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25/** @file brw_vec4_cmod_propagation.cpp
26 *
27 * Really similar to brw_fs_cmod_propagation but adapted to vec4 needs. Check
28 * brw_fs_cmod_propagation for further details on the rationale behind this
29 * optimization.
30 */
31
32#include "brw_vec4.h"
33#include "brw_cfg.h"
34#include "brw_eu.h"
35
36namespace brw {
37
38static bool
39writemasks_incompatible(const vec4_instruction *earlier,
40                        const vec4_instruction *later)
41{
42   return (earlier->dst.writemask != WRITEMASK_X &&
43           earlier->dst.writemask != WRITEMASK_XYZW) ||
44          (earlier->dst.writemask == WRITEMASK_XYZW &&
45           later->src[0].swizzle != BRW_SWIZZLE_XYZW) ||
46          (later->dst.writemask & ~earlier->dst.writemask) != 0;
47}
48
49static bool
50opt_cmod_propagation_local(bblock_t *block, vec4_visitor *v)
51{
52   bool progress = false;
53   int ip = block->end_ip + 1;
54
55   foreach_inst_in_block_reverse_safe(vec4_instruction, inst, block) {
56      ip--;
57
58      if ((inst->opcode != BRW_OPCODE_AND &&
59           inst->opcode != BRW_OPCODE_CMP &&
60           inst->opcode != BRW_OPCODE_MOV) ||
61          inst->predicate != BRW_PREDICATE_NONE ||
62          !inst->dst.is_null() ||
63          (inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
64           inst->src[0].file != UNIFORM))
65         continue;
66
67      /* An ABS source modifier can only be handled when processing a compare
68       * with a value other than zero.
69       */
70      if (inst->src[0].abs &&
71          (inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero()))
72         continue;
73
74      if (inst->opcode == BRW_OPCODE_AND &&
75          !(inst->src[1].is_one() &&
76            inst->conditional_mod == BRW_CONDITIONAL_NZ &&
77            !inst->src[0].negate))
78         continue;
79
80      if (inst->opcode == BRW_OPCODE_MOV &&
81          inst->conditional_mod != BRW_CONDITIONAL_NZ)
82         continue;
83
84      bool read_flag = false;
85      foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst, inst) {
86         /* A CMP with a second source of zero can match with anything.  A CMP
87          * with a second source that is not zero can only match with an ADD
88          * instruction.
89          */
90         if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
91            bool negate;
92
93            if (scan_inst->opcode != BRW_OPCODE_ADD)
94               goto not_match;
95
96            if (writemasks_incompatible(scan_inst, inst))
97               goto not_match;
98
99            /* A CMP is basically a subtraction.  The result of the
100             * subtraction must be the same as the result of the addition.
101             * This means that one of the operands must be negated.  So (a +
102             * b) vs (a == -b) or (a + -b) vs (a == b).
103             */
104            if ((inst->src[0].equals(scan_inst->src[0]) &&
105                 inst->src[1].negative_equals(scan_inst->src[1])) ||
106                (inst->src[0].equals(scan_inst->src[1]) &&
107                 inst->src[1].negative_equals(scan_inst->src[0]))) {
108               negate = false;
109            } else if ((inst->src[0].negative_equals(scan_inst->src[0]) &&
110                        inst->src[1].equals(scan_inst->src[1])) ||
111                       (inst->src[0].negative_equals(scan_inst->src[1]) &&
112                        inst->src[1].equals(scan_inst->src[0]))) {
113               negate = true;
114            } else {
115               goto not_match;
116            }
117
118            if (scan_inst->exec_size != inst->exec_size ||
119                scan_inst->group != inst->group)
120               goto not_match;
121
122            /* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods":
123             *
124             *    * Note that the [post condition signal] bits generated at
125             *      the output of a compute are before the .sat.
126             *
127             * So we don't have to bail if scan_inst has saturate.
128             */
129
130            /* Otherwise, try propagating the conditional. */
131            const enum brw_conditional_mod cond =
132               negate ? brw_swap_cmod(inst->conditional_mod)
133                      : inst->conditional_mod;
134
135            if (scan_inst->can_do_cmod() &&
136                ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
137                 scan_inst->conditional_mod == cond)) {
138               scan_inst->conditional_mod = cond;
139               inst->remove(block);
140               progress = true;
141            }
142            break;
143         }
144
145         if (regions_overlap(inst->src[0], inst->size_read(0),
146                             scan_inst->dst, scan_inst->size_written)) {
147            if ((scan_inst->predicate && scan_inst->opcode != BRW_OPCODE_SEL) ||
148                scan_inst->dst.offset != inst->src[0].offset ||
149                scan_inst->exec_size != inst->exec_size ||
150                scan_inst->group != inst->group) {
151               break;
152            }
153
154            /* If scan_inst is a CMP that produces a single value and inst is
155             * a CMP.NZ that consumes only that value, remove inst.
156             */
157            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
158                (inst->src[0].type == BRW_REGISTER_TYPE_D ||
159                 inst->src[0].type == BRW_REGISTER_TYPE_UD) &&
160                (inst->opcode == BRW_OPCODE_CMP ||
161                 inst->opcode == BRW_OPCODE_MOV) &&
162                scan_inst->opcode == BRW_OPCODE_CMP &&
163                ((inst->src[0].swizzle == BRW_SWIZZLE_XXXX &&
164                  scan_inst->dst.writemask == WRITEMASK_X) ||
165                 (inst->src[0].swizzle == BRW_SWIZZLE_YYYY &&
166                  scan_inst->dst.writemask == WRITEMASK_Y) ||
167                 (inst->src[0].swizzle == BRW_SWIZZLE_ZZZZ &&
168                  scan_inst->dst.writemask == WRITEMASK_Z) ||
169                 (inst->src[0].swizzle == BRW_SWIZZLE_WWWW &&
170                  scan_inst->dst.writemask == WRITEMASK_W))) {
171               if (inst->dst.writemask != scan_inst->dst.writemask) {
172                  src_reg temp(v, glsl_type::vec4_type, 1);
173
174                  /* Given a sequence like:
175                   *
176                   *    cmp.ge.f0(8)  g21<1>.zF      g20<4>.xF      g18<4>.xF
177                   *    ...
178                   *    cmp.nz.f0(8)  null<1>D       g21<4>.zD      0D
179                   *
180                   * Replace it with something like:
181                   *
182                   *    cmp.ge.f0(8)  g22<1>.zF      g20<4>.xF      g18<4>.xF
183                   *    mov(8)        g21<1>.xF      g22<1>.zzzzF
184                   *
185                   * The added MOV will most likely be removed later.  In the
186                   * worst case, it should be cheaper to schedule.
187                   */
188                  temp.swizzle = brw_swizzle_for_mask(inst->dst.writemask);
189                  temp.type = scan_inst->src[0].type;
190
191                  vec4_instruction *mov = v->MOV(scan_inst->dst, temp);
192
193                  /* Modify the source swizzles on scan_inst.  If scan_inst
194                   * was
195                   *
196                   *    cmp.ge.f0(8)  g21<1>.zF      g20<4>.wzyxF   g18<4>.yxwzF
197                   *
198                   * replace it with
199                   *
200                   *    cmp.ge.f0(8)  g21<1>.zF      g20<4>.yyyyF   g18<4>.wwwwF
201                   */
202                  unsigned src0_chan;
203                  unsigned src1_chan;
204                  switch (scan_inst->dst.writemask) {
205                  case WRITEMASK_X:
206                     src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 0);
207                     src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 0);
208                     break;
209                  case WRITEMASK_Y:
210                     src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 1);
211                     src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 1);
212                     break;
213                  case WRITEMASK_Z:
214                     src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 2);
215                     src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 2);
216                     break;
217                  case WRITEMASK_W:
218                     src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 3);
219                     src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 3);
220                     break;
221                  default:
222                     unreachable("Impossible writemask");
223                  }
224
225                  scan_inst->src[0].swizzle = BRW_SWIZZLE4(src0_chan,
226                                                           src0_chan,
227                                                           src0_chan,
228                                                           src0_chan);
229
230                  /* There's no swizzle on immediate value sources. */
231                  if (scan_inst->src[1].file != IMM) {
232                     scan_inst->src[1].swizzle = BRW_SWIZZLE4(src1_chan,
233                                                              src1_chan,
234                                                              src1_chan,
235                                                              src1_chan);
236                  }
237
238                  scan_inst->dst = dst_reg(temp);
239                  scan_inst->dst.writemask = inst->dst.writemask;
240
241                  scan_inst->insert_after(block, mov);
242               }
243
244               inst->remove(block);
245               progress = true;
246               break;
247            }
248
249            if (writemasks_incompatible(scan_inst, inst))
250               break;
251
252            /* CMP's result is the same regardless of dest type. */
253            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
254                scan_inst->opcode == BRW_OPCODE_CMP &&
255                (inst->dst.type == BRW_REGISTER_TYPE_D ||
256                 inst->dst.type == BRW_REGISTER_TYPE_UD)) {
257               inst->remove(block);
258               progress = true;
259               break;
260            }
261
262            /* If the AND wasn't handled by the previous case, it isn't safe
263             * to remove it.
264             */
265            if (inst->opcode == BRW_OPCODE_AND)
266               break;
267
268            /* Comparisons operate differently for ints and floats */
269            if (scan_inst->dst.type != inst->dst.type &&
270                (scan_inst->dst.type == BRW_REGISTER_TYPE_F ||
271                 inst->dst.type == BRW_REGISTER_TYPE_F))
272               break;
273
274            /* If the instruction generating inst's source also wrote the
275             * flag, and inst is doing a simple .nz comparison, then inst
276             * is redundant - the appropriate value is already in the flag
277             * register.  Delete inst.
278             */
279            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
280                !inst->src[0].negate &&
281                scan_inst->writes_flag(v->devinfo)) {
282               inst->remove(block);
283               progress = true;
284               break;
285            }
286
287            /* The conditional mod of the CMP/CMPN instructions behaves
288             * specially because the flag output is not calculated from the
289             * result of the instruction, but the other way around, which
290             * means that even if the condmod to propagate and the condmod
291             * from the CMP instruction are the same they will in general give
292             * different results because they are evaluated based on different
293             * inputs.
294             */
295            if (scan_inst->opcode == BRW_OPCODE_CMP ||
296                scan_inst->opcode == BRW_OPCODE_CMPN)
297               break;
298
299            /* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods":
300             *
301             *    * Note that the [post condition signal] bits generated at
302             *      the output of a compute are before the .sat.
303             */
304            if (scan_inst->saturate)
305               break;
306
307            /* From the Sky Lake PRM, Vol 2a, "Multiply":
308             *
309             *    "When multiplying integer data types, if one of the sources
310             *    is a DW, the resulting full precision data is stored in
311             *    the accumulator. However, if the destination data type is
312             *    either W or DW, the low bits of the result are written to
313             *    the destination register and the remaining high bits are
314             *    discarded. This results in undefined Overflow and Sign
315             *    flags. Therefore, conditional modifiers and saturation
316             *    (.sat) cannot be used in this case.
317             *
318             * We just disallow cmod propagation on all integer multiplies.
319             */
320            if (!brw_reg_type_is_floating_point(scan_inst->dst.type) &&
321                scan_inst->opcode == BRW_OPCODE_MUL)
322               break;
323
324            /* Otherwise, try propagating the conditional. */
325            enum brw_conditional_mod cond =
326               inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
327                                   : inst->conditional_mod;
328
329            if (scan_inst->can_do_cmod() &&
330                ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
331                 scan_inst->conditional_mod == cond)) {
332               scan_inst->conditional_mod = cond;
333               inst->remove(block);
334               progress = true;
335            }
336            break;
337         }
338
339      not_match:
340         if (scan_inst->writes_flag(v->devinfo))
341            break;
342
343         read_flag = read_flag || scan_inst->reads_flag();
344      }
345   }
346
347   return progress;
348}
349
350bool
351vec4_visitor::opt_cmod_propagation()
352{
353   bool progress = false;
354
355   foreach_block_reverse(block, cfg) {
356      progress = opt_cmod_propagation_local(block, this) || progress;
357   }
358
359   if (progress)
360      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
361
362   return progress;
363}
364
365} /* namespace brw */
366