1/* 2 * Copyright © 2015 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 25/** @file brw_vec4_cmod_propagation.cpp 26 * 27 * Really similar to brw_fs_cmod_propagation but adapted to vec4 needs. Check 28 * brw_fs_cmod_propagation for further details on the rationale behind this 29 * optimization. 30 */ 31 32#include "brw_vec4.h" 33#include "brw_cfg.h" 34#include "brw_eu.h" 35 36namespace brw { 37 38static bool 39writemasks_incompatible(const vec4_instruction *earlier, 40 const vec4_instruction *later) 41{ 42 return (earlier->dst.writemask != WRITEMASK_X && 43 earlier->dst.writemask != WRITEMASK_XYZW) || 44 (earlier->dst.writemask == WRITEMASK_XYZW && 45 later->src[0].swizzle != BRW_SWIZZLE_XYZW) || 46 (later->dst.writemask & ~earlier->dst.writemask) != 0; 47} 48 49static bool 50opt_cmod_propagation_local(bblock_t *block, vec4_visitor *v) 51{ 52 bool progress = false; 53 int ip = block->end_ip + 1; 54 55 foreach_inst_in_block_reverse_safe(vec4_instruction, inst, block) { 56 ip--; 57 58 if ((inst->opcode != BRW_OPCODE_AND && 59 inst->opcode != BRW_OPCODE_CMP && 60 inst->opcode != BRW_OPCODE_MOV) || 61 inst->predicate != BRW_PREDICATE_NONE || 62 !inst->dst.is_null() || 63 (inst->src[0].file != VGRF && inst->src[0].file != ATTR && 64 inst->src[0].file != UNIFORM)) 65 continue; 66 67 /* An ABS source modifier can only be handled when processing a compare 68 * with a value other than zero. 69 */ 70 if (inst->src[0].abs && 71 (inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero())) 72 continue; 73 74 if (inst->opcode == BRW_OPCODE_AND && 75 !(inst->src[1].is_one() && 76 inst->conditional_mod == BRW_CONDITIONAL_NZ && 77 !inst->src[0].negate)) 78 continue; 79 80 if (inst->opcode == BRW_OPCODE_MOV && 81 inst->conditional_mod != BRW_CONDITIONAL_NZ) 82 continue; 83 84 bool read_flag = false; 85 foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst, inst) { 86 /* A CMP with a second source of zero can match with anything. A CMP 87 * with a second source that is not zero can only match with an ADD 88 * instruction. 89 */ 90 if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) { 91 bool negate; 92 93 if (scan_inst->opcode != BRW_OPCODE_ADD) 94 goto not_match; 95 96 if (writemasks_incompatible(scan_inst, inst)) 97 goto not_match; 98 99 /* A CMP is basically a subtraction. The result of the 100 * subtraction must be the same as the result of the addition. 101 * This means that one of the operands must be negated. So (a + 102 * b) vs (a == -b) or (a + -b) vs (a == b). 103 */ 104 if ((inst->src[0].equals(scan_inst->src[0]) && 105 inst->src[1].negative_equals(scan_inst->src[1])) || 106 (inst->src[0].equals(scan_inst->src[1]) && 107 inst->src[1].negative_equals(scan_inst->src[0]))) { 108 negate = false; 109 } else if ((inst->src[0].negative_equals(scan_inst->src[0]) && 110 inst->src[1].equals(scan_inst->src[1])) || 111 (inst->src[0].negative_equals(scan_inst->src[1]) && 112 inst->src[1].equals(scan_inst->src[0]))) { 113 negate = true; 114 } else { 115 goto not_match; 116 } 117 118 if (scan_inst->exec_size != inst->exec_size || 119 scan_inst->group != inst->group) 120 goto not_match; 121 122 /* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods": 123 * 124 * * Note that the [post condition signal] bits generated at 125 * the output of a compute are before the .sat. 126 * 127 * So we don't have to bail if scan_inst has saturate. 128 */ 129 130 /* Otherwise, try propagating the conditional. */ 131 const enum brw_conditional_mod cond = 132 negate ? brw_swap_cmod(inst->conditional_mod) 133 : inst->conditional_mod; 134 135 if (scan_inst->can_do_cmod() && 136 ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) || 137 scan_inst->conditional_mod == cond)) { 138 scan_inst->conditional_mod = cond; 139 inst->remove(block); 140 progress = true; 141 } 142 break; 143 } 144 145 if (regions_overlap(inst->src[0], inst->size_read(0), 146 scan_inst->dst, scan_inst->size_written)) { 147 if ((scan_inst->predicate && scan_inst->opcode != BRW_OPCODE_SEL) || 148 scan_inst->dst.offset != inst->src[0].offset || 149 scan_inst->exec_size != inst->exec_size || 150 scan_inst->group != inst->group) { 151 break; 152 } 153 154 /* If scan_inst is a CMP that produces a single value and inst is 155 * a CMP.NZ that consumes only that value, remove inst. 156 */ 157 if (inst->conditional_mod == BRW_CONDITIONAL_NZ && 158 (inst->src[0].type == BRW_REGISTER_TYPE_D || 159 inst->src[0].type == BRW_REGISTER_TYPE_UD) && 160 (inst->opcode == BRW_OPCODE_CMP || 161 inst->opcode == BRW_OPCODE_MOV) && 162 scan_inst->opcode == BRW_OPCODE_CMP && 163 ((inst->src[0].swizzle == BRW_SWIZZLE_XXXX && 164 scan_inst->dst.writemask == WRITEMASK_X) || 165 (inst->src[0].swizzle == BRW_SWIZZLE_YYYY && 166 scan_inst->dst.writemask == WRITEMASK_Y) || 167 (inst->src[0].swizzle == BRW_SWIZZLE_ZZZZ && 168 scan_inst->dst.writemask == WRITEMASK_Z) || 169 (inst->src[0].swizzle == BRW_SWIZZLE_WWWW && 170 scan_inst->dst.writemask == WRITEMASK_W))) { 171 if (inst->dst.writemask != scan_inst->dst.writemask) { 172 src_reg temp(v, glsl_type::vec4_type, 1); 173 174 /* Given a sequence like: 175 * 176 * cmp.ge.f0(8) g21<1>.zF g20<4>.xF g18<4>.xF 177 * ... 178 * cmp.nz.f0(8) null<1>D g21<4>.zD 0D 179 * 180 * Replace it with something like: 181 * 182 * cmp.ge.f0(8) g22<1>.zF g20<4>.xF g18<4>.xF 183 * mov(8) g21<1>.xF g22<1>.zzzzF 184 * 185 * The added MOV will most likely be removed later. In the 186 * worst case, it should be cheaper to schedule. 187 */ 188 temp.swizzle = brw_swizzle_for_mask(inst->dst.writemask); 189 temp.type = scan_inst->src[0].type; 190 191 vec4_instruction *mov = v->MOV(scan_inst->dst, temp); 192 193 /* Modify the source swizzles on scan_inst. If scan_inst 194 * was 195 * 196 * cmp.ge.f0(8) g21<1>.zF g20<4>.wzyxF g18<4>.yxwzF 197 * 198 * replace it with 199 * 200 * cmp.ge.f0(8) g21<1>.zF g20<4>.yyyyF g18<4>.wwwwF 201 */ 202 unsigned src0_chan; 203 unsigned src1_chan; 204 switch (scan_inst->dst.writemask) { 205 case WRITEMASK_X: 206 src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 0); 207 src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 0); 208 break; 209 case WRITEMASK_Y: 210 src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 1); 211 src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 1); 212 break; 213 case WRITEMASK_Z: 214 src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 2); 215 src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 2); 216 break; 217 case WRITEMASK_W: 218 src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 3); 219 src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 3); 220 break; 221 default: 222 unreachable("Impossible writemask"); 223 } 224 225 scan_inst->src[0].swizzle = BRW_SWIZZLE4(src0_chan, 226 src0_chan, 227 src0_chan, 228 src0_chan); 229 230 /* There's no swizzle on immediate value sources. */ 231 if (scan_inst->src[1].file != IMM) { 232 scan_inst->src[1].swizzle = BRW_SWIZZLE4(src1_chan, 233 src1_chan, 234 src1_chan, 235 src1_chan); 236 } 237 238 scan_inst->dst = dst_reg(temp); 239 scan_inst->dst.writemask = inst->dst.writemask; 240 241 scan_inst->insert_after(block, mov); 242 } 243 244 inst->remove(block); 245 progress = true; 246 break; 247 } 248 249 if (writemasks_incompatible(scan_inst, inst)) 250 break; 251 252 /* CMP's result is the same regardless of dest type. */ 253 if (inst->conditional_mod == BRW_CONDITIONAL_NZ && 254 scan_inst->opcode == BRW_OPCODE_CMP && 255 (inst->dst.type == BRW_REGISTER_TYPE_D || 256 inst->dst.type == BRW_REGISTER_TYPE_UD)) { 257 inst->remove(block); 258 progress = true; 259 break; 260 } 261 262 /* If the AND wasn't handled by the previous case, it isn't safe 263 * to remove it. 264 */ 265 if (inst->opcode == BRW_OPCODE_AND) 266 break; 267 268 /* Comparisons operate differently for ints and floats */ 269 if (scan_inst->dst.type != inst->dst.type && 270 (scan_inst->dst.type == BRW_REGISTER_TYPE_F || 271 inst->dst.type == BRW_REGISTER_TYPE_F)) 272 break; 273 274 /* If the instruction generating inst's source also wrote the 275 * flag, and inst is doing a simple .nz comparison, then inst 276 * is redundant - the appropriate value is already in the flag 277 * register. Delete inst. 278 */ 279 if (inst->conditional_mod == BRW_CONDITIONAL_NZ && 280 !inst->src[0].negate && 281 scan_inst->writes_flag(v->devinfo)) { 282 inst->remove(block); 283 progress = true; 284 break; 285 } 286 287 /* The conditional mod of the CMP/CMPN instructions behaves 288 * specially because the flag output is not calculated from the 289 * result of the instruction, but the other way around, which 290 * means that even if the condmod to propagate and the condmod 291 * from the CMP instruction are the same they will in general give 292 * different results because they are evaluated based on different 293 * inputs. 294 */ 295 if (scan_inst->opcode == BRW_OPCODE_CMP || 296 scan_inst->opcode == BRW_OPCODE_CMPN) 297 break; 298 299 /* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods": 300 * 301 * * Note that the [post condition signal] bits generated at 302 * the output of a compute are before the .sat. 303 */ 304 if (scan_inst->saturate) 305 break; 306 307 /* From the Sky Lake PRM, Vol 2a, "Multiply": 308 * 309 * "When multiplying integer data types, if one of the sources 310 * is a DW, the resulting full precision data is stored in 311 * the accumulator. However, if the destination data type is 312 * either W or DW, the low bits of the result are written to 313 * the destination register and the remaining high bits are 314 * discarded. This results in undefined Overflow and Sign 315 * flags. Therefore, conditional modifiers and saturation 316 * (.sat) cannot be used in this case. 317 * 318 * We just disallow cmod propagation on all integer multiplies. 319 */ 320 if (!brw_reg_type_is_floating_point(scan_inst->dst.type) && 321 scan_inst->opcode == BRW_OPCODE_MUL) 322 break; 323 324 /* Otherwise, try propagating the conditional. */ 325 enum brw_conditional_mod cond = 326 inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod) 327 : inst->conditional_mod; 328 329 if (scan_inst->can_do_cmod() && 330 ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) || 331 scan_inst->conditional_mod == cond)) { 332 scan_inst->conditional_mod = cond; 333 inst->remove(block); 334 progress = true; 335 } 336 break; 337 } 338 339 not_match: 340 if (scan_inst->writes_flag(v->devinfo)) 341 break; 342 343 read_flag = read_flag || scan_inst->reads_flag(); 344 } 345 } 346 347 return progress; 348} 349 350bool 351vec4_visitor::opt_cmod_propagation() 352{ 353 bool progress = false; 354 355 foreach_block_reverse(block, cfg) { 356 progress = opt_cmod_propagation_local(block, this) || progress; 357 } 358 359 if (progress) 360 invalidate_analysis(DEPENDENCY_INSTRUCTIONS); 361 362 return progress; 363} 364 365} /* namespace brw */ 366