1/* 2 * Copyright © 2014 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "brw_fs.h" 25#include "brw_cfg.h" 26#include "brw_eu.h" 27 28/** @file brw_fs_cmod_propagation.cpp 29 * 30 * Implements a pass that propagates the conditional modifier from a CMP x 0.0 31 * instruction into the instruction that generated x. For instance, in this 32 * sequence 33 * 34 * add(8) g70<1>F g69<8,8,1>F 4096F 35 * cmp.ge.f0(8) null g70<8,8,1>F 0F 36 * 37 * we can do the comparison as part of the ADD instruction directly: 38 * 39 * add.ge.f0(8) g70<1>F g69<8,8,1>F 4096F 40 * 41 * If there had been a use of the flag register and another CMP using g70 42 * 43 * add.ge.f0(8) g70<1>F g69<8,8,1>F 4096F 44 * (+f0) sel(8) g71<F> g72<8,8,1>F g73<8,8,1>F 45 * cmp.ge.f0(8) null g70<8,8,1>F 0F 46 * 47 * we can recognize that the CMP is generating the flag value that already 48 * exists and therefore remove the instruction. 49 */ 50 51using namespace brw; 52 53static bool 54cmod_propagate_cmp_to_add(const intel_device_info *devinfo, bblock_t *block, 55 fs_inst *inst) 56{ 57 bool read_flag = false; 58 const unsigned flags_written = inst->flags_written(devinfo); 59 60 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { 61 if (scan_inst->opcode == BRW_OPCODE_ADD && 62 !scan_inst->is_partial_write() && 63 scan_inst->exec_size == inst->exec_size) { 64 bool negate; 65 66 /* A CMP is basically a subtraction. The result of the 67 * subtraction must be the same as the result of the addition. 68 * This means that one of the operands must be negated. So (a + 69 * b) vs (a == -b) or (a + -b) vs (a == b). 70 */ 71 if ((inst->src[0].equals(scan_inst->src[0]) && 72 inst->src[1].negative_equals(scan_inst->src[1])) || 73 (inst->src[0].equals(scan_inst->src[1]) && 74 inst->src[1].negative_equals(scan_inst->src[0]))) { 75 negate = false; 76 } else if ((inst->src[0].negative_equals(scan_inst->src[0]) && 77 inst->src[1].equals(scan_inst->src[1])) || 78 (inst->src[0].negative_equals(scan_inst->src[1]) && 79 inst->src[1].equals(scan_inst->src[0]))) { 80 negate = true; 81 } else { 82 goto not_match; 83 } 84 85 /* If the scan instruction writes a different flag register than the 86 * instruction we're trying to propagate from, bail. 87 * 88 * FINISHME: The second part of the condition may be too strong. 89 * Perhaps (scan_inst->flags_written() & flags_written) != 90 * flags_written? 91 */ 92 if (scan_inst->flags_written(devinfo) != 0 && 93 scan_inst->flags_written(devinfo) != flags_written) 94 goto not_match; 95 96 /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags": 97 * 98 * * Note that the [post condition signal] bits generated at 99 * the output of a compute are before the .sat. 100 * 101 * Paragraph about post_zero does not mention saturation, but 102 * testing it on actual GPUs shows that conditional modifiers 103 * are applied after saturation. 104 * 105 * * post_zero bit: This bit reflects whether the final 106 * result is zero after all the clamping, normalizing, 107 * or format conversion logic. 108 * 109 * For signed types we don't care about saturation: it won't 110 * change the result of conditional modifier. 111 * 112 * For floating and unsigned types there two special cases, 113 * when we can remove inst even if scan_inst is saturated: G 114 * and LE. Since conditional modifiers are just comparisons 115 * against zero, saturating positive values to the upper 116 * limit never changes the result of comparison. 117 * 118 * For negative values: 119 * (sat(x) > 0) == (x > 0) --- false 120 * (sat(x) <= 0) == (x <= 0) --- true 121 */ 122 const enum brw_conditional_mod cond = 123 negate ? brw_swap_cmod(inst->conditional_mod) 124 : inst->conditional_mod; 125 126 if (scan_inst->saturate && 127 (brw_reg_type_is_floating_point(scan_inst->dst.type) || 128 brw_reg_type_is_unsigned_integer(scan_inst->dst.type)) && 129 (cond != BRW_CONDITIONAL_G && 130 cond != BRW_CONDITIONAL_LE)) 131 goto not_match; 132 133 /* Otherwise, try propagating the conditional. */ 134 if (scan_inst->can_do_cmod() && 135 ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) || 136 scan_inst->conditional_mod == cond)) { 137 scan_inst->conditional_mod = cond; 138 scan_inst->flag_subreg = inst->flag_subreg; 139 inst->remove(block, true); 140 return true; 141 } 142 break; 143 } 144 145 not_match: 146 if ((scan_inst->flags_written(devinfo) & flags_written) != 0) 147 break; 148 149 read_flag = read_flag || 150 (scan_inst->flags_read(devinfo) & flags_written) != 0; 151 } 152 153 return false; 154} 155 156/** 157 * Propagate conditional modifiers from NOT instructions 158 * 159 * Attempt to convert sequences like 160 * 161 * or(8) g78<8,8,1> g76<8,8,1>UD g77<8,8,1>UD 162 * ... 163 * not.nz.f0(8) null g78<8,8,1>UD 164 * 165 * into 166 * 167 * or.z.f0(8) g78<8,8,1> g76<8,8,1>UD g77<8,8,1>UD 168 */ 169static bool 170cmod_propagate_not(const intel_device_info *devinfo, bblock_t *block, 171 fs_inst *inst) 172{ 173 const enum brw_conditional_mod cond = brw_negate_cmod(inst->conditional_mod); 174 bool read_flag = false; 175 const unsigned flags_written = inst->flags_written(devinfo); 176 177 if (cond != BRW_CONDITIONAL_Z && cond != BRW_CONDITIONAL_NZ) 178 return false; 179 180 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { 181 if (regions_overlap(scan_inst->dst, scan_inst->size_written, 182 inst->src[0], inst->size_read(0))) { 183 if (scan_inst->opcode != BRW_OPCODE_OR && 184 scan_inst->opcode != BRW_OPCODE_AND) 185 break; 186 187 if (scan_inst->is_partial_write() || 188 scan_inst->dst.offset != inst->src[0].offset || 189 scan_inst->exec_size != inst->exec_size) 190 break; 191 192 /* If the scan instruction writes a different flag register than the 193 * instruction we're trying to propagate from, bail. 194 * 195 * FINISHME: The second part of the condition may be too strong. 196 * Perhaps (scan_inst->flags_written() & flags_written) != 197 * flags_written? 198 */ 199 if (scan_inst->flags_written(devinfo) != 0 && 200 scan_inst->flags_written(devinfo) != flags_written) 201 break; 202 203 if (scan_inst->can_do_cmod() && 204 ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) || 205 scan_inst->conditional_mod == cond)) { 206 scan_inst->conditional_mod = cond; 207 scan_inst->flag_subreg = inst->flag_subreg; 208 inst->remove(block, true); 209 return true; 210 } 211 break; 212 } 213 214 if ((scan_inst->flags_written(devinfo) & flags_written) != 0) 215 break; 216 217 read_flag = read_flag || 218 (scan_inst->flags_read(devinfo) & flags_written) != 0; 219 } 220 221 return false; 222} 223 224static bool 225opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block) 226{ 227 bool progress = false; 228 int ip = block->end_ip + 1; 229 230 foreach_inst_in_block_reverse_safe(fs_inst, inst, block) { 231 ip--; 232 233 if ((inst->opcode != BRW_OPCODE_AND && 234 inst->opcode != BRW_OPCODE_CMP && 235 inst->opcode != BRW_OPCODE_MOV && 236 inst->opcode != BRW_OPCODE_NOT) || 237 inst->predicate != BRW_PREDICATE_NONE || 238 !inst->dst.is_null() || 239 (inst->src[0].file != VGRF && inst->src[0].file != ATTR && 240 inst->src[0].file != UNIFORM)) 241 continue; 242 243 /* An ABS source modifier can only be handled when processing a compare 244 * with a value other than zero. 245 */ 246 if (inst->src[0].abs && 247 (inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero())) 248 continue; 249 250 /* Only an AND.NZ can be propagated. Many AND.Z instructions are 251 * generated (for ir_unop_not in fs_visitor::emit_bool_to_cond_code). 252 * Propagating those would require inverting the condition on the CMP. 253 * This changes both the flag value and the register destination of the 254 * CMP. That result may be used elsewhere, so we can't change its value 255 * on a whim. 256 */ 257 if (inst->opcode == BRW_OPCODE_AND && 258 !(inst->src[1].is_one() && 259 inst->conditional_mod == BRW_CONDITIONAL_NZ && 260 !inst->src[0].negate)) 261 continue; 262 263 /* A CMP with a second source of zero can match with anything. A CMP 264 * with a second source that is not zero can only match with an ADD 265 * instruction. 266 * 267 * Only apply this optimization to float-point sources. It can fail for 268 * integers. For inputs a = 0x80000000, b = 4, int(0x80000000) < 4, but 269 * int(0x80000000) - 4 overflows and results in 0x7ffffffc. that's not 270 * less than zero, so the flags get set differently than for (a < b). 271 */ 272 if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) { 273 if (brw_reg_type_is_floating_point(inst->src[0].type) && 274 cmod_propagate_cmp_to_add(devinfo, block, inst)) 275 progress = true; 276 277 continue; 278 } 279 280 if (inst->opcode == BRW_OPCODE_NOT) { 281 progress = cmod_propagate_not(devinfo, block, inst) || progress; 282 continue; 283 } 284 285 bool read_flag = false; 286 const unsigned flags_written = inst->flags_written(devinfo); 287 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { 288 if (regions_overlap(scan_inst->dst, scan_inst->size_written, 289 inst->src[0], inst->size_read(0))) { 290 /* If the scan instruction writes a different flag register than 291 * the instruction we're trying to propagate from, bail. 292 * 293 * FINISHME: The second part of the condition may be too strong. 294 * Perhaps (scan_inst->flags_written() & flags_written) != 295 * flags_written? 296 */ 297 if (scan_inst->flags_written(devinfo) != 0 && 298 scan_inst->flags_written(devinfo) != flags_written) 299 break; 300 301 if (scan_inst->is_partial_write() || 302 scan_inst->dst.offset != inst->src[0].offset || 303 scan_inst->exec_size != inst->exec_size) 304 break; 305 306 /* CMP's result is the same regardless of dest type. */ 307 if (inst->conditional_mod == BRW_CONDITIONAL_NZ && 308 scan_inst->opcode == BRW_OPCODE_CMP && 309 brw_reg_type_is_integer(inst->dst.type)) { 310 inst->remove(block, true); 311 progress = true; 312 break; 313 } 314 315 /* If the AND wasn't handled by the previous case, it isn't safe 316 * to remove it. 317 */ 318 if (inst->opcode == BRW_OPCODE_AND) 319 break; 320 321 if (inst->opcode == BRW_OPCODE_MOV) { 322 if (brw_reg_type_is_floating_point(scan_inst->dst.type)) { 323 /* If the destination type of scan_inst is floating-point, 324 * then: 325 * 326 * - The source of the MOV instruction must be the same 327 * type. 328 * 329 * - The destination of the MOV instruction must be float 330 * point with a size at least as large as the destination 331 * of inst. Size-reducing f2f conversions could cause 332 * non-zero values to become zero, etc. 333 */ 334 if (scan_inst->dst.type != inst->src[0].type) 335 break; 336 337 if (!brw_reg_type_is_floating_point(inst->dst.type)) 338 break; 339 340 if (type_sz(scan_inst->dst.type) > type_sz(inst->dst.type)) 341 break; 342 } else { 343 /* If the destination type of scan_inst is integer, then: 344 * 345 * - The source of the MOV instruction must be integer with 346 * the same size. 347 * 348 * - If the conditional modifier is Z or NZ, then the 349 * destination type of inst must either be floating point 350 * (of any size) or integer with a size at least as large 351 * as the destination of inst. 352 * 353 * - If the conditional modifier is neither Z nor NZ, then the 354 * destination type of inst must either be floating point 355 * (of any size) or integer with a size at least as large 356 * as the destination of inst and the same signedness. 357 */ 358 if (!brw_reg_type_is_integer(inst->src[0].type) || 359 type_sz(scan_inst->dst.type) != type_sz(inst->src[0].type)) 360 break; 361 362 if (brw_reg_type_is_integer(inst->dst.type)) { 363 if (type_sz(inst->dst.type) < type_sz(scan_inst->dst.type)) 364 break; 365 366 if (inst->conditional_mod != BRW_CONDITIONAL_Z && 367 inst->conditional_mod != BRW_CONDITIONAL_NZ && 368 brw_reg_type_is_unsigned_integer(inst->dst.type) != 369 brw_reg_type_is_unsigned_integer(scan_inst->dst.type)) 370 break; 371 } 372 } 373 } else { 374 /* Not safe to use inequality operators if the types are 375 * different. 376 */ 377 if (scan_inst->dst.type != inst->src[0].type && 378 inst->conditional_mod != BRW_CONDITIONAL_Z && 379 inst->conditional_mod != BRW_CONDITIONAL_NZ) 380 break; 381 382 /* Comparisons operate differently for ints and floats */ 383 if (scan_inst->dst.type != inst->dst.type) { 384 /* Comparison result may be altered if the bit-size changes 385 * since that affects range, denorms, etc 386 */ 387 if (type_sz(scan_inst->dst.type) != type_sz(inst->dst.type)) 388 break; 389 390 if (brw_reg_type_is_floating_point(scan_inst->dst.type) != 391 brw_reg_type_is_floating_point(inst->dst.type)) 392 break; 393 } 394 } 395 396 /* Knowing following: 397 * - CMP writes to flag register the result of 398 * applying cmod to the `src0 - src1`. 399 * After that it stores the same value to dst. 400 * Other instructions first store their result to 401 * dst, and then store cmod(dst) to the flag 402 * register. 403 * - inst is either CMP or MOV 404 * - inst->dst is null 405 * - inst->src[0] overlaps with scan_inst->dst 406 * - inst->src[1] is zero 407 * - scan_inst wrote to a flag register 408 * 409 * There can be three possible paths: 410 * 411 * - scan_inst is CMP: 412 * 413 * Considering that src0 is either 0x0 (false), 414 * or 0xffffffff (true), and src1 is 0x0: 415 * 416 * - If inst's cmod is NZ, we can always remove 417 * scan_inst: NZ is invariant for false and true. This 418 * holds even if src0 is NaN: .nz is the only cmod, 419 * that returns true for NaN. 420 * 421 * - .g is invariant if src0 has a UD type 422 * 423 * - .l is invariant if src0 has a D type 424 * 425 * - scan_inst and inst have the same cmod: 426 * 427 * If scan_inst is anything than CMP, it already 428 * wrote the appropriate value to the flag register. 429 * 430 * - else: 431 * 432 * We can change cmod of scan_inst to that of inst, 433 * and remove inst. It is valid as long as we make 434 * sure that no instruction uses the flag register 435 * between scan_inst and inst. 436 */ 437 if (!inst->src[0].negate && 438 scan_inst->flags_written(devinfo)) { 439 if (scan_inst->opcode == BRW_OPCODE_CMP) { 440 if ((inst->conditional_mod == BRW_CONDITIONAL_NZ) || 441 (inst->conditional_mod == BRW_CONDITIONAL_G && 442 inst->src[0].type == BRW_REGISTER_TYPE_UD) || 443 (inst->conditional_mod == BRW_CONDITIONAL_L && 444 inst->src[0].type == BRW_REGISTER_TYPE_D)) { 445 inst->remove(block, true); 446 progress = true; 447 break; 448 } 449 } else if (scan_inst->conditional_mod == inst->conditional_mod) { 450 /* On Gfx4 and Gfx5 sel.cond will dirty the flags, but the 451 * flags value is not based on the result stored in the 452 * destination. On all other platforms sel.cond will not 453 * write the flags, so execution will not get to this point. 454 */ 455 if (scan_inst->opcode == BRW_OPCODE_SEL) { 456 assert(devinfo->ver <= 5); 457 } else { 458 inst->remove(block, true); 459 progress = true; 460 } 461 462 break; 463 } else if (!read_flag && scan_inst->can_do_cmod()) { 464 scan_inst->conditional_mod = inst->conditional_mod; 465 scan_inst->flag_subreg = inst->flag_subreg; 466 inst->remove(block, true); 467 progress = true; 468 break; 469 } 470 } 471 472 /* The conditional mod of the CMP/CMPN instructions behaves 473 * specially because the flag output is not calculated from the 474 * result of the instruction, but the other way around, which 475 * means that even if the condmod to propagate and the condmod 476 * from the CMP instruction are the same they will in general give 477 * different results because they are evaluated based on different 478 * inputs. 479 */ 480 if (scan_inst->opcode == BRW_OPCODE_CMP || 481 scan_inst->opcode == BRW_OPCODE_CMPN) 482 break; 483 484 /* From the Sky Lake PRM, Vol 2a, "Multiply": 485 * 486 * "When multiplying integer data types, if one of the sources 487 * is a DW, the resulting full precision data is stored in 488 * the accumulator. However, if the destination data type is 489 * either W or DW, the low bits of the result are written to 490 * the destination register and the remaining high bits are 491 * discarded. This results in undefined Overflow and Sign 492 * flags. Therefore, conditional modifiers and saturation 493 * (.sat) cannot be used in this case." 494 * 495 * We just disallow cmod propagation on all integer multiplies. 496 */ 497 if (!brw_reg_type_is_floating_point(scan_inst->dst.type) && 498 scan_inst->opcode == BRW_OPCODE_MUL) 499 break; 500 501 enum brw_conditional_mod cond = 502 inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod) 503 : inst->conditional_mod; 504 505 /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags": 506 * 507 * * Note that the [post condition signal] bits generated at 508 * the output of a compute are before the .sat. 509 * 510 * Paragraph about post_zero does not mention saturation, but 511 * testing it on actual GPUs shows that conditional modifiers are 512 * applied after saturation. 513 * 514 * * post_zero bit: This bit reflects whether the final 515 * result is zero after all the clamping, normalizing, 516 * or format conversion logic. 517 * 518 * For this reason, no additional restrictions are necessary on 519 * instructions with saturate. 520 */ 521 522 /* Otherwise, try propagating the conditional. */ 523 if (scan_inst->can_do_cmod() && 524 ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) || 525 scan_inst->conditional_mod == cond)) { 526 scan_inst->conditional_mod = cond; 527 scan_inst->flag_subreg = inst->flag_subreg; 528 inst->remove(block, true); 529 progress = true; 530 } 531 break; 532 } 533 534 if ((scan_inst->flags_written(devinfo) & flags_written) != 0) 535 break; 536 537 read_flag = read_flag || 538 (scan_inst->flags_read(devinfo) & flags_written) != 0; 539 } 540 } 541 542 /* There is progress if and only if instructions were removed. */ 543 assert(progress == (block->end_ip_delta != 0)); 544 545 return progress; 546} 547 548bool 549fs_visitor::opt_cmod_propagation() 550{ 551 bool progress = false; 552 553 foreach_block_reverse(block, cfg) { 554 progress = opt_cmod_propagation_local(devinfo, block) || progress; 555 } 556 557 if (progress) { 558 cfg->adjust_block_ips(); 559 560 invalidate_analysis(DEPENDENCY_INSTRUCTIONS); 561 } 562 563 return progress; 564} 565