/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir_instr_set.h"
#include "nir_search_helpers.h"
#include "nir_builder.h"
#include "util/u_vector.h"

/* Partial redundancy elimination of compares
 *
 * Searches for comparisons of the form 'a cmp b' that dominate arithmetic
 * instructions like 'b - a'.  The comparison is replaced by the arithmetic
 * instruction, and the result is compared with zero.  For example,
 *
 *    vec1 32 ssa_111 = flt 0.37, ssa_110.w
 *    if ssa_111 {
 *       block block_1:
 *       vec1 32 ssa_112 = fadd ssa_110.w, -0.37
 *       ...
 *
 * becomes
 *
 *    vec1 32 ssa_111 = fadd ssa_110.w, -0.37
 *    vec1 32 ssa_112 = flt 0.0, ssa_111
 *    if ssa_112 {
 *       block block_1:
 *       ...
 */

struct block_queue {
   /**
    * Stack of blocks from the current location in the CFG to the entry point
    * of the function.
    *
    * This is sort of a poor man's dominator tree.
    */
   struct exec_list blocks;

   /** List of freed block_instructions structures that can be reused. */
   struct exec_list reusable_blocks;
};

struct block_instructions {
   struct exec_node node;

   /**
    * Set of comparison instructions from the block that are candidates for
    * being replaced by add instructions.
    */
   struct u_vector instructions;
};
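
/* Illustrative sketch of how the block queue is used (this comment is an
 * addition for clarity, not part of the original sources).  For a CFG such as
 *
 *        block_0
 *        /     \
 *    block_1  block_2
 *
 * comparison_pre_block() pushes an entry when it starts a block and pops it
 * after visiting that block's dominator-tree children.  While block_1 is
 * being processed the queue holds entries for block_0 and block_1; once the
 * walk returns, only block_0 remains before block_2 is visited.  Every entry
 * on the queue is therefore a dominator of the block currently being
 * processed, which is what makes the stored compares safe to reuse.
 */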

static void
block_queue_init(struct block_queue *bq)
{
   exec_list_make_empty(&bq->blocks);
   exec_list_make_empty(&bq->reusable_blocks);
}

static void
block_queue_finish(struct block_queue *bq)
{
   struct block_instructions *n;

   while ((n = (struct block_instructions *) exec_list_pop_head(&bq->blocks)) != NULL) {
      u_vector_finish(&n->instructions);
      free(n);
   }

   while ((n = (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks)) != NULL) {
      free(n);
   }
}

static struct block_instructions *
push_block(struct block_queue *bq)
{
   struct block_instructions *bi =
      (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks);

   if (bi == NULL) {
      bi = calloc(1, sizeof(struct block_instructions));

      if (bi == NULL)
         return NULL;
   }

   if (!u_vector_init_pow2(&bi->instructions, 8, sizeof(nir_alu_instr *))) {
      free(bi);
      return NULL;
   }

   exec_list_push_tail(&bq->blocks, &bi->node);

   return bi;
}

static void
pop_block(struct block_queue *bq, struct block_instructions *bi)
{
   u_vector_finish(&bi->instructions);
   exec_node_remove(&bi->node);
   exec_list_push_head(&bq->reusable_blocks, &bi->node);
}

static void
add_instruction_for_block(struct block_instructions *bi,
                          nir_alu_instr *alu)
{
   nir_alu_instr **data =
      u_vector_add(&bi->instructions);

   *data = alu;
}

static void
rewrite_compare_instruction(nir_builder *bld, nir_alu_instr *orig_cmp,
                            nir_alu_instr *orig_add, bool zero_on_left)
{
   bld->cursor = nir_before_instr(&orig_cmp->instr);

   /* This is somewhat tricky.  The compare instruction may be something like
    * (fcmp, a, b) while the add instruction is something like (fadd, fneg(a),
    * b).  This is problematic because the SSA value for the fneg(a) may not
    * exist yet at the compare instruction.
    *
    * We fabricate the operands of the new add.  This is done using
    * information provided by zero_on_left.  If zero_on_left is true, we know
    * the resulting compare instruction is (fcmp, 0.0, (fadd, x, y)).  If the
    * original compare instruction was (fcmp, a, b), x = b and y = -a.  If
    * zero_on_left is false, the resulting compare instruction is (fcmp,
    * (fadd, x, y), 0.0) and x = a and y = -b.
    */
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, orig_cmp, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, orig_cmp, 1);

   nir_ssa_def *const fadd = zero_on_left
      ? nir_fadd(bld, b, nir_fneg(bld, a))
      : nir_fadd(bld, a, nir_fneg(bld, b));

   nir_ssa_def *const zero =
      nir_imm_floatN_t(bld, 0.0, orig_add->dest.dest.ssa.bit_size);

   nir_ssa_def *const cmp = zero_on_left
      ? nir_build_alu(bld, orig_cmp->op, zero, fadd, NULL, NULL)
      : nir_build_alu(bld, orig_cmp->op, fadd, zero, NULL, NULL);

   /* Generating extra moves of the results is the easy way to make sure the
    * writemasks match the original instructions.  Later optimization passes
    * will clean these up.  This is similar to nir_replace_instr (in
    * nir_search.c).
    */
   nir_alu_instr *mov_add = nir_alu_instr_create(bld->shader, nir_op_mov);
   mov_add->dest.write_mask = orig_add->dest.write_mask;
   nir_ssa_dest_init(&mov_add->instr, &mov_add->dest.dest,
                     orig_add->dest.dest.ssa.num_components,
                     orig_add->dest.dest.ssa.bit_size, NULL);
   mov_add->src[0].src = nir_src_for_ssa(fadd);

   nir_builder_instr_insert(bld, &mov_add->instr);

   nir_alu_instr *mov_cmp = nir_alu_instr_create(bld->shader, nir_op_mov);
   mov_cmp->dest.write_mask = orig_cmp->dest.write_mask;
   nir_ssa_dest_init(&mov_cmp->instr, &mov_cmp->dest.dest,
                     orig_cmp->dest.dest.ssa.num_components,
                     orig_cmp->dest.dest.ssa.bit_size, NULL);
   mov_cmp->src[0].src = nir_src_for_ssa(cmp);

   nir_builder_instr_insert(bld, &mov_cmp->instr);

   nir_ssa_def_rewrite_uses(&orig_cmp->dest.dest.ssa,
                            &mov_cmp->dest.dest.ssa);
   nir_ssa_def_rewrite_uses(&orig_add->dest.dest.ssa,
                            &mov_add->dest.dest.ssa);

   /* We know these have no more uses because we just rewrote them all, so we
    * can remove them.
    */
   nir_instr_remove(&orig_cmp->instr);
   nir_instr_remove(&orig_add->instr);
}
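
/* Worked example of the rewrite above (added for illustration; the SSA names
 * are hypothetical and not taken from the pass itself).  With
 * zero_on_left == false, a dominating compare and a later add such as
 *
 *    ssa_2 = flt ssa_0, ssa_1
 *    ...
 *    ssa_3 = fadd ssa_0, -ssa_1
 *
 * are replaced, at the location of the compare, by
 *
 *    ssa_4 = fadd ssa_0, -ssa_1
 *    ssa_5 = flt ssa_4, 0.0
 *
 * together with mov instructions standing in for ssa_2 and ssa_3 so existing
 * uses keep working; later optimization passes (e.g. copy propagation) are
 * expected to clean up the movs.
 */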

static bool
comparison_pre_block(nir_block *block, struct block_queue *bq, nir_builder *bld)
{
   bool progress = false;

   struct block_instructions *bi = push_block(bq);
   if (bi == NULL)
      return false;

   /* Starting with the current block, examine each instruction.  If the
    * instruction is a comparison that matches the '±a cmp ±b' pattern, add it
    * to the block_instructions::instructions set.  If the instruction is an
    * add instruction, walk up the block queue looking at the stored
    * instructions.  If a matching comparison is found, move the addition and
    * replace the comparison with a different comparison based on the result
    * of the addition.  All of the blocks in the queue are guaranteed to be
    * dominators of the current block.
    *
    * After processing the current block, recurse into the blocks dominated by
    * the current block.
    */
   nir_foreach_instr_safe(instr, block) {
      if (instr->type != nir_instr_type_alu)
         continue;

      nir_alu_instr *const alu = nir_instr_as_alu(instr);

      if (alu->dest.dest.ssa.num_components != 1)
         continue;

      if (alu->dest.saturate)
         continue;

      static const uint8_t swizzle[NIR_MAX_VEC_COMPONENTS] = {0};

      switch (alu->op) {
      case nir_op_fadd: {
         /* If the instruction is fadd, check it against comparison
          * instructions that dominate it.
          */
         struct block_instructions *b =
            (struct block_instructions *) exec_list_get_head_raw(&bq->blocks);

         while (b->node.next != NULL) {
            nir_alu_instr **a;
            bool rewrote_compare = false;

            u_vector_foreach(a, &b->instructions) {
               nir_alu_instr *const cmp = *a;

               if (cmp == NULL)
                  continue;

               /* The operands of both instructions are, with some liberty,
                * commutative.  Check all four permutations.  The third and
                * fourth permutations are negations of the first two.
                */
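               /* Spelled out (this comment is an added clarification), with
                * the compare written as (A cmp B) and the candidate add as
                * (fadd x, y), the accepted permutations are:
                *
                *    x ==  A, y == -B  ->  rewrite as (A + -B) cmp 0
                *    x == -B, y ==  A  ->  rewrite as (A + -B) cmp 0
                *    x ==  B, y == -A  ->  rewrite as 0 cmp (B + -A)
                *    x == -A, y ==  B  ->  rewrite as 0 cmp (B + -A)
                */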
               if ((nir_alu_srcs_equal(cmp, alu, 0, 0) &&
                    nir_alu_srcs_negative_equal(cmp, alu, 1, 1)) ||
                   (nir_alu_srcs_equal(cmp, alu, 0, 1) &&
                    nir_alu_srcs_negative_equal(cmp, alu, 1, 0))) {
                  /* These are the cases where (A cmp B) matches either (A +
                   * -B) or (-B + A)
                   *
                   *    A cmp B <=> A + -B cmp 0
                   */
                  rewrite_compare_instruction(bld, cmp, alu, false);

                  *a = NULL;
                  rewrote_compare = true;
                  break;
               } else if ((nir_alu_srcs_equal(cmp, alu, 1, 0) &&
                           nir_alu_srcs_negative_equal(cmp, alu, 0, 1)) ||
                          (nir_alu_srcs_equal(cmp, alu, 1, 1) &&
                           nir_alu_srcs_negative_equal(cmp, alu, 0, 0))) {
                  /* This is the case where (A cmp B) matches (B + -A) or (-A
                   * + B).
                   *
                   *    A cmp B <=> 0 cmp B + -A
                   */
                  rewrite_compare_instruction(bld, cmp, alu, true);

                  *a = NULL;
                  rewrote_compare = true;
                  break;
               }
            }

            /* Bail after a compare in the most dominating block is found.
             * This is necessary because 'alu' has been removed from the
             * instruction stream.  Should there be a matching compare in
             * another block, calling rewrite_compare_instruction again will
             * try to operate on a node that is not in the list as if it were
             * in the list.
             *
             * FINISHME: There may be opportunity for additional optimization
             * here.  I discovered this problem due to a shader in Guacamelee.
             * It may be possible to rewrite the matching compares that are
             * encountered later to reuse the result from the compare that was
             * first rewritten.  It's also possible that this is just taken
             * care of by calling the optimization pass repeatedly.
             */
            if (rewrote_compare) {
               progress = true;
               break;
            }

            b = (struct block_instructions *) b->node.next;
         }

         break;
      }

      case nir_op_flt:
      case nir_op_fge:
      case nir_op_fneu:
      case nir_op_feq:
         /* If the instruction is a comparison that is used by an if-statement
          * and neither operand is immediate value 0, add it to the set.
          */
         if (is_used_by_if(alu) &&
             is_not_const_zero(NULL, alu, 0, 1, swizzle) &&
             is_not_const_zero(NULL, alu, 1, 1, swizzle))
            add_instruction_for_block(bi, alu);

         break;

      default:
         break;
      }
   }

   for (unsigned i = 0; i < block->num_dom_children; i++) {
      nir_block *child = block->dom_children[i];

      if (comparison_pre_block(child, bq, bld))
         progress = true;
   }

   pop_block(bq, bi);

   return progress;
}

bool
nir_opt_comparison_pre_impl(nir_function_impl *impl)
{
   struct block_queue bq;
   nir_builder bld;

   block_queue_init(&bq);
   nir_builder_init(&bld, impl);

   nir_metadata_require(impl, nir_metadata_dominance);

   const bool progress =
      comparison_pre_block(nir_start_block(impl), &bq, &bld);

   block_queue_finish(&bq);

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}

bool
nir_opt_comparison_pre(nir_shader *shader)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= nir_opt_comparison_pre_impl(function->impl);
   }

   return progress;
}
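
/* Usage sketch (added for illustration; the surrounding optimization loop is
 * an assumption, not part of this file).  Drivers typically invoke the pass
 * from their NIR optimization loop, for example:
 *
 *    bool progress;
 *    do {
 *       progress = false;
 *       NIR_PASS(progress, shader, nir_opt_comparison_pre);
 *       NIR_PASS(progress, shader, nir_opt_algebraic);
 *    } while (progress);
 *
 * Running the loop to a fixed point may also let later invocations pick up
 * compares that were skipped because an earlier rewrite already consumed the
 * matching add (see the FINISHME above).
 */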