1/* 2 * Copyright © 2010 Intel Corporation 3 * Copyright © 2018 Broadcom 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 * DEALINGS IN THE SOFTWARE. 23 */ 24 25#include "nir.h" 26#include "nir_builder.h" 27 28/** nir_lower_alu.c 29 * 30 * NIR's home for miscellaneous ALU operation lowering implementations. 31 * 32 * Most NIR ALU lowering occurs in nir_opt_algebraic.py, since it's generally 33 * easy to write them there. However, if terms appear multiple times in the 34 * lowered code, it can get very verbose and cause a lot of work for CSE, so 35 * it may end up being easier to write out in C code. 36 * 37 * The shader must be in SSA for this pass. 38 */ 39 40#define LOWER_MUL_HIGH (1 << 0) 41 42static bool 43lower_alu_instr(nir_alu_instr *instr, nir_builder *b) 44{ 45 nir_ssa_def *lowered = NULL; 46 47 assert(instr->dest.dest.is_ssa); 48 49 b->cursor = nir_before_instr(&instr->instr); 50 b->exact = instr->exact; 51 52 switch (instr->op) { 53 case nir_op_bitfield_reverse: 54 if (b->shader->options->lower_bitfield_reverse) { 55 /* For more details, see: 56 * 57 * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel 58 */ 59 nir_ssa_def *c1 = nir_imm_int(b, 1); 60 nir_ssa_def *c2 = nir_imm_int(b, 2); 61 nir_ssa_def *c4 = nir_imm_int(b, 4); 62 nir_ssa_def *c8 = nir_imm_int(b, 8); 63 nir_ssa_def *c16 = nir_imm_int(b, 16); 64 nir_ssa_def *c33333333 = nir_imm_int(b, 0x33333333); 65 nir_ssa_def *c55555555 = nir_imm_int(b, 0x55555555); 66 nir_ssa_def *c0f0f0f0f = nir_imm_int(b, 0x0f0f0f0f); 67 nir_ssa_def *c00ff00ff = nir_imm_int(b, 0x00ff00ff); 68 69 lowered = nir_ssa_for_alu_src(b, instr, 0); 70 71 /* Swap odd and even bits. */ 72 lowered = nir_ior(b, 73 nir_iand(b, nir_ushr(b, lowered, c1), c55555555), 74 nir_ishl(b, nir_iand(b, lowered, c55555555), c1)); 75 76 /* Swap consecutive pairs. */ 77 lowered = nir_ior(b, 78 nir_iand(b, nir_ushr(b, lowered, c2), c33333333), 79 nir_ishl(b, nir_iand(b, lowered, c33333333), c2)); 80 81 /* Swap nibbles. */ 82 lowered = nir_ior(b, 83 nir_iand(b, nir_ushr(b, lowered, c4), c0f0f0f0f), 84 nir_ishl(b, nir_iand(b, lowered, c0f0f0f0f), c4)); 85 86 /* Swap bytes. */ 87 lowered = nir_ior(b, 88 nir_iand(b, nir_ushr(b, lowered, c8), c00ff00ff), 89 nir_ishl(b, nir_iand(b, lowered, c00ff00ff), c8)); 90 91 lowered = nir_ior(b, 92 nir_ushr(b, lowered, c16), 93 nir_ishl(b, lowered, c16)); 94 } 95 break; 96 97 case nir_op_bit_count: 98 if (b->shader->options->lower_bit_count) { 99 /* For more details, see: 100 * 101 * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel 102 */ 103 nir_ssa_def *c1 = nir_imm_int(b, 1); 104 nir_ssa_def *c2 = nir_imm_int(b, 2); 105 nir_ssa_def *c4 = nir_imm_int(b, 4); 106 nir_ssa_def *c24 = nir_imm_int(b, 24); 107 nir_ssa_def *c33333333 = nir_imm_int(b, 0x33333333); 108 nir_ssa_def *c55555555 = nir_imm_int(b, 0x55555555); 109 nir_ssa_def *c0f0f0f0f = nir_imm_int(b, 0x0f0f0f0f); 110 nir_ssa_def *c01010101 = nir_imm_int(b, 0x01010101); 111 112 lowered = nir_ssa_for_alu_src(b, instr, 0); 113 114 lowered = nir_isub(b, lowered, 115 nir_iand(b, nir_ushr(b, lowered, c1), c55555555)); 116 117 lowered = nir_iadd(b, 118 nir_iand(b, lowered, c33333333), 119 nir_iand(b, nir_ushr(b, lowered, c2), c33333333)); 120 121 lowered = nir_ushr(b, 122 nir_imul(b, 123 nir_iand(b, 124 nir_iadd(b, 125 lowered, 126 nir_ushr(b, lowered, c4)), 127 c0f0f0f0f), 128 c01010101), 129 c24); 130 } 131 break; 132 133 case nir_op_imul_high: 134 case nir_op_umul_high: 135 if (b->shader->options->lower_mul_high) { 136 nir_ssa_def *src0 = nir_ssa_for_alu_src(b, instr, 0); 137 nir_ssa_def *src1 = nir_ssa_for_alu_src(b, instr, 1); 138 if (src0->bit_size < 32) { 139 /* Just do the math in 32-bit space and shift the result */ 140 nir_alu_type base_type = nir_op_infos[instr->op].output_type; 141 nir_op upcast_op = nir_type_conversion_op(base_type | src0->bit_size, base_type | 32, nir_rounding_mode_undef); 142 nir_op downscast_op = nir_type_conversion_op(base_type | 32, base_type | src0->bit_size, nir_rounding_mode_undef); 143 144 nir_ssa_def *src0_32 = nir_build_alu(b, upcast_op, src0, NULL, NULL, NULL); 145 nir_ssa_def *src1_32 = nir_build_alu(b, upcast_op, src1, NULL, NULL, NULL); 146 nir_ssa_def *dest_32 = nir_imul(b, src0_32, src1_32); 147 nir_ssa_def *dest_shifted = nir_ishr(b, dest_32, nir_imm_int(b, src0->bit_size)); 148 lowered = nir_build_alu(b, downscast_op, dest_shifted, NULL, NULL, NULL); 149 } else { 150 nir_ssa_def *c1 = nir_imm_intN_t(b, 1, src0->bit_size); 151 nir_ssa_def *cshift = nir_imm_int(b, src0->bit_size / 2); 152 nir_ssa_def *cmask = nir_imm_intN_t(b, (1ull << (src0->bit_size / 2)) - 1, src0->bit_size); 153 nir_ssa_def *different_signs = NULL; 154 if (instr->op == nir_op_imul_high) { 155 nir_ssa_def *c0 = nir_imm_intN_t(b, 0, src0->bit_size); 156 different_signs = nir_ixor(b, 157 nir_ilt(b, src0, c0), 158 nir_ilt(b, src1, c0)); 159 src0 = nir_iabs(b, src0); 160 src1 = nir_iabs(b, src1); 161 } 162 163 /* ABCD 164 * * EFGH 165 * ====== 166 * (GH * CD) + (GH * AB) << 16 + (EF * CD) << 16 + (EF * AB) << 32 167 * 168 * Start by splitting into the 4 multiplies. 169 */ 170 nir_ssa_def *src0l = nir_iand(b, src0, cmask); 171 nir_ssa_def *src1l = nir_iand(b, src1, cmask); 172 nir_ssa_def *src0h = nir_ushr(b, src0, cshift); 173 nir_ssa_def *src1h = nir_ushr(b, src1, cshift); 174 175 nir_ssa_def *lo = nir_imul(b, src0l, src1l); 176 nir_ssa_def *m1 = nir_imul(b, src0l, src1h); 177 nir_ssa_def *m2 = nir_imul(b, src0h, src1l); 178 nir_ssa_def *hi = nir_imul(b, src0h, src1h); 179 180 nir_ssa_def *tmp; 181 182 tmp = nir_ishl(b, m1, cshift); 183 hi = nir_iadd(b, hi, nir_iand(b, nir_uadd_carry(b, lo, tmp), c1)); 184 lo = nir_iadd(b, lo, tmp); 185 hi = nir_iadd(b, hi, nir_ushr(b, m1, cshift)); 186 187 tmp = nir_ishl(b, m2, cshift); 188 hi = nir_iadd(b, hi, nir_iand(b, nir_uadd_carry(b, lo, tmp), c1)); 189 lo = nir_iadd(b, lo, tmp); 190 hi = nir_iadd(b, hi, nir_ushr(b, m2, cshift)); 191 192 if (instr->op == nir_op_imul_high) { 193 /* For channels where different_signs is set we have to perform a 194 * 64-bit negation. This is *not* the same as just negating the 195 * high 32-bits. Consider -3 * 2. The high 32-bits is 0, but the 196 * desired result is -1, not -0! Recall -x == ~x + 1. 197 */ 198 hi = nir_bcsel(b, different_signs, 199 nir_iadd(b, 200 nir_inot(b, hi), 201 nir_iand(b, 202 nir_uadd_carry(b, 203 nir_inot(b, lo), 204 c1), 205 nir_imm_intN_t(b, 1, src0->bit_size))), 206 hi); 207 } 208 209 lowered = hi; 210 } 211 } 212 break; 213 214 default: 215 break; 216 } 217 218 if (lowered) { 219 nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, lowered); 220 nir_instr_remove(&instr->instr); 221 return true; 222 } else { 223 return false; 224 } 225} 226 227bool 228nir_lower_alu(nir_shader *shader) 229{ 230 bool progress = false; 231 232 if (!shader->options->lower_bitfield_reverse && 233 !shader->options->lower_mul_high) 234 return false; 235 236 nir_foreach_function(function, shader) { 237 if (function->impl) { 238 nir_builder builder; 239 nir_builder_init(&builder, function->impl); 240 241 nir_foreach_block(block, function->impl) { 242 nir_foreach_instr_safe(instr, block) { 243 if (instr->type == nir_instr_type_alu) { 244 progress = lower_alu_instr(nir_instr_as_alu(instr), 245 &builder) || progress; 246 } 247 } 248 } 249 250 if (progress) { 251 nir_metadata_preserve(function->impl, 252 nir_metadata_block_index | 253 nir_metadata_dominance); 254 } 255 } 256 } 257 258 return progress; 259} 260