1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright © 2010 Intel Corporation 3bf215546Sopenharmony_ci * Copyright © 2018 Broadcom 4bf215546Sopenharmony_ci * 5bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 6bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 7bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 8bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 10bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 11bf215546Sopenharmony_ci * 12bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 13bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 14bf215546Sopenharmony_ci * Software. 15bf215546Sopenharmony_ci * 16bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22bf215546Sopenharmony_ci * DEALINGS IN THE SOFTWARE. 23bf215546Sopenharmony_ci */ 24bf215546Sopenharmony_ci 25bf215546Sopenharmony_ci#include "nir.h" 26bf215546Sopenharmony_ci#include "nir_builder.h" 27bf215546Sopenharmony_ci 28bf215546Sopenharmony_ci/** nir_lower_alu.c 29bf215546Sopenharmony_ci * 30bf215546Sopenharmony_ci * NIR's home for miscellaneous ALU operation lowering implementations. 31bf215546Sopenharmony_ci * 32bf215546Sopenharmony_ci * Most NIR ALU lowering occurs in nir_opt_algebraic.py, since it's generally 33bf215546Sopenharmony_ci * easy to write them there. However, if terms appear multiple times in the 34bf215546Sopenharmony_ci * lowered code, it can get very verbose and cause a lot of work for CSE, so 35bf215546Sopenharmony_ci * it may end up being easier to write out in C code. 36bf215546Sopenharmony_ci * 37bf215546Sopenharmony_ci * The shader must be in SSA for this pass. 38bf215546Sopenharmony_ci */ 39bf215546Sopenharmony_ci 40bf215546Sopenharmony_ci#define LOWER_MUL_HIGH (1 << 0) 41bf215546Sopenharmony_ci 42bf215546Sopenharmony_cistatic bool 43bf215546Sopenharmony_cilower_alu_instr(nir_alu_instr *instr, nir_builder *b) 44bf215546Sopenharmony_ci{ 45bf215546Sopenharmony_ci nir_ssa_def *lowered = NULL; 46bf215546Sopenharmony_ci 47bf215546Sopenharmony_ci assert(instr->dest.dest.is_ssa); 48bf215546Sopenharmony_ci 49bf215546Sopenharmony_ci b->cursor = nir_before_instr(&instr->instr); 50bf215546Sopenharmony_ci b->exact = instr->exact; 51bf215546Sopenharmony_ci 52bf215546Sopenharmony_ci switch (instr->op) { 53bf215546Sopenharmony_ci case nir_op_bitfield_reverse: 54bf215546Sopenharmony_ci if (b->shader->options->lower_bitfield_reverse) { 55bf215546Sopenharmony_ci /* For more details, see: 56bf215546Sopenharmony_ci * 57bf215546Sopenharmony_ci * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel 58bf215546Sopenharmony_ci */ 59bf215546Sopenharmony_ci nir_ssa_def *c1 = nir_imm_int(b, 1); 60bf215546Sopenharmony_ci nir_ssa_def *c2 = nir_imm_int(b, 2); 61bf215546Sopenharmony_ci nir_ssa_def *c4 = nir_imm_int(b, 4); 62bf215546Sopenharmony_ci nir_ssa_def *c8 = nir_imm_int(b, 8); 63bf215546Sopenharmony_ci nir_ssa_def *c16 = nir_imm_int(b, 16); 64bf215546Sopenharmony_ci nir_ssa_def *c33333333 = nir_imm_int(b, 0x33333333); 65bf215546Sopenharmony_ci nir_ssa_def *c55555555 = nir_imm_int(b, 0x55555555); 66bf215546Sopenharmony_ci nir_ssa_def *c0f0f0f0f = nir_imm_int(b, 0x0f0f0f0f); 67bf215546Sopenharmony_ci nir_ssa_def *c00ff00ff = nir_imm_int(b, 0x00ff00ff); 68bf215546Sopenharmony_ci 69bf215546Sopenharmony_ci lowered = nir_ssa_for_alu_src(b, instr, 0); 70bf215546Sopenharmony_ci 71bf215546Sopenharmony_ci /* Swap odd and even bits. */ 72bf215546Sopenharmony_ci lowered = nir_ior(b, 73bf215546Sopenharmony_ci nir_iand(b, nir_ushr(b, lowered, c1), c55555555), 74bf215546Sopenharmony_ci nir_ishl(b, nir_iand(b, lowered, c55555555), c1)); 75bf215546Sopenharmony_ci 76bf215546Sopenharmony_ci /* Swap consecutive pairs. */ 77bf215546Sopenharmony_ci lowered = nir_ior(b, 78bf215546Sopenharmony_ci nir_iand(b, nir_ushr(b, lowered, c2), c33333333), 79bf215546Sopenharmony_ci nir_ishl(b, nir_iand(b, lowered, c33333333), c2)); 80bf215546Sopenharmony_ci 81bf215546Sopenharmony_ci /* Swap nibbles. */ 82bf215546Sopenharmony_ci lowered = nir_ior(b, 83bf215546Sopenharmony_ci nir_iand(b, nir_ushr(b, lowered, c4), c0f0f0f0f), 84bf215546Sopenharmony_ci nir_ishl(b, nir_iand(b, lowered, c0f0f0f0f), c4)); 85bf215546Sopenharmony_ci 86bf215546Sopenharmony_ci /* Swap bytes. */ 87bf215546Sopenharmony_ci lowered = nir_ior(b, 88bf215546Sopenharmony_ci nir_iand(b, nir_ushr(b, lowered, c8), c00ff00ff), 89bf215546Sopenharmony_ci nir_ishl(b, nir_iand(b, lowered, c00ff00ff), c8)); 90bf215546Sopenharmony_ci 91bf215546Sopenharmony_ci lowered = nir_ior(b, 92bf215546Sopenharmony_ci nir_ushr(b, lowered, c16), 93bf215546Sopenharmony_ci nir_ishl(b, lowered, c16)); 94bf215546Sopenharmony_ci } 95bf215546Sopenharmony_ci break; 96bf215546Sopenharmony_ci 97bf215546Sopenharmony_ci case nir_op_bit_count: 98bf215546Sopenharmony_ci if (b->shader->options->lower_bit_count) { 99bf215546Sopenharmony_ci /* For more details, see: 100bf215546Sopenharmony_ci * 101bf215546Sopenharmony_ci * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel 102bf215546Sopenharmony_ci */ 103bf215546Sopenharmony_ci nir_ssa_def *c1 = nir_imm_int(b, 1); 104bf215546Sopenharmony_ci nir_ssa_def *c2 = nir_imm_int(b, 2); 105bf215546Sopenharmony_ci nir_ssa_def *c4 = nir_imm_int(b, 4); 106bf215546Sopenharmony_ci nir_ssa_def *c24 = nir_imm_int(b, 24); 107bf215546Sopenharmony_ci nir_ssa_def *c33333333 = nir_imm_int(b, 0x33333333); 108bf215546Sopenharmony_ci nir_ssa_def *c55555555 = nir_imm_int(b, 0x55555555); 109bf215546Sopenharmony_ci nir_ssa_def *c0f0f0f0f = nir_imm_int(b, 0x0f0f0f0f); 110bf215546Sopenharmony_ci nir_ssa_def *c01010101 = nir_imm_int(b, 0x01010101); 111bf215546Sopenharmony_ci 112bf215546Sopenharmony_ci lowered = nir_ssa_for_alu_src(b, instr, 0); 113bf215546Sopenharmony_ci 114bf215546Sopenharmony_ci lowered = nir_isub(b, lowered, 115bf215546Sopenharmony_ci nir_iand(b, nir_ushr(b, lowered, c1), c55555555)); 116bf215546Sopenharmony_ci 117bf215546Sopenharmony_ci lowered = nir_iadd(b, 118bf215546Sopenharmony_ci nir_iand(b, lowered, c33333333), 119bf215546Sopenharmony_ci nir_iand(b, nir_ushr(b, lowered, c2), c33333333)); 120bf215546Sopenharmony_ci 121bf215546Sopenharmony_ci lowered = nir_ushr(b, 122bf215546Sopenharmony_ci nir_imul(b, 123bf215546Sopenharmony_ci nir_iand(b, 124bf215546Sopenharmony_ci nir_iadd(b, 125bf215546Sopenharmony_ci lowered, 126bf215546Sopenharmony_ci nir_ushr(b, lowered, c4)), 127bf215546Sopenharmony_ci c0f0f0f0f), 128bf215546Sopenharmony_ci c01010101), 129bf215546Sopenharmony_ci c24); 130bf215546Sopenharmony_ci } 131bf215546Sopenharmony_ci break; 132bf215546Sopenharmony_ci 133bf215546Sopenharmony_ci case nir_op_imul_high: 134bf215546Sopenharmony_ci case nir_op_umul_high: 135bf215546Sopenharmony_ci if (b->shader->options->lower_mul_high) { 136bf215546Sopenharmony_ci nir_ssa_def *src0 = nir_ssa_for_alu_src(b, instr, 0); 137bf215546Sopenharmony_ci nir_ssa_def *src1 = nir_ssa_for_alu_src(b, instr, 1); 138bf215546Sopenharmony_ci if (src0->bit_size < 32) { 139bf215546Sopenharmony_ci /* Just do the math in 32-bit space and shift the result */ 140bf215546Sopenharmony_ci nir_alu_type base_type = nir_op_infos[instr->op].output_type; 141bf215546Sopenharmony_ci nir_op upcast_op = nir_type_conversion_op(base_type | src0->bit_size, base_type | 32, nir_rounding_mode_undef); 142bf215546Sopenharmony_ci nir_op downscast_op = nir_type_conversion_op(base_type | 32, base_type | src0->bit_size, nir_rounding_mode_undef); 143bf215546Sopenharmony_ci 144bf215546Sopenharmony_ci nir_ssa_def *src0_32 = nir_build_alu(b, upcast_op, src0, NULL, NULL, NULL); 145bf215546Sopenharmony_ci nir_ssa_def *src1_32 = nir_build_alu(b, upcast_op, src1, NULL, NULL, NULL); 146bf215546Sopenharmony_ci nir_ssa_def *dest_32 = nir_imul(b, src0_32, src1_32); 147bf215546Sopenharmony_ci nir_ssa_def *dest_shifted = nir_ishr(b, dest_32, nir_imm_int(b, src0->bit_size)); 148bf215546Sopenharmony_ci lowered = nir_build_alu(b, downscast_op, dest_shifted, NULL, NULL, NULL); 149bf215546Sopenharmony_ci } else { 150bf215546Sopenharmony_ci nir_ssa_def *c1 = nir_imm_intN_t(b, 1, src0->bit_size); 151bf215546Sopenharmony_ci nir_ssa_def *cshift = nir_imm_int(b, src0->bit_size / 2); 152bf215546Sopenharmony_ci nir_ssa_def *cmask = nir_imm_intN_t(b, (1ull << (src0->bit_size / 2)) - 1, src0->bit_size); 153bf215546Sopenharmony_ci nir_ssa_def *different_signs = NULL; 154bf215546Sopenharmony_ci if (instr->op == nir_op_imul_high) { 155bf215546Sopenharmony_ci nir_ssa_def *c0 = nir_imm_intN_t(b, 0, src0->bit_size); 156bf215546Sopenharmony_ci different_signs = nir_ixor(b, 157bf215546Sopenharmony_ci nir_ilt(b, src0, c0), 158bf215546Sopenharmony_ci nir_ilt(b, src1, c0)); 159bf215546Sopenharmony_ci src0 = nir_iabs(b, src0); 160bf215546Sopenharmony_ci src1 = nir_iabs(b, src1); 161bf215546Sopenharmony_ci } 162bf215546Sopenharmony_ci 163bf215546Sopenharmony_ci /* ABCD 164bf215546Sopenharmony_ci * * EFGH 165bf215546Sopenharmony_ci * ====== 166bf215546Sopenharmony_ci * (GH * CD) + (GH * AB) << 16 + (EF * CD) << 16 + (EF * AB) << 32 167bf215546Sopenharmony_ci * 168bf215546Sopenharmony_ci * Start by splitting into the 4 multiplies. 169bf215546Sopenharmony_ci */ 170bf215546Sopenharmony_ci nir_ssa_def *src0l = nir_iand(b, src0, cmask); 171bf215546Sopenharmony_ci nir_ssa_def *src1l = nir_iand(b, src1, cmask); 172bf215546Sopenharmony_ci nir_ssa_def *src0h = nir_ushr(b, src0, cshift); 173bf215546Sopenharmony_ci nir_ssa_def *src1h = nir_ushr(b, src1, cshift); 174bf215546Sopenharmony_ci 175bf215546Sopenharmony_ci nir_ssa_def *lo = nir_imul(b, src0l, src1l); 176bf215546Sopenharmony_ci nir_ssa_def *m1 = nir_imul(b, src0l, src1h); 177bf215546Sopenharmony_ci nir_ssa_def *m2 = nir_imul(b, src0h, src1l); 178bf215546Sopenharmony_ci nir_ssa_def *hi = nir_imul(b, src0h, src1h); 179bf215546Sopenharmony_ci 180bf215546Sopenharmony_ci nir_ssa_def *tmp; 181bf215546Sopenharmony_ci 182bf215546Sopenharmony_ci tmp = nir_ishl(b, m1, cshift); 183bf215546Sopenharmony_ci hi = nir_iadd(b, hi, nir_iand(b, nir_uadd_carry(b, lo, tmp), c1)); 184bf215546Sopenharmony_ci lo = nir_iadd(b, lo, tmp); 185bf215546Sopenharmony_ci hi = nir_iadd(b, hi, nir_ushr(b, m1, cshift)); 186bf215546Sopenharmony_ci 187bf215546Sopenharmony_ci tmp = nir_ishl(b, m2, cshift); 188bf215546Sopenharmony_ci hi = nir_iadd(b, hi, nir_iand(b, nir_uadd_carry(b, lo, tmp), c1)); 189bf215546Sopenharmony_ci lo = nir_iadd(b, lo, tmp); 190bf215546Sopenharmony_ci hi = nir_iadd(b, hi, nir_ushr(b, m2, cshift)); 191bf215546Sopenharmony_ci 192bf215546Sopenharmony_ci if (instr->op == nir_op_imul_high) { 193bf215546Sopenharmony_ci /* For channels where different_signs is set we have to perform a 194bf215546Sopenharmony_ci * 64-bit negation. This is *not* the same as just negating the 195bf215546Sopenharmony_ci * high 32-bits. Consider -3 * 2. The high 32-bits is 0, but the 196bf215546Sopenharmony_ci * desired result is -1, not -0! Recall -x == ~x + 1. 197bf215546Sopenharmony_ci */ 198bf215546Sopenharmony_ci hi = nir_bcsel(b, different_signs, 199bf215546Sopenharmony_ci nir_iadd(b, 200bf215546Sopenharmony_ci nir_inot(b, hi), 201bf215546Sopenharmony_ci nir_iand(b, 202bf215546Sopenharmony_ci nir_uadd_carry(b, 203bf215546Sopenharmony_ci nir_inot(b, lo), 204bf215546Sopenharmony_ci c1), 205bf215546Sopenharmony_ci nir_imm_intN_t(b, 1, src0->bit_size))), 206bf215546Sopenharmony_ci hi); 207bf215546Sopenharmony_ci } 208bf215546Sopenharmony_ci 209bf215546Sopenharmony_ci lowered = hi; 210bf215546Sopenharmony_ci } 211bf215546Sopenharmony_ci } 212bf215546Sopenharmony_ci break; 213bf215546Sopenharmony_ci 214bf215546Sopenharmony_ci default: 215bf215546Sopenharmony_ci break; 216bf215546Sopenharmony_ci } 217bf215546Sopenharmony_ci 218bf215546Sopenharmony_ci if (lowered) { 219bf215546Sopenharmony_ci nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, lowered); 220bf215546Sopenharmony_ci nir_instr_remove(&instr->instr); 221bf215546Sopenharmony_ci return true; 222bf215546Sopenharmony_ci } else { 223bf215546Sopenharmony_ci return false; 224bf215546Sopenharmony_ci } 225bf215546Sopenharmony_ci} 226bf215546Sopenharmony_ci 227bf215546Sopenharmony_cibool 228bf215546Sopenharmony_cinir_lower_alu(nir_shader *shader) 229bf215546Sopenharmony_ci{ 230bf215546Sopenharmony_ci bool progress = false; 231bf215546Sopenharmony_ci 232bf215546Sopenharmony_ci if (!shader->options->lower_bitfield_reverse && 233bf215546Sopenharmony_ci !shader->options->lower_mul_high) 234bf215546Sopenharmony_ci return false; 235bf215546Sopenharmony_ci 236bf215546Sopenharmony_ci nir_foreach_function(function, shader) { 237bf215546Sopenharmony_ci if (function->impl) { 238bf215546Sopenharmony_ci nir_builder builder; 239bf215546Sopenharmony_ci nir_builder_init(&builder, function->impl); 240bf215546Sopenharmony_ci 241bf215546Sopenharmony_ci nir_foreach_block(block, function->impl) { 242bf215546Sopenharmony_ci nir_foreach_instr_safe(instr, block) { 243bf215546Sopenharmony_ci if (instr->type == nir_instr_type_alu) { 244bf215546Sopenharmony_ci progress = lower_alu_instr(nir_instr_as_alu(instr), 245bf215546Sopenharmony_ci &builder) || progress; 246bf215546Sopenharmony_ci } 247bf215546Sopenharmony_ci } 248bf215546Sopenharmony_ci } 249bf215546Sopenharmony_ci 250bf215546Sopenharmony_ci if (progress) { 251bf215546Sopenharmony_ci nir_metadata_preserve(function->impl, 252bf215546Sopenharmony_ci nir_metadata_block_index | 253bf215546Sopenharmony_ci nir_metadata_dominance); 254bf215546Sopenharmony_ci } 255bf215546Sopenharmony_ci } 256bf215546Sopenharmony_ci } 257bf215546Sopenharmony_ci 258bf215546Sopenharmony_ci return progress; 259bf215546Sopenharmony_ci} 260