/*
 * Copyright (C) 2021 Collabora, Ltd.
 * Copyright (C) 2021 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"
#include "bi_builder.h"

/*
 * Due to a Bifrost encoding restriction, some instructions cannot have an abs
 * modifier on both sources. Check if adding a fabs modifier to a given source
 * of a binary instruction would cause this restriction to be hit.
 */
static bool
bi_would_impact_abs(unsigned arch, bi_instr *I, bi_index repl, unsigned s)
{
   return (arch <= 8) && I->src[1 - s].abs &&
          bi_is_word_equiv(I->src[1 - s], repl);
}

static bool
bi_takes_fabs(unsigned arch, bi_instr *I, bi_index repl, unsigned s)
{
   switch (I->op) {
   case BI_OPCODE_FCMP_V2F16:
   case BI_OPCODE_FMAX_V2F16:
   case BI_OPCODE_FMIN_V2F16:
      return !bi_would_impact_abs(arch, I, repl, s);
   case BI_OPCODE_FADD_V2F16:
      /*
       * For FADD.v2f16, the FMA pipe has the abs encoding hazard,
       * while the FADD pipe cannot encode a clamp. Either case in
       * isolation can be worked around in the scheduler, but both
       * together is impossible to encode. Avoid the hazard.
       */
      return !(I->clamp && bi_would_impact_abs(arch, I, repl, s));
   case BI_OPCODE_V2F32_TO_V2F16:
      /* TODO: Needs both match or lower */
      return false;
   case BI_OPCODE_FLOG_TABLE_F32:
      /* TODO: Need to check mode */
      return false;
   default:
      return bi_opcode_props[I->op].abs & BITFIELD_BIT(s);
   }
}
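/*
 * For illustration only (hypothetical IR, assuming the usual Bifrost
 * disassembly syntax): on arch <= 8, a fused
 *
 *    y = FADD.v2f16.clamp_0_1 abs(x.h00), abs(x.h10)
 *
 * would need the FMA unit to encode abs on both halves of the same word but
 * the FADD unit to encode the clamp, so neither unit can encode it.
 * bi_takes_fabs above refuses the abs fusion in exactly that case and keeps
 * the FABSNEG instruction around instead.
 */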
static bool
bi_takes_fneg(unsigned arch, bi_instr *I, unsigned s)
{
   switch (I->op) {
   case BI_OPCODE_CUBE_SSEL:
   case BI_OPCODE_CUBE_TSEL:
   case BI_OPCODE_CUBEFACE:
      /* TODO: Bifrost encoding restriction: need to match or lower */
      return arch >= 9;
   case BI_OPCODE_FREXPE_F32:
   case BI_OPCODE_FREXPE_V2F16:
   case BI_OPCODE_FLOG_TABLE_F32:
      /* TODO: Need to check mode */
      return false;
   default:
      return bi_opcode_props[I->op].neg & BITFIELD_BIT(s);
   }
}

static bool
bi_is_fabsneg(enum bi_opcode op, enum bi_size size)
{
   return (size == BI_SIZE_32 && op == BI_OPCODE_FABSNEG_F32) ||
          (size == BI_SIZE_16 && op == BI_OPCODE_FABSNEG_V2F16);
}

static enum bi_swizzle
bi_compose_swizzle_16(enum bi_swizzle a, enum bi_swizzle b)
{
   assert(a <= BI_SWIZZLE_H11);
   assert(b <= BI_SWIZZLE_H11);

   bool al = (a & BI_SWIZZLE_H10);
   bool ar = (a & BI_SWIZZLE_H01);
   bool bl = (b & BI_SWIZZLE_H10);
   bool br = (b & BI_SWIZZLE_H01);

   return ((al ? br : bl) ? BI_SWIZZLE_H10 : 0) |
          ((ar ? br : bl) ? BI_SWIZZLE_H01 : 0);
}

/* Like bi_replace_index, but composes instead of overwrites */

static inline bi_index
bi_compose_float_index(bi_index old, bi_index repl)
{
   /* abs(-x) = abs(+x), so ignore repl.neg if old.abs is set. Otherwise,
    * -(-x) = x but -(+x) = +(-x), so exclusive-or the negates. */
   repl.neg = old.neg ^ (repl.neg && !old.abs);

   /* +/- abs(+/- abs(x)) = +/- abs(x), etc, so just OR the two */
   repl.abs |= old.abs;

   /* Use the old swizzle to select from the replacement swizzle */
   repl.swizzle = bi_compose_swizzle_16(old.swizzle, repl.swizzle);

   return repl;
}

/* DISCARD.b32(FCMP.f(x, y)) --> DISCARD.f(x, y) */

static inline void
bi_fuse_discard_fcmp(bi_instr *I, bi_instr *mod, unsigned arch)
{
   if (I->op != BI_OPCODE_DISCARD_B32)
      return;
   if (mod->op != BI_OPCODE_FCMP_F32 && mod->op != BI_OPCODE_FCMP_V2F16)
      return;
   if (mod->cmpf >= BI_CMPF_GTLT)
      return;

   /* .abs and .neg modifiers are allowed on Valhall DISCARD but not Bifrost */
   bool absneg = mod->src[0].neg || mod->src[0].abs;
   absneg |= mod->src[1].neg || mod->src[1].abs;

   if (arch <= 8 && absneg)
      return;

   enum bi_swizzle r = I->src[0].swizzle;

   /* result_type doesn't matter */
   I->op = BI_OPCODE_DISCARD_F32;
   I->cmpf = mod->cmpf;
   I->src[0] = mod->src[0];
   I->src[1] = mod->src[1];

   if (mod->op == BI_OPCODE_FCMP_V2F16) {
      I->src[0].swizzle = bi_compose_swizzle_16(r, I->src[0].swizzle);
      I->src[1].swizzle = bi_compose_swizzle_16(r, I->src[1].swizzle);
   }
}

void
bi_opt_mod_prop_forward(bi_context *ctx)
{
   bi_instr **lut = calloc(ctx->ssa_alloc, sizeof(bi_instr *));

   bi_foreach_instr_global_safe(ctx, I) {
      if (bi_is_ssa(I->dest[0]))
         lut[I->dest[0].value] = I;

      bi_foreach_src(I, s) {
         if (!bi_is_ssa(I->src[s]))
            continue;

         bi_instr *mod = lut[I->src[s].value];

         if (!mod)
            continue;

         unsigned size = bi_opcode_props[I->op].size;

         bi_fuse_discard_fcmp(I, mod, ctx->arch);

         if (bi_is_fabsneg(mod->op, size)) {
            if (mod->src[0].abs &&
                !bi_takes_fabs(ctx->arch, I, mod->src[0], s))
               continue;

            if (mod->src[0].neg && !bi_takes_fneg(ctx->arch, I, s))
               continue;

            I->src[s] = bi_compose_float_index(I->src[s], mod->src[0]);
         }
      }
   }

   free(lut);
}
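/*
 * A minimal sketch of the forward pass above (hypothetical IR): given
 *
 *    y = FABSNEG.f32 -abs(x)
 *    z = FMIN.f32 y, w
 *
 * the modifiers are folded into each use that can encode them,
 *
 *    z = FMIN.f32 -abs(x), w
 *
 * and the now-dead FABSNEG is left for dead code elimination. The
 * bi_takes_fabs/bi_takes_fneg checks gate the fold on the consumer actually
 * being able to encode the composed modifiers.
 */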
/* RSCALE instructions have restrictions on how their clamp may be used; they
 * are only generated by specialized transcendental sequences that set the
 * clamp explicitly anyway, so do not fuse clamps into them. */

static bool
bi_takes_clamp(bi_instr *I)
{
   switch (I->op) {
   case BI_OPCODE_FMA_RSCALE_F32:
   case BI_OPCODE_FMA_RSCALE_V2F16:
   case BI_OPCODE_FADD_RSCALE_F32:
      return false;
   case BI_OPCODE_FADD_V2F16:
      /* Encoding restriction */
      return !(I->src[0].abs && I->src[1].abs &&
               bi_is_word_equiv(I->src[0], I->src[1]));
   default:
      return bi_opcode_props[I->op].clamp;
   }
}

static bool
bi_is_fclamp(enum bi_opcode op, enum bi_size size)
{
   return (size == BI_SIZE_32 && op == BI_OPCODE_FCLAMP_F32) ||
          (size == BI_SIZE_16 && op == BI_OPCODE_FCLAMP_V2F16);
}

static bool
bi_optimizer_clamp(bi_instr *I, bi_instr *use)
{
   if (!bi_is_fclamp(use->op, bi_opcode_props[I->op].size))
      return false;
   if (!bi_takes_clamp(I))
      return false;

   /* Clamps are bitfields (clamp_m1_1/clamp_0_inf), so composition is OR */
   I->clamp |= use->clamp;
   I->dest[0] = use->dest[0];
   return true;
}

static enum bi_opcode
bi_sized_mux_op(unsigned size)
{
   switch (size) {
   case 8:
      return BI_OPCODE_MUX_V4I8;
   case 16:
      return BI_OPCODE_MUX_V2I16;
   case 32:
      return BI_OPCODE_MUX_I32;
   default:
      unreachable("invalid size");
   }
}

static bool
bi_is_fixed_mux(bi_instr *I, unsigned size, bi_index v1)
{
   return I->op == bi_sized_mux_op(size) &&
          bi_is_value_equiv(I->src[0], bi_zero()) &&
          bi_is_value_equiv(I->src[1], v1);
}

static bool
bi_takes_int_result_type(enum bi_opcode op)
{
   switch (op) {
   case BI_OPCODE_ICMP_I32:
   case BI_OPCODE_ICMP_S32:
   case BI_OPCODE_ICMP_U32:
   case BI_OPCODE_ICMP_V2I16:
   case BI_OPCODE_ICMP_V2S16:
   case BI_OPCODE_ICMP_V2U16:
   case BI_OPCODE_ICMP_V4I8:
   case BI_OPCODE_ICMP_V4S8:
   case BI_OPCODE_ICMP_V4U8:
   case BI_OPCODE_FCMP_F32:
   case BI_OPCODE_FCMP_V2F16:
      return true;
   default:
      return false;
   }
}

static bool
bi_takes_float_result_type(enum bi_opcode op)
{
   return (op == BI_OPCODE_FCMP_F32) ||
          (op == BI_OPCODE_FCMP_V2F16);
}

/* CMP+MUX -> CMP with result type */
static bool
bi_optimizer_result_type(bi_instr *I, bi_instr *mux)
{
   if (bi_opcode_props[I->op].size != bi_opcode_props[mux->op].size)
      return false;

   if (bi_is_fixed_mux(mux, 32, bi_imm_f32(1.0)) ||
       bi_is_fixed_mux(mux, 16, bi_imm_f16(1.0))) {

      if (!bi_takes_float_result_type(I->op))
         return false;

      I->result_type = BI_RESULT_TYPE_F1;
   } else if (bi_is_fixed_mux(mux, 32, bi_imm_u32(1)) ||
              bi_is_fixed_mux(mux, 16, bi_imm_u16(1)) ||
              bi_is_fixed_mux(mux, 8, bi_imm_u8(1))) {

      if (!bi_takes_int_result_type(I->op))
         return false;

      I->result_type = BI_RESULT_TYPE_I1;
   } else {
      return false;
   }

   I->dest[0] = mux->dest[0];
   return true;
}
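/*
 * Sketch of the fusion above (hypothetical IR; operand order follows
 * bi_is_fixed_mux: src[0] = 0, src[1] = 1). A boolean-to-float select
 *
 *    b = FCMP.f32.lt x, y
 *    f = MUX.i32 #0x0, #0x3f800000 (1.0f), b
 *
 * collapses to
 *
 *    f = FCMP.f32.lt x, y  with result_type = f1
 *
 * letting the comparison materialize 1.0f/0.0f directly instead of going
 * through a separate select.
 */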
static bool
bi_is_var_tex(bi_instr *var, bi_instr *tex)
{
   return (var->op == BI_OPCODE_LD_VAR_IMM) &&
          (tex->op == BI_OPCODE_TEXS_2D_F16 ||
           tex->op == BI_OPCODE_TEXS_2D_F32) &&
          (var->register_format == BI_REGISTER_FORMAT_F32) &&
          ((var->sample == BI_SAMPLE_CENTER &&
            var->update == BI_UPDATE_STORE) ||
           (var->sample == BI_SAMPLE_NONE &&
            var->update == BI_UPDATE_RETRIEVE)) &&
          (tex->texture_index == tex->sampler_index) &&
          (tex->texture_index < 4) &&
          (var->index < 8);
}

static bool
bi_optimizer_var_tex(bi_context *ctx, bi_instr *var, bi_instr *tex)
{
   if (!bi_is_var_tex(var, tex))
      return false;

   /* Construct the corresponding VAR_TEX instruction */
   bi_builder b = bi_init_builder(ctx, bi_after_instr(var));

   bi_instr *I = bi_var_tex_f32_to(&b, tex->dest[0], tex->lod_mode,
                                   var->sample, var->update,
                                   tex->texture_index, var->index);
   I->skip = tex->skip;

   if (tex->op == BI_OPCODE_TEXS_2D_F16)
      I->op = BI_OPCODE_VAR_TEX_F16;

   /* Dead code elimination will clean up for us */
   return true;
}

void
bi_opt_mod_prop_backward(bi_context *ctx)
{
   unsigned count = ctx->ssa_alloc;
   bi_instr **uses = calloc(count, sizeof(*uses));
   BITSET_WORD *multiple = calloc(BITSET_WORDS(count), sizeof(*multiple));

   bi_foreach_instr_global_rev(ctx, I) {
      bi_foreach_src(I, s) {
         if (bi_is_ssa(I->src[s])) {
            unsigned v = I->src[s].value;

            if (uses[v] && uses[v] != I)
               BITSET_SET(multiple, v);
            else
               uses[v] = I;
         }
      }

      if (!bi_is_ssa(I->dest[0]))
         continue;

      bi_instr *use = uses[I->dest[0].value];

      if (!use || BITSET_TEST(multiple, I->dest[0].value))
         continue;

      /* Destination has a single use, try to propagate */
      bool propagated =
         bi_optimizer_clamp(I, use) ||
         bi_optimizer_result_type(I, use);

      if (!propagated && I->op == BI_OPCODE_LD_VAR_IMM &&
          use->op == BI_OPCODE_SPLIT_I32) {
         /* Need to see through the split in a
          * ld_var_imm/split/var_tex sequence
          */
         assert(bi_is_ssa(use->dest[0]));
         bi_instr *tex = uses[use->dest[0].value];

         if (!tex || BITSET_TEST(multiple, use->dest[0].value))
            continue;

         use = tex;
         propagated = bi_optimizer_var_tex(ctx, I, use);
      }

      if (propagated) {
         bi_remove_instruction(use);
         continue;
      }
   }

   free(uses);
   free(multiple);
}

/** Lower pseudo instructions that exist to simplify the optimizer */

void
bi_lower_opt_instruction(bi_instr *I)
{
   switch (I->op) {
   case BI_OPCODE_FABSNEG_F32:
   case BI_OPCODE_FABSNEG_V2F16:
   case BI_OPCODE_FCLAMP_F32:
   case BI_OPCODE_FCLAMP_V2F16:
      I->op = (bi_opcode_props[I->op].size == BI_SIZE_32) ?
              BI_OPCODE_FADD_F32 : BI_OPCODE_FADD_V2F16;

      I->round = BI_ROUND_NONE;
      I->src[1] = bi_negzero();
      break;

   case BI_OPCODE_DISCARD_B32:
      I->op = BI_OPCODE_DISCARD_F32;
      I->src[1] = bi_imm_u32(0);
      I->cmpf = BI_CMPF_NE;
      break;

   default:
      break;
   }
}
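/*
 * Ordering note (an assumption about the surrounding driver, not enforced
 * here): the forward pass leaves fused-away FABSNEG producers in place, and
 * bi_optimizer_var_tex leaves the old LD_VAR_IMM/TEXS_2D pair behind, so
 * dead code elimination is expected to run after both propagation passes
 * ("Dead code elimination will clean up for us" above). Likewise,
 * bi_lower_opt_instruction must run late enough to rewrite any surviving
 * pseudo instructions: FABSNEG/FCLAMP become FADDs against -0.0 (which
 * preserves the value, including signed zero), and DISCARD.b32 becomes a
 * DISCARD.f32 comparing != 0.
 */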