1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 * DEALINGS IN THE SOFTWARE. 22 */ 23 24/** 25 * \file lower_instructions.cpp 26 * 27 * Many GPUs lack native instructions for certain expression operations, and 28 * must replace them with some other expression tree. This pass lowers some 29 * of the most common cases, allowing the lowering code to be implemented once 30 * rather than in each driver backend. 31 * 32 * Currently supported transformations: 33 * - SUB_TO_ADD_NEG 34 * - LDEXP_TO_ARITH 35 * - CARRY_TO_ARITH 36 * - BORROW_TO_ARITH 37 * - DOPS_TO_DFRAC 38 * 39 * SUB_TO_ADD_NEG: 40 * --------------- 41 * Breaks an ir_binop_sub expression down to add(op0, neg(op1)) 42 * 43 * This simplifies expression reassociation, and for many backends 44 * there is no subtract operation separate from adding the negation. 45 * For backends with native subtract operations, they will probably 46 * want to recognize add(op0, neg(op1)) or the other way around to 47 * produce a subtract anyway. 48 * 49 * LDEXP_TO_ARITH: 50 * ------------- 51 * Converts ir_binop_ldexp to arithmetic and bit operations for float sources. 52 * 53 * DFREXP_DLDEXP_TO_ARITH: 54 * --------------- 55 * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to 56 * arithmetic and bit ops for double arguments. 57 * 58 * CARRY_TO_ARITH: 59 * --------------- 60 * Converts ir_carry into (x + y) < x. 61 * 62 * BORROW_TO_ARITH: 63 * ---------------- 64 * Converts ir_borrow into (x < y). 65 * 66 * DOPS_TO_DFRAC: 67 * -------------- 68 * Converts double trunc, ceil, floor, round to fract 69 */ 70 71#include "program/prog_instruction.h" /* for swizzle */ 72#include "compiler/glsl_types.h" 73#include "ir.h" 74#include "ir_builder.h" 75#include "ir_optimization.h" 76#include "util/half_float.h" 77 78#include <math.h> 79 80using namespace ir_builder; 81 82namespace { 83 84class lower_instructions_visitor : public ir_hierarchical_visitor { 85public: 86 lower_instructions_visitor(unsigned lower) 87 : progress(false), lower(lower) { } 88 89 ir_visitor_status visit_leave(ir_expression *); 90 91 bool progress; 92 93private: 94 unsigned lower; /** Bitfield of which operations to lower */ 95 96 void sub_to_add_neg(ir_expression *); 97 void ldexp_to_arith(ir_expression *); 98 void dldexp_to_arith(ir_expression *); 99 void dfrexp_sig_to_arith(ir_expression *); 100 void dfrexp_exp_to_arith(ir_expression *); 101 void carry_to_arith(ir_expression *); 102 void borrow_to_arith(ir_expression *); 103 void double_dot_to_fma(ir_expression *); 104 void double_lrp(ir_expression *); 105 void dceil_to_dfrac(ir_expression *); 106 void dfloor_to_dfrac(ir_expression *); 107 void dround_even_to_dfrac(ir_expression *); 108 void dtrunc_to_dfrac(ir_expression *); 109 void dsign_to_csel(ir_expression *); 110 void bit_count_to_math(ir_expression *); 111 void extract_to_shifts(ir_expression *); 112 void insert_to_shifts(ir_expression *); 113 void reverse_to_shifts(ir_expression *ir); 114 void find_lsb_to_float_cast(ir_expression *ir); 115 void find_msb_to_float_cast(ir_expression *ir); 116 void imul_high_to_mul(ir_expression *ir); 117 void sqrt_to_abs_sqrt(ir_expression *ir); 118 119 ir_expression *_carry(operand a, operand b); 120 121 static ir_constant *_imm_fp(void *mem_ctx, 122 const glsl_type *type, 123 double f, 124 unsigned vector_elements=1); 125}; 126 127} /* anonymous namespace */ 128 129/** 130 * Determine if a particular type of lowering should occur 131 */ 132#define lowering(x) (this->lower & x) 133 134bool 135lower_instructions(exec_list *instructions, unsigned what_to_lower) 136{ 137 lower_instructions_visitor v(what_to_lower); 138 139 visit_list_elements(&v, instructions); 140 return v.progress; 141} 142 143void 144lower_instructions_visitor::sub_to_add_neg(ir_expression *ir) 145{ 146 ir->operation = ir_binop_add; 147 ir->init_num_operands(); 148 ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type, 149 ir->operands[1], NULL); 150 this->progress = true; 151} 152 153void 154lower_instructions_visitor::ldexp_to_arith(ir_expression *ir) 155{ 156 /* Translates 157 * ir_binop_ldexp x exp 158 * into 159 * 160 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift); 161 * resulting_biased_exp = min(extracted_biased_exp + exp, 255); 162 * 163 * if (extracted_biased_exp >= 255) 164 * return x; // +/-inf, NaN 165 * 166 * sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask; 167 * 168 * if (min(resulting_biased_exp, extracted_biased_exp) < 1) 169 * resulting_biased_exp = 0; 170 * if (resulting_biased_exp >= 255 || 171 * min(resulting_biased_exp, extracted_biased_exp) < 1) { 172 * sign_mantissa &= sign_mask; 173 * } 174 * 175 * return bitcast_u2f(sign_mantissa | 176 * lshift(i2u(resulting_biased_exp), exp_shift)); 177 * 178 * which we can't actually implement as such, since the GLSL IR doesn't 179 * have vectorized if-statements. We actually implement it without branches 180 * using conditional-select: 181 * 182 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift); 183 * resulting_biased_exp = min(extracted_biased_exp + exp, 255); 184 * 185 * sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask; 186 * 187 * flush_to_zero = lequal(min(resulting_biased_exp, extracted_biased_exp), 0); 188 * resulting_biased_exp = csel(flush_to_zero, 0, resulting_biased_exp) 189 * zero_mantissa = logic_or(flush_to_zero, 190 * gequal(resulting_biased_exp, 255)); 191 * sign_mantissa = csel(zero_mantissa, sign_mantissa & sign_mask, sign_mantissa); 192 * 193 * result = sign_mantissa | 194 * lshift(i2u(resulting_biased_exp), exp_shift)); 195 * 196 * return csel(extracted_biased_exp >= 255, x, bitcast_u2f(result)); 197 * 198 * The definition of ldexp in the GLSL spec says: 199 * 200 * "If this product is too large to be represented in the 201 * floating-point type, the result is undefined." 202 * 203 * However, the definition of ldexp in the GLSL ES spec does not contain 204 * this sentence, so we do need to handle overflow correctly. 205 * 206 * There is additional language limiting the defined range of exp, but this 207 * is merely to allow implementations that store 2^exp in a temporary 208 * variable. 209 */ 210 211 const unsigned vec_elem = ir->type->vector_elements; 212 213 /* Types */ 214 const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1); 215 const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1); 216 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 217 218 /* Temporary variables */ 219 ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary); 220 ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary); 221 ir_variable *result = new(ir) ir_variable(uvec, "result", ir_var_temporary); 222 223 ir_variable *extracted_biased_exp = 224 new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary); 225 ir_variable *resulting_biased_exp = 226 new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary); 227 228 ir_variable *sign_mantissa = 229 new(ir) ir_variable(uvec, "sign_mantissa", ir_var_temporary); 230 231 ir_variable *flush_to_zero = 232 new(ir) ir_variable(bvec, "flush_to_zero", ir_var_temporary); 233 ir_variable *zero_mantissa = 234 new(ir) ir_variable(bvec, "zero_mantissa", ir_var_temporary); 235 236 ir_instruction &i = *base_ir; 237 238 /* Copy <x> and <exp> arguments. */ 239 i.insert_before(x); 240 i.insert_before(assign(x, ir->operands[0])); 241 i.insert_before(exp); 242 i.insert_before(assign(exp, ir->operands[1])); 243 244 /* Extract the biased exponent from <x>. */ 245 i.insert_before(extracted_biased_exp); 246 i.insert_before(assign(extracted_biased_exp, 247 rshift(bitcast_f2i(abs(x)), 248 new(ir) ir_constant(23, vec_elem)))); 249 250 /* The definition of ldexp in the GLSL 4.60 spec says: 251 * 252 * "If exp is greater than +128 (single-precision) or +1024 253 * (double-precision), the value returned is undefined. If exp is less 254 * than -126 (single-precision) or -1022 (double-precision), the value 255 * returned may be flushed to zero." 256 * 257 * So we do not have to guard against the possibility of addition overflow, 258 * which could happen when exp is close to INT_MAX. Addition underflow 259 * cannot happen (the worst case is 0 + (-INT_MAX)). 260 */ 261 i.insert_before(resulting_biased_exp); 262 i.insert_before(assign(resulting_biased_exp, 263 min2(add(extracted_biased_exp, exp), 264 new(ir) ir_constant(255, vec_elem)))); 265 266 i.insert_before(sign_mantissa); 267 i.insert_before(assign(sign_mantissa, 268 bit_and(bitcast_f2u(x), 269 new(ir) ir_constant(0x807fffffu, vec_elem)))); 270 271 /* We flush to zero if the original or resulting biased exponent is 0, 272 * indicating a +/-0.0 or subnormal input or output. 273 * 274 * The mantissa is set to 0 if the resulting biased exponent is 255, since 275 * an overflow should produce a +/-inf result. 276 * 277 * Note that NaN inputs are handled separately. 278 */ 279 i.insert_before(flush_to_zero); 280 i.insert_before(assign(flush_to_zero, 281 lequal(min2(resulting_biased_exp, 282 extracted_biased_exp), 283 ir_constant::zero(ir, ivec)))); 284 i.insert_before(assign(resulting_biased_exp, 285 csel(flush_to_zero, 286 ir_constant::zero(ir, ivec), 287 resulting_biased_exp))); 288 289 i.insert_before(zero_mantissa); 290 i.insert_before(assign(zero_mantissa, 291 logic_or(flush_to_zero, 292 equal(resulting_biased_exp, 293 new(ir) ir_constant(255, vec_elem))))); 294 i.insert_before(assign(sign_mantissa, 295 csel(zero_mantissa, 296 bit_and(sign_mantissa, 297 new(ir) ir_constant(0x80000000u, vec_elem)), 298 sign_mantissa))); 299 300 /* Don't generate new IR that would need to be lowered in an additional 301 * pass. 302 */ 303 i.insert_before(result); 304 if (!lowering(INSERT_TO_SHIFTS)) { 305 i.insert_before(assign(result, 306 bitfield_insert(sign_mantissa, 307 i2u(resulting_biased_exp), 308 new(ir) ir_constant(23u, vec_elem), 309 new(ir) ir_constant(8u, vec_elem)))); 310 } else { 311 i.insert_before(assign(result, 312 bit_or(sign_mantissa, 313 lshift(i2u(resulting_biased_exp), 314 new(ir) ir_constant(23, vec_elem))))); 315 } 316 317 ir->operation = ir_triop_csel; 318 ir->init_num_operands(); 319 ir->operands[0] = gequal(extracted_biased_exp, 320 new(ir) ir_constant(255, vec_elem)); 321 ir->operands[1] = new(ir) ir_dereference_variable(x); 322 ir->operands[2] = bitcast_u2f(result); 323 324 this->progress = true; 325} 326 327void 328lower_instructions_visitor::dldexp_to_arith(ir_expression *ir) 329{ 330 /* See ldexp_to_arith for structure. Uses frexp_exp to extract the exponent 331 * from the significand. 332 */ 333 334 const unsigned vec_elem = ir->type->vector_elements; 335 336 /* Types */ 337 const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1); 338 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 339 340 /* Constants */ 341 ir_constant *zeroi = ir_constant::zero(ir, ivec); 342 343 ir_constant *sign_mask = new(ir) ir_constant(0x80000000u); 344 345 ir_constant *exp_shift = new(ir) ir_constant(20u); 346 ir_constant *exp_width = new(ir) ir_constant(11u); 347 ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem); 348 349 /* Temporary variables */ 350 ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary); 351 ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary); 352 353 ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x", 354 ir_var_temporary); 355 356 ir_variable *extracted_biased_exp = 357 new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary); 358 ir_variable *resulting_biased_exp = 359 new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary); 360 361 ir_variable *is_not_zero_or_underflow = 362 new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary); 363 364 ir_instruction &i = *base_ir; 365 366 /* Copy <x> and <exp> arguments. */ 367 i.insert_before(x); 368 i.insert_before(assign(x, ir->operands[0])); 369 i.insert_before(exp); 370 i.insert_before(assign(exp, ir->operands[1])); 371 372 ir_expression *frexp_exp = expr(ir_unop_frexp_exp, x); 373 if (lowering(DFREXP_DLDEXP_TO_ARITH)) 374 dfrexp_exp_to_arith(frexp_exp); 375 376 /* Extract the biased exponent from <x>. */ 377 i.insert_before(extracted_biased_exp); 378 i.insert_before(assign(extracted_biased_exp, add(frexp_exp, exp_bias))); 379 380 i.insert_before(resulting_biased_exp); 381 i.insert_before(assign(resulting_biased_exp, 382 add(extracted_biased_exp, exp))); 383 384 /* Test if result is ±0.0, subnormal, or underflow by checking if the 385 * resulting biased exponent would be less than 0x1. If so, the result is 386 * 0.0 with the sign of x. (Actually, invert the conditions so that 387 * immediate values are the second arguments, which is better for i965) 388 * TODO: Implement in a vector fashion. 389 */ 390 i.insert_before(zero_sign_x); 391 for (unsigned elem = 0; elem < vec_elem; elem++) { 392 ir_variable *unpacked = 393 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary); 394 i.insert_before(unpacked); 395 i.insert_before( 396 assign(unpacked, 397 expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1)))); 398 i.insert_before(assign(unpacked, bit_and(swizzle_y(unpacked), sign_mask->clone(ir, NULL)), 399 WRITEMASK_Y)); 400 i.insert_before(assign(unpacked, ir_constant::zero(ir, glsl_type::uint_type), WRITEMASK_X)); 401 i.insert_before(assign(zero_sign_x, 402 expr(ir_unop_pack_double_2x32, unpacked), 403 1 << elem)); 404 } 405 i.insert_before(is_not_zero_or_underflow); 406 i.insert_before(assign(is_not_zero_or_underflow, 407 gequal(resulting_biased_exp, 408 new(ir) ir_constant(0x1, vec_elem)))); 409 i.insert_before(assign(x, csel(is_not_zero_or_underflow, 410 x, zero_sign_x))); 411 i.insert_before(assign(resulting_biased_exp, 412 csel(is_not_zero_or_underflow, 413 resulting_biased_exp, zeroi))); 414 415 /* We could test for overflows by checking if the resulting biased exponent 416 * would be greater than 0xFE. Turns out we don't need to because the GLSL 417 * spec says: 418 * 419 * "If this product is too large to be represented in the 420 * floating-point type, the result is undefined." 421 */ 422 423 ir_rvalue *results[4] = {NULL}; 424 for (unsigned elem = 0; elem < vec_elem; elem++) { 425 ir_variable *unpacked = 426 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary); 427 i.insert_before(unpacked); 428 i.insert_before( 429 assign(unpacked, 430 expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1)))); 431 432 ir_expression *bfi = bitfield_insert( 433 swizzle_y(unpacked), 434 i2u(swizzle(resulting_biased_exp, elem, 1)), 435 exp_shift->clone(ir, NULL), 436 exp_width->clone(ir, NULL)); 437 438 i.insert_before(assign(unpacked, bfi, WRITEMASK_Y)); 439 440 results[elem] = expr(ir_unop_pack_double_2x32, unpacked); 441 } 442 443 ir->operation = ir_quadop_vector; 444 ir->init_num_operands(); 445 ir->operands[0] = results[0]; 446 ir->operands[1] = results[1]; 447 ir->operands[2] = results[2]; 448 ir->operands[3] = results[3]; 449 450 /* Don't generate new IR that would need to be lowered in an additional 451 * pass. 452 */ 453 454 this->progress = true; 455} 456 457void 458lower_instructions_visitor::dfrexp_sig_to_arith(ir_expression *ir) 459{ 460 const unsigned vec_elem = ir->type->vector_elements; 461 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 462 463 /* Double-precision floating-point values are stored as 464 * 1 sign bit; 465 * 11 exponent bits; 466 * 52 mantissa bits. 467 * 468 * We're just extracting the significand here, so we only need to modify 469 * the upper 32-bit uint. Unfortunately we must extract each double 470 * independently as there is no vector version of unpackDouble. 471 */ 472 473 ir_instruction &i = *base_ir; 474 475 ir_variable *is_not_zero = 476 new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary); 477 ir_rvalue *results[4] = {NULL}; 478 479 ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem); 480 i.insert_before(is_not_zero); 481 i.insert_before( 482 assign(is_not_zero, 483 nequal(abs(ir->operands[0]->clone(ir, NULL)), dzero))); 484 485 /* TODO: Remake this as more vector-friendly when int64 support is 486 * available. 487 */ 488 for (unsigned elem = 0; elem < vec_elem; elem++) { 489 ir_constant *zero = new(ir) ir_constant(0u, 1); 490 ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x800fffffu, 1); 491 492 /* Exponent of double floating-point values in the range [0.5, 1.0). */ 493 ir_constant *exponent_value = new(ir) ir_constant(0x3fe00000u, 1); 494 495 ir_variable *bits = 496 new(ir) ir_variable(glsl_type::uint_type, "bits", ir_var_temporary); 497 ir_variable *unpacked = 498 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary); 499 500 ir_rvalue *x = swizzle(ir->operands[0]->clone(ir, NULL), elem, 1); 501 502 i.insert_before(bits); 503 i.insert_before(unpacked); 504 i.insert_before(assign(unpacked, expr(ir_unop_unpack_double_2x32, x))); 505 506 /* Manipulate the high uint to remove the exponent and replace it with 507 * either the default exponent or zero. 508 */ 509 i.insert_before(assign(bits, swizzle_y(unpacked))); 510 i.insert_before(assign(bits, bit_and(bits, sign_mantissa_mask))); 511 i.insert_before(assign(bits, bit_or(bits, 512 csel(swizzle(is_not_zero, elem, 1), 513 exponent_value, 514 zero)))); 515 i.insert_before(assign(unpacked, bits, WRITEMASK_Y)); 516 results[elem] = expr(ir_unop_pack_double_2x32, unpacked); 517 } 518 519 /* Put the dvec back together */ 520 ir->operation = ir_quadop_vector; 521 ir->init_num_operands(); 522 ir->operands[0] = results[0]; 523 ir->operands[1] = results[1]; 524 ir->operands[2] = results[2]; 525 ir->operands[3] = results[3]; 526 527 this->progress = true; 528} 529 530void 531lower_instructions_visitor::dfrexp_exp_to_arith(ir_expression *ir) 532{ 533 const unsigned vec_elem = ir->type->vector_elements; 534 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 535 const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1); 536 537 /* Double-precision floating-point values are stored as 538 * 1 sign bit; 539 * 11 exponent bits; 540 * 52 mantissa bits. 541 * 542 * We're just extracting the exponent here, so we only care about the upper 543 * 32-bit uint. 544 */ 545 546 ir_instruction &i = *base_ir; 547 548 ir_variable *is_not_zero = 549 new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary); 550 ir_variable *high_words = 551 new(ir) ir_variable(uvec, "high_words", ir_var_temporary); 552 ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem); 553 ir_constant *izero = new(ir) ir_constant(0, vec_elem); 554 555 ir_rvalue *absval = abs(ir->operands[0]); 556 557 i.insert_before(is_not_zero); 558 i.insert_before(high_words); 559 i.insert_before(assign(is_not_zero, nequal(absval->clone(ir, NULL), dzero))); 560 561 /* Extract all of the upper uints. */ 562 for (unsigned elem = 0; elem < vec_elem; elem++) { 563 ir_rvalue *x = swizzle(absval->clone(ir, NULL), elem, 1); 564 565 i.insert_before(assign(high_words, 566 swizzle_y(expr(ir_unop_unpack_double_2x32, x)), 567 1 << elem)); 568 569 } 570 ir_constant *exponent_shift = new(ir) ir_constant(20, vec_elem); 571 ir_constant *exponent_bias = new(ir) ir_constant(-1022, vec_elem); 572 573 /* For non-zero inputs, shift the exponent down and apply bias. */ 574 ir->operation = ir_triop_csel; 575 ir->init_num_operands(); 576 ir->operands[0] = new(ir) ir_dereference_variable(is_not_zero); 577 ir->operands[1] = add(exponent_bias, u2i(rshift(high_words, exponent_shift))); 578 ir->operands[2] = izero; 579 580 this->progress = true; 581} 582 583void 584lower_instructions_visitor::carry_to_arith(ir_expression *ir) 585{ 586 /* Translates 587 * ir_binop_carry x y 588 * into 589 * sum = ir_binop_add x y 590 * bcarry = ir_binop_less sum x 591 * carry = ir_unop_b2i bcarry 592 */ 593 594 ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL); 595 ir->operation = ir_unop_i2u; 596 ir->init_num_operands(); 597 ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone)); 598 ir->operands[1] = NULL; 599 600 this->progress = true; 601} 602 603void 604lower_instructions_visitor::borrow_to_arith(ir_expression *ir) 605{ 606 /* Translates 607 * ir_binop_borrow x y 608 * into 609 * bcarry = ir_binop_less x y 610 * carry = ir_unop_b2i bcarry 611 */ 612 613 ir->operation = ir_unop_i2u; 614 ir->init_num_operands(); 615 ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1])); 616 ir->operands[1] = NULL; 617 618 this->progress = true; 619} 620 621void 622lower_instructions_visitor::double_dot_to_fma(ir_expression *ir) 623{ 624 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type->get_base_type(), "dot_res", 625 ir_var_temporary); 626 this->base_ir->insert_before(temp); 627 628 int nc = ir->operands[0]->type->components(); 629 for (int i = nc - 1; i >= 1; i--) { 630 ir_assignment *assig; 631 if (i == (nc - 1)) { 632 assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1), 633 swizzle(ir->operands[1]->clone(ir, NULL), i, 1))); 634 } else { 635 assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1), 636 swizzle(ir->operands[1]->clone(ir, NULL), i, 1), 637 temp)); 638 } 639 this->base_ir->insert_before(assig); 640 } 641 642 ir->operation = ir_triop_fma; 643 ir->init_num_operands(); 644 ir->operands[0] = swizzle(ir->operands[0], 0, 1); 645 ir->operands[1] = swizzle(ir->operands[1], 0, 1); 646 ir->operands[2] = new(ir) ir_dereference_variable(temp); 647 648 this->progress = true; 649 650} 651 652void 653lower_instructions_visitor::double_lrp(ir_expression *ir) 654{ 655 int swizval; 656 ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2]; 657 ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements); 658 659 switch (op2->type->vector_elements) { 660 case 1: 661 swizval = SWIZZLE_XXXX; 662 break; 663 default: 664 assert(op0->type->vector_elements == op2->type->vector_elements); 665 swizval = SWIZZLE_XYZW; 666 break; 667 } 668 669 ir->operation = ir_triop_fma; 670 ir->init_num_operands(); 671 ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements); 672 ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0); 673 674 this->progress = true; 675} 676 677void 678lower_instructions_visitor::dceil_to_dfrac(ir_expression *ir) 679{ 680 /* 681 * frtemp = frac(x); 682 * temp = sub(x, frtemp); 683 * result = temp + ((frtemp != 0.0) ? 1.0 : 0.0); 684 */ 685 ir_instruction &i = *base_ir; 686 ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements); 687 ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements); 688 ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp", 689 ir_var_temporary); 690 691 i.insert_before(frtemp); 692 i.insert_before(assign(frtemp, fract(ir->operands[0]))); 693 694 ir->operation = ir_binop_add; 695 ir->init_num_operands(); 696 ir->operands[0] = sub(ir->operands[0]->clone(ir, NULL), frtemp); 697 ir->operands[1] = csel(nequal(frtemp, zero), one, zero->clone(ir, NULL)); 698 699 this->progress = true; 700} 701 702void 703lower_instructions_visitor::dfloor_to_dfrac(ir_expression *ir) 704{ 705 /* 706 * frtemp = frac(x); 707 * result = sub(x, frtemp); 708 */ 709 ir->operation = ir_binop_sub; 710 ir->init_num_operands(); 711 ir->operands[1] = fract(ir->operands[0]->clone(ir, NULL)); 712 713 this->progress = true; 714} 715void 716lower_instructions_visitor::dround_even_to_dfrac(ir_expression *ir) 717{ 718 /* 719 * insane but works 720 * temp = x + 0.5; 721 * frtemp = frac(temp); 722 * t2 = sub(temp, frtemp); 723 * if (frac(x) == 0.5) 724 * result = frac(t2 * 0.5) == 0 ? t2 : t2 - 1; 725 * else 726 * result = t2; 727 728 */ 729 ir_instruction &i = *base_ir; 730 ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp", 731 ir_var_temporary); 732 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp", 733 ir_var_temporary); 734 ir_variable *t2 = new(ir) ir_variable(ir->operands[0]->type, "t2", 735 ir_var_temporary); 736 ir_constant *p5 = new(ir) ir_constant(0.5, ir->operands[0]->type->vector_elements); 737 ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements); 738 ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements); 739 740 i.insert_before(temp); 741 i.insert_before(assign(temp, add(ir->operands[0], p5))); 742 743 i.insert_before(frtemp); 744 i.insert_before(assign(frtemp, fract(temp))); 745 746 i.insert_before(t2); 747 i.insert_before(assign(t2, sub(temp, frtemp))); 748 749 ir->operation = ir_triop_csel; 750 ir->init_num_operands(); 751 ir->operands[0] = equal(fract(ir->operands[0]->clone(ir, NULL)), 752 p5->clone(ir, NULL)); 753 ir->operands[1] = csel(equal(fract(mul(t2, p5->clone(ir, NULL))), 754 zero), 755 t2, 756 sub(t2, one)); 757 ir->operands[2] = new(ir) ir_dereference_variable(t2); 758 759 this->progress = true; 760} 761 762void 763lower_instructions_visitor::dtrunc_to_dfrac(ir_expression *ir) 764{ 765 /* 766 * frtemp = frac(x); 767 * temp = sub(x, frtemp); 768 * result = x >= 0 ? temp : temp + (frtemp == 0.0) ? 0 : 1; 769 */ 770 ir_rvalue *arg = ir->operands[0]; 771 ir_instruction &i = *base_ir; 772 773 ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements); 774 ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements); 775 ir_variable *frtemp = new(ir) ir_variable(arg->type, "frtemp", 776 ir_var_temporary); 777 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp", 778 ir_var_temporary); 779 780 i.insert_before(frtemp); 781 i.insert_before(assign(frtemp, fract(arg))); 782 i.insert_before(temp); 783 i.insert_before(assign(temp, sub(arg->clone(ir, NULL), frtemp))); 784 785 ir->operation = ir_triop_csel; 786 ir->init_num_operands(); 787 ir->operands[0] = gequal(arg->clone(ir, NULL), zero); 788 ir->operands[1] = new (ir) ir_dereference_variable(temp); 789 ir->operands[2] = add(temp, 790 csel(equal(frtemp, zero->clone(ir, NULL)), 791 zero->clone(ir, NULL), 792 one)); 793 794 this->progress = true; 795} 796 797void 798lower_instructions_visitor::dsign_to_csel(ir_expression *ir) 799{ 800 /* 801 * temp = x > 0.0 ? 1.0 : 0.0; 802 * result = x < 0.0 ? -1.0 : temp; 803 */ 804 ir_rvalue *arg = ir->operands[0]; 805 ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements); 806 ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements); 807 ir_constant *neg_one = new(ir) ir_constant(-1.0, arg->type->vector_elements); 808 809 ir->operation = ir_triop_csel; 810 ir->init_num_operands(); 811 ir->operands[0] = less(arg->clone(ir, NULL), 812 zero->clone(ir, NULL)); 813 ir->operands[1] = neg_one; 814 ir->operands[2] = csel(greater(arg, zero), 815 one, 816 zero->clone(ir, NULL)); 817 818 this->progress = true; 819} 820 821void 822lower_instructions_visitor::bit_count_to_math(ir_expression *ir) 823{ 824 /* For more details, see: 825 * 826 * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetPaallel 827 */ 828 const unsigned elements = ir->operands[0]->type->vector_elements; 829 ir_variable *temp = new(ir) ir_variable(glsl_type::uvec(elements), "temp", 830 ir_var_temporary); 831 ir_constant *c55555555 = new(ir) ir_constant(0x55555555u); 832 ir_constant *c33333333 = new(ir) ir_constant(0x33333333u); 833 ir_constant *c0F0F0F0F = new(ir) ir_constant(0x0F0F0F0Fu); 834 ir_constant *c01010101 = new(ir) ir_constant(0x01010101u); 835 ir_constant *c1 = new(ir) ir_constant(1u); 836 ir_constant *c2 = new(ir) ir_constant(2u); 837 ir_constant *c4 = new(ir) ir_constant(4u); 838 ir_constant *c24 = new(ir) ir_constant(24u); 839 840 base_ir->insert_before(temp); 841 842 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 843 base_ir->insert_before(assign(temp, ir->operands[0])); 844 } else { 845 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 846 base_ir->insert_before(assign(temp, i2u(ir->operands[0]))); 847 } 848 849 /* temp = temp - ((temp >> 1) & 0x55555555u); */ 850 base_ir->insert_before(assign(temp, sub(temp, bit_and(rshift(temp, c1), 851 c55555555)))); 852 853 /* temp = (temp & 0x33333333u) + ((temp >> 2) & 0x33333333u); */ 854 base_ir->insert_before(assign(temp, add(bit_and(temp, c33333333), 855 bit_and(rshift(temp, c2), 856 c33333333->clone(ir, NULL))))); 857 858 /* int(((temp + (temp >> 4) & 0xF0F0F0Fu) * 0x1010101u) >> 24); */ 859 ir->operation = ir_unop_u2i; 860 ir->init_num_operands(); 861 ir->operands[0] = rshift(mul(bit_and(add(temp, rshift(temp, c4)), c0F0F0F0F), 862 c01010101), 863 c24); 864 865 this->progress = true; 866} 867 868void 869lower_instructions_visitor::extract_to_shifts(ir_expression *ir) 870{ 871 ir_variable *bits = 872 new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary); 873 874 base_ir->insert_before(bits); 875 base_ir->insert_before(assign(bits, ir->operands[2])); 876 877 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 878 ir_constant *c1 = 879 new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements); 880 ir_constant *c32 = 881 new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements); 882 ir_constant *cFFFFFFFF = 883 new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements); 884 885 /* At least some hardware treats (x << y) as (x << (y%32)). This means 886 * we'd get a mask of 0 when bits is 32. Special case it. 887 * 888 * mask = bits == 32 ? 0xffffffff : (1u << bits) - 1u; 889 */ 890 ir_expression *mask = csel(equal(bits, c32), 891 cFFFFFFFF, 892 sub(lshift(c1, bits), c1->clone(ir, NULL))); 893 894 /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 895 * 896 * If bits is zero, the result will be zero. 897 * 898 * Since (1 << 0) - 1 == 0, we don't need to bother with the conditional 899 * select as in the signed integer case. 900 * 901 * (value >> offset) & mask; 902 */ 903 ir->operation = ir_binop_bit_and; 904 ir->init_num_operands(); 905 ir->operands[0] = rshift(ir->operands[0], ir->operands[1]); 906 ir->operands[1] = mask; 907 ir->operands[2] = NULL; 908 } else { 909 ir_constant *c0 = 910 new(ir) ir_constant(int(0), ir->operands[0]->type->vector_elements); 911 ir_constant *c32 = 912 new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements); 913 ir_variable *temp = 914 new(ir) ir_variable(ir->operands[0]->type, "temp", ir_var_temporary); 915 916 /* temp = 32 - bits; */ 917 base_ir->insert_before(temp); 918 base_ir->insert_before(assign(temp, sub(c32, bits))); 919 920 /* expr = value << (temp - offset)) >> temp; */ 921 ir_expression *expr = 922 rshift(lshift(ir->operands[0], sub(temp, ir->operands[1])), temp); 923 924 /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 925 * 926 * If bits is zero, the result will be zero. 927 * 928 * Due to the (x << (y%32)) behavior mentioned before, the (value << 929 * (32-0)) doesn't "erase" all of the data as we would like, so finish 930 * up with: 931 * 932 * (bits == 0) ? 0 : e; 933 */ 934 ir->operation = ir_triop_csel; 935 ir->init_num_operands(); 936 ir->operands[0] = equal(c0, bits); 937 ir->operands[1] = c0->clone(ir, NULL); 938 ir->operands[2] = expr; 939 } 940 941 this->progress = true; 942} 943 944void 945lower_instructions_visitor::insert_to_shifts(ir_expression *ir) 946{ 947 ir_constant *c1; 948 ir_constant *c32; 949 ir_constant *cFFFFFFFF; 950 ir_variable *offset = 951 new(ir) ir_variable(ir->operands[0]->type, "offset", ir_var_temporary); 952 ir_variable *bits = 953 new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary); 954 ir_variable *mask = 955 new(ir) ir_variable(ir->operands[0]->type, "mask", ir_var_temporary); 956 957 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) { 958 c1 = new(ir) ir_constant(int(1), ir->operands[0]->type->vector_elements); 959 c32 = new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements); 960 cFFFFFFFF = new(ir) ir_constant(int(0xFFFFFFFF), ir->operands[0]->type->vector_elements); 961 } else { 962 assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT); 963 964 c1 = new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements); 965 c32 = new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements); 966 cFFFFFFFF = new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements); 967 } 968 969 base_ir->insert_before(offset); 970 base_ir->insert_before(assign(offset, ir->operands[2])); 971 972 base_ir->insert_before(bits); 973 base_ir->insert_before(assign(bits, ir->operands[3])); 974 975 /* At least some hardware treats (x << y) as (x << (y%32)). This means 976 * we'd get a mask of 0 when bits is 32. Special case it. 977 * 978 * mask = (bits == 32 ? 0xffffffff : (1u << bits) - 1u) << offset; 979 * 980 * Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 981 * 982 * The result will be undefined if offset or bits is negative, or if the 983 * sum of offset and bits is greater than the number of bits used to 984 * store the operand. 985 * 986 * Since it's undefined, there are a couple other ways this could be 987 * implemented. The other way that was considered was to put the csel 988 * around the whole thing: 989 * 990 * final_result = bits == 32 ? insert : ... ; 991 */ 992 base_ir->insert_before(mask); 993 994 base_ir->insert_before(assign(mask, csel(equal(bits, c32), 995 cFFFFFFFF, 996 lshift(sub(lshift(c1, bits), 997 c1->clone(ir, NULL)), 998 offset)))); 999 1000 /* (base & ~mask) | ((insert << offset) & mask) */ 1001 ir->operation = ir_binop_bit_or; 1002 ir->init_num_operands(); 1003 ir->operands[0] = bit_and(ir->operands[0], bit_not(mask)); 1004 ir->operands[1] = bit_and(lshift(ir->operands[1], offset), mask); 1005 ir->operands[2] = NULL; 1006 ir->operands[3] = NULL; 1007 1008 this->progress = true; 1009} 1010 1011void 1012lower_instructions_visitor::reverse_to_shifts(ir_expression *ir) 1013{ 1014 /* For more details, see: 1015 * 1016 * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel 1017 */ 1018 ir_constant *c1 = 1019 new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements); 1020 ir_constant *c2 = 1021 new(ir) ir_constant(2u, ir->operands[0]->type->vector_elements); 1022 ir_constant *c4 = 1023 new(ir) ir_constant(4u, ir->operands[0]->type->vector_elements); 1024 ir_constant *c8 = 1025 new(ir) ir_constant(8u, ir->operands[0]->type->vector_elements); 1026 ir_constant *c16 = 1027 new(ir) ir_constant(16u, ir->operands[0]->type->vector_elements); 1028 ir_constant *c33333333 = 1029 new(ir) ir_constant(0x33333333u, ir->operands[0]->type->vector_elements); 1030 ir_constant *c55555555 = 1031 new(ir) ir_constant(0x55555555u, ir->operands[0]->type->vector_elements); 1032 ir_constant *c0F0F0F0F = 1033 new(ir) ir_constant(0x0F0F0F0Fu, ir->operands[0]->type->vector_elements); 1034 ir_constant *c00FF00FF = 1035 new(ir) ir_constant(0x00FF00FFu, ir->operands[0]->type->vector_elements); 1036 ir_variable *temp = 1037 new(ir) ir_variable(glsl_type::uvec(ir->operands[0]->type->vector_elements), 1038 "temp", ir_var_temporary); 1039 ir_instruction &i = *base_ir; 1040 1041 i.insert_before(temp); 1042 1043 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1044 i.insert_before(assign(temp, ir->operands[0])); 1045 } else { 1046 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1047 i.insert_before(assign(temp, i2u(ir->operands[0]))); 1048 } 1049 1050 /* Swap odd and even bits. 1051 * 1052 * temp = ((temp >> 1) & 0x55555555u) | ((temp & 0x55555555u) << 1); 1053 */ 1054 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c1), c55555555), 1055 lshift(bit_and(temp, c55555555->clone(ir, NULL)), 1056 c1->clone(ir, NULL))))); 1057 /* Swap consecutive pairs. 1058 * 1059 * temp = ((temp >> 2) & 0x33333333u) | ((temp & 0x33333333u) << 2); 1060 */ 1061 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c2), c33333333), 1062 lshift(bit_and(temp, c33333333->clone(ir, NULL)), 1063 c2->clone(ir, NULL))))); 1064 1065 /* Swap nibbles. 1066 * 1067 * temp = ((temp >> 4) & 0x0F0F0F0Fu) | ((temp & 0x0F0F0F0Fu) << 4); 1068 */ 1069 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c4), c0F0F0F0F), 1070 lshift(bit_and(temp, c0F0F0F0F->clone(ir, NULL)), 1071 c4->clone(ir, NULL))))); 1072 1073 /* The last step is, basically, bswap. Swap the bytes, then swap the 1074 * words. When this code is run through GCC on x86, it does generate a 1075 * bswap instruction. 1076 * 1077 * temp = ((temp >> 8) & 0x00FF00FFu) | ((temp & 0x00FF00FFu) << 8); 1078 * temp = ( temp >> 16 ) | ( temp << 16); 1079 */ 1080 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c8), c00FF00FF), 1081 lshift(bit_and(temp, c00FF00FF->clone(ir, NULL)), 1082 c8->clone(ir, NULL))))); 1083 1084 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1085 ir->operation = ir_binop_bit_or; 1086 ir->init_num_operands(); 1087 ir->operands[0] = rshift(temp, c16); 1088 ir->operands[1] = lshift(temp, c16->clone(ir, NULL)); 1089 } else { 1090 ir->operation = ir_unop_u2i; 1091 ir->init_num_operands(); 1092 ir->operands[0] = bit_or(rshift(temp, c16), 1093 lshift(temp, c16->clone(ir, NULL))); 1094 } 1095 1096 this->progress = true; 1097} 1098 1099void 1100lower_instructions_visitor::find_lsb_to_float_cast(ir_expression *ir) 1101{ 1102 /* For more details, see: 1103 * 1104 * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast 1105 */ 1106 const unsigned elements = ir->operands[0]->type->vector_elements; 1107 ir_constant *c0 = new(ir) ir_constant(unsigned(0), elements); 1108 ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements); 1109 ir_constant *c23 = new(ir) ir_constant(int(23), elements); 1110 ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements); 1111 ir_variable *temp = 1112 new(ir) ir_variable(glsl_type::ivec(elements), "temp", ir_var_temporary); 1113 ir_variable *lsb_only = 1114 new(ir) ir_variable(glsl_type::uvec(elements), "lsb_only", ir_var_temporary); 1115 ir_variable *as_float = 1116 new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary); 1117 ir_variable *lsb = 1118 new(ir) ir_variable(glsl_type::ivec(elements), "lsb", ir_var_temporary); 1119 1120 ir_instruction &i = *base_ir; 1121 1122 i.insert_before(temp); 1123 1124 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) { 1125 i.insert_before(assign(temp, ir->operands[0])); 1126 } else { 1127 assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT); 1128 i.insert_before(assign(temp, u2i(ir->operands[0]))); 1129 } 1130 1131 /* The int-to-float conversion is lossless because (value & -value) is 1132 * either a power of two or zero. We don't use the result in the zero 1133 * case. The uint() cast is necessary so that 0x80000000 does not 1134 * generate a negative value. 1135 * 1136 * uint lsb_only = uint(value & -value); 1137 * float as_float = float(lsb_only); 1138 */ 1139 i.insert_before(lsb_only); 1140 i.insert_before(assign(lsb_only, i2u(bit_and(temp, neg(temp))))); 1141 1142 i.insert_before(as_float); 1143 i.insert_before(assign(as_float, u2f(lsb_only))); 1144 1145 /* This is basically an open-coded frexp. Implementations that have a 1146 * native frexp instruction would be better served by that. This is 1147 * optimized versus a full-featured open-coded implementation in two ways: 1148 * 1149 * - We don't care about a correct result from subnormal numbers (including 1150 * 0.0), so the raw exponent can always be safely unbiased. 1151 * 1152 * - The value cannot be negative, so it does not need to be masked off to 1153 * extract the exponent. 1154 * 1155 * int lsb = (floatBitsToInt(as_float) >> 23) - 0x7f; 1156 */ 1157 i.insert_before(lsb); 1158 i.insert_before(assign(lsb, sub(rshift(bitcast_f2i(as_float), c23), c7F))); 1159 1160 /* Use lsb_only in the comparison instead of temp so that the & (far above) 1161 * can possibly generate the result without an explicit comparison. 1162 * 1163 * (lsb_only == 0) ? -1 : lsb; 1164 * 1165 * Since our input values are all integers, the unbiased exponent must not 1166 * be negative. It will only be negative (-0x7f, in fact) if lsb_only is 1167 * 0. Instead of using (lsb_only == 0), we could use (lsb >= 0). Which is 1168 * better is likely GPU dependent. Either way, the difference should be 1169 * small. 1170 */ 1171 ir->operation = ir_triop_csel; 1172 ir->init_num_operands(); 1173 ir->operands[0] = equal(lsb_only, c0); 1174 ir->operands[1] = cminus1; 1175 ir->operands[2] = new(ir) ir_dereference_variable(lsb); 1176 1177 this->progress = true; 1178} 1179 1180void 1181lower_instructions_visitor::find_msb_to_float_cast(ir_expression *ir) 1182{ 1183 /* For more details, see: 1184 * 1185 * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast 1186 */ 1187 const unsigned elements = ir->operands[0]->type->vector_elements; 1188 ir_constant *c0 = new(ir) ir_constant(int(0), elements); 1189 ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements); 1190 ir_constant *c23 = new(ir) ir_constant(int(23), elements); 1191 ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements); 1192 ir_constant *c000000FF = new(ir) ir_constant(0x000000FFu, elements); 1193 ir_constant *cFFFFFF00 = new(ir) ir_constant(0xFFFFFF00u, elements); 1194 ir_variable *temp = 1195 new(ir) ir_variable(glsl_type::uvec(elements), "temp", ir_var_temporary); 1196 ir_variable *as_float = 1197 new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary); 1198 ir_variable *msb = 1199 new(ir) ir_variable(glsl_type::ivec(elements), "msb", ir_var_temporary); 1200 1201 ir_instruction &i = *base_ir; 1202 1203 i.insert_before(temp); 1204 1205 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1206 i.insert_before(assign(temp, ir->operands[0])); 1207 } else { 1208 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1209 1210 /* findMSB(uint(abs(some_int))) almost always does the right thing. 1211 * There are two problem values: 1212 * 1213 * * 0x80000000. Since abs(0x80000000) == 0x80000000, findMSB returns 1214 * 31. However, findMSB(int(0x80000000)) == 30. 1215 * 1216 * * 0xffffffff. Since abs(0xffffffff) == 1, findMSB returns 1217 * 31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 1218 * 1219 * For a value of zero or negative one, -1 will be returned. 1220 * 1221 * For all negative number cases, including 0x80000000 and 0xffffffff, 1222 * the correct value is obtained from findMSB if instead of negating the 1223 * (already negative) value the logical-not is used. A conditonal 1224 * logical-not can be achieved in two instructions. 1225 */ 1226 ir_variable *as_int = 1227 new(ir) ir_variable(glsl_type::ivec(elements), "as_int", ir_var_temporary); 1228 ir_constant *c31 = new(ir) ir_constant(int(31), elements); 1229 1230 i.insert_before(as_int); 1231 i.insert_before(assign(as_int, ir->operands[0])); 1232 i.insert_before(assign(temp, i2u(expr(ir_binop_bit_xor, 1233 as_int, 1234 rshift(as_int, c31))))); 1235 } 1236 1237 /* The int-to-float conversion is lossless because bits are conditionally 1238 * masked off the bottom of temp to ensure the value has at most 24 bits of 1239 * data or is zero. We don't use the result in the zero case. The uint() 1240 * cast is necessary so that 0x80000000 does not generate a negative value. 1241 * 1242 * float as_float = float(temp > 255 ? temp & ~255 : temp); 1243 */ 1244 i.insert_before(as_float); 1245 i.insert_before(assign(as_float, u2f(csel(greater(temp, c000000FF), 1246 bit_and(temp, cFFFFFF00), 1247 temp)))); 1248 1249 /* This is basically an open-coded frexp. Implementations that have a 1250 * native frexp instruction would be better served by that. This is 1251 * optimized versus a full-featured open-coded implementation in two ways: 1252 * 1253 * - We don't care about a correct result from subnormal numbers (including 1254 * 0.0), so the raw exponent can always be safely unbiased. 1255 * 1256 * - The value cannot be negative, so it does not need to be masked off to 1257 * extract the exponent. 1258 * 1259 * int msb = (floatBitsToInt(as_float) >> 23) - 0x7f; 1260 */ 1261 i.insert_before(msb); 1262 i.insert_before(assign(msb, sub(rshift(bitcast_f2i(as_float), c23), c7F))); 1263 1264 /* Use msb in the comparison instead of temp so that the subtract can 1265 * possibly generate the result without an explicit comparison. 1266 * 1267 * (msb < 0) ? -1 : msb; 1268 * 1269 * Since our input values are all integers, the unbiased exponent must not 1270 * be negative. It will only be negative (-0x7f, in fact) if temp is 0. 1271 */ 1272 ir->operation = ir_triop_csel; 1273 ir->init_num_operands(); 1274 ir->operands[0] = less(msb, c0); 1275 ir->operands[1] = cminus1; 1276 ir->operands[2] = new(ir) ir_dereference_variable(msb); 1277 1278 this->progress = true; 1279} 1280 1281ir_expression * 1282lower_instructions_visitor::_carry(operand a, operand b) 1283{ 1284 if (lowering(CARRY_TO_ARITH)) 1285 return i2u(b2i(less(add(a, b), 1286 a.val->clone(ralloc_parent(a.val), NULL)))); 1287 else 1288 return carry(a, b); 1289} 1290 1291void 1292lower_instructions_visitor::imul_high_to_mul(ir_expression *ir) 1293{ 1294 /* ABCD 1295 * * EFGH 1296 * ====== 1297 * (GH * CD) + (GH * AB) << 16 + (EF * CD) << 16 + (EF * AB) << 32 1298 * 1299 * In GLSL, (a * b) becomes 1300 * 1301 * uint m1 = (a & 0x0000ffffu) * (b & 0x0000ffffu); 1302 * uint m2 = (a & 0x0000ffffu) * (b >> 16); 1303 * uint m3 = (a >> 16) * (b & 0x0000ffffu); 1304 * uint m4 = (a >> 16) * (b >> 16); 1305 * 1306 * uint c1; 1307 * uint c2; 1308 * uint lo_result; 1309 * uint hi_result; 1310 * 1311 * lo_result = uaddCarry(m1, m2 << 16, c1); 1312 * hi_result = m4 + c1; 1313 * lo_result = uaddCarry(lo_result, m3 << 16, c2); 1314 * hi_result = hi_result + c2; 1315 * hi_result = hi_result + (m2 >> 16) + (m3 >> 16); 1316 */ 1317 const unsigned elements = ir->operands[0]->type->vector_elements; 1318 ir_variable *src1 = 1319 new(ir) ir_variable(glsl_type::uvec(elements), "src1", ir_var_temporary); 1320 ir_variable *src1h = 1321 new(ir) ir_variable(glsl_type::uvec(elements), "src1h", ir_var_temporary); 1322 ir_variable *src1l = 1323 new(ir) ir_variable(glsl_type::uvec(elements), "src1l", ir_var_temporary); 1324 ir_variable *src2 = 1325 new(ir) ir_variable(glsl_type::uvec(elements), "src2", ir_var_temporary); 1326 ir_variable *src2h = 1327 new(ir) ir_variable(glsl_type::uvec(elements), "src2h", ir_var_temporary); 1328 ir_variable *src2l = 1329 new(ir) ir_variable(glsl_type::uvec(elements), "src2l", ir_var_temporary); 1330 ir_variable *t1 = 1331 new(ir) ir_variable(glsl_type::uvec(elements), "t1", ir_var_temporary); 1332 ir_variable *t2 = 1333 new(ir) ir_variable(glsl_type::uvec(elements), "t2", ir_var_temporary); 1334 ir_variable *lo = 1335 new(ir) ir_variable(glsl_type::uvec(elements), "lo", ir_var_temporary); 1336 ir_variable *hi = 1337 new(ir) ir_variable(glsl_type::uvec(elements), "hi", ir_var_temporary); 1338 ir_variable *different_signs = NULL; 1339 ir_constant *c0000FFFF = new(ir) ir_constant(0x0000FFFFu, elements); 1340 ir_constant *c16 = new(ir) ir_constant(16u, elements); 1341 1342 ir_instruction &i = *base_ir; 1343 1344 i.insert_before(src1); 1345 i.insert_before(src2); 1346 i.insert_before(src1h); 1347 i.insert_before(src2h); 1348 i.insert_before(src1l); 1349 i.insert_before(src2l); 1350 1351 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1352 i.insert_before(assign(src1, ir->operands[0])); 1353 i.insert_before(assign(src2, ir->operands[1])); 1354 } else { 1355 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1356 1357 ir_variable *itmp1 = 1358 new(ir) ir_variable(glsl_type::ivec(elements), "itmp1", ir_var_temporary); 1359 ir_variable *itmp2 = 1360 new(ir) ir_variable(glsl_type::ivec(elements), "itmp2", ir_var_temporary); 1361 ir_constant *c0 = new(ir) ir_constant(int(0), elements); 1362 1363 i.insert_before(itmp1); 1364 i.insert_before(itmp2); 1365 i.insert_before(assign(itmp1, ir->operands[0])); 1366 i.insert_before(assign(itmp2, ir->operands[1])); 1367 1368 different_signs = 1369 new(ir) ir_variable(glsl_type::bvec(elements), "different_signs", 1370 ir_var_temporary); 1371 1372 i.insert_before(different_signs); 1373 i.insert_before(assign(different_signs, expr(ir_binop_logic_xor, 1374 less(itmp1, c0), 1375 less(itmp2, c0->clone(ir, NULL))))); 1376 1377 i.insert_before(assign(src1, i2u(abs(itmp1)))); 1378 i.insert_before(assign(src2, i2u(abs(itmp2)))); 1379 } 1380 1381 i.insert_before(assign(src1l, bit_and(src1, c0000FFFF))); 1382 i.insert_before(assign(src2l, bit_and(src2, c0000FFFF->clone(ir, NULL)))); 1383 i.insert_before(assign(src1h, rshift(src1, c16))); 1384 i.insert_before(assign(src2h, rshift(src2, c16->clone(ir, NULL)))); 1385 1386 i.insert_before(lo); 1387 i.insert_before(hi); 1388 i.insert_before(t1); 1389 i.insert_before(t2); 1390 1391 i.insert_before(assign(lo, mul(src1l, src2l))); 1392 i.insert_before(assign(t1, mul(src1l, src2h))); 1393 i.insert_before(assign(t2, mul(src1h, src2l))); 1394 i.insert_before(assign(hi, mul(src1h, src2h))); 1395 1396 i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t1, c16->clone(ir, NULL)))))); 1397 i.insert_before(assign(lo, add(lo, lshift(t1, c16->clone(ir, NULL))))); 1398 1399 i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t2, c16->clone(ir, NULL)))))); 1400 i.insert_before(assign(lo, add(lo, lshift(t2, c16->clone(ir, NULL))))); 1401 1402 if (different_signs == NULL) { 1403 assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT); 1404 1405 ir->operation = ir_binop_add; 1406 ir->init_num_operands(); 1407 ir->operands[0] = add(hi, rshift(t1, c16->clone(ir, NULL))); 1408 ir->operands[1] = rshift(t2, c16->clone(ir, NULL)); 1409 } else { 1410 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1411 1412 i.insert_before(assign(hi, add(add(hi, rshift(t1, c16->clone(ir, NULL))), 1413 rshift(t2, c16->clone(ir, NULL))))); 1414 1415 /* For channels where different_signs is set we have to perform a 64-bit 1416 * negation. This is *not* the same as just negating the high 32-bits. 1417 * Consider -3 * 2. The high 32-bits is 0, but the desired result is 1418 * -1, not -0! Recall -x == ~x + 1. 1419 */ 1420 ir_variable *neg_hi = 1421 new(ir) ir_variable(glsl_type::ivec(elements), "neg_hi", ir_var_temporary); 1422 ir_constant *c1 = new(ir) ir_constant(1u, elements); 1423 1424 i.insert_before(neg_hi); 1425 i.insert_before(assign(neg_hi, add(bit_not(u2i(hi)), 1426 u2i(_carry(bit_not(lo), c1))))); 1427 1428 ir->operation = ir_triop_csel; 1429 ir->init_num_operands(); 1430 ir->operands[0] = new(ir) ir_dereference_variable(different_signs); 1431 ir->operands[1] = new(ir) ir_dereference_variable(neg_hi); 1432 ir->operands[2] = u2i(hi); 1433 } 1434} 1435 1436void 1437lower_instructions_visitor::sqrt_to_abs_sqrt(ir_expression *ir) 1438{ 1439 ir->operands[0] = new(ir) ir_expression(ir_unop_abs, ir->operands[0]); 1440 this->progress = true; 1441} 1442 1443ir_visitor_status 1444lower_instructions_visitor::visit_leave(ir_expression *ir) 1445{ 1446 switch (ir->operation) { 1447 case ir_binop_dot: 1448 if (ir->operands[0]->type->is_double()) 1449 double_dot_to_fma(ir); 1450 break; 1451 case ir_triop_lrp: 1452 if (ir->operands[0]->type->is_double()) 1453 double_lrp(ir); 1454 break; 1455 case ir_binop_sub: 1456 if (lowering(SUB_TO_ADD_NEG)) 1457 sub_to_add_neg(ir); 1458 break; 1459 1460 case ir_binop_ldexp: 1461 if (lowering(LDEXP_TO_ARITH) && ir->type->is_float()) 1462 ldexp_to_arith(ir); 1463 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->type->is_double()) 1464 dldexp_to_arith(ir); 1465 break; 1466 1467 case ir_unop_frexp_exp: 1468 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double()) 1469 dfrexp_exp_to_arith(ir); 1470 break; 1471 1472 case ir_unop_frexp_sig: 1473 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double()) 1474 dfrexp_sig_to_arith(ir); 1475 break; 1476 1477 case ir_binop_carry: 1478 if (lowering(CARRY_TO_ARITH)) 1479 carry_to_arith(ir); 1480 break; 1481 1482 case ir_binop_borrow: 1483 if (lowering(BORROW_TO_ARITH)) 1484 borrow_to_arith(ir); 1485 break; 1486 1487 case ir_unop_trunc: 1488 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1489 dtrunc_to_dfrac(ir); 1490 break; 1491 1492 case ir_unop_ceil: 1493 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1494 dceil_to_dfrac(ir); 1495 break; 1496 1497 case ir_unop_floor: 1498 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1499 dfloor_to_dfrac(ir); 1500 break; 1501 1502 case ir_unop_round_even: 1503 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1504 dround_even_to_dfrac(ir); 1505 break; 1506 1507 case ir_unop_sign: 1508 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1509 dsign_to_csel(ir); 1510 break; 1511 1512 case ir_unop_bit_count: 1513 if (lowering(BIT_COUNT_TO_MATH)) 1514 bit_count_to_math(ir); 1515 break; 1516 1517 case ir_triop_bitfield_extract: 1518 if (lowering(EXTRACT_TO_SHIFTS)) 1519 extract_to_shifts(ir); 1520 break; 1521 1522 case ir_quadop_bitfield_insert: 1523 if (lowering(INSERT_TO_SHIFTS)) 1524 insert_to_shifts(ir); 1525 break; 1526 1527 case ir_unop_bitfield_reverse: 1528 if (lowering(REVERSE_TO_SHIFTS)) 1529 reverse_to_shifts(ir); 1530 break; 1531 1532 case ir_unop_find_lsb: 1533 if (lowering(FIND_LSB_TO_FLOAT_CAST)) 1534 find_lsb_to_float_cast(ir); 1535 break; 1536 1537 case ir_unop_find_msb: 1538 if (lowering(FIND_MSB_TO_FLOAT_CAST)) 1539 find_msb_to_float_cast(ir); 1540 break; 1541 1542 case ir_binop_imul_high: 1543 if (lowering(IMUL_HIGH_TO_MUL)) 1544 imul_high_to_mul(ir); 1545 break; 1546 1547 case ir_unop_rsq: 1548 case ir_unop_sqrt: 1549 if (lowering(SQRT_TO_ABS_SQRT)) 1550 sqrt_to_abs_sqrt(ir); 1551 break; 1552 1553 default: 1554 return visit_continue; 1555 } 1556 1557 return visit_continue; 1558} 1559