1/* 2 * Copyright © 2018 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23#include <math.h> 24#include <float.h> 25#include "nir.h" 26#include "nir_range_analysis.h" 27#include "util/hash_table.h" 28 29/** 30 * Analyzes a sequence of operations to determine some aspects of the range of 31 * the result. 32 */ 33 34static bool 35is_not_negative(enum ssa_ranges r) 36{ 37 return r == gt_zero || r == ge_zero || r == eq_zero; 38} 39 40static bool 41is_not_zero(enum ssa_ranges r) 42{ 43 return r == gt_zero || r == lt_zero || r == ne_zero; 44} 45 46static void * 47pack_data(const struct ssa_result_range r) 48{ 49 return (void *)(uintptr_t)(r.range | r.is_integral << 8 | r.is_finite << 9 | 50 r.is_a_number << 10); 51} 52 53static struct ssa_result_range 54unpack_data(const void *p) 55{ 56 const uintptr_t v = (uintptr_t) p; 57 58 return (struct ssa_result_range){ 59 .range = v & 0xff, 60 .is_integral = (v & 0x00100) != 0, 61 .is_finite = (v & 0x00200) != 0, 62 .is_a_number = (v & 0x00400) != 0 63 }; 64} 65 66static void * 67pack_key(const struct nir_alu_instr *instr, nir_alu_type type) 68{ 69 uintptr_t type_encoding; 70 uintptr_t ptr = (uintptr_t) instr; 71 72 /* The low 2 bits have to be zero or this whole scheme falls apart. */ 73 assert((ptr & 0x3) == 0); 74 75 /* NIR is typeless in the sense that sequences of bits have whatever 76 * meaning is attached to them by the instruction that consumes them. 77 * However, the number of bits must match between producer and consumer. 78 * As a result, the number of bits does not need to be encoded here. 
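    *
    * For example (an added illustration; the address is hypothetical): an
    * instruction at 0x1000 queried as a float maps to the key
    * 0x1000 | 3 == 0x1003, while the same instruction queried as an int
    * maps to 0x1000 | 0 == 0x1000.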
79 */ 80 switch (nir_alu_type_get_base_type(type)) { 81 case nir_type_int: type_encoding = 0; break; 82 case nir_type_uint: type_encoding = 1; break; 83 case nir_type_bool: type_encoding = 2; break; 84 case nir_type_float: type_encoding = 3; break; 85 default: unreachable("Invalid base type."); 86 } 87 88 return (void *)(ptr | type_encoding); 89} 90 91static nir_alu_type 92nir_alu_src_type(const nir_alu_instr *instr, unsigned src) 93{ 94 return nir_alu_type_get_base_type(nir_op_infos[instr->op].input_types[src]) | 95 nir_src_bit_size(instr->src[src].src); 96} 97 98static struct ssa_result_range 99analyze_constant(const struct nir_alu_instr *instr, unsigned src, 100 nir_alu_type use_type) 101{ 102 uint8_t swizzle[NIR_MAX_VEC_COMPONENTS] = { 0, 1, 2, 3, 103 4, 5, 6, 7, 104 8, 9, 10, 11, 105 12, 13, 14, 15 }; 106 107 /* If the source is an explicitly sized source, then we need to reset 108 * both the number of components and the swizzle. 109 */ 110 const unsigned num_components = nir_ssa_alu_instr_src_components(instr, src); 111 112 for (unsigned i = 0; i < num_components; ++i) 113 swizzle[i] = instr->src[src].swizzle[i]; 114 115 const nir_load_const_instr *const load = 116 nir_instr_as_load_const(instr->src[src].src.ssa->parent_instr); 117 118 struct ssa_result_range r = { unknown, false, false, false }; 119 120 switch (nir_alu_type_get_base_type(use_type)) { 121 case nir_type_float: { 122 double min_value = DBL_MAX; 123 double max_value = -DBL_MAX; 124 bool any_zero = false; 125 bool all_zero = true; 126 127 r.is_integral = true; 128 r.is_a_number = true; 129 r.is_finite = true; 130 131 for (unsigned i = 0; i < num_components; ++i) { 132 const double v = nir_const_value_as_float(load->value[swizzle[i]], 133 load->def.bit_size); 134 135 if (floor(v) != v) 136 r.is_integral = false; 137 138 if (isnan(v)) 139 r.is_a_number = false; 140 141 if (!isfinite(v)) 142 r.is_finite = false; 143 144 any_zero = any_zero || (v == 0.0); 145 all_zero = all_zero && (v == 0.0); 146 min_value = MIN2(min_value, v); 147 max_value = MAX2(max_value, v); 148 } 149 150 assert(any_zero >= all_zero); 151 assert(isnan(max_value) || max_value >= min_value); 152 153 if (all_zero) 154 r.range = eq_zero; 155 else if (min_value > 0.0) 156 r.range = gt_zero; 157 else if (min_value == 0.0) 158 r.range = ge_zero; 159 else if (max_value < 0.0) 160 r.range = lt_zero; 161 else if (max_value == 0.0) 162 r.range = le_zero; 163 else if (!any_zero) 164 r.range = ne_zero; 165 else 166 r.range = unknown; 167 168 return r; 169 } 170 171 case nir_type_int: 172 case nir_type_bool: { 173 int64_t min_value = INT_MAX; 174 int64_t max_value = INT_MIN; 175 bool any_zero = false; 176 bool all_zero = true; 177 178 for (unsigned i = 0; i < num_components; ++i) { 179 const int64_t v = nir_const_value_as_int(load->value[swizzle[i]], 180 load->def.bit_size); 181 182 any_zero = any_zero || (v == 0); 183 all_zero = all_zero && (v == 0); 184 min_value = MIN2(min_value, v); 185 max_value = MAX2(max_value, v); 186 } 187 188 assert(any_zero >= all_zero); 189 assert(max_value >= min_value); 190 191 if (all_zero) 192 r.range = eq_zero; 193 else if (min_value > 0) 194 r.range = gt_zero; 195 else if (min_value == 0) 196 r.range = ge_zero; 197 else if (max_value < 0) 198 r.range = lt_zero; 199 else if (max_value == 0) 200 r.range = le_zero; 201 else if (!any_zero) 202 r.range = ne_zero; 203 else 204 r.range = unknown; 205 206 return r; 207 } 208 209 case nir_type_uint: { 210 bool any_zero = false; 211 bool all_zero = true; 212 213 for (unsigned i = 0; i < 
num_components; ++i) { 214 const uint64_t v = nir_const_value_as_uint(load->value[swizzle[i]], 215 load->def.bit_size); 216 217 any_zero = any_zero || (v == 0); 218 all_zero = all_zero && (v == 0); 219 } 220 221 assert(any_zero >= all_zero); 222 223 if (all_zero) 224 r.range = eq_zero; 225 else if (any_zero) 226 r.range = ge_zero; 227 else 228 r.range = gt_zero; 229 230 return r; 231 } 232 233 default: 234 unreachable("Invalid alu source type"); 235 } 236} 237 238/** 239 * Short-hand name for use in the tables in analyze_expression. If this name 240 * becomes a problem on some compiler, we can change it to _. 241 */ 242#define _______ unknown 243 244 245#if defined(__clang__) 246 /* clang wants _Pragma("unroll X") */ 247 #define pragma_unroll_5 _Pragma("unroll 5") 248 #define pragma_unroll_7 _Pragma("unroll 7") 249/* gcc wants _Pragma("GCC unroll X") */ 250#elif defined(__GNUC__) 251 #if __GNUC__ >= 8 252 #define pragma_unroll_5 _Pragma("GCC unroll 5") 253 #define pragma_unroll_7 _Pragma("GCC unroll 7") 254 #else 255 #pragma GCC optimize ("unroll-loops") 256 #define pragma_unroll_5 257 #define pragma_unroll_7 258 #endif 259#else 260 /* MSVC doesn't have C99's _Pragma() */ 261 #define pragma_unroll_5 262 #define pragma_unroll_7 263#endif 264 265 266#ifndef NDEBUG 267#define ASSERT_TABLE_IS_COMMUTATIVE(t) \ 268 do { \ 269 static bool first = true; \ 270 if (first) { \ 271 first = false; \ 272 pragma_unroll_7 \ 273 for (unsigned r = 0; r < ARRAY_SIZE(t); r++) { \ 274 pragma_unroll_7 \ 275 for (unsigned c = 0; c < ARRAY_SIZE(t[0]); c++) \ 276 assert(t[r][c] == t[c][r]); \ 277 } \ 278 } \ 279 } while (false) 280 281#define ASSERT_TABLE_IS_DIAGONAL(t) \ 282 do { \ 283 static bool first = true; \ 284 if (first) { \ 285 first = false; \ 286 pragma_unroll_7 \ 287 for (unsigned r = 0; r < ARRAY_SIZE(t); r++) \ 288 assert(t[r][r] == r); \ 289 } \ 290 } while (false) 291 292#else 293#define ASSERT_TABLE_IS_COMMUTATIVE(t) 294#define ASSERT_TABLE_IS_DIAGONAL(t) 295#endif /* !defined(NDEBUG) */ 296 297static enum ssa_ranges 298union_ranges(enum ssa_ranges a, enum ssa_ranges b) 299{ 300 static const enum ssa_ranges union_table[last_range + 1][last_range + 1] = { 301 /* left\right unknown lt_zero le_zero gt_zero ge_zero ne_zero eq_zero */ 302 /* unknown */ { _______, _______, _______, _______, _______, _______, _______ }, 303 /* lt_zero */ { _______, lt_zero, le_zero, ne_zero, _______, ne_zero, le_zero }, 304 /* le_zero */ { _______, le_zero, le_zero, _______, _______, _______, le_zero }, 305 /* gt_zero */ { _______, ne_zero, _______, gt_zero, ge_zero, ne_zero, ge_zero }, 306 /* ge_zero */ { _______, _______, _______, ge_zero, ge_zero, _______, ge_zero }, 307 /* ne_zero */ { _______, ne_zero, _______, ne_zero, _______, ne_zero, _______ }, 308 /* eq_zero */ { _______, le_zero, le_zero, ge_zero, ge_zero, _______, eq_zero }, 309 }; 310 311 ASSERT_TABLE_IS_COMMUTATIVE(union_table); 312 ASSERT_TABLE_IS_DIAGONAL(union_table); 313 314 return union_table[a][b]; 315} 316 317#ifndef NDEBUG 318/* Verify that the 'unknown' entry in each row (or column) of the table is the 319 * union of all the other values in the row (or column). 
320 */ 321#define ASSERT_UNION_OF_OTHERS_MATCHES_UNKNOWN_2_SOURCE(t) \ 322 do { \ 323 static bool first = true; \ 324 if (first) { \ 325 first = false; \ 326 pragma_unroll_7 \ 327 for (unsigned i = 0; i < last_range; i++) { \ 328 enum ssa_ranges col_range = t[i][unknown + 1]; \ 329 enum ssa_ranges row_range = t[unknown + 1][i]; \ 330 \ 331 pragma_unroll_5 \ 332 for (unsigned j = unknown + 2; j < last_range; j++) { \ 333 col_range = union_ranges(col_range, t[i][j]); \ 334 row_range = union_ranges(row_range, t[j][i]); \ 335 } \ 336 \ 337 assert(col_range == t[i][unknown]); \ 338 assert(row_range == t[unknown][i]); \ 339 } \ 340 } \ 341 } while (false) 342 343/* For most operations, the union of ranges for a strict inequality and 344 * equality should be the range of the non-strict inequality (e.g., 345 * union_ranges(range(op(lt_zero), range(op(eq_zero))) == range(op(le_zero)). 346 * 347 * Does not apply to selection-like opcodes (bcsel, fmin, fmax, etc.). 348 */ 349#define ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_1_SOURCE(t) \ 350 do { \ 351 assert(union_ranges(t[lt_zero], t[eq_zero]) == t[le_zero]); \ 352 assert(union_ranges(t[gt_zero], t[eq_zero]) == t[ge_zero]); \ 353 } while (false) 354 355#define ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_2_SOURCE(t) \ 356 do { \ 357 static bool first = true; \ 358 if (first) { \ 359 first = false; \ 360 pragma_unroll_7 \ 361 for (unsigned i = 0; i < last_range; i++) { \ 362 assert(union_ranges(t[i][lt_zero], t[i][eq_zero]) == t[i][le_zero]); \ 363 assert(union_ranges(t[i][gt_zero], t[i][eq_zero]) == t[i][ge_zero]); \ 364 assert(union_ranges(t[lt_zero][i], t[eq_zero][i]) == t[le_zero][i]); \ 365 assert(union_ranges(t[gt_zero][i], t[eq_zero][i]) == t[ge_zero][i]); \ 366 } \ 367 } \ 368 } while (false) 369 370/* Several other unordered tuples span the range of "everything." Each should 371 * have the same value as unknown: (lt_zero, ge_zero), (le_zero, gt_zero), and 372 * (eq_zero, ne_zero). union_ranges is already commutative, so only one 373 * ordering needs to be checked. 374 * 375 * Does not apply to selection-like opcodes (bcsel, fmin, fmax, etc.). 376 * 377 * In cases where this can be used, it is unnecessary to also use 378 * ASSERT_UNION_OF_OTHERS_MATCHES_UNKNOWN_*_SOURCE. For any range X, 379 * union_ranges(X, X) == X. The disjoint ranges cover all of the non-unknown 380 * possibilities, so the union of all the unions of disjoint ranges is 381 * equivalent to the union of "others." 
382 */ 383#define ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_1_SOURCE(t) \ 384 do { \ 385 assert(union_ranges(t[lt_zero], t[ge_zero]) == t[unknown]); \ 386 assert(union_ranges(t[le_zero], t[gt_zero]) == t[unknown]); \ 387 assert(union_ranges(t[eq_zero], t[ne_zero]) == t[unknown]); \ 388 } while (false) 389 390#define ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_2_SOURCE(t) \ 391 do { \ 392 static bool first = true; \ 393 if (first) { \ 394 first = false; \ 395 pragma_unroll_7 \ 396 for (unsigned i = 0; i < last_range; i++) { \ 397 assert(union_ranges(t[i][lt_zero], t[i][ge_zero]) == \ 398 t[i][unknown]); \ 399 assert(union_ranges(t[i][le_zero], t[i][gt_zero]) == \ 400 t[i][unknown]); \ 401 assert(union_ranges(t[i][eq_zero], t[i][ne_zero]) == \ 402 t[i][unknown]); \ 403 \ 404 assert(union_ranges(t[lt_zero][i], t[ge_zero][i]) == \ 405 t[unknown][i]); \ 406 assert(union_ranges(t[le_zero][i], t[gt_zero][i]) == \ 407 t[unknown][i]); \ 408 assert(union_ranges(t[eq_zero][i], t[ne_zero][i]) == \ 409 t[unknown][i]); \ 410 } \ 411 } \ 412 } while (false) 413 414#else 415#define ASSERT_UNION_OF_OTHERS_MATCHES_UNKNOWN_2_SOURCE(t) 416#define ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_1_SOURCE(t) 417#define ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_2_SOURCE(t) 418#define ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_1_SOURCE(t) 419#define ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_2_SOURCE(t) 420#endif /* !defined(NDEBUG) */ 421 422/** 423 * Analyze an expression to determine the range of its result 424 * 425 * The end result of this analysis is a token that communicates something 426 * about the range of values. There's an implicit grammar that produces 427 * tokens from sequences of literal values, other tokens, and operations. 428 * This function implements this grammar as a recursive-descent parser. Some 429 * (but not all) of the grammar is listed in-line in the function. 430 */ 431static struct ssa_result_range 432analyze_expression(const nir_alu_instr *instr, unsigned src, 433 struct hash_table *ht, nir_alu_type use_type) 434{ 435 /* Ensure that the _Pragma("GCC unroll 7") above are correct. */ 436 STATIC_ASSERT(last_range + 1 == 7); 437 438 if (!instr->src[src].src.is_ssa) 439 return (struct ssa_result_range){unknown, false, false, false}; 440 441 if (nir_src_is_const(instr->src[src].src)) 442 return analyze_constant(instr, src, use_type); 443 444 if (instr->src[src].src.ssa->parent_instr->type != nir_instr_type_alu) 445 return (struct ssa_result_range){unknown, false, false, false}; 446 447 const struct nir_alu_instr *const alu = 448 nir_instr_as_alu(instr->src[src].src.ssa->parent_instr); 449 450 /* Bail if the type of the instruction generating the value does not match 451 * the type the value will be interpreted as. int/uint/bool can be 452 * reinterpreted trivially. The most important cases are between float and 453 * non-float. 
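    *
    * For example (an added illustration): -1.0f has the bit pattern
    * 0xbf800000, so a value known to be lt_zero as a float gives no usable
    * information about the same bits interpreted as a uint, where they are a
    * large positive value.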
    */
   if (alu->op != nir_op_mov && alu->op != nir_op_bcsel) {
      const nir_alu_type use_base_type =
         nir_alu_type_get_base_type(use_type);
      const nir_alu_type src_base_type =
         nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type);

      if (use_base_type != src_base_type &&
          (use_base_type == nir_type_float ||
           src_base_type == nir_type_float)) {
         return (struct ssa_result_range){unknown, false, false, false};
      }
   }

   struct hash_entry *he = _mesa_hash_table_search(ht, pack_key(alu, use_type));
   if (he != NULL)
      return unpack_data(he->data);

   struct ssa_result_range r = {unknown, false, false, false};

   /* ge_zero: ge_zero + ge_zero
    *
    * gt_zero: gt_zero + eq_zero
    *        | gt_zero + ge_zero
    *        | eq_zero + gt_zero   # Addition is commutative
    *        | ge_zero + gt_zero   # Addition is commutative
    *        | gt_zero + gt_zero
    *        ;
    *
    * le_zero: le_zero + le_zero
    *
    * lt_zero: lt_zero + eq_zero
    *        | lt_zero + le_zero
    *        | eq_zero + lt_zero   # Addition is commutative
    *        | le_zero + lt_zero   # Addition is commutative
    *        | lt_zero + lt_zero
    *        ;
    *
    * ne_zero: eq_zero + ne_zero
    *        | ne_zero + eq_zero   # Addition is commutative
    *        ;
    *
    * eq_zero: eq_zero + eq_zero
    *        ;
    *
    * All other cases are 'unknown'. The seemingly odd entry is (ne_zero,
    * ne_zero), but that could be (-5, +5) which is not ne_zero.
    */
   static const enum ssa_ranges fadd_table[last_range + 1][last_range + 1] = {
      /* left\right   unknown  lt_zero  le_zero  gt_zero  ge_zero  ne_zero  eq_zero */
      /* unknown */ { _______, _______, _______, _______, _______, _______, _______ },
      /* lt_zero */ { _______, lt_zero, lt_zero, _______, _______, _______, lt_zero },
      /* le_zero */ { _______, lt_zero, le_zero, _______, _______, _______, le_zero },
      /* gt_zero */ { _______, _______, _______, gt_zero, gt_zero, _______, gt_zero },
      /* ge_zero */ { _______, _______, _______, gt_zero, ge_zero, _______, ge_zero },
      /* ne_zero */ { _______, _______, _______, _______, _______, _______, ne_zero },
      /* eq_zero */ { _______, lt_zero, le_zero, gt_zero, ge_zero, ne_zero, eq_zero },
   };

   ASSERT_TABLE_IS_COMMUTATIVE(fadd_table);
   ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_2_SOURCE(fadd_table);
   ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_2_SOURCE(fadd_table);

   /* Due to flush-to-zero semantics of floating-point numbers with very
    * small magnitudes, we can never really be sure a result will be
    * non-zero.
    *
    * ge_zero: ge_zero * ge_zero
    *        | ge_zero * gt_zero
    *        | ge_zero * eq_zero
    *        | le_zero * lt_zero
    *        | lt_zero * le_zero  # Multiplication is commutative
    *        | le_zero * le_zero
    *        | gt_zero * ge_zero  # Multiplication is commutative
    *        | eq_zero * ge_zero  # Multiplication is commutative
    *        | a * a              # Left source == right source
    *        | gt_zero * gt_zero
    *        | lt_zero * lt_zero
    *        ;
    *
    * le_zero: ge_zero * le_zero
    *        | ge_zero * lt_zero
    *        | lt_zero * ge_zero  # Multiplication is commutative
    *        | le_zero * ge_zero  # Multiplication is commutative
    *        | le_zero * gt_zero
    *        | lt_zero * gt_zero
    *        | gt_zero * lt_zero  # Multiplication is commutative
    *        ;
    *
    * eq_zero: eq_zero * <any>
    *          <any> * eq_zero    # Multiplication is commutative
    *
    * All other cases are 'unknown'.
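    *
    * As an illustration of how to read the table below (an added note, not
    * from the original comment): fmul_table[lt_zero][gt_zero] is le_zero
    * because a negative times a positive is mathematically negative, but a
    * tiny product may flush to zero, so only "<= 0" can be guaranteed.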
547 */ 548 static const enum ssa_ranges fmul_table[last_range + 1][last_range + 1] = { 549 /* left\right unknown lt_zero le_zero gt_zero ge_zero ne_zero eq_zero */ 550 /* unknown */ { _______, _______, _______, _______, _______, _______, eq_zero }, 551 /* lt_zero */ { _______, ge_zero, ge_zero, le_zero, le_zero, _______, eq_zero }, 552 /* le_zero */ { _______, ge_zero, ge_zero, le_zero, le_zero, _______, eq_zero }, 553 /* gt_zero */ { _______, le_zero, le_zero, ge_zero, ge_zero, _______, eq_zero }, 554 /* ge_zero */ { _______, le_zero, le_zero, ge_zero, ge_zero, _______, eq_zero }, 555 /* ne_zero */ { _______, _______, _______, _______, _______, _______, eq_zero }, 556 /* eq_zero */ { eq_zero, eq_zero, eq_zero, eq_zero, eq_zero, eq_zero, eq_zero } 557 }; 558 559 ASSERT_TABLE_IS_COMMUTATIVE(fmul_table); 560 ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_2_SOURCE(fmul_table); 561 ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_2_SOURCE(fmul_table); 562 563 static const enum ssa_ranges fneg_table[last_range + 1] = { 564 /* unknown lt_zero le_zero gt_zero ge_zero ne_zero eq_zero */ 565 _______, gt_zero, ge_zero, lt_zero, le_zero, ne_zero, eq_zero 566 }; 567 568 ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_1_SOURCE(fneg_table); 569 ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_1_SOURCE(fneg_table); 570 571 572 switch (alu->op) { 573 case nir_op_b2f32: 574 case nir_op_b2i32: 575 /* b2f32 will generate either 0.0 or 1.0. This case is trivial. 576 * 577 * b2i32 will generate either 0x00000000 or 0x00000001. When those bit 578 * patterns are interpreted as floating point, they are 0.0 and 579 * 1.401298464324817e-45. The latter is subnormal, but it is finite and 580 * a number. 581 */ 582 r = (struct ssa_result_range){ge_zero, alu->op == nir_op_b2f32, true, true}; 583 break; 584 585 case nir_op_bcsel: { 586 const struct ssa_result_range left = 587 analyze_expression(alu, 1, ht, use_type); 588 const struct ssa_result_range right = 589 analyze_expression(alu, 2, ht, use_type); 590 591 r.is_integral = left.is_integral && right.is_integral; 592 593 /* This could be better, but it would require a lot of work. For 594 * example, the result of the following is a number: 595 * 596 * bcsel(a > 0.0, a, 38.6) 597 * 598 * If the result of 'a > 0.0' is true, then the use of 'a' in the true 599 * part of the bcsel must be a number. 600 * 601 * Other cases are even more challenging. 
602 * 603 * bcsel(a > 0.5, a - 0.5, 0.0) 604 */ 605 r.is_a_number = left.is_a_number && right.is_a_number; 606 r.is_finite = left.is_finite && right.is_finite; 607 608 r.range = union_ranges(left.range, right.range); 609 break; 610 } 611 612 case nir_op_i2f32: 613 case nir_op_u2f32: 614 r = analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); 615 616 r.is_integral = true; 617 r.is_a_number = true; 618 r.is_finite = true; 619 620 if (r.range == unknown && alu->op == nir_op_u2f32) 621 r.range = ge_zero; 622 623 break; 624 625 case nir_op_fabs: 626 r = analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); 627 628 switch (r.range) { 629 case unknown: 630 case le_zero: 631 case ge_zero: 632 r.range = ge_zero; 633 break; 634 635 case lt_zero: 636 case gt_zero: 637 case ne_zero: 638 r.range = gt_zero; 639 break; 640 641 case eq_zero: 642 break; 643 } 644 645 break; 646 647 case nir_op_fadd: { 648 const struct ssa_result_range left = 649 analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); 650 const struct ssa_result_range right = 651 analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1)); 652 653 r.is_integral = left.is_integral && right.is_integral; 654 r.range = fadd_table[left.range][right.range]; 655 656 /* X + Y is NaN if either operand is NaN or if one operand is +Inf and 657 * the other is -Inf. If neither operand is NaN and at least one of the 658 * operands is finite, then the result cannot be NaN. 659 */ 660 r.is_a_number = left.is_a_number && right.is_a_number && 661 (left.is_finite || right.is_finite); 662 break; 663 } 664 665 case nir_op_fexp2: { 666 /* If the parameter might be less than zero, the mathematically result 667 * will be on (0, 1). For sufficiently large magnitude negative 668 * parameters, the result will flush to zero. 669 */ 670 static const enum ssa_ranges table[last_range + 1] = { 671 /* unknown lt_zero le_zero gt_zero ge_zero ne_zero eq_zero */ 672 ge_zero, ge_zero, ge_zero, gt_zero, gt_zero, ge_zero, gt_zero 673 }; 674 675 r = analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); 676 677 ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_1_SOURCE(table); 678 ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_1_SOURCE(table); 679 680 r.is_integral = r.is_integral && is_not_negative(r.range); 681 r.range = table[r.range]; 682 683 /* Various cases can result in NaN, so assume the worst. */ 684 r.is_finite = false; 685 r.is_a_number = false; 686 break; 687 } 688 689 case nir_op_fmax: { 690 const struct ssa_result_range left = 691 analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); 692 const struct ssa_result_range right = 693 analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1)); 694 695 r.is_integral = left.is_integral && right.is_integral; 696 697 /* This is conservative. It may be possible to determine that the 698 * result must be finite in more cases, but it would take some effort to 699 * work out all the corners. For example, fmax({lt_zero, finite}, 700 * {lt_zero}) should result in {lt_zero, finite}. 701 */ 702 r.is_finite = left.is_finite && right.is_finite; 703 704 /* If one source is NaN, fmax always picks the other source. 
*/ 705 r.is_a_number = left.is_a_number || right.is_a_number; 706 707 /* gt_zero: fmax(gt_zero, *) 708 * | fmax(*, gt_zero) # Treat fmax as commutative 709 * ; 710 * 711 * ge_zero: fmax(ge_zero, ne_zero) 712 * | fmax(ge_zero, lt_zero) 713 * | fmax(ge_zero, le_zero) 714 * | fmax(ge_zero, eq_zero) 715 * | fmax(ne_zero, ge_zero) # Treat fmax as commutative 716 * | fmax(lt_zero, ge_zero) # Treat fmax as commutative 717 * | fmax(le_zero, ge_zero) # Treat fmax as commutative 718 * | fmax(eq_zero, ge_zero) # Treat fmax as commutative 719 * | fmax(ge_zero, ge_zero) 720 * ; 721 * 722 * le_zero: fmax(le_zero, lt_zero) 723 * | fmax(lt_zero, le_zero) # Treat fmax as commutative 724 * | fmax(le_zero, le_zero) 725 * ; 726 * 727 * lt_zero: fmax(lt_zero, lt_zero) 728 * ; 729 * 730 * ne_zero: fmax(ne_zero, lt_zero) 731 * | fmax(lt_zero, ne_zero) # Treat fmax as commutative 732 * | fmax(ne_zero, ne_zero) 733 * ; 734 * 735 * eq_zero: fmax(eq_zero, le_zero) 736 * | fmax(eq_zero, lt_zero) 737 * | fmax(le_zero, eq_zero) # Treat fmax as commutative 738 * | fmax(lt_zero, eq_zero) # Treat fmax as commutative 739 * | fmax(eq_zero, eq_zero) 740 * ; 741 * 742 * All other cases are 'unknown'. 743 */ 744 static const enum ssa_ranges table[last_range + 1][last_range + 1] = { 745 /* left\right unknown lt_zero le_zero gt_zero ge_zero ne_zero eq_zero */ 746 /* unknown */ { _______, _______, _______, gt_zero, ge_zero, _______, _______ }, 747 /* lt_zero */ { _______, lt_zero, le_zero, gt_zero, ge_zero, ne_zero, eq_zero }, 748 /* le_zero */ { _______, le_zero, le_zero, gt_zero, ge_zero, _______, eq_zero }, 749 /* gt_zero */ { gt_zero, gt_zero, gt_zero, gt_zero, gt_zero, gt_zero, gt_zero }, 750 /* ge_zero */ { ge_zero, ge_zero, ge_zero, gt_zero, ge_zero, ge_zero, ge_zero }, 751 /* ne_zero */ { _______, ne_zero, _______, gt_zero, ge_zero, ne_zero, _______ }, 752 /* eq_zero */ { _______, eq_zero, eq_zero, gt_zero, ge_zero, _______, eq_zero } 753 }; 754 755 /* Treat fmax as commutative. */ 756 ASSERT_TABLE_IS_COMMUTATIVE(table); 757 ASSERT_TABLE_IS_DIAGONAL(table); 758 ASSERT_UNION_OF_OTHERS_MATCHES_UNKNOWN_2_SOURCE(table); 759 760 r.range = table[left.range][right.range]; 761 762 /* Recall that when either value is NaN, fmax will pick the other value. 763 * This means the result range of the fmax will either be the "ideal" 764 * result range (calculated above) or the range of the non-NaN value. 765 */ 766 if (!left.is_a_number) 767 r.range = union_ranges(r.range, right.range); 768 769 if (!right.is_a_number) 770 r.range = union_ranges(r.range, left.range); 771 772 break; 773 } 774 775 case nir_op_fmin: { 776 const struct ssa_result_range left = 777 analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); 778 const struct ssa_result_range right = 779 analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1)); 780 781 r.is_integral = left.is_integral && right.is_integral; 782 783 /* This is conservative. It may be possible to determine that the 784 * result must be finite in more cases, but it would take some effort to 785 * work out all the corners. For example, fmin({gt_zero, finite}, 786 * {gt_zero}) should result in {gt_zero, finite}. 787 */ 788 r.is_finite = left.is_finite && right.is_finite; 789 790 /* If one source is NaN, fmin always picks the other source. 
 */
      r.is_a_number = left.is_a_number || right.is_a_number;

      /* lt_zero: fmin(lt_zero, *)
       *        | fmin(*, lt_zero)        # Treat fmin as commutative
       *        ;
       *
       * le_zero: fmin(le_zero, ne_zero)
       *        | fmin(le_zero, gt_zero)
       *        | fmin(le_zero, ge_zero)
       *        | fmin(le_zero, eq_zero)
       *        | fmin(ne_zero, le_zero)  # Treat fmin as commutative
       *        | fmin(gt_zero, le_zero)  # Treat fmin as commutative
       *        | fmin(ge_zero, le_zero)  # Treat fmin as commutative
       *        | fmin(eq_zero, le_zero)  # Treat fmin as commutative
       *        | fmin(le_zero, le_zero)
       *        ;
       *
       * ge_zero: fmin(ge_zero, gt_zero)
       *        | fmin(gt_zero, ge_zero)  # Treat fmin as commutative
       *        | fmin(ge_zero, ge_zero)
       *        ;
       *
       * gt_zero: fmin(gt_zero, gt_zero)
       *        ;
       *
       * ne_zero: fmin(ne_zero, gt_zero)
       *        | fmin(gt_zero, ne_zero)  # Treat fmin as commutative
       *        | fmin(ne_zero, ne_zero)
       *        ;
       *
       * eq_zero: fmin(eq_zero, ge_zero)
       *        | fmin(eq_zero, gt_zero)
       *        | fmin(ge_zero, eq_zero)  # Treat fmin as commutative
       *        | fmin(gt_zero, eq_zero)  # Treat fmin as commutative
       *        | fmin(eq_zero, eq_zero)
       *        ;
       *
       * All other cases are 'unknown'.
       */
      static const enum ssa_ranges table[last_range + 1][last_range + 1] = {
         /* left\right   unknown  lt_zero  le_zero  gt_zero  ge_zero  ne_zero  eq_zero */
         /* unknown */ { _______, lt_zero, le_zero, _______, _______, _______, _______ },
         /* lt_zero */ { lt_zero, lt_zero, lt_zero, lt_zero, lt_zero, lt_zero, lt_zero },
         /* le_zero */ { le_zero, lt_zero, le_zero, le_zero, le_zero, le_zero, le_zero },
         /* gt_zero */ { _______, lt_zero, le_zero, gt_zero, ge_zero, ne_zero, eq_zero },
         /* ge_zero */ { _______, lt_zero, le_zero, ge_zero, ge_zero, _______, eq_zero },
         /* ne_zero */ { _______, lt_zero, le_zero, ne_zero, _______, ne_zero, _______ },
         /* eq_zero */ { _______, lt_zero, le_zero, eq_zero, eq_zero, _______, eq_zero }
      };

      /* Treat fmin as commutative. */
      ASSERT_TABLE_IS_COMMUTATIVE(table);
      ASSERT_TABLE_IS_DIAGONAL(table);
      ASSERT_UNION_OF_OTHERS_MATCHES_UNKNOWN_2_SOURCE(table);

      r.range = table[left.range][right.range];

      /* Recall that when either value is NaN, fmin will pick the other value.
       * This means the result range of the fmin will either be the "ideal"
       * result range (calculated above) or the range of the non-NaN value.
       */
      if (!left.is_a_number)
         r.range = union_ranges(r.range, right.range);

      if (!right.is_a_number)
         r.range = union_ranges(r.range, left.range);

      break;
   }

   case nir_op_fmul:
   case nir_op_fmulz: {
      const struct ssa_result_range left =
         analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));
      const struct ssa_result_range right =
         analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1));

      r.is_integral = left.is_integral && right.is_integral;

      /* x * x => ge_zero */
      if (left.range != eq_zero && nir_alu_srcs_equal(alu, alu, 0, 1)) {
         /* Even if x > 0, the result of x*x can be zero when x is, for
          * example, a subnormal number.
          */
         r.range = ge_zero;
      } else if (left.range != eq_zero && nir_alu_srcs_negative_equal(alu, alu, 0, 1)) {
         /* -x * x => le_zero. */
         r.range = le_zero;
      } else
         r.range = fmul_table[left.range][right.range];

      if (alu->op == nir_op_fmul) {
         /* Multiplication produces NaN for X * NaN and for 0 * ±Inf.
If both 884 * operands are numbers and either both are finite or one is finite and 885 * the other cannot be zero, then the result must be a number. 886 */ 887 r.is_a_number = (left.is_a_number && right.is_a_number) && 888 ((left.is_finite && right.is_finite) || 889 (!is_not_zero(left.range) && right.is_finite) || 890 (left.is_finite && !is_not_zero(right.range))); 891 } else { 892 /* nir_op_fmulz: unlike nir_op_fmul, 0 * ±Inf is a number. */ 893 r.is_a_number = left.is_a_number && right.is_a_number; 894 } 895 896 break; 897 } 898 899 case nir_op_frcp: 900 r = (struct ssa_result_range){ 901 analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)).range, 902 false, 903 false, /* Various cases can result in NaN, so assume the worst. */ 904 false /* " " " " " " " " " " */ 905 }; 906 break; 907 908 case nir_op_mov: 909 r = analyze_expression(alu, 0, ht, use_type); 910 break; 911 912 case nir_op_fneg: 913 r = analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); 914 915 r.range = fneg_table[r.range]; 916 break; 917 918 case nir_op_fsat: { 919 const struct ssa_result_range left = 920 analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); 921 922 /* fsat(NaN) = 0. */ 923 r.is_a_number = true; 924 r.is_finite = true; 925 926 switch (left.range) { 927 case le_zero: 928 case lt_zero: 929 case eq_zero: 930 r.range = eq_zero; 931 r.is_integral = true; 932 break; 933 934 case gt_zero: 935 /* fsat is equivalent to fmin(fmax(X, 0.0), 1.0), so if X is not a 936 * number, the result will be 0. 937 */ 938 r.range = left.is_a_number ? gt_zero : ge_zero; 939 r.is_integral = left.is_integral; 940 break; 941 942 case ge_zero: 943 case ne_zero: 944 case unknown: 945 /* Since the result must be in [0, 1], the value must be >= 0. */ 946 r.range = ge_zero; 947 r.is_integral = left.is_integral; 948 break; 949 } 950 break; 951 } 952 953 case nir_op_fsign: 954 r = (struct ssa_result_range){ 955 analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)).range, 956 true, 957 true, /* fsign is -1, 0, or 1, even for NaN, so it must be a number. */ 958 true /* fsign is -1, 0, or 1, even for NaN, so it must be finite. */ 959 }; 960 break; 961 962 case nir_op_fsqrt: 963 case nir_op_frsq: 964 r = (struct ssa_result_range){ge_zero, false, false, false}; 965 break; 966 967 case nir_op_ffloor: { 968 const struct ssa_result_range left = 969 analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); 970 971 r.is_integral = true; 972 973 /* In IEEE 754, floor(NaN) is NaN, and floor(±Inf) is ±Inf. See 974 * https://pubs.opengroup.org/onlinepubs/9699919799.2016edition/functions/floor.html 975 */ 976 r.is_a_number = left.is_a_number; 977 r.is_finite = left.is_finite; 978 979 if (left.is_integral || left.range == le_zero || left.range == lt_zero) 980 r.range = left.range; 981 else if (left.range == ge_zero || left.range == gt_zero) 982 r.range = ge_zero; 983 else if (left.range == ne_zero) 984 r.range = unknown; 985 986 break; 987 } 988 989 case nir_op_fceil: { 990 const struct ssa_result_range left = 991 analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); 992 993 r.is_integral = true; 994 995 /* In IEEE 754, ceil(NaN) is NaN, and ceil(±Inf) is ±Inf. 
See
       * https://pubs.opengroup.org/onlinepubs/9699919799.2016edition/functions/ceil.html
       */
      r.is_a_number = left.is_a_number;
      r.is_finite = left.is_finite;

      if (left.is_integral || left.range == ge_zero || left.range == gt_zero)
         r.range = left.range;
      else if (left.range == le_zero || left.range == lt_zero)
         r.range = le_zero;
      else if (left.range == ne_zero)
         r.range = unknown;

      break;
   }

   case nir_op_ftrunc: {
      const struct ssa_result_range left =
         analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));

      r.is_integral = true;

      /* In IEEE 754, trunc(NaN) is NaN, and trunc(±Inf) is ±Inf. See
       * https://pubs.opengroup.org/onlinepubs/9699919799.2016edition/functions/trunc.html
       */
      r.is_a_number = left.is_a_number;
      r.is_finite = left.is_finite;

      if (left.is_integral)
         r.range = left.range;
      else if (left.range == ge_zero || left.range == gt_zero)
         r.range = ge_zero;
      else if (left.range == le_zero || left.range == lt_zero)
         r.range = le_zero;
      else if (left.range == ne_zero)
         r.range = unknown;

      break;
   }

   case nir_op_flt:
   case nir_op_fge:
   case nir_op_feq:
   case nir_op_fneu:
   case nir_op_ilt:
   case nir_op_ige:
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_uge:
      /* Boolean results are 0 or -1. */
      r = (struct ssa_result_range){le_zero, false, true, false};
      break;

   case nir_op_fdot2:
   case nir_op_fdot3:
   case nir_op_fdot4:
   case nir_op_fdot8:
   case nir_op_fdot16:
   case nir_op_fdot2_replicated:
   case nir_op_fdot3_replicated:
   case nir_op_fdot4_replicated:
   case nir_op_fdot8_replicated:
   case nir_op_fdot16_replicated: {
      const struct ssa_result_range left =
         analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));

      /* If the two sources are the same SSA value, then the result is either
       * NaN or some number >= 0. If one source is the negation of the other,
       * the result is either NaN or some number <= 0.
       *
       * In either of these two cases, if one source is a number, then the
       * other must also be a number. Since it should not be possible to get
       * Inf-Inf in the dot-product, the result must also be a number.
       */
      if (nir_alu_srcs_equal(alu, alu, 0, 1)) {
         r = (struct ssa_result_range){ge_zero, false, left.is_a_number, false };
      } else if (nir_alu_srcs_negative_equal(alu, alu, 0, 1)) {
         r = (struct ssa_result_range){le_zero, false, left.is_a_number, false };
      } else {
         r = (struct ssa_result_range){unknown, false, false, false};
      }
      break;
   }

   case nir_op_fpow: {
      /* Due to flush-to-zero semantics of floating-point numbers with very
       * small magnitudes, we can never really be sure a result will be
       * non-zero.
       *
       * NIR uses pow() and powf() to constant evaluate nir_op_fpow. The man
       * page for that function says:
       *
       *    If y is 0, the result is 1.0 (even if x is a NaN).
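       *
       * (Added note: this is why, in the table below, the eq_zero column is
       * gt_zero for every possible range of the first source.)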
1089 * 1090 * gt_zero: pow(*, eq_zero) 1091 * | pow(eq_zero, lt_zero) # 0^-y = +inf 1092 * | pow(eq_zero, le_zero) # 0^-y = +inf or 0^0 = 1.0 1093 * ; 1094 * 1095 * eq_zero: pow(eq_zero, gt_zero) 1096 * ; 1097 * 1098 * ge_zero: pow(gt_zero, gt_zero) 1099 * | pow(gt_zero, ge_zero) 1100 * | pow(gt_zero, lt_zero) 1101 * | pow(gt_zero, le_zero) 1102 * | pow(gt_zero, ne_zero) 1103 * | pow(gt_zero, unknown) 1104 * | pow(ge_zero, gt_zero) 1105 * | pow(ge_zero, ge_zero) 1106 * | pow(ge_zero, lt_zero) 1107 * | pow(ge_zero, le_zero) 1108 * | pow(ge_zero, ne_zero) 1109 * | pow(ge_zero, unknown) 1110 * | pow(eq_zero, ge_zero) # 0^0 = 1.0 or 0^+y = 0.0 1111 * | pow(eq_zero, ne_zero) # 0^-y = +inf or 0^+y = 0.0 1112 * | pow(eq_zero, unknown) # union of all other y cases 1113 * ; 1114 * 1115 * All other cases are unknown. 1116 * 1117 * We could do better if the right operand is a constant, integral 1118 * value. 1119 */ 1120 static const enum ssa_ranges table[last_range + 1][last_range + 1] = { 1121 /* left\right unknown lt_zero le_zero gt_zero ge_zero ne_zero eq_zero */ 1122 /* unknown */ { _______, _______, _______, _______, _______, _______, gt_zero }, 1123 /* lt_zero */ { _______, _______, _______, _______, _______, _______, gt_zero }, 1124 /* le_zero */ { _______, _______, _______, _______, _______, _______, gt_zero }, 1125 /* gt_zero */ { ge_zero, ge_zero, ge_zero, ge_zero, ge_zero, ge_zero, gt_zero }, 1126 /* ge_zero */ { ge_zero, ge_zero, ge_zero, ge_zero, ge_zero, ge_zero, gt_zero }, 1127 /* ne_zero */ { _______, _______, _______, _______, _______, _______, gt_zero }, 1128 /* eq_zero */ { ge_zero, gt_zero, gt_zero, eq_zero, ge_zero, ge_zero, gt_zero }, 1129 }; 1130 1131 const struct ssa_result_range left = 1132 analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); 1133 const struct ssa_result_range right = 1134 analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1)); 1135 1136 ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_2_SOURCE(table); 1137 ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_2_SOURCE(table); 1138 1139 r.is_integral = left.is_integral && right.is_integral && 1140 is_not_negative(right.range); 1141 r.range = table[left.range][right.range]; 1142 1143 /* Various cases can result in NaN, so assume the worst. */ 1144 r.is_a_number = false; 1145 1146 break; 1147 } 1148 1149 case nir_op_ffma: { 1150 const struct ssa_result_range first = 1151 analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); 1152 const struct ssa_result_range second = 1153 analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1)); 1154 const struct ssa_result_range third = 1155 analyze_expression(alu, 2, ht, nir_alu_src_type(alu, 2)); 1156 1157 r.is_integral = first.is_integral && second.is_integral && 1158 third.is_integral; 1159 1160 /* Various cases can result in NaN, so assume the worst. */ 1161 r.is_a_number = false; 1162 1163 enum ssa_ranges fmul_range; 1164 1165 if (first.range != eq_zero && nir_alu_srcs_equal(alu, alu, 0, 1)) { 1166 /* See handling of nir_op_fmul for explanation of why ge_zero is the 1167 * range. 
1168 */ 1169 fmul_range = ge_zero; 1170 } else if (first.range != eq_zero && nir_alu_srcs_negative_equal(alu, alu, 0, 1)) { 1171 /* -x * x => le_zero */ 1172 fmul_range = le_zero; 1173 } else 1174 fmul_range = fmul_table[first.range][second.range]; 1175 1176 r.range = fadd_table[fmul_range][third.range]; 1177 break; 1178 } 1179 1180 case nir_op_flrp: { 1181 const struct ssa_result_range first = 1182 analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); 1183 const struct ssa_result_range second = 1184 analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1)); 1185 const struct ssa_result_range third = 1186 analyze_expression(alu, 2, ht, nir_alu_src_type(alu, 2)); 1187 1188 r.is_integral = first.is_integral && second.is_integral && 1189 third.is_integral; 1190 1191 /* Various cases can result in NaN, so assume the worst. */ 1192 r.is_a_number = false; 1193 1194 /* Decompose the flrp to first + third * (second + -first) */ 1195 const enum ssa_ranges inner_fadd_range = 1196 fadd_table[second.range][fneg_table[first.range]]; 1197 1198 const enum ssa_ranges fmul_range = 1199 fmul_table[third.range][inner_fadd_range]; 1200 1201 r.range = fadd_table[first.range][fmul_range]; 1202 break; 1203 } 1204 1205 default: 1206 r = (struct ssa_result_range){unknown, false, false, false}; 1207 break; 1208 } 1209 1210 if (r.range == eq_zero) 1211 r.is_integral = true; 1212 1213 /* Just like isfinite(), the is_finite flag implies the value is a number. */ 1214 assert((int) r.is_finite <= (int) r.is_a_number); 1215 1216 _mesa_hash_table_insert(ht, pack_key(alu, use_type), pack_data(r)); 1217 return r; 1218} 1219 1220#undef _______ 1221 1222struct ssa_result_range 1223nir_analyze_range(struct hash_table *range_ht, 1224 const nir_alu_instr *instr, unsigned src) 1225{ 1226 return analyze_expression(instr, src, range_ht, 1227 nir_alu_src_type(instr, src)); 1228} 1229 1230static uint32_t bitmask(uint32_t size) { 1231 return size >= 32 ? 
0xffffffffu : ((uint32_t)1 << size) - 1u; 1232} 1233 1234static uint64_t mul_clamp(uint32_t a, uint32_t b) 1235{ 1236 if (a != 0 && (a * b) / a != b) 1237 return (uint64_t)UINT32_MAX + 1; 1238 else 1239 return a * b; 1240} 1241 1242/* recursively gather at most "buf_size" phi/bcsel sources */ 1243static unsigned 1244search_phi_bcsel(nir_ssa_scalar scalar, nir_ssa_scalar *buf, unsigned buf_size, struct set *visited) 1245{ 1246 if (_mesa_set_search(visited, scalar.def)) 1247 return 0; 1248 _mesa_set_add(visited, scalar.def); 1249 1250 if (scalar.def->parent_instr->type == nir_instr_type_phi) { 1251 nir_phi_instr *phi = nir_instr_as_phi(scalar.def->parent_instr); 1252 unsigned num_sources_left = exec_list_length(&phi->srcs); 1253 if (buf_size >= num_sources_left) { 1254 unsigned total_added = 0; 1255 nir_foreach_phi_src(src, phi) { 1256 num_sources_left--; 1257 unsigned added = search_phi_bcsel(nir_get_ssa_scalar(src->src.ssa, 0), 1258 buf + total_added, buf_size - num_sources_left, visited); 1259 assert(added <= buf_size); 1260 buf_size -= added; 1261 total_added += added; 1262 } 1263 return total_added; 1264 } 1265 } 1266 1267 if (nir_ssa_scalar_is_alu(scalar)) { 1268 nir_op op = nir_ssa_scalar_alu_op(scalar); 1269 1270 if ((op == nir_op_bcsel || op == nir_op_b32csel) && buf_size >= 2) { 1271 nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0); 1272 nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1); 1273 1274 unsigned added = search_phi_bcsel(src0, buf, buf_size - 1, visited); 1275 buf_size -= added; 1276 added += search_phi_bcsel(src1, buf + added, buf_size, visited); 1277 return added; 1278 } 1279 } 1280 1281 buf[0] = scalar; 1282 return 1; 1283} 1284 1285static nir_variable * 1286lookup_input(nir_shader *shader, unsigned driver_location) 1287{ 1288 return nir_find_variable_with_driver_location(shader, nir_var_shader_in, 1289 driver_location); 1290} 1291 1292/* The config here should be generic enough to be correct on any HW. 
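 *
 * A driver that knows its real limits could pass a tighter configuration to
 * nir_unsigned_upper_bound() instead. A minimal sketch (hypothetical values,
 * not taken from any driver):
 *
 *    const nir_unsigned_upper_bound_config cfg = {
 *       .min_subgroup_size = 64,
 *       .max_subgroup_size = 64,
 *       .max_workgroup_invocations = 1024,
 *       .max_workgroup_count = {65535, 65535, 65535},
 *       .max_workgroup_size = {1024, 1024, 64},
 *       .vertex_attrib_max = { ...per-attribute bounds... },
 *    };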
*/ 1293static const nir_unsigned_upper_bound_config default_ub_config = { 1294 .min_subgroup_size = 1u, 1295 .max_subgroup_size = UINT16_MAX, 1296 .max_workgroup_invocations = UINT16_MAX, 1297 .max_workgroup_count = {UINT16_MAX, UINT16_MAX, UINT16_MAX}, 1298 .max_workgroup_size = {UINT16_MAX, UINT16_MAX, UINT16_MAX}, 1299 .vertex_attrib_max = { 1300 UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, 1301 UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, 1302 UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, 1303 UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, 1304 }, 1305}; 1306 1307uint32_t 1308nir_unsigned_upper_bound(nir_shader *shader, struct hash_table *range_ht, 1309 nir_ssa_scalar scalar, 1310 const nir_unsigned_upper_bound_config *config) 1311{ 1312 assert(scalar.def->bit_size <= 32); 1313 1314 if (!config) 1315 config = &default_ub_config; 1316 if (nir_ssa_scalar_is_const(scalar)) 1317 return nir_ssa_scalar_as_uint(scalar); 1318 1319 /* keys can't be 0, so we have to add 1 to the index */ 1320 void *key = (void*)(((uintptr_t)(scalar.def->index + 1) << 4) | scalar.comp); 1321 struct hash_entry *he = _mesa_hash_table_search(range_ht, key); 1322 if (he != NULL) 1323 return (uintptr_t)he->data; 1324 1325 uint32_t max = bitmask(scalar.def->bit_size); 1326 1327 if (scalar.def->parent_instr->type == nir_instr_type_intrinsic) { 1328 uint32_t res = max; 1329 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(scalar.def->parent_instr); 1330 switch (intrin->intrinsic) { 1331 case nir_intrinsic_load_local_invocation_index: 1332 /* The local invocation index is used under the hood by RADV for 1333 * some non-compute-like shaders (eg. LS and NGG). These technically 1334 * run in workgroups on the HW, even though this fact is not exposed 1335 * by the API. 1336 * They can safely use the same code path here as variable sized 1337 * compute-like shader stages. 1338 */ 1339 if (!gl_shader_stage_uses_workgroup(shader->info.stage) || 1340 shader->info.workgroup_size_variable) { 1341 res = config->max_workgroup_invocations - 1; 1342 } else { 1343 res = (shader->info.workgroup_size[0] * 1344 shader->info.workgroup_size[1] * 1345 shader->info.workgroup_size[2]) - 1u; 1346 } 1347 break; 1348 case nir_intrinsic_load_local_invocation_id: 1349 if (shader->info.workgroup_size_variable) 1350 res = config->max_workgroup_size[scalar.comp] - 1u; 1351 else 1352 res = shader->info.workgroup_size[scalar.comp] - 1u; 1353 break; 1354 case nir_intrinsic_load_workgroup_id: 1355 res = config->max_workgroup_count[scalar.comp] - 1u; 1356 break; 1357 case nir_intrinsic_load_num_workgroups: 1358 res = config->max_workgroup_count[scalar.comp]; 1359 break; 1360 case nir_intrinsic_load_global_invocation_id: 1361 if (shader->info.workgroup_size_variable) { 1362 res = mul_clamp(config->max_workgroup_size[scalar.comp], 1363 config->max_workgroup_count[scalar.comp]) - 1u; 1364 } else { 1365 res = (shader->info.workgroup_size[scalar.comp] * 1366 config->max_workgroup_count[scalar.comp]) - 1u; 1367 } 1368 break; 1369 case nir_intrinsic_load_invocation_id: 1370 if (shader->info.stage == MESA_SHADER_TESS_CTRL) 1371 res = shader->info.tess.tcs_vertices_out 1372 ? 
(shader->info.tess.tcs_vertices_out - 1) 1373 : 511; /* Generous maximum output patch size of 512 */ 1374 break; 1375 case nir_intrinsic_load_subgroup_invocation: 1376 case nir_intrinsic_first_invocation: 1377 res = config->max_subgroup_size - 1; 1378 break; 1379 case nir_intrinsic_mbcnt_amd: { 1380 uint32_t src0 = config->max_subgroup_size - 1; 1381 uint32_t src1 = nir_unsigned_upper_bound(shader, range_ht, nir_get_ssa_scalar(intrin->src[1].ssa, 0), config); 1382 1383 if (src0 + src1 < src0) 1384 res = max; /* overflow */ 1385 else 1386 res = src0 + src1; 1387 break; 1388 } 1389 case nir_intrinsic_load_subgroup_size: 1390 res = config->max_subgroup_size; 1391 break; 1392 case nir_intrinsic_load_subgroup_id: 1393 case nir_intrinsic_load_num_subgroups: { 1394 uint32_t workgroup_size = config->max_workgroup_invocations; 1395 if (gl_shader_stage_uses_workgroup(shader->info.stage) && 1396 !shader->info.workgroup_size_variable) { 1397 workgroup_size = shader->info.workgroup_size[0] * 1398 shader->info.workgroup_size[1] * 1399 shader->info.workgroup_size[2]; 1400 } 1401 res = DIV_ROUND_UP(workgroup_size, config->min_subgroup_size); 1402 if (intrin->intrinsic == nir_intrinsic_load_subgroup_id) 1403 res--; 1404 break; 1405 } 1406 case nir_intrinsic_load_input: { 1407 if (shader->info.stage == MESA_SHADER_VERTEX && nir_src_is_const(intrin->src[0])) { 1408 nir_variable *var = lookup_input(shader, nir_intrinsic_base(intrin)); 1409 if (var) { 1410 int loc = var->data.location - VERT_ATTRIB_GENERIC0; 1411 if (loc >= 0) 1412 res = config->vertex_attrib_max[loc]; 1413 } 1414 } 1415 break; 1416 } 1417 case nir_intrinsic_reduce: 1418 case nir_intrinsic_inclusive_scan: 1419 case nir_intrinsic_exclusive_scan: { 1420 nir_op op = nir_intrinsic_reduction_op(intrin); 1421 if (op == nir_op_umin || op == nir_op_umax || op == nir_op_imin || op == nir_op_imax) 1422 res = nir_unsigned_upper_bound(shader, range_ht, nir_get_ssa_scalar(intrin->src[0].ssa, 0), config); 1423 break; 1424 } 1425 case nir_intrinsic_read_first_invocation: 1426 case nir_intrinsic_read_invocation: 1427 case nir_intrinsic_shuffle: 1428 case nir_intrinsic_shuffle_xor: 1429 case nir_intrinsic_shuffle_up: 1430 case nir_intrinsic_shuffle_down: 1431 case nir_intrinsic_quad_broadcast: 1432 case nir_intrinsic_quad_swap_horizontal: 1433 case nir_intrinsic_quad_swap_vertical: 1434 case nir_intrinsic_quad_swap_diagonal: 1435 case nir_intrinsic_quad_swizzle_amd: 1436 case nir_intrinsic_masked_swizzle_amd: 1437 res = nir_unsigned_upper_bound(shader, range_ht, nir_get_ssa_scalar(intrin->src[0].ssa, 0), config); 1438 break; 1439 case nir_intrinsic_write_invocation_amd: { 1440 uint32_t src0 = nir_unsigned_upper_bound(shader, range_ht, nir_get_ssa_scalar(intrin->src[0].ssa, 0), config); 1441 uint32_t src1 = nir_unsigned_upper_bound(shader, range_ht, nir_get_ssa_scalar(intrin->src[1].ssa, 0), config); 1442 res = MAX2(src0, src1); 1443 break; 1444 } 1445 case nir_intrinsic_load_tess_rel_patch_id_amd: 1446 case nir_intrinsic_load_tcs_num_patches_amd: 1447 /* Very generous maximum: TCS/TES executed by largest possible workgroup */ 1448 res = config->max_workgroup_invocations / MAX2(shader->info.tess.tcs_vertices_out, 1u); 1449 break; 1450 case nir_intrinsic_load_scalar_arg_amd: 1451 case nir_intrinsic_load_vector_arg_amd: { 1452 uint32_t upper_bound = nir_intrinsic_arg_upper_bound_u32_amd(intrin); 1453 if (upper_bound) 1454 res = upper_bound; 1455 break; 1456 } 1457 default: 1458 break; 1459 } 1460 if (res != max) 1461 _mesa_hash_table_insert(range_ht, key, 
(void*)(uintptr_t)res); 1462 return res; 1463 } 1464 1465 if (scalar.def->parent_instr->type == nir_instr_type_phi) { 1466 nir_cf_node *prev = nir_cf_node_prev(&scalar.def->parent_instr->block->cf_node); 1467 1468 uint32_t res = 0; 1469 if (!prev || prev->type == nir_cf_node_block) { 1470 _mesa_hash_table_insert(range_ht, key, (void*)(uintptr_t)max); 1471 1472 struct set *visited = _mesa_pointer_set_create(NULL); 1473 nir_ssa_scalar defs[64]; 1474 unsigned def_count = search_phi_bcsel(scalar, defs, 64, visited); 1475 _mesa_set_destroy(visited, NULL); 1476 1477 for (unsigned i = 0; i < def_count; i++) 1478 res = MAX2(res, nir_unsigned_upper_bound(shader, range_ht, defs[i], config)); 1479 } else { 1480 nir_foreach_phi_src(src, nir_instr_as_phi(scalar.def->parent_instr)) { 1481 res = MAX2(res, nir_unsigned_upper_bound( 1482 shader, range_ht, nir_get_ssa_scalar(src->src.ssa, 0), config)); 1483 } 1484 } 1485 1486 _mesa_hash_table_insert(range_ht, key, (void*)(uintptr_t)res); 1487 return res; 1488 } 1489 1490 if (nir_ssa_scalar_is_alu(scalar)) { 1491 nir_op op = nir_ssa_scalar_alu_op(scalar); 1492 1493 switch (op) { 1494 case nir_op_umin: 1495 case nir_op_imin: 1496 case nir_op_imax: 1497 case nir_op_umax: 1498 case nir_op_iand: 1499 case nir_op_ior: 1500 case nir_op_ixor: 1501 case nir_op_ishl: 1502 case nir_op_imul: 1503 case nir_op_ushr: 1504 case nir_op_ishr: 1505 case nir_op_iadd: 1506 case nir_op_umod: 1507 case nir_op_udiv: 1508 case nir_op_bcsel: 1509 case nir_op_b32csel: 1510 case nir_op_ubfe: 1511 case nir_op_bfm: 1512 case nir_op_fmul: 1513 case nir_op_fmulz: 1514 case nir_op_extract_u8: 1515 case nir_op_extract_i8: 1516 case nir_op_extract_u16: 1517 case nir_op_extract_i16: 1518 break; 1519 case nir_op_u2u1: 1520 case nir_op_u2u8: 1521 case nir_op_u2u16: 1522 case nir_op_u2u32: 1523 case nir_op_f2u32: 1524 if (nir_ssa_scalar_chase_alu_src(scalar, 0).def->bit_size > 32) { 1525 /* If src is >32 bits, return max */ 1526 return max; 1527 } 1528 break; 1529 default: 1530 return max; 1531 } 1532 1533 uint32_t src0 = nir_unsigned_upper_bound(shader, range_ht, nir_ssa_scalar_chase_alu_src(scalar, 0), config); 1534 uint32_t src1 = max, src2 = max; 1535 if (nir_op_infos[op].num_inputs > 1) 1536 src1 = nir_unsigned_upper_bound(shader, range_ht, nir_ssa_scalar_chase_alu_src(scalar, 1), config); 1537 if (nir_op_infos[op].num_inputs > 2) 1538 src2 = nir_unsigned_upper_bound(shader, range_ht, nir_ssa_scalar_chase_alu_src(scalar, 2), config); 1539 1540 uint32_t res = max; 1541 switch (op) { 1542 case nir_op_umin: 1543 res = src0 < src1 ? src0 : src1; 1544 break; 1545 case nir_op_imin: 1546 case nir_op_imax: 1547 case nir_op_umax: 1548 res = src0 > src1 ? 
src0 : src1; 1549 break; 1550 case nir_op_iand: 1551 res = bitmask(util_last_bit64(src0)) & bitmask(util_last_bit64(src1)); 1552 break; 1553 case nir_op_ior: 1554 case nir_op_ixor: 1555 res = bitmask(util_last_bit64(src0)) | bitmask(util_last_bit64(src1)); 1556 break; 1557 case nir_op_ishl: 1558 if (util_last_bit64(src0) + src1 > scalar.def->bit_size) 1559 res = max; /* overflow */ 1560 else 1561 res = src0 << MIN2(src1, scalar.def->bit_size - 1u); 1562 break; 1563 case nir_op_imul: 1564 if (src0 != 0 && (src0 * src1) / src0 != src1) 1565 res = max; 1566 else 1567 res = src0 * src1; 1568 break; 1569 case nir_op_ushr: { 1570 nir_ssa_scalar src1_scalar = nir_ssa_scalar_chase_alu_src(scalar, 1); 1571 if (nir_ssa_scalar_is_const(src1_scalar)) 1572 res = src0 >> nir_ssa_scalar_as_uint(src1_scalar); 1573 else 1574 res = src0; 1575 break; 1576 } 1577 case nir_op_ishr: { 1578 nir_ssa_scalar src1_scalar = nir_ssa_scalar_chase_alu_src(scalar, 1); 1579 if (src0 <= 2147483647 && nir_ssa_scalar_is_const(src1_scalar)) 1580 res = src0 >> nir_ssa_scalar_as_uint(src1_scalar); 1581 else 1582 res = src0; 1583 break; 1584 } 1585 case nir_op_iadd: 1586 if (src0 + src1 < src0) 1587 res = max; /* overflow */ 1588 else 1589 res = src0 + src1; 1590 break; 1591 case nir_op_umod: 1592 res = src1 ? src1 - 1 : 0; 1593 break; 1594 case nir_op_udiv: { 1595 nir_ssa_scalar src1_scalar = nir_ssa_scalar_chase_alu_src(scalar, 1); 1596 if (nir_ssa_scalar_is_const(src1_scalar)) 1597 res = nir_ssa_scalar_as_uint(src1_scalar) ? src0 / nir_ssa_scalar_as_uint(src1_scalar) : 0; 1598 else 1599 res = src0; 1600 break; 1601 } 1602 case nir_op_bcsel: 1603 case nir_op_b32csel: 1604 res = src1 > src2 ? src1 : src2; 1605 break; 1606 case nir_op_ubfe: 1607 res = bitmask(MIN2(src2, scalar.def->bit_size)); 1608 break; 1609 case nir_op_bfm: { 1610 nir_ssa_scalar src1_scalar = nir_ssa_scalar_chase_alu_src(scalar, 1); 1611 if (nir_ssa_scalar_is_const(src1_scalar)) { 1612 src0 = MIN2(src0, 31); 1613 src1 = nir_ssa_scalar_as_uint(src1_scalar) & 0x1fu; 1614 res = bitmask(src0) << src1; 1615 } else { 1616 src0 = MIN2(src0, 31); 1617 src1 = MIN2(src1, 31); 1618 res = bitmask(MIN2(src0 + src1, 32)); 1619 } 1620 break; 1621 } 1622 /* limited floating-point support for f2u32(fmul(load_input(), <constant>)) */ 1623 case nir_op_f2u32: 1624 /* infinity/NaN starts at 0x7f800000u, negative numbers at 0x80000000 */ 1625 if (src0 < 0x7f800000u) { 1626 float val; 1627 memcpy(&val, &src0, 4); 1628 res = (uint32_t)val; 1629 } 1630 break; 1631 case nir_op_fmul: 1632 case nir_op_fmulz: 1633 /* infinity/NaN starts at 0x7f800000u, negative numbers at 0x80000000 */ 1634 if (src0 < 0x7f800000u && src1 < 0x7f800000u) { 1635 float src0_f, src1_f; 1636 memcpy(&src0_f, &src0, 4); 1637 memcpy(&src1_f, &src1, 4); 1638 /* not a proper rounding-up multiplication, but should be good enough */ 1639 float max_f = ceilf(src0_f) * ceilf(src1_f); 1640 memcpy(&res, &max_f, 4); 1641 } 1642 break; 1643 case nir_op_u2u1: 1644 case nir_op_u2u8: 1645 case nir_op_u2u16: 1646 case nir_op_u2u32: 1647 res = MIN2(src0, max); 1648 break; 1649 case nir_op_sad_u8x4: 1650 res = src2 + 4 * 255; 1651 break; 1652 case nir_op_extract_u8: 1653 res = MIN2(src0, UINT8_MAX); 1654 break; 1655 case nir_op_extract_i8: 1656 res = (src0 >= 0x80) ? max : MIN2(src0, INT8_MAX); 1657 break; 1658 case nir_op_extract_u16: 1659 res = MIN2(src0, UINT16_MAX); 1660 break; 1661 case nir_op_extract_i16: 1662 res = (src0 >= 0x8000) ? 
            max : MIN2(src0, INT16_MAX);
         break;
      default:
         res = max;
         break;
      }
      _mesa_hash_table_insert(range_ht, key, (void*)(uintptr_t)res);
      return res;
   }

   return max;
}

bool
nir_addition_might_overflow(nir_shader *shader, struct hash_table *range_ht,
                            nir_ssa_scalar ssa, unsigned const_val,
                            const nir_unsigned_upper_bound_config *config)
{
   if (nir_ssa_scalar_is_alu(ssa)) {
      nir_op alu_op = nir_ssa_scalar_alu_op(ssa);

      /* iadd(imul(a, #b), #c) */
      if (alu_op == nir_op_imul || alu_op == nir_op_ishl) {
         nir_ssa_scalar mul_src0 = nir_ssa_scalar_chase_alu_src(ssa, 0);
         nir_ssa_scalar mul_src1 = nir_ssa_scalar_chase_alu_src(ssa, 1);
         uint32_t stride = 1;
         if (nir_ssa_scalar_is_const(mul_src0))
            stride = nir_ssa_scalar_as_uint(mul_src0);
         else if (nir_ssa_scalar_is_const(mul_src1))
            stride = nir_ssa_scalar_as_uint(mul_src1);

         if (alu_op == nir_op_ishl)
            stride = 1u << (stride % 32u);

         if (!stride || const_val <= UINT32_MAX - (UINT32_MAX / stride * stride))
            return false;
      }

      /* iadd(iand(a, #b), #c) */
      if (alu_op == nir_op_iand) {
         nir_ssa_scalar and_src0 = nir_ssa_scalar_chase_alu_src(ssa, 0);
         nir_ssa_scalar and_src1 = nir_ssa_scalar_chase_alu_src(ssa, 1);
         uint32_t mask = 0xffffffff;
         if (nir_ssa_scalar_is_const(and_src0))
            mask = nir_ssa_scalar_as_uint(and_src0);
         else if (nir_ssa_scalar_is_const(and_src1))
            mask = nir_ssa_scalar_as_uint(and_src1);
         if (mask == 0 || const_val < (1u << (ffs(mask) - 1)))
            return false;
      }
   }

   uint32_t ub = nir_unsigned_upper_bound(shader, range_ht, ssa, config);
   return const_val + ub < const_val;
}

static uint64_t
ssa_def_bits_used(const nir_ssa_def *def, int recur)
{
   uint64_t bits_used = 0;
   uint64_t all_bits = BITFIELD64_MASK(def->bit_size);

   /* Querying the bits used from a vector is too hard of a question to
    * answer. Return the conservative answer that all bits are used. To
    * handle this, the function would need to be extended to be a query of a
    * single component of the vector. That would also be necessary to fully
    * handle the 'num_components > 1' inside the loop below.
    *
    * FINISHME: This restriction will eventually need to be loosened to be
    * useful for hardware that uses u16vec2 as the native 16-bit integer type.
    */
   if (def->num_components > 1)
      return all_bits;

   /* Limit recursion */
   if (recur-- <= 0)
      return all_bits;

   nir_foreach_use(src, def) {
      switch (src->parent_instr->type) {
      case nir_instr_type_alu: {
         nir_alu_instr *use_alu = nir_instr_as_alu(src->parent_instr);
         unsigned src_idx = container_of(src, nir_alu_src, src) - use_alu->src;

         /* If a user of the value produces a vector result, return the
          * conservative answer that all bits are used. It is possible to
          * answer this query by looping over the components used. For example,
          *
          *    vec4 32 ssa_5 = load_const(0x0000f000, 0x00000f00, 0x000000f0, 0x0000000f)
          *    ...
          *    vec4 32 ssa_8 = iand ssa_7.xxxx, ssa_5
          *
          * could conceivably return 0x0000ffff when querying the bits used of
          * ssa_7. This is unlikely to be worth the effort because the
          * question can eventually be answered after the shader has been
          * scalarized.
static uint64_t
ssa_def_bits_used(const nir_ssa_def *def, int recur)
{
   uint64_t bits_used = 0;
   uint64_t all_bits = BITFIELD64_MASK(def->bit_size);

   /* Querying the bits used from a vector is too hard of a question to
    * answer.  Return the conservative answer that all bits are used.  To
    * handle this, the function would need to be extended to be a query of a
    * single component of the vector.  That would also be necessary to fully
    * handle the 'num_components > 1' case inside the loop below.
    *
    * FINISHME: This restriction will eventually need to be lifted to be
    * useful for hardware that uses u16vec2 as the native 16-bit integer type.
    */
   if (def->num_components > 1)
      return all_bits;

   /* Limit recursion */
   if (recur-- <= 0)
      return all_bits;

   nir_foreach_use(src, def) {
      switch (src->parent_instr->type) {
      case nir_instr_type_alu: {
         nir_alu_instr *use_alu = nir_instr_as_alu(src->parent_instr);
         unsigned src_idx = container_of(src, nir_alu_src, src) - use_alu->src;

         /* If a user of the value produces a vector result, return the
          * conservative answer that all bits are used.  It is possible to
          * answer this query by looping over the components used.  For
          * example,
          *
          *    vec4 32 ssa_5 = load_const(0x0000f000, 0x00000f00, 0x000000f0, 0x0000000f)
          *    ...
          *    vec4 32 ssa_8 = iand ssa_7.xxxx, ssa_5
          *
          * could conceivably return 0x0000ffff when querying the bits used
          * of ssa_7.  This is unlikely to be worth the effort because the
          * question can eventually be answered after the shader has been
          * scalarized.
          */
         if (use_alu->dest.dest.ssa.num_components > 1)
            return all_bits;

         switch (use_alu->op) {
         case nir_op_u2u8:
         case nir_op_i2i8:
            bits_used |= 0xff;
            break;

         case nir_op_u2u16:
         case nir_op_i2i16:
            bits_used |= all_bits & 0xffff;
            break;

         case nir_op_u2u32:
         case nir_op_i2i32:
            bits_used |= all_bits & 0xffffffff;
            break;

         case nir_op_extract_u8:
         case nir_op_extract_i8:
            if (src_idx == 0 && nir_src_is_const(use_alu->src[1].src)) {
               unsigned chunk = nir_src_comp_as_uint(use_alu->src[1].src,
                                                     use_alu->src[1].swizzle[0]);
               bits_used |= 0xffull << (chunk * 8);
               break;
            } else {
               return all_bits;
            }

         case nir_op_extract_u16:
         case nir_op_extract_i16:
            if (src_idx == 0 && nir_src_is_const(use_alu->src[1].src)) {
               unsigned chunk = nir_src_comp_as_uint(use_alu->src[1].src,
                                                     use_alu->src[1].swizzle[0]);
               bits_used |= 0xffffull << (chunk * 16);
               break;
            } else {
               return all_bits;
            }

         case nir_op_ishl:
         case nir_op_ishr:
         case nir_op_ushr:
            if (src_idx == 1) {
               bits_used |= (nir_src_bit_size(use_alu->src[0].src) - 1);
               break;
            } else {
               return all_bits;
            }

         case nir_op_iand:
            assert(src_idx < 2);
            if (nir_src_is_const(use_alu->src[1 - src_idx].src)) {
               uint64_t u64 = nir_src_comp_as_uint(use_alu->src[1 - src_idx].src,
                                                   use_alu->src[1 - src_idx].swizzle[0]);
               bits_used |= u64;
               break;
            } else {
               return all_bits;
            }

         case nir_op_ior:
            assert(src_idx < 2);
            if (nir_src_is_const(use_alu->src[1 - src_idx].src)) {
               uint64_t u64 = nir_src_comp_as_uint(use_alu->src[1 - src_idx].src,
                                                   use_alu->src[1 - src_idx].swizzle[0]);
               bits_used |= all_bits & ~u64;
               break;
            } else {
               return all_bits;
            }

         default:
            /* We don't know what this op does */
            return all_bits;
         }
         break;
      }

      case nir_instr_type_intrinsic: {
         nir_intrinsic_instr *use_intrin =
            nir_instr_as_intrinsic(src->parent_instr);
         unsigned src_idx = src - use_intrin->src;

         switch (use_intrin->intrinsic) {
         case nir_intrinsic_read_invocation:
         case nir_intrinsic_shuffle:
         case nir_intrinsic_shuffle_up:
         case nir_intrinsic_shuffle_down:
         case nir_intrinsic_shuffle_xor:
         case nir_intrinsic_quad_broadcast:
         case nir_intrinsic_quad_swap_horizontal:
         case nir_intrinsic_quad_swap_vertical:
         case nir_intrinsic_quad_swap_diagonal:
            if (src_idx == 0) {
               assert(use_intrin->dest.is_ssa);
               bits_used |= ssa_def_bits_used(&use_intrin->dest.ssa, recur);
            } else {
               if (use_intrin->intrinsic == nir_intrinsic_quad_broadcast) {
                  bits_used |= 3;
               } else {
                  /* Subgroups larger than 128 are not a thing */
                  bits_used |= 127;
               }
            }
            break;

         case nir_intrinsic_reduce:
         case nir_intrinsic_inclusive_scan:
         case nir_intrinsic_exclusive_scan:
            assert(src_idx == 0);
            switch (nir_intrinsic_reduction_op(use_intrin)) {
            case nir_op_iadd:
            case nir_op_imul:
            case nir_op_ior:
            case nir_op_iand:
            case nir_op_ixor:
               bits_used |= ssa_def_bits_used(&use_intrin->dest.ssa, recur);
               break;

            default:
               return all_bits;
            }
            break;

         default:
            /* We don't know what this op does */
            return all_bits;
         }
         break;
      }

      case nir_instr_type_phi: {
         nir_phi_instr *use_phi = nir_instr_as_phi(src->parent_instr);
         bits_used |= ssa_def_bits_used(&use_phi->dest.ssa, recur);
         break;
      }

      default:
         return all_bits;
      }

      /* If we've somehow shown that all our bits are used, we're done */
      assert((bits_used & ~all_bits) == 0);
      if (bits_used == all_bits)
         return all_bits;
   }

   return bits_used;
}

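/* Illustrative example (not taken from a real shader): if a scalar 32-bit
 * def is only consumed by a u2u16 conversion and by an iand with the
 * constant 0x0000ff00, nir_ssa_def_bits_used() reports
 * 0xffff | 0xff00 = 0xffff, i.e. only the low 16 bits of the value can
 * influence any of its users.
 */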
uint64_t
nir_ssa_def_bits_used(const nir_ssa_def *def)
{
   return ssa_def_bits_used(def, 2);
}