/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"

#define COND_LOWER_OP(b, name, ...)                                   \
   (b->shader->options->lower_int64_options &                        \
    nir_lower_int64_op_to_options_mask(nir_op_##name)) ?             \
   lower_##name##64(b, __VA_ARGS__) : nir_##name(b, __VA_ARGS__)

#define COND_LOWER_CMP(b, name, ...)                                  \
   (b->shader->options->lower_int64_options &                        \
    nir_lower_int64_op_to_options_mask(nir_op_##name)) ?             \
   lower_int64_compare(b, nir_op_##name, __VA_ARGS__) :              \
   nir_##name(b, __VA_ARGS__)

#define COND_LOWER_CAST(b, name, ...)                                 \
   (b->shader->options->lower_int64_options &                        \
    nir_lower_int64_op_to_options_mask(nir_op_##name)) ?             \
   lower_##name(b, __VA_ARGS__) :                                    \
   nir_##name(b, __VA_ARGS__)

static nir_ssa_def *
lower_b2i64(nir_builder *b, nir_ssa_def *x)
{
   return nir_pack_64_2x32_split(b, nir_b2i32(b, x), nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_i2b(nir_builder *b, nir_ssa_def *x)
{
   return nir_ine(b, nir_ior(b, nir_unpack_64_2x32_split_x(b, x),
                                nir_unpack_64_2x32_split_y(b, x)),
                     nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_i2i8(nir_builder *b, nir_ssa_def *x)
{
   return nir_i2i8(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_i2i16(nir_builder *b, nir_ssa_def *x)
{
   return nir_i2i16(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_i2i32(nir_builder *b, nir_ssa_def *x)
{
   return nir_unpack_64_2x32_split_x(b, x);
}

static nir_ssa_def *
lower_i2i64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_i2i32(b, x);
   return nir_pack_64_2x32_split(b, x32, nir_ishr_imm(b, x32, 31));
}

static nir_ssa_def *
lower_u2u8(nir_builder *b, nir_ssa_def *x)
{
   return nir_u2u8(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_u2u16(nir_builder *b, nir_ssa_def *x)
{
   return nir_u2u16(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_u2u32(nir_builder *b, nir_ssa_def *x)
{
   return nir_unpack_64_2x32_split_x(b, x);
}

static nir_ssa_def *
lower_u2u64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_u2u32(b, x);
   return nir_pack_64_2x32_split(b, x32, nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_bcsel64(nir_builder *b, nir_ssa_def *cond, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_bcsel(b, cond, x_lo, y_lo),
                                    nir_bcsel(b, cond, x_hi, y_hi));
}

static nir_ssa_def *
lower_inot64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   return nir_pack_64_2x32_split(b, nir_inot(b, x_lo), nir_inot(b, x_hi));
}

static nir_ssa_def *
lower_iand64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_iand(b, x_lo, y_lo),
                                    nir_iand(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ior64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_ior(b, x_lo, y_lo),
                                    nir_ior(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ixor64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_ixor(b, x_lo, y_lo),
                                    nir_ixor(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ishl64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t lshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x), hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo << c;
    *       uint32_t hi_shifted = hi << c;
    *       uint32_t lo_shifted_hi = lo >> abs(32 - c);
    *       return pack_64(lo_shifted, hi_shifted | lo_shifted_hi);
    *    } else {
    *       uint32_t lo_shifted_hi = lo << abs(32 - c);
    *       return pack_64(0, lo_shifted_hi);
    *    }
    * }
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ishl(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ishl(b, x_hi, y);
   nir_ssa_def *lo_shifted_hi = nir_ushr(b, x_lo, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, lo_shifted,
                                nir_ior(b, hi_shifted, lo_shifted_hi));
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_imm_int(b, 0),
                                nir_ishl(b, x_lo, reverse_count));

   return nir_bcsel(b, nir_ieq_imm(b, y, 0), x,
                       nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                                    res_if_ge_32, res_if_lt_32));
}

static nir_ssa_def *
lower_ishr64(nir_builder *b, nir_ssa_def *x,
             nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t arshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x);
    *    int32_t  hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo >> c;
    *       uint32_t hi_shifted = hi >> c;
    *       uint32_t hi_shifted_lo = hi << abs(32 - c);
    *       return pack_64(hi_shifted_lo | lo_shifted, hi_shifted);
    *    } else {
    *       uint32_t hi_shifted = hi >> 31;
    *       uint32_t hi_shifted_lo = hi >> abs(32 - c);
    *       return pack_64(hi_shifted_lo, hi_shifted);
    *    }
    * }
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ishr(b, x_hi, y);
   nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo),
                                hi_shifted);
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_ishr(b, x_hi, reverse_count),
                                nir_ishr(b, x_hi, nir_imm_int(b, 31)));

   return nir_bcsel(b, nir_ieq_imm(b, y, 0), x,
                       nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                                    res_if_ge_32, res_if_lt_32));
}

static nir_ssa_def *
lower_ushr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t rshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x), hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo >> c;
    *       uint32_t hi_shifted = hi >> c;
    *       uint32_t hi_shifted_lo = hi << abs(32 - c);
    *       return pack_64(hi_shifted_lo | lo_shifted, hi_shifted);
    *    } else {
    *       uint32_t hi_shifted_lo = hi >> abs(32 - c);
    *       return pack_64(hi_shifted_lo, 0);
    *    }
    * }
    */

   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ushr(b, x_hi, y);
   nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo),
                                hi_shifted);
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_ushr(b, x_hi, reverse_count),
                                nir_imm_int(b, 0));

   return nir_bcsel(b, nir_ieq_imm(b, y, 0), x,
                       nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                                    res_if_ge_32, res_if_lt_32));
}

static nir_ssa_def *
lower_iadd64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *res_lo = nir_iadd(b, x_lo, y_lo);
   nir_ssa_def *carry = nir_b2i32(b, nir_ult(b, res_lo, x_lo));
   nir_ssa_def *res_hi = nir_iadd(b, carry, nir_iadd(b, x_hi, y_hi));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}

static nir_ssa_def *
lower_isub64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *res_lo = nir_isub(b, x_lo, y_lo);
   nir_ssa_def *borrow = nir_ineg(b, nir_b2i32(b, nir_ult(b, x_lo, y_lo)));
   nir_ssa_def *res_hi = nir_iadd(b, nir_isub(b, x_hi, y_hi), borrow);

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}

static nir_ssa_def *
lower_ineg64(nir_builder *b, nir_ssa_def *x)
{
   /* Since isub is the same number of instructions (with better dependencies)
    * as iadd, subtraction is actually more efficient for ineg than the usual
    * 2's complement "flip the bits and add one".
    */
   return lower_isub64(b, nir_imm_int64(b, 0), x);
}

static nir_ssa_def *
lower_iabs64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *x_is_neg = nir_ilt(b, x_hi, nir_imm_int(b, 0));
   return nir_bcsel(b, x_is_neg, nir_ineg(b, x), x);
}

static nir_ssa_def *
lower_int64_compare(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   switch (op) {
   case nir_op_ieq:
      return nir_iand(b, nir_ieq(b, x_hi, y_hi), nir_ieq(b, x_lo, y_lo));
   case nir_op_ine:
      return nir_ior(b, nir_ine(b, x_hi, y_hi), nir_ine(b, x_lo, y_lo));
   case nir_op_ult:
      return nir_ior(b, nir_ult(b, x_hi, y_hi),
                        nir_iand(b, nir_ieq(b, x_hi, y_hi),
                                    nir_ult(b, x_lo, y_lo)));
   case nir_op_ilt:
      return nir_ior(b, nir_ilt(b, x_hi, y_hi),
                        nir_iand(b, nir_ieq(b, x_hi, y_hi),
                                    nir_ult(b, x_lo, y_lo)));
   case nir_op_uge:
      /* Lower as !(x < y) in the hopes of better CSE */
      return nir_inot(b, lower_int64_compare(b, nir_op_ult, x, y));
   case nir_op_ige:
      /* Lower as !(x < y) in the hopes of better CSE */
      return nir_inot(b, lower_int64_compare(b, nir_op_ilt, x, y));
   default:
      unreachable("Invalid comparison");
   }
}

static nir_ssa_def *
lower_umax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), y, x);
}

static nir_ssa_def *
lower_imax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), y, x);
}

static nir_ssa_def *
lower_umin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), x, y);
}

static nir_ssa_def *
lower_imin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), x, y);
}

static nir_ssa_def *
lower_mul_2x32_64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
                  bool sign_extend)
{
   nir_ssa_def *res_hi = sign_extend ?
      nir_imul_high(b, x, y) : nir_umul_high(b, x, y);

   return nir_pack_64_2x32_split(b, nir_imul(b, x, y), res_hi);
}

static nir_ssa_def *
lower_imul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *mul_lo = nir_umul_2x32_64(b, x_lo, y_lo);
   nir_ssa_def *res_hi = nir_iadd(b, nir_unpack_64_2x32_split_y(b, mul_lo),
                                     nir_iadd(b, nir_imul(b, x_lo, y_hi),
                                                 nir_imul(b, x_hi, y_lo)));

   return nir_pack_64_2x32_split(b, nir_unpack_64_2x32_split_x(b, mul_lo),
                                    res_hi);
}

static nir_ssa_def *
lower_mul_high64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
                 bool sign_extend)
{
   nir_ssa_def *x32[4], *y32[4];
   x32[0] = nir_unpack_64_2x32_split_x(b, x);
   x32[1] = nir_unpack_64_2x32_split_y(b, x);
   if (sign_extend) {
      x32[2] = x32[3] = nir_ishr_imm(b, x32[1], 31);
   } else {
      x32[2] = x32[3] = nir_imm_int(b, 0);
   }

   y32[0] = nir_unpack_64_2x32_split_x(b, y);
   y32[1] = nir_unpack_64_2x32_split_y(b, y);
   if (sign_extend) {
      y32[2] = y32[3] = nir_ishr_imm(b, y32[1], 31);
   } else {
      y32[2] = y32[3] = nir_imm_int(b, 0);
   }

   nir_ssa_def *res[8] = { NULL, };

   /* Yes, the following generates a pile of code.  However, we throw res[0]
    * and res[1] away in the end and, if we're in the umul case, four of our
    * eight dword operands will be constant zero and opt_algebraic will clean
    * this up nicely.
    */
   for (unsigned i = 0; i < 4; i++) {
      nir_ssa_def *carry = NULL;
      for (unsigned j = 0; j < 4; j++) {
         /* The maximum values of x32[i] and y32[j] are UINT32_MAX so the
          * maximum value of tmp is UINT32_MAX * UINT32_MAX.  The maximum
          * value that will fit in tmp is
          *
          *    UINT64_MAX = UINT32_MAX << 32 + UINT32_MAX
          *               = UINT32_MAX * (UINT32_MAX + 1) + UINT32_MAX
          *               = UINT32_MAX * UINT32_MAX + 2 * UINT32_MAX
          *
          * so we're guaranteed that we can add in two more 32-bit values
          * without overflowing tmp.
          */
         nir_ssa_def *tmp = nir_umul_2x32_64(b, x32[i], y32[j]);

         if (res[i + j])
            tmp = nir_iadd(b, tmp, nir_u2u64(b, res[i + j]));
         if (carry)
            tmp = nir_iadd(b, tmp, carry);
         res[i + j] = nir_u2u32(b, tmp);
         carry = nir_ushr_imm(b, tmp, 32);
      }
      res[i + 4] = nir_u2u32(b, carry);
   }

   return nir_pack_64_2x32_split(b, res[2], res[3]);
}

static nir_ssa_def *
lower_isign64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *is_non_zero = nir_i2b(b, nir_ior(b, x_lo, x_hi));
   nir_ssa_def *res_hi = nir_ishr_imm(b, x_hi, 31);
   nir_ssa_def *res_lo = nir_ior(b, res_hi, nir_b2i32(b, is_non_zero));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}

static void
lower_udiv64_mod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d,
                   nir_ssa_def **q, nir_ssa_def **r)
{
   /* TODO: We should specially handle the case where the denominator is a
    * constant.  In that case, we should be able to reduce it to a multiply by
    * a constant, some shifts, and an add.
    */
   nir_ssa_def *n_lo = nir_unpack_64_2x32_split_x(b, n);
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_lo = nir_unpack_64_2x32_split_x(b, d);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);

   nir_ssa_def *q_lo = nir_imm_zero(b, n->num_components, 32);
   nir_ssa_def *q_hi = nir_imm_zero(b, n->num_components, 32);

   nir_ssa_def *n_hi_before_if = n_hi;
   nir_ssa_def *q_hi_before_if = q_hi;

   /* If the upper 32 bits of denom are non-zero, it is impossible for shifts
    * greater than 32 bits to occur.  If the upper 32 bits of the numerator
    * are zero, it is impossible for (denom << [63, 32]) <= numer unless
    * denom == 0.
    */
   nir_ssa_def *need_high_div =
      nir_iand(b, nir_ieq_imm(b, d_hi, 0), nir_uge(b, n_hi, d_lo));
   nir_push_if(b, nir_bany(b, need_high_div));
   {
      /* If we only have one component, then the bany above goes away and
       * this is always true within the if statement.
       */
      if (n->num_components == 1)
         need_high_div = nir_imm_true(b);

      nir_ssa_def *log2_d_lo = nir_ufind_msb(b, d_lo);

      for (int i = 31; i >= 0; i--) {
         /* if ((d.x << i) <= n.y) {
          *    n.y -= d.x << i;
          *    quot.y |= 1U << i;
          * }
          */
         nir_ssa_def *d_shift = nir_ishl(b, d_lo, nir_imm_int(b, i));
         nir_ssa_def *new_n_hi = nir_isub(b, n_hi, d_shift);
         nir_ssa_def *new_q_hi = nir_ior(b, q_hi, nir_imm_int(b, 1u << i));
         nir_ssa_def *cond = nir_iand(b, need_high_div,
                                         nir_uge(b, n_hi, d_shift));
         if (i != 0) {
            /* log2_d_lo is always <= 31, so we don't need to bother with it
             * in the last iteration.
             */
            cond = nir_iand(b, cond,
                               nir_ige(b, nir_imm_int(b, 31 - i), log2_d_lo));
         }
         n_hi = nir_bcsel(b, cond, new_n_hi, n_hi);
         q_hi = nir_bcsel(b, cond, new_q_hi, q_hi);
      }
   }
   nir_pop_if(b, NULL);
   n_hi = nir_if_phi(b, n_hi, n_hi_before_if);
   q_hi = nir_if_phi(b, q_hi, q_hi_before_if);

   nir_ssa_def *log2_denom = nir_ufind_msb(b, d_hi);

   n = nir_pack_64_2x32_split(b, n_lo, n_hi);
   d = nir_pack_64_2x32_split(b, d_lo, d_hi);
   for (int i = 31; i >= 0; i--) {
      /* if ((d64 << i) <= n64) {
       *    n64 -= d64 << i;
       *    quot.x |= 1U << i;
       * }
       */
      nir_ssa_def *d_shift = nir_ishl(b, d, nir_imm_int(b, i));
      nir_ssa_def *new_n = nir_isub(b, n, d_shift);
      nir_ssa_def *new_q_lo = nir_ior(b, q_lo, nir_imm_int(b, 1u << i));
      nir_ssa_def *cond = nir_uge(b, n, d_shift);
      if (i != 0) {
         /* log2_denom is always <= 31, so we don't need to bother with it
          * in the last iteration.
          */
         cond = nir_iand(b, cond,
                            nir_ige(b, nir_imm_int(b, 31 - i), log2_denom));
      }
      n = nir_bcsel(b, cond, new_n, n);
      q_lo = nir_bcsel(b, cond, new_q_lo, q_lo);
   }

   *q = nir_pack_64_2x32_split(b, q_lo, q_hi);
   *r = n;
}

static nir_ssa_def *
lower_udiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, n, d, &q, &r);
   return q;
}

static nir_ssa_def *
lower_idiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);

   nir_ssa_def *negate = nir_ine(b, nir_ilt(b, n_hi, nir_imm_int(b, 0)),
                                    nir_ilt(b, d_hi, nir_imm_int(b, 0)));
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, negate, nir_ineg(b, q), q);
}

static nir_ssa_def *
lower_umod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, n, d, &q, &r);
   return r;
}

static nir_ssa_def *
lower_imod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));
   nir_ssa_def *d_is_neg = nir_ilt(b, d_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);

   nir_ssa_def *rem = nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);

   return nir_bcsel(b, nir_ieq_imm(b, r, 0), nir_imm_int64(b, 0),
                       nir_bcsel(b, nir_ieq(b, n_is_neg, d_is_neg), rem,
                                    nir_iadd(b, rem, d)));
}

static nir_ssa_def *
lower_irem64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);
}

static nir_ssa_def *
lower_extract(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *c)
{
   assert(op == nir_op_extract_u8 || op == nir_op_extract_i8 ||
          op == nir_op_extract_u16 || op == nir_op_extract_i16);

   const int chunk = nir_src_as_uint(nir_src_for_ssa(c));
   const int chunk_bits =
      (op == nir_op_extract_u8 || op == nir_op_extract_i8) ? 8 : 16;
   const int num_chunks_in_32 = 32 / chunk_bits;

   nir_ssa_def *extract32;
   if (chunk < num_chunks_in_32) {
      extract32 = nir_build_alu(b, op, nir_unpack_64_2x32_split_x(b, x),
                                       nir_imm_int(b, chunk),
                                       NULL, NULL);
   } else {
      extract32 = nir_build_alu(b, op, nir_unpack_64_2x32_split_y(b, x),
                                       nir_imm_int(b, chunk - num_chunks_in_32),
                                       NULL, NULL);
   }

   if (op == nir_op_extract_i8 || op == nir_op_extract_i16)
      return lower_i2i64(b, extract32);
   else
      return lower_u2u64(b, extract32);
}

static nir_ssa_def *
lower_ufind_msb64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *lo_count = nir_ufind_msb(b, x_lo);
   nir_ssa_def *hi_count = nir_ufind_msb(b, x_hi);
   nir_ssa_def *valid_hi_bits = nir_ine(b, x_hi, nir_imm_int(b, 0));
   nir_ssa_def *hi_res = nir_iadd(b, nir_imm_intN_t(b, 32, 32), hi_count);
   return nir_bcsel(b, valid_hi_bits, hi_res, lo_count);
}

static nir_ssa_def *
lower_2f(nir_builder *b, nir_ssa_def *x, unsigned dest_bit_size,
         bool src_is_signed)
{
   nir_ssa_def *x_sign = NULL;

   if (src_is_signed) {
      x_sign = nir_bcsel(b, COND_LOWER_CMP(b, ilt, x, nir_imm_int64(b, 0)),
                            nir_imm_floatN_t(b, -1, dest_bit_size),
                            nir_imm_floatN_t(b, 1, dest_bit_size));
      x = COND_LOWER_OP(b, iabs, x);
   }

   nir_ssa_def *exp = COND_LOWER_OP(b, ufind_msb, x);
   unsigned significand_bits;

   switch (dest_bit_size) {
   case 32:
      significand_bits = 23;
      break;
   case 16:
      significand_bits = 10;
      break;
   default:
      unreachable("Invalid dest_bit_size");
   }

   nir_ssa_def *discard =
      nir_imax(b, nir_isub(b, exp, nir_imm_int(b, significand_bits)),
                  nir_imm_int(b, 0));
   nir_ssa_def *significand =
      COND_LOWER_CAST(b, u2u32, COND_LOWER_OP(b, ushr, x, discard));

   /* Round-to-nearest-even implementation:
    * - if the non-representable part of the significand is higher than half
    *   the minimum representable significand, we round-up
    * - if the non-representable part of the significand is equal to half the
    *   minimum representable significand and the representable part of the
    *   significand is odd, we round-up
    * - in any other case, we round-down
    */
   nir_ssa_def *lsb_mask = COND_LOWER_OP(b, ishl, nir_imm_int64(b, 1), discard);
   nir_ssa_def *rem_mask = COND_LOWER_OP(b, isub, lsb_mask, nir_imm_int64(b, 1));
   nir_ssa_def *half = COND_LOWER_OP(b, ishr, lsb_mask, nir_imm_int(b, 1));
   nir_ssa_def *rem = COND_LOWER_OP(b, iand, x, rem_mask);
   nir_ssa_def *halfway = nir_iand(b, COND_LOWER_CMP(b, ieq, rem, half),
                                      nir_ine(b, discard, nir_imm_int(b, 0)));
   nir_ssa_def *is_odd = nir_i2b(b, nir_iand(b, significand, nir_imm_int(b, 1)));
   nir_ssa_def *round_up = nir_ior(b, COND_LOWER_CMP(b, ilt, half, rem),
                                      nir_iand(b, halfway, is_odd));
   significand = nir_iadd(b, significand, nir_b2i32(b, round_up));

   nir_ssa_def *res;

   if (dest_bit_size == 32)
      res = nir_fmul(b, nir_u2f32(b, significand),
                        nir_fexp2(b, nir_u2f32(b, discard)));
   else
      res = nir_fmul(b, nir_u2f16(b, significand),
                        nir_fexp2(b, nir_u2f16(b, discard)));

   if (src_is_signed)
      res = nir_fmul(b, res, x_sign);

   return res;
}

static nir_ssa_def *
lower_f2(nir_builder *b, nir_ssa_def *x, bool dst_is_signed)
{
   assert(x->bit_size == 16 || x->bit_size == 32);
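
   /* The conversion below works on the truncated, absolute value and splits
    * it into 32-bit halves: the high word is f2u32(x / 2^32) and the low
    * word is f2u32(frem(x, 2^32)).  A 16-bit source always fits in the low
    * word, so it only needs a single conversion.  For signed destinations
    * the sign is reapplied at the end.
    */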
   nir_ssa_def *x_sign = NULL;

   if (dst_is_signed)
      x_sign = nir_fsign(b, x);

   x = nir_ftrunc(b, x);

   if (dst_is_signed)
      x = nir_fabs(b, x);

   nir_ssa_def *res;
   if (x->bit_size < 32) {
      res = nir_pack_64_2x32_split(b, nir_f2u32(b, x), nir_imm_int(b, 0));
   } else {
      nir_ssa_def *div = nir_imm_floatN_t(b, 1ULL << 32, x->bit_size);
      nir_ssa_def *res_hi = nir_f2u32(b, nir_fdiv(b, x, div));
      nir_ssa_def *res_lo = nir_f2u32(b, nir_frem(b, x, div));
      res = nir_pack_64_2x32_split(b, res_lo, res_hi);
   }

   if (dst_is_signed)
      res = nir_bcsel(b, nir_flt(b, x_sign, nir_imm_floatN_t(b, 0, x->bit_size)),
                         nir_ineg(b, res), res);

   return res;
}

static nir_ssa_def *
lower_bit_count64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *lo_count = nir_bit_count(b, x_lo);
   nir_ssa_def *hi_count = nir_bit_count(b, x_hi);
   return nir_iadd(b, lo_count, hi_count);
}

nir_lower_int64_options
nir_lower_int64_op_to_options_mask(nir_op opcode)
{
   switch (opcode) {
   case nir_op_imul:
   case nir_op_amul:
      return nir_lower_imul64;
   case nir_op_imul_2x32_64:
   case nir_op_umul_2x32_64:
      return nir_lower_imul_2x32_64;
   case nir_op_imul_high:
   case nir_op_umul_high:
      return nir_lower_imul_high64;
   case nir_op_isign:
      return nir_lower_isign64;
   case nir_op_udiv:
   case nir_op_idiv:
   case nir_op_umod:
   case nir_op_imod:
   case nir_op_irem:
      return nir_lower_divmod64;
   case nir_op_b2i64:
   case nir_op_i2b1:
   case nir_op_i2i8:
   case nir_op_i2i16:
   case nir_op_i2i32:
   case nir_op_i2i64:
   case nir_op_u2u8:
   case nir_op_u2u16:
   case nir_op_u2u32:
   case nir_op_u2u64:
   case nir_op_i2f32:
   case nir_op_u2f32:
   case nir_op_i2f16:
   case nir_op_u2f16:
   case nir_op_f2i64:
   case nir_op_f2u64:
   case nir_op_bcsel:
      return nir_lower_mov64;
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      return nir_lower_icmp64;
   case nir_op_iadd:
   case nir_op_isub:
      return nir_lower_iadd64;
   case nir_op_imin:
   case nir_op_imax:
   case nir_op_umin:
   case nir_op_umax:
      return nir_lower_minmax64;
   case nir_op_iabs:
      return nir_lower_iabs64;
   case nir_op_ineg:
      return nir_lower_ineg64;
   case nir_op_iand:
   case nir_op_ior:
   case nir_op_ixor:
   case nir_op_inot:
      return nir_lower_logic64;
   case nir_op_ishl:
   case nir_op_ishr:
   case nir_op_ushr:
      return nir_lower_shift64;
   case nir_op_extract_u8:
   case nir_op_extract_i8:
   case nir_op_extract_u16:
   case nir_op_extract_i16:
      return nir_lower_extract64;
   case nir_op_ufind_msb:
      return nir_lower_ufind_msb64;
   case nir_op_bit_count:
      return nir_lower_bit_count64;
   default:
      return 0;
   }
}

static nir_ssa_def *
lower_int64_alu_instr(nir_builder *b, nir_alu_instr *alu)
{
   nir_ssa_def *src[4];
   for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++)
      src[i] = nir_ssa_for_alu_src(b, alu, i);

   switch (alu->op) {
   case nir_op_imul:
   case nir_op_amul:
      return lower_imul64(b, src[0], src[1]);
   case nir_op_imul_2x32_64:
      return lower_mul_2x32_64(b, src[0], src[1], true);
   case nir_op_umul_2x32_64:
      return lower_mul_2x32_64(b, src[0], src[1], false);
   case nir_op_imul_high:
      return lower_mul_high64(b, src[0], src[1], true);
   case nir_op_umul_high:
      return lower_mul_high64(b, src[0], src[1], false);
   case nir_op_isign:
      return lower_isign64(b, src[0]);
   case nir_op_udiv:
      return lower_udiv64(b, src[0], src[1]);
   case nir_op_idiv:
      return lower_idiv64(b, src[0], src[1]);
   case nir_op_umod:
      return lower_umod64(b, src[0], src[1]);
   case nir_op_imod:
      return lower_imod64(b, src[0], src[1]);
   case nir_op_irem:
      return lower_irem64(b, src[0], src[1]);
   case nir_op_b2i64:
      return lower_b2i64(b, src[0]);
   case nir_op_i2b1:
      return lower_i2b(b, src[0]);
   case nir_op_i2i8:
      return lower_i2i8(b, src[0]);
   case nir_op_i2i16:
      return lower_i2i16(b, src[0]);
   case nir_op_i2i32:
      return lower_i2i32(b, src[0]);
   case nir_op_i2i64:
      return lower_i2i64(b, src[0]);
   case nir_op_u2u8:
      return lower_u2u8(b, src[0]);
   case nir_op_u2u16:
      return lower_u2u16(b, src[0]);
   case nir_op_u2u32:
      return lower_u2u32(b, src[0]);
   case nir_op_u2u64:
      return lower_u2u64(b, src[0]);
   case nir_op_bcsel:
      return lower_bcsel64(b, src[0], src[1], src[2]);
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      return lower_int64_compare(b, alu->op, src[0], src[1]);
   case nir_op_iadd:
      return lower_iadd64(b, src[0], src[1]);
   case nir_op_isub:
      return lower_isub64(b, src[0], src[1]);
   case nir_op_imin:
      return lower_imin64(b, src[0], src[1]);
   case nir_op_imax:
      return lower_imax64(b, src[0], src[1]);
   case nir_op_umin:
      return lower_umin64(b, src[0], src[1]);
   case nir_op_umax:
      return lower_umax64(b, src[0], src[1]);
   case nir_op_iabs:
      return lower_iabs64(b, src[0]);
   case nir_op_ineg:
      return lower_ineg64(b, src[0]);
   case nir_op_iand:
      return lower_iand64(b, src[0], src[1]);
   case nir_op_ior:
      return lower_ior64(b, src[0], src[1]);
   case nir_op_ixor:
      return lower_ixor64(b, src[0], src[1]);
   case nir_op_inot:
      return lower_inot64(b, src[0]);
   case nir_op_ishl:
      return lower_ishl64(b, src[0], src[1]);
   case nir_op_ishr:
      return lower_ishr64(b, src[0], src[1]);
   case nir_op_ushr:
      return lower_ushr64(b, src[0], src[1]);
   case nir_op_extract_u8:
   case nir_op_extract_i8:
   case nir_op_extract_u16:
   case nir_op_extract_i16:
      return lower_extract(b, alu->op, src[0], src[1]);
   case nir_op_ufind_msb:
      return lower_ufind_msb64(b, src[0]);
   case nir_op_bit_count:
      return lower_bit_count64(b, src[0]);
   case nir_op_i2f64:
   case nir_op_i2f32:
   case nir_op_i2f16:
      return lower_2f(b, src[0], nir_dest_bit_size(alu->dest.dest), true);
   case nir_op_u2f64:
   case nir_op_u2f32:
   case nir_op_u2f16:
      return lower_2f(b, src[0], nir_dest_bit_size(alu->dest.dest), false);
   case nir_op_f2i64:
   case nir_op_f2u64:
      /* We don't support f64toi64 (yet?). */
      if (src[0]->bit_size > 32)
         return NULL;

      return lower_f2(b, src[0], alu->op == nir_op_f2i64);
   default:
      unreachable("Invalid ALU opcode to lower");
   }
}

static bool
should_lower_int64_alu_instr(const nir_alu_instr *alu,
                             const nir_shader_compiler_options *options)
{
   switch (alu->op) {
   case nir_op_i2b1:
   case nir_op_i2i8:
   case nir_op_i2i16:
   case nir_op_i2i32:
   case nir_op_u2u8:
   case nir_op_u2u16:
   case nir_op_u2u32:
      assert(alu->src[0].src.is_ssa);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_bcsel:
      assert(alu->src[1].src.is_ssa);
      assert(alu->src[2].src.is_ssa);
      assert(alu->src[1].src.ssa->bit_size ==
             alu->src[2].src.ssa->bit_size);
      if (alu->src[1].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      assert(alu->src[0].src.is_ssa);
      assert(alu->src[1].src.is_ssa);
      assert(alu->src[0].src.ssa->bit_size ==
             alu->src[1].src.ssa->bit_size);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_ufind_msb:
   case nir_op_bit_count:
      assert(alu->src[0].src.is_ssa);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_amul:
      assert(alu->dest.dest.is_ssa);
      if (options->has_imul24)
         return false;
      if (alu->dest.dest.ssa.bit_size != 64)
         return false;
      break;
   case nir_op_i2f64:
   case nir_op_u2f64:
   case nir_op_i2f32:
   case nir_op_u2f32:
   case nir_op_i2f16:
   case nir_op_u2f16:
      assert(alu->src[0].src.is_ssa);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_f2u64:
   case nir_op_f2i64:
      FALLTHROUGH;
   default:
      assert(alu->dest.dest.is_ssa);
      if (alu->dest.dest.ssa.bit_size != 64)
         return false;
      break;
   }

   unsigned mask = nir_lower_int64_op_to_options_mask(alu->op);
   return (options->lower_int64_options & mask) != 0;
}

static nir_ssa_def *
split_64bit_subgroup_op(nir_builder *b, const nir_intrinsic_instr *intrin)
{
   const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic];

   /* This works on subgroup ops with a single 64-bit source which can be
    * trivially lowered by doing the exact same op on both halves.
    */
   assert(intrin->src[0].is_ssa && intrin->src[0].ssa->bit_size == 64);
   nir_ssa_def *split_src0[2] = {
      nir_unpack_64_2x32_split_x(b, intrin->src[0].ssa),
      nir_unpack_64_2x32_split_y(b, intrin->src[0].ssa),
   };

   assert(info->has_dest && intrin->dest.is_ssa &&
          intrin->dest.ssa.bit_size == 64);

   nir_ssa_def *res[2];
   for (unsigned i = 0; i < 2; i++) {
      nir_intrinsic_instr *split =
         nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
      split->num_components = intrin->num_components;
      split->src[0] = nir_src_for_ssa(split_src0[i]);

      /* Other sources must be less than 64 bits and get copied directly */
      for (unsigned j = 1; j < info->num_srcs; j++) {
         assert(intrin->src[j].is_ssa && intrin->src[j].ssa->bit_size < 64);
         split->src[j] = nir_src_for_ssa(intrin->src[j].ssa);
      }

      /* Copy const indices, if any */
      memcpy(split->const_index, intrin->const_index,
             sizeof(intrin->const_index));

      nir_ssa_dest_init(&split->instr, &split->dest,
                        intrin->dest.ssa.num_components, 32, NULL);
      nir_builder_instr_insert(b, &split->instr);

      res[i] = &split->dest.ssa;
   }

   return nir_pack_64_2x32_split(b, res[0], res[1]);
}

static nir_ssa_def *
build_vote_ieq(nir_builder *b, nir_ssa_def *x)
{
   nir_intrinsic_instr *vote =
      nir_intrinsic_instr_create(b->shader, nir_intrinsic_vote_ieq);
   vote->src[0] = nir_src_for_ssa(x);
   vote->num_components = x->num_components;
   nir_ssa_dest_init(&vote->instr, &vote->dest, 1, 1, NULL);
   nir_builder_instr_insert(b, &vote->instr);
   return &vote->dest.ssa;
}

static nir_ssa_def *
lower_vote_ieq(nir_builder *b, nir_ssa_def *x)
{
   return nir_iand(b, build_vote_ieq(b, nir_unpack_64_2x32_split_x(b, x)),
                      build_vote_ieq(b, nir_unpack_64_2x32_split_y(b, x)));
}

static nir_ssa_def *
build_scan_intrinsic(nir_builder *b, nir_intrinsic_op scan_op,
                     nir_op reduction_op, unsigned cluster_size,
                     nir_ssa_def *val)
{
   nir_intrinsic_instr *scan =
      nir_intrinsic_instr_create(b->shader, scan_op);
   scan->num_components = val->num_components;
   scan->src[0] = nir_src_for_ssa(val);
   nir_intrinsic_set_reduction_op(scan, reduction_op);
   if (scan_op == nir_intrinsic_reduce)
      nir_intrinsic_set_cluster_size(scan, cluster_size);
   nir_ssa_dest_init(&scan->instr, &scan->dest,
                     val->num_components, val->bit_size, NULL);
   nir_builder_instr_insert(b, &scan->instr);
   return &scan->dest.ssa;
}

static nir_ssa_def *
lower_scan_iadd64(nir_builder *b, const nir_intrinsic_instr *intrin)
{
   unsigned cluster_size =
      intrin->intrinsic == nir_intrinsic_reduce ?
      nir_intrinsic_cluster_size(intrin) : 0;

   /* Split it into three chunks of no more than 24 bits each.  With 8 bits
    * of headroom, we're guaranteed that there will never be overflow in the
    * individual subgroup operations.  (Assuming, of course, a subgroup size
    * no larger than 256 which seems reasonable.)  We can then scan on each of
    * the chunks and add them back together at the end.
    */
   assert(intrin->src[0].is_ssa);
   nir_ssa_def *x = intrin->src[0].ssa;
   nir_ssa_def *x_low =
      nir_u2u32(b, nir_iand_imm(b, x, 0xffffff));
   nir_ssa_def *x_mid =
      nir_u2u32(b, nir_iand_imm(b, nir_ushr(b, x, nir_imm_int(b, 24)),
                                   0xffffff));
   nir_ssa_def *x_hi =
      nir_u2u32(b, nir_ushr(b, x, nir_imm_int(b, 48)));

   nir_ssa_def *scan_low =
      build_scan_intrinsic(b, intrin->intrinsic, nir_op_iadd,
                           cluster_size, x_low);
   nir_ssa_def *scan_mid =
      build_scan_intrinsic(b, intrin->intrinsic, nir_op_iadd,
                           cluster_size, x_mid);
   nir_ssa_def *scan_hi =
      build_scan_intrinsic(b, intrin->intrinsic, nir_op_iadd,
                           cluster_size, x_hi);

   scan_low = nir_u2u64(b, scan_low);
   scan_mid = nir_ishl(b, nir_u2u64(b, scan_mid), nir_imm_int(b, 24));
   scan_hi = nir_ishl(b, nir_u2u64(b, scan_hi), nir_imm_int(b, 48));

   return nir_iadd(b, scan_hi, nir_iadd(b, scan_mid, scan_low));
}

static bool
should_lower_int64_intrinsic(const nir_intrinsic_instr *intrin,
                             const nir_shader_compiler_options *options)
{
   switch (intrin->intrinsic) {
   case nir_intrinsic_read_invocation:
   case nir_intrinsic_read_first_invocation:
   case nir_intrinsic_shuffle:
   case nir_intrinsic_shuffle_xor:
   case nir_intrinsic_shuffle_up:
   case nir_intrinsic_shuffle_down:
   case nir_intrinsic_quad_broadcast:
   case nir_intrinsic_quad_swap_horizontal:
   case nir_intrinsic_quad_swap_vertical:
   case nir_intrinsic_quad_swap_diagonal:
      assert(intrin->dest.is_ssa);
      return intrin->dest.ssa.bit_size == 64 &&
             (options->lower_int64_options & nir_lower_subgroup_shuffle64);

   case nir_intrinsic_vote_ieq:
      assert(intrin->src[0].is_ssa);
      return intrin->src[0].ssa->bit_size == 64 &&
             (options->lower_int64_options & nir_lower_vote_ieq64);

   case nir_intrinsic_reduce:
   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_exclusive_scan:
      assert(intrin->dest.is_ssa);
      if (intrin->dest.ssa.bit_size != 64)
         return false;

      switch (nir_intrinsic_reduction_op(intrin)) {
      case nir_op_iadd:
         return options->lower_int64_options & nir_lower_scan_reduce_iadd64;
      case nir_op_iand:
      case nir_op_ior:
      case nir_op_ixor:
         return options->lower_int64_options & nir_lower_scan_reduce_bitwise64;
      default:
         return false;
      }
      break;

   default:
      return false;
   }
}

static nir_ssa_def *
lower_int64_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin)
{
   switch (intrin->intrinsic) {
   case nir_intrinsic_read_invocation:
   case nir_intrinsic_read_first_invocation:
   case nir_intrinsic_shuffle:
   case nir_intrinsic_shuffle_xor:
   case nir_intrinsic_shuffle_up:
   case nir_intrinsic_shuffle_down:
   case nir_intrinsic_quad_broadcast:
   case nir_intrinsic_quad_swap_horizontal:
   case nir_intrinsic_quad_swap_vertical:
   case nir_intrinsic_quad_swap_diagonal:
      return split_64bit_subgroup_op(b, intrin);

   case nir_intrinsic_vote_ieq:
      assert(intrin->src[0].is_ssa);
      return lower_vote_ieq(b, intrin->src[0].ssa);

   case nir_intrinsic_reduce:
   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_exclusive_scan:
      switch (nir_intrinsic_reduction_op(intrin)) {
      case nir_op_iadd:
         return lower_scan_iadd64(b, intrin);
      case nir_op_iand:
      case nir_op_ior:
      case nir_op_ixor:
         return split_64bit_subgroup_op(b, intrin);
      default:
         unreachable("Unsupported subgroup scan/reduce op");
      }
      break;

   default:
      unreachable("Unsupported intrinsic");
   }
}

static bool
should_lower_int64_instr(const nir_instr *instr, const void *_options)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      return should_lower_int64_alu_instr(nir_instr_as_alu(instr), _options);
   case nir_instr_type_intrinsic:
      return should_lower_int64_intrinsic(nir_instr_as_intrinsic(instr),
                                          _options);
   default:
      return false;
   }
}

static nir_ssa_def *
lower_int64_instr(nir_builder *b, nir_instr *instr, void *_options)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      return lower_int64_alu_instr(b, nir_instr_as_alu(instr));
   case nir_instr_type_intrinsic:
      return lower_int64_intrinsic(b, nir_instr_as_intrinsic(instr));
   default:
      return NULL;
   }
}

bool
nir_lower_int64(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader, should_lower_int64_instr,
                                        lower_int64_instr,
                                        (void *)shader->options);
}