1/* 2 * Copyright (C) 2020 Collabora Ltd. 3 * Copyright (C) 2022 Alyssa Rosenzweig <alyssa@rosenzweig.io> 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 * 24 * Authors (Collabora): 25 * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> 26 */ 27 28#include "compiler/glsl/glsl_to_nir.h" 29#include "compiler/nir_types.h" 30#include "compiler/nir/nir_builder.h" 31#include "compiler/nir/nir_schedule.h" 32#include "util/u_debug.h" 33 34#include "disassemble.h" 35#include "valhall/va_compiler.h" 36#include "valhall/disassemble.h" 37#include "bifrost_compile.h" 38#include "compiler.h" 39#include "valhall/va_compiler.h" 40#include "bi_quirks.h" 41#include "bi_builder.h" 42#include "bifrost_nir.h" 43 44static const struct debug_named_value bifrost_debug_options[] = { 45 {"msgs", BIFROST_DBG_MSGS, "Print debug messages"}, 46 {"shaders", BIFROST_DBG_SHADERS, "Dump shaders in NIR and MIR"}, 47 {"shaderdb", BIFROST_DBG_SHADERDB, "Print statistics"}, 48 {"verbose", BIFROST_DBG_VERBOSE, "Disassemble verbosely"}, 49 {"internal", BIFROST_DBG_INTERNAL, "Dump even internal shaders"}, 50 {"nosched", BIFROST_DBG_NOSCHED, "Force trivial bundling"}, 51 {"nopsched", BIFROST_DBG_NOPSCHED, "Disable scheduling for pressure"}, 52 {"inorder", BIFROST_DBG_INORDER, "Force in-order bundling"}, 53 {"novalidate",BIFROST_DBG_NOVALIDATE, "Skip IR validation"}, 54 {"noopt", BIFROST_DBG_NOOPT, "Skip optimization passes"}, 55 {"noidvs", BIFROST_DBG_NOIDVS, "Disable IDVS"}, 56 {"nosb", BIFROST_DBG_NOSB, "Disable scoreboarding"}, 57 {"nopreload", BIFROST_DBG_NOPRELOAD, "Disable message preloading"}, 58 {"spill", BIFROST_DBG_SPILL, "Test register spilling"}, 59 DEBUG_NAMED_VALUE_END 60}; 61 62DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG", bifrost_debug_options, 0) 63 64/* How many bytes are prefetched by the Bifrost shader core. From the final 65 * clause of the shader, this range must be valid instructions or zero. */ 66#define BIFROST_SHADER_PREFETCH 128 67 68int bifrost_debug = 0; 69 70#define DBG(fmt, ...) 
\ 71 do { if (bifrost_debug & BIFROST_DBG_MSGS) \ 72 fprintf(stderr, "%s:%d: "fmt, \ 73 __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0) 74 75static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list); 76 77static bi_index 78bi_preload(bi_builder *b, unsigned reg) 79{ 80 if (bi_is_null(b->shader->preloaded[reg])) { 81 /* Insert at the beginning of the shader */ 82 bi_builder b_ = *b; 83 b_.cursor = bi_before_block(bi_start_block(&b->shader->blocks)); 84 85 /* Cache the result */ 86 b->shader->preloaded[reg] = bi_mov_i32(&b_, bi_register(reg)); 87 } 88 89 return b->shader->preloaded[reg]; 90} 91 92static bi_index 93bi_coverage(bi_builder *b) 94{ 95 if (bi_is_null(b->shader->coverage)) 96 b->shader->coverage = bi_preload(b, 60); 97 98 return b->shader->coverage; 99} 100 101/* 102 * Vertex ID and Instance ID are preloaded registers. Where they are preloaded 103 * changed from Bifrost to Valhall. Provide helpers that smooth over the 104 * architectural difference. 105 */ 106static inline bi_index 107bi_vertex_id(bi_builder *b) 108{ 109 return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61); 110} 111 112static inline bi_index 113bi_instance_id(bi_builder *b) 114{ 115 return bi_preload(b, (b->shader->arch >= 9) ? 61 : 62); 116} 117 118static void 119bi_emit_jump(bi_builder *b, nir_jump_instr *instr) 120{ 121 bi_instr *branch = bi_jump(b, bi_zero()); 122 123 switch (instr->type) { 124 case nir_jump_break: 125 branch->branch_target = b->shader->break_block; 126 break; 127 case nir_jump_continue: 128 branch->branch_target = b->shader->continue_block; 129 break; 130 default: 131 unreachable("Unhandled jump type"); 132 } 133 134 bi_block_add_successor(b->shader->current_block, branch->branch_target); 135 b->shader->current_block->unconditional_jumps = true; 136} 137 138/* Builds a 64-bit hash table key for an index */ 139static uint64_t 140bi_index_to_key(bi_index idx) 141{ 142 static_assert(sizeof(idx) <= sizeof(uint64_t), "too much padding"); 143 144 uint64_t key = 0; 145 memcpy(&key, &idx, sizeof(idx)); 146 return key; 147} 148 149/* 150 * Extract a single channel out of a vector source. We split vectors with SPLIT 151 * so we can use the split components directly, without emitting an extract. 152 * This has advantages of RA, as the split can usually be optimized away. 153 */ 154static bi_index 155bi_extract(bi_builder *b, bi_index vec, unsigned channel) 156{ 157 /* Extract caching relies on SSA form. It is incorrect for nir_register. 158 * Bypass the cache and emit an explicit split for registers. 159 */ 160 if (vec.reg) { 161 bi_instr *I = bi_split_i32_to(b, bi_null(), vec); 162 I->nr_dests = channel + 1; 163 I->dest[channel] = bi_temp(b->shader); 164 return I->dest[channel]; 165 } 166 167 bi_index *components = 168 _mesa_hash_table_u64_search(b->shader->allocated_vec, 169 bi_index_to_key(vec)); 170 171 /* No extract needed for scalars. 172 * 173 * This is a bit imprecise, but actual bugs (missing splits for vectors) 174 * should be caught by the following assertion. It is too difficult to 175 * ensure bi_extract is only called for real vectors. 
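 *
 * As a rough illustration of the cache (values purely hypothetical), a
 * three-channel vector registered via bi_cache_collect() lets later extracts
 * resolve to the cached channels without emitting anything new:
 *
 *    bi_index xyz[3] = { x, y, z };
 *    bi_cache_collect(b, vec, xyz, 3);   - remember vec's channels
 *    bi_extract(b, vec, 1);              - returns y, no extra split emitted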
176 */ 177 if (components == NULL && channel == 0) 178 return vec; 179 180 assert(components != NULL && "missing bi_cache_collect()"); 181 return components[channel]; 182} 183 184static void 185bi_cache_collect(bi_builder *b, bi_index dst, bi_index *s, unsigned n) 186{ 187 /* Lifetime of a hash table entry has to be at least as long as the table */ 188 bi_index *channels = ralloc_array(b->shader, bi_index, n); 189 memcpy(channels, s, sizeof(bi_index) * n); 190 191 _mesa_hash_table_u64_insert(b->shader->allocated_vec, 192 bi_index_to_key(dst), channels); 193} 194 195/* 196 * Splits an n-component vector (vec) into n scalar destinations (dests) using a 197 * split pseudo-instruction. 198 * 199 * Pre-condition: dests is filled with bi_null(). 200 */ 201static void 202bi_emit_split_i32(bi_builder *b, bi_index dests[4], bi_index vec, unsigned n) 203{ 204 /* Setup the destinations */ 205 for (unsigned i = 0; i < n; ++i) { 206 dests[i] = bi_temp(b->shader); 207 } 208 209 /* Emit the split */ 210 if (n == 1) { 211 bi_mov_i32_to(b, dests[0], vec); 212 } else { 213 bi_instr *I = bi_split_i32_to(b, dests[0], vec); 214 I->nr_dests = n; 215 216 for (unsigned j = 1; j < n; ++j) 217 I->dest[j] = dests[j]; 218 } 219} 220 221static void 222bi_emit_cached_split_i32(bi_builder *b, bi_index vec, unsigned n) 223{ 224 bi_index dests[4] = { bi_null(), bi_null(), bi_null(), bi_null() }; 225 bi_emit_split_i32(b, dests, vec, n); 226 bi_cache_collect(b, vec, dests, n); 227} 228 229/* 230 * Emit and cache a split for a vector of a given bitsize. The vector may not be 231 * composed of 32-bit words, but it will be split at 32-bit word boundaries. 232 */ 233static void 234bi_emit_cached_split(bi_builder *b, bi_index vec, unsigned bits) 235{ 236 bi_emit_cached_split_i32(b, vec, DIV_ROUND_UP(bits, 32)); 237} 238 239static void 240bi_split_dest(bi_builder *b, nir_dest dest) 241{ 242 bi_emit_cached_split(b, bi_dest_index(&dest), 243 nir_dest_bit_size(dest) * 244 nir_dest_num_components(dest)); 245} 246 247static bi_instr * 248bi_emit_collect_to(bi_builder *b, bi_index dst, bi_index *chan, unsigned n) 249{ 250 /* Special case: COLLECT of a single value is a scalar move */ 251 if (n == 1) 252 return bi_mov_i32_to(b, dst, chan[0]); 253 254 bi_instr *I = bi_collect_i32_to(b, dst); 255 I->nr_srcs = n; 256 257 for (unsigned i = 0; i < n; ++i) 258 I->src[i] = chan[i]; 259 260 bi_cache_collect(b, dst, chan, n); 261 return I; 262} 263 264static bi_instr * 265bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1) 266{ 267 return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1 }, 2); 268} 269 270static bi_instr * 271bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1, bi_index s2) 272{ 273 return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1, s2 }, 3); 274} 275 276static bi_index 277bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1) 278{ 279 bi_index dst = bi_temp(b->shader); 280 bi_collect_v2i32_to(b, dst, s0, s1); 281 return dst; 282} 283 284static bi_index 285bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr) 286{ 287 switch (intr->intrinsic) { 288 case nir_intrinsic_load_barycentric_centroid: 289 case nir_intrinsic_load_barycentric_sample: 290 return bi_preload(b, 61); 291 292 /* Need to put the sample ID in the top 16-bits */ 293 case nir_intrinsic_load_barycentric_at_sample: 294 return bi_mkvec_v2i16(b, bi_half(bi_dontcare(b), false), 295 bi_half(bi_src_index(&intr->src[0]), false)); 296 297 /* Interpret as 8:8 signed fixed point positions in pixels 
along X and 298 * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0) 299 * is the center of the pixel so we first fixup and then convert. For 300 * fp16 input: 301 * 302 * f2i16(((x, y) + (0.5, 0.5)) * 2**8) = 303 * f2i16((256 * (x, y)) + (128, 128)) = 304 * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128)) 305 * 306 * For fp32 input, that lacks enough precision for MSAA 16x, but the 307 * idea is the same. FIXME: still doesn't pass 308 */ 309 case nir_intrinsic_load_barycentric_at_offset: { 310 bi_index offset = bi_src_index(&intr->src[0]); 311 bi_index f16 = bi_null(); 312 unsigned sz = nir_src_bit_size(intr->src[0]); 313 314 if (sz == 16) { 315 f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0), 316 bi_imm_f16(128.0)); 317 } else { 318 assert(sz == 32); 319 bi_index f[2]; 320 for (unsigned i = 0; i < 2; ++i) { 321 f[i] = bi_fadd_rscale_f32(b, 322 bi_extract(b, offset, i), 323 bi_imm_f32(0.5), bi_imm_u32(8), 324 BI_SPECIAL_NONE); 325 } 326 327 f16 = bi_v2f32_to_v2f16(b, f[0], f[1]); 328 } 329 330 return bi_v2f16_to_v2s16(b, f16); 331 } 332 333 case nir_intrinsic_load_barycentric_pixel: 334 default: 335 return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b); 336 } 337} 338 339static enum bi_sample 340bi_interp_for_intrinsic(nir_intrinsic_op op) 341{ 342 switch (op) { 343 case nir_intrinsic_load_barycentric_centroid: 344 return BI_SAMPLE_CENTROID; 345 case nir_intrinsic_load_barycentric_sample: 346 case nir_intrinsic_load_barycentric_at_sample: 347 return BI_SAMPLE_SAMPLE; 348 case nir_intrinsic_load_barycentric_at_offset: 349 return BI_SAMPLE_EXPLICIT; 350 case nir_intrinsic_load_barycentric_pixel: 351 default: 352 return BI_SAMPLE_CENTER; 353 } 354} 355 356/* auto, 64-bit omitted */ 357static enum bi_register_format 358bi_reg_fmt_for_nir(nir_alu_type T) 359{ 360 switch (T) { 361 case nir_type_float16: return BI_REGISTER_FORMAT_F16; 362 case nir_type_float32: return BI_REGISTER_FORMAT_F32; 363 case nir_type_int16: return BI_REGISTER_FORMAT_S16; 364 case nir_type_uint16: return BI_REGISTER_FORMAT_U16; 365 case nir_type_int32: return BI_REGISTER_FORMAT_S32; 366 case nir_type_uint32: return BI_REGISTER_FORMAT_U32; 367 default: unreachable("Invalid type for register format"); 368 } 369} 370 371/* Checks if the _IMM variant of an intrinsic can be used, returning in imm the 372 * immediate to be used (which applies even if _IMM can't be used) */ 373 374static bool 375bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate, unsigned max) 376{ 377 nir_src *offset = nir_get_io_offset_src(instr); 378 379 if (!nir_src_is_const(*offset)) 380 return false; 381 382 *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset); 383 return (*immediate) < max; 384} 385 386static void 387bi_make_vec_to(bi_builder *b, bi_index final_dst, 388 bi_index *src, 389 unsigned *channel, 390 unsigned count, 391 unsigned bitsize); 392 393/* Bifrost's load instructions lack a component offset despite operating in 394 * terms of vec4 slots. Usually I/O vectorization avoids nonzero components, 395 * but they may be unavoidable with separate shaders in use. To solve this, we 396 * lower to a larger load and an explicit copy of the desired components. 
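 *
 * For example (hypothetical input), a vec2 read starting at component 2 of a
 * slot is emitted as a wider load into a temporary plus a copy of the tail
 * channels:
 *
 *    tmp  = load of components 0..3
 *    dest = { tmp[2], tmp[3] }     (bi_make_vec_to with channels = { 2, 3 })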
*/ 397 398static void 399bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp) 400{ 401 unsigned component = nir_intrinsic_component(instr); 402 unsigned nr = instr->num_components; 403 unsigned total = nr + component; 404 unsigned bitsize = nir_dest_bit_size(instr->dest); 405 406 assert(total <= 4 && "should be vec4"); 407 bi_emit_cached_split(b, tmp, total * bitsize); 408 409 if (component == 0) 410 return; 411 412 bi_index srcs[] = { tmp, tmp, tmp }; 413 unsigned channels[] = { component, component + 1, component + 2 }; 414 415 bi_make_vec_to(b, bi_dest_index(&instr->dest), 416 srcs, channels, nr, nir_dest_bit_size(instr->dest)); 417} 418 419static void 420bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr) 421{ 422 nir_alu_type T = nir_intrinsic_dest_type(instr); 423 enum bi_register_format regfmt = bi_reg_fmt_for_nir(T); 424 nir_src *offset = nir_get_io_offset_src(instr); 425 unsigned component = nir_intrinsic_component(instr); 426 enum bi_vecsize vecsize = (instr->num_components + component - 1); 427 unsigned imm_index = 0; 428 unsigned base = nir_intrinsic_base(instr); 429 bool constant = nir_src_is_const(*offset); 430 bool immediate = bi_is_intr_immediate(instr, &imm_index, 16); 431 bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader); 432 bi_instr *I; 433 434 if (immediate) { 435 I = bi_ld_attr_imm_to(b, dest, bi_vertex_id(b), 436 bi_instance_id(b), regfmt, vecsize, 437 imm_index); 438 } else { 439 bi_index idx = bi_src_index(&instr->src[0]); 440 441 if (constant) 442 idx = bi_imm_u32(imm_index); 443 else if (base != 0) 444 idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); 445 446 I = bi_ld_attr_to(b, dest, bi_vertex_id(b), bi_instance_id(b), 447 idx, regfmt, vecsize); 448 } 449 450 if (b->shader->arch >= 9) 451 I->table = PAN_TABLE_ATTRIBUTE; 452 453 bi_copy_component(b, instr, dest); 454} 455 456/* 457 * ABI: Special (desktop GL) slots come first, tightly packed. General varyings 458 * come later, sparsely packed. This handles both linked and separable shaders 459 * with a common code path, with minimal keying only for desktop GL. Each slot 460 * consumes 16 bytes (TODO: fp16, partial vectors). 461 */ 462static unsigned 463bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr) 464{ 465 nir_io_semantics sem = nir_intrinsic_io_semantics(intr); 466 uint32_t mask = ctx->inputs->fixed_varying_mask; 467 468 if (sem.location >= VARYING_SLOT_VAR0) { 469 unsigned nr_special = util_bitcount(mask); 470 unsigned general_index = (sem.location - VARYING_SLOT_VAR0); 471 472 return 16 * (nr_special + general_index); 473 } else { 474 return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location))); 475 } 476} 477 478/* 479 * Compute the offset in bytes of a varying with an immediate offset, adding the 480 * offset to the base computed above. Convenience method. 
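 *
 * E.g. with each slot consuming 16 bytes, a varying whose base is 32 bytes
 * accessed with an immediate offset of 1 slot lands at 32 + (1 * 16) = 48
 * bytes.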
481 */ 482static unsigned 483bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr) 484{ 485 nir_src *src = nir_get_io_offset_src(intr); 486 assert(nir_src_is_const(*src) && "assumes immediate offset"); 487 488 return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16); 489} 490 491static void 492bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr) 493{ 494 enum bi_sample sample = BI_SAMPLE_CENTER; 495 enum bi_update update = BI_UPDATE_STORE; 496 enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO; 497 bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input; 498 bi_index src0 = bi_null(); 499 500 unsigned component = nir_intrinsic_component(instr); 501 enum bi_vecsize vecsize = (instr->num_components + component - 1); 502 bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader); 503 504 unsigned sz = nir_dest_bit_size(instr->dest); 505 506 if (smooth) { 507 nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]); 508 assert(parent); 509 510 sample = bi_interp_for_intrinsic(parent->intrinsic); 511 src0 = bi_varying_src0_for_barycentric(b, parent); 512 513 assert(sz == 16 || sz == 32); 514 regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16 515 : BI_REGISTER_FORMAT_F32; 516 } else { 517 assert(sz == 32); 518 regfmt = BI_REGISTER_FORMAT_U32; 519 520 /* Valhall can't have bi_null() here, although the source is 521 * logically unused for flat varyings 522 */ 523 if (b->shader->arch >= 9) 524 src0 = bi_preload(b, 61); 525 526 /* Gather info as we go */ 527 b->shader->info.bifrost->uses_flat_shading = true; 528 } 529 530 enum bi_source_format source_format = 531 smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32; 532 533 nir_src *offset = nir_get_io_offset_src(instr); 534 unsigned imm_index = 0; 535 bool immediate = bi_is_intr_immediate(instr, &imm_index, 20); 536 bi_instr *I = NULL; 537 538 if (b->shader->malloc_idvs && immediate) { 539 /* Immediate index given in bytes. */ 540 bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, 541 sample, source_format, update, vecsize, 542 bi_varying_offset(b->shader, instr)); 543 } else if (immediate && smooth) { 544 I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, 545 vecsize, imm_index); 546 } else if (immediate && !smooth) { 547 I = bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, 548 vecsize, imm_index); 549 } else { 550 bi_index idx = bi_src_index(offset); 551 unsigned base = nir_intrinsic_base(instr); 552 553 if (b->shader->malloc_idvs) { 554 /* Index needs to be in bytes, but NIR gives the index 555 * in slots. For now assume 16 bytes per element. 556 */ 557 bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4)); 558 unsigned vbase = bi_varying_base_bytes(b->shader, instr); 559 560 if (vbase != 0) 561 idx_bytes = bi_iadd_u32(b, idx, bi_imm_u32(vbase), false); 562 563 bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt, 564 sample, source_format, update, 565 vecsize); 566 } else if (smooth) { 567 if (base != 0) 568 idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); 569 570 I = bi_ld_var_to(b, dest, src0, idx, regfmt, sample, 571 update, vecsize); 572 } else { 573 if (base != 0) 574 idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); 575 576 I = bi_ld_var_flat_to(b, dest, idx, 577 BI_FUNCTION_NONE, regfmt, 578 vecsize); 579 } 580 } 581 582 /* Valhall usually uses machine-allocated IDVS. If this is disabled, use 583 * a simple Midgard-style ABI. 
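 *
 * E.g. an immediate slot index in that fallback is passed to LD_VAR in units
 * of vec4 slots, whereas the machine-allocated path above had to scale it to
 * a byte offset (16 bytes per slot) first.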
584 */ 585 if (b->shader->arch >= 9 && I != NULL) 586 I->table = PAN_TABLE_ATTRIBUTE; 587 588 bi_copy_component(b, instr, dest); 589} 590 591static void 592bi_make_vec16_to(bi_builder *b, bi_index dst, bi_index *src, 593 unsigned *channel, unsigned count) 594{ 595 bi_index srcs[BI_MAX_VEC]; 596 597 for (unsigned i = 0; i < count; i += 2) { 598 bool next = (i + 1) < count; 599 600 unsigned chan = channel ? channel[i] : 0; 601 unsigned nextc = next && channel ? channel[i + 1] : 0; 602 603 bi_index w0 = bi_extract(b, src[i], chan >> 1); 604 bi_index w1 = next ? bi_extract(b, src[i + 1], nextc >> 1) : bi_zero(); 605 606 bi_index h0 = bi_half(w0, chan & 1); 607 bi_index h1 = bi_half(w1, nextc & 1); 608 609 if (bi_is_word_equiv(w0, w1) && (chan & 1) == 0 && ((nextc & 1) == 1)) 610 srcs[i >> 1] = bi_mov_i32(b, w0); 611 else if (bi_is_word_equiv(w0, w1)) 612 srcs[i >> 1] = bi_swz_v2i16(b, bi_swz_16(w0, chan & 1, nextc & 1)); 613 else 614 srcs[i >> 1] = bi_mkvec_v2i16(b, h0, h1); 615 } 616 617 bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, 2)); 618} 619 620static void 621bi_make_vec_to(bi_builder *b, bi_index dst, 622 bi_index *src, 623 unsigned *channel, 624 unsigned count, 625 unsigned bitsize) 626{ 627 if (bitsize == 32) { 628 bi_index srcs[BI_MAX_VEC]; 629 630 for (unsigned i = 0; i < count; ++i) 631 srcs[i] = bi_extract(b, src[i], channel ? channel[i] : 0); 632 633 bi_emit_collect_to(b, dst, srcs, count); 634 } else if (bitsize == 16) { 635 bi_make_vec16_to(b, dst, src, channel, count); 636 } else if (bitsize == 8 && count == 1) { 637 bi_swz_v4i8_to(b, dst, bi_byte( 638 bi_extract(b, src[0], channel[0] >> 2), 639 channel[0] & 3)); 640 } else { 641 unreachable("8-bit mkvec not yet supported"); 642 } 643} 644 645static inline bi_instr * 646bi_load_ubo_to(bi_builder *b, unsigned bitsize, bi_index dest0, bi_index src0, 647 bi_index src1) 648{ 649 bi_instr *I; 650 651 if (b->shader->arch >= 9) { 652 I = bi_ld_buffer_to(b, bitsize, dest0, src0, src1); 653 I->seg = BI_SEG_UBO; 654 } else { 655 I = bi_load_to(b, bitsize, dest0, src0, src1, BI_SEG_UBO, 0); 656 } 657 658 bi_emit_cached_split(b, dest0, bitsize); 659 return I; 660} 661 662static bi_instr * 663bi_load_sysval_to(bi_builder *b, bi_index dest, int sysval, 664 unsigned nr_components, unsigned offset) 665{ 666 unsigned sysval_ubo = b->shader->inputs->fixed_sysval_ubo >= 0 ? 667 b->shader->inputs->fixed_sysval_ubo : 668 b->shader->nir->info.num_ubos; 669 unsigned uniform = 670 pan_lookup_sysval(b->shader->sysval_to_id, 671 b->shader->info.sysvals, 672 sysval); 673 unsigned idx = (uniform * 16) + offset; 674 675 return bi_load_ubo_to(b, nr_components * 32, dest, 676 bi_imm_u32(idx), bi_imm_u32(sysval_ubo)); 677} 678 679static void 680bi_load_sysval_nir(bi_builder *b, nir_intrinsic_instr *intr, 681 unsigned nr_components, unsigned offset) 682{ 683 bi_load_sysval_to(b, bi_dest_index(&intr->dest), 684 panfrost_sysval_for_instr(&intr->instr, NULL), 685 nr_components, offset); 686} 687 688static bi_index 689bi_load_sysval(bi_builder *b, int sysval, 690 unsigned nr_components, unsigned offset) 691{ 692 bi_index tmp = bi_temp(b->shader); 693 bi_load_sysval_to(b, tmp, sysval, nr_components, offset); 694 return tmp; 695} 696 697static void 698bi_load_sample_id_to(bi_builder *b, bi_index dst) 699{ 700 /* r61[16:23] contains the sampleID, mask it out. 
Upper bits 701 * seem to read garbage (despite being architecturally defined 702 * as zero), so use a 5-bit mask instead of 8-bits */ 703 704 bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f), 705 bi_imm_u8(16), false); 706} 707 708static bi_index 709bi_load_sample_id(bi_builder *b) 710{ 711 bi_index sample_id = bi_temp(b->shader); 712 bi_load_sample_id_to(b, sample_id); 713 return sample_id; 714} 715 716static bi_index 717bi_pixel_indices(bi_builder *b, unsigned rt) 718{ 719 /* We want to load the current pixel. */ 720 struct bifrost_pixel_indices pix = { 721 .y = BIFROST_CURRENT_PIXEL, 722 .rt = rt 723 }; 724 725 uint32_t indices_u32 = 0; 726 memcpy(&indices_u32, &pix, sizeof(indices_u32)); 727 bi_index indices = bi_imm_u32(indices_u32); 728 729 /* Sample index above is left as zero. For multisampling, we need to 730 * fill in the actual sample ID in the lower byte */ 731 732 if (b->shader->inputs->blend.nr_samples > 1) 733 indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false); 734 735 return indices; 736} 737 738/* Source color is passed through r0-r3, or r4-r7 for the second source when 739 * dual-source blending. Preload the corresponding vector. 740 */ 741static void 742bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr) 743{ 744 nir_io_semantics sem = nir_intrinsic_io_semantics(instr); 745 unsigned base = (sem.location == VARYING_SLOT_VAR0) ? 4 : 0; 746 unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr)); 747 assert(size == 16 || size == 32); 748 749 bi_index srcs[] = { 750 bi_preload(b, base + 0), bi_preload(b, base + 1), 751 bi_preload(b, base + 2), bi_preload(b, base + 3) 752 }; 753 754 bi_emit_collect_to(b, bi_dest_index(&instr->dest), srcs, size == 32 ? 4 : 2); 755} 756 757static void 758bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, 759 bi_index rgba2, nir_alu_type T2, unsigned rt) 760{ 761 /* On Valhall, BLEND does not encode the return address */ 762 bool bifrost = b->shader->arch <= 8; 763 764 /* Reads 2 or 4 staging registers to cover the input */ 765 unsigned size = nir_alu_type_get_type_size(T); 766 unsigned size_2 = nir_alu_type_get_type_size(T2); 767 unsigned sr_count = (size <= 16) ? 2 : 4; 768 unsigned sr_count_2 = (size_2 <= 16) ? 2 : 4; 769 const struct panfrost_compile_inputs *inputs = b->shader->inputs; 770 uint64_t blend_desc = inputs->blend.bifrost_blend_desc; 771 enum bi_register_format regfmt = bi_reg_fmt_for_nir(T); 772 773 if (inputs->is_blend && inputs->blend.nr_samples > 1) { 774 /* Conversion descriptor comes from the compile inputs, pixel 775 * indices derived at run time based on sample ID */ 776 bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_coverage(b), 777 bi_imm_u32(blend_desc >> 32), 778 regfmt, BI_VECSIZE_V4); 779 } else if (b->shader->inputs->is_blend) { 780 uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc; 781 782 /* Blend descriptor comes from the compile inputs */ 783 /* Put the result in r0 */ 784 785 bi_blend_to(b, bifrost ? bi_temp(b->shader) : bi_null(), rgba, 786 bi_coverage(b), 787 bi_imm_u32(blend_desc), 788 bi_imm_u32(blend_desc >> 32), 789 bi_null(), regfmt, sr_count, 0); 790 } else { 791 /* Blend descriptor comes from the FAU RAM. By convention, the 792 * return address on Bifrost is stored in r48 and will be used 793 * by the blend shader to jump back to the fragment shader */ 794 795 bi_blend_to(b, bifrost ? 
bi_temp(b->shader) : bi_null(), rgba, 796 bi_coverage(b), 797 bi_fau(BIR_FAU_BLEND_0 + rt, false), 798 bi_fau(BIR_FAU_BLEND_0 + rt, true), 799 rgba2, regfmt, sr_count, sr_count_2); 800 } 801 802 assert(rt < 8); 803 b->shader->info.bifrost->blend[rt].type = T; 804 805 if (T2) 806 b->shader->info.bifrost->blend_src1_type = T2; 807} 808 809/* Blend shaders do not need to run ATEST since they are dependent on a 810 * fragment shader that runs it. Blit shaders may not need to run ATEST, since 811 * ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and 812 * there are no writes to the coverage mask. The latter two are satisfied for 813 * all blit shaders, so we just care about early-z, which blit shaders force 814 * iff they do not write depth or stencil */ 815 816static bool 817bi_skip_atest(bi_context *ctx, bool emit_zs) 818{ 819 return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend; 820} 821 822static void 823bi_emit_atest(bi_builder *b, bi_index alpha) 824{ 825 bi_instr *atest = bi_atest_to(b, bi_temp(b->shader), bi_coverage(b), alpha); 826 b->shader->emitted_atest = true; 827 b->shader->coverage = atest->dest[0]; 828 829 /* Pseudo-source to encode in the tuple */ 830 atest->src[2] = bi_fau(BIR_FAU_ATEST_PARAM, false); 831} 832 833static void 834bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr) 835{ 836 bool combined = instr->intrinsic == 837 nir_intrinsic_store_combined_output_pan; 838 839 unsigned writeout = combined ? nir_intrinsic_component(instr) : 840 PAN_WRITEOUT_C; 841 842 bool emit_blend = writeout & (PAN_WRITEOUT_C); 843 bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S); 844 845 const nir_variable *var = 846 nir_find_variable_with_driver_location(b->shader->nir, 847 nir_var_shader_out, nir_intrinsic_base(instr)); 848 849 unsigned loc = var ? var->data.location : 0; 850 851 bi_index src0 = bi_src_index(&instr->src[0]); 852 853 /* By ISA convention, the coverage mask is stored in R60. The store 854 * itself will be handled by a subsequent ATEST instruction */ 855 if (loc == FRAG_RESULT_SAMPLE_MASK) { 856 bi_index orig = bi_coverage(b); 857 bi_index msaa = bi_load_sysval(b, PAN_SYSVAL_MULTISAMPLED, 1, 0); 858 bi_index new = bi_lshift_and_i32(b, orig, bi_extract(b, src0, 0), bi_imm_u8(0)); 859 860 b->shader->coverage = 861 bi_mux_i32(b, orig, new, msaa, BI_MUX_INT_ZERO); 862 return; 863 } 864 865 /* Emit ATEST if we have to, note ATEST requires a floating-point alpha 866 * value, but render target #0 might not be floating point. However the 867 * alpha value is only used for alpha-to-coverage, a stage which is 868 * skipped for pure integer framebuffers, so the issue is moot. */ 869 870 if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) { 871 nir_alu_type T = nir_intrinsic_src_type(instr); 872 873 bi_index rgba = bi_src_index(&instr->src[0]); 874 bi_index alpha = 875 (T == nir_type_float16) ? bi_half(bi_extract(b, rgba, 1), true) : 876 (T == nir_type_float32) ? 
bi_extract(b, rgba, 3) : 877 bi_dontcare(b); 878 879 /* Don't read out-of-bounds */ 880 if (nir_src_num_components(instr->src[0]) < 4) 881 alpha = bi_imm_f32(1.0); 882 883 bi_emit_atest(b, alpha); 884 } 885 886 if (emit_zs) { 887 bi_index z = bi_dontcare(b), s = bi_dontcare(b); 888 889 if (writeout & PAN_WRITEOUT_Z) 890 z = bi_src_index(&instr->src[2]); 891 892 if (writeout & PAN_WRITEOUT_S) 893 s = bi_src_index(&instr->src[3]); 894 895 b->shader->coverage = bi_zs_emit(b, z, s, bi_coverage(b), 896 writeout & PAN_WRITEOUT_S, 897 writeout & PAN_WRITEOUT_Z); 898 } 899 900 if (emit_blend) { 901 unsigned rt = loc ? (loc - FRAG_RESULT_DATA0) : 0; 902 bool dual = (writeout & PAN_WRITEOUT_2); 903 bi_index color = bi_src_index(&instr->src[0]); 904 bi_index color2 = dual ? bi_src_index(&instr->src[4]) : bi_null(); 905 nir_alu_type T2 = dual ? nir_intrinsic_dest_type(instr) : 0; 906 907 /* Explicit copy since BLEND inputs are precoloured to R0-R3, 908 * TODO: maybe schedule around this or implement in RA as a 909 * spill */ 910 bool has_mrt = false; 911 912 nir_foreach_shader_out_variable(var, b->shader->nir) 913 has_mrt |= (var->data.location > FRAG_RESULT_DATA0); 914 915 if (has_mrt) { 916 bi_index srcs[4] = { color, color, color, color }; 917 unsigned channels[4] = { 0, 1, 2, 3 }; 918 color = bi_temp(b->shader); 919 bi_make_vec_to(b, color, srcs, channels, 920 nir_src_num_components(instr->src[0]), 921 nir_alu_type_get_type_size(nir_intrinsic_src_type(instr))); 922 } 923 924 bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), 925 color2, T2, rt); 926 } 927 928 if (b->shader->inputs->is_blend) { 929 /* Jump back to the fragment shader, return address is stored 930 * in r48 (see above). On Valhall, only jump if the address is 931 * nonzero. The check is free there and it implements the "jump 932 * to 0 terminates the blend shader" that's automatic on 933 * Bifrost. 934 */ 935 if (b->shader->arch >= 8) 936 bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE); 937 else 938 bi_jump(b, bi_preload(b, 48)); 939 } 940} 941 942/** 943 * In a vertex shader, is the specified variable a position output? These kinds 944 * of outputs are written from position shaders when IDVS is enabled. All other 945 * outputs are written from the varying shader. 946 */ 947static bool 948bi_should_remove_store(nir_intrinsic_instr *intr, enum bi_idvs_mode idvs) 949{ 950 nir_io_semantics sem = nir_intrinsic_io_semantics(intr); 951 952 switch (sem.location) { 953 case VARYING_SLOT_POS: 954 case VARYING_SLOT_PSIZ: 955 return idvs == BI_IDVS_VARYING; 956 default: 957 return idvs == BI_IDVS_POSITION; 958 } 959} 960 961static bool 962bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data) 963{ 964 enum bi_idvs_mode *idvs = data; 965 966 if (instr->type != nir_instr_type_intrinsic) 967 return false; 968 969 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 970 971 if (intr->intrinsic != nir_intrinsic_store_output) 972 return false; 973 974 if (bi_should_remove_store(intr, *idvs)) { 975 nir_instr_remove(instr); 976 return true; 977 } 978 979 return false; 980} 981 982static void 983bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr) 984{ 985 /* In principle we can do better for 16-bit. 
At the moment we require 986 * 32-bit to permit the use of .auto, in order to force .u32 for flat 987 * varyings, to handle internal TGSI shaders that set flat in the VS 988 * but smooth in the FS */ 989 990 ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr); 991 ASSERTED unsigned T_size = nir_alu_type_get_type_size(T); 992 assert(T_size == 32 || (b->shader->arch >= 9 && T_size == 16)); 993 enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO; 994 995 unsigned imm_index = 0; 996 bool immediate = bi_is_intr_immediate(instr, &imm_index, 16); 997 998 /* Only look at the total components needed. In effect, we fill in all 999 * the intermediate "holes" in the write mask, since we can't mask off 1000 * stores. Since nir_lower_io_to_temporaries ensures each varying is 1001 * written at most once, anything that's masked out is undefined, so it 1002 * doesn't matter what we write there. So we may as well do the 1003 * simplest thing possible. */ 1004 unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr)); 1005 assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0)); 1006 1007 bi_index data = bi_src_index(&instr->src[0]); 1008 1009 /* To keep the vector dimensions consistent, we need to drop some 1010 * components. This should be coalesced. 1011 * 1012 * TODO: This is ugly and maybe inefficient. Would we rather 1013 * introduce a TRIM.i32 pseudoinstruction? 1014 */ 1015 if (nr < nir_intrinsic_src_components(instr, 0)) { 1016 assert(T_size == 32 && "todo: 16-bit trim"); 1017 1018 bi_instr *split = bi_split_i32_to(b, bi_null(), data); 1019 split->nr_dests = nir_intrinsic_src_components(instr, 0); 1020 1021 bi_index tmp = bi_temp(b->shader); 1022 bi_instr *collect = bi_collect_i32_to(b, tmp); 1023 collect->nr_srcs = nr; 1024 1025 for (unsigned w = 0; w < nr; ++w) { 1026 split->dest[w] = bi_temp(b->shader); 1027 collect->src[w] = split->dest[w]; 1028 } 1029 1030 data = tmp; 1031 } 1032 1033 bool psiz = (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PSIZ); 1034 1035 bi_index a[4] = { bi_null() }; 1036 1037 if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) { 1038 /* Bifrost position shaders have a fast path */ 1039 assert(T == nir_type_float16 || T == nir_type_float32); 1040 unsigned regfmt = (T == nir_type_float16) ? 0 : 1; 1041 unsigned identity = (b->shader->arch == 6) ? 0x688 : 0; 1042 unsigned snap4 = 0x5E; 1043 uint32_t format = identity | (snap4 << 12) | (regfmt << 24); 1044 1045 bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59), 1046 bi_imm_u32(format), regfmt, nr - 1); 1047 } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) { 1048 bi_index index = bi_preload(b, 59); 1049 1050 if (psiz) { 1051 assert(T_size == 16 && "should've been lowered"); 1052 index = bi_iadd_imm_i32(b, index, 4); 1053 } 1054 1055 bi_index address = bi_lea_buf_imm(b, index); 1056 bi_emit_split_i32(b, a, address, 2); 1057 1058 bool varying = (b->shader->idvs == BI_IDVS_VARYING); 1059 1060 bi_store(b, nr * nir_src_bit_size(instr->src[0]), 1061 data, a[0], a[1], 1062 varying ? BI_SEG_VARY : BI_SEG_POS, 1063 varying ? 
bi_varying_offset(b->shader, instr) : 0); 1064 } else if (immediate) { 1065 bi_index address = bi_lea_attr_imm(b, 1066 bi_vertex_id(b), bi_instance_id(b), 1067 regfmt, imm_index); 1068 bi_emit_split_i32(b, a, address, 3); 1069 1070 bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1); 1071 } else { 1072 bi_index idx = 1073 bi_iadd_u32(b, 1074 bi_src_index(nir_get_io_offset_src(instr)), 1075 bi_imm_u32(nir_intrinsic_base(instr)), 1076 false); 1077 bi_index address = bi_lea_attr(b, 1078 bi_vertex_id(b), bi_instance_id(b), 1079 idx, regfmt); 1080 bi_emit_split_i32(b, a, address, 3); 1081 1082 bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1); 1083 } 1084} 1085 1086static void 1087bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr) 1088{ 1089 nir_src *offset = nir_get_io_offset_src(instr); 1090 1091 bool offset_is_const = nir_src_is_const(*offset); 1092 bi_index dyn_offset = bi_src_index(offset); 1093 uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0; 1094 bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input); 1095 1096 bi_load_ubo_to(b, instr->num_components * nir_dest_bit_size(instr->dest), 1097 bi_dest_index(&instr->dest), offset_is_const ? 1098 bi_imm_u32(const_offset) : dyn_offset, 1099 kernel_input ? bi_zero() : bi_src_index(&instr->src[0])); 1100} 1101 1102static void 1103bi_emit_load_push_constant(bi_builder *b, nir_intrinsic_instr *instr) 1104{ 1105 assert(b->shader->inputs->no_ubo_to_push && "can't mix push constant forms"); 1106 1107 nir_src *offset = &instr->src[0]; 1108 assert(nir_src_is_const(*offset) && "no indirect push constants"); 1109 uint32_t base = nir_intrinsic_base(instr) + nir_src_as_uint(*offset); 1110 assert((base & 3) == 0 && "unaligned push constants"); 1111 1112 unsigned bits = nir_dest_bit_size(instr->dest) * 1113 nir_dest_num_components(instr->dest); 1114 1115 unsigned n = DIV_ROUND_UP(bits, 32); 1116 assert(n <= 4); 1117 bi_index channels[4] = { bi_null() }; 1118 1119 for (unsigned i = 0; i < n; ++i) { 1120 unsigned word = (base >> 2) + i; 1121 1122 channels[i] = bi_fau(BIR_FAU_UNIFORM | (word >> 1), word & 1); 1123 } 1124 1125 bi_emit_collect_to(b, bi_dest_index(&instr->dest), channels, n); 1126} 1127 1128static bi_index 1129bi_addr_high(bi_builder *b, nir_src *src) 1130{ 1131 return (nir_src_bit_size(*src) == 64) ? 1132 bi_extract(b, bi_src_index(src), 1) : bi_zero(); 1133} 1134 1135static void 1136bi_handle_segment(bi_builder *b, bi_index *addr_lo, bi_index *addr_hi, enum bi_seg seg, int16_t *offset) 1137{ 1138 /* Not needed on Bifrost or for global accesses */ 1139 if (b->shader->arch < 9 || seg == BI_SEG_NONE) 1140 return; 1141 1142 /* There is no segment modifier on Valhall. Instead, we need to 1143 * emit the arithmetic ourselves. We do have an offset 1144 * available, which saves an instruction for constant offsets. 1145 */ 1146 bool wls = (seg == BI_SEG_WLS); 1147 assert(wls || (seg == BI_SEG_TL)); 1148 1149 enum bir_fau fau = wls ? 
BIR_FAU_WLS_PTR : BIR_FAU_TLS_PTR;

   bi_index base_lo = bi_fau(fau, false);

   if (offset && addr_lo->type == BI_INDEX_CONSTANT && addr_lo->value == (int16_t) addr_lo->value) {
      *offset = addr_lo->value;
      *addr_lo = base_lo;
   } else {
      *addr_lo = bi_iadd_u32(b, base_lo, *addr_lo, false);
   }

   /* Do not allow overflow for WLS or TLS */
   *addr_hi = bi_fau(fau, true);
}

static void
bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
{
   int16_t offset = 0;
   unsigned bits = instr->num_components * nir_dest_bit_size(instr->dest);
   bi_index dest = bi_dest_index(&instr->dest);
   bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[0]), 0);
   bi_index addr_hi = bi_addr_high(b, &instr->src[0]);

   bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);

   bi_load_to(b, bits, dest, addr_lo, addr_hi, seg, offset);
   bi_emit_cached_split(b, dest, bits);
}

static void
bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
{
   /* Require contiguous masks, guaranteed by nir_lower_wrmasks */
   assert(nir_intrinsic_write_mask(instr) ==
          BITFIELD_MASK(instr->num_components));

   int16_t offset = 0;
   bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[1]), 0);
   bi_index addr_hi = bi_addr_high(b, &instr->src[1]);

   bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);

   bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]),
            bi_src_index(&instr->src[0]),
            addr_lo, addr_hi, seg, offset);
}

/* Exchanges the staging register with memory */

static void
bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg, enum bi_seg seg)
{
   assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);

   unsigned sz = nir_src_bit_size(*arg);
   assert(sz == 32 || sz == 64);

   bi_index data = bi_src_index(arg);

   bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);

   if (b->shader->arch >= 9)
      bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
   else if (seg == BI_SEG_WLS)
      addr_hi = bi_zero();

   bi_axchg_to(b, sz, dst, data, bi_extract(b, addr, 0), addr_hi, seg);
}

/* Exchanges the second staging register with memory if comparison with first
 * staging register passes */

static void
bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1, nir_src *arg_2, enum bi_seg seg)
{
   assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);

   /* hardware is swapped from NIR */
   bi_index src0 = bi_src_index(arg_2);
   bi_index src1 = bi_src_index(arg_1);

   unsigned sz = nir_src_bit_size(*arg_1);
   assert(sz == 32 || sz == 64);

   bi_index data_words[] = {
      bi_extract(b, src0, 0),
      sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src0, 1),

      /* 64-bit */
      bi_extract(b, src1, 0),
      sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src1, 1),
   };

   bi_index in = bi_temp(b->shader);
   bi_emit_collect_to(b, in, data_words, 2 * (sz / 32));
   bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);

   if (b->shader->arch >= 9)
      bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
   else if (seg == BI_SEG_WLS)
      addr_hi = bi_zero();

   bi_index out = bi_acmpxchg(b, sz, in, bi_extract(b, addr, 0), addr_hi, seg);
   bi_emit_cached_split(b, out, sz);

   bi_index inout_words[] = {
      bi_extract(b, out, 0),
      sz == 64 ? bi_extract(b, out, 1) : bi_null()
   };

   bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
}

/* Extracts an atomic opcode */

static enum bi_atom_opc
bi_atom_opc_for_nir(nir_intrinsic_op op)
{
   switch (op) {
   case nir_intrinsic_global_atomic_add:
   case nir_intrinsic_shared_atomic_add:
   case nir_intrinsic_image_atomic_add:
      return BI_ATOM_OPC_AADD;

   case nir_intrinsic_global_atomic_imin:
   case nir_intrinsic_shared_atomic_imin:
   case nir_intrinsic_image_atomic_imin:
      return BI_ATOM_OPC_ASMIN;

   case nir_intrinsic_global_atomic_umin:
   case nir_intrinsic_shared_atomic_umin:
   case nir_intrinsic_image_atomic_umin:
      return BI_ATOM_OPC_AUMIN;

   case nir_intrinsic_global_atomic_imax:
   case nir_intrinsic_shared_atomic_imax:
   case nir_intrinsic_image_atomic_imax:
      return BI_ATOM_OPC_ASMAX;

   case nir_intrinsic_global_atomic_umax:
   case nir_intrinsic_shared_atomic_umax:
   case nir_intrinsic_image_atomic_umax:
      return BI_ATOM_OPC_AUMAX;

   case nir_intrinsic_global_atomic_and:
   case nir_intrinsic_shared_atomic_and:
   case nir_intrinsic_image_atomic_and:
      return BI_ATOM_OPC_AAND;

   case nir_intrinsic_global_atomic_or:
   case nir_intrinsic_shared_atomic_or:
   case nir_intrinsic_image_atomic_or:
      return BI_ATOM_OPC_AOR;

   case nir_intrinsic_global_atomic_xor:
   case nir_intrinsic_shared_atomic_xor:
   case nir_intrinsic_image_atomic_xor:
      return BI_ATOM_OPC_AXOR;

   default:
      unreachable("Unexpected computational atomic");
   }
}

/* Optimized unary atomics are available with an implied #1 argument */

static bool
bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out)
{
   /* Check we have a compatible constant */
   if (arg.type != BI_INDEX_CONSTANT)
      return false;

   if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD)))
      return false;

   /* Check for a compatible operation */
   switch (op) {
   case BI_ATOM_OPC_AADD:
      *out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC;
      return true;
   case BI_ATOM_OPC_ASMAX:
      *out = BI_ATOM_OPC_ASMAX1;
      return true;
   case BI_ATOM_OPC_AUMAX:
      *out = BI_ATOM_OPC_AUMAX1;
      return true;
   case BI_ATOM_OPC_AOR:
      *out = BI_ATOM_OPC_AOR1;
      return true;
   default:
      return false;
   }
}

/*
 * Coordinates are 16-bit integers in Bifrost but 32-bit in NIR. We need to
 * translate between these forms (with MKVEC.v2i16).
 *
 * Additionally on Valhall, cube maps in the attribute pipe are treated as 2D
 * arrays. For uniform handling, we also treat 3D textures like 2D arrays.
 *
 * Our indexing needs to reflect this.
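 *
 * As a rough example, a 2D array access whose NIR coordinate is (x, y, layer)
 * is packed by the helper below roughly as:
 *
 *    src0 = MKVEC.v2i16(x[15:0], y[15:0])
 *    src1 = layer                            (Bifrost, full 32-bit word)
 *    src1 = MKVEC.v2i16(0, layer[15:0])      (Valhall)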
1353 */ 1354static bi_index 1355bi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx, 1356 unsigned coord_comps, bool is_array) 1357{ 1358 assert(coord_comps > 0 && coord_comps <= 3); 1359 1360 if (src_idx == 0) { 1361 if (coord_comps == 1 || (coord_comps == 2 && is_array)) 1362 return bi_extract(b, coord, 0); 1363 else 1364 return bi_mkvec_v2i16(b, 1365 bi_half(bi_extract(b, coord, 0), false), 1366 bi_half(bi_extract(b, coord, 1), false)); 1367 } else { 1368 if (coord_comps == 3 && b->shader->arch >= 9) 1369 return bi_mkvec_v2i16(b, bi_imm_u16(0), 1370 bi_half(bi_extract(b, coord, 2), false)); 1371 else if (coord_comps == 3) 1372 return bi_extract(b, coord, 2); 1373 else if (coord_comps == 2 && is_array) 1374 return bi_extract(b, coord, 1); 1375 else 1376 return bi_zero(); 1377 } 1378} 1379 1380static bi_index 1381bi_emit_image_index(bi_builder *b, nir_intrinsic_instr *instr) 1382{ 1383 nir_src src = instr->src[0]; 1384 bi_index index = bi_src_index(&src); 1385 bi_context *ctx = b->shader; 1386 1387 /* Images come after vertex attributes, so handle an explicit offset */ 1388 unsigned offset = (ctx->stage == MESA_SHADER_VERTEX) ? 1389 util_bitcount64(ctx->nir->info.inputs_read) : 0; 1390 1391 if (offset == 0) 1392 return index; 1393 else if (nir_src_is_const(src)) 1394 return bi_imm_u32(nir_src_as_uint(src) + offset); 1395 else 1396 return bi_iadd_u32(b, index, bi_imm_u32(offset), false); 1397} 1398 1399static void 1400bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr) 1401{ 1402 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); 1403 unsigned coord_comps = nir_image_intrinsic_coord_components(instr); 1404 bool array = nir_intrinsic_image_array(instr); 1405 ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim); 1406 1407 bi_index coords = bi_src_index(&instr->src[1]); 1408 bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array); 1409 bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array); 1410 bi_index dest = bi_dest_index(&instr->dest); 1411 enum bi_register_format regfmt = bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr)); 1412 enum bi_vecsize vecsize = instr->num_components - 1; 1413 1414 /* TODO: MSAA */ 1415 assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported"); 1416 1417 if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) { 1418 bi_instr *I = bi_ld_tex_imm_to(b, dest, xy, zw, regfmt, vecsize, 1419 nir_src_as_uint(instr->src[0])); 1420 1421 I->table = PAN_TABLE_IMAGE; 1422 } else if (b->shader->arch >= 9) { 1423 unreachable("Indirect images on Valhall not yet supported"); 1424 } else { 1425 bi_ld_attr_tex_to(b, dest, xy, zw, 1426 bi_emit_image_index(b, instr), regfmt, 1427 vecsize); 1428 } 1429 1430 bi_split_dest(b, instr->dest); 1431} 1432 1433static bi_index 1434bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr) 1435{ 1436 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); 1437 bool array = nir_intrinsic_image_array(instr); 1438 ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim); 1439 unsigned coord_comps = nir_image_intrinsic_coord_components(instr); 1440 1441 /* TODO: MSAA */ 1442 assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported"); 1443 1444 enum bi_register_format type = (instr->intrinsic == nir_intrinsic_image_store) ? 
1445 bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)) : 1446 BI_REGISTER_FORMAT_AUTO; 1447 1448 bi_index coords = bi_src_index(&instr->src[1]); 1449 bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array); 1450 bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array); 1451 bi_index dest = bi_temp(b->shader); 1452 1453 if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) { 1454 bi_instr *I = bi_lea_tex_imm_to(b, dest, xy, zw, false, 1455 nir_src_as_uint(instr->src[0])); 1456 1457 I->table = PAN_TABLE_IMAGE; 1458 } else if (b->shader->arch >= 9) { 1459 unreachable("Indirect images on Valhall not yet supported"); 1460 } else { 1461 bi_instr *I = bi_lea_attr_tex_to(b, dest, xy, zw, 1462 bi_emit_image_index(b, instr), type); 1463 1464 /* LEA_ATTR_TEX defaults to the secondary attribute table, but 1465 * our ABI has all images in the primary attribute table 1466 */ 1467 I->table = BI_TABLE_ATTRIBUTE_1; 1468 } 1469 1470 bi_emit_cached_split(b, dest, 3 * 32); 1471 return dest; 1472} 1473 1474static void 1475bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr) 1476{ 1477 bi_index a[4] = { bi_null() }; 1478 bi_emit_split_i32(b, a, bi_emit_lea_image(b, instr), 3); 1479 1480 bi_st_cvt(b, bi_src_index(&instr->src[3]), a[0], a[1], a[2], 1481 bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)), 1482 instr->num_components - 1); 1483} 1484 1485static void 1486bi_emit_atomic_i32_to(bi_builder *b, bi_index dst, 1487 bi_index addr, bi_index arg, nir_intrinsic_op intrinsic) 1488{ 1489 enum bi_atom_opc opc = bi_atom_opc_for_nir(intrinsic); 1490 enum bi_atom_opc post_opc = opc; 1491 bool bifrost = b->shader->arch <= 8; 1492 1493 /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't 1494 * take any vector but can still output in RETURN mode */ 1495 bi_index tmp_dest = bifrost ? bi_temp(b->shader) : dst; 1496 unsigned sr_count = bifrost ? 2 : 1; 1497 1498 /* Generate either ATOM or ATOM1 as required */ 1499 if (bi_promote_atom_c1(opc, arg, &opc)) { 1500 bi_atom1_return_i32_to(b, tmp_dest, bi_extract(b, addr, 0), 1501 bi_extract(b, addr, 1), opc, sr_count); 1502 } else { 1503 bi_atom_return_i32_to(b, tmp_dest, arg, bi_extract(b, addr, 0), 1504 bi_extract(b, addr, 1), opc, sr_count); 1505 } 1506 1507 if (bifrost) { 1508 /* Post-process it */ 1509 bi_emit_cached_split_i32(b, tmp_dest, 2); 1510 bi_atom_post_i32_to(b, dst, bi_extract(b, tmp_dest, 0), bi_extract(b, tmp_dest, 1), post_opc); 1511 } 1512} 1513 1514/* gl_FragCoord.xy = u16_to_f32(R59.xy) + 0.5 1515 * gl_FragCoord.z = ld_vary(fragz) 1516 * gl_FragCoord.w = ld_vary(fragw) 1517 */ 1518 1519static void 1520bi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr) 1521{ 1522 bi_index src[4] = {}; 1523 1524 for (unsigned i = 0; i < 2; ++i) { 1525 src[i] = bi_fadd_f32(b, 1526 bi_u16_to_f32(b, bi_half(bi_preload(b, 59), i)), 1527 bi_imm_f32(0.5f)); 1528 } 1529 1530 for (unsigned i = 0; i < 2; ++i) { 1531 src[2 + i] = bi_ld_var_special(b, bi_zero(), 1532 BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER, 1533 BI_UPDATE_CLOBBER, 1534 (i == 0) ? 
BI_VARYING_NAME_FRAG_Z : 1535 BI_VARYING_NAME_FRAG_W, 1536 BI_VECSIZE_NONE); 1537 } 1538 1539 bi_make_vec_to(b, bi_dest_index(&instr->dest), src, NULL, 4, 32); 1540} 1541 1542static void 1543bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr) 1544{ 1545 bi_index dest = bi_dest_index(&instr->dest); 1546 nir_alu_type T = nir_intrinsic_dest_type(instr); 1547 enum bi_register_format regfmt = bi_reg_fmt_for_nir(T); 1548 unsigned rt = b->shader->inputs->blend.rt; 1549 unsigned size = nir_dest_bit_size(instr->dest); 1550 unsigned nr = instr->num_components; 1551 1552 /* Get the render target */ 1553 if (!b->shader->inputs->is_blend) { 1554 const nir_variable *var = 1555 nir_find_variable_with_driver_location(b->shader->nir, 1556 nir_var_shader_out, nir_intrinsic_base(instr)); 1557 unsigned loc = var->data.location; 1558 assert(loc >= FRAG_RESULT_DATA0); 1559 rt = (loc - FRAG_RESULT_DATA0); 1560 } 1561 1562 bi_index desc = b->shader->inputs->is_blend ? 1563 bi_imm_u32(b->shader->inputs->blend.bifrost_blend_desc >> 32) : 1564 b->shader->inputs->bifrost.static_rt_conv ? 1565 bi_imm_u32(b->shader->inputs->bifrost.rt_conv[rt]) : 1566 bi_load_sysval(b, PAN_SYSVAL(RT_CONVERSION, rt | (size << 4)), 1, 0); 1567 1568 bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_coverage(b), desc, 1569 regfmt, nr - 1); 1570 bi_emit_cached_split(b, dest, size * nr); 1571} 1572 1573static void 1574bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) 1575{ 1576 bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ? 1577 bi_dest_index(&instr->dest) : bi_null(); 1578 gl_shader_stage stage = b->shader->stage; 1579 1580 switch (instr->intrinsic) { 1581 case nir_intrinsic_load_barycentric_pixel: 1582 case nir_intrinsic_load_barycentric_centroid: 1583 case nir_intrinsic_load_barycentric_sample: 1584 case nir_intrinsic_load_barycentric_at_sample: 1585 case nir_intrinsic_load_barycentric_at_offset: 1586 /* handled later via load_vary */ 1587 break; 1588 case nir_intrinsic_load_interpolated_input: 1589 case nir_intrinsic_load_input: 1590 if (b->shader->inputs->is_blend) 1591 bi_emit_load_blend_input(b, instr); 1592 else if (stage == MESA_SHADER_FRAGMENT) 1593 bi_emit_load_vary(b, instr); 1594 else if (stage == MESA_SHADER_VERTEX) 1595 bi_emit_load_attr(b, instr); 1596 else 1597 unreachable("Unsupported shader stage"); 1598 break; 1599 1600 case nir_intrinsic_store_output: 1601 if (stage == MESA_SHADER_FRAGMENT) 1602 bi_emit_fragment_out(b, instr); 1603 else if (stage == MESA_SHADER_VERTEX) 1604 bi_emit_store_vary(b, instr); 1605 else 1606 unreachable("Unsupported shader stage"); 1607 break; 1608 1609 case nir_intrinsic_store_combined_output_pan: 1610 assert(stage == MESA_SHADER_FRAGMENT); 1611 bi_emit_fragment_out(b, instr); 1612 break; 1613 1614 case nir_intrinsic_load_ubo: 1615 case nir_intrinsic_load_kernel_input: 1616 bi_emit_load_ubo(b, instr); 1617 break; 1618 1619 case nir_intrinsic_load_push_constant: 1620 bi_emit_load_push_constant(b, instr); 1621 break; 1622 1623 case nir_intrinsic_load_global: 1624 case nir_intrinsic_load_global_constant: 1625 bi_emit_load(b, instr, BI_SEG_NONE); 1626 break; 1627 1628 case nir_intrinsic_store_global: 1629 bi_emit_store(b, instr, BI_SEG_NONE); 1630 break; 1631 1632 case nir_intrinsic_load_scratch: 1633 bi_emit_load(b, instr, BI_SEG_TL); 1634 break; 1635 1636 case nir_intrinsic_store_scratch: 1637 bi_emit_store(b, instr, BI_SEG_TL); 1638 break; 1639 1640 case nir_intrinsic_load_shared: 1641 bi_emit_load(b, instr, BI_SEG_WLS); 1642 break; 1643 1644 case 
nir_intrinsic_store_shared: 1645 bi_emit_store(b, instr, BI_SEG_WLS); 1646 break; 1647 1648 /* Blob doesn't seem to do anything for memory barriers, note +BARRIER 1649 * is illegal in fragment shaders */ 1650 case nir_intrinsic_memory_barrier: 1651 case nir_intrinsic_memory_barrier_buffer: 1652 case nir_intrinsic_memory_barrier_image: 1653 case nir_intrinsic_memory_barrier_shared: 1654 case nir_intrinsic_group_memory_barrier: 1655 break; 1656 1657 case nir_intrinsic_control_barrier: 1658 assert(b->shader->stage != MESA_SHADER_FRAGMENT); 1659 bi_barrier(b); 1660 break; 1661 1662 case nir_intrinsic_shared_atomic_add: 1663 case nir_intrinsic_shared_atomic_imin: 1664 case nir_intrinsic_shared_atomic_umin: 1665 case nir_intrinsic_shared_atomic_imax: 1666 case nir_intrinsic_shared_atomic_umax: 1667 case nir_intrinsic_shared_atomic_and: 1668 case nir_intrinsic_shared_atomic_or: 1669 case nir_intrinsic_shared_atomic_xor: { 1670 assert(nir_src_bit_size(instr->src[1]) == 32); 1671 1672 bi_index addr = bi_src_index(&instr->src[0]); 1673 bi_index addr_hi; 1674 1675 if (b->shader->arch >= 9) { 1676 bi_handle_segment(b, &addr, &addr_hi, BI_SEG_WLS, NULL); 1677 addr = bi_collect_v2i32(b, addr, addr_hi); 1678 } else { 1679 addr = bi_seg_add_i64(b, addr, bi_zero(), false, BI_SEG_WLS); 1680 bi_emit_cached_split(b, addr, 64); 1681 } 1682 1683 bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]), 1684 instr->intrinsic); 1685 bi_split_dest(b, instr->dest); 1686 break; 1687 } 1688 1689 case nir_intrinsic_image_atomic_add: 1690 case nir_intrinsic_image_atomic_imin: 1691 case nir_intrinsic_image_atomic_umin: 1692 case nir_intrinsic_image_atomic_imax: 1693 case nir_intrinsic_image_atomic_umax: 1694 case nir_intrinsic_image_atomic_and: 1695 case nir_intrinsic_image_atomic_or: 1696 case nir_intrinsic_image_atomic_xor: 1697 assert(nir_src_bit_size(instr->src[3]) == 32); 1698 1699 bi_emit_atomic_i32_to(b, dst, 1700 bi_emit_lea_image(b, instr), 1701 bi_src_index(&instr->src[3]), 1702 instr->intrinsic); 1703 bi_split_dest(b, instr->dest); 1704 break; 1705 1706 case nir_intrinsic_global_atomic_add: 1707 case nir_intrinsic_global_atomic_imin: 1708 case nir_intrinsic_global_atomic_umin: 1709 case nir_intrinsic_global_atomic_imax: 1710 case nir_intrinsic_global_atomic_umax: 1711 case nir_intrinsic_global_atomic_and: 1712 case nir_intrinsic_global_atomic_or: 1713 case nir_intrinsic_global_atomic_xor: 1714 assert(nir_src_bit_size(instr->src[1]) == 32); 1715 1716 bi_emit_atomic_i32_to(b, dst, 1717 bi_src_index(&instr->src[0]), 1718 bi_src_index(&instr->src[1]), 1719 instr->intrinsic); 1720 1721 bi_split_dest(b, instr->dest); 1722 break; 1723 1724 case nir_intrinsic_image_load: 1725 bi_emit_image_load(b, instr); 1726 break; 1727 1728 case nir_intrinsic_image_store: 1729 bi_emit_image_store(b, instr); 1730 break; 1731 1732 case nir_intrinsic_global_atomic_exchange: 1733 bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), 1734 &instr->src[1], BI_SEG_NONE); 1735 bi_split_dest(b, instr->dest); 1736 break; 1737 1738 case nir_intrinsic_image_atomic_exchange: 1739 bi_emit_axchg_to(b, dst, bi_emit_lea_image(b, instr), 1740 &instr->src[3], BI_SEG_NONE); 1741 bi_split_dest(b, instr->dest); 1742 break; 1743 1744 case nir_intrinsic_shared_atomic_exchange: 1745 bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), 1746 &instr->src[1], BI_SEG_WLS); 1747 bi_split_dest(b, instr->dest); 1748 break; 1749 1750 case nir_intrinsic_global_atomic_comp_swap: 1751 bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), 1752 
&instr->src[1], &instr->src[2], BI_SEG_NONE); 1753 bi_split_dest(b, instr->dest); 1754 break; 1755 1756 case nir_intrinsic_image_atomic_comp_swap: 1757 bi_emit_acmpxchg_to(b, dst, bi_emit_lea_image(b, instr), 1758 &instr->src[3], &instr->src[4], BI_SEG_NONE); 1759 bi_split_dest(b, instr->dest); 1760 break; 1761 1762 case nir_intrinsic_shared_atomic_comp_swap: 1763 bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), 1764 &instr->src[1], &instr->src[2], BI_SEG_WLS); 1765 bi_split_dest(b, instr->dest); 1766 break; 1767 1768 case nir_intrinsic_load_frag_coord: 1769 bi_emit_load_frag_coord(b, instr); 1770 break; 1771 1772 case nir_intrinsic_load_output: 1773 bi_emit_ld_tile(b, instr); 1774 break; 1775 1776 case nir_intrinsic_discard_if: 1777 bi_discard_b32(b, bi_src_index(&instr->src[0])); 1778 break; 1779 1780 case nir_intrinsic_discard: 1781 bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ); 1782 break; 1783 1784 case nir_intrinsic_load_ssbo_address: 1785 case nir_intrinsic_load_xfb_address: 1786 bi_load_sysval_nir(b, instr, 2, 0); 1787 break; 1788 1789 case nir_intrinsic_load_work_dim: 1790 case nir_intrinsic_load_num_vertices: 1791 bi_load_sysval_nir(b, instr, 1, 0); 1792 break; 1793 1794 case nir_intrinsic_load_first_vertex: 1795 bi_load_sysval_nir(b, instr, 1, 0); 1796 break; 1797 1798 case nir_intrinsic_load_base_vertex: 1799 bi_load_sysval_nir(b, instr, 1, 4); 1800 break; 1801 1802 case nir_intrinsic_load_base_instance: 1803 bi_load_sysval_nir(b, instr, 1, 8); 1804 break; 1805 1806 case nir_intrinsic_load_draw_id: 1807 bi_load_sysval_nir(b, instr, 1, 0); 1808 break; 1809 1810 case nir_intrinsic_get_ssbo_size: 1811 bi_load_sysval_nir(b, instr, 1, 8); 1812 break; 1813 1814 case nir_intrinsic_load_viewport_scale: 1815 case nir_intrinsic_load_viewport_offset: 1816 case nir_intrinsic_load_num_workgroups: 1817 case nir_intrinsic_load_workgroup_size: 1818 bi_load_sysval_nir(b, instr, 3, 0); 1819 break; 1820 1821 case nir_intrinsic_image_size: 1822 bi_load_sysval_nir(b, instr, 1823 nir_dest_num_components(instr->dest), 0); 1824 break; 1825 1826 case nir_intrinsic_load_blend_const_color_rgba: 1827 bi_load_sysval_nir(b, instr, 1828 nir_dest_num_components(instr->dest), 0); 1829 break; 1830 1831 case nir_intrinsic_load_sample_positions_pan: 1832 bi_collect_v2i32_to(b, dst, 1833 bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false), 1834 bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true)); 1835 break; 1836 1837 case nir_intrinsic_load_sample_mask_in: 1838 /* r61[0:15] contains the coverage bitmap */ 1839 bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false)); 1840 break; 1841 1842 case nir_intrinsic_load_sample_id: 1843 bi_load_sample_id_to(b, dst); 1844 break; 1845 1846 case nir_intrinsic_load_front_face: 1847 /* r58 == 0 means primitive is front facing */ 1848 bi_icmp_i32_to(b, dst, bi_preload(b, 58), bi_zero(), BI_CMPF_EQ, 1849 BI_RESULT_TYPE_M1); 1850 break; 1851 1852 case nir_intrinsic_load_point_coord: 1853 bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32, 1854 BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER, 1855 BI_VARYING_NAME_POINT, BI_VECSIZE_V2); 1856 bi_emit_cached_split_i32(b, dst, 2); 1857 break; 1858 1859 /* It appears vertex_id is zero-based with Bifrost geometry flows, but 1860 * not with Valhall's memory-allocation IDVS geometry flow. Ostensibly 1861 * we support the legacy geometry flow even on Valhall, so 1862 * vertex_id_zero_based isn't a machine property for us. Don't set it, 1863 * and lower here if needed. 
1864 */ 1865 case nir_intrinsic_load_vertex_id: 1866 if (b->shader->malloc_idvs) { 1867 bi_mov_i32_to(b, dst, bi_vertex_id(b)); 1868 } else { 1869 bi_index first = bi_load_sysval(b, 1870 PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS, 1871 1, 0); 1872 1873 bi_iadd_u32_to(b, dst, bi_vertex_id(b), first, false); 1874 } 1875 1876 break; 1877 1878 /* We only use in our transform feedback lowering */ 1879 case nir_intrinsic_load_vertex_id_zero_base: 1880 assert(b->shader->nir->info.has_transform_feedback_varyings); 1881 bi_mov_i32_to(b, dst, bi_vertex_id(b)); 1882 break; 1883 1884 case nir_intrinsic_load_instance_id: 1885 bi_mov_i32_to(b, dst, bi_instance_id(b)); 1886 break; 1887 1888 case nir_intrinsic_load_subgroup_invocation: 1889 bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false)); 1890 break; 1891 1892 case nir_intrinsic_load_local_invocation_id: 1893 bi_collect_v3i32_to(b, dst, 1894 bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)), 1895 bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)), 1896 bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0))); 1897 break; 1898 1899 case nir_intrinsic_load_workgroup_id: 1900 bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58), 1901 bi_preload(b, 59)); 1902 break; 1903 1904 case nir_intrinsic_load_global_invocation_id: 1905 case nir_intrinsic_load_global_invocation_id_zero_base: 1906 bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61), 1907 bi_preload(b, 62)); 1908 break; 1909 1910 case nir_intrinsic_shader_clock: 1911 bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER); 1912 bi_split_dest(b, instr->dest); 1913 break; 1914 1915 default: 1916 fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name); 1917 assert(0); 1918 } 1919} 1920 1921static void 1922bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr) 1923{ 1924 /* Make sure we've been lowered */ 1925 assert(instr->def.num_components <= (32 / instr->def.bit_size)); 1926 1927 /* Accumulate all the channels of the constant, as if we did an 1928 * implicit SEL over them */ 1929 uint32_t acc = 0; 1930 1931 for (unsigned i = 0; i < instr->def.num_components; ++i) { 1932 unsigned v = nir_const_value_as_uint(instr->value[i], instr->def.bit_size); 1933 acc |= (v << (i * instr->def.bit_size)); 1934 } 1935 1936 bi_mov_i32_to(b, bi_get_index(instr->def.index, false, 0), bi_imm_u32(acc)); 1937} 1938 1939static bi_index 1940bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps) 1941{ 1942 /* we don't lower modifiers until the backend */ 1943 assert(!(src.negate || src.abs)); 1944 1945 unsigned bitsize = nir_src_bit_size(src.src); 1946 1947 /* the bi_index carries the 32-bit (word) offset separate from the 1948 * subword swizzle, first handle the offset */ 1949 1950 unsigned offset = 0; 1951 1952 assert(bitsize == 8 || bitsize == 16 || bitsize == 32); 1953 unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2; 1954 1955 for (unsigned i = 0; i < comps; ++i) { 1956 unsigned new_offset = (src.swizzle[i] >> subword_shift); 1957 1958 if (i > 0) 1959 assert(offset == new_offset && "wrong vectorization"); 1960 1961 offset = new_offset; 1962 } 1963 1964 bi_index idx = bi_extract(b, bi_src_index(&src.src), offset); 1965 1966 /* Compose the subword swizzle with existing (identity) swizzle */ 1967 assert(idx.swizzle == BI_SWIZZLE_H01); 1968 1969 /* Bigger vectors should have been lowered */ 1970 assert(comps <= (1 << subword_shift)); 1971 1972 if (bitsize == 16) { 1973 unsigned c0 = src.swizzle[0] & 1; 1974 unsigned c1 = (comps > 1) ? 
src.swizzle[1] & 1 : c0; 1975 idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1); 1976 } else if (bitsize == 8) { 1977 /* 8-bit vectors not yet supported */ 1978 assert(comps == 1 && "8-bit vectors not supported"); 1979 assert(src.swizzle[0] < 4 && "8-bit vectors not supported"); 1980 idx.swizzle = BI_SWIZZLE_B0000 + src.swizzle[0]; 1981 } 1982 1983 return idx; 1984} 1985 1986static enum bi_round 1987bi_nir_round(nir_op op) 1988{ 1989 switch (op) { 1990 case nir_op_fround_even: return BI_ROUND_NONE; 1991 case nir_op_ftrunc: return BI_ROUND_RTZ; 1992 case nir_op_fceil: return BI_ROUND_RTP; 1993 case nir_op_ffloor: return BI_ROUND_RTN; 1994 default: unreachable("invalid nir round op"); 1995 } 1996} 1997 1998/* Convenience for lowered transcendentals */ 1999 2000static bi_index 2001bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1) 2002{ 2003 return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f)); 2004} 2005 2006/* Approximate with FRCP_APPROX.f32 and apply a single iteration of 2007 * Newton-Raphson to improve precision */ 2008 2009static void 2010bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0) 2011{ 2012 bi_index x1 = bi_frcp_approx_f32(b, s0); 2013 bi_index m = bi_frexpm_f32(b, s0, false, false); 2014 bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false); 2015 bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0), 2016 bi_zero(), BI_SPECIAL_N); 2017 bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, BI_SPECIAL_NONE); 2018} 2019 2020static void 2021bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0) 2022{ 2023 bi_index x1 = bi_frsq_approx_f32(b, s0); 2024 bi_index m = bi_frexpm_f32(b, s0, false, true); 2025 bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true); 2026 bi_index t1 = bi_fmul_f32(b, x1, x1); 2027 bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0), 2028 bi_imm_u32(-1), BI_SPECIAL_N); 2029 bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, BI_SPECIAL_N); 2030} 2031 2032/* More complex transcendentals, see 2033 * https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc 2034 * for documentation */ 2035 2036static void 2037bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0) 2038{ 2039 bi_index t1 = bi_temp(b->shader); 2040 bi_instr *t1_instr = bi_fadd_f32_to(b, t1, s0, bi_imm_u32(0x49400000)); 2041 t1_instr->clamp = BI_CLAMP_CLAMP_0_INF; 2042 2043 bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000)); 2044 2045 bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), s0, bi_neg(t2)); 2046 a2->clamp = BI_CLAMP_CLAMP_M1_1; 2047 2048 bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE); 2049 bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false); 2050 bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4)); 2051 bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635), 2052 bi_imm_u32(0x3e75fffa)); 2053 bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], bi_imm_u32(0x3f317218)); 2054 bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2); 2055 bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader), 2056 p3, a1t, a1t, a1i, BI_SPECIAL_NONE); 2057 x->clamp = BI_CLAMP_CLAMP_0_INF; 2058 2059 bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0); 2060 max->sem = BI_SEM_NAN_PROPAGATE; 2061} 2062 2063static void 2064bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base) 2065{ 2066 /* Scale by base, Multiply by 2*24 and convert to integer to get a 8:24 2067 * fixed-point input */ 2068 bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(), 2069 bi_imm_u32(24), BI_SPECIAL_NONE); 2070 bi_instr *fixed_pt = 
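/* Converting the scaled value to a 32-bit integer yields the 8:24
 * fixed-point exponent that FEXP consumes below. E.g. an exponent of 1.5
 * scaled by 2^24 becomes 25165824 = 0x01800000: integer part 1 in the top
 * byte, fractional part 0.5 in the low 24 bits. */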
bi_f32_to_s32_to(b, bi_temp(b->shader), scale); 2071 fixed_pt->round = BI_ROUND_NONE; // XXX 2072 2073 /* Compute the result for the fixed-point input, but pass along 2074 * the floating-point scale for correct NaN propagation */ 2075 bi_fexp_f32_to(b, dst, fixed_pt->dest[0], scale); 2076} 2077 2078static void 2079bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0) 2080{ 2081 /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */ 2082 bi_index a1 = bi_frexpm_f32(b, s0, true, false); 2083 bi_index ei = bi_frexpe_f32(b, s0, true, false); 2084 bi_index ef = bi_s32_to_f32(b, ei); 2085 2086 /* xt estimates -log(r1), a coarse approximation of log(a1) */ 2087 bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE); 2088 bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE); 2089 2090 /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) - 2091 * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1), 2092 * and then log(s0) = x1 + x2 */ 2093 bi_index x1 = bi_fadd_f32(b, ef, xt); 2094 2095 /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by 2096 * polynomial approximation around 1. The series is expressed around 2097 * 1, so set y = (a1 * r1) - 1.0 */ 2098 bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0)); 2099 2100 /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate 2101 * log_e(1 + y) by the Taylor series (lower precision than the blob): 2102 * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */ 2103 bi_index loge = bi_fmul_f32(b, y, 2104 bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0))); 2105 2106 bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0))); 2107 2108 /* log(s0) = x1 + x2 */ 2109 bi_fadd_f32_to(b, dst, x1, x2); 2110} 2111 2112static void 2113bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0) 2114{ 2115 bi_index frexp = bi_frexpe_f32(b, s0, true, false); 2116 bi_index frexpi = bi_s32_to_f32(b, frexp); 2117 bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0); 2118 bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi); 2119} 2120 2121static void 2122bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp) 2123{ 2124 bi_index log2_base = bi_null(); 2125 2126 if (base.type == BI_INDEX_CONSTANT) { 2127 log2_base = bi_imm_f32(log2f(uif(base.value))); 2128 } else { 2129 log2_base = bi_temp(b->shader); 2130 bi_lower_flog2_32(b, log2_base, base); 2131 } 2132 2133 return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base)); 2134} 2135 2136static void 2137bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp) 2138{ 2139 bi_index log2_base = bi_null(); 2140 2141 if (base.type == BI_INDEX_CONSTANT) { 2142 log2_base = bi_imm_f32(log2f(uif(base.value))); 2143 } else { 2144 log2_base = bi_temp(b->shader); 2145 bi_flog2_32(b, log2_base, base); 2146 } 2147 2148 return bi_fexp_32(b, dst, exp, log2_base); 2149} 2150 2151/* Bifrost has extremely coarse tables for approximating sin/cos, accessible as 2152 * FSIN/COS_TABLE.u6, which multiplies the bottom 6-bits by pi/32 and 2153 * calculates the results. 
We use them to calculate sin/cos via a Taylor 2154 * approximation: 2155 * 2156 * f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x) 2157 * sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x) 2158 * cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x) 2159 */ 2160 2161#define TWO_OVER_PI bi_imm_f32(2.0f / 3.14159f) 2162#define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0) 2163#define SINCOS_BIAS bi_imm_u32(0x49400000) 2164 2165static void 2166bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos) 2167{ 2168 /* bottom 6-bits of result times pi/32 approximately s0 mod 2pi */ 2169 bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS); 2170 2171 /* Approximate domain error (small) */ 2172 bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS)), 2173 MPI_OVER_TWO, s0); 2174 2175 /* Lookup sin(x), cos(x) */ 2176 bi_index sinx = bi_fsin_table_u6(b, x_u6, false); 2177 bi_index cosx = bi_fcos_table_u6(b, x_u6, false); 2178 2179 /* e^2 / 2 */ 2180 bi_index e2_over_2 = bi_fma_rscale_f32(b, e, e, bi_negzero(), 2181 bi_imm_u32(-1), BI_SPECIAL_NONE); 2182 2183 /* (-e^2)/2 f''(x) */ 2184 bi_index quadratic = bi_fma_f32(b, bi_neg(e2_over_2), 2185 cos ? cosx : sinx, 2186 bi_negzero()); 2187 2188 /* e f'(x) - (e^2/2) f''(x) */ 2189 bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e, 2190 cos ? bi_neg(sinx) : cosx, 2191 quadratic); 2192 I->clamp = BI_CLAMP_CLAMP_M1_1; 2193 2194 /* f(x) + e f'(x) - (e^2/2) f''(x) */ 2195 bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx); 2196} 2197 2198/* 2199 * The XOR lane op is useful for derivative calculations, but not all Bifrost 2200 * implementations have it. Add a safe helper that uses the hardware 2201 * functionality when available and lowers where unavailable. 2202 */ 2203static bi_index 2204bi_clper_xor(bi_builder *b, bi_index s0, bi_index s1) 2205{ 2206 if (!(b->shader->quirks & BIFROST_LIMITED_CLPER)) { 2207 return bi_clper_i32(b, s0, s1, 2208 BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_XOR, 2209 BI_SUBGROUP_SUBGROUP4); 2210 } 2211 2212 bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false); 2213 bi_index lane = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0)); 2214 return bi_clper_old_i32(b, s0, lane); 2215} 2216 2217static enum bi_cmpf 2218bi_translate_cmpf(nir_op op) 2219{ 2220 switch (op) { 2221 case nir_op_ieq8: 2222 case nir_op_ieq16: 2223 case nir_op_ieq32: 2224 case nir_op_feq16: 2225 case nir_op_feq32: 2226 return BI_CMPF_EQ; 2227 2228 case nir_op_ine8: 2229 case nir_op_ine16: 2230 case nir_op_ine32: 2231 case nir_op_fneu16: 2232 case nir_op_fneu32: 2233 return BI_CMPF_NE; 2234 2235 case nir_op_ilt8: 2236 case nir_op_ilt16: 2237 case nir_op_ilt32: 2238 case nir_op_flt16: 2239 case nir_op_flt32: 2240 case nir_op_ult8: 2241 case nir_op_ult16: 2242 case nir_op_ult32: 2243 return BI_CMPF_LT; 2244 2245 case nir_op_ige8: 2246 case nir_op_ige16: 2247 case nir_op_ige32: 2248 case nir_op_fge16: 2249 case nir_op_fge32: 2250 case nir_op_uge8: 2251 case nir_op_uge16: 2252 case nir_op_uge32: 2253 return BI_CMPF_GE; 2254 2255 default: 2256 unreachable("invalid comparison"); 2257 } 2258} 2259 2260static bool 2261bi_nir_is_replicated(nir_alu_src *src) 2262{ 2263 for (unsigned i = 1; i < nir_src_num_components(src->src); ++i) { 2264 if (src->swizzle[0] == src->swizzle[i]) 2265 return false; 2266 } 2267 2268 return true; 2269} 2270 2271static void 2272bi_emit_alu(bi_builder *b, nir_alu_instr *instr) 2273{ 2274 bi_index dst = bi_dest_index(&instr->dest.dest); 2275 unsigned srcs = nir_op_infos[instr->op].num_inputs; 2276 unsigned sz = 
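/* Opcode selection below keys off the destination bit size; conversion ops
 * additionally consult src_sz, since their source width may differ. */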
nir_dest_bit_size(instr->dest.dest); 2277 unsigned comps = nir_dest_num_components(instr->dest.dest); 2278 unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0; 2279 2280 /* Indicate scalarness */ 2281 if (sz == 16 && comps == 1) 2282 dst.swizzle = BI_SWIZZLE_H00; 2283 2284 if (!instr->dest.dest.is_ssa) { 2285 for (unsigned i = 0; i < comps; ++i) 2286 assert(instr->dest.write_mask); 2287 } 2288 2289 /* First, match against the various moves in NIR. These are 2290 * special-cased because they can operate on vectors even after 2291 * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the 2292 * instruction is no "bigger" than SIMD-within-a-register. These moves 2293 * are the exceptions that need to handle swizzles specially. */ 2294 2295 switch (instr->op) { 2296 case nir_op_vec2: 2297 case nir_op_vec3: 2298 case nir_op_vec4: { 2299 bi_index unoffset_srcs[4] = { 2300 srcs > 0 ? bi_src_index(&instr->src[0].src) : bi_null(), 2301 srcs > 1 ? bi_src_index(&instr->src[1].src) : bi_null(), 2302 srcs > 2 ? bi_src_index(&instr->src[2].src) : bi_null(), 2303 srcs > 3 ? bi_src_index(&instr->src[3].src) : bi_null(), 2304 }; 2305 2306 unsigned channels[4] = { 2307 instr->src[0].swizzle[0], 2308 instr->src[1].swizzle[0], 2309 srcs > 2 ? instr->src[2].swizzle[0] : 0, 2310 srcs > 3 ? instr->src[3].swizzle[0] : 0, 2311 }; 2312 2313 bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz); 2314 return; 2315 } 2316 2317 case nir_op_vec8: 2318 case nir_op_vec16: 2319 unreachable("should've been lowered"); 2320 2321 case nir_op_unpack_32_2x16: { 2322 /* Should have been scalarized */ 2323 assert(comps == 2 && sz == 16); 2324 2325 bi_index vec = bi_src_index(&instr->src[0].src); 2326 unsigned chan = instr->src[0].swizzle[0]; 2327 2328 bi_mov_i32_to(b, dst, bi_extract(b, vec, chan)); 2329 return; 2330 } 2331 2332 case nir_op_unpack_64_2x32_split_x: 2333 bi_mov_i32_to(b, dst, bi_extract(b, bi_src_index(&instr->src[0].src), 0)); 2334 return; 2335 2336 case nir_op_unpack_64_2x32_split_y: 2337 bi_mov_i32_to(b, dst, bi_extract(b, bi_src_index(&instr->src[0].src), 1)); 2338 return; 2339 2340 case nir_op_pack_64_2x32_split: 2341 bi_collect_v2i32_to(b, dst, 2342 bi_extract(b, bi_src_index(&instr->src[0].src), instr->src[0].swizzle[0]), 2343 bi_extract(b, bi_src_index(&instr->src[1].src), instr->src[1].swizzle[0])); 2344 return; 2345 2346 case nir_op_pack_64_2x32: 2347 bi_collect_v2i32_to(b, dst, 2348 bi_extract(b, bi_src_index(&instr->src[0].src), 0), 2349 bi_extract(b, bi_src_index(&instr->src[0].src), 1)); 2350 return; 2351 2352 case nir_op_pack_uvec2_to_uint: { 2353 bi_index src = bi_src_index(&instr->src[0].src); 2354 2355 assert(sz == 32 && src_sz == 32); 2356 bi_mkvec_v2i16_to(b, dst, bi_half(bi_extract(b, src, 0), false), 2357 bi_half(bi_extract(b, src, 1), false)); 2358 return; 2359 } 2360 2361 case nir_op_pack_uvec4_to_uint: { 2362 bi_index src = bi_src_index(&instr->src[0].src); 2363 2364 assert(sz == 32 && src_sz == 32); 2365 bi_mkvec_v4i8_to(b, dst, bi_byte(bi_extract(b, src, 0), 0), 2366 bi_byte(bi_extract(b, src, 1), 0), 2367 bi_byte(bi_extract(b, src, 2), 0), 2368 bi_byte(bi_extract(b, src, 3), 0)); 2369 return; 2370 } 2371 2372 case nir_op_mov: { 2373 bi_index idx = bi_src_index(&instr->src[0].src); 2374 bi_index unoffset_srcs[4] = { idx, idx, idx, idx }; 2375 2376 unsigned channels[4] = { 2377 comps > 0 ? instr->src[0].swizzle[0] : 0, 2378 comps > 1 ? instr->src[0].swizzle[1] : 0, 2379 comps > 2 ? instr->src[0].swizzle[2] : 0, 2380 comps > 3 ? 
instr->src[0].swizzle[3] : 0, 2381 }; 2382 2383 bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, src_sz); 2384 return; 2385 } 2386 2387 case nir_op_pack_32_2x16: { 2388 assert(nir_src_num_components(instr->src[0].src) == 2); 2389 assert(comps == 1); 2390 2391 bi_index idx = bi_src_index(&instr->src[0].src); 2392 bi_index unoffset_srcs[4] = { idx, idx, idx, idx }; 2393 2394 unsigned channels[2] = { 2395 instr->src[0].swizzle[0], 2396 instr->src[0].swizzle[1] 2397 }; 2398 2399 bi_make_vec_to(b, dst, unoffset_srcs, channels, 2, 16); 2400 return; 2401 } 2402 2403 case nir_op_f2f16: 2404 case nir_op_f2f16_rtz: 2405 case nir_op_f2f16_rtne: { 2406 assert(src_sz == 32); 2407 bi_index idx = bi_src_index(&instr->src[0].src); 2408 bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]); 2409 bi_index s1 = comps > 1 ? 2410 bi_extract(b, idx, instr->src[0].swizzle[1]) : s0; 2411 2412 bi_instr *I = bi_v2f32_to_v2f16_to(b, dst, s0, s1); 2413 2414 /* Override rounding if explicitly requested. Otherwise, the 2415 * default rounding mode is selected by the builder. Depending 2416 * on the float controls required by the shader, the default 2417 * mode may not be nearest-even. 2418 */ 2419 if (instr->op == nir_op_f2f16_rtz) 2420 I->round = BI_ROUND_RTZ; 2421 else if (instr->op == nir_op_f2f16_rtne) 2422 I->round = BI_ROUND_NONE; /* Nearest even */ 2423 2424 return; 2425 } 2426 2427 /* Vectorized downcasts */ 2428 case nir_op_u2u16: 2429 case nir_op_i2i16: { 2430 if (!(src_sz == 32 && comps == 2)) 2431 break; 2432 2433 bi_index idx = bi_src_index(&instr->src[0].src); 2434 bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]); 2435 bi_index s1 = bi_extract(b, idx, instr->src[0].swizzle[1]); 2436 2437 bi_mkvec_v2i16_to(b, dst, 2438 bi_half(s0, false), bi_half(s1, false)); 2439 return; 2440 } 2441 2442 /* While we do not have a direct V2U32_TO_V2F16 instruction, lowering to 2443 * MKVEC.v2i16 + V2U16_TO_V2F16 is more efficient on Bifrost than 2444 * scalarizing due to scheduling (equal cost on Valhall). Additionally 2445 * if the source is replicated the MKVEC.v2i16 can be optimized out. 2446 */ 2447 case nir_op_u2f16: 2448 case nir_op_i2f16: { 2449 if (!(src_sz == 32 && comps == 2)) 2450 break; 2451 2452 nir_alu_src *src = &instr->src[0]; 2453 bi_index idx = bi_src_index(&src->src); 2454 bi_index s0 = bi_extract(b, idx, src->swizzle[0]); 2455 bi_index s1 = bi_extract(b, idx, src->swizzle[1]); 2456 2457 bi_index t = (src->swizzle[0] == src->swizzle[1]) ? 2458 bi_half(s0, false) : 2459 bi_mkvec_v2i16(b, bi_half(s0, false), 2460 bi_half(s1, false)); 2461 2462 if (instr->op == nir_op_u2f16) 2463 bi_v2u16_to_v2f16_to(b, dst, t); 2464 else 2465 bi_v2s16_to_v2f16_to(b, dst, t); 2466 2467 return; 2468 } 2469 2470 case nir_op_i2i8: 2471 case nir_op_u2u8: 2472 { 2473 /* Acts like an 8-bit swizzle */ 2474 bi_index idx = bi_src_index(&instr->src[0].src); 2475 unsigned factor = src_sz / 8; 2476 unsigned chan[4] = { 0 }; 2477 2478 for (unsigned i = 0; i < comps; ++i) 2479 chan[i] = instr->src[0].swizzle[i] * factor; 2480 2481 bi_make_vec_to(b, dst, &idx, chan, comps, 8); 2482 return; 2483 } 2484 2485 case nir_op_b32csel: 2486 { 2487 if (sz != 16) 2488 break; 2489 2490 /* We allow vectorizing b32csel(cond, A, B) which can be 2491 * translated as MUX.v2i16, even though cond is a 32-bit vector. 2492 * 2493 * If the source condition vector is replicated, we can use 2494 * MUX.v2i16 directly, letting each component use the 2495 * corresponding half of the 32-bit source. 
NIR uses 0/~0 2496 * booleans so that's guaranteed to work (that is, 32-bit NIR 2497 * booleans are 16-bit replicated). 2498 * 2499 * If we're not replicated, we use the same trick but must 2500 * insert a MKVEC.v2i16 first to convert down to 16-bit. 2501 */ 2502 bi_index idx = bi_src_index(&instr->src[0].src); 2503 bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]); 2504 bi_index s1 = bi_alu_src_index(b, instr->src[1], comps); 2505 bi_index s2 = bi_alu_src_index(b, instr->src[2], comps); 2506 2507 if (!bi_nir_is_replicated(&instr->src[0])) { 2508 s0 = bi_mkvec_v2i16(b, bi_half(s0, false), 2509 bi_half(bi_extract(b, idx, instr->src[0].swizzle[1]), false)); 2510 } 2511 2512 bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); 2513 return; 2514 } 2515 2516 default: 2517 break; 2518 } 2519 2520 bi_index s0 = srcs > 0 ? bi_alu_src_index(b, instr->src[0], comps) : bi_null(); 2521 bi_index s1 = srcs > 1 ? bi_alu_src_index(b, instr->src[1], comps) : bi_null(); 2522 bi_index s2 = srcs > 2 ? bi_alu_src_index(b, instr->src[2], comps) : bi_null(); 2523 2524 switch (instr->op) { 2525 case nir_op_ffma: 2526 bi_fma_to(b, sz, dst, s0, s1, s2); 2527 break; 2528 2529 case nir_op_fmul: 2530 bi_fma_to(b, sz, dst, s0, s1, bi_negzero()); 2531 break; 2532 2533 case nir_op_fsub: 2534 s1 = bi_neg(s1); 2535 FALLTHROUGH; 2536 case nir_op_fadd: 2537 bi_fadd_to(b, sz, dst, s0, s1); 2538 break; 2539 2540 case nir_op_fsat: { 2541 bi_instr *I = bi_fclamp_to(b, sz, dst, s0); 2542 I->clamp = BI_CLAMP_CLAMP_0_1; 2543 break; 2544 } 2545 2546 case nir_op_fsat_signed_mali: { 2547 bi_instr *I = bi_fclamp_to(b, sz, dst, s0); 2548 I->clamp = BI_CLAMP_CLAMP_M1_1; 2549 break; 2550 } 2551 2552 case nir_op_fclamp_pos_mali: { 2553 bi_instr *I = bi_fclamp_to(b, sz, dst, s0); 2554 I->clamp = BI_CLAMP_CLAMP_0_INF; 2555 break; 2556 } 2557 2558 case nir_op_fneg: 2559 bi_fabsneg_to(b, sz, dst, bi_neg(s0)); 2560 break; 2561 2562 case nir_op_fabs: 2563 bi_fabsneg_to(b, sz, dst, bi_abs(s0)); 2564 break; 2565 2566 case nir_op_fsin: 2567 bi_lower_fsincos_32(b, dst, s0, false); 2568 break; 2569 2570 case nir_op_fcos: 2571 bi_lower_fsincos_32(b, dst, s0, true); 2572 break; 2573 2574 case nir_op_fexp2: 2575 assert(sz == 32); /* should've been lowered */ 2576 2577 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) 2578 bi_lower_fexp2_32(b, dst, s0); 2579 else 2580 bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f)); 2581 2582 break; 2583 2584 case nir_op_flog2: 2585 assert(sz == 32); /* should've been lowered */ 2586 2587 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) 2588 bi_lower_flog2_32(b, dst, s0); 2589 else 2590 bi_flog2_32(b, dst, s0); 2591 2592 break; 2593 2594 case nir_op_fpow: 2595 assert(sz == 32); /* should've been lowered */ 2596 2597 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) 2598 bi_lower_fpow_32(b, dst, s0, s1); 2599 else 2600 bi_fpow_32(b, dst, s0, s1); 2601 2602 break; 2603 2604 case nir_op_frexp_exp: 2605 bi_frexpe_to(b, sz, dst, s0, false, false); 2606 break; 2607 2608 case nir_op_frexp_sig: 2609 bi_frexpm_to(b, sz, dst, s0, false, false); 2610 break; 2611 2612 case nir_op_ldexp: 2613 bi_ldexp_to(b, sz, dst, s0, s1); 2614 break; 2615 2616 case nir_op_b8csel: 2617 bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); 2618 break; 2619 2620 case nir_op_b16csel: 2621 bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); 2622 break; 2623 2624 case nir_op_b32csel: 2625 bi_mux_i32_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); 2626 break; 2627 2628 case nir_op_ishl: 2629 bi_lshift_or_to(b, sz, dst, s0, bi_zero(), 
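/* The shift amount is read from the low byte of s1; OR-ing with zero turns
 * the fused shift-and-bitwise op into a plain shift. */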
bi_byte(s1, 0)); 2630 break; 2631 case nir_op_ushr: 2632 bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), false); 2633 break; 2634 2635 case nir_op_ishr: 2636 if (b->shader->arch >= 9) 2637 bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), true); 2638 else 2639 bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0)); 2640 break; 2641 2642 case nir_op_imin: 2643 case nir_op_umin: 2644 bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, 2645 s0, s1, s0, s1, BI_CMPF_LT); 2646 break; 2647 2648 case nir_op_imax: 2649 case nir_op_umax: 2650 bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, 2651 s0, s1, s0, s1, BI_CMPF_GT); 2652 break; 2653 2654 case nir_op_fddx_must_abs_mali: 2655 case nir_op_fddy_must_abs_mali: { 2656 bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 1 : 2); 2657 bi_index adjacent = bi_clper_xor(b, s0, bit); 2658 bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0)); 2659 break; 2660 } 2661 2662 case nir_op_fddx: 2663 case nir_op_fddy: 2664 case nir_op_fddx_coarse: 2665 case nir_op_fddy_coarse: 2666 case nir_op_fddx_fine: 2667 case nir_op_fddy_fine: { 2668 unsigned axis; 2669 switch (instr->op) { 2670 case nir_op_fddx: 2671 case nir_op_fddx_coarse: 2672 case nir_op_fddx_fine: 2673 axis = 1; 2674 break; 2675 case nir_op_fddy: 2676 case nir_op_fddy_coarse: 2677 case nir_op_fddy_fine: 2678 axis = 2; 2679 break; 2680 default: 2681 unreachable("Invalid derivative op"); 2682 } 2683 2684 bi_index lane1, lane2; 2685 switch (instr->op) { 2686 case nir_op_fddx: 2687 case nir_op_fddx_fine: 2688 case nir_op_fddy: 2689 case nir_op_fddy_fine: 2690 lane1 = bi_lshift_and_i32(b, 2691 bi_fau(BIR_FAU_LANE_ID, false), 2692 bi_imm_u32(0x3 & ~axis), 2693 bi_imm_u8(0)); 2694 2695 lane2 = bi_iadd_u32(b, lane1, 2696 bi_imm_u32(axis), 2697 false); 2698 break; 2699 case nir_op_fddx_coarse: 2700 case nir_op_fddy_coarse: 2701 lane1 = bi_imm_u32(0); 2702 lane2 = bi_imm_u32(axis); 2703 break; 2704 default: 2705 unreachable("Invalid derivative op"); 2706 } 2707 2708 bi_index left, right; 2709 2710 if (b->shader->quirks & BIFROST_LIMITED_CLPER) { 2711 left = bi_clper_old_i32(b, s0, lane1); 2712 right = bi_clper_old_i32(b, s0, lane2); 2713 } else { 2714 left = bi_clper_i32(b, s0, lane1, 2715 BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, 2716 BI_SUBGROUP_SUBGROUP4); 2717 2718 right = bi_clper_i32(b, s0, lane2, 2719 BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, 2720 BI_SUBGROUP_SUBGROUP4); 2721 } 2722 2723 bi_fadd_to(b, sz, dst, right, bi_neg(left)); 2724 break; 2725 } 2726 2727 case nir_op_f2f32: 2728 bi_f16_to_f32_to(b, dst, s0); 2729 break; 2730 2731 case nir_op_fquantize2f16: 2732 { 2733 bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0); 2734 bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false)); 2735 2736 f16->ftz = f32->ftz = true; 2737 break; 2738 } 2739 2740 case nir_op_f2i32: 2741 if (src_sz == 32) 2742 bi_f32_to_s32_to(b, dst, s0); 2743 else 2744 bi_f16_to_s32_to(b, dst, s0); 2745 break; 2746 2747 /* Note 32-bit sources => no vectorization, so 32-bit works */ 2748 case nir_op_f2u16: 2749 if (src_sz == 32) 2750 bi_f32_to_u32_to(b, dst, s0); 2751 else 2752 bi_v2f16_to_v2u16_to(b, dst, s0); 2753 break; 2754 2755 case nir_op_f2i16: 2756 if (src_sz == 32) 2757 bi_f32_to_s32_to(b, dst, s0); 2758 else 2759 bi_v2f16_to_v2s16_to(b, dst, s0); 2760 break; 2761 2762 case nir_op_f2u32: 2763 if (src_sz == 32) 2764 bi_f32_to_u32_to(b, dst, s0); 2765 else 2766 bi_f16_to_u32_to(b, dst, s0); 2767 break; 2768 2769 case nir_op_u2f16: 2770 if 
(src_sz == 32) 2771 bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false)); 2772 else if (src_sz == 16) 2773 bi_v2u16_to_v2f16_to(b, dst, s0); 2774 else if (src_sz == 8) 2775 bi_v2u8_to_v2f16_to(b, dst, s0); 2776 break; 2777 2778 case nir_op_u2f32: 2779 if (src_sz == 32) 2780 bi_u32_to_f32_to(b, dst, s0); 2781 else if (src_sz == 16) 2782 bi_u16_to_f32_to(b, dst, s0); 2783 else 2784 bi_u8_to_f32_to(b, dst, s0); 2785 break; 2786 2787 case nir_op_i2f16: 2788 if (src_sz == 32) 2789 bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false)); 2790 else if (src_sz == 16) 2791 bi_v2s16_to_v2f16_to(b, dst, s0); 2792 else if (src_sz == 8) 2793 bi_v2s8_to_v2f16_to(b, dst, s0); 2794 break; 2795 2796 case nir_op_i2f32: 2797 assert(src_sz == 32 || src_sz == 16 || src_sz == 8); 2798 2799 if (src_sz == 32) 2800 bi_s32_to_f32_to(b, dst, s0); 2801 else if (src_sz == 16) 2802 bi_s16_to_f32_to(b, dst, s0); 2803 else if (src_sz == 8) 2804 bi_s8_to_f32_to(b, dst, s0); 2805 break; 2806 2807 case nir_op_i2i32: 2808 assert(src_sz == 32 || src_sz == 16 || src_sz == 8); 2809 2810 if (src_sz == 32) 2811 bi_mov_i32_to(b, dst, s0); 2812 else if (src_sz == 16) 2813 bi_s16_to_s32_to(b, dst, s0); 2814 else if (src_sz == 8) 2815 bi_s8_to_s32_to(b, dst, s0); 2816 break; 2817 2818 case nir_op_u2u32: 2819 assert(src_sz == 32 || src_sz == 16 || src_sz == 8); 2820 2821 if (src_sz == 32) 2822 bi_mov_i32_to(b, dst, s0); 2823 else if (src_sz == 16) 2824 bi_u16_to_u32_to(b, dst, s0); 2825 else if (src_sz == 8) 2826 bi_u8_to_u32_to(b, dst, s0); 2827 2828 break; 2829 2830 case nir_op_i2i16: 2831 assert(src_sz == 8 || src_sz == 32); 2832 2833 if (src_sz == 8) 2834 bi_v2s8_to_v2s16_to(b, dst, s0); 2835 else 2836 bi_mov_i32_to(b, dst, s0); 2837 break; 2838 2839 case nir_op_u2u16: 2840 assert(src_sz == 8 || src_sz == 32); 2841 2842 if (src_sz == 8) 2843 bi_v2u8_to_v2u16_to(b, dst, s0); 2844 else 2845 bi_mov_i32_to(b, dst, s0); 2846 break; 2847 2848 case nir_op_b2i8: 2849 case nir_op_b2i16: 2850 case nir_op_b2i32: 2851 bi_mux_to(b, sz, dst, bi_imm_u8(0), bi_imm_uintN(1, sz), s0, BI_MUX_INT_ZERO); 2852 break; 2853 2854 case nir_op_f2b16: 2855 bi_mux_v2i16_to(b, dst, bi_imm_u16(0), bi_imm_u16(~0), s0, BI_MUX_FP_ZERO); 2856 break; 2857 case nir_op_f2b32: 2858 bi_mux_i32_to(b, dst, bi_imm_u32(0), bi_imm_u32(~0), s0, BI_MUX_FP_ZERO); 2859 break; 2860 2861 case nir_op_i2b8: 2862 bi_mux_v4i8_to(b, dst, bi_imm_u8(0), bi_imm_u8(~0), s0, BI_MUX_INT_ZERO); 2863 break; 2864 case nir_op_i2b16: 2865 bi_mux_v2i16_to(b, dst, bi_imm_u16(0), bi_imm_u16(~0), s0, BI_MUX_INT_ZERO); 2866 break; 2867 case nir_op_i2b32: 2868 bi_mux_i32_to(b, dst, bi_imm_u32(0), bi_imm_u32(~0), s0, BI_MUX_INT_ZERO); 2869 break; 2870 2871 case nir_op_ieq8: 2872 case nir_op_ine8: 2873 case nir_op_ilt8: 2874 case nir_op_ige8: 2875 case nir_op_ieq16: 2876 case nir_op_ine16: 2877 case nir_op_ilt16: 2878 case nir_op_ige16: 2879 case nir_op_ieq32: 2880 case nir_op_ine32: 2881 case nir_op_ilt32: 2882 case nir_op_ige32: 2883 bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1); 2884 break; 2885 2886 case nir_op_ult8: 2887 case nir_op_uge8: 2888 case nir_op_ult16: 2889 case nir_op_uge16: 2890 case nir_op_ult32: 2891 case nir_op_uge32: 2892 bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1); 2893 break; 2894 2895 case nir_op_feq32: 2896 case nir_op_feq16: 2897 case nir_op_flt32: 2898 case nir_op_flt16: 2899 case nir_op_fge32: 2900 case nir_op_fge16: 2901 case nir_op_fneu32: 2902 case nir_op_fneu16: 2903 bi_fcmp_to(b, sz, 
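/* BI_RESULT_TYPE_M1 makes the comparison write an all-ones value when true,
 * matching the 0/~0 boolean convention NIR uses (see the b32csel note above). */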
dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1); 2904 break; 2905 2906 case nir_op_fround_even: 2907 case nir_op_fceil: 2908 case nir_op_ffloor: 2909 case nir_op_ftrunc: 2910 bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op)); 2911 break; 2912 2913 case nir_op_fmin: 2914 bi_fmin_to(b, sz, dst, s0, s1); 2915 break; 2916 2917 case nir_op_fmax: 2918 bi_fmax_to(b, sz, dst, s0, s1); 2919 break; 2920 2921 case nir_op_iadd: 2922 bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false); 2923 break; 2924 2925 case nir_op_iadd_sat: 2926 bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true); 2927 break; 2928 2929 case nir_op_uadd_sat: 2930 bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true); 2931 break; 2932 2933 case nir_op_ihadd: 2934 bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN); 2935 break; 2936 2937 case nir_op_irhadd: 2938 bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP); 2939 break; 2940 2941 case nir_op_ineg: 2942 bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false); 2943 break; 2944 2945 case nir_op_isub: 2946 bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false); 2947 break; 2948 2949 case nir_op_isub_sat: 2950 bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true); 2951 break; 2952 2953 case nir_op_usub_sat: 2954 bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true); 2955 break; 2956 2957 case nir_op_imul: 2958 bi_imul_to(b, sz, dst, s0, s1); 2959 break; 2960 2961 case nir_op_iabs: 2962 bi_iabs_to(b, sz, dst, s0); 2963 break; 2964 2965 case nir_op_iand: 2966 bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0)); 2967 break; 2968 2969 case nir_op_ior: 2970 bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0)); 2971 break; 2972 2973 case nir_op_ixor: 2974 bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0)); 2975 break; 2976 2977 case nir_op_inot: 2978 bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0)); 2979 break; 2980 2981 case nir_op_frsq: 2982 if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) 2983 bi_lower_frsq_32(b, dst, s0); 2984 else 2985 bi_frsq_to(b, sz, dst, s0); 2986 break; 2987 2988 case nir_op_frcp: 2989 if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) 2990 bi_lower_frcp_32(b, dst, s0); 2991 else 2992 bi_frcp_to(b, sz, dst, s0); 2993 break; 2994 2995 case nir_op_uclz: 2996 bi_clz_to(b, sz, dst, s0, false); 2997 break; 2998 2999 case nir_op_bit_count: 3000 bi_popcount_i32_to(b, dst, s0); 3001 break; 3002 3003 case nir_op_bitfield_reverse: 3004 bi_bitrev_i32_to(b, dst, s0); 3005 break; 3006 3007 case nir_op_ufind_msb: { 3008 bi_index clz = bi_clz(b, src_sz, s0, false); 3009 3010 if (sz == 8) 3011 clz = bi_byte(clz, 0); 3012 else if (sz == 16) 3013 clz = bi_half(clz, false); 3014 3015 bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false); 3016 break; 3017 } 3018 3019 default: 3020 fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name); 3021 unreachable("Unknown ALU op"); 3022 } 3023} 3024 3025/* Returns dimension with 0 special casing cubemaps. 
Shamelessly copied from Midgard */ 3026static unsigned 3027bifrost_tex_format(enum glsl_sampler_dim dim) 3028{ 3029 switch (dim) { 3030 case GLSL_SAMPLER_DIM_1D: 3031 case GLSL_SAMPLER_DIM_BUF: 3032 return 1; 3033 3034 case GLSL_SAMPLER_DIM_2D: 3035 case GLSL_SAMPLER_DIM_MS: 3036 case GLSL_SAMPLER_DIM_EXTERNAL: 3037 case GLSL_SAMPLER_DIM_RECT: 3038 return 2; 3039 3040 case GLSL_SAMPLER_DIM_3D: 3041 return 3; 3042 3043 case GLSL_SAMPLER_DIM_CUBE: 3044 return 0; 3045 3046 default: 3047 DBG("Unknown sampler dim type\n"); 3048 assert(0); 3049 return 0; 3050 } 3051} 3052 3053static enum bi_dimension 3054valhall_tex_dimension(enum glsl_sampler_dim dim) 3055{ 3056 switch (dim) { 3057 case GLSL_SAMPLER_DIM_1D: 3058 case GLSL_SAMPLER_DIM_BUF: 3059 return BI_DIMENSION_1D; 3060 3061 case GLSL_SAMPLER_DIM_2D: 3062 case GLSL_SAMPLER_DIM_MS: 3063 case GLSL_SAMPLER_DIM_EXTERNAL: 3064 case GLSL_SAMPLER_DIM_RECT: 3065 return BI_DIMENSION_2D; 3066 3067 case GLSL_SAMPLER_DIM_3D: 3068 return BI_DIMENSION_3D; 3069 3070 case GLSL_SAMPLER_DIM_CUBE: 3071 return BI_DIMENSION_CUBE; 3072 3073 default: 3074 unreachable("Unknown sampler dim type"); 3075 } 3076} 3077 3078static enum bifrost_texture_format_full 3079bi_texture_format(nir_alu_type T, enum bi_clamp clamp) 3080{ 3081 switch (T) { 3082 case nir_type_float16: return BIFROST_TEXTURE_FORMAT_F16 + clamp; 3083 case nir_type_float32: return BIFROST_TEXTURE_FORMAT_F32 + clamp; 3084 case nir_type_uint16: return BIFROST_TEXTURE_FORMAT_U16; 3085 case nir_type_int16: return BIFROST_TEXTURE_FORMAT_S16; 3086 case nir_type_uint32: return BIFROST_TEXTURE_FORMAT_U32; 3087 case nir_type_int32: return BIFROST_TEXTURE_FORMAT_S32; 3088 default: unreachable("Invalid type for texturing"); 3089 } 3090} 3091 3092/* Array indices are specified as 32-bit uints, need to convert. In .z component from NIR */ 3093static bi_index 3094bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T) 3095{ 3096 /* For (u)int we can just passthrough */ 3097 nir_alu_type base = nir_alu_type_get_base_type(T); 3098 if (base == nir_type_int || base == nir_type_uint) 3099 return idx; 3100 3101 /* Otherwise we convert */ 3102 assert(T == nir_type_float32); 3103 3104 /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and 3105 * Texel Selection") defines the layer to be taken from clamp(RNE(r), 3106 * 0, dt - 1). So we use round RTE, clamping is handled at the data 3107 * structure level */ 3108 3109 bi_instr *I = bi_f32_to_u32_to(b, bi_temp(b->shader), idx); 3110 I->round = BI_ROUND_NONE; 3111 return I->dest[0]; 3112} 3113 3114/* TEXC's explicit and bias LOD modes requires the LOD to be transformed to a 3115 * 16-bit 8:8 fixed-point format. We lower as: 3116 * 3117 * F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF = 3118 * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0) 3119 */ 3120 3121static bi_index 3122bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16) 3123{ 3124 /* Precompute for constant LODs to avoid general constant folding */ 3125 if (lod.type == BI_INDEX_CONSTANT) { 3126 uint32_t raw = lod.value; 3127 float x = fp16 ? _mesa_half_to_float(raw) : uif(raw); 3128 int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f; 3129 return bi_imm_u32(s32 & 0xFFFF); 3130 } 3131 3132 /* Sort of arbitrary. 
Must be less than 128.0, greater than or equal to
 * the max LOD (16 since we cap at 2^16 texture dimensions), and
 * preferably small to minimize precision loss */
        const float max_lod = 16.0;

        bi_instr *fsat = bi_fma_f32_to(b, bi_temp(b->shader),
                        fp16 ? bi_half(lod, false) : lod,
                        bi_imm_f32(1.0f / max_lod), bi_negzero());

        fsat->clamp = BI_CLAMP_CLAMP_M1_1;

        bi_index fmul = bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f),
                        bi_negzero());

        return bi_mkvec_v2i16(b,
                        bi_half(bi_f32_to_s32(b, fmul), false), bi_imm_u16(0));
}

/* FETCH takes a 32-bit staging register containing the LOD as an integer in
 * the bottom 16-bits and (if present) the cube face index in the top 16-bits.
 * TODO: Cube face.
 */

static bi_index
bi_emit_texc_lod_cube(bi_builder *b, bi_index lod)
{
        return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8));
}

/* The hardware specifies texel offsets and multisample indices together as a
 * u8vec4 <offset, ms index>. By default all are zero, so if we have either a
 * nonzero texel offset or a nonzero multisample index, we build a u8vec4 with
 * the bits we need and return that to be passed as a staging register. Else we
 * return 0 to avoid allocating a data register when everything is zero. */

static bi_index
bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr)
{
        bi_index dest = bi_zero();

        int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
        if (offs_idx >= 0 &&
            (!nir_src_is_const(instr->src[offs_idx].src) ||
             nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
                unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
                bi_index idx = bi_src_index(&instr->src[offs_idx].src);
                dest = bi_mkvec_v4i8(b,
                                (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
                                (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0),
                                (nr > 2) ? bi_byte(bi_extract(b, idx, 2), 0) : bi_imm_u8(0),
                                bi_imm_u8(0));
        }

        int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
        if (ms_idx >= 0 &&
            (!nir_src_is_const(instr->src[ms_idx].src) ||
             nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
                dest = bi_lshift_or_i32(b,
                                bi_src_index(&instr->src[ms_idx].src), dest,
                                bi_imm_u8(24));
        }

        return dest;
}

/*
 * Valhall specifies texel offsets, multisample indices, and (for
 * fetches) LOD together as a u8vec4 <offset.xyz, LOD>, where the third
 * component is either offset.z or multisample index depending on context.
 * Build this register.
3202 */ 3203static bi_index 3204bi_emit_valhall_offsets(bi_builder *b, nir_tex_instr *instr) 3205{ 3206 bi_index dest = bi_zero(); 3207 3208 int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset); 3209 int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); 3210 int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod); 3211 3212 /* Components 0-2: offsets */ 3213 if (offs_idx >= 0 && 3214 (!nir_src_is_const(instr->src[offs_idx].src) || 3215 nir_src_as_uint(instr->src[offs_idx].src) != 0)) { 3216 unsigned nr = nir_src_num_components(instr->src[offs_idx].src); 3217 bi_index idx = bi_src_index(&instr->src[offs_idx].src); 3218 3219 /* No multisample index with 3D */ 3220 assert((nr <= 2) || (ms_idx < 0)); 3221 3222 /* Zero extend the Z byte so we can use it with MKVEC.v2i8 */ 3223 bi_index z = (nr > 2) ? 3224 bi_mkvec_v2i8(b, bi_byte(bi_extract(b, idx, 2), 0), 3225 bi_imm_u8(0), bi_zero()) : 3226 bi_zero(); 3227 3228 dest = bi_mkvec_v2i8(b, 3229 (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0), 3230 (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0), 3231 z); 3232 } 3233 3234 /* Component 2: multisample index */ 3235 if (ms_idx >= 0 && 3236 (!nir_src_is_const(instr->src[ms_idx].src) || 3237 nir_src_as_uint(instr->src[ms_idx].src) != 0)) { 3238 dest = bi_mkvec_v2i16(b, dest, 3239 bi_src_index(&instr->src[ms_idx].src)); 3240 } 3241 3242 /* Component 3: 8-bit LOD */ 3243 if (lod_idx >= 0 && 3244 (!nir_src_is_const(instr->src[lod_idx].src) || 3245 nir_src_as_uint(instr->src[lod_idx].src) != 0) && 3246 nir_tex_instr_src_type(instr, lod_idx) != nir_type_float) { 3247 dest = bi_lshift_or_i32(b, 3248 bi_src_index(&instr->src[lod_idx].src), dest, 3249 bi_imm_u8(24)); 3250 } 3251 3252 return dest; 3253} 3254 3255static void 3256bi_emit_cube_coord(bi_builder *b, bi_index coord, 3257 bi_index *face, bi_index *s, bi_index *t) 3258{ 3259 /* Compute max { |x|, |y|, |z| } */ 3260 bi_index maxxyz = bi_temp(b->shader); 3261 *face = bi_temp(b->shader); 3262 3263 bi_index cx = bi_extract(b, coord, 0), 3264 cy = bi_extract(b, coord, 1), 3265 cz = bi_extract(b, coord, 2); 3266 3267 /* Use a pseudo op on Bifrost due to tuple restrictions */ 3268 if (b->shader->arch <= 8) { 3269 bi_cubeface_to(b, maxxyz, *face, cx, cy, cz); 3270 } else { 3271 bi_cubeface1_to(b, maxxyz, cx, cy, cz); 3272 bi_cubeface2_v9_to(b, *face, cx, cy, cz); 3273 } 3274 3275 /* Select coordinates */ 3276 bi_index ssel = bi_cube_ssel(b, bi_extract(b, coord, 2), bi_extract(b, coord, 0), *face); 3277 bi_index tsel = bi_cube_tsel(b, bi_extract(b, coord, 1), bi_extract(b, coord, 2), 3278 *face); 3279 3280 /* The OpenGL ES specification requires us to transform an input vector 3281 * (x, y, z) to the coordinate, given the selected S/T: 3282 * 3283 * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1)) 3284 * 3285 * We implement (s shown, t similar) in a form friendlier to FMA 3286 * instructions, and clamp coordinates at the end for correct 3287 * NaN/infinity handling: 3288 * 3289 * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5) 3290 * 3291 * Take the reciprocal of max{x, y, z} 3292 */ 3293 bi_index rcp = bi_frcp_f32(b, maxxyz); 3294 3295 /* Calculate 0.5 * (1.0 / max{x, y, z}) */ 3296 bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero()); 3297 3298 /* Transform the coordinates */ 3299 *s = bi_temp(b->shader); 3300 *t = bi_temp(b->shader); 3301 3302 bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f)); 3303 bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f)); 3304 
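/* For intuition: with a selected coordinate of 1.0 and max{x, y, z} = 2.0,
 * rcp = 0.5 and fma1 = 0.25, so s = fsat(1.0 * 0.25 + 0.5) = 0.75, matching
 * the reference form 0.5 * ((1.0 / 2.0) + 1.0). The clamps set below supply
 * the fsat. */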
3305 S->clamp = BI_CLAMP_CLAMP_0_1; 3306 T->clamp = BI_CLAMP_CLAMP_0_1; 3307} 3308 3309/* Emits a cube map descriptor, returning lower 32-bits and putting upper 3310 * 32-bits in passed pointer t. The packing of the face with the S coordinate 3311 * exploits the redundancy of floating points with the range restriction of 3312 * CUBEFACE output. 3313 * 3314 * struct cube_map_descriptor { 3315 * float s : 29; 3316 * unsigned face : 3; 3317 * float t : 32; 3318 * } 3319 * 3320 * Since the cube face index is preshifted, this is easy to pack with a bitwise 3321 * MUX.i32 and a fixed mask, selecting the lower bits 29 from s and the upper 3 3322 * bits from face. 3323 */ 3324 3325static bi_index 3326bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t) 3327{ 3328 bi_index face, s; 3329 bi_emit_cube_coord(b, coord, &face, &s, t); 3330 bi_index mask = bi_imm_u32(BITFIELD_MASK(29)); 3331 return bi_mux_i32(b, s, face, mask, BI_MUX_BIT); 3332} 3333 3334/* Map to the main texture op used. Some of these (txd in particular) will 3335 * lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in 3336 * sequence). We assume that lowering is handled elsewhere. 3337 */ 3338 3339static enum bifrost_tex_op 3340bi_tex_op(nir_texop op) 3341{ 3342 switch (op) { 3343 case nir_texop_tex: 3344 case nir_texop_txb: 3345 case nir_texop_txl: 3346 case nir_texop_txd: 3347 case nir_texop_tex_prefetch: 3348 return BIFROST_TEX_OP_TEX; 3349 case nir_texop_txf: 3350 case nir_texop_txf_ms: 3351 case nir_texop_txf_ms_fb: 3352 case nir_texop_tg4: 3353 return BIFROST_TEX_OP_FETCH; 3354 case nir_texop_txs: 3355 case nir_texop_lod: 3356 case nir_texop_query_levels: 3357 case nir_texop_texture_samples: 3358 case nir_texop_samples_identical: 3359 unreachable("should've been lowered"); 3360 default: 3361 unreachable("unsupported tex op"); 3362 } 3363} 3364 3365/* Data registers required by texturing in the order they appear. All are 3366 * optional, the texture operation descriptor determines which are present. 3367 * Note since 3D arrays are not permitted at an API level, Z_COORD and 3368 * ARRAY/SHADOW are exlusive, so TEXC in practice reads at most 8 registers */ 3369 3370enum bifrost_tex_dreg { 3371 BIFROST_TEX_DREG_Z_COORD = 0, 3372 BIFROST_TEX_DREG_Y_DELTAS = 1, 3373 BIFROST_TEX_DREG_LOD = 2, 3374 BIFROST_TEX_DREG_GRDESC_HI = 3, 3375 BIFROST_TEX_DREG_SHADOW = 4, 3376 BIFROST_TEX_DREG_ARRAY = 5, 3377 BIFROST_TEX_DREG_OFFSETMS = 6, 3378 BIFROST_TEX_DREG_SAMPLER = 7, 3379 BIFROST_TEX_DREG_TEXTURE = 8, 3380 BIFROST_TEX_DREG_COUNT, 3381}; 3382 3383static void 3384bi_emit_texc(bi_builder *b, nir_tex_instr *instr) 3385{ 3386 struct bifrost_texture_operation desc = { 3387 .op = bi_tex_op(instr->op), 3388 .offset_or_bias_disable = false, /* TODO */ 3389 .shadow_or_clamp_disable = instr->is_shadow, 3390 .array = instr->is_array, 3391 .dimension = bifrost_tex_format(instr->sampler_dim), 3392 .format = bi_texture_format(instr->dest_type | nir_dest_bit_size(instr->dest), BI_CLAMP_NONE), /* TODO */ 3393 .mask = 0xF, 3394 }; 3395 3396 switch (desc.op) { 3397 case BIFROST_TEX_OP_TEX: 3398 desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE; 3399 break; 3400 case BIFROST_TEX_OP_FETCH: 3401 desc.lod_or_fetch = (enum bifrost_lod_mode) 3402 (instr->op == nir_texop_tg4 ? 
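/* For textureGather, the gathered component selects among the GATHER4_R/G/B/A
 * fetch modes, which this arithmetic assumes are consecutive enum values. */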
3403 BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component : 3404 BIFROST_TEXTURE_FETCH_TEXEL); 3405 break; 3406 default: 3407 unreachable("texture op unsupported"); 3408 } 3409 3410 /* 32-bit indices to be allocated as consecutive staging registers */ 3411 bi_index dregs[BIFROST_TEX_DREG_COUNT] = { }; 3412 bi_index cx = bi_null(), cy = bi_null(); 3413 3414 for (unsigned i = 0; i < instr->num_srcs; ++i) { 3415 bi_index index = bi_src_index(&instr->src[i].src); 3416 unsigned sz = nir_src_bit_size(instr->src[i].src); 3417 unsigned components = nir_src_num_components(instr->src[i].src); 3418 ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i); 3419 nir_alu_type T = base | sz; 3420 3421 switch (instr->src[i].src_type) { 3422 case nir_tex_src_coord: 3423 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { 3424 cx = bi_emit_texc_cube_coord(b, index, &cy); 3425 } else { 3426 /* Copy XY (for 2D+) or XX (for 1D) */ 3427 cx = bi_extract(b, index, 0); 3428 cy = bi_extract(b, index, MIN2(1, components - 1)); 3429 3430 assert(components >= 1 && components <= 3); 3431 3432 if (components == 3 && !desc.array) { 3433 /* 3D */ 3434 dregs[BIFROST_TEX_DREG_Z_COORD] = 3435 bi_extract(b, index, 2); 3436 } 3437 } 3438 3439 if (desc.array) { 3440 dregs[BIFROST_TEX_DREG_ARRAY] = 3441 bi_emit_texc_array_index(b, 3442 bi_extract(b, index, components - 1), T); 3443 } 3444 3445 break; 3446 3447 case nir_tex_src_lod: 3448 if (desc.op == BIFROST_TEX_OP_TEX && 3449 nir_src_is_const(instr->src[i].src) && 3450 nir_src_as_uint(instr->src[i].src) == 0) { 3451 desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO; 3452 } else if (desc.op == BIFROST_TEX_OP_TEX) { 3453 assert(base == nir_type_float); 3454 3455 assert(sz == 16 || sz == 32); 3456 dregs[BIFROST_TEX_DREG_LOD] = 3457 bi_emit_texc_lod_88(b, index, sz == 16); 3458 desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT; 3459 } else { 3460 assert(desc.op == BIFROST_TEX_OP_FETCH); 3461 assert(base == nir_type_uint || base == nir_type_int); 3462 assert(sz == 16 || sz == 32); 3463 3464 dregs[BIFROST_TEX_DREG_LOD] = 3465 bi_emit_texc_lod_cube(b, index); 3466 } 3467 3468 break; 3469 3470 case nir_tex_src_bias: 3471 /* Upper 16-bits interpreted as a clamp, leave zero */ 3472 assert(desc.op == BIFROST_TEX_OP_TEX); 3473 assert(base == nir_type_float); 3474 assert(sz == 16 || sz == 32); 3475 dregs[BIFROST_TEX_DREG_LOD] = 3476 bi_emit_texc_lod_88(b, index, sz == 16); 3477 desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS; 3478 break; 3479 3480 case nir_tex_src_ms_index: 3481 case nir_tex_src_offset: 3482 if (desc.offset_or_bias_disable) 3483 break; 3484 3485 dregs[BIFROST_TEX_DREG_OFFSETMS] = 3486 bi_emit_texc_offset_ms_index(b, instr); 3487 if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero())) 3488 desc.offset_or_bias_disable = true; 3489 break; 3490 3491 case nir_tex_src_comparator: 3492 dregs[BIFROST_TEX_DREG_SHADOW] = index; 3493 break; 3494 3495 case nir_tex_src_texture_offset: 3496 if (instr->texture_index) 3497 index = bi_iadd_u32(b, index, bi_imm_u32(instr->texture_index), false); 3498 3499 dregs[BIFROST_TEX_DREG_TEXTURE] = index; 3500 3501 break; 3502 3503 case nir_tex_src_sampler_offset: 3504 if (instr->sampler_index) 3505 index = bi_iadd_u32(b, index, bi_imm_u32(instr->sampler_index), false); 3506 3507 dregs[BIFROST_TEX_DREG_SAMPLER] = index; 3508 break; 3509 3510 default: 3511 unreachable("Unhandled src type in texc emit"); 3512 } 3513 } 3514 3515 if (desc.op == BIFROST_TEX_OP_FETCH && bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) { 3516 dregs[BIFROST_TEX_DREG_LOD] = 3517 
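/* FETCH appears to always read the LOD/face staging word, so synthesize an
 * explicit LOD of zero when the NIR instruction did not provide one. */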
bi_emit_texc_lod_cube(b, bi_zero()); 3518 } 3519 3520 /* Choose an index mode */ 3521 3522 bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]); 3523 bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]); 3524 bool direct = direct_tex && direct_samp; 3525 3526 desc.immediate_indices = direct && (instr->sampler_index < 16); 3527 3528 if (desc.immediate_indices) { 3529 desc.sampler_index_or_mode = instr->sampler_index; 3530 desc.index = instr->texture_index; 3531 } else { 3532 unsigned mode = 0; 3533 3534 if (direct && instr->sampler_index == instr->texture_index) { 3535 mode = BIFROST_INDEX_IMMEDIATE_SHARED; 3536 desc.index = instr->texture_index; 3537 } else if (direct) { 3538 mode = BIFROST_INDEX_IMMEDIATE_SAMPLER; 3539 desc.index = instr->sampler_index; 3540 dregs[BIFROST_TEX_DREG_TEXTURE] = bi_mov_i32(b, 3541 bi_imm_u32(instr->texture_index)); 3542 } else if (direct_tex) { 3543 assert(!direct_samp); 3544 mode = BIFROST_INDEX_IMMEDIATE_TEXTURE; 3545 desc.index = instr->texture_index; 3546 } else if (direct_samp) { 3547 assert(!direct_tex); 3548 mode = BIFROST_INDEX_IMMEDIATE_SAMPLER; 3549 desc.index = instr->sampler_index; 3550 } else { 3551 mode = BIFROST_INDEX_REGISTER; 3552 } 3553 3554 mode |= (BIFROST_TEXTURE_OPERATION_SINGLE << 2); 3555 desc.sampler_index_or_mode = mode; 3556 } 3557 3558 /* Allocate staging registers contiguously by compacting the array. */ 3559 unsigned sr_count = 0; 3560 3561 for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) { 3562 if (!bi_is_null(dregs[i])) 3563 dregs[sr_count++] = dregs[i]; 3564 } 3565 3566 unsigned res_size = nir_dest_bit_size(instr->dest) == 16 ? 2 : 4; 3567 3568 bi_index sr = sr_count ? bi_temp(b->shader) : bi_null(); 3569 bi_index dst = bi_temp(b->shader); 3570 3571 if (sr_count) 3572 bi_emit_collect_to(b, sr, dregs, sr_count); 3573 3574 uint32_t desc_u = 0; 3575 memcpy(&desc_u, &desc, sizeof(desc_u)); 3576 bi_instr *I = 3577 bi_texc_to(b, dst, bi_null(), sr, cx, cy, 3578 bi_imm_u32(desc_u), 3579 !nir_tex_instr_has_implicit_derivative(instr), sr_count, 0); 3580 I->register_format = bi_reg_fmt_for_nir(instr->dest_type); 3581 3582 bi_index w[4] = { bi_null(), bi_null(), bi_null(), bi_null() }; 3583 bi_emit_split_i32(b, w, dst, res_size); 3584 bi_emit_collect_to(b, bi_dest_index(&instr->dest), w, 3585 DIV_ROUND_UP(nir_dest_num_components(instr->dest) * res_size, 4)); 3586} 3587 3588/* Staging registers required by texturing in the order they appear (Valhall) */ 3589 3590enum valhall_tex_sreg { 3591 VALHALL_TEX_SREG_X_COORD = 0, 3592 VALHALL_TEX_SREG_Y_COORD = 1, 3593 VALHALL_TEX_SREG_Z_COORD = 2, 3594 VALHALL_TEX_SREG_Y_DELTAS = 3, 3595 VALHALL_TEX_SREG_ARRAY = 4, 3596 VALHALL_TEX_SREG_SHADOW = 5, 3597 VALHALL_TEX_SREG_OFFSETMS = 6, 3598 VALHALL_TEX_SREG_LOD = 7, 3599 VALHALL_TEX_SREG_GRDESC = 8, 3600 VALHALL_TEX_SREG_COUNT, 3601}; 3602 3603static void 3604bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr) 3605{ 3606 bool explicit_offset = false; 3607 enum bi_va_lod_mode lod_mode = BI_VA_LOD_MODE_COMPUTED_LOD; 3608 3609 bool has_lod_mode = 3610 (instr->op == nir_texop_tex) || 3611 (instr->op == nir_texop_txl) || 3612 (instr->op == nir_texop_txb); 3613 3614 /* 32-bit indices to be allocated as consecutive staging registers */ 3615 bi_index sregs[VALHALL_TEX_SREG_COUNT] = { }; 3616 3617 bi_index sampler = bi_imm_u32(instr->sampler_index); 3618 bi_index texture = bi_imm_u32(instr->texture_index); 3619 uint32_t tables = (PAN_TABLE_SAMPLER << 11) | (PAN_TABLE_TEXTURE << 27); 3620 3621 for (unsigned i = 0; i < instr->num_srcs; 
++i) { 3622 bi_index index = bi_src_index(&instr->src[i].src); 3623 unsigned sz = nir_src_bit_size(instr->src[i].src); 3624 unsigned components = nir_src_num_components(instr->src[i].src); 3625 3626 switch (instr->src[i].src_type) { 3627 case nir_tex_src_coord: 3628 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { 3629 sregs[VALHALL_TEX_SREG_X_COORD] = 3630 bi_emit_texc_cube_coord(b, index, 3631 &sregs[VALHALL_TEX_SREG_Y_COORD]); 3632 } else { 3633 assert(components >= 1 && components <= 3); 3634 3635 /* Copy XY (for 2D+) or XX (for 1D) */ 3636 sregs[VALHALL_TEX_SREG_X_COORD] = index; 3637 3638 if (components >= 2) 3639 sregs[VALHALL_TEX_SREG_Y_COORD] = bi_extract(b, index, 1); 3640 3641 if (components == 3 && !instr->is_array) { 3642 sregs[VALHALL_TEX_SREG_Z_COORD] = 3643 bi_extract(b, index, 2); 3644 } 3645 } 3646 3647 if (instr->is_array) { 3648 sregs[VALHALL_TEX_SREG_ARRAY] = 3649 bi_extract(b, index, components - 1); 3650 } 3651 3652 break; 3653 3654 case nir_tex_src_lod: 3655 if (nir_src_is_const(instr->src[i].src) && 3656 nir_src_as_uint(instr->src[i].src) == 0) { 3657 lod_mode = BI_VA_LOD_MODE_ZERO_LOD; 3658 } else if (has_lod_mode) { 3659 lod_mode = BI_VA_LOD_MODE_EXPLICIT; 3660 3661 assert(sz == 16 || sz == 32); 3662 sregs[VALHALL_TEX_SREG_LOD] = 3663 bi_emit_texc_lod_88(b, index, sz == 16); 3664 } 3665 break; 3666 3667 case nir_tex_src_bias: 3668 /* Upper 16-bits interpreted as a clamp, leave zero */ 3669 assert(sz == 16 || sz == 32); 3670 sregs[VALHALL_TEX_SREG_LOD] = 3671 bi_emit_texc_lod_88(b, index, sz == 16); 3672 3673 lod_mode = BI_VA_LOD_MODE_COMPUTED_BIAS; 3674 break; 3675 case nir_tex_src_ms_index: 3676 case nir_tex_src_offset: 3677 /* Handled below */ 3678 break; 3679 3680 case nir_tex_src_comparator: 3681 sregs[VALHALL_TEX_SREG_SHADOW] = index; 3682 break; 3683 3684 case nir_tex_src_texture_offset: 3685 assert(instr->texture_index == 0); 3686 texture = index; 3687 break; 3688 3689 case nir_tex_src_sampler_offset: 3690 assert(instr->sampler_index == 0); 3691 sampler = index; 3692 break; 3693 3694 default: 3695 unreachable("Unhandled src type in tex emit"); 3696 } 3697 } 3698 3699 /* Generate packed offset + ms index + LOD register. These default to 3700 * zero so we only need to encode if these features are actually in use. 3701 */ 3702 bi_index offsets = bi_emit_valhall_offsets(b, instr); 3703 3704 if (!bi_is_equiv(offsets, bi_zero())) { 3705 sregs[VALHALL_TEX_SREG_OFFSETMS] = offsets; 3706 explicit_offset = true; 3707 } 3708 3709 /* Allocate staging registers contiguously by compacting the array. */ 3710 unsigned sr_count = 0; 3711 3712 for (unsigned i = 0; i < ARRAY_SIZE(sregs); ++i) { 3713 if (!bi_is_null(sregs[i])) 3714 sregs[sr_count++] = sregs[i]; 3715 } 3716 3717 bi_index idx = sr_count ? bi_temp(b->shader) : bi_null(); 3718 3719 if (sr_count) 3720 bi_make_vec_to(b, idx, sregs, NULL, sr_count, 32); 3721 3722 bi_index image_src = bi_imm_u32(tables); 3723 image_src = bi_lshift_or_i32(b, sampler, image_src, bi_imm_u8(0)); 3724 image_src = bi_lshift_or_i32(b, texture, image_src, bi_imm_u8(16)); 3725 3726 unsigned mask = BI_WRITE_MASK_RGBA; 3727 unsigned res_size = nir_dest_bit_size(instr->dest) == 16 ? 
2 : 4; 3728 enum bi_register_format regfmt = bi_reg_fmt_for_nir(instr->dest_type); 3729 enum bi_dimension dim = valhall_tex_dimension(instr->sampler_dim); 3730 bi_index dest = bi_temp(b->shader); 3731 3732 switch (instr->op) { 3733 case nir_texop_tex: 3734 case nir_texop_txl: 3735 case nir_texop_txb: 3736 bi_tex_single_to(b, dest, idx, image_src, bi_zero(), 3737 instr->is_array, dim, regfmt, instr->is_shadow, 3738 explicit_offset, lod_mode, mask, sr_count); 3739 break; 3740 case nir_texop_txf: 3741 case nir_texop_txf_ms: 3742 bi_tex_fetch_to(b, dest, idx, image_src, bi_zero(), 3743 instr->is_array, dim, regfmt, explicit_offset, 3744 mask, sr_count); 3745 break; 3746 case nir_texop_tg4: 3747 bi_tex_gather_to(b, dest, idx, image_src, bi_zero(), 3748 instr->is_array, dim, instr->component, false, 3749 regfmt, instr->is_shadow, explicit_offset, 3750 mask, sr_count); 3751 break; 3752 default: 3753 unreachable("Unhandled Valhall texture op"); 3754 } 3755 3756 bi_index w[4] = { bi_null(), bi_null(), bi_null(), bi_null() }; 3757 bi_emit_split_i32(b, w, dest, res_size); 3758 bi_emit_collect_to(b, bi_dest_index(&instr->dest), w, 3759 DIV_ROUND_UP(nir_dest_num_components(instr->dest) * res_size, 4)); 3760} 3761 3762/* Simple textures ops correspond to NIR tex or txl with LOD = 0 on 2D/cube 3763 * textures with sufficiently small immediate indices. Anything else 3764 * needs a complete texture op. */ 3765 3766static void 3767bi_emit_texs(bi_builder *b, nir_tex_instr *instr) 3768{ 3769 int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord); 3770 assert(coord_idx >= 0); 3771 bi_index coords = bi_src_index(&instr->src[coord_idx].src); 3772 3773 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { 3774 bi_index face, s, t; 3775 bi_emit_cube_coord(b, coords, &face, &s, &t); 3776 3777 bi_texs_cube_to(b, nir_dest_bit_size(instr->dest), 3778 bi_dest_index(&instr->dest), 3779 s, t, face, 3780 instr->sampler_index, instr->texture_index); 3781 } else { 3782 bi_texs_2d_to(b, nir_dest_bit_size(instr->dest), 3783 bi_dest_index(&instr->dest), 3784 bi_extract(b, coords, 0), 3785 bi_extract(b, coords, 1), 3786 instr->op != nir_texop_tex, /* zero LOD */ 3787 instr->sampler_index, instr->texture_index); 3788 } 3789 3790 bi_split_dest(b, instr->dest); 3791} 3792 3793static bool 3794bi_is_simple_tex(nir_tex_instr *instr) 3795{ 3796 if (instr->op != nir_texop_tex && instr->op != nir_texop_txl) 3797 return false; 3798 3799 if (instr->dest_type != nir_type_float32 && 3800 instr->dest_type != nir_type_float16) 3801 return false; 3802 3803 if (instr->is_shadow || instr->is_array) 3804 return false; 3805 3806 switch (instr->sampler_dim) { 3807 case GLSL_SAMPLER_DIM_2D: 3808 case GLSL_SAMPLER_DIM_EXTERNAL: 3809 case GLSL_SAMPLER_DIM_RECT: 3810 break; 3811 3812 case GLSL_SAMPLER_DIM_CUBE: 3813 /* LOD can't be specified with TEXS_CUBE */ 3814 if (instr->op == nir_texop_txl) 3815 return false; 3816 break; 3817 3818 default: 3819 return false; 3820 } 3821 3822 for (unsigned i = 0; i < instr->num_srcs; ++i) { 3823 if (instr->src[i].src_type != nir_tex_src_lod && 3824 instr->src[i].src_type != nir_tex_src_coord) 3825 return false; 3826 } 3827 3828 /* Indices need to fit in provided bits */ 3829 unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 
2 : 3; 3830 if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits)) 3831 return false; 3832 3833 int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod); 3834 if (lod_idx < 0) 3835 return true; 3836 3837 nir_src lod = instr->src[lod_idx].src; 3838 return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0; 3839} 3840 3841static void 3842bi_emit_tex(bi_builder *b, nir_tex_instr *instr) 3843{ 3844 switch (instr->op) { 3845 case nir_texop_txs: 3846 bi_load_sysval_to(b, bi_dest_index(&instr->dest), 3847 panfrost_sysval_for_instr(&instr->instr, NULL), 3848 nir_dest_num_components(instr->dest), 0); 3849 return; 3850 case nir_texop_tex: 3851 case nir_texop_txl: 3852 case nir_texop_txb: 3853 case nir_texop_txf: 3854 case nir_texop_txf_ms: 3855 case nir_texop_tg4: 3856 break; 3857 default: 3858 unreachable("Invalid texture operation"); 3859 } 3860 3861 if (b->shader->arch >= 9) 3862 bi_emit_tex_valhall(b, instr); 3863 else if (bi_is_simple_tex(instr)) 3864 bi_emit_texs(b, instr); 3865 else 3866 bi_emit_texc(b, instr); 3867} 3868 3869static void 3870bi_emit_instr(bi_builder *b, struct nir_instr *instr) 3871{ 3872 switch (instr->type) { 3873 case nir_instr_type_load_const: 3874 bi_emit_load_const(b, nir_instr_as_load_const(instr)); 3875 break; 3876 3877 case nir_instr_type_intrinsic: 3878 bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr)); 3879 break; 3880 3881 case nir_instr_type_alu: 3882 bi_emit_alu(b, nir_instr_as_alu(instr)); 3883 break; 3884 3885 case nir_instr_type_tex: 3886 bi_emit_tex(b, nir_instr_as_tex(instr)); 3887 break; 3888 3889 case nir_instr_type_jump: 3890 bi_emit_jump(b, nir_instr_as_jump(instr)); 3891 break; 3892 3893 default: 3894 unreachable("should've been lowered"); 3895 } 3896} 3897 3898static bi_block * 3899create_empty_block(bi_context *ctx) 3900{ 3901 bi_block *blk = rzalloc(ctx, bi_block); 3902 3903 util_dynarray_init(&blk->predecessors, blk); 3904 3905 return blk; 3906} 3907 3908static bi_block * 3909emit_block(bi_context *ctx, nir_block *block) 3910{ 3911 if (ctx->after_block) { 3912 ctx->current_block = ctx->after_block; 3913 ctx->after_block = NULL; 3914 } else { 3915 ctx->current_block = create_empty_block(ctx); 3916 } 3917 3918 list_addtail(&ctx->current_block->link, &ctx->blocks); 3919 list_inithead(&ctx->current_block->instructions); 3920 3921 bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); 3922 3923 nir_foreach_instr(instr, block) { 3924 bi_emit_instr(&_b, instr); 3925 ++ctx->instruction_count; 3926 } 3927 3928 return ctx->current_block; 3929} 3930 3931static void 3932emit_if(bi_context *ctx, nir_if *nif) 3933{ 3934 bi_block *before_block = ctx->current_block; 3935 3936 /* Speculatively emit the branch, but we can't fill it in until later */ 3937 bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); 3938 bi_instr *then_branch = bi_branchz_i16(&_b, 3939 bi_half(bi_src_index(&nif->condition), false), 3940 bi_zero(), BI_CMPF_EQ); 3941 3942 /* Emit the two subblocks. 
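 * The BRANCHZ emitted above is taken when the NIR condition is zero (false).
 * Its target is only filled in after both lists are emitted: it points at the
 * merge block if the else list produced no instructions, and at the else
 * block otherwise, in which case the then block also receives an
 * unconditional JUMP to the merge block.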
*/ 3943 bi_block *then_block = emit_cf_list(ctx, &nif->then_list); 3944 bi_block *end_then_block = ctx->current_block; 3945 3946 /* Emit second block, and check if it's empty */ 3947 3948 int count_in = ctx->instruction_count; 3949 bi_block *else_block = emit_cf_list(ctx, &nif->else_list); 3950 bi_block *end_else_block = ctx->current_block; 3951 ctx->after_block = create_empty_block(ctx); 3952 3953 /* Now that we have the subblocks emitted, fix up the branches */ 3954 3955 assert(then_block); 3956 assert(else_block); 3957 3958 if (ctx->instruction_count == count_in) { 3959 then_branch->branch_target = ctx->after_block; 3960 bi_block_add_successor(end_then_block, ctx->after_block); /* fallthrough */ 3961 } else { 3962 then_branch->branch_target = else_block; 3963 3964 /* Emit a jump from the end of the then block to the end of the else */ 3965 _b.cursor = bi_after_block(end_then_block); 3966 bi_instr *then_exit = bi_jump(&_b, bi_zero()); 3967 then_exit->branch_target = ctx->after_block; 3968 3969 bi_block_add_successor(end_then_block, then_exit->branch_target); 3970 bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */ 3971 } 3972 3973 bi_block_add_successor(before_block, then_branch->branch_target); /* then_branch */ 3974 bi_block_add_successor(before_block, then_block); /* fallthrough */ 3975} 3976 3977static void 3978emit_loop(bi_context *ctx, nir_loop *nloop) 3979{ 3980 /* Remember where we are */ 3981 bi_block *start_block = ctx->current_block; 3982 3983 bi_block *saved_break = ctx->break_block; 3984 bi_block *saved_continue = ctx->continue_block; 3985 3986 ctx->continue_block = create_empty_block(ctx); 3987 ctx->break_block = create_empty_block(ctx); 3988 ctx->after_block = ctx->continue_block; 3989 3990 /* Emit the body itself */ 3991 emit_cf_list(ctx, &nloop->body); 3992 3993 /* Branch back to loop back */ 3994 bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); 3995 bi_instr *I = bi_jump(&_b, bi_zero()); 3996 I->branch_target = ctx->continue_block; 3997 bi_block_add_successor(start_block, ctx->continue_block); 3998 bi_block_add_successor(ctx->current_block, ctx->continue_block); 3999 4000 ctx->after_block = ctx->break_block; 4001 4002 /* Pop off */ 4003 ctx->break_block = saved_break; 4004 ctx->continue_block = saved_continue; 4005 ++ctx->loop_count; 4006} 4007 4008static bi_block * 4009emit_cf_list(bi_context *ctx, struct exec_list *list) 4010{ 4011 bi_block *start_block = NULL; 4012 4013 foreach_list_typed(nir_cf_node, node, node, list) { 4014 switch (node->type) { 4015 case nir_cf_node_block: { 4016 bi_block *block = emit_block(ctx, nir_cf_node_as_block(node)); 4017 4018 if (!start_block) 4019 start_block = block; 4020 4021 break; 4022 } 4023 4024 case nir_cf_node_if: 4025 emit_if(ctx, nir_cf_node_as_if(node)); 4026 break; 4027 4028 case nir_cf_node_loop: 4029 emit_loop(ctx, nir_cf_node_as_loop(node)); 4030 break; 4031 4032 default: 4033 unreachable("Unknown control flow"); 4034 } 4035 } 4036 4037 return start_block; 4038} 4039 4040/* shader-db stuff */ 4041 4042struct bi_stats { 4043 unsigned nr_clauses, nr_tuples, nr_ins; 4044 unsigned nr_arith, nr_texture, nr_varying, nr_ldst; 4045}; 4046 4047static void 4048bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats) 4049{ 4050 /* Count instructions */ 4051 stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 
1 : 0); 4052 4053 /* Non-message passing tuples are always arithmetic */ 4054 if (tuple->add != clause->message) { 4055 stats->nr_arith++; 4056 return; 4057 } 4058 4059 /* Message + FMA we'll count as arithmetic _and_ message */ 4060 if (tuple->fma) 4061 stats->nr_arith++; 4062 4063 switch (clause->message_type) { 4064 case BIFROST_MESSAGE_VARYING: 4065 /* Check components interpolated */ 4066 stats->nr_varying += (clause->message->vecsize + 1) * 4067 (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2); 4068 break; 4069 4070 case BIFROST_MESSAGE_VARTEX: 4071 /* 2 coordinates, fp32 each */ 4072 stats->nr_varying += (2 * 2); 4073 FALLTHROUGH; 4074 case BIFROST_MESSAGE_TEX: 4075 stats->nr_texture++; 4076 break; 4077 4078 case BIFROST_MESSAGE_ATTRIBUTE: 4079 case BIFROST_MESSAGE_LOAD: 4080 case BIFROST_MESSAGE_STORE: 4081 case BIFROST_MESSAGE_ATOMIC: 4082 stats->nr_ldst++; 4083 break; 4084 4085 case BIFROST_MESSAGE_NONE: 4086 case BIFROST_MESSAGE_BARRIER: 4087 case BIFROST_MESSAGE_BLEND: 4088 case BIFROST_MESSAGE_TILE: 4089 case BIFROST_MESSAGE_Z_STENCIL: 4090 case BIFROST_MESSAGE_ATEST: 4091 case BIFROST_MESSAGE_JOB: 4092 case BIFROST_MESSAGE_64BIT: 4093 /* Nothing to do */ 4094 break; 4095 }; 4096 4097} 4098 4099/* 4100 * v7 allows preloading LD_VAR or VAR_TEX messages that must complete before the 4101 * shader completes. These costs are not accounted for in the general cycle 4102 * counts, so this function calculates the effective cost of these messages, as 4103 * if they were executed by shader code. 4104 */ 4105static unsigned 4106bi_count_preload_cost(bi_context *ctx) 4107{ 4108 /* Units: 1/16 of a normalized cycle, assuming that we may interpolate 4109 * 16 fp16 varying components per cycle or fetch two texels per cycle. 4110 */ 4111 unsigned cost = 0; 4112 4113 for (unsigned i = 0; i < ARRAY_SIZE(ctx->info.bifrost->messages); ++i) { 4114 struct bifrost_message_preload msg = ctx->info.bifrost->messages[i]; 4115 4116 if (msg.enabled && msg.texture) { 4117 /* 2 coordinate, 2 half-words each, plus texture */ 4118 cost += 12; 4119 } else if (msg.enabled) { 4120 cost += (msg.num_components * (msg.fp16 ? 1 : 2)); 4121 } 4122 } 4123 4124 return cost; 4125} 4126 4127static const char * 4128bi_shader_stage_name(bi_context *ctx) 4129{ 4130 if (ctx->idvs == BI_IDVS_VARYING) 4131 return "MESA_SHADER_VARYING"; 4132 else if (ctx->idvs == BI_IDVS_POSITION) 4133 return "MESA_SHADER_POSITION"; 4134 else if (ctx->inputs->is_blend) 4135 return "MESA_SHADER_BLEND"; 4136 else 4137 return gl_shader_stage_name(ctx->stage); 4138} 4139 4140static void 4141bi_print_stats(bi_context *ctx, unsigned size, FILE *fp) 4142{ 4143 struct bi_stats stats = { 0 }; 4144 4145 /* Count instructions, clauses, and tuples. Also attempt to construct 4146 * normalized execution engine cycle counts, using the following ratio: 4147 * 4148 * 24 arith tuples/cycle 4149 * 2 texture messages/cycle 4150 * 16 x 16-bit varying channels interpolated/cycle 4151 * 1 load store message/cycle 4152 * 4153 * These numbers seem to match Arm Mobile Studio's heuristic. The real 4154 * cycle counts are surely more complicated. 
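 * As a worked illustration with made-up numbers: a shader with 48 arith
 * tuples, 6 texture messages, 32 16-bit varying channels and 2 load/store
 * messages scores 48/24 = 2.0, 6/2 = 3.0, 32/16 = 2.0 and 2/1 = 2.0 cycles
 * respectively, so it is reported as texture bound at 3.0 cycles.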
4155 */ 4156 4157 bi_foreach_block(ctx, block) { 4158 bi_foreach_clause_in_block(block, clause) { 4159 stats.nr_clauses++; 4160 stats.nr_tuples += clause->tuple_count; 4161 4162 for (unsigned i = 0; i < clause->tuple_count; ++i) 4163 bi_count_tuple_stats(clause, &clause->tuples[i], &stats); 4164 } 4165 } 4166 4167 float cycles_arith = ((float) stats.nr_arith) / 24.0; 4168 float cycles_texture = ((float) stats.nr_texture) / 2.0; 4169 float cycles_varying = ((float) stats.nr_varying) / 16.0; 4170 float cycles_ldst = ((float) stats.nr_ldst) / 1.0; 4171 4172 float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst); 4173 float cycles_bound = MAX2(cycles_arith, cycles_message); 4174 4175 /* Thread count and register pressure are traded off only on v7 */ 4176 bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32); 4177 unsigned nr_threads = full_threads ? 2 : 1; 4178 4179 /* Dump stats */ 4180 char *str = ralloc_asprintf(NULL, "%s - %s shader: " 4181 "%u inst, %u tuples, %u clauses, " 4182 "%f cycles, %f arith, %f texture, %f vary, %f ldst, " 4183 "%u quadwords, %u threads", 4184 ctx->nir->info.label ?: "", 4185 bi_shader_stage_name(ctx), 4186 stats.nr_ins, stats.nr_tuples, stats.nr_clauses, 4187 cycles_bound, cycles_arith, cycles_texture, 4188 cycles_varying, cycles_ldst, 4189 size / 16, nr_threads); 4190 4191 if (ctx->arch == 7) { 4192 ralloc_asprintf_append(&str, ", %u preloads", bi_count_preload_cost(ctx)); 4193 } 4194 4195 ralloc_asprintf_append(&str, ", %u loops, %u:%u spills:fills\n", 4196 ctx->loop_count, ctx->spills, ctx->fills); 4197 4198 fputs(str, stderr); 4199 ralloc_free(str); 4200} 4201 4202static void 4203va_print_stats(bi_context *ctx, unsigned size, FILE *fp) 4204{ 4205 unsigned nr_ins = 0; 4206 struct va_stats stats = { 0 }; 4207 4208 /* Count instructions */ 4209 bi_foreach_instr_global(ctx, I) { 4210 nr_ins++; 4211 va_count_instr_stats(I, &stats); 4212 } 4213 4214 /* Mali G78 peak performance: 4215 * 4216 * 64 FMA instructions per cycle 4217 * 64 CVT instructions per cycle 4218 * 16 SFU instructions per cycle 4219 * 8 x 32-bit varying channels interpolated per cycle 4220 * 4 texture instructions per cycle 4221 * 1 load/store operation per cycle 4222 */ 4223 4224 float cycles_fma = ((float) stats.fma) / 64.0; 4225 float cycles_cvt = ((float) stats.cvt) / 64.0; 4226 float cycles_sfu = ((float) stats.sfu) / 16.0; 4227 float cycles_v = ((float) stats.v) / 16.0; 4228 float cycles_t = ((float) stats.t) / 4.0; 4229 float cycles_ls = ((float) stats.ls) / 1.0; 4230 4231 /* Calculate the bound */ 4232 float cycles = MAX2( 4233 MAX3(cycles_fma, cycles_cvt, cycles_sfu), 4234 MAX3(cycles_v, cycles_t, cycles_ls)); 4235 4236 4237 /* Thread count and register pressure are traded off */ 4238 unsigned nr_threads = (ctx->info.work_reg_count <= 32) ? 2 : 1; 4239 4240 /* Dump stats */ 4241 fprintf(stderr, "%s - %s shader: " 4242 "%u inst, %f cycles, %f fma, %f cvt, %f sfu, %f v, " 4243 "%f t, %f ls, %u quadwords, %u threads, %u loops, " 4244 "%u:%u spills:fills\n", 4245 ctx->nir->info.label ?: "", 4246 bi_shader_stage_name(ctx), 4247 nr_ins, cycles, cycles_fma, cycles_cvt, cycles_sfu, 4248 cycles_v, cycles_t, cycles_ls, size / 16, nr_threads, 4249 ctx->loop_count, ctx->spills, ctx->fills); 4250} 4251 4252static int 4253glsl_type_size(const struct glsl_type *type, bool bindless) 4254{ 4255 return glsl_count_attribute_slots(type, false); 4256} 4257 4258/* Split stores to memory. 
We don't split stores to vertex outputs, since 4259 * nir_lower_io_to_temporaries will ensure there's only a single write. 4260 */ 4261 4262static bool 4263should_split_wrmask(const nir_instr *instr, UNUSED const void *data) 4264{ 4265 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 4266 4267 switch (intr->intrinsic) { 4268 case nir_intrinsic_store_ssbo: 4269 case nir_intrinsic_store_shared: 4270 case nir_intrinsic_store_global: 4271 case nir_intrinsic_store_scratch: 4272 return true; 4273 default: 4274 return false; 4275 } 4276} 4277 4278/* Bifrost wants transcendentals as FP32 */ 4279 4280static unsigned 4281bi_lower_bit_size(const nir_instr *instr, UNUSED void *data) 4282{ 4283 if (instr->type != nir_instr_type_alu) 4284 return 0; 4285 4286 nir_alu_instr *alu = nir_instr_as_alu(instr); 4287 4288 switch (alu->op) { 4289 case nir_op_fexp2: 4290 case nir_op_flog2: 4291 case nir_op_fpow: 4292 case nir_op_fsin: 4293 case nir_op_fcos: 4294 return (nir_dest_bit_size(alu->dest.dest) == 32) ? 0 : 32; 4295 default: 4296 return 0; 4297 } 4298} 4299 4300/* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4, 4301 * transcendentals are an exception. Also shifts because of lane size mismatch 4302 * (8-bit in Bifrost, 32-bit in NIR TODO - workaround!). Some conversions need 4303 * to be scalarized due to type size. */ 4304 4305static uint8_t 4306bi_vectorize_filter(const nir_instr *instr, const void *data) 4307{ 4308 /* Defaults work for everything else */ 4309 if (instr->type != nir_instr_type_alu) 4310 return 0; 4311 4312 const nir_alu_instr *alu = nir_instr_as_alu(instr); 4313 4314 switch (alu->op) { 4315 case nir_op_frcp: 4316 case nir_op_frsq: 4317 case nir_op_ishl: 4318 case nir_op_ishr: 4319 case nir_op_ushr: 4320 case nir_op_f2i16: 4321 case nir_op_f2u16: 4322 return 1; 4323 default: 4324 break; 4325 } 4326 4327 /* Vectorized instructions cannot write more than 32-bit */ 4328 int dst_bit_size = nir_dest_bit_size(alu->dest.dest); 4329 if (dst_bit_size == 16) 4330 return 2; 4331 else 4332 return 1; 4333} 4334 4335static bool 4336bi_scalarize_filter(const nir_instr *instr, const void *data) 4337{ 4338 if (instr->type != nir_instr_type_alu) 4339 return false; 4340 4341 const nir_alu_instr *alu = nir_instr_as_alu(instr); 4342 4343 switch (alu->op) { 4344 case nir_op_pack_uvec2_to_uint: 4345 case nir_op_pack_uvec4_to_uint: 4346 return false; 4347 default: 4348 return true; 4349 } 4350} 4351 4352/* XXX: This is a kludge to workaround NIR's lack of divergence metadata. If we 4353 * keep divergence info around after we consume it for indirect lowering, 4354 * nir_convert_from_ssa will regress code quality since it will avoid 4355 * coalescing divergent with non-divergent nodes. */ 4356 4357static bool 4358nir_invalidate_divergence_ssa(nir_ssa_def *ssa, UNUSED void *data) 4359{ 4360 ssa->divergent = false; 4361 return true; 4362} 4363 4364static bool 4365nir_invalidate_divergence(struct nir_builder *b, nir_instr *instr, 4366 UNUSED void *data) 4367{ 4368 return nir_foreach_ssa_def(instr, nir_invalidate_divergence_ssa, NULL); 4369} 4370 4371/* Ensure we write exactly 4 components */ 4372static nir_ssa_def * 4373bifrost_nir_valid_channel(nir_builder *b, nir_ssa_def *in, 4374 unsigned channel, unsigned first, unsigned mask) 4375{ 4376 if (!(mask & BITFIELD_BIT(channel))) 4377 channel = first; 4378 4379 return nir_channel(b, in, channel); 4380} 4381 4382/* Lower fragment store_output instructions to always write 4 components, 4383 * matching the hardware semantic. 
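 * For example, a store_output with write mask 0x1 is rewritten into a vec4
 * store with write mask 0xF, replicating the one valid channel into the
 * otherwise-undefined lanes (see bifrost_nir_valid_channel above).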
This may require additional moves. Skipping 4384 * these moves is possible in theory, but invokes undefined behaviour in the 4385 * compiler. The DDK inserts these moves, so we will as well. */ 4386 4387static bool 4388bifrost_nir_lower_blend_components(struct nir_builder *b, 4389 nir_instr *instr, void *data) 4390{ 4391 if (instr->type != nir_instr_type_intrinsic) 4392 return false; 4393 4394 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 4395 4396 if (intr->intrinsic != nir_intrinsic_store_output) 4397 return false; 4398 4399 nir_ssa_def *in = intr->src[0].ssa; 4400 unsigned first = nir_intrinsic_component(intr); 4401 unsigned mask = nir_intrinsic_write_mask(intr); 4402 4403 assert(first == 0 && "shouldn't get nonzero components"); 4404 4405 /* Nothing to do */ 4406 if (mask == BITFIELD_MASK(4)) 4407 return false; 4408 4409 b->cursor = nir_before_instr(&intr->instr); 4410 4411 /* Replicate the first valid component instead */ 4412 nir_ssa_def *replicated = 4413 nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask), 4414 bifrost_nir_valid_channel(b, in, 1, first, mask), 4415 bifrost_nir_valid_channel(b, in, 2, first, mask), 4416 bifrost_nir_valid_channel(b, in, 3, first, mask)); 4417 4418 /* Rewrite to use our replicated version */ 4419 nir_instr_rewrite_src_ssa(instr, &intr->src[0], replicated); 4420 nir_intrinsic_set_component(intr, 0); 4421 nir_intrinsic_set_write_mask(intr, 0xF); 4422 intr->num_components = 4; 4423 4424 return true; 4425} 4426 4427static void 4428bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend) 4429{ 4430 bool progress; 4431 unsigned lower_flrp = 16 | 32 | 64; 4432 4433 NIR_PASS(progress, nir, nir_lower_regs_to_ssa); 4434 4435 nir_lower_tex_options lower_tex_options = { 4436 .lower_txs_lod = true, 4437 .lower_txp = ~0, 4438 .lower_tg4_broadcom_swizzle = true, 4439 .lower_txd = true, 4440 .lower_invalid_implicit_lod = true, 4441 }; 4442 4443 NIR_PASS(progress, nir, pan_nir_lower_64bit_intrin); 4444 NIR_PASS(progress, nir, pan_lower_helper_invocation); 4445 4446 NIR_PASS(progress, nir, nir_lower_int64); 4447 4448 nir_lower_idiv_options idiv_options = { 4449 .imprecise_32bit_lowering = true, 4450 .allow_fp16 = true, 4451 }; 4452 NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options); 4453 4454 NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options); 4455 NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL); 4456 NIR_PASS(progress, nir, nir_lower_load_const_to_scalar); 4457 4458 do { 4459 progress = false; 4460 4461 NIR_PASS(progress, nir, nir_lower_var_copies); 4462 NIR_PASS(progress, nir, nir_lower_vars_to_ssa); 4463 NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL); 4464 4465 NIR_PASS(progress, nir, nir_copy_prop); 4466 NIR_PASS(progress, nir, nir_opt_remove_phis); 4467 NIR_PASS(progress, nir, nir_opt_dce); 4468 NIR_PASS(progress, nir, nir_opt_dead_cf); 4469 NIR_PASS(progress, nir, nir_opt_cse); 4470 NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); 4471 NIR_PASS(progress, nir, nir_opt_algebraic); 4472 NIR_PASS(progress, nir, nir_opt_constant_folding); 4473 4474 NIR_PASS(progress, nir, nir_lower_alu); 4475 4476 if (lower_flrp != 0) { 4477 bool lower_flrp_progress = false; 4478 NIR_PASS(lower_flrp_progress, 4479 nir, 4480 nir_lower_flrp, 4481 lower_flrp, 4482 false /* always_precise */); 4483 if (lower_flrp_progress) { 4484 NIR_PASS(progress, nir, 4485 nir_opt_constant_folding); 4486 progress = true; 4487 } 4488 4489 /* Nothing should rematerialize any flrps, so we only 
4490 * need to do this lowering once. 4491 */ 4492 lower_flrp = 0; 4493 } 4494 4495 NIR_PASS(progress, nir, nir_opt_undef); 4496 NIR_PASS(progress, nir, nir_lower_undef_to_zero); 4497 4498 NIR_PASS(progress, nir, nir_opt_shrink_vectors); 4499 NIR_PASS(progress, nir, nir_opt_loop_unroll); 4500 } while (progress); 4501 4502 /* TODO: Why is 64-bit getting rematerialized? 4503 * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */ 4504 NIR_PASS(progress, nir, nir_lower_int64); 4505 4506 /* We need to cleanup after each iteration of late algebraic 4507 * optimizations, since otherwise NIR can produce weird edge cases 4508 * (like fneg of a constant) which we don't handle */ 4509 bool late_algebraic = true; 4510 while (late_algebraic) { 4511 late_algebraic = false; 4512 NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late); 4513 NIR_PASS(progress, nir, nir_opt_constant_folding); 4514 NIR_PASS(progress, nir, nir_copy_prop); 4515 NIR_PASS(progress, nir, nir_opt_dce); 4516 NIR_PASS(progress, nir, nir_opt_cse); 4517 } 4518 4519 NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL); 4520 NIR_PASS(progress, nir, nir_lower_phis_to_scalar, true); 4521 NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL); 4522 NIR_PASS(progress, nir, nir_lower_bool_to_bitsize); 4523 4524 /* Prepass to simplify instruction selection */ 4525 late_algebraic = false; 4526 NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late); 4527 4528 while (late_algebraic) { 4529 late_algebraic = false; 4530 NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late); 4531 NIR_PASS(progress, nir, nir_opt_constant_folding); 4532 NIR_PASS(progress, nir, nir_copy_prop); 4533 NIR_PASS(progress, nir, nir_opt_dce); 4534 NIR_PASS(progress, nir, nir_opt_cse); 4535 } 4536 4537 NIR_PASS(progress, nir, nir_lower_load_const_to_scalar); 4538 NIR_PASS(progress, nir, nir_opt_dce); 4539 4540 if (nir->info.stage == MESA_SHADER_FRAGMENT) { 4541 NIR_PASS_V(nir, nir_shader_instructions_pass, 4542 bifrost_nir_lower_blend_components, 4543 nir_metadata_block_index | nir_metadata_dominance, 4544 NULL); 4545 } 4546 4547 /* Backend scheduler is purely local, so do some global optimizations 4548 * to reduce register pressure. */ 4549 nir_move_options move_all = 4550 nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | 4551 nir_move_comparisons | nir_move_copies | nir_move_load_ssbo; 4552 4553 NIR_PASS_V(nir, nir_opt_sink, move_all); 4554 NIR_PASS_V(nir, nir_opt_move, move_all); 4555 4556 /* We might lower attribute, varying, and image indirects. Use the 4557 * gathered info to skip the extra analysis in the happy path. 
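 * (These flags are part of NIR's gathered shader info; when none of them are
 * set, the LCSSA conversion and divergence analysis below are skipped
 * entirely.)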
*/ 4558 bool any_indirects = 4559 nir->info.inputs_read_indirectly || 4560 nir->info.outputs_accessed_indirectly || 4561 nir->info.patch_inputs_read_indirectly || 4562 nir->info.patch_outputs_accessed_indirectly || 4563 nir->info.images_used[0]; 4564 4565 if (any_indirects) { 4566 nir_convert_to_lcssa(nir, true, true); 4567 NIR_PASS_V(nir, nir_divergence_analysis); 4568 NIR_PASS_V(nir, bi_lower_divergent_indirects, 4569 pan_subgroup_size(gpu_id >> 12)); 4570 NIR_PASS_V(nir, nir_shader_instructions_pass, 4571 nir_invalidate_divergence, nir_metadata_all, NULL); 4572 } 4573} 4574 4575/* The cmdstream lowers 8-bit fragment output as 16-bit, so we need to do the 4576 * same lowering here to zero-extend correctly */ 4577 4578static bool 4579bifrost_nir_lower_i8_fragout_impl(struct nir_builder *b, 4580 nir_intrinsic_instr *intr, UNUSED void *data) 4581{ 4582 if (nir_src_bit_size(intr->src[0]) != 8) 4583 return false; 4584 4585 nir_alu_type type = 4586 nir_alu_type_get_base_type(nir_intrinsic_src_type(intr)); 4587 4588 assert(type == nir_type_int || type == nir_type_uint); 4589 4590 b->cursor = nir_before_instr(&intr->instr); 4591 nir_ssa_def *cast = nir_convert_to_bit_size(b, intr->src[0].ssa, type, 16); 4592 4593 nir_intrinsic_set_src_type(intr, type | 16); 4594 nir_instr_rewrite_src_ssa(&intr->instr, &intr->src[0], cast); 4595 return true; 4596} 4597 4598static bool 4599bifrost_nir_lower_i8_fragin_impl(struct nir_builder *b, 4600 nir_intrinsic_instr *intr, UNUSED void *data) 4601{ 4602 if (nir_dest_bit_size(intr->dest) != 8) 4603 return false; 4604 4605 nir_alu_type type = 4606 nir_alu_type_get_base_type(nir_intrinsic_dest_type(intr)); 4607 4608 assert(type == nir_type_int || type == nir_type_uint); 4609 4610 b->cursor = nir_before_instr(&intr->instr); 4611 nir_ssa_def *out = 4612 nir_load_output(b, intr->num_components, 16, intr->src[0].ssa, 4613 .base = nir_intrinsic_base(intr), 4614 .component = nir_intrinsic_component(intr), 4615 .dest_type = type | 16, 4616 .io_semantics = nir_intrinsic_io_semantics(intr)); 4617 4618 nir_ssa_def *cast = nir_convert_to_bit_size(b, out, type, 8); 4619 nir_ssa_def_rewrite_uses(&intr->dest.ssa, cast); 4620 return true; 4621} 4622 4623static bool 4624bifrost_nir_lower_i8_frag(struct nir_builder *b, 4625 nir_instr *instr, UNUSED void *data) 4626{ 4627 if (instr->type != nir_instr_type_intrinsic) 4628 return false; 4629 4630 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 4631 if (intr->intrinsic == nir_intrinsic_load_output) 4632 return bifrost_nir_lower_i8_fragin_impl(b, intr, data); 4633 else if (intr->intrinsic == nir_intrinsic_store_output) 4634 return bifrost_nir_lower_i8_fragout_impl(b, intr, data); 4635 else 4636 return false; 4637} 4638 4639static void 4640bi_opt_post_ra(bi_context *ctx) 4641{ 4642 bi_foreach_instr_global_safe(ctx, ins) { 4643 if (ins->op == BI_OPCODE_MOV_I32 && bi_is_equiv(ins->dest[0], ins->src[0])) 4644 bi_remove_instruction(ins); 4645 } 4646} 4647 4648/* If the shader packs multiple varyings into the same location with different 4649 * location_frac, we'll need to lower to a single varying store that collects 4650 * all of the channels together. 
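 * For example, a store_output of component 0 followed by a store_output of
 * components 1-2 at the same location is merged into a single store of a
 * 3-wide vector with write mask 0x7, and the earlier store is removed.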
4651 */ 4652static bool 4653bifrost_nir_lower_store_component(struct nir_builder *b, 4654 nir_instr *instr, void *data) 4655{ 4656 if (instr->type != nir_instr_type_intrinsic) 4657 return false; 4658 4659 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 4660 4661 if (intr->intrinsic != nir_intrinsic_store_output) 4662 return false; 4663 4664 struct hash_table_u64 *slots = data; 4665 unsigned component = nir_intrinsic_component(intr); 4666 nir_src *slot_src = nir_get_io_offset_src(intr); 4667 uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr); 4668 4669 nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot); 4670 unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0); 4671 4672 nir_ssa_def *value = intr->src[0].ssa; 4673 b->cursor = nir_before_instr(&intr->instr); 4674 4675 nir_ssa_def *undef = nir_ssa_undef(b, 1, value->bit_size); 4676 nir_ssa_def *channels[4] = { undef, undef, undef, undef }; 4677 4678 /* Copy old */ 4679 u_foreach_bit(i, mask) { 4680 assert(prev != NULL); 4681 nir_ssa_def *prev_ssa = prev->src[0].ssa; 4682 channels[i] = nir_channel(b, prev_ssa, i); 4683 } 4684 4685 /* Copy new */ 4686 unsigned new_mask = nir_intrinsic_write_mask(intr); 4687 mask |= (new_mask << component); 4688 4689 u_foreach_bit(i, new_mask) { 4690 assert(component + i < 4); 4691 channels[component + i] = nir_channel(b, value, i); 4692 } 4693 4694 intr->num_components = util_last_bit(mask); 4695 nir_instr_rewrite_src_ssa(instr, &intr->src[0], 4696 nir_vec(b, channels, intr->num_components)); 4697 4698 nir_intrinsic_set_component(intr, 0); 4699 nir_intrinsic_set_write_mask(intr, mask); 4700 4701 if (prev) { 4702 _mesa_hash_table_u64_remove(slots, slot); 4703 nir_instr_remove(&prev->instr); 4704 } 4705 4706 _mesa_hash_table_u64_insert(slots, slot, intr); 4707 return false; 4708} 4709 4710/* Dead code elimination for branches at the end of a block - only one branch 4711 * per block is legal semantically, but unreachable jumps can be generated. 4712 * Likewise on Bifrost we can generate jumps to the terminal block which need 4713 * to be lowered away to a jump to #0x0, which induces successful termination. 4714 * That trick doesn't work on Valhall, which needs a NOP inserted in the 4715 * terminal block instead. 4716 */ 4717static void 4718bi_lower_branch(bi_context *ctx, bi_block *block) 4719{ 4720 bool cull_terminal = (ctx->arch <= 8); 4721 bool branched = false; 4722 ASSERTED bool was_jump = false; 4723 4724 bi_foreach_instr_in_block_safe(block, ins) { 4725 if (!ins->branch_target) continue; 4726 4727 if (branched) { 4728 assert(was_jump && (ins->op == BI_OPCODE_JUMP)); 4729 bi_remove_instruction(ins); 4730 continue; 4731 } 4732 4733 branched = true; 4734 was_jump = ins->op == BI_OPCODE_JUMP; 4735 4736 if (!bi_is_terminal_block(ins->branch_target)) 4737 continue; 4738 4739 if (cull_terminal) 4740 ins->branch_target = NULL; 4741 else if (ins->branch_target) 4742 ins->branch_target->needs_nop = true; 4743 } 4744} 4745 4746static void 4747bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset) 4748{ 4749 unsigned final_clause = bi_pack(ctx, binary); 4750 4751 /* If we need to wait for ATEST or BLEND in the first clause, pass the 4752 * corresponding bits through to the renderer state descriptor */ 4753 bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link); 4754 bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL); 4755 4756 unsigned first_deps = first_clause ? 
first_clause->dependencies : 0; 4757 ctx->info.bifrost->wait_6 = (first_deps & (1 << 6)); 4758 ctx->info.bifrost->wait_7 = (first_deps & (1 << 7)); 4759 4760 /* Pad the shader with enough zero bytes to trick the prefetcher, 4761 * unless we're compiling an empty shader (in which case we don't pad 4762 * so the size remains 0) */ 4763 unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause; 4764 4765 if (binary->size - offset) { 4766 memset(util_dynarray_grow(binary, uint8_t, prefetch_size), 4767 0, prefetch_size); 4768 } 4769} 4770 4771/* 4772 * Build a bit mask of varyings (by location) that are flatshaded. This 4773 * information is needed by lower_mediump_io, as we don't yet support 16-bit 4774 * flat varyings. 4775 * 4776 * Also varyings that are used as texture coordinates should be kept at fp32 so 4777 * the texture instruction may be promoted to VAR_TEX. In general this is a good 4778 * idea, as fp16 texture coordinates are not supported by the hardware and are 4779 * usually inappropriate. (There are both relevant CTS bugs here, even.) 4780 * 4781 * TODO: If we compacted the varyings with some fixup code in the vertex shader, 4782 * we could implement 16-bit flat varyings. Consider if this case matters. 4783 * 4784 * TODO: The texture coordinate handling could be less heavyhanded. 4785 */ 4786static bool 4787bi_gather_texcoords(nir_builder *b, nir_instr *instr, void *data) 4788{ 4789 uint64_t *mask = data; 4790 4791 if (instr->type != nir_instr_type_tex) 4792 return false; 4793 4794 nir_tex_instr *tex = nir_instr_as_tex(instr); 4795 4796 int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord); 4797 if (coord_idx < 0) 4798 return false; 4799 4800 nir_src src = tex->src[coord_idx].src; 4801 assert(src.is_ssa); 4802 4803 nir_ssa_scalar x = nir_ssa_scalar_resolved(src.ssa, 0); 4804 nir_ssa_scalar y = nir_ssa_scalar_resolved(src.ssa, 1); 4805 4806 if (x.def != y.def) 4807 return false; 4808 4809 nir_instr *parent = x.def->parent_instr; 4810 4811 if (parent->type != nir_instr_type_intrinsic) 4812 return false; 4813 4814 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent); 4815 4816 if (intr->intrinsic != nir_intrinsic_load_interpolated_input) 4817 return false; 4818 4819 nir_io_semantics sem = nir_intrinsic_io_semantics(intr); 4820 *mask |= BITFIELD64_BIT(sem.location); 4821 return false; 4822} 4823 4824static uint64_t 4825bi_fp32_varying_mask(nir_shader *nir) 4826{ 4827 uint64_t mask = 0; 4828 4829 assert(nir->info.stage == MESA_SHADER_FRAGMENT); 4830 4831 nir_foreach_shader_in_variable(var, nir) { 4832 if (var->data.interpolation == INTERP_MODE_FLAT) 4833 mask |= BITFIELD64_BIT(var->data.location); 4834 } 4835 4836 nir_shader_instructions_pass(nir, bi_gather_texcoords, nir_metadata_all, &mask); 4837 4838 return mask; 4839} 4840 4841static void 4842bi_finalize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend) 4843{ 4844 /* Lower gl_Position pre-optimisation, but after lowering vars to ssa 4845 * (so we don't accidentally duplicate the epilogue since mesa/st has 4846 * messed with our I/O quite a bit already) */ 4847 4848 NIR_PASS_V(nir, nir_lower_vars_to_ssa); 4849 4850 if (nir->info.stage == MESA_SHADER_VERTEX) { 4851 NIR_PASS_V(nir, nir_lower_viewport_transform); 4852 NIR_PASS_V(nir, nir_lower_point_size, 1.0, 0.0); 4853 4854 nir_variable *psiz = nir_find_variable_with_location(nir, 4855 nir_var_shader_out, 4856 VARYING_SLOT_PSIZ); 4857 if (psiz != NULL) 4858 psiz->data.precision = GLSL_PRECISION_MEDIUM; 4859 } 4860 4861 /* Get rid of any global vars before 
we lower to scratch. */ 4862 NIR_PASS_V(nir, nir_lower_global_vars_to_local); 4863 4864 /* Valhall introduces packed thread local storage, which improves cache 4865 * locality of TLS access. However, access to packed TLS cannot 4866 * straddle 16-byte boundaries. As such, when packed TLS is in use 4867 * (currently unconditional for Valhall), we force vec4 alignment for 4868 * scratch access. 4869 */ 4870 bool packed_tls = (gpu_id >= 0x9000); 4871 4872 /* Lower large arrays to scratch and small arrays to bcsel */ 4873 NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256, 4874 packed_tls ? 4875 glsl_get_vec4_size_align_bytes : 4876 glsl_get_natural_size_align_bytes); 4877 NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0); 4878 4879 NIR_PASS_V(nir, nir_split_var_copies); 4880 NIR_PASS_V(nir, nir_lower_var_copies); 4881 NIR_PASS_V(nir, nir_lower_vars_to_ssa); 4882 NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, 4883 glsl_type_size, 0); 4884 4885 /* nir_lower[_explicit]_io is lazy and emits mul+add chains even for 4886 * offsets it could figure out are constant. Do some constant folding 4887 * before bifrost_nir_lower_store_component below. 4888 */ 4889 NIR_PASS_V(nir, nir_opt_constant_folding); 4890 4891 if (nir->info.stage == MESA_SHADER_FRAGMENT) { 4892 NIR_PASS_V(nir, nir_lower_mediump_io, 4893 nir_var_shader_in | nir_var_shader_out, 4894 ~bi_fp32_varying_mask(nir), false); 4895 } else { 4896 if (gpu_id >= 0x9000) { 4897 NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out, 4898 BITFIELD64_BIT(VARYING_SLOT_PSIZ), false); 4899 } 4900 4901 struct hash_table_u64 *stores = _mesa_hash_table_u64_create(NULL); 4902 NIR_PASS_V(nir, nir_shader_instructions_pass, 4903 bifrost_nir_lower_store_component, 4904 nir_metadata_block_index | 4905 nir_metadata_dominance, stores); 4906 _mesa_hash_table_u64_destroy(stores); 4907 } 4908 4909 NIR_PASS_V(nir, nir_lower_ssbo); 4910 NIR_PASS_V(nir, pan_nir_lower_zs_store); 4911 NIR_PASS_V(nir, pan_lower_sample_pos); 4912 NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL); 4913 4914 if (nir->info.stage == MESA_SHADER_FRAGMENT) { 4915 NIR_PASS_V(nir, nir_shader_instructions_pass, 4916 bifrost_nir_lower_i8_frag, 4917 nir_metadata_block_index | nir_metadata_dominance, 4918 NULL); 4919 } 4920 4921 if (nir->xfb_info != NULL && nir->info.has_transform_feedback_varyings) { 4922 NIR_PASS_V(nir, nir_io_add_const_offset_to_base, 4923 nir_var_shader_in | nir_var_shader_out); 4924 NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info); 4925 NIR_PASS_V(nir, pan_lower_xfb); 4926 } 4927 4928 bi_optimize_nir(nir, gpu_id, is_blend); 4929} 4930 4931static bi_context * 4932bi_compile_variant_nir(nir_shader *nir, 4933 const struct panfrost_compile_inputs *inputs, 4934 struct util_dynarray *binary, 4935 struct hash_table_u64 *sysval_to_id, 4936 struct bi_shader_info info, 4937 enum bi_idvs_mode idvs) 4938{ 4939 bi_context *ctx = rzalloc(NULL, bi_context); 4940 4941 /* There may be another program in the dynarray, start at the end */ 4942 unsigned offset = binary->size; 4943 4944 ctx->sysval_to_id = sysval_to_id; 4945 ctx->inputs = inputs; 4946 ctx->nir = nir; 4947 ctx->stage = nir->info.stage; 4948 ctx->quirks = bifrost_get_quirks(inputs->gpu_id); 4949 ctx->arch = inputs->gpu_id >> 12; 4950 ctx->info = info; 4951 ctx->idvs = idvs; 4952 ctx->malloc_idvs = (ctx->arch >= 9) && !inputs->no_idvs; 4953 4954 if (idvs != BI_IDVS_NONE) { 4955 /* Specializing shaders for IDVS is destructive, so we need to 4956 * clone. 
However, the last (second) IDVS shader does not need 4957 * to be preserved so we can skip cloning that one. 4958 */ 4959 if (offset == 0) 4960 ctx->nir = nir = nir_shader_clone(ctx, nir); 4961 4962 NIR_PASS_V(nir, nir_shader_instructions_pass, 4963 bifrost_nir_specialize_idvs, 4964 nir_metadata_block_index | nir_metadata_dominance, 4965 &idvs); 4966 4967 /* After specializing, clean up the mess */ 4968 bool progress = true; 4969 4970 while (progress) { 4971 progress = false; 4972 4973 NIR_PASS(progress, nir, nir_opt_dce); 4974 NIR_PASS(progress, nir, nir_opt_dead_cf); 4975 } 4976 } 4977 4978 /* We can only go out-of-SSA after specializing IDVS, as opt_dead_cf 4979 * doesn't know how to deal with nir_register. 4980 */ 4981 NIR_PASS_V(nir, nir_convert_from_ssa, true); 4982 4983 /* If nothing is pushed, all UBOs need to be uploaded */ 4984 ctx->ubo_mask = ~0; 4985 4986 list_inithead(&ctx->blocks); 4987 4988 bool skip_internal = nir->info.internal; 4989 skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL); 4990 4991 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) { 4992 nir_print_shader(nir, stdout); 4993 } 4994 4995 ctx->allocated_vec = _mesa_hash_table_u64_create(ctx); 4996 4997 nir_foreach_function(func, nir) { 4998 if (!func->impl) 4999 continue; 5000 5001 ctx->ssa_alloc += func->impl->ssa_alloc; 5002 ctx->reg_alloc += func->impl->reg_alloc; 5003 5004 emit_cf_list(ctx, &func->impl->body); 5005 break; /* TODO: Multi-function shaders */ 5006 } 5007 5008 /* Index blocks now that we're done emitting */ 5009 bi_foreach_block(ctx, block) { 5010 block->index = ctx->num_blocks++; 5011 } 5012 5013 bi_validate(ctx, "NIR -> BIR"); 5014 5015 /* If the shader doesn't write any colour or depth outputs, it may 5016 * still need an ATEST at the very end! */ 5017 bool need_dummy_atest = 5018 (ctx->stage == MESA_SHADER_FRAGMENT) && 5019 !ctx->emitted_atest && 5020 !bi_skip_atest(ctx, false); 5021 5022 if (need_dummy_atest) { 5023 bi_block *end = list_last_entry(&ctx->blocks, bi_block, link); 5024 bi_builder b = bi_init_builder(ctx, bi_after_block(end)); 5025 bi_emit_atest(&b, bi_zero()); 5026 } 5027 5028 bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT); 5029 5030 /* Runs before constant folding */ 5031 bi_lower_swizzle(ctx); 5032 bi_validate(ctx, "Early lowering"); 5033 5034 /* Runs before copy prop */ 5035 if (optimize && !ctx->inputs->no_ubo_to_push) { 5036 bi_opt_push_ubo(ctx); 5037 } 5038 5039 if (likely(optimize)) { 5040 bi_opt_copy_prop(ctx); 5041 5042 while (bi_opt_constant_fold(ctx)) 5043 bi_opt_copy_prop(ctx); 5044 5045 bi_opt_mod_prop_forward(ctx); 5046 bi_opt_mod_prop_backward(ctx); 5047 5048 /* Push LD_VAR_IMM/VAR_TEX instructions.
Must run after 5049 * mod_prop_backward to fuse VAR_TEX */ 5050 if (ctx->arch == 7 && ctx->stage == MESA_SHADER_FRAGMENT && 5051 !(bifrost_debug & BIFROST_DBG_NOPRELOAD)) { 5052 bi_opt_dead_code_eliminate(ctx); 5053 bi_opt_message_preload(ctx); 5054 bi_opt_copy_prop(ctx); 5055 } 5056 5057 bi_opt_dead_code_eliminate(ctx); 5058 bi_opt_cse(ctx); 5059 bi_opt_dead_code_eliminate(ctx); 5060 if (!ctx->inputs->no_ubo_to_push) 5061 bi_opt_reorder_push(ctx); 5062 bi_validate(ctx, "Optimization passes"); 5063 } 5064 5065 bi_foreach_instr_global(ctx, I) { 5066 bi_lower_opt_instruction(I); 5067 } 5068 5069 if (ctx->arch >= 9) { 5070 va_optimize(ctx); 5071 5072 bi_foreach_instr_global_safe(ctx, I) { 5073 va_lower_isel(I); 5074 va_lower_constants(ctx, I); 5075 5076 bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); 5077 va_repair_fau(&b, I); 5078 } 5079 5080 /* We need to clean up after constant lowering */ 5081 if (likely(optimize)) { 5082 bi_opt_cse(ctx); 5083 bi_opt_dead_code_eliminate(ctx); 5084 } 5085 5086 bi_validate(ctx, "Valhall passes"); 5087 } 5088 5089 bi_foreach_block(ctx, block) { 5090 bi_lower_branch(ctx, block); 5091 } 5092 5093 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) 5094 bi_print_shader(ctx, stdout); 5095 5096 /* Analyze before register allocation to avoid false dependencies. The 5097 * skip bit is a function of only the data flow graph and is invariant 5098 * under valid scheduling. Helpers are only defined for fragment 5099 * shaders, so this analysis is only required in fragment shaders. 5100 */ 5101 if (ctx->stage == MESA_SHADER_FRAGMENT) 5102 bi_analyze_helper_requirements(ctx); 5103 5104 /* Fuse TEXC after analyzing helper requirements so the analysis 5105 * doesn't have to know about dual textures */ 5106 if (likely(optimize)) { 5107 bi_opt_fuse_dual_texture(ctx); 5108 } 5109 5110 /* Lower FAU after fusing dual texture, because fusing dual texture 5111 * creates new immediates that themselves may need lowering. 5112 */ 5113 if (ctx->arch <= 8) { 5114 bi_lower_fau(ctx); 5115 } 5116 5117 /* Lowering FAU can create redundant moves. Run CSE+DCE to clean up. */ 5118 if (likely(optimize)) { 5119 bi_opt_cse(ctx); 5120 bi_opt_dead_code_eliminate(ctx); 5121 } 5122 5123 if (likely(!(bifrost_debug & BIFROST_DBG_NOPSCHED))) 5124 bi_pressure_schedule(ctx); 5125 5126 bi_validate(ctx, "Late lowering"); 5127 5128 bi_register_allocate(ctx); 5129 5130 if (likely(optimize)) 5131 bi_opt_post_ra(ctx); 5132 5133 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) 5134 bi_print_shader(ctx, stdout); 5135 5136 if (ctx->arch >= 9) { 5137 va_assign_slots(ctx); 5138 va_insert_flow_control_nops(ctx); 5139 va_merge_flow(ctx); 5140 va_mark_last(ctx); 5141 } else { 5142 bi_schedule(ctx); 5143 bi_assign_scoreboard(ctx); 5144 5145 /* Analyze after scheduling since we depend on instruction 5146 * order. Valhall calls as part of va_insert_flow_control_nops, 5147 * as the handling for clauses differs from instructions. 
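 * On Bifrost that is the pair of calls below: the helper-terminate analysis
 * determines where helper invocations are no longer needed, and
 * bi_mark_clauses_td records the result on the clauses.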
5148 */ 5149 bi_analyze_helper_terminate(ctx); 5150 bi_mark_clauses_td(ctx); 5151 } 5152 5153 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) 5154 bi_print_shader(ctx, stdout); 5155 5156 if (ctx->arch <= 8) { 5157 bi_pack_clauses(ctx, binary, offset); 5158 } else { 5159 bi_pack_valhall(ctx, binary); 5160 } 5161 5162 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) { 5163 if (ctx->arch <= 8) { 5164 disassemble_bifrost(stdout, binary->data + offset, 5165 binary->size - offset, 5166 bifrost_debug & BIFROST_DBG_VERBOSE); 5167 } else { 5168 disassemble_valhall(stdout, binary->data + offset, 5169 binary->size - offset, 5170 bifrost_debug & BIFROST_DBG_VERBOSE); 5171 } 5172 5173 fflush(stdout); 5174 } 5175 5176 if ((bifrost_debug & BIFROST_DBG_SHADERDB || inputs->shaderdb) && 5177 !skip_internal) { 5178 if (ctx->arch >= 9) { 5179 va_print_stats(ctx, binary->size - offset, stderr); 5180 } else { 5181 bi_print_stats(ctx, binary->size - offset, stderr); 5182 } 5183 } 5184 5185 return ctx; 5186} 5187 5188static void 5189bi_compile_variant(nir_shader *nir, 5190 const struct panfrost_compile_inputs *inputs, 5191 struct util_dynarray *binary, 5192 struct hash_table_u64 *sysval_to_id, 5193 struct pan_shader_info *info, 5194 enum bi_idvs_mode idvs) 5195{ 5196 struct bi_shader_info local_info = { 5197 .push = &info->push, 5198 .bifrost = &info->bifrost, 5199 .tls_size = info->tls_size, 5200 .sysvals = &info->sysvals, 5201 .push_offset = info->push.count 5202 }; 5203 5204 unsigned offset = binary->size; 5205 5206 /* If there is no position shader (gl_Position is not written), then 5207 * there is no need to build a varying shader either. This case is hit 5208 * for transform feedback only vertex shaders which only make sense with 5209 * rasterizer discard. 5210 */ 5211 if ((offset == 0) && (idvs == BI_IDVS_VARYING)) 5212 return; 5213 5214 /* Software invariant: Only a secondary shader can appear at a nonzero 5215 * offset, to keep the ABI simple. */ 5216 assert((offset == 0) ^ (idvs == BI_IDVS_VARYING)); 5217 5218 bi_context *ctx = bi_compile_variant_nir(nir, inputs, binary, sysval_to_id, local_info, idvs); 5219 5220 /* A register is preloaded <==> it is live before the first block */ 5221 bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link); 5222 uint64_t preload = first_block->reg_live_in; 5223 5224 /* If multisampling is used with a blend shader, the blend shader needs 5225 * to access the sample coverage mask in r60 and the sample ID in r61. 5226 * Blend shaders run in the same context as fragment shaders, so if a 5227 * blend shader could run, we need to preload these registers 5228 * conservatively. There is believed to be little cost to doing so, so 5229 * do so always to avoid variants of the preload descriptor. 5230 * 5231 * We only do this on Valhall, as Bifrost has to update the RSD for 5232 * multisampling w/ blend shader anyway, so this is handled in the 5233 * driver. We could unify the paths if the cost is acceptable. 
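 * (The base preload mask is derived above from reg_live_in of the first
 * block; the Valhall fragment case below merely ORs in r60/r61.)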
5234 */ 5235 if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9) 5236 preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61); 5237 5238 info->ubo_mask |= ctx->ubo_mask; 5239 info->tls_size = MAX2(info->tls_size, ctx->info.tls_size); 5240 5241 if (idvs == BI_IDVS_VARYING) { 5242 info->vs.secondary_enable = (binary->size > offset); 5243 info->vs.secondary_offset = offset; 5244 info->vs.secondary_preload = preload; 5245 info->vs.secondary_work_reg_count = ctx->info.work_reg_count; 5246 } else { 5247 info->preload = preload; 5248 info->work_reg_count = ctx->info.work_reg_count; 5249 } 5250 5251 if (idvs == BI_IDVS_POSITION && 5252 !nir->info.internal && 5253 nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) { 5254 /* Find the psiz write */ 5255 bi_instr *write = NULL; 5256 5257 bi_foreach_instr_global(ctx, I) { 5258 if (I->op == BI_OPCODE_STORE_I16 && I->seg == BI_SEG_POS) { 5259 write = I; 5260 break; 5261 } 5262 } 5263 5264 assert(write != NULL); 5265 5266 /* NOP it out, preserving its flow control. TODO: maybe DCE */ 5267 if (write->flow) { 5268 bi_builder b = bi_init_builder(ctx, bi_before_instr(write)); 5269 bi_instr *nop = bi_nop(&b); 5270 nop->flow = write->flow; 5271 } 5272 5273 bi_remove_instruction(write); 5274 5275 info->vs.no_psiz_offset = binary->size; 5276 bi_pack_valhall(ctx, binary); 5277 } 5278 5279 ralloc_free(ctx); 5280} 5281 5282/* Decide if Index-Driven Vertex Shading should be used for a given shader */ 5283static bool 5284bi_should_idvs(nir_shader *nir, const struct panfrost_compile_inputs *inputs) 5285{ 5286 /* Opt-out */ 5287 if (inputs->no_idvs || bifrost_debug & BIFROST_DBG_NOIDVS) 5288 return false; 5289 5290 /* IDVS splits up vertex shaders, not defined on other shader stages */ 5291 if (nir->info.stage != MESA_SHADER_VERTEX) 5292 return false; 5293 5294 /* Bifrost cannot write gl_PointSize during IDVS */ 5295 if ((inputs->gpu_id < 0x9000) && 5296 nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) 5297 return false; 5298 5299 /* Otherwise, IDVS is usually better */ 5300 return true; 5301} 5302 5303void 5304bifrost_compile_shader_nir(nir_shader *nir, 5305 const struct panfrost_compile_inputs *inputs, 5306 struct util_dynarray *binary, 5307 struct pan_shader_info *info) 5308{ 5309 bifrost_debug = debug_get_option_bifrost_debug(); 5310 5311 bi_finalize_nir(nir, inputs->gpu_id, inputs->is_blend); 5312 struct hash_table_u64 *sysval_to_id = 5313 panfrost_init_sysvals(&info->sysvals, 5314 inputs->fixed_sysval_layout, 5315 NULL); 5316 5317 info->tls_size = nir->scratch_size; 5318 info->vs.idvs = bi_should_idvs(nir, inputs); 5319 5320 if (info->vs.idvs) { 5321 bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_POSITION); 5322 bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_VARYING); 5323 } else { 5324 bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_NONE); 5325 } 5326 5327 if (gl_shader_stage_is_compute(nir->info.stage)) { 5328 /* Workgroups may be merged if the structure of the workgroup is 5329 * not software visible. This is true if neither shared memory 5330 * nor barriers are used. The hardware may be able to optimize 5331 * compute shaders that set this flag. 5332 */ 5333 info->cs.allow_merging_workgroups = 5334 (nir->info.shared_size == 0) && 5335 !nir->info.uses_control_barrier && 5336 !nir->info.uses_memory_barrier; 5337 } 5338 5339 info->ubo_mask &= (1 << nir->info.num_ubos) - 1; 5340 5341 _mesa_hash_table_u64_destroy(sysval_to_id); 5342} 5343
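/*
 * Illustrative sketch only, not part of the compiler: roughly how a caller is
 * expected to drive bifrost_compile_shader_nir above. The real callers live
 * in the Panfrost drivers; the function and variable names here are examples
 * and the block is kept under #if 0 so it is never built.
 */
#if 0
static void
example_compile(nir_shader *nir, unsigned gpu_id)
{
   /* arch = gpu_id >> 12; arch >= 9 selects the Valhall backend */
   struct panfrost_compile_inputs inputs = {
      .gpu_id = gpu_id,
      .is_blend = false,
   };

   struct util_dynarray binary;
   struct pan_shader_info info = {0};
   util_dynarray_init(&binary, NULL);

   bifrost_compile_shader_nir(nir, &inputs, &binary, &info);

   /* binary.data / binary.size now hold the machine code; info describes
    * push constants, sysvals, register/TLS usage and, for IDVS, the offset
    * of the secondary (varying) shader within the binary. */

   util_dynarray_fini(&binary);
}
#endif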