1/* 2 * Copyright © 2014-2015 Broadcom 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "compiler/nir/nir.h" 25#include "compiler/nir/nir_deref.h" 26#include "compiler/nir/nir_worklist.h" 27#include "nir/nir_to_tgsi.h" 28#include "pipe/p_screen.h" 29#include "pipe/p_state.h" 30#include "tgsi/tgsi_dump.h" 31#include "tgsi/tgsi_from_mesa.h" 32#include "tgsi/tgsi_info.h" 33#include "tgsi/tgsi_parse.h" 34#include "tgsi/tgsi_ureg.h" 35#include "tgsi/tgsi_util.h" 36#include "util/debug.h" 37#include "util/u_math.h" 38#include "util/u_memory.h" 39#include "util/u_dynarray.h" 40 41struct ntt_insn { 42 enum tgsi_opcode opcode; 43 struct ureg_dst dst[2]; 44 struct ureg_src src[4]; 45 enum tgsi_texture_type tex_target; 46 enum tgsi_return_type tex_return_type; 47 struct tgsi_texture_offset tex_offset; 48 49 unsigned mem_qualifier; 50 enum pipe_format mem_format; 51 52 bool is_tex : 1; 53 bool is_mem : 1; 54 bool precise : 1; 55}; 56 57struct ntt_block { 58 /* Array of struct ntt_insn */ 59 struct util_dynarray insns; 60 int start_ip; 61 int end_ip; 62}; 63 64struct ntt_reg_interval { 65 uint32_t start, end; 66}; 67 68struct ntt_compile { 69 nir_shader *s; 70 nir_function_impl *impl; 71 const struct nir_to_tgsi_options *options; 72 struct pipe_screen *screen; 73 struct ureg_program *ureg; 74 75 bool needs_texcoord_semantic; 76 bool native_integers; 77 bool has_txf_lz; 78 79 bool addr_declared[3]; 80 struct ureg_dst addr_reg[3]; 81 82 /* if condition set up at the end of a block, for ntt_emit_if(). */ 83 struct ureg_src if_cond; 84 85 /* TGSI temps for our NIR SSA and register values. */ 86 struct ureg_dst *reg_temp; 87 struct ureg_src *ssa_temp; 88 89 struct ntt_reg_interval *liveness; 90 91 /* Map from nir_block to ntt_block */ 92 struct hash_table *blocks; 93 struct ntt_block *cur_block; 94 unsigned current_if_else; 95 unsigned cf_label; 96 97 /* Whether we're currently emitting instructiosn for a precise NIR instruction. */ 98 bool precise; 99 100 unsigned num_temps; 101 unsigned first_non_array_temp; 102 103 /* Mappings from driver_location to TGSI input/output number. 104 * 105 * We'll be declaring TGSI input/outputs in an arbitrary order, and they get 106 * their numbers assigned incrementally, unlike inputs or constants. 
107 */ 108 struct ureg_src *input_index_map; 109 uint64_t centroid_inputs; 110 111 uint32_t first_ubo; 112 113 struct ureg_src images[PIPE_MAX_SHADER_IMAGES]; 114}; 115 116static struct ureg_dst 117ntt_temp(struct ntt_compile *c) 118{ 119 return ureg_dst_register(TGSI_FILE_TEMPORARY, c->num_temps++); 120} 121 122static struct ntt_block * 123ntt_block_from_nir(struct ntt_compile *c, struct nir_block *block) 124{ 125 struct hash_entry *entry = _mesa_hash_table_search(c->blocks, block); 126 return entry->data; 127} 128 129static void ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list); 130static void ntt_emit_cf_list_ureg(struct ntt_compile *c, struct exec_list *list); 131 132static struct ntt_insn * 133ntt_insn(struct ntt_compile *c, enum tgsi_opcode opcode, 134 struct ureg_dst dst, 135 struct ureg_src src0, struct ureg_src src1, 136 struct ureg_src src2, struct ureg_src src3) 137{ 138 struct ntt_insn insn = { 139 .opcode = opcode, 140 .dst = { dst, ureg_dst_undef() }, 141 .src = { src0, src1, src2, src3 }, 142 .precise = c->precise, 143 }; 144 util_dynarray_append(&c->cur_block->insns, struct ntt_insn, insn); 145 return util_dynarray_top_ptr(&c->cur_block->insns, struct ntt_insn); 146} 147 148#define OP00( op ) \ 149static inline void ntt_##op(struct ntt_compile *c) \ 150{ \ 151 ntt_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \ 152} 153 154#define OP01( op ) \ 155static inline void ntt_##op(struct ntt_compile *c, \ 156 struct ureg_src src0) \ 157{ \ 158 ntt_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \ 159} 160 161 162#define OP10( op ) \ 163static inline void ntt_##op(struct ntt_compile *c, \ 164 struct ureg_dst dst) \ 165{ \ 166 ntt_insn(c, TGSI_OPCODE_##op, dst, ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \ 167} 168 169#define OP11( op ) \ 170static inline void ntt_##op(struct ntt_compile *c, \ 171 struct ureg_dst dst, \ 172 struct ureg_src src0) \ 173{ \ 174 ntt_insn(c, TGSI_OPCODE_##op, dst, src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \ 175} 176 177#define OP12( op ) \ 178static inline void ntt_##op(struct ntt_compile *c, \ 179 struct ureg_dst dst, \ 180 struct ureg_src src0, \ 181 struct ureg_src src1) \ 182{ \ 183 ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, ureg_src_undef(), ureg_src_undef()); \ 184} 185 186#define OP13( op ) \ 187static inline void ntt_##op(struct ntt_compile *c, \ 188 struct ureg_dst dst, \ 189 struct ureg_src src0, \ 190 struct ureg_src src1, \ 191 struct ureg_src src2) \ 192{ \ 193 ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, ureg_src_undef()); \ 194} 195 196#define OP14( op ) \ 197static inline void ntt_##op(struct ntt_compile *c, \ 198 struct ureg_dst dst, \ 199 struct ureg_src src0, \ 200 struct ureg_src src1, \ 201 struct ureg_src src2, \ 202 struct ureg_src src3) \ 203{ \ 204 ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, src3); \ 205} 206 207/* We hand-craft our tex instructions */ 208#define OP12_TEX(op) 209#define OP14_TEX(op) 210 211/* Use a template include to generate a correctly-typed ntt_OP() 212 * function for each TGSI opcode: 213 */ 214#include "gallium/auxiliary/tgsi/tgsi_opcode_tmp.h" 215 216/** 217 * Interprets a nir_load_const used as a NIR src as a uint. 
218 * 219 * For non-native-integers drivers, nir_load_const_instrs used by an integer ALU 220 * instruction (or in a phi-web used by an integer ALU instruction) were 221 * converted to floats and the ALU instruction swapped to the float equivalent. 222 * However, this means that integer load_consts used by intrinsics (which don't 223 * normally get that conversion) may have been reformatted to be floats. Given 224 * that all of our intrinsic nir_src_as_uint() calls are expected to be small, 225 * we can just look and see if they look like floats and convert them back to 226 * ints. 227 */ 228static uint32_t 229ntt_src_as_uint(struct ntt_compile *c, nir_src src) 230{ 231 uint32_t val = nir_src_as_uint(src); 232 if (!c->native_integers && val >= fui(1.0)) 233 val = (uint32_t)uif(val); 234 return val; 235} 236 237static unsigned 238ntt_64bit_write_mask(unsigned write_mask) 239{ 240 return ((write_mask & 1) ? 0x3 : 0) | ((write_mask & 2) ? 0xc : 0); 241} 242 243static struct ureg_src 244ntt_64bit_1f(struct ntt_compile *c) 245{ 246 return ureg_imm4u(c->ureg, 247 0x00000000, 0x3ff00000, 248 0x00000000, 0x3ff00000); 249} 250 251/* Per-channel masks of def/use within the block, and the per-channel 252 * livein/liveout for the block as a whole. 253 */ 254struct ntt_live_reg_block_state { 255 uint8_t *def, *use, *livein, *liveout, *defin, *defout; 256}; 257 258struct ntt_live_reg_state { 259 unsigned bitset_words; 260 261 struct ntt_reg_interval *regs; 262 263 /* Used in propagate_across_edge() */ 264 BITSET_WORD *tmp_live; 265 266 struct ntt_live_reg_block_state *blocks; 267 268 nir_block_worklist worklist; 269}; 270 271static void 272ntt_live_reg_mark_use(struct ntt_compile *c, struct ntt_live_reg_block_state *bs, 273 int ip, unsigned index, unsigned used_mask) 274{ 275 bs->use[index] |= used_mask & ~bs->def[index]; 276 277 c->liveness[index].start = MIN2(c->liveness[index].start, ip); 278 c->liveness[index].end = MAX2(c->liveness[index].end, ip); 279 280} 281static void 282ntt_live_reg_setup_def_use(struct ntt_compile *c, nir_function_impl *impl, struct ntt_live_reg_state *state) 283{ 284 for (int i = 0; i < impl->num_blocks; i++) { 285 state->blocks[i].def = rzalloc_array(state->blocks, uint8_t, c->num_temps); 286 state->blocks[i].defin = rzalloc_array(state->blocks, uint8_t, c->num_temps); 287 state->blocks[i].defout = rzalloc_array(state->blocks, uint8_t, c->num_temps); 288 state->blocks[i].use = rzalloc_array(state->blocks, uint8_t, c->num_temps); 289 state->blocks[i].livein = rzalloc_array(state->blocks, uint8_t, c->num_temps); 290 state->blocks[i].liveout = rzalloc_array(state->blocks, uint8_t, c->num_temps); 291 } 292 293 int ip = 0; 294 nir_foreach_block(block, impl) { 295 struct ntt_live_reg_block_state *bs = &state->blocks[block->index]; 296 struct ntt_block *ntt_block = ntt_block_from_nir(c, block); 297 298 ntt_block->start_ip = ip; 299 300 util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) { 301 const struct tgsi_opcode_info *opcode_info = 302 tgsi_get_opcode_info(insn->opcode); 303 304 /* Set up use[] for the srcs. 305 * 306 * Uses are the channels of the reg read in the block that don't have a 307 * preceding def to screen them off. Note that we don't do per-element 308 * tracking of array regs, so they're never screened off. 
309 */ 310 for (int i = 0; i < opcode_info->num_src; i++) { 311 if (insn->src[i].File != TGSI_FILE_TEMPORARY) 312 continue; 313 int index = insn->src[i].Index; 314 315 uint32_t used_mask = tgsi_util_get_src_usage_mask(insn->opcode, i, 316 insn->dst->WriteMask, 317 insn->src[i].SwizzleX, 318 insn->src[i].SwizzleY, 319 insn->src[i].SwizzleZ, 320 insn->src[i].SwizzleW, 321 insn->tex_target, 322 insn->tex_target); 323 324 assert(!insn->src[i].Indirect || index < c->first_non_array_temp); 325 ntt_live_reg_mark_use(c, bs, ip, index, used_mask); 326 } 327 328 if (insn->is_tex && insn->tex_offset.File == TGSI_FILE_TEMPORARY) 329 ntt_live_reg_mark_use(c, bs, ip, insn->tex_offset.Index, 0xf); 330 331 /* Set up def[] for the srcs. 332 * 333 * Defs are the unconditionally-written (not R/M/W) channels of the reg in 334 * the block that don't have a preceding use. 335 */ 336 for (int i = 0; i < opcode_info->num_dst; i++) { 337 if (insn->dst[i].File != TGSI_FILE_TEMPORARY) 338 continue; 339 int index = insn->dst[i].Index; 340 uint32_t writemask = insn->dst[i].WriteMask; 341 342 bs->def[index] |= writemask & ~bs->use[index]; 343 bs->defout[index] |= writemask; 344 345 assert(!insn->dst[i].Indirect || index < c->first_non_array_temp); 346 c->liveness[index].start = MIN2(c->liveness[index].start, ip); 347 c->liveness[index].end = MAX2(c->liveness[index].end, ip); 348 } 349 ip++; 350 } 351 352 ntt_block->end_ip = ip; 353 } 354} 355 356static void 357ntt_live_regs(struct ntt_compile *c, nir_function_impl *impl) 358{ 359 nir_metadata_require(impl, nir_metadata_block_index); 360 361 c->liveness = rzalloc_array(c, struct ntt_reg_interval, c->num_temps); 362 363 struct ntt_live_reg_state state = { 364 .blocks = rzalloc_array(impl, struct ntt_live_reg_block_state, impl->num_blocks), 365 }; 366 367 /* The intervals start out with start > end (indicating unused) */ 368 for (int i = 0; i < c->num_temps; i++) 369 c->liveness[i].start = ~0; 370 371 ntt_live_reg_setup_def_use(c, impl, &state); 372 373 /* Make a forward-order worklist of all the blocks. */ 374 nir_block_worklist_init(&state.worklist, impl->num_blocks, NULL); 375 nir_foreach_block(block, impl) { 376 nir_block_worklist_push_tail(&state.worklist, block); 377 } 378 379 /* Propagate defin/defout down the CFG to calculate the live variables 380 * potentially defined along any possible control flow path. We'll use this 381 * to keep things like conditional defs of the reg (or array regs where we 382 * don't track defs!) from making the reg's live range extend back to the 383 * start of the program. 384 */ 385 while (!nir_block_worklist_is_empty(&state.worklist)) { 386 nir_block *block = nir_block_worklist_pop_head(&state.worklist); 387 for (int j = 0; j < ARRAY_SIZE(block->successors); j++) { 388 nir_block *succ = block->successors[j]; 389 if (!succ || succ->index == impl->num_blocks) 390 continue; 391 392 for (int i = 0; i < c->num_temps; i++) { 393 uint8_t new_def = state.blocks[block->index].defout[i] & ~state.blocks[succ->index].defin[i]; 394 395 if (new_def) { 396 state.blocks[succ->index].defin[i] |= new_def; 397 state.blocks[succ->index].defout[i] |= new_def; 398 nir_block_worklist_push_tail(&state.worklist, succ); 399 } 400 } 401 } 402 } 403 404 /* Make a reverse-order worklist of all the blocks. */ 405 nir_foreach_block(block, impl) { 406 nir_block_worklist_push_head(&state.worklist, block); 407 } 408 409 /* We're now ready to work through the worklist and update the liveness sets 410 * of each of the blocks. 
As long as we keep the worklist up-to-date as we 411 * go, everything will get covered. 412 */ 413 while (!nir_block_worklist_is_empty(&state.worklist)) { 414 /* We pop them off in the reverse order we pushed them on. This way 415 * the first walk of the instructions is backwards so we only walk 416 * once in the case of no control flow. 417 */ 418 nir_block *block = nir_block_worklist_pop_head(&state.worklist); 419 struct ntt_block *ntt_block = ntt_block_from_nir(c, block); 420 struct ntt_live_reg_block_state *bs = &state.blocks[block->index]; 421 422 for (int i = 0; i < c->num_temps; i++) { 423 /* Collect livein from our successors to include in our liveout. */ 424 for (int j = 0; j < ARRAY_SIZE(block->successors); j++) { 425 nir_block *succ = block->successors[j]; 426 if (!succ || succ->index == impl->num_blocks) 427 continue; 428 struct ntt_live_reg_block_state *sbs = &state.blocks[succ->index]; 429 430 uint8_t new_liveout = sbs->livein[i] & ~bs->liveout[i]; 431 if (new_liveout) { 432 if (state.blocks[block->index].defout[i]) 433 c->liveness[i].end = MAX2(c->liveness[i].end, ntt_block->end_ip); 434 bs->liveout[i] |= sbs->livein[i]; 435 } 436 } 437 438 /* Propagate use requests from either our block's uses or our 439 * non-screened-off liveout up to our predecessors. 440 */ 441 uint8_t new_livein = ((bs->use[i] | (bs->liveout[i] & ~bs->def[i])) & 442 ~bs->livein[i]); 443 if (new_livein) { 444 bs->livein[i] |= new_livein; 445 set_foreach(block->predecessors, entry) { 446 nir_block *pred = (void *)entry->key; 447 nir_block_worklist_push_tail(&state.worklist, pred); 448 } 449 450 if (new_livein & state.blocks[block->index].defin[i]) 451 c->liveness[i].start = MIN2(c->liveness[i].start, ntt_block->start_ip); 452 } 453 } 454 } 455 456 ralloc_free(state.blocks); 457 nir_block_worklist_fini(&state.worklist); 458} 459 460static void 461ntt_ra_check(struct ntt_compile *c, unsigned *ra_map, BITSET_WORD *released, int ip, unsigned index) 462{ 463 if (index < c->first_non_array_temp) 464 return; 465 466 if (c->liveness[index].start == ip && ra_map[index] == ~0) 467 ra_map[index] = ureg_DECL_temporary(c->ureg).Index; 468 469 if (c->liveness[index].end == ip && !BITSET_TEST(released, index)) { 470 ureg_release_temporary(c->ureg, ureg_dst_register(TGSI_FILE_TEMPORARY, ra_map[index])); 471 BITSET_SET(released, index); 472 } 473} 474 475static void 476ntt_allocate_regs(struct ntt_compile *c, nir_function_impl *impl) 477{ 478 ntt_live_regs(c, impl); 479 480 unsigned *ra_map = ralloc_array(c, unsigned, c->num_temps); 481 unsigned *released = rzalloc_array(c, BITSET_WORD, BITSET_WORDS(c->num_temps)); 482 483 /* No RA on NIR array regs */ 484 for (int i = 0; i < c->first_non_array_temp; i++) 485 ra_map[i] = i; 486 487 for (int i = c->first_non_array_temp; i < c->num_temps; i++) 488 ra_map[i] = ~0; 489 490 int ip = 0; 491 nir_foreach_block(block, impl) { 492 struct ntt_block *ntt_block = ntt_block_from_nir(c, block); 493 494 for (int i = 0; i < c->num_temps; i++) 495 ntt_ra_check(c, ra_map, released, ip, i); 496 497 util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) { 498 const struct tgsi_opcode_info *opcode_info = 499 tgsi_get_opcode_info(insn->opcode); 500 501 for (int i = 0; i < opcode_info->num_src; i++) { 502 if (insn->src[i].File == TGSI_FILE_TEMPORARY) { 503 ntt_ra_check(c, ra_map, released, ip, insn->src[i].Index); 504 insn->src[i].Index = ra_map[insn->src[i].Index]; 505 } 506 } 507 508 if (insn->is_tex && insn->tex_offset.File == TGSI_FILE_TEMPORARY) { 509 ntt_ra_check(c, ra_map, 
released, ip, insn->tex_offset.Index); 510 insn->tex_offset.Index = ra_map[insn->tex_offset.Index]; 511 } 512 513 for (int i = 0; i < opcode_info->num_dst; i++) { 514 if (insn->dst[i].File == TGSI_FILE_TEMPORARY) { 515 ntt_ra_check(c, ra_map, released, ip, insn->dst[i].Index); 516 insn->dst[i].Index = ra_map[insn->dst[i].Index]; 517 } 518 } 519 ip++; 520 } 521 522 for (int i = 0; i < c->num_temps; i++) 523 ntt_ra_check(c, ra_map, released, ip, i); 524 } 525} 526 527static void 528ntt_allocate_regs_unoptimized(struct ntt_compile *c, nir_function_impl *impl) 529{ 530 for (int i = c->first_non_array_temp; i < c->num_temps; i++) 531 ureg_DECL_temporary(c->ureg); 532} 533 534 535/** 536 * Try to find an iadd of a constant value with a non-constant value in the 537 * nir_src's first component, returning the constant offset and replacing *src 538 * with the non-constant component. 539 */ 540static const uint32_t 541ntt_extract_const_src_offset(nir_src *src) 542{ 543 if (!src->is_ssa) 544 return 0; 545 546 nir_ssa_scalar s = nir_get_ssa_scalar(src->ssa, 0); 547 548 while (nir_ssa_scalar_is_alu(s)) { 549 nir_alu_instr *alu = nir_instr_as_alu(s.def->parent_instr); 550 551 for (int i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { 552 if (!alu->src[i].src.is_ssa) 553 return 0; 554 } 555 556 if (alu->op == nir_op_iadd) { 557 for (int i = 0; i < 2; i++) { 558 nir_const_value *v = nir_src_as_const_value(alu->src[i].src); 559 if (v && !alu->src[i].negate && !alu->src[i].abs) { 560 *src = alu->src[1 - i].src; 561 return v[alu->src[i].swizzle[s.comp]].u32; 562 } 563 } 564 565 return 0; 566 } 567 568 /* We'd like to reuse nir_ssa_scalar_chase_movs(), but it assumes SSA and that 569 * seems reasonable for something used in inner loops of the compiler. 570 */ 571 if (!nir_alu_instr_is_copy(alu)) 572 return 0; 573 574 if (alu->op == nir_op_mov) { 575 s.def = alu->src[0].src.ssa; 576 s.comp = alu->src[0].swizzle[s.comp]; 577 } else if (nir_op_is_vec(alu->op)) { 578 s.def = alu->src[s.comp].src.ssa; 579 s.comp = alu->src[s.comp].swizzle[0]; 580 } else { 581 return 0; 582 } 583 } 584 585 return 0; 586} 587 588static const struct glsl_type * 589ntt_shader_input_type(struct ntt_compile *c, 590 struct nir_variable *var) 591{ 592 switch (c->s->info.stage) { 593 case MESA_SHADER_GEOMETRY: 594 case MESA_SHADER_TESS_EVAL: 595 case MESA_SHADER_TESS_CTRL: 596 if (glsl_type_is_array(var->type)) 597 return glsl_get_array_element(var->type); 598 else 599 return var->type; 600 default: 601 return var->type; 602 } 603} 604 605static void 606ntt_get_gl_varying_semantic(struct ntt_compile *c, unsigned location, 607 unsigned *semantic_name, unsigned *semantic_index) 608{ 609 /* We want to use most of tgsi_get_gl_varying_semantic(), but the 610 * !texcoord shifting has already been applied, so avoid that. 611 */ 612 if (!c->needs_texcoord_semantic && 613 (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_PATCH0)) { 614 *semantic_name = TGSI_SEMANTIC_GENERIC; 615 *semantic_index = location - VARYING_SLOT_VAR0; 616 return; 617 } 618 619 tgsi_get_gl_varying_semantic(location, true, 620 semantic_name, semantic_index); 621} 622 623/* TGSI varying declarations have a component usage mask associated (used by 624 * r600 and svga). 
625 */ 626static uint32_t 627ntt_tgsi_usage_mask(unsigned start_component, unsigned num_components, 628 bool is_64) 629{ 630 uint32_t usage_mask = 631 u_bit_consecutive(start_component, num_components); 632 633 if (is_64) { 634 if (start_component >= 2) 635 usage_mask >>= 2; 636 637 uint32_t tgsi_usage_mask = 0; 638 639 if (usage_mask & TGSI_WRITEMASK_X) 640 tgsi_usage_mask |= TGSI_WRITEMASK_XY; 641 if (usage_mask & TGSI_WRITEMASK_Y) 642 tgsi_usage_mask |= TGSI_WRITEMASK_ZW; 643 644 return tgsi_usage_mask; 645 } else { 646 return usage_mask; 647 } 648} 649 650/* TGSI varying declarations have a component usage mask associated (used by 651 * r600 and svga). 652 */ 653static uint32_t 654ntt_tgsi_var_usage_mask(const struct nir_variable *var) 655{ 656 const struct glsl_type *type_without_array = 657 glsl_without_array(var->type); 658 unsigned num_components = glsl_get_vector_elements(type_without_array); 659 if (num_components == 0) /* structs */ 660 num_components = 4; 661 662 return ntt_tgsi_usage_mask(var->data.location_frac, num_components, 663 glsl_type_is_64bit(type_without_array)); 664} 665 666static struct ureg_dst 667ntt_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac) 668{ 669 nir_io_semantics semantics = nir_intrinsic_io_semantics(instr); 670 int base = nir_intrinsic_base(instr); 671 *frac = nir_intrinsic_component(instr); 672 bool is_64 = nir_src_bit_size(instr->src[0]) == 64; 673 674 struct ureg_dst out; 675 if (c->s->info.stage == MESA_SHADER_FRAGMENT) { 676 unsigned semantic_name, semantic_index; 677 tgsi_get_gl_frag_result_semantic(semantics.location, 678 &semantic_name, &semantic_index); 679 semantic_index += semantics.dual_source_blend_index; 680 681 switch (semantics.location) { 682 case FRAG_RESULT_DEPTH: 683 *frac = 2; /* z write is the to the .z channel in TGSI */ 684 break; 685 case FRAG_RESULT_STENCIL: 686 *frac = 1; 687 break; 688 default: 689 break; 690 } 691 692 out = ureg_DECL_output(c->ureg, semantic_name, semantic_index); 693 } else { 694 unsigned semantic_name, semantic_index; 695 696 ntt_get_gl_varying_semantic(c, semantics.location, 697 &semantic_name, &semantic_index); 698 699 uint32_t usage_mask = ntt_tgsi_usage_mask(*frac, 700 instr->num_components, 701 is_64); 702 uint32_t gs_streams = semantics.gs_streams; 703 for (int i = 0; i < 4; i++) { 704 if (!(usage_mask & (1 << i))) 705 gs_streams &= ~(0x3 << 2 * i); 706 } 707 708 /* No driver appears to use array_id of outputs. */ 709 unsigned array_id = 0; 710 711 /* This bit is lost in the i/o semantics, but it's unused in in-tree 712 * drivers. 713 */ 714 bool invariant = semantics.invariant; 715 716 out = ureg_DECL_output_layout(c->ureg, 717 semantic_name, semantic_index, 718 gs_streams, 719 base, 720 usage_mask, 721 array_id, 722 semantics.num_slots, 723 invariant); 724 } 725 726 unsigned write_mask; 727 if (nir_intrinsic_has_write_mask(instr)) 728 write_mask = nir_intrinsic_write_mask(instr); 729 else 730 write_mask = ((1 << instr->num_components) - 1) << *frac; 731 732 if (is_64) { 733 write_mask = ntt_64bit_write_mask(write_mask); 734 if (*frac >= 2) 735 write_mask = write_mask << 2; 736 } else { 737 write_mask = write_mask << *frac; 738 } 739 return ureg_writemask(out, write_mask); 740} 741 742/* If this reg or SSA def is used only for storing an output, then in the simple 743 * cases we can write directly to the TGSI output instead of having store_output 744 * emit its own MOV. 
745 */ 746static bool 747ntt_try_store_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst, 748 struct list_head *uses, struct list_head *if_uses) 749{ 750 *dst = ureg_dst_undef(); 751 752 switch (c->s->info.stage) { 753 case MESA_SHADER_FRAGMENT: 754 case MESA_SHADER_VERTEX: 755 break; 756 default: 757 /* tgsi_exec (at least) requires that output stores happen per vertex 758 * emitted, you don't get to reuse a previous output value for the next 759 * vertex. 760 */ 761 return false; 762 } 763 764 if (!list_is_empty(if_uses) || !list_is_singular(uses)) 765 return false; 766 767 nir_src *src = list_first_entry(uses, nir_src, use_link); 768 769 if (src->parent_instr->type != nir_instr_type_intrinsic) 770 return false; 771 772 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(src->parent_instr); 773 if (intr->intrinsic != nir_intrinsic_store_output || 774 !nir_src_is_const(intr->src[1])) { 775 return false; 776 } 777 778 uint32_t frac; 779 *dst = ntt_output_decl(c, intr, &frac); 780 dst->Index += ntt_src_as_uint(c, intr->src[1]); 781 782 return frac == 0; 783} 784 785static void 786ntt_setup_inputs(struct ntt_compile *c) 787{ 788 if (c->s->info.stage != MESA_SHADER_FRAGMENT) 789 return; 790 791 unsigned num_inputs = 0; 792 int num_input_arrays = 0; 793 794 nir_foreach_shader_in_variable(var, c->s) { 795 const struct glsl_type *type = ntt_shader_input_type(c, var); 796 unsigned array_len = 797 glsl_count_attribute_slots(type, false); 798 799 num_inputs = MAX2(num_inputs, var->data.driver_location + array_len); 800 } 801 802 c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs); 803 804 nir_foreach_shader_in_variable(var, c->s) { 805 const struct glsl_type *type = ntt_shader_input_type(c, var); 806 unsigned array_len = 807 glsl_count_attribute_slots(type, false); 808 809 unsigned interpolation = TGSI_INTERPOLATE_CONSTANT; 810 unsigned sample_loc; 811 struct ureg_src decl; 812 813 if (c->s->info.stage == MESA_SHADER_FRAGMENT) { 814 interpolation = 815 tgsi_get_interp_mode(var->data.interpolation, 816 var->data.location == VARYING_SLOT_COL0 || 817 var->data.location == VARYING_SLOT_COL1); 818 819 if (var->data.location == VARYING_SLOT_POS) 820 interpolation = TGSI_INTERPOLATE_LINEAR; 821 } 822 823 unsigned semantic_name, semantic_index; 824 ntt_get_gl_varying_semantic(c, var->data.location, 825 &semantic_name, &semantic_index); 826 827 if (var->data.sample) { 828 sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE; 829 } else if (var->data.centroid) { 830 sample_loc = TGSI_INTERPOLATE_LOC_CENTROID; 831 c->centroid_inputs |= (BITSET_MASK(array_len) << 832 var->data.driver_location); 833 } else { 834 sample_loc = TGSI_INTERPOLATE_LOC_CENTER; 835 } 836 837 unsigned array_id = 0; 838 if (glsl_type_is_array(type)) 839 array_id = ++num_input_arrays; 840 841 uint32_t usage_mask = ntt_tgsi_var_usage_mask(var); 842 843 decl = ureg_DECL_fs_input_centroid_layout(c->ureg, 844 semantic_name, 845 semantic_index, 846 interpolation, 847 sample_loc, 848 var->data.driver_location, 849 usage_mask, 850 array_id, array_len); 851 852 if (semantic_name == TGSI_SEMANTIC_FACE) { 853 struct ureg_dst temp = ntt_temp(c); 854 if (c->native_integers) { 855 /* NIR is ~0 front and 0 back, while TGSI is +1 front */ 856 ntt_SGE(c, temp, decl, ureg_imm1f(c->ureg, 0)); 857 } else { 858 /* tgsi docs say that floating point FACE will be positive for 859 * frontface and negative for backface, but realistically 860 * GLSL-to-TGSI had been doing MOV_SAT to turn it into 0.0 vs 1.0. 
861 * Copy that behavior, since some drivers (r300) have been doing a 862 * 0.0 vs 1.0 backface (and I don't think anybody has a non-1.0 863 * front face). 864 */ 865 temp.Saturate = true; 866 ntt_MOV(c, temp, decl); 867 868 } 869 decl = ureg_src(temp); 870 } 871 872 for (unsigned i = 0; i < array_len; i++) { 873 c->input_index_map[var->data.driver_location + i] = decl; 874 c->input_index_map[var->data.driver_location + i].Index += i; 875 } 876 } 877} 878 879static int 880ntt_sort_by_location(const nir_variable *a, const nir_variable *b) 881{ 882 return a->data.location - b->data.location; 883} 884 885/** 886 * Workaround for virglrenderer requiring that TGSI FS output color variables 887 * are declared in order. Besides, it's a lot nicer to read the TGSI this way. 888 */ 889static void 890ntt_setup_outputs(struct ntt_compile *c) 891{ 892 if (c->s->info.stage != MESA_SHADER_FRAGMENT) 893 return; 894 895 nir_sort_variables_with_modes(c->s, ntt_sort_by_location, nir_var_shader_out); 896 897 nir_foreach_shader_out_variable(var, c->s) { 898 if (var->data.location == FRAG_RESULT_COLOR) 899 ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1); 900 901 unsigned semantic_name, semantic_index; 902 tgsi_get_gl_frag_result_semantic(var->data.location, 903 &semantic_name, &semantic_index); 904 905 (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index); 906 } 907} 908 909static enum tgsi_texture_type 910tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow) 911{ 912 switch (dim) { 913 case GLSL_SAMPLER_DIM_1D: 914 if (is_shadow) 915 return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D; 916 else 917 return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D; 918 case GLSL_SAMPLER_DIM_2D: 919 case GLSL_SAMPLER_DIM_EXTERNAL: 920 if (is_shadow) 921 return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D; 922 else 923 return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D; 924 case GLSL_SAMPLER_DIM_3D: 925 return TGSI_TEXTURE_3D; 926 case GLSL_SAMPLER_DIM_CUBE: 927 if (is_shadow) 928 return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE; 929 else 930 return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE; 931 case GLSL_SAMPLER_DIM_RECT: 932 if (is_shadow) 933 return TGSI_TEXTURE_SHADOWRECT; 934 else 935 return TGSI_TEXTURE_RECT; 936 case GLSL_SAMPLER_DIM_MS: 937 return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA; 938 case GLSL_SAMPLER_DIM_BUF: 939 return TGSI_TEXTURE_BUFFER; 940 default: 941 unreachable("unknown sampler dim"); 942 } 943} 944 945static enum tgsi_return_type 946tgsi_return_type_from_base_type(enum glsl_base_type type) 947{ 948 switch (type) { 949 case GLSL_TYPE_INT: 950 return TGSI_RETURN_TYPE_SINT; 951 case GLSL_TYPE_UINT: 952 return TGSI_RETURN_TYPE_UINT; 953 case GLSL_TYPE_FLOAT: 954 return TGSI_RETURN_TYPE_FLOAT; 955 default: 956 unreachable("unexpected texture type"); 957 } 958} 959 960static void 961ntt_setup_uniforms(struct ntt_compile *c) 962{ 963 nir_foreach_uniform_variable(var, c->s) { 964 if (glsl_type_is_sampler(glsl_without_array(var->type)) || 965 glsl_type_is_texture(glsl_without_array(var->type))) { 966 /* Don't use this size for the check for samplers -- arrays of structs 967 * containing samplers should be ignored, and just the separate lowered 968 * sampler uniform decl used. 
969 */ 970 int size = glsl_type_get_sampler_count(var->type) + 971 glsl_type_get_texture_count(var->type); 972 973 const struct glsl_type *stype = glsl_without_array(var->type); 974 enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype), 975 glsl_sampler_type_is_array(stype), 976 glsl_sampler_type_is_shadow(stype)); 977 enum tgsi_return_type ret_type = tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype)); 978 for (int i = 0; i < size; i++) { 979 ureg_DECL_sampler_view(c->ureg, var->data.binding + i, 980 target, ret_type, ret_type, ret_type, ret_type); 981 ureg_DECL_sampler(c->ureg, var->data.binding + i); 982 } 983 } else if (glsl_contains_atomic(var->type)) { 984 uint32_t offset = var->data.offset / 4; 985 uint32_t size = glsl_atomic_size(var->type) / 4; 986 ureg_DECL_hw_atomic(c->ureg, offset, offset + size - 1, var->data.binding, 0); 987 } 988 989 /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0 990 * size declaration happens with other UBOs below. 991 */ 992 } 993 994 nir_foreach_image_variable(var, c->s) { 995 int image_count = glsl_type_get_image_count(var->type); 996 const struct glsl_type *itype = glsl_without_array(var->type); 997 enum tgsi_texture_type tex_type = 998 tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(itype), 999 glsl_sampler_type_is_array(itype), false); 1000 1001 for (int i = 0; i < image_count; i++) { 1002 c->images[var->data.binding] = ureg_DECL_image(c->ureg, 1003 var->data.binding + i, 1004 tex_type, 1005 var->data.image.format, 1006 !(var->data.access & ACCESS_NON_WRITEABLE), 1007 false); 1008 } 1009 } 1010 1011 c->first_ubo = ~0; 1012 1013 unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0}; 1014 nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) { 1015 int ubo = var->data.driver_location; 1016 if (ubo == -1) 1017 continue; 1018 1019 if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo)) 1020 c->first_ubo = MIN2(c->first_ubo, ubo); 1021 1022 unsigned size = glsl_get_explicit_size(var->interface_type, false); 1023 1024 int array_size = 1; 1025 if (glsl_type_is_interface(glsl_without_array(var->type))) 1026 array_size = MAX2(1, glsl_get_aoa_size(var->type)); 1027 1028 for (int i = 0; i < array_size; i++) { 1029 /* Even if multiple NIR variables are in the same uniform block, their 1030 * explicit size is the size of the block. 1031 */ 1032 if (ubo_sizes[ubo + i]) 1033 assert(ubo_sizes[ubo + i] == size); 1034 1035 ubo_sizes[ubo + i] = size; 1036 } 1037 } 1038 1039 for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) { 1040 if (ubo_sizes[i]) 1041 ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i); 1042 } 1043 1044 for (int i = 0; i < c->s->info.num_ssbos; i++) { 1045 /* XXX: nv50 uses the atomic flag to set caching for (lowered) atomic 1046 * counters 1047 */ 1048 bool atomic = false; 1049 ureg_DECL_buffer(c->ureg, i, atomic); 1050 } 1051} 1052 1053static void 1054ntt_setup_registers(struct ntt_compile *c, struct exec_list *list) 1055{ 1056 assert(c->num_temps == 0); 1057 /* Permanently allocate all the array regs at the start. 
*/ 1058 foreach_list_typed(nir_register, nir_reg, node, list) { 1059 if (nir_reg->num_array_elems != 0) { 1060 struct ureg_dst decl = ureg_DECL_array_temporary(c->ureg, nir_reg->num_array_elems, true); 1061 c->reg_temp[nir_reg->index] = decl; 1062 assert(c->num_temps == decl.Index); 1063 c->num_temps += nir_reg->num_array_elems; 1064 } 1065 } 1066 c->first_non_array_temp = c->num_temps; 1067 1068 /* After that, allocate non-array regs in our virtual space that we'll 1069 * register-allocate before ureg emit. 1070 */ 1071 foreach_list_typed(nir_register, nir_reg, node, list) { 1072 if (nir_reg->num_array_elems == 0) { 1073 struct ureg_dst decl; 1074 uint32_t write_mask = BITFIELD_MASK(nir_reg->num_components); 1075 if (!ntt_try_store_in_tgsi_output(c, &decl, &nir_reg->uses, &nir_reg->if_uses)) { 1076 if (nir_reg->bit_size == 64) { 1077 if (nir_reg->num_components > 2) { 1078 fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n", 1079 nir_reg->num_components, nir_reg->index); 1080 } 1081 1082 write_mask = ntt_64bit_write_mask(write_mask); 1083 } 1084 1085 decl = ureg_writemask(ntt_temp(c), write_mask); 1086 } 1087 c->reg_temp[nir_reg->index] = decl; 1088 } 1089 } 1090} 1091 1092static struct ureg_src 1093ntt_get_load_const_src(struct ntt_compile *c, nir_load_const_instr *instr) 1094{ 1095 int num_components = instr->def.num_components; 1096 1097 if (!c->native_integers) { 1098 float values[4]; 1099 assert(instr->def.bit_size == 32); 1100 for (int i = 0; i < num_components; i++) 1101 values[i] = uif(instr->value[i].u32); 1102 1103 return ureg_DECL_immediate(c->ureg, values, num_components); 1104 } else { 1105 uint32_t values[4]; 1106 1107 if (instr->def.bit_size == 32) { 1108 for (int i = 0; i < num_components; i++) 1109 values[i] = instr->value[i].u32; 1110 } else { 1111 assert(num_components <= 2); 1112 for (int i = 0; i < num_components; i++) { 1113 values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff; 1114 values[i * 2 + 1] = instr->value[i].u64 >> 32; 1115 } 1116 num_components *= 2; 1117 } 1118 1119 return ureg_DECL_immediate_uint(c->ureg, values, num_components); 1120 } 1121} 1122 1123static struct ureg_src 1124ntt_reladdr(struct ntt_compile *c, struct ureg_src addr, int addr_index) 1125{ 1126 assert(addr_index < ARRAY_SIZE(c->addr_reg)); 1127 1128 for (int i = 0; i <= addr_index; i++) { 1129 if (!c->addr_declared[i]) { 1130 c->addr_reg[i] = ureg_writemask(ureg_DECL_address(c->ureg), 1131 TGSI_WRITEMASK_X); 1132 c->addr_declared[i] = true; 1133 } 1134 } 1135 1136 if (c->native_integers) 1137 ntt_UARL(c, c->addr_reg[addr_index], addr); 1138 else 1139 ntt_ARL(c, c->addr_reg[addr_index], addr); 1140 return ureg_scalar(ureg_src(c->addr_reg[addr_index]), 0); 1141} 1142 1143static struct ureg_src 1144ntt_get_src(struct ntt_compile *c, nir_src src) 1145{ 1146 if (src.is_ssa) { 1147 if (src.ssa->parent_instr->type == nir_instr_type_load_const) 1148 return ntt_get_load_const_src(c, nir_instr_as_load_const(src.ssa->parent_instr)); 1149 1150 return c->ssa_temp[src.ssa->index]; 1151 } else { 1152 nir_register *reg = src.reg.reg; 1153 struct ureg_dst reg_temp = c->reg_temp[reg->index]; 1154 reg_temp.Index += src.reg.base_offset; 1155 1156 if (src.reg.indirect) { 1157 struct ureg_src offset = ntt_get_src(c, *src.reg.indirect); 1158 return ureg_src_indirect(ureg_src(reg_temp), 1159 ntt_reladdr(c, offset, 0)); 1160 } else { 1161 return ureg_src(reg_temp); 1162 } 1163 } 1164} 1165 1166static struct ureg_src 1167ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i) 1168{ 1169 
nir_alu_src src = instr->src[i]; 1170 struct ureg_src usrc = ntt_get_src(c, src.src); 1171 1172 /* Expand double/dvec2 src references to TGSI swizzles using a pair of 32-bit 1173 * channels. We skip this for undefs, as those don't get split to vec2s (but 1174 * the specific swizzles from an undef don't matter) 1175 */ 1176 if (nir_src_bit_size(src.src) == 64 && 1177 !(src.src.is_ssa && src.src.ssa->parent_instr->type == nir_instr_type_ssa_undef)) { 1178 int chan0 = 0, chan1 = 1; 1179 if (nir_op_infos[instr->op].input_sizes[i] == 0) { 1180 chan0 = ffs(instr->dest.write_mask) - 1; 1181 chan1 = ffs(instr->dest.write_mask & ~(1 << chan0)) - 1; 1182 if (chan1 == -1) 1183 chan1 = chan0; 1184 } 1185 usrc = ureg_swizzle(usrc, 1186 src.swizzle[chan0] * 2, 1187 src.swizzle[chan0] * 2 + 1, 1188 src.swizzle[chan1] * 2, 1189 src.swizzle[chan1] * 2 + 1); 1190 } else { 1191 usrc = ureg_swizzle(usrc, 1192 src.swizzle[0], 1193 src.swizzle[1], 1194 src.swizzle[2], 1195 src.swizzle[3]); 1196 } 1197 1198 if (src.abs) 1199 usrc = ureg_abs(usrc); 1200 if (src.negate) 1201 usrc = ureg_negate(usrc); 1202 1203 return usrc; 1204} 1205 1206/* Reswizzles a source so that the unset channels in the write mask still refer 1207 * to one of the channels present in the write mask. 1208 */ 1209static struct ureg_src 1210ntt_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask) 1211{ 1212 assert(write_mask); 1213 int first_chan = ffs(write_mask) - 1; 1214 return ureg_swizzle(src, 1215 (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan, 1216 (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan, 1217 (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan, 1218 (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan); 1219} 1220 1221static struct ureg_dst 1222ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa) 1223{ 1224 uint32_t writemask = BITSET_MASK(ssa->num_components); 1225 if (ssa->bit_size == 64) 1226 writemask = ntt_64bit_write_mask(writemask); 1227 1228 struct ureg_dst dst; 1229 if (!ntt_try_store_in_tgsi_output(c, &dst, &ssa->uses, &ssa->if_uses)) 1230 dst = ntt_temp(c); 1231 1232 c->ssa_temp[ssa->index] = ntt_swizzle_for_write_mask(ureg_src(dst), writemask); 1233 1234 return ureg_writemask(dst, writemask); 1235} 1236 1237static struct ureg_dst 1238ntt_get_dest_decl(struct ntt_compile *c, nir_dest *dest) 1239{ 1240 if (dest->is_ssa) 1241 return ntt_get_ssa_def_decl(c, &dest->ssa); 1242 else 1243 return c->reg_temp[dest->reg.reg->index]; 1244} 1245 1246static struct ureg_dst 1247ntt_get_dest(struct ntt_compile *c, nir_dest *dest) 1248{ 1249 struct ureg_dst dst = ntt_get_dest_decl(c, dest); 1250 1251 if (!dest->is_ssa) { 1252 dst.Index += dest->reg.base_offset; 1253 1254 if (dest->reg.indirect) { 1255 struct ureg_src offset = ntt_get_src(c, *dest->reg.indirect); 1256 dst = ureg_dst_indirect(dst, ntt_reladdr(c, offset, 0)); 1257 } 1258 } 1259 1260 return dst; 1261} 1262 1263/* For an SSA dest being populated by a constant src, replace the storage with 1264 * a copy of the ureg_src. 
1265 */ 1266static void 1267ntt_store_def(struct ntt_compile *c, nir_ssa_def *def, struct ureg_src src) 1268{ 1269 if (!src.Indirect && !src.DimIndirect) { 1270 switch (src.File) { 1271 case TGSI_FILE_IMMEDIATE: 1272 case TGSI_FILE_INPUT: 1273 case TGSI_FILE_CONSTANT: 1274 case TGSI_FILE_SYSTEM_VALUE: 1275 c->ssa_temp[def->index] = src; 1276 return; 1277 } 1278 } 1279 1280 ntt_MOV(c, ntt_get_ssa_def_decl(c, def), src); 1281} 1282 1283static void 1284ntt_store(struct ntt_compile *c, nir_dest *dest, struct ureg_src src) 1285{ 1286 if (dest->is_ssa) 1287 ntt_store_def(c, &dest->ssa, src); 1288 else { 1289 struct ureg_dst dst = ntt_get_dest(c, dest); 1290 ntt_MOV(c, dst, src); 1291 } 1292} 1293 1294static void 1295ntt_emit_scalar(struct ntt_compile *c, unsigned tgsi_op, 1296 struct ureg_dst dst, 1297 struct ureg_src src0, 1298 struct ureg_src src1) 1299{ 1300 unsigned i; 1301 1302 /* POW is the only 2-operand scalar op. */ 1303 if (tgsi_op != TGSI_OPCODE_POW) 1304 src1 = src0; 1305 1306 for (i = 0; i < 4; i++) { 1307 if (dst.WriteMask & (1 << i)) { 1308 ntt_insn(c, tgsi_op, 1309 ureg_writemask(dst, 1 << i), 1310 ureg_scalar(src0, i), 1311 ureg_scalar(src1, i), 1312 ureg_src_undef(), ureg_src_undef()); 1313 } 1314 } 1315} 1316 1317static void 1318ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr) 1319{ 1320 struct ureg_src src[4]; 1321 struct ureg_dst dst; 1322 unsigned i; 1323 int dst_64 = nir_dest_bit_size(instr->dest.dest) == 64; 1324 int src_64 = nir_src_bit_size(instr->src[0].src) == 64; 1325 int num_srcs = nir_op_infos[instr->op].num_inputs; 1326 1327 c->precise = instr->exact; 1328 1329 assert(num_srcs <= ARRAY_SIZE(src)); 1330 for (i = 0; i < num_srcs; i++) 1331 src[i] = ntt_get_alu_src(c, instr, i); 1332 for (; i < ARRAY_SIZE(src); i++) 1333 src[i] = ureg_src_undef(); 1334 1335 dst = ntt_get_dest(c, &instr->dest.dest); 1336 1337 if (instr->dest.saturate) 1338 dst.Saturate = true; 1339 1340 if (dst_64) 1341 dst = ureg_writemask(dst, ntt_64bit_write_mask(instr->dest.write_mask)); 1342 else 1343 dst = ureg_writemask(dst, instr->dest.write_mask); 1344 1345 static enum tgsi_opcode op_map[][2] = { 1346 [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV }, 1347 1348 /* fabs/fneg 32-bit are special-cased below. */ 1349 [nir_op_fabs] = { 0, TGSI_OPCODE_DABS }, 1350 [nir_op_fneg] = { 0, TGSI_OPCODE_DNEG }, 1351 1352 [nir_op_fdot2] = { TGSI_OPCODE_DP2 }, 1353 [nir_op_fdot3] = { TGSI_OPCODE_DP3 }, 1354 [nir_op_fdot4] = { TGSI_OPCODE_DP4 }, 1355 [nir_op_fdot2_replicated] = { TGSI_OPCODE_DP2 }, 1356 [nir_op_fdot3_replicated] = { TGSI_OPCODE_DP3 }, 1357 [nir_op_fdot4_replicated] = { TGSI_OPCODE_DP4 }, 1358 [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR }, 1359 [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC }, 1360 [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL }, 1361 [nir_op_fround_even] = { TGSI_OPCODE_ROUND, TGSI_OPCODE_DROUND }, 1362 [nir_op_fdiv] = { TGSI_OPCODE_DIV, TGSI_OPCODE_DDIV }, 1363 [nir_op_idiv] = { TGSI_OPCODE_IDIV, TGSI_OPCODE_I64DIV }, 1364 [nir_op_udiv] = { TGSI_OPCODE_UDIV, TGSI_OPCODE_U64DIV }, 1365 1366 [nir_op_frcp] = { 0, TGSI_OPCODE_DRCP }, 1367 [nir_op_frsq] = { 0, TGSI_OPCODE_DRSQ }, 1368 [nir_op_fsqrt] = { 0, TGSI_OPCODE_DSQRT }, 1369 1370 /* The conversions will have one combination of src and dst bitsize. 
*/ 1371 [nir_op_f2f32] = { 0, TGSI_OPCODE_D2F }, 1372 [nir_op_f2f64] = { TGSI_OPCODE_F2D }, 1373 [nir_op_i2i64] = { TGSI_OPCODE_I2I64 }, 1374 1375 [nir_op_f2i32] = { TGSI_OPCODE_F2I, TGSI_OPCODE_D2I }, 1376 [nir_op_f2i64] = { TGSI_OPCODE_F2I64, TGSI_OPCODE_D2I64 }, 1377 [nir_op_f2u32] = { TGSI_OPCODE_F2U, TGSI_OPCODE_D2U }, 1378 [nir_op_f2u64] = { TGSI_OPCODE_F2U64, TGSI_OPCODE_D2U64 }, 1379 [nir_op_i2f32] = { TGSI_OPCODE_I2F, TGSI_OPCODE_I642F }, 1380 [nir_op_i2f64] = { TGSI_OPCODE_I2D, TGSI_OPCODE_I642D }, 1381 [nir_op_u2f32] = { TGSI_OPCODE_U2F, TGSI_OPCODE_U642F }, 1382 [nir_op_u2f64] = { TGSI_OPCODE_U2D, TGSI_OPCODE_U642D }, 1383 1384 [nir_op_slt] = { TGSI_OPCODE_SLT }, 1385 [nir_op_sge] = { TGSI_OPCODE_SGE }, 1386 [nir_op_seq] = { TGSI_OPCODE_SEQ }, 1387 [nir_op_sne] = { TGSI_OPCODE_SNE }, 1388 1389 [nir_op_flt32] = { TGSI_OPCODE_FSLT, TGSI_OPCODE_DSLT }, 1390 [nir_op_fge32] = { TGSI_OPCODE_FSGE, TGSI_OPCODE_DSGE }, 1391 [nir_op_feq32] = { TGSI_OPCODE_FSEQ, TGSI_OPCODE_DSEQ }, 1392 [nir_op_fneu32] = { TGSI_OPCODE_FSNE, TGSI_OPCODE_DSNE }, 1393 1394 [nir_op_ilt32] = { TGSI_OPCODE_ISLT, TGSI_OPCODE_I64SLT }, 1395 [nir_op_ige32] = { TGSI_OPCODE_ISGE, TGSI_OPCODE_I64SGE }, 1396 [nir_op_ieq32] = { TGSI_OPCODE_USEQ, TGSI_OPCODE_U64SEQ }, 1397 [nir_op_ine32] = { TGSI_OPCODE_USNE, TGSI_OPCODE_U64SNE }, 1398 1399 [nir_op_ult32] = { TGSI_OPCODE_USLT, TGSI_OPCODE_U64SLT }, 1400 [nir_op_uge32] = { TGSI_OPCODE_USGE, TGSI_OPCODE_U64SGE }, 1401 1402 [nir_op_iabs] = { TGSI_OPCODE_IABS, TGSI_OPCODE_I64ABS }, 1403 [nir_op_ineg] = { TGSI_OPCODE_INEG, TGSI_OPCODE_I64NEG }, 1404 [nir_op_fsign] = { TGSI_OPCODE_SSG }, 1405 [nir_op_isign] = { TGSI_OPCODE_ISSG }, 1406 [nir_op_ftrunc] = { TGSI_OPCODE_TRUNC, TGSI_OPCODE_DTRUNC }, 1407 [nir_op_fddx] = { TGSI_OPCODE_DDX }, 1408 [nir_op_fddy] = { TGSI_OPCODE_DDY }, 1409 [nir_op_fddx_coarse] = { TGSI_OPCODE_DDX }, 1410 [nir_op_fddy_coarse] = { TGSI_OPCODE_DDY }, 1411 [nir_op_fddx_fine] = { TGSI_OPCODE_DDX_FINE }, 1412 [nir_op_fddy_fine] = { TGSI_OPCODE_DDY_FINE }, 1413 [nir_op_pack_half_2x16] = { TGSI_OPCODE_PK2H }, 1414 [nir_op_unpack_half_2x16] = { TGSI_OPCODE_UP2H }, 1415 [nir_op_ibitfield_extract] = { TGSI_OPCODE_IBFE }, 1416 [nir_op_ubitfield_extract] = { TGSI_OPCODE_UBFE }, 1417 [nir_op_bitfield_insert] = { TGSI_OPCODE_BFI }, 1418 [nir_op_bitfield_reverse] = { TGSI_OPCODE_BREV }, 1419 [nir_op_bit_count] = { TGSI_OPCODE_POPC }, 1420 [nir_op_ifind_msb] = { TGSI_OPCODE_IMSB }, 1421 [nir_op_ufind_msb] = { TGSI_OPCODE_UMSB }, 1422 [nir_op_find_lsb] = { TGSI_OPCODE_LSB }, 1423 [nir_op_fadd] = { TGSI_OPCODE_ADD, TGSI_OPCODE_DADD }, 1424 [nir_op_iadd] = { TGSI_OPCODE_UADD, TGSI_OPCODE_U64ADD }, 1425 [nir_op_fmul] = { TGSI_OPCODE_MUL, TGSI_OPCODE_DMUL }, 1426 [nir_op_imul] = { TGSI_OPCODE_UMUL, TGSI_OPCODE_U64MUL }, 1427 [nir_op_imod] = { TGSI_OPCODE_MOD, TGSI_OPCODE_I64MOD }, 1428 [nir_op_umod] = { TGSI_OPCODE_UMOD, TGSI_OPCODE_U64MOD }, 1429 [nir_op_imul_high] = { TGSI_OPCODE_IMUL_HI }, 1430 [nir_op_umul_high] = { TGSI_OPCODE_UMUL_HI }, 1431 [nir_op_ishl] = { TGSI_OPCODE_SHL, TGSI_OPCODE_U64SHL }, 1432 [nir_op_ishr] = { TGSI_OPCODE_ISHR, TGSI_OPCODE_I64SHR }, 1433 [nir_op_ushr] = { TGSI_OPCODE_USHR, TGSI_OPCODE_U64SHR }, 1434 1435 /* These bitwise ops don't care about 32 vs 64 types, so they have the 1436 * same TGSI op. 
1437 */ 1438 [nir_op_inot] = { TGSI_OPCODE_NOT, TGSI_OPCODE_NOT }, 1439 [nir_op_iand] = { TGSI_OPCODE_AND, TGSI_OPCODE_AND }, 1440 [nir_op_ior] = { TGSI_OPCODE_OR, TGSI_OPCODE_OR }, 1441 [nir_op_ixor] = { TGSI_OPCODE_XOR, TGSI_OPCODE_XOR }, 1442 1443 [nir_op_fmin] = { TGSI_OPCODE_MIN, TGSI_OPCODE_DMIN }, 1444 [nir_op_imin] = { TGSI_OPCODE_IMIN, TGSI_OPCODE_I64MIN }, 1445 [nir_op_umin] = { TGSI_OPCODE_UMIN, TGSI_OPCODE_U64MIN }, 1446 [nir_op_fmax] = { TGSI_OPCODE_MAX, TGSI_OPCODE_DMAX }, 1447 [nir_op_imax] = { TGSI_OPCODE_IMAX, TGSI_OPCODE_I64MAX }, 1448 [nir_op_umax] = { TGSI_OPCODE_UMAX, TGSI_OPCODE_U64MAX }, 1449 [nir_op_ffma] = { TGSI_OPCODE_MAD, TGSI_OPCODE_DMAD }, 1450 [nir_op_ldexp] = { TGSI_OPCODE_LDEXP, 0 }, 1451 }; 1452 1453 if (src_64 && !dst_64) { 1454 if (num_srcs == 2 || nir_op_infos[instr->op].output_type == nir_type_bool32) { 1455 /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz instead 1456 * of .xy. 1457 */ 1458 assert(!(dst.WriteMask & TGSI_WRITEMASK_YW)); 1459 } else { 1460 /* TGSI 64bit-to-32-bit conversions only generate results in the .xy 1461 * channels and will need to get fixed up. 1462 */ 1463 assert(!(dst.WriteMask & TGSI_WRITEMASK_ZW)); 1464 } 1465 } 1466 1467 bool table_op64 = src_64; 1468 if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op][table_op64] != 0) { 1469 /* The normal path for NIR to TGSI ALU op translation */ 1470 ntt_insn(c, op_map[instr->op][table_op64], 1471 dst, src[0], src[1], src[2], src[3]); 1472 } else { 1473 /* Special cases for NIR to TGSI ALU op translation. */ 1474 1475 /* TODO: Use something like the ntt_store() path for the MOV calls so we 1476 * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm. 1477 */ 1478 1479 switch (instr->op) { 1480 case nir_op_u2u64: 1481 ntt_AND(c, dst, ureg_swizzle(src[0], 1482 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X, 1483 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y), 1484 ureg_imm4u(c->ureg, ~0, 0, ~0, 0)); 1485 break; 1486 1487 case nir_op_i2i32: 1488 case nir_op_u2u32: 1489 assert(src_64); 1490 ntt_MOV(c, dst, ureg_swizzle(src[0], 1491 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z, 1492 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X)); 1493 break; 1494 1495 case nir_op_fabs: 1496 if (c->options->lower_fabs) 1497 ntt_MAX(c, dst, src[0], ureg_negate(src[0])); 1498 else 1499 ntt_MOV(c, dst, ureg_abs(src[0])); 1500 break; 1501 1502 case nir_op_fsat: 1503 if (dst_64) { 1504 ntt_MIN(c, dst, src[0], ntt_64bit_1f(c)); 1505 ntt_MAX(c, dst, ureg_src(dst), ureg_imm1u(c->ureg, 0)); 1506 } else { 1507 ntt_MOV(c, ureg_saturate(dst), src[0]); 1508 } 1509 break; 1510 1511 case nir_op_fneg: 1512 ntt_MOV(c, dst, ureg_negate(src[0])); 1513 break; 1514 1515 /* NOTE: TGSI 32-bit math ops have the old "one source channel 1516 * replicated to all dst channels" behavior, while 64 is normal mapping 1517 * of src channels to dst. 
1518 */ 1519 case nir_op_frcp: 1520 assert(!dst_64); 1521 ntt_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], ureg_src_undef()); 1522 break; 1523 1524 case nir_op_frsq: 1525 assert(!dst_64); 1526 ntt_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], ureg_src_undef()); 1527 break; 1528 1529 case nir_op_fsqrt: 1530 assert(!dst_64); 1531 ntt_emit_scalar(c, TGSI_OPCODE_SQRT, dst, src[0], ureg_src_undef()); 1532 break; 1533 1534 case nir_op_fexp2: 1535 assert(!dst_64); 1536 ntt_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], ureg_src_undef()); 1537 break; 1538 1539 case nir_op_flog2: 1540 assert(!dst_64); 1541 ntt_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], ureg_src_undef()); 1542 break; 1543 1544 case nir_op_b2f32: 1545 ntt_AND(c, dst, src[0], ureg_imm1f(c->ureg, 1.0)); 1546 break; 1547 1548 case nir_op_b2f64: 1549 ntt_AND(c, dst, 1550 ureg_swizzle(src[0], 1551 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X, 1552 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y), 1553 ntt_64bit_1f(c)); 1554 break; 1555 1556 case nir_op_f2b32: 1557 if (src_64) 1558 ntt_DSNE(c, dst, src[0], ureg_imm1f(c->ureg, 0)); 1559 else 1560 ntt_FSNE(c, dst, src[0], ureg_imm1f(c->ureg, 0)); 1561 break; 1562 1563 case nir_op_i2b32: 1564 if (src_64) { 1565 ntt_U64SNE(c, dst, src[0], ureg_imm1u(c->ureg, 0)); 1566 } else 1567 ntt_USNE(c, dst, src[0], ureg_imm1u(c->ureg, 0)); 1568 break; 1569 1570 case nir_op_b2i32: 1571 ntt_AND(c, dst, src[0], ureg_imm1u(c->ureg, 1)); 1572 break; 1573 1574 case nir_op_b2i64: 1575 ntt_AND(c, dst, 1576 ureg_swizzle(src[0], 1577 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X, 1578 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y), 1579 ureg_imm4u(c->ureg, 1, 0, 1, 0)); 1580 break; 1581 1582 case nir_op_fsin: 1583 ntt_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], ureg_src_undef()); 1584 break; 1585 1586 case nir_op_fcos: 1587 ntt_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], ureg_src_undef()); 1588 break; 1589 1590 case nir_op_fsub: 1591 assert(!dst_64); 1592 ntt_ADD(c, dst, src[0], ureg_negate(src[1])); 1593 break; 1594 1595 case nir_op_isub: 1596 assert(!dst_64); 1597 ntt_UADD(c, dst, src[0], ureg_negate(src[1])); 1598 break; 1599 1600 case nir_op_fmod: 1601 unreachable("should be handled by .lower_fmod = true"); 1602 break; 1603 1604 case nir_op_fpow: 1605 ntt_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]); 1606 break; 1607 1608 case nir_op_flrp: 1609 ntt_LRP(c, dst, src[2], src[1], src[0]); 1610 break; 1611 1612 case nir_op_pack_64_2x32_split: 1613 ntt_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_XZ), 1614 ureg_swizzle(src[0], 1615 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X, 1616 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y)); 1617 ntt_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_YW), 1618 ureg_swizzle(src[1], 1619 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X, 1620 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y)); 1621 break; 1622 1623 case nir_op_unpack_64_2x32_split_x: 1624 ntt_MOV(c, dst, ureg_swizzle(src[0], 1625 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z, 1626 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z)); 1627 break; 1628 1629 case nir_op_unpack_64_2x32_split_y: 1630 ntt_MOV(c, dst, ureg_swizzle(src[0], 1631 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W, 1632 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W)); 1633 break; 1634 1635 case nir_op_b32csel: 1636 if (nir_src_bit_size(instr->src[1].src) == 64) { 1637 ntt_UCMP(c, dst, ureg_swizzle(src[0], 1638 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X, 1639 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y), 1640 src[1], src[2]); 1641 } else { 1642 ntt_UCMP(c, dst, src[0], src[1], src[2]); 1643 } 1644 break; 1645 1646 case nir_op_fcsel: 1647 /* NIR fcsel is src0 != 0 ? src1 : src2. 1648 * TGSI CMP is src0 < 0 ? src1 : src2. 
1649 * 1650 * However, fcsel so far as I can find only appears on bools-as-floats 1651 * (1.0 or 0.0), so we can just negate it for the TGSI op. It's 1652 * important to not have an abs here, as i915g has to make extra 1653 * instructions to do the abs. 1654 */ 1655 if (c->options->lower_cmp) { 1656 /* If the HW doesn't support TGSI CMP (r300 VS), then lower it to a 1657 * LRP on the boolean 1.0/0.0 value, instead of requiring the 1658 * backend to turn the src0 into 1.0/0.0 first. 1659 * 1660 * We don't use this in general because some hardware (i915 FS) the 1661 * LRP gets expanded to MUL/MAD. 1662 */ 1663 ntt_LRP(c, dst, src[0], src[1], src[2]); 1664 } else { 1665 ntt_CMP(c, dst, ureg_negate(src[0]), src[1], src[2]); 1666 } 1667 break; 1668 1669 /* It would be nice if we could get this left as scalar in NIR, since 1670 * the TGSI op is scalar. 1671 */ 1672 case nir_op_frexp_sig: 1673 case nir_op_frexp_exp: { 1674 assert(src_64); 1675 struct ureg_dst temp = ntt_temp(c); 1676 1677 for (int chan = 0; chan < 2; chan++) { 1678 int wm = 1 << chan; 1679 1680 if (!(instr->dest.write_mask & wm)) 1681 continue; 1682 1683 struct ureg_dst dsts[2] = { temp, temp }; 1684 if (instr->op == nir_op_frexp_sig) { 1685 dsts[0] = ureg_writemask(dst, ntt_64bit_write_mask(wm)); 1686 } else { 1687 dsts[1] = ureg_writemask(dst, wm); 1688 } 1689 1690 struct ureg_src chan_src = ureg_swizzle(src[0], 1691 chan * 2, chan * 2 + 1, 1692 chan * 2, chan * 2 + 1); 1693 1694 struct ntt_insn *insn = ntt_insn(c, TGSI_OPCODE_DFRACEXP, 1695 dsts[0], chan_src, 1696 ureg_src_undef(), 1697 ureg_src_undef(), 1698 ureg_src_undef()); 1699 insn->dst[1] = dsts[1]; 1700 } 1701 break; 1702 } 1703 1704 case nir_op_ldexp: 1705 assert(dst_64); /* 32bit handled in table. */ 1706 ntt_DLDEXP(c, dst, src[0], 1707 ureg_swizzle(src[1], 1708 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X, 1709 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y)); 1710 break; 1711 1712 case nir_op_vec4: 1713 case nir_op_vec3: 1714 case nir_op_vec2: 1715 unreachable("covered by nir_lower_vec_to_movs()"); 1716 1717 default: 1718 fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name); 1719 unreachable("Unknown NIR opcode"); 1720 } 1721 } 1722 1723 c->precise = false; 1724} 1725 1726static struct ureg_src 1727ntt_ureg_src_indirect(struct ntt_compile *c, struct ureg_src usrc, 1728 nir_src src, int addr_reg) 1729{ 1730 if (nir_src_is_const(src)) { 1731 usrc.Index += ntt_src_as_uint(c, src); 1732 return usrc; 1733 } else { 1734 return ureg_src_indirect(usrc, ntt_reladdr(c, ntt_get_src(c, src), addr_reg)); 1735 } 1736} 1737 1738static struct ureg_dst 1739ntt_ureg_dst_indirect(struct ntt_compile *c, struct ureg_dst dst, 1740 nir_src src) 1741{ 1742 if (nir_src_is_const(src)) { 1743 dst.Index += ntt_src_as_uint(c, src); 1744 return dst; 1745 } else { 1746 return ureg_dst_indirect(dst, ntt_reladdr(c, ntt_get_src(c, src), 0)); 1747 } 1748} 1749 1750static struct ureg_src 1751ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc, 1752 nir_src src) 1753{ 1754 if (nir_src_is_const(src)) { 1755 return ureg_src_dimension(usrc, ntt_src_as_uint(c, src)); 1756 } 1757 else 1758 { 1759 return ureg_src_dimension_indirect(usrc, 1760 ntt_reladdr(c, ntt_get_src(c, src), 1), 1761 0); 1762 } 1763} 1764 1765static struct ureg_dst 1766ntt_ureg_dst_dimension_indirect(struct ntt_compile *c, struct ureg_dst udst, 1767 nir_src src) 1768{ 1769 if (nir_src_is_const(src)) { 1770 return ureg_dst_dimension(udst, ntt_src_as_uint(c, src)); 1771 } else { 1772 return ureg_dst_dimension_indirect(udst, 
                                         ntt_reladdr(c, ntt_get_src(c, src), 1),
                                         0);
   }
}

/* Some load operations in NIR will have a fractional offset that we need to
 * swizzle down before storing to the result register.
 */
static struct ureg_src
ntt_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
{
   return ureg_swizzle(src,
                       frac,
                       frac + MIN2(num_components - 1, 1),
                       frac + MIN2(num_components - 1, 2),
                       frac + MIN2(num_components - 1, 3));
}


static void
ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   int bit_size = nir_dest_bit_size(instr->dest);
   assert(bit_size == 32 || instr->num_components <= 2);

   struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);

   struct ureg_dst addr_temp = ureg_dst_undef();

   if (nir_src_is_const(instr->src[0])) {
      src = ureg_src_dimension(src, ntt_src_as_uint(c, instr->src[0]));
   } else {
      /* virglrenderer requires that indirect UBO references have the UBO
       * array's base index in the Index field, not added to the indirect
       * address.
       *
       * Many nir intrinsics have a base address const value for the start of
       * their array indirection, but load_ubo doesn't.  We fake it by
       * subtracting it off here.
       */
      addr_temp = ntt_temp(c);
      ntt_UADD(c, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
      src = ureg_src_dimension_indirect(src,
                                        ntt_reladdr(c, ureg_src(addr_temp), 1),
                                        c->first_ubo);
   }

   if (instr->intrinsic == nir_intrinsic_load_ubo_vec4) {
      /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
       * file.
       */
      src.Index = nir_intrinsic_base(instr);

      if (nir_src_is_const(instr->src[1])) {
         src.Index += ntt_src_as_uint(c, instr->src[1]);
      } else {
         src = ureg_src_indirect(src, ntt_reladdr(c, ntt_get_src(c, instr->src[1]), 0));
      }

      int start_component = nir_intrinsic_component(instr);
      if (bit_size == 64)
         start_component *= 2;

      src = ntt_shift_by_frac(src, start_component,
                              instr->num_components * bit_size / 32);

      ntt_store(c, &instr->dest, src);
   } else {
      /* PIPE_CAP_LOAD_CONSTBUF: Not necessarily vec4 aligned, emit a
       * TGSI_OPCODE_LOAD instruction from the const file.
1842 */ 1843 struct ntt_insn *insn = 1844 ntt_insn(c, TGSI_OPCODE_LOAD, 1845 ntt_get_dest(c, &instr->dest), 1846 src, ntt_get_src(c, instr->src[1]), 1847 ureg_src_undef(), ureg_src_undef()); 1848 insn->is_mem = true; 1849 insn->tex_target = 0; 1850 insn->mem_qualifier = 0; 1851 insn->mem_format = 0; /* unused */ 1852 } 1853} 1854 1855static unsigned 1856ntt_get_access_qualifier(nir_intrinsic_instr *instr) 1857{ 1858 enum gl_access_qualifier access = nir_intrinsic_access(instr); 1859 unsigned qualifier = 0; 1860 1861 if (access & ACCESS_COHERENT) 1862 qualifier |= TGSI_MEMORY_COHERENT; 1863 if (access & ACCESS_VOLATILE) 1864 qualifier |= TGSI_MEMORY_VOLATILE; 1865 if (access & ACCESS_RESTRICT) 1866 qualifier |= TGSI_MEMORY_RESTRICT; 1867 1868 return qualifier; 1869} 1870 1871static void 1872ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr, 1873 nir_variable_mode mode) 1874{ 1875 bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo || 1876 instr->intrinsic == nir_intrinsic_store_shared); 1877 bool is_load = (instr->intrinsic == nir_intrinsic_atomic_counter_read || 1878 instr->intrinsic == nir_intrinsic_load_ssbo || 1879 instr->intrinsic == nir_intrinsic_load_shared); 1880 unsigned opcode; 1881 struct ureg_src src[4]; 1882 int num_src = 0; 1883 int next_src; 1884 struct ureg_dst addr_temp = ureg_dst_undef(); 1885 1886 struct ureg_src memory; 1887 switch (mode) { 1888 case nir_var_mem_ssbo: 1889 memory = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_BUFFER, 0), 1890 instr->src[is_store ? 1 : 0], 2); 1891 next_src = 1; 1892 break; 1893 case nir_var_mem_shared: 1894 memory = ureg_src_register(TGSI_FILE_MEMORY, 0); 1895 next_src = 0; 1896 break; 1897 case nir_var_uniform: { /* HW atomic buffers */ 1898 nir_src src = instr->src[0]; 1899 uint32_t offset = ntt_extract_const_src_offset(&src) / 4; 1900 memory = ureg_src_register(TGSI_FILE_HW_ATOMIC, offset); 1901 /* ntt_ureg_src_indirect, except dividing by 4 */ 1902 if (nir_src_is_const(src)) { 1903 memory.Index += nir_src_as_uint(src) / 4; 1904 } else { 1905 addr_temp = ntt_temp(c); 1906 ntt_USHR(c, addr_temp, ntt_get_src(c, src), ureg_imm1i(c->ureg, 2)); 1907 memory = ureg_src_indirect(memory, ntt_reladdr(c, ureg_src(addr_temp), 2)); 1908 } 1909 memory = ureg_src_dimension(memory, nir_intrinsic_base(instr)); 1910 next_src = 0; 1911 break; 1912 } 1913 1914 default: 1915 unreachable("unknown memory type"); 1916 } 1917 1918 if (is_store) { 1919 src[num_src++] = ntt_get_src(c, instr->src[next_src + 1]); /* offset */ 1920 src[num_src++] = ntt_get_src(c, instr->src[0]); /* value */ 1921 } else { 1922 src[num_src++] = memory; 1923 if (instr->intrinsic != nir_intrinsic_get_ssbo_size) { 1924 src[num_src++] = ntt_get_src(c, instr->src[next_src++]); /* offset */ 1925 switch (instr->intrinsic) { 1926 case nir_intrinsic_atomic_counter_inc: 1927 src[num_src++] = ureg_imm1i(c->ureg, 1); 1928 break; 1929 case nir_intrinsic_atomic_counter_post_dec: 1930 src[num_src++] = ureg_imm1i(c->ureg, -1); 1931 break; 1932 default: 1933 if (!is_load) 1934 src[num_src++] = ntt_get_src(c, instr->src[next_src++]); /* value */ 1935 break; 1936 } 1937 } 1938 } 1939 1940 1941 switch (instr->intrinsic) { 1942 case nir_intrinsic_atomic_counter_add: 1943 case nir_intrinsic_atomic_counter_inc: 1944 case nir_intrinsic_atomic_counter_post_dec: 1945 case nir_intrinsic_ssbo_atomic_add: 1946 case nir_intrinsic_shared_atomic_add: 1947 opcode = TGSI_OPCODE_ATOMUADD; 1948 break; 1949 case nir_intrinsic_ssbo_atomic_fadd: 1950 case nir_intrinsic_shared_atomic_fadd: 
1951 opcode = TGSI_OPCODE_ATOMFADD; 1952 break; 1953 case nir_intrinsic_atomic_counter_min: 1954 case nir_intrinsic_ssbo_atomic_imin: 1955 case nir_intrinsic_shared_atomic_imin: 1956 opcode = TGSI_OPCODE_ATOMIMIN; 1957 break; 1958 case nir_intrinsic_atomic_counter_max: 1959 case nir_intrinsic_ssbo_atomic_imax: 1960 case nir_intrinsic_shared_atomic_imax: 1961 opcode = TGSI_OPCODE_ATOMIMAX; 1962 break; 1963 case nir_intrinsic_ssbo_atomic_umin: 1964 case nir_intrinsic_shared_atomic_umin: 1965 opcode = TGSI_OPCODE_ATOMUMIN; 1966 break; 1967 case nir_intrinsic_ssbo_atomic_umax: 1968 case nir_intrinsic_shared_atomic_umax: 1969 opcode = TGSI_OPCODE_ATOMUMAX; 1970 break; 1971 case nir_intrinsic_atomic_counter_and: 1972 case nir_intrinsic_ssbo_atomic_and: 1973 case nir_intrinsic_shared_atomic_and: 1974 opcode = TGSI_OPCODE_ATOMAND; 1975 break; 1976 case nir_intrinsic_atomic_counter_or: 1977 case nir_intrinsic_ssbo_atomic_or: 1978 case nir_intrinsic_shared_atomic_or: 1979 opcode = TGSI_OPCODE_ATOMOR; 1980 break; 1981 case nir_intrinsic_atomic_counter_xor: 1982 case nir_intrinsic_ssbo_atomic_xor: 1983 case nir_intrinsic_shared_atomic_xor: 1984 opcode = TGSI_OPCODE_ATOMXOR; 1985 break; 1986 case nir_intrinsic_atomic_counter_exchange: 1987 case nir_intrinsic_ssbo_atomic_exchange: 1988 case nir_intrinsic_shared_atomic_exchange: 1989 opcode = TGSI_OPCODE_ATOMXCHG; 1990 break; 1991 case nir_intrinsic_atomic_counter_comp_swap: 1992 case nir_intrinsic_ssbo_atomic_comp_swap: 1993 case nir_intrinsic_shared_atomic_comp_swap: 1994 opcode = TGSI_OPCODE_ATOMCAS; 1995 src[num_src++] = ntt_get_src(c, instr->src[next_src++]); 1996 break; 1997 case nir_intrinsic_atomic_counter_read: 1998 case nir_intrinsic_load_ssbo: 1999 case nir_intrinsic_load_shared: 2000 opcode = TGSI_OPCODE_LOAD; 2001 break; 2002 case nir_intrinsic_store_ssbo: 2003 case nir_intrinsic_store_shared: 2004 opcode = TGSI_OPCODE_STORE; 2005 break; 2006 case nir_intrinsic_get_ssbo_size: 2007 opcode = TGSI_OPCODE_RESQ; 2008 break; 2009 default: 2010 unreachable("unknown memory op"); 2011 } 2012 2013 unsigned qualifier = 0; 2014 if (mode == nir_var_mem_ssbo && 2015 instr->intrinsic != nir_intrinsic_get_ssbo_size) { 2016 qualifier = ntt_get_access_qualifier(instr); 2017 } 2018 2019 struct ureg_dst dst; 2020 if (is_store) { 2021 dst = ureg_dst(memory); 2022 2023 unsigned write_mask = nir_intrinsic_write_mask(instr); 2024 if (nir_src_bit_size(instr->src[0]) == 64) 2025 write_mask = ntt_64bit_write_mask(write_mask); 2026 dst = ureg_writemask(dst, write_mask); 2027 } else { 2028 dst = ntt_get_dest(c, &instr->dest); 2029 } 2030 2031 struct ntt_insn *insn = ntt_insn(c, opcode, dst, src[0], src[1], src[2], src[3]); 2032 insn->tex_target = TGSI_TEXTURE_BUFFER; 2033 insn->mem_qualifier = qualifier; 2034 insn->mem_format = 0; /* unused */ 2035 insn->is_mem = true; 2036} 2037 2038static void 2039ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr) 2040{ 2041 unsigned op; 2042 struct ureg_src srcs[4]; 2043 int num_src = 0; 2044 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); 2045 bool is_array = nir_intrinsic_image_array(instr); 2046 2047 struct ureg_dst temp = ureg_dst_undef(); 2048 2049 enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(dim, is_array, false); 2050 2051 struct ureg_src resource = 2052 ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0), 2053 instr->src[0], 2); 2054 2055 struct ureg_dst dst; 2056 if (instr->intrinsic == nir_intrinsic_image_store) { 2057 dst = ureg_dst(resource); 2058 } 
else { 2059 srcs[num_src++] = resource; 2060 dst = ntt_get_dest(c, &instr->dest); 2061 } 2062 struct ureg_dst opcode_dst = dst; 2063 2064 if (instr->intrinsic != nir_intrinsic_image_size && instr->intrinsic != nir_intrinsic_image_samples) { 2065 struct ureg_src coord = ntt_get_src(c, instr->src[1]); 2066 2067 if (dim == GLSL_SAMPLER_DIM_MS) { 2068 temp = ntt_temp(c); 2069 ntt_MOV(c, temp, coord); 2070 ntt_MOV(c, ureg_writemask(temp, TGSI_WRITEMASK_W), 2071 ureg_scalar(ntt_get_src(c, instr->src[2]), TGSI_SWIZZLE_X)); 2072 coord = ureg_src(temp); 2073 } 2074 srcs[num_src++] = coord; 2075 2076 if (instr->intrinsic != nir_intrinsic_image_load) { 2077 srcs[num_src++] = ntt_get_src(c, instr->src[3]); /* data */ 2078 if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap) 2079 srcs[num_src++] = ntt_get_src(c, instr->src[4]); /* data2 */ 2080 } 2081 } 2082 2083 switch (instr->intrinsic) { 2084 case nir_intrinsic_image_load: 2085 op = TGSI_OPCODE_LOAD; 2086 break; 2087 case nir_intrinsic_image_store: 2088 op = TGSI_OPCODE_STORE; 2089 break; 2090 case nir_intrinsic_image_size: 2091 op = TGSI_OPCODE_RESQ; 2092 break; 2093 case nir_intrinsic_image_samples: 2094 op = TGSI_OPCODE_RESQ; 2095 opcode_dst = ureg_writemask(ntt_temp(c), TGSI_WRITEMASK_W); 2096 break; 2097 case nir_intrinsic_image_atomic_add: 2098 op = TGSI_OPCODE_ATOMUADD; 2099 break; 2100 case nir_intrinsic_image_atomic_fadd: 2101 op = TGSI_OPCODE_ATOMFADD; 2102 break; 2103 case nir_intrinsic_image_atomic_imin: 2104 op = TGSI_OPCODE_ATOMIMIN; 2105 break; 2106 case nir_intrinsic_image_atomic_umin: 2107 op = TGSI_OPCODE_ATOMUMIN; 2108 break; 2109 case nir_intrinsic_image_atomic_imax: 2110 op = TGSI_OPCODE_ATOMIMAX; 2111 break; 2112 case nir_intrinsic_image_atomic_umax: 2113 op = TGSI_OPCODE_ATOMUMAX; 2114 break; 2115 case nir_intrinsic_image_atomic_and: 2116 op = TGSI_OPCODE_ATOMAND; 2117 break; 2118 case nir_intrinsic_image_atomic_or: 2119 op = TGSI_OPCODE_ATOMOR; 2120 break; 2121 case nir_intrinsic_image_atomic_xor: 2122 op = TGSI_OPCODE_ATOMXOR; 2123 break; 2124 case nir_intrinsic_image_atomic_exchange: 2125 op = TGSI_OPCODE_ATOMXCHG; 2126 break; 2127 case nir_intrinsic_image_atomic_comp_swap: 2128 op = TGSI_OPCODE_ATOMCAS; 2129 break; 2130 default: 2131 unreachable("bad op"); 2132 } 2133 2134 struct ntt_insn *insn = ntt_insn(c, op, opcode_dst, srcs[0], srcs[1], srcs[2], srcs[3]); 2135 insn->tex_target = target; 2136 insn->mem_qualifier = ntt_get_access_qualifier(instr); 2137 insn->mem_format = nir_intrinsic_format(instr); 2138 insn->is_mem = true; 2139 2140 if (instr->intrinsic == nir_intrinsic_image_samples) 2141 ntt_MOV(c, dst, ureg_scalar(ureg_src(opcode_dst), 3)); 2142} 2143 2144static void 2145ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr) 2146{ 2147 uint32_t frac = nir_intrinsic_component(instr); 2148 uint32_t num_components = instr->num_components; 2149 unsigned base = nir_intrinsic_base(instr); 2150 struct ureg_src input; 2151 nir_io_semantics semantics = nir_intrinsic_io_semantics(instr); 2152 bool is_64 = nir_dest_bit_size(instr->dest) == 64; 2153 2154 if (c->s->info.stage == MESA_SHADER_VERTEX) { 2155 input = ureg_DECL_vs_input(c->ureg, base); 2156 for (int i = 1; i < semantics.num_slots; i++) 2157 ureg_DECL_vs_input(c->ureg, base + i); 2158 } else if (c->s->info.stage != MESA_SHADER_FRAGMENT) { 2159 unsigned semantic_name, semantic_index; 2160 ntt_get_gl_varying_semantic(c, semantics.location, 2161 &semantic_name, &semantic_index); 2162 2163 /* XXX: ArrayID is used in r600 gs inputs */ 2164 
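   /* array_id 0 declares this as a plain (non-arrayed) TGSI input; we don't
    * emit arrayed input declarations here.
    */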
uint32_t array_id = 0; 2165 2166 input = ureg_DECL_input_layout(c->ureg, 2167 semantic_name, 2168 semantic_index, 2169 base, 2170 ntt_tgsi_usage_mask(frac, 2171 instr->num_components, 2172 is_64), 2173 array_id, 2174 semantics.num_slots); 2175 } else { 2176 input = c->input_index_map[base]; 2177 } 2178 2179 if (is_64) 2180 num_components *= 2; 2181 2182 input = ntt_shift_by_frac(input, frac, num_components); 2183 2184 switch (instr->intrinsic) { 2185 case nir_intrinsic_load_input: 2186 input = ntt_ureg_src_indirect(c, input, instr->src[0], 0); 2187 ntt_store(c, &instr->dest, input); 2188 break; 2189 2190 case nir_intrinsic_load_per_vertex_input: 2191 input = ntt_ureg_src_indirect(c, input, instr->src[1], 0); 2192 input = ntt_ureg_src_dimension_indirect(c, input, instr->src[0]); 2193 ntt_store(c, &instr->dest, input); 2194 break; 2195 2196 case nir_intrinsic_load_interpolated_input: { 2197 input = ntt_ureg_src_indirect(c, input, instr->src[1], 0); 2198 2199 nir_intrinsic_instr *bary_instr = 2200 nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr); 2201 2202 switch (bary_instr->intrinsic) { 2203 case nir_intrinsic_load_barycentric_pixel: 2204 case nir_intrinsic_load_barycentric_sample: 2205 /* For these, we know that the barycentric load matches the 2206 * interpolation on the input declaration, so we can use it directly. 2207 */ 2208 ntt_store(c, &instr->dest, input); 2209 break; 2210 2211 case nir_intrinsic_load_barycentric_centroid: 2212 /* If the input was declared centroid, then there's no need to 2213 * emit the extra TGSI interp instruction, we can just read the 2214 * input. 2215 */ 2216 if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) { 2217 ntt_store(c, &instr->dest, input); 2218 } else { 2219 ntt_INTERP_CENTROID(c, ntt_get_dest(c, &instr->dest), input); 2220 } 2221 break; 2222 2223 case nir_intrinsic_load_barycentric_at_sample: 2224 /* We stored the sample in the fake "bary" dest. */ 2225 ntt_INTERP_SAMPLE(c, ntt_get_dest(c, &instr->dest), input, 2226 ntt_get_src(c, instr->src[0])); 2227 break; 2228 2229 case nir_intrinsic_load_barycentric_at_offset: 2230 /* We stored the offset in the fake "bary" dest. */ 2231 ntt_INTERP_OFFSET(c, ntt_get_dest(c, &instr->dest), input, 2232 ntt_get_src(c, instr->src[0])); 2233 break; 2234 2235 default: 2236 unreachable("bad barycentric interp intrinsic\n"); 2237 } 2238 break; 2239 } 2240 2241 default: 2242 unreachable("bad load input intrinsic\n"); 2243 } 2244} 2245 2246static void 2247ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr) 2248{ 2249 struct ureg_src src = ntt_get_src(c, instr->src[0]); 2250 2251 if (src.File == TGSI_FILE_OUTPUT) { 2252 /* If our src is the output file, that's an indication that we were able 2253 * to emit the output stores in the generating instructions and we have 2254 * nothing to do here. 
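    * (This is the ntt_try_store_in_tgsi_output() optimization mentioned below,
    * which points the generating instruction's dest directly at the OUTPUT
    * file.)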
 */
      return;
   }

   uint32_t frac;
   struct ureg_dst out = ntt_output_decl(c, instr, &frac);

   if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
      out = ntt_ureg_dst_indirect(c, out, instr->src[2]);
      out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[1]);
   } else {
      out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
   }

   uint8_t swizzle[4] = { 0, 0, 0, 0 };
   for (int i = frac; i < 4; i++) {
      if (out.WriteMask & (1 << i))
         swizzle[i] = i - frac;
   }

   src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);

   ntt_MOV(c, out, src);
}

static void
ntt_emit_load_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);

   /* ntt_try_store_in_tgsi_output() optimization is not valid if normal
    * load_output is present.
    */
   assert(c->s->info.stage != MESA_SHADER_VERTEX &&
          (c->s->info.stage != MESA_SHADER_FRAGMENT || semantics.fb_fetch_output));

   uint32_t frac;
   struct ureg_dst out = ntt_output_decl(c, instr, &frac);

   if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) {
      out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
      out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[0]);
   } else {
      out = ntt_ureg_dst_indirect(c, out, instr->src[0]);
   }

   struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
   struct ureg_src out_src = ureg_src(out);

   /* Don't swizzle in unavailable channels of the output for the
    * writemasked-out components.  Avoids compile failures in virglrenderer
    * with TESS_LEVEL_INNER.
    */
   int fill_channel = ffs(dst.WriteMask) - 1;
   uint8_t swizzles[4] = { 0, 1, 2, 3 };
   for (int i = 0; i < 4; i++)
      if (!(dst.WriteMask & (1 << i)))
         swizzles[i] = fill_channel;
   out_src = ureg_swizzle(out_src, swizzles[0], swizzles[1], swizzles[2], swizzles[3]);

   if (semantics.fb_fetch_output)
      ntt_FBFETCH(c, dst, out_src);
   else
      ntt_MOV(c, dst, out_src);
}

static void
ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
   enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
   struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);

   /* virglrenderer doesn't like references to channels of the sysval that
    * aren't defined, even if they aren't really read.  (GLSL compile fails on
    * gl_NumWorkGroups.w, for example).
    */
   uint32_t write_mask = BITSET_MASK(nir_dest_num_components(instr->dest));
   sv = ntt_swizzle_for_write_mask(sv, write_mask);

   /* TGSI and NIR define these intrinsics as always loading ints, but they can
    * still appear on hardware with non-native-integers fragment shaders using
    * the draw path (i915g).  In that case, having called nir_lower_int_to_float
    * means that we actually want floats instead.
2339 */ 2340 if (!c->native_integers) { 2341 switch (instr->intrinsic) { 2342 case nir_intrinsic_load_vertex_id: 2343 case nir_intrinsic_load_instance_id: 2344 ntt_U2F(c, ntt_get_dest(c, &instr->dest), sv); 2345 return; 2346 2347 default: 2348 break; 2349 } 2350 } 2351 2352 ntt_store(c, &instr->dest, sv); 2353} 2354 2355static void 2356ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr) 2357{ 2358 switch (instr->intrinsic) { 2359 case nir_intrinsic_load_ubo: 2360 case nir_intrinsic_load_ubo_vec4: 2361 ntt_emit_load_ubo(c, instr); 2362 break; 2363 2364 /* Vertex */ 2365 case nir_intrinsic_load_vertex_id: 2366 case nir_intrinsic_load_vertex_id_zero_base: 2367 case nir_intrinsic_load_base_vertex: 2368 case nir_intrinsic_load_base_instance: 2369 case nir_intrinsic_load_instance_id: 2370 case nir_intrinsic_load_draw_id: 2371 case nir_intrinsic_load_invocation_id: 2372 case nir_intrinsic_load_frag_coord: 2373 case nir_intrinsic_load_point_coord: 2374 case nir_intrinsic_load_front_face: 2375 case nir_intrinsic_load_sample_id: 2376 case nir_intrinsic_load_sample_pos: 2377 case nir_intrinsic_load_sample_mask_in: 2378 case nir_intrinsic_load_helper_invocation: 2379 case nir_intrinsic_load_tess_coord: 2380 case nir_intrinsic_load_patch_vertices_in: 2381 case nir_intrinsic_load_primitive_id: 2382 case nir_intrinsic_load_tess_level_outer: 2383 case nir_intrinsic_load_tess_level_inner: 2384 case nir_intrinsic_load_local_invocation_id: 2385 case nir_intrinsic_load_workgroup_id: 2386 case nir_intrinsic_load_num_workgroups: 2387 case nir_intrinsic_load_workgroup_size: 2388 case nir_intrinsic_load_subgroup_size: 2389 case nir_intrinsic_load_subgroup_invocation: 2390 case nir_intrinsic_load_subgroup_eq_mask: 2391 case nir_intrinsic_load_subgroup_ge_mask: 2392 case nir_intrinsic_load_subgroup_gt_mask: 2393 case nir_intrinsic_load_subgroup_lt_mask: 2394 ntt_emit_load_sysval(c, instr); 2395 break; 2396 2397 case nir_intrinsic_load_input: 2398 case nir_intrinsic_load_per_vertex_input: 2399 case nir_intrinsic_load_interpolated_input: 2400 ntt_emit_load_input(c, instr); 2401 break; 2402 2403 case nir_intrinsic_store_output: 2404 case nir_intrinsic_store_per_vertex_output: 2405 ntt_emit_store_output(c, instr); 2406 break; 2407 2408 case nir_intrinsic_load_output: 2409 case nir_intrinsic_load_per_vertex_output: 2410 ntt_emit_load_output(c, instr); 2411 break; 2412 2413 case nir_intrinsic_discard: 2414 ntt_KILL(c); 2415 break; 2416 2417 case nir_intrinsic_discard_if: { 2418 struct ureg_src cond = ureg_scalar(ntt_get_src(c, instr->src[0]), 0); 2419 2420 if (c->native_integers) { 2421 struct ureg_dst temp = ureg_writemask(ntt_temp(c), 1); 2422 ntt_AND(c, temp, cond, ureg_imm1f(c->ureg, 1.0)); 2423 ntt_KILL_IF(c, ureg_scalar(ureg_negate(ureg_src(temp)), 0)); 2424 } else { 2425 /* For !native_integers, the bool got lowered to 1.0 or 0.0. 
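       * Negating that gives -1.0 or -0.0, and KILL_IF only discards when a
       * channel is < 0, so only the true (1.0) case kills.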
*/ 2426 ntt_KILL_IF(c, ureg_negate(cond)); 2427 } 2428 break; 2429 } 2430 2431 case nir_intrinsic_load_ssbo: 2432 case nir_intrinsic_store_ssbo: 2433 case nir_intrinsic_ssbo_atomic_add: 2434 case nir_intrinsic_ssbo_atomic_fadd: 2435 case nir_intrinsic_ssbo_atomic_imin: 2436 case nir_intrinsic_ssbo_atomic_imax: 2437 case nir_intrinsic_ssbo_atomic_umin: 2438 case nir_intrinsic_ssbo_atomic_umax: 2439 case nir_intrinsic_ssbo_atomic_and: 2440 case nir_intrinsic_ssbo_atomic_or: 2441 case nir_intrinsic_ssbo_atomic_xor: 2442 case nir_intrinsic_ssbo_atomic_exchange: 2443 case nir_intrinsic_ssbo_atomic_comp_swap: 2444 case nir_intrinsic_get_ssbo_size: 2445 ntt_emit_mem(c, instr, nir_var_mem_ssbo); 2446 break; 2447 2448 case nir_intrinsic_load_shared: 2449 case nir_intrinsic_store_shared: 2450 case nir_intrinsic_shared_atomic_add: 2451 case nir_intrinsic_shared_atomic_fadd: 2452 case nir_intrinsic_shared_atomic_imin: 2453 case nir_intrinsic_shared_atomic_imax: 2454 case nir_intrinsic_shared_atomic_umin: 2455 case nir_intrinsic_shared_atomic_umax: 2456 case nir_intrinsic_shared_atomic_and: 2457 case nir_intrinsic_shared_atomic_or: 2458 case nir_intrinsic_shared_atomic_xor: 2459 case nir_intrinsic_shared_atomic_exchange: 2460 case nir_intrinsic_shared_atomic_comp_swap: 2461 ntt_emit_mem(c, instr, nir_var_mem_shared); 2462 break; 2463 2464 case nir_intrinsic_atomic_counter_read: 2465 case nir_intrinsic_atomic_counter_add: 2466 case nir_intrinsic_atomic_counter_inc: 2467 case nir_intrinsic_atomic_counter_post_dec: 2468 case nir_intrinsic_atomic_counter_min: 2469 case nir_intrinsic_atomic_counter_max: 2470 case nir_intrinsic_atomic_counter_and: 2471 case nir_intrinsic_atomic_counter_or: 2472 case nir_intrinsic_atomic_counter_xor: 2473 case nir_intrinsic_atomic_counter_exchange: 2474 case nir_intrinsic_atomic_counter_comp_swap: 2475 ntt_emit_mem(c, instr, nir_var_uniform); 2476 break; 2477 case nir_intrinsic_atomic_counter_pre_dec: 2478 unreachable("Should be lowered by ntt_lower_atomic_pre_dec()"); 2479 break; 2480 2481 case nir_intrinsic_image_load: 2482 case nir_intrinsic_image_store: 2483 case nir_intrinsic_image_size: 2484 case nir_intrinsic_image_samples: 2485 case nir_intrinsic_image_atomic_add: 2486 case nir_intrinsic_image_atomic_fadd: 2487 case nir_intrinsic_image_atomic_imin: 2488 case nir_intrinsic_image_atomic_umin: 2489 case nir_intrinsic_image_atomic_imax: 2490 case nir_intrinsic_image_atomic_umax: 2491 case nir_intrinsic_image_atomic_and: 2492 case nir_intrinsic_image_atomic_or: 2493 case nir_intrinsic_image_atomic_xor: 2494 case nir_intrinsic_image_atomic_exchange: 2495 case nir_intrinsic_image_atomic_comp_swap: 2496 ntt_emit_image_load_store(c, instr); 2497 break; 2498 2499 case nir_intrinsic_control_barrier: 2500 case nir_intrinsic_memory_barrier_tcs_patch: 2501 ntt_BARRIER(c); 2502 break; 2503 2504 case nir_intrinsic_memory_barrier: 2505 ntt_MEMBAR(c, ureg_imm1u(c->ureg, 2506 TGSI_MEMBAR_SHADER_BUFFER | 2507 TGSI_MEMBAR_ATOMIC_BUFFER | 2508 TGSI_MEMBAR_SHADER_IMAGE | 2509 TGSI_MEMBAR_SHARED)); 2510 break; 2511 2512 case nir_intrinsic_memory_barrier_atomic_counter: 2513 ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_ATOMIC_BUFFER)); 2514 break; 2515 2516 case nir_intrinsic_memory_barrier_buffer: 2517 ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_BUFFER)); 2518 break; 2519 2520 case nir_intrinsic_memory_barrier_image: 2521 ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_IMAGE)); 2522 break; 2523 2524 case nir_intrinsic_memory_barrier_shared: 2525 ntt_MEMBAR(c, 
ureg_imm1u(c->ureg, TGSI_MEMBAR_SHARED)); 2526 break; 2527 2528 case nir_intrinsic_group_memory_barrier: 2529 ntt_MEMBAR(c, ureg_imm1u(c->ureg, 2530 TGSI_MEMBAR_SHADER_BUFFER | 2531 TGSI_MEMBAR_ATOMIC_BUFFER | 2532 TGSI_MEMBAR_SHADER_IMAGE | 2533 TGSI_MEMBAR_SHARED | 2534 TGSI_MEMBAR_THREAD_GROUP)); 2535 break; 2536 2537 case nir_intrinsic_end_primitive: 2538 ntt_ENDPRIM(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr))); 2539 break; 2540 2541 case nir_intrinsic_emit_vertex: 2542 ntt_EMIT(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr))); 2543 break; 2544 2545 /* In TGSI we don't actually generate the barycentric coords, and emit 2546 * interp intrinsics later. However, we do need to store the 2547 * load_barycentric_at_* argument so that we can use it at that point. 2548 */ 2549 case nir_intrinsic_load_barycentric_pixel: 2550 case nir_intrinsic_load_barycentric_centroid: 2551 case nir_intrinsic_load_barycentric_sample: 2552 break; 2553 case nir_intrinsic_load_barycentric_at_sample: 2554 case nir_intrinsic_load_barycentric_at_offset: 2555 ntt_store(c, &instr->dest, ntt_get_src(c, instr->src[0])); 2556 break; 2557 2558 case nir_intrinsic_shader_clock: 2559 ntt_CLOCK(c, ntt_get_dest(c, &instr->dest)); 2560 break; 2561 2562 default: 2563 fprintf(stderr, "Unknown intrinsic: "); 2564 nir_print_instr(&instr->instr, stderr); 2565 fprintf(stderr, "\n"); 2566 break; 2567 } 2568} 2569 2570struct ntt_tex_operand_state { 2571 struct ureg_src srcs[4]; 2572 unsigned i; 2573}; 2574 2575static void 2576ntt_push_tex_arg(struct ntt_compile *c, 2577 nir_tex_instr *instr, 2578 nir_tex_src_type tex_src_type, 2579 struct ntt_tex_operand_state *s) 2580{ 2581 int tex_src = nir_tex_instr_src_index(instr, tex_src_type); 2582 if (tex_src < 0) 2583 return; 2584 2585 nir_src *src = &instr->src[tex_src].src; 2586 2587 /* virglrenderer workaround that's hard to do in tgsi_translate: Make sure 2588 * that TG4's immediate offset arg is float-typed. 
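    * When the offset is a constant, re-emit it as a float immediate here
    * instead of the integer immediate that ntt_get_src() would produce.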
2589 */ 2590 if (instr->op == nir_texop_tg4 && tex_src_type == nir_tex_src_backend2 && 2591 nir_src_is_const(*src)) { 2592 nir_const_value *consts = nir_src_as_const_value(*src); 2593 s->srcs[s->i++] = ureg_imm4f(c->ureg, 2594 consts[0].f32, 2595 consts[1].f32, 2596 consts[2].f32, 2597 consts[3].f32); 2598 return; 2599 } 2600 2601 s->srcs[s->i++] = ntt_get_src(c, *src); 2602} 2603 2604static void 2605ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr) 2606{ 2607 struct ureg_dst dst = ntt_get_dest(c, &instr->dest); 2608 enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow); 2609 unsigned tex_opcode; 2610 2611 struct ureg_src sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index); 2612 int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset); 2613 if (sampler_src >= 0) { 2614 struct ureg_src reladdr = ntt_get_src(c, instr->src[sampler_src].src); 2615 sampler = ureg_src_indirect(sampler, ntt_reladdr(c, reladdr, 2)); 2616 } 2617 2618 switch (instr->op) { 2619 case nir_texop_tex: 2620 if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) > 2621 MAX2(instr->coord_components, 2) + instr->is_shadow) 2622 tex_opcode = TGSI_OPCODE_TXP; 2623 else 2624 tex_opcode = TGSI_OPCODE_TEX; 2625 break; 2626 case nir_texop_txf: 2627 case nir_texop_txf_ms: 2628 tex_opcode = TGSI_OPCODE_TXF; 2629 2630 if (c->has_txf_lz) { 2631 int lod_src = nir_tex_instr_src_index(instr, nir_tex_src_lod); 2632 if (lod_src >= 0 && 2633 nir_src_is_const(instr->src[lod_src].src) && 2634 ntt_src_as_uint(c, instr->src[lod_src].src) == 0) { 2635 tex_opcode = TGSI_OPCODE_TXF_LZ; 2636 } 2637 } 2638 break; 2639 case nir_texop_txl: 2640 tex_opcode = TGSI_OPCODE_TXL; 2641 break; 2642 case nir_texop_txb: 2643 tex_opcode = TGSI_OPCODE_TXB; 2644 break; 2645 case nir_texop_txd: 2646 tex_opcode = TGSI_OPCODE_TXD; 2647 break; 2648 case nir_texop_txs: 2649 tex_opcode = TGSI_OPCODE_TXQ; 2650 break; 2651 case nir_texop_tg4: 2652 tex_opcode = TGSI_OPCODE_TG4; 2653 break; 2654 case nir_texop_query_levels: 2655 tex_opcode = TGSI_OPCODE_TXQ; 2656 break; 2657 case nir_texop_lod: 2658 tex_opcode = TGSI_OPCODE_LODQ; 2659 break; 2660 case nir_texop_texture_samples: 2661 tex_opcode = TGSI_OPCODE_TXQS; 2662 break; 2663 default: 2664 unreachable("unsupported tex op"); 2665 } 2666 2667 struct ntt_tex_operand_state s = { .i = 0 }; 2668 ntt_push_tex_arg(c, instr, nir_tex_src_backend1, &s); 2669 ntt_push_tex_arg(c, instr, nir_tex_src_backend2, &s); 2670 2671 /* non-coord arg for TXQ */ 2672 if (tex_opcode == TGSI_OPCODE_TXQ) { 2673 ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s); 2674 /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's 2675 * scalar 2676 */ 2677 s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0); 2678 } 2679 2680 if (s.i > 1) { 2681 if (tex_opcode == TGSI_OPCODE_TEX) 2682 tex_opcode = TGSI_OPCODE_TEX2; 2683 if (tex_opcode == TGSI_OPCODE_TXB) 2684 tex_opcode = TGSI_OPCODE_TXB2; 2685 if (tex_opcode == TGSI_OPCODE_TXL) 2686 tex_opcode = TGSI_OPCODE_TXL2; 2687 } 2688 2689 if (instr->op == nir_texop_txd) { 2690 /* Derivs appear in their own src args */ 2691 int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx); 2692 int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy); 2693 s.srcs[s.i++] = ntt_get_src(c, instr->src[ddx].src); 2694 s.srcs[s.i++] = ntt_get_src(c, instr->src[ddy].src); 2695 } 2696 2697 if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 2698 if 
(c->screen->get_param(c->screen, 2699 PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) { 2700 sampler = ureg_scalar(sampler, instr->component); 2701 s.srcs[s.i++] = ureg_src_undef(); 2702 } else { 2703 s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component); 2704 } 2705 } 2706 2707 s.srcs[s.i++] = sampler; 2708 2709 enum tgsi_return_type tex_type; 2710 switch (instr->dest_type) { 2711 case nir_type_float32: 2712 tex_type = TGSI_RETURN_TYPE_FLOAT; 2713 break; 2714 case nir_type_int32: 2715 tex_type = TGSI_RETURN_TYPE_SINT; 2716 break; 2717 case nir_type_uint32: 2718 tex_type = TGSI_RETURN_TYPE_UINT; 2719 break; 2720 default: 2721 unreachable("unknown texture type"); 2722 } 2723 2724 struct tgsi_texture_offset tex_offset = { 2725 .File = TGSI_FILE_NULL 2726 }; 2727 int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset); 2728 if (tex_offset_src >= 0) { 2729 struct ureg_src offset = ntt_get_src(c, instr->src[tex_offset_src].src); 2730 2731 tex_offset.File = offset.File; 2732 tex_offset.Index = offset.Index; 2733 tex_offset.SwizzleX = offset.SwizzleX; 2734 tex_offset.SwizzleY = offset.SwizzleY; 2735 tex_offset.SwizzleZ = offset.SwizzleZ; 2736 tex_offset.Padding = 0; 2737 } 2738 2739 struct ureg_dst tex_dst; 2740 if (instr->op == nir_texop_query_levels) 2741 tex_dst = ureg_writemask(ntt_temp(c), TGSI_WRITEMASK_W); 2742 else 2743 tex_dst = dst; 2744 2745 while (s.i < 4) 2746 s.srcs[s.i++] = ureg_src_undef(); 2747 2748 struct ntt_insn *insn = ntt_insn(c, tex_opcode, tex_dst, s.srcs[0], s.srcs[1], s.srcs[2], s.srcs[3]); 2749 insn->tex_target = target; 2750 insn->tex_return_type = tex_type; 2751 insn->tex_offset = tex_offset; 2752 insn->is_tex = true; 2753 2754 if (instr->op == nir_texop_query_levels) 2755 ntt_MOV(c, dst, ureg_scalar(ureg_src(tex_dst), 3)); 2756} 2757 2758static void 2759ntt_emit_jump(struct ntt_compile *c, nir_jump_instr *jump) 2760{ 2761 switch (jump->type) { 2762 case nir_jump_break: 2763 ntt_BRK(c); 2764 break; 2765 2766 case nir_jump_continue: 2767 ntt_CONT(c); 2768 break; 2769 2770 default: 2771 fprintf(stderr, "Unknown jump instruction: "); 2772 nir_print_instr(&jump->instr, stderr); 2773 fprintf(stderr, "\n"); 2774 abort(); 2775 } 2776} 2777 2778static void 2779ntt_emit_ssa_undef(struct ntt_compile *c, nir_ssa_undef_instr *instr) 2780{ 2781 /* Nothing to do but make sure that we have some storage to deref. */ 2782 (void)ntt_get_ssa_def_decl(c, &instr->def); 2783} 2784 2785static void 2786ntt_emit_instr(struct ntt_compile *c, nir_instr *instr) 2787{ 2788 switch (instr->type) { 2789 case nir_instr_type_deref: 2790 /* ignored, will be walked by nir_intrinsic_image_*_deref. */ 2791 break; 2792 2793 case nir_instr_type_alu: 2794 ntt_emit_alu(c, nir_instr_as_alu(instr)); 2795 break; 2796 2797 case nir_instr_type_intrinsic: 2798 ntt_emit_intrinsic(c, nir_instr_as_intrinsic(instr)); 2799 break; 2800 2801 case nir_instr_type_load_const: 2802 /* Nothing to do here, as load consts are done directly from 2803 * ntt_get_src() (since many constant NIR srcs will often get folded 2804 * directly into a register file index instead of as a TGSI src). 
2805 */ 2806 break; 2807 2808 case nir_instr_type_tex: 2809 ntt_emit_texture(c, nir_instr_as_tex(instr)); 2810 break; 2811 2812 case nir_instr_type_jump: 2813 ntt_emit_jump(c, nir_instr_as_jump(instr)); 2814 break; 2815 2816 case nir_instr_type_ssa_undef: 2817 ntt_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr)); 2818 break; 2819 2820 default: 2821 fprintf(stderr, "Unknown NIR instr type: "); 2822 nir_print_instr(instr, stderr); 2823 fprintf(stderr, "\n"); 2824 abort(); 2825 } 2826} 2827 2828static void 2829ntt_emit_if(struct ntt_compile *c, nir_if *if_stmt) 2830{ 2831 if (c->native_integers) 2832 ntt_UIF(c, c->if_cond); 2833 else 2834 ntt_IF(c, c->if_cond); 2835 2836 ntt_emit_cf_list(c, &if_stmt->then_list); 2837 2838 if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) { 2839 ntt_ELSE(c); 2840 ntt_emit_cf_list(c, &if_stmt->else_list); 2841 } 2842 2843 ntt_ENDIF(c); 2844} 2845 2846static void 2847ntt_emit_loop(struct ntt_compile *c, nir_loop *loop) 2848{ 2849 ntt_BGNLOOP(c); 2850 ntt_emit_cf_list(c, &loop->body); 2851 ntt_ENDLOOP(c); 2852} 2853 2854static void 2855ntt_emit_block(struct ntt_compile *c, nir_block *block) 2856{ 2857 struct ntt_block *ntt_block = ntt_block_from_nir(c, block); 2858 c->cur_block = ntt_block; 2859 2860 nir_foreach_instr(instr, block) { 2861 ntt_emit_instr(c, instr); 2862 2863 /* Sanity check that we didn't accidentally ureg_OPCODE() instead of ntt_OPCODE(). */ 2864 if (ureg_get_instruction_number(c->ureg) != 0) { 2865 fprintf(stderr, "Emitted ureg insn during: "); 2866 nir_print_instr(instr, stderr); 2867 fprintf(stderr, "\n"); 2868 unreachable("emitted ureg insn"); 2869 } 2870 } 2871 2872 /* Set up the if condition for ntt_emit_if(), which we have to do before 2873 * freeing up the temps (the "if" is treated as inside the block for liveness 2874 * purposes, despite not being an instruction) 2875 * 2876 * Note that, while IF and UIF are supposed to look at only .x, virglrenderer 2877 * looks at all of .xyzw. No harm in working around the bug. 2878 */ 2879 nir_if *nif = nir_block_get_following_if(block); 2880 if (nif) 2881 c->if_cond = ureg_scalar(ntt_get_src(c, nif->condition), TGSI_SWIZZLE_X); 2882} 2883 2884static void 2885ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list) 2886{ 2887 foreach_list_typed(nir_cf_node, node, node, list) { 2888 switch (node->type) { 2889 case nir_cf_node_block: 2890 ntt_emit_block(c, nir_cf_node_as_block(node)); 2891 break; 2892 2893 case nir_cf_node_if: 2894 ntt_emit_if(c, nir_cf_node_as_if(node)); 2895 break; 2896 2897 case nir_cf_node_loop: 2898 ntt_emit_loop(c, nir_cf_node_as_loop(node)); 2899 break; 2900 2901 default: 2902 unreachable("unknown CF type"); 2903 } 2904 } 2905} 2906 2907static void 2908ntt_emit_block_ureg(struct ntt_compile *c, struct nir_block *block) 2909{ 2910 struct ntt_block *ntt_block = ntt_block_from_nir(c, block); 2911 2912 /* Emit the ntt insns to tgsi_ureg. 
*/ 2913 util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) { 2914 const struct tgsi_opcode_info *opcode_info = 2915 tgsi_get_opcode_info(insn->opcode); 2916 2917 switch (insn->opcode) { 2918 case TGSI_OPCODE_UIF: 2919 ureg_UIF(c->ureg, insn->src[0], &c->cf_label); 2920 break; 2921 2922 case TGSI_OPCODE_IF: 2923 ureg_IF(c->ureg, insn->src[0], &c->cf_label); 2924 break; 2925 2926 case TGSI_OPCODE_ELSE: 2927 ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg)); 2928 ureg_ELSE(c->ureg, &c->cf_label); 2929 c->current_if_else = c->cf_label; 2930 break; 2931 2932 case TGSI_OPCODE_ENDIF: 2933 ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg)); 2934 ureg_ENDIF(c->ureg); 2935 break; 2936 2937 case TGSI_OPCODE_BGNLOOP: 2938 /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx 2939 * does reference BGNLOOP's. Follow the former behavior unless something comes up 2940 * with a need. 2941 */ 2942 ureg_BGNLOOP(c->ureg, &c->cf_label); 2943 break; 2944 2945 case TGSI_OPCODE_ENDLOOP: 2946 ureg_ENDLOOP(c->ureg, &c->cf_label); 2947 break; 2948 2949 default: 2950 if (insn->is_tex) { 2951 ureg_tex_insn(c->ureg, insn->opcode, 2952 insn->dst, opcode_info->num_dst, 2953 insn->tex_target, insn->tex_return_type, 2954 &insn->tex_offset, 2955 insn->tex_offset.File != TGSI_FILE_NULL ? 1 : 0, 2956 insn->src, opcode_info->num_src); 2957 } else if (insn->is_mem) { 2958 ureg_memory_insn(c->ureg, insn->opcode, 2959 insn->dst, opcode_info->num_dst, 2960 insn->src, opcode_info->num_src, 2961 insn->mem_qualifier, 2962 insn->tex_target, 2963 insn->mem_format); 2964 } else { 2965 ureg_insn(c->ureg, insn->opcode, 2966 insn->dst, opcode_info->num_dst, 2967 insn->src, opcode_info->num_src, 2968 insn->precise); 2969 } 2970 } 2971 } 2972} 2973 2974static void 2975ntt_emit_if_ureg(struct ntt_compile *c, nir_if *if_stmt) 2976{ 2977 /* Note: the last block emitted our IF opcode. */ 2978 2979 int if_stack = c->current_if_else; 2980 c->current_if_else = c->cf_label; 2981 2982 /* Either the then or else block includes the ENDIF, which will fix up the 2983 * IF(/ELSE)'s label for jumping 2984 */ 2985 ntt_emit_cf_list_ureg(c, &if_stmt->then_list); 2986 ntt_emit_cf_list_ureg(c, &if_stmt->else_list); 2987 2988 c->current_if_else = if_stack; 2989} 2990 2991static void 2992ntt_emit_cf_list_ureg(struct ntt_compile *c, struct exec_list *list) 2993{ 2994 foreach_list_typed(nir_cf_node, node, node, list) { 2995 switch (node->type) { 2996 case nir_cf_node_block: 2997 ntt_emit_block_ureg(c, nir_cf_node_as_block(node)); 2998 break; 2999 3000 case nir_cf_node_if: 3001 ntt_emit_if_ureg(c, nir_cf_node_as_if(node)); 3002 break; 3003 3004 case nir_cf_node_loop: 3005 /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx 3006 * does reference BGNLOOP's. Follow the former behavior unless something comes up 3007 * with a need. 
3008 */ 3009 ntt_emit_cf_list_ureg(c, &nir_cf_node_as_loop(node)->body); 3010 break; 3011 3012 default: 3013 unreachable("unknown CF type"); 3014 } 3015 } 3016} 3017 3018static void 3019ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl) 3020{ 3021 c->impl = impl; 3022 3023 c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc); 3024 c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->reg_alloc); 3025 3026 /* Set up the struct ntt_blocks to put insns in */ 3027 c->blocks = _mesa_pointer_hash_table_create(c); 3028 nir_foreach_block(block, impl) { 3029 struct ntt_block *ntt_block = rzalloc(c->blocks, struct ntt_block); 3030 util_dynarray_init(&ntt_block->insns, ntt_block); 3031 _mesa_hash_table_insert(c->blocks, block, ntt_block); 3032 } 3033 3034 3035 ntt_setup_registers(c, &impl->registers); 3036 3037 c->cur_block = ntt_block_from_nir(c, nir_start_block(impl)); 3038 ntt_setup_inputs(c); 3039 ntt_setup_outputs(c); 3040 ntt_setup_uniforms(c); 3041 3042 /* Emit the ntt insns */ 3043 ntt_emit_cf_list(c, &impl->body); 3044 3045 /* Don't do optimized RA if the driver requests it, unless the number of 3046 * temps is too large to be covered by the 16 bit signed int that TGSI 3047 * allocates for the register index */ 3048 if (!c->options->unoptimized_ra || c->num_temps > 0x7fff) 3049 ntt_allocate_regs(c, impl); 3050 else 3051 ntt_allocate_regs_unoptimized(c, impl); 3052 3053 /* Turn the ntt insns into actual TGSI tokens */ 3054 ntt_emit_cf_list_ureg(c, &impl->body); 3055 3056 ralloc_free(c->liveness); 3057 c->liveness = NULL; 3058 3059} 3060 3061static int 3062type_size(const struct glsl_type *type, bool bindless) 3063{ 3064 return glsl_count_attribute_slots(type, false); 3065} 3066 3067/* Allow vectorizing of ALU instructions, but avoid vectorizing past what we 3068 * can handle for 64-bit values in TGSI. 3069 */ 3070static uint8_t 3071ntt_should_vectorize_instr(const nir_instr *instr, const void *data) 3072{ 3073 if (instr->type != nir_instr_type_alu) 3074 return 0; 3075 3076 nir_alu_instr *alu = nir_instr_as_alu(instr); 3077 3078 switch (alu->op) { 3079 case nir_op_ibitfield_extract: 3080 case nir_op_ubitfield_extract: 3081 case nir_op_bitfield_insert: 3082 /* virglrenderer only looks at the .x channel of the offset/bits operands 3083 * when translating to GLSL. tgsi.rst doesn't seem to require scalar 3084 * offset/bits operands. 3085 * 3086 * https://gitlab.freedesktop.org/virgl/virglrenderer/-/issues/195 3087 */ 3088 return 1; 3089 3090 default: 3091 break; 3092 } 3093 3094 int src_bit_size = nir_src_bit_size(alu->src[0].src); 3095 int dst_bit_size = nir_dest_bit_size(alu->dest.dest); 3096 3097 if (src_bit_size == 64 || dst_bit_size == 64) { 3098 /* Avoid vectorizing 64-bit instructions at all. Despite tgsi.rst 3099 * claiming support, virglrenderer generates bad shaders on the host when 3100 * presented with them. Maybe we can make virgl avoid tickling the 3101 * virglrenderer bugs, but given that glsl-to-TGSI didn't generate vector 3102 * 64-bit instrs in the first place, I don't see much reason to care about 3103 * this. 
       */
      return 1;
   }

   return 4;
}

static bool
ntt_should_vectorize_io(unsigned align, unsigned bit_size,
                        unsigned num_components, unsigned high_offset,
                        nir_intrinsic_instr *low, nir_intrinsic_instr *high,
                        void *data)
{
   if (bit_size != 32)
      return false;

   /* Our offset alignment should always be at least 4 bytes */
   if (align < 4)
      return false;

   /* No wrapping off the end of a TGSI reg.  We could do a bit better by
    * looking at low's actual offset.  XXX: With LOAD_CONSTBUF maybe we don't
    * need this restriction.
    */
   unsigned worst_start_component = align == 4 ? 3 : align / 4;
   if (worst_start_component + num_components > 4)
      return false;

   return true;
}

static nir_variable_mode
ntt_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
{
   unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
   unsigned indirect_mask = 0;

   if (!screen->get_shader_param(screen, pipe_stage,
                                 PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
      indirect_mask |= nir_var_shader_in;
   }

   if (!screen->get_shader_param(screen, pipe_stage,
                                 PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
      indirect_mask |= nir_var_shader_out;
   }

   if (!screen->get_shader_param(screen, pipe_stage,
                                 PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
      indirect_mask |= nir_var_function_temp;
   }

   return indirect_mask;
}

static void
ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
{
   bool progress;
   unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
   unsigned control_flow_depth =
      screen->get_shader_param(screen, pipe_stage,
                               PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
   do {
      progress = false;

      NIR_PASS_V(s, nir_lower_vars_to_ssa);
      NIR_PASS_V(s, nir_split_64bit_vec3_and_vec4);

      NIR_PASS(progress, s, nir_copy_prop);
      NIR_PASS(progress, s, nir_opt_algebraic);
      NIR_PASS(progress, s, nir_opt_constant_folding);
      NIR_PASS(progress, s, nir_opt_remove_phis);
      NIR_PASS(progress, s, nir_opt_conditional_discard);
      NIR_PASS(progress, s, nir_opt_dce);
      NIR_PASS(progress, s, nir_opt_dead_cf);
      NIR_PASS(progress, s, nir_opt_cse);
      NIR_PASS(progress, s, nir_opt_find_array_copies);
      NIR_PASS(progress, s, nir_opt_copy_prop_vars);
      NIR_PASS(progress, s, nir_opt_dead_write_vars);

      NIR_PASS(progress, s, nir_opt_if, nir_opt_if_aggressive_last_continue | nir_opt_if_optimize_phi_true_false);
      NIR_PASS(progress, s, nir_opt_peephole_select,
               control_flow_depth == 0 ? ~0 : 8, true, true);
      NIR_PASS(progress, s, nir_opt_algebraic);
      NIR_PASS(progress, s, nir_opt_constant_folding);
      nir_load_store_vectorize_options vectorize_opts = {
         .modes = nir_var_mem_ubo,
         .callback = ntt_should_vectorize_io,
         .robust_modes = 0,
      };
      NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
      NIR_PASS(progress, s, nir_opt_shrink_stores, true);
      NIR_PASS(progress, s, nir_opt_shrink_vectors);
      NIR_PASS(progress, s, nir_opt_trivial_continues);
      NIR_PASS(progress, s, nir_opt_vectorize, ntt_should_vectorize_instr, NULL);
      NIR_PASS(progress, s, nir_opt_undef);
      NIR_PASS(progress, s, nir_opt_loop_unroll);

      /* Try to fold addressing math into ubo_vec4's base to avoid load_consts
       * and ALU ops for it.
3205 */ 3206 static const nir_opt_offsets_options offset_options = { 3207 .ubo_vec4_max = ~0, 3208 3209 /* No const offset in TGSI for shared accesses. */ 3210 .shared_max = 0, 3211 3212 /* unused intrinsics */ 3213 .uniform_max = 0, 3214 .buffer_max = 0, 3215 }; 3216 NIR_PASS(progress, s, nir_opt_offsets, &offset_options); 3217 } while (progress); 3218 3219 NIR_PASS_V(s, nir_lower_var_copies); 3220} 3221 3222/* Scalarizes all 64-bit ALU ops. Note that we only actually need to 3223 * scalarize vec3/vec4s, should probably fix that. 3224 */ 3225static bool 3226scalarize_64bit(const nir_instr *instr, const void *data) 3227{ 3228 const nir_alu_instr *alu = nir_instr_as_alu(instr); 3229 3230 return (nir_dest_bit_size(alu->dest.dest) == 64 || 3231 nir_src_bit_size(alu->src[0].src) == 64); 3232} 3233 3234static bool 3235nir_to_tgsi_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr) 3236{ 3237 b->cursor = nir_after_instr(&instr->instr); 3238 3239 switch (instr->intrinsic) { 3240 case nir_intrinsic_load_ubo: 3241 case nir_intrinsic_load_ubo_vec4: 3242 case nir_intrinsic_load_ssbo: 3243 case nir_intrinsic_load_input: 3244 case nir_intrinsic_load_interpolated_input: 3245 case nir_intrinsic_load_per_vertex_input: 3246 case nir_intrinsic_store_output: 3247 case nir_intrinsic_store_per_vertex_output: 3248 case nir_intrinsic_store_ssbo: 3249 break; 3250 default: 3251 return false; 3252 } 3253 3254 if (instr->num_components <= 2) 3255 return false; 3256 3257 bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest; 3258 if (has_dest) { 3259 if (nir_dest_bit_size(instr->dest) != 64) 3260 return false; 3261 } else { 3262 if (nir_src_bit_size(instr->src[0]) != 64) 3263 return false; 3264 } 3265 3266 nir_intrinsic_instr *first = 3267 nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr)); 3268 nir_intrinsic_instr *second = 3269 nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr)); 3270 3271 switch (instr->intrinsic) { 3272 case nir_intrinsic_load_ubo: 3273 case nir_intrinsic_load_ubo_vec4: 3274 case nir_intrinsic_load_ssbo: 3275 case nir_intrinsic_store_ssbo: 3276 break; 3277 3278 default: { 3279 nir_io_semantics semantics = nir_intrinsic_io_semantics(second); 3280 semantics.location++; 3281 semantics.num_slots--; 3282 nir_intrinsic_set_io_semantics(second, semantics); 3283 3284 nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1); 3285 break; 3286 } 3287 } 3288 3289 first->num_components = 2; 3290 second->num_components -= 2; 3291 if (has_dest) { 3292 first->dest.ssa.num_components = 2; 3293 second->dest.ssa.num_components -= 2; 3294 } 3295 3296 nir_builder_instr_insert(b, &first->instr); 3297 nir_builder_instr_insert(b, &second->instr); 3298 3299 if (has_dest) { 3300 /* Merge the two loads' results back into a vector. */ 3301 nir_ssa_scalar channels[4] = { 3302 nir_get_ssa_scalar(&first->dest.ssa, 0), 3303 nir_get_ssa_scalar(&first->dest.ssa, 1), 3304 nir_get_ssa_scalar(&second->dest.ssa, 0), 3305 nir_get_ssa_scalar(&second->dest.ssa, second->num_components > 1 ? 1 : 0), 3306 }; 3307 nir_ssa_def *new = nir_vec_scalars(b, channels, instr->num_components); 3308 nir_ssa_def_rewrite_uses(&instr->dest.ssa, new); 3309 } else { 3310 /* Split the src value across the two stores. 
*/ 3311 b->cursor = nir_before_instr(&instr->instr); 3312 3313 nir_ssa_def *src0 = instr->src[0].ssa; 3314 nir_ssa_scalar channels[4] = { 0 }; 3315 for (int i = 0; i < instr->num_components; i++) 3316 channels[i] = nir_get_ssa_scalar(src0, i); 3317 3318 nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3); 3319 nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2); 3320 3321 nir_instr_rewrite_src(&first->instr, &first->src[0], 3322 nir_src_for_ssa(nir_vec_scalars(b, channels, 2))); 3323 nir_instr_rewrite_src(&second->instr, &second->src[0], 3324 nir_src_for_ssa(nir_vec_scalars(b, &channels[2], 3325 second->num_components))); 3326 } 3327 3328 int offset_src = -1; 3329 uint32_t offset_amount = 16; 3330 3331 switch (instr->intrinsic) { 3332 case nir_intrinsic_load_ssbo: 3333 case nir_intrinsic_load_ubo: 3334 offset_src = 1; 3335 break; 3336 case nir_intrinsic_load_ubo_vec4: 3337 offset_src = 1; 3338 offset_amount = 1; 3339 break; 3340 case nir_intrinsic_store_ssbo: 3341 offset_src = 2; 3342 break; 3343 default: 3344 break; 3345 } 3346 if (offset_src != -1) { 3347 b->cursor = nir_before_instr(&second->instr); 3348 nir_ssa_def *second_offset = 3349 nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount); 3350 nir_instr_rewrite_src(&second->instr, &second->src[offset_src], 3351 nir_src_for_ssa(second_offset)); 3352 } 3353 3354 /* DCE stores we generated with no writemask (nothing else does this 3355 * currently). 3356 */ 3357 if (!has_dest) { 3358 if (nir_intrinsic_write_mask(first) == 0) 3359 nir_instr_remove(&first->instr); 3360 if (nir_intrinsic_write_mask(second) == 0) 3361 nir_instr_remove(&second->instr); 3362 } 3363 3364 nir_instr_remove(&instr->instr); 3365 3366 return true; 3367} 3368 3369static bool 3370nir_to_tgsi_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr) 3371{ 3372 int num_components = instr->def.num_components; 3373 3374 if (instr->def.bit_size != 64 || num_components <= 2) 3375 return false; 3376 3377 b->cursor = nir_before_instr(&instr->instr); 3378 3379 nir_load_const_instr *first = 3380 nir_load_const_instr_create(b->shader, 2, 64); 3381 nir_load_const_instr *second = 3382 nir_load_const_instr_create(b->shader, num_components - 2, 64); 3383 3384 first->value[0] = instr->value[0]; 3385 first->value[1] = instr->value[1]; 3386 second->value[0] = instr->value[2]; 3387 if (num_components == 4) 3388 second->value[1] = instr->value[3]; 3389 3390 nir_builder_instr_insert(b, &first->instr); 3391 nir_builder_instr_insert(b, &second->instr); 3392 3393 nir_ssa_def *channels[4] = { 3394 nir_channel(b, &first->def, 0), 3395 nir_channel(b, &first->def, 1), 3396 nir_channel(b, &second->def, 0), 3397 num_components == 4 ? 
nir_channel(b, &second->def, 1) : NULL, 3398 }; 3399 nir_ssa_def *new = nir_vec(b, channels, num_components); 3400 nir_ssa_def_rewrite_uses(&instr->def, new); 3401 nir_instr_remove(&instr->instr); 3402 3403 return true; 3404} 3405 3406static bool 3407nir_to_tgsi_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr, 3408 void *data) 3409{ 3410 switch (instr->type) { 3411 case nir_instr_type_load_const: 3412 return nir_to_tgsi_lower_64bit_load_const(b, nir_instr_as_load_const(instr)); 3413 3414 case nir_instr_type_intrinsic: 3415 return nir_to_tgsi_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr)); 3416 default: 3417 return false; 3418 } 3419} 3420 3421static bool 3422nir_to_tgsi_lower_64bit_to_vec2(nir_shader *s) 3423{ 3424 return nir_shader_instructions_pass(s, 3425 nir_to_tgsi_lower_64bit_to_vec2_instr, 3426 nir_metadata_block_index | 3427 nir_metadata_dominance, 3428 NULL); 3429} 3430 3431struct ntt_lower_tex_state { 3432 nir_ssa_scalar channels[8]; 3433 unsigned i; 3434}; 3435 3436static void 3437nir_to_tgsi_lower_tex_instr_arg(nir_builder *b, 3438 nir_tex_instr *instr, 3439 nir_tex_src_type tex_src_type, 3440 struct ntt_lower_tex_state *s) 3441{ 3442 int tex_src = nir_tex_instr_src_index(instr, tex_src_type); 3443 if (tex_src < 0) 3444 return; 3445 3446 assert(instr->src[tex_src].src.is_ssa); 3447 3448 nir_ssa_def *def = instr->src[tex_src].src.ssa; 3449 for (int i = 0; i < def->num_components; i++) { 3450 s->channels[s->i++] = nir_get_ssa_scalar(def, i); 3451 } 3452 3453 nir_tex_instr_remove_src(instr, tex_src); 3454} 3455 3456/** 3457 * Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex 3458 * src. This lets NIR handle the coalescing of the vec4 rather than trying to 3459 * manage it on our own, and may lead to more vectorization. 3460 */ 3461static bool 3462nir_to_tgsi_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data) 3463{ 3464 if (instr->type != nir_instr_type_tex) 3465 return false; 3466 3467 nir_tex_instr *tex = nir_instr_as_tex(instr); 3468 3469 if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0) 3470 return false; 3471 3472 b->cursor = nir_before_instr(instr); 3473 3474 struct ntt_lower_tex_state s = {0}; 3475 3476 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s); 3477 /* We always have at least two slots for the coordinate, even on 1D. */ 3478 s.i = MAX2(s.i, 2); 3479 3480 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s); 3481 s.i = MAX2(s.i, 3); 3482 3483 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s); 3484 3485 /* XXX: LZ */ 3486 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s); 3487 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s); 3488 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s); 3489 3490 /* No need to pack undefs in unused channels of the tex instr */ 3491 while (!s.channels[s.i - 1].def) 3492 s.i--; 3493 3494 /* Instead of putting undefs in the unused slots of the vecs, just put in 3495 * another used channel. Otherwise, we'll get unnecessary moves into 3496 * registers. 
3497 */ 3498 assert(s.channels[0].def != NULL); 3499 for (int i = 1; i < s.i; i++) { 3500 if (!s.channels[i].def) 3501 s.channels[i] = s.channels[0]; 3502 } 3503 3504 nir_tex_instr_add_src(tex, nir_tex_src_backend1, nir_src_for_ssa(nir_vec_scalars(b, s.channels, MIN2(s.i, 4)))); 3505 if (s.i > 4) 3506 nir_tex_instr_add_src(tex, nir_tex_src_backend2, nir_src_for_ssa(nir_vec_scalars(b, &s.channels[4], s.i - 4))); 3507 3508 return true; 3509} 3510 3511static bool 3512nir_to_tgsi_lower_tex(nir_shader *s) 3513{ 3514 return nir_shader_instructions_pass(s, 3515 nir_to_tgsi_lower_tex_instr, 3516 nir_metadata_block_index | 3517 nir_metadata_dominance, 3518 NULL); 3519} 3520 3521static void 3522ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s, 3523 const struct nir_to_tgsi_options *ntt_options) 3524{ 3525 const struct nir_shader_compiler_options *options = s->options; 3526 bool lower_fsqrt = 3527 !screen->get_shader_param(screen, pipe_shader_type_from_mesa(s->info.stage), 3528 PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED); 3529 3530 bool force_indirect_unrolling_sampler = 3531 screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL) < 400; 3532 3533 nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen); 3534 3535 if (!options->lower_extract_byte || 3536 !options->lower_extract_word || 3537 !options->lower_insert_byte || 3538 !options->lower_insert_word || 3539 !options->lower_fdph || 3540 !options->lower_flrp64 || 3541 !options->lower_fmod || 3542 !options->lower_rotate || 3543 !options->lower_uadd_sat || 3544 !options->lower_usub_sat || 3545 !options->lower_uniforms_to_ubo || 3546 !options->lower_vector_cmp || 3547 options->lower_fsqrt != lower_fsqrt || 3548 options->force_indirect_unrolling != no_indirects_mask || 3549 force_indirect_unrolling_sampler) { 3550 nir_shader_compiler_options *new_options = ralloc(s, nir_shader_compiler_options); 3551 *new_options = *s->options; 3552 3553 new_options->lower_extract_byte = true; 3554 new_options->lower_extract_word = true; 3555 new_options->lower_insert_byte = true; 3556 new_options->lower_insert_word = true; 3557 new_options->lower_fdph = true; 3558 new_options->lower_flrp64 = true; 3559 new_options->lower_fmod = true; 3560 new_options->lower_rotate = true; 3561 new_options->lower_uadd_sat = true; 3562 new_options->lower_usub_sat = true; 3563 new_options->lower_uniforms_to_ubo = true; 3564 new_options->lower_vector_cmp = true; 3565 new_options->lower_fsqrt = lower_fsqrt; 3566 new_options->force_indirect_unrolling = no_indirects_mask; 3567 new_options->force_indirect_unrolling_sampler = force_indirect_unrolling_sampler; 3568 3569 s->options = new_options; 3570 } 3571} 3572 3573static bool 3574ntt_lower_atomic_pre_dec_filter(const nir_instr *instr, const void *_data) 3575{ 3576 return (instr->type == nir_instr_type_intrinsic && 3577 nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_atomic_counter_pre_dec); 3578} 3579 3580static nir_ssa_def * 3581ntt_lower_atomic_pre_dec_lower(nir_builder *b, nir_instr *instr, void *_data) 3582{ 3583 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 3584 3585 nir_ssa_def *old_result = &intr->dest.ssa; 3586 intr->intrinsic = nir_intrinsic_atomic_counter_post_dec; 3587 3588 return nir_iadd_imm(b, old_result, -1); 3589} 3590 3591static bool 3592ntt_lower_atomic_pre_dec(nir_shader *s) 3593{ 3594 return nir_shader_lower_instructions(s, 3595 ntt_lower_atomic_pre_dec_filter, 3596 ntt_lower_atomic_pre_dec_lower, NULL); 3597} 3598 3599/* Lowers texture projectors if we can't do them as 
/* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. */
static void
nir_to_tgsi_lower_txp(nir_shader *s)
{
   nir_lower_tex_options lower_tex_options = {
      .lower_txp = 0,
   };

   nir_foreach_block(block, nir_shader_get_entrypoint(s)) {
      nir_foreach_instr(instr, block) {
         if (instr->type != nir_instr_type_tex)
            continue;
         nir_tex_instr *tex = nir_instr_as_tex(instr);

         if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0)
            continue;

         bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0;
         bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 ||
                        s->info.stage != MESA_SHADER_FRAGMENT;
         bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0;

         /* We can do TXP for any tex (not txg) where we can fit all the
          * coordinates and comparator and projector in one vec4 without any
          * other modifiers to add on.
          *
          * nir_lower_tex() only handles the lowering on a sampler-dim basis, so
          * if we get any funny projectors then we just blow them all away.
          */
         if (tex->op != nir_texop_tex || has_lod || has_offset ||
             (tex->coord_components >= 3 && has_compare))
            lower_tex_options.lower_txp |= 1 << tex->sampler_dim;
      }
   }

   /* nir_lower_tex must be run even if no options are set, because we need the
    * LOD to be set for query_levels and for non-fragment shaders.
    */
   NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
}

static bool
nir_lower_primid_sysval_to_input_filter(const nir_instr *instr, const void *_data)
{
   return (instr->type == nir_instr_type_intrinsic &&
           nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_primitive_id);
}

static nir_ssa_def *
nir_lower_primid_sysval_to_input_lower(nir_builder *b, nir_instr *instr, void *data)
{
   nir_variable *var = *(nir_variable **)data;
   if (!var) {
      var = nir_variable_create(b->shader, nir_var_shader_in,
                                glsl_uint_type(), "gl_PrimitiveID");
      var->data.location = VARYING_SLOT_PRIMITIVE_ID;
      b->shader->info.inputs_read |= VARYING_BIT_PRIMITIVE_ID;
      var->data.driver_location = b->shader->num_inputs++;

      *(nir_variable **)data = var;
   }

   nir_io_semantics semantics = {
      .location = var->data.location,
      .num_slots = 1
   };
   return nir_load_input(b, 1, 32, nir_imm_int(b, 0),
                         .base = var->data.driver_location,
                         .io_semantics = semantics);
}

static bool
nir_lower_primid_sysval_to_input(nir_shader *s)
{
   nir_variable *input = NULL;

   return nir_shader_lower_instructions(s,
                                        nir_lower_primid_sysval_to_input_filter,
                                        nir_lower_primid_sysval_to_input_lower, &input);
}

const void *
nir_to_tgsi(struct nir_shader *s,
            struct pipe_screen *screen)
{
   static const struct nir_to_tgsi_options default_ntt_options = {0};
   return nir_to_tgsi_options(s, screen, &default_ntt_options);
}
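/* Usage sketch (illustrative, not part of this file; "nir" and "pscreen" are
 * placeholder names): a driver that only consumes TGSI can translate an
 * incoming NIR shader while building its shader CSO, e.g.:
 *
 *    const struct tgsi_token *tokens = nir_to_tgsi(nir, pscreen);
 *
 * nir_to_tgsi() takes ownership of the NIR shader, so pass a clone
 * (nir_shader_clone(NULL, nir)) if the caller still needs it afterwards.
 */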
/* Prevent lower_vec_to_mov from coalescing 64-to-32 conversions and comparisons
 * into unsupported channels of registers.
 */
static bool
ntt_vec_to_mov_writemask_cb(const nir_instr *instr, unsigned writemask, UNUSED const void *_data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   int dst_32 = nir_dest_bit_size(alu->dest.dest) == 32;
   int src_64 = nir_src_bit_size(alu->src[0].src) == 64;

   if (src_64 && dst_32) {
      int num_srcs = nir_op_infos[alu->op].num_inputs;

      if (num_srcs == 2 || nir_op_infos[alu->op].output_type == nir_type_bool32) {
         /* TGSI's 64-bit compares storing to 32-bit are weird and write .xz
          * instead of .xy.  Just support scalar compares storing to .x;
          * GLSL-to-TGSI only ever emitted scalar ops anyway.
          */
         if (writemask != TGSI_WRITEMASK_X)
            return false;
      } else {
         /* TGSI's 64-to-32-bit conversions can only store to .xy (since a TGSI
          * register can only store a dvec2).  Don't try to coalesce to write to
          * .zw.
          */
         if (writemask & ~(TGSI_WRITEMASK_XY))
            return false;
      }
   }

   return true;
}

/**
 * Translates the NIR shader to TGSI.
 *
 * This requires some lowering of the NIR shader to prepare it for translation.
 * We take ownership of the NIR shader passed, returning a reference to the new
 * TGSI tokens instead.  If you need to keep the NIR, then pass us a clone.
 */
const void *nir_to_tgsi_options(struct nir_shader *s,
                                struct pipe_screen *screen,
                                const struct nir_to_tgsi_options *options)
{
   struct ntt_compile *c;
   const void *tgsi_tokens;
   nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
   bool native_integers = screen->get_shader_param(screen,
                                                   pipe_shader_type_from_mesa(s->info.stage),
                                                   PIPE_SHADER_CAP_INTEGERS);
   const struct nir_shader_compiler_options *original_options = s->options;

   ntt_fix_nir_options(screen, s, options);

   /* Lower array indexing on FS inputs.  Since we don't set
    * ureg->supports_any_inout_decl_range, the TGSI input decls will be split to
    * elements by ureg, and so dynamically indexing them would be invalid.
    * Ideally we would set that ureg flag based on
    * PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE, but can't due to mesa/st
    * splitting NIR VS outputs to elements even if the FS doesn't get the
    * corresponding splitting, and virgl depends on TGSI across link boundaries
    * having matching declarations.
    */
   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
      NIR_PASS_V(s, nir_remove_dead_variables, nir_var_shader_in, NULL);
   }

   NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
              type_size, (nir_lower_io_options)0);
   NIR_PASS_V(s, nir_lower_regs_to_ssa);

   nir_to_tgsi_lower_txp(s);
   NIR_PASS_V(s, nir_to_tgsi_lower_tex);

   /* While TGSI can represent PRIMID as either an input or a system value,
    * glsl-to-tgsi had the GS (not TCS or TES) primid as an input, and drivers
    * depend on that.
    */
   if (s->info.stage == MESA_SHADER_GEOMETRY)
      NIR_PASS_V(s, nir_lower_primid_sysval_to_input);

   if (s->info.num_abos)
      NIR_PASS_V(s, ntt_lower_atomic_pre_dec);

   if (!original_options->lower_uniforms_to_ubo) {
      NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
                 screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
                 !native_integers);
   }

   /* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
    * TGSI stores up to a vec2 in each slot, so to avoid a whole bunch of op
    * duplication logic we just make it so that we only see vec2s.
    */
   NIR_PASS_V(s, nir_lower_alu_to_scalar, scalarize_64bit, NULL);
   NIR_PASS_V(s, nir_to_tgsi_lower_64bit_to_vec2);

   if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
      NIR_PASS_V(s, nir_lower_ubo_vec4);

   ntt_optimize_nir(s, screen);

   NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);

   bool progress;
   do {
      progress = false;
      NIR_PASS(progress, s, nir_opt_algebraic_late);
      if (progress) {
         NIR_PASS_V(s, nir_copy_prop);
         NIR_PASS_V(s, nir_opt_dce);
         NIR_PASS_V(s, nir_opt_cse);
      }
   } while (progress);

   if (screen->get_shader_param(screen,
                                pipe_shader_type_from_mesa(s->info.stage),
                                PIPE_SHADER_CAP_INTEGERS)) {
      NIR_PASS_V(s, nir_lower_bool_to_int32);
   } else {
      NIR_PASS_V(s, nir_lower_int_to_float);
      NIR_PASS_V(s, nir_lower_bool_to_float);
      /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
      NIR_PASS_V(s, nir_copy_prop);
      NIR_PASS_V(s, nir_opt_dce);
   }

   nir_move_options move_all =
      nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
      nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;

   NIR_PASS_V(s, nir_opt_move, move_all);

   /* Only lower 32-bit floats.  The only other modifier type officially
    * supported by TGSI is 32-bit integer negates, but even those are broken on
    * virglrenderer, so skip lowering all integer and f64 float mods.
    *
    * The options->lower_fabs requests that we not have native source modifiers
    * for fabs, and instead emit MAX(a,-a) for nir_op_fabs.
    */
   nir_lower_to_source_mods_flags source_mods = nir_lower_fneg_source_mods;
   if (!options->lower_fabs)
      source_mods |= nir_lower_fabs_source_mods;
   NIR_PASS_V(s, nir_lower_to_source_mods, source_mods);

   NIR_PASS_V(s, nir_convert_from_ssa, true);
   NIR_PASS_V(s, nir_lower_vec_to_movs, ntt_vec_to_mov_writemask_cb, NULL);

   /* locals_to_regs will leave dead derefs that are good to clean up. */
   NIR_PASS_V(s, nir_lower_locals_to_regs);
   NIR_PASS_V(s, nir_opt_dce);

   if (NIR_DEBUG(TGSI)) {
      fprintf(stderr, "NIR before translation to TGSI:\n");
      nir_print_shader(s, stderr);
   }

   c = rzalloc(NULL, struct ntt_compile);
   c->screen = screen;
   c->options = options;

   c->needs_texcoord_semantic =
      screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
   c->has_txf_lz =
      screen->get_param(screen, PIPE_CAP_TGSI_TEX_TXF_LZ);

   c->s = s;
   c->native_integers = native_integers;
   c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
   ureg_setup_shader_info(c->ureg, &s->info);
   if (s->info.use_legacy_math_rules &&
       screen->get_param(screen, PIPE_CAP_LEGACY_MATH_RULES))
      ureg_property(c->ureg, TGSI_PROPERTY_LEGACY_MATH_RULES, 1);

   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      /* The draw module's polygon stipple layer doesn't respect the chosen
       * coordinate mode, so leave it as unspecified unless we're actually
       * reading the position in the shader already.  See
       * gl-2.1-polygon-stipple-fs on softpipe.
       */
      if ((s->info.inputs_read & VARYING_BIT_POS) ||
          BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
                       s->info.fs.origin_upper_left ?
                       TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
                       TGSI_FS_COORD_ORIGIN_LOWER_LEFT);

         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
                       s->info.fs.pixel_center_integer ?
                       TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
                       TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
      }
   }
   /* Emit the main function */
   nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
   ntt_emit_impl(c, impl);
   ureg_END(c->ureg);

   tgsi_tokens = ureg_get_tokens(c->ureg, NULL);

   if (NIR_DEBUG(TGSI)) {
      fprintf(stderr, "TGSI after translation from NIR:\n");
      tgsi_dump(tgsi_tokens, 0);
   }

   ureg_destroy(c->ureg);

   ralloc_free(c);
   ralloc_free(s);

   return tgsi_tokens;
}

static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
   .fdot_replicates = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   .lower_extract_byte = true,
   .lower_extract_word = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .lower_fdph = true,
   .lower_flrp64 = true,
   .lower_fmod = true,
   .lower_rotate = true,
   .lower_uniforms_to_ubo = true,
   .lower_uadd_sat = true,
   .lower_usub_sat = true,
   .lower_vector_cmp = true,
   .lower_int64_options = nir_lower_imul_2x32_64,
   .use_interpolated_input_intrinsics = true,
};

/* Returns default compiler options for drivers with only nir-to-tgsi-based
 * NIR support.
 */
const void *
nir_to_tgsi_get_compiler_options(struct pipe_screen *pscreen,
                                 enum pipe_shader_ir ir,
                                 unsigned shader)
{
   assert(ir == PIPE_SHADER_IR_NIR);
   return &nir_to_tgsi_compiler_options;
}
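/* Wiring sketch (illustrative): a driver relying on nir-to-tgsi would
 * typically return these options from its pipe_screen::get_compiler_options
 * hook (possibly through a small wrapper to match that hook's exact
 * signature), so that state trackers generate NIR matching the lowering
 * done here.
 */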
/** Helper for getting TGSI tokens to store for a pipe_shader_state CSO. */
const void *
pipe_shader_state_to_tgsi_tokens(struct pipe_screen *screen,
                                 const struct pipe_shader_state *cso)
{
   if (cso->type == PIPE_SHADER_IR_NIR) {
      return nir_to_tgsi((nir_shader *)cso->ir.nir, screen);
   } else {
      assert(cso->type == PIPE_SHADER_IR_TGSI);
      /* we need to keep a local copy of the tokens */
      return tgsi_dup_tokens(cso->tokens);
   }
}