/*
 * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include <stdarg.h>

#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/u_string.h"

#include "ir3_compiler.h"
#include "ir3_image.h"
#include "ir3_nir.h"
#include "ir3_shader.h"

#include "instr-a3xx.h"
#include "ir3.h"
#include "ir3_context.h"

void
ir3_handle_nonuniform(struct ir3_instruction *instr,
                      nir_intrinsic_instr *intrin)
{
   if (nir_intrinsic_has_access(intrin) &&
       (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)) {
      instr->flags |= IR3_INSTR_NONUNIF;
   }
}

void
ir3_handle_bindless_cat6(struct ir3_instruction *instr, nir_src rsrc)
{
   nir_intrinsic_instr *intrin = ir3_bindless_resource(rsrc);
   if (!intrin)
      return;

   instr->flags |= IR3_INSTR_B;
   instr->cat6.base = nir_intrinsic_desc_set(intrin);
}

static struct ir3_instruction *
create_input(struct ir3_context *ctx, unsigned compmask)
{
   struct ir3_instruction *in;

   in = ir3_instr_create(ctx->in_block, OPC_META_INPUT, 1, 0);
   in->input.sysval = ~0;
   __ssa_dst(in)->wrmask = compmask;

   array_insert(ctx->ir, ctx->ir->inputs, in);

   return in;
}

static struct ir3_instruction *
create_frag_input(struct ir3_context *ctx, struct ir3_instruction *coord,
                  unsigned n)
{
   struct ir3_block *block = ctx->block;
   struct ir3_instruction *instr;
   /* packed inloc is fixed up later: */
   struct ir3_instruction *inloc = create_immed(block, n);

   if (coord) {
      instr = ir3_BARY_F(block, inloc, 0, coord, 0);
   } else if (ctx->compiler->flat_bypass) {
      if (ctx->compiler->gen >= 6) {
         instr = ir3_FLAT_B(block, inloc, 0, inloc, 0);
      } else {
         instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
         instr->cat6.type = TYPE_U32;
         instr->cat6.iim_val = 1;
      }
   } else {
      instr = ir3_BARY_F(block, inloc, 0, ctx->ij[IJ_PERSP_PIXEL], 0);
      instr->srcs[1]->wrmask = 0x3;
   }

   return instr;
}

static struct ir3_instruction *
create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp)
{
   /* first four vec4 sysval's reserved for UBOs: */
   /* NOTE: dp is in scalar, but there can be >4 dp components: */
   struct ir3_const_state *const_state = ir3_const_state(ctx->so);
   unsigned n = const_state->offsets.driver_param;
   unsigned r = regid(n + dp / 4, dp % 4);
   return create_uniform(ctx->block, r);
}

/*
 * Adreno's comparisons produce a 1 for true and 0 for false, in either 16 or
 * 32-bit registers. We use NIR's 1-bit integers to represent bools, and
 * trust that we will only see and/or/xor on those 1-bit values, so we can
 * safely store NIR i1s in a 32-bit reg while always containing either a 1 or
 * 0.
 */

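/* A small illustration of the convention above (not from the original
 * sources, just a sketch): a NIR sequence along the lines of
 *
 *    ssa_3 = ieq ssa_1, ssa_2      ; 1-bit bool
 *    ssa_4 = iand ssa_3, ssa_5
 *
 * maps to a cmps.s.eq that writes 0 or 1 into a full register, followed by
 * an and.b operating directly on those 0/1 values, so no extra
 * booleanization step is needed between the two.
 */
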
/*
 * alu/sfu instructions:
 */

static struct ir3_instruction *
create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
           unsigned src_bitsize, nir_op op)
{
   type_t src_type, dst_type;

   switch (op) {
   case nir_op_f2f32:
   case nir_op_f2f16_rtne:
   case nir_op_f2f16_rtz:
   case nir_op_f2f16:
   case nir_op_f2i32:
   case nir_op_f2i16:
   case nir_op_f2i8:
   case nir_op_f2u32:
   case nir_op_f2u16:
   case nir_op_f2u8:
      switch (src_bitsize) {
      case 32:
         src_type = TYPE_F32;
         break;
      case 16:
         src_type = TYPE_F16;
         break;
      default:
         ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
      }
      break;

   case nir_op_i2f32:
   case nir_op_i2f16:
   case nir_op_i2i32:
   case nir_op_i2i16:
   case nir_op_i2i8:
      switch (src_bitsize) {
      case 32:
         src_type = TYPE_S32;
         break;
      case 16:
         src_type = TYPE_S16;
         break;
      case 8:
         src_type = TYPE_S8;
         break;
      default:
         ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
      }
      break;

   case nir_op_u2f32:
   case nir_op_u2f16:
   case nir_op_u2u32:
   case nir_op_u2u16:
   case nir_op_u2u8:
      switch (src_bitsize) {
      case 32:
         src_type = TYPE_U32;
         break;
      case 16:
         src_type = TYPE_U16;
         break;
      case 8:
         src_type = TYPE_U8;
         break;
      default:
         ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
      }
      break;

   case nir_op_b2f16:
   case nir_op_b2f32:
   case nir_op_b2i8:
   case nir_op_b2i16:
   case nir_op_b2i32:
      src_type = ctx->compiler->bool_type;
      break;

   default:
      ir3_context_error(ctx, "invalid conversion op: %u", op);
   }

   switch (op) {
   case nir_op_f2f32:
   case nir_op_i2f32:
   case nir_op_u2f32:
   case nir_op_b2f32:
      dst_type = TYPE_F32;
      break;

   case nir_op_f2f16_rtne:
   case nir_op_f2f16_rtz:
   case nir_op_f2f16:
   case nir_op_i2f16:
   case nir_op_u2f16:
   case nir_op_b2f16:
      dst_type = TYPE_F16;
      break;

   case nir_op_f2i32:
   case nir_op_i2i32:
   case nir_op_b2i32:
      dst_type = TYPE_S32;
      break;

   case nir_op_f2i16:
   case nir_op_i2i16:
   case nir_op_b2i16:
      dst_type = TYPE_S16;
      break;

   case nir_op_f2i8:
   case nir_op_i2i8:
   case nir_op_b2i8:
      dst_type = TYPE_S8;
      break;

   case nir_op_f2u32:
   case nir_op_u2u32:
      dst_type = TYPE_U32;
      break;

   case nir_op_f2u16:
   case nir_op_u2u16:
      dst_type = TYPE_U16;
      break;

   case nir_op_f2u8:
   case nir_op_u2u8:
      dst_type = TYPE_U8;
      break;

   default:
      ir3_context_error(ctx, "invalid conversion op: %u", op);
   }

   if (src_type == dst_type)
      return src;

   struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, dst_type);

   if (op == nir_op_f2f16_rtne) {
      cov->cat1.round = ROUND_EVEN;
   } else if (op == nir_op_f2f16) {
      unsigned execution_mode = ctx->s->info.float_controls_execution_mode;
      nir_rounding_mode rounding_mode =
         nir_get_rounding_mode_from_float_controls(execution_mode,
                                                   nir_type_float16);
      if (rounding_mode == nir_rounding_mode_rtne)
         cov->cat1.round = ROUND_EVEN;
   }

   return cov;
}

/* For shift instructions NIR always has shift amount as 32 bit integer */
static struct ir3_instruction *
resize_shift_amount(struct ir3_context *ctx, struct ir3_instruction *src,
                    unsigned bs)
{
   if (bs != 16)
      return src;

   return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U16);
}

static void
emit_alu_dot_4x8_as_dp4acc(struct ir3_context *ctx, nir_alu_instr *alu,
                           struct ir3_instruction **dst,
                           struct ir3_instruction **src)
{
   struct ir3_instruction *accumulator = NULL;
   if (alu->op == nir_op_udot_4x8_uadd_sat) {
      accumulator = create_immed(ctx->block, 0);
   } else {
      accumulator = src[2];
   }

   dst[0] = ir3_DP4ACC(ctx->block, src[0], 0, src[1], 0, accumulator, 0);

   if (alu->op == nir_op_udot_4x8_uadd ||
       alu->op == nir_op_udot_4x8_uadd_sat) {
      dst[0]->cat3.signedness = IR3_SRC_UNSIGNED;
   } else {
      dst[0]->cat3.signedness = IR3_SRC_MIXED;
   }

   /* For some reason (sat) doesn't work in unsigned case so
    * we have to emulate it.
    */
   if (alu->op == nir_op_udot_4x8_uadd_sat) {
      dst[0] = ir3_ADD_U(ctx->block, dst[0], 0, src[2], 0);
      dst[0]->flags |= IR3_INSTR_SAT;
   } else if (alu->op == nir_op_sudot_4x8_iadd_sat) {
      dst[0]->flags |= IR3_INSTR_SAT;
   }
}

static void
emit_alu_dot_4x8_as_dp2acc(struct ir3_context *ctx, nir_alu_instr *alu,
                           struct ir3_instruction **dst,
                           struct ir3_instruction **src)
{
   int signedness;
   if (alu->op == nir_op_udot_4x8_uadd ||
       alu->op == nir_op_udot_4x8_uadd_sat) {
      signedness = IR3_SRC_UNSIGNED;
   } else {
      signedness = IR3_SRC_MIXED;
   }

   struct ir3_instruction *accumulator = NULL;
   if (alu->op == nir_op_udot_4x8_uadd_sat ||
       alu->op == nir_op_sudot_4x8_iadd_sat) {
      accumulator = create_immed(ctx->block, 0);
   } else {
      accumulator = src[2];
   }

   dst[0] = ir3_DP2ACC(ctx->block, src[0], 0, src[1], 0, accumulator, 0);
   dst[0]->cat3.packed = IR3_SRC_PACKED_LOW;
   dst[0]->cat3.signedness = signedness;

   dst[0] = ir3_DP2ACC(ctx->block, src[0], 0, src[1], 0, dst[0], 0);
   dst[0]->cat3.packed = IR3_SRC_PACKED_HIGH;
   dst[0]->cat3.signedness = signedness;

   if (alu->op == nir_op_udot_4x8_uadd_sat) {
      dst[0] = ir3_ADD_U(ctx->block, dst[0], 0, src[2], 0);
      dst[0]->flags |= IR3_INSTR_SAT;
   } else if (alu->op == nir_op_sudot_4x8_iadd_sat) {
      dst[0] = ir3_ADD_S(ctx->block, dst[0], 0, src[2], 0);
      dst[0]->flags |= IR3_INSTR_SAT;
   }
}

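/* Rough sketch of what the 4x8 dot-product ops above compute (assuming the
 * usual NIR definition, with src2 as the 32-bit accumulator):
 *
 *    dst = src2 + sum(src0.byte[i] * src1.byte[i], i = 0..3)
 *
 * dp4acc covers all four byte pairs in a single instruction, while dp2acc
 * only covers two, which is why the dp2acc path chains a PACKED_LOW pass
 * and a PACKED_HIGH pass with the first result feeding the second as the
 * accumulator.
 */
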
static void
emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
{
   const nir_op_info *info = &nir_op_infos[alu->op];
   struct ir3_instruction **dst, *src[info->num_inputs];
   unsigned bs[info->num_inputs]; /* bit size */
   struct ir3_block *b = ctx->block;
   unsigned dst_sz, wrmask;
   type_t dst_type = type_uint_size(nir_dest_bit_size(alu->dest.dest));

   if (alu->dest.dest.is_ssa) {
      dst_sz = alu->dest.dest.ssa.num_components;
      wrmask = (1 << dst_sz) - 1;
   } else {
      dst_sz = alu->dest.dest.reg.reg->num_components;
      wrmask = alu->dest.write_mask;
   }

   dst = ir3_get_dst(ctx, &alu->dest.dest, dst_sz);

   /* Vectors are special in that they have non-scalarized writemasks,
    * and just take the first swizzle channel for each argument in
    * order into each writemask channel.
    */
   if ((alu->op == nir_op_vec2) || (alu->op == nir_op_vec3) ||
       (alu->op == nir_op_vec4) || (alu->op == nir_op_vec8) ||
       (alu->op == nir_op_vec16)) {

      for (int i = 0; i < info->num_inputs; i++) {
         nir_alu_src *asrc = &alu->src[i];

         compile_assert(ctx, !asrc->abs);
         compile_assert(ctx, !asrc->negate);

         src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[0]];
         if (!src[i])
            src[i] = create_immed_typed(ctx->block, 0, dst_type);
         dst[i] = ir3_MOV(b, src[i], dst_type);
      }

      ir3_put_dst(ctx, &alu->dest.dest);
      return;
   }

   /* We also get mov's with more than one component, so handle
    * those specially:
    */
   if (alu->op == nir_op_mov) {
      nir_alu_src *asrc = &alu->src[0];
      struct ir3_instruction *const *src0 = ir3_get_src(ctx, &asrc->src);

      for (unsigned i = 0; i < dst_sz; i++) {
         if (wrmask & (1 << i)) {
            dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], dst_type);
         } else {
            dst[i] = NULL;
         }
      }

      ir3_put_dst(ctx, &alu->dest.dest);
      return;
   }

   /* General case: We can just grab the one used channel per src. */
   for (int i = 0; i < info->num_inputs; i++) {
      unsigned chan = ffs(alu->dest.write_mask) - 1;
      nir_alu_src *asrc = &alu->src[i];

      compile_assert(ctx, !asrc->abs);
      compile_assert(ctx, !asrc->negate);

      src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
      bs[i] = nir_src_bit_size(asrc->src);

      compile_assert(ctx, src[i]);
   }

   switch (alu->op) {
   case nir_op_f2f32:
   case nir_op_f2f16_rtne:
   case nir_op_f2f16_rtz:
   case nir_op_f2f16:
   case nir_op_f2i32:
   case nir_op_f2i16:
   case nir_op_f2i8:
   case nir_op_f2u32:
   case nir_op_f2u16:
   case nir_op_f2u8:
   case nir_op_i2f32:
   case nir_op_i2f16:
   case nir_op_i2i32:
   case nir_op_i2i16:
   case nir_op_i2i8:
   case nir_op_u2f32:
   case nir_op_u2f16:
   case nir_op_u2u32:
   case nir_op_u2u16:
   case nir_op_u2u8:
   case nir_op_b2f16:
   case nir_op_b2f32:
   case nir_op_b2i8:
   case nir_op_b2i16:
   case nir_op_b2i32:
      dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
      break;

   case nir_op_fquantize2f16:
      dst[0] = create_cov(ctx, create_cov(ctx, src[0], 32, nir_op_f2f16_rtne),
                          16, nir_op_f2f32);
      break;
   case nir_op_f2b1:
      dst[0] = ir3_CMPS_F(
         b, src[0], 0,
         create_immed_typed(b, 0, type_float_size(bs[0])), 0);
      dst[0]->cat2.condition = IR3_COND_NE;
      break;

   case nir_op_i2b1:
      /* i2b1 will appear when translating from nir_intrinsic_load_ubo or
       * nir_intrinsic_load_ssbo, where any non-zero value is true.
       */
      dst[0] = ir3_CMPS_S(
         b, src[0], 0,
         create_immed_typed(b, 0, type_uint_size(bs[0])), 0);
      dst[0]->cat2.condition = IR3_COND_NE;
      break;

   case nir_op_b2b1:
      /* b2b1 will appear when translating from
       *
       * - nir_intrinsic_load_shared of a 32-bit 0/~0 value.
       * - nir_intrinsic_load_constant of a 32-bit 0/~0 value
       *
       * A negate can turn those into a 1 or 0 for us.
       */
      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
      break;

   case nir_op_b2b32:
      /* b2b32 will appear when converting our 1-bit bools to a store_shared
       * argument.
       *
       * A negate can turn those into a ~0 for us.
       */
      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
      break;

   case nir_op_fneg:
      dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
      break;
   case nir_op_fabs:
      dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
      break;
   case nir_op_fmax:
      dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
      break;
   case nir_op_fmin:
      dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
      break;
   case nir_op_fsat:
      /* if there is just a single use of the src, and it supports
       * (sat) bit, we can just fold the (sat) flag back to the
       * src instruction and create a mov.  This is easier for cp
       * to eliminate.
       */
      if (alu->src[0].src.is_ssa && is_sat_compatible(src[0]->opc) &&
          (list_length(&alu->src[0].src.ssa->uses) == 1)) {
         src[0]->flags |= IR3_INSTR_SAT;
         dst[0] = ir3_MOV(b, src[0], dst_type);
      } else {
         /* otherwise generate a max.f that saturates.. blob does
          * similar (generating a cat2 mov using max.f)
          */
         dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0);
         dst[0]->flags |= IR3_INSTR_SAT;
      }
      break;
   case nir_op_fmul:
      dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
      break;
   case nir_op_fadd:
      dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
      break;
   case nir_op_fsub:
      dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
      break;
   case nir_op_ffma:
      dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
      break;
   case nir_op_fddx:
   case nir_op_fddx_coarse:
      dst[0] = ir3_DSX(b, src[0], 0);
      dst[0]->cat5.type = TYPE_F32;
      break;
   case nir_op_fddx_fine:
      dst[0] = ir3_DSXPP_MACRO(b, src[0], 0);
      dst[0]->cat5.type = TYPE_F32;
      break;
   case nir_op_fddy:
   case nir_op_fddy_coarse:
      dst[0] = ir3_DSY(b, src[0], 0);
      dst[0]->cat5.type = TYPE_F32;
      break;
   case nir_op_fddy_fine:
      dst[0] = ir3_DSYPP_MACRO(b, src[0], 0);
      dst[0]->cat5.type = TYPE_F32;
      break;
   case nir_op_flt:
      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_LT;
      break;
   case nir_op_fge:
      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_GE;
      break;
   case nir_op_feq:
      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_EQ;
      break;
   case nir_op_fneu:
      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_NE;
      break;
   case nir_op_fceil:
      dst[0] = ir3_CEIL_F(b, src[0], 0);
      break;
   case nir_op_ffloor:
      dst[0] = ir3_FLOOR_F(b, src[0], 0);
      break;
   case nir_op_ftrunc:
      dst[0] = ir3_TRUNC_F(b, src[0], 0);
      break;
   case nir_op_fround_even:
      dst[0] = ir3_RNDNE_F(b, src[0], 0);
      break;
   case nir_op_fsign:
      dst[0] = ir3_SIGN_F(b, src[0], 0);
      break;

   case nir_op_fsin:
      dst[0] = ir3_SIN(b, src[0], 0);
      break;
   case nir_op_fcos:
      dst[0] = ir3_COS(b, src[0], 0);
      break;
   case nir_op_frsq:
      dst[0] = ir3_RSQ(b, src[0], 0);
      break;
   case nir_op_frcp:
      dst[0] = ir3_RCP(b, src[0], 0);
      break;
   case nir_op_flog2:
      dst[0] = ir3_LOG2(b, src[0], 0);
      break;
   case nir_op_fexp2:
      dst[0] = ir3_EXP2(b, src[0], 0);
      break;
   case nir_op_fsqrt:
      dst[0] = ir3_SQRT(b, src[0], 0);
      break;

   case nir_op_iabs:
      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
      break;
   case nir_op_iadd:
      dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
      break;
   case nir_op_ihadd:
      dst[0] = ir3_ADD_S(b, src[0], 0, src[1], 0);
      dst[0]->dsts[0]->flags |= IR3_REG_EI;
      break;
   case nir_op_uhadd:
      dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
      dst[0]->dsts[0]->flags |= IR3_REG_EI;
      break;
   case nir_op_iand:
      dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
      break;
   case nir_op_imax:
      dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
      break;
   case nir_op_umax:
      dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
      break;
   case nir_op_imin:
      dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
      break;
   case nir_op_umin:
      dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
      break;
   case nir_op_umul_low:
      dst[0] = ir3_MULL_U(b, src[0], 0, src[1], 0);
      break;
   case nir_op_imadsh_mix16:
      dst[0] = ir3_MADSH_M16(b, src[0], 0, src[1], 0, src[2], 0);
      break;
   case nir_op_imad24_ir3:
      dst[0] = ir3_MAD_S24(b, src[0], 0, src[1], 0, src[2], 0);
      break;
   case nir_op_imul:
      compile_assert(ctx, nir_dest_bit_size(alu->dest.dest) == 16);
      dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
      break;
   case nir_op_imul24:
      dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
      break;
   case nir_op_ineg:
      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
      break;
   case nir_op_inot:
      if (bs[0] == 1) {
         struct ir3_instruction *one =
            create_immed_typed(ctx->block, 1, ctx->compiler->bool_type);
         dst[0] = ir3_SUB_U(b, one, 0, src[0], 0);
      } else {
         dst[0] = ir3_NOT_B(b, src[0], 0);
      }
      break;
   case nir_op_ior:
      dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
      break;
   case nir_op_ishl:
      dst[0] =
         ir3_SHL_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
      break;
   case nir_op_ishr:
      dst[0] =
         ir3_ASHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
      break;
   case nir_op_isub:
      dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
      break;
   case nir_op_ixor:
      dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
      break;
   case nir_op_ushr:
      dst[0] =
         ir3_SHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
      break;
   case nir_op_ilt:
      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_LT;
      break;
   case nir_op_ige:
      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_GE;
      break;
   case nir_op_ieq:
      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_EQ;
      break;
   case nir_op_ine:
      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_NE;
      break;
   case nir_op_ult:
      dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_LT;
      break;
   case nir_op_uge:
      dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
      dst[0]->cat2.condition = IR3_COND_GE;
      break;

   case nir_op_bcsel: {
      struct ir3_instruction *cond = src[0];

      /* If src[0] is a negation (likely as a result of an ir3_b2n(cond)),
       * we can ignore that and use original cond, since the nonzero-ness of
       * cond stays the same.
       */
      if (cond->opc == OPC_ABSNEG_S && cond->flags == 0 &&
          (cond->srcs[0]->flags & (IR3_REG_SNEG | IR3_REG_SABS)) ==
             IR3_REG_SNEG) {
         cond = cond->srcs[0]->def->instr;
      }

      compile_assert(ctx, bs[1] == bs[2]);

      /* The condition's size has to match the other two arguments' size, so
       * convert down if necessary.
       *
       * Single hashtable is fine, because the conversion will either be
       * 16->32 or 32->16, but never both
       */
      if (is_half(src[1]) != is_half(cond)) {
         struct hash_entry *prev_entry =
            _mesa_hash_table_search(ctx->sel_cond_conversions, src[0]);
         if (prev_entry) {
            cond = prev_entry->data;
         } else {
            if (is_half(cond)) {
               cond = ir3_COV(b, cond, TYPE_U16, TYPE_U32);
            } else {
               cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
            }
            _mesa_hash_table_insert(ctx->sel_cond_conversions, src[0], cond);
         }
      }

      if (is_half(src[1])) {
         dst[0] = ir3_SEL_B16(b, src[1], 0, cond, 0, src[2], 0);
      } else {
         dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
      }

      break;
   }
   case nir_op_bit_count: {
      if (ctx->compiler->gen < 5 || (src[0]->dsts[0]->flags & IR3_REG_HALF)) {
         dst[0] = ir3_CBITS_B(b, src[0], 0);
         break;
      }

      // We need to do this 16b at a time on a5xx+a6xx.  Once half-precision
      // support is in place, this should probably move to a NIR lowering pass:
      struct ir3_instruction *hi, *lo;

      hi = ir3_COV(b, ir3_SHR_B(b, src[0], 0, create_immed(b, 16), 0), TYPE_U32,
                   TYPE_U16);
      lo = ir3_COV(b, src[0], TYPE_U32, TYPE_U16);

      hi = ir3_CBITS_B(b, hi, 0);
      lo = ir3_CBITS_B(b, lo, 0);

      // TODO maybe the builders should default to making dst half-precision
      // if the src's were half precision, to make this less awkward.. otoh
      // we should probably just do this lowering in NIR.
      hi->dsts[0]->flags |= IR3_REG_HALF;
      lo->dsts[0]->flags |= IR3_REG_HALF;

      dst[0] = ir3_ADD_S(b, hi, 0, lo, 0);
      dst[0]->dsts[0]->flags |= IR3_REG_HALF;
      dst[0] = ir3_COV(b, dst[0], TYPE_U16, TYPE_U32);
      break;
   }
   case nir_op_ifind_msb: {
      struct ir3_instruction *cmp;
      dst[0] = ir3_CLZ_S(b, src[0], 0);
      cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
      cmp->cat2.condition = IR3_COND_GE;
      dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0),
                           0, cmp, 0, dst[0], 0);
      break;
   }
   case nir_op_ufind_msb:
      dst[0] = ir3_CLZ_B(b, src[0], 0);
      dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0),
                           0, src[0], 0, dst[0], 0);
      break;
   case nir_op_find_lsb:
      dst[0] = ir3_BFREV_B(b, src[0], 0);
      dst[0] = ir3_CLZ_B(b, dst[0], 0);
      break;
   case nir_op_bitfield_reverse:
      dst[0] = ir3_BFREV_B(b, src[0], 0);
      break;

   case nir_op_uadd_sat:
      dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
      dst[0]->flags |= IR3_INSTR_SAT;
      break;
   case nir_op_iadd_sat:
      dst[0] = ir3_ADD_S(b, src[0], 0, src[1], 0);
      dst[0]->flags |= IR3_INSTR_SAT;
      break;
   case nir_op_usub_sat:
      dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
      dst[0]->flags |= IR3_INSTR_SAT;
      break;
   case nir_op_isub_sat:
      dst[0] = ir3_SUB_S(b, src[0], 0, src[1], 0);
      dst[0]->flags |= IR3_INSTR_SAT;
      break;

   case nir_op_udot_4x8_uadd:
   case nir_op_udot_4x8_uadd_sat:
   case nir_op_sudot_4x8_iadd:
   case nir_op_sudot_4x8_iadd_sat: {
      if (ctx->compiler->has_dp4acc) {
         emit_alu_dot_4x8_as_dp4acc(ctx, alu, dst, src);
      } else if (ctx->compiler->has_dp2acc) {
         emit_alu_dot_4x8_as_dp2acc(ctx, alu, dst, src);
      } else {
         ir3_context_error(ctx, "ALU op should have been lowered: %s\n",
                           nir_op_infos[alu->op].name);
      }

      break;
   }

   default:
      ir3_context_error(ctx, "Unhandled ALU op: %s\n",
                        nir_op_infos[alu->op].name);
      break;
   }

   if (nir_alu_type_get_base_type(info->output_type) == nir_type_bool) {
      assert(nir_dest_bit_size(alu->dest.dest) == 1 || alu->op == nir_op_b2b32);
      assert(dst_sz == 1);
   } else {
      /* 1-bit values stored in 32-bit registers are only valid for certain
       * ALU ops.
       */
      switch (alu->op) {
      case nir_op_iand:
      case nir_op_ior:
      case nir_op_ixor:
      case nir_op_inot:
      case nir_op_bcsel:
         break;
      default:
         compile_assert(ctx, nir_dest_bit_size(alu->dest.dest) != 1);
      }
   }

   ir3_put_dst(ctx, &alu->dest.dest);
}

static void
emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                            struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;

   /* This is only generated for us by nir_lower_ubo_vec4, which leaves base =
    * 0.
    */
   assert(nir_intrinsic_base(intr) == 0);

   unsigned ncomp = intr->num_components;
   struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
   struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
   struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
   ldc->dsts[0]->wrmask = MASK(ncomp);
   ldc->cat6.iim_val = ncomp;
   ldc->cat6.d = nir_intrinsic_component(intr);
   ldc->cat6.type = TYPE_U32;

   ir3_handle_bindless_cat6(ldc, intr->src[0]);
   if (ldc->flags & IR3_INSTR_B)
      ctx->so->bindless_ubo = true;
   ir3_handle_nonuniform(ldc, intr);

   ir3_split_dest(b, dst, ldc, 0, ncomp);
}

static void
emit_intrinsic_copy_ubo_to_uniform(struct ir3_context *ctx,
                                   nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;

   unsigned base = nir_intrinsic_base(intr);
   unsigned size = nir_intrinsic_range(intr);

   struct ir3_instruction *addr1 = ir3_get_addr1(ctx, base);

   struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
   struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
   struct ir3_instruction *ldc = ir3_LDC_K(b, idx, 0, offset, 0);
   ldc->cat6.iim_val = size;
   ldc->barrier_class = ldc->barrier_conflict = IR3_BARRIER_CONST_W;

   ir3_handle_bindless_cat6(ldc, intr->src[0]);
   if (ldc->flags & IR3_INSTR_B)
      ctx->so->bindless_ubo = true;

   ir3_instr_set_address(ldc, addr1);

   array_insert(b, b->keeps, ldc);
}

/* handles direct/indirect UBO reads: */
static void
emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                        struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
   unsigned ubo = regid(const_state->offsets.ubo, 0);
   const unsigned ptrsz = ir3_pointer_size(ctx->compiler);

   int off = 0;

   /* First src is ubo index, which could either be an immed or not: */
   src0 = ir3_get_src(ctx, &intr->src[0])[0];
   if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) {
      base_lo = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz));
      base_hi = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz) + 1);
   } else {
      base_lo = create_uniform_indirect(b, ubo, TYPE_U32,
                                        ir3_get_addr0(ctx, src0, ptrsz));
      base_hi = create_uniform_indirect(b, ubo + 1, TYPE_U32,
                                        ir3_get_addr0(ctx, src0, ptrsz));

      /* NOTE: since relative addressing is used, make sure constlen is
       * at least big enough to cover all the UBO addresses, since the
       * assembler won't know what the max address reg is.
       */
      ctx->so->constlen =
         MAX2(ctx->so->constlen,
              const_state->offsets.ubo + (ctx->s->info.num_ubos * ptrsz));
   }

   /* note: on 32bit gpu's base_hi is ignored and DCE'd */
   addr = base_lo;

   if (nir_src_is_const(intr->src[1])) {
      off += nir_src_as_uint(intr->src[1]);
   } else {
      /* For load_ubo_indirect, second src is indirect offset: */
      src1 = ir3_get_src(ctx, &intr->src[1])[0];

      /* and add offset to addr: */
      addr = ir3_ADD_S(b, addr, 0, src1, 0);
   }

   /* if offset is too large to encode in the ldg, split it out: */
   if ((off + (intr->num_components * 4)) > 1024) {
      /* split out the minimal amount to improve the odds that
       * cp can fit the immediate in the add.s instruction:
       */
      unsigned off2 = off + (intr->num_components * 4) - 1024;
      addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
      off -= off2;
   }

   if (ptrsz == 2) {
      struct ir3_instruction *carry;

      /* handle 32b rollover, ie:
       *   if (addr < base_lo)
       *      base_hi++
       */
      carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
      carry->cat2.condition = IR3_COND_LT;
      base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);

      addr = ir3_collect(b, addr, base_hi);
   }

   for (int i = 0; i < intr->num_components; i++) {
      struct ir3_instruction *load =
         ir3_LDG(b, addr, 0, create_immed(b, off + i * 4), 0,
                 create_immed(b, 1), 0); /* num components */
      load->cat6.type = TYPE_U32;
      dst[i] = load;
   }
}

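/* Worked example for the offset splitting above (values are illustrative):
 * for a vec4 load at constant byte offset 2052, off + 16 = 2068 exceeds the
 * 1024 limit checked above, so off2 = 1044 is added to the address with an
 * add.s and the remaining 1008 stays in the per-component ldg immediates
 * (1008, 1012, 1016, 1020).
 */
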
/* Load a kernel param: src[] = { address }. */
static void
emit_intrinsic_load_kernel_input(struct ir3_context *ctx,
                                 nir_intrinsic_instr *intr,
                                 struct ir3_instruction **dst)
{
   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
   struct ir3_block *b = ctx->block;
   unsigned offset = nir_intrinsic_base(intr);
   unsigned p = regid(const_state->offsets.kernel_params, 0);

   struct ir3_instruction *src0 = ir3_get_src(ctx, &intr->src[0])[0];

   if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) {
      offset += src0->srcs[0]->iim_val;

      /* kernel param position is in bytes, but constant space is 32b registers: */
      compile_assert(ctx, !(offset & 0x3));

      dst[0] = create_uniform(b, p + (offset / 4));
   } else {
      /* kernel param position is in bytes, but constant space is 32b registers: */
      compile_assert(ctx, !(offset & 0x3));

      /* TODO we should probably be lowering this in nir, and also handling
       * non-32b inputs.. Also we probably don't want to be using
       * SP_MODE_CONTROL.CONSTANT_DEMOTION_ENABLE for KERNEL shaders..
       */
      src0 = ir3_SHR_B(b, src0, 0, create_immed(b, 2), 0);

      dst[0] = create_uniform_indirect(b, offset / 4, TYPE_U32,
                                       ir3_get_addr0(ctx, src0, 1));
   }
}

/* src[] = { block_index } */
static void
emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                         struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
   struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);
   resinfo->cat6.iim_val = 1;
   resinfo->cat6.d = ctx->compiler->gen >= 6 ? 1 : 2;
   resinfo->cat6.type = TYPE_U32;
   resinfo->cat6.typed = false;
   /* resinfo has no writemask and always writes out 3 components */
   resinfo->dsts[0]->wrmask = MASK(3);
   ir3_handle_bindless_cat6(resinfo, intr->src[0]);
   ir3_handle_nonuniform(resinfo, intr);

   if (ctx->compiler->gen >= 6) {
      ir3_split_dest(b, dst, resinfo, 0, 1);
   } else {
      /* On a5xx, resinfo returns the low 16 bits of ssbo size in .x and the high 16 bits in .y */
      struct ir3_instruction *resinfo_dst[2];
      ir3_split_dest(b, resinfo_dst, resinfo, 0, 2);
      *dst = ir3_ADD_U(b, ir3_SHL_B(b, resinfo_dst[1], 0, create_immed(b, 16), 0), 0, resinfo_dst[0], 0);
   }
}

/* src[] = { offset }. const_index[] = { base } */
static void
emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                           struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *ldl, *offset;
   unsigned base;

   offset = ir3_get_src(ctx, &intr->src[0])[0];
   base = nir_intrinsic_base(intr);

   ldl = ir3_LDL(b, offset, 0, create_immed(b, base), 0,
                 create_immed(b, intr->num_components), 0);

   ldl->cat6.type = utype_dst(intr->dest);
   ldl->dsts[0]->wrmask = MASK(intr->num_components);

   ldl->barrier_class = IR3_BARRIER_SHARED_R;
   ldl->barrier_conflict = IR3_BARRIER_SHARED_W;

   ir3_split_dest(b, dst, ldl, 0, intr->num_components);
}

/* src[] = { value, offset }. const_index[] = { base, write_mask } */
static void
emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *stl, *offset;
   struct ir3_instruction *const *value;
   unsigned base, wrmask, ncomp;

   value = ir3_get_src(ctx, &intr->src[0]);
   offset = ir3_get_src(ctx, &intr->src[1])[0];

   base = nir_intrinsic_base(intr);
   wrmask = nir_intrinsic_write_mask(intr);
   ncomp = ffs(~wrmask) - 1;

   assert(wrmask == BITFIELD_MASK(intr->num_components));

   stl = ir3_STL(b, offset, 0, ir3_create_collect(b, value, ncomp), 0,
                 create_immed(b, ncomp), 0);
   stl->cat6.dst_offset = base;
   stl->cat6.type = utype_src(intr->src[0]);
   stl->barrier_class = IR3_BARRIER_SHARED_W;
   stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;

   array_insert(b, b->keeps, stl);
}

/* src[] = { offset }. const_index[] = { base } */
static void
emit_intrinsic_load_shared_ir3(struct ir3_context *ctx,
                               nir_intrinsic_instr *intr,
                               struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *load, *offset;
   unsigned base;

   offset = ir3_get_src(ctx, &intr->src[0])[0];
   base = nir_intrinsic_base(intr);

   load = ir3_LDLW(b, offset, 0, create_immed(b, base), 0,
                   create_immed(b, intr->num_components), 0);

   /* for a650, use LDL for tess ctrl inputs: */
   if (ctx->so->type == MESA_SHADER_TESS_CTRL && ctx->compiler->tess_use_shared)
      load->opc = OPC_LDL;

   load->cat6.type = utype_dst(intr->dest);
   load->dsts[0]->wrmask = MASK(intr->num_components);

   load->barrier_class = IR3_BARRIER_SHARED_R;
   load->barrier_conflict = IR3_BARRIER_SHARED_W;

   ir3_split_dest(b, dst, load, 0, intr->num_components);
}

/* src[] = { value, offset }. const_index[] = { base } */
static void
emit_intrinsic_store_shared_ir3(struct ir3_context *ctx,
                                nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *store, *offset;
   struct ir3_instruction *const *value;

   value = ir3_get_src(ctx, &intr->src[0]);
   offset = ir3_get_src(ctx, &intr->src[1])[0];

   store = ir3_STLW(b, offset, 0,
                    ir3_create_collect(b, value, intr->num_components), 0,
                    create_immed(b, intr->num_components), 0);

   /* for a650, use STL for vertex outputs used by tess ctrl shader: */
   if (ctx->so->type == MESA_SHADER_VERTEX && ctx->so->key.tessellation &&
       ctx->compiler->tess_use_shared)
      store->opc = OPC_STL;

   store->cat6.dst_offset = nir_intrinsic_base(intr);
   store->cat6.type = utype_src(intr->src[0]);
   store->barrier_class = IR3_BARRIER_SHARED_W;
   store->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;

   array_insert(b, b->keeps, store);
}

/*
 * CS shared variable atomic intrinsics
 *
 * All of the shared variable atomic memory operations read a value from
 * memory, compute a new value using one of the operations below, write the
 * new value to memory, and return the original value read.
 *
 * All operations take 2 sources except CompSwap that takes 3. These
 * sources represent:
 *
 * 0: The offset into the shared variable storage region that the atomic
 *    operation will operate on.
 * 1: The data parameter to the atomic function (i.e. the value to add
 *    in shared_atomic_add, etc).
 * 2: For CompSwap only: the second data parameter.
 */
static struct ir3_instruction *
emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *atomic, *src0, *src1;
   type_t type = TYPE_U32;

   src0 = ir3_get_src(ctx, &intr->src[0])[0]; /* offset */
   src1 = ir3_get_src(ctx, &intr->src[1])[0]; /* value */

   switch (intr->intrinsic) {
   case nir_intrinsic_shared_atomic_add:
      atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
      break;
   case nir_intrinsic_shared_atomic_imin:
      atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
      type = TYPE_S32;
      break;
   case nir_intrinsic_shared_atomic_umin:
      atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
      break;
   case nir_intrinsic_shared_atomic_imax:
      atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
      type = TYPE_S32;
      break;
   case nir_intrinsic_shared_atomic_umax:
      atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
      break;
   case nir_intrinsic_shared_atomic_and:
      atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
      break;
   case nir_intrinsic_shared_atomic_or:
      atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
      break;
   case nir_intrinsic_shared_atomic_xor:
      atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
      break;
   case nir_intrinsic_shared_atomic_exchange:
      atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
      break;
   case nir_intrinsic_shared_atomic_comp_swap:
      /* for cmpxchg, src1 is [ui]vec2(data, compare): */
      src1 = ir3_collect(b, ir3_get_src(ctx, &intr->src[2])[0], src1);
      atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
      break;
   default:
      unreachable("boo");
   }

   atomic->cat6.iim_val = 1;
   atomic->cat6.d = 1;
   atomic->cat6.type = type;
   atomic->barrier_class = IR3_BARRIER_SHARED_W;
   atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;

   /* even if nothing consumes the result, we can't DCE the instruction: */
   array_insert(b, b->keeps, atomic);

   return atomic;
}

static void
stp_ldp_offset(struct ir3_context *ctx, nir_src *src,
               struct ir3_instruction **offset, int32_t *base)
{
   struct ir3_block *b = ctx->block;

   if (nir_src_is_const(*src)) {
      unsigned src_offset = nir_src_as_uint(*src);
      /* The base offset field is only 13 bits, and it's signed. Try to make the
       * offset constant whenever the original offsets are similar, to avoid
       * creating too many constants in the final shader.
       */
      *base = ((int32_t) src_offset << (32 - 13)) >> (32 - 13);
      uint32_t offset_val = src_offset - *base;
      *offset = create_immed(b, offset_val);
   } else {
      /* TODO: match on nir_iadd with a constant that fits */
      *base = 0;
      *offset = ir3_get_src(ctx, src)[0];
   }
}

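/* Worked example for the base/offset split above (numbers illustrative):
 * base is the sign-extended low 13 bits of the constant offset.  For
 * src_offset = 9000 (0x2328) that gives base = 808 and offset_val = 8192;
 * for src_offset = 5000 (0x1388) bit 12 is set, so base = -3192 and
 * offset_val is again 8192.  offset_val is always a multiple of 8192, so
 * many nearby offsets share the same immediate and fewer constants end up
 * in the final shader.
 */
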
/* src[] = { offset }. */
static void
emit_intrinsic_load_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                            struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *ldp, *offset;
   int32_t base;

   stp_ldp_offset(ctx, &intr->src[0], &offset, &base);

   ldp = ir3_LDP(b, offset, 0, create_immed(b, base), 0,
                 create_immed(b, intr->num_components), 0);

   ldp->cat6.type = utype_dst(intr->dest);
   ldp->dsts[0]->wrmask = MASK(intr->num_components);

   ldp->barrier_class = IR3_BARRIER_PRIVATE_R;
   ldp->barrier_conflict = IR3_BARRIER_PRIVATE_W;

   ir3_split_dest(b, dst, ldp, 0, intr->num_components);
}

/* src[] = { value, offset }. const_index[] = { write_mask } */
static void
emit_intrinsic_store_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *stp, *offset;
   struct ir3_instruction *const *value;
   unsigned wrmask, ncomp;
   int32_t base;

   value = ir3_get_src(ctx, &intr->src[0]);

   stp_ldp_offset(ctx, &intr->src[1], &offset, &base);

   wrmask = nir_intrinsic_write_mask(intr);
   ncomp = ffs(~wrmask) - 1;

   assert(wrmask == BITFIELD_MASK(intr->num_components));

   stp = ir3_STP(b, offset, 0, ir3_create_collect(b, value, ncomp), 0,
                 create_immed(b, ncomp), 0);
   stp->cat6.dst_offset = base;
   stp->cat6.type = utype_src(intr->src[0]);
   stp->barrier_class = IR3_BARRIER_PRIVATE_W;
   stp->barrier_conflict = IR3_BARRIER_PRIVATE_R | IR3_BARRIER_PRIVATE_W;

   array_insert(b, b->keeps, stp);
}

struct tex_src_info {
   /* For prefetch */
   unsigned tex_base, samp_base, tex_idx, samp_idx;
   /* For normal tex instructions */
   unsigned base, a1_val, flags;
   struct ir3_instruction *samp_tex;
};

/* TODO handle actual indirect/dynamic case.. which is going to be weird
 * to handle with the image_mapping table..
 */
static struct tex_src_info
get_image_ssbo_samp_tex_src(struct ir3_context *ctx, nir_src *src)
{
   struct ir3_block *b = ctx->block;
   struct tex_src_info info = {0};
   nir_intrinsic_instr *bindless_tex = ir3_bindless_resource(*src);

   if (bindless_tex) {
      /* Bindless case */
      ctx->so->bindless_tex = true;
      info.flags |= IR3_INSTR_B;

      /* Gather information required to determine which encoding to
       * choose as well as for prefetch.
       */
      info.tex_base = nir_intrinsic_desc_set(bindless_tex);
      bool tex_const = nir_src_is_const(bindless_tex->src[0]);
      if (tex_const)
         info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
      info.samp_idx = 0;

      /* Choose encoding. */
      if (tex_const && info.tex_idx < 256) {
         if (info.tex_idx < 16) {
            /* Everything fits within the instruction */
            info.base = info.tex_base;
         } else {
            info.base = info.tex_base;
            info.a1_val = info.tex_idx << 3;
            info.flags |= IR3_INSTR_A1EN;
         }
         info.samp_tex = NULL;
      } else {
         info.flags |= IR3_INSTR_S2EN;
         info.base = info.tex_base;

         /* Note: the indirect source is now a vec2 instead of hvec2 */
         struct ir3_instruction *texture, *sampler;

         texture = ir3_get_src(ctx, src)[0];
         sampler = create_immed(b, 0);
         info.samp_tex = ir3_collect(b, texture, sampler);
      }
   } else {
      info.flags |= IR3_INSTR_S2EN;
      unsigned slot = nir_src_as_uint(*src);
      unsigned tex_idx = ir3_image_to_tex(&ctx->so->image_mapping, slot);
      struct ir3_instruction *texture, *sampler;

      texture = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
      sampler = create_immed_typed(ctx->block, tex_idx, TYPE_U16);

      info.samp_tex = ir3_collect(b, sampler, texture);
   }

   return info;
}

static struct ir3_instruction *
emit_sam(struct ir3_context *ctx, opc_t opc, struct tex_src_info info,
         type_t type, unsigned wrmask, struct ir3_instruction *src0,
         struct ir3_instruction *src1)
{
   struct ir3_instruction *sam, *addr;
   if (info.flags & IR3_INSTR_A1EN) {
      addr = ir3_get_addr1(ctx, info.a1_val);
   }
   sam = ir3_SAM(ctx->block, opc, type, wrmask, info.flags, info.samp_tex, src0,
                 src1);
   if (info.flags & IR3_INSTR_A1EN) {
      ir3_instr_set_address(sam, addr);
   }
   if (info.flags & IR3_INSTR_B) {
      sam->cat5.tex_base = info.base;
      sam->cat5.samp = info.samp_idx;
      sam->cat5.tex = info.tex_idx;
   }
   return sam;
}

/* src[] = { deref, coord, sample_index }. const_index[] = {} */
static void
emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                          struct ir3_instruction **dst)
{
   /* If the image can be written, must use LDIB to retrieve data, rather than
    * through ISAM (which uses the texture cache and won't get previous writes).
    */
   if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER)) {
      ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
      return;
   }

   /* The sparse set of texture descriptors for non-coherent load_images means
    * we can't do indirection, so fall back to coherent load.
    */
   if (ctx->compiler->gen >= 5 &&
       !ir3_bindless_resource(intr->src[0]) &&
       !nir_src_is_const(intr->src[0])) {
      ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
      return;
   }

   struct ir3_block *b = ctx->block;
   struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0]);
   struct ir3_instruction *sam;
   struct ir3_instruction *const *src0 = ir3_get_src(ctx, &intr->src[1]);
   struct ir3_instruction *coords[4];
   unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
   type_t type = ir3_get_type_for_image_intrinsic(intr);

   info.flags |= flags;

   /* hw doesn't do 1d, so we treat it as 2d with height of 1, and patch up the
    * y coord. Note that the array index must come after the fake y coord.
    */
   enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr);
   if (dim == GLSL_SAMPLER_DIM_1D || dim == GLSL_SAMPLER_DIM_BUF) {
      coords[0] = src0[0];
      coords[1] = create_immed(b, 0);
      for (unsigned i = 1; i < ncoords; i++)
         coords[i + 1] = src0[i];
      ncoords++;
   } else {
      for (unsigned i = 0; i < ncoords; i++)
         coords[i] = src0[i];
   }

   sam = emit_sam(ctx, OPC_ISAM, info, type, 0b1111,
                  ir3_create_collect(b, coords, ncoords), NULL);

   ir3_handle_nonuniform(sam, intr);

   sam->barrier_class = IR3_BARRIER_IMAGE_R;
   sam->barrier_conflict = IR3_BARRIER_IMAGE_W;

   ir3_split_dest(b, dst, sam, 0, 4);
}

/* A4xx version of image_size, see ir3_a6xx.c for newer resinfo version. */
void
emit_intrinsic_image_size_tex(struct ir3_context *ctx,
                              nir_intrinsic_instr *intr,
                              struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0]);
   struct ir3_instruction *sam, *lod;
   unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
   type_t dst_type = nir_dest_bit_size(intr->dest) == 16 ? TYPE_U16 : TYPE_U32;

   info.flags |= flags;
   assert(nir_src_as_uint(intr->src[1]) == 0);
   lod = create_immed(b, 0);
   sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);

   /* Array size actually ends up in .w rather than .z. This doesn't
    * matter for miplevel 0, but for higher mips the value in z is
    * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
    * returned, which means that we have to add 1 to it for arrays for
    * a3xx.
    *
    * Note use a temporary dst and then copy, since the size of the dst
    * array that is passed in is based on nir's understanding of the
    * result size, not the hardware's
    */
   struct ir3_instruction *tmp[4];

   ir3_split_dest(b, tmp, sam, 0, 4);

   for (unsigned i = 0; i < ncoords; i++)
      dst[i] = tmp[i];

   if (flags & IR3_INSTR_A) {
      if (ctx->compiler->levels_add_one) {
         dst[ncoords - 1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0);
      } else {
         dst[ncoords - 1] = ir3_MOV(b, tmp[3], TYPE_U32);
      }
   }
}

/* src[] = { buffer_index, offset }. No const_index */
static void
emit_intrinsic_load_ssbo(struct ir3_context *ctx,
                         nir_intrinsic_instr *intr,
                         struct ir3_instruction **dst)
{
   /* Note: isam currently can't handle vectorized loads/stores */
   if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER) ||
       !ir3_bindless_resource(intr->src[0]) ||
       intr->dest.ssa.num_components > 1) {
      ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
      return;
   }

   struct ir3_block *b = ctx->block;
   struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[2])[0];
   struct ir3_instruction *coords = ir3_collect(b, offset, create_immed(b, 0));
   struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0]);

   unsigned num_components = intr->dest.ssa.num_components;
   struct ir3_instruction *sam =
      emit_sam(ctx, OPC_ISAM, info, utype_for_size(intr->dest.ssa.bit_size),
               MASK(num_components), coords, NULL);

   ir3_handle_nonuniform(sam, intr);

   sam->barrier_class = IR3_BARRIER_BUFFER_R;
   sam->barrier_conflict = IR3_BARRIER_BUFFER_W;

   ir3_split_dest(b, dst, sam, 0, num_components);
}

static void
emit_control_barrier(struct ir3_context *ctx)
{
   /* Hull shaders dispatch 32 wide so an entire patch will always
    * fit in a single warp and execute in lock-step. Consequently,
    * we don't need to do anything for TCS barriers. Emitting a
    * barrier instruction would deadlock.
    */
   if (ctx->so->type == MESA_SHADER_TESS_CTRL)
      return;

   struct ir3_block *b = ctx->block;
   struct ir3_instruction *barrier = ir3_BAR(b);
   barrier->cat7.g = true;
   if (ctx->compiler->gen < 6)
      barrier->cat7.l = true;
   barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
   barrier->barrier_class = IR3_BARRIER_EVERYTHING;
   array_insert(b, b->keeps, barrier);

   ctx->so->has_barrier = true;
}

static void
emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *barrier;

   /* TODO: find out why there is a major difference of .l usage
    * between a5xx and a6xx.
    */

   switch (intr->intrinsic) {
   case nir_intrinsic_control_barrier:
      emit_control_barrier(ctx);
      return;
   case nir_intrinsic_scoped_barrier: {
      nir_scope exec_scope = nir_intrinsic_execution_scope(intr);
      nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
      /* loads/stores are always cache-coherent so we can filter out
       * available/visible.
       */
      nir_memory_semantics semantics =
         nir_intrinsic_memory_semantics(intr) & (NIR_MEMORY_ACQUIRE |
                                                 NIR_MEMORY_RELEASE);

      if (ctx->so->type == MESA_SHADER_TESS_CTRL) {
         /* Remove mode corresponding to nir_intrinsic_memory_barrier_tcs_patch,
          * because hull shaders dispatch 32 wide so an entire patch will
          * always fit in a single warp and execute in lock-step.
          *
          * TODO: memory barrier also tells us not to reorder stores, this
          * information is lost here (backend doesn't reorder stores so we
          * are safe for now).
          */
         modes &= ~nir_var_shader_out;
      }

      assert(!(modes & nir_var_shader_out));

      if ((modes &
           (nir_var_mem_shared | nir_var_mem_ssbo | nir_var_mem_global |
            nir_var_image)) && semantics) {
         barrier = ir3_FENCE(b);
         barrier->cat7.r = true;
         barrier->cat7.w = true;

         if (modes & (nir_var_mem_ssbo | nir_var_image | nir_var_mem_global)) {
            barrier->cat7.g = true;
         }

         if (ctx->compiler->gen >= 6) {
            if (modes & (nir_var_mem_ssbo | nir_var_image)) {
               barrier->cat7.l = true;
            }
         } else {
            if (modes & (nir_var_mem_shared | nir_var_mem_ssbo | nir_var_image)) {
               barrier->cat7.l = true;
            }
         }

         barrier->barrier_class = 0;
         barrier->barrier_conflict = 0;

         if (modes & nir_var_mem_shared) {
            barrier->barrier_class |= IR3_BARRIER_SHARED_W;
            barrier->barrier_conflict |=
               IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
         }

         if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
            barrier->barrier_class |= IR3_BARRIER_BUFFER_W;
            barrier->barrier_conflict |=
               IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
         }

         if (modes & nir_var_image) {
            barrier->barrier_class |= IR3_BARRIER_IMAGE_W;
            barrier->barrier_conflict |=
               IR3_BARRIER_IMAGE_W | IR3_BARRIER_IMAGE_R;
         }
         array_insert(b, b->keeps, barrier);
      }

      if (exec_scope >= NIR_SCOPE_WORKGROUP) {
         emit_control_barrier(ctx);
      }

      return;
   }
   case nir_intrinsic_memory_barrier_tcs_patch:
      /* Not applicable, see explanation for scoped_barrier + shader_out */
      return;
   case nir_intrinsic_memory_barrier_buffer:
      barrier = ir3_FENCE(b);
      barrier->cat7.g = true;
      if (ctx->compiler->gen >= 6)
         barrier->cat7.l = true;
      barrier->cat7.r = true;
      barrier->cat7.w = true;
      barrier->barrier_class = IR3_BARRIER_BUFFER_W;
      barrier->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
      break;
   case nir_intrinsic_memory_barrier_image:
      barrier = ir3_FENCE(b);
      barrier->cat7.g = true;
      barrier->cat7.l = true;
      barrier->cat7.r = true;
      barrier->cat7.w = true;
      barrier->barrier_class = IR3_BARRIER_IMAGE_W;
      barrier->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
      break;
   case nir_intrinsic_memory_barrier_shared:
      barrier = ir3_FENCE(b);
      if (ctx->compiler->gen < 6)
         barrier->cat7.l = true;
      barrier->cat7.r = true;
      barrier->cat7.w = true;
      barrier->barrier_class = IR3_BARRIER_SHARED_W;
      barrier->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
      break;
   case nir_intrinsic_memory_barrier:
   case nir_intrinsic_group_memory_barrier:
      barrier = ir3_FENCE(b);
      barrier->cat7.g = true;
      barrier->cat7.l = true;
      barrier->cat7.r = true;
      barrier->cat7.w = true;
      barrier->barrier_class =
         IR3_BARRIER_SHARED_W | IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
      barrier->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W |
                                  IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
                                  IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
      break;
   default:
      unreachable("boo");
   }

   /* make sure barrier doesn't get DCE'd */
   array_insert(b, b->keeps, barrier);
}

static void
add_sysval_input_compmask(struct ir3_context *ctx, gl_system_value slot,
                          unsigned compmask, struct ir3_instruction *instr)
{
   struct ir3_shader_variant *so = ctx->so;
   unsigned n = so->inputs_count++;

   assert(instr->opc == OPC_META_INPUT);
   instr->input.inidx = n;
   instr->input.sysval = slot;

   so->inputs[n].sysval = true;
   so->inputs[n].slot = slot;
   so->inputs[n].compmask = compmask;
   so->total_in++;

   so->sysval_in += util_last_bit(compmask);
}

static struct ir3_instruction *
create_sysval_input(struct ir3_context *ctx, gl_system_value slot,
                    unsigned compmask)
{
   assert(compmask);
   struct ir3_instruction *sysval = create_input(ctx, compmask);
   add_sysval_input_compmask(ctx, slot, compmask, sysval);
   return sysval;
}

static struct ir3_instruction *
get_barycentric(struct ir3_context *ctx, enum ir3_bary bary)
{
   STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_PIXEL ==
                 SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL);
   STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_SAMPLE ==
                 SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE);
   STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_CENTROID ==
                 SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID);
   STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_CENTER_RHW ==
                 SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTER_RHW);
   STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_LINEAR_PIXEL ==
                 SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL);
   STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_LINEAR_CENTROID ==
                 SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID);
   STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_LINEAR_SAMPLE ==
                 SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE);

   if (!ctx->ij[bary]) {
      struct ir3_instruction *xy[2];
      struct ir3_instruction *ij;

      ij = create_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL +
                               bary, 0x3);
      ir3_split_dest(ctx->in_block, xy, ij, 0, 2);

      ctx->ij[bary] = ir3_create_collect(ctx->in_block, xy, 2);
   }

   return ctx->ij[bary];
}

/* TODO: make this a common NIR helper?
 * there is a nir_system_value_from_intrinsic but it takes nir_intrinsic_op so
 * it can't be extended to work with this
 */
static gl_system_value
nir_intrinsic_barycentric_sysval(nir_intrinsic_instr *intr)
{
   enum glsl_interp_mode interp_mode = nir_intrinsic_interp_mode(intr);
   gl_system_value sysval;

   switch (intr->intrinsic) {
   case nir_intrinsic_load_barycentric_pixel:
      if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
      else
         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
      break;
   case nir_intrinsic_load_barycentric_centroid:
      if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID;
      else
         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID;
      break;
   case nir_intrinsic_load_barycentric_sample:
      if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE;
      else
         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE;
      break;
   default:
      unreachable("invalid barycentric intrinsic");
   }

   return sysval;
}

static void
emit_intrinsic_barycentric(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                           struct ir3_instruction **dst)
{
   gl_system_value sysval = nir_intrinsic_barycentric_sysval(intr);

   if (!ctx->so->key.msaa) {
      switch (sysval) {
      case SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE:
         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
         break;
      case SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID:
         if (ctx->compiler->gen < 6)
            sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
         break;
      case SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE:
         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
         break;
      case SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID:
         if (ctx->compiler->gen < 6)
            sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
         break;
      default:
         break;
      }
   }

   enum ir3_bary bary = sysval - SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;

   struct ir3_instruction *ij = get_barycentric(ctx, bary);
   ir3_split_dest(ctx->block, dst, ij, 0, 2);
}

static struct ir3_instruction *
get_frag_coord(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   if (!ctx->frag_coord) {
      struct ir3_block *b = ir3_after_preamble(ctx->ir);
      struct ir3_instruction *xyzw[4];
      struct ir3_instruction *hw_frag_coord;

      hw_frag_coord = create_sysval_input(ctx, SYSTEM_VALUE_FRAG_COORD, 0xf);
      ir3_split_dest(b, xyzw, hw_frag_coord, 0, 4);

      /* for frag_coord.xy, we get unsigned values.. we need
       * to subtract (integer) 8 and divide by 16 (right-
       * shift by 4) then convert to float:
       *
       *    sub.s tmp, src, 8
       *    shr.b tmp, tmp, 4
       *    mov.u32f32 dst, tmp
       *
       */
      for (int i = 0; i < 2; i++) {
         xyzw[i] = ir3_COV(b, xyzw[i], TYPE_U32, TYPE_F32);
         xyzw[i] =
            ir3_MUL_F(b, xyzw[i], 0, create_immed(b, fui(1.0 / 16.0)), 0);
      }

      ctx->frag_coord = ir3_create_collect(b, xyzw, 4);
   }

   ctx->so->fragcoord_compmask |= nir_ssa_def_components_read(&intr->dest.ssa);

   return ctx->frag_coord;
}

/* This is a bit of a hack until ir3_context is converted to store SSA values
 * as ir3_register's instead of ir3_instruction's. Pick out a given destination
 * of an instruction with multiple destinations using a mov that will get folded
 * away by ir3_cp.
 */
1894 */ 1895static struct ir3_instruction * 1896create_multidst_mov(struct ir3_block *block, struct ir3_register *dst) 1897{ 1898 struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1); 1899 unsigned dst_flags = dst->flags & IR3_REG_HALF; 1900 unsigned src_flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED); 1901 1902 __ssa_dst(mov)->flags |= dst_flags; 1903 struct ir3_register *src = 1904 ir3_src_create(mov, INVALID_REG, IR3_REG_SSA | src_flags); 1905 src->wrmask = dst->wrmask; 1906 src->def = dst; 1907 assert(!(dst->flags & IR3_REG_RELATIV)); 1908 mov->cat1.src_type = mov->cat1.dst_type = 1909 (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; 1910 return mov; 1911} 1912 1913static reduce_op_t 1914get_reduce_op(nir_op opc) 1915{ 1916 switch (opc) { 1917 case nir_op_iadd: return REDUCE_OP_ADD_U; 1918 case nir_op_fadd: return REDUCE_OP_ADD_F; 1919 case nir_op_imul: return REDUCE_OP_MUL_U; 1920 case nir_op_fmul: return REDUCE_OP_MUL_F; 1921 case nir_op_umin: return REDUCE_OP_MIN_U; 1922 case nir_op_imin: return REDUCE_OP_MIN_S; 1923 case nir_op_fmin: return REDUCE_OP_MIN_F; 1924 case nir_op_umax: return REDUCE_OP_MAX_U; 1925 case nir_op_imax: return REDUCE_OP_MAX_S; 1926 case nir_op_fmax: return REDUCE_OP_MAX_F; 1927 case nir_op_iand: return REDUCE_OP_AND_B; 1928 case nir_op_ior: return REDUCE_OP_OR_B; 1929 case nir_op_ixor: return REDUCE_OP_XOR_B; 1930 default: 1931 unreachable("unknown NIR reduce op"); 1932 } 1933} 1934 1935static uint32_t 1936get_reduce_identity(nir_op opc, unsigned size) 1937{ 1938 switch (opc) { 1939 case nir_op_iadd: 1940 return 0; 1941 case nir_op_fadd: 1942 return size == 32 ? fui(0.0f) : _mesa_float_to_half(0.0f); 1943 case nir_op_imul: 1944 return 1; 1945 case nir_op_fmul: 1946 return size == 32 ? fui(1.0f) : _mesa_float_to_half(1.0f); 1947 case nir_op_umax: 1948 return 0; 1949 case nir_op_imax: 1950 return size == 32 ? INT32_MIN : (uint32_t)INT16_MIN; 1951 case nir_op_fmax: 1952 return size == 32 ? fui(-INFINITY) : _mesa_float_to_half(-INFINITY); 1953 case nir_op_umin: 1954 return size == 32 ? UINT32_MAX : UINT16_MAX; 1955 case nir_op_imin: 1956 return size == 32 ? INT32_MAX : (uint32_t)INT16_MAX; 1957 case nir_op_fmin: 1958 return size == 32 ? fui(INFINITY) : _mesa_float_to_half(INFINITY); 1959 case nir_op_iand: 1960 return size == 32 ? ~0 : (size == 16 ? (uint32_t)(uint16_t)~0 : 1); 1961 case nir_op_ior: 1962 return 0; 1963 case nir_op_ixor: 1964 return 0; 1965 default: 1966 unreachable("unknown NIR reduce op"); 1967 } 1968} 1969 1970static struct ir3_instruction * 1971emit_intrinsic_reduce(struct ir3_context *ctx, nir_intrinsic_instr *intr) 1972{ 1973 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0]; 1974 nir_op nir_reduce_op = (nir_op) nir_intrinsic_reduction_op(intr); 1975 reduce_op_t reduce_op = get_reduce_op(nir_reduce_op); 1976 unsigned dst_size = nir_dest_bit_size(intr->dest); 1977 unsigned flags = (ir3_bitsize(ctx, dst_size) == 16) ? IR3_REG_HALF : 0; 1978 1979 /* Note: the shared reg is initialized to the identity, so we need it to 1980 * always be 32-bit even when the source isn't because half shared regs are 1981 * not supported. 
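    * (The identity value itself comes from get_reduce_identity() above,
    * e.g. 0 for iadd and ~0 for umin, so the first lane folded in leaves
    * the running result unchanged.)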
1982 */ 1983 struct ir3_instruction *identity = 1984 create_immed(ctx->block, get_reduce_identity(nir_reduce_op, dst_size)); 1985 identity = ir3_READ_FIRST_MACRO(ctx->block, identity, 0); 1986 identity->dsts[0]->flags |= IR3_REG_SHARED; 1987 1988 /* OPC_SCAN_MACRO has the following destinations: 1989 * - Exclusive scan result (interferes with source) 1990 * - Inclusive scan result 1991 * - Shared reg reduction result, must be initialized to the identity 1992 * 1993 * The loop computes all three results at the same time, we just have to 1994 * choose which destination to return. 1995 */ 1996 struct ir3_instruction *scan = 1997 ir3_instr_create(ctx->block, OPC_SCAN_MACRO, 3, 2); 1998 scan->cat1.reduce_op = reduce_op; 1999 2000 struct ir3_register *exclusive = __ssa_dst(scan); 2001 exclusive->flags |= flags | IR3_REG_EARLY_CLOBBER; 2002 struct ir3_register *inclusive = __ssa_dst(scan); 2003 inclusive->flags |= flags; 2004 struct ir3_register *reduce = __ssa_dst(scan); 2005 reduce->flags |= IR3_REG_SHARED; 2006 2007 /* The 32-bit multiply macro reads its sources after writing a partial result 2008 * to the destination, therefore inclusive also interferes with the source. 2009 */ 2010 if (reduce_op == REDUCE_OP_MUL_U && dst_size == 32) 2011 inclusive->flags |= IR3_REG_EARLY_CLOBBER; 2012 2013 /* Normal source */ 2014 __ssa_src(scan, src, 0); 2015 2016 /* shared reg tied source */ 2017 struct ir3_register *reduce_init = __ssa_src(scan, identity, IR3_REG_SHARED); 2018 ir3_reg_tie(reduce, reduce_init); 2019 2020 struct ir3_register *dst; 2021 switch (intr->intrinsic) { 2022 case nir_intrinsic_reduce: dst = reduce; break; 2023 case nir_intrinsic_inclusive_scan: dst = inclusive; break; 2024 case nir_intrinsic_exclusive_scan: dst = exclusive; break; 2025 default: 2026 unreachable("unknown reduce intrinsic"); 2027 } 2028 2029 return create_multidst_mov(ctx->block, dst); 2030} 2031 2032static void setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr); 2033static void setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr); 2034 2035static void 2036emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) 2037{ 2038 const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic]; 2039 struct ir3_instruction **dst; 2040 struct ir3_instruction *const *src; 2041 struct ir3_block *b = ctx->block; 2042 unsigned dest_components = nir_intrinsic_dest_components(intr); 2043 int idx; 2044 2045 if (info->has_dest) { 2046 dst = ir3_get_dst(ctx, &intr->dest, dest_components); 2047 } else { 2048 dst = NULL; 2049 } 2050 2051 const struct ir3_const_state *const_state = ir3_const_state(ctx->so); 2052 const unsigned primitive_param = const_state->offsets.primitive_param * 4; 2053 const unsigned primitive_map = const_state->offsets.primitive_map * 4; 2054 2055 switch (intr->intrinsic) { 2056 case nir_intrinsic_load_uniform: 2057 idx = nir_intrinsic_base(intr); 2058 if (nir_src_is_const(intr->src[0])) { 2059 idx += nir_src_as_uint(intr->src[0]); 2060 for (int i = 0; i < dest_components; i++) { 2061 dst[i] = create_uniform_typed( 2062 b, idx + i, 2063 nir_dest_bit_size(intr->dest) == 16 ? TYPE_F16 : TYPE_F32); 2064 } 2065 } else { 2066 src = ir3_get_src(ctx, &intr->src[0]); 2067 for (int i = 0; i < dest_components; i++) { 2068 dst[i] = create_uniform_indirect( 2069 b, idx + i, 2070 nir_dest_bit_size(intr->dest) == 16 ? 
TYPE_F16 : TYPE_F32, 2071 ir3_get_addr0(ctx, src[0], 1)); 2072 } 2073 /* NOTE: if relative addressing is used, we set 2074 * constlen in the compiler (to worst-case value) 2075 * since we don't know in the assembler what the max 2076 * addr reg value can be: 2077 */ 2078 ctx->so->constlen = 2079 MAX2(ctx->so->constlen, 2080 ctx->so->num_reserved_user_consts + 2081 const_state->ubo_state.size / 16); 2082 } 2083 break; 2084 2085 case nir_intrinsic_load_vs_primitive_stride_ir3: 2086 dst[0] = create_uniform(b, primitive_param + 0); 2087 break; 2088 case nir_intrinsic_load_vs_vertex_stride_ir3: 2089 dst[0] = create_uniform(b, primitive_param + 1); 2090 break; 2091 case nir_intrinsic_load_hs_patch_stride_ir3: 2092 dst[0] = create_uniform(b, primitive_param + 2); 2093 break; 2094 case nir_intrinsic_load_patch_vertices_in: 2095 dst[0] = create_uniform(b, primitive_param + 3); 2096 break; 2097 case nir_intrinsic_load_tess_param_base_ir3: 2098 dst[0] = create_uniform(b, primitive_param + 4); 2099 dst[1] = create_uniform(b, primitive_param + 5); 2100 break; 2101 case nir_intrinsic_load_tess_factor_base_ir3: 2102 dst[0] = create_uniform(b, primitive_param + 6); 2103 dst[1] = create_uniform(b, primitive_param + 7); 2104 break; 2105 2106 case nir_intrinsic_load_primitive_location_ir3: 2107 idx = nir_intrinsic_driver_location(intr); 2108 dst[0] = create_uniform(b, primitive_map + idx); 2109 break; 2110 2111 case nir_intrinsic_load_gs_header_ir3: 2112 dst[0] = ctx->gs_header; 2113 break; 2114 case nir_intrinsic_load_tcs_header_ir3: 2115 dst[0] = ctx->tcs_header; 2116 break; 2117 2118 case nir_intrinsic_load_rel_patch_id_ir3: 2119 dst[0] = ctx->rel_patch_id; 2120 break; 2121 2122 case nir_intrinsic_load_primitive_id: 2123 if (!ctx->primitive_id) { 2124 ctx->primitive_id = 2125 create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1); 2126 } 2127 dst[0] = ctx->primitive_id; 2128 break; 2129 2130 case nir_intrinsic_load_tess_coord: 2131 if (!ctx->tess_coord) { 2132 ctx->tess_coord = 2133 create_sysval_input(ctx, SYSTEM_VALUE_TESS_COORD, 0x3); 2134 } 2135 ir3_split_dest(b, dst, ctx->tess_coord, 0, 2); 2136 2137 /* Unused, but ir3_put_dst() below wants to free something */ 2138 dst[2] = create_immed(b, 0); 2139 break; 2140 2141 case nir_intrinsic_end_patch_ir3: 2142 assert(ctx->so->type == MESA_SHADER_TESS_CTRL); 2143 struct ir3_instruction *end = ir3_PREDE(b); 2144 array_insert(b, b->keeps, end); 2145 2146 end->barrier_class = IR3_BARRIER_EVERYTHING; 2147 end->barrier_conflict = IR3_BARRIER_EVERYTHING; 2148 break; 2149 2150 case nir_intrinsic_store_global_ir3: 2151 ctx->funcs->emit_intrinsic_store_global_ir3(ctx, intr); 2152 break; 2153 case nir_intrinsic_load_global_ir3: 2154 ctx->funcs->emit_intrinsic_load_global_ir3(ctx, intr, dst); 2155 break; 2156 2157 case nir_intrinsic_load_ubo: 2158 emit_intrinsic_load_ubo(ctx, intr, dst); 2159 break; 2160 case nir_intrinsic_load_ubo_vec4: 2161 emit_intrinsic_load_ubo_ldc(ctx, intr, dst); 2162 break; 2163 case nir_intrinsic_copy_ubo_to_uniform_ir3: 2164 emit_intrinsic_copy_ubo_to_uniform(ctx, intr); 2165 break; 2166 case nir_intrinsic_load_frag_coord: 2167 ir3_split_dest(b, dst, get_frag_coord(ctx, intr), 0, 4); 2168 break; 2169 case nir_intrinsic_load_sample_pos_from_id: { 2170 /* NOTE: blob seems to always use TYPE_F16 and then cov.f16f32, 2171 * but that doesn't seem necessary. 
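       * Emitting the rgetpos result directly as TYPE_F32 (as done below)
       * avoids that extra conversion.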
2172 */ 2173 struct ir3_instruction *offset = 2174 ir3_RGETPOS(b, ir3_get_src(ctx, &intr->src[0])[0], 0); 2175 offset->dsts[0]->wrmask = 0x3; 2176 offset->cat5.type = TYPE_F32; 2177 2178 ir3_split_dest(b, dst, offset, 0, 2); 2179 2180 break; 2181 } 2182 case nir_intrinsic_load_persp_center_rhw_ir3: 2183 if (!ctx->ij[IJ_PERSP_CENTER_RHW]) { 2184 ctx->ij[IJ_PERSP_CENTER_RHW] = 2185 create_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTER_RHW, 0x1); 2186 } 2187 dst[0] = ctx->ij[IJ_PERSP_CENTER_RHW]; 2188 break; 2189 case nir_intrinsic_load_barycentric_centroid: 2190 case nir_intrinsic_load_barycentric_sample: 2191 case nir_intrinsic_load_barycentric_pixel: 2192 emit_intrinsic_barycentric(ctx, intr, dst); 2193 break; 2194 case nir_intrinsic_load_interpolated_input: 2195 case nir_intrinsic_load_input: 2196 setup_input(ctx, intr); 2197 break; 2198 case nir_intrinsic_load_kernel_input: 2199 emit_intrinsic_load_kernel_input(ctx, intr, dst); 2200 break; 2201 /* All SSBO intrinsics should have been lowered by 'lower_io_offsets' 2202 * pass and replaced by an ir3-specifc version that adds the 2203 * dword-offset in the last source. 2204 */ 2205 case nir_intrinsic_load_ssbo_ir3: 2206 emit_intrinsic_load_ssbo(ctx, intr, dst); 2207 break; 2208 case nir_intrinsic_store_ssbo_ir3: 2209 ctx->funcs->emit_intrinsic_store_ssbo(ctx, intr); 2210 break; 2211 case nir_intrinsic_get_ssbo_size: 2212 emit_intrinsic_ssbo_size(ctx, intr, dst); 2213 break; 2214 case nir_intrinsic_ssbo_atomic_add_ir3: 2215 case nir_intrinsic_ssbo_atomic_imin_ir3: 2216 case nir_intrinsic_ssbo_atomic_umin_ir3: 2217 case nir_intrinsic_ssbo_atomic_imax_ir3: 2218 case nir_intrinsic_ssbo_atomic_umax_ir3: 2219 case nir_intrinsic_ssbo_atomic_and_ir3: 2220 case nir_intrinsic_ssbo_atomic_or_ir3: 2221 case nir_intrinsic_ssbo_atomic_xor_ir3: 2222 case nir_intrinsic_ssbo_atomic_exchange_ir3: 2223 case nir_intrinsic_ssbo_atomic_comp_swap_ir3: 2224 dst[0] = ctx->funcs->emit_intrinsic_atomic_ssbo(ctx, intr); 2225 break; 2226 case nir_intrinsic_load_shared: 2227 emit_intrinsic_load_shared(ctx, intr, dst); 2228 break; 2229 case nir_intrinsic_store_shared: 2230 emit_intrinsic_store_shared(ctx, intr); 2231 break; 2232 case nir_intrinsic_shared_atomic_add: 2233 case nir_intrinsic_shared_atomic_imin: 2234 case nir_intrinsic_shared_atomic_umin: 2235 case nir_intrinsic_shared_atomic_imax: 2236 case nir_intrinsic_shared_atomic_umax: 2237 case nir_intrinsic_shared_atomic_and: 2238 case nir_intrinsic_shared_atomic_or: 2239 case nir_intrinsic_shared_atomic_xor: 2240 case nir_intrinsic_shared_atomic_exchange: 2241 case nir_intrinsic_shared_atomic_comp_swap: 2242 dst[0] = emit_intrinsic_atomic_shared(ctx, intr); 2243 break; 2244 case nir_intrinsic_load_scratch: 2245 emit_intrinsic_load_scratch(ctx, intr, dst); 2246 break; 2247 case nir_intrinsic_store_scratch: 2248 emit_intrinsic_store_scratch(ctx, intr); 2249 break; 2250 case nir_intrinsic_image_load: 2251 case nir_intrinsic_bindless_image_load: 2252 emit_intrinsic_load_image(ctx, intr, dst); 2253 break; 2254 case nir_intrinsic_image_store: 2255 case nir_intrinsic_bindless_image_store: 2256 ctx->funcs->emit_intrinsic_store_image(ctx, intr); 2257 break; 2258 case nir_intrinsic_image_size: 2259 case nir_intrinsic_bindless_image_size: 2260 ctx->funcs->emit_intrinsic_image_size(ctx, intr, dst); 2261 break; 2262 case nir_intrinsic_image_atomic_add: 2263 case nir_intrinsic_bindless_image_atomic_add: 2264 case nir_intrinsic_image_atomic_imin: 2265 case nir_intrinsic_bindless_image_atomic_imin: 2266 case 
nir_intrinsic_image_atomic_umin:
   case nir_intrinsic_bindless_image_atomic_umin:
   case nir_intrinsic_image_atomic_imax:
   case nir_intrinsic_bindless_image_atomic_imax:
   case nir_intrinsic_image_atomic_umax:
   case nir_intrinsic_bindless_image_atomic_umax:
   case nir_intrinsic_image_atomic_and:
   case nir_intrinsic_bindless_image_atomic_and:
   case nir_intrinsic_image_atomic_or:
   case nir_intrinsic_bindless_image_atomic_or:
   case nir_intrinsic_image_atomic_xor:
   case nir_intrinsic_bindless_image_atomic_xor:
   case nir_intrinsic_image_atomic_exchange:
   case nir_intrinsic_bindless_image_atomic_exchange:
   case nir_intrinsic_image_atomic_comp_swap:
   case nir_intrinsic_bindless_image_atomic_comp_swap:
      dst[0] = ctx->funcs->emit_intrinsic_atomic_image(ctx, intr);
      break;
   case nir_intrinsic_scoped_barrier:
   case nir_intrinsic_control_barrier:
   case nir_intrinsic_memory_barrier:
   case nir_intrinsic_group_memory_barrier:
   case nir_intrinsic_memory_barrier_buffer:
   case nir_intrinsic_memory_barrier_image:
   case nir_intrinsic_memory_barrier_shared:
   case nir_intrinsic_memory_barrier_tcs_patch:
      emit_intrinsic_barrier(ctx, intr);
      /* note that blk ptr no longer valid, make that obvious: */
      b = NULL;
      break;
   case nir_intrinsic_store_output:
      setup_output(ctx, intr);
      break;
   case nir_intrinsic_load_base_vertex:
   case nir_intrinsic_load_first_vertex:
      if (!ctx->basevertex) {
         ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
      }
      dst[0] = ctx->basevertex;
      break;
   case nir_intrinsic_load_draw_id:
      if (!ctx->draw_id) {
         ctx->draw_id = create_driver_param(ctx, IR3_DP_DRAWID);
      }
      dst[0] = ctx->draw_id;
      break;
   case nir_intrinsic_load_base_instance:
      if (!ctx->base_instance) {
         ctx->base_instance = create_driver_param(ctx, IR3_DP_INSTID_BASE);
      }
      dst[0] = ctx->base_instance;
      break;
   case nir_intrinsic_load_view_index:
      if (!ctx->view_index) {
         ctx->view_index =
            create_sysval_input(ctx, SYSTEM_VALUE_VIEW_INDEX, 0x1);
      }
      dst[0] = ctx->view_index;
      break;
   case nir_intrinsic_load_vertex_id_zero_base:
   case nir_intrinsic_load_vertex_id:
      if (!ctx->vertex_id) {
         gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id)
                                 ?
SYSTEM_VALUE_VERTEX_ID 2330 : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE; 2331 ctx->vertex_id = create_sysval_input(ctx, sv, 0x1); 2332 } 2333 dst[0] = ctx->vertex_id; 2334 break; 2335 case nir_intrinsic_load_instance_id: 2336 if (!ctx->instance_id) { 2337 ctx->instance_id = 2338 create_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID, 0x1); 2339 } 2340 dst[0] = ctx->instance_id; 2341 break; 2342 case nir_intrinsic_load_sample_id: 2343 ctx->so->per_samp = true; 2344 FALLTHROUGH; 2345 case nir_intrinsic_load_sample_id_no_per_sample: 2346 if (!ctx->samp_id) { 2347 ctx->samp_id = create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID, 0x1); 2348 ctx->samp_id->dsts[0]->flags |= IR3_REG_HALF; 2349 } 2350 dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32); 2351 break; 2352 case nir_intrinsic_load_sample_mask_in: 2353 if (!ctx->samp_mask_in) { 2354 ctx->samp_mask_in = 2355 create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN, 0x1); 2356 } 2357 dst[0] = ctx->samp_mask_in; 2358 break; 2359 case nir_intrinsic_load_user_clip_plane: 2360 idx = nir_intrinsic_ucp_id(intr); 2361 for (int i = 0; i < dest_components; i++) { 2362 unsigned n = idx * 4 + i; 2363 dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n); 2364 } 2365 break; 2366 case nir_intrinsic_load_front_face: 2367 if (!ctx->frag_face) { 2368 ctx->so->frag_face = true; 2369 ctx->frag_face = 2370 create_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, 0x1); 2371 ctx->frag_face->dsts[0]->flags |= IR3_REG_HALF; 2372 } 2373 /* for fragface, we get -1 for back and 0 for front. However this is 2374 * the inverse of what nir expects (where ~0 is true). 2375 */ 2376 dst[0] = ir3_CMPS_S(b, ctx->frag_face, 0, 2377 create_immed_typed(b, 0, TYPE_U16), 0); 2378 dst[0]->cat2.condition = IR3_COND_EQ; 2379 break; 2380 case nir_intrinsic_load_local_invocation_id: 2381 if (!ctx->local_invocation_id) { 2382 ctx->local_invocation_id = 2383 create_sysval_input(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID, 0x7); 2384 } 2385 ir3_split_dest(b, dst, ctx->local_invocation_id, 0, 3); 2386 break; 2387 case nir_intrinsic_load_workgroup_id: 2388 case nir_intrinsic_load_workgroup_id_zero_base: 2389 if (ctx->compiler->has_shared_regfile) { 2390 if (!ctx->work_group_id) { 2391 ctx->work_group_id = 2392 create_sysval_input(ctx, SYSTEM_VALUE_WORKGROUP_ID, 0x7); 2393 ctx->work_group_id->dsts[0]->flags |= IR3_REG_SHARED; 2394 } 2395 ir3_split_dest(b, dst, ctx->work_group_id, 0, 3); 2396 } else { 2397 /* For a3xx/a4xx, this comes in via const injection by the hw */ 2398 for (int i = 0; i < dest_components; i++) { 2399 dst[i] = create_driver_param(ctx, IR3_DP_WORKGROUP_ID_X + i); 2400 } 2401 } 2402 break; 2403 case nir_intrinsic_load_base_workgroup_id: 2404 for (int i = 0; i < dest_components; i++) { 2405 dst[i] = create_driver_param(ctx, IR3_DP_BASE_GROUP_X + i); 2406 } 2407 break; 2408 case nir_intrinsic_load_num_workgroups: 2409 for (int i = 0; i < dest_components; i++) { 2410 dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i); 2411 } 2412 break; 2413 case nir_intrinsic_load_workgroup_size: 2414 for (int i = 0; i < dest_components; i++) { 2415 dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i); 2416 } 2417 break; 2418 case nir_intrinsic_load_subgroup_size: { 2419 assert(ctx->so->type == MESA_SHADER_COMPUTE || 2420 ctx->so->type == MESA_SHADER_FRAGMENT); 2421 enum ir3_driver_param size = ctx->so->type == MESA_SHADER_COMPUTE ? 
2422 IR3_DP_CS_SUBGROUP_SIZE : IR3_DP_FS_SUBGROUP_SIZE; 2423 dst[0] = create_driver_param(ctx, size); 2424 break; 2425 } 2426 case nir_intrinsic_load_subgroup_id_shift_ir3: 2427 dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_ID_SHIFT); 2428 break; 2429 case nir_intrinsic_load_work_dim: 2430 dst[0] = create_driver_param(ctx, IR3_DP_WORK_DIM); 2431 break; 2432 case nir_intrinsic_load_subgroup_invocation: 2433 assert(ctx->compiler->has_getfiberid); 2434 dst[0] = ir3_GETFIBERID(b); 2435 dst[0]->cat6.type = TYPE_U32; 2436 __ssa_dst(dst[0]); 2437 break; 2438 case nir_intrinsic_discard_if: 2439 case nir_intrinsic_discard: 2440 case nir_intrinsic_demote: 2441 case nir_intrinsic_demote_if: 2442 case nir_intrinsic_terminate: 2443 case nir_intrinsic_terminate_if: { 2444 struct ir3_instruction *cond, *kill; 2445 2446 if (intr->intrinsic == nir_intrinsic_discard_if || 2447 intr->intrinsic == nir_intrinsic_demote_if || 2448 intr->intrinsic == nir_intrinsic_terminate_if) { 2449 /* conditional discard: */ 2450 src = ir3_get_src(ctx, &intr->src[0]); 2451 cond = src[0]; 2452 } else { 2453 /* unconditional discard: */ 2454 cond = create_immed_typed(b, 1, ctx->compiler->bool_type); 2455 } 2456 2457 /* NOTE: only cmps.*.* can write p0.x: */ 2458 struct ir3_instruction *zero = 2459 create_immed_typed(b, 0, is_half(cond) ? TYPE_U16 : TYPE_U32); 2460 cond = ir3_CMPS_S(b, cond, 0, zero, 0); 2461 cond->cat2.condition = IR3_COND_NE; 2462 2463 /* condition always goes in predicate register: */ 2464 cond->dsts[0]->num = regid(REG_P0, 0); 2465 cond->dsts[0]->flags &= ~IR3_REG_SSA; 2466 2467 if (intr->intrinsic == nir_intrinsic_demote || 2468 intr->intrinsic == nir_intrinsic_demote_if) { 2469 kill = ir3_DEMOTE(b, cond, 0); 2470 } else { 2471 kill = ir3_KILL(b, cond, 0); 2472 } 2473 2474 /* - Side-effects should not be moved on a different side of the kill 2475 * - Instructions that depend on active fibers should not be reordered 2476 */ 2477 kill->barrier_class = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W | 2478 IR3_BARRIER_ACTIVE_FIBERS_W; 2479 kill->barrier_conflict = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W | 2480 IR3_BARRIER_ACTIVE_FIBERS_R; 2481 kill->srcs[0]->num = regid(REG_P0, 0); 2482 array_insert(ctx->ir, ctx->ir->predicates, kill); 2483 2484 array_insert(b, b->keeps, kill); 2485 ctx->so->has_kill = true; 2486 2487 break; 2488 } 2489 2490 case nir_intrinsic_cond_end_ir3: { 2491 struct ir3_instruction *cond, *kill; 2492 2493 src = ir3_get_src(ctx, &intr->src[0]); 2494 cond = src[0]; 2495 2496 /* NOTE: only cmps.*.* can write p0.x: */ 2497 struct ir3_instruction *zero = 2498 create_immed_typed(b, 0, is_half(cond) ? 
TYPE_U16 : TYPE_U32); 2499 cond = ir3_CMPS_S(b, cond, 0, zero, 0); 2500 cond->cat2.condition = IR3_COND_NE; 2501 2502 /* condition always goes in predicate register: */ 2503 cond->dsts[0]->num = regid(REG_P0, 0); 2504 2505 kill = ir3_PREDT(b, cond, 0); 2506 2507 kill->barrier_class = IR3_BARRIER_EVERYTHING; 2508 kill->barrier_conflict = IR3_BARRIER_EVERYTHING; 2509 2510 array_insert(ctx->ir, ctx->ir->predicates, kill); 2511 array_insert(b, b->keeps, kill); 2512 break; 2513 } 2514 2515 case nir_intrinsic_vote_any: 2516 case nir_intrinsic_vote_all: { 2517 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0]; 2518 struct ir3_instruction *pred = ir3_get_predicate(ctx, src); 2519 if (intr->intrinsic == nir_intrinsic_vote_any) 2520 dst[0] = ir3_ANY_MACRO(ctx->block, pred, 0); 2521 else 2522 dst[0] = ir3_ALL_MACRO(ctx->block, pred, 0); 2523 dst[0]->srcs[0]->num = regid(REG_P0, 0); 2524 array_insert(ctx->ir, ctx->ir->predicates, dst[0]); 2525 break; 2526 } 2527 case nir_intrinsic_elect: 2528 dst[0] = ir3_ELECT_MACRO(ctx->block); 2529 /* This may expand to a divergent if/then, so allocate stack space for 2530 * it. 2531 */ 2532 ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1); 2533 break; 2534 case nir_intrinsic_preamble_start_ir3: 2535 dst[0] = ir3_SHPS_MACRO(ctx->block); 2536 ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1); 2537 break; 2538 2539 case nir_intrinsic_read_invocation_cond_ir3: { 2540 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0]; 2541 struct ir3_instruction *cond = ir3_get_src(ctx, &intr->src[1])[0]; 2542 dst[0] = ir3_READ_COND_MACRO(ctx->block, ir3_get_predicate(ctx, cond), 0, 2543 src, 0); 2544 dst[0]->dsts[0]->flags |= IR3_REG_SHARED; 2545 dst[0]->srcs[0]->num = regid(REG_P0, 0); 2546 array_insert(ctx->ir, ctx->ir->predicates, dst[0]); 2547 ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1); 2548 break; 2549 } 2550 2551 case nir_intrinsic_read_first_invocation: { 2552 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0]; 2553 dst[0] = ir3_READ_FIRST_MACRO(ctx->block, src, 0); 2554 dst[0]->dsts[0]->flags |= IR3_REG_SHARED; 2555 ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1); 2556 break; 2557 } 2558 2559 case nir_intrinsic_ballot: { 2560 struct ir3_instruction *ballot; 2561 unsigned components = intr->dest.ssa.num_components; 2562 if (nir_src_is_const(intr->src[0]) && nir_src_as_bool(intr->src[0])) { 2563 /* ballot(true) is just MOVMSK */ 2564 ballot = ir3_MOVMSK(ctx->block, components); 2565 } else { 2566 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0]; 2567 struct ir3_instruction *pred = ir3_get_predicate(ctx, src); 2568 ballot = ir3_BALLOT_MACRO(ctx->block, pred, components); 2569 ballot->srcs[0]->num = regid(REG_P0, 0); 2570 array_insert(ctx->ir, ctx->ir->predicates, ballot); 2571 ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1); 2572 } 2573 2574 ballot->barrier_class = IR3_BARRIER_ACTIVE_FIBERS_R; 2575 ballot->barrier_conflict = IR3_BARRIER_ACTIVE_FIBERS_W; 2576 2577 ir3_split_dest(ctx->block, dst, ballot, 0, components); 2578 break; 2579 } 2580 2581 case nir_intrinsic_quad_broadcast: { 2582 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0]; 2583 struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[1])[0]; 2584 2585 type_t dst_type = type_uint_size(nir_dest_bit_size(intr->dest)); 2586 2587 if (dst_type != TYPE_U32) 2588 idx = ir3_COV(ctx->block, idx, TYPE_U32, dst_type); 2589 2590 dst[0] = ir3_QUAD_SHUFFLE_BRCST(ctx->block, src, 0, idx, 0); 2591 dst[0]->cat5.type = 
dst_type; 2592 break; 2593 } 2594 2595 case nir_intrinsic_quad_swap_horizontal: { 2596 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0]; 2597 dst[0] = ir3_QUAD_SHUFFLE_HORIZ(ctx->block, src, 0); 2598 dst[0]->cat5.type = type_uint_size(nir_dest_bit_size(intr->dest)); 2599 break; 2600 } 2601 2602 case nir_intrinsic_quad_swap_vertical: { 2603 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0]; 2604 dst[0] = ir3_QUAD_SHUFFLE_VERT(ctx->block, src, 0); 2605 dst[0]->cat5.type = type_uint_size(nir_dest_bit_size(intr->dest)); 2606 break; 2607 } 2608 2609 case nir_intrinsic_quad_swap_diagonal: { 2610 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0]; 2611 dst[0] = ir3_QUAD_SHUFFLE_DIAG(ctx->block, src, 0); 2612 dst[0]->cat5.type = type_uint_size(nir_dest_bit_size(intr->dest)); 2613 break; 2614 } 2615 2616 case nir_intrinsic_load_shared_ir3: 2617 emit_intrinsic_load_shared_ir3(ctx, intr, dst); 2618 break; 2619 case nir_intrinsic_store_shared_ir3: 2620 emit_intrinsic_store_shared_ir3(ctx, intr); 2621 break; 2622 case nir_intrinsic_bindless_resource_ir3: 2623 dst[0] = ir3_get_src(ctx, &intr->src[0])[0]; 2624 break; 2625 case nir_intrinsic_global_atomic_add_ir3: 2626 case nir_intrinsic_global_atomic_imin_ir3: 2627 case nir_intrinsic_global_atomic_umin_ir3: 2628 case nir_intrinsic_global_atomic_imax_ir3: 2629 case nir_intrinsic_global_atomic_umax_ir3: 2630 case nir_intrinsic_global_atomic_and_ir3: 2631 case nir_intrinsic_global_atomic_or_ir3: 2632 case nir_intrinsic_global_atomic_xor_ir3: 2633 case nir_intrinsic_global_atomic_exchange_ir3: 2634 case nir_intrinsic_global_atomic_comp_swap_ir3: { 2635 dst[0] = ctx->funcs->emit_intrinsic_atomic_global(ctx, intr); 2636 break; 2637 } 2638 2639 case nir_intrinsic_reduce: 2640 case nir_intrinsic_inclusive_scan: 2641 case nir_intrinsic_exclusive_scan: 2642 dst[0] = emit_intrinsic_reduce(ctx, intr); 2643 break; 2644 2645 case nir_intrinsic_preamble_end_ir3: { 2646 struct ir3_instruction *instr = ir3_SHPE(ctx->block); 2647 instr->barrier_class = instr->barrier_conflict = IR3_BARRIER_CONST_W; 2648 array_insert(b, b->keeps, instr); 2649 break; 2650 } 2651 case nir_intrinsic_store_uniform_ir3: { 2652 unsigned components = nir_src_num_components(intr->src[0]); 2653 unsigned dst = nir_intrinsic_base(intr); 2654 unsigned dst_lo = dst & 0xff; 2655 unsigned dst_hi = dst >> 8; 2656 2657 struct ir3_instruction *src = 2658 ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), components); 2659 struct ir3_instruction *a1 = NULL; 2660 if (dst_hi) { 2661 /* Encode only the high part of the destination in a1.x to increase the 2662 * chance that we can reuse the a1.x value in subsequent stc 2663 * instructions. 
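          * The low 8 bits of the destination are emitted as a plain
          * immediate source of the stc below, so only destinations at
          * offset 256 or above need the a1.x path at all.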
2664 */ 2665 a1 = ir3_get_addr1(ctx, dst_hi << 8); 2666 } 2667 2668 struct ir3_instruction *stc = 2669 ir3_STC(ctx->block, create_immed(b, dst_lo), 0, src, 0); 2670 stc->cat6.iim_val = components; 2671 stc->cat6.type = TYPE_U32; 2672 stc->barrier_conflict = IR3_BARRIER_CONST_W; 2673 if (a1) { 2674 ir3_instr_set_address(stc, a1); 2675 stc->flags |= IR3_INSTR_A1EN; 2676 } 2677 array_insert(b, b->keeps, stc); 2678 break; 2679 } 2680 default: 2681 ir3_context_error(ctx, "Unhandled intrinsic type: %s\n", 2682 nir_intrinsic_infos[intr->intrinsic].name); 2683 break; 2684 } 2685 2686 if (info->has_dest) 2687 ir3_put_dst(ctx, &intr->dest); 2688} 2689 2690static void 2691emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr) 2692{ 2693 struct ir3_instruction **dst = 2694 ir3_get_dst_ssa(ctx, &instr->def, instr->def.num_components); 2695 unsigned bit_size = ir3_bitsize(ctx, instr->def.bit_size); 2696 2697 if (bit_size <= 8) { 2698 for (int i = 0; i < instr->def.num_components; i++) 2699 dst[i] = create_immed_typed(ctx->block, instr->value[i].u8, TYPE_U8); 2700 } else if (bit_size <= 16) { 2701 for (int i = 0; i < instr->def.num_components; i++) 2702 dst[i] = create_immed_typed(ctx->block, instr->value[i].u16, TYPE_U16); 2703 } else { 2704 for (int i = 0; i < instr->def.num_components; i++) 2705 dst[i] = create_immed_typed(ctx->block, instr->value[i].u32, TYPE_U32); 2706 } 2707} 2708 2709static void 2710emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef) 2711{ 2712 struct ir3_instruction **dst = 2713 ir3_get_dst_ssa(ctx, &undef->def, undef->def.num_components); 2714 type_t type = utype_for_size(ir3_bitsize(ctx, undef->def.bit_size)); 2715 2716 /* backend doesn't want undefined instructions, so just plug 2717 * in 0.0.. 2718 */ 2719 for (int i = 0; i < undef->def.num_components; i++) 2720 dst[i] = create_immed_typed(ctx->block, fui(0.0), type); 2721} 2722 2723/* 2724 * texture fetch/sample instructions: 2725 */ 2726 2727static type_t 2728get_tex_dest_type(nir_tex_instr *tex) 2729{ 2730 type_t type; 2731 2732 switch (tex->dest_type) { 2733 case nir_type_float32: 2734 return TYPE_F32; 2735 case nir_type_float16: 2736 return TYPE_F16; 2737 case nir_type_int32: 2738 return TYPE_S32; 2739 case nir_type_int16: 2740 return TYPE_S16; 2741 case nir_type_bool32: 2742 case nir_type_uint32: 2743 return TYPE_U32; 2744 case nir_type_bool16: 2745 case nir_type_uint16: 2746 return TYPE_U16; 2747 case nir_type_invalid: 2748 default: 2749 unreachable("bad dest_type"); 2750 } 2751 2752 return type; 2753} 2754 2755static void 2756tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp) 2757{ 2758 unsigned coords = 2759 glsl_get_sampler_dim_coordinate_components(tex->sampler_dim); 2760 unsigned flags = 0; 2761 2762 /* note: would use tex->coord_components.. except txs.. also, 2763 * since array index goes after shadow ref, we don't want to 2764 * count it: 2765 */ 2766 if (coords == 3) 2767 flags |= IR3_INSTR_3D; 2768 2769 if (tex->is_shadow && tex->op != nir_texop_lod) 2770 flags |= IR3_INSTR_S; 2771 2772 if (tex->is_array && tex->op != nir_texop_lod) 2773 flags |= IR3_INSTR_A; 2774 2775 *flagsp = flags; 2776 *coordsp = coords; 2777} 2778 2779/* Gets the sampler/texture idx as a hvec2. 
It could either be dynamic or immediate (in which case it will get lowered
 * later to a non-.s2en version of the tex instruction, which encodes
 * tex/samp as immediates):
 */
static struct tex_src_info
get_tex_samp_tex_src(struct ir3_context *ctx, nir_tex_instr *tex)
{
   struct ir3_block *b = ctx->block;
   struct tex_src_info info = {0};
   int texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
   int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
   struct ir3_instruction *texture, *sampler;

   if (texture_idx >= 0 || sampler_idx >= 0) {
      /* Bindless case */
      info.flags |= IR3_INSTR_B;

      if (tex->texture_non_uniform || tex->sampler_non_uniform)
         info.flags |= IR3_INSTR_NONUNIF;

      /* Gather information required to determine which encoding to
       * choose as well as for prefetch.
       */
      nir_intrinsic_instr *bindless_tex = NULL;
      bool tex_const;
      if (texture_idx >= 0) {
         ctx->so->bindless_tex = true;
         bindless_tex = ir3_bindless_resource(tex->src[texture_idx].src);
         assert(bindless_tex);
         info.tex_base = nir_intrinsic_desc_set(bindless_tex);
         tex_const = nir_src_is_const(bindless_tex->src[0]);
         if (tex_const)
            info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
      } else {
         /* To simplify some of the logic below, assume the index is
          * constant 0 when it's not enabled.
          */
         tex_const = true;
         info.tex_idx = 0;
      }
      nir_intrinsic_instr *bindless_samp = NULL;
      bool samp_const;
      if (sampler_idx >= 0) {
         ctx->so->bindless_samp = true;
         bindless_samp = ir3_bindless_resource(tex->src[sampler_idx].src);
         assert(bindless_samp);
         info.samp_base = nir_intrinsic_desc_set(bindless_samp);
         samp_const = nir_src_is_const(bindless_samp->src[0]);
         if (samp_const)
            info.samp_idx = nir_src_as_uint(bindless_samp->src[0]);
      } else {
         samp_const = true;
         info.samp_idx = 0;
      }

      /* Choose encoding. */
      if (tex_const && samp_const && info.tex_idx < 256 &&
          info.samp_idx < 256) {
         if (info.tex_idx < 16 && info.samp_idx < 16 &&
             (!bindless_tex || !bindless_samp ||
              info.tex_base == info.samp_base)) {
            /* Everything fits within the instruction */
            info.base = info.tex_base;
         } else {
            info.base = info.tex_base;
            info.a1_val = info.tex_idx << 3 | info.samp_base;
            info.flags |= IR3_INSTR_A1EN;
         }
         info.samp_tex = NULL;
      } else {
         info.flags |= IR3_INSTR_S2EN;
         /* In the indirect case, we only use a1.x to store the sampler
          * base if it differs from the texture base.
          */
         if (!bindless_tex || !bindless_samp ||
             info.tex_base == info.samp_base) {
            info.base = info.tex_base;
         } else {
            info.base = info.tex_base;
            info.a1_val = info.samp_base;
            info.flags |= IR3_INSTR_A1EN;
         }

         /* Note: the indirect source is now a vec2 instead of hvec2, and
          * for some reason the texture and sampler are swapped.
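          * (Hence the collect below is built as (texture, sampler), while
          * the non-bindless path further down collects (sampler, texture).)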
2864 */ 2865 struct ir3_instruction *texture, *sampler; 2866 2867 if (bindless_tex) { 2868 texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0]; 2869 } else { 2870 texture = create_immed(b, 0); 2871 } 2872 2873 if (bindless_samp) { 2874 sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0]; 2875 } else { 2876 sampler = create_immed(b, 0); 2877 } 2878 info.samp_tex = ir3_collect(b, texture, sampler); 2879 } 2880 } else { 2881 info.flags |= IR3_INSTR_S2EN; 2882 texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset); 2883 sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset); 2884 if (texture_idx >= 0) { 2885 texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0]; 2886 texture = ir3_COV(ctx->block, texture, TYPE_U32, TYPE_U16); 2887 } else { 2888 /* TODO what to do for dynamic case? I guess we only need the 2889 * max index for astc srgb workaround so maybe not a problem 2890 * to worry about if we don't enable indirect samplers for 2891 * a4xx? 2892 */ 2893 ctx->max_texture_index = 2894 MAX2(ctx->max_texture_index, tex->texture_index); 2895 texture = create_immed_typed(ctx->block, tex->texture_index, TYPE_U16); 2896 info.tex_idx = tex->texture_index; 2897 } 2898 2899 if (sampler_idx >= 0) { 2900 sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0]; 2901 sampler = ir3_COV(ctx->block, sampler, TYPE_U32, TYPE_U16); 2902 } else { 2903 sampler = create_immed_typed(ctx->block, tex->sampler_index, TYPE_U16); 2904 info.samp_idx = tex->texture_index; 2905 } 2906 2907 info.samp_tex = ir3_collect(b, sampler, texture); 2908 } 2909 2910 return info; 2911} 2912 2913static void 2914emit_tex(struct ir3_context *ctx, nir_tex_instr *tex) 2915{ 2916 struct ir3_block *b = ctx->block; 2917 struct ir3_instruction **dst, *sam, *src0[12], *src1[4]; 2918 struct ir3_instruction *const *coord, *const *off, *const *ddx, *const *ddy; 2919 struct ir3_instruction *lod, *compare, *proj, *sample_index; 2920 struct tex_src_info info = {0}; 2921 bool has_bias = false, has_lod = false, has_proj = false, has_off = false; 2922 unsigned i, coords, flags, ncomp; 2923 unsigned nsrc0 = 0, nsrc1 = 0; 2924 type_t type; 2925 opc_t opc = 0; 2926 2927 ncomp = nir_dest_num_components(tex->dest); 2928 2929 coord = off = ddx = ddy = NULL; 2930 lod = proj = compare = sample_index = NULL; 2931 2932 dst = ir3_get_dst(ctx, &tex->dest, ncomp); 2933 2934 for (unsigned i = 0; i < tex->num_srcs; i++) { 2935 switch (tex->src[i].src_type) { 2936 case nir_tex_src_coord: 2937 coord = ir3_get_src(ctx, &tex->src[i].src); 2938 break; 2939 case nir_tex_src_bias: 2940 lod = ir3_get_src(ctx, &tex->src[i].src)[0]; 2941 has_bias = true; 2942 break; 2943 case nir_tex_src_lod: 2944 lod = ir3_get_src(ctx, &tex->src[i].src)[0]; 2945 has_lod = true; 2946 break; 2947 case nir_tex_src_comparator: /* shadow comparator */ 2948 compare = ir3_get_src(ctx, &tex->src[i].src)[0]; 2949 break; 2950 case nir_tex_src_projector: 2951 proj = ir3_get_src(ctx, &tex->src[i].src)[0]; 2952 has_proj = true; 2953 break; 2954 case nir_tex_src_offset: 2955 off = ir3_get_src(ctx, &tex->src[i].src); 2956 has_off = true; 2957 break; 2958 case nir_tex_src_ddx: 2959 ddx = ir3_get_src(ctx, &tex->src[i].src); 2960 break; 2961 case nir_tex_src_ddy: 2962 ddy = ir3_get_src(ctx, &tex->src[i].src); 2963 break; 2964 case nir_tex_src_ms_index: 2965 sample_index = ir3_get_src(ctx, &tex->src[i].src)[0]; 2966 break; 2967 case nir_tex_src_texture_offset: 2968 case nir_tex_src_sampler_offset: 2969 case nir_tex_src_texture_handle: 2970 case 
nir_tex_src_sampler_handle: 2971 /* handled in get_tex_samp_src() */ 2972 break; 2973 default: 2974 ir3_context_error(ctx, "Unhandled NIR tex src type: %d\n", 2975 tex->src[i].src_type); 2976 return; 2977 } 2978 } 2979 2980 switch (tex->op) { 2981 case nir_texop_tex_prefetch: 2982 compile_assert(ctx, !has_bias); 2983 compile_assert(ctx, !has_lod); 2984 compile_assert(ctx, !compare); 2985 compile_assert(ctx, !has_proj); 2986 compile_assert(ctx, !has_off); 2987 compile_assert(ctx, !ddx); 2988 compile_assert(ctx, !ddy); 2989 compile_assert(ctx, !sample_index); 2990 compile_assert( 2991 ctx, nir_tex_instr_src_index(tex, nir_tex_src_texture_offset) < 0); 2992 compile_assert( 2993 ctx, nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset) < 0); 2994 2995 if (ctx->so->num_sampler_prefetch < ctx->prefetch_limit) { 2996 opc = OPC_META_TEX_PREFETCH; 2997 ctx->so->num_sampler_prefetch++; 2998 break; 2999 } 3000 FALLTHROUGH; 3001 case nir_texop_tex: 3002 opc = has_lod ? OPC_SAML : OPC_SAM; 3003 break; 3004 case nir_texop_txb: 3005 opc = OPC_SAMB; 3006 break; 3007 case nir_texop_txl: 3008 opc = OPC_SAML; 3009 break; 3010 case nir_texop_txd: 3011 opc = OPC_SAMGQ; 3012 break; 3013 case nir_texop_txf: 3014 opc = OPC_ISAML; 3015 break; 3016 case nir_texop_lod: 3017 opc = OPC_GETLOD; 3018 break; 3019 case nir_texop_tg4: 3020 switch (tex->component) { 3021 case 0: 3022 opc = OPC_GATHER4R; 3023 break; 3024 case 1: 3025 opc = OPC_GATHER4G; 3026 break; 3027 case 2: 3028 opc = OPC_GATHER4B; 3029 break; 3030 case 3: 3031 opc = OPC_GATHER4A; 3032 break; 3033 } 3034 break; 3035 case nir_texop_txf_ms_fb: 3036 case nir_texop_txf_ms: 3037 opc = OPC_ISAMM; 3038 break; 3039 default: 3040 ir3_context_error(ctx, "Unhandled NIR tex type: %d\n", tex->op); 3041 return; 3042 } 3043 3044 tex_info(tex, &flags, &coords); 3045 3046 /* 3047 * lay out the first argument in the proper order: 3048 * - actual coordinates first 3049 * - shadow reference 3050 * - array index 3051 * - projection w 3052 * - starting at offset 4, dpdx.xy, dpdy.xy 3053 * 3054 * bias/lod go into the second arg 3055 */ 3056 3057 /* insert tex coords: */ 3058 for (i = 0; i < coords; i++) 3059 src0[i] = coord[i]; 3060 3061 nsrc0 = i; 3062 3063 type_t coord_pad_type = is_half(coord[0]) ? TYPE_U16 : TYPE_U32; 3064 /* scale up integer coords for TXF based on the LOD */ 3065 if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) { 3066 assert(has_lod); 3067 for (i = 0; i < coords; i++) 3068 src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0); 3069 } 3070 3071 if (coords == 1) { 3072 /* hw doesn't do 1d, so we treat it as 2d with 3073 * height of 1, and patch up the y coord. 
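       * For isam-style (integer texel fetch) opcodes the pad is an integer
       * 0, while filtered sample ops use 0.5 so the implicit y coordinate
       * falls in the middle of the single row of texels.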
3074 */ 3075 if (is_isam(opc)) { 3076 src0[nsrc0++] = create_immed_typed(b, 0, coord_pad_type); 3077 } else if (is_half(coord[0])) { 3078 src0[nsrc0++] = create_immed_typed(b, _mesa_float_to_half(0.5), coord_pad_type); 3079 } else { 3080 src0[nsrc0++] = create_immed_typed(b, fui(0.5), coord_pad_type); 3081 } 3082 } 3083 3084 if (tex->is_shadow && tex->op != nir_texop_lod) 3085 src0[nsrc0++] = compare; 3086 3087 if (tex->is_array && tex->op != nir_texop_lod) 3088 src0[nsrc0++] = coord[coords]; 3089 3090 if (has_proj) { 3091 src0[nsrc0++] = proj; 3092 flags |= IR3_INSTR_P; 3093 } 3094 3095 /* pad to 4, then ddx/ddy: */ 3096 if (tex->op == nir_texop_txd) { 3097 while (nsrc0 < 4) 3098 src0[nsrc0++] = create_immed_typed(b, fui(0.0), coord_pad_type); 3099 for (i = 0; i < coords; i++) 3100 src0[nsrc0++] = ddx[i]; 3101 if (coords < 2) 3102 src0[nsrc0++] = create_immed_typed(b, fui(0.0), coord_pad_type); 3103 for (i = 0; i < coords; i++) 3104 src0[nsrc0++] = ddy[i]; 3105 if (coords < 2) 3106 src0[nsrc0++] = create_immed_typed(b, fui(0.0), coord_pad_type); 3107 } 3108 3109 /* NOTE a3xx (and possibly a4xx?) might be different, using isaml 3110 * with scaled x coord according to requested sample: 3111 */ 3112 if (opc == OPC_ISAMM) { 3113 if (ctx->compiler->txf_ms_with_isaml) { 3114 /* the samples are laid out in x dimension as 3115 * 0 1 2 3 3116 * x_ms = (x << ms) + sample_index; 3117 */ 3118 struct ir3_instruction *ms; 3119 ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3); 3120 3121 src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0); 3122 src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0); 3123 3124 opc = OPC_ISAML; 3125 } else { 3126 src0[nsrc0++] = sample_index; 3127 } 3128 } 3129 3130 /* 3131 * second argument (if applicable): 3132 * - offsets 3133 * - lod 3134 * - bias 3135 */ 3136 if (has_off | has_lod | has_bias) { 3137 if (has_off) { 3138 unsigned off_coords = coords; 3139 if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE) 3140 off_coords--; 3141 for (i = 0; i < off_coords; i++) 3142 src1[nsrc1++] = off[i]; 3143 if (off_coords < 2) 3144 src1[nsrc1++] = create_immed_typed(b, fui(0.0), coord_pad_type); 3145 flags |= IR3_INSTR_O; 3146 } 3147 3148 if (has_lod | has_bias) 3149 src1[nsrc1++] = lod; 3150 } 3151 3152 type = get_tex_dest_type(tex); 3153 3154 if (opc == OPC_GETLOD) 3155 type = TYPE_S32; 3156 3157 if (tex->op == nir_texop_txf_ms_fb) { 3158 /* only expect a single txf_ms_fb per shader: */ 3159 compile_assert(ctx, !ctx->so->fb_read); 3160 compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT); 3161 3162 ctx->so->fb_read = true; 3163 info.samp_tex = ir3_collect( 3164 b, create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16), 3165 create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16)); 3166 info.flags = IR3_INSTR_S2EN; 3167 3168 ctx->so->num_samp++; 3169 } else { 3170 info = get_tex_samp_tex_src(ctx, tex); 3171 } 3172 3173 bool tg4_swizzle_fixup = false; 3174 if (tex->op == nir_texop_tg4 && ctx->compiler->gen == 4 && 3175 ctx->sampler_swizzles[tex->texture_index] != 0x688 /* rgba */) { 3176 uint16_t swizzles = ctx->sampler_swizzles[tex->texture_index]; 3177 uint16_t swizzle = (swizzles >> (tex->component * 3)) & 7; 3178 if (swizzle > 3) { 3179 /* this would mean that we can just return 0 / 1, no texturing 3180 * necessary 3181 */ 3182 struct ir3_instruction *imm = create_immed(b, 3183 type_float(type) ? 
fui(swizzle - 4) : (swizzle - 4)); 3184 for (int i = 0; i < 4; i++) 3185 dst[i] = imm; 3186 ir3_put_dst(ctx, &tex->dest); 3187 return; 3188 } 3189 opc = OPC_GATHER4R + swizzle; 3190 tg4_swizzle_fixup = true; 3191 } 3192 3193 struct ir3_instruction *col0 = ir3_create_collect(b, src0, nsrc0); 3194 struct ir3_instruction *col1 = ir3_create_collect(b, src1, nsrc1); 3195 3196 if (opc == OPC_META_TEX_PREFETCH) { 3197 int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord); 3198 3199 compile_assert(ctx, tex->src[idx].src.is_ssa); 3200 3201 sam = ir3_SAM(ctx->in_block, opc, type, MASK(ncomp), 0, NULL, 3202 get_barycentric(ctx, IJ_PERSP_PIXEL), 0); 3203 sam->prefetch.input_offset = ir3_nir_coord_offset(tex->src[idx].src.ssa); 3204 /* make sure not to add irrelevant flags like S2EN */ 3205 sam->flags = flags | (info.flags & IR3_INSTR_B); 3206 sam->prefetch.tex = info.tex_idx; 3207 sam->prefetch.samp = info.samp_idx; 3208 sam->prefetch.tex_base = info.tex_base; 3209 sam->prefetch.samp_base = info.samp_base; 3210 } else { 3211 info.flags |= flags; 3212 sam = emit_sam(ctx, opc, info, type, MASK(ncomp), col0, col1); 3213 } 3214 3215 if (tg4_swizzle_fixup) { 3216 /* TODO: fix-up for ASTC when alpha is selected? */ 3217 array_insert(ctx->ir, ctx->ir->tg4, sam); 3218 3219 ir3_split_dest(b, dst, sam, 0, 4); 3220 3221 uint8_t tex_bits = ctx->sampler_swizzles[tex->texture_index] >> 12; 3222 if (!type_float(type) && tex_bits != 3 /* 32bpp */ && 3223 tex_bits != 0 /* key unset */) { 3224 uint8_t bits = 0; 3225 switch (tex_bits) { 3226 case 1: /* 8bpp */ 3227 bits = 8; 3228 break; 3229 case 2: /* 16bpp */ 3230 bits = 16; 3231 break; 3232 case 4: /* 10bpp or 2bpp for alpha */ 3233 if (opc == OPC_GATHER4A) 3234 bits = 2; 3235 else 3236 bits = 10; 3237 break; 3238 default: 3239 assert(0); 3240 } 3241 3242 sam->cat5.type = TYPE_F32; 3243 for (int i = 0; i < 4; i++) { 3244 /* scale and offset the unorm data */ 3245 dst[i] = ir3_MAD_F32(b, dst[i], 0, create_immed(b, fui((1 << bits) - 1)), 0, create_immed(b, fui(0.5f)), 0); 3246 /* convert the scaled value to integer */ 3247 dst[i] = ir3_COV(b, dst[i], TYPE_F32, TYPE_U32); 3248 /* sign extend for signed values */ 3249 if (type == TYPE_S32) { 3250 dst[i] = ir3_SHL_B(b, dst[i], 0, create_immed(b, 32 - bits), 0); 3251 dst[i] = ir3_ASHR_B(b, dst[i], 0, create_immed(b, 32 - bits), 0); 3252 } 3253 } 3254 } 3255 } else if ((ctx->astc_srgb & (1 << tex->texture_index)) && 3256 tex->op != nir_texop_tg4 && /* leave out tg4, unless it's on alpha? */ 3257 !nir_tex_instr_is_query(tex)) { 3258 assert(opc != OPC_META_TEX_PREFETCH); 3259 3260 /* only need first 3 components: */ 3261 sam->dsts[0]->wrmask = 0x7; 3262 ir3_split_dest(b, dst, sam, 0, 3); 3263 3264 /* we need to sample the alpha separately with a non-SRGB 3265 * texture state: 3266 */ 3267 sam = ir3_SAM(b, opc, type, 0b1000, flags | info.flags, info.samp_tex, 3268 col0, col1); 3269 3270 array_insert(ctx->ir, ctx->ir->astc_srgb, sam); 3271 3272 /* fixup .w component: */ 3273 ir3_split_dest(b, &dst[3], sam, 3, 1); 3274 } else { 3275 /* normal (non-workaround) case: */ 3276 ir3_split_dest(b, dst, sam, 0, ncomp); 3277 } 3278 3279 /* GETLOD returns results in 4.8 fixed point */ 3280 if (opc == OPC_GETLOD) { 3281 bool half = nir_dest_bit_size(tex->dest) == 16; 3282 struct ir3_instruction *factor = 3283 half ? create_immed_typed(b, _mesa_float_to_half(1.0 / 256), TYPE_F16) 3284 : create_immed(b, fui(1.0 / 256)); 3285 3286 for (i = 0; i < 2; i++) { 3287 dst[i] = ir3_MUL_F( 3288 b, ir3_COV(b, dst[i], TYPE_S32, half ? 
TYPE_F16 : TYPE_F32), 0, 3289 factor, 0); 3290 } 3291 } 3292 3293 ir3_put_dst(ctx, &tex->dest); 3294} 3295 3296static void 3297emit_tex_info(struct ir3_context *ctx, nir_tex_instr *tex, unsigned idx) 3298{ 3299 struct ir3_block *b = ctx->block; 3300 struct ir3_instruction **dst, *sam; 3301 type_t dst_type = get_tex_dest_type(tex); 3302 struct tex_src_info info = get_tex_samp_tex_src(ctx, tex); 3303 3304 dst = ir3_get_dst(ctx, &tex->dest, 1); 3305 3306 sam = emit_sam(ctx, OPC_GETINFO, info, dst_type, 1 << idx, NULL, NULL); 3307 3308 /* even though there is only one component, since it ends 3309 * up in .y/.z/.w rather than .x, we need a split_dest() 3310 */ 3311 ir3_split_dest(b, dst, sam, idx, 1); 3312 3313 /* The # of levels comes from getinfo.z. We need to add 1 to it, since 3314 * the value in TEX_CONST_0 is zero-based. 3315 */ 3316 if (ctx->compiler->levels_add_one) 3317 dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0); 3318 3319 ir3_put_dst(ctx, &tex->dest); 3320} 3321 3322static void 3323emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex) 3324{ 3325 struct ir3_block *b = ctx->block; 3326 struct ir3_instruction **dst, *sam; 3327 struct ir3_instruction *lod; 3328 unsigned flags, coords; 3329 type_t dst_type = get_tex_dest_type(tex); 3330 struct tex_src_info info = get_tex_samp_tex_src(ctx, tex); 3331 3332 tex_info(tex, &flags, &coords); 3333 info.flags |= flags; 3334 3335 /* Actually we want the number of dimensions, not coordinates. This 3336 * distinction only matters for cubes. 3337 */ 3338 if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE) 3339 coords = 2; 3340 3341 dst = ir3_get_dst(ctx, &tex->dest, 4); 3342 3343 int lod_idx = nir_tex_instr_src_index(tex, nir_tex_src_lod); 3344 compile_assert(ctx, lod_idx >= 0); 3345 3346 lod = ir3_get_src(ctx, &tex->src[lod_idx].src)[0]; 3347 3348 if (tex->sampler_dim != GLSL_SAMPLER_DIM_BUF) { 3349 sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL); 3350 } else { 3351 /* 3352 * The maximum value which OPC_GETSIZE could return for one dimension 3353 * is 0x007ff0, however sampler buffer could be much bigger. 3354 * Blob uses OPC_GETBUF for them. 3355 */ 3356 sam = emit_sam(ctx, OPC_GETBUF, info, dst_type, 0b1111, NULL, NULL); 3357 } 3358 3359 ir3_split_dest(b, dst, sam, 0, 4); 3360 3361 /* Array size actually ends up in .w rather than .z. This doesn't 3362 * matter for miplevel 0, but for higher mips the value in z is 3363 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is 3364 * returned, which means that we have to add 1 to it for arrays. 3365 */ 3366 if (tex->is_array) { 3367 if (ctx->compiler->levels_add_one) { 3368 dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0); 3369 } else { 3370 dst[coords] = ir3_MOV(b, dst[3], TYPE_U32); 3371 } 3372 } 3373 3374 ir3_put_dst(ctx, &tex->dest); 3375} 3376 3377/* phi instructions are left partially constructed. We don't resolve 3378 * their srcs until the end of the shader, since (eg. loops) one of 3379 * the phi's srcs might be defined after the phi due to back edges in 3380 * the CFG. 
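 * resolve_phis() below walks each block's predecessors once all blocks have
 * been emitted and fills in the real sources, substituting an undef src
 * where the NIR source was an ssa_undef.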
3381 */ 3382static void 3383emit_phi(struct ir3_context *ctx, nir_phi_instr *nphi) 3384{ 3385 struct ir3_instruction *phi, **dst; 3386 3387 /* NOTE: phi's should be lowered to scalar at this point */ 3388 compile_assert(ctx, nphi->dest.ssa.num_components == 1); 3389 3390 dst = ir3_get_dst(ctx, &nphi->dest, 1); 3391 3392 phi = ir3_instr_create(ctx->block, OPC_META_PHI, 1, 3393 exec_list_length(&nphi->srcs)); 3394 __ssa_dst(phi); 3395 phi->phi.nphi = nphi; 3396 3397 dst[0] = phi; 3398 3399 ir3_put_dst(ctx, &nphi->dest); 3400} 3401 3402static struct ir3_block *get_block(struct ir3_context *ctx, 3403 const nir_block *nblock); 3404 3405static struct ir3_instruction * 3406read_phi_src(struct ir3_context *ctx, struct ir3_block *blk, 3407 struct ir3_instruction *phi, nir_phi_instr *nphi) 3408{ 3409 if (!blk->nblock) { 3410 struct ir3_instruction *continue_phi = 3411 ir3_instr_create(blk, OPC_META_PHI, 1, blk->predecessors_count); 3412 __ssa_dst(continue_phi)->flags = phi->dsts[0]->flags; 3413 3414 for (unsigned i = 0; i < blk->predecessors_count; i++) { 3415 struct ir3_instruction *src = 3416 read_phi_src(ctx, blk->predecessors[i], phi, nphi); 3417 if (src) 3418 __ssa_src(continue_phi, src, 0); 3419 else 3420 ir3_src_create(continue_phi, INVALID_REG, phi->dsts[0]->flags); 3421 } 3422 3423 return continue_phi; 3424 } 3425 3426 nir_foreach_phi_src (nsrc, nphi) { 3427 if (blk->nblock == nsrc->pred) { 3428 if (nsrc->src.ssa->parent_instr->type == nir_instr_type_ssa_undef) { 3429 /* Create an ir3 undef */ 3430 return NULL; 3431 } else { 3432 return ir3_get_src(ctx, &nsrc->src)[0]; 3433 } 3434 } 3435 } 3436 3437 unreachable("couldn't find phi node ir3 block"); 3438 return NULL; 3439} 3440 3441static void 3442resolve_phis(struct ir3_context *ctx, struct ir3_block *block) 3443{ 3444 foreach_instr (phi, &block->instr_list) { 3445 if (phi->opc != OPC_META_PHI) 3446 break; 3447 3448 nir_phi_instr *nphi = phi->phi.nphi; 3449 3450 if (!nphi) /* skip continue phis created above */ 3451 continue; 3452 3453 for (unsigned i = 0; i < block->predecessors_count; i++) { 3454 struct ir3_block *pred = block->predecessors[i]; 3455 struct ir3_instruction *src = read_phi_src(ctx, pred, phi, nphi); 3456 if (src) { 3457 __ssa_src(phi, src, 0); 3458 } else { 3459 /* Create an ir3 undef */ 3460 ir3_src_create(phi, INVALID_REG, phi->dsts[0]->flags); 3461 } 3462 } 3463 } 3464} 3465 3466static void 3467emit_jump(struct ir3_context *ctx, nir_jump_instr *jump) 3468{ 3469 switch (jump->type) { 3470 case nir_jump_break: 3471 case nir_jump_continue: 3472 case nir_jump_return: 3473 /* I *think* we can simply just ignore this, and use the 3474 * successor block link to figure out where we need to 3475 * jump to for break/continue 3476 */ 3477 break; 3478 default: 3479 ir3_context_error(ctx, "Unhandled NIR jump type: %d\n", jump->type); 3480 break; 3481 } 3482} 3483 3484static void 3485emit_instr(struct ir3_context *ctx, nir_instr *instr) 3486{ 3487 switch (instr->type) { 3488 case nir_instr_type_alu: 3489 emit_alu(ctx, nir_instr_as_alu(instr)); 3490 break; 3491 case nir_instr_type_deref: 3492 /* ignored, handled as part of the intrinsic they are src to */ 3493 break; 3494 case nir_instr_type_intrinsic: 3495 emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); 3496 break; 3497 case nir_instr_type_load_const: 3498 emit_load_const(ctx, nir_instr_as_load_const(instr)); 3499 break; 3500 case nir_instr_type_ssa_undef: 3501 emit_undef(ctx, nir_instr_as_ssa_undef(instr)); 3502 break; 3503 case nir_instr_type_tex: { 3504 nir_tex_instr *tex = 
nir_instr_as_tex(instr); 3505 /* couple tex instructions get special-cased: 3506 */ 3507 switch (tex->op) { 3508 case nir_texop_txs: 3509 emit_tex_txs(ctx, tex); 3510 break; 3511 case nir_texop_query_levels: 3512 emit_tex_info(ctx, tex, 2); 3513 break; 3514 case nir_texop_texture_samples: 3515 emit_tex_info(ctx, tex, 3); 3516 break; 3517 default: 3518 emit_tex(ctx, tex); 3519 break; 3520 } 3521 break; 3522 } 3523 case nir_instr_type_jump: 3524 emit_jump(ctx, nir_instr_as_jump(instr)); 3525 break; 3526 case nir_instr_type_phi: 3527 emit_phi(ctx, nir_instr_as_phi(instr)); 3528 break; 3529 case nir_instr_type_call: 3530 case nir_instr_type_parallel_copy: 3531 ir3_context_error(ctx, "Unhandled NIR instruction type: %d\n", 3532 instr->type); 3533 break; 3534 } 3535} 3536 3537static struct ir3_block * 3538get_block(struct ir3_context *ctx, const nir_block *nblock) 3539{ 3540 struct ir3_block *block; 3541 struct hash_entry *hentry; 3542 3543 hentry = _mesa_hash_table_search(ctx->block_ht, nblock); 3544 if (hentry) 3545 return hentry->data; 3546 3547 block = ir3_block_create(ctx->ir); 3548 block->nblock = nblock; 3549 _mesa_hash_table_insert(ctx->block_ht, nblock, block); 3550 3551 return block; 3552} 3553 3554static struct ir3_block * 3555get_block_or_continue(struct ir3_context *ctx, const nir_block *nblock) 3556{ 3557 struct hash_entry *hentry; 3558 3559 hentry = _mesa_hash_table_search(ctx->continue_block_ht, nblock); 3560 if (hentry) 3561 return hentry->data; 3562 3563 return get_block(ctx, nblock); 3564} 3565 3566static struct ir3_block * 3567create_continue_block(struct ir3_context *ctx, const nir_block *nblock) 3568{ 3569 struct ir3_block *block = ir3_block_create(ctx->ir); 3570 block->nblock = NULL; 3571 _mesa_hash_table_insert(ctx->continue_block_ht, nblock, block); 3572 return block; 3573} 3574 3575static void 3576emit_block(struct ir3_context *ctx, nir_block *nblock) 3577{ 3578 ctx->block = get_block(ctx, nblock); 3579 3580 list_addtail(&ctx->block->node, &ctx->ir->block_list); 3581 3582 ctx->block->loop_id = ctx->loop_id; 3583 ctx->block->loop_depth = ctx->loop_depth; 3584 3585 /* re-emit addr register in each block if needed: */ 3586 for (int i = 0; i < ARRAY_SIZE(ctx->addr0_ht); i++) { 3587 _mesa_hash_table_destroy(ctx->addr0_ht[i], NULL); 3588 ctx->addr0_ht[i] = NULL; 3589 } 3590 3591 _mesa_hash_table_u64_destroy(ctx->addr1_ht); 3592 ctx->addr1_ht = NULL; 3593 3594 nir_foreach_instr (instr, nblock) { 3595 ctx->cur_instr = instr; 3596 emit_instr(ctx, instr); 3597 ctx->cur_instr = NULL; 3598 if (ctx->error) 3599 return; 3600 } 3601 3602 for (int i = 0; i < ARRAY_SIZE(ctx->block->successors); i++) { 3603 if (nblock->successors[i]) { 3604 ctx->block->successors[i] = 3605 get_block_or_continue(ctx, nblock->successors[i]); 3606 ctx->block->physical_successors[i] = ctx->block->successors[i]; 3607 } 3608 } 3609 3610 _mesa_hash_table_clear(ctx->sel_cond_conversions, NULL); 3611} 3612 3613static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list); 3614 3615static void 3616emit_if(struct ir3_context *ctx, nir_if *nif) 3617{ 3618 struct ir3_instruction *condition = ir3_get_src(ctx, &nif->condition)[0]; 3619 3620 if (condition->opc == OPC_ANY_MACRO && condition->block == ctx->block) { 3621 ctx->block->condition = ssa(condition->srcs[0]); 3622 ctx->block->brtype = IR3_BRANCH_ANY; 3623 } else if (condition->opc == OPC_ALL_MACRO && 3624 condition->block == ctx->block) { 3625 ctx->block->condition = ssa(condition->srcs[0]); 3626 ctx->block->brtype = IR3_BRANCH_ALL; 3627 } else if 
(condition->opc == OPC_ELECT_MACRO &&
              condition->block == ctx->block) {
      ctx->block->condition = NULL;
      ctx->block->brtype = IR3_BRANCH_GETONE;
   } else if (condition->opc == OPC_SHPS_MACRO &&
              condition->block == ctx->block) {
      /* TODO: technically this only works if the block is the only user of the
       * shps, but we only use it in very constrained scenarios so this should
       * be ok.
       */
      ctx->block->condition = NULL;
      ctx->block->brtype = IR3_BRANCH_SHPS;
   } else {
      ctx->block->condition = ir3_get_predicate(ctx, condition);
      ctx->block->brtype = IR3_BRANCH_COND;
   }

   emit_cf_list(ctx, &nif->then_list);
   emit_cf_list(ctx, &nif->else_list);

   struct ir3_block *last_then = get_block(ctx, nir_if_last_then_block(nif));
   struct ir3_block *first_else = get_block(ctx, nir_if_first_else_block(nif));
   assert(last_then->physical_successors[0] &&
          !last_then->physical_successors[1]);
   last_then->physical_successors[1] = first_else;

   struct ir3_block *last_else = get_block(ctx, nir_if_last_else_block(nif));
   struct ir3_block *after_if =
      get_block(ctx, nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node)));
   assert(last_else->physical_successors[0] &&
          !last_else->physical_successors[1]);
   if (after_if != last_else->physical_successors[0])
      last_else->physical_successors[1] = after_if;
}

static void
emit_loop(struct ir3_context *ctx, nir_loop *nloop)
{
   unsigned old_loop_id = ctx->loop_id;
   ctx->loop_id = ctx->so->loops + 1;
   ctx->loop_depth++;

   struct nir_block *nstart = nir_loop_first_block(nloop);
   struct ir3_block *continue_blk = NULL;

   /* There's always one incoming edge from outside the loop, and if there
    * is more than one backedge from inside the loop (so more than 2 total
    * edges) then we need to create a continue block after the loop to ensure
    * that control reconverges at the end of each loop iteration.
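    *
    * A rough illustration (hypothetical shader, purely for explanation): a
    * loop body that ends with
    *
    *    if (cond) continue;   // first backedge to the loop header
    *    ...                   // falls through to the loop end, second backedge
    *
    * gives the header three predecessors (the preheader plus two backedges),
    * so the backedges get routed through a single continue block (see
    * get_block_or_continue()) which then branches back to the header.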
3676 */ 3677 if (nstart->predecessors->entries > 2) { 3678 continue_blk = create_continue_block(ctx, nstart); 3679 } 3680 3681 emit_cf_list(ctx, &nloop->body); 3682 3683 if (continue_blk) { 3684 struct ir3_block *start = get_block(ctx, nstart); 3685 continue_blk->successors[0] = start; 3686 continue_blk->physical_successors[0] = start; 3687 continue_blk->loop_id = ctx->loop_id; 3688 continue_blk->loop_depth = ctx->loop_depth; 3689 list_addtail(&continue_blk->node, &ctx->ir->block_list); 3690 } 3691 3692 ctx->so->loops++; 3693 ctx->loop_depth--; 3694 ctx->loop_id = old_loop_id; 3695} 3696 3697static void 3698stack_push(struct ir3_context *ctx) 3699{ 3700 ctx->stack++; 3701 ctx->max_stack = MAX2(ctx->max_stack, ctx->stack); 3702} 3703 3704static void 3705stack_pop(struct ir3_context *ctx) 3706{ 3707 compile_assert(ctx, ctx->stack > 0); 3708 ctx->stack--; 3709} 3710 3711static void 3712emit_cf_list(struct ir3_context *ctx, struct exec_list *list) 3713{ 3714 foreach_list_typed (nir_cf_node, node, node, list) { 3715 switch (node->type) { 3716 case nir_cf_node_block: 3717 emit_block(ctx, nir_cf_node_as_block(node)); 3718 break; 3719 case nir_cf_node_if: 3720 stack_push(ctx); 3721 emit_if(ctx, nir_cf_node_as_if(node)); 3722 stack_pop(ctx); 3723 break; 3724 case nir_cf_node_loop: 3725 stack_push(ctx); 3726 emit_loop(ctx, nir_cf_node_as_loop(node)); 3727 stack_pop(ctx); 3728 break; 3729 case nir_cf_node_function: 3730 ir3_context_error(ctx, "TODO\n"); 3731 break; 3732 } 3733 } 3734} 3735 3736/* emit stream-out code. At this point, the current block is the original 3737 * (nir) end block, and nir ensures that all flow control paths terminate 3738 * into the end block. We re-purpose the original end block to generate 3739 * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional 3740 * block holding stream-out write instructions, followed by the new end 3741 * block: 3742 * 3743 * blockOrigEnd { 3744 * p0.x = (vtxcnt < maxvtxcnt) 3745 * // succs: blockStreamOut, blockNewEnd 3746 * } 3747 * blockStreamOut { 3748 * // preds: blockOrigEnd 3749 * ... stream-out instructions ... 
3750 * // succs: blockNewEnd 3751 * } 3752 * blockNewEnd { 3753 * // preds: blockOrigEnd, blockStreamOut 3754 * } 3755 */ 3756static void 3757emit_stream_out(struct ir3_context *ctx) 3758{ 3759 struct ir3 *ir = ctx->ir; 3760 struct ir3_stream_output_info *strmout = &ctx->so->stream_output; 3761 struct ir3_block *orig_end_block, *stream_out_block, *new_end_block; 3762 struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond; 3763 struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS]; 3764 3765 /* create vtxcnt input in input block at top of shader, 3766 * so that it is seen as live over the entire duration 3767 * of the shader: 3768 */ 3769 vtxcnt = create_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, 0x1); 3770 maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX); 3771 3772 /* at this point, we are at the original 'end' block, 3773 * re-purpose this block to stream-out condition, then 3774 * append stream-out block and new-end block 3775 */ 3776 orig_end_block = ctx->block; 3777 3778 // maybe w/ store_global intrinsic, we could do this 3779 // stuff in nir->nir pass 3780 3781 stream_out_block = ir3_block_create(ir); 3782 list_addtail(&stream_out_block->node, &ir->block_list); 3783 3784 new_end_block = ir3_block_create(ir); 3785 list_addtail(&new_end_block->node, &ir->block_list); 3786 3787 orig_end_block->successors[0] = stream_out_block; 3788 orig_end_block->successors[1] = new_end_block; 3789 3790 orig_end_block->physical_successors[0] = stream_out_block; 3791 orig_end_block->physical_successors[1] = new_end_block; 3792 3793 stream_out_block->successors[0] = new_end_block; 3794 3795 stream_out_block->physical_successors[0] = new_end_block; 3796 3797 /* setup 'if (vtxcnt < maxvtxcnt)' condition: */ 3798 cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0); 3799 cond->dsts[0]->num = regid(REG_P0, 0); 3800 cond->dsts[0]->flags &= ~IR3_REG_SSA; 3801 cond->cat2.condition = IR3_COND_LT; 3802 3803 /* condition goes on previous block to the conditional, 3804 * since it is used to pick which of the two successor 3805 * paths to take: 3806 */ 3807 orig_end_block->condition = cond; 3808 3809 /* switch to stream_out_block to generate the stream-out 3810 * instructions: 3811 */ 3812 ctx->block = stream_out_block; 3813 3814 /* Calculate base addresses based on vtxcnt. Instructions 3815 * generated for bases not used in following loop will be 3816 * stripped out in the backend. 
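    *
    * Roughly speaking, the loop below computes, per stream-out buffer i:
    *
    *    bases[i] = tfbo_base[i] + vtxcnt * stride[i] * 4
    *
    * where tfbo_base[i] is the uniform loaded from component i of the
    * const at const_state->offsets.tfbo.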
3817 */ 3818 for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) { 3819 const struct ir3_const_state *const_state = ir3_const_state(ctx->so); 3820 unsigned stride = strmout->stride[i]; 3821 struct ir3_instruction *base, *off; 3822 3823 base = create_uniform(ctx->block, regid(const_state->offsets.tfbo, i)); 3824 3825 /* 24-bit should be enough: */ 3826 off = ir3_MUL_U24(ctx->block, vtxcnt, 0, 3827 create_immed(ctx->block, stride * 4), 0); 3828 3829 bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0); 3830 } 3831 3832 /* Generate the per-output store instructions: */ 3833 for (unsigned i = 0; i < strmout->num_outputs; i++) { 3834 for (unsigned j = 0; j < strmout->output[i].num_components; j++) { 3835 unsigned c = j + strmout->output[i].start_component; 3836 struct ir3_instruction *base, *out, *stg; 3837 3838 base = bases[strmout->output[i].output_buffer]; 3839 out = ctx->outputs[regid(strmout->output[i].register_index, c)]; 3840 3841 stg = ir3_STG( 3842 ctx->block, base, 0, 3843 create_immed(ctx->block, (strmout->output[i].dst_offset + j) * 4), 3844 0, out, 0, create_immed(ctx->block, 1), 0); 3845 stg->cat6.type = TYPE_U32; 3846 3847 array_insert(ctx->block, ctx->block->keeps, stg); 3848 } 3849 } 3850 3851 /* and finally switch to the new_end_block: */ 3852 ctx->block = new_end_block; 3853} 3854 3855static void 3856setup_predecessors(struct ir3 *ir) 3857{ 3858 foreach_block (block, &ir->block_list) { 3859 for (int i = 0; i < ARRAY_SIZE(block->successors); i++) { 3860 if (block->successors[i]) 3861 ir3_block_add_predecessor(block->successors[i], block); 3862 if (block->physical_successors[i]) 3863 ir3_block_add_physical_predecessor(block->physical_successors[i], 3864 block); 3865 } 3866 } 3867} 3868 3869static void 3870emit_function(struct ir3_context *ctx, nir_function_impl *impl) 3871{ 3872 nir_metadata_require(impl, nir_metadata_block_index); 3873 3874 compile_assert(ctx, ctx->stack == 0); 3875 3876 emit_cf_list(ctx, &impl->body); 3877 emit_block(ctx, impl->end_block); 3878 3879 compile_assert(ctx, ctx->stack == 0); 3880 3881 /* at this point, we should have a single empty block, 3882 * into which we emit the 'end' instruction. 3883 */ 3884 compile_assert(ctx, list_is_empty(&ctx->block->instr_list)); 3885 3886 /* If stream-out (aka transform-feedback) enabled, emit the 3887 * stream-out instructions, followed by a new empty block (into 3888 * which the 'end' instruction lands). 3889 * 3890 * NOTE: it is done in this order, rather than inserting before 3891 * we emit end_block, because NIR guarantees that all blocks 3892 * flow into end_block, and that end_block has no successors. 3893 * So by re-purposing end_block as the first block of stream- 3894 * out, we guarantee that all exit paths flow into the stream- 3895 * out instructions. 3896 */ 3897 if ((ctx->compiler->gen < 5) && 3898 (ctx->so->stream_output.num_outputs > 0) && 3899 !ctx->so->binning_pass) { 3900 assert(ctx->so->type == MESA_SHADER_VERTEX); 3901 emit_stream_out(ctx); 3902 } 3903 3904 setup_predecessors(ctx->ir); 3905 foreach_block (block, &ctx->ir->block_list) { 3906 resolve_phis(ctx, block); 3907 } 3908} 3909 3910static void 3911setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr) 3912{ 3913 struct ir3_shader_variant *so = ctx->so; 3914 struct ir3_instruction *coord = NULL; 3915 3916 if (intr->intrinsic == nir_intrinsic_load_interpolated_input) 3917 coord = ir3_create_collect(ctx->block, ir3_get_src(ctx, &intr->src[0]), 2); 3918 3919 compile_assert(ctx, nir_src_is_const(intr->src[coord ? 
1 : 0])); 3920 3921 unsigned frac = nir_intrinsic_component(intr); 3922 unsigned offset = nir_src_as_uint(intr->src[coord ? 1 : 0]); 3923 unsigned ncomp = nir_intrinsic_dest_components(intr); 3924 unsigned n = nir_intrinsic_base(intr) + offset; 3925 unsigned slot = nir_intrinsic_io_semantics(intr).location + offset; 3926 unsigned compmask; 3927 3928 /* Inputs are loaded using ldlw or ldg for other stages. */ 3929 compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT || 3930 ctx->so->type == MESA_SHADER_VERTEX); 3931 3932 if (ctx->so->type == MESA_SHADER_FRAGMENT) 3933 compmask = BITFIELD_MASK(ncomp) << frac; 3934 else 3935 compmask = BITFIELD_MASK(ncomp + frac); 3936 3937 /* for a4xx+ rasterflat */ 3938 if (so->inputs[n].rasterflat && ctx->so->key.rasterflat) 3939 coord = NULL; 3940 3941 so->total_in += util_bitcount(compmask & ~so->inputs[n].compmask); 3942 3943 so->inputs[n].slot = slot; 3944 so->inputs[n].compmask |= compmask; 3945 so->inputs_count = MAX2(so->inputs_count, n + 1); 3946 compile_assert(ctx, so->inputs_count < ARRAY_SIZE(so->inputs)); 3947 so->inputs[n].flat = !coord; 3948 3949 if (ctx->so->type == MESA_SHADER_FRAGMENT) { 3950 compile_assert(ctx, slot != VARYING_SLOT_POS); 3951 3952 so->inputs[n].bary = true; 3953 3954 for (int i = 0; i < ncomp; i++) { 3955 unsigned idx = (n * 4) + i + frac; 3956 ctx->last_dst[i] = create_frag_input(ctx, coord, idx); 3957 } 3958 } else { 3959 struct ir3_instruction *input = NULL; 3960 3961 foreach_input (in, ctx->ir) { 3962 if (in->input.inidx == n) { 3963 input = in; 3964 break; 3965 } 3966 } 3967 3968 if (!input) { 3969 input = create_input(ctx, compmask); 3970 input->input.inidx = n; 3971 } else { 3972 /* For aliased inputs, just append to the wrmask.. ie. if we 3973 * first see a vec2 index at slot N, and then later a vec4, 3974 * the wrmask of the resulting overlapped vec2 and vec4 is 0xf 3975 */ 3976 input->dsts[0]->wrmask |= compmask; 3977 } 3978 3979 for (int i = 0; i < ncomp + frac; i++) { 3980 unsigned idx = (n * 4) + i; 3981 compile_assert(ctx, idx < ctx->ninputs); 3982 3983 /* fixup the src wrmask to avoid validation fail */ 3984 if (ctx->inputs[idx] && (ctx->inputs[idx] != input)) { 3985 ctx->inputs[idx]->srcs[0]->wrmask = input->dsts[0]->wrmask; 3986 continue; 3987 } 3988 3989 ir3_split_dest(ctx->block, &ctx->inputs[idx], input, i, 1); 3990 } 3991 3992 for (int i = 0; i < ncomp; i++) { 3993 unsigned idx = (n * 4) + i + frac; 3994 ctx->last_dst[i] = ctx->inputs[idx]; 3995 } 3996 } 3997} 3998 3999/* Initially we assign non-packed inloc's for varyings, as we don't really 4000 * know up-front which components will be unused. After all the compilation 4001 * stages we scan the shader to see which components are actually used, and 4002 * re-pack the inlocs to eliminate unneeded varyings. 
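 *
 * A small, purely illustrative example: if after compilation the FS only
 * reads .xy of varying 0 and .x of varying 1, the scan below records
 * used-component masks 0x3 and 0x1, and the re-pack assigns inlocs 0..1 to
 * varying 0 and inloc 2 to varying 1 (instead of the initial 0..3 / 4..7
 * layout), with the bary.f/ldlv srcs and tex-prefetch offsets rewritten to
 * match.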
4003 */ 4004static void 4005pack_inlocs(struct ir3_context *ctx) 4006{ 4007 struct ir3_shader_variant *so = ctx->so; 4008 uint8_t used_components[so->inputs_count]; 4009 4010 memset(used_components, 0, sizeof(used_components)); 4011 4012 /* 4013 * First Step: scan shader to find which bary.f/ldlv remain: 4014 */ 4015 4016 foreach_block (block, &ctx->ir->block_list) { 4017 foreach_instr (instr, &block->instr_list) { 4018 if (is_input(instr)) { 4019 unsigned inloc = instr->srcs[0]->iim_val; 4020 unsigned i = inloc / 4; 4021 unsigned j = inloc % 4; 4022 4023 compile_assert(ctx, instr->srcs[0]->flags & IR3_REG_IMMED); 4024 compile_assert(ctx, i < so->inputs_count); 4025 4026 used_components[i] |= 1 << j; 4027 } else if (instr->opc == OPC_META_TEX_PREFETCH) { 4028 for (int n = 0; n < 2; n++) { 4029 unsigned inloc = instr->prefetch.input_offset + n; 4030 unsigned i = inloc / 4; 4031 unsigned j = inloc % 4; 4032 4033 compile_assert(ctx, i < so->inputs_count); 4034 4035 used_components[i] |= 1 << j; 4036 } 4037 } 4038 } 4039 } 4040 4041 /* 4042 * Second Step: reassign varying inloc/slots: 4043 */ 4044 4045 unsigned inloc = 0; 4046 4047 /* for clip+cull distances, unused components can't be eliminated because 4048 * they're read by fixed-function, even if there's a hole. Note that 4049 * clip/cull distance arrays must be declared in the FS, so we can just 4050 * use the NIR clip/cull distances to avoid reading ucp_enables in the 4051 * shader key. 4052 */ 4053 unsigned clip_cull_mask = so->clip_mask | so->cull_mask; 4054 4055 for (unsigned i = 0; i < so->inputs_count; i++) { 4056 unsigned compmask = 0, maxcomp = 0; 4057 4058 so->inputs[i].inloc = inloc; 4059 so->inputs[i].bary = false; 4060 4061 if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0 || 4062 so->inputs[i].slot == VARYING_SLOT_CLIP_DIST1) { 4063 if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0) 4064 compmask = clip_cull_mask & 0xf; 4065 else 4066 compmask = clip_cull_mask >> 4; 4067 used_components[i] = compmask; 4068 } 4069 4070 for (unsigned j = 0; j < 4; j++) { 4071 if (!(used_components[i] & (1 << j))) 4072 continue; 4073 4074 compmask |= (1 << j); 4075 maxcomp = j + 1; 4076 4077 /* at this point, since used_components[i] mask is only 4078 * considering varyings (ie. 
not sysvals) we know this
          * is a varying:
          */
         so->inputs[i].bary = true;
      }

      if (so->inputs[i].bary) {
         so->varying_in++;
         so->inputs[i].compmask = (1 << maxcomp) - 1;
         inloc += maxcomp;
      }
   }

   /*
    * Third Step: reassign packed inloc's:
    */

   foreach_block (block, &ctx->ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (is_input(instr)) {
            unsigned inloc = instr->srcs[0]->iim_val;
            unsigned i = inloc / 4;
            unsigned j = inloc % 4;

            instr->srcs[0]->iim_val = so->inputs[i].inloc + j;
         } else if (instr->opc == OPC_META_TEX_PREFETCH) {
            unsigned i = instr->prefetch.input_offset / 4;
            unsigned j = instr->prefetch.input_offset % 4;
            instr->prefetch.input_offset = so->inputs[i].inloc + j;
         }
      }
   }
}

static void
setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_shader_variant *so = ctx->so;
   nir_io_semantics io = nir_intrinsic_io_semantics(intr);

   compile_assert(ctx, nir_src_is_const(intr->src[1]));

   unsigned offset = nir_src_as_uint(intr->src[1]);
   unsigned n = nir_intrinsic_base(intr) + offset;
   unsigned frac = nir_intrinsic_component(intr);
   unsigned ncomp = nir_intrinsic_src_components(intr, 0);

   /* For per-view variables, each user-facing slot corresponds to multiple
    * views, each with a corresponding driver_location, and the offset is for
    * the driver_location. To properly figure out the slot, we'd need to
    * plumb through the number of views. However, for now we only use
    * per-view with gl_Position, so we assume that the variable is not an
    * array or matrix (so there are no indirect accesses to the variable
    * itself) and the indirect offset corresponds to the view.
    */
   unsigned slot = io.location + (io.per_view ?
0 : offset); 4134 4135 if (ctx->so->type == MESA_SHADER_FRAGMENT) { 4136 switch (slot) { 4137 case FRAG_RESULT_DEPTH: 4138 so->writes_pos = true; 4139 break; 4140 case FRAG_RESULT_COLOR: 4141 if (!ctx->s->info.fs.color_is_dual_source) { 4142 so->color0_mrt = 1; 4143 } else { 4144 slot = FRAG_RESULT_DATA0 + io.dual_source_blend_index; 4145 } 4146 break; 4147 case FRAG_RESULT_SAMPLE_MASK: 4148 so->writes_smask = true; 4149 break; 4150 case FRAG_RESULT_STENCIL: 4151 so->writes_stencilref = true; 4152 break; 4153 default: 4154 slot += io.dual_source_blend_index; /* For dual-src blend */ 4155 if (slot >= FRAG_RESULT_DATA0) 4156 break; 4157 ir3_context_error(ctx, "unknown FS output name: %s\n", 4158 gl_frag_result_name(slot)); 4159 } 4160 } else if (ctx->so->type == MESA_SHADER_VERTEX || 4161 ctx->so->type == MESA_SHADER_TESS_EVAL || 4162 ctx->so->type == MESA_SHADER_GEOMETRY) { 4163 switch (slot) { 4164 case VARYING_SLOT_POS: 4165 so->writes_pos = true; 4166 break; 4167 case VARYING_SLOT_PSIZ: 4168 so->writes_psize = true; 4169 break; 4170 case VARYING_SLOT_PRIMITIVE_ID: 4171 case VARYING_SLOT_GS_VERTEX_FLAGS_IR3: 4172 assert(ctx->so->type == MESA_SHADER_GEOMETRY); 4173 FALLTHROUGH; 4174 case VARYING_SLOT_COL0: 4175 case VARYING_SLOT_COL1: 4176 case VARYING_SLOT_BFC0: 4177 case VARYING_SLOT_BFC1: 4178 case VARYING_SLOT_FOGC: 4179 case VARYING_SLOT_CLIP_DIST0: 4180 case VARYING_SLOT_CLIP_DIST1: 4181 case VARYING_SLOT_CLIP_VERTEX: 4182 case VARYING_SLOT_LAYER: 4183 case VARYING_SLOT_VIEWPORT: 4184 break; 4185 default: 4186 if (slot >= VARYING_SLOT_VAR0) 4187 break; 4188 if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7)) 4189 break; 4190 ir3_context_error(ctx, "unknown %s shader output name: %s\n", 4191 _mesa_shader_stage_to_string(ctx->so->type), 4192 gl_varying_slot_name_for_stage(slot, ctx->so->type)); 4193 } 4194 } else { 4195 ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type); 4196 } 4197 4198 so->outputs_count = MAX2(so->outputs_count, n + 1); 4199 compile_assert(ctx, so->outputs_count <= ARRAY_SIZE(so->outputs)); 4200 4201 so->outputs[n].slot = slot; 4202 if (io.per_view) 4203 so->outputs[n].view = offset; 4204 4205 for (int i = 0; i < ncomp; i++) { 4206 unsigned idx = (n * 4) + i + frac; 4207 compile_assert(ctx, idx < ctx->noutputs); 4208 ctx->outputs[idx] = create_immed(ctx->block, fui(0.0)); 4209 } 4210 4211 /* if varying packing doesn't happen, we could end up in a situation 4212 * with "holes" in the output, and since the per-generation code that 4213 * sets up varying linkage registers doesn't expect to have more than 4214 * one varying per vec4 slot, pad the holes. 4215 * 4216 * Note that this should probably generate a performance warning of 4217 * some sort. 
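    *
    * For example (hypothetical case): a vec2 output stored at component
    * base 2 (frac == 2) leaves .xy of the vec4 slot unwritten, so the loop
    * below fills any still-unwritten components before frac with an
    * immediate 0.0.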
4218 */ 4219 for (int i = 0; i < frac; i++) { 4220 unsigned idx = (n * 4) + i; 4221 if (!ctx->outputs[idx]) { 4222 ctx->outputs[idx] = create_immed(ctx->block, fui(0.0)); 4223 } 4224 } 4225 4226 struct ir3_instruction *const *src = ir3_get_src(ctx, &intr->src[0]); 4227 for (int i = 0; i < ncomp; i++) { 4228 unsigned idx = (n * 4) + i + frac; 4229 ctx->outputs[idx] = src[i]; 4230 } 4231} 4232 4233static bool 4234uses_load_input(struct ir3_shader_variant *so) 4235{ 4236 return so->type == MESA_SHADER_VERTEX || so->type == MESA_SHADER_FRAGMENT; 4237} 4238 4239static bool 4240uses_store_output(struct ir3_shader_variant *so) 4241{ 4242 switch (so->type) { 4243 case MESA_SHADER_VERTEX: 4244 return !so->key.has_gs && !so->key.tessellation; 4245 case MESA_SHADER_TESS_EVAL: 4246 return !so->key.has_gs; 4247 case MESA_SHADER_GEOMETRY: 4248 case MESA_SHADER_FRAGMENT: 4249 return true; 4250 case MESA_SHADER_TESS_CTRL: 4251 case MESA_SHADER_COMPUTE: 4252 case MESA_SHADER_KERNEL: 4253 return false; 4254 default: 4255 unreachable("unknown stage"); 4256 } 4257} 4258 4259static void 4260emit_instructions(struct ir3_context *ctx) 4261{ 4262 nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s); 4263 4264 /* some varying setup which can't be done in setup_input(): */ 4265 if (ctx->so->type == MESA_SHADER_FRAGMENT) { 4266 nir_foreach_shader_in_variable (var, ctx->s) { 4267 /* if any varyings have 'sample' qualifer, that triggers us 4268 * to run in per-sample mode: 4269 */ 4270 if (var->data.sample) 4271 ctx->so->per_samp = true; 4272 4273 /* set rasterflat flag for front/back color */ 4274 if (var->data.interpolation == INTERP_MODE_NONE) { 4275 switch (var->data.location) { 4276 case VARYING_SLOT_COL0: 4277 case VARYING_SLOT_COL1: 4278 case VARYING_SLOT_BFC0: 4279 case VARYING_SLOT_BFC1: 4280 ctx->so->inputs[var->data.driver_location].rasterflat = true; 4281 break; 4282 default: 4283 break; 4284 } 4285 } 4286 } 4287 } 4288 4289 if (uses_load_input(ctx->so)) { 4290 ctx->so->inputs_count = ctx->s->num_inputs; 4291 compile_assert(ctx, ctx->so->inputs_count < ARRAY_SIZE(ctx->so->inputs)); 4292 ctx->ninputs = ctx->s->num_inputs * 4; 4293 ctx->inputs = rzalloc_array(ctx, struct ir3_instruction *, ctx->ninputs); 4294 } else { 4295 ctx->ninputs = 0; 4296 ctx->so->inputs_count = 0; 4297 } 4298 4299 if (uses_store_output(ctx->so)) { 4300 ctx->noutputs = ctx->s->num_outputs * 4; 4301 ctx->outputs = 4302 rzalloc_array(ctx, struct ir3_instruction *, ctx->noutputs); 4303 } else { 4304 ctx->noutputs = 0; 4305 } 4306 4307 ctx->ir = ir3_create(ctx->compiler, ctx->so); 4308 4309 /* Create inputs in first block: */ 4310 ctx->block = get_block(ctx, nir_start_block(fxn)); 4311 ctx->in_block = ctx->block; 4312 4313 /* for fragment shader, the vcoord input register is used as the 4314 * base for bary.f varying fetch instrs: 4315 * 4316 * TODO defer creating ctx->ij_pixel and corresponding sysvals 4317 * until emit_intrinsic when we know they are actually needed. 4318 * For now, we defer creating ctx->ij_centroid, etc, since we 4319 * only need ij_pixel for "old style" varying inputs (ie. 
    * tgsi_to_nir)
    */
   if (ctx->so->type == MESA_SHADER_FRAGMENT) {
      ctx->ij[IJ_PERSP_PIXEL] = create_input(ctx, 0x3);
   }

   /* Defer add_sysval_input() stuff until after setup_inputs(),
    * because sysvals need to be appended after varyings:
    */
   if (ctx->ij[IJ_PERSP_PIXEL]) {
      add_sysval_input_compmask(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL, 0x3,
                                ctx->ij[IJ_PERSP_PIXEL]);
   }

   /* Tessellation shaders always need primitive ID for indexing the
    * BO. Geometry shaders don't always need it but when they do it has to be
    * delivered and unclobbered in the VS. To make things easy, we always
    * make room for it in VS/DS.
    */
   bool has_tess = ctx->so->key.tessellation != IR3_TESS_NONE;
   bool has_gs = ctx->so->key.has_gs;
   switch (ctx->so->type) {
   case MESA_SHADER_VERTEX:
      if (has_tess) {
         ctx->tcs_header =
            create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
         ctx->rel_patch_id =
            create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
         ctx->primitive_id =
            create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
      } else if (has_gs) {
         ctx->gs_header =
            create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
         ctx->primitive_id =
            create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
      }
      break;
   case MESA_SHADER_TESS_CTRL:
      ctx->tcs_header =
         create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
      ctx->rel_patch_id =
         create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
      break;
   case MESA_SHADER_TESS_EVAL:
      if (has_gs) {
         ctx->gs_header =
            create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
         ctx->primitive_id =
            create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
      }
      ctx->rel_patch_id =
         create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
      break;
   case MESA_SHADER_GEOMETRY:
      ctx->gs_header =
         create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
      break;
   default:
      break;
   }

   /* Find # of samplers. Just assume that we'll be reading from images.. if
    * it is write-only we don't have to count it, but after lowering derefs
    * it is too late to compact indices for that.
    */
   ctx->so->num_samp =
      BITSET_LAST_BIT(ctx->s->info.textures_used) + ctx->s->info.num_images;

   /* Save off clip+cull information.
Note that in OpenGL clip planes may 4389 * be individually enabled/disabled, and some gens handle lowering in 4390 * backend, so we also need to consider the shader key: 4391 */ 4392 ctx->so->clip_mask = ctx->so->key.ucp_enables | 4393 MASK(ctx->s->info.clip_distance_array_size); 4394 ctx->so->cull_mask = MASK(ctx->s->info.cull_distance_array_size) 4395 << ctx->s->info.clip_distance_array_size; 4396 4397 ctx->so->pvtmem_size = ctx->s->scratch_size; 4398 ctx->so->shared_size = ctx->s->info.shared_size; 4399 4400 /* NOTE: need to do something more clever when we support >1 fxn */ 4401 nir_foreach_register (reg, &fxn->registers) { 4402 ir3_declare_array(ctx, reg); 4403 } 4404 4405 if (ctx->so->type == MESA_SHADER_TESS_CTRL && 4406 ctx->compiler->tess_use_shared) { 4407 struct ir3_instruction *barrier = ir3_BAR(ctx->block); 4408 barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY; 4409 barrier->barrier_class = IR3_BARRIER_EVERYTHING; 4410 array_insert(ctx->block, ctx->block->keeps, barrier); 4411 ctx->so->has_barrier = true; 4412 } 4413 4414 /* And emit the body: */ 4415 ctx->impl = fxn; 4416 emit_function(ctx, fxn); 4417} 4418 4419/* Fixup tex sampler state for astc/srgb workaround instructions. We 4420 * need to assign the tex state indexes for these after we know the 4421 * max tex index. 4422 */ 4423static void 4424fixup_astc_srgb(struct ir3_context *ctx) 4425{ 4426 struct ir3_shader_variant *so = ctx->so; 4427 /* indexed by original tex idx, value is newly assigned alpha sampler 4428 * state tex idx. Zero is invalid since there is at least one sampler 4429 * if we get here. 4430 */ 4431 unsigned alt_tex_state[16] = {0}; 4432 unsigned tex_idx = ctx->max_texture_index + 1; 4433 unsigned idx = 0; 4434 4435 so->astc_srgb.base = tex_idx; 4436 4437 for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) { 4438 struct ir3_instruction *sam = ctx->ir->astc_srgb[i]; 4439 4440 compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state)); 4441 4442 if (alt_tex_state[sam->cat5.tex] == 0) { 4443 /* assign new alternate/alpha tex state slot: */ 4444 alt_tex_state[sam->cat5.tex] = tex_idx++; 4445 so->astc_srgb.orig_idx[idx++] = sam->cat5.tex; 4446 so->astc_srgb.count++; 4447 } 4448 4449 sam->cat5.tex = alt_tex_state[sam->cat5.tex]; 4450 } 4451} 4452 4453/* Fixup tex sampler state for tg4 workaround instructions. We 4454 * need to assign the tex state indexes for these after we know the 4455 * max tex index. 4456 */ 4457static void 4458fixup_tg4(struct ir3_context *ctx) 4459{ 4460 struct ir3_shader_variant *so = ctx->so; 4461 /* indexed by original tex idx, value is newly assigned alpha sampler 4462 * state tex idx. Zero is invalid since there is at least one sampler 4463 * if we get here. 
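    *
    * Note that the alternate tex state slots assigned here start after any
    * astc/srgb workaround slots (tex_idx below is offset by
    * so->astc_srgb.count), so the two fixups don't overlap.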
4464 */ 4465 unsigned alt_tex_state[16] = {0}; 4466 unsigned tex_idx = ctx->max_texture_index + so->astc_srgb.count + 1; 4467 unsigned idx = 0; 4468 4469 so->tg4.base = tex_idx; 4470 4471 for (unsigned i = 0; i < ctx->ir->tg4_count; i++) { 4472 struct ir3_instruction *sam = ctx->ir->tg4[i]; 4473 4474 compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state)); 4475 4476 if (alt_tex_state[sam->cat5.tex] == 0) { 4477 /* assign new alternate/alpha tex state slot: */ 4478 alt_tex_state[sam->cat5.tex] = tex_idx++; 4479 so->tg4.orig_idx[idx++] = sam->cat5.tex; 4480 so->tg4.count++; 4481 } 4482 4483 sam->cat5.tex = alt_tex_state[sam->cat5.tex]; 4484 } 4485} 4486 4487static bool 4488output_slot_used_for_binning(gl_varying_slot slot) 4489{ 4490 return slot == VARYING_SLOT_POS || slot == VARYING_SLOT_PSIZ || 4491 slot == VARYING_SLOT_CLIP_DIST0 || slot == VARYING_SLOT_CLIP_DIST1 || 4492 slot == VARYING_SLOT_VIEWPORT; 4493} 4494 4495static struct ir3_instruction * 4496find_end(struct ir3 *ir) 4497{ 4498 foreach_block_rev (block, &ir->block_list) { 4499 foreach_instr_rev (instr, &block->instr_list) { 4500 if (instr->opc == OPC_END || instr->opc == OPC_CHMASK) 4501 return instr; 4502 } 4503 } 4504 unreachable("couldn't find end instruction"); 4505} 4506 4507static void 4508fixup_binning_pass(struct ir3_context *ctx, struct ir3_instruction *end) 4509{ 4510 struct ir3_shader_variant *so = ctx->so; 4511 unsigned i, j; 4512 4513 /* first pass, remove unused outputs from the IR level outputs: */ 4514 for (i = 0, j = 0; i < end->srcs_count; i++) { 4515 unsigned outidx = end->end.outidxs[i]; 4516 unsigned slot = so->outputs[outidx].slot; 4517 4518 if (output_slot_used_for_binning(slot)) { 4519 end->srcs[j] = end->srcs[i]; 4520 end->end.outidxs[j] = end->end.outidxs[i]; 4521 j++; 4522 } 4523 } 4524 end->srcs_count = j; 4525 4526 /* second pass, cleanup the unused slots in ir3_shader_variant::outputs 4527 * table: 4528 */ 4529 for (i = 0, j = 0; i < so->outputs_count; i++) { 4530 unsigned slot = so->outputs[i].slot; 4531 4532 if (output_slot_used_for_binning(slot)) { 4533 so->outputs[j] = so->outputs[i]; 4534 4535 /* fixup outidx to point to new output table entry: */ 4536 for (unsigned k = 0; k < end->srcs_count; k++) { 4537 if (end->end.outidxs[k] == i) { 4538 end->end.outidxs[k] = j; 4539 break; 4540 } 4541 } 4542 4543 j++; 4544 } 4545 } 4546 so->outputs_count = j; 4547} 4548 4549static void 4550collect_tex_prefetches(struct ir3_context *ctx, struct ir3 *ir) 4551{ 4552 unsigned idx = 0; 4553 4554 /* Collect sampling instructions eligible for pre-dispatch. 
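    *
    * Each OPC_META_TEX_PREFETCH placeholder found below is translated into
    * an entry of so->sampler_prefetch[] (regular or bindless form) and then
    * removed from the instruction list.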
*/ 4555 foreach_block (block, &ir->block_list) { 4556 foreach_instr_safe (instr, &block->instr_list) { 4557 if (instr->opc == OPC_META_TEX_PREFETCH) { 4558 assert(idx < ARRAY_SIZE(ctx->so->sampler_prefetch)); 4559 struct ir3_sampler_prefetch *fetch = 4560 &ctx->so->sampler_prefetch[idx]; 4561 idx++; 4562 4563 if (instr->flags & IR3_INSTR_B) { 4564 fetch->cmd = IR3_SAMPLER_BINDLESS_PREFETCH_CMD; 4565 /* In bindless mode, the index is actually the base */ 4566 fetch->tex_id = instr->prefetch.tex_base; 4567 fetch->samp_id = instr->prefetch.samp_base; 4568 fetch->tex_bindless_id = instr->prefetch.tex; 4569 fetch->samp_bindless_id = instr->prefetch.samp; 4570 } else { 4571 fetch->cmd = IR3_SAMPLER_PREFETCH_CMD; 4572 fetch->tex_id = instr->prefetch.tex; 4573 fetch->samp_id = instr->prefetch.samp; 4574 } 4575 fetch->wrmask = instr->dsts[0]->wrmask; 4576 fetch->dst = instr->dsts[0]->num; 4577 fetch->src = instr->prefetch.input_offset; 4578 4579 /* These are the limits on a5xx/a6xx, we might need to 4580 * revisit if SP_FS_PREFETCH[n] changes on later gens: 4581 */ 4582 assert(fetch->dst <= 0x3f); 4583 assert(fetch->tex_id <= 0x1f); 4584 assert(fetch->samp_id <= 0xf); 4585 4586 ctx->so->total_in = 4587 MAX2(ctx->so->total_in, instr->prefetch.input_offset + 2); 4588 4589 fetch->half_precision = !!(instr->dsts[0]->flags & IR3_REG_HALF); 4590 4591 /* Remove the prefetch placeholder instruction: */ 4592 list_delinit(&instr->node); 4593 } 4594 } 4595 } 4596} 4597 4598int 4599ir3_compile_shader_nir(struct ir3_compiler *compiler, 4600 struct ir3_shader *shader, 4601 struct ir3_shader_variant *so) 4602{ 4603 struct ir3_context *ctx; 4604 struct ir3 *ir; 4605 int ret = 0, max_bary; 4606 bool progress; 4607 4608 assert(!so->ir); 4609 4610 ctx = ir3_context_init(compiler, shader, so); 4611 if (!ctx) { 4612 DBG("INIT failed!"); 4613 ret = -1; 4614 goto out; 4615 } 4616 4617 emit_instructions(ctx); 4618 4619 if (ctx->error) { 4620 DBG("EMIT failed!"); 4621 ret = -1; 4622 goto out; 4623 } 4624 4625 ir = so->ir = ctx->ir; 4626 4627 if (gl_shader_stage_is_compute(so->type)) { 4628 so->local_size[0] = ctx->s->info.workgroup_size[0]; 4629 so->local_size[1] = ctx->s->info.workgroup_size[1]; 4630 so->local_size[2] = ctx->s->info.workgroup_size[2]; 4631 so->local_size_variable = ctx->s->info.workgroup_size_variable; 4632 } 4633 4634 /* Vertex shaders in a tessellation or geometry pipeline treat END as a 4635 * NOP and has an epilogue that writes the VS outputs to local storage, to 4636 * be read by the HS. Then it resets execution mask (chmask) and chains 4637 * to the next shader (chsh). There are also a few output values which we 4638 * must send to the next stage via registers, and in order for both stages 4639 * to agree on the register used we must force these to be in specific 4640 * registers. 
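    *
    * Concretely (see the regids picked below): the tcs/gs header goes in
    * r0.x, rel_patch_id in r0.y, and primitive_id in r0.y or r0.z depending
    * on whether rel_patch_id is also being passed along.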
4641 */ 4642 if ((so->type == MESA_SHADER_VERTEX && 4643 (so->key.has_gs || so->key.tessellation)) || 4644 (so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) { 4645 struct ir3_instruction *outputs[3]; 4646 unsigned outidxs[3]; 4647 unsigned regids[3]; 4648 unsigned outputs_count = 0; 4649 4650 if (ctx->primitive_id) { 4651 unsigned n = so->outputs_count++; 4652 so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID; 4653 4654 struct ir3_instruction *out = ir3_collect(ctx->block, ctx->primitive_id); 4655 outputs[outputs_count] = out; 4656 outidxs[outputs_count] = n; 4657 if (so->type == MESA_SHADER_VERTEX && ctx->rel_patch_id) 4658 regids[outputs_count] = regid(0, 2); 4659 else 4660 regids[outputs_count] = regid(0, 1); 4661 outputs_count++; 4662 } 4663 4664 if (so->type == MESA_SHADER_VERTEX && ctx->rel_patch_id) { 4665 unsigned n = so->outputs_count++; 4666 so->outputs[n].slot = VARYING_SLOT_REL_PATCH_ID_IR3; 4667 struct ir3_instruction *out = ir3_collect(ctx->block, ctx->rel_patch_id); 4668 outputs[outputs_count] = out; 4669 outidxs[outputs_count] = n; 4670 regids[outputs_count] = regid(0, 1); 4671 outputs_count++; 4672 } 4673 4674 if (ctx->gs_header) { 4675 unsigned n = so->outputs_count++; 4676 so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3; 4677 struct ir3_instruction *out = ir3_collect(ctx->block, ctx->gs_header); 4678 outputs[outputs_count] = out; 4679 outidxs[outputs_count] = n; 4680 regids[outputs_count] = regid(0, 0); 4681 outputs_count++; 4682 } 4683 4684 if (ctx->tcs_header) { 4685 unsigned n = so->outputs_count++; 4686 so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3; 4687 struct ir3_instruction *out = ir3_collect(ctx->block, ctx->tcs_header); 4688 outputs[outputs_count] = out; 4689 outidxs[outputs_count] = n; 4690 regids[outputs_count] = regid(0, 0); 4691 outputs_count++; 4692 } 4693 4694 struct ir3_instruction *chmask = 4695 ir3_instr_create(ctx->block, OPC_CHMASK, 0, outputs_count); 4696 chmask->barrier_class = IR3_BARRIER_EVERYTHING; 4697 chmask->barrier_conflict = IR3_BARRIER_EVERYTHING; 4698 4699 for (unsigned i = 0; i < outputs_count; i++) 4700 __ssa_src(chmask, outputs[i], 0)->num = regids[i]; 4701 4702 chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count); 4703 memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count); 4704 4705 array_insert(ctx->block, ctx->block->keeps, chmask); 4706 4707 struct ir3_instruction *chsh = ir3_CHSH(ctx->block); 4708 chsh->barrier_class = IR3_BARRIER_EVERYTHING; 4709 chsh->barrier_conflict = IR3_BARRIER_EVERYTHING; 4710 } else { 4711 assert((ctx->noutputs % 4) == 0); 4712 unsigned outidxs[ctx->noutputs / 4]; 4713 struct ir3_instruction *outputs[ctx->noutputs / 4]; 4714 unsigned outputs_count = 0; 4715 4716 struct ir3_block *b = ctx->block; 4717 /* Insert these collect's in the block before the end-block if 4718 * possible, so that any moves they generate can be shuffled around to 4719 * reduce nop's: 4720 */ 4721 if (ctx->block->predecessors_count == 1) 4722 b = ctx->block->predecessors[0]; 4723 4724 /* Setup IR level outputs, which are "collects" that gather 4725 * the scalar components of outputs. 4726 */ 4727 for (unsigned i = 0; i < ctx->noutputs; i += 4) { 4728 unsigned ncomp = 0; 4729 /* figure out the # of components written: 4730 * 4731 * TODO do we need to handle holes, ie. if .x and .z 4732 * components written, but .y component not written? 
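       *
       * As currently written, the loop below stops counting at the first
       * unwritten component, so in the .x/.z case the .z value would simply
       * not be collected into this output.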
4733 */ 4734 for (unsigned j = 0; j < 4; j++) { 4735 if (!ctx->outputs[i + j]) 4736 break; 4737 ncomp++; 4738 } 4739 4740 /* Note that in some stages, like TCS, store_output is 4741 * lowered to memory writes, so no components of the 4742 * are "written" from the PoV of traditional store- 4743 * output instructions: 4744 */ 4745 if (!ncomp) 4746 continue; 4747 4748 struct ir3_instruction *out = 4749 ir3_create_collect(b, &ctx->outputs[i], ncomp); 4750 4751 int outidx = i / 4; 4752 assert(outidx < so->outputs_count); 4753 4754 outidxs[outputs_count] = outidx; 4755 outputs[outputs_count] = out; 4756 outputs_count++; 4757 } 4758 4759 /* for a6xx+, binning and draw pass VS use same VBO state, so we 4760 * need to make sure not to remove any inputs that are used by 4761 * the nonbinning VS. 4762 */ 4763 if (ctx->compiler->gen >= 6 && so->binning_pass && 4764 so->type == MESA_SHADER_VERTEX) { 4765 for (int i = 0; i < ctx->ninputs; i++) { 4766 struct ir3_instruction *in = ctx->inputs[i]; 4767 4768 if (!in) 4769 continue; 4770 4771 unsigned n = i / 4; 4772 unsigned c = i % 4; 4773 4774 assert(n < so->nonbinning->inputs_count); 4775 4776 if (so->nonbinning->inputs[n].sysval) 4777 continue; 4778 4779 /* be sure to keep inputs, even if only used in VS */ 4780 if (so->nonbinning->inputs[n].compmask & (1 << c)) 4781 array_insert(in->block, in->block->keeps, in); 4782 } 4783 } 4784 4785 struct ir3_instruction *end = 4786 ir3_instr_create(ctx->block, OPC_END, 0, outputs_count); 4787 4788 for (unsigned i = 0; i < outputs_count; i++) { 4789 __ssa_src(end, outputs[i], 0); 4790 } 4791 4792 end->end.outidxs = ralloc_array(end, unsigned, outputs_count); 4793 memcpy(end->end.outidxs, outidxs, sizeof(unsigned) * outputs_count); 4794 4795 array_insert(ctx->block, ctx->block->keeps, end); 4796 4797 /* at this point, for binning pass, throw away unneeded outputs: */ 4798 if (so->binning_pass && (ctx->compiler->gen < 6)) 4799 fixup_binning_pass(ctx, end); 4800 } 4801 4802 ir3_debug_print(ir, "AFTER: nir->ir3"); 4803 ir3_validate(ir); 4804 4805 IR3_PASS(ir, ir3_remove_unreachable); 4806 4807 IR3_PASS(ir, ir3_array_to_ssa); 4808 4809 do { 4810 progress = false; 4811 4812 /* the folding doesn't seem to work reliably on a4xx */ 4813 if (ctx->compiler->gen != 4) 4814 progress |= IR3_PASS(ir, ir3_cf); 4815 progress |= IR3_PASS(ir, ir3_cp, so); 4816 progress |= IR3_PASS(ir, ir3_cse); 4817 progress |= IR3_PASS(ir, ir3_dce, so); 4818 } while (progress); 4819 4820 /* at this point, for binning pass, throw away unneeded outputs: 4821 * Note that for a6xx and later, we do this after ir3_cp to ensure 4822 * that the uniform/constant layout for BS and VS matches, so that 4823 * we can re-use same VS_CONST state group. 4824 */ 4825 if (so->binning_pass && (ctx->compiler->gen >= 6)) { 4826 fixup_binning_pass(ctx, find_end(ctx->so->ir)); 4827 /* cleanup the result of removing unneeded outputs: */ 4828 while (IR3_PASS(ir, ir3_dce, so)) { 4829 } 4830 } 4831 4832 IR3_PASS(ir, ir3_sched_add_deps); 4833 4834 /* At this point, all the dead code should be long gone: */ 4835 assert(!IR3_PASS(ir, ir3_dce, so)); 4836 4837 ret = ir3_sched(ir); 4838 if (ret) { 4839 DBG("SCHED failed!"); 4840 goto out; 4841 } 4842 4843 ir3_debug_print(ir, "AFTER: ir3_sched"); 4844 4845 /* Pre-assign VS inputs on a6xx+ binning pass shader, to align 4846 * with draw pass VS, so binning and draw pass can both use the 4847 * same VBO state. 4848 * 4849 * Note that VS inputs are expected to be full precision. 
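    *
    * In practice this just copies each input's register assignment from
    * so->nonbinning->inputs[] (the draw-pass variant) onto the binning
    * variant, see the foreach_input loop below.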
4850 */ 4851 bool pre_assign_inputs = (ir->compiler->gen >= 6) && 4852 (ir->type == MESA_SHADER_VERTEX) && 4853 so->binning_pass; 4854 4855 if (pre_assign_inputs) { 4856 foreach_input (in, ir) { 4857 assert(in->opc == OPC_META_INPUT); 4858 unsigned inidx = in->input.inidx; 4859 4860 in->dsts[0]->num = so->nonbinning->inputs[inidx].regid; 4861 } 4862 } else if (ctx->tcs_header) { 4863 /* We need to have these values in the same registers between VS and TCS 4864 * since the VS chains to TCS and doesn't get the sysvals redelivered. 4865 */ 4866 4867 ctx->tcs_header->dsts[0]->num = regid(0, 0); 4868 ctx->rel_patch_id->dsts[0]->num = regid(0, 1); 4869 if (ctx->primitive_id) 4870 ctx->primitive_id->dsts[0]->num = regid(0, 2); 4871 } else if (ctx->gs_header) { 4872 /* We need to have these values in the same registers between producer 4873 * (VS or DS) and GS since the producer chains to GS and doesn't get 4874 * the sysvals redelivered. 4875 */ 4876 4877 ctx->gs_header->dsts[0]->num = regid(0, 0); 4878 if (ctx->primitive_id) 4879 ctx->primitive_id->dsts[0]->num = regid(0, 1); 4880 } else if (so->num_sampler_prefetch) { 4881 assert(so->type == MESA_SHADER_FRAGMENT); 4882 int idx = 0; 4883 4884 foreach_input (instr, ir) { 4885 if (instr->input.sysval != SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL) 4886 continue; 4887 4888 assert(idx < 2); 4889 instr->dsts[0]->num = idx; 4890 idx++; 4891 } 4892 } 4893 4894 ret = ir3_ra(so); 4895 4896 if (ret) { 4897 mesa_loge("ir3_ra() failed!"); 4898 goto out; 4899 } 4900 4901 IR3_PASS(ir, ir3_postsched, so); 4902 4903 IR3_PASS(ir, ir3_legalize_relative); 4904 IR3_PASS(ir, ir3_lower_subgroups); 4905 4906 if (so->type == MESA_SHADER_FRAGMENT) 4907 pack_inlocs(ctx); 4908 4909 /* 4910 * Fixup inputs/outputs to point to the actual registers assigned: 4911 * 4912 * 1) initialize to r63.x (invalid/unused) 4913 * 2) iterate IR level inputs/outputs and update the variants 4914 * inputs/outputs table based on the assigned registers for 4915 * the remaining inputs/outputs. 4916 */ 4917 4918 for (unsigned i = 0; i < so->inputs_count; i++) 4919 so->inputs[i].regid = INVALID_REG; 4920 for (unsigned i = 0; i < so->outputs_count; i++) 4921 so->outputs[i].regid = INVALID_REG; 4922 4923 struct ir3_instruction *end = find_end(so->ir); 4924 4925 for (unsigned i = 0; i < end->srcs_count; i++) { 4926 unsigned outidx = end->end.outidxs[i]; 4927 struct ir3_register *reg = end->srcs[i]; 4928 4929 so->outputs[outidx].regid = reg->num; 4930 so->outputs[outidx].half = !!(reg->flags & IR3_REG_HALF); 4931 } 4932 4933 foreach_input (in, ir) { 4934 assert(in->opc == OPC_META_INPUT); 4935 unsigned inidx = in->input.inidx; 4936 4937 if (pre_assign_inputs && !so->inputs[inidx].sysval) { 4938 if (VALIDREG(so->nonbinning->inputs[inidx].regid)) { 4939 compile_assert( 4940 ctx, in->dsts[0]->num == so->nonbinning->inputs[inidx].regid); 4941 compile_assert(ctx, !!(in->dsts[0]->flags & IR3_REG_HALF) == 4942 so->nonbinning->inputs[inidx].half); 4943 } 4944 so->inputs[inidx].regid = so->nonbinning->inputs[inidx].regid; 4945 so->inputs[inidx].half = so->nonbinning->inputs[inidx].half; 4946 } else { 4947 so->inputs[inidx].regid = in->dsts[0]->num; 4948 so->inputs[inidx].half = !!(in->dsts[0]->flags & IR3_REG_HALF); 4949 } 4950 } 4951 4952 if (ctx->astc_srgb) 4953 fixup_astc_srgb(ctx); 4954 4955 if (ctx->compiler->gen == 4 && ctx->s->info.uses_texture_gather) 4956 fixup_tg4(ctx); 4957 4958 /* We need to do legalize after (for frag shader's) the "bary.f" 4959 * offsets (inloc) have been assigned. 
4960 */ 4961 IR3_PASS(ir, ir3_legalize, so, &max_bary); 4962 4963 /* Set (ss)(sy) on first TCS and GEOMETRY instructions, since we don't 4964 * know what we might have to wait on when coming in from VS chsh. 4965 */ 4966 if (so->type == MESA_SHADER_TESS_CTRL || so->type == MESA_SHADER_GEOMETRY) { 4967 foreach_block (block, &ir->block_list) { 4968 foreach_instr (instr, &block->instr_list) { 4969 instr->flags |= IR3_INSTR_SS | IR3_INSTR_SY; 4970 break; 4971 } 4972 } 4973 } 4974 4975 so->branchstack = ctx->max_stack; 4976 4977 /* Note that max_bary counts inputs that are not bary.f'd for FS: */ 4978 if (so->type == MESA_SHADER_FRAGMENT) 4979 so->total_in = max_bary + 1; 4980 4981 /* Collect sampling instructions eligible for pre-dispatch. */ 4982 collect_tex_prefetches(ctx, ir); 4983 4984 if (so->type == MESA_SHADER_FRAGMENT && 4985 ctx->s->info.fs.needs_quad_helper_invocations) 4986 so->need_pixlod = true; 4987 4988 if ((ctx->so->type == MESA_SHADER_FRAGMENT) && 4989 !ctx->s->info.fs.early_fragment_tests) 4990 ctx->so->no_earlyz |= ctx->s->info.writes_memory; 4991 4992out: 4993 if (ret) { 4994 if (so->ir) 4995 ir3_destroy(so->ir); 4996 so->ir = NULL; 4997 } 4998 ir3_context_free(ctx); 4999 5000 return ret; 5001} 5002