/*
 * Copyright © 2016 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>
#include "util/format/u_format.h"
#include "util/u_helpers.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/ralloc.h"
#include "util/hash_table.h"
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"
#include "common/v3d_device_info.h"
#include "v3d_compiler.h"

/* We don't do any address packing. */
#define __gen_user_data void
#define __gen_address_type uint32_t
#define __gen_address_offset(reloc) (*reloc)
#define __gen_emit_reloc(cl, reloc)
#include "cle/v3d_packet_v41_pack.h"

#define GENERAL_TMU_LOOKUP_PER_QUAD                 (0 << 7)
#define GENERAL_TMU_LOOKUP_PER_PIXEL                (1 << 7)
#define GENERAL_TMU_LOOKUP_TYPE_8BIT_I              (0 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_16BIT_I             (1 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_VEC2                (2 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_VEC3                (3 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_VEC4                (4 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI             (5 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI            (6 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI            (7 << 0)

#define V3D_TSY_SET_QUORUM          0
#define V3D_TSY_INC_WAITERS         1
#define V3D_TSY_DEC_WAITERS         2
#define V3D_TSY_INC_QUORUM          3
#define V3D_TSY_DEC_QUORUM          4
#define V3D_TSY_FREE_ALL            5
#define V3D_TSY_RELEASE             6
#define V3D_TSY_ACQUIRE             7
#define V3D_TSY_WAIT                8
#define V3D_TSY_WAIT_INC            9
#define V3D_TSY_WAIT_CHECK          10
#define V3D_TSY_WAIT_INC_CHECK      11
#define V3D_TSY_WAIT_CV             12
#define V3D_TSY_INC_SEMAPHORE       13
#define V3D_TSY_DEC_SEMAPHORE       14
#define V3D_TSY_SET_QUORUM_FREE_ALL 15

enum v3d_tmu_op_type
{
        V3D_TMU_OP_TYPE_REGULAR,
        V3D_TMU_OP_TYPE_ATOMIC,
        V3D_TMU_OP_TYPE_CACHE
};

static enum v3d_tmu_op_type
v3d_tmu_get_type_from_op(uint32_t tmu_op, bool is_write)
{
        switch (tmu_op) {
        case V3D_TMU_OP_WRITE_ADD_READ_PREFETCH:
        case V3D_TMU_OP_WRITE_SUB_READ_CLEAR:
        case V3D_TMU_OP_WRITE_XCHG_READ_FLUSH:
        case V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH:
        case V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR:
                return is_write ? V3D_TMU_OP_TYPE_ATOMIC : V3D_TMU_OP_TYPE_CACHE;
        case V3D_TMU_OP_WRITE_UMAX:
        case V3D_TMU_OP_WRITE_SMIN:
        case V3D_TMU_OP_WRITE_SMAX:
                assert(is_write);
                FALLTHROUGH;
        case V3D_TMU_OP_WRITE_AND_READ_INC:
        case V3D_TMU_OP_WRITE_OR_READ_DEC:
        case V3D_TMU_OP_WRITE_XOR_READ_NOT:
                return V3D_TMU_OP_TYPE_ATOMIC;
        case V3D_TMU_OP_REGULAR:
                return V3D_TMU_OP_TYPE_REGULAR;

        default:
                unreachable("Unknown tmu_op\n");
        }
}

static void
ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);

static void
resize_qreg_array(struct v3d_compile *c,
                  struct qreg **regs,
                  uint32_t *size,
                  uint32_t decl_size)
{
        if (*size >= decl_size)
                return;

        uint32_t old_size = *size;
        *size = MAX2(*size * 2, decl_size);
        *regs = reralloc(c, *regs, struct qreg, *size);
        if (!*regs) {
                fprintf(stderr, "Malloc failure\n");
                abort();
        }

        for (uint32_t i = old_size; i < *size; i++)
                (*regs)[i] = c->undef;
}

static void
resize_interp_array(struct v3d_compile *c,
                    struct v3d_interp_input **regs,
                    uint32_t *size,
                    uint32_t decl_size)
{
        if (*size >= decl_size)
                return;

        uint32_t old_size = *size;
        *size = MAX2(*size * 2, decl_size);
        *regs = reralloc(c, *regs, struct v3d_interp_input, *size);
        if (!*regs) {
                fprintf(stderr, "Malloc failure\n");
                abort();
        }

        for (uint32_t i = old_size; i < *size; i++) {
                (*regs)[i].vp = c->undef;
                (*regs)[i].C = c->undef;
        }
}

void
vir_emit_thrsw(struct v3d_compile *c)
{
        if (c->threads == 1)
                return;

        /* Always thread switch after each texture operation for now.
         *
         * We could do better by batching a bunch of texture fetches up and
         * then doing one thread switch and collecting all their results
         * afterward.
         */
        c->last_thrsw = vir_NOP(c);
        c->last_thrsw->qpu.sig.thrsw = true;
        c->last_thrsw_at_top_level = !c->in_control_flow;

        /* We need to lock the scoreboard before any tlb access happens. If this
         * thread switch comes after we have emitted a tlb load, then it means
         * that we can't lock on the last thread switch any more.
         */
        if (c->emitted_tlb_load)
                c->lock_scoreboard_on_first_thrsw = true;
}

uint32_t
v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
{
        if (nir_src_is_const(instr->src[src])) {
                int64_t add_val = nir_src_as_int(instr->src[src]);
                if (add_val == 1)
                        return V3D_TMU_OP_WRITE_AND_READ_INC;
                else if (add_val == -1)
                        return V3D_TMU_OP_WRITE_OR_READ_DEC;
        }

        return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
}

static uint32_t
v3d_general_tmu_op(nir_intrinsic_instr *instr)
{
        switch (instr->intrinsic) {
        case nir_intrinsic_load_ssbo:
        case nir_intrinsic_load_ubo:
        case nir_intrinsic_load_uniform:
        case nir_intrinsic_load_shared:
        case nir_intrinsic_load_scratch:
        case nir_intrinsic_load_global_2x32:
        case nir_intrinsic_store_ssbo:
        case nir_intrinsic_store_shared:
        case nir_intrinsic_store_scratch:
        case nir_intrinsic_store_global_2x32:
                return V3D_TMU_OP_REGULAR;
        case nir_intrinsic_ssbo_atomic_add:
                return v3d_get_op_for_atomic_add(instr, 2);
        case nir_intrinsic_shared_atomic_add:
        case nir_intrinsic_global_atomic_add_2x32:
                return v3d_get_op_for_atomic_add(instr, 1);
        case nir_intrinsic_ssbo_atomic_imin:
        case nir_intrinsic_global_atomic_imin_2x32:
        case nir_intrinsic_shared_atomic_imin:
                return V3D_TMU_OP_WRITE_SMIN;
        case nir_intrinsic_ssbo_atomic_umin:
        case nir_intrinsic_global_atomic_umin_2x32:
        case nir_intrinsic_shared_atomic_umin:
                return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
        case nir_intrinsic_ssbo_atomic_imax:
        case nir_intrinsic_global_atomic_imax_2x32:
        case nir_intrinsic_shared_atomic_imax:
                return V3D_TMU_OP_WRITE_SMAX;
        case nir_intrinsic_ssbo_atomic_umax:
        case nir_intrinsic_global_atomic_umax_2x32:
        case nir_intrinsic_shared_atomic_umax:
                return V3D_TMU_OP_WRITE_UMAX;
        case nir_intrinsic_ssbo_atomic_and:
        case nir_intrinsic_global_atomic_and_2x32:
        case nir_intrinsic_shared_atomic_and:
                return V3D_TMU_OP_WRITE_AND_READ_INC;
        case nir_intrinsic_ssbo_atomic_or:
        case nir_intrinsic_global_atomic_or_2x32:
        case nir_intrinsic_shared_atomic_or:
                return V3D_TMU_OP_WRITE_OR_READ_DEC;
        case nir_intrinsic_ssbo_atomic_xor:
        case nir_intrinsic_global_atomic_xor_2x32:
        case nir_intrinsic_shared_atomic_xor:
                return V3D_TMU_OP_WRITE_XOR_READ_NOT;
        case nir_intrinsic_ssbo_atomic_exchange:
        case nir_intrinsic_global_atomic_exchange_2x32:
        case nir_intrinsic_shared_atomic_exchange:
                return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
        case nir_intrinsic_ssbo_atomic_comp_swap:
        case nir_intrinsic_global_atomic_comp_swap_2x32:
        case nir_intrinsic_shared_atomic_comp_swap:
                return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
        default:
                unreachable("unknown intrinsic op");
        }
}

/**
 * Checks if pipelining a new TMU operation requiring 'components' LDTMUs
 * would overflow the Output TMU fifo.
 *
 * It is not allowed to overflow the Output fifo; however, we can overflow the
 * Input and Config fifos. Doing that makes the shader stall, but only for as
 * long as it needs to be able to continue, so it is better for pipelining to
 * let the QPU stall on these if needed than trying to emit TMU flushes in the
 * driver.
 */
bool
ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components)
{
        if (c->tmu.flush_count >= MAX_TMU_QUEUE_SIZE)
                return true;

        return components > 0 &&
               c->tmu.output_fifo_size + components > 16 / c->threads;
}

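/* Illustration of the budget checked above (assuming the per-QPU Output TMU
 * fifo is what the "16" refers to, shared between the shader's threads): with
 * c->threads == 4, each thread may keep at most 16 / 4 = 4 LDTMU results
 * outstanding, so a single pipelined vec4 load already uses the whole budget,
 * while a 2-way threaded shader gets 8 entries per thread.
 */
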
/**
 * Emits the thread switch and LDTMU/TMUWT for all outstanding TMU operations,
 * popping all TMU fifo entries.
 */
void
ntq_flush_tmu(struct v3d_compile *c)
{
        if (c->tmu.flush_count == 0)
                return;

        vir_emit_thrsw(c);

        bool emitted_tmuwt = false;
        for (int i = 0; i < c->tmu.flush_count; i++) {
                if (c->tmu.flush[i].component_mask > 0) {
                        nir_dest *dest = c->tmu.flush[i].dest;
                        assert(dest);

                        for (int j = 0; j < 4; j++) {
                                if (c->tmu.flush[i].component_mask & (1 << j)) {
                                        ntq_store_dest(c, dest, j,
                                                       vir_MOV(c, vir_LDTMU(c)));
                                }
                        }
                } else if (!emitted_tmuwt) {
                        vir_TMUWT(c);
                        emitted_tmuwt = true;
                }
        }

        c->tmu.output_fifo_size = 0;
        c->tmu.flush_count = 0;
        _mesa_set_clear(c->tmu.outstanding_regs, NULL);
}

/**
 * Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller
 * is responsible for ensuring that doing this doesn't overflow the TMU fifos,
 * and more specifically, the output fifo, since that can't stall.
 */
void
ntq_add_pending_tmu_flush(struct v3d_compile *c,
                          nir_dest *dest,
                          uint32_t component_mask)
{
        const uint32_t num_components = util_bitcount(component_mask);
        assert(!ntq_tmu_fifo_overflow(c, num_components));

        if (num_components > 0) {
                c->tmu.output_fifo_size += num_components;
                if (!dest->is_ssa)
                        _mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg);
        }

        c->tmu.flush[c->tmu.flush_count].dest = dest;
        c->tmu.flush[c->tmu.flush_count].component_mask = component_mask;
        c->tmu.flush_count++;

        if (c->disable_tmu_pipelining)
                ntq_flush_tmu(c);
        else if (c->tmu.flush_count > 1)
                c->pipelined_any_tmu = true;
}

enum emit_mode {
    MODE_COUNT = 0,
    MODE_EMIT,
    MODE_LAST,
};

/**
 * For a TMU general store instruction:
 *
 * In MODE_COUNT mode, records the number of TMU writes required and flushes
 * any outstanding TMU operations the instruction depends on, but it doesn't
 * emit any actual register writes.
 *
 * In MODE_EMIT mode, emits the data register writes required by the
 * instruction.
 */
static void
emit_tmu_general_store_writes(struct v3d_compile *c,
                              enum emit_mode mode,
                              nir_intrinsic_instr *instr,
                              uint32_t base_const_offset,
                              uint32_t *writemask,
                              uint32_t *const_offset,
                              uint32_t *type_size,
                              uint32_t *tmu_writes)
{
        struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);

        /* Find the first set of consecutive components that
         * are enabled in the writemask and emit the TMUD
         * instructions for them.
         */
        assert(*writemask != 0);
        uint32_t first_component = ffs(*writemask) - 1;
        uint32_t last_component = first_component;
        while (*writemask & BITFIELD_BIT(last_component + 1))
                last_component++;

        assert(first_component <= last_component &&
               last_component < instr->num_components);

        for (int i = first_component; i <= last_component; i++) {
                struct qreg data = ntq_get_src(c, instr->src[0], i);
                if (mode == MODE_COUNT)
                        (*tmu_writes)++;
                else
                        vir_MOV_dest(c, tmud, data);
        }

        if (mode == MODE_EMIT) {
                /* Update the offset for the TMU write based on the
                 * first component we are writing.
                 */
                *type_size = nir_src_bit_size(instr->src[0]) / 8;
                *const_offset =
                        base_const_offset + first_component * (*type_size);

                /* Clear these components from the writemask */
                uint32_t written_mask =
                        BITFIELD_RANGE(first_component, *tmu_writes);
                (*writemask) &= ~written_mask;
        }
}

/**
 * For a TMU general atomic instruction:
 *
 * In MODE_COUNT mode, records the number of TMU writes required and flushes
 * any outstanding TMU operations the instruction depends on, but it doesn't
 * emit any actual register writes.
 *
 * In MODE_EMIT mode, emits the data register writes required by the
 * instruction.
 */
static void
emit_tmu_general_atomic_writes(struct v3d_compile *c,
                               enum emit_mode mode,
                               nir_intrinsic_instr *instr,
                               uint32_t tmu_op,
                               bool has_index,
                               uint32_t *tmu_writes)
{
        struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);

        struct qreg data = ntq_get_src(c, instr->src[1 + has_index], 0);
        if (mode == MODE_COUNT)
                (*tmu_writes)++;
        else
                vir_MOV_dest(c, tmud, data);

        if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
                data = ntq_get_src(c, instr->src[2 + has_index], 0);
                if (mode == MODE_COUNT)
                        (*tmu_writes)++;
                else
                        vir_MOV_dest(c, tmud, data);
        }
}

/**
 * For any TMU general instruction:
 *
 * In MODE_COUNT mode, records the number of TMU writes required to emit the
 * address parameter and flushes any outstanding TMU operations the instruction
 * depends on, but it doesn't emit any actual register writes.
 *
 * In MODE_EMIT mode, emits register writes required to emit the address.
 */
static void
emit_tmu_general_address_write(struct v3d_compile *c,
                               enum emit_mode mode,
                               nir_intrinsic_instr *instr,
                               uint32_t config,
                               bool dynamic_src,
                               int offset_src,
                               struct qreg base_offset,
                               uint32_t const_offset,
                               uint32_t *tmu_writes)
{
        if (mode == MODE_COUNT) {
                (*tmu_writes)++;
                if (dynamic_src)
                        ntq_get_src(c, instr->src[offset_src], 0);
                return;
        }

        if (vir_in_nonuniform_control_flow(c)) {
                vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
                           V3D_QPU_PF_PUSHZ);
        }

        struct qreg tmua;
        if (config == ~0)
                tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
        else
                tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);

        struct qinst *tmu;
        if (dynamic_src) {
                struct qreg offset = base_offset;
                if (const_offset != 0) {
                        offset = vir_ADD(c, offset,
                                         vir_uniform_ui(c, const_offset));
                }
                struct qreg data = ntq_get_src(c, instr->src[offset_src], 0);
                tmu = vir_ADD_dest(c, tmua, offset, data);
        } else {
                if (const_offset != 0) {
                        tmu = vir_ADD_dest(c, tmua, base_offset,
                                           vir_uniform_ui(c, const_offset));
                } else {
                        tmu = vir_MOV_dest(c, tmua, base_offset);
                }
        }

        if (config != ~0) {
                tmu->uniform =
                        vir_get_uniform_index(c, QUNIFORM_CONSTANT, config);
        }

        if (vir_in_nonuniform_control_flow(c))
                vir_set_cond(tmu, V3D_QPU_COND_IFA);
}

/**
 * Implements indirect uniform loads and SSBO accesses through the TMU general
 * memory access interface.
 */
static void
ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                     bool is_shared_or_scratch, bool is_global)
{
        uint32_t tmu_op = v3d_general_tmu_op(instr);

        /* If we were able to replace atomic_add with an inc/dec, then we
         * need/can do things slightly differently, like not loading the
         * amount to add/sub, as that is implicit.
         */
        bool atomic_add_replaced =
                ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add ||
                  instr->intrinsic == nir_intrinsic_shared_atomic_add ||
                  instr->intrinsic == nir_intrinsic_global_atomic_add_2x32) &&
                 (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
                  tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));

        bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
                         instr->intrinsic == nir_intrinsic_store_scratch ||
                         instr->intrinsic == nir_intrinsic_store_shared ||
                         instr->intrinsic == nir_intrinsic_store_global_2x32);

        bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
                        instr->intrinsic == nir_intrinsic_load_ubo ||
                        instr->intrinsic == nir_intrinsic_load_ssbo ||
                        instr->intrinsic == nir_intrinsic_load_scratch ||
                        instr->intrinsic == nir_intrinsic_load_shared ||
                        instr->intrinsic == nir_intrinsic_load_global_2x32);

        if (!is_load)
                c->tmu_dirty_rcl = true;

        if (is_global)
                c->has_global_address = true;

        bool has_index = !is_shared_or_scratch && !is_global;

        int offset_src;
        if (instr->intrinsic == nir_intrinsic_load_uniform) {
                offset_src = 0;
        } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
                   instr->intrinsic == nir_intrinsic_load_ubo ||
                   instr->intrinsic == nir_intrinsic_load_scratch ||
                   instr->intrinsic == nir_intrinsic_load_shared ||
                   instr->intrinsic == nir_intrinsic_load_global_2x32 ||
                   atomic_add_replaced) {
                offset_src = 0 + has_index;
        } else if (is_store) {
                offset_src = 1 + has_index;
        } else {
                offset_src = 0 + has_index;
        }

        bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
        uint32_t const_offset = 0;
        if (!dynamic_src)
                const_offset = nir_src_as_uint(instr->src[offset_src]);

        struct qreg base_offset;
        if (instr->intrinsic == nir_intrinsic_load_uniform) {
                const_offset += nir_intrinsic_base(instr);
                base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
                                          v3d_unit_data_create(0, const_offset));
                const_offset = 0;
        } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
                uint32_t index = nir_src_as_uint(instr->src[0]);
                /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index
                 * shifted up by 1 (0 is gallium's constant buffer 0).
                 */
                if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
                        index++;

                base_offset =
                        vir_uniform(c, QUNIFORM_UBO_ADDR,
                                    v3d_unit_data_create(index, const_offset));
                const_offset = 0;
        } else if (is_shared_or_scratch) {
                /* Shared and scratch variables have no buffer index, and all
                 * start from a common base that we set up at the start of
                 * dispatch.
                 */
                if (instr->intrinsic == nir_intrinsic_load_scratch ||
                    instr->intrinsic == nir_intrinsic_store_scratch) {
                        base_offset = c->spill_base;
                } else {
                        base_offset = c->cs_shared_offset;
                        const_offset += nir_intrinsic_base(instr);
                }
        } else if (is_global) {
                /* Global load/store intrinsics use global addresses, so the
                 * offset is the target address and we don't need to add it
                 * to a base offset.
                 */
                base_offset = vir_uniform_ui(c, 0);
        } else {
                base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
                                          nir_src_as_uint(instr->src[is_store ?
                                                                     1 : 0]));
        }

        /* We are ready to emit TMU register writes now, but before we actually
         * emit them we need to flush outstanding TMU operations if any of our
         * writes reads from the result of an outstanding TMU operation before
         * we start the TMU sequence for this operation, since otherwise the
         * flush could happen in the middle of the TMU sequence we are about to
         * emit, which is illegal. To do this we run this logic twice, the
         * first time it will count required register writes and flush pending
         * TMU requests if necessary due to a dependency, and the second one
         * will emit the actual TMU writes.
         */
        const uint32_t dest_components = nir_intrinsic_dest_components(instr);
        uint32_t base_const_offset = const_offset;
        uint32_t writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
        uint32_t tmu_writes = 0;
        for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
                assert(mode == MODE_COUNT || tmu_writes > 0);

                uint32_t type_size = 4;

                if (is_store) {
                        emit_tmu_general_store_writes(c, mode, instr,
                                                      base_const_offset,
                                                      &writemask,
                                                      &const_offset,
                                                      &type_size,
                                                      &tmu_writes);
                } else if (!is_load && !atomic_add_replaced) {
                        emit_tmu_general_atomic_writes(c, mode, instr,
                                                       tmu_op, has_index,
                                                       &tmu_writes);
                } else if (is_load) {
                        type_size = nir_dest_bit_size(instr->dest) / 8;
                }

                /* For atomics we use 32bit, except for CMPXCHG, for which we
                 * need to use VEC2. For the rest of the cases we use the
                 * number of tmud writes we did to decide the type. For cache
                 * operations the type is ignored.
                 */
                uint32_t config = 0;
                if (mode == MODE_EMIT) {
                        uint32_t num_components;
                        if (is_load || atomic_add_replaced) {
                                num_components = instr->num_components;
                        } else {
                                assert(tmu_writes > 0);
                                num_components = tmu_writes - 1;
                        }
                        bool is_atomic =
                                v3d_tmu_get_type_from_op(tmu_op, !is_load) ==
                                V3D_TMU_OP_TYPE_ATOMIC;

                        uint32_t perquad =
                                is_load && !vir_in_nonuniform_control_flow(c)
                                ? GENERAL_TMU_LOOKUP_PER_QUAD
                                : GENERAL_TMU_LOOKUP_PER_PIXEL;
                        config = 0xffffff00 | tmu_op << 3 | perquad;

                        if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
                                config |= GENERAL_TMU_LOOKUP_TYPE_VEC2;
                        } else if (is_atomic || num_components == 1) {
                                switch (type_size) {
                                case 4:
                                        config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
                                        break;
                                case 2:
                                        config |= GENERAL_TMU_LOOKUP_TYPE_16BIT_UI;
                                        break;
                                case 1:
                                        config |= GENERAL_TMU_LOOKUP_TYPE_8BIT_UI;
                                        break;
                                default:
                                        unreachable("Unsupported bitsize");
                                }
                        } else {
                                assert(type_size == 4);
                                config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
                                          num_components - 2;
                        }
                }

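                /* Illustrative example of the type field above (derived from
                 * the GENERAL_TMU_LOOKUP_* defines at the top of this file):
                 * a two-component 32-bit load ends up with num_components == 2
                 * and selects GENERAL_TMU_LOOKUP_TYPE_VEC2 (VEC2 + 2 - 2),
                 * while a vec4 load selects GENERAL_TMU_LOOKUP_TYPE_VEC4
                 * (VEC2 + 4 - 2). Scalar and atomic accesses instead pick one
                 * of the 8/16/32-bit UI types based on type_size.
                 */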
                emit_tmu_general_address_write(c, mode, instr, config,
                                               dynamic_src, offset_src,
                                               base_offset, const_offset,
                                               &tmu_writes);

                assert(tmu_writes > 0);
                if (mode == MODE_COUNT) {
                        /* Make sure we won't exceed the 16-entry TMU
                         * fifo if each thread is storing at the same
                         * time.
                         */
                        while (tmu_writes > 16 / c->threads)
                                c->threads /= 2;

                        /* If pipelining this TMU operation would
                         * overflow TMU fifos, we need to flush.
                         */
                        if (ntq_tmu_fifo_overflow(c, dest_components))
                                ntq_flush_tmu(c);
                } else {
                        /* Delay emission of the thread switch and
                         * LDTMU/TMUWT until we really need to do it to
                         * improve pipelining.
                         */
                        const uint32_t component_mask =
                                (1 << dest_components) - 1;
                        ntq_add_pending_tmu_flush(c, &instr->dest,
                                                  component_mask);
                }
        }

        /* nir_lower_wrmasks should've ensured that any writemask on a store
         * operation only has consecutive bits set, in which case we should've
         * processed the full writemask above.
         */
        assert(writemask == 0);
}

static struct qreg *
ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
{
        struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
                                          def->num_components);
        _mesa_hash_table_insert(c->def_ht, def, qregs);
        return qregs;
}

static bool
is_ld_signal(const struct v3d_qpu_sig *sig)
{
        return (sig->ldunif ||
                sig->ldunifa ||
                sig->ldunifrf ||
                sig->ldunifarf ||
                sig->ldtmu ||
                sig->ldvary ||
                sig->ldvpm ||
                sig->ldtlb ||
                sig->ldtlbu);
}

static inline bool
is_ldunif_signal(const struct v3d_qpu_sig *sig)
{
        return sig->ldunif || sig->ldunifrf;
}

/**
 * This function is responsible for getting VIR results into the associated
 * storage for a NIR instruction.
 *
 * If it's a NIR SSA def, then we just set the associated hash table entry to
 * the new result.
 *
 * If it's a NIR reg, then we need to update the existing qreg assigned to the
 * NIR destination with the incoming value. To do that without introducing
 * new MOVs, we require that the incoming qreg either be a uniform, or be
 * SSA-defined by the previous VIR instruction in the block and rewritable by
 * this function. That lets us sneak ahead and insert the SF flag beforehand
 * (knowing that the previous instruction doesn't depend on flags) and rewrite
 * its destination to be the NIR reg's destination.
 */
void
ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
               struct qreg result)
{
        struct qinst *last_inst = NULL;
        if (!list_is_empty(&c->cur_block->instructions))
                last_inst = (struct qinst *)c->cur_block->instructions.prev;

        bool is_reused_uniform =
                is_ldunif_signal(&c->defs[result.index]->qpu.sig) &&
                last_inst != c->defs[result.index];

        assert(result.file == QFILE_TEMP && last_inst &&
               (last_inst == c->defs[result.index] || is_reused_uniform));

        if (dest->is_ssa) {
                assert(chan < dest->ssa.num_components);

                struct qreg *qregs;
                struct hash_entry *entry =
                        _mesa_hash_table_search(c->def_ht, &dest->ssa);

                if (entry)
                        qregs = entry->data;
                else
                        qregs = ntq_init_ssa_def(c, &dest->ssa);

                qregs[chan] = result;
        } else {
                nir_register *reg = dest->reg.reg;
                assert(dest->reg.base_offset == 0);
                assert(reg->num_array_elems == 0);
                struct hash_entry *entry =
                        _mesa_hash_table_search(c->def_ht, reg);
                struct qreg *qregs = entry->data;

                /* If the previous instruction can't be predicated for
                 * the store into the nir_register, then emit a MOV
                 * that can be.
                 */
                if (is_reused_uniform ||
                    (vir_in_nonuniform_control_flow(c) &&
                     is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig))) {
                        result = vir_MOV(c, result);
                        last_inst = c->defs[result.index];
                }

                /* We know they're both temps, so just rewrite index. */
                c->defs[last_inst->dst.index] = NULL;
                last_inst->dst.index = qregs[chan].index;

                /* If we're in control flow, then make this update of the reg
                 * conditional on the execution mask.
                 */
                if (vir_in_nonuniform_control_flow(c)) {
                        last_inst->dst.index = qregs[chan].index;

                        /* Set the flags to the current exec mask.
                         */
                        c->cursor = vir_before_inst(last_inst);
                        vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
                                   V3D_QPU_PF_PUSHZ);
                        c->cursor = vir_after_inst(last_inst);

                        vir_set_cond(last_inst, V3D_QPU_COND_IFA);
                }
        }
}

/**
 * This looks up the qreg associated with a particular ssa/reg used as a source
 * in any instruction.
 *
 * It is expected that the definition for any NIR value read as a source has
 * been emitted by a previous instruction; however, in the case of TMU
 * operations we may have postponed emission of the thread switch and LDTMUs
 * required to read the TMU results until the results are actually used to
 * improve pipelining, which then would lead to us not finding them here
 * (for SSA defs) or finding them in the list of registers awaiting a TMU flush
 * (for registers), meaning that we need to flush outstanding TMU operations
 * to read the correct value.
 */
struct qreg
ntq_get_src(struct v3d_compile *c, nir_src src, int i)
{
        struct hash_entry *entry;
        if (src.is_ssa) {
                assert(i < src.ssa->num_components);

                entry = _mesa_hash_table_search(c->def_ht, src.ssa);
                if (!entry) {
                        ntq_flush_tmu(c);
                        entry = _mesa_hash_table_search(c->def_ht, src.ssa);
                }
        } else {
                nir_register *reg = src.reg.reg;
                assert(reg->num_array_elems == 0);
                assert(src.reg.base_offset == 0);
                assert(i < reg->num_components);

                if (_mesa_set_search(c->tmu.outstanding_regs, reg))
                        ntq_flush_tmu(c);
                entry = _mesa_hash_table_search(c->def_ht, reg);
        }
        assert(entry);

        struct qreg *qregs = entry->data;
        return qregs[i];
}

static struct qreg
ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
                unsigned src)
{
        assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
        unsigned chan = ffs(instr->dest.write_mask) - 1;
        struct qreg r = ntq_get_src(c, instr->src[src].src,
                                    instr->src[src].swizzle[chan]);

        assert(!instr->src[src].abs);
        assert(!instr->src[src].negate);

        return r;
};

static struct qreg
ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
{
        return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));
}

static void
ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
{
        unsigned unit = instr->texture_index;
        int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
        int dest_size = nir_tex_instr_dest_size(instr);

        struct qreg lod = c->undef;
        if (lod_index != -1)
                lod = ntq_get_src(c, instr->src[lod_index].src, 0);

        for (int i = 0; i < dest_size; i++) {
                assert(i < 3);
                enum quniform_contents contents;

                if (instr->is_array && i == dest_size - 1)
                        contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
                else
                        contents = QUNIFORM_TEXTURE_WIDTH + i;

                struct qreg size = vir_uniform(c, contents, unit);

                switch (instr->sampler_dim) {
                case GLSL_SAMPLER_DIM_1D:
                case GLSL_SAMPLER_DIM_2D:
                case GLSL_SAMPLER_DIM_MS:
                case GLSL_SAMPLER_DIM_3D:
                case GLSL_SAMPLER_DIM_CUBE:
                case GLSL_SAMPLER_DIM_BUF:
                        /* Don't minify the array size. */
                        if (!(instr->is_array && i == dest_size - 1)) {
                                size = ntq_minify(c, size, lod);
                        }
                        break;

                case GLSL_SAMPLER_DIM_RECT:
                        /* There's no LOD field for rects */
                        break;

                default:
                        unreachable("Bad sampler type");
                }

                ntq_store_dest(c, &instr->dest, i, size);
        }
}

static void
ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
{
        unsigned unit = instr->texture_index;

        /* Since each texture sampling op requires uploading uniforms to
         * reference the texture, there's no HW support for texture size and
         * you just upload uniforms containing the size.
         */
        switch (instr->op) {
        case nir_texop_query_levels:
                ntq_store_dest(c, &instr->dest, 0,
                               vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
                return;
        case nir_texop_texture_samples:
                ntq_store_dest(c, &instr->dest, 0,
                               vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit));
                return;
        case nir_texop_txs:
                ntq_emit_txs(c, instr);
                return;
        default:
                break;
        }

        if (c->devinfo->ver >= 40)
                v3d40_vir_emit_tex(c, instr);
        else
                v3d33_vir_emit_tex(c, instr);
}

static struct qreg
ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
{
        struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
        if (is_cos)
                input = vir_FADD(c, input, vir_uniform_f(c, 0.5));

        struct qreg periods = vir_FROUND(c, input);
        struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods));
        return vir_XOR(c, sin_output, vir_SHL(c,
                                              vir_FTOIN(c, periods),
                                              vir_uniform_ui(c, -1)));
}

static struct qreg
ntq_fsign(struct v3d_compile *c, struct qreg src)
{
        struct qreg t = vir_get_temp(c);

        vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
        vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ);
        vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
        vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN);
        vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
        return vir_MOV(c, t);
}

static void
emit_fragcoord_input(struct v3d_compile *c, int attr)
{
        c->inputs[attr * 4 + 0] = vir_FXCD(c);
        c->inputs[attr * 4 + 1] = vir_FYCD(c);
        c->inputs[attr * 4 + 2] = c->payload_z;
        c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
}

static struct qreg
emit_smooth_varying(struct v3d_compile *c,
                    struct qreg vary, struct qreg w, struct qreg r5)
{
        return vir_FADD(c, vir_FMUL(c, vary, w), r5);
}

static struct qreg
emit_noperspective_varying(struct v3d_compile *c,
                           struct qreg vary, struct qreg r5)
{
        return vir_FADD(c, vir_MOV(c, vary), r5);
}

static struct qreg
emit_flat_varying(struct v3d_compile *c,
                  struct qreg vary, struct qreg r5)
{
        vir_MOV_dest(c, c->undef, vary);
        return vir_MOV(c, r5);
}

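/* Informal summary of the three helpers above, as used by
 * emit_fragment_varying() below: the ldvary sequence produces a coefficient
 * pair per varying, with 'vary' holding the multiplier and r5 the C
 * coefficient. Smooth varyings then compute vary * W + C, noperspective
 * varyings compute vary + C, and flat varyings just take C (the MOV of
 * 'vary' to an undef destination presumably only consumes the ldvary
 * result).
 */
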
static struct qreg
emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
                      int8_t input_idx, uint8_t swizzle, int array_index)
{
        struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
        struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);

        struct qinst *ldvary = NULL;
        struct qreg vary;
        if (c->devinfo->ver >= 41) {
                ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
                                      c->undef, c->undef);
                ldvary->qpu.sig.ldvary = true;
                vary = vir_emit_def(c, ldvary);
        } else {
                vir_NOP(c)->qpu.sig.ldvary = true;
                vary = r3;
        }

        /* Store the input value before interpolation so we can implement
         * GLSL's interpolateAt functions if the shader uses them.
         */
        if (input_idx >= 0) {
                assert(var);
                c->interp[input_idx].vp = vary;
                c->interp[input_idx].C = vir_MOV(c, r5);
                c->interp[input_idx].mode = var->data.interpolation;
        }

        /* For gl_PointCoord input or distance along a line, we'll be called
         * with no nir_variable, and we don't count toward VPM size so we
         * don't track an input slot.
         */
        if (!var) {
                assert(input_idx < 0);
                return emit_smooth_varying(c, vary, c->payload_w, r5);
        }

        int i = c->num_inputs++;
        c->input_slots[i] =
                v3d_slot_from_slot_and_component(var->data.location +
                                                 array_index, swizzle);

        struct qreg result;
        switch (var->data.interpolation) {
        case INTERP_MODE_NONE:
        case INTERP_MODE_SMOOTH:
                if (var->data.centroid) {
                        BITSET_SET(c->centroid_flags, i);
                        result = emit_smooth_varying(c, vary,
                                                     c->payload_w_centroid, r5);
                } else {
                        result = emit_smooth_varying(c, vary, c->payload_w, r5);
                }
                break;

        case INTERP_MODE_NOPERSPECTIVE:
                BITSET_SET(c->noperspective_flags, i);
                result = emit_noperspective_varying(c, vary, r5);
                break;

        case INTERP_MODE_FLAT:
                BITSET_SET(c->flat_shade_flags, i);
                result = emit_flat_varying(c, vary, r5);
                break;

        default:
                unreachable("Bad interp mode");
        }

        if (input_idx >= 0)
                c->inputs[input_idx] = result;
        return result;
}

static void
emit_fragment_input(struct v3d_compile *c, int base_attr, nir_variable *var,
                    int array_index, unsigned nelem)
{
        for (int i = 0; i < nelem ; i++) {
                int chan = var->data.location_frac + i;
                int input_idx = (base_attr + array_index) * 4 + chan;
                emit_fragment_varying(c, var, input_idx, chan, array_index);
        }
}

static void
emit_compact_fragment_input(struct v3d_compile *c, int attr, nir_variable *var,
                            int array_index)
{
        /* Compact variables are scalar arrays where each set of 4 elements
         * consumes a single location.
         */
        int loc_offset = array_index / 4;
        int chan = var->data.location_frac + array_index % 4;
        int input_idx = (attr + loc_offset) * 4 + chan;
        emit_fragment_varying(c, var, input_idx, chan, loc_offset);
}

static void
add_output(struct v3d_compile *c,
           uint32_t decl_offset,
           uint8_t slot,
           uint8_t swizzle)
{
        uint32_t old_array_size = c->outputs_array_size;
        resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
                          decl_offset + 1);

        if (old_array_size != c->outputs_array_size) {
                c->output_slots = reralloc(c,
                                           c->output_slots,
                                           struct v3d_varying_slot,
                                           c->outputs_array_size);
        }

        c->output_slots[decl_offset] =
                v3d_slot_from_slot_and_component(slot, swizzle);
}

/**
 * If compare_instr is a valid comparison instruction, emits the
 * compare_instr's comparison and returns, in *out_cond, the condition code
 * that selects on its result.
 */
static bool
ntq_emit_comparison(struct v3d_compile *c,
                    nir_alu_instr *compare_instr,
                    enum v3d_qpu_cond *out_cond)
{
        struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
        struct qreg src1;
        if (nir_op_infos[compare_instr->op].num_inputs > 1)
                src1 = ntq_get_alu_src(c, compare_instr, 1);
        bool cond_invert = false;
        struct qreg nop = vir_nop_reg();

        switch (compare_instr->op) {
        case nir_op_feq32:
        case nir_op_seq:
                vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                break;
        case nir_op_ieq32:
                vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                break;

        case nir_op_fneu32:
        case nir_op_sne:
                vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                cond_invert = true;
                break;
        case nir_op_ine32:
                vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                cond_invert = true;
                break;

        case nir_op_fge32:
        case nir_op_sge:
                vir_set_pf(c, vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
                break;
        case nir_op_ige32:
                vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
                cond_invert = true;
                break;
        case nir_op_uge32:
                vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
                cond_invert = true;
                break;

        case nir_op_slt:
        case nir_op_flt32:
                vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN);
                break;
        case nir_op_ilt32:
                vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
                break;
        case nir_op_ult32:
                vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
                break;

        case nir_op_i2b32:
                vir_set_pf(c, vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
                cond_invert = true;
                break;

        case nir_op_f2b32:
                vir_set_pf(c, vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
                cond_invert = true;
                break;

        default:
                return false;
        }

        *out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA;

        return true;
}

/* Finds an ALU instruction that generates our src value that could
 * (potentially) be greedily emitted in the consuming instruction.
 */
static struct nir_alu_instr *
ntq_get_alu_parent(nir_src src)
{
        if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu)
                return NULL;
        nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);
        if (!instr)
                return NULL;

        /* If the ALU instr's srcs are non-SSA, then we would have to avoid
         * moving emission of the ALU instr down past another write of the
         * src.
         */
        for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
                if (!instr->src[i].src.is_ssa)
                        return NULL;
        }

        return instr;
}

/* Turns a NIR bool into a condition code to predicate on. */
static enum v3d_qpu_cond
ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src)
{
        struct qreg qsrc = ntq_get_src(c, src, 0);
        /* skip if we already have src in the flags */
        if (qsrc.file == QFILE_TEMP && c->flags_temp == qsrc.index)
                return c->flags_cond;

        nir_alu_instr *compare = ntq_get_alu_parent(src);
        if (!compare)
                goto out;

        enum v3d_qpu_cond cond;
        if (ntq_emit_comparison(c, compare, &cond))
                return cond;

out:

        vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)),
                   V3D_QPU_PF_PUSHZ);
        return V3D_QPU_COND_IFNA;
}

static struct qreg
ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond)
{
        struct qreg result =
                vir_MOV(c, vir_SEL(c, cond,
                                   vir_uniform_ui(c, ~0),
                                   vir_uniform_ui(c, 0)));
        c->flags_temp = result.index;
        c->flags_cond = cond;
        return result;
}

static struct qreg
ntq_emit_cond_to_int(struct v3d_compile *c, enum v3d_qpu_cond cond)
{
        struct qreg result =
                vir_MOV(c, vir_SEL(c, cond,
                                   vir_uniform_ui(c, 1),
                                   vir_uniform_ui(c, 0)));
        c->flags_temp = result.index;
        c->flags_cond = cond;
        return result;
}

static struct qreg
f2f16_rtz(struct v3d_compile *c, struct qreg f32)
{
        /* The GPU doesn't provide a mechanism to modify the f32->f16 rounding
         * method and seems to be using RTE by default, so we need to implement
         * RTZ rounding in software :-(
         *
         * The implementation identifies the cases where RTZ applies and
         * returns the correct result and for everything else, it just uses
         * the default RTE conversion.
         */
        static bool _first = true;
        if (_first && unlikely(V3D_DEBUG & V3D_DEBUG_PERF)) {
                fprintf(stderr, "Shader uses round-toward-zero f32->f16 "
                        "conversion which is not supported in hardware.\n");
                _first = false;
        }

        struct qinst *inst;
        struct qreg tmp;

        struct qreg result = vir_get_temp(c);

        struct qreg mantissa32 = vir_AND(c, f32, vir_uniform_ui(c, 0x007fffff));

        /* Compute sign bit of result */
        struct qreg sign = vir_AND(c, vir_SHR(c, f32, vir_uniform_ui(c, 16)),
                                   vir_uniform_ui(c, 0x8000));

        /* Check the cases where RTZ rounding is relevant based on exponent */
        struct qreg exp32 = vir_AND(c, vir_SHR(c, f32, vir_uniform_ui(c, 23)),
                                    vir_uniform_ui(c, 0xff));
        struct qreg exp16 = vir_ADD(c, exp32, vir_uniform_ui(c, -127 + 15));

        /* if (exp16 > 30) */
        inst = vir_MIN_dest(c, vir_nop_reg(), exp16, vir_uniform_ui(c, 30));
        vir_set_pf(c, inst, V3D_QPU_PF_PUSHC);
        inst = vir_OR_dest(c, result, sign, vir_uniform_ui(c, 0x7bff));
        vir_set_cond(inst, V3D_QPU_COND_IFA);

        /* if (exp16 <= 30) */
        inst = vir_OR_dest(c, result,
                           vir_OR(c, sign,
                                  vir_SHL(c, exp16, vir_uniform_ui(c, 10))),
                           vir_SHR(c, mantissa32, vir_uniform_ui(c, 13)));
        vir_set_cond(inst, V3D_QPU_COND_IFNA);

        /* if (exp16 <= 0) */
        inst = vir_MIN_dest(c, vir_nop_reg(), exp16, vir_uniform_ui(c, 0));
        vir_set_pf(c, inst, V3D_QPU_PF_PUSHC);

        tmp = vir_OR(c, mantissa32, vir_uniform_ui(c, 0x800000));
        tmp = vir_SHR(c, tmp, vir_SUB(c, vir_uniform_ui(c, 14), exp16));
        inst = vir_OR_dest(c, result, sign, tmp);
        vir_set_cond(inst, V3D_QPU_COND_IFNA);

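        /* Rough sketch of the exponent math above: exp16 = exp32 - 127 + 15
         * rebiases the f32 exponent to the f16 bias, so e.g. 1.0f
         * (exp32 == 127) maps to exp16 == 15. Values with exp16 > 30 can't be
         * represented as a normal f16, so RTZ clamps them to the largest
         * finite magnitude (sign | 0x7bff), while exp16 <= 0 takes the
         * denormal path that shifts the implicit-1-extended mantissa right by
         * 14 - exp16.
         */
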
        /* Cases where RTZ mode is not relevant: use default RTE conversion.
         *
         * The cases that are not affected by RTZ are:
         *
         *    exp16 < -10 || exp32 == 0 || exp32 == 0xff
         *
         * In V3D we can implement this condition as:
         *
         *   !((exp16 >= -10) && !(exp32 == 0) && !(exp32 == 0xff)))
         */

        /* exp16 >= -10 */
        inst = vir_MIN_dest(c, vir_nop_reg(), exp16, vir_uniform_ui(c, -10));
        vir_set_pf(c, inst, V3D_QPU_PF_PUSHC);

        /* && !(exp32 == 0) */
        inst = vir_MOV_dest(c, vir_nop_reg(), exp32);
        vir_set_uf(c, inst, V3D_QPU_UF_ANDNZ);

        /* && !(exp32 == 0xff) */
        inst = vir_XOR_dest(c, vir_nop_reg(), exp32, vir_uniform_ui(c, 0xff));
        vir_set_uf(c, inst, V3D_QPU_UF_ANDNZ);

        /* Use regular RTE conversion if condition is False */
        inst = vir_FMOV_dest(c, result, f32);
        vir_set_pack(inst, V3D_QPU_PACK_L);
        vir_set_cond(inst, V3D_QPU_COND_IFNA);

        return vir_MOV(c, result);
}

/**
 * Takes the result value of a signed integer width conversion from a smaller
 * type to a larger type and if needed, it applies sign extension to it.
 */
static struct qreg
sign_extend(struct v3d_compile *c,
            struct qreg value,
            uint32_t src_bit_size,
            uint32_t dst_bit_size)
{
        assert(src_bit_size < dst_bit_size);

        struct qreg tmp = vir_MOV(c, value);

        /* Do we need to sign-extend? */
        uint32_t sign_mask = 1 << (src_bit_size - 1);
        struct qinst *sign_check =
                vir_AND_dest(c, vir_nop_reg(),
                             tmp, vir_uniform_ui(c, sign_mask));
        vir_set_pf(c, sign_check, V3D_QPU_PF_PUSHZ);

        /* If so, fill in leading sign bits */
        uint32_t extend_bits = ~(((1 << src_bit_size) - 1)) &
                               ((1ull << dst_bit_size) - 1);
        struct qinst *extend_inst =
                vir_OR_dest(c, tmp, tmp,
                            vir_uniform_ui(c, extend_bits));
        vir_set_cond(extend_inst, V3D_QPU_COND_IFNA);

        return tmp;
}

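/* Worked example of the masks computed in sign_extend() above (illustration
 * only): for an 8-bit to 32-bit conversion, sign_mask is 0x80 and extend_bits
 * is 0xffffff00, so an input of 0x80 (-128) becomes 0x80 | 0xffffff00 =
 * 0xffffff80, i.e. -128 as a 32-bit integer.
 */
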
static void
ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
{
        /* This should always be lowered to ALU operations for V3D. */
        assert(!instr->dest.saturate);

        /* Vectors are special in that they have non-scalarized writemasks,
         * and just take the first swizzle channel for each argument in order
         * into each writemask channel.
         */
        if (instr->op == nir_op_vec2 ||
            instr->op == nir_op_vec3 ||
            instr->op == nir_op_vec4) {
                struct qreg srcs[4];
                for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
                        srcs[i] = ntq_get_src(c, instr->src[i].src,
                                              instr->src[i].swizzle[0]);
                for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
                        ntq_store_dest(c, &instr->dest.dest, i,
                                       vir_MOV(c, srcs[i]));
                return;
        }

        /* General case: We can just grab the one used channel per src. */
        struct qreg src[nir_op_infos[instr->op].num_inputs];
        for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
                src[i] = ntq_get_alu_src(c, instr, i);
        }

        struct qreg result;

        switch (instr->op) {
        case nir_op_mov:
                result = vir_MOV(c, src[0]);
                break;

        case nir_op_fneg:
                result = vir_XOR(c, src[0], vir_uniform_ui(c, 1 << 31));
                break;
        case nir_op_ineg:
                result = vir_NEG(c, src[0]);
                break;

        case nir_op_fmul:
                result = vir_FMUL(c, src[0], src[1]);
                break;
        case nir_op_fadd:
                result = vir_FADD(c, src[0], src[1]);
                break;
        case nir_op_fsub:
                result = vir_FSUB(c, src[0], src[1]);
                break;
        case nir_op_fmin:
                result = vir_FMIN(c, src[0], src[1]);
                break;
        case nir_op_fmax:
                result = vir_FMAX(c, src[0], src[1]);
                break;

        case nir_op_f2i32: {
                nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src);
                if (src0_alu && src0_alu->op == nir_op_fround_even) {
                        result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0));
                } else {
                        result = vir_FTOIZ(c, src[0]);
                }
                break;
        }

        case nir_op_f2u32:
                result = vir_FTOUZ(c, src[0]);
                break;
        case nir_op_i2f32:
                result = vir_ITOF(c, src[0]);
                break;
        case nir_op_u2f32:
                result = vir_UTOF(c, src[0]);
                break;
        case nir_op_b2f32:
                result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
                break;
        case nir_op_b2i32:
                result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
                break;

        case nir_op_f2f16:
        case nir_op_f2f16_rtne:
                assert(nir_src_bit_size(instr->src[0].src) == 32);
                result = vir_FMOV(c, src[0]);
                vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L);
                break;

        case nir_op_f2f16_rtz:
                assert(nir_src_bit_size(instr->src[0].src) == 32);
                result = f2f16_rtz(c, src[0]);
                break;

        case nir_op_f2f32:
                assert(nir_src_bit_size(instr->src[0].src) == 16);
                result = vir_FMOV(c, src[0]);
                vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
                break;

        case nir_op_i2i16: {
                uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
                assert(bit_size == 32 || bit_size == 8);
                if (bit_size == 32) {
                        /* We don't have integer pack/unpack methods for
                         * converting between 16-bit and 32-bit, so we implement
                         * the conversion manually by truncating the src.
                         */
                        result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff));
                } else {
                        struct qreg tmp = vir_AND(c, src[0],
                                                  vir_uniform_ui(c, 0xff));
                        result = vir_MOV(c, sign_extend(c, tmp, bit_size, 16));
                }
                break;
        }

        case nir_op_u2u16: {
                uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
                assert(bit_size == 32 || bit_size == 8);

                /* We don't have integer pack/unpack methods for converting
                 * between 16-bit and 32-bit, so we implement the conversion
                 * manually by truncating the src. For the 8-bit case, we
                 * want to make sure we don't copy garbage from any of the
                 * 24 MSB bits.
                 */
                if (bit_size == 32)
                        result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff));
                else
                        result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff));
                break;
        }

        case nir_op_i2i8:
        case nir_op_u2u8:
                assert(nir_src_bit_size(instr->src[0].src) == 32 ||
                       nir_src_bit_size(instr->src[0].src) == 16);
                /* We don't have integer pack/unpack methods for converting
                 * between 8-bit and 32-bit, so we implement the conversion
                 * manually by truncating the src.
                 */
                result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff));
                break;

        case nir_op_u2u32: {
                uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
                assert(bit_size == 16 || bit_size == 8);

                /* We don't have a native 8-bit/16-bit MOV so we copy all 32
                 * bits from the src, but we make sure to clear any garbage
                 * bits that may be present in the invalid src bits.
                 */
                uint32_t mask = (1 << bit_size) - 1;
                result = vir_AND(c, src[0], vir_uniform_ui(c, mask));
                break;
        }

        case nir_op_i2i32: {
                uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
                assert(bit_size == 16 || bit_size == 8);

                uint32_t mask = (1 << bit_size) - 1;
                struct qreg tmp = vir_AND(c, src[0],
                                          vir_uniform_ui(c, mask));

                result = vir_MOV(c, sign_extend(c, tmp, bit_size, 32));
                break;
        }

        case nir_op_iadd:
                result = vir_ADD(c, src[0], src[1]);
                break;
        case nir_op_ushr:
                result = vir_SHR(c, src[0], src[1]);
                break;
        case nir_op_isub:
                result = vir_SUB(c, src[0], src[1]);
                break;
        case nir_op_ishr:
                result = vir_ASR(c, src[0], src[1]);
                break;
        case nir_op_ishl:
                result = vir_SHL(c, src[0], src[1]);
                break;
        case nir_op_imin:
                result = vir_MIN(c, src[0], src[1]);
                break;
        case nir_op_umin:
                result = vir_UMIN(c, src[0], src[1]);
                break;
        case nir_op_imax:
                result = vir_MAX(c, src[0], src[1]);
                break;
        case nir_op_umax:
                result = vir_UMAX(c, src[0], src[1]);
                break;
        case nir_op_iand:
                result = vir_AND(c, src[0], src[1]);
                break;
        case nir_op_ior:
                result = vir_OR(c, src[0], src[1]);
                break;
        case nir_op_ixor:
                result = vir_XOR(c, src[0], src[1]);
                break;
        case nir_op_inot:
                result = vir_NOT(c, src[0]);
                break;

        case nir_op_ufind_msb:
                result = vir_SUB(c, vir_uniform_ui(c, 31), vir_CLZ(c, src[0]));
                break;

        case nir_op_imul:
                result = vir_UMUL(c, src[0], src[1]);
                break;

        case nir_op_seq:
        case nir_op_sne:
        case nir_op_sge:
        case nir_op_slt: {
                enum v3d_qpu_cond cond;
                ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
                assert(ok);
                result = vir_MOV(c, vir_SEL(c, cond,
                                            vir_uniform_f(c, 1.0),
                                            vir_uniform_f(c, 0.0)));
                c->flags_temp = result.index;
                c->flags_cond = cond;
                break;
        }

        case nir_op_i2b32:
        case nir_op_f2b32:
        case nir_op_feq32:
        case nir_op_fneu32:
        case nir_op_fge32:
        case nir_op_flt32:
        case nir_op_ieq32:
        case nir_op_ine32:
        case nir_op_ige32:
        case nir_op_uge32:
        case nir_op_ilt32:
        case nir_op_ult32: {
                enum v3d_qpu_cond cond;
                ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
                assert(ok);
                result = ntq_emit_cond_to_bool(c, cond);
                break;
        }

        case nir_op_b32csel:
                result = vir_MOV(c,
                                 vir_SEL(c,
                                         ntq_emit_bool_to_cond(c, instr->src[0].src),
                                         src[1], src[2]));
                break;

        case nir_op_fcsel:
                vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), src[0]),
                           V3D_QPU_PF_PUSHZ);
                result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
                                            src[1], src[2]));
                break;

        case nir_op_frcp:
                result = vir_RECIP(c, src[0]);
                break;
        case nir_op_frsq:
                result = vir_RSQRT(c, src[0]);
                break;
        case nir_op_fexp2:
                result = vir_EXP(c, src[0]);
                break;
        case nir_op_flog2:
                result = vir_LOG(c, src[0]);
                break;

        case nir_op_fceil:
                result = vir_FCEIL(c, src[0]);
                break;
        case nir_op_ffloor:
                result = vir_FFLOOR(c, src[0]);
                break;
        case nir_op_fround_even:
                result = vir_FROUND(c, src[0]);
                break;
        case nir_op_ftrunc:
                result = vir_FTRUNC(c, src[0]);
                break;

        case nir_op_fsin:
                result = ntq_fsincos(c, src[0], false);
                break;
        case nir_op_fcos:
                result = ntq_fsincos(c, src[0], true);
                break;

        case nir_op_fsign:
                result = ntq_fsign(c, src[0]);
                break;

        case nir_op_fabs: {
                result = vir_FMOV(c, src[0]);
                vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);
                break;
        }

        case nir_op_iabs:
                result = vir_MAX(c, src[0], vir_NEG(c, src[0]));
                break;

        case nir_op_fddx:
        case nir_op_fddx_coarse:
        case nir_op_fddx_fine:
                result = vir_FDX(c, src[0]);
                break;

        case nir_op_fddy:
        case nir_op_fddy_coarse:
        case nir_op_fddy_fine:
                result = vir_FDY(c, src[0]);
                break;

        case nir_op_uadd_carry:
                vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),
                           V3D_QPU_PF_PUSHC);
                result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA);
                break;

        case nir_op_usub_borrow:
                vir_set_pf(c, vir_SUB_dest(c, vir_nop_reg(), src[0], src[1]),
                           V3D_QPU_PF_PUSHC);
                result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA);
                break;

        case nir_op_pack_half_2x16_split:
                result = vir_VFPACK(c, src[0], src[1]);
                break;

        case nir_op_unpack_half_2x16_split_x:
                result = vir_FMOV(c, src[0]);
                vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
                break;

        case nir_op_unpack_half_2x16_split_y:
                result = vir_FMOV(c, src[0]);
                vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H);
                break;

        case nir_op_fquantize2f16: {
                /* F32 -> F16 -> F32 conversion */
                struct qreg tmp = vir_FMOV(c, src[0]);
                vir_set_pack(c->defs[tmp.index], V3D_QPU_PACK_L);
                tmp = vir_FMOV(c, tmp);
                vir_set_unpack(c->defs[tmp.index], 0, V3D_QPU_UNPACK_L);

                /* Check for denorm */
                struct qreg abs_src = vir_FMOV(c, src[0]);
                vir_set_unpack(c->defs[abs_src.index], 0, V3D_QPU_UNPACK_ABS);
                struct qreg threshold = vir_uniform_f(c, ldexpf(1.0, -14));
                vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), abs_src, threshold),
                           V3D_QPU_PF_PUSHC);

                /* Return +/-0 for denorms */
                struct qreg zero =
                        vir_AND(c, src[0], vir_uniform_ui(c, 0x80000000));
                result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero));
                break;
        }

        default:
                fprintf(stderr, "unknown NIR ALU inst: ");
                nir_print_instr(&instr->instr, stderr);
                fprintf(stderr, "\n");
                abort();
        }

        /* We have a scalar result, so the instruction should only have a
         * single channel written to.
         */
        assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
        ntq_store_dest(c, &instr->dest.dest,
                       ffs(instr->dest.write_mask) - 1, result);
}

/* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
 * specifier. They come from a register that's preloaded with 0xffffffff
 * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low
 * 8 bits are shifted off the bottom and 0xff shifted in from the top.
 */
#define TLB_TYPE_F16_COLOR         (3 << 6)
#define TLB_TYPE_I32_COLOR         (1 << 6)
#define TLB_TYPE_F32_COLOR         (0 << 6)
#define TLB_RENDER_TARGET_SHIFT    3 /* Reversed!  7 = RT 0, 0 = RT 7. */
#define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)
#define TLB_SAMPLE_MODE_PER_PIXEL  (1 << 2)
#define TLB_F16_SWAP_HI_LO         (1 << 1)
#define TLB_VEC_SIZE_4_F16         (1 << 0)
#define TLB_VEC_SIZE_2_F16         (0 << 0)
#define TLB_VEC_SIZE_MINUS_1_SHIFT 0

/* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
 * flag is set.
 */
#define TLB_TYPE_DEPTH             ((2 << 6) | (0 << 4))
#define TLB_DEPTH_TYPE_INVARIANT   (0 << 2) /* Unmodified sideband input used */
#define TLB_DEPTH_TYPE_PER_PIXEL   (1 << 2) /* QPU result used */
#define TLB_V42_DEPTH_TYPE_INVARIANT   (0 << 3) /* Unmodified sideband input used */
#define TLB_V42_DEPTH_TYPE_PER_PIXEL   (1 << 3) /* QPU result used */

/* Stencil is a single 32-bit write. */
#define TLB_TYPE_STENCIL_ALPHA     ((2 << 6) | (1 << 4))

static void
vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
{
        if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])
                return;

        struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);
        struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);

        nir_variable *var = c->output_color_var[rt];
        int num_components = glsl_get_vector_elements(var->type);
        uint32_t conf = 0xffffff00;
        struct qinst *inst;

        conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE :
                                            TLB_SAMPLE_MODE_PER_PIXEL;
        conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;

        if (c->fs_key->swap_color_rb & (1 << rt))
                num_components = MAX2(num_components, 3);
        assert(num_components != 0);

        enum glsl_base_type type = glsl_get_base_type(var->type);
        bool is_int_format = type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT;
        bool is_32b_tlb_format = is_int_format ||
                                 (c->fs_key->f32_color_rb & (1 << rt));

        if (is_int_format) {
                /* The F32 vs I32 distinction was dropped in 4.2. */
                if (c->devinfo->ver < 42)
                        conf |= TLB_TYPE_I32_COLOR;
                else
                        conf |= TLB_TYPE_F32_COLOR;
                conf |= ((num_components - 1) << TLB_VEC_SIZE_MINUS_1_SHIFT);
        } else {
                if (c->fs_key->f32_color_rb & (1 << rt)) {
                        conf |= TLB_TYPE_F32_COLOR;
                        conf |= ((num_components - 1) <<
                                 TLB_VEC_SIZE_MINUS_1_SHIFT);
                } else {
                        conf |= TLB_TYPE_F16_COLOR;
                        conf |= TLB_F16_SWAP_HI_LO;
                        if (num_components >= 3)
                                conf |= TLB_VEC_SIZE_4_F16;
                        else
                                conf |= TLB_VEC_SIZE_2_F16;
                }
        }

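        /* Worked example of the conf byte built above (using the TLB_*
         * defines): for a vec4 f16 write to RT 0 in per-pixel mode the low
         * byte is TLB_TYPE_F16_COLOR | TLB_F16_SWAP_HI_LO | TLB_VEC_SIZE_4_F16
         * | TLB_SAMPLE_MODE_PER_PIXEL | (7 << TLB_RENDER_TARGET_SHIFT) = 0xff,
         * matching the "0xff gets you normal vec4 f16 RT0 writes" note next
         * to the defines.
         */
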
1892 &c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] : 1893 &c->outputs[var->data.driver_location * 4]; 1894 1895 struct qreg r = color[0]; 1896 struct qreg g = color[1]; 1897 struct qreg b = color[2]; 1898 struct qreg a = color[3]; 1899 1900 if (c->fs_key->swap_color_rb & (1 << rt)) { 1901 r = color[2]; 1902 b = color[0]; 1903 } 1904 1905 if (c->fs_key->sample_alpha_to_one) 1906 a = vir_uniform_f(c, 1.0); 1907 1908 if (is_32b_tlb_format) { 1909 if (i == 0) { 1910 inst = vir_MOV_dest(c, tlbu_reg, r); 1911 inst->uniform = 1912 vir_get_uniform_index(c, 1913 QUNIFORM_CONSTANT, 1914 conf); 1915 } else { 1916 vir_MOV_dest(c, tlb_reg, r); 1917 } 1918 1919 if (num_components >= 2) 1920 vir_MOV_dest(c, tlb_reg, g); 1921 if (num_components >= 3) 1922 vir_MOV_dest(c, tlb_reg, b); 1923 if (num_components >= 4) 1924 vir_MOV_dest(c, tlb_reg, a); 1925 } else { 1926 inst = vir_VFPACK_dest(c, tlb_reg, r, g); 1927 if (conf != ~0 && i == 0) { 1928 inst->dst = tlbu_reg; 1929 inst->uniform = 1930 vir_get_uniform_index(c, 1931 QUNIFORM_CONSTANT, 1932 conf); 1933 } 1934 1935 if (num_components >= 3) 1936 vir_VFPACK_dest(c, tlb_reg, b, a); 1937 } 1938 } 1939} 1940 1941static void 1942emit_frag_end(struct v3d_compile *c) 1943{ 1944 if (c->output_sample_mask_index != -1) { 1945 vir_SETMSF_dest(c, vir_nop_reg(), 1946 vir_AND(c, 1947 vir_MSF(c), 1948 c->outputs[c->output_sample_mask_index])); 1949 } 1950 1951 bool has_any_tlb_color_write = false; 1952 for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) { 1953 if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt]) 1954 has_any_tlb_color_write = true; 1955 } 1956 1957 if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) { 1958 struct nir_variable *var = c->output_color_var[0]; 1959 struct qreg *color = &c->outputs[var->data.driver_location * 4]; 1960 1961 vir_SETMSF_dest(c, vir_nop_reg(), 1962 vir_AND(c, 1963 vir_MSF(c), 1964 vir_FTOC(c, color[3]))); 1965 } 1966 1967 struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU); 1968 1969 /* If the shader has no non-TLB side effects and doesn't write Z 1970 * we can promote it to enabling early_fragment_tests even 1971 * if the user didn't. 1972 */ 1973 if (c->output_position_index == -1 && 1974 !(c->s->info.num_images || c->s->info.num_ssbos) && 1975 !c->s->info.fs.uses_discard && 1976 !c->fs_key->sample_alpha_to_coverage && 1977 c->output_sample_mask_index == -1 && 1978 has_any_tlb_color_write) { 1979 c->s->info.fs.early_fragment_tests = true; 1980 } 1981 1982 /* By default, Z buffer writes are implicit using the Z values produced 1983 * from FEP (Z value produced from rasterization). When this is not 1984 * desirable (shader writes Z explicitly, has discards, etc) we need 1985 * to let the hardware know by setting c->writes_z to true, in which 1986 * case we always need to write a Z value from the QPU, even if it is 1987 * just the passthrough Z value produced from FEP. 1988 * 1989 * Also, from the V3D 4.2 spec: 1990 * 1991 * "If a shader performs a Z read the “Fragment shader does Z writes” 1992 * bit in the shader record must be enabled to ensure deterministic 1993 * results" 1994 * 1995 * So if c->reads_z is set we always need to write Z, even if it is 1996 * a passthrough from the Z value produced from FEP. 
         */
        if (!c->s->info.fs.early_fragment_tests || c->reads_z) {
                c->writes_z = true;
                uint8_t tlb_specifier = TLB_TYPE_DEPTH;
                struct qinst *inst;

                if (c->output_position_index != -1) {
                        /* Shader writes to gl_FragDepth, use that */
                        inst = vir_MOV_dest(c, tlbu_reg,
                                            c->outputs[c->output_position_index]);

                        if (c->devinfo->ver >= 42) {
                                tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
                                                  TLB_SAMPLE_MODE_PER_PIXEL);
                        } else {
                                tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL;
                        }
                } else {
                        /* Shader doesn't write to gl_FragDepth, take Z from
                         * FEP.
                         */
                        c->writes_z_from_fep = true;
                        inst = vir_MOV_dest(c, tlbu_reg, vir_nop_reg());

                        if (c->devinfo->ver >= 42) {
                                /* The spec says the PER_PIXEL flag is ignored
                                 * for invariant writes, but the simulator
                                 * demands it.
                                 */
                                tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT |
                                                  TLB_SAMPLE_MODE_PER_PIXEL);
                        } else {
                                tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT;
                        }

                        /* Since (single-threaded) fragment shaders always need
                         * a TLB write, if we don't have any we emit a
                         * passthrough Z and flag the shader as potentially
                         * discarding, so that we can use Z as the required TLB
                         * write.
                         */
                        if (!has_any_tlb_color_write)
                                c->s->info.fs.uses_discard = true;
                }

                inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
                                                      tlb_specifier |
                                                      0xffffff00);
                inst->is_tlb_z_write = true;
        }

        /* XXX: Performance improvement: Merge Z write and color writes TLB
         * uniform setup
         */
        for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++)
                vir_emit_tlb_color_write(c, rt);
}

static inline void
vir_VPM_WRITE_indirect(struct v3d_compile *c,
                       struct qreg val,
                       struct qreg vpm_index,
                       bool uniform_vpm_index)
{
        assert(c->devinfo->ver >= 40);
        if (uniform_vpm_index)
                vir_STVPMV(c, vpm_index, val);
        else
                vir_STVPMD(c, vpm_index, val);
}

static void
vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
{
        if (c->devinfo->ver >= 40) {
                vir_VPM_WRITE_indirect(c, val,
                                       vir_uniform_ui(c, vpm_index), true);
        } else {
                /* XXX: v3d33_vir_vpm_write_setup(c); */
                vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
        }
}

static void
emit_vert_end(struct v3d_compile *c)
{
        /* GFXH-1684: VPM writes need to be complete by the end of the shader.
         */
        if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
                vir_VPMWT(c);
}

static void
emit_geom_end(struct v3d_compile *c)
{
        /* GFXH-1684: VPM writes need to be complete by the end of the shader.
         */
        if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
                vir_VPMWT(c);
}

static bool
mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
                       unsigned bit_size,
                       unsigned num_components,
                       nir_intrinsic_instr *low,
                       nir_intrinsic_instr *high,
                       void *data)
{
        /* TMU general access only supports 32-bit vectors */
        if (bit_size > 32)
                return false;

        if ((bit_size == 8 || bit_size == 16) && num_components > 1)
                return false;

        if (align_mul % 4 != 0 || align_offset % 4 != 0)
                return false;

        /* Vector accesses wrap at 16-byte boundaries so we can't vectorize
         * if the resulting vector crosses a 16-byte boundary.
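         *
         * The check below is conservative: an access with the given
         * alignment may start as far as (16 - align_mul + align_offset)
         * bytes into a 16-byte window, and the num_components * 4 bytes
         * loaded from there must still end at or before byte 16. For
         * example, align_mul = 4 / align_offset = 0 rejects even a
         * two-component access (16 - 4 + 0 + 8 = 20 > 16), while
         * align_mul = 16 / align_offset = 0 allows a full vec4
         * (16 - 16 + 0 + 16 = 16).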
2117 */ 2118 assert(util_is_power_of_two_nonzero(align_mul)); 2119 align_mul = MIN2(align_mul, 16); 2120 align_offset &= 0xf; 2121 if (16 - align_mul + align_offset + num_components * 4 > 16) 2122 return false; 2123 2124 return true; 2125} 2126 2127void 2128v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) 2129{ 2130 bool progress; 2131 unsigned lower_flrp = 2132 (s->options->lower_flrp16 ? 16 : 0) | 2133 (s->options->lower_flrp32 ? 32 : 0) | 2134 (s->options->lower_flrp64 ? 64 : 0); 2135 2136 do { 2137 progress = false; 2138 2139 NIR_PASS(progress, s, nir_lower_vars_to_ssa); 2140 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); 2141 NIR_PASS(progress, s, nir_lower_phis_to_scalar, false); 2142 NIR_PASS(progress, s, nir_copy_prop); 2143 NIR_PASS(progress, s, nir_opt_remove_phis); 2144 NIR_PASS(progress, s, nir_opt_dce); 2145 NIR_PASS(progress, s, nir_opt_dead_cf); 2146 NIR_PASS(progress, s, nir_opt_cse); 2147 NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); 2148 NIR_PASS(progress, s, nir_opt_algebraic); 2149 NIR_PASS(progress, s, nir_opt_constant_folding); 2150 2151 /* Note that vectorization may undo the load/store scalarization 2152 * pass we run for non 32-bit TMU general load/store by 2153 * converting, for example, 2 consecutive 16-bit loads into a 2154 * single 32-bit load. This is fine (and desirable) as long as 2155 * the resulting 32-bit load meets 32-bit alignment requirements, 2156 * which mem_vectorize_callback() should be enforcing. 2157 */ 2158 nir_load_store_vectorize_options vectorize_opts = { 2159 .modes = nir_var_mem_ssbo | nir_var_mem_ubo | 2160 nir_var_mem_push_const | nir_var_mem_shared | 2161 nir_var_mem_global, 2162 .callback = mem_vectorize_callback, 2163 .robust_modes = 0, 2164 }; 2165 bool vectorize_progress = false; 2166 NIR_PASS(vectorize_progress, s, nir_opt_load_store_vectorize, 2167 &vectorize_opts); 2168 if (vectorize_progress) { 2169 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); 2170 NIR_PASS(progress, s, nir_lower_pack); 2171 progress = true; 2172 } 2173 2174 if (lower_flrp != 0) { 2175 bool lower_flrp_progress = false; 2176 2177 NIR_PASS(lower_flrp_progress, s, nir_lower_flrp, 2178 lower_flrp, 2179 false /* always_precise */); 2180 if (lower_flrp_progress) { 2181 NIR_PASS(progress, s, nir_opt_constant_folding); 2182 progress = true; 2183 } 2184 2185 /* Nothing should rematerialize any flrps, so we only 2186 * need to do this lowering once. 2187 */ 2188 lower_flrp = 0; 2189 } 2190 2191 NIR_PASS(progress, s, nir_opt_undef); 2192 NIR_PASS(progress, s, nir_lower_undef_to_zero); 2193 2194 if (c && !c->disable_loop_unrolling && 2195 s->options->max_unroll_iterations > 0) { 2196 bool local_progress = false; 2197 NIR_PASS(local_progress, s, nir_opt_loop_unroll); 2198 c->unrolled_any_loops |= local_progress; 2199 progress |= local_progress; 2200 } 2201 } while (progress); 2202 2203 nir_move_options sink_opts = 2204 nir_move_const_undef | nir_move_comparisons | nir_move_copies | 2205 nir_move_load_ubo | nir_move_load_ssbo | nir_move_load_uniform; 2206 NIR_PASS(progress, s, nir_opt_sink, sink_opts); 2207} 2208 2209static int 2210driver_location_compare(const nir_variable *a, const nir_variable *b) 2211{ 2212 return a->data.driver_location == b->data.driver_location ? 
2213 a->data.location_frac - b->data.location_frac : 2214 a->data.driver_location - b->data.driver_location; 2215} 2216 2217static struct qreg 2218ntq_emit_vpm_read(struct v3d_compile *c, 2219 uint32_t *num_components_queued, 2220 uint32_t *remaining, 2221 uint32_t vpm_index) 2222{ 2223 struct qreg vpm = vir_reg(QFILE_VPM, vpm_index); 2224 2225 if (c->devinfo->ver >= 40 ) { 2226 return vir_LDVPMV_IN(c, 2227 vir_uniform_ui(c, 2228 (*num_components_queued)++)); 2229 } 2230 2231 if (*num_components_queued != 0) { 2232 (*num_components_queued)--; 2233 return vir_MOV(c, vpm); 2234 } 2235 2236 uint32_t num_components = MIN2(*remaining, 32); 2237 2238 v3d33_vir_vpm_read_setup(c, num_components); 2239 2240 *num_components_queued = num_components - 1; 2241 *remaining -= num_components; 2242 2243 return vir_MOV(c, vpm); 2244} 2245 2246static void 2247ntq_setup_vs_inputs(struct v3d_compile *c) 2248{ 2249 /* Figure out how many components of each vertex attribute the shader 2250 * uses. Each variable should have been split to individual 2251 * components and unused ones DCEed. The vertex fetcher will load 2252 * from the start of the attribute to the number of components we 2253 * declare we need in c->vattr_sizes[]. 2254 * 2255 * BGRA vertex attributes are a bit special: since we implement these 2256 * as RGBA swapping R/B components we always need at least 3 components 2257 * if component 0 is read. 2258 */ 2259 nir_foreach_shader_in_variable(var, c->s) { 2260 /* No VS attribute array support. */ 2261 assert(MAX2(glsl_get_length(var->type), 1) == 1); 2262 2263 unsigned loc = var->data.driver_location; 2264 int start_component = var->data.location_frac; 2265 int num_components = glsl_get_components(var->type); 2266 2267 c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc], 2268 start_component + num_components); 2269 2270 /* Handle BGRA inputs */ 2271 if (start_component == 0 && 2272 c->vs_key->va_swap_rb_mask & (1 << var->data.location)) { 2273 c->vattr_sizes[loc] = MAX2(3, c->vattr_sizes[loc]); 2274 } 2275 } 2276 2277 unsigned num_components = 0; 2278 uint32_t vpm_components_queued = 0; 2279 bool uses_iid = BITSET_TEST(c->s->info.system_values_read, 2280 SYSTEM_VALUE_INSTANCE_ID) || 2281 BITSET_TEST(c->s->info.system_values_read, 2282 SYSTEM_VALUE_INSTANCE_INDEX); 2283 bool uses_biid = BITSET_TEST(c->s->info.system_values_read, 2284 SYSTEM_VALUE_BASE_INSTANCE); 2285 bool uses_vid = BITSET_TEST(c->s->info.system_values_read, 2286 SYSTEM_VALUE_VERTEX_ID) || 2287 BITSET_TEST(c->s->info.system_values_read, 2288 SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); 2289 2290 num_components += uses_iid; 2291 num_components += uses_biid; 2292 num_components += uses_vid; 2293 2294 for (int i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++) 2295 num_components += c->vattr_sizes[i]; 2296 2297 if (uses_iid) { 2298 c->iid = ntq_emit_vpm_read(c, &vpm_components_queued, 2299 &num_components, ~0); 2300 } 2301 2302 if (uses_biid) { 2303 c->biid = ntq_emit_vpm_read(c, &vpm_components_queued, 2304 &num_components, ~0); 2305 } 2306 2307 if (uses_vid) { 2308 c->vid = ntq_emit_vpm_read(c, &vpm_components_queued, 2309 &num_components, ~0); 2310 } 2311 2312 /* The actual loads will happen directly in nir_intrinsic_load_input 2313 * on newer versions. 
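         *
         * On V3D 4.x the IID/BIID/VID reads above already emitted their
         * LDVPMV instructions directly, so there is nothing else left to
         * do here; the loop below only sets up the V3D 3.x VPM reads.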
2314 */ 2315 if (c->devinfo->ver >= 40) 2316 return; 2317 2318 for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) { 2319 resize_qreg_array(c, &c->inputs, &c->inputs_array_size, 2320 (loc + 1) * 4); 2321 2322 for (int i = 0; i < c->vattr_sizes[loc]; i++) { 2323 c->inputs[loc * 4 + i] = 2324 ntq_emit_vpm_read(c, 2325 &vpm_components_queued, 2326 &num_components, 2327 loc * 4 + i); 2328 2329 } 2330 } 2331 2332 if (c->devinfo->ver >= 40) { 2333 assert(vpm_components_queued == num_components); 2334 } else { 2335 assert(vpm_components_queued == 0); 2336 assert(num_components == 0); 2337 } 2338} 2339 2340static bool 2341program_reads_point_coord(struct v3d_compile *c) 2342{ 2343 nir_foreach_shader_in_variable(var, c->s) { 2344 if (util_varying_is_point_coord(var->data.location, 2345 c->fs_key->point_sprite_mask)) { 2346 return true; 2347 } 2348 } 2349 2350 return false; 2351} 2352 2353static void 2354ntq_setup_gs_inputs(struct v3d_compile *c) 2355{ 2356 nir_sort_variables_with_modes(c->s, driver_location_compare, 2357 nir_var_shader_in); 2358 2359 nir_foreach_shader_in_variable(var, c->s) { 2360 /* All GS inputs are arrays with as many entries as vertices 2361 * in the input primitive, but here we only care about the 2362 * per-vertex input type. 2363 */ 2364 assert(glsl_type_is_array(var->type)); 2365 const struct glsl_type *type = glsl_get_array_element(var->type); 2366 unsigned var_len = glsl_count_vec4_slots(type, false, false); 2367 unsigned loc = var->data.driver_location; 2368 2369 resize_qreg_array(c, &c->inputs, &c->inputs_array_size, 2370 (loc + var_len) * 4); 2371 2372 if (var->data.compact) { 2373 for (unsigned j = 0; j < var_len; j++) { 2374 unsigned input_idx = c->num_inputs++; 2375 unsigned loc_frac = var->data.location_frac + j; 2376 unsigned loc = var->data.location + loc_frac / 4; 2377 unsigned comp = loc_frac % 4; 2378 c->input_slots[input_idx] = 2379 v3d_slot_from_slot_and_component(loc, comp); 2380 } 2381 continue; 2382 } 2383 2384 for (unsigned j = 0; j < var_len; j++) { 2385 unsigned num_elements = 2386 glsl_type_is_struct(glsl_without_array(type)) ? 2387 4 : glsl_get_vector_elements(type); 2388 for (unsigned k = 0; k < num_elements; k++) { 2389 unsigned chan = var->data.location_frac + k; 2390 unsigned input_idx = c->num_inputs++; 2391 struct v3d_varying_slot slot = 2392 v3d_slot_from_slot_and_component(var->data.location + j, chan); 2393 c->input_slots[input_idx] = slot; 2394 } 2395 } 2396 } 2397} 2398 2399 2400static void 2401ntq_setup_fs_inputs(struct v3d_compile *c) 2402{ 2403 nir_sort_variables_with_modes(c->s, driver_location_compare, 2404 nir_var_shader_in); 2405 2406 nir_foreach_shader_in_variable(var, c->s) { 2407 unsigned var_len = glsl_count_vec4_slots(var->type, false, false); 2408 unsigned loc = var->data.driver_location; 2409 2410 uint32_t inputs_array_size = c->inputs_array_size; 2411 uint32_t inputs_array_required_size = (loc + var_len) * 4; 2412 resize_qreg_array(c, &c->inputs, &c->inputs_array_size, 2413 inputs_array_required_size); 2414 resize_interp_array(c, &c->interp, &inputs_array_size, 2415 inputs_array_required_size); 2416 2417 if (var->data.location == VARYING_SLOT_POS) { 2418 emit_fragcoord_input(c, loc); 2419 } else if (var->data.location == VARYING_SLOT_PRIMITIVE_ID && 2420 !c->fs_key->has_gs) { 2421 /* If the fragment shader reads gl_PrimitiveID and we 2422 * don't have a geometry shader in the pipeline to write 2423 * it then we program the hardware to inject it as 2424 * an implicit varying. Take it from there. 
2425 */ 2426 c->inputs[loc * 4] = c->primitive_id; 2427 } else if (util_varying_is_point_coord(var->data.location, 2428 c->fs_key->point_sprite_mask)) { 2429 c->inputs[loc * 4 + 0] = c->point_x; 2430 c->inputs[loc * 4 + 1] = c->point_y; 2431 } else if (var->data.compact) { 2432 for (int j = 0; j < var_len; j++) 2433 emit_compact_fragment_input(c, loc, var, j); 2434 } else if (glsl_type_is_struct(glsl_without_array(var->type))) { 2435 for (int j = 0; j < var_len; j++) { 2436 emit_fragment_input(c, loc, var, j, 4); 2437 } 2438 } else { 2439 for (int j = 0; j < var_len; j++) { 2440 emit_fragment_input(c, loc, var, j, glsl_get_vector_elements(var->type)); 2441 } 2442 } 2443 } 2444} 2445 2446static void 2447ntq_setup_outputs(struct v3d_compile *c) 2448{ 2449 if (c->s->info.stage != MESA_SHADER_FRAGMENT) 2450 return; 2451 2452 nir_foreach_shader_out_variable(var, c->s) { 2453 unsigned array_len = MAX2(glsl_get_length(var->type), 1); 2454 unsigned loc = var->data.driver_location * 4; 2455 2456 assert(array_len == 1); 2457 (void)array_len; 2458 2459 for (int i = 0; i < 4 - var->data.location_frac; i++) { 2460 add_output(c, loc + var->data.location_frac + i, 2461 var->data.location, 2462 var->data.location_frac + i); 2463 } 2464 2465 switch (var->data.location) { 2466 case FRAG_RESULT_COLOR: 2467 c->output_color_var[0] = var; 2468 c->output_color_var[1] = var; 2469 c->output_color_var[2] = var; 2470 c->output_color_var[3] = var; 2471 break; 2472 case FRAG_RESULT_DATA0: 2473 case FRAG_RESULT_DATA1: 2474 case FRAG_RESULT_DATA2: 2475 case FRAG_RESULT_DATA3: 2476 c->output_color_var[var->data.location - 2477 FRAG_RESULT_DATA0] = var; 2478 break; 2479 case FRAG_RESULT_DEPTH: 2480 c->output_position_index = loc; 2481 break; 2482 case FRAG_RESULT_SAMPLE_MASK: 2483 c->output_sample_mask_index = loc; 2484 break; 2485 } 2486 } 2487} 2488 2489/** 2490 * Sets up the mapping from nir_register to struct qreg *. 2491 * 2492 * Each nir_register gets a struct qreg per 32-bit component being stored. 2493 */ 2494static void 2495ntq_setup_registers(struct v3d_compile *c, struct exec_list *list) 2496{ 2497 foreach_list_typed(nir_register, nir_reg, node, list) { 2498 unsigned array_len = MAX2(nir_reg->num_array_elems, 1); 2499 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, 2500 array_len * 2501 nir_reg->num_components); 2502 2503 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs); 2504 2505 for (int i = 0; i < array_len * nir_reg->num_components; i++) 2506 qregs[i] = vir_get_temp(c); 2507 } 2508} 2509 2510static void 2511ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr) 2512{ 2513 /* XXX perf: Experiment with using immediate loads to avoid having 2514 * these end up in the uniform stream. Watch out for breaking the 2515 * small immediates optimization in the process! 
         */
        struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
        for (int i = 0; i < instr->def.num_components; i++)
                qregs[i] = vir_uniform_ui(c, instr->value[i].u32);

        _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
}

static void
ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
        unsigned image_index = nir_src_as_uint(instr->src[0]);
        bool is_array = nir_intrinsic_image_array(instr);

        assert(nir_src_as_uint(instr->src[1]) == 0);

        ntq_store_dest(c, &instr->dest, 0,
                       vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index));
        if (instr->num_components > 1) {
                ntq_store_dest(c, &instr->dest, 1,
                               vir_uniform(c,
                                           instr->num_components == 2 && is_array ?
                                                   QUNIFORM_IMAGE_ARRAY_SIZE :
                                                   QUNIFORM_IMAGE_HEIGHT,
                                           image_index));
        }
        if (instr->num_components > 2) {
                ntq_store_dest(c, &instr->dest, 2,
                               vir_uniform(c,
                                           is_array ?
                                           QUNIFORM_IMAGE_ARRAY_SIZE :
                                           QUNIFORM_IMAGE_DEPTH,
                                           image_index));
        }
}

static void
vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
        assert(c->s->info.stage == MESA_SHADER_FRAGMENT);

        int rt = nir_src_as_uint(instr->src[0]);
        assert(rt < V3D_MAX_DRAW_BUFFERS);

        int sample_index = nir_intrinsic_base(instr);
        assert(sample_index < V3D_MAX_SAMPLES);

        int component = nir_intrinsic_component(instr);
        assert(component < 4);

        /* We need to emit our TLB reads after we have acquired the scoreboard
         * lock, or the GPU will hang. Usually, we do our scoreboard locking on
         * the last thread switch to improve parallelism; however, that is only
         * guaranteed to happen before the tlb color writes.
         *
         * To fix that, we make sure we always emit a thread switch before the
         * first tlb color read. If that happens to be the last thread switch
         * we emit, then everything is fine, but otherwise, if any code after
         * this point needs to emit additional thread switches, then we will
         * switch the strategy to locking the scoreboard on the first thread
         * switch instead -- see vir_emit_thrsw().
         */
        if (!c->emitted_tlb_load) {
                if (!c->last_thrsw_at_top_level) {
                        assert(c->devinfo->ver >= 41);
                        vir_emit_thrsw(c);
                }

                c->emitted_tlb_load = true;
        }

        struct qreg *color_reads_for_sample =
                &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];

        if (color_reads_for_sample[component].file == QFILE_NULL) {
                enum pipe_format rt_format = c->fs_key->color_fmt[rt].format;
                int num_components =
                        util_format_get_nr_components(rt_format);

                const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt);
                if (swap_rb)
                        num_components = MAX2(num_components, 3);

                nir_variable *var = c->output_color_var[rt];
                enum glsl_base_type type = glsl_get_base_type(var->type);

                bool is_int_format = type == GLSL_TYPE_INT ||
                                     type == GLSL_TYPE_UINT;

                bool is_32b_tlb_format = is_int_format ||
                                         (c->fs_key->f32_color_rb & (1 << rt));

                int num_samples = c->fs_key->msaa ? V3D_MAX_SAMPLES : 1;

                uint32_t conf = 0xffffff00;
                conf |= c->fs_key->msaa ? TLB_SAMPLE_MODE_PER_SAMPLE :
                                          TLB_SAMPLE_MODE_PER_PIXEL;
                conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;

                if (is_32b_tlb_format) {
                        /* The F32 vs I32 distinction was dropped in 4.2.
*/ 2617 conf |= (c->devinfo->ver < 42 && is_int_format) ? 2618 TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR; 2619 2620 conf |= ((num_components - 1) << 2621 TLB_VEC_SIZE_MINUS_1_SHIFT); 2622 } else { 2623 conf |= TLB_TYPE_F16_COLOR; 2624 conf |= TLB_F16_SWAP_HI_LO; 2625 2626 if (num_components >= 3) 2627 conf |= TLB_VEC_SIZE_4_F16; 2628 else 2629 conf |= TLB_VEC_SIZE_2_F16; 2630 } 2631 2632 2633 for (int i = 0; i < num_samples; i++) { 2634 struct qreg r, g, b, a; 2635 if (is_32b_tlb_format) { 2636 r = conf != 0xffffffff && i == 0? 2637 vir_TLBU_COLOR_READ(c, conf) : 2638 vir_TLB_COLOR_READ(c); 2639 if (num_components >= 2) 2640 g = vir_TLB_COLOR_READ(c); 2641 if (num_components >= 3) 2642 b = vir_TLB_COLOR_READ(c); 2643 if (num_components >= 4) 2644 a = vir_TLB_COLOR_READ(c); 2645 } else { 2646 struct qreg rg = conf != 0xffffffff && i == 0 ? 2647 vir_TLBU_COLOR_READ(c, conf) : 2648 vir_TLB_COLOR_READ(c); 2649 r = vir_FMOV(c, rg); 2650 vir_set_unpack(c->defs[r.index], 0, 2651 V3D_QPU_UNPACK_L); 2652 g = vir_FMOV(c, rg); 2653 vir_set_unpack(c->defs[g.index], 0, 2654 V3D_QPU_UNPACK_H); 2655 2656 if (num_components > 2) { 2657 struct qreg ba = vir_TLB_COLOR_READ(c); 2658 b = vir_FMOV(c, ba); 2659 vir_set_unpack(c->defs[b.index], 0, 2660 V3D_QPU_UNPACK_L); 2661 a = vir_FMOV(c, ba); 2662 vir_set_unpack(c->defs[a.index], 0, 2663 V3D_QPU_UNPACK_H); 2664 } 2665 } 2666 2667 struct qreg *color_reads = 2668 &c->color_reads[(rt * V3D_MAX_SAMPLES + i) * 4]; 2669 2670 color_reads[0] = swap_rb ? b : r; 2671 if (num_components >= 2) 2672 color_reads[1] = g; 2673 if (num_components >= 3) 2674 color_reads[2] = swap_rb ? r : b; 2675 if (num_components >= 4) 2676 color_reads[3] = a; 2677 } 2678 } 2679 2680 assert(color_reads_for_sample[component].file != QFILE_NULL); 2681 ntq_store_dest(c, &instr->dest, 0, 2682 vir_MOV(c, color_reads_for_sample[component])); 2683} 2684 2685static bool 2686ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr); 2687 2688static bool 2689try_emit_uniform(struct v3d_compile *c, 2690 int offset, 2691 int num_components, 2692 nir_dest *dest, 2693 enum quniform_contents contents) 2694{ 2695 /* Even though ldunif is strictly 32-bit we can still use it 2696 * to load scalar 8-bit/16-bit uniforms so long as their offset 2697 * is 32-bit aligned. In this case, ldunif would still load 2698 * 32-bit into the destination with the 8-bit/16-bit uniform 2699 * data in the LSB and garbage in the MSB, but that is fine 2700 * because we should only be accessing the valid bits of the 2701 * destination. 2702 * 2703 * FIXME: if in the future we improve our register allocator to 2704 * pack 2 16-bit variables in the MSB and LSB of the same 2705 * register then this optimization would not be valid as is, 2706 * since the load clobbers the MSB. 2707 */ 2708 if (offset % 4 != 0) 2709 return false; 2710 2711 /* We need dwords */ 2712 offset = offset / 4; 2713 2714 for (int i = 0; i < num_components; i++) { 2715 ntq_store_dest(c, dest, i, 2716 vir_uniform(c, contents, offset + i)); 2717 } 2718 2719 return true; 2720} 2721 2722static void 2723ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr) 2724{ 2725 /* We scalarize general TMU access for anything that is not 32-bit. 
*/ 2726 assert(nir_dest_bit_size(instr->dest) == 32 || 2727 instr->num_components == 1); 2728 2729 /* Try to emit ldunif if possible, otherwise fallback to general TMU */ 2730 if (nir_src_is_const(instr->src[0])) { 2731 int offset = (nir_intrinsic_base(instr) + 2732 nir_src_as_uint(instr->src[0])); 2733 2734 if (try_emit_uniform(c, offset, instr->num_components, 2735 &instr->dest, QUNIFORM_UNIFORM)) { 2736 return; 2737 } 2738 } 2739 2740 if (!ntq_emit_load_unifa(c, instr)) { 2741 ntq_emit_tmu_general(c, instr, false, false); 2742 c->has_general_tmu_load = true; 2743 } 2744} 2745 2746static bool 2747ntq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr) 2748{ 2749 if (c->compiler->max_inline_uniform_buffers <= 0) 2750 return false; 2751 2752 /* On Vulkan we use indices 1..MAX_INLINE_UNIFORM_BUFFERS for inline 2753 * uniform buffers which we want to handle more like push constants 2754 * than regular UBO. OpenGL doesn't implement this feature. 2755 */ 2756 assert(c->key->environment == V3D_ENVIRONMENT_VULKAN); 2757 uint32_t index = nir_src_as_uint(instr->src[0]); 2758 if (index == 0 || index > c->compiler->max_inline_uniform_buffers) 2759 return false; 2760 2761 /* We scalarize general TMU access for anything that is not 32-bit */ 2762 assert(nir_dest_bit_size(instr->dest) == 32 || 2763 instr->num_components == 1); 2764 2765 if (nir_src_is_const(instr->src[1])) { 2766 /* Index 0 is reserved for push constants */ 2767 assert(index > 0); 2768 uint32_t inline_index = index - 1; 2769 int offset = nir_src_as_uint(instr->src[1]); 2770 if (try_emit_uniform(c, offset, instr->num_components, 2771 &instr->dest, 2772 QUNIFORM_INLINE_UBO_0 + inline_index)) { 2773 return true; 2774 } 2775 } 2776 2777 /* Fallback to regular UBO load */ 2778 return false; 2779} 2780 2781static void 2782ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) 2783{ 2784 /* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset). 2785 * 2786 * Right now the driver sets PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR even 2787 * if we don't support non-uniform offsets because we also set the 2788 * lower_all_io_to_temps option in the NIR compiler. This ensures that 2789 * any indirect indexing on in/out variables is turned into indirect 2790 * indexing on temporary variables instead, that we handle by lowering 2791 * to scratch. If we implement non-uniform offset here we might be able 2792 * to avoid the temp and scratch lowering, which involves copying from 2793 * the input to the temp variable, possibly making code more optimal. 2794 */ 2795 unsigned offset = 2796 nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]); 2797 2798 if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) { 2799 /* Emit the LDVPM directly now, rather than at the top 2800 * of the shader like we did for V3D 3.x (which needs 2801 * vpmsetup when not just taking the next offset). 2802 * 2803 * Note that delaying like this may introduce stalls, 2804 * as LDVPMV takes a minimum of 1 instruction but may 2805 * be slower if the VPM unit is busy with another QPU. 
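                 *
                 * The flat VPM index computed below mirrors the layout set
                 * up in ntq_setup_vs_inputs(): any of IID/BIID/VID that the
                 * shader reads come first, followed by each attribute's
                 * components as recorded in c->vattr_sizes[].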
2806 */ 2807 int index = 0; 2808 if (BITSET_TEST(c->s->info.system_values_read, 2809 SYSTEM_VALUE_INSTANCE_ID)) { 2810 index++; 2811 } 2812 if (BITSET_TEST(c->s->info.system_values_read, 2813 SYSTEM_VALUE_BASE_INSTANCE)) { 2814 index++; 2815 } 2816 if (BITSET_TEST(c->s->info.system_values_read, 2817 SYSTEM_VALUE_VERTEX_ID)) { 2818 index++; 2819 } 2820 for (int i = 0; i < offset; i++) 2821 index += c->vattr_sizes[i]; 2822 index += nir_intrinsic_component(instr); 2823 for (int i = 0; i < instr->num_components; i++) { 2824 struct qreg vpm_offset = vir_uniform_ui(c, index++); 2825 ntq_store_dest(c, &instr->dest, i, 2826 vir_LDVPMV_IN(c, vpm_offset)); 2827 } 2828 } else { 2829 for (int i = 0; i < instr->num_components; i++) { 2830 int comp = nir_intrinsic_component(instr) + i; 2831 struct qreg input = c->inputs[offset * 4 + comp]; 2832 ntq_store_dest(c, &instr->dest, i, vir_MOV(c, input)); 2833 2834 if (c->s->info.stage == MESA_SHADER_FRAGMENT && 2835 input.file == c->payload_z.file && 2836 input.index == c->payload_z.index) { 2837 c->reads_z = true; 2838 } 2839 } 2840 } 2841} 2842 2843static void 2844ntq_emit_per_sample_color_write(struct v3d_compile *c, 2845 nir_intrinsic_instr *instr) 2846{ 2847 assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d); 2848 2849 unsigned rt = nir_src_as_uint(instr->src[1]); 2850 assert(rt < V3D_MAX_DRAW_BUFFERS); 2851 2852 unsigned sample_idx = nir_intrinsic_base(instr); 2853 assert(sample_idx < V3D_MAX_SAMPLES); 2854 2855 unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4; 2856 for (int i = 0; i < instr->num_components; i++) { 2857 c->sample_colors[offset + i] = 2858 vir_MOV(c, ntq_get_src(c, instr->src[0], i)); 2859 } 2860} 2861 2862static void 2863ntq_emit_color_write(struct v3d_compile *c, 2864 nir_intrinsic_instr *instr) 2865{ 2866 unsigned offset = (nir_intrinsic_base(instr) + 2867 nir_src_as_uint(instr->src[1])) * 4 + 2868 nir_intrinsic_component(instr); 2869 for (int i = 0; i < instr->num_components; i++) { 2870 c->outputs[offset + i] = 2871 vir_MOV(c, ntq_get_src(c, instr->src[0], i)); 2872 } 2873} 2874 2875static void 2876emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr) 2877{ 2878 assert(instr->num_components == 1); 2879 2880 struct qreg offset = ntq_get_src(c, instr->src[1], 0); 2881 2882 uint32_t base_offset = nir_intrinsic_base(instr); 2883 2884 if (base_offset) 2885 offset = vir_ADD(c, vir_uniform_ui(c, base_offset), offset); 2886 2887 /* Usually, for VS or FS, we only emit outputs once at program end so 2888 * our VPM writes are never in non-uniform control flow, but this 2889 * is not true for GS, where we are emitting multiple vertices. 2890 */ 2891 if (vir_in_nonuniform_control_flow(c)) { 2892 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 2893 V3D_QPU_PF_PUSHZ); 2894 } 2895 2896 struct qreg val = ntq_get_src(c, instr->src[0], 0); 2897 2898 /* The offset isn’t necessarily dynamically uniform for a geometry 2899 * shader. This can happen if the shader sometimes doesn’t emit one of 2900 * the vertices. In that case subsequent vertices will be written to 2901 * different offsets in the VPM and we need to use the scatter write 2902 * instruction to have a different offset for each lane. 
2903 */ 2904 bool is_uniform_offset = 2905 !vir_in_nonuniform_control_flow(c) && 2906 !nir_src_is_divergent(instr->src[1]); 2907 vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset); 2908 2909 if (vir_in_nonuniform_control_flow(c)) { 2910 struct qinst *last_inst = 2911 (struct qinst *)c->cur_block->instructions.prev; 2912 vir_set_cond(last_inst, V3D_QPU_COND_IFA); 2913 } 2914} 2915 2916static void 2917emit_store_output_vs(struct v3d_compile *c, nir_intrinsic_instr *instr) 2918{ 2919 assert(c->s->info.stage == MESA_SHADER_VERTEX); 2920 assert(instr->num_components == 1); 2921 2922 uint32_t base = nir_intrinsic_base(instr); 2923 struct qreg val = ntq_get_src(c, instr->src[0], 0); 2924 2925 if (nir_src_is_const(instr->src[1])) { 2926 vir_VPM_WRITE(c, val, 2927 base + nir_src_as_uint(instr->src[1])); 2928 } else { 2929 struct qreg offset = vir_ADD(c, 2930 ntq_get_src(c, instr->src[1], 1), 2931 vir_uniform_ui(c, base)); 2932 bool is_uniform_offset = 2933 !vir_in_nonuniform_control_flow(c) && 2934 !nir_src_is_divergent(instr->src[1]); 2935 vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset); 2936 } 2937} 2938 2939static void 2940ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr) 2941{ 2942 if (c->s->info.stage == MESA_SHADER_FRAGMENT) 2943 ntq_emit_color_write(c, instr); 2944 else if (c->s->info.stage == MESA_SHADER_GEOMETRY) 2945 emit_store_output_gs(c, instr); 2946 else 2947 emit_store_output_vs(c, instr); 2948} 2949 2950/** 2951 * This implementation is based on v3d_sample_{x,y}_offset() from 2952 * v3d_sample_offset.h. 2953 */ 2954static void 2955ntq_get_sample_offset(struct v3d_compile *c, struct qreg sample_idx, 2956 struct qreg *sx, struct qreg *sy) 2957{ 2958 sample_idx = vir_ITOF(c, sample_idx); 2959 2960 struct qreg offset_x = 2961 vir_FADD(c, vir_uniform_f(c, -0.125f), 2962 vir_FMUL(c, sample_idx, 2963 vir_uniform_f(c, 0.5f))); 2964 vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), 2965 vir_uniform_f(c, 2.0f), sample_idx), 2966 V3D_QPU_PF_PUSHC); 2967 offset_x = vir_SEL(c, V3D_QPU_COND_IFA, 2968 vir_FSUB(c, offset_x, vir_uniform_f(c, 1.25f)), 2969 offset_x); 2970 2971 struct qreg offset_y = 2972 vir_FADD(c, vir_uniform_f(c, -0.375f), 2973 vir_FMUL(c, sample_idx, 2974 vir_uniform_f(c, 0.25f))); 2975 *sx = offset_x; 2976 *sy = offset_y; 2977} 2978 2979/** 2980 * This implementation is based on get_centroid_offset() from fep.c. 
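 *
 * A single covered sample is chosen (with priority s0, then s2, then s1,
 * defaulting to s3) and its offset is returned; the pixel center (0, 0) is
 * used instead when no samples are covered or when two opposing samples are
 * both covered.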
2981 */ 2982static void 2983ntq_get_barycentric_centroid(struct v3d_compile *c, 2984 struct qreg *out_x, 2985 struct qreg *out_y) 2986{ 2987 struct qreg sample_mask; 2988 if (c->output_sample_mask_index != -1) 2989 sample_mask = c->outputs[c->output_sample_mask_index]; 2990 else 2991 sample_mask = vir_MSF(c); 2992 2993 struct qreg i0 = vir_uniform_ui(c, 0); 2994 struct qreg i1 = vir_uniform_ui(c, 1); 2995 struct qreg i2 = vir_uniform_ui(c, 2); 2996 struct qreg i3 = vir_uniform_ui(c, 3); 2997 struct qreg i4 = vir_uniform_ui(c, 4); 2998 struct qreg i8 = vir_uniform_ui(c, 8); 2999 3000 /* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */ 3001 struct qreg F = vir_uniform_ui(c, 0); 3002 struct qreg T = vir_uniform_ui(c, ~0); 3003 struct qreg s0 = vir_XOR(c, vir_AND(c, sample_mask, i1), i1); 3004 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ); 3005 s0 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); 3006 struct qreg s1 = vir_XOR(c, vir_AND(c, sample_mask, i2), i2); 3007 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ); 3008 s1 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); 3009 struct qreg s2 = vir_XOR(c, vir_AND(c, sample_mask, i4), i4); 3010 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ); 3011 s2 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); 3012 struct qreg s3 = vir_XOR(c, vir_AND(c, sample_mask, i8), i8); 3013 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ); 3014 s3 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); 3015 3016 /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */ 3017 struct qreg sample_idx = i3; 3018 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ); 3019 sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i1, sample_idx); 3020 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ); 3021 sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i2, sample_idx); 3022 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ); 3023 sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i0, sample_idx); 3024 3025 /* Get offset at selected sample index */ 3026 struct qreg offset_x, offset_y; 3027 ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y); 3028 3029 /* Select pixel center [offset=(0,0)] if two opposing samples (or none) 3030 * are selected. 
3031 */ 3032 struct qreg s0_and_s3 = vir_AND(c, s0, s3); 3033 struct qreg s1_and_s2 = vir_AND(c, s1, s2); 3034 3035 struct qreg use_center = vir_XOR(c, sample_mask, vir_uniform_ui(c, 0)); 3036 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ); 3037 use_center = vir_SEL(c, V3D_QPU_COND_IFA, T, F); 3038 use_center = vir_OR(c, use_center, s0_and_s3); 3039 use_center = vir_OR(c, use_center, s1_and_s2); 3040 3041 struct qreg zero = vir_uniform_f(c, 0.0f); 3042 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ); 3043 offset_x = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_x); 3044 offset_y = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_y); 3045 3046 *out_x = offset_x; 3047 *out_y = offset_y; 3048} 3049 3050static struct qreg 3051ntq_emit_load_interpolated_input(struct v3d_compile *c, 3052 struct qreg p, 3053 struct qreg C, 3054 struct qreg offset_x, 3055 struct qreg offset_y, 3056 unsigned mode) 3057{ 3058 if (mode == INTERP_MODE_FLAT) 3059 return C; 3060 3061 struct qreg sample_offset_x = 3062 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))); 3063 struct qreg sample_offset_y = 3064 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))); 3065 3066 struct qreg scaleX = 3067 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_x), 3068 offset_x); 3069 struct qreg scaleY = 3070 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_y), 3071 offset_y); 3072 3073 struct qreg pInterp = 3074 vir_FADD(c, p, vir_FADD(c, vir_FMUL(c, vir_FDX(c, p), scaleX), 3075 vir_FMUL(c, vir_FDY(c, p), scaleY))); 3076 3077 if (mode == INTERP_MODE_NOPERSPECTIVE) 3078 return vir_FADD(c, pInterp, C); 3079 3080 struct qreg w = c->payload_w; 3081 struct qreg wInterp = 3082 vir_FADD(c, w, vir_FADD(c, vir_FMUL(c, vir_FDX(c, w), scaleX), 3083 vir_FMUL(c, vir_FDY(c, w), scaleY))); 3084 3085 return vir_FADD(c, vir_FMUL(c, pInterp, wInterp), C); 3086} 3087 3088static void 3089emit_ldunifa(struct v3d_compile *c, struct qreg *result) 3090{ 3091 struct qinst *ldunifa = 3092 vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef); 3093 ldunifa->qpu.sig.ldunifa = true; 3094 if (result) 3095 *result = vir_emit_def(c, ldunifa); 3096 else 3097 vir_emit_nondef(c, ldunifa); 3098 c->current_unifa_offset += 4; 3099} 3100 3101static bool 3102ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) 3103{ 3104 assert(instr->intrinsic == nir_intrinsic_load_ubo || 3105 instr->intrinsic == nir_intrinsic_load_ssbo || 3106 instr->intrinsic == nir_intrinsic_load_uniform); 3107 3108 bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform; 3109 bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo; 3110 bool is_ssbo = instr->intrinsic == nir_intrinsic_load_ssbo; 3111 3112 /* Every ldunifa auto-increments the unifa address by 4 bytes, so our 3113 * current unifa offset is 4 bytes ahead of the offset of the last load. 3114 */ 3115 static const int32_t max_unifa_skip_dist = 3116 MAX_UNIFA_SKIP_DISTANCE - 4; 3117 3118 /* We can only use unifa if the offset is uniform */ 3119 nir_src offset = is_uniform ? instr->src[0] : instr->src[1]; 3120 if (nir_src_is_divergent(offset)) 3121 return false; 3122 3123 /* We can only use unifa with SSBOs if they are read-only. Otherwise 3124 * ldunifa won't see the shader writes to that address (possibly 3125 * because ldunifa doesn't read from the L2T cache). 
         */
        if (is_ssbo && !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE))
                return false;

        /* Just as with SSBOs, we can't use ldunifa to read indirect uniforms
         * that may have been written to scratch using the TMU.
         */
        bool dynamic_src = !nir_src_is_const(offset);
        if (is_uniform && dynamic_src && c->s->scratch_size > 0)
                return false;

        uint32_t const_offset = dynamic_src ? 0 : nir_src_as_uint(offset);
        if (is_uniform)
                const_offset += nir_intrinsic_base(instr);

        /* ldunifa is a 32-bit load instruction so we can only use it with
         * 32-bit aligned addresses. We always produce 32-bit aligned addresses
         * except for types smaller than 32-bit, so in these cases we can only
         * use ldunifa if we can verify alignment, which we can only do for
         * loads with a constant offset.
         */
        uint32_t bit_size = nir_dest_bit_size(instr->dest);
        uint32_t value_skips = 0;
        if (bit_size < 32) {
                if (dynamic_src) {
                        return false;
                } else if (const_offset % 4 != 0) {
                        /* If we are loading from an unaligned offset, fix
                         * alignment and skip over unused elements in result.
                         */
                        value_skips = (const_offset % 4) / (bit_size / 8);
                        const_offset &= ~0x3;
                }
        }

        assert((bit_size == 32 && value_skips == 0) ||
               (bit_size == 16 && value_skips <= 1) ||
               (bit_size == 8 && value_skips <= 3));

        /* Both Vulkan and OpenGL reserve index 0 for uniforms / push
         * constants.
         */
        uint32_t index = is_uniform ? 0 : nir_src_as_uint(instr->src[0]);

        /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index
         * shifted up by 1 (0 is gallium's constant buffer 0).
         */
        if (is_ubo && c->key->environment == V3D_ENVIRONMENT_OPENGL)
                index++;

        /* We can only keep track of the last unifa address we used with
         * constant offset loads. If the new load targets the same buffer and
         * is close enough to the previous load, we can skip the unifa register
         * write by emitting dummy ldunifa instructions to update the unifa
         * address.
         */
        bool skip_unifa = false;
        uint32_t ldunifa_skips = 0;
        if (dynamic_src) {
                c->current_unifa_block = NULL;
        } else if (c->cur_block == c->current_unifa_block &&
                   c->current_unifa_is_ubo == !is_ssbo &&
                   c->current_unifa_index == index &&
                   c->current_unifa_offset <= const_offset &&
                   c->current_unifa_offset + max_unifa_skip_dist >= const_offset) {
                skip_unifa = true;
                ldunifa_skips = (const_offset - c->current_unifa_offset) / 4;
        } else {
                c->current_unifa_block = c->cur_block;
                c->current_unifa_is_ubo = !is_ssbo;
                c->current_unifa_index = index;
                c->current_unifa_offset = const_offset;
        }

        if (!skip_unifa) {
                struct qreg base_offset = !is_ssbo ?
3202 vir_uniform(c, QUNIFORM_UBO_ADDR, 3203 v3d_unit_data_create(index, const_offset)) : 3204 vir_uniform(c, QUNIFORM_SSBO_OFFSET, index); 3205 3206 struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); 3207 if (!dynamic_src) { 3208 if (!is_ssbo) { 3209 vir_MOV_dest(c, unifa, base_offset); 3210 } else { 3211 vir_ADD_dest(c, unifa, base_offset, 3212 vir_uniform_ui(c, const_offset)); 3213 } 3214 } else { 3215 vir_ADD_dest(c, unifa, base_offset, 3216 ntq_get_src(c, offset, 0)); 3217 } 3218 } else { 3219 for (int i = 0; i < ldunifa_skips; i++) 3220 emit_ldunifa(c, NULL); 3221 } 3222 3223 uint32_t num_components = nir_intrinsic_dest_components(instr); 3224 for (uint32_t i = 0; i < num_components; ) { 3225 struct qreg data; 3226 emit_ldunifa(c, &data); 3227 3228 if (bit_size == 32) { 3229 assert(value_skips == 0); 3230 ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data)); 3231 i++; 3232 } else { 3233 assert((bit_size == 16 && value_skips <= 1) || 3234 (bit_size == 8 && value_skips <= 3)); 3235 3236 /* If we have any values to skip, shift to the first 3237 * valid value in the ldunifa result. 3238 */ 3239 if (value_skips > 0) { 3240 data = vir_SHR(c, data, 3241 vir_uniform_ui(c, bit_size * 3242 value_skips)); 3243 } 3244 3245 /* Check how many valid components we have discounting 3246 * read components to skip. 3247 */ 3248 uint32_t valid_count = (32 / bit_size) - value_skips; 3249 assert((bit_size == 16 && valid_count <= 2) || 3250 (bit_size == 8 && valid_count <= 4)); 3251 assert(valid_count > 0); 3252 3253 /* Process the valid components */ 3254 do { 3255 struct qreg tmp; 3256 uint32_t mask = (1 << bit_size) - 1; 3257 tmp = vir_AND(c, vir_MOV(c, data), 3258 vir_uniform_ui(c, mask)); 3259 ntq_store_dest(c, &instr->dest, i, 3260 vir_MOV(c, tmp)); 3261 i++; 3262 valid_count--; 3263 3264 /* Shift to next component */ 3265 if (i < num_components && valid_count > 0) { 3266 data = vir_SHR(c, data, 3267 vir_uniform_ui(c, bit_size)); 3268 } 3269 } while (i < num_components && valid_count > 0); 3270 } 3271 } 3272 3273 return true; 3274} 3275 3276static inline struct qreg 3277emit_load_local_invocation_index(struct v3d_compile *c) 3278{ 3279 return vir_SHR(c, c->cs_payload[1], 3280 vir_uniform_ui(c, 32 - c->local_invocation_index_bits)); 3281} 3282 3283/* Various subgroup operations rely on the A flags, so this helper ensures that 3284 * A flags represents currently active lanes in the subgroup. 3285 */ 3286static void 3287set_a_flags_for_subgroup(struct v3d_compile *c) 3288{ 3289 /* MSF returns 0 for disabled lanes in compute shaders so 3290 * PUSHZ will set A=1 for disabled lanes. We want the inverse 3291 * of this but we don't have any means to negate the A flags 3292 * directly, but we can do it by repeating the same operation 3293 * with NORZ (A = ~A & ~Z). 3294 */ 3295 assert(c->s->info.stage == MESA_SHADER_COMPUTE); 3296 vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); 3297 vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_UF_NORZ); 3298 3299 /* If we are under non-uniform control flow we also need to 3300 * AND the A flags with the current execute mask. 
3301 */ 3302 if (vir_in_nonuniform_control_flow(c)) { 3303 const uint32_t bidx = c->cur_block->index; 3304 vir_set_uf(c, vir_XOR_dest(c, vir_nop_reg(), 3305 c->execute, 3306 vir_uniform_ui(c, bidx)), 3307 V3D_QPU_UF_ANDZ); 3308 } 3309} 3310 3311static void 3312ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) 3313{ 3314 switch (instr->intrinsic) { 3315 case nir_intrinsic_load_uniform: 3316 ntq_emit_load_uniform(c, instr); 3317 break; 3318 3319 case nir_intrinsic_load_global_2x32: 3320 ntq_emit_tmu_general(c, instr, false, true); 3321 c->has_general_tmu_load = true; 3322 break; 3323 3324 case nir_intrinsic_load_ubo: 3325 if (ntq_emit_inline_ubo_load(c, instr)) 3326 break; 3327 FALLTHROUGH; 3328 case nir_intrinsic_load_ssbo: 3329 if (!ntq_emit_load_unifa(c, instr)) { 3330 ntq_emit_tmu_general(c, instr, false, false); 3331 c->has_general_tmu_load = true; 3332 } 3333 break; 3334 3335 case nir_intrinsic_ssbo_atomic_add: 3336 case nir_intrinsic_ssbo_atomic_imin: 3337 case nir_intrinsic_ssbo_atomic_umin: 3338 case nir_intrinsic_ssbo_atomic_imax: 3339 case nir_intrinsic_ssbo_atomic_umax: 3340 case nir_intrinsic_ssbo_atomic_and: 3341 case nir_intrinsic_ssbo_atomic_or: 3342 case nir_intrinsic_ssbo_atomic_xor: 3343 case nir_intrinsic_ssbo_atomic_exchange: 3344 case nir_intrinsic_ssbo_atomic_comp_swap: 3345 case nir_intrinsic_store_ssbo: 3346 ntq_emit_tmu_general(c, instr, false, false); 3347 break; 3348 3349 case nir_intrinsic_global_atomic_add_2x32: 3350 case nir_intrinsic_global_atomic_imin_2x32: 3351 case nir_intrinsic_global_atomic_umin_2x32: 3352 case nir_intrinsic_global_atomic_imax_2x32: 3353 case nir_intrinsic_global_atomic_umax_2x32: 3354 case nir_intrinsic_global_atomic_and_2x32: 3355 case nir_intrinsic_global_atomic_or_2x32: 3356 case nir_intrinsic_global_atomic_xor_2x32: 3357 case nir_intrinsic_global_atomic_exchange_2x32: 3358 case nir_intrinsic_global_atomic_comp_swap_2x32: 3359 case nir_intrinsic_store_global_2x32: 3360 ntq_emit_tmu_general(c, instr, false, true); 3361 break; 3362 3363 case nir_intrinsic_shared_atomic_add: 3364 case nir_intrinsic_shared_atomic_imin: 3365 case nir_intrinsic_shared_atomic_umin: 3366 case nir_intrinsic_shared_atomic_imax: 3367 case nir_intrinsic_shared_atomic_umax: 3368 case nir_intrinsic_shared_atomic_and: 3369 case nir_intrinsic_shared_atomic_or: 3370 case nir_intrinsic_shared_atomic_xor: 3371 case nir_intrinsic_shared_atomic_exchange: 3372 case nir_intrinsic_shared_atomic_comp_swap: 3373 case nir_intrinsic_store_shared: 3374 case nir_intrinsic_store_scratch: 3375 ntq_emit_tmu_general(c, instr, true, false); 3376 break; 3377 3378 case nir_intrinsic_load_scratch: 3379 case nir_intrinsic_load_shared: 3380 ntq_emit_tmu_general(c, instr, true, false); 3381 c->has_general_tmu_load = true; 3382 break; 3383 3384 case nir_intrinsic_image_store: 3385 case nir_intrinsic_image_atomic_add: 3386 case nir_intrinsic_image_atomic_imin: 3387 case nir_intrinsic_image_atomic_umin: 3388 case nir_intrinsic_image_atomic_imax: 3389 case nir_intrinsic_image_atomic_umax: 3390 case nir_intrinsic_image_atomic_and: 3391 case nir_intrinsic_image_atomic_or: 3392 case nir_intrinsic_image_atomic_xor: 3393 case nir_intrinsic_image_atomic_exchange: 3394 case nir_intrinsic_image_atomic_comp_swap: 3395 v3d40_vir_emit_image_load_store(c, instr); 3396 break; 3397 3398 case nir_intrinsic_image_load: 3399 v3d40_vir_emit_image_load_store(c, instr); 3400 /* Not really a general TMU load, but we only use this flag 3401 * for NIR scheduling and we do schedule these under the 
same 3402 * policy as general TMU. 3403 */ 3404 c->has_general_tmu_load = true; 3405 break; 3406 3407 case nir_intrinsic_get_ssbo_size: 3408 ntq_store_dest(c, &instr->dest, 0, 3409 vir_uniform(c, QUNIFORM_GET_SSBO_SIZE, 3410 nir_src_comp_as_uint(instr->src[0], 0))); 3411 break; 3412 3413 case nir_intrinsic_get_ubo_size: 3414 ntq_store_dest(c, &instr->dest, 0, 3415 vir_uniform(c, QUNIFORM_GET_UBO_SIZE, 3416 nir_src_comp_as_uint(instr->src[0], 0))); 3417 break; 3418 3419 case nir_intrinsic_load_user_clip_plane: 3420 for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) { 3421 ntq_store_dest(c, &instr->dest, i, 3422 vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, 3423 nir_intrinsic_ucp_id(instr) * 3424 4 + i)); 3425 } 3426 break; 3427 3428 case nir_intrinsic_load_viewport_x_scale: 3429 ntq_store_dest(c, &instr->dest, 0, 3430 vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0)); 3431 break; 3432 3433 case nir_intrinsic_load_viewport_y_scale: 3434 ntq_store_dest(c, &instr->dest, 0, 3435 vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0)); 3436 break; 3437 3438 case nir_intrinsic_load_viewport_z_scale: 3439 ntq_store_dest(c, &instr->dest, 0, 3440 vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0)); 3441 break; 3442 3443 case nir_intrinsic_load_viewport_z_offset: 3444 ntq_store_dest(c, &instr->dest, 0, 3445 vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0)); 3446 break; 3447 3448 case nir_intrinsic_load_line_coord: 3449 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x)); 3450 break; 3451 3452 case nir_intrinsic_load_line_width: 3453 ntq_store_dest(c, &instr->dest, 0, 3454 vir_uniform(c, QUNIFORM_LINE_WIDTH, 0)); 3455 break; 3456 3457 case nir_intrinsic_load_aa_line_width: 3458 ntq_store_dest(c, &instr->dest, 0, 3459 vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0)); 3460 break; 3461 3462 case nir_intrinsic_load_sample_mask_in: 3463 ntq_store_dest(c, &instr->dest, 0, vir_MSF(c)); 3464 break; 3465 3466 case nir_intrinsic_load_helper_invocation: 3467 vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); 3468 struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); 3469 ntq_store_dest(c, &instr->dest, 0, qdest); 3470 break; 3471 3472 case nir_intrinsic_load_front_face: 3473 /* The register contains 0 (front) or 1 (back), and we need to 3474 * turn it into a NIR bool where true means front. 
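                 * Adding -1 does exactly that: 0 (front) becomes ~0 (NIR
                 * true) and 1 (back) becomes 0 (NIR false).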
3475 */ 3476 ntq_store_dest(c, &instr->dest, 0, 3477 vir_ADD(c, 3478 vir_uniform_ui(c, -1), 3479 vir_REVF(c))); 3480 break; 3481 3482 case nir_intrinsic_load_base_instance: 3483 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->biid)); 3484 break; 3485 3486 case nir_intrinsic_load_instance_id: 3487 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid)); 3488 break; 3489 3490 case nir_intrinsic_load_vertex_id: 3491 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid)); 3492 break; 3493 3494 case nir_intrinsic_load_tlb_color_v3d: 3495 vir_emit_tlb_color_read(c, instr); 3496 break; 3497 3498 case nir_intrinsic_load_input: 3499 ntq_emit_load_input(c, instr); 3500 break; 3501 3502 case nir_intrinsic_store_tlb_sample_color_v3d: 3503 ntq_emit_per_sample_color_write(c, instr); 3504 break; 3505 3506 case nir_intrinsic_store_output: 3507 ntq_emit_store_output(c, instr); 3508 break; 3509 3510 case nir_intrinsic_image_size: 3511 ntq_emit_image_size(c, instr); 3512 break; 3513 3514 case nir_intrinsic_discard: 3515 ntq_flush_tmu(c); 3516 3517 if (vir_in_nonuniform_control_flow(c)) { 3518 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 3519 V3D_QPU_PF_PUSHZ); 3520 vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(), 3521 vir_uniform_ui(c, 0)), 3522 V3D_QPU_COND_IFA); 3523 } else { 3524 vir_SETMSF_dest(c, vir_nop_reg(), 3525 vir_uniform_ui(c, 0)); 3526 } 3527 break; 3528 3529 case nir_intrinsic_discard_if: { 3530 ntq_flush_tmu(c); 3531 3532 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]); 3533 3534 if (vir_in_nonuniform_control_flow(c)) { 3535 struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(), 3536 c->execute); 3537 if (cond == V3D_QPU_COND_IFA) { 3538 vir_set_uf(c, exec_flag, V3D_QPU_UF_ANDZ); 3539 } else { 3540 vir_set_uf(c, exec_flag, V3D_QPU_UF_NORNZ); 3541 cond = V3D_QPU_COND_IFA; 3542 } 3543 } 3544 3545 vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(), 3546 vir_uniform_ui(c, 0)), cond); 3547 3548 break; 3549 } 3550 3551 case nir_intrinsic_memory_barrier: 3552 case nir_intrinsic_memory_barrier_buffer: 3553 case nir_intrinsic_memory_barrier_image: 3554 case nir_intrinsic_memory_barrier_shared: 3555 case nir_intrinsic_memory_barrier_tcs_patch: 3556 case nir_intrinsic_group_memory_barrier: 3557 /* We don't do any instruction scheduling of these NIR 3558 * instructions between each other, so we just need to make 3559 * sure that the TMU operations before the barrier are flushed 3560 * before the ones after the barrier. 3561 */ 3562 ntq_flush_tmu(c); 3563 break; 3564 3565 case nir_intrinsic_control_barrier: 3566 /* Emit a TSY op to get all invocations in the workgroup 3567 * (actually supergroup) to block until the last invocation 3568 * reaches the TSY op. 3569 */ 3570 ntq_flush_tmu(c); 3571 3572 if (c->devinfo->ver >= 42) { 3573 vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, 3574 V3D_QPU_WADDR_SYNCB)); 3575 } else { 3576 struct qinst *sync = 3577 vir_BARRIERID_dest(c, 3578 vir_reg(QFILE_MAGIC, 3579 V3D_QPU_WADDR_SYNCU)); 3580 sync->uniform = 3581 vir_get_uniform_index(c, QUNIFORM_CONSTANT, 3582 0xffffff00 | 3583 V3D_TSY_WAIT_INC_CHECK); 3584 3585 } 3586 3587 /* The blocking of a TSY op only happens at the next thread 3588 * switch. No texturing may be outstanding at the time of a 3589 * TSY blocking operation. 
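                 * That is why we flushed any outstanding TMU work above and
                 * emit an explicit thread switch right after writing the
                 * TSY register.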
3590 */ 3591 vir_emit_thrsw(c); 3592 break; 3593 3594 case nir_intrinsic_load_num_workgroups: 3595 for (int i = 0; i < 3; i++) { 3596 ntq_store_dest(c, &instr->dest, i, 3597 vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS, 3598 i)); 3599 } 3600 break; 3601 3602 case nir_intrinsic_load_workgroup_id: { 3603 struct qreg x = vir_AND(c, c->cs_payload[0], 3604 vir_uniform_ui(c, 0xffff)); 3605 3606 struct qreg y = vir_SHR(c, c->cs_payload[0], 3607 vir_uniform_ui(c, 16)); 3608 3609 struct qreg z = vir_AND(c, c->cs_payload[1], 3610 vir_uniform_ui(c, 0xffff)); 3611 3612 /* We only support dispatch base in Vulkan */ 3613 if (c->key->environment == V3D_ENVIRONMENT_VULKAN) { 3614 x = vir_ADD(c, x, 3615 vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0)); 3616 y = vir_ADD(c, y, 3617 vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1)); 3618 z = vir_ADD(c, z, 3619 vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2)); 3620 } 3621 3622 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, x)); 3623 ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, y)); 3624 ntq_store_dest(c, &instr->dest, 2, vir_MOV(c, z)); 3625 break; 3626 } 3627 3628 case nir_intrinsic_load_local_invocation_index: 3629 ntq_store_dest(c, &instr->dest, 0, 3630 emit_load_local_invocation_index(c)); 3631 break; 3632 3633 case nir_intrinsic_load_subgroup_id: { 3634 /* This is basically the batch index, which is the Local 3635 * Invocation Index divided by the SIMD width). 3636 */ 3637 STATIC_ASSERT(IS_POT(V3D_CHANNELS) && V3D_CHANNELS > 0); 3638 const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1; 3639 struct qreg lii = emit_load_local_invocation_index(c); 3640 ntq_store_dest(c, &instr->dest, 0, 3641 vir_SHR(c, lii, 3642 vir_uniform_ui(c, divide_shift))); 3643 break; 3644 } 3645 3646 case nir_intrinsic_load_per_vertex_input: { 3647 /* The vertex shader writes all its used outputs into 3648 * consecutive VPM offsets, so if any output component is 3649 * unused, its VPM offset is used by the next used 3650 * component. This means that we can't assume that each 3651 * location will use 4 consecutive scalar offsets in the VPM 3652 * and we need to compute the VPM offset for each input by 3653 * going through the inputs and finding the one that matches 3654 * our location and component. 3655 * 3656 * col: vertex index, row = varying index 3657 */ 3658 assert(nir_src_is_const(instr->src[1])); 3659 uint32_t location = 3660 nir_intrinsic_io_semantics(instr).location + 3661 nir_src_as_uint(instr->src[1]); 3662 uint32_t component = nir_intrinsic_component(instr); 3663 3664 int32_t row_idx = -1; 3665 for (int i = 0; i < c->num_inputs; i++) { 3666 struct v3d_varying_slot slot = c->input_slots[i]; 3667 if (v3d_slot_get_slot(slot) == location && 3668 v3d_slot_get_component(slot) == component) { 3669 row_idx = i; 3670 break; 3671 } 3672 } 3673 3674 assert(row_idx != -1); 3675 3676 struct qreg col = ntq_get_src(c, instr->src[0], 0); 3677 for (int i = 0; i < instr->num_components; i++) { 3678 struct qreg row = vir_uniform_ui(c, row_idx++); 3679 ntq_store_dest(c, &instr->dest, i, 3680 vir_LDVPMG_IN(c, row, col)); 3681 } 3682 break; 3683 } 3684 3685 case nir_intrinsic_emit_vertex: 3686 case nir_intrinsic_end_primitive: 3687 unreachable("Should have been lowered in v3d_nir_lower_io"); 3688 break; 3689 3690 case nir_intrinsic_load_primitive_id: { 3691 /* gl_PrimitiveIdIn is written by the GBG in the first word of 3692 * VPM output header. According to docs, we should read this 3693 * using ldvpm(v,d)_in (See Table 71). 
3694 */ 3695 assert(c->s->info.stage == MESA_SHADER_GEOMETRY); 3696 ntq_store_dest(c, &instr->dest, 0, 3697 vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); 3698 break; 3699 } 3700 3701 case nir_intrinsic_load_invocation_id: 3702 ntq_store_dest(c, &instr->dest, 0, vir_IID(c)); 3703 break; 3704 3705 case nir_intrinsic_load_fb_layers_v3d: 3706 ntq_store_dest(c, &instr->dest, 0, 3707 vir_uniform(c, QUNIFORM_FB_LAYERS, 0)); 3708 break; 3709 3710 case nir_intrinsic_load_sample_id: 3711 ntq_store_dest(c, &instr->dest, 0, vir_SAMPID(c)); 3712 break; 3713 3714 case nir_intrinsic_load_sample_pos: 3715 ntq_store_dest(c, &instr->dest, 0, 3716 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)))); 3717 ntq_store_dest(c, &instr->dest, 1, 3718 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)))); 3719 break; 3720 3721 case nir_intrinsic_load_barycentric_at_offset: 3722 ntq_store_dest(c, &instr->dest, 0, 3723 vir_MOV(c, ntq_get_src(c, instr->src[0], 0))); 3724 ntq_store_dest(c, &instr->dest, 1, 3725 vir_MOV(c, ntq_get_src(c, instr->src[0], 1))); 3726 break; 3727 3728 case nir_intrinsic_load_barycentric_pixel: 3729 ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f)); 3730 ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f)); 3731 break; 3732 3733 case nir_intrinsic_load_barycentric_at_sample: { 3734 if (!c->fs_key->msaa) { 3735 ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f)); 3736 ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f)); 3737 return; 3738 } 3739 3740 struct qreg offset_x, offset_y; 3741 struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0); 3742 ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y); 3743 3744 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x)); 3745 ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y)); 3746 break; 3747 } 3748 3749 case nir_intrinsic_load_barycentric_sample: { 3750 struct qreg offset_x = 3751 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))); 3752 struct qreg offset_y = 3753 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))); 3754 3755 ntq_store_dest(c, &instr->dest, 0, 3756 vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f))); 3757 ntq_store_dest(c, &instr->dest, 1, 3758 vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f))); 3759 break; 3760 } 3761 3762 case nir_intrinsic_load_barycentric_centroid: { 3763 struct qreg offset_x, offset_y; 3764 ntq_get_barycentric_centroid(c, &offset_x, &offset_y); 3765 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x)); 3766 ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y)); 3767 break; 3768 } 3769 3770 case nir_intrinsic_load_interpolated_input: { 3771 assert(nir_src_is_const(instr->src[1])); 3772 const uint32_t offset = nir_src_as_uint(instr->src[1]); 3773 3774 for (int i = 0; i < instr->num_components; i++) { 3775 const uint32_t input_idx = 3776 (nir_intrinsic_base(instr) + offset) * 4 + 3777 nir_intrinsic_component(instr) + i; 3778 3779 /* If we are not in MSAA or if we are not interpolating 3780 * a user varying, just return the pre-computed 3781 * interpolated input. 3782 */ 3783 if (!c->fs_key->msaa || 3784 c->interp[input_idx].vp.file == QFILE_NULL) { 3785 ntq_store_dest(c, &instr->dest, i, 3786 vir_MOV(c, c->inputs[input_idx])); 3787 continue; 3788 } 3789 3790 /* Otherwise compute interpolation at the specified 3791 * offset. 
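 * The data saved per varying in c->interp (the vp payload value and the C
 * coefficient) is combined with the offset pair according to the varying's
 * interpolation mode by ntq_emit_load_interpolated_input().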
3792 */ 3793 struct qreg p = c->interp[input_idx].vp; 3794 struct qreg C = c->interp[input_idx].C; 3795 unsigned interp_mode = c->interp[input_idx].mode; 3796 3797 struct qreg offset_x = ntq_get_src(c, instr->src[0], 0); 3798 struct qreg offset_y = ntq_get_src(c, instr->src[0], 1); 3799 3800 struct qreg result = 3801 ntq_emit_load_interpolated_input(c, p, C, 3802 offset_x, offset_y, 3803 interp_mode); 3804 ntq_store_dest(c, &instr->dest, i, result); 3805 } 3806 break; 3807 } 3808 3809 case nir_intrinsic_load_subgroup_size: 3810 ntq_store_dest(c, &instr->dest, 0, 3811 vir_uniform_ui(c, V3D_CHANNELS)); 3812 break; 3813 3814 case nir_intrinsic_load_subgroup_invocation: 3815 ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c)); 3816 break; 3817 3818 case nir_intrinsic_elect: { 3819 set_a_flags_for_subgroup(c); 3820 struct qreg first = vir_FLAFIRST(c); 3821 3822 /* Produce a boolean result from Flafirst */ 3823 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), 3824 first, vir_uniform_ui(c, 1)), 3825 V3D_QPU_PF_PUSHZ); 3826 struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); 3827 ntq_store_dest(c, &instr->dest, 0, result); 3828 break; 3829 } 3830 3831 case nir_intrinsic_load_num_subgroups: 3832 unreachable("Should have been lowered"); 3833 break; 3834 3835 case nir_intrinsic_load_view_index: 3836 ntq_store_dest(c, &instr->dest, 0, 3837 vir_uniform(c, QUNIFORM_VIEW_INDEX, 0)); 3838 break; 3839 3840 default: 3841 fprintf(stderr, "Unknown intrinsic: "); 3842 nir_print_instr(&instr->instr, stderr); 3843 fprintf(stderr, "\n"); 3844 break; 3845 } 3846} 3847 3848/* Clears (activates) the execute flags for any channels whose jump target 3849 * matches this block. 3850 * 3851 * XXX perf: Could we be using flpush/flpop somehow for our execution channel 3852 * enabling? 3853 * 3854 */ 3855static void 3856ntq_activate_execute_for_block(struct v3d_compile *c) 3857{ 3858 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), 3859 c->execute, vir_uniform_ui(c, c->cur_block->index)), 3860 V3D_QPU_PF_PUSHZ); 3861 3862 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); 3863} 3864 3865static void 3866ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt) 3867{ 3868 nir_block *nir_else_block = nir_if_first_else_block(if_stmt); 3869 bool empty_else_block = 3870 (nir_else_block == nir_if_last_else_block(if_stmt) && 3871 exec_list_is_empty(&nir_else_block->instr_list)); 3872 3873 struct qblock *then_block = vir_new_block(c); 3874 struct qblock *after_block = vir_new_block(c); 3875 struct qblock *else_block; 3876 if (empty_else_block) 3877 else_block = after_block; 3878 else 3879 else_block = vir_new_block(c); 3880 3881 /* Check if this if statement is really just a conditional jump with 3882 * the form: 3883 * 3884 * if (cond) { 3885 * break/continue; 3886 * } else { 3887 * } 3888 * 3889 * In which case we can skip the jump to ELSE we emit before the THEN 3890 * block and instead just emit the break/continue directly. 3891 */ 3892 nir_jump_instr *conditional_jump = NULL; 3893 if (empty_else_block) { 3894 nir_block *nir_then_block = nir_if_first_then_block(if_stmt); 3895 struct nir_instr *inst = nir_block_first_instr(nir_then_block); 3896 if (inst && inst->type == nir_instr_type_jump) 3897 conditional_jump = nir_instr_as_jump(inst); 3898 } 3899 3900 /* Set up the flags for the IF condition (taking the THEN branch). */ 3901 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition); 3902 3903 if (!conditional_jump) { 3904 /* Jump to ELSE. 
*/ 3905 struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ? 3906 V3D_QPU_BRANCH_COND_ANYNA : 3907 V3D_QPU_BRANCH_COND_ANYA); 3908 /* Pixels that were not dispatched or have been discarded 3909 * should not contribute to the ANYA/ANYNA condition. 3910 */ 3911 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P; 3912 3913 vir_link_blocks(c->cur_block, else_block); 3914 vir_link_blocks(c->cur_block, then_block); 3915 3916 /* Process the THEN block. */ 3917 vir_set_emit_block(c, then_block); 3918 ntq_emit_cf_list(c, &if_stmt->then_list); 3919 3920 if (!empty_else_block) { 3921 /* At the end of the THEN block, jump to ENDIF, unless 3922 * the block ended in a break or continue. 3923 */ 3924 if (!c->cur_block->branch_emitted) { 3925 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS); 3926 vir_link_blocks(c->cur_block, after_block); 3927 } 3928 3929 /* Emit the else block. */ 3930 vir_set_emit_block(c, else_block); 3931 ntq_emit_cf_list(c, &if_stmt->else_list); 3932 } 3933 } else { 3934 /* Emit the conditional jump directly. 3935 * 3936 * Use ALL with breaks and ANY with continues to ensure that 3937 * we always break and never continue when all lanes have been 3938 * disabled (for example because of discards) to prevent 3939 * infinite loops. 3940 */ 3941 assert(conditional_jump && 3942 (conditional_jump->type == nir_jump_continue || 3943 conditional_jump->type == nir_jump_break)); 3944 3945 struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ? 3946 (conditional_jump->type == nir_jump_break ? 3947 V3D_QPU_BRANCH_COND_ALLA : 3948 V3D_QPU_BRANCH_COND_ANYA) : 3949 (conditional_jump->type == nir_jump_break ? 3950 V3D_QPU_BRANCH_COND_ALLNA : 3951 V3D_QPU_BRANCH_COND_ANYNA)); 3952 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P; 3953 3954 vir_link_blocks(c->cur_block, 3955 conditional_jump->type == nir_jump_break ? 3956 c->loop_break_block : 3957 c->loop_cont_block); 3958 } 3959 3960 vir_link_blocks(c->cur_block, after_block); 3961 3962 vir_set_emit_block(c, after_block); 3963} 3964 3965static void 3966ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) 3967{ 3968 nir_block *nir_else_block = nir_if_first_else_block(if_stmt); 3969 bool empty_else_block = 3970 (nir_else_block == nir_if_last_else_block(if_stmt) && 3971 exec_list_is_empty(&nir_else_block->instr_list)); 3972 3973 struct qblock *then_block = vir_new_block(c); 3974 struct qblock *after_block = vir_new_block(c); 3975 struct qblock *else_block; 3976 if (empty_else_block) 3977 else_block = after_block; 3978 else 3979 else_block = vir_new_block(c); 3980 3981 bool was_uniform_control_flow = false; 3982 if (!vir_in_nonuniform_control_flow(c)) { 3983 c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); 3984 was_uniform_control_flow = true; 3985 } 3986 3987 /* Set up the flags for the IF condition (taking the THEN branch). */ 3988 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition); 3989 3990 /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and 3991 * was previously active (execute Z) for updating the exec flags. 
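 * If we were still in uniform control flow it is enough to invert the
 * condition; otherwise the MOV of c->execute below folds the "channel is
 * active (execute == 0)" test into the A flags via the NORNZ/ANDZ
 * unsigned-flag updates.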
3992 */ 3993 if (was_uniform_control_flow) { 3994 cond = v3d_qpu_cond_invert(cond); 3995 } else { 3996 struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute); 3997 if (cond == V3D_QPU_COND_IFA) { 3998 vir_set_uf(c, inst, V3D_QPU_UF_NORNZ); 3999 } else { 4000 vir_set_uf(c, inst, V3D_QPU_UF_ANDZ); 4001 cond = V3D_QPU_COND_IFA; 4002 } 4003 } 4004 4005 vir_MOV_cond(c, cond, 4006 c->execute, 4007 vir_uniform_ui(c, else_block->index)); 4008 4009 /* Jump to ELSE if nothing is active for THEN, otherwise fall 4010 * through. 4011 */ 4012 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); 4013 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); 4014 vir_link_blocks(c->cur_block, else_block); 4015 vir_link_blocks(c->cur_block, then_block); 4016 4017 /* Process the THEN block. */ 4018 vir_set_emit_block(c, then_block); 4019 ntq_emit_cf_list(c, &if_stmt->then_list); 4020 4021 if (!empty_else_block) { 4022 /* Handle the end of the THEN block. First, all currently 4023 * active channels update their execute flags to point to 4024 * ENDIF 4025 */ 4026 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 4027 V3D_QPU_PF_PUSHZ); 4028 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, 4029 vir_uniform_ui(c, after_block->index)); 4030 4031 /* If everything points at ENDIF, then jump there immediately. */ 4032 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), 4033 c->execute, 4034 vir_uniform_ui(c, after_block->index)), 4035 V3D_QPU_PF_PUSHZ); 4036 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); 4037 vir_link_blocks(c->cur_block, after_block); 4038 vir_link_blocks(c->cur_block, else_block); 4039 4040 vir_set_emit_block(c, else_block); 4041 ntq_activate_execute_for_block(c); 4042 ntq_emit_cf_list(c, &if_stmt->else_list); 4043 } 4044 4045 vir_link_blocks(c->cur_block, after_block); 4046 4047 vir_set_emit_block(c, after_block); 4048 if (was_uniform_control_flow) 4049 c->execute = c->undef; 4050 else 4051 ntq_activate_execute_for_block(c); 4052} 4053 4054static void 4055ntq_emit_if(struct v3d_compile *c, nir_if *nif) 4056{ 4057 bool was_in_control_flow = c->in_control_flow; 4058 c->in_control_flow = true; 4059 if (!vir_in_nonuniform_control_flow(c) && 4060 !nir_src_is_divergent(nif->condition)) { 4061 ntq_emit_uniform_if(c, nif); 4062 } else { 4063 ntq_emit_nonuniform_if(c, nif); 4064 } 4065 c->in_control_flow = was_in_control_flow; 4066} 4067 4068static void 4069ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump) 4070{ 4071 switch (jump->type) { 4072 case nir_jump_break: 4073 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 4074 V3D_QPU_PF_PUSHZ); 4075 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, 4076 vir_uniform_ui(c, c->loop_break_block->index)); 4077 break; 4078 4079 case nir_jump_continue: 4080 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 4081 V3D_QPU_PF_PUSHZ); 4082 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, 4083 vir_uniform_ui(c, c->loop_cont_block->index)); 4084 break; 4085 4086 case nir_jump_return: 4087 unreachable("All returns should be lowered\n"); 4088 break; 4089 4090 case nir_jump_halt: 4091 case nir_jump_goto: 4092 case nir_jump_goto_if: 4093 unreachable("not supported\n"); 4094 break; 4095 } 4096} 4097 4098static void 4099ntq_emit_uniform_jump(struct v3d_compile *c, nir_jump_instr *jump) 4100{ 4101 switch (jump->type) { 4102 case nir_jump_break: 4103 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS); 4104 vir_link_blocks(c->cur_block, c->loop_break_block); 4105 c->cur_block->branch_emitted = true; 4106 break; 4107 case nir_jump_continue: 4108 vir_BRANCH(c, 
V3D_QPU_BRANCH_COND_ALWAYS); 4109 vir_link_blocks(c->cur_block, c->loop_cont_block); 4110 c->cur_block->branch_emitted = true; 4111 break; 4112 4113 case nir_jump_return: 4114 unreachable("All returns should be lowered\n"); 4115 break; 4116 4117 case nir_jump_halt: 4118 case nir_jump_goto: 4119 case nir_jump_goto_if: 4120 unreachable("not supported\n"); 4121 break; 4122 } 4123} 4124 4125static void 4126ntq_emit_instr(struct v3d_compile *c, nir_instr *instr) 4127{ 4128 switch (instr->type) { 4129 case nir_instr_type_alu: 4130 ntq_emit_alu(c, nir_instr_as_alu(instr)); 4131 break; 4132 4133 case nir_instr_type_intrinsic: 4134 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr)); 4135 break; 4136 4137 case nir_instr_type_load_const: 4138 ntq_emit_load_const(c, nir_instr_as_load_const(instr)); 4139 break; 4140 4141 case nir_instr_type_ssa_undef: 4142 unreachable("Should've been lowered by nir_lower_undef_to_zero"); 4143 break; 4144 4145 case nir_instr_type_tex: 4146 ntq_emit_tex(c, nir_instr_as_tex(instr)); 4147 break; 4148 4149 case nir_instr_type_jump: 4150 /* Always flush TMU before jumping to another block, for the 4151 * same reasons as in ntq_emit_block. 4152 */ 4153 ntq_flush_tmu(c); 4154 if (vir_in_nonuniform_control_flow(c)) 4155 ntq_emit_jump(c, nir_instr_as_jump(instr)); 4156 else 4157 ntq_emit_uniform_jump(c, nir_instr_as_jump(instr)); 4158 break; 4159 4160 default: 4161 fprintf(stderr, "Unknown NIR instr type: "); 4162 nir_print_instr(instr, stderr); 4163 fprintf(stderr, "\n"); 4164 abort(); 4165 } 4166} 4167 4168static void 4169ntq_emit_block(struct v3d_compile *c, nir_block *block) 4170{ 4171 nir_foreach_instr(instr, block) { 4172 ntq_emit_instr(c, instr); 4173 } 4174 4175 /* Always process pending TMU operations in the same block they were 4176 * emitted: we can't emit TMU operations in a block and then emit a 4177 * thread switch and LDTMU/TMUWT for them in another block, possibly 4178 * under control flow. 4179 */ 4180 ntq_flush_tmu(c); 4181} 4182 4183static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list); 4184 4185static void 4186ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop) 4187{ 4188 bool was_uniform_control_flow = false; 4189 if (!vir_in_nonuniform_control_flow(c)) { 4190 c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); 4191 was_uniform_control_flow = true; 4192 } 4193 4194 c->loop_cont_block = vir_new_block(c); 4195 c->loop_break_block = vir_new_block(c); 4196 4197 vir_link_blocks(c->cur_block, c->loop_cont_block); 4198 vir_set_emit_block(c, c->loop_cont_block); 4199 ntq_activate_execute_for_block(c); 4200 4201 ntq_emit_cf_list(c, &loop->body); 4202 4203 /* Re-enable any previous continues now, so our ANYA check below 4204 * works. 4205 * 4206 * XXX: Use the .ORZ flags update, instead. 4207 */ 4208 vir_set_pf(c, vir_XOR_dest(c, 4209 vir_nop_reg(), 4210 c->execute, 4211 vir_uniform_ui(c, c->loop_cont_block->index)), 4212 V3D_QPU_PF_PUSHZ); 4213 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); 4214 4215 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); 4216 4217 struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA); 4218 /* Pixels that were not dispatched or have been discarded should not 4219 * contribute to looping again. 
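 * (That is why branch->qpu.branch.msfign is set to MSFIGN_P right below,
 * just like in ntq_emit_uniform_if().)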
4220 */ 4221 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P; 4222 vir_link_blocks(c->cur_block, c->loop_cont_block); 4223 vir_link_blocks(c->cur_block, c->loop_break_block); 4224 4225 vir_set_emit_block(c, c->loop_break_block); 4226 if (was_uniform_control_flow) 4227 c->execute = c->undef; 4228 else 4229 ntq_activate_execute_for_block(c); 4230} 4231 4232static void 4233ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop) 4234{ 4235 c->loop_cont_block = vir_new_block(c); 4236 c->loop_break_block = vir_new_block(c); 4237 4238 vir_link_blocks(c->cur_block, c->loop_cont_block); 4239 vir_set_emit_block(c, c->loop_cont_block); 4240 4241 ntq_emit_cf_list(c, &loop->body); 4242 4243 if (!c->cur_block->branch_emitted) { 4244 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS); 4245 vir_link_blocks(c->cur_block, c->loop_cont_block); 4246 } 4247 4248 vir_set_emit_block(c, c->loop_break_block); 4249} 4250 4251static void 4252ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) 4253{ 4254 /* Disable flags optimization for loop conditions. The problem here is 4255 * that we can have code like this: 4256 * 4257 * // block_0 4258 * vec1 32 con ssa_9 = ine32 ssa_8, ssa_2 4259 * loop { 4260 * // block_1 4261 * if ssa_9 { 4262 * 4263 * In this example we emit flags to compute ssa_9 and the optimization 4264 * will skip regenerating them again for the loop condition in the 4265 * loop continue block (block_1). However, this is not safe after the 4266 * first iteration because the loop body can stomp the flags if it has 4267 * any conditionals. 4268 */ 4269 c->flags_temp = -1; 4270 4271 bool was_in_control_flow = c->in_control_flow; 4272 c->in_control_flow = true; 4273 4274 struct qblock *save_loop_cont_block = c->loop_cont_block; 4275 struct qblock *save_loop_break_block = c->loop_break_block; 4276 4277 if (vir_in_nonuniform_control_flow(c) || loop->divergent) { 4278 ntq_emit_nonuniform_loop(c, loop); 4279 } else { 4280 ntq_emit_uniform_loop(c, loop); 4281 } 4282 4283 c->loop_break_block = save_loop_break_block; 4284 c->loop_cont_block = save_loop_cont_block; 4285 4286 c->loops++; 4287 4288 c->in_control_flow = was_in_control_flow; 4289} 4290 4291static void 4292ntq_emit_function(struct v3d_compile *c, nir_function_impl *func) 4293{ 4294 fprintf(stderr, "FUNCTIONS not handled.\n"); 4295 abort(); 4296} 4297 4298static void 4299ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list) 4300{ 4301 foreach_list_typed(nir_cf_node, node, node, list) { 4302 switch (node->type) { 4303 case nir_cf_node_block: 4304 ntq_emit_block(c, nir_cf_node_as_block(node)); 4305 break; 4306 4307 case nir_cf_node_if: 4308 ntq_emit_if(c, nir_cf_node_as_if(node)); 4309 break; 4310 4311 case nir_cf_node_loop: 4312 ntq_emit_loop(c, nir_cf_node_as_loop(node)); 4313 break; 4314 4315 case nir_cf_node_function: 4316 ntq_emit_function(c, nir_cf_node_as_function(node)); 4317 break; 4318 4319 default: 4320 fprintf(stderr, "Unknown NIR node type\n"); 4321 abort(); 4322 } 4323 } 4324} 4325 4326static void 4327ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl) 4328{ 4329 ntq_setup_registers(c, &impl->registers); 4330 ntq_emit_cf_list(c, &impl->body); 4331} 4332 4333static void 4334nir_to_vir(struct v3d_compile *c) 4335{ 4336 switch (c->s->info.stage) { 4337 case MESA_SHADER_FRAGMENT: 4338 c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); 4339 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); 4340 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); 4341 4342 /* V3D 4.x can disable implicit varyings if they are not used */ 4343 
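                /* gl_PrimitiveID only needs an implicit varying when there is
                 * no geometry shader stage (a GS writes it as a regular
                 * varying instead), hence the !has_gs check below.
                 */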
c->fs_uses_primitive_id = 4344 nir_find_variable_with_location(c->s, nir_var_shader_in, 4345 VARYING_SLOT_PRIMITIVE_ID); 4346 if (c->fs_uses_primitive_id && !c->fs_key->has_gs) { 4347 c->primitive_id = 4348 emit_fragment_varying(c, NULL, -1, 0, 0); 4349 } 4350 4351 if (c->fs_key->is_points && 4352 (c->devinfo->ver < 40 || program_reads_point_coord(c))) { 4353 c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0); 4354 c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0); 4355 c->uses_implicit_point_line_varyings = true; 4356 } else if (c->fs_key->is_lines && 4357 (c->devinfo->ver < 40 || 4358 BITSET_TEST(c->s->info.system_values_read, 4359 SYSTEM_VALUE_LINE_COORD))) { 4360 c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0); 4361 c->uses_implicit_point_line_varyings = true; 4362 } 4363 4364 c->force_per_sample_msaa = 4365 c->s->info.fs.uses_sample_qualifier || 4366 BITSET_TEST(c->s->info.system_values_read, 4367 SYSTEM_VALUE_SAMPLE_ID) || 4368 BITSET_TEST(c->s->info.system_values_read, 4369 SYSTEM_VALUE_SAMPLE_POS); 4370 break; 4371 case MESA_SHADER_COMPUTE: 4372 /* Set up the TSO for barriers, assuming we do some. */ 4373 if (c->devinfo->ver < 42) { 4374 vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, 4375 V3D_QPU_WADDR_SYNC)); 4376 } 4377 4378 c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); 4379 c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); 4380 4381 /* Set up the division between gl_LocalInvocationIndex and 4382 * wg_in_mem in the payload reg. 4383 */ 4384 int wg_size = (c->s->info.workgroup_size[0] * 4385 c->s->info.workgroup_size[1] * 4386 c->s->info.workgroup_size[2]); 4387 c->local_invocation_index_bits = 4388 ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1; 4389 assert(c->local_invocation_index_bits <= 8); 4390 4391 if (c->s->info.shared_size) { 4392 struct qreg wg_in_mem = vir_SHR(c, c->cs_payload[1], 4393 vir_uniform_ui(c, 16)); 4394 if (c->s->info.workgroup_size[0] != 1 || 4395 c->s->info.workgroup_size[1] != 1 || 4396 c->s->info.workgroup_size[2] != 1) { 4397 int wg_bits = (16 - 4398 c->local_invocation_index_bits); 4399 int wg_mask = (1 << wg_bits) - 1; 4400 wg_in_mem = vir_AND(c, wg_in_mem, 4401 vir_uniform_ui(c, wg_mask)); 4402 } 4403 struct qreg shared_per_wg = 4404 vir_uniform_ui(c, c->s->info.shared_size); 4405 4406 c->cs_shared_offset = 4407 vir_ADD(c, 4408 vir_uniform(c, QUNIFORM_SHARED_OFFSET,0), 4409 vir_UMUL(c, wg_in_mem, shared_per_wg)); 4410 } 4411 break; 4412 default: 4413 break; 4414 } 4415 4416 if (c->s->scratch_size) { 4417 v3d_setup_spill_base(c); 4418 c->spill_size += V3D_CHANNELS * c->s->scratch_size; 4419 } 4420 4421 switch (c->s->info.stage) { 4422 case MESA_SHADER_VERTEX: 4423 ntq_setup_vs_inputs(c); 4424 break; 4425 case MESA_SHADER_GEOMETRY: 4426 ntq_setup_gs_inputs(c); 4427 break; 4428 case MESA_SHADER_FRAGMENT: 4429 ntq_setup_fs_inputs(c); 4430 break; 4431 case MESA_SHADER_COMPUTE: 4432 break; 4433 default: 4434 unreachable("unsupported shader stage"); 4435 } 4436 4437 ntq_setup_outputs(c); 4438 4439 /* Find the main function and emit the body. */ 4440 nir_foreach_function(function, c->s) { 4441 assert(function->is_entrypoint); 4442 assert(function->impl); 4443 ntq_emit_impl(c, function->impl); 4444 } 4445} 4446 4447/** 4448 * When demoting a shader down to single-threaded, removes the THRSW 4449 * instructions (one will still be inserted at v3d_vir_to_qpu() for the 4450 * program end). 
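 * c->last_thrsw is cleared as well, since after this the program no longer
 * contains any thread switch.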
4451 */ 4452static void 4453vir_remove_thrsw(struct v3d_compile *c) 4454{ 4455 vir_for_each_block(block, c) { 4456 vir_for_each_inst_safe(inst, block) { 4457 if (inst->qpu.sig.thrsw) 4458 vir_remove_instruction(c, inst); 4459 } 4460 } 4461 4462 c->last_thrsw = NULL; 4463} 4464 4465/** 4466 * This makes sure we have a top-level last thread switch which signals the 4467 * start of the last thread section, which may include adding a new thrsw 4468 * instruction if needed. We don't allow spilling in the last thread section, so 4469 * if we need to do any spills that inject additional thread switches later on, 4470 * we ensure this thread switch will still be the last thread switch in the 4471 * program, which makes last thread switch signalling a lot easier when we have 4472 * spilling. If in the end we don't need to spill to compile the program and we 4473 * injected a new thread switch instruction here only for that, we will 4474 * eventually restore the previous last thread switch and remove the one we 4475 * added here. 4476 */ 4477static void 4478vir_emit_last_thrsw(struct v3d_compile *c, 4479 struct qinst **restore_last_thrsw, 4480 bool *restore_scoreboard_lock) 4481{ 4482 *restore_last_thrsw = c->last_thrsw; 4483 4484 /* On V3D before 4.1, we need a TMU op to be outstanding when thread 4485 * switching, so disable threads if we didn't do any TMU ops (each of 4486 * which would have emitted a THRSW). 4487 */ 4488 if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) { 4489 c->threads = 1; 4490 if (c->last_thrsw) 4491 vir_remove_thrsw(c); 4492 *restore_last_thrsw = NULL; 4493 } 4494 4495 /* If we're threaded and the last THRSW was in conditional code, then 4496 * we need to emit another one so that we can flag it as the last 4497 * thrsw. 4498 */ 4499 if (c->last_thrsw && !c->last_thrsw_at_top_level) { 4500 assert(c->devinfo->ver >= 41); 4501 vir_emit_thrsw(c); 4502 } 4503 4504 /* If we're threaded, then we need to mark the last THRSW instruction 4505 * so we can emit a pair of them at QPU emit time. 4506 * 4507 * For V3D 4.x, we can spawn the non-fragment shaders already in the 4508 * post-last-THRSW state, so we can skip this. 4509 */ 4510 if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) { 4511 assert(c->devinfo->ver >= 41); 4512 vir_emit_thrsw(c); 4513 } 4514 4515 /* If we have not inserted a last thread switch yet, do it now to ensure 4516 * any potential spilling we do happens before this. If we don't spill 4517 * in the end, we will restore the previous one. 4518 */ 4519 if (*restore_last_thrsw == c->last_thrsw) { 4520 if (*restore_last_thrsw) 4521 (*restore_last_thrsw)->is_last_thrsw = false; 4522 *restore_scoreboard_lock = c->lock_scoreboard_on_first_thrsw; 4523 vir_emit_thrsw(c); 4524 } else { 4525 *restore_last_thrsw = c->last_thrsw; 4526 } 4527 4528 assert(c->last_thrsw); 4529 c->last_thrsw->is_last_thrsw = true; 4530} 4531 4532static void 4533vir_restore_last_thrsw(struct v3d_compile *c, 4534 struct qinst *thrsw, 4535 bool scoreboard_lock) 4536{ 4537 assert(c->last_thrsw); 4538 vir_remove_instruction(c, c->last_thrsw); 4539 c->last_thrsw = thrsw; 4540 if (c->last_thrsw) 4541 c->last_thrsw->is_last_thrsw = true; 4542 c->lock_scoreboard_on_first_thrsw = scoreboard_lock; 4543} 4544 4545/* There's a flag in the shader for "center W is needed for reasons other than 4546 * non-centroid varyings", so we just walk the program after VIR optimization 4547 * to see if it's used. It should be harmless to set even if we only use 4548 * center W for varyings. 
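 * The walk below simply looks for any instruction that reads payload
 * register 0, which is where c->payload_w (W at the pixel center) comes from
 * in fragment shaders.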
4549 */ 4550 static void 4551 vir_check_payload_w(struct v3d_compile *c) 4552 { 4553 if (c->s->info.stage != MESA_SHADER_FRAGMENT) 4554 return; 4555 4556 vir_for_each_inst_inorder(inst, c) { 4557 for (int i = 0; i < vir_get_nsrc(inst); i++) { 4558 if (inst->src[i].file == QFILE_REG && 4559 inst->src[i].index == 0) { 4560 c->uses_center_w = true; 4561 return; 4562 } 4563 } 4564 } 4565 } 4566 4567 void 4568 v3d_nir_to_vir(struct v3d_compile *c) 4569 { 4570 if (V3D_DEBUG & (V3D_DEBUG_NIR | 4571 v3d_debug_flag_for_shader_stage(c->s->info.stage))) { 4572 fprintf(stderr, "%s prog %d/%d NIR:\n", 4573 vir_get_stage_name(c), 4574 c->program_id, c->variant_id); 4575 nir_print_shader(c->s, stderr); 4576 } 4577 4578 nir_to_vir(c); 4579 4580 bool restore_scoreboard_lock = false; 4581 struct qinst *restore_last_thrsw; 4582 4583 /* Emit the last THRSW before STVPM and TLB writes. */ 4584 vir_emit_last_thrsw(c, 4585 &restore_last_thrsw, 4586 &restore_scoreboard_lock); 4587 4588 4589 switch (c->s->info.stage) { 4590 case MESA_SHADER_FRAGMENT: 4591 emit_frag_end(c); 4592 break; 4593 case MESA_SHADER_GEOMETRY: 4594 emit_geom_end(c); 4595 break; 4596 case MESA_SHADER_VERTEX: 4597 emit_vert_end(c); 4598 break; 4599 case MESA_SHADER_COMPUTE: 4600 break; 4601 default: 4602 unreachable("bad stage"); 4603 } 4604 4605 if (V3D_DEBUG & (V3D_DEBUG_VIR | 4606 v3d_debug_flag_for_shader_stage(c->s->info.stage))) { 4607 fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n", 4608 vir_get_stage_name(c), 4609 c->program_id, c->variant_id); 4610 vir_dump(c); 4611 fprintf(stderr, "\n"); 4612 } 4613 4614 vir_optimize(c); 4615 4616 vir_check_payload_w(c); 4617 4618 /* XXX perf: On VC4, we do VIR-level instruction scheduling here. 4619 * We used that on that platform to pipeline TMU writes and reduce the 4620 * number of thread switches, as well as try (mostly successfully) to 4621 * reduce maximum register pressure to allow more threads. We should 4622 * do something of that sort for V3D -- either instruction scheduling 4623 * here, or delay the THRSW and LDTMUs from our texture 4624 * instructions until the results are needed. 4625 */ 4626 4627 if (V3D_DEBUG & (V3D_DEBUG_VIR | 4628 v3d_debug_flag_for_shader_stage(c->s->info.stage))) { 4629 fprintf(stderr, "%s prog %d/%d VIR:\n", 4630 vir_get_stage_name(c), 4631 c->program_id, c->variant_id); 4632 vir_dump(c); 4633 fprintf(stderr, "\n"); 4634 } 4635 4636 /* Attempt to allocate registers for the temporaries. If we fail, 4637 * reduce thread count and try again. 4638 */ 4639 int min_threads = (c->devinfo->ver >= 41) ? 2 : 1; 4640 struct qpu_reg *temp_registers; 4641 while (true) { 4642 temp_registers = v3d_register_allocate(c); 4643 if (temp_registers) { 4644 assert(c->spills + c->fills <= c->max_tmu_spills); 4645 break; 4646 } 4647 4648 if (c->threads == min_threads && 4649 (V3D_DEBUG & V3D_DEBUG_RA)) { 4650 fprintf(stderr, 4651 "Failed to register allocate using %s\n", 4652 c->fallback_scheduler ?
"the fallback scheduler:" : 4653 "the normal scheduler:"); 4654 4655 vir_dump(c); 4656 4657 char *shaderdb; 4658 int ret = v3d_shaderdb_dump(c, &shaderdb); 4659 if (ret > 0) { 4660 fprintf(stderr, "%s\n", shaderdb); 4661 free(shaderdb); 4662 } 4663 } 4664 4665 if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) { 4666 if (V3D_DEBUG & V3D_DEBUG_PERF) { 4667 fprintf(stderr, 4668 "Failed to register allocate %s " 4669 "prog %d/%d at %d threads.\n", 4670 vir_get_stage_name(c), 4671 c->program_id, c->variant_id, c->threads); 4672 } 4673 c->compilation_result = 4674 V3D_COMPILATION_FAILED_REGISTER_ALLOCATION; 4675 return; 4676 } 4677 4678 c->spills = 0; 4679 c->fills = 0; 4680 c->threads /= 2; 4681 4682 if (c->threads == 1) 4683 vir_remove_thrsw(c); 4684 } 4685 4686 /* If we didn't spill, then remove the last thread switch we injected 4687 * artificially (if any) and restore the previous one. 4688 */ 4689 if (!c->spills && c->last_thrsw != restore_last_thrsw) 4690 vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock); 4691 4692 if (c->spills && 4693 (V3D_DEBUG & (V3D_DEBUG_VIR | 4694 v3d_debug_flag_for_shader_stage(c->s->info.stage)))) { 4695 fprintf(stderr, "%s prog %d/%d spilled VIR:\n", 4696 vir_get_stage_name(c), 4697 c->program_id, c->variant_id); 4698 vir_dump(c); 4699 fprintf(stderr, "\n"); 4700 } 4701 4702 v3d_vir_to_qpu(c, temp_registers); 4703} 4704