1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright © 2016 Broadcom 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21bf215546Sopenharmony_ci * IN THE SOFTWARE. 22bf215546Sopenharmony_ci */ 23bf215546Sopenharmony_ci 24bf215546Sopenharmony_ci#include <inttypes.h> 25bf215546Sopenharmony_ci#include "util/format/u_format.h" 26bf215546Sopenharmony_ci#include "util/u_helpers.h" 27bf215546Sopenharmony_ci#include "util/u_math.h" 28bf215546Sopenharmony_ci#include "util/u_memory.h" 29bf215546Sopenharmony_ci#include "util/ralloc.h" 30bf215546Sopenharmony_ci#include "util/hash_table.h" 31bf215546Sopenharmony_ci#include "compiler/nir/nir.h" 32bf215546Sopenharmony_ci#include "compiler/nir/nir_builder.h" 33bf215546Sopenharmony_ci#include "common/v3d_device_info.h" 34bf215546Sopenharmony_ci#include "v3d_compiler.h" 35bf215546Sopenharmony_ci 36bf215546Sopenharmony_ci/* We don't do any address packing. */ 37bf215546Sopenharmony_ci#define __gen_user_data void 38bf215546Sopenharmony_ci#define __gen_address_type uint32_t 39bf215546Sopenharmony_ci#define __gen_address_offset(reloc) (*reloc) 40bf215546Sopenharmony_ci#define __gen_emit_reloc(cl, reloc) 41bf215546Sopenharmony_ci#include "cle/v3d_packet_v41_pack.h" 42bf215546Sopenharmony_ci 43bf215546Sopenharmony_ci#define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7) 44bf215546Sopenharmony_ci#define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7) 45bf215546Sopenharmony_ci#define GENERAL_TMU_LOOKUP_TYPE_8BIT_I (0 << 0) 46bf215546Sopenharmony_ci#define GENERAL_TMU_LOOKUP_TYPE_16BIT_I (1 << 0) 47bf215546Sopenharmony_ci#define GENERAL_TMU_LOOKUP_TYPE_VEC2 (2 << 0) 48bf215546Sopenharmony_ci#define GENERAL_TMU_LOOKUP_TYPE_VEC3 (3 << 0) 49bf215546Sopenharmony_ci#define GENERAL_TMU_LOOKUP_TYPE_VEC4 (4 << 0) 50bf215546Sopenharmony_ci#define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI (5 << 0) 51bf215546Sopenharmony_ci#define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI (6 << 0) 52bf215546Sopenharmony_ci#define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI (7 << 0) 53bf215546Sopenharmony_ci 54bf215546Sopenharmony_ci#define V3D_TSY_SET_QUORUM 0 55bf215546Sopenharmony_ci#define V3D_TSY_INC_WAITERS 1 56bf215546Sopenharmony_ci#define V3D_TSY_DEC_WAITERS 2 57bf215546Sopenharmony_ci#define V3D_TSY_INC_QUORUM 3 58bf215546Sopenharmony_ci#define V3D_TSY_DEC_QUORUM 4 59bf215546Sopenharmony_ci#define V3D_TSY_FREE_ALL 5 60bf215546Sopenharmony_ci#define V3D_TSY_RELEASE 6 61bf215546Sopenharmony_ci#define V3D_TSY_ACQUIRE 7 62bf215546Sopenharmony_ci#define V3D_TSY_WAIT 8 63bf215546Sopenharmony_ci#define V3D_TSY_WAIT_INC 9 64bf215546Sopenharmony_ci#define V3D_TSY_WAIT_CHECK 10 65bf215546Sopenharmony_ci#define V3D_TSY_WAIT_INC_CHECK 11 66bf215546Sopenharmony_ci#define V3D_TSY_WAIT_CV 12 67bf215546Sopenharmony_ci#define V3D_TSY_INC_SEMAPHORE 13 68bf215546Sopenharmony_ci#define V3D_TSY_DEC_SEMAPHORE 14 69bf215546Sopenharmony_ci#define V3D_TSY_SET_QUORUM_FREE_ALL 15 70bf215546Sopenharmony_ci 71bf215546Sopenharmony_cienum v3d_tmu_op_type 72bf215546Sopenharmony_ci{ 73bf215546Sopenharmony_ci V3D_TMU_OP_TYPE_REGULAR, 74bf215546Sopenharmony_ci V3D_TMU_OP_TYPE_ATOMIC, 75bf215546Sopenharmony_ci V3D_TMU_OP_TYPE_CACHE 76bf215546Sopenharmony_ci}; 77bf215546Sopenharmony_ci 78bf215546Sopenharmony_cistatic enum v3d_tmu_op_type 79bf215546Sopenharmony_civ3d_tmu_get_type_from_op(uint32_t tmu_op, bool is_write) 80bf215546Sopenharmony_ci{ 81bf215546Sopenharmony_ci switch(tmu_op) { 82bf215546Sopenharmony_ci case V3D_TMU_OP_WRITE_ADD_READ_PREFETCH: 83bf215546Sopenharmony_ci case V3D_TMU_OP_WRITE_SUB_READ_CLEAR: 84bf215546Sopenharmony_ci case V3D_TMU_OP_WRITE_XCHG_READ_FLUSH: 85bf215546Sopenharmony_ci case V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH: 86bf215546Sopenharmony_ci case V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR: 87bf215546Sopenharmony_ci return is_write ? V3D_TMU_OP_TYPE_ATOMIC : V3D_TMU_OP_TYPE_CACHE; 88bf215546Sopenharmony_ci case V3D_TMU_OP_WRITE_UMAX: 89bf215546Sopenharmony_ci case V3D_TMU_OP_WRITE_SMIN: 90bf215546Sopenharmony_ci case V3D_TMU_OP_WRITE_SMAX: 91bf215546Sopenharmony_ci assert(is_write); 92bf215546Sopenharmony_ci FALLTHROUGH; 93bf215546Sopenharmony_ci case V3D_TMU_OP_WRITE_AND_READ_INC: 94bf215546Sopenharmony_ci case V3D_TMU_OP_WRITE_OR_READ_DEC: 95bf215546Sopenharmony_ci case V3D_TMU_OP_WRITE_XOR_READ_NOT: 96bf215546Sopenharmony_ci return V3D_TMU_OP_TYPE_ATOMIC; 97bf215546Sopenharmony_ci case V3D_TMU_OP_REGULAR: 98bf215546Sopenharmony_ci return V3D_TMU_OP_TYPE_REGULAR; 99bf215546Sopenharmony_ci 100bf215546Sopenharmony_ci default: 101bf215546Sopenharmony_ci unreachable("Unknown tmu_op\n"); 102bf215546Sopenharmony_ci } 103bf215546Sopenharmony_ci} 104bf215546Sopenharmony_cistatic void 105bf215546Sopenharmony_cintq_emit_cf_list(struct v3d_compile *c, struct exec_list *list); 106bf215546Sopenharmony_ci 107bf215546Sopenharmony_cistatic void 108bf215546Sopenharmony_ciresize_qreg_array(struct v3d_compile *c, 109bf215546Sopenharmony_ci struct qreg **regs, 110bf215546Sopenharmony_ci uint32_t *size, 111bf215546Sopenharmony_ci uint32_t decl_size) 112bf215546Sopenharmony_ci{ 113bf215546Sopenharmony_ci if (*size >= decl_size) 114bf215546Sopenharmony_ci return; 115bf215546Sopenharmony_ci 116bf215546Sopenharmony_ci uint32_t old_size = *size; 117bf215546Sopenharmony_ci *size = MAX2(*size * 2, decl_size); 118bf215546Sopenharmony_ci *regs = reralloc(c, *regs, struct qreg, *size); 119bf215546Sopenharmony_ci if (!*regs) { 120bf215546Sopenharmony_ci fprintf(stderr, "Malloc failure\n"); 121bf215546Sopenharmony_ci abort(); 122bf215546Sopenharmony_ci } 123bf215546Sopenharmony_ci 124bf215546Sopenharmony_ci for (uint32_t i = old_size; i < *size; i++) 125bf215546Sopenharmony_ci (*regs)[i] = c->undef; 126bf215546Sopenharmony_ci} 127bf215546Sopenharmony_ci 128bf215546Sopenharmony_cistatic void 129bf215546Sopenharmony_ciresize_interp_array(struct v3d_compile *c, 130bf215546Sopenharmony_ci struct v3d_interp_input **regs, 131bf215546Sopenharmony_ci uint32_t *size, 132bf215546Sopenharmony_ci uint32_t decl_size) 133bf215546Sopenharmony_ci{ 134bf215546Sopenharmony_ci if (*size >= decl_size) 135bf215546Sopenharmony_ci return; 136bf215546Sopenharmony_ci 137bf215546Sopenharmony_ci uint32_t old_size = *size; 138bf215546Sopenharmony_ci *size = MAX2(*size * 2, decl_size); 139bf215546Sopenharmony_ci *regs = reralloc(c, *regs, struct v3d_interp_input, *size); 140bf215546Sopenharmony_ci if (!*regs) { 141bf215546Sopenharmony_ci fprintf(stderr, "Malloc failure\n"); 142bf215546Sopenharmony_ci abort(); 143bf215546Sopenharmony_ci } 144bf215546Sopenharmony_ci 145bf215546Sopenharmony_ci for (uint32_t i = old_size; i < *size; i++) { 146bf215546Sopenharmony_ci (*regs)[i].vp = c->undef; 147bf215546Sopenharmony_ci (*regs)[i].C = c->undef; 148bf215546Sopenharmony_ci } 149bf215546Sopenharmony_ci} 150bf215546Sopenharmony_ci 151bf215546Sopenharmony_civoid 152bf215546Sopenharmony_civir_emit_thrsw(struct v3d_compile *c) 153bf215546Sopenharmony_ci{ 154bf215546Sopenharmony_ci if (c->threads == 1) 155bf215546Sopenharmony_ci return; 156bf215546Sopenharmony_ci 157bf215546Sopenharmony_ci /* Always thread switch after each texture operation for now. 158bf215546Sopenharmony_ci * 159bf215546Sopenharmony_ci * We could do better by batching a bunch of texture fetches up and 160bf215546Sopenharmony_ci * then doing one thread switch and collecting all their results 161bf215546Sopenharmony_ci * afterward. 162bf215546Sopenharmony_ci */ 163bf215546Sopenharmony_ci c->last_thrsw = vir_NOP(c); 164bf215546Sopenharmony_ci c->last_thrsw->qpu.sig.thrsw = true; 165bf215546Sopenharmony_ci c->last_thrsw_at_top_level = !c->in_control_flow; 166bf215546Sopenharmony_ci 167bf215546Sopenharmony_ci /* We need to lock the scoreboard before any tlb acess happens. If this 168bf215546Sopenharmony_ci * thread switch comes after we have emitted a tlb load, then it means 169bf215546Sopenharmony_ci * that we can't lock on the last thread switch any more. 170bf215546Sopenharmony_ci */ 171bf215546Sopenharmony_ci if (c->emitted_tlb_load) 172bf215546Sopenharmony_ci c->lock_scoreboard_on_first_thrsw = true; 173bf215546Sopenharmony_ci} 174bf215546Sopenharmony_ci 175bf215546Sopenharmony_ciuint32_t 176bf215546Sopenharmony_civ3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src) 177bf215546Sopenharmony_ci{ 178bf215546Sopenharmony_ci if (nir_src_is_const(instr->src[src])) { 179bf215546Sopenharmony_ci int64_t add_val = nir_src_as_int(instr->src[src]); 180bf215546Sopenharmony_ci if (add_val == 1) 181bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_AND_READ_INC; 182bf215546Sopenharmony_ci else if (add_val == -1) 183bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_OR_READ_DEC; 184bf215546Sopenharmony_ci } 185bf215546Sopenharmony_ci 186bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH; 187bf215546Sopenharmony_ci} 188bf215546Sopenharmony_ci 189bf215546Sopenharmony_cistatic uint32_t 190bf215546Sopenharmony_civ3d_general_tmu_op(nir_intrinsic_instr *instr) 191bf215546Sopenharmony_ci{ 192bf215546Sopenharmony_ci switch (instr->intrinsic) { 193bf215546Sopenharmony_ci case nir_intrinsic_load_ssbo: 194bf215546Sopenharmony_ci case nir_intrinsic_load_ubo: 195bf215546Sopenharmony_ci case nir_intrinsic_load_uniform: 196bf215546Sopenharmony_ci case nir_intrinsic_load_shared: 197bf215546Sopenharmony_ci case nir_intrinsic_load_scratch: 198bf215546Sopenharmony_ci case nir_intrinsic_load_global_2x32: 199bf215546Sopenharmony_ci case nir_intrinsic_store_ssbo: 200bf215546Sopenharmony_ci case nir_intrinsic_store_shared: 201bf215546Sopenharmony_ci case nir_intrinsic_store_scratch: 202bf215546Sopenharmony_ci case nir_intrinsic_store_global_2x32: 203bf215546Sopenharmony_ci return V3D_TMU_OP_REGULAR; 204bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_add: 205bf215546Sopenharmony_ci return v3d_get_op_for_atomic_add(instr, 2); 206bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_add: 207bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_add_2x32: 208bf215546Sopenharmony_ci return v3d_get_op_for_atomic_add(instr, 1); 209bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_imin: 210bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_imin_2x32: 211bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_imin: 212bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_SMIN; 213bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_umin: 214bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_umin_2x32: 215bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_umin: 216bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; 217bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_imax: 218bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_imax_2x32: 219bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_imax: 220bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_SMAX; 221bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_umax: 222bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_umax_2x32: 223bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_umax: 224bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_UMAX; 225bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_and: 226bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_and_2x32: 227bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_and: 228bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_AND_READ_INC; 229bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_or: 230bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_or_2x32: 231bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_or: 232bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_OR_READ_DEC; 233bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_xor: 234bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_xor_2x32: 235bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_xor: 236bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_XOR_READ_NOT; 237bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_exchange: 238bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_exchange_2x32: 239bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_exchange: 240bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; 241bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_comp_swap: 242bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_comp_swap_2x32: 243bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_comp_swap: 244bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; 245bf215546Sopenharmony_ci default: 246bf215546Sopenharmony_ci unreachable("unknown intrinsic op"); 247bf215546Sopenharmony_ci } 248bf215546Sopenharmony_ci} 249bf215546Sopenharmony_ci 250bf215546Sopenharmony_ci/** 251bf215546Sopenharmony_ci * Checks if pipelining a new TMU operation requiring 'components' LDTMUs 252bf215546Sopenharmony_ci * would overflow the Output TMU fifo. 253bf215546Sopenharmony_ci * 254bf215546Sopenharmony_ci * It is not allowed to overflow the Output fifo, however, we can overflow 255bf215546Sopenharmony_ci * Input and Config fifos. Doing that makes the shader stall, but only for as 256bf215546Sopenharmony_ci * long as it needs to be able to continue so it is better for pipelining to 257bf215546Sopenharmony_ci * let the QPU stall on these if needed than trying to emit TMU flushes in the 258bf215546Sopenharmony_ci * driver. 259bf215546Sopenharmony_ci */ 260bf215546Sopenharmony_cibool 261bf215546Sopenharmony_cintq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components) 262bf215546Sopenharmony_ci{ 263bf215546Sopenharmony_ci if (c->tmu.flush_count >= MAX_TMU_QUEUE_SIZE) 264bf215546Sopenharmony_ci return true; 265bf215546Sopenharmony_ci 266bf215546Sopenharmony_ci return components > 0 && 267bf215546Sopenharmony_ci c->tmu.output_fifo_size + components > 16 / c->threads; 268bf215546Sopenharmony_ci} 269bf215546Sopenharmony_ci 270bf215546Sopenharmony_ci/** 271bf215546Sopenharmony_ci * Emits the thread switch and LDTMU/TMUWT for all outstanding TMU operations, 272bf215546Sopenharmony_ci * popping all TMU fifo entries. 273bf215546Sopenharmony_ci */ 274bf215546Sopenharmony_civoid 275bf215546Sopenharmony_cintq_flush_tmu(struct v3d_compile *c) 276bf215546Sopenharmony_ci{ 277bf215546Sopenharmony_ci if (c->tmu.flush_count == 0) 278bf215546Sopenharmony_ci return; 279bf215546Sopenharmony_ci 280bf215546Sopenharmony_ci vir_emit_thrsw(c); 281bf215546Sopenharmony_ci 282bf215546Sopenharmony_ci bool emitted_tmuwt = false; 283bf215546Sopenharmony_ci for (int i = 0; i < c->tmu.flush_count; i++) { 284bf215546Sopenharmony_ci if (c->tmu.flush[i].component_mask > 0) { 285bf215546Sopenharmony_ci nir_dest *dest = c->tmu.flush[i].dest; 286bf215546Sopenharmony_ci assert(dest); 287bf215546Sopenharmony_ci 288bf215546Sopenharmony_ci for (int j = 0; j < 4; j++) { 289bf215546Sopenharmony_ci if (c->tmu.flush[i].component_mask & (1 << j)) { 290bf215546Sopenharmony_ci ntq_store_dest(c, dest, j, 291bf215546Sopenharmony_ci vir_MOV(c, vir_LDTMU(c))); 292bf215546Sopenharmony_ci } 293bf215546Sopenharmony_ci } 294bf215546Sopenharmony_ci } else if (!emitted_tmuwt) { 295bf215546Sopenharmony_ci vir_TMUWT(c); 296bf215546Sopenharmony_ci emitted_tmuwt = true; 297bf215546Sopenharmony_ci } 298bf215546Sopenharmony_ci } 299bf215546Sopenharmony_ci 300bf215546Sopenharmony_ci c->tmu.output_fifo_size = 0; 301bf215546Sopenharmony_ci c->tmu.flush_count = 0; 302bf215546Sopenharmony_ci _mesa_set_clear(c->tmu.outstanding_regs, NULL); 303bf215546Sopenharmony_ci} 304bf215546Sopenharmony_ci 305bf215546Sopenharmony_ci/** 306bf215546Sopenharmony_ci * Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller 307bf215546Sopenharmony_ci * is reponsible for ensuring that doing this doesn't overflow the TMU fifos, 308bf215546Sopenharmony_ci * and more specifically, the output fifo, since that can't stall. 309bf215546Sopenharmony_ci */ 310bf215546Sopenharmony_civoid 311bf215546Sopenharmony_cintq_add_pending_tmu_flush(struct v3d_compile *c, 312bf215546Sopenharmony_ci nir_dest *dest, 313bf215546Sopenharmony_ci uint32_t component_mask) 314bf215546Sopenharmony_ci{ 315bf215546Sopenharmony_ci const uint32_t num_components = util_bitcount(component_mask); 316bf215546Sopenharmony_ci assert(!ntq_tmu_fifo_overflow(c, num_components)); 317bf215546Sopenharmony_ci 318bf215546Sopenharmony_ci if (num_components > 0) { 319bf215546Sopenharmony_ci c->tmu.output_fifo_size += num_components; 320bf215546Sopenharmony_ci if (!dest->is_ssa) 321bf215546Sopenharmony_ci _mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg); 322bf215546Sopenharmony_ci } 323bf215546Sopenharmony_ci 324bf215546Sopenharmony_ci c->tmu.flush[c->tmu.flush_count].dest = dest; 325bf215546Sopenharmony_ci c->tmu.flush[c->tmu.flush_count].component_mask = component_mask; 326bf215546Sopenharmony_ci c->tmu.flush_count++; 327bf215546Sopenharmony_ci 328bf215546Sopenharmony_ci if (c->disable_tmu_pipelining) 329bf215546Sopenharmony_ci ntq_flush_tmu(c); 330bf215546Sopenharmony_ci else if (c->tmu.flush_count > 1) 331bf215546Sopenharmony_ci c->pipelined_any_tmu = true; 332bf215546Sopenharmony_ci} 333bf215546Sopenharmony_ci 334bf215546Sopenharmony_cienum emit_mode { 335bf215546Sopenharmony_ci MODE_COUNT = 0, 336bf215546Sopenharmony_ci MODE_EMIT, 337bf215546Sopenharmony_ci MODE_LAST, 338bf215546Sopenharmony_ci}; 339bf215546Sopenharmony_ci 340bf215546Sopenharmony_ci/** 341bf215546Sopenharmony_ci * For a TMU general store instruction: 342bf215546Sopenharmony_ci * 343bf215546Sopenharmony_ci * In MODE_COUNT mode, records the number of TMU writes required and flushes 344bf215546Sopenharmony_ci * any outstanding TMU operations the instruction depends on, but it doesn't 345bf215546Sopenharmony_ci * emit any actual register writes. 346bf215546Sopenharmony_ci * 347bf215546Sopenharmony_ci * In MODE_EMIT mode, emits the data register writes required by the 348bf215546Sopenharmony_ci * instruction. 349bf215546Sopenharmony_ci */ 350bf215546Sopenharmony_cistatic void 351bf215546Sopenharmony_ciemit_tmu_general_store_writes(struct v3d_compile *c, 352bf215546Sopenharmony_ci enum emit_mode mode, 353bf215546Sopenharmony_ci nir_intrinsic_instr *instr, 354bf215546Sopenharmony_ci uint32_t base_const_offset, 355bf215546Sopenharmony_ci uint32_t *writemask, 356bf215546Sopenharmony_ci uint32_t *const_offset, 357bf215546Sopenharmony_ci uint32_t *type_size, 358bf215546Sopenharmony_ci uint32_t *tmu_writes) 359bf215546Sopenharmony_ci{ 360bf215546Sopenharmony_ci struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD); 361bf215546Sopenharmony_ci 362bf215546Sopenharmony_ci /* Find the first set of consecutive components that 363bf215546Sopenharmony_ci * are enabled in the writemask and emit the TMUD 364bf215546Sopenharmony_ci * instructions for them. 365bf215546Sopenharmony_ci */ 366bf215546Sopenharmony_ci assert(*writemask != 0); 367bf215546Sopenharmony_ci uint32_t first_component = ffs(*writemask) - 1; 368bf215546Sopenharmony_ci uint32_t last_component = first_component; 369bf215546Sopenharmony_ci while (*writemask & BITFIELD_BIT(last_component + 1)) 370bf215546Sopenharmony_ci last_component++; 371bf215546Sopenharmony_ci 372bf215546Sopenharmony_ci assert(first_component <= last_component && 373bf215546Sopenharmony_ci last_component < instr->num_components); 374bf215546Sopenharmony_ci 375bf215546Sopenharmony_ci for (int i = first_component; i <= last_component; i++) { 376bf215546Sopenharmony_ci struct qreg data = ntq_get_src(c, instr->src[0], i); 377bf215546Sopenharmony_ci if (mode == MODE_COUNT) 378bf215546Sopenharmony_ci (*tmu_writes)++; 379bf215546Sopenharmony_ci else 380bf215546Sopenharmony_ci vir_MOV_dest(c, tmud, data); 381bf215546Sopenharmony_ci } 382bf215546Sopenharmony_ci 383bf215546Sopenharmony_ci if (mode == MODE_EMIT) { 384bf215546Sopenharmony_ci /* Update the offset for the TMU write based on the 385bf215546Sopenharmony_ci * the first component we are writing. 386bf215546Sopenharmony_ci */ 387bf215546Sopenharmony_ci *type_size = nir_src_bit_size(instr->src[0]) / 8; 388bf215546Sopenharmony_ci *const_offset = 389bf215546Sopenharmony_ci base_const_offset + first_component * (*type_size); 390bf215546Sopenharmony_ci 391bf215546Sopenharmony_ci /* Clear these components from the writemask */ 392bf215546Sopenharmony_ci uint32_t written_mask = 393bf215546Sopenharmony_ci BITFIELD_RANGE(first_component, *tmu_writes); 394bf215546Sopenharmony_ci (*writemask) &= ~written_mask; 395bf215546Sopenharmony_ci } 396bf215546Sopenharmony_ci} 397bf215546Sopenharmony_ci 398bf215546Sopenharmony_ci/** 399bf215546Sopenharmony_ci * For a TMU general atomic instruction: 400bf215546Sopenharmony_ci * 401bf215546Sopenharmony_ci * In MODE_COUNT mode, records the number of TMU writes required and flushes 402bf215546Sopenharmony_ci * any outstanding TMU operations the instruction depends on, but it doesn't 403bf215546Sopenharmony_ci * emit any actual register writes. 404bf215546Sopenharmony_ci * 405bf215546Sopenharmony_ci * In MODE_EMIT mode, emits the data register writes required by the 406bf215546Sopenharmony_ci * instruction. 407bf215546Sopenharmony_ci */ 408bf215546Sopenharmony_cistatic void 409bf215546Sopenharmony_ciemit_tmu_general_atomic_writes(struct v3d_compile *c, 410bf215546Sopenharmony_ci enum emit_mode mode, 411bf215546Sopenharmony_ci nir_intrinsic_instr *instr, 412bf215546Sopenharmony_ci uint32_t tmu_op, 413bf215546Sopenharmony_ci bool has_index, 414bf215546Sopenharmony_ci uint32_t *tmu_writes) 415bf215546Sopenharmony_ci{ 416bf215546Sopenharmony_ci struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD); 417bf215546Sopenharmony_ci 418bf215546Sopenharmony_ci struct qreg data = ntq_get_src(c, instr->src[1 + has_index], 0); 419bf215546Sopenharmony_ci if (mode == MODE_COUNT) 420bf215546Sopenharmony_ci (*tmu_writes)++; 421bf215546Sopenharmony_ci else 422bf215546Sopenharmony_ci vir_MOV_dest(c, tmud, data); 423bf215546Sopenharmony_ci 424bf215546Sopenharmony_ci if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) { 425bf215546Sopenharmony_ci data = ntq_get_src(c, instr->src[2 + has_index], 0); 426bf215546Sopenharmony_ci if (mode == MODE_COUNT) 427bf215546Sopenharmony_ci (*tmu_writes)++; 428bf215546Sopenharmony_ci else 429bf215546Sopenharmony_ci vir_MOV_dest(c, tmud, data); 430bf215546Sopenharmony_ci } 431bf215546Sopenharmony_ci} 432bf215546Sopenharmony_ci 433bf215546Sopenharmony_ci/** 434bf215546Sopenharmony_ci * For any TMU general instruction: 435bf215546Sopenharmony_ci * 436bf215546Sopenharmony_ci * In MODE_COUNT mode, records the number of TMU writes required to emit the 437bf215546Sopenharmony_ci * address parameter and flushes any outstanding TMU operations the instruction 438bf215546Sopenharmony_ci * depends on, but it doesn't emit any actual register writes. 439bf215546Sopenharmony_ci * 440bf215546Sopenharmony_ci * In MODE_EMIT mode, emits register writes required to emit the address. 441bf215546Sopenharmony_ci */ 442bf215546Sopenharmony_cistatic void 443bf215546Sopenharmony_ciemit_tmu_general_address_write(struct v3d_compile *c, 444bf215546Sopenharmony_ci enum emit_mode mode, 445bf215546Sopenharmony_ci nir_intrinsic_instr *instr, 446bf215546Sopenharmony_ci uint32_t config, 447bf215546Sopenharmony_ci bool dynamic_src, 448bf215546Sopenharmony_ci int offset_src, 449bf215546Sopenharmony_ci struct qreg base_offset, 450bf215546Sopenharmony_ci uint32_t const_offset, 451bf215546Sopenharmony_ci uint32_t *tmu_writes) 452bf215546Sopenharmony_ci{ 453bf215546Sopenharmony_ci if (mode == MODE_COUNT) { 454bf215546Sopenharmony_ci (*tmu_writes)++; 455bf215546Sopenharmony_ci if (dynamic_src) 456bf215546Sopenharmony_ci ntq_get_src(c, instr->src[offset_src], 0); 457bf215546Sopenharmony_ci return; 458bf215546Sopenharmony_ci } 459bf215546Sopenharmony_ci 460bf215546Sopenharmony_ci if (vir_in_nonuniform_control_flow(c)) { 461bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 462bf215546Sopenharmony_ci V3D_QPU_PF_PUSHZ); 463bf215546Sopenharmony_ci } 464bf215546Sopenharmony_ci 465bf215546Sopenharmony_ci struct qreg tmua; 466bf215546Sopenharmony_ci if (config == ~0) 467bf215546Sopenharmony_ci tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA); 468bf215546Sopenharmony_ci else 469bf215546Sopenharmony_ci tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU); 470bf215546Sopenharmony_ci 471bf215546Sopenharmony_ci struct qinst *tmu; 472bf215546Sopenharmony_ci if (dynamic_src) { 473bf215546Sopenharmony_ci struct qreg offset = base_offset; 474bf215546Sopenharmony_ci if (const_offset != 0) { 475bf215546Sopenharmony_ci offset = vir_ADD(c, offset, 476bf215546Sopenharmony_ci vir_uniform_ui(c, const_offset)); 477bf215546Sopenharmony_ci } 478bf215546Sopenharmony_ci struct qreg data = ntq_get_src(c, instr->src[offset_src], 0); 479bf215546Sopenharmony_ci tmu = vir_ADD_dest(c, tmua, offset, data); 480bf215546Sopenharmony_ci } else { 481bf215546Sopenharmony_ci if (const_offset != 0) { 482bf215546Sopenharmony_ci tmu = vir_ADD_dest(c, tmua, base_offset, 483bf215546Sopenharmony_ci vir_uniform_ui(c, const_offset)); 484bf215546Sopenharmony_ci } else { 485bf215546Sopenharmony_ci tmu = vir_MOV_dest(c, tmua, base_offset); 486bf215546Sopenharmony_ci } 487bf215546Sopenharmony_ci } 488bf215546Sopenharmony_ci 489bf215546Sopenharmony_ci if (config != ~0) { 490bf215546Sopenharmony_ci tmu->uniform = 491bf215546Sopenharmony_ci vir_get_uniform_index(c, QUNIFORM_CONSTANT, config); 492bf215546Sopenharmony_ci } 493bf215546Sopenharmony_ci 494bf215546Sopenharmony_ci if (vir_in_nonuniform_control_flow(c)) 495bf215546Sopenharmony_ci vir_set_cond(tmu, V3D_QPU_COND_IFA); 496bf215546Sopenharmony_ci} 497bf215546Sopenharmony_ci 498bf215546Sopenharmony_ci/** 499bf215546Sopenharmony_ci * Implements indirect uniform loads and SSBO accesses through the TMU general 500bf215546Sopenharmony_ci * memory access interface. 501bf215546Sopenharmony_ci */ 502bf215546Sopenharmony_cistatic void 503bf215546Sopenharmony_cintq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, 504bf215546Sopenharmony_ci bool is_shared_or_scratch, bool is_global) 505bf215546Sopenharmony_ci{ 506bf215546Sopenharmony_ci uint32_t tmu_op = v3d_general_tmu_op(instr); 507bf215546Sopenharmony_ci 508bf215546Sopenharmony_ci /* If we were able to replace atomic_add for an inc/dec, then we 509bf215546Sopenharmony_ci * need/can to do things slightly different, like not loading the 510bf215546Sopenharmony_ci * amount to add/sub, as that is implicit. 511bf215546Sopenharmony_ci */ 512bf215546Sopenharmony_ci bool atomic_add_replaced = 513bf215546Sopenharmony_ci ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add || 514bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_shared_atomic_add || 515bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_global_atomic_add_2x32) && 516bf215546Sopenharmony_ci (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC || 517bf215546Sopenharmony_ci tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC)); 518bf215546Sopenharmony_ci 519bf215546Sopenharmony_ci bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo || 520bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_store_scratch || 521bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_store_shared || 522bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_store_global_2x32); 523bf215546Sopenharmony_ci 524bf215546Sopenharmony_ci bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform || 525bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_load_ubo || 526bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_load_ssbo || 527bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_load_scratch || 528bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_load_shared || 529bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_load_global_2x32); 530bf215546Sopenharmony_ci 531bf215546Sopenharmony_ci if (!is_load) 532bf215546Sopenharmony_ci c->tmu_dirty_rcl = true; 533bf215546Sopenharmony_ci 534bf215546Sopenharmony_ci if (is_global) 535bf215546Sopenharmony_ci c->has_global_address = true; 536bf215546Sopenharmony_ci 537bf215546Sopenharmony_ci bool has_index = !is_shared_or_scratch && !is_global; 538bf215546Sopenharmony_ci 539bf215546Sopenharmony_ci int offset_src; 540bf215546Sopenharmony_ci if (instr->intrinsic == nir_intrinsic_load_uniform) { 541bf215546Sopenharmony_ci offset_src = 0; 542bf215546Sopenharmony_ci } else if (instr->intrinsic == nir_intrinsic_load_ssbo || 543bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_load_ubo || 544bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_load_scratch || 545bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_load_shared || 546bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_load_global_2x32 || 547bf215546Sopenharmony_ci atomic_add_replaced) { 548bf215546Sopenharmony_ci offset_src = 0 + has_index; 549bf215546Sopenharmony_ci } else if (is_store) { 550bf215546Sopenharmony_ci offset_src = 1 + has_index; 551bf215546Sopenharmony_ci } else { 552bf215546Sopenharmony_ci offset_src = 0 + has_index; 553bf215546Sopenharmony_ci } 554bf215546Sopenharmony_ci 555bf215546Sopenharmony_ci bool dynamic_src = !nir_src_is_const(instr->src[offset_src]); 556bf215546Sopenharmony_ci uint32_t const_offset = 0; 557bf215546Sopenharmony_ci if (!dynamic_src) 558bf215546Sopenharmony_ci const_offset = nir_src_as_uint(instr->src[offset_src]); 559bf215546Sopenharmony_ci 560bf215546Sopenharmony_ci struct qreg base_offset; 561bf215546Sopenharmony_ci if (instr->intrinsic == nir_intrinsic_load_uniform) { 562bf215546Sopenharmony_ci const_offset += nir_intrinsic_base(instr); 563bf215546Sopenharmony_ci base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR, 564bf215546Sopenharmony_ci v3d_unit_data_create(0, const_offset)); 565bf215546Sopenharmony_ci const_offset = 0; 566bf215546Sopenharmony_ci } else if (instr->intrinsic == nir_intrinsic_load_ubo) { 567bf215546Sopenharmony_ci uint32_t index = nir_src_as_uint(instr->src[0]); 568bf215546Sopenharmony_ci /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index 569bf215546Sopenharmony_ci * shifted up by 1 (0 is gallium's constant buffer 0). 570bf215546Sopenharmony_ci */ 571bf215546Sopenharmony_ci if (c->key->environment == V3D_ENVIRONMENT_OPENGL) 572bf215546Sopenharmony_ci index++; 573bf215546Sopenharmony_ci 574bf215546Sopenharmony_ci base_offset = 575bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_UBO_ADDR, 576bf215546Sopenharmony_ci v3d_unit_data_create(index, const_offset)); 577bf215546Sopenharmony_ci const_offset = 0; 578bf215546Sopenharmony_ci } else if (is_shared_or_scratch) { 579bf215546Sopenharmony_ci /* Shared and scratch variables have no buffer index, and all 580bf215546Sopenharmony_ci * start from a common base that we set up at the start of 581bf215546Sopenharmony_ci * dispatch. 582bf215546Sopenharmony_ci */ 583bf215546Sopenharmony_ci if (instr->intrinsic == nir_intrinsic_load_scratch || 584bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_store_scratch) { 585bf215546Sopenharmony_ci base_offset = c->spill_base; 586bf215546Sopenharmony_ci } else { 587bf215546Sopenharmony_ci base_offset = c->cs_shared_offset; 588bf215546Sopenharmony_ci const_offset += nir_intrinsic_base(instr); 589bf215546Sopenharmony_ci } 590bf215546Sopenharmony_ci } else if (is_global) { 591bf215546Sopenharmony_ci /* Global load/store intrinsics use gloal addresses, so the 592bf215546Sopenharmony_ci * offset is the target address and we don't need to add it 593bf215546Sopenharmony_ci * to a base offset. 594bf215546Sopenharmony_ci */ 595bf215546Sopenharmony_ci base_offset = vir_uniform_ui(c, 0); 596bf215546Sopenharmony_ci } else { 597bf215546Sopenharmony_ci base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET, 598bf215546Sopenharmony_ci nir_src_as_uint(instr->src[is_store ? 599bf215546Sopenharmony_ci 1 : 0])); 600bf215546Sopenharmony_ci } 601bf215546Sopenharmony_ci 602bf215546Sopenharmony_ci /* We are ready to emit TMU register writes now, but before we actually 603bf215546Sopenharmony_ci * emit them we need to flush outstanding TMU operations if any of our 604bf215546Sopenharmony_ci * writes reads from the result of an outstanding TMU operation before 605bf215546Sopenharmony_ci * we start the TMU sequence for this operation, since otherwise the 606bf215546Sopenharmony_ci * flush could happen in the middle of the TMU sequence we are about to 607bf215546Sopenharmony_ci * emit, which is illegal. To do this we run this logic twice, the 608bf215546Sopenharmony_ci * first time it will count required register writes and flush pending 609bf215546Sopenharmony_ci * TMU requests if necessary due to a dependency, and the second one 610bf215546Sopenharmony_ci * will emit the actual TMU writes. 611bf215546Sopenharmony_ci */ 612bf215546Sopenharmony_ci const uint32_t dest_components = nir_intrinsic_dest_components(instr); 613bf215546Sopenharmony_ci uint32_t base_const_offset = const_offset; 614bf215546Sopenharmony_ci uint32_t writemask = is_store ? nir_intrinsic_write_mask(instr) : 0; 615bf215546Sopenharmony_ci uint32_t tmu_writes = 0; 616bf215546Sopenharmony_ci for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) { 617bf215546Sopenharmony_ci assert(mode == MODE_COUNT || tmu_writes > 0); 618bf215546Sopenharmony_ci 619bf215546Sopenharmony_ci uint32_t type_size = 4; 620bf215546Sopenharmony_ci 621bf215546Sopenharmony_ci if (is_store) { 622bf215546Sopenharmony_ci emit_tmu_general_store_writes(c, mode, instr, 623bf215546Sopenharmony_ci base_const_offset, 624bf215546Sopenharmony_ci &writemask, 625bf215546Sopenharmony_ci &const_offset, 626bf215546Sopenharmony_ci &type_size, 627bf215546Sopenharmony_ci &tmu_writes); 628bf215546Sopenharmony_ci } else if (!is_load && !atomic_add_replaced) { 629bf215546Sopenharmony_ci emit_tmu_general_atomic_writes(c, mode, instr, 630bf215546Sopenharmony_ci tmu_op, has_index, 631bf215546Sopenharmony_ci &tmu_writes); 632bf215546Sopenharmony_ci } else if (is_load) { 633bf215546Sopenharmony_ci type_size = nir_dest_bit_size(instr->dest) / 8; 634bf215546Sopenharmony_ci } 635bf215546Sopenharmony_ci 636bf215546Sopenharmony_ci /* For atomics we use 32bit except for CMPXCHG, that we need 637bf215546Sopenharmony_ci * to use VEC2. For the rest of the cases we use the number of 638bf215546Sopenharmony_ci * tmud writes we did to decide the type. For cache operations 639bf215546Sopenharmony_ci * the type is ignored. 640bf215546Sopenharmony_ci */ 641bf215546Sopenharmony_ci uint32_t config = 0; 642bf215546Sopenharmony_ci if (mode == MODE_EMIT) { 643bf215546Sopenharmony_ci uint32_t num_components; 644bf215546Sopenharmony_ci if (is_load || atomic_add_replaced) { 645bf215546Sopenharmony_ci num_components = instr->num_components; 646bf215546Sopenharmony_ci } else { 647bf215546Sopenharmony_ci assert(tmu_writes > 0); 648bf215546Sopenharmony_ci num_components = tmu_writes - 1; 649bf215546Sopenharmony_ci } 650bf215546Sopenharmony_ci bool is_atomic = 651bf215546Sopenharmony_ci v3d_tmu_get_type_from_op(tmu_op, !is_load) == 652bf215546Sopenharmony_ci V3D_TMU_OP_TYPE_ATOMIC; 653bf215546Sopenharmony_ci 654bf215546Sopenharmony_ci uint32_t perquad = 655bf215546Sopenharmony_ci is_load && !vir_in_nonuniform_control_flow(c) 656bf215546Sopenharmony_ci ? GENERAL_TMU_LOOKUP_PER_QUAD 657bf215546Sopenharmony_ci : GENERAL_TMU_LOOKUP_PER_PIXEL; 658bf215546Sopenharmony_ci config = 0xffffff00 | tmu_op << 3 | perquad; 659bf215546Sopenharmony_ci 660bf215546Sopenharmony_ci if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) { 661bf215546Sopenharmony_ci config |= GENERAL_TMU_LOOKUP_TYPE_VEC2; 662bf215546Sopenharmony_ci } else if (is_atomic || num_components == 1) { 663bf215546Sopenharmony_ci switch (type_size) { 664bf215546Sopenharmony_ci case 4: 665bf215546Sopenharmony_ci config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; 666bf215546Sopenharmony_ci break; 667bf215546Sopenharmony_ci case 2: 668bf215546Sopenharmony_ci config |= GENERAL_TMU_LOOKUP_TYPE_16BIT_UI; 669bf215546Sopenharmony_ci break; 670bf215546Sopenharmony_ci case 1: 671bf215546Sopenharmony_ci config |= GENERAL_TMU_LOOKUP_TYPE_8BIT_UI; 672bf215546Sopenharmony_ci break; 673bf215546Sopenharmony_ci default: 674bf215546Sopenharmony_ci unreachable("Unsupported bitsize"); 675bf215546Sopenharmony_ci } 676bf215546Sopenharmony_ci } else { 677bf215546Sopenharmony_ci assert(type_size == 4); 678bf215546Sopenharmony_ci config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + 679bf215546Sopenharmony_ci num_components - 2; 680bf215546Sopenharmony_ci } 681bf215546Sopenharmony_ci } 682bf215546Sopenharmony_ci 683bf215546Sopenharmony_ci emit_tmu_general_address_write(c, mode, instr, config, 684bf215546Sopenharmony_ci dynamic_src, offset_src, 685bf215546Sopenharmony_ci base_offset, const_offset, 686bf215546Sopenharmony_ci &tmu_writes); 687bf215546Sopenharmony_ci 688bf215546Sopenharmony_ci assert(tmu_writes > 0); 689bf215546Sopenharmony_ci if (mode == MODE_COUNT) { 690bf215546Sopenharmony_ci /* Make sure we won't exceed the 16-entry TMU 691bf215546Sopenharmony_ci * fifo if each thread is storing at the same 692bf215546Sopenharmony_ci * time. 693bf215546Sopenharmony_ci */ 694bf215546Sopenharmony_ci while (tmu_writes > 16 / c->threads) 695bf215546Sopenharmony_ci c->threads /= 2; 696bf215546Sopenharmony_ci 697bf215546Sopenharmony_ci /* If pipelining this TMU operation would 698bf215546Sopenharmony_ci * overflow TMU fifos, we need to flush. 699bf215546Sopenharmony_ci */ 700bf215546Sopenharmony_ci if (ntq_tmu_fifo_overflow(c, dest_components)) 701bf215546Sopenharmony_ci ntq_flush_tmu(c); 702bf215546Sopenharmony_ci } else { 703bf215546Sopenharmony_ci /* Delay emission of the thread switch and 704bf215546Sopenharmony_ci * LDTMU/TMUWT until we really need to do it to 705bf215546Sopenharmony_ci * improve pipelining. 706bf215546Sopenharmony_ci */ 707bf215546Sopenharmony_ci const uint32_t component_mask = 708bf215546Sopenharmony_ci (1 << dest_components) - 1; 709bf215546Sopenharmony_ci ntq_add_pending_tmu_flush(c, &instr->dest, 710bf215546Sopenharmony_ci component_mask); 711bf215546Sopenharmony_ci } 712bf215546Sopenharmony_ci } 713bf215546Sopenharmony_ci 714bf215546Sopenharmony_ci /* nir_lower_wrmasks should've ensured that any writemask on a store 715bf215546Sopenharmony_ci * operation only has consecutive bits set, in which case we should've 716bf215546Sopenharmony_ci * processed the full writemask above. 717bf215546Sopenharmony_ci */ 718bf215546Sopenharmony_ci assert(writemask == 0); 719bf215546Sopenharmony_ci} 720bf215546Sopenharmony_ci 721bf215546Sopenharmony_cistatic struct qreg * 722bf215546Sopenharmony_cintq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def) 723bf215546Sopenharmony_ci{ 724bf215546Sopenharmony_ci struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, 725bf215546Sopenharmony_ci def->num_components); 726bf215546Sopenharmony_ci _mesa_hash_table_insert(c->def_ht, def, qregs); 727bf215546Sopenharmony_ci return qregs; 728bf215546Sopenharmony_ci} 729bf215546Sopenharmony_ci 730bf215546Sopenharmony_cistatic bool 731bf215546Sopenharmony_ciis_ld_signal(const struct v3d_qpu_sig *sig) 732bf215546Sopenharmony_ci{ 733bf215546Sopenharmony_ci return (sig->ldunif || 734bf215546Sopenharmony_ci sig->ldunifa || 735bf215546Sopenharmony_ci sig->ldunifrf || 736bf215546Sopenharmony_ci sig->ldunifarf || 737bf215546Sopenharmony_ci sig->ldtmu || 738bf215546Sopenharmony_ci sig->ldvary || 739bf215546Sopenharmony_ci sig->ldvpm || 740bf215546Sopenharmony_ci sig->ldtlb || 741bf215546Sopenharmony_ci sig->ldtlbu); 742bf215546Sopenharmony_ci} 743bf215546Sopenharmony_ci 744bf215546Sopenharmony_cistatic inline bool 745bf215546Sopenharmony_ciis_ldunif_signal(const struct v3d_qpu_sig *sig) 746bf215546Sopenharmony_ci{ 747bf215546Sopenharmony_ci return sig->ldunif || sig->ldunifrf; 748bf215546Sopenharmony_ci} 749bf215546Sopenharmony_ci 750bf215546Sopenharmony_ci/** 751bf215546Sopenharmony_ci * This function is responsible for getting VIR results into the associated 752bf215546Sopenharmony_ci * storage for a NIR instruction. 753bf215546Sopenharmony_ci * 754bf215546Sopenharmony_ci * If it's a NIR SSA def, then we just set the associated hash table entry to 755bf215546Sopenharmony_ci * the new result. 756bf215546Sopenharmony_ci * 757bf215546Sopenharmony_ci * If it's a NIR reg, then we need to update the existing qreg assigned to the 758bf215546Sopenharmony_ci * NIR destination with the incoming value. To do that without introducing 759bf215546Sopenharmony_ci * new MOVs, we require that the incoming qreg either be a uniform, or be 760bf215546Sopenharmony_ci * SSA-defined by the previous VIR instruction in the block and rewritable by 761bf215546Sopenharmony_ci * this function. That lets us sneak ahead and insert the SF flag beforehand 762bf215546Sopenharmony_ci * (knowing that the previous instruction doesn't depend on flags) and rewrite 763bf215546Sopenharmony_ci * its destination to be the NIR reg's destination 764bf215546Sopenharmony_ci */ 765bf215546Sopenharmony_civoid 766bf215546Sopenharmony_cintq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, 767bf215546Sopenharmony_ci struct qreg result) 768bf215546Sopenharmony_ci{ 769bf215546Sopenharmony_ci struct qinst *last_inst = NULL; 770bf215546Sopenharmony_ci if (!list_is_empty(&c->cur_block->instructions)) 771bf215546Sopenharmony_ci last_inst = (struct qinst *)c->cur_block->instructions.prev; 772bf215546Sopenharmony_ci 773bf215546Sopenharmony_ci bool is_reused_uniform = 774bf215546Sopenharmony_ci is_ldunif_signal(&c->defs[result.index]->qpu.sig) && 775bf215546Sopenharmony_ci last_inst != c->defs[result.index]; 776bf215546Sopenharmony_ci 777bf215546Sopenharmony_ci assert(result.file == QFILE_TEMP && last_inst && 778bf215546Sopenharmony_ci (last_inst == c->defs[result.index] || is_reused_uniform)); 779bf215546Sopenharmony_ci 780bf215546Sopenharmony_ci if (dest->is_ssa) { 781bf215546Sopenharmony_ci assert(chan < dest->ssa.num_components); 782bf215546Sopenharmony_ci 783bf215546Sopenharmony_ci struct qreg *qregs; 784bf215546Sopenharmony_ci struct hash_entry *entry = 785bf215546Sopenharmony_ci _mesa_hash_table_search(c->def_ht, &dest->ssa); 786bf215546Sopenharmony_ci 787bf215546Sopenharmony_ci if (entry) 788bf215546Sopenharmony_ci qregs = entry->data; 789bf215546Sopenharmony_ci else 790bf215546Sopenharmony_ci qregs = ntq_init_ssa_def(c, &dest->ssa); 791bf215546Sopenharmony_ci 792bf215546Sopenharmony_ci qregs[chan] = result; 793bf215546Sopenharmony_ci } else { 794bf215546Sopenharmony_ci nir_register *reg = dest->reg.reg; 795bf215546Sopenharmony_ci assert(dest->reg.base_offset == 0); 796bf215546Sopenharmony_ci assert(reg->num_array_elems == 0); 797bf215546Sopenharmony_ci struct hash_entry *entry = 798bf215546Sopenharmony_ci _mesa_hash_table_search(c->def_ht, reg); 799bf215546Sopenharmony_ci struct qreg *qregs = entry->data; 800bf215546Sopenharmony_ci 801bf215546Sopenharmony_ci /* If the previous instruction can't be predicated for 802bf215546Sopenharmony_ci * the store into the nir_register, then emit a MOV 803bf215546Sopenharmony_ci * that can be. 804bf215546Sopenharmony_ci */ 805bf215546Sopenharmony_ci if (is_reused_uniform || 806bf215546Sopenharmony_ci (vir_in_nonuniform_control_flow(c) && 807bf215546Sopenharmony_ci is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig))) { 808bf215546Sopenharmony_ci result = vir_MOV(c, result); 809bf215546Sopenharmony_ci last_inst = c->defs[result.index]; 810bf215546Sopenharmony_ci } 811bf215546Sopenharmony_ci 812bf215546Sopenharmony_ci /* We know they're both temps, so just rewrite index. */ 813bf215546Sopenharmony_ci c->defs[last_inst->dst.index] = NULL; 814bf215546Sopenharmony_ci last_inst->dst.index = qregs[chan].index; 815bf215546Sopenharmony_ci 816bf215546Sopenharmony_ci /* If we're in control flow, then make this update of the reg 817bf215546Sopenharmony_ci * conditional on the execution mask. 818bf215546Sopenharmony_ci */ 819bf215546Sopenharmony_ci if (vir_in_nonuniform_control_flow(c)) { 820bf215546Sopenharmony_ci last_inst->dst.index = qregs[chan].index; 821bf215546Sopenharmony_ci 822bf215546Sopenharmony_ci /* Set the flags to the current exec mask. 823bf215546Sopenharmony_ci */ 824bf215546Sopenharmony_ci c->cursor = vir_before_inst(last_inst); 825bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 826bf215546Sopenharmony_ci V3D_QPU_PF_PUSHZ); 827bf215546Sopenharmony_ci c->cursor = vir_after_inst(last_inst); 828bf215546Sopenharmony_ci 829bf215546Sopenharmony_ci vir_set_cond(last_inst, V3D_QPU_COND_IFA); 830bf215546Sopenharmony_ci } 831bf215546Sopenharmony_ci } 832bf215546Sopenharmony_ci} 833bf215546Sopenharmony_ci 834bf215546Sopenharmony_ci/** 835bf215546Sopenharmony_ci * This looks up the qreg associated with a particular ssa/reg used as a source 836bf215546Sopenharmony_ci * in any instruction. 837bf215546Sopenharmony_ci * 838bf215546Sopenharmony_ci * It is expected that the definition for any NIR value read as a source has 839bf215546Sopenharmony_ci * been emitted by a previous instruction, however, in the case of TMU 840bf215546Sopenharmony_ci * operations we may have postponed emission of the thread switch and LDTMUs 841bf215546Sopenharmony_ci * required to read the TMU results until the results are actually used to 842bf215546Sopenharmony_ci * improve pipelining, which then would lead to us not finding them here 843bf215546Sopenharmony_ci * (for SSA defs) or finding them in the list of registers awaiting a TMU flush 844bf215546Sopenharmony_ci * (for registers), meaning that we need to flush outstanding TMU operations 845bf215546Sopenharmony_ci * to read the correct value. 846bf215546Sopenharmony_ci */ 847bf215546Sopenharmony_cistruct qreg 848bf215546Sopenharmony_cintq_get_src(struct v3d_compile *c, nir_src src, int i) 849bf215546Sopenharmony_ci{ 850bf215546Sopenharmony_ci struct hash_entry *entry; 851bf215546Sopenharmony_ci if (src.is_ssa) { 852bf215546Sopenharmony_ci assert(i < src.ssa->num_components); 853bf215546Sopenharmony_ci 854bf215546Sopenharmony_ci entry = _mesa_hash_table_search(c->def_ht, src.ssa); 855bf215546Sopenharmony_ci if (!entry) { 856bf215546Sopenharmony_ci ntq_flush_tmu(c); 857bf215546Sopenharmony_ci entry = _mesa_hash_table_search(c->def_ht, src.ssa); 858bf215546Sopenharmony_ci } 859bf215546Sopenharmony_ci } else { 860bf215546Sopenharmony_ci nir_register *reg = src.reg.reg; 861bf215546Sopenharmony_ci assert(reg->num_array_elems == 0); 862bf215546Sopenharmony_ci assert(src.reg.base_offset == 0); 863bf215546Sopenharmony_ci assert(i < reg->num_components); 864bf215546Sopenharmony_ci 865bf215546Sopenharmony_ci if (_mesa_set_search(c->tmu.outstanding_regs, reg)) 866bf215546Sopenharmony_ci ntq_flush_tmu(c); 867bf215546Sopenharmony_ci entry = _mesa_hash_table_search(c->def_ht, reg); 868bf215546Sopenharmony_ci } 869bf215546Sopenharmony_ci assert(entry); 870bf215546Sopenharmony_ci 871bf215546Sopenharmony_ci struct qreg *qregs = entry->data; 872bf215546Sopenharmony_ci return qregs[i]; 873bf215546Sopenharmony_ci} 874bf215546Sopenharmony_ci 875bf215546Sopenharmony_cistatic struct qreg 876bf215546Sopenharmony_cintq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr, 877bf215546Sopenharmony_ci unsigned src) 878bf215546Sopenharmony_ci{ 879bf215546Sopenharmony_ci assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); 880bf215546Sopenharmony_ci unsigned chan = ffs(instr->dest.write_mask) - 1; 881bf215546Sopenharmony_ci struct qreg r = ntq_get_src(c, instr->src[src].src, 882bf215546Sopenharmony_ci instr->src[src].swizzle[chan]); 883bf215546Sopenharmony_ci 884bf215546Sopenharmony_ci assert(!instr->src[src].abs); 885bf215546Sopenharmony_ci assert(!instr->src[src].negate); 886bf215546Sopenharmony_ci 887bf215546Sopenharmony_ci return r; 888bf215546Sopenharmony_ci}; 889bf215546Sopenharmony_ci 890bf215546Sopenharmony_cistatic struct qreg 891bf215546Sopenharmony_cintq_minify(struct v3d_compile *c, struct qreg size, struct qreg level) 892bf215546Sopenharmony_ci{ 893bf215546Sopenharmony_ci return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1)); 894bf215546Sopenharmony_ci} 895bf215546Sopenharmony_ci 896bf215546Sopenharmony_cistatic void 897bf215546Sopenharmony_cintq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr) 898bf215546Sopenharmony_ci{ 899bf215546Sopenharmony_ci unsigned unit = instr->texture_index; 900bf215546Sopenharmony_ci int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod); 901bf215546Sopenharmony_ci int dest_size = nir_tex_instr_dest_size(instr); 902bf215546Sopenharmony_ci 903bf215546Sopenharmony_ci struct qreg lod = c->undef; 904bf215546Sopenharmony_ci if (lod_index != -1) 905bf215546Sopenharmony_ci lod = ntq_get_src(c, instr->src[lod_index].src, 0); 906bf215546Sopenharmony_ci 907bf215546Sopenharmony_ci for (int i = 0; i < dest_size; i++) { 908bf215546Sopenharmony_ci assert(i < 3); 909bf215546Sopenharmony_ci enum quniform_contents contents; 910bf215546Sopenharmony_ci 911bf215546Sopenharmony_ci if (instr->is_array && i == dest_size - 1) 912bf215546Sopenharmony_ci contents = QUNIFORM_TEXTURE_ARRAY_SIZE; 913bf215546Sopenharmony_ci else 914bf215546Sopenharmony_ci contents = QUNIFORM_TEXTURE_WIDTH + i; 915bf215546Sopenharmony_ci 916bf215546Sopenharmony_ci struct qreg size = vir_uniform(c, contents, unit); 917bf215546Sopenharmony_ci 918bf215546Sopenharmony_ci switch (instr->sampler_dim) { 919bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_1D: 920bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_2D: 921bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_MS: 922bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_3D: 923bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_CUBE: 924bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_BUF: 925bf215546Sopenharmony_ci /* Don't minify the array size. */ 926bf215546Sopenharmony_ci if (!(instr->is_array && i == dest_size - 1)) { 927bf215546Sopenharmony_ci size = ntq_minify(c, size, lod); 928bf215546Sopenharmony_ci } 929bf215546Sopenharmony_ci break; 930bf215546Sopenharmony_ci 931bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_RECT: 932bf215546Sopenharmony_ci /* There's no LOD field for rects */ 933bf215546Sopenharmony_ci break; 934bf215546Sopenharmony_ci 935bf215546Sopenharmony_ci default: 936bf215546Sopenharmony_ci unreachable("Bad sampler type"); 937bf215546Sopenharmony_ci } 938bf215546Sopenharmony_ci 939bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, i, size); 940bf215546Sopenharmony_ci } 941bf215546Sopenharmony_ci} 942bf215546Sopenharmony_ci 943bf215546Sopenharmony_cistatic void 944bf215546Sopenharmony_cintq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) 945bf215546Sopenharmony_ci{ 946bf215546Sopenharmony_ci unsigned unit = instr->texture_index; 947bf215546Sopenharmony_ci 948bf215546Sopenharmony_ci /* Since each texture sampling op requires uploading uniforms to 949bf215546Sopenharmony_ci * reference the texture, there's no HW support for texture size and 950bf215546Sopenharmony_ci * you just upload uniforms containing the size. 951bf215546Sopenharmony_ci */ 952bf215546Sopenharmony_ci switch (instr->op) { 953bf215546Sopenharmony_ci case nir_texop_query_levels: 954bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 955bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit)); 956bf215546Sopenharmony_ci return; 957bf215546Sopenharmony_ci case nir_texop_texture_samples: 958bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 959bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit)); 960bf215546Sopenharmony_ci return; 961bf215546Sopenharmony_ci case nir_texop_txs: 962bf215546Sopenharmony_ci ntq_emit_txs(c, instr); 963bf215546Sopenharmony_ci return; 964bf215546Sopenharmony_ci default: 965bf215546Sopenharmony_ci break; 966bf215546Sopenharmony_ci } 967bf215546Sopenharmony_ci 968bf215546Sopenharmony_ci if (c->devinfo->ver >= 40) 969bf215546Sopenharmony_ci v3d40_vir_emit_tex(c, instr); 970bf215546Sopenharmony_ci else 971bf215546Sopenharmony_ci v3d33_vir_emit_tex(c, instr); 972bf215546Sopenharmony_ci} 973bf215546Sopenharmony_ci 974bf215546Sopenharmony_cistatic struct qreg 975bf215546Sopenharmony_cintq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos) 976bf215546Sopenharmony_ci{ 977bf215546Sopenharmony_ci struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI)); 978bf215546Sopenharmony_ci if (is_cos) 979bf215546Sopenharmony_ci input = vir_FADD(c, input, vir_uniform_f(c, 0.5)); 980bf215546Sopenharmony_ci 981bf215546Sopenharmony_ci struct qreg periods = vir_FROUND(c, input); 982bf215546Sopenharmony_ci struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods)); 983bf215546Sopenharmony_ci return vir_XOR(c, sin_output, vir_SHL(c, 984bf215546Sopenharmony_ci vir_FTOIN(c, periods), 985bf215546Sopenharmony_ci vir_uniform_ui(c, -1))); 986bf215546Sopenharmony_ci} 987bf215546Sopenharmony_ci 988bf215546Sopenharmony_cistatic struct qreg 989bf215546Sopenharmony_cintq_fsign(struct v3d_compile *c, struct qreg src) 990bf215546Sopenharmony_ci{ 991bf215546Sopenharmony_ci struct qreg t = vir_get_temp(c); 992bf215546Sopenharmony_ci 993bf215546Sopenharmony_ci vir_MOV_dest(c, t, vir_uniform_f(c, 0.0)); 994bf215546Sopenharmony_ci vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ); 995bf215546Sopenharmony_ci vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0)); 996bf215546Sopenharmony_ci vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN); 997bf215546Sopenharmony_ci vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0)); 998bf215546Sopenharmony_ci return vir_MOV(c, t); 999bf215546Sopenharmony_ci} 1000bf215546Sopenharmony_ci 1001bf215546Sopenharmony_cistatic void 1002bf215546Sopenharmony_ciemit_fragcoord_input(struct v3d_compile *c, int attr) 1003bf215546Sopenharmony_ci{ 1004bf215546Sopenharmony_ci c->inputs[attr * 4 + 0] = vir_FXCD(c); 1005bf215546Sopenharmony_ci c->inputs[attr * 4 + 1] = vir_FYCD(c); 1006bf215546Sopenharmony_ci c->inputs[attr * 4 + 2] = c->payload_z; 1007bf215546Sopenharmony_ci c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w); 1008bf215546Sopenharmony_ci} 1009bf215546Sopenharmony_ci 1010bf215546Sopenharmony_cistatic struct qreg 1011bf215546Sopenharmony_ciemit_smooth_varying(struct v3d_compile *c, 1012bf215546Sopenharmony_ci struct qreg vary, struct qreg w, struct qreg r5) 1013bf215546Sopenharmony_ci{ 1014bf215546Sopenharmony_ci return vir_FADD(c, vir_FMUL(c, vary, w), r5); 1015bf215546Sopenharmony_ci} 1016bf215546Sopenharmony_ci 1017bf215546Sopenharmony_cistatic struct qreg 1018bf215546Sopenharmony_ciemit_noperspective_varying(struct v3d_compile *c, 1019bf215546Sopenharmony_ci struct qreg vary, struct qreg r5) 1020bf215546Sopenharmony_ci{ 1021bf215546Sopenharmony_ci return vir_FADD(c, vir_MOV(c, vary), r5); 1022bf215546Sopenharmony_ci} 1023bf215546Sopenharmony_ci 1024bf215546Sopenharmony_cistatic struct qreg 1025bf215546Sopenharmony_ciemit_flat_varying(struct v3d_compile *c, 1026bf215546Sopenharmony_ci struct qreg vary, struct qreg r5) 1027bf215546Sopenharmony_ci{ 1028bf215546Sopenharmony_ci vir_MOV_dest(c, c->undef, vary); 1029bf215546Sopenharmony_ci return vir_MOV(c, r5); 1030bf215546Sopenharmony_ci} 1031bf215546Sopenharmony_ci 1032bf215546Sopenharmony_cistatic struct qreg 1033bf215546Sopenharmony_ciemit_fragment_varying(struct v3d_compile *c, nir_variable *var, 1034bf215546Sopenharmony_ci int8_t input_idx, uint8_t swizzle, int array_index) 1035bf215546Sopenharmony_ci{ 1036bf215546Sopenharmony_ci struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3); 1037bf215546Sopenharmony_ci struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); 1038bf215546Sopenharmony_ci 1039bf215546Sopenharmony_ci struct qinst *ldvary = NULL; 1040bf215546Sopenharmony_ci struct qreg vary; 1041bf215546Sopenharmony_ci if (c->devinfo->ver >= 41) { 1042bf215546Sopenharmony_ci ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef, 1043bf215546Sopenharmony_ci c->undef, c->undef); 1044bf215546Sopenharmony_ci ldvary->qpu.sig.ldvary = true; 1045bf215546Sopenharmony_ci vary = vir_emit_def(c, ldvary); 1046bf215546Sopenharmony_ci } else { 1047bf215546Sopenharmony_ci vir_NOP(c)->qpu.sig.ldvary = true; 1048bf215546Sopenharmony_ci vary = r3; 1049bf215546Sopenharmony_ci } 1050bf215546Sopenharmony_ci 1051bf215546Sopenharmony_ci /* Store the input value before interpolation so we can implement 1052bf215546Sopenharmony_ci * GLSL's interpolateAt functions if the shader uses them. 1053bf215546Sopenharmony_ci */ 1054bf215546Sopenharmony_ci if (input_idx >= 0) { 1055bf215546Sopenharmony_ci assert(var); 1056bf215546Sopenharmony_ci c->interp[input_idx].vp = vary; 1057bf215546Sopenharmony_ci c->interp[input_idx].C = vir_MOV(c, r5); 1058bf215546Sopenharmony_ci c->interp[input_idx].mode = var->data.interpolation; 1059bf215546Sopenharmony_ci } 1060bf215546Sopenharmony_ci 1061bf215546Sopenharmony_ci /* For gl_PointCoord input or distance along a line, we'll be called 1062bf215546Sopenharmony_ci * with no nir_variable, and we don't count toward VPM size so we 1063bf215546Sopenharmony_ci * don't track an input slot. 1064bf215546Sopenharmony_ci */ 1065bf215546Sopenharmony_ci if (!var) { 1066bf215546Sopenharmony_ci assert(input_idx < 0); 1067bf215546Sopenharmony_ci return emit_smooth_varying(c, vary, c->payload_w, r5); 1068bf215546Sopenharmony_ci } 1069bf215546Sopenharmony_ci 1070bf215546Sopenharmony_ci int i = c->num_inputs++; 1071bf215546Sopenharmony_ci c->input_slots[i] = 1072bf215546Sopenharmony_ci v3d_slot_from_slot_and_component(var->data.location + 1073bf215546Sopenharmony_ci array_index, swizzle); 1074bf215546Sopenharmony_ci 1075bf215546Sopenharmony_ci struct qreg result; 1076bf215546Sopenharmony_ci switch (var->data.interpolation) { 1077bf215546Sopenharmony_ci case INTERP_MODE_NONE: 1078bf215546Sopenharmony_ci case INTERP_MODE_SMOOTH: 1079bf215546Sopenharmony_ci if (var->data.centroid) { 1080bf215546Sopenharmony_ci BITSET_SET(c->centroid_flags, i); 1081bf215546Sopenharmony_ci result = emit_smooth_varying(c, vary, 1082bf215546Sopenharmony_ci c->payload_w_centroid, r5); 1083bf215546Sopenharmony_ci } else { 1084bf215546Sopenharmony_ci result = emit_smooth_varying(c, vary, c->payload_w, r5); 1085bf215546Sopenharmony_ci } 1086bf215546Sopenharmony_ci break; 1087bf215546Sopenharmony_ci 1088bf215546Sopenharmony_ci case INTERP_MODE_NOPERSPECTIVE: 1089bf215546Sopenharmony_ci BITSET_SET(c->noperspective_flags, i); 1090bf215546Sopenharmony_ci result = emit_noperspective_varying(c, vary, r5); 1091bf215546Sopenharmony_ci break; 1092bf215546Sopenharmony_ci 1093bf215546Sopenharmony_ci case INTERP_MODE_FLAT: 1094bf215546Sopenharmony_ci BITSET_SET(c->flat_shade_flags, i); 1095bf215546Sopenharmony_ci result = emit_flat_varying(c, vary, r5); 1096bf215546Sopenharmony_ci break; 1097bf215546Sopenharmony_ci 1098bf215546Sopenharmony_ci default: 1099bf215546Sopenharmony_ci unreachable("Bad interp mode"); 1100bf215546Sopenharmony_ci } 1101bf215546Sopenharmony_ci 1102bf215546Sopenharmony_ci if (input_idx >= 0) 1103bf215546Sopenharmony_ci c->inputs[input_idx] = result; 1104bf215546Sopenharmony_ci return result; 1105bf215546Sopenharmony_ci} 1106bf215546Sopenharmony_ci 1107bf215546Sopenharmony_cistatic void 1108bf215546Sopenharmony_ciemit_fragment_input(struct v3d_compile *c, int base_attr, nir_variable *var, 1109bf215546Sopenharmony_ci int array_index, unsigned nelem) 1110bf215546Sopenharmony_ci{ 1111bf215546Sopenharmony_ci for (int i = 0; i < nelem ; i++) { 1112bf215546Sopenharmony_ci int chan = var->data.location_frac + i; 1113bf215546Sopenharmony_ci int input_idx = (base_attr + array_index) * 4 + chan; 1114bf215546Sopenharmony_ci emit_fragment_varying(c, var, input_idx, chan, array_index); 1115bf215546Sopenharmony_ci } 1116bf215546Sopenharmony_ci} 1117bf215546Sopenharmony_ci 1118bf215546Sopenharmony_cistatic void 1119bf215546Sopenharmony_ciemit_compact_fragment_input(struct v3d_compile *c, int attr, nir_variable *var, 1120bf215546Sopenharmony_ci int array_index) 1121bf215546Sopenharmony_ci{ 1122bf215546Sopenharmony_ci /* Compact variables are scalar arrays where each set of 4 elements 1123bf215546Sopenharmony_ci * consumes a single location. 1124bf215546Sopenharmony_ci */ 1125bf215546Sopenharmony_ci int loc_offset = array_index / 4; 1126bf215546Sopenharmony_ci int chan = var->data.location_frac + array_index % 4; 1127bf215546Sopenharmony_ci int input_idx = (attr + loc_offset) * 4 + chan; 1128bf215546Sopenharmony_ci emit_fragment_varying(c, var, input_idx, chan, loc_offset); 1129bf215546Sopenharmony_ci} 1130bf215546Sopenharmony_ci 1131bf215546Sopenharmony_cistatic void 1132bf215546Sopenharmony_ciadd_output(struct v3d_compile *c, 1133bf215546Sopenharmony_ci uint32_t decl_offset, 1134bf215546Sopenharmony_ci uint8_t slot, 1135bf215546Sopenharmony_ci uint8_t swizzle) 1136bf215546Sopenharmony_ci{ 1137bf215546Sopenharmony_ci uint32_t old_array_size = c->outputs_array_size; 1138bf215546Sopenharmony_ci resize_qreg_array(c, &c->outputs, &c->outputs_array_size, 1139bf215546Sopenharmony_ci decl_offset + 1); 1140bf215546Sopenharmony_ci 1141bf215546Sopenharmony_ci if (old_array_size != c->outputs_array_size) { 1142bf215546Sopenharmony_ci c->output_slots = reralloc(c, 1143bf215546Sopenharmony_ci c->output_slots, 1144bf215546Sopenharmony_ci struct v3d_varying_slot, 1145bf215546Sopenharmony_ci c->outputs_array_size); 1146bf215546Sopenharmony_ci } 1147bf215546Sopenharmony_ci 1148bf215546Sopenharmony_ci c->output_slots[decl_offset] = 1149bf215546Sopenharmony_ci v3d_slot_from_slot_and_component(slot, swizzle); 1150bf215546Sopenharmony_ci} 1151bf215546Sopenharmony_ci 1152bf215546Sopenharmony_ci/** 1153bf215546Sopenharmony_ci * If compare_instr is a valid comparison instruction, emits the 1154bf215546Sopenharmony_ci * compare_instr's comparison and returns the sel_instr's return value based 1155bf215546Sopenharmony_ci * on the compare_instr's result. 1156bf215546Sopenharmony_ci */ 1157bf215546Sopenharmony_cistatic bool 1158bf215546Sopenharmony_cintq_emit_comparison(struct v3d_compile *c, 1159bf215546Sopenharmony_ci nir_alu_instr *compare_instr, 1160bf215546Sopenharmony_ci enum v3d_qpu_cond *out_cond) 1161bf215546Sopenharmony_ci{ 1162bf215546Sopenharmony_ci struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0); 1163bf215546Sopenharmony_ci struct qreg src1; 1164bf215546Sopenharmony_ci if (nir_op_infos[compare_instr->op].num_inputs > 1) 1165bf215546Sopenharmony_ci src1 = ntq_get_alu_src(c, compare_instr, 1); 1166bf215546Sopenharmony_ci bool cond_invert = false; 1167bf215546Sopenharmony_ci struct qreg nop = vir_nop_reg(); 1168bf215546Sopenharmony_ci 1169bf215546Sopenharmony_ci switch (compare_instr->op) { 1170bf215546Sopenharmony_ci case nir_op_feq32: 1171bf215546Sopenharmony_ci case nir_op_seq: 1172bf215546Sopenharmony_ci vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ); 1173bf215546Sopenharmony_ci break; 1174bf215546Sopenharmony_ci case nir_op_ieq32: 1175bf215546Sopenharmony_ci vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ); 1176bf215546Sopenharmony_ci break; 1177bf215546Sopenharmony_ci 1178bf215546Sopenharmony_ci case nir_op_fneu32: 1179bf215546Sopenharmony_ci case nir_op_sne: 1180bf215546Sopenharmony_ci vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ); 1181bf215546Sopenharmony_ci cond_invert = true; 1182bf215546Sopenharmony_ci break; 1183bf215546Sopenharmony_ci case nir_op_ine32: 1184bf215546Sopenharmony_ci vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ); 1185bf215546Sopenharmony_ci cond_invert = true; 1186bf215546Sopenharmony_ci break; 1187bf215546Sopenharmony_ci 1188bf215546Sopenharmony_ci case nir_op_fge32: 1189bf215546Sopenharmony_ci case nir_op_sge: 1190bf215546Sopenharmony_ci vir_set_pf(c, vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC); 1191bf215546Sopenharmony_ci break; 1192bf215546Sopenharmony_ci case nir_op_ige32: 1193bf215546Sopenharmony_ci vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC); 1194bf215546Sopenharmony_ci cond_invert = true; 1195bf215546Sopenharmony_ci break; 1196bf215546Sopenharmony_ci case nir_op_uge32: 1197bf215546Sopenharmony_ci vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC); 1198bf215546Sopenharmony_ci cond_invert = true; 1199bf215546Sopenharmony_ci break; 1200bf215546Sopenharmony_ci 1201bf215546Sopenharmony_ci case nir_op_slt: 1202bf215546Sopenharmony_ci case nir_op_flt32: 1203bf215546Sopenharmony_ci vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN); 1204bf215546Sopenharmony_ci break; 1205bf215546Sopenharmony_ci case nir_op_ilt32: 1206bf215546Sopenharmony_ci vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC); 1207bf215546Sopenharmony_ci break; 1208bf215546Sopenharmony_ci case nir_op_ult32: 1209bf215546Sopenharmony_ci vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC); 1210bf215546Sopenharmony_ci break; 1211bf215546Sopenharmony_ci 1212bf215546Sopenharmony_ci case nir_op_i2b32: 1213bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ); 1214bf215546Sopenharmony_ci cond_invert = true; 1215bf215546Sopenharmony_ci break; 1216bf215546Sopenharmony_ci 1217bf215546Sopenharmony_ci case nir_op_f2b32: 1218bf215546Sopenharmony_ci vir_set_pf(c, vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ); 1219bf215546Sopenharmony_ci cond_invert = true; 1220bf215546Sopenharmony_ci break; 1221bf215546Sopenharmony_ci 1222bf215546Sopenharmony_ci default: 1223bf215546Sopenharmony_ci return false; 1224bf215546Sopenharmony_ci } 1225bf215546Sopenharmony_ci 1226bf215546Sopenharmony_ci *out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA; 1227bf215546Sopenharmony_ci 1228bf215546Sopenharmony_ci return true; 1229bf215546Sopenharmony_ci} 1230bf215546Sopenharmony_ci 1231bf215546Sopenharmony_ci/* Finds an ALU instruction that generates our src value that could 1232bf215546Sopenharmony_ci * (potentially) be greedily emitted in the consuming instruction. 1233bf215546Sopenharmony_ci */ 1234bf215546Sopenharmony_cistatic struct nir_alu_instr * 1235bf215546Sopenharmony_cintq_get_alu_parent(nir_src src) 1236bf215546Sopenharmony_ci{ 1237bf215546Sopenharmony_ci if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu) 1238bf215546Sopenharmony_ci return NULL; 1239bf215546Sopenharmony_ci nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr); 1240bf215546Sopenharmony_ci if (!instr) 1241bf215546Sopenharmony_ci return NULL; 1242bf215546Sopenharmony_ci 1243bf215546Sopenharmony_ci /* If the ALU instr's srcs are non-SSA, then we would have to avoid 1244bf215546Sopenharmony_ci * moving emission of the ALU instr down past another write of the 1245bf215546Sopenharmony_ci * src. 1246bf215546Sopenharmony_ci */ 1247bf215546Sopenharmony_ci for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 1248bf215546Sopenharmony_ci if (!instr->src[i].src.is_ssa) 1249bf215546Sopenharmony_ci return NULL; 1250bf215546Sopenharmony_ci } 1251bf215546Sopenharmony_ci 1252bf215546Sopenharmony_ci return instr; 1253bf215546Sopenharmony_ci} 1254bf215546Sopenharmony_ci 1255bf215546Sopenharmony_ci/* Turns a NIR bool into a condition code to predicate on. */ 1256bf215546Sopenharmony_cistatic enum v3d_qpu_cond 1257bf215546Sopenharmony_cintq_emit_bool_to_cond(struct v3d_compile *c, nir_src src) 1258bf215546Sopenharmony_ci{ 1259bf215546Sopenharmony_ci struct qreg qsrc = ntq_get_src(c, src, 0); 1260bf215546Sopenharmony_ci /* skip if we already have src in the flags */ 1261bf215546Sopenharmony_ci if (qsrc.file == QFILE_TEMP && c->flags_temp == qsrc.index) 1262bf215546Sopenharmony_ci return c->flags_cond; 1263bf215546Sopenharmony_ci 1264bf215546Sopenharmony_ci nir_alu_instr *compare = ntq_get_alu_parent(src); 1265bf215546Sopenharmony_ci if (!compare) 1266bf215546Sopenharmony_ci goto out; 1267bf215546Sopenharmony_ci 1268bf215546Sopenharmony_ci enum v3d_qpu_cond cond; 1269bf215546Sopenharmony_ci if (ntq_emit_comparison(c, compare, &cond)) 1270bf215546Sopenharmony_ci return cond; 1271bf215546Sopenharmony_ci 1272bf215546Sopenharmony_ciout: 1273bf215546Sopenharmony_ci 1274bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)), 1275bf215546Sopenharmony_ci V3D_QPU_PF_PUSHZ); 1276bf215546Sopenharmony_ci return V3D_QPU_COND_IFNA; 1277bf215546Sopenharmony_ci} 1278bf215546Sopenharmony_ci 1279bf215546Sopenharmony_cistatic struct qreg 1280bf215546Sopenharmony_cintq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond) 1281bf215546Sopenharmony_ci{ 1282bf215546Sopenharmony_ci struct qreg result = 1283bf215546Sopenharmony_ci vir_MOV(c, vir_SEL(c, cond, 1284bf215546Sopenharmony_ci vir_uniform_ui(c, ~0), 1285bf215546Sopenharmony_ci vir_uniform_ui(c, 0))); 1286bf215546Sopenharmony_ci c->flags_temp = result.index; 1287bf215546Sopenharmony_ci c->flags_cond = cond; 1288bf215546Sopenharmony_ci return result; 1289bf215546Sopenharmony_ci} 1290bf215546Sopenharmony_ci 1291bf215546Sopenharmony_cistatic struct qreg 1292bf215546Sopenharmony_cintq_emit_cond_to_int(struct v3d_compile *c, enum v3d_qpu_cond cond) 1293bf215546Sopenharmony_ci{ 1294bf215546Sopenharmony_ci struct qreg result = 1295bf215546Sopenharmony_ci vir_MOV(c, vir_SEL(c, cond, 1296bf215546Sopenharmony_ci vir_uniform_ui(c, 1), 1297bf215546Sopenharmony_ci vir_uniform_ui(c, 0))); 1298bf215546Sopenharmony_ci c->flags_temp = result.index; 1299bf215546Sopenharmony_ci c->flags_cond = cond; 1300bf215546Sopenharmony_ci return result; 1301bf215546Sopenharmony_ci} 1302bf215546Sopenharmony_ci 1303bf215546Sopenharmony_cistatic struct qreg 1304bf215546Sopenharmony_cif2f16_rtz(struct v3d_compile *c, struct qreg f32) 1305bf215546Sopenharmony_ci{ 1306bf215546Sopenharmony_ci /* The GPU doesn't provide a mechanism to modify the f32->f16 rounding 1307bf215546Sopenharmony_ci * method and seems to be using RTE by default, so we need to implement 1308bf215546Sopenharmony_ci * RTZ rounding in software :-( 1309bf215546Sopenharmony_ci * 1310bf215546Sopenharmony_ci * The implementation identifies the cases where RTZ applies and 1311bf215546Sopenharmony_ci * returns the correct result and for everything else, it just uses 1312bf215546Sopenharmony_ci * the default RTE conversion. 1313bf215546Sopenharmony_ci */ 1314bf215546Sopenharmony_ci static bool _first = true; 1315bf215546Sopenharmony_ci if (_first && unlikely(V3D_DEBUG & V3D_DEBUG_PERF)) { 1316bf215546Sopenharmony_ci fprintf(stderr, "Shader uses round-toward-zero f32->f16 " 1317bf215546Sopenharmony_ci "conversion which is not supported in hardware.\n"); 1318bf215546Sopenharmony_ci _first = false; 1319bf215546Sopenharmony_ci } 1320bf215546Sopenharmony_ci 1321bf215546Sopenharmony_ci struct qinst *inst; 1322bf215546Sopenharmony_ci struct qreg tmp; 1323bf215546Sopenharmony_ci 1324bf215546Sopenharmony_ci struct qreg result = vir_get_temp(c); 1325bf215546Sopenharmony_ci 1326bf215546Sopenharmony_ci struct qreg mantissa32 = vir_AND(c, f32, vir_uniform_ui(c, 0x007fffff)); 1327bf215546Sopenharmony_ci 1328bf215546Sopenharmony_ci /* Compute sign bit of result */ 1329bf215546Sopenharmony_ci struct qreg sign = vir_AND(c, vir_SHR(c, f32, vir_uniform_ui(c, 16)), 1330bf215546Sopenharmony_ci vir_uniform_ui(c, 0x8000)); 1331bf215546Sopenharmony_ci 1332bf215546Sopenharmony_ci /* Check the cases were RTZ rounding is relevant based on exponent */ 1333bf215546Sopenharmony_ci struct qreg exp32 = vir_AND(c, vir_SHR(c, f32, vir_uniform_ui(c, 23)), 1334bf215546Sopenharmony_ci vir_uniform_ui(c, 0xff)); 1335bf215546Sopenharmony_ci struct qreg exp16 = vir_ADD(c, exp32, vir_uniform_ui(c, -127 + 15)); 1336bf215546Sopenharmony_ci 1337bf215546Sopenharmony_ci /* if (exp16 > 30) */ 1338bf215546Sopenharmony_ci inst = vir_MIN_dest(c, vir_nop_reg(), exp16, vir_uniform_ui(c, 30)); 1339bf215546Sopenharmony_ci vir_set_pf(c, inst, V3D_QPU_PF_PUSHC); 1340bf215546Sopenharmony_ci inst = vir_OR_dest(c, result, sign, vir_uniform_ui(c, 0x7bff)); 1341bf215546Sopenharmony_ci vir_set_cond(inst, V3D_QPU_COND_IFA); 1342bf215546Sopenharmony_ci 1343bf215546Sopenharmony_ci /* if (exp16 <= 30) */ 1344bf215546Sopenharmony_ci inst = vir_OR_dest(c, result, 1345bf215546Sopenharmony_ci vir_OR(c, sign, 1346bf215546Sopenharmony_ci vir_SHL(c, exp16, vir_uniform_ui(c, 10))), 1347bf215546Sopenharmony_ci vir_SHR(c, mantissa32, vir_uniform_ui(c, 13))); 1348bf215546Sopenharmony_ci vir_set_cond(inst, V3D_QPU_COND_IFNA); 1349bf215546Sopenharmony_ci 1350bf215546Sopenharmony_ci /* if (exp16 <= 0) */ 1351bf215546Sopenharmony_ci inst = vir_MIN_dest(c, vir_nop_reg(), exp16, vir_uniform_ui(c, 0)); 1352bf215546Sopenharmony_ci vir_set_pf(c, inst, V3D_QPU_PF_PUSHC); 1353bf215546Sopenharmony_ci 1354bf215546Sopenharmony_ci tmp = vir_OR(c, mantissa32, vir_uniform_ui(c, 0x800000)); 1355bf215546Sopenharmony_ci tmp = vir_SHR(c, tmp, vir_SUB(c, vir_uniform_ui(c, 14), exp16)); 1356bf215546Sopenharmony_ci inst = vir_OR_dest(c, result, sign, tmp); 1357bf215546Sopenharmony_ci vir_set_cond(inst, V3D_QPU_COND_IFNA); 1358bf215546Sopenharmony_ci 1359bf215546Sopenharmony_ci /* Cases where RTZ mode is not relevant: use default RTE conversion. 1360bf215546Sopenharmony_ci * 1361bf215546Sopenharmony_ci * The cases that are not affected by RTZ are: 1362bf215546Sopenharmony_ci * 1363bf215546Sopenharmony_ci * exp16 < - 10 || exp32 == 0 || exp32 == 0xff 1364bf215546Sopenharmony_ci * 1365bf215546Sopenharmony_ci * In V3D we can implement this condition as: 1366bf215546Sopenharmony_ci * 1367bf215546Sopenharmony_ci * !((exp16 >= -10) && !(exp32 == 0) && !(exp32 == 0xff))) 1368bf215546Sopenharmony_ci */ 1369bf215546Sopenharmony_ci 1370bf215546Sopenharmony_ci /* exp16 >= -10 */ 1371bf215546Sopenharmony_ci inst = vir_MIN_dest(c, vir_nop_reg(), exp16, vir_uniform_ui(c, -10)); 1372bf215546Sopenharmony_ci vir_set_pf(c, inst, V3D_QPU_PF_PUSHC); 1373bf215546Sopenharmony_ci 1374bf215546Sopenharmony_ci /* && !(exp32 == 0) */ 1375bf215546Sopenharmony_ci inst = vir_MOV_dest(c, vir_nop_reg(), exp32); 1376bf215546Sopenharmony_ci vir_set_uf(c, inst, V3D_QPU_UF_ANDNZ); 1377bf215546Sopenharmony_ci 1378bf215546Sopenharmony_ci /* && !(exp32 == 0xff) */ 1379bf215546Sopenharmony_ci inst = vir_XOR_dest(c, vir_nop_reg(), exp32, vir_uniform_ui(c, 0xff)); 1380bf215546Sopenharmony_ci vir_set_uf(c, inst, V3D_QPU_UF_ANDNZ); 1381bf215546Sopenharmony_ci 1382bf215546Sopenharmony_ci /* Use regular RTE conversion if condition is False */ 1383bf215546Sopenharmony_ci inst = vir_FMOV_dest(c, result, f32); 1384bf215546Sopenharmony_ci vir_set_pack(inst, V3D_QPU_PACK_L); 1385bf215546Sopenharmony_ci vir_set_cond(inst, V3D_QPU_COND_IFNA); 1386bf215546Sopenharmony_ci 1387bf215546Sopenharmony_ci return vir_MOV(c, result); 1388bf215546Sopenharmony_ci} 1389bf215546Sopenharmony_ci 1390bf215546Sopenharmony_ci/** 1391bf215546Sopenharmony_ci * Takes the result value of a signed integer width conversion from a smaller 1392bf215546Sopenharmony_ci * type to a larger type and if needed, it applies sign extension to it. 1393bf215546Sopenharmony_ci */ 1394bf215546Sopenharmony_cistatic struct qreg 1395bf215546Sopenharmony_cisign_extend(struct v3d_compile *c, 1396bf215546Sopenharmony_ci struct qreg value, 1397bf215546Sopenharmony_ci uint32_t src_bit_size, 1398bf215546Sopenharmony_ci uint32_t dst_bit_size) 1399bf215546Sopenharmony_ci{ 1400bf215546Sopenharmony_ci assert(src_bit_size < dst_bit_size); 1401bf215546Sopenharmony_ci 1402bf215546Sopenharmony_ci struct qreg tmp = vir_MOV(c, value); 1403bf215546Sopenharmony_ci 1404bf215546Sopenharmony_ci /* Do we need to sign-extend? */ 1405bf215546Sopenharmony_ci uint32_t sign_mask = 1 << (src_bit_size - 1); 1406bf215546Sopenharmony_ci struct qinst *sign_check = 1407bf215546Sopenharmony_ci vir_AND_dest(c, vir_nop_reg(), 1408bf215546Sopenharmony_ci tmp, vir_uniform_ui(c, sign_mask)); 1409bf215546Sopenharmony_ci vir_set_pf(c, sign_check, V3D_QPU_PF_PUSHZ); 1410bf215546Sopenharmony_ci 1411bf215546Sopenharmony_ci /* If so, fill in leading sign bits */ 1412bf215546Sopenharmony_ci uint32_t extend_bits = ~(((1 << src_bit_size) - 1)) & 1413bf215546Sopenharmony_ci ((1ull << dst_bit_size) - 1); 1414bf215546Sopenharmony_ci struct qinst *extend_inst = 1415bf215546Sopenharmony_ci vir_OR_dest(c, tmp, tmp, 1416bf215546Sopenharmony_ci vir_uniform_ui(c, extend_bits)); 1417bf215546Sopenharmony_ci vir_set_cond(extend_inst, V3D_QPU_COND_IFNA); 1418bf215546Sopenharmony_ci 1419bf215546Sopenharmony_ci return tmp; 1420bf215546Sopenharmony_ci} 1421bf215546Sopenharmony_ci 1422bf215546Sopenharmony_cistatic void 1423bf215546Sopenharmony_cintq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) 1424bf215546Sopenharmony_ci{ 1425bf215546Sopenharmony_ci /* This should always be lowered to ALU operations for V3D. */ 1426bf215546Sopenharmony_ci assert(!instr->dest.saturate); 1427bf215546Sopenharmony_ci 1428bf215546Sopenharmony_ci /* Vectors are special in that they have non-scalarized writemasks, 1429bf215546Sopenharmony_ci * and just take the first swizzle channel for each argument in order 1430bf215546Sopenharmony_ci * into each writemask channel. 1431bf215546Sopenharmony_ci */ 1432bf215546Sopenharmony_ci if (instr->op == nir_op_vec2 || 1433bf215546Sopenharmony_ci instr->op == nir_op_vec3 || 1434bf215546Sopenharmony_ci instr->op == nir_op_vec4) { 1435bf215546Sopenharmony_ci struct qreg srcs[4]; 1436bf215546Sopenharmony_ci for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) 1437bf215546Sopenharmony_ci srcs[i] = ntq_get_src(c, instr->src[i].src, 1438bf215546Sopenharmony_ci instr->src[i].swizzle[0]); 1439bf215546Sopenharmony_ci for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) 1440bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest.dest, i, 1441bf215546Sopenharmony_ci vir_MOV(c, srcs[i])); 1442bf215546Sopenharmony_ci return; 1443bf215546Sopenharmony_ci } 1444bf215546Sopenharmony_ci 1445bf215546Sopenharmony_ci /* General case: We can just grab the one used channel per src. */ 1446bf215546Sopenharmony_ci struct qreg src[nir_op_infos[instr->op].num_inputs]; 1447bf215546Sopenharmony_ci for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 1448bf215546Sopenharmony_ci src[i] = ntq_get_alu_src(c, instr, i); 1449bf215546Sopenharmony_ci } 1450bf215546Sopenharmony_ci 1451bf215546Sopenharmony_ci struct qreg result; 1452bf215546Sopenharmony_ci 1453bf215546Sopenharmony_ci switch (instr->op) { 1454bf215546Sopenharmony_ci case nir_op_mov: 1455bf215546Sopenharmony_ci result = vir_MOV(c, src[0]); 1456bf215546Sopenharmony_ci break; 1457bf215546Sopenharmony_ci 1458bf215546Sopenharmony_ci case nir_op_fneg: 1459bf215546Sopenharmony_ci result = vir_XOR(c, src[0], vir_uniform_ui(c, 1 << 31)); 1460bf215546Sopenharmony_ci break; 1461bf215546Sopenharmony_ci case nir_op_ineg: 1462bf215546Sopenharmony_ci result = vir_NEG(c, src[0]); 1463bf215546Sopenharmony_ci break; 1464bf215546Sopenharmony_ci 1465bf215546Sopenharmony_ci case nir_op_fmul: 1466bf215546Sopenharmony_ci result = vir_FMUL(c, src[0], src[1]); 1467bf215546Sopenharmony_ci break; 1468bf215546Sopenharmony_ci case nir_op_fadd: 1469bf215546Sopenharmony_ci result = vir_FADD(c, src[0], src[1]); 1470bf215546Sopenharmony_ci break; 1471bf215546Sopenharmony_ci case nir_op_fsub: 1472bf215546Sopenharmony_ci result = vir_FSUB(c, src[0], src[1]); 1473bf215546Sopenharmony_ci break; 1474bf215546Sopenharmony_ci case nir_op_fmin: 1475bf215546Sopenharmony_ci result = vir_FMIN(c, src[0], src[1]); 1476bf215546Sopenharmony_ci break; 1477bf215546Sopenharmony_ci case nir_op_fmax: 1478bf215546Sopenharmony_ci result = vir_FMAX(c, src[0], src[1]); 1479bf215546Sopenharmony_ci break; 1480bf215546Sopenharmony_ci 1481bf215546Sopenharmony_ci case nir_op_f2i32: { 1482bf215546Sopenharmony_ci nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src); 1483bf215546Sopenharmony_ci if (src0_alu && src0_alu->op == nir_op_fround_even) { 1484bf215546Sopenharmony_ci result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0)); 1485bf215546Sopenharmony_ci } else { 1486bf215546Sopenharmony_ci result = vir_FTOIZ(c, src[0]); 1487bf215546Sopenharmony_ci } 1488bf215546Sopenharmony_ci break; 1489bf215546Sopenharmony_ci } 1490bf215546Sopenharmony_ci 1491bf215546Sopenharmony_ci case nir_op_f2u32: 1492bf215546Sopenharmony_ci result = vir_FTOUZ(c, src[0]); 1493bf215546Sopenharmony_ci break; 1494bf215546Sopenharmony_ci case nir_op_i2f32: 1495bf215546Sopenharmony_ci result = vir_ITOF(c, src[0]); 1496bf215546Sopenharmony_ci break; 1497bf215546Sopenharmony_ci case nir_op_u2f32: 1498bf215546Sopenharmony_ci result = vir_UTOF(c, src[0]); 1499bf215546Sopenharmony_ci break; 1500bf215546Sopenharmony_ci case nir_op_b2f32: 1501bf215546Sopenharmony_ci result = vir_AND(c, src[0], vir_uniform_f(c, 1.0)); 1502bf215546Sopenharmony_ci break; 1503bf215546Sopenharmony_ci case nir_op_b2i32: 1504bf215546Sopenharmony_ci result = vir_AND(c, src[0], vir_uniform_ui(c, 1)); 1505bf215546Sopenharmony_ci break; 1506bf215546Sopenharmony_ci 1507bf215546Sopenharmony_ci case nir_op_f2f16: 1508bf215546Sopenharmony_ci case nir_op_f2f16_rtne: 1509bf215546Sopenharmony_ci assert(nir_src_bit_size(instr->src[0].src) == 32); 1510bf215546Sopenharmony_ci result = vir_FMOV(c, src[0]); 1511bf215546Sopenharmony_ci vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L); 1512bf215546Sopenharmony_ci break; 1513bf215546Sopenharmony_ci 1514bf215546Sopenharmony_ci case nir_op_f2f16_rtz: 1515bf215546Sopenharmony_ci assert(nir_src_bit_size(instr->src[0].src) == 32); 1516bf215546Sopenharmony_ci result = f2f16_rtz(c, src[0]); 1517bf215546Sopenharmony_ci break; 1518bf215546Sopenharmony_ci 1519bf215546Sopenharmony_ci case nir_op_f2f32: 1520bf215546Sopenharmony_ci assert(nir_src_bit_size(instr->src[0].src) == 16); 1521bf215546Sopenharmony_ci result = vir_FMOV(c, src[0]); 1522bf215546Sopenharmony_ci vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); 1523bf215546Sopenharmony_ci break; 1524bf215546Sopenharmony_ci 1525bf215546Sopenharmony_ci case nir_op_i2i16: { 1526bf215546Sopenharmony_ci uint32_t bit_size = nir_src_bit_size(instr->src[0].src); 1527bf215546Sopenharmony_ci assert(bit_size == 32 || bit_size == 8); 1528bf215546Sopenharmony_ci if (bit_size == 32) { 1529bf215546Sopenharmony_ci /* We don't have integer pack/unpack methods for 1530bf215546Sopenharmony_ci * converting between 16-bit and 32-bit, so we implement 1531bf215546Sopenharmony_ci * the conversion manually by truncating the src. 1532bf215546Sopenharmony_ci */ 1533bf215546Sopenharmony_ci result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff)); 1534bf215546Sopenharmony_ci } else { 1535bf215546Sopenharmony_ci struct qreg tmp = vir_AND(c, src[0], 1536bf215546Sopenharmony_ci vir_uniform_ui(c, 0xff)); 1537bf215546Sopenharmony_ci result = vir_MOV(c, sign_extend(c, tmp, bit_size, 16)); 1538bf215546Sopenharmony_ci } 1539bf215546Sopenharmony_ci break; 1540bf215546Sopenharmony_ci } 1541bf215546Sopenharmony_ci 1542bf215546Sopenharmony_ci case nir_op_u2u16: { 1543bf215546Sopenharmony_ci uint32_t bit_size = nir_src_bit_size(instr->src[0].src); 1544bf215546Sopenharmony_ci assert(bit_size == 32 || bit_size == 8); 1545bf215546Sopenharmony_ci 1546bf215546Sopenharmony_ci /* We don't have integer pack/unpack methods for converting 1547bf215546Sopenharmony_ci * between 16-bit and 32-bit, so we implement the conversion 1548bf215546Sopenharmony_ci * manually by truncating the src. For the 8-bit case, we 1549bf215546Sopenharmony_ci * want to make sure we don't copy garbage from any of the 1550bf215546Sopenharmony_ci * 24 MSB bits. 1551bf215546Sopenharmony_ci */ 1552bf215546Sopenharmony_ci if (bit_size == 32) 1553bf215546Sopenharmony_ci result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff)); 1554bf215546Sopenharmony_ci else 1555bf215546Sopenharmony_ci result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff)); 1556bf215546Sopenharmony_ci break; 1557bf215546Sopenharmony_ci } 1558bf215546Sopenharmony_ci 1559bf215546Sopenharmony_ci case nir_op_i2i8: 1560bf215546Sopenharmony_ci case nir_op_u2u8: 1561bf215546Sopenharmony_ci assert(nir_src_bit_size(instr->src[0].src) == 32 || 1562bf215546Sopenharmony_ci nir_src_bit_size(instr->src[0].src) == 16); 1563bf215546Sopenharmony_ci /* We don't have integer pack/unpack methods for converting 1564bf215546Sopenharmony_ci * between 8-bit and 32-bit, so we implement the conversion 1565bf215546Sopenharmony_ci * manually by truncating the src. 1566bf215546Sopenharmony_ci */ 1567bf215546Sopenharmony_ci result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff)); 1568bf215546Sopenharmony_ci break; 1569bf215546Sopenharmony_ci 1570bf215546Sopenharmony_ci case nir_op_u2u32: { 1571bf215546Sopenharmony_ci uint32_t bit_size = nir_src_bit_size(instr->src[0].src); 1572bf215546Sopenharmony_ci assert(bit_size == 16 || bit_size == 8); 1573bf215546Sopenharmony_ci 1574bf215546Sopenharmony_ci /* we don't have a native 8-bit/16-bit MOV so we copy all 32-bit 1575bf215546Sopenharmony_ci * from the src but we make sure to clear any garbage bits that 1576bf215546Sopenharmony_ci * may be present in the invalid src bits. 1577bf215546Sopenharmony_ci */ 1578bf215546Sopenharmony_ci uint32_t mask = (1 << bit_size) - 1; 1579bf215546Sopenharmony_ci result = vir_AND(c, src[0], vir_uniform_ui(c, mask)); 1580bf215546Sopenharmony_ci break; 1581bf215546Sopenharmony_ci } 1582bf215546Sopenharmony_ci 1583bf215546Sopenharmony_ci case nir_op_i2i32: { 1584bf215546Sopenharmony_ci uint32_t bit_size = nir_src_bit_size(instr->src[0].src); 1585bf215546Sopenharmony_ci assert(bit_size == 16 || bit_size == 8); 1586bf215546Sopenharmony_ci 1587bf215546Sopenharmony_ci uint32_t mask = (1 << bit_size) - 1; 1588bf215546Sopenharmony_ci struct qreg tmp = vir_AND(c, src[0], 1589bf215546Sopenharmony_ci vir_uniform_ui(c, mask)); 1590bf215546Sopenharmony_ci 1591bf215546Sopenharmony_ci result = vir_MOV(c, sign_extend(c, tmp, bit_size, 32)); 1592bf215546Sopenharmony_ci break; 1593bf215546Sopenharmony_ci } 1594bf215546Sopenharmony_ci 1595bf215546Sopenharmony_ci case nir_op_iadd: 1596bf215546Sopenharmony_ci result = vir_ADD(c, src[0], src[1]); 1597bf215546Sopenharmony_ci break; 1598bf215546Sopenharmony_ci case nir_op_ushr: 1599bf215546Sopenharmony_ci result = vir_SHR(c, src[0], src[1]); 1600bf215546Sopenharmony_ci break; 1601bf215546Sopenharmony_ci case nir_op_isub: 1602bf215546Sopenharmony_ci result = vir_SUB(c, src[0], src[1]); 1603bf215546Sopenharmony_ci break; 1604bf215546Sopenharmony_ci case nir_op_ishr: 1605bf215546Sopenharmony_ci result = vir_ASR(c, src[0], src[1]); 1606bf215546Sopenharmony_ci break; 1607bf215546Sopenharmony_ci case nir_op_ishl: 1608bf215546Sopenharmony_ci result = vir_SHL(c, src[0], src[1]); 1609bf215546Sopenharmony_ci break; 1610bf215546Sopenharmony_ci case nir_op_imin: 1611bf215546Sopenharmony_ci result = vir_MIN(c, src[0], src[1]); 1612bf215546Sopenharmony_ci break; 1613bf215546Sopenharmony_ci case nir_op_umin: 1614bf215546Sopenharmony_ci result = vir_UMIN(c, src[0], src[1]); 1615bf215546Sopenharmony_ci break; 1616bf215546Sopenharmony_ci case nir_op_imax: 1617bf215546Sopenharmony_ci result = vir_MAX(c, src[0], src[1]); 1618bf215546Sopenharmony_ci break; 1619bf215546Sopenharmony_ci case nir_op_umax: 1620bf215546Sopenharmony_ci result = vir_UMAX(c, src[0], src[1]); 1621bf215546Sopenharmony_ci break; 1622bf215546Sopenharmony_ci case nir_op_iand: 1623bf215546Sopenharmony_ci result = vir_AND(c, src[0], src[1]); 1624bf215546Sopenharmony_ci break; 1625bf215546Sopenharmony_ci case nir_op_ior: 1626bf215546Sopenharmony_ci result = vir_OR(c, src[0], src[1]); 1627bf215546Sopenharmony_ci break; 1628bf215546Sopenharmony_ci case nir_op_ixor: 1629bf215546Sopenharmony_ci result = vir_XOR(c, src[0], src[1]); 1630bf215546Sopenharmony_ci break; 1631bf215546Sopenharmony_ci case nir_op_inot: 1632bf215546Sopenharmony_ci result = vir_NOT(c, src[0]); 1633bf215546Sopenharmony_ci break; 1634bf215546Sopenharmony_ci 1635bf215546Sopenharmony_ci case nir_op_ufind_msb: 1636bf215546Sopenharmony_ci result = vir_SUB(c, vir_uniform_ui(c, 31), vir_CLZ(c, src[0])); 1637bf215546Sopenharmony_ci break; 1638bf215546Sopenharmony_ci 1639bf215546Sopenharmony_ci case nir_op_imul: 1640bf215546Sopenharmony_ci result = vir_UMUL(c, src[0], src[1]); 1641bf215546Sopenharmony_ci break; 1642bf215546Sopenharmony_ci 1643bf215546Sopenharmony_ci case nir_op_seq: 1644bf215546Sopenharmony_ci case nir_op_sne: 1645bf215546Sopenharmony_ci case nir_op_sge: 1646bf215546Sopenharmony_ci case nir_op_slt: { 1647bf215546Sopenharmony_ci enum v3d_qpu_cond cond; 1648bf215546Sopenharmony_ci ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond); 1649bf215546Sopenharmony_ci assert(ok); 1650bf215546Sopenharmony_ci result = vir_MOV(c, vir_SEL(c, cond, 1651bf215546Sopenharmony_ci vir_uniform_f(c, 1.0), 1652bf215546Sopenharmony_ci vir_uniform_f(c, 0.0))); 1653bf215546Sopenharmony_ci c->flags_temp = result.index; 1654bf215546Sopenharmony_ci c->flags_cond = cond; 1655bf215546Sopenharmony_ci break; 1656bf215546Sopenharmony_ci } 1657bf215546Sopenharmony_ci 1658bf215546Sopenharmony_ci case nir_op_i2b32: 1659bf215546Sopenharmony_ci case nir_op_f2b32: 1660bf215546Sopenharmony_ci case nir_op_feq32: 1661bf215546Sopenharmony_ci case nir_op_fneu32: 1662bf215546Sopenharmony_ci case nir_op_fge32: 1663bf215546Sopenharmony_ci case nir_op_flt32: 1664bf215546Sopenharmony_ci case nir_op_ieq32: 1665bf215546Sopenharmony_ci case nir_op_ine32: 1666bf215546Sopenharmony_ci case nir_op_ige32: 1667bf215546Sopenharmony_ci case nir_op_uge32: 1668bf215546Sopenharmony_ci case nir_op_ilt32: 1669bf215546Sopenharmony_ci case nir_op_ult32: { 1670bf215546Sopenharmony_ci enum v3d_qpu_cond cond; 1671bf215546Sopenharmony_ci ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond); 1672bf215546Sopenharmony_ci assert(ok); 1673bf215546Sopenharmony_ci result = ntq_emit_cond_to_bool(c, cond); 1674bf215546Sopenharmony_ci break; 1675bf215546Sopenharmony_ci } 1676bf215546Sopenharmony_ci 1677bf215546Sopenharmony_ci case nir_op_b32csel: 1678bf215546Sopenharmony_ci result = vir_MOV(c, 1679bf215546Sopenharmony_ci vir_SEL(c, 1680bf215546Sopenharmony_ci ntq_emit_bool_to_cond(c, instr->src[0].src), 1681bf215546Sopenharmony_ci src[1], src[2])); 1682bf215546Sopenharmony_ci break; 1683bf215546Sopenharmony_ci 1684bf215546Sopenharmony_ci case nir_op_fcsel: 1685bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), src[0]), 1686bf215546Sopenharmony_ci V3D_QPU_PF_PUSHZ); 1687bf215546Sopenharmony_ci result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, 1688bf215546Sopenharmony_ci src[1], src[2])); 1689bf215546Sopenharmony_ci break; 1690bf215546Sopenharmony_ci 1691bf215546Sopenharmony_ci case nir_op_frcp: 1692bf215546Sopenharmony_ci result = vir_RECIP(c, src[0]); 1693bf215546Sopenharmony_ci break; 1694bf215546Sopenharmony_ci case nir_op_frsq: 1695bf215546Sopenharmony_ci result = vir_RSQRT(c, src[0]); 1696bf215546Sopenharmony_ci break; 1697bf215546Sopenharmony_ci case nir_op_fexp2: 1698bf215546Sopenharmony_ci result = vir_EXP(c, src[0]); 1699bf215546Sopenharmony_ci break; 1700bf215546Sopenharmony_ci case nir_op_flog2: 1701bf215546Sopenharmony_ci result = vir_LOG(c, src[0]); 1702bf215546Sopenharmony_ci break; 1703bf215546Sopenharmony_ci 1704bf215546Sopenharmony_ci case nir_op_fceil: 1705bf215546Sopenharmony_ci result = vir_FCEIL(c, src[0]); 1706bf215546Sopenharmony_ci break; 1707bf215546Sopenharmony_ci case nir_op_ffloor: 1708bf215546Sopenharmony_ci result = vir_FFLOOR(c, src[0]); 1709bf215546Sopenharmony_ci break; 1710bf215546Sopenharmony_ci case nir_op_fround_even: 1711bf215546Sopenharmony_ci result = vir_FROUND(c, src[0]); 1712bf215546Sopenharmony_ci break; 1713bf215546Sopenharmony_ci case nir_op_ftrunc: 1714bf215546Sopenharmony_ci result = vir_FTRUNC(c, src[0]); 1715bf215546Sopenharmony_ci break; 1716bf215546Sopenharmony_ci 1717bf215546Sopenharmony_ci case nir_op_fsin: 1718bf215546Sopenharmony_ci result = ntq_fsincos(c, src[0], false); 1719bf215546Sopenharmony_ci break; 1720bf215546Sopenharmony_ci case nir_op_fcos: 1721bf215546Sopenharmony_ci result = ntq_fsincos(c, src[0], true); 1722bf215546Sopenharmony_ci break; 1723bf215546Sopenharmony_ci 1724bf215546Sopenharmony_ci case nir_op_fsign: 1725bf215546Sopenharmony_ci result = ntq_fsign(c, src[0]); 1726bf215546Sopenharmony_ci break; 1727bf215546Sopenharmony_ci 1728bf215546Sopenharmony_ci case nir_op_fabs: { 1729bf215546Sopenharmony_ci result = vir_FMOV(c, src[0]); 1730bf215546Sopenharmony_ci vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS); 1731bf215546Sopenharmony_ci break; 1732bf215546Sopenharmony_ci } 1733bf215546Sopenharmony_ci 1734bf215546Sopenharmony_ci case nir_op_iabs: 1735bf215546Sopenharmony_ci result = vir_MAX(c, src[0], vir_NEG(c, src[0])); 1736bf215546Sopenharmony_ci break; 1737bf215546Sopenharmony_ci 1738bf215546Sopenharmony_ci case nir_op_fddx: 1739bf215546Sopenharmony_ci case nir_op_fddx_coarse: 1740bf215546Sopenharmony_ci case nir_op_fddx_fine: 1741bf215546Sopenharmony_ci result = vir_FDX(c, src[0]); 1742bf215546Sopenharmony_ci break; 1743bf215546Sopenharmony_ci 1744bf215546Sopenharmony_ci case nir_op_fddy: 1745bf215546Sopenharmony_ci case nir_op_fddy_coarse: 1746bf215546Sopenharmony_ci case nir_op_fddy_fine: 1747bf215546Sopenharmony_ci result = vir_FDY(c, src[0]); 1748bf215546Sopenharmony_ci break; 1749bf215546Sopenharmony_ci 1750bf215546Sopenharmony_ci case nir_op_uadd_carry: 1751bf215546Sopenharmony_ci vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]), 1752bf215546Sopenharmony_ci V3D_QPU_PF_PUSHC); 1753bf215546Sopenharmony_ci result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA); 1754bf215546Sopenharmony_ci break; 1755bf215546Sopenharmony_ci 1756bf215546Sopenharmony_ci case nir_op_usub_borrow: 1757bf215546Sopenharmony_ci vir_set_pf(c, vir_SUB_dest(c, vir_nop_reg(), src[0], src[1]), 1758bf215546Sopenharmony_ci V3D_QPU_PF_PUSHC); 1759bf215546Sopenharmony_ci result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA); 1760bf215546Sopenharmony_ci break; 1761bf215546Sopenharmony_ci 1762bf215546Sopenharmony_ci case nir_op_pack_half_2x16_split: 1763bf215546Sopenharmony_ci result = vir_VFPACK(c, src[0], src[1]); 1764bf215546Sopenharmony_ci break; 1765bf215546Sopenharmony_ci 1766bf215546Sopenharmony_ci case nir_op_unpack_half_2x16_split_x: 1767bf215546Sopenharmony_ci result = vir_FMOV(c, src[0]); 1768bf215546Sopenharmony_ci vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); 1769bf215546Sopenharmony_ci break; 1770bf215546Sopenharmony_ci 1771bf215546Sopenharmony_ci case nir_op_unpack_half_2x16_split_y: 1772bf215546Sopenharmony_ci result = vir_FMOV(c, src[0]); 1773bf215546Sopenharmony_ci vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H); 1774bf215546Sopenharmony_ci break; 1775bf215546Sopenharmony_ci 1776bf215546Sopenharmony_ci case nir_op_fquantize2f16: { 1777bf215546Sopenharmony_ci /* F32 -> F16 -> F32 conversion */ 1778bf215546Sopenharmony_ci struct qreg tmp = vir_FMOV(c, src[0]); 1779bf215546Sopenharmony_ci vir_set_pack(c->defs[tmp.index], V3D_QPU_PACK_L); 1780bf215546Sopenharmony_ci tmp = vir_FMOV(c, tmp); 1781bf215546Sopenharmony_ci vir_set_unpack(c->defs[tmp.index], 0, V3D_QPU_UNPACK_L); 1782bf215546Sopenharmony_ci 1783bf215546Sopenharmony_ci /* Check for denorm */ 1784bf215546Sopenharmony_ci struct qreg abs_src = vir_FMOV(c, src[0]); 1785bf215546Sopenharmony_ci vir_set_unpack(c->defs[abs_src.index], 0, V3D_QPU_UNPACK_ABS); 1786bf215546Sopenharmony_ci struct qreg threshold = vir_uniform_f(c, ldexpf(1.0, -14)); 1787bf215546Sopenharmony_ci vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), abs_src, threshold), 1788bf215546Sopenharmony_ci V3D_QPU_PF_PUSHC); 1789bf215546Sopenharmony_ci 1790bf215546Sopenharmony_ci /* Return +/-0 for denorms */ 1791bf215546Sopenharmony_ci struct qreg zero = 1792bf215546Sopenharmony_ci vir_AND(c, src[0], vir_uniform_ui(c, 0x80000000)); 1793bf215546Sopenharmony_ci result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero)); 1794bf215546Sopenharmony_ci break; 1795bf215546Sopenharmony_ci } 1796bf215546Sopenharmony_ci 1797bf215546Sopenharmony_ci default: 1798bf215546Sopenharmony_ci fprintf(stderr, "unknown NIR ALU inst: "); 1799bf215546Sopenharmony_ci nir_print_instr(&instr->instr, stderr); 1800bf215546Sopenharmony_ci fprintf(stderr, "\n"); 1801bf215546Sopenharmony_ci abort(); 1802bf215546Sopenharmony_ci } 1803bf215546Sopenharmony_ci 1804bf215546Sopenharmony_ci /* We have a scalar result, so the instruction should only have a 1805bf215546Sopenharmony_ci * single channel written to. 1806bf215546Sopenharmony_ci */ 1807bf215546Sopenharmony_ci assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); 1808bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest.dest, 1809bf215546Sopenharmony_ci ffs(instr->dest.write_mask) - 1, result); 1810bf215546Sopenharmony_ci} 1811bf215546Sopenharmony_ci 1812bf215546Sopenharmony_ci/* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit 1813bf215546Sopenharmony_ci * specifier. They come from a register that's preloaded with 0xffffffff 1814bf215546Sopenharmony_ci * (0xff gets you normal vec4 f16 RT0 writes), and when one is neaded the low 1815bf215546Sopenharmony_ci * 8 bits are shifted off the bottom and 0xff shifted in from the top. 1816bf215546Sopenharmony_ci */ 1817bf215546Sopenharmony_ci#define TLB_TYPE_F16_COLOR (3 << 6) 1818bf215546Sopenharmony_ci#define TLB_TYPE_I32_COLOR (1 << 6) 1819bf215546Sopenharmony_ci#define TLB_TYPE_F32_COLOR (0 << 6) 1820bf215546Sopenharmony_ci#define TLB_RENDER_TARGET_SHIFT 3 /* Reversed! 7 = RT 0, 0 = RT 7. */ 1821bf215546Sopenharmony_ci#define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2) 1822bf215546Sopenharmony_ci#define TLB_SAMPLE_MODE_PER_PIXEL (1 << 2) 1823bf215546Sopenharmony_ci#define TLB_F16_SWAP_HI_LO (1 << 1) 1824bf215546Sopenharmony_ci#define TLB_VEC_SIZE_4_F16 (1 << 0) 1825bf215546Sopenharmony_ci#define TLB_VEC_SIZE_2_F16 (0 << 0) 1826bf215546Sopenharmony_ci#define TLB_VEC_SIZE_MINUS_1_SHIFT 0 1827bf215546Sopenharmony_ci 1828bf215546Sopenharmony_ci/* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z" 1829bf215546Sopenharmony_ci * flag is set. 1830bf215546Sopenharmony_ci */ 1831bf215546Sopenharmony_ci#define TLB_TYPE_DEPTH ((2 << 6) | (0 << 4)) 1832bf215546Sopenharmony_ci#define TLB_DEPTH_TYPE_INVARIANT (0 << 2) /* Unmodified sideband input used */ 1833bf215546Sopenharmony_ci#define TLB_DEPTH_TYPE_PER_PIXEL (1 << 2) /* QPU result used */ 1834bf215546Sopenharmony_ci#define TLB_V42_DEPTH_TYPE_INVARIANT (0 << 3) /* Unmodified sideband input used */ 1835bf215546Sopenharmony_ci#define TLB_V42_DEPTH_TYPE_PER_PIXEL (1 << 3) /* QPU result used */ 1836bf215546Sopenharmony_ci 1837bf215546Sopenharmony_ci/* Stencil is a single 32-bit write. */ 1838bf215546Sopenharmony_ci#define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4)) 1839bf215546Sopenharmony_ci 1840bf215546Sopenharmony_cistatic void 1841bf215546Sopenharmony_civir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt) 1842bf215546Sopenharmony_ci{ 1843bf215546Sopenharmony_ci if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt]) 1844bf215546Sopenharmony_ci return; 1845bf215546Sopenharmony_ci 1846bf215546Sopenharmony_ci struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB); 1847bf215546Sopenharmony_ci struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU); 1848bf215546Sopenharmony_ci 1849bf215546Sopenharmony_ci nir_variable *var = c->output_color_var[rt]; 1850bf215546Sopenharmony_ci int num_components = glsl_get_vector_elements(var->type); 1851bf215546Sopenharmony_ci uint32_t conf = 0xffffff00; 1852bf215546Sopenharmony_ci struct qinst *inst; 1853bf215546Sopenharmony_ci 1854bf215546Sopenharmony_ci conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE : 1855bf215546Sopenharmony_ci TLB_SAMPLE_MODE_PER_PIXEL; 1856bf215546Sopenharmony_ci conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT; 1857bf215546Sopenharmony_ci 1858bf215546Sopenharmony_ci if (c->fs_key->swap_color_rb & (1 << rt)) 1859bf215546Sopenharmony_ci num_components = MAX2(num_components, 3); 1860bf215546Sopenharmony_ci assert(num_components != 0); 1861bf215546Sopenharmony_ci 1862bf215546Sopenharmony_ci enum glsl_base_type type = glsl_get_base_type(var->type); 1863bf215546Sopenharmony_ci bool is_int_format = type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT; 1864bf215546Sopenharmony_ci bool is_32b_tlb_format = is_int_format || 1865bf215546Sopenharmony_ci (c->fs_key->f32_color_rb & (1 << rt)); 1866bf215546Sopenharmony_ci 1867bf215546Sopenharmony_ci if (is_int_format) { 1868bf215546Sopenharmony_ci /* The F32 vs I32 distinction was dropped in 4.2. */ 1869bf215546Sopenharmony_ci if (c->devinfo->ver < 42) 1870bf215546Sopenharmony_ci conf |= TLB_TYPE_I32_COLOR; 1871bf215546Sopenharmony_ci else 1872bf215546Sopenharmony_ci conf |= TLB_TYPE_F32_COLOR; 1873bf215546Sopenharmony_ci conf |= ((num_components - 1) << TLB_VEC_SIZE_MINUS_1_SHIFT); 1874bf215546Sopenharmony_ci } else { 1875bf215546Sopenharmony_ci if (c->fs_key->f32_color_rb & (1 << rt)) { 1876bf215546Sopenharmony_ci conf |= TLB_TYPE_F32_COLOR; 1877bf215546Sopenharmony_ci conf |= ((num_components - 1) << 1878bf215546Sopenharmony_ci TLB_VEC_SIZE_MINUS_1_SHIFT); 1879bf215546Sopenharmony_ci } else { 1880bf215546Sopenharmony_ci conf |= TLB_TYPE_F16_COLOR; 1881bf215546Sopenharmony_ci conf |= TLB_F16_SWAP_HI_LO; 1882bf215546Sopenharmony_ci if (num_components >= 3) 1883bf215546Sopenharmony_ci conf |= TLB_VEC_SIZE_4_F16; 1884bf215546Sopenharmony_ci else 1885bf215546Sopenharmony_ci conf |= TLB_VEC_SIZE_2_F16; 1886bf215546Sopenharmony_ci } 1887bf215546Sopenharmony_ci } 1888bf215546Sopenharmony_ci 1889bf215546Sopenharmony_ci int num_samples = c->msaa_per_sample_output ? V3D_MAX_SAMPLES : 1; 1890bf215546Sopenharmony_ci for (int i = 0; i < num_samples; i++) { 1891bf215546Sopenharmony_ci struct qreg *color = c->msaa_per_sample_output ? 1892bf215546Sopenharmony_ci &c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] : 1893bf215546Sopenharmony_ci &c->outputs[var->data.driver_location * 4]; 1894bf215546Sopenharmony_ci 1895bf215546Sopenharmony_ci struct qreg r = color[0]; 1896bf215546Sopenharmony_ci struct qreg g = color[1]; 1897bf215546Sopenharmony_ci struct qreg b = color[2]; 1898bf215546Sopenharmony_ci struct qreg a = color[3]; 1899bf215546Sopenharmony_ci 1900bf215546Sopenharmony_ci if (c->fs_key->swap_color_rb & (1 << rt)) { 1901bf215546Sopenharmony_ci r = color[2]; 1902bf215546Sopenharmony_ci b = color[0]; 1903bf215546Sopenharmony_ci } 1904bf215546Sopenharmony_ci 1905bf215546Sopenharmony_ci if (c->fs_key->sample_alpha_to_one) 1906bf215546Sopenharmony_ci a = vir_uniform_f(c, 1.0); 1907bf215546Sopenharmony_ci 1908bf215546Sopenharmony_ci if (is_32b_tlb_format) { 1909bf215546Sopenharmony_ci if (i == 0) { 1910bf215546Sopenharmony_ci inst = vir_MOV_dest(c, tlbu_reg, r); 1911bf215546Sopenharmony_ci inst->uniform = 1912bf215546Sopenharmony_ci vir_get_uniform_index(c, 1913bf215546Sopenharmony_ci QUNIFORM_CONSTANT, 1914bf215546Sopenharmony_ci conf); 1915bf215546Sopenharmony_ci } else { 1916bf215546Sopenharmony_ci vir_MOV_dest(c, tlb_reg, r); 1917bf215546Sopenharmony_ci } 1918bf215546Sopenharmony_ci 1919bf215546Sopenharmony_ci if (num_components >= 2) 1920bf215546Sopenharmony_ci vir_MOV_dest(c, tlb_reg, g); 1921bf215546Sopenharmony_ci if (num_components >= 3) 1922bf215546Sopenharmony_ci vir_MOV_dest(c, tlb_reg, b); 1923bf215546Sopenharmony_ci if (num_components >= 4) 1924bf215546Sopenharmony_ci vir_MOV_dest(c, tlb_reg, a); 1925bf215546Sopenharmony_ci } else { 1926bf215546Sopenharmony_ci inst = vir_VFPACK_dest(c, tlb_reg, r, g); 1927bf215546Sopenharmony_ci if (conf != ~0 && i == 0) { 1928bf215546Sopenharmony_ci inst->dst = tlbu_reg; 1929bf215546Sopenharmony_ci inst->uniform = 1930bf215546Sopenharmony_ci vir_get_uniform_index(c, 1931bf215546Sopenharmony_ci QUNIFORM_CONSTANT, 1932bf215546Sopenharmony_ci conf); 1933bf215546Sopenharmony_ci } 1934bf215546Sopenharmony_ci 1935bf215546Sopenharmony_ci if (num_components >= 3) 1936bf215546Sopenharmony_ci vir_VFPACK_dest(c, tlb_reg, b, a); 1937bf215546Sopenharmony_ci } 1938bf215546Sopenharmony_ci } 1939bf215546Sopenharmony_ci} 1940bf215546Sopenharmony_ci 1941bf215546Sopenharmony_cistatic void 1942bf215546Sopenharmony_ciemit_frag_end(struct v3d_compile *c) 1943bf215546Sopenharmony_ci{ 1944bf215546Sopenharmony_ci if (c->output_sample_mask_index != -1) { 1945bf215546Sopenharmony_ci vir_SETMSF_dest(c, vir_nop_reg(), 1946bf215546Sopenharmony_ci vir_AND(c, 1947bf215546Sopenharmony_ci vir_MSF(c), 1948bf215546Sopenharmony_ci c->outputs[c->output_sample_mask_index])); 1949bf215546Sopenharmony_ci } 1950bf215546Sopenharmony_ci 1951bf215546Sopenharmony_ci bool has_any_tlb_color_write = false; 1952bf215546Sopenharmony_ci for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) { 1953bf215546Sopenharmony_ci if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt]) 1954bf215546Sopenharmony_ci has_any_tlb_color_write = true; 1955bf215546Sopenharmony_ci } 1956bf215546Sopenharmony_ci 1957bf215546Sopenharmony_ci if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) { 1958bf215546Sopenharmony_ci struct nir_variable *var = c->output_color_var[0]; 1959bf215546Sopenharmony_ci struct qreg *color = &c->outputs[var->data.driver_location * 4]; 1960bf215546Sopenharmony_ci 1961bf215546Sopenharmony_ci vir_SETMSF_dest(c, vir_nop_reg(), 1962bf215546Sopenharmony_ci vir_AND(c, 1963bf215546Sopenharmony_ci vir_MSF(c), 1964bf215546Sopenharmony_ci vir_FTOC(c, color[3]))); 1965bf215546Sopenharmony_ci } 1966bf215546Sopenharmony_ci 1967bf215546Sopenharmony_ci struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU); 1968bf215546Sopenharmony_ci 1969bf215546Sopenharmony_ci /* If the shader has no non-TLB side effects and doesn't write Z 1970bf215546Sopenharmony_ci * we can promote it to enabling early_fragment_tests even 1971bf215546Sopenharmony_ci * if the user didn't. 1972bf215546Sopenharmony_ci */ 1973bf215546Sopenharmony_ci if (c->output_position_index == -1 && 1974bf215546Sopenharmony_ci !(c->s->info.num_images || c->s->info.num_ssbos) && 1975bf215546Sopenharmony_ci !c->s->info.fs.uses_discard && 1976bf215546Sopenharmony_ci !c->fs_key->sample_alpha_to_coverage && 1977bf215546Sopenharmony_ci c->output_sample_mask_index == -1 && 1978bf215546Sopenharmony_ci has_any_tlb_color_write) { 1979bf215546Sopenharmony_ci c->s->info.fs.early_fragment_tests = true; 1980bf215546Sopenharmony_ci } 1981bf215546Sopenharmony_ci 1982bf215546Sopenharmony_ci /* By default, Z buffer writes are implicit using the Z values produced 1983bf215546Sopenharmony_ci * from FEP (Z value produced from rasterization). When this is not 1984bf215546Sopenharmony_ci * desirable (shader writes Z explicitly, has discards, etc) we need 1985bf215546Sopenharmony_ci * to let the hardware know by setting c->writes_z to true, in which 1986bf215546Sopenharmony_ci * case we always need to write a Z value from the QPU, even if it is 1987bf215546Sopenharmony_ci * just the passthrough Z value produced from FEP. 1988bf215546Sopenharmony_ci * 1989bf215546Sopenharmony_ci * Also, from the V3D 4.2 spec: 1990bf215546Sopenharmony_ci * 1991bf215546Sopenharmony_ci * "If a shader performs a Z read the “Fragment shader does Z writes” 1992bf215546Sopenharmony_ci * bit in the shader record must be enabled to ensure deterministic 1993bf215546Sopenharmony_ci * results" 1994bf215546Sopenharmony_ci * 1995bf215546Sopenharmony_ci * So if c->reads_z is set we always need to write Z, even if it is 1996bf215546Sopenharmony_ci * a passthrough from the Z value produced from FEP. 1997bf215546Sopenharmony_ci */ 1998bf215546Sopenharmony_ci if (!c->s->info.fs.early_fragment_tests || c->reads_z) { 1999bf215546Sopenharmony_ci c->writes_z = true; 2000bf215546Sopenharmony_ci uint8_t tlb_specifier = TLB_TYPE_DEPTH; 2001bf215546Sopenharmony_ci struct qinst *inst; 2002bf215546Sopenharmony_ci 2003bf215546Sopenharmony_ci if (c->output_position_index != -1) { 2004bf215546Sopenharmony_ci /* Shader writes to gl_FragDepth, use that */ 2005bf215546Sopenharmony_ci inst = vir_MOV_dest(c, tlbu_reg, 2006bf215546Sopenharmony_ci c->outputs[c->output_position_index]); 2007bf215546Sopenharmony_ci 2008bf215546Sopenharmony_ci if (c->devinfo->ver >= 42) { 2009bf215546Sopenharmony_ci tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL | 2010bf215546Sopenharmony_ci TLB_SAMPLE_MODE_PER_PIXEL); 2011bf215546Sopenharmony_ci } else { 2012bf215546Sopenharmony_ci tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL; 2013bf215546Sopenharmony_ci } 2014bf215546Sopenharmony_ci } else { 2015bf215546Sopenharmony_ci /* Shader doesn't write to gl_FragDepth, take Z from 2016bf215546Sopenharmony_ci * FEP. 2017bf215546Sopenharmony_ci */ 2018bf215546Sopenharmony_ci c->writes_z_from_fep = true; 2019bf215546Sopenharmony_ci inst = vir_MOV_dest(c, tlbu_reg, vir_nop_reg()); 2020bf215546Sopenharmony_ci 2021bf215546Sopenharmony_ci if (c->devinfo->ver >= 42) { 2022bf215546Sopenharmony_ci /* The spec says the PER_PIXEL flag is ignored 2023bf215546Sopenharmony_ci * for invariant writes, but the simulator 2024bf215546Sopenharmony_ci * demands it. 2025bf215546Sopenharmony_ci */ 2026bf215546Sopenharmony_ci tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT | 2027bf215546Sopenharmony_ci TLB_SAMPLE_MODE_PER_PIXEL); 2028bf215546Sopenharmony_ci } else { 2029bf215546Sopenharmony_ci tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT; 2030bf215546Sopenharmony_ci } 2031bf215546Sopenharmony_ci 2032bf215546Sopenharmony_ci /* Since (single-threaded) fragment shaders always need 2033bf215546Sopenharmony_ci * a TLB write, if we dond't have any we emit a 2034bf215546Sopenharmony_ci * passthrouh Z and flag us as potentially discarding, 2035bf215546Sopenharmony_ci * so that we can use Z as the required TLB write. 2036bf215546Sopenharmony_ci */ 2037bf215546Sopenharmony_ci if (!has_any_tlb_color_write) 2038bf215546Sopenharmony_ci c->s->info.fs.uses_discard = true; 2039bf215546Sopenharmony_ci } 2040bf215546Sopenharmony_ci 2041bf215546Sopenharmony_ci inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 2042bf215546Sopenharmony_ci tlb_specifier | 2043bf215546Sopenharmony_ci 0xffffff00); 2044bf215546Sopenharmony_ci inst->is_tlb_z_write = true; 2045bf215546Sopenharmony_ci } 2046bf215546Sopenharmony_ci 2047bf215546Sopenharmony_ci /* XXX: Performance improvement: Merge Z write and color writes TLB 2048bf215546Sopenharmony_ci * uniform setup 2049bf215546Sopenharmony_ci */ 2050bf215546Sopenharmony_ci for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) 2051bf215546Sopenharmony_ci vir_emit_tlb_color_write(c, rt); 2052bf215546Sopenharmony_ci} 2053bf215546Sopenharmony_ci 2054bf215546Sopenharmony_cistatic inline void 2055bf215546Sopenharmony_civir_VPM_WRITE_indirect(struct v3d_compile *c, 2056bf215546Sopenharmony_ci struct qreg val, 2057bf215546Sopenharmony_ci struct qreg vpm_index, 2058bf215546Sopenharmony_ci bool uniform_vpm_index) 2059bf215546Sopenharmony_ci{ 2060bf215546Sopenharmony_ci assert(c->devinfo->ver >= 40); 2061bf215546Sopenharmony_ci if (uniform_vpm_index) 2062bf215546Sopenharmony_ci vir_STVPMV(c, vpm_index, val); 2063bf215546Sopenharmony_ci else 2064bf215546Sopenharmony_ci vir_STVPMD(c, vpm_index, val); 2065bf215546Sopenharmony_ci} 2066bf215546Sopenharmony_ci 2067bf215546Sopenharmony_cistatic void 2068bf215546Sopenharmony_civir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index) 2069bf215546Sopenharmony_ci{ 2070bf215546Sopenharmony_ci if (c->devinfo->ver >= 40) { 2071bf215546Sopenharmony_ci vir_VPM_WRITE_indirect(c, val, 2072bf215546Sopenharmony_ci vir_uniform_ui(c, vpm_index), true); 2073bf215546Sopenharmony_ci } else { 2074bf215546Sopenharmony_ci /* XXX: v3d33_vir_vpm_write_setup(c); */ 2075bf215546Sopenharmony_ci vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); 2076bf215546Sopenharmony_ci } 2077bf215546Sopenharmony_ci} 2078bf215546Sopenharmony_ci 2079bf215546Sopenharmony_cistatic void 2080bf215546Sopenharmony_ciemit_vert_end(struct v3d_compile *c) 2081bf215546Sopenharmony_ci{ 2082bf215546Sopenharmony_ci /* GFXH-1684: VPM writes need to be complete by the end of the shader. 2083bf215546Sopenharmony_ci */ 2084bf215546Sopenharmony_ci if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) 2085bf215546Sopenharmony_ci vir_VPMWT(c); 2086bf215546Sopenharmony_ci} 2087bf215546Sopenharmony_ci 2088bf215546Sopenharmony_cistatic void 2089bf215546Sopenharmony_ciemit_geom_end(struct v3d_compile *c) 2090bf215546Sopenharmony_ci{ 2091bf215546Sopenharmony_ci /* GFXH-1684: VPM writes need to be complete by the end of the shader. 2092bf215546Sopenharmony_ci */ 2093bf215546Sopenharmony_ci if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) 2094bf215546Sopenharmony_ci vir_VPMWT(c); 2095bf215546Sopenharmony_ci} 2096bf215546Sopenharmony_ci 2097bf215546Sopenharmony_cistatic bool 2098bf215546Sopenharmony_cimem_vectorize_callback(unsigned align_mul, unsigned align_offset, 2099bf215546Sopenharmony_ci unsigned bit_size, 2100bf215546Sopenharmony_ci unsigned num_components, 2101bf215546Sopenharmony_ci nir_intrinsic_instr *low, 2102bf215546Sopenharmony_ci nir_intrinsic_instr *high, 2103bf215546Sopenharmony_ci void *data) 2104bf215546Sopenharmony_ci{ 2105bf215546Sopenharmony_ci /* TMU general access only supports 32-bit vectors */ 2106bf215546Sopenharmony_ci if (bit_size > 32) 2107bf215546Sopenharmony_ci return false; 2108bf215546Sopenharmony_ci 2109bf215546Sopenharmony_ci if ((bit_size == 8 || bit_size == 16) && num_components > 1) 2110bf215546Sopenharmony_ci return false; 2111bf215546Sopenharmony_ci 2112bf215546Sopenharmony_ci if (align_mul % 4 != 0 || align_offset % 4 != 0) 2113bf215546Sopenharmony_ci return false; 2114bf215546Sopenharmony_ci 2115bf215546Sopenharmony_ci /* Vector accesses wrap at 16-byte boundaries so we can't vectorize 2116bf215546Sopenharmony_ci * if the resulting vector crosses a 16-byte boundary. 2117bf215546Sopenharmony_ci */ 2118bf215546Sopenharmony_ci assert(util_is_power_of_two_nonzero(align_mul)); 2119bf215546Sopenharmony_ci align_mul = MIN2(align_mul, 16); 2120bf215546Sopenharmony_ci align_offset &= 0xf; 2121bf215546Sopenharmony_ci if (16 - align_mul + align_offset + num_components * 4 > 16) 2122bf215546Sopenharmony_ci return false; 2123bf215546Sopenharmony_ci 2124bf215546Sopenharmony_ci return true; 2125bf215546Sopenharmony_ci} 2126bf215546Sopenharmony_ci 2127bf215546Sopenharmony_civoid 2128bf215546Sopenharmony_civ3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) 2129bf215546Sopenharmony_ci{ 2130bf215546Sopenharmony_ci bool progress; 2131bf215546Sopenharmony_ci unsigned lower_flrp = 2132bf215546Sopenharmony_ci (s->options->lower_flrp16 ? 16 : 0) | 2133bf215546Sopenharmony_ci (s->options->lower_flrp32 ? 32 : 0) | 2134bf215546Sopenharmony_ci (s->options->lower_flrp64 ? 64 : 0); 2135bf215546Sopenharmony_ci 2136bf215546Sopenharmony_ci do { 2137bf215546Sopenharmony_ci progress = false; 2138bf215546Sopenharmony_ci 2139bf215546Sopenharmony_ci NIR_PASS(progress, s, nir_lower_vars_to_ssa); 2140bf215546Sopenharmony_ci NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); 2141bf215546Sopenharmony_ci NIR_PASS(progress, s, nir_lower_phis_to_scalar, false); 2142bf215546Sopenharmony_ci NIR_PASS(progress, s, nir_copy_prop); 2143bf215546Sopenharmony_ci NIR_PASS(progress, s, nir_opt_remove_phis); 2144bf215546Sopenharmony_ci NIR_PASS(progress, s, nir_opt_dce); 2145bf215546Sopenharmony_ci NIR_PASS(progress, s, nir_opt_dead_cf); 2146bf215546Sopenharmony_ci NIR_PASS(progress, s, nir_opt_cse); 2147bf215546Sopenharmony_ci NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); 2148bf215546Sopenharmony_ci NIR_PASS(progress, s, nir_opt_algebraic); 2149bf215546Sopenharmony_ci NIR_PASS(progress, s, nir_opt_constant_folding); 2150bf215546Sopenharmony_ci 2151bf215546Sopenharmony_ci /* Note that vectorization may undo the load/store scalarization 2152bf215546Sopenharmony_ci * pass we run for non 32-bit TMU general load/store by 2153bf215546Sopenharmony_ci * converting, for example, 2 consecutive 16-bit loads into a 2154bf215546Sopenharmony_ci * single 32-bit load. This is fine (and desirable) as long as 2155bf215546Sopenharmony_ci * the resulting 32-bit load meets 32-bit alignment requirements, 2156bf215546Sopenharmony_ci * which mem_vectorize_callback() should be enforcing. 2157bf215546Sopenharmony_ci */ 2158bf215546Sopenharmony_ci nir_load_store_vectorize_options vectorize_opts = { 2159bf215546Sopenharmony_ci .modes = nir_var_mem_ssbo | nir_var_mem_ubo | 2160bf215546Sopenharmony_ci nir_var_mem_push_const | nir_var_mem_shared | 2161bf215546Sopenharmony_ci nir_var_mem_global, 2162bf215546Sopenharmony_ci .callback = mem_vectorize_callback, 2163bf215546Sopenharmony_ci .robust_modes = 0, 2164bf215546Sopenharmony_ci }; 2165bf215546Sopenharmony_ci bool vectorize_progress = false; 2166bf215546Sopenharmony_ci NIR_PASS(vectorize_progress, s, nir_opt_load_store_vectorize, 2167bf215546Sopenharmony_ci &vectorize_opts); 2168bf215546Sopenharmony_ci if (vectorize_progress) { 2169bf215546Sopenharmony_ci NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); 2170bf215546Sopenharmony_ci NIR_PASS(progress, s, nir_lower_pack); 2171bf215546Sopenharmony_ci progress = true; 2172bf215546Sopenharmony_ci } 2173bf215546Sopenharmony_ci 2174bf215546Sopenharmony_ci if (lower_flrp != 0) { 2175bf215546Sopenharmony_ci bool lower_flrp_progress = false; 2176bf215546Sopenharmony_ci 2177bf215546Sopenharmony_ci NIR_PASS(lower_flrp_progress, s, nir_lower_flrp, 2178bf215546Sopenharmony_ci lower_flrp, 2179bf215546Sopenharmony_ci false /* always_precise */); 2180bf215546Sopenharmony_ci if (lower_flrp_progress) { 2181bf215546Sopenharmony_ci NIR_PASS(progress, s, nir_opt_constant_folding); 2182bf215546Sopenharmony_ci progress = true; 2183bf215546Sopenharmony_ci } 2184bf215546Sopenharmony_ci 2185bf215546Sopenharmony_ci /* Nothing should rematerialize any flrps, so we only 2186bf215546Sopenharmony_ci * need to do this lowering once. 2187bf215546Sopenharmony_ci */ 2188bf215546Sopenharmony_ci lower_flrp = 0; 2189bf215546Sopenharmony_ci } 2190bf215546Sopenharmony_ci 2191bf215546Sopenharmony_ci NIR_PASS(progress, s, nir_opt_undef); 2192bf215546Sopenharmony_ci NIR_PASS(progress, s, nir_lower_undef_to_zero); 2193bf215546Sopenharmony_ci 2194bf215546Sopenharmony_ci if (c && !c->disable_loop_unrolling && 2195bf215546Sopenharmony_ci s->options->max_unroll_iterations > 0) { 2196bf215546Sopenharmony_ci bool local_progress = false; 2197bf215546Sopenharmony_ci NIR_PASS(local_progress, s, nir_opt_loop_unroll); 2198bf215546Sopenharmony_ci c->unrolled_any_loops |= local_progress; 2199bf215546Sopenharmony_ci progress |= local_progress; 2200bf215546Sopenharmony_ci } 2201bf215546Sopenharmony_ci } while (progress); 2202bf215546Sopenharmony_ci 2203bf215546Sopenharmony_ci nir_move_options sink_opts = 2204bf215546Sopenharmony_ci nir_move_const_undef | nir_move_comparisons | nir_move_copies | 2205bf215546Sopenharmony_ci nir_move_load_ubo | nir_move_load_ssbo | nir_move_load_uniform; 2206bf215546Sopenharmony_ci NIR_PASS(progress, s, nir_opt_sink, sink_opts); 2207bf215546Sopenharmony_ci} 2208bf215546Sopenharmony_ci 2209bf215546Sopenharmony_cistatic int 2210bf215546Sopenharmony_cidriver_location_compare(const nir_variable *a, const nir_variable *b) 2211bf215546Sopenharmony_ci{ 2212bf215546Sopenharmony_ci return a->data.driver_location == b->data.driver_location ? 2213bf215546Sopenharmony_ci a->data.location_frac - b->data.location_frac : 2214bf215546Sopenharmony_ci a->data.driver_location - b->data.driver_location; 2215bf215546Sopenharmony_ci} 2216bf215546Sopenharmony_ci 2217bf215546Sopenharmony_cistatic struct qreg 2218bf215546Sopenharmony_cintq_emit_vpm_read(struct v3d_compile *c, 2219bf215546Sopenharmony_ci uint32_t *num_components_queued, 2220bf215546Sopenharmony_ci uint32_t *remaining, 2221bf215546Sopenharmony_ci uint32_t vpm_index) 2222bf215546Sopenharmony_ci{ 2223bf215546Sopenharmony_ci struct qreg vpm = vir_reg(QFILE_VPM, vpm_index); 2224bf215546Sopenharmony_ci 2225bf215546Sopenharmony_ci if (c->devinfo->ver >= 40 ) { 2226bf215546Sopenharmony_ci return vir_LDVPMV_IN(c, 2227bf215546Sopenharmony_ci vir_uniform_ui(c, 2228bf215546Sopenharmony_ci (*num_components_queued)++)); 2229bf215546Sopenharmony_ci } 2230bf215546Sopenharmony_ci 2231bf215546Sopenharmony_ci if (*num_components_queued != 0) { 2232bf215546Sopenharmony_ci (*num_components_queued)--; 2233bf215546Sopenharmony_ci return vir_MOV(c, vpm); 2234bf215546Sopenharmony_ci } 2235bf215546Sopenharmony_ci 2236bf215546Sopenharmony_ci uint32_t num_components = MIN2(*remaining, 32); 2237bf215546Sopenharmony_ci 2238bf215546Sopenharmony_ci v3d33_vir_vpm_read_setup(c, num_components); 2239bf215546Sopenharmony_ci 2240bf215546Sopenharmony_ci *num_components_queued = num_components - 1; 2241bf215546Sopenharmony_ci *remaining -= num_components; 2242bf215546Sopenharmony_ci 2243bf215546Sopenharmony_ci return vir_MOV(c, vpm); 2244bf215546Sopenharmony_ci} 2245bf215546Sopenharmony_ci 2246bf215546Sopenharmony_cistatic void 2247bf215546Sopenharmony_cintq_setup_vs_inputs(struct v3d_compile *c) 2248bf215546Sopenharmony_ci{ 2249bf215546Sopenharmony_ci /* Figure out how many components of each vertex attribute the shader 2250bf215546Sopenharmony_ci * uses. Each variable should have been split to individual 2251bf215546Sopenharmony_ci * components and unused ones DCEed. The vertex fetcher will load 2252bf215546Sopenharmony_ci * from the start of the attribute to the number of components we 2253bf215546Sopenharmony_ci * declare we need in c->vattr_sizes[]. 2254bf215546Sopenharmony_ci * 2255bf215546Sopenharmony_ci * BGRA vertex attributes are a bit special: since we implement these 2256bf215546Sopenharmony_ci * as RGBA swapping R/B components we always need at least 3 components 2257bf215546Sopenharmony_ci * if component 0 is read. 2258bf215546Sopenharmony_ci */ 2259bf215546Sopenharmony_ci nir_foreach_shader_in_variable(var, c->s) { 2260bf215546Sopenharmony_ci /* No VS attribute array support. */ 2261bf215546Sopenharmony_ci assert(MAX2(glsl_get_length(var->type), 1) == 1); 2262bf215546Sopenharmony_ci 2263bf215546Sopenharmony_ci unsigned loc = var->data.driver_location; 2264bf215546Sopenharmony_ci int start_component = var->data.location_frac; 2265bf215546Sopenharmony_ci int num_components = glsl_get_components(var->type); 2266bf215546Sopenharmony_ci 2267bf215546Sopenharmony_ci c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc], 2268bf215546Sopenharmony_ci start_component + num_components); 2269bf215546Sopenharmony_ci 2270bf215546Sopenharmony_ci /* Handle BGRA inputs */ 2271bf215546Sopenharmony_ci if (start_component == 0 && 2272bf215546Sopenharmony_ci c->vs_key->va_swap_rb_mask & (1 << var->data.location)) { 2273bf215546Sopenharmony_ci c->vattr_sizes[loc] = MAX2(3, c->vattr_sizes[loc]); 2274bf215546Sopenharmony_ci } 2275bf215546Sopenharmony_ci } 2276bf215546Sopenharmony_ci 2277bf215546Sopenharmony_ci unsigned num_components = 0; 2278bf215546Sopenharmony_ci uint32_t vpm_components_queued = 0; 2279bf215546Sopenharmony_ci bool uses_iid = BITSET_TEST(c->s->info.system_values_read, 2280bf215546Sopenharmony_ci SYSTEM_VALUE_INSTANCE_ID) || 2281bf215546Sopenharmony_ci BITSET_TEST(c->s->info.system_values_read, 2282bf215546Sopenharmony_ci SYSTEM_VALUE_INSTANCE_INDEX); 2283bf215546Sopenharmony_ci bool uses_biid = BITSET_TEST(c->s->info.system_values_read, 2284bf215546Sopenharmony_ci SYSTEM_VALUE_BASE_INSTANCE); 2285bf215546Sopenharmony_ci bool uses_vid = BITSET_TEST(c->s->info.system_values_read, 2286bf215546Sopenharmony_ci SYSTEM_VALUE_VERTEX_ID) || 2287bf215546Sopenharmony_ci BITSET_TEST(c->s->info.system_values_read, 2288bf215546Sopenharmony_ci SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); 2289bf215546Sopenharmony_ci 2290bf215546Sopenharmony_ci num_components += uses_iid; 2291bf215546Sopenharmony_ci num_components += uses_biid; 2292bf215546Sopenharmony_ci num_components += uses_vid; 2293bf215546Sopenharmony_ci 2294bf215546Sopenharmony_ci for (int i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++) 2295bf215546Sopenharmony_ci num_components += c->vattr_sizes[i]; 2296bf215546Sopenharmony_ci 2297bf215546Sopenharmony_ci if (uses_iid) { 2298bf215546Sopenharmony_ci c->iid = ntq_emit_vpm_read(c, &vpm_components_queued, 2299bf215546Sopenharmony_ci &num_components, ~0); 2300bf215546Sopenharmony_ci } 2301bf215546Sopenharmony_ci 2302bf215546Sopenharmony_ci if (uses_biid) { 2303bf215546Sopenharmony_ci c->biid = ntq_emit_vpm_read(c, &vpm_components_queued, 2304bf215546Sopenharmony_ci &num_components, ~0); 2305bf215546Sopenharmony_ci } 2306bf215546Sopenharmony_ci 2307bf215546Sopenharmony_ci if (uses_vid) { 2308bf215546Sopenharmony_ci c->vid = ntq_emit_vpm_read(c, &vpm_components_queued, 2309bf215546Sopenharmony_ci &num_components, ~0); 2310bf215546Sopenharmony_ci } 2311bf215546Sopenharmony_ci 2312bf215546Sopenharmony_ci /* The actual loads will happen directly in nir_intrinsic_load_input 2313bf215546Sopenharmony_ci * on newer versions. 2314bf215546Sopenharmony_ci */ 2315bf215546Sopenharmony_ci if (c->devinfo->ver >= 40) 2316bf215546Sopenharmony_ci return; 2317bf215546Sopenharmony_ci 2318bf215546Sopenharmony_ci for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) { 2319bf215546Sopenharmony_ci resize_qreg_array(c, &c->inputs, &c->inputs_array_size, 2320bf215546Sopenharmony_ci (loc + 1) * 4); 2321bf215546Sopenharmony_ci 2322bf215546Sopenharmony_ci for (int i = 0; i < c->vattr_sizes[loc]; i++) { 2323bf215546Sopenharmony_ci c->inputs[loc * 4 + i] = 2324bf215546Sopenharmony_ci ntq_emit_vpm_read(c, 2325bf215546Sopenharmony_ci &vpm_components_queued, 2326bf215546Sopenharmony_ci &num_components, 2327bf215546Sopenharmony_ci loc * 4 + i); 2328bf215546Sopenharmony_ci 2329bf215546Sopenharmony_ci } 2330bf215546Sopenharmony_ci } 2331bf215546Sopenharmony_ci 2332bf215546Sopenharmony_ci if (c->devinfo->ver >= 40) { 2333bf215546Sopenharmony_ci assert(vpm_components_queued == num_components); 2334bf215546Sopenharmony_ci } else { 2335bf215546Sopenharmony_ci assert(vpm_components_queued == 0); 2336bf215546Sopenharmony_ci assert(num_components == 0); 2337bf215546Sopenharmony_ci } 2338bf215546Sopenharmony_ci} 2339bf215546Sopenharmony_ci 2340bf215546Sopenharmony_cistatic bool 2341bf215546Sopenharmony_ciprogram_reads_point_coord(struct v3d_compile *c) 2342bf215546Sopenharmony_ci{ 2343bf215546Sopenharmony_ci nir_foreach_shader_in_variable(var, c->s) { 2344bf215546Sopenharmony_ci if (util_varying_is_point_coord(var->data.location, 2345bf215546Sopenharmony_ci c->fs_key->point_sprite_mask)) { 2346bf215546Sopenharmony_ci return true; 2347bf215546Sopenharmony_ci } 2348bf215546Sopenharmony_ci } 2349bf215546Sopenharmony_ci 2350bf215546Sopenharmony_ci return false; 2351bf215546Sopenharmony_ci} 2352bf215546Sopenharmony_ci 2353bf215546Sopenharmony_cistatic void 2354bf215546Sopenharmony_cintq_setup_gs_inputs(struct v3d_compile *c) 2355bf215546Sopenharmony_ci{ 2356bf215546Sopenharmony_ci nir_sort_variables_with_modes(c->s, driver_location_compare, 2357bf215546Sopenharmony_ci nir_var_shader_in); 2358bf215546Sopenharmony_ci 2359bf215546Sopenharmony_ci nir_foreach_shader_in_variable(var, c->s) { 2360bf215546Sopenharmony_ci /* All GS inputs are arrays with as many entries as vertices 2361bf215546Sopenharmony_ci * in the input primitive, but here we only care about the 2362bf215546Sopenharmony_ci * per-vertex input type. 2363bf215546Sopenharmony_ci */ 2364bf215546Sopenharmony_ci assert(glsl_type_is_array(var->type)); 2365bf215546Sopenharmony_ci const struct glsl_type *type = glsl_get_array_element(var->type); 2366bf215546Sopenharmony_ci unsigned var_len = glsl_count_vec4_slots(type, false, false); 2367bf215546Sopenharmony_ci unsigned loc = var->data.driver_location; 2368bf215546Sopenharmony_ci 2369bf215546Sopenharmony_ci resize_qreg_array(c, &c->inputs, &c->inputs_array_size, 2370bf215546Sopenharmony_ci (loc + var_len) * 4); 2371bf215546Sopenharmony_ci 2372bf215546Sopenharmony_ci if (var->data.compact) { 2373bf215546Sopenharmony_ci for (unsigned j = 0; j < var_len; j++) { 2374bf215546Sopenharmony_ci unsigned input_idx = c->num_inputs++; 2375bf215546Sopenharmony_ci unsigned loc_frac = var->data.location_frac + j; 2376bf215546Sopenharmony_ci unsigned loc = var->data.location + loc_frac / 4; 2377bf215546Sopenharmony_ci unsigned comp = loc_frac % 4; 2378bf215546Sopenharmony_ci c->input_slots[input_idx] = 2379bf215546Sopenharmony_ci v3d_slot_from_slot_and_component(loc, comp); 2380bf215546Sopenharmony_ci } 2381bf215546Sopenharmony_ci continue; 2382bf215546Sopenharmony_ci } 2383bf215546Sopenharmony_ci 2384bf215546Sopenharmony_ci for (unsigned j = 0; j < var_len; j++) { 2385bf215546Sopenharmony_ci unsigned num_elements = 2386bf215546Sopenharmony_ci glsl_type_is_struct(glsl_without_array(type)) ? 2387bf215546Sopenharmony_ci 4 : glsl_get_vector_elements(type); 2388bf215546Sopenharmony_ci for (unsigned k = 0; k < num_elements; k++) { 2389bf215546Sopenharmony_ci unsigned chan = var->data.location_frac + k; 2390bf215546Sopenharmony_ci unsigned input_idx = c->num_inputs++; 2391bf215546Sopenharmony_ci struct v3d_varying_slot slot = 2392bf215546Sopenharmony_ci v3d_slot_from_slot_and_component(var->data.location + j, chan); 2393bf215546Sopenharmony_ci c->input_slots[input_idx] = slot; 2394bf215546Sopenharmony_ci } 2395bf215546Sopenharmony_ci } 2396bf215546Sopenharmony_ci } 2397bf215546Sopenharmony_ci} 2398bf215546Sopenharmony_ci 2399bf215546Sopenharmony_ci 2400bf215546Sopenharmony_cistatic void 2401bf215546Sopenharmony_cintq_setup_fs_inputs(struct v3d_compile *c) 2402bf215546Sopenharmony_ci{ 2403bf215546Sopenharmony_ci nir_sort_variables_with_modes(c->s, driver_location_compare, 2404bf215546Sopenharmony_ci nir_var_shader_in); 2405bf215546Sopenharmony_ci 2406bf215546Sopenharmony_ci nir_foreach_shader_in_variable(var, c->s) { 2407bf215546Sopenharmony_ci unsigned var_len = glsl_count_vec4_slots(var->type, false, false); 2408bf215546Sopenharmony_ci unsigned loc = var->data.driver_location; 2409bf215546Sopenharmony_ci 2410bf215546Sopenharmony_ci uint32_t inputs_array_size = c->inputs_array_size; 2411bf215546Sopenharmony_ci uint32_t inputs_array_required_size = (loc + var_len) * 4; 2412bf215546Sopenharmony_ci resize_qreg_array(c, &c->inputs, &c->inputs_array_size, 2413bf215546Sopenharmony_ci inputs_array_required_size); 2414bf215546Sopenharmony_ci resize_interp_array(c, &c->interp, &inputs_array_size, 2415bf215546Sopenharmony_ci inputs_array_required_size); 2416bf215546Sopenharmony_ci 2417bf215546Sopenharmony_ci if (var->data.location == VARYING_SLOT_POS) { 2418bf215546Sopenharmony_ci emit_fragcoord_input(c, loc); 2419bf215546Sopenharmony_ci } else if (var->data.location == VARYING_SLOT_PRIMITIVE_ID && 2420bf215546Sopenharmony_ci !c->fs_key->has_gs) { 2421bf215546Sopenharmony_ci /* If the fragment shader reads gl_PrimitiveID and we 2422bf215546Sopenharmony_ci * don't have a geometry shader in the pipeline to write 2423bf215546Sopenharmony_ci * it then we program the hardware to inject it as 2424bf215546Sopenharmony_ci * an implicit varying. Take it from there. 2425bf215546Sopenharmony_ci */ 2426bf215546Sopenharmony_ci c->inputs[loc * 4] = c->primitive_id; 2427bf215546Sopenharmony_ci } else if (util_varying_is_point_coord(var->data.location, 2428bf215546Sopenharmony_ci c->fs_key->point_sprite_mask)) { 2429bf215546Sopenharmony_ci c->inputs[loc * 4 + 0] = c->point_x; 2430bf215546Sopenharmony_ci c->inputs[loc * 4 + 1] = c->point_y; 2431bf215546Sopenharmony_ci } else if (var->data.compact) { 2432bf215546Sopenharmony_ci for (int j = 0; j < var_len; j++) 2433bf215546Sopenharmony_ci emit_compact_fragment_input(c, loc, var, j); 2434bf215546Sopenharmony_ci } else if (glsl_type_is_struct(glsl_without_array(var->type))) { 2435bf215546Sopenharmony_ci for (int j = 0; j < var_len; j++) { 2436bf215546Sopenharmony_ci emit_fragment_input(c, loc, var, j, 4); 2437bf215546Sopenharmony_ci } 2438bf215546Sopenharmony_ci } else { 2439bf215546Sopenharmony_ci for (int j = 0; j < var_len; j++) { 2440bf215546Sopenharmony_ci emit_fragment_input(c, loc, var, j, glsl_get_vector_elements(var->type)); 2441bf215546Sopenharmony_ci } 2442bf215546Sopenharmony_ci } 2443bf215546Sopenharmony_ci } 2444bf215546Sopenharmony_ci} 2445bf215546Sopenharmony_ci 2446bf215546Sopenharmony_cistatic void 2447bf215546Sopenharmony_cintq_setup_outputs(struct v3d_compile *c) 2448bf215546Sopenharmony_ci{ 2449bf215546Sopenharmony_ci if (c->s->info.stage != MESA_SHADER_FRAGMENT) 2450bf215546Sopenharmony_ci return; 2451bf215546Sopenharmony_ci 2452bf215546Sopenharmony_ci nir_foreach_shader_out_variable(var, c->s) { 2453bf215546Sopenharmony_ci unsigned array_len = MAX2(glsl_get_length(var->type), 1); 2454bf215546Sopenharmony_ci unsigned loc = var->data.driver_location * 4; 2455bf215546Sopenharmony_ci 2456bf215546Sopenharmony_ci assert(array_len == 1); 2457bf215546Sopenharmony_ci (void)array_len; 2458bf215546Sopenharmony_ci 2459bf215546Sopenharmony_ci for (int i = 0; i < 4 - var->data.location_frac; i++) { 2460bf215546Sopenharmony_ci add_output(c, loc + var->data.location_frac + i, 2461bf215546Sopenharmony_ci var->data.location, 2462bf215546Sopenharmony_ci var->data.location_frac + i); 2463bf215546Sopenharmony_ci } 2464bf215546Sopenharmony_ci 2465bf215546Sopenharmony_ci switch (var->data.location) { 2466bf215546Sopenharmony_ci case FRAG_RESULT_COLOR: 2467bf215546Sopenharmony_ci c->output_color_var[0] = var; 2468bf215546Sopenharmony_ci c->output_color_var[1] = var; 2469bf215546Sopenharmony_ci c->output_color_var[2] = var; 2470bf215546Sopenharmony_ci c->output_color_var[3] = var; 2471bf215546Sopenharmony_ci break; 2472bf215546Sopenharmony_ci case FRAG_RESULT_DATA0: 2473bf215546Sopenharmony_ci case FRAG_RESULT_DATA1: 2474bf215546Sopenharmony_ci case FRAG_RESULT_DATA2: 2475bf215546Sopenharmony_ci case FRAG_RESULT_DATA3: 2476bf215546Sopenharmony_ci c->output_color_var[var->data.location - 2477bf215546Sopenharmony_ci FRAG_RESULT_DATA0] = var; 2478bf215546Sopenharmony_ci break; 2479bf215546Sopenharmony_ci case FRAG_RESULT_DEPTH: 2480bf215546Sopenharmony_ci c->output_position_index = loc; 2481bf215546Sopenharmony_ci break; 2482bf215546Sopenharmony_ci case FRAG_RESULT_SAMPLE_MASK: 2483bf215546Sopenharmony_ci c->output_sample_mask_index = loc; 2484bf215546Sopenharmony_ci break; 2485bf215546Sopenharmony_ci } 2486bf215546Sopenharmony_ci } 2487bf215546Sopenharmony_ci} 2488bf215546Sopenharmony_ci 2489bf215546Sopenharmony_ci/** 2490bf215546Sopenharmony_ci * Sets up the mapping from nir_register to struct qreg *. 2491bf215546Sopenharmony_ci * 2492bf215546Sopenharmony_ci * Each nir_register gets a struct qreg per 32-bit component being stored. 2493bf215546Sopenharmony_ci */ 2494bf215546Sopenharmony_cistatic void 2495bf215546Sopenharmony_cintq_setup_registers(struct v3d_compile *c, struct exec_list *list) 2496bf215546Sopenharmony_ci{ 2497bf215546Sopenharmony_ci foreach_list_typed(nir_register, nir_reg, node, list) { 2498bf215546Sopenharmony_ci unsigned array_len = MAX2(nir_reg->num_array_elems, 1); 2499bf215546Sopenharmony_ci struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, 2500bf215546Sopenharmony_ci array_len * 2501bf215546Sopenharmony_ci nir_reg->num_components); 2502bf215546Sopenharmony_ci 2503bf215546Sopenharmony_ci _mesa_hash_table_insert(c->def_ht, nir_reg, qregs); 2504bf215546Sopenharmony_ci 2505bf215546Sopenharmony_ci for (int i = 0; i < array_len * nir_reg->num_components; i++) 2506bf215546Sopenharmony_ci qregs[i] = vir_get_temp(c); 2507bf215546Sopenharmony_ci } 2508bf215546Sopenharmony_ci} 2509bf215546Sopenharmony_ci 2510bf215546Sopenharmony_cistatic void 2511bf215546Sopenharmony_cintq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr) 2512bf215546Sopenharmony_ci{ 2513bf215546Sopenharmony_ci /* XXX perf: Experiment with using immediate loads to avoid having 2514bf215546Sopenharmony_ci * these end up in the uniform stream. Watch out for breaking the 2515bf215546Sopenharmony_ci * small immediates optimization in the process! 2516bf215546Sopenharmony_ci */ 2517bf215546Sopenharmony_ci struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); 2518bf215546Sopenharmony_ci for (int i = 0; i < instr->def.num_components; i++) 2519bf215546Sopenharmony_ci qregs[i] = vir_uniform_ui(c, instr->value[i].u32); 2520bf215546Sopenharmony_ci 2521bf215546Sopenharmony_ci _mesa_hash_table_insert(c->def_ht, &instr->def, qregs); 2522bf215546Sopenharmony_ci} 2523bf215546Sopenharmony_ci 2524bf215546Sopenharmony_cistatic void 2525bf215546Sopenharmony_cintq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr) 2526bf215546Sopenharmony_ci{ 2527bf215546Sopenharmony_ci unsigned image_index = nir_src_as_uint(instr->src[0]); 2528bf215546Sopenharmony_ci bool is_array = nir_intrinsic_image_array(instr); 2529bf215546Sopenharmony_ci 2530bf215546Sopenharmony_ci assert(nir_src_as_uint(instr->src[1]) == 0); 2531bf215546Sopenharmony_ci 2532bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 2533bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index)); 2534bf215546Sopenharmony_ci if (instr->num_components > 1) { 2535bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 1, 2536bf215546Sopenharmony_ci vir_uniform(c, 2537bf215546Sopenharmony_ci instr->num_components == 2 && is_array ? 2538bf215546Sopenharmony_ci QUNIFORM_IMAGE_ARRAY_SIZE : 2539bf215546Sopenharmony_ci QUNIFORM_IMAGE_HEIGHT, 2540bf215546Sopenharmony_ci image_index)); 2541bf215546Sopenharmony_ci } 2542bf215546Sopenharmony_ci if (instr->num_components > 2) { 2543bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 2, 2544bf215546Sopenharmony_ci vir_uniform(c, 2545bf215546Sopenharmony_ci is_array ? 2546bf215546Sopenharmony_ci QUNIFORM_IMAGE_ARRAY_SIZE : 2547bf215546Sopenharmony_ci QUNIFORM_IMAGE_DEPTH, 2548bf215546Sopenharmony_ci image_index)); 2549bf215546Sopenharmony_ci } 2550bf215546Sopenharmony_ci} 2551bf215546Sopenharmony_ci 2552bf215546Sopenharmony_cistatic void 2553bf215546Sopenharmony_civir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr) 2554bf215546Sopenharmony_ci{ 2555bf215546Sopenharmony_ci assert(c->s->info.stage == MESA_SHADER_FRAGMENT); 2556bf215546Sopenharmony_ci 2557bf215546Sopenharmony_ci int rt = nir_src_as_uint(instr->src[0]); 2558bf215546Sopenharmony_ci assert(rt < V3D_MAX_DRAW_BUFFERS); 2559bf215546Sopenharmony_ci 2560bf215546Sopenharmony_ci int sample_index = nir_intrinsic_base(instr) ; 2561bf215546Sopenharmony_ci assert(sample_index < V3D_MAX_SAMPLES); 2562bf215546Sopenharmony_ci 2563bf215546Sopenharmony_ci int component = nir_intrinsic_component(instr); 2564bf215546Sopenharmony_ci assert(component < 4); 2565bf215546Sopenharmony_ci 2566bf215546Sopenharmony_ci /* We need to emit our TLB reads after we have acquired the scoreboard 2567bf215546Sopenharmony_ci * lock, or the GPU will hang. Usually, we do our scoreboard locking on 2568bf215546Sopenharmony_ci * the last thread switch to improve parallelism, however, that is only 2569bf215546Sopenharmony_ci * guaranteed to happen before the tlb color writes. 2570bf215546Sopenharmony_ci * 2571bf215546Sopenharmony_ci * To fix that, we make sure we always emit a thread switch before the 2572bf215546Sopenharmony_ci * first tlb color read. If that happens to be the last thread switch 2573bf215546Sopenharmony_ci * we emit, then everything is fine, but otherwsie, if any code after 2574bf215546Sopenharmony_ci * this point needs to emit additional thread switches, then we will 2575bf215546Sopenharmony_ci * switch the strategy to locking the scoreboard on the first thread 2576bf215546Sopenharmony_ci * switch instead -- see vir_emit_thrsw(). 2577bf215546Sopenharmony_ci */ 2578bf215546Sopenharmony_ci if (!c->emitted_tlb_load) { 2579bf215546Sopenharmony_ci if (!c->last_thrsw_at_top_level) { 2580bf215546Sopenharmony_ci assert(c->devinfo->ver >= 41); 2581bf215546Sopenharmony_ci vir_emit_thrsw(c); 2582bf215546Sopenharmony_ci } 2583bf215546Sopenharmony_ci 2584bf215546Sopenharmony_ci c->emitted_tlb_load = true; 2585bf215546Sopenharmony_ci } 2586bf215546Sopenharmony_ci 2587bf215546Sopenharmony_ci struct qreg *color_reads_for_sample = 2588bf215546Sopenharmony_ci &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4]; 2589bf215546Sopenharmony_ci 2590bf215546Sopenharmony_ci if (color_reads_for_sample[component].file == QFILE_NULL) { 2591bf215546Sopenharmony_ci enum pipe_format rt_format = c->fs_key->color_fmt[rt].format; 2592bf215546Sopenharmony_ci int num_components = 2593bf215546Sopenharmony_ci util_format_get_nr_components(rt_format); 2594bf215546Sopenharmony_ci 2595bf215546Sopenharmony_ci const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt); 2596bf215546Sopenharmony_ci if (swap_rb) 2597bf215546Sopenharmony_ci num_components = MAX2(num_components, 3); 2598bf215546Sopenharmony_ci 2599bf215546Sopenharmony_ci nir_variable *var = c->output_color_var[rt]; 2600bf215546Sopenharmony_ci enum glsl_base_type type = glsl_get_base_type(var->type); 2601bf215546Sopenharmony_ci 2602bf215546Sopenharmony_ci bool is_int_format = type == GLSL_TYPE_INT || 2603bf215546Sopenharmony_ci type == GLSL_TYPE_UINT; 2604bf215546Sopenharmony_ci 2605bf215546Sopenharmony_ci bool is_32b_tlb_format = is_int_format || 2606bf215546Sopenharmony_ci (c->fs_key->f32_color_rb & (1 << rt)); 2607bf215546Sopenharmony_ci 2608bf215546Sopenharmony_ci int num_samples = c->fs_key->msaa ? V3D_MAX_SAMPLES : 1; 2609bf215546Sopenharmony_ci 2610bf215546Sopenharmony_ci uint32_t conf = 0xffffff00; 2611bf215546Sopenharmony_ci conf |= c->fs_key->msaa ? TLB_SAMPLE_MODE_PER_SAMPLE : 2612bf215546Sopenharmony_ci TLB_SAMPLE_MODE_PER_PIXEL; 2613bf215546Sopenharmony_ci conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT; 2614bf215546Sopenharmony_ci 2615bf215546Sopenharmony_ci if (is_32b_tlb_format) { 2616bf215546Sopenharmony_ci /* The F32 vs I32 distinction was dropped in 4.2. */ 2617bf215546Sopenharmony_ci conf |= (c->devinfo->ver < 42 && is_int_format) ? 2618bf215546Sopenharmony_ci TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR; 2619bf215546Sopenharmony_ci 2620bf215546Sopenharmony_ci conf |= ((num_components - 1) << 2621bf215546Sopenharmony_ci TLB_VEC_SIZE_MINUS_1_SHIFT); 2622bf215546Sopenharmony_ci } else { 2623bf215546Sopenharmony_ci conf |= TLB_TYPE_F16_COLOR; 2624bf215546Sopenharmony_ci conf |= TLB_F16_SWAP_HI_LO; 2625bf215546Sopenharmony_ci 2626bf215546Sopenharmony_ci if (num_components >= 3) 2627bf215546Sopenharmony_ci conf |= TLB_VEC_SIZE_4_F16; 2628bf215546Sopenharmony_ci else 2629bf215546Sopenharmony_ci conf |= TLB_VEC_SIZE_2_F16; 2630bf215546Sopenharmony_ci } 2631bf215546Sopenharmony_ci 2632bf215546Sopenharmony_ci 2633bf215546Sopenharmony_ci for (int i = 0; i < num_samples; i++) { 2634bf215546Sopenharmony_ci struct qreg r, g, b, a; 2635bf215546Sopenharmony_ci if (is_32b_tlb_format) { 2636bf215546Sopenharmony_ci r = conf != 0xffffffff && i == 0? 2637bf215546Sopenharmony_ci vir_TLBU_COLOR_READ(c, conf) : 2638bf215546Sopenharmony_ci vir_TLB_COLOR_READ(c); 2639bf215546Sopenharmony_ci if (num_components >= 2) 2640bf215546Sopenharmony_ci g = vir_TLB_COLOR_READ(c); 2641bf215546Sopenharmony_ci if (num_components >= 3) 2642bf215546Sopenharmony_ci b = vir_TLB_COLOR_READ(c); 2643bf215546Sopenharmony_ci if (num_components >= 4) 2644bf215546Sopenharmony_ci a = vir_TLB_COLOR_READ(c); 2645bf215546Sopenharmony_ci } else { 2646bf215546Sopenharmony_ci struct qreg rg = conf != 0xffffffff && i == 0 ? 2647bf215546Sopenharmony_ci vir_TLBU_COLOR_READ(c, conf) : 2648bf215546Sopenharmony_ci vir_TLB_COLOR_READ(c); 2649bf215546Sopenharmony_ci r = vir_FMOV(c, rg); 2650bf215546Sopenharmony_ci vir_set_unpack(c->defs[r.index], 0, 2651bf215546Sopenharmony_ci V3D_QPU_UNPACK_L); 2652bf215546Sopenharmony_ci g = vir_FMOV(c, rg); 2653bf215546Sopenharmony_ci vir_set_unpack(c->defs[g.index], 0, 2654bf215546Sopenharmony_ci V3D_QPU_UNPACK_H); 2655bf215546Sopenharmony_ci 2656bf215546Sopenharmony_ci if (num_components > 2) { 2657bf215546Sopenharmony_ci struct qreg ba = vir_TLB_COLOR_READ(c); 2658bf215546Sopenharmony_ci b = vir_FMOV(c, ba); 2659bf215546Sopenharmony_ci vir_set_unpack(c->defs[b.index], 0, 2660bf215546Sopenharmony_ci V3D_QPU_UNPACK_L); 2661bf215546Sopenharmony_ci a = vir_FMOV(c, ba); 2662bf215546Sopenharmony_ci vir_set_unpack(c->defs[a.index], 0, 2663bf215546Sopenharmony_ci V3D_QPU_UNPACK_H); 2664bf215546Sopenharmony_ci } 2665bf215546Sopenharmony_ci } 2666bf215546Sopenharmony_ci 2667bf215546Sopenharmony_ci struct qreg *color_reads = 2668bf215546Sopenharmony_ci &c->color_reads[(rt * V3D_MAX_SAMPLES + i) * 4]; 2669bf215546Sopenharmony_ci 2670bf215546Sopenharmony_ci color_reads[0] = swap_rb ? b : r; 2671bf215546Sopenharmony_ci if (num_components >= 2) 2672bf215546Sopenharmony_ci color_reads[1] = g; 2673bf215546Sopenharmony_ci if (num_components >= 3) 2674bf215546Sopenharmony_ci color_reads[2] = swap_rb ? r : b; 2675bf215546Sopenharmony_ci if (num_components >= 4) 2676bf215546Sopenharmony_ci color_reads[3] = a; 2677bf215546Sopenharmony_ci } 2678bf215546Sopenharmony_ci } 2679bf215546Sopenharmony_ci 2680bf215546Sopenharmony_ci assert(color_reads_for_sample[component].file != QFILE_NULL); 2681bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 2682bf215546Sopenharmony_ci vir_MOV(c, color_reads_for_sample[component])); 2683bf215546Sopenharmony_ci} 2684bf215546Sopenharmony_ci 2685bf215546Sopenharmony_cistatic bool 2686bf215546Sopenharmony_cintq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr); 2687bf215546Sopenharmony_ci 2688bf215546Sopenharmony_cistatic bool 2689bf215546Sopenharmony_citry_emit_uniform(struct v3d_compile *c, 2690bf215546Sopenharmony_ci int offset, 2691bf215546Sopenharmony_ci int num_components, 2692bf215546Sopenharmony_ci nir_dest *dest, 2693bf215546Sopenharmony_ci enum quniform_contents contents) 2694bf215546Sopenharmony_ci{ 2695bf215546Sopenharmony_ci /* Even though ldunif is strictly 32-bit we can still use it 2696bf215546Sopenharmony_ci * to load scalar 8-bit/16-bit uniforms so long as their offset 2697bf215546Sopenharmony_ci * is 32-bit aligned. In this case, ldunif would still load 2698bf215546Sopenharmony_ci * 32-bit into the destination with the 8-bit/16-bit uniform 2699bf215546Sopenharmony_ci * data in the LSB and garbage in the MSB, but that is fine 2700bf215546Sopenharmony_ci * because we should only be accessing the valid bits of the 2701bf215546Sopenharmony_ci * destination. 2702bf215546Sopenharmony_ci * 2703bf215546Sopenharmony_ci * FIXME: if in the future we improve our register allocator to 2704bf215546Sopenharmony_ci * pack 2 16-bit variables in the MSB and LSB of the same 2705bf215546Sopenharmony_ci * register then this optimization would not be valid as is, 2706bf215546Sopenharmony_ci * since the load clobbers the MSB. 2707bf215546Sopenharmony_ci */ 2708bf215546Sopenharmony_ci if (offset % 4 != 0) 2709bf215546Sopenharmony_ci return false; 2710bf215546Sopenharmony_ci 2711bf215546Sopenharmony_ci /* We need dwords */ 2712bf215546Sopenharmony_ci offset = offset / 4; 2713bf215546Sopenharmony_ci 2714bf215546Sopenharmony_ci for (int i = 0; i < num_components; i++) { 2715bf215546Sopenharmony_ci ntq_store_dest(c, dest, i, 2716bf215546Sopenharmony_ci vir_uniform(c, contents, offset + i)); 2717bf215546Sopenharmony_ci } 2718bf215546Sopenharmony_ci 2719bf215546Sopenharmony_ci return true; 2720bf215546Sopenharmony_ci} 2721bf215546Sopenharmony_ci 2722bf215546Sopenharmony_cistatic void 2723bf215546Sopenharmony_cintq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr) 2724bf215546Sopenharmony_ci{ 2725bf215546Sopenharmony_ci /* We scalarize general TMU access for anything that is not 32-bit. */ 2726bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest) == 32 || 2727bf215546Sopenharmony_ci instr->num_components == 1); 2728bf215546Sopenharmony_ci 2729bf215546Sopenharmony_ci /* Try to emit ldunif if possible, otherwise fallback to general TMU */ 2730bf215546Sopenharmony_ci if (nir_src_is_const(instr->src[0])) { 2731bf215546Sopenharmony_ci int offset = (nir_intrinsic_base(instr) + 2732bf215546Sopenharmony_ci nir_src_as_uint(instr->src[0])); 2733bf215546Sopenharmony_ci 2734bf215546Sopenharmony_ci if (try_emit_uniform(c, offset, instr->num_components, 2735bf215546Sopenharmony_ci &instr->dest, QUNIFORM_UNIFORM)) { 2736bf215546Sopenharmony_ci return; 2737bf215546Sopenharmony_ci } 2738bf215546Sopenharmony_ci } 2739bf215546Sopenharmony_ci 2740bf215546Sopenharmony_ci if (!ntq_emit_load_unifa(c, instr)) { 2741bf215546Sopenharmony_ci ntq_emit_tmu_general(c, instr, false, false); 2742bf215546Sopenharmony_ci c->has_general_tmu_load = true; 2743bf215546Sopenharmony_ci } 2744bf215546Sopenharmony_ci} 2745bf215546Sopenharmony_ci 2746bf215546Sopenharmony_cistatic bool 2747bf215546Sopenharmony_cintq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr) 2748bf215546Sopenharmony_ci{ 2749bf215546Sopenharmony_ci if (c->compiler->max_inline_uniform_buffers <= 0) 2750bf215546Sopenharmony_ci return false; 2751bf215546Sopenharmony_ci 2752bf215546Sopenharmony_ci /* On Vulkan we use indices 1..MAX_INLINE_UNIFORM_BUFFERS for inline 2753bf215546Sopenharmony_ci * uniform buffers which we want to handle more like push constants 2754bf215546Sopenharmony_ci * than regular UBO. OpenGL doesn't implement this feature. 2755bf215546Sopenharmony_ci */ 2756bf215546Sopenharmony_ci assert(c->key->environment == V3D_ENVIRONMENT_VULKAN); 2757bf215546Sopenharmony_ci uint32_t index = nir_src_as_uint(instr->src[0]); 2758bf215546Sopenharmony_ci if (index == 0 || index > c->compiler->max_inline_uniform_buffers) 2759bf215546Sopenharmony_ci return false; 2760bf215546Sopenharmony_ci 2761bf215546Sopenharmony_ci /* We scalarize general TMU access for anything that is not 32-bit */ 2762bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest) == 32 || 2763bf215546Sopenharmony_ci instr->num_components == 1); 2764bf215546Sopenharmony_ci 2765bf215546Sopenharmony_ci if (nir_src_is_const(instr->src[1])) { 2766bf215546Sopenharmony_ci /* Index 0 is reserved for push constants */ 2767bf215546Sopenharmony_ci assert(index > 0); 2768bf215546Sopenharmony_ci uint32_t inline_index = index - 1; 2769bf215546Sopenharmony_ci int offset = nir_src_as_uint(instr->src[1]); 2770bf215546Sopenharmony_ci if (try_emit_uniform(c, offset, instr->num_components, 2771bf215546Sopenharmony_ci &instr->dest, 2772bf215546Sopenharmony_ci QUNIFORM_INLINE_UBO_0 + inline_index)) { 2773bf215546Sopenharmony_ci return true; 2774bf215546Sopenharmony_ci } 2775bf215546Sopenharmony_ci } 2776bf215546Sopenharmony_ci 2777bf215546Sopenharmony_ci /* Fallback to regular UBO load */ 2778bf215546Sopenharmony_ci return false; 2779bf215546Sopenharmony_ci} 2780bf215546Sopenharmony_ci 2781bf215546Sopenharmony_cistatic void 2782bf215546Sopenharmony_cintq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) 2783bf215546Sopenharmony_ci{ 2784bf215546Sopenharmony_ci /* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset). 2785bf215546Sopenharmony_ci * 2786bf215546Sopenharmony_ci * Right now the driver sets PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR even 2787bf215546Sopenharmony_ci * if we don't support non-uniform offsets because we also set the 2788bf215546Sopenharmony_ci * lower_all_io_to_temps option in the NIR compiler. This ensures that 2789bf215546Sopenharmony_ci * any indirect indexing on in/out variables is turned into indirect 2790bf215546Sopenharmony_ci * indexing on temporary variables instead, that we handle by lowering 2791bf215546Sopenharmony_ci * to scratch. If we implement non-uniform offset here we might be able 2792bf215546Sopenharmony_ci * to avoid the temp and scratch lowering, which involves copying from 2793bf215546Sopenharmony_ci * the input to the temp variable, possibly making code more optimal. 2794bf215546Sopenharmony_ci */ 2795bf215546Sopenharmony_ci unsigned offset = 2796bf215546Sopenharmony_ci nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]); 2797bf215546Sopenharmony_ci 2798bf215546Sopenharmony_ci if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) { 2799bf215546Sopenharmony_ci /* Emit the LDVPM directly now, rather than at the top 2800bf215546Sopenharmony_ci * of the shader like we did for V3D 3.x (which needs 2801bf215546Sopenharmony_ci * vpmsetup when not just taking the next offset). 2802bf215546Sopenharmony_ci * 2803bf215546Sopenharmony_ci * Note that delaying like this may introduce stalls, 2804bf215546Sopenharmony_ci * as LDVPMV takes a minimum of 1 instruction but may 2805bf215546Sopenharmony_ci * be slower if the VPM unit is busy with another QPU. 2806bf215546Sopenharmony_ci */ 2807bf215546Sopenharmony_ci int index = 0; 2808bf215546Sopenharmony_ci if (BITSET_TEST(c->s->info.system_values_read, 2809bf215546Sopenharmony_ci SYSTEM_VALUE_INSTANCE_ID)) { 2810bf215546Sopenharmony_ci index++; 2811bf215546Sopenharmony_ci } 2812bf215546Sopenharmony_ci if (BITSET_TEST(c->s->info.system_values_read, 2813bf215546Sopenharmony_ci SYSTEM_VALUE_BASE_INSTANCE)) { 2814bf215546Sopenharmony_ci index++; 2815bf215546Sopenharmony_ci } 2816bf215546Sopenharmony_ci if (BITSET_TEST(c->s->info.system_values_read, 2817bf215546Sopenharmony_ci SYSTEM_VALUE_VERTEX_ID)) { 2818bf215546Sopenharmony_ci index++; 2819bf215546Sopenharmony_ci } 2820bf215546Sopenharmony_ci for (int i = 0; i < offset; i++) 2821bf215546Sopenharmony_ci index += c->vattr_sizes[i]; 2822bf215546Sopenharmony_ci index += nir_intrinsic_component(instr); 2823bf215546Sopenharmony_ci for (int i = 0; i < instr->num_components; i++) { 2824bf215546Sopenharmony_ci struct qreg vpm_offset = vir_uniform_ui(c, index++); 2825bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, i, 2826bf215546Sopenharmony_ci vir_LDVPMV_IN(c, vpm_offset)); 2827bf215546Sopenharmony_ci } 2828bf215546Sopenharmony_ci } else { 2829bf215546Sopenharmony_ci for (int i = 0; i < instr->num_components; i++) { 2830bf215546Sopenharmony_ci int comp = nir_intrinsic_component(instr) + i; 2831bf215546Sopenharmony_ci struct qreg input = c->inputs[offset * 4 + comp]; 2832bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, i, vir_MOV(c, input)); 2833bf215546Sopenharmony_ci 2834bf215546Sopenharmony_ci if (c->s->info.stage == MESA_SHADER_FRAGMENT && 2835bf215546Sopenharmony_ci input.file == c->payload_z.file && 2836bf215546Sopenharmony_ci input.index == c->payload_z.index) { 2837bf215546Sopenharmony_ci c->reads_z = true; 2838bf215546Sopenharmony_ci } 2839bf215546Sopenharmony_ci } 2840bf215546Sopenharmony_ci } 2841bf215546Sopenharmony_ci} 2842bf215546Sopenharmony_ci 2843bf215546Sopenharmony_cistatic void 2844bf215546Sopenharmony_cintq_emit_per_sample_color_write(struct v3d_compile *c, 2845bf215546Sopenharmony_ci nir_intrinsic_instr *instr) 2846bf215546Sopenharmony_ci{ 2847bf215546Sopenharmony_ci assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d); 2848bf215546Sopenharmony_ci 2849bf215546Sopenharmony_ci unsigned rt = nir_src_as_uint(instr->src[1]); 2850bf215546Sopenharmony_ci assert(rt < V3D_MAX_DRAW_BUFFERS); 2851bf215546Sopenharmony_ci 2852bf215546Sopenharmony_ci unsigned sample_idx = nir_intrinsic_base(instr); 2853bf215546Sopenharmony_ci assert(sample_idx < V3D_MAX_SAMPLES); 2854bf215546Sopenharmony_ci 2855bf215546Sopenharmony_ci unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4; 2856bf215546Sopenharmony_ci for (int i = 0; i < instr->num_components; i++) { 2857bf215546Sopenharmony_ci c->sample_colors[offset + i] = 2858bf215546Sopenharmony_ci vir_MOV(c, ntq_get_src(c, instr->src[0], i)); 2859bf215546Sopenharmony_ci } 2860bf215546Sopenharmony_ci} 2861bf215546Sopenharmony_ci 2862bf215546Sopenharmony_cistatic void 2863bf215546Sopenharmony_cintq_emit_color_write(struct v3d_compile *c, 2864bf215546Sopenharmony_ci nir_intrinsic_instr *instr) 2865bf215546Sopenharmony_ci{ 2866bf215546Sopenharmony_ci unsigned offset = (nir_intrinsic_base(instr) + 2867bf215546Sopenharmony_ci nir_src_as_uint(instr->src[1])) * 4 + 2868bf215546Sopenharmony_ci nir_intrinsic_component(instr); 2869bf215546Sopenharmony_ci for (int i = 0; i < instr->num_components; i++) { 2870bf215546Sopenharmony_ci c->outputs[offset + i] = 2871bf215546Sopenharmony_ci vir_MOV(c, ntq_get_src(c, instr->src[0], i)); 2872bf215546Sopenharmony_ci } 2873bf215546Sopenharmony_ci} 2874bf215546Sopenharmony_ci 2875bf215546Sopenharmony_cistatic void 2876bf215546Sopenharmony_ciemit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr) 2877bf215546Sopenharmony_ci{ 2878bf215546Sopenharmony_ci assert(instr->num_components == 1); 2879bf215546Sopenharmony_ci 2880bf215546Sopenharmony_ci struct qreg offset = ntq_get_src(c, instr->src[1], 0); 2881bf215546Sopenharmony_ci 2882bf215546Sopenharmony_ci uint32_t base_offset = nir_intrinsic_base(instr); 2883bf215546Sopenharmony_ci 2884bf215546Sopenharmony_ci if (base_offset) 2885bf215546Sopenharmony_ci offset = vir_ADD(c, vir_uniform_ui(c, base_offset), offset); 2886bf215546Sopenharmony_ci 2887bf215546Sopenharmony_ci /* Usually, for VS or FS, we only emit outputs once at program end so 2888bf215546Sopenharmony_ci * our VPM writes are never in non-uniform control flow, but this 2889bf215546Sopenharmony_ci * is not true for GS, where we are emitting multiple vertices. 2890bf215546Sopenharmony_ci */ 2891bf215546Sopenharmony_ci if (vir_in_nonuniform_control_flow(c)) { 2892bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 2893bf215546Sopenharmony_ci V3D_QPU_PF_PUSHZ); 2894bf215546Sopenharmony_ci } 2895bf215546Sopenharmony_ci 2896bf215546Sopenharmony_ci struct qreg val = ntq_get_src(c, instr->src[0], 0); 2897bf215546Sopenharmony_ci 2898bf215546Sopenharmony_ci /* The offset isn’t necessarily dynamically uniform for a geometry 2899bf215546Sopenharmony_ci * shader. This can happen if the shader sometimes doesn’t emit one of 2900bf215546Sopenharmony_ci * the vertices. In that case subsequent vertices will be written to 2901bf215546Sopenharmony_ci * different offsets in the VPM and we need to use the scatter write 2902bf215546Sopenharmony_ci * instruction to have a different offset for each lane. 2903bf215546Sopenharmony_ci */ 2904bf215546Sopenharmony_ci bool is_uniform_offset = 2905bf215546Sopenharmony_ci !vir_in_nonuniform_control_flow(c) && 2906bf215546Sopenharmony_ci !nir_src_is_divergent(instr->src[1]); 2907bf215546Sopenharmony_ci vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset); 2908bf215546Sopenharmony_ci 2909bf215546Sopenharmony_ci if (vir_in_nonuniform_control_flow(c)) { 2910bf215546Sopenharmony_ci struct qinst *last_inst = 2911bf215546Sopenharmony_ci (struct qinst *)c->cur_block->instructions.prev; 2912bf215546Sopenharmony_ci vir_set_cond(last_inst, V3D_QPU_COND_IFA); 2913bf215546Sopenharmony_ci } 2914bf215546Sopenharmony_ci} 2915bf215546Sopenharmony_ci 2916bf215546Sopenharmony_cistatic void 2917bf215546Sopenharmony_ciemit_store_output_vs(struct v3d_compile *c, nir_intrinsic_instr *instr) 2918bf215546Sopenharmony_ci{ 2919bf215546Sopenharmony_ci assert(c->s->info.stage == MESA_SHADER_VERTEX); 2920bf215546Sopenharmony_ci assert(instr->num_components == 1); 2921bf215546Sopenharmony_ci 2922bf215546Sopenharmony_ci uint32_t base = nir_intrinsic_base(instr); 2923bf215546Sopenharmony_ci struct qreg val = ntq_get_src(c, instr->src[0], 0); 2924bf215546Sopenharmony_ci 2925bf215546Sopenharmony_ci if (nir_src_is_const(instr->src[1])) { 2926bf215546Sopenharmony_ci vir_VPM_WRITE(c, val, 2927bf215546Sopenharmony_ci base + nir_src_as_uint(instr->src[1])); 2928bf215546Sopenharmony_ci } else { 2929bf215546Sopenharmony_ci struct qreg offset = vir_ADD(c, 2930bf215546Sopenharmony_ci ntq_get_src(c, instr->src[1], 1), 2931bf215546Sopenharmony_ci vir_uniform_ui(c, base)); 2932bf215546Sopenharmony_ci bool is_uniform_offset = 2933bf215546Sopenharmony_ci !vir_in_nonuniform_control_flow(c) && 2934bf215546Sopenharmony_ci !nir_src_is_divergent(instr->src[1]); 2935bf215546Sopenharmony_ci vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset); 2936bf215546Sopenharmony_ci } 2937bf215546Sopenharmony_ci} 2938bf215546Sopenharmony_ci 2939bf215546Sopenharmony_cistatic void 2940bf215546Sopenharmony_cintq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr) 2941bf215546Sopenharmony_ci{ 2942bf215546Sopenharmony_ci if (c->s->info.stage == MESA_SHADER_FRAGMENT) 2943bf215546Sopenharmony_ci ntq_emit_color_write(c, instr); 2944bf215546Sopenharmony_ci else if (c->s->info.stage == MESA_SHADER_GEOMETRY) 2945bf215546Sopenharmony_ci emit_store_output_gs(c, instr); 2946bf215546Sopenharmony_ci else 2947bf215546Sopenharmony_ci emit_store_output_vs(c, instr); 2948bf215546Sopenharmony_ci} 2949bf215546Sopenharmony_ci 2950bf215546Sopenharmony_ci/** 2951bf215546Sopenharmony_ci * This implementation is based on v3d_sample_{x,y}_offset() from 2952bf215546Sopenharmony_ci * v3d_sample_offset.h. 2953bf215546Sopenharmony_ci */ 2954bf215546Sopenharmony_cistatic void 2955bf215546Sopenharmony_cintq_get_sample_offset(struct v3d_compile *c, struct qreg sample_idx, 2956bf215546Sopenharmony_ci struct qreg *sx, struct qreg *sy) 2957bf215546Sopenharmony_ci{ 2958bf215546Sopenharmony_ci sample_idx = vir_ITOF(c, sample_idx); 2959bf215546Sopenharmony_ci 2960bf215546Sopenharmony_ci struct qreg offset_x = 2961bf215546Sopenharmony_ci vir_FADD(c, vir_uniform_f(c, -0.125f), 2962bf215546Sopenharmony_ci vir_FMUL(c, sample_idx, 2963bf215546Sopenharmony_ci vir_uniform_f(c, 0.5f))); 2964bf215546Sopenharmony_ci vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), 2965bf215546Sopenharmony_ci vir_uniform_f(c, 2.0f), sample_idx), 2966bf215546Sopenharmony_ci V3D_QPU_PF_PUSHC); 2967bf215546Sopenharmony_ci offset_x = vir_SEL(c, V3D_QPU_COND_IFA, 2968bf215546Sopenharmony_ci vir_FSUB(c, offset_x, vir_uniform_f(c, 1.25f)), 2969bf215546Sopenharmony_ci offset_x); 2970bf215546Sopenharmony_ci 2971bf215546Sopenharmony_ci struct qreg offset_y = 2972bf215546Sopenharmony_ci vir_FADD(c, vir_uniform_f(c, -0.375f), 2973bf215546Sopenharmony_ci vir_FMUL(c, sample_idx, 2974bf215546Sopenharmony_ci vir_uniform_f(c, 0.25f))); 2975bf215546Sopenharmony_ci *sx = offset_x; 2976bf215546Sopenharmony_ci *sy = offset_y; 2977bf215546Sopenharmony_ci} 2978bf215546Sopenharmony_ci 2979bf215546Sopenharmony_ci/** 2980bf215546Sopenharmony_ci * This implementation is based on get_centroid_offset() from fep.c. 2981bf215546Sopenharmony_ci */ 2982bf215546Sopenharmony_cistatic void 2983bf215546Sopenharmony_cintq_get_barycentric_centroid(struct v3d_compile *c, 2984bf215546Sopenharmony_ci struct qreg *out_x, 2985bf215546Sopenharmony_ci struct qreg *out_y) 2986bf215546Sopenharmony_ci{ 2987bf215546Sopenharmony_ci struct qreg sample_mask; 2988bf215546Sopenharmony_ci if (c->output_sample_mask_index != -1) 2989bf215546Sopenharmony_ci sample_mask = c->outputs[c->output_sample_mask_index]; 2990bf215546Sopenharmony_ci else 2991bf215546Sopenharmony_ci sample_mask = vir_MSF(c); 2992bf215546Sopenharmony_ci 2993bf215546Sopenharmony_ci struct qreg i0 = vir_uniform_ui(c, 0); 2994bf215546Sopenharmony_ci struct qreg i1 = vir_uniform_ui(c, 1); 2995bf215546Sopenharmony_ci struct qreg i2 = vir_uniform_ui(c, 2); 2996bf215546Sopenharmony_ci struct qreg i3 = vir_uniform_ui(c, 3); 2997bf215546Sopenharmony_ci struct qreg i4 = vir_uniform_ui(c, 4); 2998bf215546Sopenharmony_ci struct qreg i8 = vir_uniform_ui(c, 8); 2999bf215546Sopenharmony_ci 3000bf215546Sopenharmony_ci /* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */ 3001bf215546Sopenharmony_ci struct qreg F = vir_uniform_ui(c, 0); 3002bf215546Sopenharmony_ci struct qreg T = vir_uniform_ui(c, ~0); 3003bf215546Sopenharmony_ci struct qreg s0 = vir_XOR(c, vir_AND(c, sample_mask, i1), i1); 3004bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ); 3005bf215546Sopenharmony_ci s0 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); 3006bf215546Sopenharmony_ci struct qreg s1 = vir_XOR(c, vir_AND(c, sample_mask, i2), i2); 3007bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ); 3008bf215546Sopenharmony_ci s1 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); 3009bf215546Sopenharmony_ci struct qreg s2 = vir_XOR(c, vir_AND(c, sample_mask, i4), i4); 3010bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ); 3011bf215546Sopenharmony_ci s2 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); 3012bf215546Sopenharmony_ci struct qreg s3 = vir_XOR(c, vir_AND(c, sample_mask, i8), i8); 3013bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ); 3014bf215546Sopenharmony_ci s3 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); 3015bf215546Sopenharmony_ci 3016bf215546Sopenharmony_ci /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */ 3017bf215546Sopenharmony_ci struct qreg sample_idx = i3; 3018bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ); 3019bf215546Sopenharmony_ci sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i1, sample_idx); 3020bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ); 3021bf215546Sopenharmony_ci sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i2, sample_idx); 3022bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ); 3023bf215546Sopenharmony_ci sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i0, sample_idx); 3024bf215546Sopenharmony_ci 3025bf215546Sopenharmony_ci /* Get offset at selected sample index */ 3026bf215546Sopenharmony_ci struct qreg offset_x, offset_y; 3027bf215546Sopenharmony_ci ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y); 3028bf215546Sopenharmony_ci 3029bf215546Sopenharmony_ci /* Select pixel center [offset=(0,0)] if two opposing samples (or none) 3030bf215546Sopenharmony_ci * are selected. 3031bf215546Sopenharmony_ci */ 3032bf215546Sopenharmony_ci struct qreg s0_and_s3 = vir_AND(c, s0, s3); 3033bf215546Sopenharmony_ci struct qreg s1_and_s2 = vir_AND(c, s1, s2); 3034bf215546Sopenharmony_ci 3035bf215546Sopenharmony_ci struct qreg use_center = vir_XOR(c, sample_mask, vir_uniform_ui(c, 0)); 3036bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ); 3037bf215546Sopenharmony_ci use_center = vir_SEL(c, V3D_QPU_COND_IFA, T, F); 3038bf215546Sopenharmony_ci use_center = vir_OR(c, use_center, s0_and_s3); 3039bf215546Sopenharmony_ci use_center = vir_OR(c, use_center, s1_and_s2); 3040bf215546Sopenharmony_ci 3041bf215546Sopenharmony_ci struct qreg zero = vir_uniform_f(c, 0.0f); 3042bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ); 3043bf215546Sopenharmony_ci offset_x = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_x); 3044bf215546Sopenharmony_ci offset_y = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_y); 3045bf215546Sopenharmony_ci 3046bf215546Sopenharmony_ci *out_x = offset_x; 3047bf215546Sopenharmony_ci *out_y = offset_y; 3048bf215546Sopenharmony_ci} 3049bf215546Sopenharmony_ci 3050bf215546Sopenharmony_cistatic struct qreg 3051bf215546Sopenharmony_cintq_emit_load_interpolated_input(struct v3d_compile *c, 3052bf215546Sopenharmony_ci struct qreg p, 3053bf215546Sopenharmony_ci struct qreg C, 3054bf215546Sopenharmony_ci struct qreg offset_x, 3055bf215546Sopenharmony_ci struct qreg offset_y, 3056bf215546Sopenharmony_ci unsigned mode) 3057bf215546Sopenharmony_ci{ 3058bf215546Sopenharmony_ci if (mode == INTERP_MODE_FLAT) 3059bf215546Sopenharmony_ci return C; 3060bf215546Sopenharmony_ci 3061bf215546Sopenharmony_ci struct qreg sample_offset_x = 3062bf215546Sopenharmony_ci vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))); 3063bf215546Sopenharmony_ci struct qreg sample_offset_y = 3064bf215546Sopenharmony_ci vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))); 3065bf215546Sopenharmony_ci 3066bf215546Sopenharmony_ci struct qreg scaleX = 3067bf215546Sopenharmony_ci vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_x), 3068bf215546Sopenharmony_ci offset_x); 3069bf215546Sopenharmony_ci struct qreg scaleY = 3070bf215546Sopenharmony_ci vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_y), 3071bf215546Sopenharmony_ci offset_y); 3072bf215546Sopenharmony_ci 3073bf215546Sopenharmony_ci struct qreg pInterp = 3074bf215546Sopenharmony_ci vir_FADD(c, p, vir_FADD(c, vir_FMUL(c, vir_FDX(c, p), scaleX), 3075bf215546Sopenharmony_ci vir_FMUL(c, vir_FDY(c, p), scaleY))); 3076bf215546Sopenharmony_ci 3077bf215546Sopenharmony_ci if (mode == INTERP_MODE_NOPERSPECTIVE) 3078bf215546Sopenharmony_ci return vir_FADD(c, pInterp, C); 3079bf215546Sopenharmony_ci 3080bf215546Sopenharmony_ci struct qreg w = c->payload_w; 3081bf215546Sopenharmony_ci struct qreg wInterp = 3082bf215546Sopenharmony_ci vir_FADD(c, w, vir_FADD(c, vir_FMUL(c, vir_FDX(c, w), scaleX), 3083bf215546Sopenharmony_ci vir_FMUL(c, vir_FDY(c, w), scaleY))); 3084bf215546Sopenharmony_ci 3085bf215546Sopenharmony_ci return vir_FADD(c, vir_FMUL(c, pInterp, wInterp), C); 3086bf215546Sopenharmony_ci} 3087bf215546Sopenharmony_ci 3088bf215546Sopenharmony_cistatic void 3089bf215546Sopenharmony_ciemit_ldunifa(struct v3d_compile *c, struct qreg *result) 3090bf215546Sopenharmony_ci{ 3091bf215546Sopenharmony_ci struct qinst *ldunifa = 3092bf215546Sopenharmony_ci vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef); 3093bf215546Sopenharmony_ci ldunifa->qpu.sig.ldunifa = true; 3094bf215546Sopenharmony_ci if (result) 3095bf215546Sopenharmony_ci *result = vir_emit_def(c, ldunifa); 3096bf215546Sopenharmony_ci else 3097bf215546Sopenharmony_ci vir_emit_nondef(c, ldunifa); 3098bf215546Sopenharmony_ci c->current_unifa_offset += 4; 3099bf215546Sopenharmony_ci} 3100bf215546Sopenharmony_ci 3101bf215546Sopenharmony_cistatic bool 3102bf215546Sopenharmony_cintq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) 3103bf215546Sopenharmony_ci{ 3104bf215546Sopenharmony_ci assert(instr->intrinsic == nir_intrinsic_load_ubo || 3105bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_load_ssbo || 3106bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_load_uniform); 3107bf215546Sopenharmony_ci 3108bf215546Sopenharmony_ci bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform; 3109bf215546Sopenharmony_ci bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo; 3110bf215546Sopenharmony_ci bool is_ssbo = instr->intrinsic == nir_intrinsic_load_ssbo; 3111bf215546Sopenharmony_ci 3112bf215546Sopenharmony_ci /* Every ldunifa auto-increments the unifa address by 4 bytes, so our 3113bf215546Sopenharmony_ci * current unifa offset is 4 bytes ahead of the offset of the last load. 3114bf215546Sopenharmony_ci */ 3115bf215546Sopenharmony_ci static const int32_t max_unifa_skip_dist = 3116bf215546Sopenharmony_ci MAX_UNIFA_SKIP_DISTANCE - 4; 3117bf215546Sopenharmony_ci 3118bf215546Sopenharmony_ci /* We can only use unifa if the offset is uniform */ 3119bf215546Sopenharmony_ci nir_src offset = is_uniform ? instr->src[0] : instr->src[1]; 3120bf215546Sopenharmony_ci if (nir_src_is_divergent(offset)) 3121bf215546Sopenharmony_ci return false; 3122bf215546Sopenharmony_ci 3123bf215546Sopenharmony_ci /* We can only use unifa with SSBOs if they are read-only. Otherwise 3124bf215546Sopenharmony_ci * ldunifa won't see the shader writes to that address (possibly 3125bf215546Sopenharmony_ci * because ldunifa doesn't read from the L2T cache). 3126bf215546Sopenharmony_ci */ 3127bf215546Sopenharmony_ci if (is_ssbo && !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE)) 3128bf215546Sopenharmony_ci return false; 3129bf215546Sopenharmony_ci 3130bf215546Sopenharmony_ci /* Just as with SSBOs, we can't use ldunifa to read indirect uniforms 3131bf215546Sopenharmony_ci * that we may have been written to scratch using the TMU. 3132bf215546Sopenharmony_ci */ 3133bf215546Sopenharmony_ci bool dynamic_src = !nir_src_is_const(offset); 3134bf215546Sopenharmony_ci if (is_uniform && dynamic_src && c->s->scratch_size > 0) 3135bf215546Sopenharmony_ci return false; 3136bf215546Sopenharmony_ci 3137bf215546Sopenharmony_ci uint32_t const_offset = dynamic_src ? 0 : nir_src_as_uint(offset); 3138bf215546Sopenharmony_ci if (is_uniform) 3139bf215546Sopenharmony_ci const_offset += nir_intrinsic_base(instr); 3140bf215546Sopenharmony_ci 3141bf215546Sopenharmony_ci /* ldunifa is a 32-bit load instruction so we can only use it with 3142bf215546Sopenharmony_ci * 32-bit aligned addresses. We always produce 32-bit aligned addresses 3143bf215546Sopenharmony_ci * except for types smaller than 32-bit, so in these cases we can only 3144bf215546Sopenharmony_ci * use ldunifa if we can verify alignment, which we can only do for 3145bf215546Sopenharmony_ci * loads with a constant offset. 3146bf215546Sopenharmony_ci */ 3147bf215546Sopenharmony_ci uint32_t bit_size = nir_dest_bit_size(instr->dest); 3148bf215546Sopenharmony_ci uint32_t value_skips = 0; 3149bf215546Sopenharmony_ci if (bit_size < 32) { 3150bf215546Sopenharmony_ci if (dynamic_src) { 3151bf215546Sopenharmony_ci return false; 3152bf215546Sopenharmony_ci } else if (const_offset % 4 != 0) { 3153bf215546Sopenharmony_ci /* If we are loading from an unaligned offset, fix 3154bf215546Sopenharmony_ci * alignment and skip over unused elements in result. 3155bf215546Sopenharmony_ci */ 3156bf215546Sopenharmony_ci value_skips = (const_offset % 4) / (bit_size / 8); 3157bf215546Sopenharmony_ci const_offset &= ~0x3; 3158bf215546Sopenharmony_ci } 3159bf215546Sopenharmony_ci } 3160bf215546Sopenharmony_ci 3161bf215546Sopenharmony_ci assert((bit_size == 32 && value_skips == 0) || 3162bf215546Sopenharmony_ci (bit_size == 16 && value_skips <= 1) || 3163bf215546Sopenharmony_ci (bit_size == 8 && value_skips <= 3)); 3164bf215546Sopenharmony_ci 3165bf215546Sopenharmony_ci /* Both Vulkan and OpenGL reserve index 0 for uniforms / push 3166bf215546Sopenharmony_ci * constants. 3167bf215546Sopenharmony_ci */ 3168bf215546Sopenharmony_ci uint32_t index = is_uniform ? 0 : nir_src_as_uint(instr->src[0]); 3169bf215546Sopenharmony_ci 3170bf215546Sopenharmony_ci /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index 3171bf215546Sopenharmony_ci * shifted up by 1 (0 is gallium's constant buffer 0). 3172bf215546Sopenharmony_ci */ 3173bf215546Sopenharmony_ci if (is_ubo && c->key->environment == V3D_ENVIRONMENT_OPENGL) 3174bf215546Sopenharmony_ci index++; 3175bf215546Sopenharmony_ci 3176bf215546Sopenharmony_ci /* We can only keep track of the last unifa address we used with 3177bf215546Sopenharmony_ci * constant offset loads. If the new load targets the same buffer and 3178bf215546Sopenharmony_ci * is close enough to the previous load, we can skip the unifa register 3179bf215546Sopenharmony_ci * write by emitting dummy ldunifa instructions to update the unifa 3180bf215546Sopenharmony_ci * address. 3181bf215546Sopenharmony_ci */ 3182bf215546Sopenharmony_ci bool skip_unifa = false; 3183bf215546Sopenharmony_ci uint32_t ldunifa_skips = 0; 3184bf215546Sopenharmony_ci if (dynamic_src) { 3185bf215546Sopenharmony_ci c->current_unifa_block = NULL; 3186bf215546Sopenharmony_ci } else if (c->cur_block == c->current_unifa_block && 3187bf215546Sopenharmony_ci c->current_unifa_is_ubo == !is_ssbo && 3188bf215546Sopenharmony_ci c->current_unifa_index == index && 3189bf215546Sopenharmony_ci c->current_unifa_offset <= const_offset && 3190bf215546Sopenharmony_ci c->current_unifa_offset + max_unifa_skip_dist >= const_offset) { 3191bf215546Sopenharmony_ci skip_unifa = true; 3192bf215546Sopenharmony_ci ldunifa_skips = (const_offset - c->current_unifa_offset) / 4; 3193bf215546Sopenharmony_ci } else { 3194bf215546Sopenharmony_ci c->current_unifa_block = c->cur_block; 3195bf215546Sopenharmony_ci c->current_unifa_is_ubo = !is_ssbo; 3196bf215546Sopenharmony_ci c->current_unifa_index = index; 3197bf215546Sopenharmony_ci c->current_unifa_offset = const_offset; 3198bf215546Sopenharmony_ci } 3199bf215546Sopenharmony_ci 3200bf215546Sopenharmony_ci if (!skip_unifa) { 3201bf215546Sopenharmony_ci struct qreg base_offset = !is_ssbo ? 3202bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_UBO_ADDR, 3203bf215546Sopenharmony_ci v3d_unit_data_create(index, const_offset)) : 3204bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_SSBO_OFFSET, index); 3205bf215546Sopenharmony_ci 3206bf215546Sopenharmony_ci struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); 3207bf215546Sopenharmony_ci if (!dynamic_src) { 3208bf215546Sopenharmony_ci if (!is_ssbo) { 3209bf215546Sopenharmony_ci vir_MOV_dest(c, unifa, base_offset); 3210bf215546Sopenharmony_ci } else { 3211bf215546Sopenharmony_ci vir_ADD_dest(c, unifa, base_offset, 3212bf215546Sopenharmony_ci vir_uniform_ui(c, const_offset)); 3213bf215546Sopenharmony_ci } 3214bf215546Sopenharmony_ci } else { 3215bf215546Sopenharmony_ci vir_ADD_dest(c, unifa, base_offset, 3216bf215546Sopenharmony_ci ntq_get_src(c, offset, 0)); 3217bf215546Sopenharmony_ci } 3218bf215546Sopenharmony_ci } else { 3219bf215546Sopenharmony_ci for (int i = 0; i < ldunifa_skips; i++) 3220bf215546Sopenharmony_ci emit_ldunifa(c, NULL); 3221bf215546Sopenharmony_ci } 3222bf215546Sopenharmony_ci 3223bf215546Sopenharmony_ci uint32_t num_components = nir_intrinsic_dest_components(instr); 3224bf215546Sopenharmony_ci for (uint32_t i = 0; i < num_components; ) { 3225bf215546Sopenharmony_ci struct qreg data; 3226bf215546Sopenharmony_ci emit_ldunifa(c, &data); 3227bf215546Sopenharmony_ci 3228bf215546Sopenharmony_ci if (bit_size == 32) { 3229bf215546Sopenharmony_ci assert(value_skips == 0); 3230bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data)); 3231bf215546Sopenharmony_ci i++; 3232bf215546Sopenharmony_ci } else { 3233bf215546Sopenharmony_ci assert((bit_size == 16 && value_skips <= 1) || 3234bf215546Sopenharmony_ci (bit_size == 8 && value_skips <= 3)); 3235bf215546Sopenharmony_ci 3236bf215546Sopenharmony_ci /* If we have any values to skip, shift to the first 3237bf215546Sopenharmony_ci * valid value in the ldunifa result. 3238bf215546Sopenharmony_ci */ 3239bf215546Sopenharmony_ci if (value_skips > 0) { 3240bf215546Sopenharmony_ci data = vir_SHR(c, data, 3241bf215546Sopenharmony_ci vir_uniform_ui(c, bit_size * 3242bf215546Sopenharmony_ci value_skips)); 3243bf215546Sopenharmony_ci } 3244bf215546Sopenharmony_ci 3245bf215546Sopenharmony_ci /* Check how many valid components we have discounting 3246bf215546Sopenharmony_ci * read components to skip. 3247bf215546Sopenharmony_ci */ 3248bf215546Sopenharmony_ci uint32_t valid_count = (32 / bit_size) - value_skips; 3249bf215546Sopenharmony_ci assert((bit_size == 16 && valid_count <= 2) || 3250bf215546Sopenharmony_ci (bit_size == 8 && valid_count <= 4)); 3251bf215546Sopenharmony_ci assert(valid_count > 0); 3252bf215546Sopenharmony_ci 3253bf215546Sopenharmony_ci /* Process the valid components */ 3254bf215546Sopenharmony_ci do { 3255bf215546Sopenharmony_ci struct qreg tmp; 3256bf215546Sopenharmony_ci uint32_t mask = (1 << bit_size) - 1; 3257bf215546Sopenharmony_ci tmp = vir_AND(c, vir_MOV(c, data), 3258bf215546Sopenharmony_ci vir_uniform_ui(c, mask)); 3259bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, i, 3260bf215546Sopenharmony_ci vir_MOV(c, tmp)); 3261bf215546Sopenharmony_ci i++; 3262bf215546Sopenharmony_ci valid_count--; 3263bf215546Sopenharmony_ci 3264bf215546Sopenharmony_ci /* Shift to next component */ 3265bf215546Sopenharmony_ci if (i < num_components && valid_count > 0) { 3266bf215546Sopenharmony_ci data = vir_SHR(c, data, 3267bf215546Sopenharmony_ci vir_uniform_ui(c, bit_size)); 3268bf215546Sopenharmony_ci } 3269bf215546Sopenharmony_ci } while (i < num_components && valid_count > 0); 3270bf215546Sopenharmony_ci } 3271bf215546Sopenharmony_ci } 3272bf215546Sopenharmony_ci 3273bf215546Sopenharmony_ci return true; 3274bf215546Sopenharmony_ci} 3275bf215546Sopenharmony_ci 3276bf215546Sopenharmony_cistatic inline struct qreg 3277bf215546Sopenharmony_ciemit_load_local_invocation_index(struct v3d_compile *c) 3278bf215546Sopenharmony_ci{ 3279bf215546Sopenharmony_ci return vir_SHR(c, c->cs_payload[1], 3280bf215546Sopenharmony_ci vir_uniform_ui(c, 32 - c->local_invocation_index_bits)); 3281bf215546Sopenharmony_ci} 3282bf215546Sopenharmony_ci 3283bf215546Sopenharmony_ci/* Various subgroup operations rely on the A flags, so this helper ensures that 3284bf215546Sopenharmony_ci * A flags represents currently active lanes in the subgroup. 3285bf215546Sopenharmony_ci */ 3286bf215546Sopenharmony_cistatic void 3287bf215546Sopenharmony_ciset_a_flags_for_subgroup(struct v3d_compile *c) 3288bf215546Sopenharmony_ci{ 3289bf215546Sopenharmony_ci /* MSF returns 0 for disabled lanes in compute shaders so 3290bf215546Sopenharmony_ci * PUSHZ will set A=1 for disabled lanes. We want the inverse 3291bf215546Sopenharmony_ci * of this but we don't have any means to negate the A flags 3292bf215546Sopenharmony_ci * directly, but we can do it by repeating the same operation 3293bf215546Sopenharmony_ci * with NORZ (A = ~A & ~Z). 3294bf215546Sopenharmony_ci */ 3295bf215546Sopenharmony_ci assert(c->s->info.stage == MESA_SHADER_COMPUTE); 3296bf215546Sopenharmony_ci vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); 3297bf215546Sopenharmony_ci vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_UF_NORZ); 3298bf215546Sopenharmony_ci 3299bf215546Sopenharmony_ci /* If we are under non-uniform control flow we also need to 3300bf215546Sopenharmony_ci * AND the A flags with the current execute mask. 3301bf215546Sopenharmony_ci */ 3302bf215546Sopenharmony_ci if (vir_in_nonuniform_control_flow(c)) { 3303bf215546Sopenharmony_ci const uint32_t bidx = c->cur_block->index; 3304bf215546Sopenharmony_ci vir_set_uf(c, vir_XOR_dest(c, vir_nop_reg(), 3305bf215546Sopenharmony_ci c->execute, 3306bf215546Sopenharmony_ci vir_uniform_ui(c, bidx)), 3307bf215546Sopenharmony_ci V3D_QPU_UF_ANDZ); 3308bf215546Sopenharmony_ci } 3309bf215546Sopenharmony_ci} 3310bf215546Sopenharmony_ci 3311bf215546Sopenharmony_cistatic void 3312bf215546Sopenharmony_cintq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) 3313bf215546Sopenharmony_ci{ 3314bf215546Sopenharmony_ci switch (instr->intrinsic) { 3315bf215546Sopenharmony_ci case nir_intrinsic_load_uniform: 3316bf215546Sopenharmony_ci ntq_emit_load_uniform(c, instr); 3317bf215546Sopenharmony_ci break; 3318bf215546Sopenharmony_ci 3319bf215546Sopenharmony_ci case nir_intrinsic_load_global_2x32: 3320bf215546Sopenharmony_ci ntq_emit_tmu_general(c, instr, false, true); 3321bf215546Sopenharmony_ci c->has_general_tmu_load = true; 3322bf215546Sopenharmony_ci break; 3323bf215546Sopenharmony_ci 3324bf215546Sopenharmony_ci case nir_intrinsic_load_ubo: 3325bf215546Sopenharmony_ci if (ntq_emit_inline_ubo_load(c, instr)) 3326bf215546Sopenharmony_ci break; 3327bf215546Sopenharmony_ci FALLTHROUGH; 3328bf215546Sopenharmony_ci case nir_intrinsic_load_ssbo: 3329bf215546Sopenharmony_ci if (!ntq_emit_load_unifa(c, instr)) { 3330bf215546Sopenharmony_ci ntq_emit_tmu_general(c, instr, false, false); 3331bf215546Sopenharmony_ci c->has_general_tmu_load = true; 3332bf215546Sopenharmony_ci } 3333bf215546Sopenharmony_ci break; 3334bf215546Sopenharmony_ci 3335bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_add: 3336bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_imin: 3337bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_umin: 3338bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_imax: 3339bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_umax: 3340bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_and: 3341bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_or: 3342bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_xor: 3343bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_exchange: 3344bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_comp_swap: 3345bf215546Sopenharmony_ci case nir_intrinsic_store_ssbo: 3346bf215546Sopenharmony_ci ntq_emit_tmu_general(c, instr, false, false); 3347bf215546Sopenharmony_ci break; 3348bf215546Sopenharmony_ci 3349bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_add_2x32: 3350bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_imin_2x32: 3351bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_umin_2x32: 3352bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_imax_2x32: 3353bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_umax_2x32: 3354bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_and_2x32: 3355bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_or_2x32: 3356bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_xor_2x32: 3357bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_exchange_2x32: 3358bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_comp_swap_2x32: 3359bf215546Sopenharmony_ci case nir_intrinsic_store_global_2x32: 3360bf215546Sopenharmony_ci ntq_emit_tmu_general(c, instr, false, true); 3361bf215546Sopenharmony_ci break; 3362bf215546Sopenharmony_ci 3363bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_add: 3364bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_imin: 3365bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_umin: 3366bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_imax: 3367bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_umax: 3368bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_and: 3369bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_or: 3370bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_xor: 3371bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_exchange: 3372bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_comp_swap: 3373bf215546Sopenharmony_ci case nir_intrinsic_store_shared: 3374bf215546Sopenharmony_ci case nir_intrinsic_store_scratch: 3375bf215546Sopenharmony_ci ntq_emit_tmu_general(c, instr, true, false); 3376bf215546Sopenharmony_ci break; 3377bf215546Sopenharmony_ci 3378bf215546Sopenharmony_ci case nir_intrinsic_load_scratch: 3379bf215546Sopenharmony_ci case nir_intrinsic_load_shared: 3380bf215546Sopenharmony_ci ntq_emit_tmu_general(c, instr, true, false); 3381bf215546Sopenharmony_ci c->has_general_tmu_load = true; 3382bf215546Sopenharmony_ci break; 3383bf215546Sopenharmony_ci 3384bf215546Sopenharmony_ci case nir_intrinsic_image_store: 3385bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_add: 3386bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_imin: 3387bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_umin: 3388bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_imax: 3389bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_umax: 3390bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_and: 3391bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_or: 3392bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_xor: 3393bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_exchange: 3394bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_comp_swap: 3395bf215546Sopenharmony_ci v3d40_vir_emit_image_load_store(c, instr); 3396bf215546Sopenharmony_ci break; 3397bf215546Sopenharmony_ci 3398bf215546Sopenharmony_ci case nir_intrinsic_image_load: 3399bf215546Sopenharmony_ci v3d40_vir_emit_image_load_store(c, instr); 3400bf215546Sopenharmony_ci /* Not really a general TMU load, but we only use this flag 3401bf215546Sopenharmony_ci * for NIR scheduling and we do schedule these under the same 3402bf215546Sopenharmony_ci * policy as general TMU. 3403bf215546Sopenharmony_ci */ 3404bf215546Sopenharmony_ci c->has_general_tmu_load = true; 3405bf215546Sopenharmony_ci break; 3406bf215546Sopenharmony_ci 3407bf215546Sopenharmony_ci case nir_intrinsic_get_ssbo_size: 3408bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 3409bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_GET_SSBO_SIZE, 3410bf215546Sopenharmony_ci nir_src_comp_as_uint(instr->src[0], 0))); 3411bf215546Sopenharmony_ci break; 3412bf215546Sopenharmony_ci 3413bf215546Sopenharmony_ci case nir_intrinsic_get_ubo_size: 3414bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 3415bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_GET_UBO_SIZE, 3416bf215546Sopenharmony_ci nir_src_comp_as_uint(instr->src[0], 0))); 3417bf215546Sopenharmony_ci break; 3418bf215546Sopenharmony_ci 3419bf215546Sopenharmony_ci case nir_intrinsic_load_user_clip_plane: 3420bf215546Sopenharmony_ci for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) { 3421bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, i, 3422bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, 3423bf215546Sopenharmony_ci nir_intrinsic_ucp_id(instr) * 3424bf215546Sopenharmony_ci 4 + i)); 3425bf215546Sopenharmony_ci } 3426bf215546Sopenharmony_ci break; 3427bf215546Sopenharmony_ci 3428bf215546Sopenharmony_ci case nir_intrinsic_load_viewport_x_scale: 3429bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 3430bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0)); 3431bf215546Sopenharmony_ci break; 3432bf215546Sopenharmony_ci 3433bf215546Sopenharmony_ci case nir_intrinsic_load_viewport_y_scale: 3434bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 3435bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0)); 3436bf215546Sopenharmony_ci break; 3437bf215546Sopenharmony_ci 3438bf215546Sopenharmony_ci case nir_intrinsic_load_viewport_z_scale: 3439bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 3440bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0)); 3441bf215546Sopenharmony_ci break; 3442bf215546Sopenharmony_ci 3443bf215546Sopenharmony_ci case nir_intrinsic_load_viewport_z_offset: 3444bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 3445bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0)); 3446bf215546Sopenharmony_ci break; 3447bf215546Sopenharmony_ci 3448bf215546Sopenharmony_ci case nir_intrinsic_load_line_coord: 3449bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x)); 3450bf215546Sopenharmony_ci break; 3451bf215546Sopenharmony_ci 3452bf215546Sopenharmony_ci case nir_intrinsic_load_line_width: 3453bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 3454bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_LINE_WIDTH, 0)); 3455bf215546Sopenharmony_ci break; 3456bf215546Sopenharmony_ci 3457bf215546Sopenharmony_ci case nir_intrinsic_load_aa_line_width: 3458bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 3459bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0)); 3460bf215546Sopenharmony_ci break; 3461bf215546Sopenharmony_ci 3462bf215546Sopenharmony_ci case nir_intrinsic_load_sample_mask_in: 3463bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, vir_MSF(c)); 3464bf215546Sopenharmony_ci break; 3465bf215546Sopenharmony_ci 3466bf215546Sopenharmony_ci case nir_intrinsic_load_helper_invocation: 3467bf215546Sopenharmony_ci vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); 3468bf215546Sopenharmony_ci struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); 3469bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, qdest); 3470bf215546Sopenharmony_ci break; 3471bf215546Sopenharmony_ci 3472bf215546Sopenharmony_ci case nir_intrinsic_load_front_face: 3473bf215546Sopenharmony_ci /* The register contains 0 (front) or 1 (back), and we need to 3474bf215546Sopenharmony_ci * turn it into a NIR bool where true means front. 3475bf215546Sopenharmony_ci */ 3476bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 3477bf215546Sopenharmony_ci vir_ADD(c, 3478bf215546Sopenharmony_ci vir_uniform_ui(c, -1), 3479bf215546Sopenharmony_ci vir_REVF(c))); 3480bf215546Sopenharmony_ci break; 3481bf215546Sopenharmony_ci 3482bf215546Sopenharmony_ci case nir_intrinsic_load_base_instance: 3483bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->biid)); 3484bf215546Sopenharmony_ci break; 3485bf215546Sopenharmony_ci 3486bf215546Sopenharmony_ci case nir_intrinsic_load_instance_id: 3487bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid)); 3488bf215546Sopenharmony_ci break; 3489bf215546Sopenharmony_ci 3490bf215546Sopenharmony_ci case nir_intrinsic_load_vertex_id: 3491bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid)); 3492bf215546Sopenharmony_ci break; 3493bf215546Sopenharmony_ci 3494bf215546Sopenharmony_ci case nir_intrinsic_load_tlb_color_v3d: 3495bf215546Sopenharmony_ci vir_emit_tlb_color_read(c, instr); 3496bf215546Sopenharmony_ci break; 3497bf215546Sopenharmony_ci 3498bf215546Sopenharmony_ci case nir_intrinsic_load_input: 3499bf215546Sopenharmony_ci ntq_emit_load_input(c, instr); 3500bf215546Sopenharmony_ci break; 3501bf215546Sopenharmony_ci 3502bf215546Sopenharmony_ci case nir_intrinsic_store_tlb_sample_color_v3d: 3503bf215546Sopenharmony_ci ntq_emit_per_sample_color_write(c, instr); 3504bf215546Sopenharmony_ci break; 3505bf215546Sopenharmony_ci 3506bf215546Sopenharmony_ci case nir_intrinsic_store_output: 3507bf215546Sopenharmony_ci ntq_emit_store_output(c, instr); 3508bf215546Sopenharmony_ci break; 3509bf215546Sopenharmony_ci 3510bf215546Sopenharmony_ci case nir_intrinsic_image_size: 3511bf215546Sopenharmony_ci ntq_emit_image_size(c, instr); 3512bf215546Sopenharmony_ci break; 3513bf215546Sopenharmony_ci 3514bf215546Sopenharmony_ci case nir_intrinsic_discard: 3515bf215546Sopenharmony_ci ntq_flush_tmu(c); 3516bf215546Sopenharmony_ci 3517bf215546Sopenharmony_ci if (vir_in_nonuniform_control_flow(c)) { 3518bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 3519bf215546Sopenharmony_ci V3D_QPU_PF_PUSHZ); 3520bf215546Sopenharmony_ci vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(), 3521bf215546Sopenharmony_ci vir_uniform_ui(c, 0)), 3522bf215546Sopenharmony_ci V3D_QPU_COND_IFA); 3523bf215546Sopenharmony_ci } else { 3524bf215546Sopenharmony_ci vir_SETMSF_dest(c, vir_nop_reg(), 3525bf215546Sopenharmony_ci vir_uniform_ui(c, 0)); 3526bf215546Sopenharmony_ci } 3527bf215546Sopenharmony_ci break; 3528bf215546Sopenharmony_ci 3529bf215546Sopenharmony_ci case nir_intrinsic_discard_if: { 3530bf215546Sopenharmony_ci ntq_flush_tmu(c); 3531bf215546Sopenharmony_ci 3532bf215546Sopenharmony_ci enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]); 3533bf215546Sopenharmony_ci 3534bf215546Sopenharmony_ci if (vir_in_nonuniform_control_flow(c)) { 3535bf215546Sopenharmony_ci struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(), 3536bf215546Sopenharmony_ci c->execute); 3537bf215546Sopenharmony_ci if (cond == V3D_QPU_COND_IFA) { 3538bf215546Sopenharmony_ci vir_set_uf(c, exec_flag, V3D_QPU_UF_ANDZ); 3539bf215546Sopenharmony_ci } else { 3540bf215546Sopenharmony_ci vir_set_uf(c, exec_flag, V3D_QPU_UF_NORNZ); 3541bf215546Sopenharmony_ci cond = V3D_QPU_COND_IFA; 3542bf215546Sopenharmony_ci } 3543bf215546Sopenharmony_ci } 3544bf215546Sopenharmony_ci 3545bf215546Sopenharmony_ci vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(), 3546bf215546Sopenharmony_ci vir_uniform_ui(c, 0)), cond); 3547bf215546Sopenharmony_ci 3548bf215546Sopenharmony_ci break; 3549bf215546Sopenharmony_ci } 3550bf215546Sopenharmony_ci 3551bf215546Sopenharmony_ci case nir_intrinsic_memory_barrier: 3552bf215546Sopenharmony_ci case nir_intrinsic_memory_barrier_buffer: 3553bf215546Sopenharmony_ci case nir_intrinsic_memory_barrier_image: 3554bf215546Sopenharmony_ci case nir_intrinsic_memory_barrier_shared: 3555bf215546Sopenharmony_ci case nir_intrinsic_memory_barrier_tcs_patch: 3556bf215546Sopenharmony_ci case nir_intrinsic_group_memory_barrier: 3557bf215546Sopenharmony_ci /* We don't do any instruction scheduling of these NIR 3558bf215546Sopenharmony_ci * instructions between each other, so we just need to make 3559bf215546Sopenharmony_ci * sure that the TMU operations before the barrier are flushed 3560bf215546Sopenharmony_ci * before the ones after the barrier. 3561bf215546Sopenharmony_ci */ 3562bf215546Sopenharmony_ci ntq_flush_tmu(c); 3563bf215546Sopenharmony_ci break; 3564bf215546Sopenharmony_ci 3565bf215546Sopenharmony_ci case nir_intrinsic_control_barrier: 3566bf215546Sopenharmony_ci /* Emit a TSY op to get all invocations in the workgroup 3567bf215546Sopenharmony_ci * (actually supergroup) to block until the last invocation 3568bf215546Sopenharmony_ci * reaches the TSY op. 3569bf215546Sopenharmony_ci */ 3570bf215546Sopenharmony_ci ntq_flush_tmu(c); 3571bf215546Sopenharmony_ci 3572bf215546Sopenharmony_ci if (c->devinfo->ver >= 42) { 3573bf215546Sopenharmony_ci vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, 3574bf215546Sopenharmony_ci V3D_QPU_WADDR_SYNCB)); 3575bf215546Sopenharmony_ci } else { 3576bf215546Sopenharmony_ci struct qinst *sync = 3577bf215546Sopenharmony_ci vir_BARRIERID_dest(c, 3578bf215546Sopenharmony_ci vir_reg(QFILE_MAGIC, 3579bf215546Sopenharmony_ci V3D_QPU_WADDR_SYNCU)); 3580bf215546Sopenharmony_ci sync->uniform = 3581bf215546Sopenharmony_ci vir_get_uniform_index(c, QUNIFORM_CONSTANT, 3582bf215546Sopenharmony_ci 0xffffff00 | 3583bf215546Sopenharmony_ci V3D_TSY_WAIT_INC_CHECK); 3584bf215546Sopenharmony_ci 3585bf215546Sopenharmony_ci } 3586bf215546Sopenharmony_ci 3587bf215546Sopenharmony_ci /* The blocking of a TSY op only happens at the next thread 3588bf215546Sopenharmony_ci * switch. No texturing may be outstanding at the time of a 3589bf215546Sopenharmony_ci * TSY blocking operation. 3590bf215546Sopenharmony_ci */ 3591bf215546Sopenharmony_ci vir_emit_thrsw(c); 3592bf215546Sopenharmony_ci break; 3593bf215546Sopenharmony_ci 3594bf215546Sopenharmony_ci case nir_intrinsic_load_num_workgroups: 3595bf215546Sopenharmony_ci for (int i = 0; i < 3; i++) { 3596bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, i, 3597bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS, 3598bf215546Sopenharmony_ci i)); 3599bf215546Sopenharmony_ci } 3600bf215546Sopenharmony_ci break; 3601bf215546Sopenharmony_ci 3602bf215546Sopenharmony_ci case nir_intrinsic_load_workgroup_id: { 3603bf215546Sopenharmony_ci struct qreg x = vir_AND(c, c->cs_payload[0], 3604bf215546Sopenharmony_ci vir_uniform_ui(c, 0xffff)); 3605bf215546Sopenharmony_ci 3606bf215546Sopenharmony_ci struct qreg y = vir_SHR(c, c->cs_payload[0], 3607bf215546Sopenharmony_ci vir_uniform_ui(c, 16)); 3608bf215546Sopenharmony_ci 3609bf215546Sopenharmony_ci struct qreg z = vir_AND(c, c->cs_payload[1], 3610bf215546Sopenharmony_ci vir_uniform_ui(c, 0xffff)); 3611bf215546Sopenharmony_ci 3612bf215546Sopenharmony_ci /* We only support dispatch base in Vulkan */ 3613bf215546Sopenharmony_ci if (c->key->environment == V3D_ENVIRONMENT_VULKAN) { 3614bf215546Sopenharmony_ci x = vir_ADD(c, x, 3615bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0)); 3616bf215546Sopenharmony_ci y = vir_ADD(c, y, 3617bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1)); 3618bf215546Sopenharmony_ci z = vir_ADD(c, z, 3619bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2)); 3620bf215546Sopenharmony_ci } 3621bf215546Sopenharmony_ci 3622bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, x)); 3623bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, y)); 3624bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 2, vir_MOV(c, z)); 3625bf215546Sopenharmony_ci break; 3626bf215546Sopenharmony_ci } 3627bf215546Sopenharmony_ci 3628bf215546Sopenharmony_ci case nir_intrinsic_load_local_invocation_index: 3629bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 3630bf215546Sopenharmony_ci emit_load_local_invocation_index(c)); 3631bf215546Sopenharmony_ci break; 3632bf215546Sopenharmony_ci 3633bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_id: { 3634bf215546Sopenharmony_ci /* This is basically the batch index, which is the Local 3635bf215546Sopenharmony_ci * Invocation Index divided by the SIMD width). 3636bf215546Sopenharmony_ci */ 3637bf215546Sopenharmony_ci STATIC_ASSERT(IS_POT(V3D_CHANNELS) && V3D_CHANNELS > 0); 3638bf215546Sopenharmony_ci const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1; 3639bf215546Sopenharmony_ci struct qreg lii = emit_load_local_invocation_index(c); 3640bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 3641bf215546Sopenharmony_ci vir_SHR(c, lii, 3642bf215546Sopenharmony_ci vir_uniform_ui(c, divide_shift))); 3643bf215546Sopenharmony_ci break; 3644bf215546Sopenharmony_ci } 3645bf215546Sopenharmony_ci 3646bf215546Sopenharmony_ci case nir_intrinsic_load_per_vertex_input: { 3647bf215546Sopenharmony_ci /* The vertex shader writes all its used outputs into 3648bf215546Sopenharmony_ci * consecutive VPM offsets, so if any output component is 3649bf215546Sopenharmony_ci * unused, its VPM offset is used by the next used 3650bf215546Sopenharmony_ci * component. This means that we can't assume that each 3651bf215546Sopenharmony_ci * location will use 4 consecutive scalar offsets in the VPM 3652bf215546Sopenharmony_ci * and we need to compute the VPM offset for each input by 3653bf215546Sopenharmony_ci * going through the inputs and finding the one that matches 3654bf215546Sopenharmony_ci * our location and component. 3655bf215546Sopenharmony_ci * 3656bf215546Sopenharmony_ci * col: vertex index, row = varying index 3657bf215546Sopenharmony_ci */ 3658bf215546Sopenharmony_ci assert(nir_src_is_const(instr->src[1])); 3659bf215546Sopenharmony_ci uint32_t location = 3660bf215546Sopenharmony_ci nir_intrinsic_io_semantics(instr).location + 3661bf215546Sopenharmony_ci nir_src_as_uint(instr->src[1]); 3662bf215546Sopenharmony_ci uint32_t component = nir_intrinsic_component(instr); 3663bf215546Sopenharmony_ci 3664bf215546Sopenharmony_ci int32_t row_idx = -1; 3665bf215546Sopenharmony_ci for (int i = 0; i < c->num_inputs; i++) { 3666bf215546Sopenharmony_ci struct v3d_varying_slot slot = c->input_slots[i]; 3667bf215546Sopenharmony_ci if (v3d_slot_get_slot(slot) == location && 3668bf215546Sopenharmony_ci v3d_slot_get_component(slot) == component) { 3669bf215546Sopenharmony_ci row_idx = i; 3670bf215546Sopenharmony_ci break; 3671bf215546Sopenharmony_ci } 3672bf215546Sopenharmony_ci } 3673bf215546Sopenharmony_ci 3674bf215546Sopenharmony_ci assert(row_idx != -1); 3675bf215546Sopenharmony_ci 3676bf215546Sopenharmony_ci struct qreg col = ntq_get_src(c, instr->src[0], 0); 3677bf215546Sopenharmony_ci for (int i = 0; i < instr->num_components; i++) { 3678bf215546Sopenharmony_ci struct qreg row = vir_uniform_ui(c, row_idx++); 3679bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, i, 3680bf215546Sopenharmony_ci vir_LDVPMG_IN(c, row, col)); 3681bf215546Sopenharmony_ci } 3682bf215546Sopenharmony_ci break; 3683bf215546Sopenharmony_ci } 3684bf215546Sopenharmony_ci 3685bf215546Sopenharmony_ci case nir_intrinsic_emit_vertex: 3686bf215546Sopenharmony_ci case nir_intrinsic_end_primitive: 3687bf215546Sopenharmony_ci unreachable("Should have been lowered in v3d_nir_lower_io"); 3688bf215546Sopenharmony_ci break; 3689bf215546Sopenharmony_ci 3690bf215546Sopenharmony_ci case nir_intrinsic_load_primitive_id: { 3691bf215546Sopenharmony_ci /* gl_PrimitiveIdIn is written by the GBG in the first word of 3692bf215546Sopenharmony_ci * VPM output header. According to docs, we should read this 3693bf215546Sopenharmony_ci * using ldvpm(v,d)_in (See Table 71). 3694bf215546Sopenharmony_ci */ 3695bf215546Sopenharmony_ci assert(c->s->info.stage == MESA_SHADER_GEOMETRY); 3696bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 3697bf215546Sopenharmony_ci vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); 3698bf215546Sopenharmony_ci break; 3699bf215546Sopenharmony_ci } 3700bf215546Sopenharmony_ci 3701bf215546Sopenharmony_ci case nir_intrinsic_load_invocation_id: 3702bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, vir_IID(c)); 3703bf215546Sopenharmony_ci break; 3704bf215546Sopenharmony_ci 3705bf215546Sopenharmony_ci case nir_intrinsic_load_fb_layers_v3d: 3706bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 3707bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_FB_LAYERS, 0)); 3708bf215546Sopenharmony_ci break; 3709bf215546Sopenharmony_ci 3710bf215546Sopenharmony_ci case nir_intrinsic_load_sample_id: 3711bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, vir_SAMPID(c)); 3712bf215546Sopenharmony_ci break; 3713bf215546Sopenharmony_ci 3714bf215546Sopenharmony_ci case nir_intrinsic_load_sample_pos: 3715bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 3716bf215546Sopenharmony_ci vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)))); 3717bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 1, 3718bf215546Sopenharmony_ci vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)))); 3719bf215546Sopenharmony_ci break; 3720bf215546Sopenharmony_ci 3721bf215546Sopenharmony_ci case nir_intrinsic_load_barycentric_at_offset: 3722bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 3723bf215546Sopenharmony_ci vir_MOV(c, ntq_get_src(c, instr->src[0], 0))); 3724bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 1, 3725bf215546Sopenharmony_ci vir_MOV(c, ntq_get_src(c, instr->src[0], 1))); 3726bf215546Sopenharmony_ci break; 3727bf215546Sopenharmony_ci 3728bf215546Sopenharmony_ci case nir_intrinsic_load_barycentric_pixel: 3729bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f)); 3730bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f)); 3731bf215546Sopenharmony_ci break; 3732bf215546Sopenharmony_ci 3733bf215546Sopenharmony_ci case nir_intrinsic_load_barycentric_at_sample: { 3734bf215546Sopenharmony_ci if (!c->fs_key->msaa) { 3735bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f)); 3736bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f)); 3737bf215546Sopenharmony_ci return; 3738bf215546Sopenharmony_ci } 3739bf215546Sopenharmony_ci 3740bf215546Sopenharmony_ci struct qreg offset_x, offset_y; 3741bf215546Sopenharmony_ci struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0); 3742bf215546Sopenharmony_ci ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y); 3743bf215546Sopenharmony_ci 3744bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x)); 3745bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y)); 3746bf215546Sopenharmony_ci break; 3747bf215546Sopenharmony_ci } 3748bf215546Sopenharmony_ci 3749bf215546Sopenharmony_ci case nir_intrinsic_load_barycentric_sample: { 3750bf215546Sopenharmony_ci struct qreg offset_x = 3751bf215546Sopenharmony_ci vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))); 3752bf215546Sopenharmony_ci struct qreg offset_y = 3753bf215546Sopenharmony_ci vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))); 3754bf215546Sopenharmony_ci 3755bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 3756bf215546Sopenharmony_ci vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f))); 3757bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 1, 3758bf215546Sopenharmony_ci vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f))); 3759bf215546Sopenharmony_ci break; 3760bf215546Sopenharmony_ci } 3761bf215546Sopenharmony_ci 3762bf215546Sopenharmony_ci case nir_intrinsic_load_barycentric_centroid: { 3763bf215546Sopenharmony_ci struct qreg offset_x, offset_y; 3764bf215546Sopenharmony_ci ntq_get_barycentric_centroid(c, &offset_x, &offset_y); 3765bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x)); 3766bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y)); 3767bf215546Sopenharmony_ci break; 3768bf215546Sopenharmony_ci } 3769bf215546Sopenharmony_ci 3770bf215546Sopenharmony_ci case nir_intrinsic_load_interpolated_input: { 3771bf215546Sopenharmony_ci assert(nir_src_is_const(instr->src[1])); 3772bf215546Sopenharmony_ci const uint32_t offset = nir_src_as_uint(instr->src[1]); 3773bf215546Sopenharmony_ci 3774bf215546Sopenharmony_ci for (int i = 0; i < instr->num_components; i++) { 3775bf215546Sopenharmony_ci const uint32_t input_idx = 3776bf215546Sopenharmony_ci (nir_intrinsic_base(instr) + offset) * 4 + 3777bf215546Sopenharmony_ci nir_intrinsic_component(instr) + i; 3778bf215546Sopenharmony_ci 3779bf215546Sopenharmony_ci /* If we are not in MSAA or if we are not interpolating 3780bf215546Sopenharmony_ci * a user varying, just return the pre-computed 3781bf215546Sopenharmony_ci * interpolated input. 3782bf215546Sopenharmony_ci */ 3783bf215546Sopenharmony_ci if (!c->fs_key->msaa || 3784bf215546Sopenharmony_ci c->interp[input_idx].vp.file == QFILE_NULL) { 3785bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, i, 3786bf215546Sopenharmony_ci vir_MOV(c, c->inputs[input_idx])); 3787bf215546Sopenharmony_ci continue; 3788bf215546Sopenharmony_ci } 3789bf215546Sopenharmony_ci 3790bf215546Sopenharmony_ci /* Otherwise compute interpolation at the specified 3791bf215546Sopenharmony_ci * offset. 3792bf215546Sopenharmony_ci */ 3793bf215546Sopenharmony_ci struct qreg p = c->interp[input_idx].vp; 3794bf215546Sopenharmony_ci struct qreg C = c->interp[input_idx].C; 3795bf215546Sopenharmony_ci unsigned interp_mode = c->interp[input_idx].mode; 3796bf215546Sopenharmony_ci 3797bf215546Sopenharmony_ci struct qreg offset_x = ntq_get_src(c, instr->src[0], 0); 3798bf215546Sopenharmony_ci struct qreg offset_y = ntq_get_src(c, instr->src[0], 1); 3799bf215546Sopenharmony_ci 3800bf215546Sopenharmony_ci struct qreg result = 3801bf215546Sopenharmony_ci ntq_emit_load_interpolated_input(c, p, C, 3802bf215546Sopenharmony_ci offset_x, offset_y, 3803bf215546Sopenharmony_ci interp_mode); 3804bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, i, result); 3805bf215546Sopenharmony_ci } 3806bf215546Sopenharmony_ci break; 3807bf215546Sopenharmony_ci } 3808bf215546Sopenharmony_ci 3809bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_size: 3810bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 3811bf215546Sopenharmony_ci vir_uniform_ui(c, V3D_CHANNELS)); 3812bf215546Sopenharmony_ci break; 3813bf215546Sopenharmony_ci 3814bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_invocation: 3815bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c)); 3816bf215546Sopenharmony_ci break; 3817bf215546Sopenharmony_ci 3818bf215546Sopenharmony_ci case nir_intrinsic_elect: { 3819bf215546Sopenharmony_ci set_a_flags_for_subgroup(c); 3820bf215546Sopenharmony_ci struct qreg first = vir_FLAFIRST(c); 3821bf215546Sopenharmony_ci 3822bf215546Sopenharmony_ci /* Produce a boolean result from Flafirst */ 3823bf215546Sopenharmony_ci vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), 3824bf215546Sopenharmony_ci first, vir_uniform_ui(c, 1)), 3825bf215546Sopenharmony_ci V3D_QPU_PF_PUSHZ); 3826bf215546Sopenharmony_ci struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); 3827bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, result); 3828bf215546Sopenharmony_ci break; 3829bf215546Sopenharmony_ci } 3830bf215546Sopenharmony_ci 3831bf215546Sopenharmony_ci case nir_intrinsic_load_num_subgroups: 3832bf215546Sopenharmony_ci unreachable("Should have been lowered"); 3833bf215546Sopenharmony_ci break; 3834bf215546Sopenharmony_ci 3835bf215546Sopenharmony_ci case nir_intrinsic_load_view_index: 3836bf215546Sopenharmony_ci ntq_store_dest(c, &instr->dest, 0, 3837bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_VIEW_INDEX, 0)); 3838bf215546Sopenharmony_ci break; 3839bf215546Sopenharmony_ci 3840bf215546Sopenharmony_ci default: 3841bf215546Sopenharmony_ci fprintf(stderr, "Unknown intrinsic: "); 3842bf215546Sopenharmony_ci nir_print_instr(&instr->instr, stderr); 3843bf215546Sopenharmony_ci fprintf(stderr, "\n"); 3844bf215546Sopenharmony_ci break; 3845bf215546Sopenharmony_ci } 3846bf215546Sopenharmony_ci} 3847bf215546Sopenharmony_ci 3848bf215546Sopenharmony_ci/* Clears (activates) the execute flags for any channels whose jump target 3849bf215546Sopenharmony_ci * matches this block. 3850bf215546Sopenharmony_ci * 3851bf215546Sopenharmony_ci * XXX perf: Could we be using flpush/flpop somehow for our execution channel 3852bf215546Sopenharmony_ci * enabling? 3853bf215546Sopenharmony_ci * 3854bf215546Sopenharmony_ci */ 3855bf215546Sopenharmony_cistatic void 3856bf215546Sopenharmony_cintq_activate_execute_for_block(struct v3d_compile *c) 3857bf215546Sopenharmony_ci{ 3858bf215546Sopenharmony_ci vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), 3859bf215546Sopenharmony_ci c->execute, vir_uniform_ui(c, c->cur_block->index)), 3860bf215546Sopenharmony_ci V3D_QPU_PF_PUSHZ); 3861bf215546Sopenharmony_ci 3862bf215546Sopenharmony_ci vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); 3863bf215546Sopenharmony_ci} 3864bf215546Sopenharmony_ci 3865bf215546Sopenharmony_cistatic void 3866bf215546Sopenharmony_cintq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt) 3867bf215546Sopenharmony_ci{ 3868bf215546Sopenharmony_ci nir_block *nir_else_block = nir_if_first_else_block(if_stmt); 3869bf215546Sopenharmony_ci bool empty_else_block = 3870bf215546Sopenharmony_ci (nir_else_block == nir_if_last_else_block(if_stmt) && 3871bf215546Sopenharmony_ci exec_list_is_empty(&nir_else_block->instr_list)); 3872bf215546Sopenharmony_ci 3873bf215546Sopenharmony_ci struct qblock *then_block = vir_new_block(c); 3874bf215546Sopenharmony_ci struct qblock *after_block = vir_new_block(c); 3875bf215546Sopenharmony_ci struct qblock *else_block; 3876bf215546Sopenharmony_ci if (empty_else_block) 3877bf215546Sopenharmony_ci else_block = after_block; 3878bf215546Sopenharmony_ci else 3879bf215546Sopenharmony_ci else_block = vir_new_block(c); 3880bf215546Sopenharmony_ci 3881bf215546Sopenharmony_ci /* Check if this if statement is really just a conditional jump with 3882bf215546Sopenharmony_ci * the form: 3883bf215546Sopenharmony_ci * 3884bf215546Sopenharmony_ci * if (cond) { 3885bf215546Sopenharmony_ci * break/continue; 3886bf215546Sopenharmony_ci * } else { 3887bf215546Sopenharmony_ci * } 3888bf215546Sopenharmony_ci * 3889bf215546Sopenharmony_ci * In which case we can skip the jump to ELSE we emit before the THEN 3890bf215546Sopenharmony_ci * block and instead just emit the break/continue directly. 3891bf215546Sopenharmony_ci */ 3892bf215546Sopenharmony_ci nir_jump_instr *conditional_jump = NULL; 3893bf215546Sopenharmony_ci if (empty_else_block) { 3894bf215546Sopenharmony_ci nir_block *nir_then_block = nir_if_first_then_block(if_stmt); 3895bf215546Sopenharmony_ci struct nir_instr *inst = nir_block_first_instr(nir_then_block); 3896bf215546Sopenharmony_ci if (inst && inst->type == nir_instr_type_jump) 3897bf215546Sopenharmony_ci conditional_jump = nir_instr_as_jump(inst); 3898bf215546Sopenharmony_ci } 3899bf215546Sopenharmony_ci 3900bf215546Sopenharmony_ci /* Set up the flags for the IF condition (taking the THEN branch). */ 3901bf215546Sopenharmony_ci enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition); 3902bf215546Sopenharmony_ci 3903bf215546Sopenharmony_ci if (!conditional_jump) { 3904bf215546Sopenharmony_ci /* Jump to ELSE. */ 3905bf215546Sopenharmony_ci struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ? 3906bf215546Sopenharmony_ci V3D_QPU_BRANCH_COND_ANYNA : 3907bf215546Sopenharmony_ci V3D_QPU_BRANCH_COND_ANYA); 3908bf215546Sopenharmony_ci /* Pixels that were not dispatched or have been discarded 3909bf215546Sopenharmony_ci * should not contribute to the ANYA/ANYNA condition. 3910bf215546Sopenharmony_ci */ 3911bf215546Sopenharmony_ci branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P; 3912bf215546Sopenharmony_ci 3913bf215546Sopenharmony_ci vir_link_blocks(c->cur_block, else_block); 3914bf215546Sopenharmony_ci vir_link_blocks(c->cur_block, then_block); 3915bf215546Sopenharmony_ci 3916bf215546Sopenharmony_ci /* Process the THEN block. */ 3917bf215546Sopenharmony_ci vir_set_emit_block(c, then_block); 3918bf215546Sopenharmony_ci ntq_emit_cf_list(c, &if_stmt->then_list); 3919bf215546Sopenharmony_ci 3920bf215546Sopenharmony_ci if (!empty_else_block) { 3921bf215546Sopenharmony_ci /* At the end of the THEN block, jump to ENDIF, unless 3922bf215546Sopenharmony_ci * the block ended in a break or continue. 3923bf215546Sopenharmony_ci */ 3924bf215546Sopenharmony_ci if (!c->cur_block->branch_emitted) { 3925bf215546Sopenharmony_ci vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS); 3926bf215546Sopenharmony_ci vir_link_blocks(c->cur_block, after_block); 3927bf215546Sopenharmony_ci } 3928bf215546Sopenharmony_ci 3929bf215546Sopenharmony_ci /* Emit the else block. */ 3930bf215546Sopenharmony_ci vir_set_emit_block(c, else_block); 3931bf215546Sopenharmony_ci ntq_emit_cf_list(c, &if_stmt->else_list); 3932bf215546Sopenharmony_ci } 3933bf215546Sopenharmony_ci } else { 3934bf215546Sopenharmony_ci /* Emit the conditional jump directly. 3935bf215546Sopenharmony_ci * 3936bf215546Sopenharmony_ci * Use ALL with breaks and ANY with continues to ensure that 3937bf215546Sopenharmony_ci * we always break and never continue when all lanes have been 3938bf215546Sopenharmony_ci * disabled (for example because of discards) to prevent 3939bf215546Sopenharmony_ci * infinite loops. 3940bf215546Sopenharmony_ci */ 3941bf215546Sopenharmony_ci assert(conditional_jump && 3942bf215546Sopenharmony_ci (conditional_jump->type == nir_jump_continue || 3943bf215546Sopenharmony_ci conditional_jump->type == nir_jump_break)); 3944bf215546Sopenharmony_ci 3945bf215546Sopenharmony_ci struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ? 3946bf215546Sopenharmony_ci (conditional_jump->type == nir_jump_break ? 3947bf215546Sopenharmony_ci V3D_QPU_BRANCH_COND_ALLA : 3948bf215546Sopenharmony_ci V3D_QPU_BRANCH_COND_ANYA) : 3949bf215546Sopenharmony_ci (conditional_jump->type == nir_jump_break ? 3950bf215546Sopenharmony_ci V3D_QPU_BRANCH_COND_ALLNA : 3951bf215546Sopenharmony_ci V3D_QPU_BRANCH_COND_ANYNA)); 3952bf215546Sopenharmony_ci branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P; 3953bf215546Sopenharmony_ci 3954bf215546Sopenharmony_ci vir_link_blocks(c->cur_block, 3955bf215546Sopenharmony_ci conditional_jump->type == nir_jump_break ? 3956bf215546Sopenharmony_ci c->loop_break_block : 3957bf215546Sopenharmony_ci c->loop_cont_block); 3958bf215546Sopenharmony_ci } 3959bf215546Sopenharmony_ci 3960bf215546Sopenharmony_ci vir_link_blocks(c->cur_block, after_block); 3961bf215546Sopenharmony_ci 3962bf215546Sopenharmony_ci vir_set_emit_block(c, after_block); 3963bf215546Sopenharmony_ci} 3964bf215546Sopenharmony_ci 3965bf215546Sopenharmony_cistatic void 3966bf215546Sopenharmony_cintq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) 3967bf215546Sopenharmony_ci{ 3968bf215546Sopenharmony_ci nir_block *nir_else_block = nir_if_first_else_block(if_stmt); 3969bf215546Sopenharmony_ci bool empty_else_block = 3970bf215546Sopenharmony_ci (nir_else_block == nir_if_last_else_block(if_stmt) && 3971bf215546Sopenharmony_ci exec_list_is_empty(&nir_else_block->instr_list)); 3972bf215546Sopenharmony_ci 3973bf215546Sopenharmony_ci struct qblock *then_block = vir_new_block(c); 3974bf215546Sopenharmony_ci struct qblock *after_block = vir_new_block(c); 3975bf215546Sopenharmony_ci struct qblock *else_block; 3976bf215546Sopenharmony_ci if (empty_else_block) 3977bf215546Sopenharmony_ci else_block = after_block; 3978bf215546Sopenharmony_ci else 3979bf215546Sopenharmony_ci else_block = vir_new_block(c); 3980bf215546Sopenharmony_ci 3981bf215546Sopenharmony_ci bool was_uniform_control_flow = false; 3982bf215546Sopenharmony_ci if (!vir_in_nonuniform_control_flow(c)) { 3983bf215546Sopenharmony_ci c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); 3984bf215546Sopenharmony_ci was_uniform_control_flow = true; 3985bf215546Sopenharmony_ci } 3986bf215546Sopenharmony_ci 3987bf215546Sopenharmony_ci /* Set up the flags for the IF condition (taking the THEN branch). */ 3988bf215546Sopenharmony_ci enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition); 3989bf215546Sopenharmony_ci 3990bf215546Sopenharmony_ci /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and 3991bf215546Sopenharmony_ci * was previously active (execute Z) for updating the exec flags. 3992bf215546Sopenharmony_ci */ 3993bf215546Sopenharmony_ci if (was_uniform_control_flow) { 3994bf215546Sopenharmony_ci cond = v3d_qpu_cond_invert(cond); 3995bf215546Sopenharmony_ci } else { 3996bf215546Sopenharmony_ci struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute); 3997bf215546Sopenharmony_ci if (cond == V3D_QPU_COND_IFA) { 3998bf215546Sopenharmony_ci vir_set_uf(c, inst, V3D_QPU_UF_NORNZ); 3999bf215546Sopenharmony_ci } else { 4000bf215546Sopenharmony_ci vir_set_uf(c, inst, V3D_QPU_UF_ANDZ); 4001bf215546Sopenharmony_ci cond = V3D_QPU_COND_IFA; 4002bf215546Sopenharmony_ci } 4003bf215546Sopenharmony_ci } 4004bf215546Sopenharmony_ci 4005bf215546Sopenharmony_ci vir_MOV_cond(c, cond, 4006bf215546Sopenharmony_ci c->execute, 4007bf215546Sopenharmony_ci vir_uniform_ui(c, else_block->index)); 4008bf215546Sopenharmony_ci 4009bf215546Sopenharmony_ci /* Jump to ELSE if nothing is active for THEN, otherwise fall 4010bf215546Sopenharmony_ci * through. 4011bf215546Sopenharmony_ci */ 4012bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); 4013bf215546Sopenharmony_ci vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); 4014bf215546Sopenharmony_ci vir_link_blocks(c->cur_block, else_block); 4015bf215546Sopenharmony_ci vir_link_blocks(c->cur_block, then_block); 4016bf215546Sopenharmony_ci 4017bf215546Sopenharmony_ci /* Process the THEN block. */ 4018bf215546Sopenharmony_ci vir_set_emit_block(c, then_block); 4019bf215546Sopenharmony_ci ntq_emit_cf_list(c, &if_stmt->then_list); 4020bf215546Sopenharmony_ci 4021bf215546Sopenharmony_ci if (!empty_else_block) { 4022bf215546Sopenharmony_ci /* Handle the end of the THEN block. First, all currently 4023bf215546Sopenharmony_ci * active channels update their execute flags to point to 4024bf215546Sopenharmony_ci * ENDIF 4025bf215546Sopenharmony_ci */ 4026bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 4027bf215546Sopenharmony_ci V3D_QPU_PF_PUSHZ); 4028bf215546Sopenharmony_ci vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, 4029bf215546Sopenharmony_ci vir_uniform_ui(c, after_block->index)); 4030bf215546Sopenharmony_ci 4031bf215546Sopenharmony_ci /* If everything points at ENDIF, then jump there immediately. */ 4032bf215546Sopenharmony_ci vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), 4033bf215546Sopenharmony_ci c->execute, 4034bf215546Sopenharmony_ci vir_uniform_ui(c, after_block->index)), 4035bf215546Sopenharmony_ci V3D_QPU_PF_PUSHZ); 4036bf215546Sopenharmony_ci vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); 4037bf215546Sopenharmony_ci vir_link_blocks(c->cur_block, after_block); 4038bf215546Sopenharmony_ci vir_link_blocks(c->cur_block, else_block); 4039bf215546Sopenharmony_ci 4040bf215546Sopenharmony_ci vir_set_emit_block(c, else_block); 4041bf215546Sopenharmony_ci ntq_activate_execute_for_block(c); 4042bf215546Sopenharmony_ci ntq_emit_cf_list(c, &if_stmt->else_list); 4043bf215546Sopenharmony_ci } 4044bf215546Sopenharmony_ci 4045bf215546Sopenharmony_ci vir_link_blocks(c->cur_block, after_block); 4046bf215546Sopenharmony_ci 4047bf215546Sopenharmony_ci vir_set_emit_block(c, after_block); 4048bf215546Sopenharmony_ci if (was_uniform_control_flow) 4049bf215546Sopenharmony_ci c->execute = c->undef; 4050bf215546Sopenharmony_ci else 4051bf215546Sopenharmony_ci ntq_activate_execute_for_block(c); 4052bf215546Sopenharmony_ci} 4053bf215546Sopenharmony_ci 4054bf215546Sopenharmony_cistatic void 4055bf215546Sopenharmony_cintq_emit_if(struct v3d_compile *c, nir_if *nif) 4056bf215546Sopenharmony_ci{ 4057bf215546Sopenharmony_ci bool was_in_control_flow = c->in_control_flow; 4058bf215546Sopenharmony_ci c->in_control_flow = true; 4059bf215546Sopenharmony_ci if (!vir_in_nonuniform_control_flow(c) && 4060bf215546Sopenharmony_ci !nir_src_is_divergent(nif->condition)) { 4061bf215546Sopenharmony_ci ntq_emit_uniform_if(c, nif); 4062bf215546Sopenharmony_ci } else { 4063bf215546Sopenharmony_ci ntq_emit_nonuniform_if(c, nif); 4064bf215546Sopenharmony_ci } 4065bf215546Sopenharmony_ci c->in_control_flow = was_in_control_flow; 4066bf215546Sopenharmony_ci} 4067bf215546Sopenharmony_ci 4068bf215546Sopenharmony_cistatic void 4069bf215546Sopenharmony_cintq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump) 4070bf215546Sopenharmony_ci{ 4071bf215546Sopenharmony_ci switch (jump->type) { 4072bf215546Sopenharmony_ci case nir_jump_break: 4073bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 4074bf215546Sopenharmony_ci V3D_QPU_PF_PUSHZ); 4075bf215546Sopenharmony_ci vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, 4076bf215546Sopenharmony_ci vir_uniform_ui(c, c->loop_break_block->index)); 4077bf215546Sopenharmony_ci break; 4078bf215546Sopenharmony_ci 4079bf215546Sopenharmony_ci case nir_jump_continue: 4080bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 4081bf215546Sopenharmony_ci V3D_QPU_PF_PUSHZ); 4082bf215546Sopenharmony_ci vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, 4083bf215546Sopenharmony_ci vir_uniform_ui(c, c->loop_cont_block->index)); 4084bf215546Sopenharmony_ci break; 4085bf215546Sopenharmony_ci 4086bf215546Sopenharmony_ci case nir_jump_return: 4087bf215546Sopenharmony_ci unreachable("All returns should be lowered\n"); 4088bf215546Sopenharmony_ci break; 4089bf215546Sopenharmony_ci 4090bf215546Sopenharmony_ci case nir_jump_halt: 4091bf215546Sopenharmony_ci case nir_jump_goto: 4092bf215546Sopenharmony_ci case nir_jump_goto_if: 4093bf215546Sopenharmony_ci unreachable("not supported\n"); 4094bf215546Sopenharmony_ci break; 4095bf215546Sopenharmony_ci } 4096bf215546Sopenharmony_ci} 4097bf215546Sopenharmony_ci 4098bf215546Sopenharmony_cistatic void 4099bf215546Sopenharmony_cintq_emit_uniform_jump(struct v3d_compile *c, nir_jump_instr *jump) 4100bf215546Sopenharmony_ci{ 4101bf215546Sopenharmony_ci switch (jump->type) { 4102bf215546Sopenharmony_ci case nir_jump_break: 4103bf215546Sopenharmony_ci vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS); 4104bf215546Sopenharmony_ci vir_link_blocks(c->cur_block, c->loop_break_block); 4105bf215546Sopenharmony_ci c->cur_block->branch_emitted = true; 4106bf215546Sopenharmony_ci break; 4107bf215546Sopenharmony_ci case nir_jump_continue: 4108bf215546Sopenharmony_ci vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS); 4109bf215546Sopenharmony_ci vir_link_blocks(c->cur_block, c->loop_cont_block); 4110bf215546Sopenharmony_ci c->cur_block->branch_emitted = true; 4111bf215546Sopenharmony_ci break; 4112bf215546Sopenharmony_ci 4113bf215546Sopenharmony_ci case nir_jump_return: 4114bf215546Sopenharmony_ci unreachable("All returns should be lowered\n"); 4115bf215546Sopenharmony_ci break; 4116bf215546Sopenharmony_ci 4117bf215546Sopenharmony_ci case nir_jump_halt: 4118bf215546Sopenharmony_ci case nir_jump_goto: 4119bf215546Sopenharmony_ci case nir_jump_goto_if: 4120bf215546Sopenharmony_ci unreachable("not supported\n"); 4121bf215546Sopenharmony_ci break; 4122bf215546Sopenharmony_ci } 4123bf215546Sopenharmony_ci} 4124bf215546Sopenharmony_ci 4125bf215546Sopenharmony_cistatic void 4126bf215546Sopenharmony_cintq_emit_instr(struct v3d_compile *c, nir_instr *instr) 4127bf215546Sopenharmony_ci{ 4128bf215546Sopenharmony_ci switch (instr->type) { 4129bf215546Sopenharmony_ci case nir_instr_type_alu: 4130bf215546Sopenharmony_ci ntq_emit_alu(c, nir_instr_as_alu(instr)); 4131bf215546Sopenharmony_ci break; 4132bf215546Sopenharmony_ci 4133bf215546Sopenharmony_ci case nir_instr_type_intrinsic: 4134bf215546Sopenharmony_ci ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr)); 4135bf215546Sopenharmony_ci break; 4136bf215546Sopenharmony_ci 4137bf215546Sopenharmony_ci case nir_instr_type_load_const: 4138bf215546Sopenharmony_ci ntq_emit_load_const(c, nir_instr_as_load_const(instr)); 4139bf215546Sopenharmony_ci break; 4140bf215546Sopenharmony_ci 4141bf215546Sopenharmony_ci case nir_instr_type_ssa_undef: 4142bf215546Sopenharmony_ci unreachable("Should've been lowered by nir_lower_undef_to_zero"); 4143bf215546Sopenharmony_ci break; 4144bf215546Sopenharmony_ci 4145bf215546Sopenharmony_ci case nir_instr_type_tex: 4146bf215546Sopenharmony_ci ntq_emit_tex(c, nir_instr_as_tex(instr)); 4147bf215546Sopenharmony_ci break; 4148bf215546Sopenharmony_ci 4149bf215546Sopenharmony_ci case nir_instr_type_jump: 4150bf215546Sopenharmony_ci /* Always flush TMU before jumping to another block, for the 4151bf215546Sopenharmony_ci * same reasons as in ntq_emit_block. 4152bf215546Sopenharmony_ci */ 4153bf215546Sopenharmony_ci ntq_flush_tmu(c); 4154bf215546Sopenharmony_ci if (vir_in_nonuniform_control_flow(c)) 4155bf215546Sopenharmony_ci ntq_emit_jump(c, nir_instr_as_jump(instr)); 4156bf215546Sopenharmony_ci else 4157bf215546Sopenharmony_ci ntq_emit_uniform_jump(c, nir_instr_as_jump(instr)); 4158bf215546Sopenharmony_ci break; 4159bf215546Sopenharmony_ci 4160bf215546Sopenharmony_ci default: 4161bf215546Sopenharmony_ci fprintf(stderr, "Unknown NIR instr type: "); 4162bf215546Sopenharmony_ci nir_print_instr(instr, stderr); 4163bf215546Sopenharmony_ci fprintf(stderr, "\n"); 4164bf215546Sopenharmony_ci abort(); 4165bf215546Sopenharmony_ci } 4166bf215546Sopenharmony_ci} 4167bf215546Sopenharmony_ci 4168bf215546Sopenharmony_cistatic void 4169bf215546Sopenharmony_cintq_emit_block(struct v3d_compile *c, nir_block *block) 4170bf215546Sopenharmony_ci{ 4171bf215546Sopenharmony_ci nir_foreach_instr(instr, block) { 4172bf215546Sopenharmony_ci ntq_emit_instr(c, instr); 4173bf215546Sopenharmony_ci } 4174bf215546Sopenharmony_ci 4175bf215546Sopenharmony_ci /* Always process pending TMU operations in the same block they were 4176bf215546Sopenharmony_ci * emitted: we can't emit TMU operations in a block and then emit a 4177bf215546Sopenharmony_ci * thread switch and LDTMU/TMUWT for them in another block, possibly 4178bf215546Sopenharmony_ci * under control flow. 4179bf215546Sopenharmony_ci */ 4180bf215546Sopenharmony_ci ntq_flush_tmu(c); 4181bf215546Sopenharmony_ci} 4182bf215546Sopenharmony_ci 4183bf215546Sopenharmony_cistatic void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list); 4184bf215546Sopenharmony_ci 4185bf215546Sopenharmony_cistatic void 4186bf215546Sopenharmony_cintq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop) 4187bf215546Sopenharmony_ci{ 4188bf215546Sopenharmony_ci bool was_uniform_control_flow = false; 4189bf215546Sopenharmony_ci if (!vir_in_nonuniform_control_flow(c)) { 4190bf215546Sopenharmony_ci c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); 4191bf215546Sopenharmony_ci was_uniform_control_flow = true; 4192bf215546Sopenharmony_ci } 4193bf215546Sopenharmony_ci 4194bf215546Sopenharmony_ci c->loop_cont_block = vir_new_block(c); 4195bf215546Sopenharmony_ci c->loop_break_block = vir_new_block(c); 4196bf215546Sopenharmony_ci 4197bf215546Sopenharmony_ci vir_link_blocks(c->cur_block, c->loop_cont_block); 4198bf215546Sopenharmony_ci vir_set_emit_block(c, c->loop_cont_block); 4199bf215546Sopenharmony_ci ntq_activate_execute_for_block(c); 4200bf215546Sopenharmony_ci 4201bf215546Sopenharmony_ci ntq_emit_cf_list(c, &loop->body); 4202bf215546Sopenharmony_ci 4203bf215546Sopenharmony_ci /* Re-enable any previous continues now, so our ANYA check below 4204bf215546Sopenharmony_ci * works. 4205bf215546Sopenharmony_ci * 4206bf215546Sopenharmony_ci * XXX: Use the .ORZ flags update, instead. 4207bf215546Sopenharmony_ci */ 4208bf215546Sopenharmony_ci vir_set_pf(c, vir_XOR_dest(c, 4209bf215546Sopenharmony_ci vir_nop_reg(), 4210bf215546Sopenharmony_ci c->execute, 4211bf215546Sopenharmony_ci vir_uniform_ui(c, c->loop_cont_block->index)), 4212bf215546Sopenharmony_ci V3D_QPU_PF_PUSHZ); 4213bf215546Sopenharmony_ci vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); 4214bf215546Sopenharmony_ci 4215bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); 4216bf215546Sopenharmony_ci 4217bf215546Sopenharmony_ci struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA); 4218bf215546Sopenharmony_ci /* Pixels that were not dispatched or have been discarded should not 4219bf215546Sopenharmony_ci * contribute to looping again. 4220bf215546Sopenharmony_ci */ 4221bf215546Sopenharmony_ci branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P; 4222bf215546Sopenharmony_ci vir_link_blocks(c->cur_block, c->loop_cont_block); 4223bf215546Sopenharmony_ci vir_link_blocks(c->cur_block, c->loop_break_block); 4224bf215546Sopenharmony_ci 4225bf215546Sopenharmony_ci vir_set_emit_block(c, c->loop_break_block); 4226bf215546Sopenharmony_ci if (was_uniform_control_flow) 4227bf215546Sopenharmony_ci c->execute = c->undef; 4228bf215546Sopenharmony_ci else 4229bf215546Sopenharmony_ci ntq_activate_execute_for_block(c); 4230bf215546Sopenharmony_ci} 4231bf215546Sopenharmony_ci 4232bf215546Sopenharmony_cistatic void 4233bf215546Sopenharmony_cintq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop) 4234bf215546Sopenharmony_ci{ 4235bf215546Sopenharmony_ci c->loop_cont_block = vir_new_block(c); 4236bf215546Sopenharmony_ci c->loop_break_block = vir_new_block(c); 4237bf215546Sopenharmony_ci 4238bf215546Sopenharmony_ci vir_link_blocks(c->cur_block, c->loop_cont_block); 4239bf215546Sopenharmony_ci vir_set_emit_block(c, c->loop_cont_block); 4240bf215546Sopenharmony_ci 4241bf215546Sopenharmony_ci ntq_emit_cf_list(c, &loop->body); 4242bf215546Sopenharmony_ci 4243bf215546Sopenharmony_ci if (!c->cur_block->branch_emitted) { 4244bf215546Sopenharmony_ci vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS); 4245bf215546Sopenharmony_ci vir_link_blocks(c->cur_block, c->loop_cont_block); 4246bf215546Sopenharmony_ci } 4247bf215546Sopenharmony_ci 4248bf215546Sopenharmony_ci vir_set_emit_block(c, c->loop_break_block); 4249bf215546Sopenharmony_ci} 4250bf215546Sopenharmony_ci 4251bf215546Sopenharmony_cistatic void 4252bf215546Sopenharmony_cintq_emit_loop(struct v3d_compile *c, nir_loop *loop) 4253bf215546Sopenharmony_ci{ 4254bf215546Sopenharmony_ci /* Disable flags optimization for loop conditions. The problem here is 4255bf215546Sopenharmony_ci * that we can have code like this: 4256bf215546Sopenharmony_ci * 4257bf215546Sopenharmony_ci * // block_0 4258bf215546Sopenharmony_ci * vec1 32 con ssa_9 = ine32 ssa_8, ssa_2 4259bf215546Sopenharmony_ci * loop { 4260bf215546Sopenharmony_ci * // block_1 4261bf215546Sopenharmony_ci * if ssa_9 { 4262bf215546Sopenharmony_ci * 4263bf215546Sopenharmony_ci * In this example we emit flags to compute ssa_9 and the optimization 4264bf215546Sopenharmony_ci * will skip regenerating them again for the loop condition in the 4265bf215546Sopenharmony_ci * loop continue block (block_1). However, this is not safe after the 4266bf215546Sopenharmony_ci * first iteration because the loop body can stomp the flags if it has 4267bf215546Sopenharmony_ci * any conditionals. 4268bf215546Sopenharmony_ci */ 4269bf215546Sopenharmony_ci c->flags_temp = -1; 4270bf215546Sopenharmony_ci 4271bf215546Sopenharmony_ci bool was_in_control_flow = c->in_control_flow; 4272bf215546Sopenharmony_ci c->in_control_flow = true; 4273bf215546Sopenharmony_ci 4274bf215546Sopenharmony_ci struct qblock *save_loop_cont_block = c->loop_cont_block; 4275bf215546Sopenharmony_ci struct qblock *save_loop_break_block = c->loop_break_block; 4276bf215546Sopenharmony_ci 4277bf215546Sopenharmony_ci if (vir_in_nonuniform_control_flow(c) || loop->divergent) { 4278bf215546Sopenharmony_ci ntq_emit_nonuniform_loop(c, loop); 4279bf215546Sopenharmony_ci } else { 4280bf215546Sopenharmony_ci ntq_emit_uniform_loop(c, loop); 4281bf215546Sopenharmony_ci } 4282bf215546Sopenharmony_ci 4283bf215546Sopenharmony_ci c->loop_break_block = save_loop_break_block; 4284bf215546Sopenharmony_ci c->loop_cont_block = save_loop_cont_block; 4285bf215546Sopenharmony_ci 4286bf215546Sopenharmony_ci c->loops++; 4287bf215546Sopenharmony_ci 4288bf215546Sopenharmony_ci c->in_control_flow = was_in_control_flow; 4289bf215546Sopenharmony_ci} 4290bf215546Sopenharmony_ci 4291bf215546Sopenharmony_cistatic void 4292bf215546Sopenharmony_cintq_emit_function(struct v3d_compile *c, nir_function_impl *func) 4293bf215546Sopenharmony_ci{ 4294bf215546Sopenharmony_ci fprintf(stderr, "FUNCTIONS not handled.\n"); 4295bf215546Sopenharmony_ci abort(); 4296bf215546Sopenharmony_ci} 4297bf215546Sopenharmony_ci 4298bf215546Sopenharmony_cistatic void 4299bf215546Sopenharmony_cintq_emit_cf_list(struct v3d_compile *c, struct exec_list *list) 4300bf215546Sopenharmony_ci{ 4301bf215546Sopenharmony_ci foreach_list_typed(nir_cf_node, node, node, list) { 4302bf215546Sopenharmony_ci switch (node->type) { 4303bf215546Sopenharmony_ci case nir_cf_node_block: 4304bf215546Sopenharmony_ci ntq_emit_block(c, nir_cf_node_as_block(node)); 4305bf215546Sopenharmony_ci break; 4306bf215546Sopenharmony_ci 4307bf215546Sopenharmony_ci case nir_cf_node_if: 4308bf215546Sopenharmony_ci ntq_emit_if(c, nir_cf_node_as_if(node)); 4309bf215546Sopenharmony_ci break; 4310bf215546Sopenharmony_ci 4311bf215546Sopenharmony_ci case nir_cf_node_loop: 4312bf215546Sopenharmony_ci ntq_emit_loop(c, nir_cf_node_as_loop(node)); 4313bf215546Sopenharmony_ci break; 4314bf215546Sopenharmony_ci 4315bf215546Sopenharmony_ci case nir_cf_node_function: 4316bf215546Sopenharmony_ci ntq_emit_function(c, nir_cf_node_as_function(node)); 4317bf215546Sopenharmony_ci break; 4318bf215546Sopenharmony_ci 4319bf215546Sopenharmony_ci default: 4320bf215546Sopenharmony_ci fprintf(stderr, "Unknown NIR node type\n"); 4321bf215546Sopenharmony_ci abort(); 4322bf215546Sopenharmony_ci } 4323bf215546Sopenharmony_ci } 4324bf215546Sopenharmony_ci} 4325bf215546Sopenharmony_ci 4326bf215546Sopenharmony_cistatic void 4327bf215546Sopenharmony_cintq_emit_impl(struct v3d_compile *c, nir_function_impl *impl) 4328bf215546Sopenharmony_ci{ 4329bf215546Sopenharmony_ci ntq_setup_registers(c, &impl->registers); 4330bf215546Sopenharmony_ci ntq_emit_cf_list(c, &impl->body); 4331bf215546Sopenharmony_ci} 4332bf215546Sopenharmony_ci 4333bf215546Sopenharmony_cistatic void 4334bf215546Sopenharmony_cinir_to_vir(struct v3d_compile *c) 4335bf215546Sopenharmony_ci{ 4336bf215546Sopenharmony_ci switch (c->s->info.stage) { 4337bf215546Sopenharmony_ci case MESA_SHADER_FRAGMENT: 4338bf215546Sopenharmony_ci c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); 4339bf215546Sopenharmony_ci c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); 4340bf215546Sopenharmony_ci c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); 4341bf215546Sopenharmony_ci 4342bf215546Sopenharmony_ci /* V3D 4.x can disable implicit varyings if they are not used */ 4343bf215546Sopenharmony_ci c->fs_uses_primitive_id = 4344bf215546Sopenharmony_ci nir_find_variable_with_location(c->s, nir_var_shader_in, 4345bf215546Sopenharmony_ci VARYING_SLOT_PRIMITIVE_ID); 4346bf215546Sopenharmony_ci if (c->fs_uses_primitive_id && !c->fs_key->has_gs) { 4347bf215546Sopenharmony_ci c->primitive_id = 4348bf215546Sopenharmony_ci emit_fragment_varying(c, NULL, -1, 0, 0); 4349bf215546Sopenharmony_ci } 4350bf215546Sopenharmony_ci 4351bf215546Sopenharmony_ci if (c->fs_key->is_points && 4352bf215546Sopenharmony_ci (c->devinfo->ver < 40 || program_reads_point_coord(c))) { 4353bf215546Sopenharmony_ci c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0); 4354bf215546Sopenharmony_ci c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0); 4355bf215546Sopenharmony_ci c->uses_implicit_point_line_varyings = true; 4356bf215546Sopenharmony_ci } else if (c->fs_key->is_lines && 4357bf215546Sopenharmony_ci (c->devinfo->ver < 40 || 4358bf215546Sopenharmony_ci BITSET_TEST(c->s->info.system_values_read, 4359bf215546Sopenharmony_ci SYSTEM_VALUE_LINE_COORD))) { 4360bf215546Sopenharmony_ci c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0); 4361bf215546Sopenharmony_ci c->uses_implicit_point_line_varyings = true; 4362bf215546Sopenharmony_ci } 4363bf215546Sopenharmony_ci 4364bf215546Sopenharmony_ci c->force_per_sample_msaa = 4365bf215546Sopenharmony_ci c->s->info.fs.uses_sample_qualifier || 4366bf215546Sopenharmony_ci BITSET_TEST(c->s->info.system_values_read, 4367bf215546Sopenharmony_ci SYSTEM_VALUE_SAMPLE_ID) || 4368bf215546Sopenharmony_ci BITSET_TEST(c->s->info.system_values_read, 4369bf215546Sopenharmony_ci SYSTEM_VALUE_SAMPLE_POS); 4370bf215546Sopenharmony_ci break; 4371bf215546Sopenharmony_ci case MESA_SHADER_COMPUTE: 4372bf215546Sopenharmony_ci /* Set up the TSO for barriers, assuming we do some. */ 4373bf215546Sopenharmony_ci if (c->devinfo->ver < 42) { 4374bf215546Sopenharmony_ci vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, 4375bf215546Sopenharmony_ci V3D_QPU_WADDR_SYNC)); 4376bf215546Sopenharmony_ci } 4377bf215546Sopenharmony_ci 4378bf215546Sopenharmony_ci c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); 4379bf215546Sopenharmony_ci c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); 4380bf215546Sopenharmony_ci 4381bf215546Sopenharmony_ci /* Set up the division between gl_LocalInvocationIndex and 4382bf215546Sopenharmony_ci * wg_in_mem in the payload reg. 4383bf215546Sopenharmony_ci */ 4384bf215546Sopenharmony_ci int wg_size = (c->s->info.workgroup_size[0] * 4385bf215546Sopenharmony_ci c->s->info.workgroup_size[1] * 4386bf215546Sopenharmony_ci c->s->info.workgroup_size[2]); 4387bf215546Sopenharmony_ci c->local_invocation_index_bits = 4388bf215546Sopenharmony_ci ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1; 4389bf215546Sopenharmony_ci assert(c->local_invocation_index_bits <= 8); 4390bf215546Sopenharmony_ci 4391bf215546Sopenharmony_ci if (c->s->info.shared_size) { 4392bf215546Sopenharmony_ci struct qreg wg_in_mem = vir_SHR(c, c->cs_payload[1], 4393bf215546Sopenharmony_ci vir_uniform_ui(c, 16)); 4394bf215546Sopenharmony_ci if (c->s->info.workgroup_size[0] != 1 || 4395bf215546Sopenharmony_ci c->s->info.workgroup_size[1] != 1 || 4396bf215546Sopenharmony_ci c->s->info.workgroup_size[2] != 1) { 4397bf215546Sopenharmony_ci int wg_bits = (16 - 4398bf215546Sopenharmony_ci c->local_invocation_index_bits); 4399bf215546Sopenharmony_ci int wg_mask = (1 << wg_bits) - 1; 4400bf215546Sopenharmony_ci wg_in_mem = vir_AND(c, wg_in_mem, 4401bf215546Sopenharmony_ci vir_uniform_ui(c, wg_mask)); 4402bf215546Sopenharmony_ci } 4403bf215546Sopenharmony_ci struct qreg shared_per_wg = 4404bf215546Sopenharmony_ci vir_uniform_ui(c, c->s->info.shared_size); 4405bf215546Sopenharmony_ci 4406bf215546Sopenharmony_ci c->cs_shared_offset = 4407bf215546Sopenharmony_ci vir_ADD(c, 4408bf215546Sopenharmony_ci vir_uniform(c, QUNIFORM_SHARED_OFFSET,0), 4409bf215546Sopenharmony_ci vir_UMUL(c, wg_in_mem, shared_per_wg)); 4410bf215546Sopenharmony_ci } 4411bf215546Sopenharmony_ci break; 4412bf215546Sopenharmony_ci default: 4413bf215546Sopenharmony_ci break; 4414bf215546Sopenharmony_ci } 4415bf215546Sopenharmony_ci 4416bf215546Sopenharmony_ci if (c->s->scratch_size) { 4417bf215546Sopenharmony_ci v3d_setup_spill_base(c); 4418bf215546Sopenharmony_ci c->spill_size += V3D_CHANNELS * c->s->scratch_size; 4419bf215546Sopenharmony_ci } 4420bf215546Sopenharmony_ci 4421bf215546Sopenharmony_ci switch (c->s->info.stage) { 4422bf215546Sopenharmony_ci case MESA_SHADER_VERTEX: 4423bf215546Sopenharmony_ci ntq_setup_vs_inputs(c); 4424bf215546Sopenharmony_ci break; 4425bf215546Sopenharmony_ci case MESA_SHADER_GEOMETRY: 4426bf215546Sopenharmony_ci ntq_setup_gs_inputs(c); 4427bf215546Sopenharmony_ci break; 4428bf215546Sopenharmony_ci case MESA_SHADER_FRAGMENT: 4429bf215546Sopenharmony_ci ntq_setup_fs_inputs(c); 4430bf215546Sopenharmony_ci break; 4431bf215546Sopenharmony_ci case MESA_SHADER_COMPUTE: 4432bf215546Sopenharmony_ci break; 4433bf215546Sopenharmony_ci default: 4434bf215546Sopenharmony_ci unreachable("unsupported shader stage"); 4435bf215546Sopenharmony_ci } 4436bf215546Sopenharmony_ci 4437bf215546Sopenharmony_ci ntq_setup_outputs(c); 4438bf215546Sopenharmony_ci 4439bf215546Sopenharmony_ci /* Find the main function and emit the body. */ 4440bf215546Sopenharmony_ci nir_foreach_function(function, c->s) { 4441bf215546Sopenharmony_ci assert(function->is_entrypoint); 4442bf215546Sopenharmony_ci assert(function->impl); 4443bf215546Sopenharmony_ci ntq_emit_impl(c, function->impl); 4444bf215546Sopenharmony_ci } 4445bf215546Sopenharmony_ci} 4446bf215546Sopenharmony_ci 4447bf215546Sopenharmony_ci/** 4448bf215546Sopenharmony_ci * When demoting a shader down to single-threaded, removes the THRSW 4449bf215546Sopenharmony_ci * instructions (one will still be inserted at v3d_vir_to_qpu() for the 4450bf215546Sopenharmony_ci * program end). 4451bf215546Sopenharmony_ci */ 4452bf215546Sopenharmony_cistatic void 4453bf215546Sopenharmony_civir_remove_thrsw(struct v3d_compile *c) 4454bf215546Sopenharmony_ci{ 4455bf215546Sopenharmony_ci vir_for_each_block(block, c) { 4456bf215546Sopenharmony_ci vir_for_each_inst_safe(inst, block) { 4457bf215546Sopenharmony_ci if (inst->qpu.sig.thrsw) 4458bf215546Sopenharmony_ci vir_remove_instruction(c, inst); 4459bf215546Sopenharmony_ci } 4460bf215546Sopenharmony_ci } 4461bf215546Sopenharmony_ci 4462bf215546Sopenharmony_ci c->last_thrsw = NULL; 4463bf215546Sopenharmony_ci} 4464bf215546Sopenharmony_ci 4465bf215546Sopenharmony_ci/** 4466bf215546Sopenharmony_ci * This makes sure we have a top-level last thread switch which signals the 4467bf215546Sopenharmony_ci * start of the last thread section, which may include adding a new thrsw 4468bf215546Sopenharmony_ci * instruction if needed. We don't allow spilling in the last thread section, so 4469bf215546Sopenharmony_ci * if we need to do any spills that inject additional thread switches later on, 4470bf215546Sopenharmony_ci * we ensure this thread switch will still be the last thread switch in the 4471bf215546Sopenharmony_ci * program, which makes last thread switch signalling a lot easier when we have 4472bf215546Sopenharmony_ci * spilling. If in the end we don't need to spill to compile the program and we 4473bf215546Sopenharmony_ci * injected a new thread switch instruction here only for that, we will 4474bf215546Sopenharmony_ci * eventually restore the previous last thread switch and remove the one we 4475bf215546Sopenharmony_ci * added here. 4476bf215546Sopenharmony_ci */ 4477bf215546Sopenharmony_cistatic void 4478bf215546Sopenharmony_civir_emit_last_thrsw(struct v3d_compile *c, 4479bf215546Sopenharmony_ci struct qinst **restore_last_thrsw, 4480bf215546Sopenharmony_ci bool *restore_scoreboard_lock) 4481bf215546Sopenharmony_ci{ 4482bf215546Sopenharmony_ci *restore_last_thrsw = c->last_thrsw; 4483bf215546Sopenharmony_ci 4484bf215546Sopenharmony_ci /* On V3D before 4.1, we need a TMU op to be outstanding when thread 4485bf215546Sopenharmony_ci * switching, so disable threads if we didn't do any TMU ops (each of 4486bf215546Sopenharmony_ci * which would have emitted a THRSW). 4487bf215546Sopenharmony_ci */ 4488bf215546Sopenharmony_ci if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) { 4489bf215546Sopenharmony_ci c->threads = 1; 4490bf215546Sopenharmony_ci if (c->last_thrsw) 4491bf215546Sopenharmony_ci vir_remove_thrsw(c); 4492bf215546Sopenharmony_ci *restore_last_thrsw = NULL; 4493bf215546Sopenharmony_ci } 4494bf215546Sopenharmony_ci 4495bf215546Sopenharmony_ci /* If we're threaded and the last THRSW was in conditional code, then 4496bf215546Sopenharmony_ci * we need to emit another one so that we can flag it as the last 4497bf215546Sopenharmony_ci * thrsw. 4498bf215546Sopenharmony_ci */ 4499bf215546Sopenharmony_ci if (c->last_thrsw && !c->last_thrsw_at_top_level) { 4500bf215546Sopenharmony_ci assert(c->devinfo->ver >= 41); 4501bf215546Sopenharmony_ci vir_emit_thrsw(c); 4502bf215546Sopenharmony_ci } 4503bf215546Sopenharmony_ci 4504bf215546Sopenharmony_ci /* If we're threaded, then we need to mark the last THRSW instruction 4505bf215546Sopenharmony_ci * so we can emit a pair of them at QPU emit time. 4506bf215546Sopenharmony_ci * 4507bf215546Sopenharmony_ci * For V3D 4.x, we can spawn the non-fragment shaders already in the 4508bf215546Sopenharmony_ci * post-last-THRSW state, so we can skip this. 4509bf215546Sopenharmony_ci */ 4510bf215546Sopenharmony_ci if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) { 4511bf215546Sopenharmony_ci assert(c->devinfo->ver >= 41); 4512bf215546Sopenharmony_ci vir_emit_thrsw(c); 4513bf215546Sopenharmony_ci } 4514bf215546Sopenharmony_ci 4515bf215546Sopenharmony_ci /* If we have not inserted a last thread switch yet, do it now to ensure 4516bf215546Sopenharmony_ci * any potential spilling we do happens before this. If we don't spill 4517bf215546Sopenharmony_ci * in the end, we will restore the previous one. 4518bf215546Sopenharmony_ci */ 4519bf215546Sopenharmony_ci if (*restore_last_thrsw == c->last_thrsw) { 4520bf215546Sopenharmony_ci if (*restore_last_thrsw) 4521bf215546Sopenharmony_ci (*restore_last_thrsw)->is_last_thrsw = false; 4522bf215546Sopenharmony_ci *restore_scoreboard_lock = c->lock_scoreboard_on_first_thrsw; 4523bf215546Sopenharmony_ci vir_emit_thrsw(c); 4524bf215546Sopenharmony_ci } else { 4525bf215546Sopenharmony_ci *restore_last_thrsw = c->last_thrsw; 4526bf215546Sopenharmony_ci } 4527bf215546Sopenharmony_ci 4528bf215546Sopenharmony_ci assert(c->last_thrsw); 4529bf215546Sopenharmony_ci c->last_thrsw->is_last_thrsw = true; 4530bf215546Sopenharmony_ci} 4531bf215546Sopenharmony_ci 4532bf215546Sopenharmony_cistatic void 4533bf215546Sopenharmony_civir_restore_last_thrsw(struct v3d_compile *c, 4534bf215546Sopenharmony_ci struct qinst *thrsw, 4535bf215546Sopenharmony_ci bool scoreboard_lock) 4536bf215546Sopenharmony_ci{ 4537bf215546Sopenharmony_ci assert(c->last_thrsw); 4538bf215546Sopenharmony_ci vir_remove_instruction(c, c->last_thrsw); 4539bf215546Sopenharmony_ci c->last_thrsw = thrsw; 4540bf215546Sopenharmony_ci if (c->last_thrsw) 4541bf215546Sopenharmony_ci c->last_thrsw->is_last_thrsw = true; 4542bf215546Sopenharmony_ci c->lock_scoreboard_on_first_thrsw = scoreboard_lock; 4543bf215546Sopenharmony_ci} 4544bf215546Sopenharmony_ci 4545bf215546Sopenharmony_ci/* There's a flag in the shader for "center W is needed for reasons other than 4546bf215546Sopenharmony_ci * non-centroid varyings", so we just walk the program after VIR optimization 4547bf215546Sopenharmony_ci * to see if it's used. It should be harmless to set even if we only use 4548bf215546Sopenharmony_ci * center W for varyings. 4549bf215546Sopenharmony_ci */ 4550bf215546Sopenharmony_cistatic void 4551bf215546Sopenharmony_civir_check_payload_w(struct v3d_compile *c) 4552bf215546Sopenharmony_ci{ 4553bf215546Sopenharmony_ci if (c->s->info.stage != MESA_SHADER_FRAGMENT) 4554bf215546Sopenharmony_ci return; 4555bf215546Sopenharmony_ci 4556bf215546Sopenharmony_ci vir_for_each_inst_inorder(inst, c) { 4557bf215546Sopenharmony_ci for (int i = 0; i < vir_get_nsrc(inst); i++) { 4558bf215546Sopenharmony_ci if (inst->src[i].file == QFILE_REG && 4559bf215546Sopenharmony_ci inst->src[i].index == 0) { 4560bf215546Sopenharmony_ci c->uses_center_w = true; 4561bf215546Sopenharmony_ci return; 4562bf215546Sopenharmony_ci } 4563bf215546Sopenharmony_ci } 4564bf215546Sopenharmony_ci } 4565bf215546Sopenharmony_ci} 4566bf215546Sopenharmony_ci 4567bf215546Sopenharmony_civoid 4568bf215546Sopenharmony_civ3d_nir_to_vir(struct v3d_compile *c) 4569bf215546Sopenharmony_ci{ 4570bf215546Sopenharmony_ci if (V3D_DEBUG & (V3D_DEBUG_NIR | 4571bf215546Sopenharmony_ci v3d_debug_flag_for_shader_stage(c->s->info.stage))) { 4572bf215546Sopenharmony_ci fprintf(stderr, "%s prog %d/%d NIR:\n", 4573bf215546Sopenharmony_ci vir_get_stage_name(c), 4574bf215546Sopenharmony_ci c->program_id, c->variant_id); 4575bf215546Sopenharmony_ci nir_print_shader(c->s, stderr); 4576bf215546Sopenharmony_ci } 4577bf215546Sopenharmony_ci 4578bf215546Sopenharmony_ci nir_to_vir(c); 4579bf215546Sopenharmony_ci 4580bf215546Sopenharmony_ci bool restore_scoreboard_lock = false; 4581bf215546Sopenharmony_ci struct qinst *restore_last_thrsw; 4582bf215546Sopenharmony_ci 4583bf215546Sopenharmony_ci /* Emit the last THRSW before STVPM and TLB writes. */ 4584bf215546Sopenharmony_ci vir_emit_last_thrsw(c, 4585bf215546Sopenharmony_ci &restore_last_thrsw, 4586bf215546Sopenharmony_ci &restore_scoreboard_lock); 4587bf215546Sopenharmony_ci 4588bf215546Sopenharmony_ci 4589bf215546Sopenharmony_ci switch (c->s->info.stage) { 4590bf215546Sopenharmony_ci case MESA_SHADER_FRAGMENT: 4591bf215546Sopenharmony_ci emit_frag_end(c); 4592bf215546Sopenharmony_ci break; 4593bf215546Sopenharmony_ci case MESA_SHADER_GEOMETRY: 4594bf215546Sopenharmony_ci emit_geom_end(c); 4595bf215546Sopenharmony_ci break; 4596bf215546Sopenharmony_ci case MESA_SHADER_VERTEX: 4597bf215546Sopenharmony_ci emit_vert_end(c); 4598bf215546Sopenharmony_ci break; 4599bf215546Sopenharmony_ci case MESA_SHADER_COMPUTE: 4600bf215546Sopenharmony_ci break; 4601bf215546Sopenharmony_ci default: 4602bf215546Sopenharmony_ci unreachable("bad stage"); 4603bf215546Sopenharmony_ci } 4604bf215546Sopenharmony_ci 4605bf215546Sopenharmony_ci if (V3D_DEBUG & (V3D_DEBUG_VIR | 4606bf215546Sopenharmony_ci v3d_debug_flag_for_shader_stage(c->s->info.stage))) { 4607bf215546Sopenharmony_ci fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n", 4608bf215546Sopenharmony_ci vir_get_stage_name(c), 4609bf215546Sopenharmony_ci c->program_id, c->variant_id); 4610bf215546Sopenharmony_ci vir_dump(c); 4611bf215546Sopenharmony_ci fprintf(stderr, "\n"); 4612bf215546Sopenharmony_ci } 4613bf215546Sopenharmony_ci 4614bf215546Sopenharmony_ci vir_optimize(c); 4615bf215546Sopenharmony_ci 4616bf215546Sopenharmony_ci vir_check_payload_w(c); 4617bf215546Sopenharmony_ci 4618bf215546Sopenharmony_ci /* XXX perf: On VC4, we do a VIR-level instruction scheduling here. 4619bf215546Sopenharmony_ci * We used that on that platform to pipeline TMU writes and reduce the 4620bf215546Sopenharmony_ci * number of thread switches, as well as try (mostly successfully) to 4621bf215546Sopenharmony_ci * reduce maximum register pressure to allow more threads. We should 4622bf215546Sopenharmony_ci * do something of that sort for V3D -- either instruction scheduling 4623bf215546Sopenharmony_ci * here, or delay the the THRSW and LDTMUs from our texture 4624bf215546Sopenharmony_ci * instructions until the results are needed. 4625bf215546Sopenharmony_ci */ 4626bf215546Sopenharmony_ci 4627bf215546Sopenharmony_ci if (V3D_DEBUG & (V3D_DEBUG_VIR | 4628bf215546Sopenharmony_ci v3d_debug_flag_for_shader_stage(c->s->info.stage))) { 4629bf215546Sopenharmony_ci fprintf(stderr, "%s prog %d/%d VIR:\n", 4630bf215546Sopenharmony_ci vir_get_stage_name(c), 4631bf215546Sopenharmony_ci c->program_id, c->variant_id); 4632bf215546Sopenharmony_ci vir_dump(c); 4633bf215546Sopenharmony_ci fprintf(stderr, "\n"); 4634bf215546Sopenharmony_ci } 4635bf215546Sopenharmony_ci 4636bf215546Sopenharmony_ci /* Attempt to allocate registers for the temporaries. If we fail, 4637bf215546Sopenharmony_ci * reduce thread count and try again. 4638bf215546Sopenharmony_ci */ 4639bf215546Sopenharmony_ci int min_threads = (c->devinfo->ver >= 41) ? 2 : 1; 4640bf215546Sopenharmony_ci struct qpu_reg *temp_registers; 4641bf215546Sopenharmony_ci while (true) { 4642bf215546Sopenharmony_ci temp_registers = v3d_register_allocate(c); 4643bf215546Sopenharmony_ci if (temp_registers) { 4644bf215546Sopenharmony_ci assert(c->spills + c->fills <= c->max_tmu_spills); 4645bf215546Sopenharmony_ci break; 4646bf215546Sopenharmony_ci } 4647bf215546Sopenharmony_ci 4648bf215546Sopenharmony_ci if (c->threads == min_threads && 4649bf215546Sopenharmony_ci (V3D_DEBUG & V3D_DEBUG_RA)) { 4650bf215546Sopenharmony_ci fprintf(stderr, 4651bf215546Sopenharmony_ci "Failed to register allocate using %s\n", 4652bf215546Sopenharmony_ci c->fallback_scheduler ? "the fallback scheduler:" : 4653bf215546Sopenharmony_ci "the normal scheduler: \n"); 4654bf215546Sopenharmony_ci 4655bf215546Sopenharmony_ci vir_dump(c); 4656bf215546Sopenharmony_ci 4657bf215546Sopenharmony_ci char *shaderdb; 4658bf215546Sopenharmony_ci int ret = v3d_shaderdb_dump(c, &shaderdb); 4659bf215546Sopenharmony_ci if (ret > 0) { 4660bf215546Sopenharmony_ci fprintf(stderr, "%s\n", shaderdb); 4661bf215546Sopenharmony_ci free(shaderdb); 4662bf215546Sopenharmony_ci } 4663bf215546Sopenharmony_ci } 4664bf215546Sopenharmony_ci 4665bf215546Sopenharmony_ci if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) { 4666bf215546Sopenharmony_ci if (V3D_DEBUG & V3D_DEBUG_PERF) { 4667bf215546Sopenharmony_ci fprintf(stderr, 4668bf215546Sopenharmony_ci "Failed to register allocate %s " 4669bf215546Sopenharmony_ci "prog %d/%d at %d threads.\n", 4670bf215546Sopenharmony_ci vir_get_stage_name(c), 4671bf215546Sopenharmony_ci c->program_id, c->variant_id, c->threads); 4672bf215546Sopenharmony_ci } 4673bf215546Sopenharmony_ci c->compilation_result = 4674bf215546Sopenharmony_ci V3D_COMPILATION_FAILED_REGISTER_ALLOCATION; 4675bf215546Sopenharmony_ci return; 4676bf215546Sopenharmony_ci } 4677bf215546Sopenharmony_ci 4678bf215546Sopenharmony_ci c->spills = 0; 4679bf215546Sopenharmony_ci c->fills = 0; 4680bf215546Sopenharmony_ci c->threads /= 2; 4681bf215546Sopenharmony_ci 4682bf215546Sopenharmony_ci if (c->threads == 1) 4683bf215546Sopenharmony_ci vir_remove_thrsw(c); 4684bf215546Sopenharmony_ci } 4685bf215546Sopenharmony_ci 4686bf215546Sopenharmony_ci /* If we didn't spill, then remove the last thread switch we injected 4687bf215546Sopenharmony_ci * artificially (if any) and restore the previous one. 4688bf215546Sopenharmony_ci */ 4689bf215546Sopenharmony_ci if (!c->spills && c->last_thrsw != restore_last_thrsw) 4690bf215546Sopenharmony_ci vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock); 4691bf215546Sopenharmony_ci 4692bf215546Sopenharmony_ci if (c->spills && 4693bf215546Sopenharmony_ci (V3D_DEBUG & (V3D_DEBUG_VIR | 4694bf215546Sopenharmony_ci v3d_debug_flag_for_shader_stage(c->s->info.stage)))) { 4695bf215546Sopenharmony_ci fprintf(stderr, "%s prog %d/%d spilled VIR:\n", 4696bf215546Sopenharmony_ci vir_get_stage_name(c), 4697bf215546Sopenharmony_ci c->program_id, c->variant_id); 4698bf215546Sopenharmony_ci vir_dump(c); 4699bf215546Sopenharmony_ci fprintf(stderr, "\n"); 4700bf215546Sopenharmony_ci } 4701bf215546Sopenharmony_ci 4702bf215546Sopenharmony_ci v3d_vir_to_qpu(c, temp_registers); 4703bf215546Sopenharmony_ci} 4704