1/* 2 * Copyright (C) 2016 Rob Clark <robclark@freedesktop.org> 3 * Copyright © 2018 Google, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 
23 * 24 * Authors: 25 * Rob Clark <robclark@freedesktop.org> 26 */ 27 28#include "pipe/p_state.h" 29#include "util/bitset.h" 30#include "util/format/u_format.h" 31#include "util/u_inlines.h" 32#include "util/u_memory.h" 33#include "util/u_string.h" 34 35#include "freedreno_program.h" 36 37#include "fd6_const.h" 38#include "fd6_emit.h" 39#include "fd6_pack.h" 40#include "fd6_program.h" 41#include "fd6_texture.h" 42 43void 44fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring, 45 const struct ir3_shader_variant *so) 46{ 47 enum a6xx_state_block sb = fd6_stage2shadersb(so->type); 48 49 uint32_t first_exec_offset = 0; 50 uint32_t instrlen = 0; 51 uint32_t hw_stack_offset = 0; 52 53 switch (so->type) { 54 case MESA_SHADER_VERTEX: 55 first_exec_offset = REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET; 56 instrlen = REG_A6XX_SP_VS_INSTRLEN; 57 hw_stack_offset = REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET; 58 break; 59 case MESA_SHADER_TESS_CTRL: 60 first_exec_offset = REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET; 61 instrlen = REG_A6XX_SP_HS_INSTRLEN; 62 hw_stack_offset = REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET; 63 break; 64 case MESA_SHADER_TESS_EVAL: 65 first_exec_offset = REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET; 66 instrlen = REG_A6XX_SP_DS_INSTRLEN; 67 hw_stack_offset = REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET; 68 break; 69 case MESA_SHADER_GEOMETRY: 70 first_exec_offset = REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET; 71 instrlen = REG_A6XX_SP_GS_INSTRLEN; 72 hw_stack_offset = REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET; 73 break; 74 case MESA_SHADER_FRAGMENT: 75 first_exec_offset = REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET; 76 instrlen = REG_A6XX_SP_FS_INSTRLEN; 77 hw_stack_offset = REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET; 78 break; 79 case MESA_SHADER_COMPUTE: 80 case MESA_SHADER_KERNEL: 81 first_exec_offset = REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET; 82 instrlen = REG_A6XX_SP_CS_INSTRLEN; 83 hw_stack_offset = REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET; 84 break; 85 case MESA_SHADER_TASK: 86 case 
MESA_SHADER_MESH: 87 case MESA_SHADER_RAYGEN: 88 case MESA_SHADER_ANY_HIT: 89 case MESA_SHADER_CLOSEST_HIT: 90 case MESA_SHADER_MISS: 91 case MESA_SHADER_INTERSECTION: 92 case MESA_SHADER_CALLABLE: 93 unreachable("Unsupported shader stage"); 94 case MESA_SHADER_NONE: 95 unreachable(""); 96 } 97 98#ifdef DEBUG 99 /* Name should generally match what you get with MESA_SHADER_CAPTURE_PATH: */ 100 const char *name = so->name; 101 if (name) 102 fd_emit_string5(ring, name, strlen(name)); 103#endif 104 105 uint32_t fibers_per_sp = ctx->screen->info->a6xx.fibers_per_sp; 106 uint32_t num_sp_cores = ctx->screen->info->num_sp_cores; 107 108 uint32_t per_fiber_size = ALIGN(so->pvtmem_size, 512); 109 if (per_fiber_size > ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size) { 110 if (ctx->pvtmem[so->pvtmem_per_wave].bo) 111 fd_bo_del(ctx->pvtmem[so->pvtmem_per_wave].bo); 112 ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size = per_fiber_size; 113 uint32_t total_size = 114 ALIGN(per_fiber_size * fibers_per_sp, 1 << 12) * num_sp_cores; 115 ctx->pvtmem[so->pvtmem_per_wave].bo = fd_bo_new( 116 ctx->screen->dev, total_size, FD_BO_NOMAP, 117 "pvtmem_%s_%d", so->pvtmem_per_wave ? 
"per_wave" : "per_fiber", 118 per_fiber_size); 119 } else { 120 per_fiber_size = ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size; 121 } 122 123 uint32_t per_sp_size = ALIGN(per_fiber_size * fibers_per_sp, 1 << 12); 124 125 OUT_PKT4(ring, instrlen, 1); 126 OUT_RING(ring, so->instrlen); 127 128 OUT_PKT4(ring, first_exec_offset, 7); 129 OUT_RING(ring, 0); /* SP_xS_OBJ_FIRST_EXEC_OFFSET */ 130 OUT_RELOC(ring, so->bo, 0, 0, 0); /* SP_xS_OBJ_START_LO */ 131 OUT_RING(ring, A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(per_fiber_size)); 132 if (so->pvtmem_size > 0) { /* SP_xS_PVT_MEM_ADDR */ 133 OUT_RELOC(ring, ctx->pvtmem[so->pvtmem_per_wave].bo, 0, 0, 0); 134 } else { 135 OUT_RING(ring, 0); 136 OUT_RING(ring, 0); 137 } 138 OUT_RING(ring, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(per_sp_size) | 139 COND(so->pvtmem_per_wave, 140 A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT)); 141 142 OUT_PKT4(ring, hw_stack_offset, 1); 143 OUT_RING(ring, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(per_sp_size)); 144 145 uint32_t shader_preload_size = 146 MIN2(so->instrlen, ctx->screen->info->a6xx.instr_cache_size); 147 148 OUT_PKT7(ring, fd6_stage2opcode(so->type), 3); 149 OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | 150 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | 151 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | 152 CP_LOAD_STATE6_0_STATE_BLOCK(sb) | 153 CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size)); 154 OUT_RELOC(ring, so->bo, 0, 0, 0); 155} 156 157/** 158 * Build a pre-baked state-obj to disable SO, so that we aren't dynamically 159 * building this at draw time whenever we transition from SO enabled->disabled 160 */ 161static void 162setup_stream_out_disable(struct fd_context *ctx) 163{ 164 unsigned sizedw = 4; 165 166 if (ctx->screen->info->a6xx.tess_use_shared) 167 sizedw += 2; 168 169 struct fd_ringbuffer *ring = 170 fd_ringbuffer_new_object(ctx->pipe, (1 + sizedw) * 4); 171 172 OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, sizedw); 173 OUT_RING(ring, REG_A6XX_VPC_SO_CNTL); 174 OUT_RING(ring, 0); 175 
OUT_RING(ring, REG_A6XX_VPC_SO_STREAM_CNTL); 176 OUT_RING(ring, 0); 177 178 if (ctx->screen->info->a6xx.tess_use_shared) { 179 OUT_RING(ring, REG_A6XX_PC_SO_STREAM_CNTL); 180 OUT_RING(ring, 0); 181 } 182 183 fd6_context(ctx)->streamout_disable_stateobj = ring; 184} 185 186static void 187setup_stream_out(struct fd_context *ctx, struct fd6_program_state *state, 188 const struct ir3_shader_variant *v, 189 struct ir3_shader_linkage *l) 190{ 191 const struct ir3_stream_output_info *strmout = &v->stream_output; 192 193 /* Note: 64 here comes from the HW layout of the program RAM. The program 194 * for stream N is at DWORD 64 * N. 195 */ 196#define A6XX_SO_PROG_DWORDS 64 197 uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {}; 198 BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0}; 199 200 memset(prog, 0, sizeof(prog)); 201 202 for (unsigned i = 0; i < strmout->num_outputs; i++) { 203 const struct ir3_stream_output *out = &strmout->output[i]; 204 unsigned k = out->register_index; 205 unsigned idx; 206 207 /* linkage map sorted by order frag shader wants things, so 208 * a bit less ideal here.. 
209 */ 210 for (idx = 0; idx < l->cnt; idx++) 211 if (l->var[idx].slot == v->outputs[k].slot) 212 break; 213 214 assert(idx < l->cnt); 215 216 for (unsigned j = 0; j < out->num_components; j++) { 217 unsigned c = j + out->start_component; 218 unsigned loc = l->var[idx].loc + c; 219 unsigned off = j + out->dst_offset; /* in dwords */ 220 221 unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2; 222 if (loc & 1) { 223 prog[dword] |= A6XX_VPC_SO_PROG_B_EN | 224 A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) | 225 A6XX_VPC_SO_PROG_B_OFF(off * 4); 226 } else { 227 prog[dword] |= A6XX_VPC_SO_PROG_A_EN | 228 A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) | 229 A6XX_VPC_SO_PROG_A_OFF(off * 4); 230 } 231 BITSET_SET(valid_dwords, dword); 232 } 233 } 234 235 unsigned prog_count = 0; 236 unsigned start, end; 237 BITSET_FOREACH_RANGE (start, end, valid_dwords, 238 A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) { 239 prog_count += end - start + 1; 240 } 241 242 unsigned sizedw = 10 + (2 * prog_count); 243 if (ctx->screen->info->a6xx.tess_use_shared) 244 sizedw += 2; 245 246 struct fd_ringbuffer *ring = 247 fd_ringbuffer_new_object(ctx->pipe, (1 + sizedw) * 4); 248 249 OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, sizedw); 250 OUT_RING(ring, REG_A6XX_VPC_SO_STREAM_CNTL); 251 OUT_RING(ring, 252 A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(0x1) | 253 COND(strmout->stride[0] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1)) | 254 COND(strmout->stride[1] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1)) | 255 COND(strmout->stride[2] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1)) | 256 COND(strmout->stride[3] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1))); 257 OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(0)); 258 OUT_RING(ring, strmout->stride[0]); 259 OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(1)); 260 OUT_RING(ring, strmout->stride[1]); 261 OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(2)); 262 OUT_RING(ring, strmout->stride[2]); 263 OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(3)); 264 OUT_RING(ring, 
strmout->stride[3]); 265 266 bool first = true; 267 BITSET_FOREACH_RANGE (start, end, valid_dwords, 268 A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) { 269 OUT_RING(ring, REG_A6XX_VPC_SO_CNTL); 270 OUT_RING(ring, COND(first, A6XX_VPC_SO_CNTL_RESET) | 271 A6XX_VPC_SO_CNTL_ADDR(start)); 272 for (unsigned i = start; i < end; i++) { 273 OUT_RING(ring, REG_A6XX_VPC_SO_PROG); 274 OUT_RING(ring, prog[i]); 275 } 276 first = false; 277 } 278 279 if (ctx->screen->info->a6xx.tess_use_shared) { 280 /* Possibly not tess_use_shared related, but the combination of 281 * tess + xfb fails some tests if we don't emit this. 282 */ 283 OUT_RING(ring, REG_A6XX_PC_SO_STREAM_CNTL); 284 OUT_RING(ring, A6XX_PC_SO_STREAM_CNTL_STREAM_ENABLE); 285 } 286 287 state->streamout_stateobj = ring; 288} 289 290static void 291setup_config_stateobj(struct fd_context *ctx, struct fd6_program_state *state) 292{ 293 struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 100 * 4); 294 295 OUT_REG(ring, A6XX_HLSQ_INVALIDATE_CMD(.vs_state = true, .hs_state = true, 296 .ds_state = true, .gs_state = true, 297 .fs_state = true, .cs_state = true, 298 .gfx_ibo = true, .cs_ibo = true, )); 299 300 assert(state->vs->constlen >= state->bs->constlen); 301 302 OUT_PKT4(ring, REG_A6XX_HLSQ_VS_CNTL, 4); 303 OUT_RING(ring, A6XX_HLSQ_VS_CNTL_CONSTLEN(state->vs->constlen) | 304 A6XX_HLSQ_VS_CNTL_ENABLED); 305 OUT_RING(ring, COND(state->hs, 306 A6XX_HLSQ_HS_CNTL_ENABLED | 307 A6XX_HLSQ_HS_CNTL_CONSTLEN(state->hs->constlen))); 308 OUT_RING(ring, COND(state->ds, 309 A6XX_HLSQ_DS_CNTL_ENABLED | 310 A6XX_HLSQ_DS_CNTL_CONSTLEN(state->ds->constlen))); 311 OUT_RING(ring, COND(state->gs, 312 A6XX_HLSQ_GS_CNTL_ENABLED | 313 A6XX_HLSQ_GS_CNTL_CONSTLEN(state->gs->constlen))); 314 OUT_PKT4(ring, REG_A6XX_HLSQ_FS_CNTL, 1); 315 OUT_RING(ring, A6XX_HLSQ_FS_CNTL_CONSTLEN(state->fs->constlen) | 316 A6XX_HLSQ_FS_CNTL_ENABLED); 317 318 OUT_PKT4(ring, REG_A6XX_SP_VS_CONFIG, 1); 319 OUT_RING(ring, COND(state->vs, A6XX_SP_VS_CONFIG_ENABLED) 
| 320 A6XX_SP_VS_CONFIG_NIBO(ir3_shader_nibo(state->vs)) | 321 A6XX_SP_VS_CONFIG_NTEX(state->vs->num_samp) | 322 A6XX_SP_VS_CONFIG_NSAMP(state->vs->num_samp)); 323 324 OUT_PKT4(ring, REG_A6XX_SP_HS_CONFIG, 1); 325 OUT_RING(ring, COND(state->hs, 326 A6XX_SP_HS_CONFIG_ENABLED | 327 A6XX_SP_HS_CONFIG_NIBO(ir3_shader_nibo(state->hs)) | 328 A6XX_SP_HS_CONFIG_NTEX(state->hs->num_samp) | 329 A6XX_SP_HS_CONFIG_NSAMP(state->hs->num_samp))); 330 331 OUT_PKT4(ring, REG_A6XX_SP_DS_CONFIG, 1); 332 OUT_RING(ring, COND(state->ds, 333 A6XX_SP_DS_CONFIG_ENABLED | 334 A6XX_SP_DS_CONFIG_NIBO(ir3_shader_nibo(state->ds)) | 335 A6XX_SP_DS_CONFIG_NTEX(state->ds->num_samp) | 336 A6XX_SP_DS_CONFIG_NSAMP(state->ds->num_samp))); 337 338 OUT_PKT4(ring, REG_A6XX_SP_GS_CONFIG, 1); 339 OUT_RING(ring, COND(state->gs, 340 A6XX_SP_GS_CONFIG_ENABLED | 341 A6XX_SP_GS_CONFIG_NIBO(ir3_shader_nibo(state->gs)) | 342 A6XX_SP_GS_CONFIG_NTEX(state->gs->num_samp) | 343 A6XX_SP_GS_CONFIG_NSAMP(state->gs->num_samp))); 344 345 OUT_PKT4(ring, REG_A6XX_SP_FS_CONFIG, 1); 346 OUT_RING(ring, COND(state->fs, A6XX_SP_FS_CONFIG_ENABLED) | 347 A6XX_SP_FS_CONFIG_NIBO(ir3_shader_nibo(state->fs)) | 348 A6XX_SP_FS_CONFIG_NTEX(state->fs->num_samp) | 349 A6XX_SP_FS_CONFIG_NSAMP(state->fs->num_samp)); 350 351 OUT_PKT4(ring, REG_A6XX_SP_IBO_COUNT, 1); 352 OUT_RING(ring, ir3_shader_nibo(state->fs)); 353 354 state->config_stateobj = ring; 355} 356 357static inline uint32_t 358next_regid(uint32_t reg, uint32_t increment) 359{ 360 if (VALIDREG(reg)) 361 return reg + increment; 362 else 363 return regid(63, 0); 364} 365 366static void 367fd6_emit_tess_bos(struct fd_screen *screen, struct fd_ringbuffer *ring, 368 const struct ir3_shader_variant *s) assert_dt 369{ 370 const struct ir3_const_state *const_state = ir3_const_state(s); 371 const unsigned regid = const_state->offsets.primitive_param + 1; 372 uint32_t dwords = 8; 373 374 if (regid >= s->constlen) 375 return; 376 377 OUT_PKT7(ring, fd6_stage2opcode(s->type), 7); 378 
OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid) | 379 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 380 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 381 CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(s->type)) | 382 CP_LOAD_STATE6_0_NUM_UNIT(dwords / 4)); 383 OUT_RING(ring, 0); 384 OUT_RING(ring, 0); 385 OUT_RELOC(ring, screen->tess_bo, FD6_TESS_FACTOR_SIZE, 0, 0); 386 OUT_RELOC(ring, screen->tess_bo, 0, 0, 0); 387} 388 389static void 390setup_stateobj(struct fd_ringbuffer *ring, struct fd_context *ctx, 391 struct fd6_program_state *state, 392 const struct ir3_cache_key *cache_key, 393 bool binning_pass) assert_dt 394{ 395 const struct ir3_shader_key *key = &cache_key->key; 396 uint32_t pos_regid, psize_regid, color_regid[8], posz_regid; 397 uint32_t clip0_regid, clip1_regid; 398 uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid; 399 uint32_t smask_in_regid, smask_regid; 400 uint32_t stencilref_regid; 401 uint32_t vertex_regid, instance_regid, layer_regid, vs_primitive_regid; 402 uint32_t hs_invocation_regid; 403 uint32_t tess_coord_x_regid, tess_coord_y_regid, hs_rel_patch_regid, 404 ds_rel_patch_regid, ds_primitive_regid; 405 uint32_t ij_regid[IJ_COUNT]; 406 uint32_t gs_header_regid; 407 enum a6xx_threadsize fssz; 408 uint8_t psize_loc = ~0, pos_loc = ~0, layer_loc = ~0; 409 uint8_t clip0_loc, clip1_loc; 410 int i, j; 411 412 static const struct ir3_shader_variant dummy_fs = {0}; 413 const struct ir3_shader_variant *vs = binning_pass ? state->bs : state->vs; 414 const struct ir3_shader_variant *hs = state->hs; 415 const struct ir3_shader_variant *ds = state->ds; 416 const struct ir3_shader_variant *gs = state->gs; 417 const struct ir3_shader_variant *fs = binning_pass ? 
&dummy_fs : state->fs; 418 419 /* binning VS is wrong when GS is present, so use nonbinning VS 420 * TODO: compile both binning VS/GS variants correctly 421 */ 422 if (binning_pass && state->gs) 423 vs = state->vs; 424 425 bool sample_shading = fs->per_samp | key->sample_shading; 426 427 fssz = fs->info.double_threadsize ? THREAD128 : THREAD64; 428 429 pos_regid = ir3_find_output_regid(vs, VARYING_SLOT_POS); 430 psize_regid = ir3_find_output_regid(vs, VARYING_SLOT_PSIZ); 431 clip0_regid = ir3_find_output_regid(vs, VARYING_SLOT_CLIP_DIST0); 432 clip1_regid = ir3_find_output_regid(vs, VARYING_SLOT_CLIP_DIST1); 433 layer_regid = ir3_find_output_regid(vs, VARYING_SLOT_LAYER); 434 vertex_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID); 435 instance_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID); 436 if (hs) 437 vs_primitive_regid = ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID); 438 else if (gs) 439 vs_primitive_regid = ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID); 440 else 441 vs_primitive_regid = regid(63, 0); 442 443 bool hs_reads_primid = false, ds_reads_primid = false; 444 if (hs) { 445 tess_coord_x_regid = ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD); 446 tess_coord_y_regid = next_regid(tess_coord_x_regid, 1); 447 hs_reads_primid = VALIDREG(ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID)); 448 ds_reads_primid = VALIDREG(ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID)); 449 hs_rel_patch_regid = ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3); 450 ds_rel_patch_regid = ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3); 451 ds_primitive_regid = ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID); 452 hs_invocation_regid = 453 ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3); 454 455 pos_regid = ir3_find_output_regid(ds, VARYING_SLOT_POS); 456 psize_regid = ir3_find_output_regid(ds, VARYING_SLOT_PSIZ); 457 clip0_regid = ir3_find_output_regid(ds, VARYING_SLOT_CLIP_DIST0); 458 clip1_regid = 
ir3_find_output_regid(ds, VARYING_SLOT_CLIP_DIST1); 459 } else { 460 tess_coord_x_regid = regid(63, 0); 461 tess_coord_y_regid = regid(63, 0); 462 hs_rel_patch_regid = regid(63, 0); 463 ds_rel_patch_regid = regid(63, 0); 464 ds_primitive_regid = regid(63, 0); 465 hs_invocation_regid = regid(63, 0); 466 } 467 468 bool gs_reads_primid = false; 469 if (gs) { 470 gs_header_regid = ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3); 471 gs_reads_primid = VALIDREG(ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID)); 472 pos_regid = ir3_find_output_regid(gs, VARYING_SLOT_POS); 473 psize_regid = ir3_find_output_regid(gs, VARYING_SLOT_PSIZ); 474 clip0_regid = ir3_find_output_regid(gs, VARYING_SLOT_CLIP_DIST0); 475 clip1_regid = ir3_find_output_regid(gs, VARYING_SLOT_CLIP_DIST1); 476 layer_regid = ir3_find_output_regid(gs, VARYING_SLOT_LAYER); 477 } else { 478 gs_header_regid = regid(63, 0); 479 } 480 481 if (fs->color0_mrt) { 482 color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] = 483 color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] = 484 ir3_find_output_regid(fs, FRAG_RESULT_COLOR); 485 } else { 486 color_regid[0] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0); 487 color_regid[1] = ir3_find_output_regid(fs, FRAG_RESULT_DATA1); 488 color_regid[2] = ir3_find_output_regid(fs, FRAG_RESULT_DATA2); 489 color_regid[3] = ir3_find_output_regid(fs, FRAG_RESULT_DATA3); 490 color_regid[4] = ir3_find_output_regid(fs, FRAG_RESULT_DATA4); 491 color_regid[5] = ir3_find_output_regid(fs, FRAG_RESULT_DATA5); 492 color_regid[6] = ir3_find_output_regid(fs, FRAG_RESULT_DATA6); 493 color_regid[7] = ir3_find_output_regid(fs, FRAG_RESULT_DATA7); 494 } 495 496 samp_id_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID); 497 smask_in_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN); 498 face_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE); 499 coord_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD); 500 zwcoord_regid = 
next_regid(coord_regid, 2); 501 posz_regid = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH); 502 smask_regid = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK); 503 stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL); 504 for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) 505 ij_regid[i] = 506 ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i); 507 508 /* If we have pre-dispatch texture fetches, then ij_pix should not 509 * be DCE'd, even if not actually used in the shader itself: 510 */ 511 if (fs->num_sampler_prefetch > 0) { 512 assert(VALIDREG(ij_regid[IJ_PERSP_PIXEL])); 513 /* also, it seems like ij_pix is *required* to be r0.x */ 514 assert(ij_regid[IJ_PERSP_PIXEL] == regid(0, 0)); 515 } 516 517 /* we can't write gl_SampleMask for !msaa.. if b0 is zero then we 518 * end up masking the single sample!! 519 */ 520 if (!key->msaa) 521 smask_regid = regid(63, 0); 522 523 /* we could probably divide this up into things that need to be 524 * emitted if frag-prog is dirty vs if vert-prog is dirty.. 
525 */ 526 527 OUT_PKT4(ring, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch); 528 OUT_RING(ring, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) | 529 A6XX_SP_FS_PREFETCH_CNTL_UNK4(regid(63, 0)) | 530 0x7000); // XXX 531 for (int i = 0; i < fs->num_sampler_prefetch; i++) { 532 const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; 533 OUT_RING(ring, 534 A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) | 535 A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) | 536 A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) | 537 A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) | 538 A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) | 539 COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) | 540 A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd)); 541 } 542 543 OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A9A8, 1); 544 OUT_RING(ring, 0); 545 546 OUT_PKT4(ring, REG_A6XX_SP_MODE_CONTROL, 1); 547 OUT_RING(ring, A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4); 548 549 bool fs_has_dual_src_color = 550 !binning_pass && fs->fs.color_is_dual_source; 551 552 OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1); 553 OUT_RING(ring, 554 A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) | 555 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) | 556 A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) | 557 COND(fs_has_dual_src_color, 558 A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE)); 559 560 OUT_PKT4(ring, REG_A6XX_SP_VS_CTRL_REG0, 1); 561 OUT_RING( 562 ring, 563 A6XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vs->info.max_reg + 1) | 564 A6XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vs->info.max_half_reg + 1) | 565 COND(vs->mergedregs, A6XX_SP_VS_CTRL_REG0_MERGEDREGS) | 566 A6XX_SP_VS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(vs))); 567 568 fd6_emit_shader(ctx, ring, vs); 569 fd6_emit_immediates(ctx->screen, vs, ring); 570 if (hs) { 571 fd6_emit_tess_bos(ctx->screen, ring, hs); 572 fd6_emit_tess_bos(ctx->screen, ring, ds); 573 } 574 575 struct ir3_shader_linkage l = {0}; 576 const 
struct ir3_shader_variant *last_shader = fd6_last_shader(state); 577 578 bool do_streamout = (last_shader->stream_output.num_outputs > 0); 579 uint8_t clip_mask = last_shader->clip_mask, 580 cull_mask = last_shader->cull_mask; 581 uint8_t clip_cull_mask = clip_mask | cull_mask; 582 583 clip_mask &= cache_key->clip_plane_enable; 584 585 /* If we have streamout, link against the real FS, rather than the 586 * dummy FS used for binning pass state, to ensure the OUTLOC's 587 * match. Depending on whether we end up doing sysmem or gmem, 588 * the actual streamout could happen with either the binning pass 589 * or draw pass program, but the same streamout stateobj is used 590 * in either case: 591 */ 592 ir3_link_shaders(&l, last_shader, do_streamout ? state->fs : fs, true); 593 594 bool primid_passthru = l.primid_loc != 0xff; 595 clip0_loc = l.clip0_loc; 596 clip1_loc = l.clip1_loc; 597 598 OUT_PKT4(ring, REG_A6XX_VPC_VAR_DISABLE(0), 4); 599 OUT_RING(ring, ~l.varmask[0]); /* VPC_VAR[0].DISABLE */ 600 OUT_RING(ring, ~l.varmask[1]); /* VPC_VAR[1].DISABLE */ 601 OUT_RING(ring, ~l.varmask[2]); /* VPC_VAR[2].DISABLE */ 602 OUT_RING(ring, ~l.varmask[3]); /* VPC_VAR[3].DISABLE */ 603 604 /* Add stream out outputs after computing the VPC_VAR_DISABLE bitmask. */ 605 ir3_link_stream_out(&l, last_shader); 606 607 if (VALIDREG(layer_regid)) { 608 layer_loc = l.max_loc; 609 ir3_link_add(&l, VARYING_SLOT_LAYER, layer_regid, 0x1, l.max_loc); 610 } 611 612 if (VALIDREG(pos_regid)) { 613 pos_loc = l.max_loc; 614 ir3_link_add(&l, VARYING_SLOT_POS, pos_regid, 0xf, l.max_loc); 615 } 616 617 if (VALIDREG(psize_regid)) { 618 psize_loc = l.max_loc; 619 ir3_link_add(&l, VARYING_SLOT_PSIZ, psize_regid, 0x1, l.max_loc); 620 } 621 622 /* Handle the case where clip/cull distances aren't read by the FS. Make 623 * sure to avoid adding an output with an empty writemask if the user 624 * disables all the clip distances in the API so that the slot is unused. 
625 */ 626 if (clip0_loc == 0xff && VALIDREG(clip0_regid) && 627 (clip_cull_mask & 0xf) != 0) { 628 clip0_loc = l.max_loc; 629 ir3_link_add(&l, VARYING_SLOT_CLIP_DIST0, clip0_regid, 630 clip_cull_mask & 0xf, l.max_loc); 631 } 632 633 if (clip1_loc == 0xff && VALIDREG(clip1_regid) && 634 (clip_cull_mask >> 4) != 0) { 635 clip1_loc = l.max_loc; 636 ir3_link_add(&l, VARYING_SLOT_CLIP_DIST1, clip1_regid, 637 clip_cull_mask >> 4, l.max_loc); 638 } 639 640 /* If we have stream-out, we use the full shader for binning 641 * pass, rather than the optimized binning pass one, so that we 642 * have all the varying outputs available for xfb. So streamout 643 * state should always be derived from the non-binning pass 644 * program: 645 */ 646 if (do_streamout && !binning_pass) { 647 setup_stream_out(ctx, state, last_shader, &l); 648 649 if (!fd6_context(ctx)->streamout_disable_stateobj) 650 setup_stream_out_disable(ctx); 651 } 652 653 assert(l.cnt <= 32); 654 if (gs) 655 OUT_PKT4(ring, REG_A6XX_SP_GS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2)); 656 else if (ds) 657 OUT_PKT4(ring, REG_A6XX_SP_DS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2)); 658 else 659 OUT_PKT4(ring, REG_A6XX_SP_VS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2)); 660 661 for (j = 0; j < l.cnt;) { 662 uint32_t reg = 0; 663 664 reg |= A6XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid); 665 reg |= A6XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask); 666 j++; 667 668 reg |= A6XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid); 669 reg |= A6XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask); 670 j++; 671 672 OUT_RING(ring, reg); 673 } 674 675 if (gs) 676 OUT_PKT4(ring, REG_A6XX_SP_GS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4)); 677 else if (ds) 678 OUT_PKT4(ring, REG_A6XX_SP_DS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4)); 679 else 680 OUT_PKT4(ring, REG_A6XX_SP_VS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4)); 681 682 for (j = 0; j < l.cnt;) { 683 uint32_t reg = 0; 684 685 reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc); 686 reg |= 
A6XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc); 687 reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc); 688 reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc); 689 690 OUT_RING(ring, reg); 691 } 692 693 if (hs) { 694 assert(vs->mergedregs == hs->mergedregs); 695 OUT_PKT4(ring, REG_A6XX_SP_HS_CTRL_REG0, 1); 696 OUT_RING( 697 ring, 698 A6XX_SP_HS_CTRL_REG0_FULLREGFOOTPRINT(hs->info.max_reg + 1) | 699 A6XX_SP_HS_CTRL_REG0_HALFREGFOOTPRINT(hs->info.max_half_reg + 1) | 700 A6XX_SP_HS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(hs))); 701 702 fd6_emit_shader(ctx, ring, hs); 703 fd6_emit_immediates(ctx->screen, hs, ring); 704 fd6_emit_link_map(ctx->screen, vs, hs, ring); 705 706 OUT_PKT4(ring, REG_A6XX_SP_DS_CTRL_REG0, 1); 707 OUT_RING( 708 ring, 709 A6XX_SP_DS_CTRL_REG0_FULLREGFOOTPRINT(ds->info.max_reg + 1) | 710 A6XX_SP_DS_CTRL_REG0_HALFREGFOOTPRINT(ds->info.max_half_reg + 1) | 711 A6XX_SP_DS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(ds))); 712 713 fd6_emit_shader(ctx, ring, ds); 714 fd6_emit_immediates(ctx->screen, ds, ring); 715 fd6_emit_link_map(ctx->screen, hs, ds, ring); 716 717 OUT_PKT4(ring, REG_A6XX_PC_TESS_NUM_VERTEX, 1); 718 OUT_RING(ring, hs->tess.tcs_vertices_out); 719 720 if (ctx->screen->info->a6xx.tess_use_shared) { 721 unsigned hs_input_size = 6 + (3 * (vs->output_size - 1)); 722 unsigned wave_input_size = 723 MIN2(64, DIV_ROUND_UP(hs_input_size * 4, 724 hs->tess.tcs_vertices_out)); 725 726 OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); 727 OUT_RING(ring, hs_input_size); 728 729 OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); 730 OUT_RING(ring, wave_input_size); 731 } else { 732 uint32_t hs_input_size = 733 hs->tess.tcs_vertices_out * vs->output_size / 4; 734 735 /* Total attribute slots in HS incoming patch. 
*/ 736 OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); 737 OUT_RING(ring, hs_input_size); 738 739 const uint32_t wavesize = 64; 740 const uint32_t max_wave_input_size = 64; 741 const uint32_t patch_control_points = hs->tess.tcs_vertices_out; 742 743 /* note: if HS is really just the VS extended, then this 744 * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out) 745 * however that doesn't match the blob, and fails some dEQP tests. 746 */ 747 uint32_t prims_per_wave = wavesize / hs->tess.tcs_vertices_out; 748 uint32_t max_prims_per_wave = max_wave_input_size * wavesize / 749 (vs->output_size * patch_control_points); 750 prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave); 751 752 uint32_t total_size = 753 vs->output_size * patch_control_points * prims_per_wave; 754 uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize); 755 756 OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); 757 OUT_RING(ring, wave_input_size); 758 } 759 760 OUT_PKT4(ring, REG_A6XX_PC_TESS_CNTL, 1); 761 uint32_t output; 762 if (ds->tess.point_mode) 763 output = TESS_POINTS; 764 else if (ds->tess.primitive_mode == TESS_PRIMITIVE_ISOLINES) 765 output = TESS_LINES; 766 else if (ds->tess.ccw) 767 output = TESS_CCW_TRIS; 768 else 769 output = TESS_CW_TRIS; 770 771 OUT_RING(ring, A6XX_PC_TESS_CNTL_SPACING( 772 fd6_gl2spacing(ds->tess.spacing)) | 773 A6XX_PC_TESS_CNTL_OUTPUT(output)); 774 775 OUT_PKT4(ring, REG_A6XX_VPC_DS_CLIP_CNTL, 1); 776 OUT_RING(ring, A6XX_VPC_DS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | 777 A6XX_VPC_DS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | 778 A6XX_VPC_DS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); 779 780 OUT_PKT4(ring, REG_A6XX_VPC_DS_LAYER_CNTL, 1); 781 OUT_RING(ring, 0x0000ffff); 782 783 OUT_PKT4(ring, REG_A6XX_GRAS_DS_LAYER_CNTL, 1); 784 OUT_RING(ring, 0x0); 785 786 OUT_PKT4(ring, REG_A6XX_GRAS_DS_CL_CNTL, 1); 787 OUT_RING(ring, A6XX_GRAS_DS_CL_CNTL_CLIP_MASK(clip_mask) | 788 A6XX_GRAS_DS_CL_CNTL_CULL_MASK(cull_mask)); 789 790 OUT_PKT4(ring, 
REG_A6XX_VPC_VS_PACK, 1); 791 OUT_RING(ring, A6XX_VPC_VS_PACK_POSITIONLOC(pos_loc) | 792 A6XX_VPC_VS_PACK_PSIZELOC(255) | 793 A6XX_VPC_VS_PACK_STRIDE_IN_VPC(l.max_loc)); 794 795 OUT_PKT4(ring, REG_A6XX_VPC_DS_PACK, 1); 796 OUT_RING(ring, A6XX_VPC_DS_PACK_POSITIONLOC(pos_loc) | 797 A6XX_VPC_DS_PACK_PSIZELOC(psize_loc) | 798 A6XX_VPC_DS_PACK_STRIDE_IN_VPC(l.max_loc)); 799 800 OUT_PKT4(ring, REG_A6XX_SP_DS_PRIMITIVE_CNTL, 1); 801 OUT_RING(ring, A6XX_SP_DS_PRIMITIVE_CNTL_OUT(l.cnt)); 802 803 OUT_PKT4(ring, REG_A6XX_PC_DS_OUT_CNTL, 1); 804 OUT_RING(ring, A6XX_PC_DS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) | 805 CONDREG(psize_regid, A6XX_PC_DS_OUT_CNTL_PSIZE) | 806 COND(ds_reads_primid, A6XX_PC_DS_OUT_CNTL_PRIMITIVE_ID) | 807 A6XX_PC_DS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); 808 809 OUT_PKT4(ring, REG_A6XX_PC_HS_OUT_CNTL, 1); 810 OUT_RING(ring, COND(hs_reads_primid, A6XX_PC_HS_OUT_CNTL_PRIMITIVE_ID)); 811 } else { 812 OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); 813 OUT_RING(ring, 0); 814 } 815 816 OUT_PKT4(ring, REG_A6XX_SP_VS_PRIMITIVE_CNTL, 1); 817 OUT_RING(ring, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(l.cnt)); 818 819 bool enable_varyings = fs->total_in > 0; 820 821 OUT_PKT4(ring, REG_A6XX_VPC_CNTL_0, 1); 822 OUT_RING(ring, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs->total_in) | 823 COND(enable_varyings, A6XX_VPC_CNTL_0_VARYING) | 824 A6XX_VPC_CNTL_0_PRIMIDLOC(l.primid_loc) | 825 A6XX_VPC_CNTL_0_VIEWIDLOC(0xff)); 826 827 OUT_PKT4(ring, REG_A6XX_PC_VS_OUT_CNTL, 1); 828 OUT_RING(ring, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) | 829 CONDREG(psize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) | 830 CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) | 831 A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); 832 833 OUT_PKT4(ring, REG_A6XX_HLSQ_CONTROL_1_REG, 5); 834 OUT_RING(ring, 0x7); /* XXX */ 835 OUT_RING(ring, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) | 836 A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) | 837 A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) | 838 
A6XX_HLSQ_CONTROL_2_REG_CENTERRHW(ij_regid[IJ_PERSP_CENTER_RHW])); 839 OUT_RING( 840 ring, 841 A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) | 842 A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) | 843 A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID( 844 ij_regid[IJ_PERSP_CENTROID]) | 845 A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID( 846 ij_regid[IJ_LINEAR_CENTROID])); 847 OUT_RING( 848 ring, 849 A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) | 850 A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) | 851 A6XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) | 852 A6XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE])); 853 OUT_RING(ring, 0xfcfc); /* line length (?), foveation quality */ 854 855 OUT_PKT4(ring, REG_A6XX_HLSQ_FS_CNTL_0, 1); 856 OUT_RING(ring, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(fssz) | 857 COND(enable_varyings, A6XX_HLSQ_FS_CNTL_0_VARYINGS)); 858 859 OUT_PKT4(ring, REG_A6XX_SP_FS_CTRL_REG0, 1); 860 OUT_RING( 861 ring, 862 A6XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | 863 COND(enable_varyings, A6XX_SP_FS_CTRL_REG0_VARYING) | 0x1000000 | 864 A6XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fs->info.max_reg + 1) | 865 A6XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fs->info.max_half_reg + 1) | 866 COND(fs->mergedregs, A6XX_SP_FS_CTRL_REG0_MERGEDREGS) | 867 A6XX_SP_FS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(fs)) | 868 COND(fs->need_pixlod, A6XX_SP_FS_CTRL_REG0_PIXLODENABLE)); 869 870 OUT_PKT4(ring, REG_A6XX_VPC_VS_LAYER_CNTL, 1); 871 OUT_RING(ring, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) | 872 A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(0xff)); 873 874 bool need_size = fs->frag_face || fs->fragcoord_compmask != 0; 875 bool need_size_persamp = false; 876 if (VALIDREG(ij_regid[IJ_PERSP_CENTER_RHW])) { 877 if (sample_shading) 878 need_size_persamp = true; 879 else 880 need_size = true; 881 } 882 883 OUT_PKT4(ring, REG_A6XX_GRAS_CNTL, 1); 884 OUT_RING( 885 ring, 886 CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) | 
887 CONDREG(ij_regid[IJ_PERSP_CENTROID], 888 A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) | 889 CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) | 890 CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | 891 CONDREG(ij_regid[IJ_LINEAR_CENTROID], 892 A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) | 893 CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | 894 COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | 895 COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | 896 COND(fs->fragcoord_compmask != 0, 897 A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask))); 898 899 OUT_PKT4(ring, REG_A6XX_RB_RENDER_CONTROL0, 2); 900 OUT_RING( 901 ring, 902 CONDREG(ij_regid[IJ_PERSP_PIXEL], 903 A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) | 904 CONDREG(ij_regid[IJ_PERSP_CENTROID], 905 A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) | 906 CONDREG(ij_regid[IJ_PERSP_SAMPLE], 907 A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) | 908 CONDREG(ij_regid[IJ_LINEAR_PIXEL], 909 A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | 910 CONDREG(ij_regid[IJ_LINEAR_CENTROID], 911 A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) | 912 CONDREG(ij_regid[IJ_LINEAR_SAMPLE], 913 A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | 914 COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | 915 COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) | 916 COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | 917 COND(fs->fragcoord_compmask != 0, 918 A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask))); 919 920 OUT_RING(ring, 921 CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) | 922 CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) | 923 CONDREG(ij_regid[IJ_PERSP_CENTER_RHW], A6XX_RB_RENDER_CONTROL1_CENTERRHW) | 924 COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS)); 925 926 OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_CNTL, 1); 927 OUT_RING(ring, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE)); 928 929 OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1); 930 
OUT_RING(ring, 931 CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) | 932 A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE( 933 sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER)); 934 935 OUT_PKT4(ring, REG_A6XX_GRAS_SAMPLE_CNTL, 1); 936 OUT_RING(ring, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE)); 937 938 OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_REG(0), 8); 939 for (i = 0; i < 8; i++) { 940 OUT_RING(ring, A6XX_SP_FS_OUTPUT_REG_REGID(color_regid[i]) | 941 COND(color_regid[i] & HALF_REG_ID, 942 A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)); 943 if (VALIDREG(color_regid[i])) { 944 state->mrt_components |= 0xf << (i * 4); 945 } 946 } 947 948 /* dual source blending has an extra fs output in the 2nd slot */ 949 if (fs_has_dual_src_color) { 950 state->mrt_components |= 0xf << 4; 951 } 952 953 OUT_PKT4(ring, REG_A6XX_VPC_VS_PACK, 1); 954 OUT_RING(ring, A6XX_VPC_VS_PACK_POSITIONLOC(pos_loc) | 955 A6XX_VPC_VS_PACK_PSIZELOC(psize_loc) | 956 A6XX_VPC_VS_PACK_STRIDE_IN_VPC(l.max_loc)); 957 958 if (gs) { 959 assert(gs->mergedregs == (ds ? 
ds->mergedregs : vs->mergedregs)); 960 OUT_PKT4(ring, REG_A6XX_SP_GS_CTRL_REG0, 1); 961 OUT_RING( 962 ring, 963 A6XX_SP_GS_CTRL_REG0_FULLREGFOOTPRINT(gs->info.max_reg + 1) | 964 A6XX_SP_GS_CTRL_REG0_HALFREGFOOTPRINT(gs->info.max_half_reg + 1) | 965 A6XX_SP_GS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(gs))); 966 967 fd6_emit_shader(ctx, ring, gs); 968 fd6_emit_immediates(ctx->screen, gs, ring); 969 if (ds) 970 fd6_emit_link_map(ctx->screen, ds, gs, ring); 971 else 972 fd6_emit_link_map(ctx->screen, vs, gs, ring); 973 974 OUT_PKT4(ring, REG_A6XX_VPC_GS_PACK, 1); 975 OUT_RING(ring, A6XX_VPC_GS_PACK_POSITIONLOC(pos_loc) | 976 A6XX_VPC_GS_PACK_PSIZELOC(psize_loc) | 977 A6XX_VPC_GS_PACK_STRIDE_IN_VPC(l.max_loc)); 978 979 OUT_PKT4(ring, REG_A6XX_VPC_GS_LAYER_CNTL, 1); 980 OUT_RING(ring, A6XX_VPC_GS_LAYER_CNTL_LAYERLOC(layer_loc) | 0xff00); 981 982 OUT_PKT4(ring, REG_A6XX_GRAS_GS_LAYER_CNTL, 1); 983 OUT_RING(ring, 984 CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER)); 985 986 uint32_t flags_regid = 987 ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3); 988 989 /* if vertex_flags somehow gets optimized out, your gonna have a bad time: */ 990 assert(flags_regid != INVALID_REG); 991 992 OUT_PKT4(ring, REG_A6XX_SP_GS_PRIMITIVE_CNTL, 1); 993 OUT_RING(ring, A6XX_SP_GS_PRIMITIVE_CNTL_OUT(l.cnt) | 994 A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid)); 995 996 OUT_PKT4(ring, REG_A6XX_PC_GS_OUT_CNTL, 1); 997 OUT_RING(ring, 998 A6XX_PC_GS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) | 999 CONDREG(psize_regid, A6XX_PC_GS_OUT_CNTL_PSIZE) | 1000 CONDREG(layer_regid, A6XX_PC_GS_OUT_CNTL_LAYER) | 1001 COND(gs_reads_primid, A6XX_PC_GS_OUT_CNTL_PRIMITIVE_ID) | 1002 A6XX_PC_GS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); 1003 1004 uint32_t output; 1005 switch (gs->gs.output_primitive) { 1006 case SHADER_PRIM_POINTS: 1007 output = TESS_POINTS; 1008 break; 1009 case SHADER_PRIM_LINE_STRIP: 1010 output = TESS_LINES; 1011 break; 1012 case SHADER_PRIM_TRIANGLE_STRIP: 1013 output = 
TESS_CW_TRIS; 1014 break; 1015 default: 1016 unreachable(""); 1017 } 1018 OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1); 1019 OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT( 1020 gs->gs.vertices_out - 1) | 1021 A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) | 1022 A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS( 1023 gs->gs.invocations - 1)); 1024 1025 OUT_PKT4(ring, REG_A6XX_GRAS_GS_CL_CNTL, 1); 1026 OUT_RING(ring, A6XX_GRAS_GS_CL_CNTL_CLIP_MASK(clip_mask) | 1027 A6XX_GRAS_GS_CL_CNTL_CULL_MASK(cull_mask)); 1028 1029 OUT_PKT4(ring, REG_A6XX_VPC_GS_PARAM, 1); 1030 OUT_RING(ring, 0xff); 1031 1032 OUT_PKT4(ring, REG_A6XX_VPC_GS_CLIP_CNTL, 1); 1033 OUT_RING(ring, A6XX_VPC_GS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | 1034 A6XX_VPC_GS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | 1035 A6XX_VPC_GS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); 1036 1037 const struct ir3_shader_variant *prev = state->ds ? state->ds : state->vs; 1038 1039 /* Size of per-primitive alloction in ldlw memory in vec4s. */ 1040 uint32_t vec4_size = gs->gs.vertices_in * 1041 DIV_ROUND_UP(prev->output_size, 4); 1042 OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); 1043 OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size)); 1044 1045 OUT_PKT4(ring, REG_A6XX_PC_MULTIVIEW_CNTL, 1); 1046 OUT_RING(ring, 0); 1047 1048 uint32_t prim_size = prev->output_size; 1049 if (prim_size > 64) 1050 prim_size = 64; 1051 else if (prim_size == 64) 1052 prim_size = 63; 1053 OUT_PKT4(ring, REG_A6XX_SP_GS_PRIM_SIZE, 1); 1054 OUT_RING(ring, prim_size); 1055 } else { 1056 OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); 1057 OUT_RING(ring, 0); 1058 OUT_PKT4(ring, REG_A6XX_SP_GS_PRIM_SIZE, 1); 1059 OUT_RING(ring, 0); 1060 1061 OUT_PKT4(ring, REG_A6XX_GRAS_VS_LAYER_CNTL, 1); 1062 OUT_RING(ring, 1063 CONDREG(layer_regid, A6XX_GRAS_VS_LAYER_CNTL_WRITES_LAYER)); 1064 } 1065 1066 OUT_PKT4(ring, REG_A6XX_VPC_VS_CLIP_CNTL, 1); 1067 OUT_RING(ring, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | 1068 
A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | 1069 A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); 1070 1071 OUT_PKT4(ring, REG_A6XX_GRAS_VS_CL_CNTL, 1); 1072 OUT_RING(ring, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(clip_mask) | 1073 A6XX_GRAS_VS_CL_CNTL_CULL_MASK(cull_mask)); 1074 1075 OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9107, 1); 1076 OUT_RING(ring, 0); 1077 1078 if (fs->instrlen) 1079 fd6_emit_shader(ctx, ring, fs); 1080 1081 OUT_REG(ring, A6XX_PC_PRIMID_PASSTHRU(primid_passthru)); 1082 1083 uint32_t non_sysval_input_count = 0; 1084 for (uint32_t i = 0; i < vs->inputs_count; i++) 1085 if (!vs->inputs[i].sysval) 1086 non_sysval_input_count++; 1087 1088 OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_0, 1); 1089 OUT_RING(ring, A6XX_VFD_CONTROL_0_FETCH_CNT(non_sysval_input_count) | 1090 A6XX_VFD_CONTROL_0_DECODE_CNT(non_sysval_input_count)); 1091 1092 OUT_PKT4(ring, REG_A6XX_VFD_DEST_CNTL(0), non_sysval_input_count); 1093 for (uint32_t i = 0; i < non_sysval_input_count; i++) { 1094 assert(vs->inputs[i].compmask); 1095 OUT_RING(ring, 1096 A6XX_VFD_DEST_CNTL_INSTR_WRITEMASK(vs->inputs[i].compmask) | 1097 A6XX_VFD_DEST_CNTL_INSTR_REGID(vs->inputs[i].regid)); 1098 } 1099 1100 OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_1, 6); 1101 OUT_RING(ring, A6XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) | 1102 A6XX_VFD_CONTROL_1_REGID4INST(instance_regid) | 1103 A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitive_regid) | 1104 0xfc000000); 1105 OUT_RING(ring, 1106 A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) | 1107 A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid)); 1108 OUT_RING(ring, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) | 1109 A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) | 1110 A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) | 1111 A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitive_regid)); 1112 OUT_RING(ring, 0x000000fc); /* VFD_CONTROL_4 */ 1113 OUT_RING(ring, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gs_header_regid) | 1114 0xfc00); /* VFD_CONTROL_5 */ 1115 
   OUT_RING(ring, COND(primid_passthru,
                       A6XX_VFD_CONTROL_6_PRIMID_PASSTHRU)); /* VFD_CONTROL_6 */

   /* Immediates for the binning-pass FS are never used, so only emit them
    * for the full draw-pass state object:
    */
   if (!binning_pass)
      fd6_emit_immediates(ctx->screen, fs, ring);
}

/* Forward-declared so create_interp_stateobj() (below) can use it before
 * the definition; the real body follows fd6_program_interp_state().
 */
static void emit_interp_state(struct fd_ringbuffer *ring,
                              struct ir3_shader_variant *fs, bool rasterflat,
                              bool sprite_coord_mode,
                              uint32_t sprite_coord_enable);

/* Pre-bake the default varying-interpolation state object (no rasterflat,
 * no point-sprite replacement).  Size is 18 dwords: two PKT4 packets, each
 * one header dword plus eight payload dwords (see emit_interp_state()).
 * Returned ringbuffer is owned by the program state and freed in
 * fd6_program_destroy().
 */
static struct fd_ringbuffer *
create_interp_stateobj(struct fd_context *ctx, struct fd6_program_state *state)
{
   struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 18 * 4);

   emit_interp_state(ring, state->fs, false, false, 0);

   return ring;
}

/* Build the program streaming state which is not part of the pre-
 * baked stateobj because of dependency on other GL state (rasterflat
 * or sprite-coord-replacement).  Returns a new reference in either
 * case; the caller is responsible for dropping it.
 */
struct fd_ringbuffer *
fd6_program_interp_state(struct fd6_emit *emit)
{
   const struct fd6_program_state *state = fd6_emit_get_prog(emit);

   if (!unlikely(emit->rasterflat || emit->sprite_coord_enable)) {
      /* fastpath: neither dynamic condition applies, so the pre-baked
       * default state object (built in create_interp_stateobj()) is valid:
       */
      return fd_ringbuffer_ref(state->interp_stateobj);
   } else {
      /* slowpath: re-emit into a streaming ringbuffer with the current
       * rasterflat / sprite-coord state folded in:
       */
      struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
         emit->ctx->batch->submit, 18 * 4, FD_RINGBUFFER_STREAMING);

      emit_interp_state(ring, state->fs, emit->rasterflat,
                        emit->sprite_coord_mode, emit->sprite_coord_enable);

      return ring;
   }
}

/* Emit VPC_VARYING_INTERP_MODE and VPC_VARYING_PS_REPL_MODE for all FS
 * inputs.  Both register arrays encode one 2-bit field per varying
 * location, 16 locations per 32-bit register, 8 registers total:
 *
 *  - vinterp: 0b01 selects flat interpolation; 0b10 / 0b11 force the
 *    component to constant 0.0f / 1.0f (used for point-sprite .zw).
 *  - vpsrepl: point-sprite coordinate replacement selector per component.
 */
static void
emit_interp_state(struct fd_ringbuffer *ring, struct ir3_shader_variant *fs,
                  bool rasterflat, bool sprite_coord_mode,
                  uint32_t sprite_coord_enable)
{
   uint32_t vinterp[8], vpsrepl[8];

   memset(vinterp, 0, sizeof(vinterp));
   memset(vpsrepl, 0, sizeof(vpsrepl));

   for (int j = -1; (j = ir3_next_varying(fs, j)) < (int)fs->inputs_count;) {

      /* NOTE: varyings are packed, so if compmask is 0xb
       * then first, third, and fourth component occupy
       * three consecutive varying slots:
       */
      unsigned compmask = fs->inputs[j].compmask;

      uint32_t inloc = fs->inputs[j].inloc;

      /* Mark each used component of a flat-shaded input (or one that is
       * flat only under glShadeModel(GL_FLAT), i.e. rasterflat) as flat:
       */
      if (fs->inputs[j].flat || (fs->inputs[j].rasterflat && rasterflat)) {
         uint32_t loc = inloc;

         for (int i = 0; i < 4; i++) {
            if (compmask & (1 << i)) {
               vinterp[loc / 16] |= 1 << ((loc % 16) * 2);
               loc++;
            }
         }
      }

      bool coord_mode = sprite_coord_mode;
      if (ir3_point_sprite(fs, j, sprite_coord_enable, &coord_mode)) {
         /* mask is two 2-bit fields, where:
          *   '01' -> S
          *   '10' -> T
          *   '11' -> 1 - T  (flip mode)
          */
         unsigned mask = coord_mode ? 0b1101 : 0b1001;
         uint32_t loc = inloc;
         if (compmask & 0x1) {
            vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2);
            loc++;
         }
         if (compmask & 0x2) {
            vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2);
            loc++;
         }
         if (compmask & 0x4) {
            /* .z <- 0.0f */
            vinterp[loc / 16] |= 0b10 << ((loc % 16) * 2);
            loc++;
         }
         if (compmask & 0x8) {
            /* .w <- 1.0f */
            vinterp[loc / 16] |= 0b11 << ((loc % 16) * 2);
            loc++;
         }
      }
   }

   OUT_PKT4(ring, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
   for (int i = 0; i < 8; i++)
      OUT_RING(ring, vinterp[i]); /* VPC_VARYING_INTERP[i].MODE */

   OUT_PKT4(ring, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
   for (int i = 0; i < 8; i++)
      OUT_RING(ring, vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */
}

/* ir3_cache callback: build the fd6_program_state (binning, draw, config,
 * and interp state objects) for a linked set of shader variants.  The
 * variant pointers are borrowed from the cache; the returned state is
 * freed via fd6_program_destroy().
 */
static struct ir3_program_state *
fd6_program_create(void *data, struct ir3_shader_variant *bs,
                   struct ir3_shader_variant *vs, struct ir3_shader_variant *hs,
                   struct ir3_shader_variant *ds, struct ir3_shader_variant *gs,
                   struct ir3_shader_variant *fs,
                   const struct ir3_cache_key *key) in_dt
{
   struct fd_context *ctx = fd_context(data);
   struct fd_screen *screen = ctx->screen;
   struct fd6_program_state *state = CALLOC_STRUCT(fd6_program_state);

   tc_assert_driver_thread(ctx->tc);

   /* if we have streamout, use full VS in binning pass, as the
    * binning pass VS will have outputs on other than position/psize
    * stripped out:
    */
   state->bs = vs->stream_output.num_outputs ? vs : bs;
   state->vs = vs;
   state->hs = hs;
   state->ds = ds;
   state->gs = gs;
   state->fs = fs;
   state->binning_stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000);
   state->stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000);

#ifdef DEBUG
   /* Sanity-check that the binning variant's VS inputs land in the same
    * registers as the full variant's (setup_stateobj relies on this when
    * tess is not used):
    */
   if (!ds) {
      for (unsigned i = 0; i < bs->inputs_count; i++) {
         if (vs->inputs[i].sysval)
            continue;
         assert(bs->inputs[i].regid == vs->inputs[i].regid);
      }
   }
#endif

   if (hs) {
      /* Allocate the fixed-size tess factor BO globally on the screen.  This
       * lets the program (which ideally we would have shared across contexts,
       * though the current ir3_cache impl doesn't do that) bake in the
       * addresses.
       */
      fd_screen_lock(screen);
      if (!screen->tess_bo)
         screen->tess_bo =
            fd_bo_new(screen->dev, FD6_TESS_BO_SIZE, FD_BO_NOMAP, "tessfactor");
      fd_screen_unlock(screen);
   }

   setup_config_stateobj(ctx, state);
   setup_stateobj(state->binning_stateobj, ctx, state, key, true);
   setup_stateobj(state->stateobj, ctx, state, key, false);
   state->interp_stateobj = create_interp_stateobj(ctx, state);

   /* Stream output comes from the last geometry stage (GS > DS > VS): */
   const struct ir3_stream_output_info *stream_output =
      &fd6_last_shader(state)->stream_output;
   if (stream_output->num_outputs > 0)
      state->stream_output = stream_output;

   return &state->base;
}

/* ir3_cache callback: release all ringbuffers owned by the program state
 * (counterpart of fd6_program_create).  The shader variants themselves
 * are owned by the cache and are not freed here.
 */
static void
fd6_program_destroy(void *data, struct ir3_program_state *state)
{
   struct fd6_program_state *so = fd6_program_state(state);
   fd_ringbuffer_del(so->stateobj);
   fd_ringbuffer_del(so->binning_stateobj);
   fd_ringbuffer_del(so->config_stateobj);
   fd_ringbuffer_del(so->interp_stateobj);
   /* streamout_stateobj is only created when stream output is used: */
   if (so->streamout_stateobj)
      fd_ringbuffer_del(so->streamout_stateobj);
   free(so);
}

/* vtable handed to ir3_cache_create() below: */
static const struct ir3_cache_funcs cache_funcs = {
   .create_state = fd6_program_create,
   .destroy_state = fd6_program_destroy,
};

/* Hook up the a6xx program-state machinery on context creation: create the
 * shader-variant cache and install the common ir3/freedreno shader state
 * functions on the pipe context.
 */
void
fd6_prog_init(struct pipe_context *pctx)
{
   struct fd_context *ctx = fd_context(pctx);

   ctx->shader_cache = ir3_cache_create(&cache_funcs, ctx);

   ir3_prog_init(pctx);

   fd_prog_init(pctx);
}