1/* 2 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "pipe/p_state.h"
#include "util/format/u_format.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_string.h"

#include "freedreno_program.h"

#include "fd4_emit.h"
#include "fd4_format.h"
#include "fd4_program.h"
#include "fd4_texture.h"

/* Emit a shader variant's instructions into the command stream via a
 * CP_LOAD_STATE4 packet.  With the FD_DBG(DIRECT) debug flag the
 * instruction dwords are copied inline after the packet header
 * (SS4_DIRECT); otherwise the packet references the shader's bo through
 * a relocation (SS4_INDIRECT) and carries no inline payload.
 */
void
fd4_emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so)
{
   const struct ir3_info *si = &so->info;
   enum a4xx_state_block sb = fd4_stage2shadersb(so->type);
   enum a4xx_state_src src;
   uint32_t i, sz, *bin;

   if (FD_DBG(DIRECT)) {
      /* direct mode: payload dwords follow the two header dwords */
      sz = si->sizedwords;
      src = SS4_DIRECT;
      bin = fd_bo_map(so->bo);
   } else {
      /* indirect mode: no inline payload, hw fetches from the bo */
      sz = 0;
      src = SS4_INDIRECT;
      bin = NULL;
   }

   OUT_PKT3(ring, CP_LOAD_STATE4, 2 + sz);
   OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) |
                     CP_LOAD_STATE4_0_STATE_SRC(src) |
                     CP_LOAD_STATE4_0_STATE_BLOCK(sb) |
                     CP_LOAD_STATE4_0_NUM_UNIT(so->instrlen));
   if (bin) {
      /* direct: ext src addr unused, instructions are inlined below */
      OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) |
                        CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER));
   } else {
      OUT_RELOC(ring, so->bo, 0, CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER), 0);
   }

   /* for how clever coverity is, it is sometimes rather dull, and
    * doesn't realize that the only case where bin==NULL, sz==0:
    */
   assume(bin || (sz == 0));

   for (i = 0; i < sz; i++) {
      OUT_RING(ring, bin[i]);
   }
}

/* Per-stage bookkeeping used while partitioning the shared const and
 * instruction memory between the shader stages:
 */
struct stage {
   const struct ir3_shader_variant *v;
   const struct ir3_info *i;
   /* const sizes are in units of 4 * vec4 */
   uint8_t constoff;
   uint8_t constlen;
   /* instr sizes are in units of 16 instructions */
   uint8_t instroff;
   uint8_t instrlen;
};

/* Indices into the stage array below.  HS/DS/GS are placeholders on
 * a4xx (no variants are ever bound for them here — see setup_stages).
 */
enum { VS = 0, FS = 1, HS = 2, DS = 3, GS = 4, MAX_STAGES };

/* Resolve the VS/FS variants from the emit state, copy their const and
 * instruction sizes into the stage table, and carve up the shared const
 * space and 64-unit instruction space between VS and FS.
 */
static void
setup_stages(struct fd4_emit *emit, struct stage *s)
{
   unsigned i;

   s[VS].v = fd4_emit_get_vp(emit);
   s[FS].v = fd4_emit_get_fp(emit);

   s[HS].v = s[DS].v = s[GS].v = NULL; /* for now */

   for (i = 0; i < MAX_STAGES; i++) {
      if (s[i].v) {
         s[i].i = &s[i].v->info;
         /* constlen is in units of 4 * vec4: */
         assert(s[i].v->constlen % 4 == 0);
         s[i].constlen = s[i].v->constlen / 4;
         /* instrlen is already in units of 16 instr.. although
          * probably we should ditch that and not make the compiler
          * care about instruction group size of a3xx vs a4xx
          */
         s[i].instrlen = s[i].v->instrlen;
      } else {
         s[i].i = NULL;
         s[i].constlen = 0;
         s[i].instrlen = 0;
      }
   }

   /* NOTE: at least for gles2, blob partitions VS at bottom of const
    * space and FS taking entire remaining space.  We probably don't
    * need to do that the same way, but for now mimic what the blob
    * does to make it easier to diff against register values from blob
    *
    * NOTE: if VS.instrlen + FS.instrlen > 64, then one or both shaders
    * is run from external memory.
    */
   if ((s[VS].instrlen + s[FS].instrlen) > 64) {
      /* prioritize FS for internal memory: */
      if (s[FS].instrlen < 64) {
         /* if FS can fit, kick VS out to external memory: */
         s[VS].instrlen = 0;
      } else if (s[VS].instrlen < 64) {
         /* otherwise if VS can fit, kick out FS: */
         s[FS].instrlen = 0;
      } else {
         /* neither can fit, run both from external memory: */
         s[VS].instrlen = 0;
         s[FS].instrlen = 0;
      }
   }
   /* VS gets a fixed 66 units at the bottom of const space, FS gets
    * the rest (mimics blob partitioning, per the NOTE above):
    */
   s[VS].constlen = 66;
   s[FS].constlen = 128 - s[VS].constlen;
   s[VS].instroff = 0;
   s[VS].constoff = 0;
   /* FS instructions are placed at the top of the 64-unit window; an
    * instrlen of 0 (kicked to external memory) leaves instroff == 64:
    */
   s[FS].instroff = 64 - s[FS].instrlen;
   s[FS].constoff = s[VS].constlen;
   /* unused stages just alias the FS offsets: */
   s[HS].instroff = s[DS].instroff = s[GS].instroff = s[FS].instroff;
   s[HS].constoff = s[DS].constoff = s[GS].constoff = s[FS].constoff;
}

/* Emit all program state for the currently bound VS/FS pair: HLSQ/SP
 * control registers, VS->FS linkage (SP_VS_OUT / SP_VS_VPC_DST), MRT
 * setup for the `nr` color buffers in `bufs`, varying interpolation
 * modes, and finally the shader instructions themselves.  In the
 * binning pass (emit->binning_pass) the FS is reduced to a stub and no
 * MRTs are programmed.
 */
void
fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, int nr,
                 struct pipe_surface **bufs)
{
   struct stage s[MAX_STAGES];
   uint32_t pos_regid, posz_regid, psize_regid, color_regid[8];
   uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid,
      samp_mask_regid, ij_regid[IJ_COUNT];
   enum a3xx_threadsize fssz;
   int constmode;
   int i, j;

   assert(nr <= ARRAY_SIZE(color_regid));

   if (emit->binning_pass)
      nr = 0;

   setup_stages(emit, s);

   fssz = (s[FS].i->double_threadsize) ? FOUR_QUADS : TWO_QUADS;

   /* blob seems to always use constmode currently: */
   constmode = 1;

   pos_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_POS);
   if (pos_regid == regid(63, 0)) {
      /* hw dislikes when there is no position output, which can
       * happen for transform-feedback vertex shaders.  Just tell
       * the hw to use r0.x, with whatever random value is there:
       */
      pos_regid = regid(0, 0);
   }
   posz_regid = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DEPTH);
   psize_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_PSIZ);
   if (s[FS].v->color0_mrt) {
      /* single color output replicated to all MRTs: */
      color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] =
         color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] =
            ir3_find_output_regid(s[FS].v, FRAG_RESULT_COLOR);
   } else {
      color_regid[0] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA0);
      color_regid[1] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA1);
      color_regid[2] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA2);
      color_regid[3] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA3);
      color_regid[4] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA4);
      color_regid[5] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA5);
      color_regid[6] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA6);
      color_regid[7] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA7);
   }

   samp_id_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_SAMPLE_ID);
   samp_mask_regid =
      ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_SAMPLE_MASK_IN);
   face_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRONT_FACE);
   coord_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRAG_COORD);
   /* zw components live two regs above xy; regid(63,0) means "unused": */
   zwcoord_regid =
      (coord_regid == regid(63, 0)) ? regid(63, 0) : (coord_regid + 2);
   for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
      ij_regid[i] = ir3_find_sysval_regid(
         s[FS].v, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);

   /* we could probably divide this up into things that need to be
    * emitted if frag-prog is dirty vs if vert-prog is dirty..
    */

   OUT_PKT0(ring, REG_A4XX_HLSQ_UPDATE_CONTROL, 1);
   OUT_RING(ring, 0x00000003);

   OUT_PKT0(ring, REG_A4XX_HLSQ_CONTROL_0_REG, 5);
   OUT_RING(ring, A4XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(fssz) |
                     A4XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) |
                     A4XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE |
                     /* NOTE: I guess SHADERRESTART and CONSTFULLUPDATE maybe
                      * flush some caches? I think we only need to set those
                      * bits if we have updated const or shader..
                      */
                     A4XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART |
                     A4XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE);
   OUT_RING(ring, A4XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) |
                     A4XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE |
                     A4XX_HLSQ_CONTROL_1_REG_COORDREGID(coord_regid) |
                     A4XX_HLSQ_CONTROL_1_REG_ZWCOORDREGID(zwcoord_regid));
   OUT_RING(ring, A4XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(63) |
                     A4XX_HLSQ_CONTROL_2_REG_SAMPLEID_REGID(samp_id_regid) |
                     A4XX_HLSQ_CONTROL_2_REG_SAMPLEMASK_REGID(samp_mask_regid) |
                     A4XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid));
   /* XXX left out centroid/sample for now */
   OUT_RING(
      ring,
      A4XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) |
         A4XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) |
         A4XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(
            ij_regid[IJ_PERSP_CENTROID]) |
         A4XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID(
            ij_regid[IJ_LINEAR_CENTROID]));
   OUT_RING(ring, 0x00fcfcfc); /* XXX HLSQ_CONTROL_4 */

   /* per-stage const/instr layout registers (HS/DS/GS are never
    * enabled here, so their entries carry the placeholder offsets
    * from setup_stages and no ENABLED bit):
    */
   OUT_PKT0(ring, REG_A4XX_HLSQ_VS_CONTROL_REG, 5);
   OUT_RING(ring,
            A4XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(s[VS].constlen) |
               A4XX_HLSQ_VS_CONTROL_REG_CONSTOBJECTOFFSET(s[VS].constoff) |
               COND(s[VS].v && s[VS].v->has_ssbo, A4XX_HLSQ_VS_CONTROL_REG_SSBO_ENABLE) |
               COND(s[VS].v, A4XX_HLSQ_VS_CONTROL_REG_ENABLED) |
               A4XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(s[VS].instrlen) |
               A4XX_HLSQ_VS_CONTROL_REG_SHADEROBJOFFSET(s[VS].instroff));
   OUT_RING(ring,
            A4XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(s[FS].constlen) |
               A4XX_HLSQ_FS_CONTROL_REG_CONSTOBJECTOFFSET(s[FS].constoff) |
               COND(s[FS].v && s[FS].v->has_ssbo, A4XX_HLSQ_FS_CONTROL_REG_SSBO_ENABLE) |
               COND(s[FS].v, A4XX_HLSQ_FS_CONTROL_REG_ENABLED) |
               A4XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(s[FS].instrlen) |
               A4XX_HLSQ_FS_CONTROL_REG_SHADEROBJOFFSET(s[FS].instroff));
   OUT_RING(ring,
            A4XX_HLSQ_HS_CONTROL_REG_CONSTLENGTH(s[HS].constlen) |
               A4XX_HLSQ_HS_CONTROL_REG_CONSTOBJECTOFFSET(s[HS].constoff) |
               COND(s[HS].v && s[HS].v->has_ssbo, A4XX_HLSQ_HS_CONTROL_REG_SSBO_ENABLE) |
               A4XX_HLSQ_HS_CONTROL_REG_INSTRLENGTH(s[HS].instrlen) |
               A4XX_HLSQ_HS_CONTROL_REG_SHADEROBJOFFSET(s[HS].instroff));
   OUT_RING(ring,
            A4XX_HLSQ_DS_CONTROL_REG_CONSTLENGTH(s[DS].constlen) |
               A4XX_HLSQ_DS_CONTROL_REG_CONSTOBJECTOFFSET(s[DS].constoff) |
               COND(s[DS].v && s[DS].v->has_ssbo, A4XX_HLSQ_DS_CONTROL_REG_SSBO_ENABLE) |
               A4XX_HLSQ_DS_CONTROL_REG_INSTRLENGTH(s[DS].instrlen) |
               A4XX_HLSQ_DS_CONTROL_REG_SHADEROBJOFFSET(s[DS].instroff));
   OUT_RING(ring,
            A4XX_HLSQ_GS_CONTROL_REG_CONSTLENGTH(s[GS].constlen) |
               A4XX_HLSQ_GS_CONTROL_REG_CONSTOBJECTOFFSET(s[GS].constoff) |
               COND(s[GS].v && s[GS].v->has_ssbo, A4XX_HLSQ_GS_CONTROL_REG_SSBO_ENABLE) |
               A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH(s[GS].instrlen) |
               A4XX_HLSQ_GS_CONTROL_REG_SHADEROBJOFFSET(s[GS].instroff));

   OUT_PKT0(ring, REG_A4XX_SP_SP_CTRL_REG, 1);
   OUT_RING(ring,
            0x140010 | /* XXX */
               COND(emit->binning_pass, A4XX_SP_SP_CTRL_REG_BINNING_PASS));

   OUT_PKT0(ring, REG_A4XX_SP_INSTR_CACHE_CTRL, 1);
   OUT_RING(ring, 0x7f | /* XXX */
                     COND(s[VS].instrlen, A4XX_SP_INSTR_CACHE_CTRL_VS_BUFFER) |
                     COND(s[FS].instrlen, A4XX_SP_INSTR_CACHE_CTRL_FS_BUFFER) |
                     COND(s[VS].instrlen && s[FS].instrlen,
                          A4XX_SP_INSTR_CACHE_CTRL_INSTR_BUFFER));

   OUT_PKT0(ring, REG_A4XX_SP_VS_LENGTH_REG, 1);
   OUT_RING(ring, s[VS].v->instrlen); /* SP_VS_LENGTH_REG */

   OUT_PKT0(ring, REG_A4XX_SP_VS_CTRL_REG0, 3);
   OUT_RING(
      ring,
      A4XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) |
         A4XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(s[VS].i->max_half_reg + 1) |
         A4XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(s[VS].i->max_reg + 1) |
         A4XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) |
         A4XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) |
         A4XX_SP_VS_CTRL_REG0_SUPERTHREADMODE |
         COND(s[VS].v->need_pixlod, A4XX_SP_VS_CTRL_REG0_PIXLODENABLE));
   OUT_RING(ring,
            A4XX_SP_VS_CTRL_REG1_CONSTLENGTH(s[VS].constlen) |
               A4XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(s[VS].v->total_in));
   OUT_RING(ring, A4XX_SP_VS_PARAM_REG_POSREGID(pos_regid) |
                     A4XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) |
                     A4XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(s[FS].v->varying_in));

   struct ir3_shader_linkage l = {0};
   ir3_link_shaders(&l, s[VS].v, s[FS].v, false);

   /* Each SP_VS_OUT_REG packs two linkage entries (A/B halves).  When
    * l.cnt is odd the trailing read of l.var[j] picks up a zeroed
    * entry — safe because l was zero-initialized above.
    */
   for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) {
      uint32_t reg = 0;

      OUT_PKT0(ring, REG_A4XX_SP_VS_OUT_REG(i), 1);

      reg |= A4XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid);
      reg |= A4XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask);
      j++;

      reg |= A4XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid);
      reg |= A4XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask);
      j++;

      OUT_RING(ring, reg);
   }

   /* Each SP_VS_VPC_DST_REG packs four output locations; the same
    * zero-entry-overrun note as above applies when l.cnt % 4 != 0.
    */
   for (i = 0, j = 0; (i < 8) && (j < l.cnt); i++) {
      uint32_t reg = 0;

      OUT_PKT0(ring, REG_A4XX_SP_VS_VPC_DST_REG(i), 1);

      reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc + 8);
      reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc + 8);
      reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc + 8);
      reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc + 8);

      OUT_RING(ring, reg);
   }

   OUT_PKT0(ring, REG_A4XX_SP_VS_OBJ_OFFSET_REG, 2);
   OUT_RING(ring, A4XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[VS].constoff) |
                     A4XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[VS].instroff));
   OUT_RELOC(ring, s[VS].v->bo, 0, 0, 0); /* SP_VS_OBJ_START_REG */

   if (emit->binning_pass) {
      /* binning pass runs no real FS — program a zero-length stub: */
      OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1);
      OUT_RING(ring, 0x00000000); /* SP_FS_LENGTH_REG */

      OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2);
      OUT_RING(ring,
               A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
                  COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) |
                  A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(0) |
                  A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(0) |
                  A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
                  A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) |
                  A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE);
      OUT_RING(ring,
               A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) | 0x80000000);

      OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2);
      OUT_RING(ring,
               A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) |
                  A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff));
      OUT_RING(ring, 0x00000000);
   } else {
      OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1);
      OUT_RING(ring, s[FS].v->instrlen); /* SP_FS_LENGTH_REG */

      OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2);
      OUT_RING(
         ring,
         A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
            COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) |
            A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) |
            A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) |
            A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
            A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) |
            A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
            COND(s[FS].v->need_pixlod, A4XX_SP_FS_CTRL_REG0_PIXLODENABLE));
      OUT_RING(ring,
               A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) |
                  0x80000000 | /* XXX */
                  COND(s[FS].v->frag_face, A4XX_SP_FS_CTRL_REG1_FACENESS) |
                  COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG1_VARYING) |
                  COND(s[FS].v->fragcoord_compmask != 0,
                       A4XX_SP_FS_CTRL_REG1_FRAGCOORD));

      OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2);
      OUT_RING(ring,
               A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) |
                  A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff));
      OUT_RELOC(ring, s[FS].v->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */
   }

   OUT_PKT0(ring, REG_A4XX_SP_HS_OBJ_OFFSET_REG, 1);
   OUT_RING(ring, A4XX_SP_HS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[HS].constoff) |
                     A4XX_SP_HS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[HS].instroff));

   OUT_PKT0(ring, REG_A4XX_SP_DS_OBJ_OFFSET_REG, 1);
   OUT_RING(ring, A4XX_SP_DS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[DS].constoff) |
                     A4XX_SP_DS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[DS].instroff));

   OUT_PKT0(ring, REG_A4XX_SP_GS_OBJ_OFFSET_REG, 1);
   OUT_RING(ring, A4XX_SP_GS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[GS].constoff) |
                     A4XX_SP_GS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[GS].instroff));

   OUT_PKT0(ring, REG_A4XX_GRAS_CNTL, 1);
   OUT_RING(ring,
            CONDREG(face_regid, A4XX_GRAS_CNTL_IJ_PERSP) |
               CONDREG(zwcoord_regid, A4XX_GRAS_CNTL_IJ_PERSP) |
               CONDREG(ij_regid[IJ_PERSP_PIXEL], A4XX_GRAS_CNTL_IJ_PERSP) |
               CONDREG(ij_regid[IJ_LINEAR_PIXEL], A4XX_GRAS_CNTL_IJ_LINEAR) |
               CONDREG(ij_regid[IJ_PERSP_CENTROID], A4XX_GRAS_CNTL_IJ_PERSP));

   OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL2, 1);
   OUT_RING(
      ring,
      A4XX_RB_RENDER_CONTROL2_MSAA_SAMPLES(0) |
         CONDREG(ij_regid[IJ_PERSP_PIXEL],
                 A4XX_RB_RENDER_CONTROL2_IJ_PERSP_PIXEL) |
         CONDREG(ij_regid[IJ_PERSP_CENTROID],
                 A4XX_RB_RENDER_CONTROL2_IJ_PERSP_CENTROID) |
         CONDREG(ij_regid[IJ_LINEAR_PIXEL], A4XX_RB_RENDER_CONTROL2_SIZE) |
         CONDREG(samp_id_regid, A4XX_RB_RENDER_CONTROL2_SAMPLEID) |
         COND(s[FS].v->frag_face, A4XX_RB_RENDER_CONTROL2_FACENESS) |
         CONDREG(samp_mask_regid, A4XX_RB_RENDER_CONTROL2_SAMPLEMASK) |
         COND(s[FS].v->fragcoord_compmask != 0,
              A4XX_RB_RENDER_CONTROL2_COORD_MASK(s[FS].v->fragcoord_compmask)));

   OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT_REG, 1);
   OUT_RING(ring,
            A4XX_RB_FS_OUTPUT_REG_MRT(nr) |
               COND(s[FS].v->writes_pos, A4XX_RB_FS_OUTPUT_REG_FRAG_WRITES_Z));

   OUT_PKT0(ring, REG_A4XX_SP_FS_OUTPUT_REG, 1);
   OUT_RING(ring,
            A4XX_SP_FS_OUTPUT_REG_MRT(nr) |
               COND(s[FS].v->writes_pos, A4XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE) |
               A4XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid));

   /* per-MRT format/srgb/int setup; slots >= nr are cleared: */
   OUT_PKT0(ring, REG_A4XX_SP_FS_MRT_REG(0), 8);
   for (i = 0; i < 8; i++) {
      enum a4xx_color_fmt format = 0;
      bool srgb = false;
      bool uint = false;
      bool sint = false;
      if (i < nr) {
         format = fd4_emit_format(bufs[i]);
         if (bufs[i]) {
            if (!emit->no_decode_srgb)
               srgb = util_format_is_srgb(bufs[i]->format);
            uint = util_format_is_pure_uint(bufs[i]->format);
            sint = util_format_is_pure_sint(bufs[i]->format);
         }
      }
      OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid[i]) |
                        A4XX_SP_FS_MRT_REG_MRTFORMAT(format) |
                        COND(srgb, A4XX_SP_FS_MRT_REG_COLOR_SRGB) |
                        COND(uint, A4XX_SP_FS_MRT_REG_COLOR_UINT) |
                        COND(sint, A4XX_SP_FS_MRT_REG_COLOR_SINT) |
                        COND(color_regid[i] & HALF_REG_ID,
                             A4XX_SP_FS_MRT_REG_HALF_PRECISION));
   }

   if (emit->binning_pass) {
      OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2);
      OUT_RING(ring, A4XX_VPC_ATTR_THRDASSIGN(1) | 0x40000000 | /* XXX */
                        COND(s[VS].v->writes_psize, A4XX_VPC_ATTR_PSIZE));
      OUT_RING(ring, 0x00000000);
   } else {
      uint32_t vinterp[8], vpsrepl[8];

      memset(vinterp, 0, sizeof(vinterp));
      memset(vpsrepl, 0, sizeof(vpsrepl));

      /* looks like we need to do int varyings in the frag
       * shader on a4xx (no flatshad reg?  or a420.0 bug?):
       *
       *    (sy)(ss)nop
       *    (sy)ldlv.u32 r0.x,l[r0.x], 1
       *    ldlv.u32 r0.y,l[r0.x+1], 1
       *    (ss)bary.f (ei)r63.x, 0, r0.x
       *    (ss)(rpt1)cov.s32f16 hr0.x, (r)r0.x
       *    (rpt5)nop
       *    sam (f16)(xyzw)hr0.x, hr0.x, s#0, t#0
       *
       * Possibly on later a4xx variants we'll be able to use
       * something like the code below instead of workaround
       * in the shader:
       */
      /* figure out VARYING_INTERP / VARYING_PS_REPL register values: */
      for (j = -1;
           (j = ir3_next_varying(s[FS].v, j)) < (int)s[FS].v->inputs_count;) {
         /* NOTE: varyings are packed, so if compmask is 0xb
          * then first, third, and fourth component occupy
          * three consecutive varying slots:
          */
         unsigned compmask = s[FS].v->inputs[j].compmask;

         uint32_t inloc = s[FS].v->inputs[j].inloc;

         if (s[FS].v->inputs[j].flat ||
             (s[FS].v->inputs[j].rasterflat && emit->rasterflat)) {
            uint32_t loc = inloc;

            /* mark each present component of a flat varying with the
             * flat-interpolation mode (2 bits per location):
             */
            for (i = 0; i < 4; i++) {
               if (compmask & (1 << i)) {
                  vinterp[loc / 16] |= 1 << ((loc % 16) * 2);
                  // flatshade[loc / 32] |= 1 << (loc % 32);
                  loc++;
               }
            }
         }

         bool coord_mode = emit->sprite_coord_mode;
         if (ir3_point_sprite(s[FS].v, j, emit->sprite_coord_enable,
                              &coord_mode)) {
            /* mask is two 2-bit fields, where:
             *   '01' -> S
             *   '10' -> T
             *   '11' -> 1 - T  (flip mode)
             */
            unsigned mask = coord_mode ? 0b1101 : 0b1001;
            uint32_t loc = inloc;
            if (compmask & 0x1) {
               vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2);
               loc++;
            }
            if (compmask & 0x2) {
               vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2);
               loc++;
            }
            if (compmask & 0x4) {
               /* .z <- 0.0f */
               vinterp[loc / 16] |= 0b10 << ((loc % 16) * 2);
               loc++;
            }
            if (compmask & 0x8) {
               /* .w <- 1.0f */
               vinterp[loc / 16] |= 0b11 << ((loc % 16) * 2);
               loc++;
            }
         }
      }

      OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2);
      OUT_RING(ring, A4XX_VPC_ATTR_TOTALATTR(s[FS].v->total_in) |
                        A4XX_VPC_ATTR_THRDASSIGN(1) |
                        COND(s[FS].v->total_in > 0, A4XX_VPC_ATTR_ENABLE) |
                        0x40000000 | /* XXX */
                        COND(s[VS].v->writes_psize, A4XX_VPC_ATTR_PSIZE));
      OUT_RING(ring, A4XX_VPC_PACK_NUMFPNONPOSVAR(s[FS].v->total_in) |
                        A4XX_VPC_PACK_NUMNONPOSVSVAR(s[FS].v->total_in));

      OUT_PKT0(ring, REG_A4XX_VPC_VARYING_INTERP_MODE(0), 8);
      for (i = 0; i < 8; i++)
         OUT_RING(ring, vinterp[i]); /* VPC_VARYING_INTERP[i].MODE */

      OUT_PKT0(ring, REG_A4XX_VPC_VARYING_PS_REPL_MODE(0), 8);
      for (i = 0; i < 8; i++)
         OUT_RING(ring, vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */
   }

   /* instrlen == 0 means the stage runs from external memory (see
    * setup_stages), so there is nothing to load into instruction RAM:
    */
   if (s[VS].instrlen)
      fd4_emit_shader(ring, s[VS].v);

   if (!emit->binning_pass)
      if (s[FS].instrlen)
         fd4_emit_shader(ring, s[FS].v);
}

/* ir3_cache callback: wrap linked variants in a fd4_program_state.
 * Only bs/vs/fs are kept — hs/ds/gs are unused on a4xx here (see
 * setup_stages).  Ownership of the state object passes to the cache.
 */
static struct ir3_program_state *
fd4_program_create(void *data, struct ir3_shader_variant *bs,
                   struct ir3_shader_variant *vs, struct ir3_shader_variant *hs,
                   struct ir3_shader_variant *ds, struct ir3_shader_variant *gs,
                   struct ir3_shader_variant *fs,
                   const struct ir3_cache_key *key) in_dt
{
   struct fd_context *ctx = fd_context(data);
   struct fd4_program_state *state = CALLOC_STRUCT(fd4_program_state);

   tc_assert_driver_thread(ctx->tc);

   state->bs = bs;
   state->vs = vs;
   state->fs = fs;

   return &state->base;
}

/* ir3_cache callback: free a state object created above.  The variants
 * themselves are owned elsewhere and are not freed here.
 */
static void
fd4_program_destroy(void *data, struct ir3_program_state *state)
{
   struct fd4_program_state *so = fd4_program_state(state);
   free(so);
}

static const struct ir3_cache_funcs cache_funcs = {
   .create_state = fd4_program_create,
   .destroy_state = fd4_program_destroy,
};

/* Hook up the a4xx program-state cache and common ir3/freedreno
 * program handling on context creation.
 */
void
fd4_prog_init(struct pipe_context *pctx)
{
   struct fd_context *ctx = fd_context(pctx);

   ctx->shader_cache = ir3_cache_create(&cache_funcs, ctx);
   ir3_prog_init(pctx);
   fd_prog_init(pctx);
}