1/* 2 * Copyright (c) 2014 Scott Mansell 3 * Copyright © 2014 Broadcom 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22 * IN THE SOFTWARE. 23 */ 24 25#include <inttypes.h> 26#include "util/format/u_format.h" 27#include "util/crc32.h" 28#include "util/u_helpers.h" 29#include "util/u_math.h" 30#include "util/u_memory.h" 31#include "util/ralloc.h" 32#include "util/hash_table.h" 33#include "tgsi/tgsi_dump.h" 34#include "tgsi/tgsi_parse.h" 35#include "compiler/nir/nir.h" 36#include "compiler/nir/nir_builder.h" 37#include "compiler/nir_types.h" 38#include "nir/tgsi_to_nir.h" 39#include "vc4_context.h" 40#include "vc4_qpu.h" 41#include "vc4_qir.h" 42 43static struct qreg 44ntq_get_src(struct vc4_compile *c, nir_src src, int i); 45static void 46ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list); 47 48static int 49type_size(const struct glsl_type *type, bool bindless) 50{ 51 return glsl_count_attribute_slots(type, false); 52} 53 54static void 55resize_qreg_array(struct vc4_compile *c, 56 struct qreg **regs, 57 uint32_t *size, 58 uint32_t decl_size) 59{ 60 if (*size >= decl_size) 61 return; 62 63 uint32_t old_size = *size; 64 *size = MAX2(*size * 2, decl_size); 65 *regs = reralloc(c, *regs, struct qreg, *size); 66 if (!*regs) { 67 fprintf(stderr, "Malloc failure\n"); 68 abort(); 69 } 70 71 for (uint32_t i = old_size; i < *size; i++) 72 (*regs)[i] = c->undef; 73} 74 75static void 76ntq_emit_thrsw(struct vc4_compile *c) 77{ 78 if (!c->fs_threaded) 79 return; 80 81 /* Always thread switch after each texture operation for now. 82 * 83 * We could do better by batching a bunch of texture fetches up and 84 * then doing one thread switch and collecting all their results 85 * afterward. 86 */ 87 qir_emit_nondef(c, qir_inst(QOP_THRSW, c->undef, 88 c->undef, c->undef)); 89 c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL); 90} 91 92static struct qreg 93indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr) 94{ 95 struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0); 96 97 /* Clamp to [0, array size). Note that MIN/MAX are signed. */ 98 uint32_t range = nir_intrinsic_range(intr); 99 indirect_offset = qir_MAX(c, indirect_offset, qir_uniform_ui(c, 0)); 100 indirect_offset = qir_MIN_NOIMM(c, indirect_offset, 101 qir_uniform_ui(c, range - 4)); 102 103 qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0), 104 indirect_offset, 105 qir_uniform(c, QUNIFORM_UBO0_ADDR, 106 nir_intrinsic_base(intr))); 107 108 c->num_texture_samples++; 109 110 ntq_emit_thrsw(c); 111 112 return qir_TEX_RESULT(c); 113} 114 115static struct qreg 116vc4_ubo_load(struct vc4_compile *c, nir_intrinsic_instr *intr) 117{ 118 ASSERTED int buffer_index = nir_src_as_uint(intr->src[0]); 119 assert(buffer_index == 1); 120 assert(c->stage == QSTAGE_FRAG); 121 122 struct qreg offset = ntq_get_src(c, intr->src[1], 0); 123 124 /* Clamp to [0, array size). Note that MIN/MAX are signed. */ 125 offset = qir_MAX(c, offset, qir_uniform_ui(c, 0)); 126 offset = qir_MIN_NOIMM(c, offset, 127 qir_uniform_ui(c, c->fs_key->ubo_1_size - 4)); 128 129 qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0), 130 offset, 131 qir_uniform(c, QUNIFORM_UBO1_ADDR, 0)); 132 133 c->num_texture_samples++; 134 135 ntq_emit_thrsw(c); 136 137 return qir_TEX_RESULT(c); 138} 139 140nir_ssa_def * 141vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) 142{ 143 switch (swiz) { 144 default: 145 case PIPE_SWIZZLE_NONE: 146 fprintf(stderr, "warning: unknown swizzle\n"); 147 FALLTHROUGH; 148 case PIPE_SWIZZLE_0: 149 return nir_imm_float(b, 0.0); 150 case PIPE_SWIZZLE_1: 151 return nir_imm_float(b, 1.0); 152 case PIPE_SWIZZLE_X: 153 case PIPE_SWIZZLE_Y: 154 case PIPE_SWIZZLE_Z: 155 case PIPE_SWIZZLE_W: 156 return srcs[swiz]; 157 } 158} 159 160static struct qreg * 161ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def) 162{ 163 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, 164 def->num_components); 165 _mesa_hash_table_insert(c->def_ht, def, qregs); 166 return qregs; 167} 168 169/** 170 * This function is responsible for getting QIR results into the associated 171 * storage for a NIR instruction. 172 * 173 * If it's a NIR SSA def, then we just set the associated hash table entry to 174 * the new result. 175 * 176 * If it's a NIR reg, then we need to update the existing qreg assigned to the 177 * NIR destination with the incoming value. To do that without introducing 178 * new MOVs, we require that the incoming qreg either be a uniform, or be 179 * SSA-defined by the previous QIR instruction in the block and rewritable by 180 * this function. That lets us sneak ahead and insert the SF flag beforehand 181 * (knowing that the previous instruction doesn't depend on flags) and rewrite 182 * its destination to be the NIR reg's destination 183 */ 184static void 185ntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan, 186 struct qreg result) 187{ 188 struct qinst *last_inst = NULL; 189 if (!list_is_empty(&c->cur_block->instructions)) 190 last_inst = (struct qinst *)c->cur_block->instructions.prev; 191 192 assert(result.file == QFILE_UNIF || 193 (result.file == QFILE_TEMP && 194 last_inst && last_inst == c->defs[result.index])); 195 196 if (dest->is_ssa) { 197 assert(chan < dest->ssa.num_components); 198 199 struct qreg *qregs; 200 struct hash_entry *entry = 201 _mesa_hash_table_search(c->def_ht, &dest->ssa); 202 203 if (entry) 204 qregs = entry->data; 205 else 206 qregs = ntq_init_ssa_def(c, &dest->ssa); 207 208 qregs[chan] = result; 209 } else { 210 nir_register *reg = dest->reg.reg; 211 assert(dest->reg.base_offset == 0); 212 assert(reg->num_array_elems == 0); 213 struct hash_entry *entry = 214 _mesa_hash_table_search(c->def_ht, reg); 215 struct qreg *qregs = entry->data; 216 217 /* Insert a MOV if the source wasn't an SSA def in the 218 * previous instruction. 219 */ 220 if (result.file == QFILE_UNIF) { 221 result = qir_MOV(c, result); 222 last_inst = c->defs[result.index]; 223 } 224 225 /* We know they're both temps, so just rewrite index. */ 226 c->defs[last_inst->dst.index] = NULL; 227 last_inst->dst.index = qregs[chan].index; 228 229 /* If we're in control flow, then make this update of the reg 230 * conditional on the execution mask. 231 */ 232 if (c->execute.file != QFILE_NULL) { 233 last_inst->dst.index = qregs[chan].index; 234 235 /* Set the flags to the current exec mask. To insert 236 * the SF, we temporarily remove our SSA instruction. 237 */ 238 list_del(&last_inst->link); 239 qir_SF(c, c->execute); 240 list_addtail(&last_inst->link, 241 &c->cur_block->instructions); 242 243 last_inst->cond = QPU_COND_ZS; 244 last_inst->cond_is_exec_mask = true; 245 } 246 } 247} 248 249static struct qreg 250ntq_get_src(struct vc4_compile *c, nir_src src, int i) 251{ 252 struct hash_entry *entry; 253 if (src.is_ssa) { 254 entry = _mesa_hash_table_search(c->def_ht, src.ssa); 255 assert(i < src.ssa->num_components); 256 } else { 257 nir_register *reg = src.reg.reg; 258 entry = _mesa_hash_table_search(c->def_ht, reg); 259 assert(reg->num_array_elems == 0); 260 assert(src.reg.base_offset == 0); 261 assert(i < reg->num_components); 262 } 263 264 struct qreg *qregs = entry->data; 265 return qregs[i]; 266} 267 268static struct qreg 269ntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr, 270 unsigned src) 271{ 272 assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); 273 unsigned chan = ffs(instr->dest.write_mask) - 1; 274 struct qreg r = ntq_get_src(c, instr->src[src].src, 275 instr->src[src].swizzle[chan]); 276 277 assert(!instr->src[src].abs); 278 assert(!instr->src[src].negate); 279 280 return r; 281}; 282 283static inline struct qreg 284qir_SAT(struct vc4_compile *c, struct qreg val) 285{ 286 return qir_FMAX(c, 287 qir_FMIN(c, val, qir_uniform_f(c, 1.0)), 288 qir_uniform_f(c, 0.0)); 289} 290 291static struct qreg 292ntq_rcp(struct vc4_compile *c, struct qreg x) 293{ 294 struct qreg r = qir_RCP(c, x); 295 296 /* Apply a Newton-Raphson step to improve the accuracy. */ 297 r = qir_FMUL(c, r, qir_FSUB(c, 298 qir_uniform_f(c, 2.0), 299 qir_FMUL(c, x, r))); 300 301 return r; 302} 303 304static struct qreg 305ntq_rsq(struct vc4_compile *c, struct qreg x) 306{ 307 struct qreg r = qir_RSQ(c, x); 308 309 /* Apply a Newton-Raphson step to improve the accuracy. */ 310 r = qir_FMUL(c, r, qir_FSUB(c, 311 qir_uniform_f(c, 1.5), 312 qir_FMUL(c, 313 qir_uniform_f(c, 0.5), 314 qir_FMUL(c, x, 315 qir_FMUL(c, r, r))))); 316 317 return r; 318} 319 320static struct qreg 321ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1) 322{ 323 struct qreg src0_hi = qir_SHR(c, src0, 324 qir_uniform_ui(c, 24)); 325 struct qreg src1_hi = qir_SHR(c, src1, 326 qir_uniform_ui(c, 24)); 327 328 struct qreg hilo = qir_MUL24(c, src0_hi, src1); 329 struct qreg lohi = qir_MUL24(c, src0, src1_hi); 330 struct qreg lolo = qir_MUL24(c, src0, src1); 331 332 return qir_ADD(c, lolo, qir_SHL(c, 333 qir_ADD(c, hilo, lohi), 334 qir_uniform_ui(c, 24))); 335} 336 337static struct qreg 338ntq_scale_depth_texture(struct vc4_compile *c, struct qreg src) 339{ 340 struct qreg depthf = qir_ITOF(c, qir_SHR(c, src, 341 qir_uniform_ui(c, 8))); 342 return qir_FMUL(c, depthf, qir_uniform_f(c, 1.0f/0xffffff)); 343} 344 345/** 346 * Emits a lowered TXF_MS from an MSAA texture. 347 * 348 * The addressing math has been lowered in NIR, and now we just need to read 349 * it like a UBO. 350 */ 351static void 352ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr) 353{ 354 uint32_t tile_width = 32; 355 uint32_t tile_height = 32; 356 uint32_t tile_size = (tile_height * tile_width * 357 VC4_MAX_SAMPLES * sizeof(uint32_t)); 358 359 unsigned unit = instr->texture_index; 360 uint32_t w = align(c->key->tex[unit].msaa_width, tile_width); 361 uint32_t w_tiles = w / tile_width; 362 uint32_t h = align(c->key->tex[unit].msaa_height, tile_height); 363 uint32_t h_tiles = h / tile_height; 364 uint32_t size = w_tiles * h_tiles * tile_size; 365 366 struct qreg addr; 367 assert(instr->num_srcs == 1); 368 assert(instr->src[0].src_type == nir_tex_src_coord); 369 addr = ntq_get_src(c, instr->src[0].src, 0); 370 371 /* Perform the clamping required by kernel validation. */ 372 addr = qir_MAX(c, addr, qir_uniform_ui(c, 0)); 373 addr = qir_MIN_NOIMM(c, addr, qir_uniform_ui(c, size - 4)); 374 375 qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0), 376 addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit)); 377 378 ntq_emit_thrsw(c); 379 380 struct qreg tex = qir_TEX_RESULT(c); 381 c->num_texture_samples++; 382 383 enum pipe_format format = c->key->tex[unit].format; 384 if (util_format_is_depth_or_stencil(format)) { 385 struct qreg scaled = ntq_scale_depth_texture(c, tex); 386 for (int i = 0; i < 4; i++) 387 ntq_store_dest(c, &instr->dest, i, qir_MOV(c, scaled)); 388 } else { 389 for (int i = 0; i < 4; i++) 390 ntq_store_dest(c, &instr->dest, i, 391 qir_UNPACK_8_F(c, tex, i)); 392 } 393} 394 395static void 396ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) 397{ 398 struct qreg s, t, r, lod, compare; 399 bool is_txb = false, is_txl = false; 400 unsigned unit = instr->texture_index; 401 402 if (instr->op == nir_texop_txf) { 403 ntq_emit_txf(c, instr); 404 return; 405 } 406 407 for (unsigned i = 0; i < instr->num_srcs; i++) { 408 switch (instr->src[i].src_type) { 409 case nir_tex_src_coord: 410 s = ntq_get_src(c, instr->src[i].src, 0); 411 if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D) 412 t = qir_uniform_f(c, 0.5); 413 else 414 t = ntq_get_src(c, instr->src[i].src, 1); 415 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) 416 r = ntq_get_src(c, instr->src[i].src, 2); 417 break; 418 case nir_tex_src_bias: 419 lod = ntq_get_src(c, instr->src[i].src, 0); 420 is_txb = true; 421 break; 422 case nir_tex_src_lod: 423 lod = ntq_get_src(c, instr->src[i].src, 0); 424 is_txl = true; 425 break; 426 case nir_tex_src_comparator: 427 compare = ntq_get_src(c, instr->src[i].src, 0); 428 break; 429 default: 430 unreachable("unknown texture source"); 431 } 432 } 433 434 if (c->stage != QSTAGE_FRAG && !is_txl) { 435 /* From the GLSL 1.20 spec: 436 * 437 * "If it is mip-mapped and running on the vertex shader, 438 * then the base texture is used." 439 */ 440 is_txl = true; 441 lod = qir_uniform_ui(c, 0); 442 } 443 444 if (c->key->tex[unit].force_first_level) { 445 lod = qir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL, unit); 446 is_txl = true; 447 is_txb = false; 448 } 449 450 struct qreg texture_u[] = { 451 qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0, unit), 452 qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, unit), 453 qir_uniform(c, QUNIFORM_CONSTANT, 0), 454 qir_uniform(c, QUNIFORM_CONSTANT, 0), 455 }; 456 uint32_t next_texture_u = 0; 457 458 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE || is_txl) { 459 texture_u[2] = qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P2, 460 unit | (is_txl << 16)); 461 } 462 463 struct qinst *tmu; 464 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { 465 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), r); 466 tmu->src[qir_get_tex_uniform_src(tmu)] = 467 texture_u[next_texture_u++]; 468 } else if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP_TO_BORDER || 469 c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP || 470 c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP_TO_BORDER || 471 c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) { 472 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), 473 qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR, 474 unit)); 475 tmu->src[qir_get_tex_uniform_src(tmu)] = 476 texture_u[next_texture_u++]; 477 } 478 479 if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP) { 480 s = qir_SAT(c, s); 481 } 482 483 if (c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) { 484 t = qir_SAT(c, t); 485 } 486 487 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_T, 0), t); 488 tmu->src[qir_get_tex_uniform_src(tmu)] = 489 texture_u[next_texture_u++]; 490 491 if (is_txl || is_txb) { 492 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_B, 0), lod); 493 tmu->src[qir_get_tex_uniform_src(tmu)] = 494 texture_u[next_texture_u++]; 495 } 496 497 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_S, 0), s); 498 tmu->src[qir_get_tex_uniform_src(tmu)] = texture_u[next_texture_u++]; 499 500 c->num_texture_samples++; 501 502 ntq_emit_thrsw(c); 503 504 struct qreg tex = qir_TEX_RESULT(c); 505 506 enum pipe_format format = c->key->tex[unit].format; 507 508 if (util_format_is_depth_or_stencil(format)) { 509 struct qreg normalized = ntq_scale_depth_texture(c, tex); 510 struct qreg depth_output; 511 512 struct qreg u0 = qir_uniform_f(c, 0.0f); 513 struct qreg u1 = qir_uniform_f(c, 1.0f); 514 if (c->key->tex[unit].compare_mode) { 515 /* From the GL_ARB_shadow spec: 516 * 517 * "Let Dt (D subscript t) be the depth texture 518 * value, in the range [0, 1]. Let R be the 519 * interpolated texture coordinate clamped to the 520 * range [0, 1]." 521 */ 522 compare = qir_SAT(c, compare); 523 524 switch (c->key->tex[unit].compare_func) { 525 case PIPE_FUNC_NEVER: 526 depth_output = qir_uniform_f(c, 0.0f); 527 break; 528 case PIPE_FUNC_ALWAYS: 529 depth_output = u1; 530 break; 531 case PIPE_FUNC_EQUAL: 532 qir_SF(c, qir_FSUB(c, compare, normalized)); 533 depth_output = qir_SEL(c, QPU_COND_ZS, u1, u0); 534 break; 535 case PIPE_FUNC_NOTEQUAL: 536 qir_SF(c, qir_FSUB(c, compare, normalized)); 537 depth_output = qir_SEL(c, QPU_COND_ZC, u1, u0); 538 break; 539 case PIPE_FUNC_GREATER: 540 qir_SF(c, qir_FSUB(c, compare, normalized)); 541 depth_output = qir_SEL(c, QPU_COND_NC, u1, u0); 542 break; 543 case PIPE_FUNC_GEQUAL: 544 qir_SF(c, qir_FSUB(c, normalized, compare)); 545 depth_output = qir_SEL(c, QPU_COND_NS, u1, u0); 546 break; 547 case PIPE_FUNC_LESS: 548 qir_SF(c, qir_FSUB(c, compare, normalized)); 549 depth_output = qir_SEL(c, QPU_COND_NS, u1, u0); 550 break; 551 case PIPE_FUNC_LEQUAL: 552 qir_SF(c, qir_FSUB(c, normalized, compare)); 553 depth_output = qir_SEL(c, QPU_COND_NC, u1, u0); 554 break; 555 } 556 } else { 557 depth_output = normalized; 558 } 559 560 for (int i = 0; i < 4; i++) 561 ntq_store_dest(c, &instr->dest, i, 562 qir_MOV(c, depth_output)); 563 } else { 564 for (int i = 0; i < 4; i++) 565 ntq_store_dest(c, &instr->dest, i, 566 qir_UNPACK_8_F(c, tex, i)); 567 } 568} 569 570/** 571 * Computes x - floor(x), which is tricky because our FTOI truncates (rounds 572 * to zero). 573 */ 574static struct qreg 575ntq_ffract(struct vc4_compile *c, struct qreg src) 576{ 577 struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src)); 578 struct qreg diff = qir_FSUB(c, src, trunc); 579 qir_SF(c, diff); 580 581 qir_FADD_dest(c, diff, 582 diff, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS; 583 584 return qir_MOV(c, diff); 585} 586 587/** 588 * Computes floor(x), which is tricky because our FTOI truncates (rounds to 589 * zero). 590 */ 591static struct qreg 592ntq_ffloor(struct vc4_compile *c, struct qreg src) 593{ 594 struct qreg result = qir_ITOF(c, qir_FTOI(c, src)); 595 596 /* This will be < 0 if we truncated and the truncation was of a value 597 * that was < 0 in the first place. 598 */ 599 qir_SF(c, qir_FSUB(c, src, result)); 600 601 struct qinst *sub = qir_FSUB_dest(c, result, 602 result, qir_uniform_f(c, 1.0)); 603 sub->cond = QPU_COND_NS; 604 605 return qir_MOV(c, result); 606} 607 608/** 609 * Computes ceil(x), which is tricky because our FTOI truncates (rounds to 610 * zero). 611 */ 612static struct qreg 613ntq_fceil(struct vc4_compile *c, struct qreg src) 614{ 615 struct qreg result = qir_ITOF(c, qir_FTOI(c, src)); 616 617 /* This will be < 0 if we truncated and the truncation was of a value 618 * that was > 0 in the first place. 619 */ 620 qir_SF(c, qir_FSUB(c, result, src)); 621 622 qir_FADD_dest(c, result, 623 result, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS; 624 625 return qir_MOV(c, result); 626} 627 628static struct qreg 629ntq_shrink_sincos_input_range(struct vc4_compile *c, struct qreg x) 630{ 631 /* Since we're using a Taylor approximation, we want to have a small 632 * number of coefficients and take advantage of sin/cos repeating 633 * every 2pi. We keep our x as close to 0 as we can, since the series 634 * will be less accurate as |x| increases. (Also, be careful of 635 * shifting the input x value to be tricky with sin/cos relations, 636 * because getting accurate values for x==0 is very important for SDL 637 * rendering) 638 */ 639 struct qreg scaled_x = 640 qir_FMUL(c, x, 641 qir_uniform_f(c, 1.0f / (M_PI * 2.0f))); 642 /* Note: FTOI truncates toward 0. */ 643 struct qreg x_frac = qir_FSUB(c, scaled_x, 644 qir_ITOF(c, qir_FTOI(c, scaled_x))); 645 /* Map [0.5, 1] to [-0.5, 0] */ 646 qir_SF(c, qir_FSUB(c, x_frac, qir_uniform_f(c, 0.5))); 647 qir_FSUB_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NC; 648 /* Map [-1, -0.5] to [0, 0.5] */ 649 qir_SF(c, qir_FADD(c, x_frac, qir_uniform_f(c, 0.5))); 650 qir_FADD_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS; 651 652 return x_frac; 653} 654 655static struct qreg 656ntq_fsin(struct vc4_compile *c, struct qreg src) 657{ 658 float coeff[] = { 659 2.0 * M_PI, 660 -pow(2.0 * M_PI, 3) / (3 * 2 * 1), 661 pow(2.0 * M_PI, 5) / (5 * 4 * 3 * 2 * 1), 662 -pow(2.0 * M_PI, 7) / (7 * 6 * 5 * 4 * 3 * 2 * 1), 663 pow(2.0 * M_PI, 9) / (9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), 664 }; 665 666 struct qreg x = ntq_shrink_sincos_input_range(c, src); 667 struct qreg x2 = qir_FMUL(c, x, x); 668 struct qreg sum = qir_FMUL(c, x, qir_uniform_f(c, coeff[0])); 669 for (int i = 1; i < ARRAY_SIZE(coeff); i++) { 670 x = qir_FMUL(c, x, x2); 671 sum = qir_FADD(c, 672 sum, 673 qir_FMUL(c, 674 x, 675 qir_uniform_f(c, coeff[i]))); 676 } 677 return sum; 678} 679 680static struct qreg 681ntq_fcos(struct vc4_compile *c, struct qreg src) 682{ 683 float coeff[] = { 684 1.0f, 685 -pow(2.0 * M_PI, 2) / (2 * 1), 686 pow(2.0 * M_PI, 4) / (4 * 3 * 2 * 1), 687 -pow(2.0 * M_PI, 6) / (6 * 5 * 4 * 3 * 2 * 1), 688 pow(2.0 * M_PI, 8) / (8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), 689 -pow(2.0 * M_PI, 10) / (10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), 690 }; 691 692 struct qreg x_frac = ntq_shrink_sincos_input_range(c, src); 693 struct qreg sum = qir_uniform_f(c, coeff[0]); 694 struct qreg x2 = qir_FMUL(c, x_frac, x_frac); 695 struct qreg x = x2; /* Current x^2, x^4, or x^6 */ 696 for (int i = 1; i < ARRAY_SIZE(coeff); i++) { 697 if (i != 1) 698 x = qir_FMUL(c, x, x2); 699 700 sum = qir_FADD(c, qir_FMUL(c, 701 x, 702 qir_uniform_f(c, coeff[i])), 703 sum); 704 } 705 return sum; 706} 707 708static struct qreg 709ntq_fsign(struct vc4_compile *c, struct qreg src) 710{ 711 struct qreg t = qir_get_temp(c); 712 713 qir_SF(c, src); 714 qir_MOV_dest(c, t, qir_uniform_f(c, 0.0)); 715 qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC; 716 qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS; 717 return qir_MOV(c, t); 718} 719 720static void 721emit_vertex_input(struct vc4_compile *c, int attr) 722{ 723 enum pipe_format format = c->vs_key->attr_formats[attr]; 724 uint32_t attr_size = util_format_get_blocksize(format); 725 726 c->vattr_sizes[attr] = align(attr_size, 4); 727 for (int i = 0; i < align(attr_size, 4) / 4; i++) { 728 c->inputs[attr * 4 + i] = 729 qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i)); 730 c->num_inputs++; 731 } 732} 733 734static void 735emit_fragcoord_input(struct vc4_compile *c, int attr) 736{ 737 c->inputs[attr * 4 + 0] = qir_ITOF(c, qir_reg(QFILE_FRAG_X, 0)); 738 c->inputs[attr * 4 + 1] = qir_ITOF(c, qir_reg(QFILE_FRAG_Y, 0)); 739 c->inputs[attr * 4 + 2] = 740 qir_FMUL(c, 741 qir_ITOF(c, qir_FRAG_Z(c)), 742 qir_uniform_f(c, 1.0 / 0xffffff)); 743 c->inputs[attr * 4 + 3] = qir_RCP(c, qir_FRAG_W(c)); 744} 745 746static struct qreg 747emit_fragment_varying(struct vc4_compile *c, gl_varying_slot slot, 748 uint8_t swizzle) 749{ 750 uint32_t i = c->num_input_slots++; 751 struct qreg vary = { 752 QFILE_VARY, 753 i 754 }; 755 756 if (c->num_input_slots >= c->input_slots_array_size) { 757 c->input_slots_array_size = 758 MAX2(4, c->input_slots_array_size * 2); 759 760 c->input_slots = reralloc(c, c->input_slots, 761 struct vc4_varying_slot, 762 c->input_slots_array_size); 763 } 764 765 c->input_slots[i].slot = slot; 766 c->input_slots[i].swizzle = swizzle; 767 768 return qir_VARY_ADD_C(c, qir_FMUL(c, vary, qir_FRAG_W(c))); 769} 770 771static void 772emit_fragment_input(struct vc4_compile *c, int attr, gl_varying_slot slot) 773{ 774 for (int i = 0; i < 4; i++) { 775 c->inputs[attr * 4 + i] = 776 emit_fragment_varying(c, slot, i); 777 c->num_inputs++; 778 } 779} 780 781static void 782add_output(struct vc4_compile *c, 783 uint32_t decl_offset, 784 uint8_t slot, 785 uint8_t swizzle) 786{ 787 uint32_t old_array_size = c->outputs_array_size; 788 resize_qreg_array(c, &c->outputs, &c->outputs_array_size, 789 decl_offset + 1); 790 791 if (old_array_size != c->outputs_array_size) { 792 c->output_slots = reralloc(c, 793 c->output_slots, 794 struct vc4_varying_slot, 795 c->outputs_array_size); 796 } 797 798 c->output_slots[decl_offset].slot = slot; 799 c->output_slots[decl_offset].swizzle = swizzle; 800} 801 802static bool 803ntq_src_is_only_ssa_def_user(nir_src *src) 804{ 805 if (!src->is_ssa) 806 return false; 807 808 if (!list_is_empty(&src->ssa->if_uses)) 809 return false; 810 811 return (src->ssa->uses.next == &src->use_link && 812 src->ssa->uses.next->next == &src->ssa->uses); 813} 814 815/** 816 * In general, emits a nir_pack_unorm_4x8 as a series of MOVs with the pack 817 * bit set. 818 * 819 * However, as an optimization, it tries to find the instructions generating 820 * the sources to be packed and just emit the pack flag there, if possible. 821 */ 822static void 823ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr) 824{ 825 struct qreg result = qir_get_temp(c); 826 struct nir_alu_instr *vec4 = NULL; 827 828 /* If packing from a vec4 op (as expected), identify it so that we can 829 * peek back at what generated its sources. 830 */ 831 if (instr->src[0].src.is_ssa && 832 instr->src[0].src.ssa->parent_instr->type == nir_instr_type_alu && 833 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr)->op == 834 nir_op_vec4) { 835 vec4 = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); 836 } 837 838 /* If the pack is replicating the same channel 4 times, use the 8888 839 * pack flag. This is common for blending using the alpha 840 * channel. 841 */ 842 if (instr->src[0].swizzle[0] == instr->src[0].swizzle[1] && 843 instr->src[0].swizzle[0] == instr->src[0].swizzle[2] && 844 instr->src[0].swizzle[0] == instr->src[0].swizzle[3]) { 845 struct qreg rep = ntq_get_src(c, 846 instr->src[0].src, 847 instr->src[0].swizzle[0]); 848 ntq_store_dest(c, &instr->dest.dest, 0, qir_PACK_8888_F(c, rep)); 849 return; 850 } 851 852 for (int i = 0; i < 4; i++) { 853 int swiz = instr->src[0].swizzle[i]; 854 struct qreg src; 855 if (vec4) { 856 src = ntq_get_src(c, vec4->src[swiz].src, 857 vec4->src[swiz].swizzle[0]); 858 } else { 859 src = ntq_get_src(c, instr->src[0].src, swiz); 860 } 861 862 if (vec4 && 863 ntq_src_is_only_ssa_def_user(&vec4->src[swiz].src) && 864 src.file == QFILE_TEMP && 865 c->defs[src.index] && 866 qir_is_mul(c->defs[src.index]) && 867 !c->defs[src.index]->dst.pack) { 868 struct qinst *rewrite = c->defs[src.index]; 869 c->defs[src.index] = NULL; 870 rewrite->dst = result; 871 rewrite->dst.pack = QPU_PACK_MUL_8A + i; 872 continue; 873 } 874 875 qir_PACK_8_F(c, result, src, i); 876 } 877 878 ntq_store_dest(c, &instr->dest.dest, 0, qir_MOV(c, result)); 879} 880 881/** Handles sign-extended bitfield extracts for 16 bits. */ 882static struct qreg 883ntq_emit_ibfe(struct vc4_compile *c, struct qreg base, struct qreg offset, 884 struct qreg bits) 885{ 886 assert(bits.file == QFILE_UNIF && 887 c->uniform_contents[bits.index] == QUNIFORM_CONSTANT && 888 c->uniform_data[bits.index] == 16); 889 890 assert(offset.file == QFILE_UNIF && 891 c->uniform_contents[offset.index] == QUNIFORM_CONSTANT); 892 int offset_bit = c->uniform_data[offset.index]; 893 assert(offset_bit % 16 == 0); 894 895 return qir_UNPACK_16_I(c, base, offset_bit / 16); 896} 897 898/** Handles unsigned bitfield extracts for 8 bits. */ 899static struct qreg 900ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset, 901 struct qreg bits) 902{ 903 assert(bits.file == QFILE_UNIF && 904 c->uniform_contents[bits.index] == QUNIFORM_CONSTANT && 905 c->uniform_data[bits.index] == 8); 906 907 assert(offset.file == QFILE_UNIF && 908 c->uniform_contents[offset.index] == QUNIFORM_CONSTANT); 909 int offset_bit = c->uniform_data[offset.index]; 910 assert(offset_bit % 8 == 0); 911 912 return qir_UNPACK_8_I(c, base, offset_bit / 8); 913} 914 915/** 916 * If compare_instr is a valid comparison instruction, emits the 917 * compare_instr's comparison and returns the sel_instr's return value based 918 * on the compare_instr's result. 919 */ 920static bool 921ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest, 922 nir_alu_instr *compare_instr, 923 nir_alu_instr *sel_instr) 924{ 925 enum qpu_cond cond; 926 927 switch (compare_instr->op) { 928 case nir_op_feq32: 929 case nir_op_ieq32: 930 case nir_op_seq: 931 cond = QPU_COND_ZS; 932 break; 933 case nir_op_fneu32: 934 case nir_op_ine32: 935 case nir_op_sne: 936 cond = QPU_COND_ZC; 937 break; 938 case nir_op_fge32: 939 case nir_op_ige32: 940 case nir_op_uge32: 941 case nir_op_sge: 942 cond = QPU_COND_NC; 943 break; 944 case nir_op_flt32: 945 case nir_op_ilt32: 946 case nir_op_slt: 947 cond = QPU_COND_NS; 948 break; 949 default: 950 return false; 951 } 952 953 struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0); 954 struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1); 955 956 unsigned unsized_type = 957 nir_alu_type_get_base_type(nir_op_infos[compare_instr->op].input_types[0]); 958 if (unsized_type == nir_type_float) 959 qir_SF(c, qir_FSUB(c, src0, src1)); 960 else 961 qir_SF(c, qir_SUB(c, src0, src1)); 962 963 switch (sel_instr->op) { 964 case nir_op_seq: 965 case nir_op_sne: 966 case nir_op_sge: 967 case nir_op_slt: 968 *dest = qir_SEL(c, cond, 969 qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0)); 970 break; 971 972 case nir_op_b32csel: 973 *dest = qir_SEL(c, cond, 974 ntq_get_alu_src(c, sel_instr, 1), 975 ntq_get_alu_src(c, sel_instr, 2)); 976 break; 977 978 default: 979 *dest = qir_SEL(c, cond, 980 qir_uniform_ui(c, ~0), qir_uniform_ui(c, 0)); 981 break; 982 } 983 984 /* Make the temporary for nir_store_dest(). */ 985 *dest = qir_MOV(c, *dest); 986 987 return true; 988} 989 990/** 991 * Attempts to fold a comparison generating a boolean result into the 992 * condition code for selecting between two values, instead of comparing the 993 * boolean result against 0 to generate the condition code. 994 */ 995static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr, 996 struct qreg *src) 997{ 998 if (!instr->src[0].src.is_ssa) 999 goto out; 1000 if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu) 1001 goto out; 1002 nir_alu_instr *compare = 1003 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); 1004 if (!compare) 1005 goto out; 1006 1007 struct qreg dest; 1008 if (ntq_emit_comparison(c, &dest, compare, instr)) 1009 return dest; 1010 1011out: 1012 qir_SF(c, src[0]); 1013 return qir_MOV(c, qir_SEL(c, QPU_COND_NS, src[1], src[2])); 1014} 1015 1016static struct qreg 1017ntq_fddx(struct vc4_compile *c, struct qreg src) 1018{ 1019 /* Make sure that we have a bare temp to use for MUL rotation, so it 1020 * can be allocated to an accumulator. 1021 */ 1022 if (src.pack || src.file != QFILE_TEMP) 1023 src = qir_MOV(c, src); 1024 1025 struct qreg from_left = qir_ROT_MUL(c, src, 1); 1026 struct qreg from_right = qir_ROT_MUL(c, src, 15); 1027 1028 /* Distinguish left/right pixels of the quad. */ 1029 qir_SF(c, qir_AND(c, qir_reg(QFILE_QPU_ELEMENT, 0), 1030 qir_uniform_ui(c, 1))); 1031 1032 return qir_MOV(c, qir_SEL(c, QPU_COND_ZS, 1033 qir_FSUB(c, from_right, src), 1034 qir_FSUB(c, src, from_left))); 1035} 1036 1037static struct qreg 1038ntq_fddy(struct vc4_compile *c, struct qreg src) 1039{ 1040 if (src.pack || src.file != QFILE_TEMP) 1041 src = qir_MOV(c, src); 1042 1043 struct qreg from_bottom = qir_ROT_MUL(c, src, 2); 1044 struct qreg from_top = qir_ROT_MUL(c, src, 14); 1045 1046 /* Distinguish top/bottom pixels of the quad. */ 1047 qir_SF(c, qir_AND(c, 1048 qir_reg(QFILE_QPU_ELEMENT, 0), 1049 qir_uniform_ui(c, 2))); 1050 1051 return qir_MOV(c, qir_SEL(c, QPU_COND_ZS, 1052 qir_FSUB(c, from_top, src), 1053 qir_FSUB(c, src, from_bottom))); 1054} 1055 1056static void 1057ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) 1058{ 1059 /* This should always be lowered to ALU operations for VC4. */ 1060 assert(!instr->dest.saturate); 1061 1062 /* Vectors are special in that they have non-scalarized writemasks, 1063 * and just take the first swizzle channel for each argument in order 1064 * into each writemask channel. 1065 */ 1066 if (instr->op == nir_op_vec2 || 1067 instr->op == nir_op_vec3 || 1068 instr->op == nir_op_vec4) { 1069 struct qreg srcs[4]; 1070 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) 1071 srcs[i] = ntq_get_src(c, instr->src[i].src, 1072 instr->src[i].swizzle[0]); 1073 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) 1074 ntq_store_dest(c, &instr->dest.dest, i, 1075 qir_MOV(c, srcs[i])); 1076 return; 1077 } 1078 1079 if (instr->op == nir_op_pack_unorm_4x8) { 1080 ntq_emit_pack_unorm_4x8(c, instr); 1081 return; 1082 } 1083 1084 if (instr->op == nir_op_unpack_unorm_4x8) { 1085 struct qreg src = ntq_get_src(c, instr->src[0].src, 1086 instr->src[0].swizzle[0]); 1087 for (int i = 0; i < 4; i++) { 1088 if (instr->dest.write_mask & (1 << i)) 1089 ntq_store_dest(c, &instr->dest.dest, i, 1090 qir_UNPACK_8_F(c, src, i)); 1091 } 1092 return; 1093 } 1094 1095 /* General case: We can just grab the one used channel per src. */ 1096 struct qreg src[nir_op_infos[instr->op].num_inputs]; 1097 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 1098 src[i] = ntq_get_alu_src(c, instr, i); 1099 } 1100 1101 struct qreg result; 1102 1103 switch (instr->op) { 1104 case nir_op_mov: 1105 result = qir_MOV(c, src[0]); 1106 break; 1107 case nir_op_fmul: 1108 result = qir_FMUL(c, src[0], src[1]); 1109 break; 1110 case nir_op_fadd: 1111 result = qir_FADD(c, src[0], src[1]); 1112 break; 1113 case nir_op_fsub: 1114 result = qir_FSUB(c, src[0], src[1]); 1115 break; 1116 case nir_op_fmin: 1117 result = qir_FMIN(c, src[0], src[1]); 1118 break; 1119 case nir_op_fmax: 1120 result = qir_FMAX(c, src[0], src[1]); 1121 break; 1122 1123 case nir_op_f2i32: 1124 case nir_op_f2u32: 1125 result = qir_FTOI(c, src[0]); 1126 break; 1127 case nir_op_i2f32: 1128 case nir_op_u2f32: 1129 result = qir_ITOF(c, src[0]); 1130 break; 1131 case nir_op_b2f32: 1132 result = qir_AND(c, src[0], qir_uniform_f(c, 1.0)); 1133 break; 1134 case nir_op_b2i32: 1135 result = qir_AND(c, src[0], qir_uniform_ui(c, 1)); 1136 break; 1137 case nir_op_i2b32: 1138 case nir_op_f2b32: 1139 qir_SF(c, src[0]); 1140 result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, 1141 qir_uniform_ui(c, ~0), 1142 qir_uniform_ui(c, 0))); 1143 break; 1144 1145 case nir_op_iadd: 1146 result = qir_ADD(c, src[0], src[1]); 1147 break; 1148 case nir_op_ushr: 1149 result = qir_SHR(c, src[0], src[1]); 1150 break; 1151 case nir_op_isub: 1152 result = qir_SUB(c, src[0], src[1]); 1153 break; 1154 case nir_op_ishr: 1155 result = qir_ASR(c, src[0], src[1]); 1156 break; 1157 case nir_op_ishl: 1158 result = qir_SHL(c, src[0], src[1]); 1159 break; 1160 case nir_op_imin: 1161 result = qir_MIN(c, src[0], src[1]); 1162 break; 1163 case nir_op_imax: 1164 result = qir_MAX(c, src[0], src[1]); 1165 break; 1166 case nir_op_iand: 1167 result = qir_AND(c, src[0], src[1]); 1168 break; 1169 case nir_op_ior: 1170 result = qir_OR(c, src[0], src[1]); 1171 break; 1172 case nir_op_ixor: 1173 result = qir_XOR(c, src[0], src[1]); 1174 break; 1175 case nir_op_inot: 1176 result = qir_NOT(c, src[0]); 1177 break; 1178 1179 case nir_op_imul: 1180 result = ntq_umul(c, src[0], src[1]); 1181 break; 1182 1183 case nir_op_seq: 1184 case nir_op_sne: 1185 case nir_op_sge: 1186 case nir_op_slt: 1187 case nir_op_feq32: 1188 case nir_op_fneu32: 1189 case nir_op_fge32: 1190 case nir_op_flt32: 1191 case nir_op_ieq32: 1192 case nir_op_ine32: 1193 case nir_op_ige32: 1194 case nir_op_uge32: 1195 case nir_op_ilt32: 1196 if (!ntq_emit_comparison(c, &result, instr, instr)) { 1197 fprintf(stderr, "Bad comparison instruction\n"); 1198 } 1199 break; 1200 1201 case nir_op_b32csel: 1202 result = ntq_emit_bcsel(c, instr, src); 1203 break; 1204 case nir_op_fcsel: 1205 qir_SF(c, src[0]); 1206 result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, src[1], src[2])); 1207 break; 1208 1209 case nir_op_frcp: 1210 result = ntq_rcp(c, src[0]); 1211 break; 1212 case nir_op_frsq: 1213 result = ntq_rsq(c, src[0]); 1214 break; 1215 case nir_op_fexp2: 1216 result = qir_EXP2(c, src[0]); 1217 break; 1218 case nir_op_flog2: 1219 result = qir_LOG2(c, src[0]); 1220 break; 1221 1222 case nir_op_ftrunc: 1223 result = qir_ITOF(c, qir_FTOI(c, src[0])); 1224 break; 1225 case nir_op_fceil: 1226 result = ntq_fceil(c, src[0]); 1227 break; 1228 case nir_op_ffract: 1229 result = ntq_ffract(c, src[0]); 1230 break; 1231 case nir_op_ffloor: 1232 result = ntq_ffloor(c, src[0]); 1233 break; 1234 1235 case nir_op_fsin: 1236 result = ntq_fsin(c, src[0]); 1237 break; 1238 case nir_op_fcos: 1239 result = ntq_fcos(c, src[0]); 1240 break; 1241 1242 case nir_op_fsign: 1243 result = ntq_fsign(c, src[0]); 1244 break; 1245 1246 case nir_op_fabs: 1247 result = qir_FMAXABS(c, src[0], src[0]); 1248 break; 1249 case nir_op_iabs: 1250 result = qir_MAX(c, src[0], 1251 qir_SUB(c, qir_uniform_ui(c, 0), src[0])); 1252 break; 1253 1254 case nir_op_ibitfield_extract: 1255 result = ntq_emit_ibfe(c, src[0], src[1], src[2]); 1256 break; 1257 1258 case nir_op_ubitfield_extract: 1259 result = ntq_emit_ubfe(c, src[0], src[1], src[2]); 1260 break; 1261 1262 case nir_op_usadd_4x8_vc4: 1263 result = qir_V8ADDS(c, src[0], src[1]); 1264 break; 1265 1266 case nir_op_ussub_4x8_vc4: 1267 result = qir_V8SUBS(c, src[0], src[1]); 1268 break; 1269 1270 case nir_op_umin_4x8_vc4: 1271 result = qir_V8MIN(c, src[0], src[1]); 1272 break; 1273 1274 case nir_op_umax_4x8_vc4: 1275 result = qir_V8MAX(c, src[0], src[1]); 1276 break; 1277 1278 case nir_op_umul_unorm_4x8_vc4: 1279 result = qir_V8MULD(c, src[0], src[1]); 1280 break; 1281 1282 case nir_op_fddx: 1283 case nir_op_fddx_coarse: 1284 case nir_op_fddx_fine: 1285 result = ntq_fddx(c, src[0]); 1286 break; 1287 1288 case nir_op_fddy: 1289 case nir_op_fddy_coarse: 1290 case nir_op_fddy_fine: 1291 result = ntq_fddy(c, src[0]); 1292 break; 1293 1294 default: 1295 fprintf(stderr, "unknown NIR ALU inst: "); 1296 nir_print_instr(&instr->instr, stderr); 1297 fprintf(stderr, "\n"); 1298 abort(); 1299 } 1300 1301 /* We have a scalar result, so the instruction should only have a 1302 * single channel written to. 1303 */ 1304 assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); 1305 ntq_store_dest(c, &instr->dest.dest, 1306 ffs(instr->dest.write_mask) - 1, result); 1307} 1308 1309static void 1310emit_frag_end(struct vc4_compile *c) 1311{ 1312 struct qreg color; 1313 if (c->output_color_index != -1) { 1314 color = c->outputs[c->output_color_index]; 1315 } else { 1316 color = qir_uniform_ui(c, 0); 1317 } 1318 1319 uint32_t discard_cond = QPU_COND_ALWAYS; 1320 if (c->s->info.fs.uses_discard) { 1321 qir_SF(c, c->discard); 1322 discard_cond = QPU_COND_ZS; 1323 } 1324 1325 if (c->fs_key->stencil_enabled) { 1326 qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0), 1327 qir_uniform(c, QUNIFORM_STENCIL, 0)); 1328 if (c->fs_key->stencil_twoside) { 1329 qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0), 1330 qir_uniform(c, QUNIFORM_STENCIL, 1)); 1331 } 1332 if (c->fs_key->stencil_full_writemasks) { 1333 qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0), 1334 qir_uniform(c, QUNIFORM_STENCIL, 2)); 1335 } 1336 } 1337 1338 if (c->output_sample_mask_index != -1) { 1339 qir_MS_MASK(c, c->outputs[c->output_sample_mask_index]); 1340 } 1341 1342 if (c->fs_key->depth_enabled) { 1343 if (c->output_position_index != -1) { 1344 qir_FTOI_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0), 1345 qir_FMUL(c, 1346 c->outputs[c->output_position_index], 1347 qir_uniform_f(c, 0xffffff)))->cond = discard_cond; 1348 } else { 1349 qir_MOV_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0), 1350 qir_FRAG_Z(c))->cond = discard_cond; 1351 } 1352 } 1353 1354 if (!c->msaa_per_sample_output) { 1355 qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE, 0), 1356 color)->cond = discard_cond; 1357 } else { 1358 for (int i = 0; i < VC4_MAX_SAMPLES; i++) { 1359 qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE_MS, 0), 1360 c->sample_colors[i])->cond = discard_cond; 1361 } 1362 } 1363} 1364 1365static void 1366emit_scaled_viewport_write(struct vc4_compile *c, struct qreg rcp_w) 1367{ 1368 struct qreg packed = qir_get_temp(c); 1369 1370 for (int i = 0; i < 2; i++) { 1371 struct qreg scale = 1372 qir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, 0); 1373 1374 struct qreg packed_chan = packed; 1375 packed_chan.pack = QPU_PACK_A_16A + i; 1376 1377 qir_FTOI_dest(c, packed_chan, 1378 qir_FMUL(c, 1379 qir_FMUL(c, 1380 c->outputs[c->output_position_index + i], 1381 scale), 1382 rcp_w)); 1383 } 1384 1385 qir_VPM_WRITE(c, packed); 1386} 1387 1388static void 1389emit_zs_write(struct vc4_compile *c, struct qreg rcp_w) 1390{ 1391 struct qreg zscale = qir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0); 1392 struct qreg zoffset = qir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0); 1393 1394 qir_VPM_WRITE(c, qir_FADD(c, qir_FMUL(c, qir_FMUL(c, 1395 c->outputs[c->output_position_index + 2], 1396 zscale), 1397 rcp_w), 1398 zoffset)); 1399} 1400 1401static void 1402emit_rcp_wc_write(struct vc4_compile *c, struct qreg rcp_w) 1403{ 1404 qir_VPM_WRITE(c, rcp_w); 1405} 1406 1407static void 1408emit_point_size_write(struct vc4_compile *c) 1409{ 1410 struct qreg point_size; 1411 1412 if (c->output_point_size_index != -1) 1413 point_size = c->outputs[c->output_point_size_index]; 1414 else 1415 point_size = qir_uniform_f(c, 1.0); 1416 1417 qir_VPM_WRITE(c, point_size); 1418} 1419 1420/** 1421 * Emits a VPM read of the stub vertex attribute set up by vc4_draw.c. 1422 * 1423 * The simulator insists that there be at least one vertex attribute, so 1424 * vc4_draw.c will emit one if it wouldn't have otherwise. The simulator also 1425 * insists that all vertex attributes loaded get read by the VS/CS, so we have 1426 * to consume it here. 1427 */ 1428static void 1429emit_stub_vpm_read(struct vc4_compile *c) 1430{ 1431 if (c->num_inputs) 1432 return; 1433 1434 c->vattr_sizes[0] = 4; 1435 (void)qir_MOV(c, qir_reg(QFILE_VPM, 0)); 1436 c->num_inputs++; 1437} 1438 1439static void 1440emit_vert_end(struct vc4_compile *c, 1441 struct vc4_varying_slot *fs_inputs, 1442 uint32_t num_fs_inputs) 1443{ 1444 struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]); 1445 1446 emit_stub_vpm_read(c); 1447 1448 emit_scaled_viewport_write(c, rcp_w); 1449 emit_zs_write(c, rcp_w); 1450 emit_rcp_wc_write(c, rcp_w); 1451 if (c->vs_key->per_vertex_point_size) 1452 emit_point_size_write(c); 1453 1454 for (int i = 0; i < num_fs_inputs; i++) { 1455 struct vc4_varying_slot *input = &fs_inputs[i]; 1456 int j; 1457 1458 for (j = 0; j < c->num_outputs; j++) { 1459 struct vc4_varying_slot *output = 1460 &c->output_slots[j]; 1461 1462 if (input->slot == output->slot && 1463 input->swizzle == output->swizzle) { 1464 qir_VPM_WRITE(c, c->outputs[j]); 1465 break; 1466 } 1467 } 1468 /* Emit padding if we didn't find a declared VS output for 1469 * this FS input. 1470 */ 1471 if (j == c->num_outputs) 1472 qir_VPM_WRITE(c, qir_uniform_f(c, 0.0)); 1473 } 1474} 1475 1476static void 1477emit_coord_end(struct vc4_compile *c) 1478{ 1479 struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]); 1480 1481 emit_stub_vpm_read(c); 1482 1483 for (int i = 0; i < 4; i++) 1484 qir_VPM_WRITE(c, c->outputs[c->output_position_index + i]); 1485 1486 emit_scaled_viewport_write(c, rcp_w); 1487 emit_zs_write(c, rcp_w); 1488 emit_rcp_wc_write(c, rcp_w); 1489 if (c->vs_key->per_vertex_point_size) 1490 emit_point_size_write(c); 1491} 1492 1493static void 1494vc4_optimize_nir(struct nir_shader *s) 1495{ 1496 bool progress; 1497 unsigned lower_flrp = 1498 (s->options->lower_flrp16 ? 16 : 0) | 1499 (s->options->lower_flrp32 ? 32 : 0) | 1500 (s->options->lower_flrp64 ? 64 : 0); 1501 1502 do { 1503 progress = false; 1504 1505 NIR_PASS_V(s, nir_lower_vars_to_ssa); 1506 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); 1507 NIR_PASS(progress, s, nir_lower_phis_to_scalar, false); 1508 NIR_PASS(progress, s, nir_copy_prop); 1509 NIR_PASS(progress, s, nir_opt_remove_phis); 1510 NIR_PASS(progress, s, nir_opt_dce); 1511 NIR_PASS(progress, s, nir_opt_dead_cf); 1512 NIR_PASS(progress, s, nir_opt_cse); 1513 NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); 1514 NIR_PASS(progress, s, nir_opt_algebraic); 1515 NIR_PASS(progress, s, nir_opt_constant_folding); 1516 if (lower_flrp != 0) { 1517 bool lower_flrp_progress = false; 1518 1519 NIR_PASS(lower_flrp_progress, s, nir_lower_flrp, 1520 lower_flrp, 1521 false /* always_precise */); 1522 if (lower_flrp_progress) { 1523 NIR_PASS(progress, s, nir_opt_constant_folding); 1524 progress = true; 1525 } 1526 1527 /* Nothing should rematerialize any flrps, so we only 1528 * need to do this lowering once. 1529 */ 1530 lower_flrp = 0; 1531 } 1532 1533 NIR_PASS(progress, s, nir_opt_undef); 1534 NIR_PASS(progress, s, nir_opt_loop_unroll); 1535 } while (progress); 1536} 1537 1538static int 1539driver_location_compare(const void *in_a, const void *in_b) 1540{ 1541 const nir_variable *const *a = in_a; 1542 const nir_variable *const *b = in_b; 1543 1544 return (*a)->data.driver_location - (*b)->data.driver_location; 1545} 1546 1547static void 1548ntq_setup_inputs(struct vc4_compile *c) 1549{ 1550 unsigned num_entries = 0; 1551 nir_foreach_shader_in_variable(var, c->s) 1552 num_entries++; 1553 1554 nir_variable *vars[num_entries]; 1555 1556 unsigned i = 0; 1557 nir_foreach_shader_in_variable(var, c->s) 1558 vars[i++] = var; 1559 1560 /* Sort the variables so that we emit the input setup in 1561 * driver_location order. This is required for VPM reads, whose data 1562 * is fetched into the VPM in driver_location (TGSI register index) 1563 * order. 1564 */ 1565 qsort(&vars, num_entries, sizeof(*vars), driver_location_compare); 1566 1567 for (unsigned i = 0; i < num_entries; i++) { 1568 nir_variable *var = vars[i]; 1569 unsigned array_len = MAX2(glsl_get_length(var->type), 1); 1570 unsigned loc = var->data.driver_location; 1571 1572 assert(array_len == 1); 1573 (void)array_len; 1574 resize_qreg_array(c, &c->inputs, &c->inputs_array_size, 1575 (loc + 1) * 4); 1576 1577 if (c->stage == QSTAGE_FRAG) { 1578 if (var->data.location == VARYING_SLOT_POS) { 1579 emit_fragcoord_input(c, loc); 1580 } else if (util_varying_is_point_coord(var->data.location, 1581 c->fs_key->point_sprite_mask)) { 1582 c->inputs[loc * 4 + 0] = c->point_x; 1583 c->inputs[loc * 4 + 1] = c->point_y; 1584 } else { 1585 emit_fragment_input(c, loc, var->data.location); 1586 } 1587 } else { 1588 emit_vertex_input(c, loc); 1589 } 1590 } 1591} 1592 1593static void 1594ntq_setup_outputs(struct vc4_compile *c) 1595{ 1596 nir_foreach_shader_out_variable(var, c->s) { 1597 unsigned array_len = MAX2(glsl_get_length(var->type), 1); 1598 unsigned loc = var->data.driver_location * 4; 1599 1600 assert(array_len == 1); 1601 (void)array_len; 1602 1603 for (int i = 0; i < 4; i++) 1604 add_output(c, loc + i, var->data.location, i); 1605 1606 if (c->stage == QSTAGE_FRAG) { 1607 switch (var->data.location) { 1608 case FRAG_RESULT_COLOR: 1609 case FRAG_RESULT_DATA0: 1610 c->output_color_index = loc; 1611 break; 1612 case FRAG_RESULT_DEPTH: 1613 c->output_position_index = loc; 1614 break; 1615 case FRAG_RESULT_SAMPLE_MASK: 1616 c->output_sample_mask_index = loc; 1617 break; 1618 } 1619 } else { 1620 switch (var->data.location) { 1621 case VARYING_SLOT_POS: 1622 c->output_position_index = loc; 1623 break; 1624 case VARYING_SLOT_PSIZ: 1625 c->output_point_size_index = loc; 1626 break; 1627 } 1628 } 1629 } 1630} 1631 1632/** 1633 * Sets up the mapping from nir_register to struct qreg *. 1634 * 1635 * Each nir_register gets a struct qreg per 32-bit component being stored. 1636 */ 1637static void 1638ntq_setup_registers(struct vc4_compile *c, struct exec_list *list) 1639{ 1640 foreach_list_typed(nir_register, nir_reg, node, list) { 1641 unsigned array_len = MAX2(nir_reg->num_array_elems, 1); 1642 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, 1643 array_len * 1644 nir_reg->num_components); 1645 1646 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs); 1647 1648 for (int i = 0; i < array_len * nir_reg->num_components; i++) 1649 qregs[i] = qir_get_temp(c); 1650 } 1651} 1652 1653static void 1654ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr) 1655{ 1656 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); 1657 for (int i = 0; i < instr->def.num_components; i++) 1658 qregs[i] = qir_uniform_ui(c, instr->value[i].u32); 1659 1660 _mesa_hash_table_insert(c->def_ht, &instr->def, qregs); 1661} 1662 1663static void 1664ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr) 1665{ 1666 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); 1667 1668 /* QIR needs there to be *some* value, so pick 0 (same as for 1669 * ntq_setup_registers(). 1670 */ 1671 for (int i = 0; i < instr->def.num_components; i++) 1672 qregs[i] = qir_uniform_ui(c, 0); 1673} 1674 1675static void 1676ntq_emit_color_read(struct vc4_compile *c, nir_intrinsic_instr *instr) 1677{ 1678 assert(nir_src_as_uint(instr->src[0]) == 0); 1679 1680 /* Reads of the per-sample color need to be done in 1681 * order. 1682 */ 1683 int sample_index = (nir_intrinsic_base(instr) - 1684 VC4_NIR_TLB_COLOR_READ_INPUT); 1685 for (int i = 0; i <= sample_index; i++) { 1686 if (c->color_reads[i].file == QFILE_NULL) { 1687 c->color_reads[i] = 1688 qir_TLB_COLOR_READ(c); 1689 } 1690 } 1691 ntq_store_dest(c, &instr->dest, 0, 1692 qir_MOV(c, c->color_reads[sample_index])); 1693} 1694 1695static void 1696ntq_emit_load_input(struct vc4_compile *c, nir_intrinsic_instr *instr) 1697{ 1698 assert(instr->num_components == 1); 1699 assert(nir_src_is_const(instr->src[0]) && 1700 "vc4 doesn't support indirect inputs"); 1701 1702 if (c->stage == QSTAGE_FRAG && 1703 nir_intrinsic_base(instr) >= VC4_NIR_TLB_COLOR_READ_INPUT) { 1704 ntq_emit_color_read(c, instr); 1705 return; 1706 } 1707 1708 uint32_t offset = nir_intrinsic_base(instr) + 1709 nir_src_as_uint(instr->src[0]); 1710 int comp = nir_intrinsic_component(instr); 1711 ntq_store_dest(c, &instr->dest, 0, 1712 qir_MOV(c, c->inputs[offset * 4 + comp])); 1713} 1714 1715static void 1716ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) 1717{ 1718 unsigned offset; 1719 1720 switch (instr->intrinsic) { 1721 case nir_intrinsic_load_uniform: 1722 assert(instr->num_components == 1); 1723 if (nir_src_is_const(instr->src[0])) { 1724 offset = nir_intrinsic_base(instr) + 1725 nir_src_as_uint(instr->src[0]); 1726 assert(offset % 4 == 0); 1727 /* We need dwords */ 1728 offset = offset / 4; 1729 ntq_store_dest(c, &instr->dest, 0, 1730 qir_uniform(c, QUNIFORM_UNIFORM, 1731 offset)); 1732 } else { 1733 ntq_store_dest(c, &instr->dest, 0, 1734 indirect_uniform_load(c, instr)); 1735 } 1736 break; 1737 1738 case nir_intrinsic_load_ubo: 1739 assert(instr->num_components == 1); 1740 ntq_store_dest(c, &instr->dest, 0, vc4_ubo_load(c, instr)); 1741 break; 1742 1743 case nir_intrinsic_load_user_clip_plane: 1744 for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) { 1745 ntq_store_dest(c, &instr->dest, i, 1746 qir_uniform(c, QUNIFORM_USER_CLIP_PLANE, 1747 nir_intrinsic_ucp_id(instr) * 1748 4 + i)); 1749 } 1750 break; 1751 1752 case nir_intrinsic_load_blend_const_color_r_float: 1753 case nir_intrinsic_load_blend_const_color_g_float: 1754 case nir_intrinsic_load_blend_const_color_b_float: 1755 case nir_intrinsic_load_blend_const_color_a_float: 1756 ntq_store_dest(c, &instr->dest, 0, 1757 qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_X + 1758 (instr->intrinsic - 1759 nir_intrinsic_load_blend_const_color_r_float), 1760 0)); 1761 break; 1762 1763 case nir_intrinsic_load_blend_const_color_rgba8888_unorm: 1764 ntq_store_dest(c, &instr->dest, 0, 1765 qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_RGBA, 1766 0)); 1767 break; 1768 1769 case nir_intrinsic_load_blend_const_color_aaaa8888_unorm: 1770 ntq_store_dest(c, &instr->dest, 0, 1771 qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_AAAA, 1772 0)); 1773 break; 1774 1775 case nir_intrinsic_load_sample_mask_in: 1776 ntq_store_dest(c, &instr->dest, 0, 1777 qir_uniform(c, QUNIFORM_SAMPLE_MASK, 0)); 1778 break; 1779 1780 case nir_intrinsic_load_front_face: 1781 /* The register contains 0 (front) or 1 (back), and we need to 1782 * turn it into a NIR bool where true means front. 1783 */ 1784 ntq_store_dest(c, &instr->dest, 0, 1785 qir_ADD(c, 1786 qir_uniform_ui(c, -1), 1787 qir_reg(QFILE_FRAG_REV_FLAG, 0))); 1788 break; 1789 1790 case nir_intrinsic_load_input: 1791 ntq_emit_load_input(c, instr); 1792 break; 1793 1794 case nir_intrinsic_store_output: 1795 assert(nir_src_is_const(instr->src[1]) && 1796 "vc4 doesn't support indirect outputs"); 1797 offset = nir_intrinsic_base(instr) + 1798 nir_src_as_uint(instr->src[1]); 1799 1800 /* MSAA color outputs are the only case where we have an 1801 * output that's not lowered to being a store of a single 32 1802 * bit value. 1803 */ 1804 if (c->stage == QSTAGE_FRAG && instr->num_components == 4) { 1805 assert(offset == c->output_color_index); 1806 for (int i = 0; i < 4; i++) { 1807 c->sample_colors[i] = 1808 qir_MOV(c, ntq_get_src(c, instr->src[0], 1809 i)); 1810 } 1811 } else { 1812 offset = offset * 4 + nir_intrinsic_component(instr); 1813 assert(instr->num_components == 1); 1814 c->outputs[offset] = 1815 qir_MOV(c, ntq_get_src(c, instr->src[0], 0)); 1816 c->num_outputs = MAX2(c->num_outputs, offset + 1); 1817 } 1818 break; 1819 1820 case nir_intrinsic_discard: 1821 if (c->execute.file != QFILE_NULL) { 1822 qir_SF(c, c->execute); 1823 qir_MOV_cond(c, QPU_COND_ZS, c->discard, 1824 qir_uniform_ui(c, ~0)); 1825 } else { 1826 qir_MOV_dest(c, c->discard, qir_uniform_ui(c, ~0)); 1827 } 1828 break; 1829 1830 case nir_intrinsic_discard_if: { 1831 /* true (~0) if we're discarding */ 1832 struct qreg cond = ntq_get_src(c, instr->src[0], 0); 1833 1834 if (c->execute.file != QFILE_NULL) { 1835 /* execute == 0 means the channel is active. Invert 1836 * the condition so that we can use zero as "executing 1837 * and discarding." 1838 */ 1839 qir_SF(c, qir_AND(c, c->execute, qir_NOT(c, cond))); 1840 qir_MOV_cond(c, QPU_COND_ZS, c->discard, cond); 1841 } else { 1842 qir_OR_dest(c, c->discard, c->discard, 1843 ntq_get_src(c, instr->src[0], 0)); 1844 } 1845 1846 break; 1847 } 1848 1849 case nir_intrinsic_load_texture_rect_scaling: { 1850 assert(nir_src_is_const(instr->src[0])); 1851 int sampler = nir_src_as_int(instr->src[0]); 1852 1853 ntq_store_dest(c, &instr->dest, 0, 1854 qir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, sampler)); 1855 ntq_store_dest(c, &instr->dest, 1, 1856 qir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, sampler)); 1857 break; 1858 } 1859 1860 default: 1861 fprintf(stderr, "Unknown intrinsic: "); 1862 nir_print_instr(&instr->instr, stderr); 1863 fprintf(stderr, "\n"); 1864 break; 1865 } 1866} 1867 1868/* Clears (activates) the execute flags for any channels whose jump target 1869 * matches this block. 1870 */ 1871static void 1872ntq_activate_execute_for_block(struct vc4_compile *c) 1873{ 1874 qir_SF(c, qir_SUB(c, 1875 c->execute, 1876 qir_uniform_ui(c, c->cur_block->index))); 1877 qir_MOV_cond(c, QPU_COND_ZS, c->execute, qir_uniform_ui(c, 0)); 1878} 1879 1880static void 1881ntq_emit_if(struct vc4_compile *c, nir_if *if_stmt) 1882{ 1883 if (!c->vc4->screen->has_control_flow) { 1884 fprintf(stderr, 1885 "IF statement support requires updated kernel.\n"); 1886 return; 1887 } 1888 1889 nir_block *nir_else_block = nir_if_first_else_block(if_stmt); 1890 bool empty_else_block = 1891 (nir_else_block == nir_if_last_else_block(if_stmt) && 1892 exec_list_is_empty(&nir_else_block->instr_list)); 1893 1894 struct qblock *then_block = qir_new_block(c); 1895 struct qblock *after_block = qir_new_block(c); 1896 struct qblock *else_block; 1897 if (empty_else_block) 1898 else_block = after_block; 1899 else 1900 else_block = qir_new_block(c); 1901 1902 bool was_top_level = false; 1903 if (c->execute.file == QFILE_NULL) { 1904 c->execute = qir_MOV(c, qir_uniform_ui(c, 0)); 1905 was_top_level = true; 1906 } 1907 1908 /* Set ZS for executing (execute == 0) and jumping (if->condition == 1909 * 0) channels, and then update execute flags for those to point to 1910 * the ELSE block. 1911 */ 1912 qir_SF(c, qir_OR(c, 1913 c->execute, 1914 ntq_get_src(c, if_stmt->condition, 0))); 1915 qir_MOV_cond(c, QPU_COND_ZS, c->execute, 1916 qir_uniform_ui(c, else_block->index)); 1917 1918 /* Jump to ELSE if nothing is active for THEN, otherwise fall 1919 * through. 1920 */ 1921 qir_SF(c, c->execute); 1922 qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZC); 1923 qir_link_blocks(c->cur_block, else_block); 1924 qir_link_blocks(c->cur_block, then_block); 1925 1926 /* Process the THEN block. */ 1927 qir_set_emit_block(c, then_block); 1928 ntq_emit_cf_list(c, &if_stmt->then_list); 1929 1930 if (!empty_else_block) { 1931 /* Handle the end of the THEN block. First, all currently 1932 * active channels update their execute flags to point to 1933 * ENDIF 1934 */ 1935 qir_SF(c, c->execute); 1936 qir_MOV_cond(c, QPU_COND_ZS, c->execute, 1937 qir_uniform_ui(c, after_block->index)); 1938 1939 /* If everything points at ENDIF, then jump there immediately. */ 1940 qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, after_block->index))); 1941 qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS); 1942 qir_link_blocks(c->cur_block, after_block); 1943 qir_link_blocks(c->cur_block, else_block); 1944 1945 qir_set_emit_block(c, else_block); 1946 ntq_activate_execute_for_block(c); 1947 ntq_emit_cf_list(c, &if_stmt->else_list); 1948 } 1949 1950 qir_link_blocks(c->cur_block, after_block); 1951 1952 qir_set_emit_block(c, after_block); 1953 if (was_top_level) { 1954 c->execute = c->undef; 1955 c->last_top_block = c->cur_block; 1956 } else { 1957 ntq_activate_execute_for_block(c); 1958 } 1959} 1960 1961static void 1962ntq_emit_jump(struct vc4_compile *c, nir_jump_instr *jump) 1963{ 1964 struct qblock *jump_block; 1965 switch (jump->type) { 1966 case nir_jump_break: 1967 jump_block = c->loop_break_block; 1968 break; 1969 case nir_jump_continue: 1970 jump_block = c->loop_cont_block; 1971 break; 1972 default: 1973 unreachable("Unsupported jump type\n"); 1974 } 1975 1976 qir_SF(c, c->execute); 1977 qir_MOV_cond(c, QPU_COND_ZS, c->execute, 1978 qir_uniform_ui(c, jump_block->index)); 1979 1980 /* Jump to the destination block if everyone has taken the jump. */ 1981 qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, jump_block->index))); 1982 qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS); 1983 struct qblock *new_block = qir_new_block(c); 1984 qir_link_blocks(c->cur_block, jump_block); 1985 qir_link_blocks(c->cur_block, new_block); 1986 qir_set_emit_block(c, new_block); 1987} 1988 1989static void 1990ntq_emit_instr(struct vc4_compile *c, nir_instr *instr) 1991{ 1992 switch (instr->type) { 1993 case nir_instr_type_alu: 1994 ntq_emit_alu(c, nir_instr_as_alu(instr)); 1995 break; 1996 1997 case nir_instr_type_intrinsic: 1998 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr)); 1999 break; 2000 2001 case nir_instr_type_load_const: 2002 ntq_emit_load_const(c, nir_instr_as_load_const(instr)); 2003 break; 2004 2005 case nir_instr_type_ssa_undef: 2006 ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr)); 2007 break; 2008 2009 case nir_instr_type_tex: 2010 ntq_emit_tex(c, nir_instr_as_tex(instr)); 2011 break; 2012 2013 case nir_instr_type_jump: 2014 ntq_emit_jump(c, nir_instr_as_jump(instr)); 2015 break; 2016 2017 default: 2018 fprintf(stderr, "Unknown NIR instr type: "); 2019 nir_print_instr(instr, stderr); 2020 fprintf(stderr, "\n"); 2021 abort(); 2022 } 2023} 2024 2025static void 2026ntq_emit_block(struct vc4_compile *c, nir_block *block) 2027{ 2028 nir_foreach_instr(instr, block) { 2029 ntq_emit_instr(c, instr); 2030 } 2031} 2032 2033static void ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list); 2034 2035static void 2036ntq_emit_loop(struct vc4_compile *c, nir_loop *loop) 2037{ 2038 if (!c->vc4->screen->has_control_flow) { 2039 fprintf(stderr, 2040 "loop support requires updated kernel.\n"); 2041 ntq_emit_cf_list(c, &loop->body); 2042 return; 2043 } 2044 2045 bool was_top_level = false; 2046 if (c->execute.file == QFILE_NULL) { 2047 c->execute = qir_MOV(c, qir_uniform_ui(c, 0)); 2048 was_top_level = true; 2049 } 2050 2051 struct qblock *save_loop_cont_block = c->loop_cont_block; 2052 struct qblock *save_loop_break_block = c->loop_break_block; 2053 2054 c->loop_cont_block = qir_new_block(c); 2055 c->loop_break_block = qir_new_block(c); 2056 2057 qir_link_blocks(c->cur_block, c->loop_cont_block); 2058 qir_set_emit_block(c, c->loop_cont_block); 2059 ntq_activate_execute_for_block(c); 2060 2061 ntq_emit_cf_list(c, &loop->body); 2062 2063 /* If anything had explicitly continued, or is here at the end of the 2064 * loop, then we need to loop again. SF updates are masked by the 2065 * instruction's condition, so we can do the OR of the two conditions 2066 * within SF. 2067 */ 2068 qir_SF(c, c->execute); 2069 struct qinst *cont_check = 2070 qir_SUB_dest(c, 2071 c->undef, 2072 c->execute, 2073 qir_uniform_ui(c, c->loop_cont_block->index)); 2074 cont_check->cond = QPU_COND_ZC; 2075 cont_check->sf = true; 2076 2077 qir_BRANCH(c, QPU_COND_BRANCH_ANY_ZS); 2078 qir_link_blocks(c->cur_block, c->loop_cont_block); 2079 qir_link_blocks(c->cur_block, c->loop_break_block); 2080 2081 qir_set_emit_block(c, c->loop_break_block); 2082 if (was_top_level) { 2083 c->execute = c->undef; 2084 c->last_top_block = c->cur_block; 2085 } else { 2086 ntq_activate_execute_for_block(c); 2087 } 2088 2089 c->loop_break_block = save_loop_break_block; 2090 c->loop_cont_block = save_loop_cont_block; 2091} 2092 2093static void 2094ntq_emit_function(struct vc4_compile *c, nir_function_impl *func) 2095{ 2096 fprintf(stderr, "FUNCTIONS not handled.\n"); 2097 abort(); 2098} 2099 2100static void 2101ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list) 2102{ 2103 foreach_list_typed(nir_cf_node, node, node, list) { 2104 switch (node->type) { 2105 case nir_cf_node_block: 2106 ntq_emit_block(c, nir_cf_node_as_block(node)); 2107 break; 2108 2109 case nir_cf_node_if: 2110 ntq_emit_if(c, nir_cf_node_as_if(node)); 2111 break; 2112 2113 case nir_cf_node_loop: 2114 ntq_emit_loop(c, nir_cf_node_as_loop(node)); 2115 break; 2116 2117 case nir_cf_node_function: 2118 ntq_emit_function(c, nir_cf_node_as_function(node)); 2119 break; 2120 2121 default: 2122 fprintf(stderr, "Unknown NIR node type\n"); 2123 abort(); 2124 } 2125 } 2126} 2127 2128static void 2129ntq_emit_impl(struct vc4_compile *c, nir_function_impl *impl) 2130{ 2131 ntq_setup_registers(c, &impl->registers); 2132 ntq_emit_cf_list(c, &impl->body); 2133} 2134 2135static void 2136nir_to_qir(struct vc4_compile *c) 2137{ 2138 if (c->stage == QSTAGE_FRAG && c->s->info.fs.uses_discard) 2139 c->discard = qir_MOV(c, qir_uniform_ui(c, 0)); 2140 2141 ntq_setup_inputs(c); 2142 ntq_setup_outputs(c); 2143 2144 /* Find the main function and emit the body. */ 2145 nir_foreach_function(function, c->s) { 2146 assert(strcmp(function->name, "main") == 0); 2147 assert(function->impl); 2148 ntq_emit_impl(c, function->impl); 2149 } 2150} 2151 2152static const nir_shader_compiler_options nir_options = { 2153 .lower_all_io_to_temps = true, 2154 .lower_extract_byte = true, 2155 .lower_extract_word = true, 2156 .lower_insert_byte = true, 2157 .lower_insert_word = true, 2158 .lower_fdiv = true, 2159 .lower_ffma16 = true, 2160 .lower_ffma32 = true, 2161 .lower_ffma64 = true, 2162 .lower_flrp32 = true, 2163 .lower_fmod = true, 2164 .lower_fpow = true, 2165 .lower_fsat = true, 2166 .lower_fsqrt = true, 2167 .lower_ldexp = true, 2168 .lower_fneg = true, 2169 .lower_ineg = true, 2170 .lower_rotate = true, 2171 .lower_to_scalar = true, 2172 .lower_umax = true, 2173 .lower_umin = true, 2174 .lower_isign = true, 2175 .has_fsub = true, 2176 .has_isub = true, 2177 .max_unroll_iterations = 32, 2178 .force_indirect_unrolling = (nir_var_shader_in | nir_var_shader_out | nir_var_function_temp), 2179}; 2180 2181const void * 2182vc4_screen_get_compiler_options(struct pipe_screen *pscreen, 2183 enum pipe_shader_ir ir, 2184 enum pipe_shader_type shader) 2185{ 2186 return &nir_options; 2187} 2188 2189static int 2190count_nir_instrs(nir_shader *nir) 2191{ 2192 int count = 0; 2193 nir_foreach_function(function, nir) { 2194 if (!function->impl) 2195 continue; 2196 nir_foreach_block(block, function->impl) { 2197 nir_foreach_instr(instr, block) 2198 count++; 2199 } 2200 } 2201 return count; 2202} 2203 2204static struct vc4_compile * 2205vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, 2206 struct vc4_key *key, bool fs_threaded) 2207{ 2208 struct vc4_compile *c = qir_compile_init(); 2209 2210 c->vc4 = vc4; 2211 c->stage = stage; 2212 c->shader_state = &key->shader_state->base; 2213 c->program_id = key->shader_state->program_id; 2214 c->variant_id = 2215 p_atomic_inc_return(&key->shader_state->compiled_variant_count); 2216 c->fs_threaded = fs_threaded; 2217 2218 c->key = key; 2219 switch (stage) { 2220 case QSTAGE_FRAG: 2221 c->fs_key = (struct vc4_fs_key *)key; 2222 if (c->fs_key->is_points) { 2223 c->point_x = emit_fragment_varying(c, ~0, 0); 2224 c->point_y = emit_fragment_varying(c, ~0, 0); 2225 } else if (c->fs_key->is_lines) { 2226 c->line_x = emit_fragment_varying(c, ~0, 0); 2227 } 2228 break; 2229 case QSTAGE_VERT: 2230 c->vs_key = (struct vc4_vs_key *)key; 2231 break; 2232 case QSTAGE_COORD: 2233 c->vs_key = (struct vc4_vs_key *)key; 2234 break; 2235 } 2236 2237 c->s = nir_shader_clone(c, key->shader_state->base.ir.nir); 2238 2239 if (stage == QSTAGE_FRAG) { 2240 NIR_PASS_V(c->s, vc4_nir_lower_blend, c); 2241 } 2242 2243 struct nir_lower_tex_options tex_options = { 2244 .lower_txp = ~0, 2245 2246 /* Apply swizzles to all samplers. */ 2247 .swizzle_result = ~0, 2248 .lower_invalid_implicit_lod = true, 2249 }; 2250 2251 /* Lower the format swizzle and ARB_texture_swizzle-style swizzle. 2252 * The format swizzling applies before sRGB decode, and 2253 * ARB_texture_swizzle is the last thing before returning the sample. 2254 */ 2255 for (int i = 0; i < ARRAY_SIZE(key->tex); i++) { 2256 enum pipe_format format = c->key->tex[i].format; 2257 2258 if (!format) 2259 continue; 2260 2261 const uint8_t *format_swizzle = vc4_get_format_swizzle(format); 2262 2263 for (int j = 0; j < 4; j++) { 2264 uint8_t arb_swiz = c->key->tex[i].swizzle[j]; 2265 2266 if (arb_swiz <= 3) { 2267 tex_options.swizzles[i][j] = 2268 format_swizzle[arb_swiz]; 2269 } else { 2270 tex_options.swizzles[i][j] = arb_swiz; 2271 } 2272 } 2273 2274 if (util_format_is_srgb(format)) 2275 tex_options.lower_srgb |= (1 << i); 2276 } 2277 2278 NIR_PASS_V(c->s, nir_lower_tex, &tex_options); 2279 2280 if (c->key->ucp_enables) { 2281 if (stage == QSTAGE_FRAG) { 2282 NIR_PASS_V(c->s, nir_lower_clip_fs, 2283 c->key->ucp_enables, false); 2284 } else { 2285 NIR_PASS_V(c->s, nir_lower_clip_vs, 2286 c->key->ucp_enables, false, false, NULL); 2287 NIR_PASS_V(c->s, nir_lower_io_to_scalar, 2288 nir_var_shader_out); 2289 } 2290 } 2291 2292 /* FS input scalarizing must happen after nir_lower_two_sided_color, 2293 * which only handles a vec4 at a time. Similarly, VS output 2294 * scalarizing must happen after nir_lower_clip_vs. 2295 */ 2296 if (c->stage == QSTAGE_FRAG) 2297 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in); 2298 else 2299 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); 2300 2301 NIR_PASS_V(c->s, vc4_nir_lower_io, c); 2302 NIR_PASS_V(c->s, vc4_nir_lower_txf_ms, c); 2303 nir_lower_idiv_options idiv_options = { 2304 .imprecise_32bit_lowering = true, 2305 .allow_fp16 = true, 2306 }; 2307 NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options); 2308 2309 vc4_optimize_nir(c->s); 2310 2311 /* Do late algebraic optimization to turn add(a, neg(b)) back into 2312 * subs, then the mandatory cleanup after algebraic. Note that it may 2313 * produce fnegs, and if so then we need to keep running to squash 2314 * fneg(fneg(a)). 2315 */ 2316 bool more_late_algebraic = true; 2317 while (more_late_algebraic) { 2318 more_late_algebraic = false; 2319 NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late); 2320 NIR_PASS_V(c->s, nir_opt_constant_folding); 2321 NIR_PASS_V(c->s, nir_copy_prop); 2322 NIR_PASS_V(c->s, nir_opt_dce); 2323 NIR_PASS_V(c->s, nir_opt_cse); 2324 } 2325 2326 NIR_PASS_V(c->s, nir_lower_bool_to_int32); 2327 2328 NIR_PASS_V(c->s, nir_convert_from_ssa, true); 2329 2330 if (vc4_debug & VC4_DEBUG_SHADERDB) { 2331 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d NIR instructions\n", 2332 qir_get_stage_name(c->stage), 2333 c->program_id, c->variant_id, 2334 count_nir_instrs(c->s)); 2335 } 2336 2337 if (vc4_debug & VC4_DEBUG_NIR) { 2338 fprintf(stderr, "%s prog %d/%d NIR:\n", 2339 qir_get_stage_name(c->stage), 2340 c->program_id, c->variant_id); 2341 nir_print_shader(c->s, stderr); 2342 } 2343 2344 nir_to_qir(c); 2345 2346 switch (stage) { 2347 case QSTAGE_FRAG: 2348 /* FS threading requires that the thread execute 2349 * QPU_SIG_LAST_THREAD_SWITCH exactly once before terminating 2350 * (with no other THRSW afterwards, obviously). If we didn't 2351 * fetch a texture at a top level block, this wouldn't be 2352 * true. 2353 */ 2354 if (c->fs_threaded && !c->last_thrsw_at_top_level) { 2355 c->failed = true; 2356 return c; 2357 } 2358 2359 emit_frag_end(c); 2360 break; 2361 case QSTAGE_VERT: 2362 emit_vert_end(c, 2363 c->vs_key->fs_inputs->input_slots, 2364 c->vs_key->fs_inputs->num_inputs); 2365 break; 2366 case QSTAGE_COORD: 2367 emit_coord_end(c); 2368 break; 2369 } 2370 2371 if (vc4_debug & VC4_DEBUG_QIR) { 2372 fprintf(stderr, "%s prog %d/%d pre-opt QIR:\n", 2373 qir_get_stage_name(c->stage), 2374 c->program_id, c->variant_id); 2375 qir_dump(c); 2376 fprintf(stderr, "\n"); 2377 } 2378 2379 qir_optimize(c); 2380 qir_lower_uniforms(c); 2381 2382 qir_schedule_instructions(c); 2383 qir_emit_uniform_stream_resets(c); 2384 2385 if (vc4_debug & VC4_DEBUG_QIR) { 2386 fprintf(stderr, "%s prog %d/%d QIR:\n", 2387 qir_get_stage_name(c->stage), 2388 c->program_id, c->variant_id); 2389 qir_dump(c); 2390 fprintf(stderr, "\n"); 2391 } 2392 2393 qir_reorder_uniforms(c); 2394 vc4_generate_code(vc4, c); 2395 2396 if (vc4_debug & VC4_DEBUG_SHADERDB) { 2397 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d instructions\n", 2398 qir_get_stage_name(c->stage), 2399 c->program_id, c->variant_id, 2400 c->qpu_inst_count); 2401 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d uniforms\n", 2402 qir_get_stage_name(c->stage), 2403 c->program_id, c->variant_id, 2404 c->num_uniforms); 2405 } 2406 2407 ralloc_free(c->s); 2408 2409 return c; 2410} 2411 2412static void * 2413vc4_shader_state_create(struct pipe_context *pctx, 2414 const struct pipe_shader_state *cso) 2415{ 2416 struct vc4_context *vc4 = vc4_context(pctx); 2417 struct vc4_uncompiled_shader *so = CALLOC_STRUCT(vc4_uncompiled_shader); 2418 if (!so) 2419 return NULL; 2420 2421 so->program_id = vc4->next_uncompiled_program_id++; 2422 2423 nir_shader *s; 2424 2425 if (cso->type == PIPE_SHADER_IR_NIR) { 2426 /* The backend takes ownership of the NIR shader on state 2427 * creation. 2428 */ 2429 s = cso->ir.nir; 2430 } else { 2431 assert(cso->type == PIPE_SHADER_IR_TGSI); 2432 2433 if (vc4_debug & VC4_DEBUG_TGSI) { 2434 fprintf(stderr, "prog %d TGSI:\n", 2435 so->program_id); 2436 tgsi_dump(cso->tokens, 0); 2437 fprintf(stderr, "\n"); 2438 } 2439 s = tgsi_to_nir(cso->tokens, pctx->screen, false); 2440 } 2441 2442 if (s->info.stage == MESA_SHADER_VERTEX) 2443 NIR_PASS_V(s, nir_lower_point_size, 1.0f, 0.0f); 2444 2445 NIR_PASS_V(s, nir_lower_io, 2446 nir_var_shader_in | nir_var_shader_out | nir_var_uniform, 2447 type_size, (nir_lower_io_options)0); 2448 2449 NIR_PASS_V(s, nir_lower_regs_to_ssa); 2450 NIR_PASS_V(s, nir_normalize_cubemap_coords); 2451 2452 NIR_PASS_V(s, nir_lower_load_const_to_scalar); 2453 2454 vc4_optimize_nir(s); 2455 2456 NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL); 2457 2458 /* Garbage collect dead instructions */ 2459 nir_sweep(s); 2460 2461 so->base.type = PIPE_SHADER_IR_NIR; 2462 so->base.ir.nir = s; 2463 2464 if (vc4_debug & VC4_DEBUG_NIR) { 2465 fprintf(stderr, "%s prog %d NIR:\n", 2466 gl_shader_stage_name(s->info.stage), 2467 so->program_id); 2468 nir_print_shader(s, stderr); 2469 fprintf(stderr, "\n"); 2470 } 2471 2472 return so; 2473} 2474 2475static void 2476copy_uniform_state_to_shader(struct vc4_compiled_shader *shader, 2477 struct vc4_compile *c) 2478{ 2479 int count = c->num_uniforms; 2480 struct vc4_shader_uniform_info *uinfo = &shader->uniforms; 2481 2482 uinfo->count = count; 2483 uinfo->data = ralloc_array(shader, uint32_t, count); 2484 memcpy(uinfo->data, c->uniform_data, 2485 count * sizeof(*uinfo->data)); 2486 uinfo->contents = ralloc_array(shader, enum quniform_contents, count); 2487 memcpy(uinfo->contents, c->uniform_contents, 2488 count * sizeof(*uinfo->contents)); 2489 uinfo->num_texture_samples = c->num_texture_samples; 2490 2491 vc4_set_shader_uniform_dirty_flags(shader); 2492} 2493 2494static void 2495vc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c, 2496 struct vc4_compiled_shader *shader) 2497{ 2498 struct vc4_fs_inputs inputs; 2499 2500 memset(&inputs, 0, sizeof(inputs)); 2501 inputs.input_slots = ralloc_array(shader, 2502 struct vc4_varying_slot, 2503 c->num_input_slots); 2504 2505 bool input_live[c->num_input_slots]; 2506 2507 memset(input_live, 0, sizeof(input_live)); 2508 qir_for_each_inst_inorder(inst, c) { 2509 for (int i = 0; i < qir_get_nsrc(inst); i++) { 2510 if (inst->src[i].file == QFILE_VARY) 2511 input_live[inst->src[i].index] = true; 2512 } 2513 } 2514 2515 for (int i = 0; i < c->num_input_slots; i++) { 2516 struct vc4_varying_slot *slot = &c->input_slots[i]; 2517 2518 if (!input_live[i]) 2519 continue; 2520 2521 /* Skip non-VS-output inputs. */ 2522 if (slot->slot == (uint8_t)~0) 2523 continue; 2524 2525 if (slot->slot == VARYING_SLOT_COL0 || 2526 slot->slot == VARYING_SLOT_COL1 || 2527 slot->slot == VARYING_SLOT_BFC0 || 2528 slot->slot == VARYING_SLOT_BFC1) { 2529 shader->color_inputs |= (1 << inputs.num_inputs); 2530 } 2531 2532 inputs.input_slots[inputs.num_inputs] = *slot; 2533 inputs.num_inputs++; 2534 } 2535 shader->num_inputs = inputs.num_inputs; 2536 2537 /* Add our set of inputs to the set of all inputs seen. This way, we 2538 * can have a single pointer that identifies an FS inputs set, 2539 * allowing VS to avoid recompiling when the FS is recompiled (or a 2540 * new one is bound using separate shader objects) but the inputs 2541 * don't change. 2542 */ 2543 struct set_entry *entry = _mesa_set_search(vc4->fs_inputs_set, &inputs); 2544 if (entry) { 2545 shader->fs_inputs = entry->key; 2546 ralloc_free(inputs.input_slots); 2547 } else { 2548 struct vc4_fs_inputs *alloc_inputs; 2549 2550 alloc_inputs = rzalloc(vc4->fs_inputs_set, struct vc4_fs_inputs); 2551 memcpy(alloc_inputs, &inputs, sizeof(inputs)); 2552 ralloc_steal(alloc_inputs, inputs.input_slots); 2553 _mesa_set_add(vc4->fs_inputs_set, alloc_inputs); 2554 2555 shader->fs_inputs = alloc_inputs; 2556 } 2557} 2558 2559static struct vc4_compiled_shader * 2560vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, 2561 struct vc4_key *key) 2562{ 2563 struct hash_table *ht; 2564 uint32_t key_size; 2565 bool try_threading; 2566 2567 if (stage == QSTAGE_FRAG) { 2568 ht = vc4->fs_cache; 2569 key_size = sizeof(struct vc4_fs_key); 2570 try_threading = vc4->screen->has_threaded_fs; 2571 } else { 2572 ht = vc4->vs_cache; 2573 key_size = sizeof(struct vc4_vs_key); 2574 try_threading = false; 2575 } 2576 2577 struct vc4_compiled_shader *shader; 2578 struct hash_entry *entry = _mesa_hash_table_search(ht, key); 2579 if (entry) 2580 return entry->data; 2581 2582 struct vc4_compile *c = vc4_shader_ntq(vc4, stage, key, try_threading); 2583 /* If the FS failed to compile threaded, fall back to single threaded. */ 2584 if (try_threading && c->failed) { 2585 qir_compile_destroy(c); 2586 c = vc4_shader_ntq(vc4, stage, key, false); 2587 } 2588 2589 shader = rzalloc(NULL, struct vc4_compiled_shader); 2590 2591 shader->program_id = vc4->next_compiled_program_id++; 2592 if (stage == QSTAGE_FRAG) { 2593 vc4_setup_compiled_fs_inputs(vc4, c, shader); 2594 2595 /* Note: the temporary clone in c->s has been freed. */ 2596 nir_shader *orig_shader = key->shader_state->base.ir.nir; 2597 if (orig_shader->info.outputs_written & (1 << FRAG_RESULT_DEPTH)) 2598 shader->disable_early_z = true; 2599 } else { 2600 shader->num_inputs = c->num_inputs; 2601 2602 shader->vattr_offsets[0] = 0; 2603 for (int i = 0; i < 8; i++) { 2604 shader->vattr_offsets[i + 1] = 2605 shader->vattr_offsets[i] + c->vattr_sizes[i]; 2606 2607 if (c->vattr_sizes[i]) 2608 shader->vattrs_live |= (1 << i); 2609 } 2610 } 2611 2612 shader->failed = c->failed; 2613 if (c->failed) { 2614 shader->failed = true; 2615 } else { 2616 copy_uniform_state_to_shader(shader, c); 2617 shader->bo = vc4_bo_alloc_shader(vc4->screen, c->qpu_insts, 2618 c->qpu_inst_count * 2619 sizeof(uint64_t)); 2620 } 2621 2622 shader->fs_threaded = c->fs_threaded; 2623 2624 if ((vc4_debug & VC4_DEBUG_SHADERDB) && stage == QSTAGE_FRAG) { 2625 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d FS threads\n", 2626 qir_get_stage_name(c->stage), 2627 c->program_id, c->variant_id, 2628 1 + shader->fs_threaded); 2629 } 2630 2631 qir_compile_destroy(c); 2632 2633 struct vc4_key *dup_key; 2634 dup_key = rzalloc_size(shader, key_size); /* TODO: don't use rzalloc */ 2635 memcpy(dup_key, key, key_size); 2636 _mesa_hash_table_insert(ht, dup_key, shader); 2637 2638 return shader; 2639} 2640 2641static void 2642vc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key, 2643 struct vc4_texture_stateobj *texstate) 2644{ 2645 for (int i = 0; i < texstate->num_textures; i++) { 2646 struct pipe_sampler_view *sampler = texstate->textures[i]; 2647 struct vc4_sampler_view *vc4_sampler = vc4_sampler_view(sampler); 2648 struct pipe_sampler_state *sampler_state = 2649 texstate->samplers[i]; 2650 2651 if (!sampler) 2652 continue; 2653 2654 key->tex[i].format = sampler->format; 2655 key->tex[i].swizzle[0] = sampler->swizzle_r; 2656 key->tex[i].swizzle[1] = sampler->swizzle_g; 2657 key->tex[i].swizzle[2] = sampler->swizzle_b; 2658 key->tex[i].swizzle[3] = sampler->swizzle_a; 2659 2660 if (sampler->texture->nr_samples > 1) { 2661 key->tex[i].msaa_width = sampler->texture->width0; 2662 key->tex[i].msaa_height = sampler->texture->height0; 2663 } else if (sampler){ 2664 key->tex[i].compare_mode = sampler_state->compare_mode; 2665 key->tex[i].compare_func = sampler_state->compare_func; 2666 key->tex[i].wrap_s = sampler_state->wrap_s; 2667 key->tex[i].wrap_t = sampler_state->wrap_t; 2668 key->tex[i].force_first_level = 2669 vc4_sampler->force_first_level; 2670 } 2671 } 2672 2673 key->ucp_enables = vc4->rasterizer->base.clip_plane_enable; 2674} 2675 2676static void 2677vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode) 2678{ 2679 struct vc4_job *job = vc4->job; 2680 struct vc4_fs_key local_key; 2681 struct vc4_fs_key *key = &local_key; 2682 2683 if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE | 2684 VC4_DIRTY_BLEND | 2685 VC4_DIRTY_FRAMEBUFFER | 2686 VC4_DIRTY_ZSA | 2687 VC4_DIRTY_RASTERIZER | 2688 VC4_DIRTY_SAMPLE_MASK | 2689 VC4_DIRTY_FRAGTEX | 2690 VC4_DIRTY_UNCOMPILED_FS | 2691 VC4_DIRTY_UBO_1_SIZE))) { 2692 return; 2693 } 2694 2695 memset(key, 0, sizeof(*key)); 2696 vc4_setup_shared_key(vc4, &key->base, &vc4->fragtex); 2697 key->base.shader_state = vc4->prog.bind_fs; 2698 key->is_points = (prim_mode == PIPE_PRIM_POINTS); 2699 key->is_lines = (prim_mode >= PIPE_PRIM_LINES && 2700 prim_mode <= PIPE_PRIM_LINE_STRIP); 2701 key->blend = vc4->blend->rt[0]; 2702 if (vc4->blend->logicop_enable) { 2703 key->logicop_func = vc4->blend->logicop_func; 2704 } else { 2705 key->logicop_func = PIPE_LOGICOP_COPY; 2706 } 2707 if (job->msaa) { 2708 key->msaa = vc4->rasterizer->base.multisample; 2709 key->sample_coverage = (vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1); 2710 key->sample_alpha_to_coverage = vc4->blend->alpha_to_coverage; 2711 key->sample_alpha_to_one = vc4->blend->alpha_to_one; 2712 } 2713 2714 if (vc4->framebuffer.cbufs[0]) 2715 key->color_format = vc4->framebuffer.cbufs[0]->format; 2716 2717 key->stencil_enabled = vc4->zsa->stencil_uniforms[0] != 0; 2718 key->stencil_twoside = vc4->zsa->stencil_uniforms[1] != 0; 2719 key->stencil_full_writemasks = vc4->zsa->stencil_uniforms[2] != 0; 2720 key->depth_enabled = (vc4->zsa->base.depth_enabled || 2721 key->stencil_enabled); 2722 2723 if (key->is_points) { 2724 key->point_sprite_mask = 2725 vc4->rasterizer->base.sprite_coord_enable; 2726 key->point_coord_upper_left = 2727 (vc4->rasterizer->base.sprite_coord_mode == 2728 PIPE_SPRITE_COORD_UPPER_LEFT); 2729 } 2730 2731 key->ubo_1_size = vc4->constbuf[PIPE_SHADER_FRAGMENT].cb[1].buffer_size; 2732 2733 struct vc4_compiled_shader *old_fs = vc4->prog.fs; 2734 vc4->prog.fs = vc4_get_compiled_shader(vc4, QSTAGE_FRAG, &key->base); 2735 if (vc4->prog.fs == old_fs) 2736 return; 2737 2738 vc4->dirty |= VC4_DIRTY_COMPILED_FS; 2739 2740 if (vc4->rasterizer->base.flatshade && 2741 (!old_fs || vc4->prog.fs->color_inputs != old_fs->color_inputs)) { 2742 vc4->dirty |= VC4_DIRTY_FLAT_SHADE_FLAGS; 2743 } 2744 2745 if (!old_fs || vc4->prog.fs->fs_inputs != old_fs->fs_inputs) 2746 vc4->dirty |= VC4_DIRTY_FS_INPUTS; 2747} 2748 2749static void 2750vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode) 2751{ 2752 struct vc4_vs_key local_key; 2753 struct vc4_vs_key *key = &local_key; 2754 2755 if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE | 2756 VC4_DIRTY_RASTERIZER | 2757 VC4_DIRTY_VERTTEX | 2758 VC4_DIRTY_VTXSTATE | 2759 VC4_DIRTY_UNCOMPILED_VS | 2760 VC4_DIRTY_FS_INPUTS))) { 2761 return; 2762 } 2763 2764 memset(key, 0, sizeof(*key)); 2765 vc4_setup_shared_key(vc4, &key->base, &vc4->verttex); 2766 key->base.shader_state = vc4->prog.bind_vs; 2767 key->fs_inputs = vc4->prog.fs->fs_inputs; 2768 2769 for (int i = 0; i < ARRAY_SIZE(key->attr_formats); i++) 2770 key->attr_formats[i] = vc4->vtx->pipe[i].src_format; 2771 2772 key->per_vertex_point_size = 2773 (prim_mode == PIPE_PRIM_POINTS && 2774 vc4->rasterizer->base.point_size_per_vertex); 2775 2776 struct vc4_compiled_shader *vs = 2777 vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base); 2778 if (vs != vc4->prog.vs) { 2779 vc4->prog.vs = vs; 2780 vc4->dirty |= VC4_DIRTY_COMPILED_VS; 2781 } 2782 2783 key->is_coord = true; 2784 /* Coord shaders don't care what the FS inputs are. */ 2785 key->fs_inputs = NULL; 2786 struct vc4_compiled_shader *cs = 2787 vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base); 2788 if (cs != vc4->prog.cs) { 2789 vc4->prog.cs = cs; 2790 vc4->dirty |= VC4_DIRTY_COMPILED_CS; 2791 } 2792} 2793 2794bool 2795vc4_update_compiled_shaders(struct vc4_context *vc4, uint8_t prim_mode) 2796{ 2797 vc4_update_compiled_fs(vc4, prim_mode); 2798 vc4_update_compiled_vs(vc4, prim_mode); 2799 2800 return !(vc4->prog.cs->failed || 2801 vc4->prog.vs->failed || 2802 vc4->prog.fs->failed); 2803} 2804 2805static uint32_t 2806fs_cache_hash(const void *key) 2807{ 2808 return _mesa_hash_data(key, sizeof(struct vc4_fs_key)); 2809} 2810 2811static uint32_t 2812vs_cache_hash(const void *key) 2813{ 2814 return _mesa_hash_data(key, sizeof(struct vc4_vs_key)); 2815} 2816 2817static bool 2818fs_cache_compare(const void *key1, const void *key2) 2819{ 2820 return memcmp(key1, key2, sizeof(struct vc4_fs_key)) == 0; 2821} 2822 2823static bool 2824vs_cache_compare(const void *key1, const void *key2) 2825{ 2826 return memcmp(key1, key2, sizeof(struct vc4_vs_key)) == 0; 2827} 2828 2829static uint32_t 2830fs_inputs_hash(const void *key) 2831{ 2832 const struct vc4_fs_inputs *inputs = key; 2833 2834 return _mesa_hash_data(inputs->input_slots, 2835 sizeof(*inputs->input_slots) * 2836 inputs->num_inputs); 2837} 2838 2839static bool 2840fs_inputs_compare(const void *key1, const void *key2) 2841{ 2842 const struct vc4_fs_inputs *inputs1 = key1; 2843 const struct vc4_fs_inputs *inputs2 = key2; 2844 2845 return (inputs1->num_inputs == inputs2->num_inputs && 2846 memcmp(inputs1->input_slots, 2847 inputs2->input_slots, 2848 sizeof(*inputs1->input_slots) * 2849 inputs1->num_inputs) == 0); 2850} 2851 2852static void 2853delete_from_cache_if_matches(struct hash_table *ht, 2854 struct vc4_compiled_shader **last_compile, 2855 struct hash_entry *entry, 2856 struct vc4_uncompiled_shader *so) 2857{ 2858 const struct vc4_key *key = entry->key; 2859 2860 if (key->shader_state == so) { 2861 struct vc4_compiled_shader *shader = entry->data; 2862 _mesa_hash_table_remove(ht, entry); 2863 vc4_bo_unreference(&shader->bo); 2864 2865 if (shader == *last_compile) 2866 *last_compile = NULL; 2867 2868 ralloc_free(shader); 2869 } 2870} 2871 2872static void 2873vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso) 2874{ 2875 struct vc4_context *vc4 = vc4_context(pctx); 2876 struct vc4_uncompiled_shader *so = hwcso; 2877 2878 hash_table_foreach(vc4->fs_cache, entry) { 2879 delete_from_cache_if_matches(vc4->fs_cache, &vc4->prog.fs, 2880 entry, so); 2881 } 2882 hash_table_foreach(vc4->vs_cache, entry) { 2883 delete_from_cache_if_matches(vc4->vs_cache, &vc4->prog.vs, 2884 entry, so); 2885 } 2886 2887 ralloc_free(so->base.ir.nir); 2888 free(so); 2889} 2890 2891static void 2892vc4_fp_state_bind(struct pipe_context *pctx, void *hwcso) 2893{ 2894 struct vc4_context *vc4 = vc4_context(pctx); 2895 vc4->prog.bind_fs = hwcso; 2896 vc4->dirty |= VC4_DIRTY_UNCOMPILED_FS; 2897} 2898 2899static void 2900vc4_vp_state_bind(struct pipe_context *pctx, void *hwcso) 2901{ 2902 struct vc4_context *vc4 = vc4_context(pctx); 2903 vc4->prog.bind_vs = hwcso; 2904 vc4->dirty |= VC4_DIRTY_UNCOMPILED_VS; 2905} 2906 2907void 2908vc4_program_init(struct pipe_context *pctx) 2909{ 2910 struct vc4_context *vc4 = vc4_context(pctx); 2911 2912 pctx->create_vs_state = vc4_shader_state_create; 2913 pctx->delete_vs_state = vc4_shader_state_delete; 2914 2915 pctx->create_fs_state = vc4_shader_state_create; 2916 pctx->delete_fs_state = vc4_shader_state_delete; 2917 2918 pctx->bind_fs_state = vc4_fp_state_bind; 2919 pctx->bind_vs_state = vc4_vp_state_bind; 2920 2921 vc4->fs_cache = _mesa_hash_table_create(pctx, fs_cache_hash, 2922 fs_cache_compare); 2923 vc4->vs_cache = _mesa_hash_table_create(pctx, vs_cache_hash, 2924 vs_cache_compare); 2925 vc4->fs_inputs_set = _mesa_set_create(pctx, fs_inputs_hash, 2926 fs_inputs_compare); 2927} 2928 2929void 2930vc4_program_fini(struct pipe_context *pctx) 2931{ 2932 struct vc4_context *vc4 = vc4_context(pctx); 2933 2934 hash_table_foreach(vc4->fs_cache, entry) { 2935 struct vc4_compiled_shader *shader = entry->data; 2936 vc4_bo_unreference(&shader->bo); 2937 ralloc_free(shader); 2938 _mesa_hash_table_remove(vc4->fs_cache, entry); 2939 } 2940 2941 hash_table_foreach(vc4->vs_cache, entry) { 2942 struct vc4_compiled_shader *shader = entry->data; 2943 vc4_bo_unreference(&shader->bo); 2944 ralloc_free(shader); 2945 _mesa_hash_table_remove(vc4->vs_cache, entry); 2946 } 2947} 2948