/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

static unsigned
src_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
   struct ir2_reg_component *comps;
   unsigned swiz = 0;

   switch (src->type) {
   case IR2_SRC_SSA:
   case IR2_SRC_REG:
      break;
   default:
      return src->swizzle;
   }
   /* we need to take into account where the components were allocated */
   comps = get_reg_src(ctx, src)->comp;
   for (int i = 0; i < ncomp; i++) {
      swiz |= swiz_set(comps[swiz_get(src->swizzle, i)].c, i);
   }
   return swiz;
}

/* alu instructions need to take into account how the output
 * components are allocated
 */

/* scalar doesn't need to take into account dest swizzle */

static unsigned
alu_swizzle_scalar(struct ir2_context *ctx, struct ir2_src *reg)
{
   /* hardware seems to take from W, but swizzle everywhere just in case */
   return swiz_merge(src_swizzle(ctx, reg, 1), IR2_SWIZZLE_XXXX);
}

static unsigned
alu_swizzle(struct ir2_context *ctx, struct ir2_instr *instr,
            struct ir2_src *src)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned swiz0 = src_swizzle(ctx, src, src_ncomp(instr));
   unsigned swiz = 0;

   /* non per component special cases */
   switch (instr->alu.vector_opc) {
   case PRED_SETE_PUSHv ... PRED_SETGTE_PUSHv:
      return alu_swizzle_scalar(ctx, src);
   case DOT2ADDv:
   case DOT3v:
   case DOT4v:
   case CUBEv:
      return swiz0;
   default:
      break;
   }

   for (int i = 0, j = 0; i < dst_ncomp(instr); j++) {
      if (instr->alu.write_mask & 1 << j) {
         if (comp[j].c != 7)
            swiz |= swiz_set(i, comp[j].c);
         i++;
      }
   }
   return swiz_merge(swiz0, swiz);
}

static unsigned
alu_swizzle_scalar2(struct ir2_context *ctx, struct ir2_src *src, unsigned s1)
{
   /* hardware seems to take from ZW, but swizzle everywhere (ABAB) */
   unsigned s0 = swiz_get(src_swizzle(ctx, src, 1), 0);
   return swiz_merge(swiz_set(s0, 0) | swiz_set(s1, 1), IR2_SWIZZLE_XYXY);
}

/* write_mask needs to be transformed by allocation information */

static unsigned
alu_write_mask(struct ir2_context *ctx, struct ir2_instr *instr)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned write_mask = 0;

   for (int i = 0; i < 4; i++) {
      if (instr->alu.write_mask & 1 << i)
         write_mask |= 1 << comp[i].c;
   }

   return write_mask;
}

/* fetch instructions can swizzle dest, but src swizzle needs conversion */

static unsigned
fetch_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
   unsigned alu_swiz = src_swizzle(ctx, src, ncomp);
   unsigned swiz = 0;
   for (int i = 0; i < ncomp; i++)
      swiz |= swiz_get(alu_swiz, i) << i * 2;
   return swiz;
}

static unsigned
fetch_dst_swiz(struct ir2_context *ctx, struct ir2_instr *instr)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned dst_swiz = 0xfff;
   for (int i = 0; i < dst_ncomp(instr); i++) {
      dst_swiz &= ~(7 << comp[i].c * 3);
      dst_swiz |= i << comp[i].c * 3;
   }
   return dst_swiz;
}

/* register / export # for instr */
static unsigned
dst_to_reg(struct ir2_context *ctx, struct ir2_instr *instr)
{
   if (is_export(instr))
      return instr->alu.export;

   return get_reg(instr)->idx;
}

/* register # for src */
static unsigned
src_to_reg(struct ir2_context *ctx, struct ir2_src *src)
{
   return get_reg_src(ctx, src)->idx;
}

static unsigned
src_reg_byte(struct ir2_context *ctx, struct ir2_src *src)
{
   if (src->type == IR2_SRC_CONST) {
      assert(!src->abs); /* no abs bit for const */
      return src->num;
   }
   return src_to_reg(ctx, src) | (src->abs ? 0x80 : 0);
}

/* produce the 12 byte binary instruction for a given sched_instr */
static void
fill_instr(struct ir2_context *ctx, struct ir2_sched_instr *sched, instr_t *bc,
           bool *is_fetch)
{
   struct ir2_instr *instr = sched->instr, *instr_s, *instr_v;

   *bc = (instr_t){};

   if (instr && instr->type == IR2_FETCH) {
      *is_fetch = true;

      bc->fetch.opc = instr->fetch.opc;
      bc->fetch.pred_select = !!instr->pred;
      bc->fetch.pred_condition = instr->pred & 1;

      struct ir2_src *src = instr->src;

      if (instr->fetch.opc == VTX_FETCH) {
         instr_fetch_vtx_t *vtx = &bc->fetch.vtx;

         assert(instr->fetch.vtx.const_idx <= 0x1f);
         assert(instr->fetch.vtx.const_idx_sel <= 0x3);

         vtx->src_reg = src_to_reg(ctx, src);
         vtx->src_swiz = fetch_swizzle(ctx, src, 1);
         vtx->dst_reg = dst_to_reg(ctx, instr);
         vtx->dst_swiz = fetch_dst_swiz(ctx, instr);

         vtx->must_be_one = 1;
         vtx->const_index = instr->fetch.vtx.const_idx;
         vtx->const_index_sel = instr->fetch.vtx.const_idx_sel;

         /* other fields will be patched */

         /* XXX seems like every FETCH but the first has
          * this bit set:
          */
         vtx->reserved3 = instr->idx ? 0x1 : 0x0;
         vtx->reserved0 = instr->idx ? 0x2 : 0x3;
      } else if (instr->fetch.opc == TEX_FETCH) {
         instr_fetch_tex_t *tex = &bc->fetch.tex;

         tex->src_reg = src_to_reg(ctx, src);
         tex->src_swiz = fetch_swizzle(ctx, src, 3);
         tex->dst_reg = dst_to_reg(ctx, instr);
         tex->dst_swiz = fetch_dst_swiz(ctx, instr);
         /* tex->const_idx = patch_fetches */
         tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
         tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
         tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->use_comp_lod = ctx->so->type == MESA_SHADER_FRAGMENT;
         tex->use_reg_lod = instr->src_count == 2;
         tex->sample_location = SAMPLE_CENTER;
         tex->tx_coord_denorm = instr->fetch.tex.is_rect;
      } else if (instr->fetch.opc == TEX_SET_TEX_LOD) {
         instr_fetch_tex_t *tex = &bc->fetch.tex;

         tex->src_reg = src_to_reg(ctx, src);
         tex->src_swiz = fetch_swizzle(ctx, src, 1);
         tex->dst_reg = 0;
         tex->dst_swiz = 0xfff;

         tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
         tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
         tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->use_comp_lod = 1;
         tex->use_reg_lod = 0;
         tex->sample_location = SAMPLE_CENTER;
      } else {
         assert(0);
      }
      return;
   }

   instr_v = sched->instr;
   instr_s = sched->instr_s;

   if (instr_v) {
      struct ir2_src src1, src2, *src3;

      src1 = instr_v->src[0];
      src2 = instr_v->src[instr_v->src_count > 1];
      src3 = instr_v->src_count == 3 ? &instr_v->src[2] : NULL;

      bc->alu.vector_opc = instr_v->alu.vector_opc;
      bc->alu.vector_write_mask = alu_write_mask(ctx, instr_v);
      bc->alu.vector_dest = dst_to_reg(ctx, instr_v);
      bc->alu.vector_clamp = instr_v->alu.saturate;
      bc->alu.export_data = instr_v->alu.export >= 0;

      /* single operand SETEv, use 0.0f as src2 */
      if (instr_v->src_count == 1 &&
          (bc->alu.vector_opc == SETEv || bc->alu.vector_opc == SETNEv ||
           bc->alu.vector_opc == SETGTv || bc->alu.vector_opc == SETGTEv))
         src2 = ir2_zero(ctx);

      /* export32 instr for a20x hw binning has this bit set..
       * it seems to do more than change the base address of constants
       * XXX this is a hack
       */
      bc->alu.relative_addr =
         (bc->alu.export_data && bc->alu.vector_dest == 32);

      bc->alu.src1_reg_byte = src_reg_byte(ctx, &src1);
      bc->alu.src1_swiz = alu_swizzle(ctx, instr_v, &src1);
      bc->alu.src1_reg_negate = src1.negate;
      bc->alu.src1_sel = src1.type != IR2_SRC_CONST;

      bc->alu.src2_reg_byte = src_reg_byte(ctx, &src2);
      bc->alu.src2_swiz = alu_swizzle(ctx, instr_v, &src2);
      bc->alu.src2_reg_negate = src2.negate;
      bc->alu.src2_sel = src2.type != IR2_SRC_CONST;

      if (src3) {
         bc->alu.src3_reg_byte = src_reg_byte(ctx, src3);
         bc->alu.src3_swiz = alu_swizzle(ctx, instr_v, src3);
         bc->alu.src3_reg_negate = src3->negate;
         bc->alu.src3_sel = src3->type != IR2_SRC_CONST;
      }

      bc->alu.pred_select = instr_v->pred;
   }

   if (instr_s) {
      struct ir2_src *src = instr_s->src;

      bc->alu.scalar_opc = instr_s->alu.scalar_opc;
      bc->alu.scalar_write_mask = alu_write_mask(ctx, instr_s);
      bc->alu.scalar_dest = dst_to_reg(ctx, instr_s);
      bc->alu.scalar_clamp = instr_s->alu.saturate;
      bc->alu.export_data = instr_s->alu.export >= 0;

      if (instr_s->src_count == 1) {
         bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
         bc->alu.src3_swiz = alu_swizzle_scalar(ctx, src);
         bc->alu.src3_reg_negate = src->negate;
         bc->alu.src3_sel = src->type != IR2_SRC_CONST;
      } else {
         assert(instr_s->src_count == 2);

         bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
         bc->alu.src3_swiz =
            alu_swizzle_scalar2(ctx, src, instr_s->alu.src1_swizzle);
         bc->alu.src3_reg_negate = src->negate;
         bc->alu.src3_sel = src->type != IR2_SRC_CONST;
      }

      if (instr_v)
         assert(instr_s->pred == instr_v->pred);
      bc->alu.pred_select = instr_s->pred;
   }

   *is_fetch = false;
   return;
}

static unsigned
write_cfs(struct ir2_context *ctx, instr_cf_t *cfs, unsigned cf_idx,
          instr_cf_alloc_t *alloc, instr_cf_exec_t *exec)
{
   assert(exec->count);

   if (alloc)
      cfs[cf_idx++].alloc = *alloc;

   /* remember the memory alloc offset for later patching */
   if (alloc && alloc->buffer_select == SQ_MEMORY &&
       ctx->info->mem_export_ptr == -1)
      ctx->info->mem_export_ptr = cf_idx / 2 * 3;

   cfs[cf_idx++].exec = *exec;
   exec->address += exec->count;
   exec->serialize = 0;
   exec->count = 0;

   return cf_idx;
}

/* assemble the final shader */
void
assemble(struct ir2_context *ctx, bool binning)
{
   /* hw seems to have a limit of 384 (num_cf/2+num_instr <= 384)
    * address is 9 bits so could it be 512 ?
    */
   instr_cf_t cfs[384];
   instr_t bytecode[384], bc;
   unsigned block_addr[128];
   unsigned num_cf = 0;

   /* CF instr state */
   instr_cf_exec_t exec = {.opc = EXEC};
   instr_cf_alloc_t alloc = {.opc = ALLOC};

   int sync_id, sync_id_prev = -1;
   bool is_fetch = false;
   bool need_sync = true;
   bool need_alloc = false;
   unsigned block_idx = 0;

   ctx->info->mem_export_ptr = -1;
   ctx->info->num_fetch_instrs = 0;

   /* a vertex shader always needs to allocate at least one parameter;
    * emit the alloc up front if no parameter export will ever happen
    */
   if (ctx->so->type == MESA_SHADER_VERTEX && ctx->f->inputs_count == 0) {
      alloc.buffer_select = SQ_PARAMETER_PIXEL;
      cfs[num_cf++].alloc = alloc;
   }

   block_addr[0] = 0;

   for (int i = 0, j = 0; j < ctx->instr_sched_count; j++) {
      struct ir2_instr *instr = ctx->instr_sched[j].instr;

      /* catch IR2_CF since it isn't a regular instruction */
      if (instr && instr->type == IR2_CF) {
         assert(!need_alloc); /* XXX */

         /* flush any exec cf before inserting jmp */
         if (exec.count)
            num_cf = write_cfs(ctx, cfs, num_cf, NULL, &exec);

         cfs[num_cf++].jmp_call = (instr_cf_jmp_call_t){
            .opc = COND_JMP,
            .address = instr->cf.block_idx, /* will be fixed later */
            .force_call = !instr->pred,
            .predicated_jmp = 1,
            .direction = instr->cf.block_idx > instr->block_idx,
            .condition = instr->pred & 1,
         };
         continue;
      }

      /* fill the 3 dwords for the instruction */
      fill_instr(ctx, &ctx->instr_sched[j], &bc, &is_fetch);

      /* we need to sync between ALU/VTX_FETCH/TEX_FETCH types */
      sync_id = 0;
      if (is_fetch)
         sync_id = bc.fetch.opc == VTX_FETCH ? 1 : 2;

      need_sync = sync_id != sync_id_prev;
      sync_id_prev = sync_id;

      unsigned block;
      if (ctx->instr_sched[j].instr)
         block = ctx->instr_sched[j].instr->block_idx;
      else
         block = ctx->instr_sched[j].instr_s->block_idx;

      assert(block_idx <= block);

      /* info for patching */
      if (is_fetch) {
         struct ir2_fetch_info *info =
            &ctx->info->fetch_info[ctx->info->num_fetch_instrs++];
         info->offset = i * 3; /* add cf offset later */

         if (bc.fetch.opc == VTX_FETCH) {
            info->vtx.dst_swiz = bc.fetch.vtx.dst_swiz;
         } else if (bc.fetch.opc == TEX_FETCH) {
            info->tex.samp_id = instr->fetch.tex.samp_id;
            info->tex.src_swiz = bc.fetch.tex.src_swiz;
         } else {
            ctx->info->num_fetch_instrs--;
         }
      }

      /* exec cf after 6 instr or when switching between fetch / alu */
      if (exec.count == 6 ||
          (exec.count && (need_sync || block != block_idx))) {
         num_cf =
            write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
         need_alloc = false;
      }

      /* update block_addrs for jmp patching */
      while (block_idx < block)
         block_addr[++block_idx] = num_cf;

      /* export - fill alloc cf */
      if (!is_fetch && bc.alu.export_data) {
         /* get the export buffer from either vector/scalar dest */
         instr_alloc_type_t buffer = export_buf(bc.alu.vector_dest);
         if (bc.alu.scalar_write_mask) {
            if (bc.alu.vector_write_mask)
               assert(buffer == export_buf(bc.alu.scalar_dest));
            buffer = export_buf(bc.alu.scalar_dest);
         }

         /* flush previous alloc if the buffer changes */
         bool need_new_alloc = buffer != alloc.buffer_select;

         /* memory export always in 32/33 pair, new alloc on 32 */
         if (bc.alu.vector_dest == 32)
            need_new_alloc = true;

         if (need_new_alloc && exec.count) {
            num_cf =
               write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
            need_alloc = false;
         }

         need_alloc |= need_new_alloc;

         alloc.size = 0;
         alloc.buffer_select = buffer;

         if (buffer == SQ_PARAMETER_PIXEL &&
             ctx->so->type == MESA_SHADER_VERTEX)
            alloc.size = ctx->f->inputs_count - 1;

         if (buffer == SQ_POSITION)
            alloc.size = ctx->so->writes_psize;
      }

      if (is_fetch)
         exec.serialize |= 0x1 << exec.count * 2;
      if (need_sync)
         exec.serialize |= 0x2 << exec.count * 2;

      need_sync = false;
      exec.count += 1;
      bytecode[i++] = bc;
   }

   /* final exec cf */
   exec.opc = EXEC_END;
   num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);

   /* insert nop to get an even # of CFs */
   if (num_cf % 2)
      cfs[num_cf++] = (instr_cf_t){.opc = NOP};

   /* patch cf addrs */
   for (int idx = 0; idx < num_cf; idx++) {
      switch (cfs[idx].opc) {
      case NOP:
      case ALLOC:
         break;
      case EXEC:
      case EXEC_END:
         cfs[idx].exec.address += num_cf / 2;
         break;
      case COND_JMP:
         cfs[idx].jmp_call.address = block_addr[cfs[idx].jmp_call.address];
         break;
      default:
         assert(0);
      }
   }

   /* concatenate cfs and alu/fetch */
   uint32_t cfdwords = num_cf / 2 * 3;
   uint32_t alufetchdwords = exec.address * 3;
   uint32_t sizedwords = cfdwords + alufetchdwords;
   uint32_t *dwords = malloc(sizedwords * 4);
   assert(dwords);
   memcpy(dwords, cfs, cfdwords * 4);
   memcpy(&dwords[cfdwords], bytecode, alufetchdwords * 4);

   /* finalize ir2_shader_info */
   ctx->info->dwords = dwords;
   ctx->info->sizedwords = sizedwords;
   for (int i = 0; i < ctx->info->num_fetch_instrs; i++)
      ctx->info->fetch_info[i].offset += cfdwords;

   if (FD_DBG(DISASM)) {
      DBG("disassemble: type=%d", ctx->so->type);
      disasm_a2xx(dwords, sizedwords, 0, ctx->so->type);
   }
}