1/* 2 * Copyright © 2018 Valve Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 25#include "aco_builder.h" 26#include "aco_ir.h" 27 28#include "common/sid.h" 29 30#include "util/memstream.h" 31 32#include <algorithm> 33#include <map> 34#include <vector> 35 36namespace aco { 37 38struct constaddr_info { 39 unsigned getpc_end; 40 unsigned add_literal; 41}; 42 43struct asm_context { 44 Program* program; 45 enum amd_gfx_level gfx_level; 46 std::vector<std::pair<int, SOPP_instruction*>> branches; 47 std::map<unsigned, constaddr_info> constaddrs; 48 const int16_t* opcode; 49 // TODO: keep track of branch instructions referring blocks 50 // and, when emitting the block, correct the offset in instr 51 asm_context(Program* program_) : program(program_), gfx_level(program->gfx_level) 52 { 53 if (gfx_level <= GFX7) 54 opcode = &instr_info.opcode_gfx7[0]; 55 else if (gfx_level <= GFX9) 56 opcode = &instr_info.opcode_gfx9[0]; 57 else if (gfx_level >= GFX10) 58 opcode = &instr_info.opcode_gfx10[0]; 59 } 60 61 int subvector_begin_pos = -1; 62}; 63 64unsigned 65get_mimg_nsa_dwords(const Instruction* instr) 66{ 67 unsigned addr_dwords = instr->operands.size() - 3; 68 for (unsigned i = 1; i < addr_dwords; i++) { 69 if (instr->operands[3 + i].physReg() != instr->operands[3].physReg().advance(i * 4)) 70 return DIV_ROUND_UP(addr_dwords - 1, 4); 71 } 72 return 0; 73} 74 75void 76emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr) 77{ 78 /* lower remaining pseudo-instructions */ 79 if (instr->opcode == aco_opcode::p_constaddr_getpc) { 80 ctx.constaddrs[instr->operands[0].constantValue()].getpc_end = out.size() + 1; 81 82 instr->opcode = aco_opcode::s_getpc_b64; 83 instr->operands.pop_back(); 84 } else if (instr->opcode == aco_opcode::p_constaddr_addlo) { 85 ctx.constaddrs[instr->operands[2].constantValue()].add_literal = out.size() + 1; 86 87 instr->opcode = aco_opcode::s_add_u32; 88 instr->operands.pop_back(); 89 assert(instr->operands[1].isConstant()); 90 /* in case it's an inline constant, make it a literal */ 91 instr->operands[1] = Operand::literal32(instr->operands[1].constantValue()); 92 } 93 94 uint32_t opcode = ctx.opcode[(int)instr->opcode]; 95 if (opcode == (uint32_t)-1) { 96 char* outmem; 97 size_t outsize; 98 struct u_memstream mem; 99 u_memstream_open(&mem, &outmem, &outsize); 100 FILE* const memf = u_memstream_get(&mem); 101 102 fprintf(memf, "Unsupported opcode: "); 103 aco_print_instr(instr, memf); 104 u_memstream_close(&mem); 105 106 aco_err(ctx.program, outmem); 107 free(outmem); 108 109 abort(); 110 } 111 112 switch (instr->format) { 113 case Format::SOP2: { 114 uint32_t encoding = (0b10 << 30); 115 encoding |= opcode << 23; 116 encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0; 117 encoding |= instr->operands.size() >= 2 ? instr->operands[1].physReg() << 8 : 0; 118 encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0; 119 out.push_back(encoding); 120 break; 121 } 122 case Format::SOPK: { 123 SOPK_instruction& sopk = instr->sopk(); 124 125 if (instr->opcode == aco_opcode::s_subvector_loop_begin) { 126 assert(ctx.gfx_level >= GFX10); 127 assert(ctx.subvector_begin_pos == -1); 128 ctx.subvector_begin_pos = out.size(); 129 } else if (instr->opcode == aco_opcode::s_subvector_loop_end) { 130 assert(ctx.gfx_level >= GFX10); 131 assert(ctx.subvector_begin_pos != -1); 132 /* Adjust s_subvector_loop_begin instruction to the address after the end */ 133 out[ctx.subvector_begin_pos] |= (out.size() - ctx.subvector_begin_pos); 134 /* Adjust s_subvector_loop_end instruction to the address after the beginning */ 135 sopk.imm = (uint16_t)(ctx.subvector_begin_pos - (int)out.size()); 136 ctx.subvector_begin_pos = -1; 137 } 138 139 uint32_t encoding = (0b1011 << 28); 140 encoding |= opcode << 23; 141 encoding |= !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc) 142 ? instr->definitions[0].physReg() << 16 143 : !instr->operands.empty() && instr->operands[0].physReg() <= 127 144 ? instr->operands[0].physReg() << 16 145 : 0; 146 encoding |= sopk.imm; 147 out.push_back(encoding); 148 break; 149 } 150 case Format::SOP1: { 151 uint32_t encoding = (0b101111101 << 23); 152 encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0; 153 encoding |= opcode << 8; 154 encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0; 155 out.push_back(encoding); 156 break; 157 } 158 case Format::SOPC: { 159 uint32_t encoding = (0b101111110 << 23); 160 encoding |= opcode << 16; 161 encoding |= instr->operands.size() == 2 ? instr->operands[1].physReg() << 8 : 0; 162 encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0; 163 out.push_back(encoding); 164 break; 165 } 166 case Format::SOPP: { 167 SOPP_instruction& sopp = instr->sopp(); 168 uint32_t encoding = (0b101111111 << 23); 169 encoding |= opcode << 16; 170 encoding |= (uint16_t)sopp.imm; 171 if (sopp.block != -1) { 172 sopp.pass_flags = 0; 173 ctx.branches.emplace_back(out.size(), &sopp); 174 } 175 out.push_back(encoding); 176 break; 177 } 178 case Format::SMEM: { 179 SMEM_instruction& smem = instr->smem(); 180 bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4); 181 bool is_load = !instr->definitions.empty(); 182 uint32_t encoding = 0; 183 184 if (ctx.gfx_level <= GFX7) { 185 encoding = (0b11000 << 27); 186 encoding |= opcode << 22; 187 encoding |= instr->definitions.size() ? instr->definitions[0].physReg() << 15 : 0; 188 encoding |= instr->operands.size() ? (instr->operands[0].physReg() >> 1) << 9 : 0; 189 if (instr->operands.size() >= 2) { 190 if (!instr->operands[1].isConstant()) { 191 encoding |= instr->operands[1].physReg().reg(); 192 } else if (instr->operands[1].constantValue() >= 1024) { 193 encoding |= 255; /* SQ_SRC_LITERAL */ 194 } else { 195 encoding |= instr->operands[1].constantValue() >> 2; 196 encoding |= 1 << 8; 197 } 198 } 199 out.push_back(encoding); 200 /* SMRD instructions can take a literal on GFX7 */ 201 if (instr->operands.size() >= 2 && instr->operands[1].isConstant() && 202 instr->operands[1].constantValue() >= 1024) 203 out.push_back(instr->operands[1].constantValue() >> 2); 204 return; 205 } 206 207 if (ctx.gfx_level <= GFX9) { 208 encoding = (0b110000 << 26); 209 assert(!smem.dlc); /* Device-level coherent is not supported on GFX9 and lower */ 210 encoding |= smem.nv ? 1 << 15 : 0; 211 } else { 212 encoding = (0b111101 << 26); 213 assert(!smem.nv); /* Non-volatile is not supported on GFX10 */ 214 encoding |= smem.dlc ? 1 << 14 : 0; 215 } 216 217 encoding |= opcode << 18; 218 encoding |= smem.glc ? 1 << 16 : 0; 219 220 if (ctx.gfx_level <= GFX9) { 221 if (instr->operands.size() >= 2) 222 encoding |= instr->operands[1].isConstant() ? 1 << 17 : 0; /* IMM - immediate enable */ 223 } 224 if (ctx.gfx_level == GFX9) { 225 encoding |= soe ? 1 << 14 : 0; 226 } 227 228 if (is_load || instr->operands.size() >= 3) { /* SDATA */ 229 encoding |= (is_load ? instr->definitions[0].physReg() : instr->operands[2].physReg()) 230 << 6; 231 } 232 if (instr->operands.size() >= 1) { /* SBASE */ 233 encoding |= instr->operands[0].physReg() >> 1; 234 } 235 236 out.push_back(encoding); 237 encoding = 0; 238 239 int32_t offset = 0; 240 uint32_t soffset = ctx.gfx_level >= GFX10 241 ? sgpr_null /* On GFX10 this is disabled by specifying SGPR_NULL */ 242 : 0; /* On GFX9, it is disabled by the SOE bit (and it's not present on 243 GFX8 and below) */ 244 if (instr->operands.size() >= 2) { 245 const Operand& op_off1 = instr->operands[1]; 246 if (ctx.gfx_level <= GFX9) { 247 offset = op_off1.isConstant() ? op_off1.constantValue() : op_off1.physReg(); 248 } else { 249 /* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an 250 * SGPR */ 251 if (op_off1.isConstant()) { 252 offset = op_off1.constantValue(); 253 } else { 254 soffset = op_off1.physReg(); 255 assert(!soe); /* There is no place to put the other SGPR offset, if any */ 256 } 257 } 258 259 if (soe) { 260 const Operand& op_off2 = instr->operands.back(); 261 assert(ctx.gfx_level >= GFX9); /* GFX8 and below don't support specifying a constant 262 and an SGPR at the same time */ 263 assert(!op_off2.isConstant()); 264 soffset = op_off2.physReg(); 265 } 266 } 267 encoding |= offset; 268 encoding |= soffset << 25; 269 270 out.push_back(encoding); 271 return; 272 } 273 case Format::VOP2: { 274 uint32_t encoding = 0; 275 encoding |= opcode << 25; 276 encoding |= (0xFF & instr->definitions[0].physReg()) << 17; 277 encoding |= (0xFF & instr->operands[1].physReg()) << 9; 278 encoding |= instr->operands[0].physReg(); 279 out.push_back(encoding); 280 break; 281 } 282 case Format::VOP1: { 283 uint32_t encoding = (0b0111111 << 25); 284 if (!instr->definitions.empty()) 285 encoding |= (0xFF & instr->definitions[0].physReg()) << 17; 286 encoding |= opcode << 9; 287 if (!instr->operands.empty()) 288 encoding |= instr->operands[0].physReg(); 289 out.push_back(encoding); 290 break; 291 } 292 case Format::VOPC: { 293 uint32_t encoding = (0b0111110 << 25); 294 encoding |= opcode << 17; 295 encoding |= (0xFF & instr->operands[1].physReg()) << 9; 296 encoding |= instr->operands[0].physReg(); 297 out.push_back(encoding); 298 break; 299 } 300 case Format::VINTRP: { 301 Interp_instruction& interp = instr->vintrp(); 302 uint32_t encoding = 0; 303 304 if (instr->opcode == aco_opcode::v_interp_p1ll_f16 || 305 instr->opcode == aco_opcode::v_interp_p1lv_f16 || 306 instr->opcode == aco_opcode::v_interp_p2_legacy_f16 || 307 instr->opcode == aco_opcode::v_interp_p2_f16) { 308 if (ctx.gfx_level == GFX8 || ctx.gfx_level == GFX9) { 309 encoding = (0b110100 << 26); 310 } else if (ctx.gfx_level >= GFX10) { 311 encoding = (0b110101 << 26); 312 } else { 313 unreachable("Unknown gfx_level."); 314 } 315 316 encoding |= opcode << 16; 317 encoding |= (0xFF & instr->definitions[0].physReg()); 318 out.push_back(encoding); 319 320 encoding = 0; 321 encoding |= interp.attribute; 322 encoding |= interp.component << 6; 323 encoding |= instr->operands[0].physReg() << 9; 324 if (instr->opcode == aco_opcode::v_interp_p2_f16 || 325 instr->opcode == aco_opcode::v_interp_p2_legacy_f16 || 326 instr->opcode == aco_opcode::v_interp_p1lv_f16) { 327 encoding |= instr->operands[2].physReg() << 18; 328 } 329 out.push_back(encoding); 330 } else { 331 if (ctx.gfx_level == GFX8 || ctx.gfx_level == GFX9) { 332 encoding = (0b110101 << 26); /* Vega ISA doc says 110010 but it's wrong */ 333 } else { 334 encoding = (0b110010 << 26); 335 } 336 337 assert(encoding); 338 encoding |= (0xFF & instr->definitions[0].physReg()) << 18; 339 encoding |= opcode << 16; 340 encoding |= interp.attribute << 10; 341 encoding |= interp.component << 8; 342 if (instr->opcode == aco_opcode::v_interp_mov_f32) 343 encoding |= (0x3 & instr->operands[0].constantValue()); 344 else 345 encoding |= (0xFF & instr->operands[0].physReg()); 346 out.push_back(encoding); 347 } 348 break; 349 } 350 case Format::DS: { 351 DS_instruction& ds = instr->ds(); 352 uint32_t encoding = (0b110110 << 26); 353 if (ctx.gfx_level == GFX8 || ctx.gfx_level == GFX9) { 354 encoding |= opcode << 17; 355 encoding |= (ds.gds ? 1 : 0) << 16; 356 } else { 357 encoding |= opcode << 18; 358 encoding |= (ds.gds ? 1 : 0) << 17; 359 } 360 encoding |= ((0xFF & ds.offset1) << 8); 361 encoding |= (0xFFFF & ds.offset0); 362 out.push_back(encoding); 363 encoding = 0; 364 unsigned reg = !instr->definitions.empty() ? instr->definitions[0].physReg() : 0; 365 encoding |= (0xFF & reg) << 24; 366 reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0) 367 ? instr->operands[2].physReg() 368 : 0; 369 encoding |= (0xFF & reg) << 16; 370 reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0) 371 ? instr->operands[1].physReg() 372 : 0; 373 encoding |= (0xFF & reg) << 8; 374 encoding |= (0xFF & instr->operands[0].physReg()); 375 out.push_back(encoding); 376 break; 377 } 378 case Format::MUBUF: { 379 MUBUF_instruction& mubuf = instr->mubuf(); 380 uint32_t encoding = (0b111000 << 26); 381 encoding |= opcode << 18; 382 encoding |= (mubuf.lds ? 1 : 0) << 16; 383 encoding |= (mubuf.glc ? 1 : 0) << 14; 384 encoding |= (mubuf.idxen ? 1 : 0) << 13; 385 assert(!mubuf.addr64 || ctx.gfx_level <= GFX7); 386 if (ctx.gfx_level == GFX6 || ctx.gfx_level == GFX7) 387 encoding |= (mubuf.addr64 ? 1 : 0) << 15; 388 encoding |= (mubuf.offen ? 1 : 0) << 12; 389 if (ctx.gfx_level == GFX8 || ctx.gfx_level == GFX9) { 390 assert(!mubuf.dlc); /* Device-level coherent is not supported on GFX9 and lower */ 391 encoding |= (mubuf.slc ? 1 : 0) << 17; 392 } else if (ctx.gfx_level >= GFX10) { 393 encoding |= (mubuf.dlc ? 1 : 0) << 15; 394 } 395 encoding |= 0x0FFF & mubuf.offset; 396 out.push_back(encoding); 397 encoding = 0; 398 if (ctx.gfx_level <= GFX7 || ctx.gfx_level >= GFX10) { 399 encoding |= (mubuf.slc ? 1 : 0) << 22; 400 } 401 encoding |= instr->operands[2].physReg() << 24; 402 encoding |= (mubuf.tfe ? 1 : 0) << 23; 403 encoding |= (instr->operands[0].physReg() >> 2) << 16; 404 unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() 405 : instr->definitions[0].physReg(); 406 encoding |= (0xFF & reg) << 8; 407 encoding |= (0xFF & instr->operands[1].physReg()); 408 out.push_back(encoding); 409 break; 410 } 411 case Format::MTBUF: { 412 MTBUF_instruction& mtbuf = instr->mtbuf(); 413 414 uint32_t img_format = ac_get_tbuffer_format(ctx.gfx_level, mtbuf.dfmt, mtbuf.nfmt); 415 uint32_t encoding = (0b111010 << 26); 416 assert(img_format <= 0x7F); 417 assert(!mtbuf.dlc || ctx.gfx_level >= GFX10); 418 encoding |= (mtbuf.dlc ? 1 : 0) << 15; /* DLC bit replaces one bit of the OPCODE on GFX10 */ 419 encoding |= (mtbuf.glc ? 1 : 0) << 14; 420 encoding |= (mtbuf.idxen ? 1 : 0) << 13; 421 encoding |= (mtbuf.offen ? 1 : 0) << 12; 422 encoding |= 0x0FFF & mtbuf.offset; 423 encoding |= (img_format << 19); /* Handles both the GFX10 FORMAT and the old NFMT+DFMT */ 424 425 if (ctx.gfx_level == GFX8 || ctx.gfx_level == GFX9) { 426 encoding |= opcode << 15; 427 } else { 428 encoding |= (opcode & 0x07) << 16; /* 3 LSBs of 4-bit OPCODE */ 429 } 430 431 out.push_back(encoding); 432 encoding = 0; 433 434 encoding |= instr->operands[2].physReg() << 24; 435 encoding |= (mtbuf.tfe ? 1 : 0) << 23; 436 encoding |= (mtbuf.slc ? 1 : 0) << 22; 437 encoding |= (instr->operands[0].physReg() >> 2) << 16; 438 unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() 439 : instr->definitions[0].physReg(); 440 encoding |= (0xFF & reg) << 8; 441 encoding |= (0xFF & instr->operands[1].physReg()); 442 443 if (ctx.gfx_level >= GFX10) { 444 encoding |= (((opcode & 0x08) >> 3) << 21); /* MSB of 4-bit OPCODE */ 445 } 446 447 out.push_back(encoding); 448 break; 449 } 450 case Format::MIMG: { 451 unsigned nsa_dwords = get_mimg_nsa_dwords(instr); 452 assert(!nsa_dwords || ctx.gfx_level >= GFX10); 453 454 MIMG_instruction& mimg = instr->mimg(); 455 uint32_t encoding = (0b111100 << 26); 456 encoding |= mimg.slc ? 1 << 25 : 0; 457 encoding |= (opcode & 0x7f) << 18; 458 encoding |= (opcode >> 7) & 1; 459 encoding |= mimg.lwe ? 1 << 17 : 0; 460 encoding |= mimg.tfe ? 1 << 16 : 0; 461 encoding |= mimg.glc ? 1 << 13 : 0; 462 encoding |= mimg.unrm ? 1 << 12 : 0; 463 if (ctx.gfx_level <= GFX9) { 464 assert(!mimg.dlc); /* Device-level coherent is not supported on GFX9 and lower */ 465 assert(!mimg.r128); 466 encoding |= mimg.a16 ? 1 << 15 : 0; 467 encoding |= mimg.da ? 1 << 14 : 0; 468 } else { 469 encoding |= mimg.r128 ? 1 << 15 470 : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */ 471 encoding |= nsa_dwords << 1; 472 encoding |= mimg.dim << 3; /* GFX10: dimensionality instead of declare array */ 473 encoding |= mimg.dlc ? 1 << 7 : 0; 474 } 475 encoding |= (0xF & mimg.dmask) << 8; 476 out.push_back(encoding); 477 encoding = (0xFF & instr->operands[3].physReg()); /* VADDR */ 478 if (!instr->definitions.empty()) { 479 encoding |= (0xFF & instr->definitions[0].physReg()) << 8; /* VDATA */ 480 } else if (!instr->operands[2].isUndefined()) { 481 encoding |= (0xFF & instr->operands[2].physReg()) << 8; /* VDATA */ 482 } 483 encoding |= (0x1F & (instr->operands[0].physReg() >> 2)) << 16; /* T# (resource) */ 484 if (!instr->operands[1].isUndefined()) 485 encoding |= (0x1F & (instr->operands[1].physReg() >> 2)) << 21; /* sampler */ 486 487 assert(!mimg.d16 || ctx.gfx_level >= GFX9); 488 encoding |= mimg.d16 ? 1 << 31 : 0; 489 if (ctx.gfx_level >= GFX10) { 490 /* GFX10: A16 still exists, but is in a different place */ 491 encoding |= mimg.a16 ? 1 << 30 : 0; 492 } 493 494 out.push_back(encoding); 495 496 if (nsa_dwords) { 497 out.resize(out.size() + nsa_dwords); 498 std::vector<uint32_t>::iterator nsa = std::prev(out.end(), nsa_dwords); 499 for (unsigned i = 0; i < instr->operands.size() - 4u; i++) 500 nsa[i / 4] |= (0xFF & instr->operands[4 + i].physReg().reg()) << (i % 4 * 8); 501 } 502 break; 503 } 504 case Format::FLAT: 505 case Format::SCRATCH: 506 case Format::GLOBAL: { 507 FLAT_instruction& flat = instr->flatlike(); 508 uint32_t encoding = (0b110111 << 26); 509 encoding |= opcode << 18; 510 if (ctx.gfx_level == GFX9 || ctx.gfx_level >= GFX11) { 511 if (instr->isFlat()) 512 assert(flat.offset <= 0xfff); 513 else 514 assert(flat.offset >= -4096 && flat.offset < 4096); 515 encoding |= flat.offset & 0x1fff; 516 } else if (ctx.gfx_level <= GFX8 || instr->isFlat()) { 517 /* GFX10 has a 12-bit immediate OFFSET field, 518 * but it has a hw bug: it ignores the offset, called FlatSegmentOffsetBug 519 */ 520 assert(flat.offset == 0); 521 } else { 522 assert(flat.offset >= -2048 && flat.offset <= 2047); 523 encoding |= flat.offset & 0xfff; 524 } 525 if (instr->isScratch()) 526 encoding |= 1 << 14; 527 else if (instr->isGlobal()) 528 encoding |= 2 << 14; 529 encoding |= flat.lds ? 1 << 13 : 0; 530 encoding |= flat.glc ? 1 << 16 : 0; 531 encoding |= flat.slc ? 1 << 17 : 0; 532 if (ctx.gfx_level >= GFX10) { 533 assert(!flat.nv); 534 encoding |= flat.dlc ? 1 << 12 : 0; 535 } else { 536 assert(!flat.dlc); 537 } 538 out.push_back(encoding); 539 encoding = (0xFF & instr->operands[0].physReg()); 540 if (!instr->definitions.empty()) 541 encoding |= (0xFF & instr->definitions[0].physReg()) << 24; 542 if (instr->operands.size() >= 3) 543 encoding |= (0xFF & instr->operands[2].physReg()) << 8; 544 if (!instr->operands[1].isUndefined()) { 545 assert(ctx.gfx_level >= GFX10 || instr->operands[1].physReg() != 0x7F); 546 assert(instr->format != Format::FLAT); 547 encoding |= instr->operands[1].physReg() << 16; 548 } else if (instr->format != Format::FLAT || 549 ctx.gfx_level >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */ 550 /* For GFX10.3 scratch, 0x7F disables both ADDR and SADDR, unlike sgpr_null, which only 551 * disables SADDR. 552 */ 553 if (ctx.gfx_level <= GFX9 || 554 (instr->format == Format::SCRATCH && instr->operands[0].isUndefined())) 555 encoding |= 0x7F << 16; 556 else 557 encoding |= sgpr_null << 16; 558 } 559 encoding |= flat.nv ? 1 << 23 : 0; 560 out.push_back(encoding); 561 break; 562 } 563 case Format::EXP: { 564 Export_instruction& exp = instr->exp(); 565 uint32_t encoding; 566 if (ctx.gfx_level == GFX8 || ctx.gfx_level == GFX9) { 567 encoding = (0b110001 << 26); 568 } else { 569 encoding = (0b111110 << 26); 570 } 571 572 encoding |= exp.valid_mask ? 0b1 << 12 : 0; 573 encoding |= exp.done ? 0b1 << 11 : 0; 574 encoding |= exp.compressed ? 0b1 << 10 : 0; 575 encoding |= exp.dest << 4; 576 encoding |= exp.enabled_mask; 577 out.push_back(encoding); 578 encoding = 0xFF & exp.operands[0].physReg(); 579 encoding |= (0xFF & exp.operands[1].physReg()) << 8; 580 encoding |= (0xFF & exp.operands[2].physReg()) << 16; 581 encoding |= (0xFF & exp.operands[3].physReg()) << 24; 582 out.push_back(encoding); 583 break; 584 } 585 case Format::PSEUDO: 586 case Format::PSEUDO_BARRIER: 587 if (instr->opcode != aco_opcode::p_unit_test) 588 unreachable("Pseudo instructions should be lowered before assembly."); 589 break; 590 default: 591 if (instr->isVOP3()) { 592 VOP3_instruction& vop3 = instr->vop3(); 593 594 if (instr->isVOP2()) { 595 opcode = opcode + 0x100; 596 } else if (instr->isVOP1()) { 597 if (ctx.gfx_level == GFX8 || ctx.gfx_level == GFX9) 598 opcode = opcode + 0x140; 599 else 600 opcode = opcode + 0x180; 601 } else if (instr->isVOPC()) { 602 opcode = opcode + 0x0; 603 } else if (instr->isVINTRP()) { 604 opcode = opcode + 0x270; 605 } 606 607 uint32_t encoding; 608 if (ctx.gfx_level <= GFX9) { 609 encoding = (0b110100 << 26); 610 } else if (ctx.gfx_level >= GFX10) { 611 encoding = (0b110101 << 26); 612 } else { 613 unreachable("Unknown gfx_level."); 614 } 615 616 if (ctx.gfx_level <= GFX7) { 617 encoding |= opcode << 17; 618 encoding |= (vop3.clamp ? 1 : 0) << 11; 619 } else { 620 encoding |= opcode << 16; 621 encoding |= (vop3.clamp ? 1 : 0) << 15; 622 } 623 encoding |= vop3.opsel << 11; 624 for (unsigned i = 0; i < 3; i++) 625 encoding |= vop3.abs[i] << (8 + i); 626 if (instr->definitions.size() == 2) 627 encoding |= instr->definitions[1].physReg() << 8; 628 encoding |= (0xFF & instr->definitions[0].physReg()); 629 out.push_back(encoding); 630 encoding = 0; 631 if (instr->opcode == aco_opcode::v_interp_mov_f32) { 632 encoding = 0x3 & instr->operands[0].constantValue(); 633 } else if (instr->opcode == aco_opcode::v_writelane_b32_e64) { 634 encoding |= instr->operands[0].physReg() << 0; 635 encoding |= instr->operands[1].physReg() << 9; 636 /* Encoding src2 works fine with hardware but breaks some disassemblers. */ 637 } else { 638 for (unsigned i = 0; i < instr->operands.size(); i++) 639 encoding |= instr->operands[i].physReg() << (i * 9); 640 } 641 encoding |= vop3.omod << 27; 642 for (unsigned i = 0; i < 3; i++) 643 encoding |= vop3.neg[i] << (29 + i); 644 out.push_back(encoding); 645 646 } else if (instr->isVOP3P()) { 647 VOP3P_instruction& vop3 = instr->vop3p(); 648 649 uint32_t encoding; 650 if (ctx.gfx_level == GFX9) { 651 encoding = (0b110100111 << 23); 652 } else if (ctx.gfx_level >= GFX10) { 653 encoding = (0b110011 << 26); 654 } else { 655 unreachable("Unknown gfx_level."); 656 } 657 658 encoding |= opcode << 16; 659 encoding |= (vop3.clamp ? 1 : 0) << 15; 660 encoding |= vop3.opsel_lo << 11; 661 encoding |= ((vop3.opsel_hi & 0x4) ? 1 : 0) << 14; 662 for (unsigned i = 0; i < 3; i++) 663 encoding |= vop3.neg_hi[i] << (8 + i); 664 encoding |= (0xFF & instr->definitions[0].physReg()); 665 out.push_back(encoding); 666 encoding = 0; 667 for (unsigned i = 0; i < instr->operands.size(); i++) 668 encoding |= instr->operands[i].physReg() << (i * 9); 669 encoding |= (vop3.opsel_hi & 0x3) << 27; 670 for (unsigned i = 0; i < 3; i++) 671 encoding |= vop3.neg_lo[i] << (29 + i); 672 out.push_back(encoding); 673 674 } else if (instr->isDPP16()) { 675 assert(ctx.gfx_level >= GFX8); 676 DPP16_instruction& dpp = instr->dpp16(); 677 678 /* first emit the instruction without the DPP operand */ 679 Operand dpp_op = instr->operands[0]; 680 instr->operands[0] = Operand(PhysReg{250}, v1); 681 instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP16); 682 emit_instruction(ctx, out, instr); 683 uint32_t encoding = (0xF & dpp.row_mask) << 28; 684 encoding |= (0xF & dpp.bank_mask) << 24; 685 encoding |= dpp.abs[1] << 23; 686 encoding |= dpp.neg[1] << 22; 687 encoding |= dpp.abs[0] << 21; 688 encoding |= dpp.neg[0] << 20; 689 if (ctx.gfx_level >= GFX10) 690 encoding |= 1 << 18; /* set Fetch Inactive to match GFX9 behaviour */ 691 encoding |= dpp.bound_ctrl << 19; 692 encoding |= dpp.dpp_ctrl << 8; 693 encoding |= (0xFF) & dpp_op.physReg(); 694 out.push_back(encoding); 695 return; 696 } else if (instr->isDPP8()) { 697 assert(ctx.gfx_level >= GFX10); 698 DPP8_instruction& dpp = instr->dpp8(); 699 700 /* first emit the instruction without the DPP operand */ 701 Operand dpp_op = instr->operands[0]; 702 instr->operands[0] = Operand(PhysReg{234}, v1); 703 instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP8); 704 emit_instruction(ctx, out, instr); 705 uint32_t encoding = (0xFF) & dpp_op.physReg(); 706 for (unsigned i = 0; i < 8; ++i) 707 encoding |= dpp.lane_sel[i] << (8 + i * 3); 708 out.push_back(encoding); 709 return; 710 } else if (instr->isSDWA()) { 711 assert(ctx.gfx_level >= GFX8 && ctx.gfx_level < GFX11); 712 SDWA_instruction& sdwa = instr->sdwa(); 713 714 /* first emit the instruction without the SDWA operand */ 715 Operand sdwa_op = instr->operands[0]; 716 instr->operands[0] = Operand(PhysReg{249}, v1); 717 instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::SDWA); 718 emit_instruction(ctx, out, instr); 719 720 uint32_t encoding = 0; 721 722 if (instr->isVOPC()) { 723 if (instr->definitions[0].physReg() != vcc) { 724 encoding |= instr->definitions[0].physReg() << 8; 725 encoding |= 1 << 15; 726 } 727 encoding |= (sdwa.clamp ? 1 : 0) << 13; 728 } else { 729 encoding |= sdwa.dst_sel.to_sdwa_sel(instr->definitions[0].physReg().byte()) << 8; 730 uint32_t dst_u = sdwa.dst_sel.sign_extend() ? 1 : 0; 731 if (instr->definitions[0].bytes() < 4) /* dst_preserve */ 732 dst_u = 2; 733 encoding |= dst_u << 11; 734 encoding |= (sdwa.clamp ? 1 : 0) << 13; 735 encoding |= sdwa.omod << 14; 736 } 737 738 encoding |= sdwa.sel[0].to_sdwa_sel(sdwa_op.physReg().byte()) << 16; 739 encoding |= sdwa.sel[0].sign_extend() ? 1 << 19 : 0; 740 encoding |= sdwa.abs[0] << 21; 741 encoding |= sdwa.neg[0] << 20; 742 743 if (instr->operands.size() >= 2) { 744 encoding |= sdwa.sel[1].to_sdwa_sel(instr->operands[1].physReg().byte()) << 24; 745 encoding |= sdwa.sel[1].sign_extend() ? 1 << 27 : 0; 746 encoding |= sdwa.abs[1] << 29; 747 encoding |= sdwa.neg[1] << 28; 748 } 749 750 encoding |= 0xFF & sdwa_op.physReg(); 751 encoding |= (sdwa_op.physReg() < 256) << 23; 752 if (instr->operands.size() >= 2) 753 encoding |= (instr->operands[1].physReg() < 256) << 31; 754 out.push_back(encoding); 755 } else { 756 unreachable("unimplemented instruction format"); 757 } 758 break; 759 } 760 761 /* append literal dword */ 762 for (const Operand& op : instr->operands) { 763 if (op.isLiteral()) { 764 out.push_back(op.constantValue()); 765 break; 766 } 767 } 768} 769 770void 771emit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block) 772{ 773 for (aco_ptr<Instruction>& instr : block.instructions) { 774#if 0 775 int start_idx = out.size(); 776 std::cerr << "Encoding:\t" << std::endl; 777 aco_print_instr(&*instr, stderr); 778 std::cerr << std::endl; 779#endif 780 emit_instruction(ctx, out, instr.get()); 781#if 0 782 for (int i = start_idx; i < out.size(); i++) 783 std::cerr << "encoding: " << "0x" << std::setfill('0') << std::setw(8) << std::hex << out[i] << std::endl; 784#endif 785 } 786} 787 788void 789fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program) 790{ 791 bool exported = false; 792 for (Block& block : program->blocks) { 793 if (!(block.kind & block_kind_export_end)) 794 continue; 795 std::vector<aco_ptr<Instruction>>::reverse_iterator it = block.instructions.rbegin(); 796 while (it != block.instructions.rend()) { 797 if ((*it)->isEXP()) { 798 Export_instruction& exp = (*it)->exp(); 799 if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG) { 800 if (exp.dest >= V_008DFC_SQ_EXP_POS && exp.dest <= (V_008DFC_SQ_EXP_POS + 3)) { 801 exp.done = true; 802 exported = true; 803 break; 804 } 805 } else { 806 if (!program->info.ps.has_epilog) { 807 exp.done = true; 808 exp.valid_mask = true; 809 } 810 exported = true; 811 break; 812 } 813 } else if ((*it)->definitions.size() && (*it)->definitions[0].physReg() == exec) { 814 break; 815 } else if ((*it)->opcode == aco_opcode::s_setpc_b64) { 816 /* Do not abort if the main FS has an epilog because it only 817 * exports MRTZ (if present) and the epilog exports colors. 818 */ 819 exported |= program->stage.hw == HWStage::FS && program->info.ps.has_epilog; 820 } 821 ++it; 822 } 823 } 824 825 if (!exported) { 826 /* Abort in order to avoid a GPU hang. */ 827 bool is_vertex_or_ngg = 828 (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG); 829 aco_err(program, 830 "Missing export in %s shader:", is_vertex_or_ngg ? "vertex or NGG" : "fragment"); 831 aco_print_program(program, stderr); 832 abort(); 833 } 834} 835 836static void 837insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned insert_before, 838 unsigned insert_count, const uint32_t* insert_data) 839{ 840 out.insert(out.begin() + insert_before, insert_data, insert_data + insert_count); 841 842 /* Update the offset of each affected block */ 843 for (Block& block : ctx.program->blocks) { 844 if (block.offset >= insert_before) 845 block.offset += insert_count; 846 } 847 848 /* Find first branch after the inserted code */ 849 auto branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), 850 [insert_before](const auto& branch) -> bool 851 { return (unsigned)branch.first >= insert_before; }); 852 853 /* Update the locations of branches */ 854 for (; branch_it != ctx.branches.end(); ++branch_it) 855 branch_it->first += insert_count; 856 857 /* Update the locations of p_constaddr instructions */ 858 for (auto& constaddr : ctx.constaddrs) { 859 constaddr_info& info = constaddr.second; 860 if (info.getpc_end >= insert_before) 861 info.getpc_end += insert_count; 862 if (info.add_literal >= insert_before) 863 info.add_literal += insert_count; 864 } 865} 866 867static void 868fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out) 869{ 870 /* Branches with an offset of 0x3f are buggy on GFX10, 871 * we workaround by inserting NOPs if needed. 872 */ 873 bool gfx10_3f_bug = false; 874 875 do { 876 auto buggy_branch_it = std::find_if( 877 ctx.branches.begin(), ctx.branches.end(), 878 [&ctx](const auto& branch) -> bool { 879 return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) == 880 0x3f; 881 }); 882 883 gfx10_3f_bug = buggy_branch_it != ctx.branches.end(); 884 885 if (gfx10_3f_bug) { 886 /* Insert an s_nop after the branch */ 887 constexpr uint32_t s_nop_0 = 0xbf800000u; 888 insert_code(ctx, out, buggy_branch_it->first + 1, 1, &s_nop_0); 889 } 890 } while (gfx10_3f_bug); 891} 892 893void 894emit_long_jump(asm_context& ctx, SOPP_instruction* branch, bool backwards, 895 std::vector<uint32_t>& out) 896{ 897 Builder bld(ctx.program); 898 899 Definition def_tmp_lo(branch->definitions[0].physReg(), s1); 900 Operand op_tmp_lo(branch->definitions[0].physReg(), s1); 901 Definition def_tmp_hi(branch->definitions[0].physReg().advance(4), s1); 902 Operand op_tmp_hi(branch->definitions[0].physReg().advance(4), s1); 903 904 aco_ptr<Instruction> instr; 905 906 if (branch->opcode != aco_opcode::s_branch) { 907 /* for conditional branches, skip the long jump if the condition is false */ 908 aco_opcode inv; 909 switch (branch->opcode) { 910 case aco_opcode::s_cbranch_scc0: inv = aco_opcode::s_cbranch_scc1; break; 911 case aco_opcode::s_cbranch_scc1: inv = aco_opcode::s_cbranch_scc0; break; 912 case aco_opcode::s_cbranch_vccz: inv = aco_opcode::s_cbranch_vccnz; break; 913 case aco_opcode::s_cbranch_vccnz: inv = aco_opcode::s_cbranch_vccz; break; 914 case aco_opcode::s_cbranch_execz: inv = aco_opcode::s_cbranch_execnz; break; 915 case aco_opcode::s_cbranch_execnz: inv = aco_opcode::s_cbranch_execz; break; 916 default: unreachable("Unhandled long jump."); 917 } 918 instr.reset(bld.sopp(inv, -1, 6)); 919 emit_instruction(ctx, out, instr.get()); 920 } 921 922 /* create the new PC and stash SCC in the LSB */ 923 instr.reset(bld.sop1(aco_opcode::s_getpc_b64, branch->definitions[0]).instr); 924 emit_instruction(ctx, out, instr.get()); 925 926 instr.reset( 927 bld.sop2(aco_opcode::s_addc_u32, def_tmp_lo, op_tmp_lo, Operand::literal32(0)).instr); 928 emit_instruction(ctx, out, instr.get()); 929 branch->pass_flags = out.size(); 930 931 /* s_addc_u32 for high 32 bits not needed because the program is in a 32-bit VA range */ 932 933 /* restore SCC and clear the LSB of the new PC */ 934 instr.reset(bld.sopc(aco_opcode::s_bitcmp1_b32, def_tmp_lo, op_tmp_lo, Operand::zero()).instr); 935 emit_instruction(ctx, out, instr.get()); 936 instr.reset(bld.sop1(aco_opcode::s_bitset0_b32, def_tmp_lo, Operand::zero()).instr); 937 emit_instruction(ctx, out, instr.get()); 938 939 /* create the s_setpc_b64 to jump */ 940 instr.reset( 941 bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr); 942 emit_instruction(ctx, out, instr.get()); 943} 944 945void 946fix_branches(asm_context& ctx, std::vector<uint32_t>& out) 947{ 948 bool repeat = false; 949 do { 950 repeat = false; 951 952 if (ctx.gfx_level == GFX10) 953 fix_branches_gfx10(ctx, out); 954 955 for (std::pair<int, SOPP_instruction*>& branch : ctx.branches) { 956 int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1; 957 if ((offset < INT16_MIN || offset > INT16_MAX) && !branch.second->pass_flags) { 958 std::vector<uint32_t> long_jump; 959 bool backwards = 960 ctx.program->blocks[branch.second->block].offset < (unsigned)branch.first; 961 emit_long_jump(ctx, branch.second, backwards, long_jump); 962 963 out[branch.first] = long_jump[0]; 964 insert_code(ctx, out, branch.first + 1, long_jump.size() - 1, long_jump.data() + 1); 965 966 repeat = true; 967 break; 968 } 969 970 if (branch.second->pass_flags) { 971 int after_getpc = branch.first + branch.second->pass_flags - 2; 972 offset = (int)ctx.program->blocks[branch.second->block].offset - after_getpc; 973 out[branch.first + branch.second->pass_flags - 1] = offset * 4; 974 } else { 975 out[branch.first] &= 0xffff0000u; 976 out[branch.first] |= (uint16_t)offset; 977 } 978 } 979 } while (repeat); 980} 981 982void 983fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out) 984{ 985 for (auto& constaddr : ctx.constaddrs) { 986 constaddr_info& info = constaddr.second; 987 out[info.add_literal] += (out.size() - info.getpc_end) * 4u; 988 } 989} 990 991unsigned 992emit_program(Program* program, std::vector<uint32_t>& code) 993{ 994 asm_context ctx(program); 995 996 if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::FS || 997 program->stage.hw == HWStage::NGG) 998 fix_exports(ctx, code, program); 999 1000 for (Block& block : program->blocks) { 1001 block.offset = code.size(); 1002 emit_block(ctx, code, block); 1003 } 1004 1005 fix_branches(ctx, code); 1006 1007 unsigned exec_size = code.size() * sizeof(uint32_t); 1008 1009 if (program->gfx_level >= GFX10) { 1010 /* Pad output with s_code_end so instruction prefetching doesn't cause 1011 * page faults */ 1012 unsigned final_size = align(code.size() + 3 * 16, 16); 1013 while (code.size() < final_size) 1014 code.push_back(0xbf9f0000u); 1015 } 1016 1017 fix_constaddrs(ctx, code); 1018 1019 while (program->constant_data.size() % 4u) 1020 program->constant_data.push_back(0); 1021 /* Copy constant data */ 1022 code.insert(code.end(), (uint32_t*)program->constant_data.data(), 1023 (uint32_t*)(program->constant_data.data() + program->constant_data.size())); 1024 1025 return exec_size; 1026} 1027 1028} // namespace aco 1029