1/* -*- mesa-c++ -*- 2 * 3 * Copyright (c) 2022 Collabora LTD 4 * 5 * Author: Gert Wollny <gert.wollny@collabora.com> 6 * 7 * Permission is hereby granted, free of charge, to any person obtaining a 8 * copy of this software and associated documentation files (the "Software"), 9 * to deal in the Software without restriction, including without limitation 10 * on the rights to use, copy, modify, merge, publish, distribute, sub 11 * license, and/or sell copies of the Software, and to permit persons to whom 12 * the Software is furnished to do so, subject to the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the next 15 * paragraph) shall be included in all copies or substantial portions of the 16 * Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 21 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 22 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 23 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 24 * USE OR OTHER DEALINGS IN THE SOFTWARE. 
25 */ 26 27#include "sfn_assembler.h" 28#include "sfn_debug.h" 29#include "sfn_instr_alugroup.h" 30#include "sfn_instr_controlflow.h" 31#include "sfn_instr_fetch.h" 32#include "sfn_instr_export.h" 33#include "sfn_instr_mem.h" 34#include "sfn_instr_tex.h" 35 36#include "sfn_conditionaljumptracker.h" 37#include "sfn_callstack.h" 38 39#include "../eg_sq.h" 40 41namespace r600 { 42Assembler::Assembler(r600_shader *sh, const r600_shader_key& key): 43 m_sh(sh), m_key(key) 44{ 45} 46 47extern const std::map<ESDOp, int> ds_opcode_map; 48 49class AssamblerVisitor : public ConstInstrVisitor { 50public: 51 AssamblerVisitor(r600_shader *sh, const r600_shader_key& key); 52 53 void visit(const AluInstr& instr) override; 54 void visit(const AluGroup& instr) override; 55 void visit(const TexInstr& instr) override; 56 void visit(const ExportInstr& instr) override; 57 void visit(const FetchInstr& instr) override; 58 void visit(const Block& instr) override; 59 void visit(const IfInstr& instr) override; 60 void visit(const ControlFlowInstr& instr) override; 61 void visit(const ScratchIOInstr& instr) override; 62 void visit(const StreamOutInstr& instr) override; 63 void visit(const MemRingOutInstr& instr) override; 64 void visit(const EmitVertexInstr& instr) override; 65 void visit(const GDSInstr& instr) override; 66 void visit(const WriteTFInstr& instr) override; 67 void visit(const LDSAtomicInstr& instr) override; 68 void visit(const LDSReadInstr& instr) override; 69 void visit(const RatInstr& instr) override; 70 71 void finalize(); 72 73 const uint32_t sf_vtx = 1; 74 const uint32_t sf_tex = 2; 75 const uint32_t sf_alu = 4; 76 const uint32_t sf_addr_register = 8; 77 const uint32_t sf_all = 0xf; 78 79 void clear_states(const uint32_t& states); 80 bool copy_dst(r600_bytecode_alu_dst& dst, const Register& d, bool write); 81 PVirtualValue copy_src(r600_bytecode_alu_src& src, const VirtualValue& s); 82 83 EBufferIndexMode 84 emit_index_reg(const VirtualValue& addr, unsigned idx); 85 86 
void emit_endif(); 87 void emit_else(); 88 void emit_loop_begin(bool vpm); 89 void emit_loop_end(); 90 void emit_loop_break(); 91 void emit_loop_cont(); 92 93 void emit_alu_op(const AluInstr& ai); 94 void emit_lds_op(const AluInstr& lds); 95 96 void emit_wait_ack(); 97 98 /* Start initialized in constructor */ 99 const r600_shader_key& m_key; 100 r600_shader *m_shader; 101 r600_bytecode *m_bc; 102 103 ConditionalJumpTracker m_jump_tracker; 104 CallStack m_callstack; 105 bool ps_alpha_to_one; 106 /* End initialized in constructor */ 107 108 std::set<uint32_t> m_nliterals_in_group; 109 std::set<int> vtx_fetch_results; 110 std::set<int> tex_fetch_results; 111 112 PRegister m_last_addr{nullptr}; 113 114 unsigned m_max_color_exports{0}; 115 int m_loop_nesting{0}; 116 117 bool m_ack_suggested{false}; 118 bool m_has_param_output{false}; 119 bool m_has_pos_output{false}; 120 bool m_last_op_was_barrier{false}; 121 bool m_result{true}; 122}; 123 124bool Assembler::lower(Shader *shader) 125{ 126 AssamblerVisitor ass(m_sh, m_key); 127 128 auto& blocks = shader->func(); 129 for (auto b : blocks) { 130 b->accept(ass); 131 if (!ass.m_result) 132 return false; 133 } 134 135 ass.finalize(); 136 137 return ass.m_result; 138 139} 140 141AssamblerVisitor::AssamblerVisitor(r600_shader *sh, const r600_shader_key& key): 142 m_key(key), 143 m_shader(sh), 144 145 m_bc(&sh->bc), 146 m_callstack(sh->bc), 147 ps_alpha_to_one(key.ps.alpha_to_one) 148{ 149 if (m_shader->processor_type == PIPE_SHADER_FRAGMENT) 150 m_max_color_exports = MAX2(m_key.ps.nr_cbufs, 1); 151 152 if (m_shader->processor_type == PIPE_SHADER_VERTEX && 153 m_shader->ninput > 0) 154 r600_bytecode_add_cfinst(m_bc, CF_OP_CALL_FS); 155} 156 157void AssamblerVisitor::finalize() 158{ 159 const struct cf_op_info *last = nullptr; 160 161 if (m_bc->cf_last) 162 last = r600_isa_cf(m_bc->cf_last->op); 163 164 /* alu clause instructions don't have EOP bit, so add NOP */ 165 if (m_shader->bc.gfx_level < CAYMAN && 166 (!last || 
last->flags & CF_ALU || m_bc->cf_last->op == CF_OP_LOOP_END 167 || m_bc->cf_last->op == CF_OP_POP)) 168 r600_bytecode_add_cfinst(m_bc, CF_OP_NOP); 169 170 /* A fetch shader only can't be EOP (results in hang), but we can replace it 171 * by a NOP */ 172 else if (last && m_bc->cf_last->op == CF_OP_CALL_FS) 173 m_bc->cf_last->op = CF_OP_NOP; 174 175 if (m_shader->bc.gfx_level != CAYMAN) 176 m_bc->cf_last->end_of_program = 1; 177 else 178 cm_bytecode_add_cf_end(m_bc); 179} 180 181extern const std::map<EAluOp, int> opcode_map; 182 183void AssamblerVisitor::visit(const AluInstr& ai) 184{ 185 assert(vtx_fetch_results.empty()); 186 assert(tex_fetch_results.empty()); 187 188 if (unlikely(ai.has_alu_flag(alu_is_lds))) 189 emit_lds_op(ai); 190 else 191 emit_alu_op(ai); 192} 193 194void AssamblerVisitor::emit_lds_op(const AluInstr& lds) 195{ 196 struct r600_bytecode_alu alu; 197 memset(&alu, 0, sizeof(alu)); 198 199 alu.is_lds_idx_op = true; 200 alu.op = lds.lds_opcode(); 201 202 bool has_lds_fetch = false; 203 switch (alu.op) { 204 case LDS_WRITE: 205 alu.op =LDS_OP2_LDS_WRITE; 206 break; 207 case LDS_WRITE_REL: 208 alu.op = LDS_OP3_LDS_WRITE_REL; 209 alu.lds_idx = 1; 210 break; 211 case DS_OP_READ_RET: 212 alu.op = LDS_OP1_LDS_READ_RET; 213 FALLTHROUGH; 214 case LDS_ADD_RET: 215 case LDS_AND_RET: 216 case LDS_OR_RET: 217 case LDS_MAX_INT_RET: 218 case LDS_MAX_UINT_RET: 219 case LDS_MIN_INT_RET: 220 case LDS_MIN_UINT_RET: 221 case LDS_XOR_RET: 222 case LDS_XCHG_RET: 223 case LDS_CMP_XCHG_RET: 224 has_lds_fetch = true; 225 break; 226 case LDS_ADD: 227 case LDS_AND: 228 case LDS_OR: 229 case LDS_MAX_INT: 230 case LDS_MAX_UINT: 231 case LDS_MIN_INT: 232 case LDS_MIN_UINT: 233 case LDS_XOR: 234 break; 235 default: 236 std::cerr << "\n R600: error op: " << lds << "\n"; 237 unreachable("Unhandled LDS op"); 238 } 239 240 copy_src(alu.src[0], lds.src(0)); 241 242 if (lds.n_sources() > 1) 243 copy_src(alu.src[1], lds.src(1)); 244 else 245 alu.src[1].sel = V_SQ_ALU_SRC_0; 246 247 if 
(lds.n_sources() > 2) 248 copy_src(alu.src[2], lds.src(2)); 249 else 250 alu.src[2].sel = V_SQ_ALU_SRC_0; 251 252 alu.last = lds.has_alu_flag(alu_last_instr); 253 254 int r = r600_bytecode_add_alu(m_bc, &alu); 255 if (has_lds_fetch) 256 m_bc->cf_last->nlds_read++; 257 258 if (r) 259 m_result = false; 260} 261 262void AssamblerVisitor::emit_alu_op(const AluInstr& ai) 263{ 264 struct r600_bytecode_alu alu; 265 memset(&alu, 0, sizeof(alu)); 266 267 if (opcode_map.find(ai.opcode()) == opcode_map.end()) { 268 std::cerr << "Opcode not handled for " << ai <<"\n"; 269 m_result = false; 270 return; 271 } 272 273 // skip multiple barriers 274 if (m_last_op_was_barrier && ai.opcode() == op0_group_barrier) 275 return; 276 277 m_last_op_was_barrier = ai.opcode() == op0_group_barrier; 278 279 alu.op = opcode_map.at(ai.opcode()); 280 281 auto dst = ai.dest(); 282 if (dst) { 283 if (!copy_dst(alu.dst, *dst, ai.has_alu_flag(alu_write))) { 284 m_result = false; 285 return; 286 } 287 288 alu.dst.write = ai.has_alu_flag(alu_write); 289 alu.dst.clamp = ai.has_alu_flag(alu_dst_clamp); 290 alu.dst.rel = dst->addr() ? 
1 : 0; 291 } else { 292 alu.dst.chan = ai.dest_chan(); 293 } 294 295 alu.is_op3 = ai.n_sources() == 3; 296 297 EBufferIndexMode kcache_index_mode = bim_none; 298 PVirtualValue buffer_offset = nullptr; 299 300 for (unsigned i = 0; i < ai.n_sources(); ++i) { 301 buffer_offset = copy_src(alu.src[i], ai.src(i)); 302 alu.src[i].neg = ai.has_alu_flag(AluInstr::src_neg_flags[i]); 303 if (!alu.is_op3) 304 alu.src[i].abs = ai.has_alu_flag(AluInstr::src_abs_flags[i]); 305 306 if (buffer_offset && kcache_index_mode == bim_none) { 307 kcache_index_mode = bim_zero; 308 alu.src[i].kc_bank = 1; 309 alu.src[i].kc_rel = 1; 310 } 311 312 if (ai.has_lds_queue_read()) { 313 assert(m_bc->cf_last->nlds_read > 0); 314 m_bc->cf_last->nlds_read--; 315 } 316 } 317 318 if (ai.bank_swizzle() != alu_vec_unknown) 319 alu.bank_swizzle_force = ai.bank_swizzle(); 320 321 alu.last = ai.has_alu_flag(alu_last_instr); 322 alu.execute_mask = ai.has_alu_flag(alu_update_exec); 323 324 /* If the destination register is equal to the last loaded address register 325 * then clear the latter one, because the values will no longer be identical */ 326 if (m_last_addr) 327 sfn_log << SfnLog::assembly << " Current address register is " << *m_last_addr << "\n"; 328 329 if (dst) 330 sfn_log << SfnLog::assembly << " Current dst register is " << *dst << "\n"; 331 332 if (dst && m_last_addr && *dst == *m_last_addr) { 333 sfn_log << SfnLog::assembly << " Clear address register (was " << *m_last_addr << "\n"; 334 m_last_addr = nullptr; 335 } 336 337 auto cf_op = ai.cf_type(); 338 339 unsigned type = 0; 340 switch (cf_op) { 341 case cf_alu: type = CF_OP_ALU; break; 342 case cf_alu_push_before: type = CF_OP_ALU_PUSH_BEFORE; break; 343 case cf_alu_pop_after: type = CF_OP_ALU_POP_AFTER; break; 344 case cf_alu_pop2_after: type = CF_OP_ALU_POP2_AFTER; break; 345 case cf_alu_break: type = CF_OP_ALU_BREAK; break; 346 case cf_alu_else_after: type = CF_OP_ALU_ELSE_AFTER; break; 347 case cf_alu_continue: type = CF_OP_ALU_CONTINUE; 
break; 348 case cf_alu_extended: type = CF_OP_ALU_EXT; break; 349 default: 350 assert(0 && "cf_alu_undefined should have been replaced"); 351 } 352 353 if (alu.last) 354 m_nliterals_in_group.clear(); 355 356 357 m_result = !r600_bytecode_add_alu_type(m_bc, &alu, type); 358 359 if (ai.opcode() == op1_mova_int) 360 m_bc->ar_loaded = 0; 361 362 if (ai.opcode() == op1_set_cf_idx0) 363 m_bc->index_loaded[0] = 1; 364 365 if (ai.opcode() == op1_set_cf_idx1) 366 m_bc->index_loaded[1] = 1; 367 368 m_bc->force_add_cf |= (ai.opcode() == op2_kille || 369 ai.opcode() == op2_killne_int || 370 ai.opcode() == op1_set_cf_idx0 || 371 ai.opcode() == op1_set_cf_idx1); 372} 373 374void AssamblerVisitor::visit(const AluGroup& group) 375{ 376 clear_states(sf_vtx | sf_tex); 377 378 if (group.slots() == 0) 379 return; 380 381 if (group.has_lds_group_start()) { 382 if (m_bc->cf_last->ndw + 2 * (*group.begin())->required_slots() > 220) { 383 assert(m_bc->cf_last->nlds_read == 0); 384 m_bc->force_add_cf = 1; 385 m_last_addr = nullptr; 386 } 387 } else if (m_bc->cf_last) { 388 if (m_bc->cf_last->ndw + 2 * group.slots() > 240) { 389 assert(m_bc->cf_last->nlds_read == 0); 390 m_bc->force_add_cf = 1; 391 m_last_addr = nullptr; 392 } else { 393 auto instr = *group.begin(); 394 if (instr && 395 !instr->has_alu_flag(alu_is_lds) && 396 instr->opcode() == op0_group_barrier && 397 m_bc->cf_last->ndw + 14 > 240) { 398 assert(m_bc->cf_last->nlds_read == 0); 399 m_bc->force_add_cf = 1; 400 m_last_addr = nullptr; 401 } 402 } 403 } 404 405 auto addr = group.addr(); 406 407 if (addr.first) { 408 if (!addr.second) { 409 if (!m_last_addr || !m_bc->ar_loaded || 410 !m_last_addr->equal_to(*addr.first)) { 411 m_bc->ar_reg = addr.first->sel(); 412 m_bc->ar_chan = addr.first->chan(); 413 m_last_addr = addr.first; 414 m_bc->ar_loaded = 0; 415 416 r600_load_ar(m_bc, group.addr_for_src()); 417 } 418 } else { 419 emit_index_reg(*addr.first, 0); 420 } 421 } 422 423 for (auto& i : group) { 424 if (i) 425 
i->accept(*this); 426 } 427} 428 429void AssamblerVisitor::visit(const TexInstr& tex_instr) 430{ 431 clear_states(sf_vtx | sf_alu); 432 433 int sampler_offset = 0; 434 auto addr = tex_instr.sampler_offset(); 435 EBufferIndexMode index_mode = bim_none; 436 437 if (addr) 438 index_mode = emit_index_reg(*addr, 1); 439 440 if (tex_fetch_results.find(tex_instr.src().sel()) != 441 tex_fetch_results.end()) { 442 m_bc->force_add_cf = 1; 443 tex_fetch_results.clear(); 444 } 445 446 r600_bytecode_tex tex; 447 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 448 tex.op = tex_instr.opcode(); 449 tex.sampler_id = tex_instr.sampler_id() + sampler_offset; 450 tex.resource_id = tex_instr.resource_id() + sampler_offset; 451 tex.src_gpr = tex_instr.src().sel(); 452 tex.dst_gpr = tex_instr.dst().sel(); 453 tex.dst_sel_x = tex_instr.dest_swizzle(0); 454 tex.dst_sel_y = tex_instr.dest_swizzle(1); 455 tex.dst_sel_z = tex_instr.dest_swizzle(2); 456 tex.dst_sel_w = tex_instr.dest_swizzle(3); 457 tex.src_sel_x = tex_instr.src()[0]->chan(); 458 tex.src_sel_y = tex_instr.src()[1]->chan(); 459 tex.src_sel_z = tex_instr.src()[2]->chan(); 460 tex.src_sel_w = tex_instr.src()[3]->chan(); 461 tex.coord_type_x = !tex_instr.has_tex_flag(TexInstr::x_unnormalized); 462 tex.coord_type_y = !tex_instr.has_tex_flag(TexInstr::y_unnormalized); 463 tex.coord_type_z = !tex_instr.has_tex_flag(TexInstr::z_unnormalized); 464 tex.coord_type_w = !tex_instr.has_tex_flag(TexInstr::w_unnormalized); 465 tex.offset_x = tex_instr.get_offset(0); 466 tex.offset_y = tex_instr.get_offset(1); 467 tex.offset_z = tex_instr.get_offset(2); 468 tex.resource_index_mode = index_mode; 469 tex.sampler_index_mode = index_mode; 470 471 if (tex.dst_sel_x < 4 && 472 tex.dst_sel_y < 4 && 473 tex.dst_sel_z < 4 && 474 tex.dst_sel_w < 4) 475 tex_fetch_results.insert(tex.dst_gpr); 476 477 if (tex_instr.opcode() == TexInstr::get_gradient_h || 478 tex_instr.opcode() == TexInstr::get_gradient_v) 479 tex.inst_mod = 
tex_instr.has_tex_flag(TexInstr::grad_fine) ? 1 : 0; 480 else 481 tex.inst_mod = tex_instr.inst_mode(); 482 if (r600_bytecode_add_tex(m_bc, &tex)) { 483 R600_ERR("shader_from_nir: Error creating tex assembly instruction\n"); 484 m_result = false; 485 } 486} 487 488void AssamblerVisitor::visit(const ExportInstr& exi) 489{ 490 const auto& value = exi.value(); 491 492 r600_bytecode_output output; 493 memset(&output, 0, sizeof(output)); 494 495 output.gpr = value.sel(); 496 output.elem_size = 3; 497 output.swizzle_x = value[0]->chan(); 498 output.swizzle_y = value[1]->chan(); 499 output.swizzle_z = value[2]->chan(); 500 output.burst_count = 1; 501 output.op = exi.is_last_export() ? CF_OP_EXPORT_DONE: CF_OP_EXPORT; 502 output.type = exi.export_type(); 503 504 505 clear_states(sf_all); 506 switch (exi.export_type()) { 507 case ExportInstr::pixel: 508 output.swizzle_w = ps_alpha_to_one ? 5 : exi.value()[3]->chan(); 509 output.array_base = exi.location(); 510 break; 511 case ExportInstr::pos: 512 output.swizzle_w = exi.value()[3]->chan(); 513 output.array_base = 60 + exi.location(); 514 break; 515 case ExportInstr::param: 516 output.swizzle_w = exi.value()[3]->chan(); 517 output.array_base = exi.location(); 518 break; 519 default: 520 R600_ERR("shader_from_nir: export %d type not yet supported\n", exi.export_type()); 521 m_result = false; 522 } 523 524 /* If all register elements pinned to fixed values 525 * we can override the gpr (the register allocator doesn't see 526 * this because it doesn't take these channels into account. 
*/ 527 if (output.swizzle_x > 3 && output.swizzle_y > 3 && 528 output.swizzle_z > 3 && output.swizzle_w > 3) 529 output.gpr = 0; 530 531 int r = 0; 532 if ((r =r600_bytecode_add_output(m_bc, &output))) { 533 R600_ERR("Error adding export at location %d : err: %d\n", exi.location(), r); 534 m_result = false; 535 } 536} 537 538void AssamblerVisitor::visit(const ScratchIOInstr& instr) 539{ 540 clear_states(sf_all); 541 542 struct r600_bytecode_output cf; 543 544 memset(&cf, 0, sizeof(struct r600_bytecode_output)); 545 546 cf.op = CF_OP_MEM_SCRATCH; 547 cf.elem_size = 3; 548 cf.gpr = instr.value().sel(); 549 cf.mark = !instr.is_read(); 550 cf.comp_mask = instr.is_read() ? 0xf : instr.write_mask(); 551 cf.swizzle_x = 0; 552 cf.swizzle_y = 1; 553 cf.swizzle_z = 2; 554 cf.swizzle_w = 3; 555 cf.burst_count = 1; 556 557 assert(!instr.is_read() || m_bc->gfx_level < R700); 558 559 if (instr.address()) { 560 cf.type = instr.is_read() || m_bc->gfx_level > R600 ? 3 : 1; 561 cf.index_gpr = instr.address()->sel(); 562 563 /* The docu seems to be wrong here: In indirect addressing the 564 * address_base seems to be the array_size */ 565 cf.array_size = instr.array_size(); 566 } else { 567 cf.type = instr.is_read() || m_bc->gfx_level > R600 ? 
2 : 0; 568 cf.array_base = instr.location(); 569 } 570 571 if (r600_bytecode_add_output(m_bc, &cf)){ 572 R600_ERR("shader_from_nir: Error creating SCRATCH_WR assembly instruction\n"); 573 m_result = false; 574 } 575} 576 577void AssamblerVisitor::visit(const StreamOutInstr& instr) 578{ 579 struct r600_bytecode_output output; 580 memset(&output, 0, sizeof(struct r600_bytecode_output)); 581 582 output.gpr = instr.value().sel(); 583 output.elem_size = instr.element_size(); 584 output.array_base = instr.array_base(); 585 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE; 586 output.burst_count = instr.burst_count(); 587 output.array_size = instr.array_size(); 588 output.comp_mask = instr.comp_mask(); 589 output.op = instr.op(m_shader->bc.gfx_level); 590 591 592 if (r600_bytecode_add_output(m_bc, &output)) { 593 R600_ERR("shader_from_nir: Error creating stream output instruction\n"); 594 m_result = false; 595 } 596} 597 598void AssamblerVisitor::visit(const MemRingOutInstr& instr) 599{ 600 struct r600_bytecode_output output; 601 memset(&output, 0, sizeof(struct r600_bytecode_output)); 602 603 output.gpr = instr.value().sel(); 604 output.type = instr.type(); 605 output.elem_size = 3; 606 output.comp_mask = 0xf; 607 output.burst_count = 1; 608 output.op = instr.op(); 609 if (instr.type() == MemRingOutInstr::mem_write_ind || 610 instr.type() == MemRingOutInstr::mem_write_ind_ack) { 611 output.index_gpr = instr.index_reg(); 612 output.array_size = 0xfff; 613 } 614 output.array_base = instr.array_base(); 615 616 if (r600_bytecode_add_output(m_bc, &output)) { 617 R600_ERR("shader_from_nir: Error creating mem ring write instruction\n"); 618 m_result = false; 619 } 620} 621 622void AssamblerVisitor::visit(const EmitVertexInstr& instr) 623{ 624 int r = r600_bytecode_add_cfinst(m_bc, instr.op()); 625 if (!r) 626 m_bc->cf_last->count = instr.stream(); 627 else 628 m_result = false; 629 assert(m_bc->cf_last->count < 4); 630} 631 632void AssamblerVisitor::visit(const 
FetchInstr& fetch_instr) 633{ 634 clear_states(sf_tex | sf_alu); 635 636 auto buffer_offset = fetch_instr.resource_offset(); 637 EBufferIndexMode rat_index_mode = bim_none; 638 639 if (buffer_offset) 640 rat_index_mode = emit_index_reg(*buffer_offset, 0); 641 642 if (fetch_instr.has_fetch_flag(FetchInstr::wait_ack)) 643 emit_wait_ack(); 644 645 bool use_tc = fetch_instr.has_fetch_flag(FetchInstr::use_tc) || 646 (m_bc->gfx_level == CAYMAN); 647 if (!use_tc && 648 vtx_fetch_results.find(fetch_instr.src().sel()) != 649 vtx_fetch_results.end()) { 650 m_bc->force_add_cf = 1; 651 vtx_fetch_results.clear(); 652 } 653 654 if (fetch_instr.has_fetch_flag(FetchInstr::use_tc) && 655 tex_fetch_results.find(fetch_instr.src().sel()) != 656 tex_fetch_results.end()) { 657 m_bc->force_add_cf = 1; 658 tex_fetch_results.clear(); 659 } 660 661 if (use_tc) 662 tex_fetch_results.insert(fetch_instr.dst().sel()); 663 else 664 vtx_fetch_results.insert(fetch_instr.dst().sel()); 665 666 struct r600_bytecode_vtx vtx; 667 memset(&vtx, 0, sizeof(vtx)); 668 vtx.op = fetch_instr.opcode(); 669 vtx.buffer_id = fetch_instr.resource_id(); 670 vtx.fetch_type = fetch_instr.fetch_type(); 671 vtx.src_gpr = fetch_instr.src().sel(); 672 vtx.src_sel_x = fetch_instr.src().chan(); 673 vtx.mega_fetch_count = fetch_instr.mega_fetch_count(); 674 vtx.dst_gpr = fetch_instr.dst().sel(); 675 vtx.dst_sel_x = fetch_instr.dest_swizzle(0); /* SEL_X */ 676 vtx.dst_sel_y = fetch_instr.dest_swizzle(1); /* SEL_Y */ 677 vtx.dst_sel_z = fetch_instr.dest_swizzle(2); /* SEL_Z */ 678 vtx.dst_sel_w = fetch_instr.dest_swizzle(3); /* SEL_W */ 679 vtx.use_const_fields = fetch_instr.has_fetch_flag(FetchInstr::use_const_field); 680 vtx.data_format = fetch_instr.data_format(); 681 vtx.num_format_all = fetch_instr.num_format(); /* NUM_FORMAT_SCALED */ 682 vtx.format_comp_all = fetch_instr.has_fetch_flag(FetchInstr::format_comp_signed); 683 vtx.endian = fetch_instr.endian_swap(); 684 vtx.buffer_index_mode = rat_index_mode; 685 vtx.offset 
= fetch_instr.src_offset(); 686 vtx.indexed = fetch_instr.has_fetch_flag(FetchInstr::indexed); 687 vtx.uncached = fetch_instr.has_fetch_flag(FetchInstr::uncached); 688 vtx.elem_size = fetch_instr.elm_size(); 689 vtx.array_base = fetch_instr.array_base(); 690 vtx.array_size = fetch_instr.array_size(); 691 vtx.srf_mode_all = fetch_instr.has_fetch_flag(FetchInstr::srf_mode); 692 693 if (fetch_instr.has_fetch_flag(FetchInstr::use_tc)) { 694 if ((r600_bytecode_add_vtx_tc(m_bc, &vtx))) { 695 R600_ERR("shader_from_nir: Error creating tex assembly instruction\n"); 696 m_result = false; 697 } 698 699 } else { 700 if ((r600_bytecode_add_vtx(m_bc, &vtx))) { 701 R600_ERR("shader_from_nir: Error creating tex assembly instruction\n"); 702 m_result = false; 703 } 704 } 705 706 m_bc->cf_last->vpm = (m_bc->type == PIPE_SHADER_FRAGMENT) && 707 fetch_instr.has_fetch_flag(FetchInstr::vpm); 708 m_bc->cf_last->barrier = 1; 709} 710 711void AssamblerVisitor::visit(const WriteTFInstr& instr) 712{ 713 struct r600_bytecode_gds gds; 714 715 auto& value = instr.value(); 716 717 memset(&gds, 0, sizeof(struct r600_bytecode_gds)); 718 gds.src_gpr = value.sel(); 719 gds.src_sel_x = value[0]->chan(); 720 gds.src_sel_y = value[1]->chan(); 721 gds.src_sel_z = 4; 722 gds.dst_sel_x = 7; 723 gds.dst_sel_y = 7; 724 gds.dst_sel_z = 7; 725 gds.dst_sel_w = 7; 726 gds.op = FETCH_OP_TF_WRITE; 727 728 if (r600_bytecode_add_gds(m_bc, &gds) != 0) { 729 m_result = false; 730 return; 731 } 732 733 if (value[2]->chan() != 7) { 734 memset(&gds, 0, sizeof(struct r600_bytecode_gds)); 735 gds.src_gpr = value.sel(); 736 gds.src_sel_x = value[2]->chan(); 737 gds.src_sel_y = value[3]->chan(); 738 gds.src_sel_z = 4; 739 gds.dst_sel_x = 7; 740 gds.dst_sel_y = 7; 741 gds.dst_sel_z = 7; 742 gds.dst_sel_w = 7; 743 gds.op = FETCH_OP_TF_WRITE; 744 745 if (r600_bytecode_add_gds(m_bc, &gds)) { 746 m_result = false; 747 return; 748 } 749 } 750} 751 752void AssamblerVisitor::visit(const RatInstr& instr) 753{ 754 struct 
r600_bytecode_gds gds; 755 756 /* The instruction writes to the retuen buffer loaction, and 757 * the value will actually be read bach, so make sure all previous writes 758 * have been finished */ 759 if (m_ack_suggested /*&& instr.has_instr_flag(Instr::ack_rat_return_write)*/) 760 emit_wait_ack(); 761 762 int rat_idx = instr.rat_id(); 763 EBufferIndexMode rat_index_mode = bim_none; 764 auto addr = instr.rat_id_offset(); 765 766 if (addr) 767 rat_index_mode = emit_index_reg(*addr, 1); 768 769 memset(&gds, 0, sizeof(struct r600_bytecode_gds)); 770 771 r600_bytecode_add_cfinst(m_bc, instr.cf_opcode()); 772 auto cf = m_bc->cf_last; 773 cf->rat.id = rat_idx + m_shader->rat_base; 774 cf->rat.inst = instr.rat_op(); 775 cf->rat.index_mode = rat_index_mode; 776 cf->output.type = instr.need_ack() ? 3 : 1; 777 cf->output.gpr = instr.data_gpr(); 778 cf->output.index_gpr = instr.index_gpr(); 779 cf->output.comp_mask = instr.comp_mask(); 780 cf->output.burst_count = instr.burst_count(); 781 assert(instr.data_swz(0) == PIPE_SWIZZLE_X); 782 if (cf->rat.inst != RatInstr::STORE_TYPED) { 783 assert(instr.data_swz(1) == PIPE_SWIZZLE_Y || 784 instr.data_swz(1) == PIPE_SWIZZLE_MAX) ; 785 assert(instr.data_swz(2) == PIPE_SWIZZLE_Z || 786 instr.data_swz(2) == PIPE_SWIZZLE_MAX) ; 787 } 788 789 cf->vpm = m_bc->type == PIPE_SHADER_FRAGMENT; 790 cf->barrier = 1; 791 cf->mark = instr.need_ack(); 792 cf->output.elem_size = instr.elm_size(); 793 794 m_ack_suggested |= instr.need_ack(); 795} 796 797 798void AssamblerVisitor::clear_states(const uint32_t& states) 799{ 800 if (states & sf_vtx) 801 vtx_fetch_results.clear(); 802 803 if (states & sf_tex) 804 tex_fetch_results.clear(); 805 806 if (states & sf_alu) { 807 m_last_op_was_barrier = false; 808 m_last_addr = nullptr; 809 } 810 811} 812 813 814void AssamblerVisitor::visit(const Block& block) 815{ 816 if (block.empty()) 817 return; 818 819 m_bc->force_add_cf = block.has_instr_flag(Instr::force_cf); 820 sfn_log << SfnLog::assembly << "Translate 
block size: " << block.size() << " new_cf:" << m_bc->force_add_cf << "\n"; 821 822 for (const auto& i : block) { 823 sfn_log << SfnLog::assembly << "Translate " << *i << " "; 824 i->accept(*this); 825 sfn_log << SfnLog::assembly << (m_result ? "good" : "fail") << "\n"; 826 827 if (!m_result) 828 break; 829 } 830} 831 832void AssamblerVisitor::visit(const IfInstr& instr) 833{ 834 int elems = m_callstack.push(FC_PUSH_VPM); 835 bool needs_workaround = false; 836 837 if (m_bc->gfx_level == CAYMAN && m_bc->stack.loop > 1) 838 needs_workaround = true; 839 840 if (m_bc->gfx_level == EVERGREEN && 841 m_bc->family != CHIP_HEMLOCK && 842 m_bc->family != CHIP_CYPRESS && 843 m_bc->family != CHIP_JUNIPER) { 844 unsigned dmod1 = (elems - 1) % m_bc->stack.entry_size; 845 unsigned dmod2 = (elems) % m_bc->stack.entry_size; 846 847 if (elems && (!dmod1 || !dmod2)) 848 needs_workaround = true; 849 } 850 851 auto pred = instr.predicate(); 852 auto [addr, dummy0, dummy1 ] = pred->indirect_addr(); {} 853 if (addr) { 854 if (!m_last_addr || !m_bc->ar_loaded || 855 !m_last_addr->equal_to(*addr)) { 856 m_bc->ar_reg = addr->sel(); 857 m_bc->ar_chan = addr->chan(); 858 m_last_addr = addr; 859 m_bc->ar_loaded = 0; 860 861 r600_load_ar(m_bc, true); 862 } 863 } 864 865 if (needs_workaround) { 866 r600_bytecode_add_cfinst(m_bc, CF_OP_PUSH); 867 m_bc->cf_last->cf_addr = m_bc->cf_last->id + 2; 868 pred->set_cf_type(cf_alu); 869 } 870 871 clear_states(sf_tex|sf_vtx); 872 pred->accept(*this); 873 874 r600_bytecode_add_cfinst(m_bc, CF_OP_JUMP); 875 clear_states(sf_all); 876 877 m_jump_tracker.push(m_bc->cf_last, jt_if); 878} 879 880void AssamblerVisitor::visit(const ControlFlowInstr& instr) 881{ 882 clear_states(sf_all); 883 switch (instr.cf_type()) { 884 case ControlFlowInstr::cf_else: 885 emit_else(); 886 break; 887 case ControlFlowInstr::cf_endif: 888 emit_endif(); 889 break; 890 case ControlFlowInstr::cf_loop_begin: 891 emit_loop_begin(instr.has_instr_flag(Instr::vpm)); 892 break; 893 case 
ControlFlowInstr::cf_loop_end: 894 emit_loop_end(); 895 break; 896 case ControlFlowInstr::cf_loop_break: 897 emit_loop_break(); 898 break; 899 case ControlFlowInstr::cf_loop_continue: 900 emit_loop_cont(); 901 break; 902 case ControlFlowInstr::cf_wait_ack: 903 { 904 int r = r600_bytecode_add_cfinst(m_bc, CF_OP_WAIT_ACK); 905 if (!r) { 906 m_bc->cf_last->cf_addr = 0; 907 m_bc->cf_last->barrier = 1; 908 m_ack_suggested = false; 909 } else { 910 m_result = false; 911 } 912 } 913 break; 914 default: 915 unreachable("Unknown CF instruction type"); 916 } 917} 918 919void AssamblerVisitor::visit(const GDSInstr& instr) 920{ 921 struct r600_bytecode_gds gds; 922 923 bool indirect = false; 924 auto addr = instr.uav_id(); 925 926 if (addr) { 927 indirect = true; 928 emit_index_reg(*addr, 1); 929 } 930 931 memset(&gds, 0, sizeof(struct r600_bytecode_gds)); 932 933 gds.op = ds_opcode_map.at(instr.opcode()); 934 gds.dst_gpr = instr.dest()->sel(); 935 gds.uav_id = instr.uav_base(); 936 gds.uav_index_mode = indirect ? bim_one : bim_none; 937 gds.src_gpr = instr.src().sel(); 938 939 gds.src_sel_x = instr.src()[0]->chan() < 7 ? instr.src()[0]->chan() : 4; 940 gds.src_sel_y = instr.src()[1]->chan(); 941 gds.src_sel_z = instr.src()[2]->chan() < 7 ? instr.src()[2]->chan() : 4; 942 943 gds.dst_sel_x = 7; 944 gds.dst_sel_y = 7; 945 gds.dst_sel_z = 7; 946 gds.dst_sel_w = 7; 947 948 switch (instr.dest()->chan()) { 949 case 0: gds.dst_sel_x = 0;break; 950 case 1: gds.dst_sel_y = 0;break; 951 case 2: gds.dst_sel_z = 0;break; 952 case 3: gds.dst_sel_w = 0; 953 } 954 955 gds.src_gpr2 = 0; 956 gds.alloc_consume = m_bc->gfx_level < CAYMAN ? 
1 : 0; // Not Cayman 957 958 int r = r600_bytecode_add_gds(m_bc, &gds); 959 if (r) { 960 m_result = false; 961 return; 962 } 963 m_bc->cf_last->vpm = PIPE_SHADER_FRAGMENT == m_bc->type; 964 m_bc->cf_last->barrier = 1; 965} 966 967void AssamblerVisitor::visit(const LDSAtomicInstr& instr) 968{ 969 (void)instr; 970 unreachable("LDSAtomicInstr must be lowered to ALUInstr"); 971} 972 973void AssamblerVisitor::visit(const LDSReadInstr& instr) 974{ 975 (void)instr; 976 unreachable("LDSReadInstr must be lowered to ALUInstr"); 977} 978 979EBufferIndexMode 980AssamblerVisitor::emit_index_reg(const VirtualValue& addr, unsigned idx) 981{ 982 assert(idx < 2); 983 984 if (!m_bc->index_loaded[idx] || m_loop_nesting || 985 m_bc->index_reg[idx] != (unsigned)addr.sel() 986 || m_bc->index_reg_chan[idx] != (unsigned)addr.chan()) { 987 struct r600_bytecode_alu alu; 988 989 // Make sure MOVA is not last instr in clause 990 991 if (!m_bc->cf_last || (m_bc->cf_last->ndw>>1) >= 110) 992 m_bc->force_add_cf = 1; 993 994 if (m_bc->gfx_level != CAYMAN) { 995 996 EAluOp idxop = idx ? op1_set_cf_idx1 : op1_set_cf_idx0; 997 998 memset(&alu, 0, sizeof(alu)); 999 alu.op = opcode_map.at(op1_mova_int); 1000 alu.dst.chan = 0; 1001 alu.src[0].sel = addr.sel(); 1002 alu.src[0].chan = addr.chan(); 1003 alu.last = 1; 1004 sfn_log << SfnLog::assembly << " mova_int, "; 1005 int r = r600_bytecode_add_alu(m_bc, &alu); 1006 if (r) 1007 return bim_invalid; 1008 1009 alu.op = opcode_map.at(idxop); 1010 alu.dst.chan = 0; 1011 alu.src[0].sel = 0; 1012 alu.src[0].chan = 0; 1013 alu.last = 1; 1014 sfn_log << SfnLog::assembly << "op1_set_cf_idx" << idx; 1015 r = r600_bytecode_add_alu(m_bc, &alu); 1016 if (r) 1017 return bim_invalid; 1018 } else { 1019 memset(&alu, 0, sizeof(alu)); 1020 alu.op = opcode_map.at(op1_mova_int); 1021 alu.dst.sel = idx == 0 ? 
CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1; 1022 alu.dst.chan = 0; 1023 alu.src[0].sel = addr.sel(); 1024 alu.src[0].chan = addr.chan(); 1025 alu.last = 1; 1026 sfn_log << SfnLog::assembly << " mova_int, "; 1027 int r = r600_bytecode_add_alu(m_bc, &alu); 1028 if (r) 1029 return bim_invalid; 1030 } 1031 1032 m_bc->ar_loaded = 0; 1033 m_bc->index_reg[idx] = addr.sel(); 1034 m_bc->index_reg_chan[idx] = addr.chan(); 1035 m_bc->index_loaded[idx] = true; 1036 m_bc->force_add_cf = 1; 1037 sfn_log << SfnLog::assembly << "\n"; 1038 } 1039 return idx == 0 ? bim_zero : bim_one; 1040} 1041 1042void AssamblerVisitor::emit_else() 1043{ 1044 r600_bytecode_add_cfinst(m_bc, CF_OP_ELSE); 1045 m_bc->cf_last->pop_count = 1; 1046 m_result &= m_jump_tracker.add_mid(m_bc->cf_last, jt_if); 1047} 1048 1049void AssamblerVisitor::emit_endif() 1050{ 1051 m_callstack.pop(FC_PUSH_VPM); 1052 1053 unsigned force_pop = m_bc->force_add_cf; 1054 if (!force_pop) { 1055 int alu_pop = 3; 1056 if (m_bc->cf_last) { 1057 if (m_bc->cf_last->op == CF_OP_ALU) 1058 alu_pop = 0; 1059 else if (m_bc->cf_last->op == CF_OP_ALU_POP_AFTER) 1060 alu_pop = 1; 1061 } 1062 alu_pop += 1; 1063 if (alu_pop == 1) { 1064 m_bc->cf_last->op = CF_OP_ALU_POP_AFTER; 1065 m_bc->force_add_cf = 1; 1066 } else { 1067 force_pop = 1; 1068 } 1069 } 1070 1071 if (force_pop) { 1072 r600_bytecode_add_cfinst(m_bc, CF_OP_POP); 1073 m_bc->cf_last->pop_count = 1; 1074 m_bc->cf_last->cf_addr = m_bc->cf_last->id + 2; 1075 } 1076 1077 m_result &= m_jump_tracker.pop(m_bc->cf_last, jt_if); 1078} 1079 1080void AssamblerVisitor::emit_loop_begin(bool vpm) 1081{ 1082 r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_START_DX10); 1083 m_bc->cf_last->vpm = vpm && m_bc->type == PIPE_SHADER_FRAGMENT; 1084 m_jump_tracker.push(m_bc->cf_last, jt_loop); 1085 m_callstack.push(FC_LOOP); 1086 ++m_loop_nesting; 1087} 1088 1089void AssamblerVisitor::emit_loop_end() 1090{ 1091 r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_END); 1092 m_callstack.pop(FC_LOOP); 1093 
assert(m_loop_nesting); 1094 --m_loop_nesting; 1095 m_result |= m_jump_tracker.pop(m_bc->cf_last, jt_loop); 1096} 1097 1098void AssamblerVisitor::emit_loop_break() 1099{ 1100 r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_BREAK); 1101 m_result |= m_jump_tracker.add_mid(m_bc->cf_last, jt_loop); 1102} 1103 1104void AssamblerVisitor::emit_loop_cont() 1105{ 1106 r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_CONTINUE); 1107 m_result |= m_jump_tracker.add_mid(m_bc->cf_last, jt_loop); 1108} 1109 1110bool AssamblerVisitor::copy_dst(r600_bytecode_alu_dst& dst, 1111 const Register& d, bool write) 1112{ 1113 if (write && d.sel() > 124) { 1114 R600_ERR("shader_from_nir: Don't support more then 124 GPRs, but try using %d\n", 1115 d.sel()); 1116 m_result = false; 1117 return false; 1118 } 1119 1120 dst.sel = d.sel(); 1121 dst.chan = d.chan(); 1122 1123 if (m_bc->index_reg[1] == dst.sel && 1124 m_bc->index_reg_chan[1] == dst.chan) 1125 m_bc->index_loaded[1] = false; 1126 1127 if (m_bc->index_reg[0] == dst.sel && 1128 m_bc->index_reg_chan[0] == dst.chan) 1129 m_bc->index_loaded[0] = false; 1130 1131 return true; 1132} 1133 1134void AssamblerVisitor::emit_wait_ack() 1135{ 1136 int r = r600_bytecode_add_cfinst(m_bc, CF_OP_WAIT_ACK); 1137 if (!r) { 1138 m_bc->cf_last->cf_addr = 0; 1139 m_bc->cf_last->barrier = 1; 1140 m_ack_suggested = false; 1141 } else 1142 m_result = false; 1143} 1144 1145class EncodeSourceVisitor : public ConstRegisterVisitor { 1146public: 1147 1148 EncodeSourceVisitor(r600_bytecode_alu_src& s, r600_bytecode *bc); 1149 void visit(const Register& value) override; 1150 void visit(const LocalArray& value) override; 1151 void visit(const LocalArrayValue& value) override; 1152 void visit(const UniformValue& value) override; 1153 void visit(const LiteralConstant& value) override; 1154 void visit(const InlineConstant& value) override; 1155 1156 r600_bytecode_alu_src& src; 1157 r600_bytecode *m_bc; 1158 PVirtualValue m_buffer_offset{nullptr}; 1159}; 1160 1161PVirtualValue 
AssamblerVisitor::copy_src(r600_bytecode_alu_src& src, const VirtualValue& s) 1162{ 1163 1164 EncodeSourceVisitor visitor(src, m_bc); 1165 src.sel = s.sel(); 1166 src.chan = s.chan(); 1167 1168 s.accept(visitor); 1169 return visitor.m_buffer_offset; 1170} 1171 1172EncodeSourceVisitor::EncodeSourceVisitor(r600_bytecode_alu_src& s, r600_bytecode *bc): 1173 src(s), m_bc(bc) 1174{ 1175} 1176 1177void EncodeSourceVisitor::visit(const Register& value) 1178{ 1179 assert(value.sel() <= 124 && "Only have 124 registers"); 1180} 1181 1182void EncodeSourceVisitor::visit(const LocalArray& value) 1183{ 1184 (void)value; 1185 unreachable("An array can't be a source register"); 1186} 1187 1188void EncodeSourceVisitor::visit(const LocalArrayValue& value) 1189{ 1190 src.rel = value.addr() ? 1 : 0; 1191} 1192 1193void EncodeSourceVisitor::visit(const UniformValue& value) 1194{ 1195 assert(value.sel() >= 512 && "Uniform values must have a sel >= 512"); 1196 m_buffer_offset = value.buf_addr(); 1197 src.kc_bank = value.kcache_bank(); 1198} 1199 1200void EncodeSourceVisitor::visit(const LiteralConstant& value) 1201{ 1202 src.value = value.value(); 1203} 1204 1205void EncodeSourceVisitor::visit(const InlineConstant& value) 1206{ 1207 (void)value; 1208} 1209 1210 1211 1212const std::map<EAluOp, int> opcode_map = { 1213 1214 {op2_add, ALU_OP2_ADD}, 1215 {op2_mul, ALU_OP2_MUL}, 1216 {op2_mul_ieee, ALU_OP2_MUL_IEEE}, 1217 {op2_max, ALU_OP2_MAX}, 1218 {op2_min, ALU_OP2_MIN}, 1219 {op2_max_dx10, ALU_OP2_MAX_DX10}, 1220 {op2_min_dx10, ALU_OP2_MIN_DX10}, 1221 {op2_sete, ALU_OP2_SETE}, 1222 {op2_setgt, ALU_OP2_SETGT}, 1223 {op2_setge, ALU_OP2_SETGE}, 1224 {op2_setne, ALU_OP2_SETNE}, 1225 {op2_sete_dx10, ALU_OP2_SETE_DX10}, 1226 {op2_setgt_dx10, ALU_OP2_SETGT_DX10}, 1227 {op2_setge_dx10, ALU_OP2_SETGE_DX10}, 1228 {op2_setne_dx10, ALU_OP2_SETNE_DX10}, 1229 {op1_fract, ALU_OP1_FRACT}, 1230 {op1_trunc, ALU_OP1_TRUNC}, 1231 {op1_ceil, ALU_OP1_CEIL}, 1232 {op1_rndne, ALU_OP1_RNDNE}, 1233 {op1_floor, 
    ALU_OP1_FLOOR},
   {op2_ashr_int, ALU_OP2_ASHR_INT},
   {op2_lshr_int, ALU_OP2_LSHR_INT},
   {op2_lshl_int, ALU_OP2_LSHL_INT},
   {op1_mov, ALU_OP1_MOV},
   {op0_nop, ALU_OP0_NOP},
   {op2_mul_64, ALU_OP2_MUL_64},
   {op1v_flt64_to_flt32, ALU_OP1_FLT64_TO_FLT32},
   {op1v_flt32_to_flt64, ALU_OP1_FLT32_TO_FLT64},
   // Predicate / kill ops.  NOTE: despite the name, op2_prede_int also
   // maps to PRED_SETE_INT.
   {op2_prede_int, ALU_OP2_PRED_SETE_INT},
   {op2_pred_setne_int, ALU_OP2_PRED_SETNE_INT},
   {op2_pred_setge_int, ALU_OP2_PRED_SETGE_INT},
   {op2_pred_setgt_int, ALU_OP2_PRED_SETGT_INT},
   {op2_pred_setgt_uint, ALU_OP2_PRED_SETGT_UINT},
   {op2_pred_setge_uint, ALU_OP2_PRED_SETGE_UINT},
   {op2_pred_sete, ALU_OP2_PRED_SETE},
   {op2_pred_setgt, ALU_OP2_PRED_SETGT},
   {op2_pred_setge, ALU_OP2_PRED_SETGE},
   {op2_pred_setne, ALU_OP2_PRED_SETNE},
   {op0_pred_set_clr, ALU_OP0_PRED_SET_CLR},
   {op1_pred_set_restore, ALU_OP1_PRED_SET_RESTORE},
   {op2_pred_sete_push, ALU_OP2_PRED_SETE_PUSH},
   {op2_pred_setgt_push, ALU_OP2_PRED_SETGT_PUSH},
   {op2_pred_setge_push, ALU_OP2_PRED_SETGE_PUSH},
   {op2_pred_setne_push, ALU_OP2_PRED_SETNE_PUSH},
   {op2_kille, ALU_OP2_KILLE},
   {op2_killgt, ALU_OP2_KILLGT},
   {op2_killge, ALU_OP2_KILLGE},
   {op2_killne, ALU_OP2_KILLNE},
   // Integer ALU ops.
   {op2_and_int, ALU_OP2_AND_INT},
   {op2_or_int, ALU_OP2_OR_INT},
   {op2_xor_int, ALU_OP2_XOR_INT},
   {op1_not_int, ALU_OP1_NOT_INT},
   {op2_add_int, ALU_OP2_ADD_INT},
   {op2_sub_int, ALU_OP2_SUB_INT},
   {op2_max_int, ALU_OP2_MAX_INT},
   {op2_min_int, ALU_OP2_MIN_INT},
   {op2_max_uint, ALU_OP2_MAX_UINT},
   {op2_min_uint, ALU_OP2_MIN_UINT},
   {op2_sete_int, ALU_OP2_SETE_INT},
   {op2_setgt_int, ALU_OP2_SETGT_INT},
   {op2_setge_int, ALU_OP2_SETGE_INT},
   {op2_setne_int, ALU_OP2_SETNE_INT},
   {op2_setgt_uint, ALU_OP2_SETGT_UINT},
   {op2_setge_uint, ALU_OP2_SETGE_UINT},
   {op2_killgt_uint, ALU_OP2_KILLGT_UINT},
   {op2_killge_uint, ALU_OP2_KILLGE_UINT},
   // NOTE: the next three keys duplicate entries listed above with the
   // same values; std::map initialization keeps the first occurrence, so
   // these are redundant but harmless.
   {op2_pred_setgt_int, ALU_OP2_PRED_SETGT_INT},
   {op2_pred_setge_int, ALU_OP2_PRED_SETGE_INT},
   {op2_pred_setne_int, ALU_OP2_PRED_SETNE_INT},
   {op2_kille_int, ALU_OP2_KILLE_INT},
   {op2_killgt_int, ALU_OP2_KILLGT_INT},
   {op2_killge_int, ALU_OP2_KILLGE_INT},
   {op2_killne_int, ALU_OP2_KILLNE_INT},
   {op2_pred_sete_push_int, ALU_OP2_PRED_SETE_PUSH_INT},
   {op2_pred_setgt_push_int, ALU_OP2_PRED_SETGT_PUSH_INT},
   {op2_pred_setge_push_int, ALU_OP2_PRED_SETGE_PUSH_INT},
   {op2_pred_setne_push_int, ALU_OP2_PRED_SETNE_PUSH_INT},
   {op2_pred_setlt_push_int, ALU_OP2_PRED_SETLT_PUSH_INT},
   {op2_pred_setle_push_int, ALU_OP2_PRED_SETLE_PUSH_INT},
   {op1_flt_to_int, ALU_OP1_FLT_TO_INT},
   {op1_bfrev_int, ALU_OP1_BFREV_INT},
   {op2_addc_uint, ALU_OP2_ADDC_UINT},
   {op2_subb_uint, ALU_OP2_SUBB_UINT},
   {op0_group_barrier, ALU_OP0_GROUP_BARRIER},
   {op0_group_seq_begin, ALU_OP0_GROUP_SEQ_BEGIN},
   {op0_group_seq_end, ALU_OP0_GROUP_SEQ_END},
   {op2_set_mode, ALU_OP2_SET_MODE},
   // NOTE: the IR names these op1_*, the hardware opcodes are OP0 —
   // intentional mismatch in arity naming, values are correct.
   {op1_set_cf_idx0, ALU_OP0_SET_CF_IDX0},
   {op1_set_cf_idx1, ALU_OP0_SET_CF_IDX1},
   {op2_set_lds_size, ALU_OP2_SET_LDS_SIZE},
   // Transcendental ops.
   {op1_exp_ieee, ALU_OP1_EXP_IEEE},
   {op1_log_clamped, ALU_OP1_LOG_CLAMPED},
   {op1_log_ieee, ALU_OP1_LOG_IEEE},
   {op1_recip_clamped, ALU_OP1_RECIP_CLAMPED},
   {op1_recip_ff, ALU_OP1_RECIP_FF},
   {op1_recip_ieee, ALU_OP1_RECIP_IEEE},
   {op1_recipsqrt_clamped, ALU_OP1_RECIPSQRT_CLAMPED},
   {op1_recipsqrt_ff, ALU_OP1_RECIPSQRT_FF},
   {op1_recipsqrt_ieee1, ALU_OP1_RECIPSQRT_IEEE},
   {op1_sqrt_ieee, ALU_OP1_SQRT_IEEE},
   {op1_sin, ALU_OP1_SIN},
   {op1_cos, ALU_OP1_COS},
   {op2_mullo_int, ALU_OP2_MULLO_INT},
   {op2_mulhi_int, ALU_OP2_MULHI_INT},
   {op2_mullo_uint, ALU_OP2_MULLO_UINT},
   {op2_mulhi_uint, ALU_OP2_MULHI_UINT},
   {op1_recip_int, ALU_OP1_RECIP_INT},
   {op1_recip_uint, ALU_OP1_RECIP_UINT},
   // 64-bit ops: IR op1_* maps onto two-operand hardware encodings here.
   {op1_recip_64, ALU_OP2_RECIP_64},
   {op1_recip_clamped_64, ALU_OP2_RECIP_CLAMPED_64},
   {op1_recipsqrt_64, ALU_OP2_RECIPSQRT_64},
   {op1_recipsqrt_clamped_64, ALU_OP2_RECIPSQRT_CLAMPED_64},
   {op1_sqrt_64, ALU_OP2_SQRT_64},
   // Conversions.
   {op1_flt_to_uint, ALU_OP1_FLT_TO_UINT},
   {op1_int_to_flt, ALU_OP1_INT_TO_FLT},
   {op1_uint_to_flt, ALU_OP1_UINT_TO_FLT},
   {op2_bfm_int, ALU_OP2_BFM_INT},
   {op1_flt32_to_flt16, ALU_OP1_FLT32_TO_FLT16},
   {op1_flt16_to_flt32, ALU_OP1_FLT16_TO_FLT32},
   {op1_ubyte0_flt, ALU_OP1_UBYTE0_FLT},
   {op1_ubyte1_flt, ALU_OP1_UBYTE1_FLT},
   {op1_ubyte2_flt, ALU_OP1_UBYTE2_FLT},
   {op1_ubyte3_flt, ALU_OP1_UBYTE3_FLT},
   {op1_bcnt_int, ALU_OP1_BCNT_INT},
   {op1_ffbh_uint, ALU_OP1_FFBH_UINT},
   {op1_ffbl_int, ALU_OP1_FFBL_INT},
   {op1_ffbh_int, ALU_OP1_FFBH_INT},
   {op1_flt_to_uint4, ALU_OP1_FLT_TO_UINT4},
   {op2_dot_ieee, ALU_OP2_DOT_IEEE},
   {op1_flt_to_int_rpi, ALU_OP1_FLT_TO_INT_RPI},
   {op1_flt_to_int_floor, ALU_OP1_FLT_TO_INT_FLOOR},
   {op2_mulhi_uint24, ALU_OP2_MULHI_UINT24},
   {op1_mbcnt_32hi_int, ALU_OP1_MBCNT_32HI_INT},
   {op1_offset_to_flt, ALU_OP1_OFFSET_TO_FLT},
   {op2_mul_uint24, ALU_OP2_MUL_UINT24},
   {op1_bcnt_accum_prev_int, ALU_OP1_BCNT_ACCUM_PREV_INT},
   {op1_mbcnt_32lo_accum_prev_int, ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT},
   // 64-bit comparison and min/max.
   {op2_sete_64, ALU_OP2_SETE_64},
   {op2_setne_64, ALU_OP2_SETNE_64},
   {op2_setgt_64, ALU_OP2_SETGT_64},
   {op2_setge_64, ALU_OP2_SETGE_64},
   {op2_min_64, ALU_OP2_MIN_64},
   {op2_max_64, ALU_OP2_MAX_64},
   {op2_dot4, ALU_OP2_DOT4},
   {op2_dot4_ieee, ALU_OP2_DOT4_IEEE},
   {op2_cube, ALU_OP2_CUBE},
   {op1_max4, ALU_OP1_MAX4},
   {op1_frexp_64, ALU_OP1_FREXP_64},
   {op1_ldexp_64, ALU_OP2_LDEXP_64},
   {op1_fract_64, ALU_OP1_FRACT_64},
   {op2_pred_setgt_64, ALU_OP2_PRED_SETGT_64},
   {op2_pred_sete_64, ALU_OP2_PRED_SETE_64},
   {op2_pred_setge_64, ALU_OP2_PRED_SETGE_64},
   {op2_add_64, ALU_OP2_ADD_64},
   {op1_mova_int, ALU_OP1_MOVA_INT},
   // NOTE: op1v_flt64_to_flt32 duplicates an entry above (same value,
   // harmless); op1_flt32_to_flt64 is a distinct key from the op1v_
   // variant listed earlier.
   {op1v_flt64_to_flt32, ALU_OP1_FLT64_TO_FLT32},
   {op1_flt32_to_flt64, ALU_OP1_FLT32_TO_FLT64},
   {op2_sad_accum_prev_uint, ALU_OP2_SAD_ACCUM_PREV_UINT},
   {op2_dot, ALU_OP2_DOT},
   // *_prev ops consume the result of the previous ALU slot.
   {op1_mul_prev, ALU_OP1_MUL_PREV},
   {op1_mul_ieee_prev, ALU_OP1_MUL_IEEE_PREV},
   {op1_add_prev, ALU_OP1_ADD_PREV},
   {op2_muladd_prev, ALU_OP2_MULADD_PREV},
   {op2_muladd_ieee_prev, ALU_OP2_MULADD_IEEE_PREV},
   // Interpolation ops (fragment shader input interpolation).
   {op2_interp_xy, ALU_OP2_INTERP_XY},
   {op2_interp_zw, ALU_OP2_INTERP_ZW},
   {op2_interp_x, ALU_OP2_INTERP_X},
   {op2_interp_z, ALU_OP2_INTERP_Z},
   {op0_store_flags, ALU_OP1_STORE_FLAGS},
   {op1_load_store_flags, ALU_OP1_LOAD_STORE_FLAGS},
   {op0_lds_1a, ALU_OP2_LDS_1A},
   {op0_lds_1a1d, ALU_OP2_LDS_1A1D},
   {op0_lds_2a, ALU_OP2_LDS_2A},
   {op1_interp_load_p0, ALU_OP1_INTERP_LOAD_P0},
   {op1_interp_load_p10, ALU_OP1_INTERP_LOAD_P10},
   {op1_interp_load_p20, ALU_OP1_INTERP_LOAD_P20},
   // Three-operand ops.
   {op3_bfe_uint, ALU_OP3_BFE_UINT},
   {op3_bfe_int, ALU_OP3_BFE_INT},
   {op3_bfi_int, ALU_OP3_BFI_INT},
   {op3_fma, ALU_OP3_FMA},
   {op3_cndne_64, ALU_OP3_CNDNE_64},
   {op3_fma_64, ALU_OP3_FMA_64},
   {op3_lerp_uint, ALU_OP3_LERP_UINT},
   {op3_bit_align_int, ALU_OP3_BIT_ALIGN_INT},
   {op3_byte_align_int, ALU_OP3_BYTE_ALIGN_INT},
   {op3_sad_accum_uint, ALU_OP3_SAD_ACCUM_UINT},
   {op3_sad_accum_hi_uint, ALU_OP3_SAD_ACCUM_HI_UINT},
   {op3_muladd_uint24, ALU_OP3_MULADD_UINT24},
   {op3_lds_idx_op, ALU_OP3_LDS_IDX_OP},
   {op3_muladd, ALU_OP3_MULADD},
   {op3_muladd_m2, ALU_OP3_MULADD_M2},
   {op3_muladd_m4, ALU_OP3_MULADD_M4},
   {op3_muladd_d2, ALU_OP3_MULADD_D2},
   {op3_muladd_ieee, ALU_OP3_MULADD_IEEE},
   {op3_cnde, ALU_OP3_CNDE},
   {op3_cndgt, ALU_OP3_CNDGT},
   {op3_cndge, ALU_OP3_CNDGE},
   {op3_cnde_int, ALU_OP3_CNDE_INT},
   {op3_cndgt_int, ALU_OP3_CNDGT_INT},
   {op3_cndge_int, ALU_OP3_CNDGE_INT},
   {op3_mul_lit, ALU_OP3_MUL_LIT},
};

// Translation table from sfn data-share (GDS) ops to the r600 fetch
// opcodes (declared extern near the top of the file; continued below).
const std::map<ESDOp, int>
ds_opcode_map = {
   // Non-returning GDS atomics and writes.
   {DS_OP_ADD, FETCH_OP_GDS_ADD},
   {DS_OP_SUB, FETCH_OP_GDS_SUB},
   {DS_OP_RSUB, FETCH_OP_GDS_RSUB},
   {DS_OP_INC, FETCH_OP_GDS_INC},
   {DS_OP_DEC, FETCH_OP_GDS_DEC},
   {DS_OP_MIN_INT, FETCH_OP_GDS_MIN_INT},
   {DS_OP_MAX_INT, FETCH_OP_GDS_MAX_INT},
   {DS_OP_MIN_UINT, FETCH_OP_GDS_MIN_UINT},
   {DS_OP_MAX_UINT, FETCH_OP_GDS_MAX_UINT},
   {DS_OP_AND, FETCH_OP_GDS_AND},
   {DS_OP_OR, FETCH_OP_GDS_OR},
   {DS_OP_XOR, FETCH_OP_GDS_XOR},
   {DS_OP_MSKOR, FETCH_OP_GDS_MSKOR},
   {DS_OP_WRITE, FETCH_OP_GDS_WRITE},
   {DS_OP_WRITE_REL, FETCH_OP_GDS_WRITE_REL},
   {DS_OP_WRITE2, FETCH_OP_GDS_WRITE2},
   {DS_OP_CMP_STORE, FETCH_OP_GDS_CMP_STORE},
   {DS_OP_CMP_STORE_SPF, FETCH_OP_GDS_CMP_STORE_SPF},
   {DS_OP_BYTE_WRITE, FETCH_OP_GDS_BYTE_WRITE},
   {DS_OP_SHORT_WRITE, FETCH_OP_GDS_SHORT_WRITE},
   // *_RET variants return the pre-op value.
   {DS_OP_ADD_RET, FETCH_OP_GDS_ADD_RET},
   {DS_OP_SUB_RET, FETCH_OP_GDS_SUB_RET},
   {DS_OP_RSUB_RET, FETCH_OP_GDS_RSUB_RET},
   {DS_OP_INC_RET, FETCH_OP_GDS_INC_RET},
   {DS_OP_DEC_RET, FETCH_OP_GDS_DEC_RET},
   {DS_OP_MIN_INT_RET, FETCH_OP_GDS_MIN_INT_RET},
   {DS_OP_MAX_INT_RET, FETCH_OP_GDS_MAX_INT_RET},
   {DS_OP_MIN_UINT_RET, FETCH_OP_GDS_MIN_UINT_RET},
   {DS_OP_MAX_UINT_RET, FETCH_OP_GDS_MAX_UINT_RET},
   {DS_OP_AND_RET, FETCH_OP_GDS_AND_RET},
   {DS_OP_OR_RET, FETCH_OP_GDS_OR_RET},
   {DS_OP_XOR_RET, FETCH_OP_GDS_XOR_RET},
   {DS_OP_MSKOR_RET, FETCH_OP_GDS_MSKOR_RET},
   {DS_OP_XCHG_RET, FETCH_OP_GDS_XCHG_RET},
   {DS_OP_XCHG_REL_RET, FETCH_OP_GDS_XCHG_REL_RET},
   {DS_OP_XCHG2_RET, FETCH_OP_GDS_XCHG2_RET},
   {DS_OP_CMP_XCHG_RET, FETCH_OP_GDS_CMP_XCHG_RET},
   {DS_OP_CMP_XCHG_SPF_RET, FETCH_OP_GDS_CMP_XCHG_SPF_RET},
   {DS_OP_READ_RET, FETCH_OP_GDS_READ_RET},
   {DS_OP_READ_REL_RET, FETCH_OP_GDS_READ_REL_RET},
   {DS_OP_READ2_RET, FETCH_OP_GDS_READ2_RET},
   {DS_OP_READWRITE_RET, FETCH_OP_GDS_READWRITE_RET},
   {DS_OP_BYTE_READ_RET, FETCH_OP_GDS_BYTE_READ_RET},
   {DS_OP_UBYTE_READ_RET, FETCH_OP_GDS_UBYTE_READ_RET},
   {DS_OP_SHORT_READ_RET, FETCH_OP_GDS_SHORT_READ_RET},
   {DS_OP_USHORT_READ_RET, FETCH_OP_GDS_USHORT_READ_RET},
   // NOTE: maps the _RET IR op onto the non-_RET fetch opcode name —
   // presumably the fetch enum has no separate _RET spelling; verify
   // against the fetch opcode definitions before changing.
   {DS_OP_ATOMIC_ORDERED_ALLOC_RET, FETCH_OP_GDS_ATOMIC_ORDERED_ALLOC},
   {DS_OP_INVALID, 0},
};

} // namespace r600