1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright © 2019 Valve Corporation 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21bf215546Sopenharmony_ci * IN THE SOFTWARE. 22bf215546Sopenharmony_ci * 23bf215546Sopenharmony_ci */ 24bf215546Sopenharmony_ci 25bf215546Sopenharmony_ci#include "aco_builder.h" 26bf215546Sopenharmony_ci#include "aco_ir.h" 27bf215546Sopenharmony_ci 28bf215546Sopenharmony_ci#include <algorithm> 29bf215546Sopenharmony_ci#include <bitset> 30bf215546Sopenharmony_ci#include <stack> 31bf215546Sopenharmony_ci#include <vector> 32bf215546Sopenharmony_ci 33bf215546Sopenharmony_cinamespace aco { 34bf215546Sopenharmony_cinamespace { 35bf215546Sopenharmony_ci 36bf215546Sopenharmony_cistruct State { 37bf215546Sopenharmony_ci Program* program; 38bf215546Sopenharmony_ci Block* block; 39bf215546Sopenharmony_ci std::vector<aco_ptr<Instruction>> old_instructions; 40bf215546Sopenharmony_ci}; 41bf215546Sopenharmony_ci 42bf215546Sopenharmony_cistruct NOP_ctx_gfx6 { 43bf215546Sopenharmony_ci void join(const NOP_ctx_gfx6& other) 44bf215546Sopenharmony_ci { 45bf215546Sopenharmony_ci set_vskip_mode_then_vector = 46bf215546Sopenharmony_ci MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector); 47bf215546Sopenharmony_ci valu_wr_vcc_then_vccz = MAX2(valu_wr_vcc_then_vccz, other.valu_wr_vcc_then_vccz); 48bf215546Sopenharmony_ci valu_wr_exec_then_execz = MAX2(valu_wr_exec_then_execz, other.valu_wr_exec_then_execz); 49bf215546Sopenharmony_ci valu_wr_vcc_then_div_fmas = MAX2(valu_wr_vcc_then_div_fmas, other.valu_wr_vcc_then_div_fmas); 50bf215546Sopenharmony_ci salu_wr_m0_then_gds_msg_ttrace = 51bf215546Sopenharmony_ci MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace); 52bf215546Sopenharmony_ci valu_wr_exec_then_dpp = MAX2(valu_wr_exec_then_dpp, other.valu_wr_exec_then_dpp); 53bf215546Sopenharmony_ci salu_wr_m0_then_lds = MAX2(salu_wr_m0_then_lds, other.salu_wr_m0_then_lds); 54bf215546Sopenharmony_ci salu_wr_m0_then_moverel = MAX2(salu_wr_m0_then_moverel, other.salu_wr_m0_then_moverel); 55bf215546Sopenharmony_ci setreg_then_getsetreg = MAX2(setreg_then_getsetreg, other.setreg_then_getsetreg); 56bf215546Sopenharmony_ci vmem_store_then_wr_data |= other.vmem_store_then_wr_data; 57bf215546Sopenharmony_ci smem_clause |= other.smem_clause; 58bf215546Sopenharmony_ci smem_write |= other.smem_write; 59bf215546Sopenharmony_ci for (unsigned i = 0; i < BITSET_WORDS(128); i++) { 60bf215546Sopenharmony_ci smem_clause_read_write[i] |= other.smem_clause_read_write[i]; 61bf215546Sopenharmony_ci smem_clause_write[i] |= other.smem_clause_write[i]; 62bf215546Sopenharmony_ci } 63bf215546Sopenharmony_ci } 64bf215546Sopenharmony_ci 65bf215546Sopenharmony_ci bool operator==(const NOP_ctx_gfx6& other) 66bf215546Sopenharmony_ci { 67bf215546Sopenharmony_ci return set_vskip_mode_then_vector == other.set_vskip_mode_then_vector && 68bf215546Sopenharmony_ci valu_wr_vcc_then_vccz == other.valu_wr_vcc_then_vccz && 69bf215546Sopenharmony_ci valu_wr_exec_then_execz == other.valu_wr_exec_then_execz && 70bf215546Sopenharmony_ci valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas && 71bf215546Sopenharmony_ci vmem_store_then_wr_data == other.vmem_store_then_wr_data && 72bf215546Sopenharmony_ci salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace && 73bf215546Sopenharmony_ci valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp && 74bf215546Sopenharmony_ci salu_wr_m0_then_lds == other.salu_wr_m0_then_lds && 75bf215546Sopenharmony_ci salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel && 76bf215546Sopenharmony_ci setreg_then_getsetreg == other.setreg_then_getsetreg && 77bf215546Sopenharmony_ci smem_clause == other.smem_clause && smem_write == other.smem_write && 78bf215546Sopenharmony_ci BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) && 79bf215546Sopenharmony_ci BITSET_EQUAL(smem_clause_write, other.smem_clause_write); 80bf215546Sopenharmony_ci } 81bf215546Sopenharmony_ci 82bf215546Sopenharmony_ci void add_wait_states(unsigned amount) 83bf215546Sopenharmony_ci { 84bf215546Sopenharmony_ci if ((set_vskip_mode_then_vector -= amount) < 0) 85bf215546Sopenharmony_ci set_vskip_mode_then_vector = 0; 86bf215546Sopenharmony_ci 87bf215546Sopenharmony_ci if ((valu_wr_vcc_then_vccz -= amount) < 0) 88bf215546Sopenharmony_ci valu_wr_vcc_then_vccz = 0; 89bf215546Sopenharmony_ci 90bf215546Sopenharmony_ci if ((valu_wr_exec_then_execz -= amount) < 0) 91bf215546Sopenharmony_ci valu_wr_exec_then_execz = 0; 92bf215546Sopenharmony_ci 93bf215546Sopenharmony_ci if ((valu_wr_vcc_then_div_fmas -= amount) < 0) 94bf215546Sopenharmony_ci valu_wr_vcc_then_div_fmas = 0; 95bf215546Sopenharmony_ci 96bf215546Sopenharmony_ci if ((salu_wr_m0_then_gds_msg_ttrace -= amount) < 0) 97bf215546Sopenharmony_ci salu_wr_m0_then_gds_msg_ttrace = 0; 98bf215546Sopenharmony_ci 99bf215546Sopenharmony_ci if ((valu_wr_exec_then_dpp -= amount) < 0) 100bf215546Sopenharmony_ci valu_wr_exec_then_dpp = 0; 101bf215546Sopenharmony_ci 102bf215546Sopenharmony_ci if ((salu_wr_m0_then_lds -= amount) < 0) 103bf215546Sopenharmony_ci salu_wr_m0_then_lds = 0; 104bf215546Sopenharmony_ci 105bf215546Sopenharmony_ci if ((salu_wr_m0_then_moverel -= amount) < 0) 106bf215546Sopenharmony_ci salu_wr_m0_then_moverel = 0; 107bf215546Sopenharmony_ci 108bf215546Sopenharmony_ci if ((setreg_then_getsetreg -= amount) < 0) 109bf215546Sopenharmony_ci setreg_then_getsetreg = 0; 110bf215546Sopenharmony_ci 111bf215546Sopenharmony_ci vmem_store_then_wr_data.reset(); 112bf215546Sopenharmony_ci } 113bf215546Sopenharmony_ci 114bf215546Sopenharmony_ci /* setting MODE.vskip and then any vector op requires 2 wait states */ 115bf215546Sopenharmony_ci int8_t set_vskip_mode_then_vector = 0; 116bf215546Sopenharmony_ci 117bf215546Sopenharmony_ci /* VALU writing VCC/EXEC and then a VALU reading VCCZ/EXECZ requires 5 wait states */ 118bf215546Sopenharmony_ci int8_t valu_wr_vcc_then_vccz = 0; 119bf215546Sopenharmony_ci int8_t valu_wr_exec_then_execz = 0; 120bf215546Sopenharmony_ci 121bf215546Sopenharmony_ci /* VALU writing VCC followed by v_div_fmas require 4 wait states */ 122bf215546Sopenharmony_ci int8_t valu_wr_vcc_then_div_fmas = 0; 123bf215546Sopenharmony_ci 124bf215546Sopenharmony_ci /* SALU writing M0 followed by GDS, s_sendmsg or s_ttrace_data requires 1 wait state */ 125bf215546Sopenharmony_ci int8_t salu_wr_m0_then_gds_msg_ttrace = 0; 126bf215546Sopenharmony_ci 127bf215546Sopenharmony_ci /* VALU writing EXEC followed by DPP requires 5 wait states */ 128bf215546Sopenharmony_ci int8_t valu_wr_exec_then_dpp = 0; 129bf215546Sopenharmony_ci 130bf215546Sopenharmony_ci /* SALU writing M0 followed by some LDS instructions requires 1 wait state on GFX10 */ 131bf215546Sopenharmony_ci int8_t salu_wr_m0_then_lds = 0; 132bf215546Sopenharmony_ci 133bf215546Sopenharmony_ci /* SALU writing M0 followed by s_moverel requires 1 wait state on GFX9 */ 134bf215546Sopenharmony_ci int8_t salu_wr_m0_then_moverel = 0; 135bf215546Sopenharmony_ci 136bf215546Sopenharmony_ci /* s_setreg followed by a s_getreg/s_setreg of the same register needs 2 wait states 137bf215546Sopenharmony_ci * currently we don't look at the actual register */ 138bf215546Sopenharmony_ci int8_t setreg_then_getsetreg = 0; 139bf215546Sopenharmony_ci 140bf215546Sopenharmony_ci /* some memory instructions writing >64bit followed by a instructions 141bf215546Sopenharmony_ci * writing the VGPRs holding the writedata requires 1 wait state */ 142bf215546Sopenharmony_ci std::bitset<256> vmem_store_then_wr_data; 143bf215546Sopenharmony_ci 144bf215546Sopenharmony_ci /* we break up SMEM clauses that contain stores or overwrite an 145bf215546Sopenharmony_ci * operand/definition of another instruction in the clause */ 146bf215546Sopenharmony_ci bool smem_clause = false; 147bf215546Sopenharmony_ci bool smem_write = false; 148bf215546Sopenharmony_ci BITSET_DECLARE(smem_clause_read_write, 128) = {0}; 149bf215546Sopenharmony_ci BITSET_DECLARE(smem_clause_write, 128) = {0}; 150bf215546Sopenharmony_ci}; 151bf215546Sopenharmony_ci 152bf215546Sopenharmony_cistruct NOP_ctx_gfx10 { 153bf215546Sopenharmony_ci bool has_VOPC = false; 154bf215546Sopenharmony_ci bool has_nonVALU_exec_read = false; 155bf215546Sopenharmony_ci bool has_VMEM = false; 156bf215546Sopenharmony_ci bool has_branch_after_VMEM = false; 157bf215546Sopenharmony_ci bool has_DS = false; 158bf215546Sopenharmony_ci bool has_branch_after_DS = false; 159bf215546Sopenharmony_ci bool has_NSA_MIMG = false; 160bf215546Sopenharmony_ci bool has_writelane = false; 161bf215546Sopenharmony_ci std::bitset<128> sgprs_read_by_VMEM; 162bf215546Sopenharmony_ci std::bitset<128> sgprs_read_by_VMEM_store; 163bf215546Sopenharmony_ci std::bitset<128> sgprs_read_by_DS; 164bf215546Sopenharmony_ci std::bitset<128> sgprs_read_by_SMEM; 165bf215546Sopenharmony_ci 166bf215546Sopenharmony_ci void join(const NOP_ctx_gfx10& other) 167bf215546Sopenharmony_ci { 168bf215546Sopenharmony_ci has_VOPC |= other.has_VOPC; 169bf215546Sopenharmony_ci has_nonVALU_exec_read |= other.has_nonVALU_exec_read; 170bf215546Sopenharmony_ci has_VMEM |= other.has_VMEM; 171bf215546Sopenharmony_ci has_branch_after_VMEM |= other.has_branch_after_VMEM; 172bf215546Sopenharmony_ci has_DS |= other.has_DS; 173bf215546Sopenharmony_ci has_branch_after_DS |= other.has_branch_after_DS; 174bf215546Sopenharmony_ci has_NSA_MIMG |= other.has_NSA_MIMG; 175bf215546Sopenharmony_ci has_writelane |= other.has_writelane; 176bf215546Sopenharmony_ci sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM; 177bf215546Sopenharmony_ci sgprs_read_by_DS |= other.sgprs_read_by_DS; 178bf215546Sopenharmony_ci sgprs_read_by_VMEM_store |= other.sgprs_read_by_VMEM_store; 179bf215546Sopenharmony_ci sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM; 180bf215546Sopenharmony_ci } 181bf215546Sopenharmony_ci 182bf215546Sopenharmony_ci bool operator==(const NOP_ctx_gfx10& other) 183bf215546Sopenharmony_ci { 184bf215546Sopenharmony_ci return has_VOPC == other.has_VOPC && has_nonVALU_exec_read == other.has_nonVALU_exec_read && 185bf215546Sopenharmony_ci has_VMEM == other.has_VMEM && has_branch_after_VMEM == other.has_branch_after_VMEM && 186bf215546Sopenharmony_ci has_DS == other.has_DS && has_branch_after_DS == other.has_branch_after_DS && 187bf215546Sopenharmony_ci has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane && 188bf215546Sopenharmony_ci sgprs_read_by_VMEM == other.sgprs_read_by_VMEM && 189bf215546Sopenharmony_ci sgprs_read_by_DS == other.sgprs_read_by_DS && 190bf215546Sopenharmony_ci sgprs_read_by_VMEM_store == other.sgprs_read_by_VMEM_store && 191bf215546Sopenharmony_ci sgprs_read_by_SMEM == other.sgprs_read_by_SMEM; 192bf215546Sopenharmony_ci } 193bf215546Sopenharmony_ci}; 194bf215546Sopenharmony_ci 195bf215546Sopenharmony_ciint 196bf215546Sopenharmony_ciget_wait_states(aco_ptr<Instruction>& instr) 197bf215546Sopenharmony_ci{ 198bf215546Sopenharmony_ci if (instr->opcode == aco_opcode::s_nop) 199bf215546Sopenharmony_ci return instr->sopp().imm + 1; 200bf215546Sopenharmony_ci else if (instr->opcode == aco_opcode::p_constaddr) 201bf215546Sopenharmony_ci return 3; /* lowered to 3 instructions in the assembler */ 202bf215546Sopenharmony_ci else 203bf215546Sopenharmony_ci return 1; 204bf215546Sopenharmony_ci} 205bf215546Sopenharmony_ci 206bf215546Sopenharmony_cibool 207bf215546Sopenharmony_ciregs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size) 208bf215546Sopenharmony_ci{ 209bf215546Sopenharmony_ci return a_reg > b_reg ? (a_reg - b_reg < b_size) : (b_reg - a_reg < a_size); 210bf215546Sopenharmony_ci} 211bf215546Sopenharmony_ci 212bf215546Sopenharmony_citemplate <bool Valu, bool Vintrp, bool Salu> 213bf215546Sopenharmony_cibool 214bf215546Sopenharmony_cihandle_raw_hazard_instr(aco_ptr<Instruction>& pred, PhysReg reg, int* nops_needed, uint32_t* mask) 215bf215546Sopenharmony_ci{ 216bf215546Sopenharmony_ci unsigned mask_size = util_last_bit(*mask); 217bf215546Sopenharmony_ci 218bf215546Sopenharmony_ci uint32_t writemask = 0; 219bf215546Sopenharmony_ci for (Definition& def : pred->definitions) { 220bf215546Sopenharmony_ci if (regs_intersect(reg, mask_size, def.physReg(), def.size())) { 221bf215546Sopenharmony_ci unsigned start = def.physReg() > reg ? def.physReg() - reg : 0; 222bf215546Sopenharmony_ci unsigned end = MIN2(mask_size, start + def.size()); 223bf215546Sopenharmony_ci writemask |= u_bit_consecutive(start, end - start); 224bf215546Sopenharmony_ci } 225bf215546Sopenharmony_ci } 226bf215546Sopenharmony_ci 227bf215546Sopenharmony_ci bool is_hazard = writemask != 0 && ((pred->isVALU() && Valu) || (pred->isVINTRP() && Vintrp) || 228bf215546Sopenharmony_ci (pred->isSALU() && Salu)); 229bf215546Sopenharmony_ci if (is_hazard) 230bf215546Sopenharmony_ci return true; 231bf215546Sopenharmony_ci 232bf215546Sopenharmony_ci *mask &= ~writemask; 233bf215546Sopenharmony_ci *nops_needed = MAX2(*nops_needed - get_wait_states(pred), 0); 234bf215546Sopenharmony_ci 235bf215546Sopenharmony_ci if (*mask == 0) 236bf215546Sopenharmony_ci *nops_needed = 0; 237bf215546Sopenharmony_ci 238bf215546Sopenharmony_ci return *nops_needed == 0; 239bf215546Sopenharmony_ci} 240bf215546Sopenharmony_ci 241bf215546Sopenharmony_citemplate <bool Valu, bool Vintrp, bool Salu> 242bf215546Sopenharmony_ciint 243bf215546Sopenharmony_cihandle_raw_hazard_internal(State& state, Block* block, int nops_needed, PhysReg reg, uint32_t mask, 244bf215546Sopenharmony_ci bool start_at_end) 245bf215546Sopenharmony_ci{ 246bf215546Sopenharmony_ci if (block == state.block && start_at_end) { 247bf215546Sopenharmony_ci /* If it's the current block, block->instructions is incomplete. */ 248bf215546Sopenharmony_ci for (int pred_idx = state.old_instructions.size() - 1; pred_idx >= 0; pred_idx--) { 249bf215546Sopenharmony_ci aco_ptr<Instruction>& instr = state.old_instructions[pred_idx]; 250bf215546Sopenharmony_ci if (!instr) 251bf215546Sopenharmony_ci break; /* Instruction has been moved to block->instructions. */ 252bf215546Sopenharmony_ci if (handle_raw_hazard_instr<Valu, Vintrp, Salu>(instr, reg, &nops_needed, &mask)) 253bf215546Sopenharmony_ci return nops_needed; 254bf215546Sopenharmony_ci } 255bf215546Sopenharmony_ci } 256bf215546Sopenharmony_ci for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) { 257bf215546Sopenharmony_ci if (handle_raw_hazard_instr<Valu, Vintrp, Salu>(block->instructions[pred_idx], reg, 258bf215546Sopenharmony_ci &nops_needed, &mask)) 259bf215546Sopenharmony_ci return nops_needed; 260bf215546Sopenharmony_ci } 261bf215546Sopenharmony_ci 262bf215546Sopenharmony_ci int res = 0; 263bf215546Sopenharmony_ci 264bf215546Sopenharmony_ci /* Loops require branch instructions, which count towards the wait 265bf215546Sopenharmony_ci * states. So even with loops this should finish unless nops_needed is some 266bf215546Sopenharmony_ci * huge value. */ 267bf215546Sopenharmony_ci for (unsigned lin_pred : block->linear_preds) { 268bf215546Sopenharmony_ci res = 269bf215546Sopenharmony_ci std::max(res, handle_raw_hazard_internal<Valu, Vintrp, Salu>( 270bf215546Sopenharmony_ci state, &state.program->blocks[lin_pred], nops_needed, reg, mask, true)); 271bf215546Sopenharmony_ci } 272bf215546Sopenharmony_ci return res; 273bf215546Sopenharmony_ci} 274bf215546Sopenharmony_ci 275bf215546Sopenharmony_citemplate <bool Valu, bool Vintrp, bool Salu> 276bf215546Sopenharmony_civoid 277bf215546Sopenharmony_cihandle_raw_hazard(State& state, int* NOPs, int min_states, Operand op) 278bf215546Sopenharmony_ci{ 279bf215546Sopenharmony_ci if (*NOPs >= min_states) 280bf215546Sopenharmony_ci return; 281bf215546Sopenharmony_ci int res = handle_raw_hazard_internal<Valu, Vintrp, Salu>( 282bf215546Sopenharmony_ci state, state.block, min_states, op.physReg(), u_bit_consecutive(0, op.size()), false); 283bf215546Sopenharmony_ci *NOPs = MAX2(*NOPs, res); 284bf215546Sopenharmony_ci} 285bf215546Sopenharmony_ci 286bf215546Sopenharmony_cistatic auto handle_valu_then_read_hazard = handle_raw_hazard<true, true, false>; 287bf215546Sopenharmony_cistatic auto handle_vintrp_then_read_hazard = handle_raw_hazard<false, true, false>; 288bf215546Sopenharmony_cistatic auto handle_valu_salu_then_read_hazard = handle_raw_hazard<true, true, true>; 289bf215546Sopenharmony_ci 290bf215546Sopenharmony_civoid 291bf215546Sopenharmony_ciset_bitset_range(BITSET_WORD* words, unsigned start, unsigned size) 292bf215546Sopenharmony_ci{ 293bf215546Sopenharmony_ci unsigned end = start + size - 1; 294bf215546Sopenharmony_ci unsigned start_mod = start % BITSET_WORDBITS; 295bf215546Sopenharmony_ci if (start_mod + size <= BITSET_WORDBITS) { 296bf215546Sopenharmony_ci BITSET_SET_RANGE_INSIDE_WORD(words, start, end); 297bf215546Sopenharmony_ci } else { 298bf215546Sopenharmony_ci unsigned first_size = BITSET_WORDBITS - start_mod; 299bf215546Sopenharmony_ci set_bitset_range(words, start, BITSET_WORDBITS - start_mod); 300bf215546Sopenharmony_ci set_bitset_range(words, start + first_size, size - first_size); 301bf215546Sopenharmony_ci } 302bf215546Sopenharmony_ci} 303bf215546Sopenharmony_ci 304bf215546Sopenharmony_cibool 305bf215546Sopenharmony_citest_bitset_range(BITSET_WORD* words, unsigned start, unsigned size) 306bf215546Sopenharmony_ci{ 307bf215546Sopenharmony_ci unsigned end = start + size - 1; 308bf215546Sopenharmony_ci unsigned start_mod = start % BITSET_WORDBITS; 309bf215546Sopenharmony_ci if (start_mod + size <= BITSET_WORDBITS) { 310bf215546Sopenharmony_ci return BITSET_TEST_RANGE(words, start, end); 311bf215546Sopenharmony_ci } else { 312bf215546Sopenharmony_ci unsigned first_size = BITSET_WORDBITS - start_mod; 313bf215546Sopenharmony_ci return test_bitset_range(words, start, BITSET_WORDBITS - start_mod) || 314bf215546Sopenharmony_ci test_bitset_range(words, start + first_size, size - first_size); 315bf215546Sopenharmony_ci } 316bf215546Sopenharmony_ci} 317bf215546Sopenharmony_ci 318bf215546Sopenharmony_ci/* A SMEM clause is any group of consecutive SMEM instructions. The 319bf215546Sopenharmony_ci * instructions in this group may return out of order and/or may be replayed. 320bf215546Sopenharmony_ci * 321bf215546Sopenharmony_ci * To fix this potential hazard correctly, we have to make sure that when a 322bf215546Sopenharmony_ci * clause has more than one instruction, no instruction in the clause writes 323bf215546Sopenharmony_ci * to a register that is read by another instruction in the clause (including 324bf215546Sopenharmony_ci * itself). In this case, we have to break the SMEM clause by inserting non 325bf215546Sopenharmony_ci * SMEM instructions. 326bf215546Sopenharmony_ci * 327bf215546Sopenharmony_ci * SMEM clauses are only present on GFX8+, and only matter when XNACK is set. 328bf215546Sopenharmony_ci */ 329bf215546Sopenharmony_civoid 330bf215546Sopenharmony_cihandle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr, 331bf215546Sopenharmony_ci int* NOPs) 332bf215546Sopenharmony_ci{ 333bf215546Sopenharmony_ci /* break off from previous SMEM clause if needed */ 334bf215546Sopenharmony_ci if (!*NOPs & (ctx.smem_clause || ctx.smem_write)) { 335bf215546Sopenharmony_ci /* Don't allow clauses with store instructions since the clause's 336bf215546Sopenharmony_ci * instructions may use the same address. */ 337bf215546Sopenharmony_ci if (ctx.smem_write || instr->definitions.empty() || 338bf215546Sopenharmony_ci instr_info.is_atomic[(unsigned)instr->opcode]) { 339bf215546Sopenharmony_ci *NOPs = 1; 340bf215546Sopenharmony_ci } else if (program->dev.xnack_enabled) { 341bf215546Sopenharmony_ci for (Operand op : instr->operands) { 342bf215546Sopenharmony_ci if (!op.isConstant() && 343bf215546Sopenharmony_ci test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) { 344bf215546Sopenharmony_ci *NOPs = 1; 345bf215546Sopenharmony_ci break; 346bf215546Sopenharmony_ci } 347bf215546Sopenharmony_ci } 348bf215546Sopenharmony_ci 349bf215546Sopenharmony_ci Definition def = instr->definitions[0]; 350bf215546Sopenharmony_ci if (!*NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size())) 351bf215546Sopenharmony_ci *NOPs = 1; 352bf215546Sopenharmony_ci } 353bf215546Sopenharmony_ci } 354bf215546Sopenharmony_ci} 355bf215546Sopenharmony_ci 356bf215546Sopenharmony_ci/* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */ 357bf215546Sopenharmony_civoid 358bf215546Sopenharmony_cihandle_instruction_gfx6(State& state, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr, 359bf215546Sopenharmony_ci std::vector<aco_ptr<Instruction>>& new_instructions) 360bf215546Sopenharmony_ci{ 361bf215546Sopenharmony_ci /* check hazards */ 362bf215546Sopenharmony_ci int NOPs = 0; 363bf215546Sopenharmony_ci 364bf215546Sopenharmony_ci if (instr->isSMEM()) { 365bf215546Sopenharmony_ci if (state.program->gfx_level == GFX6) { 366bf215546Sopenharmony_ci /* A read of an SGPR by SMRD instruction requires 4 wait states 367bf215546Sopenharmony_ci * when the SGPR was written by a VALU instruction. According to LLVM, 368bf215546Sopenharmony_ci * there is also an undocumented hardware behavior when the buffer 369bf215546Sopenharmony_ci * descriptor is written by a SALU instruction */ 370bf215546Sopenharmony_ci for (unsigned i = 0; i < instr->operands.size(); i++) { 371bf215546Sopenharmony_ci Operand op = instr->operands[i]; 372bf215546Sopenharmony_ci if (op.isConstant()) 373bf215546Sopenharmony_ci continue; 374bf215546Sopenharmony_ci 375bf215546Sopenharmony_ci bool is_buffer_desc = i == 0 && op.size() > 2; 376bf215546Sopenharmony_ci if (is_buffer_desc) 377bf215546Sopenharmony_ci handle_valu_salu_then_read_hazard(state, &NOPs, 4, op); 378bf215546Sopenharmony_ci else 379bf215546Sopenharmony_ci handle_valu_then_read_hazard(state, &NOPs, 4, op); 380bf215546Sopenharmony_ci } 381bf215546Sopenharmony_ci } 382bf215546Sopenharmony_ci 383bf215546Sopenharmony_ci handle_smem_clause_hazards(state.program, ctx, instr, &NOPs); 384bf215546Sopenharmony_ci } else if (instr->isSALU()) { 385bf215546Sopenharmony_ci if (instr->opcode == aco_opcode::s_setreg_b32 || 386bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_setreg_imm32_b32 || 387bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_getreg_b32) { 388bf215546Sopenharmony_ci NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg); 389bf215546Sopenharmony_ci } 390bf215546Sopenharmony_ci 391bf215546Sopenharmony_ci if (state.program->gfx_level == GFX9) { 392bf215546Sopenharmony_ci if (instr->opcode == aco_opcode::s_movrels_b32 || 393bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_movrels_b64 || 394bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_movreld_b32 || 395bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_movreld_b64) { 396bf215546Sopenharmony_ci NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel); 397bf215546Sopenharmony_ci } 398bf215546Sopenharmony_ci } 399bf215546Sopenharmony_ci 400bf215546Sopenharmony_ci if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_ttracedata) 401bf215546Sopenharmony_ci NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace); 402bf215546Sopenharmony_ci } else if (instr->isDS() && instr->ds().gds) { 403bf215546Sopenharmony_ci NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace); 404bf215546Sopenharmony_ci } else if (instr->isVALU() || instr->isVINTRP()) { 405bf215546Sopenharmony_ci for (Operand op : instr->operands) { 406bf215546Sopenharmony_ci if (op.physReg() == vccz) 407bf215546Sopenharmony_ci NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_vccz); 408bf215546Sopenharmony_ci if (op.physReg() == execz) 409bf215546Sopenharmony_ci NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_execz); 410bf215546Sopenharmony_ci } 411bf215546Sopenharmony_ci 412bf215546Sopenharmony_ci if (instr->isDPP()) { 413bf215546Sopenharmony_ci NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp); 414bf215546Sopenharmony_ci handle_valu_then_read_hazard(state, &NOPs, 2, instr->operands[0]); 415bf215546Sopenharmony_ci } 416bf215546Sopenharmony_ci 417bf215546Sopenharmony_ci for (Definition def : instr->definitions) { 418bf215546Sopenharmony_ci if (def.regClass().type() != RegType::sgpr) { 419bf215546Sopenharmony_ci for (unsigned i = 0; i < def.size(); i++) 420bf215546Sopenharmony_ci NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data[(def.physReg() & 0xff) + i]); 421bf215546Sopenharmony_ci } 422bf215546Sopenharmony_ci } 423bf215546Sopenharmony_ci 424bf215546Sopenharmony_ci if ((instr->opcode == aco_opcode::v_readlane_b32 || 425bf215546Sopenharmony_ci instr->opcode == aco_opcode::v_readlane_b32_e64 || 426bf215546Sopenharmony_ci instr->opcode == aco_opcode::v_writelane_b32 || 427bf215546Sopenharmony_ci instr->opcode == aco_opcode::v_writelane_b32_e64) && 428bf215546Sopenharmony_ci !instr->operands[1].isConstant()) { 429bf215546Sopenharmony_ci handle_valu_then_read_hazard(state, &NOPs, 4, instr->operands[1]); 430bf215546Sopenharmony_ci } 431bf215546Sopenharmony_ci 432bf215546Sopenharmony_ci /* It's required to insert 1 wait state if the dst VGPR of any v_interp_* 433bf215546Sopenharmony_ci * is followed by a read with v_readfirstlane or v_readlane to fix GPU 434bf215546Sopenharmony_ci * hangs on GFX6. Note that v_writelane_* is apparently not affected. 435bf215546Sopenharmony_ci * This hazard isn't documented anywhere but AMD confirmed that hazard. 436bf215546Sopenharmony_ci */ 437bf215546Sopenharmony_ci if (state.program->gfx_level == GFX6 && 438bf215546Sopenharmony_ci (instr->opcode == aco_opcode::v_readlane_b32 || /* GFX6 doesn't have v_readlane_b32_e64 */ 439bf215546Sopenharmony_ci instr->opcode == aco_opcode::v_readfirstlane_b32)) { 440bf215546Sopenharmony_ci handle_vintrp_then_read_hazard(state, &NOPs, 1, instr->operands[0]); 441bf215546Sopenharmony_ci } 442bf215546Sopenharmony_ci 443bf215546Sopenharmony_ci if (instr->opcode == aco_opcode::v_div_fmas_f32 || 444bf215546Sopenharmony_ci instr->opcode == aco_opcode::v_div_fmas_f64) 445bf215546Sopenharmony_ci NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas); 446bf215546Sopenharmony_ci } else if (instr->isVMEM() || instr->isFlatLike()) { 447bf215546Sopenharmony_ci /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */ 448bf215546Sopenharmony_ci for (Operand op : instr->operands) { 449bf215546Sopenharmony_ci if (!op.isConstant() && !op.isUndefined() && op.regClass().type() == RegType::sgpr) 450bf215546Sopenharmony_ci handle_valu_then_read_hazard(state, &NOPs, 5, op); 451bf215546Sopenharmony_ci } 452bf215546Sopenharmony_ci } 453bf215546Sopenharmony_ci 454bf215546Sopenharmony_ci if (!instr->isSALU() && instr->format != Format::SMEM) 455bf215546Sopenharmony_ci NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector); 456bf215546Sopenharmony_ci 457bf215546Sopenharmony_ci if (state.program->gfx_level == GFX9) { 458bf215546Sopenharmony_ci bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds; 459bf215546Sopenharmony_ci if (instr->isVINTRP() || lds_scratch_global || 460bf215546Sopenharmony_ci instr->opcode == aco_opcode::ds_read_addtid_b32 || 461bf215546Sopenharmony_ci instr->opcode == aco_opcode::ds_write_addtid_b32 || 462bf215546Sopenharmony_ci instr->opcode == aco_opcode::buffer_store_lds_dword) { 463bf215546Sopenharmony_ci NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds); 464bf215546Sopenharmony_ci } 465bf215546Sopenharmony_ci } 466bf215546Sopenharmony_ci 467bf215546Sopenharmony_ci ctx.add_wait_states(NOPs + get_wait_states(instr)); 468bf215546Sopenharmony_ci 469bf215546Sopenharmony_ci // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles 470bf215546Sopenharmony_ci if (NOPs) { 471bf215546Sopenharmony_ci /* create NOP */ 472bf215546Sopenharmony_ci aco_ptr<SOPP_instruction> nop{ 473bf215546Sopenharmony_ci create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)}; 474bf215546Sopenharmony_ci nop->imm = NOPs - 1; 475bf215546Sopenharmony_ci nop->block = -1; 476bf215546Sopenharmony_ci new_instructions.emplace_back(std::move(nop)); 477bf215546Sopenharmony_ci } 478bf215546Sopenharmony_ci 479bf215546Sopenharmony_ci /* update information to check for later hazards */ 480bf215546Sopenharmony_ci if ((ctx.smem_clause || ctx.smem_write) && (NOPs || instr->format != Format::SMEM)) { 481bf215546Sopenharmony_ci ctx.smem_clause = false; 482bf215546Sopenharmony_ci ctx.smem_write = false; 483bf215546Sopenharmony_ci 484bf215546Sopenharmony_ci if (state.program->dev.xnack_enabled) { 485bf215546Sopenharmony_ci BITSET_ZERO(ctx.smem_clause_read_write); 486bf215546Sopenharmony_ci BITSET_ZERO(ctx.smem_clause_write); 487bf215546Sopenharmony_ci } 488bf215546Sopenharmony_ci } 489bf215546Sopenharmony_ci 490bf215546Sopenharmony_ci if (instr->isSMEM()) { 491bf215546Sopenharmony_ci if (instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) { 492bf215546Sopenharmony_ci ctx.smem_write = true; 493bf215546Sopenharmony_ci } else { 494bf215546Sopenharmony_ci ctx.smem_clause = true; 495bf215546Sopenharmony_ci 496bf215546Sopenharmony_ci if (state.program->dev.xnack_enabled) { 497bf215546Sopenharmony_ci for (Operand op : instr->operands) { 498bf215546Sopenharmony_ci if (!op.isConstant()) { 499bf215546Sopenharmony_ci set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size()); 500bf215546Sopenharmony_ci } 501bf215546Sopenharmony_ci } 502bf215546Sopenharmony_ci 503bf215546Sopenharmony_ci Definition def = instr->definitions[0]; 504bf215546Sopenharmony_ci set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()); 505bf215546Sopenharmony_ci set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size()); 506bf215546Sopenharmony_ci } 507bf215546Sopenharmony_ci } 508bf215546Sopenharmony_ci } else if (instr->isVALU()) { 509bf215546Sopenharmony_ci for (Definition def : instr->definitions) { 510bf215546Sopenharmony_ci if (def.regClass().type() == RegType::sgpr) { 511bf215546Sopenharmony_ci if (def.physReg() == vcc || def.physReg() == vcc_hi) { 512bf215546Sopenharmony_ci ctx.valu_wr_vcc_then_vccz = 5; 513bf215546Sopenharmony_ci ctx.valu_wr_vcc_then_div_fmas = 4; 514bf215546Sopenharmony_ci } 515bf215546Sopenharmony_ci if (def.physReg() == exec || def.physReg() == exec_hi) { 516bf215546Sopenharmony_ci ctx.valu_wr_exec_then_execz = 5; 517bf215546Sopenharmony_ci ctx.valu_wr_exec_then_dpp = 5; 518bf215546Sopenharmony_ci } 519bf215546Sopenharmony_ci } 520bf215546Sopenharmony_ci } 521bf215546Sopenharmony_ci } else if (instr->isSALU() && !instr->definitions.empty()) { 522bf215546Sopenharmony_ci if (!instr->definitions.empty()) { 523bf215546Sopenharmony_ci /* all other definitions should be SCC */ 524bf215546Sopenharmony_ci Definition def = instr->definitions[0]; 525bf215546Sopenharmony_ci if (def.physReg() == m0) { 526bf215546Sopenharmony_ci ctx.salu_wr_m0_then_gds_msg_ttrace = 1; 527bf215546Sopenharmony_ci ctx.salu_wr_m0_then_lds = 1; 528bf215546Sopenharmony_ci ctx.salu_wr_m0_then_moverel = 1; 529bf215546Sopenharmony_ci } 530bf215546Sopenharmony_ci } else if (instr->opcode == aco_opcode::s_setreg_b32 || 531bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_setreg_imm32_b32) { 532bf215546Sopenharmony_ci SOPK_instruction& sopk = instr->sopk(); 533bf215546Sopenharmony_ci unsigned offset = (sopk.imm >> 6) & 0x1f; 534bf215546Sopenharmony_ci unsigned size = ((sopk.imm >> 11) & 0x1f) + 1; 535bf215546Sopenharmony_ci unsigned reg = sopk.imm & 0x3f; 536bf215546Sopenharmony_ci ctx.setreg_then_getsetreg = 2; 537bf215546Sopenharmony_ci 538bf215546Sopenharmony_ci if (reg == 1 && offset >= 28 && size > (28 - offset)) 539bf215546Sopenharmony_ci ctx.set_vskip_mode_then_vector = 2; 540bf215546Sopenharmony_ci } 541bf215546Sopenharmony_ci } else if (instr->isVMEM() || instr->isFlatLike()) { 542bf215546Sopenharmony_ci /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */ 543bf215546Sopenharmony_ci bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) && instr->operands.size() == 4 && 544bf215546Sopenharmony_ci instr->operands[3].size() > 2 && instr->operands[2].physReg() >= 128; 545bf215546Sopenharmony_ci /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit 546bf215546Sopenharmony_ci * store) */ 547bf215546Sopenharmony_ci bool consider_mimg = instr->isMIMG() && 548bf215546Sopenharmony_ci instr->operands[1].regClass().type() == RegType::vgpr && 549bf215546Sopenharmony_ci instr->operands[1].size() > 2 && instr->operands[0].size() == 4; 550bf215546Sopenharmony_ci /* FLAT/GLOBAL/SCRATCH store with >64-bit data */ 551bf215546Sopenharmony_ci bool consider_flat = 552bf215546Sopenharmony_ci instr->isFlatLike() && instr->operands.size() == 3 && instr->operands[2].size() > 2; 553bf215546Sopenharmony_ci if (consider_buf || consider_mimg || consider_flat) { 554bf215546Sopenharmony_ci PhysReg wrdata = instr->operands[consider_flat ? 2 : 3].physReg(); 555bf215546Sopenharmony_ci unsigned size = instr->operands[consider_flat ? 2 : 3].size(); 556bf215546Sopenharmony_ci for (unsigned i = 0; i < size; i++) 557bf215546Sopenharmony_ci ctx.vmem_store_then_wr_data[(wrdata & 0xff) + i] = 1; 558bf215546Sopenharmony_ci } 559bf215546Sopenharmony_ci } 560bf215546Sopenharmony_ci} 561bf215546Sopenharmony_ci 562bf215546Sopenharmony_citemplate <std::size_t N> 563bf215546Sopenharmony_cibool 564bf215546Sopenharmony_cicheck_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs) 565bf215546Sopenharmony_ci{ 566bf215546Sopenharmony_ci return std::any_of(instr->definitions.begin(), instr->definitions.end(), 567bf215546Sopenharmony_ci [&check_regs](const Definition& def) -> bool 568bf215546Sopenharmony_ci { 569bf215546Sopenharmony_ci bool writes_any = false; 570bf215546Sopenharmony_ci for (unsigned i = 0; i < def.size(); i++) { 571bf215546Sopenharmony_ci unsigned def_reg = def.physReg() + i; 572bf215546Sopenharmony_ci writes_any |= def_reg < check_regs.size() && check_regs[def_reg]; 573bf215546Sopenharmony_ci } 574bf215546Sopenharmony_ci return writes_any; 575bf215546Sopenharmony_ci }); 576bf215546Sopenharmony_ci} 577bf215546Sopenharmony_ci 578bf215546Sopenharmony_citemplate <std::size_t N> 579bf215546Sopenharmony_civoid 580bf215546Sopenharmony_cimark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads) 581bf215546Sopenharmony_ci{ 582bf215546Sopenharmony_ci for (const Operand& op : instr->operands) { 583bf215546Sopenharmony_ci for (unsigned i = 0; i < op.size(); i++) { 584bf215546Sopenharmony_ci unsigned reg = op.physReg() + i; 585bf215546Sopenharmony_ci if (reg < reg_reads.size()) 586bf215546Sopenharmony_ci reg_reads.set(reg); 587bf215546Sopenharmony_ci } 588bf215546Sopenharmony_ci } 589bf215546Sopenharmony_ci} 590bf215546Sopenharmony_ci 591bf215546Sopenharmony_citemplate <std::size_t N> 592bf215546Sopenharmony_civoid 593bf215546Sopenharmony_cimark_read_regs_exec(State& state, const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads) 594bf215546Sopenharmony_ci{ 595bf215546Sopenharmony_ci mark_read_regs(instr, reg_reads); 596bf215546Sopenharmony_ci reg_reads.set(exec); 597bf215546Sopenharmony_ci if (state.program->wave_size == 64) 598bf215546Sopenharmony_ci reg_reads.set(exec_hi); 599bf215546Sopenharmony_ci} 600bf215546Sopenharmony_ci 601bf215546Sopenharmony_cibool 602bf215546Sopenharmony_ciVALU_writes_sgpr(aco_ptr<Instruction>& instr) 603bf215546Sopenharmony_ci{ 604bf215546Sopenharmony_ci if (instr->isVOPC()) 605bf215546Sopenharmony_ci return true; 606bf215546Sopenharmony_ci if (instr->isVOP3() && instr->definitions.size() == 2) 607bf215546Sopenharmony_ci return true; 608bf215546Sopenharmony_ci if (instr->opcode == aco_opcode::v_readfirstlane_b32 || 609bf215546Sopenharmony_ci instr->opcode == aco_opcode::v_readlane_b32 || 610bf215546Sopenharmony_ci instr->opcode == aco_opcode::v_readlane_b32_e64) 611bf215546Sopenharmony_ci return true; 612bf215546Sopenharmony_ci return false; 613bf215546Sopenharmony_ci} 614bf215546Sopenharmony_ci 615bf215546Sopenharmony_cibool 616bf215546Sopenharmony_ciinstr_writes_exec(const aco_ptr<Instruction>& instr) 617bf215546Sopenharmony_ci{ 618bf215546Sopenharmony_ci return std::any_of(instr->definitions.begin(), instr->definitions.end(), 619bf215546Sopenharmony_ci [](const Definition& def) -> bool 620bf215546Sopenharmony_ci { return def.physReg() == exec_lo || def.physReg() == exec_hi; }); 621bf215546Sopenharmony_ci} 622bf215546Sopenharmony_ci 623bf215546Sopenharmony_cibool 624bf215546Sopenharmony_ciinstr_writes_sgpr(const aco_ptr<Instruction>& instr) 625bf215546Sopenharmony_ci{ 626bf215546Sopenharmony_ci return std::any_of(instr->definitions.begin(), instr->definitions.end(), 627bf215546Sopenharmony_ci [](const Definition& def) -> bool 628bf215546Sopenharmony_ci { return def.getTemp().type() == RegType::sgpr; }); 629bf215546Sopenharmony_ci} 630bf215546Sopenharmony_ci 631bf215546Sopenharmony_ciinline bool 632bf215546Sopenharmony_ciinstr_is_branch(const aco_ptr<Instruction>& instr) 633bf215546Sopenharmony_ci{ 634bf215546Sopenharmony_ci return instr->opcode == aco_opcode::s_branch || instr->opcode == aco_opcode::s_cbranch_scc0 || 635bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_cbranch_scc1 || 636bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_cbranch_vccz || 637bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_cbranch_vccnz || 638bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_cbranch_execz || 639bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_cbranch_execnz || 640bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_cbranch_cdbgsys || 641bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_cbranch_cdbguser || 642bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_cbranch_cdbgsys_or_user || 643bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user || 644bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_subvector_loop_begin || 645bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_subvector_loop_end || 646bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_setpc_b64 || instr->opcode == aco_opcode::s_swappc_b64 || 647bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64; 648bf215546Sopenharmony_ci} 649bf215546Sopenharmony_ci 650bf215546Sopenharmony_civoid 651bf215546Sopenharmony_cihandle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>& instr, 652bf215546Sopenharmony_ci std::vector<aco_ptr<Instruction>>& new_instructions) 653bf215546Sopenharmony_ci{ 654bf215546Sopenharmony_ci // TODO: s_dcache_inv needs to be in it's own group on GFX10 655bf215546Sopenharmony_ci 656bf215546Sopenharmony_ci /* VMEMtoScalarWriteHazard 657bf215546Sopenharmony_ci * Handle EXEC/M0/SGPR write following a VMEM/DS instruction without a VALU or "waitcnt vmcnt(0)" 658bf215546Sopenharmony_ci * in-between. 659bf215546Sopenharmony_ci */ 660bf215546Sopenharmony_ci if (instr->isVMEM() || instr->isFlatLike() || instr->isDS()) { 661bf215546Sopenharmony_ci /* Remember all SGPRs that are read by the VMEM/DS instruction */ 662bf215546Sopenharmony_ci if (instr->isVMEM() || instr->isFlatLike()) 663bf215546Sopenharmony_ci mark_read_regs_exec( 664bf215546Sopenharmony_ci state, instr, 665bf215546Sopenharmony_ci instr->definitions.empty() ? ctx.sgprs_read_by_VMEM_store : ctx.sgprs_read_by_VMEM); 666bf215546Sopenharmony_ci if (instr->isFlat() || instr->isDS()) 667bf215546Sopenharmony_ci mark_read_regs_exec(state, instr, ctx.sgprs_read_by_DS); 668bf215546Sopenharmony_ci } else if (instr->isSALU() || instr->isSMEM()) { 669bf215546Sopenharmony_ci if (instr->opcode == aco_opcode::s_waitcnt) { 670bf215546Sopenharmony_ci wait_imm imm(state.program->gfx_level, instr->sopp().imm); 671bf215546Sopenharmony_ci if (imm.vm == 0) 672bf215546Sopenharmony_ci ctx.sgprs_read_by_VMEM.reset(); 673bf215546Sopenharmony_ci } else if (instr->opcode == aco_opcode::s_waitcnt_depctr && instr->sopp().imm == 0xffe3) { 674bf215546Sopenharmony_ci /* Hazard is mitigated by a s_waitcnt_depctr with a magic imm */ 675bf215546Sopenharmony_ci ctx.sgprs_read_by_VMEM.reset(); 676bf215546Sopenharmony_ci ctx.sgprs_read_by_DS.reset(); 677bf215546Sopenharmony_ci ctx.sgprs_read_by_VMEM_store.reset(); 678bf215546Sopenharmony_ci } 679bf215546Sopenharmony_ci 680bf215546Sopenharmony_ci /* Check if SALU writes an SGPR that was previously read by the VALU */ 681bf215546Sopenharmony_ci if (check_written_regs(instr, ctx.sgprs_read_by_VMEM) || 682bf215546Sopenharmony_ci check_written_regs(instr, ctx.sgprs_read_by_DS) || 683bf215546Sopenharmony_ci check_written_regs(instr, ctx.sgprs_read_by_VMEM_store)) { 684bf215546Sopenharmony_ci ctx.sgprs_read_by_VMEM.reset(); 685bf215546Sopenharmony_ci ctx.sgprs_read_by_DS.reset(); 686bf215546Sopenharmony_ci ctx.sgprs_read_by_VMEM_store.reset(); 687bf215546Sopenharmony_ci 688bf215546Sopenharmony_ci /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */ 689bf215546Sopenharmony_ci aco_ptr<SOPP_instruction> depctr{ 690bf215546Sopenharmony_ci create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)}; 691bf215546Sopenharmony_ci depctr->imm = 0xffe3; 692bf215546Sopenharmony_ci depctr->block = -1; 693bf215546Sopenharmony_ci new_instructions.emplace_back(std::move(depctr)); 694bf215546Sopenharmony_ci } 695bf215546Sopenharmony_ci } else if (instr->isVALU()) { 696bf215546Sopenharmony_ci /* Hazard is mitigated by any VALU instruction */ 697bf215546Sopenharmony_ci ctx.sgprs_read_by_VMEM.reset(); 698bf215546Sopenharmony_ci ctx.sgprs_read_by_DS.reset(); 699bf215546Sopenharmony_ci ctx.sgprs_read_by_VMEM_store.reset(); 700bf215546Sopenharmony_ci } 701bf215546Sopenharmony_ci 702bf215546Sopenharmony_ci /* VcmpxPermlaneHazard 703bf215546Sopenharmony_ci * Handle any permlane following a VOPC instruction, insert v_mov between them. 704bf215546Sopenharmony_ci */ 705bf215546Sopenharmony_ci if (instr->isVOPC()) { 706bf215546Sopenharmony_ci ctx.has_VOPC = true; 707bf215546Sopenharmony_ci } else if (ctx.has_VOPC && (instr->opcode == aco_opcode::v_permlane16_b32 || 708bf215546Sopenharmony_ci instr->opcode == aco_opcode::v_permlanex16_b32)) { 709bf215546Sopenharmony_ci ctx.has_VOPC = false; 710bf215546Sopenharmony_ci 711bf215546Sopenharmony_ci /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */ 712bf215546Sopenharmony_ci aco_ptr<VOP1_instruction> v_mov{ 713bf215546Sopenharmony_ci create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)}; 714bf215546Sopenharmony_ci v_mov->definitions[0] = Definition(instr->operands[0].physReg(), v1); 715bf215546Sopenharmony_ci v_mov->operands[0] = Operand(instr->operands[0].physReg(), v1); 716bf215546Sopenharmony_ci new_instructions.emplace_back(std::move(v_mov)); 717bf215546Sopenharmony_ci } else if (instr->isVALU() && instr->opcode != aco_opcode::v_nop) { 718bf215546Sopenharmony_ci ctx.has_VOPC = false; 719bf215546Sopenharmony_ci } 720bf215546Sopenharmony_ci 721bf215546Sopenharmony_ci /* VcmpxExecWARHazard 722bf215546Sopenharmony_ci * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction. 723bf215546Sopenharmony_ci */ 724bf215546Sopenharmony_ci if (!instr->isVALU() && instr->reads_exec()) { 725bf215546Sopenharmony_ci ctx.has_nonVALU_exec_read = true; 726bf215546Sopenharmony_ci } else if (instr->isVALU()) { 727bf215546Sopenharmony_ci if (instr_writes_exec(instr)) { 728bf215546Sopenharmony_ci ctx.has_nonVALU_exec_read = false; 729bf215546Sopenharmony_ci 730bf215546Sopenharmony_ci /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */ 731bf215546Sopenharmony_ci aco_ptr<SOPP_instruction> depctr{ 732bf215546Sopenharmony_ci create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)}; 733bf215546Sopenharmony_ci depctr->imm = 0xfffe; 734bf215546Sopenharmony_ci depctr->block = -1; 735bf215546Sopenharmony_ci new_instructions.emplace_back(std::move(depctr)); 736bf215546Sopenharmony_ci } else if (instr_writes_sgpr(instr)) { 737bf215546Sopenharmony_ci /* Any VALU instruction that writes an SGPR mitigates the problem */ 738bf215546Sopenharmony_ci ctx.has_nonVALU_exec_read = false; 739bf215546Sopenharmony_ci } 740bf215546Sopenharmony_ci } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) { 741bf215546Sopenharmony_ci /* s_waitcnt_depctr can mitigate the problem if it has a magic imm */ 742bf215546Sopenharmony_ci if ((instr->sopp().imm & 0xfffe) == 0xfffe) 743bf215546Sopenharmony_ci ctx.has_nonVALU_exec_read = false; 744bf215546Sopenharmony_ci } 745bf215546Sopenharmony_ci 746bf215546Sopenharmony_ci /* SMEMtoVectorWriteHazard 747bf215546Sopenharmony_ci * Handle any VALU instruction writing an SGPR after an SMEM reads it. 748bf215546Sopenharmony_ci */ 749bf215546Sopenharmony_ci if (instr->isSMEM()) { 750bf215546Sopenharmony_ci /* Remember all SGPRs that are read by the SMEM instruction */ 751bf215546Sopenharmony_ci mark_read_regs(instr, ctx.sgprs_read_by_SMEM); 752bf215546Sopenharmony_ci } else if (VALU_writes_sgpr(instr)) { 753bf215546Sopenharmony_ci /* Check if VALU writes an SGPR that was previously read by SMEM */ 754bf215546Sopenharmony_ci if (check_written_regs(instr, ctx.sgprs_read_by_SMEM)) { 755bf215546Sopenharmony_ci ctx.sgprs_read_by_SMEM.reset(); 756bf215546Sopenharmony_ci 757bf215546Sopenharmony_ci /* Insert s_mov to mitigate the problem */ 758bf215546Sopenharmony_ci aco_ptr<SOP1_instruction> s_mov{ 759bf215546Sopenharmony_ci create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)}; 760bf215546Sopenharmony_ci s_mov->definitions[0] = Definition(sgpr_null, s1); 761bf215546Sopenharmony_ci s_mov->operands[0] = Operand::zero(); 762bf215546Sopenharmony_ci new_instructions.emplace_back(std::move(s_mov)); 763bf215546Sopenharmony_ci } 764bf215546Sopenharmony_ci } else if (instr->isSALU()) { 765bf215546Sopenharmony_ci if (instr->format != Format::SOPP) { 766bf215546Sopenharmony_ci /* SALU can mitigate the hazard */ 767bf215546Sopenharmony_ci ctx.sgprs_read_by_SMEM.reset(); 768bf215546Sopenharmony_ci } else { 769bf215546Sopenharmony_ci /* Reducing lgkmcnt count to 0 always mitigates the hazard. */ 770bf215546Sopenharmony_ci const SOPP_instruction& sopp = instr->sopp(); 771bf215546Sopenharmony_ci if (sopp.opcode == aco_opcode::s_waitcnt_lgkmcnt) { 772bf215546Sopenharmony_ci if (sopp.imm == 0 && sopp.definitions[0].physReg() == sgpr_null) 773bf215546Sopenharmony_ci ctx.sgprs_read_by_SMEM.reset(); 774bf215546Sopenharmony_ci } else if (sopp.opcode == aco_opcode::s_waitcnt) { 775bf215546Sopenharmony_ci unsigned lgkm = (sopp.imm >> 8) & 0x3f; 776bf215546Sopenharmony_ci if (lgkm == 0) 777bf215546Sopenharmony_ci ctx.sgprs_read_by_SMEM.reset(); 778bf215546Sopenharmony_ci } 779bf215546Sopenharmony_ci } 780bf215546Sopenharmony_ci } 781bf215546Sopenharmony_ci 782bf215546Sopenharmony_ci /* LdsBranchVmemWARHazard 783bf215546Sopenharmony_ci * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns. 784bf215546Sopenharmony_ci */ 785bf215546Sopenharmony_ci if (instr->isVMEM() || instr->isGlobal() || instr->isScratch()) { 786bf215546Sopenharmony_ci ctx.has_VMEM = true; 787bf215546Sopenharmony_ci ctx.has_branch_after_VMEM = false; 788bf215546Sopenharmony_ci /* Mitigation for DS is needed only if there was already a branch after */ 789bf215546Sopenharmony_ci ctx.has_DS = ctx.has_branch_after_DS; 790bf215546Sopenharmony_ci } else if (instr->isDS()) { 791bf215546Sopenharmony_ci ctx.has_DS = true; 792bf215546Sopenharmony_ci ctx.has_branch_after_DS = false; 793bf215546Sopenharmony_ci /* Mitigation for VMEM is needed only if there was already a branch after */ 794bf215546Sopenharmony_ci ctx.has_VMEM = ctx.has_branch_after_VMEM; 795bf215546Sopenharmony_ci } else if (instr_is_branch(instr)) { 796bf215546Sopenharmony_ci ctx.has_branch_after_VMEM = ctx.has_VMEM; 797bf215546Sopenharmony_ci ctx.has_branch_after_DS = ctx.has_DS; 798bf215546Sopenharmony_ci } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) { 799bf215546Sopenharmony_ci /* Only s_waitcnt_vscnt can mitigate the hazard */ 800bf215546Sopenharmony_ci const SOPK_instruction& sopk = instr->sopk(); 801bf215546Sopenharmony_ci if (sopk.definitions[0].physReg() == sgpr_null && sopk.imm == 0) 802bf215546Sopenharmony_ci ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false; 803bf215546Sopenharmony_ci } 804bf215546Sopenharmony_ci if ((ctx.has_VMEM && ctx.has_branch_after_DS) || (ctx.has_DS && ctx.has_branch_after_VMEM)) { 805bf215546Sopenharmony_ci ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false; 806bf215546Sopenharmony_ci 807bf215546Sopenharmony_ci /* Insert s_waitcnt_vscnt to mitigate the problem */ 808bf215546Sopenharmony_ci aco_ptr<SOPK_instruction> wait{ 809bf215546Sopenharmony_ci create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1)}; 810bf215546Sopenharmony_ci wait->definitions[0] = Definition(sgpr_null, s1); 811bf215546Sopenharmony_ci wait->imm = 0; 812bf215546Sopenharmony_ci new_instructions.emplace_back(std::move(wait)); 813bf215546Sopenharmony_ci 814bf215546Sopenharmony_ci ctx.has_VMEM = instr->isVMEM() || instr->isGlobal() || instr->isScratch(); 815bf215546Sopenharmony_ci ctx.has_DS = instr->isDS(); 816bf215546Sopenharmony_ci } 817bf215546Sopenharmony_ci 818bf215546Sopenharmony_ci /* NSAToVMEMBug 819bf215546Sopenharmony_ci * Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] != 820bf215546Sopenharmony_ci * 0). 821bf215546Sopenharmony_ci */ 822bf215546Sopenharmony_ci if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1) { 823bf215546Sopenharmony_ci ctx.has_NSA_MIMG = true; 824bf215546Sopenharmony_ci } else if (ctx.has_NSA_MIMG) { 825bf215546Sopenharmony_ci ctx.has_NSA_MIMG = false; 826bf215546Sopenharmony_ci 827bf215546Sopenharmony_ci if (instr->isMUBUF() || instr->isMTBUF()) { 828bf215546Sopenharmony_ci uint32_t offset = instr->isMUBUF() ? instr->mubuf().offset : instr->mtbuf().offset; 829bf215546Sopenharmony_ci if (offset & 6) 830bf215546Sopenharmony_ci Builder(state.program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0); 831bf215546Sopenharmony_ci } 832bf215546Sopenharmony_ci } 833bf215546Sopenharmony_ci 834bf215546Sopenharmony_ci /* waNsaCannotFollowWritelane 835bf215546Sopenharmony_ci * Handles NSA MIMG immediately following a v_writelane_b32. 836bf215546Sopenharmony_ci */ 837bf215546Sopenharmony_ci if (instr->opcode == aco_opcode::v_writelane_b32_e64) { 838bf215546Sopenharmony_ci ctx.has_writelane = true; 839bf215546Sopenharmony_ci } else if (ctx.has_writelane) { 840bf215546Sopenharmony_ci ctx.has_writelane = false; 841bf215546Sopenharmony_ci if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0) 842bf215546Sopenharmony_ci Builder(state.program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0); 843bf215546Sopenharmony_ci } 844bf215546Sopenharmony_ci} 845bf215546Sopenharmony_ci 846bf215546Sopenharmony_citemplate <typename Ctx> 847bf215546Sopenharmony_ciusing HandleInstr = void (*)(State& state, Ctx&, aco_ptr<Instruction>&, 848bf215546Sopenharmony_ci std::vector<aco_ptr<Instruction>>&); 849bf215546Sopenharmony_ci 850bf215546Sopenharmony_citemplate <typename Ctx, HandleInstr<Ctx> Handle> 851bf215546Sopenharmony_civoid 852bf215546Sopenharmony_cihandle_block(Program* program, Ctx& ctx, Block& block) 853bf215546Sopenharmony_ci{ 854bf215546Sopenharmony_ci if (block.instructions.empty()) 855bf215546Sopenharmony_ci return; 856bf215546Sopenharmony_ci 857bf215546Sopenharmony_ci State state; 858bf215546Sopenharmony_ci state.program = program; 859bf215546Sopenharmony_ci state.block = █ 860bf215546Sopenharmony_ci state.old_instructions = std::move(block.instructions); 861bf215546Sopenharmony_ci 862bf215546Sopenharmony_ci block.instructions.clear(); // Silence clang-analyzer-cplusplus.Move warning 863bf215546Sopenharmony_ci block.instructions.reserve(state.old_instructions.size()); 864bf215546Sopenharmony_ci 865bf215546Sopenharmony_ci for (aco_ptr<Instruction>& instr : state.old_instructions) { 866bf215546Sopenharmony_ci Handle(state, ctx, instr, block.instructions); 867bf215546Sopenharmony_ci block.instructions.emplace_back(std::move(instr)); 868bf215546Sopenharmony_ci } 869bf215546Sopenharmony_ci} 870bf215546Sopenharmony_ci 871bf215546Sopenharmony_citemplate <typename Ctx, HandleInstr<Ctx> Handle> 872bf215546Sopenharmony_civoid 873bf215546Sopenharmony_cimitigate_hazards(Program* program) 874bf215546Sopenharmony_ci{ 875bf215546Sopenharmony_ci std::vector<Ctx> all_ctx(program->blocks.size()); 876bf215546Sopenharmony_ci std::stack<unsigned, std::vector<unsigned>> loop_header_indices; 877bf215546Sopenharmony_ci 878bf215546Sopenharmony_ci for (unsigned i = 0; i < program->blocks.size(); i++) { 879bf215546Sopenharmony_ci Block& block = program->blocks[i]; 880bf215546Sopenharmony_ci Ctx& ctx = all_ctx[i]; 881bf215546Sopenharmony_ci 882bf215546Sopenharmony_ci if (block.kind & block_kind_loop_header) { 883bf215546Sopenharmony_ci loop_header_indices.push(i); 884bf215546Sopenharmony_ci } else if (block.kind & block_kind_loop_exit) { 885bf215546Sopenharmony_ci /* Go through the whole loop again */ 886bf215546Sopenharmony_ci for (unsigned idx = loop_header_indices.top(); idx < i; idx++) { 887bf215546Sopenharmony_ci Ctx loop_block_ctx; 888bf215546Sopenharmony_ci for (unsigned b : program->blocks[idx].linear_preds) 889bf215546Sopenharmony_ci loop_block_ctx.join(all_ctx[b]); 890bf215546Sopenharmony_ci 891bf215546Sopenharmony_ci handle_block<Ctx, Handle>(program, loop_block_ctx, program->blocks[idx]); 892bf215546Sopenharmony_ci 893bf215546Sopenharmony_ci /* We only need to continue if the loop header context changed */ 894bf215546Sopenharmony_ci if (idx == loop_header_indices.top() && loop_block_ctx == all_ctx[idx]) 895bf215546Sopenharmony_ci break; 896bf215546Sopenharmony_ci 897bf215546Sopenharmony_ci all_ctx[idx] = loop_block_ctx; 898bf215546Sopenharmony_ci } 899bf215546Sopenharmony_ci 900bf215546Sopenharmony_ci loop_header_indices.pop(); 901bf215546Sopenharmony_ci } 902bf215546Sopenharmony_ci 903bf215546Sopenharmony_ci for (unsigned b : block.linear_preds) 904bf215546Sopenharmony_ci ctx.join(all_ctx[b]); 905bf215546Sopenharmony_ci 906bf215546Sopenharmony_ci handle_block<Ctx, Handle>(program, ctx, block); 907bf215546Sopenharmony_ci } 908bf215546Sopenharmony_ci} 909bf215546Sopenharmony_ci 910bf215546Sopenharmony_ci} /* end namespace */ 911bf215546Sopenharmony_ci 912bf215546Sopenharmony_civoid 913bf215546Sopenharmony_ciinsert_NOPs(Program* program) 914bf215546Sopenharmony_ci{ 915bf215546Sopenharmony_ci if (program->gfx_level >= GFX10_3) 916bf215546Sopenharmony_ci ; /* no hazards/bugs to mitigate */ 917bf215546Sopenharmony_ci else if (program->gfx_level >= GFX10) 918bf215546Sopenharmony_ci mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10>(program); 919bf215546Sopenharmony_ci else 920bf215546Sopenharmony_ci mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6>(program); 921bf215546Sopenharmony_ci} 922bf215546Sopenharmony_ci 923bf215546Sopenharmony_ci} // namespace aco 924