1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright © 2020 Valve Corporation 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21bf215546Sopenharmony_ci * IN THE SOFTWARE. 22bf215546Sopenharmony_ci * 23bf215546Sopenharmony_ci */ 24bf215546Sopenharmony_ci 25bf215546Sopenharmony_ci#include "aco_ir.h" 26bf215546Sopenharmony_ci 27bf215546Sopenharmony_ci#include "aco_builder.h" 28bf215546Sopenharmony_ci 29bf215546Sopenharmony_ci#include "util/debug.h" 30bf215546Sopenharmony_ci 31bf215546Sopenharmony_ci#include "c11/threads.h" 32bf215546Sopenharmony_ci 33bf215546Sopenharmony_cinamespace aco { 34bf215546Sopenharmony_ci 35bf215546Sopenharmony_ciuint64_t debug_flags = 0; 36bf215546Sopenharmony_ci 37bf215546Sopenharmony_cistatic const struct debug_control aco_debug_options[] = {{"validateir", DEBUG_VALIDATE_IR}, 38bf215546Sopenharmony_ci {"validatera", DEBUG_VALIDATE_RA}, 39bf215546Sopenharmony_ci {"perfwarn", DEBUG_PERFWARN}, 40bf215546Sopenharmony_ci {"force-waitcnt", DEBUG_FORCE_WAITCNT}, 41bf215546Sopenharmony_ci {"novn", DEBUG_NO_VN}, 42bf215546Sopenharmony_ci {"noopt", DEBUG_NO_OPT}, 43bf215546Sopenharmony_ci {"nosched", DEBUG_NO_SCHED}, 44bf215546Sopenharmony_ci {"perfinfo", DEBUG_PERF_INFO}, 45bf215546Sopenharmony_ci {"liveinfo", DEBUG_LIVE_INFO}, 46bf215546Sopenharmony_ci {NULL, 0}}; 47bf215546Sopenharmony_ci 48bf215546Sopenharmony_cistatic once_flag init_once_flag = ONCE_FLAG_INIT; 49bf215546Sopenharmony_ci 50bf215546Sopenharmony_cistatic void 51bf215546Sopenharmony_ciinit_once() 52bf215546Sopenharmony_ci{ 53bf215546Sopenharmony_ci debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options); 54bf215546Sopenharmony_ci 55bf215546Sopenharmony_ci#ifndef NDEBUG 56bf215546Sopenharmony_ci /* enable some flags by default on debug builds */ 57bf215546Sopenharmony_ci debug_flags |= aco::DEBUG_VALIDATE_IR; 58bf215546Sopenharmony_ci#endif 59bf215546Sopenharmony_ci} 60bf215546Sopenharmony_ci 61bf215546Sopenharmony_civoid 62bf215546Sopenharmony_ciinit() 63bf215546Sopenharmony_ci{ 64bf215546Sopenharmony_ci call_once(&init_once_flag, init_once); 65bf215546Sopenharmony_ci} 66bf215546Sopenharmony_ci 67bf215546Sopenharmony_civoid 68bf215546Sopenharmony_ciinit_program(Program* program, Stage stage, const struct aco_shader_info* info, 69bf215546Sopenharmony_ci enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode, 70bf215546Sopenharmony_ci ac_shader_config* config) 71bf215546Sopenharmony_ci{ 72bf215546Sopenharmony_ci program->stage = stage; 73bf215546Sopenharmony_ci program->config = config; 74bf215546Sopenharmony_ci program->info = *info; 75bf215546Sopenharmony_ci program->gfx_level = gfx_level; 76bf215546Sopenharmony_ci if (family == CHIP_UNKNOWN) { 77bf215546Sopenharmony_ci switch (gfx_level) { 78bf215546Sopenharmony_ci case GFX6: program->family = CHIP_TAHITI; break; 79bf215546Sopenharmony_ci case GFX7: program->family = CHIP_BONAIRE; break; 80bf215546Sopenharmony_ci case GFX8: program->family = CHIP_POLARIS10; break; 81bf215546Sopenharmony_ci case GFX9: program->family = CHIP_VEGA10; break; 82bf215546Sopenharmony_ci case GFX10: program->family = CHIP_NAVI10; break; 83bf215546Sopenharmony_ci default: program->family = CHIP_UNKNOWN; break; 84bf215546Sopenharmony_ci } 85bf215546Sopenharmony_ci } else { 86bf215546Sopenharmony_ci program->family = family; 87bf215546Sopenharmony_ci } 88bf215546Sopenharmony_ci program->wave_size = info->wave_size; 89bf215546Sopenharmony_ci program->lane_mask = program->wave_size == 32 ? s1 : s2; 90bf215546Sopenharmony_ci 91bf215546Sopenharmony_ci program->dev.lds_encoding_granule = gfx_level >= GFX11 && stage == fragment_fs ? 1024 : 92bf215546Sopenharmony_ci gfx_level >= GFX7 ? 512 : 256; 93bf215546Sopenharmony_ci program->dev.lds_alloc_granule = gfx_level >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule; 94bf215546Sopenharmony_ci program->dev.lds_limit = gfx_level >= GFX7 ? 65536 : 32768; 95bf215546Sopenharmony_ci /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */ 96bf215546Sopenharmony_ci program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY; 97bf215546Sopenharmony_ci 98bf215546Sopenharmony_ci program->dev.vgpr_limit = 256; 99bf215546Sopenharmony_ci program->dev.physical_vgprs = 256; 100bf215546Sopenharmony_ci program->dev.vgpr_alloc_granule = 4; 101bf215546Sopenharmony_ci 102bf215546Sopenharmony_ci if (gfx_level >= GFX10) { 103bf215546Sopenharmony_ci program->dev.physical_sgprs = 5120; /* doesn't matter as long as it's at least 128 * 40 */ 104bf215546Sopenharmony_ci program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512; 105bf215546Sopenharmony_ci program->dev.sgpr_alloc_granule = 128; 106bf215546Sopenharmony_ci program->dev.sgpr_limit = 107bf215546Sopenharmony_ci 108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */ 108bf215546Sopenharmony_ci if (gfx_level == GFX10_3) 109bf215546Sopenharmony_ci program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8; 110bf215546Sopenharmony_ci else 111bf215546Sopenharmony_ci program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4; 112bf215546Sopenharmony_ci } else if (program->gfx_level >= GFX8) { 113bf215546Sopenharmony_ci program->dev.physical_sgprs = 800; 114bf215546Sopenharmony_ci program->dev.sgpr_alloc_granule = 16; 115bf215546Sopenharmony_ci program->dev.sgpr_limit = 102; 116bf215546Sopenharmony_ci if (family == CHIP_TONGA || family == CHIP_ICELAND) 117bf215546Sopenharmony_ci program->dev.sgpr_alloc_granule = 96; /* workaround hardware bug */ 118bf215546Sopenharmony_ci } else { 119bf215546Sopenharmony_ci program->dev.physical_sgprs = 512; 120bf215546Sopenharmony_ci program->dev.sgpr_alloc_granule = 8; 121bf215546Sopenharmony_ci program->dev.sgpr_limit = 104; 122bf215546Sopenharmony_ci } 123bf215546Sopenharmony_ci 124bf215546Sopenharmony_ci program->dev.max_wave64_per_simd = 10; 125bf215546Sopenharmony_ci if (program->gfx_level >= GFX10_3) 126bf215546Sopenharmony_ci program->dev.max_wave64_per_simd = 16; 127bf215546Sopenharmony_ci else if (program->gfx_level == GFX10) 128bf215546Sopenharmony_ci program->dev.max_wave64_per_simd = 20; 129bf215546Sopenharmony_ci else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM) 130bf215546Sopenharmony_ci program->dev.max_wave64_per_simd = 8; 131bf215546Sopenharmony_ci 132bf215546Sopenharmony_ci program->dev.simd_per_cu = program->gfx_level >= GFX10 ? 2 : 4; 133bf215546Sopenharmony_ci 134bf215546Sopenharmony_ci switch (program->family) { 135bf215546Sopenharmony_ci /* GFX8 APUs */ 136bf215546Sopenharmony_ci case CHIP_CARRIZO: 137bf215546Sopenharmony_ci case CHIP_STONEY: 138bf215546Sopenharmony_ci /* GFX9 APUS */ 139bf215546Sopenharmony_ci case CHIP_RAVEN: 140bf215546Sopenharmony_ci case CHIP_RAVEN2: 141bf215546Sopenharmony_ci case CHIP_RENOIR: program->dev.xnack_enabled = true; break; 142bf215546Sopenharmony_ci default: break; 143bf215546Sopenharmony_ci } 144bf215546Sopenharmony_ci 145bf215546Sopenharmony_ci program->dev.sram_ecc_enabled = program->family == CHIP_ARCTURUS; 146bf215546Sopenharmony_ci /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */ 147bf215546Sopenharmony_ci program->dev.has_fast_fma32 = program->gfx_level >= GFX9; 148bf215546Sopenharmony_ci if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO || 149bf215546Sopenharmony_ci program->family == CHIP_HAWAII) 150bf215546Sopenharmony_ci program->dev.has_fast_fma32 = true; 151bf215546Sopenharmony_ci program->dev.has_mac_legacy32 = program->gfx_level <= GFX7 || program->gfx_level >= GFX10; 152bf215546Sopenharmony_ci 153bf215546Sopenharmony_ci program->dev.fused_mad_mix = program->gfx_level >= GFX10; 154bf215546Sopenharmony_ci if (program->family == CHIP_VEGA12 || program->family == CHIP_VEGA20 || 155bf215546Sopenharmony_ci program->family == CHIP_ARCTURUS || program->family == CHIP_ALDEBARAN) 156bf215546Sopenharmony_ci program->dev.fused_mad_mix = true; 157bf215546Sopenharmony_ci 158bf215546Sopenharmony_ci if (program->gfx_level >= GFX11) { 159bf215546Sopenharmony_ci program->dev.scratch_global_offset_min = -4096; 160bf215546Sopenharmony_ci program->dev.scratch_global_offset_max = 4095; 161bf215546Sopenharmony_ci } else if (program->gfx_level >= GFX10 || program->gfx_level == GFX8) { 162bf215546Sopenharmony_ci program->dev.scratch_global_offset_min = -2048; 163bf215546Sopenharmony_ci program->dev.scratch_global_offset_max = 2047; 164bf215546Sopenharmony_ci } else if (program->gfx_level == GFX9) { 165bf215546Sopenharmony_ci /* The minimum is actually -4096, but negative offsets are broken when SADDR is used. */ 166bf215546Sopenharmony_ci program->dev.scratch_global_offset_min = 0; 167bf215546Sopenharmony_ci program->dev.scratch_global_offset_max = 4095; 168bf215546Sopenharmony_ci } 169bf215546Sopenharmony_ci 170bf215546Sopenharmony_ci program->wgp_mode = wgp_mode; 171bf215546Sopenharmony_ci 172bf215546Sopenharmony_ci program->progress = CompilationProgress::after_isel; 173bf215546Sopenharmony_ci 174bf215546Sopenharmony_ci program->next_fp_mode.preserve_signed_zero_inf_nan32 = false; 175bf215546Sopenharmony_ci program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false; 176bf215546Sopenharmony_ci program->next_fp_mode.must_flush_denorms32 = false; 177bf215546Sopenharmony_ci program->next_fp_mode.must_flush_denorms16_64 = false; 178bf215546Sopenharmony_ci program->next_fp_mode.care_about_round32 = false; 179bf215546Sopenharmony_ci program->next_fp_mode.care_about_round16_64 = false; 180bf215546Sopenharmony_ci program->next_fp_mode.denorm16_64 = fp_denorm_keep; 181bf215546Sopenharmony_ci program->next_fp_mode.denorm32 = 0; 182bf215546Sopenharmony_ci program->next_fp_mode.round16_64 = fp_round_ne; 183bf215546Sopenharmony_ci program->next_fp_mode.round32 = fp_round_ne; 184bf215546Sopenharmony_ci} 185bf215546Sopenharmony_ci 186bf215546Sopenharmony_cimemory_sync_info 187bf215546Sopenharmony_ciget_sync_info(const Instruction* instr) 188bf215546Sopenharmony_ci{ 189bf215546Sopenharmony_ci switch (instr->format) { 190bf215546Sopenharmony_ci case Format::SMEM: return instr->smem().sync; 191bf215546Sopenharmony_ci case Format::MUBUF: return instr->mubuf().sync; 192bf215546Sopenharmony_ci case Format::MIMG: return instr->mimg().sync; 193bf215546Sopenharmony_ci case Format::MTBUF: return instr->mtbuf().sync; 194bf215546Sopenharmony_ci case Format::FLAT: 195bf215546Sopenharmony_ci case Format::GLOBAL: 196bf215546Sopenharmony_ci case Format::SCRATCH: return instr->flatlike().sync; 197bf215546Sopenharmony_ci case Format::DS: return instr->ds().sync; 198bf215546Sopenharmony_ci default: return memory_sync_info(); 199bf215546Sopenharmony_ci } 200bf215546Sopenharmony_ci} 201bf215546Sopenharmony_ci 202bf215546Sopenharmony_cibool 203bf215546Sopenharmony_cican_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra) 204bf215546Sopenharmony_ci{ 205bf215546Sopenharmony_ci if (!instr->isVALU()) 206bf215546Sopenharmony_ci return false; 207bf215546Sopenharmony_ci 208bf215546Sopenharmony_ci if (gfx_level < GFX8 || gfx_level >= GFX11 || instr->isDPP() || instr->isVOP3P()) 209bf215546Sopenharmony_ci return false; 210bf215546Sopenharmony_ci 211bf215546Sopenharmony_ci if (instr->isSDWA()) 212bf215546Sopenharmony_ci return true; 213bf215546Sopenharmony_ci 214bf215546Sopenharmony_ci if (instr->isVOP3()) { 215bf215546Sopenharmony_ci VOP3_instruction& vop3 = instr->vop3(); 216bf215546Sopenharmony_ci if (instr->format == Format::VOP3) 217bf215546Sopenharmony_ci return false; 218bf215546Sopenharmony_ci if (vop3.clamp && instr->isVOPC() && gfx_level != GFX8) 219bf215546Sopenharmony_ci return false; 220bf215546Sopenharmony_ci if (vop3.omod && gfx_level < GFX9) 221bf215546Sopenharmony_ci return false; 222bf215546Sopenharmony_ci 223bf215546Sopenharmony_ci // TODO: return true if we know we will use vcc 224bf215546Sopenharmony_ci if (!pre_ra && instr->definitions.size() >= 2) 225bf215546Sopenharmony_ci return false; 226bf215546Sopenharmony_ci 227bf215546Sopenharmony_ci for (unsigned i = 1; i < instr->operands.size(); i++) { 228bf215546Sopenharmony_ci if (instr->operands[i].isLiteral()) 229bf215546Sopenharmony_ci return false; 230bf215546Sopenharmony_ci if (gfx_level < GFX9 && !instr->operands[i].isOfType(RegType::vgpr)) 231bf215546Sopenharmony_ci return false; 232bf215546Sopenharmony_ci } 233bf215546Sopenharmony_ci } 234bf215546Sopenharmony_ci 235bf215546Sopenharmony_ci if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4 && !instr->isVOPC()) 236bf215546Sopenharmony_ci return false; 237bf215546Sopenharmony_ci 238bf215546Sopenharmony_ci if (!instr->operands.empty()) { 239bf215546Sopenharmony_ci if (instr->operands[0].isLiteral()) 240bf215546Sopenharmony_ci return false; 241bf215546Sopenharmony_ci if (gfx_level < GFX9 && !instr->operands[0].isOfType(RegType::vgpr)) 242bf215546Sopenharmony_ci return false; 243bf215546Sopenharmony_ci if (instr->operands[0].bytes() > 4) 244bf215546Sopenharmony_ci return false; 245bf215546Sopenharmony_ci if (instr->operands.size() > 1 && instr->operands[1].bytes() > 4) 246bf215546Sopenharmony_ci return false; 247bf215546Sopenharmony_ci } 248bf215546Sopenharmony_ci 249bf215546Sopenharmony_ci bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 || 250bf215546Sopenharmony_ci instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16; 251bf215546Sopenharmony_ci 252bf215546Sopenharmony_ci if (gfx_level != GFX8 && is_mac) 253bf215546Sopenharmony_ci return false; 254bf215546Sopenharmony_ci 255bf215546Sopenharmony_ci // TODO: return true if we know we will use vcc 256bf215546Sopenharmony_ci if (!pre_ra && instr->isVOPC() && gfx_level == GFX8) 257bf215546Sopenharmony_ci return false; 258bf215546Sopenharmony_ci if (!pre_ra && instr->operands.size() >= 3 && !is_mac) 259bf215546Sopenharmony_ci return false; 260bf215546Sopenharmony_ci 261bf215546Sopenharmony_ci return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 && 262bf215546Sopenharmony_ci instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 && 263bf215546Sopenharmony_ci instr->opcode != aco_opcode::v_readfirstlane_b32 && 264bf215546Sopenharmony_ci instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32; 265bf215546Sopenharmony_ci} 266bf215546Sopenharmony_ci 267bf215546Sopenharmony_ci/* updates "instr" and returns the old instruction (or NULL if no update was needed) */ 268bf215546Sopenharmony_ciaco_ptr<Instruction> 269bf215546Sopenharmony_ciconvert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr) 270bf215546Sopenharmony_ci{ 271bf215546Sopenharmony_ci if (instr->isSDWA()) 272bf215546Sopenharmony_ci return NULL; 273bf215546Sopenharmony_ci 274bf215546Sopenharmony_ci aco_ptr<Instruction> tmp = std::move(instr); 275bf215546Sopenharmony_ci Format format = 276bf215546Sopenharmony_ci (Format)(((uint16_t)tmp->format & ~(uint16_t)Format::VOP3) | (uint16_t)Format::SDWA); 277bf215546Sopenharmony_ci instr.reset(create_instruction<SDWA_instruction>(tmp->opcode, format, tmp->operands.size(), 278bf215546Sopenharmony_ci tmp->definitions.size())); 279bf215546Sopenharmony_ci std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin()); 280bf215546Sopenharmony_ci std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin()); 281bf215546Sopenharmony_ci 282bf215546Sopenharmony_ci SDWA_instruction& sdwa = instr->sdwa(); 283bf215546Sopenharmony_ci 284bf215546Sopenharmony_ci if (tmp->isVOP3()) { 285bf215546Sopenharmony_ci VOP3_instruction& vop3 = tmp->vop3(); 286bf215546Sopenharmony_ci memcpy(sdwa.neg, vop3.neg, sizeof(sdwa.neg)); 287bf215546Sopenharmony_ci memcpy(sdwa.abs, vop3.abs, sizeof(sdwa.abs)); 288bf215546Sopenharmony_ci sdwa.omod = vop3.omod; 289bf215546Sopenharmony_ci sdwa.clamp = vop3.clamp; 290bf215546Sopenharmony_ci } 291bf215546Sopenharmony_ci 292bf215546Sopenharmony_ci for (unsigned i = 0; i < instr->operands.size(); i++) { 293bf215546Sopenharmony_ci /* SDWA only uses operands 0 and 1. */ 294bf215546Sopenharmony_ci if (i >= 2) 295bf215546Sopenharmony_ci break; 296bf215546Sopenharmony_ci 297bf215546Sopenharmony_ci sdwa.sel[i] = SubdwordSel(instr->operands[i].bytes(), 0, false); 298bf215546Sopenharmony_ci } 299bf215546Sopenharmony_ci 300bf215546Sopenharmony_ci sdwa.dst_sel = SubdwordSel(instr->definitions[0].bytes(), 0, false); 301bf215546Sopenharmony_ci 302bf215546Sopenharmony_ci if (instr->definitions[0].getTemp().type() == RegType::sgpr && gfx_level == GFX8) 303bf215546Sopenharmony_ci instr->definitions[0].setFixed(vcc); 304bf215546Sopenharmony_ci if (instr->definitions.size() >= 2) 305bf215546Sopenharmony_ci instr->definitions[1].setFixed(vcc); 306bf215546Sopenharmony_ci if (instr->operands.size() >= 3) 307bf215546Sopenharmony_ci instr->operands[2].setFixed(vcc); 308bf215546Sopenharmony_ci 309bf215546Sopenharmony_ci instr->pass_flags = tmp->pass_flags; 310bf215546Sopenharmony_ci 311bf215546Sopenharmony_ci return tmp; 312bf215546Sopenharmony_ci} 313bf215546Sopenharmony_ci 314bf215546Sopenharmony_cibool 315bf215546Sopenharmony_cican_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8) 316bf215546Sopenharmony_ci{ 317bf215546Sopenharmony_ci assert(instr->isVALU() && !instr->operands.empty()); 318bf215546Sopenharmony_ci 319bf215546Sopenharmony_ci if (instr->isDPP()) 320bf215546Sopenharmony_ci return instr->isDPP8() == dpp8; 321bf215546Sopenharmony_ci 322bf215546Sopenharmony_ci if (instr->operands.size() && instr->operands[0].isLiteral()) 323bf215546Sopenharmony_ci return false; 324bf215546Sopenharmony_ci 325bf215546Sopenharmony_ci if (instr->isSDWA()) 326bf215546Sopenharmony_ci return false; 327bf215546Sopenharmony_ci 328bf215546Sopenharmony_ci if (!pre_ra && (instr->isVOPC() || instr->definitions.size() > 1) && 329bf215546Sopenharmony_ci instr->definitions.back().physReg() != vcc) 330bf215546Sopenharmony_ci return false; 331bf215546Sopenharmony_ci 332bf215546Sopenharmony_ci if (!pre_ra && instr->operands.size() >= 3 && instr->operands[2].physReg() != vcc) 333bf215546Sopenharmony_ci return false; 334bf215546Sopenharmony_ci 335bf215546Sopenharmony_ci if (instr->isVOP3()) { 336bf215546Sopenharmony_ci const VOP3_instruction* vop3 = &instr->vop3(); 337bf215546Sopenharmony_ci if (vop3->clamp || vop3->omod || vop3->opsel) 338bf215546Sopenharmony_ci return false; 339bf215546Sopenharmony_ci if (dpp8) 340bf215546Sopenharmony_ci return false; 341bf215546Sopenharmony_ci if (instr->format == Format::VOP3) 342bf215546Sopenharmony_ci return false; 343bf215546Sopenharmony_ci if (instr->operands.size() > 1 && !instr->operands[1].isOfType(RegType::vgpr)) 344bf215546Sopenharmony_ci return false; 345bf215546Sopenharmony_ci } 346bf215546Sopenharmony_ci 347bf215546Sopenharmony_ci /* there are more cases but those all take 64-bit inputs */ 348bf215546Sopenharmony_ci return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 && 349bf215546Sopenharmony_ci instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 && 350bf215546Sopenharmony_ci instr->opcode != aco_opcode::v_readfirstlane_b32 && 351bf215546Sopenharmony_ci instr->opcode != aco_opcode::v_cvt_f64_i32 && 352bf215546Sopenharmony_ci instr->opcode != aco_opcode::v_cvt_f64_f32 && instr->opcode != aco_opcode::v_cvt_f64_u32; 353bf215546Sopenharmony_ci} 354bf215546Sopenharmony_ci 355bf215546Sopenharmony_ciaco_ptr<Instruction> 356bf215546Sopenharmony_ciconvert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8) 357bf215546Sopenharmony_ci{ 358bf215546Sopenharmony_ci if (instr->isDPP()) 359bf215546Sopenharmony_ci return NULL; 360bf215546Sopenharmony_ci 361bf215546Sopenharmony_ci aco_ptr<Instruction> tmp = std::move(instr); 362bf215546Sopenharmony_ci Format format = (Format)(((uint32_t)tmp->format & ~(uint32_t)Format::VOP3) | 363bf215546Sopenharmony_ci (dpp8 ? (uint32_t)Format::DPP8 : (uint32_t)Format::DPP16)); 364bf215546Sopenharmony_ci if (dpp8) 365bf215546Sopenharmony_ci instr.reset(create_instruction<DPP8_instruction>(tmp->opcode, format, tmp->operands.size(), 366bf215546Sopenharmony_ci tmp->definitions.size())); 367bf215546Sopenharmony_ci else 368bf215546Sopenharmony_ci instr.reset(create_instruction<DPP16_instruction>(tmp->opcode, format, tmp->operands.size(), 369bf215546Sopenharmony_ci tmp->definitions.size())); 370bf215546Sopenharmony_ci std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin()); 371bf215546Sopenharmony_ci for (unsigned i = 0; i < instr->definitions.size(); i++) 372bf215546Sopenharmony_ci instr->definitions[i] = tmp->definitions[i]; 373bf215546Sopenharmony_ci 374bf215546Sopenharmony_ci if (dpp8) { 375bf215546Sopenharmony_ci DPP8_instruction* dpp = &instr->dpp8(); 376bf215546Sopenharmony_ci for (unsigned i = 0; i < 8; i++) 377bf215546Sopenharmony_ci dpp->lane_sel[i] = i; 378bf215546Sopenharmony_ci } else { 379bf215546Sopenharmony_ci DPP16_instruction* dpp = &instr->dpp16(); 380bf215546Sopenharmony_ci dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3); 381bf215546Sopenharmony_ci dpp->row_mask = 0xf; 382bf215546Sopenharmony_ci dpp->bank_mask = 0xf; 383bf215546Sopenharmony_ci 384bf215546Sopenharmony_ci if (tmp->isVOP3()) { 385bf215546Sopenharmony_ci const VOP3_instruction* vop3 = &tmp->vop3(); 386bf215546Sopenharmony_ci memcpy(dpp->neg, vop3->neg, sizeof(dpp->neg)); 387bf215546Sopenharmony_ci memcpy(dpp->abs, vop3->abs, sizeof(dpp->abs)); 388bf215546Sopenharmony_ci } 389bf215546Sopenharmony_ci } 390bf215546Sopenharmony_ci 391bf215546Sopenharmony_ci if (instr->isVOPC() || instr->definitions.size() > 1) 392bf215546Sopenharmony_ci instr->definitions.back().setFixed(vcc); 393bf215546Sopenharmony_ci 394bf215546Sopenharmony_ci if (instr->operands.size() >= 3) 395bf215546Sopenharmony_ci instr->operands[2].setFixed(vcc); 396bf215546Sopenharmony_ci 397bf215546Sopenharmony_ci instr->pass_flags = tmp->pass_flags; 398bf215546Sopenharmony_ci 399bf215546Sopenharmony_ci return tmp; 400bf215546Sopenharmony_ci} 401bf215546Sopenharmony_ci 402bf215546Sopenharmony_cibool 403bf215546Sopenharmony_cican_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx) 404bf215546Sopenharmony_ci{ 405bf215546Sopenharmony_ci /* opsel is only GFX9+ */ 406bf215546Sopenharmony_ci if (gfx_level < GFX9) 407bf215546Sopenharmony_ci return false; 408bf215546Sopenharmony_ci 409bf215546Sopenharmony_ci switch (op) { 410bf215546Sopenharmony_ci case aco_opcode::v_div_fixup_f16: 411bf215546Sopenharmony_ci case aco_opcode::v_fma_f16: 412bf215546Sopenharmony_ci case aco_opcode::v_mad_f16: 413bf215546Sopenharmony_ci case aco_opcode::v_mad_u16: 414bf215546Sopenharmony_ci case aco_opcode::v_mad_i16: 415bf215546Sopenharmony_ci case aco_opcode::v_med3_f16: 416bf215546Sopenharmony_ci case aco_opcode::v_med3_i16: 417bf215546Sopenharmony_ci case aco_opcode::v_med3_u16: 418bf215546Sopenharmony_ci case aco_opcode::v_min3_f16: 419bf215546Sopenharmony_ci case aco_opcode::v_min3_i16: 420bf215546Sopenharmony_ci case aco_opcode::v_min3_u16: 421bf215546Sopenharmony_ci case aco_opcode::v_max3_f16: 422bf215546Sopenharmony_ci case aco_opcode::v_max3_i16: 423bf215546Sopenharmony_ci case aco_opcode::v_max3_u16: 424bf215546Sopenharmony_ci case aco_opcode::v_max_u16_e64: 425bf215546Sopenharmony_ci case aco_opcode::v_max_i16_e64: 426bf215546Sopenharmony_ci case aco_opcode::v_min_u16_e64: 427bf215546Sopenharmony_ci case aco_opcode::v_min_i16_e64: 428bf215546Sopenharmony_ci case aco_opcode::v_add_i16: 429bf215546Sopenharmony_ci case aco_opcode::v_sub_i16: 430bf215546Sopenharmony_ci case aco_opcode::v_add_u16_e64: 431bf215546Sopenharmony_ci case aco_opcode::v_sub_u16_e64: 432bf215546Sopenharmony_ci case aco_opcode::v_lshlrev_b16_e64: 433bf215546Sopenharmony_ci case aco_opcode::v_lshrrev_b16_e64: 434bf215546Sopenharmony_ci case aco_opcode::v_ashrrev_i16_e64: 435bf215546Sopenharmony_ci case aco_opcode::v_mul_lo_u16_e64: return true; 436bf215546Sopenharmony_ci case aco_opcode::v_pack_b32_f16: 437bf215546Sopenharmony_ci case aco_opcode::v_cvt_pknorm_i16_f16: 438bf215546Sopenharmony_ci case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1; 439bf215546Sopenharmony_ci case aco_opcode::v_mad_u32_u16: 440bf215546Sopenharmony_ci case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2; 441bf215546Sopenharmony_ci default: return false; 442bf215546Sopenharmony_ci } 443bf215546Sopenharmony_ci} 444bf215546Sopenharmony_ci 445bf215546Sopenharmony_cibool 446bf215546Sopenharmony_ciinstr_is_16bit(amd_gfx_level gfx_level, aco_opcode op) 447bf215546Sopenharmony_ci{ 448bf215546Sopenharmony_ci /* partial register writes are GFX9+, only */ 449bf215546Sopenharmony_ci if (gfx_level < GFX9) 450bf215546Sopenharmony_ci return false; 451bf215546Sopenharmony_ci 452bf215546Sopenharmony_ci switch (op) { 453bf215546Sopenharmony_ci /* VOP3 */ 454bf215546Sopenharmony_ci case aco_opcode::v_mad_f16: 455bf215546Sopenharmony_ci case aco_opcode::v_mad_u16: 456bf215546Sopenharmony_ci case aco_opcode::v_mad_i16: 457bf215546Sopenharmony_ci case aco_opcode::v_fma_f16: 458bf215546Sopenharmony_ci case aco_opcode::v_div_fixup_f16: 459bf215546Sopenharmony_ci case aco_opcode::v_interp_p2_f16: 460bf215546Sopenharmony_ci case aco_opcode::v_fma_mixlo_f16: 461bf215546Sopenharmony_ci case aco_opcode::v_fma_mixhi_f16: 462bf215546Sopenharmony_ci /* VOP2 */ 463bf215546Sopenharmony_ci case aco_opcode::v_mac_f16: 464bf215546Sopenharmony_ci case aco_opcode::v_madak_f16: 465bf215546Sopenharmony_ci case aco_opcode::v_madmk_f16: return gfx_level >= GFX9; 466bf215546Sopenharmony_ci case aco_opcode::v_add_f16: 467bf215546Sopenharmony_ci case aco_opcode::v_sub_f16: 468bf215546Sopenharmony_ci case aco_opcode::v_subrev_f16: 469bf215546Sopenharmony_ci case aco_opcode::v_mul_f16: 470bf215546Sopenharmony_ci case aco_opcode::v_max_f16: 471bf215546Sopenharmony_ci case aco_opcode::v_min_f16: 472bf215546Sopenharmony_ci case aco_opcode::v_ldexp_f16: 473bf215546Sopenharmony_ci case aco_opcode::v_fmac_f16: 474bf215546Sopenharmony_ci case aco_opcode::v_fmamk_f16: 475bf215546Sopenharmony_ci case aco_opcode::v_fmaak_f16: 476bf215546Sopenharmony_ci /* VOP1 */ 477bf215546Sopenharmony_ci case aco_opcode::v_cvt_f16_f32: 478bf215546Sopenharmony_ci case aco_opcode::v_cvt_f16_u16: 479bf215546Sopenharmony_ci case aco_opcode::v_cvt_f16_i16: 480bf215546Sopenharmony_ci case aco_opcode::v_rcp_f16: 481bf215546Sopenharmony_ci case aco_opcode::v_sqrt_f16: 482bf215546Sopenharmony_ci case aco_opcode::v_rsq_f16: 483bf215546Sopenharmony_ci case aco_opcode::v_log_f16: 484bf215546Sopenharmony_ci case aco_opcode::v_exp_f16: 485bf215546Sopenharmony_ci case aco_opcode::v_frexp_mant_f16: 486bf215546Sopenharmony_ci case aco_opcode::v_frexp_exp_i16_f16: 487bf215546Sopenharmony_ci case aco_opcode::v_floor_f16: 488bf215546Sopenharmony_ci case aco_opcode::v_ceil_f16: 489bf215546Sopenharmony_ci case aco_opcode::v_trunc_f16: 490bf215546Sopenharmony_ci case aco_opcode::v_rndne_f16: 491bf215546Sopenharmony_ci case aco_opcode::v_fract_f16: 492bf215546Sopenharmony_ci case aco_opcode::v_sin_f16: 493bf215546Sopenharmony_ci case aco_opcode::v_cos_f16: return gfx_level >= GFX10; 494bf215546Sopenharmony_ci // TODO: confirm whether these write 16 or 32 bit on GFX10+ 495bf215546Sopenharmony_ci // case aco_opcode::v_cvt_u16_f16: 496bf215546Sopenharmony_ci // case aco_opcode::v_cvt_i16_f16: 497bf215546Sopenharmony_ci // case aco_opcode::p_cvt_f16_f32_rtne: 498bf215546Sopenharmony_ci // case aco_opcode::v_cvt_norm_i16_f16: 499bf215546Sopenharmony_ci // case aco_opcode::v_cvt_norm_u16_f16: 500bf215546Sopenharmony_ci /* on GFX10, all opsel instructions preserve the high bits */ 501bf215546Sopenharmony_ci default: return gfx_level >= GFX10 && can_use_opsel(gfx_level, op, -1); 502bf215546Sopenharmony_ci } 503bf215546Sopenharmony_ci} 504bf215546Sopenharmony_ci 505bf215546Sopenharmony_ciuint32_t 506bf215546Sopenharmony_ciget_reduction_identity(ReduceOp op, unsigned idx) 507bf215546Sopenharmony_ci{ 508bf215546Sopenharmony_ci switch (op) { 509bf215546Sopenharmony_ci case iadd8: 510bf215546Sopenharmony_ci case iadd16: 511bf215546Sopenharmony_ci case iadd32: 512bf215546Sopenharmony_ci case iadd64: 513bf215546Sopenharmony_ci case fadd16: 514bf215546Sopenharmony_ci case fadd32: 515bf215546Sopenharmony_ci case fadd64: 516bf215546Sopenharmony_ci case ior8: 517bf215546Sopenharmony_ci case ior16: 518bf215546Sopenharmony_ci case ior32: 519bf215546Sopenharmony_ci case ior64: 520bf215546Sopenharmony_ci case ixor8: 521bf215546Sopenharmony_ci case ixor16: 522bf215546Sopenharmony_ci case ixor32: 523bf215546Sopenharmony_ci case ixor64: 524bf215546Sopenharmony_ci case umax8: 525bf215546Sopenharmony_ci case umax16: 526bf215546Sopenharmony_ci case umax32: 527bf215546Sopenharmony_ci case umax64: return 0; 528bf215546Sopenharmony_ci case imul8: 529bf215546Sopenharmony_ci case imul16: 530bf215546Sopenharmony_ci case imul32: 531bf215546Sopenharmony_ci case imul64: return idx ? 0 : 1; 532bf215546Sopenharmony_ci case fmul16: return 0x3c00u; /* 1.0 */ 533bf215546Sopenharmony_ci case fmul32: return 0x3f800000u; /* 1.0 */ 534bf215546Sopenharmony_ci case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */ 535bf215546Sopenharmony_ci case imin8: return INT8_MAX; 536bf215546Sopenharmony_ci case imin16: return INT16_MAX; 537bf215546Sopenharmony_ci case imin32: return INT32_MAX; 538bf215546Sopenharmony_ci case imin64: return idx ? 0x7fffffffu : 0xffffffffu; 539bf215546Sopenharmony_ci case imax8: return INT8_MIN; 540bf215546Sopenharmony_ci case imax16: return INT16_MIN; 541bf215546Sopenharmony_ci case imax32: return INT32_MIN; 542bf215546Sopenharmony_ci case imax64: return idx ? 0x80000000u : 0; 543bf215546Sopenharmony_ci case umin8: 544bf215546Sopenharmony_ci case umin16: 545bf215546Sopenharmony_ci case iand8: 546bf215546Sopenharmony_ci case iand16: return 0xffffffffu; 547bf215546Sopenharmony_ci case umin32: 548bf215546Sopenharmony_ci case umin64: 549bf215546Sopenharmony_ci case iand32: 550bf215546Sopenharmony_ci case iand64: return 0xffffffffu; 551bf215546Sopenharmony_ci case fmin16: return 0x7c00u; /* infinity */ 552bf215546Sopenharmony_ci case fmin32: return 0x7f800000u; /* infinity */ 553bf215546Sopenharmony_ci case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */ 554bf215546Sopenharmony_ci case fmax16: return 0xfc00u; /* negative infinity */ 555bf215546Sopenharmony_ci case fmax32: return 0xff800000u; /* negative infinity */ 556bf215546Sopenharmony_ci case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */ 557bf215546Sopenharmony_ci default: unreachable("Invalid reduction operation"); break; 558bf215546Sopenharmony_ci } 559bf215546Sopenharmony_ci return 0; 560bf215546Sopenharmony_ci} 561bf215546Sopenharmony_ci 562bf215546Sopenharmony_cibool 563bf215546Sopenharmony_cineeds_exec_mask(const Instruction* instr) 564bf215546Sopenharmony_ci{ 565bf215546Sopenharmony_ci if (instr->isVALU()) { 566bf215546Sopenharmony_ci return instr->opcode != aco_opcode::v_readlane_b32 && 567bf215546Sopenharmony_ci instr->opcode != aco_opcode::v_readlane_b32_e64 && 568bf215546Sopenharmony_ci instr->opcode != aco_opcode::v_writelane_b32 && 569bf215546Sopenharmony_ci instr->opcode != aco_opcode::v_writelane_b32_e64; 570bf215546Sopenharmony_ci } 571bf215546Sopenharmony_ci 572bf215546Sopenharmony_ci if (instr->isVMEM() || instr->isFlatLike()) 573bf215546Sopenharmony_ci return true; 574bf215546Sopenharmony_ci 575bf215546Sopenharmony_ci if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier()) 576bf215546Sopenharmony_ci return instr->reads_exec(); 577bf215546Sopenharmony_ci 578bf215546Sopenharmony_ci if (instr->isPseudo()) { 579bf215546Sopenharmony_ci switch (instr->opcode) { 580bf215546Sopenharmony_ci case aco_opcode::p_create_vector: 581bf215546Sopenharmony_ci case aco_opcode::p_extract_vector: 582bf215546Sopenharmony_ci case aco_opcode::p_split_vector: 583bf215546Sopenharmony_ci case aco_opcode::p_phi: 584bf215546Sopenharmony_ci case aco_opcode::p_parallelcopy: 585bf215546Sopenharmony_ci for (Definition def : instr->definitions) { 586bf215546Sopenharmony_ci if (def.getTemp().type() == RegType::vgpr) 587bf215546Sopenharmony_ci return true; 588bf215546Sopenharmony_ci } 589bf215546Sopenharmony_ci return instr->reads_exec(); 590bf215546Sopenharmony_ci case aco_opcode::p_spill: 591bf215546Sopenharmony_ci case aco_opcode::p_reload: 592bf215546Sopenharmony_ci case aco_opcode::p_end_linear_vgpr: 593bf215546Sopenharmony_ci case aco_opcode::p_logical_start: 594bf215546Sopenharmony_ci case aco_opcode::p_logical_end: 595bf215546Sopenharmony_ci case aco_opcode::p_startpgm: 596bf215546Sopenharmony_ci case aco_opcode::p_init_scratch: return instr->reads_exec(); 597bf215546Sopenharmony_ci default: break; 598bf215546Sopenharmony_ci } 599bf215546Sopenharmony_ci } 600bf215546Sopenharmony_ci 601bf215546Sopenharmony_ci return true; 602bf215546Sopenharmony_ci} 603bf215546Sopenharmony_ci 604bf215546Sopenharmony_cistruct CmpInfo { 605bf215546Sopenharmony_ci aco_opcode ordered; 606bf215546Sopenharmony_ci aco_opcode unordered; 607bf215546Sopenharmony_ci aco_opcode swapped; 608bf215546Sopenharmony_ci aco_opcode inverse; 609bf215546Sopenharmony_ci aco_opcode vcmpx; 610bf215546Sopenharmony_ci aco_opcode f32; 611bf215546Sopenharmony_ci unsigned size; 612bf215546Sopenharmony_ci}; 613bf215546Sopenharmony_ci 614bf215546Sopenharmony_ciALWAYS_INLINE bool 615bf215546Sopenharmony_ciget_cmp_info(aco_opcode op, CmpInfo* info) 616bf215546Sopenharmony_ci{ 617bf215546Sopenharmony_ci info->ordered = aco_opcode::num_opcodes; 618bf215546Sopenharmony_ci info->unordered = aco_opcode::num_opcodes; 619bf215546Sopenharmony_ci info->swapped = aco_opcode::num_opcodes; 620bf215546Sopenharmony_ci info->inverse = aco_opcode::num_opcodes; 621bf215546Sopenharmony_ci info->f32 = aco_opcode::num_opcodes; 622bf215546Sopenharmony_ci switch (op) { 623bf215546Sopenharmony_ci // clang-format off 624bf215546Sopenharmony_ci#define CMP2(ord, unord, ord_swap, unord_swap, sz) \ 625bf215546Sopenharmony_ci case aco_opcode::v_cmp_##ord##_f##sz: \ 626bf215546Sopenharmony_ci case aco_opcode::v_cmp_n##unord##_f##sz: \ 627bf215546Sopenharmony_ci info->ordered = aco_opcode::v_cmp_##ord##_f##sz; \ 628bf215546Sopenharmony_ci info->unordered = aco_opcode::v_cmp_n##unord##_f##sz; \ 629bf215546Sopenharmony_ci info->swapped = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord_swap##_f##sz \ 630bf215546Sopenharmony_ci : aco_opcode::v_cmp_n##unord_swap##_f##sz; \ 631bf215546Sopenharmony_ci info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \ 632bf215546Sopenharmony_ci : aco_opcode::v_cmp_n##ord##_f##sz; \ 633bf215546Sopenharmony_ci info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32 \ 634bf215546Sopenharmony_ci : aco_opcode::v_cmp_n##unord##_f32; \ 635bf215546Sopenharmony_ci info->vcmpx = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmpx_##ord##_f##sz \ 636bf215546Sopenharmony_ci : aco_opcode::v_cmpx_n##unord##_f##sz; \ 637bf215546Sopenharmony_ci info->size = sz; \ 638bf215546Sopenharmony_ci return true; 639bf215546Sopenharmony_ci#define CMP(ord, unord, ord_swap, unord_swap) \ 640bf215546Sopenharmony_ci CMP2(ord, unord, ord_swap, unord_swap, 16) \ 641bf215546Sopenharmony_ci CMP2(ord, unord, ord_swap, unord_swap, 32) \ 642bf215546Sopenharmony_ci CMP2(ord, unord, ord_swap, unord_swap, 64) 643bf215546Sopenharmony_ci CMP(lt, /*n*/ge, gt, /*n*/le) 644bf215546Sopenharmony_ci CMP(eq, /*n*/lg, eq, /*n*/lg) 645bf215546Sopenharmony_ci CMP(le, /*n*/gt, ge, /*n*/lt) 646bf215546Sopenharmony_ci CMP(gt, /*n*/le, lt, /*n*/ge) 647bf215546Sopenharmony_ci CMP(lg, /*n*/eq, lg, /*n*/eq) 648bf215546Sopenharmony_ci CMP(ge, /*n*/lt, le, /*n*/gt) 649bf215546Sopenharmony_ci#undef CMP 650bf215546Sopenharmony_ci#undef CMP2 651bf215546Sopenharmony_ci#define ORD_TEST(sz) \ 652bf215546Sopenharmony_ci case aco_opcode::v_cmp_u_f##sz: \ 653bf215546Sopenharmony_ci info->f32 = aco_opcode::v_cmp_u_f32; \ 654bf215546Sopenharmony_ci info->swapped = aco_opcode::v_cmp_u_f##sz; \ 655bf215546Sopenharmony_ci info->inverse = aco_opcode::v_cmp_o_f##sz; \ 656bf215546Sopenharmony_ci info->vcmpx = aco_opcode::v_cmpx_u_f##sz; \ 657bf215546Sopenharmony_ci info->size = sz; \ 658bf215546Sopenharmony_ci return true; \ 659bf215546Sopenharmony_ci case aco_opcode::v_cmp_o_f##sz: \ 660bf215546Sopenharmony_ci info->f32 = aco_opcode::v_cmp_o_f32; \ 661bf215546Sopenharmony_ci info->swapped = aco_opcode::v_cmp_o_f##sz; \ 662bf215546Sopenharmony_ci info->inverse = aco_opcode::v_cmp_u_f##sz; \ 663bf215546Sopenharmony_ci info->vcmpx = aco_opcode::v_cmpx_o_f##sz; \ 664bf215546Sopenharmony_ci info->size = sz; \ 665bf215546Sopenharmony_ci return true; 666bf215546Sopenharmony_ci ORD_TEST(16) 667bf215546Sopenharmony_ci ORD_TEST(32) 668bf215546Sopenharmony_ci ORD_TEST(64) 669bf215546Sopenharmony_ci#undef ORD_TEST 670bf215546Sopenharmony_ci#define CMPI2(op, swap, inv, type, sz) \ 671bf215546Sopenharmony_ci case aco_opcode::v_cmp_##op##_##type##sz: \ 672bf215546Sopenharmony_ci info->swapped = aco_opcode::v_cmp_##swap##_##type##sz; \ 673bf215546Sopenharmony_ci info->inverse = aco_opcode::v_cmp_##inv##_##type##sz; \ 674bf215546Sopenharmony_ci info->vcmpx = aco_opcode::v_cmpx_##op##_##type##sz; \ 675bf215546Sopenharmony_ci info->size = sz; \ 676bf215546Sopenharmony_ci return true; 677bf215546Sopenharmony_ci#define CMPI(op, swap, inv) \ 678bf215546Sopenharmony_ci CMPI2(op, swap, inv, i, 16) \ 679bf215546Sopenharmony_ci CMPI2(op, swap, inv, u, 16) \ 680bf215546Sopenharmony_ci CMPI2(op, swap, inv, i, 32) \ 681bf215546Sopenharmony_ci CMPI2(op, swap, inv, u, 32) \ 682bf215546Sopenharmony_ci CMPI2(op, swap, inv, i, 64) \ 683bf215546Sopenharmony_ci CMPI2(op, swap, inv, u, 64) 684bf215546Sopenharmony_ci CMPI(lt, gt, ge) 685bf215546Sopenharmony_ci CMPI(eq, eq, lg) 686bf215546Sopenharmony_ci CMPI(le, ge, gt) 687bf215546Sopenharmony_ci CMPI(gt, lt, le) 688bf215546Sopenharmony_ci CMPI(lg, lg, eq) 689bf215546Sopenharmony_ci CMPI(ge, le, lt) 690bf215546Sopenharmony_ci#undef CMPI 691bf215546Sopenharmony_ci#undef CMPI2 692bf215546Sopenharmony_ci#define CMPCLASS(sz) \ 693bf215546Sopenharmony_ci case aco_opcode::v_cmp_class_f##sz: \ 694bf215546Sopenharmony_ci info->vcmpx = aco_opcode::v_cmpx_class_f##sz; \ 695bf215546Sopenharmony_ci info->size = sz; \ 696bf215546Sopenharmony_ci return true; 697bf215546Sopenharmony_ci CMPCLASS(16) 698bf215546Sopenharmony_ci CMPCLASS(32) 699bf215546Sopenharmony_ci CMPCLASS(64) 700bf215546Sopenharmony_ci#undef CMPCLASS 701bf215546Sopenharmony_ci // clang-format on 702bf215546Sopenharmony_ci default: return false; 703bf215546Sopenharmony_ci } 704bf215546Sopenharmony_ci} 705bf215546Sopenharmony_ci 706bf215546Sopenharmony_ciaco_opcode 707bf215546Sopenharmony_ciget_ordered(aco_opcode op) 708bf215546Sopenharmony_ci{ 709bf215546Sopenharmony_ci CmpInfo info; 710bf215546Sopenharmony_ci return get_cmp_info(op, &info) ? info.ordered : aco_opcode::num_opcodes; 711bf215546Sopenharmony_ci} 712bf215546Sopenharmony_ci 713bf215546Sopenharmony_ciaco_opcode 714bf215546Sopenharmony_ciget_unordered(aco_opcode op) 715bf215546Sopenharmony_ci{ 716bf215546Sopenharmony_ci CmpInfo info; 717bf215546Sopenharmony_ci return get_cmp_info(op, &info) ? info.unordered : aco_opcode::num_opcodes; 718bf215546Sopenharmony_ci} 719bf215546Sopenharmony_ci 720bf215546Sopenharmony_ciaco_opcode 721bf215546Sopenharmony_ciget_inverse(aco_opcode op) 722bf215546Sopenharmony_ci{ 723bf215546Sopenharmony_ci CmpInfo info; 724bf215546Sopenharmony_ci return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes; 725bf215546Sopenharmony_ci} 726bf215546Sopenharmony_ci 727bf215546Sopenharmony_ciaco_opcode 728bf215546Sopenharmony_ciget_f32_cmp(aco_opcode op) 729bf215546Sopenharmony_ci{ 730bf215546Sopenharmony_ci CmpInfo info; 731bf215546Sopenharmony_ci return get_cmp_info(op, &info) ? info.f32 : aco_opcode::num_opcodes; 732bf215546Sopenharmony_ci} 733bf215546Sopenharmony_ci 734bf215546Sopenharmony_ciaco_opcode 735bf215546Sopenharmony_ciget_vcmpx(aco_opcode op) 736bf215546Sopenharmony_ci{ 737bf215546Sopenharmony_ci CmpInfo info; 738bf215546Sopenharmony_ci return get_cmp_info(op, &info) ? info.vcmpx : aco_opcode::num_opcodes; 739bf215546Sopenharmony_ci} 740bf215546Sopenharmony_ci 741bf215546Sopenharmony_ciunsigned 742bf215546Sopenharmony_ciget_cmp_bitsize(aco_opcode op) 743bf215546Sopenharmony_ci{ 744bf215546Sopenharmony_ci CmpInfo info; 745bf215546Sopenharmony_ci return get_cmp_info(op, &info) ? info.size : 0; 746bf215546Sopenharmony_ci} 747bf215546Sopenharmony_ci 748bf215546Sopenharmony_cibool 749bf215546Sopenharmony_ciis_cmp(aco_opcode op) 750bf215546Sopenharmony_ci{ 751bf215546Sopenharmony_ci CmpInfo info; 752bf215546Sopenharmony_ci return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes; 753bf215546Sopenharmony_ci} 754bf215546Sopenharmony_ci 755bf215546Sopenharmony_cibool 756bf215546Sopenharmony_cican_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op) 757bf215546Sopenharmony_ci{ 758bf215546Sopenharmony_ci if (instr->isDPP()) 759bf215546Sopenharmony_ci return false; 760bf215546Sopenharmony_ci 761bf215546Sopenharmony_ci if (instr->operands[0].isConstant() || 762bf215546Sopenharmony_ci (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr)) 763bf215546Sopenharmony_ci return false; 764bf215546Sopenharmony_ci 765bf215546Sopenharmony_ci switch (instr->opcode) { 766bf215546Sopenharmony_ci case aco_opcode::v_add_u32: 767bf215546Sopenharmony_ci case aco_opcode::v_add_co_u32: 768bf215546Sopenharmony_ci case aco_opcode::v_add_co_u32_e64: 769bf215546Sopenharmony_ci case aco_opcode::v_add_i32: 770bf215546Sopenharmony_ci case aco_opcode::v_add_f16: 771bf215546Sopenharmony_ci case aco_opcode::v_add_f32: 772bf215546Sopenharmony_ci case aco_opcode::v_mul_f16: 773bf215546Sopenharmony_ci case aco_opcode::v_mul_f32: 774bf215546Sopenharmony_ci case aco_opcode::v_or_b32: 775bf215546Sopenharmony_ci case aco_opcode::v_and_b32: 776bf215546Sopenharmony_ci case aco_opcode::v_xor_b32: 777bf215546Sopenharmony_ci case aco_opcode::v_max_f16: 778bf215546Sopenharmony_ci case aco_opcode::v_max_f32: 779bf215546Sopenharmony_ci case aco_opcode::v_min_f16: 780bf215546Sopenharmony_ci case aco_opcode::v_min_f32: 781bf215546Sopenharmony_ci case aco_opcode::v_max_i32: 782bf215546Sopenharmony_ci case aco_opcode::v_min_i32: 783bf215546Sopenharmony_ci case aco_opcode::v_max_u32: 784bf215546Sopenharmony_ci case aco_opcode::v_min_u32: 785bf215546Sopenharmony_ci case aco_opcode::v_max_i16: 786bf215546Sopenharmony_ci case aco_opcode::v_min_i16: 787bf215546Sopenharmony_ci case aco_opcode::v_max_u16: 788bf215546Sopenharmony_ci case aco_opcode::v_min_u16: 789bf215546Sopenharmony_ci case aco_opcode::v_max_i16_e64: 790bf215546Sopenharmony_ci case aco_opcode::v_min_i16_e64: 791bf215546Sopenharmony_ci case aco_opcode::v_max_u16_e64: 792bf215546Sopenharmony_ci case aco_opcode::v_min_u16_e64: *new_op = instr->opcode; return true; 793bf215546Sopenharmony_ci case aco_opcode::v_sub_f16: *new_op = aco_opcode::v_subrev_f16; return true; 794bf215546Sopenharmony_ci case aco_opcode::v_sub_f32: *new_op = aco_opcode::v_subrev_f32; return true; 795bf215546Sopenharmony_ci case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true; 796bf215546Sopenharmony_ci case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true; 797bf215546Sopenharmony_ci case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true; 798bf215546Sopenharmony_ci default: { 799bf215546Sopenharmony_ci CmpInfo info; 800bf215546Sopenharmony_ci if (get_cmp_info(instr->opcode, &info) && info.swapped != aco_opcode::num_opcodes) { 801bf215546Sopenharmony_ci *new_op = info.swapped; 802bf215546Sopenharmony_ci return true; 803bf215546Sopenharmony_ci } 804bf215546Sopenharmony_ci return false; 805bf215546Sopenharmony_ci } 806bf215546Sopenharmony_ci } 807bf215546Sopenharmony_ci} 808bf215546Sopenharmony_ci 809bf215546Sopenharmony_ciwait_imm::wait_imm() : vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter) 810bf215546Sopenharmony_ci{} 811bf215546Sopenharmony_ciwait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) 812bf215546Sopenharmony_ci : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) 813bf215546Sopenharmony_ci{} 814bf215546Sopenharmony_ci 815bf215546Sopenharmony_ciwait_imm::wait_imm(enum amd_gfx_level gfx_level, uint16_t packed) : vs(unset_counter) 816bf215546Sopenharmony_ci{ 817bf215546Sopenharmony_ci vm = packed & 0xf; 818bf215546Sopenharmony_ci if (gfx_level >= GFX9) 819bf215546Sopenharmony_ci vm |= (packed >> 10) & 0x30; 820bf215546Sopenharmony_ci 821bf215546Sopenharmony_ci exp = (packed >> 4) & 0x7; 822bf215546Sopenharmony_ci 823bf215546Sopenharmony_ci lgkm = (packed >> 8) & 0xf; 824bf215546Sopenharmony_ci if (gfx_level >= GFX10) 825bf215546Sopenharmony_ci lgkm |= (packed >> 8) & 0x30; 826bf215546Sopenharmony_ci} 827bf215546Sopenharmony_ci 828bf215546Sopenharmony_ciuint16_t 829bf215546Sopenharmony_ciwait_imm::pack(enum amd_gfx_level gfx_level) const 830bf215546Sopenharmony_ci{ 831bf215546Sopenharmony_ci uint16_t imm = 0; 832bf215546Sopenharmony_ci assert(exp == unset_counter || exp <= 0x7); 833bf215546Sopenharmony_ci switch (gfx_level) { 834bf215546Sopenharmony_ci case GFX11: 835bf215546Sopenharmony_ci assert(lgkm == unset_counter || lgkm <= 0x3f); 836bf215546Sopenharmony_ci assert(vm == unset_counter || vm <= 0x3f); 837bf215546Sopenharmony_ci imm = ((vm & 0x3f) << 10) | ((lgkm & 0x3f) << 4) | (exp & 0x7); 838bf215546Sopenharmony_ci break; 839bf215546Sopenharmony_ci case GFX10: 840bf215546Sopenharmony_ci case GFX10_3: 841bf215546Sopenharmony_ci assert(lgkm == unset_counter || lgkm <= 0x3f); 842bf215546Sopenharmony_ci assert(vm == unset_counter || vm <= 0x3f); 843bf215546Sopenharmony_ci imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf); 844bf215546Sopenharmony_ci break; 845bf215546Sopenharmony_ci case GFX9: 846bf215546Sopenharmony_ci assert(lgkm == unset_counter || lgkm <= 0xf); 847bf215546Sopenharmony_ci assert(vm == unset_counter || vm <= 0x3f); 848bf215546Sopenharmony_ci imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf); 849bf215546Sopenharmony_ci break; 850bf215546Sopenharmony_ci default: 851bf215546Sopenharmony_ci assert(lgkm == unset_counter || lgkm <= 0xf); 852bf215546Sopenharmony_ci assert(vm == unset_counter || vm <= 0xf); 853bf215546Sopenharmony_ci imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf); 854bf215546Sopenharmony_ci break; 855bf215546Sopenharmony_ci } 856bf215546Sopenharmony_ci if (gfx_level < GFX9 && vm == wait_imm::unset_counter) 857bf215546Sopenharmony_ci imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the 858bf215546Sopenharmony_ci architecture when interpreting the immediate */ 859bf215546Sopenharmony_ci if (gfx_level < GFX10 && lgkm == wait_imm::unset_counter) 860bf215546Sopenharmony_ci imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the 861bf215546Sopenharmony_ci architecture when interpreting the immediate */ 862bf215546Sopenharmony_ci return imm; 863bf215546Sopenharmony_ci} 864bf215546Sopenharmony_ci 865bf215546Sopenharmony_cibool 866bf215546Sopenharmony_ciwait_imm::combine(const wait_imm& other) 867bf215546Sopenharmony_ci{ 868bf215546Sopenharmony_ci bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs; 869bf215546Sopenharmony_ci vm = std::min(vm, other.vm); 870bf215546Sopenharmony_ci exp = std::min(exp, other.exp); 871bf215546Sopenharmony_ci lgkm = std::min(lgkm, other.lgkm); 872bf215546Sopenharmony_ci vs = std::min(vs, other.vs); 873bf215546Sopenharmony_ci return changed; 874bf215546Sopenharmony_ci} 875bf215546Sopenharmony_ci 876bf215546Sopenharmony_cibool 877bf215546Sopenharmony_ciwait_imm::empty() const 878bf215546Sopenharmony_ci{ 879bf215546Sopenharmony_ci return vm == unset_counter && exp == unset_counter && lgkm == unset_counter && 880bf215546Sopenharmony_ci vs == unset_counter; 881bf215546Sopenharmony_ci} 882bf215546Sopenharmony_ci 883bf215546Sopenharmony_cibool 884bf215546Sopenharmony_cishould_form_clause(const Instruction* a, const Instruction* b) 885bf215546Sopenharmony_ci{ 886bf215546Sopenharmony_ci /* Vertex attribute loads from the same binding likely load from similar addresses */ 887bf215546Sopenharmony_ci unsigned a_vtx_binding = 888bf215546Sopenharmony_ci a->isMUBUF() ? a->mubuf().vtx_binding : (a->isMTBUF() ? a->mtbuf().vtx_binding : 0); 889bf215546Sopenharmony_ci unsigned b_vtx_binding = 890bf215546Sopenharmony_ci b->isMUBUF() ? b->mubuf().vtx_binding : (b->isMTBUF() ? b->mtbuf().vtx_binding : 0); 891bf215546Sopenharmony_ci if (a_vtx_binding && a_vtx_binding == b_vtx_binding) 892bf215546Sopenharmony_ci return true; 893bf215546Sopenharmony_ci 894bf215546Sopenharmony_ci if (a->format != b->format) 895bf215546Sopenharmony_ci return false; 896bf215546Sopenharmony_ci 897bf215546Sopenharmony_ci /* Assume loads which don't use descriptors might load from similar addresses. */ 898bf215546Sopenharmony_ci if (a->isFlatLike()) 899bf215546Sopenharmony_ci return true; 900bf215546Sopenharmony_ci if (a->isSMEM() && a->operands[0].bytes() == 8 && b->operands[0].bytes() == 8) 901bf215546Sopenharmony_ci return true; 902bf215546Sopenharmony_ci 903bf215546Sopenharmony_ci /* If they load from the same descriptor, assume they might load from similar 904bf215546Sopenharmony_ci * addresses. 905bf215546Sopenharmony_ci */ 906bf215546Sopenharmony_ci if (a->isVMEM() || a->isSMEM()) 907bf215546Sopenharmony_ci return a->operands[0].tempId() == b->operands[0].tempId(); 908bf215546Sopenharmony_ci 909bf215546Sopenharmony_ci return false; 910bf215546Sopenharmony_ci} 911bf215546Sopenharmony_ci 912bf215546Sopenharmony_ci} // namespace aco 913