/*
 * Copyright © 2020 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"

#include "aco_builder.h"

#include "util/debug.h"

#include "c11/threads.h"

namespace aco {

uint64_t debug_flags = 0;

static const struct debug_control aco_debug_options[] = {{"validateir", DEBUG_VALIDATE_IR},
                                                         {"validatera", DEBUG_VALIDATE_RA},
                                                         {"perfwarn", DEBUG_PERFWARN},
                                                         {"force-waitcnt", DEBUG_FORCE_WAITCNT},
                                                         {"novn", DEBUG_NO_VN},
                                                         {"noopt", DEBUG_NO_OPT},
                                                         {"nosched", DEBUG_NO_SCHED},
                                                         {"perfinfo", DEBUG_PERF_INFO},
                                                         {"liveinfo", DEBUG_LIVE_INFO},
                                                         {NULL, 0}};

static once_flag init_once_flag = ONCE_FLAG_INIT;

static void
init_once()
{
   debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options);

#ifndef NDEBUG
   /* enable some flags by default on debug builds */
   debug_flags |= aco::DEBUG_VALIDATE_IR;
#endif
}

void
init()
{
   call_once(&init_once_flag, init_once);
}

void
init_program(Program* program, Stage stage, const struct aco_shader_info* info,
             enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
             ac_shader_config* config)
{
   program->stage = stage;
   program->config = config;
   program->info = *info;
   program->gfx_level = gfx_level;
   if (family == CHIP_UNKNOWN) {
      switch (gfx_level) {
      case GFX6: program->family = CHIP_TAHITI; break;
      case GFX7: program->family = CHIP_BONAIRE; break;
      case GFX8: program->family = CHIP_POLARIS10; break;
      case GFX9: program->family = CHIP_VEGA10; break;
      case GFX10: program->family = CHIP_NAVI10; break;
      default: program->family = CHIP_UNKNOWN; break;
      }
   } else {
      program->family = family;
   }
   program->wave_size = info->wave_size;
   program->lane_mask = program->wave_size == 32 ? s1 : s2;

   program->dev.lds_encoding_granule =
      gfx_level >= GFX11 && stage == fragment_fs ? 1024 : gfx_level >= GFX7 ? 512 : 256;
   program->dev.lds_alloc_granule = gfx_level >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;
   program->dev.lds_limit = gfx_level >= GFX7 ? 65536 : 32768;
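   /* Note: the encoding granule is the unit of the LDS size field written to
    * the shader config, while the alloc granule is the unit in which the
    * hardware actually reserves LDS; on GFX10.3+ allocation (1024B) can be
    * coarser than the encoding. */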
   /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
   program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;

   program->dev.vgpr_limit = 256;
   program->dev.physical_vgprs = 256;
   program->dev.vgpr_alloc_granule = 4;

   if (gfx_level >= GFX10) {
      program->dev.physical_sgprs = 5120; /* doesn't matter as long as it's at least 128 * 40 */
      program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
      program->dev.sgpr_alloc_granule = 128;
      program->dev.sgpr_limit =
         108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */
      if (gfx_level == GFX10_3)
         program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
      else
         program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
   } else if (program->gfx_level >= GFX8) {
      program->dev.physical_sgprs = 800;
      program->dev.sgpr_alloc_granule = 16;
      program->dev.sgpr_limit = 102;
      if (family == CHIP_TONGA || family == CHIP_ICELAND)
         program->dev.sgpr_alloc_granule = 96; /* works around a hardware bug */
   } else {
      program->dev.physical_sgprs = 512;
      program->dev.sgpr_alloc_granule = 8;
      program->dev.sgpr_limit = 104;
   }

   program->dev.max_wave64_per_simd = 10;
   if (program->gfx_level >= GFX10_3)
      program->dev.max_wave64_per_simd = 16;
   else if (program->gfx_level == GFX10)
      program->dev.max_wave64_per_simd = 20;
   else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM)
      program->dev.max_wave64_per_simd = 8;

   program->dev.simd_per_cu = program->gfx_level >= GFX10 ? 2 : 4;

   switch (program->family) {
   /* GFX8 APUs */
   case CHIP_CARRIZO:
   case CHIP_STONEY:
   /* GFX9 APUs */
   case CHIP_RAVEN:
   case CHIP_RAVEN2:
   case CHIP_RENOIR: program->dev.xnack_enabled = true; break;
   default: break;
   }

   program->dev.sram_ecc_enabled = program->family == CHIP_ARCTURUS;
   /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
   program->dev.has_fast_fma32 = program->gfx_level >= GFX9;
   if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO ||
       program->family == CHIP_HAWAII)
      program->dev.has_fast_fma32 = true;
   program->dev.has_mac_legacy32 = program->gfx_level <= GFX7 || program->gfx_level >= GFX10;

   program->dev.fused_mad_mix = program->gfx_level >= GFX10;
   if (program->family == CHIP_VEGA12 || program->family == CHIP_VEGA20 ||
       program->family == CHIP_ARCTURUS || program->family == CHIP_ALDEBARAN)
      program->dev.fused_mad_mix = true;

   if (program->gfx_level >= GFX11) {
      program->dev.scratch_global_offset_min = -4096;
      program->dev.scratch_global_offset_max = 4095;
   } else if (program->gfx_level >= GFX10 || program->gfx_level == GFX8) {
      program->dev.scratch_global_offset_min = -2048;
      program->dev.scratch_global_offset_max = 2047;
   } else if (program->gfx_level == GFX9) {
      /* The minimum is actually -4096, but negative offsets are broken when SADDR is used. */
      program->dev.scratch_global_offset_min = 0;
      program->dev.scratch_global_offset_max = 4095;
   }
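   /* Background note (general GFX10+ behavior, not derived from this file):
    * wgp_mode records whether the shader runs in workgroup-processor (WGP)
    * mode, where a workgroup's waves are spread over both halves of a WGP,
    * instead of the older CU mode. */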
   program->wgp_mode = wgp_mode;

   program->progress = CompilationProgress::after_isel;

   program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
   program->next_fp_mode.must_flush_denorms32 = false;
   program->next_fp_mode.must_flush_denorms16_64 = false;
   program->next_fp_mode.care_about_round32 = false;
   program->next_fp_mode.care_about_round16_64 = false;
   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
   program->next_fp_mode.denorm32 = 0;
   program->next_fp_mode.round16_64 = fp_round_ne;
   program->next_fp_mode.round32 = fp_round_ne;
}

memory_sync_info
get_sync_info(const Instruction* instr)
{
   switch (instr->format) {
   case Format::SMEM: return instr->smem().sync;
   case Format::MUBUF: return instr->mubuf().sync;
   case Format::MIMG: return instr->mimg().sync;
   case Format::MTBUF: return instr->mtbuf().sync;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH: return instr->flatlike().sync;
   case Format::DS: return instr->ds().sync;
   default: return memory_sync_info();
   }
}

bool
can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra)
{
   if (!instr->isVALU())
      return false;

   if (gfx_level < GFX8 || gfx_level >= GFX11 || instr->isDPP() || instr->isVOP3P())
      return false;

   if (instr->isSDWA())
      return true;

   if (instr->isVOP3()) {
      VOP3_instruction& vop3 = instr->vop3();
      if (instr->format == Format::VOP3)
         return false;
      if (vop3.clamp && instr->isVOPC() && gfx_level != GFX8)
         return false;
      if (vop3.omod && gfx_level < GFX9)
         return false;

      // TODO: return true if we know we will use vcc
      if (!pre_ra && instr->definitions.size() >= 2)
         return false;

      for (unsigned i = 1; i < instr->operands.size(); i++) {
         if (instr->operands[i].isLiteral())
            return false;
         if (gfx_level < GFX9 && !instr->operands[i].isOfType(RegType::vgpr))
            return false;
      }
   }

   if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4 && !instr->isVOPC())
      return false;

   if (!instr->operands.empty()) {
      if (instr->operands[0].isLiteral())
         return false;
      if (gfx_level < GFX9 && !instr->operands[0].isOfType(RegType::vgpr))
         return false;
      if (instr->operands[0].bytes() > 4)
         return false;
      if (instr->operands.size() > 1 && instr->operands[1].bytes() > 4)
         return false;
   }

   bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
                 instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16;

   if (gfx_level != GFX8 && is_mac)
      return false;

   // TODO: return true if we know we will use vcc
   if (!pre_ra && instr->isVOPC() && gfx_level == GFX8)
      return false;
   if (!pre_ra && instr->operands.size() >= 3 && !is_mac)
      return false;

   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
}
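/* Example usage of convert_to_SDWA() below (an illustrative sketch, not taken
 * from a specific call site): after conversion the selects default to full
 * dword-aligned reads, so callers typically overwrite them, e.g. to read the
 * high 16-bit half of a 32-bit source:
 *   aco_ptr<Instruction> old = convert_to_SDWA(gfx_level, instr);
 *   instr->sdwa().sel[0] = SubdwordSel(2, 2, false); // 2 bytes at byte offset 2
 */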
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction>
convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
{
   if (instr->isSDWA())
      return NULL;

   aco_ptr<Instruction> tmp = std::move(instr);
   Format format =
      (Format)(((uint16_t)tmp->format & ~(uint16_t)Format::VOP3) | (uint16_t)Format::SDWA);
   instr.reset(create_instruction<SDWA_instruction>(tmp->opcode, format, tmp->operands.size(),
                                                    tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());

   SDWA_instruction& sdwa = instr->sdwa();

   if (tmp->isVOP3()) {
      VOP3_instruction& vop3 = tmp->vop3();
      memcpy(sdwa.neg, vop3.neg, sizeof(sdwa.neg));
      memcpy(sdwa.abs, vop3.abs, sizeof(sdwa.abs));
      sdwa.omod = vop3.omod;
      sdwa.clamp = vop3.clamp;
   }

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      /* SDWA only uses operands 0 and 1. */
      if (i >= 2)
         break;

      sdwa.sel[i] = SubdwordSel(instr->operands[i].bytes(), 0, false);
   }

   sdwa.dst_sel = SubdwordSel(instr->definitions[0].bytes(), 0, false);

   if (instr->definitions[0].getTemp().type() == RegType::sgpr && gfx_level == GFX8)
      instr->definitions[0].setFixed(vcc);
   if (instr->definitions.size() >= 2)
      instr->definitions[1].setFixed(vcc);
   if (instr->operands.size() >= 3)
      instr->operands[2].setFixed(vcc);

   instr->pass_flags = tmp->pass_flags;

   return tmp;
}

bool
can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8)
{
   assert(instr->isVALU() && !instr->operands.empty());

   if (instr->isDPP())
      return instr->isDPP8() == dpp8;

   if (instr->operands.size() && instr->operands[0].isLiteral())
      return false;

   if (instr->isSDWA())
      return false;

   if (!pre_ra && (instr->isVOPC() || instr->definitions.size() > 1) &&
       instr->definitions.back().physReg() != vcc)
      return false;

   if (!pre_ra && instr->operands.size() >= 3 && instr->operands[2].physReg() != vcc)
      return false;

   if (instr->isVOP3()) {
      const VOP3_instruction* vop3 = &instr->vop3();
      if (vop3->clamp || vop3->omod || vop3->opsel)
         return false;
      if (dpp8)
         return false;
      if (instr->format == Format::VOP3)
         return false;
      if (instr->operands.size() > 1 && !instr->operands[1].isOfType(RegType::vgpr))
         return false;
   }

   /* there are more cases but those all take 64-bit inputs */
   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_cvt_f64_i32 &&
          instr->opcode != aco_opcode::v_cvt_f64_f32 && instr->opcode != aco_opcode::v_cvt_f64_u32;
}
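/* The conversion below initializes a no-op swizzle: DPP8's lane_sel[i] = i and
 * DPP16's dpp_quad_perm(0, 1, 2, 3) both make each lane read its own data, so
 * callers are expected to overwrite these with the lane selection they
 * actually want. */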
aco_ptr<Instruction>
convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8)
{
   if (instr->isDPP())
      return NULL;

   aco_ptr<Instruction> tmp = std::move(instr);
   Format format = (Format)(((uint32_t)tmp->format & ~(uint32_t)Format::VOP3) |
                            (dpp8 ? (uint32_t)Format::DPP8 : (uint32_t)Format::DPP16));
   if (dpp8)
      instr.reset(create_instruction<DPP8_instruction>(tmp->opcode, format, tmp->operands.size(),
                                                       tmp->definitions.size()));
   else
      instr.reset(create_instruction<DPP16_instruction>(tmp->opcode, format, tmp->operands.size(),
                                                        tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   for (unsigned i = 0; i < instr->definitions.size(); i++)
      instr->definitions[i] = tmp->definitions[i];

   if (dpp8) {
      DPP8_instruction* dpp = &instr->dpp8();
      for (unsigned i = 0; i < 8; i++)
         dpp->lane_sel[i] = i;
   } else {
      DPP16_instruction* dpp = &instr->dpp16();
      dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
      dpp->row_mask = 0xf;
      dpp->bank_mask = 0xf;

      if (tmp->isVOP3()) {
         const VOP3_instruction* vop3 = &tmp->vop3();
         memcpy(dpp->neg, vop3->neg, sizeof(dpp->neg));
         memcpy(dpp->abs, vop3->abs, sizeof(dpp->abs));
      }
   }

   if (instr->isVOPC() || instr->definitions.size() > 1)
      instr->definitions.back().setFixed(vcc);

   if (instr->operands.size() >= 3)
      instr->operands[2].setFixed(vcc);

   instr->pass_flags = tmp->pass_flags;

   return tmp;
}

bool
can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx)
{
   /* opsel is only GFX9+ */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   case aco_opcode::v_div_fixup_f16:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_med3_i16:
   case aco_opcode::v_med3_u16:
   case aco_opcode::v_min3_f16:
   case aco_opcode::v_min3_i16:
   case aco_opcode::v_min3_u16:
   case aco_opcode::v_max3_f16:
   case aco_opcode::v_max3_i16:
   case aco_opcode::v_max3_u16:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_u16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_sub_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_sub_u16_e64:
   case aco_opcode::v_lshlrev_b16_e64:
   case aco_opcode::v_lshrrev_b16_e64:
   case aco_opcode::v_ashrrev_i16_e64:
   case aco_opcode::v_mul_lo_u16_e64: return true;
   case aco_opcode::v_pack_b32_f16:
   case aco_opcode::v_cvt_pknorm_i16_f16:
   case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1;
   case aco_opcode::v_mad_u32_u16:
   case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
   default: return false;
   }
}
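/* In can_use_opsel() above, idx is the operand index being queried and
 * idx == -1 asks about the definition: the packed-conversion opcodes support
 * opsel on their sources but not on the destination (hence "idx != -1"), and
 * v_mad_u32_u16/v_mad_i32_i16 only support it on operands 0 and 1. */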
bool
instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
{
   /* partial register writes are GFX9+ only */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   /* VOP3 */
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_div_fixup_f16:
   case aco_opcode::v_interp_p2_f16:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   /* VOP2 */
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_madak_f16:
   case aco_opcode::v_madmk_f16: return gfx_level >= GFX9;
   case aco_opcode::v_add_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_ldexp_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_fmaak_f16:
   /* VOP1 */
   case aco_opcode::v_cvt_f16_f32:
   case aco_opcode::v_cvt_f16_u16:
   case aco_opcode::v_cvt_f16_i16:
   case aco_opcode::v_rcp_f16:
   case aco_opcode::v_sqrt_f16:
   case aco_opcode::v_rsq_f16:
   case aco_opcode::v_log_f16:
   case aco_opcode::v_exp_f16:
   case aco_opcode::v_frexp_mant_f16:
   case aco_opcode::v_frexp_exp_i16_f16:
   case aco_opcode::v_floor_f16:
   case aco_opcode::v_ceil_f16:
   case aco_opcode::v_trunc_f16:
   case aco_opcode::v_rndne_f16:
   case aco_opcode::v_fract_f16:
   case aco_opcode::v_sin_f16:
   case aco_opcode::v_cos_f16: return gfx_level >= GFX10;
   // TODO: confirm whether these write 16 or 32 bit on GFX10+
   // case aco_opcode::v_cvt_u16_f16:
   // case aco_opcode::v_cvt_i16_f16:
   // case aco_opcode::p_cvt_f16_f32_rtne:
   // case aco_opcode::v_cvt_norm_i16_f16:
   // case aco_opcode::v_cvt_norm_u16_f16:
   /* on GFX10, all opsel instructions preserve the high bits */
   default: return gfx_level >= GFX10 && can_use_opsel(gfx_level, op, -1);
   }
}

uint32_t
get_reduction_identity(ReduceOp op, unsigned idx)
{
   switch (op) {
   case iadd8:
   case iadd16:
   case iadd32:
   case iadd64:
   case fadd16:
   case fadd32:
   case fadd64:
   case ior8:
   case ior16:
   case ior32:
   case ior64:
   case ixor8:
   case ixor16:
   case ixor32:
   case ixor64:
   case umax8:
   case umax16:
   case umax32:
   case umax64: return 0;
   case imul8:
   case imul16:
   case imul32:
   case imul64: return idx ? 0 : 1;
   case fmul16: return 0x3c00u;                /* 1.0 */
   case fmul32: return 0x3f800000u;            /* 1.0 */
   case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */
   case imin8: return INT8_MAX;
   case imin16: return INT16_MAX;
   case imin32: return INT32_MAX;
   case imin64: return idx ? 0x7fffffffu : 0xffffffffu;
   case imax8: return INT8_MIN;
   case imax16: return INT16_MIN;
   case imax32: return INT32_MIN;
   case imax64: return idx ? 0x80000000u : 0;
   case umin8:
   case umin16:
   case umin32:
   case umin64:
   case iand8:
   case iand16:
   case iand32:
   case iand64: return 0xffffffffu;
   case fmin16: return 0x7c00u;                /* infinity */
   case fmin32: return 0x7f800000u;            /* infinity */
   case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */
   case fmax16: return 0xfc00u;                /* negative infinity */
   case fmax32: return 0xff800000u;            /* negative infinity */
   case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */
   default: unreachable("Invalid reduction operation"); break;
   }
   return 0;
}
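/* Note on get_reduction_identity() above: the identities are raw bit patterns,
 * not typed values. For 64-bit reductions, idx == 0 selects the low dword and
 * idx == 1 the high dword of the 64-bit identity, e.g. fmin64's +infinity is
 * the double bit pattern 0x7ff0000000000000. */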
bool
needs_exec_mask(const Instruction* instr)
{
   if (instr->isVALU()) {
      return instr->opcode != aco_opcode::v_readlane_b32 &&
             instr->opcode != aco_opcode::v_readlane_b32_e64 &&
             instr->opcode != aco_opcode::v_writelane_b32 &&
             instr->opcode != aco_opcode::v_writelane_b32_e64;
   }

   if (instr->isVMEM() || instr->isFlatLike())
      return true;

   if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier())
      return instr->reads_exec();

   if (instr->isPseudo()) {
      switch (instr->opcode) {
      case aco_opcode::p_create_vector:
      case aco_opcode::p_extract_vector:
      case aco_opcode::p_split_vector:
      case aco_opcode::p_phi:
      case aco_opcode::p_parallelcopy:
         for (Definition def : instr->definitions) {
            if (def.getTemp().type() == RegType::vgpr)
               return true;
         }
         return instr->reads_exec();
      case aco_opcode::p_spill:
      case aco_opcode::p_reload:
      case aco_opcode::p_end_linear_vgpr:
      case aco_opcode::p_logical_start:
      case aco_opcode::p_logical_end:
      case aco_opcode::p_startpgm:
      case aco_opcode::p_init_scratch: return instr->reads_exec();
      default: break;
      }
   }

   return true;
}

struct CmpInfo {
   aco_opcode ordered;
   aco_opcode unordered;
   aco_opcode swapped;
   aco_opcode inverse;
   aco_opcode vcmpx;
   aco_opcode f32;
   unsigned size;
};
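/* Fills *info with the opcodes related to a comparison: its ordered/unordered
 * float variants, the opcode with swapped source operands, the logically
 * inverted opcode, the exec-writing v_cmpx form, the f32-sized equivalent and
 * the operand bit size. Fields that don't apply to an opcode are left as
 * num_opcodes (e.g. the integer comparisons never set ordered/unordered). */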
ALWAYS_INLINE bool
get_cmp_info(aco_opcode op, CmpInfo* info)
{
   info->ordered = aco_opcode::num_opcodes;
   info->unordered = aco_opcode::num_opcodes;
   info->swapped = aco_opcode::num_opcodes;
   info->inverse = aco_opcode::num_opcodes;
   info->f32 = aco_opcode::num_opcodes;
   switch (op) {
      // clang-format off
#define CMP2(ord, unord, ord_swap, unord_swap, sz)                                                 \
   case aco_opcode::v_cmp_##ord##_f##sz:                                                           \
   case aco_opcode::v_cmp_n##unord##_f##sz:                                                        \
      info->ordered = aco_opcode::v_cmp_##ord##_f##sz;                                             \
      info->unordered = aco_opcode::v_cmp_n##unord##_f##sz;                                        \
      info->swapped = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord_swap##_f##sz \
                                                            : aco_opcode::v_cmp_n##unord_swap##_f##sz; \
      info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \
                                                               : aco_opcode::v_cmp_n##ord##_f##sz; \
      info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32            \
                                                        : aco_opcode::v_cmp_n##unord##_f32;        \
      info->vcmpx = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmpx_##ord##_f##sz       \
                                                          : aco_opcode::v_cmpx_n##unord##_f##sz;   \
      info->size = sz;                                                                             \
      return true;
#define CMP(ord, unord, ord_swap, unord_swap)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 16)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 32)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 64)
      CMP(lt, /*n*/ge, gt, /*n*/le)
      CMP(eq, /*n*/lg, eq, /*n*/lg)
      CMP(le, /*n*/gt, ge, /*n*/lt)
      CMP(gt, /*n*/le, lt, /*n*/ge)
      CMP(lg, /*n*/eq, lg, /*n*/eq)
      CMP(ge, /*n*/lt, le, /*n*/gt)
#undef CMP
#undef CMP2
#define ORD_TEST(sz)                                                                               \
   case aco_opcode::v_cmp_u_f##sz:                                                                 \
      info->f32 = aco_opcode::v_cmp_u_f32;                                                         \
      info->swapped = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->inverse = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->vcmpx = aco_opcode::v_cmpx_u_f##sz;                                                    \
      info->size = sz;                                                                             \
      return true;                                                                                 \
   case aco_opcode::v_cmp_o_f##sz:                                                                 \
      info->f32 = aco_opcode::v_cmp_o_f32;                                                         \
      info->swapped = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->inverse = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->vcmpx = aco_opcode::v_cmpx_o_f##sz;                                                    \
      info->size = sz;                                                                             \
      return true;
      ORD_TEST(16)
      ORD_TEST(32)
      ORD_TEST(64)
#undef ORD_TEST
#define CMPI2(op, swap, inv, type, sz)                                                             \
   case aco_opcode::v_cmp_##op##_##type##sz:                                                       \
      info->swapped = aco_opcode::v_cmp_##swap##_##type##sz;                                       \
      info->inverse = aco_opcode::v_cmp_##inv##_##type##sz;                                        \
      info->vcmpx = aco_opcode::v_cmpx_##op##_##type##sz;                                          \
      info->size = sz;                                                                             \
      return true;
#define CMPI(op, swap, inv)                                                                        \
   CMPI2(op, swap, inv, i, 16)                                                                     \
   CMPI2(op, swap, inv, u, 16)                                                                     \
   CMPI2(op, swap, inv, i, 32)                                                                     \
   CMPI2(op, swap, inv, u, 32)                                                                     \
   CMPI2(op, swap, inv, i, 64)                                                                     \
   CMPI2(op, swap, inv, u, 64)
      CMPI(lt, gt, ge)
      CMPI(eq, eq, lg)
      CMPI(le, ge, gt)
      CMPI(gt, lt, le)
      CMPI(lg, lg, eq)
      CMPI(ge, le, lt)
#undef CMPI
#undef CMPI2
#define CMPCLASS(sz)                                                                               \
   case aco_opcode::v_cmp_class_f##sz:                                                             \
      info->vcmpx = aco_opcode::v_cmpx_class_f##sz;                                                \
      info->size = sz;                                                                             \
      return true;
      CMPCLASS(16)
      CMPCLASS(32)
      CMPCLASS(64)
#undef CMPCLASS
      // clang-format on
   default: return false;
   }
}

aco_opcode
get_ordered(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) ? info.ordered : aco_opcode::num_opcodes;
}

aco_opcode
get_unordered(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) ? info.unordered : aco_opcode::num_opcodes;
}

aco_opcode
get_inverse(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes;
}

aco_opcode
get_f32_cmp(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) ? info.f32 : aco_opcode::num_opcodes;
}

aco_opcode
get_vcmpx(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) ? info.vcmpx : aco_opcode::num_opcodes;
}

unsigned
get_cmp_bitsize(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) ? info.size : 0;
}
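/* Examples of the lookups above, derived from the CMP table (shown for
 * illustration): get_inverse(v_cmp_lt_f32) yields v_cmp_nlt_f32 (unordered or
 * not-less-than), while the swapped-operand form of v_cmp_lt_f32 used by
 * can_swap_operands() below is v_cmp_gt_f32. */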
bool
is_cmp(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes;
}

bool
can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op)
{
   if (instr->isDPP())
      return false;

   if (instr->operands[0].isConstant() ||
       (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr))
      return false;

   switch (instr->opcode) {
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::v_add_i32:
   case aco_opcode::v_add_f16:
   case aco_opcode::v_add_f32:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_or_b32:
   case aco_opcode::v_and_b32:
   case aco_opcode::v_xor_b32:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_min_f32:
   case aco_opcode::v_max_i32:
   case aco_opcode::v_min_i32:
   case aco_opcode::v_max_u32:
   case aco_opcode::v_min_u32:
   case aco_opcode::v_max_i16:
   case aco_opcode::v_min_i16:
   case aco_opcode::v_max_u16:
   case aco_opcode::v_min_u16:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_min_u16_e64: *new_op = instr->opcode; return true;
   case aco_opcode::v_sub_f16: *new_op = aco_opcode::v_subrev_f16; return true;
   case aco_opcode::v_sub_f32: *new_op = aco_opcode::v_subrev_f32; return true;
   case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true;
   case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true;
   case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true;
   default: {
      CmpInfo info;
      if (get_cmp_info(instr->opcode, &info) && info.swapped != aco_opcode::num_opcodes) {
         *new_op = info.swapped;
         return true;
      }
      return false;
   }
   }
}

wait_imm::wait_imm() : vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter)
{}
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
    : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_)
{}

wait_imm::wait_imm(enum amd_gfx_level gfx_level, uint16_t packed) : vs(unset_counter)
{
   vm = packed & 0xf;
   if (gfx_level >= GFX9)
      vm |= (packed >> 10) & 0x30;

   exp = (packed >> 4) & 0x7;

   lgkm = (packed >> 8) & 0xf;
   if (gfx_level >= GFX10)
      lgkm |= (packed >> 8) & 0x30;
}
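/* s_waitcnt immediate layouts handled by pack() below, with bit positions
 * derived from the shifts in each case:
 *   GFX6-8:  vm [3:0],  exp [6:4], lgkm [11:8]
 *   GFX9:    vm [3:0] plus vm[5:4] at [15:14], exp [6:4], lgkm [11:8]
 *   GFX10.x: vm [3:0] plus vm[5:4] at [15:14], exp [6:4], lgkm [13:8]
 *   GFX11:   exp [2:0], lgkm [9:4], vm [15:10]
 */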
uint16_t
wait_imm::pack(enum amd_gfx_level gfx_level) const
{
   uint16_t imm = 0;
   assert(exp == unset_counter || exp <= 0x7);
   switch (gfx_level) {
   case GFX11:
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x3f) << 10) | ((lgkm & 0x3f) << 4) | (exp & 0x7);
      break;
   case GFX10:
   case GFX10_3:
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
      break;
   case GFX9:
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
      break;
   default:
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0xf);
      imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
      break;
   }
   if (gfx_level < GFX9 && vm == wait_imm::unset_counter)
      imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   if (gfx_level < GFX10 && lgkm == wait_imm::unset_counter)
      imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   return imm;
}

bool
wait_imm::combine(const wait_imm& other)
{
   bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs;
   vm = std::min(vm, other.vm);
   exp = std::min(exp, other.exp);
   lgkm = std::min(lgkm, other.lgkm);
   vs = std::min(vs, other.vs);
   return changed;
}

bool
wait_imm::empty() const
{
   return vm == unset_counter && exp == unset_counter && lgkm == unset_counter &&
          vs == unset_counter;
}

bool
should_form_clause(const Instruction* a, const Instruction* b)
{
   /* Vertex attribute loads from the same binding likely load from similar addresses */
   unsigned a_vtx_binding =
      a->isMUBUF() ? a->mubuf().vtx_binding : (a->isMTBUF() ? a->mtbuf().vtx_binding : 0);
   unsigned b_vtx_binding =
      b->isMUBUF() ? b->mubuf().vtx_binding : (b->isMTBUF() ? b->mtbuf().vtx_binding : 0);
   if (a_vtx_binding && a_vtx_binding == b_vtx_binding)
      return true;

   if (a->format != b->format)
      return false;

   /* Assume loads which don't use descriptors might load from similar addresses. */
   if (a->isFlatLike())
      return true;
   if (a->isSMEM() && a->operands[0].bytes() == 8 && b->operands[0].bytes() == 8)
      return true;

   /* If they load from the same descriptor, assume they might load from similar
    * addresses.
    */
   if (a->isVMEM() || a->isSMEM())
      return a->operands[0].tempId() == b->operands[0].tempId();

   return false;
}

} // namespace aco