1/* 2 * Copyright © 2018 Valve Corporation 3 * Copyright © 2018 Google 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22 * IN THE SOFTWARE. 23 * 24 */ 25 26#include "aco_instruction_selection.h" 27 28#include "aco_builder.h" 29#include "aco_ir.h" 30#include "aco_interface.h" 31 32#include "common/ac_nir.h" 33#include "common/sid.h" 34 35#include "util/fast_idiv_by_const.h" 36#include "util/memstream.h" 37 38#include <array> 39#include <functional> 40#include <map> 41#include <numeric> 42#include <stack> 43#include <utility> 44#include <vector> 45 46namespace aco { 47namespace { 48 49#define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__) 50 51static void 52_isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr, 53 const char* msg) 54{ 55 char* out; 56 size_t outsize; 57 struct u_memstream mem; 58 u_memstream_open(&mem, &out, &outsize); 59 FILE* const memf = u_memstream_get(&mem); 60 61 fprintf(memf, "%s: ", msg); 62 nir_print_instr(instr, memf); 63 u_memstream_close(&mem); 64 65 _aco_err(ctx->program, file, line, out); 66 free(out); 67} 68 69struct if_context { 70 Temp cond; 71 72 bool divergent_old; 73 bool exec_potentially_empty_discard_old; 74 bool exec_potentially_empty_break_old; 75 uint16_t exec_potentially_empty_break_depth_old; 76 77 unsigned BB_if_idx; 78 unsigned invert_idx; 79 bool uniform_has_then_branch; 80 bool then_branch_divergent; 81 Block BB_invert; 82 Block BB_endif; 83}; 84 85struct loop_context { 86 Block loop_exit; 87 88 unsigned header_idx_old; 89 Block* exit_old; 90 bool divergent_cont_old; 91 bool divergent_branch_old; 92 bool divergent_if_old; 93}; 94 95static bool visit_cf_list(struct isel_context* ctx, struct exec_list* list); 96 97static void 98add_logical_edge(unsigned pred_idx, Block* succ) 99{ 100 succ->logical_preds.emplace_back(pred_idx); 101} 102 103static void 104add_linear_edge(unsigned pred_idx, Block* succ) 105{ 106 succ->linear_preds.emplace_back(pred_idx); 107} 108 109static void 110add_edge(unsigned pred_idx, Block* succ) 111{ 112 add_logical_edge(pred_idx, succ); 113 add_linear_edge(pred_idx, succ); 114} 115 116static void 117append_logical_start(Block* b) 118{ 119 Builder(NULL, b).pseudo(aco_opcode::p_logical_start); 120} 121 122static void 123append_logical_end(Block* b) 124{ 125 Builder(NULL, b).pseudo(aco_opcode::p_logical_end); 126} 127 128Temp 129get_ssa_temp(struct isel_context* ctx, nir_ssa_def* def) 130{ 131 uint32_t id = ctx->first_temp_id + 
def->index; 132 return Temp(id, ctx->program->temp_rc[id]); 133} 134 135Temp 136emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero()) 137{ 138 Builder bld(ctx->program, ctx->block); 139 assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec)); 140 assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes()); 141 142 if (ctx->program->wave_size == 32) { 143 Operand mask_lo = mask.isUndefined() ? Operand::c32(-1u) : mask; 144 return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base); 145 } 146 147 Operand mask_lo = Operand::c32(-1u); 148 Operand mask_hi = Operand::c32(-1u); 149 150 if (mask.isTemp()) { 151 RegClass rc = RegClass(mask.regClass().type(), 1); 152 Builder::Result mask_split = 153 bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask); 154 mask_lo = Operand(mask_split.def(0).getTemp()); 155 mask_hi = Operand(mask_split.def(1).getTemp()); 156 } else if (mask.physReg() == exec) { 157 mask_lo = Operand(exec_lo, s1); 158 mask_hi = Operand(exec_hi, s1); 159 } 160 161 Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base); 162 163 if (ctx->program->gfx_level <= GFX7) 164 return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo); 165 else 166 return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo); 167} 168 169Temp 170emit_wqm(Builder& bld, Temp src, Temp dst = Temp(0, s1), bool program_needs_wqm = false) 171{ 172 if (bld.program->stage != fragment_fs) { 173 if (!dst.id()) 174 return src; 175 else 176 return bld.copy(Definition(dst), src); 177 } else if (!dst.id()) { 178 dst = bld.tmp(src.regClass()); 179 } 180 181 assert(src.size() == dst.size()); 182 bld.pseudo(aco_opcode::p_wqm, Definition(dst), src); 183 bld.program->needs_wqm |= program_needs_wqm; 184 return dst; 185} 186 187static Temp 188emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data) 189{ 190 if (index.regClass() == s1) 191 return bld.readlane(bld.def(s1), data, index); 192 193 if (ctx->options->gfx_level <= GFX7) { 194 /* GFX6-7: there is no bpermute instruction */ 195 Operand index_op(index); 196 Operand input_data(data); 197 index_op.setLateKill(true); 198 input_data.setLateKill(true); 199 200 return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), 201 index_op, input_data); 202 } else if (ctx->options->gfx_level >= GFX10 && ctx->program->wave_size == 64) { 203 204 /* GFX10 wave64 mode: emulate full-wave bpermute */ 205 Temp index_is_lo = 206 bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index); 207 Builder::Result index_is_lo_split = 208 bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo); 209 Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), 210 index_is_lo_split.def(1).getTemp()); 211 Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), 212 index_is_lo_split.def(0).getTemp(), index_is_lo_n1); 213 Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index); 214 Operand input_data(data); 215 216 index_x4.setLateKill(true); 217 input_data.setLateKill(true); 218 same_half.setLateKill(true); 219 220 /* We need one pair of shared VGPRs: 221 * Note, that these have twice the allocation granularity of normal VGPRs */ 222 ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule; 223 224 return 
bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), 225 index_x4, input_data, same_half); 226 } else { 227 /* GFX8-9 or GFX10 wave32: bpermute works normally */ 228 Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index); 229 return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data); 230 } 231} 232 233static Temp 234emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask) 235{ 236 if (ctx->options->gfx_level >= GFX8) { 237 unsigned and_mask = mask & 0x1f; 238 unsigned or_mask = (mask >> 5) & 0x1f; 239 unsigned xor_mask = (mask >> 10) & 0x1f; 240 241 uint16_t dpp_ctrl = 0xffff; 242 243 if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) { 244 unsigned res[4] = {0, 1, 2, 3}; 245 for (unsigned i = 0; i < 4; i++) 246 res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3; 247 dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]); 248 } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) { 249 dpp_ctrl = dpp_row_rr(8); 250 } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) { 251 dpp_ctrl = dpp_row_mirror; 252 } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) { 253 dpp_ctrl = dpp_row_half_mirror; 254 } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x18) == 0x18 && or_mask < 8 && 255 xor_mask < 8) { 256 // DPP8 comes last, as it does not allow several modifiers like `abs` that are available with DPP16 257 Builder::Result ret = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), src); 258 for (unsigned i = 0; i < 8; i++) { 259 ret.instr->dpp8().lane_sel[i] = (((i & and_mask) | or_mask) ^ xor_mask) & 0x7; 260 } 261 return ret; 262 } 263 264 if (dpp_ctrl != 0xffff) 265 return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl); 266 } 267 268 return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false); 269} 270 271Temp 272as_vgpr(Builder& bld, Temp val) 273{ 274 if (val.type() == RegType::sgpr) 275 return bld.copy(bld.def(RegType::vgpr, val.size()), val); 276 assert(val.type() == RegType::vgpr); 277 return val; 278} 279 280Temp 281as_vgpr(isel_context* ctx, Temp val) 282{ 283 Builder bld(ctx->program, ctx->block); 284 return as_vgpr(bld, val); 285} 286 287// assumes a != 0xffffffff 288void 289emit_v_div_u32(isel_context* ctx, Temp dst, Temp a, uint32_t b) 290{ 291 assert(b != 0); 292 Builder bld(ctx->program, ctx->block); 293 294 if (util_is_power_of_two_or_zero(b)) { 295 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(util_logbase2(b)), a); 296 return; 297 } 298 299 util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32); 300 301 assert(info.multiplier <= 0xffffffff); 302 303 bool pre_shift = info.pre_shift != 0; 304 bool increment = info.increment != 0; 305 bool multiply = true; 306 bool post_shift = info.post_shift != 0; 307 308 if (!pre_shift && !increment && !multiply && !post_shift) { 309 bld.copy(Definition(dst), a); 310 return; 311 } 312 313 Temp pre_shift_dst = a; 314 if (pre_shift) { 315 pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst; 316 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand::c32(info.pre_shift), 317 a); 318 } 319 320 Temp increment_dst = pre_shift_dst; 321 if (increment) { 322 increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst; 323 bld.vadd32(Definition(increment_dst), Operand::c32(info.increment), pre_shift_dst); 324 } 325 326 Temp multiply_dst = increment_dst; 327 if (multiply) { 328 multiply_dst = post_shift ? 
bld.tmp(v1) : dst; 329 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst, 330 bld.copy(bld.def(v1), Operand::c32(info.multiplier))); 331 } 332 333 if (post_shift) { 334 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(info.post_shift), 335 multiply_dst); 336 } 337} 338 339void 340emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst) 341{ 342 Builder bld(ctx->program, ctx->block); 343 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx)); 344} 345 346Temp 347emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc) 348{ 349 /* no need to extract the whole vector */ 350 if (src.regClass() == dst_rc) { 351 assert(idx == 0); 352 return src; 353 } 354 355 assert(src.bytes() > (idx * dst_rc.bytes())); 356 Builder bld(ctx->program, ctx->block); 357 auto it = ctx->allocated_vec.find(src.id()); 358 if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) { 359 if (it->second[idx].regClass() == dst_rc) { 360 return it->second[idx]; 361 } else { 362 assert(!dst_rc.is_subdword()); 363 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr); 364 return bld.copy(bld.def(dst_rc), it->second[idx]); 365 } 366 } 367 368 if (dst_rc.is_subdword()) 369 src = as_vgpr(ctx, src); 370 371 if (src.bytes() == dst_rc.bytes()) { 372 assert(idx == 0); 373 return bld.copy(bld.def(dst_rc), src); 374 } else { 375 Temp dst = bld.tmp(dst_rc); 376 emit_extract_vector(ctx, src, idx, dst); 377 return dst; 378 } 379} 380 381void 382emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components) 383{ 384 if (num_components == 1) 385 return; 386 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end()) 387 return; 388 RegClass rc; 389 if (num_components > vec_src.size()) { 390 if (vec_src.type() == RegType::sgpr) { 391 /* should still help get_alu_src() */ 392 emit_split_vector(ctx, vec_src, vec_src.size()); 393 return; 394 } 395 /* sub-dword split */ 396 rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword(); 397 } else { 398 rc = RegClass(vec_src.type(), vec_src.size() / num_components); 399 } 400 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>( 401 aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)}; 402 split->operands[0] = Operand(vec_src); 403 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems; 404 for (unsigned i = 0; i < num_components; i++) { 405 elems[i] = ctx->program->allocateTmp(rc); 406 split->definitions[i] = Definition(elems[i]); 407 } 408 ctx->block->instructions.emplace_back(std::move(split)); 409 ctx->allocated_vec.emplace(vec_src.id(), elems); 410} 411 412/* This vector expansion uses a mask to determine which elements in the new vector 413 * come from the original vector. The other elements are undefined. 
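 * For example, with num_components == 4 and mask == 0b0101, dst elements 0 and 2 take the first
 * two components of vec_src, while elements 1 and 3 are undefined (or zero when zero_padding is set).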
*/ 414void 415expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask, 416 bool zero_padding = false) 417{ 418 assert(vec_src.type() == RegType::vgpr); 419 Builder bld(ctx->program, ctx->block); 420 421 if (dst.type() == RegType::sgpr && num_components > dst.size()) { 422 Temp tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, 2 * num_components)); 423 expand_vector(ctx, vec_src, tmp_dst, num_components, mask, zero_padding); 424 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp_dst); 425 ctx->allocated_vec[dst.id()] = ctx->allocated_vec[tmp_dst.id()]; 426 return; 427 } 428 429 emit_split_vector(ctx, vec_src, util_bitcount(mask)); 430 431 if (vec_src == dst) 432 return; 433 434 if (num_components == 1) { 435 if (dst.type() == RegType::sgpr) 436 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src); 437 else 438 bld.copy(Definition(dst), vec_src); 439 return; 440 } 441 442 unsigned component_bytes = dst.bytes() / num_components; 443 RegClass src_rc = RegClass::get(RegType::vgpr, component_bytes); 444 RegClass dst_rc = RegClass::get(dst.type(), component_bytes); 445 assert(dst.type() == RegType::vgpr || !src_rc.is_subdword()); 446 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems; 447 448 Temp padding = Temp(0, dst_rc); 449 if (zero_padding) 450 padding = bld.copy(bld.def(dst_rc), Operand::zero(component_bytes)); 451 452 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 453 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; 454 vec->definitions[0] = Definition(dst); 455 unsigned k = 0; 456 for (unsigned i = 0; i < num_components; i++) { 457 if (mask & (1 << i)) { 458 Temp src = emit_extract_vector(ctx, vec_src, k++, src_rc); 459 if (dst.type() == RegType::sgpr) 460 src = bld.as_uniform(src); 461 vec->operands[i] = Operand(src); 462 elems[i] = src; 463 } else { 464 vec->operands[i] = Operand::zero(component_bytes); 465 elems[i] = padding; 466 } 467 } 468 ctx->block->instructions.emplace_back(std::move(vec)); 469 ctx->allocated_vec.emplace(dst.id(), elems); 470} 471 472/* adjust misaligned small bit size loads */ 473void 474byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst) 475{ 476 Builder bld(ctx->program, ctx->block); 477 Operand shift; 478 Temp select = Temp(); 479 if (offset.isConstant()) { 480 assert(offset.constantValue() && offset.constantValue() < 4); 481 shift = Operand::c32(offset.constantValue() * 8); 482 } else { 483 /* bit_offset = 8 * (offset & 0x3) */ 484 Temp tmp = 485 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand::c32(3u)); 486 select = bld.tmp(s1); 487 shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, 488 Operand::c32(3u)); 489 } 490 491 if (vec.size() == 1) { 492 bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift); 493 } else if (vec.size() == 2) { 494 Temp tmp = dst.size() == 2 ? 
dst : bld.tmp(s2); 495 bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift); 496 if (tmp == dst) 497 emit_split_vector(ctx, dst, 2); 498 else 499 emit_extract_vector(ctx, tmp, 0, dst); 500 } else if (vec.size() == 3 || vec.size() == 4) { 501 Temp lo = bld.tmp(s2), hi; 502 if (vec.size() == 3) { 503 /* this can happen if we use VMEM for a uniform load */ 504 hi = bld.tmp(s1); 505 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec); 506 } else { 507 hi = bld.tmp(s2); 508 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec); 509 hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand::zero()); 510 } 511 if (select != Temp()) 512 hi = 513 bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand::zero(), bld.scc(select)); 514 lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift); 515 Temp mid = bld.tmp(s1); 516 lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo); 517 hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift); 518 mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid); 519 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid); 520 emit_split_vector(ctx, dst, 2); 521 } 522} 523 524void 525byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size) 526{ 527 Builder bld(ctx->program, ctx->block); 528 if (offset.isTemp()) { 529 Temp tmp[4] = {vec, vec, vec, vec}; 530 531 if (vec.size() == 4) { 532 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1); 533 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), 534 Definition(tmp[2]), Definition(tmp[3]), vec); 535 } else if (vec.size() == 3) { 536 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1); 537 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), 538 Definition(tmp[2]), vec); 539 } else if (vec.size() == 2) { 540 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1]; 541 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec); 542 } 543 for (unsigned i = 0; i < dst.size(); i++) 544 tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset); 545 546 vec = tmp[0]; 547 if (dst.size() == 2) 548 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]); 549 550 offset = Operand::zero(); 551 } 552 553 unsigned num_components = vec.bytes() / component_size; 554 if (vec.regClass() == dst.regClass()) { 555 assert(offset.constantValue() == 0); 556 bld.copy(Definition(dst), vec); 557 emit_split_vector(ctx, dst, num_components); 558 return; 559 } 560 561 emit_split_vector(ctx, vec, num_components); 562 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems; 563 RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword(); 564 565 assert(offset.constantValue() % component_size == 0); 566 unsigned skip = offset.constantValue() / component_size; 567 for (unsigned i = skip; i < num_components; i++) 568 elems[i - skip] = emit_extract_vector(ctx, vec, i, rc); 569 570 if (dst.type() == RegType::vgpr) { 571 /* if dst is vgpr - split the src and create a shrunk version according to the mask. 
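 * (i.e. drop the first `skip` components that sit below the byte offset and repack the remaining
 * components into dst)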
*/ 572 num_components = dst.bytes() / component_size; 573 aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>( 574 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; 575 for (unsigned i = 0; i < num_components; i++) 576 create_vec->operands[i] = Operand(elems[i]); 577 create_vec->definitions[0] = Definition(dst); 578 bld.insert(std::move(create_vec)); 579 580 } else if (skip) { 581 /* if dst is sgpr - split the src, but move the original to sgpr. */ 582 vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec); 583 byte_align_scalar(ctx, vec, offset, dst); 584 } else { 585 assert(dst.size() == vec.size()); 586 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec); 587 } 588 589 ctx->allocated_vec.emplace(dst.id(), elems); 590} 591 592Temp 593get_ssa_temp_tex(struct isel_context* ctx, nir_ssa_def* def, bool is_16bit) 594{ 595 RegClass rc = RegClass::get(RegType::vgpr, (is_16bit ? 2 : 4) * def->num_components); 596 Temp tmp = get_ssa_temp(ctx, def); 597 if (tmp.bytes() != rc.bytes()) 598 return emit_extract_vector(ctx, tmp, 0, rc); 599 else 600 return tmp; 601} 602 603Temp 604bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2)) 605{ 606 Builder bld(ctx->program, ctx->block); 607 if (!dst.id()) 608 dst = bld.tmp(bld.lm); 609 610 assert(val.regClass() == s1); 611 assert(dst.regClass() == bld.lm); 612 613 return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(), 614 bld.scc(val)); 615} 616 617Temp 618bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1)) 619{ 620 Builder bld(ctx->program, ctx->block); 621 if (!dst.id()) 622 dst = bld.tmp(s1); 623 624 assert(val.regClass() == bld.lm); 625 assert(dst.regClass() == s1); 626 627 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */ 628 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(dst)), val, Operand(exec, bld.lm)); 629 return dst; 630} 631 632/** 633 * Copies the first src_bits of the input to the output Temp. Input bits at positions larger than 634 * src_bits and dst_bits are truncated. 635 * 636 * Sign extension may be applied using the sign_extend parameter. The position of the input sign 637 * bit is indicated by src_bits in this case. 638 * 639 * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined. 640 */ 641Temp 642convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, 643 bool sign_extend, Temp dst = Temp()) 644{ 645 assert(!(sign_extend && dst_bits < src_bits) && 646 "Shrinking integers is not supported for signed inputs"); 647 648 if (!dst.id()) { 649 if (dst_bits % 32 == 0 || src.type() == RegType::sgpr) 650 dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u)); 651 else 652 dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword()); 653 } 654 655 assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8); 656 assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8); 657 658 if (dst.bytes() == src.bytes() && dst_bits < src_bits) { 659 /* Copy the raw value, leaving an undefined value in the upper bits for 660 * the caller to handle appropriately */ 661 return bld.copy(Definition(dst), src); 662 } else if (dst.bytes() < src.bytes()) { 663 return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero()); 664 } 665 666 Temp tmp = dst; 667 if (dst_bits == 64) 668 tmp = src_bits == 32 ? 
src : bld.tmp(src.type(), 1); 669 670 if (tmp == src) { 671 } else if (src.regClass() == s1) { 672 assert(src_bits < 32); 673 bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(), 674 Operand::c32(src_bits), Operand::c32((unsigned)sign_extend)); 675 } else { 676 assert(src_bits < 32); 677 bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(), Operand::c32(src_bits), 678 Operand::c32((unsigned)sign_extend)); 679 } 680 681 if (dst_bits == 64) { 682 if (sign_extend && dst.regClass() == s2) { 683 Temp high = 684 bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u)); 685 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high); 686 } else if (sign_extend && dst.regClass() == v2) { 687 Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp); 688 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high); 689 } else { 690 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero()); 691 } 692 } 693 694 return dst; 695} 696 697enum sgpr_extract_mode { 698 sgpr_extract_sext, 699 sgpr_extract_zext, 700 sgpr_extract_undef, 701}; 702 703Temp 704extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode) 705{ 706 Temp vec = get_ssa_temp(ctx, src->src.ssa); 707 unsigned src_size = src->src.ssa->bit_size; 708 unsigned swizzle = src->swizzle[0]; 709 710 if (vec.size() > 1) { 711 assert(src_size == 16); 712 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1); 713 swizzle = swizzle & 1; 714 } 715 716 Builder bld(ctx->program, ctx->block); 717 Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst; 718 719 if (mode == sgpr_extract_undef && swizzle == 0) 720 bld.copy(Definition(tmp), vec); 721 else 722 bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec), 723 Operand::c32(swizzle), Operand::c32(src_size), 724 Operand::c32((mode == sgpr_extract_sext))); 725 726 if (dst.regClass() == s2) 727 convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst); 728 729 return dst; 730} 731 732Temp 733get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1) 734{ 735 if (src.src.ssa->num_components == 1 && size == 1) 736 return get_ssa_temp(ctx, src.src.ssa); 737 738 Temp vec = get_ssa_temp(ctx, src.src.ssa); 739 unsigned elem_size = src.src.ssa->bit_size / 8u; 740 bool identity_swizzle = true; 741 742 for (unsigned i = 0; identity_swizzle && i < size; i++) { 743 if (src.swizzle[i] != i) 744 identity_swizzle = false; 745 } 746 if (identity_swizzle) 747 return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size)); 748 749 assert(elem_size > 0); 750 assert(vec.bytes() % elem_size == 0); 751 752 if (elem_size < 4 && vec.type() == RegType::sgpr && size == 1) { 753 assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16); 754 return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src, 755 sgpr_extract_undef); 756 } 757 758 bool as_uniform = elem_size < 4 && vec.type() == RegType::sgpr; 759 if (as_uniform) 760 vec = as_vgpr(ctx, vec); 761 762 RegClass elem_rc = elem_size < 4 ? 
RegClass(vec.type(), elem_size).as_subdword() 763 : RegClass(vec.type(), elem_size / 4); 764 if (size == 1) { 765 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc); 766 } else { 767 assert(size <= 4); 768 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems; 769 aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>( 770 aco_opcode::p_create_vector, Format::PSEUDO, size, 1)}; 771 for (unsigned i = 0; i < size; ++i) { 772 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc); 773 vec_instr->operands[i] = Operand{elems[i]}; 774 } 775 Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4)); 776 vec_instr->definitions[0] = Definition(dst); 777 ctx->block->instructions.emplace_back(std::move(vec_instr)); 778 ctx->allocated_vec.emplace(dst.id(), elems); 779 return vec.type() == RegType::sgpr ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst; 780 } 781} 782 783Temp 784get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src) 785{ 786 /* returns v2b or v1 for vop3p usage. 787 * The source expects exactly two 16-bit components 788 * which are within the same dword 789 */ 790 assert(src.src.ssa->bit_size == 16); 791 assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1); 792 793 Temp tmp = get_ssa_temp(ctx, src.src.ssa); 794 if (tmp.size() == 1) 795 return tmp; 796 797 /* the size is larger than 1 dword: check the swizzle */ 798 unsigned dword = src.swizzle[0] >> 1; 799 800 /* extract a full dword if possible */ 801 if (tmp.bytes() >= (dword + 1) * 4) { 802 /* if the source is split into components, use p_create_vector */ 803 auto it = ctx->allocated_vec.find(tmp.id()); 804 if (it != ctx->allocated_vec.end()) { 805 unsigned index = dword << 1; 806 Builder bld(ctx->program, ctx->block); 807 if (it->second[index].regClass() == v2b) 808 return bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), it->second[index], 809 it->second[index + 1]); 810 } 811 return emit_extract_vector(ctx, tmp, dword, v1); 812 } else { 813 /* This must be a swizzled access to %a.zz where %a is v6b */ 814 assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0); 815 assert(tmp.regClass() == v6b && dword == 1); 816 return emit_extract_vector(ctx, tmp, dword * 2, v2b); 817 } 818} 819 820uint32_t 821get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx) 822{ 823 nir_ssa_scalar scalar = 824 nir_ssa_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]}; 825 return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config); 826} 827 828Temp 829convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false) 830{ 831 if (ptr.size() == 2) 832 return ptr; 833 Builder bld(ctx->program, ctx->block); 834 if (ptr.type() == RegType::vgpr && !non_uniform) 835 ptr = bld.as_uniform(ptr); 836 return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr, 837 Operand::c32((unsigned)ctx->options->address32_hi)); 838} 839 840void 841emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, 842 bool writes_scc, uint8_t uses_ub = 0) 843{ 844 aco_ptr<SOP2_instruction> sop2{ 845 create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 
2 : 1)}; 846 sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0])); 847 sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1])); 848 sop2->definitions[0] = Definition(dst); 849 if (instr->no_unsigned_wrap) 850 sop2->definitions[0].setNUW(true); 851 if (writes_scc) 852 sop2->definitions[1] = Definition(ctx->program->allocateId(s1), scc, s1); 853 854 for (int i = 0; i < 2; i++) { 855 if (uses_ub & (1 << i)) { 856 uint32_t src_ub = get_alu_src_ub(ctx, instr, i); 857 if (src_ub <= 0xffff) 858 sop2->operands[i].set16bit(true); 859 else if (src_ub <= 0xffffff) 860 sop2->operands[i].set24bit(true); 861 } 862 } 863 864 ctx->block->instructions.emplace_back(std::move(sop2)); 865} 866 867void 868emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst, 869 bool commutative, bool swap_srcs = false, bool flush_denorms = false, 870 bool nuw = false, uint8_t uses_ub = 0) 871{ 872 Builder bld(ctx->program, ctx->block); 873 bld.is_precise = instr->exact; 874 875 Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]); 876 Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]); 877 if (src1.type() == RegType::sgpr) { 878 if (commutative && src0.type() == RegType::vgpr) { 879 Temp t = src0; 880 src0 = src1; 881 src1 = t; 882 } else { 883 src1 = as_vgpr(ctx, src1); 884 } 885 } 886 887 Operand op[2] = {Operand(src0), Operand(src1)}; 888 889 for (int i = 0; i < 2; i++) { 890 if (uses_ub & (1 << i)) { 891 uint32_t src_ub = get_alu_src_ub(ctx, instr, swap_srcs ? !i : i); 892 if (src_ub <= 0xffff) 893 op[i].set16bit(true); 894 else if (src_ub <= 0xffffff) 895 op[i].set24bit(true); 896 } 897 } 898 899 if (flush_denorms && ctx->program->gfx_level < GFX9) { 900 assert(dst.size() == 1); 901 Temp tmp = bld.vop2(opc, bld.def(v1), op[0], op[1]); 902 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp); 903 } else { 904 if (nuw) { 905 bld.nuw().vop2(opc, Definition(dst), op[0], op[1]); 906 } else { 907 bld.vop2(opc, Definition(dst), op[0], op[1]); 908 } 909 } 910} 911 912void 913emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) 914{ 915 Builder bld(ctx->program, ctx->block); 916 bld.is_precise = instr->exact; 917 918 Temp src0 = get_alu_src(ctx, instr->src[0]); 919 Temp src1 = get_alu_src(ctx, instr->src[1]); 920 921 if (src1.type() == RegType::sgpr) { 922 assert(src0.type() == RegType::vgpr); 923 std::swap(src0, src1); 924 } 925 926 Temp src00 = bld.tmp(src0.type(), 1); 927 Temp src01 = bld.tmp(src0.type(), 1); 928 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); 929 Temp src10 = bld.tmp(v1); 930 Temp src11 = bld.tmp(v1); 931 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); 932 Temp lo = bld.vop2(op, bld.def(v1), src00, src10); 933 Temp hi = bld.vop2(op, bld.def(v1), src01, src11); 934 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); 935} 936 937void 938emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, 939 bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false) 940{ 941 assert(num_sources == 2 || num_sources == 3); 942 Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)}; 943 bool has_sgpr = false; 944 for (unsigned i = 0; i < num_sources; i++) { 945 src[i] = get_alu_src(ctx, instr->src[swap_srcs ? 
1 - i : i]); 946 if (has_sgpr) 947 src[i] = as_vgpr(ctx, src[i]); 948 else 949 has_sgpr = src[i].type() == RegType::sgpr; 950 } 951 952 Builder bld(ctx->program, ctx->block); 953 bld.is_precise = instr->exact; 954 if (flush_denorms && ctx->program->gfx_level < GFX9) { 955 Temp tmp; 956 if (num_sources == 3) 957 tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]); 958 else 959 tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]); 960 if (dst.size() == 1) 961 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp); 962 else 963 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand::c64(0x3FF0000000000000), tmp); 964 } else if (num_sources == 3) { 965 bld.vop3(op, Definition(dst), src[0], src[1], src[2]); 966 } else { 967 bld.vop3(op, Definition(dst), src[0], src[1]); 968 } 969} 970 971Builder::Result 972emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, 973 bool swap_srcs = false) 974{ 975 Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]); 976 Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]); 977 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr) 978 src1 = as_vgpr(ctx, src1); 979 assert(instr->dest.dest.ssa.num_components == 2); 980 981 /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */ 982 unsigned opsel_lo = 983 (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1); 984 unsigned opsel_hi = 985 (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1); 986 987 Builder bld(ctx->program, ctx->block); 988 bld.is_precise = instr->exact; 989 Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi); 990 return res; 991} 992 993void 994emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp) 995{ 996 Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)}; 997 bool has_sgpr = false; 998 for (unsigned i = 0; i < 3; i++) { 999 src[i] = get_alu_src(ctx, instr->src[i]); 1000 if (has_sgpr) 1001 src[i] = as_vgpr(ctx, src[i]); 1002 else 1003 has_sgpr = src[i].type() == RegType::sgpr; 1004 } 1005 1006 Builder bld(ctx->program, ctx->block); 1007 bld.is_precise = instr->exact; 1008 bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7).instr->vop3p().clamp = clamp; 1009} 1010 1011void 1012emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) 1013{ 1014 Builder bld(ctx->program, ctx->block); 1015 bld.is_precise = instr->exact; 1016 if (dst.type() == RegType::sgpr) 1017 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), 1018 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0]))); 1019 else 1020 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0])); 1021} 1022 1023void 1024emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) 1025{ 1026 Temp src0 = get_alu_src(ctx, instr->src[0]); 1027 Temp src1 = get_alu_src(ctx, instr->src[1]); 1028 assert(src0.size() == src1.size()); 1029 1030 aco_ptr<Instruction> vopc; 1031 if (src1.type() == RegType::sgpr) { 1032 if (src0.type() == RegType::vgpr) { 1033 /* to swap the operands, we might also have to change the opcode */ 1034 switch (op) { 1035 case aco_opcode::v_cmp_lt_f16: op = aco_opcode::v_cmp_gt_f16; break; 1036 case aco_opcode::v_cmp_ge_f16: op = aco_opcode::v_cmp_le_f16; break; 1037 case aco_opcode::v_cmp_lt_i16: op = aco_opcode::v_cmp_gt_i16; break; 1038 case aco_opcode::v_cmp_ge_i16: op = 
aco_opcode::v_cmp_le_i16; break; 1039 case aco_opcode::v_cmp_lt_u16: op = aco_opcode::v_cmp_gt_u16; break; 1040 case aco_opcode::v_cmp_ge_u16: op = aco_opcode::v_cmp_le_u16; break; 1041 case aco_opcode::v_cmp_lt_f32: op = aco_opcode::v_cmp_gt_f32; break; 1042 case aco_opcode::v_cmp_ge_f32: op = aco_opcode::v_cmp_le_f32; break; 1043 case aco_opcode::v_cmp_lt_i32: op = aco_opcode::v_cmp_gt_i32; break; 1044 case aco_opcode::v_cmp_ge_i32: op = aco_opcode::v_cmp_le_i32; break; 1045 case aco_opcode::v_cmp_lt_u32: op = aco_opcode::v_cmp_gt_u32; break; 1046 case aco_opcode::v_cmp_ge_u32: op = aco_opcode::v_cmp_le_u32; break; 1047 case aco_opcode::v_cmp_lt_f64: op = aco_opcode::v_cmp_gt_f64; break; 1048 case aco_opcode::v_cmp_ge_f64: op = aco_opcode::v_cmp_le_f64; break; 1049 case aco_opcode::v_cmp_lt_i64: op = aco_opcode::v_cmp_gt_i64; break; 1050 case aco_opcode::v_cmp_ge_i64: op = aco_opcode::v_cmp_le_i64; break; 1051 case aco_opcode::v_cmp_lt_u64: op = aco_opcode::v_cmp_gt_u64; break; 1052 case aco_opcode::v_cmp_ge_u64: op = aco_opcode::v_cmp_le_u64; break; 1053 default: /* eq and ne are commutative */ break; 1054 } 1055 Temp t = src0; 1056 src0 = src1; 1057 src1 = t; 1058 } else { 1059 src1 = as_vgpr(ctx, src1); 1060 } 1061 } 1062 1063 Builder bld(ctx->program, ctx->block); 1064 bld.vopc(op, Definition(dst), src0, src1); 1065} 1066 1067void 1068emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) 1069{ 1070 Temp src0 = get_alu_src(ctx, instr->src[0]); 1071 Temp src1 = get_alu_src(ctx, instr->src[1]); 1072 Builder bld(ctx->program, ctx->block); 1073 1074 assert(dst.regClass() == bld.lm); 1075 assert(src0.type() == RegType::sgpr); 1076 assert(src1.type() == RegType::sgpr); 1077 assert(src0.regClass() == src1.regClass()); 1078 1079 /* Emit the SALU comparison instruction */ 1080 Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1); 1081 /* Turn the result into a per-lane bool */ 1082 bool_to_vector_condition(ctx, cmp, dst); 1083} 1084 1085void 1086emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op, 1087 aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, 1088 aco_opcode s64_op = aco_opcode::num_opcodes) 1089{ 1090 aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op 1091 : instr->src[0].src.ssa->bit_size == 32 ? s32_op 1092 : aco_opcode::num_opcodes; 1093 aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op 1094 : instr->src[0].src.ssa->bit_size == 32 ? v32_op 1095 : v16_op; 1096 bool use_valu = s_op == aco_opcode::num_opcodes || nir_dest_is_divergent(instr->dest.dest) || 1097 get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr || 1098 get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr; 1099 aco_opcode op = use_valu ? 
v_op : s_op; 1100 assert(op != aco_opcode::num_opcodes); 1101 assert(dst.regClass() == ctx->program->lane_mask); 1102 1103 if (use_valu) 1104 emit_vopc_instruction(ctx, instr, op, dst); 1105 else 1106 emit_sopc_instruction(ctx, instr, op, dst); 1107} 1108 1109void 1110emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op, 1111 Temp dst) 1112{ 1113 Builder bld(ctx->program, ctx->block); 1114 Temp src0 = get_alu_src(ctx, instr->src[0]); 1115 Temp src1 = get_alu_src(ctx, instr->src[1]); 1116 1117 assert(dst.regClass() == bld.lm); 1118 assert(src0.regClass() == bld.lm); 1119 assert(src1.regClass() == bld.lm); 1120 1121 bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1); 1122} 1123 1124void 1125emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst) 1126{ 1127 Builder bld(ctx->program, ctx->block); 1128 Temp cond = get_alu_src(ctx, instr->src[0]); 1129 Temp then = get_alu_src(ctx, instr->src[1]); 1130 Temp els = get_alu_src(ctx, instr->src[2]); 1131 1132 assert(cond.regClass() == bld.lm); 1133 1134 if (dst.type() == RegType::vgpr) { 1135 aco_ptr<Instruction> bcsel; 1136 if (dst.size() == 1) { 1137 then = as_vgpr(ctx, then); 1138 els = as_vgpr(ctx, els); 1139 1140 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond); 1141 } else if (dst.size() == 2) { 1142 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1); 1143 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then); 1144 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1); 1145 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els); 1146 1147 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond); 1148 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond); 1149 1150 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); 1151 } else { 1152 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1153 } 1154 return; 1155 } 1156 1157 if (instr->dest.dest.ssa.bit_size == 1) { 1158 assert(dst.regClass() == bld.lm); 1159 assert(then.regClass() == bld.lm); 1160 assert(els.regClass() == bld.lm); 1161 } 1162 1163 if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */ 1164 if (dst.regClass() == s1 || dst.regClass() == s2) { 1165 assert((then.regClass() == s1 || then.regClass() == s2) && 1166 els.regClass() == then.regClass()); 1167 assert(dst.size() == then.size()); 1168 aco_opcode op = 1169 dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64; 1170 bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond))); 1171 } else { 1172 isel_err(&instr->instr, "Unimplemented uniform bcsel bit size"); 1173 } 1174 return; 1175 } 1176 1177 /* divergent boolean bcsel 1178 * this implements bcsel on bools: dst = s0 ? 
s1 : s2 1179 * are going to be: dst = (s0 & s1) | (~s0 & s2) */ 1180 assert(instr->dest.dest.ssa.bit_size == 1); 1181 1182 if (cond.id() != then.id()) 1183 then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then); 1184 1185 if (cond.id() == els.id()) 1186 bld.copy(Definition(dst), then); 1187 else 1188 bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then, 1189 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond)); 1190} 1191 1192void 1193emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode op, 1194 uint32_t undo) 1195{ 1196 /* multiply by 16777216 to handle denormals */ 1197 Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.def(bld.lm), as_vgpr(ctx, val), 1198 bld.copy(bld.def(v1), Operand::c32((1u << 7) | (1u << 4)))); 1199 Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x4b800000u), val); 1200 scaled = bld.vop1(op, bld.def(v1), scaled); 1201 scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(undo), scaled); 1202 1203 Temp not_scaled = bld.vop1(op, bld.def(v1), val); 1204 1205 bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal); 1206} 1207 1208void 1209emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val) 1210{ 1211 if (ctx->block->fp_mode.denorm32 == 0) { 1212 bld.vop1(aco_opcode::v_rcp_f32, dst, val); 1213 return; 1214 } 1215 1216 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u); 1217} 1218 1219void 1220emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val) 1221{ 1222 if (ctx->block->fp_mode.denorm32 == 0) { 1223 bld.vop1(aco_opcode::v_rsq_f32, dst, val); 1224 return; 1225 } 1226 1227 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u); 1228} 1229 1230void 1231emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val) 1232{ 1233 if (ctx->block->fp_mode.denorm32 == 0) { 1234 bld.vop1(aco_opcode::v_sqrt_f32, dst, val); 1235 return; 1236 } 1237 1238 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u); 1239} 1240 1241void 1242emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val) 1243{ 1244 if (ctx->block->fp_mode.denorm32 == 0) { 1245 bld.vop1(aco_opcode::v_log_f32, dst, val); 1246 return; 1247 } 1248 1249 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u); 1250} 1251 1252Temp 1253emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val) 1254{ 1255 if (ctx->options->gfx_level >= GFX7) 1256 return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val); 1257 1258 /* GFX6 doesn't support V_TRUNC_F64, lower it. */ 1259 /* TODO: create more efficient code! */ 1260 if (val.type() == RegType::sgpr) 1261 val = as_vgpr(ctx, val); 1262 1263 /* Split the input value. */ 1264 Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1); 1265 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val); 1266 1267 /* Extract the exponent and compute the unbiased value. */ 1268 Temp exponent = 1269 bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u)); 1270 exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u)); 1271 1272 /* Extract the fractional part. 
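 * (fract_mask covers the mantissa bits below the binary point for the given exponent; clearing
 * those bits in val yields the value truncated toward zero)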
*/ 1273 Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u), 1274 Operand::c32(0x000fffffu)); 1275 fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent); 1276 1277 Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1); 1278 bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), 1279 fract_mask); 1280 1281 Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1); 1282 Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo); 1283 fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp); 1284 tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi); 1285 fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp); 1286 1287 /* Get the sign bit. */ 1288 Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi); 1289 1290 /* Decide the operation to apply depending on the unbiased exponent. */ 1291 Temp exp_lt0 = 1292 bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.def(bld.lm), exponent, Operand::zero()); 1293 Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, 1294 bld.copy(bld.def(v1), Operand::zero()), exp_lt0); 1295 Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0); 1296 Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u)); 1297 dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51); 1298 dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51); 1299 1300 return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi); 1301} 1302 1303Temp 1304emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val) 1305{ 1306 if (ctx->options->gfx_level >= GFX7) 1307 return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val); 1308 1309 /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually 1310 * lowered at NIR level for precision reasons). 
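 * The fallback below computes floor(x) as x - min(fract(x), largest double below 1.0); for NaN
 * inputs it subtracts x from itself instead.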
*/ 1311 Temp src0 = as_vgpr(ctx, val); 1312 1313 Temp mask = bld.copy(bld.def(s1), Operand::c32(3u)); /* isnan */ 1314 Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u), 1315 Operand::c32(0x3fefffffu)); 1316 1317 Temp isnan = bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.def(bld.lm), src0, mask); 1318 Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0); 1319 Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val); 1320 1321 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1); 1322 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0); 1323 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1); 1324 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min); 1325 1326 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan); 1327 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan); 1328 1329 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1); 1330 1331 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v); 1332 add->vop3().neg[1] = true; 1333 1334 return add->definitions[0].getTemp(); 1335} 1336 1337Temp 1338uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1) 1339{ 1340 if (bld.program->gfx_level < GFX8) { 1341 Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true); 1342 return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1), 1343 add.def(1).getTemp()); 1344 } 1345 1346 Builder::Result add(NULL); 1347 if (bld.program->gfx_level >= GFX9) { 1348 add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1); 1349 } else { 1350 add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.def(bld.lm), src0, src1); 1351 } 1352 add.instr->vop3().clamp = 1; 1353 return dst.getTemp(); 1354} 1355 1356Temp 1357usub32_sat(Builder& bld, Definition dst, Temp src0, Temp src1) 1358{ 1359 if (bld.program->gfx_level < GFX8) { 1360 Builder::Result sub = bld.vsub32(bld.def(v1), src0, src1, true); 1361 return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, sub.def(0).getTemp(), Operand::c32(0u), 1362 sub.def(1).getTemp()); 1363 } 1364 1365 Builder::Result sub(NULL); 1366 if (bld.program->gfx_level >= GFX9) { 1367 sub = bld.vop2_e64(aco_opcode::v_sub_u32, dst, src0, src1); 1368 } else { 1369 sub = bld.vop2_e64(aco_opcode::v_sub_co_u32, dst, bld.def(bld.lm), src0, src1); 1370 } 1371 sub.instr->vop3().clamp = 1; 1372 return dst.getTemp(); 1373} 1374 1375void 1376visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) 1377{ 1378 if (!instr->dest.dest.is_ssa) { 1379 isel_err(&instr->instr, "nir alu dst not in ssa"); 1380 abort(); 1381 } 1382 Builder bld(ctx->program, ctx->block); 1383 bld.is_precise = instr->exact; 1384 Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa); 1385 switch (instr->op) { 1386 case nir_op_vec2: 1387 case nir_op_vec3: 1388 case nir_op_vec4: 1389 case nir_op_vec5: 1390 case nir_op_vec8: 1391 case nir_op_vec16: { 1392 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems; 1393 unsigned num = instr->dest.dest.ssa.num_components; 1394 for (unsigned i = 0; i < num; ++i) 1395 elems[i] = get_alu_src(ctx, instr->src[i]); 1396 1397 if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) { 1398 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 1399 aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)}; 1400 RegClass elem_rc = RegClass::get(RegType::vgpr, 
instr->dest.dest.ssa.bit_size / 8u); 1401 for (unsigned i = 0; i < num; ++i) { 1402 if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword()) 1403 elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc); 1404 vec->operands[i] = Operand{elems[i]}; 1405 } 1406 vec->definitions[0] = Definition(dst); 1407 ctx->block->instructions.emplace_back(std::move(vec)); 1408 ctx->allocated_vec.emplace(dst.id(), elems); 1409 } else { 1410 bool use_s_pack = ctx->program->gfx_level >= GFX9; 1411 Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->dest.dest.ssa.bit_size) - 1)); 1412 1413 std::array<Temp, NIR_MAX_VEC_COMPONENTS> packed; 1414 uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {}; 1415 for (unsigned i = 0; i < num; i++) { 1416 unsigned packed_size = use_s_pack ? 16 : 32; 1417 unsigned idx = i * instr->dest.dest.ssa.bit_size / packed_size; 1418 unsigned offset = i * instr->dest.dest.ssa.bit_size % packed_size; 1419 if (nir_src_is_const(instr->src[i].src)) { 1420 const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset; 1421 continue; 1422 } 1423 1424 if (offset != packed_size - instr->dest.dest.ssa.bit_size) 1425 elems[i] = 1426 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask); 1427 1428 if (offset) 1429 elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i], 1430 Operand::c32(offset)); 1431 1432 if (packed[idx].id()) 1433 packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i], 1434 packed[idx]); 1435 else 1436 packed[idx] = elems[i]; 1437 } 1438 1439 if (use_s_pack) { 1440 for (unsigned i = 0; i < dst.size(); i++) { 1441 bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id(); 1442 1443 if (packed[i * 2].id() && packed[i * 2 + 1].id()) 1444 packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2], 1445 packed[i * 2 + 1]); 1446 else if (packed[i * 2 + 1].id()) 1447 packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), 1448 Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]); 1449 else if (packed[i * 2].id()) 1450 packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2], 1451 Operand::c32(const_vals[i * 2 + 1])); 1452 1453 if (same) 1454 const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16); 1455 else 1456 const_vals[i] = 0; 1457 } 1458 } 1459 1460 for (unsigned i = 0; i < dst.size(); i++) { 1461 if (const_vals[i] && packed[i].id()) 1462 packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), 1463 Operand::c32(const_vals[i]), packed[i]); 1464 else if (!packed[i].id()) 1465 packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i])); 1466 } 1467 1468 if (dst.size() == 1) 1469 bld.copy(Definition(dst), packed[0]); 1470 else if (dst.size() == 2) 1471 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1]); 1472 else 1473 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1], 1474 packed[2]); 1475 } 1476 break; 1477 } 1478 case nir_op_mov: { 1479 Temp src = get_alu_src(ctx, instr->src[0]); 1480 if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) { 1481 /* use size() instead of bytes() for 8/16-bit */ 1482 assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov"); 1483 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src); 1484 } else { 1485 assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov"); 1486 bld.copy(Definition(dst), src); 1487 } 1488 break; 1489 } 1490 case nir_op_inot: { 
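/* Bitwise NOT: v_not_b32 for VGPR destinations (applied to each 32-bit half of 64-bit values), s_not_b32/s_not_b64 for uniform results. */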
1491 Temp src = get_alu_src(ctx, instr->src[0]); 1492 if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { 1493 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst); 1494 } else if (dst.regClass() == v2) { 1495 Temp lo = bld.tmp(v1), hi = bld.tmp(v1); 1496 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); 1497 lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo); 1498 hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi); 1499 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); 1500 } else if (dst.type() == RegType::sgpr) { 1501 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64; 1502 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src); 1503 } else { 1504 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1505 } 1506 break; 1507 } 1508 case nir_op_iabs: { 1509 if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1510 Temp src = get_alu_src_vop3p(ctx, instr->src[0]); 1511 1512 unsigned opsel_lo = (instr->src[0].swizzle[0] & 1) << 1; 1513 unsigned opsel_hi = ((instr->src[0].swizzle[1] & 1) << 1) | 1; 1514 1515 Temp sub = bld.vop3p(aco_opcode::v_pk_sub_u16, Definition(bld.tmp(v1)), Operand::zero(), 1516 src, opsel_lo, opsel_hi); 1517 bld.vop3p(aco_opcode::v_pk_max_i16, Definition(dst), sub, src, opsel_lo, opsel_hi); 1518 break; 1519 } 1520 Temp src = get_alu_src(ctx, instr->src[0]); 1521 if (dst.regClass() == s1) { 1522 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src); 1523 } else if (dst.regClass() == v1) { 1524 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, 1525 bld.vsub32(bld.def(v1), Operand::zero(), src)); 1526 } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { 1527 bld.vop3( 1528 aco_opcode::v_max_i16_e64, Definition(dst), src, 1529 bld.vop3(aco_opcode::v_sub_u16_e64, Definition(bld.tmp(v2b)), Operand::zero(2), src)); 1530 } else if (dst.regClass() == v2b) { 1531 src = as_vgpr(ctx, src); 1532 bld.vop2(aco_opcode::v_max_i16, Definition(dst), src, 1533 bld.vop2(aco_opcode::v_sub_u16, Definition(bld.tmp(v2b)), Operand::zero(2), src)); 1534 } else { 1535 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1536 } 1537 break; 1538 } 1539 case nir_op_isign: { 1540 Temp src = get_alu_src(ctx, instr->src[0]); 1541 if (dst.regClass() == s1) { 1542 Temp tmp = 1543 bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1)); 1544 bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u)); 1545 } else if (dst.regClass() == s2) { 1546 Temp neg = 1547 bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u)); 1548 Temp neqz; 1549 if (ctx->program->gfx_level >= GFX8) 1550 neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero()); 1551 else 1552 neqz = 1553 bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero()) 1554 .def(1) 1555 .getTemp(); 1556 /* SCC gets zero-extended to 64 bit */ 1557 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz)); 1558 } else if (dst.regClass() == v1) { 1559 bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u)); 1560 } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) { 1561 bld.vop3(aco_opcode::v_med3_i16, Definition(dst), Operand::c16(-1), src, Operand::c16(1u)); 1562 } else if (dst.regClass() == v2b) { 1563 src = as_vgpr(ctx, src); 1564 
bld.vop2(aco_opcode::v_max_i16, Definition(dst), Operand::c16(-1), 1565 bld.vop2(aco_opcode::v_min_i16, Definition(bld.tmp(v1)), Operand::c16(1u), src)); 1566 } else if (dst.regClass() == v2) { 1567 Temp upper = emit_extract_vector(ctx, src, 1, v1); 1568 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper); 1569 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.def(bld.lm), Operand::zero(), src); 1570 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz); 1571 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz); 1572 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); 1573 } else { 1574 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1575 } 1576 break; 1577 } 1578 case nir_op_imax: { 1579 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { 1580 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst); 1581 } else if (dst.regClass() == v2b) { 1582 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true); 1583 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1584 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst); 1585 } else if (dst.regClass() == v1) { 1586 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true); 1587 } else if (dst.regClass() == s1) { 1588 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true); 1589 } else { 1590 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1591 } 1592 break; 1593 } 1594 case nir_op_umax: { 1595 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { 1596 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst); 1597 } else if (dst.regClass() == v2b) { 1598 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true); 1599 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1600 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst); 1601 } else if (dst.regClass() == v1) { 1602 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true); 1603 } else if (dst.regClass() == s1) { 1604 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true); 1605 } else { 1606 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1607 } 1608 break; 1609 } 1610 case nir_op_imin: { 1611 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { 1612 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst); 1613 } else if (dst.regClass() == v2b) { 1614 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true); 1615 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1616 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst); 1617 } else if (dst.regClass() == v1) { 1618 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true); 1619 } else if (dst.regClass() == s1) { 1620 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true); 1621 } else { 1622 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1623 } 1624 break; 1625 } 1626 case nir_op_umin: { 1627 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { 1628 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst); 1629 } else if (dst.regClass() == v2b) { 1630 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true); 1631 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1632 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst); 1633 } else if 
(dst.regClass() == v1) { 1634 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true); 1635 } else if (dst.regClass() == s1) { 1636 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true); 1637 } else { 1638 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1639 } 1640 break; 1641 } 1642 case nir_op_ior: { 1643 if (instr->dest.dest.ssa.bit_size == 1) { 1644 emit_boolean_logic(ctx, instr, Builder::s_or, dst); 1645 } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { 1646 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true); 1647 } else if (dst.regClass() == v2) { 1648 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst); 1649 } else if (dst.regClass() == s1) { 1650 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true); 1651 } else if (dst.regClass() == s2) { 1652 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true); 1653 } else { 1654 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1655 } 1656 break; 1657 } 1658 case nir_op_iand: { 1659 if (instr->dest.dest.ssa.bit_size == 1) { 1660 emit_boolean_logic(ctx, instr, Builder::s_and, dst); 1661 } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { 1662 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true); 1663 } else if (dst.regClass() == v2) { 1664 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst); 1665 } else if (dst.regClass() == s1) { 1666 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true); 1667 } else if (dst.regClass() == s2) { 1668 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true); 1669 } else { 1670 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1671 } 1672 break; 1673 } 1674 case nir_op_ixor: { 1675 if (instr->dest.dest.ssa.bit_size == 1) { 1676 emit_boolean_logic(ctx, instr, Builder::s_xor, dst); 1677 } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { 1678 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true); 1679 } else if (dst.regClass() == v2) { 1680 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst); 1681 } else if (dst.regClass() == s1) { 1682 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true); 1683 } else if (dst.regClass() == s2) { 1684 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true); 1685 } else { 1686 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1687 } 1688 break; 1689 } 1690 case nir_op_ushr: { 1691 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { 1692 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true); 1693 } else if (dst.regClass() == v2b) { 1694 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true); 1695 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1696 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true); 1697 } else if (dst.regClass() == v1) { 1698 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true); 1699 } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) { 1700 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]), 1701 get_alu_src(ctx, instr->src[0])); 1702 } else if (dst.regClass() == v2) { 1703 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst); 1704 } else if (dst.regClass() == s2) { 1705 emit_sop2_instruction(ctx, instr, 
aco_opcode::s_lshr_b64, dst, true); 1706 } else if (dst.regClass() == s1) { 1707 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true); 1708 } else { 1709 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1710 } 1711 break; 1712 } 1713 case nir_op_ishl: { 1714 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { 1715 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true); 1716 } else if (dst.regClass() == v2b) { 1717 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true); 1718 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1719 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true); 1720 } else if (dst.regClass() == v1) { 1721 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false, 1722 false, 2); 1723 } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) { 1724 bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]), 1725 get_alu_src(ctx, instr->src[0])); 1726 } else if (dst.regClass() == v2) { 1727 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst); 1728 } else if (dst.regClass() == s1) { 1729 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1); 1730 } else if (dst.regClass() == s2) { 1731 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true); 1732 } else { 1733 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1734 } 1735 break; 1736 } 1737 case nir_op_ishr: { 1738 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { 1739 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true); 1740 } else if (dst.regClass() == v2b) { 1741 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true); 1742 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1743 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true); 1744 } else if (dst.regClass() == v1) { 1745 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true); 1746 } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) { 1747 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]), 1748 get_alu_src(ctx, instr->src[0])); 1749 } else if (dst.regClass() == v2) { 1750 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst); 1751 } else if (dst.regClass() == s1) { 1752 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true); 1753 } else if (dst.regClass() == s2) { 1754 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true); 1755 } else { 1756 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1757 } 1758 break; 1759 } 1760 case nir_op_find_lsb: { 1761 Temp src = get_alu_src(ctx, instr->src[0]); 1762 if (src.regClass() == s1) { 1763 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src); 1764 } else if (src.regClass() == v1) { 1765 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst); 1766 } else if (src.regClass() == s2) { 1767 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src); 1768 } else { 1769 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1770 } 1771 break; 1772 } 1773 case nir_op_ufind_msb: 1774 case nir_op_ifind_msb: { 1775 Temp src = get_alu_src(ctx, instr->src[0]); 1776 if (src.regClass() == s1 || src.regClass() == s2) { 1777 aco_opcode op = src.regClass() == s2 1778 ? (instr->op == nir_op_ufind_msb ? 
aco_opcode::s_flbit_i32_b64 1779 : aco_opcode::s_flbit_i32_i64) 1780 : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 1781 : aco_opcode::s_flbit_i32); 1782 Temp msb_rev = bld.sop1(op, bld.def(s1), src); 1783 1784 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), 1785 Operand::c32(src.size() * 32u - 1u), msb_rev); 1786 Temp msb = sub.def(0).getTemp(); 1787 Temp carry = sub.def(1).getTemp(); 1788 1789 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb, 1790 bld.scc(carry)); 1791 } else if (src.regClass() == v1) { 1792 aco_opcode op = 1793 instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; 1794 Temp msb_rev = bld.tmp(v1); 1795 emit_vop1_instruction(ctx, instr, op, msb_rev); 1796 Temp msb = bld.tmp(v1); 1797 Temp carry = 1798 bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp(); 1799 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry); 1800 } else if (src.regClass() == v2) { 1801 aco_opcode op = 1802 instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; 1803 1804 Temp lo = bld.tmp(v1), hi = bld.tmp(v1); 1805 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); 1806 1807 lo = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(32u)), 1808 bld.vop1(op, bld.def(v1), lo)); 1809 hi = bld.vop1(op, bld.def(v1), hi); 1810 Temp found_hi = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::c32(-1), hi); 1811 1812 Temp msb_rev = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lo, hi, found_hi); 1813 1814 Temp msb = bld.tmp(v1); 1815 Temp carry = 1816 bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp(); 1817 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry); 1818 } else { 1819 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1820 } 1821 break; 1822 } 1823 case nir_op_uclz: { 1824 Temp src = get_alu_src(ctx, instr->src[0]); 1825 if (src.regClass() == s1) { 1826 Temp msb_rev = bld.sop1(aco_opcode::s_flbit_i32_b32, bld.def(s1), src); 1827 bld.sop2(aco_opcode::s_min_u32, Definition(dst), Operand::c32(32u), msb_rev); 1828 } else if (src.regClass() == v1) { 1829 Temp msb_rev = bld.vop1(aco_opcode::v_ffbh_u32, bld.def(v1), src); 1830 bld.vop2(aco_opcode::v_min_u32, Definition(dst), Operand::c32(32u), msb_rev); 1831 } else { 1832 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1833 } 1834 break; 1835 } 1836 case nir_op_bitfield_reverse: { 1837 if (dst.regClass() == s1) { 1838 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0])); 1839 } else if (dst.regClass() == v1) { 1840 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0])); 1841 } else { 1842 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1843 } 1844 break; 1845 } 1846 case nir_op_iadd: { 1847 if (dst.regClass() == s1) { 1848 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true); 1849 break; 1850 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) { 1851 emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst); 1852 break; 1853 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) { 1854 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true); 1855 break; 1856 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1857 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst); 1858 break; 
1859 } 1860 1861 Temp src0 = get_alu_src(ctx, instr->src[0]); 1862 Temp src1 = get_alu_src(ctx, instr->src[1]); 1863 if (dst.type() == RegType::vgpr && dst.bytes() <= 4) { 1864 bld.vadd32(Definition(dst), Operand(src0), Operand(src1)); 1865 break; 1866 } 1867 1868 assert(src0.size() == 2 && src1.size() == 2); 1869 Temp src00 = bld.tmp(src0.type(), 1); 1870 Temp src01 = bld.tmp(dst.type(), 1); 1871 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); 1872 Temp src10 = bld.tmp(src1.type(), 1); 1873 Temp src11 = bld.tmp(dst.type(), 1); 1874 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); 1875 1876 if (dst.regClass() == s2) { 1877 Temp carry = bld.tmp(s1); 1878 Temp dst0 = 1879 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); 1880 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, 1881 bld.scc(carry)); 1882 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); 1883 } else if (dst.regClass() == v2) { 1884 Temp dst0 = bld.tmp(v1); 1885 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp(); 1886 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry); 1887 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); 1888 } else { 1889 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1890 } 1891 break; 1892 } 1893 case nir_op_uadd_sat: { 1894 if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1895 Instruction* add_instr = 1896 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst); 1897 add_instr->vop3p().clamp = 1; 1898 break; 1899 } 1900 Temp src0 = get_alu_src(ctx, instr->src[0]); 1901 Temp src1 = get_alu_src(ctx, instr->src[1]); 1902 if (dst.regClass() == s1) { 1903 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1); 1904 bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1); 1905 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp, 1906 bld.scc(carry)); 1907 break; 1908 } else if (dst.regClass() == v2b) { 1909 Instruction* add_instr; 1910 if (ctx->program->gfx_level >= GFX10) { 1911 add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr; 1912 } else { 1913 if (src1.type() == RegType::sgpr) 1914 std::swap(src0, src1); 1915 add_instr = 1916 bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr; 1917 } 1918 add_instr->vop3().clamp = 1; 1919 break; 1920 } else if (dst.regClass() == v1) { 1921 uadd32_sat(bld, Definition(dst), src0, src1); 1922 break; 1923 } 1924 1925 assert(src0.size() == 2 && src1.size() == 2); 1926 1927 Temp src00 = bld.tmp(src0.type(), 1); 1928 Temp src01 = bld.tmp(src0.type(), 1); 1929 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); 1930 Temp src10 = bld.tmp(src1.type(), 1); 1931 Temp src11 = bld.tmp(src1.type(), 1); 1932 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); 1933 1934 if (dst.regClass() == s2) { 1935 Temp carry0 = bld.tmp(s1); 1936 Temp carry1 = bld.tmp(s1); 1937 1938 Temp no_sat0 = 1939 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10); 1940 Temp no_sat1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(Definition(carry1)), 1941 src01, src11, bld.scc(carry0)); 1942 1943 Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1); 1944 1945 bld.sop2(aco_opcode::s_cselect_b64, 
Definition(dst), Operand::c64(-1), no_sat, 1946 bld.scc(carry1)); 1947 } else if (dst.regClass() == v2) { 1948 Temp no_sat0 = bld.tmp(v1); 1949 Temp dst0 = bld.tmp(v1); 1950 Temp dst1 = bld.tmp(v1); 1951 1952 Temp carry0 = bld.vadd32(Definition(no_sat0), src00, src10, true).def(1).getTemp(); 1953 Temp carry1; 1954 1955 if (ctx->program->gfx_level >= GFX8) { 1956 carry1 = bld.tmp(bld.lm); 1957 bld.vop2_e64(aco_opcode::v_addc_co_u32, Definition(dst1), Definition(carry1), 1958 as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0) 1959 .instr->vop3() 1960 .clamp = 1; 1961 } else { 1962 Temp no_sat1 = bld.tmp(v1); 1963 carry1 = bld.vadd32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp(); 1964 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(-1), 1965 carry1); 1966 } 1967 1968 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(-1), 1969 carry1); 1970 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); 1971 } else { 1972 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1973 } 1974 break; 1975 } 1976 case nir_op_iadd_sat: { 1977 if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1978 Instruction* add_instr = 1979 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_i16, dst); 1980 add_instr->vop3p().clamp = 1; 1981 break; 1982 } 1983 Temp src0 = get_alu_src(ctx, instr->src[0]); 1984 Temp src1 = get_alu_src(ctx, instr->src[1]); 1985 if (dst.regClass() == s1) { 1986 Temp cond = bld.sopc(aco_opcode::s_cmp_lt_i32, bld.def(s1, scc), src1, Operand::zero()); 1987 Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)), 1988 Operand::c32(INT32_MAX), cond); 1989 Temp overflow = bld.tmp(s1); 1990 Temp add = 1991 bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1); 1992 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, add, bld.scc(overflow)); 1993 break; 1994 } 1995 1996 src1 = as_vgpr(ctx, src1); 1997 1998 if (dst.regClass() == v2b) { 1999 Instruction* add_instr = 2000 bld.vop3(aco_opcode::v_add_i16, Definition(dst), src0, src1).instr; 2001 add_instr->vop3().clamp = 1; 2002 } else if (dst.regClass() == v1) { 2003 Instruction* add_instr = 2004 bld.vop3(aco_opcode::v_add_i32, Definition(dst), src0, src1).instr; 2005 add_instr->vop3().clamp = 1; 2006 } else { 2007 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2008 } 2009 break; 2010 } 2011 case nir_op_uadd_carry: { 2012 Temp src0 = get_alu_src(ctx, instr->src[0]); 2013 Temp src1 = get_alu_src(ctx, instr->src[1]); 2014 if (dst.regClass() == s1) { 2015 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1); 2016 break; 2017 } 2018 if (dst.regClass() == v1) { 2019 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp(); 2020 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u), 2021 carry); 2022 break; 2023 } 2024 2025 Temp src00 = bld.tmp(src0.type(), 1); 2026 Temp src01 = bld.tmp(dst.type(), 1); 2027 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); 2028 Temp src10 = bld.tmp(src1.type(), 1); 2029 Temp src11 = bld.tmp(dst.type(), 1); 2030 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); 2031 if (dst.regClass() == s2) { 2032 Temp carry = bld.tmp(s1); 2033 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); 2034 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), 
bld.scc(bld.def(s1)), src01, src11, 2035 bld.scc(carry)) 2036 .def(1) 2037 .getTemp(); 2038 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero()); 2039 } else if (dst.regClass() == v2) { 2040 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp(); 2041 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp(); 2042 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), 2043 Operand::c32(1u), carry); 2044 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero()); 2045 } else { 2046 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2047 } 2048 break; 2049 } 2050 case nir_op_isub: { 2051 if (dst.regClass() == s1) { 2052 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true); 2053 break; 2054 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2055 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst); 2056 break; 2057 } 2058 2059 Temp src0 = get_alu_src(ctx, instr->src[0]); 2060 Temp src1 = get_alu_src(ctx, instr->src[1]); 2061 if (dst.regClass() == v1) { 2062 bld.vsub32(Definition(dst), src0, src1); 2063 break; 2064 } else if (dst.bytes() <= 2) { 2065 if (ctx->program->gfx_level >= GFX10) 2066 bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1); 2067 else if (src1.type() == RegType::sgpr) 2068 bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0)); 2069 else if (ctx->program->gfx_level >= GFX8) 2070 bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1)); 2071 else 2072 bld.vsub32(Definition(dst), src0, src1); 2073 break; 2074 } 2075 2076 Temp src00 = bld.tmp(src0.type(), 1); 2077 Temp src01 = bld.tmp(dst.type(), 1); 2078 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); 2079 Temp src10 = bld.tmp(src1.type(), 1); 2080 Temp src11 = bld.tmp(dst.type(), 1); 2081 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); 2082 if (dst.regClass() == s2) { 2083 Temp borrow = bld.tmp(s1); 2084 Temp dst0 = 2085 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10); 2086 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, 2087 bld.scc(borrow)); 2088 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); 2089 } else if (dst.regClass() == v2) { 2090 Temp lower = bld.tmp(v1); 2091 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp(); 2092 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow); 2093 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); 2094 } else { 2095 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2096 } 2097 break; 2098 } 2099 case nir_op_usub_borrow: { 2100 Temp src0 = get_alu_src(ctx, instr->src[0]); 2101 Temp src1 = get_alu_src(ctx, instr->src[1]); 2102 if (dst.regClass() == s1) { 2103 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1); 2104 break; 2105 } else if (dst.regClass() == v1) { 2106 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp(); 2107 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u), 2108 borrow); 2109 break; 2110 } 2111 2112 Temp src00 = bld.tmp(src0.type(), 1); 2113 Temp src01 = bld.tmp(dst.type(), 1); 2114 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); 2115 Temp src10 = bld.tmp(src1.type(), 1); 2116 
Temp src11 = bld.tmp(dst.type(), 1); 2117 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); 2118 if (dst.regClass() == s2) { 2119 Temp borrow = bld.tmp(s1); 2120 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10); 2121 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, 2122 bld.scc(borrow)) 2123 .def(1) 2124 .getTemp(); 2125 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero()); 2126 } else if (dst.regClass() == v2) { 2127 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp(); 2128 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp(); 2129 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), 2130 Operand::c32(1u), borrow); 2131 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero()); 2132 } else { 2133 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2134 } 2135 break; 2136 } 2137 case nir_op_usub_sat: { 2138 if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2139 Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst); 2140 sub_instr->vop3p().clamp = 1; 2141 break; 2142 } 2143 Temp src0 = get_alu_src(ctx, instr->src[0]); 2144 Temp src1 = get_alu_src(ctx, instr->src[1]); 2145 if (dst.regClass() == s1) { 2146 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1); 2147 bld.sop2(aco_opcode::s_sub_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1); 2148 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(0), tmp, bld.scc(carry)); 2149 break; 2150 } else if (dst.regClass() == v2b) { 2151 Instruction* sub_instr; 2152 if (ctx->program->gfx_level >= GFX10) { 2153 sub_instr = bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1).instr; 2154 } else { 2155 aco_opcode op = aco_opcode::v_sub_u16; 2156 if (src1.type() == RegType::sgpr) { 2157 std::swap(src0, src1); 2158 op = aco_opcode::v_subrev_u16; 2159 } 2160 sub_instr = bld.vop2_e64(op, Definition(dst), src0, as_vgpr(ctx, src1)).instr; 2161 } 2162 sub_instr->vop3().clamp = 1; 2163 break; 2164 } else if (dst.regClass() == v1) { 2165 usub32_sat(bld, Definition(dst), src0, as_vgpr(ctx, src1)); 2166 break; 2167 } 2168 2169 assert(src0.size() == 2 && src1.size() == 2); 2170 Temp src00 = bld.tmp(src0.type(), 1); 2171 Temp src01 = bld.tmp(src0.type(), 1); 2172 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); 2173 Temp src10 = bld.tmp(src1.type(), 1); 2174 Temp src11 = bld.tmp(src1.type(), 1); 2175 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); 2176 2177 if (dst.regClass() == s2) { 2178 Temp carry0 = bld.tmp(s1); 2179 Temp carry1 = bld.tmp(s1); 2180 2181 Temp no_sat0 = 2182 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10); 2183 Temp no_sat1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(Definition(carry1)), 2184 src01, src11, bld.scc(carry0)); 2185 2186 Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1); 2187 2188 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(0ull), no_sat, 2189 bld.scc(carry1)); 2190 } else if (dst.regClass() == v2) { 2191 Temp no_sat0 = bld.tmp(v1); 2192 Temp dst0 = bld.tmp(v1); 2193 Temp dst1 = bld.tmp(v1); 2194 2195 Temp carry0 = bld.vsub32(Definition(no_sat0), src00, src10, true).def(1).getTemp(); 2196 Temp carry1; 2197 2198 if 
(ctx->program->gfx_level >= GFX8) { 2199 carry1 = bld.tmp(bld.lm); 2200 bld.vop2_e64(aco_opcode::v_subb_co_u32, Definition(dst1), Definition(carry1), 2201 as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0) 2202 .instr->vop3() 2203 .clamp = 1; 2204 } else { 2205 Temp no_sat1 = bld.tmp(v1); 2206 carry1 = bld.vsub32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp(); 2207 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(0u), 2208 carry1); 2209 } 2210 2211 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(0u), 2212 carry1); 2213 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); 2214 } else { 2215 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2216 } 2217 break; 2218 } 2219 case nir_op_isub_sat: { 2220 if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2221 Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_i16, dst); 2222 sub_instr->vop3p().clamp = 1; 2223 break; 2224 } 2225 Temp src0 = get_alu_src(ctx, instr->src[0]); 2226 Temp src1 = get_alu_src(ctx, instr->src[1]); 2227 if (dst.regClass() == s1) { 2228 Temp cond = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src1, Operand::zero()); 2229 Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)), 2230 Operand::c32(INT32_MAX), cond); 2231 Temp overflow = bld.tmp(s1); 2232 Temp sub = 2233 bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1); 2234 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, sub, bld.scc(overflow)); 2235 break; 2236 } 2237 2238 src1 = as_vgpr(ctx, src1); 2239 2240 if (dst.regClass() == v2b) { 2241 Instruction* sub_instr = 2242 bld.vop3(aco_opcode::v_sub_i16, Definition(dst), src0, src1).instr; 2243 sub_instr->vop3().clamp = 1; 2244 } else if (dst.regClass() == v1) { 2245 Instruction* sub_instr = 2246 bld.vop3(aco_opcode::v_sub_i32, Definition(dst), src0, src1).instr; 2247 sub_instr->vop3().clamp = 1; 2248 } else { 2249 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2250 } 2251 break; 2252 } 2253 case nir_op_imul: { 2254 if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) { 2255 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst); 2256 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) { 2257 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true); 2258 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2259 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst); 2260 } else if (dst.type() == RegType::vgpr) { 2261 uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0); 2262 uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1); 2263 2264 if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) { 2265 bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff; 2266 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst, 2267 true /* commutative */, false, false, nuw_16bit); 2268 } else if (nir_src_is_const(instr->src[0].src)) { 2269 bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]), 2270 nir_src_as_uint(instr->src[0].src), false); 2271 } else if (nir_src_is_const(instr->src[1].src)) { 2272 bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]), 2273 nir_src_as_uint(instr->src[1].src), false); 2274 } else { 2275 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst); 2276 } 2277 } else if (dst.regClass() == s1) { 2278 emit_sop2_instruction(ctx, 
instr, aco_opcode::s_mul_i32, dst, false); 2279 } else { 2280 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2281 } 2282 break; 2283 } 2284 case nir_op_umul_high: { 2285 if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) { 2286 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false); 2287 } else if (dst.bytes() == 4) { 2288 uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0); 2289 uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1); 2290 2291 Temp tmp = dst.regClass() == s1 ? bld.tmp(v1) : dst; 2292 if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) { 2293 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true); 2294 } else { 2295 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp); 2296 } 2297 2298 if (dst.regClass() == s1) 2299 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); 2300 } else { 2301 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2302 } 2303 break; 2304 } 2305 case nir_op_imul_high: { 2306 if (dst.regClass() == v1) { 2307 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst); 2308 } else if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) { 2309 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false); 2310 } else if (dst.regClass() == s1) { 2311 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]), 2312 as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); 2313 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); 2314 } else { 2315 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2316 } 2317 break; 2318 } 2319 case nir_op_fmul: { 2320 if (dst.regClass() == v2b) { 2321 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true); 2322 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2323 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst); 2324 } else if (dst.regClass() == v1) { 2325 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true); 2326 } else if (dst.regClass() == v2) { 2327 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64, dst); 2328 } else { 2329 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2330 } 2331 break; 2332 } 2333 case nir_op_fmulz: { 2334 if (dst.regClass() == v1) { 2335 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_legacy_f32, dst, true); 2336 } else { 2337 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2338 } 2339 break; 2340 } 2341 case nir_op_fadd: { 2342 if (dst.regClass() == v2b) { 2343 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true); 2344 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2345 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst); 2346 } else if (dst.regClass() == v1) { 2347 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true); 2348 } else if (dst.regClass() == v2) { 2349 emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64, dst); 2350 } else { 2351 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2352 } 2353 break; 2354 } 2355 case nir_op_fsub: { 2356 if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2357 Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst); 2358 VOP3P_instruction& sub = add->vop3p(); 2359 sub.neg_lo[1] = true; 2360 sub.neg_hi[1] = true; 2361 break; 2362 } 2363 2364 Temp src0 = get_alu_src(ctx, instr->src[0]); 2365 Temp src1 = get_alu_src(ctx, instr->src[1]); 2366 if (dst.regClass() == v2b) { 2367 if 
(src1.type() == RegType::vgpr || src0.type() != RegType::vgpr) 2368 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false); 2369 else 2370 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true); 2371 } else if (dst.regClass() == v1) { 2372 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr) 2373 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false); 2374 else 2375 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true); 2376 } else if (dst.regClass() == v2) { 2377 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), as_vgpr(ctx, src0), 2378 as_vgpr(ctx, src1)); 2379 add->vop3().neg[1] = true; 2380 } else { 2381 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2382 } 2383 break; 2384 } 2385 case nir_op_ffma: { 2386 if (dst.regClass() == v2b) { 2387 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f16, dst, false, 3); 2388 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2389 assert(instr->dest.dest.ssa.num_components == 2); 2390 2391 Temp src0 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[0])); 2392 Temp src1 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[1])); 2393 Temp src2 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[2])); 2394 2395 /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */ 2396 unsigned opsel_lo = 0, opsel_hi = 0; 2397 for (unsigned i = 0; i < 3; i++) { 2398 opsel_lo |= (instr->src[i].swizzle[0] & 1) << i; 2399 opsel_hi |= (instr->src[i].swizzle[1] & 1) << i; 2400 } 2401 2402 bld.vop3p(aco_opcode::v_pk_fma_f16, Definition(dst), src0, src1, src2, opsel_lo, opsel_hi); 2403 } else if (dst.regClass() == v1) { 2404 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f32, dst, 2405 ctx->block->fp_mode.must_flush_denorms32, 3); 2406 } else if (dst.regClass() == v2) { 2407 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f64, dst, false, 3); 2408 } else { 2409 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2410 } 2411 break; 2412 } 2413 case nir_op_ffmaz: { 2414 if (dst.regClass() == v1) { 2415 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_legacy_f32, dst, 2416 ctx->block->fp_mode.must_flush_denorms32, 3); 2417 } else { 2418 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2419 } 2420 break; 2421 } 2422 case nir_op_fmax: { 2423 if (dst.regClass() == v2b) { 2424 // TODO: check fp_mode.must_flush_denorms16_64 2425 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true); 2426 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2427 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst); 2428 } else if (dst.regClass() == v1) { 2429 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, 2430 ctx->block->fp_mode.must_flush_denorms32); 2431 } else if (dst.regClass() == v2) { 2432 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst, 2433 ctx->block->fp_mode.must_flush_denorms16_64); 2434 } else { 2435 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2436 } 2437 break; 2438 } 2439 case nir_op_fmin: { 2440 if (dst.regClass() == v2b) { 2441 // TODO: check fp_mode.must_flush_denorms16_64 2442 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true); 2443 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2444 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true); 2445 } else if (dst.regClass() == v1) { 2446 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, 
false, 2447 ctx->block->fp_mode.must_flush_denorms32); 2448 } else if (dst.regClass() == v2) { 2449 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst, 2450 ctx->block->fp_mode.must_flush_denorms16_64); 2451 } else { 2452 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2453 } 2454 break; 2455 } 2456 case nir_op_sdot_4x8_iadd: { 2457 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false); 2458 break; 2459 } 2460 case nir_op_sdot_4x8_iadd_sat: { 2461 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true); 2462 break; 2463 } 2464 case nir_op_udot_4x8_uadd: { 2465 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false); 2466 break; 2467 } 2468 case nir_op_udot_4x8_uadd_sat: { 2469 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true); 2470 break; 2471 } 2472 case nir_op_sdot_2x16_iadd: { 2473 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false); 2474 break; 2475 } 2476 case nir_op_sdot_2x16_iadd_sat: { 2477 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true); 2478 break; 2479 } 2480 case nir_op_udot_2x16_uadd: { 2481 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false); 2482 break; 2483 } 2484 case nir_op_udot_2x16_uadd_sat: { 2485 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true); 2486 break; 2487 } 2488 case nir_op_cube_face_coord_amd: { 2489 Temp in = get_alu_src(ctx, instr->src[0], 3); 2490 Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1), 2491 emit_extract_vector(ctx, in, 2, v1)}; 2492 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]); 2493 ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma); 2494 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]); 2495 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]); 2496 sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3f000000u /*0.5*/), 2497 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, ma)); 2498 tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3f000000u /*0.5*/), 2499 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, ma)); 2500 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc); 2501 break; 2502 } 2503 case nir_op_cube_face_index_amd: { 2504 Temp in = get_alu_src(ctx, instr->src[0], 3); 2505 Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1), 2506 emit_extract_vector(ctx, in, 2, v1)}; 2507 bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]); 2508 break; 2509 } 2510 case nir_op_bcsel: { 2511 emit_bcsel(ctx, instr, dst); 2512 break; 2513 } 2514 case nir_op_frsq: { 2515 if (dst.regClass() == v2b) { 2516 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst); 2517 } else if (dst.regClass() == v1) { 2518 Temp src = get_alu_src(ctx, instr->src[0]); 2519 emit_rsq(ctx, bld, Definition(dst), src); 2520 } else if (dst.regClass() == v2) { 2521 /* Lowered at NIR level for precision reasons. 
*/ 2522 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst); 2523 } else { 2524 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2525 } 2526 break; 2527 } 2528 case nir_op_fneg: { 2529 if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2530 Temp src = get_alu_src_vop3p(ctx, instr->src[0]); 2531 Instruction* vop3p = 2532 bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00), 2533 instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1); 2534 vop3p->vop3p().neg_lo[0] = true; 2535 vop3p->vop3p().neg_hi[0] = true; 2536 break; 2537 } 2538 Temp src = get_alu_src(ctx, instr->src[0]); 2539 if (dst.regClass() == v2b) { 2540 bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src)); 2541 } else if (dst.regClass() == v1) { 2542 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u), 2543 as_vgpr(ctx, src)); 2544 } else if (dst.regClass() == v2) { 2545 if (ctx->block->fp_mode.must_flush_denorms16_64) 2546 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000), 2547 as_vgpr(ctx, src)); 2548 Temp upper = bld.tmp(v1), lower = bld.tmp(v1); 2549 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); 2550 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper); 2551 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); 2552 } else { 2553 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2554 } 2555 break; 2556 } 2557 case nir_op_fabs: { 2558 if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2559 Temp src = get_alu_src_vop3p(ctx, instr->src[0]); 2560 Instruction* vop3p = 2561 bld.vop3p(aco_opcode::v_pk_max_f16, Definition(dst), src, src, 2562 instr->src[0].swizzle[0] & 1 ? 3 : 0, instr->src[0].swizzle[1] & 1 ? 
3 : 0) 2563 .instr; 2564 vop3p->vop3p().neg_lo[1] = true; 2565 vop3p->vop3p().neg_hi[1] = true; 2566 break; 2567 } 2568 Temp src = get_alu_src(ctx, instr->src[0]); 2569 if (dst.regClass() == v2b) { 2570 Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst), 2571 Operand::c16(0x3c00), as_vgpr(ctx, src)) 2572 .instr; 2573 mul->vop3().abs[1] = true; 2574 } else if (dst.regClass() == v1) { 2575 Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst), 2576 Operand::c32(0x3f800000u), as_vgpr(ctx, src)) 2577 .instr; 2578 mul->vop3().abs[1] = true; 2579 } else if (dst.regClass() == v2) { 2580 if (ctx->block->fp_mode.must_flush_denorms16_64) 2581 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000), 2582 as_vgpr(ctx, src)); 2583 Temp upper = bld.tmp(v1), lower = bld.tmp(v1); 2584 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); 2585 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper); 2586 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); 2587 } else { 2588 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2589 } 2590 break; 2591 } 2592 case nir_op_fsat: { 2593 if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2594 Temp src = get_alu_src_vop3p(ctx, instr->src[0]); 2595 Instruction* vop3p = 2596 bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00), 2597 instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1); 2598 vop3p->vop3p().clamp = true; 2599 break; 2600 } 2601 Temp src = get_alu_src(ctx, instr->src[0]); 2602 if (dst.regClass() == v2b) { 2603 bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00), 2604 src); 2605 } else if (dst.regClass() == v1) { 2606 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(), 2607 Operand::c32(0x3f800000u), src); 2608 /* apparently, it is not necessary to flush denorms if this instruction is used with these 2609 * operands */ 2610 // TODO: confirm that this holds under any circumstances 2611 } else if (dst.regClass() == v2) { 2612 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand::zero()); 2613 add->vop3().clamp = true; 2614 } else { 2615 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2616 } 2617 break; 2618 } 2619 case nir_op_flog2: { 2620 if (dst.regClass() == v2b) { 2621 emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst); 2622 } else if (dst.regClass() == v1) { 2623 Temp src = get_alu_src(ctx, instr->src[0]); 2624 emit_log2(ctx, bld, Definition(dst), src); 2625 } else { 2626 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2627 } 2628 break; 2629 } 2630 case nir_op_frcp: { 2631 if (dst.regClass() == v2b) { 2632 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst); 2633 } else if (dst.regClass() == v1) { 2634 Temp src = get_alu_src(ctx, instr->src[0]); 2635 emit_rcp(ctx, bld, Definition(dst), src); 2636 } else if (dst.regClass() == v2) { 2637 /* Lowered at NIR level for precision reasons. 
*/ 2638 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst); 2639 } else { 2640 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2641 } 2642 break; 2643 } 2644 case nir_op_fexp2: { 2645 if (dst.regClass() == v2b) { 2646 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst); 2647 } else if (dst.regClass() == v1) { 2648 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst); 2649 } else { 2650 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2651 } 2652 break; 2653 } 2654 case nir_op_fsqrt: { 2655 if (dst.regClass() == v2b) { 2656 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst); 2657 } else if (dst.regClass() == v1) { 2658 Temp src = get_alu_src(ctx, instr->src[0]); 2659 emit_sqrt(ctx, bld, Definition(dst), src); 2660 } else if (dst.regClass() == v2) { 2661 /* Lowered at NIR level for precision reasons. */ 2662 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst); 2663 } else { 2664 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2665 } 2666 break; 2667 } 2668 case nir_op_ffract: { 2669 if (dst.regClass() == v2b) { 2670 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst); 2671 } else if (dst.regClass() == v1) { 2672 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst); 2673 } else if (dst.regClass() == v2) { 2674 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst); 2675 } else { 2676 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2677 } 2678 break; 2679 } 2680 case nir_op_ffloor: { 2681 if (dst.regClass() == v2b) { 2682 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst); 2683 } else if (dst.regClass() == v1) { 2684 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst); 2685 } else if (dst.regClass() == v2) { 2686 Temp src = get_alu_src(ctx, instr->src[0]); 2687 emit_floor_f64(ctx, bld, Definition(dst), src); 2688 } else { 2689 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2690 } 2691 break; 2692 } 2693 case nir_op_fceil: { 2694 if (dst.regClass() == v2b) { 2695 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst); 2696 } else if (dst.regClass() == v1) { 2697 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst); 2698 } else if (dst.regClass() == v2) { 2699 if (ctx->options->gfx_level >= GFX7) { 2700 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst); 2701 } else { 2702 /* GFX6 doesn't support V_CEIL_F64, lower it. 
*/ 2703 /* trunc = trunc(src0) 2704 * if (src0 > 0.0 && src0 != trunc) 2705 * trunc += 1.0 2706 */ 2707 Temp src0 = get_alu_src(ctx, instr->src[0]); 2708 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0); 2709 Temp tmp0 = 2710 bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero()); 2711 Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.def(bld.lm), src0, trunc); 2712 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp0, tmp1); 2713 Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), 2714 bld.copy(bld.def(v1), Operand::zero()), 2715 bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond); 2716 add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), 2717 bld.copy(bld.def(v1), Operand::zero()), add); 2718 bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add); 2719 } 2720 } else { 2721 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2722 } 2723 break; 2724 } 2725 case nir_op_ftrunc: { 2726 if (dst.regClass() == v2b) { 2727 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst); 2728 } else if (dst.regClass() == v1) { 2729 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst); 2730 } else if (dst.regClass() == v2) { 2731 Temp src = get_alu_src(ctx, instr->src[0]); 2732 emit_trunc_f64(ctx, bld, Definition(dst), src); 2733 } else { 2734 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2735 } 2736 break; 2737 } 2738 case nir_op_fround_even: { 2739 if (dst.regClass() == v2b) { 2740 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst); 2741 } else if (dst.regClass() == v1) { 2742 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst); 2743 } else if (dst.regClass() == v2) { 2744 if (ctx->options->gfx_level >= GFX7) { 2745 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst); 2746 } else { 2747 /* GFX6 doesn't support V_RNDNE_F64, lower it. 
*/ 2748 Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1); 2749 Temp src0 = get_alu_src(ctx, instr->src[0]); 2750 bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0); 2751 2752 Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), 2753 bld.copy(bld.def(s1), Operand::c32(-2u))); 2754 Temp bfi = 2755 bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, 2756 bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi)); 2757 Temp tmp = 2758 bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, 2759 bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi)); 2760 Instruction* sub = 2761 bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, 2762 bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi)); 2763 sub->vop3().neg[1] = true; 2764 tmp = sub->definitions[0].getTemp(); 2765 2766 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u), 2767 Operand::c32(0x432fffffu)); 2768 Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, v); 2769 vop3->vop3().abs[0] = true; 2770 Temp cond = vop3->definitions[0].getTemp(); 2771 2772 Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1); 2773 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp); 2774 Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, 2775 as_vgpr(ctx, src0_lo), cond); 2776 Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, 2777 as_vgpr(ctx, src0_hi), cond); 2778 2779 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); 2780 } 2781 } else { 2782 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2783 } 2784 break; 2785 } 2786 case nir_op_fsin_amd: 2787 case nir_op_fcos_amd: { 2788 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0])); 2789 aco_ptr<Instruction> norm; 2790 if (dst.regClass() == v2b) { 2791 aco_opcode opcode = 2792 instr->op == nir_op_fsin_amd ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16; 2793 bld.vop1(opcode, Definition(dst), src); 2794 } else if (dst.regClass() == v1) { 2795 /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */ 2796 if (ctx->options->gfx_level < GFX9) 2797 src = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), src); 2798 2799 aco_opcode opcode = 2800 instr->op == nir_op_fsin_amd ? 
aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32; 2801 bld.vop1(opcode, Definition(dst), src); 2802 } else { 2803 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2804 } 2805 break; 2806 } 2807 case nir_op_ldexp: { 2808 if (dst.regClass() == v2b) { 2809 emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false); 2810 } else if (dst.regClass() == v1) { 2811 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst); 2812 } else if (dst.regClass() == v2) { 2813 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst); 2814 } else { 2815 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2816 } 2817 break; 2818 } 2819 case nir_op_frexp_sig: { 2820 if (dst.regClass() == v2b) { 2821 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst); 2822 } else if (dst.regClass() == v1) { 2823 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst); 2824 } else if (dst.regClass() == v2) { 2825 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst); 2826 } else { 2827 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2828 } 2829 break; 2830 } 2831 case nir_op_frexp_exp: { 2832 if (instr->src[0].src.ssa->bit_size == 16) { 2833 Temp src = get_alu_src(ctx, instr->src[0]); 2834 Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src); 2835 tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero()); 2836 convert_int(ctx, bld, tmp, 8, 32, true, dst); 2837 } else if (instr->src[0].src.ssa->bit_size == 32) { 2838 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst); 2839 } else if (instr->src[0].src.ssa->bit_size == 64) { 2840 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst); 2841 } else { 2842 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2843 } 2844 break; 2845 } 2846 case nir_op_fsign: { 2847 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0])); 2848 if (dst.regClass() == v2b) { 2849 assert(ctx->program->gfx_level >= GFX9); 2850 /* replace negative zero with positive zero */ 2851 src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), src); 2852 src = 2853 bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src, Operand::c16(1u)); 2854 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src); 2855 } else if (dst.regClass() == v1) { 2856 src = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::zero(), src); 2857 src = 2858 bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src, Operand::c32(1u)); 2859 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src); 2860 } else if (dst.regClass() == v2) { 2861 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.def(bld.lm), Operand::zero(), src); 2862 Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u)); 2863 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, 2864 emit_extract_vector(ctx, src, 1, v1), cond); 2865 2866 cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.def(bld.lm), Operand::zero(), src); 2867 tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u)); 2868 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond); 2869 2870 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper); 2871 } else { 2872 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2873 } 2874 break; 2875 } 2876 case nir_op_f2f16: 2877 case nir_op_f2f16_rtne: { 2878 Temp src = get_alu_src(ctx, instr->src[0]); 2879 if (instr->src[0].src.ssa->bit_size == 64) 2880 src = 
bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src); 2881 if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne) 2882 /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to 2883 * keep value numbering and the scheduler simpler. 2884 */ 2885 bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src); 2886 else 2887 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); 2888 break; 2889 } 2890 case nir_op_f2f16_rtz: { 2891 Temp src = get_alu_src(ctx, instr->src[0]); 2892 if (instr->src[0].src.ssa->bit_size == 64) 2893 src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src); 2894 if (ctx->block->fp_mode.round16_64 == fp_round_tz) 2895 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); 2896 else if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9) 2897 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero()); 2898 else 2899 bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src)); 2900 break; 2901 } 2902 case nir_op_f2f32: { 2903 if (instr->src[0].src.ssa->bit_size == 16) { 2904 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst); 2905 } else if (instr->src[0].src.ssa->bit_size == 64) { 2906 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst); 2907 } else { 2908 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2909 } 2910 break; 2911 } 2912 case nir_op_f2f64: { 2913 Temp src = get_alu_src(ctx, instr->src[0]); 2914 if (instr->src[0].src.ssa->bit_size == 16) 2915 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); 2916 bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src); 2917 break; 2918 } 2919 case nir_op_i2f16: { 2920 assert(dst.regClass() == v2b); 2921 Temp src = get_alu_src(ctx, instr->src[0]); 2922 const unsigned input_size = instr->src[0].src.ssa->bit_size; 2923 if (input_size <= 16) { 2924 /* Expand integer to the size expected by the int→float converter used below */ 2925 unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32); 2926 if (input_size != target_size) { 2927 src = convert_int(ctx, bld, src, input_size, target_size, true); 2928 } 2929 } else if (input_size == 64) { 2930 /* Truncate down to 32 bits; if any of the upper bits are relevant, 2931 * the value does not fall into the half-precision float range 2932 * anyway. SPIR-V does not mandate any specific behavior for such 2933 * large inputs. 2934 */ 2935 src = convert_int(ctx, bld, src, 64, 32, false); 2936 } 2937 2938 if (ctx->program->gfx_level >= GFX8 && input_size <= 16) { 2939 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src); 2940 } else { 2941 /* Convert to f32 and then down to f16. This is needed to handle 2942 * inputs slightly outside the range [INT16_MIN, INT16_MAX], 2943 * which are representable via f16 but wouldn't be converted 2944 * correctly by v_cvt_f16_i16. 2945 * 2946 * This is also the fallback-path taken on GFX7 and earlier, which 2947 * do not support direct f16⟷i16 conversions.
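          *
          * Going through f32 cannot introduce double rounding either: every integer
          * small enough to produce a finite f16 result is exactly representable in
          * f32, so only the final f32->f16 conversion actually rounds.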
2948 */ 2949 src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src); 2950 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); 2951 } 2952 break; 2953 } 2954 case nir_op_i2f32: { 2955 assert(dst.size() == 1); 2956 Temp src = get_alu_src(ctx, instr->src[0]); 2957 const unsigned input_size = instr->src[0].src.ssa->bit_size; 2958 if (input_size <= 32) { 2959 if (input_size <= 16) { 2960 /* Sign-extend to 32-bits */ 2961 src = convert_int(ctx, bld, src, input_size, 32, true); 2962 } 2963 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src); 2964 } else { 2965 assert(input_size == 64); 2966 RegClass rc = RegClass(src.type(), 1); 2967 Temp lower = bld.tmp(rc), upper = bld.tmp(rc); 2968 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); 2969 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower); 2970 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper); 2971 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u)); 2972 upper = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), lower, upper); 2973 bld.vop1(aco_opcode::v_cvt_f32_f64, Definition(dst), upper); 2974 } 2975 2976 break; 2977 } 2978 case nir_op_i2f64: { 2979 if (instr->src[0].src.ssa->bit_size <= 32) { 2980 Temp src = get_alu_src(ctx, instr->src[0]); 2981 if (instr->src[0].src.ssa->bit_size <= 16) 2982 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true); 2983 bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src); 2984 } else if (instr->src[0].src.ssa->bit_size == 64) { 2985 Temp src = get_alu_src(ctx, instr->src[0]); 2986 RegClass rc = RegClass(src.type(), 1); 2987 Temp lower = bld.tmp(rc), upper = bld.tmp(rc); 2988 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); 2989 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower); 2990 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper); 2991 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u)); 2992 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper); 2993 2994 } else { 2995 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2996 } 2997 break; 2998 } 2999 case nir_op_u2f16: { 3000 assert(dst.regClass() == v2b); 3001 Temp src = get_alu_src(ctx, instr->src[0]); 3002 const unsigned input_size = instr->src[0].src.ssa->bit_size; 3003 if (input_size <= 16) { 3004 /* Expand integer to the size expected by the uint→float converter used below */ 3005 unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32); 3006 if (input_size != target_size) { 3007 src = convert_int(ctx, bld, src, input_size, target_size, false); 3008 } 3009 } else if (input_size == 64) { 3010 /* Truncate down to 32 bits; if any of the upper bits are non-zero, 3011 * the value does not fall into the single-precision float range 3012 * anyway. SPIR-V does not mandate any specific behavior for such 3013 * large inputs. 3014 */ 3015 src = convert_int(ctx, bld, src, 64, 32, false); 3016 } 3017 3018 if (ctx->program->gfx_level >= GFX8) { 3019 /* float16 has a range of [0, 65519]. 
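          * (65519 is the largest integer that still rounds to the f16 maximum of
          * 65504 rather than to infinity.)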
Converting from larger 3020 * inputs is UB, so we just need to consider the lower 16 bits */ 3021 bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src); 3022 } else { 3023 /* GFX7 and earlier do not support direct f16⟷u16 conversions */ 3024 src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src); 3025 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); 3026 } 3027 break; 3028 } 3029 case nir_op_u2f32: { 3030 assert(dst.size() == 1); 3031 Temp src = get_alu_src(ctx, instr->src[0]); 3032 const unsigned input_size = instr->src[0].src.ssa->bit_size; 3033 if (input_size == 8) { 3034 bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src); 3035 } else if (input_size <= 32) { 3036 if (input_size == 16) 3037 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false); 3038 bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src); 3039 } else { 3040 assert(input_size == 64); 3041 RegClass rc = RegClass(src.type(), 1); 3042 Temp lower = bld.tmp(rc), upper = bld.tmp(rc); 3043 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); 3044 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower); 3045 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper); 3046 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u)); 3047 upper = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), lower, upper); 3048 bld.vop1(aco_opcode::v_cvt_f32_f64, Definition(dst), upper); 3049 } 3050 break; 3051 } 3052 case nir_op_u2f64: { 3053 if (instr->src[0].src.ssa->bit_size <= 32) { 3054 Temp src = get_alu_src(ctx, instr->src[0]); 3055 if (instr->src[0].src.ssa->bit_size <= 16) 3056 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false); 3057 bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src); 3058 } else if (instr->src[0].src.ssa->bit_size == 64) { 3059 Temp src = get_alu_src(ctx, instr->src[0]); 3060 RegClass rc = RegClass(src.type(), 1); 3061 Temp lower = bld.tmp(rc), upper = bld.tmp(rc); 3062 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); 3063 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower); 3064 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper); 3065 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u)); 3066 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper); 3067 } else { 3068 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 3069 } 3070 break; 3071 } 3072 case nir_op_f2i8: 3073 case nir_op_f2i16: { 3074 if (instr->src[0].src.ssa->bit_size == 16) { 3075 if (ctx->program->gfx_level >= GFX8) { 3076 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst); 3077 } else { 3078 /* GFX7 and earlier do not support direct f16⟷i16 conversions */ 3079 Temp tmp = bld.tmp(v1); 3080 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp); 3081 tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp); 3082 tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false, 3083 (dst.type() == RegType::sgpr) ? 
Temp() : dst); 3084 if (dst.type() == RegType::sgpr) { 3085 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); 3086 } 3087 } 3088 } else if (instr->src[0].src.ssa->bit_size == 32) { 3089 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst); 3090 } else { 3091 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst); 3092 } 3093 break; 3094 } 3095 case nir_op_f2u8: 3096 case nir_op_f2u16: { 3097 if (instr->src[0].src.ssa->bit_size == 16) { 3098 if (ctx->program->gfx_level >= GFX8) { 3099 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst); 3100 } else { 3101 /* GFX7 and earlier do not support direct f16⟷u16 conversions */ 3102 Temp tmp = bld.tmp(v1); 3103 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp); 3104 tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp); 3105 tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false, 3106 (dst.type() == RegType::sgpr) ? Temp() : dst); 3107 if (dst.type() == RegType::sgpr) { 3108 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); 3109 } 3110 } 3111 } else if (instr->src[0].src.ssa->bit_size == 32) { 3112 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst); 3113 } else { 3114 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst); 3115 } 3116 break; 3117 } 3118 case nir_op_f2i32: { 3119 Temp src = get_alu_src(ctx, instr->src[0]); 3120 if (instr->src[0].src.ssa->bit_size == 16) { 3121 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); 3122 if (dst.type() == RegType::vgpr) { 3123 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp); 3124 } else { 3125 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), 3126 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp)); 3127 } 3128 } else if (instr->src[0].src.ssa->bit_size == 32) { 3129 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst); 3130 } else if (instr->src[0].src.ssa->bit_size == 64) { 3131 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst); 3132 } else { 3133 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 3134 } 3135 break; 3136 } 3137 case nir_op_f2u32: { 3138 Temp src = get_alu_src(ctx, instr->src[0]); 3139 if (instr->src[0].src.ssa->bit_size == 16) { 3140 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); 3141 if (dst.type() == RegType::vgpr) { 3142 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp); 3143 } else { 3144 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), 3145 bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp)); 3146 } 3147 } else if (instr->src[0].src.ssa->bit_size == 32) { 3148 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst); 3149 } else if (instr->src[0].src.ssa->bit_size == 64) { 3150 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst); 3151 } else { 3152 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 3153 } 3154 break; 3155 } 3156 case nir_op_f2i64: { 3157 Temp src = get_alu_src(ctx, instr->src[0]); 3158 if (instr->src[0].src.ssa->bit_size == 16) 3159 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); 3160 3161 if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) { 3162 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src); 3163 exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::zero(), exponent, 3164 Operand::c32(64u)); 3165 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffu), src); 3166 Temp sign = 
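         /* sign is 0 or 0xffffffff (the float's sign bit broadcast to all 32 bits);
          * the (value ^ sign) - sign sequence below applies two's-complement negation
          * to the 64-bit result when the input was negative. */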
bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), src); 3167 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(0x800000u), mantissa); 3168 mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), mantissa); 3169 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), mantissa); 3170 Temp new_exponent = bld.tmp(v1); 3171 Temp borrow = 3172 bld.vsub32(Definition(new_exponent), Operand::c32(63u), exponent, true).def(1).getTemp(); 3173 if (ctx->program->gfx_level >= GFX8) 3174 mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa); 3175 else 3176 mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent); 3177 Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand::c32(0xfffffffeu)); 3178 Temp lower = bld.tmp(v1), upper = bld.tmp(v1); 3179 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); 3180 lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, 3181 Operand::c32(0xffffffffu), borrow); 3182 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow); 3183 lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower); 3184 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper); 3185 Temp new_lower = bld.tmp(v1); 3186 borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp(); 3187 Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow); 3188 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper); 3189 3190 } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) { 3191 if (src.type() == RegType::vgpr) 3192 src = bld.as_uniform(src); 3193 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, 3194 Operand::c32(0x80017u)); 3195 exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, 3196 Operand::c32(126u)); 3197 exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand::zero(), 3198 exponent); 3199 exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc), 3200 Operand::c32(64u), exponent); 3201 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), 3202 Operand::c32(0x7fffffu), src); 3203 Temp sign = 3204 bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(31u)); 3205 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), 3206 Operand::c32(0x800000u), mantissa); 3207 mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, 3208 Operand::c32(7u)); 3209 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), mantissa); 3210 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), 3211 Operand::c32(63u), exponent); 3212 mantissa = 3213 bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent); 3214 Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, 3215 Operand::c32(0xffffffffu)); // exp >= 64 3216 Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand::c32(0xfffffffeu)); 3217 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond); 3218 Temp lower = bld.tmp(s1), upper = bld.tmp(s1); 3219 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); 3220 lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, 
lower); 3221 upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper); 3222 Temp borrow = bld.tmp(s1); 3223 lower = 3224 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign); 3225 upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, 3226 bld.scc(borrow)); 3227 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); 3228 3229 } else if (instr->src[0].src.ssa->bit_size == 64) { 3230 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), 3231 Operand::c32(0x3df00000u)); 3232 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src); 3233 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec); 3234 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), 3235 Operand::c32(0xc1f00000u)); 3236 Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul); 3237 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc); 3238 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma); 3239 Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor); 3240 if (dst.type() == RegType::sgpr) { 3241 lower = bld.as_uniform(lower); 3242 upper = bld.as_uniform(upper); 3243 } 3244 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); 3245 3246 } else { 3247 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 3248 } 3249 break; 3250 } 3251 case nir_op_f2u64: { 3252 Temp src = get_alu_src(ctx, instr->src[0]); 3253 if (instr->src[0].src.ssa->bit_size == 16) 3254 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); 3255 3256 if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) { 3257 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src); 3258 Temp exponent_in_range = 3259 bld.vopc(aco_opcode::v_cmp_ge_i32, bld.def(bld.lm), Operand::c32(64u), exponent); 3260 exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::zero(), exponent); 3261 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffu), src); 3262 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(0x800000u), mantissa); 3263 Temp exponent_small = bld.vsub32(bld.def(v1), Operand::c32(24u), exponent); 3264 Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa); 3265 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), mantissa); 3266 Temp new_exponent = bld.tmp(v1); 3267 Temp cond_small = 3268 bld.vsub32(Definition(new_exponent), exponent, Operand::c32(24u), true).def(1).getTemp(); 3269 if (ctx->program->gfx_level >= GFX8) 3270 mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa); 3271 else 3272 mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent); 3273 Temp lower = bld.tmp(v1), upper = bld.tmp(v1); 3274 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); 3275 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small); 3276 upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand::zero(), 3277 cond_small); 3278 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0xffffffffu), lower, 3279 exponent_in_range); 3280 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0xffffffffu), upper, 3281 exponent_in_range); 3282 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); 3283 3284 } else if 
(instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) { 3285 if (src.type() == RegType::vgpr) 3286 src = bld.as_uniform(src); 3287 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, 3288 Operand::c32(0x80017u)); 3289 exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, 3290 Operand::c32(126u)); 3291 exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand::zero(), 3292 exponent); 3293 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), 3294 Operand::c32(0x7fffffu), src); 3295 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), 3296 Operand::c32(0x800000u), mantissa); 3297 Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), 3298 Operand::c32(24u), exponent); 3299 Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, 3300 exponent_small); 3301 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), mantissa); 3302 Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), 3303 exponent, Operand::c32(24u)); 3304 mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, 3305 exponent_large); 3306 Temp cond = 3307 bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand::c32(64u), exponent); 3308 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, 3309 Operand::c32(0xffffffffu), cond); 3310 Temp lower = bld.tmp(s1), upper = bld.tmp(s1); 3311 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); 3312 Temp cond_small = 3313 bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand::c32(24u)); 3314 lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small); 3315 upper = 3316 bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::zero(), upper, cond_small); 3317 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); 3318 3319 } else if (instr->src[0].src.ssa->bit_size == 64) { 3320 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), 3321 Operand::c32(0x3df00000u)); 3322 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src); 3323 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec); 3324 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), 3325 Operand::c32(0xc1f00000u)); 3326 Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul); 3327 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc); 3328 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma); 3329 Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor); 3330 if (dst.type() == RegType::sgpr) { 3331 lower = bld.as_uniform(lower); 3332 upper = bld.as_uniform(upper); 3333 } 3334 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); 3335 3336 } else { 3337 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 3338 } 3339 break; 3340 } 3341 case nir_op_b2f16: { 3342 Temp src = get_alu_src(ctx, instr->src[0]); 3343 assert(src.regClass() == bld.lm); 3344 3345 if (dst.regClass() == s1) { 3346 src = bool_to_scalar_condition(ctx, src); 3347 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src); 3348 } else if (dst.regClass() == v2b) { 3349 Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u)); 3350 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src); 3351 } 
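      /* 0x3c00 is 1.0 in half precision: the uniform path multiplies the 0/1
       * condition by it, the divergent path selects it per lane with v_cndmask. */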
else { 3352 unreachable("Wrong destination register class for nir_op_b2f16."); 3353 } 3354 break; 3355 } 3356 case nir_op_b2f32: { 3357 Temp src = get_alu_src(ctx, instr->src[0]); 3358 assert(src.regClass() == bld.lm); 3359 3360 if (dst.regClass() == s1) { 3361 src = bool_to_scalar_condition(ctx, src); 3362 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src); 3363 } else if (dst.regClass() == v1) { 3364 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), 3365 Operand::c32(0x3f800000u), src); 3366 } else { 3367 unreachable("Wrong destination register class for nir_op_b2f32."); 3368 } 3369 break; 3370 } 3371 case nir_op_b2f64: { 3372 Temp src = get_alu_src(ctx, instr->src[0]); 3373 assert(src.regClass() == bld.lm); 3374 3375 if (dst.regClass() == s2) { 3376 src = bool_to_scalar_condition(ctx, src); 3377 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u), 3378 Operand::zero(), bld.scc(src)); 3379 } else if (dst.regClass() == v2) { 3380 Temp one = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u)); 3381 Temp upper = 3382 bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src); 3383 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper); 3384 } else { 3385 unreachable("Wrong destination register class for nir_op_b2f64."); 3386 } 3387 break; 3388 } 3389 case nir_op_i2i8: 3390 case nir_op_i2i16: 3391 case nir_op_i2i32: 3392 case nir_op_i2i64: { 3393 if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) { 3394 /* no need to do the extract in get_alu_src() */ 3395 sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size 3396 ? sgpr_extract_sext 3397 : sgpr_extract_undef; 3398 extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode); 3399 } else { 3400 const unsigned input_bitsize = instr->src[0].src.ssa->bit_size; 3401 const unsigned output_bitsize = instr->dest.dest.ssa.bit_size; 3402 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize, 3403 output_bitsize > input_bitsize, dst); 3404 } 3405 break; 3406 } 3407 case nir_op_u2u8: 3408 case nir_op_u2u16: 3409 case nir_op_u2u32: 3410 case nir_op_u2u64: { 3411 if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) { 3412 /* no need to do the extract in get_alu_src() */ 3413 sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size 3414 ? sgpr_extract_zext 3415 : sgpr_extract_undef; 3416 extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode); 3417 } else { 3418 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size, 3419 instr->dest.dest.ssa.bit_size, false, dst); 3420 } 3421 break; 3422 } 3423 case nir_op_b2b32: 3424 case nir_op_b2i8: 3425 case nir_op_b2i16: 3426 case nir_op_b2i32: 3427 case nir_op_b2i64: { 3428 Temp src = get_alu_src(ctx, instr->src[0]); 3429 assert(src.regClass() == bld.lm); 3430 3431 Temp tmp = dst.bytes() == 8 ? 
bld.tmp(RegClass::get(dst.type(), 4)) : dst; 3432 if (tmp.regClass() == s1) { 3433 bool_to_scalar_condition(ctx, src, tmp); 3434 } else if (tmp.type() == RegType::vgpr) { 3435 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand::zero(), Operand::c32(1u), 3436 src); 3437 } else { 3438 unreachable("Invalid register class for b2i32"); 3439 } 3440 3441 if (tmp != dst) 3442 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero()); 3443 break; 3444 } 3445 case nir_op_b2b1: 3446 case nir_op_i2b1: { 3447 Temp src = get_alu_src(ctx, instr->src[0]); 3448 assert(dst.regClass() == bld.lm); 3449 3450 if (src.type() == RegType::vgpr) { 3451 assert(src.regClass() == v1 || src.regClass() == v2); 3452 assert(dst.regClass() == bld.lm); 3453 bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32, 3454 Definition(dst), Operand::zero(), src); 3455 } else { 3456 assert(src.regClass() == s1 || src.regClass() == s2); 3457 Temp tmp; 3458 if (src.regClass() == s2 && ctx->program->gfx_level <= GFX7) { 3459 tmp = 3460 bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src) 3461 .def(1) 3462 .getTemp(); 3463 } else { 3464 tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32, 3465 bld.scc(bld.def(s1)), Operand::zero(), src); 3466 } 3467 bool_to_vector_condition(ctx, tmp, dst); 3468 } 3469 break; 3470 } 3471 case nir_op_unpack_64_2x32: 3472 case nir_op_unpack_32_2x16: 3473 case nir_op_unpack_64_4x16: 3474 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); 3475 emit_split_vector(ctx, dst, instr->op == nir_op_unpack_64_4x16 ? 4 : 2); 3476 break; 3477 case nir_op_pack_64_2x32_split: { 3478 Temp src0 = get_alu_src(ctx, instr->src[0]); 3479 Temp src1 = get_alu_src(ctx, instr->src[1]); 3480 3481 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1); 3482 break; 3483 } 3484 case nir_op_unpack_64_2x32_split_x: 3485 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), 3486 get_alu_src(ctx, instr->src[0])); 3487 break; 3488 case nir_op_unpack_64_2x32_split_y: 3489 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), 3490 get_alu_src(ctx, instr->src[0])); 3491 break; 3492 case nir_op_unpack_32_2x16_split_x: 3493 if (dst.type() == RegType::vgpr) { 3494 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), 3495 get_alu_src(ctx, instr->src[0])); 3496 } else { 3497 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); 3498 } 3499 break; 3500 case nir_op_unpack_32_2x16_split_y: 3501 if (dst.type() == RegType::vgpr) { 3502 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), 3503 get_alu_src(ctx, instr->src[0])); 3504 } else { 3505 bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), 3506 get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u), 3507 Operand::zero()); 3508 } 3509 break; 3510 case nir_op_pack_32_2x16_split: { 3511 Temp src0 = get_alu_src(ctx, instr->src[0]); 3512 Temp src1 = get_alu_src(ctx, instr->src[1]); 3513 if (dst.regClass() == v1) { 3514 src0 = emit_extract_vector(ctx, src0, 0, v2b); 3515 src1 = emit_extract_vector(ctx, src1, 0, v2b); 3516 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1); 3517 } else { 3518 src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0, 3519 Operand::c32(0xFFFFu)); 3520 src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1, 3521 
Operand::c32(16u)); 3522 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1); 3523 } 3524 break; 3525 } 3526 case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0], 4)); break; 3527 case nir_op_pack_half_2x16_split: { 3528 if (dst.regClass() == v1) { 3529 if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9) 3530 emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst); 3531 else 3532 emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false); 3533 } else { 3534 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 3535 } 3536 break; 3537 } 3538 case nir_op_pack_unorm_2x16: 3539 case nir_op_pack_snorm_2x16: { 3540 Temp src = get_alu_src(ctx, instr->src[0], 2); 3541 Temp src0 = emit_extract_vector(ctx, src, 0, v1); 3542 Temp src1 = emit_extract_vector(ctx, src, 1, v1); 3543 aco_opcode opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f32 3544 : aco_opcode::v_cvt_pknorm_i16_f32; 3545 bld.vop3(opcode, Definition(dst), src0, src1); 3546 break; 3547 } 3548 case nir_op_pack_uint_2x16: 3549 case nir_op_pack_sint_2x16: { 3550 Temp src = get_alu_src(ctx, instr->src[0], 2); 3551 Temp src0 = emit_extract_vector(ctx, src, 0, v1); 3552 Temp src1 = emit_extract_vector(ctx, src, 1, v1); 3553 aco_opcode opcode = instr->op == nir_op_pack_uint_2x16 ? aco_opcode::v_cvt_pk_u16_u32 3554 : aco_opcode::v_cvt_pk_i16_i32; 3555 bld.vop3(opcode, Definition(dst), src0, src1); 3556 break; 3557 } 3558 case nir_op_unpack_half_2x16_split_x_flush_to_zero: 3559 case nir_op_unpack_half_2x16_split_x: { 3560 Temp src = get_alu_src(ctx, instr->src[0]); 3561 if (src.regClass() == v1) 3562 src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src); 3563 if (dst.regClass() == v1) { 3564 assert(ctx->block->fp_mode.must_flush_denorms16_64 == 3565 (instr->op == nir_op_unpack_half_2x16_split_x_flush_to_zero)); 3566 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src); 3567 } else { 3568 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 3569 } 3570 break; 3571 } 3572 case nir_op_unpack_half_2x16_split_y_flush_to_zero: 3573 case nir_op_unpack_half_2x16_split_y: { 3574 Temp src = get_alu_src(ctx, instr->src[0]); 3575 if (src.regClass() == s1) 3576 src = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), src, 3577 Operand::c32(1u), Operand::c32(16u), Operand::zero()); 3578 else 3579 src = 3580 bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp(); 3581 if (dst.regClass() == v1) { 3582 assert(ctx->block->fp_mode.must_flush_denorms16_64 == 3583 (instr->op == nir_op_unpack_half_2x16_split_y_flush_to_zero)); 3584 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src); 3585 } else { 3586 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 3587 } 3588 break; 3589 } 3590 case nir_op_sad_u8x4: { 3591 assert(dst.regClass() == v1); 3592 emit_vop3a_instruction(ctx, instr, aco_opcode::v_sad_u8, dst, false, 3u, false); 3593 break; 3594 } 3595 case nir_op_fquantize2f16: { 3596 Temp src = get_alu_src(ctx, instr->src[0]); 3597 Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), src); 3598 Temp f32, cmp_res; 3599 3600 if (ctx->program->gfx_level >= GFX8) { 3601 Temp mask = bld.copy( 3602 bld.def(s1), Operand::c32(0x36Fu)); /* value is NOT negative/positive denormal value */ 3603 cmp_res = bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.def(bld.lm), f16, mask); 3604 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, 
bld.def(v1), f16); 3605 } else { 3606 /* 0x38800000 is smallest half float value (2^-14) in 32-bit float, 3607 * so compare the result and flush to 0 if it's smaller. 3608 */ 3609 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16); 3610 Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u)); 3611 Instruction* tmp0 = bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest); 3612 tmp0->vop3().abs[0] = true; 3613 Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f32, bld.def(bld.lm), Operand::zero(), f32); 3614 cmp_res = bld.sop2(aco_opcode::s_nand_b64, bld.def(s2), bld.def(s1, scc), 3615 tmp0->definitions[0].getTemp(), tmp1); 3616 } 3617 3618 if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) { 3619 Temp copysign_0 = 3620 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src)); 3621 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res); 3622 } else { 3623 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), f32, cmp_res); 3624 } 3625 break; 3626 } 3627 case nir_op_bfm: { 3628 Temp bits = get_alu_src(ctx, instr->src[0]); 3629 Temp offset = get_alu_src(ctx, instr->src[1]); 3630 3631 if (dst.regClass() == s1) { 3632 bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset); 3633 } else if (dst.regClass() == v1) { 3634 bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset); 3635 } else { 3636 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 3637 } 3638 break; 3639 } 3640 case nir_op_bitfield_select: { 3641 3642 /* dst = (insert & bitmask) | (base & ~bitmask) */ 3643 if (dst.regClass() == s1) { 3644 Temp bitmask = get_alu_src(ctx, instr->src[0]); 3645 Temp insert = get_alu_src(ctx, instr->src[1]); 3646 Temp base = get_alu_src(ctx, instr->src[2]); 3647 aco_ptr<Instruction> sop2; 3648 nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src); 3649 nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src); 3650 Operand lhs; 3651 if (const_insert && const_bitmask) { 3652 lhs = Operand::c32(const_insert->u32 & const_bitmask->u32); 3653 } else { 3654 insert = 3655 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask); 3656 lhs = Operand(insert); 3657 } 3658 3659 Operand rhs; 3660 nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src); 3661 if (const_base && const_bitmask) { 3662 rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32); 3663 } else { 3664 base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask); 3665 rhs = Operand(base); 3666 } 3667 3668 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs); 3669 3670 } else if (dst.regClass() == v1) { 3671 emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3); 3672 } else { 3673 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 3674 } 3675 break; 3676 } 3677 case nir_op_ubfe: 3678 case nir_op_ibfe: { 3679 if (dst.bytes() != 4) 3680 unreachable("Unsupported BFE bit size"); 3681 3682 if (dst.type() == RegType::sgpr) { 3683 Temp base = get_alu_src(ctx, instr->src[0]); 3684 3685 nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src); 3686 nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src); 3687 if (const_offset && const_bits) { 3688 uint32_t extract = (const_bits->u32 << 16) | (const_offset->u32 & 0x1f); 3689 aco_opcode opcode = 3690 instr->op == nir_op_ubfe ? 
aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32; 3691 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract)); 3692 break; 3693 } 3694 3695 Temp offset = get_alu_src(ctx, instr->src[1]); 3696 Temp bits = get_alu_src(ctx, instr->src[2]); 3697 if (instr->op == nir_op_ubfe) { 3698 Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset); 3699 Temp masked = 3700 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask); 3701 bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset); 3702 } else { 3703 Operand bits_op = const_bits ? Operand::c32(const_bits->u32 << 16) 3704 : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), 3705 bld.def(s1, scc), bits, Operand::c32(16u)); 3706 Operand offset_op = const_offset 3707 ? Operand::c32(const_offset->u32 & 0x1fu) 3708 : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), 3709 offset, Operand::c32(0x1fu)); 3710 3711 Temp extract = 3712 bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op); 3713 bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract); 3714 } 3715 3716 } else { 3717 aco_opcode opcode = 3718 instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32; 3719 emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3); 3720 } 3721 break; 3722 } 3723 case nir_op_extract_u8: 3724 case nir_op_extract_i8: 3725 case nir_op_extract_u16: 3726 case nir_op_extract_i16: { 3727 bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8; 3728 unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2; 3729 uint32_t bits = comp == 4 ? 8 : 16; 3730 unsigned index = nir_src_as_uint(instr->src[1].src); 3731 if (bits >= instr->dest.dest.ssa.bit_size || index * bits >= instr->dest.dest.ssa.bit_size) { 3732 assert(index == 0); 3733 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); 3734 } else if (dst.regClass() == s1 && instr->dest.dest.ssa.bit_size == 16) { 3735 Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa); 3736 unsigned swizzle = instr->src[0].swizzle[0]; 3737 if (vec.size() > 1) { 3738 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1); 3739 swizzle = swizzle & 1; 3740 } 3741 index += swizzle * instr->dest.dest.ssa.bit_size / bits; 3742 bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec), 3743 Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed)); 3744 } else { 3745 Temp src = get_alu_src(ctx, instr->src[0]); 3746 Definition def(dst); 3747 if (dst.bytes() == 8) { 3748 src = emit_extract_vector(ctx, src, index / comp, RegClass(src.type(), 1)); 3749 index %= comp; 3750 def = bld.def(src.type(), 1); 3751 } 3752 assert(def.bytes() <= 4); 3753 if (def.regClass() == s1) { 3754 bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src), 3755 Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed)); 3756 } else { 3757 src = emit_extract_vector(ctx, src, 0, def.regClass()); 3758 bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index), 3759 Operand::c32(bits), Operand::c32(is_signed)); 3760 } 3761 if (dst.size() == 2) 3762 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(), 3763 Operand::zero()); 3764 } 3765 break; 3766 } 3767 case nir_op_insert_u8: 3768 case nir_op_insert_u16: { 3769 unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2; 3770 uint32_t bits = comp == 4 ? 
8 : 16; 3771 unsigned index = nir_src_as_uint(instr->src[1].src); 3772 if (bits >= instr->dest.dest.ssa.bit_size || index * bits >= instr->dest.dest.ssa.bit_size) { 3773 assert(index == 0); 3774 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); 3775 } else { 3776 Temp src = get_alu_src(ctx, instr->src[0]); 3777 Definition def(dst); 3778 bool swap = false; 3779 if (dst.bytes() == 8) { 3780 src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1)); 3781 swap = index >= comp; 3782 index %= comp; 3783 def = bld.def(src.type(), 1); 3784 } 3785 if (def.regClass() == s1) { 3786 bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src), 3787 Operand::c32(index), Operand::c32(bits)); 3788 } else { 3789 src = emit_extract_vector(ctx, src, 0, def.regClass()); 3790 bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index), 3791 Operand::c32(bits)); 3792 } 3793 if (dst.size() == 2 && swap) 3794 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), 3795 def.getTemp()); 3796 else if (dst.size() == 2) 3797 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(), 3798 Operand::zero()); 3799 } 3800 break; 3801 } 3802 case nir_op_bit_count: { 3803 Temp src = get_alu_src(ctx, instr->src[0]); 3804 if (src.regClass() == s1) { 3805 bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src); 3806 } else if (src.regClass() == v1) { 3807 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero()); 3808 } else if (src.regClass() == v2) { 3809 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1), 3810 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), 3811 emit_extract_vector(ctx, src, 0, v1), Operand::zero())); 3812 } else if (src.regClass() == s2) { 3813 bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src); 3814 } else { 3815 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 3816 } 3817 break; 3818 } 3819 case nir_op_flt: { 3820 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32, 3821 aco_opcode::v_cmp_lt_f64); 3822 break; 3823 } 3824 case nir_op_fge: { 3825 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32, 3826 aco_opcode::v_cmp_ge_f64); 3827 break; 3828 } 3829 case nir_op_feq: { 3830 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32, 3831 aco_opcode::v_cmp_eq_f64); 3832 break; 3833 } 3834 case nir_op_fneu: { 3835 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32, 3836 aco_opcode::v_cmp_neq_f64); 3837 break; 3838 } 3839 case nir_op_ilt: { 3840 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32, 3841 aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32); 3842 break; 3843 } 3844 case nir_op_ige: { 3845 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32, 3846 aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32); 3847 break; 3848 } 3849 case nir_op_ieq: { 3850 if (instr->src[0].src.ssa->bit_size == 1) 3851 emit_boolean_logic(ctx, instr, Builder::s_xnor, dst); 3852 else 3853 emit_comparison( 3854 ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32, 3855 aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, 3856 ctx->program->gfx_level >= GFX8 ? 
aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes); 3857 break; 3858 } 3859 case nir_op_ine: { 3860 if (instr->src[0].src.ssa->bit_size == 1) 3861 emit_boolean_logic(ctx, instr, Builder::s_xor, dst); 3862 else 3863 emit_comparison( 3864 ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32, 3865 aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, 3866 ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes); 3867 break; 3868 } 3869 case nir_op_ult: { 3870 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32, 3871 aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32); 3872 break; 3873 } 3874 case nir_op_uge: { 3875 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32, 3876 aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32); 3877 break; 3878 } 3879 case nir_op_fddx: 3880 case nir_op_fddy: 3881 case nir_op_fddx_fine: 3882 case nir_op_fddy_fine: 3883 case nir_op_fddx_coarse: 3884 case nir_op_fddy_coarse: { 3885 if (!nir_src_is_divergent(instr->src[0].src)) { 3886 /* Source is the same in all lanes, so the derivative is zero. 3887 * This also avoids emitting invalid IR. 3888 */ 3889 bld.copy(Definition(dst), Operand::zero()); 3890 break; 3891 } 3892 3893 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0])); 3894 uint16_t dpp_ctrl1, dpp_ctrl2; 3895 if (instr->op == nir_op_fddx_fine) { 3896 dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2); 3897 dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3); 3898 } else if (instr->op == nir_op_fddy_fine) { 3899 dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1); 3900 dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3); 3901 } else { 3902 dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0); 3903 if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse) 3904 dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1); 3905 else 3906 dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2); 3907 } 3908 3909 Temp tmp; 3910 if (ctx->program->gfx_level >= GFX8) { 3911 Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1); 3912 tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2); 3913 } else { 3914 Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1); 3915 Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2); 3916 tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl); 3917 } 3918 emit_wqm(bld, tmp, dst, true); 3919 break; 3920 } 3921 default: isel_err(&instr->instr, "Unknown NIR ALU instr"); 3922 } 3923} 3924 3925void 3926visit_load_const(isel_context* ctx, nir_load_const_instr* instr) 3927{ 3928 Temp dst = get_ssa_temp(ctx, &instr->def); 3929 3930 // TODO: we really want to have the resulting type as this would allow for 64bit literals 3931 // which get truncated the lsb if double and msb if int 3932 // for now, we only use s_mov_b64 with 64bit inline constants 3933 assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar."); 3934 assert(dst.type() == RegType::sgpr); 3935 3936 Builder bld(ctx->program, ctx->block); 3937 3938 if (instr->def.bit_size == 1) { 3939 assert(dst.regClass() == bld.lm); 3940 int val = instr->value[0].b ? -1 : 0; 3941 Operand op = bld.lm.size() == 1 ? 
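      /* 1-bit NIR values are stored as exec-sized lane masks, so "true" is an
       * all-ones mask: a 32-bit inline constant for wave32, a 64-bit one for wave64. */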
Operand::c32(val) : Operand::c64(val); 3942 bld.copy(Definition(dst), op); 3943 } else if (instr->def.bit_size == 8) { 3944 bld.copy(Definition(dst), Operand::c32(instr->value[0].u8)); 3945 } else if (instr->def.bit_size == 16) { 3946 /* sign-extend to use s_movk_i32 instead of a literal */ 3947 bld.copy(Definition(dst), Operand::c32(instr->value[0].i16)); 3948 } else if (dst.size() == 1) { 3949 bld.copy(Definition(dst), Operand::c32(instr->value[0].u32)); 3950 } else { 3951 assert(dst.size() != 1); 3952 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 3953 aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; 3954 if (instr->def.bit_size == 64) 3955 for (unsigned i = 0; i < dst.size(); i++) 3956 vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32); 3957 else { 3958 for (unsigned i = 0; i < dst.size(); i++) 3959 vec->operands[i] = Operand::c32(instr->value[i].u32); 3960 } 3961 vec->definitions[0] = Definition(dst); 3962 ctx->block->instructions.emplace_back(std::move(vec)); 3963 } 3964} 3965 3966bool 3967can_use_byte_align_for_global_load(unsigned num_components, unsigned component_size, 3968 unsigned align_, bool support_12_byte) 3969{ 3970 /* Only use byte-align for 8/16-bit loads if we won't have to increase its size and won't have 3971 * to use unsupported load sizes. 3972 */ 3973 assert(util_is_power_of_two_nonzero(align_)); 3974 if (align_ < 4) { 3975 assert(component_size < 4); 3976 unsigned load_size = num_components * component_size; 3977 int new_size = align(load_size + (4 - align_), 4); 3978 return new_size == align(load_size, 4) && (new_size != 12 || support_12_byte); 3979 } 3980 return true; 3981} 3982 3983struct LoadEmitInfo { 3984 Operand offset; 3985 Temp dst; 3986 unsigned num_components; 3987 unsigned component_size; 3988 Temp resource = Temp(0, s1); /* buffer resource or base 64-bit address */ 3989 unsigned component_stride = 0; 3990 unsigned const_offset = 0; 3991 unsigned align_mul = 0; 3992 unsigned align_offset = 0; 3993 3994 bool glc = false; 3995 bool slc = false; 3996 unsigned swizzle_component_size = 0; 3997 memory_sync_info sync; 3998 Temp soffset = Temp(0, s1); 3999}; 4000 4001struct EmitLoadParameters { 4002 using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset, 4003 unsigned bytes_needed, unsigned align, unsigned const_offset, 4004 Temp dst_hint); 4005 4006 Callback callback; 4007 bool byte_align_loads; 4008 bool supports_8bit_16bit_loads; 4009 unsigned max_const_offset_plus_one; 4010}; 4011 4012void 4013emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info, 4014 const EmitLoadParameters& params) 4015{ 4016 unsigned load_size = info.num_components * info.component_size; 4017 unsigned component_size = info.component_size; 4018 4019 unsigned num_vals = 0; 4020 Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp)); 4021 4022 unsigned const_offset = info.const_offset; 4023 4024 const unsigned align_mul = info.align_mul ? info.align_mul : component_size; 4025 unsigned align_offset = (info.align_offset + const_offset) % align_mul; 4026 4027 unsigned bytes_read = 0; 4028 while (bytes_read < load_size) { 4029 unsigned bytes_needed = load_size - bytes_read; 4030 4031 /* add buffer for unaligned loads */ 4032 int byte_align = 0; 4033 if (params.byte_align_loads) { 4034 byte_align = align_mul % 4 == 0 ?
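         /* byte_align is the known misalignment within a dword when the alignment
          * info proves it, or -1 when the sub-dword shift is only known at runtime. */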
align_offset % 4 : -1; 4035 } 4036 4037 if (byte_align) { 4038 if (bytes_needed > 2 || (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) || 4039 !params.supports_8bit_16bit_loads) { 4040 if (info.component_stride) { 4041 assert(params.supports_8bit_16bit_loads && "unimplemented"); 4042 bytes_needed = 2; 4043 byte_align = 0; 4044 } else { 4045 bytes_needed += byte_align == -1 ? 4 - info.align_mul : byte_align; 4046 bytes_needed = align(bytes_needed, 4); 4047 } 4048 } else { 4049 byte_align = 0; 4050 } 4051 } 4052 4053 if (info.swizzle_component_size) 4054 bytes_needed = MIN2(bytes_needed, info.swizzle_component_size); 4055 if (info.component_stride) 4056 bytes_needed = MIN2(bytes_needed, info.component_size); 4057 4058 bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4); 4059 4060 /* reduce constant offset */ 4061 Operand offset = info.offset; 4062 unsigned reduced_const_offset = const_offset; 4063 bool remove_const_offset_completely = need_to_align_offset; 4064 if (const_offset && 4065 (remove_const_offset_completely || const_offset >= params.max_const_offset_plus_one)) { 4066 unsigned to_add = const_offset; 4067 if (remove_const_offset_completely) { 4068 reduced_const_offset = 0; 4069 } else { 4070 to_add = 4071 const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one; 4072 reduced_const_offset %= params.max_const_offset_plus_one; 4073 } 4074 Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp(); 4075 if (offset.isConstant()) { 4076 offset = Operand::c32(offset.constantValue() + to_add); 4077 } else if (offset_tmp.regClass() == s1) { 4078 offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp, 4079 Operand::c32(to_add)); 4080 } else if (offset_tmp.regClass() == v1) { 4081 offset = bld.vadd32(bld.def(v1), offset_tmp, Operand::c32(to_add)); 4082 } else { 4083 Temp lo = bld.tmp(offset_tmp.type(), 1); 4084 Temp hi = bld.tmp(offset_tmp.type(), 1); 4085 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp); 4086 4087 if (offset_tmp.regClass() == s2) { 4088 Temp carry = bld.tmp(s1); 4089 lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo, 4090 Operand::c32(to_add)); 4091 hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry); 4092 offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi); 4093 } else { 4094 Temp new_lo = bld.tmp(v1); 4095 Temp carry = 4096 bld.vadd32(Definition(new_lo), lo, Operand::c32(to_add), true).def(1).getTemp(); 4097 hi = bld.vadd32(bld.def(v1), hi, Operand::zero(), false, carry); 4098 offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi); 4099 } 4100 } 4101 } 4102 4103 /* align offset down if needed */ 4104 Operand aligned_offset = offset; 4105 unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul; 4106 if (need_to_align_offset) { 4107 align = 4; 4108 Temp offset_tmp = offset.isTemp() ? 
offset.getTemp() : Temp(); 4109 if (offset.isConstant()) { 4110 aligned_offset = Operand::c32(offset.constantValue() & 0xfffffffcu); 4111 } else if (offset_tmp.regClass() == s1) { 4112 aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), 4113 Operand::c32(0xfffffffcu), offset_tmp); 4114 } else if (offset_tmp.regClass() == s2) { 4115 aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), 4116 Operand::c64(0xfffffffffffffffcllu), offset_tmp); 4117 } else if (offset_tmp.regClass() == v1) { 4118 aligned_offset = 4119 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), offset_tmp); 4120 } else if (offset_tmp.regClass() == v2) { 4121 Temp hi = bld.tmp(v1), lo = bld.tmp(v1); 4122 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp); 4123 lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), lo); 4124 aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi); 4125 } 4126 } 4127 Temp aligned_offset_tmp = 4128 aligned_offset.isTemp() ? aligned_offset.getTemp() : bld.copy(bld.def(s1), aligned_offset); 4129 4130 Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align, 4131 reduced_const_offset, byte_align ? Temp() : info.dst); 4132 4133 /* the callback wrote directly to dst */ 4134 if (val == info.dst) { 4135 assert(num_vals == 0); 4136 emit_split_vector(ctx, info.dst, info.num_components); 4137 return; 4138 } 4139 4140 /* shift result right if needed */ 4141 if (params.byte_align_loads && info.component_size < 4) { 4142 Operand byte_align_off = Operand::c32(byte_align); 4143 if (byte_align == -1) { 4144 if (offset.isConstant()) 4145 byte_align_off = Operand::c32(offset.constantValue() % 4u); 4146 else if (offset.size() == 2) 4147 byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0, 4148 RegClass(offset.getTemp().type(), 1))); 4149 else 4150 byte_align_off = offset; 4151 } 4152 4153 assert(val.bytes() >= load_size && "unimplemented"); 4154 if (val.type() == RegType::sgpr) 4155 byte_align_scalar(ctx, val, byte_align_off, info.dst); 4156 else 4157 byte_align_vector(ctx, val, byte_align_off, info.dst, component_size); 4158 return; 4159 } 4160 4161 /* add result to list and advance */ 4162 if (info.component_stride) { 4163 assert(val.bytes() == info.component_size && "unimplemented"); 4164 const_offset += info.component_stride; 4165 align_offset = (align_offset + info.component_stride) % align_mul; 4166 } else { 4167 const_offset += val.bytes(); 4168 align_offset = (align_offset + val.bytes()) % align_mul; 4169 } 4170 bytes_read += val.bytes(); 4171 vals[num_vals++] = val; 4172 } 4173 4174 /* create array of components */ 4175 unsigned components_split = 0; 4176 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec; 4177 bool has_vgprs = false; 4178 for (unsigned i = 0; i < num_vals;) { 4179 Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp)); 4180 unsigned num_tmps = 0; 4181 unsigned tmp_size = 0; 4182 RegType reg_type = RegType::sgpr; 4183 while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) { 4184 if (vals[i].type() == RegType::vgpr) 4185 reg_type = RegType::vgpr; 4186 tmp_size += vals[i].bytes(); 4187 tmp[num_tmps++] = vals[i++]; 4188 } 4189 if (num_tmps > 1) { 4190 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 4191 aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)}; 4192 for (unsigned j = 0; j < num_tmps; j++) 4193 vec->operands[j] = Operand(tmp[j]); 4194 tmp[0] = 
bld.tmp(RegClass::get(reg_type, tmp_size)); 4195 vec->definitions[0] = Definition(tmp[0]); 4196 bld.insert(std::move(vec)); 4197 } 4198 4199 if (tmp[0].bytes() % component_size) { 4200 /* trim tmp[0] */ 4201 assert(i == num_vals); 4202 RegClass new_rc = 4203 RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size); 4204 tmp[0] = 4205 bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand::zero()); 4206 } 4207 4208 RegClass elem_rc = RegClass::get(reg_type, component_size); 4209 4210 unsigned start = components_split; 4211 4212 if (tmp_size == elem_rc.bytes()) { 4213 allocated_vec[components_split++] = tmp[0]; 4214 } else { 4215 assert(tmp_size % elem_rc.bytes() == 0); 4216 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>( 4217 aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())}; 4218 for (auto& def : split->definitions) { 4219 Temp component = bld.tmp(elem_rc); 4220 allocated_vec[components_split++] = component; 4221 def = Definition(component); 4222 } 4223 split->operands[0] = Operand(tmp[0]); 4224 bld.insert(std::move(split)); 4225 } 4226 4227 /* try to p_as_uniform early so we can create more optimizable code and 4228 * also update allocated_vec */ 4229 for (unsigned j = start; j < components_split; j++) { 4230 if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr) 4231 allocated_vec[j] = bld.as_uniform(allocated_vec[j]); 4232 has_vgprs |= allocated_vec[j].type() == RegType::vgpr; 4233 } 4234 } 4235 4236 /* concatenate components and p_as_uniform() result if needed */ 4237 if (info.dst.type() == RegType::vgpr || !has_vgprs) 4238 ctx->allocated_vec.emplace(info.dst.id(), allocated_vec); 4239 4240 int padding_bytes = 4241 MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0); 4242 4243 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 4244 aco_opcode::p_create_vector, Format::PSEUDO, info.num_components + !!padding_bytes, 1)}; 4245 for (unsigned i = 0; i < info.num_components; i++) 4246 vec->operands[i] = Operand(allocated_vec[i]); 4247 if (padding_bytes) 4248 vec->operands[info.num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes)); 4249 if (info.dst.type() == RegType::sgpr && has_vgprs) { 4250 Temp tmp = bld.tmp(RegType::vgpr, info.dst.size()); 4251 vec->definitions[0] = Definition(tmp); 4252 bld.insert(std::move(vec)); 4253 bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp); 4254 } else { 4255 vec->definitions[0] = Definition(info.dst); 4256 bld.insert(std::move(vec)); 4257 } 4258} 4259 4260Operand 4261load_lds_size_m0(Builder& bld) 4262{ 4263 /* m0 does not need to be initialized on GFX9+ */ 4264 if (bld.program->gfx_level >= GFX9) 4265 return Operand(s1); 4266 4267 return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu))); 4268} 4269 4270Temp 4271lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed, 4272 unsigned align, unsigned const_offset, Temp dst_hint) 4273{ 4274 offset = offset.regClass() == s1 ? 
bld.copy(bld.def(v1), offset) : offset; 4275 4276 Operand m = load_lds_size_m0(bld); 4277 4278 bool large_ds_read = bld.program->gfx_level >= GFX7; 4279 bool usable_read2 = bld.program->gfx_level >= GFX7; 4280 4281 bool read2 = false; 4282 unsigned size = 0; 4283 aco_opcode op; 4284 if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) { 4285 size = 16; 4286 op = aco_opcode::ds_read_b128; 4287 } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) { 4288 size = 16; 4289 read2 = true; 4290 op = aco_opcode::ds_read2_b64; 4291 } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) { 4292 size = 12; 4293 op = aco_opcode::ds_read_b96; 4294 } else if (bytes_needed >= 8 && align % 8 == 0) { 4295 size = 8; 4296 op = aco_opcode::ds_read_b64; 4297 } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) { 4298 size = 8; 4299 read2 = true; 4300 op = aco_opcode::ds_read2_b32; 4301 } else if (bytes_needed >= 4 && align % 4 == 0) { 4302 size = 4; 4303 op = aco_opcode::ds_read_b32; 4304 } else if (bytes_needed >= 2 && align % 2 == 0) { 4305 size = 2; 4306 op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16; 4307 } else { 4308 size = 1; 4309 op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8; 4310 } 4311 4312 unsigned const_offset_unit = read2 ? size / 2u : 1u; 4313 unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536; 4314 4315 if (const_offset > (const_offset_range - const_offset_unit)) { 4316 unsigned excess = const_offset - (const_offset % const_offset_range); 4317 offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess)); 4318 const_offset -= excess; 4319 } 4320 4321 const_offset /= const_offset_unit; 4322 4323 RegClass rc = RegClass::get(RegType::vgpr, size); 4324 Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc); 4325 Instruction* instr; 4326 if (read2) 4327 instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1); 4328 else 4329 instr = bld.ds(op, Definition(val), offset, m, const_offset); 4330 instr->ds().sync = info.sync; 4331 4332 if (m.isUndefined()) 4333 instr->operands.pop_back(); 4334 4335 return val; 4336} 4337 4338const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX}; 4339 4340Temp 4341smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed, 4342 unsigned align, unsigned const_offset, Temp dst_hint) 4343{ 4344 assert(align >= 4u); 4345 4346 bool buffer = info.resource.id() && info.resource.bytes() == 16; 4347 Temp addr = info.resource; 4348 if (!buffer && !addr.id()) { 4349 addr = offset; 4350 offset = Temp(); 4351 } 4352 4353 bytes_needed = MIN2(bytes_needed, 64); 4354 unsigned needed_round_up = util_next_power_of_two(bytes_needed); 4355 unsigned needed_round_down = needed_round_up >> (needed_round_up != bytes_needed ? 1 : 0); 4356 /* Only round-up global loads if it's aligned so that it won't cross pages */ 4357 bytes_needed = buffer || align % needed_round_up == 0 ? needed_round_up : needed_round_down; 4358 4359 aco_opcode op; 4360 if (bytes_needed <= 4) { 4361 op = buffer ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword; 4362 } else if (bytes_needed <= 8) { 4363 op = buffer ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2; 4364 } else if (bytes_needed <= 16) { 4365 op = buffer ? 
aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4; 4366 } else if (bytes_needed <= 32) { 4367 op = buffer ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8; 4368 } else { 4369 assert(bytes_needed == 64); 4370 op = buffer ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16; 4371 } 4372 4373 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)}; 4374 if (buffer) { 4375 if (const_offset) 4376 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, 4377 Operand::c32(const_offset)); 4378 load->operands[0] = Operand(info.resource); 4379 load->operands[1] = Operand(offset); 4380 } else { 4381 load->operands[0] = Operand(addr); 4382 if (offset.id() && const_offset) 4383 load->operands[1] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, 4384 Operand::c32(const_offset)); 4385 else if (offset.id()) 4386 load->operands[1] = Operand(offset); 4387 else 4388 load->operands[1] = Operand::c32(const_offset); 4389 } 4390 RegClass rc(RegType::sgpr, DIV_ROUND_UP(bytes_needed, 4u)); 4391 Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc); 4392 load->definitions[0] = Definition(val); 4393 load->glc = info.glc; 4394 load->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3); 4395 load->sync = info.sync; 4396 bld.insert(std::move(load)); 4397 return val; 4398} 4399 4400const EmitLoadParameters smem_load_params{smem_load_callback, true, false, 1024}; 4401 4402Temp 4403mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed, 4404 unsigned align_, unsigned const_offset, Temp dst_hint) 4405{ 4406 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); 4407 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0); 4408 4409 if (info.soffset.id()) { 4410 if (soffset.isTemp()) 4411 vaddr = bld.copy(bld.def(v1), soffset); 4412 soffset = Operand(info.soffset); 4413 } 4414 4415 unsigned bytes_size = 0; 4416 aco_opcode op; 4417 if (bytes_needed == 1 || align_ % 2) { 4418 bytes_size = 1; 4419 op = aco_opcode::buffer_load_ubyte; 4420 } else if (bytes_needed == 2 || align_ % 4) { 4421 bytes_size = 2; 4422 op = aco_opcode::buffer_load_ushort; 4423 } else if (bytes_needed <= 4) { 4424 bytes_size = 4; 4425 op = aco_opcode::buffer_load_dword; 4426 } else if (bytes_needed <= 8) { 4427 bytes_size = 8; 4428 op = aco_opcode::buffer_load_dwordx2; 4429 } else if (bytes_needed <= 12 && bld.program->gfx_level > GFX6) { 4430 bytes_size = 12; 4431 op = aco_opcode::buffer_load_dwordx3; 4432 } else { 4433 bytes_size = 16; 4434 op = aco_opcode::buffer_load_dwordx4; 4435 } 4436 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)}; 4437 mubuf->operands[0] = Operand(info.resource); 4438 mubuf->operands[1] = vaddr; 4439 mubuf->operands[2] = soffset; 4440 mubuf->offen = (offset.type() == RegType::vgpr); 4441 mubuf->glc = info.glc; 4442 mubuf->dlc = 4443 info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3); 4444 mubuf->slc = info.slc; 4445 mubuf->sync = info.sync; 4446 mubuf->offset = const_offset; 4447 mubuf->swizzled = info.swizzle_component_size != 0; 4448 RegClass rc = RegClass::get(RegType::vgpr, bytes_size); 4449 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? 
dst_hint : bld.tmp(rc); 4450 mubuf->definitions[0] = Definition(val); 4451 bld.insert(std::move(mubuf)); 4452 4453 return val; 4454} 4455 4456const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096}; 4457 4458Temp 4459scratch_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed, 4460 unsigned align_, unsigned const_offset, Temp dst_hint) 4461{ 4462 unsigned bytes_size = 0; 4463 aco_opcode op; 4464 if (bytes_needed == 1 || align_ % 2u) { 4465 bytes_size = 1; 4466 op = aco_opcode::scratch_load_ubyte; 4467 } else if (bytes_needed == 2 || align_ % 4u) { 4468 bytes_size = 2; 4469 op = aco_opcode::scratch_load_ushort; 4470 } else if (bytes_needed <= 4) { 4471 bytes_size = 4; 4472 op = aco_opcode::scratch_load_dword; 4473 } else if (bytes_needed <= 8) { 4474 bytes_size = 8; 4475 op = aco_opcode::scratch_load_dwordx2; 4476 } else if (bytes_needed <= 12) { 4477 bytes_size = 12; 4478 op = aco_opcode::scratch_load_dwordx3; 4479 } else { 4480 bytes_size = 16; 4481 op = aco_opcode::scratch_load_dwordx4; 4482 } 4483 RegClass rc = RegClass::get(RegType::vgpr, bytes_size); 4484 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc); 4485 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, Format::SCRATCH, 2, 1)}; 4486 flat->operands[0] = offset.regClass() == s1 ? Operand(v1) : Operand(offset); 4487 flat->operands[1] = offset.regClass() == s1 ? Operand(offset) : Operand(s1); 4488 flat->sync = info.sync; 4489 flat->offset = const_offset; 4490 flat->definitions[0] = Definition(val); 4491 bld.insert(std::move(flat)); 4492 4493 return val; 4494} 4495 4496const EmitLoadParameters scratch_mubuf_load_params{mubuf_load_callback, false, true, 4096}; 4497const EmitLoadParameters scratch_flat_load_params{scratch_load_callback, false, true, 2048}; 4498 4499Temp 4500get_gfx6_global_rsrc(Builder& bld, Temp addr) 4501{ 4502 uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 4503 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); 4504 4505 if (addr.type() == RegType::vgpr) 4506 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand::zero(), Operand::zero(), 4507 Operand::c32(-1u), Operand::c32(rsrc_conf)); 4508 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand::c32(-1u), 4509 Operand::c32(rsrc_conf)); 4510} 4511 4512Temp 4513add64_32(Builder& bld, Temp src0, Temp src1) 4514{ 4515 Temp src00 = bld.tmp(src0.type(), 1); 4516 Temp src01 = bld.tmp(src0.type(), 1); 4517 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); 4518 4519 if (src0.type() == RegType::vgpr || src1.type() == RegType::vgpr) { 4520 Temp dst0 = bld.tmp(v1); 4521 Temp carry = bld.vadd32(Definition(dst0), src00, src1, true).def(1).getTemp(); 4522 Temp dst1 = bld.vadd32(bld.def(v1), src01, Operand::zero(), false, carry); 4523 return bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1); 4524 } else { 4525 Temp carry = bld.tmp(s1); 4526 Temp dst0 = 4527 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src1); 4528 Temp dst1 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), src01, carry); 4529 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), dst0, dst1); 4530 } 4531} 4532 4533void 4534lower_global_address(Builder& bld, uint32_t offset_in, Temp* address_inout, 4535 uint32_t* const_offset_inout, Temp* offset_inout) 4536{ 4537 Temp address = *address_inout; 4538 uint64_t const_offset = *const_offset_inout + 
offset_in; 4539 Temp offset = *offset_inout; 4540 4541 uint64_t max_const_offset_plus_one = 4542 1; /* GFX7/8/9: FLAT loads do not support constant offsets */ 4543 if (bld.program->gfx_level >= GFX9) 4544 max_const_offset_plus_one = bld.program->dev.scratch_global_offset_max; 4545 else if (bld.program->gfx_level == GFX6) 4546 max_const_offset_plus_one = 4096; /* MUBUF has a 12-bit unsigned offset field */ 4547 uint64_t excess_offset = const_offset - (const_offset % max_const_offset_plus_one); 4548 const_offset %= max_const_offset_plus_one; 4549 4550 if (!offset.id()) { 4551 while (unlikely(excess_offset > UINT32_MAX)) { 4552 address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(UINT32_MAX))); 4553 excess_offset -= UINT32_MAX; 4554 } 4555 if (excess_offset) 4556 offset = bld.copy(bld.def(s1), Operand::c32(excess_offset)); 4557 } else { 4558 /* If we add to "offset", we would transform the indended 4559 * "address + u2u64(offset) + u2u64(const_offset)" into 4560 * "address + u2u64(offset + const_offset)", so add to the address. 4561 * This could be more efficient if excess_offset>UINT32_MAX by doing a full 64-bit addition, 4562 * but that should be really rare. 4563 */ 4564 while (excess_offset) { 4565 uint32_t src2 = MIN2(excess_offset, UINT32_MAX); 4566 address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(src2))); 4567 excess_offset -= src2; 4568 } 4569 } 4570 4571 if (bld.program->gfx_level == GFX6) { 4572 /* GFX6 (MUBUF): (SGPR address, SGPR offset) or (VGPR address, SGPR offset) */ 4573 if (offset.type() != RegType::sgpr) { 4574 address = add64_32(bld, address, offset); 4575 offset = Temp(); 4576 } 4577 offset = offset.id() ? offset : bld.copy(bld.def(s1), Operand::zero()); 4578 } else if (bld.program->gfx_level <= GFX8) { 4579 /* GFX7,8 (FLAT): VGPR address */ 4580 if (offset.id()) { 4581 address = add64_32(bld, address, offset); 4582 offset = Temp(); 4583 } 4584 address = as_vgpr(bld, address); 4585 } else { 4586 /* GFX9+ (GLOBAL): (VGPR address), or (SGPR address and VGPR offset) */ 4587 if (address.type() == RegType::vgpr && offset.id()) { 4588 address = add64_32(bld, address, offset); 4589 offset = Temp(); 4590 } else if (address.type() == RegType::sgpr && offset.id()) { 4591 offset = as_vgpr(bld, offset); 4592 } 4593 if (address.type() == RegType::sgpr && !offset.id()) 4594 offset = bld.copy(bld.def(v1), bld.copy(bld.def(s1), Operand::zero())); 4595 } 4596 4597 *address_inout = address; 4598 *const_offset_inout = const_offset; 4599 *offset_inout = offset; 4600} 4601 4602Temp 4603global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed, 4604 unsigned align_, unsigned const_offset, Temp dst_hint) 4605{ 4606 Temp addr = info.resource; 4607 if (!addr.id()) { 4608 addr = offset; 4609 offset = Temp(); 4610 } 4611 lower_global_address(bld, 0, &addr, &const_offset, &offset); 4612 4613 unsigned bytes_size = 0; 4614 bool use_mubuf = bld.program->gfx_level == GFX6; 4615 bool global = bld.program->gfx_level >= GFX9; 4616 aco_opcode op; 4617 if (bytes_needed == 1 || align_ % 2u) { 4618 bytes_size = 1; 4619 op = use_mubuf ? aco_opcode::buffer_load_ubyte 4620 : global ? aco_opcode::global_load_ubyte 4621 : aco_opcode::flat_load_ubyte; 4622 } else if (bytes_needed == 2 || align_ % 4u) { 4623 bytes_size = 2; 4624 op = use_mubuf ? aco_opcode::buffer_load_ushort 4625 : global ? aco_opcode::global_load_ushort 4626 : aco_opcode::flat_load_ushort; 4627 } else if (bytes_needed <= 4) { 4628 bytes_size = 4; 4629 op = use_mubuf ? 
aco_opcode::buffer_load_dword 4630 : global ? aco_opcode::global_load_dword 4631 : aco_opcode::flat_load_dword; 4632 } else if (bytes_needed <= 8 || (bytes_needed <= 12 && use_mubuf)) { 4633 bytes_size = 8; 4634 op = use_mubuf ? aco_opcode::buffer_load_dwordx2 4635 : global ? aco_opcode::global_load_dwordx2 4636 : aco_opcode::flat_load_dwordx2; 4637 } else if (bytes_needed <= 12 && !use_mubuf) { 4638 bytes_size = 12; 4639 op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3; 4640 } else { 4641 bytes_size = 16; 4642 op = use_mubuf ? aco_opcode::buffer_load_dwordx4 4643 : global ? aco_opcode::global_load_dwordx4 4644 : aco_opcode::flat_load_dwordx4; 4645 } 4646 RegClass rc = RegClass::get(RegType::vgpr, bytes_size); 4647 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc); 4648 if (use_mubuf) { 4649 aco_ptr<MUBUF_instruction> mubuf{ 4650 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)}; 4651 mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, addr)); 4652 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); 4653 mubuf->operands[2] = Operand(offset); 4654 mubuf->glc = info.glc; 4655 mubuf->dlc = false; 4656 mubuf->offset = const_offset; 4657 mubuf->addr64 = addr.type() == RegType::vgpr; 4658 mubuf->disable_wqm = false; 4659 mubuf->sync = info.sync; 4660 mubuf->definitions[0] = Definition(val); 4661 bld.insert(std::move(mubuf)); 4662 } else { 4663 aco_ptr<FLAT_instruction> flat{ 4664 create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)}; 4665 if (addr.regClass() == s2) { 4666 assert(global && offset.id() && offset.type() == RegType::vgpr); 4667 flat->operands[0] = Operand(offset); 4668 flat->operands[1] = Operand(addr); 4669 } else { 4670 assert(addr.type() == RegType::vgpr && !offset.id()); 4671 flat->operands[0] = Operand(addr); 4672 flat->operands[1] = Operand(s1); 4673 } 4674 flat->glc = info.glc; 4675 flat->dlc = 4676 info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3); 4677 flat->sync = info.sync; 4678 assert(global || !const_offset); 4679 flat->offset = const_offset; 4680 flat->definitions[0] = Definition(val); 4681 bld.insert(std::move(flat)); 4682 } 4683 4684 return val; 4685} 4686 4687const EmitLoadParameters global_load_params{global_load_callback, true, true, UINT32_MAX}; 4688 4689Temp 4690load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst, 4691 Temp address, unsigned base_offset, unsigned align) 4692{ 4693 assert(util_is_power_of_two_nonzero(align)); 4694 4695 Builder bld(ctx->program, ctx->block); 4696 4697 LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes}; 4698 info.align_mul = align; 4699 info.align_offset = 0; 4700 info.sync = memory_sync_info(storage_shared); 4701 info.const_offset = base_offset; 4702 emit_load(ctx, bld, info, lds_load_params); 4703 4704 return dst; 4705} 4706 4707void 4708split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes, 4709 Temp src) 4710{ 4711 if (!count) 4712 return; 4713 4714 Builder bld(ctx->program, ctx->block); 4715 4716 /* count == 1 fast path */ 4717 if (count == 1) { 4718 if (dst_type == RegType::sgpr) 4719 dst[0] = bld.as_uniform(src); 4720 else 4721 dst[0] = as_vgpr(ctx, src); 4722 return; 4723 } 4724 4725 /* elem_size_bytes is the greatest common divisor which is a power of 2 */ 4726 unsigned elem_size_bytes = 4727 1u << (ffs(std::accumulate(bytes, bytes + 
count, 8, std::bit_or<>{})) - 1); 4728 4729 ASSERTED bool is_subdword = elem_size_bytes < 4; 4730 assert(!is_subdword || dst_type == RegType::vgpr); 4731 4732 for (unsigned i = 0; i < count; i++) 4733 dst[i] = bld.tmp(RegClass::get(dst_type, bytes[i])); 4734 4735 std::vector<Temp> temps; 4736 /* use allocated_vec if possible */ 4737 auto it = ctx->allocated_vec.find(src.id()); 4738 if (it != ctx->allocated_vec.end()) { 4739 if (!it->second[0].id()) 4740 goto split; 4741 unsigned elem_size = it->second[0].bytes(); 4742 assert(src.bytes() % elem_size == 0); 4743 4744 for (unsigned i = 0; i < src.bytes() / elem_size; i++) { 4745 if (!it->second[i].id()) 4746 goto split; 4747 } 4748 if (elem_size_bytes % elem_size) 4749 goto split; 4750 4751 temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size); 4752 elem_size_bytes = elem_size; 4753 } 4754 4755split: 4756 /* split src if necessary */ 4757 if (temps.empty()) { 4758 if (is_subdword && src.type() == RegType::sgpr) 4759 src = as_vgpr(ctx, src); 4760 if (dst_type == RegType::sgpr) 4761 src = bld.as_uniform(src); 4762 4763 unsigned num_elems = src.bytes() / elem_size_bytes; 4764 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>( 4765 aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)}; 4766 split->operands[0] = Operand(src); 4767 for (unsigned i = 0; i < num_elems; i++) { 4768 temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes))); 4769 split->definitions[i] = Definition(temps.back()); 4770 } 4771 bld.insert(std::move(split)); 4772 } 4773 4774 unsigned idx = 0; 4775 for (unsigned i = 0; i < count; i++) { 4776 unsigned op_count = dst[i].bytes() / elem_size_bytes; 4777 if (op_count == 1) { 4778 if (dst_type == RegType::sgpr) 4779 dst[i] = bld.as_uniform(temps[idx++]); 4780 else 4781 dst[i] = as_vgpr(ctx, temps[idx++]); 4782 continue; 4783 } 4784 4785 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, 4786 Format::PSEUDO, op_count, 1)}; 4787 for (unsigned j = 0; j < op_count; j++) { 4788 Temp tmp = temps[idx++]; 4789 if (dst_type == RegType::sgpr) 4790 tmp = bld.as_uniform(tmp); 4791 vec->operands[j] = Operand(tmp); 4792 } 4793 vec->definitions[0] = Definition(dst[i]); 4794 bld.insert(std::move(vec)); 4795 } 4796 return; 4797} 4798 4799bool 4800scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count) 4801{ 4802 unsigned start_elem = ffs(todo_mask) - 1; 4803 bool skip = !(mask & (1 << start_elem)); 4804 if (skip) 4805 mask = ~mask & todo_mask; 4806 4807 mask &= todo_mask; 4808 4809 u_bit_scan_consecutive_range(&mask, start, count); 4810 4811 return !skip; 4812} 4813 4814void 4815advance_write_mask(uint32_t* todo_mask, int start, int count) 4816{ 4817 *todo_mask &= ~u_bit_consecutive(0, count) << start; 4818} 4819 4820void 4821store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address, 4822 unsigned base_offset, unsigned align) 4823{ 4824 assert(util_is_power_of_two_nonzero(align)); 4825 assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8); 4826 4827 Builder bld(ctx->program, ctx->block); 4828 bool large_ds_write = ctx->options->gfx_level >= GFX7; 4829 bool usable_write2 = ctx->options->gfx_level >= GFX7; 4830 4831 unsigned write_count = 0; 4832 Temp write_datas[32]; 4833 unsigned offsets[32]; 4834 unsigned bytes[32]; 4835 aco_opcode opcodes[32]; 4836 4837 wrmask = util_widen_mask(wrmask, elem_size_bytes); 4838 4839 uint32_t todo = u_bit_consecutive(0, 
data.bytes()); 4840 while (todo) { 4841 int offset, byte; 4842 if (!scan_write_mask(wrmask, todo, &offset, &byte)) { 4843 offsets[write_count] = offset; 4844 bytes[write_count] = byte; 4845 opcodes[write_count] = aco_opcode::num_opcodes; 4846 write_count++; 4847 advance_write_mask(&todo, offset, byte); 4848 continue; 4849 } 4850 4851 bool aligned2 = offset % 2 == 0 && align % 2 == 0; 4852 bool aligned4 = offset % 4 == 0 && align % 4 == 0; 4853 bool aligned8 = offset % 8 == 0 && align % 8 == 0; 4854 bool aligned16 = offset % 16 == 0 && align % 16 == 0; 4855 4856 // TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial 4857 aco_opcode op = aco_opcode::num_opcodes; 4858 if (byte >= 16 && aligned16 && large_ds_write) { 4859 op = aco_opcode::ds_write_b128; 4860 byte = 16; 4861 } else if (byte >= 12 && aligned16 && large_ds_write) { 4862 op = aco_opcode::ds_write_b96; 4863 byte = 12; 4864 } else if (byte >= 8 && aligned8) { 4865 op = aco_opcode::ds_write_b64; 4866 byte = 8; 4867 } else if (byte >= 4 && aligned4) { 4868 op = aco_opcode::ds_write_b32; 4869 byte = 4; 4870 } else if (byte >= 2 && aligned2) { 4871 op = aco_opcode::ds_write_b16; 4872 byte = 2; 4873 } else if (byte >= 1) { 4874 op = aco_opcode::ds_write_b8; 4875 byte = 1; 4876 } else { 4877 assert(false); 4878 } 4879 4880 offsets[write_count] = offset; 4881 bytes[write_count] = byte; 4882 opcodes[write_count] = op; 4883 write_count++; 4884 advance_write_mask(&todo, offset, byte); 4885 } 4886 4887 Operand m = load_lds_size_m0(bld); 4888 4889 split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data); 4890 4891 for (unsigned i = 0; i < write_count; i++) { 4892 aco_opcode op = opcodes[i]; 4893 if (op == aco_opcode::num_opcodes) 4894 continue; 4895 4896 Temp split_data = write_datas[i]; 4897 4898 unsigned second = write_count; 4899 if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) { 4900 for (second = i + 1; second < write_count; second++) { 4901 if (opcodes[second] == op && (offsets[second] - offsets[i]) % split_data.bytes() == 0) { 4902 op = split_data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64; 4903 opcodes[second] = aco_opcode::num_opcodes; 4904 break; 4905 } 4906 } 4907 } 4908 4909 bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64; 4910 unsigned write2_off = (offsets[second] - offsets[i]) / split_data.bytes(); 4911 4912 unsigned inline_offset = base_offset + offsets[i]; 4913 unsigned max_offset = write2 ? 
(255 - write2_off) * split_data.bytes() : 65535; 4914 Temp address_offset = address; 4915 if (inline_offset > max_offset) { 4916 address_offset = bld.vadd32(bld.def(v1), Operand::c32(base_offset), address_offset); 4917 inline_offset = offsets[i]; 4918 } 4919 4920 /* offsets[i] shouldn't be large enough for this to happen */ 4921 assert(inline_offset <= max_offset); 4922 4923 Instruction* instr; 4924 if (write2) { 4925 Temp second_data = write_datas[second]; 4926 inline_offset /= split_data.bytes(); 4927 instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset, 4928 inline_offset + write2_off); 4929 } else { 4930 instr = bld.ds(op, address_offset, split_data, m, inline_offset); 4931 } 4932 instr->ds().sync = memory_sync_info(storage_shared); 4933 4934 if (m.isUndefined()) 4935 instr->operands.pop_back(); 4936 } 4937} 4938 4939aco_opcode 4940get_buffer_store_op(unsigned bytes) 4941{ 4942 switch (bytes) { 4943 case 1: return aco_opcode::buffer_store_byte; 4944 case 2: return aco_opcode::buffer_store_short; 4945 case 4: return aco_opcode::buffer_store_dword; 4946 case 8: return aco_opcode::buffer_store_dwordx2; 4947 case 12: return aco_opcode::buffer_store_dwordx3; 4948 case 16: return aco_opcode::buffer_store_dwordx4; 4949 } 4950 unreachable("Unexpected store size"); 4951 return aco_opcode::num_opcodes; 4952} 4953 4954void 4955split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type, 4956 Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count, 4957 Temp* write_datas, unsigned* offsets) 4958{ 4959 unsigned write_count_with_skips = 0; 4960 bool skips[16]; 4961 unsigned bytes[16]; 4962 4963 /* determine how to split the data */ 4964 unsigned todo = u_bit_consecutive(0, data.bytes()); 4965 while (todo) { 4966 int offset, byte; 4967 skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &byte); 4968 offsets[write_count_with_skips] = offset; 4969 if (skips[write_count_with_skips]) { 4970 bytes[write_count_with_skips] = byte; 4971 advance_write_mask(&todo, offset, byte); 4972 write_count_with_skips++; 4973 continue; 4974 } 4975 4976 /* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be 4977 * larger than swizzle_element_size */ 4978 byte = MIN2(byte, swizzle_element_size); 4979 if (byte % 4) 4980 byte = byte > 4 ? byte & ~0x3 : MIN2(byte, 2); 4981 4982 /* SMEM and GFX6 VMEM can't emit 12-byte stores */ 4983 if ((ctx->program->gfx_level == GFX6 || smem) && byte == 12) 4984 byte = 8; 4985 4986 /* dword or larger stores have to be dword-aligned */ 4987 unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4; 4988 unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset; 4989 bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0; 4990 if (!dword_aligned) 4991 byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 
2 : 1); 4992 4993 bytes[write_count_with_skips] = byte; 4994 advance_write_mask(&todo, offset, byte); 4995 write_count_with_skips++; 4996 } 4997 4998 /* actually split data */ 4999 split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data); 5000 5001 /* remove skips */ 5002 for (unsigned i = 0; i < write_count_with_skips; i++) { 5003 if (skips[i]) 5004 continue; 5005 write_datas[*write_count] = write_datas[i]; 5006 offsets[*write_count] = offsets[i]; 5007 (*write_count)++; 5008 } 5009} 5010 5011Temp 5012create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type, 5013 unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp()) 5014{ 5015 Builder bld(ctx->program, ctx->block); 5016 unsigned dword_size = elem_size_bytes / 4; 5017 5018 if (!dst.id()) 5019 dst = bld.tmp(RegClass(reg_type, cnt * dword_size)); 5020 5021 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec; 5022 aco_ptr<Pseudo_instruction> instr{ 5023 create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)}; 5024 instr->definitions[0] = Definition(dst); 5025 5026 for (unsigned i = 0; i < cnt; ++i) { 5027 if (arr[i].id()) { 5028 assert(arr[i].size() == dword_size); 5029 allocated_vec[i] = arr[i]; 5030 instr->operands[i] = Operand(arr[i]); 5031 } else { 5032 Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)), 5033 Operand::zero(dword_size == 2 ? 8 : 4)); 5034 allocated_vec[i] = zero; 5035 instr->operands[i] = Operand(zero); 5036 } 5037 } 5038 5039 bld.insert(std::move(instr)); 5040 5041 if (split_cnt) 5042 emit_split_vector(ctx, dst, split_cnt); 5043 else 5044 ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */ 5045 5046 return dst; 5047} 5048 5049inline unsigned 5050resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset) 5051{ 5052 if (const_offset >= 4096) { 5053 unsigned excess_const_offset = const_offset / 4096u * 4096u; 5054 const_offset %= 4096u; 5055 5056 if (!voffset.id()) 5057 voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset)); 5058 else if (unlikely(voffset.regClass() == s1)) 5059 voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), 5060 Operand::c32(excess_const_offset), Operand(voffset)); 5061 else if (likely(voffset.regClass() == v1)) 5062 voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand::c32(excess_const_offset)); 5063 else 5064 unreachable("Unsupported register class of voffset"); 5065 } 5066 5067 return const_offset; 5068} 5069 5070void 5071emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata, 5072 unsigned const_offset = 0u, memory_sync_info sync = memory_sync_info(), 5073 bool slc = false, bool swizzled = false) 5074{ 5075 assert(vdata.id()); 5076 assert(vdata.size() != 3 || ctx->program->gfx_level != GFX6); 5077 assert(vdata.size() >= 1 && vdata.size() <= 4); 5078 5079 Builder bld(ctx->program, ctx->block); 5080 aco_opcode op = get_buffer_store_op(vdata.bytes()); 5081 const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset); 5082 5083 Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1); 5084 Operand soffset_op = soffset.id() ? 
Operand(soffset) : Operand::zero(); 5085 bool glc = ctx->program->gfx_level < GFX11; 5086 Builder::Result r = 5087 bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset, 5088 /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled, 5089 /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, 5090 /* glc */ glc, /* dlc*/ false, /* slc */ slc); 5091 5092 r.instr->mubuf().sync = sync; 5093} 5094 5095void 5096store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset, 5097 unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask, 5098 bool allow_combining = true, memory_sync_info sync = memory_sync_info(), 5099 bool slc = false) 5100{ 5101 Builder bld(ctx->program, ctx->block); 5102 assert(elem_size_bytes == 1 || elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8); 5103 assert(write_mask); 5104 write_mask = util_widen_mask(write_mask, elem_size_bytes); 5105 5106 unsigned write_count = 0; 5107 Temp write_datas[32]; 5108 unsigned offsets[32]; 5109 split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask, allow_combining ? 16 : 4, 5110 &write_count, write_datas, offsets); 5111 5112 for (unsigned i = 0; i < write_count; i++) { 5113 unsigned const_offset = offsets[i] + base_const_offset; 5114 emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync, 5115 slc, !allow_combining); 5116 } 5117} 5118 5119void 5120load_vmem_mubuf(isel_context* ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset, 5121 unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components, 5122 unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true, 5123 bool slc = false, memory_sync_info sync = memory_sync_info()) 5124{ 5125 assert(elem_size_bytes == 1 || elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8); 5126 assert((num_components * elem_size_bytes) == dst.bytes()); 5127 assert(!!stride != allow_combining); 5128 5129 Builder bld(ctx->program, ctx->block); 5130 5131 LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor}; 5132 info.component_stride = allow_combining ? 0 : stride; 5133 info.glc = true; 5134 info.slc = slc; 5135 info.swizzle_component_size = allow_combining ? 0 : 4; 5136 info.align_mul = MIN2(elem_size_bytes, 4); 5137 info.align_offset = 0; 5138 info.soffset = soffset; 5139 info.const_offset = base_const_offset; 5140 info.sync = sync; 5141 emit_load(ctx, bld, info, mubuf_load_params); 5142} 5143 5144Temp 5145wave_id_in_threadgroup(isel_context* ctx) 5146{ 5147 Builder bld(ctx->program, ctx->block); 5148 return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), 5149 get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(24u | (4u << 16))); 5150} 5151 5152Temp 5153thread_id_in_threadgroup(isel_context* ctx) 5154{ 5155 /* tid_in_tg = wave_id * wave_size + tid_in_wave */ 5156 5157 Builder bld(ctx->program, ctx->block); 5158 Temp tid_in_wave = emit_mbcnt(ctx, bld.tmp(v1)); 5159 5160 if (ctx->program->workgroup_size <= ctx->program->wave_size) 5161 return tid_in_wave; 5162 5163 Temp wave_id_in_tg = wave_id_in_threadgroup(ctx); 5164 Temp num_pre_threads = 5165 bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), wave_id_in_tg, 5166 Operand::c32(ctx->program->wave_size == 64 ? 
6u : 5u)); 5167 return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave)); 5168} 5169 5170bool 5171store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr) 5172{ 5173 unsigned write_mask = nir_intrinsic_write_mask(instr); 5174 unsigned component = nir_intrinsic_component(instr); 5175 unsigned idx = nir_intrinsic_base(instr) * 4u + component; 5176 nir_src offset = *nir_get_io_offset_src(instr); 5177 5178 if (!nir_src_is_const(offset) || nir_src_as_uint(offset)) 5179 return false; 5180 5181 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 5182 5183 if (instr->src[0].ssa->bit_size == 64) 5184 write_mask = util_widen_mask(write_mask, 2); 5185 5186 RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1; 5187 5188 for (unsigned i = 0; i < 8; ++i) { 5189 if (write_mask & (1 << i)) { 5190 ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u); 5191 ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc); 5192 } 5193 idx++; 5194 } 5195 5196 if (ctx->stage == fragment_fs && ctx->program->info.ps.has_epilog) { 5197 unsigned index = nir_intrinsic_base(instr) - FRAG_RESULT_DATA0; 5198 5199 if (nir_intrinsic_src_type(instr) == nir_type_float16) { 5200 ctx->output_color_types |= ACO_TYPE_FLOAT16 << (index * 2); 5201 } else if (nir_intrinsic_src_type(instr) == nir_type_int16) { 5202 ctx->output_color_types |= ACO_TYPE_INT16 << (index * 2); 5203 } else if (nir_intrinsic_src_type(instr) == nir_type_uint16) { 5204 ctx->output_color_types |= ACO_TYPE_UINT16 << (index * 2); 5205 } 5206 } 5207 5208 return true; 5209} 5210 5211bool 5212load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst) 5213{ 5214 /* Only TCS per-vertex inputs are supported by this function. 5215 * Per-vertex inputs only match between the VS/TCS invocation id when the number of invocations 5216 * is the same. 
5217 */ 5218 if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq) 5219 return false; 5220 5221 nir_src* off_src = nir_get_io_offset_src(instr); 5222 nir_src* vertex_index_src = nir_get_io_arrayed_index_src(instr); 5223 nir_instr* vertex_index_instr = vertex_index_src->ssa->parent_instr; 5224 bool can_use_temps = 5225 nir_src_is_const(*off_src) && vertex_index_instr->type == nir_instr_type_intrinsic && 5226 nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id; 5227 5228 if (!can_use_temps) 5229 return false; 5230 5231 unsigned idx = nir_intrinsic_base(instr) * 4u + nir_intrinsic_component(instr) + 5232 4 * nir_src_as_uint(*off_src); 5233 Temp* src = &ctx->inputs.temps[idx]; 5234 create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst); 5235 5236 return true; 5237} 5238 5239static void export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos); 5240 5241void 5242visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr) 5243{ 5244 if (ctx->stage == vertex_vs || ctx->stage == tess_eval_vs || ctx->stage == fragment_fs || 5245 ctx->stage == vertex_ngg || ctx->stage == tess_eval_ngg || ctx->stage == mesh_ngg || 5246 (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) || 5247 ctx->shader->info.stage == MESA_SHADER_GEOMETRY) { 5248 bool stored_to_temps = store_output_to_temps(ctx, instr); 5249 if (!stored_to_temps) { 5250 isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction"); 5251 abort(); 5252 } 5253 } else { 5254 unreachable("Shader stage not implemented"); 5255 } 5256} 5257 5258void 5259emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst, 5260 Temp prim_mask) 5261{ 5262 Temp coord1 = emit_extract_vector(ctx, src, 0, v1); 5263 Temp coord2 = emit_extract_vector(ctx, src, 1, v1); 5264 5265 Builder bld(ctx->program, ctx->block); 5266 5267 if (dst.regClass() == v2b) { 5268 if (ctx->program->dev.has_16bank_lds) { 5269 assert(ctx->options->gfx_level <= GFX8); 5270 Builder::Result interp_p1 = 5271 bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */, 5272 bld.m0(prim_mask), idx, component); 5273 interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), coord1, 5274 bld.m0(prim_mask), interp_p1, idx, component); 5275 bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask), 5276 interp_p1, idx, component); 5277 } else { 5278 aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16; 5279 5280 if (ctx->options->gfx_level == GFX8) 5281 interp_p2_op = aco_opcode::v_interp_p2_legacy_f16; 5282 5283 Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1, 5284 bld.m0(prim_mask), idx, component); 5285 bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx, 5286 component); 5287 } 5288 } else { 5289 Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, 5290 bld.m0(prim_mask), idx, component); 5291 5292 if (ctx->program->dev.has_16bank_lds) 5293 interp_p1.instr->operands[0].setLateKill(true); 5294 5295 bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, 5296 idx, component); 5297 } 5298} 5299 5300void 5301emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components) 5302{ 5303 Builder bld(ctx->program, ctx->block); 5304 5305 aco_ptr<Pseudo_instruction> 
vec(create_instruction<Pseudo_instruction>( 5306 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)); 5307 for (unsigned i = 0; i < num_components; i++) { 5308 if (ctx->args->ac.frag_pos[i].used) 5309 vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i])); 5310 else 5311 vec->operands[i] = Operand(v1); 5312 } 5313 if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) { 5314 assert(num_components == 4); 5315 vec->operands[3] = 5316 bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3])); 5317 } 5318 5319 for (Operand& op : vec->operands) 5320 op = op.isUndefined() ? Operand::zero() : op; 5321 5322 vec->definitions[0] = Definition(dst); 5323 ctx->block->instructions.emplace_back(std::move(vec)); 5324 emit_split_vector(ctx, dst, num_components); 5325 return; 5326} 5327 5328void 5329emit_load_frag_shading_rate(isel_context* ctx, Temp dst) 5330{ 5331 Builder bld(ctx->program, ctx->block); 5332 Temp cond; 5333 5334 /* VRS Rate X = Ancillary[2:3] 5335 * VRS Rate Y = Ancillary[4:5] 5336 */ 5337 Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary), 5338 Operand::c32(2u), Operand::c32(2u)); 5339 Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary), 5340 Operand::c32(4u), Operand::c32(2u)); 5341 5342 /* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */ 5343 cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate)); 5344 x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()), 5345 bld.copy(bld.def(v1), Operand::c32(4u)), cond); 5346 5347 /* yRate = yRate == 0x1 ? Vertical2Pixels : None. */ 5348 cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(y_rate)); 5349 y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()), 5350 bld.copy(bld.def(v1), Operand::c32(1u)), cond); 5351 5352 bld.vop2(aco_opcode::v_or_b32, Definition(dst), Operand(x_rate), Operand(y_rate)); 5353} 5354 5355void 5356visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr) 5357{ 5358 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 5359 Temp coords = get_ssa_temp(ctx, instr->src[0].ssa); 5360 unsigned idx = nir_intrinsic_base(instr); 5361 unsigned component = nir_intrinsic_component(instr); 5362 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask); 5363 5364 assert(nir_src_is_const(instr->src[1]) && !nir_src_as_uint(instr->src[1])); 5365 5366 if (instr->dest.ssa.num_components == 1) { 5367 emit_interp_instr(ctx, idx, component, coords, dst, prim_mask); 5368 } else { 5369 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>( 5370 aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1)); 5371 for (unsigned i = 0; i < instr->dest.ssa.num_components; i++) { 5372 Temp tmp = ctx->program->allocateTmp(instr->dest.ssa.bit_size == 16 ? 
v2b : v1); 5373 emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask); 5374 vec->operands[i] = Operand(tmp); 5375 } 5376 vec->definitions[0] = Definition(dst); 5377 ctx->block->instructions.emplace_back(std::move(vec)); 5378 } 5379} 5380 5381bool 5382check_vertex_fetch_size(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset, 5383 unsigned binding_align, unsigned channels) 5384{ 5385 unsigned vertex_byte_size = vtx_info->chan_byte_size * channels; 5386 if (vtx_info->chan_byte_size != 4 && channels == 3) 5387 return false; 5388 5389 /* Split typed vertex buffer loads on GFX6 and GFX10+ to avoid any 5390 * alignment issues that triggers memory violations and eventually a GPU 5391 * hang. This can happen if the stride (static or dynamic) is unaligned and 5392 * also if the VBO offset is aligned to a scalar (eg. stride is 8 and VBO 5393 * offset is 2 for R16G16B16A16_SNORM). 5394 */ 5395 return (ctx->options->gfx_level >= GFX7 && ctx->options->gfx_level <= GFX9) || 5396 (offset % vertex_byte_size == 0 && MAX2(binding_align, 1) % vertex_byte_size == 0); 5397} 5398 5399uint8_t 5400get_fetch_data_format(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset, 5401 unsigned* channels, unsigned max_channels, unsigned binding_align) 5402{ 5403 if (!vtx_info->chan_byte_size) { 5404 *channels = vtx_info->num_channels; 5405 return vtx_info->chan_format; 5406 } 5407 5408 unsigned num_channels = *channels; 5409 if (!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, *channels)) { 5410 unsigned new_channels = num_channels + 1; 5411 /* first, assume more loads is worse and try using a larger data format */ 5412 while (new_channels <= max_channels && 5413 !check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels)) { 5414 new_channels++; 5415 } 5416 5417 if (new_channels > max_channels) { 5418 /* then try decreasing load size (at the cost of more loads) */ 5419 new_channels = *channels; 5420 while (new_channels > 1 && 5421 !check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels)) 5422 new_channels--; 5423 } 5424 5425 if (new_channels < *channels) 5426 *channels = new_channels; 5427 num_channels = new_channels; 5428 } 5429 5430 switch (vtx_info->chan_format) { 5431 case V_008F0C_BUF_DATA_FORMAT_8: 5432 return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8, 5433 V_008F0C_BUF_DATA_FORMAT_INVALID, 5434 V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1]; 5435 case V_008F0C_BUF_DATA_FORMAT_16: 5436 return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16, 5437 V_008F0C_BUF_DATA_FORMAT_INVALID, 5438 V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1]; 5439 case V_008F0C_BUF_DATA_FORMAT_32: 5440 return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32, 5441 V_008F0C_BUF_DATA_FORMAT_32_32_32, 5442 V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1]; 5443 } 5444 unreachable("shouldn't reach here"); 5445 return V_008F0C_BUF_DATA_FORMAT_INVALID; 5446} 5447 5448void 5449visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) 5450{ 5451 Builder bld(ctx->program, ctx->block); 5452 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 5453 nir_src offset = *nir_get_io_offset_src(instr); 5454 5455 if (ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->program->info.vs.dynamic_inputs) { 5456 if (!nir_src_is_const(offset) || nir_src_as_uint(offset)) 5457 isel_err(offset.ssa->parent_instr, 5458 "Unimplemented non-zero 
nir_intrinsic_load_input offset"); 5459 5460 unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0; 5461 unsigned component = nir_intrinsic_component(instr); 5462 unsigned bitsize = instr->dest.ssa.bit_size; 5463 unsigned num_components = instr->dest.ssa.num_components; 5464 5465 Temp input = get_arg(ctx, ctx->args->vs_inputs[location]); 5466 5467 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>( 5468 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; 5469 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems; 5470 for (unsigned i = 0; i < num_components; i++) { 5471 elems[i] = emit_extract_vector(ctx, input, component + i, bitsize == 64 ? v2 : v1); 5472 if (bitsize == 16) { 5473 if (nir_alu_type_get_base_type(nir_intrinsic_dest_type(instr)) == nir_type_float) 5474 elems[i] = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), elems[i]); 5475 else 5476 elems[i] = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), elems[i], 5477 Operand::c32(0u)); 5478 } 5479 vec->operands[i] = Operand(elems[i]); 5480 } 5481 vec->definitions[0] = Definition(dst); 5482 ctx->block->instructions.emplace_back(std::move(vec)); 5483 ctx->allocated_vec.emplace(dst.id(), elems); 5484 } else if (ctx->shader->info.stage == MESA_SHADER_VERTEX) { 5485 5486 if (!nir_src_is_const(offset) || nir_src_as_uint(offset)) 5487 isel_err(offset.ssa->parent_instr, 5488 "Unimplemented non-zero nir_intrinsic_load_input offset"); 5489 5490 Temp vertex_buffers = 5491 convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.vertex_buffers)); 5492 5493 unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0; 5494 unsigned component = nir_intrinsic_component(instr); 5495 unsigned bitsize = instr->dest.ssa.bit_size; 5496 unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location]; 5497 uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location]; 5498 uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location]; 5499 unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location]; 5500 unsigned binding_align = ctx->options->key.vs.vertex_binding_align[attrib_binding]; 5501 5502 unsigned dfmt = attrib_format & 0xf; 5503 unsigned nfmt = (attrib_format >> 4) & 0x7; 5504 const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt); 5505 5506 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component; 5507 unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels); 5508 5509 unsigned desc_index = 5510 ctx->program->info.vs.use_per_attribute_vb_descs ? 
location : attrib_binding; 5511 desc_index = util_bitcount(ctx->program->info.vs.vb_desc_usage_mask & 5512 u_bit_consecutive(0, desc_index)); 5513 Operand off = bld.copy(bld.def(s1), Operand::c32(desc_index * 16u)); 5514 Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off); 5515 5516 Temp index; 5517 if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) { 5518 uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location]; 5519 Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance); 5520 if (divisor) { 5521 Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id); 5522 if (divisor != 1) { 5523 Temp divided = bld.tmp(v1); 5524 emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor); 5525 index = bld.vadd32(bld.def(v1), start_instance, divided); 5526 } else { 5527 index = bld.vadd32(bld.def(v1), start_instance, instance_id); 5528 } 5529 } else { 5530 index = bld.copy(bld.def(v1), start_instance); 5531 } 5532 } else { 5533 index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.base_vertex), 5534 get_arg(ctx, ctx->args->ac.vertex_id)); 5535 } 5536 5537 Temp* const channels = (Temp*)alloca(num_channels * sizeof(Temp)); 5538 unsigned channel_start = 0; 5539 bool direct_fetch = false; 5540 5541 /* skip unused channels at the start */ 5542 if (vtx_info->chan_byte_size) { 5543 channel_start = ffs(mask) - 1; 5544 for (unsigned i = 0; i < MIN2(channel_start, num_channels); i++) 5545 channels[i] = Temp(0, s1); 5546 } 5547 5548 /* load channels */ 5549 while (channel_start < num_channels) { 5550 unsigned fetch_component = num_channels - channel_start; 5551 unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size; 5552 5553 /* use MUBUF when possible to avoid possible alignment issues */ 5554 /* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */ 5555 bool use_mubuf = 5556 (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT || nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || 5557 nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) && 5558 vtx_info->chan_byte_size == 4 && bitsize != 16; 5559 unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID; 5560 if (!use_mubuf) { 5561 fetch_dfmt = 5562 get_fetch_data_format(ctx, vtx_info, fetch_offset, &fetch_component, 5563 vtx_info->num_channels - channel_start, binding_align); 5564 } else { 5565 /* GFX6 only supports loading vec3 with MTBUF, split to vec2,scalar. */ 5566 if (fetch_component == 3 && ctx->options->gfx_level == GFX6) 5567 fetch_component = 2; 5568 } 5569 5570 unsigned fetch_bytes = fetch_component * bitsize / 8; 5571 5572 Temp fetch_index = index; 5573 if (attrib_stride != 0 && fetch_offset > attrib_stride) { 5574 fetch_index = 5575 bld.vadd32(bld.def(v1), Operand::c32(fetch_offset / attrib_stride), fetch_index); 5576 fetch_offset = fetch_offset % attrib_stride; 5577 } 5578 5579 Operand soffset = Operand::zero(); 5580 if (fetch_offset >= 4096) { 5581 soffset = bld.copy(bld.def(s1), Operand::c32(fetch_offset / 4096 * 4096)); 5582 fetch_offset %= 4096; 5583 } 5584 5585 aco_opcode opcode; 5586 switch (fetch_bytes) { 5587 case 2: 5588 assert(!use_mubuf && bitsize == 16); 5589 opcode = aco_opcode::tbuffer_load_format_d16_x; 5590 break; 5591 case 4: 5592 if (bitsize == 16) { 5593 assert(!use_mubuf); 5594 opcode = aco_opcode::tbuffer_load_format_d16_xy; 5595 } else { 5596 opcode = 5597 use_mubuf ? 
aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x; 5598 } 5599 break; 5600 case 6: 5601 assert(!use_mubuf && bitsize == 16); 5602 opcode = aco_opcode::tbuffer_load_format_d16_xyz; 5603 break; 5604 case 8: 5605 if (bitsize == 16) { 5606 assert(!use_mubuf); 5607 opcode = aco_opcode::tbuffer_load_format_d16_xyzw; 5608 } else { 5609 opcode = 5610 use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy; 5611 } 5612 break; 5613 case 12: 5614 assert(ctx->options->gfx_level >= GFX7 || 5615 (!use_mubuf && ctx->options->gfx_level == GFX6)); 5616 opcode = 5617 use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz; 5618 break; 5619 case 16: 5620 opcode = 5621 use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw; 5622 break; 5623 default: unreachable("Unimplemented load_input vector size"); 5624 } 5625 5626 Temp fetch_dst; 5627 if (channel_start == 0 && fetch_bytes == dst.bytes() && num_channels <= 3) { 5628 direct_fetch = true; 5629 fetch_dst = dst; 5630 } else { 5631 fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes)); 5632 } 5633 5634 if (use_mubuf) { 5635 Instruction* mubuf = bld.mubuf(opcode, Definition(fetch_dst), list, fetch_index, 5636 soffset, fetch_offset, false, false, true) 5637 .instr; 5638 mubuf->mubuf().vtx_binding = attrib_binding + 1; 5639 } else { 5640 Instruction* mtbuf = bld.mtbuf(opcode, Definition(fetch_dst), list, fetch_index, 5641 soffset, fetch_dfmt, nfmt, fetch_offset, false, true) 5642 .instr; 5643 mtbuf->mtbuf().vtx_binding = attrib_binding + 1; 5644 } 5645 5646 emit_split_vector(ctx, fetch_dst, fetch_dst.bytes() * 8 / bitsize); 5647 5648 if (fetch_component == 1) { 5649 channels[channel_start] = fetch_dst; 5650 } else { 5651 for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++) 5652 channels[channel_start + i] = 5653 emit_extract_vector(ctx, fetch_dst, i, bitsize == 16 ? v2b : v1); 5654 } 5655 5656 channel_start += fetch_component; 5657 } 5658 5659 if (!direct_fetch) { 5660 bool is_float = 5661 nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && nfmt != V_008F0C_BUF_NUM_FORMAT_SINT; 5662 5663 unsigned num_components = instr->dest.ssa.num_components; 5664 5665 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>( 5666 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; 5667 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems; 5668 unsigned num_temp = 0; 5669 for (unsigned i = 0; i < num_components; i++) { 5670 unsigned idx = i + component; 5671 if (idx < num_channels && channels[idx].id()) { 5672 Temp channel = channels[idx]; 5673 vec->operands[i] = Operand(channel); 5674 5675 num_temp++; 5676 elems[i] = channel; 5677 } else if (is_float && idx == 3) { 5678 vec->operands[i] = bitsize == 16 ? 
Operand::c16(0x3c00u) : Operand::c32(0x3f800000u); 5679 } else if (!is_float && idx == 3) { 5680 vec->operands[i] = Operand::get_const(ctx->options->gfx_level, 1u, bitsize / 8u); 5681 } else { 5682 vec->operands[i] = Operand::zero(bitsize / 8u); 5683 } 5684 } 5685 vec->definitions[0] = Definition(dst); 5686 ctx->block->instructions.emplace_back(std::move(vec)); 5687 emit_split_vector(ctx, dst, num_components); 5688 5689 if (num_temp == num_components) 5690 ctx->allocated_vec.emplace(dst.id(), elems); 5691 } 5692 } else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) { 5693 if (!nir_src_is_const(offset) || nir_src_as_uint(offset)) 5694 isel_err(offset.ssa->parent_instr, 5695 "Unimplemented non-zero nir_intrinsic_load_input offset"); 5696 5697 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask); 5698 5699 unsigned idx = nir_intrinsic_base(instr); 5700 unsigned component = nir_intrinsic_component(instr); 5701 unsigned vertex_id = 2; /* P0 */ 5702 5703 if (instr->intrinsic == nir_intrinsic_load_input_vertex) { 5704 nir_const_value* src0 = nir_src_as_const_value(instr->src[0]); 5705 switch (src0->u32) { 5706 case 0: 5707 vertex_id = 2; /* P0 */ 5708 break; 5709 case 1: 5710 vertex_id = 0; /* P10 */ 5711 break; 5712 case 2: 5713 vertex_id = 1; /* P20 */ 5714 break; 5715 default: unreachable("invalid vertex index"); 5716 } 5717 } 5718 5719 if (instr->dest.ssa.num_components == 1 && 5720 instr->dest.ssa.bit_size != 64) { 5721 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32(vertex_id), 5722 bld.m0(prim_mask), idx, component); 5723 } else { 5724 unsigned num_components = instr->dest.ssa.num_components; 5725 if (instr->dest.ssa.bit_size == 64) 5726 num_components *= 2; 5727 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 5728 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; 5729 for (unsigned i = 0; i < num_components; i++) { 5730 unsigned chan_component = (component + i) % 4; 5731 unsigned chan_idx = idx + (component + i) / 4; 5732 vec->operands[i] = bld.vintrp( 5733 aco_opcode::v_interp_mov_f32, bld.def(instr->dest.ssa.bit_size == 16 ? 
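/* 16-bit interpolation results only occupy a 2-byte VGPR slice */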
v2b : v1), 5734 Operand::c32(vertex_id), bld.m0(prim_mask), chan_idx, chan_component); 5735 } 5736 vec->definitions[0] = Definition(dst); 5737 bld.insert(std::move(vec)); 5738 } 5739 } else { 5740 unreachable("Shader stage not implemented"); 5741 } 5742} 5743 5744void 5745visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr) 5746{ 5747 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL); 5748 5749 Builder bld(ctx->program, ctx->block); 5750 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 5751 5752 if (load_input_from_temps(ctx, instr, dst)) 5753 return; 5754 5755 unreachable("LDS-based TCS input should have been lowered in NIR."); 5756} 5757 5758void 5759visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr) 5760{ 5761 switch (ctx->shader->info.stage) { 5762 case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break; 5763 default: unreachable("Unimplemented shader stage"); 5764 } 5765} 5766 5767void 5768visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr) 5769{ 5770 assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL); 5771 5772 Builder bld(ctx->program, ctx->block); 5773 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 5774 5775 Operand tes_u(get_arg(ctx, ctx->args->ac.tes_u)); 5776 Operand tes_v(get_arg(ctx, ctx->args->ac.tes_v)); 5777 Operand tes_w = Operand::zero(); 5778 5779 if (ctx->shader->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES) { 5780 Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v); 5781 tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::c32(0x3f800000u /* 1.0f */), tmp); 5782 tes_w = Operand(tmp); 5783 } 5784 5785 Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w); 5786 emit_split_vector(ctx, tess_coord, 3); 5787} 5788 5789void 5790load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst, 5791 Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, bool glc = false, 5792 bool allow_smem = true, memory_sync_info sync = memory_sync_info()) 5793{ 5794 Builder bld(ctx->program, ctx->block); 5795 5796 bool use_smem = 5797 dst.type() != RegType::vgpr && (!glc || ctx->options->gfx_level >= GFX8) && allow_smem; 5798 if (use_smem) 5799 offset = bld.as_uniform(offset); 5800 else { 5801 /* GFX6-7 are affected by a hw bug that prevents address clamping to 5802 * work correctly when the SGPR offset is used. 
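* As a workaround, the offset is moved into a VGPR below so address clamping still applies.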
5803 */ 5804 if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8) 5805 offset = as_vgpr(ctx, offset); 5806 } 5807 5808 LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc}; 5809 info.glc = glc; 5810 info.sync = sync; 5811 info.align_mul = align_mul; 5812 info.align_offset = align_offset; 5813 if (use_smem) 5814 emit_load(ctx, bld, info, smem_load_params); 5815 else 5816 emit_load(ctx, bld, info, mubuf_load_params); 5817} 5818 5819void 5820visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr) 5821{ 5822 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 5823 Builder bld(ctx->program, ctx->block); 5824 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); 5825 5826 unsigned size = instr->dest.ssa.bit_size / 8; 5827 load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), 5828 nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr)); 5829} 5830 5831void 5832visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr) 5833{ 5834 Builder bld(ctx->program, ctx->block); 5835 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 5836 unsigned offset = nir_intrinsic_base(instr); 5837 unsigned count = instr->dest.ssa.num_components; 5838 nir_const_value* index_cv = nir_src_as_const_value(instr->src[0]); 5839 5840 if (instr->dest.ssa.bit_size == 64) 5841 count *= 2; 5842 5843 if (index_cv && instr->dest.ssa.bit_size >= 32) { 5844 unsigned start = (offset + index_cv->u32) / 4u; 5845 uint64_t mask = BITFIELD64_MASK(count) << start; 5846 if ((ctx->args->ac.inline_push_const_mask | mask) == ctx->args->ac.inline_push_const_mask && 5847 start + count <= (sizeof(ctx->args->ac.inline_push_const_mask) * 8u)) { 5848 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems; 5849 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 5850 aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; 5851 unsigned arg_index = 5852 util_bitcount64(ctx->args->ac.inline_push_const_mask & BITFIELD64_MASK(start)); 5853 for (unsigned i = 0; i < count; ++i) { 5854 elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[arg_index++]); 5855 vec->operands[i] = Operand{elems[i]}; 5856 } 5857 vec->definitions[0] = Definition(dst); 5858 ctx->block->instructions.emplace_back(std::move(vec)); 5859 ctx->allocated_vec.emplace(dst.id(), elems); 5860 return; 5861 } 5862 } 5863 5864 Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); 5865 if (offset != 0) // TODO check if index != 0 as well 5866 index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), 5867 Operand::c32(offset), index); 5868 Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants)); 5869 Temp vec = dst; 5870 bool trim = false; 5871 bool aligned = true; 5872 5873 if (instr->dest.ssa.bit_size == 8) { 5874 aligned = index_cv && (offset + index_cv->u32) % 4 == 0; 5875 bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4); 5876 if (!aligned) 5877 vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2); 5878 } else if (instr->dest.ssa.bit_size == 16) { 5879 aligned = index_cv && (offset + index_cv->u32) % 4 == 0; 5880 if (!aligned) 5881 vec = count == 4 ? bld.tmp(s4) : count > 1 ? 
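/* unaligned loads read whole dwords; byte_align_scalar() below shifts the result into dst */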
bld.tmp(s2) : bld.tmp(s1); 5882 } 5883 5884 aco_opcode op; 5885 5886 switch (vec.size()) { 5887 case 1: op = aco_opcode::s_load_dword; break; 5888 case 2: op = aco_opcode::s_load_dwordx2; break; 5889 case 3: 5890 vec = bld.tmp(s4); 5891 trim = true; 5892 FALLTHROUGH; 5893 case 4: op = aco_opcode::s_load_dwordx4; break; 5894 case 6: 5895 vec = bld.tmp(s8); 5896 trim = true; 5897 FALLTHROUGH; 5898 case 8: op = aco_opcode::s_load_dwordx8; break; 5899 default: unreachable("unimplemented or forbidden load_push_constant."); 5900 } 5901 5902 bld.smem(op, Definition(vec), ptr, index).instr->smem().prevent_overflow = true; 5903 5904 if (!aligned) { 5905 Operand byte_offset = index_cv ? Operand::c32((offset + index_cv->u32) % 4) : Operand(index); 5906 byte_align_scalar(ctx, vec, byte_offset, dst); 5907 return; 5908 } 5909 5910 if (trim) { 5911 emit_split_vector(ctx, vec, 4); 5912 RegClass rc = dst.size() == 3 ? s1 : s2; 5913 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, vec, 0, rc), 5914 emit_extract_vector(ctx, vec, 1, rc), emit_extract_vector(ctx, vec, 2, rc)); 5915 } 5916 emit_split_vector(ctx, dst, instr->dest.ssa.num_components); 5917} 5918 5919void 5920visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr) 5921{ 5922 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 5923 5924 Builder bld(ctx->program, ctx->block); 5925 5926 uint32_t desc_type = 5927 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 5928 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); 5929 if (ctx->options->gfx_level >= GFX10) { 5930 desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | 5931 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | 5932 S_008F0C_RESOURCE_LEVEL(ctx->options->gfx_level < GFX11); 5933 } else { 5934 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 5935 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); 5936 } 5937 5938 unsigned base = nir_intrinsic_base(instr); 5939 unsigned range = nir_intrinsic_range(instr); 5940 5941 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa); 5942 if (base && offset.type() == RegType::sgpr) 5943 offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, 5944 Operand::c32(base)); 5945 else if (base && offset.type() == RegType::vgpr) 5946 offset = bld.vadd32(bld.def(v1), Operand::c32(base), offset); 5947 5948 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), 5949 bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), 5950 Operand::c32(ctx->constant_data_offset)), 5951 Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)), 5952 Operand::c32(desc_type)); 5953 unsigned size = instr->dest.ssa.bit_size / 8; 5954 // TODO: get alignment information for subdword constants 5955 load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0); 5956} 5957 5958/* Packs multiple Temps of different sizes in to a vector of v1 Temps. 5959 * The byte count of each input Temp must be a multiple of 2. 
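* A leftover 16-bit half at the end is padded with an undefined high half to fill the last dword.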
5960 */ 5961static std::vector<Temp> 5962emit_pack_v1(isel_context* ctx, const std::vector<Temp>& unpacked) 5963{ 5964 Builder bld(ctx->program, ctx->block); 5965 std::vector<Temp> packed; 5966 Temp low = Temp(); 5967 for (Temp tmp : unpacked) { 5968 assert(tmp.bytes() % 2 == 0); 5969 unsigned byte_idx = 0; 5970 while (byte_idx < tmp.bytes()) { 5971 if (low != Temp()) { 5972 Temp high = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b); 5973 Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, high); 5974 low = Temp(); 5975 packed.push_back(dword); 5976 byte_idx += 2; 5977 } else if (byte_idx % 4 == 0 && (byte_idx + 4) <= tmp.bytes()) { 5978 packed.emplace_back(emit_extract_vector(ctx, tmp, byte_idx / 4, v1)); 5979 byte_idx += 4; 5980 } else { 5981 low = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b); 5982 byte_idx += 2; 5983 } 5984 } 5985 } 5986 if (low != Temp()) { 5987 Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, Operand(v2b)); 5988 packed.push_back(dword); 5989 } 5990 return packed; 5991} 5992 5993static bool 5994should_declare_array(isel_context* ctx, enum glsl_sampler_dim sampler_dim, bool is_array) 5995{ 5996 if (sampler_dim == GLSL_SAMPLER_DIM_BUF) 5997 return false; 5998 ac_image_dim dim = ac_get_sampler_dim(ctx->options->gfx_level, sampler_dim, is_array); 5999 return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray || 6000 dim == ac_image_2darraymsaa; 6001} 6002 6003static int 6004image_type_to_components_count(enum glsl_sampler_dim dim, bool array) 6005{ 6006 switch (dim) { 6007 case GLSL_SAMPLER_DIM_BUF: return 1; 6008 case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1; 6009 case GLSL_SAMPLER_DIM_2D: return array ? 3 : 2; 6010 case GLSL_SAMPLER_DIM_MS: return array ? 4 : 3; 6011 case GLSL_SAMPLER_DIM_3D: 6012 case GLSL_SAMPLER_DIM_CUBE: return 3; 6013 case GLSL_SAMPLER_DIM_RECT: 6014 case GLSL_SAMPLER_DIM_SUBPASS: return 2; 6015 case GLSL_SAMPLER_DIM_SUBPASS_MS: return 3; 6016 default: break; 6017 } 6018 return 0; 6019} 6020 6021static MIMG_instruction* 6022emit_mimg(Builder& bld, aco_opcode op, Definition dst, Temp rsrc, Operand samp, 6023 std::vector<Temp> coords, unsigned wqm_mask = 0, Operand vdata = Operand(v1)) 6024{ 6025 /* Limit NSA instructions to 3 dwords on GFX10 to avoid stability issues. */ 6026 unsigned max_nsa_size = bld.program->gfx_level >= GFX10_3 ? 13 : 5; 6027 bool use_nsa = bld.program->gfx_level >= GFX10 && coords.size() <= max_nsa_size; 6028 6029 if (!use_nsa) { 6030 Temp coord = coords[0]; 6031 if (coords.size() > 1) { 6032 coord = bld.tmp(RegType::vgpr, coords.size()); 6033 6034 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 6035 aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)}; 6036 for (unsigned i = 0; i < coords.size(); i++) 6037 vec->operands[i] = Operand(coords[i]); 6038 vec->definitions[0] = Definition(coord); 6039 bld.insert(std::move(vec)); 6040 } else if (coord.type() == RegType::sgpr) { 6041 coord = bld.copy(bld.def(v1), coord); 6042 } 6043 6044 if (wqm_mask) { 6045 /* We don't need the bias, sample index, compare value or offset to be 6046 * computed in WQM but if the p_create_vector copies the coordinates, then it 6047 * needs to be in WQM. 
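* In the NSA path below, only the coordinates selected by wqm_mask are moved into WQM individually.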
*/ 6048 coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true); 6049 } 6050 6051 coords[0] = coord; 6052 coords.resize(1); 6053 } else { 6054 for (unsigned i = 0; i < coords.size(); i++) { 6055 if (wqm_mask & (1u << i)) 6056 coords[i] = emit_wqm(bld, coords[i], bld.tmp(coords[i].regClass()), true); 6057 } 6058 6059 for (Temp& coord : coords) { 6060 if (coord.type() == RegType::sgpr) 6061 coord = bld.copy(bld.def(v1), coord); 6062 } 6063 } 6064 6065 aco_ptr<MIMG_instruction> mimg{ 6066 create_instruction<MIMG_instruction>(op, Format::MIMG, 3 + coords.size(), dst.isTemp())}; 6067 if (dst.isTemp()) 6068 mimg->definitions[0] = dst; 6069 mimg->operands[0] = Operand(rsrc); 6070 mimg->operands[1] = samp; 6071 mimg->operands[2] = vdata; 6072 for (unsigned i = 0; i < coords.size(); i++) 6073 mimg->operands[3 + i] = Operand(coords[i]); 6074 6075 MIMG_instruction* res = mimg.get(); 6076 bld.insert(std::move(mimg)); 6077 return res; 6078} 6079 6080void 6081visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr) 6082{ 6083 Builder bld(ctx->program, ctx->block); 6084 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 6085 Temp resource = get_ssa_temp(ctx, instr->src[0].ssa); 6086 Temp node = get_ssa_temp(ctx, instr->src[1].ssa); 6087 Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa); 6088 Temp origin = get_ssa_temp(ctx, instr->src[3].ssa); 6089 Temp dir = get_ssa_temp(ctx, instr->src[4].ssa); 6090 Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa); 6091 6092 std::vector<Temp> args; 6093 args.push_back(emit_extract_vector(ctx, node, 0, v1)); 6094 args.push_back(emit_extract_vector(ctx, node, 1, v1)); 6095 args.push_back(as_vgpr(ctx, tmax)); 6096 args.push_back(emit_extract_vector(ctx, origin, 0, v1)); 6097 args.push_back(emit_extract_vector(ctx, origin, 1, v1)); 6098 args.push_back(emit_extract_vector(ctx, origin, 2, v1)); 6099 args.push_back(emit_extract_vector(ctx, dir, 0, v1)); 6100 args.push_back(emit_extract_vector(ctx, dir, 1, v1)); 6101 args.push_back(emit_extract_vector(ctx, dir, 2, v1)); 6102 args.push_back(emit_extract_vector(ctx, inv_dir, 0, v1)); 6103 args.push_back(emit_extract_vector(ctx, inv_dir, 1, v1)); 6104 args.push_back(emit_extract_vector(ctx, inv_dir, 2, v1)); 6105 6106 MIMG_instruction* mimg = emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, Definition(dst), 6107 resource, Operand(s4), args); 6108 mimg->dim = ac_image_1d; 6109 mimg->dmask = 0xf; 6110 mimg->unrm = true; 6111 mimg->r128 = true; 6112} 6113 6114static std::vector<Temp> 6115get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr) 6116{ 6117 6118 Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa); 6119 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); 6120 bool is_array = nir_intrinsic_image_array(instr); 6121 ASSERTED bool add_frag_pos = 6122 (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS); 6123 assert(!add_frag_pos && "Input attachments should be lowered."); 6124 bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS); 6125 bool gfx9_1d = ctx->options->gfx_level == GFX9 && dim == GLSL_SAMPLER_DIM_1D; 6126 int count = image_type_to_components_count(dim, is_array); 6127 std::vector<Temp> coords(count); 6128 Builder bld(ctx->program, ctx->block); 6129 6130 if (is_ms) 6131 coords[--count] = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1); 6132 6133 if (gfx9_1d) { 6134 coords[0] = emit_extract_vector(ctx, src0, 0, v1); 6135 coords.resize(coords.size() + 1); 6136 coords[1] = bld.copy(bld.def(v1), 
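/* GFX9 addresses 1D images as 2D, so insert y = 0 */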
Operand::zero()); 6137 if (is_array) 6138 coords[2] = emit_extract_vector(ctx, src0, 1, v1); 6139 } else { 6140 for (int i = 0; i < count; i++) 6141 coords[i] = emit_extract_vector(ctx, src0, i, v1); 6142 } 6143 6144 if (ctx->options->key.image_2d_view_of_3d && 6145 dim == GLSL_SAMPLER_DIM_2D && !is_array) { 6146 /* The hw can't bind a slice of a 3D image as a 2D image, because it 6147 * ignores BASE_ARRAY if the target is 3D. The workaround is to read 6148 * BASE_ARRAY and set it as the 3rd address operand for all 2D images. 6149 */ 6150 assert(ctx->options->gfx_level == GFX9); 6151 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); 6152 Temp rsrc_word5 = emit_extract_vector(ctx, rsrc, 5, v1); 6153 /* Extract the BASE_ARRAY field [0:12] from the descriptor. */ 6154 Temp first_layer = 6155 bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), rsrc_word5, 6156 Operand::c32(0u), Operand::c32(13u)); 6157 coords.emplace_back(first_layer); 6158 } 6159 6160 if (instr->intrinsic == nir_intrinsic_bindless_image_load || 6161 instr->intrinsic == nir_intrinsic_bindless_image_sparse_load || 6162 instr->intrinsic == nir_intrinsic_bindless_image_store) { 6163 int lod_index = instr->intrinsic == nir_intrinsic_bindless_image_store ? 4 : 3; 6164 bool level_zero = 6165 nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0; 6166 6167 if (!level_zero) 6168 coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa)); 6169 } 6170 6171 return coords; 6172} 6173 6174memory_sync_info 6175get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics) 6176{ 6177 /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */ 6178 if (semantics & semantic_atomicrmw) 6179 return memory_sync_info(storage, semantics); 6180 6181 unsigned access = nir_intrinsic_access(instr); 6182 6183 if (access & ACCESS_VOLATILE) 6184 semantics |= semantic_volatile; 6185 if (access & ACCESS_CAN_REORDER) 6186 semantics |= semantic_can_reorder | semantic_private; 6187 6188 return memory_sync_info(storage, semantics); 6189} 6190 6191Operand 6192emit_tfe_init(Builder& bld, Temp dst) 6193{ 6194 Temp tmp = bld.tmp(dst.regClass()); 6195 6196 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 6197 aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; 6198 for (unsigned i = 0; i < dst.size(); i++) 6199 vec->operands[i] = Operand::zero(); 6200 vec->definitions[0] = Definition(tmp); 6201 /* Since this is fixed to an instruction's definition register, any CSE will 6202 * just create copies. Copying costs about the same as zero-initialization, 6203 * but these copies can break up clauses. 
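* Disabling CSE on the definition (setNoCSE below) avoids that.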
6204 */ 6205 vec->definitions[0].setNoCSE(true); 6206 bld.insert(std::move(vec)); 6207 6208 return Operand(tmp); 6209} 6210 6211void 6212visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr) 6213{ 6214 Builder bld(ctx->program, ctx->block); 6215 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); 6216 bool is_array = nir_intrinsic_image_array(instr); 6217 bool is_sparse = instr->intrinsic == nir_intrinsic_bindless_image_sparse_load; 6218 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 6219 6220 memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0); 6221 unsigned access = nir_intrinsic_access(instr); 6222 6223 unsigned result_size = instr->dest.ssa.num_components - is_sparse; 6224 unsigned expand_mask = 6225 nir_ssa_def_components_read(&instr->dest.ssa) & u_bit_consecutive(0, result_size); 6226 expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */ 6227 if (dim == GLSL_SAMPLER_DIM_BUF) 6228 expand_mask = (1u << util_last_bit(expand_mask)) - 1u; 6229 unsigned dmask = expand_mask; 6230 if (instr->dest.ssa.bit_size == 64) { 6231 expand_mask &= 0x9; 6232 /* only R64_UINT and R64_SINT supported. x is in xy of the result, w in zw */ 6233 dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 0xc : 0); 6234 } 6235 if (is_sparse) 6236 expand_mask |= 1 << result_size; 6237 6238 bool d16 = instr->dest.ssa.bit_size == 16; 6239 assert(!d16 || !is_sparse); 6240 6241 unsigned num_bytes = util_bitcount(dmask) * (d16 ? 2 : 4) + is_sparse * 4; 6242 6243 Temp tmp; 6244 if (num_bytes == dst.bytes() && dst.type() == RegType::vgpr) 6245 tmp = dst; 6246 else 6247 tmp = bld.tmp(RegClass::get(RegType::vgpr, num_bytes)); 6248 6249 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); 6250 6251 if (dim == GLSL_SAMPLER_DIM_BUF) { 6252 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); 6253 6254 aco_opcode opcode; 6255 if (!d16) { 6256 switch (util_bitcount(dmask)) { 6257 case 1: opcode = aco_opcode::buffer_load_format_x; break; 6258 case 2: opcode = aco_opcode::buffer_load_format_xy; break; 6259 case 3: opcode = aco_opcode::buffer_load_format_xyz; break; 6260 case 4: opcode = aco_opcode::buffer_load_format_xyzw; break; 6261 default: unreachable(">4 channel buffer image load"); 6262 } 6263 } else { 6264 switch (util_bitcount(dmask)) { 6265 case 1: opcode = aco_opcode::buffer_load_format_d16_x; break; 6266 case 2: opcode = aco_opcode::buffer_load_format_d16_xy; break; 6267 case 3: opcode = aco_opcode::buffer_load_format_d16_xyz; break; 6268 case 4: opcode = aco_opcode::buffer_load_format_d16_xyzw; break; 6269 default: unreachable(">4 channel buffer image load"); 6270 } 6271 } 6272 aco_ptr<MUBUF_instruction> load{ 6273 create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3 + is_sparse, 1)}; 6274 load->operands[0] = Operand(resource); 6275 load->operands[1] = Operand(vindex); 6276 load->operands[2] = Operand::c32(0); 6277 load->definitions[0] = Definition(tmp); 6278 load->idxen = true; 6279 load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT); 6280 load->dlc = 6281 load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3); 6282 load->sync = sync; 6283 load->tfe = is_sparse; 6284 if (load->tfe) 6285 load->operands[3] = emit_tfe_init(bld, tmp); 6286 ctx->block->instructions.emplace_back(std::move(load)); 6287 } else { 6288 std::vector<Temp> coords = get_image_coords(ctx, instr); 6289 6290 bool level_zero = nir_src_is_const(instr->src[3]) && 
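/* a constant LOD of 0 allows plain image_load instead of image_load_mip */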
nir_src_as_uint(instr->src[3]) == 0; 6291 aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip; 6292 6293 Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1); 6294 MIMG_instruction* load = 6295 emit_mimg(bld, opcode, Definition(tmp), resource, Operand(s4), coords, 0, vdata); 6296 load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0; 6297 load->dlc = 6298 load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3); 6299 load->dim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array); 6300 load->d16 = d16; 6301 load->dmask = dmask; 6302 load->unrm = true; 6303 load->da = should_declare_array(ctx, dim, is_array); 6304 load->sync = sync; 6305 load->tfe = is_sparse; 6306 } 6307 6308 if (is_sparse && instr->dest.ssa.bit_size == 64) { 6309 /* The result components are 64-bit but the sparse residency code is 6310 * 32-bit. So add a zero to the end so expand_vector() works correctly. 6311 */ 6312 tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp, 6313 Operand::zero()); 6314 } 6315 6316 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, expand_mask, 6317 instr->dest.ssa.bit_size == 64); 6318} 6319 6320void 6321visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr) 6322{ 6323 Builder bld(ctx->program, ctx->block); 6324 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); 6325 bool is_array = nir_intrinsic_image_array(instr); 6326 Temp data = get_ssa_temp(ctx, instr->src[3].ssa); 6327 bool d16 = instr->src[3].ssa->bit_size == 16; 6328 6329 /* only R64_UINT and R64_SINT supported */ 6330 if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8) 6331 data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2)); 6332 data = as_vgpr(ctx, data); 6333 6334 uint32_t num_components = d16 ? 
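/* for 16-bit data, take the component count from NIR rather than the dword count */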
instr->src[3].ssa->num_components : data.size(); 6335 6336 memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0); 6337 unsigned access = nir_intrinsic_access(instr); 6338 bool glc = ctx->options->gfx_level == GFX6 || 6339 ((access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE)) && 6340 ctx->program->gfx_level < GFX11); 6341 6342 if (dim == GLSL_SAMPLER_DIM_BUF) { 6343 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); 6344 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); 6345 aco_opcode opcode; 6346 if (!d16) { 6347 switch (num_components) { 6348 case 1: opcode = aco_opcode::buffer_store_format_x; break; 6349 case 2: opcode = aco_opcode::buffer_store_format_xy; break; 6350 case 3: opcode = aco_opcode::buffer_store_format_xyz; break; 6351 case 4: opcode = aco_opcode::buffer_store_format_xyzw; break; 6352 default: unreachable(">4 channel buffer image store"); 6353 } 6354 } else { 6355 switch (num_components) { 6356 case 1: opcode = aco_opcode::buffer_store_format_d16_x; break; 6357 case 2: opcode = aco_opcode::buffer_store_format_d16_xy; break; 6358 case 3: opcode = aco_opcode::buffer_store_format_d16_xyz; break; 6359 case 4: opcode = aco_opcode::buffer_store_format_d16_xyzw; break; 6360 default: unreachable(">4 channel buffer image store"); 6361 } 6362 } 6363 aco_ptr<MUBUF_instruction> store{ 6364 create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)}; 6365 store->operands[0] = Operand(rsrc); 6366 store->operands[1] = Operand(vindex); 6367 store->operands[2] = Operand::c32(0); 6368 store->operands[3] = Operand(data); 6369 store->idxen = true; 6370 store->glc = glc; 6371 store->dlc = false; 6372 store->disable_wqm = true; 6373 store->sync = sync; 6374 ctx->program->needs_exact = true; 6375 ctx->block->instructions.emplace_back(std::move(store)); 6376 return; 6377 } 6378 6379 assert(data.type() == RegType::vgpr); 6380 std::vector<Temp> coords = get_image_coords(ctx, instr); 6381 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); 6382 6383 bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0; 6384 aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip; 6385 6386 uint32_t dmask = BITFIELD_MASK(num_components); 6387 /* remove zero/undef elements from data, components which aren't in dmask 6388 * are zeroed anyway 6389 */ 6390 if (instr->src[3].ssa->bit_size == 32 || instr->src[3].ssa->bit_size == 16) { 6391 for (uint32_t i = 0; i < instr->num_components; i++) { 6392 nir_ssa_scalar comp = nir_ssa_scalar_resolved(instr->src[3].ssa, i); 6393 if (comp.def->parent_instr->type == nir_instr_type_ssa_undef || 6394 (nir_ssa_scalar_is_const(comp) && nir_ssa_scalar_as_uint(comp) == 0)) 6395 dmask &= ~BITFIELD_BIT(i); 6396 } 6397 6398 /* dmask cannot be 0, at least one vgpr is always read */ 6399 if (dmask == 0) 6400 dmask = 1; 6401 6402 if (dmask != BITFIELD_MASK(num_components)) { 6403 uint32_t dmask_count = util_bitcount(dmask); 6404 RegClass rc = d16 ? 
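/* d16 stores use 2-byte elements, otherwise full dwords */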
v2b : v1; 6405 if (dmask_count == 1) { 6406 data = emit_extract_vector(ctx, data, ffs(dmask) - 1, rc); 6407 } else { 6408 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 6409 aco_opcode::p_create_vector, Format::PSEUDO, dmask_count, 1)}; 6410 uint32_t index = 0; 6411 u_foreach_bit(bit, dmask) { 6412 vec->operands[index++] = Operand(emit_extract_vector(ctx, data, bit, rc)); 6413 } 6414 data = bld.tmp(RegClass::get(RegType::vgpr, dmask_count * rc.bytes())); 6415 vec->definitions[0] = Definition(data); 6416 bld.insert(std::move(vec)); 6417 } 6418 } 6419 } 6420 6421 MIMG_instruction* store = 6422 emit_mimg(bld, opcode, Definition(), resource, Operand(s4), coords, 0, Operand(data)); 6423 store->glc = glc; 6424 store->dlc = false; 6425 store->dim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array); 6426 store->d16 = d16; 6427 store->dmask = dmask; 6428 store->unrm = true; 6429 store->da = should_declare_array(ctx, dim, is_array); 6430 store->disable_wqm = true; 6431 store->sync = sync; 6432 ctx->program->needs_exact = true; 6433 return; 6434} 6435 6436void 6437visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr) 6438{ 6439 bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa); 6440 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); 6441 bool is_array = nir_intrinsic_image_array(instr); 6442 Builder bld(ctx->program, ctx->block); 6443 6444 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa)); 6445 bool cmpswap = instr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap; 6446 bool is_64bit = data.bytes() == 8; 6447 assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented."); 6448 6449 if (cmpswap) 6450 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? 
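/* comp_swap packs the compare value and the data into one 2- or 4-dword vector */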
v4 : v2), 6451 get_ssa_temp(ctx, instr->src[4].ssa), data); 6452 6453 aco_opcode buf_op, buf_op64, image_op; 6454 switch (instr->intrinsic) { 6455 case nir_intrinsic_bindless_image_atomic_add: 6456 buf_op = aco_opcode::buffer_atomic_add; 6457 buf_op64 = aco_opcode::buffer_atomic_add_x2; 6458 image_op = aco_opcode::image_atomic_add; 6459 break; 6460 case nir_intrinsic_bindless_image_atomic_umin: 6461 buf_op = aco_opcode::buffer_atomic_umin; 6462 buf_op64 = aco_opcode::buffer_atomic_umin_x2; 6463 image_op = aco_opcode::image_atomic_umin; 6464 break; 6465 case nir_intrinsic_bindless_image_atomic_imin: 6466 buf_op = aco_opcode::buffer_atomic_smin; 6467 buf_op64 = aco_opcode::buffer_atomic_smin_x2; 6468 image_op = aco_opcode::image_atomic_smin; 6469 break; 6470 case nir_intrinsic_bindless_image_atomic_umax: 6471 buf_op = aco_opcode::buffer_atomic_umax; 6472 buf_op64 = aco_opcode::buffer_atomic_umax_x2; 6473 image_op = aco_opcode::image_atomic_umax; 6474 break; 6475 case nir_intrinsic_bindless_image_atomic_imax: 6476 buf_op = aco_opcode::buffer_atomic_smax; 6477 buf_op64 = aco_opcode::buffer_atomic_smax_x2; 6478 image_op = aco_opcode::image_atomic_smax; 6479 break; 6480 case nir_intrinsic_bindless_image_atomic_and: 6481 buf_op = aco_opcode::buffer_atomic_and; 6482 buf_op64 = aco_opcode::buffer_atomic_and_x2; 6483 image_op = aco_opcode::image_atomic_and; 6484 break; 6485 case nir_intrinsic_bindless_image_atomic_or: 6486 buf_op = aco_opcode::buffer_atomic_or; 6487 buf_op64 = aco_opcode::buffer_atomic_or_x2; 6488 image_op = aco_opcode::image_atomic_or; 6489 break; 6490 case nir_intrinsic_bindless_image_atomic_xor: 6491 buf_op = aco_opcode::buffer_atomic_xor; 6492 buf_op64 = aco_opcode::buffer_atomic_xor_x2; 6493 image_op = aco_opcode::image_atomic_xor; 6494 break; 6495 case nir_intrinsic_bindless_image_atomic_exchange: 6496 buf_op = aco_opcode::buffer_atomic_swap; 6497 buf_op64 = aco_opcode::buffer_atomic_swap_x2; 6498 image_op = aco_opcode::image_atomic_swap; 6499 break; 6500 case nir_intrinsic_bindless_image_atomic_comp_swap: 6501 buf_op = aco_opcode::buffer_atomic_cmpswap; 6502 buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2; 6503 image_op = aco_opcode::image_atomic_cmpswap; 6504 break; 6505 case nir_intrinsic_bindless_image_atomic_fmin: 6506 buf_op = aco_opcode::buffer_atomic_fmin; 6507 buf_op64 = aco_opcode::buffer_atomic_fmin_x2; 6508 image_op = aco_opcode::image_atomic_fmin; 6509 break; 6510 case nir_intrinsic_bindless_image_atomic_fmax: 6511 buf_op = aco_opcode::buffer_atomic_fmax; 6512 buf_op64 = aco_opcode::buffer_atomic_fmax_x2; 6513 image_op = aco_opcode::image_atomic_fmax; 6514 break; 6515 default: 6516 unreachable("visit_image_atomic should only be called with " 6517 "nir_intrinsic_bindless_image_atomic_* instructions."); 6518 } 6519 6520 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 6521 memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw); 6522 6523 if (dim == GLSL_SAMPLER_DIM_BUF) { 6524 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); 6525 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); 6526 // assert(ctx->options->gfx_level < GFX9 && "GFX9 stride size workaround not yet 6527 // implemented."); 6528 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>( 6529 is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4, return_previous ? 
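/* the atomic only needs a definition when the previous value is actually used */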
1 : 0)}; 6530 mubuf->operands[0] = Operand(resource); 6531 mubuf->operands[1] = Operand(vindex); 6532 mubuf->operands[2] = Operand::c32(0); 6533 mubuf->operands[3] = Operand(data); 6534 Definition def = 6535 return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition(); 6536 if (return_previous) 6537 mubuf->definitions[0] = def; 6538 mubuf->offset = 0; 6539 mubuf->idxen = true; 6540 mubuf->glc = return_previous; 6541 mubuf->dlc = false; /* Not needed for atomics */ 6542 mubuf->disable_wqm = true; 6543 mubuf->sync = sync; 6544 ctx->program->needs_exact = true; 6545 ctx->block->instructions.emplace_back(std::move(mubuf)); 6546 if (return_previous && cmpswap) 6547 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero()); 6548 return; 6549 } 6550 6551 std::vector<Temp> coords = get_image_coords(ctx, instr); 6552 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); 6553 Definition def = 6554 return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition(); 6555 MIMG_instruction* mimg = 6556 emit_mimg(bld, image_op, def, resource, Operand(s4), coords, 0, Operand(data)); 6557 mimg->glc = return_previous; 6558 mimg->dlc = false; /* Not needed for atomics */ 6559 mimg->dim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array); 6560 mimg->dmask = (1 << data.size()) - 1; 6561 mimg->unrm = true; 6562 mimg->da = should_declare_array(ctx, dim, is_array); 6563 mimg->disable_wqm = true; 6564 mimg->sync = sync; 6565 ctx->program->needs_exact = true; 6566 if (return_previous && cmpswap) 6567 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero()); 6568 return; 6569} 6570 6571void 6572get_buffer_size(isel_context* ctx, Temp desc, Temp dst) 6573{ 6574 if (ctx->options->gfx_level == GFX8) { 6575 /* we only have to divide by 1, 2, 4, 8, 12 or 16 */ 6576 Builder bld(ctx->program, ctx->block); 6577 6578 Temp size = emit_extract_vector(ctx, desc, 2, s1); 6579 6580 Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), 6581 bld.copy(bld.def(v1), Operand::c32(0xaaaaaaabu)), size); 6582 size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), 6583 bld.as_uniform(size_div3), Operand::c32(1u)); 6584 6585 Temp stride = emit_extract_vector(ctx, desc, 1, s1); 6586 stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, 6587 Operand::c32((5u << 16) | 16u)); 6588 6589 Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand::c32(12u)); 6590 size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12)); 6591 6592 Temp shr_dst = dst.type() == RegType::vgpr ? 
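/* do the shift in an SGPR first; copy into the VGPR dst afterwards if needed */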
bld.tmp(s1) : dst; 6593 bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc), size, 6594 bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride)); 6595 if (dst.type() == RegType::vgpr) 6596 bld.copy(Definition(dst), shr_dst); 6597 6598 /* TODO: we can probably calculate this faster with v_skip when stride != 12 */ 6599 } else { 6600 emit_extract_vector(ctx, desc, 2, dst); 6601 } 6602} 6603 6604void 6605visit_image_size(isel_context* ctx, nir_intrinsic_instr* instr) 6606{ 6607 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); 6608 bool is_array = nir_intrinsic_image_array(instr); 6609 Builder bld(ctx->program, ctx->block); 6610 6611 if (dim == GLSL_SAMPLER_DIM_BUF) { 6612 Temp desc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); 6613 return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa)); 6614 } 6615 6616 /* LOD */ 6617 assert(nir_src_as_uint(instr->src[1]) == 0); 6618 std::vector<Temp> lod{bld.copy(bld.def(v1), Operand::zero())}; 6619 6620 /* Resource */ 6621 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); 6622 6623 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 6624 6625 MIMG_instruction* mimg = 6626 emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(dst), resource, Operand(s4), lod); 6627 uint8_t& dmask = mimg->dmask; 6628 mimg->dim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array); 6629 mimg->dmask = (1 << instr->dest.ssa.num_components) - 1; 6630 mimg->da = is_array; 6631 6632 if (ctx->options->gfx_level == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) { 6633 assert(instr->dest.ssa.num_components == 2); 6634 dmask = 0x5; 6635 } 6636 6637 emit_split_vector(ctx, dst, instr->dest.ssa.num_components); 6638} 6639 6640void 6641get_image_samples(isel_context* ctx, Definition dst, Temp resource) 6642{ 6643 Builder bld(ctx->program, ctx->block); 6644 6645 Temp dword3 = emit_extract_vector(ctx, resource, 3, s1); 6646 Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, 6647 Operand::c32(16u | 4u << 16)); 6648 Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand::c32(1u), 6649 samples_log2); 6650 Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, 6651 Operand::c32(28u | 4u << 16 /* offset=28, width=4 */)); 6652 6653 Operand default_sample = Operand::c32(1u); 6654 if (ctx->options->robust_buffer_access) { 6655 /* Extract the second dword of the descriptor, if it's 6656 * all zero, then it's a null descriptor. 
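* A null descriptor reports 0 samples instead of 1.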
6657 */ 6658 Temp dword1 = emit_extract_vector(ctx, resource, 1, s1); 6659 Temp is_non_null_descriptor = 6660 bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand::zero()); 6661 default_sample = Operand(is_non_null_descriptor); 6662 } 6663 6664 Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand::c32(14u)); 6665 bld.sop2(aco_opcode::s_cselect_b32, dst, samples, default_sample, bld.scc(is_msaa)); 6666} 6667 6668void 6669visit_image_samples(isel_context* ctx, nir_intrinsic_instr* instr) 6670{ 6671 Builder bld(ctx->program, ctx->block); 6672 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 6673 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); 6674 get_image_samples(ctx, Definition(dst), resource); 6675} 6676 6677void 6678visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr) 6679{ 6680 Builder bld(ctx->program, ctx->block); 6681 unsigned num_components = instr->num_components; 6682 6683 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 6684 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); 6685 6686 unsigned access = nir_intrinsic_access(instr); 6687 bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT); 6688 unsigned size = instr->dest.ssa.bit_size / 8; 6689 6690 bool allow_smem = access & ACCESS_CAN_REORDER; 6691 6692 load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), 6693 nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem, 6694 get_memory_sync_info(instr, storage_buffer, 0)); 6695} 6696 6697void 6698visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr) 6699{ 6700 Builder bld(ctx->program, ctx->block); 6701 Temp data = get_ssa_temp(ctx, instr->src[0].ssa); 6702 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; 6703 unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes); 6704 Temp offset = get_ssa_temp(ctx, instr->src[2].ssa); 6705 6706 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)); 6707 6708 memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0); 6709 bool glc = 6710 (nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE)) && 6711 ctx->program->gfx_level < GFX11; 6712 6713 unsigned write_count = 0; 6714 Temp write_datas[32]; 6715 unsigned offsets[32]; 6716 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count, 6717 write_datas, offsets); 6718 6719 /* GFX6-7 are affected by a hw bug that prevents address clamping to work 6720 * correctly when the SGPR offset is used. 6721 */ 6722 if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8) 6723 offset = as_vgpr(ctx, offset); 6724 6725 for (unsigned i = 0; i < write_count; i++) { 6726 aco_opcode op = get_buffer_store_op(write_datas[i].bytes()); 6727 6728 aco_ptr<MUBUF_instruction> store{ 6729 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)}; 6730 store->operands[0] = Operand(rsrc); 6731 store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); 6732 store->operands[2] = offset.type() == RegType::sgpr ? 
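/* an SGPR offset goes in soffset; a VGPR offset uses vaddr with offen set below */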
Operand(offset) : Operand::c32(0); 6733 store->operands[3] = Operand(write_datas[i]); 6734 store->offset = offsets[i]; 6735 store->offen = (offset.type() == RegType::vgpr); 6736 store->glc = glc; 6737 store->dlc = false; 6738 store->disable_wqm = true; 6739 store->sync = sync; 6740 ctx->program->needs_exact = true; 6741 ctx->block->instructions.emplace_back(std::move(store)); 6742 } 6743} 6744 6745void 6746visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr) 6747{ 6748 Builder bld(ctx->program, ctx->block); 6749 bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa); 6750 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)); 6751 bool cmpswap = instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap; 6752 6753 if (cmpswap) 6754 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2), 6755 get_ssa_temp(ctx, instr->src[3].ssa), data); 6756 6757 Temp offset = get_ssa_temp(ctx, instr->src[1].ssa); 6758 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); 6759 6760 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 6761 6762 aco_opcode op32, op64; 6763 switch (instr->intrinsic) { 6764 case nir_intrinsic_ssbo_atomic_add: 6765 op32 = aco_opcode::buffer_atomic_add; 6766 op64 = aco_opcode::buffer_atomic_add_x2; 6767 break; 6768 case nir_intrinsic_ssbo_atomic_imin: 6769 op32 = aco_opcode::buffer_atomic_smin; 6770 op64 = aco_opcode::buffer_atomic_smin_x2; 6771 break; 6772 case nir_intrinsic_ssbo_atomic_umin: 6773 op32 = aco_opcode::buffer_atomic_umin; 6774 op64 = aco_opcode::buffer_atomic_umin_x2; 6775 break; 6776 case nir_intrinsic_ssbo_atomic_imax: 6777 op32 = aco_opcode::buffer_atomic_smax; 6778 op64 = aco_opcode::buffer_atomic_smax_x2; 6779 break; 6780 case nir_intrinsic_ssbo_atomic_umax: 6781 op32 = aco_opcode::buffer_atomic_umax; 6782 op64 = aco_opcode::buffer_atomic_umax_x2; 6783 break; 6784 case nir_intrinsic_ssbo_atomic_and: 6785 op32 = aco_opcode::buffer_atomic_and; 6786 op64 = aco_opcode::buffer_atomic_and_x2; 6787 break; 6788 case nir_intrinsic_ssbo_atomic_or: 6789 op32 = aco_opcode::buffer_atomic_or; 6790 op64 = aco_opcode::buffer_atomic_or_x2; 6791 break; 6792 case nir_intrinsic_ssbo_atomic_xor: 6793 op32 = aco_opcode::buffer_atomic_xor; 6794 op64 = aco_opcode::buffer_atomic_xor_x2; 6795 break; 6796 case nir_intrinsic_ssbo_atomic_exchange: 6797 op32 = aco_opcode::buffer_atomic_swap; 6798 op64 = aco_opcode::buffer_atomic_swap_x2; 6799 break; 6800 case nir_intrinsic_ssbo_atomic_comp_swap: 6801 op32 = aco_opcode::buffer_atomic_cmpswap; 6802 op64 = aco_opcode::buffer_atomic_cmpswap_x2; 6803 break; 6804 case nir_intrinsic_ssbo_atomic_fmin: 6805 op32 = aco_opcode::buffer_atomic_fmin; 6806 op64 = aco_opcode::buffer_atomic_fmin_x2; 6807 break; 6808 case nir_intrinsic_ssbo_atomic_fmax: 6809 op32 = aco_opcode::buffer_atomic_fmax; 6810 op64 = aco_opcode::buffer_atomic_fmax_x2; 6811 break; 6812 default: 6813 unreachable( 6814 "visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions."); 6815 } 6816 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; 6817 aco_ptr<MUBUF_instruction> mubuf{ 6818 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; 6819 mubuf->operands[0] = Operand(rsrc); 6820 mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); 6821 mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0); 6822 mubuf->operands[3] = Operand(data); 6823 Definition def = 6824 return_previous ? 
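/* cmpswap returns a wider value; dst is extracted from it after the instruction */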
(cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition(); 6825 if (return_previous) 6826 mubuf->definitions[0] = def; 6827 mubuf->offset = 0; 6828 mubuf->offen = (offset.type() == RegType::vgpr); 6829 mubuf->glc = return_previous; 6830 mubuf->dlc = false; /* Not needed for atomics */ 6831 mubuf->disable_wqm = true; 6832 mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw); 6833 ctx->program->needs_exact = true; 6834 ctx->block->instructions.emplace_back(std::move(mubuf)); 6835 if (return_previous && cmpswap) 6836 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero()); 6837} 6838 6839void 6840parse_global(isel_context* ctx, nir_intrinsic_instr* intrin, Temp* address, uint32_t* const_offset, 6841 Temp* offset) 6842{ 6843 bool is_store = intrin->intrinsic == nir_intrinsic_store_global_amd; 6844 *address = get_ssa_temp(ctx, intrin->src[is_store ? 1 : 0].ssa); 6845 6846 *const_offset = nir_intrinsic_base(intrin); 6847 6848 unsigned num_src = nir_intrinsic_infos[intrin->intrinsic].num_srcs; 6849 nir_src offset_src = intrin->src[num_src - 1]; 6850 if (!nir_src_is_const(offset_src) || nir_src_as_uint(offset_src)) 6851 *offset = get_ssa_temp(ctx, offset_src.ssa); 6852 else 6853 *offset = Temp(); 6854} 6855 6856void 6857visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr) 6858{ 6859 Builder bld(ctx->program, ctx->block); 6860 unsigned num_components = instr->num_components; 6861 unsigned component_size = instr->dest.ssa.bit_size / 8; 6862 6863 Temp addr, offset; 6864 uint32_t const_offset; 6865 parse_global(ctx, instr, &addr, &const_offset, &offset); 6866 6867 LoadEmitInfo info = {Operand(addr), get_ssa_temp(ctx, &instr->dest.ssa), num_components, 6868 component_size}; 6869 if (offset.id()) { 6870 info.resource = addr; 6871 info.offset = Operand(offset); 6872 } 6873 info.const_offset = const_offset; 6874 info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT); 6875 info.align_mul = nir_intrinsic_align_mul(instr); 6876 info.align_offset = nir_intrinsic_align_offset(instr); 6877 info.sync = get_memory_sync_info(instr, storage_buffer, 0); 6878 6879 /* Don't expand global loads when they use MUBUF or SMEM. 6880 * Global loads don't have the bounds checking that buffer loads have that 6881 * makes this safe. 
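* can_use_byte_align_for_global_load() below decides when byte-aligned expansion is still acceptable.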
6882 */ 6883 unsigned align = nir_intrinsic_align(instr); 6884 bool byte_align_for_smem_mubuf = 6885 can_use_byte_align_for_global_load(num_components, component_size, align, false); 6886 6887 /* VMEM stores don't update the SMEM cache and it's difficult to prove that 6888 * it's safe to use SMEM */ 6889 bool can_use_smem = 6890 (nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE) && byte_align_for_smem_mubuf; 6891 if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->gfx_level < GFX8) || 6892 !can_use_smem) { 6893 EmitLoadParameters params = global_load_params; 6894 params.byte_align_loads = ctx->options->gfx_level > GFX6 || byte_align_for_smem_mubuf; 6895 emit_load(ctx, bld, info, params); 6896 } else { 6897 if (info.resource.id()) 6898 info.resource = bld.as_uniform(info.resource); 6899 info.offset = Operand(bld.as_uniform(info.offset)); 6900 emit_load(ctx, bld, info, smem_load_params); 6901 } 6902} 6903 6904void 6905visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr) 6906{ 6907 Builder bld(ctx->program, ctx->block); 6908 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; 6909 unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes); 6910 6911 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); 6912 memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0); 6913 bool glc = 6914 (nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE)) && 6915 ctx->program->gfx_level < GFX11; 6916 6917 unsigned write_count = 0; 6918 Temp write_datas[32]; 6919 unsigned offsets[32]; 6920 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count, 6921 write_datas, offsets); 6922 6923 Temp addr, offset; 6924 uint32_t const_offset; 6925 parse_global(ctx, instr, &addr, &const_offset, &offset); 6926 6927 for (unsigned i = 0; i < write_count; i++) { 6928 Temp write_address = addr; 6929 uint32_t write_const_offset = const_offset; 6930 Temp write_offset = offset; 6931 lower_global_address(bld, offsets[i], &write_address, &write_const_offset, &write_offset); 6932 6933 if (ctx->options->gfx_level >= GFX7) { 6934 bool global = ctx->options->gfx_level >= GFX9; 6935 aco_opcode op; 6936 switch (write_datas[i].bytes()) { 6937 case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break; 6938 case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break; 6939 case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break; 6940 case 8: 6941 op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2; 6942 break; 6943 case 12: 6944 op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3; 6945 break; 6946 case 16: 6947 op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4; 6948 break; 6949 default: unreachable("store_global not implemented for this size."); 6950 } 6951 6952 aco_ptr<FLAT_instruction> flat{ 6953 create_instruction<FLAT_instruction>(op, global ? 
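/* GFX9+ uses the GLOBAL encoding; GFX7-8 fall back to FLAT */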
Format::GLOBAL : Format::FLAT, 3, 0)}; 6954 if (write_address.regClass() == s2) { 6955 assert(global && write_offset.id() && write_offset.type() == RegType::vgpr); 6956 flat->operands[0] = Operand(write_offset); 6957 flat->operands[1] = Operand(write_address); 6958 } else { 6959 assert(write_address.type() == RegType::vgpr && !write_offset.id()); 6960 flat->operands[0] = Operand(write_address); 6961 flat->operands[1] = Operand(s1); 6962 } 6963 flat->operands[2] = Operand(write_datas[i]); 6964 flat->glc = glc; 6965 flat->dlc = false; 6966 assert(global || !write_const_offset); 6967 flat->offset = write_const_offset; 6968 flat->disable_wqm = true; 6969 flat->sync = sync; 6970 ctx->program->needs_exact = true; 6971 ctx->block->instructions.emplace_back(std::move(flat)); 6972 } else { 6973 assert(ctx->options->gfx_level == GFX6); 6974 6975 aco_opcode op = get_buffer_store_op(write_datas[i].bytes()); 6976 6977 Temp rsrc = get_gfx6_global_rsrc(bld, write_address); 6978 6979 aco_ptr<MUBUF_instruction> mubuf{ 6980 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)}; 6981 mubuf->operands[0] = Operand(rsrc); 6982 mubuf->operands[1] = 6983 write_address.type() == RegType::vgpr ? Operand(write_address) : Operand(v1); 6984 mubuf->operands[2] = Operand(write_offset); 6985 mubuf->operands[3] = Operand(write_datas[i]); 6986 mubuf->glc = glc; 6987 mubuf->dlc = false; 6988 mubuf->offset = write_const_offset; 6989 mubuf->addr64 = write_address.type() == RegType::vgpr; 6990 mubuf->disable_wqm = true; 6991 mubuf->sync = sync; 6992 ctx->program->needs_exact = true; 6993 ctx->block->instructions.emplace_back(std::move(mubuf)); 6994 } 6995 } 6996} 6997 6998void 6999visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr) 7000{ 7001 Builder bld(ctx->program, ctx->block); 7002 bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa); 7003 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); 7004 bool cmpswap = instr->intrinsic == nir_intrinsic_global_atomic_comp_swap_amd; 7005 7006 if (cmpswap) 7007 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2), 7008 get_ssa_temp(ctx, instr->src[2].ssa), data); 7009 7010 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 7011 7012 aco_opcode op32, op64; 7013 7014 Temp addr, offset; 7015 uint32_t const_offset; 7016 parse_global(ctx, instr, &addr, &const_offset, &offset); 7017 lower_global_address(bld, 0, &addr, &const_offset, &offset); 7018 7019 if (ctx->options->gfx_level >= GFX7) { 7020 bool global = ctx->options->gfx_level >= GFX9; 7021 switch (instr->intrinsic) { 7022 case nir_intrinsic_global_atomic_add_amd: 7023 op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add; 7024 op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2; 7025 break; 7026 case nir_intrinsic_global_atomic_imin_amd: 7027 op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin; 7028 op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2; 7029 break; 7030 case nir_intrinsic_global_atomic_umin_amd: 7031 op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin; 7032 op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2; 7033 break; 7034 case nir_intrinsic_global_atomic_imax_amd: 7035 op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax; 7036 op64 = global ? 
aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2; 7037 break; 7038 case nir_intrinsic_global_atomic_umax_amd: 7039 op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax; 7040 op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2; 7041 break; 7042 case nir_intrinsic_global_atomic_and_amd: 7043 op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and; 7044 op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2; 7045 break; 7046 case nir_intrinsic_global_atomic_or_amd: 7047 op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or; 7048 op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2; 7049 break; 7050 case nir_intrinsic_global_atomic_xor_amd: 7051 op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor; 7052 op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2; 7053 break; 7054 case nir_intrinsic_global_atomic_exchange_amd: 7055 op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap; 7056 op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2; 7057 break; 7058 case nir_intrinsic_global_atomic_comp_swap_amd: 7059 op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap; 7060 op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2; 7061 break; 7062 case nir_intrinsic_global_atomic_fmin_amd: 7063 op32 = global ? aco_opcode::global_atomic_fmin : aco_opcode::flat_atomic_fmin; 7064 op64 = global ? aco_opcode::global_atomic_fmin_x2 : aco_opcode::flat_atomic_fmin_x2; 7065 break; 7066 case nir_intrinsic_global_atomic_fmax_amd: 7067 op32 = global ? aco_opcode::global_atomic_fmax : aco_opcode::flat_atomic_fmax; 7068 op64 = global ? aco_opcode::global_atomic_fmax_x2 : aco_opcode::flat_atomic_fmax_x2; 7069 break; 7070 default: 7071 unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* " 7072 "instructions."); 7073 } 7074 7075 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; 7076 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>( 7077 op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 
1 : 0)}; 7078 if (addr.regClass() == s2) { 7079 assert(global && offset.id() && offset.type() == RegType::vgpr); 7080 flat->operands[0] = Operand(offset); 7081 flat->operands[1] = Operand(addr); 7082 } else { 7083 assert(addr.type() == RegType::vgpr && !offset.id()); 7084 flat->operands[0] = Operand(addr); 7085 flat->operands[1] = Operand(s1); 7086 } 7087 flat->operands[2] = Operand(data); 7088 if (return_previous) 7089 flat->definitions[0] = Definition(dst); 7090 flat->glc = return_previous; 7091 flat->dlc = false; /* Not needed for atomics */ 7092 assert(global || !const_offset); 7093 flat->offset = const_offset; 7094 flat->disable_wqm = true; 7095 flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw); 7096 ctx->program->needs_exact = true; 7097 ctx->block->instructions.emplace_back(std::move(flat)); 7098 } else { 7099 assert(ctx->options->gfx_level == GFX6); 7100 7101 switch (instr->intrinsic) { 7102 case nir_intrinsic_global_atomic_add_amd: 7103 op32 = aco_opcode::buffer_atomic_add; 7104 op64 = aco_opcode::buffer_atomic_add_x2; 7105 break; 7106 case nir_intrinsic_global_atomic_imin_amd: 7107 op32 = aco_opcode::buffer_atomic_smin; 7108 op64 = aco_opcode::buffer_atomic_smin_x2; 7109 break; 7110 case nir_intrinsic_global_atomic_umin_amd: 7111 op32 = aco_opcode::buffer_atomic_umin; 7112 op64 = aco_opcode::buffer_atomic_umin_x2; 7113 break; 7114 case nir_intrinsic_global_atomic_imax_amd: 7115 op32 = aco_opcode::buffer_atomic_smax; 7116 op64 = aco_opcode::buffer_atomic_smax_x2; 7117 break; 7118 case nir_intrinsic_global_atomic_umax_amd: 7119 op32 = aco_opcode::buffer_atomic_umax; 7120 op64 = aco_opcode::buffer_atomic_umax_x2; 7121 break; 7122 case nir_intrinsic_global_atomic_and_amd: 7123 op32 = aco_opcode::buffer_atomic_and; 7124 op64 = aco_opcode::buffer_atomic_and_x2; 7125 break; 7126 case nir_intrinsic_global_atomic_or_amd: 7127 op32 = aco_opcode::buffer_atomic_or; 7128 op64 = aco_opcode::buffer_atomic_or_x2; 7129 break; 7130 case nir_intrinsic_global_atomic_xor_amd: 7131 op32 = aco_opcode::buffer_atomic_xor; 7132 op64 = aco_opcode::buffer_atomic_xor_x2; 7133 break; 7134 case nir_intrinsic_global_atomic_exchange_amd: 7135 op32 = aco_opcode::buffer_atomic_swap; 7136 op64 = aco_opcode::buffer_atomic_swap_x2; 7137 break; 7138 case nir_intrinsic_global_atomic_comp_swap_amd: 7139 op32 = aco_opcode::buffer_atomic_cmpswap; 7140 op64 = aco_opcode::buffer_atomic_cmpswap_x2; 7141 break; 7142 case nir_intrinsic_global_atomic_fmin_amd: 7143 op32 = aco_opcode::buffer_atomic_fmin; 7144 op64 = aco_opcode::buffer_atomic_fmin_x2; 7145 break; 7146 case nir_intrinsic_global_atomic_fmax_amd: 7147 op32 = aco_opcode::buffer_atomic_fmax; 7148 op64 = aco_opcode::buffer_atomic_fmax_x2; 7149 break; 7150 default: 7151 unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* " 7152 "instructions."); 7153 } 7154 7155 Temp rsrc = get_gfx6_global_rsrc(bld, addr); 7156 7157 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; 7158 7159 aco_ptr<MUBUF_instruction> mubuf{ 7160 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; 7161 mubuf->operands[0] = Operand(rsrc); 7162 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); 7163 mubuf->operands[2] = Operand(offset); 7164 mubuf->operands[3] = Operand(data); 7165 Definition def = 7166 return_previous ? (cmpswap ? 
                           bld.def(data.regClass()) : Definition(dst)) : Definition();
      if (return_previous)
         mubuf->definitions[0] = def;
      mubuf->glc = return_previous;
      mubuf->dlc = false;
      mubuf->offset = const_offset;
      mubuf->addr64 = addr.type() == RegType::vgpr;
      mubuf->disable_wqm = true;
      mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
      ctx->program->needs_exact = true;
      ctx->block->instructions.emplace_back(std::move(mubuf));
      if (return_previous && cmpswap)
         bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
   }
}

unsigned
aco_storage_mode_from_nir_mem_mode(unsigned mem_mode)
{
   unsigned storage = storage_none;

   if (mem_mode & nir_var_shader_out)
      storage |= storage_vmem_output;
   if ((mem_mode & nir_var_mem_ssbo) || (mem_mode & nir_var_mem_global))
      storage |= storage_buffer;
   if (mem_mode & nir_var_mem_task_payload)
      storage |= storage_task_payload;
   if (mem_mode & nir_var_mem_shared)
      storage |= storage_shared;
   if (mem_mode & nir_var_image)
      storage |= storage_image;

   return storage;
}

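/* Lower the AMD-specific load_buffer_amd intrinsic: a MUBUF load from a buffer
 * descriptor with a VGPR offset, an SGPR offset and a constant base offset.
 * The "swizzled" flag selects swizzled (structured) addressing, whose element
 * size differs between GFX8 and newer chips (see swizzle_element_size below).
 */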
void
visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
{
   Builder bld(ctx->program, ctx->block);

   Temp dst = get_ssa_temp(ctx, &intrin->dest.ssa);
   Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
   Temp v_offset = as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
   Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));

   bool swizzled = nir_intrinsic_is_swizzled(intrin);
   bool reorder = nir_intrinsic_can_reorder(intrin);
   bool slc = nir_intrinsic_slc_amd(intrin);

   unsigned const_offset = nir_intrinsic_base(intrin);
   unsigned elem_size_bytes = intrin->dest.ssa.bit_size / 8u;
   unsigned num_components = intrin->dest.ssa.num_components;
   unsigned swizzle_element_size = swizzled ? (ctx->program->gfx_level <= GFX8 ? 4 : 16) : 0;

   nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
   memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode));

   load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
                   num_components, swizzle_element_size, !swizzled, reorder, slc, sync);
}

void
visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
{
   Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
   Temp descriptor = get_ssa_temp(ctx, intrin->src[1].ssa);
   Temp v_offset = get_ssa_temp(ctx, intrin->src[2].ssa);
   Temp s_offset = get_ssa_temp(ctx, intrin->src[3].ssa);

   bool swizzled = nir_intrinsic_is_swizzled(intrin);
   bool slc = nir_intrinsic_slc_amd(intrin);

   unsigned const_offset = nir_intrinsic_base(intrin);
   unsigned write_mask = nir_intrinsic_write_mask(intrin);
   unsigned elem_size_bytes = intrin->src[0].ssa->bit_size / 8u;

   nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
   memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode));

   store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
                    write_mask, !swizzled, sync, slc);
}

void
visit_load_smem(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
   Temp base = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
   Temp offset = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));

   aco_opcode opcode = aco_opcode::s_load_dword;
   unsigned size = 1;

   assert(dst.bytes() <= 64);

   if (dst.bytes() > 32) {
      opcode = aco_opcode::s_load_dwordx16;
      size = 16;
   } else if (dst.bytes() > 16) {
      opcode = aco_opcode::s_load_dwordx8;
      size = 8;
   } else if (dst.bytes() > 8) {
      opcode = aco_opcode::s_load_dwordx4;
      size = 4;
   } else if (dst.bytes() > 4) {
      opcode = aco_opcode::s_load_dwordx2;
      size = 2;
   }

   if (dst.size() != size) {
      bld.pseudo(aco_opcode::p_extract_vector, Definition(dst),
                 bld.smem(opcode, bld.def(RegType::sgpr, size), base, offset), Operand::c32(0u));
   } else {
      bld.smem(opcode, Definition(dst), base, offset);
   }
   emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
}

sync_scope
translate_nir_scope(nir_scope scope)
{
   switch (scope) {
   case NIR_SCOPE_NONE:
   case NIR_SCOPE_INVOCATION: return scope_invocation;
   case NIR_SCOPE_SUBGROUP: return scope_subgroup;
   case NIR_SCOPE_WORKGROUP: return scope_workgroup;
   case NIR_SCOPE_QUEUE_FAMILY: return scope_queuefamily;
   case NIR_SCOPE_DEVICE: return scope_device;
   case NIR_SCOPE_SHADER_CALL: return scope_invocation;
   }
   unreachable("invalid scope");
}

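/* Lower nir_intrinsic_scoped_barrier to p_barrier: the NIR memory modes,
 * semantics and scopes are translated into a memory_sync_info, restricted to
 * the storage classes that can actually be observed in the current HW stage.
 */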
void
emit_scoped_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);

   unsigned storage_allowed = storage_buffer | storage_image;
   unsigned semantics = 0;
   sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
   sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));

   /* We use shared storage for the following:
    * - compute shaders expose it in their API
    * - when tessellation is used, TCS and VS I/O is lowered to shared memory
    * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory
    * - additionally, when NGG is used on GFX10+, shared memory is used for certain features
    */
   bool shared_storage_used = ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::LS ||
                              ctx->stage.hw == HWStage::HS ||
                              (ctx->stage.hw == HWStage::GS && ctx->program->gfx_level >= GFX9) ||
                              ctx->stage.hw == HWStage::NGG;

   if (shared_storage_used)
      storage_allowed |= storage_shared;

   /* Task payload: Task Shader output, Mesh Shader input */
   if (ctx->stage.has(SWStage::MS) || ctx->stage.has(SWStage::TS))
      storage_allowed |= storage_task_payload;

   /* Allow VMEM output for all stages that can have outputs. */
   if (ctx->stage.hw != HWStage::CS && ctx->stage.hw != HWStage::FS)
      storage_allowed |= storage_vmem_output;

   /* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half.
    * They are allowed in CS, TCS, and in any NGG shader.
    */
   ASSERTED bool workgroup_scope_allowed =
      ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::HS || ctx->stage.hw == HWStage::NGG;

   unsigned nir_storage = nir_intrinsic_memory_modes(instr);
   unsigned storage = aco_storage_mode_from_nir_mem_mode(nir_storage);
   storage &= storage_allowed;

   unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
   if (nir_semantics & NIR_MEMORY_ACQUIRE)
      semantics |= semantic_acquire | semantic_release;
   if (nir_semantics & NIR_MEMORY_RELEASE)
      semantics |= semantic_acquire | semantic_release;

   assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
   assert(exec_scope != scope_workgroup || workgroup_scope_allowed);

   bld.barrier(aco_opcode::p_barrier,
               memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
               exec_scope);
}

void
visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
{
   // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
   Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
   Builder bld(ctx->program, ctx->block);

   unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
   unsigned num_components = instr->dest.ssa.num_components;
   unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
   load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);
}

void
visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr)
{
   unsigned writemask = nir_intrinsic_write_mask(instr);
   Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
   Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;

   unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
   store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
}

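/* Lower shared (LDS) atomics to DS instructions. Each NIR intrinsic maps to a
 * 32-bit and a 64-bit opcode, with separate "rtn" variants that are only used
 * when the result of the atomic is actually read.
 */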
void
visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
{
   unsigned offset = nir_intrinsic_base(instr);
   Builder bld(ctx->program, ctx->block);
   Operand m = load_lds_size_m0(bld);
   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
   Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));

   unsigned num_operands = 3;
   aco_opcode op32, op64, op32_rtn, op64_rtn;
   switch (instr->intrinsic) {
   case nir_intrinsic_shared_atomic_add:
      op32 = aco_opcode::ds_add_u32;
      op64 = aco_opcode::ds_add_u64;
      op32_rtn = aco_opcode::ds_add_rtn_u32;
      op64_rtn = aco_opcode::ds_add_rtn_u64;
      break;
   case nir_intrinsic_shared_atomic_imin:
      op32 = aco_opcode::ds_min_i32;
      op64 = aco_opcode::ds_min_i64;
      op32_rtn = aco_opcode::ds_min_rtn_i32;
      op64_rtn = aco_opcode::ds_min_rtn_i64;
      break;
   case nir_intrinsic_shared_atomic_umin:
      op32 = aco_opcode::ds_min_u32;
      op64 = aco_opcode::ds_min_u64;
      op32_rtn = aco_opcode::ds_min_rtn_u32;
      op64_rtn = aco_opcode::ds_min_rtn_u64;
      break;
   case nir_intrinsic_shared_atomic_imax:
      op32 = aco_opcode::ds_max_i32;
      op64 = aco_opcode::ds_max_i64;
      op32_rtn = aco_opcode::ds_max_rtn_i32;
      op64_rtn = aco_opcode::ds_max_rtn_i64;
      break;
   case nir_intrinsic_shared_atomic_umax:
      op32 = aco_opcode::ds_max_u32;
      op64 = aco_opcode::ds_max_u64;
      op32_rtn = aco_opcode::ds_max_rtn_u32;
      op64_rtn = aco_opcode::ds_max_rtn_u64;
      break;
   case nir_intrinsic_shared_atomic_and:
      op32 = aco_opcode::ds_and_b32;
      op64 = aco_opcode::ds_and_b64;
      op32_rtn = aco_opcode::ds_and_rtn_b32;
      op64_rtn = aco_opcode::ds_and_rtn_b64;
      break;
   case nir_intrinsic_shared_atomic_or:
      op32 = aco_opcode::ds_or_b32;
      op64 = aco_opcode::ds_or_b64;
      op32_rtn = aco_opcode::ds_or_rtn_b32;
      op64_rtn = aco_opcode::ds_or_rtn_b64;
      break;
   case nir_intrinsic_shared_atomic_xor:
      op32 = aco_opcode::ds_xor_b32;
      op64 = aco_opcode::ds_xor_b64;
      op32_rtn = aco_opcode::ds_xor_rtn_b32;
      op64_rtn = aco_opcode::ds_xor_rtn_b64;
      break;
   case nir_intrinsic_shared_atomic_exchange:
      op32 = aco_opcode::ds_write_b32;
      op64 = aco_opcode::ds_write_b64;
      op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
      op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
      break;
   case nir_intrinsic_shared_atomic_comp_swap:
      op32 = aco_opcode::ds_cmpst_b32;
      op64 = aco_opcode::ds_cmpst_b64;
      op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
      op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
      num_operands = 4;
      break;
   case nir_intrinsic_shared_atomic_fadd:
      op32 = aco_opcode::ds_add_f32;
      op32_rtn = aco_opcode::ds_add_rtn_f32;
      op64 = aco_opcode::num_opcodes;
      op64_rtn = aco_opcode::num_opcodes;
      break;
   case nir_intrinsic_shared_atomic_fmin:
      op32 = aco_opcode::ds_min_f32;
      op32_rtn = aco_opcode::ds_min_rtn_f32;
      op64 = aco_opcode::ds_min_f64;
      op64_rtn = aco_opcode::ds_min_rtn_f64;
      break;
   case nir_intrinsic_shared_atomic_fmax:
      op32 = aco_opcode::ds_max_f32;
      op32_rtn = aco_opcode::ds_max_rtn_f32;
      op64 = aco_opcode::ds_max_f64;
      op64_rtn = aco_opcode::ds_max_rtn_f64;
      break;
   default: unreachable("Unhandled shared atomic intrinsic");
   }

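   /* Pick the 32- or 64-bit opcode based on the data size, and the "rtn"
    * variant only if the result is used. The DS offset field is 16 bits, so
    * larger constant offsets are folded into the address VGPR instead.
    */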
   bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);

   aco_opcode op;
   if (data.size() == 1) {
      assert(instr->dest.ssa.bit_size == 32);
      op = return_previous ? op32_rtn : op32;
   } else {
      assert(instr->dest.ssa.bit_size == 64);
      op = return_previous ? op64_rtn : op64;
   }

   if (offset > 65535) {
      address = bld.vadd32(bld.def(v1), Operand::c32(offset), address);
      offset = 0;
   }

   aco_ptr<DS_instruction> ds;
   ds.reset(
      create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
   ds->operands[0] = Operand(address);
   ds->operands[1] = Operand(data);
   if (num_operands == 4) {
      Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
      ds->operands[2] = Operand(data2);
   }
   ds->operands[num_operands - 1] = m;
   ds->offset0 = offset;
   if (return_previous)
      ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
   ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);

   if (m.isUndefined())
      ds->operands.pop_back();

   ctx->block->instructions.emplace_back(std::move(ds));
}

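/* load_shared2_amd / store_shared2_amd access two LDS locations that share one
 * address VGPR and are addressed by two independent 8-bit offsets, using the
 * ds_read2 / ds_write2 family (the st64 variants scale both offsets by a
 * stride of 64 elements).
 */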
void
visit_access_shared2_amd(isel_context* ctx, nir_intrinsic_instr* instr)
{
   bool is_store = instr->intrinsic == nir_intrinsic_store_shared2_amd;
   Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[is_store].ssa));
   Builder bld(ctx->program, ctx->block);

   assert(bld.program->gfx_level >= GFX7);

   bool is64bit = (is_store ? instr->src[0].ssa->bit_size : instr->dest.ssa.bit_size) == 64;
   uint8_t offset0 = nir_intrinsic_offset0(instr);
   uint8_t offset1 = nir_intrinsic_offset1(instr);
   bool st64 = nir_intrinsic_st64(instr);

   Operand m = load_lds_size_m0(bld);
   Instruction* ds;
   if (is_store) {
      aco_opcode op = st64
                         ? (is64bit ? aco_opcode::ds_write2st64_b64 : aco_opcode::ds_write2st64_b32)
                         : (is64bit ? aco_opcode::ds_write2_b64 : aco_opcode::ds_write2_b32);
      Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
      RegClass comp_rc = is64bit ? v2 : v1;
      Temp data0 = emit_extract_vector(ctx, data, 0, comp_rc);
      Temp data1 = emit_extract_vector(ctx, data, 1, comp_rc);
      ds = bld.ds(op, address, data0, data1, m, offset0, offset1);
   } else {
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      Definition tmp_dst(dst.type() == RegType::vgpr ? dst : bld.tmp(is64bit ? v4 : v2));
      aco_opcode op = st64 ? (is64bit ? aco_opcode::ds_read2st64_b64 : aco_opcode::ds_read2st64_b32)
                           : (is64bit ? aco_opcode::ds_read2_b64 : aco_opcode::ds_read2_b32);
      ds = bld.ds(op, tmp_dst, address, m, offset0, offset1);
   }
   ds->ds().sync = memory_sync_info(storage_shared);
   if (m.isUndefined())
      ds->operands.pop_back();

   if (!is_store) {
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      if (dst.type() == RegType::sgpr) {
         emit_split_vector(ctx, ds->definitions[0].getTemp(), dst.size());
         Temp comp[4];
         /* Use scalar v_readfirstlane_b32 for better 32-bit copy propagation */
         for (unsigned i = 0; i < dst.size(); i++)
            comp[i] = bld.as_uniform(emit_extract_vector(ctx, ds->definitions[0].getTemp(), i, v1));
         if (is64bit) {
            Temp comp0 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[0], comp[1]);
            Temp comp1 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[2], comp[3]);
            ctx->allocated_vec[comp0.id()] = {comp[0], comp[1]};
            ctx->allocated_vec[comp1.id()] = {comp[2], comp[3]};
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp0, comp1);
            ctx->allocated_vec[dst.id()] = {comp0, comp1};
         } else {
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp[0], comp[1]);
         }
      }

      emit_split_vector(ctx, dst, 2);
   }
}

Temp
get_scratch_resource(isel_context* ctx)
{
   Builder bld(ctx->program, ctx->block);
   Temp scratch_addr = ctx->program->private_segment_buffer;
   if (ctx->stage.hw != HWStage::CS)
      scratch_addr =
         bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero());

   uint32_t rsrc_conf =
      S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);

   if (ctx->program->gfx_level >= GFX10) {
      rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
                   S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
                   S_008F0C_RESOURCE_LEVEL(ctx->program->gfx_level < GFX11);
   } else if (ctx->program->gfx_level <=
              GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
      rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                   S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
   }

   /* Older generations need an element size of 4 bytes; the element size field was removed in GFX9. */
   if (ctx->program->gfx_level <= GFX8)
      rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);

   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(-1u),
                     Operand::c32(rsrc_conf));
}

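/* Scratch (private) memory: GFX9+ uses the dedicated scratch_* instructions,
 * older chips go through a MUBUF access with the resource built by
 * get_scratch_resource() and the scratch offset SGPR.
 */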
void
visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);

   LoadEmitInfo info = {Operand(v1), dst, instr->dest.ssa.num_components,
                        instr->dest.ssa.bit_size / 8u};
   info.align_mul = nir_intrinsic_align_mul(instr);
   info.align_offset = nir_intrinsic_align_offset(instr);
   info.swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 0;
   info.sync = memory_sync_info(storage_scratch, semantic_private);
   if (ctx->program->gfx_level >= GFX9) {
      if (nir_src_is_const(instr->src[0])) {
         uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
         info.offset =
            bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max)));
         info.const_offset = nir_src_as_uint(instr->src[0]) % max;
      } else {
         info.offset = Operand(get_ssa_temp(ctx, instr->src[0].ssa));
      }
      EmitLoadParameters params = scratch_flat_load_params;
      params.max_const_offset_plus_one = ctx->program->dev.scratch_global_offset_max + 1;
      emit_load(ctx, bld, info, params);
   } else {
      info.resource = get_scratch_resource(ctx);
      info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
      info.soffset = ctx->program->scratch_offset;
      emit_load(ctx, bld, info, scratch_mubuf_load_params);
   }
}

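/* Stores to scratch are split into naturally sized chunks by
 * split_buffer_store() and then emitted either as scratch_store_* (GFX9+,
 * keeping the constant offset within the HW limit) or as MUBUF stores.
 */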
void
visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
   Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);

   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
   unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);

   unsigned write_count = 0;
   Temp write_datas[32];
   unsigned offsets[32];
   unsigned swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 16;
   split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
                      &write_count, write_datas, offsets);

   if (ctx->program->gfx_level >= GFX9) {
      uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
      offset = nir_src_is_const(instr->src[1]) ? Temp(0, s1) : offset;
      uint32_t base_const_offset =
         nir_src_is_const(instr->src[1]) ? nir_src_as_uint(instr->src[1]) : 0;

      for (unsigned i = 0; i < write_count; i++) {
         aco_opcode op;
         switch (write_datas[i].bytes()) {
         case 1: op = aco_opcode::scratch_store_byte; break;
         case 2: op = aco_opcode::scratch_store_short; break;
         case 4: op = aco_opcode::scratch_store_dword; break;
         case 8: op = aco_opcode::scratch_store_dwordx2; break;
         case 12: op = aco_opcode::scratch_store_dwordx3; break;
         case 16: op = aco_opcode::scratch_store_dwordx4; break;
         default: unreachable("Unexpected store size");
         }

         uint32_t const_offset = base_const_offset + offsets[i];
         assert(const_offset < max || offset.id() == 0);

         Operand addr = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
         Operand saddr = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
         if (offset.id() == 0)
            saddr = bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(const_offset, max)));

         bld.scratch(op, addr, saddr, write_datas[i], const_offset % max,
                     memory_sync_info(storage_scratch, semantic_private));
      }
   } else {
      Temp rsrc = get_scratch_resource(ctx);
      offset = as_vgpr(ctx, offset);
      for (unsigned i = 0; i < write_count; i++) {
         aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
         Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset,
                                        write_datas[i], offsets[i], true, true);
         mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
      }
   }
}

void
visit_emit_vertex_with_counter(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);

   unsigned stream = nir_intrinsic_stream_id(instr);
   Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
   next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u);
   nir_const_value* next_vertex_cv = nir_src_as_const_value(instr->src[0]);

   /* get GSVS ring */
   Temp gsvs_ring =
      bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer,
               Operand::c32(RING_GSVS_GS * 16u));

   unsigned num_components = ctx->program->info.gs.num_stream_output_components[stream];

   unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;
   unsigned stream_offset = 0;
   for (unsigned i = 0; i < stream; i++) {
      unsigned prev_stride = 4u * ctx->program->info.gs.num_stream_output_components[i] *
                             ctx->shader->info.gs.vertices_out;
      stream_offset += prev_stride * ctx->program->wave_size;
   }

   /* Limit on the stride field for <= GFX7. */
   assert(stride < (1 << 14));

   Temp gsvs_dwords[4];
   for (unsigned i = 0; i < 4; i++)
      gsvs_dwords[i] = bld.tmp(s1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(gsvs_dwords[0]), Definition(gsvs_dwords[1]),
              Definition(gsvs_dwords[2]), Definition(gsvs_dwords[3]), gsvs_ring);

   if (stream_offset) {
      Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand::c32(stream_offset));

      Temp carry = bld.tmp(s1);
      gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)),
                                gsvs_dwords[0], stream_offset_tmp);
      gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc),
                                gsvs_dwords[1], Operand::zero(), bld.scc(carry));
   }

   gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1],
                             Operand::c32(S_008F04_STRIDE(stride)));
   gsvs_dwords[2] = bld.copy(bld.def(s1), Operand::c32(ctx->program->wave_size));

   gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), gsvs_dwords[0], gsvs_dwords[1],
                          gsvs_dwords[2], gsvs_dwords[3]);

   unsigned offset = 0;
   for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
      if (ctx->program->info.gs.output_streams[i] != stream)
         continue;

      for (unsigned j = 0; j < 4; j++) {
         if (!(ctx->program->info.gs.output_usage_mask[i] & (1 << j)))
            continue;

         if (ctx->outputs.mask[i] & (1 << j)) {
            Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex);
            unsigned const_offset = (offset + (next_vertex_cv ?
next_vertex_cv->u32 : 0u)) * 4u; 7756 if (const_offset >= 4096u) { 7757 if (vaddr_offset.isUndefined()) 7758 vaddr_offset = bld.copy(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u)); 7759 else 7760 vaddr_offset = bld.vadd32(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u), 7761 vaddr_offset); 7762 const_offset %= 4096u; 7763 } 7764 7765 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>( 7766 aco_opcode::buffer_store_dword, Format::MUBUF, 4, 0)}; 7767 mubuf->operands[0] = Operand(gsvs_ring); 7768 mubuf->operands[1] = vaddr_offset; 7769 mubuf->operands[2] = Operand(get_arg(ctx, ctx->args->ac.gs2vs_offset)); 7770 mubuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]); 7771 mubuf->offen = !vaddr_offset.isUndefined(); 7772 mubuf->offset = const_offset; 7773 mubuf->glc = ctx->program->gfx_level < GFX11; 7774 mubuf->slc = true; 7775 mubuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder); 7776 bld.insert(std::move(mubuf)); 7777 } 7778 7779 offset += ctx->shader->info.gs.vertices_out; 7780 } 7781 7782 /* outputs for the next vertex are undefined and keeping them around can 7783 * create invalid IR with control flow */ 7784 ctx->outputs.mask[i] = 0; 7785 } 7786 7787 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream)); 7788} 7789 7790Temp 7791emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src) 7792{ 7793 Builder bld(ctx->program, ctx->block); 7794 7795 if (cluster_size == 1) { 7796 return src; 7797 } 7798 if (op == nir_op_iand && cluster_size == 4) { 7799 /* subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val) */ 7800 Temp tmp = 7801 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); 7802 return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), 7803 bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp)); 7804 } else if (op == nir_op_ior && cluster_size == 4) { 7805 /* subgroupClusteredOr(val, 4) -> wqm(val & exec) */ 7806 return bld.sop1( 7807 Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), 7808 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))); 7809 } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) { 7810 /* subgroupAnd(val) -> (exec & ~val) == 0 */ 7811 Temp tmp = 7812 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src) 7813 .def(1) 7814 .getTemp(); 7815 Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp)); 7816 return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond); 7817 } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) { 7818 /* subgroupOr(val) -> (val & exec) != 0 */ 7819 Temp tmp = 7820 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)) 7821 .def(1) 7822 .getTemp(); 7823 return bool_to_vector_condition(ctx, tmp); 7824 } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) { 7825 /* subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 */ 7826 Temp tmp = 7827 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); 7828 tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp); 7829 tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(1u)) 7830 .def(1) 7831 .getTemp(); 7832 return bool_to_vector_condition(ctx, tmp); 7833 } else { 7834 /* subgroupClustered{And,Or,Xor}(val, n): 7835 * lane_id = v_mbcnt_hi_u32_b32(-1, 
v_mbcnt_lo_u32_b32(-1, 0)) (just v_mbcnt_lo on wave32) 7836 * cluster_offset = ~(n - 1) & lane_id cluster_mask = ((1 << n) - 1) 7837 * subgroupClusteredAnd(): 7838 * return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask 7839 * subgroupClusteredOr(): 7840 * return ((val & exec) >> cluster_offset) & cluster_mask != 0 7841 * subgroupClusteredXor(): 7842 * return v_bnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0 7843 */ 7844 Temp lane_id = emit_mbcnt(ctx, bld.tmp(v1)); 7845 Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), 7846 Operand::c32(~uint32_t(cluster_size - 1)), lane_id); 7847 7848 Temp tmp; 7849 if (op == nir_op_iand) 7850 tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src, 7851 Operand(exec, bld.lm)); 7852 else 7853 tmp = 7854 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); 7855 7856 uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u; 7857 7858 if (ctx->program->gfx_level <= GFX7) 7859 tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset); 7860 else if (ctx->program->wave_size == 64) 7861 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp); 7862 else 7863 tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp); 7864 tmp = emit_extract_vector(ctx, tmp, 0, v1); 7865 if (cluster_mask != 0xffffffff) 7866 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(cluster_mask), tmp); 7867 7868 if (op == nir_op_iand) { 7869 return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::c32(cluster_mask), 7870 tmp); 7871 } else if (op == nir_op_ior) { 7872 return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp); 7873 } else if (op == nir_op_ixor) { 7874 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), 7875 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand::zero())); 7876 return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp); 7877 } 7878 assert(false); 7879 return Temp(); 7880 } 7881} 7882 7883Temp 7884emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src) 7885{ 7886 Builder bld(ctx->program, ctx->block); 7887 assert(src.regClass() == bld.lm); 7888 7889 /* subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0 7890 * subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0 7891 * subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0 7892 */ 7893 Temp tmp; 7894 if (op == nir_op_iand) 7895 tmp = 7896 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); 7897 else 7898 tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); 7899 7900 Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), Operand(tmp)); 7901 7902 if (op == nir_op_iand) 7903 return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), mbcnt); 7904 else if (op == nir_op_ior) 7905 return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), mbcnt); 7906 else if (op == nir_op_ixor) 7907 return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), 7908 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), mbcnt)); 7909 7910 assert(false); 7911 return Temp(); 7912} 7913 7914Temp 7915emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src) 7916{ 7917 Builder bld(ctx->program, ctx->block); 7918 7919 /* subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val 7920 * subgroupInclusiveOr(val) -> 
subgroupExclusiveOr(val) || val
    * subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
    */
   Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
   if (op == nir_op_iand)
      return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
   else if (op == nir_op_ior)
      return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
   else if (op == nir_op_ixor)
      return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);

   assert(false);
   return Temp();
}

ReduceOp
get_reduce_op(nir_op op, unsigned bit_size)
{
   switch (op) {
#define CASEI(name)                                                                               \
   case nir_op_##name:                                                                            \
      return (bit_size == 32)   ? name##32                                                        \
             : (bit_size == 16) ? name##16                                                        \
             : (bit_size == 8)  ? name##8                                                         \
                                : name##64;
#define CASEF(name)                                                                               \
   case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64;
      CASEI(iadd)
      CASEI(imul)
      CASEI(imin)
      CASEI(umin)
      CASEI(imax)
      CASEI(umax)
      CASEI(iand)
      CASEI(ior)
      CASEI(ixor)
      CASEF(fadd)
      CASEF(fmul)
      CASEF(fmin)
      CASEF(fmax)
   default: unreachable("unknown reduction op");
#undef CASEI
#undef CASEF
   }
}

void
emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
{
   Builder bld(ctx->program, ctx->block);
   Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
   assert(dst.regClass().type() != RegType::vgpr);
   if (src.regClass().type() == RegType::vgpr)
      bld.pseudo(aco_opcode::p_as_uniform, dst, src);
   else
      bld.copy(dst, src);
}

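/* Reduce a subgroup-uniform value: for iadd/fadd the result is the value
 * multiplied by the number of participating lanes ("count"), and for ixor it
 * only depends on the parity of that count.
 */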
void
emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
{
   Builder bld(ctx->program, ctx->block);
   Temp src_tmp = get_ssa_temp(ctx, src.ssa);

   if (op == nir_op_fadd) {
      src_tmp = as_vgpr(ctx, src_tmp);
      Temp tmp = dst.regClass() == s1 ? bld.tmp(RegClass::get(RegType::vgpr, src.ssa->bit_size / 8))
                                      : dst.getTemp();

      if (src.ssa->bit_size == 16) {
         count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
         bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
      } else {
         assert(src.ssa->bit_size == 32);
         count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
         bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
      }

      if (tmp != dst.getTemp())
         bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);

      return;
   }

   if (dst.regClass() == s1)
      src_tmp = bld.as_uniform(src_tmp);

   if (op == nir_op_ixor && count.type() == RegType::sgpr)
      count =
         bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));
   else if (op == nir_op_ixor)
      count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);

   assert(dst.getTemp().type() == count.type());

   if (nir_src_is_const(src)) {
      if (nir_src_as_uint(src) == 1 && dst.bytes() <= 2)
         bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
      else if (nir_src_as_uint(src) == 1)
         bld.copy(dst, count);
      else if (nir_src_as_uint(src) == 0)
         bld.copy(dst, Operand::zero(dst.bytes()));
      else if (count.type() == RegType::vgpr)
         bld.v_mul_imm(dst, count, nir_src_as_uint(src));
      else
         bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
   } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
      bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
   } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
      bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
   } else if (dst.getTemp().type() == RegType::vgpr) {
      bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
   } else {
      bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
   }
}

bool
emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
{
   nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
   if (op == nir_op_imul || op == nir_op_fmul)
      return false;

   if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
      Builder bld(ctx->program, ctx->block);
      Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
      unsigned bit_size = instr->src[0].ssa->bit_size;
      if (bit_size > 32)
         return false;

      Temp thread_count =
         bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));

      emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
   } else {
      emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
   }

   return true;
}

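/* Scans over a subgroup-uniform value: additive scans multiply the value by
 * the number of active lanes below the current one (mbcnt of exec, plus one
 * for inclusive scans). For min/max/and/or the inclusive scan is the value
 * itself and the exclusive scan just writes the reduction identity into the
 * first active lane.
 */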
bool
emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
   nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
   bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;

   if (op == nir_op_imul || op == nir_op_fmul)
      return false;

   if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
      if (instr->src[0].ssa->bit_size > 32)
         return false;

      Temp packed_tid;
      if (inc)
         packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
      else
         packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));

      emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
      return true;
   }

   assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||
          op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);

   if (inc) {
      emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
      return true;
   }

   /* Copy the source and write the reduction operation identity to the first lane. */
   Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
   Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
   ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
   if (dst.bytes() == 8) {
      Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
      uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
      uint32_t identity_hi = get_reduction_identity(reduce_op, 1);

      lo =
         bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_lo)), lane, lo);
      hi =
         bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_hi)), lane, hi);
      bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
   } else {
      uint32_t identity = get_reduction_identity(reduce_op, 0);
      bld.writelane(dst, bld.copy(bld.def(s1, m0), Operand::c32(identity)), lane,
                    as_vgpr(ctx, src));
   }

   return true;
}

Temp
emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
                     Definition dst, Temp src)
{
   assert(src.bytes() <= 8);
   assert(src.type() == RegType::vgpr);

   Builder bld(ctx->program, ctx->block);

   unsigned num_defs = 0;
   Definition defs[5];
   defs[num_defs++] = dst;
   defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */

   /* scalar identity temporary */
   bool need_sitmp = (ctx->program->gfx_level <= GFX7 || ctx->program->gfx_level >= GFX10) &&
                     aco_op != aco_opcode::p_reduce;
   if (aco_op == aco_opcode::p_exclusive_scan) {
      need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 ||
                     op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 ||
                     op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 ||
                     op == fmul64);
   }
   if (need_sitmp)
      defs[num_defs++] = bld.def(RegType::sgpr, dst.size());

   /* scc clobber */
   defs[num_defs++] = bld.def(s1, scc);

   /* vcc clobber */
   bool clobber_vcc = false;
   if ((op == iadd32 || op == imul64) && ctx->program->gfx_level < GFX9)
      clobber_vcc = true;
   if ((op == iadd8 || op == iadd16) && ctx->program->gfx_level < GFX8)
      clobber_vcc = true;
   if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)
      clobber_vcc = true;

   if (clobber_vcc)
      defs[num_defs++] = bld.def(bld.lm, vcc);

   Pseudo_reduction_instruction* reduce = create_instruction<Pseudo_reduction_instruction>(
      aco_op, Format::PSEUDO_REDUCTION, 3, num_defs);
   reduce->operands[0] = Operand(src);
   /* setup_reduce_temp will update these undef operands if needed */
   reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
   reduce->operands[2] = Operand(v1.as_linear());
   std::copy(defs, defs + num_defs, reduce->definitions.begin());

   reduce->reduce_op
= op; 8169 reduce->cluster_size = cluster_size; 8170 bld.insert(std::move(reduce)); 8171 8172 return dst.getTemp(); 8173} 8174 8175void 8176emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2) 8177{ 8178 Builder bld(ctx->program, ctx->block); 8179 Temp p1 = emit_extract_vector(ctx, bary, 0, v1); 8180 Temp p2 = emit_extract_vector(ctx, bary, 1, v1); 8181 8182 Temp ddx_1, ddx_2, ddy_1, ddy_2; 8183 uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0); 8184 uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1); 8185 uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2); 8186 8187 /* Build DD X/Y */ 8188 if (ctx->program->gfx_level >= GFX8) { 8189 Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0); 8190 ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1); 8191 ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2); 8192 Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0); 8193 ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1); 8194 ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2); 8195 } else { 8196 Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0); 8197 ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1); 8198 ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1); 8199 ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2); 8200 ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_1); 8201 8202 Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0); 8203 ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1); 8204 ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_2); 8205 ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2); 8206 ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2); 8207 } 8208 8209 /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */ 8210 aco_opcode mad = 8211 ctx->program->gfx_level >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32; 8212 Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1); 8213 Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2); 8214 tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1); 8215 tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2); 8216 Temp wqm1 = bld.tmp(v1); 8217 emit_wqm(bld, tmp1, wqm1, true); 8218 Temp wqm2 = bld.tmp(v1); 8219 emit_wqm(bld, tmp2, wqm2, true); 8220 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2); 8221 return; 8222} 8223 8224Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i); 8225void ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt); 8226static void create_primitive_exports(isel_context *ctx, Temp prim_ch1); 8227static void create_vs_exports(isel_context* ctx); 8228 8229Temp 8230get_interp_param(isel_context* ctx, nir_intrinsic_op intrin, 8231 enum glsl_interp_mode interp) 8232{ 8233 bool linear = interp == INTERP_MODE_NOPERSPECTIVE; 8234 if (intrin == nir_intrinsic_load_barycentric_pixel || 8235 intrin == nir_intrinsic_load_barycentric_at_sample || 8236 intrin == nir_intrinsic_load_barycentric_at_offset) { 8237 return get_arg(ctx, linear ? ctx->args->ac.linear_center : ctx->args->ac.persp_center); 8238 } else if (intrin == nir_intrinsic_load_barycentric_centroid) { 8239 return linear ? 
ctx->linear_centroid : ctx->persp_centroid; 8240 } else { 8241 assert(intrin == nir_intrinsic_load_barycentric_sample); 8242 return get_arg(ctx, linear ? ctx->args->ac.linear_sample : ctx->args->ac.persp_sample); 8243 } 8244} 8245 8246void 8247visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) 8248{ 8249 Builder bld(ctx->program, ctx->block); 8250 switch (instr->intrinsic) { 8251 case nir_intrinsic_load_barycentric_sample: 8252 case nir_intrinsic_load_barycentric_pixel: 8253 case nir_intrinsic_load_barycentric_centroid: { 8254 glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr); 8255 Temp bary = get_interp_param(ctx, instr->intrinsic, mode); 8256 assert(bary.size() == 2); 8257 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8258 bld.copy(Definition(dst), bary); 8259 emit_split_vector(ctx, dst, 2); 8260 break; 8261 } 8262 case nir_intrinsic_load_barycentric_model: { 8263 Temp model = get_arg(ctx, ctx->args->ac.pull_model); 8264 assert(model.size() == 3); 8265 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8266 bld.copy(Definition(dst), model); 8267 emit_split_vector(ctx, dst, 3); 8268 break; 8269 } 8270 case nir_intrinsic_load_barycentric_at_sample: { 8271 Temp bary = get_interp_param(ctx, instr->intrinsic, (glsl_interp_mode)nir_intrinsic_interp_mode(instr)); 8272 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8273 uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16; 8274 if (ctx->options->key.ps.num_samples == 2) { 8275 sample_pos_offset += 1 << 3; 8276 } else if (ctx->options->key.ps.num_samples == 4) { 8277 sample_pos_offset += 3 << 3; 8278 } else if (ctx->options->key.ps.num_samples == 8) { 8279 sample_pos_offset += 7 << 3; 8280 } else { 8281 assert(ctx->options->key.ps.num_samples == 0); 8282 bld.copy(Definition(dst), bary); 8283 emit_split_vector(ctx, dst, 2); 8284 break; 8285 } 8286 8287 Temp sample_pos; 8288 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa); 8289 nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]); 8290 Temp private_segment_buffer = ctx->program->private_segment_buffer; 8291 // TODO: bounds checking? 
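   /* Fetch the 2D sample position from the RING_PS_SAMPLE_POSITIONS ring in the
    * private segment buffer. The fastest available path is picked based on
    * whether the sample index is uniform (SMEM) and on the GPU generation
    * (global / flat / MUBUF loads).
    */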
8292 if (addr.type() == RegType::sgpr) { 8293 Operand offset; 8294 if (const_addr) { 8295 sample_pos_offset += const_addr->u32 << 3; 8296 offset = Operand::c32(sample_pos_offset); 8297 } else if (ctx->options->gfx_level >= GFX9) { 8298 offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, 8299 Operand::c32(sample_pos_offset)); 8300 } else { 8301 offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, 8302 Operand::c32(3u)); 8303 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, 8304 Operand::c32(sample_pos_offset)); 8305 } 8306 8307 Operand off = bld.copy(bld.def(s1), Operand(offset)); 8308 sample_pos = 8309 bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off); 8310 8311 } else if (ctx->options->gfx_level >= GFX9) { 8312 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr); 8313 sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, 8314 private_segment_buffer, sample_pos_offset); 8315 } else if (ctx->options->gfx_level >= GFX7) { 8316 /* addr += private_segment_buffer + sample_pos_offset */ 8317 Temp tmp0 = bld.tmp(s1); 8318 Temp tmp1 = bld.tmp(s1); 8319 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), 8320 private_segment_buffer); 8321 Definition scc_tmp = bld.def(s1, scc); 8322 tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, 8323 Operand::c32(sample_pos_offset)); 8324 tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, 8325 Operand::zero(), bld.scc(scc_tmp.getTemp())); 8326 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr); 8327 Temp pck0 = bld.tmp(v1); 8328 Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp(); 8329 tmp1 = as_vgpr(ctx, tmp1); 8330 Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.def(bld.lm), tmp1, 8331 Operand::zero(), carry); 8332 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1); 8333 8334 /* sample_pos = flat_load_dwordx2 addr */ 8335 sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1)); 8336 } else { 8337 assert(ctx->options->gfx_level == GFX6); 8338 8339 uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 8340 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); 8341 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, 8342 Operand::zero(), Operand::c32(rsrc_conf)); 8343 8344 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr); 8345 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand::zero()); 8346 8347 sample_pos = bld.tmp(v2); 8348 8349 aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>( 8350 aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)}; 8351 load->definitions[0] = Definition(sample_pos); 8352 load->operands[0] = Operand(rsrc); 8353 load->operands[1] = Operand(addr); 8354 load->operands[2] = Operand::zero(); 8355 load->offset = sample_pos_offset; 8356 load->offen = 0; 8357 load->addr64 = true; 8358 load->glc = false; 8359 load->dlc = false; 8360 load->disable_wqm = false; 8361 ctx->block->instructions.emplace_back(std::move(load)); 8362 } 8363 8364 /* sample_pos -= 0.5 */ 8365 Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1)); 8366 Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1)); 8367 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos); 8368 
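   /* 0x3f000000 is 0.5f: the sample position is in [0, 1), so subtracting 0.5
    * turns it into an offset from the pixel center for emit_interp_center(). */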
pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand::c32(0x3f000000u)); 8369 pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand::c32(0x3f000000u)); 8370 8371 emit_interp_center(ctx, dst, bary, pos1, pos2); 8372 break; 8373 } 8374 case nir_intrinsic_load_barycentric_at_offset: { 8375 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa); 8376 RegClass rc = RegClass(offset.type(), 1); 8377 Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc); 8378 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset); 8379 Temp bary = get_interp_param(ctx, instr->intrinsic, (glsl_interp_mode)nir_intrinsic_interp_mode(instr)); 8380 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), bary, pos1, pos2); 8381 break; 8382 } 8383 case nir_intrinsic_load_front_face: { 8384 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8385 Operand::zero(), get_arg(ctx, ctx->args->ac.front_face)); 8386 break; 8387 } 8388 case nir_intrinsic_load_view_index: { 8389 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8390 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index))); 8391 break; 8392 } 8393 case nir_intrinsic_load_frag_coord: { 8394 emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4); 8395 break; 8396 } 8397 case nir_intrinsic_load_frag_shading_rate: 8398 emit_load_frag_shading_rate(ctx, get_ssa_temp(ctx, &instr->dest.ssa)); 8399 break; 8400 case nir_intrinsic_load_sample_pos: { 8401 Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]); 8402 Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]); 8403 bld.pseudo( 8404 aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8405 posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand::zero(), 8406 posy.id() ? 
bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand::zero()); 8407 break; 8408 } 8409 case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break; 8410 case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break; 8411 case nir_intrinsic_store_output: visit_store_output(ctx, instr); break; 8412 case nir_intrinsic_load_input: 8413 case nir_intrinsic_load_input_vertex: visit_load_input(ctx, instr); break; 8414 case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break; 8415 case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break; 8416 case nir_intrinsic_load_push_constant: visit_load_push_constant(ctx, instr); break; 8417 case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break; 8418 case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break; 8419 case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break; 8420 case nir_intrinsic_shared_atomic_add: 8421 case nir_intrinsic_shared_atomic_imin: 8422 case nir_intrinsic_shared_atomic_umin: 8423 case nir_intrinsic_shared_atomic_imax: 8424 case nir_intrinsic_shared_atomic_umax: 8425 case nir_intrinsic_shared_atomic_and: 8426 case nir_intrinsic_shared_atomic_or: 8427 case nir_intrinsic_shared_atomic_xor: 8428 case nir_intrinsic_shared_atomic_exchange: 8429 case nir_intrinsic_shared_atomic_comp_swap: 8430 case nir_intrinsic_shared_atomic_fadd: 8431 case nir_intrinsic_shared_atomic_fmin: 8432 case nir_intrinsic_shared_atomic_fmax: visit_shared_atomic(ctx, instr); break; 8433 case nir_intrinsic_load_shared2_amd: 8434 case nir_intrinsic_store_shared2_amd: visit_access_shared2_amd(ctx, instr); break; 8435 case nir_intrinsic_bindless_image_load: 8436 case nir_intrinsic_bindless_image_sparse_load: visit_image_load(ctx, instr); break; 8437 case nir_intrinsic_bindless_image_store: visit_image_store(ctx, instr); break; 8438 case nir_intrinsic_bindless_image_atomic_add: 8439 case nir_intrinsic_bindless_image_atomic_umin: 8440 case nir_intrinsic_bindless_image_atomic_imin: 8441 case nir_intrinsic_bindless_image_atomic_umax: 8442 case nir_intrinsic_bindless_image_atomic_imax: 8443 case nir_intrinsic_bindless_image_atomic_and: 8444 case nir_intrinsic_bindless_image_atomic_or: 8445 case nir_intrinsic_bindless_image_atomic_xor: 8446 case nir_intrinsic_bindless_image_atomic_exchange: 8447 case nir_intrinsic_bindless_image_atomic_comp_swap: 8448 case nir_intrinsic_bindless_image_atomic_fmin: 8449 case nir_intrinsic_bindless_image_atomic_fmax: visit_image_atomic(ctx, instr); break; 8450 case nir_intrinsic_bindless_image_size: visit_image_size(ctx, instr); break; 8451 case nir_intrinsic_bindless_image_samples: visit_image_samples(ctx, instr); break; 8452 case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break; 8453 case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break; 8454 case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break; 8455 case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break; 8456 case nir_intrinsic_load_smem_amd: visit_load_smem(ctx, instr); break; 8457 case nir_intrinsic_load_global_amd: visit_load_global(ctx, instr); break; 8458 case nir_intrinsic_store_global_amd: visit_store_global(ctx, instr); break; 8459 case nir_intrinsic_global_atomic_add_amd: 8460 case nir_intrinsic_global_atomic_imin_amd: 8461 case nir_intrinsic_global_atomic_umin_amd: 8462 case nir_intrinsic_global_atomic_imax_amd: 8463 case nir_intrinsic_global_atomic_umax_amd: 8464 case 
nir_intrinsic_global_atomic_and_amd: 8465 case nir_intrinsic_global_atomic_or_amd: 8466 case nir_intrinsic_global_atomic_xor_amd: 8467 case nir_intrinsic_global_atomic_exchange_amd: 8468 case nir_intrinsic_global_atomic_comp_swap_amd: 8469 case nir_intrinsic_global_atomic_fmin_amd: 8470 case nir_intrinsic_global_atomic_fmax_amd: visit_global_atomic(ctx, instr); break; 8471 case nir_intrinsic_ssbo_atomic_add: 8472 case nir_intrinsic_ssbo_atomic_imin: 8473 case nir_intrinsic_ssbo_atomic_umin: 8474 case nir_intrinsic_ssbo_atomic_imax: 8475 case nir_intrinsic_ssbo_atomic_umax: 8476 case nir_intrinsic_ssbo_atomic_and: 8477 case nir_intrinsic_ssbo_atomic_or: 8478 case nir_intrinsic_ssbo_atomic_xor: 8479 case nir_intrinsic_ssbo_atomic_exchange: 8480 case nir_intrinsic_ssbo_atomic_comp_swap: 8481 case nir_intrinsic_ssbo_atomic_fmin: 8482 case nir_intrinsic_ssbo_atomic_fmax: visit_atomic_ssbo(ctx, instr); break; 8483 case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break; 8484 case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break; 8485 case nir_intrinsic_scoped_barrier: emit_scoped_barrier(ctx, instr); break; 8486 case nir_intrinsic_load_num_workgroups: { 8487 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8488 if (ctx->args->load_grid_size_from_user_sgpr) { 8489 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.num_work_groups)); 8490 } else { 8491 Temp addr = get_arg(ctx, ctx->args->ac.num_work_groups); 8492 assert(addr.regClass() == s2); 8493 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), 8494 bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand::zero()), 8495 bld.smem(aco_opcode::s_load_dword, bld.def(s1), addr, Operand::c32(8))); 8496 } 8497 emit_split_vector(ctx, dst, 3); 8498 break; 8499 } 8500 case nir_intrinsic_load_ray_launch_size_addr_amd: { 8501 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8502 Temp addr = get_arg(ctx, ctx->args->ac.ray_launch_size_addr); 8503 assert(addr.regClass() == s2); 8504 bld.copy(Definition(dst), Operand(addr)); 8505 break; 8506 } 8507 case nir_intrinsic_load_local_invocation_id: { 8508 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8509 if (ctx->options->gfx_level >= GFX11) { 8510 Temp local_ids[3]; 8511 8512 /* Thread IDs are packed in VGPR0, 10 bits per component. */ 8513 for (uint32_t i = 0; i < 3; i++) { 8514 local_ids[i] = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), 8515 get_arg(ctx, ctx->args->ac.local_invocation_ids), 8516 Operand::c32(i * 10u), Operand::c32(10u)); 8517 } 8518 8519 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), local_ids[0], local_ids[1], 8520 local_ids[2]); 8521 } else { 8522 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids))); 8523 } 8524 emit_split_vector(ctx, dst, 3); 8525 break; 8526 } 8527 case nir_intrinsic_load_workgroup_id: { 8528 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8529 if (ctx->stage.hw == HWStage::CS) { 8530 const struct ac_arg* ids = ctx->args->ac.workgroup_ids; 8531 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), 8532 ids[0].used ? Operand(get_arg(ctx, ids[0])) : Operand::zero(), 8533 ids[1].used ? Operand(get_arg(ctx, ids[1])) : Operand::zero(), 8534 ids[2].used ? 
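                    /* workgroup-id components without an SGPR argument are constant zero */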
Operand(get_arg(ctx, ids[2])) : Operand::zero()); 8535 emit_split_vector(ctx, dst, 3); 8536 } else { 8537 isel_err(&instr->instr, "Unsupported stage for load_workgroup_id"); 8538 } 8539 break; 8540 } 8541 case nir_intrinsic_load_local_invocation_index: { 8542 if (ctx->stage.hw == HWStage::LS || ctx->stage.hw == HWStage::HS) { 8543 if (ctx->options->gfx_level >= GFX11) { 8544 /* On GFX11, RelAutoIndex is WaveID * WaveSize + ThreadID. */ 8545 Temp wave_id = 8546 bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), 8547 get_arg(ctx, ctx->args->ac.tcs_wave_id), Operand::c32(0u | (5u << 16))); 8548 8549 Temp temp = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), wave_id, 8550 Operand::c32(ctx->program->wave_size)); 8551 Temp thread_id = emit_mbcnt(ctx, bld.tmp(v1)); 8552 8553 bld.vadd32(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), temp, thread_id); 8554 } else { 8555 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8556 get_arg(ctx, ctx->args->ac.vs_rel_patch_id)); 8557 } 8558 break; 8559 } else if (ctx->stage.hw == HWStage::GS || ctx->stage.hw == HWStage::NGG) { 8560 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), thread_id_in_threadgroup(ctx)); 8561 break; 8562 } else if (ctx->program->workgroup_size <= ctx->program->wave_size) { 8563 emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->dest.ssa)); 8564 break; 8565 } 8566 8567 Temp id = emit_mbcnt(ctx, bld.tmp(v1)); 8568 8569 /* The tg_size bits [6:11] contain the subgroup id, 8570 * we need this multiplied by the wave size, and then OR the thread id to it. 8571 */ 8572 if (ctx->program->wave_size == 64) { 8573 /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just 8574 * feed that to v_or */ 8575 Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), 8576 Operand::c32(0xfc0u), get_arg(ctx, ctx->args->ac.tg_size)); 8577 bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, 8578 id); 8579 } else { 8580 /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */ 8581 Temp tg_num = 8582 bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), 8583 get_arg(ctx, ctx->args->ac.tg_size), Operand::c32(0x6u | (0x6u << 16))); 8584 bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8585 tg_num, Operand::c32(0x5u), id); 8586 } 8587 break; 8588 } 8589 case nir_intrinsic_load_subgroup_id: { 8590 if (ctx->stage.hw == HWStage::CS) { 8591 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8592 bld.def(s1, scc), get_arg(ctx, ctx->args->ac.tg_size), 8593 Operand::c32(0x6u | (0x6u << 16))); 8594 } else if (ctx->stage.hw == HWStage::NGG) { 8595 /* Get the id of the current wave within the threadgroup (workgroup) */ 8596 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8597 bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info), 8598 Operand::c32(24u | (4u << 16))); 8599 } else { 8600 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::zero()); 8601 } 8602 break; 8603 } 8604 case nir_intrinsic_load_subgroup_invocation: { 8605 emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->dest.ssa)); 8606 break; 8607 } 8608 case nir_intrinsic_load_num_subgroups: { 8609 if (ctx->stage.hw == HWStage::CS) 8610 bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8611 bld.def(s1, scc), Operand::c32(0x3fu), get_arg(ctx, ctx->args->ac.tg_size)); 8612 else if (ctx->stage.hw == HWStage::NGG) 8613 
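         /* NGG: the wave count of the threadgroup is read from merged_wave_info bits [28:31] */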
bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8614 bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info), 8615 Operand::c32(28u | (4u << 16))); 8616 else 8617 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::c32(0x1u)); 8618 break; 8619 } 8620 case nir_intrinsic_ballot: { 8621 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 8622 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8623 8624 if (instr->src[0].ssa->bit_size == 1) { 8625 assert(src.regClass() == bld.lm); 8626 } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) { 8627 src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src); 8628 } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) { 8629 src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand::zero(), src); 8630 } else { 8631 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 8632 } 8633 8634 /* Make sure that all inactive lanes return zero. 8635 * Value-numbering might remove the comparison above */ 8636 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); 8637 if (dst.size() != bld.lm.size()) { 8638 /* Wave32 with ballot size set to 64 */ 8639 src = 8640 bld.pseudo(aco_opcode::p_create_vector, bld.def(dst.regClass()), src, Operand::zero()); 8641 } 8642 8643 emit_wqm(bld, src, dst); 8644 break; 8645 } 8646 case nir_intrinsic_shuffle: 8647 case nir_intrinsic_read_invocation: { 8648 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 8649 if (!nir_src_is_divergent(instr->src[0])) { 8650 emit_uniform_subgroup(ctx, instr, src); 8651 } else { 8652 Temp tid = get_ssa_temp(ctx, instr->src[1].ssa); 8653 if (instr->intrinsic == nir_intrinsic_read_invocation || 8654 !nir_src_is_divergent(instr->src[1])) 8655 tid = bld.as_uniform(tid); 8656 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8657 8658 if (instr->dest.ssa.bit_size != 1) 8659 src = as_vgpr(ctx, src); 8660 8661 if (src.regClass() == v1b || src.regClass() == v2b) { 8662 Temp tmp = bld.tmp(v1); 8663 tmp = emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), tmp); 8664 if (dst.type() == RegType::vgpr) 8665 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), 8666 bld.def(src.regClass() == v1b ? 
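                           /* the second definition collects the unused upper bytes of the bpermute result */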
v3b : v2b), tmp); 8667 else 8668 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); 8669 } else if (src.regClass() == v1) { 8670 emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), dst); 8671 } else if (src.regClass() == v2) { 8672 Temp lo = bld.tmp(v1), hi = bld.tmp(v1); 8673 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); 8674 lo = emit_wqm(bld, emit_bpermute(ctx, bld, tid, lo)); 8675 hi = emit_wqm(bld, emit_bpermute(ctx, bld, tid, hi)); 8676 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); 8677 emit_split_vector(ctx, dst, 2); 8678 } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) { 8679 assert(src.regClass() == bld.lm); 8680 Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid); 8681 bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst); 8682 } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) { 8683 assert(src.regClass() == bld.lm); 8684 Temp tmp; 8685 if (ctx->program->gfx_level <= GFX7) 8686 tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid); 8687 else if (ctx->program->wave_size == 64) 8688 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src); 8689 else 8690 tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src); 8691 tmp = emit_extract_vector(ctx, tmp, 0, v1); 8692 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), tmp); 8693 emit_wqm(bld, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp), 8694 dst); 8695 } else { 8696 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 8697 } 8698 } 8699 break; 8700 } 8701 case nir_intrinsic_load_sample_id: { 8702 bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8703 get_arg(ctx, ctx->args->ac.ancillary), Operand::c32(8u), Operand::c32(4u)); 8704 break; 8705 } 8706 case nir_intrinsic_read_first_invocation: { 8707 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 8708 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8709 if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) { 8710 emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), dst); 8711 } else if (src.regClass() == v2) { 8712 Temp lo = bld.tmp(v1), hi = bld.tmp(v1); 8713 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); 8714 lo = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo)); 8715 hi = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi)); 8716 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); 8717 emit_split_vector(ctx, dst, 2); 8718 } else if (instr->dest.ssa.bit_size == 1) { 8719 assert(src.regClass() == bld.lm); 8720 Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, 8721 bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm))); 8722 bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst); 8723 } else { 8724 bld.copy(Definition(dst), src); 8725 } 8726 break; 8727 } 8728 case nir_intrinsic_vote_all: { 8729 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 8730 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8731 assert(src.regClass() == bld.lm); 8732 assert(dst.regClass() == bld.lm); 8733 8734 Temp tmp = 8735 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src) 8736 .def(1) 8737 .getTemp(); 8738 Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp)); 8739 bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond); 8740 break; 8741 } 8742 case nir_intrinsic_vote_any: { 8743 
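      /* vote_any: collapse the mask to a scalar condition (any active lane set), then rebuild a lane mask */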
Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 8744 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8745 assert(src.regClass() == bld.lm); 8746 assert(dst.regClass() == bld.lm); 8747 8748 Temp tmp = bool_to_scalar_condition(ctx, src); 8749 bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst); 8750 break; 8751 } 8752 case nir_intrinsic_reduce: 8753 case nir_intrinsic_inclusive_scan: 8754 case nir_intrinsic_exclusive_scan: { 8755 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 8756 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8757 nir_op op = (nir_op)nir_intrinsic_reduction_op(instr); 8758 unsigned cluster_size = 8759 instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0; 8760 cluster_size = util_next_power_of_two( 8761 MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size)); 8762 8763 if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size && 8764 instr->dest.ssa.bit_size != 1) { 8765 /* We use divergence analysis to assign the regclass, so check if it's 8766 * working as expected */ 8767 ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan; 8768 if (instr->intrinsic == nir_intrinsic_inclusive_scan) 8769 expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor; 8770 assert(nir_dest_is_divergent(instr->dest) == expected_divergent); 8771 8772 if (instr->intrinsic == nir_intrinsic_reduce) { 8773 if (emit_uniform_reduce(ctx, instr)) 8774 break; 8775 } else if (emit_uniform_scan(ctx, instr)) { 8776 break; 8777 } 8778 } 8779 8780 if (instr->dest.ssa.bit_size == 1) { 8781 if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin) 8782 op = nir_op_iand; 8783 else if (op == nir_op_iadd) 8784 op = nir_op_ixor; 8785 else if (op == nir_op_umax || op == nir_op_imax) 8786 op = nir_op_ior; 8787 assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor); 8788 8789 switch (instr->intrinsic) { 8790 case nir_intrinsic_reduce: 8791 emit_wqm(bld, emit_boolean_reduce(ctx, op, cluster_size, src), dst); 8792 break; 8793 case nir_intrinsic_exclusive_scan: 8794 emit_wqm(bld, emit_boolean_exclusive_scan(ctx, op, src), dst); 8795 break; 8796 case nir_intrinsic_inclusive_scan: 8797 emit_wqm(bld, emit_boolean_inclusive_scan(ctx, op, src), dst); 8798 break; 8799 default: assert(false); 8800 } 8801 } else if (cluster_size == 1) { 8802 bld.copy(Definition(dst), src); 8803 } else { 8804 unsigned bit_size = instr->src[0].ssa->bit_size; 8805 8806 src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8)); 8807 8808 ReduceOp reduce_op = get_reduce_op(op, bit_size); 8809 8810 aco_opcode aco_op; 8811 switch (instr->intrinsic) { 8812 case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break; 8813 case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break; 8814 case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break; 8815 default: unreachable("unknown reduce intrinsic"); 8816 } 8817 8818 Temp tmp_dst = emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size, 8819 bld.def(dst.regClass()), src); 8820 emit_wqm(bld, tmp_dst, dst); 8821 } 8822 break; 8823 } 8824 case nir_intrinsic_quad_broadcast: 8825 case nir_intrinsic_quad_swap_horizontal: 8826 case nir_intrinsic_quad_swap_vertical: 8827 case nir_intrinsic_quad_swap_diagonal: 8828 case nir_intrinsic_quad_swizzle_amd: { 8829 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 8830 8831 if (!nir_dest_is_divergent(instr->dest)) { 8832 
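         /* divergence analysis proved the result uniform, so the swizzle reduces to reading the uniform source */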
emit_uniform_subgroup(ctx, instr, src); 8833 break; 8834 } 8835 8836 /* Quad broadcast lane. */ 8837 unsigned lane = 0; 8838 /* Use VALU for the bool instructions that don't have a SALU-only special case. */ 8839 bool bool_use_valu = instr->dest.ssa.bit_size == 1; 8840 8841 uint16_t dpp_ctrl = 0; 8842 8843 switch (instr->intrinsic) { 8844 case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break; 8845 case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break; 8846 case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break; 8847 case nir_intrinsic_quad_swizzle_amd: dpp_ctrl = nir_intrinsic_swizzle_mask(instr); break; 8848 case nir_intrinsic_quad_broadcast: 8849 lane = nir_src_as_const_value(instr->src[1])->u32; 8850 dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane); 8851 bool_use_valu = false; 8852 break; 8853 default: break; 8854 } 8855 8856 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8857 Temp tmp(dst); 8858 8859 /* Setup source. */ 8860 if (bool_use_valu) 8861 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), 8862 Operand::c32(-1), src); 8863 else if (instr->dest.ssa.bit_size != 1) 8864 src = as_vgpr(ctx, src); 8865 8866 /* Setup temporary destination. */ 8867 if (bool_use_valu) 8868 tmp = bld.tmp(v1); 8869 else if (ctx->program->stage == fragment_fs) 8870 tmp = bld.tmp(dst.regClass()); 8871 8872 if (instr->dest.ssa.bit_size == 1 && instr->intrinsic == nir_intrinsic_quad_broadcast) { 8873 /* Special case for quad broadcast using SALU only. */ 8874 assert(src.regClass() == bld.lm && tmp.regClass() == bld.lm); 8875 8876 uint32_t half_mask = 0x11111111u << lane; 8877 Operand mask_tmp = bld.lm.bytes() == 4 8878 ? Operand::c32(half_mask) 8879 : bld.pseudo(aco_opcode::p_create_vector, bld.def(bld.lm), 8880 Operand::c32(half_mask), Operand::c32(half_mask)); 8881 8882 src = 8883 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); 8884 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, src); 8885 bld.sop1(Builder::s_wqm, Definition(tmp), src); 8886 } else if (instr->dest.ssa.bit_size <= 32 || bool_use_valu) { 8887 unsigned excess_bytes = bool_use_valu ? 0 : 4 - instr->dest.ssa.bit_size / 8; 8888 Definition def = excess_bytes ? 
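         /* sub-dword results are written to a full dword first; the excess bytes are split off below */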
bld.def(v1) : Definition(tmp); 8889 8890 if (ctx->program->gfx_level >= GFX8) 8891 bld.vop1_dpp(aco_opcode::v_mov_b32, def, src, dpp_ctrl); 8892 else 8893 bld.ds(aco_opcode::ds_swizzle_b32, def, src, (1 << 15) | dpp_ctrl); 8894 8895 if (excess_bytes) 8896 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp), 8897 bld.def(RegClass::get(tmp.type(), excess_bytes)), def.getTemp()); 8898 } else if (instr->dest.ssa.bit_size == 64) { 8899 Temp lo = bld.tmp(v1), hi = bld.tmp(v1); 8900 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); 8901 8902 if (ctx->program->gfx_level >= GFX8) { 8903 lo = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl); 8904 hi = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl); 8905 } else { 8906 lo = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl); 8907 hi = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl); 8908 } 8909 8910 bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lo, hi); 8911 emit_split_vector(ctx, tmp, 2); 8912 } else { 8913 isel_err(&instr->instr, "Unimplemented NIR quad group instruction bit size."); 8914 } 8915 8916 if (tmp.id() != dst.id()) { 8917 if (bool_use_valu) 8918 tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp); 8919 8920 /* Vulkan spec 9.25: Helper invocations must be active for quad group instructions. */ 8921 emit_wqm(bld, tmp, dst, true); 8922 } 8923 8924 break; 8925 } 8926 case nir_intrinsic_masked_swizzle_amd: { 8927 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 8928 if (!nir_dest_is_divergent(instr->dest)) { 8929 emit_uniform_subgroup(ctx, instr, src); 8930 break; 8931 } 8932 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8933 uint32_t mask = nir_intrinsic_swizzle_mask(instr); 8934 8935 if (instr->dest.ssa.bit_size != 1) 8936 src = as_vgpr(ctx, src); 8937 8938 if (instr->dest.ssa.bit_size == 1) { 8939 assert(src.regClass() == bld.lm); 8940 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), 8941 Operand::c32(-1), src); 8942 src = emit_masked_swizzle(ctx, bld, src, mask); 8943 Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src); 8944 emit_wqm(bld, tmp, dst); 8945 } else if (dst.regClass() == v1b) { 8946 Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask)); 8947 emit_extract_vector(ctx, tmp, 0, dst); 8948 } else if (dst.regClass() == v2b) { 8949 Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask)); 8950 emit_extract_vector(ctx, tmp, 0, dst); 8951 } else if (dst.regClass() == v1) { 8952 emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask), dst); 8953 } else if (dst.regClass() == v2) { 8954 Temp lo = bld.tmp(v1), hi = bld.tmp(v1); 8955 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); 8956 lo = emit_wqm(bld, emit_masked_swizzle(ctx, bld, lo, mask)); 8957 hi = emit_wqm(bld, emit_masked_swizzle(ctx, bld, hi, mask)); 8958 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); 8959 emit_split_vector(ctx, dst, 2); 8960 } else { 8961 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 8962 } 8963 break; 8964 } 8965 case nir_intrinsic_write_invocation_amd: { 8966 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); 8967 Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)); 8968 Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)); 8969 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8970 if (dst.regClass() == v1) { 8971 /* src2 is ignored for 
writelane. RA assigns the same reg for dst */
         emit_wqm(bld, bld.writelane(bld.def(v1), val, lane, src), dst);
      } else if (dst.regClass() == v2) {
         Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
         Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
         bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
         Temp lo = emit_wqm(bld, bld.writelane(bld.def(v1), val_lo, lane, src_lo));
         Temp hi = emit_wqm(bld, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
         emit_split_vector(ctx, dst, 2);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_intrinsic_mbcnt_amd: {
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      /* Fit 64-bit mask for wave32 */
      src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
      Temp wqm_tmp = emit_mbcnt(ctx, bld.tmp(v1), Operand(src), Operand(add_src));
      emit_wqm(bld, wqm_tmp, dst);
      break;
   }
   case nir_intrinsic_byte_permute_amd: {
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      assert(dst.regClass() == v1);
      assert(ctx->program->gfx_level >= GFX8);
      bld.vop3(aco_opcode::v_perm_b32, Definition(dst), get_ssa_temp(ctx, instr->src[0].ssa),
               as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)),
               as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));
      break;
   }
   case nir_intrinsic_lane_permute_16_amd: {
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      assert(ctx->program->gfx_level >= GFX10);

      if (src.regClass() == s1) {
         bld.copy(Definition(dst), src);
      } else if (dst.regClass() == v1 && src.regClass() == v1) {
         bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src,
                  bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),
                  bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));
      } else {
         isel_err(&instr->instr, "Unimplemented lane_permute_16_amd");
      }
      break;
   }
   case nir_intrinsic_load_helper_invocation:
   case nir_intrinsic_is_helper_invocation: {
      /* load_helper() after demote() gets lowered to is_helper().
       * Otherwise, these two behave the same.
*/ 9026 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 9027 bld.pseudo(aco_opcode::p_is_helper, Definition(dst), Operand(exec, bld.lm)); 9028 ctx->block->kind |= block_kind_needs_lowering; 9029 ctx->program->needs_exact = true; 9030 break; 9031 } 9032 case nir_intrinsic_demote: 9033 bld.pseudo(aco_opcode::p_demote_to_helper, Operand::c32(-1u)); 9034 9035 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) 9036 ctx->cf_info.exec_potentially_empty_discard = true; 9037 ctx->block->kind |= block_kind_uses_discard; 9038 ctx->program->needs_exact = true; 9039 break; 9040 case nir_intrinsic_demote_if: { 9041 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 9042 assert(src.regClass() == bld.lm); 9043 Temp cond = 9044 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); 9045 bld.pseudo(aco_opcode::p_demote_to_helper, cond); 9046 9047 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) 9048 ctx->cf_info.exec_potentially_empty_discard = true; 9049 ctx->block->kind |= block_kind_uses_discard; 9050 ctx->program->needs_exact = true; 9051 break; 9052 } 9053 case nir_intrinsic_terminate: 9054 case nir_intrinsic_terminate_if: 9055 case nir_intrinsic_discard: 9056 case nir_intrinsic_discard_if: { 9057 Operand cond = Operand::c32(-1u); 9058 if (instr->intrinsic == nir_intrinsic_discard_if || 9059 instr->intrinsic == nir_intrinsic_terminate_if) { 9060 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 9061 assert(src.regClass() == bld.lm); 9062 cond = 9063 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); 9064 } 9065 9066 bld.pseudo(aco_opcode::p_discard_if, cond); 9067 9068 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) 9069 ctx->cf_info.exec_potentially_empty_discard = true; 9070 ctx->block->kind |= block_kind_uses_discard; 9071 ctx->program->needs_exact = true; 9072 break; 9073 } 9074 case nir_intrinsic_first_invocation: { 9075 emit_wqm(bld, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)), 9076 get_ssa_temp(ctx, &instr->dest.ssa)); 9077 break; 9078 } 9079 case nir_intrinsic_last_invocation: { 9080 Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm)); 9081 Temp last = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), 9082 Operand::c32(ctx->program->wave_size - 1u), flbit); 9083 emit_wqm(bld, last, get_ssa_temp(ctx, &instr->dest.ssa)); 9084 break; 9085 } 9086 case nir_intrinsic_elect: { 9087 /* p_elect is lowered in aco_insert_exec_mask. 9088 * Use exec as an operand so value numbering and the pre-RA optimizer won't recognize 9089 * two p_elect with different exec masks as the same. 9090 */ 9091 Temp elected = bld.pseudo(aco_opcode::p_elect, bld.def(bld.lm), Operand(exec, bld.lm)); 9092 emit_wqm(bld, elected, get_ssa_temp(ctx, &instr->dest.ssa)); 9093 ctx->block->kind |= block_kind_needs_lowering; 9094 break; 9095 } 9096 case nir_intrinsic_shader_clock: { 9097 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 9098 if (nir_intrinsic_memory_scope(instr) == NIR_SCOPE_SUBGROUP && 9099 ctx->options->gfx_level >= GFX10_3) { 9100 /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */ 9101 Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29); 9102 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand::zero()); 9103 } else { 9104 aco_opcode opcode = nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE 9105 ? 
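                              /* s_memrealtime counts at a fixed (clock-independent) rate; s_memtime counts shader clocks */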
aco_opcode::s_memrealtime 9106 : aco_opcode::s_memtime; 9107 bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile)); 9108 } 9109 emit_split_vector(ctx, dst, 2); 9110 break; 9111 } 9112 case nir_intrinsic_load_vertex_id_zero_base: { 9113 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 9114 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id)); 9115 break; 9116 } 9117 case nir_intrinsic_load_first_vertex: { 9118 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 9119 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex)); 9120 break; 9121 } 9122 case nir_intrinsic_load_base_instance: { 9123 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 9124 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance)); 9125 break; 9126 } 9127 case nir_intrinsic_load_instance_id: { 9128 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 9129 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id)); 9130 break; 9131 } 9132 case nir_intrinsic_load_draw_id: { 9133 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 9134 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id)); 9135 break; 9136 } 9137 case nir_intrinsic_load_invocation_id: { 9138 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 9139 9140 if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) { 9141 if (ctx->options->gfx_level >= GFX10) 9142 bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand::c32(127u), 9143 get_arg(ctx, ctx->args->ac.gs_invocation_id)); 9144 else 9145 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id)); 9146 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) { 9147 bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->ac.tcs_rel_ids), 9148 Operand::c32(8u), Operand::c32(5u)); 9149 } else { 9150 unreachable("Unsupported stage for load_invocation_id"); 9151 } 9152 9153 break; 9154 } 9155 case nir_intrinsic_load_primitive_id: { 9156 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 9157 9158 switch (ctx->shader->info.stage) { 9159 case MESA_SHADER_GEOMETRY: 9160 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id)); 9161 break; 9162 case MESA_SHADER_TESS_CTRL: 9163 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tcs_patch_id)); 9164 break; 9165 case MESA_SHADER_TESS_EVAL: 9166 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tes_patch_id)); 9167 break; 9168 default: 9169 if (ctx->stage.hw == HWStage::NGG && !ctx->stage.has(SWStage::GS)) { 9170 /* In case of NGG, the GS threads always have the primitive ID 9171 * even if there is no SW GS. 
*/ 9172 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id)); 9173 break; 9174 } else if (ctx->shader->info.stage == MESA_SHADER_VERTEX) { 9175 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vs_prim_id)); 9176 break; 9177 } 9178 unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id"); 9179 } 9180 9181 break; 9182 } 9183 case nir_intrinsic_emit_vertex_with_counter: { 9184 assert(ctx->stage.hw == HWStage::GS); 9185 visit_emit_vertex_with_counter(ctx, instr); 9186 break; 9187 } 9188 case nir_intrinsic_end_primitive_with_counter: { 9189 if (ctx->stage.hw != HWStage::NGG) { 9190 unsigned stream = nir_intrinsic_stream_id(instr); 9191 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, 9192 sendmsg_gs(true, false, stream)); 9193 } 9194 break; 9195 } 9196 case nir_intrinsic_set_vertex_and_primitive_count: { 9197 assert(ctx->stage.hw == HWStage::GS); 9198 /* unused in the legacy pipeline, the HW keeps track of this for us */ 9199 break; 9200 } 9201 case nir_intrinsic_has_input_vertex_amd: 9202 case nir_intrinsic_has_input_primitive_amd: { 9203 assert(ctx->stage.hw == HWStage::NGG); 9204 unsigned i = instr->intrinsic == nir_intrinsic_has_input_vertex_amd ? 0 : 1; 9205 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), merged_wave_info_to_mask(ctx, i)); 9206 break; 9207 } 9208 case nir_intrinsic_export_vertex_amd: { 9209 ctx->block->kind |= block_kind_export_end; 9210 create_vs_exports(ctx); 9211 break; 9212 } 9213 case nir_intrinsic_export_primitive_amd: { 9214 Temp prim_ch1 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); 9215 create_primitive_exports(ctx, prim_ch1); 9216 break; 9217 } 9218 case nir_intrinsic_alloc_vertices_and_primitives_amd: { 9219 assert(ctx->stage.hw == HWStage::NGG); 9220 Temp num_vertices = get_ssa_temp(ctx, instr->src[0].ssa); 9221 Temp num_primitives = get_ssa_temp(ctx, instr->src[1].ssa); 9222 ngg_emit_sendmsg_gs_alloc_req(ctx, num_vertices, num_primitives); 9223 break; 9224 } 9225 case nir_intrinsic_gds_atomic_add_amd: { 9226 Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa); 9227 Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa); 9228 Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa); 9229 Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val))); 9230 bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u, 9231 true); 9232 break; 9233 } 9234 case nir_intrinsic_load_sbt_base_amd: { 9235 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 9236 Temp addr = get_arg(ctx, ctx->args->ac.sbt_descriptors); 9237 assert(addr.regClass() == s2); 9238 bld.copy(Definition(dst), Operand(addr)); 9239 break; 9240 } 9241 case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break; 9242 case nir_intrinsic_overwrite_vs_arguments_amd: { 9243 ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa); 9244 ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa); 9245 break; 9246 } 9247 case nir_intrinsic_overwrite_tes_arguments_amd: { 9248 ctx->arg_temps[ctx->args->ac.tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa); 9249 ctx->arg_temps[ctx->args->ac.tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa); 9250 ctx->arg_temps[ctx->args->ac.tes_rel_patch_id.arg_index] = 9251 get_ssa_temp(ctx, instr->src[2].ssa); 9252 ctx->arg_temps[ctx->args->ac.tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa); 9253 break; 9254 } 9255 case 
nir_intrinsic_load_force_vrs_rates_amd: { 9256 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 9257 get_arg(ctx, ctx->args->ac.force_vrs_rates)); 9258 break; 9259 } 9260 case nir_intrinsic_load_scalar_arg_amd: 9261 case nir_intrinsic_load_vector_arg_amd: { 9262 assert(nir_intrinsic_base(instr) < ctx->args->ac.arg_count); 9263 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 9264 Temp src = ctx->arg_temps[nir_intrinsic_base(instr)]; 9265 assert(src.id()); 9266 assert(src.type() == (instr->intrinsic == nir_intrinsic_load_scalar_arg_amd ? RegType::sgpr : RegType::vgpr)); 9267 bld.copy(Definition(dst), src); 9268 emit_split_vector(ctx, dst, dst.size()); 9269 break; 9270 } 9271 default: 9272 isel_err(&instr->instr, "Unimplemented intrinsic instr"); 9273 abort(); 9274 9275 break; 9276 } 9277} 9278 9279void 9280build_cube_select(isel_context* ctx, Temp ma, Temp id, Temp deriv, Temp* out_ma, Temp* out_sc, 9281 Temp* out_tc) 9282{ 9283 Builder bld(ctx->program, ctx->block); 9284 9285 Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1); 9286 Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1); 9287 Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1); 9288 9289 Operand neg_one = Operand::c32(0xbf800000u); 9290 Operand one = Operand::c32(0x3f800000u); 9291 Operand two = Operand::c32(0x40000000u); 9292 Operand four = Operand::c32(0x40800000u); 9293 9294 Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), Operand::zero(), ma); 9295 Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive); 9296 Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::zero(), sgn_ma); 9297 9298 Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), four, id); 9299 Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id); 9300 is_ma_y = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), is_ma_y, is_ma_z); 9301 Temp is_not_ma_x = 9302 bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), is_ma_z, is_ma_y); 9303 9304 /* select sc */ 9305 Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x); 9306 Temp sgn = bld.vop2_e64( 9307 aco_opcode::v_cndmask_b32, bld.def(v1), 9308 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z), one, is_ma_y); 9309 *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn); 9310 9311 /* select tc */ 9312 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y); 9313 sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y); 9314 *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn); 9315 9316 /* select ma */ 9317 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), 9318 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y), 9319 deriv_z, is_ma_z); 9320 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffffu), tmp); 9321 *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp); 9322} 9323 9324void 9325prepare_cube_coords(isel_context* ctx, std::vector<Temp>& coords, Temp* ddx, Temp* ddy, 9326 bool is_deriv, bool is_array) 9327{ 9328 Builder bld(ctx->program, ctx->block); 9329 Temp ma, tc, sc, id; 9330 aco_opcode madak = 9331 ctx->program->gfx_level >= GFX10_3 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_madak_f32; 9332 aco_opcode madmk = 9333 ctx->program->gfx_level >= GFX10_3 ? 
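      /* GFX10.3+ uses the fused-multiply-add forms of the madak/madmk opcodes */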
aco_opcode::v_fmamk_f32 : aco_opcode::v_madmk_f32; 9334 9335 /* see comment in ac_prepare_cube_coords() */ 9336 if (is_array && ctx->options->gfx_level <= GFX8) 9337 coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), coords[3]); 9338 9339 ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]); 9340 9341 aco_ptr<VOP3_instruction> vop3a{ 9342 create_instruction<VOP3_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)}; 9343 vop3a->operands[0] = Operand(ma); 9344 vop3a->abs[0] = true; 9345 Temp invma = bld.tmp(v1); 9346 vop3a->definitions[0] = Definition(invma); 9347 ctx->block->instructions.emplace_back(std::move(vop3a)); 9348 9349 sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]); 9350 if (!is_deriv) 9351 sc = bld.vop2(madak, bld.def(v1), sc, invma, Operand::c32(0x3fc00000u /*1.5*/)); 9352 9353 tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]); 9354 if (!is_deriv) 9355 tc = bld.vop2(madak, bld.def(v1), tc, invma, Operand::c32(0x3fc00000u /*1.5*/)); 9356 9357 id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]); 9358 9359 if (is_deriv) { 9360 sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma); 9361 tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma); 9362 9363 for (unsigned i = 0; i < 2; i++) { 9364 /* see comment in ac_prepare_cube_coords() */ 9365 Temp deriv_ma; 9366 Temp deriv_sc, deriv_tc; 9367 build_cube_select(ctx, ma, id, i ? *ddy : *ddx, &deriv_ma, &deriv_sc, &deriv_tc); 9368 9369 deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma); 9370 9371 Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), 9372 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma), 9373 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc)); 9374 Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), 9375 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma), 9376 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc)); 9377 *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y); 9378 } 9379 9380 sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3fc00000u /*1.5*/), sc); 9381 tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3fc00000u /*1.5*/), tc); 9382 } 9383 9384 if (is_array) { 9385 id = bld.vop2(madmk, bld.def(v1), coords[3], id, Operand::c32(0x41000000u /*8.0*/)); 9386 coords.erase(coords.begin() + 3); 9387 } 9388 coords[0] = sc; 9389 coords[1] = tc; 9390 coords[2] = id; 9391} 9392 9393void 9394get_const_vec(nir_ssa_def* vec, nir_const_value* cv[4]) 9395{ 9396 if (vec->parent_instr->type != nir_instr_type_alu) 9397 return; 9398 nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr); 9399 if (vec_instr->op != nir_op_vec(vec->num_components)) 9400 return; 9401 9402 for (unsigned i = 0; i < vec->num_components; i++) { 9403 cv[i] = 9404 vec_instr->src[i].swizzle[0] == 0 ? 
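               /* only unswizzled (.x) sources are treated as known constants */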
nir_src_as_const_value(vec_instr->src[i].src) : NULL; 9405 } 9406} 9407 9408void 9409visit_tex(isel_context* ctx, nir_tex_instr* instr) 9410{ 9411 assert(instr->op != nir_texop_txf_ms && instr->op != nir_texop_samples_identical); 9412 9413 Builder bld(ctx->program, ctx->block); 9414 bool has_bias = false, has_lod = false, level_zero = false, has_compare = false, 9415 has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, 9416 has_sample_index = false, has_clamped_lod = false; 9417 Temp resource, sampler, bias = Temp(), compare = Temp(), sample_index = Temp(), lod = Temp(), 9418 offset = Temp(), ddx = Temp(), ddy = Temp(), clamped_lod = Temp(), 9419 coord = Temp(); 9420 std::vector<Temp> coords; 9421 std::vector<Temp> derivs; 9422 nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL}; 9423 9424 for (unsigned i = 0; i < instr->num_srcs; i++) { 9425 switch (instr->src[i].src_type) { 9426 case nir_tex_src_texture_handle: 9427 resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa)); 9428 break; 9429 case nir_tex_src_sampler_handle: 9430 sampler = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa)); 9431 break; 9432 default: break; 9433 } 9434 } 9435 9436 bool tg4_integer_workarounds = ctx->options->gfx_level <= GFX8 && instr->op == nir_texop_tg4 && 9437 (instr->dest_type & (nir_type_int | nir_type_uint)); 9438 bool tg4_integer_cube_workaround = 9439 tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE; 9440 9441 bool a16 = false, g16 = false; 9442 9443 int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord); 9444 if (coord_idx > 0) 9445 a16 = instr->src[coord_idx].src.ssa->bit_size == 16; 9446 9447 int ddx_idx = nir_tex_instr_src_index(instr, nir_tex_src_ddx); 9448 if (ddx_idx > 0) 9449 g16 = instr->src[ddx_idx].src.ssa->bit_size == 16; 9450 9451 for (unsigned i = 0; i < instr->num_srcs; i++) { 9452 switch (instr->src[i].src_type) { 9453 case nir_tex_src_coord: { 9454 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32)); 9455 coord = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16); 9456 break; 9457 } 9458 case nir_tex_src_bias: 9459 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32)); 9460 /* Doesn't need get_ssa_temp_tex because we pack it into its own dword anyway. */ 9461 bias = get_ssa_temp(ctx, instr->src[i].src.ssa); 9462 has_bias = true; 9463 break; 9464 case nir_tex_src_lod: { 9465 if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) { 9466 level_zero = true; 9467 } else { 9468 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32)); 9469 lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16); 9470 has_lod = true; 9471 } 9472 break; 9473 } 9474 case nir_tex_src_min_lod: 9475 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32)); 9476 clamped_lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16); 9477 has_clamped_lod = true; 9478 break; 9479 case nir_tex_src_comparator: 9480 if (instr->is_shadow) { 9481 assert(instr->src[i].src.ssa->bit_size == 32); 9482 compare = get_ssa_temp(ctx, instr->src[i].src.ssa); 9483 has_compare = true; 9484 } 9485 break; 9486 case nir_tex_src_offset: 9487 assert(instr->src[i].src.ssa->bit_size == 32); 9488 offset = get_ssa_temp(ctx, instr->src[i].src.ssa); 9489 get_const_vec(instr->src[i].src.ssa, const_offset); 9490 has_offset = true; 9491 break; 9492 case nir_tex_src_ddx: 9493 assert(instr->src[i].src.ssa->bit_size == (g16 ? 
16 : 32)); 9494 ddx = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16); 9495 has_ddx = true; 9496 break; 9497 case nir_tex_src_ddy: 9498 assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32)); 9499 ddy = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16); 9500 has_ddy = true; 9501 break; 9502 case nir_tex_src_ms_index: 9503 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32)); 9504 sample_index = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16); 9505 has_sample_index = true; 9506 break; 9507 case nir_tex_src_texture_offset: 9508 case nir_tex_src_sampler_offset: 9509 default: break; 9510 } 9511 } 9512 9513 if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) 9514 return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa)); 9515 9516 if (instr->op == nir_texop_texture_samples) { 9517 get_image_samples(ctx, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), resource); 9518 return; 9519 } 9520 9521 if (has_offset) { 9522 assert(instr->op != nir_texop_txf); 9523 9524 aco_ptr<Instruction> tmp_instr; 9525 Temp acc, pack = Temp(); 9526 9527 uint32_t pack_const = 0; 9528 for (unsigned i = 0; i < offset.size(); i++) { 9529 if (!const_offset[i]) 9530 continue; 9531 pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i); 9532 } 9533 9534 if (offset.type() == RegType::sgpr) { 9535 for (unsigned i = 0; i < offset.size(); i++) { 9536 if (const_offset[i]) 9537 continue; 9538 9539 acc = emit_extract_vector(ctx, offset, i, s1); 9540 acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, 9541 Operand::c32(0x3Fu)); 9542 9543 if (i) { 9544 acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, 9545 Operand::c32(8u * i)); 9546 } 9547 9548 if (pack == Temp()) { 9549 pack = acc; 9550 } else { 9551 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc); 9552 } 9553 } 9554 9555 if (pack_const && pack != Temp()) 9556 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), 9557 Operand::c32(pack_const), pack); 9558 } else { 9559 for (unsigned i = 0; i < offset.size(); i++) { 9560 if (const_offset[i]) 9561 continue; 9562 9563 acc = emit_extract_vector(ctx, offset, i, v1); 9564 acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc); 9565 9566 if (i) { 9567 acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc); 9568 } 9569 9570 if (pack == Temp()) { 9571 pack = acc; 9572 } else { 9573 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc); 9574 } 9575 } 9576 9577 if (pack_const && pack != Temp()) 9578 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack); 9579 } 9580 if (pack_const && pack == Temp()) 9581 offset = bld.copy(bld.def(v1), Operand::c32(pack_const)); 9582 else if (pack == Temp()) 9583 has_offset = false; 9584 else 9585 offset = pack; 9586 } 9587 9588 unsigned wqm_coord_count = 0; 9589 std::vector<Temp> unpacked_coord; 9590 if (ctx->options->gfx_level == GFX9 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D && 9591 instr->op != nir_texop_lod && instr->coord_components) { 9592 RegClass rc = a16 ? v2b : v1; 9593 for (unsigned i = 0; i < coord.bytes() / rc.bytes(); i++) 9594 unpacked_coord.emplace_back(emit_extract_vector(ctx, coord, i, rc)); 9595 9596 assert(unpacked_coord.size() > 0 && unpacked_coord.size() < 3); 9597 9598 Operand coord2d; 9599 /* 0.5 for floating point coords, 0 for integer. */ 9600 if (a16) 9601 coord2d = instr->op == nir_texop_txf ? 
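         /* 0x3800 is 0.5 in fp16, 0x3f000000 is 0.5 in fp32 */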
Operand::c16(0) : Operand::c16(0x3800); 9602 else 9603 coord2d = instr->op == nir_texop_txf ? Operand::c32(0) : Operand::c32(0x3f000000); 9604 unpacked_coord.insert(std::next(unpacked_coord.begin()), bld.copy(bld.def(rc), coord2d)); 9605 wqm_coord_count = a16 ? DIV_ROUND_UP(unpacked_coord.size(), 2) : unpacked_coord.size(); 9606 } else if (coord != Temp()) { 9607 unpacked_coord.push_back(coord); 9608 wqm_coord_count = DIV_ROUND_UP(coord.bytes(), 4); 9609 } 9610 9611 if (has_sample_index) 9612 unpacked_coord.push_back(sample_index); 9613 if (has_lod) 9614 unpacked_coord.push_back(lod); 9615 if (has_clamped_lod) 9616 unpacked_coord.push_back(clamped_lod); 9617 9618 coords = emit_pack_v1(ctx, unpacked_coord); 9619 9620 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE || !a16); 9621 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components) 9622 prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd, 9623 instr->is_array && instr->op != nir_texop_lod); 9624 9625 /* pack derivatives */ 9626 if (has_ddx || has_ddy) { 9627 RegClass rc = g16 ? v2b : v1; 9628 assert(a16 == g16 || ctx->options->gfx_level >= GFX10); 9629 std::array<Temp, 2> ddxddy = {ddx, ddy}; 9630 for (Temp tmp : ddxddy) { 9631 if (tmp == Temp()) 9632 continue; 9633 std::vector<Temp> unpacked = {tmp}; 9634 if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->gfx_level == GFX9) { 9635 assert(has_ddx && has_ddy); 9636 Temp zero = bld.copy(bld.def(rc), Operand::zero(rc.bytes())); 9637 unpacked.push_back(zero); 9638 } 9639 for (Temp derv : emit_pack_v1(ctx, unpacked)) 9640 derivs.push_back(derv); 9641 } 9642 has_derivs = true; 9643 } 9644 9645 bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array); 9646 9647 /* Build tex instruction */ 9648 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa) & 0xf; 9649 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) 9650 dmask = u_bit_consecutive(0, util_last_bit(dmask)); 9651 if (instr->is_sparse) 9652 dmask = MAX2(dmask, 1) | 0x10; 9653 unsigned dim = 9654 ctx->options->gfx_level >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF 9655 ? ac_get_sampler_dim(ctx->options->gfx_level, instr->sampler_dim, instr->is_array) 9656 : 0; 9657 bool d16 = instr->dest.ssa.bit_size == 16; 9658 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 9659 Temp tmp_dst = dst; 9660 9661 /* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */ 9662 if (instr->op == nir_texop_tg4) { 9663 assert(instr->dest.ssa.num_components == (4 + instr->is_sparse)); 9664 if (instr->is_shadow) 9665 dmask = 1; 9666 else 9667 dmask = 1 << instr->component; 9668 if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr) 9669 tmp_dst = bld.tmp(instr->is_sparse ? v5 : (d16 ? 
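         /* temporary gather4 result: 5 dwords with sparse residency, otherwise 2 dwords (d16) or 4 dwords */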
v2 : v4)); 9670 } else if (instr->op == nir_texop_fragment_mask_fetch_amd) { 9671 tmp_dst = bld.tmp(v1); 9672 } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || 9673 dst.type() == RegType::sgpr) { 9674 unsigned bytes = util_bitcount(dmask) * instr->dest.ssa.bit_size / 8; 9675 tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, bytes)); 9676 } 9677 9678 if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) { 9679 if (!has_lod) 9680 lod = bld.copy(bld.def(v1), Operand::zero()); 9681 9682 MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(tmp_dst), 9683 resource, Operand(s4), std::vector<Temp>{lod}); 9684 if (ctx->options->gfx_level == GFX9 && instr->op == nir_texop_txs && 9685 instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array) { 9686 tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1); 9687 } else if (instr->op == nir_texop_query_levels) { 9688 tex->dmask = 1 << 3; 9689 } else { 9690 tex->dmask = dmask; 9691 } 9692 tex->da = da; 9693 tex->dim = dim; 9694 9695 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask); 9696 return; 9697 } 9698 9699 Temp tg4_compare_cube_wa64 = Temp(); 9700 9701 if (tg4_integer_workarounds) { 9702 Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero()); 9703 Temp size = bld.tmp(v2); 9704 MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(size), 9705 resource, Operand(s4), std::vector<Temp>{tg4_lod}); 9706 tex->dim = dim; 9707 tex->dmask = 0x3; 9708 tex->da = da; 9709 emit_split_vector(ctx, size, size.size()); 9710 9711 Temp half_texel[2]; 9712 for (unsigned i = 0; i < 2; i++) { 9713 half_texel[i] = emit_extract_vector(ctx, size, i, v1); 9714 half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]); 9715 half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]); 9716 half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), 9717 Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]); 9718 } 9719 9720 if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) { 9721 /* In vulkan, whether the sampler uses unnormalized 9722 * coordinates or not is a dynamic property of the 9723 * sampler. Hence, to figure out whether or not we 9724 * need to divide by the texture size, we need to test 9725 * the sampler at runtime. This tests the bit set by 9726 * radv_init_sampler(). 
9727 */ 9728 unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1; 9729 Temp not_needed = 9730 bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), sampler, Operand::c32(bit_idx)); 9731 9732 not_needed = bool_to_vector_condition(ctx, not_needed); 9733 half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), 9734 Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed); 9735 half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), 9736 Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed); 9737 } 9738 9739 Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]), 9740 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])}; 9741 9742 if (tg4_integer_cube_workaround) { 9743 /* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */ 9744 Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp)); 9745 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>( 9746 aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())}; 9747 split->operands[0] = Operand(resource); 9748 for (unsigned i = 0; i < resource.size(); i++) { 9749 desc[i] = bld.tmp(s1); 9750 split->definitions[i] = Definition(desc[i]); 9751 } 9752 ctx->block->instructions.emplace_back(std::move(split)); 9753 9754 Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], 9755 Operand::c32(20u | (6u << 16))); 9756 Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt, 9757 Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8)); 9758 9759 Temp nfmt; 9760 if (instr->dest_type & nir_type_uint) { 9761 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), 9762 Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED), 9763 Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa)); 9764 } else { 9765 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), 9766 Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED), 9767 Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa)); 9768 } 9769 tg4_compare_cube_wa64 = bld.tmp(bld.lm); 9770 bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64); 9771 9772 nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, 9773 Operand::c32(26u)); 9774 9775 desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1], 9776 Operand::c32(C_008F14_NUM_FORMAT)); 9777 desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt); 9778 9779 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>( 9780 aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)}; 9781 for (unsigned i = 0; i < resource.size(); i++) 9782 vec->operands[i] = Operand(desc[i]); 9783 resource = bld.tmp(resource.regClass()); 9784 vec->definitions[0] = Definition(resource); 9785 ctx->block->instructions.emplace_back(std::move(vec)); 9786 9787 new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0], 9788 tg4_compare_cube_wa64); 9789 new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1], 9790 tg4_compare_cube_wa64); 9791 } 9792 coords[0] = new_coords[0]; 9793 coords[1] = new_coords[1]; 9794 } 9795 9796 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { 9797 // FIXME: if (ctx->abi->gfx9_stride_size_workaround) return 9798 // ac_build_buffer_load_format_gfx9_safe() 9799 9800 assert(coords.size() == 1); 9801 aco_opcode op; 9802 if (d16) { 9803 switch (util_last_bit(dmask & 0xf)) { 9804 case 1: op = 
aco_opcode::buffer_load_format_d16_x; break; 9805 case 2: op = aco_opcode::buffer_load_format_d16_xy; break; 9806 case 3: op = aco_opcode::buffer_load_format_d16_xyz; break; 9807 case 4: op = aco_opcode::buffer_load_format_d16_xyzw; break; 9808 default: unreachable("Tex instruction loads more than 4 components."); 9809 } 9810 } else { 9811 switch (util_last_bit(dmask & 0xf)) { 9812 case 1: op = aco_opcode::buffer_load_format_x; break; 9813 case 2: op = aco_opcode::buffer_load_format_xy; break; 9814 case 3: op = aco_opcode::buffer_load_format_xyz; break; 9815 case 4: op = aco_opcode::buffer_load_format_xyzw; break; 9816 default: unreachable("Tex instruction loads more than 4 components."); 9817 } 9818 } 9819 9820 aco_ptr<MUBUF_instruction> mubuf{ 9821 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3 + instr->is_sparse, 1)}; 9822 mubuf->operands[0] = Operand(resource); 9823 mubuf->operands[1] = Operand(coords[0]); 9824 mubuf->operands[2] = Operand::c32(0); 9825 mubuf->definitions[0] = Definition(tmp_dst); 9826 mubuf->idxen = true; 9827 mubuf->tfe = instr->is_sparse; 9828 if (mubuf->tfe) 9829 mubuf->operands[3] = emit_tfe_init(bld, tmp_dst); 9830 ctx->block->instructions.emplace_back(std::move(mubuf)); 9831 9832 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask); 9833 return; 9834 } 9835 9836 /* gather MIMG address components */ 9837 std::vector<Temp> args; 9838 unsigned wqm_mask = 0; 9839 if (has_offset) { 9840 wqm_mask |= u_bit_consecutive(args.size(), 1); 9841 args.emplace_back(offset); 9842 } 9843 if (has_bias) 9844 args.emplace_back(emit_pack_v1(ctx, {bias})[0]); 9845 if (has_compare) 9846 args.emplace_back(compare); 9847 if (has_derivs) 9848 args.insert(args.end(), derivs.begin(), derivs.end()); 9849 9850 wqm_mask |= u_bit_consecutive(args.size(), wqm_coord_count); 9851 args.insert(args.end(), coords.begin(), coords.end()); 9852 9853 if (instr->op == nir_texop_txf || instr->op == nir_texop_fragment_fetch_amd || 9854 instr->op == nir_texop_fragment_mask_fetch_amd) { 9855 aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || 9856 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS 9857 ? aco_opcode::image_load 9858 : aco_opcode::image_load_mip; 9859 Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1); 9860 MIMG_instruction* tex = 9861 emit_mimg(bld, op, Definition(tmp_dst), resource, Operand(s4), args, 0, vdata); 9862 if (instr->op == nir_texop_fragment_mask_fetch_amd) 9863 tex->dim = da ? ac_image_2darray : ac_image_2d; 9864 else 9865 tex->dim = dim; 9866 tex->dmask = dmask & 0xf; 9867 tex->unrm = true; 9868 tex->da = da; 9869 tex->tfe = instr->is_sparse; 9870 tex->d16 = d16; 9871 tex->a16 = a16; 9872 9873 if (instr->op == nir_texop_fragment_mask_fetch_amd) { 9874 /* Use 0x76543210 if the image doesn't have FMASK. 
*/ 9875 assert(dmask == 1 && dst.bytes() == 4); 9876 assert(dst.id() != tmp_dst.id()); 9877 9878 if (dst.regClass() == s1) { 9879 Temp is_not_null = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand::zero(), 9880 emit_extract_vector(ctx, resource, 1, s1)); 9881 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), 9882 bld.as_uniform(tmp_dst), Operand::c32(0x76543210), 9883 bld.scc(is_not_null)); 9884 } else { 9885 Temp is_not_null = bld.tmp(bld.lm); 9886 bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(is_not_null), Operand::zero(), 9887 emit_extract_vector(ctx, resource, 1, s1)); 9888 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), 9889 bld.copy(bld.def(v1), Operand::c32(0x76543210)), tmp_dst, is_not_null); 9890 } 9891 } else { 9892 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask); 9893 } 9894 return; 9895 } 9896 9897 bool separate_g16 = ctx->options->gfx_level >= GFX10 && g16; 9898 9899 // TODO: would be better to do this by adding offsets, but needs the opcodes ordered. 9900 aco_opcode opcode = aco_opcode::image_sample; 9901 if (has_offset) { /* image_sample_*_o */ 9902 if (has_clamped_lod) { 9903 if (has_compare) { 9904 opcode = aco_opcode::image_sample_c_cl_o; 9905 if (separate_g16) 9906 opcode = aco_opcode::image_sample_c_d_cl_o_g16; 9907 else if (has_derivs) 9908 opcode = aco_opcode::image_sample_c_d_cl_o; 9909 if (has_bias) 9910 opcode = aco_opcode::image_sample_c_b_cl_o; 9911 } else { 9912 opcode = aco_opcode::image_sample_cl_o; 9913 if (separate_g16) 9914 opcode = aco_opcode::image_sample_d_cl_o_g16; 9915 else if (has_derivs) 9916 opcode = aco_opcode::image_sample_d_cl_o; 9917 if (has_bias) 9918 opcode = aco_opcode::image_sample_b_cl_o; 9919 } 9920 } else if (has_compare) { 9921 opcode = aco_opcode::image_sample_c_o; 9922 if (separate_g16) 9923 opcode = aco_opcode::image_sample_c_d_o_g16; 9924 else if (has_derivs) 9925 opcode = aco_opcode::image_sample_c_d_o; 9926 if (has_bias) 9927 opcode = aco_opcode::image_sample_c_b_o; 9928 if (level_zero) 9929 opcode = aco_opcode::image_sample_c_lz_o; 9930 if (has_lod) 9931 opcode = aco_opcode::image_sample_c_l_o; 9932 } else { 9933 opcode = aco_opcode::image_sample_o; 9934 if (separate_g16) 9935 opcode = aco_opcode::image_sample_d_o_g16; 9936 else if (has_derivs) 9937 opcode = aco_opcode::image_sample_d_o; 9938 if (has_bias) 9939 opcode = aco_opcode::image_sample_b_o; 9940 if (level_zero) 9941 opcode = aco_opcode::image_sample_lz_o; 9942 if (has_lod) 9943 opcode = aco_opcode::image_sample_l_o; 9944 } 9945 } else if (has_clamped_lod) { /* image_sample_*_cl */ 9946 if (has_compare) { 9947 opcode = aco_opcode::image_sample_c_cl; 9948 if (separate_g16) 9949 opcode = aco_opcode::image_sample_c_d_cl_g16; 9950 else if (has_derivs) 9951 opcode = aco_opcode::image_sample_c_d_cl; 9952 if (has_bias) 9953 opcode = aco_opcode::image_sample_c_b_cl; 9954 } else { 9955 opcode = aco_opcode::image_sample_cl; 9956 if (separate_g16) 9957 opcode = aco_opcode::image_sample_d_cl_g16; 9958 else if (has_derivs) 9959 opcode = aco_opcode::image_sample_d_cl; 9960 if (has_bias) 9961 opcode = aco_opcode::image_sample_b_cl; 9962 } 9963 } else { /* no offset */ 9964 if (has_compare) { 9965 opcode = aco_opcode::image_sample_c; 9966 if (separate_g16) 9967 opcode = aco_opcode::image_sample_c_d_g16; 9968 else if (has_derivs) 9969 opcode = aco_opcode::image_sample_c_d; 9970 if (has_bias) 9971 opcode = aco_opcode::image_sample_c_b; 9972 if (level_zero) 9973 opcode = aco_opcode::image_sample_c_lz; 9974 if (has_lod) 9975 opcode = 
aco_opcode::image_sample_c_l; 9976 } else { 9977 opcode = aco_opcode::image_sample; 9978 if (separate_g16) 9979 opcode = aco_opcode::image_sample_d_g16; 9980 else if (has_derivs) 9981 opcode = aco_opcode::image_sample_d; 9982 if (has_bias) 9983 opcode = aco_opcode::image_sample_b; 9984 if (level_zero) 9985 opcode = aco_opcode::image_sample_lz; 9986 if (has_lod) 9987 opcode = aco_opcode::image_sample_l; 9988 } 9989 } 9990 9991 if (instr->op == nir_texop_tg4) { 9992 if (has_offset) { /* image_gather4_*_o */ 9993 if (has_compare) { 9994 opcode = aco_opcode::image_gather4_c_lz_o; 9995 if (has_lod) 9996 opcode = aco_opcode::image_gather4_c_l_o; 9997 if (has_bias) 9998 opcode = aco_opcode::image_gather4_c_b_o; 9999 } else { 10000 opcode = aco_opcode::image_gather4_lz_o; 10001 if (has_lod) 10002 opcode = aco_opcode::image_gather4_l_o; 10003 if (has_bias) 10004 opcode = aco_opcode::image_gather4_b_o; 10005 } 10006 } else { 10007 if (has_compare) { 10008 opcode = aco_opcode::image_gather4_c_lz; 10009 if (has_lod) 10010 opcode = aco_opcode::image_gather4_c_l; 10011 if (has_bias) 10012 opcode = aco_opcode::image_gather4_c_b; 10013 } else { 10014 opcode = aco_opcode::image_gather4_lz; 10015 if (has_lod) 10016 opcode = aco_opcode::image_gather4_l; 10017 if (has_bias) 10018 opcode = aco_opcode::image_gather4_b; 10019 } 10020 } 10021 } else if (instr->op == nir_texop_lod) { 10022 opcode = aco_opcode::image_get_lod; 10023 } 10024 10025 bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod && 10026 !level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS && 10027 instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS; 10028 10029 Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1); 10030 MIMG_instruction* tex = emit_mimg(bld, opcode, Definition(tmp_dst), resource, Operand(sampler), 10031 args, implicit_derivs ? wqm_mask : 0, vdata); 10032 tex->dim = dim; 10033 tex->dmask = dmask & 0xf; 10034 tex->da = da; 10035 tex->tfe = instr->is_sparse; 10036 tex->d16 = d16; 10037 tex->a16 = a16; 10038 10039 if (tg4_integer_cube_workaround) { 10040 assert(tmp_dst.id() != dst.id()); 10041 assert(tmp_dst.size() == dst.size()); 10042 10043 emit_split_vector(ctx, tmp_dst, tmp_dst.size()); 10044 Temp val[4]; 10045 for (unsigned i = 0; i < 4; i++) { 10046 val[i] = emit_extract_vector(ctx, tmp_dst, i, v1); 10047 Temp cvt_val; 10048 if (instr->dest_type & nir_type_uint) 10049 cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]); 10050 else 10051 cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]); 10052 val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, 10053 tg4_compare_cube_wa64); 10054 } 10055 10056 Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass()); 10057 if (instr->is_sparse) 10058 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2], 10059 val[3], emit_extract_vector(ctx, tmp_dst, 4, v1)); 10060 else 10061 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2], 10062 val[3]); 10063 } 10064 unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 
0x1F : 0xF) : dmask; 10065 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask); 10066} 10067 10068Operand 10069get_phi_operand(isel_context* ctx, nir_ssa_def* ssa, RegClass rc, bool logical) 10070{ 10071 Temp tmp = get_ssa_temp(ctx, ssa); 10072 if (ssa->parent_instr->type == nir_instr_type_ssa_undef) { 10073 return Operand(rc); 10074 } else if (logical && ssa->bit_size == 1 && 10075 ssa->parent_instr->type == nir_instr_type_load_const) { 10076 if (ctx->program->wave_size == 64) 10077 return Operand::c64(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX 10078 : 0u); 10079 else 10080 return Operand::c32(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT32_MAX 10081 : 0u); 10082 } else { 10083 return Operand(tmp); 10084 } 10085} 10086 10087void 10088visit_phi(isel_context* ctx, nir_phi_instr* instr) 10089{ 10090 aco_ptr<Pseudo_instruction> phi; 10091 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 10092 assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask); 10093 10094 bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest); 10095 logical |= (ctx->block->kind & block_kind_merge) != 0; 10096 aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi; 10097 10098 /* we want a sorted list of sources, since the predecessor list is also sorted */ 10099 std::map<unsigned, nir_ssa_def*> phi_src; 10100 nir_foreach_phi_src (src, instr) 10101 phi_src[src->pred->index] = src->src.ssa; 10102 10103 std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds; 10104 unsigned num_operands = 0; 10105 Operand* const operands = (Operand*)alloca( 10106 (std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1) * sizeof(Operand)); 10107 unsigned num_defined = 0; 10108 unsigned cur_pred_idx = 0; 10109 for (std::pair<unsigned, nir_ssa_def*> src : phi_src) { 10110 if (cur_pred_idx < preds.size()) { 10111 /* handle missing preds (IF merges with discard/break) and extra preds 10112 * (loop exit with discard) */ 10113 unsigned block = ctx->cf_info.nir_to_aco[src.first]; 10114 unsigned skipped = 0; 10115 while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block) 10116 skipped++; 10117 if (cur_pred_idx + skipped < preds.size()) { 10118 for (unsigned i = 0; i < skipped; i++) 10119 operands[num_operands++] = Operand(dst.regClass()); 10120 cur_pred_idx += skipped; 10121 } else { 10122 continue; 10123 } 10124 } 10125 /* Handle missing predecessors at the end. This shouldn't happen with loop 10126 * headers and we can't ignore these sources for loop header phis. */ 10127 if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size()) 10128 continue; 10129 cur_pred_idx++; 10130 Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical); 10131 operands[num_operands++] = op; 10132 num_defined += !op.isUndefined(); 10133 } 10134 /* handle block_kind_continue_or_break at loop exit blocks */ 10135 while (cur_pred_idx++ < preds.size()) 10136 operands[num_operands++] = Operand(dst.regClass()); 10137 10138 /* If the loop ends with a break, still add a linear continue edge in case 10139 * that break is divergent or continue_or_break is used. We'll either remove 10140 * this operand later in visit_loop() if it's not necessary or replace the 10141 * undef with something correct. 
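 * (The operand for that edge is added as an undef below so the linear phi keeps one
 * operand per linear predecessor; visit_loop() later either pops it again or fills it
 * in via create_continue_phis().)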
*/ 10142 if (!logical && ctx->block->kind & block_kind_loop_header) { 10143 nir_loop* loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent); 10144 nir_block* last = nir_loop_last_block(loop); 10145 if (last->successors[0] != instr->instr.block) 10146 operands[num_operands++] = Operand(RegClass()); 10147 } 10148 10149 /* we can use a linear phi in some cases if one src is undef */ 10150 if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) { 10151 phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, 10152 num_operands, 1)); 10153 10154 Block* linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]]; 10155 Block* invert = &ctx->program->blocks[linear_else->linear_preds[0]]; 10156 assert(invert->kind & block_kind_invert); 10157 10158 unsigned then_block = invert->linear_preds[0]; 10159 10160 Block* insert_block = NULL; 10161 for (unsigned i = 0; i < num_operands; i++) { 10162 Operand op = operands[i]; 10163 if (op.isUndefined()) 10164 continue; 10165 insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block; 10166 phi->operands[0] = op; 10167 break; 10168 } 10169 assert(insert_block); /* should be handled by the "num_defined == 0" case above */ 10170 phi->operands[1] = Operand(dst.regClass()); 10171 phi->definitions[0] = Definition(dst); 10172 insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi)); 10173 return; 10174 } 10175 10176 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1)); 10177 for (unsigned i = 0; i < num_operands; i++) 10178 phi->operands[i] = operands[i]; 10179 phi->definitions[0] = Definition(dst); 10180 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi)); 10181} 10182 10183void 10184visit_undef(isel_context* ctx, nir_ssa_undef_instr* instr) 10185{ 10186 Temp dst = get_ssa_temp(ctx, &instr->def); 10187 10188 assert(dst.type() == RegType::sgpr); 10189 10190 if (dst.size() == 1) { 10191 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero()); 10192 } else { 10193 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 10194 aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; 10195 for (unsigned i = 0; i < dst.size(); i++) 10196 vec->operands[i] = Operand::zero(); 10197 vec->definitions[0] = Definition(dst); 10198 ctx->block->instructions.emplace_back(std::move(vec)); 10199 } 10200} 10201 10202void 10203begin_loop(isel_context* ctx, loop_context* lc) 10204{ 10205 // TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true 10206 append_logical_end(ctx->block); 10207 ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform; 10208 Builder bld(ctx->program, ctx->block); 10209 bld.branch(aco_opcode::p_branch, bld.def(s2)); 10210 unsigned loop_preheader_idx = ctx->block->index; 10211 10212 lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level)); 10213 10214 ctx->program->next_loop_depth++; 10215 10216 Block* loop_header = ctx->program->create_and_insert_block(); 10217 loop_header->kind |= block_kind_loop_header; 10218 add_edge(loop_preheader_idx, loop_header); 10219 ctx->block = loop_header; 10220 10221 append_logical_start(ctx->block); 10222 10223 lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index); 10224 lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit); 10225 lc->divergent_cont_old = 
std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false);
10226    lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false);
10227    lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false);
10228}
10229
10230void
10231end_loop(isel_context* ctx, loop_context* lc)
10232{
10233   // TODO: what if a loop ends with an unconditional or uniformly branched continue
10234   // and this branch is never taken?
10235   if (!ctx->cf_info.has_branch) {
10236      unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
10237      Builder bld(ctx->program, ctx->block);
10238      append_logical_end(ctx->block);
10239
10240      if (ctx->cf_info.exec_potentially_empty_discard ||
10241          ctx->cf_info.exec_potentially_empty_break) {
10242         /* Discards can result in code running with an empty exec mask.
10243          * This would result in divergent breaks not ever being taken. As a
10244          * workaround, break the loop when the loop mask is empty instead of
10245          * always continuing. */
10246         ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
10247         unsigned block_idx = ctx->block->index;
10248
10249         /* create helper blocks to avoid critical edges */
10250         Block* break_block = ctx->program->create_and_insert_block();
10251         break_block->kind = block_kind_uniform;
10252         bld.reset(break_block);
10253         bld.branch(aco_opcode::p_branch, bld.def(s2));
10254         add_linear_edge(block_idx, break_block);
10255         add_linear_edge(break_block->index, &lc->loop_exit);
10256
10257         Block* continue_block = ctx->program->create_and_insert_block();
10258         continue_block->kind = block_kind_uniform;
10259         bld.reset(continue_block);
10260         bld.branch(aco_opcode::p_branch, bld.def(s2));
10261         add_linear_edge(block_idx, continue_block);
10262         add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
10263
10264         if (!ctx->cf_info.parent_loop.has_divergent_branch)
10265            add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
10266         ctx->block = &ctx->program->blocks[block_idx];
10267      } else {
10268         ctx->block->kind |= (block_kind_continue | block_kind_uniform);
10269         if (!ctx->cf_info.parent_loop.has_divergent_branch)
10270            add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10271         else
10272            add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10273      }
10274
10275      bld.reset(ctx->block);
10276      bld.branch(aco_opcode::p_branch, bld.def(s2));
10277   }
10278
10279   ctx->cf_info.has_branch = false;
10280   ctx->program->next_loop_depth--;
10281
10282   // TODO: if the loop does not have a single exit, we must add one
10283   /* emit loop successor block */
10284   ctx->block = ctx->program->insert_block(std::move(lc->loop_exit));
10285   append_logical_start(ctx->block);
10286
10287#if 0
10288   // TODO: check if it is beneficial to not branch on continues
10289   /* trim linear phis in loop header */
10290   for (auto&& instr : loop_entry->instructions) {
10291      if (instr->opcode == aco_opcode::p_linear_phi) {
10292         aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
10293         new_phi->definitions[0] = instr->definitions[0];
10294         for (unsigned i = 0; i < new_phi->operands.size(); i++)
10295            new_phi->operands[i] = instr->operands[i];
10296         /* check that the remaining operands are all the same */
10297         for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
10298            assert(instr->operands[i].tempId() == 
instr->operands.back().tempId()); 10299 instr.swap(new_phi); 10300 } else if (instr->opcode == aco_opcode::p_phi) { 10301 continue; 10302 } else { 10303 break; 10304 } 10305 } 10306#endif 10307 10308 ctx->cf_info.parent_loop.header_idx = lc->header_idx_old; 10309 ctx->cf_info.parent_loop.exit = lc->exit_old; 10310 ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old; 10311 ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old; 10312 ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old; 10313 if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) 10314 ctx->cf_info.exec_potentially_empty_discard = false; 10315} 10316 10317void 10318emit_loop_jump(isel_context* ctx, bool is_break) 10319{ 10320 Builder bld(ctx->program, ctx->block); 10321 Block* logical_target; 10322 append_logical_end(ctx->block); 10323 unsigned idx = ctx->block->index; 10324 10325 if (is_break) { 10326 logical_target = ctx->cf_info.parent_loop.exit; 10327 add_logical_edge(idx, logical_target); 10328 ctx->block->kind |= block_kind_break; 10329 10330 if (!ctx->cf_info.parent_if.is_divergent && 10331 !ctx->cf_info.parent_loop.has_divergent_continue) { 10332 /* uniform break - directly jump out of the loop */ 10333 ctx->block->kind |= block_kind_uniform; 10334 ctx->cf_info.has_branch = true; 10335 bld.branch(aco_opcode::p_branch, bld.def(s2)); 10336 add_linear_edge(idx, logical_target); 10337 return; 10338 } 10339 ctx->cf_info.parent_loop.has_divergent_branch = true; 10340 } else { 10341 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx]; 10342 add_logical_edge(idx, logical_target); 10343 ctx->block->kind |= block_kind_continue; 10344 10345 if (!ctx->cf_info.parent_if.is_divergent) { 10346 /* uniform continue - directly jump to the loop header */ 10347 ctx->block->kind |= block_kind_uniform; 10348 ctx->cf_info.has_branch = true; 10349 bld.branch(aco_opcode::p_branch, bld.def(s2)); 10350 add_linear_edge(idx, logical_target); 10351 return; 10352 } 10353 10354 /* for potential uniform breaks after this continue, 10355 we must ensure that they are handled correctly */ 10356 ctx->cf_info.parent_loop.has_divergent_continue = true; 10357 ctx->cf_info.parent_loop.has_divergent_branch = true; 10358 } 10359 10360 if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) { 10361 ctx->cf_info.exec_potentially_empty_break = true; 10362 ctx->cf_info.exec_potentially_empty_break_depth = ctx->block->loop_nest_depth; 10363 } 10364 10365 /* remove critical edges from linear CFG */ 10366 bld.branch(aco_opcode::p_branch, bld.def(s2)); 10367 Block* break_block = ctx->program->create_and_insert_block(); 10368 break_block->kind |= block_kind_uniform; 10369 add_linear_edge(idx, break_block); 10370 /* the loop_header pointer might be invalidated by this point */ 10371 if (!is_break) 10372 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx]; 10373 add_linear_edge(break_block->index, logical_target); 10374 bld.reset(break_block); 10375 bld.branch(aco_opcode::p_branch, bld.def(s2)); 10376 10377 Block* continue_block = ctx->program->create_and_insert_block(); 10378 add_linear_edge(idx, continue_block); 10379 append_logical_start(continue_block); 10380 ctx->block = continue_block; 10381} 10382 10383void 10384emit_loop_break(isel_context* ctx) 10385{ 10386 emit_loop_jump(ctx, true); 10387} 10388 10389void 10390emit_loop_continue(isel_context* ctx) 10391{ 10392 emit_loop_jump(ctx, false); 10393} 10394 10395void 
10396visit_jump(isel_context* ctx, nir_jump_instr* instr) 10397{ 10398 /* visit_block() would usually do this but divergent jumps updates ctx->block */ 10399 ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index; 10400 10401 switch (instr->type) { 10402 case nir_jump_break: emit_loop_break(ctx); break; 10403 case nir_jump_continue: emit_loop_continue(ctx); break; 10404 default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort(); 10405 } 10406} 10407 10408void 10409visit_block(isel_context* ctx, nir_block* block) 10410{ 10411 nir_foreach_instr (instr, block) { 10412 switch (instr->type) { 10413 case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break; 10414 case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break; 10415 case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break; 10416 case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break; 10417 case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break; 10418 case nir_instr_type_ssa_undef: visit_undef(ctx, nir_instr_as_ssa_undef(instr)); break; 10419 case nir_instr_type_deref: break; 10420 case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break; 10421 default: isel_err(instr, "Unknown NIR instr type"); 10422 } 10423 } 10424 10425 if (!ctx->cf_info.parent_loop.has_divergent_branch) 10426 ctx->cf_info.nir_to_aco[block->index] = ctx->block->index; 10427} 10428 10429static Operand 10430create_continue_phis(isel_context* ctx, unsigned first, unsigned last, 10431 aco_ptr<Instruction>& header_phi, Operand* vals) 10432{ 10433 vals[0] = Operand(header_phi->definitions[0].getTemp()); 10434 RegClass rc = vals[0].regClass(); 10435 10436 unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth; 10437 10438 unsigned next_pred = 1; 10439 10440 for (unsigned idx = first + 1; idx <= last; idx++) { 10441 Block& block = ctx->program->blocks[idx]; 10442 if (block.loop_nest_depth != loop_nest_depth) { 10443 vals[idx - first] = vals[idx - 1 - first]; 10444 continue; 10445 } 10446 10447 if ((block.kind & block_kind_continue) && block.index != last) { 10448 vals[idx - first] = header_phi->operands[next_pred]; 10449 next_pred++; 10450 continue; 10451 } 10452 10453 bool all_same = true; 10454 for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++) 10455 all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first]; 10456 10457 Operand val; 10458 if (all_same) { 10459 val = vals[block.linear_preds[0] - first]; 10460 } else { 10461 aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>( 10462 aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1)); 10463 for (unsigned i = 0; i < block.linear_preds.size(); i++) 10464 phi->operands[i] = vals[block.linear_preds[i] - first]; 10465 val = Operand(ctx->program->allocateTmp(rc)); 10466 phi->definitions[0] = Definition(val.getTemp()); 10467 block.instructions.emplace(block.instructions.begin(), std::move(phi)); 10468 } 10469 vals[idx - first] = val; 10470 } 10471 10472 return vals[last - first]; 10473} 10474 10475static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond); 10476static void begin_uniform_if_else(isel_context* ctx, if_context* ic); 10477static void end_uniform_if(isel_context* ctx, if_context* ic); 10478 10479static void 10480visit_loop(isel_context* ctx, nir_loop* loop) 10481{ 10482 loop_context lc; 10483 begin_loop(ctx, &lc); 10484 10485 /* NIR seems to 
allow this, and even though the loop exit has no predecessors, SSA defs from the 10486 * loop header are live. Handle this without complicating the ACO IR by creating a dummy break. 10487 */ 10488 if (nir_cf_node_cf_tree_next(&loop->cf_node)->predecessors->entries == 0) { 10489 Builder bld(ctx->program, ctx->block); 10490 Temp cond = bld.copy(bld.def(s1, scc), Operand::zero()); 10491 if_context ic; 10492 begin_uniform_if_then(ctx, &ic, cond); 10493 emit_loop_break(ctx); 10494 begin_uniform_if_else(ctx, &ic); 10495 end_uniform_if(ctx, &ic); 10496 } 10497 10498 bool unreachable = visit_cf_list(ctx, &loop->body); 10499 10500 unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx; 10501 10502 /* Fixup phis in loop header from unreachable blocks. 10503 * has_branch/has_divergent_branch also indicates if the loop ends with a 10504 * break/continue instruction, but we don't emit those if unreachable=true */ 10505 if (unreachable) { 10506 assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch); 10507 bool linear = ctx->cf_info.has_branch; 10508 bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch; 10509 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) { 10510 if ((logical && instr->opcode == aco_opcode::p_phi) || 10511 (linear && instr->opcode == aco_opcode::p_linear_phi)) { 10512 /* the last operand should be the one that needs to be removed */ 10513 instr->operands.pop_back(); 10514 } else if (!is_phi(instr)) { 10515 break; 10516 } 10517 } 10518 } 10519 10520 /* Fixup linear phis in loop header from expecting a continue. Both this fixup 10521 * and the previous one shouldn't both happen at once because a break in the 10522 * merge block would get CSE'd */ 10523 if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) { 10524 unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1); 10525 Operand* const vals = (Operand*)alloca(num_vals * sizeof(Operand)); 10526 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) { 10527 if (instr->opcode == aco_opcode::p_linear_phi) { 10528 if (ctx->cf_info.has_branch) 10529 instr->operands.pop_back(); 10530 else 10531 instr->operands.back() = 10532 create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals); 10533 } else if (!is_phi(instr)) { 10534 break; 10535 } 10536 } 10537 } 10538 10539 end_loop(ctx, &lc); 10540} 10541 10542static void 10543begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond) 10544{ 10545 ic->cond = cond; 10546 10547 append_logical_end(ctx->block); 10548 ctx->block->kind |= block_kind_branch; 10549 10550 /* branch to linear then block */ 10551 assert(cond.regClass() == ctx->program->lane_mask); 10552 aco_ptr<Pseudo_branch_instruction> branch; 10553 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, 10554 Format::PSEUDO_BRANCH, 1, 1)); 10555 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); 10556 branch->operands[0] = Operand(cond); 10557 ctx->block->instructions.push_back(std::move(branch)); 10558 10559 ic->BB_if_idx = ctx->block->index; 10560 ic->BB_invert = Block(); 10561 /* Invert blocks are intentionally not marked as top level because they 10562 * are not part of the logical cfg. 
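 * They only exist in the linear CFG, between the then and else sides of a divergent if,
 * and are where the exec mask gets inverted before the else side runs.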
*/ 10563 ic->BB_invert.kind |= block_kind_invert; 10564 ic->BB_endif = Block(); 10565 ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level)); 10566 10567 ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard; 10568 ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break; 10569 ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth; 10570 ic->divergent_old = ctx->cf_info.parent_if.is_divergent; 10571 ctx->cf_info.parent_if.is_divergent = true; 10572 10573 /* divergent branches use cbranch_execz */ 10574 ctx->cf_info.exec_potentially_empty_discard = false; 10575 ctx->cf_info.exec_potentially_empty_break = false; 10576 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; 10577 10578 /** emit logical then block */ 10579 ctx->program->next_divergent_if_logical_depth++; 10580 Block* BB_then_logical = ctx->program->create_and_insert_block(); 10581 add_edge(ic->BB_if_idx, BB_then_logical); 10582 ctx->block = BB_then_logical; 10583 append_logical_start(BB_then_logical); 10584} 10585 10586static void 10587begin_divergent_if_else(isel_context* ctx, if_context* ic) 10588{ 10589 Block* BB_then_logical = ctx->block; 10590 append_logical_end(BB_then_logical); 10591 /* branch from logical then block to invert block */ 10592 aco_ptr<Pseudo_branch_instruction> branch; 10593 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, 10594 Format::PSEUDO_BRANCH, 0, 1)); 10595 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); 10596 BB_then_logical->instructions.emplace_back(std::move(branch)); 10597 add_linear_edge(BB_then_logical->index, &ic->BB_invert); 10598 if (!ctx->cf_info.parent_loop.has_divergent_branch) 10599 add_logical_edge(BB_then_logical->index, &ic->BB_endif); 10600 BB_then_logical->kind |= block_kind_uniform; 10601 assert(!ctx->cf_info.has_branch); 10602 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch; 10603 ctx->cf_info.parent_loop.has_divergent_branch = false; 10604 ctx->program->next_divergent_if_logical_depth--; 10605 10606 /** emit linear then block */ 10607 Block* BB_then_linear = ctx->program->create_and_insert_block(); 10608 BB_then_linear->kind |= block_kind_uniform; 10609 add_linear_edge(ic->BB_if_idx, BB_then_linear); 10610 /* branch from linear then block to invert block */ 10611 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, 10612 Format::PSEUDO_BRANCH, 0, 1)); 10613 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); 10614 BB_then_linear->instructions.emplace_back(std::move(branch)); 10615 add_linear_edge(BB_then_linear->index, &ic->BB_invert); 10616 10617 /** emit invert merge block */ 10618 ctx->block = ctx->program->insert_block(std::move(ic->BB_invert)); 10619 ic->invert_idx = ctx->block->index; 10620 10621 /* branch to linear else block (skip else) */ 10622 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, 10623 Format::PSEUDO_BRANCH, 0, 1)); 10624 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); 10625 ctx->block->instructions.push_back(std::move(branch)); 10626 10627 ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard; 10628 ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break; 10629 ic->exec_potentially_empty_break_depth_old = std::min( 10630 ic->exec_potentially_empty_break_depth_old, 
ctx->cf_info.exec_potentially_empty_break_depth); 10631 /* divergent branches use cbranch_execz */ 10632 ctx->cf_info.exec_potentially_empty_discard = false; 10633 ctx->cf_info.exec_potentially_empty_break = false; 10634 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; 10635 10636 /** emit logical else block */ 10637 ctx->program->next_divergent_if_logical_depth++; 10638 Block* BB_else_logical = ctx->program->create_and_insert_block(); 10639 add_logical_edge(ic->BB_if_idx, BB_else_logical); 10640 add_linear_edge(ic->invert_idx, BB_else_logical); 10641 ctx->block = BB_else_logical; 10642 append_logical_start(BB_else_logical); 10643} 10644 10645static void 10646end_divergent_if(isel_context* ctx, if_context* ic) 10647{ 10648 Block* BB_else_logical = ctx->block; 10649 append_logical_end(BB_else_logical); 10650 10651 /* branch from logical else block to endif block */ 10652 aco_ptr<Pseudo_branch_instruction> branch; 10653 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, 10654 Format::PSEUDO_BRANCH, 0, 1)); 10655 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); 10656 BB_else_logical->instructions.emplace_back(std::move(branch)); 10657 add_linear_edge(BB_else_logical->index, &ic->BB_endif); 10658 if (!ctx->cf_info.parent_loop.has_divergent_branch) 10659 add_logical_edge(BB_else_logical->index, &ic->BB_endif); 10660 BB_else_logical->kind |= block_kind_uniform; 10661 ctx->program->next_divergent_if_logical_depth--; 10662 10663 assert(!ctx->cf_info.has_branch); 10664 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent; 10665 10666 /** emit linear else block */ 10667 Block* BB_else_linear = ctx->program->create_and_insert_block(); 10668 BB_else_linear->kind |= block_kind_uniform; 10669 add_linear_edge(ic->invert_idx, BB_else_linear); 10670 10671 /* branch from linear else block to endif block */ 10672 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, 10673 Format::PSEUDO_BRANCH, 0, 1)); 10674 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); 10675 BB_else_linear->instructions.emplace_back(std::move(branch)); 10676 add_linear_edge(BB_else_linear->index, &ic->BB_endif); 10677 10678 /** emit endif merge block */ 10679 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif)); 10680 append_logical_start(ctx->block); 10681 10682 ctx->cf_info.parent_if.is_divergent = ic->divergent_old; 10683 ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old; 10684 ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old; 10685 ctx->cf_info.exec_potentially_empty_break_depth = std::min( 10686 ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth); 10687 if (ctx->block->loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth && 10688 !ctx->cf_info.parent_if.is_divergent) { 10689 ctx->cf_info.exec_potentially_empty_break = false; 10690 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; 10691 } 10692 /* uniform control flow never has an empty exec-mask */ 10693 if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) { 10694 ctx->cf_info.exec_potentially_empty_discard = false; 10695 ctx->cf_info.exec_potentially_empty_break = false; 10696 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; 10697 } 10698} 10699 10700static void 10701begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond) 10702{ 10703 assert(cond.regClass() == 
s1); 10704 10705 append_logical_end(ctx->block); 10706 ctx->block->kind |= block_kind_uniform; 10707 10708 aco_ptr<Pseudo_branch_instruction> branch; 10709 aco_opcode branch_opcode = aco_opcode::p_cbranch_z; 10710 branch.reset( 10711 create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 1)); 10712 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); 10713 branch->operands[0] = Operand(cond); 10714 branch->operands[0].setFixed(scc); 10715 ctx->block->instructions.emplace_back(std::move(branch)); 10716 10717 ic->BB_if_idx = ctx->block->index; 10718 ic->BB_endif = Block(); 10719 ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level; 10720 10721 ctx->cf_info.has_branch = false; 10722 ctx->cf_info.parent_loop.has_divergent_branch = false; 10723 10724 /** emit then block */ 10725 ctx->program->next_uniform_if_depth++; 10726 Block* BB_then = ctx->program->create_and_insert_block(); 10727 add_edge(ic->BB_if_idx, BB_then); 10728 append_logical_start(BB_then); 10729 ctx->block = BB_then; 10730} 10731 10732static void 10733begin_uniform_if_else(isel_context* ctx, if_context* ic) 10734{ 10735 Block* BB_then = ctx->block; 10736 10737 ic->uniform_has_then_branch = ctx->cf_info.has_branch; 10738 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch; 10739 10740 if (!ic->uniform_has_then_branch) { 10741 append_logical_end(BB_then); 10742 /* branch from then block to endif block */ 10743 aco_ptr<Pseudo_branch_instruction> branch; 10744 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, 10745 Format::PSEUDO_BRANCH, 0, 1)); 10746 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); 10747 BB_then->instructions.emplace_back(std::move(branch)); 10748 add_linear_edge(BB_then->index, &ic->BB_endif); 10749 if (!ic->then_branch_divergent) 10750 add_logical_edge(BB_then->index, &ic->BB_endif); 10751 BB_then->kind |= block_kind_uniform; 10752 } 10753 10754 ctx->cf_info.has_branch = false; 10755 ctx->cf_info.parent_loop.has_divergent_branch = false; 10756 10757 /** emit else block */ 10758 Block* BB_else = ctx->program->create_and_insert_block(); 10759 add_edge(ic->BB_if_idx, BB_else); 10760 append_logical_start(BB_else); 10761 ctx->block = BB_else; 10762} 10763 10764static void 10765end_uniform_if(isel_context* ctx, if_context* ic) 10766{ 10767 Block* BB_else = ctx->block; 10768 10769 if (!ctx->cf_info.has_branch) { 10770 append_logical_end(BB_else); 10771 /* branch from then block to endif block */ 10772 aco_ptr<Pseudo_branch_instruction> branch; 10773 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, 10774 Format::PSEUDO_BRANCH, 0, 1)); 10775 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); 10776 BB_else->instructions.emplace_back(std::move(branch)); 10777 add_linear_edge(BB_else->index, &ic->BB_endif); 10778 if (!ctx->cf_info.parent_loop.has_divergent_branch) 10779 add_logical_edge(BB_else->index, &ic->BB_endif); 10780 BB_else->kind |= block_kind_uniform; 10781 } 10782 10783 ctx->cf_info.has_branch &= ic->uniform_has_then_branch; 10784 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent; 10785 10786 /** emit endif merge block */ 10787 ctx->program->next_uniform_if_depth--; 10788 if (!ctx->cf_info.has_branch) { 10789 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif)); 10790 append_logical_start(ctx->block); 10791 } 10792} 10793 10794static bool 10795visit_if(isel_context* ctx, nir_if* if_stmt) 10796{ 10797 Temp 
cond = get_ssa_temp(ctx, if_stmt->condition.ssa); 10798 Builder bld(ctx->program, ctx->block); 10799 aco_ptr<Pseudo_branch_instruction> branch; 10800 if_context ic; 10801 10802 if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */ 10803 /** 10804 * Uniform conditionals are represented in the following way*) : 10805 * 10806 * The linear and logical CFG: 10807 * BB_IF 10808 * / \ 10809 * BB_THEN (logical) BB_ELSE (logical) 10810 * \ / 10811 * BB_ENDIF 10812 * 10813 * *) Exceptions may be due to break and continue statements within loops 10814 * If a break/continue happens within uniform control flow, it branches 10815 * to the loop exit/entry block. Otherwise, it branches to the next 10816 * merge block. 10817 **/ 10818 10819 assert(cond.regClass() == ctx->program->lane_mask); 10820 cond = bool_to_scalar_condition(ctx, cond); 10821 10822 begin_uniform_if_then(ctx, &ic, cond); 10823 visit_cf_list(ctx, &if_stmt->then_list); 10824 10825 begin_uniform_if_else(ctx, &ic); 10826 visit_cf_list(ctx, &if_stmt->else_list); 10827 10828 end_uniform_if(ctx, &ic); 10829 } else { /* non-uniform condition */ 10830 /** 10831 * To maintain a logical and linear CFG without critical edges, 10832 * non-uniform conditionals are represented in the following way*) : 10833 * 10834 * The linear CFG: 10835 * BB_IF 10836 * / \ 10837 * BB_THEN (logical) BB_THEN (linear) 10838 * \ / 10839 * BB_INVERT (linear) 10840 * / \ 10841 * BB_ELSE (logical) BB_ELSE (linear) 10842 * \ / 10843 * BB_ENDIF 10844 * 10845 * The logical CFG: 10846 * BB_IF 10847 * / \ 10848 * BB_THEN (logical) BB_ELSE (logical) 10849 * \ / 10850 * BB_ENDIF 10851 * 10852 * *) Exceptions may be due to break and continue statements within loops 10853 **/ 10854 10855 begin_divergent_if_then(ctx, &ic, cond); 10856 visit_cf_list(ctx, &if_stmt->then_list); 10857 10858 begin_divergent_if_else(ctx, &ic); 10859 visit_cf_list(ctx, &if_stmt->else_list); 10860 10861 end_divergent_if(ctx, &ic); 10862 } 10863 10864 return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty(); 10865} 10866 10867static bool 10868visit_cf_list(isel_context* ctx, struct exec_list* list) 10869{ 10870 foreach_list_typed (nir_cf_node, node, node, list) { 10871 switch (node->type) { 10872 case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break; 10873 case nir_cf_node_if: 10874 if (!visit_if(ctx, nir_cf_node_as_if(node))) 10875 return true; 10876 break; 10877 case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break; 10878 default: unreachable("unimplemented cf list type"); 10879 } 10880 } 10881 return false; 10882} 10883 10884static void 10885export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos) 10886{ 10887 assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG); 10888 10889 const uint8_t *vs_output_param_offset = 10890 ctx->stage.has(SWStage::GS) ? ctx->program->info.vs.outinfo.vs_output_param_offset : 10891 ctx->stage.has(SWStage::TES) ? ctx->program->info.tes.outinfo.vs_output_param_offset : 10892 ctx->stage.has(SWStage::MS) ? 
ctx->program->info.ms.outinfo.vs_output_param_offset : 10893 ctx->program->info.vs.outinfo.vs_output_param_offset; 10894 10895 assert(vs_output_param_offset); 10896 10897 int offset = vs_output_param_offset[slot]; 10898 unsigned mask = ctx->outputs.mask[slot]; 10899 if (!is_pos && !mask) 10900 return; 10901 if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED) 10902 return; 10903 aco_ptr<Export_instruction> exp{ 10904 create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)}; 10905 exp->enabled_mask = mask; 10906 for (unsigned i = 0; i < 4; ++i) { 10907 if (mask & (1 << i)) 10908 exp->operands[i] = Operand(ctx->outputs.temps[slot * 4u + i]); 10909 else 10910 exp->operands[i] = Operand(v1); 10911 } 10912 /* GFX10 (Navi1x) skip POS0 exports if EXEC=0 and DONE=0, causing a hang. 10913 * Setting valid_mask=1 prevents it and has no other effect. 10914 */ 10915 exp->valid_mask = ctx->options->gfx_level == GFX10 && is_pos && *next_pos == 0; 10916 exp->done = false; 10917 exp->compressed = false; 10918 if (is_pos) 10919 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++; 10920 else 10921 exp->dest = V_008DFC_SQ_EXP_PARAM + offset; 10922 ctx->block->instructions.emplace_back(std::move(exp)); 10923} 10924 10925static void 10926export_vs_psiz_layer_viewport_vrs(isel_context* ctx, int* next_pos, 10927 const aco_vp_output_info* outinfo) 10928{ 10929 aco_ptr<Export_instruction> exp{ 10930 create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)}; 10931 exp->enabled_mask = 0; 10932 for (unsigned i = 0; i < 4; ++i) 10933 exp->operands[i] = Operand(v1); 10934 if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) { 10935 exp->operands[0] = Operand(ctx->outputs.temps[VARYING_SLOT_PSIZ * 4u]); 10936 exp->enabled_mask |= 0x1; 10937 } 10938 if (ctx->outputs.mask[VARYING_SLOT_LAYER] && !outinfo->writes_layer_per_primitive) { 10939 exp->operands[2] = Operand(ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]); 10940 exp->enabled_mask |= 0x4; 10941 } 10942 if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT] && !outinfo->writes_viewport_index_per_primitive) { 10943 if (ctx->options->gfx_level < GFX9) { 10944 exp->operands[3] = Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]); 10945 exp->enabled_mask |= 0x8; 10946 } else { 10947 Builder bld(ctx->program, ctx->block); 10948 10949 Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), 10950 Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u])); 10951 if (exp->operands[2].isTemp()) 10952 out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]); 10953 10954 exp->operands[2] = Operand(out); 10955 exp->enabled_mask |= 0x4; 10956 } 10957 } 10958 if (ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_SHADING_RATE]) { 10959 exp->operands[1] = Operand(ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_SHADING_RATE * 4u]); 10960 exp->enabled_mask |= 0x2; 10961 } 10962 10963 exp->valid_mask = ctx->options->gfx_level == GFX10 && *next_pos == 0; 10964 exp->done = false; 10965 exp->compressed = false; 10966 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++; 10967 ctx->block->instructions.emplace_back(std::move(exp)); 10968} 10969 10970static void 10971create_vs_exports(isel_context* ctx) 10972{ 10973 assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG); 10974 const aco_vp_output_info* outinfo = 10975 ctx->stage.has(SWStage::GS) ? &ctx->program->info.vs.outinfo : 10976 ctx->stage.has(SWStage::TES) ? &ctx->program->info.tes.outinfo : 10977 ctx->stage.has(SWStage::MS) ? 
&ctx->program->info.ms.outinfo : 10978 &ctx->program->info.vs.outinfo; 10979 10980 assert(outinfo); 10981 ctx->block->kind |= block_kind_export_end; 10982 10983 /* Hardware requires position data to always be exported, even if the 10984 * application did not write gl_Position. 10985 */ 10986 ctx->outputs.mask[VARYING_SLOT_POS] = 0xf; 10987 10988 /* the order these position exports are created is important */ 10989 int next_pos = 0; 10990 export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos); 10991 10992 if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index || 10993 outinfo->writes_primitive_shading_rate) { 10994 export_vs_psiz_layer_viewport_vrs(ctx, &next_pos, outinfo); 10995 } 10996 if (ctx->num_clip_distances + ctx->num_cull_distances > 0) 10997 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos); 10998 if (ctx->num_clip_distances + ctx->num_cull_distances > 4) 10999 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos); 11000 11001 if (ctx->export_clip_dists) { 11002 if (ctx->num_clip_distances + ctx->num_cull_distances > 0) 11003 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos); 11004 if (ctx->num_clip_distances + ctx->num_cull_distances > 4) 11005 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos); 11006 } 11007 11008 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) { 11009 if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER && i != VARYING_SLOT_PRIMITIVE_ID && 11010 i != VARYING_SLOT_VIEWPORT) 11011 continue; 11012 if (ctx->shader && ctx->shader->info.per_primitive_outputs & BITFIELD64_BIT(i)) 11013 continue; 11014 11015 export_vs_varying(ctx, i, false, NULL); 11016 } 11017} 11018 11019static void 11020create_primitive_exports(isel_context *ctx, Temp prim_ch1) 11021{ 11022 assert(ctx->stage.hw == HWStage::NGG); 11023 const aco_vp_output_info* outinfo = 11024 ctx->stage.has(SWStage::GS) ? &ctx->program->info.vs.outinfo : 11025 ctx->stage.has(SWStage::TES) ? &ctx->program->info.tes.outinfo : 11026 ctx->stage.has(SWStage::MS) ? &ctx->program->info.ms.outinfo : 11027 &ctx->program->info.vs.outinfo; 11028 11029 Builder bld(ctx->program, ctx->block); 11030 11031 /* When layer, viewport etc. are per-primitive, they need to be encoded in 11032 * the primitive export instruction's second channel. The encoding is: 11033 * bits 31..30: VRS rate Y 11034 * bits 29..28: VRS rate X 11035 * bits 23..20: viewport 11036 * bits 19..17: layer 11037 */ 11038 Temp ch2 = bld.copy(bld.def(v1), Operand::c32(0)); 11039 uint en_mask = 1; 11040 11041 if (outinfo->writes_layer_per_primitive) { 11042 en_mask |= 2; 11043 Temp tmp = ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]; 11044 ch2 = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), tmp, Operand::c32(17), ch2); 11045 } 11046 if (outinfo->writes_viewport_index_per_primitive) { 11047 en_mask |= 2; 11048 Temp tmp = ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]; 11049 ch2 = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), tmp, Operand::c32(20), ch2); 11050 } 11051 if (outinfo->writes_primitive_shading_rate_per_primitive) { 11052 en_mask |= 2; 11053 Temp tmp = ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_SHADING_RATE * 4u]; 11054 ch2 = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), tmp, ch2); 11055 } 11056 11057 Operand prim_ch2 = (en_mask & 2) ? 
Operand(ch2) : Operand(v1); 11058 11059 bld.exp(aco_opcode::exp, prim_ch1, prim_ch2, Operand(v1), Operand(v1), 11060 en_mask /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */, 11061 true /* done */, false /* valid mask */); 11062 11063 /* Export generic per-primitive attributes. */ 11064 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) { 11065 if (!(ctx->shader->info.per_primitive_outputs & BITFIELD64_BIT(i))) 11066 continue; 11067 if (i == VARYING_SLOT_PRIMITIVE_SHADING_RATE) 11068 continue; 11069 11070 export_vs_varying(ctx, i, false, NULL); 11071 } 11072} 11073 11074static bool 11075export_fs_mrt_z(isel_context* ctx) 11076{ 11077 Builder bld(ctx->program, ctx->block); 11078 unsigned enabled_channels = 0; 11079 bool compr = false; 11080 Operand values[4]; 11081 11082 for (unsigned i = 0; i < 4; ++i) { 11083 values[i] = Operand(v1); 11084 } 11085 11086 /* Both stencil and sample mask only need 16-bits. */ 11087 if (!ctx->program->info.ps.writes_z && 11088 (ctx->program->info.ps.writes_stencil || ctx->program->info.ps.writes_sample_mask)) { 11089 compr = ctx->program->gfx_level < GFX11; /* COMPR flag */ 11090 11091 if (ctx->program->info.ps.writes_stencil) { 11092 /* Stencil should be in X[23:16]. */ 11093 values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]); 11094 values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), values[0]); 11095 enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x1 : 0x3; 11096 } 11097 11098 if (ctx->program->info.ps.writes_sample_mask) { 11099 /* SampleMask should be in Y[15:0]. */ 11100 values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]); 11101 enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x2 : 0xc; 11102 } 11103 11104 if (ctx->options->key.ps.alpha_to_coverage_via_mrtz && 11105 (ctx->outputs.mask[FRAG_RESULT_DATA0] & 0x8)) { 11106 /* MRT0 alpha should be in Y[31:16] if alpha-to-coverage is enabled and MRTZ is present. */ 11107 assert(ctx->program->gfx_level >= GFX11); 11108 Operand mrtz_alpha = Operand(ctx->outputs.temps[FRAG_RESULT_DATA0 + 3u]); 11109 mrtz_alpha = 11110 bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), mrtz_alpha); 11111 if (ctx->program->info.ps.writes_sample_mask) { 11112 /* Ignore the high 16 bits of the sample mask. */ 11113 values[1] = bld.vop3(aco_opcode::v_and_or_b32, bld.def(v1), values[1], 11114 Operand::c32(0x0000ffffu), mrtz_alpha); 11115 } else { 11116 values[1] = mrtz_alpha; 11117 } 11118 enabled_channels |= 0x2; 11119 } 11120 } else { 11121 if (ctx->program->info.ps.writes_z) { 11122 values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]); 11123 enabled_channels |= 0x1; 11124 } 11125 11126 if (ctx->program->info.ps.writes_stencil) { 11127 values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]); 11128 enabled_channels |= 0x2; 11129 } 11130 11131 if (ctx->program->info.ps.writes_sample_mask) { 11132 values[2] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]); 11133 enabled_channels |= 0x4; 11134 } 11135 11136 if (ctx->options->key.ps.alpha_to_coverage_via_mrtz && 11137 (ctx->outputs.mask[FRAG_RESULT_DATA0] & 0x8)) { 11138 assert(ctx->program->gfx_level >= GFX11); 11139 values[3] = Operand(ctx->outputs.temps[FRAG_RESULT_DATA0 + 3u]); 11140 enabled_channels |= 0x8; 11141 } 11142 } 11143 11144 /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the X 11145 * writemask component. 
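 * Work around this by always marking the X channel as enabled on the affected chips.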
11146 */ 11147 if (ctx->options->gfx_level == GFX6 && ctx->options->family != CHIP_OLAND && 11148 ctx->options->family != CHIP_HAINAN) { 11149 enabled_channels |= 0x1; 11150 } 11151 11152 bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, 11153 V_008DFC_SQ_EXP_MRTZ, compr); 11154 11155 return true; 11156} 11157 11158struct mrt_color_export { 11159 int slot; 11160 unsigned write_mask; 11161 Operand values[4]; 11162 uint8_t col_format; 11163 11164 /* Fields below are only used for PS epilogs. */ 11165 bool is_int8; 11166 bool is_int10; 11167 bool enable_mrt_output_nan_fixup; 11168}; 11169 11170static bool 11171export_fs_mrt_color(isel_context* ctx, const struct mrt_color_export *out, 11172 bool is_ps_epilog) 11173{ 11174 Builder bld(ctx->program, ctx->block); 11175 Operand values[4]; 11176 11177 for (unsigned i = 0; i < 4; ++i) { 11178 values[i] = out->values[i]; 11179 } 11180 11181 unsigned target; 11182 unsigned enabled_channels = 0; 11183 aco_opcode compr_op = aco_opcode::num_opcodes; 11184 bool compr = false; 11185 11186 target = V_008DFC_SQ_EXP_MRT + out->slot; 11187 11188 /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */ 11189 if (out->enable_mrt_output_nan_fixup && 11190 (out->col_format == V_028714_SPI_SHADER_32_R || out->col_format == V_028714_SPI_SHADER_32_GR || 11191 out->col_format == V_028714_SPI_SHADER_32_AR || out->col_format == V_028714_SPI_SHADER_32_ABGR || 11192 out->col_format == V_028714_SPI_SHADER_FP16_ABGR)) { 11193 u_foreach_bit(i, out->write_mask) { 11194 Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, bld.def(bld.lm), values[i], 11195 bld.copy(bld.def(v1), Operand::c32(3u))); 11196 values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i], 11197 bld.copy(bld.def(v1), Operand::zero()), isnan); 11198 } 11199 } 11200 11201 switch (out->col_format) { 11202 case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break; 11203 11204 case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break; 11205 11206 case V_028714_SPI_SHADER_32_AR: 11207 if (ctx->options->gfx_level >= GFX10) { 11208 /* Special case: on GFX10, the outputs are different for 32_AR */ 11209 enabled_channels = 0x3; 11210 values[1] = values[3]; 11211 values[3] = Operand(v1); 11212 } else { 11213 enabled_channels = 0x9; 11214 } 11215 break; 11216 11217 case V_028714_SPI_SHADER_FP16_ABGR: 11218 if (is_ps_epilog) { 11219 for (int i = 0; i < 2; i++) { 11220 bool enabled = (out->write_mask >> (i * 2)) & 0x3; 11221 if (enabled) { 11222 enabled_channels |= 0x3 << (i * 2); 11223 if (ctx->options->gfx_level == GFX8 || ctx->options->gfx_level == GFX9) { 11224 values[i] = 11225 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1), 11226 values[i * 2].isUndefined() ? Operand::zero() : values[i * 2], 11227 values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]); 11228 } else { 11229 values[i] = 11230 bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), 11231 values[i * 2].isUndefined() ? values[i * 2 + 1] : values[i * 2], 11232 values[i * 2 + 1].isUndefined() ? 
values[i * 2] : values[i * 2 + 1]); 11233 } 11234 } else { 11235 values[i] = Operand(v1); 11236 } 11237 } 11238 values[2] = Operand(v1); 11239 values[3] = Operand(v1); 11240 } else { 11241 enabled_channels = util_widen_mask(out->write_mask, 2); 11242 } 11243 compr = true; 11244 break; 11245 11246 case V_028714_SPI_SHADER_UNORM16_ABGR: 11247 if (is_ps_epilog) { 11248 compr_op = aco_opcode::v_cvt_pknorm_u16_f32; 11249 } else { 11250 enabled_channels = util_widen_mask(out->write_mask, 2); 11251 compr = true; 11252 } 11253 break; 11254 11255 case V_028714_SPI_SHADER_SNORM16_ABGR: 11256 if (is_ps_epilog) { 11257 compr_op = aco_opcode::v_cvt_pknorm_i16_f32; 11258 } else { 11259 enabled_channels = util_widen_mask(out->write_mask, 2); 11260 compr = true; 11261 } 11262 break; 11263 11264 case V_028714_SPI_SHADER_UINT16_ABGR: 11265 if (is_ps_epilog) { 11266 compr_op = aco_opcode::v_cvt_pk_u16_u32; 11267 if (out->is_int8 || out->is_int10) { 11268 /* clamp */ 11269 uint32_t max_rgb = out->is_int8 ? 255 : out->is_int10 ? 1023 : 0; 11270 11271 u_foreach_bit(i, out->write_mask) { 11272 uint32_t max = i == 3 && out->is_int10 ? 3 : max_rgb; 11273 11274 values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), Operand::c32(max), values[i]); 11275 } 11276 } 11277 } else { 11278 enabled_channels = util_widen_mask(out->write_mask, 2); 11279 compr = true; 11280 } 11281 break; 11282 11283 case V_028714_SPI_SHADER_SINT16_ABGR: 11284 if (is_ps_epilog) { 11285 compr_op = aco_opcode::v_cvt_pk_i16_i32; 11286 if (out->is_int8 || out->is_int10) { 11287 /* clamp */ 11288 uint32_t max_rgb = out->is_int8 ? 127 : out->is_int10 ? 511 : 0; 11289 uint32_t min_rgb = out->is_int8 ? -128 : out->is_int10 ? -512 : 0; 11290 11291 u_foreach_bit(i, out->write_mask) { 11292 uint32_t max = i == 3 && out->is_int10 ? 1 : max_rgb; 11293 uint32_t min = i == 3 && out->is_int10 ? -2u : min_rgb; 11294 11295 values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), Operand::c32(max), values[i]); 11296 values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::c32(min), values[i]); 11297 } 11298 } 11299 } else { 11300 enabled_channels = util_widen_mask(out->write_mask, 2); 11301 compr = true; 11302 } 11303 break; 11304 11305 case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break; 11306 11307 case V_028714_SPI_SHADER_ZERO: 11308 default: return false; 11309 } 11310 11311 if (compr_op != aco_opcode::num_opcodes) { 11312 for (int i = 0; i < 2; i++) { 11313 /* check if at least one of the values to be compressed is enabled */ 11314 bool enabled = (out->write_mask >> (i * 2)) & 0x3; 11315 if (enabled) { 11316 enabled_channels |= 0x3 << (i * 2); 11317 values[i] = bld.vop3( 11318 compr_op, bld.def(v1), values[i * 2].isUndefined() ? Operand::zero() : values[i * 2], 11319 values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]); 11320 } else { 11321 values[i] = Operand(v1); 11322 } 11323 } 11324 values[2] = Operand(v1); 11325 values[3] = Operand(v1); 11326 compr = true; 11327 } else if (!compr) { 11328 for (int i = 0; i < 4; i++) 11329 values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1); 11330 } 11331 11332 if (ctx->program->gfx_level >= GFX11) { 11333 /* GFX11 doesn't use COMPR for exports, but the channel mask should be 11334 * 0x3 instead. 11335 */ 11336 enabled_channels = compr ? 
0x3 : enabled_channels; 11337 compr = false; 11338 } 11339 11340 bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, target, 11341 compr); 11342 return true; 11343} 11344 11345static void 11346create_fs_null_export(isel_context* ctx) 11347{ 11348 /* FS must always have exports. 11349 * So when there are none, we need to add a null export. 11350 */ 11351 11352 Builder bld(ctx->program, ctx->block); 11353 /* GFX11 doesn't support NULL exports, and MRT0 should be exported instead. */ 11354 unsigned dest = ctx->options->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL; 11355 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 11356 /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true); 11357} 11358 11359static void 11360create_fs_jump_to_epilog(isel_context* ctx) 11361{ 11362 Builder bld(ctx->program, ctx->block); 11363 std::vector<Operand> color_exports; 11364 PhysReg exports_start(256); /* VGPR 0 */ 11365 11366 for (unsigned slot = FRAG_RESULT_DATA0; slot < FRAG_RESULT_DATA7 + 1; ++slot) { 11367 unsigned color_index = slot - FRAG_RESULT_DATA0; 11368 unsigned color_type = (ctx->output_color_types >> (color_index * 2)) & 0x3; 11369 unsigned write_mask = ctx->outputs.mask[slot]; 11370 11371 if (!write_mask) 11372 continue; 11373 11374 PhysReg color_start(exports_start.reg() + color_index * 4); 11375 11376 for (unsigned i = 0; i < 4; i++) { 11377 if (!(write_mask & BITFIELD_BIT(i))) { 11378 color_exports.emplace_back(Operand(v1)); 11379 continue; 11380 } 11381 11382 PhysReg chan_reg = color_start.advance(i * 4u); 11383 Operand chan(ctx->outputs.temps[slot * 4u + i]); 11384 11385 if (color_type == ACO_TYPE_FLOAT16) { 11386 chan = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), chan); 11387 } else if (color_type == ACO_TYPE_INT16 || color_type == ACO_TYPE_UINT16) { 11388 bool sign_ext = color_type == ACO_TYPE_INT16; 11389 Temp tmp = convert_int(ctx, bld, chan.getTemp(), 16, 32, sign_ext); 11390 chan = Operand(tmp); 11391 } 11392 11393 chan.setFixed(chan_reg); 11394 color_exports.emplace_back(chan); 11395 } 11396 } 11397 11398 Temp continue_pc = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ps_epilog_pc)); 11399 11400 aco_ptr<Pseudo_instruction> jump{create_instruction<Pseudo_instruction>( 11401 aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + color_exports.size(), 0)}; 11402 jump->operands[0] = Operand(continue_pc); 11403 for (unsigned i = 0; i < color_exports.size(); i++) { 11404 jump->operands[i + 1] = color_exports[i]; 11405 } 11406 ctx->block->instructions.emplace_back(std::move(jump)); 11407} 11408 11409static void 11410create_fs_exports(isel_context* ctx) 11411{ 11412 Builder bld(ctx->program, ctx->block); 11413 bool exported = false; 11414 11415 /* Export depth, stencil and sample mask. */ 11416 if (ctx->outputs.mask[FRAG_RESULT_DEPTH] || ctx->outputs.mask[FRAG_RESULT_STENCIL] || 11417 ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK]) 11418 exported |= export_fs_mrt_z(ctx); 11419 11420 if (ctx->program->info.ps.has_epilog) { 11421 create_fs_jump_to_epilog(ctx); 11422 } else { 11423 /* Export all color render targets. 
*/ 11424 for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i) { 11425 if (!ctx->outputs.mask[i]) 11426 continue; 11427 11428 struct mrt_color_export out = {0}; 11429 11430 out.slot = i - FRAG_RESULT_DATA0; 11431 out.write_mask = ctx->outputs.mask[i]; 11432 out.col_format = (ctx->options->key.ps.col_format >> (4 * out.slot)) & 0xf; 11433 11434 for (unsigned c = 0; c < 4; ++c) { 11435 if (out.write_mask & (1 << c)) { 11436 out.values[c] = Operand(ctx->outputs.temps[i * 4u + c]); 11437 } else { 11438 out.values[c] = Operand(v1); 11439 } 11440 } 11441 11442 exported |= export_fs_mrt_color(ctx, &out, false); 11443 } 11444 11445 if (!exported) 11446 create_fs_null_export(ctx); 11447 } 11448 11449 ctx->block->kind |= block_kind_export_end; 11450} 11451 11452static void 11453emit_stream_output(isel_context* ctx, Temp const* so_buffers, Temp const* so_write_offset, 11454 const struct aco_stream_output* output) 11455{ 11456 assert(ctx->stage.hw == HWStage::VS); 11457 11458 unsigned loc = output->location; 11459 unsigned buf = output->buffer; 11460 11461 unsigned writemask = output->component_mask & ctx->outputs.mask[loc]; 11462 while (writemask) { 11463 int start, count; 11464 u_bit_scan_consecutive_range(&writemask, &start, &count); 11465 if (count == 3 && ctx->options->gfx_level == GFX6) { 11466 /* GFX6 doesn't support storing vec3, split it. */ 11467 writemask |= 1u << (start + 2); 11468 count = 2; 11469 } 11470 11471 unsigned offset = output->offset + (start - (ffs(output->component_mask) - 1)) * 4; 11472 11473 Temp write_data = ctx->program->allocateTmp(RegClass(RegType::vgpr, count)); 11474 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 11475 aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; 11476 for (int i = 0; i < count; ++i) 11477 vec->operands[i] = Operand(ctx->outputs.temps[loc * 4 + start + i]); 11478 vec->definitions[0] = Definition(write_data); 11479 ctx->block->instructions.emplace_back(std::move(vec)); 11480 11481 aco_opcode opcode = get_buffer_store_op(count * 4); 11482 aco_ptr<MUBUF_instruction> store{ 11483 create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)}; 11484 store->operands[0] = Operand(so_buffers[buf]); 11485 store->operands[1] = Operand(so_write_offset[buf]); 11486 store->operands[2] = Operand::c32(0); 11487 store->operands[3] = Operand(write_data); 11488 if (offset > 4095) { 11489 /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. 
*/ 11490 Builder bld(ctx->program, ctx->block); 11491 store->operands[1] = 11492 bld.vadd32(bld.def(v1), Operand::c32(offset), Operand(so_write_offset[buf])); 11493 } else { 11494 store->offset = offset; 11495 } 11496 store->offen = true; 11497 store->glc = ctx->program->gfx_level < GFX11; 11498 store->dlc = false; 11499 store->slc = true; 11500 ctx->block->instructions.emplace_back(std::move(store)); 11501 } 11502} 11503 11504static void 11505emit_streamout(isel_context* ctx, unsigned stream) 11506{ 11507 Builder bld(ctx->program, ctx->block); 11508 11509 Temp so_vtx_count = 11510 bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), 11511 get_arg(ctx, ctx->args->ac.streamout_config), Operand::c32(0x70010u)); 11512 11513 Temp tid = emit_mbcnt(ctx, bld.tmp(v1)); 11514 11515 Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid); 11516 11517 if_context ic; 11518 begin_divergent_if_then(ctx, &ic, can_emit); 11519 11520 bld.reset(ctx->block); 11521 11522 Temp so_write_index = 11523 bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.streamout_write_index), tid); 11524 11525 Temp so_buffers[4]; 11526 Temp so_write_offset[4]; 11527 Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers)); 11528 11529 for (unsigned i = 0; i < 4; i++) { 11530 unsigned stride = ctx->program->info.so.strides[i]; 11531 if (!stride) 11532 continue; 11533 11534 so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, 11535 bld.copy(bld.def(s1), Operand::c32(i * 16u))); 11536 11537 if (stride == 1) { 11538 Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), 11539 get_arg(ctx, ctx->args->ac.streamout_write_index), 11540 get_arg(ctx, ctx->args->ac.streamout_offset[i])); 11541 Temp new_offset = bld.vadd32(bld.def(v1), offset, tid); 11542 11543 so_write_offset[i] = 11544 bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), new_offset); 11545 } else { 11546 Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u); 11547 Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(4u), 11548 get_arg(ctx, ctx->args->ac.streamout_offset[i])); 11549 so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2); 11550 } 11551 } 11552 11553 for (unsigned i = 0; i < ctx->program->info.so.num_outputs; i++) { 11554 const struct aco_stream_output* output = &ctx->program->info.so.outputs[i]; 11555 if (stream != output->stream) 11556 continue; 11557 11558 emit_stream_output(ctx, so_buffers, so_write_offset, output); 11559 } 11560 11561 begin_divergent_if_else(ctx, &ic); 11562 end_divergent_if(ctx, &ic); 11563} 11564 11565Pseudo_instruction* 11566add_startpgm(struct isel_context* ctx) 11567{ 11568 unsigned def_count = 0; 11569 for (unsigned i = 0; i < ctx->args->ac.arg_count; i++) { 11570 if (ctx->args->ac.args[i].skip) 11571 continue; 11572 unsigned align = MIN2(4, util_next_power_of_two(ctx->args->ac.args[i].size)); 11573 if (ctx->args->ac.args[i].file == AC_ARG_SGPR && ctx->args->ac.args[i].offset % align) 11574 def_count += ctx->args->ac.args[i].size; 11575 else 11576 def_count++; 11577 } 11578 11579 Pseudo_instruction* startpgm = 11580 create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, def_count); 11581 ctx->block->instructions.emplace_back(startpgm); 11582 for (unsigned i = 0, arg = 0; i < ctx->args->ac.arg_count; i++) { 11583 if (ctx->args->ac.args[i].skip) 11584 continue; 11585 11586 enum ac_arg_regfile file = ctx->args->ac.args[i].file; 11587 
unsigned size = ctx->args->ac.args[i].size; 11588 unsigned reg = ctx->args->ac.args[i].offset; 11589 RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size); 11590 11591 if (file == AC_ARG_SGPR && reg % MIN2(4, util_next_power_of_two(size))) { 11592 Temp elems[16]; 11593 for (unsigned j = 0; j < size; j++) { 11594 elems[j] = ctx->program->allocateTmp(s1); 11595 startpgm->definitions[arg++] = Definition(elems[j].id(), PhysReg{reg + j}, s1); 11596 } 11597 ctx->arg_temps[i] = create_vec_from_array(ctx, elems, size, RegType::sgpr, 4); 11598 } else { 11599 Temp dst = ctx->program->allocateTmp(type); 11600 ctx->arg_temps[i] = dst; 11601 startpgm->definitions[arg] = Definition(dst); 11602 startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256}); 11603 arg++; 11604 } 11605 } 11606 11607 /* Stash these in the program so that they can be accessed later when 11608 * handling spilling. 11609 */ 11610 ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets); 11611 if (ctx->program->gfx_level <= GFX10_3) { 11612 ctx->program->scratch_offset = get_arg(ctx, ctx->args->ac.scratch_offset); 11613 11614 if (ctx->program->gfx_level >= GFX9) { 11615 Operand scratch_offset(ctx->program->scratch_offset); 11616 scratch_offset.setLateKill(true); 11617 Builder bld(ctx->program, ctx->block); 11618 bld.pseudo(aco_opcode::p_init_scratch, bld.def(s2), bld.def(s1, scc), 11619 ctx->program->private_segment_buffer, scratch_offset); 11620 } 11621 } 11622 11623 if (ctx->stage.has(SWStage::VS) && ctx->program->info.vs.dynamic_inputs) { 11624 unsigned num_attributes = util_last_bit(ctx->program->info.vs.vb_desc_usage_mask); 11625 for (unsigned i = 0; i < num_attributes; i++) { 11626 Definition def(get_arg(ctx, ctx->args->vs_inputs[i])); 11627 11628 unsigned idx = ctx->args->vs_inputs[i].arg_index; 11629 def.setFixed(PhysReg(256 + ctx->args->ac.args[idx].offset)); 11630 11631 ctx->program->vs_inputs.push_back(def); 11632 } 11633 } 11634 11635 return startpgm; 11636} 11637 11638void 11639fix_ls_vgpr_init_bug(isel_context* ctx, Pseudo_instruction* startpgm) 11640{ 11641 assert(ctx->shader->info.stage == MESA_SHADER_VERTEX); 11642 Builder bld(ctx->program, ctx->block); 11643 constexpr unsigned hs_idx = 1u; 11644 Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), 11645 get_arg(ctx, ctx->args->ac.merged_wave_info), 11646 Operand::c32((8u << 16) | (hs_idx * 8u))); 11647 Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp()); 11648 11649 /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. 
*/ 11650 11651 Temp instance_id = 11652 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.vertex_id), 11653 get_arg(ctx, ctx->args->ac.instance_id), ls_has_nonzero_hs_threads); 11654 Temp vs_rel_patch_id = 11655 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids), 11656 get_arg(ctx, ctx->args->ac.vs_rel_patch_id), ls_has_nonzero_hs_threads); 11657 Temp vertex_id = 11658 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_patch_id), 11659 get_arg(ctx, ctx->args->ac.vertex_id), ls_has_nonzero_hs_threads); 11660 11661 ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id; 11662 ctx->arg_temps[ctx->args->ac.vs_rel_patch_id.arg_index] = vs_rel_patch_id; 11663 ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id; 11664} 11665 11666void 11667split_arguments(isel_context* ctx, Pseudo_instruction* startpgm) 11668{ 11669 /* Split all arguments except for the first (ring_offsets) and the last 11670 * (exec) so that the dead channels don't stay live throughout the program. 11671 */ 11672 for (int i = 1; i < startpgm->definitions.size(); i++) { 11673 if (startpgm->definitions[i].regClass().size() > 1) { 11674 emit_split_vector(ctx, startpgm->definitions[i].getTemp(), 11675 startpgm->definitions[i].regClass().size()); 11676 } 11677 } 11678} 11679 11680void 11681handle_bc_optimize(isel_context* ctx) 11682{ 11683 /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */ 11684 Builder bld(ctx->program, ctx->block); 11685 uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena; 11686 bool uses_center = 11687 G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena); 11688 bool uses_persp_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena); 11689 bool uses_linear_centroid = G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena); 11690 11691 if (uses_persp_centroid) 11692 ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid); 11693 if (uses_linear_centroid) 11694 ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid); 11695 11696 if (uses_center && (uses_persp_centroid || uses_linear_centroid)) { 11697 Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.def(bld.lm), 11698 get_arg(ctx, ctx->args->ac.prim_mask), Operand::zero()); 11699 11700 if (uses_persp_centroid) { 11701 Temp new_coord[2]; 11702 for (unsigned i = 0; i < 2; i++) { 11703 Temp persp_centroid = 11704 emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1); 11705 Temp persp_center = 11706 emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1); 11707 new_coord[i] = 11708 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), persp_centroid, persp_center, sel); 11709 } 11710 ctx->persp_centroid = bld.tmp(v2); 11711 bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid), 11712 Operand(new_coord[0]), Operand(new_coord[1])); 11713 emit_split_vector(ctx, ctx->persp_centroid, 2); 11714 } 11715 11716 if (uses_linear_centroid) { 11717 Temp new_coord[2]; 11718 for (unsigned i = 0; i < 2; i++) { 11719 Temp linear_centroid = 11720 emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1); 11721 Temp linear_center = 11722 emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1); 11723 new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), linear_centroid, 11724 linear_center, sel); 11725 } 11726 ctx->linear_centroid = bld.tmp(v2); 11727 bld.pseudo(aco_opcode::p_create_vector, 
Definition(ctx->linear_centroid), 11728 Operand(new_coord[0]), Operand(new_coord[1])); 11729 emit_split_vector(ctx, ctx->linear_centroid, 2); 11730 } 11731 } 11732} 11733 11734void 11735setup_fp_mode(isel_context* ctx, nir_shader* shader) 11736{ 11737 Program* program = ctx->program; 11738 11739 unsigned float_controls = shader->info.float_controls_execution_mode; 11740 11741 program->next_fp_mode.preserve_signed_zero_inf_nan32 = 11742 float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32; 11743 program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = 11744 float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 | 11745 FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64); 11746 11747 program->next_fp_mode.must_flush_denorms32 = 11748 float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32; 11749 program->next_fp_mode.must_flush_denorms16_64 = 11750 float_controls & 11751 (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64); 11752 11753 program->next_fp_mode.care_about_round32 = 11754 float_controls & 11755 (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32); 11756 11757 program->next_fp_mode.care_about_round16_64 = 11758 float_controls & 11759 (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 | 11760 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64); 11761 11762 /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and 11763 * the precision seems needed for Wolfenstein: Youngblood to render correctly */ 11764 if (program->next_fp_mode.must_flush_denorms16_64) 11765 program->next_fp_mode.denorm16_64 = 0; 11766 else 11767 program->next_fp_mode.denorm16_64 = fp_denorm_keep; 11768 11769 /* preserving fp32 denorms is expensive, so only do it if asked */ 11770 if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) 11771 program->next_fp_mode.denorm32 = fp_denorm_keep; 11772 else 11773 program->next_fp_mode.denorm32 = 0; 11774 11775 if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) 11776 program->next_fp_mode.round32 = fp_round_tz; 11777 else 11778 program->next_fp_mode.round32 = fp_round_ne; 11779 11780 if (float_controls & 11781 (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64)) 11782 program->next_fp_mode.round16_64 = fp_round_tz; 11783 else 11784 program->next_fp_mode.round16_64 = fp_round_ne; 11785 11786 ctx->block->fp_mode = program->next_fp_mode; 11787} 11788 11789void 11790cleanup_cfg(Program* program) 11791{ 11792 /* create linear_succs/logical_succs */ 11793 for (Block& BB : program->blocks) { 11794 for (unsigned idx : BB.linear_preds) 11795 program->blocks[idx].linear_succs.emplace_back(BB.index); 11796 for (unsigned idx : BB.logical_preds) 11797 program->blocks[idx].logical_succs.emplace_back(BB.index); 11798 } 11799} 11800 11801Temp 11802lanecount_to_mask(isel_context* ctx, Temp count, bool allow64 = true) 11803{ 11804 assert(count.regClass() == s1); 11805 11806 Builder bld(ctx->program, ctx->block); 11807 Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero()); 11808 Temp cond; 11809 11810 if (ctx->program->wave_size == 64) { 11811 /* If we know that all 64 threads can't be active at a time, we just use the mask as-is */ 11812 if (!allow64) 11813 return mask; 11814 11815 /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */ 11816 Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count, 11817 Operand::c32(6u /* 
log2(64) */)); 11818 cond = 11819 bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand::c32(-1u), mask, bld.scc(active_64)); 11820 } else { 11821 /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of 11822 * the register */ 11823 cond = emit_extract_vector(ctx, mask, 0, bld.lm); 11824 } 11825 11826 return cond; 11827} 11828 11829Temp 11830merged_wave_info_to_mask(isel_context* ctx, unsigned i) 11831{ 11832 Builder bld(ctx->program, ctx->block); 11833 11834 /* lanecount_to_mask() only cares about s0.u[6:0] so we don't need either s_bfe nor s_and here */ 11835 Temp count = i == 0 11836 ? get_arg(ctx, ctx->args->ac.merged_wave_info) 11837 : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), 11838 get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(i * 8u)); 11839 11840 return lanecount_to_mask(ctx, count); 11841} 11842 11843void 11844ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt) 11845{ 11846 assert(vtx_cnt.id() && prm_cnt.id()); 11847 11848 Builder bld(ctx->program, ctx->block); 11849 Temp prm_cnt_0; 11850 11851 if (ctx->program->gfx_level == GFX10 && 11852 (ctx->stage.has(SWStage::GS) || ctx->program->info.has_ngg_culling)) { 11853 /* Navi 1x workaround: check whether the workgroup has no output. 11854 * If so, change the number of exported vertices and primitives to 1. 11855 */ 11856 prm_cnt_0 = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), prm_cnt, Operand::zero()); 11857 prm_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), prm_cnt, 11858 bld.scc(prm_cnt_0)); 11859 vtx_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), vtx_cnt, 11860 bld.scc(prm_cnt_0)); 11861 } 11862 11863 /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */ 11864 Temp tmp = 11865 bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand::c32(12u)); 11866 tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt); 11867 11868 /* Request the SPI to allocate space for the primitives and vertices 11869 * that will be exported by the threadgroup. 11870 */ 11871 bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req); 11872 11873 if (prm_cnt_0.id()) { 11874 /* Navi 1x workaround: export a triangle with NaN coordinates when NGG has no output. 11875 * It can't have all-zero positions because that would render an undesired pixel with 11876 * conservative rasterization. 11877 */ 11878 Temp first_lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)); 11879 Temp cond = bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc), 11880 Operand::c32_or_c64(1u, ctx->program->wave_size == 64), first_lane); 11881 cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), cond, 11882 Operand::zero(ctx->program->wave_size == 64 ? 8 : 4), bld.scc(prm_cnt_0)); 11883 11884 if_context ic_prim_0; 11885 begin_divergent_if_then(ctx, &ic_prim_0, cond); 11886 bld.reset(ctx->block); 11887 ctx->block->kind |= block_kind_export_end; 11888 11889 /* Use zero: means that it's a triangle whose every vertex index is 0. */ 11890 Temp zero = bld.copy(bld.def(v1), Operand::zero()); 11891 /* Use NaN for the coordinates, so that the rasterizer always culls it. 
*/ 11892 Temp nan_coord = bld.copy(bld.def(v1), Operand::c32(-1u)); 11893 11894 bld.exp(aco_opcode::exp, zero, Operand(v1), Operand(v1), Operand(v1), 1 /* enabled mask */, 11895 V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */, true /* done */, 11896 false /* valid mask */); 11897 bld.exp(aco_opcode::exp, nan_coord, nan_coord, nan_coord, nan_coord, 0xf /* enabled mask */, 11898 V_008DFC_SQ_EXP_POS /* dest */, false /* compressed */, true /* done */, 11899 true /* valid mask */); 11900 11901 begin_divergent_if_else(ctx, &ic_prim_0); 11902 end_divergent_if(ctx, &ic_prim_0); 11903 bld.reset(ctx->block); 11904 } 11905} 11906 11907} /* end namespace */ 11908 11909void 11910select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders, 11911 ac_shader_config* config, const struct aco_compiler_options* options, 11912 const struct aco_shader_info* info, 11913 const struct radv_shader_args* args) 11914{ 11915 isel_context ctx = setup_isel_context(program, shader_count, shaders, config, options, info, args, false, false); 11916 if_context ic_merged_wave_info; 11917 bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS); 11918 11919 for (unsigned i = 0; i < shader_count; i++) { 11920 nir_shader* nir = shaders[i]; 11921 init_context(&ctx, nir); 11922 11923 setup_fp_mode(&ctx, nir); 11924 11925 if (!i) { 11926 /* needs to be after init_context() for FS */ 11927 Pseudo_instruction* startpgm = add_startpgm(&ctx); 11928 append_logical_start(ctx.block); 11929 11930 if (unlikely(ctx.options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs)) 11931 fix_ls_vgpr_init_bug(&ctx, startpgm); 11932 11933 split_arguments(&ctx, startpgm); 11934 11935 if (!info->vs.has_prolog && 11936 (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES))) { 11937 Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, -1u, 0x3u); 11938 } 11939 } 11940 11941 /* In a merged VS+TCS HS, the VS implementation can be completely empty. */ 11942 nir_function_impl* func = nir_shader_get_entrypoint(nir); 11943 bool empty_shader = 11944 nir_cf_list_is_empty_block(&func->body) && 11945 ((nir->info.stage == MESA_SHADER_VERTEX && 11946 (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) || 11947 (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs)); 11948 11949 bool check_merged_wave_info = 11950 ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1)); 11951 bool endif_merged_wave_info = 11952 ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1)); 11953 11954 if (program->gfx_level == GFX10 && program->stage.hw == HWStage::NGG && 11955 program->stage.num_sw_stages() == 1) { 11956 /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before 11957 * s_sendmsg(GS_ALLOC_REQ). */ 11958 Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, -1u, 0u); 11959 } 11960 11961 if (check_merged_wave_info) { 11962 Temp cond = merged_wave_info_to_mask(&ctx, i); 11963 begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond); 11964 } 11965 11966 if (i) { 11967 Builder bld(ctx.program, ctx.block); 11968 11969 /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. 
*/ 11970 bool tcs_skip_barrier = ctx.stage == vertex_tess_control_hs && 11971 ctx.tcs_temp_only_inputs == nir->info.inputs_read; 11972 11973 if (!ngg_gs && !tcs_skip_barrier) { 11974 sync_scope scope = 11975 ctx.stage == vertex_tess_control_hs && 11976 program->wave_size % ctx.options->key.tcs.tess_input_vertices == 0 && 11977 ctx.options->key.tcs.tess_input_vertices == nir->info.tess.tcs_vertices_out 11978 ? scope_subgroup 11979 : scope_workgroup; 11980 bld.barrier(aco_opcode::p_barrier, 11981 memory_sync_info(storage_shared, semantic_acqrel, scope), scope); 11982 } 11983 11984 if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) { 11985 ctx.gs_wave_id = bld.pseudo(aco_opcode::p_extract, bld.def(s1, m0), bld.def(s1, scc), 11986 get_arg(&ctx, args->ac.merged_wave_info), Operand::c32(2u), 11987 Operand::c32(8u), Operand::zero()); 11988 } 11989 } else if (ctx.stage == geometry_gs) 11990 ctx.gs_wave_id = get_arg(&ctx, args->ac.gs_wave_id); 11991 11992 if (ctx.stage == fragment_fs) 11993 handle_bc_optimize(&ctx); 11994 11995 visit_cf_list(&ctx, &func->body); 11996 11997 if (ctx.program->info.so.num_outputs && ctx.stage.hw == HWStage::VS) 11998 emit_streamout(&ctx, 0); 11999 12000 if (ctx.stage.hw == HWStage::VS) { 12001 create_vs_exports(&ctx); 12002 } else if (nir->info.stage == MESA_SHADER_GEOMETRY && !ngg_gs) { 12003 Builder bld(ctx.program, ctx.block); 12004 bld.barrier(aco_opcode::p_barrier, 12005 memory_sync_info(storage_vmem_output, semantic_release, scope_device)); 12006 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, 12007 sendmsg_gs_done(false, false, 0)); 12008 } 12009 12010 if (ctx.stage == fragment_fs) { 12011 create_fs_exports(&ctx); 12012 } 12013 12014 if (endif_merged_wave_info) { 12015 begin_divergent_if_else(&ctx, &ic_merged_wave_info); 12016 end_divergent_if(&ctx, &ic_merged_wave_info); 12017 } 12018 12019 if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) { 12020 /* Outputs of the previous stage are inputs to the next stage */ 12021 ctx.inputs = ctx.outputs; 12022 ctx.outputs = shader_io_state(); 12023 } 12024 12025 cleanup_context(&ctx); 12026 } 12027 12028 program->config->float_mode = program->blocks[0].fp_mode.val; 12029 12030 append_logical_end(ctx.block); 12031 ctx.block->kind |= block_kind_uniform; 12032 Builder bld(ctx.program, ctx.block); 12033 bld.sopp(aco_opcode::s_endpgm); 12034 12035 cleanup_cfg(program); 12036} 12037 12038void 12039select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config, 12040 const struct aco_compiler_options* options, 12041 const struct aco_shader_info* info, 12042 const struct radv_shader_args* args) 12043{ 12044 isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, options, info, args, true, false); 12045 12046 ctx.block->fp_mode = program->next_fp_mode; 12047 12048 add_startpgm(&ctx); 12049 append_logical_start(ctx.block); 12050 12051 Builder bld(ctx.program, ctx.block); 12052 12053 Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), 12054 program->private_segment_buffer, Operand::c32(RING_GSVS_VS * 16u)); 12055 12056 Operand stream_id = Operand::zero(); 12057 if (program->info.so.num_outputs) 12058 stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), 12059 get_arg(&ctx, ctx.args->ac.streamout_config), Operand::c32(0x20018u)); 12060 12061 Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), 12062 get_arg(&ctx, ctx.args->ac.vertex_id)); 12063 12064 
std::stack<if_context, std::vector<if_context>> if_contexts; 12065 12066 for (unsigned stream = 0; stream < 4; stream++) { 12067 if (stream_id.isConstant() && stream != stream_id.constantValue()) 12068 continue; 12069 12070 unsigned num_components = program->info.gs.num_stream_output_components[stream]; 12071 if (stream > 0 && (!num_components || !program->info.so.num_outputs)) 12072 continue; 12073 12074 memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask)); 12075 12076 if (!stream_id.isConstant()) { 12077 Temp cond = 12078 bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand::c32(stream)); 12079 if_contexts.emplace(); 12080 begin_uniform_if_then(&ctx, &if_contexts.top(), cond); 12081 bld.reset(ctx.block); 12082 } 12083 12084 unsigned offset = 0; 12085 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) { 12086 if (program->info.gs.output_streams[i] != stream) 12087 continue; 12088 12089 unsigned output_usage_mask = program->info.gs.output_usage_mask[i]; 12090 unsigned length = util_last_bit(output_usage_mask); 12091 for (unsigned j = 0; j < length; ++j) { 12092 if (!(output_usage_mask & (1 << j))) 12093 continue; 12094 12095 Temp val = bld.tmp(v1); 12096 unsigned const_offset = offset * program->info.gs.vertices_out * 16 * 4; 12097 load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, 0u, true, 12098 true, true); 12099 12100 ctx.outputs.mask[i] |= 1 << j; 12101 ctx.outputs.temps[i * 4u + j] = val; 12102 12103 offset++; 12104 } 12105 } 12106 12107 if (program->info.so.num_outputs) { 12108 emit_streamout(&ctx, stream); 12109 bld.reset(ctx.block); 12110 } 12111 12112 if (stream == 0) { 12113 create_vs_exports(&ctx); 12114 } 12115 12116 if (!stream_id.isConstant()) { 12117 begin_uniform_if_else(&ctx, &if_contexts.top()); 12118 bld.reset(ctx.block); 12119 } 12120 } 12121 12122 while (!if_contexts.empty()) { 12123 end_uniform_if(&ctx, &if_contexts.top()); 12124 if_contexts.pop(); 12125 } 12126 12127 program->config->float_mode = program->blocks[0].fp_mode.val; 12128 12129 append_logical_end(ctx.block); 12130 ctx.block->kind |= block_kind_uniform; 12131 bld.reset(ctx.block); 12132 bld.sopp(aco_opcode::s_endpgm); 12133 12134 cleanup_cfg(program); 12135} 12136 12137void 12138select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config, 12139 const struct aco_compiler_options* options, 12140 const struct aco_shader_info* info, 12141 const struct radv_shader_args* args) 12142{ 12143 assert(options->gfx_level == GFX8); 12144 12145 init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode, 12146 config); 12147 12148 isel_context ctx = {}; 12149 ctx.program = program; 12150 ctx.args = args; 12151 ctx.options = options; 12152 ctx.stage = program->stage; 12153 12154 ctx.block = ctx.program->create_and_insert_block(); 12155 ctx.block->kind = block_kind_top_level; 12156 12157 program->workgroup_size = 1; /* XXX */ 12158 12159 add_startpgm(&ctx); 12160 append_logical_start(ctx.block); 12161 12162 Builder bld(ctx.program, ctx.block); 12163 12164 /* Load the buffer descriptor from TMA. */ 12165 bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2), 12166 Operand::zero()); 12167 12168 /* Store TTMP0-TTMP1. 
*/ 12169 bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(), 12170 Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true); 12171 12172 uint32_t hw_regs_idx[] = { 12173 2, /* HW_REG_STATUS */ 12174 3, /* HW_REG_TRAP_STS */ 12175 4, /* HW_REG_HW_ID */ 12176 7, /* HW_REG_IB_STS */ 12177 }; 12178 12179 /* Store some hardware registers. */ 12180 for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) { 12181 /* "((size - 1) << 11) | register" */ 12182 bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1), 12183 ((20 - 1) << 11) | hw_regs_idx[i]); 12184 12185 bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4), 12186 Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true); 12187 } 12188 12189 program->config->float_mode = program->blocks[0].fp_mode.val; 12190 12191 append_logical_end(ctx.block); 12192 ctx.block->kind |= block_kind_uniform; 12193 bld.sopp(aco_opcode::s_endpgm); 12194 12195 cleanup_cfg(program); 12196} 12197 12198Operand 12199get_arg_fixed(const struct radv_shader_args* args, struct ac_arg arg) 12200{ 12201 assert(arg.used); 12202 12203 enum ac_arg_regfile file = args->ac.args[arg.arg_index].file; 12204 unsigned size = args->ac.args[arg.arg_index].size; 12205 unsigned reg = args->ac.args[arg.arg_index].offset; 12206 12207 return Operand(PhysReg(file == AC_ARG_SGPR ? reg : reg + 256), 12208 RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size)); 12209} 12210 12211unsigned 12212load_vb_descs(Builder& bld, PhysReg dest, Operand base, unsigned start, unsigned max) 12213{ 12214 unsigned count = MIN2((bld.program->dev.sgpr_limit - dest.reg()) / 4u, max); 12215 12216 unsigned num_loads = (count / 4u) + util_bitcount(count & 0x3); 12217 if (bld.program->gfx_level >= GFX10 && num_loads > 1) 12218 bld.sopp(aco_opcode::s_clause, -1, num_loads - 1); 12219 12220 for (unsigned i = 0; i < count;) { 12221 unsigned size = 1u << util_logbase2(MIN2(count - i, 4)); 12222 12223 if (size == 4) 12224 bld.smem(aco_opcode::s_load_dwordx16, Definition(dest, s16), base, 12225 Operand::c32((start + i) * 16u)); 12226 else if (size == 2) 12227 bld.smem(aco_opcode::s_load_dwordx8, Definition(dest, s8), base, 12228 Operand::c32((start + i) * 16u)); 12229 else 12230 bld.smem(aco_opcode::s_load_dwordx4, Definition(dest, s4), base, 12231 Operand::c32((start + i) * 16u)); 12232 12233 dest = dest.advance(size * 16u); 12234 i += size; 12235 } 12236 12237 return count; 12238} 12239 12240Operand 12241calc_nontrivial_instance_id(Builder& bld, const struct radv_shader_args* args, unsigned index, 12242 Operand instance_id, Operand start_instance, PhysReg tmp_sgpr, 12243 PhysReg tmp_vgpr0, PhysReg tmp_vgpr1) 12244{ 12245 bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_sgpr, s2), 12246 get_arg_fixed(args, args->prolog_inputs), Operand::c32(8u + index * 8u)); 12247 12248 wait_imm lgkm_imm; 12249 lgkm_imm.lgkm = 0; 12250 bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(bld.program->gfx_level)); 12251 12252 Definition fetch_index_def(tmp_vgpr0, v1); 12253 Operand fetch_index(tmp_vgpr0, v1); 12254 12255 Operand div_info(tmp_sgpr, s1); 12256 if (bld.program->gfx_level >= GFX8 && bld.program->gfx_level < GFX11) { 12257 /* use SDWA */ 12258 if (bld.program->gfx_level < GFX9) { 12259 bld.vop1(aco_opcode::v_mov_b32, Definition(tmp_vgpr1, v1), div_info); 12260 div_info = Operand(tmp_vgpr1, v1); 12261 } 12262 12263 bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id); 12264 12265 Instruction* instr; 
12266 if (bld.program->gfx_level >= GFX9) 12267 instr = bld.vop2_sdwa(aco_opcode::v_add_u32, fetch_index_def, div_info, fetch_index).instr; 12268 else 12269 instr = bld.vop2_sdwa(aco_opcode::v_add_co_u32, fetch_index_def, Definition(vcc, bld.lm), 12270 div_info, fetch_index) 12271 .instr; 12272 instr->sdwa().sel[0] = SubdwordSel::ubyte1; 12273 12274 bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, Operand(tmp_sgpr.advance(4), s1), 12275 fetch_index); 12276 12277 instr = 12278 bld.vop2_sdwa(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, fetch_index).instr; 12279 instr->sdwa().sel[0] = SubdwordSel::ubyte2; 12280 } else { 12281 Operand tmp_op(tmp_vgpr1, v1); 12282 Definition tmp_def(tmp_vgpr1, v1); 12283 12284 bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id); 12285 12286 bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(8u), Operand::c32(8u)); 12287 bld.vadd32(fetch_index_def, tmp_op, fetch_index, false, Operand(s2), true); 12288 12289 bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, fetch_index, 12290 Operand(tmp_sgpr.advance(4), s1)); 12291 12292 bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(16u), Operand::c32(8u)); 12293 bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, tmp_op, fetch_index); 12294 } 12295 12296 bld.vadd32(fetch_index_def, start_instance, fetch_index, false, Operand(s2), true); 12297 12298 return fetch_index; 12299} 12300 12301void 12302select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key, ac_shader_config* config, 12303 const struct aco_compiler_options* options, 12304 const struct aco_shader_info* info, 12305 const struct radv_shader_args* args, unsigned* num_preserved_sgprs) 12306{ 12307 assert(key->num_attributes > 0); 12308 12309 /* This should be enough for any shader/stage. */ 12310 unsigned max_user_sgprs = options->gfx_level >= GFX9 ? 32 : 16; 12311 *num_preserved_sgprs = max_user_sgprs + 14; 12312 12313 init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode, 12314 config); 12315 12316 Block* block = program->create_and_insert_block(); 12317 block->kind = block_kind_top_level; 12318 12319 program->workgroup_size = 64; 12320 calc_min_waves(program); 12321 12322 Builder bld(program, block); 12323 12324 block->instructions.reserve(16 + key->num_attributes * 4); 12325 12326 bld.sopp(aco_opcode::s_setprio, -1u, 0x3u); 12327 12328 uint32_t attrib_mask = BITFIELD_MASK(key->num_attributes); 12329 bool has_nontrivial_divisors = key->state.nontrivial_divisors & attrib_mask; 12330 12331 wait_imm lgkm_imm; 12332 lgkm_imm.lgkm = 0; 12333 12334 /* choose sgprs */ 12335 PhysReg vertex_buffers(align(*num_preserved_sgprs, 2)); 12336 PhysReg prolog_input = vertex_buffers.advance(8); 12337 PhysReg desc( 12338 align((has_nontrivial_divisors ? 
prolog_input : vertex_buffers).advance(8).reg(), 4)); 12339 12340 Operand start_instance = get_arg_fixed(args, args->ac.start_instance); 12341 Operand instance_id = get_arg_fixed(args, args->ac.instance_id); 12342 12343 PhysReg attributes_start(256 + args->ac.num_vgprs_used); 12344 /* choose vgprs that won't be used for anything else until the last attribute load */ 12345 PhysReg vertex_index(attributes_start.reg() + key->num_attributes * 4 - 1); 12346 PhysReg instance_index(attributes_start.reg() + key->num_attributes * 4 - 2); 12347 PhysReg start_instance_vgpr(attributes_start.reg() + key->num_attributes * 4 - 3); 12348 PhysReg nontrivial_tmp_vgpr0(attributes_start.reg() + key->num_attributes * 4 - 4); 12349 PhysReg nontrivial_tmp_vgpr1(attributes_start.reg() + key->num_attributes * 4); 12350 12351 bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1), 12352 get_arg_fixed(args, args->ac.vertex_buffers)); 12353 if (options->address32_hi >= 0xffff8000 || options->address32_hi <= 0x7fff) { 12354 bld.sopk(aco_opcode::s_movk_i32, Definition(vertex_buffers.advance(4), s1), 12355 options->address32_hi & 0xFFFF); 12356 } else { 12357 bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers.advance(4), s1), 12358 Operand::c32((unsigned)options->address32_hi)); 12359 } 12360 12361 /* calculate vgpr requirements */ 12362 unsigned num_vgprs = attributes_start.reg() - 256; 12363 num_vgprs += key->num_attributes * 4; 12364 if (has_nontrivial_divisors && program->gfx_level <= GFX8) 12365 num_vgprs++; /* make space for nontrivial_tmp_vgpr1 */ 12366 unsigned num_sgprs = 0; 12367 12368 for (unsigned loc = 0; loc < key->num_attributes;) { 12369 unsigned num_descs = 12370 load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, key->num_attributes - loc); 12371 num_sgprs = MAX2(num_sgprs, desc.advance(num_descs * 16u).reg()); 12372 12373 if (loc == 0) { 12374 /* perform setup while we load the descriptors */ 12375 if (key->is_ngg || key->next_stage != MESA_SHADER_VERTEX) { 12376 Operand count = get_arg_fixed(args, args->ac.merged_wave_info); 12377 bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), count, Operand::c32(0u)); 12378 if (program->wave_size == 64) { 12379 bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), count, 12380 Operand::c32(6u /* log2(64) */)); 12381 bld.sop2(aco_opcode::s_cselect_b64, Definition(exec, s2), Operand::c64(UINT64_MAX), 12382 Operand(exec, s2), Operand(scc, s1)); 12383 } 12384 } 12385 12386 bool needs_instance_index = false; 12387 bool needs_start_instance = false; 12388 u_foreach_bit(i, key->state.instance_rate_inputs & attrib_mask) 12389 { 12390 needs_instance_index |= key->state.divisors[i] == 1; 12391 needs_start_instance |= key->state.divisors[i] == 0; 12392 } 12393 bool needs_vertex_index = ~key->state.instance_rate_inputs & attrib_mask; 12394 if (needs_vertex_index) 12395 bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->ac.base_vertex), 12396 get_arg_fixed(args, args->ac.vertex_id), false, Operand(s2), true); 12397 if (needs_instance_index) 12398 bld.vadd32(Definition(instance_index, v1), start_instance, instance_id, false, 12399 Operand(s2), true); 12400 if (needs_start_instance) 12401 bld.vop1(aco_opcode::v_mov_b32, Definition(start_instance_vgpr, v1), start_instance); 12402 } 12403 12404 bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->gfx_level)); 12405 12406 for (unsigned i = 0; i < num_descs; i++, loc++) { 12407 PhysReg dest(attributes_start.reg() + loc * 4u); 12408 12409 /* calculate index */ 12410 Operand 
fetch_index = Operand(vertex_index, v1); 12411 if (key->state.instance_rate_inputs & (1u << loc)) { 12412 uint32_t divisor = key->state.divisors[loc]; 12413 if (divisor) { 12414 fetch_index = instance_id; 12415 if (key->state.nontrivial_divisors & (1u << loc)) { 12416 unsigned index = 12417 util_bitcount(key->state.nontrivial_divisors & BITFIELD_MASK(loc)); 12418 fetch_index = calc_nontrivial_instance_id( 12419 bld, args, index, instance_id, start_instance, prolog_input, 12420 nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1); 12421 } else { 12422 fetch_index = Operand(instance_index, v1); 12423 } 12424 } else { 12425 fetch_index = Operand(start_instance_vgpr, v1); 12426 } 12427 } 12428 12429 /* perform load */ 12430 PhysReg cur_desc = desc.advance(i * 16); 12431 if ((key->misaligned_mask & (1u << loc))) { 12432 unsigned dfmt = key->state.formats[loc] & 0xf; 12433 unsigned nfmt = key->state.formats[loc] >> 4; 12434 const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt); 12435 for (unsigned j = 0; j < vtx_info->num_channels; j++) { 12436 bool post_shuffle = key->state.post_shuffle & (1u << loc); 12437 unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j); 12438 12439 /* Use MUBUF to workaround hangs for byte-aligned dword loads. The Vulkan spec 12440 * doesn't require this to work, but some GL CTS tests over Zink do this anyway. 12441 * MTBUF can hang, but MUBUF doesn't (probably gives garbage, but GL CTS doesn't 12442 * care). 12443 */ 12444 if (vtx_info->chan_format == V_008F0C_BUF_DATA_FORMAT_32) 12445 bld.mubuf(aco_opcode::buffer_load_dword, Definition(dest.advance(j * 4u), v1), 12446 Operand(cur_desc, s4), fetch_index, Operand::c32(0u), offset, false, 12447 false, true); 12448 else 12449 bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1), 12450 Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 12451 vtx_info->chan_format, nfmt, offset, false, true); 12452 } 12453 uint32_t one = 12454 nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT 12455 ? 1u 12456 : 0x3f800000u; 12457 for (unsigned j = vtx_info->num_channels; j < 4; j++) { 12458 bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1), 12459 Operand::c32(j == 3 ? one : 0u)); 12460 } 12461 } else { 12462 bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4), 12463 Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, false, true); 12464 } 12465 } 12466 } 12467 12468 if (key->state.alpha_adjust_lo | key->state.alpha_adjust_hi) { 12469 wait_imm vm_imm; 12470 vm_imm.vm = 0; 12471 bld.sopp(aco_opcode::s_waitcnt, -1, vm_imm.pack(program->gfx_level)); 12472 } 12473 12474 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW. 12475 * so we may need to fix it up. */ 12476 u_foreach_bit(loc, (key->state.alpha_adjust_lo | key->state.alpha_adjust_hi)) 12477 { 12478 PhysReg alpha(attributes_start.reg() + loc * 4u + 3); 12479 12480 unsigned alpha_adjust = (key->state.alpha_adjust_lo >> loc) & 0x1; 12481 alpha_adjust |= ((key->state.alpha_adjust_hi >> loc) & 0x1) << 1; 12482 12483 if (alpha_adjust == ALPHA_ADJUST_SSCALED) 12484 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(alpha, v1), Operand(alpha, v1)); 12485 12486 /* For the integer-like cases, do a natural sign extension. 12487 * 12488 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 12489 * and happen to contain 0, 1, 2, 3 as the two LSBs of the 12490 * exponent. 
12491 */ 12492 unsigned offset = alpha_adjust == ALPHA_ADJUST_SNORM ? 23u : 0u; 12493 bld.vop3(aco_opcode::v_bfe_i32, Definition(alpha, v1), Operand(alpha, v1), 12494 Operand::c32(offset), Operand::c32(2u)); 12495 12496 /* Convert back to the right type. */ 12497 if (alpha_adjust == ALPHA_ADJUST_SNORM) { 12498 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1)); 12499 bld.vop2(aco_opcode::v_max_f32, Definition(alpha, v1), Operand::c32(0xbf800000u), 12500 Operand(alpha, v1)); 12501 } else if (alpha_adjust == ALPHA_ADJUST_SSCALED) { 12502 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1)); 12503 } 12504 } 12505 12506 block->kind |= block_kind_uniform; 12507 12508 /* continue on to the main shader */ 12509 Operand continue_pc = get_arg_fixed(args, args->prolog_inputs); 12510 if (has_nontrivial_divisors) { 12511 bld.smem(aco_opcode::s_load_dwordx2, Definition(prolog_input, s2), 12512 get_arg_fixed(args, args->prolog_inputs), Operand::c32(0u)); 12513 bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->gfx_level)); 12514 continue_pc = Operand(prolog_input, s2); 12515 } 12516 12517 bld.sop1(aco_opcode::s_setpc_b64, continue_pc); 12518 12519 program->config->float_mode = program->blocks[0].fp_mode.val; 12520 /* addition on GFX6-8 requires a carry-out (we use VCC) */ 12521 program->needs_vcc = program->gfx_level <= GFX8; 12522 program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs); 12523 program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs); 12524} 12525 12526void 12527select_ps_epilog(Program* program, const struct aco_ps_epilog_key* key, ac_shader_config* config, 12528 const struct aco_compiler_options* options, 12529 const struct aco_shader_info* info, 12530 const struct radv_shader_args* args) 12531{ 12532 isel_context ctx = setup_isel_context(program, 0, NULL, config, options, info, args, false, true); 12533 12534 ctx.block->fp_mode = program->next_fp_mode; 12535 12536 add_startpgm(&ctx); 12537 append_logical_start(ctx.block); 12538 12539 Builder bld(ctx.program, ctx.block); 12540 12541 /* Export all color render targets */ 12542 bool exported = false; 12543 12544 for (unsigned i = 0; i < 8; i++) { 12545 unsigned col_format = (key->spi_shader_col_format >> (i * 4)) & 0xf; 12546 12547 if (col_format == V_028714_SPI_SHADER_ZERO) 12548 continue; 12549 12550 struct mrt_color_export out; 12551 12552 out.slot = i; 12553 out.write_mask = 0xf; 12554 out.col_format = col_format; 12555 out.is_int8 = (key->color_is_int8 >> i) & 1; 12556 out.is_int10 = (key->color_is_int10 >> i) & 1; 12557 out.enable_mrt_output_nan_fixup = (key->enable_mrt_output_nan_fixup >> i) & 1; 12558 12559 Temp inputs = get_arg(&ctx, ctx.args->ps_epilog_inputs[i]); 12560 for (unsigned c = 0; c < 4; ++c) { 12561 out.values[c] = Operand(emit_extract_vector(&ctx, inputs, c, v1)); 12562 } 12563 12564 exported |= export_fs_mrt_color(&ctx, &out, true); 12565 } 12566 12567 if (!exported) 12568 create_fs_null_export(&ctx); 12569 12570 program->config->float_mode = program->blocks[0].fp_mode.val; 12571 12572 append_logical_end(ctx.block); 12573 ctx.block->kind |= block_kind_export_end; 12574 bld.reset(ctx.block); 12575 bld.sopp(aco_opcode::s_endpgm); 12576 12577 cleanup_cfg(program); 12578} 12579} // namespace aco 12580
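/* Overview of the entry points defined above (orientation only; the driver-side plumbing
 * that calls them lives outside this file, e.g. in aco_interface.cpp, and may differ
 * between Mesa versions):
 *
 *   select_program()             - isel for one or more NIR stages merged into a single
 *                                  HW stage; ends the program with s_endpgm.
 *   select_gs_copy_shader()      - legacy (non-NGG) GS: reads the GSVS ring back and
 *                                  exports the outputs like a VS.
 *   select_trap_handler_shader() - GFX8-only trap handler that dumps TTMP0-1 and a few
 *                                  hardware registers to the TMA buffer.
 *   select_vs_prolog()           - loads vertex attributes into fixed VGPRs, then jumps
 *                                  to the main shader with s_setpc_b64.
 *   select_ps_epilog()           - takes color values from fixed arguments and performs
 *                                  the final MRT exports.
 *
 * All of these only build ACO's pseudo-IR; scheduling, register allocation and assembly
 * are handled by the later ACO passes. */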