Lines Matching refs:ctx
49 #define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)
52 _isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr,
65 _aco_err(ctx->program, file, line, out);
95 static bool visit_cf_list(struct isel_context* ctx, struct exec_list* list);
129 get_ssa_temp(struct isel_context* ctx, nir_ssa_def* def)
131 uint32_t id = ctx->first_temp_id + def->index;
132 return Temp(id, ctx->program->temp_rc[id]);
136 emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero())
138 Builder bld(ctx->program, ctx->block);
142 if (ctx->program->wave_size == 32) {
163 if (ctx->program->gfx_level <= GFX7)
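
A minimal standalone sketch (C++20; mbcnt_ref is an illustrative name, not from the source) of the semantics emit_mbcnt assembles out of v_mbcnt_lo/hi_u32_b32: each lane gets the count of mask bits set strictly below its own lane id, plus an optional base. The wave_size == 32 branch at 142 needs only the low half; wave64 chains lo into hi.

   #include <bit>
   #include <cstdint>

   uint32_t mbcnt_ref(uint64_t mask, unsigned lane, uint32_t base = 0)
   {
      /* bits of `mask` belonging to lanes strictly below `lane` */
      uint64_t below = lane ? mask & ((uint64_t(1) << lane) - 1) : 0;
      return base + std::popcount(below);
   }
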
188 emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
193 if (ctx->options->gfx_level <= GFX7) {
202 } else if (ctx->options->gfx_level >= GFX10 && ctx->program->wave_size == 64) {
222 ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;
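
For reference, a hedged sketch (bpermute_ref is illustrative, not ACO code) of the lane shuffle emit_bpermute lowers: every lane reads `data` from the lane named by its own `index`. The <= GFX7 branch at 193 must emulate this, and the GFX10 wave64 path at 202 reserves shared VGPRs (222) because ds_bpermute there only shuffles within 32-lane halves.

   #include <cstdint>
   #include <vector>

   std::vector<uint32_t> bpermute_ref(const std::vector<uint32_t>& data,
                                      const std::vector<uint32_t>& index)
   {
      std::vector<uint32_t> out(data.size());
      for (size_t lane = 0; lane < data.size(); lane++)
         out[lane] = data[index[lane] % data.size()]; /* wraps like the hw lane mask */
      return out;
   }
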
234 emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask)
236 if (ctx->options->gfx_level >= GFX8) {
254 } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x18) == 0x18 && or_mask < 8 &&
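
The masked ds_swizzle addressing that emit_masked_swizzle targets on GFX8+ (the and/or/xor fields tested at 254 are packed into one 15-bit offset) summarized as a standalone helper (illustrative name, mirroring the field extraction in the source):

   /* source lane within each group of 32 lanes */
   unsigned swizzle_src_lane_ref(unsigned lane, unsigned mask15)
   {
      unsigned and_mask = mask15 & 0x1f;
      unsigned or_mask = (mask15 >> 5) & 0x1f;
      unsigned xor_mask = (mask15 >> 10) & 0x1f;
      return (((lane & and_mask) | or_mask) ^ xor_mask) & 0x1f;
   }
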
281 as_vgpr(isel_context* ctx, Temp val)
283 Builder bld(ctx->program, ctx->block);
289 emit_v_div_u32(isel_context* ctx, Temp dst, Temp a, uint32_t b)
292 Builder bld(ctx->program, ctx->block);
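
emit_v_div_u32 divides by a constant known at compile time (e.g. the instance-rate divisor at 5524) without a hardware divider: powers of two become shifts, everything else uses the multiply-high pattern. A self-contained instance for b = 3 with the standard magic value compilers emit (div3_ref is illustrative, not the source's generic path):

   #include <cstdint>

   uint32_t div3_ref(uint32_t a)
   {
      uint32_t hi = (uint32_t)(((uint64_t)a * 0xAAAAAAABu) >> 32); /* v_mul_hi_u32 */
      return hi >> 1;                                              /* post-shift */
   }
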
340 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
342 Builder bld(ctx->program, ctx->block);
347 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
356 Builder bld(ctx->program, ctx->block);
357 auto it = ctx->allocated_vec.find(src.id());
358 if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
369 src = as_vgpr(ctx, src);
376 emit_extract_vector(ctx, src, idx, dst);
382 emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
386 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
392 emit_split_vector(ctx, vec_src, vec_src.size());
405 elems[i] = ctx->program->allocateTmp(rc);
408 ctx->block->instructions.emplace_back(std::move(split));
409 ctx->allocated_vec.emplace(vec_src.id(), elems);
415 expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask,
419 Builder bld(ctx->program, ctx->block);
423 expand_vector(ctx, vec_src, tmp_dst, num_components, mask, zero_padding);
425 ctx->allocated_vec[dst.id()] = ctx->allocated_vec[tmp_dst.id()];
429 emit_split_vector(ctx, vec_src, util_bitcount(mask));
458 Temp src = emit_extract_vector(ctx, vec_src, k++, src_rc);
468 ctx->block->instructions.emplace_back(std::move(vec));
469 ctx->allocated_vec.emplace(dst.id(), elems);
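
The allocated_vec lookups above (357, 386) and inserts (409, 469) implement one memoization idea: splitting a vector once records its element temporaries, so later extracts cost no new instructions. A toy sketch with plain ids standing in for ACO Temps (VecCache and next_id are illustrative):

   #include <array>
   #include <cstdint>
   #include <unordered_map>

   struct VecCache {
      std::unordered_map<uint32_t, std::array<uint32_t, 4>> allocated_vec;
      uint32_t next_id = 100;

      uint32_t extract(uint32_t vec_id, unsigned idx)
      {
         auto it = allocated_vec.find(vec_id);
         if (it == allocated_vec.end()) {
            std::array<uint32_t, 4> elems{};
            for (auto& e : elems)
               e = next_id++; /* stands in for the p_split_vector definitions */
            it = allocated_vec.emplace(vec_id, elems).first;
         }
         return it->second[idx]; /* cache hit: no new instruction */
      }
   };
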
474 byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst)
476 Builder bld(ctx->program, ctx->block);
497 emit_split_vector(ctx, dst, 2);
499 emit_extract_vector(ctx, tmp, 0, dst);
520 emit_split_vector(ctx, dst, 2);
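
The arithmetic byte_align_scalar performs: SMEM loads are dword-aligned, so a value at byte offset 1..3 straddles two dwords and is recovered with a funnel shift (alignbit/lshr in the real code). Standalone sketch (byte_align_ref is illustrative):

   #include <cstdint>

   uint32_t byte_align_ref(uint32_t lo, uint32_t hi, unsigned byte_offset /* 0..3 */)
   {
      uint64_t pair = ((uint64_t)hi << 32) | lo;
      return (uint32_t)(pair >> (8 * byte_offset));
   }
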
525 byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
527 Builder bld(ctx->program, ctx->block);
557 emit_split_vector(ctx, dst, num_components);
561 emit_split_vector(ctx, vec, num_components);
568 elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);
583 byte_align_scalar(ctx, vec, offset, dst);
589 ctx->allocated_vec.emplace(dst.id(), elems);
593 get_ssa_temp_tex(struct isel_context* ctx, nir_ssa_def* def, bool is_16bit)
596 Temp tmp = get_ssa_temp(ctx, def);
598 return emit_extract_vector(ctx, tmp, 0, rc);
604 bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2))
606 Builder bld(ctx->program, ctx->block);
618 bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1))
620 Builder bld(ctx->program, ctx->block);
642 convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits,
704 extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode)
706 Temp vec = get_ssa_temp(ctx, src->src.ssa);
712 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
716 Builder bld(ctx->program, ctx->block);
727 convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);
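
The value semantics of convert_int at 642, reduced to one 32-bit lane (convert_int_ref is illustrative; the real helper also produces 64-bit results by pairing registers, as the 32 -> 64 call at 727 shows):

   #include <cstdint>

   uint32_t convert_int_ref(uint32_t src, unsigned src_bits, bool sign_extend)
   {
      if (src_bits >= 32)
         return src;
      uint32_t lo_mask = (1u << src_bits) - 1;
      uint32_t v = src & lo_mask;
      if (sign_extend && (v >> (src_bits - 1)))
         v |= ~lo_mask; /* replicate the sign bit upward */
      return v;
   }
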
733 get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1)
736 return get_ssa_temp(ctx, src.src.ssa);
738 Temp vec = get_ssa_temp(ctx, src.src.ssa);
747 return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));
754 return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src,
760 vec = as_vgpr(ctx, vec);
765 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
772 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
775 Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4));
777 ctx->block->instructions.emplace_back(std::move(vec_instr));
778 ctx->allocated_vec.emplace(dst.id(), elems);
779 return vec.type() == RegType::sgpr ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst;
784 get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)
793 Temp tmp = get_ssa_temp(ctx, src.src.ssa);
803 auto it = ctx->allocated_vec.find(tmp.id());
804 if (it != ctx->allocated_vec.end()) {
806 Builder bld(ctx->program, ctx->block);
811 return emit_extract_vector(ctx, tmp, dword, v1);
816 return emit_extract_vector(ctx, tmp, dword * 2, v2b);
821 get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx)
825 return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config);
829 convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false)
833 Builder bld(ctx->program, ctx->block);
837 Operand::c32((unsigned)ctx->options->address32_hi));
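
What convert_pointer_to_64_bit computes, as plain arithmetic: the 32-bit pointer is paired with the driver-known high address bits (options->address32_hi, used at 837). Illustrative helper:

   #include <cstdint>

   uint64_t widen_pointer_ref(uint32_t ptr, uint32_t address32_hi)
   {
      return ((uint64_t)address32_hi << 32) | ptr;
   }
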
841 emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
846 sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
847 sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
852 sop2->definitions[1] = Definition(ctx->program->allocateId(s1), scc, s1);
856 uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
864 ctx->block->instructions.emplace_back(std::move(sop2));
868 emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst,
872 Builder bld(ctx->program, ctx->block);
875 Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
876 Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
883 src1 = as_vgpr(ctx, src1);
891 uint32_t src_ub = get_alu_src_ub(ctx, instr, swap_srcs ? !i : i);
899 if (flush_denorms && ctx->program->gfx_level < GFX9) {
913 emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
915 Builder bld(ctx->program, ctx->block);
918 Temp src0 = get_alu_src(ctx, instr->src[0]);
919 Temp src1 = get_alu_src(ctx, instr->src[1]);
938 emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
945 src[i] = get_alu_src(ctx, instr->src[swap_srcs ? 1 - i : i]);
947 src[i] = as_vgpr(ctx, src[i]);
952 Builder bld(ctx->program, ctx->block);
954 if (flush_denorms && ctx->program->gfx_level < GFX9) {
972 emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
975 Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);
976 Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);
978 src1 = as_vgpr(ctx, src1);
987 Builder bld(ctx->program, ctx->block);
994 emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp)
999 src[i] = get_alu_src(ctx, instr->src[i]);
1001 src[i] = as_vgpr(ctx, src[i]);
1006 Builder bld(ctx->program, ctx->block);
1012 emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
1014 Builder bld(ctx->program, ctx->block);
1018 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
1020 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
1024 emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
1026 Temp src0 = get_alu_src(ctx, instr->src[0]);
1027 Temp src1 = get_alu_src(ctx, instr->src[1]);
1059 src1 = as_vgpr(ctx, src1);
1063 Builder bld(ctx->program, ctx->block);
1068 emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
1070 Temp src0 = get_alu_src(ctx, instr->src[0]);
1071 Temp src1 = get_alu_src(ctx, instr->src[1]);
1072 Builder bld(ctx->program, ctx->block);
1082 bool_to_vector_condition(ctx, cmp, dst);
1086 emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op,
1097 get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr ||
1098 get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr;
1101 assert(dst.regClass() == ctx->program->lane_mask);
1104 emit_vopc_instruction(ctx, instr, op, dst);
1106 emit_sopc_instruction(ctx, instr, op, dst);
1110 emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op,
1113 Builder bld(ctx->program, ctx->block);
1114 Temp src0 = get_alu_src(ctx, instr->src[0]);
1115 Temp src1 = get_alu_src(ctx, instr->src[1]);
1125 emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)
1127 Builder bld(ctx->program, ctx->block);
1128 Temp cond = get_alu_src(ctx, instr->src[0]);
1129 Temp then = get_alu_src(ctx, instr->src[1]);
1130 Temp els = get_alu_src(ctx, instr->src[2]);
1137 then = as_vgpr(ctx, then);
1138 els = as_vgpr(ctx, els);
1170 bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
1193 emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode op,
1197 Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.def(bld.lm), as_vgpr(ctx, val),
1209 emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1211 if (ctx->block->fp_mode.denorm32 == 0) {
1216 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
1220 emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1222 if (ctx->block->fp_mode.denorm32 == 0) {
1227 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
1231 emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1233 if (ctx->block->fp_mode.denorm32 == 0) {
1238 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
1242 emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1244 if (ctx->block->fp_mode.denorm32 == 0) {
1249 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
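
The numeric idea behind emit_scaled_op and its callers, shown for rcp (rcp_denorm_safe_ref is illustrative): when fp32 denormals are flushed (the denorm32 == 0 tests at 1211/1222/1233/1244), a denormal input is pre-scaled by 2^24 (0x4b800000) into the normal range, the raw op runs, and a per-op constant compensates -- 2^24 again for rcp, 2^12 (0x45800000) for rsq, 2^-12 (0x39800000) for sqrt, and -24.0 (0xc1c00000) to cancel the +24 the scale adds to a log2.

   float rcp_denorm_safe_ref(float x)
   {
      const float scale = 16777216.0f; /* 2^24 == 0x4b800000 */
      return (1.0f / (x * scale)) * scale;
   }
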
1253 emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1255 if (ctx->options->gfx_level >= GFX7)
1261 val = as_vgpr(ctx, val);
1304 emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1306 if (ctx->options->gfx_level >= GFX7)
1311 Temp src0 = as_vgpr(ctx, val);
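
A behavioral sketch (not the exact pre-GFX7 instruction sequence) of what emit_floor_f64 has to build on top of emit_trunc_f64 when v_floor_f64 is unavailable (the >= GFX7 checks at 1255/1306): truncation rounds toward zero, so negative fractional inputs need a -1 correction.

   #include <cmath>

   double floor_from_trunc_ref(double x)
   {
      double t = std::trunc(x);
      return (x < 0.0 && x != t) ? t - 1.0 : t;
   }
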
1376 visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
1382 Builder bld(ctx->program, ctx->block);
1384 Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
1395 elems[i] = get_alu_src(ctx, instr->src[i]);
1403 elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc);
1407 ctx->block->instructions.emplace_back(std::move(vec));
1408 ctx->allocated_vec.emplace(dst.id(), elems);
1410 bool use_s_pack = ctx->program->gfx_level >= GFX9;
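
The use_s_pack flag at 1410 gates building 32-bit SGPR vector elements from 16-bit halves with s_pack_ll_b32_b16 on GFX9+; its semantics as an illustrative helper:

   #include <cstdint>

   uint32_t s_pack_ll_ref(uint32_t lo, uint32_t hi)
   {
      return (lo & 0xffffu) | ((hi & 0xffffu) << 16);
   }
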
1479 Temp src = get_alu_src(ctx, instr->src[0]);
1491 Temp src = get_alu_src(ctx, instr->src[0]);
1493 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1510 Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
1520 Temp src = get_alu_src(ctx, instr->src[0]);
1526 } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1531 src = as_vgpr(ctx, src);
1540 Temp src = get_alu_src(ctx, instr->src[0]);
1549 if (ctx->program->gfx_level >= GFX8)
1560 } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) {
1563 src = as_vgpr(ctx, src);
1567 Temp upper = emit_extract_vector(ctx, src, 1, v1);
1579 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1580 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst);
1582 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true);
1584 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst);
1586 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1588 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1595 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1596 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst);
1598 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true);
1600 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst);
1602 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1604 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1611 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1612 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst);
1614 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true);
1616 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst);
1618 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1620 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1627 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1628 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst);
1630 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true);
1632 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst);
1634 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1636 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1644 emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1646 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1648 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1650 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1652 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1660 emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1662 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1664 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1666 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1668 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1676 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1678 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1680 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1682 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1684 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1691 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1692 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true);
1694 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true);
1696 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true);
1698 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1699 } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
1700 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1701 get_alu_src(ctx, instr->src[0]));
1703 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst);
1705 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1707 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1714 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1715 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true);
1717 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true);
1719 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true);
1721 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false,
1723 } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
1724 bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1725 get_alu_src(ctx, instr->src[0]));
1727 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst);
1729 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1);
1731 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1738 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1739 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true);
1741 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true);
1743 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true);
1745 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1746 } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
1747 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1748 get_alu_src(ctx, instr->src[0]));
1750 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst);
1752 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1754 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1761 Temp src = get_alu_src(ctx, instr->src[0]);
1765 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1775 Temp src = get_alu_src(ctx, instr->src[0]);
1795 emit_vop1_instruction(ctx, instr, op, msb_rev);
1824 Temp src = get_alu_src(ctx, instr->src[0]);
1838 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1840 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1848 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1850 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
1851 emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);
1853 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
1854 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
1857 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1861 Temp src0 = get_alu_src(ctx, instr->src[0]);
1862 Temp src1 = get_alu_src(ctx, instr->src[1]);
1896 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1900 Temp src0 = get_alu_src(ctx, instr->src[0]);
1901 Temp src1 = get_alu_src(ctx, instr->src[1]);
1910 if (ctx->program->gfx_level >= GFX10) {
1916 bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
1955 if (ctx->program->gfx_level >= GFX8) {
1958 as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
1979 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_i16, dst);
1983 Temp src0 = get_alu_src(ctx, instr->src[0]);
1984 Temp src1 = get_alu_src(ctx, instr->src[1]);
1996 src1 = as_vgpr(ctx, src1);
2012 Temp src0 = get_alu_src(ctx, instr->src[0]);
2013 Temp src1 = get_alu_src(ctx, instr->src[1]);
2052 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
2055 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
2059 Temp src0 = get_alu_src(ctx, instr->src[0]);
2060 Temp src1 = get_alu_src(ctx, instr->src[1]);
2065 if (ctx->program->gfx_level >= GFX10)
2068 bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));
2069 else if (ctx->program->gfx_level >= GFX8)
2070 bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));
2100 Temp src0 = get_alu_src(ctx, instr->src[0]);
2101 Temp src1 = get_alu_src(ctx, instr->src[1]);
2139 Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
2143 Temp src0 = get_alu_src(ctx, instr->src[0]);
2144 Temp src1 = get_alu_src(ctx, instr->src[1]);
2152 if (ctx->program->gfx_level >= GFX10) {
2160 sub_instr = bld.vop2_e64(op, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
2165 usub32_sat(bld, Definition(dst), src0, as_vgpr(ctx, src1));
2198 if (ctx->program->gfx_level >= GFX8) {
2201 as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
2221 Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_i16, dst);
2225 Temp src0 = get_alu_src(ctx, instr->src[0]);
2226 Temp src1 = get_alu_src(ctx, instr->src[1]);
2238 src1 = as_vgpr(ctx, src1);
2254 if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
2255 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
2256 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
2257 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
2259 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst);
2261 uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
2262 uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
2266 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst,
2269 bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]),
2272 bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]),
2275 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);
2278 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
2285 if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
2286 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false);
2288 uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
2289 uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
2293 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true);
2295 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp);
2307 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst);
2308 } else if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
2309 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false);
2311 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
2312 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
2321 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
2323 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst);
2325 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
2327 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64, dst);
2335 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_legacy_f32, dst, true);
2343 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
2345 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2347 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
2349 emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64, dst);
2357 Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2364 Temp src0 = get_alu_src(ctx, instr->src[0]);
2365 Temp src1 = get_alu_src(ctx, instr->src[1]);
2368 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
2370 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
2373 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
2375 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
2377 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), as_vgpr(ctx, src0),
2378 as_vgpr(ctx, src1));
2387 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f16, dst, false, 3);
2391 Temp src0 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[0]));
2392 Temp src1 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[1]));
2393 Temp src2 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[2]));
2404 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f32, dst,
2405 ctx->block->fp_mode.must_flush_denorms32, 3);
2407 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f64, dst, false, 3);
2415 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_legacy_f32, dst,
2416 ctx->block->fp_mode.must_flush_denorms32, 3);
2425 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
2427 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst);
2429 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false,
2430 ctx->block->fp_mode.must_flush_denorms32);
2432 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst,
2433 ctx->block->fp_mode.must_flush_denorms16_64);
2442 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
2444 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true);
2446 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false,
2447 ctx->block->fp_mode.must_flush_denorms32);
2449 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst,
2450 ctx->block->fp_mode.must_flush_denorms16_64);
2457 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false);
2461 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true);
2465 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false);
2469 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true);
2473 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false);
2477 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true);
2481 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false);
2485 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true);
2489 Temp in = get_alu_src(ctx, instr->src[0], 3);
2490 Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2491 emit_extract_vector(ctx, in, 2, v1)};
2504 Temp in = get_alu_src(ctx, instr->src[0], 3);
2505 Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2506 emit_extract_vector(ctx, in, 2, v1)};
2511 emit_bcsel(ctx, instr, dst);
2516 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
2518 Temp src = get_alu_src(ctx, instr->src[0]);
2519 emit_rsq(ctx, bld, Definition(dst), src);
2522 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
2530 Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2538 Temp src = get_alu_src(ctx, instr->src[0]);
2540 bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src));
2543 as_vgpr(ctx, src));
2545 if (ctx->block->fp_mode.must_flush_denorms16_64)
2547 as_vgpr(ctx, src));
2559 Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2568 Temp src = get_alu_src(ctx, instr->src[0]);
2571 Operand::c16(0x3c00), as_vgpr(ctx, src))
2576 Operand::c32(0x3f800000u), as_vgpr(ctx, src))
2580 if (ctx->block->fp_mode.must_flush_denorms16_64)
2582 as_vgpr(ctx, src));
2594 Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2601 Temp src = get_alu_src(ctx, instr->src[0]);
2621 emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2623 Temp src = get_alu_src(ctx, instr->src[0]);
2624 emit_log2(ctx, bld, Definition(dst), src);
2632 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2634 Temp src = get_alu_src(ctx, instr->src[0]);
2635 emit_rcp(ctx, bld, Definition(dst), src);
2638 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2646 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2648 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2656 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2658 Temp src = get_alu_src(ctx, instr->src[0]);
2659 emit_sqrt(ctx, bld, Definition(dst), src);
2662 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2670 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2672 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2674 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2682 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2684 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2686 Temp src = get_alu_src(ctx, instr->src[0]);
2687 emit_floor_f64(ctx, bld, Definition(dst), src);
2695 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2697 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2699 if (ctx->options->gfx_level >= GFX7) {
2700 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2707 Temp src0 = get_alu_src(ctx, instr->src[0]);
2708 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
2727 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2729 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2731 Temp src = get_alu_src(ctx, instr->src[0]);
2732 emit_trunc_f64(ctx, bld, Definition(dst), src);
2740 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2742 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2744 if (ctx->options->gfx_level >= GFX7) {
2745 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2749 Temp src0 = get_alu_src(ctx, instr->src[0]);
2756 bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi));
2775 as_vgpr(ctx, src0_lo), cond);
2777 as_vgpr(ctx, src0_hi), cond);
2788 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2796 if (ctx->options->gfx_level < GFX9)
2809 emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2811 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst);
2813 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst);
2821 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst);
2823 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst);
2825 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst);
2833 Temp src = get_alu_src(ctx, instr->src[0]);
2836 convert_int(ctx, bld, tmp, 8, 32, true, dst);
2838 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst);
2840 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst);
2847 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2849 assert(ctx->program->gfx_level >= GFX9);
2864 emit_extract_vector(ctx, src, 1, v1), cond);
2878 Temp src = get_alu_src(ctx, instr->src[0]);
2881 if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
2891 Temp src = get_alu_src(ctx, instr->src[0]);
2894 if (ctx->block->fp_mode.round16_64 == fp_round_tz)
2896 else if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
2899 bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src));
2904 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2906 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2913 Temp src = get_alu_src(ctx, instr->src[0]);
2921 Temp src = get_alu_src(ctx, instr->src[0]);
2925 unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
2927 src = convert_int(ctx, bld, src, input_size, target_size, true);
2935 src = convert_int(ctx, bld, src, 64, 32, false);
2938 if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
2956 Temp src = get_alu_src(ctx, instr->src[0]);
2961 src = convert_int(ctx, bld, src, input_size, 32, true);
2980 Temp src = get_alu_src(ctx, instr->src[0]);
2982 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2985 Temp src = get_alu_src(ctx, instr->src[0]);
3001 Temp src = get_alu_src(ctx, instr->src[0]);
3005 unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
3007 src = convert_int(ctx, bld, src, input_size, target_size, false);
3015 src = convert_int(ctx, bld, src, 64, 32, false);
3018 if (ctx->program->gfx_level >= GFX8) {
3031 Temp src = get_alu_src(ctx, instr->src[0]);
3037 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3054 Temp src = get_alu_src(ctx, instr->src[0]);
3056 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3059 Temp src = get_alu_src(ctx, instr->src[0]);
3075 if (ctx->program->gfx_level >= GFX8) {
3076 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
3080 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3082 tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false,
3089 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3091 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3098 if (ctx->program->gfx_level >= GFX8) {
3099 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
3103 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3105 tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false,
3112 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3114 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3119 Temp src = get_alu_src(ctx, instr->src[0]);
3129 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3131 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3138 Temp src = get_alu_src(ctx, instr->src[0]);
3148 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3150 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3157 Temp src = get_alu_src(ctx, instr->src[0]);
3173 if (ctx->program->gfx_level >= GFX8)
3232 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
3236 Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
3252 Temp src = get_alu_src(ctx, instr->src[0]);
3269 if (ctx->program->gfx_level >= GFX8)
3322 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
3326 Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
3342 Temp src = get_alu_src(ctx, instr->src[0]);
3346 src = bool_to_scalar_condition(ctx, src);
3357 Temp src = get_alu_src(ctx, instr->src[0]);
3361 src = bool_to_scalar_condition(ctx, src);
3372 Temp src = get_alu_src(ctx, instr->src[0]);
3376 src = bool_to_scalar_condition(ctx, src);
3398 extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3402 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize,
3416 extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3418 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size,
3428 Temp src = get_alu_src(ctx, instr->src[0]);
3433 bool_to_scalar_condition(ctx, src, tmp);
3447 Temp src = get_alu_src(ctx, instr->src[0]);
3458 if (src.regClass() == s2 && ctx->program->gfx_level <= GFX7) {
3467 bool_to_vector_condition(ctx, tmp, dst);
3474 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3475 emit_split_vector(ctx, dst, instr->op == nir_op_unpack_64_4x16 ? 4 : 2);
3478 Temp src0 = get_alu_src(ctx, instr->src[0]);
3479 Temp src1 = get_alu_src(ctx, instr->src[1]);
3486 get_alu_src(ctx, instr->src[0]));
3490 get_alu_src(ctx, instr->src[0]));
3495 get_alu_src(ctx, instr->src[0]));
3497 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3503 get_alu_src(ctx, instr->src[0]));
3506 get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u),
3511 Temp src0 = get_alu_src(ctx, instr->src[0]);
3512 Temp src1 = get_alu_src(ctx, instr->src[1]);
3514 src0 = emit_extract_vector(ctx, src0, 0, v2b);
3515 src1 = emit_extract_vector(ctx, src1, 0, v2b);
3526 case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0], 4)); break;
3529 if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
3530 emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst);
3532 emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false);
3540 Temp src = get_alu_src(ctx, instr->src[0], 2);
3541 Temp src0 = emit_extract_vector(ctx, src, 0, v1);
3542 Temp src1 = emit_extract_vector(ctx, src, 1, v1);
3550 Temp src = get_alu_src(ctx, instr->src[0], 2);
3551 Temp src0 = emit_extract_vector(ctx, src, 0, v1);
3552 Temp src1 = emit_extract_vector(ctx, src, 1, v1);
3560 Temp src = get_alu_src(ctx, instr->src[0]);
3564 assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3574 Temp src = get_alu_src(ctx, instr->src[0]);
3582 assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3592 emit_vop3a_instruction(ctx, instr, aco_opcode::v_sad_u8, dst, false, 3u, false);
3596 Temp src = get_alu_src(ctx, instr->src[0]);
3600 if (ctx->program->gfx_level >= GFX8) {
3618 if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) {
3620 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src));
3628 Temp bits = get_alu_src(ctx, instr->src[0]);
3629 Temp offset = get_alu_src(ctx, instr->src[1]);
3644 Temp bitmask = get_alu_src(ctx, instr->src[0]);
3645 Temp insert = get_alu_src(ctx, instr->src[1]);
3646 Temp base = get_alu_src(ctx, instr->src[2]);
3671 emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3);
3683 Temp base = get_alu_src(ctx, instr->src[0]);
3695 Temp offset = get_alu_src(ctx, instr->src[1]);
3696 Temp bits = get_alu_src(ctx, instr->src[2]);
3719 emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3);
3733 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3735 Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa);
3738 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
3745 Temp src = get_alu_src(ctx, instr->src[0]);
3748 src = emit_extract_vector(ctx, src, index / comp, RegClass(src.type(), 1));
3757 src = emit_extract_vector(ctx, src, 0, def.regClass());
3774 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3776 Temp src = get_alu_src(ctx, instr->src[0]);
3780 src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1));
3789 src = emit_extract_vector(ctx, src, 0, def.regClass());
3803 Temp src = get_alu_src(ctx, instr->src[0]);
3809 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1),
3811 emit_extract_vector(ctx, src, 0, v1), Operand::zero()));
3820 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32,
3825 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32,
3830 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32,
3835 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32,
3840 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32,
3845 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32,
3851 emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
3854 ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32,
3856 ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
3861 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
3864 ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32,
3866 ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
3870 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32,
3875 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32,
3893 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
3910 if (ctx->program->gfx_level >= GFX8) {
3926 visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
3928 Temp dst = get_ssa_temp(ctx, &instr->def);
3936 Builder bld(ctx->program, ctx->block);
3962 ctx->block->instructions.emplace_back(std::move(vec));
4013 emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
4136 emit_split_vector(ctx, info.dst, info.num_components);
4147 byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0,
4155 byte_align_scalar(ctx, val, byte_align_off, info.dst);
4157 byte_align_vector(ctx, val, byte_align_off, info.dst, component_size);
4238 ctx->allocated_vec.emplace(info.dst.id(), allocated_vec);
4690 load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst,
4695 Builder bld(ctx->program, ctx->block);
4697 LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
4702 emit_load(ctx, bld, info, lds_load_params);
4708 split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes,
4714 Builder bld(ctx->program, ctx->block);
4721 dst[0] = as_vgpr(ctx, src);
4737 auto it = ctx->allocated_vec.find(src.id());
4738 if (it != ctx->allocated_vec.end()) {
4759 src = as_vgpr(ctx, src);
4781 dst[i] = as_vgpr(ctx, temps[idx++]);
4821 store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address,
4827 Builder bld(ctx->program, ctx->block);
4828 bool large_ds_write = ctx->options->gfx_level >= GFX7;
4829 bool usable_write2 = ctx->options->gfx_level >= GFX7;
4889 split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data);
4955 split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
4983 if ((ctx->program->gfx_level == GFX6 || smem) && byte == 12)
4999 split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data);
5012 create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type,
5015 Builder bld(ctx->program, ctx->block);
5042 emit_split_vector(ctx, dst, split_cnt);
5044 ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
5071 emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
5076 assert(vdata.size() != 3 || ctx->program->gfx_level != GFX6);
5079 Builder bld(ctx->program, ctx->block);
5083 Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
5085 bool glc = ctx->program->gfx_level < GFX11;
5096 store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
5101 Builder bld(ctx->program, ctx->block);
5109 split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask, allow_combining ? 16 : 4,
5114 emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync,
5120 load_vmem_mubuf(isel_context* ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
5129 Builder bld(ctx->program, ctx->block);
5141 emit_load(ctx, bld, info, mubuf_load_params);
5145 wave_id_in_threadgroup(isel_context* ctx)
5147 Builder bld(ctx->program, ctx->block);
5149 get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(24u | (4u << 16)));
5153 thread_id_in_threadgroup(isel_context* ctx)
5157 Builder bld(ctx->program, ctx->block);
5158 Temp tid_in_wave = emit_mbcnt(ctx, bld.tmp(v1));
5160 if (ctx->program->workgroup_size <= ctx->program->wave_size)
5163 Temp wave_id_in_tg = wave_id_in_threadgroup(ctx);
5166 Operand::c32(ctx->program->wave_size == 64 ? 6u : 5u));
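
The arithmetic of wave_id_in_threadgroup and thread_id_in_threadgroup in one illustrative helper: the s_bfe operand 24u | (4u << 16) at 5149 extracts a 4-bit wave id from bit 24 of merged_wave_info, and the shift at 5166 multiplies it by the wave size (<< 6 for wave64, << 5 for wave32) before adding the in-wave lane id from emit_mbcnt.

   #include <cstdint>

   uint32_t thread_id_in_tg_ref(uint32_t merged_wave_info, uint32_t lane,
                                uint32_t wave_size /* 32 or 64 */)
   {
      uint32_t wave_id = (merged_wave_info >> 24) & 0xf;
      return wave_id * wave_size + lane;
   }
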
5171 store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr)
5181 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5190 ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
5191 ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
5196 if (ctx->stage == fragment_fs && ctx->program->info.ps.has_epilog) {
5200 ctx->output_color_types |= ACO_TYPE_FLOAT16 << (index * 2);
5202 ctx->output_color_types |= ACO_TYPE_INT16 << (index * 2);
5204 ctx->output_color_types |= ACO_TYPE_UINT16 << (index * 2);
5212 load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst)
5218 if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
5233 Temp* src = &ctx->inputs.temps[idx];
5234 create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
5239 static void export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos);
5242 visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
5244 if (ctx->stage == vertex_vs || ctx->stage == tess_eval_vs || ctx->stage == fragment_fs ||
5245 ctx->stage == vertex_ngg || ctx->stage == tess_eval_ngg || ctx->stage == mesh_ngg ||
5246 (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
5247 ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
5248 bool stored_to_temps = store_output_to_temps(ctx, instr);
5259 emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5262 Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5263 Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5265 Builder bld(ctx->program, ctx->block);
5268 if (ctx->program->dev.has_16bank_lds) {
5269 assert(ctx->options->gfx_level <= GFX8);
5280 if (ctx->options->gfx_level == GFX8)
5292 if (ctx->program->dev.has_16bank_lds)
5301 emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components)
5303 Builder bld(ctx->program, ctx->block);
5308 if (ctx->args->ac.frag_pos[i].used)
5309 vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));
5313 if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
5316 bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));
5323 ctx->block->instructions.emplace_back(std::move(vec));
5324 emit_split_vector(ctx, dst, num_components);
5329 emit_load_frag_shading_rate(isel_context* ctx, Temp dst)
5331 Builder bld(ctx->program, ctx->block);
5337 Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
5339 Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
5356 visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
5358 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5359 Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
5362 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
5367 emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
5372 Temp tmp = ctx->program->allocateTmp(instr->dest.ssa.bit_size == 16 ? v2b : v1);
5373 emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask);
5377 ctx->block->instructions.emplace_back(std::move(vec));
5382 check_vertex_fetch_size(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset,
5395 return (ctx->options->gfx_level >= GFX7 && ctx->options->gfx_level <= GFX9) ||
5400 get_fetch_data_format(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset,
5409 if (!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, *channels)) {
5413 !check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels)) {
5421 !check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels))
5449 visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr)
5451 Builder bld(ctx->program, ctx->block);
5452 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5455 if (ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->program->info.vs.dynamic_inputs) {
5465 Temp input = get_arg(ctx, ctx->args->vs_inputs[location]);
5471 elems[i] = emit_extract_vector(ctx, input, component + i, bitsize == 64 ? v2 : v1);
5482 ctx->block->instructions.emplace_back(std::move(vec));
5483 ctx->allocated_vec.emplace(dst.id(), elems);
5484 } else if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
5491 convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.vertex_buffers));
5496 unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
5497 uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
5498 uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
5499 unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
5500 unsigned binding_align = ctx->options->key.vs.vertex_binding_align[attrib_binding];
5510 ctx->program->info.vs.use_per_attribute_vb_descs ? location : attrib_binding;
5511 desc_index = util_bitcount(ctx->program->info.vs.vb_desc_usage_mask &
5517 if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
5518 uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
5519 Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);
5521 Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);
5524 emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
5533 index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.base_vertex),
5534 get_arg(ctx, ctx->args->ac.vertex_id));
5562 get_fetch_data_format(ctx, vtx_info, fetch_offset, &fetch_component,
5566 if (fetch_component == 3 && ctx->options->gfx_level == GFX6)
5614 assert(ctx->options->gfx_level >= GFX7 ||
5615 (!use_mubuf && ctx->options->gfx_level == GFX6));
5646 emit_split_vector(ctx, fetch_dst, fetch_dst.bytes() * 8 / bitsize);
5653 emit_extract_vector(ctx, fetch_dst, i, bitsize == 16 ? v2b : v1);
5680 vec->operands[i] = Operand::get_const(ctx->options->gfx_level, 1u, bitsize / 8u);
5686 ctx->block->instructions.emplace_back(std::move(vec));
5687 emit_split_vector(ctx, dst, num_components);
5690 ctx->allocated_vec.emplace(dst.id(), elems);
5692 } else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {
5697 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
5745 visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5747 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5749 Builder bld(ctx->program, ctx->block);
5750 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5752 if (load_input_from_temps(ctx, instr, dst))
5759 visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5761 switch (ctx->shader->info.stage) {
5762 case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break;
5768 visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr)
5770 assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5772 Builder bld(ctx->program, ctx->block);
5773 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5775 Operand tes_u(get_arg(ctx, ctx->args->ac.tes_u));
5776 Operand tes_v(get_arg(ctx, ctx->args->ac.tes_v));
5779 if (ctx->shader->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES) {
5786 emit_split_vector(ctx, tess_coord, 3);
5790 load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst,
5794 Builder bld(ctx->program, ctx->block);
5797 dst.type() != RegType::vgpr && (!glc || ctx->options->gfx_level >= GFX8) && allow_smem;
5804 if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
5805 offset = as_vgpr(ctx, offset);
5814 emit_load(ctx, bld, info, smem_load_params);
5816 emit_load(ctx, bld, info, mubuf_load_params);
5820 visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr)
5822 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5823 Builder bld(ctx->program, ctx->block);
5824 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5827 load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
5832 visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5834 Builder bld(ctx->program, ctx->block);
5835 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5846 if ((ctx->args->ac.inline_push_const_mask | mask) == ctx->args->ac.inline_push_const_mask &&
5847 start + count <= (sizeof(ctx->args->ac.inline_push_const_mask) * 8u)) {
5852 util_bitcount64(ctx->args->ac.inline_push_const_mask & BITFIELD64_MASK(start));
5854 elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[arg_index++]);
5858 ctx->block->instructions.emplace_back(std::move(vec));
5859 ctx->allocated_vec.emplace(dst.id(), elems);
5864 Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5868 Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
5906 byte_align_scalar(ctx, vec, byte_offset, dst);
5911 emit_split_vector(ctx, vec, 4);
5913 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, vec, 0, rc),
5914 emit_extract_vector(ctx, vec, 1, rc), emit_extract_vector(ctx, vec, 2, rc));
5916 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
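
The indexing step of the inline push-constant fast path, isolated (inline_pc_arg_index_ref is illustrative): the mask test at 5846 proves every requested dword is inlined, and the popcount at 5852 locates the argument register holding dword `start` by counting the inlined dwords before it.

   #include <bit>
   #include <cstdint>

   unsigned inline_pc_arg_index_ref(uint64_t inline_push_const_mask, unsigned start)
   {
      /* start < 64 is guaranteed by the range check at 5847 */
      return std::popcount(inline_push_const_mask & ((uint64_t(1) << start) - 1));
   }
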
5920 visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5922 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5924 Builder bld(ctx->program, ctx->block);
5929 if (ctx->options->gfx_level >= GFX10) {
5932 S_008F0C_RESOURCE_LEVEL(ctx->options->gfx_level < GFX11);
5941 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5950 Operand::c32(ctx->constant_data_offset)),
5951 Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)),
5955 load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
5962 emit_pack_v1(isel_context* ctx, const std::vector<Temp>& unpacked)
5964 Builder bld(ctx->program, ctx->block);
5972 Temp high = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
5978 packed.emplace_back(emit_extract_vector(ctx, tmp, byte_idx / 4, v1));
5981 low = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
5994 should_declare_array(isel_context* ctx, enum glsl_sampler_dim sampler_dim, bool is_array)
5998 ac_image_dim dim = ac_get_sampler_dim(ctx->options->gfx_level, sampler_dim, is_array);
6081 visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)
6083 Builder bld(ctx->program, ctx->block);
6084 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6085 Temp resource = get_ssa_temp(ctx, instr->src[0].ssa);
6086 Temp node = get_ssa_temp(ctx, instr->src[1].ssa);
6087 Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa);
6088 Temp origin = get_ssa_temp(ctx, instr->src[3].ssa);
6089 Temp dir = get_ssa_temp(ctx, instr->src[4].ssa);
6090 Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa);
6093 args.push_back(emit_extract_vector(ctx, node, 0, v1));
6094 args.push_back(emit_extract_vector(ctx, node, 1, v1));
6095 args.push_back(as_vgpr(ctx, tmax));
6096 args.push_back(emit_extract_vector(ctx, origin, 0, v1));
6097 args.push_back(emit_extract_vector(ctx, origin, 1, v1));
6098 args.push_back(emit_extract_vector(ctx, origin, 2, v1));
6099 args.push_back(emit_extract_vector(ctx, dir, 0, v1));
6100 args.push_back(emit_extract_vector(ctx, dir, 1, v1));
6101 args.push_back(emit_extract_vector(ctx, dir, 2, v1));
6102 args.push_back(emit_extract_vector(ctx, inv_dir, 0, v1));
6103 args.push_back(emit_extract_vector(ctx, inv_dir, 1, v1));
6104 args.push_back(emit_extract_vector(ctx, inv_dir, 2, v1));
6115 get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr)
6118 Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
6125 bool gfx9_1d = ctx->options->gfx_level == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
6128 Builder bld(ctx->program, ctx->block);
6131 coords[--count] = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1);
6134 coords[0] = emit_extract_vector(ctx, src0, 0, v1);
6138 coords[2] = emit_extract_vector(ctx, src0, 1, v1);
6141 coords[i] = emit_extract_vector(ctx, src0, i, v1);
6144 if (ctx->options->key.image_2d_view_of_3d &&
6150 assert(ctx->options->gfx_level == GFX9);
6151 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6152 Temp rsrc_word5 = emit_extract_vector(ctx, rsrc, 5, v1);
6168 coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa));
6212 visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
6214 Builder bld(ctx->program, ctx->block);
6218 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6249 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6252 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6281 load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
6286 ctx->block->instructions.emplace_back(std::move(load));
6288 std::vector<Temp> coords = get_image_coords(ctx, instr);
6298 load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
6299 load->dim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6303 load->da = should_declare_array(ctx, dim, is_array);
6316 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, expand_mask,
6321 visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
6323 Builder bld(ctx->program, ctx->block);
6326 Temp data = get_ssa_temp(ctx, instr->src[3].ssa);
6331 data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2));
6332 data = as_vgpr(ctx, data);
6338 bool glc = ctx->options->gfx_level == GFX6 ||
6340 ctx->program->gfx_level < GFX11);
6343 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6344 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6374 ctx->program->needs_exact = true;
6375 ctx->block->instructions.emplace_back(std::move(store));
6380 std::vector<Temp> coords = get_image_coords(ctx, instr);
6381 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6406 data = emit_extract_vector(ctx, data, ffs(dmask) - 1, rc);
6412 vec->operands[index++] = Operand(emit_extract_vector(ctx, data, bit, rc));
6425 store->dim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6429 store->da = should_declare_array(ctx, dim, is_array);
6432 ctx->program->needs_exact = true;
6437 visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6442 Builder bld(ctx->program, ctx->block);
6444 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
6451 get_ssa_temp(ctx, instr->src[4].ssa), data);
6520 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6524 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6525 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6526 // assert(ctx->options->gfx_level < GFX9 && "GFX9 stride size workaround not yet
6544 ctx->program->needs_exact = true;
6545 ctx->block->instructions.emplace_back(std::move(mubuf));
6551 std::vector<Temp> coords = get_image_coords(ctx, instr);
6552 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6559 mimg->dim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6562 mimg->da = should_declare_array(ctx, dim, is_array);
6565 ctx->program->needs_exact = true;
6572 get_buffer_size(isel_context* ctx, Temp desc, Temp dst)
6574 if (ctx->options->gfx_level == GFX8) {
6576 Builder bld(ctx->program, ctx->block);
6578 Temp size = emit_extract_vector(ctx, desc, 2, s1);
6585 Temp stride = emit_extract_vector(ctx, desc, 1, s1);
6600 emit_extract_vector(ctx, desc, 2, dst);
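The GFX8 branch in get_buffer_size exists because, with a non-zero stride, the num_records field in dword 2 of the descriptor holds a byte count there rather than an element count, so the size must be divided by the stride taken from dword 1. A hedged scalar sketch of that computation; the exact bit positions of the stride field are an assumption, not shown in the lines above:

#include <cstdint>

// Assumed layout: dword2 = size in bytes (GFX8), stride in bits [16:29]
// of dword1. Returns the size in elements, as the non-GFX8 path does.
uint32_t buffer_size_in_elements_gfx8(uint32_t dword1, uint32_t dword2)
{
   uint32_t stride = (dword1 >> 16) & 0x3fffu;
   return stride ? dword2 / stride : dword2;
}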
6605 visit_image_size(isel_context* ctx, nir_intrinsic_instr* instr)
6609 Builder bld(ctx->program, ctx->block);
6612 Temp desc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6613 return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa));
6621 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6623 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6628 mimg->dim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6632 if (ctx->options->gfx_level == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) {
6637 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
6641 get_image_samples(isel_context* ctx, Definition dst, Temp resource)
6643 Builder bld(ctx->program, ctx->block);
6645 Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
6654 if (ctx->options->robust_buffer_access) {
6658 Temp dword1 = emit_extract_vector(ctx, resource, 1, s1);
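get_image_samples reads dword 3 of the image descriptor, where the sample count is stored as log2(samples) in a small bitfield; the robust_buffer_access path additionally inspects dword 1 so a buffer/null descriptor yields a sane default. A sketch of the fast path, treating the field position (offset 16, width 4) as an assumption:

#include <cstdint>

// Assumed field: LOG2_NUM_SAMPLES at bits [16:19] of descriptor dword3.
uint32_t image_samples_from_dword3(uint32_t dword3)
{
   uint32_t log2_samples = (dword3 >> 16) & 0xfu;
   return 1u << log2_samples;
}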
6669 visit_image_samples(isel_context* ctx, nir_intrinsic_instr* instr)
6671 Builder bld(ctx->program, ctx->block);
6672 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6673 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6674 get_image_samples(ctx, Definition(dst), resource);
6678 visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6680 Builder bld(ctx->program, ctx->block);
6683 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6684 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6692 load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6698 visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6700 Builder bld(ctx->program, ctx->block);
6701 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6704 Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6706 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
6711 ctx->program->gfx_level < GFX11;
6716 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6722 if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
6723 offset = as_vgpr(ctx, offset);
6740 ctx->program->needs_exact = true;
6741 ctx->block->instructions.emplace_back(std::move(store));
6746 visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6748 Builder bld(ctx->program, ctx->block);
6750 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6755 get_ssa_temp(ctx, instr->src[3].ssa), data);
6757 Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6758 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6760 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6833 ctx->program->needs_exact = true;
6834 ctx->block->instructions.emplace_back(std::move(mubuf));
6840 parse_global(isel_context* ctx, nir_intrinsic_instr* intrin, Temp* address, uint32_t* const_offset,
6844 *address = get_ssa_temp(ctx, intrin->src[is_store ? 1 : 0].ssa);
6851 *offset = get_ssa_temp(ctx, offset_src.ssa);
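parse_global splits a global memory access into a 64-bit base address, a constant byte offset that can be folded into the instruction's immediate, and an optional variable offset. An illustrative contract (the struct and names here are hypothetical, not ACO types):

#include <cstdint>

struct GlobalAddr {
   uint64_t base;          // 64-bit address temp in the real code
   uint32_t const_offset;  // folded into the FLAT/GLOBAL immediate
   const uint64_t* offset; // optional variable part, nullptr if absent
};

uint64_t effective_address(const GlobalAddr& a)
{
   return a.base + a.const_offset + (a.offset ? *a.offset : 0u);
}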
6857 visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
6859 Builder bld(ctx->program, ctx->block);
6865 parse_global(ctx, instr, &addr, &const_offset, &offset);
6867 LoadEmitInfo info = {Operand(addr), get_ssa_temp(ctx, &instr->dest.ssa), num_components,
6891 if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->gfx_level < GFX8) ||
6894 params.byte_align_loads = ctx->options->gfx_level > GFX6 || byte_align_for_smem_mubuf;
6895 emit_load(ctx, bld, info, params);
6900 emit_load(ctx, bld, info, smem_load_params);
6905 visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
6907 Builder bld(ctx->program, ctx->block);
6911 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6915 ctx->program->gfx_level < GFX11;
6920 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6925 parse_global(ctx, instr, &addr, &const_offset, &offset);
6933 if (ctx->options->gfx_level >= GFX7) {
6934 bool global = ctx->options->gfx_level >= GFX9;
6970 ctx->program->needs_exact = true;
6971 ctx->block->instructions.emplace_back(std::move(flat));
6973 assert(ctx->options->gfx_level == GFX6);
6992 ctx->program->needs_exact = true;
6993 ctx->block->instructions.emplace_back(std::move(mubuf));
6999 visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
7001 Builder bld(ctx->program, ctx->block);
7003 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7008 get_ssa_temp(ctx, instr->src[2].ssa), data);
7010 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7016 parse_global(ctx, instr, &addr, &const_offset, &offset);
7019 if (ctx->options->gfx_level >= GFX7) {
7020 bool global = ctx->options->gfx_level >= GFX9;
7096 ctx->program->needs_exact = true;
7097 ctx->block->instructions.emplace_back(std::move(flat));
7099 assert(ctx->options->gfx_level == GFX6);
7175 ctx->program->needs_exact = true;
7176 ctx->block->instructions.emplace_back(std::move(mubuf));
7202 visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7204 Builder bld(ctx->program, ctx->block);
7206 Temp dst = get_ssa_temp(ctx, &intrin->dest.ssa);
7207 Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
7208 Temp v_offset = as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
7209 Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));
7218 unsigned swizzle_element_size = swizzled ? (ctx->program->gfx_level <= GFX8 ? 4 : 16) : 0;
7223 load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
7228 visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7230 Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
7231 Temp descriptor = get_ssa_temp(ctx, intrin->src[1].ssa);
7232 Temp v_offset = get_ssa_temp(ctx, intrin->src[2].ssa);
7233 Temp s_offset = get_ssa_temp(ctx, intrin->src[3].ssa);
7245 store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
7250 visit_load_smem(isel_context* ctx, nir_intrinsic_instr* instr)
7252 Builder bld(ctx->program, ctx->block);
7253 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7254 Temp base = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
7255 Temp offset = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
7282 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
7301 emit_scoped_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
7303 Builder bld(ctx->program, ctx->block);
7316 bool shared_storage_used = ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::LS ||
7317 ctx->stage.hw == HWStage::HS ||
7318 (ctx->stage.hw == HWStage::GS && ctx->program->gfx_level >= GFX9) ||
7319 ctx->stage.hw == HWStage::NGG;
7325 if (ctx->stage.has(SWStage::MS) || ctx->stage.has(SWStage::TS))
7329 if (ctx->stage.hw != HWStage::CS && ctx->stage.hw != HWStage::FS)
7336 ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::HS || ctx->stage.hw == HWStage::NGG;
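The shared_storage_used test above decides whether LDS participates in the barrier: hardware stages that pack several software stages into one workgroup (HS, merged GS on GFX9+, NGG) all communicate through LDS. An execution barrier itself only matters when the workgroup spans more than one wave; a plausible sketch of that rule, stated as an assumption about ACO's behavior rather than a quote of this function:

// Sketch: a single wave runs in lockstep, so s_barrier is only needed
// when several waves make up the workgroup.
bool needs_s_barrier(unsigned workgroup_size, unsigned wave_size)
{
   return workgroup_size > wave_size;
}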
7357 visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7360 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7361 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7362 Builder bld(ctx->program, ctx->block);
7367 load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);
7371 visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7374 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7375 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7379 store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
7383 visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
7386 Builder bld(ctx->program, ctx->block);
7388 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7389 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7498 Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
7504 ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
7510 ctx->block->instructions.emplace_back(std::move(ds));
7514 visit_access_shared2_amd(isel_context* ctx, nir_intrinsic_instr* instr)
7517 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[is_store].ssa));
7518 Builder bld(ctx->program, ctx->block);
7533 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7535 Temp data0 = emit_extract_vector(ctx, data, 0, comp_rc);
7536 Temp data1 = emit_extract_vector(ctx, data, 1, comp_rc);
7539 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7550 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7552 emit_split_vector(ctx, ds->definitions[0].getTemp(), dst.size());
7556 comp[i] = bld.as_uniform(emit_extract_vector(ctx, ds->definitions[0].getTemp(), i, v1));
7560 ctx->allocated_vec[comp0.id()] = {comp[0], comp[1]};
7561 ctx->allocated_vec[comp1.id()] = {comp[2], comp[3]};
7563 ctx->allocated_vec[dst.id()] = {comp0, comp1};
7569 emit_split_vector(ctx, dst, 2);
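visit_access_shared2_amd maps to the ds_read2/ds_write2 family, which accesses two independent LDS addresses derived from one base: base + offset0 * stride and base + offset1 * stride, where the stride is the element size (4 or 8 bytes) optionally scaled by 64 for the st64 variants. A sketch of that addressing, under the assumption of byte-addressed LDS:

#include <cstdint>

// elem_bytes: 4 (b32) or 8 (b64); st64 scales the offsets by 64 elements.
void shared2_addresses(uint32_t base, uint32_t off0, uint32_t off1,
                       uint32_t elem_bytes, bool st64,
                       uint32_t* addr0, uint32_t* addr1)
{
   uint32_t stride = elem_bytes * (st64 ? 64u : 1u);
   *addr0 = base + off0 * stride;
   *addr1 = base + off1 * stride;
}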
7574 get_scratch_resource(isel_context* ctx)
7576 Builder bld(ctx->program, ctx->block);
7577 Temp scratch_addr = ctx->program->private_segment_buffer;
7578 if (ctx->stage.hw != HWStage::CS)
7583 S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
7585 if (ctx->program->gfx_level >= GFX10) {
7588 S_008F0C_RESOURCE_LEVEL(ctx->program->gfx_level < GFX11);
7589 } else if (ctx->program->gfx_level <=
7596 if (ctx->program->gfx_level <= GFX8)
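get_scratch_resource builds the MUBUF descriptor used for scratch before GFX9: ADD_TID_ENABLE plus an INDEX_STRIDE of 64 (wave64) or 32 makes the hardware interleave lanes, so a lane's consecutive dwords are wave_size dwords apart instead of contiguous. A sketch of the resulting address arithmetic; the exact swizzle formula is an assumption about the hardware mode, not something shown above:

#include <cstdint>

// Assumed swizzle with ADD_TID_ENABLE: element `elem` of lane `tid`
// lands at base + (elem * index_stride + tid) * 4 bytes.
uint64_t swizzled_scratch_addr(uint64_t base, uint32_t tid, uint32_t elem,
                               uint32_t index_stride /* = wave size */)
{
   return base + (uint64_t(elem) * index_stride + tid) * 4u;
}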
7604 visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7606 Builder bld(ctx->program, ctx->block);
7607 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7613 info.swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 0;
7615 if (ctx->program->gfx_level >= GFX9) {
7617 uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
7622 info.offset = Operand(get_ssa_temp(ctx, instr->src[0].ssa));
7625 params.max_const_offset_plus_one = ctx->program->dev.scratch_global_offset_max + 1;
7626 emit_load(ctx, bld, info, params);
7628 info.resource = get_scratch_resource(ctx);
7629 info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
7630 info.soffset = ctx->program->scratch_offset;
7631 emit_load(ctx, bld, info, scratch_mubuf_load_params);
7636 visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7638 Builder bld(ctx->program, ctx->block);
7639 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7640 Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
7648 unsigned swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 16;
7649 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
7652 if (ctx->program->gfx_level >= GFX9) {
7653 uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
7682 Temp rsrc = get_scratch_resource(ctx);
7683 offset = as_vgpr(ctx, offset);
7686 Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset,
7694 visit_emit_vertex_with_counter(isel_context* ctx, nir_intrinsic_instr* instr)
7696 Builder bld(ctx->program, ctx->block);
7699 Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7705 bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer,
7708 unsigned num_components = ctx->program->info.gs.num_stream_output_components[stream];
7710 unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;
7713 unsigned prev_stride = 4u * ctx->program->info.gs.num_stream_output_components[i] *
7714 ctx->shader->info.gs.vertices_out;
7715 stream_offset += prev_stride * ctx->program->wave_size;
7739 gsvs_dwords[2] = bld.copy(bld.def(s1), Operand::c32(ctx->program->wave_size));
7746 if (ctx->program->info.gs.output_streams[i] != stream)
7750 if (!(ctx->program->info.gs.output_usage_mask[i] & (1 << j)))
7753 if (ctx->outputs.mask[i] & (1 << j)) {
7769 mubuf->operands[2] = Operand(get_arg(ctx, ctx->args->ac.gs2vs_offset));
7770 mubuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]);
7773 mubuf->glc = ctx->program->gfx_level < GFX11;
7779 offset += ctx->shader->info.gs.vertices_out;
7784 ctx->outputs.mask[i] = 0;
7787 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream));
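The GSVS ring math above: each output component of a vertex gets vertices_out consecutive dwords (hence offset += vertices_out per component), a stream's per-lane stride is 4 * num_components * vertices_out bytes, and streams are laid out back to back scaled by wave_size. A small sketch of the stream base offset, inferred from those lines:

#include <cstdint>

// Byte offset of a stream's region in the GSVS ring (per wave), inferred
// from the stream_offset accumulation above.
uint32_t gsvs_stream_offset(const uint32_t* components_per_stream,
                            uint32_t vertices_out, uint32_t wave_size,
                            uint32_t stream)
{
   uint32_t off = 0;
   for (uint32_t s = 0; s < stream; s++)
      off += 4u * components_per_stream[s] * vertices_out * wave_size;
   return off;
}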
7791 emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src)
7793 Builder bld(ctx->program, ctx->block);
7809 } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
7815 Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
7817 } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
7823 return bool_to_vector_condition(ctx, tmp);
7824 } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
7832 return bool_to_vector_condition(ctx, tmp);
7844 Temp lane_id = emit_mbcnt(ctx, bld.tmp(v1));
7858 if (ctx->program->gfx_level <= GFX7)
7860 else if (ctx->program->wave_size == 64)
7864 tmp = emit_extract_vector(ctx, tmp, 0, v1);
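For a cluster spanning the whole wave, emit_boolean_reduce never touches per-lane values: the boolean already lives as a bitmask in an SGPR (pair), so and-reduce is "mask covers exec", or-reduce is "mask intersects exec", and xor-reduce is the parity of the active set bits. The same tests on plain wave64 masks:

#include <cstdint>

bool reduce_and(uint64_t src, uint64_t exec) { return (src & exec) == exec; }
bool reduce_or (uint64_t src, uint64_t exec) { return (src & exec) != 0; }
bool reduce_xor(uint64_t src, uint64_t exec)
{
   return __builtin_popcountll(src & exec) & 1; // parity of true lanes
}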
7884 emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src)
7886 Builder bld(ctx->program, ctx->block);
7900 Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), Operand(tmp));
7915 emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src)
7917 Builder bld(ctx->program, ctx->block);
7923 Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
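The boolean scans follow from the same bitmask view: for lane i, the exclusive scan reduces over lanes below i, which is exactly what mbcnt computes on the mask (see the emit_mbcnt call above); the inclusive scan folds in the lane's own bit. Shown here for ior; iand and ixor use the analogous all-below/parity tests:

#include <cstdint>

// Exclusive OR-scan for one lane of a wave64 mask: any true lane below me?
bool excl_or_scan(uint64_t mask, unsigned lane)
{
   uint64_t below = lane ? (mask & ((1ull << lane) - 1u)) : 0; // mbcnt-style
   return below != 0;
}

bool incl_or_scan(uint64_t mask, unsigned lane)
{
   return excl_or_scan(mask, lane) || ((mask >> lane) & 1u);
}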
7967 emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
7969 Builder bld(ctx->program, ctx->block);
7970 Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7979 emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
7981 Builder bld(ctx->program, ctx->block);
7982 Temp src_tmp = get_ssa_temp(ctx, src.ssa);
7985 src_tmp = as_vgpr(ctx, src_tmp);
8026 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
8028 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
8038 emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
8045 Builder bld(ctx->program, ctx->block);
8046 Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
8054 emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
8056 emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
8063 emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
8065 Builder bld(ctx->program, ctx->block);
8066 Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
8079 packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
8081 packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
8083 emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
8091 emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
8097 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8113 as_vgpr(ctx, src));
8120 emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
8126 Builder bld(ctx->program, ctx->block);
8134 bool need_sitmp = (ctx->program->gfx_level <= GFX7 || ctx->program->gfx_level >= GFX10) &&
8150 if ((op == iadd32 || op == imul64) && ctx->program->gfx_level < GFX9)
8152 if ((op == iadd8 || op == iadd16) && ctx->program->gfx_level < GFX8)
8176 emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2)
8178 Builder bld(ctx->program, ctx->block);
8179 Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
8180 Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
8188 if (ctx->program->gfx_level >= GFX8) {
8211 ctx->program->gfx_level >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
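emit_interp_center nudges the center barycentrics by the screen-space derivatives: i' = i + dI/dx * pos1 + dI/dy * pos2 for each of the two components, with the derivatives taken from the quad's neighboring lanes (DPP on GFX8+, per the branch above) and v_fma_f32 used on GFX10.3+. The per-component math in scalar form:

// ddx = quad_lane1(p) - quad_lane0(p), ddy = quad_lane2(p) - quad_lane0(p)
// out = fma(ddx, pos1, fma(ddy, pos2, p))
float interp_at_offset(float p, float ddx, float ddy, float pos1, float pos2)
{
   return ddx * pos1 + (ddy * pos2 + p);
}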
8224 Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
8225 void ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt);
8226 static void create_primitive_exports(isel_context *ctx, Temp prim_ch1);
8227 static void create_vs_exports(isel_context* ctx);
8230 get_interp_param(isel_context* ctx, nir_intrinsic_op intrin,
8237 return get_arg(ctx, linear ? ctx->args->ac.linear_center : ctx->args->ac.persp_center);
8239 return linear ? ctx->linear_centroid : ctx->persp_centroid;
8242 return get_arg(ctx, linear ? ctx->args->ac.linear_sample : ctx->args->ac.persp_sample);
8247 visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
8249 Builder bld(ctx->program, ctx->block);
8255 Temp bary = get_interp_param(ctx, instr->intrinsic, mode);
8257 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8259 emit_split_vector(ctx, dst, 2);
8263 Temp model = get_arg(ctx, ctx->args->ac.pull_model);
8265 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8267 emit_split_vector(ctx, dst, 3);
8271 Temp bary = get_interp_param(ctx, instr->intrinsic, (glsl_interp_mode)nir_intrinsic_interp_mode(instr));
8272 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8274 if (ctx->options->key.ps.num_samples == 2) {
8276 } else if (ctx->options->key.ps.num_samples == 4) {
8278 } else if (ctx->options->key.ps.num_samples == 8) {
8281 assert(ctx->options->key.ps.num_samples == 0);
8283 emit_split_vector(ctx, dst, 2);
8288 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
8290 Temp private_segment_buffer = ctx->program->private_segment_buffer;
8297 } else if (ctx->options->gfx_level >= GFX9) {
8311 } else if (ctx->options->gfx_level >= GFX9) {
8315 } else if (ctx->options->gfx_level >= GFX7) {
8329 tmp1 = as_vgpr(ctx, tmp1);
8337 assert(ctx->options->gfx_level == GFX6);
8361 ctx->block->instructions.emplace_back(std::move(load));
8371 emit_interp_center(ctx, dst, bary, pos1, pos2);
8375 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
8379 Temp bary = get_interp_param(ctx, instr->intrinsic, (glsl_interp_mode)nir_intrinsic_interp_mode(instr));
8380 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), bary, pos1, pos2);
8384 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8385 Operand::zero(), get_arg(ctx, ctx->args->ac.front_face));
8389 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8390 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
8394 emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
8398 emit_load_frag_shading_rate(ctx, get_ssa_temp(ctx, &instr->dest.ssa));
8401 Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]);
8402 Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]);
8404 aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8409 case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break;
8410 case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break;
8411 case nir_intrinsic_store_output: visit_store_output(ctx, instr); break;
8413 case nir_intrinsic_load_input_vertex: visit_load_input(ctx, instr); break;
8414 case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break;
8415 case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break;
8416 case nir_intrinsic_load_push_constant: visit_load_push_constant(ctx, instr); break;
8417 case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break;
8418 case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break;
8419 case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break;
8432 case nir_intrinsic_shared_atomic_fmax: visit_shared_atomic(ctx, instr); break;
8434 case nir_intrinsic_store_shared2_amd: visit_access_shared2_amd(ctx, instr); break;
8436 case nir_intrinsic_bindless_image_sparse_load: visit_image_load(ctx, instr); break;
8437 case nir_intrinsic_bindless_image_store: visit_image_store(ctx, instr); break;
8449 case nir_intrinsic_bindless_image_atomic_fmax: visit_image_atomic(ctx, instr); break;
8450 case nir_intrinsic_bindless_image_size: visit_image_size(ctx, instr); break;
8451 case nir_intrinsic_bindless_image_samples: visit_image_samples(ctx, instr); break;
8452 case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;
8453 case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;
8454 case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break;
8455 case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break;
8456 case nir_intrinsic_load_smem_amd: visit_load_smem(ctx, instr); break;
8457 case nir_intrinsic_load_global_amd: visit_load_global(ctx, instr); break;
8458 case nir_intrinsic_store_global_amd: visit_store_global(ctx, instr); break;
8470 case nir_intrinsic_global_atomic_fmax_amd: visit_global_atomic(ctx, instr); break;
8482 case nir_intrinsic_ssbo_atomic_fmax: visit_atomic_ssbo(ctx, instr); break;
8483 case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
8484 case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
8485 case nir_intrinsic_scoped_barrier: emit_scoped_barrier(ctx, instr); break;
8487 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8488 if (ctx->args->load_grid_size_from_user_sgpr) {
8489 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.num_work_groups));
8491 Temp addr = get_arg(ctx, ctx->args->ac.num_work_groups);
8497 emit_split_vector(ctx, dst, 3);
8501 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8502 Temp addr = get_arg(ctx, ctx->args->ac.ray_launch_size_addr);
8508 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8509 if (ctx->options->gfx_level >= GFX11) {
8515 get_arg(ctx, ctx->args->ac.local_invocation_ids),
8522 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids)));
8524 emit_split_vector(ctx, dst, 3);
8528 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8529 if (ctx->stage.hw == HWStage::CS) {
8530 const struct ac_arg* ids = ctx->args->ac.workgroup_ids;
8532 ids[0].used ? Operand(get_arg(ctx, ids[0])) : Operand::zero(),
8533 ids[1].used ? Operand(get_arg(ctx, ids[1])) : Operand::zero(),
8534 ids[2].used ? Operand(get_arg(ctx, ids[2])) : Operand::zero());
8535 emit_split_vector(ctx, dst, 3);
8542 if (ctx->stage.hw == HWStage::LS || ctx->stage.hw == HWStage::HS) {
8543 if (ctx->options->gfx_level >= GFX11) {
8547 get_arg(ctx, ctx->args->ac.tcs_wave_id), Operand::c32(0u | (5u << 16)));
8550 Operand::c32(ctx->program->wave_size));
8551 Temp thread_id = emit_mbcnt(ctx, bld.tmp(v1));
8553 bld.vadd32(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), temp, thread_id);
8555 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8556 get_arg(ctx, ctx->args->ac.vs_rel_patch_id));
8559 } else if (ctx->stage.hw == HWStage::GS || ctx->stage.hw == HWStage::NGG) {
8560 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), thread_id_in_threadgroup(ctx));
8562 } else if (ctx->program->workgroup_size <= ctx->program->wave_size) {
8563 emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->dest.ssa));
8567 Temp id = emit_mbcnt(ctx, bld.tmp(v1));
8572 if (ctx->program->wave_size == 64) {
8576 Operand::c32(0xfc0u), get_arg(ctx, ctx->args->ac.tg_size));
8577 bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num,
8583 get_arg(ctx, ctx->args->ac.tg_size), Operand::c32(0x6u | (0x6u << 16)));
8584 bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
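In the multi-wave path just above, the flat local invocation index is wave_id_in_group * wave_size + lane_id. On wave64 the wave id already sits pre-shifted in bits [6:11] of tg_size, so masking with 0xfc0 and OR-ing in the lane id (always < 64) is a full add; wave32 instead extracts the wave id with the bfe operand and uses v_lshl_or with a shift of 5. The wave64 fast path as scalar arithmetic:

#include <cstdint>

// tg_size bits [6:11] hold wave_id << 6; lane_id < 64, so OR == ADD here.
uint32_t local_invocation_index_wave64(uint32_t tg_size, uint32_t lane_id)
{
   return (tg_size & 0xfc0u) | lane_id;
}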
8590 if (ctx->stage.hw == HWStage::CS) {
8591 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8592 bld.def(s1, scc), get_arg(ctx, ctx->args->ac.tg_size),
8594 } else if (ctx->stage.hw == HWStage::NGG) {
8596 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8597 bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info),
8600 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::zero());
8605 emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->dest.ssa));
8609 if (ctx->stage.hw == HWStage::CS)
8610 bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8611 bld.def(s1, scc), Operand::c32(0x3fu), get_arg(ctx, ctx->args->ac.tg_size));
8612 else if (ctx->stage.hw == HWStage::NGG)
8613 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8614 bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info),
8617 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::c32(0x1u));
8621 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8622 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8648 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8650 emit_uniform_subgroup(ctx, instr, src);
8652 Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
8656 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8659 src = as_vgpr(ctx, src);
8663 tmp = emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), tmp);
8670 emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), dst);
8674 lo = emit_wqm(bld, emit_bpermute(ctx, bld, tid, lo));
8675 hi = emit_wqm(bld, emit_bpermute(ctx, bld, tid, hi));
8677 emit_split_vector(ctx, dst, 2);
8681 bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8685 if (ctx->program->gfx_level <= GFX7)
8687 else if (ctx->program->wave_size == 64)
8691 tmp = emit_extract_vector(ctx, tmp, 0, v1);
8702 bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8703 get_arg(ctx, ctx->args->ac.ancillary), Operand::c32(8u), Operand::c32(4u));
8707 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8708 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8717 emit_split_vector(ctx, dst, 2);
8722 bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8729 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8730 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8738 Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
8743 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8744 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8748 Temp tmp = bool_to_scalar_condition(ctx, src);
8749 bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8755 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8756 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8761 MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8763 if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size &&
8773 if (emit_uniform_reduce(ctx, instr))
8775 } else if (emit_uniform_scan(ctx, instr)) {
8791 emit_wqm(bld, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
8794 emit_wqm(bld, emit_boolean_exclusive_scan(ctx, op, src), dst);
8797 emit_wqm(bld, emit_boolean_inclusive_scan(ctx, op, src), dst);
8806 src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
8818 Temp tmp_dst = emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size,
8829 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8832 emit_uniform_subgroup(ctx, instr, src);
8856 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8864 src = as_vgpr(ctx, src);
8869 else if (ctx->program->stage == fragment_fs)
8890 if (ctx->program->gfx_level >= GFX8)
8902 if (ctx->program->gfx_level >= GFX8) {
8911 emit_split_vector(ctx, tmp, 2);
8927 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8929 emit_uniform_subgroup(ctx, instr, src);
8932 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8936 src = as_vgpr(ctx, src);
8942 src = emit_masked_swizzle(ctx, bld, src, mask);
8946 Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
8947 emit_extract_vector(ctx, tmp, 0, dst);
8949 Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
8950 emit_extract_vector(ctx, tmp, 0, dst);
8952 emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask), dst);
8956 lo = emit_wqm(bld, emit_masked_swizzle(ctx, bld, lo, mask));
8957 hi = emit_wqm(bld, emit_masked_swizzle(ctx, bld, hi, mask));
8959 emit_split_vector(ctx, dst, 2);
8966 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8967 Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
8968 Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
8969 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8981 emit_split_vector(ctx, dst, 2);
8988 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8989 Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
8990 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8992 src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
8993 Temp wqm_tmp = emit_mbcnt(ctx, bld.tmp(v1), Operand(src), Operand(add_src));
8998 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9000 assert(ctx->program->gfx_level >= GFX8);
9001 bld.vop3(aco_opcode::v_perm_b32, Definition(dst), get_ssa_temp(ctx, instr->src[0].ssa),
9002 as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)),
9003 as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));
9007 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9008 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9009 assert(ctx->program->gfx_level >= GFX10);
9015 bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),
9016 bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));
9026 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9028 ctx->block->kind |= block_kind_needs_lowering;
9029 ctx->program->needs_exact = true;
9035 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
9036 ctx->cf_info.exec_potentially_empty_discard = true;
9037 ctx->block->kind |= block_kind_uses_discard;
9038 ctx->program->needs_exact = true;
9041 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9047 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
9048 ctx->cf_info.exec_potentially_empty_discard = true;
9049 ctx->block->kind |= block_kind_uses_discard;
9050 ctx->program->needs_exact = true;
9060 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9068 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
9069 ctx->cf_info.exec_potentially_empty_discard = true;
9070 ctx->block->kind |= block_kind_uses_discard;
9071 ctx->program->needs_exact = true;
9076 get_ssa_temp(ctx, &instr->dest.ssa));
9082 Operand::c32(ctx->program->wave_size - 1u), flbit);
9083 emit_wqm(bld, last, get_ssa_temp(ctx, &instr->dest.ssa));
9092 emit_wqm(bld, elected, get_ssa_temp(ctx, &instr->dest.ssa));
9093 ctx->block->kind |= block_kind_needs_lowering;
9097 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9099 ctx->options->gfx_level >= GFX10_3) {
9109 emit_split_vector(ctx, dst, 2);
9113 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9114 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
9118 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9119 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex));
9123 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9124 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance));
9128 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9129 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id));
9133 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9134 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
9138 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9140 if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
9141 if (ctx->options->gfx_level >= GFX10)
9143 get_arg(ctx, ctx->args->ac.gs_invocation_id));
9145 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id));
9146 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
9147 bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
9156 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9158 switch (ctx->shader->info.stage) {
9160 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
9163 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tcs_patch_id));
9166 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tes_patch_id));
9169 if (ctx->stage.hw == HWStage::NGG && !ctx->stage.has(SWStage::GS)) {
9172 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
9174 } else if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
9175 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vs_prim_id));
9184 assert(ctx->stage.hw == HWStage::GS);
9185 visit_emit_vertex_with_counter(ctx, instr);
9189 if (ctx->stage.hw != HWStage::NGG) {
9191 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1,
9197 assert(ctx->stage.hw == HWStage::GS);
9203 assert(ctx->stage.hw == HWStage::NGG);
9205 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), merged_wave_info_to_mask(ctx, i));
9209 ctx->block->kind |= block_kind_export_end;
9210 create_vs_exports(ctx);
9214 Temp prim_ch1 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
9215 create_primitive_exports(ctx, prim_ch1);
9219 assert(ctx->stage.hw == HWStage::NGG);
9220 Temp num_vertices = get_ssa_temp(ctx, instr->src[0].ssa);
9221 Temp num_primitives = get_ssa_temp(ctx, instr->src[1].ssa);
9222 ngg_emit_sendmsg_gs_alloc_req(ctx, num_vertices, num_primitives);
9226 Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
9227 Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa);
9228 Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa);
9230 bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u,
9235 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9236 Temp addr = get_arg(ctx, ctx->args->ac.sbt_descriptors);
9241 case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;
9243 ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9244 ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9248 ctx->arg_temps[ctx->args->ac.tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9249 ctx->arg_temps[ctx->args->ac.tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9250 ctx->arg_temps[ctx->args->ac.tes_rel_patch_id.arg_index] =
9251 get_ssa_temp(ctx, instr->src[2].ssa);
9252 ctx->arg_temps[ctx->args->ac.tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa);
9256 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
9257 get_arg(ctx, ctx->args->ac.force_vrs_rates));
9262 assert(nir_intrinsic_base(instr) < ctx->args->ac.arg_count);
9263 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9264 Temp src = ctx->arg_temps[nir_intrinsic_base(instr)];
9268 emit_split_vector(ctx, dst, dst.size());
9280 build_cube_select(isel_context* ctx, Temp ma, Temp id, Temp deriv, Temp* out_ma, Temp* out_sc,
9283 Builder bld(ctx->program, ctx->block);
9285 Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
9286 Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
9287 Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
9325 prepare_cube_coords(isel_context* ctx, std::vector<Temp>& coords, Temp* ddx, Temp* ddy,
9328 Builder bld(ctx->program, ctx->block);
9331 ctx->program->gfx_level >= GFX10_3 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_madak_f32;
9333 ctx->program->gfx_level >= GFX10_3 ? aco_opcode::v_fmamk_f32 : aco_opcode::v_madmk_f32;
9336 if (is_array && ctx->options->gfx_level <= GFX8)
9347 ctx->block->instructions.emplace_back(std::move(vop3a));
9367 build_cube_select(ctx, ma, id, i ? *ddy : *ddx, &deriv_ma, &deriv_sc, &deriv_tc);
9409 visit_tex(isel_context* ctx, nir_tex_instr* instr)
9413 Builder bld(ctx->program, ctx->block);
9427 resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9430 sampler = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9436 bool tg4_integer_workarounds = ctx->options->gfx_level <= GFX8 && instr->op == nir_texop_tg4 &&
9455 coord = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9461 bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
9469 lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9476 clamped_lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9482 compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
9488 offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
9494 ddx = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9499 ddy = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9504 sample_index = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9514 return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa));
9517 get_image_samples(ctx, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), resource);
9539 acc = emit_extract_vector(ctx, offset, i, s1);
9563 acc = emit_extract_vector(ctx, offset, i, v1);
9590 if (ctx->options->gfx_level == GFX9 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
9594 unpacked_coord.emplace_back(emit_extract_vector(ctx, coord, i, rc));
9618 coords = emit_pack_v1(ctx, unpacked_coord);
9622 prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd,
9628 assert(a16 == g16 || ctx->options->gfx_level >= GFX10);
9634 if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->gfx_level == GFX9) {
9639 for (Temp derv : emit_pack_v1(ctx, unpacked))
9645 bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
9654 ctx->options->gfx_level >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
9655 ? ac_get_sampler_dim(ctx->options->gfx_level, instr->sampler_dim, instr->is_array)
9658 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9684 if (ctx->options->gfx_level == GFX9 && instr->op == nir_texop_txs &&
9695 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
9709 emit_split_vector(ctx, size, size.size());
9713 half_texel[i] = emit_extract_vector(ctx, size, i, v1);
9732 not_needed = bool_to_vector_condition(ctx, not_needed);
9752 ctx->block->instructions.emplace_back(std::move(split));
9770 bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
9785 ctx->block->instructions.emplace_back(std::move(vec));
9797 // FIXME: if (ctx->abi->gfx9_stride_size_workaround) return
9830 ctx->block->instructions.emplace_back(std::move(mubuf));
9832 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
9844 args.emplace_back(emit_pack_v1(ctx, {bias})[0]);
9880 emit_extract_vector(ctx, resource, 1, s1));
9887 emit_extract_vector(ctx, resource, 1, s1));
9892 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
9897 bool separate_g16 = ctx->options->gfx_level >= GFX10 && g16;
10043 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
10046 val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
10059 val[3], emit_extract_vector(ctx, tmp_dst, 4, v1));
10065 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
10069 get_phi_operand(isel_context* ctx, nir_ssa_def* ssa, RegClass rc, bool logical)
10071 Temp tmp = get_ssa_temp(ctx, ssa);
10076 if (ctx->program->wave_size == 64)
10088 visit_phi(isel_context* ctx, nir_phi_instr* instr)
10091 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
10092 assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
10095 logical |= (ctx->block->kind & block_kind_merge) != 0;
10103 std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
10113 unsigned block = ctx->cf_info.nir_to_aco[src.first];
10127 if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
10130 Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
10142 if (!logical && ctx->block->kind & block_kind_loop_header) {
10150 if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
10154 Block* linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
10155 Block* invert = &ctx->program->blocks[linear_else->linear_preds[0]];
10165 insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
10180 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
10184 visit_undef(isel_context* ctx, nir_ssa_undef_instr* instr)
10186 Temp dst = get_ssa_temp(ctx, &instr->def);
10191 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());
10198 ctx->block->instructions.emplace_back(std::move(vec));
10203 begin_loop(isel_context* ctx, loop_context* lc)
10206 append_logical_end(ctx->block);
10207 ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
10208 Builder bld(ctx->program, ctx->block);
10210 unsigned loop_preheader_idx = ctx->block->index;
10212 lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
10214 ctx->program->next_loop_depth++;
10216 Block* loop_header = ctx->program->create_and_insert_block();
10219 ctx->block = loop_header;
10221 append_logical_start(ctx->block);
10223 lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index);
10224 lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit);
10225 lc->divergent_cont_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false);
10226 lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false);
10227 lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false);
10231 end_loop(isel_context* ctx, loop_context* lc)
10235 if (!ctx->cf_info.has_branch) {
10236 unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
10237 Builder bld(ctx->program, ctx->block);
10238 append_logical_end(ctx->block);
10240 if (ctx->cf_info.exec_potentially_empty_discard ||
10241 ctx->cf_info.exec_potentially_empty_break) {
10246 ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
10247 unsigned block_idx = ctx->block->index;
10250 Block* break_block = ctx->program->create_and_insert_block();
10257 Block* continue_block = ctx->program->create_and_insert_block();
10262 add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
10264 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10265 add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
10266 ctx->block = &ctx->program->blocks[block_idx];
10268 ctx->block->kind |= (block_kind_continue | block_kind_uniform);
10269 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10270 add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10272 add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10275 bld.reset(ctx->block);
10279 ctx->cf_info.has_branch = false;
10280 ctx->program->next_loop_depth--;
10284 ctx->block = ctx->program->insert_block(std::move(lc->loop_exit));
10285 append_logical_start(ctx->block);
10308 ctx->cf_info.parent_loop.header_idx = lc->header_idx_old;
10309 ctx->cf_info.parent_loop.exit = lc->exit_old;
10310 ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old;
10311 ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old;
10312 ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old;
10313 if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
10314 ctx->cf_info.exec_potentially_empty_discard = false;
10318 emit_loop_jump(isel_context* ctx, bool is_break)
10320 Builder bld(ctx->program, ctx->block);
10322 append_logical_end(ctx->block);
10323 unsigned idx = ctx->block->index;
10326 logical_target = ctx->cf_info.parent_loop.exit;
10328 ctx->block->kind |= block_kind_break;
10330 if (!ctx->cf_info.parent_if.is_divergent &&
10331 !ctx->cf_info.parent_loop.has_divergent_continue) {
10333 ctx->block->kind |= block_kind_uniform;
10334 ctx->cf_info.has_branch = true;
10339 ctx->cf_info.parent_loop.has_divergent_branch = true;
10341 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10343 ctx->block->kind |= block_kind_continue;
10345 if (!ctx->cf_info.parent_if.is_divergent) {
10347 ctx->block->kind |= block_kind_uniform;
10348 ctx->cf_info.has_branch = true;
10356 ctx->cf_info.parent_loop.has_divergent_continue = true;
10357 ctx->cf_info.parent_loop.has_divergent_branch = true;
10360 if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
10361 ctx->cf_info.exec_potentially_empty_break = true;
10362 ctx->cf_info.exec_potentially_empty_break_depth = ctx->block->loop_nest_depth;
10367 Block* break_block = ctx->program->create_and_insert_block();
10372 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10377 Block* continue_block = ctx->program->create_and_insert_block();
10380 ctx->block = continue_block;
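emit_loop_jump distinguishes uniform from divergent jumps: when neither the enclosing if nor any earlier continue is divergent, the whole wave leaves together via a plain branch (block_kind_uniform, has_branch = true); otherwise only the jumping lanes leave, so the block is marked break/continue, exec_potentially_empty_break is recorded, and a fresh block is created for the lanes that stay. A condensed sketch of that decision; the enum names are illustrative:

// Sketch of the lowering choice in emit_loop_jump. For a break both
// conditions matter; a continue only needs the enclosing if to be uniform.
enum class JumpLowering { unconditional_branch, exec_mask_update };

JumpLowering lower_loop_jump(bool parent_if_divergent, bool divergent_continue)
{
   if (!parent_if_divergent && !divergent_continue)
      return JumpLowering::unconditional_branch; // all active lanes jump
   return JumpLowering::exec_mask_update;        // only some lanes leave
}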
10384 emit_loop_break(isel_context* ctx)
10386 emit_loop_jump(ctx, true);
10390 emit_loop_continue(isel_context* ctx)
10392 emit_loop_jump(ctx, false);
10396 visit_jump(isel_context* ctx, nir_jump_instr* instr)
10398 /* visit_block() would usually do this but divergent jumps update ctx->block */
10399 ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
10402 case nir_jump_break: emit_loop_break(ctx); break;
10403 case nir_jump_continue: emit_loop_continue(ctx); break;
10409 visit_block(isel_context* ctx, nir_block* block)
10413 case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;
10414 case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
10415 case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
10416 case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
10417 case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break;
10418 case nir_instr_type_ssa_undef: visit_undef(ctx, nir_instr_as_ssa_undef(instr)); break;
10420 case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
10425 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10426 ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
10430 create_continue_phis(isel_context* ctx, unsigned first, unsigned last,
10436 unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;
10441 Block& block = ctx->program->blocks[idx];
10465 val = Operand(ctx->program->allocateTmp(rc));
10475 static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);
10476 static void begin_uniform_if_else(isel_context* ctx, if_context* ic);
10477 static void end_uniform_if(isel_context* ctx, if_context* ic);
10480 visit_loop(isel_context* ctx, nir_loop* loop)
10483 begin_loop(ctx, &lc);
10489 Builder bld(ctx->program, ctx->block);
10492 begin_uniform_if_then(ctx, &ic, cond);
10493 emit_loop_break(ctx);
10494 begin_uniform_if_else(ctx, &ic);
10495 end_uniform_if(ctx, &ic);
10498 bool unreachable = visit_cf_list(ctx, &loop->body);
10500 unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
10506 assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);
10507 bool linear = ctx->cf_info.has_branch;
10508 bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
10509 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10524 unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);
10526 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10528 if (ctx->cf_info.has_branch)
10532 create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);
10539 end_loop(ctx, &lc);
10543 begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond)
10547 append_logical_end(ctx->block);
10548 ctx->block->kind |= block_kind_branch;
10551 assert(cond.regClass() == ctx->program->lane_mask);
10555 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10557 ctx->block->instructions.push_back(std::move(branch));
10559 ic->BB_if_idx = ctx->block->index;
10565 ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
10567 ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
10568 ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
10569 ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
10570 ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
10571 ctx->cf_info.parent_if.is_divergent = true;
10574 ctx->cf_info.exec_potentially_empty_discard = false;
10575 ctx->cf_info.exec_potentially_empty_break = false;
10576 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10579 ctx->program->next_divergent_if_logical_depth++;
10580 Block* BB_then_logical = ctx->program->create_and_insert_block();
10582 ctx->block = BB_then_logical;
10587 begin_divergent_if_else(isel_context* ctx, if_context* ic)
10589 Block* BB_then_logical = ctx->block;
10595 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10598 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10601 assert(!ctx->cf_info.has_branch);
10602 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10603 ctx->cf_info.parent_loop.has_divergent_branch = false;
10604 ctx->program->next_divergent_if_logical_depth--;
10607 Block* BB_then_linear = ctx->program->create_and_insert_block();
10613 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10618 ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
10619 ic->invert_idx = ctx->block->index;
10624 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10625 ctx->block->instructions.push_back(std::move(branch));
10627 ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
10628 ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
10630 ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10632 ctx->cf_info.exec_potentially_empty_discard = false;
10633 ctx->cf_info.exec_potentially_empty_break = false;
10634 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10637 ctx->program->next_divergent_if_logical_depth++;
10638 Block* BB_else_logical = ctx->program->create_and_insert_block();
10641 ctx->block = BB_else_logical;
10646 end_divergent_if(isel_context* ctx, if_context* ic)
10648 Block* BB_else_logical = ctx->block;
10655 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10658 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10661 ctx->program->next_divergent_if_logical_depth--;
10663 assert(!ctx->cf_info.has_branch);
10664 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10667 Block* BB_else_linear = ctx->program->create_and_insert_block();
10674 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10679 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10680 append_logical_start(ctx->block);
10682 ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
10683 ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
10684 ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
10685 ctx->cf_info.exec_potentially_empty_break_depth = std::min(
10686 ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10687 if (ctx->block->loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
10688 !ctx->cf_info.parent_if.is_divergent) {
10689 ctx->cf_info.exec_potentially_empty_break = false;
10690 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10693 if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
10694 ctx->cf_info.exec_potentially_empty_discard = false;
10695 ctx->cf_info.exec_potentially_empty_break = false;
10696 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
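A divergent if is built as a diamond: then_logical/then_linear, an invert block that flips exec, else_logical/else_linear, and the BB_endif merge; the exec_potentially_empty_* state is saved on entry and re-merged above (OR for the flags, std::min for the break depth). The exec handling in mask form, as an assumption about the lowered shape rather than literal emitted code:

#include <cstdint>

// Sketch of exec around a divergent if (wave64 masks).
void divergent_if_exec(uint64_t& exec, uint64_t cond)
{
   uint64_t saved = exec;
   exec = saved & cond;   // then side runs with the true lanes
   /* ... then_logical / then_linear ... */
   exec = saved & ~cond;  // invert block hands the rest to the else side
   /* ... else_logical / else_linear ... */
   exec = saved;          // endif: restore the full mask
}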
10701 begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
10705 append_logical_end(ctx->block);
10706 ctx->block->kind |= block_kind_uniform;
10712 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10715 ctx->block->instructions.emplace_back(std::move(branch));
10717 ic->BB_if_idx = ctx->block->index;
10719 ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
10721 ctx->cf_info.has_branch = false;
10722 ctx->cf_info.parent_loop.has_divergent_branch = false;
10725 ctx->program->next_uniform_if_depth++;
10726 Block* BB_then = ctx->program->create_and_insert_block();
10729 ctx->block = BB_then;
10733 begin_uniform_if_else(isel_context* ctx, if_context* ic)
10735 Block* BB_then = ctx->block;
10737 ic->uniform_has_then_branch = ctx->cf_info.has_branch;
10738 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10746 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10754 ctx->cf_info.has_branch = false;
10755 ctx->cf_info.parent_loop.has_divergent_branch = false;
10758 Block* BB_else = ctx->program->create_and_insert_block();
10761 ctx->block = BB_else;
10765 end_uniform_if(isel_context* ctx, if_context* ic)
10767 Block* BB_else = ctx->block;
10769 if (!ctx->cf_info.has_branch) {
10775 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10778 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10783 ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
10784 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10787 ctx->program->next_uniform_if_depth--;
10788 if (!ctx->cf_info.has_branch) {
10789 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10790 append_logical_start(ctx->block);
10795 visit_if(isel_context* ctx, nir_if* if_stmt)
10797 Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
10798 Builder bld(ctx->program, ctx->block);
10819 assert(cond.regClass() == ctx->program->lane_mask);
10820 cond = bool_to_scalar_condition(ctx, cond);
10822 begin_uniform_if_then(ctx, &ic, cond);
10823 visit_cf_list(ctx, &if_stmt->then_list);
10825 begin_uniform_if_else(ctx, &ic);
10826 visit_cf_list(ctx, &if_stmt->else_list);
10828 end_uniform_if(ctx, &ic);
10855 begin_divergent_if_then(ctx, &ic, cond);
10856 visit_cf_list(ctx, &if_stmt->then_list);
10858 begin_divergent_if_else(ctx, &ic);
10859 visit_cf_list(ctx, &if_stmt->else_list);
10861 end_divergent_if(ctx, &ic);
10864 return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
10868 visit_cf_list(isel_context* ctx, struct exec_list* list)
10872 case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;
10874 if (!visit_if(ctx, nir_cf_node_as_if(node)))
10877 case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break;
10885 export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos)
10887 assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG);
10890 ctx->stage.has(SWStage::GS) ? ctx->program->info.vs.outinfo.vs_output_param_offset :
10891 ctx->stage.has(SWStage::TES) ? ctx->program->info.tes.outinfo.vs_output_param_offset :
10892 ctx->stage.has(SWStage::MS) ? ctx->program->info.ms.outinfo.vs_output_param_offset :
10893 ctx->program->info.vs.outinfo.vs_output_param_offset;
10898 unsigned mask = ctx->outputs.mask[slot];
10908 exp->operands[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
10915 exp->valid_mask = ctx->options->gfx_level == GFX10 && is_pos && *next_pos == 0;
10922 ctx->block->instructions.emplace_back(std::move(exp));
10926 export_vs_psiz_layer_viewport_vrs(isel_context* ctx, int* next_pos,
10934 if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) {
10935 exp->operands[0] = Operand(ctx->outputs.temps[VARYING_SLOT_PSIZ * 4u]);
10938 if (ctx->outputs.mask[VARYING_SLOT_LAYER] && !outinfo->writes_layer_per_primitive) {
10939 exp->operands[2] = Operand(ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]);
10942 if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT] && !outinfo->writes_viewport_index_per_primitive) {
10943 if (ctx->options->gfx_level < GFX9) {
10944 exp->operands[3] = Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]);
10947 Builder bld(ctx->program, ctx->block);
10950 Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]));
10958 if (ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_SHADING_RATE]) {
10959 exp->operands[1] = Operand(ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_SHADING_RATE * 4u]);
10963 exp->valid_mask = ctx->options->gfx_level == GFX10 && *next_pos == 0;
10967 ctx->block->instructions.emplace_back(std::move(exp));
10971 create_vs_exports(isel_context* ctx)
10973 assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG);
10975 ctx->stage.has(SWStage::GS) ? &ctx->program->info.vs.outinfo :
10976 ctx->stage.has(SWStage::TES) ? &ctx->program->info.tes.outinfo :
10977 ctx->stage.has(SWStage::MS) ? &ctx->program->info.ms.outinfo :
10978 &ctx->program->info.vs.outinfo;
10981 ctx->block->kind |= block_kind_export_end;
10986 ctx->outputs.mask[VARYING_SLOT_POS] = 0xf;
10990 export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
10994 export_vs_psiz_layer_viewport_vrs(ctx, &next_pos, outinfo);
10996 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
10997 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
10998 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
10999 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
11001 if (ctx->export_clip_dists) {
11002 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
11003 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
11004 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
11005 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
11012 if (ctx->shader && ctx->shader->info.per_primitive_outputs & BITFIELD64_BIT(i))
11015 export_vs_varying(ctx, i, false, NULL);
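/* create_vs_exports() establishes the export order: VARYING_SLOT_POS first
 * with a full 0xf mask, then the psiz/layer/viewport/vrs export, then up to
 * eight clip/cull distances as position exports (CLIP_DIST0 for components
 * 0-3, CLIP_DIST1 for 4-7), optionally duplicated as param exports when
 * export_clip_dists is set, and finally the generic param exports, skipping
 * any slot flagged as a per-primitive output. */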
11020 create_primitive_exports(isel_context* ctx, Temp prim_ch1)
11022 assert(ctx->stage.hw == HWStage::NGG);
11024 ctx->stage.has(SWStage::GS) ? &ctx->program->info.vs.outinfo :
11025 ctx->stage.has(SWStage::TES) ? &ctx->program->info.tes.outinfo :
11026 ctx->stage.has(SWStage::MS) ? &ctx->program->info.ms.outinfo :
11027 &ctx->program->info.vs.outinfo;
11029 Builder bld(ctx->program, ctx->block);
11043 Temp tmp = ctx->outputs.temps[VARYING_SLOT_LAYER * 4u];
11048 Temp tmp = ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u];
11053 Temp tmp = ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_SHADING_RATE * 4u];
11065 if (!(ctx->shader->info.per_primitive_outputs & BITFIELD64_BIT(i)))
11070 export_vs_varying(ctx, i, false, NULL);
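/* create_primitive_exports() is the NGG-only counterpart for per-primitive
 * data: layer, viewport index and shading rate flagged as per-primitive are
 * apparently folded into the primitive export word (prim_ch1), and the
 * remaining per-primitive outputs are sent through export_vs_varying() as
 * ordinary param exports. */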
11075 export_fs_mrt_z(isel_context* ctx)
11077 Builder bld(ctx->program, ctx->block);
11087 if (!ctx->program->info.ps.writes_z &&
11088 (ctx->program->info.ps.writes_stencil || ctx->program->info.ps.writes_sample_mask)) {
11089 compr = ctx->program->gfx_level < GFX11; /* COMPR flag */
11091 if (ctx->program->info.ps.writes_stencil) {
11093 values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
11095 enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x1 : 0x3;
11098 if (ctx->program->info.ps.writes_sample_mask) {
11100 values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
11101 enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x2 : 0xc;
11104 if (ctx->options->key.ps.alpha_to_coverage_via_mrtz &&
11105 (ctx->outputs.mask[FRAG_RESULT_DATA0] & 0x8)) {
11107 assert(ctx->program->gfx_level >= GFX11);
11108 Operand mrtz_alpha = Operand(ctx->outputs.temps[FRAG_RESULT_DATA0 + 3u]);
11111 if (ctx->program->info.ps.writes_sample_mask) {
11121 if (ctx->program->info.ps.writes_z) {
11122 values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]);
11126 if (ctx->program->info.ps.writes_stencil) {
11127 values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
11131 if (ctx->program->info.ps.writes_sample_mask) {
11132 values[2] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
11136 if (ctx->options->key.ps.alpha_to_coverage_via_mrtz &&
11137 (ctx->outputs.mask[FRAG_RESULT_DATA0] & 0x8)) {
11138 assert(ctx->program->gfx_level >= GFX11);
11139 values[3] = Operand(ctx->outputs.temps[FRAG_RESULT_DATA0 + 3u]);
11147 if (ctx->options->gfx_level == GFX6 && ctx->options->family != CHIP_OLAND &&
11148 ctx->options->family != CHIP_HAINAN) {
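/* export_fs_mrt_z() builds the MRTZ export. When only stencil/sample mask
 * are written (no Z), pre-GFX11 parts use the compressed 16-bit export
 * (COMPR flag), which is why the enabled-channel masks differ by gfx level
 * (0x1/0x2 vs 0x3/0xc). With Z written, the layout is depth=0, stencil=1,
 * sample mask=2, and on GFX11 the MRT0 alpha can ride along in channel 3 for
 * alpha-to-coverage via MRTZ. The GFX6 (non-OLAND/HAINAN) special case works
 * around a hardware quirk in how those chips read the MRTZ write mask. */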
11171 export_fs_mrt_color(isel_context* ctx, const struct mrt_color_export* out,
11174 Builder bld(ctx->program, ctx->block);
11207 if (ctx->options->gfx_level >= GFX10) {
11223 if (ctx->options->gfx_level == GFX8 || ctx->options->gfx_level == GFX9) {
11332 if (ctx->program->gfx_level >= GFX11) {
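/* export_fs_mrt_color() converts the shader's color values into the SPI
 * color format of each MRT; the gfx-level checks above select between the
 * packing paths available on GFX8/GFX9, GFX10+ and the GFX11 export layout.
 * Most of the conversion logic falls outside the matched lines. */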
11346 create_fs_null_export(isel_context* ctx)
11352 Builder bld(ctx->program, ctx->block);
11354 unsigned dest = ctx->options->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
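/* create_fs_null_export() emits the mandatory export when nothing else was
 * exported. GFX11 removed the dedicated null target, so there the dummy
 * export goes to an MRT target instead (V_008DFC_SQ_EXP_MRT vs
 * V_008DFC_SQ_EXP_NULL above). */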
11360 create_fs_jump_to_epilog(isel_context* ctx)
11362 Builder bld(ctx->program, ctx->block);
11368 unsigned color_type = (ctx->output_color_types >> (color_index * 2)) & 0x3;
11369 unsigned write_mask = ctx->outputs.mask[slot];
11383 Operand chan(ctx->outputs.temps[slot * 4u + i]);
11389 Temp tmp = convert_int(ctx, bld, chan.getTemp(), 16, 32, sign_ext);
11398 Temp continue_pc = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ps_epilog_pc));
11406 ctx->block->instructions.emplace_back(std::move(jump));
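/* create_fs_jump_to_epilog(): instead of exporting, the shader moves each
 * written color component into the epilog's ABI, widening 16-bit components
 * with convert_int() where the color type requires it, then performs an
 * indirect jump through the 64-bit pointer reconstructed from the
 * ps_epilog_pc argument. */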
11410 create_fs_exports(isel_context* ctx)
11412 Builder bld(ctx->program, ctx->block);
11416 if (ctx->outputs.mask[FRAG_RESULT_DEPTH] || ctx->outputs.mask[FRAG_RESULT_STENCIL] ||
11417 ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
11418 exported |= export_fs_mrt_z(ctx);
11420 if (ctx->program->info.ps.has_epilog) {
11421 create_fs_jump_to_epilog(ctx);
11425 if (!ctx->outputs.mask[i])
11431 out.write_mask = ctx->outputs.mask[i];
11432 out.col_format = (ctx->options->key.ps.col_format >> (4 * out.slot)) & 0xf;
11436 out.values[c] = Operand(ctx->outputs.temps[i * 4u + c]);
11442 exported |= export_fs_mrt_color(ctx, &out, false);
11446 create_fs_null_export(ctx);
11449 ctx->block->kind |= block_kind_export_end;
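/* create_fs_exports() drives the whole FS export path: MRTZ first if
 * depth/stencil/sample mask are written, then either a jump to a separately
 * compiled epilog (has_epilog) or one color export per written MRT slot,
 * with create_fs_null_export() as the fallback so the hardware always sees
 * at least one export before the block is marked block_kind_export_end. */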
11453 emit_stream_output(isel_context* ctx, Temp const* so_buffers, Temp const* so_write_offset,
11456 assert(ctx->stage.hw == HWStage::VS);
11461 unsigned writemask = output->component_mask & ctx->outputs.mask[loc];
11465 if (count == 3 && ctx->options->gfx_level == GFX6) {
11473 Temp write_data = ctx->program->allocateTmp(RegClass(RegType::vgpr, count));
11477 vec->operands[i] = Operand(ctx->outputs.temps[loc * 4 + start + i]);
11479 ctx->block->instructions.emplace_back(std::move(vec));
11490 Builder bld(ctx->program, ctx->block);
11497 store->glc = ctx->program->gfx_level < GFX11;
11500 ctx->block->instructions.emplace_back(std::move(store));
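/* emit_stream_output() writes one transform-feedback output: it intersects
 * the output's component mask with what the shader actually wrote, gathers
 * the components into a VGPR vector and emits a buffer store (glc below
 * GFX11). The GFX6 count == 3 special case presumably splits the write,
 * since three-dword buffer stores are not available on that generation. */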
11505 emit_streamout(isel_context* ctx, unsigned stream)
11507 Builder bld(ctx->program, ctx->block);
11511 get_arg(ctx, ctx->args->ac.streamout_config), Operand::c32(0x70010u));
11513 Temp tid = emit_mbcnt(ctx, bld.tmp(v1));
11518 begin_divergent_if_then(ctx, &ic, can_emit);
11520 bld.reset(ctx->block);
11523 bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.streamout_write_index), tid);
11527 Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
11530 unsigned stride = ctx->program->info.so.strides[i];
11539 get_arg(ctx, ctx->args->ac.streamout_write_index),
11540 get_arg(ctx, ctx->args->ac.streamout_offset[i]));
11548 get_arg(ctx, ctx->args->ac.streamout_offset[i]));
11553 for (unsigned i = 0; i < ctx->program->info.so.num_outputs; i++) {
11554 const struct aco_stream_output* output = &ctx->program->info.so.outputs[i];
11558 emit_stream_output(ctx, so_buffers, so_write_offset, output);
11561 begin_divergent_if_else(ctx, &ic);
11562 end_divergent_if(ctx, &ic);
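/* emit_streamout() wraps those stores in a divergent if: the number of lanes
 * allowed to emit comes from streamout_config (the 0x70010 bit-field
 * extract), the lane id from emit_mbcnt(), and presumably only lanes below
 * that count proceed. Each enabled buffer's write offset is computed as
 * streamout_write_index * stride * 4 plus the per-buffer streamout_offset,
 * after which every output of the requested stream goes through
 * emit_stream_output(). */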
11566 add_startpgm(struct isel_context* ctx)
11569 for (unsigned i = 0; i < ctx->args->ac.arg_count; i++) {
11570 if (ctx->args->ac.args[i].skip)
11572 unsigned align = MIN2(4, util_next_power_of_two(ctx->args->ac.args[i].size));
11573 if (ctx->args->ac.args[i].file == AC_ARG_SGPR && ctx->args->ac.args[i].offset % align)
11574 def_count += ctx->args->ac.args[i].size;
11581 ctx->block->instructions.emplace_back(startpgm);
11582 for (unsigned i = 0, arg = 0; i < ctx->args->ac.arg_count; i++) {
11583 if (ctx->args->ac.args[i].skip)
11586 enum ac_arg_regfile file = ctx->args->ac.args[i].file;
11587 unsigned size = ctx->args->ac.args[i].size;
11588 unsigned reg = ctx->args->ac.args[i].offset;
11594 elems[j] = ctx->program->allocateTmp(s1);
11597 ctx->arg_temps[i] = create_vec_from_array(ctx, elems, size, RegType::sgpr, 4);
11599 Temp dst = ctx->program->allocateTmp(type);
11600 ctx->arg_temps[i] = dst;
11610 ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
11611 if (ctx->program->gfx_level <= GFX10_3) {
11612 ctx->program->scratch_offset = get_arg(ctx, ctx->args->ac.scratch_offset);
11614 if (ctx->program->gfx_level >= GFX9) {
11615 Operand scratch_offset(ctx->program->scratch_offset);
11617 Builder bld(ctx->program, ctx->block);
11619 ctx->program->private_segment_buffer, scratch_offset);
11623 if (ctx->stage.has(SWStage::VS) && ctx->program->info.vs.dynamic_inputs) {
11624 unsigned num_attributes = util_last_bit(ctx->program->info.vs.vb_desc_usage_mask);
11626 Definition def(get_arg(ctx, ctx->args->vs_inputs[i]));
11628 unsigned idx = ctx->args->vs_inputs[i].arg_index;
11629 def.setFixed(PhysReg(256 + ctx->args->ac.args[idx].offset));
11631 ctx->program->vs_inputs.push_back(def);
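/* add_startpgm() materializes the shader arguments: one p_startpgm
 * definition per non-skipped arg, except that SGPR args whose register
 * offset is misaligned for their size get split into s1 pieces and rejoined
 * with create_vec_from_array() (hence the extra def_count above). It also
 * wires up scratch (private_segment_buffer + scratch_offset, combined early
 * on GFX9+) and pre-assigns VGPRs (PhysReg 256 == v0) for dynamic VS inputs
 * that a prolog will have loaded. */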
11639 fix_ls_vgpr_init_bug(isel_context* ctx, Pseudo_instruction* startpgm)
11641 assert(ctx->shader->info.stage == MESA_SHADER_VERTEX);
11642 Builder bld(ctx->program, ctx->block);
11645 get_arg(ctx, ctx->args->ac.merged_wave_info),
11647 Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
11652 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.vertex_id),
11653 get_arg(ctx, ctx->args->ac.instance_id), ls_has_nonzero_hs_threads);
11655 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
11656 get_arg(ctx, ctx->args->ac.vs_rel_patch_id), ls_has_nonzero_hs_threads);
11658 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_patch_id),
11659 get_arg(ctx, ctx->args->ac.vertex_id), ls_has_nonzero_hs_threads);
11661 ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id;
11662 ctx->arg_temps[ctx->args->ac.vs_rel_patch_id.arg_index] = vs_rel_patch_id;
11663 ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id;
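/* fix_ls_vgpr_init_bug(): on parts with the LS VGPR init bug, the VS input
 * VGPRs arrive in different registers depending on whether the merged HS
 * half of the wave has any threads. The code extracts the HS thread count
 * from merged_wave_info, turns it into a lane mask, and v_cndmask-selects
 * the correct source for instance_id, vs_rel_patch_id and vertex_id before
 * stashing the results back into ctx->arg_temps. */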
11667 split_arguments(isel_context* ctx, Pseudo_instruction* startpgm)
11674 emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
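/* split_arguments() eagerly runs emit_split_vector() over each multi-dword
 * startpgm definition so that later per-component extracts are served from
 * ctx->allocated_vec instead of emitting fresh splits. */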
11681 handle_bc_optimize(isel_context* ctx)
11684 Builder bld(ctx->program, ctx->block);
11685 uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
11692 ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
11694 ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
11698 get_arg(ctx, ctx->args->ac.prim_mask), Operand::zero());
11704 emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1);
11706 emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1);
11710 ctx->persp_centroid = bld.tmp(v2);
11711 bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid),
11713 emit_split_vector(ctx, ctx->persp_centroid, 2);
11720 emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1);
11722 emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1);
11726 ctx->linear_centroid = bld.tmp(v2);
11727 bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid),
11729 emit_split_vector(ctx, ctx->linear_centroid, 2);
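/* handle_bc_optimize() deals with the SPI's BC_OPTIMIZE feature: when both
 * center and centroid barycentrics are enabled, the hardware may supply only
 * one set and flags that case in the sign bit of prim_mask (the comparison
 * against zero above). The code selects, per component, between the centroid
 * and center inputs and rebuilds the persp/linear centroid vec2s through
 * p_create_vector; with only one mode enabled it just forwards the argument
 * unchanged. */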
11735 setup_fp_mode(isel_context* ctx, nir_shader* shader)
11737 Program* program = ctx->program;
11786 ctx->block->fp_mode = program->next_fp_mode;
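/* setup_fp_mode() translates the shader's float controls into
 * program->next_fp_mode (denorm/rounding behavior; details not in the
 * matched lines) and seeds the entry block's fp_mode with it. */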
11802 lanecount_to_mask(isel_context* ctx, Temp count, bool allow64 = true)
11806 Builder bld(ctx->program, ctx->block);
11810 if (ctx->program->wave_size == 64) {
11823 cond = emit_extract_vector(ctx, mask, 0, bld.lm);
11830 merged_wave_info_to_mask(isel_context* ctx, unsigned i)
11832 Builder bld(ctx->program, ctx->block);
11836 ? get_arg(ctx, ctx->args->ac.merged_wave_info)
11838 get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(i * 8u));
11840 return lanecount_to_mask(ctx, count);
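/* merged_wave_info packs an 8-bit lane count per merged stage, so
 * merged_wave_info_to_mask() extracts byte i and hands it to
 * lanecount_to_mask(), which turns a count N into an exec-style mask,
 * conceptually mask = (N >= wave_size) ? ~0 : (1 << N) - 1; the wave64 path
 * above presumably special-cases N == 64, which a plain bit-field mask
 * cannot express. */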
11844 ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt)
11848 Builder bld(ctx->program, ctx->block);
11851 if (ctx->program->gfx_level == GFX10 &&
11852 (ctx->stage.has(SWStage::GS) || ctx->program->info.has_ngg_culling)) {
11880 Operand::c32_or_c64(1u, ctx->program->wave_size == 64), first_lane);
11882 Operand::zero(ctx->program->wave_size == 64 ? 8 : 4), bld.scc(prm_cnt_0));
11885 begin_divergent_if_then(ctx, &ic_prim_0, cond);
11886 bld.reset(ctx->block);
11887 ctx->block->kind |= block_kind_export_end;
11901 begin_divergent_if_else(ctx, &ic_prim_0);
11902 end_divergent_if(ctx, &ic_prim_0);
11903 bld.reset(ctx->block);
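/* ngg_emit_sendmsg_gs_alloc_req() sends the GS_ALLOC_REQ message from the
 * first lane of the workgroup. The GFX10 branch above appears to work around
 * a hardware hang when the primitive count is zero (with a GS or NGG
 * culling): it detects prm_cnt == 0 across the wave and, inside the
 * divergent if marked block_kind_export_end, still emits an export so the
 * hardware sees at least one primitive. */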
11915 isel_context ctx = setup_isel_context(program, shader_count, shaders, config, options, info, args, false, false);
11917 bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS);
11921 init_context(&ctx, nir);
11923 setup_fp_mode(&ctx, nir);
11927 Pseudo_instruction* startpgm = add_startpgm(&ctx);
11928 append_logical_start(ctx.block);
11930 if (unlikely(ctx.options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))
11931 fix_ls_vgpr_init_bug(&ctx, startpgm);
11933 split_arguments(&ctx, startpgm);
11937 Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, -1u, 0x3u);
11946 (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
11947 (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));
11950 ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
11952 ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));
11958 Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, -1u, 0u);
11962 Temp cond = merged_wave_info_to_mask(&ctx, i);
11963 begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond);
11967 Builder bld(ctx.program, ctx.block);
11970 bool tcs_skip_barrier = ctx.stage == vertex_tess_control_hs &&
11971 ctx.tcs_temp_only_inputs == nir->info.inputs_read;
11975 ctx.stage == vertex_tess_control_hs &&
11976 program->wave_size % ctx.options->key.tcs.tess_input_vertices == 0 &&
11977 ctx.options->key.tcs.tess_input_vertices == nir->info.tess.tcs_vertices_out
11984 if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
11985 ctx.gs_wave_id = bld.pseudo(aco_opcode::p_extract, bld.def(s1, m0), bld.def(s1, scc),
11986 get_arg(&ctx, args->ac.merged_wave_info), Operand::c32(2u),
11989 } else if (ctx.stage == geometry_gs)
11990 ctx.gs_wave_id = get_arg(&ctx, args->ac.gs_wave_id);
11992 if (ctx.stage == fragment_fs)
11993 handle_bc_optimize(&ctx);
11995 visit_cf_list(&ctx, &func->body);
11997 if (ctx.program->info.so.num_outputs && ctx.stage.hw == HWStage::VS)
11998 emit_streamout(&ctx, 0);
12000 if (ctx.stage.hw == HWStage::VS) {
12001 create_vs_exports(&ctx);
12003 Builder bld(ctx.program, ctx.block);
12006 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1,
12010 if (ctx.stage == fragment_fs) {
12011 create_fs_exports(&ctx);
12015 begin_divergent_if_else(&ctx, &ic_merged_wave_info);
12016 end_divergent_if(&ctx, &ic_merged_wave_info);
12019 if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
12021 ctx.inputs = ctx.outputs;
12022 ctx.outputs = shader_io_state();
12025 cleanup_context(&ctx);
12030 append_logical_end(ctx.block);
12031 ctx.block->kind |= block_kind_uniform;
12032 Builder bld(ctx.program, ctx.block);
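/* The main select_program() loop (fragments above): each NIR shader of a
 * merged pipeline runs between add_startpgm()/append_logical_start() and the
 * export code. For merged stages, merged_wave_info_to_mask() guards each
 * sub-shader in a divergent if so only its own waves execute it; s_barrier
 * separates dependent stages unless the TCS-specific conditions above prove
 * it redundant. VS-like stages then run streamout and create_vs_exports(),
 * GS signals completion via s_sendmsg on gs_wave_id, and fragment shaders
 * finish through handle_bc_optimize() and create_fs_exports(). The
 * tcs_in_out_eq path recycles LS outputs as HS inputs by moving ctx.outputs
 * into ctx.inputs between the two sub-shaders. */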
12044 isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, options, info, args, true, false);
12046 ctx.block->fp_mode = program->next_fp_mode;
12048 add_startpgm(&ctx);
12049 append_logical_start(ctx.block);
12051 Builder bld(ctx.program, ctx.block);
12059 get_arg(&ctx, ctx.args->ac.streamout_config), Operand::c32(0x20018u));
12062 get_arg(&ctx, ctx.args->ac.vertex_id));
12074 memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask));
12080 begin_uniform_if_then(&ctx, &if_contexts.top(), cond);
12081 bld.reset(ctx.block);
12097 load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, 0u, true,
12100 ctx.outputs.mask[i] |= 1 << j;
12101 ctx.outputs.temps[i * 4u + j] = val;
12108 emit_streamout(&ctx, stream);
12109 bld.reset(ctx.block);
12113 create_vs_exports(&ctx);
12117 begin_uniform_if_else(&ctx, &if_contexts.top());
12118 bld.reset(ctx.block);
12123 end_uniform_if(&ctx, &if_contexts.top());
12129 append_logical_end(ctx.block);
12130 ctx.block->kind |= block_kind_uniform;
12131 bld.reset(ctx.block);
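/* The GS copy shader (fragments above): a synthetic HW VS that replays what
 * the GS wrote to the GSVS ring. It reads the stream id out of
 * streamout_config (the 0x20018 bit-field extract), loads each enabled
 * output back with load_vmem_mubuf() at vertex_id-based offsets inside a
 * uniform if per stream, runs emit_streamout() for streams with outputs, and
 * calls create_vs_exports() for the stream being rasterized. */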
12148 isel_context ctx = {};
12149 ctx.program = program;
12150 ctx.args = args;
12151 ctx.options = options;
12152 ctx.stage = program->stage;
12154 ctx.block = ctx.program->create_and_insert_block();
12155 ctx.block->kind = block_kind_top_level;
12159 add_startpgm(&ctx);
12160 append_logical_start(ctx.block);
12162 Builder bld(ctx.program, ctx.block);
12191 append_logical_end(ctx.block);
12192 ctx.block->kind |= block_kind_uniform;
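/* This variant builds its isel_context by hand (no NIR, no
 * setup_isel_context), creating a single block_kind_top_level block and
 * emitting straight-line code between append_logical_start() and
 * append_logical_end(). Judging by the surrounding helpers, this is one of
 * the internal shaders ACO generates without NIR input (e.g. a trap handler
 * or prolog); the listing does not show which. */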
12532 isel_context ctx = setup_isel_context(program, 0, NULL, config, options, info, args, false, true);
12534 ctx.block->fp_mode = program->next_fp_mode;
12536 add_startpgm(&ctx);
12537 append_logical_start(ctx.block);
12539 Builder bld(ctx.program, ctx.block);
12559 Temp inputs = get_arg(&ctx, ctx.args->ps_epilog_inputs[i]);
12561 out.values[c] = Operand(emit_extract_vector(&ctx, inputs, c, v1));
12564 exported |= export_fs_mrt_color(&ctx, &out, true);
12568 create_fs_null_export(&ctx);
12572 append_logical_end(ctx.block);
12573 ctx.block->kind |= block_kind_export_end;
12574 bld.reset(ctx.block);
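/* The PS epilog (fragments above) is compiled as its own program: it unpacks
 * the color values handed over in the ps_epilog_inputs arguments, pushes
 * them through export_fs_mrt_color() with the epilog flag set, falls back to
 * create_fs_null_export() when nothing was exported, and marks the final
 * block as block_kind_export_end. */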