1/* 2 * Copyright © 2018 Valve Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
22 * 23 */ 24 25#include "aco_builder.h" 26#include "aco_ir.h" 27 28#include "util/half_float.h" 29#include "util/memstream.h" 30 31#include <algorithm> 32#include <array> 33#include <vector> 34 35namespace aco { 36 37#ifndef NDEBUG 38void 39perfwarn(Program* program, bool cond, const char* msg, Instruction* instr) 40{ 41 if (cond) { 42 char* out; 43 size_t outsize; 44 struct u_memstream mem; 45 u_memstream_open(&mem, &out, &outsize); 46 FILE* const memf = u_memstream_get(&mem); 47 48 fprintf(memf, "%s: ", msg); 49 aco_print_instr(instr, memf); 50 u_memstream_close(&mem); 51 52 aco_perfwarn(program, out); 53 free(out); 54 55 if (debug_flags & DEBUG_PERFWARN) 56 exit(1); 57 } 58} 59#endif 60 61/** 62 * The optimizer works in 4 phases: 63 * (1) The first pass collects information for each ssa-def, 64 * propagates reg->reg operands of the same type, inline constants 65 * and neg/abs input modifiers. 66 * (2) The second pass combines instructions like mad, omod, clamp and 67 * propagates sgpr's on VALU instructions. 68 * This pass depends on information collected in the first pass. 69 * (3) The third pass goes backwards, and selects instructions, 70 * i.e. decides if a mad instruction is profitable and eliminates dead code. 71 * (4) The fourth pass cleans up the sequence: literals get applied and dead 72 * instructions are removed from the sequence. 
73 */ 74 75struct mad_info { 76 aco_ptr<Instruction> add_instr; 77 uint32_t mul_temp_id; 78 uint16_t literal_idx; 79 bool check_literal; 80 81 mad_info(aco_ptr<Instruction> instr, uint32_t id) 82 : add_instr(std::move(instr)), mul_temp_id(id), literal_idx(0), check_literal(false) 83 {} 84}; 85 86enum Label { 87 label_vec = 1 << 0, 88 label_constant_32bit = 1 << 1, 89 /* label_{abs,neg,mul,omod2,omod4,omod5,clamp} are used for both 16 and 90 * 32-bit operations but this shouldn't cause any issues because we don't 91 * look through any conversions */ 92 label_abs = 1 << 2, 93 label_neg = 1 << 3, 94 label_mul = 1 << 4, 95 label_temp = 1 << 5, 96 label_literal = 1 << 6, 97 label_mad = 1 << 7, 98 label_omod2 = 1 << 8, 99 label_omod4 = 1 << 9, 100 label_omod5 = 1 << 10, 101 label_clamp = 1 << 12, 102 label_undefined = 1 << 14, 103 label_vcc = 1 << 15, 104 label_b2f = 1 << 16, 105 label_add_sub = 1 << 17, 106 label_bitwise = 1 << 18, 107 label_minmax = 1 << 19, 108 label_vopc = 1 << 20, 109 label_uniform_bool = 1 << 21, 110 label_constant_64bit = 1 << 22, 111 label_uniform_bitwise = 1 << 23, 112 label_scc_invert = 1 << 24, 113 label_scc_needed = 1 << 26, 114 label_b2i = 1 << 27, 115 label_fcanonicalize = 1 << 28, 116 label_constant_16bit = 1 << 29, 117 label_usedef = 1 << 30, /* generic label */ 118 label_vop3p = 1ull << 31, /* 1ull to prevent sign extension */ 119 label_canonicalized = 1ull << 32, 120 label_extract = 1ull << 33, 121 label_insert = 1ull << 34, 122 label_dpp16 = 1ull << 35, 123 label_dpp8 = 1ull << 36, 124 label_f2f32 = 1ull << 37, 125 label_f2f16 = 1ull << 38, 126 label_split = 1ull << 39, 127}; 128 129static constexpr uint64_t instr_usedef_labels = 130 label_vec | label_mul | label_mad | label_add_sub | label_vop3p | label_bitwise | 131 label_uniform_bitwise | label_minmax | label_vopc | label_usedef | label_extract | label_dpp16 | 132 label_dpp8 | label_f2f32; 133static constexpr uint64_t instr_mod_labels = 134 label_omod2 | label_omod4 | label_omod5 | 
label_clamp | label_insert | label_f2f16; 135 136static constexpr uint64_t instr_labels = instr_usedef_labels | instr_mod_labels | label_split; 137static constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | 138 label_uniform_bool | label_scc_invert | label_b2i | 139 label_fcanonicalize; 140static constexpr uint32_t val_labels = 141 label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal; 142 143static_assert((instr_labels & temp_labels) == 0, "labels cannot intersect"); 144static_assert((instr_labels & val_labels) == 0, "labels cannot intersect"); 145static_assert((temp_labels & val_labels) == 0, "labels cannot intersect"); 146 147struct ssa_info { 148 uint64_t label; 149 union { 150 uint32_t val; 151 Temp temp; 152 Instruction* instr; 153 }; 154 155 ssa_info() : label(0) {} 156 157 void add_label(Label new_label) 158 { 159 /* Since all the instr_usedef_labels use instr for the same thing 160 * (indicating the defining instruction), there is usually no need to 161 * clear any other instr labels. 
*/ 162 if (new_label & instr_usedef_labels) 163 label &= ~(instr_mod_labels | temp_labels | val_labels); /* instr, temp and val alias */ 164 165 if (new_label & instr_mod_labels) { 166 label &= ~instr_labels; 167 label &= ~(temp_labels | val_labels); /* instr, temp and val alias */ 168 } 169 170 if (new_label & temp_labels) { 171 label &= ~temp_labels; 172 label &= ~(instr_labels | val_labels); /* instr, temp and val alias */ 173 } 174 175 uint32_t const_labels = 176 label_literal | label_constant_32bit | label_constant_64bit | label_constant_16bit; 177 if (new_label & const_labels) { 178 label &= ~val_labels | const_labels; 179 label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */ 180 } else if (new_label & val_labels) { 181 label &= ~val_labels; 182 label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */ 183 } 184 185 label |= new_label; 186 } 187 188 void set_vec(Instruction* vec) 189 { 190 add_label(label_vec); 191 instr = vec; 192 } 193 194 bool is_vec() { return label & label_vec; } 195 196 void set_constant(amd_gfx_level gfx_level, uint64_t constant) 197 { 198 Operand op16 = Operand::c16(constant); 199 Operand op32 = Operand::get_const(gfx_level, constant, 4); 200 add_label(label_literal); 201 val = constant; 202 203 /* check that no upper bits are lost in case of packed 16bit constants */ 204 if (gfx_level >= GFX8 && !op16.isLiteral() && 205 op16.constantValue16(true) == ((constant >> 16) & 0xffff)) 206 add_label(label_constant_16bit); 207 208 if (!op32.isLiteral()) 209 add_label(label_constant_32bit); 210 211 if (Operand::is_constant_representable(constant, 8)) 212 add_label(label_constant_64bit); 213 214 if (label & label_constant_64bit) { 215 val = Operand::c64(constant).constantValue(); 216 if (val != constant) 217 label &= ~(label_literal | label_constant_16bit | label_constant_32bit); 218 } 219 } 220 221 bool is_constant(unsigned bits) 222 { 223 switch (bits) { 224 case 8: return label & label_literal; 225 case 16: 
return label & label_constant_16bit; 226 case 32: return label & label_constant_32bit; 227 case 64: return label & label_constant_64bit; 228 } 229 return false; 230 } 231 232 bool is_literal(unsigned bits) 233 { 234 bool is_lit = label & label_literal; 235 switch (bits) { 236 case 8: return false; 237 case 16: return is_lit && ~(label & label_constant_16bit); 238 case 32: return is_lit && ~(label & label_constant_32bit); 239 case 64: return false; 240 } 241 return false; 242 } 243 244 bool is_constant_or_literal(unsigned bits) 245 { 246 if (bits == 64) 247 return label & label_constant_64bit; 248 else 249 return label & label_literal; 250 } 251 252 void set_abs(Temp abs_temp) 253 { 254 add_label(label_abs); 255 temp = abs_temp; 256 } 257 258 bool is_abs() { return label & label_abs; } 259 260 void set_neg(Temp neg_temp) 261 { 262 add_label(label_neg); 263 temp = neg_temp; 264 } 265 266 bool is_neg() { return label & label_neg; } 267 268 void set_neg_abs(Temp neg_abs_temp) 269 { 270 add_label((Label)((uint32_t)label_abs | (uint32_t)label_neg)); 271 temp = neg_abs_temp; 272 } 273 274 void set_mul(Instruction* mul) 275 { 276 add_label(label_mul); 277 instr = mul; 278 } 279 280 bool is_mul() { return label & label_mul; } 281 282 void set_temp(Temp tmp) 283 { 284 add_label(label_temp); 285 temp = tmp; 286 } 287 288 bool is_temp() { return label & label_temp; } 289 290 void set_mad(Instruction* mad, uint32_t mad_info_idx) 291 { 292 add_label(label_mad); 293 mad->pass_flags = mad_info_idx; 294 instr = mad; 295 } 296 297 bool is_mad() { return label & label_mad; } 298 299 void set_omod2(Instruction* mul) 300 { 301 add_label(label_omod2); 302 instr = mul; 303 } 304 305 bool is_omod2() { return label & label_omod2; } 306 307 void set_omod4(Instruction* mul) 308 { 309 add_label(label_omod4); 310 instr = mul; 311 } 312 313 bool is_omod4() { return label & label_omod4; } 314 315 void set_omod5(Instruction* mul) 316 { 317 add_label(label_omod5); 318 instr = mul; 319 } 320 321 
bool is_omod5() { return label & label_omod5; } 322 323 void set_clamp(Instruction* med3) 324 { 325 add_label(label_clamp); 326 instr = med3; 327 } 328 329 bool is_clamp() { return label & label_clamp; } 330 331 void set_f2f16(Instruction* conv) 332 { 333 add_label(label_f2f16); 334 instr = conv; 335 } 336 337 bool is_f2f16() { return label & label_f2f16; } 338 339 void set_undefined() { add_label(label_undefined); } 340 341 bool is_undefined() { return label & label_undefined; } 342 343 void set_vcc(Temp vcc_val) 344 { 345 add_label(label_vcc); 346 temp = vcc_val; 347 } 348 349 bool is_vcc() { return label & label_vcc; } 350 351 void set_b2f(Temp b2f_val) 352 { 353 add_label(label_b2f); 354 temp = b2f_val; 355 } 356 357 bool is_b2f() { return label & label_b2f; } 358 359 void set_add_sub(Instruction* add_sub_instr) 360 { 361 add_label(label_add_sub); 362 instr = add_sub_instr; 363 } 364 365 bool is_add_sub() { return label & label_add_sub; } 366 367 void set_bitwise(Instruction* bitwise_instr) 368 { 369 add_label(label_bitwise); 370 instr = bitwise_instr; 371 } 372 373 bool is_bitwise() { return label & label_bitwise; } 374 375 void set_uniform_bitwise() { add_label(label_uniform_bitwise); } 376 377 bool is_uniform_bitwise() { return label & label_uniform_bitwise; } 378 379 void set_minmax(Instruction* minmax_instr) 380 { 381 add_label(label_minmax); 382 instr = minmax_instr; 383 } 384 385 bool is_minmax() { return label & label_minmax; } 386 387 void set_vopc(Instruction* vopc_instr) 388 { 389 add_label(label_vopc); 390 instr = vopc_instr; 391 } 392 393 bool is_vopc() { return label & label_vopc; } 394 395 void set_scc_needed() { add_label(label_scc_needed); } 396 397 bool is_scc_needed() { return label & label_scc_needed; } 398 399 void set_scc_invert(Temp scc_inv) 400 { 401 add_label(label_scc_invert); 402 temp = scc_inv; 403 } 404 405 bool is_scc_invert() { return label & label_scc_invert; } 406 407 void set_uniform_bool(Temp uniform_bool) 408 { 409 
add_label(label_uniform_bool); 410 temp = uniform_bool; 411 } 412 413 bool is_uniform_bool() { return label & label_uniform_bool; } 414 415 void set_b2i(Temp b2i_val) 416 { 417 add_label(label_b2i); 418 temp = b2i_val; 419 } 420 421 bool is_b2i() { return label & label_b2i; } 422 423 void set_usedef(Instruction* label_instr) 424 { 425 add_label(label_usedef); 426 instr = label_instr; 427 } 428 429 bool is_usedef() { return label & label_usedef; } 430 431 void set_vop3p(Instruction* vop3p_instr) 432 { 433 add_label(label_vop3p); 434 instr = vop3p_instr; 435 } 436 437 bool is_vop3p() { return label & label_vop3p; } 438 439 void set_fcanonicalize(Temp tmp) 440 { 441 add_label(label_fcanonicalize); 442 temp = tmp; 443 } 444 445 bool is_fcanonicalize() { return label & label_fcanonicalize; } 446 447 void set_canonicalized() { add_label(label_canonicalized); } 448 449 bool is_canonicalized() { return label & label_canonicalized; } 450 451 void set_f2f32(Instruction* cvt) 452 { 453 add_label(label_f2f32); 454 instr = cvt; 455 } 456 457 bool is_f2f32() { return label & label_f2f32; } 458 459 void set_extract(Instruction* extract) 460 { 461 add_label(label_extract); 462 instr = extract; 463 } 464 465 bool is_extract() { return label & label_extract; } 466 467 void set_insert(Instruction* insert) 468 { 469 add_label(label_insert); 470 instr = insert; 471 } 472 473 bool is_insert() { return label & label_insert; } 474 475 void set_dpp16(Instruction* mov) 476 { 477 add_label(label_dpp16); 478 instr = mov; 479 } 480 481 void set_dpp8(Instruction* mov) 482 { 483 add_label(label_dpp8); 484 instr = mov; 485 } 486 487 bool is_dpp() { return label & (label_dpp16 | label_dpp8); } 488 bool is_dpp16() { return label & label_dpp16; } 489 bool is_dpp8() { return label & label_dpp8; } 490 491 void set_split(Instruction* split) 492 { 493 add_label(label_split); 494 instr = split; 495 } 496 497 bool is_split() { return label & label_split; } 498}; 499 500struct opt_ctx { 501 Program* 
program; 502 float_mode fp_mode; 503 std::vector<aco_ptr<Instruction>> instructions; 504 ssa_info* info; 505 std::pair<uint32_t, Temp> last_literal; 506 std::vector<mad_info> mad_infos; 507 std::vector<uint16_t> uses; 508}; 509 510bool 511can_use_VOP3(opt_ctx& ctx, const aco_ptr<Instruction>& instr) 512{ 513 if (instr->isVOP3()) 514 return true; 515 516 if (instr->isVOP3P()) 517 return false; 518 519 if (instr->operands.size() && instr->operands[0].isLiteral() && ctx.program->gfx_level < GFX10) 520 return false; 521 522 if (instr->isDPP() || instr->isSDWA()) 523 return false; 524 525 return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 && 526 instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 && 527 instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 && 528 instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 && 529 instr->opcode != aco_opcode::v_readlane_b32 && 530 instr->opcode != aco_opcode::v_writelane_b32 && 531 instr->opcode != aco_opcode::v_readfirstlane_b32; 532} 533 534bool 535pseudo_propagate_temp(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp temp, unsigned index) 536{ 537 if (instr->definitions.empty()) 538 return false; 539 540 const bool vgpr = 541 instr->opcode == aco_opcode::p_as_uniform || 542 std::all_of(instr->definitions.begin(), instr->definitions.end(), 543 [](const Definition& def) { return def.regClass().type() == RegType::vgpr; }); 544 545 /* don't propagate VGPRs into SGPR instructions */ 546 if (temp.type() == RegType::vgpr && !vgpr) 547 return false; 548 549 bool can_accept_sgpr = 550 ctx.program->gfx_level >= GFX9 || 551 std::none_of(instr->definitions.begin(), instr->definitions.end(), 552 [](const Definition& def) { return def.regClass().is_subdword(); }); 553 554 switch (instr->opcode) { 555 case aco_opcode::p_phi: 556 case aco_opcode::p_linear_phi: 557 case aco_opcode::p_parallelcopy: 558 case 
aco_opcode::p_create_vector: 559 if (temp.bytes() != instr->operands[index].bytes()) 560 return false; 561 break; 562 case aco_opcode::p_extract_vector: 563 case aco_opcode::p_extract: 564 if (temp.type() == RegType::sgpr && !can_accept_sgpr) 565 return false; 566 break; 567 case aco_opcode::p_split_vector: { 568 if (temp.type() == RegType::sgpr && !can_accept_sgpr) 569 return false; 570 /* don't increase the vector size */ 571 if (temp.bytes() > instr->operands[index].bytes()) 572 return false; 573 /* We can decrease the vector size as smaller temporaries are only 574 * propagated by p_as_uniform instructions. 575 * If this propagation leads to invalid IR or hits the assertion below, 576 * it means that some undefined bytes within a dword are begin accessed 577 * and a bug in instruction_selection is likely. */ 578 int decrease = instr->operands[index].bytes() - temp.bytes(); 579 while (decrease > 0) { 580 decrease -= instr->definitions.back().bytes(); 581 instr->definitions.pop_back(); 582 } 583 assert(decrease == 0); 584 break; 585 } 586 case aco_opcode::p_as_uniform: 587 if (temp.regClass() == instr->definitions[0].regClass()) 588 instr->opcode = aco_opcode::p_parallelcopy; 589 break; 590 default: return false; 591 } 592 593 instr->operands[index].setTemp(temp); 594 return true; 595} 596 597/* This expects the DPP modifier to be removed. 
*/ 598bool 599can_apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr) 600{ 601 if (instr->isSDWA() && ctx.program->gfx_level < GFX9) 602 return false; 603 return instr->opcode != aco_opcode::v_readfirstlane_b32 && 604 instr->opcode != aco_opcode::v_readlane_b32 && 605 instr->opcode != aco_opcode::v_readlane_b32_e64 && 606 instr->opcode != aco_opcode::v_writelane_b32 && 607 instr->opcode != aco_opcode::v_writelane_b32_e64 && 608 instr->opcode != aco_opcode::v_permlane16_b32 && 609 instr->opcode != aco_opcode::v_permlanex16_b32; 610} 611 612void 613to_VOP3(opt_ctx& ctx, aco_ptr<Instruction>& instr) 614{ 615 if (instr->isVOP3()) 616 return; 617 618 aco_ptr<Instruction> tmp = std::move(instr); 619 Format format = asVOP3(tmp->format); 620 instr.reset(create_instruction<VOP3_instruction>(tmp->opcode, format, tmp->operands.size(), 621 tmp->definitions.size())); 622 std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin()); 623 for (unsigned i = 0; i < instr->definitions.size(); i++) { 624 instr->definitions[i] = tmp->definitions[i]; 625 if (instr->definitions[i].isTemp()) { 626 ssa_info& info = ctx.info[instr->definitions[i].tempId()]; 627 if (info.label & instr_usedef_labels && info.instr == tmp.get()) 628 info.instr = instr.get(); 629 } 630 } 631 /* we don't need to update any instr_mod_labels because they either haven't 632 * been applied yet or this instruction isn't dead and so they've been ignored */ 633 634 instr->pass_flags = tmp->pass_flags; 635} 636 637bool 638is_operand_vgpr(Operand op) 639{ 640 return op.isTemp() && op.getTemp().type() == RegType::vgpr; 641} 642 643void 644to_SDWA(opt_ctx& ctx, aco_ptr<Instruction>& instr) 645{ 646 aco_ptr<Instruction> tmp = convert_to_SDWA(ctx.program->gfx_level, instr); 647 if (!tmp) 648 return; 649 650 for (unsigned i = 0; i < instr->definitions.size(); i++) { 651 ssa_info& info = ctx.info[instr->definitions[i].tempId()]; 652 if (info.label & instr_labels && info.instr == tmp.get()) 653 info.instr 
= instr.get(); 654 } 655} 656 657/* only covers special cases */ 658bool 659alu_can_accept_constant(aco_opcode opcode, unsigned operand) 660{ 661 switch (opcode) { 662 case aco_opcode::v_interp_p2_f32: 663 case aco_opcode::v_mac_f32: 664 case aco_opcode::v_writelane_b32: 665 case aco_opcode::v_writelane_b32_e64: 666 case aco_opcode::v_cndmask_b32: return operand != 2; 667 case aco_opcode::s_addk_i32: 668 case aco_opcode::s_mulk_i32: 669 case aco_opcode::p_wqm: 670 case aco_opcode::p_extract_vector: 671 case aco_opcode::p_split_vector: 672 case aco_opcode::v_readlane_b32: 673 case aco_opcode::v_readlane_b32_e64: 674 case aco_opcode::v_readfirstlane_b32: 675 case aco_opcode::p_extract: 676 case aco_opcode::p_insert: return operand != 0; 677 default: return true; 678 } 679} 680 681bool 682valu_can_accept_vgpr(aco_ptr<Instruction>& instr, unsigned operand) 683{ 684 if (instr->opcode == aco_opcode::v_readlane_b32 || 685 instr->opcode == aco_opcode::v_readlane_b32_e64 || 686 instr->opcode == aco_opcode::v_writelane_b32 || 687 instr->opcode == aco_opcode::v_writelane_b32_e64) 688 return operand != 1; 689 if (instr->opcode == aco_opcode::v_permlane16_b32 || 690 instr->opcode == aco_opcode::v_permlanex16_b32) 691 return operand == 0; 692 return true; 693} 694 695/* check constant bus and literal limitations */ 696bool 697check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand* operands) 698{ 699 int limit = ctx.program->gfx_level >= GFX10 ? 
2 : 1; 700 Operand literal32(s1); 701 Operand literal64(s2); 702 unsigned num_sgprs = 0; 703 unsigned sgpr[] = {0, 0}; 704 705 for (unsigned i = 0; i < num_operands; i++) { 706 Operand op = operands[i]; 707 708 if (op.hasRegClass() && op.regClass().type() == RegType::sgpr) { 709 /* two reads of the same SGPR count as 1 to the limit */ 710 if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) { 711 if (num_sgprs < 2) 712 sgpr[num_sgprs++] = op.tempId(); 713 limit--; 714 if (limit < 0) 715 return false; 716 } 717 } else if (op.isLiteral()) { 718 if (ctx.program->gfx_level < GFX10) 719 return false; 720 721 if (!literal32.isUndefined() && literal32.constantValue() != op.constantValue()) 722 return false; 723 if (!literal64.isUndefined() && literal64.constantValue() != op.constantValue()) 724 return false; 725 726 /* Any number of 32-bit literals counts as only 1 to the limit. Same 727 * (but separately) for 64-bit literals. */ 728 if (op.size() == 1 && literal32.isUndefined()) { 729 limit--; 730 literal32 = op; 731 } else if (op.size() == 2 && literal64.isUndefined()) { 732 limit--; 733 literal64 = op; 734 } 735 736 if (limit < 0) 737 return false; 738 } 739 } 740 741 return true; 742} 743 744bool 745parse_base_offset(opt_ctx& ctx, Instruction* instr, unsigned op_index, Temp* base, uint32_t* offset, 746 bool prevent_overflow) 747{ 748 Operand op = instr->operands[op_index]; 749 750 if (!op.isTemp()) 751 return false; 752 Temp tmp = op.getTemp(); 753 if (!ctx.info[tmp.id()].is_add_sub()) 754 return false; 755 756 Instruction* add_instr = ctx.info[tmp.id()].instr; 757 758 unsigned mask = 0x3; 759 bool is_sub = false; 760 switch (add_instr->opcode) { 761 case aco_opcode::v_add_u32: 762 case aco_opcode::v_add_co_u32: 763 case aco_opcode::v_add_co_u32_e64: 764 case aco_opcode::s_add_i32: 765 case aco_opcode::s_add_u32: break; 766 case aco_opcode::v_sub_u32: 767 case aco_opcode::v_sub_i32: 768 case aco_opcode::v_sub_co_u32: 769 case aco_opcode::v_sub_co_u32_e64: 770 case 
aco_opcode::s_sub_u32: 771 case aco_opcode::s_sub_i32: 772 mask = 0x2; 773 is_sub = true; 774 break; 775 case aco_opcode::v_subrev_u32: 776 case aco_opcode::v_subrev_co_u32: 777 case aco_opcode::v_subrev_co_u32_e64: 778 mask = 0x1; 779 is_sub = true; 780 break; 781 default: return false; 782 } 783 if (prevent_overflow && !add_instr->definitions[0].isNUW()) 784 return false; 785 786 if (add_instr->usesModifiers()) 787 return false; 788 789 u_foreach_bit (i, mask) { 790 if (add_instr->operands[i].isConstant()) { 791 *offset = add_instr->operands[i].constantValue() * (uint32_t)(is_sub ? -1 : 1); 792 } else if (add_instr->operands[i].isTemp() && 793 ctx.info[add_instr->operands[i].tempId()].is_constant_or_literal(32)) { 794 *offset = ctx.info[add_instr->operands[i].tempId()].val * (uint32_t)(is_sub ? -1 : 1); 795 } else { 796 continue; 797 } 798 if (!add_instr->operands[!i].isTemp()) 799 continue; 800 801 uint32_t offset2 = 0; 802 if (parse_base_offset(ctx, add_instr, !i, base, &offset2, prevent_overflow)) { 803 *offset += offset2; 804 } else { 805 *base = add_instr->operands[!i].getTemp(); 806 } 807 return true; 808 } 809 810 return false; 811} 812 813void 814skip_smem_offset_align(opt_ctx& ctx, SMEM_instruction* smem) 815{ 816 bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4); 817 if (soe && !smem->operands[1].isConstant()) 818 return; 819 /* We don't need to check the constant offset because the address seems to be calculated with 820 * (offset&-4 + const_offset&-4), not (offset+const_offset)&-4. 821 */ 822 823 Operand& op = smem->operands[soe ? 
smem->operands.size() - 1 : 1]; 824 if (!op.isTemp() || !ctx.info[op.tempId()].is_bitwise()) 825 return; 826 827 Instruction* bitwise_instr = ctx.info[op.tempId()].instr; 828 if (bitwise_instr->opcode != aco_opcode::s_and_b32) 829 return; 830 831 if (bitwise_instr->operands[0].constantEquals(-4) && 832 bitwise_instr->operands[1].isOfType(op.regClass().type())) 833 op.setTemp(bitwise_instr->operands[1].getTemp()); 834 else if (bitwise_instr->operands[1].constantEquals(-4) && 835 bitwise_instr->operands[0].isOfType(op.regClass().type())) 836 op.setTemp(bitwise_instr->operands[0].getTemp()); 837} 838 839void 840smem_combine(opt_ctx& ctx, aco_ptr<Instruction>& instr) 841{ 842 /* skip &-4 before offset additions: load((a + 16) & -4, 0) */ 843 if (!instr->operands.empty()) 844 skip_smem_offset_align(ctx, &instr->smem()); 845 846 /* propagate constants and combine additions */ 847 if (!instr->operands.empty() && instr->operands[1].isTemp()) { 848 SMEM_instruction& smem = instr->smem(); 849 ssa_info info = ctx.info[instr->operands[1].tempId()]; 850 851 Temp base; 852 uint32_t offset; 853 bool prevent_overflow = smem.operands[0].size() > 2 || smem.prevent_overflow; 854 if (info.is_constant_or_literal(32) && 855 ((ctx.program->gfx_level == GFX6 && info.val <= 0x3FF) || 856 (ctx.program->gfx_level == GFX7 && info.val <= 0xFFFFFFFF) || 857 (ctx.program->gfx_level >= GFX8 && info.val <= 0xFFFFF))) { 858 instr->operands[1] = Operand::c32(info.val); 859 } else if (parse_base_offset(ctx, instr.get(), 1, &base, &offset, prevent_overflow) && 860 base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->gfx_level >= GFX9 && 861 offset % 4u == 0) { 862 bool soe = smem.operands.size() >= (!smem.definitions.empty() ? 
3 : 4); 863 if (soe) { 864 if (ctx.info[smem.operands.back().tempId()].is_constant_or_literal(32) && 865 ctx.info[smem.operands.back().tempId()].val == 0) { 866 smem.operands[1] = Operand::c32(offset); 867 smem.operands.back() = Operand(base); 868 } 869 } else { 870 SMEM_instruction* new_instr = create_instruction<SMEM_instruction>( 871 smem.opcode, Format::SMEM, smem.operands.size() + 1, smem.definitions.size()); 872 new_instr->operands[0] = smem.operands[0]; 873 new_instr->operands[1] = Operand::c32(offset); 874 if (smem.definitions.empty()) 875 new_instr->operands[2] = smem.operands[2]; 876 new_instr->operands.back() = Operand(base); 877 if (!smem.definitions.empty()) 878 new_instr->definitions[0] = smem.definitions[0]; 879 new_instr->sync = smem.sync; 880 new_instr->glc = smem.glc; 881 new_instr->dlc = smem.dlc; 882 new_instr->nv = smem.nv; 883 new_instr->disable_wqm = smem.disable_wqm; 884 instr.reset(new_instr); 885 } 886 } 887 } 888 889 /* skip &-4 after offset additions: load(a & -4, 16) */ 890 if (!instr->operands.empty()) 891 skip_smem_offset_align(ctx, &instr->smem()); 892} 893 894unsigned 895get_operand_size(aco_ptr<Instruction>& instr, unsigned index) 896{ 897 if (instr->isPseudo()) 898 return instr->operands[index].bytes() * 8u; 899 else if (instr->opcode == aco_opcode::v_mad_u64_u32 || 900 instr->opcode == aco_opcode::v_mad_i64_i32) 901 return index == 2 ? 64 : 32; 902 else if (instr->opcode == aco_opcode::v_fma_mix_f32 || 903 instr->opcode == aco_opcode::v_fma_mixlo_f16) 904 return instr->vop3p().opsel_hi & (1u << index) ? 
16 : 32; 905 else if (instr->isVALU() || instr->isSALU()) 906 return instr_info.operand_size[(int)instr->opcode]; 907 else 908 return 0; 909} 910 911Operand 912get_constant_op(opt_ctx& ctx, ssa_info info, uint32_t bits) 913{ 914 if (bits == 64) 915 return Operand::c32_or_c64(info.val, true); 916 return Operand::get_const(ctx.program->gfx_level, info.val, bits / 8u); 917} 918 919void 920propagate_constants_vop3p(opt_ctx& ctx, aco_ptr<Instruction>& instr, ssa_info& info, unsigned i) 921{ 922 if (!info.is_constant_or_literal(32)) 923 return; 924 925 assert(instr->operands[i].isTemp()); 926 unsigned bits = get_operand_size(instr, i); 927 if (info.is_constant(bits)) { 928 instr->operands[i] = get_constant_op(ctx, info, bits); 929 return; 930 } 931 932 /* The accumulation operand of dot product instructions ignores opsel. */ 933 bool cannot_use_opsel = 934 (instr->opcode == aco_opcode::v_dot4_i32_i8 || instr->opcode == aco_opcode::v_dot2_i32_i16 || 935 instr->opcode == aco_opcode::v_dot4_u32_u8 || instr->opcode == aco_opcode::v_dot2_u32_u16) && 936 i == 2; 937 if (cannot_use_opsel) 938 return; 939 940 /* try to fold inline constants */ 941 VOP3P_instruction* vop3p = &instr->vop3p(); 942 bool opsel_lo = (vop3p->opsel_lo >> i) & 1; 943 bool opsel_hi = (vop3p->opsel_hi >> i) & 1; 944 945 Operand const_op[2]; 946 bool const_opsel[2] = {false, false}; 947 for (unsigned j = 0; j < 2; j++) { 948 if ((unsigned)opsel_lo != j && (unsigned)opsel_hi != j) 949 continue; /* this half is unused */ 950 951 uint16_t val = info.val >> (j ? 
16 : 0); 952 Operand op = Operand::get_const(ctx.program->gfx_level, val, bits / 8u); 953 if (bits == 32 && op.isLiteral()) /* try sign extension */ 954 op = Operand::get_const(ctx.program->gfx_level, val | 0xffff0000, 4); 955 if (bits == 32 && op.isLiteral()) { /* try shifting left */ 956 op = Operand::get_const(ctx.program->gfx_level, val << 16, 4); 957 const_opsel[j] = true; 958 } 959 if (op.isLiteral()) 960 return; 961 const_op[j] = op; 962 } 963 964 Operand const_lo = const_op[0]; 965 Operand const_hi = const_op[1]; 966 bool const_lo_opsel = const_opsel[0]; 967 bool const_hi_opsel = const_opsel[1]; 968 969 if (opsel_lo == opsel_hi) { 970 /* use the single 16bit value */ 971 instr->operands[i] = opsel_lo ? const_hi : const_lo; 972 973 /* opsel must point the same for both halves */ 974 opsel_lo = opsel_lo ? const_hi_opsel : const_lo_opsel; 975 opsel_hi = opsel_lo; 976 } else if (const_lo == const_hi) { 977 /* both constants are the same */ 978 instr->operands[i] = const_lo; 979 980 /* opsel must point the same for both halves */ 981 opsel_lo = const_lo_opsel; 982 opsel_hi = const_lo_opsel; 983 } else if (const_lo.constantValue16(const_lo_opsel) == 984 const_hi.constantValue16(!const_hi_opsel)) { 985 instr->operands[i] = const_hi; 986 987 /* redirect opsel selection */ 988 opsel_lo = opsel_lo ? const_hi_opsel : !const_hi_opsel; 989 opsel_hi = opsel_hi ? const_hi_opsel : !const_hi_opsel; 990 } else if (const_hi.constantValue16(const_hi_opsel) == 991 const_lo.constantValue16(!const_lo_opsel)) { 992 instr->operands[i] = const_lo; 993 994 /* redirect opsel selection */ 995 opsel_lo = opsel_lo ? !const_lo_opsel : const_lo_opsel; 996 opsel_hi = opsel_hi ? 
!const_lo_opsel : const_lo_opsel;
   } else if (bits == 16 && const_lo.constantValue() == (const_hi.constantValue() ^ (1 << 15))) {
      assert(const_lo_opsel == false && const_hi_opsel == false);

      /* const_lo == -const_hi */
      if (!instr_info.can_use_input_modifiers[(int)instr->opcode])
         return;

      /* Encode the shared magnitude once and express the differing sign through
       * the per-half neg modifiers instead of two separate constants. */
      instr->operands[i] = Operand::c16(const_lo.constantValue() & 0x7FFF);
      bool neg_lo = const_lo.constantValue() & (1 << 15);
      vop3p->neg_lo[i] ^= opsel_lo ^ neg_lo;
      vop3p->neg_hi[i] ^= opsel_hi ^ neg_lo;

      /* opsel must point to lo for both operands */
      opsel_lo = false;
      opsel_hi = false;
   }

   /* Write the (possibly redirected) opsel bits for operand i back to the instruction. */
   vop3p->opsel_lo = opsel_lo ? (vop3p->opsel_lo | (1 << i)) : (vop3p->opsel_lo & ~(1 << i));
   vop3p->opsel_hi = opsel_hi ? (vop3p->opsel_hi | (1 << i)) : (vop3p->opsel_hi & ~(1 << i));
}

/* Returns true if the operand is fixed to the exec register. */
bool
fixed_to_exec(Operand op)
{
   return op.isFixed() && op.physReg() == exec;
}

/* Describe instr as a subdword selection (size, byte offset, sign-extension) of its
 * first operand. Returns an invalid (zero) SubdwordSel if instr is not such a selection. */
SubdwordSel
parse_extract(Instruction* instr)
{
   if (instr->opcode == aco_opcode::p_extract) {
      unsigned size = instr->operands[2].constantValue() / 8;
      unsigned offset = instr->operands[1].constantValue() * size;
      bool sext = instr->operands[3].constantEquals(1);
      return SubdwordSel(size, offset, sext);
   } else if (instr->opcode == aco_opcode::p_insert && instr->operands[1].constantEquals(0)) {
      /* an insert into offset 0 behaves like a zero-extending extract */
      return instr->operands[2].constantEquals(8) ? SubdwordSel::ubyte : SubdwordSel::uword;
   } else if (instr->opcode == aco_opcode::p_extract_vector) {
      unsigned size = instr->definitions[0].bytes();
      unsigned offset = instr->operands[1].constantValue() * size;
      if (size <= 2)
         return SubdwordSel(size, offset, false);
   } else if (instr->opcode == aco_opcode::p_split_vector) {
      assert(instr->operands[0].bytes() == 4 && instr->definitions[1].bytes() == 2);
      /* NOTE(review): only the second definition (upper word) is described here —
       * presumably callers only query the high half; confirm against call sites. */
      return SubdwordSel(2, 2, false);
   }

   return SubdwordSel();
}

/* Describe instr as a subdword insertion of its first operand, or return an
 * invalid SubdwordSel if it is not one. */
SubdwordSel
parse_insert(Instruction* instr)
{
   if (instr->opcode == aco_opcode::p_extract && instr->operands[3].constantEquals(0) &&
       instr->operands[1].constantEquals(0)) {
      /* a zero-extending extract from offset 0 behaves like an insert at offset 0 */
      return instr->operands[2].constantEquals(8) ? SubdwordSel::ubyte : SubdwordSel::uword;
   } else if (instr->opcode == aco_opcode::p_insert) {
      unsigned size = instr->operands[2].constantValue() / 8;
      unsigned offset = instr->operands[1].constantValue() * size;
      return SubdwordSel(size, offset, false);
   } else {
      return SubdwordSel();
   }
}

/* Check whether the extract described by info can be folded into operand idx of instr
 * (via SDWA, VOP3 opsel, a cvt_f32_ubyteN opcode, or by merging two p_extracts). */
bool
can_apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& info)
{
   if (idx >= 2)
      return false;

   Temp tmp = info.instr->operands[0].getTemp();
   SubdwordSel sel = parse_extract(info.instr);

   if (!sel) {
      return false;
   } else if (sel.size() == 4) {
      /* full dword selection: nothing to encode */
      return true;
   } else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel.size() == 1 && !sel.sign_extend()) {
      /* can become v_cvt_f32_ubyte0..3 */
      return true;
   } else if (can_use_SDWA(ctx.program->gfx_level, instr, true) &&
              (tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) {
      /* only if the operand doesn't already use a non-dword SDWA selection */
      if (instr->isSDWA() && instr->sdwa().sel[idx] != SubdwordSel::dword)
         return false;
      return true;
   } else if (instr->isVOP3() && sel.size() == 2 &&
              can_use_opsel(ctx.program->gfx_level, instr->opcode, idx) &&
              !(instr->vop3().opsel & (1 << idx))) {
      return true;
   } else if (instr->opcode == aco_opcode::p_extract) {
      SubdwordSel instrSel = parse_extract(instr.get());

      /* the outer offset must be within extracted range */
      if (instrSel.offset() >= sel.size())
         return false;

      /* don't remove the sign-extension when increasing the size further */
      if (instrSel.size() > sel.size() && !instrSel.sign_extend() && sel.sign_extend())
         return false;

      return true;
   }

   return false;
}

/* Combine an p_extract (or p_insert, in some cases) instruction with instr.
 * instr(p_extract(...)) -> instr()
 */
void
apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& info)
{
   Temp tmp = info.instr->operands[0].getTemp();
   SubdwordSel sel = parse_extract(info.instr);
   assert(sel);

   instr->operands[idx].set16bit(false);
   instr->operands[idx].set24bit(false);

   ctx.info[tmp.id()].label &= ~label_insert;

   if (sel.size() == 4) {
      /* full dword selection */
   } else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel.size() == 1 && !sel.sign_extend()) {
      /* fold the byte selection into the opcode itself */
      switch (sel.offset()) {
      case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
      case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;
      case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;
      case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;
      }
   } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && instr->operands[0].isConstant() &&
              sel.offset() == 0 &&
              ((sel.size() == 2 && instr->operands[0].constantValue() >= 16u) ||
               (sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) {
      /* The undesireable upper bits are already shifted out.
       */
      return;
   } else if (can_use_SDWA(ctx.program->gfx_level, instr, true) &&
              (tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) {
      to_SDWA(ctx, instr);
      static_cast<SDWA_instruction*>(instr.get())->sel[idx] = sel;
   } else if (instr->isVOP3()) {
      if (sel.offset())
         instr->vop3().opsel |= 1 << idx;
   } else if (instr->opcode == aco_opcode::p_extract) {
      /* merge the two extracts into one */
      SubdwordSel instrSel = parse_extract(instr.get());

      unsigned size = std::min(sel.size(), instrSel.size());
      unsigned offset = sel.offset() + instrSel.offset();
      unsigned sign_extend =
         instrSel.sign_extend() && (sel.sign_extend() || instrSel.size() <= sel.size());

      instr->operands[1] = Operand::c32(offset / size);
      instr->operands[2] = Operand::c32(size * 8u);
      instr->operands[3] = Operand::c32(sign_extend);
      return;
   }

   /* Output modifier, label_vopc and label_f2f32 seem to be the only one worth keeping at the
    * moment
    */
   for (Definition& def : instr->definitions)
      ctx.info[def.tempId()].label &= (label_vopc | label_f2f32 | instr_mod_labels);
}

/* Drop the extract label from operands whose extract can no longer be folded into instr. */
void
check_sdwa_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   for (unsigned i = 0; i < instr->operands.size(); i++) {
      Operand op = instr->operands[i];
      if (!op.isTemp())
         continue;
      ssa_info& info = ctx.info[op.tempId()];
      if (info.is_extract() && (info.instr->operands[0].getTemp().type() == RegType::vgpr ||
                                op.getTemp().type() == RegType::sgpr)) {
         if (!can_apply_extract(ctx, instr, i, info))
            info.label &= ~label_extract;
      }
   }
}

bool
does_fp_op_flush_denorms(opt_ctx& ctx, aco_opcode op)
{
   if (ctx.program->gfx_level <= GFX8) {
      /* these ops are reported as not flushing denorms on GFX8 and earlier */
      switch (op) {
      case aco_opcode::v_min_f32:
      case aco_opcode::v_max_f32:
      case aco_opcode::v_med3_f32:
      case aco_opcode::v_min3_f32:
      case aco_opcode::v_max3_f32:
      case aco_opcode::v_min_f16:
      case aco_opcode::v_max_f16: return false;
      default: break;
      }
   }
   /* everything else flushes, except v_cndmask_b32 */
   return op != aco_opcode::v_cndmask_b32;
}

/* Check whether the fcanonicalize (x * 1.0) producing tmp is redundant when tmp is
 * consumed by instr: true if the value is already canonical, if the current denorm
 * mode keeps denorms anyway, or if instr itself flushes denorms on its inputs. */
bool
can_eliminate_fcanonicalize(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp tmp)
{
   float_mode* fp = &ctx.fp_mode;
   if (ctx.info[tmp.id()].is_canonicalized() ||
       (tmp.bytes() == 4 ? fp->denorm32 : fp->denorm16_64) == fp_denorm_keep)
      return true;

   aco_opcode op = instr->opcode;
   return instr_info.can_use_input_modifiers[(int)op] && does_fp_op_flush_denorms(ctx, op);
}

/* Check whether an s_and of tmp with exec is superfluous: true if tmp was produced
 * (directly, or through a chain of two-operand bitwise ops) by VOPC instructions
 * that ran with the same exec (identical pass_flags). */
bool
can_eliminate_and_exec(opt_ctx& ctx, Temp tmp, unsigned pass_flags)
{
   if (ctx.info[tmp.id()].is_vopc()) {
      Instruction* vopc_instr = ctx.info[tmp.id()].instr;
      /* Remove superfluous s_and when the VOPC instruction uses the same exec and thus
       * already produces the same result */
      return vopc_instr->pass_flags == pass_flags;
   }
   if (ctx.info[tmp.id()].is_bitwise()) {
      Instruction* instr = ctx.info[tmp.id()].instr;
      if (instr->operands.size() != 2 || instr->pass_flags != pass_flags)
         return false;
      if (!(instr->operands[0].isTemp() && instr->operands[1].isTemp()))
         return false;
      /* recurse into both sources of the bitwise instruction */
      return can_eliminate_and_exec(ctx, instr->operands[0].getTemp(), pass_flags) &&
             can_eliminate_and_exec(ctx, instr->operands[1].getTemp(), pass_flags);
   }
   return false;
}

/* Returns true if info lets instr's operand be treated as a plain copy of info.temp
 * (either a real copy, or an fcanonicalize that instr makes redundant). */
bool
is_copy_label(opt_ctx& ctx, aco_ptr<Instruction>& instr, ssa_info& info)
{
   return info.is_temp() ||
          (info.is_fcanonicalize() && can_eliminate_fcanonicalize(ctx, instr, info.temp));
}

bool
is_op_canonicalized(opt_ctx& ctx, Operand op)
{
   float_mode* fp = &ctx.fp_mode;
   if ((op.isTemp() && ctx.info[op.tempId()].is_canonicalized()) ||
       (op.bytes() == 4 ?
fp->denorm32 : fp->denorm16_64) == fp_denorm_keep) 1241 return true; 1242 1243 if (op.isConstant() || (op.isTemp() && ctx.info[op.tempId()].is_constant_or_literal(32))) { 1244 uint32_t val = op.isTemp() ? ctx.info[op.tempId()].val : op.constantValue(); 1245 if (op.bytes() == 2) 1246 return (val & 0x7fff) == 0 || (val & 0x7fff) > 0x3ff; 1247 else if (op.bytes() == 4) 1248 return (val & 0x7fffffff) == 0 || (val & 0x7fffffff) > 0x7fffff; 1249 } 1250 return false; 1251} 1252 1253bool 1254is_scratch_offset_valid(opt_ctx& ctx, Instruction* instr, int32_t offset) 1255{ 1256 bool negative_unaligned_scratch_offset_bug = ctx.program->gfx_level == GFX10; 1257 int32_t min = ctx.program->dev.scratch_global_offset_min; 1258 int32_t max = ctx.program->dev.scratch_global_offset_max; 1259 1260 bool has_vgpr_offset = instr && !instr->operands[0].isUndefined(); 1261 if (negative_unaligned_scratch_offset_bug && has_vgpr_offset && offset < 0 && offset % 4) 1262 return false; 1263 1264 return offset >= min && offset <= max; 1265} 1266 1267void 1268label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) 1269{ 1270 if (instr->isSALU() || instr->isVALU() || instr->isPseudo()) { 1271 ASSERTED bool all_const = false; 1272 for (Operand& op : instr->operands) 1273 all_const = 1274 all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal(32)); 1275 perfwarn(ctx.program, all_const, "All instruction operands are constant", instr.get()); 1276 1277 ASSERTED bool is_copy = instr->opcode == aco_opcode::s_mov_b32 || 1278 instr->opcode == aco_opcode::s_mov_b64 || 1279 instr->opcode == aco_opcode::v_mov_b32; 1280 perfwarn(ctx.program, is_copy && !instr->usesModifiers(), "Use p_parallelcopy instead", 1281 instr.get()); 1282 } 1283 1284 if (instr->isSMEM()) 1285 smem_combine(ctx, instr); 1286 1287 for (unsigned i = 0; i < instr->operands.size(); i++) { 1288 if (!instr->operands[i].isTemp()) 1289 continue; 1290 1291 ssa_info info = ctx.info[instr->operands[i].tempId()]; 1292 /* 
propagate undef */ 1293 if (info.is_undefined() && is_phi(instr)) 1294 instr->operands[i] = Operand(instr->operands[i].regClass()); 1295 /* propagate reg->reg of same type */ 1296 while (info.is_temp() && info.temp.regClass() == instr->operands[i].getTemp().regClass()) { 1297 instr->operands[i].setTemp(ctx.info[instr->operands[i].tempId()].temp); 1298 info = ctx.info[info.temp.id()]; 1299 } 1300 1301 /* PSEUDO: propagate temporaries */ 1302 if (instr->isPseudo()) { 1303 while (info.is_temp()) { 1304 pseudo_propagate_temp(ctx, instr, info.temp, i); 1305 info = ctx.info[info.temp.id()]; 1306 } 1307 } 1308 1309 /* SALU / PSEUDO: propagate inline constants */ 1310 if (instr->isSALU() || instr->isPseudo()) { 1311 unsigned bits = get_operand_size(instr, i); 1312 if ((info.is_constant(bits) || (info.is_literal(bits) && instr->isPseudo())) && 1313 !instr->operands[i].isFixed() && alu_can_accept_constant(instr->opcode, i)) { 1314 instr->operands[i] = get_constant_op(ctx, info, bits); 1315 continue; 1316 } 1317 } 1318 1319 /* VALU: propagate neg, abs & inline constants */ 1320 else if (instr->isVALU()) { 1321 if (is_copy_label(ctx, instr, info) && info.temp.type() == RegType::vgpr && 1322 valu_can_accept_vgpr(instr, i)) { 1323 instr->operands[i].setTemp(info.temp); 1324 info = ctx.info[info.temp.id()]; 1325 } 1326 /* applying SGPRs to VOP1 doesn't increase code size and DCE is helped by doing it earlier */ 1327 if (info.is_temp() && info.temp.type() == RegType::sgpr && can_apply_sgprs(ctx, instr) && 1328 instr->operands.size() == 1) { 1329 instr->format = withoutDPP(instr->format); 1330 instr->operands[i].setTemp(info.temp); 1331 info = ctx.info[info.temp.id()]; 1332 } 1333 1334 /* for instructions other than v_cndmask_b32, the size of the instruction should match the 1335 * operand size */ 1336 unsigned can_use_mod = 1337 instr->opcode != aco_opcode::v_cndmask_b32 || instr->operands[i].getTemp().bytes() == 4; 1338 can_use_mod = can_use_mod && 
instr_info.can_use_input_modifiers[(int)instr->opcode]; 1339 1340 if (instr->isSDWA()) 1341 can_use_mod = can_use_mod && instr->sdwa().sel[i].size() == 4; 1342 else 1343 can_use_mod = can_use_mod && (instr->isDPP16() || can_use_VOP3(ctx, instr)); 1344 1345 unsigned bits = get_operand_size(instr, i); 1346 bool mod_bitsize_compat = instr->operands[i].bytes() * 8 == bits; 1347 1348 if (info.is_neg() && instr->opcode == aco_opcode::v_add_f32 && mod_bitsize_compat) { 1349 instr->opcode = i ? aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32; 1350 instr->operands[i].setTemp(info.temp); 1351 } else if (info.is_neg() && instr->opcode == aco_opcode::v_add_f16 && mod_bitsize_compat) { 1352 instr->opcode = i ? aco_opcode::v_sub_f16 : aco_opcode::v_subrev_f16; 1353 instr->operands[i].setTemp(info.temp); 1354 } else if (info.is_neg() && can_use_mod && mod_bitsize_compat && 1355 can_eliminate_fcanonicalize(ctx, instr, info.temp)) { 1356 if (!instr->isDPP() && !instr->isSDWA()) 1357 to_VOP3(ctx, instr); 1358 instr->operands[i].setTemp(info.temp); 1359 if (instr->isDPP16() && !instr->dpp16().abs[i]) 1360 instr->dpp16().neg[i] = true; 1361 else if (instr->isSDWA() && !instr->sdwa().abs[i]) 1362 instr->sdwa().neg[i] = true; 1363 else if (instr->isVOP3() && !instr->vop3().abs[i]) 1364 instr->vop3().neg[i] = true; 1365 } 1366 if (info.is_abs() && can_use_mod && mod_bitsize_compat && 1367 can_eliminate_fcanonicalize(ctx, instr, info.temp)) { 1368 if (!instr->isDPP() && !instr->isSDWA()) 1369 to_VOP3(ctx, instr); 1370 instr->operands[i] = Operand(info.temp); 1371 if (instr->isDPP16()) 1372 instr->dpp16().abs[i] = true; 1373 else if (instr->isSDWA()) 1374 instr->sdwa().abs[i] = true; 1375 else 1376 instr->vop3().abs[i] = true; 1377 continue; 1378 } 1379 1380 if (instr->isVOP3P()) { 1381 propagate_constants_vop3p(ctx, instr, info, i); 1382 continue; 1383 } 1384 1385 if (info.is_constant(bits) && alu_can_accept_constant(instr->opcode, i) && 1386 (!instr->isSDWA() || ctx.program->gfx_level 
>= GFX9)) { 1387 Operand op = get_constant_op(ctx, info, bits); 1388 perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, 1389 "v_cndmask_b32 with a constant selector", instr.get()); 1390 if (i == 0 || instr->isSDWA() || instr->opcode == aco_opcode::v_readlane_b32 || 1391 instr->opcode == aco_opcode::v_writelane_b32) { 1392 instr->format = withoutDPP(instr->format); 1393 instr->operands[i] = op; 1394 continue; 1395 } else if (!instr->isVOP3() && can_swap_operands(instr, &instr->opcode)) { 1396 instr->operands[i] = instr->operands[0]; 1397 instr->operands[0] = op; 1398 continue; 1399 } else if (can_use_VOP3(ctx, instr)) { 1400 to_VOP3(ctx, instr); 1401 instr->operands[i] = op; 1402 continue; 1403 } 1404 } 1405 } 1406 1407 /* MUBUF: propagate constants and combine additions */ 1408 else if (instr->isMUBUF()) { 1409 MUBUF_instruction& mubuf = instr->mubuf(); 1410 Temp base; 1411 uint32_t offset; 1412 while (info.is_temp()) 1413 info = ctx.info[info.temp.id()]; 1414 1415 /* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr 1416 * overflow for scratch accesses works only on GFX9+ and saddr overflow 1417 * never works. Since swizzling is the only thing that separates 1418 * scratch accesses and other accesses and swizzling changing how 1419 * addressing works significantly, this probably applies to swizzled 1420 * MUBUF accesses. 
*/ 1421 bool vaddr_prevent_overflow = mubuf.swizzled && ctx.program->gfx_level < GFX9; 1422 1423 if (mubuf.offen && i == 1 && info.is_constant_or_literal(32) && 1424 mubuf.offset + info.val < 4096) { 1425 assert(!mubuf.idxen); 1426 instr->operands[1] = Operand(v1); 1427 mubuf.offset += info.val; 1428 mubuf.offen = false; 1429 continue; 1430 } else if (i == 2 && info.is_constant_or_literal(32) && mubuf.offset + info.val < 4096) { 1431 instr->operands[2] = Operand::c32(0); 1432 mubuf.offset += info.val; 1433 continue; 1434 } else if (mubuf.offen && i == 1 && 1435 parse_base_offset(ctx, instr.get(), i, &base, &offset, 1436 vaddr_prevent_overflow) && 1437 base.regClass() == v1 && mubuf.offset + offset < 4096) { 1438 assert(!mubuf.idxen); 1439 instr->operands[1].setTemp(base); 1440 mubuf.offset += offset; 1441 continue; 1442 } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) && 1443 base.regClass() == s1 && mubuf.offset + offset < 4096) { 1444 instr->operands[i].setTemp(base); 1445 mubuf.offset += offset; 1446 continue; 1447 } 1448 } 1449 1450 /* SCRATCH: propagate constants and combine additions */ 1451 else if (instr->isScratch()) { 1452 FLAT_instruction& scratch = instr->scratch(); 1453 Temp base; 1454 uint32_t offset; 1455 while (info.is_temp()) 1456 info = ctx.info[info.temp.id()]; 1457 1458 if (i <= 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) && 1459 base.regClass() == instr->operands[i].regClass() && 1460 is_scratch_offset_valid(ctx, instr.get(), scratch.offset + (int32_t)offset)) { 1461 instr->operands[i].setTemp(base); 1462 scratch.offset += (int32_t)offset; 1463 continue; 1464 } else if (i <= 1 && info.is_constant_or_literal(32) && 1465 ctx.program->gfx_level >= GFX10_3 && 1466 is_scratch_offset_valid(ctx, NULL, scratch.offset + (int32_t)info.val)) { 1467 /* GFX10.3+ can disable both SADDR and ADDR. 
*/ 1468 instr->operands[i] = Operand(instr->operands[i].regClass()); 1469 scratch.offset += (int32_t)info.val; 1470 continue; 1471 } 1472 } 1473 1474 /* DS: combine additions */ 1475 else if (instr->isDS()) { 1476 1477 DS_instruction& ds = instr->ds(); 1478 Temp base; 1479 uint32_t offset; 1480 bool has_usable_ds_offset = ctx.program->gfx_level >= GFX7; 1481 if (has_usable_ds_offset && i == 0 && 1482 parse_base_offset(ctx, instr.get(), i, &base, &offset, false) && 1483 base.regClass() == instr->operands[i].regClass() && 1484 instr->opcode != aco_opcode::ds_swizzle_b32) { 1485 if (instr->opcode == aco_opcode::ds_write2_b32 || 1486 instr->opcode == aco_opcode::ds_read2_b32 || 1487 instr->opcode == aco_opcode::ds_write2_b64 || 1488 instr->opcode == aco_opcode::ds_read2_b64 || 1489 instr->opcode == aco_opcode::ds_write2st64_b32 || 1490 instr->opcode == aco_opcode::ds_read2st64_b32 || 1491 instr->opcode == aco_opcode::ds_write2st64_b64 || 1492 instr->opcode == aco_opcode::ds_read2st64_b64) { 1493 bool is64bit = instr->opcode == aco_opcode::ds_write2_b64 || 1494 instr->opcode == aco_opcode::ds_read2_b64 || 1495 instr->opcode == aco_opcode::ds_write2st64_b64 || 1496 instr->opcode == aco_opcode::ds_read2st64_b64; 1497 bool st64 = instr->opcode == aco_opcode::ds_write2st64_b32 || 1498 instr->opcode == aco_opcode::ds_read2st64_b32 || 1499 instr->opcode == aco_opcode::ds_write2st64_b64 || 1500 instr->opcode == aco_opcode::ds_read2st64_b64; 1501 unsigned shifts = (is64bit ? 3 : 2) + (st64 ? 
6 : 0); 1502 unsigned mask = BITFIELD_MASK(shifts); 1503 1504 if ((offset & mask) == 0 && ds.offset0 + (offset >> shifts) <= 255 && 1505 ds.offset1 + (offset >> shifts) <= 255) { 1506 instr->operands[i].setTemp(base); 1507 ds.offset0 += offset >> shifts; 1508 ds.offset1 += offset >> shifts; 1509 } 1510 } else { 1511 if (ds.offset0 + offset <= 65535) { 1512 instr->operands[i].setTemp(base); 1513 ds.offset0 += offset; 1514 } 1515 } 1516 } 1517 } 1518 1519 else if (instr->isBranch()) { 1520 if (ctx.info[instr->operands[0].tempId()].is_scc_invert()) { 1521 /* Flip the branch instruction to get rid of the scc_invert instruction */ 1522 instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz 1523 : aco_opcode::p_cbranch_z; 1524 instr->operands[0].setTemp(ctx.info[instr->operands[0].tempId()].temp); 1525 } 1526 } 1527 } 1528 1529 /* if this instruction doesn't define anything, return */ 1530 if (instr->definitions.empty()) { 1531 check_sdwa_extract(ctx, instr); 1532 return; 1533 } 1534 1535 if (instr->isVALU() || instr->isVINTRP()) { 1536 if (instr_info.can_use_output_modifiers[(int)instr->opcode] || instr->isVINTRP() || 1537 instr->opcode == aco_opcode::v_cndmask_b32) { 1538 bool canonicalized = true; 1539 if (!does_fp_op_flush_denorms(ctx, instr->opcode)) { 1540 unsigned ops = instr->opcode == aco_opcode::v_cndmask_b32 ? 
2 : instr->operands.size(); 1541 for (unsigned i = 0; canonicalized && (i < ops); i++) 1542 canonicalized = is_op_canonicalized(ctx, instr->operands[i]); 1543 } 1544 if (canonicalized) 1545 ctx.info[instr->definitions[0].tempId()].set_canonicalized(); 1546 } 1547 1548 if (instr->isVOPC()) { 1549 ctx.info[instr->definitions[0].tempId()].set_vopc(instr.get()); 1550 check_sdwa_extract(ctx, instr); 1551 return; 1552 } 1553 if (instr->isVOP3P()) { 1554 ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get()); 1555 return; 1556 } 1557 } 1558 1559 switch (instr->opcode) { 1560 case aco_opcode::p_create_vector: { 1561 bool copy_prop = instr->operands.size() == 1 && instr->operands[0].isTemp() && 1562 instr->operands[0].regClass() == instr->definitions[0].regClass(); 1563 if (copy_prop) { 1564 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); 1565 break; 1566 } 1567 1568 /* expand vector operands */ 1569 std::vector<Operand> ops; 1570 unsigned offset = 0; 1571 for (const Operand& op : instr->operands) { 1572 /* ensure that any expanded operands are properly aligned */ 1573 bool aligned = offset % 4 == 0 || op.bytes() < 4; 1574 offset += op.bytes(); 1575 if (aligned && op.isTemp() && ctx.info[op.tempId()].is_vec()) { 1576 Instruction* vec = ctx.info[op.tempId()].instr; 1577 for (const Operand& vec_op : vec->operands) 1578 ops.emplace_back(vec_op); 1579 } else { 1580 ops.emplace_back(op); 1581 } 1582 } 1583 1584 /* combine expanded operands to new vector */ 1585 if (ops.size() != instr->operands.size()) { 1586 assert(ops.size() > instr->operands.size()); 1587 Definition def = instr->definitions[0]; 1588 instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, 1589 Format::PSEUDO, ops.size(), 1)); 1590 for (unsigned i = 0; i < ops.size(); i++) { 1591 if (ops[i].isTemp() && ctx.info[ops[i].tempId()].is_temp() && 1592 ops[i].regClass() == ctx.info[ops[i].tempId()].temp.regClass()) 1593 
ops[i].setTemp(ctx.info[ops[i].tempId()].temp); 1594 instr->operands[i] = ops[i]; 1595 } 1596 instr->definitions[0] = def; 1597 } else { 1598 for (unsigned i = 0; i < ops.size(); i++) { 1599 assert(instr->operands[i] == ops[i]); 1600 } 1601 } 1602 ctx.info[instr->definitions[0].tempId()].set_vec(instr.get()); 1603 1604 if (instr->operands.size() == 2) { 1605 /* check if this is created from split_vector */ 1606 if (instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_split()) { 1607 Instruction* split = ctx.info[instr->operands[1].tempId()].instr; 1608 if (instr->operands[0].isTemp() && 1609 instr->operands[0].getTemp() == split->definitions[0].getTemp()) 1610 ctx.info[instr->definitions[0].tempId()].set_temp(split->operands[0].getTemp()); 1611 } 1612 } 1613 break; 1614 } 1615 case aco_opcode::p_split_vector: { 1616 ssa_info& info = ctx.info[instr->operands[0].tempId()]; 1617 1618 if (info.is_constant_or_literal(32)) { 1619 uint64_t val = info.val; 1620 for (Definition def : instr->definitions) { 1621 uint32_t mask = u_bit_consecutive(0, def.bytes() * 8u); 1622 ctx.info[def.tempId()].set_constant(ctx.program->gfx_level, val & mask); 1623 val >>= def.bytes() * 8u; 1624 } 1625 break; 1626 } else if (!info.is_vec()) { 1627 if (instr->definitions.size() == 2 && instr->operands[0].isTemp() && 1628 instr->definitions[0].bytes() == instr->definitions[1].bytes()) { 1629 ctx.info[instr->definitions[1].tempId()].set_split(instr.get()); 1630 if (instr->operands[0].bytes() == 4) { 1631 /* D16 subdword split */ 1632 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); 1633 ctx.info[instr->definitions[1].tempId()].set_extract(instr.get()); 1634 } 1635 } 1636 break; 1637 } 1638 1639 Instruction* vec = ctx.info[instr->operands[0].tempId()].instr; 1640 unsigned split_offset = 0; 1641 unsigned vec_offset = 0; 1642 unsigned vec_index = 0; 1643 for (unsigned i = 0; i < instr->definitions.size(); 1644 split_offset += 
instr->definitions[i++].bytes()) { 1645 while (vec_offset < split_offset && vec_index < vec->operands.size()) 1646 vec_offset += vec->operands[vec_index++].bytes(); 1647 1648 if (vec_offset != split_offset || 1649 vec->operands[vec_index].bytes() != instr->definitions[i].bytes()) 1650 continue; 1651 1652 Operand vec_op = vec->operands[vec_index]; 1653 if (vec_op.isConstant()) { 1654 ctx.info[instr->definitions[i].tempId()].set_constant(ctx.program->gfx_level, 1655 vec_op.constantValue64()); 1656 } else if (vec_op.isUndefined()) { 1657 ctx.info[instr->definitions[i].tempId()].set_undefined(); 1658 } else { 1659 assert(vec_op.isTemp()); 1660 ctx.info[instr->definitions[i].tempId()].set_temp(vec_op.getTemp()); 1661 } 1662 } 1663 break; 1664 } 1665 case aco_opcode::p_extract_vector: { /* mov */ 1666 ssa_info& info = ctx.info[instr->operands[0].tempId()]; 1667 const unsigned index = instr->operands[1].constantValue(); 1668 const unsigned dst_offset = index * instr->definitions[0].bytes(); 1669 1670 if (info.is_vec()) { 1671 /* check if we index directly into a vector element */ 1672 Instruction* vec = info.instr; 1673 unsigned offset = 0; 1674 1675 for (const Operand& op : vec->operands) { 1676 if (offset < dst_offset) { 1677 offset += op.bytes(); 1678 continue; 1679 } else if (offset != dst_offset || op.bytes() != instr->definitions[0].bytes()) { 1680 break; 1681 } 1682 instr->operands[0] = op; 1683 break; 1684 } 1685 } else if (info.is_constant_or_literal(32)) { 1686 /* propagate constants */ 1687 uint32_t mask = u_bit_consecutive(0, instr->definitions[0].bytes() * 8u); 1688 uint32_t val = (info.val >> (dst_offset * 8u)) & mask; 1689 instr->operands[0] = 1690 Operand::get_const(ctx.program->gfx_level, val, instr->definitions[0].bytes()); 1691 ; 1692 } 1693 1694 if (instr->operands[0].bytes() != instr->definitions[0].bytes()) { 1695 if (instr->operands[0].size() != 1) 1696 break; 1697 1698 if (index == 0) 1699 
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); 1700 else 1701 ctx.info[instr->definitions[0].tempId()].set_extract(instr.get()); 1702 break; 1703 } 1704 1705 /* convert this extract into a copy instruction */ 1706 instr->opcode = aco_opcode::p_parallelcopy; 1707 instr->operands.pop_back(); 1708 FALLTHROUGH; 1709 } 1710 case aco_opcode::p_parallelcopy: /* propagate */ 1711 if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_vec() && 1712 instr->operands[0].regClass() != instr->definitions[0].regClass()) { 1713 /* We might not be able to copy-propagate if it's a SGPR->VGPR copy, so 1714 * duplicate the vector instead. 1715 */ 1716 Instruction* vec = ctx.info[instr->operands[0].tempId()].instr; 1717 aco_ptr<Instruction> old_copy = std::move(instr); 1718 1719 instr.reset(create_instruction<Pseudo_instruction>( 1720 aco_opcode::p_create_vector, Format::PSEUDO, vec->operands.size(), 1)); 1721 instr->definitions[0] = old_copy->definitions[0]; 1722 std::copy(vec->operands.begin(), vec->operands.end(), instr->operands.begin()); 1723 for (unsigned i = 0; i < vec->operands.size(); i++) { 1724 Operand& op = instr->operands[i]; 1725 if (op.isTemp() && ctx.info[op.tempId()].is_temp() && 1726 ctx.info[op.tempId()].temp.type() == instr->definitions[0].regClass().type()) 1727 op.setTemp(ctx.info[op.tempId()].temp); 1728 } 1729 ctx.info[instr->definitions[0].tempId()].set_vec(instr.get()); 1730 break; 1731 } 1732 FALLTHROUGH; 1733 case aco_opcode::p_as_uniform: 1734 if (instr->definitions[0].isFixed()) { 1735 /* don't copy-propagate copies into fixed registers */ 1736 } else if (instr->usesModifiers()) { 1737 // TODO 1738 } else if (instr->operands[0].isConstant()) { 1739 ctx.info[instr->definitions[0].tempId()].set_constant( 1740 ctx.program->gfx_level, instr->operands[0].constantValue64()); 1741 } else if (instr->operands[0].isTemp()) { 1742 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); 
1743 if (ctx.info[instr->operands[0].tempId()].is_canonicalized()) 1744 ctx.info[instr->definitions[0].tempId()].set_canonicalized(); 1745 } else { 1746 assert(instr->operands[0].isFixed()); 1747 } 1748 break; 1749 case aco_opcode::v_mov_b32: 1750 if (instr->isDPP16()) { 1751 /* anything else doesn't make sense in SSA */ 1752 assert(instr->dpp16().row_mask == 0xf && instr->dpp16().bank_mask == 0xf); 1753 ctx.info[instr->definitions[0].tempId()].set_dpp16(instr.get()); 1754 } else if (instr->isDPP8()) { 1755 ctx.info[instr->definitions[0].tempId()].set_dpp8(instr.get()); 1756 } 1757 break; 1758 case aco_opcode::p_is_helper: 1759 if (!ctx.program->needs_wqm) 1760 ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->gfx_level, 0u); 1761 break; 1762 case aco_opcode::v_mul_f64: ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); break; 1763 case aco_opcode::v_mul_f16: 1764 case aco_opcode::v_mul_f32: 1765 case aco_opcode::v_mul_legacy_f32: { /* omod */ 1766 ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); 1767 1768 /* TODO: try to move the negate/abs modifier to the consumer instead */ 1769 bool uses_mods = instr->usesModifiers(); 1770 bool fp16 = instr->opcode == aco_opcode::v_mul_f16; 1771 1772 for (unsigned i = 0; i < 2; i++) { 1773 if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) { 1774 if (!instr->isDPP() && !instr->isSDWA() && 1775 (instr->operands[!i].constantEquals(fp16 ? 0x3c00 : 0x3f800000) || /* 1.0 */ 1776 instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u))) { /* -1.0 */ 1777 bool neg1 = instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u); 1778 1779 VOP3_instruction* vop3 = instr->isVOP3() ? 
&instr->vop3() : NULL; 1780 if (vop3 && (vop3->abs[!i] || vop3->neg[!i] || vop3->clamp || vop3->omod)) 1781 continue; 1782 1783 bool abs = vop3 && vop3->abs[i]; 1784 bool neg = neg1 ^ (vop3 && vop3->neg[i]); 1785 1786 Temp other = instr->operands[i].getTemp(); 1787 if (abs && neg && other.type() == RegType::vgpr) 1788 ctx.info[instr->definitions[0].tempId()].set_neg_abs(other); 1789 else if (abs && !neg && other.type() == RegType::vgpr) 1790 ctx.info[instr->definitions[0].tempId()].set_abs(other); 1791 else if (!abs && neg && other.type() == RegType::vgpr) 1792 ctx.info[instr->definitions[0].tempId()].set_neg(other); 1793 else if (!abs && !neg) 1794 ctx.info[instr->definitions[0].tempId()].set_fcanonicalize(other); 1795 } else if (uses_mods) { 1796 continue; 1797 } else if (instr->operands[!i].constantValue() == 1798 (fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */ 1799 ctx.info[instr->operands[i].tempId()].set_omod2(instr.get()); 1800 } else if (instr->operands[!i].constantValue() == 1801 (fp16 ? 0x4400 : 0x40800000)) { /* 4.0 */ 1802 ctx.info[instr->operands[i].tempId()].set_omod4(instr.get()); 1803 } else if (instr->operands[!i].constantValue() == 1804 (fp16 ? 0x3800 : 0x3f000000)) { /* 0.5 */ 1805 ctx.info[instr->operands[i].tempId()].set_omod5(instr.get()); 1806 } else if (instr->operands[!i].constantValue() == 0u && 1807 (!(fp16 ? 
ctx.fp_mode.preserve_signed_zero_inf_nan16_64 1808 : ctx.fp_mode.preserve_signed_zero_inf_nan32) || 1809 instr->opcode == aco_opcode::v_mul_legacy_f32)) { /* 0.0 */ 1810 ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->gfx_level, 0u); 1811 } else { 1812 continue; 1813 } 1814 break; 1815 } 1816 } 1817 break; 1818 } 1819 case aco_opcode::v_mul_lo_u16: 1820 case aco_opcode::v_mul_lo_u16_e64: 1821 case aco_opcode::v_mul_u32_u24: 1822 ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get()); 1823 break; 1824 case aco_opcode::v_med3_f16: 1825 case aco_opcode::v_med3_f32: { /* clamp */ 1826 VOP3_instruction& vop3 = instr->vop3(); 1827 if (vop3.abs[0] || vop3.abs[1] || vop3.abs[2] || vop3.neg[0] || vop3.neg[1] || vop3.neg[2] || 1828 vop3.omod != 0 || vop3.opsel != 0) 1829 break; 1830 1831 unsigned idx = 0; 1832 bool found_zero = false, found_one = false; 1833 bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16; 1834 for (unsigned i = 0; i < 3; i++) { 1835 if (instr->operands[i].constantEquals(0)) 1836 found_zero = true; 1837 else if (instr->operands[i].constantEquals(is_fp16 ? 
0x3c00 : 0x3f800000)) /* 1.0 */ 1838 found_one = true; 1839 else 1840 idx = i; 1841 } 1842 if (found_zero && found_one && instr->operands[idx].isTemp()) 1843 ctx.info[instr->operands[idx].tempId()].set_clamp(instr.get()); 1844 break; 1845 } 1846 case aco_opcode::v_cndmask_b32: 1847 if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(0xFFFFFFFF)) 1848 ctx.info[instr->definitions[0].tempId()].set_vcc(instr->operands[2].getTemp()); 1849 else if (instr->operands[0].constantEquals(0) && 1850 instr->operands[1].constantEquals(0x3f800000u)) 1851 ctx.info[instr->definitions[0].tempId()].set_b2f(instr->operands[2].getTemp()); 1852 else if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(1)) 1853 ctx.info[instr->definitions[0].tempId()].set_b2i(instr->operands[2].getTemp()); 1854 1855 break; 1856 case aco_opcode::v_cmp_lg_u32: 1857 if (instr->format == Format::VOPC && /* don't optimize VOP3 / SDWA / DPP */ 1858 instr->operands[0].constantEquals(0) && instr->operands[1].isTemp() && 1859 ctx.info[instr->operands[1].tempId()].is_vcc()) 1860 ctx.info[instr->definitions[0].tempId()].set_temp( 1861 ctx.info[instr->operands[1].tempId()].temp); 1862 break; 1863 case aco_opcode::p_linear_phi: { 1864 /* lower_bool_phis() can create phis like this */ 1865 bool all_same_temp = instr->operands[0].isTemp(); 1866 /* this check is needed when moving uniform loop counters out of a divergent loop */ 1867 if (all_same_temp) 1868 all_same_temp = instr->definitions[0].regClass() == instr->operands[0].regClass(); 1869 for (unsigned i = 1; all_same_temp && (i < instr->operands.size()); i++) { 1870 if (!instr->operands[i].isTemp() || 1871 instr->operands[i].tempId() != instr->operands[0].tempId()) 1872 all_same_temp = false; 1873 } 1874 if (all_same_temp) { 1875 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); 1876 } else { 1877 bool all_undef = instr->operands[0].isUndefined(); 1878 for (unsigned i = 1; all_undef 
&& (i < instr->operands.size()); i++) { 1879 if (!instr->operands[i].isUndefined()) 1880 all_undef = false; 1881 } 1882 if (all_undef) 1883 ctx.info[instr->definitions[0].tempId()].set_undefined(); 1884 } 1885 break; 1886 } 1887 case aco_opcode::v_add_u32: 1888 case aco_opcode::v_add_co_u32: 1889 case aco_opcode::v_add_co_u32_e64: 1890 case aco_opcode::s_add_i32: 1891 case aco_opcode::s_add_u32: 1892 case aco_opcode::v_subbrev_co_u32: 1893 case aco_opcode::v_sub_u32: 1894 case aco_opcode::v_sub_i32: 1895 case aco_opcode::v_sub_co_u32: 1896 case aco_opcode::v_sub_co_u32_e64: 1897 case aco_opcode::s_sub_u32: 1898 case aco_opcode::s_sub_i32: 1899 case aco_opcode::v_subrev_u32: 1900 case aco_opcode::v_subrev_co_u32: 1901 case aco_opcode::v_subrev_co_u32_e64: 1902 ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get()); 1903 break; 1904 case aco_opcode::s_not_b32: 1905 case aco_opcode::s_not_b64: 1906 if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) { 1907 ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise(); 1908 ctx.info[instr->definitions[1].tempId()].set_scc_invert( 1909 ctx.info[instr->operands[0].tempId()].temp); 1910 } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) { 1911 ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise(); 1912 ctx.info[instr->definitions[1].tempId()].set_scc_invert( 1913 ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); 1914 } 1915 ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get()); 1916 break; 1917 case aco_opcode::s_and_b32: 1918 case aco_opcode::s_and_b64: 1919 if (fixed_to_exec(instr->operands[1]) && instr->operands[0].isTemp()) { 1920 if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) { 1921 /* Try to get rid of the superfluous s_cselect + s_and_b64 that comes from turning a 1922 * uniform bool into divergent */ 1923 ctx.info[instr->definitions[1].tempId()].set_temp( 1924 ctx.info[instr->operands[0].tempId()].temp); 1925 
ctx.info[instr->definitions[0].tempId()].set_uniform_bool( 1926 ctx.info[instr->operands[0].tempId()].temp); 1927 break; 1928 } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) { 1929 /* Try to get rid of the superfluous s_and_b64, since the uniform bitwise instruction 1930 * already produces the same SCC */ 1931 ctx.info[instr->definitions[1].tempId()].set_temp( 1932 ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); 1933 ctx.info[instr->definitions[0].tempId()].set_uniform_bool( 1934 ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); 1935 break; 1936 } else if ((ctx.program->stage.num_sw_stages() > 1 || 1937 ctx.program->stage.hw == HWStage::NGG) && 1938 instr->pass_flags == 1) { 1939 /* In case of merged shaders, pass_flags=1 means that all lanes are active (exec=-1), so 1940 * s_and is unnecessary. */ 1941 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); 1942 break; 1943 } else if (can_eliminate_and_exec(ctx, instr->operands[0].getTemp(), instr->pass_flags)) { 1944 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); 1945 break; 1946 } 1947 } 1948 FALLTHROUGH; 1949 case aco_opcode::s_or_b32: 1950 case aco_opcode::s_or_b64: 1951 case aco_opcode::s_xor_b32: 1952 case aco_opcode::s_xor_b64: 1953 if (std::all_of(instr->operands.begin(), instr->operands.end(), 1954 [&ctx](const Operand& op) 1955 { 1956 return op.isTemp() && (ctx.info[op.tempId()].is_uniform_bool() || 1957 ctx.info[op.tempId()].is_uniform_bitwise()); 1958 })) { 1959 ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise(); 1960 } 1961 ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get()); 1962 break; 1963 case aco_opcode::s_lshl_b32: 1964 case aco_opcode::v_or_b32: 1965 case aco_opcode::v_lshlrev_b32: 1966 case aco_opcode::v_bcnt_u32_b32: 1967 case aco_opcode::v_and_b32: 1968 case aco_opcode::v_xor_b32: 1969 
ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get()); 1970 break; 1971 case aco_opcode::v_min_f32: 1972 case aco_opcode::v_min_f16: 1973 case aco_opcode::v_min_u32: 1974 case aco_opcode::v_min_i32: 1975 case aco_opcode::v_min_u16: 1976 case aco_opcode::v_min_i16: 1977 case aco_opcode::v_min_u16_e64: 1978 case aco_opcode::v_min_i16_e64: 1979 case aco_opcode::v_max_f32: 1980 case aco_opcode::v_max_f16: 1981 case aco_opcode::v_max_u32: 1982 case aco_opcode::v_max_i32: 1983 case aco_opcode::v_max_u16: 1984 case aco_opcode::v_max_i16: 1985 case aco_opcode::v_max_u16_e64: 1986 case aco_opcode::v_max_i16_e64: 1987 ctx.info[instr->definitions[0].tempId()].set_minmax(instr.get()); 1988 break; 1989 case aco_opcode::s_cselect_b64: 1990 case aco_opcode::s_cselect_b32: 1991 if (instr->operands[0].constantEquals((unsigned)-1) && instr->operands[1].constantEquals(0)) { 1992 /* Found a cselect that operates on a uniform bool that comes from eg. s_cmp */ 1993 ctx.info[instr->definitions[0].tempId()].set_uniform_bool(instr->operands[2].getTemp()); 1994 } 1995 if (instr->operands[2].isTemp() && ctx.info[instr->operands[2].tempId()].is_scc_invert()) { 1996 /* Flip the operands to get rid of the scc_invert instruction */ 1997 std::swap(instr->operands[0], instr->operands[1]); 1998 instr->operands[2].setTemp(ctx.info[instr->operands[2].tempId()].temp); 1999 } 2000 break; 2001 case aco_opcode::p_wqm: 2002 if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_scc_invert()) { 2003 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); 2004 } 2005 break; 2006 case aco_opcode::s_mul_i32: 2007 /* Testing every uint32_t shows that 0x3f800000*n is never a denormal. 2008 * This pattern is created from a uniform nir_op_b2f. 
*/ 2009 if (instr->operands[0].constantEquals(0x3f800000u)) 2010 ctx.info[instr->definitions[0].tempId()].set_canonicalized(); 2011 break; 2012 case aco_opcode::p_extract: { 2013 if (instr->definitions[0].bytes() == 4) { 2014 ctx.info[instr->definitions[0].tempId()].set_extract(instr.get()); 2015 if (instr->operands[0].regClass() == v1 && parse_insert(instr.get())) 2016 ctx.info[instr->operands[0].tempId()].set_insert(instr.get()); 2017 } 2018 break; 2019 } 2020 case aco_opcode::p_insert: { 2021 if (instr->operands[0].bytes() == 4) { 2022 if (instr->operands[0].regClass() == v1) 2023 ctx.info[instr->operands[0].tempId()].set_insert(instr.get()); 2024 if (parse_extract(instr.get())) 2025 ctx.info[instr->definitions[0].tempId()].set_extract(instr.get()); 2026 ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get()); 2027 } 2028 break; 2029 } 2030 case aco_opcode::ds_read_u8: 2031 case aco_opcode::ds_read_u8_d16: 2032 case aco_opcode::ds_read_u16: 2033 case aco_opcode::ds_read_u16_d16: { 2034 ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get()); 2035 break; 2036 } 2037 case aco_opcode::v_cvt_f16_f32: { 2038 if (instr->operands[0].isTemp()) 2039 ctx.info[instr->operands[0].tempId()].set_f2f16(instr.get()); 2040 break; 2041 } 2042 case aco_opcode::v_cvt_f32_f16: { 2043 if (instr->operands[0].isTemp()) 2044 ctx.info[instr->definitions[0].tempId()].set_f2f32(instr.get()); 2045 break; 2046 } 2047 default: break; 2048 } 2049 2050 /* Don't remove label_extract if we can't apply the extract to 2051 * neg/abs instructions because we'll likely combine it into another valu. 
 */
   /* NOTE(review): tail of label_instruction() — the function starts before this view. */
   if (!(ctx.info[instr->definitions[0].tempId()].label & (label_neg | label_abs)))
      check_sdwa_extract(ctx, instr);
}

/* Returns the id of the temp that \p tmp is a known copy of (label_temp
 * propagation), or tmp's own id if it is not a copy. */
unsigned
original_temp_id(opt_ctx& ctx, Temp tmp)
{
   if (ctx.info[tmp.id()].is_temp())
      return ctx.info[tmp.id()].temp.id();
   else
      return tmp.id();
}

/* Removes one use of instr's first definition. If that definition thereby
 * becomes unused, the whole instruction is dead, so the uses it holds on its
 * own operands are released as well. */
void
decrease_uses(opt_ctx& ctx, Instruction* instr)
{
   if (!--ctx.uses[instr->definitions[0].tempId()]) {
      for (const Operand& op : instr->operands) {
         if (op.isTemp())
            ctx.uses[op.tempId()]--;
      }
   }
}

/* Returns the instruction defining \p op if it is safe to inspect/combine:
 * op must be a temp carrying one of the usedef labels and, unless
 * ignore_uses is set, must have no other user. An instruction whose second
 * definition (e.g. SCC/carry) is still used is never returned, since
 * combining it away would lose that definition. Returns nullptr otherwise. */
Instruction*
follow_operand(opt_ctx& ctx, Operand op, bool ignore_uses = false)
{
   if (!op.isTemp() || !(ctx.info[op.tempId()].label & instr_usedef_labels))
      return nullptr;
   if (!ignore_uses && ctx.uses[op.tempId()] > 1)
      return nullptr;

   Instruction* instr = ctx.info[op.tempId()].instr;

   if (instr->definitions.size() == 2) {
      assert(instr->definitions[0].isTemp() && instr->definitions[0].tempId() == op.tempId());
      if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
         return nullptr;
   }

   return instr;
}

/* s_or_b64(neq(a, a), neq(b, b)) -> v_cmp_u_f32(a, b)
 * s_and_b64(eq(a, a), eq(b, b)) -> v_cmp_o_f32(a, b) */
bool
combine_ordering_test(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   if (instr->definitions[0].regClass() != ctx.program->lane_mask)
      return false;
   /* the second definition (SCC) must be unused */
   if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
      return false;

   bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;

   bool neg[2] = {false, false};
   bool abs[2] = {false, false};
   uint8_t opsel = 0;
   Instruction* op_instr[2];
   Temp op[2];

   unsigned bitsize = 0;
   for (unsigned i = 0; i < 2; i++) {
      /* both operands must be self-comparisons of the matching kind
       * (neq(a, a) for the OR form, eq(a, a) for the AND form) */
      op_instr[i] = follow_operand(ctx, instr->operands[i], true);
      if (!op_instr[i])
         return false;

      aco_opcode expected_cmp = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
      unsigned op_bitsize = get_cmp_bitsize(op_instr[i]->opcode);

      if (get_f32_cmp(op_instr[i]->opcode) != expected_cmp)
         return false;
      /* both comparisons must operate on the same float width */
      if (bitsize && op_bitsize != bitsize)
         return false;
      if (!op_instr[i]->operands[0].isTemp() || !op_instr[i]->operands[1].isTemp())
         return false;

      if (op_instr[i]->isVOP3()) {
         VOP3_instruction& vop3 = op_instr[i]->vop3();
         /* modifiers must be identical on both sources so they can be
          * carried over to a single source of the merged comparison */
         if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 ||
             vop3.opsel == 2)
            return false;
         neg[i] = vop3.neg[0];
         abs[i] = vop3.abs[0];
         opsel |= (vop3.opsel & 1) << i;
      } else if (op_instr[i]->isSDWA()) {
         return false;
      }

      /* both sources of the self-comparison must trace back to the same temp */
      Temp op0 = op_instr[i]->operands[0].getTemp();
      Temp op1 = op_instr[i]->operands[1].getTemp();
      if (original_temp_id(ctx, op0) != original_temp_id(ctx, op1))
         return false;

      op[i] = op1;
      bitsize = op_bitsize;
   }

   if (op[1].type() == RegType::sgpr)
      std::swap(op[0], op[1]);
   /* limit SGPR operands; GFX10+ allows two, older hw one — presumably the
    * constant-bus read limit (TODO confirm) */
   unsigned num_sgprs = (op[0].type() == RegType::sgpr) + (op[1].type() == RegType::sgpr);
   if (num_sgprs > (ctx.program->gfx_level >= GFX10 ? 2 : 1))
      return false;

   /* the merged comparison keeps both temps alive; the old comparisons die */
   ctx.uses[op[0].id()]++;
   ctx.uses[op[1].id()]++;
   decrease_uses(ctx, op_instr[0]);
   decrease_uses(ctx, op_instr[1]);

   aco_opcode new_op = aco_opcode::num_opcodes;
   switch (bitsize) {
   case 16: new_op = is_or ? aco_opcode::v_cmp_u_f16 : aco_opcode::v_cmp_o_f16; break;
   case 32: new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; break;
   case 64: new_op = is_or ? aco_opcode::v_cmp_u_f64 : aco_opcode::v_cmp_o_f64; break;
   }
   Instruction* new_instr;
   /* VOP3 encoding is needed for modifiers or a second SGPR operand */
   if (neg[0] || neg[1] || abs[0] || abs[1] || opsel || num_sgprs > 1) {
      VOP3_instruction* vop3 =
         create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
      for (unsigned i = 0; i < 2; i++) {
         vop3->neg[i] = neg[i];
         vop3->abs[i] = abs[i];
      }
      vop3->opsel = opsel;
      new_instr = static_cast<Instruction*>(vop3);
   } else {
      new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
   }
   new_instr->operands[0] = Operand(op[0]);
   new_instr->operands[1] = Operand(op[1]);
   new_instr->definitions[0] = instr->definitions[0];

   ctx.info[instr->definitions[0].tempId()].label = 0;
   ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);

   instr.reset(new_instr);

   return true;
}

/* s_or_b64(v_cmp_u_f32(a, b), cmp(a, b)) -> get_unordered(cmp)(a, b)
 * s_and_b64(v_cmp_o_f32(a, b), cmp(a, b)) -> get_ordered(cmp)(a, b) */
bool
combine_comparison_ordering(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   if (instr->definitions[0].regClass() != ctx.program->lane_mask)
      return false;
   /* the second definition (SCC) must be unused */
   if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
      return false;

   bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
   aco_opcode expected_nan_test = is_or ?
                                             aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32;

   Instruction* nan_test = follow_operand(ctx, instr->operands[0], true);
   Instruction* cmp = follow_operand(ctx, instr->operands[1], true);
   if (!nan_test || !cmp)
      return false;
   if (nan_test->isSDWA() || cmp->isSDWA())
      return false;

   /* operand order is flexible: accept the NaN test in either position */
   if (get_f32_cmp(cmp->opcode) == expected_nan_test)
      std::swap(nan_test, cmp);
   else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)
      return false;

   if (!is_cmp(cmp->opcode) || get_cmp_bitsize(cmp->opcode) != get_cmp_bitsize(nan_test->opcode))
      return false;

   if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
      return false;
   if (!cmp->operands[0].isTemp() || !cmp->operands[1].isTemp())
      return false;

   /* both comparisons must read the same two values (modulo copy
    * propagation), in either order */
   unsigned prop_cmp0 = original_temp_id(ctx, cmp->operands[0].getTemp());
   unsigned prop_cmp1 = original_temp_id(ctx, cmp->operands[1].getTemp());
   unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
   unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
   if (prop_cmp0 != prop_nan0 && prop_cmp0 != prop_nan1)
      return false;
   if (prop_cmp1 != prop_nan0 && prop_cmp1 != prop_nan1)
      return false;

   /* keep cmp's operands alive for the replacement, then retire both old
    * comparisons */
   ctx.uses[cmp->operands[0].tempId()]++;
   ctx.uses[cmp->operands[1].tempId()]++;
   decrease_uses(ctx, nan_test);
   decrease_uses(ctx, cmp);

   aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
   Instruction* new_instr;
   if (cmp->isVOP3()) {
      /* preserve all of cmp's VOP3 modifiers on the merged comparison */
      VOP3_instruction* new_vop3 =
         create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
      VOP3_instruction& cmp_vop3 = cmp->vop3();
      memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs));
      memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg));
      new_vop3->clamp = cmp_vop3.clamp;
      new_vop3->omod = cmp_vop3.omod;
      new_vop3->opsel = cmp_vop3.opsel;
      new_instr = new_vop3;
   } else {
      new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
   }
   new_instr->operands[0] = cmp->operands[0];
   new_instr->operands[1] = cmp->operands[1];
   new_instr->definitions[0] = instr->definitions[0];

   ctx.info[instr->definitions[0].tempId()].label = 0;
   ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);

   instr.reset(new_instr);

   return true;
}

/* Returns true and stores the value in *value if \p op is either an inline
 * constant/literal or a temp that is known to hold a constant of the given
 * bit size. */
bool
is_operand_constant(opt_ctx& ctx, Operand op, unsigned bit_size, uint64_t* value)
{
   if (op.isConstant()) {
      *value = op.constantValue64();
      return true;
   } else if (op.isTemp()) {
      unsigned id = original_temp_id(ctx, op.getTemp());
      if (!ctx.info[id].is_constant_or_literal(bit_size))
         return false;
      *value = get_constant_op(ctx, ctx.info[id], bit_size).constantValue64();
      return true;
   }
   return false;
}

/* IEEE-754 NaN test on the raw bit pattern: all-ones exponent with a
 * non-zero mantissa, for 16/32/64-bit floats. */
bool
is_constant_nan(uint64_t value, unsigned bit_size)
{
   if (bit_size == 16)
      return ((value >> 10) & 0x1f) == 0x1f && (value & 0x3ff);
   else if (bit_size == 32)
      return ((value >> 23) & 0xff) == 0xff && (value & 0x7fffff);
   else
      return ((value >> 52) & 0x7ff) == 0x7ff && (value & 0xfffffffffffff);
}

/* s_or_b64(v_cmp_neq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_unordered(cmp)(a, b)
 * s_and_b64(v_cmp_eq_f32(a, a), cmp(a, #b)) and b is not NaN ->
get_ordered(cmp)(a, b) */ 2295bool 2296combine_constant_comparison_ordering(opt_ctx& ctx, aco_ptr<Instruction>& instr) 2297{ 2298 if (instr->definitions[0].regClass() != ctx.program->lane_mask) 2299 return false; 2300 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) 2301 return false; 2302 2303 bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32; 2304 2305 Instruction* nan_test = follow_operand(ctx, instr->operands[0], true); 2306 Instruction* cmp = follow_operand(ctx, instr->operands[1], true); 2307 2308 if (!nan_test || !cmp || nan_test->isSDWA() || cmp->isSDWA()) 2309 return false; 2310 if (nan_test->isSDWA() || cmp->isSDWA()) 2311 return false; 2312 2313 aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32; 2314 if (get_f32_cmp(cmp->opcode) == expected_nan_test) 2315 std::swap(nan_test, cmp); 2316 else if (get_f32_cmp(nan_test->opcode) != expected_nan_test) 2317 return false; 2318 2319 unsigned bit_size = get_cmp_bitsize(cmp->opcode); 2320 if (!is_cmp(cmp->opcode) || get_cmp_bitsize(nan_test->opcode) != bit_size) 2321 return false; 2322 2323 if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp()) 2324 return false; 2325 if (!cmp->operands[0].isTemp() && !cmp->operands[1].isTemp()) 2326 return false; 2327 2328 unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp()); 2329 unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp()); 2330 if (prop_nan0 != prop_nan1) 2331 return false; 2332 2333 if (nan_test->isVOP3()) { 2334 VOP3_instruction& vop3 = nan_test->vop3(); 2335 if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 || 2336 vop3.opsel == 2) 2337 return false; 2338 } 2339 2340 int constant_operand = -1; 2341 for (unsigned i = 0; i < 2; i++) { 2342 if (cmp->operands[i].isTemp() && 2343 original_temp_id(ctx, cmp->operands[i].getTemp()) == prop_nan0) { 2344 constant_operand = 
!i; 2345 break; 2346 } 2347 } 2348 if (constant_operand == -1) 2349 return false; 2350 2351 uint64_t constant_value; 2352 if (!is_operand_constant(ctx, cmp->operands[constant_operand], bit_size, &constant_value)) 2353 return false; 2354 if (is_constant_nan(constant_value, bit_size)) 2355 return false; 2356 2357 if (cmp->operands[0].isTemp()) 2358 ctx.uses[cmp->operands[0].tempId()]++; 2359 if (cmp->operands[1].isTemp()) 2360 ctx.uses[cmp->operands[1].tempId()]++; 2361 decrease_uses(ctx, nan_test); 2362 decrease_uses(ctx, cmp); 2363 2364 aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode); 2365 Instruction* new_instr; 2366 if (cmp->isVOP3()) { 2367 VOP3_instruction* new_vop3 = 2368 create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOPC), 2, 1); 2369 VOP3_instruction& cmp_vop3 = cmp->vop3(); 2370 memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs)); 2371 memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg)); 2372 new_vop3->clamp = cmp_vop3.clamp; 2373 new_vop3->omod = cmp_vop3.omod; 2374 new_vop3->opsel = cmp_vop3.opsel; 2375 new_instr = new_vop3; 2376 } else { 2377 new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1); 2378 } 2379 new_instr->operands[0] = cmp->operands[0]; 2380 new_instr->operands[1] = cmp->operands[1]; 2381 new_instr->definitions[0] = instr->definitions[0]; 2382 2383 ctx.info[instr->definitions[0].tempId()].label = 0; 2384 ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr); 2385 2386 instr.reset(new_instr); 2387 2388 return true; 2389} 2390 2391/* s_andn2(exec, cmp(a, b)) -> get_inverse(cmp)(a, b) */ 2392bool 2393combine_inverse_comparison(opt_ctx& ctx, aco_ptr<Instruction>& instr) 2394{ 2395 if (!instr->operands[0].isFixed() || instr->operands[0].physReg() != exec) 2396 return false; 2397 if (ctx.uses[instr->definitions[1].tempId()]) 2398 return false; 2399 2400 Instruction* cmp = follow_operand(ctx, instr->operands[1]); 2401 if (!cmp) 2402 return false; 2403 2404 
   aco_opcode new_opcode = get_inverse(cmp->opcode);
   if (new_opcode == aco_opcode::num_opcodes)
      return false;

   if (cmp->operands[0].isTemp())
      ctx.uses[cmp->operands[0].tempId()]++;
   if (cmp->operands[1].isTemp())
      ctx.uses[cmp->operands[1].tempId()]++;
   decrease_uses(ctx, cmp);

   /* This creates a new instruction instead of modifying the existing
    * comparison so that the comparison is done with the correct exec mask. */
   Instruction* new_instr;
   if (cmp->isVOP3()) {
      /* clone the comparison, carrying over all VOP3 modifiers */
      VOP3_instruction* new_vop3 =
         create_instruction<VOP3_instruction>(new_opcode, asVOP3(Format::VOPC), 2, 1);
      VOP3_instruction& cmp_vop3 = cmp->vop3();
      memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs));
      memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg));
      new_vop3->clamp = cmp_vop3.clamp;
      new_vop3->omod = cmp_vop3.omod;
      new_vop3->opsel = cmp_vop3.opsel;
      new_instr = new_vop3;
   } else if (cmp->isSDWA()) {
      /* clone with all SDWA selects/modifiers preserved */
      SDWA_instruction* new_sdwa = create_instruction<SDWA_instruction>(
         new_opcode, (Format)((uint16_t)Format::SDWA | (uint16_t)Format::VOPC), 2, 1);
      SDWA_instruction& cmp_sdwa = cmp->sdwa();
      memcpy(new_sdwa->abs, cmp_sdwa.abs, sizeof(new_sdwa->abs));
      memcpy(new_sdwa->sel, cmp_sdwa.sel, sizeof(new_sdwa->sel));
      memcpy(new_sdwa->neg, cmp_sdwa.neg, sizeof(new_sdwa->neg));
      new_sdwa->dst_sel = cmp_sdwa.dst_sel;
      new_sdwa->clamp = cmp_sdwa.clamp;
      new_sdwa->omod = cmp_sdwa.omod;
      new_instr = new_sdwa;
   } else if (cmp->isDPP16()) {
      /* clone with the DPP16 lane-swizzle control preserved */
      DPP16_instruction* new_dpp = create_instruction<DPP16_instruction>(
         new_opcode, (Format)((uint16_t)Format::DPP16 | (uint16_t)Format::VOPC), 2, 1);
      DPP16_instruction& cmp_dpp = cmp->dpp16();
      memcpy(new_dpp->abs, cmp_dpp.abs, sizeof(new_dpp->abs));
      memcpy(new_dpp->neg, cmp_dpp.neg, sizeof(new_dpp->neg));
      new_dpp->dpp_ctrl = cmp_dpp.dpp_ctrl;
      new_dpp->row_mask = cmp_dpp.row_mask;
      new_dpp->bank_mask = cmp_dpp.bank_mask;
      new_dpp->bound_ctrl = cmp_dpp.bound_ctrl;
      new_instr = new_dpp;
   } else if (cmp->isDPP8()) {
      /* clone with the DPP8 lane selection preserved */
      DPP8_instruction* new_dpp = create_instruction<DPP8_instruction>(
         new_opcode, (Format)((uint16_t)Format::DPP8 | (uint16_t)Format::VOPC), 2, 1);
      DPP8_instruction& cmp_dpp = cmp->dpp8();
      memcpy(new_dpp->lane_sel, cmp_dpp.lane_sel, sizeof(new_dpp->lane_sel));
      new_instr = new_dpp;
   } else {
      new_instr = create_instruction<VOPC_instruction>(new_opcode, Format::VOPC, 2, 1);
   }
   new_instr->operands[0] = cmp->operands[0];
   new_instr->operands[1] = cmp->operands[1];
   new_instr->definitions[0] = instr->definitions[0];

   ctx.info[instr->definitions[0].tempId()].label = 0;
   ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);

   instr.reset(new_instr);

   return true;
}

/* Matches op1(op2(1, 2), 0) if swap = false
 *         op1(0, op2(1, 2)) if swap = true
 * On success, fills operands/neg/abs/opsel (permuted by shuffle_str), the
 * outer instruction's clamp/omod, and the modifiers sitting "inbetween" the
 * two instructions (rejected unless the caller supplies an out-pointer). */
bool
match_op3_for_vop3(opt_ctx& ctx, aco_opcode op1, aco_opcode op2, Instruction* op1_instr, bool swap,
                   const char* shuffle_str, Operand operands[3], bool neg[3], bool abs[3],
                   uint8_t* opsel, bool* op1_clamp, uint8_t* op1_omod, bool* inbetween_neg,
                   bool* inbetween_abs, bool* inbetween_opsel, bool* precise)
{
   /* checks */
   if (op1_instr->opcode != op1)
      return false;

   Instruction* op2_instr = follow_operand(ctx, op1_instr->operands[swap]);
   if (!op2_instr || op2_instr->opcode != op2)
      return false;
   if (fixed_to_exec(op2_instr->operands[0]) || fixed_to_exec(op2_instr->operands[1]))
      return false;

   VOP3_instruction* op1_vop3 = op1_instr->isVOP3() ? &op1_instr->vop3() : NULL;
   VOP3_instruction* op2_vop3 = op2_instr->isVOP3() ?
                                                      &op2_instr->vop3() : NULL;

   if (op1_instr->isSDWA() || op2_instr->isSDWA())
      return false;
   if (op1_instr->isDPP() || op2_instr->isDPP())
      return false;

   /* don't support inbetween clamp/omod */
   if (op2_vop3 && (op2_vop3->clamp || op2_vop3->omod))
      return false;

   /* get operands and modifiers and check inbetween modifiers */
   *op1_clamp = op1_vop3 ? op1_vop3->clamp : false;
   *op1_omod = op1_vop3 ? op1_vop3->omod : 0u;

   /* each inbetween modifier is either reported to the caller or, if the
    * caller passed no out-pointer, rejected */
   if (inbetween_neg)
      *inbetween_neg = op1_vop3 ? op1_vop3->neg[swap] : false;
   else if (op1_vop3 && op1_vop3->neg[swap])
      return false;

   if (inbetween_abs)
      *inbetween_abs = op1_vop3 ? op1_vop3->abs[swap] : false;
   else if (op1_vop3 && op1_vop3->abs[swap])
      return false;

   if (inbetween_opsel)
      *inbetween_opsel = op1_vop3 ? op1_vop3->opsel & (1 << (unsigned)swap) : false;
   else if (op1_vop3 && op1_vop3->opsel & (1 << (unsigned)swap))
      return false;

   *precise = op1_instr->definitions[0].isPrecise() || op2_instr->definitions[0].isPrecise();

   /* shuffle_str maps source slots to the permuted output positions,
    * e.g. "120" places op1's non-swap operand last */
   int shuffle[3];
   shuffle[shuffle_str[0] - '0'] = 0;
   shuffle[shuffle_str[1] - '0'] = 1;
   shuffle[shuffle_str[2] - '0'] = 2;

   operands[shuffle[0]] = op1_instr->operands[!swap];
   neg[shuffle[0]] = op1_vop3 ? op1_vop3->neg[!swap] : false;
   abs[shuffle[0]] = op1_vop3 ? op1_vop3->abs[!swap] : false;
   if (op1_vop3 && (op1_vop3->opsel & (1 << (unsigned)!swap)))
      *opsel |= 1 << shuffle[0];

   for (unsigned i = 0; i < 2; i++) {
      operands[shuffle[i + 1]] = op2_instr->operands[i];
      neg[shuffle[i + 1]] = op2_vop3 ? op2_vop3->neg[i] : false;
      abs[shuffle[i + 1]] = op2_vop3 ? op2_vop3->abs[i] : false;
      if (op2_vop3 && op2_vop3->opsel & (1 << i))
         *opsel |= 1 << shuffle[i + 1];
   }

   /* check operands */
   if (!check_vop3_operands(ctx, 3, operands))
      return false;

   return true;
}

/* Replaces \p instr with a freshly-built VOP3 instruction using the given
 * operands and modifiers; the original first definition is kept and its
 * label cleared. */
void
create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr<Instruction>& instr,
                    Operand operands[3], bool neg[3], bool abs[3], uint8_t opsel, bool clamp,
                    unsigned omod)
{
   VOP3_instruction* new_instr = create_instruction<VOP3_instruction>(opcode, Format::VOP3, 3, 1);
   memcpy(new_instr->abs, abs, sizeof(bool[3]));
   memcpy(new_instr->neg, neg, sizeof(bool[3]));
   new_instr->clamp = clamp;
   new_instr->omod = omod;
   new_instr->opsel = opsel;
   new_instr->operands[0] = operands[0];
   new_instr->operands[1] = operands[1];
   new_instr->operands[2] = operands[2];
   new_instr->definitions[0] = instr->definitions[0];
   ctx.info[instr->definitions[0].tempId()].label = 0;

   instr.reset(new_instr);
}

/* Tries to fuse instr with an op2 feeding either of its operands (the
 * \p ops bitmask selects which operand positions to try) into a single
 * three-operand VOP3 \p new_op, permuting operands per \p shuffle. */
bool
combine_three_valu_op(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode op2, aco_opcode new_op,
                      const char* shuffle, uint8_t ops)
{
   for (unsigned swap = 0; swap < 2; swap++) {
      if (!((1 << swap) & ops))
         continue;

      Operand operands[3];
      bool neg[3], abs[3], clamp, precise;
      uint8_t opsel = 0, omod = 0;
      if (match_op3_for_vop3(ctx, instr->opcode, op2, instr.get(), swap, shuffle, operands, neg,
                             abs, &opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) {
         /* the inner instruction loses its use by instr */
         ctx.uses[instr->operands[swap].tempId()]--;
         create_vop3_for_op3(ctx, new_op, instr, operands, neg, abs, opsel, clamp, omod);
         return true;
      }
   }
   return false;
}

/* creates v_lshl_add_u32, v_lshl_or_b32 or v_and_or_b32 */
bool
combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   bool is_or = instr->opcode == aco_opcode::v_or_b32;
   aco_opcode new_op_lshl = is_or ?
                                     aco_opcode::v_lshl_or_b32 : aco_opcode::v_lshl_add_u32;

   if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32, aco_opcode::v_and_or_b32,
                                      "120", 1 | 2))
      return true;
   if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32,
                                      "120", 1 | 2))
      return true;
   if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, new_op_lshl, "120", 1 | 2))
      return true;
   if (combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, new_op_lshl, "210", 1 | 2))
      return true;

   if (instr->isSDWA() || instr->isDPP())
      return false;

   /* v_or_b32(p_extract(a, 0, 8/16, 0), b) -> v_and_or_b32(a, 0xff/0xffff, b)
    * v_or_b32(p_insert(a, 0, 8/16), b) -> v_and_or_b32(a, 0xff/0xffff, b)
    * v_or_b32(p_insert(a, 24/16, 8/16), b) -> v_lshl_or_b32(a, 24/16, b)
    * v_add_u32(p_insert(a, 24/16, 8/16), b) -> v_lshl_add_b32(a, 24/16, b)
    */
   for (unsigned i = 0; i < 2; i++) {
      Instruction* extins = follow_operand(ctx, instr->operands[i]);
      if (!extins)
         continue;

      aco_opcode op;
      Operand operands[3];

      /* an insert into the topmost bits behaves like a left shift */
      if (extins->opcode == aco_opcode::p_insert &&
          (extins->operands[1].constantValue() + 1) * extins->operands[2].constantValue() == 32) {
         op = new_op_lshl;
         operands[1] =
            Operand::c32(extins->operands[1].constantValue() * extins->operands[2].constantValue());
      } else if (is_or &&
                 (extins->opcode == aco_opcode::p_insert ||
                  (extins->opcode == aco_opcode::p_extract &&
                   extins->operands[3].constantEquals(0))) &&
                 extins->operands[1].constantEquals(0)) {
         /* an extract/insert of the low 8/16 bits behaves like a mask */
         op = aco_opcode::v_and_or_b32;
         operands[1] = Operand::c32(extins->operands[2].constantEquals(8) ? 0xffu : 0xffffu);
      } else {
         continue;
      }

      operands[0] = extins->operands[0];
      operands[2] = instr->operands[!i];

      if (!check_vop3_operands(ctx, 3, operands))
         continue;

      bool neg[3] = {}, abs[3] = {};
      uint8_t opsel = 0, omod = 0;
      bool clamp = false;
      if (instr->isVOP3())
         clamp = instr->vop3().clamp;

      ctx.uses[instr->operands[i].tempId()]--;
      create_vop3_for_op3(ctx, op, instr, operands, neg, abs, opsel, clamp, omod);
      return true;
   }

   return false;
}

/* Fuses min/max chains into three-operand min3/max3, including the
 * negated-opposite form. */
bool
combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, aco_opcode minmax3)
{
   /* TODO: this can handle SDWA min/max instructions by using opsel */
   if (combine_three_valu_op(ctx, instr, instr->opcode, minmax3, "012", 1 | 2))
      return true;

   /* min(-max(a, b), c) -> min3(c, -a, -b) *
    * max(-min(a, b), c) -> max3(c, -a, -b) */
   for (unsigned swap = 0; swap < 2; swap++) {
      Operand operands[3];
      bool neg[3], abs[3], clamp, precise;
      uint8_t opsel = 0, omod = 0;
      bool inbetween_neg;
      if (match_op3_for_vop3(ctx, instr->opcode, opposite, instr.get(), swap, "012", operands, neg,
                             abs, &opsel, &clamp, &omod, &inbetween_neg, NULL, NULL, &precise) &&
          inbetween_neg) {
         ctx.uses[instr->operands[swap].tempId()]--;
         /* push the inbetween negate onto the inner instruction's operands */
         neg[1] = !neg[1];
         neg[2] = !neg[2];
         create_vop3_for_op3(ctx, minmax3, instr, operands, neg, abs, opsel, clamp, omod);
         return true;
      }
   }
   return false;
}

/* s_not_b32(s_and_b32(a, b)) -> s_nand_b32(a, b)
 * s_not_b32(s_or_b32(a, b)) -> s_nor_b32(a, b)
 * s_not_b32(s_xor_b32(a, b)) -> s_xnor_b32(a, b)
 * s_not_b64(s_and_b64(a, b)) -> s_nand_b64(a, b)
 * s_not_b64(s_or_b64(a, b)) -> s_nor_b64(a, b)
 * s_not_b64(s_xor_b64(a, b)) -> s_xnor_b64(a, b) */
bool
combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   /* checks */
   if (!instr->operands[0].isTemp())
      return false;
   /* the SCC definition must be unused */
   if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
      return false;

   Instruction* op2_instr = follow_operand(ctx, instr->operands[0]);
   if (!op2_instr)
      return false;
   switch (op2_instr->opcode) {
   case aco_opcode::s_and_b32:
   case aco_opcode::s_or_b32:
   case aco_opcode::s_xor_b32:
   case aco_opcode::s_and_b64:
   case aco_opcode::s_or_b64:
   case aco_opcode::s_xor_b64: break;
   default: return false;
   }

   /* create instruction */
   /* reuse the inner instruction in place, taking over s_not's definitions */
   std::swap(instr->definitions[0], op2_instr->definitions[0]);
   std::swap(instr->definitions[1], op2_instr->definitions[1]);
   ctx.uses[instr->operands[0].tempId()]--;
   ctx.info[op2_instr->definitions[0].tempId()].label = 0;

   switch (op2_instr->opcode) {
   case aco_opcode::s_and_b32: op2_instr->opcode = aco_opcode::s_nand_b32; break;
   case aco_opcode::s_or_b32: op2_instr->opcode = aco_opcode::s_nor_b32; break;
   case aco_opcode::s_xor_b32: op2_instr->opcode = aco_opcode::s_xnor_b32; break;
   case aco_opcode::s_and_b64: op2_instr->opcode = aco_opcode::s_nand_b64; break;
   case aco_opcode::s_or_b64: op2_instr->opcode = aco_opcode::s_nor_b64; break;
   case aco_opcode::s_xor_b64: op2_instr->opcode = aco_opcode::s_xnor_b64; break;
   default: break;
   }

   return true;
}

/* s_and_b32(a, s_not_b32(b)) -> s_andn2_b32(a, b)
 * s_or_b32(a, s_not_b32(b)) -> s_orn2_b32(a, b)
 * s_and_b64(a, s_not_b64(b)) -> s_andn2_b64(a, b)
 * s_or_b64(a, s_not_b64(b)) -> s_orn2_b64(a, b) */
bool
combine_salu_n2(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   if (instr->definitions[0].isTemp() && ctx.info[instr->definitions[0].tempId()].is_uniform_bool())
      return false;

   for (unsigned i = 0; i < 2; i++) {
      Instruction* op2_instr = follow_operand(ctx, instr->operands[i]);
      if (!op2_instr || (op2_instr->opcode != aco_opcode::s_not_b32 &&
                         op2_instr->opcode != aco_opcode::s_not_b64))
         continue;
      if (ctx.uses[op2_instr->definitions[1].tempId()] || fixed_to_exec(op2_instr->operands[0]))
         continue;

      /* NOTE(review): presumably because only one literal can be encoded,
       * two *different* literals cannot be combined — TODO confirm */
      if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
          instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
         continue;

      ctx.uses[instr->operands[i].tempId()]--;
      instr->operands[0] = instr->operands[!i];
      instr->operands[1] = op2_instr->operands[0];
      ctx.info[instr->definitions[0].tempId()].label = 0;

      switch (instr->opcode) {
      case aco_opcode::s_and_b32: instr->opcode = aco_opcode::s_andn2_b32; break;
      case aco_opcode::s_or_b32: instr->opcode = aco_opcode::s_orn2_b32; break;
      case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_andn2_b64; break;
      case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_orn2_b64; break;
      default: break;
      }

      return true;
   }
   return false;
}

/* s_add_{i32,u32}(a, s_lshl_b32(b, <n>)) -> s_lshl<n>_add_u32(a, b) */
bool
combine_salu_lshl_add(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   /* for s_add_i32 the SCC result differs, so it must be unused */
   if (instr->opcode == aco_opcode::s_add_i32 && ctx.uses[instr->definitions[1].tempId()])
      return false;

   for (unsigned i = 0; i < 2; i++) {
      Instruction* op2_instr = follow_operand(ctx, instr->operands[i], true);
      if (!op2_instr || op2_instr->opcode != aco_opcode::s_lshl_b32 ||
          ctx.uses[op2_instr->definitions[1].tempId()])
         continue;
      if (!op2_instr->operands[1].isConstant() || fixed_to_exec(op2_instr->operands[0]))
         continue;

      /* only shifts of 1..4 have a fused s_lshl<n>_add_u32 opcode */
      uint32_t shift = op2_instr->operands[1].constantValue();
      if (shift < 1 || shift > 4)
         continue;

      /* NOTE(review): presumably because only one literal can be encoded,
       * two *different* literals cannot be combined — TODO confirm */
      if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
          instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
         continue;

      ctx.uses[instr->operands[i].tempId()]--;
      /* Operand 0 is the value to shift, operand 1 the addend. */
      instr->operands[1] = instr->operands[!i];
      instr->operands[0] = op2_instr->operands[0];
      ctx.info[instr->definitions[0].tempId()].label = 0;

      /* Select s_lshl<shift>_add_u32 via a small table indexed by shift-1. */
      instr->opcode = std::array<aco_opcode, 4>{
         aco_opcode::s_lshl1_add_u32, aco_opcode::s_lshl2_add_u32, aco_opcode::s_lshl3_add_u32,
         aco_opcode::s_lshl4_add_u32}[shift - 1];

      return true;
   }
   return false;
}

/* Fold a b2i (bool -> 0/1 int) operand of an add/sub into a carry-consuming
 * opcode (new_op), turning add(a, b2i(c)) into e.g. v_addc(0, a, c).
 * "ops" is a bitmask selecting which operand indices may be matched. */
bool
combine_add_sub_b2i(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode new_op, uint8_t ops)
{
   if (instr->usesModifiers())
      return false;

   for (unsigned i = 0; i < 2; i++) {
      if (!((1 << i) & ops))
         continue;
      if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2i() &&
          ctx.uses[instr->operands[i].tempId()] == 1) {

         aco_ptr<Instruction> new_instr;
         if (instr->operands[!i].isTemp() &&
             instr->operands[!i].getTemp().type() == RegType::vgpr) {
            new_instr.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 2));
         } else if (ctx.program->gfx_level >= GFX10 ||
                    (instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
            /* SGPR/constant operands need the VOP3 encoding pre-GFX10. */
            new_instr.reset(
               create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOP2), 3, 2));
         } else {
            return false;
         }
         ctx.uses[instr->operands[i].tempId()]--;
         new_instr->definitions[0] = instr->definitions[0];
         if (instr->definitions.size() == 2) {
            new_instr->definitions[1] = instr->definitions[1];
         } else {
            /* The carry-out definition doesn't exist on the original add;
             * allocate a fresh (dead) lane-mask temp for it. */
            new_instr->definitions[1] =
               Definition(ctx.program->allocateTmp(ctx.program->lane_mask));
            /* Make sure the uses vector is large enough and the number of
             * uses properly initialized to 0.
             */
            ctx.uses.push_back(0);
         }
         /* v_addc/v_subb form: 0 + other_operand + carry-in(bool). */
         new_instr->operands[0] = Operand::zero();
         new_instr->operands[1] = instr->operands[!i];
         new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
         instr = std::move(new_instr);
         ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
         return true;
      }
   }

   return false;
}

/* Fold v_add(a, v_bcnt(b, 0)) into v_bcnt(b, a): the bcnt's second operand
 * is an accumulator that is added to the popcount result. */
bool
combine_add_bcnt(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   if (instr->usesModifiers())
      return false;

   for (unsigned i = 0; i < 2; i++) {
      Instruction* op_instr = follow_operand(ctx, instr->operands[i]);
      if (op_instr && op_instr->opcode == aco_opcode::v_bcnt_u32_b32 &&
          !op_instr->usesModifiers() && op_instr->operands[0].isTemp() &&
          op_instr->operands[0].getTemp().type() == RegType::vgpr &&
          op_instr->operands[1].constantEquals(0)) {
         aco_ptr<Instruction> new_instr{
            create_instruction<VOP3_instruction>(aco_opcode::v_bcnt_u32_b32, Format::VOP3, 2, 1)};
         ctx.uses[instr->operands[i].tempId()]--;
         new_instr->operands[0] = op_instr->operands[0];
         new_instr->operands[1] = instr->operands[!i];
         new_instr->definitions[0] = instr->definitions[0];
         instr = std::move(new_instr);
         ctx.info[instr->definitions[0].tempId()].label = 0;

         return true;
      }
   }

   return false;
}

/* For a min/max opcode, look up the whole family: the paired min/max, the
 * 3-operand min3/max3/med3 variants, and whether the 3-operand forms are
 * GFX9+-only. Returns false for non-min/max opcodes. */
bool
get_minmax_info(aco_opcode op, aco_opcode* min, aco_opcode* max, aco_opcode* min3, aco_opcode* max3,
                aco_opcode* med3, bool* some_gfx9_only)
{
   switch (op) {
#define MINMAX(type, gfx9) \
   case aco_opcode::v_min_##type: \
   case aco_opcode::v_max_##type: \
      *min = aco_opcode::v_min_##type; \
      *max = aco_opcode::v_max_##type; \
      *med3 = aco_opcode::v_med3_##type; \
      *min3 = aco_opcode::v_min3_##type; \
      *max3 = aco_opcode::v_max3_##type; \
      *some_gfx9_only = gfx9; \
      return true;
#define MINMAX_E64(type, gfx9) \
   case
 aco_opcode::v_min_##type##_e64: \
   case aco_opcode::v_max_##type##_e64: \
      *min = aco_opcode::v_min_##type##_e64; \
      *max = aco_opcode::v_max_##type##_e64; \
      *med3 = aco_opcode::v_med3_##type; \
      *min3 = aco_opcode::v_min3_##type; \
      *max3 = aco_opcode::v_max3_##type; \
      *some_gfx9_only = gfx9; \
      return true;
      MINMAX(f32, false)
      MINMAX(u32, false)
      MINMAX(i32, false)
      MINMAX(f16, true)
      MINMAX(u16, true)
      MINMAX(i16, true)
      MINMAX_E64(u16, true)
      MINMAX_E64(i16, true)
#undef MINMAX_E64
#undef MINMAX
   default: return false;
   }
}

/* when ub > lb:
 * v_min_{f,u,i}{16,32}(v_max_{f,u,i}{16,32}(a, lb), ub) -> v_med3_{f,u,i}{16,32}(a, lb, ub)
 * v_max_{f,u,i}{16,32}(v_min_{f,u,i}{16,32}(a, ub), lb) -> v_med3_{f,u,i}{16,32}(a, lb, ub)
 */
bool
combine_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode min, aco_opcode max,
              aco_opcode med)
{
   /* TODO: GLSL's clamp(x, minVal, maxVal) and SPIR-V's
    * FClamp(x, minVal, maxVal)/NClamp(x, minVal, maxVal) are undefined if
    * minVal > maxVal, which means we can always select it to a v_med3_f32 */
   aco_opcode other_op;
   if (instr->opcode == min)
      other_op = max;
   else if (instr->opcode == max)
      other_op = min;
   else
      return false;

   for (unsigned swap = 0; swap < 2; swap++) {
      Operand operands[3];
      bool neg[3], abs[3], clamp, precise;
      uint8_t opsel = 0, omod = 0;
      if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap, "012", operands, neg,
                             abs, &opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) {
         /* max(min(src, upper), lower) returns upper if src is NaN, but
          * med3(src, lower, upper) returns lower.
          */
         if (precise && instr->opcode != min &&
             (min == aco_opcode::v_min_f16 || min == aco_opcode::v_min_f32))
            continue;

         /* Find the two constant operands (the clamp bounds); the third
          * operand is the clamped value. */
         int const0_idx = -1, const1_idx = -1;
         uint32_t const0 = 0, const1 = 0;
         for (int i = 0; i < 3; i++) {
            uint32_t val;
            bool hi16 = opsel & (1 << i);
            if (operands[i].isConstant()) {
               val = hi16 ? operands[i].constantValue16(true) : operands[i].constantValue();
            } else if (operands[i].isTemp() &&
                       ctx.info[operands[i].tempId()].is_constant_or_literal(32)) {
               val = ctx.info[operands[i].tempId()].val >> (hi16 ? 16 : 0);
            } else {
               continue;
            }
            if (const0_idx >= 0) {
               const1_idx = i;
               const1 = val;
            } else {
               const0_idx = i;
               const0 = val;
            }
         }
         if (const0_idx < 0 || const1_idx < 0)
            continue;

         /* Determine which constant is the lower bound, interpreting the raw
          * bits according to the min/max type (and applying neg/abs for
          * floats). */
         int lower_idx = const0_idx;
         switch (min) {
         case aco_opcode::v_min_f32:
         case aco_opcode::v_min_f16: {
            float const0_f, const1_f;
            if (min == aco_opcode::v_min_f32) {
               memcpy(&const0_f, &const0, 4);
               memcpy(&const1_f, &const1, 4);
            } else {
               const0_f = _mesa_half_to_float(const0);
               const1_f = _mesa_half_to_float(const1);
            }
            if (abs[const0_idx])
               const0_f = fabsf(const0_f);
            if (abs[const1_idx])
               const1_f = fabsf(const1_f);
            if (neg[const0_idx])
               const0_f = -const0_f;
            if (neg[const1_idx])
               const1_f = -const1_f;
            lower_idx = const0_f < const1_f ? const0_idx : const1_idx;
            break;
         }
         case aco_opcode::v_min_u32: {
            lower_idx = const0 < const1 ? const0_idx : const1_idx;
            break;
         }
         case aco_opcode::v_min_u16:
         case aco_opcode::v_min_u16_e64: {
            lower_idx = (uint16_t)const0 < (uint16_t)const1 ? const0_idx : const1_idx;
            break;
         }
         case aco_opcode::v_min_i32: {
            /* Reconstruct the signed value from raw bits without relying on
             * implementation-defined narrowing. */
            int32_t const0_i =
               const0 & 0x80000000u ? -2147483648 + (int32_t)(const0 & 0x7fffffffu) : const0;
            int32_t const1_i =
               const1 & 0x80000000u ? -2147483648 + (int32_t)(const1 & 0x7fffffffu) : const1;
            lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
            break;
         }
         case aco_opcode::v_min_i16:
         case aco_opcode::v_min_i16_e64: {
            int16_t const0_i = const0 & 0x8000u ? -32768 + (int16_t)(const0 & 0x7fffu) : const0;
            int16_t const1_i = const1 & 0x8000u ? -32768 + (int16_t)(const1 & 0x7fffu) : const1;
            lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
            break;
         }
         default: break;
         }
         int upper_idx = lower_idx == const0_idx ? const1_idx : const0_idx;

         /* min(max(a, lb), ub) requires the ub to come from the outer min
          * (operand 0 after matching); the mirrored form for max. */
         if (instr->opcode == min) {
            if (upper_idx != 0 || lower_idx == 0)
               return false;
         } else {
            if (upper_idx == 0 || lower_idx != 0)
               return false;
         }

         ctx.uses[instr->operands[swap].tempId()]--;
         create_vop3_for_op3(ctx, med, instr, operands, neg, abs, opsel, clamp, omod);

         return true;
      }
   }

   return false;
}

/* Replace VGPR copies of SGPRs (and SGPR-sourced extracts) in VALU operands
 * with direct SGPR reads, within the encoding's SGPR-read limits. */
void
apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   /* 64-bit shifts can read at most one SGPR even on GFX10+. */
   bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
                     instr->opcode == aco_opcode::v_lshrrev_b64 ||
                     instr->opcode == aco_opcode::v_ashrrev_i64;

   /* find candidates and create the set of sgprs already read */
   unsigned sgpr_ids[2] = {0, 0};
   uint32_t operand_mask = 0;
   bool has_literal = false;
   for (unsigned i = 0; i < instr->operands.size(); i++) {
      if (instr->operands[i].isLiteral())
         has_literal = true;
      if (!instr->operands[i].isTemp())
         continue;
      if (instr->operands[i].getTemp().type() == RegType::sgpr) {
         if (instr->operands[i].tempId() != sgpr_ids[0])
            sgpr_ids[!!sgpr_ids[0]] = instr->operands[i].tempId();
      }
      ssa_info& info = ctx.info[instr->operands[i].tempId()];
      if (is_copy_label(ctx, instr, info) && info.temp.type() == RegType::sgpr)
         operand_mask |= 1u << i;
      if (info.is_extract() && info.instr->operands[0].getTemp().type() == RegType::sgpr)
         operand_mask |= 1u << i;
   }
   /* A literal counts against the constant-bus/SGPR budget. */
   unsigned max_sgprs = 1;
   if (ctx.program->gfx_level >= GFX10 && !is_shift64)
      max_sgprs = 2;
   if (has_literal)
      max_sgprs--;

   unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];

   /* keep on applying sgprs until there is nothing left to be done */
   while (operand_mask) {
      uint32_t sgpr_idx = 0;
      uint32_t sgpr_info_id = 0;
      uint32_t mask = operand_mask;
      /* choose a sgpr */
      while (mask) {
         unsigned i = u_bit_scan(&mask);
         uint16_t uses = ctx.uses[instr->operands[i].tempId()];
         /* prefer the candidate with the fewest remaining uses */
         if (sgpr_info_id == 0 || uses < ctx.uses[sgpr_info_id]) {
            sgpr_idx = i;
            sgpr_info_id = instr->operands[i].tempId();
         }
      }
      operand_mask &= ~(1u << sgpr_idx);

      ssa_info& info = ctx.info[sgpr_info_id];

      /* Applying two sgprs require making it VOP3, so don't do it unless it's
       * definitively beneficial.
       * TODO: this is too conservative because later the use count could be reduced to 1 */
      if (!info.is_extract() && num_sgprs && ctx.uses[sgpr_info_id] > 1 && !instr->isVOP3() &&
          !instr->isSDWA() && instr->format != Format::VOP3P)
         break;

      Temp sgpr = info.is_extract() ? info.instr->operands[0].getTemp() : info.temp;
      bool new_sgpr = sgpr.id() != sgpr_ids[0] && sgpr.id() != sgpr_ids[1];
      if (new_sgpr && num_sgprs >= max_sgprs)
         continue;

      if (sgpr_idx == 0)
         instr->format = withoutDPP(instr->format);

      if (sgpr_idx == 0 || instr->isVOP3() || instr->isSDWA() || instr->isVOP3P() ||
          info.is_extract()) {
         /* can_apply_extract() checks SGPR encoding restrictions */
         if (info.is_extract() && can_apply_extract(ctx, instr, sgpr_idx, info))
            apply_extract(ctx, instr, sgpr_idx, info);
         else if (info.is_extract())
            continue;
         instr->operands[sgpr_idx] = Operand(sgpr);
      } else if (can_swap_operands(instr, &instr->opcode)) {
         /* only src0 may be an SGPR in VOP2: move it there by swapping */
         instr->operands[sgpr_idx] = instr->operands[0];
         instr->operands[0] = Operand(sgpr);
         /* swap bits using a 4-entry LUT */
         uint32_t swapped = (0x3120 >> (operand_mask & 0x3)) & 0xf;
         operand_mask = (operand_mask & ~0x3) | swapped;
      } else if (can_use_VOP3(ctx, instr) && !info.is_extract()) {
         to_VOP3(ctx, instr);
         instr->operands[sgpr_idx] = Operand(sgpr);
      } else {
         continue;
      }

      if (new_sgpr)
         sgpr_ids[num_sgprs++] = sgpr.id();
      ctx.uses[sgpr_info_id]--;
      ctx.uses[sgpr.id()]++;

      /* TODO: handle when it's a VGPR */
      /* The applied SGPR may itself be a copy/extract of another SGPR:
       * re-arm the operand for another round. */
      if ((ctx.info[sgpr.id()].label & (label_extract | label_temp)) &&
          ctx.info[sgpr.id()].temp.type() == RegType::sgpr)
         operand_mask |= 1u << sgpr_idx;
   }
}

/* Set omod/clamp fields on a VOP3- or SDWA-shaped instruction according to
 * the consumer recorded in def_info. Returns false if modifiers conflict. */
template <typename T>
bool
apply_omod_clamp_helper(opt_ctx& ctx, T* instr, ssa_info& def_info)
{
   /* clamp can be applied on top of existing modifiers, omod cannot */
   if (!def_info.is_clamp() && (instr->clamp || instr->omod))
      return false;

   if (def_info.is_omod2())
      instr->omod = 1;
   else if (def_info.is_omod4())
      instr->omod = 2;
   else if (def_info.is_omod5())
      instr->omod = 3;
   else if (def_info.is_clamp())
      instr->clamp = true;

   return true;
}

/* apply omod / clamp modifiers if
 the def is used only once and the instruction can have modifiers */
bool
apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1 ||
       !instr_info.can_use_output_modifiers[(int)instr->opcode])
      return false;

   bool can_vop3 = can_use_VOP3(ctx, instr);
   bool is_mad_mix =
      instr->opcode == aco_opcode::v_fma_mix_f32 || instr->opcode == aco_opcode::v_fma_mixlo_f16;
   if (!instr->isSDWA() && !is_mad_mix && !can_vop3)
      return false;

   /* omod flushes -0 to +0 and has no effect if denormals are enabled. SDWA omod is GFX9+. */
   bool can_use_omod = (can_vop3 || ctx.program->gfx_level >= GFX9) && !instr->isVOP3P();
   if (instr->definitions[0].bytes() == 4)
      can_use_omod =
         can_use_omod && ctx.fp_mode.denorm32 == 0 && !ctx.fp_mode.preserve_signed_zero_inf_nan32;
   else
      can_use_omod = can_use_omod && ctx.fp_mode.denorm16_64 == 0 &&
                     !ctx.fp_mode.preserve_signed_zero_inf_nan16_64;

   ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];

   uint64_t omod_labels = label_omod2 | label_omod4 | label_omod5;
   if (!def_info.is_clamp() && !(can_use_omod && (def_info.label & omod_labels)))
      return false;
   /* if the omod/clamp instruction is dead, then the single user of this
    * instruction is a different instruction */
   if (!ctx.uses[def_info.instr->definitions[0].tempId()])
      return false;

   if (def_info.instr->definitions[0].bytes() != instr->definitions[0].bytes())
      return false;

   /* MADs/FMAs are created later, so we don't have to update the original add */
   assert(!ctx.info[instr->definitions[0].tempId()].is_mad());

   if (instr->isSDWA()) {
      if (!apply_omod_clamp_helper(ctx, &instr->sdwa(), def_info))
         return false;
   } else if (instr->isVOP3P()) {
      assert(def_info.is_clamp());
      instr->vop3p().clamp = true;
   } else {
      to_VOP3(ctx, instr);
      if (!apply_omod_clamp_helper(ctx, &instr->vop3(), def_info))
         return false;
   }

   /* This instruction now produces the final (modified) value; take over the
    * consumer's definition and retire the consumer. */
   instr->definitions[0].swapTemp(def_info.instr->definitions[0]);
   ctx.info[instr->definitions[0].tempId()].label &= label_clamp | label_insert | label_f2f16;
   ctx.uses[def_info.instr->definitions[0].tempId()]--;

   return true;
}

/* Combine an p_insert (or p_extract, in some cases) instruction with instr.
 * p_insert(instr(...)) -> instr_insert().
 */
bool
apply_insert(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1)
      return false;

   ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
   if (!def_info.is_insert())
      return false;
   /* if the insert instruction is dead, then the single user of this
    * instruction is a different instruction */
   if (!ctx.uses[def_info.instr->definitions[0].tempId()])
      return false;

   /* MADs/FMAs are created later, so we don't have to update the original add */
   assert(!ctx.info[instr->definitions[0].tempId()].is_mad());

   SubdwordSel sel = parse_insert(def_info.instr);
   assert(sel);

   if (!can_use_SDWA(ctx.program->gfx_level, instr, true))
      return false;

   to_SDWA(ctx, instr);
   /* a pre-existing partial dst_sel means the insert can't be represented */
   if (instr->sdwa().dst_sel.size() != 4)
      return false;
   static_cast<SDWA_instruction*>(instr.get())->dst_sel = sel;

   instr->definitions[0].swapTemp(def_info.instr->definitions[0]);
   ctx.info[instr->definitions[0].tempId()].label = 0;
   ctx.uses[def_info.instr->definitions[0].tempId()]--;

   return true;
}

/* Remove superfluous extract after ds_read like so:
 * p_extract(ds_read_uN(), 0, N, 0) -> ds_read_uN()
 */
bool
apply_ds_extract(opt_ctx& ctx, aco_ptr<Instruction>& extract)
{
   /* Check if p_extract has a usedef operand and is the only user.
    */
   if (!ctx.info[extract->operands[0].tempId()].is_usedef() ||
       ctx.uses[extract->operands[0].tempId()] > 1)
      return false;

   /* Check if the usedef is a DS instruction. */
   Instruction* ds = ctx.info[extract->operands[0].tempId()].instr;
   if (ds->format != Format::DS)
      return false;

   unsigned extract_idx = extract->operands[1].constantValue();
   unsigned bits_extracted = extract->operands[2].constantValue();
   unsigned sign_ext = extract->operands[3].constantValue();
   unsigned dst_bitsize = extract->definitions[0].bytes() * 8u;

   /* TODO: These are doable, but probably don't occour too often. */
   if (extract_idx || sign_ext || dst_bitsize != 32)
      return false;

   unsigned bits_loaded = 0;
   if (ds->opcode == aco_opcode::ds_read_u8 || ds->opcode == aco_opcode::ds_read_u8_d16)
      bits_loaded = 8;
   else if (ds->opcode == aco_opcode::ds_read_u16 || ds->opcode == aco_opcode::ds_read_u16_d16)
      bits_loaded = 16;
   else
      return false;

   /* Shrink the DS load if the extracted bit size is smaller. */
   bits_loaded = MIN2(bits_loaded, bits_extracted);

   /* Change the DS opcode so it writes the full register. */
   if (bits_loaded == 8)
      ds->opcode = aco_opcode::ds_read_u8;
   else if (bits_loaded == 16)
      ds->opcode = aco_opcode::ds_read_u16;
   else
      unreachable("Forgot to add DS opcode above.");

   /* The DS now produces the exact same thing as the extract, remove the extract.
    */
   std::swap(ds->definitions[0], extract->definitions[0]);
   ctx.uses[extract->definitions[0].tempId()] = 0;
   ctx.info[ds->definitions[0].tempId()].label = 0;
   return true;
}

/* v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc) */
bool
combine_and_subbrev(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   if (instr->usesModifiers())
      return false;

   for (unsigned i = 0; i < 2; i++) {
      /* v_subbrev_co(0, 0, cond) materializes 0 or ~0 from a boolean, so the
       * AND with it is just a select between 0 and the other operand. */
      Instruction* op_instr = follow_operand(ctx, instr->operands[i], true);
      if (op_instr && op_instr->opcode == aco_opcode::v_subbrev_co_u32 &&
          op_instr->operands[0].constantEquals(0) && op_instr->operands[1].constantEquals(0) &&
          !op_instr->usesModifiers()) {

         aco_ptr<Instruction> new_instr;
         if (instr->operands[!i].isTemp() &&
             instr->operands[!i].getTemp().type() == RegType::vgpr) {
            new_instr.reset(
               create_instruction<VOP2_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1));
         } else if (ctx.program->gfx_level >= GFX10 ||
                    (instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
            /* SGPR/constant operands need the VOP3 encoding pre-GFX10. */
            new_instr.reset(create_instruction<VOP3_instruction>(aco_opcode::v_cndmask_b32,
                                                                 asVOP3(Format::VOP2), 3, 1));
         } else {
            return false;
         }

         ctx.uses[instr->operands[i].tempId()]--;
         /* if the subbrev result stays alive, its condition gains a user here */
         if (ctx.uses[instr->operands[i].tempId()])
            ctx.uses[op_instr->operands[2].tempId()]++;

         new_instr->operands[0] = Operand::zero();
         new_instr->operands[1] = instr->operands[!i];
         new_instr->operands[2] = Operand(op_instr->operands[2]);
         new_instr->definitions[0] = instr->definitions[0];
         instr = std::move(new_instr);
         ctx.info[instr->definitions[0].tempId()].label = 0;
         return true;
      }
   }

   return false;
}

/* v_add_co(c, s_lshl(a, b)) -> v_mad_u32_u24(a, 1<<b, c)
 * v_add_co(c, v_lshlrev(a, b)) -> v_mad_u32_u24(b, 1<<a, c)
 * v_sub(c, s_lshl(a, b)) -> v_mad_i32_i24(a, -(1<<b), c)
 *
 v_sub(c, v_lshlrev(a, b)) -> v_mad_i32_i24(b, -(1<<a), c)
 */
bool
combine_add_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr, bool is_sub)
{
   if (instr->usesModifiers())
      return false;

   /* Substractions: start at operand 1 to avoid mixup such as
    * turning v_sub(v_lshlrev(a, b), c) into v_mad_i32_i24(b, -(1<<a), c)
    */
   unsigned start_op_idx = is_sub ? 1 : 0;

   /* Don't allow 24-bit operands on subtraction because
    * v_mad_i32_i24 applies a sign extension.
    */
   bool allow_24bit = !is_sub;

   for (unsigned i = start_op_idx; i < 2; i++) {
      Instruction* op_instr = follow_operand(ctx, instr->operands[i]);
      if (!op_instr)
         continue;

      if (op_instr->opcode != aco_opcode::s_lshl_b32 &&
          op_instr->opcode != aco_opcode::v_lshlrev_b32)
         continue;

      /* s_lshl takes the shift amount as src1, v_lshlrev as src0 */
      int shift_op_idx = op_instr->opcode == aco_opcode::s_lshl_b32 ? 1 : 0;

      if (op_instr->operands[shift_op_idx].isConstant() &&
          ((allow_24bit && op_instr->operands[!shift_op_idx].is24bit()) ||
           op_instr->operands[!shift_op_idx].is16bit())) {
         uint32_t multiplier = 1 << (op_instr->operands[shift_op_idx].constantValue() % 32u);
         if (is_sub)
            multiplier = -multiplier;
         /* the multiplier must fit in the mad's 24-bit (sign-extended for
          * i24) source encoding */
         if (is_sub ? (multiplier < 0xff800000) : (multiplier > 0xffffff))
            continue;

         Operand ops[3] = {
            op_instr->operands[!shift_op_idx],
            Operand::c32(multiplier),
            instr->operands[!i],
         };
         if (!check_vop3_operands(ctx, 3, ops))
            return false;

         ctx.uses[instr->operands[i].tempId()]--;

         aco_opcode mad_op = is_sub ? aco_opcode::v_mad_i32_i24 : aco_opcode::v_mad_u32_u24;
         aco_ptr<VOP3_instruction> new_instr{
            create_instruction<VOP3_instruction>(mad_op, Format::VOP3, 3, 1)};
         for (unsigned op_idx = 0; op_idx < 3; ++op_idx)
            new_instr->operands[op_idx] = ops[op_idx];
         new_instr->definitions[0] = instr->definitions[0];
         instr = std::move(new_instr);
         ctx.info[instr->definitions[0].tempId()].label = 0;
         return true;
      }
   }

   return false;
}

void
propagate_swizzles(VOP3P_instruction* instr, uint8_t opsel_lo, uint8_t opsel_hi)
{
   /* propagate swizzles which apply to a result down to the instruction's operands:
    * result = a.xy + b.xx -> result.yx = a.yx + b.xx */
   assert((opsel_lo & 1) == opsel_lo);
   assert((opsel_hi & 1) == opsel_hi);
   uint8_t tmp_lo = instr->opsel_lo;
   uint8_t tmp_hi = instr->opsel_hi;
   bool neg_lo[3] = {instr->neg_lo[0], instr->neg_lo[1], instr->neg_lo[2]};
   bool neg_hi[3] = {instr->neg_hi[0], instr->neg_hi[1], instr->neg_hi[2]};
   /* the consumer reads the hi half into its lo lane: take hi selections/negs */
   if (opsel_lo == 1) {
      instr->opsel_lo = tmp_hi;
      for (unsigned i = 0; i < 3; i++)
         instr->neg_lo[i] = neg_hi[i];
   }
   /* the consumer reads the lo half into its hi lane: take lo selections/negs */
   if (opsel_hi == 0) {
      instr->opsel_hi = tmp_lo;
      for (unsigned i = 0; i < 3; i++)
         instr->neg_hi[i] = neg_lo[i];
   }
}

/* VOP3P-specific combines: clamp propagation, fneg folding, and packed
 * mul+add -> v_pk_fma_f16 / v_pk_mad_u16. */
void
combine_vop3p(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   VOP3P_instruction* vop3p = &instr->vop3p();

   /* apply clamp */
   /* v_pk_mul_f16(x, 1.0) with clamp is a pure clamp: push the clamp into the
    * producer if it supports output modifiers. */
   if (instr->opcode == aco_opcode::v_pk_mul_f16 && instr->operands[1].constantEquals(0x3C00) &&
       vop3p->clamp && instr->operands[0].isTemp() && ctx.uses[instr->operands[0].tempId()] == 1 &&
       !((vop3p->opsel_lo | vop3p->opsel_hi) & 2)) {

      ssa_info& info = ctx.info[instr->operands[0].tempId()];
      if (info.is_vop3p() && instr_info.can_use_output_modifiers[(int)info.instr->opcode]) {
         VOP3P_instruction* candidate = &ctx.info[instr->operands[0].tempId()].instr->vop3p();
         candidate->clamp = true;
         propagate_swizzles(candidate, vop3p->opsel_lo, vop3p->opsel_hi);
         instr->definitions[0].swapTemp(candidate->definitions[0]);
         ctx.info[candidate->definitions[0].tempId()].instr = candidate;
         ctx.uses[instr->definitions[0].tempId()]--;
         return;
      }
   }

   /* check for fneg modifiers */
   /* v_pk_mul_f16(x, 1.0) acts as a packed fneg/swizzle; fold it into this
    * instruction's input modifiers. */
   if (instr_info.can_use_input_modifiers[(int)instr->opcode]) {
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         Operand& op = instr->operands[i];
         if (!op.isTemp())
            continue;

         ssa_info& info = ctx.info[op.tempId()];
         if (info.is_vop3p() && info.instr->opcode == aco_opcode::v_pk_mul_f16 &&
             info.instr->operands[1].constantEquals(0x3C00)) {

            VOP3P_instruction* fneg = &info.instr->vop3p();

            if ((fneg->opsel_lo | fneg->opsel_hi) & 2)
               continue;

            Operand ops[3];
            for (unsigned j = 0; j < instr->operands.size(); j++)
               ops[j] = instr->operands[j];
            ops[i] = info.instr->operands[0];
            if (!check_vop3_operands(ctx, instr->operands.size(), ops))
               continue;

            if (fneg->clamp)
               continue;
            instr->operands[i] = fneg->operands[0];

            /* opsel_lo/hi is either 0 or 1:
             * if 0 - pick selection from fneg->lo
             * if 1 - pick selection from fneg->hi
             */
            bool opsel_lo = (vop3p->opsel_lo >> i) & 1;
            bool opsel_hi = (vop3p->opsel_hi >> i) & 1;
            bool neg_lo = fneg->neg_lo[0] ^ fneg->neg_lo[1];
            bool neg_hi = fneg->neg_hi[0] ^ fneg->neg_hi[1];
            vop3p->neg_lo[i] ^= opsel_lo ? neg_hi : neg_lo;
            vop3p->neg_hi[i] ^= opsel_hi ? neg_hi : neg_lo;
            vop3p->opsel_lo ^= ((opsel_lo ? ~fneg->opsel_hi : fneg->opsel_lo) & 1) << i;
            vop3p->opsel_hi ^= ((opsel_hi ? ~fneg->opsel_hi : fneg->opsel_lo) & 1) << i;

            if (--ctx.uses[fneg->definitions[0].tempId()])
               ctx.uses[fneg->operands[0].tempId()]++;
         }
      }
   }

   if (instr->opcode == aco_opcode::v_pk_add_f16 || instr->opcode == aco_opcode::v_pk_add_u16) {
      bool fadd = instr->opcode == aco_opcode::v_pk_add_f16;
      /* fusing mul+add changes rounding for precise float adds */
      if (fadd && instr->definitions[0].isPrecise())
         return;

      Instruction* mul_instr = nullptr;
      unsigned add_op_idx = 0;
      uint8_t opsel_lo = 0, opsel_hi = 0;
      uint32_t uses = UINT32_MAX;

      /* find the 'best' mul instruction to combine with the add */
      for (unsigned i = 0; i < 2; i++) {
         if (!instr->operands[i].isTemp() || !ctx.info[instr->operands[i].tempId()].is_vop3p())
            continue;
         ssa_info& info = ctx.info[instr->operands[i].tempId()];
         if (fadd) {
            if (info.instr->opcode != aco_opcode::v_pk_mul_f16 ||
                info.instr->definitions[0].isPrecise())
               continue;
         } else {
            if (info.instr->opcode != aco_opcode::v_pk_mul_lo_u16)
               continue;
         }

         Operand op[3] = {info.instr->operands[0], info.instr->operands[1], instr->operands[1 - i]};
         if (ctx.uses[instr->operands[i].tempId()] >= uses || !check_vop3_operands(ctx, 3, op))
            continue;

         /* no clamp allowed between mul and add */
         if (info.instr->vop3p().clamp)
            continue;

         mul_instr = info.instr;
         add_op_idx = 1 - i;
         opsel_lo = (vop3p->opsel_lo >> i) & 1;
         opsel_hi = (vop3p->opsel_hi >> i) & 1;
         uses = ctx.uses[instr->operands[i].tempId()];
      }

      if (!mul_instr)
         return;

      /* convert to mad */
      Operand op[3] = {mul_instr->operands[0], mul_instr->operands[1], instr->operands[add_op_idx]};
      ctx.uses[mul_instr->definitions[0].tempId()]--;
      /* if the mul stays alive, its sources gain an extra user here */
      if (ctx.uses[mul_instr->definitions[0].tempId()]) {
         if (op[0].isTemp())
            ctx.uses[op[0].tempId()]++;
         if (op[1].isTemp())
            ctx.uses[op[1].tempId()]++;
      }

      /* turn packed mul+add into v_pk_fma_f16 */
      assert(mul_instr->isVOP3P());
      aco_opcode mad = fadd ? aco_opcode::v_pk_fma_f16 : aco_opcode::v_pk_mad_u16;
      aco_ptr<VOP3P_instruction> fma{
         create_instruction<VOP3P_instruction>(mad, Format::VOP3P, 3, 1)};
      VOP3P_instruction* mul = &mul_instr->vop3p();
      for (unsigned i = 0; i < 2; i++) {
         fma->operands[i] = op[i];
         fma->neg_lo[i] = mul->neg_lo[i];
         fma->neg_hi[i] = mul->neg_hi[i];
      }
      fma->operands[2] = op[2];
      fma->clamp = vop3p->clamp;
      fma->opsel_lo = mul->opsel_lo;
      fma->opsel_hi = mul->opsel_hi;
      propagate_swizzles(fma.get(), opsel_lo, opsel_hi);
      /* merge the add's swizzle/negation of the addend into operand 2 */
      fma->opsel_lo |= (vop3p->opsel_lo << (2 - add_op_idx)) & 0x4;
      fma->opsel_hi |= (vop3p->opsel_hi << (2 - add_op_idx)) & 0x4;
      fma->neg_lo[2] = vop3p->neg_lo[add_op_idx];
      fma->neg_hi[2] = vop3p->neg_hi[add_op_idx];
      fma->neg_lo[1] = fma->neg_lo[1] ^ vop3p->neg_lo[1 - add_op_idx];
      fma->neg_hi[1] = fma->neg_hi[1] ^ vop3p->neg_hi[1 - add_op_idx];
      fma->definitions[0] = instr->definitions[0];
      instr = std::move(fma);
      ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get());
      return;
   }
}

/* Check whether instr can be converted to (or already is) a v_fma_mix
 * instruction without changing results. */
bool
can_use_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   if (ctx.program->gfx_level < GFX9)
      return false;

   /* v_mad_mix* on GFX9 always flushes denormals for 16-bit inputs/outputs */
   if (ctx.program->gfx_level == GFX9 && ctx.fp_mode.denorm16_64)
      return false;

   switch (instr->opcode) {
   case aco_opcode::v_add_f32:
   case aco_opcode::v_sub_f32:
   case aco_opcode::v_subrev_f32:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_fma_f32: break;
   case aco_opcode::v_fma_mix_f32:
   case aco_opcode::v_fma_mixlo_f16: return true;
   default: return false;
   }

   /* on non-fused hardware the mix form would change precise fma rounding */
   if (instr->opcode == aco_opcode::v_fma_f32 && !ctx.program->dev.fused_mad_mix &&
       instr->definitions[0].isPrecise())
      return
 false;

   /* v_fma_mix has no omod, and opsel bit 3 (dst hi) can't be expressed */
   if (instr->isVOP3())
      return !instr->vop3().omod && !(instr->vop3().opsel & 0x8);

   return instr->format == Format::VOP2;
}

/* Rewrite a plain f32 add/sub/mul/fma as v_fma_mix_f32, preserving all input
 * modifiers. For adds, operand 0 becomes the constant 1.0 multiplicand. */
void
to_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   bool is_add = instr->opcode != aco_opcode::v_mul_f32 && instr->opcode != aco_opcode::v_fma_f32;

   aco_ptr<VOP3P_instruction> vop3p{
      create_instruction<VOP3P_instruction>(aco_opcode::v_fma_mix_f32, Format::VOP3P, 3, 1)};

   /* for adds the original operands shift up by one to make room for 1.0 */
   vop3p->opsel_lo = instr->isVOP3() ? ((instr->vop3().opsel & 0x7) << (is_add ? 1 : 0)) : 0x0;
   vop3p->opsel_hi = 0x0;
   for (unsigned i = 0; i < instr->operands.size(); i++) {
      vop3p->operands[is_add + i] = instr->operands[i];
      /* in mad-mix encoding, neg_lo is neg and neg_hi is abs */
      vop3p->neg_lo[is_add + i] = instr->isVOP3() && instr->vop3().neg[i];
      vop3p->neg_lo[is_add + i] |= instr->isSDWA() && instr->sdwa().neg[i];
      vop3p->neg_hi[is_add + i] = instr->isVOP3() && instr->vop3().abs[i];
      vop3p->neg_hi[is_add + i] |= instr->isSDWA() && instr->sdwa().abs[i];
      vop3p->opsel_lo |= (instr->isSDWA() && instr->sdwa().sel[i].offset()) << (is_add + i);
   }
   if (instr->opcode == aco_opcode::v_mul_f32) {
      /* mul: a * b + (-0.0) */
      vop3p->opsel_hi &= 0x3;
      vop3p->operands[2] = Operand::zero();
      vop3p->neg_lo[2] = true;
   } else if (is_add) {
      /* add/sub: 1.0 * a + b, with a negation for the sub variants */
      vop3p->opsel_hi &= 0x6;
      vop3p->operands[0] = Operand::c32(0x3f800000);
      if (instr->opcode == aco_opcode::v_sub_f32)
         vop3p->neg_lo[2] ^= true;
      else if (instr->opcode == aco_opcode::v_subrev_f32)
         vop3p->neg_lo[1] ^= true;
   }
   vop3p->definitions[0] = instr->definitions[0];
   vop3p->clamp = instr->isVOP3() && instr->vop3().clamp;
   instr = std::move(vop3p);

   ctx.info[instr->definitions[0].tempId()].label &= label_f2f16 | label_clamp | label_mul;
   if (ctx.info[instr->definitions[0].tempId()].label & label_mul)
      ctx.info[instr->definitions[0].tempId()].instr = instr.get();
}

/* Fold a following f32->f16 conversion into the instruction by turning it
 * into v_fma_mixlo_f16. */
bool
combine_output_conversion(opt_ctx& ctx,
aco_ptr<Instruction>& instr) 3667{ 3668 ssa_info& def_info = ctx.info[instr->definitions[0].tempId()]; 3669 if (!def_info.is_f2f16()) 3670 return false; 3671 Instruction* conv = def_info.instr; 3672 3673 if (!can_use_mad_mix(ctx, instr) || ctx.uses[instr->definitions[0].tempId()] != 1) 3674 return false; 3675 3676 if (!ctx.uses[conv->definitions[0].tempId()]) 3677 return false; 3678 3679 if (conv->usesModifiers()) 3680 return false; 3681 3682 if (!instr->isVOP3P()) 3683 to_mad_mix(ctx, instr); 3684 3685 instr->opcode = aco_opcode::v_fma_mixlo_f16; 3686 instr->definitions[0].swapTemp(conv->definitions[0]); 3687 if (conv->definitions[0].isPrecise()) 3688 instr->definitions[0].setPrecise(true); 3689 ctx.info[instr->definitions[0].tempId()].label &= label_clamp; 3690 ctx.uses[conv->definitions[0].tempId()]--; 3691 3692 return true; 3693} 3694 3695void 3696combine_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr) 3697{ 3698 if (!can_use_mad_mix(ctx, instr)) 3699 return; 3700 3701 for (unsigned i = 0; i < instr->operands.size(); i++) { 3702 if (!instr->operands[i].isTemp()) 3703 continue; 3704 Temp tmp = instr->operands[i].getTemp(); 3705 if (!ctx.info[tmp.id()].is_f2f32()) 3706 continue; 3707 3708 Instruction* conv = ctx.info[tmp.id()].instr; 3709 if (conv->isSDWA() && (conv->sdwa().dst_sel.size() != 4 || conv->sdwa().sel[0].size() != 2 || 3710 conv->sdwa().clamp || conv->sdwa().omod)) { 3711 continue; 3712 } else if (conv->isVOP3() && (conv->vop3().clamp || conv->vop3().omod)) { 3713 continue; 3714 } else if (conv->isDPP()) { 3715 continue; 3716 } 3717 3718 if (get_operand_size(instr, i) != 32) 3719 continue; 3720 3721 /* Conversion to VOP3P will add inline constant operands, but that shouldn't affect 3722 * check_vop3_operands(). 
*/ 3723 Operand op[3]; 3724 for (unsigned j = 0; j < instr->operands.size(); j++) 3725 op[j] = instr->operands[j]; 3726 op[i] = conv->operands[0]; 3727 if (!check_vop3_operands(ctx, instr->operands.size(), op)) 3728 continue; 3729 3730 if (!instr->isVOP3P()) { 3731 bool is_add = 3732 instr->opcode != aco_opcode::v_mul_f32 && instr->opcode != aco_opcode::v_fma_f32; 3733 to_mad_mix(ctx, instr); 3734 i += is_add; 3735 } 3736 3737 if (--ctx.uses[tmp.id()]) 3738 ctx.uses[conv->operands[0].tempId()]++; 3739 instr->operands[i].setTemp(conv->operands[0].getTemp()); 3740 if (conv->definitions[0].isPrecise()) 3741 instr->definitions[0].setPrecise(true); 3742 instr->vop3p().opsel_hi ^= 1u << i; 3743 if (conv->isSDWA() && conv->sdwa().sel[0].offset() == 2) 3744 instr->vop3p().opsel_lo |= 1u << i; 3745 bool neg = (conv->isVOP3() && conv->vop3().neg[0]) || (conv->isSDWA() && conv->sdwa().neg[0]); 3746 bool abs = (conv->isVOP3() && conv->vop3().abs[0]) || (conv->isSDWA() && conv->sdwa().abs[0]); 3747 if (!instr->vop3p().neg_hi[i]) { 3748 instr->vop3p().neg_lo[i] ^= neg; 3749 instr->vop3p().neg_hi[i] = abs; 3750 } 3751 } 3752} 3753 3754// TODO: we could possibly move the whole label_instruction pass to combine_instruction: 3755// this would mean that we'd have to fix the instruction uses while value propagation 3756 3757/* also returns true for inf */ 3758bool 3759is_pow_of_two(opt_ctx& ctx, Operand op) 3760{ 3761 if (op.isTemp() && ctx.info[op.tempId()].is_constant_or_literal(op.bytes() * 8)) 3762 return is_pow_of_two(ctx, get_constant_op(ctx, ctx.info[op.tempId()], op.bytes() * 8)); 3763 else if (!op.isConstant()) 3764 return false; 3765 3766 uint64_t val = op.constantValue64(); 3767 3768 if (op.bytes() == 4) { 3769 uint32_t exponent = (val & 0x7f800000) >> 23; 3770 uint32_t fraction = val & 0x007fffff; 3771 return (exponent >= 127) && (fraction == 0); 3772 } else if (op.bytes() == 2) { 3773 uint32_t exponent = (val & 0x7c00) >> 10; 3774 uint32_t fraction = val & 0x03ff; 3775 
return (exponent >= 15) && (fraction == 0); 3776 } else { 3777 assert(op.bytes() == 8); 3778 uint64_t exponent = (val & UINT64_C(0x7ff0000000000000)) >> 52; 3779 uint64_t fraction = val & UINT64_C(0x000fffffffffffff); 3780 return (exponent >= 1023) && (fraction == 0); 3781 } 3782} 3783 3784void 3785combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) 3786{ 3787 if (instr->definitions.empty() || is_dead(ctx.uses, instr.get())) 3788 return; 3789 3790 if (instr->isVALU()) { 3791 /* Apply SDWA. Do this after label_instruction() so it can remove 3792 * label_extract if not all instructions can take SDWA. */ 3793 for (unsigned i = 0; i < instr->operands.size(); i++) { 3794 Operand& op = instr->operands[i]; 3795 if (!op.isTemp()) 3796 continue; 3797 ssa_info& info = ctx.info[op.tempId()]; 3798 if (!info.is_extract()) 3799 continue; 3800 /* if there are that many uses, there are likely better combinations */ 3801 // TODO: delay applying extract to a point where we know better 3802 if (ctx.uses[op.tempId()] > 4) { 3803 info.label &= ~label_extract; 3804 continue; 3805 } 3806 if (info.is_extract() && 3807 (info.instr->operands[0].getTemp().type() == RegType::vgpr || 3808 instr->operands[i].getTemp().type() == RegType::sgpr) && 3809 can_apply_extract(ctx, instr, i, info)) { 3810 /* Increase use count of the extract's operand if the extract still has uses. 
*/ 3811 apply_extract(ctx, instr, i, info); 3812 if (--ctx.uses[instr->operands[i].tempId()]) 3813 ctx.uses[info.instr->operands[0].tempId()]++; 3814 instr->operands[i].setTemp(info.instr->operands[0].getTemp()); 3815 } 3816 } 3817 3818 if (can_apply_sgprs(ctx, instr)) 3819 apply_sgprs(ctx, instr); 3820 combine_mad_mix(ctx, instr); 3821 while (apply_omod_clamp(ctx, instr) | combine_output_conversion(ctx, instr)) 3822 ; 3823 apply_insert(ctx, instr); 3824 } 3825 3826 if (instr->isVOP3P() && instr->opcode != aco_opcode::v_fma_mix_f32 && 3827 instr->opcode != aco_opcode::v_fma_mixlo_f16) 3828 return combine_vop3p(ctx, instr); 3829 3830 if (instr->isSDWA() || instr->isDPP()) 3831 return; 3832 3833 if (instr->opcode == aco_opcode::p_extract) { 3834 ssa_info& info = ctx.info[instr->operands[0].tempId()]; 3835 if (info.is_extract() && can_apply_extract(ctx, instr, 0, info)) { 3836 apply_extract(ctx, instr, 0, info); 3837 if (--ctx.uses[instr->operands[0].tempId()]) 3838 ctx.uses[info.instr->operands[0].tempId()]++; 3839 instr->operands[0].setTemp(info.instr->operands[0].getTemp()); 3840 } 3841 3842 apply_ds_extract(ctx, instr); 3843 } 3844 3845 /* TODO: There are still some peephole optimizations that could be done: 3846 * - abs(a - b) -> s_absdiff_i32 3847 * - various patterns for s_bitcmp{0,1}_b32 and s_bitset{0,1}_b32 3848 * - patterns for v_alignbit_b32 and v_alignbyte_b32 3849 * These aren't probably too interesting though. 3850 * There are also patterns for v_cmp_class_f{16,32,64}. This is difficult but 3851 * probably more useful than the previously mentioned optimizations. 3852 * The various comparison optimizations also currently only work with 32-bit 3853 * floats. 
*/ 3854 3855 /* neg(mul(a, b)) -> mul(neg(a), b), abs(mul(a, b)) -> mul(abs(a), abs(b)) */ 3856 if ((ctx.info[instr->definitions[0].tempId()].label & (label_neg | label_abs)) && 3857 ctx.uses[instr->operands[1].tempId()] == 1) { 3858 Temp val = ctx.info[instr->definitions[0].tempId()].temp; 3859 3860 if (!ctx.info[val.id()].is_mul()) 3861 return; 3862 3863 Instruction* mul_instr = ctx.info[val.id()].instr; 3864 3865 if (mul_instr->operands[0].isLiteral()) 3866 return; 3867 if (mul_instr->isVOP3() && mul_instr->vop3().clamp) 3868 return; 3869 if (mul_instr->isSDWA() || mul_instr->isDPP() || mul_instr->isVOP3P()) 3870 return; 3871 if (mul_instr->opcode == aco_opcode::v_mul_legacy_f32 && 3872 ctx.fp_mode.preserve_signed_zero_inf_nan32) 3873 return; 3874 if (mul_instr->definitions[0].bytes() != instr->definitions[0].bytes()) 3875 return; 3876 3877 /* convert to mul(neg(a), b), mul(abs(a), abs(b)) or mul(neg(abs(a)), abs(b)) */ 3878 ctx.uses[mul_instr->definitions[0].tempId()]--; 3879 Definition def = instr->definitions[0]; 3880 bool is_neg = ctx.info[instr->definitions[0].tempId()].is_neg(); 3881 bool is_abs = ctx.info[instr->definitions[0].tempId()].is_abs(); 3882 instr.reset( 3883 create_instruction<VOP3_instruction>(mul_instr->opcode, asVOP3(Format::VOP2), 2, 1)); 3884 instr->operands[0] = mul_instr->operands[0]; 3885 instr->operands[1] = mul_instr->operands[1]; 3886 instr->definitions[0] = def; 3887 VOP3_instruction& new_mul = instr->vop3(); 3888 if (mul_instr->isVOP3()) { 3889 VOP3_instruction& mul = mul_instr->vop3(); 3890 new_mul.neg[0] = mul.neg[0]; 3891 new_mul.neg[1] = mul.neg[1]; 3892 new_mul.abs[0] = mul.abs[0]; 3893 new_mul.abs[1] = mul.abs[1]; 3894 new_mul.omod = mul.omod; 3895 } 3896 if (is_abs) { 3897 new_mul.neg[0] = new_mul.neg[1] = false; 3898 new_mul.abs[0] = new_mul.abs[1] = true; 3899 } 3900 new_mul.neg[0] ^= is_neg; 3901 new_mul.clamp = false; 3902 3903 ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); 3904 return; 3905 } 3906 3907 
/* combine mul+add -> mad */ 3908 bool is_add_mix = 3909 (instr->opcode == aco_opcode::v_fma_mix_f32 || 3910 instr->opcode == aco_opcode::v_fma_mixlo_f16) && 3911 !instr->vop3p().neg_lo[0] && 3912 ((instr->operands[0].constantEquals(0x3f800000) && (instr->vop3p().opsel_hi & 0x1) == 0) || 3913 (instr->operands[0].constantEquals(0x3C00) && (instr->vop3p().opsel_hi & 0x1) && 3914 !(instr->vop3p().opsel_lo & 0x1))); 3915 bool mad32 = instr->opcode == aco_opcode::v_add_f32 || instr->opcode == aco_opcode::v_sub_f32 || 3916 instr->opcode == aco_opcode::v_subrev_f32; 3917 bool mad16 = instr->opcode == aco_opcode::v_add_f16 || instr->opcode == aco_opcode::v_sub_f16 || 3918 instr->opcode == aco_opcode::v_subrev_f16; 3919 bool mad64 = instr->opcode == aco_opcode::v_add_f64; 3920 if (is_add_mix || mad16 || mad32 || mad64) { 3921 Instruction* mul_instr = nullptr; 3922 unsigned add_op_idx = 0; 3923 uint32_t uses = UINT32_MAX; 3924 bool emit_fma = false; 3925 /* find the 'best' mul instruction to combine with the add */ 3926 for (unsigned i = is_add_mix ? 
1 : 0; i < instr->operands.size(); i++) { 3927 if (!instr->operands[i].isTemp() || !ctx.info[instr->operands[i].tempId()].is_mul()) 3928 continue; 3929 ssa_info& info = ctx.info[instr->operands[i].tempId()]; 3930 3931 /* no clamp/omod allowed between mul and add */ 3932 if (info.instr->isVOP3() && (info.instr->vop3().clamp || info.instr->vop3().omod)) 3933 continue; 3934 if (info.instr->isVOP3P() && info.instr->vop3p().clamp) 3935 continue; 3936 /* v_fma_mix_f32/etc can't do omod */ 3937 if (info.instr->isVOP3P() && instr->isVOP3() && instr->vop3().omod) 3938 continue; 3939 /* don't promote fp16 to fp32 or remove fp32->fp16->fp32 conversions */ 3940 if (is_add_mix && info.instr->definitions[0].bytes() == 2) 3941 continue; 3942 3943 if (get_operand_size(instr, i) != info.instr->definitions[0].bytes() * 8) 3944 continue; 3945 3946 bool legacy = info.instr->opcode == aco_opcode::v_mul_legacy_f32; 3947 bool mad_mix = is_add_mix || info.instr->isVOP3P(); 3948 3949 /* Multiplication by power-of-two should never need rounding. 1/power-of-two also works, 3950 * but using fma removes denormal flushing (0xfffffe * 0.5 + 0x810001a2). 3951 */ 3952 bool is_fma_precise = is_pow_of_two(ctx, info.instr->operands[0]) || 3953 is_pow_of_two(ctx, info.instr->operands[1]); 3954 3955 bool has_fma = mad16 || mad64 || (legacy && ctx.program->gfx_level >= GFX10_3) || 3956 (mad32 && !legacy && !mad_mix && ctx.program->dev.has_fast_fma32) || 3957 (mad_mix && ctx.program->dev.fused_mad_mix); 3958 bool has_mad = mad_mix ? !ctx.program->dev.fused_mad_mix 3959 : ((mad32 && ctx.program->gfx_level < GFX10_3) || 3960 (mad16 && ctx.program->gfx_level <= GFX9)); 3961 bool can_use_fma = 3962 has_fma && 3963 (!(info.instr->definitions[0].isPrecise() || instr->definitions[0].isPrecise()) || 3964 is_fma_precise); 3965 bool can_use_mad = 3966 has_mad && (mad_mix || mad32 ? 
ctx.fp_mode.denorm32 : ctx.fp_mode.denorm16_64) == 0; 3967 if (mad_mix && legacy) 3968 continue; 3969 if (!can_use_fma && !can_use_mad) 3970 continue; 3971 3972 unsigned candidate_add_op_idx = is_add_mix ? (3 - i) : (1 - i); 3973 Operand op[3] = {info.instr->operands[0], info.instr->operands[1], 3974 instr->operands[candidate_add_op_idx]}; 3975 if (info.instr->isSDWA() || info.instr->isDPP() || !check_vop3_operands(ctx, 3, op) || 3976 ctx.uses[instr->operands[i].tempId()] > uses) 3977 continue; 3978 3979 if (ctx.uses[instr->operands[i].tempId()] == uses) { 3980 unsigned cur_idx = mul_instr->definitions[0].tempId(); 3981 unsigned new_idx = info.instr->definitions[0].tempId(); 3982 if (cur_idx > new_idx) 3983 continue; 3984 } 3985 3986 mul_instr = info.instr; 3987 add_op_idx = candidate_add_op_idx; 3988 uses = ctx.uses[instr->operands[i].tempId()]; 3989 emit_fma = !can_use_mad; 3990 } 3991 3992 if (mul_instr) { 3993 /* turn mul+add into v_mad/v_fma */ 3994 Operand op[3] = {mul_instr->operands[0], mul_instr->operands[1], 3995 instr->operands[add_op_idx]}; 3996 ctx.uses[mul_instr->definitions[0].tempId()]--; 3997 if (ctx.uses[mul_instr->definitions[0].tempId()]) { 3998 if (op[0].isTemp()) 3999 ctx.uses[op[0].tempId()]++; 4000 if (op[1].isTemp()) 4001 ctx.uses[op[1].tempId()]++; 4002 } 4003 4004 bool neg[3] = {false, false, false}; 4005 bool abs[3] = {false, false, false}; 4006 unsigned omod = 0; 4007 bool clamp = false; 4008 uint8_t opsel_lo = 0; 4009 uint8_t opsel_hi = 0; 4010 4011 if (mul_instr->isVOP3()) { 4012 VOP3_instruction& vop3 = mul_instr->vop3(); 4013 neg[0] = vop3.neg[0]; 4014 neg[1] = vop3.neg[1]; 4015 abs[0] = vop3.abs[0]; 4016 abs[1] = vop3.abs[1]; 4017 } else if (mul_instr->isVOP3P()) { 4018 VOP3P_instruction& vop3p = mul_instr->vop3p(); 4019 neg[0] = vop3p.neg_lo[0]; 4020 neg[1] = vop3p.neg_lo[1]; 4021 abs[0] = vop3p.neg_hi[0]; 4022 abs[1] = vop3p.neg_hi[1]; 4023 opsel_lo = vop3p.opsel_lo & 0x3; 4024 opsel_hi = vop3p.opsel_hi & 0x3; 4025 } 4026 4027 if 
(instr->isVOP3()) { 4028 VOP3_instruction& vop3 = instr->vop3(); 4029 neg[2] = vop3.neg[add_op_idx]; 4030 abs[2] = vop3.abs[add_op_idx]; 4031 omod = vop3.omod; 4032 clamp = vop3.clamp; 4033 /* abs of the multiplication result */ 4034 if (vop3.abs[1 - add_op_idx]) { 4035 neg[0] = false; 4036 neg[1] = false; 4037 abs[0] = true; 4038 abs[1] = true; 4039 } 4040 /* neg of the multiplication result */ 4041 neg[1] = neg[1] ^ vop3.neg[1 - add_op_idx]; 4042 } else if (instr->isVOP3P()) { 4043 VOP3P_instruction& vop3p = instr->vop3p(); 4044 neg[2] = vop3p.neg_lo[add_op_idx]; 4045 abs[2] = vop3p.neg_hi[add_op_idx]; 4046 opsel_lo |= vop3p.opsel_lo & (1 << add_op_idx) ? 0x4 : 0x0; 4047 opsel_hi |= vop3p.opsel_hi & (1 << add_op_idx) ? 0x4 : 0x0; 4048 clamp = vop3p.clamp; 4049 /* abs of the multiplication result */ 4050 if (vop3p.neg_hi[3 - add_op_idx]) { 4051 neg[0] = false; 4052 neg[1] = false; 4053 abs[0] = true; 4054 abs[1] = true; 4055 } 4056 /* neg of the multiplication result */ 4057 neg[1] = neg[1] ^ vop3p.neg_lo[3 - add_op_idx]; 4058 } 4059 4060 if (instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_sub_f16) 4061 neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true; 4062 else if (instr->opcode == aco_opcode::v_subrev_f32 || 4063 instr->opcode == aco_opcode::v_subrev_f16) 4064 neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true; 4065 4066 aco_ptr<Instruction> add_instr = std::move(instr); 4067 if (add_instr->isVOP3P() || mul_instr->isVOP3P()) { 4068 assert(!omod); 4069 4070 aco_opcode mad_op = add_instr->definitions[0].bytes() == 2 ? 
aco_opcode::v_fma_mixlo_f16 4071 : aco_opcode::v_fma_mix_f32; 4072 aco_ptr<VOP3P_instruction> mad{ 4073 create_instruction<VOP3P_instruction>(mad_op, Format::VOP3P, 3, 1)}; 4074 for (unsigned i = 0; i < 3; i++) { 4075 mad->operands[i] = op[i]; 4076 mad->neg_lo[i] = neg[i]; 4077 mad->neg_hi[i] = abs[i]; 4078 } 4079 mad->clamp = clamp; 4080 mad->opsel_lo = opsel_lo; 4081 mad->opsel_hi = opsel_hi; 4082 4083 instr = std::move(mad); 4084 } else { 4085 aco_opcode mad_op = emit_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32; 4086 if (mul_instr->opcode == aco_opcode::v_mul_legacy_f32) { 4087 assert(emit_fma == (ctx.program->gfx_level >= GFX10_3)); 4088 mad_op = emit_fma ? aco_opcode::v_fma_legacy_f32 : aco_opcode::v_mad_legacy_f32; 4089 } else if (mad16) { 4090 mad_op = emit_fma ? (ctx.program->gfx_level == GFX8 ? aco_opcode::v_fma_legacy_f16 4091 : aco_opcode::v_fma_f16) 4092 : (ctx.program->gfx_level == GFX8 ? aco_opcode::v_mad_legacy_f16 4093 : aco_opcode::v_mad_f16); 4094 } else if (mad64) { 4095 mad_op = aco_opcode::v_fma_f64; 4096 } 4097 4098 aco_ptr<VOP3_instruction> mad{ 4099 create_instruction<VOP3_instruction>(mad_op, Format::VOP3, 3, 1)}; 4100 for (unsigned i = 0; i < 3; i++) { 4101 mad->operands[i] = op[i]; 4102 mad->neg[i] = neg[i]; 4103 mad->abs[i] = abs[i]; 4104 } 4105 mad->omod = omod; 4106 mad->clamp = clamp; 4107 4108 instr = std::move(mad); 4109 } 4110 instr->definitions[0] = add_instr->definitions[0]; 4111 4112 /* mark this ssa_def to be re-checked for profitability and literals */ 4113 ctx.mad_infos.emplace_back(std::move(add_instr), mul_instr->definitions[0].tempId()); 4114 ctx.info[instr->definitions[0].tempId()].set_mad(instr.get(), ctx.mad_infos.size() - 1); 4115 return; 4116 } 4117 } 4118 /* v_mul_f32(v_cndmask_b32(0, 1.0, cond), a) -> v_cndmask_b32(0, a, cond) */ 4119 else if (((instr->opcode == aco_opcode::v_mul_f32 && 4120 !ctx.fp_mode.preserve_signed_zero_inf_nan32) || 4121 instr->opcode == aco_opcode::v_mul_legacy_f32) && 4122 
!instr->usesModifiers() && !ctx.fp_mode.must_flush_denorms32) { 4123 for (unsigned i = 0; i < 2; i++) { 4124 if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2f() && 4125 ctx.uses[instr->operands[i].tempId()] == 1 && instr->operands[!i].isTemp() && 4126 instr->operands[!i].getTemp().type() == RegType::vgpr) { 4127 ctx.uses[instr->operands[i].tempId()]--; 4128 ctx.uses[ctx.info[instr->operands[i].tempId()].temp.id()]++; 4129 4130 aco_ptr<VOP2_instruction> new_instr{ 4131 create_instruction<VOP2_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)}; 4132 new_instr->operands[0] = Operand::zero(); 4133 new_instr->operands[1] = instr->operands[!i]; 4134 new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp); 4135 new_instr->definitions[0] = instr->definitions[0]; 4136 instr = std::move(new_instr); 4137 ctx.info[instr->definitions[0].tempId()].label = 0; 4138 return; 4139 } 4140 } 4141 } else if (instr->opcode == aco_opcode::v_or_b32 && ctx.program->gfx_level >= GFX9) { 4142 if (combine_three_valu_op(ctx, instr, aco_opcode::s_or_b32, aco_opcode::v_or3_b32, "012", 4143 1 | 2)) { 4144 } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32, 4145 "012", 1 | 2)) { 4146 } else if (combine_add_or_then_and_lshl(ctx, instr)) { 4147 } 4148 } else if (instr->opcode == aco_opcode::v_xor_b32 && ctx.program->gfx_level >= GFX10) { 4149 if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xor3_b32, "012", 4150 1 | 2)) { 4151 } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xor3_b32, 4152 "012", 1 | 2)) { 4153 } 4154 } else if (instr->opcode == aco_opcode::v_add_u16) { 4155 combine_three_valu_op( 4156 ctx, instr, aco_opcode::v_mul_lo_u16, 4157 ctx.program->gfx_level == GFX8 ? 
aco_opcode::v_mad_legacy_u16 : aco_opcode::v_mad_u16, 4158 "120", 1 | 2); 4159 } else if (instr->opcode == aco_opcode::v_add_u16_e64) { 4160 combine_three_valu_op(ctx, instr, aco_opcode::v_mul_lo_u16_e64, aco_opcode::v_mad_u16, "120", 4161 1 | 2); 4162 } else if (instr->opcode == aco_opcode::v_add_u32) { 4163 if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) { 4164 } else if (combine_add_bcnt(ctx, instr)) { 4165 } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, 4166 aco_opcode::v_mad_u32_u24, "120", 1 | 2)) { 4167 } else if (ctx.program->gfx_level >= GFX9 && !instr->usesModifiers()) { 4168 if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120", 4169 1 | 2)) { 4170 } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32, 4171 "120", 1 | 2)) { 4172 } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_i32, aco_opcode::v_add3_u32, 4173 "012", 1 | 2)) { 4174 } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32, 4175 "012", 1 | 2)) { 4176 } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32, 4177 "012", 1 | 2)) { 4178 } else if (combine_add_or_then_and_lshl(ctx, instr)) { 4179 } 4180 } 4181 } else if (instr->opcode == aco_opcode::v_add_co_u32 || 4182 instr->opcode == aco_opcode::v_add_co_u32_e64) { 4183 bool carry_out = ctx.uses[instr->definitions[1].tempId()] > 0; 4184 if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) { 4185 } else if (!carry_out && combine_add_bcnt(ctx, instr)) { 4186 } else if (!carry_out && combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, 4187 aco_opcode::v_mad_u32_u24, "120", 1 | 2)) { 4188 } else if (!carry_out && combine_add_lshl(ctx, instr, false)) { 4189 } 4190 } else if (instr->opcode == aco_opcode::v_sub_u32 || instr->opcode == aco_opcode::v_sub_co_u32 || 4191 instr->opcode == aco_opcode::v_sub_co_u32_e64) { 4192 
bool carry_out = 4193 instr->opcode != aco_opcode::v_sub_u32 && ctx.uses[instr->definitions[1].tempId()] > 0; 4194 if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 2)) { 4195 } else if (!carry_out && combine_add_lshl(ctx, instr, true)) { 4196 } 4197 } else if (instr->opcode == aco_opcode::v_subrev_u32 || 4198 instr->opcode == aco_opcode::v_subrev_co_u32 || 4199 instr->opcode == aco_opcode::v_subrev_co_u32_e64) { 4200 combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 1); 4201 } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && ctx.program->gfx_level >= GFX9) { 4202 combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120", 4203 2); 4204 } else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) && 4205 ctx.program->gfx_level >= GFX9) { 4206 combine_salu_lshl_add(ctx, instr); 4207 } else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64) { 4208 combine_salu_not_bitwise(ctx, instr); 4209 } else if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_or_b32 || 4210 instr->opcode == aco_opcode::s_and_b64 || instr->opcode == aco_opcode::s_or_b64) { 4211 if (combine_ordering_test(ctx, instr)) { 4212 } else if (combine_comparison_ordering(ctx, instr)) { 4213 } else if (combine_constant_comparison_ordering(ctx, instr)) { 4214 } else if (combine_salu_n2(ctx, instr)) { 4215 } 4216 } else if (instr->opcode == aco_opcode::v_and_b32) { 4217 combine_and_subbrev(ctx, instr); 4218 } else if (instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) { 4219 /* set existing v_fma_f32 with label_mad so we can create v_fmamk_f32/v_fmaak_f32. 4220 * since ctx.uses[mad_info::mul_temp_id] is always 0, we don't have to worry about 4221 * select_instruction() using mad_info::add_instr. 
4222 */ 4223 ctx.mad_infos.emplace_back(nullptr, 0); 4224 ctx.info[instr->definitions[0].tempId()].set_mad(instr.get(), ctx.mad_infos.size() - 1); 4225 } else { 4226 aco_opcode min, max, min3, max3, med3; 4227 bool some_gfx9_only; 4228 if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &some_gfx9_only) && 4229 (!some_gfx9_only || ctx.program->gfx_level >= GFX9)) { 4230 if (combine_minmax(ctx, instr, instr->opcode == min ? max : min, 4231 instr->opcode == min ? min3 : max3)) { 4232 } else { 4233 combine_clamp(ctx, instr, min, max, med3); 4234 } 4235 } 4236 } 4237 4238 /* do this after combine_salu_n2() */ 4239 if (instr->opcode == aco_opcode::s_andn2_b32 || instr->opcode == aco_opcode::s_andn2_b64) 4240 combine_inverse_comparison(ctx, instr); 4241} 4242 4243bool 4244to_uniform_bool_instr(opt_ctx& ctx, aco_ptr<Instruction>& instr) 4245{ 4246 /* Check every operand to make sure they are suitable. */ 4247 for (Operand& op : instr->operands) { 4248 if (!op.isTemp()) 4249 return false; 4250 if (!ctx.info[op.tempId()].is_uniform_bool() && !ctx.info[op.tempId()].is_uniform_bitwise()) 4251 return false; 4252 } 4253 4254 switch (instr->opcode) { 4255 case aco_opcode::s_and_b32: 4256 case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_and_b32; break; 4257 case aco_opcode::s_or_b32: 4258 case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_or_b32; break; 4259 case aco_opcode::s_xor_b32: 4260 case aco_opcode::s_xor_b64: instr->opcode = aco_opcode::s_absdiff_i32; break; 4261 default: 4262 /* Don't transform other instructions. They are very unlikely to appear here. */ 4263 return false; 4264 } 4265 4266 for (Operand& op : instr->operands) { 4267 ctx.uses[op.tempId()]--; 4268 4269 if (ctx.info[op.tempId()].is_uniform_bool()) { 4270 /* Just use the uniform boolean temp. */ 4271 op.setTemp(ctx.info[op.tempId()].temp); 4272 } else if (ctx.info[op.tempId()].is_uniform_bitwise()) { 4273 /* Use the SCC definition of the predecessor instruction. 
4274 * This allows the predecessor to get picked up by the same optimization (if it has no 4275 * divergent users), and it also makes sure that the current instruction will keep working 4276 * even if the predecessor won't be transformed. 4277 */ 4278 Instruction* pred_instr = ctx.info[op.tempId()].instr; 4279 assert(pred_instr->definitions.size() >= 2); 4280 assert(pred_instr->definitions[1].isFixed() && 4281 pred_instr->definitions[1].physReg() == scc); 4282 op.setTemp(pred_instr->definitions[1].getTemp()); 4283 } else { 4284 unreachable("Invalid operand on uniform bitwise instruction."); 4285 } 4286 4287 ctx.uses[op.tempId()]++; 4288 } 4289 4290 instr->definitions[0].setTemp(Temp(instr->definitions[0].tempId(), s1)); 4291 assert(instr->operands[0].regClass() == s1); 4292 assert(instr->operands[1].regClass() == s1); 4293 return true; 4294} 4295 4296void 4297select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) 4298{ 4299 const uint32_t threshold = 4; 4300 4301 if (is_dead(ctx.uses, instr.get())) { 4302 instr.reset(); 4303 return; 4304 } 4305 4306 /* convert split_vector into a copy or extract_vector if only one definition is ever used */ 4307 if (instr->opcode == aco_opcode::p_split_vector) { 4308 unsigned num_used = 0; 4309 unsigned idx = 0; 4310 unsigned split_offset = 0; 4311 for (unsigned i = 0, offset = 0; i < instr->definitions.size(); 4312 offset += instr->definitions[i++].bytes()) { 4313 if (ctx.uses[instr->definitions[i].tempId()]) { 4314 num_used++; 4315 idx = i; 4316 split_offset = offset; 4317 } 4318 } 4319 bool done = false; 4320 if (num_used == 1 && ctx.info[instr->operands[0].tempId()].is_vec() && 4321 ctx.uses[instr->operands[0].tempId()] == 1) { 4322 Instruction* vec = ctx.info[instr->operands[0].tempId()].instr; 4323 4324 unsigned off = 0; 4325 Operand op; 4326 for (Operand& vec_op : vec->operands) { 4327 if (off == split_offset) { 4328 op = vec_op; 4329 break; 4330 } 4331 off += vec_op.bytes(); 4332 } 4333 if (off != 
instr->operands[0].bytes() && op.bytes() == instr->definitions[idx].bytes()) { 4334 ctx.uses[instr->operands[0].tempId()]--; 4335 for (Operand& vec_op : vec->operands) { 4336 if (vec_op.isTemp()) 4337 ctx.uses[vec_op.tempId()]--; 4338 } 4339 if (op.isTemp()) 4340 ctx.uses[op.tempId()]++; 4341 4342 aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>( 4343 aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)}; 4344 extract->operands[0] = op; 4345 extract->definitions[0] = instr->definitions[idx]; 4346 instr = std::move(extract); 4347 4348 done = true; 4349 } 4350 } 4351 4352 if (!done && num_used == 1 && 4353 instr->operands[0].bytes() % instr->definitions[idx].bytes() == 0 && 4354 split_offset % instr->definitions[idx].bytes() == 0) { 4355 aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>( 4356 aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)}; 4357 extract->operands[0] = instr->operands[0]; 4358 extract->operands[1] = 4359 Operand::c32((uint32_t)split_offset / instr->definitions[idx].bytes()); 4360 extract->definitions[0] = instr->definitions[idx]; 4361 instr = std::move(extract); 4362 } 4363 } 4364 4365 mad_info* mad_info = NULL; 4366 if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) { 4367 mad_info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].instr->pass_flags]; 4368 /* re-check mad instructions */ 4369 if (ctx.uses[mad_info->mul_temp_id] && mad_info->add_instr) { 4370 ctx.uses[mad_info->mul_temp_id]++; 4371 if (instr->operands[0].isTemp()) 4372 ctx.uses[instr->operands[0].tempId()]--; 4373 if (instr->operands[1].isTemp()) 4374 ctx.uses[instr->operands[1].tempId()]--; 4375 instr.swap(mad_info->add_instr); 4376 mad_info = NULL; 4377 } 4378 /* check literals */ 4379 else if (!instr->usesModifiers() && !instr->isVOP3P() && 4380 instr->opcode != aco_opcode::v_fma_f64 && 4381 instr->opcode != aco_opcode::v_mad_legacy_f32 && 4382 instr->opcode != 
aco_opcode::v_fma_legacy_f32) { 4383 /* FMA can only take literals on GFX10+ */ 4384 if ((instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) && 4385 ctx.program->gfx_level < GFX10) 4386 return; 4387 /* There are no v_fmaak_legacy_f16/v_fmamk_legacy_f16 and on chips where VOP3 can take 4388 * literals (GFX10+), these instructions don't exist. 4389 */ 4390 if (instr->opcode == aco_opcode::v_fma_legacy_f16) 4391 return; 4392 4393 uint32_t literal_idx = 0; 4394 uint32_t literal_uses = UINT32_MAX; 4395 4396 /* Try using v_madak/v_fmaak */ 4397 if (instr->operands[2].isTemp() && 4398 ctx.info[instr->operands[2].tempId()].is_literal(get_operand_size(instr, 2))) { 4399 bool has_sgpr = false; 4400 bool has_vgpr = false; 4401 for (unsigned i = 0; i < 2; i++) { 4402 if (!instr->operands[i].isTemp()) 4403 continue; 4404 has_sgpr |= instr->operands[i].getTemp().type() == RegType::sgpr; 4405 has_vgpr |= instr->operands[i].getTemp().type() == RegType::vgpr; 4406 } 4407 /* Encoding limitations requires a VGPR operand. The constant bus limitations before 4408 * GFX10 disallows SGPRs. 4409 */ 4410 if ((!has_sgpr || ctx.program->gfx_level >= GFX10) && has_vgpr) { 4411 literal_idx = 2; 4412 literal_uses = ctx.uses[instr->operands[2].tempId()]; 4413 } 4414 } 4415 4416 /* Try using v_madmk/v_fmamk */ 4417 /* Encoding limitations requires a VGPR operand. */ 4418 if (instr->operands[2].isTemp() && instr->operands[2].getTemp().type() == RegType::vgpr) { 4419 for (unsigned i = 0; i < 2; i++) { 4420 if (!instr->operands[i].isTemp()) 4421 continue; 4422 4423 /* The constant bus limitations before GFX10 disallows SGPRs. 
                */
               /* Pre-GFX10, a VALU instruction can read at most one
                * SGPR-or-literal source (see the const_bus_limit logic
                * further down), so if the other multiply operand is an SGPR
                * this operand cannot become the inline literal. */
               if (ctx.program->gfx_level < GFX10 && instr->operands[!i].isTemp() &&
                   instr->operands[!i].getTemp().type() == RegType::sgpr)
                  continue;

               /* Prefer the candidate temporary with the fewest remaining
                * uses: the fewer other uses, the more likely folding the
                * literal makes its defining instruction dead. */
               if (ctx.info[instr->operands[i].tempId()].is_literal(get_operand_size(instr, i)) &&
                   ctx.uses[instr->operands[i].tempId()] < literal_uses) {
                  literal_idx = i;
                  literal_uses = ctx.uses[instr->operands[i].tempId()];
               }
            }
         }

         /* Limit the number of literals to apply to not increase the code
          * size too much, but always apply literals for v_mad->v_madak
          * because both instructions are 64-bit and this doesn't increase
          * code size.
          * TODO: try to apply the literals earlier to lower the number of
          * uses below threshold
          */
         if (literal_uses < threshold || literal_idx == 2) {
            /* Tentatively drop one use now; apply_literals() re-checks the
             * remaining use count before actually emitting v_madak/v_madmk. */
            ctx.uses[instr->operands[literal_idx].tempId()]--;
            mad_info->check_literal = true;
            mad_info->literal_idx = literal_idx;
            return;
         }
      }
   }

   /* Mark SCC needed, so the uniform boolean transformation won't swap the definitions
    * when it isn't beneficial */
   if (instr->isBranch() && instr->operands.size() && instr->operands[0].isTemp() &&
       instr->operands[0].isFixed() && instr->operands[0].physReg() == scc) {
      ctx.info[instr->operands[0].tempId()].set_scc_needed();
      return;
   } else if ((instr->opcode == aco_opcode::s_cselect_b64 ||
               instr->opcode == aco_opcode::s_cselect_b32) &&
              instr->operands[2].isTemp()) {
      /* s_cselect reads its condition from SCC (operand 2). */
      ctx.info[instr->operands[2].tempId()].set_scc_needed();
   } else if (instr->opcode == aco_opcode::p_wqm && instr->operands[0].isTemp() &&
              ctx.info[instr->definitions[0].tempId()].is_scc_needed()) {
      /* Propagate label so it is correctly detected by the uniform bool transform */
      ctx.info[instr->operands[0].tempId()].set_scc_needed();

      /* Fix definition to SCC, this will prevent RA from adding superfluous moves */
      instr->definitions[0].setFixed(scc);
   }

   /* check for literals */
   if (!instr->isSALU() && !instr->isVALU())
      return;

   /* Transform uniform bitwise boolean operations to 32-bit when there are no divergent uses. */
   if (instr->definitions.size() && ctx.uses[instr->definitions[0].tempId()] == 0 &&
       ctx.info[instr->definitions[0].tempId()].is_uniform_bitwise()) {
      bool transform_done = to_uniform_bool_instr(ctx, instr);

      if (transform_done && !ctx.info[instr->definitions[1].tempId()].is_scc_needed()) {
         /* Swap the two definition IDs in order to avoid overusing the SCC.
          * This reduces extra moves generated by RA. */
         uint32_t def0_id = instr->definitions[0].getTemp().id();
         uint32_t def1_id = instr->definitions[1].getTemp().id();
         instr->definitions[0].setTemp(Temp(def1_id, s1));
         instr->definitions[1].setTemp(Temp(def0_id, s1));
      }

      return;
   }

   /* Combine DPP copies into VALU. This should be done after creating MAD/FMA. */
   if (instr->isVALU()) {
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         if (!instr->operands[i].isTemp())
            continue;
         ssa_info info = ctx.info[instr->operands[i].tempId()];

         /* DPP modifiers only apply to src0, so a match on another operand
          * requires a commutable opcode (can_swap_operands). */
         aco_opcode swapped_op;
         if (info.is_dpp() && info.instr->pass_flags == instr->pass_flags &&
             (i == 0 || can_swap_operands(instr, &swapped_op)) &&
             can_use_DPP(instr, true, info.is_dpp8()) && !instr->isDPP()) {
            bool dpp8 = info.is_dpp8();
            convert_to_DPP(instr, dpp8);
            if (dpp8) {
               /* Copy the lane selectors from the DPP8 mov being folded. */
               DPP8_instruction* dpp = &instr->dpp8();
               for (unsigned j = 0; j < 8; ++j)
                  dpp->lane_sel[j] = info.instr->dpp8().lane_sel[j];
               if (i) {
                  instr->opcode = swapped_op;
                  std::swap(instr->operands[0], instr->operands[1]);
               }
            } else {
               DPP16_instruction* dpp = &instr->dpp16();
               if (i) {
                  /* Move the matched operand into src0, keeping the per-source
                   * neg/abs modifiers attached to the right sources. */
                  instr->opcode = swapped_op;
                  std::swap(instr->operands[0], instr->operands[1]);
                  std::swap(dpp->neg[0], dpp->neg[1]);
                  std::swap(dpp->abs[0], dpp->abs[1]);
               }
               dpp->dpp_ctrl = info.instr->dpp16().dpp_ctrl;
               dpp->bound_ctrl = info.instr->dpp16().bound_ctrl;
               /* Merge input modifiers of the folded mov: abs wins over neg. */
               dpp->neg[0] ^= info.instr->dpp16().neg[0] && !dpp->abs[0];
               dpp->abs[0] |= info.instr->dpp16().abs[0];
            }
            /* If the DPP mov is still used elsewhere, its source gains a use;
             * either way this instruction now reads the mov's source directly. */
            if (--ctx.uses[info.instr->definitions[0].tempId()])
               ctx.uses[info.instr->operands[0].tempId()]++;
            instr->operands[0].setTemp(info.instr->operands[0].getTemp());
            break;
         }
      }
   }

   if (instr->isSDWA() || (instr->isVOP3() && ctx.program->gfx_level < GFX10) ||
       (instr->isVOP3P() && ctx.program->gfx_level < GFX10))
      return; /* some encodings can't ever take literals */

   /* we do not apply the literals yet as we don't know if it is profitable */
   Operand current_literal(s1);

   unsigned literal_id = 0;
   unsigned literal_uses = UINT32_MAX;
   Operand literal(s1);
   /* Only SALU, and VOP3/VOP3P on GFX10+, can take a literal on any operand;
    * otherwise only the first operand is considered. */
   unsigned num_operands = 1;
   if (instr->isSALU() ||
       (ctx.program->gfx_level >= GFX10 && (can_use_VOP3(ctx, instr) || instr->isVOP3P())))
      num_operands = instr->operands.size();
   /* catch VOP2 with a 3rd SGPR operand (e.g. v_cndmask_b32, v_addc_co_u32) */
   else if (instr->isVALU() && instr->operands.size() >= 3)
      return;

   unsigned sgpr_ids[2] = {0, 0};
   bool is_literal_sgpr = false;
   uint32_t mask = 0;

   /* choose a literal to apply */
   for (unsigned i = 0; i < num_operands; i++) {
      Operand op = instr->operands[i];
      unsigned bits = get_operand_size(instr, i);

      /* Track up to two distinct SGPR sources for the constant-bus check. */
      if (instr->isVALU() && op.isTemp() && op.getTemp().type() == RegType::sgpr &&
          op.tempId() != sgpr_ids[0])
         sgpr_ids[!!sgpr_ids[0]] = op.tempId();

      if (op.isLiteral()) {
         /* Remember an already-present literal; any new literal must match it. */
         current_literal = op;
         continue;
      } else if (!op.isTemp() || !ctx.info[op.tempId()].is_literal(bits)) {
         continue;
      }

      if (!alu_can_accept_constant(instr->opcode, i))
         continue;

      /* Pick the literal temporary with the fewest uses; reset the operand
       * mask whenever a better candidate is found. */
      if (ctx.uses[op.tempId()] < literal_uses) {
         is_literal_sgpr = op.getTemp().type() == RegType::sgpr;
         mask = 0;
         literal = Operand::c32(ctx.info[op.tempId()].val);
         literal_uses = ctx.uses[op.tempId()];
         literal_id = op.tempId();
      }

      /* Record every operand slot that reads the chosen temporary. */
      mask |= (op.tempId() == literal_id) << i;
   }

   /* don't go over the constant bus limit */
   bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
                     instr->opcode == aco_opcode::v_lshrrev_b64 ||
                     instr->opcode == aco_opcode::v_ashrrev_i64;
   unsigned const_bus_limit = instr->isVALU() ? 1 : UINT32_MAX;
   if (ctx.program->gfx_level >= GFX10 && !is_shift64)
      const_bus_limit = 2;

   unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
   if (num_sgprs == const_bus_limit && !is_literal_sgpr)
      return;

   /* Apply only if profitable and compatible with any pre-existing literal. */
   if (literal_id && literal_uses < threshold &&
       (current_literal.isUndefined() ||
        (current_literal.size() == literal.size() &&
         current_literal.constantValue() == literal.constantValue()))) {
      /* mark the literal to be applied */
      while (mask) {
         unsigned i = u_bit_scan(&mask);
         if (instr->operands[i].isTemp() && instr->operands[i].tempId() == literal_id)
            ctx.uses[instr->operands[i].tempId()]--;
      }
   }
}

/* Map an s_cmp_* (SOPC) opcode to the corresponding s_cmpk_* (SOPK) opcode,
 * or num_opcodes if there is no SOPK form. */
static aco_opcode
sopk_opcode_for_sopc(aco_opcode opcode)
{
#define CTOK(op) \
   case aco_opcode::s_cmp_##op##_i32: return aco_opcode::s_cmpk_##op##_i32; \
   case aco_opcode::s_cmp_##op##_u32: return aco_opcode::s_cmpk_##op##_u32;
   switch (opcode) {
      CTOK(eq)
      CTOK(lg)
      CTOK(gt)
      CTOK(ge)
      CTOK(lt)
      CTOK(le)
   default: return aco_opcode::num_opcodes;
   }
#undef CTOK
}

/* Whether an s_cmp_* opcode performs a signed (i32) comparison. */
static bool
sopc_is_signed(aco_opcode opcode)
{
#define SOPC(op) \
   case aco_opcode::s_cmp_##op##_i32: return true; \
   case aco_opcode::s_cmp_##op##_u32: return false;
   switch (opcode) {
      SOPC(eq)
      SOPC(lg)
      SOPC(gt)
      SOPC(ge)
      SOPC(lt)
      SOPC(le)
   default: unreachable("Not a valid SOPC instruction.");
   }
#undef SOPC
}

/* The s_cmp_* opcode that is equivalent after swapping the two operands:
 * eq/lg are symmetric, gt<->lt and ge<->le mirror each other. */
static aco_opcode
sopc_32_swapped(aco_opcode opcode)
{
#define SOPC(op1, op2) \
   case aco_opcode::s_cmp_##op1##_i32: return aco_opcode::s_cmp_##op2##_i32; \
   case aco_opcode::s_cmp_##op1##_u32: return aco_opcode::s_cmp_##op2##_u32;
   switch (opcode) {
      SOPC(eq, eq)
      SOPC(lg, lg)
      SOPC(gt, lt)
      SOPC(ge, le)
      SOPC(lt, gt)
      SOPC(le, ge)
   default: return aco_opcode::num_opcodes;
   }
#undef SOPC
}

/* Try to turn an s_cmp_* with a 16-bit-representable literal into the
 * corresponding s_cmpk_*, which encodes the immediate inline and therefore
 * saves the extra literal dword. */
static void
try_convert_sopc_to_sopk(aco_ptr<Instruction>& instr)
{
   if (sopk_opcode_for_sopc(instr->opcode) == aco_opcode::num_opcodes)
      return;

   /* SOPK takes the immediate in place of the second source: if the literal
    * is in operand 0, swap the operands and mirror the comparison. */
   if (instr->operands[0].isLiteral()) {
      std::swap(instr->operands[0], instr->operands[1]);
      instr->opcode = sopc_32_swapped(instr->opcode);
   }

   if (!instr->operands[1].isLiteral())
      return;

   /* NOTE(review): presumably because the SOPK register field is 7 bits wide,
    * only registers 0..127 can be encoded — confirm against the ISA docs. */
   if (instr->operands[0].isFixed() && instr->operands[0].physReg() >= 128)
      return;

   uint32_t value = instr->operands[1].constantValue();

   const uint32_t i16_mask = 0xffff8000u;

   /* The SOPK immediate is 16 bits; check whether the 32-bit literal is
    * representable as a sign-extended i16 and/or a zero-extended u16. */
   bool value_is_i16 = (value & i16_mask) == 0 || (value & i16_mask) == i16_mask;
   bool value_is_u16 = !(value & 0xffff0000u);

   if (!value_is_i16 && !value_is_u16)
      return;

   /* If the value only fits the other signedness, we can still convert
    * eq/lg (which don't depend on signedness) by switching the opcode's
    * signedness to match the immediate; ordered compares must bail. */
   if (!value_is_i16 && sopc_is_signed(instr->opcode)) {
      if (instr->opcode == aco_opcode::s_cmp_lg_i32)
         instr->opcode = aco_opcode::s_cmp_lg_u32;
      else if (instr->opcode == aco_opcode::s_cmp_eq_i32)
         instr->opcode = aco_opcode::s_cmp_eq_u32;
      else
         return;
   } else if (!value_is_u16 && !sopc_is_signed(instr->opcode)) {
      if (instr->opcode == aco_opcode::s_cmp_lg_u32)
         instr->opcode = aco_opcode::s_cmp_lg_i32;
      else if (instr->opcode == aco_opcode::s_cmp_eq_u32)
         instr->opcode = aco_opcode::s_cmp_eq_i32;
      else
         return;
   }

   /* Convert the SOPC in place to a SOPK; the cast is only valid because a
    * SOPK_instruction is no larger than a SOPC_instruction. */
   static_assert(sizeof(SOPK_instruction) <= sizeof(SOPC_instruction),
                 "Invalid direct instruction cast.");
   instr->format = Format::SOPK;
   SOPK_instruction* instr_sopk = &instr->sopk();

   instr_sopk->imm = instr_sopk->operands[1].constantValue() & 0xffff;
   instr_sopk->opcode = sopk_opcode_for_sopc(instr_sopk->opcode);
   instr_sopk->operands.pop_back();
}

/* Fourth optimizer phase (see the file header): rewrite each surviving
 * instruction with the literals chosen by select_instruction() applied, and
 * append the result to ctx.instructions. Dead instructions (null pointers
 * after DCE) are dropped here. */
void
apply_literals(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   /* Cleanup Dead Instructions */
   if (!instr)
      return;

   /* apply literals on MAD */
   if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
      /* pass_flags of the labeled instruction indexes into ctx.mad_infos. */
      mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].instr->pass_flags];
      /* select_instruction() already decremented one use tentatively; only
       * fold the literal if no uses remain (or always for the add operand,
       * since v_madak doesn't grow the encoding). */
      if (info->check_literal &&
          (ctx.uses[instr->operands[info->literal_idx].tempId()] == 0 || info->literal_idx == 2)) {
         aco_ptr<Instruction> new_mad;

         /* Pick the *ak (literal addend) or *mk (literal multiplier) form
          * matching the original mad/fma opcode and precision. */
         aco_opcode new_op =
            info->literal_idx == 2 ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32;
         if (instr->opcode == aco_opcode::v_fma_f32)
            new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32;
         else if (instr->opcode == aco_opcode::v_mad_f16 ||
                  instr->opcode == aco_opcode::v_mad_legacy_f16)
            new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f16 : aco_opcode::v_madmk_f16;
         else if (instr->opcode == aco_opcode::v_fma_f16)
            new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f16 : aco_opcode::v_fmamk_f16;

         new_mad.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 1));
         if (info->literal_idx == 2) { /* add literal -> madak */
            new_mad->operands[0] = instr->operands[0];
            new_mad->operands[1] = instr->operands[1];
            /* NOTE(review): looks like this keeps a VGPR in src1 as VOP2
             * requires — confirm the encoding constraint. */
            if (!new_mad->operands[1].isTemp() ||
                new_mad->operands[1].getTemp().type() == RegType::sgpr)
               std::swap(new_mad->operands[0], new_mad->operands[1]);
         } else { /* mul literal -> madmk */
            /* src0 = the non-literal multiplicand, src1 = the addend. */
            new_mad->operands[0] = instr->operands[1 - info->literal_idx];
            new_mad->operands[1] = instr->operands[2];
         }
         /* The literal value always goes in the trailing operand slot. */
         new_mad->operands[2] =
            Operand::c32(ctx.info[instr->operands[info->literal_idx].tempId()].val);
         new_mad->definitions[0] = instr->definitions[0];
         ctx.instructions.emplace_back(std::move(new_mad));
         return;
      }
   }

   /* apply literals on other SALU/VALU */
   if (instr->isSALU() || instr->isVALU()) {
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         Operand op = instr->operands[i];
         unsigned bits = get_operand_size(instr, i);
         /* Only fold when the temporary has no remaining uses, so its
          * defining instruction can be removed. */
         if (op.isTemp() && ctx.info[op.tempId()].is_literal(bits) && ctx.uses[op.tempId()] == 0) {
            Operand literal = Operand::literal32(ctx.info[op.tempId()].val);
            /* A literal source can't be combined with DPP. */
            instr->format = withoutDPP(instr->format);
            /* Non-src0 VALU literals need the VOP3 encoding (VOP3P excepted). */
            if (instr->isVALU() && i > 0 && instr->format != Format::VOP3P)
               to_VOP3(ctx, instr);
            instr->operands[i] = literal;
         }
      }
   }

   if (instr->isSOPC())
      try_convert_sopc_to_sopk(instr);

   /* allow more s_addk_i32 optimizations if carry isn't used */
   if (instr->opcode == aco_opcode::s_add_u32 && ctx.uses[instr->definitions[1].tempId()] == 0 &&
       (instr->operands[0].isLiteral() || instr->operands[1].isLiteral()))
      instr->opcode = aco_opcode::s_add_i32;

   ctx.instructions.emplace_back(std::move(instr));
}

/* Run the four optimizer phases described in the file header over the whole
 * program, block by block (each block carries its own fp_mode). */
void
optimize(Program* program)
{
   opt_ctx ctx;
   ctx.program = program;
   /* One ssa_info slot per SSA id; ctx.info borrows the vector's storage. */
   std::vector<ssa_info> info(program->peekAllocationId());
   ctx.info = info.data();

   /* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */
   for (Block& block : program->blocks) {
      ctx.fp_mode = block.fp_mode;
      for (aco_ptr<Instruction>& instr : block.instructions)
         label_instruction(ctx, instr);
   }

   /* Use counts drive both combining and dead-code elimination below. */
   ctx.uses = dead_code_analysis(program);

   /* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */
   for (Block& block : program->blocks) {
      ctx.fp_mode = block.fp_mode;
      for (aco_ptr<Instruction>& instr : block.instructions)
         combine_instruction(ctx, instr);
   }

   /* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */
   for (auto block_rit = program->blocks.rbegin(); block_rit != program->blocks.rend();
        ++block_rit) {
      Block* block = &(*block_rit);
      ctx.fp_mode = block->fp_mode;
      for (auto instr_rit = block->instructions.rbegin(); instr_rit != block->instructions.rend();
           ++instr_rit)
         select_instruction(ctx, *instr_rit);
   }

   /* 4. Add literals to instructions */
   for (Block& block : program->blocks) {
      /* Rebuild each block's instruction list, skipping dead instructions. */
      ctx.instructions.clear();
      ctx.fp_mode = block.fp_mode;
      for (aco_ptr<Instruction>& instr : block.instructions)
         apply_literals(ctx, instr);
      block.instructions.swap(ctx.instructions);
   }
}

} // namespace aco