1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright © 2021 Valve Corporation
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
13bf215546Sopenharmony_ci * Software.
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21bf215546Sopenharmony_ci * IN THE SOFTWARE.
22bf215546Sopenharmony_ci *
23bf215546Sopenharmony_ci */
24bf215546Sopenharmony_ci
25bf215546Sopenharmony_ci#include "aco_builder.h"
26bf215546Sopenharmony_ci#include "aco_ir.h"
27bf215546Sopenharmony_ci
28bf215546Sopenharmony_ci#include <algorithm>
29bf215546Sopenharmony_ci#include <array>
30bf215546Sopenharmony_ci#include <bitset>
31bf215546Sopenharmony_ci#include <vector>
32bf215546Sopenharmony_ci
33bf215546Sopenharmony_cinamespace aco {
34bf215546Sopenharmony_cinamespace {
35bf215546Sopenharmony_ci
/* Number of register slots tracked per block (size of each per-block writer table). */
constexpr size_t max_reg_cnt = 512;
/* Slots [0, max_sgpr_cnt) are merged from the linear predecessors of a block. */
constexpr size_t max_sgpr_cnt = 128;
/* Index of the first VGPR slot; VGPR definitions are asserted to start here or above. */
constexpr size_t min_vgpr = 256;
/* Slots [min_vgpr, min_vgpr + max_vgpr_cnt) are merged from the logical predecessors. */
constexpr size_t max_vgpr_cnt = 256;
40bf215546Sopenharmony_ci
/* Position of an instruction inside the program: (block index, instruction index
 * within that block). A block index of UINT32_MAX marks a sentinel value that
 * does not refer to any instruction; the instr field then selects which sentinel.
 */
struct Idx {
   constexpr bool operator==(const Idx& other) const
   {
      return block == other.block && instr == other.instr;
   }
   constexpr bool operator!=(const Idx& other) const { return !operator==(other); }

   /* True when this refers to a real instruction rather than to a sentinel. */
   constexpr bool found() const { return block != UINT32_MAX; }

   uint32_t block;
   uint32_t instr;
};

/* Sentinels. They are only ever compared against and copied, so they are
 * immutable compile-time constants rather than mutable globals.
 */
/* The register has not been written since the tracking was last reset. */
constexpr Idx not_written_in_block{UINT32_MAX, 0};
/* The register was written in a way that is not tracked (eg. a subdword write). */
constexpr Idx clobbered{UINT32_MAX, 1};
/* The operand is a constant or undefined, so no instruction wrote it. */
constexpr Idx const_or_undef{UINT32_MAX, 2};
/* The registers in question were not all written by one and the same instruction. */
constexpr Idx written_by_multiple_instrs{UINT32_MAX, 3};
55bf215546Sopenharmony_ci
56bf215546Sopenharmony_cistruct pr_opt_ctx {
57bf215546Sopenharmony_ci   Program* program;
58bf215546Sopenharmony_ci   Block* current_block;
59bf215546Sopenharmony_ci   uint32_t current_instr_idx;
60bf215546Sopenharmony_ci   std::vector<uint16_t> uses;
61bf215546Sopenharmony_ci   std::vector<std::array<Idx, max_reg_cnt>> instr_idx_by_regs;
62bf215546Sopenharmony_ci
63bf215546Sopenharmony_ci   void reset_block(Block* block)
64bf215546Sopenharmony_ci   {
65bf215546Sopenharmony_ci      current_block = block;
66bf215546Sopenharmony_ci      current_instr_idx = 0;
67bf215546Sopenharmony_ci
68bf215546Sopenharmony_ci      if ((block->kind & block_kind_loop_header) || block->linear_preds.empty()) {
69bf215546Sopenharmony_ci         std::fill(instr_idx_by_regs[block->index].begin(), instr_idx_by_regs[block->index].end(),
70bf215546Sopenharmony_ci                   not_written_in_block);
71bf215546Sopenharmony_ci      } else {
72bf215546Sopenharmony_ci         const uint32_t first_linear_pred = block->linear_preds[0];
73bf215546Sopenharmony_ci         const std::vector<uint32_t>& linear_preds = block->linear_preds;
74bf215546Sopenharmony_ci
75bf215546Sopenharmony_ci         for (unsigned i = 0; i < max_sgpr_cnt; i++) {
76bf215546Sopenharmony_ci            const bool all_same = std::all_of(
77bf215546Sopenharmony_ci               std::next(linear_preds.begin()), linear_preds.end(),
78bf215546Sopenharmony_ci               [=](unsigned pred)
79bf215546Sopenharmony_ci               { return instr_idx_by_regs[pred][i] == instr_idx_by_regs[first_linear_pred][i]; });
80bf215546Sopenharmony_ci
81bf215546Sopenharmony_ci            if (all_same)
82bf215546Sopenharmony_ci               instr_idx_by_regs[block->index][i] = instr_idx_by_regs[first_linear_pred][i];
83bf215546Sopenharmony_ci            else
84bf215546Sopenharmony_ci               instr_idx_by_regs[block->index][i] = written_by_multiple_instrs;
85bf215546Sopenharmony_ci         }
86bf215546Sopenharmony_ci
87bf215546Sopenharmony_ci         if (!block->logical_preds.empty()) {
88bf215546Sopenharmony_ci            /* We assume that VGPRs are only read by blocks which have a logical predecessor,
89bf215546Sopenharmony_ci             * ie. any block that reads any VGPR has at least 1 logical predecessor.
90bf215546Sopenharmony_ci             */
91bf215546Sopenharmony_ci            const unsigned first_logical_pred = block->logical_preds[0];
92bf215546Sopenharmony_ci            const std::vector<uint32_t>& logical_preds = block->logical_preds;
93bf215546Sopenharmony_ci
94bf215546Sopenharmony_ci            for (unsigned i = min_vgpr; i < (min_vgpr + max_vgpr_cnt); i++) {
95bf215546Sopenharmony_ci               const bool all_same = std::all_of(
96bf215546Sopenharmony_ci                  std::next(logical_preds.begin()), logical_preds.end(),
97bf215546Sopenharmony_ci                  [=](unsigned pred) {
98bf215546Sopenharmony_ci                     return instr_idx_by_regs[pred][i] == instr_idx_by_regs[first_logical_pred][i];
99bf215546Sopenharmony_ci                  });
100bf215546Sopenharmony_ci
101bf215546Sopenharmony_ci               if (all_same)
102bf215546Sopenharmony_ci                  instr_idx_by_regs[block->index][i] = instr_idx_by_regs[first_logical_pred][i];
103bf215546Sopenharmony_ci               else
104bf215546Sopenharmony_ci                  instr_idx_by_regs[block->index][i] = written_by_multiple_instrs;
105bf215546Sopenharmony_ci            }
106bf215546Sopenharmony_ci         } else {
107bf215546Sopenharmony_ci            /* If a block has no logical predecessors, it is not part of the
108bf215546Sopenharmony_ci             * logical CFG and therefore it also won't have any logical successors.
109bf215546Sopenharmony_ci             * Such a block does not write any VGPRs ever.
110bf215546Sopenharmony_ci             */
111bf215546Sopenharmony_ci            assert(block->logical_succs.empty());
112bf215546Sopenharmony_ci         }
113bf215546Sopenharmony_ci      }
114bf215546Sopenharmony_ci   }
115bf215546Sopenharmony_ci
116bf215546Sopenharmony_ci   Instruction* get(Idx idx) { return program->blocks[idx.block].instructions[idx.instr].get(); }
117bf215546Sopenharmony_ci};
118bf215546Sopenharmony_ci
119bf215546Sopenharmony_civoid
120bf215546Sopenharmony_cisave_reg_writes(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
121bf215546Sopenharmony_ci{
122bf215546Sopenharmony_ci   for (const Definition& def : instr->definitions) {
123bf215546Sopenharmony_ci      assert(def.regClass().type() != RegType::sgpr || def.physReg().reg() <= 255);
124bf215546Sopenharmony_ci      assert(def.regClass().type() != RegType::vgpr || def.physReg().reg() >= 256);
125bf215546Sopenharmony_ci
126bf215546Sopenharmony_ci      unsigned dw_size = DIV_ROUND_UP(def.bytes(), 4u);
127bf215546Sopenharmony_ci      unsigned r = def.physReg().reg();
128bf215546Sopenharmony_ci      Idx idx{ctx.current_block->index, ctx.current_instr_idx};
129bf215546Sopenharmony_ci
130bf215546Sopenharmony_ci      if (def.regClass().is_subdword())
131bf215546Sopenharmony_ci         idx = clobbered;
132bf215546Sopenharmony_ci
133bf215546Sopenharmony_ci      assert((r + dw_size) <= max_reg_cnt);
134bf215546Sopenharmony_ci      assert(def.size() == dw_size || def.regClass().is_subdword());
135bf215546Sopenharmony_ci      std::fill(ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r,
136bf215546Sopenharmony_ci                ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r + dw_size, idx);
137bf215546Sopenharmony_ci   }
138bf215546Sopenharmony_ci}
139bf215546Sopenharmony_ci
140bf215546Sopenharmony_ciIdx
141bf215546Sopenharmony_cilast_writer_idx(pr_opt_ctx& ctx, PhysReg physReg, RegClass rc)
142bf215546Sopenharmony_ci{
143bf215546Sopenharmony_ci   /* Verify that all of the operand's registers are written by the same instruction. */
144bf215546Sopenharmony_ci   assert(physReg.reg() < max_reg_cnt);
145bf215546Sopenharmony_ci   Idx instr_idx = ctx.instr_idx_by_regs[ctx.current_block->index][physReg.reg()];
146bf215546Sopenharmony_ci   unsigned dw_size = DIV_ROUND_UP(rc.bytes(), 4u);
147bf215546Sopenharmony_ci   unsigned r = physReg.reg();
148bf215546Sopenharmony_ci   bool all_same =
149bf215546Sopenharmony_ci      std::all_of(ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r,
150bf215546Sopenharmony_ci                  ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r + dw_size,
151bf215546Sopenharmony_ci                  [instr_idx](Idx i) { return i == instr_idx; });
152bf215546Sopenharmony_ci
153bf215546Sopenharmony_ci   return all_same ? instr_idx : written_by_multiple_instrs;
154bf215546Sopenharmony_ci}
155bf215546Sopenharmony_ci
156bf215546Sopenharmony_ciIdx
157bf215546Sopenharmony_cilast_writer_idx(pr_opt_ctx& ctx, const Operand& op)
158bf215546Sopenharmony_ci{
159bf215546Sopenharmony_ci   if (op.isConstant() || op.isUndefined())
160bf215546Sopenharmony_ci      return const_or_undef;
161bf215546Sopenharmony_ci
162bf215546Sopenharmony_ci   return last_writer_idx(ctx, op.physReg(), op.regClass());
163bf215546Sopenharmony_ci}
164bf215546Sopenharmony_ci
165bf215546Sopenharmony_cibool
166bf215546Sopenharmony_ciis_clobbered_since(pr_opt_ctx& ctx, PhysReg reg, RegClass rc, const Idx& idx)
167bf215546Sopenharmony_ci{
168bf215546Sopenharmony_ci   /* If we didn't find an instruction, assume that the register is clobbered. */
169bf215546Sopenharmony_ci   if (!idx.found())
170bf215546Sopenharmony_ci      return true;
171bf215546Sopenharmony_ci
172bf215546Sopenharmony_ci   /* TODO: We currently can't keep track of subdword registers. */
173bf215546Sopenharmony_ci   if (rc.is_subdword())
174bf215546Sopenharmony_ci      return true;
175bf215546Sopenharmony_ci
176bf215546Sopenharmony_ci   unsigned begin_reg = reg.reg();
177bf215546Sopenharmony_ci   unsigned end_reg = begin_reg + rc.size();
178bf215546Sopenharmony_ci   unsigned current_block_idx = ctx.current_block->index;
179bf215546Sopenharmony_ci
180bf215546Sopenharmony_ci   for (unsigned r = begin_reg; r < end_reg; ++r) {
181bf215546Sopenharmony_ci      Idx& i = ctx.instr_idx_by_regs[current_block_idx][r];
182bf215546Sopenharmony_ci      if (i == clobbered || i == written_by_multiple_instrs)
183bf215546Sopenharmony_ci         return true;
184bf215546Sopenharmony_ci      else if (i == not_written_in_block)
185bf215546Sopenharmony_ci         continue;
186bf215546Sopenharmony_ci
187bf215546Sopenharmony_ci      assert(i.found());
188bf215546Sopenharmony_ci
189bf215546Sopenharmony_ci      if (i.block > idx.block || (i.block == idx.block && i.instr > idx.instr))
190bf215546Sopenharmony_ci         return true;
191bf215546Sopenharmony_ci   }
192bf215546Sopenharmony_ci
193bf215546Sopenharmony_ci   return false;
194bf215546Sopenharmony_ci}
195bf215546Sopenharmony_ci
196bf215546Sopenharmony_citemplate <typename T>
197bf215546Sopenharmony_cibool
198bf215546Sopenharmony_ciis_clobbered_since(pr_opt_ctx& ctx, const T& t, const Idx& idx)
199bf215546Sopenharmony_ci{
200bf215546Sopenharmony_ci   return is_clobbered_since(ctx, t.physReg(), t.regClass(), idx);
201bf215546Sopenharmony_ci}
202bf215546Sopenharmony_ci
203bf215546Sopenharmony_civoid
204bf215546Sopenharmony_citry_apply_branch_vcc(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
205bf215546Sopenharmony_ci{
206bf215546Sopenharmony_ci   /* We are looking for the following pattern:
207bf215546Sopenharmony_ci    *
208bf215546Sopenharmony_ci    * vcc = ...                      ; last_vcc_wr
209bf215546Sopenharmony_ci    * sX, scc = s_and_bXX vcc, exec  ; op0_instr
210bf215546Sopenharmony_ci    * (...vcc and exec must not be clobbered inbetween...)
211bf215546Sopenharmony_ci    * s_cbranch_XX scc               ; instr
212bf215546Sopenharmony_ci    *
213bf215546Sopenharmony_ci    * If possible, the above is optimized into:
214bf215546Sopenharmony_ci    *
215bf215546Sopenharmony_ci    * vcc = ...                      ; last_vcc_wr
216bf215546Sopenharmony_ci    * s_cbranch_XX vcc               ; instr modified to use vcc
217bf215546Sopenharmony_ci    */
218bf215546Sopenharmony_ci
219bf215546Sopenharmony_ci   /* Don't try to optimize this on GFX6-7 because SMEM may corrupt the vccz bit. */
220bf215546Sopenharmony_ci   if (ctx.program->gfx_level < GFX8)
221bf215546Sopenharmony_ci      return;
222bf215546Sopenharmony_ci
223bf215546Sopenharmony_ci   if (instr->format != Format::PSEUDO_BRANCH || instr->operands.size() == 0 ||
224bf215546Sopenharmony_ci       instr->operands[0].physReg() != scc)
225bf215546Sopenharmony_ci      return;
226bf215546Sopenharmony_ci
227bf215546Sopenharmony_ci   Idx op0_instr_idx = last_writer_idx(ctx, instr->operands[0]);
228bf215546Sopenharmony_ci   Idx last_vcc_wr_idx = last_writer_idx(ctx, vcc, ctx.program->lane_mask);
229bf215546Sopenharmony_ci
230bf215546Sopenharmony_ci   /* We need to make sure:
231bf215546Sopenharmony_ci    * - the instructions that wrote the operand register and VCC are both found
232bf215546Sopenharmony_ci    * - the operand register used by the branch, and VCC were both written in the current block
233bf215546Sopenharmony_ci    * - EXEC hasn't been clobbered since the last VCC write
234bf215546Sopenharmony_ci    * - VCC hasn't been clobbered since the operand register was written
235bf215546Sopenharmony_ci    *   (ie. the last VCC writer precedes the op0 writer)
236bf215546Sopenharmony_ci    */
237bf215546Sopenharmony_ci   if (!op0_instr_idx.found() || !last_vcc_wr_idx.found() ||
238bf215546Sopenharmony_ci       op0_instr_idx.block != ctx.current_block->index ||
239bf215546Sopenharmony_ci       last_vcc_wr_idx.block != ctx.current_block->index ||
240bf215546Sopenharmony_ci       is_clobbered_since(ctx, exec, ctx.program->lane_mask, last_vcc_wr_idx) ||
241bf215546Sopenharmony_ci       is_clobbered_since(ctx, vcc, ctx.program->lane_mask, op0_instr_idx))
242bf215546Sopenharmony_ci      return;
243bf215546Sopenharmony_ci
244bf215546Sopenharmony_ci   Instruction* op0_instr = ctx.get(op0_instr_idx);
245bf215546Sopenharmony_ci   Instruction* last_vcc_wr = ctx.get(last_vcc_wr_idx);
246bf215546Sopenharmony_ci
247bf215546Sopenharmony_ci   if ((op0_instr->opcode != aco_opcode::s_and_b64 /* wave64 */ &&
248bf215546Sopenharmony_ci        op0_instr->opcode != aco_opcode::s_and_b32 /* wave32 */) ||
249bf215546Sopenharmony_ci       op0_instr->operands[0].physReg() != vcc || op0_instr->operands[1].physReg() != exec ||
250bf215546Sopenharmony_ci       !last_vcc_wr->isVOPC())
251bf215546Sopenharmony_ci      return;
252bf215546Sopenharmony_ci
253bf215546Sopenharmony_ci   assert(last_vcc_wr->definitions[0].tempId() == op0_instr->operands[0].tempId());
254bf215546Sopenharmony_ci
255bf215546Sopenharmony_ci   /* Reduce the uses of the SCC def */
256bf215546Sopenharmony_ci   ctx.uses[instr->operands[0].tempId()]--;
257bf215546Sopenharmony_ci   /* Use VCC instead of SCC in the branch */
258bf215546Sopenharmony_ci   instr->operands[0] = op0_instr->operands[0];
259bf215546Sopenharmony_ci}
260bf215546Sopenharmony_ci
261bf215546Sopenharmony_civoid
262bf215546Sopenharmony_citry_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
263bf215546Sopenharmony_ci{
264bf215546Sopenharmony_ci   /* We are looking for the following pattern:
265bf215546Sopenharmony_ci    *
266bf215546Sopenharmony_ci    * s_bfe_u32 s0, s3, 0x40018  ; outputs SGPR and SCC if the SGPR != 0
267bf215546Sopenharmony_ci    * s_cmp_eq_i32 s0, 0         ; comparison between the SGPR and 0
268bf215546Sopenharmony_ci    * s_cbranch_scc0 BB3         ; use the result of the comparison, eg. branch or cselect
269bf215546Sopenharmony_ci    *
270bf215546Sopenharmony_ci    * If possible, the above is optimized into:
271bf215546Sopenharmony_ci    *
272bf215546Sopenharmony_ci    * s_bfe_u32 s0, s3, 0x40018  ; original instruction
273bf215546Sopenharmony_ci    * s_cbranch_scc1 BB3         ; modified to use SCC directly rather than the SGPR with comparison
274bf215546Sopenharmony_ci    *
275bf215546Sopenharmony_ci    */
276bf215546Sopenharmony_ci
277bf215546Sopenharmony_ci   if (!instr->isSALU() && !instr->isBranch())
278bf215546Sopenharmony_ci      return;
279bf215546Sopenharmony_ci
280bf215546Sopenharmony_ci   if (instr->isSOPC() &&
281bf215546Sopenharmony_ci       (instr->opcode == aco_opcode::s_cmp_eq_u32 || instr->opcode == aco_opcode::s_cmp_eq_i32 ||
282bf215546Sopenharmony_ci        instr->opcode == aco_opcode::s_cmp_lg_u32 || instr->opcode == aco_opcode::s_cmp_lg_i32 ||
283bf215546Sopenharmony_ci        instr->opcode == aco_opcode::s_cmp_eq_u64 || instr->opcode == aco_opcode::s_cmp_lg_u64) &&
284bf215546Sopenharmony_ci       (instr->operands[0].constantEquals(0) || instr->operands[1].constantEquals(0)) &&
285bf215546Sopenharmony_ci       (instr->operands[0].isTemp() || instr->operands[1].isTemp())) {
286bf215546Sopenharmony_ci      /* Make sure the constant is always in operand 1 */
287bf215546Sopenharmony_ci      if (instr->operands[0].isConstant())
288bf215546Sopenharmony_ci         std::swap(instr->operands[0], instr->operands[1]);
289bf215546Sopenharmony_ci
290bf215546Sopenharmony_ci      if (ctx.uses[instr->operands[0].tempId()] > 1)
291bf215546Sopenharmony_ci         return;
292bf215546Sopenharmony_ci
293bf215546Sopenharmony_ci      /* Make sure both SCC and Operand 0 are written by the same instruction. */
294bf215546Sopenharmony_ci      Idx wr_idx = last_writer_idx(ctx, instr->operands[0]);
295bf215546Sopenharmony_ci      Idx sccwr_idx = last_writer_idx(ctx, scc, s1);
296bf215546Sopenharmony_ci      if (!wr_idx.found() || wr_idx != sccwr_idx)
297bf215546Sopenharmony_ci         return;
298bf215546Sopenharmony_ci
299bf215546Sopenharmony_ci      Instruction* wr_instr = ctx.get(wr_idx);
300bf215546Sopenharmony_ci      if (!wr_instr->isSALU() || wr_instr->definitions.size() < 2 ||
301bf215546Sopenharmony_ci          wr_instr->definitions[1].physReg() != scc)
302bf215546Sopenharmony_ci         return;
303bf215546Sopenharmony_ci
304bf215546Sopenharmony_ci      /* Look for instructions which set SCC := (D != 0) */
305bf215546Sopenharmony_ci      switch (wr_instr->opcode) {
306bf215546Sopenharmony_ci      case aco_opcode::s_bfe_i32:
307bf215546Sopenharmony_ci      case aco_opcode::s_bfe_i64:
308bf215546Sopenharmony_ci      case aco_opcode::s_bfe_u32:
309bf215546Sopenharmony_ci      case aco_opcode::s_bfe_u64:
310bf215546Sopenharmony_ci      case aco_opcode::s_and_b32:
311bf215546Sopenharmony_ci      case aco_opcode::s_and_b64:
312bf215546Sopenharmony_ci      case aco_opcode::s_andn2_b32:
313bf215546Sopenharmony_ci      case aco_opcode::s_andn2_b64:
314bf215546Sopenharmony_ci      case aco_opcode::s_or_b32:
315bf215546Sopenharmony_ci      case aco_opcode::s_or_b64:
316bf215546Sopenharmony_ci      case aco_opcode::s_orn2_b32:
317bf215546Sopenharmony_ci      case aco_opcode::s_orn2_b64:
318bf215546Sopenharmony_ci      case aco_opcode::s_xor_b32:
319bf215546Sopenharmony_ci      case aco_opcode::s_xor_b64:
320bf215546Sopenharmony_ci      case aco_opcode::s_not_b32:
321bf215546Sopenharmony_ci      case aco_opcode::s_not_b64:
322bf215546Sopenharmony_ci      case aco_opcode::s_nor_b32:
323bf215546Sopenharmony_ci      case aco_opcode::s_nor_b64:
324bf215546Sopenharmony_ci      case aco_opcode::s_xnor_b32:
325bf215546Sopenharmony_ci      case aco_opcode::s_xnor_b64:
326bf215546Sopenharmony_ci      case aco_opcode::s_nand_b32:
327bf215546Sopenharmony_ci      case aco_opcode::s_nand_b64:
328bf215546Sopenharmony_ci      case aco_opcode::s_lshl_b32:
329bf215546Sopenharmony_ci      case aco_opcode::s_lshl_b64:
330bf215546Sopenharmony_ci      case aco_opcode::s_lshr_b32:
331bf215546Sopenharmony_ci      case aco_opcode::s_lshr_b64:
332bf215546Sopenharmony_ci      case aco_opcode::s_ashr_i32:
333bf215546Sopenharmony_ci      case aco_opcode::s_ashr_i64:
334bf215546Sopenharmony_ci      case aco_opcode::s_abs_i32:
335bf215546Sopenharmony_ci      case aco_opcode::s_absdiff_i32: break;
336bf215546Sopenharmony_ci      default: return;
337bf215546Sopenharmony_ci      }
338bf215546Sopenharmony_ci
339bf215546Sopenharmony_ci      /* Use the SCC def from wr_instr */
340bf215546Sopenharmony_ci      ctx.uses[instr->operands[0].tempId()]--;
341bf215546Sopenharmony_ci      instr->operands[0] = Operand(wr_instr->definitions[1].getTemp(), scc);
342bf215546Sopenharmony_ci      ctx.uses[instr->operands[0].tempId()]++;
343bf215546Sopenharmony_ci
344bf215546Sopenharmony_ci      /* Set the opcode and operand to 32-bit */
345bf215546Sopenharmony_ci      instr->operands[1] = Operand::zero();
346bf215546Sopenharmony_ci      instr->opcode =
347bf215546Sopenharmony_ci         (instr->opcode == aco_opcode::s_cmp_eq_u32 || instr->opcode == aco_opcode::s_cmp_eq_i32 ||
348bf215546Sopenharmony_ci          instr->opcode == aco_opcode::s_cmp_eq_u64)
349bf215546Sopenharmony_ci            ? aco_opcode::s_cmp_eq_u32
350bf215546Sopenharmony_ci            : aco_opcode::s_cmp_lg_u32;
351bf215546Sopenharmony_ci   } else if ((instr->format == Format::PSEUDO_BRANCH && instr->operands.size() == 1 &&
352bf215546Sopenharmony_ci               instr->operands[0].physReg() == scc) ||
353bf215546Sopenharmony_ci              instr->opcode == aco_opcode::s_cselect_b32) {
354bf215546Sopenharmony_ci
355bf215546Sopenharmony_ci      /* For cselect, operand 2 is the SCC condition */
356bf215546Sopenharmony_ci      unsigned scc_op_idx = 0;
357bf215546Sopenharmony_ci      if (instr->opcode == aco_opcode::s_cselect_b32) {
358bf215546Sopenharmony_ci         scc_op_idx = 2;
359bf215546Sopenharmony_ci      }
360bf215546Sopenharmony_ci
361bf215546Sopenharmony_ci      Idx wr_idx = last_writer_idx(ctx, instr->operands[scc_op_idx]);
362bf215546Sopenharmony_ci      if (!wr_idx.found())
363bf215546Sopenharmony_ci         return;
364bf215546Sopenharmony_ci
365bf215546Sopenharmony_ci      Instruction* wr_instr = ctx.get(wr_idx);
366bf215546Sopenharmony_ci
367bf215546Sopenharmony_ci      /* Check if we found the pattern above. */
368bf215546Sopenharmony_ci      if (wr_instr->opcode != aco_opcode::s_cmp_eq_u32 &&
369bf215546Sopenharmony_ci          wr_instr->opcode != aco_opcode::s_cmp_lg_u32)
370bf215546Sopenharmony_ci         return;
371bf215546Sopenharmony_ci      if (wr_instr->operands[0].physReg() != scc)
372bf215546Sopenharmony_ci         return;
373bf215546Sopenharmony_ci      if (!wr_instr->operands[1].constantEquals(0))
374bf215546Sopenharmony_ci         return;
375bf215546Sopenharmony_ci
376bf215546Sopenharmony_ci      /* The optimization can be unsafe when there are other users. */
377bf215546Sopenharmony_ci      if (ctx.uses[instr->operands[scc_op_idx].tempId()] > 1)
378bf215546Sopenharmony_ci         return;
379bf215546Sopenharmony_ci
380bf215546Sopenharmony_ci      if (wr_instr->opcode == aco_opcode::s_cmp_eq_u32) {
381bf215546Sopenharmony_ci         /* Flip the meaning of the instruction to correctly use the SCC. */
382bf215546Sopenharmony_ci         if (instr->format == Format::PSEUDO_BRANCH)
383bf215546Sopenharmony_ci            instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz
384bf215546Sopenharmony_ci                                                                     : aco_opcode::p_cbranch_z;
385bf215546Sopenharmony_ci         else if (instr->opcode == aco_opcode::s_cselect_b32)
386bf215546Sopenharmony_ci            std::swap(instr->operands[0], instr->operands[1]);
387bf215546Sopenharmony_ci         else
388bf215546Sopenharmony_ci            unreachable(
389bf215546Sopenharmony_ci               "scc_nocompare optimization is only implemented for p_cbranch and s_cselect");
390bf215546Sopenharmony_ci      }
391bf215546Sopenharmony_ci
392bf215546Sopenharmony_ci      /* Use the SCC def from the original instruction, not the comparison */
393bf215546Sopenharmony_ci      ctx.uses[instr->operands[scc_op_idx].tempId()]--;
394bf215546Sopenharmony_ci      instr->operands[scc_op_idx] = wr_instr->operands[0];
395bf215546Sopenharmony_ci   }
396bf215546Sopenharmony_ci}
397bf215546Sopenharmony_ci
398bf215546Sopenharmony_civoid
399bf215546Sopenharmony_citry_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
400bf215546Sopenharmony_ci{
401bf215546Sopenharmony_ci   /* We are looking for the following pattern:
402bf215546Sopenharmony_ci    *
403bf215546Sopenharmony_ci    * v_mov_dpp vA, vB, ...      ; move instruction with DPP
404bf215546Sopenharmony_ci    * v_xxx vC, vA, ...          ; current instr that uses the result from the move
405bf215546Sopenharmony_ci    *
406bf215546Sopenharmony_ci    * If possible, the above is optimized into:
407bf215546Sopenharmony_ci    *
408bf215546Sopenharmony_ci    * v_xxx_dpp vC, vB, ...      ; current instr modified to use DPP directly
409bf215546Sopenharmony_ci    *
410bf215546Sopenharmony_ci    */
411bf215546Sopenharmony_ci
412bf215546Sopenharmony_ci   if (!instr->isVALU() || instr->isDPP())
413bf215546Sopenharmony_ci      return;
414bf215546Sopenharmony_ci
415bf215546Sopenharmony_ci   for (unsigned i = 0; i < MIN2(2, instr->operands.size()); i++) {
416bf215546Sopenharmony_ci      Idx op_instr_idx = last_writer_idx(ctx, instr->operands[i]);
417bf215546Sopenharmony_ci      if (!op_instr_idx.found())
418bf215546Sopenharmony_ci         continue;
419bf215546Sopenharmony_ci
420bf215546Sopenharmony_ci      const Instruction* mov = ctx.get(op_instr_idx);
421bf215546Sopenharmony_ci      if (mov->opcode != aco_opcode::v_mov_b32 || !mov->isDPP())
422bf215546Sopenharmony_ci         continue;
423bf215546Sopenharmony_ci      bool dpp8 = mov->isDPP8();
424bf215546Sopenharmony_ci      if (!can_use_DPP(instr, false, dpp8))
425bf215546Sopenharmony_ci         return;
426bf215546Sopenharmony_ci
427bf215546Sopenharmony_ci      /* If we aren't going to remove the v_mov_b32, we have to ensure that it doesn't overwrite
428bf215546Sopenharmony_ci       * it's own operand before we use it.
429bf215546Sopenharmony_ci       */
430bf215546Sopenharmony_ci      if (mov->definitions[0].physReg() == mov->operands[0].physReg() &&
431bf215546Sopenharmony_ci          (!mov->definitions[0].tempId() || ctx.uses[mov->definitions[0].tempId()] > 1))
432bf215546Sopenharmony_ci         continue;
433bf215546Sopenharmony_ci
434bf215546Sopenharmony_ci      /* Don't propagate DPP if the source register is overwritten since the move. */
435bf215546Sopenharmony_ci      if (is_clobbered_since(ctx, mov->operands[0], op_instr_idx))
436bf215546Sopenharmony_ci         continue;
437bf215546Sopenharmony_ci
438bf215546Sopenharmony_ci      if (i && !can_swap_operands(instr, &instr->opcode))
439bf215546Sopenharmony_ci         continue;
440bf215546Sopenharmony_ci
441bf215546Sopenharmony_ci      if (!dpp8) /* anything else doesn't make sense in SSA */
442bf215546Sopenharmony_ci         assert(mov->dpp16().row_mask == 0xf && mov->dpp16().bank_mask == 0xf);
443bf215546Sopenharmony_ci
444bf215546Sopenharmony_ci      if (--ctx.uses[mov->definitions[0].tempId()])
445bf215546Sopenharmony_ci         ctx.uses[mov->operands[0].tempId()]++;
446bf215546Sopenharmony_ci
447bf215546Sopenharmony_ci      convert_to_DPP(instr, dpp8);
448bf215546Sopenharmony_ci
449bf215546Sopenharmony_ci      if (dpp8) {
450bf215546Sopenharmony_ci         DPP8_instruction* dpp = &instr->dpp8();
451bf215546Sopenharmony_ci         if (i) {
452bf215546Sopenharmony_ci            std::swap(dpp->operands[0], dpp->operands[1]);
453bf215546Sopenharmony_ci         }
454bf215546Sopenharmony_ci         dpp->operands[0] = mov->operands[0];
455bf215546Sopenharmony_ci         memcpy(dpp->lane_sel, mov->dpp8().lane_sel, sizeof(dpp->lane_sel));
456bf215546Sopenharmony_ci      } else {
457bf215546Sopenharmony_ci         DPP16_instruction* dpp = &instr->dpp16();
458bf215546Sopenharmony_ci         if (i) {
459bf215546Sopenharmony_ci            std::swap(dpp->operands[0], dpp->operands[1]);
460bf215546Sopenharmony_ci            std::swap(dpp->neg[0], dpp->neg[1]);
461bf215546Sopenharmony_ci            std::swap(dpp->abs[0], dpp->abs[1]);
462bf215546Sopenharmony_ci         }
463bf215546Sopenharmony_ci         dpp->operands[0] = mov->operands[0];
464bf215546Sopenharmony_ci         dpp->dpp_ctrl = mov->dpp16().dpp_ctrl;
465bf215546Sopenharmony_ci         dpp->bound_ctrl = true;
466bf215546Sopenharmony_ci         dpp->neg[0] ^= mov->dpp16().neg[0] && !dpp->abs[0];
467bf215546Sopenharmony_ci         dpp->abs[0] |= mov->dpp16().abs[0];
468bf215546Sopenharmony_ci      }
469bf215546Sopenharmony_ci      return;
470bf215546Sopenharmony_ci   }
471bf215546Sopenharmony_ci}
472bf215546Sopenharmony_ci
473bf215546Sopenharmony_civoid
474bf215546Sopenharmony_ciprocess_instruction(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
475bf215546Sopenharmony_ci{
476bf215546Sopenharmony_ci   try_apply_branch_vcc(ctx, instr);
477bf215546Sopenharmony_ci
478bf215546Sopenharmony_ci   try_optimize_scc_nocompare(ctx, instr);
479bf215546Sopenharmony_ci
480bf215546Sopenharmony_ci   try_combine_dpp(ctx, instr);
481bf215546Sopenharmony_ci
482bf215546Sopenharmony_ci   if (instr)
483bf215546Sopenharmony_ci      save_reg_writes(ctx, instr);
484bf215546Sopenharmony_ci
485bf215546Sopenharmony_ci   ctx.current_instr_idx++;
486bf215546Sopenharmony_ci}
487bf215546Sopenharmony_ci
488bf215546Sopenharmony_ci} // namespace
489bf215546Sopenharmony_ci
490bf215546Sopenharmony_civoid
491bf215546Sopenharmony_cioptimize_postRA(Program* program)
492bf215546Sopenharmony_ci{
493bf215546Sopenharmony_ci   pr_opt_ctx ctx;
494bf215546Sopenharmony_ci   ctx.program = program;
495bf215546Sopenharmony_ci   ctx.uses = dead_code_analysis(program);
496bf215546Sopenharmony_ci   ctx.instr_idx_by_regs.resize(program->blocks.size());
497bf215546Sopenharmony_ci
498bf215546Sopenharmony_ci   /* Forward pass
499bf215546Sopenharmony_ci    * Goes through each instruction exactly once, and can transform
500bf215546Sopenharmony_ci    * instructions or adjust the use counts of temps.
501bf215546Sopenharmony_ci    */
502bf215546Sopenharmony_ci   for (auto& block : program->blocks) {
503bf215546Sopenharmony_ci      ctx.reset_block(&block);
504bf215546Sopenharmony_ci
505bf215546Sopenharmony_ci      for (aco_ptr<Instruction>& instr : block.instructions)
506bf215546Sopenharmony_ci         process_instruction(ctx, instr);
507bf215546Sopenharmony_ci   }
508bf215546Sopenharmony_ci
509bf215546Sopenharmony_ci   /* Cleanup pass
510bf215546Sopenharmony_ci    * Gets rid of instructions which are manually deleted or
511bf215546Sopenharmony_ci    * no longer have any uses.
512bf215546Sopenharmony_ci    */
513bf215546Sopenharmony_ci   for (auto& block : program->blocks) {
514bf215546Sopenharmony_ci      auto new_end = std::remove_if(block.instructions.begin(), block.instructions.end(),
515bf215546Sopenharmony_ci                                    [&ctx](const aco_ptr<Instruction>& instr)
516bf215546Sopenharmony_ci                                    { return !instr || is_dead(ctx.uses, instr.get()); });
517bf215546Sopenharmony_ci      block.instructions.resize(new_end - block.instructions.begin());
518bf215546Sopenharmony_ci   }
519bf215546Sopenharmony_ci}
520bf215546Sopenharmony_ci
521bf215546Sopenharmony_ci} // namespace aco
522