1/*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25#include "aco_ir.h"
26
27#include <algorithm>
28#include <map>
29#include <vector>
30
31namespace aco {
32namespace {
33
34struct phi_info_item {
35   Definition def;
36   Operand op;
37};
38
39struct ssa_elimination_ctx {
40   /* The outer vectors should be indexed by block index. The inner vectors store phi information
41    * for each block. */
42   std::vector<std::vector<phi_info_item>> logical_phi_info;
43   std::vector<std::vector<phi_info_item>> linear_phi_info;
44   std::vector<bool> empty_blocks;
45   std::vector<bool> blocks_incoming_exec_used;
46   Program* program;
47
48   ssa_elimination_ctx(Program* program_)
49       : logical_phi_info(program_->blocks.size()), linear_phi_info(program_->blocks.size()),
50         empty_blocks(program_->blocks.size(), true),
51         blocks_incoming_exec_used(program_->blocks.size(), true), program(program_)
52   {}
53};
54
55void
56collect_phi_info(ssa_elimination_ctx& ctx)
57{
58   for (Block& block : ctx.program->blocks) {
59      for (aco_ptr<Instruction>& phi : block.instructions) {
60         if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
61            break;
62
63         for (unsigned i = 0; i < phi->operands.size(); i++) {
64            if (phi->operands[i].isUndefined())
65               continue;
66            if (phi->operands[i].physReg() == phi->definitions[0].physReg())
67               continue;
68
69            assert(phi->definitions[0].size() == phi->operands[i].size());
70
71            std::vector<unsigned>& preds =
72               phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
73            uint32_t pred_idx = preds[i];
74            auto& info_vec = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info[pred_idx]
75                                                              : ctx.linear_phi_info[pred_idx];
76            info_vec.push_back({phi->definitions[0], phi->operands[i]});
77            ctx.empty_blocks[pred_idx] = false;
78         }
79      }
80   }
81}
82
83void
84insert_parallelcopies(ssa_elimination_ctx& ctx)
85{
86   /* insert the parallelcopies from logical phis before p_logical_end */
87   for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) {
88      auto& logical_phi_info = ctx.logical_phi_info[block_idx];
89      if (logical_phi_info.empty())
90         continue;
91
92      Block& block = ctx.program->blocks[block_idx];
93      unsigned idx = block.instructions.size() - 1;
94      while (block.instructions[idx]->opcode != aco_opcode::p_logical_end) {
95         assert(idx > 0);
96         idx--;
97      }
98
99      std::vector<aco_ptr<Instruction>>::iterator it = std::next(block.instructions.begin(), idx);
100      aco_ptr<Pseudo_instruction> pc{
101         create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO,
102                                                logical_phi_info.size(), logical_phi_info.size())};
103      unsigned i = 0;
104      for (auto& phi_info : logical_phi_info) {
105         pc->definitions[i] = phi_info.def;
106         pc->operands[i] = phi_info.op;
107         i++;
108      }
109      /* this shouldn't be needed since we're only copying vgprs */
110      pc->tmp_in_scc = false;
111      block.instructions.insert(it, std::move(pc));
112   }
113
114   /* insert parallelcopies for the linear phis at the end of blocks just before the branch */
115   for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) {
116      auto& linear_phi_info = ctx.linear_phi_info[block_idx];
117      if (linear_phi_info.empty())
118         continue;
119
120      Block& block = ctx.program->blocks[block_idx];
121      std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.end();
122      --it;
123      assert((*it)->isBranch());
124      PhysReg scratch_sgpr = (*it)->definitions[0].physReg();
125      aco_ptr<Pseudo_instruction> pc{
126         create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO,
127                                                linear_phi_info.size(), linear_phi_info.size())};
128      unsigned i = 0;
129      for (auto& phi_info : linear_phi_info) {
130         pc->definitions[i] = phi_info.def;
131         pc->operands[i] = phi_info.op;
132         i++;
133      }
134      pc->tmp_in_scc = block.scc_live_out;
135      pc->scratch_sgpr = scratch_sgpr;
136      block.instructions.insert(it, std::move(pc));
137   }
138}
139
140bool
141is_empty_block(Block* block, bool ignore_exec_writes)
142{
143   /* check if this block is empty and the exec mask is not needed */
144   for (aco_ptr<Instruction>& instr : block->instructions) {
145      switch (instr->opcode) {
146      case aco_opcode::p_linear_phi:
147      case aco_opcode::p_phi:
148      case aco_opcode::p_logical_start:
149      case aco_opcode::p_logical_end:
150      case aco_opcode::p_branch: break;
151      case aco_opcode::p_parallelcopy:
152         for (unsigned i = 0; i < instr->definitions.size(); i++) {
153            if (ignore_exec_writes && instr->definitions[i].physReg() == exec)
154               continue;
155            if (instr->definitions[i].physReg() != instr->operands[i].physReg())
156               return false;
157         }
158         break;
159      case aco_opcode::s_andn2_b64:
160      case aco_opcode::s_andn2_b32:
161         if (ignore_exec_writes && instr->definitions[0].physReg() == exec)
162            break;
163         return false;
164      default: return false;
165      }
166   }
167   return true;
168}
169
170void
171try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block)
172{
173   /* check if the successor is another merge block which restores exec */
174   // TODO: divergent loops also restore exec
175   if (block->linear_succs.size() != 1 ||
176       !(ctx.program->blocks[block->linear_succs[0]].kind & block_kind_merge))
177      return;
178
179   /* check if this block is empty */
180   if (!is_empty_block(block, true))
181      return;
182
183   /* keep the branch instruction and remove the rest */
184   aco_ptr<Instruction> branch = std::move(block->instructions.back());
185   block->instructions.clear();
186   block->instructions.emplace_back(std::move(branch));
187}
188
189void
190try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block)
191{
192   assert(block->linear_succs.size() == 2);
193   /* only remove this block if the successor got removed as well */
194   if (block->linear_succs[0] != block->linear_succs[1])
195      return;
196
197   /* check if block is otherwise empty */
198   if (!is_empty_block(block, true))
199      return;
200
201   unsigned succ_idx = block->linear_succs[0];
202   assert(block->linear_preds.size() == 2);
203   for (unsigned i = 0; i < 2; i++) {
204      Block* pred = &ctx.program->blocks[block->linear_preds[i]];
205      pred->linear_succs[0] = succ_idx;
206      ctx.program->blocks[succ_idx].linear_preds[i] = pred->index;
207
208      Pseudo_branch_instruction& branch = pred->instructions.back()->branch();
209      assert(branch.isBranch());
210      branch.target[0] = succ_idx;
211      branch.target[1] = succ_idx;
212   }
213
214   block->instructions.clear();
215   block->linear_preds.clear();
216   block->linear_succs.clear();
217}
218
219void
220try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block)
221{
222   if (!is_empty_block(block, false))
223      return;
224
225   Block& pred = ctx.program->blocks[block->linear_preds[0]];
226   Block& succ = ctx.program->blocks[block->linear_succs[0]];
227   Pseudo_branch_instruction& branch = pred.instructions.back()->branch();
228   if (branch.opcode == aco_opcode::p_branch) {
229      branch.target[0] = succ.index;
230      branch.target[1] = succ.index;
231   } else if (branch.target[0] == block->index) {
232      branch.target[0] = succ.index;
233   } else if (branch.target[0] == succ.index) {
234      assert(branch.target[1] == block->index);
235      branch.target[1] = succ.index;
236      branch.opcode = aco_opcode::p_branch;
237   } else if (branch.target[1] == block->index) {
238      /* check if there is a fall-through path from block to succ */
239      bool falls_through = block->index < succ.index;
240      for (unsigned j = block->index + 1; falls_through && j < succ.index; j++) {
241         assert(ctx.program->blocks[j].index == j);
242         if (!ctx.program->blocks[j].instructions.empty())
243            falls_through = false;
244      }
245      if (falls_through) {
246         branch.target[1] = succ.index;
247      } else {
248         /* check if there is a fall-through path for the alternative target */
249         if (block->index >= branch.target[0])
250            return;
251         for (unsigned j = block->index + 1; j < branch.target[0]; j++) {
252            if (!ctx.program->blocks[j].instructions.empty())
253               return;
254         }
255
256         /* This is a (uniform) break or continue block. The branch condition has to be inverted. */
257         if (branch.opcode == aco_opcode::p_cbranch_z)
258            branch.opcode = aco_opcode::p_cbranch_nz;
259         else if (branch.opcode == aco_opcode::p_cbranch_nz)
260            branch.opcode = aco_opcode::p_cbranch_z;
261         else
262            assert(false);
263         /* also invert the linear successors */
264         pred.linear_succs[0] = pred.linear_succs[1];
265         pred.linear_succs[1] = succ.index;
266         branch.target[1] = branch.target[0];
267         branch.target[0] = succ.index;
268      }
269   } else {
270      assert(false);
271   }
272
273   if (branch.target[0] == branch.target[1])
274      branch.opcode = aco_opcode::p_branch;
275
276   for (unsigned i = 0; i < pred.linear_succs.size(); i++)
277      if (pred.linear_succs[i] == block->index)
278         pred.linear_succs[i] = succ.index;
279
280   for (unsigned i = 0; i < succ.linear_preds.size(); i++)
281      if (succ.linear_preds[i] == block->index)
282         succ.linear_preds[i] = pred.index;
283
284   block->instructions.clear();
285   block->linear_preds.clear();
286   block->linear_succs.clear();
287}
288
289bool
290instr_writes_exec(Instruction* instr)
291{
292   for (Definition& def : instr->definitions)
293      if (def.physReg() == exec || def.physReg() == exec_hi)
294         return true;
295
296   return false;
297}
298
299void
300eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block)
301{
302   /* Check if any successor needs the outgoing exec mask from the current block. */
303
304   bool exec_write_used;
305
306   if (!ctx.logical_phi_info[block.index].empty()) {
307      exec_write_used = true;
308   } else {
309      bool copy_to_exec = false;
310      bool copy_from_exec = false;
311
312      for (const auto& successor_phi_info : ctx.linear_phi_info[block.index]) {
313         copy_to_exec |= successor_phi_info.def.physReg() == exec;
314         copy_from_exec |= successor_phi_info.op.physReg() == exec;
315      }
316
317      if (copy_from_exec)
318         exec_write_used = true;
319      else if (copy_to_exec)
320         exec_write_used = false;
321      else
322         /* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */
323         exec_write_used =
324            std::any_of(block.linear_succs.begin(), block.linear_succs.end(),
325                        [&ctx](int succ_idx) { return ctx.blocks_incoming_exec_used[succ_idx]; });
326   }
327
328   /* Go through all instructions and eliminate useless exec writes. */
329
330   for (int i = block.instructions.size() - 1; i >= 0; --i) {
331      aco_ptr<Instruction>& instr = block.instructions[i];
332
333      /* We already take information from phis into account before the loop, so let's just break on
334       * phis. */
335      if (instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi)
336         break;
337
338      /* See if the current instruction needs or writes exec. */
339      bool needs_exec = needs_exec_mask(instr.get());
340      bool writes_exec = instr_writes_exec(instr.get());
341
342      /* See if we found an unused exec write. */
343      if (writes_exec && !exec_write_used) {
344         instr.reset();
345         continue;
346      }
347
348      /* For a newly encountered exec write, clear the used flag. */
349      if (writes_exec)
350         exec_write_used = false;
351
352      /* If the current instruction needs exec, mark it as used. */
353      exec_write_used |= needs_exec;
354   }
355
356   /* Remember if the current block needs an incoming exec mask from its predecessors. */
357   ctx.blocks_incoming_exec_used[block.index] = exec_write_used;
358
359   /* Cleanup: remove deleted instructions from the vector. */
360   auto new_end = std::remove(block.instructions.begin(), block.instructions.end(), nullptr);
361   block.instructions.resize(new_end - block.instructions.begin());
362}
363
364void
365jump_threading(ssa_elimination_ctx& ctx)
366{
367   for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) {
368      Block* block = &ctx.program->blocks[i];
369      eliminate_useless_exec_writes_in_block(ctx, *block);
370
371      if (!ctx.empty_blocks[i])
372         continue;
373
374      if (block->kind & block_kind_invert) {
375         try_remove_invert_block(ctx, block);
376         continue;
377      }
378
379      if (block->linear_succs.size() > 1)
380         continue;
381
382      if (block->kind & block_kind_merge || block->kind & block_kind_loop_exit)
383         try_remove_merge_block(ctx, block);
384
385      if (block->linear_preds.size() == 1)
386         try_remove_simple_block(ctx, block);
387   }
388}
389
390} /* end namespace */
391
392void
393ssa_elimination(Program* program)
394{
395   ssa_elimination_ctx ctx(program);
396
397   /* Collect information about every phi-instruction */
398   collect_phi_info(ctx);
399
400   /* eliminate empty blocks */
401   jump_threading(ctx);
402
403   /* insert parallelcopies from SSA elimination */
404   insert_parallelcopies(ctx);
405}
406} // namespace aco
407