xref: /third_party/mesa3d/src/amd/compiler/aco_ir.h (revision bf215546)
1/*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25#ifndef ACO_IR_H
26#define ACO_IR_H
27
28#include "aco_opcodes.h"
29#include "aco_util.h"
30#include "aco_interface.h"
31#include "aco_shader_info.h"
32#include "vulkan/radv_shader.h"
33
34#include "nir.h"
35
36#include <bitset>
37#include <memory>
38#include <vector>
39
40struct radv_shader_args;
41
42namespace aco {
43
44extern uint64_t debug_flags;
45
/* Bit values for the global debug_flags bitmask (declared above); each flag
 * enables extra validation/reporting or disables an optimization pass. */
enum {
   DEBUG_VALIDATE_IR = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
   DEBUG_FORCE_WAITCNT = 0x8,
   DEBUG_NO_VN = 0x10,
   DEBUG_NO_OPT = 0x20,
   DEBUG_NO_SCHED = 0x40,
   DEBUG_PERF_INFO = 0x80,
   DEBUG_LIVE_INFO = 0x100,
};
57
/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3 represents a VOP2 instruction in VOP3 encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   /* The combinable formats below each occupy a distinct high bit so that
    * they can be OR'd with a base VOP1/VOP2/VOPC value (see note above). */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP16 = 1 << 13,
   SDWA = 1 << 14,
   DPP8 = 1 << 15,
};
108
/* Coarse classification of instructions — judging by the enumerator names,
 * VALU work is split by precision/rate and the rest by execution unit.
 * 'count' is the number of valid classes. */
enum class instr_class : uint8_t {
   valu32 = 0,
   valu_convert32 = 1,
   valu64 = 2,
   valu_quarter_rate32 = 3,
   valu_fma = 4,
   valu_transcendental32 = 5,
   valu_double = 6,
   valu_double_add = 7,
   valu_double_convert = 8,
   valu_double_transcendental = 9,
   salu = 10,
   smem = 11,
   barrier = 12,
   branch = 13,
   sendmsg = 14,
   ds = 15,
   exp = 16,
   vmem = 17,
   waitcnt = 18,
   other = 19,
   count,
};
132
/* Bitmask of the kinds of memory a memory access may touch
 * (combined in memory_sync_info::storage below). */
enum storage_class : uint8_t {
   storage_none = 0x0,   /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_atomic_counter = 0x2, /* not used for Vulkan */
   storage_image = 0x4,
   storage_shared = 0x8,       /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_task_payload = 0x20,/* Task-Mesh payload */
   storage_scratch = 0x40,
   storage_vgpr_spill = 0x80,
   storage_count = 8, /* not counting storage_none */
};
145
/* Ordering/visibility semantics of a memory access or barrier
 * (combined in memory_sync_info::semantics below). */
enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done after the barrier */
   semantic_release = 0x2,

   /* the rest are for load/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage.
    * says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is an atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this instruction both reads and writes memory */
   semantic_rmw = 0x40,

   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};
174
/* Synchronization scopes, ordered from narrowest (a single invocation)
 * to widest (the whole device). */
enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};
182
183struct memory_sync_info {
184   memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
185   memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation)
186       : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_)
187   {}
188
189   storage_class storage : 8;
190   memory_semantics semantics : 8;
191   sync_scope scope : 8;
192
193   bool operator==(const memory_sync_info& rhs) const
194   {
195      return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope;
196   }
197
198   bool can_reorder() const
199   {
200      if (semantics & semantic_acqrel)
201         return false;
202      /* Also check storage so that zero-initialized memory_sync_info can be
203       * reordered. */
204      return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
205   }
206};
207static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
208
/* Floating-point rounding modes: nearest-even, +inf, -inf, toward zero.
 * Values match the 2-bit fields of the hardware MODE register (see float_mode). */
enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};
215
/* Denormal handling: bit 0 keeps input denormals, bit 1 keeps output
 * denormals; 'keep' is both. Values match the MODE register fields. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep_in = 0x1,
   fp_denorm_keep_out = 0x2,
   fp_denorm_keep = 0x3,
};
224
/* Floating-point behavior required by a block of instructions. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32 : 2;
         fp_round round16_64 : 2;
         unsigned denorm32 : 2;
         unsigned denorm16_64 : 2;
      };
      /* aggregate views of the same bits */
      struct {
         uint8_t round : 4;
         uint8_t denorm : 4;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32 : 1;
   bool preserve_signed_zero_inf_nan16_64 : 1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32 : 1;
   bool must_flush_denorms16_64 : 1;
   bool care_about_round32 : 1;
   bool care_about_round16_64 : 1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE register value must be identical and each
    * requirement flag set in "other" must also be set here (this mode's
    * requirements are a superset of "other"'s). */
   bool canReplace(float_mode other) const noexcept
   {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
262
/* Decoded s_waitcnt immediate: one wait value per hardware counter
 * (vm/exp/lgkm/vs). Member functions are defined out of line. */
struct wait_imm {
   /* marker meaning "no wait requested" for a counter */
   static const uint8_t unset_counter = 0xff;

   uint8_t vm;
   uint8_t exp;
   uint8_t lgkm;
   uint8_t vs;

   wait_imm();
   wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_);
   /* decode from the packed hardware immediate for the given chip */
   wait_imm(enum amd_gfx_level chip, uint16_t packed);

   /* encode into the packed hardware immediate for the given chip */
   uint16_t pack(enum amd_gfx_level chip) const;

   /* merge "other" into this — presumably keeps the stricter (smaller)
    * counts and reports whether anything changed; confirm in definition */
   bool combine(const wait_imm& other);

   /* true if no counter is set */
   bool empty() const;
};
281
282constexpr Format
283asVOP3(Format format)
284{
285   return (Format)((uint32_t)Format::VOP3 | (uint32_t)format);
286};
287
288constexpr Format
289asSDWA(Format format)
290{
291   assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
292   return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
293}
294
295constexpr Format
296withoutDPP(Format format)
297{
298   return (Format)((uint32_t)format & ~((uint32_t)Format::DPP16 | (uint32_t)Format::DPP8));
299}
300
/* Kind of register a value lives in: scalar, vector, or linear vector
 * (see RegClass::v1_linear/v2_linear). 'none' is the invalid/unset type. */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};
307
/* Register class of a value: its type and size, packed into one byte.
 * Bit layout of RC:
 *   bits 0-4: size — in dwords, or in bytes for sub-dword classes
 *   bit 5:    vgpr (otherwise sgpr)
 *   bit 6:    linear vgpr
 *   bit 7:    sub-dword (byte-sized) class
 */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc_) : rc(rc_) {}
   constexpr RegClass(RegType type, unsigned size)
       : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size))
   {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* all sgpr classes compare <= s16; everything above has the vgpr bit set */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_linear_vgpr() const { return rc & (1 << 6); };
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* low 5 bits hold bytes for sub-dword classes, dwords otherwise */
   constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); }
   // TODO: use size() less in favor of bytes()
   /* size in dwords, rounding sub-dword classes up */
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   constexpr bool is_linear() const { return rc <= RC::s16 || is_linear_vgpr(); }
   constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); }

   /* Builds the class for a value of 'bytes' bytes: sgpr sizes are rounded
    * up to whole dwords, vgpr sizes that aren't dword multiples become
    * sub-dword classes. */
   static constexpr RegClass get(RegType type, unsigned bytes)
   {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u);
      }
   }

   /* Same type as this class but with a new size; preserves linearity. */
   constexpr RegClass resize(unsigned bytes) const
   {
      if (is_linear_vgpr()) {
         assert(bytes % 4u == 0);
         return get(RegType::vgpr, bytes).as_linear();
      }
      return get(type(), bytes);
   }

private:
   RC rc;
};
378
/* transitional helper expressions */
/* NOTE(review): RegClass::s6 and the *_linear classes have no shorthand here. */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
400
/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   /* convenience accessors forwarding to the register class */
   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   /* Temps compare by SSA id only; the register class is ignored. */
   constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   /* packed so a Temp fits in 4 bytes: 24-bit id, 8-bit register class */
   uint32_t id_ : 24;
   uint32_t reg_class : 8;
};
427
/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 * Stored internally as a byte address (reg_b) so that sub-dword register
 * accesses can be expressed: reg() is the index of the 32-bit register and
 * byte() the offset within it.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}

   /* index of the 32-bit register */
   constexpr unsigned reg() const { return reg_b >> 2; }
   /* byte offset (0-3) within the 32-bit register */
   constexpr unsigned byte() const { return reg_b & 0x3; }
   constexpr operator unsigned() const { return reg(); }

   /* comparisons use the full byte address, not just the register index */
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; }

   /* returns a copy of this register advanced by the given number of bytes */
   constexpr PhysReg advance(int bytes) const
   {
      PhysReg moved = *this;
      moved.reg_b += bytes;
      return moved;
   }

   uint16_t reg_b = 0;
};
451
/* helper expressions for special registers */
static constexpr PhysReg m0{124};
static constexpr PhysReg flat_scr_lo{102}; /* GFX8-GFX9, encoded differently on GFX6-7 */
static constexpr PhysReg flat_scr_hi{103}; /* GFX8-GFX9, encoded differently on GFX6-7 */
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
static constexpr PhysReg ttmp0{112};
static constexpr PhysReg ttmp1{113};
static constexpr PhysReg ttmp2{114};
static constexpr PhysReg ttmp3{115};
static constexpr PhysReg ttmp4{116};
static constexpr PhysReg ttmp5{117};
static constexpr PhysReg ttmp6{118};
static constexpr PhysReg ttmp7{119};
static constexpr PhysReg ttmp8{120};
static constexpr PhysReg ttmp9{121};
static constexpr PhysReg ttmp10{122};
static constexpr PhysReg ttmp11{123};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
/* status bits addressed via the high source-operand encodings (per the ISA) */
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
479
/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value
 * Temporary registers get mapped to physical register during RA
 * Constant values are inlined into the instruction sequence.
 */
class Operand final {
public:
   /* Creates an undefined operand, fixed to the placeholder register 128. */
   constexpr Operand()
       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), isKill_(false),
         isUndef_(true), isFirstKill_(false), constSize(0), isLateKill_(false), is16bit_(false),
         is24bit_(false), signext(false)
   {}

   /* Operand referencing a temporary; an id of zero yields an undefined operand. */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* Operand referencing a temporary, already fixed to a physical register. */
   explicit Operand(Temp r, PhysReg reg) noexcept
   {
      assert(r.id()); /* Don't allow fixing an undef to a register */
      data_.temp = r;
      isTemp_ = true;
      setFixed(reg);
   };

   /* 8-bit constant */
   static Operand c8(uint8_t v) noexcept
   {
      /* 8-bit constants are only used for copies and copies from any 8-bit
       * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
       * to be inline constants. */
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 0;
      op.setFixed(PhysReg{0u});
      return op;
   };

   /* 16-bit constant */
   static Operand c16(uint16_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 1;
      /* inline constants are encoded in the fixed register (128..248);
       * anything else becomes a literal (255) */
      if (v <= 64)
         op.setFixed(PhysReg{128u + v});
      else if (v >= 0xFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
      else if (v == 0x3800) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xB800) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3C00) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xBC00) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x4000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xC000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x4400) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xC400) /* -4.0 */
         op.setFixed(PhysReg{247});
      else if (v == 0x3118) /* 1/2 PI */
         op.setFixed(PhysReg{248});
      else /* Literal Constant */
         op.setFixed(PhysReg{255});
      return op;
   }

   /* 32-bit constant */
   static Operand c32(uint32_t v) noexcept { return c32_or_c64(v, false); }

   /* 64-bit constant */
   static Operand c64(uint64_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.isConstant_ = true;
      op.constSize = 3;
      if (v <= 64) {
         op.data_.i = (uint32_t)v;
         op.setFixed(PhysReg{128 + (uint32_t)v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         op.data_.i = (uint32_t)v;
         op.setFixed(PhysReg{192 - (uint32_t)v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         op.data_.i = 0x3f000000;
         op.setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         op.data_.i = 0xbf000000;
         op.setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         op.data_.i = 0x3f800000;
         op.setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         op.data_.i = 0xbf800000;
         op.setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         op.data_.i = 0x40000000;
         op.setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         op.data_.i = 0xc0000000;
         op.setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         op.data_.i = 0x40800000;
         op.setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         op.data_.i = 0xc0800000;
         op.setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         op.signext = v >> 63;
         op.data_.i = v & 0xffffffffu;
         op.setFixed(PhysReg{255});
         assert(op.constantValue64() == v &&
                "attempt to create a unrepresentable 64-bit literal constant");
      }
      return op;
   }

   /* 32-bit constant stored as a 32-bit or 64-bit operand */
   static Operand c32_or_c64(uint32_t v, bool is64bit) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = is64bit ? 3 : 2;
      if (v <= 64)
         op.setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         op.setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         op.setFixed(PhysReg{255});
      }
      return op;
   }

   /* 32-bit constant always emitted as a literal (255), even if an inline
    * encoding would exist. */
   static Operand literal32(uint32_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 2;
      op.setFixed(PhysReg{255});
      return op;
   }

   /* Undefined operand of the given register class. */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* Operand fixed to a physical register, without a backing temporary. */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   /* Zero constant of the given width in bytes (1, 2, 4 or 8). */
   static Operand zero(unsigned bytes = 4) noexcept
   {
      if (bytes == 8)
         return Operand::c64(0);
      else if (bytes == 4)
         return Operand::c32(0);
      else if (bytes == 2)
         return Operand::c16(0);
      assert(bytes == 1);
      return Operand::c8(0);
   }

   /* This is useful over the constructors when you want to take a gfx level
    * for 1/2 PI or an unknown operand size.
    */
   static Operand get_const(enum amd_gfx_level chip, uint64_t val, unsigned bytes)
   {
      if (val == 0x3e22f983 && bytes == 4 && chip >= GFX8) {
         /* 1/2 PI can be an inline constant on GFX8+ */
         Operand op = Operand::c32(val);
         op.setFixed(PhysReg{248});
         return op;
      }

      if (bytes == 8)
         return Operand::c64(val);
      else if (bytes == 4)
         return Operand::c32(val);
      else if (bytes == 2)
         return Operand::c16(val);
      assert(bytes == 1);
      return Operand::c8(val);
   }

   /* Whether a constant of the given width can be encoded as an operand
    * (inline or literal), optionally allowing zero/sign extension of a
    * 32-bit literal for 64-bit values. */
   static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false,
                                         bool sext = false)
   {
      if (bytes <= 4)
         return true;

      if (zext && (val & 0xFFFFFFFF00000000) == 0x0000000000000000)
         return true;
      uint64_t upper33 = val & 0xFFFFFFFF80000000;
      if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0))
         return true;

      return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */
             val == 0x3FE0000000000000 ||              /* 0.5 */
             val == 0xBFE0000000000000 ||              /* -0.5 */
             val == 0x3FF0000000000000 ||              /* 1.0 */
             val == 0xBFF0000000000000 ||              /* -1.0 */
             val == 0x4000000000000000 ||              /* 2.0 */
             val == 0xC000000000000000 ||              /* -2.0 */
             val == 0x4010000000000000 ||              /* 4.0 */
             val == 0xC010000000000000;                /* -4.0 */
   }

   constexpr bool isTemp() const noexcept { return isTemp_; }

   constexpr void setTemp(Temp t) noexcept
   {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept { return data_.temp; }

   constexpr uint32_t tempId() const noexcept { return data_.temp.id(); }

   /* undefined operands also carry a register class */
   constexpr bool hasRegClass() const noexcept { return isTemp() || isUndefined(); }

   constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); }

   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return 1 << constSize;
      else
         return data_.temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return constSize > 2 ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   /* fixing to register -1 marks the operand as not fixed */
   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept { return isConstant_; }

   /* a constant that has no inline encoding (fixed to 255) */
   constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; }

   constexpr bool isUndefined() const noexcept { return isUndef_; }

   constexpr uint32_t constantValue() const noexcept { return data_.i; }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Reconstructs the full 64-bit value of a constant: inline 64-bit
    * constants are derived from the fixed-register encoding, literals from
    * the stored 32 bits plus the sign-extension flag. */
   constexpr uint64_t constantValue64() const noexcept
   {
      if (constSize == 3) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240: return 0x3FE0000000000000;
         case 241: return 0xBFE0000000000000;
         case 242: return 0x3FF0000000000000;
         case 243: return 0xBFF0000000000000;
         case 244: return 0x4000000000000000;
         case 245: return 0xC000000000000000;
         case 246: return 0x4010000000000000;
         case 247: return 0xC010000000000000;
         case 255:
            return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
         }
         unreachable("invalid register for 64-bit constant");
      } else {
         return data_.i;
      }
   }

   /* Value if this were used with vop3/opsel or vop3p. */
   constexpr uint16_t constantValue16(bool opsel) const noexcept
   {
      assert(bytes() == 2 || bytes() == 4);
      if (opsel) {
         if (bytes() == 2 && int16_t(data_.i) >= -16 && int16_t(data_.i) <= 64 && !isLiteral())
            return int16_t(data_.i) >> 16; /* 16-bit inline integers are sign-extended, even with fp16 instrs */
         else
            return data_.i >> 16;
      }
      return data_.i;
   }

   constexpr bool isOfType(RegType type) const noexcept
   {
      return hasRegClass() && regClass().type() == type;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; }

   constexpr bool isLateKill() const noexcept { return isLateKill_; }

   /* clearing the kill flag also clears first-kill */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); }

   /* setting first-kill also sets the kill flag */
   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept { return isFirstKill_; }

   constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); }

   constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); }

   constexpr bool operator==(Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         /* non-literal constants are uniquely identified by their fixed register */
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }

   constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); }

   constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; }

   constexpr bool is16bit() const noexcept { return is16bit_; }

   constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; }

   constexpr bool is24bit() const noexcept { return is24bit_; }

private:
   /* holds either the temporary or the low 32 bits of the constant */
   union {
      Temp temp;
      uint32_t i;
      float f;
   } data_ = {Temp(0, s1)};
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_ : 1;
         uint8_t isFixed_ : 1;
         uint8_t isConstant_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isUndef_ : 1;
         uint8_t isFirstKill_ : 1;
         uint8_t constSize : 2;
         uint8_t isLateKill_ : 1;
         uint8_t is16bit_ : 1;
         uint8_t is24bit_ : 1;
         uint8_t signext : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint16_t control_ = 0;
   };
};
913
914/**
915 * Definition Class
916 * Definitions are the results of Instructions
917 * and refer to temporary virtual registers
918 * which are later mapped to physical registers
919 */
class Definition final {
public:
   constexpr Definition()
       : temp(Temp(0, s1)), reg_(0), isFixed_(0), isKill_(0), isPrecise_(0), isNUW_(0), isNoCSE_(0)
   {}
   /* Definition of a new temporary with the given id and register class. */
   Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept : temp(tmp) {}
   /* Definition fixed to a physical register, without a temporary (id 0). */
   Definition(PhysReg reg, RegClass type) noexcept : temp(Temp(0, type)) { setFixed(reg); }
   /* Temporary that is already assigned to a physical register. */
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* A temp id of 0 means this definition doesn't define a temporary. */
   constexpr bool isTemp() const noexcept { return tempId() > 0; }

   constexpr Temp getTemp() const noexcept { return temp; }

   constexpr uint32_t tempId() const noexcept { return temp.id(); }

   constexpr void setTemp(Temp t) noexcept { temp = t; }

   void swapTemp(Definition& other) noexcept { std::swap(temp, other.temp); }

   constexpr RegClass regClass() const noexcept { return temp.regClass(); }

   constexpr unsigned bytes() const noexcept { return temp.bytes(); }

   constexpr unsigned size() const noexcept { return temp.size(); }

   constexpr bool isFixed() const noexcept { return isFixed_; }

   /* Only meaningful when isFixed(): reg_ is left uninitialized by the
    * non-fixed constructors (except the default one). */
   constexpr PhysReg physReg() const noexcept { return reg_; }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* NOTE(review): kill/precise/NUW/noCSE semantics are defined by the
    * passes that set these flags - confirm at the respective use sites. */
   constexpr void setKill(bool flag) noexcept { isKill_ = flag; }

   constexpr bool isKill() const noexcept { return isKill_; }

   constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; }

   constexpr bool isPrecise() const noexcept { return isPrecise_; }

   /* No Unsigned Wrap */
   constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; }

   constexpr bool isNUW() const noexcept { return isNUW_; }

   constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; }

   constexpr bool isNoCSE() const noexcept { return isNoCSE_; }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isPrecise_ : 1;
         uint8_t isNUW_ : 1;
         uint8_t isNoCSE_ : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
991
992struct Block;
993struct Instruction;
994struct Pseudo_instruction;
995struct SOP1_instruction;
996struct SOP2_instruction;
997struct SOPK_instruction;
998struct SOPP_instruction;
999struct SOPC_instruction;
1000struct SMEM_instruction;
1001struct DS_instruction;
1002struct MTBUF_instruction;
1003struct MUBUF_instruction;
1004struct MIMG_instruction;
1005struct Export_instruction;
1006struct FLAT_instruction;
1007struct Pseudo_branch_instruction;
1008struct Pseudo_barrier_instruction;
1009struct Pseudo_reduction_instruction;
1010struct VOP3P_instruction;
1011struct VOP1_instruction;
1012struct VOP2_instruction;
1013struct VOPC_instruction;
1014struct VOP3_instruction;
1015struct Interp_instruction;
1016struct DPP16_instruction;
1017struct DPP8_instruction;
1018struct SDWA_instruction;
1019
struct Instruction {
   aco_opcode opcode;
   Format format;
   /* Per-pass scratch data - NOTE(review): the meaning depends on the pass
    * that set it, confirm at the relevant use sites. */
   uint32_t pass_flags;

   /* Both arrays are allocated directly behind the (subclass) object itself,
    * see create_instruction(). */
   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool usesModifiers() const noexcept;

   /* Whether any operand is fixed to the exec register. */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }

   /* Checked downcasts to the encoding-specific subclasses. Plain formats
    * are matched with exact equality; Vector ALU formats can be combined
    * with others (see the Format comment at the top of this file), so those
    * are tested with a bit-wise and instead. */
   Pseudo_instruction& pseudo() noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   const Pseudo_instruction& pseudo() const noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; }
   SOP1_instruction& sop1() noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   const SOP1_instruction& sop1() const noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   constexpr bool isSOP1() const noexcept { return format == Format::SOP1; }
   SOP2_instruction& sop2() noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   const SOP2_instruction& sop2() const noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   constexpr bool isSOP2() const noexcept { return format == Format::SOP2; }
   SOPK_instruction& sopk() noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   const SOPK_instruction& sopk() const noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   constexpr bool isSOPK() const noexcept { return format == Format::SOPK; }
   SOPP_instruction& sopp() noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   const SOPP_instruction& sopp() const noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   constexpr bool isSOPP() const noexcept { return format == Format::SOPP; }
   SOPC_instruction& sopc() noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   const SOPC_instruction& sopc() const noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   constexpr bool isSOPC() const noexcept { return format == Format::SOPC; }
   SMEM_instruction& smem() noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   const SMEM_instruction& smem() const noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   constexpr bool isSMEM() const noexcept { return format == Format::SMEM; }
   DS_instruction& ds() noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   const DS_instruction& ds() const noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   constexpr bool isDS() const noexcept { return format == Format::DS; }
   MTBUF_instruction& mtbuf() noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   const MTBUF_instruction& mtbuf() const noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; }
   MUBUF_instruction& mubuf() noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   const MUBUF_instruction& mubuf() const noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; }
   MIMG_instruction& mimg() noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   const MIMG_instruction& mimg() const noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   constexpr bool isMIMG() const noexcept { return format == Format::MIMG; }
   Export_instruction& exp() noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   const Export_instruction& exp() const noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   constexpr bool isEXP() const noexcept { return format == Format::EXP; }
   FLAT_instruction& flat() noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& flat() const noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isFlat() const noexcept { return format == Format::FLAT; }
   /* GLOBAL and SCRATCH share the FLAT_instruction layout. */
   FLAT_instruction& global() noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& global() const noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; }
   FLAT_instruction& scratch() noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& scratch() const noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; }
   Pseudo_branch_instruction& branch() noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   const Pseudo_branch_instruction& branch() const noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; }
   Pseudo_barrier_instruction& barrier() noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   const Pseudo_barrier_instruction& barrier() const noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; }
   Pseudo_reduction_instruction& reduction() noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   const Pseudo_reduction_instruction& reduction() const noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; }
   VOP3P_instruction& vop3p() noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   const VOP3P_instruction& vop3p() const noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   constexpr bool isVOP3P() const noexcept { return format == Format::VOP3P; }
   /* From here on, format tests use a bit-and because these formats can be
    * combined (e.g. VOP2 with DPP16 or SDWA). */
   VOP1_instruction& vop1() noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   const VOP1_instruction& vop1() const noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
   VOP2_instruction& vop2() noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   const VOP2_instruction& vop2() const noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
   VOPC_instruction& vopc() noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   const VOPC_instruction& vopc() const noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
   VOP3_instruction& vop3() noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   const VOP3_instruction& vop3() const noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; }
   Interp_instruction& vintrp() noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   const Interp_instruction& vintrp() const noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   constexpr bool isVINTRP() const noexcept { return (uint16_t)format & (uint16_t)Format::VINTRP; }
   DPP16_instruction& dpp16() noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   const DPP16_instruction& dpp16() const noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   constexpr bool isDPP16() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP16; }
   DPP8_instruction& dpp8() noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   const DPP8_instruction& dpp8() const noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   constexpr bool isDPP8() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP8; }
   constexpr bool isDPP() const noexcept { return isDPP16() || isDPP8(); }
   SDWA_instruction& sdwa() noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   const SDWA_instruction& sdwa() const noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; }

   /* Unchecked access to the FLAT layout shared by FLAT/GLOBAL/SCRATCH
    * (see isFlatLike()); no format assertion. */
   FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; }

   const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; }

   constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); }

   constexpr bool isVALU() const noexcept
   {
      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P();
   }

   constexpr bool isSALU() const noexcept
   {
      return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP();
   }

   constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
1357
struct SOPK_instruction : public Instruction {
   uint16_t imm; /* 16-bit immediate */
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1363
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block; /* target block index - NOTE(review): confirm the sentinel used for non-branch SOPP */
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1369
struct SOPC_instruction : public Instruction {
   uint32_t padding; /* explicit padding, size pinned by the static_assert below */
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1374
/* SOP1 carries no encoding state beyond the common Instruction fields. */
struct SOP1_instruction : public Instruction {};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1377
struct SOP2_instruction : public Instruction {
   uint32_t padding; /* explicit padding, size pinned by the static_assert below */
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1382
1383/**
1384 * Scalar Memory Format:
1385 * For s_(buffer_)load_dword*:
1386 * Operand(0): SBASE - SGPR-pair which provides base address
1387 * Operand(1): Offset - immediate (un)signed offset or SGPR
1388 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
1389 * Operand(n-1): SOffset - SGPR offset (Vega only)
1390 *
1391 * Having no operands is also valid for instructions such as s_dcache_inv.
1392 *
1393 */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync;
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1;  /* VEGA only: Non-volatile */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool prevent_overflow : 1; /* avoid overflow when combining additions */
   uint8_t padding : 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1404
/* VOP1 carries no extra encoding state; modifiers use DPP/SDWA/VOP3. */
struct VOP1_instruction : public Instruction {};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1407
/* VOP2 carries no extra encoding state; modifiers use DPP/SDWA/VOP3. */
struct VOP2_instruction : public Instruction {};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1410
/* VOPC carries no extra encoding state; modifiers use DPP/SDWA/VOP3. */
struct VOPC_instruction : public Instruction {};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1413
struct VOP3_instruction : public Instruction {
   bool abs[3]; /* per-source |x| modifier */
   bool neg[3]; /* per-source -x modifier */
   uint8_t opsel : 4;
   uint8_t omod : 2; /* output modifier */
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1424
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3]; /* per-source negate of the low halves */
   bool neg_hi[3]; /* abs modifier, for v_mad_mix/v_fma_mix */
   uint8_t opsel_lo : 3;
   uint8_t opsel_hi : 3; /* must be set for an operand to count as unmodified, see usesModifiers() */
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1435
1436/**
1437 * Data Parallel Primitives Format:
1438 * This format can be used for VOP1, VOP2 or VOPC instructions.
1439 * The swizzle applies to the src0 operand.
1440 *
1441 */
struct DPP16_instruction : public Instruction {
   bool abs[2]; /* per-source |x| modifier */
   bool neg[2]; /* per-source -x modifier */
   uint16_t dpp_ctrl; /* swizzle control, applies to src0 (see above) */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint8_t padding : 7;
};
static_assert(sizeof(DPP16_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1452
struct DPP8_instruction : public Instruction {
   uint8_t lane_sel[8]; /* source lane selected for each of the 8 lanes */
};
static_assert(sizeof(DPP8_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1457
/* Describes which bytes of a dword an instruction reads or writes:
 * bits [1:0] hold the byte offset, bits [4:2] the size in bytes and
 * bit 5 whether the value is sign-extended. */
struct SubdwordSel {
   enum sdwa_sel : uint8_t {
      ubyte = 0x4,
      uword = 0x8,
      dword = 0x10,
      sext = 0x20,
      sbyte = ubyte | sext,
      sword = uword | sext,

      ubyte0 = ubyte,
      ubyte1 = ubyte | 1,
      ubyte2 = ubyte | 2,
      ubyte3 = ubyte | 3,
      sbyte0 = sbyte,
      sbyte1 = sbyte | 1,
      sbyte2 = sbyte | 2,
      sbyte3 = sbyte | 3,
      uword0 = uword,
      uword1 = uword | 2,
      sword0 = sword,
      sword1 = sword | 2,
   };

   /* Default-constructed selection is empty and converts to false. */
   SubdwordSel() : sel(sdwa_sel(0)) {}
   constexpr SubdwordSel(sdwa_sel sel_) : sel(sel_) {}
   /* Encode a selection from its size (bytes), byte offset and signedness. */
   constexpr SubdwordSel(unsigned size, unsigned offset, bool sign_extend)
       : sel(sdwa_sel((size << 2) | offset | (sign_extend ? sext : 0)))
   {}
   constexpr operator sdwa_sel() const { return sel; }
   explicit operator bool() const { return sel != 0; }

   /* Size of the selection in bytes (1, 2 or 4). */
   constexpr unsigned size() const { return (sel & 0x1c) >> 2; }
   /* Byte offset of the selection within the dword. */
   constexpr unsigned offset() const { return sel & 0x3; }
   constexpr bool sign_extend() const { return (sel & sext) != 0; }
   /* Translate to the numeric SDWA SRC/DST_SEL encoding for a value located
    * at reg_byte_offset within its register. */
   constexpr unsigned to_sdwa_sel(unsigned reg_byte_offset) const
   {
      const unsigned byte = reg_byte_offset + offset();
      switch (size()) {
      case 1: return byte;            /* BYTE_0..BYTE_3 */
      case 2: return 4 + (byte >> 1); /* WORD_0..WORD_1 */
      default: return 6;              /* DWORD */
      }
   }

private:
   sdwa_sel sel;
};
1506
1507/**
1508 * Sub-Dword Addressing Format:
1509 * This format can be used for VOP1, VOP2 or VOPC instructions.
1510 *
1511 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1512 * the definition doesn't have to be VCC on GFX9+.
1513 *
1514 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   SubdwordSel sel[2]; /* per-source sub-dword selection */
   SubdwordSel dst_sel;
   bool neg[2]; /* per-source -x modifier */
   bool abs[2]; /* per-source |x| modifier */
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint8_t padding : 5;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1527
/* VINTRP: interpolation of an input attribute. */
struct Interp_instruction : public Instruction {
   uint8_t attribute; /* attribute index */
   uint8_t component; /* component index within the attribute */
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1534
1535/**
1536 * Local and Global Data Sharing instructions
1537 * Operand(0): ADDR - VGPR which supplies the address.
1538 * Operand(1): DATA0 - First data VGPR.
1539 * Operand(2): DATA1 - Second data VGPR.
1540 * Operand(n-1): M0 - LDS size.
1541 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1542 *
1543 */
struct DS_instruction : public Instruction {
   memory_sync_info sync;
   bool gds; /* presumably selects global data share over LDS - confirm */
   uint16_t offset0;
   uint8_t offset1;
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1552
1553/**
1554 * Vector Memory Untyped-buffer Instructions
1555 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1556 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1557 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1558 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1559 *
1560 */
/* Note: the mix of uint8_t and uint16_t bit-field base types is deliberate -
 * the total size is pinned by the static_assert below. */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync;
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1;           /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1;          /* SI, CIK: Address size is 64-bit */
   bool glc : 1;             /* globally coherent */
   bool dlc : 1;             /* NAVI: device level coherent */
   bool slc : 1;             /* system level coherent */
   bool tfe : 1;             /* texture fail enable */
   bool lds : 1;             /* Return read-data to LDS instead of VGPRs */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t offset : 12;     /* Unsigned byte offset - 12 bit */
   uint16_t swizzled : 1;
   uint16_t padding0 : 2;
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding1 : 10;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1579
1580/**
1581 * Vector Memory Typed-buffer Instructions
1582 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1583 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1584 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1585 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1586 *
1587 */
/* Note: the mix of bit-field base types is deliberate - the total size is
 * pinned by the static_assert below. */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dfmt : 4;         /* Data Format of data in memory buffer */
   uint8_t nfmt : 3;         /* Numeric format of data in memory */
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   uint16_t idxen : 1;       /* Supply an index from VGPR (VADDR) */
   uint16_t glc : 1;         /* globally coherent */
   uint16_t dlc : 1;         /* NAVI: device level coherent */
   uint16_t slc : 1;         /* system level coherent */
   uint16_t tfe : 1;         /* texture fail enable */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding : 4;
   uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1604
1605/**
1606 * Vector Memory Image Instructions
1607 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1608 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1609 * Operand(2): VDATA - Vector GPR for write data or zero if TFE/LWE=1.
1610 * Operand(3): VADDR - Address source. Can carry an offset or an index.
1611 * Definition(0): VDATA - Vector GPR for read result.
1612 *
1613 */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dmask;        /* Data VGPR enable mask */
   uint8_t dim : 3;      /* NAVI: dimensionality */
   bool unrm : 1;        /* Force address to be un-normalized */
   bool dlc : 1;         /* NAVI: device level coherent */
   bool glc : 1;         /* globally coherent */
   bool slc : 1;         /* system level coherent */
   bool tfe : 1;         /* texture fail enable */
   bool da : 1;          /* declare an array */
   bool lwe : 1;         /* LOD warning enable */
   bool r128 : 1;        /* NAVI: Texture resource size */
   bool a16 : 1;         /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1;         /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint8_t padding1; /* explicit padding, size pinned by the static_assert below */
   uint8_t padding2;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1634
1635/**
1636 * Flat/Scratch/Global Instructions
1637 * Operand(0): ADDR
1638 * Operand(1): SADDR
1639 * Operand(2) / Definition(0): DATA/VDST
1640 *
1641 */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync;
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool nv : 1;  /* VEGA only: Non-volatile */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   int16_t offset; /* Vega/Navi only */
   uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1655
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* which components of the operands are written */
   uint8_t dest;         /* export target */
   bool compressed : 1;
   bool done : 1;
   bool valid_mask : 1;
   uint8_t padding0 : 5;
   uint8_t padding1;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1666
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc; /* NOTE(review): presumably whether lowering clobbers/uses scc - confirm */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1673
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1682
struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync; /* memory semantics of the barrier */
   sync_scope exec_scope; /* scope of execution synchronization */
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1688
/* Reduction operations for Pseudo_reduction_instruction; the numeric suffix
 * of each enumerator is the bit-width of the reduced data. */
enum ReduceOp : uint16_t {
   // clang-format off
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
          fadd16, fadd32, fadd64,
          fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
          fmin16, fmin32, fmin64,
          fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
   num_reduce_ops,
   // clang-format on
};
1707
1708/**
1709 * Subgroup Reduction Instructions, everything except for the data to be
1710 * reduced and the result as inserted by setup_reduce_temp().
1711 * Operand(0): data to be reduced
1712 * Operand(1): reduce temporary
1713 * Operand(2): vector temporary
1714 * Definition(0): result
1715 * Definition(1): scalar temporary
1716 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1717 * Definition(3): scc clobber
1718 * Definition(4): vcc clobber
1719 *
1720 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4,
              "Unexpected padding");
1727
/* Instructions are allocated with calloc() (see create_instruction()), so
 * the matching deleter is free() rather than delete. */
struct instr_deleter_functor {
   void operator()(void* p) { free(p); }
};

/* Owning smart pointer for instructions created by create_instruction(). */
template <typename T> using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1733
/* Allocates an instruction of type T together with its operand and
 * definition arrays in a single zero-initialized chunk. Release with free()
 * (via aco_ptr / instr_deleter_functor). */
template <typename T>
T*
create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
                   uint32_t num_definitions)
{
   /* memory layout: [T][num_operands x Operand][num_definitions x Definition] */
   std::size_t size =
      sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char* data = (char*)calloc(1, size);
   T* inst = (T*)data;

   inst->opcode = opcode;
   inst->format = format;

   /* aco::span takes the byte distance from the span member itself to its
    * first element, so both offsets are computed relative to the members. */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1754
1755constexpr bool
1756Instruction::usesModifiers() const noexcept
1757{
1758   if (isDPP() || isSDWA())
1759      return true;
1760
1761   if (isVOP3P()) {
1762      const VOP3P_instruction& vop3p = this->vop3p();
1763      for (unsigned i = 0; i < operands.size(); i++) {
1764         if (vop3p.neg_lo[i] || vop3p.neg_hi[i])
1765            return true;
1766
1767         /* opsel_hi must be 1 to not be considered a modifier - even for constants */
1768         if (!(vop3p.opsel_hi & (1 << i)))
1769            return true;
1770      }
1771      return vop3p.opsel_lo || vop3p.clamp;
1772   } else if (isVOP3()) {
1773      const VOP3_instruction& vop3 = this->vop3();
1774      for (unsigned i = 0; i < operands.size(); i++) {
1775         if (vop3.abs[i] || vop3.neg[i])
1776            return true;
1777      }
1778      return vop3.opsel || vop3.clamp || vop3.omod;
1779   }
1780   return false;
1781}
1782
1783constexpr bool
1784is_phi(Instruction* instr)
1785{
1786   return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1787}
1788
1789static inline bool
1790is_phi(aco_ptr<Instruction>& instr)
1791{
1792   return is_phi(instr.get());
1793}
1794
/* Returns the memory synchronization info (storage class/semantics/scope) of
 * the instruction. */
memory_sync_info get_sync_info(const Instruction* instr);

/* Whether the instruction can be removed given the per-temp use counts in
 * "uses" -- presumably true when all definitions are unused and the
 * instruction has no needed side effects; verify in the implementation. */
bool is_dead(const std::vector<uint16_t>& uses, Instruction* instr);

/* Queries about sub-dword/modifier encoding support for an opcode on the
 * given hardware generation. */
bool can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx);
bool instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op);
bool can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra);
bool can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr);
aco_ptr<Instruction> convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8);
/* Whether the instruction's behavior depends on the exec mask. */
bool needs_exec_mask(const Instruction* instr);

/* Helpers mapping between related comparison opcodes (ordered/unordered/
 * inverted variants, f32 equivalents, VCMPX forms). */
aco_opcode get_ordered(aco_opcode op);
aco_opcode get_unordered(aco_opcode op);
aco_opcode get_inverse(aco_opcode op);
aco_opcode get_f32_cmp(aco_opcode op);
aco_opcode get_vcmpx(aco_opcode op);
unsigned get_cmp_bitsize(aco_opcode op);
bool is_cmp(aco_opcode op);

/* Whether the first two operands can be swapped; if so, *new_op receives the
 * opcode to use after the swap. */
bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op);

/* Identity element for the given reduction operation (idx selects the dword
 * for multi-dword identities -- TODO confirm). */
uint32_t get_reduction_identity(ReduceOp op, unsigned idx);

/* Number of extra NSA (non-sequential address) dwords of a MIMG instruction. */
unsigned get_mimg_nsa_dwords(const Instruction* instr);

/* Whether two memory instructions should be grouped into a clause. */
bool should_form_clause(const Instruction* a, const Instruction* b);
1823
/* Bitmask flags describing a basic block's role in the CFG (stored in
 * Block::kind). */
enum block_kind {
   /* uniform: every lane that is active when entering this block is still
    * active when leaving it */
   block_kind_uniform = 0x1,
   block_kind_top_level = 0x2,
   block_kind_loop_preheader = 0x4,
   block_kind_loop_header = 0x8,
   block_kind_loop_exit = 0x10,
   block_kind_continue = 0x20,
   block_kind_break = 0x40,
   block_kind_continue_or_break = 0x80,
   block_kind_branch = 0x100,
   block_kind_merge = 0x200,
   block_kind_invert = 0x400,
   /* bits 11 (0x800) and 14 (0x4000) are currently unused */
   block_kind_uses_discard = 0x1000,
   block_kind_needs_lowering = 0x2000,
   block_kind_export_end = 0x8000,
};
1842
1843struct RegisterDemand {
1844   constexpr RegisterDemand() = default;
1845   constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {}
1846   int16_t vgpr = 0;
1847   int16_t sgpr = 0;
1848
1849   constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept
1850   {
1851      return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1852   }
1853
1854   constexpr bool exceeds(const RegisterDemand other) const noexcept
1855   {
1856      return vgpr > other.vgpr || sgpr > other.sgpr;
1857   }
1858
1859   constexpr RegisterDemand operator+(const Temp t) const noexcept
1860   {
1861      if (t.type() == RegType::sgpr)
1862         return RegisterDemand(vgpr, sgpr + t.size());
1863      else
1864         return RegisterDemand(vgpr + t.size(), sgpr);
1865   }
1866
1867   constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept
1868   {
1869      return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1870   }
1871
1872   constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept
1873   {
1874      return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1875   }
1876
1877   constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept
1878   {
1879      vgpr += other.vgpr;
1880      sgpr += other.sgpr;
1881      return *this;
1882   }
1883
1884   constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept
1885   {
1886      vgpr -= other.vgpr;
1887      sgpr -= other.sgpr;
1888      return *this;
1889   }
1890
1891   constexpr RegisterDemand& operator+=(const Temp t) noexcept
1892   {
1893      if (t.type() == RegType::sgpr)
1894         sgpr += t.size();
1895      else
1896         vgpr += t.size();
1897      return *this;
1898   }
1899
1900   constexpr RegisterDemand& operator-=(const Temp t) noexcept
1901   {
1902      if (t.type() == RegType::sgpr)
1903         sgpr -= t.size();
1904      else
1905         vgpr -= t.size();
1906      return *this;
1907   }
1908
1909   constexpr void update(const RegisterDemand other) noexcept
1910   {
1911      vgpr = std::max(vgpr, other.vgpr);
1912      sgpr = std::max(sgpr, other.sgpr);
1913   }
1914};
1915
1916/* CFG */
/* CFG */
struct Block {
   /* float behavior active in this block; copied from Program::next_fp_mode
    * by Program::insert_block() */
   float_mode fp_mode;
   unsigned index;      /* position of this block in Program::blocks */
   unsigned offset = 0; /* presumably the code offset set during emission -- TODO confirm */
   std::vector<aco_ptr<Instruction>> instructions;
   /* predecessor/successor block indices for the two CFG views -- presumably
    * "logical" is the divergence-aware per-lane CFG and "linear" the CFG as
    * emitted; see README for details */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand(); /* register demand of this block */
   uint16_t loop_nest_depth = 0;
   uint16_t divergent_if_logical_depth = 0;
   uint16_t uniform_if_depth = 0;
   uint16_t kind = 0;     /* bitmask of block_kind flags */
   int logical_idom = -1; /* immediate dominator indices; -1 means not computed */
   int linear_idom = -1;

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;

   /* NOTE(review): fp_mode is left uninitialized here and is expected to be
    * stamped by Program::insert_block(). */
   Block() : index(0) {}
};
1940
1941/*
1942 * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
1943 */
enum class SWStage : uint16_t {
   None = 0x000,
   VS = 0x001,     /* Vertex Shader */
   GS = 0x002,     /* Geometry Shader */
   TCS = 0x004,    /* Tessellation Control aka Hull Shader */
   TES = 0x008,    /* Tessellation Evaluation aka Domain Shader */
   FS = 0x010,     /* Fragment aka Pixel Shader */
   CS = 0x020,     /* Compute Shader */
   TS = 0x040,     /* Task Shader */
   MS = 0x080,     /* Mesh Shader */
   GSCopy = 0x100, /* GS Copy Shader (internal) */

   /* Stage combinations merged to run on a single HWStage */
   VS_GS = VS | GS,
   VS_TCS = VS | TCS,
   TES_GS = TES | GS,
};
1961
1962constexpr SWStage
1963operator|(SWStage a, SWStage b)
1964{
1965   return static_cast<SWStage>(static_cast<uint16_t>(a) | static_cast<uint16_t>(b));
1966}
1967
1968/*
1969 * Shader stages as running on the AMD GPU.
1970 *
1971 * The relation between HWStages and SWStages is not a one-to-one mapping:
1972 * Some SWStages are merged by ACO to run on a single HWStage.
1973 * See README.md for details.
1974 */
enum class HWStage : uint8_t {
   VS = 0,  /* Hardware vertex shader stage. */
   ES = 1,  /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
   GS = 2,  /* Geometry shader on GFX10/legacy and GFX6-9. */
   NGG = 3, /* Primitive shader, used to implement VS, TES, GS. */
   LS = 4,  /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
   HS = 5,  /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
   FS = 6,  /* Fragment/pixel shader. */
   CS = 7,  /* Compute shader. */
};
1985
1986/*
1987 * Set of SWStages to be merged into a single shader paired with the
1988 * HWStage it will run on.
1989 */
1990struct Stage {
1991   constexpr Stage() = default;
1992
1993   explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) {}
1994
1995   /* Check if the given SWStage is included */
1996   constexpr bool has(SWStage stage) const
1997   {
1998      return (static_cast<uint16_t>(sw) & static_cast<uint16_t>(stage));
1999   }
2000
2001   unsigned num_sw_stages() const { return util_bitcount(static_cast<uint16_t>(sw)); }
2002
2003   constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; }
2004
2005   constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; }
2006
2007   /* Mask of merged software stages */
2008   SWStage sw = SWStage::None;
2009
2010   /* Active hardware stage */
2011   HWStage hw{};
2012};
2013
2014/* possible settings of Program::stage */
static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS);
static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS);
static constexpr Stage compute_cs(HWStage::CS, SWStage::CS);
static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES);
static constexpr Stage gs_copy_vs(HWStage::VS, SWStage::GSCopy);
/* Mesh shading pipeline */
static constexpr Stage task_cs(HWStage::CS, SWStage::TS);   /* task shader runs as compute */
static constexpr Stage mesh_ngg(HWStage::NGG, SWStage::MS); /* mesh shader runs as primitive shader */
/* GFX10/NGG */
static constexpr Stage vertex_ngg(HWStage::NGG, SWStage::VS);
static constexpr Stage vertex_geometry_ngg(HWStage::NGG, SWStage::VS_GS);
static constexpr Stage tess_eval_ngg(HWStage::NGG, SWStage::TES);
static constexpr Stage tess_eval_geometry_ngg(HWStage::NGG, SWStage::TES_GS);
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
/* pre-GFX9 */
static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tessellation control */
static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
static constexpr Stage tess_eval_es(HWStage::ES,
                                    SWStage::TES); /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);
2039
/* Indices into Program::statistics. */
enum statistic {
   statistic_hash = 0,
   statistic_instructions = 1,
   statistic_copies = 2,
   statistic_branches = 3,
   statistic_latency = 4,
   statistic_inv_throughput = 5,
   statistic_vmem_clauses = 6,
   statistic_smem_clauses = 7,
   statistic_sgpr_presched = 8,
   statistic_vgpr_presched = 9,
   num_statistics
};
2053
/* Static description of the target device: register-file sizes, LDS limits and
 * allocation granules, and feature flags. Filled in by init_program().
 *
 * Fix: the original left the numeric members (and has_16bank_lds) without
 * initializers, so a field missed during setup would be read as an
 * indeterminate value. All members are now zero-initialized; the struct
 * already used member initializers, so this changes no aggregate semantics. */
struct DeviceInfo {
   uint16_t lds_encoding_granule = 0;
   uint16_t lds_alloc_granule = 0;
   uint32_t lds_limit = 0; /* in bytes */
   bool has_16bank_lds = false;
   uint16_t physical_sgprs = 0;
   uint16_t physical_vgprs = 0;
   uint16_t vgpr_limit = 0;
   uint16_t sgpr_limit = 0;
   uint16_t sgpr_alloc_granule = 0;
   uint16_t vgpr_alloc_granule = 0; /* must be power of two */
   unsigned max_wave64_per_simd = 0;
   unsigned simd_per_cu = 0;
   bool has_fast_fma32 = false;
   bool has_mac_legacy32 = false;
   bool fused_mad_mix = false;
   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;

   /* valid range of the offset field of scratch/global memory instructions */
   int16_t scratch_global_offset_min = 0;
   int16_t scratch_global_offset_max = 0;
};
2076
/* Coarse marker of how far a Program has advanced through compilation. */
enum class CompilationProgress {
   after_isel = 0,
   after_spilling = 1,
   after_ra = 2,
};
2082
/* Top-level container for a shader being compiled: the CFG, per-temp register
 * classes, device/stage configuration, and temp-id allocation state. */
class Program final {
public:
   std::vector<Block> blocks; /* basic blocks; Block::index is the position in this vector */
   /* register class of every temporary, indexed by temp id. Index 0 is a
    * placeholder since id allocation starts at 1 (see allocationID). */
   std::vector<RegClass> temp_rc = {s1};
   RegisterDemand max_reg_demand = RegisterDemand();
   ac_shader_config* config;
   struct aco_shader_info info;
   enum amd_gfx_level gfx_level;
   enum radeon_family family;
   DeviceInfo dev;
   unsigned wave_size; /* lanes per wave -- presumably 32 or 64 */
   RegClass lane_mask; /* register class of per-lane boolean masks -- presumably depends on wave_size; TODO confirm */
   Stage stage;
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false;   /* there exists a p_wqm instruction */

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t num_waves = 0; /* waves per SIMD -- TODO confirm unit */
   uint16_t min_waves = 0;
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */
   bool wgp_mode;
   bool early_rast = false; /* whether rasterization can start as soon as the 1st DONE pos export */

   bool needs_vcc = false;

   CompilationProgress progress;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics];

   /* state stamped onto each new block by insert_block() */
   float_mode next_fp_mode;
   unsigned next_loop_depth = 0;
   unsigned next_divergent_if_logical_depth = 0;
   unsigned next_uniform_if_depth = 0;

   std::vector<Definition> vs_inputs;

   /* debug-message sink configuration */
   struct {
      FILE* output = stderr;
      bool shorten_messages = false;
      void (*func)(void* private_data, enum aco_compiler_debug_level level, const char* message);
      void* private_data;
   } debug;

   /* Reserves a fresh temp id with register class "rc" and records the class
    * in temp_rc. Ids are limited to 24 bits. */
   uint32_t allocateId(RegClass rc)
   {
      assert(allocationID <= 16777215);
      temp_rc.push_back(rc);
      return allocationID++;
   }

   /* Reserves "amount" consecutive temp ids; their temp_rc entries are
    * default-constructed and presumably filled in later (e.g. by
    * reindex_ssa -- TODO confirm). */
   void allocateRange(unsigned amount)
   {
      assert(allocationID + amount <= 16777216);
      temp_rc.resize(temp_rc.size() + amount);
      allocationID += amount;
   }

   /* Creates a Temp with a freshly allocated id. */
   Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); }

   /* Next id that allocateId() would return, without allocating it. */
   uint32_t peekAllocationId() { return allocationID; }

   friend void reindex_ssa(Program* program);
   friend void reindex_ssa(Program* program, std::vector<IDSet>& live_out);

   /* Appends a default-constructed block to the CFG. */
   Block* create_and_insert_block()
   {
      Block block;
      return insert_block(std::move(block));
   }

   /* Appends "block", stamping its index, fp_mode and nesting depths from the
    * current next_* state. */
   Block* insert_block(Block&& block)
   {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      block.loop_nest_depth = next_loop_depth;
      block.divergent_if_logical_depth = next_divergent_if_logical_depth;
      block.uniform_if_depth = next_uniform_if_depth;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* next unused temp id; id 0 is never handed out */
};
2171
/* Result of live-variable analysis (see live_var_analysis()). */
struct live {
   /* live temps out per block */
   std::vector<IDSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
2178
/* Knobs for exercising register-allocator code paths in tests. */
struct ra_test_policy {
   /* Force RA to always use its pessimistic fallback algorithm */
   bool skip_optimistic_path = false;
};
2183
/* One-time global initialization (e.g. debug flags -- TODO confirm scope). */
void init();

/* Sets up a Program's device/stage/config state before instruction selection. */
void init_program(Program* program, Stage stage, const struct aco_shader_info* info,
                  enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
                  ac_shader_config* config);

/* Instruction selection: NIR -> ACO IR entry points for the different shader
 * kinds. */
void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
                    ac_shader_config* config, const struct aco_compiler_options* options,
                    const struct aco_shader_info* info,
                    const struct radv_shader_args* args);
void select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,
                           const struct aco_compiler_options* options,
                           const struct aco_shader_info* info,
                           const struct radv_shader_args* args);
void select_trap_handler_shader(Program* program, struct nir_shader* shader,
                                ac_shader_config* config,
                                const struct aco_compiler_options* options,
                                const struct aco_shader_info* info,
                                const struct radv_shader_args* args);
void select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key,
                      ac_shader_config* config,
                      const struct aco_compiler_options* options,
                      const struct aco_shader_info* info,
                      const struct radv_shader_args* args,
                      unsigned* num_preserved_sgprs);

void select_ps_epilog(Program* program, const struct aco_ps_epilog_key* key,
                      ac_shader_config* config,
                      const struct aco_compiler_options* options,
                      const struct aco_shader_info* info,
                      const struct radv_shader_args* args);

/* Compilation passes, each transforming or analyzing the whole Program. */
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program);
std::vector<uint16_t> dead_code_analysis(Program* program);
void dominator_tree(Program* program);
void insert_exec_mask(Program* program);
void value_numbering(Program* program);
void optimize(Program* program);
void optimize_postRA(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars);
void register_allocation(Program* program, std::vector<IDSet>& live_out_per_block,
                         ra_test_policy = {});
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
void form_hard_clauses(Program* program);
/* Assembles the program into "code"; returns the exec size -- TODO confirm
 * return semantics. */
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
/**
 * Returns true if print_asm can disassemble the given program for the current build/runtime
 * configuration
 */
bool check_print_asm_support(Program* program);
bool print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output);
/* IR/RA validation; used with the DEBUG_VALIDATE_* debug flags. */
bool validate_ir(Program* program);
bool validate_ra(Program* program);
/* perfwarn() compiles to a no-op in release builds. */
#ifndef NDEBUG
void perfwarn(Program* program, bool cond, const char* msg, Instruction* instr = NULL);
#else
#define perfwarn(program, cond, msg, ...)                                                          \
   do {                                                                                            \
   } while (0)
#endif

/* Statistics collection at the various pipeline points (see enum statistic). */
void collect_presched_stats(Program* program);
void collect_preasm_stats(Program* program);
void collect_postasm_stats(Program* program, const std::vector<uint32_t>& code);
2257
/* Bitmask flags for the aco_print_* helpers below. */
enum print_flags {
   print_no_ssa = 1 << 0,
   print_perf_info = 1 << 1,
   print_kill = 1 << 2,
   print_live_vars = 1 << 3,
};
2264
/* Debug printing of operands, instructions and whole programs; "flags" is a
 * bitmask of print_flags. */
void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0);
void aco_print_instr(const Instruction* instr, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, const live& live_vars,
                       unsigned flags = 0);

/* Implementations behind the aco_perfwarn/aco_err macros; take printf-style
 * format arguments. */
void _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...);
void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...);

#define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
#define aco_err(program, ...)      _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
                                 aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program* program);

/* adjust num_waves for workgroup size and LDS limits */
uint16_t max_suitable_waves(Program* program, uint16_t waves);

/* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves);
2296
/* Per-opcode property tables, indexed by aco_opcode -- presumably generated
 * from the opcode definitions (see aco_opcodes.h); TODO confirm. */
typedef struct {
   /* hardware encodings per generation; negative presumably means
    * "not available" -- TODO confirm */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char* name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const instr_class classes[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

/* The single global instance of the opcode tables. */
extern const Info instr_info;
2312
2313} // namespace aco
2314
2315#endif /* ACO_IR_H */
2316