xref: /third_party/mesa3d/src/amd/compiler/aco_ir.cpp (revision bf215546)
1/*
2 * Copyright © 2020 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25#include "aco_ir.h"
26
27#include "aco_builder.h"
28
29#include "util/debug.h"
30
31#include "c11/threads.h"
32
33namespace aco {
34
/* Bitmask of DEBUG_* flags, populated from the ACO_DEBUG environment
 * variable by init_once(). */
uint64_t debug_flags = 0;

/* Maps ACO_DEBUG option names to their DEBUG_* flag bits. The list is
 * terminated by a NULL sentinel entry for parse_debug_string(). */
static const struct debug_control aco_debug_options[] = {{"validateir", DEBUG_VALIDATE_IR},
                                                         {"validatera", DEBUG_VALIDATE_RA},
                                                         {"perfwarn", DEBUG_PERFWARN},
                                                         {"force-waitcnt", DEBUG_FORCE_WAITCNT},
                                                         {"novn", DEBUG_NO_VN},
                                                         {"noopt", DEBUG_NO_OPT},
                                                         {"nosched", DEBUG_NO_SCHED},
                                                         {"perfinfo", DEBUG_PERF_INFO},
                                                         {"liveinfo", DEBUG_LIVE_INFO},
                                                         {NULL, 0}};
47
/* Guards the one-time initialization performed through init(). */
static once_flag init_once_flag = ONCE_FLAG_INIT;

/* Parses the ACO_DEBUG environment variable into debug_flags.
 * Runs exactly once, via call_once() from init(). */
static void
init_once()
{
   debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options);

#ifndef NDEBUG
   /* enable some flags by default on debug builds */
   debug_flags |= aco::DEBUG_VALIDATE_IR;
#endif
}
60
/* Thread-safe global ACO initialization. Safe to call any number of times;
 * init_once() only runs on the first call. */
void
init()
{
   call_once(&init_once_flag, init_once);
}
66
/* Initializes all program-wide and device-specific fields of "program" for
 * the given shader stage and target GPU. Must be called before IR is built. */
void
init_program(Program* program, Stage stage, const struct aco_shader_info* info,
             enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
             ac_shader_config* config)
{
   program->stage = stage;
   program->config = config;
   program->info = *info;
   program->gfx_level = gfx_level;
   if (family == CHIP_UNKNOWN) {
      /* no specific chip given: pick a representative family for the gfx
       * level so the family-based checks below behave consistently */
      switch (gfx_level) {
      case GFX6: program->family = CHIP_TAHITI; break;
      case GFX7: program->family = CHIP_BONAIRE; break;
      case GFX8: program->family = CHIP_POLARIS10; break;
      case GFX9: program->family = CHIP_VEGA10; break;
      case GFX10: program->family = CHIP_NAVI10; break;
      default: program->family = CHIP_UNKNOWN; break;
      }
   } else {
      program->family = family;
   }
   program->wave_size = info->wave_size;
   /* a lane mask needs one SGPR per 32 lanes */
   program->lane_mask = program->wave_size == 32 ? s1 : s2;

   /* LDS granularities (bytes) for size encoding and allocation */
   program->dev.lds_encoding_granule = gfx_level >= GFX11 && stage == fragment_fs ? 1024 :
                                       gfx_level >= GFX7 ? 512 : 256;
   program->dev.lds_alloc_granule = gfx_level >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;
   program->dev.lds_limit = gfx_level >= GFX7 ? 65536 : 32768;
   /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
   program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;

   /* register-file defaults; refined per gfx level below */
   program->dev.vgpr_limit = 256;
   program->dev.physical_vgprs = 256;
   program->dev.vgpr_alloc_granule = 4;

   if (gfx_level >= GFX10) {
      program->dev.physical_sgprs = 5120; /* doesn't matter as long as it's at least 128 * 40 */
      program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
      program->dev.sgpr_alloc_granule = 128;
      program->dev.sgpr_limit =
         108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */
      if (gfx_level == GFX10_3)
         program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
      else
         program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
   } else if (program->gfx_level >= GFX8) {
      program->dev.physical_sgprs = 800;
      program->dev.sgpr_alloc_granule = 16;
      program->dev.sgpr_limit = 102;
      if (family == CHIP_TONGA || family == CHIP_ICELAND)
         program->dev.sgpr_alloc_granule = 96; /* workaround hardware bug */
   } else {
      program->dev.physical_sgprs = 512;
      program->dev.sgpr_alloc_granule = 8;
      program->dev.sgpr_limit = 104;
   }

   /* maximum number of wave64 waves per SIMD */
   program->dev.max_wave64_per_simd = 10;
   if (program->gfx_level >= GFX10_3)
      program->dev.max_wave64_per_simd = 16;
   else if (program->gfx_level == GFX10)
      program->dev.max_wave64_per_simd = 20;
   else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM)
      program->dev.max_wave64_per_simd = 8;

   program->dev.simd_per_cu = program->gfx_level >= GFX10 ? 2 : 4;

   switch (program->family) {
   /* GFX8 APUs */
   case CHIP_CARRIZO:
   case CHIP_STONEY:
   /* GFX9 APUS */
   case CHIP_RAVEN:
   case CHIP_RAVEN2:
   case CHIP_RENOIR: program->dev.xnack_enabled = true; break;
   default: break;
   }

   program->dev.sram_ecc_enabled = program->family == CHIP_ARCTURUS;
   /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
   program->dev.has_fast_fma32 = program->gfx_level >= GFX9;
   if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO ||
       program->family == CHIP_HAWAII)
      program->dev.has_fast_fma32 = true;
   program->dev.has_mac_legacy32 = program->gfx_level <= GFX7 || program->gfx_level >= GFX10;

   program->dev.fused_mad_mix = program->gfx_level >= GFX10;
   if (program->family == CHIP_VEGA12 || program->family == CHIP_VEGA20 ||
       program->family == CHIP_ARCTURUS || program->family == CHIP_ALDEBARAN)
      program->dev.fused_mad_mix = true;

   /* valid immediate offset range for scratch/global memory instructions */
   if (program->gfx_level >= GFX11) {
      program->dev.scratch_global_offset_min = -4096;
      program->dev.scratch_global_offset_max = 4095;
   } else if (program->gfx_level >= GFX10 || program->gfx_level == GFX8) {
      program->dev.scratch_global_offset_min = -2048;
      program->dev.scratch_global_offset_max = 2047;
   } else if (program->gfx_level == GFX9) {
      /* The minimum is actually -4096, but negative offsets are broken when SADDR is used. */
      program->dev.scratch_global_offset_min = 0;
      program->dev.scratch_global_offset_max = 4095;
   }

   program->wgp_mode = wgp_mode;

   program->progress = CompilationProgress::after_isel;

   /* default FP mode; later code may adjust next_fp_mode before emitting */
   program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
   program->next_fp_mode.must_flush_denorms32 = false;
   program->next_fp_mode.must_flush_denorms16_64 = false;
   program->next_fp_mode.care_about_round32 = false;
   program->next_fp_mode.care_about_round16_64 = false;
   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
   program->next_fp_mode.denorm32 = 0;
   program->next_fp_mode.round16_64 = fp_round_ne;
   program->next_fp_mode.round32 = fp_round_ne;
}
185
186memory_sync_info
187get_sync_info(const Instruction* instr)
188{
189   switch (instr->format) {
190   case Format::SMEM: return instr->smem().sync;
191   case Format::MUBUF: return instr->mubuf().sync;
192   case Format::MIMG: return instr->mimg().sync;
193   case Format::MTBUF: return instr->mtbuf().sync;
194   case Format::FLAT:
195   case Format::GLOBAL:
196   case Format::SCRATCH: return instr->flatlike().sync;
197   case Format::DS: return instr->ds().sync;
198   default: return memory_sync_info();
199   }
200}
201
/* Returns whether "instr" can be encoded as (or converted to) an SDWA
 * instruction on the given gfx level. With pre_ra=true, register-placement
 * checks that register allocation can still satisfy are skipped. */
bool
can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra)
{
   if (!instr->isVALU())
      return false;

   /* SDWA only exists on GFX8-GFX10 and is mutually exclusive with DPP/VOP3P */
   if (gfx_level < GFX8 || gfx_level >= GFX11 || instr->isDPP() || instr->isVOP3P())
      return false;

   if (instr->isSDWA())
      return true;

   if (instr->isVOP3()) {
      VOP3_instruction& vop3 = instr->vop3();
      /* instructions whose only encoding is VOP3 can't become SDWA */
      if (instr->format == Format::VOP3)
         return false;
      if (vop3.clamp && instr->isVOPC() && gfx_level != GFX8)
         return false;
      if (vop3.omod && gfx_level < GFX9)
         return false;

      // TODO: return true if we know we will use vcc
      if (!pre_ra && instr->definitions.size() >= 2)
         return false;

      for (unsigned i = 1; i < instr->operands.size(); i++) {
         /* SDWA can't encode literals; pre-GFX9 all operands must be VGPRs */
         if (instr->operands[i].isLiteral())
            return false;
         if (gfx_level < GFX9 && !instr->operands[i].isOfType(RegType::vgpr))
            return false;
      }
   }

   /* destinations larger than a dword are rejected (except VOPC lane masks) */
   if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4 && !instr->isVOPC())
      return false;

   if (!instr->operands.empty()) {
      if (instr->operands[0].isLiteral())
         return false;
      if (gfx_level < GFX9 && !instr->operands[0].isOfType(RegType::vgpr))
         return false;
      /* SDWA source operands are at most one dword */
      if (instr->operands[0].bytes() > 4)
         return false;
      if (instr->operands.size() > 1 && instr->operands[1].bytes() > 4)
         return false;
   }

   bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
                 instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16;

   /* the mac/fmac variants are only accepted on GFX8 */
   if (gfx_level != GFX8 && is_mac)
      return false;

   // TODO: return true if we know we will use vcc
   if (!pre_ra && instr->isVOPC() && gfx_level == GFX8)
      return false;
   if (!pre_ra && instr->operands.size() >= 3 && !is_mac)
      return false;

   /* opcodes with no SDWA encoding at all */
   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
}
266
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction>
convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
{
   if (instr->isSDWA())
      return NULL;

   /* re-create the instruction with the SDWA bit set (and VOP3 cleared)
    * in its format, then copy operands and definitions over */
   aco_ptr<Instruction> tmp = std::move(instr);
   Format format =
      (Format)(((uint16_t)tmp->format & ~(uint16_t)Format::VOP3) | (uint16_t)Format::SDWA);
   instr.reset(create_instruction<SDWA_instruction>(tmp->opcode, format, tmp->operands.size(),
                                                    tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());

   SDWA_instruction& sdwa = instr->sdwa();

   /* carry over VOP3 modifiers, which SDWA can also encode */
   if (tmp->isVOP3()) {
      VOP3_instruction& vop3 = tmp->vop3();
      memcpy(sdwa.neg, vop3.neg, sizeof(sdwa.neg));
      memcpy(sdwa.abs, vop3.abs, sizeof(sdwa.abs));
      sdwa.omod = vop3.omod;
      sdwa.clamp = vop3.clamp;
   }

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      /* SDWA only uses operands 0 and 1. */
      if (i >= 2)
         break;

      /* default selection: the operand's full size, starting at byte 0 */
      sdwa.sel[i] = SubdwordSel(instr->operands[i].bytes(), 0, false);
   }

   sdwa.dst_sel = SubdwordSel(instr->definitions[0].bytes(), 0, false);

   /* pin SGPR results (GFX8) and any carry in/out to vcc */
   if (instr->definitions[0].getTemp().type() == RegType::sgpr && gfx_level == GFX8)
      instr->definitions[0].setFixed(vcc);
   if (instr->definitions.size() >= 2)
      instr->definitions[1].setFixed(vcc);
   if (instr->operands.size() >= 3)
      instr->operands[2].setFixed(vcc);

   instr->pass_flags = tmp->pass_flags;

   return tmp;
}
313
/* Returns whether "instr" can be encoded as (or converted to) a DPP8 or
 * DPP16 instruction. With pre_ra=true, register-placement checks that
 * register allocation can still satisfy are skipped. */
bool
can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8)
{
   assert(instr->isVALU() && !instr->operands.empty());

   /* already DPP: only usable if it is the requested DPP variant */
   if (instr->isDPP())
      return instr->isDPP8() == dpp8;

   /* DPP can't encode a literal first operand */
   if (instr->operands.size() && instr->operands[0].isLiteral())
      return false;

   if (instr->isSDWA())
      return false;

   /* after RA, a compare/carry-out definition must already be in vcc */
   if (!pre_ra && (instr->isVOPC() || instr->definitions.size() > 1) &&
       instr->definitions.back().physReg() != vcc)
      return false;

   /* after RA, a carry-in (third operand) must already be in vcc */
   if (!pre_ra && instr->operands.size() >= 3 && instr->operands[2].physReg() != vcc)
      return false;

   if (instr->isVOP3()) {
      const VOP3_instruction* vop3 = &instr->vop3();
      /* VOP3 output modifiers/opsel can't be combined with DPP here */
      if (vop3->clamp || vop3->omod || vop3->opsel)
         return false;
      if (dpp8)
         return false;
      /* instructions whose only encoding is VOP3 can't become DPP */
      if (instr->format == Format::VOP3)
         return false;
      if (instr->operands.size() > 1 && !instr->operands[1].isOfType(RegType::vgpr))
         return false;
   }

   /* there are more cases but those all take 64-bit inputs */
   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_cvt_f64_i32 &&
          instr->opcode != aco_opcode::v_cvt_f64_f32 && instr->opcode != aco_opcode::v_cvt_f64_u32;
}
354
/* Rewrites "instr" in place as a DPP8 or DPP16 instruction with an identity
 * swizzle; returns the old instruction (or NULL if it was already DPP). */
aco_ptr<Instruction>
convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8)
{
   if (instr->isDPP())
      return NULL;

   /* re-create the instruction with the requested DPP bit set (and VOP3
    * cleared) in its format, then copy operands and definitions over */
   aco_ptr<Instruction> tmp = std::move(instr);
   Format format = (Format)(((uint32_t)tmp->format & ~(uint32_t)Format::VOP3) |
                            (dpp8 ? (uint32_t)Format::DPP8 : (uint32_t)Format::DPP16));
   if (dpp8)
      instr.reset(create_instruction<DPP8_instruction>(tmp->opcode, format, tmp->operands.size(),
                                                       tmp->definitions.size()));
   else
      instr.reset(create_instruction<DPP16_instruction>(tmp->opcode, format, tmp->operands.size(),
                                                        tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   for (unsigned i = 0; i < instr->definitions.size(); i++)
      instr->definitions[i] = tmp->definitions[i];

   if (dpp8) {
      /* identity lane selection: each of the 8 lanes reads itself */
      DPP8_instruction* dpp = &instr->dpp8();
      for (unsigned i = 0; i < 8; i++)
         dpp->lane_sel[i] = i;
   } else {
      /* identity quad permutation with all rows and banks enabled */
      DPP16_instruction* dpp = &instr->dpp16();
      dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
      dpp->row_mask = 0xf;
      dpp->bank_mask = 0xf;

      /* carry over VOP3 input modifiers, which DPP16 can also encode */
      if (tmp->isVOP3()) {
         const VOP3_instruction* vop3 = &tmp->vop3();
         memcpy(dpp->neg, vop3->neg, sizeof(dpp->neg));
         memcpy(dpp->abs, vop3->abs, sizeof(dpp->abs));
      }
   }

   /* pin compare/carry-out results and any carry-in to vcc */
   if (instr->isVOPC() || instr->definitions.size() > 1)
      instr->definitions.back().setFixed(vcc);

   if (instr->operands.size() >= 3)
      instr->operands[2].setFixed(vcc);

   instr->pass_flags = tmp->pass_flags;

   return tmp;
}
401
/* Returns whether index "idx" of opcode "op" supports the opsel bit
 * (selecting the high instead of the low half of a 32-bit register).
 * NOTE(review): idx == -1 appears to refer to the definition (see the
 * can_use_opsel(gfx_level, op, -1) use in instr_is_16bit) — confirm. */
bool
can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx)
{
   /* opsel is only GFX9+ */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   /* opsel supported on all operands and the definition */
   case aco_opcode::v_div_fixup_f16:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_med3_i16:
   case aco_opcode::v_med3_u16:
   case aco_opcode::v_min3_f16:
   case aco_opcode::v_min3_i16:
   case aco_opcode::v_min3_u16:
   case aco_opcode::v_max3_f16:
   case aco_opcode::v_max3_i16:
   case aco_opcode::v_max3_u16:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_u16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_sub_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_sub_u16_e64:
   case aco_opcode::v_lshlrev_b16_e64:
   case aco_opcode::v_lshrrev_b16_e64:
   case aco_opcode::v_ashrrev_i16_e64:
   case aco_opcode::v_mul_lo_u16_e64: return true;
   /* opsel supported everywhere except at idx == -1 */
   case aco_opcode::v_pack_b32_f16:
   case aco_opcode::v_cvt_pknorm_i16_f16:
   case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1;
   /* opsel supported only on the first two operands */
   case aco_opcode::v_mad_u32_u16:
   case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
   default: return false;
   }
}
444
/* Returns whether "op" writes only 16 bits of its 32-bit destination VGPR
 * (a partial register write) on the given gfx level. */
bool
instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
{
   /* partial register writes are GFX9+, only */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   /* VOP3 */
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_div_fixup_f16:
   case aco_opcode::v_interp_p2_f16:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   /* VOP2 */
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_madak_f16:
   case aco_opcode::v_madmk_f16: return gfx_level >= GFX9;
   /* these only write 16 bits starting with GFX10 */
   case aco_opcode::v_add_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_ldexp_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_fmaak_f16:
   /* VOP1 */
   case aco_opcode::v_cvt_f16_f32:
   case aco_opcode::v_cvt_f16_u16:
   case aco_opcode::v_cvt_f16_i16:
   case aco_opcode::v_rcp_f16:
   case aco_opcode::v_sqrt_f16:
   case aco_opcode::v_rsq_f16:
   case aco_opcode::v_log_f16:
   case aco_opcode::v_exp_f16:
   case aco_opcode::v_frexp_mant_f16:
   case aco_opcode::v_frexp_exp_i16_f16:
   case aco_opcode::v_floor_f16:
   case aco_opcode::v_ceil_f16:
   case aco_opcode::v_trunc_f16:
   case aco_opcode::v_rndne_f16:
   case aco_opcode::v_fract_f16:
   case aco_opcode::v_sin_f16:
   case aco_opcode::v_cos_f16: return gfx_level >= GFX10;
   // TODO: confirm whether these write 16 or 32 bit on GFX10+
   // case aco_opcode::v_cvt_u16_f16:
   // case aco_opcode::v_cvt_i16_f16:
   // case aco_opcode::p_cvt_f16_f32_rtne:
   // case aco_opcode::v_cvt_norm_i16_f16:
   // case aco_opcode::v_cvt_norm_u16_f16:
   /* on GFX10, all opsel instructions preserve the high bits */
   default: return gfx_level >= GFX10 && can_use_opsel(gfx_level, op, -1);
   }
}
504
/* Returns the identity element for reduction "op": the value that leaves a
 * result unchanged when combined with it. For 64-bit operations, "idx"
 * selects which dword of the identity is returned (0 = low, 1 = high). */
uint32_t
get_reduction_identity(ReduceOp op, unsigned idx)
{
   switch (op) {
   case iadd8:
   case iadd16:
   case iadd32:
   case iadd64:
   case fadd16:
   case fadd32:
   case fadd64:
   case ior8:
   case ior16:
   case ior32:
   case ior64:
   case ixor8:
   case ixor16:
   case ixor32:
   case ixor64:
   case umax8:
   case umax16:
   case umax32:
   case umax64: return 0;
   case imul8:
   case imul16:
   case imul32:
   case imul64: return idx ? 0 : 1;
   case fmul16: return 0x3c00u;                /* 1.0 */
   case fmul32: return 0x3f800000u;            /* 1.0 */
   case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */
   case imin8: return INT8_MAX;
   case imin16: return INT16_MAX;
   case imin32: return INT32_MAX;
   case imin64: return idx ? 0x7fffffffu : 0xffffffffu;
   case imax8: return INT8_MIN;
   case imax16: return INT16_MIN;
   case imax32: return INT32_MIN;
   case imax64: return idx ? 0x80000000u : 0;
   /* sub-dword umin/iand identities are all-ones within the element */
   case umin8:
   case umin16:
   case iand8:
   case iand16: return 0xffffffffu;
   case umin32:
   case umin64:
   case iand32:
   case iand64: return 0xffffffffu;
   case fmin16: return 0x7c00u;                /* infinity */
   case fmin32: return 0x7f800000u;            /* infinity */
   case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */
   case fmax16: return 0xfc00u;                /* negative infinity */
   case fmax32: return 0xff800000u;            /* negative infinity */
   case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */
   default: unreachable("Invalid reduction operation"); break;
   }
   return 0;
}
561
/* Returns whether the behavior of "instr" depends on the exec mask, i.e.
 * whether it must not be moved across instructions that modify exec. */
bool
needs_exec_mask(const Instruction* instr)
{
   /* VALU is implicitly predicated by exec, except the lane-access
    * instructions which address specific lanes directly */
   if (instr->isVALU()) {
      return instr->opcode != aco_opcode::v_readlane_b32 &&
             instr->opcode != aco_opcode::v_readlane_b32_e64 &&
             instr->opcode != aco_opcode::v_writelane_b32 &&
             instr->opcode != aco_opcode::v_writelane_b32_e64;
   }

   /* VMEM and FLAT-like accesses are per-lane and thus masked by exec */
   if (instr->isVMEM() || instr->isFlatLike())
      return true;

   /* scalar instructions only depend on exec if they read it explicitly */
   if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier())
      return instr->reads_exec();

   if (instr->isPseudo()) {
      switch (instr->opcode) {
      case aco_opcode::p_create_vector:
      case aco_opcode::p_extract_vector:
      case aco_opcode::p_split_vector:
      case aco_opcode::p_phi:
      case aco_opcode::p_parallelcopy:
         /* these need exec if they produce any per-lane (VGPR) value */
         for (Definition def : instr->definitions) {
            if (def.getTemp().type() == RegType::vgpr)
               return true;
         }
         return instr->reads_exec();
      case aco_opcode::p_spill:
      case aco_opcode::p_reload:
      case aco_opcode::p_end_linear_vgpr:
      case aco_opcode::p_logical_start:
      case aco_opcode::p_logical_end:
      case aco_opcode::p_startpgm:
      case aco_opcode::p_init_scratch: return instr->reads_exec();
      default: break;
      }
   }

   /* conservatively assume everything else needs exec */
   return true;
}
603
/* Describes the variants of a comparison opcode, as filled in by
 * get_cmp_info(). Variants that don't exist are set to num_opcodes. */
struct CmpInfo {
   aco_opcode ordered;   /* the ordered float comparison */
   aco_opcode unordered; /* the unordered (n-prefixed) float comparison */
   aco_opcode swapped;   /* the same comparison with operands exchanged */
   aco_opcode inverse;   /* the logically inverted comparison */
   aco_opcode vcmpx;     /* the v_cmpx variant, which also writes exec */
   aco_opcode f32;       /* the equivalent comparison on 32-bit floats */
   unsigned size;        /* operand bit size: 16, 32 or 64 */
};
613
/* Fills "info" with the variant opcodes of comparison "op" and returns true,
 * or returns false if "op" is not a comparison. The case tables are built by
 * macros: CMP2/CMP for float comparisons (ordered + unordered pairs),
 * ORD_TEST for the v_cmp_o/v_cmp_u ordered-ness tests, CMPI2/CMPI for
 * integer comparisons and CMPCLASS for v_cmp_class. Fields not covered by a
 * matching case keep the num_opcodes default set below. */
ALWAYS_INLINE bool
get_cmp_info(aco_opcode op, CmpInfo* info)
{
   info->ordered = aco_opcode::num_opcodes;
   info->unordered = aco_opcode::num_opcodes;
   info->swapped = aco_opcode::num_opcodes;
   info->inverse = aco_opcode::num_opcodes;
   info->f32 = aco_opcode::num_opcodes;
   switch (op) {
      // clang-format off
#define CMP2(ord, unord, ord_swap, unord_swap, sz)                                                 \
   case aco_opcode::v_cmp_##ord##_f##sz:                                                           \
   case aco_opcode::v_cmp_n##unord##_f##sz:                                                        \
      info->ordered = aco_opcode::v_cmp_##ord##_f##sz;                                             \
      info->unordered = aco_opcode::v_cmp_n##unord##_f##sz;                                        \
      info->swapped = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord_swap##_f##sz \
                                                      : aco_opcode::v_cmp_n##unord_swap##_f##sz;   \
      info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \
                                                               : aco_opcode::v_cmp_n##ord##_f##sz; \
      info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32            \
                                                        : aco_opcode::v_cmp_n##unord##_f32;        \
      info->vcmpx = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmpx_##ord##_f##sz       \
                                                          : aco_opcode::v_cmpx_n##unord##_f##sz;   \
      info->size = sz;                                                                             \
      return true;
#define CMP(ord, unord, ord_swap, unord_swap)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 16)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 32)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 64)
      CMP(lt, /*n*/ge, gt, /*n*/le)
      CMP(eq, /*n*/lg, eq, /*n*/lg)
      CMP(le, /*n*/gt, ge, /*n*/lt)
      CMP(gt, /*n*/le, lt, /*n*/ge)
      CMP(lg, /*n*/eq, lg, /*n*/eq)
      CMP(ge, /*n*/lt, le, /*n*/gt)
#undef CMP
#undef CMP2
#define ORD_TEST(sz)                                                                               \
   case aco_opcode::v_cmp_u_f##sz:                                                                 \
      info->f32 = aco_opcode::v_cmp_u_f32;                                                         \
      info->swapped = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->inverse = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->vcmpx = aco_opcode::v_cmpx_u_f##sz;                                                    \
      info->size = sz;                                                                             \
      return true;                                                                                 \
   case aco_opcode::v_cmp_o_f##sz:                                                                 \
      info->f32 = aco_opcode::v_cmp_o_f32;                                                         \
      info->swapped = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->inverse = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->vcmpx = aco_opcode::v_cmpx_o_f##sz;                                                    \
      info->size = sz;                                                                             \
      return true;
      ORD_TEST(16)
      ORD_TEST(32)
      ORD_TEST(64)
#undef ORD_TEST
#define CMPI2(op, swap, inv, type, sz)                                                             \
   case aco_opcode::v_cmp_##op##_##type##sz:                                                       \
      info->swapped = aco_opcode::v_cmp_##swap##_##type##sz;                                       \
      info->inverse = aco_opcode::v_cmp_##inv##_##type##sz;                                        \
      info->vcmpx = aco_opcode::v_cmpx_##op##_##type##sz;                                          \
      info->size = sz;                                                                             \
      return true;
#define CMPI(op, swap, inv)                                                                        \
   CMPI2(op, swap, inv, i, 16)                                                                     \
   CMPI2(op, swap, inv, u, 16)                                                                     \
   CMPI2(op, swap, inv, i, 32)                                                                     \
   CMPI2(op, swap, inv, u, 32)                                                                     \
   CMPI2(op, swap, inv, i, 64)                                                                     \
   CMPI2(op, swap, inv, u, 64)
      CMPI(lt, gt, ge)
      CMPI(eq, eq, lg)
      CMPI(le, ge, gt)
      CMPI(gt, lt, le)
      CMPI(lg, lg, eq)
      CMPI(ge, le, lt)
#undef CMPI
#undef CMPI2
#define CMPCLASS(sz)                                                                               \
   case aco_opcode::v_cmp_class_f##sz:                                                             \
      info->vcmpx = aco_opcode::v_cmpx_class_f##sz;                                                \
      info->size = sz;                                                                             \
      return true;
      CMPCLASS(16)
      CMPCLASS(32)
      CMPCLASS(64)
#undef CMPCLASS
      // clang-format on
   default: return false;
   }
}
705
706aco_opcode
707get_ordered(aco_opcode op)
708{
709   CmpInfo info;
710   return get_cmp_info(op, &info) ? info.ordered : aco_opcode::num_opcodes;
711}
712
713aco_opcode
714get_unordered(aco_opcode op)
715{
716   CmpInfo info;
717   return get_cmp_info(op, &info) ? info.unordered : aco_opcode::num_opcodes;
718}
719
720aco_opcode
721get_inverse(aco_opcode op)
722{
723   CmpInfo info;
724   return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes;
725}
726
727aco_opcode
728get_f32_cmp(aco_opcode op)
729{
730   CmpInfo info;
731   return get_cmp_info(op, &info) ? info.f32 : aco_opcode::num_opcodes;
732}
733
734aco_opcode
735get_vcmpx(aco_opcode op)
736{
737   CmpInfo info;
738   return get_cmp_info(op, &info) ? info.vcmpx : aco_opcode::num_opcodes;
739}
740
741unsigned
742get_cmp_bitsize(aco_opcode op)
743{
744   CmpInfo info;
745   return get_cmp_info(op, &info) ? info.size : 0;
746}
747
748bool
749is_cmp(aco_opcode op)
750{
751   CmpInfo info;
752   return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes;
753}
754
/* Checks whether the first two operands of "instr" may be exchanged. On
 * success, stores in *new_op the opcode to use after swapping: the same
 * opcode for commutative operations, the *rev* form for subtractions, or
 * the mirrored comparison for v_cmp instructions. */
bool
can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op)
{
   if (instr->isDPP())
      return false;

   /* don't swap if src0 is a constant or SGPR: swapping would move it into
    * src1, which presumably can't encode it — NOTE(review): confirm */
   if (instr->operands[0].isConstant() ||
       (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr))
      return false;

   switch (instr->opcode) {
   /* commutative operations: the opcode stays unchanged */
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::v_add_i32:
   case aco_opcode::v_add_f16:
   case aco_opcode::v_add_f32:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_or_b32:
   case aco_opcode::v_and_b32:
   case aco_opcode::v_xor_b32:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_min_f32:
   case aco_opcode::v_max_i32:
   case aco_opcode::v_min_i32:
   case aco_opcode::v_max_u32:
   case aco_opcode::v_min_u32:
   case aco_opcode::v_max_i16:
   case aco_opcode::v_min_i16:
   case aco_opcode::v_max_u16:
   case aco_opcode::v_min_u16:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_min_u16_e64: *new_op = instr->opcode; return true;
   /* subtractions swap to their reverse-operand forms */
   case aco_opcode::v_sub_f16: *new_op = aco_opcode::v_subrev_f16; return true;
   case aco_opcode::v_sub_f32: *new_op = aco_opcode::v_subrev_f32; return true;
   case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true;
   case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true;
   case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true;
   default: {
      /* comparisons swap to the mirrored comparison, if one exists */
      CmpInfo info;
      if (get_cmp_info(instr->opcode, &info) && info.swapped != aco_opcode::num_opcodes) {
         *new_op = info.swapped;
         return true;
      }
      return false;
   }
   }
}
808
/* Default constructor: all counters unset, i.e. no wait required on any queue. */
wait_imm::wait_imm() : vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter)
{}
/* Construct from explicit counter values (use unset_counter for "don't care"). */
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
    : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_)
{}
814
815wait_imm::wait_imm(enum amd_gfx_level gfx_level, uint16_t packed) : vs(unset_counter)
816{
817   vm = packed & 0xf;
818   if (gfx_level >= GFX9)
819      vm |= (packed >> 10) & 0x30;
820
821   exp = (packed >> 4) & 0x7;
822
823   lgkm = (packed >> 8) & 0xf;
824   if (gfx_level >= GFX10)
825      lgkm |= (packed >> 8) & 0x30;
826}
827
/* Encode the counters into the s_waitcnt immediate format of the given GPU
 * generation. Unset counters (all-ones) saturate their field via the masks,
 * which encodes "no wait" for that queue. */
uint16_t
wait_imm::pack(enum amd_gfx_level gfx_level) const
{
   uint16_t imm = 0;
   assert(exp == unset_counter || exp <= 0x7);
   switch (gfx_level) {
   case GFX11:
      /* GFX11: vm at [15:10], lgkm at [9:4], exp at [2:0]. */
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x3f) << 10) | ((lgkm & 0x3f) << 4) | (exp & 0x7);
      break;
   case GFX10:
   case GFX10_3:
      /* GFX10: vm split into [15:14]|[3:0], lgkm 6 bits at [13:8], exp at [6:4]. */
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
      break;
   case GFX9:
      /* GFX9: like GFX10 but lgkm is only 4 bits at [11:8]. */
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
      break;
   default:
      /* Pre-GFX9: vm is only 4 bits at [3:0]. */
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0xf);
      imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
      break;
   }
   /* Force the bits that only exist on newer generations to all-ones when the
    * corresponding counter is unset, so the resulting immediate can later be
    * interpreted without knowing which generation produced it. */
   if (gfx_level < GFX9 && vm == wait_imm::unset_counter)
      imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   if (gfx_level < GFX10 && lgkm == wait_imm::unset_counter)
      imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   return imm;
}
864
865bool
866wait_imm::combine(const wait_imm& other)
867{
868   bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs;
869   vm = std::min(vm, other.vm);
870   exp = std::min(exp, other.exp);
871   lgkm = std::min(lgkm, other.lgkm);
872   vs = std::min(vs, other.vs);
873   return changed;
874}
875
876bool
877wait_imm::empty() const
878{
879   return vm == unset_counter && exp == unset_counter && lgkm == unset_counter &&
880          vs == unset_counter;
881}
882
883bool
884should_form_clause(const Instruction* a, const Instruction* b)
885{
886   /* Vertex attribute loads from the same binding likely load from similar addresses */
887   unsigned a_vtx_binding =
888      a->isMUBUF() ? a->mubuf().vtx_binding : (a->isMTBUF() ? a->mtbuf().vtx_binding : 0);
889   unsigned b_vtx_binding =
890      b->isMUBUF() ? b->mubuf().vtx_binding : (b->isMTBUF() ? b->mtbuf().vtx_binding : 0);
891   if (a_vtx_binding && a_vtx_binding == b_vtx_binding)
892      return true;
893
894   if (a->format != b->format)
895      return false;
896
897   /* Assume loads which don't use descriptors might load from similar addresses. */
898   if (a->isFlatLike())
899      return true;
900   if (a->isSMEM() && a->operands[0].bytes() == 8 && b->operands[0].bytes() == 8)
901      return true;
902
903   /* If they load from the same descriptor, assume they might load from similar
904    * addresses.
905    */
906   if (a->isVMEM() || a->isSMEM())
907      return a->operands[0].tempId() == b->operands[0].tempId();
908
909   return false;
910}
911
912} // namespace aco
913