1/*
2 * Copyright 2014 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sub license, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
15 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
16 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
17 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
18 * USE OR OTHER DEALINGS IN THE SOFTWARE.
19 *
20 * The above copyright notice and this permission notice (including the
21 * next paragraph) shall be included in all copies or substantial portions
22 * of the Software.
23 *
24 */
25/* based on pieces from si_pipe.c and radeon_llvm_emit.c */
26#include "ac_llvm_build.h"
27
28#include "ac_nir.h"
29#include "ac_llvm_util.h"
30#include "ac_shader_util.h"
31#include "c11/threads.h"
32#include "shader_enums.h"
33#include "sid.h"
34#include "util/bitscan.h"
35#include "util/macros.h"
36#include "util/u_atomic.h"
37#include "util/u_math.h"
38#include <llvm-c/Core.h>
39#include <llvm/Config/llvm-config.h>
40
41#include <assert.h>
42#include <stdio.h>
43
44#define AC_LLVM_INITIAL_CF_DEPTH 4
45
/* Data for if/else/endif and bgnloop/endloop control flow structures.
 */
struct ac_llvm_flow {
   /* Loop exit or next part of if/else/endif. */
   LLVMBasicBlockRef next_block;
   /* First block of the loop body; NULL for if/else constructs —
    * NOTE(review): presumably, confirm against the flow push/pop code. */
   LLVMBasicBlockRef loop_entry_block;
};
53
54/* Initialize module-independent parts of the context.
55 *
56 * The caller is responsible for initializing ctx::module and ctx::builder.
57 */
/* Initialize module-independent parts of the context, including the LLVM
 * context, module and builder, the cached types/constants, and the control
 * flow stack.
 *
 * (The old comment claimed the caller initializes ctx::module and
 * ctx::builder; they are in fact created here.)
 */
void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,
                          enum amd_gfx_level gfx_level, enum radeon_family family,
                          bool has_3d_cube_border_color_mipmap,
                          enum ac_float_mode float_mode, unsigned wave_size,
                          unsigned ballot_mask_bits)
{
   ctx->context = LLVMContextCreate();
   /* LLVM 15 defaults to opaque pointers; this code still relies on typed
    * pointers, so turn them off explicitly. */
   #if LLVM_VERSION_MAJOR >= 15
   LLVMContextSetOpaquePointers(ctx->context, false);
   #endif

   /* Target/configuration state. */
   ctx->gfx_level = gfx_level;
   ctx->family = family;
   ctx->has_3d_cube_border_color_mipmap = has_3d_cube_border_color_mipmap;
   ctx->wave_size = wave_size;
   ctx->ballot_mask_bits = ballot_mask_bits;
   ctx->float_mode = float_mode;
   ctx->module = ac_create_module(compiler->tm, ctx->context);
   ctx->builder = ac_create_builder(ctx->context, float_mode);

   /* Cached scalar and vector types used throughout IR emission. */
   ctx->voidt = LLVMVoidTypeInContext(ctx->context);
   ctx->i1 = LLVMInt1TypeInContext(ctx->context);
   ctx->i8 = LLVMInt8TypeInContext(ctx->context);
   ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
   ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
   ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
   ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);
   /* NOTE(review): intptr is i32 — presumably for 32-bit offset
    * arithmetic; confirm before using it for full 64-bit pointers. */
   ctx->intptr = ctx->i32;
   ctx->f16 = LLVMHalfTypeInContext(ctx->context);
   ctx->f32 = LLVMFloatTypeInContext(ctx->context);
   ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
   ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
   ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
   ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
   ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
   ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
   ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
   ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
   ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
   ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
   ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
   ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
   /* Mask types sized by the wave (32/64 lanes) and the ballot width. */
   ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
   ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);

   /* Frequently used constants. */
   ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
   ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
   ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
   ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
   ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
   ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
   ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
   ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
   ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);
   ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);
   ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
   ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
   ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
   ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
   ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
   ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);

   ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
   ctx->i1true = LLVMConstInt(ctx->i1, 1, false);

   /* Metadata kind IDs used to annotate instructions (the integer is the
    * strlen of the kind name). */
   ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);

   ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);

   ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);

   ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
   /* Control-flow stack starts empty; freed in ac_llvm_context_dispose(). */
   ctx->flow = calloc(1, sizeof(*ctx->flow));
}
132
133void ac_llvm_context_dispose(struct ac_llvm_context *ctx)
134{
135   free(ctx->flow->stack);
136   free(ctx->flow);
137   ctx->flow = NULL;
138}
139
140int ac_get_llvm_num_components(LLVMValueRef value)
141{
142   LLVMTypeRef type = LLVMTypeOf(value);
143   unsigned num_components =
144      LLVMGetTypeKind(type) == LLVMVectorTypeKind ? LLVMGetVectorSize(type) : 1;
145   return num_components;
146}
147
148LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)
149{
150   if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
151      assert(index == 0);
152      return value;
153   }
154
155   return LLVMBuildExtractElement(ac->builder, value, LLVMConstInt(ac->i32, index, false), "");
156}
157
158int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
159{
160   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
161      type = LLVMGetElementType(type);
162
163   if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
164      return LLVMGetIntTypeWidth(type);
165
166   if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
167      if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_LDS)
168         return 32;
169   }
170
171   if (type == ctx->f16)
172      return 16;
173   if (type == ctx->f32)
174      return 32;
175   if (type == ctx->f64)
176      return 64;
177
178   unreachable("Unhandled type kind in get_elem_bits");
179}
180
181unsigned ac_get_type_size(LLVMTypeRef type)
182{
183   LLVMTypeKind kind = LLVMGetTypeKind(type);
184
185   switch (kind) {
186   case LLVMIntegerTypeKind:
187      return LLVMGetIntTypeWidth(type) / 8;
188   case LLVMHalfTypeKind:
189      return 2;
190   case LLVMFloatTypeKind:
191      return 4;
192   case LLVMDoubleTypeKind:
193      return 8;
194   case LLVMPointerTypeKind:
195      if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
196         return 4;
197      return 8;
198   case LLVMVectorTypeKind:
199      return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));
200   case LLVMArrayTypeKind:
201      return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));
202   default:
203      assert(0);
204      return 0;
205   }
206}
207
208static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
209{
210   if (t == ctx->i1)
211      return ctx->i1;
212   else if (t == ctx->i8)
213      return ctx->i8;
214   else if (t == ctx->f16 || t == ctx->i16)
215      return ctx->i16;
216   else if (t == ctx->f32 || t == ctx->i32)
217      return ctx->i32;
218   else if (t == ctx->f64 || t == ctx->i64)
219      return ctx->i64;
220   else
221      unreachable("Unhandled integer size");
222}
223
224LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
225{
226   if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
227      LLVMTypeRef elem_type = LLVMGetElementType(t);
228      return LLVMVectorType(to_integer_type_scalar(ctx, elem_type), LLVMGetVectorSize(t));
229   }
230   if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
231      switch (LLVMGetPointerAddressSpace(t)) {
232      case AC_ADDR_SPACE_GLOBAL:
233         return ctx->i64;
234      case AC_ADDR_SPACE_CONST_32BIT:
235      case AC_ADDR_SPACE_LDS:
236         return ctx->i32;
237      default:
238         unreachable("unhandled address space");
239      }
240   }
241   return to_integer_type_scalar(ctx, t);
242}
243
244LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
245{
246   LLVMTypeRef type = LLVMTypeOf(v);
247   if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
248      return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
249   }
250   return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
251}
252
253LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
254{
255   LLVMTypeRef type = LLVMTypeOf(v);
256   if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
257      return v;
258   return ac_to_integer(ctx, v);
259}
260
261static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
262{
263   if (t == ctx->i8)
264      return ctx->i8;
265   else if (t == ctx->i16 || t == ctx->f16)
266      return ctx->f16;
267   else if (t == ctx->i32 || t == ctx->f32)
268      return ctx->f32;
269   else if (t == ctx->i64 || t == ctx->f64)
270      return ctx->f64;
271   else
272      unreachable("Unhandled float size");
273}
274
275LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
276{
277   if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
278      LLVMTypeRef elem_type = LLVMGetElementType(t);
279      return LLVMVectorType(to_float_type_scalar(ctx, elem_type), LLVMGetVectorSize(t));
280   }
281   return to_float_type_scalar(ctx, t);
282}
283
284LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
285{
286   LLVMTypeRef type = LLVMTypeOf(v);
287   return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
288}
289
/* Declare (on first use) and call an intrinsic or external function.
 *
 * The function type is derived from \p return_type and the types of
 * \p params. Attributes from \p attrib_mask are applied at the call site,
 * unless AC_FUNC_ATTR_LEGACY is set, in which case they are applied to the
 * function declaration instead.
 */
LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
                                LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count,
                                unsigned attrib_mask)
{
   LLVMValueRef call;
   bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);

   /* Collect the parameter types to build the signature. */
   LLVMTypeRef param_types[32];
   assert(param_count <= 32);
   for (unsigned i = 0; i < param_count; ++i) {
      assert(params[i]);
      param_types[i] = LLVMTypeOf(params[i]);
   }

   LLVMTypeRef function_type = LLVMFunctionType(return_type, param_types, param_count, 0);
   /* Reuse the declaration if this function was already called once. */
   LLVMValueRef function = LLVMGetNamedFunction(ctx->module, name);

   if (!function) {
      function = LLVMAddFunction(ctx->module, name, function_type);

      LLVMSetFunctionCallConv(function, LLVMCCallConv);
      LLVMSetLinkage(function, LLVMExternalLinkage);

      /* Legacy mode: attributes live on the declaration, set them once. */
      if (!set_callsite_attrs)
         ac_add_func_attributes(ctx->context, function, attrib_mask);
   }

   call = LLVMBuildCall2(ctx->builder, function_type, function, params, param_count, "");
   if (set_callsite_attrs)
      ac_add_func_attributes(ctx->context, call, attrib_mask);
   return call;
}
322
323/**
324 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
325 * intrinsic names).
326 */
327void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
328{
329   LLVMTypeRef elem_type = type;
330
331   if (LLVMGetTypeKind(type) == LLVMStructTypeKind) {
332      unsigned count = LLVMCountStructElementTypes(type);
333      int ret = snprintf(buf, bufsize, "sl_");
334      buf += ret;
335      bufsize -= ret;
336
337      LLVMTypeRef *elems = alloca(count * sizeof(LLVMTypeRef));
338      LLVMGetStructElementTypes(type, elems);
339
340      for (unsigned i = 0; i < count; i++) {
341         ac_build_type_name_for_intr(elems[i], buf, bufsize);
342         ret = strlen(buf);
343         buf += ret;
344         bufsize -= ret;
345      }
346
347      snprintf(buf, bufsize, "s");
348      return;
349   }
350
351   assert(bufsize >= 8);
352   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
353      int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));
354      if (ret < 0) {
355         char *type_name = LLVMPrintTypeToString(type);
356         fprintf(stderr, "Error building type name for: %s\n", type_name);
357         LLVMDisposeMessage(type_name);
358         return;
359      }
360      elem_type = LLVMGetElementType(type);
361      buf += ret;
362      bufsize -= ret;
363   }
364   switch (LLVMGetTypeKind(elem_type)) {
365   default:
366      break;
367   case LLVMIntegerTypeKind:
368      snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
369      break;
370   case LLVMHalfTypeKind:
371      snprintf(buf, bufsize, "f16");
372      break;
373   case LLVMFloatTypeKind:
374      snprintf(buf, bufsize, "f32");
375      break;
376   case LLVMDoubleTypeKind:
377      snprintf(buf, bufsize, "f64");
378      break;
379   }
380}
381
382/**
383 * Helper function that builds an LLVM IR PHI node and immediately adds
384 * incoming edges.
385 */
386LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
387                          LLVMValueRef *values, LLVMBasicBlockRef *blocks)
388{
389   LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
390   LLVMAddIncoming(phi, values, blocks, count_incoming);
391   return phi;
392}
393
394void ac_build_s_barrier(struct ac_llvm_context *ctx, gl_shader_stage stage)
395{
396   /* GFX6 only: s_barrier isn’t needed in TCS because an entire patch always fits into
397    * a single wave due to a bug workaround disallowing multi-wave HS workgroups.
398    */
399   if (ctx->gfx_level == GFX6 && stage == MESA_SHADER_TESS_CTRL)
400      return;
401
402   ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT);
403}
404
405/* Prevent optimizations (at least of memory accesses) across the current
406 * point in the program by emitting empty inline assembly that is marked as
407 * having side effects.
408 *
409 * Optionally, a value can be passed through the inline assembly to prevent
410 * LLVM from hoisting calls to ReadNone functions.
411 */
void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pgpr, bool sgpr)
{
   /* A unique comment per barrier prevents LLVM from merging/CSE'ing the
    * asm blobs. */
   static int counter = 0;

   LLVMBuilderRef builder = ctx->builder;
   char code[16];
   /* "=s,0"/"=v,0" tie the output register to input 0 (SGPR vs. VGPR). */
   const char *constraint = sgpr ? "=s,0" : "=v,0";

   snprintf(code, sizeof(code), "; %d", (int)p_atomic_inc_return(&counter));

   if (!pgpr) {
      /* No value to thread through: just emit the side-effecting empty asm. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
      LLVMBuildCall2(builder, ftype, inlineasm, NULL, 0, "");
   } else if (LLVMTypeOf(*pgpr) == ctx->i32) {
      /* Simple version for i32 that allows the caller to set LLVM metadata on the call
       * instruction. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
   } else if (LLVMTypeOf(*pgpr) == ctx->i16) {
      /* Simple version for i16 that allows the caller to set LLVM metadata on the call
       * instruction. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i16, &ctx->i16, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
   } else if (LLVMGetTypeKind(LLVMTypeOf(*pgpr)) == LLVMPointerTypeKind) {
      /* Pointers are passed through the asm unchanged, with their own type. */
      LLVMTypeRef type = LLVMTypeOf(*pgpr);
      LLVMTypeRef ftype = LLVMFunctionType(type, &type, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
   } else {
      /* Generic path: widen to a multiple of 32 bits, view the value as
       * <N x i32>, pass only lane 0 through the asm, then reassemble.
       * Threading one element through is enough to anchor the whole value.
       */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);
      LLVMTypeRef type = LLVMTypeOf(*pgpr);
      unsigned bitsize = ac_get_elem_bits(ctx, type);
      LLVMValueRef vgpr = *pgpr;
      LLVMTypeRef vgpr_type;
      unsigned vgpr_size;
      LLVMValueRef vgpr0;

      if (bitsize < 32)
         vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, "");

      vgpr_type = LLVMTypeOf(vgpr);
      vgpr_size = ac_get_type_size(vgpr_type);

      assert(vgpr_size % 4 == 0);

      vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
      vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
      vgpr0 = LLVMBuildCall2(builder, ftype, inlineasm, &vgpr0, 1, "");
      vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
      vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");

      /* Undo the widening so the caller gets the original type back. */
      if (bitsize < 32)
         vgpr = LLVMBuildTrunc(builder, vgpr, type, "");

      *pgpr = vgpr;
   }
}
476
477LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope)
478{
479   const char *subgroup = "llvm.readcyclecounter";
480   const char *name = scope == NIR_SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : subgroup;
481
482   LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);
483   return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
484}
485
486LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)
487{
488   const char *name;
489
490   if (LLVMTypeOf(value) == ctx->i1)
491      value = LLVMBuildZExt(ctx->builder, value, ctx->i32, "");
492
493   if (ctx->wave_size == 64)
494      name = "llvm.amdgcn.icmp.i64.i32";
495   else
496      name = "llvm.amdgcn.icmp.i32.i32";
497
498   LLVMValueRef args[3] = {value, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};
499
500   /* We currently have no other way to prevent LLVM from lifting the icmp
501    * calls to a dominating basic block.
502    */
503   ac_build_optimization_barrier(ctx, &args[0], false);
504
505   args[0] = ac_to_integer(ctx, args[0]);
506
507   return ac_build_intrinsic(
508      ctx, name, ctx->iN_wavemask, args, 3,
509      AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
510}
511
512LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value)
513{
514   const char *name;
515
516   if (ctx->wave_size == 64)
517      name = "llvm.amdgcn.icmp.i64.i1";
518   else
519      name = "llvm.amdgcn.icmp.i32.i1";
520
521   LLVMValueRef args[3] = {
522      value,
523      ctx->i1false,
524      LLVMConstInt(ctx->i32, LLVMIntNE, 0),
525   };
526
527   return ac_build_intrinsic(
528      ctx, name, ctx->iN_wavemask, args, 3,
529      AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
530}
531
532LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
533{
534   LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
535   LLVMValueRef vote_set = ac_build_ballot(ctx, value);
536   return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
537}
538
539LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
540{
541   LLVMValueRef vote_set = ac_build_ballot(ctx, value);
542   return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0),
543                        "");
544}
545
546LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
547{
548   LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
549   LLVMValueRef vote_set = ac_build_ballot(ctx, value);
550
551   LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
552   LLVMValueRef none =
553      LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0), "");
554   return LLVMBuildOr(ctx->builder, all, none, "");
555}
556
557LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
558                                            unsigned value_count, unsigned component)
559{
560   LLVMValueRef vec = NULL;
561
562   if (value_count == 1) {
563      return values[component];
564   } else if (!value_count)
565      unreachable("value_count is 0");
566
567   for (unsigned i = component; i < value_count + component; i++) {
568      LLVMValueRef value = values[i];
569
570      if (i == component)
571         vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
572      LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
573      vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
574   }
575   return vec;
576}
577
578LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
579                                             unsigned value_count, unsigned value_stride,
580                                             bool always_vector)
581{
582   LLVMBuilderRef builder = ctx->builder;
583   LLVMValueRef vec = NULL;
584   unsigned i;
585
586   if (value_count == 1 && !always_vector) {
587      return values[0];
588   } else if (!value_count)
589      unreachable("value_count is 0");
590
591   for (i = 0; i < value_count; i++) {
592      LLVMValueRef value = values[i * value_stride];
593
594      if (!i)
595         vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
596      LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
597      vec = LLVMBuildInsertElement(builder, vec, value, index, "");
598   }
599   return vec;
600}
601
602LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
603                                    unsigned value_count)
604{
605   return ac_build_gather_values_extended(ctx, values, value_count, 1, false);
606}
607
608LLVMValueRef ac_build_concat(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
609{
610   unsigned a_size = ac_get_llvm_num_components(a);
611   unsigned b_size = ac_get_llvm_num_components(b);
612
613   LLVMValueRef *elems = alloca((a_size + b_size) * sizeof(LLVMValueRef));
614   for (unsigned i = 0; i < a_size; i++)
615      elems[i] = ac_llvm_extract_elem(ctx, a, i);
616   for (unsigned i = 0; i < b_size; i++)
617      elems[a_size + i] = ac_llvm_extract_elem(ctx, b, i);
618
619   return ac_build_gather_values(ctx, elems, a_size + b_size);
620}
621
622/* Expand a scalar or vector to <dst_channels x type> by filling the remaining
623 * channels with undef. Extract at most src_channels components from the input.
624 */
625LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
626                             unsigned src_channels, unsigned dst_channels)
627{
628   LLVMTypeRef elemtype;
629   LLVMValueRef *const chan = alloca(dst_channels * sizeof(LLVMValueRef));
630
631   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
632      unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
633
634      if (src_channels == dst_channels && vec_size == dst_channels)
635         return value;
636
637      src_channels = MIN2(src_channels, vec_size);
638
639      for (unsigned i = 0; i < src_channels; i++)
640         chan[i] = ac_llvm_extract_elem(ctx, value, i);
641
642      elemtype = LLVMGetElementType(LLVMTypeOf(value));
643   } else {
644      if (src_channels) {
645         assert(src_channels == 1);
646         chan[0] = value;
647      }
648      elemtype = LLVMTypeOf(value);
649   }
650
651   for (unsigned i = src_channels; i < dst_channels; i++)
652      chan[i] = LLVMGetUndef(elemtype);
653
654   return ac_build_gather_values(ctx, chan, dst_channels);
655}
656
657/* Extract components [start, start + channels) from a vector.
658 */
659LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,
660                                   unsigned channels)
661{
662   LLVMValueRef *const chan = alloca(channels * sizeof(LLVMValueRef));
663
664   for (unsigned i = 0; i < channels; i++)
665      chan[i] = ac_llvm_extract_elem(ctx, value, i + start);
666
667   return ac_build_gather_values(ctx, chan, channels);
668}
669
670/* Expand a scalar or vector to <4 x type> by filling the remaining channels
671 * with undef. Extract at most num_channels components from the input.
672 */
673LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,
674                                     unsigned num_channels)
675{
676   return ac_build_expand(ctx, value, num_channels, 4);
677}
678
679LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
680{
681   unsigned type_size = ac_get_type_size(LLVMTypeOf(value));
682   const char *name;
683
684   if (type_size == 2)
685      name = "llvm.rint.f16";
686   else if (type_size == 4)
687      name = "llvm.rint.f32";
688   else
689      name = "llvm.rint.f64";
690
691   return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1, AC_FUNC_ATTR_READNONE);
692}
693
694LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den)
695{
696   unsigned type_size = ac_get_type_size(LLVMTypeOf(den));
697   const char *name;
698
699   /* For doubles, we need precise division to pass GLCTS. */
700   if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL && type_size == 8)
701      return LLVMBuildFDiv(ctx->builder, num, den, "");
702
703   if (type_size == 2)
704      name = "llvm.amdgcn.rcp.f16";
705   else if (type_size == 4)
706      name = "llvm.amdgcn.rcp.f32";
707   else
708      name = "llvm.amdgcn.rcp.f64";
709
710   LLVMValueRef rcp =
711      ac_build_intrinsic(ctx, name, LLVMTypeOf(den), &den, 1, AC_FUNC_ATTR_READNONE);
712
713   return LLVMBuildFMul(ctx->builder, num, rcp, "");
714}
715
716/* See fast_idiv_by_const.h. */
717/* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
718LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num,
719                                LLVMValueRef multiplier, LLVMValueRef pre_shift,
720                                LLVMValueRef post_shift, LLVMValueRef increment)
721{
722   LLVMBuilderRef builder = ctx->builder;
723
724   num = LLVMBuildLShr(builder, num, pre_shift, "");
725   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
726                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
727   num = LLVMBuildAdd(builder, num, LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
728   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
729   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
730   return LLVMBuildLShr(builder, num, post_shift, "");
731}
732
733/* See fast_idiv_by_const.h. */
734/* If num != UINT_MAX, this more efficient version can be used. */
735/* Set: increment = util_fast_udiv_info::increment; */
736LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num,
737                                    LLVMValueRef multiplier, LLVMValueRef pre_shift,
738                                    LLVMValueRef post_shift, LLVMValueRef increment)
739{
740   LLVMBuilderRef builder = ctx->builder;
741
742   num = LLVMBuildLShr(builder, num, pre_shift, "");
743   num = LLVMBuildNUWAdd(builder, num, increment, "");
744   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
745                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
746   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
747   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
748   return LLVMBuildLShr(builder, num, post_shift, "");
749}
750
751/* See fast_idiv_by_const.h. */
752/* Both operands must fit in 31 bits and the divisor must not be 1. */
753LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num,
754                                              LLVMValueRef multiplier, LLVMValueRef post_shift)
755{
756   LLVMBuilderRef builder = ctx->builder;
757
758   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
759                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
760   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
761   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
762   return LLVMBuildLShr(builder, num, post_shift, "");
763}
764
/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
 * already multiplied by two. id is the cube face number.
 */
struct cube_selection_coords {
   LLVMValueRef stc[2]; /* sc and tc face coordinates (f32) */
   LLVMValueRef ma;     /* major axis value, pre-multiplied by 2 */
   LLVMValueRef id;     /* cube face number (compared as f32 in build_cube_select) */
};
774
775static void build_cube_intrinsic(struct ac_llvm_context *ctx, LLVMValueRef in[3],
776                                 struct cube_selection_coords *out)
777{
778   LLVMTypeRef f32 = ctx->f32;
779
780   out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", f32, in, 3, AC_FUNC_ATTR_READNONE);
781   out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", f32, in, 3, AC_FUNC_ATTR_READNONE);
782   out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", f32, in, 3, AC_FUNC_ATTR_READNONE);
783   out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", f32, in, 3, AC_FUNC_ATTR_READNONE);
784}
785
786/**
787 * Build a manual selection sequence for cube face sc/tc coordinates and
788 * major axis vector (multiplied by 2 for consistency) for the given
789 * vec3 \p coords, for the face implied by \p selcoords.
790 *
791 * For the major axis, we always adjust the sign to be in the direction of
792 * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
793 * the selcoords major axis.
794 */
static void build_cube_select(struct ac_llvm_context *ctx,
                              const struct cube_selection_coords *selcoords,
                              const LLVMValueRef *coords, LLVMValueRef *out_st,
                              LLVMValueRef *out_ma)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
   LLVMValueRef is_ma_positive;
   LLVMValueRef sgn_ma;
   LLVMValueRef is_ma_z, is_not_ma_z;
   LLVMValueRef is_ma_y;
   LLVMValueRef is_ma_x;
   LLVMValueRef sgn;
   LLVMValueRef tmp;

   /* sgn_ma = copysign(1.0, ma) */
   is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->ma, LLVMConstReal(f32, 0.0), "");
   sgn_ma = LLVMBuildSelect(builder, is_ma_positive, LLVMConstReal(f32, 1.0),
                            LLVMConstReal(f32, -1.0), "");

   /* Faces come in +/- pairs per axis: id >= 4 -> Z major, 2 <= id < 4 ->
    * Y major, otherwise X major. */
   is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
   is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
   is_ma_y = LLVMBuildAnd(
      builder, is_not_ma_z,
      LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
   is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");

   /* Select sc: z for X-major faces, x otherwise; sign depends on the
    * major axis (and its direction for X/Z). */
   tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
   sgn = LLVMBuildSelect(
      builder, is_ma_y, LLVMConstReal(f32, 1.0),
      LLVMBuildSelect(builder, is_ma_z, sgn_ma, LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
   out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");

   /* Select tc: z for Y-major faces (signed by ma's direction), y with a
    * fixed -1 sign otherwise. */
   tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
   sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma, LLVMConstReal(f32, -1.0), "");
   out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");

   /* Select ma: the coordinate along the major axis, made positive and
    * multiplied by 2 to match the cubema convention. */
   tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
                         LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
   tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
   *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
}
839
/* Convert cube map coordinates (and optionally derivatives) into the
 * face-coordinate + face-id form used for sampling.
 *
 * \param is_deriv   whether derivs_arg holds cube derivatives to convert
 * \param is_array   cube array lookup; the layer index is in coords_arg[3]
 * \param is_lod     explicit-LOD query; skips layer rounding/clamping
 * \param coords_arg in: 3 cube coords (+ layer for arrays);
 *                   out: sc, tc and face id (merged with layer * 8 for arrays)
 * \param derivs_arg in: 2x3 cube derivatives; out: 2x2 face-plane derivatives
 */
void ac_prepare_cube_coords(struct ac_llvm_context *ctx, bool is_deriv, bool is_array, bool is_lod,
                            LLVMValueRef *coords_arg, LLVMValueRef *derivs_arg)
{

   LLVMBuilderRef builder = ctx->builder;
   struct cube_selection_coords selcoords;
   LLVMValueRef coords[3];
   LLVMValueRef invma;

   if (is_array && !is_lod) {
      /* Round the layer index, implementing the spec's floor(layer + 0.5). */
      LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);

      /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
       *
       *    "For Array forms, the array layer used will be
       *
       *       max(0, min(d−1, floor(layer+0.5)))
       *
       *     where d is the depth of the texture array and layer
       *     comes from the component indicated in the tables below."
       *
       * This is also a workaround for an issue where the layer is taken
       * from a helper invocation which happens to fall on a different
       * layer due to extrapolation.
       *
       * GFX8 and earlier attempt to implement this in hardware by
       * clamping the value of coords[2] = (8 * layer) + face.
       * Unfortunately, this means that we end up with the wrong
       * face when clamping occurs.
       *
       * Clamp the layer earlier to work around the issue.
       * (Only the lower bound is clamped here.)
       */
      if (ctx->gfx_level <= GFX8) {
         LLVMValueRef ge0;
         ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
         tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
      }

      coords_arg[3] = tmp;
   }

   build_cube_intrinsic(ctx, coords_arg, &selcoords);

   /* invma = 1 / |ma|, used to project sc/tc (and derivatives) onto the face plane. */
   invma =
      ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
   invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);

   for (int i = 0; i < 2; ++i)
      coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");

   coords[2] = selcoords.id;

   if (is_deriv && derivs_arg) {
      LLVMValueRef derivs[4];
      int axis;

      /* Convert cube derivatives to 2D derivatives. */
      for (axis = 0; axis < 2; axis++) {
         LLVMValueRef deriv_st[2];
         LLVMValueRef deriv_ma;

         /* Transform the derivative alongside the texture
          * coordinate. Mathematically, the correct formula is
          * as follows. Assume we're projecting onto the +Z face
          * and denote by dx/dh the derivative of the (original)
          * X texture coordinate with respect to horizontal
          * window coordinates. The projection onto the +Z face
          * plane is:
          *
          *   f(x,z) = x/z
          *
          * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
          *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
          *
          * This motivates the implementation below.
          *
          * Whether this actually gives the expected results for
          * apps that might feed in derivatives obtained via
          * finite differences is anyone's guess. The OpenGL spec
          * seems awfully quiet about how textureGrad for cube
          * maps should be handled.
          */
         build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3], deriv_st, &deriv_ma);

         /* deriv_ma corresponds to the dz/dh term above; scale it by 1/z. */
         deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");

         for (int i = 0; i < 2; ++i)
            derivs[axis * 2 + i] =
               LLVMBuildFSub(builder, LLVMBuildFMul(builder, deriv_st[i], invma, ""),
                             LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
      }

      memcpy(derivs_arg, derivs, sizeof(derivs));
   }

   /* Shift the texture coordinate. This must be applied after the
    * derivative calculation.
    */
   for (int i = 0; i < 2; ++i)
      coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");

   if (is_array) {
      /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
      /* coords_arg.w component - array_index for cube arrays */
      coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
   }

   memcpy(coords_arg, coords, sizeof(coords));
}
948
949LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
950                                LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
951                                LLVMValueRef j)
952{
953   LLVMValueRef args[5];
954
955   if (ctx->gfx_level >= GFX11) {
956      LLVMValueRef p;
957      LLVMValueRef p10;
958
959      args[0] = llvm_chan;
960      args[1] = attr_number;
961      args[2] = params;
962
963      p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
964                             ctx->f32, args, 3, AC_FUNC_ATTR_READNONE);
965
966      args[0] = p;
967      args[1] = i;
968      args[2] = p;
969
970      p10 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10",
971                               ctx->f32, args, 3, AC_FUNC_ATTR_READNONE);
972
973      args[0] = p;
974      args[1] = j;
975      args[2] = p10;
976
977      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2",
978                                ctx->f32, args, 3, AC_FUNC_ATTR_READNONE);
979
980   } else {
981      LLVMValueRef p1;
982
983      args[0] = i;
984      args[1] = llvm_chan;
985      args[2] = attr_number;
986      args[3] = params;
987
988      p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
989                              ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
990
991      args[0] = p1;
992      args[1] = j;
993      args[2] = llvm_chan;
994      args[3] = attr_number;
995      args[4] = params;
996
997      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
998                                ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
999   }
1000}
1001
1002LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
1003                                    LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
1004                                    LLVMValueRef j, bool high_16bits)
1005{
1006   LLVMValueRef args[6];
1007
1008   if (ctx->gfx_level >= GFX11) {
1009      LLVMValueRef p;
1010      LLVMValueRef p10;
1011
1012      args[0] = llvm_chan;
1013      args[1] = attr_number;
1014      args[2] = params;
1015
1016      p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
1017                             ctx->f32, args, 3, AC_FUNC_ATTR_READNONE);
1018
1019      args[0] = p;
1020      args[1] = i;
1021      args[2] = p;
1022      args[3] = high_16bits ? ctx->i1true : ctx->i1false;
1023
1024      p10 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10.f16",
1025                               ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
1026
1027      args[0] = p;
1028      args[1] = j;
1029      args[2] = p10;
1030      args[3] = high_16bits ? ctx->i1true : ctx->i1false;
1031
1032      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2.f16",
1033                                ctx->f16, args, 4, AC_FUNC_ATTR_READNONE);
1034
1035   } else {
1036      LLVMValueRef p1;
1037
1038      args[0] = i;
1039      args[1] = llvm_chan;
1040      args[2] = attr_number;
1041      args[3] = high_16bits ? ctx->i1true : ctx->i1false;
1042      args[4] = params;
1043
1044      p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, args, 5,
1045                              AC_FUNC_ATTR_READNONE);
1046
1047      args[0] = p1;
1048      args[1] = j;
1049      args[2] = llvm_chan;
1050      args[3] = attr_number;
1051      args[4] = high_16bits ? ctx->i1true : ctx->i1false;
1052      args[5] = params;
1053
1054      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, args, 6,
1055                                AC_FUNC_ATTR_READNONE);
1056   }
1057}
1058
1059LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter,
1060                                    LLVMValueRef llvm_chan, LLVMValueRef attr_number,
1061                                    LLVMValueRef params)
1062{
1063   LLVMValueRef args[4];
1064
1065   if (ctx->gfx_level >= GFX11) {
1066      LLVMValueRef p;
1067
1068      args[0] = llvm_chan;
1069      args[1] = attr_number;
1070      args[2] = params;
1071
1072      p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
1073                             ctx->f32, args, 3, AC_FUNC_ATTR_READNONE);
1074      p = ac_build_quad_swizzle(ctx, p, 0, 0, 0 ,0);
1075      return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32, &p, 1, AC_FUNC_ATTR_READNONE);
1076   } else {
1077      args[0] = parameter;
1078      args[1] = llvm_chan;
1079      args[2] = attr_number;
1080      args[3] = params;
1081
1082      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, args, 4,
1083                                AC_FUNC_ATTR_READNONE);
1084   }
1085}
1086
1087LLVMValueRef ac_build_gep_ptr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1088                              LLVMValueRef index)
1089{
1090   return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
1091}
1092
1093LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
1094{
1095   LLVMValueRef indices[2] = {
1096      ctx->i32_0,
1097      index,
1098   };
1099   return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
1100}
1101
1102LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMValueRef index)
1103{
1104   LLVMValueRef offset_ptr = LLVMBuildGEP(ctx->builder, ptr, &index, 1, "");
1105   return LLVMBuildPointerCast(ctx->builder, offset_ptr, LLVMTypeOf(ptr), "");
1106}
1107
1108void ac_build_indexed_store(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index,
1109                            LLVMValueRef value)
1110{
1111   LLVMBuildStore(ctx->builder, value, ac_build_gep0(ctx, base_ptr, index));
1112}
1113
1114/**
1115 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
1116 * It's equivalent to doing a load from &base_ptr[index].
1117 *
1118 * \param base_ptr  Where the array starts.
1119 * \param index     The element index into the array.
1120 * \param uniform   Whether the base_ptr and index can be assumed to be
1121 *                  dynamically uniform (i.e. load to an SGPR)
1122 * \param invariant Whether the load is invariant (no other opcodes affect it)
1123 * \param no_unsigned_wraparound
1124 *    For all possible re-associations and re-distributions of an expression
1125 *    "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
1126 *    without inbounds in base_ptr), this parameter is true if "addr + offset"
1127 *    does not result in an unsigned integer wraparound. This is used for
1128 *    optimal code generation of 32-bit pointer arithmetic.
1129 *
1130 *    For example, a 32-bit immediate offset that causes a 32-bit unsigned
1131 *    integer wraparound can't be an imm offset in s_load_dword, because
1132 *    the instruction performs "addr + offset" in 64 bits.
1133 *
1134 *    Expected usage for bindless textures by chaining GEPs:
1135 *      // possible unsigned wraparound, don't use InBounds:
1136 *      ptr1 = LLVMBuildGEP(base_ptr, index);
1137 *      image = load(ptr1); // becomes "s_load ptr1, 0"
1138 *
1139 *      ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
1140 *      sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
1141 */
1142static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1143                                         LLVMValueRef index, bool uniform, bool invariant,
1144                                         bool no_unsigned_wraparound)
1145{
1146   LLVMValueRef pointer, result;
1147
1148   if (no_unsigned_wraparound &&
1149       LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
1150      pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "");
1151   else
1152      pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
1153
1154   if (uniform)
1155      LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
1156   result = LLVMBuildLoad(ctx->builder, pointer, "");
1157   if (invariant)
1158      LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
1159   LLVMSetAlignment(result, 4);
1160   return result;
1161}
1162
1163LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
1164{
1165   return ac_build_load_custom(ctx, base_ptr, index, false, false, false);
1166}
1167
1168LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1169                                     LLVMValueRef index)
1170{
1171   return ac_build_load_custom(ctx, base_ptr, index, false, true, false);
1172}
1173
1174/* This assumes that there is no unsigned integer wraparound during the address
1175 * computation, excluding all GEPs within base_ptr. */
1176LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1177                                   LLVMValueRef index)
1178{
1179   return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
1180}
1181
1182/* See ac_build_load_custom() documentation. */
1183LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
1184                                                   LLVMValueRef base_ptr, LLVMValueRef index)
1185{
1186   return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
1187}
1188
1189static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
1190{
1191   return cache_policy |
1192          (ctx->gfx_level >= GFX10 && ctx->gfx_level < GFX11 && cache_policy & ac_glc ? ac_dlc : 0);
1193}
1194
1195static unsigned get_store_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
1196{
1197   if (ctx->gfx_level >= GFX11)
1198      cache_policy &= ~ac_glc; /* GLC has no effect on stores */
1199   return cache_policy;
1200}
1201
1202static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1203                                         LLVMValueRef data, LLVMValueRef vindex,
1204                                         LLVMValueRef voffset, LLVMValueRef soffset,
1205                                         unsigned cache_policy, bool use_format)
1206{
1207   LLVMValueRef args[6];
1208   int idx = 0;
1209   args[idx++] = data;
1210   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1211   if (vindex)
1212      args[idx++] = vindex ? vindex : ctx->i32_0;
1213   args[idx++] = voffset ? voffset : ctx->i32_0;
1214   args[idx++] = soffset ? soffset : ctx->i32_0;
1215   args[idx++] = LLVMConstInt(ctx->i32, get_store_cache_policy(ctx, cache_policy), 0);
1216   const char *indexing_kind = vindex ? "struct" : "raw";
1217   char name[256], type_name[8];
1218
1219   ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));
1220
1221   if (use_format) {
1222      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,
1223               type_name);
1224   } else {
1225      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);
1226   }
1227
1228   ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
1229}
1230
1231void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
1232                                  LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy)
1233{
1234   ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, cache_policy, true);
1235}
1236
1237/* buffer_store_dword(,x2,x3,x4) <- the suffix is selected by the type of vdata. */
1238void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
1239                                 LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
1240                                 unsigned cache_policy)
1241{
1242   unsigned num_channels = ac_get_llvm_num_components(vdata);
1243
1244   /* Split 3 channel stores if unsupported. */
1245   if (num_channels == 3 && !ac_has_vec3_support(ctx->gfx_level, false)) {
1246      LLVMValueRef v[3], v01, voffset2;
1247
1248      for (int i = 0; i < 3; i++) {
1249         v[i] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, i, 0), "");
1250      }
1251      v01 = ac_build_gather_values(ctx, v, 2);
1252
1253      voffset2 = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0,
1254                              LLVMConstInt(ctx->i32, 8, 0), "");
1255
1256      ac_build_buffer_store_dword(ctx, rsrc, v01, vindex, voffset, soffset, cache_policy);
1257      ac_build_buffer_store_dword(ctx, rsrc, v[2], vindex, voffset2, soffset, cache_policy);
1258      return;
1259   }
1260
1261   ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), vindex, voffset, soffset,
1262                                cache_policy, false);
1263}
1264
1265static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1266                                                LLVMValueRef vindex, LLVMValueRef voffset,
1267                                                LLVMValueRef soffset, unsigned num_channels,
1268                                                LLVMTypeRef channel_type, unsigned cache_policy,
1269                                                bool can_speculate, bool use_format,
1270                                                bool structurized)
1271{
1272   LLVMValueRef args[5];
1273   int idx = 0;
1274   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1275   if (structurized)
1276      args[idx++] = vindex ? vindex : ctx->i32_0;
1277   args[idx++] = voffset ? voffset : ctx->i32_0;
1278   args[idx++] = soffset ? soffset : ctx->i32_0;
1279   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
1280   unsigned func =
1281      !ac_has_vec3_support(ctx->gfx_level, use_format) && num_channels == 3 ? 4 : num_channels;
1282   const char *indexing_kind = structurized ? "struct" : "raw";
1283   char name[256], type_name[8];
1284
1285   /* D16 is only supported on gfx8+ */
1286   assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||
1287          ctx->gfx_level >= GFX8);
1288
1289   LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
1290   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1291
1292   if (use_format) {
1293      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind,
1294               type_name);
1295   } else {
1296      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name);
1297   }
1298
1299   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
1300}
1301
LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                  LLVMTypeRef channel_type, unsigned cache_policy,
                                  bool can_speculate, bool allow_smem)
{
   /* Scalar (SMEM) path: only taken when SLC is not requested, and GLC is
    * only accepted on GFX8+. */
   if (allow_smem && !(cache_policy & ac_slc) &&
       (!(cache_policy & ac_glc) || ctx->gfx_level >= GFX8)) {
      assert(vindex == NULL);

      LLVMValueRef result[8];

      /* Fold voffset and soffset into a single scalar byte offset. */
      LLVMValueRef offset = voffset ? voffset : ctx->i32_0;
      if (soffset)
         offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");

      /* One s.buffer.load.f32 per channel, advancing 4 bytes each iteration. */
      for (int i = 0; i < num_channels; i++) {
         if (i) {
            offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, 4, 0), "");
         }
         LLVMValueRef args[3] = {
            rsrc,
            offset,
            LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),
         };
         result[i] = ac_build_intrinsic(ctx, "llvm.amdgcn.s.buffer.load.f32", ctx->f32, args, 3,
                                        AC_FUNC_ATTR_READNONE);
      }
      if (num_channels == 1)
         return result[0];

      /* Pad a vec3 result with undef when vec3 types aren't supported. */
      if (num_channels == 3 && !ac_has_vec3_support(ctx->gfx_level, false))
         result[num_channels++] = LLVMGetUndef(ctx->f32);
      return ac_build_gather_values(ctx, result, num_channels);
   }

   /* Vector (VMEM) fallback. */
   return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, soffset, num_channels,
                                      channel_type, cache_policy, can_speculate, false, false);
}
1340
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef vindex, LLVMValueRef voffset,
                                         unsigned num_channels, unsigned cache_policy,
                                         bool can_speculate, bool d16, bool tfe)
{
   if (tfe) {
      /* TFE loads return an extra status dword on top of the xyzw data, which
       * is emitted here as inline assembly. NOTE(review): presumably because
       * the LLVM buffer-load intrinsics don't expose TFE — confirm. */
      assert(!d16);

      cache_policy = get_load_cache_policy(ctx, cache_policy);

      char code[256];
      /* The definition in the assembly and the one in the constraint string
       * differs because of an assembler bug.
       */
      /* v0-v4 are zero-initialized up front — presumably so components the
       * load doesn't write are read back as zero. */
      snprintf(code, sizeof(code),
               "v_mov_b32 v0, 0\n"
               "v_mov_b32 v1, 0\n"
               "v_mov_b32 v2, 0\n"
               "v_mov_b32 v3, 0\n"
               "v_mov_b32 v4, 0\n"
               "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe %s\n"
               "s_waitcnt vmcnt(0)",
               cache_policy & ac_glc ? "glc" : "",
               cache_policy & ac_slc ? "slc" : "",
               cache_policy & ac_dlc ? "dlc" : "");

      /* Inputs: a v2i32 of (vindex, voffset) and the v4i32 resource descriptor;
       * output: a v5f32 clobbering v[0:4]. */
      LLVMTypeRef param_types[] = {ctx->v2i32, ctx->v4i32};
      LLVMTypeRef calltype = LLVMFunctionType(LLVMVectorType(ctx->f32, 5), param_types, 2, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(calltype, code, "=&{v[0:4]},v,s", false, false);

      LLVMValueRef addr_comp[2] = {vindex ? vindex : ctx->i32_0,
                                   voffset ? voffset : ctx->i32_0};

      LLVMValueRef args[] = {ac_build_gather_values(ctx, addr_comp, 2),
                             LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "")};
      LLVMValueRef res = LLVMBuildCall2(ctx->builder, calltype, inlineasm, args, 2, "");

      /* Return num_channels data components followed by the TFE status word. */
      return ac_build_concat(ctx, ac_trim_vector(ctx, res, num_channels),
                             ac_llvm_extract_elem(ctx, res, 4));
   }

   return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, num_channels,
                                      d16 ? ctx->f16 : ctx->f32, cache_policy, can_speculate, true,
                                      true);
}
1386
1387static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1388                                          LLVMValueRef vindex, LLVMValueRef voffset,
1389                                          LLVMValueRef soffset, unsigned num_channels,
1390                                          unsigned dfmt, unsigned nfmt, unsigned cache_policy,
1391                                          bool can_speculate, bool structurized)
1392{
1393   LLVMValueRef args[6];
1394   int idx = 0;
1395   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1396   if (structurized)
1397      args[idx++] = vindex ? vindex : ctx->i32_0;
1398   args[idx++] = voffset ? voffset : ctx->i32_0;
1399   args[idx++] = soffset ? soffset : ctx->i32_0;
1400   args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->gfx_level, dfmt, nfmt), 0);
1401   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
1402   unsigned func =
1403      !ac_has_vec3_support(ctx->gfx_level, true) && num_channels == 3 ? 4 : num_channels;
1404   const char *indexing_kind = structurized ? "struct" : "raw";
1405   char name[256], type_name[8];
1406
1407   LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
1408   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1409
1410   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", indexing_kind, type_name);
1411
1412   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
1413}
1414
1415LLVMValueRef ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1416                                          LLVMValueRef vindex, LLVMValueRef voffset,
1417                                          LLVMValueRef soffset, unsigned num_channels,
1418                                          unsigned dfmt, unsigned nfmt, unsigned cache_policy,
1419                                          bool can_speculate)
1420{
1421   return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, num_channels, dfmt,
1422                                nfmt, cache_policy, can_speculate, true);
1423}
1424
1425LLVMValueRef ac_build_buffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1426                                        LLVMValueRef voffset, LLVMValueRef soffset,
1427                                        unsigned cache_policy)
1428{
1429   return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i16,
1430                                      cache_policy, false, false, false);
1431}
1432
1433LLVMValueRef ac_build_buffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1434                                       LLVMValueRef voffset, LLVMValueRef soffset,
1435                                       unsigned cache_policy)
1436{
1437   return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i8, cache_policy,
1438                                      false, false, false);
1439}
1440
1441/**
1442 * Convert an 11- or 10-bit unsigned floating point number to an f32.
1443 *
1444 * The input exponent is expected to be biased analogous to IEEE-754, i.e. by
1445 * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
1446 */
static LLVMValueRef ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src,
                                    unsigned exp_bits, unsigned mant_bits)
{
   assert(LLVMTypeOf(src) == ctx->i32);

   LLVMValueRef tmp;
   LLVMValueRef mantissa;
   /* Mask off the low mant_bits mantissa bits of the packed value. */
   mantissa =
      LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");

   /* Converting normal numbers is just a shift + correcting the exponent bias */
   unsigned normal_shift = 23 - mant_bits;
   unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
   LLVMValueRef shifted, normal;

   shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");
   normal =
      LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");

   /* Converting nan/inf numbers is the same, but with a different exponent update */
   LLVMValueRef naninf;
   naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");

   /* Converting denormals is the complex case: determine the leading zeros of the
    * mantissa to obtain the correct shift for the mantissa and exponent correction.
    */
   LLVMValueRef denormal;
   LLVMValueRef params[2] = {
      mantissa, ctx->i1true, /* result can be undef when arg is 0 */
   };
   LLVMValueRef ctlz =
      ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32, params, 2, AC_FUNC_ATTR_READNONE);

   /* Shift such that the leading 1 ends up as the LSB of the exponent field. */
   tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");
   denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");

   /* Exponent correction for the normalized denormal mantissa. */
   unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
   tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");
   tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");
   denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");

   /* Select the final result: start from the normal case and successively
    * override with nan/inf (max exponent), denormal (value below 2^mant_bits),
    * and zero. */
   LLVMValueRef result;

   tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
                       LLVMConstInt(ctx->i32, ((1ULL << exp_bits) - 1) << mant_bits, false), "");
   result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");

   tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
                       LLVMConstInt(ctx->i32, 1ULL << mant_bits, false), "");
   result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");

   tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");
   result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");

   return ac_to_float(ctx, result);
}
1505
1506/**
1507 * Generate a fully general open coded buffer format fetch with all required
1508 * fixups suitable for vertex fetch, using non-format buffer loads.
1509 *
1510 * Some combinations of argument values have special interpretations:
1511 * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
1512 * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
1513 *
1514 * \param log_size log(size of channel in bytes)
1515 * \param num_channels number of channels (1 to 4)
1516 * \param format AC_FETCH_FORMAT_xxx value
1517 * \param reverse whether XYZ channels are reversed
1518 * \param known_aligned whether the source is known to be aligned to hardware's
1519 *                      effective element size for loading the given format
1520 *                      (note: this means dword alignment for 8_8_8_8, 16_16, etc.)
1521 * \param rsrc buffer resource descriptor
1522 * \return the resulting vector of floats or integers bitcast to <4 x i32>
1523 */
LLVMValueRef ac_build_opencoded_load_format(struct ac_llvm_context *ctx, unsigned log_size,
                                            unsigned num_channels, unsigned format, bool reverse,
                                            bool known_aligned, LLVMValueRef rsrc,
                                            LLVMValueRef vindex, LLVMValueRef voffset,
                                            LLVMValueRef soffset, unsigned cache_policy,
                                            bool can_speculate)
{
   LLVMValueRef tmp;
   unsigned load_log_size = log_size;
   unsigned load_num_channels = num_channels;
   /* 64-bit "channels" are fetched as dwords: two dwords per channel for real
    * 64-bit floats, or a single packed dword for the special 10_11_11 /
    * 2_10_10_10 interpretations described in the function comment. */
   if (log_size == 3) {
      load_log_size = 2;
      if (format == AC_FETCH_FORMAT_FLOAT) {
         load_num_channels = 2 * num_channels;
      } else {
         load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
      }
   }

   /* log_recombine > 0: loaded pieces must be OR'd together into wider values.
    * log_recombine < 0: a single wide load must be split into narrower values. */
   int log_recombine = 0;
   if ((ctx->gfx_level == GFX6 || ctx->gfx_level >= GFX10) && !known_aligned) {
      /* Avoid alignment restrictions by loading one byte at a time. */
      load_num_channels <<= load_log_size;
      log_recombine = load_log_size;
      load_log_size = 0;
   } else if (load_num_channels == 2 || load_num_channels == 4) {
      /* Fuse 2 or 4 channels into one wider load. */
      log_recombine = -util_logbase2(load_num_channels);
      load_num_channels = 1;
      load_log_size += -log_recombine;
   }

   /* Issue the raw (non-format) buffer loads. */
   LLVMValueRef loads[32]; /* up to 32 bytes */
   for (unsigned i = 0; i < load_num_channels; ++i) {
      tmp =
         LLVMBuildAdd(ctx->builder, soffset, LLVMConstInt(ctx->i32, i << load_log_size, false), "");
      LLVMTypeRef channel_type =
         load_log_size == 0 ? ctx->i8 : load_log_size == 1 ? ctx->i16 : ctx->i32;
      /* Loads wider than a dword become multi-dword vector loads. */
      unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
      loads[i] =
         ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, tmp, num_channels, channel_type,
                                     cache_policy, can_speculate, false, true);
      if (load_log_size >= 2)
         loads[i] = ac_to_integer(ctx, loads[i]);
   }

   if (log_recombine > 0) {
      /* Recombine bytes if necessary (GFX6 only) */
      LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16;

      for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {
         LLVMValueRef accum = NULL;
         /* OR the zero-extended bytes together, little-endian. */
         for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {
            tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, "");
            if (i == 0) {
               accum = tmp;
            } else {
               tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(dst_type, 8 * i, false), "");
               accum = LLVMBuildOr(ctx->builder, accum, tmp, "");
            }
         }
         loads[dst] = accum;
      }
   } else if (log_recombine < 0) {
      /* Split vectors of dwords */
      if (load_log_size > 2) {
         assert(load_num_channels == 1);
         LLVMValueRef loaded = loads[0];
         unsigned log_split = load_log_size - 2;
         log_recombine += log_split;
         load_num_channels = 1 << log_split;
         load_log_size = 2;
         for (unsigned i = 0; i < load_num_channels; ++i) {
            tmp = LLVMConstInt(ctx->i32, i, false);
            loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, "");
         }
      }

      /* Further split dwords and shorts if required */
      if (log_recombine < 0) {
         /* Iterate backwards so narrower pieces can be written into loads[]
          * in place without clobbering sources that are still needed. */
         for (unsigned src = load_num_channels, dst = load_num_channels << -log_recombine; src > 0;
              --src) {
            unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);
            LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits);
            LLVMValueRef loaded = loads[src - 1];
            LLVMTypeRef loaded_type = LLVMTypeOf(loaded);
            for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {
               tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false);
               tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, "");
               loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, "");
            }
         }
      }
   }

   /* Undo the size-8 packing: rebuild f64 channels, or unpack the special
    * 10_11_11 / 2_10_10_10 layouts into per-channel integers. */
   if (log_size == 3) {
      if (format == AC_FETCH_FORMAT_FLOAT) {
         for (unsigned i = 0; i < num_channels; ++i) {
            /* Each f64 channel is two consecutive dwords. */
            tmp = ac_build_gather_values(ctx, &loads[2 * i], 2);
            loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, "");
         }
      } else if (format == AC_FETCH_FORMAT_FIXED) {
         /* 10_11_11_FLOAT */
         LLVMValueRef data = loads[0];
         LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false);
         LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, "");
         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), "");
         LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, "");
         LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), "");

         /* Expand the 5e6m (11-bit) and 5e5m (10-bit) packed floats to f32. */
         loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6));
         loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6));
         loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5));

         /* From here on, treat the data as three 32-bit float channels. */
         num_channels = 3;
         log_size = 2;
         format = AC_FETCH_FORMAT_FLOAT;
      } else {
         /* 2_10_10_10 data formats */
         LLVMValueRef data = loads[0];
         LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10);
         LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2);
         loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, "");
         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), "");
         loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), "");
         loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), "");
         loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, "");

         num_channels = 4;
      }
   }

   /* Per-format conversion of each channel to 32 bits. */
   if (format == AC_FETCH_FORMAT_FLOAT) {
      if (log_size != 2) {
         for (unsigned chan = 0; chan < num_channels; ++chan) {
            tmp = ac_to_float(ctx, loads[chan]);
            if (log_size == 3)
               tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, "");
            else if (log_size == 1)
               tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, "");
            loads[chan] = ac_to_integer(ctx, tmp);
         }
      }
   } else if (format == AC_FETCH_FORMAT_UINT) {
      if (log_size != 2) {
         for (unsigned chan = 0; chan < num_channels; ++chan)
            loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, "");
      }
   } else if (format == AC_FETCH_FORMAT_SINT) {
      if (log_size != 2) {
         for (unsigned chan = 0; chan < num_channels; ++chan)
            loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, "");
      }
   } else {
      /* Normalized/scaled/fixed formats: convert to float, then scale. */
      bool unsign = format == AC_FETCH_FORMAT_UNORM || format == AC_FETCH_FORMAT_USCALED ||
                    format == AC_FETCH_FORMAT_UINT;

      for (unsigned chan = 0; chan < num_channels; ++chan) {
         if (unsign) {
            tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, "");
         } else {
            tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, "");
         }

         LLVMValueRef scale = NULL;
         if (format == AC_FETCH_FORMAT_FIXED) {
            /* 16.16 fixed point: divide by 2^16. */
            assert(log_size == 2);
            scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000);
         } else if (format == AC_FETCH_FORMAT_UNORM) {
            unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
            scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1));
         } else if (format == AC_FETCH_FORMAT_SNORM) {
            unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
            scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1));
         }
         if (scale)
            tmp = LLVMBuildFMul(ctx->builder, tmp, scale, "");

         if (format == AC_FETCH_FORMAT_SNORM) {
            /* Clamp to [-1, 1] */
            LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
            LLVMValueRef clamp = LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, "");
            tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, "");
         }

         loads[chan] = ac_to_integer(ctx, tmp);
      }
   }

   /* Pad missing channels with the default (0, 0, 0, 1). */
   while (num_channels < 4) {
      if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) {
         loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0;
      } else {
         loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0);
      }
      num_channels++;
   }

   /* Swap X and Z for reversed channel order. */
   if (reverse) {
      tmp = loads[0];
      loads[0] = loads[2];
      loads[2] = tmp;
   }

   return ac_build_gather_values(ctx, loads, 4);
}
1731
1732void ac_build_buffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1733                                 LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
1734                                 unsigned cache_policy)
1735{
1736   vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
1737
1738   ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false);
1739}
1740
1741void ac_build_buffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
1742                                LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy)
1743{
1744   vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
1745
1746   ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false);
1747}
1748
1749/**
1750 * Set range metadata on an instruction.  This can only be used on load and
1751 * call instructions.  If you know an instruction can only produce the values
1752 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1753 * \p lo is the minimum value inclusive.
1754 * \p hi is the maximum value exclusive.
1755 */
1756void ac_set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
1757                           unsigned hi)
1758{
1759   LLVMValueRef range_md, md_args[2];
1760   LLVMTypeRef type = LLVMTypeOf(value);
1761   LLVMContextRef context = LLVMGetTypeContext(type);
1762
1763   md_args[0] = LLVMConstInt(type, lo, false);
1764   md_args[1] = LLVMConstInt(type, hi, false);
1765   range_md = LLVMMDNodeInContext(context, md_args, 2);
1766   LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1767}
1768
1769LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)
1770{
1771   return ac_build_mbcnt(ctx, LLVMConstInt(ctx->iN_wavemask, ~0ull, 0));
1772}
1773
1774/*
1775 * AMD GCN implements derivatives using the local data store (LDS)
1776 * All writes to the LDS happen in all executing threads at
1777 * the same time. TID is the Thread ID for the current
1778 * thread and is a value between 0 and 63, representing
1779 * the thread's position in the wavefront.
1780 *
1781 * For the pixel shader threads are grouped into quads of four pixels.
1782 * The TIDs of the pixels of a quad are:
1783 *
1784 *  +------+------+
1785 *  |4n + 0|4n + 1|
1786 *  +------+------+
1787 *  |4n + 2|4n + 3|
1788 *  +------+------+
1789 *
1790 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1791 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1792 * the current pixel's column, and masking with 0xfffffffe yields the TID
1793 * of the left pixel of the current pixel's row.
1794 *
1795 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
1796 * adding 2 yields the TID of the pixel below the top pixel.
1797 */
LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val)
{
   /* Compute a derivative by exchanging values within the quad (see the
    * comment above): \p mask selects the "base" lane of each pixel and
    * \p idx is the lane offset of its neighbor; the result is
    * neighbor - base, wrapped in the wqm intrinsic. */
   unsigned tl_lanes[4], trbl_lanes[4];
   char name[32], type[8];
   LLVMValueRef tl, trbl;
   LLVMTypeRef result_type;
   LLVMValueRef result;

   result_type = ac_to_float_type(ctx, LLVMTypeOf(val));

   /* Widen 16-bit inputs to i32 for the quad swizzle. */
   if (result_type == ctx->f16)
      val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
   else if (result_type == ctx->v2f16)
      val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");

   /* For quad lane i: tl reads the masked base lane, trbl the lane idx
    * steps past it. */
   for (unsigned i = 0; i < 4; ++i) {
      tl_lanes[i] = i & mask;
      trbl_lanes[i] = (i & mask) + idx;
   }

   tl = ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3]);
   trbl =
      ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]);

   /* Narrow back to 16 bits before subtracting. */
   if (result_type == ctx->f16) {
      tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
      trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
   }

   tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
   trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
   result = LLVMBuildFSub(ctx->builder, trbl, tl, "");

   /* Pass the difference through llvm.amdgcn.wqm of the matching type. */
   ac_build_type_name_for_intr(result_type, type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);

   return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
}
1836
1837void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t msg, LLVMValueRef wave_id)
1838{
1839   LLVMValueRef args[2];
1840   args[0] = LLVMConstInt(ctx->i32, msg, false);
1841   args[1] = wave_id;
1842   ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
1843}
1844
1845LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
1846{
1847   LLVMValueRef msb =
1848      ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, AC_FUNC_ATTR_READNONE);
1849
1850   /* The HW returns the last bit index from MSB, but NIR/TGSI wants
1851    * the index from LSB. Invert it by doing "31 - msb". */
1852   msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), msb, "");
1853
1854   LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
1855   LLVMValueRef cond =
1856      LLVMBuildOr(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, ctx->i32_0, ""),
1857                  LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, all_ones, ""), "");
1858
1859   return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
1860}
1861
1862LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
1863{
1864   const char *intrin_name;
1865   LLVMTypeRef type;
1866   LLVMValueRef highest_bit;
1867   LLVMValueRef zero;
1868   unsigned bitsize;
1869
1870   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
1871   switch (bitsize) {
1872   case 64:
1873      intrin_name = "llvm.ctlz.i64";
1874      type = ctx->i64;
1875      highest_bit = LLVMConstInt(ctx->i64, 63, false);
1876      zero = ctx->i64_0;
1877      break;
1878   case 32:
1879      intrin_name = "llvm.ctlz.i32";
1880      type = ctx->i32;
1881      highest_bit = LLVMConstInt(ctx->i32, 31, false);
1882      zero = ctx->i32_0;
1883      break;
1884   case 16:
1885      intrin_name = "llvm.ctlz.i16";
1886      type = ctx->i16;
1887      highest_bit = LLVMConstInt(ctx->i16, 15, false);
1888      zero = ctx->i16_0;
1889      break;
1890   case 8:
1891      intrin_name = "llvm.ctlz.i8";
1892      type = ctx->i8;
1893      highest_bit = LLVMConstInt(ctx->i8, 7, false);
1894      zero = ctx->i8_0;
1895      break;
1896   default:
1897      unreachable(!"invalid bitsize");
1898      break;
1899   }
1900
1901   LLVMValueRef params[2] = {
1902      arg,
1903      ctx->i1true,
1904   };
1905
1906   LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);
1907
1908   /* The HW returns the last bit index from MSB, but TGSI/NIR wants
1909    * the index from LSB. Invert it by doing "31 - msb". */
1910   msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
1911
1912   if (bitsize == 64) {
1913      msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
1914   } else if (bitsize < 32) {
1915      msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");
1916   }
1917
1918   /* check for zero */
1919   return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
1920                          LLVMConstInt(ctx->i32, -1, true), msb, "");
1921}
1922
1923LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1924{
1925   char name[64], type[64];
1926
1927   ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));
1928   snprintf(name, sizeof(name), "llvm.minnum.%s", type);
1929   LLVMValueRef args[2] = {a, b};
1930   return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE);
1931}
1932
1933LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1934{
1935   char name[64], type[64];
1936
1937   ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));
1938   snprintf(name, sizeof(name), "llvm.maxnum.%s", type);
1939   LLVMValueRef args[2] = {a, b};
1940   return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE);
1941}
1942
1943LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1944{
1945   LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
1946   return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1947}
1948
1949LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1950{
1951   LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
1952   return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1953}
1954
1955LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1956{
1957   LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
1958   return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1959}
1960
1961LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1962{
1963   LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
1964   return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1965}
1966
1967LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
1968{
1969   LLVMTypeRef t = LLVMTypeOf(value);
1970   return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),
1971                        LLVMConstReal(t, 1.0));
1972}
1973
void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
{
   /* Emit an export intrinsic: either the compressed form (two v2i16
    * operands, pre-GFX11 only) or the plain form with four f32 operands. */
   LLVMValueRef args[9];

   args[0] = LLVMConstInt(ctx->i32, a->target, 0);           /* export target */
   args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0); /* channel enable mask */

   if (a->compr) {
      /* Compressed exports are not supported on GFX11+. */
      assert(ctx->gfx_level < GFX11);

      args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, "");
      args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, "");
      args[4] = LLVMConstInt(ctx->i1, a->done, 0);       /* DONE bit */
      args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0); /* valid mask flag */

      ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0);
   } else {
      args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->f32, "");
      args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->f32, "");
      args[4] = LLVMBuildBitCast(ctx->builder, a->out[2], ctx->f32, "");
      args[5] = LLVMBuildBitCast(ctx->builder, a->out[3], ctx->f32, "");
      args[6] = LLVMConstInt(ctx->i1, a->done, 0);       /* DONE bit */
      args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0); /* valid mask flag */

      ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0);
   }
}
2001
2002void ac_build_export_null(struct ac_llvm_context *ctx, bool uses_discard)
2003{
2004   struct ac_export_args args;
2005
2006   /* Gfx10+ doesn't need to export anything if we don't need to export the EXEC mask
2007    * for discard.
2008    */
2009   if (ctx->gfx_level >= GFX10 && !uses_discard)
2010      return;
2011
2012   args.enabled_channels = 0x0; /* enabled channels */
2013   args.valid_mask = 1;         /* whether the EXEC mask is valid */
2014   args.done = 1;               /* DONE bit */
2015   /* Gfx11 doesn't support null exports, and mrt0 should be exported instead. */
2016   args.target = ctx->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
2017   args.compr = 0;                       /* COMPR flag (0 = 32-bit export) */
2018   args.out[0] = LLVMGetUndef(ctx->f32); /* R */
2019   args.out[1] = LLVMGetUndef(ctx->f32); /* G */
2020   args.out[2] = LLVMGetUndef(ctx->f32); /* B */
2021   args.out[3] = LLVMGetUndef(ctx->f32); /* A */
2022
2023   ac_build_export(ctx, &args);
2024}
2025
2026static unsigned ac_num_coords(enum ac_image_dim dim)
2027{
2028   switch (dim) {
2029   case ac_image_1d:
2030      return 1;
2031   case ac_image_2d:
2032   case ac_image_1darray:
2033      return 2;
2034   case ac_image_3d:
2035   case ac_image_cube:
2036   case ac_image_2darray:
2037   case ac_image_2dmsaa:
2038      return 3;
2039   case ac_image_2darraymsaa:
2040      return 4;
2041   default:
2042      unreachable("ac_num_coords: bad dim");
2043   }
2044}
2045
2046static unsigned ac_num_derivs(enum ac_image_dim dim)
2047{
2048   switch (dim) {
2049   case ac_image_1d:
2050   case ac_image_1darray:
2051      return 2;
2052   case ac_image_2d:
2053   case ac_image_2darray:
2054   case ac_image_cube:
2055      return 4;
2056   case ac_image_3d:
2057      return 6;
2058   case ac_image_2dmsaa:
2059   case ac_image_2darraymsaa:
2060   default:
2061      unreachable("derivatives not supported");
2062   }
2063}
2064
2065static const char *get_atomic_name(enum ac_atomic_op op)
2066{
2067   switch (op) {
2068   case ac_atomic_swap:
2069      return "swap";
2070   case ac_atomic_add:
2071      return "add";
2072   case ac_atomic_sub:
2073      return "sub";
2074   case ac_atomic_smin:
2075      return "smin";
2076   case ac_atomic_umin:
2077      return "umin";
2078   case ac_atomic_smax:
2079      return "smax";
2080   case ac_atomic_umax:
2081      return "umax";
2082   case ac_atomic_and:
2083      return "and";
2084   case ac_atomic_or:
2085      return "or";
2086   case ac_atomic_xor:
2087      return "xor";
2088   case ac_atomic_inc_wrap:
2089      return "inc";
2090   case ac_atomic_dec_wrap:
2091      return "dec";
2092   case ac_atomic_fmin:
2093      return "fmin";
2094   case ac_atomic_fmax:
2095      return "fmax";
2096   }
2097   unreachable("bad atomic op");
2098}
2099
LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a)
{
   /* Build and emit an llvm.amdgcn.image.* intrinsic call from the fully
    * described image operation in \p a: validate the argument combination,
    * assemble the operand list in intrinsic order, construct the mangled
    * intrinsic name, and post-process the result (TFE struct, int cast). */
   const char *overload[3] = {"", "", ""};
   unsigned num_overloads = 0;
   LLVMValueRef args[18];
   unsigned num_args = 0;
   enum ac_image_dim dim = a->dim;

   /* Check modifier exclusivity and operand bit widths. */
   assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero);
   assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
           a->opcode != ac_image_store_mip) ||
          a->lod);
   assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
          (!a->compare && !a->offset));
   assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
           a->opcode == ac_image_get_lod) ||
          !a->bias);
   assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <=
          1);
   assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) <= 1);
   assert(!a->d16 || (ctx->gfx_level >= GFX8 && a->opcode != ac_image_atomic &&
                      a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&
                      a->opcode != ac_image_get_resinfo));
   assert(!a->a16 || ctx->gfx_level >= GFX9);
   assert(a->g16 == a->a16 || ctx->gfx_level >= GFX10);

   assert(!a->offset ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->offset)) == 32);
   assert(!a->bias ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->bias)) == 32);
   assert(!a->compare ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->compare)) == 32);
   assert(!a->derivs[0] ||
          ((!a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 16) &&
           (a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 32)));
   assert(!a->coords[0] ||
          ((!a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 16) &&
           (a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 32)));
   assert(!a->lod ||
          ((a->opcode != ac_image_get_resinfo || ac_get_elem_bits(ctx, LLVMTypeOf(a->lod))) &&
           (a->opcode == ac_image_get_resinfo ||
            ac_get_elem_bits(ctx, LLVMTypeOf(a->lod)) ==
            ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])))));
   assert(!a->min_lod ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->min_lod)) ==
          ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])));

   /* For getlod, use the non-array base dim in the intrinsic name. */
   if (a->opcode == ac_image_get_lod) {
      switch (dim) {
      case ac_image_1darray:
         dim = ac_image_1d;
         break;
      case ac_image_2darray:
      case ac_image_cube:
         dim = ac_image_2d;
         break;
      default:
         break;
      }
   }

   /* sample -> float coordinates and a sampler operand;
    * load -> the load cache policy applies. */
   bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
                 a->opcode == ac_image_get_lod;
   bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;
   bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
               a->opcode == ac_image_load || a->opcode == ac_image_load_mip;
   LLVMTypeRef coord_type = sample ? (a->a16 ? ctx->f16 : ctx->f32) : (a->a16 ? ctx->i16 : ctx->i32);
   uint8_t dmask = a->dmask;
   LLVMTypeRef data_type;
   char data_type_str[32];

   /* Determine the data (store operand / return) type and dmask. */
   if (atomic) {
      data_type = LLVMTypeOf(a->data[0]);
   } else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      /* Image stores might have been shrinked using the format. */
      data_type = LLVMTypeOf(a->data[0]);
      dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;
   } else {
      data_type = a->d16 ? ctx->v4f16 : ctx->v4f32;
   }

   /* With TFE the intrinsic returns {data, i32} instead of plain data. */
   if (a->tfe) {
      data_type = LLVMStructTypeInContext(
         ctx->context, (LLVMTypeRef[]){data_type, ctx->i32}, 2, false);
   }

   /* Assemble the operand list in intrinsic order, starting with vdata
    * for stores/atomics (plus the compare value for cmpswap). */
   if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      args[num_args++] = a->data[0];
      if (a->opcode == ac_image_atomic_cmpswap)
         args[num_args++] = a->data[1];
   }

   if (!atomic)
      args[num_args++] = LLVMConstInt(ctx->i32, dmask, false);

   /* Optional modifiers, in the order the intrinsics expect them. */
   if (a->offset)
      args[num_args++] = ac_to_integer(ctx, a->offset);
   if (a->bias) {
      args[num_args++] = ac_to_float(ctx, a->bias);
      overload[num_overloads++] = ".f32";
   }
   if (a->compare)
      args[num_args++] = ac_to_float(ctx, a->compare);
   if (a->derivs[0]) {
      unsigned count = ac_num_derivs(dim);
      for (unsigned i = 0; i < count; ++i)
         args[num_args++] = ac_to_float(ctx, a->derivs[i]);
      overload[num_overloads++] = a->g16 ? ".f16" : ".f32";
   }
   /* Coordinates, then lod/min_lod, all in the coordinate type. */
   unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
   for (unsigned i = 0; i < num_coords; ++i)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
   if (a->lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
   if (a->min_lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");

   overload[num_overloads++] = sample ? (a->a16 ? ".f16" : ".f32") : (a->a16 ? ".i16" : ".i32");

   /* Descriptors and control operands come last. */
   args[num_args++] = a->resource;
   if (sample) {
      args[num_args++] = a->sampler;
      args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
   }

   args[num_args++] = a->tfe ? ctx->i32_1 : ctx->i32_0; /* texfailctrl */
   args[num_args++] = LLVMConstInt(
      ctx->i32, load ? get_load_cache_policy(ctx, a->cache_policy) : a->cache_policy, false);

   /* Base intrinsic name (and atomic sub-op). */
   const char *name;
   const char *atomic_subop = "";
   switch (a->opcode) {
   case ac_image_sample:
      name = "sample";
      break;
   case ac_image_gather4:
      name = "gather4";
      break;
   case ac_image_load:
      name = "load";
      break;
   case ac_image_load_mip:
      name = "load.mip";
      break;
   case ac_image_store:
      name = "store";
      break;
   case ac_image_store_mip:
      name = "store.mip";
      break;
   case ac_image_atomic:
      name = "atomic.";
      atomic_subop = get_atomic_name(a->atomic);
      break;
   case ac_image_atomic_cmpswap:
      name = "atomic.";
      atomic_subop = "cmpswap";
      break;
   case ac_image_get_lod:
      name = "getlod";
      break;
   case ac_image_get_resinfo:
      name = "getresinfo";
      break;
   default:
      unreachable("invalid image opcode");
   }

   /* Dimension part of the intrinsic name. */
   const char *dimname;
   switch (dim) {
   case ac_image_1d:
      dimname = "1d";
      break;
   case ac_image_2d:
      dimname = "2d";
      break;
   case ac_image_3d:
      dimname = "3d";
      break;
   case ac_image_cube:
      dimname = "cube";
      break;
   case ac_image_1darray:
      dimname = "1darray";
      break;
   case ac_image_2darray:
      dimname = "2darray";
      break;
   case ac_image_2dmsaa:
      dimname = "2dmsaa";
      break;
   case ac_image_2darraymsaa:
      dimname = "2darraymsaa";
      break;
   default:
      unreachable("invalid dim");
   }

   ac_build_type_name_for_intr(data_type, data_type_str, sizeof(data_type_str));

   /* A plain lod only becomes the ".l" suffix for sample/gather. */
   bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
   char intr_name[96];
   snprintf(intr_name, sizeof(intr_name),
            "llvm.amdgcn.image.%s%s" /* base name */
            "%s%s%s%s"               /* sample/gather modifiers */
            ".%s.%s%s%s%s",          /* dimension and type overloads */
            name, atomic_subop, a->compare ? ".c" : "",
            a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "",
            a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname,
            data_type_str, overload[0], overload[1], overload[2]);

   LLVMTypeRef retty;
   if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
      retty = ctx->voidt;
   else
      retty = data_type;

   LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);
   if (a->tfe) {
      /* Flatten the {data, i32} result into one vector. */
      LLVMValueRef texel = LLVMBuildExtractValue(ctx->builder, result, 0, "");
      LLVMValueRef code = LLVMBuildExtractValue(ctx->builder, result, 1, "");
      result = ac_build_concat(ctx, texel, ac_to_float(ctx, code));
   }

   /* Non-sample, non-atomic results are returned as integers. */
   if (!sample && !atomic && retty != ctx->voidt)
      result = ac_to_integer(ctx, result);

   return result;
}
2329
2330LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc)
2331{
2332   LLVMValueRef samples;
2333
2334   /* Read the samples from the descriptor directly.
2335    * Hardware doesn't have any instruction for this.
2336    */
2337   samples = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 3, 0), "");
2338   samples = LLVMBuildLShr(ctx->builder, samples, LLVMConstInt(ctx->i32, 16, 0), "");
2339   samples = LLVMBuildAnd(ctx->builder, samples, LLVMConstInt(ctx->i32, 0xf, 0), "");
2340   samples = LLVMBuildShl(ctx->builder, ctx->i32_1, samples, "");
2341   return samples;
2342}
2343
2344LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
2345{
2346   return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, args, 2,
2347                             AC_FUNC_ATTR_READNONE);
2348}
2349
2350LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
2351{
2352   LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args, 2,
2353                                         AC_FUNC_ATTR_READNONE);
2354   return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2355}
2356
2357LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
2358{
2359   LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args, 2,
2360                                         AC_FUNC_ATTR_READNONE);
2361   return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2362}
2363
2364LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx,
2365                                         LLVMValueRef args[2])
2366{
2367   LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
2368   LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false);
2369   LLVMValueRef code = LLVMConstInlineAsm(calltype,
2370                                          ctx->gfx_level >= GFX11 ?
2371                                             "v_cvt_pk_norm_i16_f16 $0, $1, $2" :
2372                                             "v_cvt_pknorm_i16_f16 $0, $1, $2",
2373                                          "=v,v,v", false, false);
2374   return LLVMBuildCall2(ctx->builder, calltype, code, args, 2, "");
2375}
2376
2377LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx,
2378                                         LLVMValueRef args[2])
2379{
2380   LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
2381   LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false);
2382   LLVMValueRef code = LLVMConstInlineAsm(calltype,
2383                                          ctx->gfx_level >= GFX11 ?
2384                                             "v_cvt_pk_norm_u16_f16 $0, $1, $2" :
2385                                             "v_cvt_pknorm_u16_f16 $0, $1, $2",
2386                                          "=v,v,v", false, false);
2387   return LLVMBuildCall2(ctx->builder, calltype, code, args, 2, "");
2388}
2389
2390/* The 8-bit and 10-bit clamping is for HW workarounds. */
2391LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2392                                 bool hi)
2393{
2394   assert(bits == 8 || bits == 10 || bits == 16);
2395
2396   LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2397   LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2398   LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1;
2399   LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2400
2401   /* Clamp. */
2402   if (bits != 16) {
2403      for (int i = 0; i < 2; i++) {
2404         bool alpha = hi && i == 1;
2405         args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb);
2406         args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb);
2407      }
2408   }
2409
2410   LLVMValueRef res =
2411      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
2412   return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2413}
2414
2415/* The 8-bit and 10-bit clamping is for HW workarounds. */
2416LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2417                                 bool hi)
2418{
2419   assert(bits == 8 || bits == 10 || bits == 16);
2420
2421   LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2422   LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2423
2424   /* Clamp. */
2425   if (bits != 16) {
2426      for (int i = 0; i < 2; i++) {
2427         bool alpha = hi && i == 1;
2428         args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb);
2429      }
2430   }
2431
2432   LLVMValueRef res =
2433      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
2434   return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2435}
2436
2437LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
2438{
2439   return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, &i1, 1, AC_FUNC_ATTR_READNONE);
2440}
2441
2442void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
2443{
2444   ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, &i1, 1, 0);
2445}
2446
2447LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset,
2448                          LLVMValueRef width, bool is_signed)
2449{
2450   LLVMValueRef args[] = {
2451      input,
2452      offset,
2453      width,
2454   };
2455
2456   return ac_build_intrinsic(ctx, is_signed ? "llvm.amdgcn.sbfe.i32" : "llvm.amdgcn.ubfe.i32",
2457                             ctx->i32, args, 3, AC_FUNC_ATTR_READNONE);
2458}
2459
2460LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
2461                           LLVMValueRef s2)
2462{
2463   return LLVMBuildAdd(ctx->builder, LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");
2464}
2465
2466LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
2467                           LLVMValueRef s2)
2468{
2469   /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
2470   if (ctx->gfx_level >= GFX10) {
2471      return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, (LLVMValueRef[]){s0, s1, s2}, 3,
2472                                AC_FUNC_ATTR_READNONE);
2473   }
2474
2475   return LLVMBuildFAdd(ctx->builder, LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
2476}
2477
/* Wait for outstanding memory/export operations selected by wait_flags
 * (AC_WAIT_* bits), emitted as s_waitcnt or as an LLVM fence.
 */
void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
{
   if (!wait_flags)
      return;

   /* Per-counter maxima mean "don't wait on this counter". */
   unsigned expcnt = 7;
   unsigned lgkmcnt = 63;
   unsigned vmcnt = ctx->gfx_level >= GFX9 ? 63 : 15;
   unsigned vscnt = 63;

   if (wait_flags & AC_WAIT_EXP)
      expcnt = 0;
   if (wait_flags & AC_WAIT_LGKM)
      lgkmcnt = 0;
   if (wait_flags & AC_WAIT_VLOAD)
      vmcnt = 0;

   if (wait_flags & AC_WAIT_VSTORE) {
      /* Stores have their own counter (vscnt) only on GFX10+; older
       * chips count them in vmcnt. */
      if (ctx->gfx_level >= GFX10)
         vscnt = 0;
      else
         vmcnt = 0;
   }

   /* There is no intrinsic for vscnt(0), so use a fence. */
   if ((wait_flags & AC_WAIT_LGKM && wait_flags & AC_WAIT_VLOAD && wait_flags & AC_WAIT_VSTORE) ||
       vscnt == 0) {
      assert(!(wait_flags & AC_WAIT_EXP));
      LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
      return;
   }

   unsigned simm16;

   /* Pack the counters into the s_waitcnt immediate; the field layout
    * changed on GFX11. Pre-GFX11, vmcnt is split: low 4 bits at [3:0]
    * and the upper bits at [15:14]. */
   if (ctx->gfx_level >= GFX11)
      simm16 = expcnt | (lgkmcnt << 4) | (vmcnt << 10);
   else
      simm16 = (lgkmcnt << 8) | (expcnt << 4) | (vmcnt & 0xf) | ((vmcnt >> 4) << 14);

   LLVMValueRef args[1] = {
      LLVMConstInt(ctx->i32, simm16, false),
   };
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
}
2522
2523LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src,
2524                           LLVMTypeRef type)
2525{
2526   unsigned bitsize = ac_get_elem_bits(ctx, type);
2527   LLVMValueRef zero = LLVMConstReal(type, 0.0);
2528   LLVMValueRef one = LLVMConstReal(type, 1.0);
2529   LLVMValueRef result;
2530
2531   if (bitsize == 64 || (bitsize == 16 && ctx->gfx_level <= GFX8) || type == ctx->v2f16) {
2532      /* Use fmin/fmax for 64-bit fsat or 16-bit on GFX6-GFX8 because LLVM
2533       * doesn't expose an intrinsic.
2534       */
2535      result = ac_build_fmin(ctx, ac_build_fmax(ctx, src, zero), one);
2536   } else {
2537      LLVMTypeRef type;
2538      char *intr;
2539
2540      if (bitsize == 16) {
2541         intr = "llvm.amdgcn.fmed3.f16";
2542         type = ctx->f16;
2543      } else {
2544         assert(bitsize == 32);
2545         intr = "llvm.amdgcn.fmed3.f32";
2546         type = ctx->f32;
2547      }
2548
2549      LLVMValueRef params[] = {
2550         zero,
2551         one,
2552         src,
2553      };
2554
2555      result = ac_build_intrinsic(ctx, intr, type, params, 3,
2556                                  AC_FUNC_ATTR_READNONE);
2557   }
2558
2559   if (ctx->gfx_level < GFX9 && bitsize == 32) {
2560      /* Only pre-GFX9 chips do not flush denorms. */
2561      result = ac_build_canonicalize(ctx, result, bitsize);
2562   }
2563
2564   return result;
2565}
2566
2567LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
2568{
2569   LLVMTypeRef type;
2570   char *intr;
2571
2572   if (bitsize == 16) {
2573      intr = "llvm.amdgcn.fract.f16";
2574      type = ctx->f16;
2575   } else if (bitsize == 32) {
2576      intr = "llvm.amdgcn.fract.f32";
2577      type = ctx->f32;
2578   } else {
2579      intr = "llvm.amdgcn.fract.f64";
2580      type = ctx->f64;
2581   }
2582
2583   LLVMValueRef params[] = {
2584      src0,
2585   };
2586   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
2587}
2588
2589LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value)
2590{
2591
2592   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
2593      LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0);
2594      unsigned vec_size = LLVMGetVectorSize(type);
2595      LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef));
2596
2597      for (unsigned i = 0; i < vec_size; i++)
2598         scalars[i] = scalar;
2599      return LLVMConstVector(scalars, vec_size);
2600   }
2601   return LLVMConstInt(type, value, 0);
2602}
2603
2604LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
2605{
2606   LLVMTypeRef type = LLVMTypeOf(src0);
2607   LLVMValueRef val;
2608
2609   /* v_med3 is selected only when max is first. (LLVM bug?) */
2610   val = ac_build_imax(ctx, src0, ac_const_uint_vec(ctx, type, -1));
2611   return ac_build_imin(ctx, val, ac_const_uint_vec(ctx, type, 1));
2612}
2613
/* Turn -0.0 into +0.0 while leaving all other values unchanged. */
static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
{
   /* Signed zeros are temporarily enabled, presumably so the optimizer
    * can't fold (val + 0) back to val — TODO confirm. */
   ac_enable_signed_zeros(ctx);
   /* (val + 0) converts negative zero to positive zero. */
   val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), "");
   ac_disable_signed_zeros(ctx);
   return val;
}
2622
/* Floating-point sign: -1.0, 0.0, or +1.0 matching the sign of src
 * (16/32/64-bit scalar floats).
 */
LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef pos, neg, dw[2], val;
   unsigned bitsize = ac_get_elem_bits(ctx, type);

   /* The standard version leads to this:
    *   v_cmp_ngt_f32_e64 s[0:1], s4, 0                       ; D40B0000 00010004
    *   v_cndmask_b32_e64 v4, 1.0, s4, s[0:1]                 ; D5010004 000008F2
    *   v_cmp_le_f32_e32 vcc, 0, v4                           ; 7C060880
    *   v_cndmask_b32_e32 v4, -1.0, v4, vcc                   ; 020808F3
    *
    * The isign version:
    *   v_add_f32_e64 v4, s4, 0                               ; D5030004 00010004
    *   v_med3_i32 v4, v4, -1, 1                              ; D5580004 02058304
    *   v_cvt_f32_i32_e32 v4, v4                              ; 7E080B04
    *
    * (src0 + 0) converts negative zero to positive zero.
    * After that, int(fsign(x)) == isign(floatBitsToInt(x)).
    *
    * For FP64, use the standard version, which doesn't suffer from the huge DP rate
    * reduction. (FP64 comparisons are as fast as int64 comparisons)
    */
   if (bitsize == 16 || bitsize == 32) {
      /* Cheap path: integer sign of the (zero-normalized) bit pattern. */
      val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
      val = ac_build_isign(ctx, val);
      return LLVMBuildSIToFP(ctx->builder, val, type, "");
   }

   assert(bitsize == 64);
   pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
   neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");
   /* Assemble the f64 result from two dwords: low dword is 0; the high
    * dword selects the bit pattern of +1.0, -1.0, or 0.0. */
   dw[0] = ctx->i32_0;
   dw[1] = LLVMBuildSelect(
      ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0),
      LLVMBuildSelect(ctx->builder, neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, ""),
      "");
   return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, "");
}
2662
2663LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
2664{
2665   LLVMValueRef result;
2666   unsigned bitsize;
2667
2668   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2669
2670   switch (bitsize) {
2671   case 128:
2672      result = ac_build_intrinsic(ctx, "llvm.ctpop.i128", ctx->i128, (LLVMValueRef[]){src0}, 1,
2673                                  AC_FUNC_ATTR_READNONE);
2674      result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2675      break;
2676   case 64:
2677      result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, (LLVMValueRef[]){src0}, 1,
2678                                  AC_FUNC_ATTR_READNONE);
2679
2680      result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2681      break;
2682   case 32:
2683      result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, (LLVMValueRef[]){src0}, 1,
2684                                  AC_FUNC_ATTR_READNONE);
2685      break;
2686   case 16:
2687      result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16, (LLVMValueRef[]){src0}, 1,
2688                                  AC_FUNC_ATTR_READNONE);
2689
2690      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2691      break;
2692   case 8:
2693      result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8, (LLVMValueRef[]){src0}, 1,
2694                                  AC_FUNC_ATTR_READNONE);
2695
2696      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2697      break;
2698   default:
2699      unreachable(!"invalid bitsize");
2700      break;
2701   }
2702
2703   return result;
2704}
2705
2706LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)
2707{
2708   LLVMValueRef result;
2709   unsigned bitsize;
2710
2711   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2712
2713   switch (bitsize) {
2714   case 64:
2715      result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64, (LLVMValueRef[]){src0}, 1,
2716                                  AC_FUNC_ATTR_READNONE);
2717
2718      result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2719      break;
2720   case 32:
2721      result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32, (LLVMValueRef[]){src0}, 1,
2722                                  AC_FUNC_ATTR_READNONE);
2723      break;
2724   case 16:
2725      result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16, (LLVMValueRef[]){src0}, 1,
2726                                  AC_FUNC_ATTR_READNONE);
2727
2728      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2729      break;
2730   case 8:
2731      result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8, (LLVMValueRef[]){src0}, 1,
2732                                  AC_FUNC_ATTR_READNONE);
2733
2734      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2735      break;
2736   default:
2737      unreachable(!"invalid bitsize");
2738      break;
2739   }
2740
2741   return result;
2742}
2743
2744void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
2745{
2746   LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
2747   ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1,
2748                      AC_FUNC_ATTR_CONVERGENT);
2749}
2750
/* Expose all of LDS as a single i32 array at address 0 in the LDS address
 * space; ac_lds_load/ac_lds_store index into it by dword address.
 * The array covers 64KB on GFX7+ and 32KB on older chips.
 */
void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
{
   unsigned lds_size = ctx->gfx_level >= GFX7 ? 65536 : 32768;
   ctx->lds = LLVMBuildIntToPtr(
      ctx->builder, ctx->i32_0,
      LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), "lds");
}
2758
2759LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr)
2760{
2761   return LLVMBuildLoad2(ctx->builder, ctx->i32, ac_build_gep0(ctx, ctx->lds, dw_addr), "");
2762}
2763
2764void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value)
2765{
2766   value = ac_to_integer(ctx, value);
2767   ac_build_indexed_store(ctx, ctx->lds, dw_addr, value);
2768}
2769
2770LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)
2771{
2772   unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2773   const char *intrin_name;
2774   LLVMTypeRef type;
2775   LLVMValueRef zero;
2776
2777   switch (src0_bitsize) {
2778   case 64:
2779      intrin_name = "llvm.cttz.i64";
2780      type = ctx->i64;
2781      zero = ctx->i64_0;
2782      break;
2783   case 32:
2784      intrin_name = "llvm.cttz.i32";
2785      type = ctx->i32;
2786      zero = ctx->i32_0;
2787      break;
2788   case 16:
2789      intrin_name = "llvm.cttz.i16";
2790      type = ctx->i16;
2791      zero = ctx->i16_0;
2792      break;
2793   case 8:
2794      intrin_name = "llvm.cttz.i8";
2795      type = ctx->i8;
2796      zero = ctx->i8_0;
2797      break;
2798   default:
2799      unreachable(!"invalid bitsize");
2800   }
2801
2802   LLVMValueRef params[2] = {
2803      src0,
2804
2805      /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
2806       * add special code to check for x=0. The reason is that
2807       * the LLVM behavior for x=0 is different from what we
2808       * need here. However, LLVM also assumes that ffs(x) is
2809       * in [0, 31], but GLSL expects that ffs(0) = -1, so
2810       * a conditional assignment to handle 0 is still required.
2811       *
2812       * The hardware already implements the correct behavior.
2813       */
2814      ctx->i1true,
2815   };
2816
2817   LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);
2818
2819   if (src0_bitsize == 64) {
2820      lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
2821   } else if (src0_bitsize < 32) {
2822      lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
2823   }
2824
2825   /* TODO: We need an intrinsic to skip this conditional. */
2826   /* Check for zero: */
2827   return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, zero, ""),
2828                          LLVMConstInt(ctx->i32, -1, 0), lsb, "");
2829}
2830
2831LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
2832{
2833   return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
2834}
2835
2836LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
2837{
2838   return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
2839}
2840
2841static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx)
2842{
2843   if (ctx->flow->depth > 0)
2844      return &ctx->flow->stack[ctx->flow->depth - 1];
2845   return NULL;
2846}
2847
2848static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx)
2849{
2850   for (unsigned i = ctx->flow->depth; i > 0; --i) {
2851      if (ctx->flow->stack[i - 1].loop_entry_block)
2852         return &ctx->flow->stack[i - 1];
2853   }
2854   return NULL;
2855}
2856
/* Push a fresh entry onto the control-flow stack, growing it on demand.
 * NOTE(review): the realloc result is unchecked and overwrites the old
 * pointer, so OOM crashes on the next dereference (and leaks the old
 * buffer) — apparently accepted here; confirm against project policy.
 */
static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx)
{
   struct ac_llvm_flow *flow;

   if (ctx->flow->depth >= ctx->flow->depth_max) {
      /* Double the capacity (depth == depth_max here). */
      unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH);

      ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
      ctx->flow->depth_max = new_max;
   }

   flow = &ctx->flow->stack[ctx->flow->depth];
   ctx->flow->depth++;

   /* A new entry starts as neither an if (next_block) nor a loop. */
   flow->next_block = NULL;
   flow->loop_entry_block = NULL;
   return flow;
}
2875
2876static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id)
2877{
2878   char buf[32];
2879   snprintf(buf, sizeof(buf), "%s%d", base, label_id);
2880   LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
2881}
2882
/* Append a basic block at the level of the parent flow.
 */
static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name)
{
   assert(ctx->flow->depth >= 1);

   if (ctx->flow->depth >= 2) {
      /* Nested construct: insert before the parent's merge block so
       * blocks stay in source order within the parent. */
      struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];

      return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name);
   }

   /* Outermost construct: append at the end of the current function. */
   LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
   return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
}
2898
2899/* Emit a branch to the given default target for the current block if
2900 * applicable -- that is, if the current block does not already contain a
2901 * branch from a break or continue.
2902 */
2903static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target)
2904{
2905   if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
2906      LLVMBuildBr(builder, target);
2907}
2908
2909void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
2910{
2911   struct ac_llvm_flow *flow = push_flow(ctx);
2912   flow->loop_entry_block = append_basic_block(ctx, "LOOP");
2913   flow->next_block = append_basic_block(ctx, "ENDLOOP");
2914   set_basicblock_name(flow->loop_entry_block, "loop", label_id);
2915   LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2916   LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
2917}
2918
2919void ac_build_break(struct ac_llvm_context *ctx)
2920{
2921   struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2922   LLVMBuildBr(ctx->builder, flow->next_block);
2923}
2924
2925void ac_build_continue(struct ac_llvm_context *ctx)
2926{
2927   struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2928   LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2929}
2930
/* Switch from the "then" side of the innermost if to the "else" side. */
void ac_build_else(struct ac_llvm_context *ctx, int label_id)
{
   struct ac_llvm_flow *current_branch = get_current_flow(ctx);
   LLVMBasicBlockRef endif_block;

   /* The innermost construct must be an if, not a loop. */
   assert(!current_branch->loop_entry_block);

   /* Close the "then" block with a jump to a fresh ENDIF block. */
   endif_block = append_basic_block(ctx, "ENDIF");
   emit_default_branch(ctx->builder, endif_block);

   /* The old merge block becomes the else block; emit into it. */
   LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
   set_basicblock_name(current_branch->next_block, "else", label_id);

   /* ENDIF is the new merge target for ac_build_endif. */
   current_branch->next_block = endif_block;
}
2946
/* Invoked after a branch is exited. */
static void ac_branch_exited(struct ac_llvm_context *ctx)
{
   /* Only acts at the outermost level, and only if a demote was seen
    * inside the just-closed conditional code. */
   if (ctx->flow->depth == 0 && ctx->conditional_demote_seen) {
      /* The previous conditional branch contained demote. Kill threads
       * after all conditional blocks because amdgcn.wqm.vote doesn't
       * return usable values inside the blocks.
       *
       * This is an optional optimization that only kills whole inactive quads.
       */
      LLVMValueRef cond = LLVMBuildLoad2(ctx->builder, ctx->i1, ctx->postponed_kill, "");
      ac_build_kill_if_false(ctx, ac_build_wqm_vote(ctx, cond));
      ctx->conditional_demote_seen = false;
   }
}
2962
2963void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
2964{
2965   struct ac_llvm_flow *current_branch = get_current_flow(ctx);
2966
2967   assert(!current_branch->loop_entry_block);
2968
2969   emit_default_branch(ctx->builder, current_branch->next_block);
2970   LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
2971   set_basicblock_name(current_branch->next_block, "endif", label_id);
2972
2973   ctx->flow->depth--;
2974   ac_branch_exited(ctx);
2975}
2976
2977void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
2978{
2979   struct ac_llvm_flow *current_loop = get_current_flow(ctx);
2980
2981   assert(current_loop->loop_entry_block);
2982
2983   emit_default_branch(ctx->builder, current_loop->loop_entry_block);
2984
2985   LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
2986   set_basicblock_name(current_loop->next_block, "endloop", label_id);
2987   ctx->flow->depth--;
2988   ac_branch_exited(ctx);
2989}
2990
2991void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
2992{
2993   struct ac_llvm_flow *flow = push_flow(ctx);
2994   LLVMBasicBlockRef if_block;
2995
2996   if_block = append_basic_block(ctx, "IF");
2997   flow->next_block = append_basic_block(ctx, "ELSE");
2998   set_basicblock_name(if_block, "if", label_id);
2999   LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
3000   LLVMPositionBuilderAtEnd(ctx->builder, if_block);
3001}
3002
3003LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
3004{
3005   LLVMBuilderRef builder = ac->builder;
3006   LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
3007   LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
3008   LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
3009   LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
3010   LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
3011   LLVMValueRef res;
3012
3013   if (first_instr) {
3014      LLVMPositionBuilderBefore(first_builder, first_instr);
3015   } else {
3016      LLVMPositionBuilderAtEnd(first_builder, first_block);
3017   }
3018
3019   res = LLVMBuildAlloca(first_builder, type, name);
3020   LLVMDisposeBuilder(first_builder);
3021   return res;
3022}
3023
3024LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
3025{
3026   LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
3027   LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
3028   return ptr;
3029}
3030
3031LLVMValueRef ac_build_alloca_init(struct ac_llvm_context *ac, LLVMValueRef val, const char *name)
3032{
3033   LLVMValueRef ptr = ac_build_alloca_undef(ac, LLVMTypeOf(val), name);
3034   LLVMBuildStore(ac->builder, val, ptr);
3035   return ptr;
3036}
3037
3038LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMTypeRef type)
3039{
3040   int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3041   return LLVMBuildBitCast(ctx->builder, ptr, LLVMPointerType(type, addr_space), "");
3042}
3043
3044LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count)
3045{
3046   unsigned num_components = ac_get_llvm_num_components(value);
3047   if (count == num_components)
3048      return value;
3049
3050   LLVMValueRef *const masks = alloca(MAX2(count, 2) * sizeof(LLVMValueRef));
3051   masks[0] = ctx->i32_0;
3052   masks[1] = ctx->i32_1;
3053   for (unsigned i = 2; i < count; i++)
3054      masks[i] = LLVMConstInt(ctx->i32, i, false);
3055
3056   if (count == 1)
3057      return LLVMBuildExtractElement(ctx->builder, value, masks[0], "");
3058
3059   LLVMValueRef swizzle = LLVMConstVector(masks, count);
3060   return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
3061}
3062
/* If param is i64 and bitwidth <= 32, the return value will be i32. */
LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift,
                             unsigned bitwidth)
{
   /* Extract the bitfield param[rshift + bitwidth - 1 : rshift]. */
   LLVMValueRef value = param;
   if (rshift)
      value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), rshift, false), "");

   /* Mask off high bits unless the field reaches bit 31, where the shift
    * (or the final trunc for i64 params) already isolates it. */
   if (rshift + bitwidth < 32) {
      uint64_t mask = (1ull << bitwidth) - 1;
      value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), mask, false), "");
   }

   /* Normalize small fields of an i64 param down to i32. */
   if (bitwidth <= 32 && LLVMTypeOf(param) == ctx->i64)
      value = LLVMBuildTrunc(ctx->builder, value, ctx->i32, "");
   return value;
}
3080
3081/* Adjust the sample index according to FMASK.
3082 *
3083 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3084 * which is the identity mapping. Each nibble says which physical sample
3085 * should be fetched to get that sample.
3086 *
3087 * For example, 0x11111100 means there are only 2 samples stored and
3088 * the second sample covers 3/4 of the pixel. When reading samples 0
3089 * and 1, return physical sample 0 (determined by the first two 0s
3090 * in FMASK), otherwise return physical sample 1.
3091 *
3092 * The sample index should be adjusted as follows:
3093 *   addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
3094 */
3095void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, LLVMValueRef *addr,
3096                              bool is_array_tex)
3097{
3098   struct ac_image_args fmask_load = {0};
3099   fmask_load.opcode = ac_image_load;
3100   fmask_load.resource = fmask;
3101   fmask_load.dmask = 0xf;
3102   fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
3103   fmask_load.attributes = AC_FUNC_ATTR_READNONE;
3104
3105   fmask_load.coords[0] = addr[0];
3106   fmask_load.coords[1] = addr[1];
3107   if (is_array_tex)
3108      fmask_load.coords[2] = addr[2];
3109   fmask_load.a16 = ac_get_elem_bits(ac, LLVMTypeOf(addr[0])) == 16;
3110
3111   LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
3112   fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, ac->i32_0, "");
3113
3114   /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3115    * resource descriptor is 0 (invalid).
3116    */
3117   LLVMValueRef tmp;
3118   tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
3119   tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
3120   tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
3121   fmask_value =
3122      LLVMBuildSelect(ac->builder, tmp, fmask_value, LLVMConstInt(ac->i32, 0x76543210, false), "");
3123
3124   /* Apply the formula. */
3125   unsigned sample_chan = is_array_tex ? 3 : 2;
3126   LLVMValueRef final_sample;
3127   final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
3128                               LLVMConstInt(LLVMTypeOf(addr[0]), 4, 0), "");
3129   final_sample = LLVMBuildLShr(ac->builder, fmask_value,
3130                                LLVMBuildZExt(ac->builder, final_sample, ac->i32, ""), "");
3131   /* Mask the sample index by 0x7, because 0x8 means an unknown value
3132    * with EQAA, so those will map to 0. */
3133   addr[sample_chan] = LLVMBuildAnd(ac->builder, final_sample, LLVMConstInt(ac->i32, 0x7, 0), "");
3134   if (fmask_load.a16)
3135      addr[sample_chan] = LLVMBuildTrunc(ac->builder, final_sample, ac->i16, "");
3136}
3137
/* Read one value from a single lane; lane == NULL selects the first
 * active lane (readfirstlane).
 */
static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
                                       LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef result;

   if (with_opt_barrier)
      ac_build_optimization_barrier(ctx, &src, false);

   /* The intrinsics operate on i32; widen src/lane first.
    * NOTE(review): assumes src is an integer no wider than 32 bits —
    * wider values are split by ac_build_readlane_common; confirm callers.
    */
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
   if (lane)
      lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, "");

   result =
      ac_build_intrinsic(ctx, lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
                         ctx->i32, (LLVMValueRef[]){src, lane}, lane == NULL ? 1 : 2,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   /* Narrow back to the caller's original type. */
   return LLVMBuildTrunc(ctx->builder, result, type, "");
}
3158
3159static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src,
3160                                             LLVMValueRef lane, bool with_opt_barrier)
3161{
3162   LLVMTypeRef src_type = LLVMTypeOf(src);
3163   src = ac_to_integer(ctx, src);
3164   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3165   LLVMValueRef ret;
3166
3167   if (bits > 32) {
3168      assert(bits % 32 == 0);
3169      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3170      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3171      ret = LLVMGetUndef(vec_type);
3172      for (unsigned i = 0; i < bits / 32; i++) {
3173         LLVMValueRef ret_comp;
3174
3175         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
3176
3177         ret_comp = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
3178
3179         ret =
3180            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
3181      }
3182   } else {
3183      ret = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
3184   }
3185
3186   if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
3187      return LLVMBuildIntToPtr(ctx->builder, ret, src_type, "");
3188   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3189}
3190
3191/**
3192 * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
3193 *
3194 * The optimization barrier is not needed if the value is the same in all lanes
3195 * or if this is called in the outermost block.
3196 *
3197 * @param ctx
3198 * @param src
3199 * @param lane - id of the lane or NULL for the first active lane
3200 * @return value of the lane
3201 */
3202LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, LLVMValueRef src,
3203                                              LLVMValueRef lane)
3204{
3205   return ac_build_readlane_common(ctx, src, lane, false);
3206}
3207
/* Same as ac_build_readlane_no_opt_barrier, but applies an optimization
 * barrier to src before reading the lane.
 */
LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
{
   return ac_build_readlane_common(ctx, src, lane, true);
}
3212
3213LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value,
3214                                LLVMValueRef lane)
3215{
3216   return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32,
3217                             (LLVMValueRef[]){value, lane, src}, 3,
3218                             AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3219}
3220
3221LLVMValueRef ac_build_mbcnt_add(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef add_src)
3222{
3223   LLVMValueRef val;
3224
3225   if (ctx->wave_size == 32) {
3226      val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3227                               (LLVMValueRef[]){mask, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE);
3228   } else {
3229      LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, "");
3230      LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_0, "");
3231      LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_1, "");
3232      val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3233                               (LLVMValueRef[]){mask_lo, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE);
3234      val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, (LLVMValueRef[]){mask_hi, val},
3235                               2, AC_FUNC_ATTR_READNONE);
3236   }
3237
3238   /* Bug workaround. LLVM always believes the upper bound of mbcnt to be the wave size,
3239    * regardless of ac_set_range_metadata. Use an extra add instruction to work around it.
3240    */
3241   if (add_src != NULL && add_src != ctx->i32_0) {
3242      return LLVMBuildAdd(ctx->builder, val, add_src, "");
3243   }
3244
3245   return val;
3246}
3247
/* ac_build_mbcnt_add with a zero addend. */
LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
{
   return ac_build_mbcnt_add(ctx, mask, ctx->i32_0);
}
3252
/* Encodings for the dpp_ctrl operand of DPP instructions. Entries with a
 * leading underscore are bases that carry an immediate in their low bits
 * (lane selectors or a shift amount) and are composed by the helpers below.
 */
enum dpp_ctrl
{
   _dpp_quad_perm = 0x000,    /* per-quad lane select; selectors in bits [7:0] */
   _dpp_row_sl = 0x100,       /* row shift left; amount in low bits */
   _dpp_row_sr = 0x110,       /* row shift right; amount in low bits */
   _dpp_row_rr = 0x120,       /* row rotate right; amount in low bits */
   dpp_wf_sl1 = 0x130,        /* wavefront shift left by 1 */
   dpp_wf_rl1 = 0x134,        /* wavefront rotate left by 1 */
   dpp_wf_sr1 = 0x138,        /* wavefront shift right by 1 */
   dpp_wf_rr1 = 0x13C,        /* wavefront rotate right by 1 */
   dpp_row_mirror = 0x140,
   dpp_row_half_mirror = 0x141,
   dpp_row_bcast15 = 0x142,
   dpp_row_bcast31 = 0x143
};
3268
3269static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2,
3270                                          unsigned lane3)
3271{
3272   assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
3273   return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
3274}
3275
3276static inline enum dpp_ctrl dpp_row_sr(unsigned amount)
3277{
3278   assert(amount > 0 && amount < 16);
3279   return _dpp_row_sr | amount;
3280}
3281
3282static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3283                                  enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3284                                  bool bound_ctrl)
3285{
3286   LLVMTypeRef type = LLVMTypeOf(src);
3287   LLVMValueRef res;
3288
3289   old = LLVMBuildZExt(ctx->builder, old, ctx->i32, "");
3290   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3291
3292   res = ac_build_intrinsic(
3293      ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32,
3294      (LLVMValueRef[]){old, src, LLVMConstInt(ctx->i32, dpp_ctrl, 0),
3295                       LLVMConstInt(ctx->i32, row_mask, 0), LLVMConstInt(ctx->i32, bank_mask, 0),
3296                       LLVMConstInt(ctx->i1, bound_ctrl, 0)},
3297      6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3298
3299   return LLVMBuildTrunc(ctx->builder, res, type, "");
3300}
3301
3302static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3303                                 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3304                                 bool bound_ctrl)
3305{
3306   LLVMTypeRef src_type = LLVMTypeOf(src);
3307   src = ac_to_integer(ctx, src);
3308   old = ac_to_integer(ctx, old);
3309   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3310   LLVMValueRef ret;
3311   if (bits > 32) {
3312      assert(bits % 32 == 0);
3313      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3314      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3315      LLVMValueRef old_vector = LLVMBuildBitCast(ctx->builder, old, vec_type, "");
3316      ret = LLVMGetUndef(vec_type);
3317      for (unsigned i = 0; i < bits / 32; i++) {
3318         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
3319         old = LLVMBuildExtractElement(ctx->builder, old_vector, LLVMConstInt(ctx->i32, i, 0), "");
3320         LLVMValueRef ret_comp =
3321            _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
3322         ret =
3323            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
3324      }
3325   } else {
3326      ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
3327   }
3328   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3329}
3330
3331static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src,
3332                                         uint64_t sel, bool exchange_rows, bool bound_ctrl)
3333{
3334   LLVMTypeRef type = LLVMTypeOf(src);
3335   LLVMValueRef result;
3336
3337   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3338
3339   LLVMValueRef args[6] = {
3340      src,
3341      src,
3342      LLVMConstInt(ctx->i32, sel, false),
3343      LLVMConstInt(ctx->i32, sel >> 32, false),
3344      ctx->i1true, /* fi */
3345      bound_ctrl ? ctx->i1true : ctx->i1false,
3346   };
3347
3348   result =
3349      ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16",
3350                         ctx->i32, args, 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3351
3352   return LLVMBuildTrunc(ctx->builder, result, type, "");
3353}
3354
3355static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
3356                                        bool exchange_rows, bool bound_ctrl)
3357{
3358   LLVMTypeRef src_type = LLVMTypeOf(src);
3359   src = ac_to_integer(ctx, src);
3360   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3361   LLVMValueRef ret;
3362   if (bits > 32) {
3363      assert(bits % 32 == 0);
3364      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3365      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3366      ret = LLVMGetUndef(vec_type);
3367      for (unsigned i = 0; i < bits / 32; i++) {
3368         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
3369         LLVMValueRef ret_comp = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
3370         ret =
3371            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
3372      }
3373   } else {
3374      ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
3375   }
3376   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3377}
3378
/* Pack the three 5-bit fields of a ds_swizzle "bitmode" pattern:
 * and_mask in bits [4:0], or_mask in [9:5], xor_mask in [14:10].
 */
static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
   assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);
   unsigned pattern = xor_mask;
   pattern = (pattern << 5) | or_mask;
   pattern = (pattern << 5) | and_mask;
   return pattern;
}
3384
3385static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
3386                                         unsigned mask)
3387{
3388   LLVMTypeRef src_type = LLVMTypeOf(src);
3389   LLVMValueRef ret;
3390
3391   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3392
3393   ret = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32,
3394                            (LLVMValueRef[]){src, LLVMConstInt(ctx->i32, mask, 0)}, 2,
3395                            AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3396
3397   return LLVMBuildTrunc(ctx->builder, ret, src_type, "");
3398}
3399
3400LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
3401{
3402   LLVMTypeRef src_type = LLVMTypeOf(src);
3403   src = ac_to_integer(ctx, src);
3404   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3405   LLVMValueRef ret;
3406   if (bits > 32) {
3407      assert(bits % 32 == 0);
3408      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3409      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3410      ret = LLVMGetUndef(vec_type);
3411      for (unsigned i = 0; i < bits / 32; i++) {
3412         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
3413         LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src, mask);
3414         ret =
3415            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
3416      }
3417   } else {
3418      ret = _ac_build_ds_swizzle(ctx, src, mask);
3419   }
3420   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3421}
3422
3423static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
3424{
3425   LLVMTypeRef src_type = LLVMTypeOf(src);
3426   unsigned bitsize = ac_get_elem_bits(ctx, src_type);
3427   char name[32], type[8];
3428   LLVMValueRef ret;
3429
3430   src = ac_to_integer(ctx, src);
3431
3432   if (bitsize < 32)
3433      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3434
3435   ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
3436   snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
3437   ret = ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src}, 1,
3438                            AC_FUNC_ATTR_READNONE);
3439
3440   if (bitsize < 32)
3441      ret = LLVMBuildTrunc(ctx->builder, ret, ac_to_integer_type(ctx, src_type), "");
3442
3443   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3444}
3445
3446static LLVMValueRef ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
3447                                          LLVMValueRef inactive)
3448{
3449   char name[33], type[8];
3450   LLVMTypeRef src_type = LLVMTypeOf(src);
3451   unsigned bitsize = ac_get_elem_bits(ctx, src_type);
3452   src = ac_to_integer(ctx, src);
3453   inactive = ac_to_integer(ctx, inactive);
3454
3455   if (bitsize < 32) {
3456      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3457      inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, "");
3458   }
3459
3460   ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
3461   snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
3462   LLVMValueRef ret =
3463      ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src, inactive}, 2,
3464                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3465   if (bitsize < 32)
3466      ret = LLVMBuildTrunc(ctx->builder, ret, src_type, "");
3467
3468   return ret;
3469}
3470
3471static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op,
3472                                           unsigned type_size)
3473{
3474
3475   if (type_size == 0) {
3476      switch (op) {
3477      case nir_op_ior:
3478      case nir_op_ixor:
3479         return LLVMConstInt(ctx->i1, 0, 0);
3480      case nir_op_iand:
3481         return LLVMConstInt(ctx->i1, 1, 0);
3482      default:
3483         unreachable("bad reduction intrinsic");
3484      }
3485   } else if (type_size == 1) {
3486      switch (op) {
3487      case nir_op_iadd:
3488         return ctx->i8_0;
3489      case nir_op_imul:
3490         return ctx->i8_1;
3491      case nir_op_imin:
3492         return LLVMConstInt(ctx->i8, INT8_MAX, 0);
3493      case nir_op_umin:
3494         return LLVMConstInt(ctx->i8, UINT8_MAX, 0);
3495      case nir_op_imax:
3496         return LLVMConstInt(ctx->i8, INT8_MIN, 0);
3497      case nir_op_umax:
3498         return ctx->i8_0;
3499      case nir_op_iand:
3500         return LLVMConstInt(ctx->i8, -1, 0);
3501      case nir_op_ior:
3502         return ctx->i8_0;
3503      case nir_op_ixor:
3504         return ctx->i8_0;
3505      default:
3506         unreachable("bad reduction intrinsic");
3507      }
3508   } else if (type_size == 2) {
3509      switch (op) {
3510      case nir_op_iadd:
3511         return ctx->i16_0;
3512      case nir_op_fadd:
3513         return ctx->f16_0;
3514      case nir_op_imul:
3515         return ctx->i16_1;
3516      case nir_op_fmul:
3517         return ctx->f16_1;
3518      case nir_op_imin:
3519         return LLVMConstInt(ctx->i16, INT16_MAX, 0);
3520      case nir_op_umin:
3521         return LLVMConstInt(ctx->i16, UINT16_MAX, 0);
3522      case nir_op_fmin:
3523         return LLVMConstReal(ctx->f16, INFINITY);
3524      case nir_op_imax:
3525         return LLVMConstInt(ctx->i16, INT16_MIN, 0);
3526      case nir_op_umax:
3527         return ctx->i16_0;
3528      case nir_op_fmax:
3529         return LLVMConstReal(ctx->f16, -INFINITY);
3530      case nir_op_iand:
3531         return LLVMConstInt(ctx->i16, -1, 0);
3532      case nir_op_ior:
3533         return ctx->i16_0;
3534      case nir_op_ixor:
3535         return ctx->i16_0;
3536      default:
3537         unreachable("bad reduction intrinsic");
3538      }
3539   } else if (type_size == 4) {
3540      switch (op) {
3541      case nir_op_iadd:
3542         return ctx->i32_0;
3543      case nir_op_fadd:
3544         return ctx->f32_0;
3545      case nir_op_imul:
3546         return ctx->i32_1;
3547      case nir_op_fmul:
3548         return ctx->f32_1;
3549      case nir_op_imin:
3550         return LLVMConstInt(ctx->i32, INT32_MAX, 0);
3551      case nir_op_umin:
3552         return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
3553      case nir_op_fmin:
3554         return LLVMConstReal(ctx->f32, INFINITY);
3555      case nir_op_imax:
3556         return LLVMConstInt(ctx->i32, INT32_MIN, 0);
3557      case nir_op_umax:
3558         return ctx->i32_0;
3559      case nir_op_fmax:
3560         return LLVMConstReal(ctx->f32, -INFINITY);
3561      case nir_op_iand:
3562         return LLVMConstInt(ctx->i32, -1, 0);
3563      case nir_op_ior:
3564         return ctx->i32_0;
3565      case nir_op_ixor:
3566         return ctx->i32_0;
3567      default:
3568         unreachable("bad reduction intrinsic");
3569      }
3570   } else { /* type_size == 64bit */
3571      switch (op) {
3572      case nir_op_iadd:
3573         return ctx->i64_0;
3574      case nir_op_fadd:
3575         return ctx->f64_0;
3576      case nir_op_imul:
3577         return ctx->i64_1;
3578      case nir_op_fmul:
3579         return ctx->f64_1;
3580      case nir_op_imin:
3581         return LLVMConstInt(ctx->i64, INT64_MAX, 0);
3582      case nir_op_umin:
3583         return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
3584      case nir_op_fmin:
3585         return LLVMConstReal(ctx->f64, INFINITY);
3586      case nir_op_imax:
3587         return LLVMConstInt(ctx->i64, INT64_MIN, 0);
3588      case nir_op_umax:
3589         return ctx->i64_0;
3590      case nir_op_fmax:
3591         return LLVMConstReal(ctx->f64, -INFINITY);
3592      case nir_op_iand:
3593         return LLVMConstInt(ctx->i64, -1, 0);
3594      case nir_op_ior:
3595         return ctx->i64_0;
3596      case nir_op_ixor:
3597         return ctx->i64_0;
3598      default:
3599         unreachable("bad reduction intrinsic");
3600      }
3601   }
3602}
3603
3604static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
3605                                    nir_op op)
3606{
3607   bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
3608   bool _32bit = ac_get_type_size(LLVMTypeOf(lhs)) == 4;
3609   switch (op) {
3610   case nir_op_iadd:
3611      return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
3612   case nir_op_fadd:
3613      return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
3614   case nir_op_imul:
3615      return LLVMBuildMul(ctx->builder, lhs, rhs, "");
3616   case nir_op_fmul:
3617      return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
3618   case nir_op_imin:
3619      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
3620                             lhs, rhs, "");
3621   case nir_op_umin:
3622      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
3623                             lhs, rhs, "");
3624   case nir_op_fmin:
3625      return ac_build_intrinsic(
3626         ctx, _64bit ? "llvm.minnum.f64" : _32bit ? "llvm.minnum.f32" : "llvm.minnum.f16",
3627         _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
3628         AC_FUNC_ATTR_READNONE);
3629   case nir_op_imax:
3630      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
3631                             lhs, rhs, "");
3632   case nir_op_umax:
3633      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
3634                             lhs, rhs, "");
3635   case nir_op_fmax:
3636      return ac_build_intrinsic(
3637         ctx, _64bit ? "llvm.maxnum.f64" : _32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16",
3638         _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
3639         AC_FUNC_ATTR_READNONE);
3640   case nir_op_iand:
3641      return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
3642   case nir_op_ior:
3643      return LLVMBuildOr(ctx->builder, lhs, rhs, "");
3644   case nir_op_ixor:
3645      return LLVMBuildXor(ctx->builder, lhs, rhs, "");
3646   default:
3647      unreachable("bad reduction intrinsic");
3648   }
3649}
3650
3651/**
3652 * \param src The value to shift.
3653 * \param identity The value to use the first lane.
3654 * \param maxprefix specifies that the result only needs to be correct for a
3655 *     prefix of this many threads
3656 * \return src, shifted 1 lane up, and identity shifted into lane 0.
3657 */
3658static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,
3659                                               LLVMValueRef identity, unsigned maxprefix)
3660{
3661   if (ctx->gfx_level >= GFX10) {
3662      /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
3663      LLVMValueRef active, tmp1, tmp2;
3664      LLVMValueRef tid = ac_get_thread_id(ctx);
3665
3666      tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3667
3668      tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);
3669
3670      if (maxprefix > 32) {
3671         active =
3672            LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");
3673
3674         tmp2 = LLVMBuildSelect(ctx->builder, active,
3675                                ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),
3676                                tmp2, "");
3677
3678         active = LLVMBuildOr(
3679            ctx->builder, active,
3680            LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3681                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),
3682                          LLVMConstInt(ctx->i32, 0x10, false), ""),
3683            "");
3684         return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3685      } else if (maxprefix > 16) {
3686         active =
3687            LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");
3688
3689         return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3690      }
3691   } else if (ctx->gfx_level >= GFX8) {
3692      return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
3693   }
3694
3695   /* wavefront shift_right by 1 on SI/CI */
3696   LLVMValueRef active, tmp1, tmp2;
3697   LLVMValueRef tid = ac_get_thread_id(ctx);
3698   tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
3699   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
3700   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3701                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
3702                          LLVMConstInt(ctx->i32, 0x4, 0), "");
3703   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3704   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
3705   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3706                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
3707                          LLVMConstInt(ctx->i32, 0x8, 0), "");
3708   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3709   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3710   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3711                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
3712                          LLVMConstInt(ctx->i32, 0x10, 0), "");
3713   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3714   tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
3715   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
3716   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3717   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), "");
3718   return LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
3719}
3720
3721/**
3722 * \param maxprefix specifies that the result only needs to be correct for a
3723 *     prefix of this many threads
3724 */
3725static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src,
3726                                  LLVMValueRef identity, unsigned maxprefix, bool inclusive)
3727{
3728   LLVMValueRef result, tmp;
3729
3730   if (!inclusive)
3731      src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix);
3732
3733   result = src;
3734
3735   if (ctx->gfx_level <= GFX7) {
3736      assert(maxprefix == 64);
3737      LLVMValueRef tid = ac_get_thread_id(ctx);
3738      LLVMValueRef active;
3739      tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
3740      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3741                             LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, "");
3742      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3743      result = ac_build_alu_op(ctx, result, tmp, op);
3744      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
3745      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3746                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
3747                             ctx->i32_0, "");
3748      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3749      result = ac_build_alu_op(ctx, result, tmp, op);
3750      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
3751      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3752                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
3753                             ctx->i32_0, "");
3754      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3755      result = ac_build_alu_op(ctx, result, tmp, op);
3756      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
3757      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3758                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
3759                             ctx->i32_0, "");
3760      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3761      result = ac_build_alu_op(ctx, result, tmp, op);
3762      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3763      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3764                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
3765                             ctx->i32_0, "");
3766      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3767      result = ac_build_alu_op(ctx, result, tmp, op);
3768      tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
3769      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3770                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
3771                             ctx->i32_0, "");
3772      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3773      result = ac_build_alu_op(ctx, result, tmp, op);
3774      return result;
3775   }
3776
3777   if (maxprefix <= 1)
3778      return result;
3779   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3780   result = ac_build_alu_op(ctx, result, tmp, op);
3781   if (maxprefix <= 2)
3782      return result;
3783   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
3784   result = ac_build_alu_op(ctx, result, tmp, op);
3785   if (maxprefix <= 3)
3786      return result;
3787   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
3788   result = ac_build_alu_op(ctx, result, tmp, op);
3789   if (maxprefix <= 4)
3790      return result;
3791   tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
3792   result = ac_build_alu_op(ctx, result, tmp, op);
3793   if (maxprefix <= 8)
3794      return result;
3795   tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
3796   result = ac_build_alu_op(ctx, result, tmp, op);
3797   if (maxprefix <= 16)
3798      return result;
3799
3800   if (ctx->gfx_level >= GFX10) {
3801      LLVMValueRef tid = ac_get_thread_id(ctx);
3802      LLVMValueRef active;
3803
3804      tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);
3805
3806      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3807                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""),
3808                             ctx->i32_0, "");
3809
3810      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3811
3812      result = ac_build_alu_op(ctx, result, tmp, op);
3813
3814      if (maxprefix <= 32)
3815         return result;
3816
3817      tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
3818
3819      active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), "");
3820
3821      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3822
3823      result = ac_build_alu_op(ctx, result, tmp, op);
3824      return result;
3825   }
3826
3827   tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
3828   result = ac_build_alu_op(ctx, result, tmp, op);
3829   if (maxprefix <= 32)
3830      return result;
3831   tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
3832   result = ac_build_alu_op(ctx, result, tmp, op);
3833   return result;
3834}
3835
3836LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
3837{
3838   LLVMValueRef result;
3839
3840   if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
3841      LLVMBuilderRef builder = ctx->builder;
3842      src = LLVMBuildZExt(builder, src, ctx->i32, "");
3843      result = ac_build_ballot(ctx, src);
3844      result = ac_build_mbcnt(ctx, result);
3845      result = LLVMBuildAdd(builder, result, src, "");
3846      return result;
3847   }
3848
3849   ac_build_optimization_barrier(ctx, &src, false);
3850
3851   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
3852   result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
3853                             LLVMTypeOf(identity), "");
3854   result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true);
3855
3856   return ac_build_wwm(ctx, result);
3857}
3858
3859LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
3860{
3861   LLVMValueRef result;
3862
3863   if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
3864      LLVMBuilderRef builder = ctx->builder;
3865      src = LLVMBuildZExt(builder, src, ctx->i32, "");
3866      result = ac_build_ballot(ctx, src);
3867      result = ac_build_mbcnt(ctx, result);
3868      return result;
3869   }
3870
3871   ac_build_optimization_barrier(ctx, &src, false);
3872
3873   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
3874   result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
3875                             LLVMTypeOf(identity), "");
3876   result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false);
3877
3878   return ac_build_wwm(ctx, result);
3879}
3880
/* Reduce `src` with `op` across clusters of `cluster_size` lanes.
 * Supported cluster sizes are the powers of two handled by the early-return
 * chain below (2..32); any other value reduces across the whole wave.
 */
LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
                             unsigned cluster_size)
{
   /* Trivial cluster: nothing to combine. */
   if (cluster_size == 1)
      return src;
   ac_build_optimization_barrier(ctx, &src, false);
   LLVMValueRef result, swap;
   /* Neutral element of `op`: fills inactive lanes and serves as the bound
    * input of the DPP operations below. */
   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                             LLVMTypeOf(identity), "");
   /* Combine each lane with its neighbor (swap within lane pairs). */
   swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 2)
      return ac_build_wwm(ctx, result);

   /* Combine the two pairs of each quad. */
   swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 4)
      return ac_build_wwm(ctx, result);

   /* Combine the two halves of each group of 8 lanes. */
   if (ctx->gfx_level >= GFX8)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 8)
      return ac_build_wwm(ctx, result);

   /* Combine the two halves of each group of 16 lanes. */
   if (ctx->gfx_level >= GFX8)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 16)
      return ac_build_wwm(ctx, result);

   /* Combine the two halves of each group of 32 lanes; the instruction used
    * depends on the chip generation and on whether more steps follow. */
   if (ctx->gfx_level >= GFX10)
      swap = ac_build_permlane16(ctx, result, 0, true, false);
   else if (ctx->gfx_level >= GFX8 && cluster_size != 32)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 32)
      return ac_build_wwm(ctx, result);

   /* Full-wave reduce: combine the two 32-lane halves of a 64-lane wave. */
   if (ctx->gfx_level >= GFX8) {
      if (ctx->wave_size == 64) {
         if (ctx->gfx_level >= GFX10)
            swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
         else
            swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
         result = ac_build_alu_op(ctx, result, swap, op);
         /* Broadcast the final value from the last lane. */
         result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
      }

      return ac_build_wwm(ctx, result);
   } else {
      /* GFX6-7: read the two partial results from lanes 0 and 32 and
       * combine them. */
      swap = ac_build_readlane(ctx, result, ctx->i32_0);
      result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
      result = ac_build_alu_op(ctx, result, swap, op);
      return ac_build_wwm(ctx, result);
   }
}
3945
3946/**
3947 * "Top half" of a scan that reduces per-wave values across an entire
3948 * workgroup.
3949 *
3950 * The source value must be present in the highest lane of the wave, and the
3951 * highest lane must be live.
3952 */
3953void ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
3954{
3955   if (ws->maxwaves <= 1)
3956      return;
3957
3958   const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false);
3959   LLVMBuilderRef builder = ctx->builder;
3960   LLVMValueRef tid = ac_get_thread_id(ctx);
3961   LLVMValueRef tmp;
3962
3963   tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, "");
3964   ac_build_ifcc(ctx, tmp, 1000);
3965   LLVMBuildStore(builder, ws->src,
3966                  LLVMBuildGEP2(builder, LLVMTypeOf(ws->src), ws->scratch, &ws->waveidx, 1, ""));
3967   ac_build_endif(ctx, 1000);
3968}
3969
3970/**
3971 * "Bottom half" of a scan that reduces per-wave values across an entire
3972 * workgroup.
3973 *
3974 * The caller must place a barrier between the top and bottom halves.
3975 */
3976void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
3977{
3978   const LLVMTypeRef type = LLVMTypeOf(ws->src);
3979   const LLVMValueRef identity = get_reduction_identity(ctx, ws->op, ac_get_type_size(type));
3980
3981   if (ws->maxwaves <= 1) {
3982      ws->result_reduce = ws->src;
3983      ws->result_inclusive = ws->src;
3984      ws->result_exclusive = identity;
3985      return;
3986   }
3987   assert(ws->maxwaves <= 32);
3988
3989   LLVMBuilderRef builder = ctx->builder;
3990   LLVMValueRef tid = ac_get_thread_id(ctx);
3991   LLVMBasicBlockRef bbs[2];
3992   LLVMValueRef phivalues_scan[2];
3993   LLVMValueRef tmp, tmp2;
3994
3995   bbs[0] = LLVMGetInsertBlock(builder);
3996   phivalues_scan[0] = LLVMGetUndef(type);
3997
3998   if (ws->enable_reduce)
3999      tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
4000   else if (ws->enable_inclusive)
4001      tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
4002   else
4003      tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
4004   ac_build_ifcc(ctx, tmp, 1001);
4005   {
4006      tmp = LLVMBuildLoad2(builder, LLVMTypeOf(ws->src),
4007                          LLVMBuildGEP2(builder, LLVMTypeOf(ws->src), ws->scratch, &tid, 1, ""), "");
4008
4009      ac_build_optimization_barrier(ctx, &tmp, false);
4010
4011      bbs[1] = LLVMGetInsertBlock(builder);
4012      phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true);
4013   }
4014   ac_build_endif(ctx, 1001);
4015
4016   const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);
4017
4018   if (ws->enable_reduce) {
4019      tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
4020      ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
4021   }
4022   if (ws->enable_inclusive)
4023      ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
4024   if (ws->enable_exclusive) {
4025      tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
4026      tmp = ac_build_readlane(ctx, scan, tmp);
4027      tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
4028      ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
4029   }
4030}
4031
4032/**
4033 * Inclusive scan of a per-wave value across an entire workgroup.
4034 *
4035 * This implies an s_barrier instruction.
4036 *
4037 * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
4038 * of the workgroup are live. (This requirement cannot easily be relaxed in a
4039 * useful manner because of the barrier in the algorithm.)
4040 */
4041void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4042{
4043   ac_build_wg_wavescan_top(ctx, ws);
4044   ac_build_waitcnt(ctx, AC_WAIT_LGKM);
4045   ac_build_s_barrier(ctx, ws->stage);
4046   ac_build_wg_wavescan_bottom(ctx, ws);
4047}
4048
4049/**
4050 * "Top half" of a scan that reduces per-thread values across an entire
4051 * workgroup.
4052 *
4053 * All lanes must be active when this code runs.
4054 */
4055void ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4056{
4057   if (ws->enable_exclusive) {
4058      ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
4059      if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
4060         ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
4061      ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
4062   } else {
4063      ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
4064   }
4065
4066   bool enable_inclusive = ws->enable_inclusive;
4067   bool enable_exclusive = ws->enable_exclusive;
4068   ws->enable_inclusive = false;
4069   ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4070   ac_build_wg_wavescan_top(ctx, ws);
4071   ws->enable_inclusive = enable_inclusive;
4072   ws->enable_exclusive = enable_exclusive;
4073}
4074
4075/**
4076 * "Bottom half" of a scan that reduces per-thread values across an entire
4077 * workgroup.
4078 *
4079 * The caller must place a barrier between the top and bottom halves.
4080 */
4081void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4082{
4083   bool enable_inclusive = ws->enable_inclusive;
4084   bool enable_exclusive = ws->enable_exclusive;
4085   ws->enable_inclusive = false;
4086   ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4087   ac_build_wg_wavescan_bottom(ctx, ws);
4088   ws->enable_inclusive = enable_inclusive;
4089   ws->enable_exclusive = enable_exclusive;
4090
4091   /* ws->result_reduce is already the correct value */
4092   if (ws->enable_inclusive)
4093      ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);
4094   if (ws->enable_exclusive)
4095      ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
4096}
4097
4098/**
4099 * A scan that reduces per-thread values across an entire workgroup.
4100 *
4101 * The caller must ensure that all lanes are active when this code runs
4102 * (WWM is insufficient!), because there is an implied barrier.
4103 */
4104void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4105{
4106   ac_build_wg_scan_top(ctx, ws);
4107   ac_build_waitcnt(ctx, AC_WAIT_LGKM);
4108   ac_build_s_barrier(ctx, ws->stage);
4109   ac_build_wg_scan_bottom(ctx, ws);
4110}
4111
4112static void _ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx,
4113                                             LLVMValueRef *arg0, LLVMValueRef *arg1)
4114{
4115   LLVMValueRef tid;
4116   LLVMValueRef src0, src1;
4117   LLVMValueRef tmp0;
4118   LLVMValueRef params[2];
4119   LLVMValueRef is_even;
4120
4121   src0 = LLVMBuildBitCast(ctx->builder, *arg0, ctx->i32, "");
4122   src1 = LLVMBuildBitCast(ctx->builder, *arg1, ctx->i32, "");
4123
4124   /* swap odd,even lanes of arg_0*/
4125   params[0] = src0;
4126   params[1] = LLVMConstInt(ctx->i32, 0xde54c1, 0);
4127   src0 = ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32",
4128                             ctx->i32, params, 2, AC_FUNC_ATTR_CONVERGENT);
4129
4130   /* swap even lanes between arg_0 and arg_1 */
4131   tid = ac_get_thread_id(ctx);
4132   is_even = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
4133                           LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""),
4134                           ctx->i32_0, "");
4135   tmp0 = src0;
4136   src0 = LLVMBuildSelect(ctx->builder, is_even, src1, src0, "");
4137   src1 = LLVMBuildSelect(ctx->builder, is_even, tmp0, src1, "");
4138
4139   /* swap odd,even lanes again for arg_0*/
4140   params[0] = src0;
4141   params[1] = LLVMConstInt(ctx->i32, 0xde54c1, 0);
4142   src0 = ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32",
4143                             ctx->i32, params, 2, AC_FUNC_ATTR_CONVERGENT);
4144
4145   *arg0 = src0;
4146   *arg1 = src1;
4147}
4148
4149void ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx,
4150                                     struct ac_export_args *mrt0,
4151                                     struct ac_export_args *mrt1)
4152{
4153   assert(ctx->gfx_level >= GFX11);
4154   assert(mrt0->enabled_channels == mrt1->enabled_channels);
4155
4156   for (int i = 0; i < 4; i++) {
4157      if (mrt0->enabled_channels & (1 << i) && mrt1->enabled_channels & (1 << i))
4158         _ac_build_dual_src_blend_swizzle(ctx, &mrt0->out[i], &mrt1->out[i]);
4159   }
4160}
4161
4162LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0,
4163                                   unsigned lane1, unsigned lane2, unsigned lane3)
4164{
4165   unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
4166   if (ctx->gfx_level >= GFX8) {
4167      return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
4168   } else {
4169      return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
4170   }
4171}
4172
4173LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
4174{
4175   LLVMTypeRef type = LLVMTypeOf(src);
4176   LLVMValueRef result;
4177
4178   index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4179   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
4180
4181   result =
4182      ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32, (LLVMValueRef[]){index, src}, 2,
4183                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
4184   return LLVMBuildTrunc(ctx->builder, result, type, "");
4185}
4186
4187LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
4188{
4189   LLVMTypeRef type;
4190   char *intr;
4191
4192   if (bitsize == 16) {
4193      intr = "llvm.amdgcn.frexp.exp.i16.f16";
4194      type = ctx->i16;
4195   } else if (bitsize == 32) {
4196      intr = "llvm.amdgcn.frexp.exp.i32.f32";
4197      type = ctx->i32;
4198   } else {
4199      intr = "llvm.amdgcn.frexp.exp.i32.f64";
4200      type = ctx->i32;
4201   }
4202
4203   LLVMValueRef params[] = {
4204      src0,
4205   };
4206   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
4207}
4208LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
4209{
4210   LLVMTypeRef type;
4211   char *intr;
4212
4213   if (bitsize == 16) {
4214      intr = "llvm.amdgcn.frexp.mant.f16";
4215      type = ctx->f16;
4216   } else if (bitsize == 32) {
4217      intr = "llvm.amdgcn.frexp.mant.f32";
4218      type = ctx->f32;
4219   } else {
4220      intr = "llvm.amdgcn.frexp.mant.f64";
4221      type = ctx->f64;
4222   }
4223
4224   LLVMValueRef params[] = {
4225      src0,
4226   };
4227   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
4228}
4229
4230LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
4231{
4232   LLVMTypeRef type;
4233   char *intr;
4234
4235   if (bitsize == 16) {
4236      intr = "llvm.canonicalize.f16";
4237      type = ctx->f16;
4238   } else if (bitsize == 32) {
4239      intr = "llvm.canonicalize.f32";
4240      type = ctx->f32;
4241   } else {
4242      intr = "llvm.canonicalize.f64";
4243      type = ctx->f64;
4244   }
4245
4246   LLVMValueRef params[] = {
4247      src0,
4248   };
4249   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
4250}
4251
4252/*
4253 * this takes an I,J coordinate pair,
4254 * and works out the X and Y derivatives.
4255 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
4256 */
4257LLVMValueRef ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
4258{
4259   LLVMValueRef result[4], a;
4260   unsigned i;
4261
4262   for (i = 0; i < 2; i++) {
4263      a = LLVMBuildExtractElement(ctx->builder, interp_ij, LLVMConstInt(ctx->i32, i, false), "");
4264      result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a);
4265      result[2 + i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a);
4266   }
4267   return ac_build_gather_values(ctx, result, 4);
4268}
4269
/* Return an i1 that is true when the current invocation is a helper. */
LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
{
   LLVMValueRef result;

   /* Both intrinsics return whether the invocation is live (non-helper),
    * so the result is negated below. live.mask is only available on
    * LLVM 13 and newer. */
   if (LLVM_VERSION_MAJOR >= 13) {
      result = ac_build_intrinsic(ctx, "llvm.amdgcn.live.mask", ctx->i1, NULL, 0,
                                  AC_FUNC_ATTR_READONLY | AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
   } else {
      result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0,
                                  AC_FUNC_ATTR_READNONE);
   }
   return LLVMBuildNot(ctx->builder, result, "");
}
4283
/* Like ac_build_load_helper_invocation, but also folds in the pre-LLVM-13
 * postponed-kill state when present.
 *
 * NOTE(review): ctx->postponed_kill appears to hold a per-thread "still
 * live" flag, so a thread whose kill was postponed reports as a helper —
 * confirm against the discard/kill lowering.
 */
LLVMValueRef ac_build_is_helper_invocation(struct ac_llvm_context *ctx)
{
   if (!ctx->postponed_kill)
      return ac_build_load_helper_invocation(ctx);

   /* postponed_kill should be NULL on LLVM 13+ */
   assert(LLVM_VERSION_MAJOR < 13);

   /* !(exact && postponed) */
   LLVMValueRef exact =
      ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, AC_FUNC_ATTR_READNONE);

   LLVMValueRef postponed = LLVMBuildLoad2(ctx->builder, ctx->i1, ctx->postponed_kill, "");
   return LLVMBuildNot(ctx->builder, LLVMBuildAnd(ctx->builder, exact, postponed, ""), "");
}
4299
/* Emit a call to `func` using the callee's own calling convention.
 *
 * NOTE(review): LLVMBuildCall is deprecated in favor of LLVMBuildCall2
 * (which takes the function type explicitly); migrating requires threading
 * the type through this interface.
 */
LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, LLVMValueRef *args,
                           unsigned num_args)
{
   LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, "");
   LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func));
   return ret;
}
4307
/* Fill *args with an MRTZ export for the given depth/stencil/samplemask/
 * alpha outputs. Each output may be NULL; at least one of depth, stencil,
 * or samplemask is required. `is_last` marks the shader's final export
 * (sets the VM and DONE bits).
 */
void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
                     LLVMValueRef samplemask, LLVMValueRef mrt0_alpha, bool is_last,
                     struct ac_export_args *args)
{
   unsigned mask = 0;
   /* The Z export format depends on which outputs are present. */
   unsigned format = ac_get_spi_shader_z_format(depth != NULL, stencil != NULL, samplemask != NULL,
                                                mrt0_alpha != NULL);

   assert(depth || stencil || samplemask);

   memset(args, 0, sizeof(*args));

   if (is_last) {
      args->valid_mask = 1; /* whether the EXEC mask is valid */
      args->done = 1;       /* DONE bit */
   }

   /* Specify the target we are exporting */
   args->target = V_008DFC_SQ_EXP_MRTZ;

   args->compr = 0;                       /* COMP flag */
   args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */
   args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */
   args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */
   args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */

   if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
      /* 16-bit export: stencil and samplemask are packed into X and Y. */
      assert(!depth);
      args->compr = ctx->gfx_level < GFX11; /* COMPR flag */

      if (stencil) {
         /* Stencil should be in X[23:16]. */
         stencil = ac_to_integer(ctx, stencil);
         stencil = LLVMBuildShl(ctx->builder, stencil, LLVMConstInt(ctx->i32, 16, 0), "");
         args->out[0] = ac_to_float(ctx, stencil);
         /* GFX11 uses per-channel enable bits; older chips use pairs for
          * the compressed export. */
         mask |= ctx->gfx_level >= GFX11 ? 0x1 : 0x3;
      }
      if (samplemask) {
         /* SampleMask should be in Y[15:0]. */
         args->out[1] = samplemask;
         mask |= ctx->gfx_level >= GFX11 ? 0x2 : 0xc;
      }
   } else {
      /* 32-bit export: one output per channel. */
      if (depth) {
         args->out[0] = depth;
         mask |= 0x1;
      }
      if (stencil) {
         args->out[1] = stencil;
         mask |= 0x2;
      }
      if (samplemask) {
         args->out[2] = samplemask;
         mask |= 0x4;
      }
      if (mrt0_alpha) {
         args->out[3] = mrt0_alpha;
         mask |= 0x8;
      }
   }

   /* GFX6 (except OLAND and HAINAN) has a bug that it only looks
    * at the X writemask component. */
   if (ctx->gfx_level == GFX6 && ctx->family != CHIP_OLAND && ctx->family != CHIP_HAINAN)
      mask |= 0x1;

   /* Specify which components to enable */
   args->enabled_channels = mask;
}
4377
/* Send GS Alloc Req message from the first wave of the group to SPI.
 * Message payload is:
 * - bits 0..10: vertices in group
 * - bits 12..22: primitives in group
 */
void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id,
                                   LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef tmp;
   bool export_dummy_prim = false;

   /* HW workaround for a GPU hang with 100% culling.
    * We always have to export at least 1 primitive.
    * Export a degenerate triangle using vertex 0 for all 3 vertices.
    */
   if (prim_cnt == ctx->i32_0 && ctx->gfx_level == GFX10) {
      assert(vtx_cnt == ctx->i32_0);
      prim_cnt = ctx->i32_1;
      vtx_cnt = ctx->i32_1;
      export_dummy_prim = true;
   }

   /* Only wave 0 sends the message (wave_id == NULL means the caller already
    * guarantees we are in the first wave). */
   if (wave_id)
      ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020);

   /* Pack the payload: prim count in bits 12..22, vertex count in 0..10. */
   tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->i32, 12, false), "");
   tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
   ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp);

   if (export_dummy_prim) {
      struct ac_ngg_prim prim = {0};
      /* The vertex indices are 0,0,0. */
      prim.passthrough = ctx->i32_0;

      struct ac_export_args pos = {0};
      /* The hw culls primitives with NaN. */
      pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = LLVMConstReal(ctx->f32, NAN);
      pos.target = V_008DFC_SQ_EXP_POS;
      pos.enabled_channels = 0xf;
      pos.done = true;

      /* Only one lane of the first wave exports the dummy primitive. */
      ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), ctx->i32_0, ""),
                    5021);
      ac_build_export_prim(ctx, &prim);
      ac_build_export(ctx, &pos);
      ac_build_endif(ctx, 5021);
   }

   if (wave_id)
      ac_build_endif(ctx, 5020);
}
4430
4431
4432LLVMValueRef ac_pack_edgeflags_for_export(struct ac_llvm_context *ctx,
4433                                          const struct ac_shader_args *args)
4434{
4435   /* Use the following trick to extract the edge flags:
4436    *   extracted = v_and_b32 gs_invocation_id, 0x700 ; get edge flags at bits 8, 9, 10
4437    *   shifted = v_mul_u32_u24 extracted, 0x80402u   ; shift the bits: 8->9, 9->19, 10->29
4438    *   result = v_and_b32 shifted, 0x20080200        ; remove garbage
4439    */
4440   LLVMValueRef tmp = LLVMBuildAnd(ctx->builder,
4441                                   ac_get_arg(ctx, args->gs_invocation_id),
4442                                   LLVMConstInt(ctx->i32, 0x700, 0), "");
4443   tmp = LLVMBuildMul(ctx->builder, tmp, LLVMConstInt(ctx->i32, 0x80402u, 0), "");
4444   return LLVMBuildAnd(ctx->builder, tmp, LLVMConstInt(ctx->i32, 0x20080200, 0), "");
4445}
4446
4447LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
4448{
4449   /* The prim export format is:
4450    *  - bits 0..8: index 0
4451    *  - bit 9: edge flag 0
4452    *  - bits 10..18: index 1
4453    *  - bit 19: edge flag 1
4454    *  - bits 20..28: index 2
4455    *  - bit 29: edge flag 2
4456    *  - bit 31: null primitive (skip)
4457    */
4458   LLVMBuilderRef builder = ctx->builder;
4459   LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, "");
4460   LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), "");
4461   result = LLVMBuildOr(ctx->builder, result, prim->edgeflags, "");
4462
4463   for (unsigned i = 0; i < prim->num_vertices; ++i) {
4464      tmp = LLVMBuildShl(builder, prim->index[i], LLVMConstInt(ctx->i32, 10 * i, false), "");
4465      result = LLVMBuildOr(builder, result, tmp, "");
4466   }
4467   return result;
4468}
4469
4470void ac_build_export_prim(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
4471{
4472   struct ac_export_args args;
4473
4474   if (prim->passthrough) {
4475      args.out[0] = prim->passthrough;
4476   } else {
4477      args.out[0] = ac_pack_prim_export(ctx, prim);
4478   }
4479
4480   args.out[0] = LLVMBuildBitCast(ctx->builder, args.out[0], ctx->f32, "");
4481   args.out[1] = LLVMGetUndef(ctx->f32);
4482   args.out[2] = LLVMGetUndef(ctx->f32);
4483   args.out[3] = LLVMGetUndef(ctx->f32);
4484
4485   args.target = V_008DFC_SQ_EXP_PRIM;
4486   args.enabled_channels = 1;
4487   args.done = true;
4488   args.valid_mask = false;
4489   args.compr = false;
4490
4491   ac_build_export(ctx, &args);
4492}
4493
4494static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx)
4495{
4496   if (type == AC_ARG_FLOAT) {
4497      return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size);
4498   } else if (type == AC_ARG_INT) {
4499      return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size);
4500   } else {
4501      LLVMTypeRef ptr_type;
4502      switch (type) {
4503      case AC_ARG_CONST_PTR:
4504         ptr_type = ctx->i8;
4505         break;
4506      case AC_ARG_CONST_FLOAT_PTR:
4507         ptr_type = ctx->f32;
4508         break;
4509      case AC_ARG_CONST_PTR_PTR:
4510         ptr_type = ac_array_in_const32_addr_space(ctx->i8);
4511         break;
4512      case AC_ARG_CONST_DESC_PTR:
4513         ptr_type = ctx->v4i32;
4514         break;
4515      case AC_ARG_CONST_IMAGE_PTR:
4516         ptr_type = ctx->v8i32;
4517         break;
4518      default:
4519         unreachable("unknown arg type");
4520      }
4521      if (size == 1) {
4522         return ac_array_in_const32_addr_space(ptr_type);
4523      } else {
4524         assert(size == 2);
4525         return ac_array_in_const_addr_space(ptr_type);
4526      }
4527   }
4528}
4529
/* Create the shader's main LLVM function and position the builder at the
 * start of its body.
 *
 * The parameter list is derived from `args`. SGPR arguments are marked
 * inreg; SGPR pointer arguments additionally get noalias, dereferenceable
 * and alignment attributes. The function is also stored in
 * ctx->main_function and returned.
 */
LLVMValueRef ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx,
                           enum ac_llvm_calling_convention convention, const char *name,
                           LLVMTypeRef ret_type, LLVMModuleRef module)
{
   LLVMTypeRef arg_types[AC_MAX_ARGS];

   for (unsigned i = 0; i < args->arg_count; i++) {
      arg_types[i] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx);
   }

   LLVMTypeRef main_function_type = LLVMFunctionType(ret_type, arg_types, args->arg_count, 0);

   LLVMValueRef main_function = LLVMAddFunction(module, name, main_function_type);
   LLVMBasicBlockRef main_function_body =
      LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body");
   LLVMPositionBuilderAtEnd(ctx->builder, main_function_body);

   LLVMSetFunctionCallConv(main_function, convention);
   for (unsigned i = 0; i < args->arg_count; ++i) {
      LLVMValueRef P = LLVMGetParam(main_function, i);

      /* Only SGPR arguments get attributes here. */
      if (args->args[i].file != AC_ARG_SGPR)
         continue;

      /* LLVM attribute indices are 1-based (0 is the return value). */
      ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_INREG);

      if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
         ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_NOALIAS);
         ac_add_attr_dereferenceable(P, UINT64_MAX);
         ac_add_attr_alignment(P, 4);
      }
   }

   ctx->main_function = main_function;

   /* Enable denormals for FP16 and FP64: */
   LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", "ieee,ieee");
   /* Disable denormals for FP32: */
   LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32",
                                      "preserve-sign,preserve-sign");
   return main_function;
}
4572
4573void ac_build_s_endpgm(struct ac_llvm_context *ctx)
4574{
4575   LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
4576   LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false);
4577   LLVMBuildCall2(ctx->builder, calltype, code, NULL, 0, "");
4578}
4579
4580/**
4581 * Convert triangle strip indices to triangle indices. This is used to decompose
4582 * triangle strips into triangles.
4583 */
4584void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, LLVMValueRef is_odd,
4585                                                 LLVMValueRef flatshade_first,
4586                                                 LLVMValueRef index[3])
4587{
4588   LLVMBuilderRef builder = ctx->builder;
4589   LLVMValueRef out[3];
4590
4591   /* We need to change the vertex order for odd triangles to get correct
4592    * front/back facing by swapping 2 vertex indices, but we also have to
4593    * keep the provoking vertex in the same place.
4594    *
4595    * If the first vertex is provoking, swap index 1 and 2.
4596    * If the last vertex is provoking, swap index 0 and 1.
4597    */
4598   out[0] = LLVMBuildSelect(builder, flatshade_first, index[0],
4599                            LLVMBuildSelect(builder, is_odd, index[1], index[0], ""), "");
4600   out[1] = LLVMBuildSelect(builder, flatshade_first,
4601                            LLVMBuildSelect(builder, is_odd, index[2], index[1], ""),
4602                            LLVMBuildSelect(builder, is_odd, index[0], index[1], ""), "");
4603   out[2] = LLVMBuildSelect(builder, flatshade_first,
4604                            LLVMBuildSelect(builder, is_odd, index[1], index[2], ""), index[2], "");
4605   memcpy(index, out, sizeof(out));
4606}
4607
4608LLVMValueRef ac_build_is_inf_or_nan(struct ac_llvm_context *ctx, LLVMValueRef a)
4609{
4610   LLVMValueRef args[2] = {
4611      a,
4612      LLVMConstInt(ctx->i32, S_NAN | Q_NAN | N_INFINITY | P_INFINITY, 0),
4613   };
4614   return ac_build_intrinsic(ctx, "llvm.amdgcn.class.f32", ctx->i1, args, 2,
4615                             AC_FUNC_ATTR_READNONE);
4616}
4617