/*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */
/* based on pieces from si_pipe.c and radeon_llvm_emit.c */
#include "ac_llvm_build.h"

#include "ac_nir.h"
#include "ac_llvm_util.h"
#include "ac_shader_util.h"
#include "c11/threads.h"
#include "shader_enums.h"
#include "sid.h"
#include "util/bitscan.h"
#include "util/macros.h"
#include "util/u_atomic.h"
#include "util/u_math.h"
#include <llvm-c/Core.h>
#include <llvm/Config/llvm-config.h>

#include <assert.h>
#include <stdio.h>

#define AC_LLVM_INITIAL_CF_DEPTH 4

/* Data for if/else/endif and bgnloop/endloop control flow structures.
 */
struct ac_llvm_flow {
   /* Loop exit or next part of if/else/endif. */
   LLVMBasicBlockRef next_block;
   LLVMBasicBlockRef loop_entry_block;
};

/* Initialize the context. This creates the LLVM context along with the
 * module and builder that are used for all subsequent IR emission.
 */
void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,
                          enum amd_gfx_level gfx_level, enum radeon_family family,
                          bool has_3d_cube_border_color_mipmap,
                          enum ac_float_mode float_mode, unsigned wave_size,
                          unsigned ballot_mask_bits)
{
   ctx->context = LLVMContextCreate();
#if LLVM_VERSION_MAJOR >= 15
   LLVMContextSetOpaquePointers(ctx->context, false);
#endif

   ctx->gfx_level = gfx_level;
   ctx->family = family;
   ctx->has_3d_cube_border_color_mipmap = has_3d_cube_border_color_mipmap;
   ctx->wave_size = wave_size;
   ctx->ballot_mask_bits = ballot_mask_bits;
   ctx->float_mode = float_mode;
   ctx->module = ac_create_module(compiler->tm, ctx->context);
   ctx->builder = ac_create_builder(ctx->context, float_mode);

   ctx->voidt = LLVMVoidTypeInContext(ctx->context);
   ctx->i1 = LLVMInt1TypeInContext(ctx->context);
   ctx->i8 = LLVMInt8TypeInContext(ctx->context);
   ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
   ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
   ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
   ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);
   ctx->intptr = ctx->i32;
   ctx->f16 = LLVMHalfTypeInContext(ctx->context);
   ctx->f32 = LLVMFloatTypeInContext(ctx->context);
   ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
   ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
   ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
   ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
   ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
   ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
   ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
   ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
   ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
   ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
   ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
   ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
   ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
   ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);

   ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
   ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
   ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
   ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
   ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
   ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
   ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
   ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
   ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);
   ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);
   ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
   ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
   ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
   ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
   ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
   ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);

   ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
   ctx->i1true = LLVMConstInt(ctx->i1, 1, false);

   ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);

   ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);

   ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);

   ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
   ctx->flow = calloc(1, sizeof(*ctx->flow));
}

void ac_llvm_context_dispose(struct ac_llvm_context *ctx)
{
   free(ctx->flow->stack);
   free(ctx->flow);
   ctx->flow = NULL;
}
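/* Usage sketch (illustrative only; the argument values are placeholders):
 * a typical context lifetime, assuming an already-created ac_llvm_compiler:
 *
 *    struct ac_llvm_context ctx;
 *    ac_llvm_context_init(&ctx, compiler, gfx_level, family,
 *                         true, AC_FLOAT_MODE_DEFAULT, 64, 64);
 *    ... emit IR through ctx.builder into ctx.module ...
 *    ac_llvm_context_dispose(&ctx);
 *
 * Note that dispose only frees the control-flow stack; the LLVM context,
 * module, and builder are released separately by the caller.
 */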
int ac_get_llvm_num_components(LLVMValueRef value)
{
   LLVMTypeRef type = LLVMTypeOf(value);
   unsigned num_components =
      LLVMGetTypeKind(type) == LLVMVectorTypeKind ? LLVMGetVectorSize(type) : 1;
   return num_components;
}

LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)
{
   if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
      assert(index == 0);
      return value;
   }

   return LLVMBuildExtractElement(ac->builder, value, LLVMConstInt(ac->i32, index, false), "");
}

int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
{
   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
      type = LLVMGetElementType(type);

   if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
      return LLVMGetIntTypeWidth(type);

   if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
      if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_LDS)
         return 32;
   }

   if (type == ctx->f16)
      return 16;
   if (type == ctx->f32)
      return 32;
   if (type == ctx->f64)
      return 64;

   unreachable("Unhandled type kind in get_elem_bits");
}

unsigned ac_get_type_size(LLVMTypeRef type)
{
   LLVMTypeKind kind = LLVMGetTypeKind(type);

   switch (kind) {
   case LLVMIntegerTypeKind:
      return LLVMGetIntTypeWidth(type) / 8;
   case LLVMHalfTypeKind:
      return 2;
   case LLVMFloatTypeKind:
      return 4;
   case LLVMDoubleTypeKind:
      return 8;
   case LLVMPointerTypeKind:
      if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
         return 4;
      return 8;
   case LLVMVectorTypeKind:
      return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));
   case LLVMArrayTypeKind:
      return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));
   default:
      assert(0);
      return 0;
   }
}

static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i1)
      return ctx->i1;
   else if (t == ctx->i8)
      return ctx->i8;
   else if (t == ctx->f16 || t == ctx->i16)
      return ctx->i16;
   else if (t == ctx->f32 || t == ctx->i32)
      return ctx->i32;
   else if (t == ctx->f64 || t == ctx->i64)
      return ctx->i64;
   else
      unreachable("Unhandled integer size");
}

LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
      LLVMTypeRef elem_type = LLVMGetElementType(t);
      return LLVMVectorType(to_integer_type_scalar(ctx, elem_type), LLVMGetVectorSize(t));
   }
   if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
      switch (LLVMGetPointerAddressSpace(t)) {
      case AC_ADDR_SPACE_GLOBAL:
         return ctx->i64;
      case AC_ADDR_SPACE_CONST_32BIT:
      case AC_ADDR_SPACE_LDS:
         return ctx->i32;
      default:
         unreachable("unhandled address space");
      }
   }
   return to_integer_type_scalar(ctx, t);
}

LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef type = LLVMTypeOf(v);
   if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
      return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
   }
   return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
}

LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef type = LLVMTypeOf(v);
   if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
      return v;
   return ac_to_integer(ctx, v);
}
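/* Example (illustrative): these helpers type-pun via bitcasts, so a value
 * round-trips unchanged, e.g. for v of type <2 x float>:
 *
 *    LLVMValueRef vi = ac_to_integer(ctx, v);  // <2 x i32>
 *    LLVMValueRef vf = ac_to_float(ctx, vi);   // back to <2 x float>
 *
 * Pointers are the exception: ac_to_integer converts them with ptrtoint to
 * an integer sized for their address space (i64 for GLOBAL, i32 for
 * CONST_32BIT and LDS).
 */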
static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i8)
      return ctx->i8;
   else if (t == ctx->i16 || t == ctx->f16)
      return ctx->f16;
   else if (t == ctx->i32 || t == ctx->f32)
      return ctx->f32;
   else if (t == ctx->i64 || t == ctx->f64)
      return ctx->f64;
   else
      unreachable("Unhandled float size");
}

LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
      LLVMTypeRef elem_type = LLVMGetElementType(t);
      return LLVMVectorType(to_float_type_scalar(ctx, elem_type), LLVMGetVectorSize(t));
   }
   return to_float_type_scalar(ctx, t);
}

LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef type = LLVMTypeOf(v);
   return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
}

LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
                                LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count,
                                unsigned attrib_mask)
{
   LLVMValueRef call;
   bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);

   LLVMTypeRef param_types[32];
   assert(param_count <= 32);
   for (unsigned i = 0; i < param_count; ++i) {
      assert(params[i]);
      param_types[i] = LLVMTypeOf(params[i]);
   }

   LLVMTypeRef function_type = LLVMFunctionType(return_type, param_types, param_count, 0);
   LLVMValueRef function = LLVMGetNamedFunction(ctx->module, name);

   if (!function) {
      function = LLVMAddFunction(ctx->module, name, function_type);

      LLVMSetFunctionCallConv(function, LLVMCCallConv);
      LLVMSetLinkage(function, LLVMExternalLinkage);

      if (!set_callsite_attrs)
         ac_add_func_attributes(ctx->context, function, attrib_mask);
   }

   call = LLVMBuildCall2(ctx->builder, function_type, function, params, param_count, "");
   if (set_callsite_attrs)
      ac_add_func_attributes(ctx->context, call, attrib_mask);
   return call;
}

/**
 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
 * intrinsic names).
 */
void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
{
   LLVMTypeRef elem_type = type;

   if (LLVMGetTypeKind(type) == LLVMStructTypeKind) {
      unsigned count = LLVMCountStructElementTypes(type);
      int ret = snprintf(buf, bufsize, "sl_");
      buf += ret;
      bufsize -= ret;

      LLVMTypeRef *elems = alloca(count * sizeof(LLVMTypeRef));
      LLVMGetStructElementTypes(type, elems);

      for (unsigned i = 0; i < count; i++) {
         ac_build_type_name_for_intr(elems[i], buf, bufsize);
         ret = strlen(buf);
         buf += ret;
         bufsize -= ret;
      }

      snprintf(buf, bufsize, "s");
      return;
   }

   assert(bufsize >= 8);
   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
      int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));
      if (ret < 0) {
         char *type_name = LLVMPrintTypeToString(type);
         fprintf(stderr, "Error building type name for: %s\n", type_name);
         LLVMDisposeMessage(type_name);
         return;
      }
      elem_type = LLVMGetElementType(type);
      buf += ret;
      bufsize -= ret;
   }
   switch (LLVMGetTypeKind(elem_type)) {
   default:
      break;
   case LLVMIntegerTypeKind:
      snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
      break;
   case LLVMHalfTypeKind:
      snprintf(buf, bufsize, "f16");
      break;
   case LLVMFloatTypeKind:
      snprintf(buf, bufsize, "f32");
      break;
   case LLVMDoubleTypeKind:
      snprintf(buf, bufsize, "f64");
      break;
   }
}
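/* Examples (derived from the code above): f32 -> "f32", <4 x i32> -> "v4i32",
 * <2 x half> -> "v2f16", { i32, f32 } -> "sl_i32f32s". These suffixes are the
 * overloaded-type names used when constructing llvm.amdgcn.* intrinsic names,
 * which ac_build_intrinsic then emits, e.g. (illustrative):
 *
 *    LLVMValueRef x = ...; // f32
 *    LLVMValueRef r = ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32,
 *                                        &x, 1, AC_FUNC_ATTR_READNONE);
 */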
/**
 * Helper function that builds an LLVM IR PHI node and immediately adds
 * incoming edges.
 */
LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
                          LLVMValueRef *values, LLVMBasicBlockRef *blocks)
{
   LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
   LLVMAddIncoming(phi, values, blocks, count_incoming);
   return phi;
}

void ac_build_s_barrier(struct ac_llvm_context *ctx, gl_shader_stage stage)
{
   /* GFX6 only: s_barrier isn't needed in TCS because an entire patch always fits into
    * a single wave due to a bug workaround disallowing multi-wave HS workgroups.
    */
   if (ctx->gfx_level == GFX6 && stage == MESA_SHADER_TESS_CTRL)
      return;

   ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT);
}

/* Prevent optimizations (at least of memory accesses) across the current
 * point in the program by emitting empty inline assembly that is marked as
 * having side effects.
 *
 * Optionally, a value can be passed through the inline assembly to prevent
 * LLVM from hoisting calls to ReadNone functions.
 */
void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pgpr, bool sgpr)
{
   static int counter = 0;

   LLVMBuilderRef builder = ctx->builder;
   char code[16];
   const char *constraint = sgpr ? "=s,0" : "=v,0";

   snprintf(code, sizeof(code), "; %d", (int)p_atomic_inc_return(&counter));

   if (!pgpr) {
      LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
      LLVMBuildCall2(builder, ftype, inlineasm, NULL, 0, "");
   } else if (LLVMTypeOf(*pgpr) == ctx->i32) {
      /* Simple version for i32 that allows the caller to set LLVM metadata on the call
       * instruction. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
   } else if (LLVMTypeOf(*pgpr) == ctx->i16) {
      /* Simple version for i16 that allows the caller to set LLVM metadata on the call
       * instruction. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i16, &ctx->i16, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
   } else if (LLVMGetTypeKind(LLVMTypeOf(*pgpr)) == LLVMPointerTypeKind) {
      LLVMTypeRef type = LLVMTypeOf(*pgpr);
      LLVMTypeRef ftype = LLVMFunctionType(type, &type, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
   } else {
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);
      LLVMTypeRef type = LLVMTypeOf(*pgpr);
      unsigned bitsize = ac_get_elem_bits(ctx, type);
      LLVMValueRef vgpr = *pgpr;
      LLVMTypeRef vgpr_type;
      unsigned vgpr_size;
      LLVMValueRef vgpr0;

      if (bitsize < 32)
         vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, "");

      vgpr_type = LLVMTypeOf(vgpr);
      vgpr_size = ac_get_type_size(vgpr_type);

      assert(vgpr_size % 4 == 0);

      vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
      vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
      vgpr0 = LLVMBuildCall2(builder, ftype, inlineasm, &vgpr0, 1, "");
      vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
      vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");

      if (bitsize < 32)
         vgpr = LLVMBuildTrunc(builder, vgpr, type, "");

      *pgpr = vgpr;
   }
}

LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope)
{
   const char *subgroup = "llvm.readcyclecounter";
   const char *name = scope == NIR_SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : subgroup;

   LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);
   return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
}

LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name;

   if (LLVMTypeOf(value) == ctx->i1)
      value = LLVMBuildZExt(ctx->builder, value, ctx->i32, "");

   if (ctx->wave_size == 64)
      name = "llvm.amdgcn.icmp.i64.i32";
   else
      name = "llvm.amdgcn.icmp.i32.i32";

   LLVMValueRef args[3] = {value, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};

   /* We currently have no other way to prevent LLVM from lifting the icmp
    * calls to a dominating basic block.
    */
   ac_build_optimization_barrier(ctx, &args[0], false);

   args[0] = ac_to_integer(ctx, args[0]);

   return ac_build_intrinsic(
      ctx, name, ctx->iN_wavemask, args, 3,
      AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}

LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name;

   if (ctx->wave_size == 64)
      name = "llvm.amdgcn.icmp.i64.i1";
   else
      name = "llvm.amdgcn.icmp.i32.i1";

   LLVMValueRef args[3] = {
      value,
      ctx->i1false,
      LLVMConstInt(ctx->i32, LLVMIntNE, 0),
   };

   return ac_build_intrinsic(
      ctx, name, ctx->iN_wavemask, args, 3,
      AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}

LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef vote_set = ac_build_ballot(ctx, value);
   return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
}

LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef vote_set = ac_build_ballot(ctx, value);
   return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0),
                        "");
}

LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef vote_set = ac_build_ballot(ctx, value);

   LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
   LLVMValueRef none =
      LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0), "");
   return LLVMBuildOr(ctx->builder, all, none, "");
}

LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                            unsigned value_count, unsigned component)
{
   LLVMValueRef vec = NULL;

   if (value_count == 1) {
      return values[component];
   } else if (!value_count)
      unreachable("value_count is 0");

   for (unsigned i = component; i < value_count + component; i++) {
      LLVMValueRef value = values[i];

      if (i == component)
         vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
      LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
      vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
   }
   return vec;
}

LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                             unsigned value_count, unsigned value_stride,
                                             bool always_vector)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef vec = NULL;
   unsigned i;

   if (value_count == 1 && !always_vector) {
      return values[0];
   } else if (!value_count)
      unreachable("value_count is 0");

   for (i = 0; i < value_count; i++) {
      LLVMValueRef value = values[i * value_stride];

      if (!i)
         vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
      LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
      vec = LLVMBuildInsertElement(builder, vec, value, index, "");
   }
   return vec;
}

LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                    unsigned value_count)
{
   return ac_build_gather_values_extended(ctx, values, value_count, 1, false);
}
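/* Example (illustrative): the gather helpers assemble vectors from scalars,
 * e.g. for two i32 values x and y:
 *
 *    LLVMValueRef xy[2] = {x, y};
 *    LLVMValueRef v = ac_build_gather_values(ctx, xy, 2); // <2 x i32>
 *
 * ac_llvm_extract_elem is the inverse for a single component.
 */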
LLVMValueRef ac_build_concat(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   unsigned a_size = ac_get_llvm_num_components(a);
   unsigned b_size = ac_get_llvm_num_components(b);

   LLVMValueRef *elems = alloca((a_size + b_size) * sizeof(LLVMValueRef));
   for (unsigned i = 0; i < a_size; i++)
      elems[i] = ac_llvm_extract_elem(ctx, a, i);
   for (unsigned i = 0; i < b_size; i++)
      elems[a_size + i] = ac_llvm_extract_elem(ctx, b, i);

   return ac_build_gather_values(ctx, elems, a_size + b_size);
}

/* Expand a scalar or vector to <dst_channels x type> by filling the remaining
 * channels with undef. Extract at most src_channels components from the input.
 */
LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
                             unsigned src_channels, unsigned dst_channels)
{
   LLVMTypeRef elemtype;
   LLVMValueRef *const chan = alloca(dst_channels * sizeof(LLVMValueRef));

   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
      unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));

      if (src_channels == dst_channels && vec_size == dst_channels)
         return value;

      src_channels = MIN2(src_channels, vec_size);

      for (unsigned i = 0; i < src_channels; i++)
         chan[i] = ac_llvm_extract_elem(ctx, value, i);

      elemtype = LLVMGetElementType(LLVMTypeOf(value));
   } else {
      if (src_channels) {
         assert(src_channels == 1);
         chan[0] = value;
      }
      elemtype = LLVMTypeOf(value);
   }

   for (unsigned i = src_channels; i < dst_channels; i++)
      chan[i] = LLVMGetUndef(elemtype);

   return ac_build_gather_values(ctx, chan, dst_channels);
}

/* Extract components [start, start + channels) from a vector.
 */
LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,
                                   unsigned channels)
{
   LLVMValueRef *const chan = alloca(channels * sizeof(LLVMValueRef));

   for (unsigned i = 0; i < channels; i++)
      chan[i] = ac_llvm_extract_elem(ctx, value, i + start);

   return ac_build_gather_values(ctx, chan, channels);
}

/* Expand a scalar or vector to <4 x type> by filling the remaining channels
 * with undef. Extract at most num_channels components from the input.
 */
LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,
                                     unsigned num_channels)
{
   return ac_build_expand(ctx, value, num_channels, 4);
}

LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   unsigned type_size = ac_get_type_size(LLVMTypeOf(value));
   const char *name;

   if (type_size == 2)
      name = "llvm.rint.f16";
   else if (type_size == 4)
      name = "llvm.rint.f32";
   else
      name = "llvm.rint.f64";

   return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1, AC_FUNC_ATTR_READNONE);
}

LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den)
{
   unsigned type_size = ac_get_type_size(LLVMTypeOf(den));
   const char *name;

   /* For doubles, we need precise division to pass GLCTS. */
   if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL && type_size == 8)
      return LLVMBuildFDiv(ctx->builder, num, den, "");

   if (type_size == 2)
      name = "llvm.amdgcn.rcp.f16";
   else if (type_size == 4)
      name = "llvm.amdgcn.rcp.f32";
   else
      name = "llvm.amdgcn.rcp.f64";

   LLVMValueRef rcp =
      ac_build_intrinsic(ctx, name, LLVMTypeOf(den), &den, 1, AC_FUNC_ATTR_READNONE);

   return LLVMBuildFMul(ctx->builder, num, rcp, "");
}
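/* Note: the fast path above computes num * rcp(den), which is only
 * approximately correctly rounded (the hardware reciprocal is accurate to
 * roughly 1 ulp); that is why f64 falls back to a real divide in the
 * OpenGL default mode.
 */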
/* See fast_idiv_by_const.h. */
/* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num,
                                LLVMValueRef multiplier, LLVMValueRef pre_shift,
                                LLVMValueRef post_shift, LLVMValueRef increment)
{
   LLVMBuilderRef builder = ctx->builder;

   num = LLVMBuildLShr(builder, num, pre_shift, "");
   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   num = LLVMBuildAdd(builder, num, LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}

/* See fast_idiv_by_const.h. */
/* If num != UINT_MAX, this more efficient version can be used. */
/* Set: increment = util_fast_udiv_info::increment; */
LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num,
                                    LLVMValueRef multiplier, LLVMValueRef pre_shift,
                                    LLVMValueRef post_shift, LLVMValueRef increment)
{
   LLVMBuilderRef builder = ctx->builder;

   num = LLVMBuildLShr(builder, num, pre_shift, "");
   num = LLVMBuildNUWAdd(builder, num, increment, "");
   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}

/* See fast_idiv_by_const.h. */
/* Both operands must fit in 31 bits and the divisor must not be 1. */
LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num,
                                              LLVMValueRef multiplier, LLVMValueRef post_shift)
{
   LLVMBuilderRef builder = ctx->builder;

   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}

/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
 * already multiplied by two. id is the cube face number.
 */
struct cube_selection_coords {
   LLVMValueRef stc[2];
   LLVMValueRef ma;
   LLVMValueRef id;
};

static void build_cube_intrinsic(struct ac_llvm_context *ctx, LLVMValueRef in[3],
                                 struct cube_selection_coords *out)
{
   LLVMTypeRef f32 = ctx->f32;

   out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", f32, in, 3, AC_FUNC_ATTR_READNONE);
   out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", f32, in, 3, AC_FUNC_ATTR_READNONE);
   out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", f32, in, 3, AC_FUNC_ATTR_READNONE);
   out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", f32, in, 3, AC_FUNC_ATTR_READNONE);
}

/**
 * Build a manual selection sequence for cube face sc/tc coordinates and
 * major axis vector (multiplied by 2 for consistency) for the given
 * vec3 \p coords, for the face implied by \p selcoords.
 *
 * For the major axis, we always adjust the sign to be in the direction of
 * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
 * the selcoords major axis.
 */
static void build_cube_select(struct ac_llvm_context *ctx,
                              const struct cube_selection_coords *selcoords,
                              const LLVMValueRef *coords, LLVMValueRef *out_st,
                              LLVMValueRef *out_ma)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
   LLVMValueRef is_ma_positive;
   LLVMValueRef sgn_ma;
   LLVMValueRef is_ma_z, is_not_ma_z;
   LLVMValueRef is_ma_y;
   LLVMValueRef is_ma_x;
   LLVMValueRef sgn;
   LLVMValueRef tmp;

   is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->ma, LLVMConstReal(f32, 0.0), "");
   sgn_ma = LLVMBuildSelect(builder, is_ma_positive, LLVMConstReal(f32, 1.0),
                            LLVMConstReal(f32, -1.0), "");

   is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
   is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
   is_ma_y = LLVMBuildAnd(
      builder, is_not_ma_z,
      LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
   is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");

   /* Select sc */
   tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
   sgn = LLVMBuildSelect(
      builder, is_ma_y, LLVMConstReal(f32, 1.0),
      LLVMBuildSelect(builder, is_ma_z, sgn_ma, LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
   out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");

   /* Select tc */
   tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
   sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma, LLVMConstReal(f32, -1.0), "");
   out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");

   /* Select ma */
   tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
                         LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
   tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
   *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
}
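/* For reference (as implied by the comparisons above): cubeid returns the
 * face index as a float, with 0/1 = +X/-X, 2/3 = +Y/-Y and 4/5 = +Z/-Z,
 * so id >= 4 selects the Z axis and 2 <= id < 4 selects the Y axis.
 */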
void ac_prepare_cube_coords(struct ac_llvm_context *ctx, bool is_deriv, bool is_array, bool is_lod,
                            LLVMValueRef *coords_arg, LLVMValueRef *derivs_arg)
{
   LLVMBuilderRef builder = ctx->builder;
   struct cube_selection_coords selcoords;
   LLVMValueRef coords[3];
   LLVMValueRef invma;

   if (is_array && !is_lod) {
      LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);

      /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
       *
       *    "For Array forms, the array layer used will be
       *
       *       max(0, min(d-1, floor(layer+0.5)))
       *
       *     where d is the depth of the texture array and layer
       *     comes from the component indicated in the tables below."
       *
       * Rounding the layer here also works around an issue where the layer
       * is taken from a helper invocation which happens to fall on a
       * different layer due to extrapolation.
       *
       * GFX8 and earlier attempt to implement this in hardware by
       * clamping the value of coords[2] = (8 * layer) + face.
       * Unfortunately, this means that we end up with the wrong face when
       * clamping occurs.
       *
       * Clamp the layer earlier to work around the issue.
       */
      if (ctx->gfx_level <= GFX8) {
         LLVMValueRef ge0;
         ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
         tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
      }

      coords_arg[3] = tmp;
   }

   build_cube_intrinsic(ctx, coords_arg, &selcoords);

   invma =
      ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
   invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);

   for (int i = 0; i < 2; ++i)
      coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");

   coords[2] = selcoords.id;

   if (is_deriv && derivs_arg) {
      LLVMValueRef derivs[4];
      int axis;

      /* Convert cube derivatives to 2D derivatives. */
      for (axis = 0; axis < 2; axis++) {
         LLVMValueRef deriv_st[2];
         LLVMValueRef deriv_ma;

         /* Transform the derivative alongside the texture
          * coordinate. Mathematically, the correct formula is
          * as follows. Assume we're projecting onto the +Z face
          * and denote by dx/dh the derivative of the (original)
          * X texture coordinate with respect to horizontal
          * window coordinates. The projection onto the +Z face
          * plane is:
          *
          *    f(x,z) = x/z
          *
          * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
          *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
          *
          * This motivates the implementation below.
          *
          * Whether this actually gives the expected results for
          * apps that might feed in derivatives obtained via
          * finite differences is anyone's guess. The OpenGL spec
          * seems awfully quiet about how textureGrad for cube
          * maps should be handled.
          */
         build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3], deriv_st, &deriv_ma);

         deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");

         for (int i = 0; i < 2; ++i)
            derivs[axis * 2 + i] =
               LLVMBuildFSub(builder, LLVMBuildFMul(builder, deriv_st[i], invma, ""),
                             LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
      }

      memcpy(derivs_arg, derivs, sizeof(derivs));
   }

   /* Shift the texture coordinate. This must be applied after the
    * derivative calculation. After the division by the (doubled) major
    * axis, sc/tc are in [-0.5, 0.5]; the +1.5 offset moves them into the
    * [1.0, 2.0] range that the hardware expects.
    */
   for (int i = 0; i < 2; ++i)
      coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");

   if (is_array) {
      /* For cube arrays, coords_arg.w is the array index, and
       * coord.z = array_index * 8 + face. */
      coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
   }

   memcpy(coords_arg, coords, sizeof(coords));
}

LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                LLVMValueRef j)
{
   LLVMValueRef args[5];

   if (ctx->gfx_level >= GFX11) {
      LLVMValueRef p;
      LLVMValueRef p10;

      args[0] = llvm_chan;
      args[1] = attr_number;
      args[2] = params;

      p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
                             ctx->f32, args, 3, AC_FUNC_ATTR_READNONE);

      args[0] = p;
      args[1] = i;
      args[2] = p;

      p10 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10",
                               ctx->f32, args, 3, AC_FUNC_ATTR_READNONE);

      args[0] = p;
      args[1] = j;
      args[2] = p10;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2",
                                ctx->f32, args, 3, AC_FUNC_ATTR_READNONE);
   } else {
      LLVMValueRef p1;

      args[0] = i;
      args[1] = llvm_chan;
      args[2] = attr_number;
      args[3] = params;

      p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
                              ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);

      args[0] = p1;
      args[1] = j;
      args[2] = llvm_chan;
      args[3] = attr_number;
      args[4] = params;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
                                ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
   }
}
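/* The two-step sequence above is the usual plane-equation interpolation,
 * roughly v = p0 + i * dv/di + j * dv/dj: P1 accumulates the i term and P2
 * adds the j term, with (i, j) being the barycentrics. On GFX11 the
 * attribute data is first loaded from LDS and the same math is done with
 * the in-register interp instructions.
 */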
LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                    LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                    LLVMValueRef j, bool high_16bits)
{
   LLVMValueRef args[6];

   if (ctx->gfx_level >= GFX11) {
      LLVMValueRef p;
      LLVMValueRef p10;

      args[0] = llvm_chan;
      args[1] = attr_number;
      args[2] = params;

      p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
                             ctx->f32, args, 3, AC_FUNC_ATTR_READNONE);

      args[0] = p;
      args[1] = i;
      args[2] = p;
      args[3] = high_16bits ? ctx->i1true : ctx->i1false;

      p10 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10.f16",
                               ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);

      args[0] = p;
      args[1] = j;
      args[2] = p10;
      args[3] = high_16bits ? ctx->i1true : ctx->i1false;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2.f16",
                                ctx->f16, args, 4, AC_FUNC_ATTR_READNONE);
   } else {
      LLVMValueRef p1;

      args[0] = i;
      args[1] = llvm_chan;
      args[2] = attr_number;
      args[3] = high_16bits ? ctx->i1true : ctx->i1false;
      args[4] = params;

      p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, args, 5,
                              AC_FUNC_ATTR_READNONE);

      args[0] = p1;
      args[1] = j;
      args[2] = llvm_chan;
      args[3] = attr_number;
      args[4] = high_16bits ? ctx->i1true : ctx->i1false;
      args[5] = params;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, args, 6,
                                AC_FUNC_ATTR_READNONE);
   }
}

LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter,
                                    LLVMValueRef llvm_chan, LLVMValueRef attr_number,
                                    LLVMValueRef params)
{
   LLVMValueRef args[4];

   if (ctx->gfx_level >= GFX11) {
      LLVMValueRef p;

      args[0] = llvm_chan;
      args[1] = attr_number;
      args[2] = params;

      p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
                             ctx->f32, args, 3, AC_FUNC_ATTR_READNONE);
      p = ac_build_quad_swizzle(ctx, p, 0, 0, 0, 0);
      return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32, &p, 1, AC_FUNC_ATTR_READNONE);
   } else {
      args[0] = parameter;
      args[1] = llvm_chan;
      args[2] = attr_number;
      args[3] = params;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, args, 4,
                                AC_FUNC_ATTR_READNONE);
   }
}

LLVMValueRef ac_build_gep_ptr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                              LLVMValueRef index)
{
   return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
}

LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
{
   LLVMValueRef indices[2] = {
      ctx->i32_0,
      index,
   };
   return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
}

LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMValueRef index)
{
   LLVMValueRef offset_ptr = LLVMBuildGEP(ctx->builder, ptr, &index, 1, "");
   return LLVMBuildPointerCast(ctx->builder, offset_ptr, LLVMTypeOf(ptr), "");
}

void ac_build_indexed_store(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index,
                            LLVMValueRef value)
{
   LLVMBuildStore(ctx->builder, value, ac_build_gep0(ctx, base_ptr, index));
}

/**
 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
 * It's equivalent to doing a load from &base_ptr[index].
 *
 * \param base_ptr  Where the array starts.
 * \param index     The element index into the array.
 * \param uniform   Whether the base_ptr and index can be assumed to be
 *                  dynamically uniform (i.e. load to an SGPR).
 * \param invariant Whether the load is invariant (no other opcodes affect it).
 * \param no_unsigned_wraparound
 *    For all possible re-associations and re-distributions of an expression
 *    "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
 *    without inbounds in base_ptr), this parameter is true if "addr + offset"
 *    does not result in an unsigned integer wraparound. This is used for
 *    optimal code generation of 32-bit pointer arithmetic.
 *
 *    For example, a 32-bit immediate offset that causes a 32-bit unsigned
 *    integer wraparound can't be an imm offset in s_load_dword, because
 *    the instruction performs "addr + offset" in 64 bits.
 *
 * Expected usage for bindless textures by chaining GEPs:
 *    // possible unsigned wraparound, don't use InBounds:
 *    ptr1 = LLVMBuildGEP(base_ptr, index);
 *    image = load(ptr1); // becomes "s_load ptr1, 0"
 *
 *    ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
 *    sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
 */
static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                         LLVMValueRef index, bool uniform, bool invariant,
                                         bool no_unsigned_wraparound)
{
   LLVMValueRef pointer, result;

   if (no_unsigned_wraparound &&
       LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
      pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "");
   else
      pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");

   if (uniform)
      LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
   result = LLVMBuildLoad(ctx->builder, pointer, "");
   if (invariant)
      LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
   LLVMSetAlignment(result, 4);
   return result;
}

LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, false, false, false);
}

LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                     LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, false, true, false);
}

/* This assumes that there is no unsigned integer wraparound during the address
 * computation, excluding all GEPs within base_ptr. */
LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                   LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
}

/* See ac_build_load_custom() documentation. */
LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
                                                   LLVMValueRef base_ptr, LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
}

static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
{
   return cache_policy |
          (ctx->gfx_level >= GFX10 && ctx->gfx_level < GFX11 && cache_policy & ac_glc ? ac_dlc : 0);
}

static unsigned get_store_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
{
   if (ctx->gfx_level >= GFX11)
      cache_policy &= ~ac_glc; /* GLC has no effect on stores */
   return cache_policy;
}

static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef data, LLVMValueRef vindex,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         unsigned cache_policy, bool use_format)
{
   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = data;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (vindex)
      args[idx++] = vindex;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_store_cache_policy(ctx, cache_policy), 0);
   const char *indexing_kind = vindex ? "struct" : "raw";
   char name[256], type_name[8];

   ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);
   }

   ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}

void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
                                  LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy)
{
   ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, cache_policy, true);
}

/* buffer_store_dword(,x2,x3,x4) <- the suffix is selected by the type of vdata. */
void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                 LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                 unsigned cache_policy)
{
   unsigned num_channels = ac_get_llvm_num_components(vdata);

   /* Split 3 channel stores if unsupported. */
   if (num_channels == 3 && !ac_has_vec3_support(ctx->gfx_level, false)) {
      LLVMValueRef v[3], v01, voffset2;

      for (int i = 0; i < 3; i++) {
         v[i] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, i, 0), "");
      }
      v01 = ac_build_gather_values(ctx, v, 2);

      voffset2 = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0,
                              LLVMConstInt(ctx->i32, 8, 0), "");

      ac_build_buffer_store_dword(ctx, rsrc, v01, vindex, voffset, soffset, cache_policy);
      ac_build_buffer_store_dword(ctx, rsrc, v[2], vindex, voffset2, soffset, cache_policy);
      return;
   }

   ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), vindex, voffset, soffset,
                                cache_policy, false);
}
static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                                LLVMValueRef vindex, LLVMValueRef voffset,
                                                LLVMValueRef soffset, unsigned num_channels,
                                                LLVMTypeRef channel_type, unsigned cache_policy,
                                                bool can_speculate, bool use_format,
                                                bool structurized)
{
   LLVMValueRef args[5];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
   unsigned func =
      !ac_has_vec3_support(ctx->gfx_level, use_format) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   /* D16 is only supported on gfx8+ */
   assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||
          ctx->gfx_level >= GFX8);

   LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name);
   }

   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
}

LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                  LLVMTypeRef channel_type, unsigned cache_policy,
                                  bool can_speculate, bool allow_smem)
{
   if (allow_smem && !(cache_policy & ac_slc) &&
       (!(cache_policy & ac_glc) || ctx->gfx_level >= GFX8)) {
      assert(vindex == NULL);

      LLVMValueRef result[8];

      LLVMValueRef offset = voffset ? voffset : ctx->i32_0;
      if (soffset)
         offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");

      for (int i = 0; i < num_channels; i++) {
         if (i) {
            offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, 4, 0), "");
         }
         LLVMValueRef args[3] = {
            rsrc,
            offset,
            LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),
         };
         result[i] = ac_build_intrinsic(ctx, "llvm.amdgcn.s.buffer.load.f32", ctx->f32, args, 3,
                                        AC_FUNC_ATTR_READNONE);
      }
      if (num_channels == 1)
         return result[0];

      if (num_channels == 3 && !ac_has_vec3_support(ctx->gfx_level, false))
         result[num_channels++] = LLVMGetUndef(ctx->f32);
      return ac_build_gather_values(ctx, result, num_channels);
   }

   return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, soffset, num_channels,
                                      channel_type, cache_policy, can_speculate, false, false);
}
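/* Example of the resulting intrinsic names (derived from the helpers above):
 * a v4f32 buffer store with a vindex becomes
 * "llvm.amdgcn.struct.buffer.store.v4f32"; without a vindex it becomes
 * "llvm.amdgcn.raw.buffer.store.v4f32", and the loads are named analogously.
 *
 * Note on the SMEM path above: when the load may go through SMEM (no SLC,
 * and GLC only on GFX8+), each channel is fetched with a scalar
 * "llvm.amdgcn.s.buffer.load.f32" at offset, offset+4, ... and the results
 * are regathered into a vector; otherwise the generic VMEM buffer load is
 * used.
 */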
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef vindex, LLVMValueRef voffset,
                                         unsigned num_channels, unsigned cache_policy,
                                         bool can_speculate, bool d16, bool tfe)
{
   if (tfe) {
      assert(!d16);

      cache_policy = get_load_cache_policy(ctx, cache_policy);

      char code[256];
      /* The definition in the assembly and the one in the constraint string
       * differs because of an assembler bug.
       */
      snprintf(code, sizeof(code),
               "v_mov_b32 v0, 0\n"
               "v_mov_b32 v1, 0\n"
               "v_mov_b32 v2, 0\n"
               "v_mov_b32 v3, 0\n"
               "v_mov_b32 v4, 0\n"
               "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe %s\n"
               "s_waitcnt vmcnt(0)",
               cache_policy & ac_glc ? "glc" : "",
               cache_policy & ac_slc ? "slc" : "",
               cache_policy & ac_dlc ? "dlc" : "");

      LLVMTypeRef param_types[] = {ctx->v2i32, ctx->v4i32};
      LLVMTypeRef calltype = LLVMFunctionType(LLVMVectorType(ctx->f32, 5), param_types, 2, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(calltype, code, "=&{v[0:4]},v,s", false, false);

      LLVMValueRef addr_comp[2] = {vindex ? vindex : ctx->i32_0,
                                   voffset ? voffset : ctx->i32_0};

      LLVMValueRef args[] = {ac_build_gather_values(ctx, addr_comp, 2),
                             LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "")};
      LLVMValueRef res = LLVMBuildCall2(ctx->builder, calltype, inlineasm, args, 2, "");

      return ac_build_concat(ctx, ac_trim_vector(ctx, res, num_channels),
                             ac_llvm_extract_elem(ctx, res, 4));
   }

   return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, num_channels,
                                      d16 ? ctx->f16 : ctx->f32, cache_policy, can_speculate, true,
                                      true);
}

static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, unsigned num_channels,
                                          unsigned dfmt, unsigned nfmt, unsigned cache_policy,
                                          bool can_speculate, bool structurized)
{
   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->gfx_level, dfmt, nfmt), 0);
   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
   unsigned func =
      !ac_has_vec3_support(ctx->gfx_level, true) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", indexing_kind, type_name);

   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
}

LLVMValueRef ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, unsigned num_channels,
                                          unsigned dfmt, unsigned nfmt, unsigned cache_policy,
                                          bool can_speculate)
{
   return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, num_channels, dfmt,
                                nfmt, cache_policy, can_speculate, true);
}

LLVMValueRef ac_build_buffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef voffset, LLVMValueRef soffset,
                                        unsigned cache_policy)
{
   return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i16,
                                      cache_policy, false, false, false);
}

LLVMValueRef ac_build_buffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                       LLVMValueRef voffset, LLVMValueRef soffset,
                                       unsigned cache_policy)
{
   return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i8, cache_policy,
                                      false, false, false);
}

/**
 * Convert an 11- or 10-bit unsigned floating point number to an f32.
 *
 * The input exponent is expected to be biased analogous to IEEE-754, i.e. by
 * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
 */
static LLVMValueRef ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src,
                                    unsigned exp_bits, unsigned mant_bits)
{
   assert(LLVMTypeOf(src) == ctx->i32);

   LLVMValueRef tmp;
   LLVMValueRef mantissa;
   mantissa =
      LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");

   /* Converting normal numbers is just a shift + correcting the exponent bias */
   unsigned normal_shift = 23 - mant_bits;
   unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
   LLVMValueRef shifted, normal;

   shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");
   normal =
      LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");

   /* Converting nan/inf numbers is the same, but with a different exponent update */
   LLVMValueRef naninf;
   naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");

   /* Converting denormals is the complex case: determine the leading zeros of the
    * mantissa to obtain the correct shift for the mantissa and exponent correction.
    */
   LLVMValueRef denormal;
   LLVMValueRef params[2] = {
      mantissa, ctx->i1true, /* result can be undef when arg is 0 */
   };
   LLVMValueRef ctlz =
      ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32, params, 2, AC_FUNC_ATTR_READNONE);

   /* Shift such that the leading 1 ends up as the LSB of the exponent field. */
   tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");
   denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");

   unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
   tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");
   tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");
   denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");

   /* Select the final result. */
   LLVMValueRef result;

   tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
                       LLVMConstInt(ctx->i32, ((1ULL << exp_bits) - 1) << mant_bits, false), "");
   result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");

   tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
                       LLVMConstInt(ctx->i32, 1ULL << mant_bits, false), "");
   result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");

   tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");
   result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");

   return ac_to_float(ctx, result);
}

/**
 * Generate a fully general open coded buffer format fetch with all required
 * fixups suitable for vertex fetch, using non-format buffer loads.
 *
 * Some combinations of argument values have special interpretations:
 * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
 * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
 *
 * \param log_size      log(size of channel in bytes)
 * \param num_channels  number of channels (1 to 4)
 * \param format        AC_FETCH_FORMAT_xxx value
 * \param reverse       whether XYZ channels are reversed
 * \param known_aligned whether the source is known to be aligned to hardware's
 *                      effective element size for loading the given format
 *                      (note: this means dword alignment for 8_8_8_8, 16_16, etc.)
 * \param rsrc          buffer resource descriptor
 * \return the resulting vector of floats or integers bitcast to <4 x i32>
 */
LLVMValueRef ac_build_opencoded_load_format(struct ac_llvm_context *ctx, unsigned log_size,
                                            unsigned num_channels, unsigned format, bool reverse,
                                            bool known_aligned, LLVMValueRef rsrc,
                                            LLVMValueRef vindex, LLVMValueRef voffset,
                                            LLVMValueRef soffset, unsigned cache_policy,
                                            bool can_speculate)
{
   LLVMValueRef tmp;
   unsigned load_log_size = log_size;
   unsigned load_num_channels = num_channels;
   if (log_size == 3) {
      load_log_size = 2;
      if (format == AC_FETCH_FORMAT_FLOAT) {
         load_num_channels = 2 * num_channels;
      } else {
         load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
      }
   }

   int log_recombine = 0;
   if ((ctx->gfx_level == GFX6 || ctx->gfx_level >= GFX10) && !known_aligned) {
      /* Avoid alignment restrictions by loading one byte at a time. */
      load_num_channels <<= load_log_size;
      log_recombine = load_log_size;
      load_log_size = 0;
   } else if (load_num_channels == 2 || load_num_channels == 4) {
      log_recombine = -util_logbase2(load_num_channels);
      load_num_channels = 1;
      load_log_size += -log_recombine;
   }

   LLVMValueRef loads[32]; /* up to 32 bytes */
   for (unsigned i = 0; i < load_num_channels; ++i) {
      tmp =
         LLVMBuildAdd(ctx->builder, soffset, LLVMConstInt(ctx->i32, i << load_log_size, false), "");
      LLVMTypeRef channel_type =
         load_log_size == 0 ? ctx->i8 : load_log_size == 1 ? ctx->i16 : ctx->i32;
      unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
      loads[i] =
         ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, tmp, num_channels, channel_type,
                                     cache_policy, can_speculate, false, true);
      if (load_log_size >= 2)
         loads[i] = ac_to_integer(ctx, loads[i]);
   }
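   /* Example (illustrative): a potentially unaligned 16_16 fetch on GFX6 or
    * GFX10+ is performed as four 1-byte loads (load_log_size = 0,
    * load_num_channels = 4, log_recombine = 1); the bytes are then OR-ed
    * back together into two i16 channels below.
    */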
ctx->i32 : ctx->i16; 1572 1573 for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) { 1574 LLVMValueRef accum = NULL; 1575 for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) { 1576 tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, ""); 1577 if (i == 0) { 1578 accum = tmp; 1579 } else { 1580 tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(dst_type, 8 * i, false), ""); 1581 accum = LLVMBuildOr(ctx->builder, accum, tmp, ""); 1582 } 1583 } 1584 loads[dst] = accum; 1585 } 1586 } else if (log_recombine < 0) { 1587 /* Split vectors of dwords */ 1588 if (load_log_size > 2) { 1589 assert(load_num_channels == 1); 1590 LLVMValueRef loaded = loads[0]; 1591 unsigned log_split = load_log_size - 2; 1592 log_recombine += log_split; 1593 load_num_channels = 1 << log_split; 1594 load_log_size = 2; 1595 for (unsigned i = 0; i < load_num_channels; ++i) { 1596 tmp = LLVMConstInt(ctx->i32, i, false); 1597 loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, ""); 1598 } 1599 } 1600 1601 /* Further split dwords and shorts if required */ 1602 if (log_recombine < 0) { 1603 for (unsigned src = load_num_channels, dst = load_num_channels << -log_recombine; src > 0; 1604 --src) { 1605 unsigned dst_bits = 1 << (3 + load_log_size + log_recombine); 1606 LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits); 1607 LLVMValueRef loaded = loads[src - 1]; 1608 LLVMTypeRef loaded_type = LLVMTypeOf(loaded); 1609 for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) { 1610 tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false); 1611 tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, ""); 1612 loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, ""); 1613 } 1614 } 1615 } 1616 } 1617 1618 if (log_size == 3) { 1619 if (format == AC_FETCH_FORMAT_FLOAT) { 1620 for (unsigned i = 0; i < num_channels; ++i) { 1621 tmp = ac_build_gather_values(ctx, &loads[2 * i], 2); 1622 loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, ""); 1623 } 1624 } else if (format == AC_FETCH_FORMAT_FIXED) { 1625 /* 10_11_11_FLOAT */ 1626 LLVMValueRef data = loads[0]; 1627 LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false); 1628 LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, ""); 1629 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), ""); 1630 LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, ""); 1631 LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), ""); 1632 1633 loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6)); 1634 loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6)); 1635 loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5)); 1636 1637 num_channels = 3; 1638 log_size = 2; 1639 format = AC_FETCH_FORMAT_FLOAT; 1640 } else { 1641 /* 2_10_10_10 data formats */ 1642 LLVMValueRef data = loads[0]; 1643 LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10); 1644 LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2); 1645 loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, ""); 1646 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), ""); 1647 loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, ""); 1648 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), ""); 1649 loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, ""); 1650 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), ""); 1651 loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, ""); 1652 1653 num_channels = 4; 1654 } 1655 } 1656 1657 if (format == 
AC_FETCH_FORMAT_FLOAT) { 1658 if (log_size != 2) { 1659 for (unsigned chan = 0; chan < num_channels; ++chan) { 1660 tmp = ac_to_float(ctx, loads[chan]); 1661 if (log_size == 3) 1662 tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, ""); 1663 else if (log_size == 1) 1664 tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, ""); 1665 loads[chan] = ac_to_integer(ctx, tmp); 1666 } 1667 } 1668 } else if (format == AC_FETCH_FORMAT_UINT) { 1669 if (log_size != 2) { 1670 for (unsigned chan = 0; chan < num_channels; ++chan) 1671 loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, ""); 1672 } 1673 } else if (format == AC_FETCH_FORMAT_SINT) { 1674 if (log_size != 2) { 1675 for (unsigned chan = 0; chan < num_channels; ++chan) 1676 loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, ""); 1677 } 1678 } else { 1679 bool unsign = format == AC_FETCH_FORMAT_UNORM || format == AC_FETCH_FORMAT_USCALED || 1680 format == AC_FETCH_FORMAT_UINT; 1681 1682 for (unsigned chan = 0; chan < num_channels; ++chan) { 1683 if (unsign) { 1684 tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, ""); 1685 } else { 1686 tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, ""); 1687 } 1688 1689 LLVMValueRef scale = NULL; 1690 if (format == AC_FETCH_FORMAT_FIXED) { 1691 assert(log_size == 2); 1692 scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000); 1693 } else if (format == AC_FETCH_FORMAT_UNORM) { 1694 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan])); 1695 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1)); 1696 } else if (format == AC_FETCH_FORMAT_SNORM) { 1697 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan])); 1698 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1)); 1699 } 1700 if (scale) 1701 tmp = LLVMBuildFMul(ctx->builder, tmp, scale, ""); 1702 1703 if (format == AC_FETCH_FORMAT_SNORM) { 1704 /* Clamp to [-1, 1] */ 1705 LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0); 1706 LLVMValueRef clamp = LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, ""); 1707 tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, ""); 1708 } 1709 1710 loads[chan] = ac_to_integer(ctx, tmp); 1711 } 1712 } 1713 1714 while (num_channels < 4) { 1715 if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) { 1716 loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0; 1717 } else { 1718 loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0); 1719 } 1720 num_channels++; 1721 } 1722 1723 if (reverse) { 1724 tmp = loads[0]; 1725 loads[0] = loads[2]; 1726 loads[2] = tmp; 1727 } 1728 1729 return ac_build_gather_values(ctx, loads, 4); 1730} 1731 1732void ac_build_buffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc, 1733 LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset, 1734 unsigned cache_policy) 1735{ 1736 vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, ""); 1737 1738 ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false); 1739} 1740 1741void ac_build_buffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, 1742 LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy) 1743{ 1744 vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, ""); 1745 1746 ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false); 1747} 1748 1749/** 1750 * Set range metadata on an instruction. This can only be used on load and 1751 * call instructions. 
If you know an instruction can only produce the values
 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
 * \p lo is the minimum value inclusive.
 * \p hi is the maximum value exclusive.
 */
void ac_set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
                           unsigned hi)
{
   LLVMValueRef range_md, md_args[2];
   LLVMTypeRef type = LLVMTypeOf(value);
   LLVMContextRef context = LLVMGetTypeContext(type);

   md_args[0] = LLVMConstInt(type, lo, false);
   md_args[1] = LLVMConstInt(type, hi, false);
   range_md = LLVMMDNodeInContext(context, md_args, 2);
   LLVMSetMetadata(value, ctx->range_md_kind, range_md);
}

LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)
{
   return ac_build_mbcnt(ctx, LLVMConstInt(ctx->iN_wavemask, ~0ull, 0));
}

/*
 * AMD GCN implements derivatives using the local data store (LDS).
 * All writes to the LDS happen in all executing threads at
 * the same time. TID is the Thread ID for the current
 * thread and is a value between 0 and 63, representing
 * the thread's position in the wavefront.
 *
 * For the pixel shader, threads are grouped into quads of four pixels.
 * The TIDs of the pixels of a quad are:
 *
 *   +------+------+
 *   |4n + 0|4n + 1|
 *   +------+------+
 *   |4n + 2|4n + 3|
 *   +------+------+
 *
 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
 * the current pixel's column, and masking with 0xfffffffe yields the TID
 * of the left pixel of the current pixel's row.
 *
 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
 * adding 2 yields the TID of the pixel below the top pixel.
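 *
 * As an illustration (values chosen to match how ac_build_ddxy below
 * combines mask and idx, not taken from a specific caller): ddx can be
 * computed with mask = 0xfffffffe and idx = 1 (right pixel minus left
 * pixel), and ddy with mask = 0xfffffffd and idx = 2 (bottom pixel minus
 * top pixel).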
1797 */ 1798LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val) 1799{ 1800 unsigned tl_lanes[4], trbl_lanes[4]; 1801 char name[32], type[8]; 1802 LLVMValueRef tl, trbl; 1803 LLVMTypeRef result_type; 1804 LLVMValueRef result; 1805 1806 result_type = ac_to_float_type(ctx, LLVMTypeOf(val)); 1807 1808 if (result_type == ctx->f16) 1809 val = LLVMBuildZExt(ctx->builder, val, ctx->i32, ""); 1810 else if (result_type == ctx->v2f16) 1811 val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, ""); 1812 1813 for (unsigned i = 0; i < 4; ++i) { 1814 tl_lanes[i] = i & mask; 1815 trbl_lanes[i] = (i & mask) + idx; 1816 } 1817 1818 tl = ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3]); 1819 trbl = 1820 ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]); 1821 1822 if (result_type == ctx->f16) { 1823 tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, ""); 1824 trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, ""); 1825 } 1826 1827 tl = LLVMBuildBitCast(ctx->builder, tl, result_type, ""); 1828 trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, ""); 1829 result = LLVMBuildFSub(ctx->builder, trbl, tl, ""); 1830 1831 ac_build_type_name_for_intr(result_type, type, sizeof(type)); 1832 snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type); 1833 1834 return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0); 1835} 1836 1837void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t msg, LLVMValueRef wave_id) 1838{ 1839 LLVMValueRef args[2]; 1840 args[0] = LLVMConstInt(ctx->i32, msg, false); 1841 args[1] = wave_id; 1842 ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0); 1843} 1844 1845LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type) 1846{ 1847 LLVMValueRef msb = 1848 ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, AC_FUNC_ATTR_READNONE); 1849 1850 /* The HW returns the last bit index from MSB, but NIR/TGSI wants 1851 * the index from LSB. Invert it by doing "31 - msb". 
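 *
 * Worked example (illustrative): for arg = 0x00f00000 the highest set bit
 * is bit 23, so sffbh returns 31 - 23 = 8, and 31 - 8 = 23 is the LSB-based
 * index that NIR/TGSI expect.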
 */
   msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), msb, "");

   LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
   LLVMValueRef cond =
      LLVMBuildOr(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, ctx->i32_0, ""),
                  LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, all_ones, ""), "");

   return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
}

LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
{
   const char *intrin_name;
   LLVMTypeRef type;
   LLVMValueRef highest_bit;
   LLVMValueRef zero;
   unsigned bitsize;

   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
   switch (bitsize) {
   case 64:
      intrin_name = "llvm.ctlz.i64";
      type = ctx->i64;
      highest_bit = LLVMConstInt(ctx->i64, 63, false);
      zero = ctx->i64_0;
      break;
   case 32:
      intrin_name = "llvm.ctlz.i32";
      type = ctx->i32;
      highest_bit = LLVMConstInt(ctx->i32, 31, false);
      zero = ctx->i32_0;
      break;
   case 16:
      intrin_name = "llvm.ctlz.i16";
      type = ctx->i16;
      highest_bit = LLVMConstInt(ctx->i16, 15, false);
      zero = ctx->i16_0;
      break;
   case 8:
      intrin_name = "llvm.ctlz.i8";
      type = ctx->i8;
      highest_bit = LLVMConstInt(ctx->i8, 7, false);
      zero = ctx->i8_0;
      break;
   default:
      unreachable("invalid bitsize");
      break;
   }

   LLVMValueRef params[2] = {
      arg,
      ctx->i1true,
   };

   LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);

   /* The HW returns the bit index counted from the MSB, but TGSI/NIR wants
    * the index counted from the LSB. Invert it by computing "highest_bit - msb". */
   msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");

   if (bitsize == 64) {
      msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
   } else if (bitsize < 32) {
      msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");
   }

   /* check for zero */
   return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
                          LLVMConstInt(ctx->i32, -1, true), msb, "");
}

LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   char name[64], type[64];

   ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.minnum.%s", type);
   LLVMValueRef args[2] = {a, b};
   return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE);
}

LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   char name[64], type[64];

   ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.maxnum.%s", type);
   LLVMValueRef args[2] = {a, b};
   return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE);
}

LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
   return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
}

LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
   return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
}

LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx,
LLVMValueRef a, LLVMValueRef b) 1956{ 1957 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, ""); 1958 return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); 1959} 1960 1961LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b) 1962{ 1963 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, ""); 1964 return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); 1965} 1966 1967LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value) 1968{ 1969 LLVMTypeRef t = LLVMTypeOf(value); 1970 return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)), 1971 LLVMConstReal(t, 1.0)); 1972} 1973 1974void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a) 1975{ 1976 LLVMValueRef args[9]; 1977 1978 args[0] = LLVMConstInt(ctx->i32, a->target, 0); 1979 args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0); 1980 1981 if (a->compr) { 1982 assert(ctx->gfx_level < GFX11); 1983 1984 args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, ""); 1985 args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, ""); 1986 args[4] = LLVMConstInt(ctx->i1, a->done, 0); 1987 args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0); 1988 1989 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0); 1990 } else { 1991 args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->f32, ""); 1992 args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->f32, ""); 1993 args[4] = LLVMBuildBitCast(ctx->builder, a->out[2], ctx->f32, ""); 1994 args[5] = LLVMBuildBitCast(ctx->builder, a->out[3], ctx->f32, ""); 1995 args[6] = LLVMConstInt(ctx->i1, a->done, 0); 1996 args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0); 1997 1998 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0); 1999 } 2000} 2001 2002void ac_build_export_null(struct ac_llvm_context *ctx, bool uses_discard) 2003{ 2004 struct ac_export_args args; 2005 2006 /* Gfx10+ doesn't need to export anything if we don't need to export the EXEC mask 2007 * for discard. 2008 */ 2009 if (ctx->gfx_level >= GFX10 && !uses_discard) 2010 return; 2011 2012 args.enabled_channels = 0x0; /* enabled channels */ 2013 args.valid_mask = 1; /* whether the EXEC mask is valid */ 2014 args.done = 1; /* DONE bit */ 2015 /* Gfx11 doesn't support null exports, and mrt0 should be exported instead. */ 2016 args.target = ctx->gfx_level >= GFX11 ? 
V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL; 2017 args.compr = 0; /* COMPR flag (0 = 32-bit export) */ 2018 args.out[0] = LLVMGetUndef(ctx->f32); /* R */ 2019 args.out[1] = LLVMGetUndef(ctx->f32); /* G */ 2020 args.out[2] = LLVMGetUndef(ctx->f32); /* B */ 2021 args.out[3] = LLVMGetUndef(ctx->f32); /* A */ 2022 2023 ac_build_export(ctx, &args); 2024} 2025 2026static unsigned ac_num_coords(enum ac_image_dim dim) 2027{ 2028 switch (dim) { 2029 case ac_image_1d: 2030 return 1; 2031 case ac_image_2d: 2032 case ac_image_1darray: 2033 return 2; 2034 case ac_image_3d: 2035 case ac_image_cube: 2036 case ac_image_2darray: 2037 case ac_image_2dmsaa: 2038 return 3; 2039 case ac_image_2darraymsaa: 2040 return 4; 2041 default: 2042 unreachable("ac_num_coords: bad dim"); 2043 } 2044} 2045 2046static unsigned ac_num_derivs(enum ac_image_dim dim) 2047{ 2048 switch (dim) { 2049 case ac_image_1d: 2050 case ac_image_1darray: 2051 return 2; 2052 case ac_image_2d: 2053 case ac_image_2darray: 2054 case ac_image_cube: 2055 return 4; 2056 case ac_image_3d: 2057 return 6; 2058 case ac_image_2dmsaa: 2059 case ac_image_2darraymsaa: 2060 default: 2061 unreachable("derivatives not supported"); 2062 } 2063} 2064 2065static const char *get_atomic_name(enum ac_atomic_op op) 2066{ 2067 switch (op) { 2068 case ac_atomic_swap: 2069 return "swap"; 2070 case ac_atomic_add: 2071 return "add"; 2072 case ac_atomic_sub: 2073 return "sub"; 2074 case ac_atomic_smin: 2075 return "smin"; 2076 case ac_atomic_umin: 2077 return "umin"; 2078 case ac_atomic_smax: 2079 return "smax"; 2080 case ac_atomic_umax: 2081 return "umax"; 2082 case ac_atomic_and: 2083 return "and"; 2084 case ac_atomic_or: 2085 return "or"; 2086 case ac_atomic_xor: 2087 return "xor"; 2088 case ac_atomic_inc_wrap: 2089 return "inc"; 2090 case ac_atomic_dec_wrap: 2091 return "dec"; 2092 case ac_atomic_fmin: 2093 return "fmin"; 2094 case ac_atomic_fmax: 2095 return "fmax"; 2096 } 2097 unreachable("bad atomic op"); 2098} 2099 2100LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a) 2101{ 2102 const char *overload[3] = {"", "", ""}; 2103 unsigned num_overloads = 0; 2104 LLVMValueRef args[18]; 2105 unsigned num_args = 0; 2106 enum ac_image_dim dim = a->dim; 2107 2108 assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero); 2109 assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip && 2110 a->opcode != ac_image_store_mip) || 2111 a->lod); 2112 assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 || 2113 (!a->compare && !a->offset)); 2114 assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 || 2115 a->opcode == ac_image_get_lod) || 2116 !a->bias); 2117 assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <= 2118 1); 2119 assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 
1 : 0) <= 1);
   assert(!a->d16 || (ctx->gfx_level >= GFX8 && a->opcode != ac_image_atomic &&
                      a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&
                      a->opcode != ac_image_get_resinfo));
   assert(!a->a16 || ctx->gfx_level >= GFX9);
   assert(a->g16 == a->a16 || ctx->gfx_level >= GFX10);

   assert(!a->offset ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->offset)) == 32);
   assert(!a->bias ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->bias)) == 32);
   assert(!a->compare ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->compare)) == 32);
   assert(!a->derivs[0] ||
          ((!a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 16) &&
           (a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 32)));
   assert(!a->coords[0] ||
          ((!a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 16) &&
           (a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 32)));
   assert(!a->lod ||
          ((a->opcode != ac_image_get_resinfo || ac_get_elem_bits(ctx, LLVMTypeOf(a->lod))) &&
           (a->opcode == ac_image_get_resinfo ||
            ac_get_elem_bits(ctx, LLVMTypeOf(a->lod)) ==
               ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])))));
   assert(!a->min_lod ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->min_lod)) ==
             ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])));

   if (a->opcode == ac_image_get_lod) {
      switch (dim) {
      case ac_image_1darray:
         dim = ac_image_1d;
         break;
      case ac_image_2darray:
      case ac_image_cube:
         dim = ac_image_2d;
         break;
      default:
         break;
      }
   }

   bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
                 a->opcode == ac_image_get_lod;
   bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;
   bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
               a->opcode == ac_image_load || a->opcode == ac_image_load_mip;
   LLVMTypeRef coord_type = sample ? (a->a16 ? ctx->f16 : ctx->f32) : (a->a16 ? ctx->i16 : ctx->i32);
   uint8_t dmask = a->dmask;
   LLVMTypeRef data_type;
   char data_type_str[32];

   if (atomic) {
      data_type = LLVMTypeOf(a->data[0]);
   } else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      /* Image stores might have been shrunk using the format. */
      data_type = LLVMTypeOf(a->data[0]);
      dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;
   } else {
      data_type = a->d16 ? ctx->v4f16 : ctx->v4f32;
   }

   if (a->tfe) {
      data_type = LLVMStructTypeInContext(
         ctx->context, (LLVMTypeRef[]){data_type, ctx->i32}, 2, false);
   }

   if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      args[num_args++] = a->data[0];
      if (a->opcode == ac_image_atomic_cmpswap)
         args[num_args++] = a->data[1];
   }

   if (!atomic)
      args[num_args++] = LLVMConstInt(ctx->i32, dmask, false);

   if (a->offset)
      args[num_args++] = ac_to_integer(ctx, a->offset);
   if (a->bias) {
      args[num_args++] = ac_to_float(ctx, a->bias);
      overload[num_overloads++] = ".f32";
   }
   if (a->compare)
      args[num_args++] = ac_to_float(ctx, a->compare);
   if (a->derivs[0]) {
      unsigned count = ac_num_derivs(dim);
      for (unsigned i = 0; i < count; ++i)
         args[num_args++] = ac_to_float(ctx, a->derivs[i]);
      overload[num_overloads++] = a->g16 ? ".f16" : ".f32";
   }
   unsigned num_coords = a->opcode != ac_image_get_resinfo ?
ac_num_coords(dim) : 0; 2210 for (unsigned i = 0; i < num_coords; ++i) 2211 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, ""); 2212 if (a->lod) 2213 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, ""); 2214 if (a->min_lod) 2215 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, ""); 2216 2217 overload[num_overloads++] = sample ? (a->a16 ? ".f16" : ".f32") : (a->a16 ? ".i16" : ".i32"); 2218 2219 args[num_args++] = a->resource; 2220 if (sample) { 2221 args[num_args++] = a->sampler; 2222 args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false); 2223 } 2224 2225 args[num_args++] = a->tfe ? ctx->i32_1 : ctx->i32_0; /* texfailctrl */ 2226 args[num_args++] = LLVMConstInt( 2227 ctx->i32, load ? get_load_cache_policy(ctx, a->cache_policy) : a->cache_policy, false); 2228 2229 const char *name; 2230 const char *atomic_subop = ""; 2231 switch (a->opcode) { 2232 case ac_image_sample: 2233 name = "sample"; 2234 break; 2235 case ac_image_gather4: 2236 name = "gather4"; 2237 break; 2238 case ac_image_load: 2239 name = "load"; 2240 break; 2241 case ac_image_load_mip: 2242 name = "load.mip"; 2243 break; 2244 case ac_image_store: 2245 name = "store"; 2246 break; 2247 case ac_image_store_mip: 2248 name = "store.mip"; 2249 break; 2250 case ac_image_atomic: 2251 name = "atomic."; 2252 atomic_subop = get_atomic_name(a->atomic); 2253 break; 2254 case ac_image_atomic_cmpswap: 2255 name = "atomic."; 2256 atomic_subop = "cmpswap"; 2257 break; 2258 case ac_image_get_lod: 2259 name = "getlod"; 2260 break; 2261 case ac_image_get_resinfo: 2262 name = "getresinfo"; 2263 break; 2264 default: 2265 unreachable("invalid image opcode"); 2266 } 2267 2268 const char *dimname; 2269 switch (dim) { 2270 case ac_image_1d: 2271 dimname = "1d"; 2272 break; 2273 case ac_image_2d: 2274 dimname = "2d"; 2275 break; 2276 case ac_image_3d: 2277 dimname = "3d"; 2278 break; 2279 case ac_image_cube: 2280 dimname = "cube"; 2281 break; 2282 case ac_image_1darray: 2283 dimname = "1darray"; 2284 break; 2285 case ac_image_2darray: 2286 dimname = "2darray"; 2287 break; 2288 case ac_image_2dmsaa: 2289 dimname = "2dmsaa"; 2290 break; 2291 case ac_image_2darraymsaa: 2292 dimname = "2darraymsaa"; 2293 break; 2294 default: 2295 unreachable("invalid dim"); 2296 } 2297 2298 ac_build_type_name_for_intr(data_type, data_type_str, sizeof(data_type_str)); 2299 2300 bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4); 2301 char intr_name[96]; 2302 snprintf(intr_name, sizeof(intr_name), 2303 "llvm.amdgcn.image.%s%s" /* base name */ 2304 "%s%s%s%s" /* sample/gather modifiers */ 2305 ".%s.%s%s%s%s", /* dimension and type overloads */ 2306 name, atomic_subop, a->compare ? ".c" : "", 2307 a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "", 2308 a->min_lod ? ".cl" : "", a->offset ? 
".o" : "", dimname, 2309 data_type_str, overload[0], overload[1], overload[2]); 2310 2311 LLVMTypeRef retty; 2312 if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) 2313 retty = ctx->voidt; 2314 else 2315 retty = data_type; 2316 2317 LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes); 2318 if (a->tfe) { 2319 LLVMValueRef texel = LLVMBuildExtractValue(ctx->builder, result, 0, ""); 2320 LLVMValueRef code = LLVMBuildExtractValue(ctx->builder, result, 1, ""); 2321 result = ac_build_concat(ctx, texel, ac_to_float(ctx, code)); 2322 } 2323 2324 if (!sample && !atomic && retty != ctx->voidt) 2325 result = ac_to_integer(ctx, result); 2326 2327 return result; 2328} 2329 2330LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc) 2331{ 2332 LLVMValueRef samples; 2333 2334 /* Read the samples from the descriptor directly. 2335 * Hardware doesn't have any instruction for this. 2336 */ 2337 samples = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 3, 0), ""); 2338 samples = LLVMBuildLShr(ctx->builder, samples, LLVMConstInt(ctx->i32, 16, 0), ""); 2339 samples = LLVMBuildAnd(ctx->builder, samples, LLVMConstInt(ctx->i32, 0xf, 0), ""); 2340 samples = LLVMBuildShl(ctx->builder, ctx->i32_1, samples, ""); 2341 return samples; 2342} 2343 2344LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2]) 2345{ 2346 return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, args, 2, 2347 AC_FUNC_ATTR_READNONE); 2348} 2349 2350LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2]) 2351{ 2352 LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args, 2, 2353 AC_FUNC_ATTR_READNONE); 2354 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); 2355} 2356 2357LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2]) 2358{ 2359 LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args, 2, 2360 AC_FUNC_ATTR_READNONE); 2361 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); 2362} 2363 2364LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx, 2365 LLVMValueRef args[2]) 2366{ 2367 LLVMTypeRef param_types[] = {ctx->f16, ctx->f16}; 2368 LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false); 2369 LLVMValueRef code = LLVMConstInlineAsm(calltype, 2370 ctx->gfx_level >= GFX11 ? 2371 "v_cvt_pk_norm_i16_f16 $0, $1, $2" : 2372 "v_cvt_pknorm_i16_f16 $0, $1, $2", 2373 "=v,v,v", false, false); 2374 return LLVMBuildCall2(ctx->builder, calltype, code, args, 2, ""); 2375} 2376 2377LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx, 2378 LLVMValueRef args[2]) 2379{ 2380 LLVMTypeRef param_types[] = {ctx->f16, ctx->f16}; 2381 LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false); 2382 LLVMValueRef code = LLVMConstInlineAsm(calltype, 2383 ctx->gfx_level >= GFX11 ? 2384 "v_cvt_pk_norm_u16_f16 $0, $1, $2" : 2385 "v_cvt_pknorm_u16_f16 $0, $1, $2", 2386 "=v,v,v", false, false); 2387 return LLVMBuildCall2(ctx->builder, calltype, code, args, 2, ""); 2388} 2389 2390/* The 8-bit and 10-bit clamping is for HW workarounds. */ 2391LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits, 2392 bool hi) 2393{ 2394 assert(bits == 8 || bits == 10 || bits == 16); 2395 2396 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 
511 : 32767, 0); 2397 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0); 2398 LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1; 2399 LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0); 2400 2401 /* Clamp. */ 2402 if (bits != 16) { 2403 for (int i = 0; i < 2; i++) { 2404 bool alpha = hi && i == 1; 2405 args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb); 2406 args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb); 2407 } 2408 } 2409 2410 LLVMValueRef res = 2411 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE); 2412 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); 2413} 2414 2415/* The 8-bit and 10-bit clamping is for HW workarounds. */ 2416LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits, 2417 bool hi) 2418{ 2419 assert(bits == 8 || bits == 10 || bits == 16); 2420 2421 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0); 2422 LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0); 2423 2424 /* Clamp. */ 2425 if (bits != 16) { 2426 for (int i = 0; i < 2; i++) { 2427 bool alpha = hi && i == 1; 2428 args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb); 2429 } 2430 } 2431 2432 LLVMValueRef res = 2433 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE); 2434 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); 2435} 2436 2437LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1) 2438{ 2439 return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, &i1, 1, AC_FUNC_ATTR_READNONE); 2440} 2441 2442void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1) 2443{ 2444 ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, &i1, 1, 0); 2445} 2446 2447LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset, 2448 LLVMValueRef width, bool is_signed) 2449{ 2450 LLVMValueRef args[] = { 2451 input, 2452 offset, 2453 width, 2454 }; 2455 2456 return ac_build_intrinsic(ctx, is_signed ? "llvm.amdgcn.sbfe.i32" : "llvm.amdgcn.ubfe.i32", 2457 ctx->i32, args, 3, AC_FUNC_ATTR_READNONE); 2458} 2459 2460LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1, 2461 LLVMValueRef s2) 2462{ 2463 return LLVMBuildAdd(ctx->builder, LLVMBuildMul(ctx->builder, s0, s1, ""), s2, ""); 2464} 2465 2466LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1, 2467 LLVMValueRef s2) 2468{ 2469 /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */ 2470 if (ctx->gfx_level >= GFX10) { 2471 return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, (LLVMValueRef[]){s0, s1, s2}, 3, 2472 AC_FUNC_ATTR_READNONE); 2473 } 2474 2475 return LLVMBuildFAdd(ctx->builder, LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, ""); 2476} 2477 2478void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags) 2479{ 2480 if (!wait_flags) 2481 return; 2482 2483 unsigned expcnt = 7; 2484 unsigned lgkmcnt = 63; 2485 unsigned vmcnt = ctx->gfx_level >= GFX9 ? 
63 : 15; 2486 unsigned vscnt = 63; 2487 2488 if (wait_flags & AC_WAIT_EXP) 2489 expcnt = 0; 2490 if (wait_flags & AC_WAIT_LGKM) 2491 lgkmcnt = 0; 2492 if (wait_flags & AC_WAIT_VLOAD) 2493 vmcnt = 0; 2494 2495 if (wait_flags & AC_WAIT_VSTORE) { 2496 if (ctx->gfx_level >= GFX10) 2497 vscnt = 0; 2498 else 2499 vmcnt = 0; 2500 } 2501 2502 /* There is no intrinsic for vscnt(0), so use a fence. */ 2503 if ((wait_flags & AC_WAIT_LGKM && wait_flags & AC_WAIT_VLOAD && wait_flags & AC_WAIT_VSTORE) || 2504 vscnt == 0) { 2505 assert(!(wait_flags & AC_WAIT_EXP)); 2506 LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, ""); 2507 return; 2508 } 2509 2510 unsigned simm16; 2511 2512 if (ctx->gfx_level >= GFX11) 2513 simm16 = expcnt | (lgkmcnt << 4) | (vmcnt << 10); 2514 else 2515 simm16 = (lgkmcnt << 8) | (expcnt << 4) | (vmcnt & 0xf) | ((vmcnt >> 4) << 14); 2516 2517 LLVMValueRef args[1] = { 2518 LLVMConstInt(ctx->i32, simm16, false), 2519 }; 2520 ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0); 2521} 2522 2523LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src, 2524 LLVMTypeRef type) 2525{ 2526 unsigned bitsize = ac_get_elem_bits(ctx, type); 2527 LLVMValueRef zero = LLVMConstReal(type, 0.0); 2528 LLVMValueRef one = LLVMConstReal(type, 1.0); 2529 LLVMValueRef result; 2530 2531 if (bitsize == 64 || (bitsize == 16 && ctx->gfx_level <= GFX8) || type == ctx->v2f16) { 2532 /* Use fmin/fmax for 64-bit fsat or 16-bit on GFX6-GFX8 because LLVM 2533 * doesn't expose an intrinsic. 2534 */ 2535 result = ac_build_fmin(ctx, ac_build_fmax(ctx, src, zero), one); 2536 } else { 2537 LLVMTypeRef type; 2538 char *intr; 2539 2540 if (bitsize == 16) { 2541 intr = "llvm.amdgcn.fmed3.f16"; 2542 type = ctx->f16; 2543 } else { 2544 assert(bitsize == 32); 2545 intr = "llvm.amdgcn.fmed3.f32"; 2546 type = ctx->f32; 2547 } 2548 2549 LLVMValueRef params[] = { 2550 zero, 2551 one, 2552 src, 2553 }; 2554 2555 result = ac_build_intrinsic(ctx, intr, type, params, 3, 2556 AC_FUNC_ATTR_READNONE); 2557 } 2558 2559 if (ctx->gfx_level < GFX9 && bitsize == 32) { 2560 /* Only pre-GFX9 chips do not flush denorms. 
*/ 2561 result = ac_build_canonicalize(ctx, result, bitsize); 2562 } 2563 2564 return result; 2565} 2566 2567LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize) 2568{ 2569 LLVMTypeRef type; 2570 char *intr; 2571 2572 if (bitsize == 16) { 2573 intr = "llvm.amdgcn.fract.f16"; 2574 type = ctx->f16; 2575 } else if (bitsize == 32) { 2576 intr = "llvm.amdgcn.fract.f32"; 2577 type = ctx->f32; 2578 } else { 2579 intr = "llvm.amdgcn.fract.f64"; 2580 type = ctx->f64; 2581 } 2582 2583 LLVMValueRef params[] = { 2584 src0, 2585 }; 2586 return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE); 2587} 2588 2589LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value) 2590{ 2591 2592 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { 2593 LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0); 2594 unsigned vec_size = LLVMGetVectorSize(type); 2595 LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef)); 2596 2597 for (unsigned i = 0; i < vec_size; i++) 2598 scalars[i] = scalar; 2599 return LLVMConstVector(scalars, vec_size); 2600 } 2601 return LLVMConstInt(type, value, 0); 2602} 2603 2604LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0) 2605{ 2606 LLVMTypeRef type = LLVMTypeOf(src0); 2607 LLVMValueRef val; 2608 2609 /* v_med3 is selected only when max is first. (LLVM bug?) */ 2610 val = ac_build_imax(ctx, src0, ac_const_uint_vec(ctx, type, -1)); 2611 return ac_build_imin(ctx, val, ac_const_uint_vec(ctx, type, 1)); 2612} 2613 2614static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val) 2615{ 2616 ac_enable_signed_zeros(ctx); 2617 /* (val + 0) converts negative zero to positive zero. */ 2618 val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), ""); 2619 ac_disable_signed_zeros(ctx); 2620 return val; 2621} 2622 2623LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src) 2624{ 2625 LLVMTypeRef type = LLVMTypeOf(src); 2626 LLVMValueRef pos, neg, dw[2], val; 2627 unsigned bitsize = ac_get_elem_bits(ctx, type); 2628 2629 /* The standard version leads to this: 2630 * v_cmp_ngt_f32_e64 s[0:1], s4, 0 ; D40B0000 00010004 2631 * v_cndmask_b32_e64 v4, 1.0, s4, s[0:1] ; D5010004 000008F2 2632 * v_cmp_le_f32_e32 vcc, 0, v4 ; 7C060880 2633 * v_cndmask_b32_e32 v4, -1.0, v4, vcc ; 020808F3 2634 * 2635 * The isign version: 2636 * v_add_f32_e64 v4, s4, 0 ; D5030004 00010004 2637 * v_med3_i32 v4, v4, -1, 1 ; D5580004 02058304 2638 * v_cvt_f32_i32_e32 v4, v4 ; 7E080B04 2639 * 2640 * (src0 + 0) converts negative zero to positive zero. 2641 * After that, int(fsign(x)) == isign(floatBitsToInt(x)). 2642 * 2643 * For FP64, use the standard version, which doesn't suffer from the huge DP rate 2644 * reduction. 
(FP64 comparisons are as fast as int64 comparisons)
 */
   if (bitsize == 16 || bitsize == 32) {
      val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
      val = ac_build_isign(ctx, val);
      return LLVMBuildSIToFP(ctx->builder, val, type, "");
   }

   assert(bitsize == 64);
   pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
   neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");
   dw[0] = ctx->i32_0;
   dw[1] = LLVMBuildSelect(
      ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0),
      LLVMBuildSelect(ctx->builder, neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, ""),
      "");
   return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, "");
}

LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMValueRef result;
   unsigned bitsize;

   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));

   switch (bitsize) {
   case 128:
      result = ac_build_intrinsic(ctx, "llvm.ctpop.i128", ctx->i128, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);
      result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
      break;
   case 64:
      result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);
      result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
      break;
   case 32:
      result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);
      break;
   case 16:
      result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);
      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
      break;
   case 8:
      result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);
      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
      break;
   default:
      unreachable("invalid bitsize");
      break;
   }

   return result;
}

LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMValueRef result;
   unsigned bitsize;

   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));

   switch (bitsize) {
   case 64:
      result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);
      result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
      break;
   case 32:
      result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);
      break;
   case 16:
      result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);
      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
      break;
   case 8:
      result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);
      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
      break;
   default:
      unreachable("invalid bitsize");
      break;
   }

   return result;
}

void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
{
   LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
   ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1,
                      AC_FUNC_ATTR_CONVERGENT);
}

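/* Usage sketch (illustrative only, not from a specific caller): once the LDS
 * has been declared as a pointer by ac_declare_lds_as_pointer() below, dword
 * slots can be accessed through ac_lds_load()/ac_lds_store():
 *
 *    ac_declare_lds_as_pointer(ctx);
 *    LLVMValueRef dw_addr = LLVMConstInt(ctx->i32, 4, false);
 *    ac_lds_store(ctx, dw_addr, value);          // "value" is any hypothetical
 *    LLVMValueRef v = ac_lds_load(ctx, dw_addr); // 32-bit LLVMValueRef
 */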
void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
{
   unsigned lds_size = ctx->gfx_level >= GFX7 ? 65536 : 32768;
   ctx->lds = LLVMBuildIntToPtr(
      ctx->builder, ctx->i32_0,
      LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), "lds");
}

LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr)
{
   return LLVMBuildLoad2(ctx->builder, ctx->i32, ac_build_gep0(ctx, ctx->lds, dw_addr), "");
}

void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value)
{
   value = ac_to_integer(ctx, value);
   ac_build_indexed_store(ctx, ctx->lds, dw_addr, value);
}

LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)
{
   unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intrin_name;
   LLVMTypeRef type;
   LLVMValueRef zero;

   switch (src0_bitsize) {
   case 64:
      intrin_name = "llvm.cttz.i64";
      type = ctx->i64;
      zero = ctx->i64_0;
      break;
   case 32:
      intrin_name = "llvm.cttz.i32";
      type = ctx->i32;
      zero = ctx->i32_0;
      break;
   case 16:
      intrin_name = "llvm.cttz.i16";
      type = ctx->i16;
      zero = ctx->i16_0;
      break;
   case 8:
      intrin_name = "llvm.cttz.i8";
      type = ctx->i8;
      zero = ctx->i8_0;
      break;
   default:
      unreachable("invalid bitsize");
   }

   LLVMValueRef params[2] = {
      src0,

      /* The value of 1 means that ffs(x=0) is undefined, so LLVM won't
       * add special code to check for x=0. The reason is that
       * the LLVM behavior for x=0 is different from what we
       * need here. However, LLVM also assumes that ffs(x) is
       * in [0, 31], but GLSL expects that ffs(0) = -1, so
       * a conditional assignment to handle 0 is still required.
       *
       * The hardware already implements the correct behavior.
       */
      ctx->i1true,
   };

   LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);

   if (src0_bitsize == 64) {
      lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
   } else if (src0_bitsize < 32) {
      lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
   }

   /* TODO: We need an intrinsic to skip this conditional.
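    * Worked example (illustrative): llvm.cttz(0x00000008) = 3, which is
    * already the ffs()-style result; only src0 == 0 needs the select below
    * to produce the -1 that GLSL expects.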
*/ 2826 /* Check for zero: */ 2827 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, zero, ""), 2828 LLVMConstInt(ctx->i32, -1, 0), lsb, ""); 2829} 2830 2831LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type) 2832{ 2833 return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST); 2834} 2835 2836LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type) 2837{ 2838 return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT); 2839} 2840 2841static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx) 2842{ 2843 if (ctx->flow->depth > 0) 2844 return &ctx->flow->stack[ctx->flow->depth - 1]; 2845 return NULL; 2846} 2847 2848static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx) 2849{ 2850 for (unsigned i = ctx->flow->depth; i > 0; --i) { 2851 if (ctx->flow->stack[i - 1].loop_entry_block) 2852 return &ctx->flow->stack[i - 1]; 2853 } 2854 return NULL; 2855} 2856 2857static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx) 2858{ 2859 struct ac_llvm_flow *flow; 2860 2861 if (ctx->flow->depth >= ctx->flow->depth_max) { 2862 unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH); 2863 2864 ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack)); 2865 ctx->flow->depth_max = new_max; 2866 } 2867 2868 flow = &ctx->flow->stack[ctx->flow->depth]; 2869 ctx->flow->depth++; 2870 2871 flow->next_block = NULL; 2872 flow->loop_entry_block = NULL; 2873 return flow; 2874} 2875 2876static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id) 2877{ 2878 char buf[32]; 2879 snprintf(buf, sizeof(buf), "%s%d", base, label_id); 2880 LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf); 2881} 2882 2883/* Append a basic block at the level of the parent flow. 2884 */ 2885static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name) 2886{ 2887 assert(ctx->flow->depth >= 1); 2888 2889 if (ctx->flow->depth >= 2) { 2890 struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2]; 2891 2892 return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name); 2893 } 2894 2895 LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder)); 2896 return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name); 2897} 2898 2899/* Emit a branch to the given default target for the current block if 2900 * applicable -- that is, if the current block does not already contain a 2901 * branch from a break or continue. 
2902 */ 2903static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target) 2904{ 2905 if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder))) 2906 LLVMBuildBr(builder, target); 2907} 2908 2909void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id) 2910{ 2911 struct ac_llvm_flow *flow = push_flow(ctx); 2912 flow->loop_entry_block = append_basic_block(ctx, "LOOP"); 2913 flow->next_block = append_basic_block(ctx, "ENDLOOP"); 2914 set_basicblock_name(flow->loop_entry_block, "loop", label_id); 2915 LLVMBuildBr(ctx->builder, flow->loop_entry_block); 2916 LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block); 2917} 2918 2919void ac_build_break(struct ac_llvm_context *ctx) 2920{ 2921 struct ac_llvm_flow *flow = get_innermost_loop(ctx); 2922 LLVMBuildBr(ctx->builder, flow->next_block); 2923} 2924 2925void ac_build_continue(struct ac_llvm_context *ctx) 2926{ 2927 struct ac_llvm_flow *flow = get_innermost_loop(ctx); 2928 LLVMBuildBr(ctx->builder, flow->loop_entry_block); 2929} 2930 2931void ac_build_else(struct ac_llvm_context *ctx, int label_id) 2932{ 2933 struct ac_llvm_flow *current_branch = get_current_flow(ctx); 2934 LLVMBasicBlockRef endif_block; 2935 2936 assert(!current_branch->loop_entry_block); 2937 2938 endif_block = append_basic_block(ctx, "ENDIF"); 2939 emit_default_branch(ctx->builder, endif_block); 2940 2941 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block); 2942 set_basicblock_name(current_branch->next_block, "else", label_id); 2943 2944 current_branch->next_block = endif_block; 2945} 2946 2947/* Invoked after a branch is exited. */ 2948static void ac_branch_exited(struct ac_llvm_context *ctx) 2949{ 2950 if (ctx->flow->depth == 0 && ctx->conditional_demote_seen) { 2951 /* The previous conditional branch contained demote. Kill threads 2952 * after all conditional blocks because amdgcn.wqm.vote doesn't 2953 * return usable values inside the blocks. 2954 * 2955 * This is an optional optimization that only kills whole inactive quads. 
2956 */ 2957 LLVMValueRef cond = LLVMBuildLoad2(ctx->builder, ctx->i1, ctx->postponed_kill, ""); 2958 ac_build_kill_if_false(ctx, ac_build_wqm_vote(ctx, cond)); 2959 ctx->conditional_demote_seen = false; 2960 } 2961} 2962 2963void ac_build_endif(struct ac_llvm_context *ctx, int label_id) 2964{ 2965 struct ac_llvm_flow *current_branch = get_current_flow(ctx); 2966 2967 assert(!current_branch->loop_entry_block); 2968 2969 emit_default_branch(ctx->builder, current_branch->next_block); 2970 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block); 2971 set_basicblock_name(current_branch->next_block, "endif", label_id); 2972 2973 ctx->flow->depth--; 2974 ac_branch_exited(ctx); 2975} 2976 2977void ac_build_endloop(struct ac_llvm_context *ctx, int label_id) 2978{ 2979 struct ac_llvm_flow *current_loop = get_current_flow(ctx); 2980 2981 assert(current_loop->loop_entry_block); 2982 2983 emit_default_branch(ctx->builder, current_loop->loop_entry_block); 2984 2985 LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block); 2986 set_basicblock_name(current_loop->next_block, "endloop", label_id); 2987 ctx->flow->depth--; 2988 ac_branch_exited(ctx); 2989} 2990 2991void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id) 2992{ 2993 struct ac_llvm_flow *flow = push_flow(ctx); 2994 LLVMBasicBlockRef if_block; 2995 2996 if_block = append_basic_block(ctx, "IF"); 2997 flow->next_block = append_basic_block(ctx, "ELSE"); 2998 set_basicblock_name(if_block, "if", label_id); 2999 LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block); 3000 LLVMPositionBuilderAtEnd(ctx->builder, if_block); 3001} 3002 3003LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name) 3004{ 3005 LLVMBuilderRef builder = ac->builder; 3006 LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder); 3007 LLVMValueRef function = LLVMGetBasicBlockParent(current_block); 3008 LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function); 3009 LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block); 3010 LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context); 3011 LLVMValueRef res; 3012 3013 if (first_instr) { 3014 LLVMPositionBuilderBefore(first_builder, first_instr); 3015 } else { 3016 LLVMPositionBuilderAtEnd(first_builder, first_block); 3017 } 3018 3019 res = LLVMBuildAlloca(first_builder, type, name); 3020 LLVMDisposeBuilder(first_builder); 3021 return res; 3022} 3023 3024LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name) 3025{ 3026 LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name); 3027 LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr); 3028 return ptr; 3029} 3030 3031LLVMValueRef ac_build_alloca_init(struct ac_llvm_context *ac, LLVMValueRef val, const char *name) 3032{ 3033 LLVMValueRef ptr = ac_build_alloca_undef(ac, LLVMTypeOf(val), name); 3034 LLVMBuildStore(ac->builder, val, ptr); 3035 return ptr; 3036} 3037 3038LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMTypeRef type) 3039{ 3040 int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); 3041 return LLVMBuildBitCast(ctx->builder, ptr, LLVMPointerType(type, addr_space), ""); 3042} 3043 3044LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count) 3045{ 3046 unsigned num_components = ac_get_llvm_num_components(value); 3047 if (count == num_components) 3048 return value; 3049 3050 LLVMValueRef *const masks = alloca(MAX2(count, 2) * 
sizeof(LLVMValueRef)); 3051 masks[0] = ctx->i32_0; 3052 masks[1] = ctx->i32_1; 3053 for (unsigned i = 2; i < count; i++) 3054 masks[i] = LLVMConstInt(ctx->i32, i, false); 3055 3056 if (count == 1) 3057 return LLVMBuildExtractElement(ctx->builder, value, masks[0], ""); 3058 3059 LLVMValueRef swizzle = LLVMConstVector(masks, count); 3060 return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, ""); 3061} 3062 3063/* If param is i64 and bitwidth <= 32, the return value will be i32. */ 3064LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift, 3065 unsigned bitwidth) 3066{ 3067 LLVMValueRef value = param; 3068 if (rshift) 3069 value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), rshift, false), ""); 3070 3071 if (rshift + bitwidth < 32) { 3072 uint64_t mask = (1ull << bitwidth) - 1; 3073 value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), mask, false), ""); 3074 } 3075 3076 if (bitwidth <= 32 && LLVMTypeOf(param) == ctx->i64) 3077 value = LLVMBuildTrunc(ctx->builder, value, ctx->i32, ""); 3078 return value; 3079} 3080 3081/* Adjust the sample index according to FMASK. 3082 * 3083 * For uncompressed MSAA surfaces, FMASK should return 0x76543210, 3084 * which is the identity mapping. Each nibble says which physical sample 3085 * should be fetched to get that sample. 3086 * 3087 * For example, 0x11111100 means there are only 2 samples stored and 3088 * the second sample covers 3/4 of the pixel. When reading samples 0 3089 * and 1, return physical sample 0 (determined by the first two 0s 3090 * in FMASK), otherwise return physical sample 1. 3091 * 3092 * The sample index should be adjusted as follows: 3093 * addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF; 3094 */ 3095void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, LLVMValueRef *addr, 3096 bool is_array_tex) 3097{ 3098 struct ac_image_args fmask_load = {0}; 3099 fmask_load.opcode = ac_image_load; 3100 fmask_load.resource = fmask; 3101 fmask_load.dmask = 0xf; 3102 fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d; 3103 fmask_load.attributes = AC_FUNC_ATTR_READNONE; 3104 3105 fmask_load.coords[0] = addr[0]; 3106 fmask_load.coords[1] = addr[1]; 3107 if (is_array_tex) 3108 fmask_load.coords[2] = addr[2]; 3109 fmask_load.a16 = ac_get_elem_bits(ac, LLVMTypeOf(addr[0])) == 16; 3110 3111 LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load); 3112 fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, ac->i32_0, ""); 3113 3114 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK 3115 * resource descriptor is 0 (invalid). 3116 */ 3117 LLVMValueRef tmp; 3118 tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, ""); 3119 tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, ""); 3120 tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, ""); 3121 fmask_value = 3122 LLVMBuildSelect(ac->builder, tmp, fmask_value, LLVMConstInt(ac->i32, 0x76543210, false), ""); 3123 3124 /* Apply the formula. */ 3125 unsigned sample_chan = is_array_tex ? 3 : 2; 3126 LLVMValueRef final_sample; 3127 final_sample = LLVMBuildMul(ac->builder, addr[sample_chan], 3128 LLVMConstInt(LLVMTypeOf(addr[0]), 4, 0), ""); 3129 final_sample = LLVMBuildLShr(ac->builder, fmask_value, 3130 LLVMBuildZExt(ac->builder, final_sample, ac->i32, ""), ""); 3131 /* Mask the sample index by 0x7, because 0x8 means an unknown value 3132 * with EQAA, so those will map to 0. 
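    * Worked example (illustrative): with fmask = 0x11111100 as above,
    * reading sample 2 yields (0x11111100 >> (2 * 4)) & 0x7 = 1.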
    */
   addr[sample_chan] = LLVMBuildAnd(ac->builder, final_sample, LLVMConstInt(ac->i32, 0x7, 0), "");
   if (fmask_load.a16)
      addr[sample_chan] = LLVMBuildTrunc(ac->builder, addr[sample_chan], ac->i16, "");
}

static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
                                       LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef result;

   if (with_opt_barrier)
      ac_build_optimization_barrier(ctx, &src, false);

   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
   if (lane)
      lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, "");

   result =
      ac_build_intrinsic(ctx, lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
                         ctx->i32, (LLVMValueRef[]){src, lane}, lane == NULL ? 1 : 2,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, result, type, "");
}

static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src,
                                             LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef ret;

   if (bits > 32) {
      assert(bits % 32 == 0);
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      ret = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < bits / 32; i++) {
         LLVMValueRef ret_comp;

         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");

         ret_comp = _ac_build_readlane(ctx, src, lane, with_opt_barrier);

         ret =
            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
      }
   } else {
      ret = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
   }

   if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
      return LLVMBuildIntToPtr(ctx->builder, ret, src_type, "");
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}

/**
 * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
 *
 * The optimization barrier is not needed if the value is the same in all lanes
 * or if this is called in the outermost block.
3196 * 3197 * @param ctx 3198 * @param src 3199 * @param lane - id of the lane or NULL for the first active lane 3200 * @return value of the lane 3201 */ 3202LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, LLVMValueRef src, 3203 LLVMValueRef lane) 3204{ 3205 return ac_build_readlane_common(ctx, src, lane, false); 3206} 3207 3208LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane) 3209{ 3210 return ac_build_readlane_common(ctx, src, lane, true); 3211} 3212 3213LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, 3214 LLVMValueRef lane) 3215{ 3216 return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32, 3217 (LLVMValueRef[]){value, lane, src}, 3, 3218 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); 3219} 3220 3221LLVMValueRef ac_build_mbcnt_add(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef add_src) 3222{ 3223 LLVMValueRef val; 3224 3225 if (ctx->wave_size == 32) { 3226 val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, 3227 (LLVMValueRef[]){mask, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE); 3228 } else { 3229 LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, ""); 3230 LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_0, ""); 3231 LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_1, ""); 3232 val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, 3233 (LLVMValueRef[]){mask_lo, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE); 3234 val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, (LLVMValueRef[]){mask_hi, val}, 3235 2, AC_FUNC_ATTR_READNONE); 3236 } 3237 3238 /* Bug workaround. LLVM always believes the upper bound of mbcnt to be the wave size, 3239 * regardless of ac_set_range_metadata. Use an extra add instruction to work around it. 
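 *
 * For reference, with add_src == 0 the result for lane i is
 * popcount(mask & ((1ull << i) - 1)); e.g. lane 5 with mask 0b101011
 * gets 3, since three of the five lower lanes are set in the mask.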
3240 */ 3241 if (add_src != NULL && add_src != ctx->i32_0) { 3242 return LLVMBuildAdd(ctx->builder, val, add_src, ""); 3243 } 3244 3245 return val; 3246} 3247 3248LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask) 3249{ 3250 return ac_build_mbcnt_add(ctx, mask, ctx->i32_0); 3251} 3252 3253enum dpp_ctrl 3254{ 3255 _dpp_quad_perm = 0x000, 3256 _dpp_row_sl = 0x100, 3257 _dpp_row_sr = 0x110, 3258 _dpp_row_rr = 0x120, 3259 dpp_wf_sl1 = 0x130, 3260 dpp_wf_rl1 = 0x134, 3261 dpp_wf_sr1 = 0x138, 3262 dpp_wf_rr1 = 0x13C, 3263 dpp_row_mirror = 0x140, 3264 dpp_row_half_mirror = 0x141, 3265 dpp_row_bcast15 = 0x142, 3266 dpp_row_bcast31 = 0x143 3267}; 3268 3269static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, 3270 unsigned lane3) 3271{ 3272 assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4); 3273 return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6); 3274} 3275 3276static inline enum dpp_ctrl dpp_row_sr(unsigned amount) 3277{ 3278 assert(amount > 0 && amount < 16); 3279 return _dpp_row_sr | amount; 3280} 3281 3282static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, 3283 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask, 3284 bool bound_ctrl) 3285{ 3286 LLVMTypeRef type = LLVMTypeOf(src); 3287 LLVMValueRef res; 3288 3289 old = LLVMBuildZExt(ctx->builder, old, ctx->i32, ""); 3290 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); 3291 3292 res = ac_build_intrinsic( 3293 ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32, 3294 (LLVMValueRef[]){old, src, LLVMConstInt(ctx->i32, dpp_ctrl, 0), 3295 LLVMConstInt(ctx->i32, row_mask, 0), LLVMConstInt(ctx->i32, bank_mask, 0), 3296 LLVMConstInt(ctx->i1, bound_ctrl, 0)}, 3297 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); 3298 3299 return LLVMBuildTrunc(ctx->builder, res, type, ""); 3300} 3301 3302static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, 3303 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask, 3304 bool bound_ctrl) 3305{ 3306 LLVMTypeRef src_type = LLVMTypeOf(src); 3307 src = ac_to_integer(ctx, src); 3308 old = ac_to_integer(ctx, old); 3309 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); 3310 LLVMValueRef ret; 3311 if (bits > 32) { 3312 assert(bits % 32 == 0); 3313 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); 3314 LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, ""); 3315 LLVMValueRef old_vector = LLVMBuildBitCast(ctx->builder, old, vec_type, ""); 3316 ret = LLVMGetUndef(vec_type); 3317 for (unsigned i = 0; i < bits / 32; i++) { 3318 src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), ""); 3319 old = LLVMBuildExtractElement(ctx->builder, old_vector, LLVMConstInt(ctx->i32, i, 0), ""); 3320 LLVMValueRef ret_comp = 3321 _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl); 3322 ret = 3323 LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), ""); 3324 } 3325 } else { 3326 ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl); 3327 } 3328 return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); 3329} 3330 3331static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, 3332 uint64_t sel, bool exchange_rows, bool bound_ctrl) 3333{ 3334 LLVMTypeRef type = LLVMTypeOf(src); 3335 LLVMValueRef result; 3336 3337 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); 3338 3339 LLVMValueRef 
args[6] = { 3340 src, 3341 src, 3342 LLVMConstInt(ctx->i32, sel, false), 3343 LLVMConstInt(ctx->i32, sel >> 32, false), 3344 ctx->i1true, /* fi */ 3345 bound_ctrl ? ctx->i1true : ctx->i1false, 3346 }; 3347 3348 result = 3349 ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16", 3350 ctx->i32, args, 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); 3351 3352 return LLVMBuildTrunc(ctx->builder, result, type, ""); 3353} 3354 3355static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel, 3356 bool exchange_rows, bool bound_ctrl) 3357{ 3358 LLVMTypeRef src_type = LLVMTypeOf(src); 3359 src = ac_to_integer(ctx, src); 3360 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); 3361 LLVMValueRef ret; 3362 if (bits > 32) { 3363 assert(bits % 32 == 0); 3364 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); 3365 LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, ""); 3366 ret = LLVMGetUndef(vec_type); 3367 for (unsigned i = 0; i < bits / 32; i++) { 3368 src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), ""); 3369 LLVMValueRef ret_comp = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl); 3370 ret = 3371 LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), ""); 3372 } 3373 } else { 3374 ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl); 3375 } 3376 return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); 3377} 3378 3379static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask) 3380{ 3381 assert(and_mask < 32 && or_mask < 32 && xor_mask < 32); 3382 return and_mask | (or_mask << 5) | (xor_mask << 10); 3383} 3384 3385static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, 3386 unsigned mask) 3387{ 3388 LLVMTypeRef src_type = LLVMTypeOf(src); 3389 LLVMValueRef ret; 3390 3391 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); 3392 3393 ret = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32, 3394 (LLVMValueRef[]){src, LLVMConstInt(ctx->i32, mask, 0)}, 2, 3395 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); 3396 3397 return LLVMBuildTrunc(ctx->builder, ret, src_type, ""); 3398} 3399 3400LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask) 3401{ 3402 LLVMTypeRef src_type = LLVMTypeOf(src); 3403 src = ac_to_integer(ctx, src); 3404 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); 3405 LLVMValueRef ret; 3406 if (bits > 32) { 3407 assert(bits % 32 == 0); 3408 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); 3409 LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, ""); 3410 ret = LLVMGetUndef(vec_type); 3411 for (unsigned i = 0; i < bits / 32; i++) { 3412 src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), ""); 3413 LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src, mask); 3414 ret = 3415 LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), ""); 3416 } 3417 } else { 3418 ret = _ac_build_ds_swizzle(ctx, src, mask); 3419 } 3420 return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); 3421} 3422 3423static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src) 3424{ 3425 LLVMTypeRef src_type = LLVMTypeOf(src); 3426 unsigned bitsize = ac_get_elem_bits(ctx, src_type); 3427 char name[32], type[8]; 3428 LLVMValueRef ret; 3429 3430 src = ac_to_integer(ctx, src); 3431 
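   /* llvm.amdgcn.wwm returns its operand, but marks the computation feeding
    * it as whole-wave, so the result is also defined in lanes that are
    * inactive in exact mode. Sub-dword sources are widened to i32 first and
    * the intrinsic name below is derived from the widened type. */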
3432 if (bitsize < 32) 3433 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); 3434 3435 ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type)); 3436 snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type); 3437 ret = ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src}, 1, 3438 AC_FUNC_ATTR_READNONE); 3439 3440 if (bitsize < 32) 3441 ret = LLVMBuildTrunc(ctx->builder, ret, ac_to_integer_type(ctx, src_type), ""); 3442 3443 return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); 3444} 3445 3446static LLVMValueRef ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src, 3447 LLVMValueRef inactive) 3448{ 3449 char name[33], type[8]; 3450 LLVMTypeRef src_type = LLVMTypeOf(src); 3451 unsigned bitsize = ac_get_elem_bits(ctx, src_type); 3452 src = ac_to_integer(ctx, src); 3453 inactive = ac_to_integer(ctx, inactive); 3454 3455 if (bitsize < 32) { 3456 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); 3457 inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, ""); 3458 } 3459 3460 ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type)); 3461 snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type); 3462 LLVMValueRef ret = 3463 ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src, inactive}, 2, 3464 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); 3465 if (bitsize < 32) 3466 ret = LLVMBuildTrunc(ctx->builder, ret, src_type, ""); 3467 3468 return ret; 3469} 3470 3471static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, 3472 unsigned type_size) 3473{ 3474 3475 if (type_size == 0) { 3476 switch (op) { 3477 case nir_op_ior: 3478 case nir_op_ixor: 3479 return LLVMConstInt(ctx->i1, 0, 0); 3480 case nir_op_iand: 3481 return LLVMConstInt(ctx->i1, 1, 0); 3482 default: 3483 unreachable("bad reduction intrinsic"); 3484 } 3485 } else if (type_size == 1) { 3486 switch (op) { 3487 case nir_op_iadd: 3488 return ctx->i8_0; 3489 case nir_op_imul: 3490 return ctx->i8_1; 3491 case nir_op_imin: 3492 return LLVMConstInt(ctx->i8, INT8_MAX, 0); 3493 case nir_op_umin: 3494 return LLVMConstInt(ctx->i8, UINT8_MAX, 0); 3495 case nir_op_imax: 3496 return LLVMConstInt(ctx->i8, INT8_MIN, 0); 3497 case nir_op_umax: 3498 return ctx->i8_0; 3499 case nir_op_iand: 3500 return LLVMConstInt(ctx->i8, -1, 0); 3501 case nir_op_ior: 3502 return ctx->i8_0; 3503 case nir_op_ixor: 3504 return ctx->i8_0; 3505 default: 3506 unreachable("bad reduction intrinsic"); 3507 } 3508 } else if (type_size == 2) { 3509 switch (op) { 3510 case nir_op_iadd: 3511 return ctx->i16_0; 3512 case nir_op_fadd: 3513 return ctx->f16_0; 3514 case nir_op_imul: 3515 return ctx->i16_1; 3516 case nir_op_fmul: 3517 return ctx->f16_1; 3518 case nir_op_imin: 3519 return LLVMConstInt(ctx->i16, INT16_MAX, 0); 3520 case nir_op_umin: 3521 return LLVMConstInt(ctx->i16, UINT16_MAX, 0); 3522 case nir_op_fmin: 3523 return LLVMConstReal(ctx->f16, INFINITY); 3524 case nir_op_imax: 3525 return LLVMConstInt(ctx->i16, INT16_MIN, 0); 3526 case nir_op_umax: 3527 return ctx->i16_0; 3528 case nir_op_fmax: 3529 return LLVMConstReal(ctx->f16, -INFINITY); 3530 case nir_op_iand: 3531 return LLVMConstInt(ctx->i16, -1, 0); 3532 case nir_op_ior: 3533 return ctx->i16_0; 3534 case nir_op_ixor: 3535 return ctx->i16_0; 3536 default: 3537 unreachable("bad reduction intrinsic"); 3538 } 3539 } else if (type_size == 4) { 3540 switch (op) { 3541 case nir_op_iadd: 3542 return ctx->i32_0; 3543 case nir_op_fadd: 3544 return ctx->f32_0; 3545 case nir_op_imul: 3546 return ctx->i32_1; 3547 case 
nir_op_fmul: 3548 return ctx->f32_1; 3549 case nir_op_imin: 3550 return LLVMConstInt(ctx->i32, INT32_MAX, 0); 3551 case nir_op_umin: 3552 return LLVMConstInt(ctx->i32, UINT32_MAX, 0); 3553 case nir_op_fmin: 3554 return LLVMConstReal(ctx->f32, INFINITY); 3555 case nir_op_imax: 3556 return LLVMConstInt(ctx->i32, INT32_MIN, 0); 3557 case nir_op_umax: 3558 return ctx->i32_0; 3559 case nir_op_fmax: 3560 return LLVMConstReal(ctx->f32, -INFINITY); 3561 case nir_op_iand: 3562 return LLVMConstInt(ctx->i32, -1, 0); 3563 case nir_op_ior: 3564 return ctx->i32_0; 3565 case nir_op_ixor: 3566 return ctx->i32_0; 3567 default: 3568 unreachable("bad reduction intrinsic"); 3569 } 3570 } else { /* type_size == 64bit */ 3571 switch (op) { 3572 case nir_op_iadd: 3573 return ctx->i64_0; 3574 case nir_op_fadd: 3575 return ctx->f64_0; 3576 case nir_op_imul: 3577 return ctx->i64_1; 3578 case nir_op_fmul: 3579 return ctx->f64_1; 3580 case nir_op_imin: 3581 return LLVMConstInt(ctx->i64, INT64_MAX, 0); 3582 case nir_op_umin: 3583 return LLVMConstInt(ctx->i64, UINT64_MAX, 0); 3584 case nir_op_fmin: 3585 return LLVMConstReal(ctx->f64, INFINITY); 3586 case nir_op_imax: 3587 return LLVMConstInt(ctx->i64, INT64_MIN, 0); 3588 case nir_op_umax: 3589 return ctx->i64_0; 3590 case nir_op_fmax: 3591 return LLVMConstReal(ctx->f64, -INFINITY); 3592 case nir_op_iand: 3593 return LLVMConstInt(ctx->i64, -1, 0); 3594 case nir_op_ior: 3595 return ctx->i64_0; 3596 case nir_op_ixor: 3597 return ctx->i64_0; 3598 default: 3599 unreachable("bad reduction intrinsic"); 3600 } 3601 } 3602} 3603 3604static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, 3605 nir_op op) 3606{ 3607 bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8; 3608 bool _32bit = ac_get_type_size(LLVMTypeOf(lhs)) == 4; 3609 switch (op) { 3610 case nir_op_iadd: 3611 return LLVMBuildAdd(ctx->builder, lhs, rhs, ""); 3612 case nir_op_fadd: 3613 return LLVMBuildFAdd(ctx->builder, lhs, rhs, ""); 3614 case nir_op_imul: 3615 return LLVMBuildMul(ctx->builder, lhs, rhs, ""); 3616 case nir_op_fmul: 3617 return LLVMBuildFMul(ctx->builder, lhs, rhs, ""); 3618 case nir_op_imin: 3619 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""), 3620 lhs, rhs, ""); 3621 case nir_op_umin: 3622 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""), 3623 lhs, rhs, ""); 3624 case nir_op_fmin: 3625 return ac_build_intrinsic( 3626 ctx, _64bit ? "llvm.minnum.f64" : _32bit ? "llvm.minnum.f32" : "llvm.minnum.f16", 3627 _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2, 3628 AC_FUNC_ATTR_READNONE); 3629 case nir_op_imax: 3630 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""), 3631 lhs, rhs, ""); 3632 case nir_op_umax: 3633 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""), 3634 lhs, rhs, ""); 3635 case nir_op_fmax: 3636 return ac_build_intrinsic( 3637 ctx, _64bit ? "llvm.maxnum.f64" : _32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16", 3638 _64bit ? ctx->f64 : _32bit ? 
ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
         AC_FUNC_ATTR_READNONE);
   case nir_op_iand:
      return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
   case nir_op_ior:
      return LLVMBuildOr(ctx->builder, lhs, rhs, "");
   case nir_op_ixor:
      return LLVMBuildXor(ctx->builder, lhs, rhs, "");
   default:
      unreachable("bad reduction intrinsic");
   }
}

/**
 * \param src The value to shift.
 * \param identity The value to use for the first lane.
 * \param maxprefix specifies that the result only needs to be correct for a
 *                  prefix of this many threads
 * \return src, shifted 1 lane up, and identity shifted into lane 0.
 */
static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,
                                               LLVMValueRef identity, unsigned maxprefix)
{
   if (ctx->gfx_level >= GFX10) {
      /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
      LLVMValueRef active, tmp1, tmp2;
      LLVMValueRef tid = ac_get_thread_id(ctx);

      tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);

      tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);

      if (maxprefix > 32) {
         active =
            LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");

         tmp2 = LLVMBuildSelect(ctx->builder, active,
                                ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),
                                tmp2, "");

         active = LLVMBuildOr(
            ctx->builder, active,
            LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),
                          LLVMConstInt(ctx->i32, 0x10, false), ""),
            "");
         return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
      } else if (maxprefix > 16) {
         active =
            LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");

         return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
      }
   } else if (ctx->gfx_level >= GFX8) {
      return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
   }

   /* wavefront shift_right by 1 on SI/CI */
   LLVMValueRef active, tmp1, tmp2;
   LLVMValueRef tid = ac_get_thread_id(ctx);
   tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
                          LLVMConstInt(ctx->i32, 0x4, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
                          LLVMConstInt(ctx->i32, 0x8, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
                          LLVMConstInt(ctx->i32, 0x10, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   active = LLVMBuildICmp(ctx->builder,
LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), ""); 3718 return LLVMBuildSelect(ctx->builder, active, identity, tmp1, ""); 3719} 3720 3721/** 3722 * \param maxprefix specifies that the result only needs to be correct for a 3723 * prefix of this many threads 3724 */ 3725static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, 3726 LLVMValueRef identity, unsigned maxprefix, bool inclusive) 3727{ 3728 LLVMValueRef result, tmp; 3729 3730 if (!inclusive) 3731 src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix); 3732 3733 result = src; 3734 3735 if (ctx->gfx_level <= GFX7) { 3736 assert(maxprefix == 64); 3737 LLVMValueRef tid = ac_get_thread_id(ctx); 3738 LLVMValueRef active; 3739 tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00)); 3740 active = LLVMBuildICmp(ctx->builder, LLVMIntNE, 3741 LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, ""); 3742 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); 3743 result = ac_build_alu_op(ctx, result, tmp, op); 3744 tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00)); 3745 active = LLVMBuildICmp(ctx->builder, LLVMIntNE, 3746 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""), 3747 ctx->i32_0, ""); 3748 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); 3749 result = ac_build_alu_op(ctx, result, tmp, op); 3750 tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00)); 3751 active = LLVMBuildICmp(ctx->builder, LLVMIntNE, 3752 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""), 3753 ctx->i32_0, ""); 3754 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); 3755 result = ac_build_alu_op(ctx, result, tmp, op); 3756 tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00)); 3757 active = LLVMBuildICmp(ctx->builder, LLVMIntNE, 3758 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""), 3759 ctx->i32_0, ""); 3760 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); 3761 result = ac_build_alu_op(ctx, result, tmp, op); 3762 tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00)); 3763 active = LLVMBuildICmp(ctx->builder, LLVMIntNE, 3764 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""), 3765 ctx->i32_0, ""); 3766 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); 3767 result = ac_build_alu_op(ctx, result, tmp, op); 3768 tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0)); 3769 active = LLVMBuildICmp(ctx->builder, LLVMIntNE, 3770 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""), 3771 ctx->i32_0, ""); 3772 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); 3773 result = ac_build_alu_op(ctx, result, tmp, op); 3774 return result; 3775 } 3776 3777 if (maxprefix <= 1) 3778 return result; 3779 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false); 3780 result = ac_build_alu_op(ctx, result, tmp, op); 3781 if (maxprefix <= 2) 3782 return result; 3783 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false); 3784 result = ac_build_alu_op(ctx, result, tmp, op); 3785 if (maxprefix <= 3) 3786 return result; 3787 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false); 3788 result = ac_build_alu_op(ctx, result, tmp, op); 3789 if (maxprefix <= 4) 3790 return result; 3791 tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false); 3792 result = ac_build_alu_op(ctx, result, tmp, op); 3793 if 
(maxprefix <= 8) 3794 return result; 3795 tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false); 3796 result = ac_build_alu_op(ctx, result, tmp, op); 3797 if (maxprefix <= 16) 3798 return result; 3799 3800 if (ctx->gfx_level >= GFX10) { 3801 LLVMValueRef tid = ac_get_thread_id(ctx); 3802 LLVMValueRef active; 3803 3804 tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false); 3805 3806 active = LLVMBuildICmp(ctx->builder, LLVMIntNE, 3807 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""), 3808 ctx->i32_0, ""); 3809 3810 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); 3811 3812 result = ac_build_alu_op(ctx, result, tmp, op); 3813 3814 if (maxprefix <= 32) 3815 return result; 3816 3817 tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false)); 3818 3819 active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), ""); 3820 3821 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); 3822 3823 result = ac_build_alu_op(ctx, result, tmp, op); 3824 return result; 3825 } 3826 3827 tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false); 3828 result = ac_build_alu_op(ctx, result, tmp, op); 3829 if (maxprefix <= 32) 3830 return result; 3831 tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false); 3832 result = ac_build_alu_op(ctx, result, tmp, op); 3833 return result; 3834} 3835 3836LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op) 3837{ 3838 LLVMValueRef result; 3839 3840 if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) { 3841 LLVMBuilderRef builder = ctx->builder; 3842 src = LLVMBuildZExt(builder, src, ctx->i32, ""); 3843 result = ac_build_ballot(ctx, src); 3844 result = ac_build_mbcnt(ctx, result); 3845 result = LLVMBuildAdd(builder, result, src, ""); 3846 return result; 3847 } 3848 3849 ac_build_optimization_barrier(ctx, &src, false); 3850 3851 LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src))); 3852 result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity), 3853 LLVMTypeOf(identity), ""); 3854 result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true); 3855 3856 return ac_build_wwm(ctx, result); 3857} 3858 3859LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op) 3860{ 3861 LLVMValueRef result; 3862 3863 if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) { 3864 LLVMBuilderRef builder = ctx->builder; 3865 src = LLVMBuildZExt(builder, src, ctx->i32, ""); 3866 result = ac_build_ballot(ctx, src); 3867 result = ac_build_mbcnt(ctx, result); 3868 return result; 3869 } 3870 3871 ac_build_optimization_barrier(ctx, &src, false); 3872 3873 LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src))); 3874 result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity), 3875 LLVMTypeOf(identity), ""); 3876 result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false); 3877 3878 return ac_build_wwm(ctx, result); 3879} 3880 3881LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, 3882 unsigned cluster_size) 3883{ 3884 if (cluster_size == 1) 3885 return src; 3886 ac_build_optimization_barrier(ctx, &src, false); 3887 LLVMValueRef result, swap; 3888 LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src))); 3889 result = LLVMBuildBitCast(ctx->builder, 
ac_build_set_inactive(ctx, src, identity), 3890 LLVMTypeOf(identity), ""); 3891 swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2); 3892 result = ac_build_alu_op(ctx, result, swap, op); 3893 if (cluster_size == 2) 3894 return ac_build_wwm(ctx, result); 3895 3896 swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1); 3897 result = ac_build_alu_op(ctx, result, swap, op); 3898 if (cluster_size == 4) 3899 return ac_build_wwm(ctx, result); 3900 3901 if (ctx->gfx_level >= GFX8) 3902 swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false); 3903 else 3904 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04)); 3905 result = ac_build_alu_op(ctx, result, swap, op); 3906 if (cluster_size == 8) 3907 return ac_build_wwm(ctx, result); 3908 3909 if (ctx->gfx_level >= GFX8) 3910 swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false); 3911 else 3912 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08)); 3913 result = ac_build_alu_op(ctx, result, swap, op); 3914 if (cluster_size == 16) 3915 return ac_build_wwm(ctx, result); 3916 3917 if (ctx->gfx_level >= GFX10) 3918 swap = ac_build_permlane16(ctx, result, 0, true, false); 3919 else if (ctx->gfx_level >= GFX8 && cluster_size != 32) 3920 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false); 3921 else 3922 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10)); 3923 result = ac_build_alu_op(ctx, result, swap, op); 3924 if (cluster_size == 32) 3925 return ac_build_wwm(ctx, result); 3926 3927 if (ctx->gfx_level >= GFX8) { 3928 if (ctx->wave_size == 64) { 3929 if (ctx->gfx_level >= GFX10) 3930 swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false)); 3931 else 3932 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false); 3933 result = ac_build_alu_op(ctx, result, swap, op); 3934 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0)); 3935 } 3936 3937 return ac_build_wwm(ctx, result); 3938 } else { 3939 swap = ac_build_readlane(ctx, result, ctx->i32_0); 3940 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0)); 3941 result = ac_build_alu_op(ctx, result, swap, op); 3942 return ac_build_wwm(ctx, result); 3943 } 3944} 3945 3946/** 3947 * "Top half" of a scan that reduces per-wave values across an entire 3948 * workgroup. 3949 * 3950 * The source value must be present in the highest lane of the wave, and the 3951 * highest lane must be live. 3952 */ 3953void ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) 3954{ 3955 if (ws->maxwaves <= 1) 3956 return; 3957 3958 const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false); 3959 LLVMBuilderRef builder = ctx->builder; 3960 LLVMValueRef tid = ac_get_thread_id(ctx); 3961 LLVMValueRef tmp; 3962 3963 tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, ""); 3964 ac_build_ifcc(ctx, tmp, 1000); 3965 LLVMBuildStore(builder, ws->src, 3966 LLVMBuildGEP2(builder, LLVMTypeOf(ws->src), ws->scratch, &ws->waveidx, 1, "")); 3967 ac_build_endif(ctx, 1000); 3968} 3969 3970/** 3971 * "Bottom half" of a scan that reduces per-wave values across an entire 3972 * workgroup. 3973 * 3974 * The caller must place a barrier between the top and bottom halves. 
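 *
 * A typical call sequence (this is exactly what ac_build_wg_wavescan below
 * does):
 *
 *    ac_build_wg_wavescan_top(ctx, ws);
 *    ac_build_waitcnt(ctx, AC_WAIT_LGKM);
 *    ac_build_s_barrier(ctx, ws->stage);
 *    ac_build_wg_wavescan_bottom(ctx, ws);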
3975 */ 3976void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) 3977{ 3978 const LLVMTypeRef type = LLVMTypeOf(ws->src); 3979 const LLVMValueRef identity = get_reduction_identity(ctx, ws->op, ac_get_type_size(type)); 3980 3981 if (ws->maxwaves <= 1) { 3982 ws->result_reduce = ws->src; 3983 ws->result_inclusive = ws->src; 3984 ws->result_exclusive = identity; 3985 return; 3986 } 3987 assert(ws->maxwaves <= 32); 3988 3989 LLVMBuilderRef builder = ctx->builder; 3990 LLVMValueRef tid = ac_get_thread_id(ctx); 3991 LLVMBasicBlockRef bbs[2]; 3992 LLVMValueRef phivalues_scan[2]; 3993 LLVMValueRef tmp, tmp2; 3994 3995 bbs[0] = LLVMGetInsertBlock(builder); 3996 phivalues_scan[0] = LLVMGetUndef(type); 3997 3998 if (ws->enable_reduce) 3999 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, ""); 4000 else if (ws->enable_inclusive) 4001 tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, ""); 4002 else 4003 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, ""); 4004 ac_build_ifcc(ctx, tmp, 1001); 4005 { 4006 tmp = LLVMBuildLoad2(builder, LLVMTypeOf(ws->src), 4007 LLVMBuildGEP2(builder, LLVMTypeOf(ws->src), ws->scratch, &tid, 1, ""), ""); 4008 4009 ac_build_optimization_barrier(ctx, &tmp, false); 4010 4011 bbs[1] = LLVMGetInsertBlock(builder); 4012 phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true); 4013 } 4014 ac_build_endif(ctx, 1001); 4015 4016 const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs); 4017 4018 if (ws->enable_reduce) { 4019 tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, ""); 4020 ws->result_reduce = ac_build_readlane(ctx, scan, tmp); 4021 } 4022 if (ws->enable_inclusive) 4023 ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx); 4024 if (ws->enable_exclusive) { 4025 tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, ""); 4026 tmp = ac_build_readlane(ctx, scan, tmp); 4027 tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, ""); 4028 ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, ""); 4029 } 4030} 4031 4032/** 4033 * Inclusive scan of a per-wave value across an entire workgroup. 4034 * 4035 * This implies an s_barrier instruction. 4036 * 4037 * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads 4038 * of the workgroup are live. (This requirement cannot easily be relaxed in a 4039 * useful manner because of the barrier in the algorithm.) 4040 */ 4041void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) 4042{ 4043 ac_build_wg_wavescan_top(ctx, ws); 4044 ac_build_waitcnt(ctx, AC_WAIT_LGKM); 4045 ac_build_s_barrier(ctx, ws->stage); 4046 ac_build_wg_wavescan_bottom(ctx, ws); 4047} 4048 4049/** 4050 * "Top half" of a scan that reduces per-thread values across an entire 4051 * workgroup. 4052 * 4053 * All lanes must be active when this code runs. 
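 *
 * Note that when an exclusive result is requested, the inclusive per-thread
 * value is reconstructed as op(exclusive, src) and written back into ws->src,
 * so the per-wave total consumed by the wavescan is always in ws->src.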
4054 */ 4055void ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) 4056{ 4057 if (ws->enable_exclusive) { 4058 ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op); 4059 if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd) 4060 ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, ""); 4061 ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op); 4062 } else { 4063 ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op); 4064 } 4065 4066 bool enable_inclusive = ws->enable_inclusive; 4067 bool enable_exclusive = ws->enable_exclusive; 4068 ws->enable_inclusive = false; 4069 ws->enable_exclusive = ws->enable_exclusive || enable_inclusive; 4070 ac_build_wg_wavescan_top(ctx, ws); 4071 ws->enable_inclusive = enable_inclusive; 4072 ws->enable_exclusive = enable_exclusive; 4073} 4074 4075/** 4076 * "Bottom half" of a scan that reduces per-thread values across an entire 4077 * workgroup. 4078 * 4079 * The caller must place a barrier between the top and bottom halves. 4080 */ 4081void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) 4082{ 4083 bool enable_inclusive = ws->enable_inclusive; 4084 bool enable_exclusive = ws->enable_exclusive; 4085 ws->enable_inclusive = false; 4086 ws->enable_exclusive = ws->enable_exclusive || enable_inclusive; 4087 ac_build_wg_wavescan_bottom(ctx, ws); 4088 ws->enable_inclusive = enable_inclusive; 4089 ws->enable_exclusive = enable_exclusive; 4090 4091 /* ws->result_reduce is already the correct value */ 4092 if (ws->enable_inclusive) 4093 ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op); 4094 if (ws->enable_exclusive) 4095 ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op); 4096} 4097 4098/** 4099 * A scan that reduces per-thread values across an entire workgroup. 4100 * 4101 * The caller must ensure that all lanes are active when this code runs 4102 * (WWM is insufficient!), because there is an implied barrier. 
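 *
 * A minimal usage sketch (all fields below are members of struct ac_wg_scan;
 * the surrounding values are assumed to be provided by the caller):
 *
 *    struct ac_wg_scan ws = {0};
 *    ws.op = nir_op_iadd;
 *    ws.src = per_thread_value;
 *    ws.scratch = lds_scratch;   // one LDS slot per wave, indexed by waveidx
 *    ws.stage = stage;
 *    ws.waveidx = wave_index;
 *    ws.numwaves = num_waves;
 *    ws.maxwaves = max_waves;    // at most 32
 *    ws.enable_inclusive = true;
 *    ac_build_wg_scan(ctx, &ws);
 *    // ws.result_inclusive now holds the workgroup-wide inclusive sum.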
4103 */ 4104void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) 4105{ 4106 ac_build_wg_scan_top(ctx, ws); 4107 ac_build_waitcnt(ctx, AC_WAIT_LGKM); 4108 ac_build_s_barrier(ctx, ws->stage); 4109 ac_build_wg_scan_bottom(ctx, ws); 4110} 4111 4112static void _ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx, 4113 LLVMValueRef *arg0, LLVMValueRef *arg1) 4114{ 4115 LLVMValueRef tid; 4116 LLVMValueRef src0, src1; 4117 LLVMValueRef tmp0; 4118 LLVMValueRef params[2]; 4119 LLVMValueRef is_even; 4120 4121 src0 = LLVMBuildBitCast(ctx->builder, *arg0, ctx->i32, ""); 4122 src1 = LLVMBuildBitCast(ctx->builder, *arg1, ctx->i32, ""); 4123 4124 /* swap odd,even lanes of arg_0*/ 4125 params[0] = src0; 4126 params[1] = LLVMConstInt(ctx->i32, 0xde54c1, 0); 4127 src0 = ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32", 4128 ctx->i32, params, 2, AC_FUNC_ATTR_CONVERGENT); 4129 4130 /* swap even lanes between arg_0 and arg_1 */ 4131 tid = ac_get_thread_id(ctx); 4132 is_even = LLVMBuildICmp(ctx->builder, LLVMIntEQ, 4133 LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), 4134 ctx->i32_0, ""); 4135 tmp0 = src0; 4136 src0 = LLVMBuildSelect(ctx->builder, is_even, src1, src0, ""); 4137 src1 = LLVMBuildSelect(ctx->builder, is_even, tmp0, src1, ""); 4138 4139 /* swap odd,even lanes again for arg_0*/ 4140 params[0] = src0; 4141 params[1] = LLVMConstInt(ctx->i32, 0xde54c1, 0); 4142 src0 = ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32", 4143 ctx->i32, params, 2, AC_FUNC_ATTR_CONVERGENT); 4144 4145 *arg0 = src0; 4146 *arg1 = src1; 4147} 4148 4149void ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx, 4150 struct ac_export_args *mrt0, 4151 struct ac_export_args *mrt1) 4152{ 4153 assert(ctx->gfx_level >= GFX11); 4154 assert(mrt0->enabled_channels == mrt1->enabled_channels); 4155 4156 for (int i = 0; i < 4; i++) { 4157 if (mrt0->enabled_channels & (1 << i) && mrt1->enabled_channels & (1 << i)) 4158 _ac_build_dual_src_blend_swizzle(ctx, &mrt0->out[i], &mrt1->out[i]); 4159 } 4160} 4161 4162LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0, 4163 unsigned lane1, unsigned lane2, unsigned lane3) 4164{ 4165 unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3); 4166 if (ctx->gfx_level >= GFX8) { 4167 return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false); 4168 } else { 4169 return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask); 4170 } 4171} 4172 4173LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index) 4174{ 4175 LLVMTypeRef type = LLVMTypeOf(src); 4176 LLVMValueRef result; 4177 4178 index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), ""); 4179 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); 4180 4181 result = 4182 ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32, (LLVMValueRef[]){index, src}, 2, 4183 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); 4184 return LLVMBuildTrunc(ctx->builder, result, type, ""); 4185} 4186 4187LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize) 4188{ 4189 LLVMTypeRef type; 4190 char *intr; 4191 4192 if (bitsize == 16) { 4193 intr = "llvm.amdgcn.frexp.exp.i16.f16"; 4194 type = ctx->i16; 4195 } else if (bitsize == 32) { 4196 intr = "llvm.amdgcn.frexp.exp.i32.f32"; 4197 type = ctx->i32; 4198 } else { 4199 intr = "llvm.amdgcn.frexp.exp.i32.f64"; 4200 type = ctx->i32; 4201 } 4202 4203 LLVMValueRef params[] = { 4204 src0, 4205 }; 4206 return ac_build_intrinsic(ctx, intr, type, params, 1, 
AC_FUNC_ATTR_READNONE); 4207} 4208LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize) 4209{ 4210 LLVMTypeRef type; 4211 char *intr; 4212 4213 if (bitsize == 16) { 4214 intr = "llvm.amdgcn.frexp.mant.f16"; 4215 type = ctx->f16; 4216 } else if (bitsize == 32) { 4217 intr = "llvm.amdgcn.frexp.mant.f32"; 4218 type = ctx->f32; 4219 } else { 4220 intr = "llvm.amdgcn.frexp.mant.f64"; 4221 type = ctx->f64; 4222 } 4223 4224 LLVMValueRef params[] = { 4225 src0, 4226 }; 4227 return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE); 4228} 4229 4230LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize) 4231{ 4232 LLVMTypeRef type; 4233 char *intr; 4234 4235 if (bitsize == 16) { 4236 intr = "llvm.canonicalize.f16"; 4237 type = ctx->f16; 4238 } else if (bitsize == 32) { 4239 intr = "llvm.canonicalize.f32"; 4240 type = ctx->f32; 4241 } else { 4242 intr = "llvm.canonicalize.f64"; 4243 type = ctx->f64; 4244 } 4245 4246 LLVMValueRef params[] = { 4247 src0, 4248 }; 4249 return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE); 4250} 4251 4252/* 4253 * this takes an I,J coordinate pair, 4254 * and works out the X and Y derivatives. 4255 * it returns DDX(I), DDX(J), DDY(I), DDY(J). 4256 */ 4257LLVMValueRef ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij) 4258{ 4259 LLVMValueRef result[4], a; 4260 unsigned i; 4261 4262 for (i = 0; i < 2; i++) { 4263 a = LLVMBuildExtractElement(ctx->builder, interp_ij, LLVMConstInt(ctx->i32, i, false), ""); 4264 result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a); 4265 result[2 + i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a); 4266 } 4267 return ac_build_gather_values(ctx, result, 4); 4268} 4269 4270LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx) 4271{ 4272 LLVMValueRef result; 4273 4274 if (LLVM_VERSION_MAJOR >= 13) { 4275 result = ac_build_intrinsic(ctx, "llvm.amdgcn.live.mask", ctx->i1, NULL, 0, 4276 AC_FUNC_ATTR_READONLY | AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY); 4277 } else { 4278 result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, 4279 AC_FUNC_ATTR_READNONE); 4280 } 4281 return LLVMBuildNot(ctx->builder, result, ""); 4282} 4283 4284LLVMValueRef ac_build_is_helper_invocation(struct ac_llvm_context *ctx) 4285{ 4286 if (!ctx->postponed_kill) 4287 return ac_build_load_helper_invocation(ctx); 4288 4289 /* postponed_kill should be NULL on LLVM 13+ */ 4290 assert(LLVM_VERSION_MAJOR < 13); 4291 4292 /* !(exact && postponed) */ 4293 LLVMValueRef exact = 4294 ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, AC_FUNC_ATTR_READNONE); 4295 4296 LLVMValueRef postponed = LLVMBuildLoad2(ctx->builder, ctx->i1, ctx->postponed_kill, ""); 4297 return LLVMBuildNot(ctx->builder, LLVMBuildAnd(ctx->builder, exact, postponed, ""), ""); 4298} 4299 4300LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, LLVMValueRef *args, 4301 unsigned num_args) 4302{ 4303 LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, ""); 4304 LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func)); 4305 return ret; 4306} 4307 4308void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil, 4309 LLVMValueRef samplemask, LLVMValueRef mrt0_alpha, bool is_last, 4310 struct ac_export_args *args) 4311{ 4312 unsigned mask = 0; 4313 unsigned format = ac_get_spi_shader_z_format(depth != NULL, stencil != NULL, samplemask != NULL, 
4314 mrt0_alpha != NULL); 4315 4316 assert(depth || stencil || samplemask); 4317 4318 memset(args, 0, sizeof(*args)); 4319 4320 if (is_last) { 4321 args->valid_mask = 1; /* whether the EXEC mask is valid */ 4322 args->done = 1; /* DONE bit */ 4323 } 4324 4325 /* Specify the target we are exporting */ 4326 args->target = V_008DFC_SQ_EXP_MRTZ; 4327 4328 args->compr = 0; /* COMP flag */ 4329 args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */ 4330 args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */ 4331 args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */ 4332 args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */ 4333 4334 if (format == V_028710_SPI_SHADER_UINT16_ABGR) { 4335 assert(!depth); 4336 args->compr = ctx->gfx_level < GFX11; /* COMPR flag */ 4337 4338 if (stencil) { 4339 /* Stencil should be in X[23:16]. */ 4340 stencil = ac_to_integer(ctx, stencil); 4341 stencil = LLVMBuildShl(ctx->builder, stencil, LLVMConstInt(ctx->i32, 16, 0), ""); 4342 args->out[0] = ac_to_float(ctx, stencil); 4343 mask |= ctx->gfx_level >= GFX11 ? 0x1 : 0x3; 4344 } 4345 if (samplemask) { 4346 /* SampleMask should be in Y[15:0]. */ 4347 args->out[1] = samplemask; 4348 mask |= ctx->gfx_level >= GFX11 ? 0x2 : 0xc; 4349 } 4350 } else { 4351 if (depth) { 4352 args->out[0] = depth; 4353 mask |= 0x1; 4354 } 4355 if (stencil) { 4356 args->out[1] = stencil; 4357 mask |= 0x2; 4358 } 4359 if (samplemask) { 4360 args->out[2] = samplemask; 4361 mask |= 0x4; 4362 } 4363 if (mrt0_alpha) { 4364 args->out[3] = mrt0_alpha; 4365 mask |= 0x8; 4366 } 4367 } 4368 4369 /* GFX6 (except OLAND and HAINAN) has a bug that it only looks 4370 * at the X writemask component. */ 4371 if (ctx->gfx_level == GFX6 && ctx->family != CHIP_OLAND && ctx->family != CHIP_HAINAN) 4372 mask |= 0x1; 4373 4374 /* Specify which components to enable */ 4375 args->enabled_channels = mask; 4376} 4377 4378/* Send GS Alloc Req message from the first wave of the group to SPI. 4379 * Message payload is: 4380 * - bits 0..10: vertices in group 4381 * - bits 12..22: primitives in group 4382 */ 4383void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id, 4384 LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt) 4385{ 4386 LLVMBuilderRef builder = ctx->builder; 4387 LLVMValueRef tmp; 4388 bool export_dummy_prim = false; 4389 4390 /* HW workaround for a GPU hang with 100% culling. 4391 * We always have to export at least 1 primitive. 4392 * Export a degenerate triangle using vertex 0 for all 3 vertices. 4393 */ 4394 if (prim_cnt == ctx->i32_0 && ctx->gfx_level == GFX10) { 4395 assert(vtx_cnt == ctx->i32_0); 4396 prim_cnt = ctx->i32_1; 4397 vtx_cnt = ctx->i32_1; 4398 export_dummy_prim = true; 4399 } 4400 4401 if (wave_id) 4402 ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020); 4403 4404 tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->i32, 12, false), ""); 4405 tmp = LLVMBuildOr(builder, tmp, vtx_cnt, ""); 4406 ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp); 4407 4408 if (export_dummy_prim) { 4409 struct ac_ngg_prim prim = {0}; 4410 /* The vertex indices are 0,0,0. */ 4411 prim.passthrough = ctx->i32_0; 4412 4413 struct ac_export_args pos = {0}; 4414 /* The hw culls primitives with NaN. 
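    * Writing NaN to all four position channels below guarantees that this
    * dummy triangle is culled and never reaches the rasterizer.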
*/ 4415 pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = LLVMConstReal(ctx->f32, NAN); 4416 pos.target = V_008DFC_SQ_EXP_POS; 4417 pos.enabled_channels = 0xf; 4418 pos.done = true; 4419 4420 ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), ctx->i32_0, ""), 4421 5021); 4422 ac_build_export_prim(ctx, &prim); 4423 ac_build_export(ctx, &pos); 4424 ac_build_endif(ctx, 5021); 4425 } 4426 4427 if (wave_id) 4428 ac_build_endif(ctx, 5020); 4429} 4430 4431 4432LLVMValueRef ac_pack_edgeflags_for_export(struct ac_llvm_context *ctx, 4433 const struct ac_shader_args *args) 4434{ 4435 /* Use the following trick to extract the edge flags: 4436 * extracted = v_and_b32 gs_invocation_id, 0x700 ; get edge flags at bits 8, 9, 10 4437 * shifted = v_mul_u32_u24 extracted, 0x80402u ; shift the bits: 8->9, 9->19, 10->29 4438 * result = v_and_b32 shifted, 0x20080200 ; remove garbage 4439 */ 4440 LLVMValueRef tmp = LLVMBuildAnd(ctx->builder, 4441 ac_get_arg(ctx, args->gs_invocation_id), 4442 LLVMConstInt(ctx->i32, 0x700, 0), ""); 4443 tmp = LLVMBuildMul(ctx->builder, tmp, LLVMConstInt(ctx->i32, 0x80402u, 0), ""); 4444 return LLVMBuildAnd(ctx->builder, tmp, LLVMConstInt(ctx->i32, 0x20080200, 0), ""); 4445} 4446 4447LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim) 4448{ 4449 /* The prim export format is: 4450 * - bits 0..8: index 0 4451 * - bit 9: edge flag 0 4452 * - bits 10..18: index 1 4453 * - bit 19: edge flag 1 4454 * - bits 20..28: index 2 4455 * - bit 29: edge flag 2 4456 * - bit 31: null primitive (skip) 4457 */ 4458 LLVMBuilderRef builder = ctx->builder; 4459 LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, ""); 4460 LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), ""); 4461 result = LLVMBuildOr(ctx->builder, result, prim->edgeflags, ""); 4462 4463 for (unsigned i = 0; i < prim->num_vertices; ++i) { 4464 tmp = LLVMBuildShl(builder, prim->index[i], LLVMConstInt(ctx->i32, 10 * i, false), ""); 4465 result = LLVMBuildOr(builder, result, tmp, ""); 4466 } 4467 return result; 4468} 4469 4470void ac_build_export_prim(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim) 4471{ 4472 struct ac_export_args args; 4473 4474 if (prim->passthrough) { 4475 args.out[0] = prim->passthrough; 4476 } else { 4477 args.out[0] = ac_pack_prim_export(ctx, prim); 4478 } 4479 4480 args.out[0] = LLVMBuildBitCast(ctx->builder, args.out[0], ctx->f32, ""); 4481 args.out[1] = LLVMGetUndef(ctx->f32); 4482 args.out[2] = LLVMGetUndef(ctx->f32); 4483 args.out[3] = LLVMGetUndef(ctx->f32); 4484 4485 args.target = V_008DFC_SQ_EXP_PRIM; 4486 args.enabled_channels = 1; 4487 args.done = true; 4488 args.valid_mask = false; 4489 args.compr = false; 4490 4491 ac_build_export(ctx, &args); 4492} 4493 4494static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx) 4495{ 4496 if (type == AC_ARG_FLOAT) { 4497 return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size); 4498 } else if (type == AC_ARG_INT) { 4499 return size == 1 ? 
ctx->i32 : LLVMVectorType(ctx->i32, size); 4500 } else { 4501 LLVMTypeRef ptr_type; 4502 switch (type) { 4503 case AC_ARG_CONST_PTR: 4504 ptr_type = ctx->i8; 4505 break; 4506 case AC_ARG_CONST_FLOAT_PTR: 4507 ptr_type = ctx->f32; 4508 break; 4509 case AC_ARG_CONST_PTR_PTR: 4510 ptr_type = ac_array_in_const32_addr_space(ctx->i8); 4511 break; 4512 case AC_ARG_CONST_DESC_PTR: 4513 ptr_type = ctx->v4i32; 4514 break; 4515 case AC_ARG_CONST_IMAGE_PTR: 4516 ptr_type = ctx->v8i32; 4517 break; 4518 default: 4519 unreachable("unknown arg type"); 4520 } 4521 if (size == 1) { 4522 return ac_array_in_const32_addr_space(ptr_type); 4523 } else { 4524 assert(size == 2); 4525 return ac_array_in_const_addr_space(ptr_type); 4526 } 4527 } 4528} 4529 4530LLVMValueRef ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx, 4531 enum ac_llvm_calling_convention convention, const char *name, 4532 LLVMTypeRef ret_type, LLVMModuleRef module) 4533{ 4534 LLVMTypeRef arg_types[AC_MAX_ARGS]; 4535 4536 for (unsigned i = 0; i < args->arg_count; i++) { 4537 arg_types[i] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx); 4538 } 4539 4540 LLVMTypeRef main_function_type = LLVMFunctionType(ret_type, arg_types, args->arg_count, 0); 4541 4542 LLVMValueRef main_function = LLVMAddFunction(module, name, main_function_type); 4543 LLVMBasicBlockRef main_function_body = 4544 LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body"); 4545 LLVMPositionBuilderAtEnd(ctx->builder, main_function_body); 4546 4547 LLVMSetFunctionCallConv(main_function, convention); 4548 for (unsigned i = 0; i < args->arg_count; ++i) { 4549 LLVMValueRef P = LLVMGetParam(main_function, i); 4550 4551 if (args->args[i].file != AC_ARG_SGPR) 4552 continue; 4553 4554 ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_INREG); 4555 4556 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) { 4557 ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_NOALIAS); 4558 ac_add_attr_dereferenceable(P, UINT64_MAX); 4559 ac_add_attr_alignment(P, 4); 4560 } 4561 } 4562 4563 ctx->main_function = main_function; 4564 4565 /* Enable denormals for FP16 and FP64: */ 4566 LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", "ieee,ieee"); 4567 /* Disable denormals for FP32: */ 4568 LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32", 4569 "preserve-sign,preserve-sign"); 4570 return main_function; 4571} 4572 4573void ac_build_s_endpgm(struct ac_llvm_context *ctx) 4574{ 4575 LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false); 4576 LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false); 4577 LLVMBuildCall2(ctx->builder, calltype, code, NULL, 0, ""); 4578} 4579 4580/** 4581 * Convert triangle strip indices to triangle indices. This is used to decompose 4582 * triangle strips into triangles. 4583 */ 4584void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, LLVMValueRef is_odd, 4585 LLVMValueRef flatshade_first, 4586 LLVMValueRef index[3]) 4587{ 4588 LLVMBuilderRef builder = ctx->builder; 4589 LLVMValueRef out[3]; 4590 4591 /* We need to change the vertex order for odd triangles to get correct 4592 * front/back facing by swapping 2 vertex indices, but we also have to 4593 * keep the provoking vertex in the same place. 4594 * 4595 * If the first vertex is provoking, swap index 1 and 2. 4596 * If the last vertex is provoking, swap index 0 and 1. 
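 *
 * E.g. for the second triangle of a strip (is_odd = true) with indices
 * (1, 2, 3) and flatshade_first = false, this returns (2, 1, 3): indices
 * 0 and 1 are swapped, which fixes the winding while the provoking (last)
 * vertex stays in place.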
 */
   out[0] = LLVMBuildSelect(builder, flatshade_first, index[0],
                            LLVMBuildSelect(builder, is_odd, index[1], index[0], ""), "");
   out[1] = LLVMBuildSelect(builder, flatshade_first,
                            LLVMBuildSelect(builder, is_odd, index[2], index[1], ""),
                            LLVMBuildSelect(builder, is_odd, index[0], index[1], ""), "");
   out[2] = LLVMBuildSelect(builder, flatshade_first,
                            LLVMBuildSelect(builder, is_odd, index[1], index[2], ""), index[2], "");
   memcpy(index, out, sizeof(out));
}

LLVMValueRef ac_build_is_inf_or_nan(struct ac_llvm_context *ctx, LLVMValueRef a)
{
   LLVMValueRef args[2] = {
      a,
      LLVMConstInt(ctx->i32, S_NAN | Q_NAN | N_INFINITY | P_INFINITY, 0),
   };
   return ac_build_intrinsic(ctx, "llvm.amdgcn.class.f32", ctx->i1, args, 2,
                             AC_FUNC_ATTR_READNONE);
}
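/* Note: llvm.amdgcn.class.f32 maps to v_cmp_class_f32; with the mask
 * S_NAN | Q_NAN | N_INFINITY | P_INFINITY the result is true exactly for
 * NaNs and the two infinities, and false for every finite value. */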