/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for type conversions.
 *
 * We want to use the fastest type for a given computation whenever feasible.
 * The other side of this is that we need to be able to convert between
 * several types accurately and efficiently.
 *
 * Conversion between types of different bit width is quite complex, since a
 * single operation rarely suffices and intermediate steps are usually needed.
 *
 * To remember, there are a few invariants in type conversions:
 *
 * - register width must remain constant:
 *
 *     src_type.width * src_type.length == dst_type.width * dst_type.length
 *
 * - total number of elements must remain constant:
 *
 *     src_type.length * num_srcs == dst_type.length * num_dsts
 *
 * It is not always possible to do the conversion both accurately and
 * efficiently, usually due to lack of adequate machine instructions. In these
 * cases it is important not to take shortcuts here and sacrifice accuracy, as
 * these functions can be used anywhere. In the future we might have a
 * precision parameter which can gauge the accuracy vs efficiency compromise,
 * but for now, if the data conversion between two stages happens to be the
 * bottleneck, then most likely we should just avoid converting at all and run
 * both stages with the same type.
 *
 * Make sure to run the lp_test_conv unit test after any change to this file.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */
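
/*
 * As an illustration of the two invariants above (hypothetical numbers, not
 * taken from any particular caller): converting eight registers of
 * 4 x float32 into 16 x uint8 destination registers gives
 *
 *    32 * 4 == 8 * 16    (register width is preserved)
 *    4 * 8  == 16 * 2    (element count is preserved, so num_dsts == 2)
 */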


#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/half_float.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_arit.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_pack.h"
#include "lp_bld_conv.h"
#include "lp_bld_logic.h"
#include "lp_bld_intr.h"
#include "lp_bld_printf.h"
#include "lp_bld_format.h"


/* the lp_test_format test fails on mingw/i686 at -O2 with gcc 10.x
 * ref https://gitlab.freedesktop.org/mesa/mesa/-/issues/3906
 */

#if defined(__MINGW32__) && !defined(__MINGW64__) && (__GNUC__ == 10)
#warning "disabling caller-saves optimization for this file to work around compiler bug"
#pragma GCC optimize("-fno-caller-saves")
#endif

/**
 * Converts an int16 half-float vector to float32.
 * Note this can be performed in 1 instruction if vcvtph2ps exists (f16c/cvt16)
 * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
 *
 * @param src value to convert
 */
LLVMValueRef
lp_build_half_to_float(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
                            LLVMGetVectorSize(src_type) : 1;

   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
   LLVMValueRef h;

   if (util_get_cpu_caps()->has_f16c &&
       (src_length == 4 || src_length == 8)) {
      if (LLVM_VERSION_MAJOR < 11) {
         const char *intrinsic = NULL;
         if (src_length == 4) {
            src = lp_build_pad_vector(gallivm, src, 8);
            intrinsic = "llvm.x86.vcvtph2ps.128";
         }
         else {
            intrinsic = "llvm.x86.vcvtph2ps.256";
         }
         src = LLVMBuildBitCast(builder, src,
                                LLVMVectorType(LLVMInt16TypeInContext(gallivm->context), 8), "");
         return lp_build_intrinsic_unary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, f32_type), src);
      } else {
         /*
          * XXX: could probably use this on other archs as well.
          * But if the cpu doesn't support it natively it looks like the
          * backends still can't lower it and will try to call out to external
          * libraries, which will crash.
          */
         /*
          * XXX: lp_build_vec_type() would use an int16 vector. Probably need
          * to revisit this at some point.
          */
         src = LLVMBuildBitCast(builder, src,
                                LLVMVectorType(LLVMHalfTypeInContext(gallivm->context), src_length), "");
         return LLVMBuildFPExt(builder, src, lp_build_vec_type(gallivm, f32_type), "");
      }
   }

   h = LLVMBuildZExt(builder, src, int_vec_type, "");
   return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
}
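
/*
 * For reference, a scalar sketch of what the non-f16c fallback above computes
 * (assuming IEEE bit layouts; illustration only and not compiled in --
 * _mesa_half_to_float() from util/half_float.h is the CPU-side helper):
 */
#if 0
#include <stdint.h>
#include <string.h>

static float
half_to_float_ref(uint16_t h)
{
   uint32_t sign = (uint32_t)(h >> 15) << 31;
   uint32_t exp  = (h >> 10) & 0x1f;
   uint32_t man  = h & 0x3ff;
   uint32_t bits;
   float f;

   if (exp == 0x1f) {
      bits = sign | 0x7f800000 | (man << 13);               /* Inf / NaN */
   } else if (exp != 0) {
      bits = sign | ((exp - 15 + 127) << 23) | (man << 13); /* normal */
   } else if (man == 0) {
      bits = sign;                                          /* +/- zero */
   } else {
      /* denormal: renormalize the mantissa, adjusting the exponent */
      exp = 127 - 15 + 1;
      while (!(man & 0x400)) {
         man <<= 1;
         exp--;
      }
      bits = sign | (exp << 23) | ((man & 0x3ff) << 13);
   }
   memcpy(&f, &bits, sizeof f);
   return f;
}
#endif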


/**
 * Converts float32 to int16 half-float.
 * Note this can be performed in 1 instruction if vcvtps2ph exists (f16c/cvt16)
 * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
 *
 * @param src value to convert
 *
 * Convert float32 to half floats, preserving Infs and NaNs,
 * with rounding towards zero (trunc).
 * XXX: For GL, we would prefer rounding towards nearest(-even).
 */
LLVMValueRef
lp_build_float_to_half(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
   unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
                   ? LLVMGetVectorSize(f32_vec_type) : 1;
   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
   struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
   LLVMValueRef result;

   /*
    * Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits
    * directly, without any (x86 or generic) intrinsics.
    * However, the rounding mode cannot be specified (and is undefined;
    * in practice on x86 it seems to do nearest-even, but that may depend
    * on instruction set support), so it is essentially useless here.
    */

   if (util_get_cpu_caps()->has_f16c &&
       (length == 4 || length == 8)) {
      struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
      unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      const char *intrinsic = NULL;
      if (length == 4) {
         intrinsic = "llvm.x86.vcvtps2ph.128";
      }
      else {
         intrinsic = "llvm.x86.vcvtps2ph.256";
      }
      result = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, i168_type),
                                         src, LLVMConstInt(i32t, mode, 0));
      if (length == 4) {
         result = lp_build_extract_range(gallivm, result, 0, 4);
      }
      result = LLVMBuildBitCast(builder, result,
                                lp_build_vec_type(gallivm, lp_type_float_vec(16, 16 * length)), "");
   }

   else {
      result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
      /* Convert int32 vector to int16 vector by trunc (might generate bad code) */
      result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
   }

   /*
    * Debugging code.
    */
   if (0) {
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context);
      LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
      LLVMValueRef ref_result = LLVMGetUndef(LLVMVectorType(i16t, length));
      unsigned i;

      LLVMTypeRef func_type = LLVMFunctionType(i16t, &f32t, 1, 0);
      LLVMValueRef func = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)_mesa_float_to_half));
      func = LLVMBuildBitCast(builder, func, LLVMPointerType(func_type, 0), "_mesa_float_to_half");

      for (i = 0; i < length; ++i) {
         LLVMValueRef index = LLVMConstInt(i32t, i, 0);
         LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
#if 0
         /*
          * XXX: not really supported by backends.
          * Even if it were now, the rounding mode cannot be specified and
          * is undefined.
          */
         LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
#else
         LLVMValueRef f16 = LLVMBuildCall2(builder, func_type, func, &f32, 1, "");
#endif
         ref_result = LLVMBuildInsertElement(builder, ref_result, f16, index, "");
      }

      lp_build_print_value(gallivm, "src = ", src);
      lp_build_print_value(gallivm, "llvm = ", result);
      lp_build_print_value(gallivm, "util = ", ref_result);
      lp_build_printf(gallivm, "\n");
   }

   return result;
}
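
/*
 * Note on the f16c path in lp_build_float_to_half() above: the immediate
 * operand of vcvtps2ph selects the rounding mode (0 = nearest-even, 1 = down,
 * 2 = up, 3 = truncate, 4 = use MXCSR), so mode 3 gives the
 * round-towards-zero behavior described in the function comment.
 */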


/**
 * Special case for converting clamped IEEE-754 floats to unsigned norms.
 *
 * The mathematical voodoo below may seem excessive but it is actually
 * paramount we do it this way for several reasons. First, there is no single
 * precision FP to unsigned integer conversion Intel SSE instruction. Second,
 * even if there was, since the FP's mantissa takes only a fraction of the
 * register bits, the typical scale-and-cast approach would require double
 * precision for accurate results, and therefore half the throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width specified
 * by dst_width, the actual result type will have the same width as the source.
 *
 * Ex: src = { float, float, float, float }
 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
 *
 * (A scalar sketch of the "magic coefficients" trick used below follows this
 * function.)
 */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                        struct lp_type src_type,
                                        unsigned dst_width,
                                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
   LLVMValueRef res;
   unsigned mantissa;

   assert(src_type.floating);
   assert(dst_width <= src_type.width);
   src_type.sign = FALSE;

   mantissa = lp_mantissa(src_type);

   if (dst_width <= mantissa) {
      /*
       * Apply magic coefficients that will make the desired result appear
       * in the least significant bits of the mantissa, with correct rounding.
       *
       * This only works if the destination width fits in the mantissa.
       */

      unsigned long long ubound;
      unsigned long long mask;
      double scale;
      double bias;

      ubound = (1ULL << dst_width);
      mask = ubound - 1;
      scale = (double)mask/ubound;
      bias = (double)(1ULL << (mantissa - dst_width));

      res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
      /* instead of fadd/and could (with sse2) just use lp_build_iround */
      res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
      res = LLVMBuildBitCast(builder, res, int_vec_type, "");
      res = LLVMBuildAnd(builder, res,
                         lp_build_const_int_vec(gallivm, src_type, mask), "");
   }
   else if (dst_width == (mantissa + 1)) {
      /*
       * The destination width matches exactly what can be represented in
       * floating point (i.e., mantissa + 1 bits). Even so, correct rounding
       * still needs to be applied (only for numbers in [0.5, 1.0] would
       * conversion using truncation after scaling be sufficient).
       */
      double scale;
      struct lp_build_context uf32_bld;

      lp_build_context_init(&uf32_bld, gallivm, src_type);
      scale = (double)((1ULL << dst_width) - 1);

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = lp_build_iround(&uf32_bld, res);
   }
   else {
      /*
       * The destination exceeds what can be represented in floating point,
       * so multiply by the largest power of two we can get away with, and
       * then subtract the most significant bit to rescale to normalized
       * values.
       *
       * The largest power of two factor we can get away with is
       * (1 << (src_type.width - 1)), because we need to use a signed
       * conversion. In theory it should be (1 << (src_type.width - 2)), but
       * IEEE 754 rules state INT_MIN should be returned in FPToSI, which is
       * the correct result for values near 1.0!
       *
       * This means we get (src_type.width - 1) correct bits for values near
       * 0.0, and (mantissa + 1) correct bits for values near 1.0. Equally or
       * more important, we also get exact results for 0.0 and 1.0.
       */

      unsigned n = MIN2(src_type.width - 1u, dst_width);

      double scale = (double)(1ULL << n);
      unsigned lshift = dst_width - n;
      unsigned rshift = n;
      LLVMValueRef lshifted;
      LLVMValueRef rshifted;

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      if (!src_type.sign && src_type.width == 32)
         res = LLVMBuildFPToUI(builder, res, int_vec_type, "");
      else
         res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

      /*
       * Align the most significant bit to its final place.
       *
       * This will cause 1.0 to overflow to 0, but the later adjustment will
       * get it right.
       */
      if (lshift) {
         lshifted = LLVMBuildShl(builder, res,
                                 lp_build_const_int_vec(gallivm, src_type,
                                                        lshift), "");
      } else {
         lshifted = res;
      }

      /*
       * Align the most significant bit to the right.
       */
      rshifted = LLVMBuildLShr(builder, res,
                               lp_build_const_int_vec(gallivm, src_type, rshift),
                               "");

      /*
       * Subtract the right-aligned MSB from the left-shifted value, thereby
       * rescaling from (1 << dst_width) to ((1 << dst_width) - 1).
       */

      res = LLVMBuildSub(builder, lshifted, rshifted, "");
   }

   return res;
}
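
/*
 * Scalar sketch of the "magic coefficients" path above, for float32 sources
 * (23-bit mantissa) and dst_width = 8, assuming the default round-to-nearest
 * FP mode. Illustration only (not compiled in); the real code performs the
 * same fmul/fadd/bitcast/and sequence on whole vectors.
 */
#if 0
#include <stdint.h>
#include <string.h>

static uint32_t
clamped_float_to_unorm8_ref(float x)   /* x must already be clamped to [0,1] */
{
   const float scale = 255.0f / 256.0f;         /* mask / ubound            */
   const float bias  = (float)(1 << (23 - 8));  /* 2^(mantissa - dst_width) */
   float f = x * scale + bias;
   uint32_t bits;

   memcpy(&bits, &f, sizeof bits);   /* same as the bitcast to an int vector */
   return bits & 0xff;               /* low mantissa bits hold round(x * 255) */
}
#endif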


/**
 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
 * return {float, float, float, float} with values in range [0, 1].
 */
LLVMValueRef
lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
                                unsigned src_width,
                                struct lp_type dst_type,
                                LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
   LLVMValueRef bias_;
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   assert(dst_type.floating);

   mantissa = lp_mantissa(dst_type);

   if (src_width <= (mantissa + 1)) {
      /*
       * The source width fits what can be represented in floating
       * point (i.e., mantissa + 1 bits). So do a straight multiplication
       * followed by casting. No further rounding is necessary.
       */

      scale = 1.0/(double)((1ULL << src_width) - 1);
      res = LLVMBuildSIToFP(builder, src, vec_type, "");
      res = LLVMBuildFMul(builder, res,
                          lp_build_const_vec(gallivm, dst_type, scale), "");
      return res;
   }
   else {
      /*
       * The source width exceeds what can be represented in floating
       * point. So truncate the incoming values.
       */

      n = MIN2(mantissa, src_width);

      ubound = ((unsigned long long)1 << n);
      mask = ubound - 1;
      scale = (double)ubound/mask;
      bias = (double)((unsigned long long)1 << (mantissa - n));

      res = src;

      if (src_width > mantissa) {
         int shift = src_width - mantissa;
         res = LLVMBuildLShr(builder, res,
                             lp_build_const_int_vec(gallivm, dst_type, shift), "");
      }

      bias_ = lp_build_const_vec(gallivm, dst_type, bias);

      res = LLVMBuildOr(builder,
                        res,
                        LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");

      res = LLVMBuildBitCast(builder, res, vec_type, "");

      res = LLVMBuildFSub(builder, res, bias_, "");
      res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
   }

   return res;
}
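
/*
 * Worked example of the wide-source path above (numbers only, for
 * illustration): for src_width = 32 and a float32 destination (23-bit
 * mantissa), n = 23, so the value is first shifted right by 9 bits, then
 * OR-ed into the mantissa of the bias 1.0, giving a float in [1.0, 2.0).
 * Subtracting the bias yields x/2^23 (with x the truncated value), and the
 * final multiply by 2^23/(2^23 - 1) rescales that to the [0, 1] range.
 */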


/**
 * Pick a suitable num_dsts for lp_build_conv to ensure that optimal cases are
 * used.
 *
 * Returns the number of dsts created from src.
 */
int lp_build_conv_auto(struct gallivm_state *gallivm,
                       struct lp_type src_type,
                       struct lp_type* dst_type,
                       const LLVMValueRef *src,
                       unsigned num_srcs,
                       LLVMValueRef *dst)
{
   unsigned i;
   int num_dsts = num_srcs;

   if (src_type.floating == dst_type->floating &&
       src_type.width == dst_type->width &&
       src_type.length == dst_type->length &&
       src_type.fixed == dst_type->fixed &&
       src_type.norm == dst_type->norm &&
       src_type.sign == dst_type->sign)
      return num_dsts;

   /* Special case 4x4x32 -> 1x16x8 or 2x8x32 -> 1x16x8
    */
   if (src_type.norm == 0 &&
       src_type.width == 32 &&
       src_type.fixed == 0 &&

       dst_type->floating == 0 &&
       dst_type->fixed == 0 &&
       dst_type->width == 8 &&

       ((src_type.floating == 1 && src_type.sign == 1 && dst_type->norm == 1) ||
        (src_type.floating == 0 && dst_type->floating == 0 &&
         src_type.sign == dst_type->sign && dst_type->norm == 0))) {

      /* Special case 4x4x32 --> 1x16x8 */
      if (src_type.length == 4 &&
          (util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec))
      {
         num_dsts = (num_srcs + 3) / 4;
         dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }

      /* Special case 2x8x32 --> 1x16x8 */
      if (src_type.length == 8 &&
          util_get_cpu_caps()->has_avx)
      {
         num_dsts = (num_srcs + 1) / 2;
         dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }
   }

   /* lp_build_resize does not support M:N */
   if (src_type.width == dst_type->width) {
      lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
   } else {
      /*
       * If dst_width is 16 bits and src_width 32 and the dst vector size is
       * 64 bits, try feeding 2 vectors at once so pack intrinsics can be used.
       * (For AVX, this isn't needed, since we usually get 256-bit src and
       * 128-bit dst vectors which work ok. If we do AVX2 pack this should
       * be extended, but we need to be able to tell the conversion code about
       * pack ordering first.)
       */
      unsigned ratio = 1;
      if (src_type.width == 2 * dst_type->width &&
          src_type.length == dst_type->length &&
          dst_type->floating == 0 && (num_srcs % 2 == 0) &&
          dst_type->width * dst_type->length == 64) {
         ratio = 2;
         num_dsts /= 2;
         dst_type->length *= 2;
      }
      for (i = 0; i < num_dsts; i++) {
         lp_build_conv(gallivm, src_type, *dst_type, &src[i*ratio], ratio, &dst[i], 1);
      }
   }

   return num_dsts;
}
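
/*
 * Example of what lp_build_conv_auto() does (hypothetical numbers, for
 * illustration only): given num_srcs = 4 registers of 4 x float32 in [0,1]
 * and a dst_type describing unorm8 with length 4, the 4x4x32 special case
 * above widens dst_type->length to 16, emits a single packed conversion, and
 * returns num_dsts = 1.
 */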


/**
 * Generic type conversion.
 *
 * TODO: Take a precision argument, or even better, add a new precision member
 * to the lp_type union.
 */
void
lp_build_conv(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              const LLVMValueRef *src, unsigned num_srcs,
              LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type tmp_type;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned num_tmps;
   unsigned i;

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   tmp_type = src_type;
   for(i = 0; i < num_srcs; ++i) {
      assert(lp_check_value(src_type, src[i]));
      tmp[i] = src[i];
   }
   num_tmps = num_srcs;


   /*
    * Special case 4x4x32 --> 1x16x8, 2x4x32 -> 1x8x8, 1x4x32 -> 1x4x8
    * Only float -> s/unorm8 and (u)int32 -> (u)int8.
    * XXX: This should cover all interesting backend cases for 8 bit,
    * but should use the same strategy if dst is 16 bit.
    */
   if (src_type.norm == 0 &&
       src_type.width == 32 &&
       src_type.length == 4 &&
       src_type.fixed == 0 &&

       dst_type.floating == 0 &&
       dst_type.fixed == 0 &&
       dst_type.width == 8 &&

       ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
        (src_type.floating == 0 && dst_type.floating == 0 &&
         src_type.sign == dst_type.sign && dst_type.norm == 0)) &&

       ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
        (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&

       (util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec))
   {
      struct lp_build_context bld;
      struct lp_type int16_type, int32_type;
      struct lp_type dst_type_ext = dst_type;
      LLVMValueRef const_scale;
      unsigned i, j;

      lp_build_context_init(&bld, gallivm, src_type);

      dst_type_ext.length = 16;
      int16_type = int32_type = dst_type_ext;

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));

      for (i = 0; i < num_dsts; ++i, src += 4) {
         LLVMValueRef lo, hi;

         if (src_type.floating) {
            for (j = 0; j < dst_type.length / 4; ++j) {
               /*
                * XXX This is not actually fully correct. The float to int
                * conversion will produce the 0x80000000 value for everything
                * out of range and NaNs (on x86, llvm.x86.sse2.cvtps2dq).
                * Hence, NaNs and negatives will get clamped just fine to zero
                * (relying on the clamping pack behavior) when converting to
                * unorm, however too large values (both finite and infinite)
                * will also end up as zero, not 255.
                * For snorm, for now we'll keep bug compatibility with the
                * generic conversion path (meaning too large values are fine,
                * but NaNs get converted to -128 (purely by luck, as we don't
                * specify nan behavior for the max there) instead of 0).
                *
                * dEQP has GLES31 tests that expect +inf -> 255.0.
                */
               if (dst_type.sign) {
                  tmp[j] = lp_build_min(&bld, bld.one, src[j]);
               }
               else {
                  if (1) {
                     tmp[j] = lp_build_min_ext(&bld, bld.one, src[j],
                                               GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
                  } else {
                     tmp[j] = src[j];
                  }
               }
               tmp[j] = LLVMBuildFMul(builder, tmp[j], const_scale, "");
               tmp[j] = lp_build_iround(&bld, tmp[j]);
            }
         } else {
            for (j = 0; j < dst_type.length / 4; ++j) {
               if (!dst_type.sign) {
                  /*
                   * Pack clamp is always signed->unsigned (or signed->signed).
                   * Hence need min.
                   */
                  LLVMValueRef const_max;
                  const_max = lp_build_const_int_vec(gallivm, src_type, 255);
                  tmp[j] = lp_build_min(&bld, src[j], const_max);
               } else {
                  tmp[j] = src[j];
               }
            }
         }

         if (num_srcs == 1) {
            tmp[1] = tmp[0];
         }

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);

         if (num_srcs < 4) {
            hi = lo;
         }
         else {
            hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         }
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
      }
      if (num_srcs < 4) {
         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
      }

      return;
   }

   /* Special case 2x8x32 --> 1x16x8, 1x8x32 -> 1x8x8
    */
   else if (src_type.norm == 0 &&
            src_type.width == 32 &&
            src_type.length == 8 &&
            src_type.fixed == 0 &&

            dst_type.floating == 0 &&
            dst_type.fixed == 0 &&
            dst_type.width == 8 &&

            ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
             (src_type.floating == 0 && dst_type.floating == 0 &&
              src_type.sign == dst_type.sign && dst_type.norm == 0)) &&

            ((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
             (num_dsts == 1 && dst_type.length * num_srcs == 8)) &&

            util_get_cpu_caps()->has_avx) {

      struct lp_build_context bld;
      struct lp_type int16_type, int32_type;
      struct lp_type dst_type_ext = dst_type;
      LLVMValueRef const_scale;
      unsigned i;

      lp_build_context_init(&bld, gallivm, src_type);

      dst_type_ext.length = 16;
      int16_type = int32_type = dst_type_ext;

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));

      for (i = 0; i < num_dsts; ++i, src += 2) {
         unsigned j;
         for (j = 0; j < (num_srcs == 1 ? 1 : 2); j++) {
            LLVMValueRef lo, hi, a;

            a = src[j];
            if (src_type.floating) {
               if (dst_type.sign) {
                  a = lp_build_min(&bld, bld.one, a);
               }
               else {
                  if (1) {
                     a = lp_build_min_ext(&bld, bld.one, a,
                                          GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
                  }
               }
               a = LLVMBuildFMul(builder, a, const_scale, "");
               a = lp_build_iround(&bld, a);
            } else {
               if (!dst_type.sign) {
                  LLVMValueRef const_max;
                  const_max = lp_build_const_int_vec(gallivm, src_type, 255);
                  a = lp_build_min(&bld, a, const_max);
               }
            }
            lo = lp_build_extract_range(gallivm, a, 0, 4);
            hi = lp_build_extract_range(gallivm, a, 4, 4);
            /* relying on clamping behavior of sse2 intrinsics here */
            tmp[j] = lp_build_pack2(gallivm, int32_type, int16_type, lo, hi);
         }

         if (num_srcs == 1) {
            tmp[1] = tmp[0];
         }
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, tmp[0], tmp[1]);
      }

      if (num_srcs == 1) {
         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
      }

      return;
   }

   /* Special case -> 16bit half-float
    */
   else if (dst_type.floating && dst_type.width == 16)
   {
      /* Only support src as 32bit float currently */
      assert(src_type.floating && src_type.width == 32);

      for(i = 0; i < num_tmps; ++i)
         dst[i] = lp_build_float_to_half(gallivm, tmp[i]);

      return;
   }

   /* Pre-convert half-floats to floats
    */
   else if (src_type.floating && src_type.width == 16)
   {
      for(i = 0; i < num_tmps; ++i)
         tmp[i] = lp_build_half_to_float(gallivm, tmp[i]);

      tmp_type.width = 32;
   }

   /*
    * Clamp if necessary
    */

   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
      struct lp_build_context bld;
      double src_min = lp_const_min(src_type);
      double dst_min = lp_const_min(dst_type);
      double src_max = lp_const_max(src_type);
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;

      lp_build_context_init(&bld, gallivm, tmp_type);

      if(src_min < dst_min) {
         if(dst_min == 0.0)
            thres = bld.zero;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_min);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_max(&bld, tmp[i], thres);
      }

      if(src_max > dst_max) {
         if(dst_max == 1.0)
            thres = bld.one;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_max);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_min(&bld, tmp[i], thres);
      }
   }

   /*
    * Scale to the narrowest range
    */

   if(dst_type.floating) {
      /* Nothing to do */
   }
   else if(tmp_type.floating) {
      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
         }
         tmp_type.floating = FALSE;
      }
      else {
         double dst_scale = lp_const_scale(dst_type);

         if (dst_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /*
          * These functions will use fptosi in some form, which won't work
          * with a 32-bit uint dst. Causes lp_test_conv failures though.
          */
         if (0)
            assert(dst_type.sign || dst_type.width < 32);

         if (dst_type.sign && dst_type.norm && !dst_type.fixed) {
            struct lp_build_context bld;

            lp_build_context_init(&bld, gallivm, tmp_type);
            for(i = 0; i < num_tmps; ++i) {
               tmp[i] = lp_build_iround(&bld, tmp[i]);
            }
            tmp_type.floating = FALSE;
         }
         else {
            LLVMTypeRef tmp_vec_type;

            tmp_type.floating = FALSE;
            tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
            for(i = 0; i < num_tmps; ++i) {
#if 0
               if(dst_type.sign)
                  tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
               else
                  tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
#else
               /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
#endif
            }
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);
      struct lp_build_context bld;
      lp_build_context_init(&bld, gallivm, tmp_type);

      /* Compensate for different offsets */
      /* sscaled -> unorm and similar would cause negative shift count, skip */
      if (dst_offset > src_offset && src_type.width > dst_type.width && src_shift > 0) {
         for (i = 0; i < num_tmps; ++i) {
            LLVMValueRef shifted;

            shifted = lp_build_shr_imm(&bld, tmp[i], src_shift - 1);
            tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
         }
      }

      if(src_shift > dst_shift) {
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_shr_imm(&bld, tmp[i], src_shift - dst_shift);
      }
   }

   /*
    * Truncate or expand bit width
    *
    * No data conversion should happen here, although the sign bits are
    * crucial to avoid bad clamping.
    */

   {
      struct lp_type new_type;

      new_type = tmp_type;
      new_type.sign = dst_type.sign;
      new_type.width = dst_type.width;
      new_type.length = dst_type.length;

      /*
       * Note that resize when using packs can sometimes get min/max
       * clamping for free. Should be able to exploit this...
       */
      lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);

      tmp_type = new_type;
      num_tmps = num_dsts;
   }

   /*
    * Scale to the widest range
    */

   if(src_type.floating) {
      /* Nothing to do */
   }
   else if(!src_type.floating && dst_type.floating) {
      if(!src_type.fixed && !src_type.sign && src_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
                                                     src_type.width,
                                                     dst_type,
                                                     tmp[i]);
         }
         tmp_type.floating = TRUE;
      }
      else {
         double src_scale = lp_const_scale(src_type);
         LLVMTypeRef tmp_vec_type;

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = TRUE;
         tmp_type.sign = TRUE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
#endif
         }

         if (src_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /* The formula above will produce values below -1.0 for the most
          * negative value, but everything seems happy with that, hence
          * disabled for now. */
         if (0 && !src_type.fixed && src_type.norm && src_type.sign) {
            struct lp_build_context bld;

            lp_build_context_init(&bld, gallivm, dst_type);
            for(i = 0; i < num_tmps; ++i) {
               tmp[i] = lp_build_max(&bld, tmp[i],
                                     lp_build_const_vec(gallivm, dst_type, -1.0f));
            }
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);
      struct lp_build_context bld;
      lp_build_context_init(&bld, gallivm, tmp_type);

      if (src_shift < dst_shift) {
         LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];

         if (dst_shift - src_shift < dst_type.width) {
            for (i = 0; i < num_tmps; ++i) {
               pre_shift[i] = tmp[i];
               tmp[i] = lp_build_shl_imm(&bld, tmp[i], dst_shift - src_shift);
            }
         }
         else {
            /*
             * This happens for things like sscaled -> unorm conversions. Shift
             * counts equal to the bit width cause undefined results, so hack
             * around it.
             */
            for (i = 0; i < num_tmps; ++i) {
               pre_shift[i] = tmp[i];
               tmp[i] = lp_build_zero(gallivm, dst_type);
            }
         }

         /* Compensate for different offsets */
         if (dst_offset > src_offset) {
            for (i = 0; i < num_tmps; ++i) {
               tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
            }
         }
      }
   }

   for(i = 0; i < num_dsts; ++i) {
      dst[i] = tmp[i];
      assert(lp_check_value(dst_type, dst[i]));
   }
}
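
/*
 * Typical use of lp_build_conv() above (a hypothetical caller, for
 * illustration only): converting four registers of 4 x float32 in [0,1]
 * into one register of 16 x unorm8, which hits the 4x4x32 --> 1x16x8
 * special case on SSE2/AltiVec:
 *
 *    struct lp_type f32_type = lp_type_float_vec(32, 128);
 *    struct lp_type un8_type = { 0 };
 *    un8_type.width = 8;
 *    un8_type.length = 16;
 *    un8_type.norm = TRUE;
 *
 *    LLVMValueRef src[4];   // four 4 x float32 values
 *    LLVMValueRef dst[1];
 *    lp_build_conv(gallivm, f32_type, un8_type, src, 4, dst, 1);
 */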


/**
 * Bit mask conversion.
 *
 * This will convert the integer masks that match the given types.
 *
 * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
 * Any other value will likely cause unpredictable results.
 *
 * This is basically a very trimmed down version of lp_build_conv.
 */
void
lp_build_conv_mask(struct gallivm_state *gallivm,
                   struct lp_type src_type,
                   struct lp_type dst_type,
                   const LLVMValueRef *src, unsigned num_srcs,
                   LLVMValueRef *dst, unsigned num_dsts)
{

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /*
    * Drop the float, fixed and norm flags and treat everything as signed
    * integers, since we assume all values are 0 or -1.
    */

   src_type.floating = FALSE;
   src_type.fixed = FALSE;
   src_type.sign = TRUE;
   src_type.norm = FALSE;

   dst_type.floating = FALSE;
   dst_type.fixed = FALSE;
   dst_type.sign = TRUE;
   dst_type.norm = FALSE;

   /*
    * Truncate or expand bit width
    */

   lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
}