1bf215546Sopenharmony_ci/************************************************************************** 2bf215546Sopenharmony_ci * 3bf215546Sopenharmony_ci * Copyright 2009 VMware, Inc. 4bf215546Sopenharmony_ci * All Rights Reserved. 5bf215546Sopenharmony_ci * 6bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 7bf215546Sopenharmony_ci * copy of this software and associated documentation files (the 8bf215546Sopenharmony_ci * "Software"), to deal in the Software without restriction, including 9bf215546Sopenharmony_ci * without limitation the rights to use, copy, modify, merge, publish, 10bf215546Sopenharmony_ci * distribute, sub license, and/or sell copies of the Software, and to 11bf215546Sopenharmony_ci * permit persons to whom the Software is furnished to do so, subject to 12bf215546Sopenharmony_ci * the following conditions: 13bf215546Sopenharmony_ci * 14bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the 15bf215546Sopenharmony_ci * next paragraph) shall be included in all copies or substantial portions 16bf215546Sopenharmony_ci * of the Software. 17bf215546Sopenharmony_ci * 18bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19bf215546Sopenharmony_ci * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20bf215546Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21bf215546Sopenharmony_ci * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22bf215546Sopenharmony_ci * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23bf215546Sopenharmony_ci * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24bf215546Sopenharmony_ci * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for type conversions.
 *
 * We want to use the fastest type for a given computation whenever feasible.
 * The other side of this is that we need to be able to convert between several
 * types accurately and efficiently.
 *
 * Conversion between types of different bit width is quite complex.
 *
 * To remember there are a few invariants in type conversions:
 *
 * - register width must remain constant:
 *
 *     src_type.width * src_type.length == dst_type.width * dst_type.length
 *
 * - total number of elements must remain constant:
 *
 *     src_type.length * num_srcs == dst_type.length * num_dsts
 *
 * It is not always possible to do the conversion both accurately and
 * efficiently, usually due to lack of adequate machine instructions. In these
 * cases it is important not to take shortcuts here and sacrifice accuracy, as
 * these functions can be used anywhere.
In the future we might have a 53bf215546Sopenharmony_ci * precision parameter which can gauge the accuracy vs efficiency compromise, 54bf215546Sopenharmony_ci * but for now if the data conversion between two stages happens to be the 55bf215546Sopenharmony_ci * bottleneck, then most likely should just avoid converting at all and run 56bf215546Sopenharmony_ci * both stages with the same type. 57bf215546Sopenharmony_ci * 58bf215546Sopenharmony_ci * Make sure to run lp_test_conv unit test after any change to this file. 59bf215546Sopenharmony_ci * 60bf215546Sopenharmony_ci * @author Jose Fonseca <jfonseca@vmware.com> 61bf215546Sopenharmony_ci */ 62bf215546Sopenharmony_ci 63bf215546Sopenharmony_ci 64bf215546Sopenharmony_ci#include "util/u_debug.h" 65bf215546Sopenharmony_ci#include "util/u_math.h" 66bf215546Sopenharmony_ci#include "util/half_float.h" 67bf215546Sopenharmony_ci#include "util/u_cpu_detect.h" 68bf215546Sopenharmony_ci 69bf215546Sopenharmony_ci#include "lp_bld_type.h" 70bf215546Sopenharmony_ci#include "lp_bld_const.h" 71bf215546Sopenharmony_ci#include "lp_bld_arit.h" 72bf215546Sopenharmony_ci#include "lp_bld_bitarit.h" 73bf215546Sopenharmony_ci#include "lp_bld_pack.h" 74bf215546Sopenharmony_ci#include "lp_bld_conv.h" 75bf215546Sopenharmony_ci#include "lp_bld_logic.h" 76bf215546Sopenharmony_ci#include "lp_bld_intr.h" 77bf215546Sopenharmony_ci#include "lp_bld_printf.h" 78bf215546Sopenharmony_ci#include "lp_bld_format.h" 79bf215546Sopenharmony_ci 80bf215546Sopenharmony_ci 81bf215546Sopenharmony_ci/* the lp_test_format test fails on mingw/i686 at -O2 with gcc 10.x 82bf215546Sopenharmony_ci * ref https://gitlab.freedesktop.org/mesa/mesa/-/issues/3906 83bf215546Sopenharmony_ci */ 84bf215546Sopenharmony_ci 85bf215546Sopenharmony_ci#if defined(__MINGW32__) && !defined(__MINGW64__) && (__GNUC__ == 10) 86bf215546Sopenharmony_ci#warning "disabling caller-saves optimization for this file to work around compiler bug" 87bf215546Sopenharmony_ci#pragma GCC 
optimize("-fno-caller-saves") 88bf215546Sopenharmony_ci#endif 89bf215546Sopenharmony_ci 90bf215546Sopenharmony_ci/** 91bf215546Sopenharmony_ci * Converts int16 half-float to float32 92bf215546Sopenharmony_ci * Note this can be performed in 1 instruction if vcvtph2ps exists (f16c/cvt16) 93bf215546Sopenharmony_ci * [llvm.x86.vcvtph2ps / _mm_cvtph_ps] 94bf215546Sopenharmony_ci * 95bf215546Sopenharmony_ci * @param src value to convert 96bf215546Sopenharmony_ci * 97bf215546Sopenharmony_ci */ 98bf215546Sopenharmony_ciLLVMValueRef 99bf215546Sopenharmony_cilp_build_half_to_float(struct gallivm_state *gallivm, 100bf215546Sopenharmony_ci LLVMValueRef src) 101bf215546Sopenharmony_ci{ 102bf215546Sopenharmony_ci LLVMBuilderRef builder = gallivm->builder; 103bf215546Sopenharmony_ci LLVMTypeRef src_type = LLVMTypeOf(src); 104bf215546Sopenharmony_ci unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ? 105bf215546Sopenharmony_ci LLVMGetVectorSize(src_type) : 1; 106bf215546Sopenharmony_ci 107bf215546Sopenharmony_ci struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length); 108bf215546Sopenharmony_ci struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length); 109bf215546Sopenharmony_ci LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type); 110bf215546Sopenharmony_ci LLVMValueRef h; 111bf215546Sopenharmony_ci 112bf215546Sopenharmony_ci if (util_get_cpu_caps()->has_f16c && 113bf215546Sopenharmony_ci (src_length == 4 || src_length == 8)) { 114bf215546Sopenharmony_ci if (LLVM_VERSION_MAJOR < 11) { 115bf215546Sopenharmony_ci const char *intrinsic = NULL; 116bf215546Sopenharmony_ci if (src_length == 4) { 117bf215546Sopenharmony_ci src = lp_build_pad_vector(gallivm, src, 8); 118bf215546Sopenharmony_ci intrinsic = "llvm.x86.vcvtph2ps.128"; 119bf215546Sopenharmony_ci } 120bf215546Sopenharmony_ci else { 121bf215546Sopenharmony_ci intrinsic = "llvm.x86.vcvtph2ps.256"; 122bf215546Sopenharmony_ci } 123bf215546Sopenharmony_ci src = LLVMBuildBitCast(builder, 
src, 124bf215546Sopenharmony_ci LLVMVectorType(LLVMInt16TypeInContext(gallivm->context), 8), ""); 125bf215546Sopenharmony_ci return lp_build_intrinsic_unary(builder, intrinsic, 126bf215546Sopenharmony_ci lp_build_vec_type(gallivm, f32_type), src); 127bf215546Sopenharmony_ci } else { 128bf215546Sopenharmony_ci /* 129bf215546Sopenharmony_ci * XXX: could probably use on other archs as well. 130bf215546Sopenharmony_ci * But if the cpu doesn't support it natively it looks like the backends still 131bf215546Sopenharmony_ci * can't lower it and will try to call out to external libraries, which will crash. 132bf215546Sopenharmony_ci */ 133bf215546Sopenharmony_ci /* 134bf215546Sopenharmony_ci * XXX: lp_build_vec_type() would use int16 vector. Probably need to revisit 135bf215546Sopenharmony_ci * this at some point. 136bf215546Sopenharmony_ci */ 137bf215546Sopenharmony_ci src = LLVMBuildBitCast(builder, src, 138bf215546Sopenharmony_ci LLVMVectorType(LLVMHalfTypeInContext(gallivm->context), src_length), ""); 139bf215546Sopenharmony_ci return LLVMBuildFPExt(builder, src, lp_build_vec_type(gallivm, f32_type), ""); 140bf215546Sopenharmony_ci } 141bf215546Sopenharmony_ci } 142bf215546Sopenharmony_ci 143bf215546Sopenharmony_ci h = LLVMBuildZExt(builder, src, int_vec_type, ""); 144bf215546Sopenharmony_ci return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true); 145bf215546Sopenharmony_ci} 146bf215546Sopenharmony_ci 147bf215546Sopenharmony_ci 148bf215546Sopenharmony_ci/** 149bf215546Sopenharmony_ci * Converts float32 to int16 half-float 150bf215546Sopenharmony_ci * Note this can be performed in 1 instruction if vcvtps2ph exists (f16c/cvt16) 151bf215546Sopenharmony_ci * [llvm.x86.vcvtps2ph / _mm_cvtps_ph] 152bf215546Sopenharmony_ci * 153bf215546Sopenharmony_ci * @param src value to convert 154bf215546Sopenharmony_ci * 155bf215546Sopenharmony_ci * Convert float32 to half floats, preserving Infs and NaNs, 156bf215546Sopenharmony_ci * with rounding towards zero (trunc). 
157bf215546Sopenharmony_ci * XXX: For GL, would prefer rounding towards nearest(-even). 158bf215546Sopenharmony_ci */ 159bf215546Sopenharmony_ciLLVMValueRef 160bf215546Sopenharmony_cilp_build_float_to_half(struct gallivm_state *gallivm, 161bf215546Sopenharmony_ci LLVMValueRef src) 162bf215546Sopenharmony_ci{ 163bf215546Sopenharmony_ci LLVMBuilderRef builder = gallivm->builder; 164bf215546Sopenharmony_ci LLVMTypeRef f32_vec_type = LLVMTypeOf(src); 165bf215546Sopenharmony_ci unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind 166bf215546Sopenharmony_ci ? LLVMGetVectorSize(f32_vec_type) : 1; 167bf215546Sopenharmony_ci struct lp_type i32_type = lp_type_int_vec(32, 32 * length); 168bf215546Sopenharmony_ci struct lp_type i16_type = lp_type_int_vec(16, 16 * length); 169bf215546Sopenharmony_ci LLVMValueRef result; 170bf215546Sopenharmony_ci 171bf215546Sopenharmony_ci /* 172bf215546Sopenharmony_ci * Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits 173bf215546Sopenharmony_ci * directly, without any (x86 or generic) intrinsics. 174bf215546Sopenharmony_ci * Albeit the rounding mode cannot be specified (and is undefined, 175bf215546Sopenharmony_ci * though in practice on x86 seems to do nearest-even but it may 176bf215546Sopenharmony_ci * be dependent on instruction set support), so is essentially 177bf215546Sopenharmony_ci * useless. 
178bf215546Sopenharmony_ci */ 179bf215546Sopenharmony_ci 180bf215546Sopenharmony_ci if (util_get_cpu_caps()->has_f16c && 181bf215546Sopenharmony_ci (length == 4 || length == 8)) { 182bf215546Sopenharmony_ci struct lp_type i168_type = lp_type_int_vec(16, 16 * 8); 183bf215546Sopenharmony_ci unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */ 184bf215546Sopenharmony_ci LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); 185bf215546Sopenharmony_ci const char *intrinsic = NULL; 186bf215546Sopenharmony_ci if (length == 4) { 187bf215546Sopenharmony_ci intrinsic = "llvm.x86.vcvtps2ph.128"; 188bf215546Sopenharmony_ci } 189bf215546Sopenharmony_ci else { 190bf215546Sopenharmony_ci intrinsic = "llvm.x86.vcvtps2ph.256"; 191bf215546Sopenharmony_ci } 192bf215546Sopenharmony_ci result = lp_build_intrinsic_binary(builder, intrinsic, 193bf215546Sopenharmony_ci lp_build_vec_type(gallivm, i168_type), 194bf215546Sopenharmony_ci src, LLVMConstInt(i32t, mode, 0)); 195bf215546Sopenharmony_ci if (length == 4) { 196bf215546Sopenharmony_ci result = lp_build_extract_range(gallivm, result, 0, 4); 197bf215546Sopenharmony_ci } 198bf215546Sopenharmony_ci result = LLVMBuildBitCast(builder, result, lp_build_vec_type(gallivm, lp_type_float_vec(16, 16 * length)), ""); 199bf215546Sopenharmony_ci } 200bf215546Sopenharmony_ci 201bf215546Sopenharmony_ci else { 202bf215546Sopenharmony_ci result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true); 203bf215546Sopenharmony_ci /* Convert int32 vector to int16 vector by trunc (might generate bad code) */ 204bf215546Sopenharmony_ci result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), ""); 205bf215546Sopenharmony_ci } 206bf215546Sopenharmony_ci 207bf215546Sopenharmony_ci /* 208bf215546Sopenharmony_ci * Debugging code. 
209bf215546Sopenharmony_ci */ 210bf215546Sopenharmony_ci if (0) { 211bf215546Sopenharmony_ci LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); 212bf215546Sopenharmony_ci LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context); 213bf215546Sopenharmony_ci LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context); 214bf215546Sopenharmony_ci LLVMValueRef ref_result = LLVMGetUndef(LLVMVectorType(i16t, length)); 215bf215546Sopenharmony_ci unsigned i; 216bf215546Sopenharmony_ci 217bf215546Sopenharmony_ci LLVMTypeRef func_type = LLVMFunctionType(i16t, &f32t, 1, 0); 218bf215546Sopenharmony_ci LLVMValueRef func = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)_mesa_float_to_half)); 219bf215546Sopenharmony_ci func = LLVMBuildBitCast(builder, func, LLVMPointerType(func_type, 0), "_mesa_float_to_half"); 220bf215546Sopenharmony_ci 221bf215546Sopenharmony_ci for (i = 0; i < length; ++i) { 222bf215546Sopenharmony_ci LLVMValueRef index = LLVMConstInt(i32t, i, 0); 223bf215546Sopenharmony_ci LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, ""); 224bf215546Sopenharmony_ci#if 0 225bf215546Sopenharmony_ci /* 226bf215546Sopenharmony_ci * XXX: not really supported by backends. 227bf215546Sopenharmony_ci * Even if they would now, rounding mode cannot be specified and 228bf215546Sopenharmony_ci * is undefined. 
229bf215546Sopenharmony_ci */ 230bf215546Sopenharmony_ci LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32); 231bf215546Sopenharmony_ci#else 232bf215546Sopenharmony_ci LLVMValueRef f16 = LLVMBuildCall2(builder, func_type, func, &f32, 1, ""); 233bf215546Sopenharmony_ci#endif 234bf215546Sopenharmony_ci ref_result = LLVMBuildInsertElement(builder, ref_result, f16, index, ""); 235bf215546Sopenharmony_ci } 236bf215546Sopenharmony_ci 237bf215546Sopenharmony_ci lp_build_print_value(gallivm, "src = ", src); 238bf215546Sopenharmony_ci lp_build_print_value(gallivm, "llvm = ", result); 239bf215546Sopenharmony_ci lp_build_print_value(gallivm, "util = ", ref_result); 240bf215546Sopenharmony_ci lp_build_printf(gallivm, "\n"); 241bf215546Sopenharmony_ci } 242bf215546Sopenharmony_ci 243bf215546Sopenharmony_ci return result; 244bf215546Sopenharmony_ci} 245bf215546Sopenharmony_ci 246bf215546Sopenharmony_ci 247bf215546Sopenharmony_ci/** 248bf215546Sopenharmony_ci * Special case for converting clamped IEEE-754 floats to unsigned norms. 249bf215546Sopenharmony_ci * 250bf215546Sopenharmony_ci * The mathematical voodoo below may seem excessive but it is actually 251bf215546Sopenharmony_ci * paramount we do it this way for several reasons. First, there is no single 252bf215546Sopenharmony_ci * precision FP to unsigned integer conversion Intel SSE instruction. Second, 253bf215546Sopenharmony_ci * secondly, even if there was, since the FP's mantissa takes only a fraction 254bf215546Sopenharmony_ci * of register bits the typically scale and cast approach would require double 255bf215546Sopenharmony_ci * precision for accurate results, and therefore half the throughput 256bf215546Sopenharmony_ci * 257bf215546Sopenharmony_ci * Although the result values can be scaled to an arbitrary bit width specified 258bf215546Sopenharmony_ci * by dst_width, the actual result type will have the same width. 
259bf215546Sopenharmony_ci * 260bf215546Sopenharmony_ci * Ex: src = { float, float, float, float } 261bf215546Sopenharmony_ci * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1]. 262bf215546Sopenharmony_ci */ 263bf215546Sopenharmony_ciLLVMValueRef 264bf215546Sopenharmony_cilp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm, 265bf215546Sopenharmony_ci struct lp_type src_type, 266bf215546Sopenharmony_ci unsigned dst_width, 267bf215546Sopenharmony_ci LLVMValueRef src) 268bf215546Sopenharmony_ci{ 269bf215546Sopenharmony_ci LLVMBuilderRef builder = gallivm->builder; 270bf215546Sopenharmony_ci LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type); 271bf215546Sopenharmony_ci LLVMValueRef res; 272bf215546Sopenharmony_ci unsigned mantissa; 273bf215546Sopenharmony_ci 274bf215546Sopenharmony_ci assert(src_type.floating); 275bf215546Sopenharmony_ci assert(dst_width <= src_type.width); 276bf215546Sopenharmony_ci src_type.sign = FALSE; 277bf215546Sopenharmony_ci 278bf215546Sopenharmony_ci mantissa = lp_mantissa(src_type); 279bf215546Sopenharmony_ci 280bf215546Sopenharmony_ci if (dst_width <= mantissa) { 281bf215546Sopenharmony_ci /* 282bf215546Sopenharmony_ci * Apply magic coefficients that will make the desired result to appear 283bf215546Sopenharmony_ci * in the lowest significant bits of the mantissa, with correct rounding. 284bf215546Sopenharmony_ci * 285bf215546Sopenharmony_ci * This only works if the destination width fits in the mantissa. 
286bf215546Sopenharmony_ci */ 287bf215546Sopenharmony_ci 288bf215546Sopenharmony_ci unsigned long long ubound; 289bf215546Sopenharmony_ci unsigned long long mask; 290bf215546Sopenharmony_ci double scale; 291bf215546Sopenharmony_ci double bias; 292bf215546Sopenharmony_ci 293bf215546Sopenharmony_ci ubound = (1ULL << dst_width); 294bf215546Sopenharmony_ci mask = ubound - 1; 295bf215546Sopenharmony_ci scale = (double)mask/ubound; 296bf215546Sopenharmony_ci bias = (double)(1ULL << (mantissa - dst_width)); 297bf215546Sopenharmony_ci 298bf215546Sopenharmony_ci res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), ""); 299bf215546Sopenharmony_ci /* instead of fadd/and could (with sse2) just use lp_build_iround */ 300bf215546Sopenharmony_ci res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), ""); 301bf215546Sopenharmony_ci res = LLVMBuildBitCast(builder, res, int_vec_type, ""); 302bf215546Sopenharmony_ci res = LLVMBuildAnd(builder, res, 303bf215546Sopenharmony_ci lp_build_const_int_vec(gallivm, src_type, mask), ""); 304bf215546Sopenharmony_ci } 305bf215546Sopenharmony_ci else if (dst_width == (mantissa + 1)) { 306bf215546Sopenharmony_ci /* 307bf215546Sopenharmony_ci * The destination width matches exactly what can be represented in 308bf215546Sopenharmony_ci * floating point (i.e., mantissa + 1 bits). Even so correct rounding 309bf215546Sopenharmony_ci * still needs to be applied (only for numbers in [0.5-1.0] would 310bf215546Sopenharmony_ci * conversion using truncation after scaling be sufficient). 
311bf215546Sopenharmony_ci */ 312bf215546Sopenharmony_ci double scale; 313bf215546Sopenharmony_ci struct lp_build_context uf32_bld; 314bf215546Sopenharmony_ci 315bf215546Sopenharmony_ci lp_build_context_init(&uf32_bld, gallivm, src_type); 316bf215546Sopenharmony_ci scale = (double)((1ULL << dst_width) - 1); 317bf215546Sopenharmony_ci 318bf215546Sopenharmony_ci res = LLVMBuildFMul(builder, src, 319bf215546Sopenharmony_ci lp_build_const_vec(gallivm, src_type, scale), ""); 320bf215546Sopenharmony_ci res = lp_build_iround(&uf32_bld, res); 321bf215546Sopenharmony_ci } 322bf215546Sopenharmony_ci else { 323bf215546Sopenharmony_ci /* 324bf215546Sopenharmony_ci * The destination exceeds what can be represented in the floating point. 325bf215546Sopenharmony_ci * So multiply by the largest power two we get away with, and when 326bf215546Sopenharmony_ci * subtract the most significant bit to rescale to normalized values. 327bf215546Sopenharmony_ci * 328bf215546Sopenharmony_ci * The largest power of two factor we can get away is 329bf215546Sopenharmony_ci * (1 << (src_type.width - 1)), because we need to use signed . In theory it 330bf215546Sopenharmony_ci * should be (1 << (src_type.width - 2)), but IEEE 754 rules states 331bf215546Sopenharmony_ci * INT_MIN should be returned in FPToSI, which is the correct result for 332bf215546Sopenharmony_ci * values near 1.0! 333bf215546Sopenharmony_ci * 334bf215546Sopenharmony_ci * This means we get (src_type.width - 1) correct bits for values near 0.0, 335bf215546Sopenharmony_ci * and (mantissa + 1) correct bits for values near 1.0. Equally or more 336bf215546Sopenharmony_ci * important, we also get exact results for 0.0 and 1.0. 
337bf215546Sopenharmony_ci */ 338bf215546Sopenharmony_ci 339bf215546Sopenharmony_ci unsigned n = MIN2(src_type.width - 1u, dst_width); 340bf215546Sopenharmony_ci 341bf215546Sopenharmony_ci double scale = (double)(1ULL << n); 342bf215546Sopenharmony_ci unsigned lshift = dst_width - n; 343bf215546Sopenharmony_ci unsigned rshift = n; 344bf215546Sopenharmony_ci LLVMValueRef lshifted; 345bf215546Sopenharmony_ci LLVMValueRef rshifted; 346bf215546Sopenharmony_ci 347bf215546Sopenharmony_ci res = LLVMBuildFMul(builder, src, 348bf215546Sopenharmony_ci lp_build_const_vec(gallivm, src_type, scale), ""); 349bf215546Sopenharmony_ci if (!src_type.sign && src_type.width == 32) 350bf215546Sopenharmony_ci res = LLVMBuildFPToUI(builder, res, int_vec_type, ""); 351bf215546Sopenharmony_ci else 352bf215546Sopenharmony_ci res = LLVMBuildFPToSI(builder, res, int_vec_type, ""); 353bf215546Sopenharmony_ci 354bf215546Sopenharmony_ci /* 355bf215546Sopenharmony_ci * Align the most significant bit to its final place. 356bf215546Sopenharmony_ci * 357bf215546Sopenharmony_ci * This will cause 1.0 to overflow to 0, but the later adjustment will 358bf215546Sopenharmony_ci * get it right. 359bf215546Sopenharmony_ci */ 360bf215546Sopenharmony_ci if (lshift) { 361bf215546Sopenharmony_ci lshifted = LLVMBuildShl(builder, res, 362bf215546Sopenharmony_ci lp_build_const_int_vec(gallivm, src_type, 363bf215546Sopenharmony_ci lshift), ""); 364bf215546Sopenharmony_ci } else { 365bf215546Sopenharmony_ci lshifted = res; 366bf215546Sopenharmony_ci } 367bf215546Sopenharmony_ci 368bf215546Sopenharmony_ci /* 369bf215546Sopenharmony_ci * Align the most significant bit to the right. 
370bf215546Sopenharmony_ci */ 371bf215546Sopenharmony_ci rshifted = LLVMBuildLShr(builder, res, 372bf215546Sopenharmony_ci lp_build_const_int_vec(gallivm, src_type, rshift), 373bf215546Sopenharmony_ci ""); 374bf215546Sopenharmony_ci 375bf215546Sopenharmony_ci /* 376bf215546Sopenharmony_ci * Subtract the MSB to the LSB, therefore re-scaling from 377bf215546Sopenharmony_ci * (1 << dst_width) to ((1 << dst_width) - 1). 378bf215546Sopenharmony_ci */ 379bf215546Sopenharmony_ci 380bf215546Sopenharmony_ci res = LLVMBuildSub(builder, lshifted, rshifted, ""); 381bf215546Sopenharmony_ci } 382bf215546Sopenharmony_ci 383bf215546Sopenharmony_ci return res; 384bf215546Sopenharmony_ci} 385bf215546Sopenharmony_ci 386bf215546Sopenharmony_ci 387bf215546Sopenharmony_ci/** 388bf215546Sopenharmony_ci * Inverse of lp_build_clamped_float_to_unsigned_norm above. 389bf215546Sopenharmony_ci * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1] 390bf215546Sopenharmony_ci * return {float, float, float, float} with values in range [0, 1]. 
391bf215546Sopenharmony_ci */ 392bf215546Sopenharmony_ciLLVMValueRef 393bf215546Sopenharmony_cilp_build_unsigned_norm_to_float(struct gallivm_state *gallivm, 394bf215546Sopenharmony_ci unsigned src_width, 395bf215546Sopenharmony_ci struct lp_type dst_type, 396bf215546Sopenharmony_ci LLVMValueRef src) 397bf215546Sopenharmony_ci{ 398bf215546Sopenharmony_ci LLVMBuilderRef builder = gallivm->builder; 399bf215546Sopenharmony_ci LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type); 400bf215546Sopenharmony_ci LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type); 401bf215546Sopenharmony_ci LLVMValueRef bias_; 402bf215546Sopenharmony_ci LLVMValueRef res; 403bf215546Sopenharmony_ci unsigned mantissa; 404bf215546Sopenharmony_ci unsigned n; 405bf215546Sopenharmony_ci unsigned long long ubound; 406bf215546Sopenharmony_ci unsigned long long mask; 407bf215546Sopenharmony_ci double scale; 408bf215546Sopenharmony_ci double bias; 409bf215546Sopenharmony_ci 410bf215546Sopenharmony_ci assert(dst_type.floating); 411bf215546Sopenharmony_ci 412bf215546Sopenharmony_ci mantissa = lp_mantissa(dst_type); 413bf215546Sopenharmony_ci 414bf215546Sopenharmony_ci if (src_width <= (mantissa + 1)) { 415bf215546Sopenharmony_ci /* 416bf215546Sopenharmony_ci * The source width matches fits what can be represented in floating 417bf215546Sopenharmony_ci * point (i.e., mantissa + 1 bits). So do a straight multiplication 418bf215546Sopenharmony_ci * followed by casting. No further rounding is necessary. 
419bf215546Sopenharmony_ci */ 420bf215546Sopenharmony_ci 421bf215546Sopenharmony_ci scale = 1.0/(double)((1ULL << src_width) - 1); 422bf215546Sopenharmony_ci res = LLVMBuildSIToFP(builder, src, vec_type, ""); 423bf215546Sopenharmony_ci res = LLVMBuildFMul(builder, res, 424bf215546Sopenharmony_ci lp_build_const_vec(gallivm, dst_type, scale), ""); 425bf215546Sopenharmony_ci return res; 426bf215546Sopenharmony_ci } 427bf215546Sopenharmony_ci else { 428bf215546Sopenharmony_ci /* 429bf215546Sopenharmony_ci * The source width exceeds what can be represented in floating 430bf215546Sopenharmony_ci * point. So truncate the incoming values. 431bf215546Sopenharmony_ci */ 432bf215546Sopenharmony_ci 433bf215546Sopenharmony_ci n = MIN2(mantissa, src_width); 434bf215546Sopenharmony_ci 435bf215546Sopenharmony_ci ubound = ((unsigned long long)1 << n); 436bf215546Sopenharmony_ci mask = ubound - 1; 437bf215546Sopenharmony_ci scale = (double)ubound/mask; 438bf215546Sopenharmony_ci bias = (double)((unsigned long long)1 << (mantissa - n)); 439bf215546Sopenharmony_ci 440bf215546Sopenharmony_ci res = src; 441bf215546Sopenharmony_ci 442bf215546Sopenharmony_ci if (src_width > mantissa) { 443bf215546Sopenharmony_ci int shift = src_width - mantissa; 444bf215546Sopenharmony_ci res = LLVMBuildLShr(builder, res, 445bf215546Sopenharmony_ci lp_build_const_int_vec(gallivm, dst_type, shift), ""); 446bf215546Sopenharmony_ci } 447bf215546Sopenharmony_ci 448bf215546Sopenharmony_ci bias_ = lp_build_const_vec(gallivm, dst_type, bias); 449bf215546Sopenharmony_ci 450bf215546Sopenharmony_ci res = LLVMBuildOr(builder, 451bf215546Sopenharmony_ci res, 452bf215546Sopenharmony_ci LLVMBuildBitCast(builder, bias_, int_vec_type, ""), ""); 453bf215546Sopenharmony_ci 454bf215546Sopenharmony_ci res = LLVMBuildBitCast(builder, res, vec_type, ""); 455bf215546Sopenharmony_ci 456bf215546Sopenharmony_ci res = LLVMBuildFSub(builder, res, bias_, ""); 457bf215546Sopenharmony_ci res = LLVMBuildFMul(builder, res, 
lp_build_const_vec(gallivm, dst_type, scale), ""); 458bf215546Sopenharmony_ci } 459bf215546Sopenharmony_ci 460bf215546Sopenharmony_ci return res; 461bf215546Sopenharmony_ci} 462bf215546Sopenharmony_ci 463bf215546Sopenharmony_ci 464bf215546Sopenharmony_ci/** 465bf215546Sopenharmony_ci * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used. 466bf215546Sopenharmony_ci * 467bf215546Sopenharmony_ci * Returns the number of dsts created from src 468bf215546Sopenharmony_ci */ 469bf215546Sopenharmony_ciint lp_build_conv_auto(struct gallivm_state *gallivm, 470bf215546Sopenharmony_ci struct lp_type src_type, 471bf215546Sopenharmony_ci struct lp_type* dst_type, 472bf215546Sopenharmony_ci const LLVMValueRef *src, 473bf215546Sopenharmony_ci unsigned num_srcs, 474bf215546Sopenharmony_ci LLVMValueRef *dst) 475bf215546Sopenharmony_ci{ 476bf215546Sopenharmony_ci unsigned i; 477bf215546Sopenharmony_ci int num_dsts = num_srcs; 478bf215546Sopenharmony_ci 479bf215546Sopenharmony_ci if (src_type.floating == dst_type->floating && 480bf215546Sopenharmony_ci src_type.width == dst_type->width && 481bf215546Sopenharmony_ci src_type.length == dst_type->length && 482bf215546Sopenharmony_ci src_type.fixed == dst_type->fixed && 483bf215546Sopenharmony_ci src_type.norm == dst_type->norm && 484bf215546Sopenharmony_ci src_type.sign == dst_type->sign) 485bf215546Sopenharmony_ci return num_dsts; 486bf215546Sopenharmony_ci 487bf215546Sopenharmony_ci /* Special case 4x4x32 -> 1x16x8 or 2x8x32 -> 1x16x8 488bf215546Sopenharmony_ci */ 489bf215546Sopenharmony_ci if (src_type.norm == 0 && 490bf215546Sopenharmony_ci src_type.width == 32 && 491bf215546Sopenharmony_ci src_type.fixed == 0 && 492bf215546Sopenharmony_ci 493bf215546Sopenharmony_ci dst_type->floating == 0 && 494bf215546Sopenharmony_ci dst_type->fixed == 0 && 495bf215546Sopenharmony_ci dst_type->width == 8 && 496bf215546Sopenharmony_ci 497bf215546Sopenharmony_ci ((src_type.floating == 1 && src_type.sign == 1 && dst_type->norm 
== 1) || 498bf215546Sopenharmony_ci (src_type.floating == 0 && dst_type->floating == 0 && 499bf215546Sopenharmony_ci src_type.sign == dst_type->sign && dst_type->norm == 0))) { 500bf215546Sopenharmony_ci 501bf215546Sopenharmony_ci /* Special case 4x4x32 --> 1x16x8 */ 502bf215546Sopenharmony_ci if (src_type.length == 4 && 503bf215546Sopenharmony_ci (util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec)) 504bf215546Sopenharmony_ci { 505bf215546Sopenharmony_ci num_dsts = (num_srcs + 3) / 4; 506bf215546Sopenharmony_ci dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4; 507bf215546Sopenharmony_ci 508bf215546Sopenharmony_ci lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts); 509bf215546Sopenharmony_ci return num_dsts; 510bf215546Sopenharmony_ci } 511bf215546Sopenharmony_ci 512bf215546Sopenharmony_ci /* Special case 2x8x32 --> 1x16x8 */ 513bf215546Sopenharmony_ci if (src_type.length == 8 && 514bf215546Sopenharmony_ci util_get_cpu_caps()->has_avx) 515bf215546Sopenharmony_ci { 516bf215546Sopenharmony_ci num_dsts = (num_srcs + 1) / 2; 517bf215546Sopenharmony_ci dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8; 518bf215546Sopenharmony_ci 519bf215546Sopenharmony_ci lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts); 520bf215546Sopenharmony_ci return num_dsts; 521bf215546Sopenharmony_ci } 522bf215546Sopenharmony_ci } 523bf215546Sopenharmony_ci 524bf215546Sopenharmony_ci /* lp_build_resize does not support M:N */ 525bf215546Sopenharmony_ci if (src_type.width == dst_type->width) { 526bf215546Sopenharmony_ci lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts); 527bf215546Sopenharmony_ci } else { 528bf215546Sopenharmony_ci /* 529bf215546Sopenharmony_ci * If dst_width is 16 bits and src_width 32 and the dst vector size 530bf215546Sopenharmony_ci * 64bit, try feeding 2 vectors at once so pack intrinsics can be used. 
531bf215546Sopenharmony_ci * (For AVX, this isn't needed, since we usually get 256bit src and 532bf215546Sopenharmony_ci * 128bit dst vectors which works ok. If we do AVX2 pack this should 533bf215546Sopenharmony_ci * be extended but need to be able to tell conversion code about pack 534bf215546Sopenharmony_ci * ordering first.) 535bf215546Sopenharmony_ci */ 536bf215546Sopenharmony_ci unsigned ratio = 1; 537bf215546Sopenharmony_ci if (src_type.width == 2 * dst_type->width && 538bf215546Sopenharmony_ci src_type.length == dst_type->length && 539bf215546Sopenharmony_ci dst_type->floating == 0 && (num_srcs % 2 == 0) && 540bf215546Sopenharmony_ci dst_type->width * dst_type->length == 64) { 541bf215546Sopenharmony_ci ratio = 2; 542bf215546Sopenharmony_ci num_dsts /= 2; 543bf215546Sopenharmony_ci dst_type->length *= 2; 544bf215546Sopenharmony_ci } 545bf215546Sopenharmony_ci for (i = 0; i < num_dsts; i++) { 546bf215546Sopenharmony_ci lp_build_conv(gallivm, src_type, *dst_type, &src[i*ratio], ratio, &dst[i], 1); 547bf215546Sopenharmony_ci } 548bf215546Sopenharmony_ci } 549bf215546Sopenharmony_ci 550bf215546Sopenharmony_ci return num_dsts; 551bf215546Sopenharmony_ci} 552bf215546Sopenharmony_ci 553bf215546Sopenharmony_ci 554bf215546Sopenharmony_ci/** 555bf215546Sopenharmony_ci * Generic type conversion. 556bf215546Sopenharmony_ci * 557bf215546Sopenharmony_ci * TODO: Take a precision argument, or even better, add a new precision member 558bf215546Sopenharmony_ci * to the lp_type union. 
559bf215546Sopenharmony_ci */ 560bf215546Sopenharmony_civoid 561bf215546Sopenharmony_cilp_build_conv(struct gallivm_state *gallivm, 562bf215546Sopenharmony_ci struct lp_type src_type, 563bf215546Sopenharmony_ci struct lp_type dst_type, 564bf215546Sopenharmony_ci const LLVMValueRef *src, unsigned num_srcs, 565bf215546Sopenharmony_ci LLVMValueRef *dst, unsigned num_dsts) 566bf215546Sopenharmony_ci{ 567bf215546Sopenharmony_ci LLVMBuilderRef builder = gallivm->builder; 568bf215546Sopenharmony_ci struct lp_type tmp_type; 569bf215546Sopenharmony_ci LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH]; 570bf215546Sopenharmony_ci unsigned num_tmps; 571bf215546Sopenharmony_ci unsigned i; 572bf215546Sopenharmony_ci 573bf215546Sopenharmony_ci /* We must not loose or gain channels. Only precision */ 574bf215546Sopenharmony_ci assert(src_type.length * num_srcs == dst_type.length * num_dsts); 575bf215546Sopenharmony_ci 576bf215546Sopenharmony_ci assert(src_type.length <= LP_MAX_VECTOR_LENGTH); 577bf215546Sopenharmony_ci assert(dst_type.length <= LP_MAX_VECTOR_LENGTH); 578bf215546Sopenharmony_ci assert(num_srcs <= LP_MAX_VECTOR_LENGTH); 579bf215546Sopenharmony_ci assert(num_dsts <= LP_MAX_VECTOR_LENGTH); 580bf215546Sopenharmony_ci 581bf215546Sopenharmony_ci tmp_type = src_type; 582bf215546Sopenharmony_ci for(i = 0; i < num_srcs; ++i) { 583bf215546Sopenharmony_ci assert(lp_check_value(src_type, src[i])); 584bf215546Sopenharmony_ci tmp[i] = src[i]; 585bf215546Sopenharmony_ci } 586bf215546Sopenharmony_ci num_tmps = num_srcs; 587bf215546Sopenharmony_ci 588bf215546Sopenharmony_ci 589bf215546Sopenharmony_ci /* 590bf215546Sopenharmony_ci * Special case 4x4x32 --> 1x16x8, 2x4x32 -> 1x8x8, 1x4x32 -> 1x4x8 591bf215546Sopenharmony_ci * Only float -> s/unorm8 and (u)int32->(u)int8. 592bf215546Sopenharmony_ci * XXX: This should cover all interesting backend cases for 8 bit, 593bf215546Sopenharmony_ci * but should use same strategy if dst is 16 bit. 
594bf215546Sopenharmony_ci */ 595bf215546Sopenharmony_ci if (src_type.norm == 0 && 596bf215546Sopenharmony_ci src_type.width == 32 && 597bf215546Sopenharmony_ci src_type.length == 4 && 598bf215546Sopenharmony_ci src_type.fixed == 0 && 599bf215546Sopenharmony_ci 600bf215546Sopenharmony_ci dst_type.floating == 0 && 601bf215546Sopenharmony_ci dst_type.fixed == 0 && 602bf215546Sopenharmony_ci dst_type.width == 8 && 603bf215546Sopenharmony_ci 604bf215546Sopenharmony_ci ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) || 605bf215546Sopenharmony_ci (src_type.floating == 0 && dst_type.floating == 0 && 606bf215546Sopenharmony_ci src_type.sign == dst_type.sign && dst_type.norm == 0)) && 607bf215546Sopenharmony_ci 608bf215546Sopenharmony_ci ((dst_type.length == 16 && 4 * num_dsts == num_srcs) || 609bf215546Sopenharmony_ci (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) && 610bf215546Sopenharmony_ci 611bf215546Sopenharmony_ci (util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec)) 612bf215546Sopenharmony_ci { 613bf215546Sopenharmony_ci struct lp_build_context bld; 614bf215546Sopenharmony_ci struct lp_type int16_type, int32_type; 615bf215546Sopenharmony_ci struct lp_type dst_type_ext = dst_type; 616bf215546Sopenharmony_ci LLVMValueRef const_scale; 617bf215546Sopenharmony_ci unsigned i, j; 618bf215546Sopenharmony_ci 619bf215546Sopenharmony_ci lp_build_context_init(&bld, gallivm, src_type); 620bf215546Sopenharmony_ci 621bf215546Sopenharmony_ci dst_type_ext.length = 16; 622bf215546Sopenharmony_ci int16_type = int32_type = dst_type_ext; 623bf215546Sopenharmony_ci 624bf215546Sopenharmony_ci int16_type.width *= 2; 625bf215546Sopenharmony_ci int16_type.length /= 2; 626bf215546Sopenharmony_ci int16_type.sign = 1; 627bf215546Sopenharmony_ci 628bf215546Sopenharmony_ci int32_type.width *= 4; 629bf215546Sopenharmony_ci int32_type.length /= 4; 630bf215546Sopenharmony_ci int32_type.sign = 1; 631bf215546Sopenharmony_ci 
632bf215546Sopenharmony_ci const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type)); 633bf215546Sopenharmony_ci 634bf215546Sopenharmony_ci for (i = 0; i < num_dsts; ++i, src += 4) { 635bf215546Sopenharmony_ci LLVMValueRef lo, hi; 636bf215546Sopenharmony_ci 637bf215546Sopenharmony_ci if (src_type.floating) { 638bf215546Sopenharmony_ci for (j = 0; j < dst_type.length / 4; ++j) { 639bf215546Sopenharmony_ci /* 640bf215546Sopenharmony_ci * XXX This is not actually fully correct. The float to int 641bf215546Sopenharmony_ci * conversion will produce 0x80000000 value for everything 642bf215546Sopenharmony_ci * out of range and NaNs (on x86, llvm.x86.sse2.cvtps2dq). 643bf215546Sopenharmony_ci * Hence, NaNs and negatives will get clamped just fine to zero 644bf215546Sopenharmony_ci * (relying on clamping pack behavior) when converting to unorm, 645bf215546Sopenharmony_ci * however too large values (both finite and infinite) will also 646bf215546Sopenharmony_ci * end up as zero, not 255. 647bf215546Sopenharmony_ci * For snorm, for now we'll keep bug compatibility with generic 648bf215546Sopenharmony_ci * conversion path (meaning too large values are fine, but 649bf215546Sopenharmony_ci * NaNs get converted to -128 (purely by luck, as we don't 650bf215546Sopenharmony_ci * specify nan behavior for the max there) instead of 0). 651bf215546Sopenharmony_ci * 652bf215546Sopenharmony_ci * dEQP has GLES31 tests that expect +inf -> 255.0. 
653bf215546Sopenharmony_ci */ 654bf215546Sopenharmony_ci if (dst_type.sign) { 655bf215546Sopenharmony_ci tmp[j] = lp_build_min(&bld, bld.one, src[j]); 656bf215546Sopenharmony_ci 657bf215546Sopenharmony_ci } 658bf215546Sopenharmony_ci else { 659bf215546Sopenharmony_ci if (1) { 660bf215546Sopenharmony_ci tmp[j] = lp_build_min_ext(&bld, bld.one, src[j], 661bf215546Sopenharmony_ci GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN); 662bf215546Sopenharmony_ci } 663bf215546Sopenharmony_ci tmp[j] = src[j]; 664bf215546Sopenharmony_ci } 665bf215546Sopenharmony_ci tmp[j] = LLVMBuildFMul(builder, tmp[j], const_scale, ""); 666bf215546Sopenharmony_ci tmp[j] = lp_build_iround(&bld, tmp[j]); 667bf215546Sopenharmony_ci } 668bf215546Sopenharmony_ci } else { 669bf215546Sopenharmony_ci for (j = 0; j < dst_type.length / 4; ++j) { 670bf215546Sopenharmony_ci if (!dst_type.sign) { 671bf215546Sopenharmony_ci /* 672bf215546Sopenharmony_ci * Pack clamp is always signed->unsigned (or signed->signed). 673bf215546Sopenharmony_ci * Hence need min. 
674bf215546Sopenharmony_ci */ 675bf215546Sopenharmony_ci LLVMValueRef const_max; 676bf215546Sopenharmony_ci const_max = lp_build_const_int_vec(gallivm, src_type, 255); 677bf215546Sopenharmony_ci tmp[j] = lp_build_min(&bld, src[j], const_max); 678bf215546Sopenharmony_ci } else { 679bf215546Sopenharmony_ci tmp[j] = src[j]; 680bf215546Sopenharmony_ci } 681bf215546Sopenharmony_ci } 682bf215546Sopenharmony_ci } 683bf215546Sopenharmony_ci 684bf215546Sopenharmony_ci if (num_srcs == 1) { 685bf215546Sopenharmony_ci tmp[1] = tmp[0]; 686bf215546Sopenharmony_ci } 687bf215546Sopenharmony_ci 688bf215546Sopenharmony_ci /* relying on clamping behavior of sse2 intrinsics here */ 689bf215546Sopenharmony_ci lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]); 690bf215546Sopenharmony_ci 691bf215546Sopenharmony_ci if (num_srcs < 4) { 692bf215546Sopenharmony_ci hi = lo; 693bf215546Sopenharmony_ci } 694bf215546Sopenharmony_ci else { 695bf215546Sopenharmony_ci hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]); 696bf215546Sopenharmony_ci } 697bf215546Sopenharmony_ci dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi); 698bf215546Sopenharmony_ci } 699bf215546Sopenharmony_ci if (num_srcs < 4) { 700bf215546Sopenharmony_ci dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length); 701bf215546Sopenharmony_ci } 702bf215546Sopenharmony_ci 703bf215546Sopenharmony_ci return; 704bf215546Sopenharmony_ci } 705bf215546Sopenharmony_ci 706bf215546Sopenharmony_ci /* Special case 2x8x32 --> 1x16x8, 1x8x32 ->1x8x8 707bf215546Sopenharmony_ci */ 708bf215546Sopenharmony_ci else if (src_type.norm == 0 && 709bf215546Sopenharmony_ci src_type.width == 32 && 710bf215546Sopenharmony_ci src_type.length == 8 && 711bf215546Sopenharmony_ci src_type.fixed == 0 && 712bf215546Sopenharmony_ci 713bf215546Sopenharmony_ci dst_type.floating == 0 && 714bf215546Sopenharmony_ci dst_type.fixed == 0 && 715bf215546Sopenharmony_ci dst_type.width == 8 && 
716bf215546Sopenharmony_ci 717bf215546Sopenharmony_ci ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) || 718bf215546Sopenharmony_ci (src_type.floating == 0 && dst_type.floating == 0 && 719bf215546Sopenharmony_ci src_type.sign == dst_type.sign && dst_type.norm == 0)) && 720bf215546Sopenharmony_ci 721bf215546Sopenharmony_ci ((dst_type.length == 16 && 2 * num_dsts == num_srcs) || 722bf215546Sopenharmony_ci (num_dsts == 1 && dst_type.length * num_srcs == 8)) && 723bf215546Sopenharmony_ci 724bf215546Sopenharmony_ci util_get_cpu_caps()->has_avx) { 725bf215546Sopenharmony_ci 726bf215546Sopenharmony_ci struct lp_build_context bld; 727bf215546Sopenharmony_ci struct lp_type int16_type, int32_type; 728bf215546Sopenharmony_ci struct lp_type dst_type_ext = dst_type; 729bf215546Sopenharmony_ci LLVMValueRef const_scale; 730bf215546Sopenharmony_ci unsigned i; 731bf215546Sopenharmony_ci 732bf215546Sopenharmony_ci lp_build_context_init(&bld, gallivm, src_type); 733bf215546Sopenharmony_ci 734bf215546Sopenharmony_ci dst_type_ext.length = 16; 735bf215546Sopenharmony_ci int16_type = int32_type = dst_type_ext; 736bf215546Sopenharmony_ci 737bf215546Sopenharmony_ci int16_type.width *= 2; 738bf215546Sopenharmony_ci int16_type.length /= 2; 739bf215546Sopenharmony_ci int16_type.sign = 1; 740bf215546Sopenharmony_ci 741bf215546Sopenharmony_ci int32_type.width *= 4; 742bf215546Sopenharmony_ci int32_type.length /= 4; 743bf215546Sopenharmony_ci int32_type.sign = 1; 744bf215546Sopenharmony_ci 745bf215546Sopenharmony_ci const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type)); 746bf215546Sopenharmony_ci 747bf215546Sopenharmony_ci for (i = 0; i < num_dsts; ++i, src += 2) { 748bf215546Sopenharmony_ci unsigned j; 749bf215546Sopenharmony_ci for (j = 0; j < (num_srcs == 1 ? 
1 : 2); j++) { 750bf215546Sopenharmony_ci LLVMValueRef lo, hi, a; 751bf215546Sopenharmony_ci 752bf215546Sopenharmony_ci a = src[j]; 753bf215546Sopenharmony_ci if (src_type.floating) { 754bf215546Sopenharmony_ci if (dst_type.sign) { 755bf215546Sopenharmony_ci a = lp_build_min(&bld, bld.one, a); 756bf215546Sopenharmony_ci 757bf215546Sopenharmony_ci } 758bf215546Sopenharmony_ci else { 759bf215546Sopenharmony_ci if (1) { 760bf215546Sopenharmony_ci a = lp_build_min_ext(&bld, bld.one, a, 761bf215546Sopenharmony_ci GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN); 762bf215546Sopenharmony_ci } 763bf215546Sopenharmony_ci } 764bf215546Sopenharmony_ci a = LLVMBuildFMul(builder, a, const_scale, ""); 765bf215546Sopenharmony_ci a = lp_build_iround(&bld, a); 766bf215546Sopenharmony_ci } else { 767bf215546Sopenharmony_ci if (!dst_type.sign) { 768bf215546Sopenharmony_ci LLVMValueRef const_max; 769bf215546Sopenharmony_ci const_max = lp_build_const_int_vec(gallivm, src_type, 255); 770bf215546Sopenharmony_ci a = lp_build_min(&bld, a, const_max); 771bf215546Sopenharmony_ci } 772bf215546Sopenharmony_ci } 773bf215546Sopenharmony_ci lo = lp_build_extract_range(gallivm, a, 0, 4); 774bf215546Sopenharmony_ci hi = lp_build_extract_range(gallivm, a, 4, 4); 775bf215546Sopenharmony_ci /* relying on clamping behavior of sse2 intrinsics here */ 776bf215546Sopenharmony_ci tmp[j] = lp_build_pack2(gallivm, int32_type, int16_type, lo, hi); 777bf215546Sopenharmony_ci } 778bf215546Sopenharmony_ci 779bf215546Sopenharmony_ci if (num_srcs == 1) { 780bf215546Sopenharmony_ci tmp[1] = tmp[0]; 781bf215546Sopenharmony_ci } 782bf215546Sopenharmony_ci dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, tmp[0], tmp[1]); 783bf215546Sopenharmony_ci } 784bf215546Sopenharmony_ci 785bf215546Sopenharmony_ci if (num_srcs == 1) { 786bf215546Sopenharmony_ci dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length); 787bf215546Sopenharmony_ci } 788bf215546Sopenharmony_ci 789bf215546Sopenharmony_ci return; 
790bf215546Sopenharmony_ci } 791bf215546Sopenharmony_ci 792bf215546Sopenharmony_ci /* Special case -> 16bit half-float 793bf215546Sopenharmony_ci */ 794bf215546Sopenharmony_ci else if (dst_type.floating && dst_type.width == 16) 795bf215546Sopenharmony_ci { 796bf215546Sopenharmony_ci /* Only support src as 32bit float currently */ 797bf215546Sopenharmony_ci assert(src_type.floating && src_type.width == 32); 798bf215546Sopenharmony_ci 799bf215546Sopenharmony_ci for(i = 0; i < num_tmps; ++i) 800bf215546Sopenharmony_ci dst[i] = lp_build_float_to_half(gallivm, tmp[i]); 801bf215546Sopenharmony_ci 802bf215546Sopenharmony_ci return; 803bf215546Sopenharmony_ci } 804bf215546Sopenharmony_ci 805bf215546Sopenharmony_ci /* Pre convert half-floats to floats 806bf215546Sopenharmony_ci */ 807bf215546Sopenharmony_ci else if (src_type.floating && src_type.width == 16) 808bf215546Sopenharmony_ci { 809bf215546Sopenharmony_ci for(i = 0; i < num_tmps; ++i) 810bf215546Sopenharmony_ci tmp[i] = lp_build_half_to_float(gallivm, tmp[i]); 811bf215546Sopenharmony_ci 812bf215546Sopenharmony_ci tmp_type.width = 32; 813bf215546Sopenharmony_ci } 814bf215546Sopenharmony_ci 815bf215546Sopenharmony_ci /* 816bf215546Sopenharmony_ci * Clamp if necessary 817bf215546Sopenharmony_ci */ 818bf215546Sopenharmony_ci 819bf215546Sopenharmony_ci if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) { 820bf215546Sopenharmony_ci struct lp_build_context bld; 821bf215546Sopenharmony_ci double src_min = lp_const_min(src_type); 822bf215546Sopenharmony_ci double dst_min = lp_const_min(dst_type); 823bf215546Sopenharmony_ci double src_max = lp_const_max(src_type); 824bf215546Sopenharmony_ci double dst_max = lp_const_max(dst_type); 825bf215546Sopenharmony_ci LLVMValueRef thres; 826bf215546Sopenharmony_ci 827bf215546Sopenharmony_ci lp_build_context_init(&bld, gallivm, tmp_type); 828bf215546Sopenharmony_ci 829bf215546Sopenharmony_ci if(src_min < dst_min) { 830bf215546Sopenharmony_ci if(dst_min == 0.0) 
831bf215546Sopenharmony_ci thres = bld.zero; 832bf215546Sopenharmony_ci else 833bf215546Sopenharmony_ci thres = lp_build_const_vec(gallivm, src_type, dst_min); 834bf215546Sopenharmony_ci for(i = 0; i < num_tmps; ++i) 835bf215546Sopenharmony_ci tmp[i] = lp_build_max(&bld, tmp[i], thres); 836bf215546Sopenharmony_ci } 837bf215546Sopenharmony_ci 838bf215546Sopenharmony_ci if(src_max > dst_max) { 839bf215546Sopenharmony_ci if(dst_max == 1.0) 840bf215546Sopenharmony_ci thres = bld.one; 841bf215546Sopenharmony_ci else 842bf215546Sopenharmony_ci thres = lp_build_const_vec(gallivm, src_type, dst_max); 843bf215546Sopenharmony_ci for(i = 0; i < num_tmps; ++i) 844bf215546Sopenharmony_ci tmp[i] = lp_build_min(&bld, tmp[i], thres); 845bf215546Sopenharmony_ci } 846bf215546Sopenharmony_ci } 847bf215546Sopenharmony_ci 848bf215546Sopenharmony_ci /* 849bf215546Sopenharmony_ci * Scale to the narrowest range 850bf215546Sopenharmony_ci */ 851bf215546Sopenharmony_ci 852bf215546Sopenharmony_ci if(dst_type.floating) { 853bf215546Sopenharmony_ci /* Nothing to do */ 854bf215546Sopenharmony_ci } 855bf215546Sopenharmony_ci else if(tmp_type.floating) { 856bf215546Sopenharmony_ci if(!dst_type.fixed && !dst_type.sign && dst_type.norm) { 857bf215546Sopenharmony_ci for(i = 0; i < num_tmps; ++i) { 858bf215546Sopenharmony_ci tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm, 859bf215546Sopenharmony_ci tmp_type, 860bf215546Sopenharmony_ci dst_type.width, 861bf215546Sopenharmony_ci tmp[i]); 862bf215546Sopenharmony_ci } 863bf215546Sopenharmony_ci tmp_type.floating = FALSE; 864bf215546Sopenharmony_ci } 865bf215546Sopenharmony_ci else { 866bf215546Sopenharmony_ci double dst_scale = lp_const_scale(dst_type); 867bf215546Sopenharmony_ci 868bf215546Sopenharmony_ci if (dst_scale != 1.0) { 869bf215546Sopenharmony_ci LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale); 870bf215546Sopenharmony_ci for(i = 0; i < num_tmps; ++i) 871bf215546Sopenharmony_ci tmp[i] = LLVMBuildFMul(builder, 
tmp[i], scale, ""); 872bf215546Sopenharmony_ci } 873bf215546Sopenharmony_ci 874bf215546Sopenharmony_ci /* 875bf215546Sopenharmony_ci * these functions will use fptosi in some form which won't work 876bf215546Sopenharmony_ci * with 32bit uint dst. Causes lp_test_conv failures though. 877bf215546Sopenharmony_ci */ 878bf215546Sopenharmony_ci if (0) 879bf215546Sopenharmony_ci assert(dst_type.sign || dst_type.width < 32); 880bf215546Sopenharmony_ci 881bf215546Sopenharmony_ci if (dst_type.sign && dst_type.norm && !dst_type.fixed) { 882bf215546Sopenharmony_ci struct lp_build_context bld; 883bf215546Sopenharmony_ci 884bf215546Sopenharmony_ci lp_build_context_init(&bld, gallivm, tmp_type); 885bf215546Sopenharmony_ci for(i = 0; i < num_tmps; ++i) { 886bf215546Sopenharmony_ci tmp[i] = lp_build_iround(&bld, tmp[i]); 887bf215546Sopenharmony_ci } 888bf215546Sopenharmony_ci tmp_type.floating = FALSE; 889bf215546Sopenharmony_ci } 890bf215546Sopenharmony_ci else { 891bf215546Sopenharmony_ci LLVMTypeRef tmp_vec_type; 892bf215546Sopenharmony_ci 893bf215546Sopenharmony_ci tmp_type.floating = FALSE; 894bf215546Sopenharmony_ci tmp_vec_type = lp_build_vec_type(gallivm, tmp_type); 895bf215546Sopenharmony_ci for(i = 0; i < num_tmps; ++i) { 896bf215546Sopenharmony_ci#if 0 897bf215546Sopenharmony_ci if(dst_type.sign) 898bf215546Sopenharmony_ci tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, ""); 899bf215546Sopenharmony_ci else 900bf215546Sopenharmony_ci tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, ""); 901bf215546Sopenharmony_ci#else 902bf215546Sopenharmony_ci /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */ 903bf215546Sopenharmony_ci tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, ""); 904bf215546Sopenharmony_ci#endif 905bf215546Sopenharmony_ci } 906bf215546Sopenharmony_ci } 907bf215546Sopenharmony_ci } 908bf215546Sopenharmony_ci } 909bf215546Sopenharmony_ci else { 910bf215546Sopenharmony_ci unsigned src_shift = lp_const_shift(src_type); 
911bf215546Sopenharmony_ci unsigned dst_shift = lp_const_shift(dst_type); 912bf215546Sopenharmony_ci unsigned src_offset = lp_const_offset(src_type); 913bf215546Sopenharmony_ci unsigned dst_offset = lp_const_offset(dst_type); 914bf215546Sopenharmony_ci struct lp_build_context bld; 915bf215546Sopenharmony_ci lp_build_context_init(&bld, gallivm, tmp_type); 916bf215546Sopenharmony_ci 917bf215546Sopenharmony_ci /* Compensate for different offsets */ 918bf215546Sopenharmony_ci /* sscaled -> unorm and similar would cause negative shift count, skip */ 919bf215546Sopenharmony_ci if (dst_offset > src_offset && src_type.width > dst_type.width && src_shift > 0) { 920bf215546Sopenharmony_ci for (i = 0; i < num_tmps; ++i) { 921bf215546Sopenharmony_ci LLVMValueRef shifted; 922bf215546Sopenharmony_ci 923bf215546Sopenharmony_ci shifted = lp_build_shr_imm(&bld, tmp[i], src_shift - 1); 924bf215546Sopenharmony_ci tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, ""); 925bf215546Sopenharmony_ci } 926bf215546Sopenharmony_ci } 927bf215546Sopenharmony_ci 928bf215546Sopenharmony_ci if(src_shift > dst_shift) { 929bf215546Sopenharmony_ci for(i = 0; i < num_tmps; ++i) 930bf215546Sopenharmony_ci tmp[i] = lp_build_shr_imm(&bld, tmp[i], src_shift - dst_shift); 931bf215546Sopenharmony_ci } 932bf215546Sopenharmony_ci } 933bf215546Sopenharmony_ci 934bf215546Sopenharmony_ci /* 935bf215546Sopenharmony_ci * Truncate or expand bit width 936bf215546Sopenharmony_ci * 937bf215546Sopenharmony_ci * No data conversion should happen here, although the sign bits are 938bf215546Sopenharmony_ci * crucial to avoid bad clamping. 
939bf215546Sopenharmony_ci */ 940bf215546Sopenharmony_ci 941bf215546Sopenharmony_ci { 942bf215546Sopenharmony_ci struct lp_type new_type; 943bf215546Sopenharmony_ci 944bf215546Sopenharmony_ci new_type = tmp_type; 945bf215546Sopenharmony_ci new_type.sign = dst_type.sign; 946bf215546Sopenharmony_ci new_type.width = dst_type.width; 947bf215546Sopenharmony_ci new_type.length = dst_type.length; 948bf215546Sopenharmony_ci 949bf215546Sopenharmony_ci /* 950bf215546Sopenharmony_ci * Note that resize when using packs can sometimes get min/max 951bf215546Sopenharmony_ci * clamping for free. Should be able to exploit this... 952bf215546Sopenharmony_ci */ 953bf215546Sopenharmony_ci lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts); 954bf215546Sopenharmony_ci 955bf215546Sopenharmony_ci tmp_type = new_type; 956bf215546Sopenharmony_ci num_tmps = num_dsts; 957bf215546Sopenharmony_ci } 958bf215546Sopenharmony_ci 959bf215546Sopenharmony_ci /* 960bf215546Sopenharmony_ci * Scale to the widest range 961bf215546Sopenharmony_ci */ 962bf215546Sopenharmony_ci 963bf215546Sopenharmony_ci if(src_type.floating) { 964bf215546Sopenharmony_ci /* Nothing to do */ 965bf215546Sopenharmony_ci } 966bf215546Sopenharmony_ci else if(!src_type.floating && dst_type.floating) { 967bf215546Sopenharmony_ci if(!src_type.fixed && !src_type.sign && src_type.norm) { 968bf215546Sopenharmony_ci for(i = 0; i < num_tmps; ++i) { 969bf215546Sopenharmony_ci tmp[i] = lp_build_unsigned_norm_to_float(gallivm, 970bf215546Sopenharmony_ci src_type.width, 971bf215546Sopenharmony_ci dst_type, 972bf215546Sopenharmony_ci tmp[i]); 973bf215546Sopenharmony_ci } 974bf215546Sopenharmony_ci tmp_type.floating = TRUE; 975bf215546Sopenharmony_ci } 976bf215546Sopenharmony_ci else { 977bf215546Sopenharmony_ci double src_scale = lp_const_scale(src_type); 978bf215546Sopenharmony_ci LLVMTypeRef tmp_vec_type; 979bf215546Sopenharmony_ci 980bf215546Sopenharmony_ci /* Use an equally sized integer for intermediate 
computations */ 981bf215546Sopenharmony_ci tmp_type.floating = TRUE; 982bf215546Sopenharmony_ci tmp_type.sign = TRUE; 983bf215546Sopenharmony_ci tmp_vec_type = lp_build_vec_type(gallivm, tmp_type); 984bf215546Sopenharmony_ci for(i = 0; i < num_tmps; ++i) { 985bf215546Sopenharmony_ci#if 0 986bf215546Sopenharmony_ci if(dst_type.sign) 987bf215546Sopenharmony_ci tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, ""); 988bf215546Sopenharmony_ci else 989bf215546Sopenharmony_ci tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, ""); 990bf215546Sopenharmony_ci#else 991bf215546Sopenharmony_ci /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */ 992bf215546Sopenharmony_ci tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, ""); 993bf215546Sopenharmony_ci#endif 994bf215546Sopenharmony_ci } 995bf215546Sopenharmony_ci 996bf215546Sopenharmony_ci if (src_scale != 1.0) { 997bf215546Sopenharmony_ci LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale); 998bf215546Sopenharmony_ci for(i = 0; i < num_tmps; ++i) 999bf215546Sopenharmony_ci tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, ""); 1000bf215546Sopenharmony_ci } 1001bf215546Sopenharmony_ci 1002bf215546Sopenharmony_ci /* the formula above will produce value below -1.0 for most negative 1003bf215546Sopenharmony_ci * value but everything seems happy with that hence disable for now */ 1004bf215546Sopenharmony_ci if (0 && !src_type.fixed && src_type.norm && src_type.sign) { 1005bf215546Sopenharmony_ci struct lp_build_context bld; 1006bf215546Sopenharmony_ci 1007bf215546Sopenharmony_ci lp_build_context_init(&bld, gallivm, dst_type); 1008bf215546Sopenharmony_ci for(i = 0; i < num_tmps; ++i) { 1009bf215546Sopenharmony_ci tmp[i] = lp_build_max(&bld, tmp[i], 1010bf215546Sopenharmony_ci lp_build_const_vec(gallivm, dst_type, -1.0f)); 1011bf215546Sopenharmony_ci } 1012bf215546Sopenharmony_ci } 1013bf215546Sopenharmony_ci } 1014bf215546Sopenharmony_ci } 1015bf215546Sopenharmony_ci else { 
1016bf215546Sopenharmony_ci unsigned src_shift = lp_const_shift(src_type); 1017bf215546Sopenharmony_ci unsigned dst_shift = lp_const_shift(dst_type); 1018bf215546Sopenharmony_ci unsigned src_offset = lp_const_offset(src_type); 1019bf215546Sopenharmony_ci unsigned dst_offset = lp_const_offset(dst_type); 1020bf215546Sopenharmony_ci struct lp_build_context bld; 1021bf215546Sopenharmony_ci lp_build_context_init(&bld, gallivm, tmp_type); 1022bf215546Sopenharmony_ci 1023bf215546Sopenharmony_ci if (src_shift < dst_shift) { 1024bf215546Sopenharmony_ci LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH]; 1025bf215546Sopenharmony_ci 1026bf215546Sopenharmony_ci if (dst_shift - src_shift < dst_type.width) { 1027bf215546Sopenharmony_ci for (i = 0; i < num_tmps; ++i) { 1028bf215546Sopenharmony_ci pre_shift[i] = tmp[i]; 1029bf215546Sopenharmony_ci tmp[i] = lp_build_shl_imm(&bld, tmp[i], dst_shift - src_shift); 1030bf215546Sopenharmony_ci } 1031bf215546Sopenharmony_ci } 1032bf215546Sopenharmony_ci else { 1033bf215546Sopenharmony_ci /* 1034bf215546Sopenharmony_ci * This happens for things like sscaled -> unorm conversions. Shift 1035bf215546Sopenharmony_ci * counts equal to bit width cause undefined results, so hack around it. 
1036bf215546Sopenharmony_ci */ 1037bf215546Sopenharmony_ci for (i = 0; i < num_tmps; ++i) { 1038bf215546Sopenharmony_ci pre_shift[i] = tmp[i]; 1039bf215546Sopenharmony_ci tmp[i] = lp_build_zero(gallivm, dst_type); 1040bf215546Sopenharmony_ci } 1041bf215546Sopenharmony_ci } 1042bf215546Sopenharmony_ci 1043bf215546Sopenharmony_ci /* Compensate for different offsets */ 1044bf215546Sopenharmony_ci if (dst_offset > src_offset) { 1045bf215546Sopenharmony_ci for (i = 0; i < num_tmps; ++i) { 1046bf215546Sopenharmony_ci tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], ""); 1047bf215546Sopenharmony_ci } 1048bf215546Sopenharmony_ci } 1049bf215546Sopenharmony_ci } 1050bf215546Sopenharmony_ci } 1051bf215546Sopenharmony_ci 1052bf215546Sopenharmony_ci for(i = 0; i < num_dsts; ++i) { 1053bf215546Sopenharmony_ci dst[i] = tmp[i]; 1054bf215546Sopenharmony_ci assert(lp_check_value(dst_type, dst[i])); 1055bf215546Sopenharmony_ci } 1056bf215546Sopenharmony_ci} 1057bf215546Sopenharmony_ci 1058bf215546Sopenharmony_ci 1059bf215546Sopenharmony_ci/** 1060bf215546Sopenharmony_ci * Bit mask conversion. 1061bf215546Sopenharmony_ci * 1062bf215546Sopenharmony_ci * This will convert the integer masks that match the given types. 1063bf215546Sopenharmony_ci * 1064bf215546Sopenharmony_ci * The mask values should 0 or -1, i.e., all bits either set to zero or one. 1065bf215546Sopenharmony_ci * Any other value will likely cause unpredictable results. 1066bf215546Sopenharmony_ci * 1067bf215546Sopenharmony_ci * This is basically a very trimmed down version of lp_build_conv. 
1068bf215546Sopenharmony_ci */ 1069bf215546Sopenharmony_civoid 1070bf215546Sopenharmony_cilp_build_conv_mask(struct gallivm_state *gallivm, 1071bf215546Sopenharmony_ci struct lp_type src_type, 1072bf215546Sopenharmony_ci struct lp_type dst_type, 1073bf215546Sopenharmony_ci const LLVMValueRef *src, unsigned num_srcs, 1074bf215546Sopenharmony_ci LLVMValueRef *dst, unsigned num_dsts) 1075bf215546Sopenharmony_ci{ 1076bf215546Sopenharmony_ci 1077bf215546Sopenharmony_ci /* We must not loose or gain channels. Only precision */ 1078bf215546Sopenharmony_ci assert(src_type.length * num_srcs == dst_type.length * num_dsts); 1079bf215546Sopenharmony_ci 1080bf215546Sopenharmony_ci /* 1081bf215546Sopenharmony_ci * Drop 1082bf215546Sopenharmony_ci * 1083bf215546Sopenharmony_ci * We assume all values are 0 or -1 1084bf215546Sopenharmony_ci */ 1085bf215546Sopenharmony_ci 1086bf215546Sopenharmony_ci src_type.floating = FALSE; 1087bf215546Sopenharmony_ci src_type.fixed = FALSE; 1088bf215546Sopenharmony_ci src_type.sign = TRUE; 1089bf215546Sopenharmony_ci src_type.norm = FALSE; 1090bf215546Sopenharmony_ci 1091bf215546Sopenharmony_ci dst_type.floating = FALSE; 1092bf215546Sopenharmony_ci dst_type.fixed = FALSE; 1093bf215546Sopenharmony_ci dst_type.sign = TRUE; 1094bf215546Sopenharmony_ci dst_type.norm = FALSE; 1095bf215546Sopenharmony_ci 1096bf215546Sopenharmony_ci /* 1097bf215546Sopenharmony_ci * Truncate or expand bit width 1098bf215546Sopenharmony_ci */ 1099bf215546Sopenharmony_ci 1100bf215546Sopenharmony_ci lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts); 1101bf215546Sopenharmony_ci} 1102