/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - We often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
43bf215546Sopenharmony_ci * 44bf215546Sopenharmony_ci * @author Jose Fonseca <jfonseca@vmware.com> 45bf215546Sopenharmony_ci */ 46bf215546Sopenharmony_ci 47bf215546Sopenharmony_ci 48bf215546Sopenharmony_ci#include <float.h> 49bf215546Sopenharmony_ci 50bf215546Sopenharmony_ci#include <llvm/Config/llvm-config.h> 51bf215546Sopenharmony_ci 52bf215546Sopenharmony_ci#include "util/u_memory.h" 53bf215546Sopenharmony_ci#include "util/u_debug.h" 54bf215546Sopenharmony_ci#include "util/u_math.h" 55bf215546Sopenharmony_ci#include "util/u_cpu_detect.h" 56bf215546Sopenharmony_ci 57bf215546Sopenharmony_ci#include "lp_bld_type.h" 58bf215546Sopenharmony_ci#include "lp_bld_const.h" 59bf215546Sopenharmony_ci#include "lp_bld_init.h" 60bf215546Sopenharmony_ci#include "lp_bld_intr.h" 61bf215546Sopenharmony_ci#include "lp_bld_logic.h" 62bf215546Sopenharmony_ci#include "lp_bld_pack.h" 63bf215546Sopenharmony_ci#include "lp_bld_debug.h" 64bf215546Sopenharmony_ci#include "lp_bld_bitarit.h" 65bf215546Sopenharmony_ci#include "lp_bld_arit.h" 66bf215546Sopenharmony_ci#include "lp_bld_flow.h" 67bf215546Sopenharmony_ci 68bf215546Sopenharmony_ci#if defined(PIPE_ARCH_SSE) 69bf215546Sopenharmony_ci#include <xmmintrin.h> 70bf215546Sopenharmony_ci#endif 71bf215546Sopenharmony_ci 72bf215546Sopenharmony_ci#ifndef _MM_DENORMALS_ZERO_MASK 73bf215546Sopenharmony_ci#define _MM_DENORMALS_ZERO_MASK 0x0040 74bf215546Sopenharmony_ci#endif 75bf215546Sopenharmony_ci 76bf215546Sopenharmony_ci#ifndef _MM_FLUSH_ZERO_MASK 77bf215546Sopenharmony_ci#define _MM_FLUSH_ZERO_MASK 0x8000 78bf215546Sopenharmony_ci#endif 79bf215546Sopenharmony_ci 80bf215546Sopenharmony_ci#define EXP_POLY_DEGREE 5 81bf215546Sopenharmony_ci 82bf215546Sopenharmony_ci#define LOG_POLY_DEGREE 4 83bf215546Sopenharmony_ci 84bf215546Sopenharmony_ci 85bf215546Sopenharmony_ci/** 86bf215546Sopenharmony_ci * Generate min(a, b) 87bf215546Sopenharmony_ci * No checks for special case values of a or b = 1 or 0 are done. 
88bf215546Sopenharmony_ci * NaN's are handled according to the behavior specified by the 89bf215546Sopenharmony_ci * nan_behavior argument. 90bf215546Sopenharmony_ci */ 91bf215546Sopenharmony_cistatic LLVMValueRef 92bf215546Sopenharmony_cilp_build_min_simple(struct lp_build_context *bld, 93bf215546Sopenharmony_ci LLVMValueRef a, 94bf215546Sopenharmony_ci LLVMValueRef b, 95bf215546Sopenharmony_ci enum gallivm_nan_behavior nan_behavior) 96bf215546Sopenharmony_ci{ 97bf215546Sopenharmony_ci const struct lp_type type = bld->type; 98bf215546Sopenharmony_ci const char *intrinsic = NULL; 99bf215546Sopenharmony_ci unsigned intr_size = 0; 100bf215546Sopenharmony_ci LLVMValueRef cond; 101bf215546Sopenharmony_ci 102bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 103bf215546Sopenharmony_ci assert(lp_check_value(type, b)); 104bf215546Sopenharmony_ci 105bf215546Sopenharmony_ci /* TODO: optimize the constant case */ 106bf215546Sopenharmony_ci 107bf215546Sopenharmony_ci if (type.floating && util_get_cpu_caps()->has_sse) { 108bf215546Sopenharmony_ci if (type.width == 32) { 109bf215546Sopenharmony_ci if (type.length == 1) { 110bf215546Sopenharmony_ci intrinsic = "llvm.x86.sse.min.ss"; 111bf215546Sopenharmony_ci intr_size = 128; 112bf215546Sopenharmony_ci } 113bf215546Sopenharmony_ci else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) { 114bf215546Sopenharmony_ci intrinsic = "llvm.x86.sse.min.ps"; 115bf215546Sopenharmony_ci intr_size = 128; 116bf215546Sopenharmony_ci } 117bf215546Sopenharmony_ci else { 118bf215546Sopenharmony_ci intrinsic = "llvm.x86.avx.min.ps.256"; 119bf215546Sopenharmony_ci intr_size = 256; 120bf215546Sopenharmony_ci } 121bf215546Sopenharmony_ci } 122bf215546Sopenharmony_ci if (type.width == 64 && util_get_cpu_caps()->has_sse2) { 123bf215546Sopenharmony_ci if (type.length == 1) { 124bf215546Sopenharmony_ci intrinsic = "llvm.x86.sse2.min.sd"; 125bf215546Sopenharmony_ci intr_size = 128; 126bf215546Sopenharmony_ci } 127bf215546Sopenharmony_ci else if 
(type.length == 2 || !util_get_cpu_caps()->has_avx) { 128bf215546Sopenharmony_ci intrinsic = "llvm.x86.sse2.min.pd"; 129bf215546Sopenharmony_ci intr_size = 128; 130bf215546Sopenharmony_ci } 131bf215546Sopenharmony_ci else { 132bf215546Sopenharmony_ci intrinsic = "llvm.x86.avx.min.pd.256"; 133bf215546Sopenharmony_ci intr_size = 256; 134bf215546Sopenharmony_ci } 135bf215546Sopenharmony_ci } 136bf215546Sopenharmony_ci } 137bf215546Sopenharmony_ci else if (type.floating && util_get_cpu_caps()->has_altivec) { 138bf215546Sopenharmony_ci if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) { 139bf215546Sopenharmony_ci debug_printf("%s: altivec doesn't support nan return nan behavior\n", 140bf215546Sopenharmony_ci __FUNCTION__); 141bf215546Sopenharmony_ci } 142bf215546Sopenharmony_ci if (type.width == 32 && type.length == 4) { 143bf215546Sopenharmony_ci intrinsic = "llvm.ppc.altivec.vminfp"; 144bf215546Sopenharmony_ci intr_size = 128; 145bf215546Sopenharmony_ci } 146bf215546Sopenharmony_ci } else if (util_get_cpu_caps()->has_altivec) { 147bf215546Sopenharmony_ci intr_size = 128; 148bf215546Sopenharmony_ci if (type.width == 8) { 149bf215546Sopenharmony_ci if (!type.sign) { 150bf215546Sopenharmony_ci intrinsic = "llvm.ppc.altivec.vminub"; 151bf215546Sopenharmony_ci } else { 152bf215546Sopenharmony_ci intrinsic = "llvm.ppc.altivec.vminsb"; 153bf215546Sopenharmony_ci } 154bf215546Sopenharmony_ci } else if (type.width == 16) { 155bf215546Sopenharmony_ci if (!type.sign) { 156bf215546Sopenharmony_ci intrinsic = "llvm.ppc.altivec.vminuh"; 157bf215546Sopenharmony_ci } else { 158bf215546Sopenharmony_ci intrinsic = "llvm.ppc.altivec.vminsh"; 159bf215546Sopenharmony_ci } 160bf215546Sopenharmony_ci } else if (type.width == 32) { 161bf215546Sopenharmony_ci if (!type.sign) { 162bf215546Sopenharmony_ci intrinsic = "llvm.ppc.altivec.vminuw"; 163bf215546Sopenharmony_ci } else { 164bf215546Sopenharmony_ci intrinsic = "llvm.ppc.altivec.vminsw"; 165bf215546Sopenharmony_ci } 
166bf215546Sopenharmony_ci } 167bf215546Sopenharmony_ci } 168bf215546Sopenharmony_ci 169bf215546Sopenharmony_ci if (intrinsic) { 170bf215546Sopenharmony_ci /* We need to handle nan's for floating point numbers. If one of the 171bf215546Sopenharmony_ci * inputs is nan the other should be returned (required by both D3D10+ 172bf215546Sopenharmony_ci * and OpenCL). 173bf215546Sopenharmony_ci * The sse intrinsics return the second operator in case of nan by 174bf215546Sopenharmony_ci * default so we need to special code to handle those. 175bf215546Sopenharmony_ci */ 176bf215546Sopenharmony_ci if (util_get_cpu_caps()->has_sse && type.floating && 177bf215546Sopenharmony_ci nan_behavior == GALLIVM_NAN_RETURN_OTHER) { 178bf215546Sopenharmony_ci LLVMValueRef isnan, min; 179bf215546Sopenharmony_ci min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, 180bf215546Sopenharmony_ci type, 181bf215546Sopenharmony_ci intr_size, a, b); 182bf215546Sopenharmony_ci isnan = lp_build_isnan(bld, b); 183bf215546Sopenharmony_ci return lp_build_select(bld, isnan, a, min); 184bf215546Sopenharmony_ci } else { 185bf215546Sopenharmony_ci return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, 186bf215546Sopenharmony_ci type, 187bf215546Sopenharmony_ci intr_size, a, b); 188bf215546Sopenharmony_ci } 189bf215546Sopenharmony_ci } 190bf215546Sopenharmony_ci 191bf215546Sopenharmony_ci if (type.floating) { 192bf215546Sopenharmony_ci switch (nan_behavior) { 193bf215546Sopenharmony_ci case GALLIVM_NAN_RETURN_OTHER: { 194bf215546Sopenharmony_ci LLVMValueRef isnan = lp_build_isnan(bld, a); 195bf215546Sopenharmony_ci cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 196bf215546Sopenharmony_ci cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, ""); 197bf215546Sopenharmony_ci return lp_build_select(bld, cond, a, b); 198bf215546Sopenharmony_ci } 199bf215546Sopenharmony_ci break; 200bf215546Sopenharmony_ci case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN: 201bf215546Sopenharmony_ci cond = 
lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b); 202bf215546Sopenharmony_ci return lp_build_select(bld, cond, a, b); 203bf215546Sopenharmony_ci case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN: 204bf215546Sopenharmony_ci cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a); 205bf215546Sopenharmony_ci return lp_build_select(bld, cond, b, a); 206bf215546Sopenharmony_ci case GALLIVM_NAN_BEHAVIOR_UNDEFINED: 207bf215546Sopenharmony_ci cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 208bf215546Sopenharmony_ci return lp_build_select(bld, cond, a, b); 209bf215546Sopenharmony_ci break; 210bf215546Sopenharmony_ci default: 211bf215546Sopenharmony_ci assert(0); 212bf215546Sopenharmony_ci cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 213bf215546Sopenharmony_ci return lp_build_select(bld, cond, a, b); 214bf215546Sopenharmony_ci } 215bf215546Sopenharmony_ci } else { 216bf215546Sopenharmony_ci cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 217bf215546Sopenharmony_ci return lp_build_select(bld, cond, a, b); 218bf215546Sopenharmony_ci } 219bf215546Sopenharmony_ci} 220bf215546Sopenharmony_ci 221bf215546Sopenharmony_ci 222bf215546Sopenharmony_ciLLVMValueRef 223bf215546Sopenharmony_cilp_build_fmuladd(LLVMBuilderRef builder, 224bf215546Sopenharmony_ci LLVMValueRef a, 225bf215546Sopenharmony_ci LLVMValueRef b, 226bf215546Sopenharmony_ci LLVMValueRef c) 227bf215546Sopenharmony_ci{ 228bf215546Sopenharmony_ci LLVMTypeRef type = LLVMTypeOf(a); 229bf215546Sopenharmony_ci assert(type == LLVMTypeOf(b)); 230bf215546Sopenharmony_ci assert(type == LLVMTypeOf(c)); 231bf215546Sopenharmony_ci 232bf215546Sopenharmony_ci char intrinsic[32]; 233bf215546Sopenharmony_ci lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type); 234bf215546Sopenharmony_ci LLVMValueRef args[] = { a, b, c }; 235bf215546Sopenharmony_ci return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0); 236bf215546Sopenharmony_ci} 237bf215546Sopenharmony_ci 238bf215546Sopenharmony_ci 239bf215546Sopenharmony_ci/** 
240bf215546Sopenharmony_ci * Generate max(a, b) 241bf215546Sopenharmony_ci * No checks for special case values of a or b = 1 or 0 are done. 242bf215546Sopenharmony_ci * NaN's are handled according to the behavior specified by the 243bf215546Sopenharmony_ci * nan_behavior argument. 244bf215546Sopenharmony_ci */ 245bf215546Sopenharmony_cistatic LLVMValueRef 246bf215546Sopenharmony_cilp_build_max_simple(struct lp_build_context *bld, 247bf215546Sopenharmony_ci LLVMValueRef a, 248bf215546Sopenharmony_ci LLVMValueRef b, 249bf215546Sopenharmony_ci enum gallivm_nan_behavior nan_behavior) 250bf215546Sopenharmony_ci{ 251bf215546Sopenharmony_ci const struct lp_type type = bld->type; 252bf215546Sopenharmony_ci const char *intrinsic = NULL; 253bf215546Sopenharmony_ci unsigned intr_size = 0; 254bf215546Sopenharmony_ci LLVMValueRef cond; 255bf215546Sopenharmony_ci 256bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 257bf215546Sopenharmony_ci assert(lp_check_value(type, b)); 258bf215546Sopenharmony_ci 259bf215546Sopenharmony_ci /* TODO: optimize the constant case */ 260bf215546Sopenharmony_ci 261bf215546Sopenharmony_ci if (type.floating && util_get_cpu_caps()->has_sse) { 262bf215546Sopenharmony_ci if (type.width == 32) { 263bf215546Sopenharmony_ci if (type.length == 1) { 264bf215546Sopenharmony_ci intrinsic = "llvm.x86.sse.max.ss"; 265bf215546Sopenharmony_ci intr_size = 128; 266bf215546Sopenharmony_ci } 267bf215546Sopenharmony_ci else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) { 268bf215546Sopenharmony_ci intrinsic = "llvm.x86.sse.max.ps"; 269bf215546Sopenharmony_ci intr_size = 128; 270bf215546Sopenharmony_ci } 271bf215546Sopenharmony_ci else { 272bf215546Sopenharmony_ci intrinsic = "llvm.x86.avx.max.ps.256"; 273bf215546Sopenharmony_ci intr_size = 256; 274bf215546Sopenharmony_ci } 275bf215546Sopenharmony_ci } 276bf215546Sopenharmony_ci if (type.width == 64 && util_get_cpu_caps()->has_sse2) { 277bf215546Sopenharmony_ci if (type.length == 1) { 
278bf215546Sopenharmony_ci intrinsic = "llvm.x86.sse2.max.sd"; 279bf215546Sopenharmony_ci intr_size = 128; 280bf215546Sopenharmony_ci } 281bf215546Sopenharmony_ci else if (type.length == 2 || !util_get_cpu_caps()->has_avx) { 282bf215546Sopenharmony_ci intrinsic = "llvm.x86.sse2.max.pd"; 283bf215546Sopenharmony_ci intr_size = 128; 284bf215546Sopenharmony_ci } 285bf215546Sopenharmony_ci else { 286bf215546Sopenharmony_ci intrinsic = "llvm.x86.avx.max.pd.256"; 287bf215546Sopenharmony_ci intr_size = 256; 288bf215546Sopenharmony_ci } 289bf215546Sopenharmony_ci } 290bf215546Sopenharmony_ci } 291bf215546Sopenharmony_ci else if (type.floating && util_get_cpu_caps()->has_altivec) { 292bf215546Sopenharmony_ci if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) { 293bf215546Sopenharmony_ci debug_printf("%s: altivec doesn't support nan return nan behavior\n", 294bf215546Sopenharmony_ci __FUNCTION__); 295bf215546Sopenharmony_ci } 296bf215546Sopenharmony_ci if (type.width == 32 || type.length == 4) { 297bf215546Sopenharmony_ci intrinsic = "llvm.ppc.altivec.vmaxfp"; 298bf215546Sopenharmony_ci intr_size = 128; 299bf215546Sopenharmony_ci } 300bf215546Sopenharmony_ci } else if (util_get_cpu_caps()->has_altivec) { 301bf215546Sopenharmony_ci intr_size = 128; 302bf215546Sopenharmony_ci if (type.width == 8) { 303bf215546Sopenharmony_ci if (!type.sign) { 304bf215546Sopenharmony_ci intrinsic = "llvm.ppc.altivec.vmaxub"; 305bf215546Sopenharmony_ci } else { 306bf215546Sopenharmony_ci intrinsic = "llvm.ppc.altivec.vmaxsb"; 307bf215546Sopenharmony_ci } 308bf215546Sopenharmony_ci } else if (type.width == 16) { 309bf215546Sopenharmony_ci if (!type.sign) { 310bf215546Sopenharmony_ci intrinsic = "llvm.ppc.altivec.vmaxuh"; 311bf215546Sopenharmony_ci } else { 312bf215546Sopenharmony_ci intrinsic = "llvm.ppc.altivec.vmaxsh"; 313bf215546Sopenharmony_ci } 314bf215546Sopenharmony_ci } else if (type.width == 32) { 315bf215546Sopenharmony_ci if (!type.sign) { 316bf215546Sopenharmony_ci intrinsic = 
"llvm.ppc.altivec.vmaxuw"; 317bf215546Sopenharmony_ci } else { 318bf215546Sopenharmony_ci intrinsic = "llvm.ppc.altivec.vmaxsw"; 319bf215546Sopenharmony_ci } 320bf215546Sopenharmony_ci } 321bf215546Sopenharmony_ci } 322bf215546Sopenharmony_ci 323bf215546Sopenharmony_ci if (intrinsic) { 324bf215546Sopenharmony_ci if (util_get_cpu_caps()->has_sse && type.floating && 325bf215546Sopenharmony_ci nan_behavior == GALLIVM_NAN_RETURN_OTHER) { 326bf215546Sopenharmony_ci LLVMValueRef isnan, max; 327bf215546Sopenharmony_ci max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, 328bf215546Sopenharmony_ci type, 329bf215546Sopenharmony_ci intr_size, a, b); 330bf215546Sopenharmony_ci isnan = lp_build_isnan(bld, b); 331bf215546Sopenharmony_ci return lp_build_select(bld, isnan, a, max); 332bf215546Sopenharmony_ci } else { 333bf215546Sopenharmony_ci return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, 334bf215546Sopenharmony_ci type, 335bf215546Sopenharmony_ci intr_size, a, b); 336bf215546Sopenharmony_ci } 337bf215546Sopenharmony_ci } 338bf215546Sopenharmony_ci 339bf215546Sopenharmony_ci if (type.floating) { 340bf215546Sopenharmony_ci switch (nan_behavior) { 341bf215546Sopenharmony_ci case GALLIVM_NAN_RETURN_OTHER: { 342bf215546Sopenharmony_ci LLVMValueRef isnan = lp_build_isnan(bld, a); 343bf215546Sopenharmony_ci cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 344bf215546Sopenharmony_ci cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, ""); 345bf215546Sopenharmony_ci return lp_build_select(bld, cond, a, b); 346bf215546Sopenharmony_ci } 347bf215546Sopenharmony_ci break; 348bf215546Sopenharmony_ci case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN: 349bf215546Sopenharmony_ci cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b); 350bf215546Sopenharmony_ci return lp_build_select(bld, cond, a, b); 351bf215546Sopenharmony_ci case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN: 352bf215546Sopenharmony_ci cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a); 
353bf215546Sopenharmony_ci return lp_build_select(bld, cond, b, a); 354bf215546Sopenharmony_ci case GALLIVM_NAN_BEHAVIOR_UNDEFINED: 355bf215546Sopenharmony_ci cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 356bf215546Sopenharmony_ci return lp_build_select(bld, cond, a, b); 357bf215546Sopenharmony_ci break; 358bf215546Sopenharmony_ci default: 359bf215546Sopenharmony_ci assert(0); 360bf215546Sopenharmony_ci cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 361bf215546Sopenharmony_ci return lp_build_select(bld, cond, a, b); 362bf215546Sopenharmony_ci } 363bf215546Sopenharmony_ci } else { 364bf215546Sopenharmony_ci cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 365bf215546Sopenharmony_ci return lp_build_select(bld, cond, a, b); 366bf215546Sopenharmony_ci } 367bf215546Sopenharmony_ci} 368bf215546Sopenharmony_ci 369bf215546Sopenharmony_ci 370bf215546Sopenharmony_ci/** 371bf215546Sopenharmony_ci * Generate 1 - a, or ~a depending on bld->type. 372bf215546Sopenharmony_ci */ 373bf215546Sopenharmony_ciLLVMValueRef 374bf215546Sopenharmony_cilp_build_comp(struct lp_build_context *bld, 375bf215546Sopenharmony_ci LLVMValueRef a) 376bf215546Sopenharmony_ci{ 377bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 378bf215546Sopenharmony_ci const struct lp_type type = bld->type; 379bf215546Sopenharmony_ci 380bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 381bf215546Sopenharmony_ci 382bf215546Sopenharmony_ci if (a == bld->one) 383bf215546Sopenharmony_ci return bld->zero; 384bf215546Sopenharmony_ci if (a == bld->zero) 385bf215546Sopenharmony_ci return bld->one; 386bf215546Sopenharmony_ci 387bf215546Sopenharmony_ci if (type.norm && !type.floating && !type.fixed && !type.sign) { 388bf215546Sopenharmony_ci if (LLVMIsConstant(a)) 389bf215546Sopenharmony_ci return LLVMConstNot(a); 390bf215546Sopenharmony_ci else 391bf215546Sopenharmony_ci return LLVMBuildNot(builder, a, ""); 392bf215546Sopenharmony_ci } 393bf215546Sopenharmony_ci 
394bf215546Sopenharmony_ci if (type.floating) 395bf215546Sopenharmony_ci return LLVMBuildFSub(builder, bld->one, a, ""); 396bf215546Sopenharmony_ci else 397bf215546Sopenharmony_ci return LLVMBuildSub(builder, bld->one, a, ""); 398bf215546Sopenharmony_ci} 399bf215546Sopenharmony_ci 400bf215546Sopenharmony_ci 401bf215546Sopenharmony_ci/** 402bf215546Sopenharmony_ci * Generate a + b 403bf215546Sopenharmony_ci */ 404bf215546Sopenharmony_ciLLVMValueRef 405bf215546Sopenharmony_cilp_build_add(struct lp_build_context *bld, 406bf215546Sopenharmony_ci LLVMValueRef a, 407bf215546Sopenharmony_ci LLVMValueRef b) 408bf215546Sopenharmony_ci{ 409bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 410bf215546Sopenharmony_ci const struct lp_type type = bld->type; 411bf215546Sopenharmony_ci LLVMValueRef res; 412bf215546Sopenharmony_ci 413bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 414bf215546Sopenharmony_ci assert(lp_check_value(type, b)); 415bf215546Sopenharmony_ci 416bf215546Sopenharmony_ci if (a == bld->zero) 417bf215546Sopenharmony_ci return b; 418bf215546Sopenharmony_ci if (b == bld->zero) 419bf215546Sopenharmony_ci return a; 420bf215546Sopenharmony_ci if (a == bld->undef || b == bld->undef) 421bf215546Sopenharmony_ci return bld->undef; 422bf215546Sopenharmony_ci 423bf215546Sopenharmony_ci if (type.norm) { 424bf215546Sopenharmony_ci const char *intrinsic = NULL; 425bf215546Sopenharmony_ci 426bf215546Sopenharmony_ci if (!type.sign && (a == bld->one || b == bld->one)) 427bf215546Sopenharmony_ci return bld->one; 428bf215546Sopenharmony_ci 429bf215546Sopenharmony_ci if (!type.floating && !type.fixed) { 430bf215546Sopenharmony_ci if (LLVM_VERSION_MAJOR >= 8) { 431bf215546Sopenharmony_ci char intrin[32]; 432bf215546Sopenharmony_ci intrinsic = type.sign ? 
"llvm.sadd.sat" : "llvm.uadd.sat"; 433bf215546Sopenharmony_ci lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type); 434bf215546Sopenharmony_ci return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b); 435bf215546Sopenharmony_ci } 436bf215546Sopenharmony_ci if (type.width * type.length == 128) { 437bf215546Sopenharmony_ci if (util_get_cpu_caps()->has_sse2) { 438bf215546Sopenharmony_ci if (type.width == 8) 439bf215546Sopenharmony_ci intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b"; 440bf215546Sopenharmony_ci if (type.width == 16) 441bf215546Sopenharmony_ci intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w"; 442bf215546Sopenharmony_ci } else if (util_get_cpu_caps()->has_altivec) { 443bf215546Sopenharmony_ci if (type.width == 8) 444bf215546Sopenharmony_ci intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs"; 445bf215546Sopenharmony_ci if (type.width == 16) 446bf215546Sopenharmony_ci intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs"; 447bf215546Sopenharmony_ci } 448bf215546Sopenharmony_ci } 449bf215546Sopenharmony_ci if (type.width * type.length == 256) { 450bf215546Sopenharmony_ci if (util_get_cpu_caps()->has_avx2) { 451bf215546Sopenharmony_ci if (type.width == 8) 452bf215546Sopenharmony_ci intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b"; 453bf215546Sopenharmony_ci if (type.width == 16) 454bf215546Sopenharmony_ci intrinsic = type.sign ? 
"llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w"; 455bf215546Sopenharmony_ci } 456bf215546Sopenharmony_ci } 457bf215546Sopenharmony_ci } 458bf215546Sopenharmony_ci 459bf215546Sopenharmony_ci if (intrinsic) 460bf215546Sopenharmony_ci return lp_build_intrinsic_binary(builder, intrinsic, 461bf215546Sopenharmony_ci lp_build_vec_type(bld->gallivm, bld->type), a, b); 462bf215546Sopenharmony_ci } 463bf215546Sopenharmony_ci 464bf215546Sopenharmony_ci if (type.norm && !type.floating && !type.fixed) { 465bf215546Sopenharmony_ci if (type.sign) { 466bf215546Sopenharmony_ci uint64_t sign = (uint64_t)1 << (type.width - 1); 467bf215546Sopenharmony_ci LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1); 468bf215546Sopenharmony_ci LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign); 469bf215546Sopenharmony_ci /* a_clamp_max is the maximum a for positive b, 470bf215546Sopenharmony_ci a_clamp_min is the minimum a for negative b. */ 471bf215546Sopenharmony_ci LLVMValueRef a_clamp_max = 472bf215546Sopenharmony_ci lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), 473bf215546Sopenharmony_ci GALLIVM_NAN_BEHAVIOR_UNDEFINED); 474bf215546Sopenharmony_ci LLVMValueRef a_clamp_min = 475bf215546Sopenharmony_ci lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), 476bf215546Sopenharmony_ci GALLIVM_NAN_BEHAVIOR_UNDEFINED); 477bf215546Sopenharmony_ci a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, 478bf215546Sopenharmony_ci bld->zero), a_clamp_max, a_clamp_min); 479bf215546Sopenharmony_ci } 480bf215546Sopenharmony_ci } 481bf215546Sopenharmony_ci 482bf215546Sopenharmony_ci if (type.floating) 483bf215546Sopenharmony_ci res = LLVMBuildFAdd(builder, a, b, ""); 484bf215546Sopenharmony_ci else 485bf215546Sopenharmony_ci res = LLVMBuildAdd(builder, a, b, ""); 486bf215546Sopenharmony_ci 487bf215546Sopenharmony_ci /* clamp to ceiling of 1.0 */ 488bf215546Sopenharmony_ci if (bld->type.norm && (bld->type.floating || 
bld->type.fixed)) 489bf215546Sopenharmony_ci res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); 490bf215546Sopenharmony_ci 491bf215546Sopenharmony_ci if (type.norm && !type.floating && !type.fixed) { 492bf215546Sopenharmony_ci if (!type.sign) { 493bf215546Sopenharmony_ci /* 494bf215546Sopenharmony_ci * newer llvm versions no longer support the intrinsics, but recognize 495bf215546Sopenharmony_ci * the pattern. Since auto-upgrade of intrinsics doesn't work for jit 496bf215546Sopenharmony_ci * code, it is important we match the pattern llvm uses (and pray llvm 497bf215546Sopenharmony_ci * doesn't change it - and hope they decide on the same pattern for 498bf215546Sopenharmony_ci * all backends supporting it...). 499bf215546Sopenharmony_ci * NOTE: cmp/select does sext/trunc of the mask. Does not seem to 500bf215546Sopenharmony_ci * interfere with llvm's ability to recognize the pattern but seems 501bf215546Sopenharmony_ci * a bit brittle. 502bf215546Sopenharmony_ci * NOTE: llvm 9+ always uses (non arch specific) intrinsic. 503bf215546Sopenharmony_ci */ 504bf215546Sopenharmony_ci LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res); 505bf215546Sopenharmony_ci res = lp_build_select(bld, overflowed, 506bf215546Sopenharmony_ci LLVMConstAllOnes(bld->int_vec_type), res); 507bf215546Sopenharmony_ci } 508bf215546Sopenharmony_ci } 509bf215546Sopenharmony_ci 510bf215546Sopenharmony_ci /* XXX clamp to floor of -1 or 0??? */ 511bf215546Sopenharmony_ci 512bf215546Sopenharmony_ci return res; 513bf215546Sopenharmony_ci} 514bf215546Sopenharmony_ci 515bf215546Sopenharmony_ci 516bf215546Sopenharmony_ci/** Return the scalar sum of the elements of a. 517bf215546Sopenharmony_ci * Should avoid this operation whenever possible. 
518bf215546Sopenharmony_ci */ 519bf215546Sopenharmony_ciLLVMValueRef 520bf215546Sopenharmony_cilp_build_horizontal_add(struct lp_build_context *bld, 521bf215546Sopenharmony_ci LLVMValueRef a) 522bf215546Sopenharmony_ci{ 523bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 524bf215546Sopenharmony_ci const struct lp_type type = bld->type; 525bf215546Sopenharmony_ci LLVMValueRef index, res; 526bf215546Sopenharmony_ci unsigned i, length; 527bf215546Sopenharmony_ci LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2]; 528bf215546Sopenharmony_ci LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2]; 529bf215546Sopenharmony_ci LLVMValueRef vecres, elem2; 530bf215546Sopenharmony_ci 531bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 532bf215546Sopenharmony_ci 533bf215546Sopenharmony_ci if (type.length == 1) { 534bf215546Sopenharmony_ci return a; 535bf215546Sopenharmony_ci } 536bf215546Sopenharmony_ci 537bf215546Sopenharmony_ci assert(!bld->type.norm); 538bf215546Sopenharmony_ci 539bf215546Sopenharmony_ci /* 540bf215546Sopenharmony_ci * for byte vectors can do much better with psadbw. 541bf215546Sopenharmony_ci * Using repeated shuffle/adds here. Note with multiple vectors 542bf215546Sopenharmony_ci * this can be done more efficiently as outlined in the intel 543bf215546Sopenharmony_ci * optimization manual. 544bf215546Sopenharmony_ci * Note: could cause data rearrangement if used with smaller element 545bf215546Sopenharmony_ci * sizes. 
546bf215546Sopenharmony_ci */ 547bf215546Sopenharmony_ci 548bf215546Sopenharmony_ci vecres = a; 549bf215546Sopenharmony_ci length = type.length / 2; 550bf215546Sopenharmony_ci while (length > 1) { 551bf215546Sopenharmony_ci LLVMValueRef vec1, vec2; 552bf215546Sopenharmony_ci for (i = 0; i < length; i++) { 553bf215546Sopenharmony_ci shuffles1[i] = lp_build_const_int32(bld->gallivm, i); 554bf215546Sopenharmony_ci shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length); 555bf215546Sopenharmony_ci } 556bf215546Sopenharmony_ci vec1 = LLVMBuildShuffleVector(builder, vecres, vecres, 557bf215546Sopenharmony_ci LLVMConstVector(shuffles1, length), ""); 558bf215546Sopenharmony_ci vec2 = LLVMBuildShuffleVector(builder, vecres, vecres, 559bf215546Sopenharmony_ci LLVMConstVector(shuffles2, length), ""); 560bf215546Sopenharmony_ci if (type.floating) { 561bf215546Sopenharmony_ci vecres = LLVMBuildFAdd(builder, vec1, vec2, ""); 562bf215546Sopenharmony_ci } 563bf215546Sopenharmony_ci else { 564bf215546Sopenharmony_ci vecres = LLVMBuildAdd(builder, vec1, vec2, ""); 565bf215546Sopenharmony_ci } 566bf215546Sopenharmony_ci length = length >> 1; 567bf215546Sopenharmony_ci } 568bf215546Sopenharmony_ci 569bf215546Sopenharmony_ci /* always have vector of size 2 here */ 570bf215546Sopenharmony_ci assert(length == 1); 571bf215546Sopenharmony_ci 572bf215546Sopenharmony_ci index = lp_build_const_int32(bld->gallivm, 0); 573bf215546Sopenharmony_ci res = LLVMBuildExtractElement(builder, vecres, index, ""); 574bf215546Sopenharmony_ci index = lp_build_const_int32(bld->gallivm, 1); 575bf215546Sopenharmony_ci elem2 = LLVMBuildExtractElement(builder, vecres, index, ""); 576bf215546Sopenharmony_ci 577bf215546Sopenharmony_ci if (type.floating) 578bf215546Sopenharmony_ci res = LLVMBuildFAdd(builder, res, elem2, ""); 579bf215546Sopenharmony_ci else 580bf215546Sopenharmony_ci res = LLVMBuildAdd(builder, res, elem2, ""); 581bf215546Sopenharmony_ci 582bf215546Sopenharmony_ci return res; 
583bf215546Sopenharmony_ci} 584bf215546Sopenharmony_ci 585bf215546Sopenharmony_ci 586bf215546Sopenharmony_ci/** 587bf215546Sopenharmony_ci * Return the horizontal sums of 4 float vectors as a float4 vector. 588bf215546Sopenharmony_ci * This uses the technique as outlined in Intel Optimization Manual. 589bf215546Sopenharmony_ci */ 590bf215546Sopenharmony_cistatic LLVMValueRef 591bf215546Sopenharmony_cilp_build_horizontal_add4x4f(struct lp_build_context *bld, 592bf215546Sopenharmony_ci LLVMValueRef src[4]) 593bf215546Sopenharmony_ci{ 594bf215546Sopenharmony_ci struct gallivm_state *gallivm = bld->gallivm; 595bf215546Sopenharmony_ci LLVMBuilderRef builder = gallivm->builder; 596bf215546Sopenharmony_ci LLVMValueRef shuffles[4]; 597bf215546Sopenharmony_ci LLVMValueRef tmp[4]; 598bf215546Sopenharmony_ci LLVMValueRef sumtmp[2], shuftmp[2]; 599bf215546Sopenharmony_ci 600bf215546Sopenharmony_ci /* lower half of regs */ 601bf215546Sopenharmony_ci shuffles[0] = lp_build_const_int32(gallivm, 0); 602bf215546Sopenharmony_ci shuffles[1] = lp_build_const_int32(gallivm, 1); 603bf215546Sopenharmony_ci shuffles[2] = lp_build_const_int32(gallivm, 4); 604bf215546Sopenharmony_ci shuffles[3] = lp_build_const_int32(gallivm, 5); 605bf215546Sopenharmony_ci tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1], 606bf215546Sopenharmony_ci LLVMConstVector(shuffles, 4), ""); 607bf215546Sopenharmony_ci tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3], 608bf215546Sopenharmony_ci LLVMConstVector(shuffles, 4), ""); 609bf215546Sopenharmony_ci 610bf215546Sopenharmony_ci /* upper half of regs */ 611bf215546Sopenharmony_ci shuffles[0] = lp_build_const_int32(gallivm, 2); 612bf215546Sopenharmony_ci shuffles[1] = lp_build_const_int32(gallivm, 3); 613bf215546Sopenharmony_ci shuffles[2] = lp_build_const_int32(gallivm, 6); 614bf215546Sopenharmony_ci shuffles[3] = lp_build_const_int32(gallivm, 7); 615bf215546Sopenharmony_ci tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1], 
616bf215546Sopenharmony_ci LLVMConstVector(shuffles, 4), ""); 617bf215546Sopenharmony_ci tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3], 618bf215546Sopenharmony_ci LLVMConstVector(shuffles, 4), ""); 619bf215546Sopenharmony_ci 620bf215546Sopenharmony_ci sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], ""); 621bf215546Sopenharmony_ci sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], ""); 622bf215546Sopenharmony_ci 623bf215546Sopenharmony_ci shuffles[0] = lp_build_const_int32(gallivm, 0); 624bf215546Sopenharmony_ci shuffles[1] = lp_build_const_int32(gallivm, 2); 625bf215546Sopenharmony_ci shuffles[2] = lp_build_const_int32(gallivm, 4); 626bf215546Sopenharmony_ci shuffles[3] = lp_build_const_int32(gallivm, 6); 627bf215546Sopenharmony_ci shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1], 628bf215546Sopenharmony_ci LLVMConstVector(shuffles, 4), ""); 629bf215546Sopenharmony_ci 630bf215546Sopenharmony_ci shuffles[0] = lp_build_const_int32(gallivm, 1); 631bf215546Sopenharmony_ci shuffles[1] = lp_build_const_int32(gallivm, 3); 632bf215546Sopenharmony_ci shuffles[2] = lp_build_const_int32(gallivm, 5); 633bf215546Sopenharmony_ci shuffles[3] = lp_build_const_int32(gallivm, 7); 634bf215546Sopenharmony_ci shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1], 635bf215546Sopenharmony_ci LLVMConstVector(shuffles, 4), ""); 636bf215546Sopenharmony_ci 637bf215546Sopenharmony_ci return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], ""); 638bf215546Sopenharmony_ci} 639bf215546Sopenharmony_ci 640bf215546Sopenharmony_ci 641bf215546Sopenharmony_ci/* 642bf215546Sopenharmony_ci * partially horizontally add 2-4 float vectors with length nx4, 643bf215546Sopenharmony_ci * i.e. only four adjacent values in each vector will be added, 644bf215546Sopenharmony_ci * assuming values are really grouped in 4 which also determines 645bf215546Sopenharmony_ci * output order. 
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 *
 * bld->type must be a floating type and num_vecs must be 2, 3 or 4
 * (both are asserted).
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];

   /* Missing inputs are padded with vectors[0] so four sources always exist. */
   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   /* Pick a native horizontal-add intrinsic for 4 x 32bit / 8 x 32bit types. */
   if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      /* Two rounds of the intrinsic: (0,1) and (2,3) first, then the results. */
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         /* Only two real inputs: reuse the first partial result. */
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   /* Generic path: shuffle/add on groups of four elements. */
   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      /* Handle each group of four lanes separately, then glue them together. */
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}


/**
 * Generate a - b
 *
 * Trivial operands are folded without emitting code: b == zero gives a,
 * an undef operand gives undef, and a == b gives zero.
 *
 * For normalized types the subtraction saturates:
 * - normalized integers use the generic llvm.ssub.sat / llvm.usub.sat
 *   intrinsics when LLVM_VERSION_MAJOR >= 8, otherwise an SSE2 / AltiVec /
 *   AVX2 saturating-subtract intrinsic for 8 and 16 bit elements in 128 or
 *   256 bit vectors, otherwise `a` is clamped first so that the plain
 *   subtraction cannot wrap;
 * - normalized float / fixed results are clamped from below against zero.
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;
   if (a == b)
      return bld->zero;

   if (type.norm) {
      const char *intrinsic = NULL;

      /* Unsigned normalized: subtracting one always saturates to zero. */
      if (!type.sign && b == bld->one)
        return bld->zero;

      if (!type.floating && !type.fixed) {
         if (LLVM_VERSION_MAJOR >= 8) {
            /* Generic saturating intrinsic; the name is completed from the vector type. */
            char intrin[32];
            intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
         }
         /* Older LLVM: arch specific intrinsics for 128 bit vectors ... */
         if (type.width * type.length == 128) {
            if (util_get_cpu_caps()->has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
            } else if (util_get_cpu_caps()->has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
            }
         }
         /* ... and for 256 bit vectors. */
         if (type.width * type.length == 256) {
            if (util_get_cpu_caps()->has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic,
                      lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   /* No intrinsic available: pre-clamp `a` so the subtraction below saturates. */
   if (type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         /* sign is the bit pattern of the most negative value, sign - 1 the largest. */
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val =
            lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val =
            lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for negative b,
            a_clamp_min is the minimum a for positive b. */
         LLVMValueRef a_clamp_max =
            lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""),
                                GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min =
            lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""),
                                GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b,
                                               bld->zero),
                             a_clamp_min, a_clamp_max);
      } else {
         /*
          * This must match llvm pattern for saturated unsigned sub.
          * (lp_build_max_simple actually does the job with its current
          * definition but do it explicitly here.)
          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
          * interfere with llvm's ability to recognize the pattern but seems
          * a bit brittle.
          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
          */
         LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         a = lp_build_select(bld, no_ov, a, b);
      }
   }

   if (type.floating)
      res = LLVMBuildFSub(builder, a, b, "");
   else
      res = LLVMBuildSub(builder, a, b, "");

   /* Normalized float / fixed: keep the result from dropping below zero. */
   if (bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);

   return res;
}


/**
 * Normalized multiplication.
 *
 * There are several approaches (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria of
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria,
 *     as 255*255 = 254, so the special case b = 255 must be accounted for
 *     or roundoff must be used.
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving the exact results.
874bf215546Sopenharmony_ci * 875bf215546Sopenharmony_ci * 876bf215546Sopenharmony_ci * 877bf215546Sopenharmony_ci * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995, 878bf215546Sopenharmony_ci * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf 879bf215546Sopenharmony_ci * @sa Michael Herf, The "double blend trick", May 2000, 880bf215546Sopenharmony_ci * http://www.stereopsis.com/doubleblend.html 881bf215546Sopenharmony_ci */ 882bf215546Sopenharmony_ciLLVMValueRef 883bf215546Sopenharmony_cilp_build_mul_norm(struct gallivm_state *gallivm, 884bf215546Sopenharmony_ci struct lp_type wide_type, 885bf215546Sopenharmony_ci LLVMValueRef a, LLVMValueRef b) 886bf215546Sopenharmony_ci{ 887bf215546Sopenharmony_ci LLVMBuilderRef builder = gallivm->builder; 888bf215546Sopenharmony_ci struct lp_build_context bld; 889bf215546Sopenharmony_ci unsigned n; 890bf215546Sopenharmony_ci LLVMValueRef half; 891bf215546Sopenharmony_ci LLVMValueRef ab; 892bf215546Sopenharmony_ci 893bf215546Sopenharmony_ci assert(!wide_type.floating); 894bf215546Sopenharmony_ci assert(lp_check_value(wide_type, a)); 895bf215546Sopenharmony_ci assert(lp_check_value(wide_type, b)); 896bf215546Sopenharmony_ci 897bf215546Sopenharmony_ci lp_build_context_init(&bld, gallivm, wide_type); 898bf215546Sopenharmony_ci 899bf215546Sopenharmony_ci n = wide_type.width / 2; 900bf215546Sopenharmony_ci if (wide_type.sign) { 901bf215546Sopenharmony_ci --n; 902bf215546Sopenharmony_ci } 903bf215546Sopenharmony_ci 904bf215546Sopenharmony_ci /* 905bf215546Sopenharmony_ci * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW 906bf215546Sopenharmony_ci * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/ 907bf215546Sopenharmony_ci */ 908bf215546Sopenharmony_ci 909bf215546Sopenharmony_ci /* 910bf215546Sopenharmony_ci * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n 911bf215546Sopenharmony_ci */ 912bf215546Sopenharmony_ci 913bf215546Sopenharmony_ci ab = 
LLVMBuildMul(builder, a, b, ""); 914bf215546Sopenharmony_ci ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), ""); 915bf215546Sopenharmony_ci 916bf215546Sopenharmony_ci /* 917bf215546Sopenharmony_ci * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1)) 918bf215546Sopenharmony_ci */ 919bf215546Sopenharmony_ci 920bf215546Sopenharmony_ci half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1)); 921bf215546Sopenharmony_ci if (wide_type.sign) { 922bf215546Sopenharmony_ci LLVMValueRef minus_half = LLVMBuildNeg(builder, half, ""); 923bf215546Sopenharmony_ci LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1); 924bf215546Sopenharmony_ci half = lp_build_select(&bld, sign, minus_half, half); 925bf215546Sopenharmony_ci } 926bf215546Sopenharmony_ci ab = LLVMBuildAdd(builder, ab, half, ""); 927bf215546Sopenharmony_ci 928bf215546Sopenharmony_ci /* Final division */ 929bf215546Sopenharmony_ci ab = lp_build_shr_imm(&bld, ab, n); 930bf215546Sopenharmony_ci 931bf215546Sopenharmony_ci return ab; 932bf215546Sopenharmony_ci} 933bf215546Sopenharmony_ci 934bf215546Sopenharmony_ci 935bf215546Sopenharmony_ci/** 936bf215546Sopenharmony_ci * Generate a * b 937bf215546Sopenharmony_ci */ 938bf215546Sopenharmony_ciLLVMValueRef 939bf215546Sopenharmony_cilp_build_mul(struct lp_build_context *bld, 940bf215546Sopenharmony_ci LLVMValueRef a, 941bf215546Sopenharmony_ci LLVMValueRef b) 942bf215546Sopenharmony_ci{ 943bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 944bf215546Sopenharmony_ci const struct lp_type type = bld->type; 945bf215546Sopenharmony_ci 946bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 947bf215546Sopenharmony_ci assert(lp_check_value(type, b)); 948bf215546Sopenharmony_ci 949bf215546Sopenharmony_ci if (a == bld->zero) 950bf215546Sopenharmony_ci return bld->zero; 951bf215546Sopenharmony_ci if (a == bld->one) 952bf215546Sopenharmony_ci return b; 953bf215546Sopenharmony_ci if (b == bld->zero) 
954bf215546Sopenharmony_ci return bld->zero; 955bf215546Sopenharmony_ci if (b == bld->one) 956bf215546Sopenharmony_ci return a; 957bf215546Sopenharmony_ci if (a == bld->undef || b == bld->undef) 958bf215546Sopenharmony_ci return bld->undef; 959bf215546Sopenharmony_ci 960bf215546Sopenharmony_ci if (!type.floating && !type.fixed && type.norm) { 961bf215546Sopenharmony_ci struct lp_type wide_type = lp_wider_type(type); 962bf215546Sopenharmony_ci LLVMValueRef al, ah, bl, bh, abl, abh, ab; 963bf215546Sopenharmony_ci 964bf215546Sopenharmony_ci lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah); 965bf215546Sopenharmony_ci lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh); 966bf215546Sopenharmony_ci 967bf215546Sopenharmony_ci /* PMULLW, PSRLW, PADDW */ 968bf215546Sopenharmony_ci abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl); 969bf215546Sopenharmony_ci abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh); 970bf215546Sopenharmony_ci 971bf215546Sopenharmony_ci ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh); 972bf215546Sopenharmony_ci 973bf215546Sopenharmony_ci return ab; 974bf215546Sopenharmony_ci } 975bf215546Sopenharmony_ci 976bf215546Sopenharmony_ci LLVMValueRef shift = type.fixed 977bf215546Sopenharmony_ci ? 
lp_build_const_int_vec(bld->gallivm, type, type.width/2) : NULL; 978bf215546Sopenharmony_ci 979bf215546Sopenharmony_ci LLVMValueRef res; 980bf215546Sopenharmony_ci if (type.floating) 981bf215546Sopenharmony_ci res = LLVMBuildFMul(builder, a, b, ""); 982bf215546Sopenharmony_ci else 983bf215546Sopenharmony_ci res = LLVMBuildMul(builder, a, b, ""); 984bf215546Sopenharmony_ci if (shift) { 985bf215546Sopenharmony_ci if (type.sign) 986bf215546Sopenharmony_ci res = LLVMBuildAShr(builder, res, shift, ""); 987bf215546Sopenharmony_ci else 988bf215546Sopenharmony_ci res = LLVMBuildLShr(builder, res, shift, ""); 989bf215546Sopenharmony_ci } 990bf215546Sopenharmony_ci 991bf215546Sopenharmony_ci return res; 992bf215546Sopenharmony_ci} 993bf215546Sopenharmony_ci 994bf215546Sopenharmony_ci 995bf215546Sopenharmony_ci/* 996bf215546Sopenharmony_ci * Widening mul, valid for 32x32 bit -> 64bit only. 997bf215546Sopenharmony_ci * Result is low 32bits, high bits returned in res_hi. 998bf215546Sopenharmony_ci * 999bf215546Sopenharmony_ci * Emits code that is meant to be compiled for the host CPU. 
 *
 * bld->type must be a plain 32 bit integer type (not floating, fixed or
 * normalized); this is asserted.  The low 32 bits of each product are the
 * return value, the high 32 bits are stored through res_hi.
 */
LLVMValueRef
lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
                         LLVMValueRef a,
                         LLVMValueRef b,
                         LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;

   assert(bld->type.width == 32);
   assert(bld->type.floating == 0);
   assert(bld->type.fixed == 0);
   assert(bld->type.norm == 0);

   /*
    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
    * for x86 simd is atrocious (even if the high bits weren't required),
    * trying to handle real 64bit inputs (which of course can't happen due
    * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
    * apparently llvm does not recognize this widening mul). This includes 6
    * (instead of 2) pmuludq plus extra adds and shifts
    * The same story applies to signed mul, albeit fixing this requires sse41.
    * https://llvm.org/bugs/show_bug.cgi?id=30845
    * So, whip up our own code, albeit only for length 4 and 8 (which
    * should be good enough)...
    * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
    * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
    * for signed), which the fallback code does not, without this llvm
    * will likely still produce atrocious code.
    */
   if (LLVM_VERSION_MAJOR < 7 &&
       (bld->type.length == 4 || bld->type.length == 8) &&
       ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) ||
        util_get_cpu_caps()->has_sse4_1)) {
      const char *intrinsic = NULL;
      LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
      LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
      struct lp_type type_wide = lp_wider_type(bld->type);
      LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
      unsigned i;
      /* Mask {1, undef, 3, undef, ...}: move each odd element into the even
       * slot below it.
       */
      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i+1);
         shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      /* The unshuffled inputs serve directly as the "even" operands. */
      aeven = a;
      beven = b;
      aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
      bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");

      /* NOTE(review): the rest relies on these intrinsics multiplying only the
       * even 32 bit lanes into 64 bit products - confirm against their
       * definition.
       */
      if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) {
         if (bld->type.sign) {
            intrinsic = "llvm.x86.avx2.pmul.dq";
         } else {
            intrinsic = "llvm.x86.avx2.pmulu.dq";
         }
         muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                             wider_type, aeven, beven);
         mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                            wider_type, aodd, bodd);
      }
      else {
         /* for consistent naming look elsewhere... */
         if (bld->type.sign) {
            intrinsic = "llvm.x86.sse41.pmuldq";
         } else {
            intrinsic = "llvm.x86.sse2.pmulu.dq";
         }
         /*
          * XXX If we only have AVX but not AVX2 this is a pain.
          * lp_build_intrinsic_binary_anylength() can't handle it
          * (due to src and dst type not being identical).
          */
         if (bld->type.length == 8) {
            /* Split into two 4-element halves, multiply each, then rejoin. */
            LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
            LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
            LLVMValueRef muleven2[2], mulodd2[2];
            struct lp_type type_wide_half = type_wide;
            LLVMTypeRef wtype_half;
            type_wide_half.length = 2;
            wtype_half = lp_build_vec_type(gallivm, type_wide_half);
            aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
            aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
            bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
            bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
            aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
            aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
            boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
            boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
            muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenlo, bevenlo);
            mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddlo, boddlo);
            muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenhi, bevenhi);
            mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddhi, boddhi);
            muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
            mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);

         }
         else {
            muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                                wider_type, aeven, beven);
            mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                               wider_type, aodd, bodd);
         }
      }
      /* Reinterpret the wide products as vectors of the original 32 bit type. */
      muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
      mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");

      /* High result: lane i+1 of the even products and of the odd products. */
      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i + 1);
         shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");

      /* Low result: lane i of the even products and of the odd products. */
      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i);
         shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
   }
   else {
      /* Everything else takes the generic extend/mul/trunc path. */
      return lp_build_mul_32_lohi(bld, a, b, res_hi);
   }
}


/*
 * Widening mul, valid for <= 32 (8, 16, 32) -> 64
 * Result is low N bits, high bits
returned in res_hi. 1135bf215546Sopenharmony_ci * 1136bf215546Sopenharmony_ci * Emits generic code. 1137bf215546Sopenharmony_ci */ 1138bf215546Sopenharmony_ciLLVMValueRef 1139bf215546Sopenharmony_cilp_build_mul_32_lohi(struct lp_build_context *bld, 1140bf215546Sopenharmony_ci LLVMValueRef a, 1141bf215546Sopenharmony_ci LLVMValueRef b, 1142bf215546Sopenharmony_ci LLVMValueRef *res_hi) 1143bf215546Sopenharmony_ci{ 1144bf215546Sopenharmony_ci struct gallivm_state *gallivm = bld->gallivm; 1145bf215546Sopenharmony_ci LLVMBuilderRef builder = gallivm->builder; 1146bf215546Sopenharmony_ci LLVMValueRef tmp, shift, res_lo; 1147bf215546Sopenharmony_ci struct lp_type type_tmp; 1148bf215546Sopenharmony_ci LLVMTypeRef wide_type, narrow_type; 1149bf215546Sopenharmony_ci 1150bf215546Sopenharmony_ci type_tmp = bld->type; 1151bf215546Sopenharmony_ci narrow_type = lp_build_vec_type(gallivm, type_tmp); 1152bf215546Sopenharmony_ci if (bld->type.width < 32) 1153bf215546Sopenharmony_ci type_tmp.width = 32; 1154bf215546Sopenharmony_ci else 1155bf215546Sopenharmony_ci type_tmp.width *= 2; 1156bf215546Sopenharmony_ci wide_type = lp_build_vec_type(gallivm, type_tmp); 1157bf215546Sopenharmony_ci shift = lp_build_const_vec(gallivm, type_tmp, bld->type.width); 1158bf215546Sopenharmony_ci 1159bf215546Sopenharmony_ci if (bld->type.sign) { 1160bf215546Sopenharmony_ci a = LLVMBuildSExt(builder, a, wide_type, ""); 1161bf215546Sopenharmony_ci b = LLVMBuildSExt(builder, b, wide_type, ""); 1162bf215546Sopenharmony_ci } else { 1163bf215546Sopenharmony_ci a = LLVMBuildZExt(builder, a, wide_type, ""); 1164bf215546Sopenharmony_ci b = LLVMBuildZExt(builder, b, wide_type, ""); 1165bf215546Sopenharmony_ci } 1166bf215546Sopenharmony_ci tmp = LLVMBuildMul(builder, a, b, ""); 1167bf215546Sopenharmony_ci 1168bf215546Sopenharmony_ci res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, ""); 1169bf215546Sopenharmony_ci 1170bf215546Sopenharmony_ci /* Since we truncate anyway, LShr and AShr are equivalent. 
*/ 1171bf215546Sopenharmony_ci tmp = LLVMBuildLShr(builder, tmp, shift, ""); 1172bf215546Sopenharmony_ci *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, ""); 1173bf215546Sopenharmony_ci 1174bf215546Sopenharmony_ci return res_lo; 1175bf215546Sopenharmony_ci} 1176bf215546Sopenharmony_ci 1177bf215546Sopenharmony_ci 1178bf215546Sopenharmony_ci/* a * b + c */ 1179bf215546Sopenharmony_ciLLVMValueRef 1180bf215546Sopenharmony_cilp_build_mad(struct lp_build_context *bld, 1181bf215546Sopenharmony_ci LLVMValueRef a, 1182bf215546Sopenharmony_ci LLVMValueRef b, 1183bf215546Sopenharmony_ci LLVMValueRef c) 1184bf215546Sopenharmony_ci{ 1185bf215546Sopenharmony_ci const struct lp_type type = bld->type; 1186bf215546Sopenharmony_ci if (type.floating) { 1187bf215546Sopenharmony_ci return lp_build_fmuladd(bld->gallivm->builder, a, b, c); 1188bf215546Sopenharmony_ci } else { 1189bf215546Sopenharmony_ci return lp_build_add(bld, lp_build_mul(bld, a, b), c); 1190bf215546Sopenharmony_ci } 1191bf215546Sopenharmony_ci} 1192bf215546Sopenharmony_ci 1193bf215546Sopenharmony_ci 1194bf215546Sopenharmony_ci/** 1195bf215546Sopenharmony_ci * Small vector x scale multiplication optimization. 
1196bf215546Sopenharmony_ci */ 1197bf215546Sopenharmony_ciLLVMValueRef 1198bf215546Sopenharmony_cilp_build_mul_imm(struct lp_build_context *bld, 1199bf215546Sopenharmony_ci LLVMValueRef a, 1200bf215546Sopenharmony_ci int b) 1201bf215546Sopenharmony_ci{ 1202bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 1203bf215546Sopenharmony_ci LLVMValueRef factor; 1204bf215546Sopenharmony_ci 1205bf215546Sopenharmony_ci assert(lp_check_value(bld->type, a)); 1206bf215546Sopenharmony_ci 1207bf215546Sopenharmony_ci if (b == 0) 1208bf215546Sopenharmony_ci return bld->zero; 1209bf215546Sopenharmony_ci 1210bf215546Sopenharmony_ci if (b == 1) 1211bf215546Sopenharmony_ci return a; 1212bf215546Sopenharmony_ci 1213bf215546Sopenharmony_ci if (b == -1) 1214bf215546Sopenharmony_ci return lp_build_negate(bld, a); 1215bf215546Sopenharmony_ci 1216bf215546Sopenharmony_ci if (b == 2 && bld->type.floating) 1217bf215546Sopenharmony_ci return lp_build_add(bld, a, a); 1218bf215546Sopenharmony_ci 1219bf215546Sopenharmony_ci if (util_is_power_of_two_or_zero(b)) { 1220bf215546Sopenharmony_ci unsigned shift = ffs(b) - 1; 1221bf215546Sopenharmony_ci 1222bf215546Sopenharmony_ci if (bld->type.floating) { 1223bf215546Sopenharmony_ci#if 0 1224bf215546Sopenharmony_ci /* 1225bf215546Sopenharmony_ci * Power of two multiplication by directly manipulating the exponent. 1226bf215546Sopenharmony_ci * 1227bf215546Sopenharmony_ci * XXX: This might not be always faster, it will introduce a small 1228bf215546Sopenharmony_ci * error for multiplication by zero, and it will produce wrong results 1229bf215546Sopenharmony_ci * for Inf and NaN. 
1230bf215546Sopenharmony_ci */ 1231bf215546Sopenharmony_ci unsigned mantissa = lp_mantissa(bld->type); 1232bf215546Sopenharmony_ci factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa); 1233bf215546Sopenharmony_ci a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), ""); 1234bf215546Sopenharmony_ci a = LLVMBuildAdd(builder, a, factor, ""); 1235bf215546Sopenharmony_ci a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), ""); 1236bf215546Sopenharmony_ci return a; 1237bf215546Sopenharmony_ci#endif 1238bf215546Sopenharmony_ci } 1239bf215546Sopenharmony_ci else { 1240bf215546Sopenharmony_ci factor = lp_build_const_vec(bld->gallivm, bld->type, shift); 1241bf215546Sopenharmony_ci return LLVMBuildShl(builder, a, factor, ""); 1242bf215546Sopenharmony_ci } 1243bf215546Sopenharmony_ci } 1244bf215546Sopenharmony_ci 1245bf215546Sopenharmony_ci factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b); 1246bf215546Sopenharmony_ci return lp_build_mul(bld, a, factor); 1247bf215546Sopenharmony_ci} 1248bf215546Sopenharmony_ci 1249bf215546Sopenharmony_ci 1250bf215546Sopenharmony_ci/** 1251bf215546Sopenharmony_ci * Generate a / b 1252bf215546Sopenharmony_ci */ 1253bf215546Sopenharmony_ciLLVMValueRef 1254bf215546Sopenharmony_cilp_build_div(struct lp_build_context *bld, 1255bf215546Sopenharmony_ci LLVMValueRef a, 1256bf215546Sopenharmony_ci LLVMValueRef b) 1257bf215546Sopenharmony_ci{ 1258bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 1259bf215546Sopenharmony_ci const struct lp_type type = bld->type; 1260bf215546Sopenharmony_ci 1261bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 1262bf215546Sopenharmony_ci assert(lp_check_value(type, b)); 1263bf215546Sopenharmony_ci 1264bf215546Sopenharmony_ci if (a == bld->zero) 1265bf215546Sopenharmony_ci return bld->zero; 1266bf215546Sopenharmony_ci if (a == bld->one && type.floating) 1267bf215546Sopenharmony_ci return 
lp_build_rcp(bld, b); 1268bf215546Sopenharmony_ci if (b == bld->zero) 1269bf215546Sopenharmony_ci return bld->undef; 1270bf215546Sopenharmony_ci if (b == bld->one) 1271bf215546Sopenharmony_ci return a; 1272bf215546Sopenharmony_ci if (a == bld->undef || b == bld->undef) 1273bf215546Sopenharmony_ci return bld->undef; 1274bf215546Sopenharmony_ci 1275bf215546Sopenharmony_ci /* fast rcp is disabled (just uses div), so makes no sense to try that */ 1276bf215546Sopenharmony_ci if (FALSE && 1277bf215546Sopenharmony_ci ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) || 1278bf215546Sopenharmony_ci (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) && 1279bf215546Sopenharmony_ci type.floating) 1280bf215546Sopenharmony_ci return lp_build_mul(bld, a, lp_build_rcp(bld, b)); 1281bf215546Sopenharmony_ci 1282bf215546Sopenharmony_ci if (type.floating) 1283bf215546Sopenharmony_ci return LLVMBuildFDiv(builder, a, b, ""); 1284bf215546Sopenharmony_ci else if (type.sign) 1285bf215546Sopenharmony_ci return LLVMBuildSDiv(builder, a, b, ""); 1286bf215546Sopenharmony_ci else 1287bf215546Sopenharmony_ci return LLVMBuildUDiv(builder, a, b, ""); 1288bf215546Sopenharmony_ci} 1289bf215546Sopenharmony_ci 1290bf215546Sopenharmony_ci 1291bf215546Sopenharmony_ci/** 1292bf215546Sopenharmony_ci * Linear interpolation helper. 1293bf215546Sopenharmony_ci * 1294bf215546Sopenharmony_ci * @param normalized whether we are interpolating normalized values, 1295bf215546Sopenharmony_ci * encoded in normalized integers, twice as wide. 
1296bf215546Sopenharmony_ci * 1297bf215546Sopenharmony_ci * @sa http://www.stereopsis.com/doubleblend.html 1298bf215546Sopenharmony_ci */ 1299bf215546Sopenharmony_cistatic inline LLVMValueRef 1300bf215546Sopenharmony_cilp_build_lerp_simple(struct lp_build_context *bld, 1301bf215546Sopenharmony_ci LLVMValueRef x, 1302bf215546Sopenharmony_ci LLVMValueRef v0, 1303bf215546Sopenharmony_ci LLVMValueRef v1, 1304bf215546Sopenharmony_ci unsigned flags) 1305bf215546Sopenharmony_ci{ 1306bf215546Sopenharmony_ci unsigned half_width = bld->type.width/2; 1307bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 1308bf215546Sopenharmony_ci LLVMValueRef delta; 1309bf215546Sopenharmony_ci LLVMValueRef res; 1310bf215546Sopenharmony_ci 1311bf215546Sopenharmony_ci assert(lp_check_value(bld->type, x)); 1312bf215546Sopenharmony_ci assert(lp_check_value(bld->type, v0)); 1313bf215546Sopenharmony_ci assert(lp_check_value(bld->type, v1)); 1314bf215546Sopenharmony_ci 1315bf215546Sopenharmony_ci delta = lp_build_sub(bld, v1, v0); 1316bf215546Sopenharmony_ci 1317bf215546Sopenharmony_ci if (bld->type.floating) { 1318bf215546Sopenharmony_ci assert(flags == 0); 1319bf215546Sopenharmony_ci return lp_build_mad(bld, x, delta, v0); 1320bf215546Sopenharmony_ci } 1321bf215546Sopenharmony_ci 1322bf215546Sopenharmony_ci if (flags & LP_BLD_LERP_WIDE_NORMALIZED) { 1323bf215546Sopenharmony_ci if (!bld->type.sign) { 1324bf215546Sopenharmony_ci if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) { 1325bf215546Sopenharmony_ci /* 1326bf215546Sopenharmony_ci * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the 1327bf215546Sopenharmony_ci * most-significant-bit to the lowest-significant-bit, so that 1328bf215546Sopenharmony_ci * later we can just divide by 2**n instead of 2**n - 1. 
1329bf215546Sopenharmony_ci */ 1330bf215546Sopenharmony_ci 1331bf215546Sopenharmony_ci x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1)); 1332bf215546Sopenharmony_ci } 1333bf215546Sopenharmony_ci 1334bf215546Sopenharmony_ci /* (x * delta) >> n */ 1335bf215546Sopenharmony_ci /* 1336bf215546Sopenharmony_ci * For this multiply, higher internal precision is required to pass 1337bf215546Sopenharmony_ci * CTS, the most efficient path to that is pmulhrsw on ssse3 and 1338bf215546Sopenharmony_ci * above. This could be opencoded on other arches if conformance was 1339bf215546Sopenharmony_ci * required. 1340bf215546Sopenharmony_ci */ 1341bf215546Sopenharmony_ci if (bld->type.width == 16 && bld->type.length == 8 && util_get_cpu_caps()->has_ssse3) { 1342bf215546Sopenharmony_ci res = lp_build_intrinsic_binary(builder, "llvm.x86.ssse3.pmul.hr.sw.128", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7)); 1343bf215546Sopenharmony_ci res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff)); 1344bf215546Sopenharmony_ci } else if (bld->type.width == 16 && bld->type.length == 16 && util_get_cpu_caps()->has_avx2) { 1345bf215546Sopenharmony_ci res = lp_build_intrinsic_binary(builder, "llvm.x86.avx2.pmul.hr.sw", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7)); 1346bf215546Sopenharmony_ci res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff)); 1347bf215546Sopenharmony_ci } else { 1348bf215546Sopenharmony_ci res = lp_build_mul(bld, x, delta); 1349bf215546Sopenharmony_ci res = lp_build_shr_imm(bld, res, half_width); 1350bf215546Sopenharmony_ci } 1351bf215546Sopenharmony_ci } else { 1352bf215546Sopenharmony_ci /* 1353bf215546Sopenharmony_ci * The rescaling trick above doesn't work for signed numbers, so 1354bf215546Sopenharmony_ci * use the 2**n - 1 divison approximation in lp_build_mul_norm 1355bf215546Sopenharmony_ci * instead. 
1356bf215546Sopenharmony_ci */ 1357bf215546Sopenharmony_ci assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)); 1358bf215546Sopenharmony_ci res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta); 1359bf215546Sopenharmony_ci } 1360bf215546Sopenharmony_ci } else { 1361bf215546Sopenharmony_ci assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)); 1362bf215546Sopenharmony_ci res = lp_build_mul(bld, x, delta); 1363bf215546Sopenharmony_ci } 1364bf215546Sopenharmony_ci 1365bf215546Sopenharmony_ci if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) { 1366bf215546Sopenharmony_ci /* 1367bf215546Sopenharmony_ci * At this point both res and v0 only use the lower half of the bits, 1368bf215546Sopenharmony_ci * the rest is zero. Instead of add / mask, do add with half wide type. 1369bf215546Sopenharmony_ci */ 1370bf215546Sopenharmony_ci struct lp_type narrow_type; 1371bf215546Sopenharmony_ci struct lp_build_context narrow_bld; 1372bf215546Sopenharmony_ci 1373bf215546Sopenharmony_ci memset(&narrow_type, 0, sizeof narrow_type); 1374bf215546Sopenharmony_ci narrow_type.sign = bld->type.sign; 1375bf215546Sopenharmony_ci narrow_type.width = bld->type.width/2; 1376bf215546Sopenharmony_ci narrow_type.length = bld->type.length*2; 1377bf215546Sopenharmony_ci 1378bf215546Sopenharmony_ci lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type); 1379bf215546Sopenharmony_ci res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, ""); 1380bf215546Sopenharmony_ci v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, ""); 1381bf215546Sopenharmony_ci res = lp_build_add(&narrow_bld, v0, res); 1382bf215546Sopenharmony_ci res = LLVMBuildBitCast(builder, res, bld->vec_type, ""); 1383bf215546Sopenharmony_ci } else { 1384bf215546Sopenharmony_ci res = lp_build_add(bld, v0, res); 1385bf215546Sopenharmony_ci 1386bf215546Sopenharmony_ci if (bld->type.fixed) { 1387bf215546Sopenharmony_ci /* 1388bf215546Sopenharmony_ci * We need to mask out the high order bits when lerping 8bit 
1389bf215546Sopenharmony_ci * normalized colors stored on 16bits 1390bf215546Sopenharmony_ci */ 1391bf215546Sopenharmony_ci /* XXX: This step is necessary for lerping 8bit colors stored on 1392bf215546Sopenharmony_ci * 16bits, but it will be wrong for true fixed point use cases. 1393bf215546Sopenharmony_ci * Basically we need a more powerful lp_type, capable of further 1394bf215546Sopenharmony_ci * distinguishing the values interpretation from the value storage. 1395bf215546Sopenharmony_ci */ 1396bf215546Sopenharmony_ci LLVMValueRef low_bits; 1397bf215546Sopenharmony_ci low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1); 1398bf215546Sopenharmony_ci res = LLVMBuildAnd(builder, res, low_bits, ""); 1399bf215546Sopenharmony_ci } 1400bf215546Sopenharmony_ci } 1401bf215546Sopenharmony_ci 1402bf215546Sopenharmony_ci return res; 1403bf215546Sopenharmony_ci} 1404bf215546Sopenharmony_ci 1405bf215546Sopenharmony_ci 1406bf215546Sopenharmony_ci/** 1407bf215546Sopenharmony_ci * Linear interpolation. 
1408bf215546Sopenharmony_ci */ 1409bf215546Sopenharmony_ciLLVMValueRef 1410bf215546Sopenharmony_cilp_build_lerp(struct lp_build_context *bld, 1411bf215546Sopenharmony_ci LLVMValueRef x, 1412bf215546Sopenharmony_ci LLVMValueRef v0, 1413bf215546Sopenharmony_ci LLVMValueRef v1, 1414bf215546Sopenharmony_ci unsigned flags) 1415bf215546Sopenharmony_ci{ 1416bf215546Sopenharmony_ci const struct lp_type type = bld->type; 1417bf215546Sopenharmony_ci LLVMValueRef res; 1418bf215546Sopenharmony_ci 1419bf215546Sopenharmony_ci assert(lp_check_value(type, x)); 1420bf215546Sopenharmony_ci assert(lp_check_value(type, v0)); 1421bf215546Sopenharmony_ci assert(lp_check_value(type, v1)); 1422bf215546Sopenharmony_ci 1423bf215546Sopenharmony_ci assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED)); 1424bf215546Sopenharmony_ci 1425bf215546Sopenharmony_ci if (type.norm) { 1426bf215546Sopenharmony_ci struct lp_type wide_type; 1427bf215546Sopenharmony_ci struct lp_build_context wide_bld; 1428bf215546Sopenharmony_ci LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh; 1429bf215546Sopenharmony_ci 1430bf215546Sopenharmony_ci assert(type.length >= 2); 1431bf215546Sopenharmony_ci 1432bf215546Sopenharmony_ci /* 1433bf215546Sopenharmony_ci * Create a wider integer type, enough to hold the 1434bf215546Sopenharmony_ci * intermediate result of the multiplication. 
1435bf215546Sopenharmony_ci */ 1436bf215546Sopenharmony_ci memset(&wide_type, 0, sizeof wide_type); 1437bf215546Sopenharmony_ci wide_type.sign = type.sign; 1438bf215546Sopenharmony_ci wide_type.width = type.width*2; 1439bf215546Sopenharmony_ci wide_type.length = type.length/2; 1440bf215546Sopenharmony_ci 1441bf215546Sopenharmony_ci lp_build_context_init(&wide_bld, bld->gallivm, wide_type); 1442bf215546Sopenharmony_ci 1443bf215546Sopenharmony_ci lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh); 1444bf215546Sopenharmony_ci lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h); 1445bf215546Sopenharmony_ci lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h); 1446bf215546Sopenharmony_ci 1447bf215546Sopenharmony_ci /* 1448bf215546Sopenharmony_ci * Lerp both halves. 1449bf215546Sopenharmony_ci */ 1450bf215546Sopenharmony_ci 1451bf215546Sopenharmony_ci flags |= LP_BLD_LERP_WIDE_NORMALIZED; 1452bf215546Sopenharmony_ci 1453bf215546Sopenharmony_ci resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags); 1454bf215546Sopenharmony_ci resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags); 1455bf215546Sopenharmony_ci 1456bf215546Sopenharmony_ci res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh); 1457bf215546Sopenharmony_ci } else { 1458bf215546Sopenharmony_ci res = lp_build_lerp_simple(bld, x, v0, v1, flags); 1459bf215546Sopenharmony_ci } 1460bf215546Sopenharmony_ci 1461bf215546Sopenharmony_ci return res; 1462bf215546Sopenharmony_ci} 1463bf215546Sopenharmony_ci 1464bf215546Sopenharmony_ci 1465bf215546Sopenharmony_ci/** 1466bf215546Sopenharmony_ci * Bilinear interpolation. 1467bf215546Sopenharmony_ci * 1468bf215546Sopenharmony_ci * Values indices are in v_{yx}. 
1469bf215546Sopenharmony_ci */ 1470bf215546Sopenharmony_ciLLVMValueRef 1471bf215546Sopenharmony_cilp_build_lerp_2d(struct lp_build_context *bld, 1472bf215546Sopenharmony_ci LLVMValueRef x, 1473bf215546Sopenharmony_ci LLVMValueRef y, 1474bf215546Sopenharmony_ci LLVMValueRef v00, 1475bf215546Sopenharmony_ci LLVMValueRef v01, 1476bf215546Sopenharmony_ci LLVMValueRef v10, 1477bf215546Sopenharmony_ci LLVMValueRef v11, 1478bf215546Sopenharmony_ci unsigned flags) 1479bf215546Sopenharmony_ci{ 1480bf215546Sopenharmony_ci LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags); 1481bf215546Sopenharmony_ci LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags); 1482bf215546Sopenharmony_ci return lp_build_lerp(bld, y, v0, v1, flags); 1483bf215546Sopenharmony_ci} 1484bf215546Sopenharmony_ci 1485bf215546Sopenharmony_ci 1486bf215546Sopenharmony_ciLLVMValueRef 1487bf215546Sopenharmony_cilp_build_lerp_3d(struct lp_build_context *bld, 1488bf215546Sopenharmony_ci LLVMValueRef x, 1489bf215546Sopenharmony_ci LLVMValueRef y, 1490bf215546Sopenharmony_ci LLVMValueRef z, 1491bf215546Sopenharmony_ci LLVMValueRef v000, 1492bf215546Sopenharmony_ci LLVMValueRef v001, 1493bf215546Sopenharmony_ci LLVMValueRef v010, 1494bf215546Sopenharmony_ci LLVMValueRef v011, 1495bf215546Sopenharmony_ci LLVMValueRef v100, 1496bf215546Sopenharmony_ci LLVMValueRef v101, 1497bf215546Sopenharmony_ci LLVMValueRef v110, 1498bf215546Sopenharmony_ci LLVMValueRef v111, 1499bf215546Sopenharmony_ci unsigned flags) 1500bf215546Sopenharmony_ci{ 1501bf215546Sopenharmony_ci LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags); 1502bf215546Sopenharmony_ci LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags); 1503bf215546Sopenharmony_ci return lp_build_lerp(bld, z, v0, v1, flags); 1504bf215546Sopenharmony_ci} 1505bf215546Sopenharmony_ci 1506bf215546Sopenharmony_ci 1507bf215546Sopenharmony_ci/** 1508bf215546Sopenharmony_ci * Generate min(a, b) 1509bf215546Sopenharmony_ci * Do 
checks for special cases but not for nans. 1510bf215546Sopenharmony_ci */ 1511bf215546Sopenharmony_ciLLVMValueRef 1512bf215546Sopenharmony_cilp_build_min(struct lp_build_context *bld, 1513bf215546Sopenharmony_ci LLVMValueRef a, 1514bf215546Sopenharmony_ci LLVMValueRef b) 1515bf215546Sopenharmony_ci{ 1516bf215546Sopenharmony_ci assert(lp_check_value(bld->type, a)); 1517bf215546Sopenharmony_ci assert(lp_check_value(bld->type, b)); 1518bf215546Sopenharmony_ci 1519bf215546Sopenharmony_ci if (a == bld->undef || b == bld->undef) 1520bf215546Sopenharmony_ci return bld->undef; 1521bf215546Sopenharmony_ci 1522bf215546Sopenharmony_ci if (a == b) 1523bf215546Sopenharmony_ci return a; 1524bf215546Sopenharmony_ci 1525bf215546Sopenharmony_ci if (bld->type.norm) { 1526bf215546Sopenharmony_ci if (!bld->type.sign) { 1527bf215546Sopenharmony_ci if (a == bld->zero || b == bld->zero) { 1528bf215546Sopenharmony_ci return bld->zero; 1529bf215546Sopenharmony_ci } 1530bf215546Sopenharmony_ci } 1531bf215546Sopenharmony_ci if (a == bld->one) 1532bf215546Sopenharmony_ci return b; 1533bf215546Sopenharmony_ci if (b == bld->one) 1534bf215546Sopenharmony_ci return a; 1535bf215546Sopenharmony_ci } 1536bf215546Sopenharmony_ci 1537bf215546Sopenharmony_ci return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED); 1538bf215546Sopenharmony_ci} 1539bf215546Sopenharmony_ci 1540bf215546Sopenharmony_ci 1541bf215546Sopenharmony_ci/** 1542bf215546Sopenharmony_ci * Generate min(a, b) 1543bf215546Sopenharmony_ci * NaN's are handled according to the behavior specified by the 1544bf215546Sopenharmony_ci * nan_behavior argument. 
1545bf215546Sopenharmony_ci */ 1546bf215546Sopenharmony_ciLLVMValueRef 1547bf215546Sopenharmony_cilp_build_min_ext(struct lp_build_context *bld, 1548bf215546Sopenharmony_ci LLVMValueRef a, 1549bf215546Sopenharmony_ci LLVMValueRef b, 1550bf215546Sopenharmony_ci enum gallivm_nan_behavior nan_behavior) 1551bf215546Sopenharmony_ci{ 1552bf215546Sopenharmony_ci assert(lp_check_value(bld->type, a)); 1553bf215546Sopenharmony_ci assert(lp_check_value(bld->type, b)); 1554bf215546Sopenharmony_ci 1555bf215546Sopenharmony_ci if (a == bld->undef || b == bld->undef) 1556bf215546Sopenharmony_ci return bld->undef; 1557bf215546Sopenharmony_ci 1558bf215546Sopenharmony_ci if (a == b) 1559bf215546Sopenharmony_ci return a; 1560bf215546Sopenharmony_ci 1561bf215546Sopenharmony_ci if (bld->type.norm) { 1562bf215546Sopenharmony_ci if (!bld->type.sign) { 1563bf215546Sopenharmony_ci if (a == bld->zero || b == bld->zero) { 1564bf215546Sopenharmony_ci return bld->zero; 1565bf215546Sopenharmony_ci } 1566bf215546Sopenharmony_ci } 1567bf215546Sopenharmony_ci if (a == bld->one) 1568bf215546Sopenharmony_ci return b; 1569bf215546Sopenharmony_ci if (b == bld->one) 1570bf215546Sopenharmony_ci return a; 1571bf215546Sopenharmony_ci } 1572bf215546Sopenharmony_ci 1573bf215546Sopenharmony_ci return lp_build_min_simple(bld, a, b, nan_behavior); 1574bf215546Sopenharmony_ci} 1575bf215546Sopenharmony_ci 1576bf215546Sopenharmony_ci 1577bf215546Sopenharmony_ci/** 1578bf215546Sopenharmony_ci * Generate max(a, b) 1579bf215546Sopenharmony_ci * Do checks for special cases, but NaN behavior is undefined. 
1580bf215546Sopenharmony_ci */ 1581bf215546Sopenharmony_ciLLVMValueRef 1582bf215546Sopenharmony_cilp_build_max(struct lp_build_context *bld, 1583bf215546Sopenharmony_ci LLVMValueRef a, 1584bf215546Sopenharmony_ci LLVMValueRef b) 1585bf215546Sopenharmony_ci{ 1586bf215546Sopenharmony_ci assert(lp_check_value(bld->type, a)); 1587bf215546Sopenharmony_ci assert(lp_check_value(bld->type, b)); 1588bf215546Sopenharmony_ci 1589bf215546Sopenharmony_ci if (a == bld->undef || b == bld->undef) 1590bf215546Sopenharmony_ci return bld->undef; 1591bf215546Sopenharmony_ci 1592bf215546Sopenharmony_ci if (a == b) 1593bf215546Sopenharmony_ci return a; 1594bf215546Sopenharmony_ci 1595bf215546Sopenharmony_ci if (bld->type.norm) { 1596bf215546Sopenharmony_ci if (a == bld->one || b == bld->one) 1597bf215546Sopenharmony_ci return bld->one; 1598bf215546Sopenharmony_ci if (!bld->type.sign) { 1599bf215546Sopenharmony_ci if (a == bld->zero) { 1600bf215546Sopenharmony_ci return b; 1601bf215546Sopenharmony_ci } 1602bf215546Sopenharmony_ci if (b == bld->zero) { 1603bf215546Sopenharmony_ci return a; 1604bf215546Sopenharmony_ci } 1605bf215546Sopenharmony_ci } 1606bf215546Sopenharmony_ci } 1607bf215546Sopenharmony_ci 1608bf215546Sopenharmony_ci return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED); 1609bf215546Sopenharmony_ci} 1610bf215546Sopenharmony_ci 1611bf215546Sopenharmony_ci 1612bf215546Sopenharmony_ci/** 1613bf215546Sopenharmony_ci * Generate max(a, b) 1614bf215546Sopenharmony_ci * Checks for special cases. 1615bf215546Sopenharmony_ci * NaN's are handled according to the behavior specified by the 1616bf215546Sopenharmony_ci * nan_behavior argument. 
1617bf215546Sopenharmony_ci */ 1618bf215546Sopenharmony_ciLLVMValueRef 1619bf215546Sopenharmony_cilp_build_max_ext(struct lp_build_context *bld, 1620bf215546Sopenharmony_ci LLVMValueRef a, 1621bf215546Sopenharmony_ci LLVMValueRef b, 1622bf215546Sopenharmony_ci enum gallivm_nan_behavior nan_behavior) 1623bf215546Sopenharmony_ci{ 1624bf215546Sopenharmony_ci assert(lp_check_value(bld->type, a)); 1625bf215546Sopenharmony_ci assert(lp_check_value(bld->type, b)); 1626bf215546Sopenharmony_ci 1627bf215546Sopenharmony_ci if (a == bld->undef || b == bld->undef) 1628bf215546Sopenharmony_ci return bld->undef; 1629bf215546Sopenharmony_ci 1630bf215546Sopenharmony_ci if (a == b) 1631bf215546Sopenharmony_ci return a; 1632bf215546Sopenharmony_ci 1633bf215546Sopenharmony_ci if (bld->type.norm) { 1634bf215546Sopenharmony_ci if (a == bld->one || b == bld->one) 1635bf215546Sopenharmony_ci return bld->one; 1636bf215546Sopenharmony_ci if (!bld->type.sign) { 1637bf215546Sopenharmony_ci if (a == bld->zero) { 1638bf215546Sopenharmony_ci return b; 1639bf215546Sopenharmony_ci } 1640bf215546Sopenharmony_ci if (b == bld->zero) { 1641bf215546Sopenharmony_ci return a; 1642bf215546Sopenharmony_ci } 1643bf215546Sopenharmony_ci } 1644bf215546Sopenharmony_ci } 1645bf215546Sopenharmony_ci 1646bf215546Sopenharmony_ci return lp_build_max_simple(bld, a, b, nan_behavior); 1647bf215546Sopenharmony_ci} 1648bf215546Sopenharmony_ci 1649bf215546Sopenharmony_ci 1650bf215546Sopenharmony_ci/** 1651bf215546Sopenharmony_ci * Generate clamp(a, min, max) 1652bf215546Sopenharmony_ci * NaN behavior (for any of a, min, max) is undefined. 1653bf215546Sopenharmony_ci * Do checks for special cases. 
1654bf215546Sopenharmony_ci */ 1655bf215546Sopenharmony_ciLLVMValueRef 1656bf215546Sopenharmony_cilp_build_clamp(struct lp_build_context *bld, 1657bf215546Sopenharmony_ci LLVMValueRef a, 1658bf215546Sopenharmony_ci LLVMValueRef min, 1659bf215546Sopenharmony_ci LLVMValueRef max) 1660bf215546Sopenharmony_ci{ 1661bf215546Sopenharmony_ci assert(lp_check_value(bld->type, a)); 1662bf215546Sopenharmony_ci assert(lp_check_value(bld->type, min)); 1663bf215546Sopenharmony_ci assert(lp_check_value(bld->type, max)); 1664bf215546Sopenharmony_ci 1665bf215546Sopenharmony_ci a = lp_build_min(bld, a, max); 1666bf215546Sopenharmony_ci a = lp_build_max(bld, a, min); 1667bf215546Sopenharmony_ci return a; 1668bf215546Sopenharmony_ci} 1669bf215546Sopenharmony_ci 1670bf215546Sopenharmony_ci 1671bf215546Sopenharmony_ci/** 1672bf215546Sopenharmony_ci * Generate clamp(a, 0, 1) 1673bf215546Sopenharmony_ci * A NaN will get converted to zero. 1674bf215546Sopenharmony_ci */ 1675bf215546Sopenharmony_ciLLVMValueRef 1676bf215546Sopenharmony_cilp_build_clamp_zero_one_nanzero(struct lp_build_context *bld, 1677bf215546Sopenharmony_ci LLVMValueRef a) 1678bf215546Sopenharmony_ci{ 1679bf215546Sopenharmony_ci a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); 1680bf215546Sopenharmony_ci a = lp_build_min(bld, a, bld->one); 1681bf215546Sopenharmony_ci return a; 1682bf215546Sopenharmony_ci} 1683bf215546Sopenharmony_ci 1684bf215546Sopenharmony_ci 1685bf215546Sopenharmony_ci/** 1686bf215546Sopenharmony_ci * Generate abs(a) 1687bf215546Sopenharmony_ci */ 1688bf215546Sopenharmony_ciLLVMValueRef 1689bf215546Sopenharmony_cilp_build_abs(struct lp_build_context *bld, 1690bf215546Sopenharmony_ci LLVMValueRef a) 1691bf215546Sopenharmony_ci{ 1692bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 1693bf215546Sopenharmony_ci const struct lp_type type = bld->type; 1694bf215546Sopenharmony_ci LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 
1695bf215546Sopenharmony_ci 1696bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 1697bf215546Sopenharmony_ci 1698bf215546Sopenharmony_ci if (!type.sign) 1699bf215546Sopenharmony_ci return a; 1700bf215546Sopenharmony_ci 1701bf215546Sopenharmony_ci if (type.floating) { 1702bf215546Sopenharmony_ci char intrinsic[32]; 1703bf215546Sopenharmony_ci lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type); 1704bf215546Sopenharmony_ci return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 1705bf215546Sopenharmony_ci } 1706bf215546Sopenharmony_ci 1707bf215546Sopenharmony_ci if (type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) { 1708bf215546Sopenharmony_ci switch(type.width) { 1709bf215546Sopenharmony_ci case 8: 1710bf215546Sopenharmony_ci return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a); 1711bf215546Sopenharmony_ci case 16: 1712bf215546Sopenharmony_ci return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a); 1713bf215546Sopenharmony_ci case 32: 1714bf215546Sopenharmony_ci return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); 1715bf215546Sopenharmony_ci } 1716bf215546Sopenharmony_ci } 1717bf215546Sopenharmony_ci else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) { 1718bf215546Sopenharmony_ci switch(type.width) { 1719bf215546Sopenharmony_ci case 8: 1720bf215546Sopenharmony_ci return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a); 1721bf215546Sopenharmony_ci case 16: 1722bf215546Sopenharmony_ci return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a); 1723bf215546Sopenharmony_ci case 32: 1724bf215546Sopenharmony_ci return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a); 1725bf215546Sopenharmony_ci } 1726bf215546Sopenharmony_ci } 1727bf215546Sopenharmony_ci 1728bf215546Sopenharmony_ci return 
lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero), 1729bf215546Sopenharmony_ci a, LLVMBuildNeg(builder, a, "")); 1730bf215546Sopenharmony_ci} 1731bf215546Sopenharmony_ci 1732bf215546Sopenharmony_ci 1733bf215546Sopenharmony_ciLLVMValueRef 1734bf215546Sopenharmony_cilp_build_negate(struct lp_build_context *bld, 1735bf215546Sopenharmony_ci LLVMValueRef a) 1736bf215546Sopenharmony_ci{ 1737bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 1738bf215546Sopenharmony_ci 1739bf215546Sopenharmony_ci assert(lp_check_value(bld->type, a)); 1740bf215546Sopenharmony_ci 1741bf215546Sopenharmony_ci if (bld->type.floating) 1742bf215546Sopenharmony_ci a = LLVMBuildFNeg(builder, a, ""); 1743bf215546Sopenharmony_ci else 1744bf215546Sopenharmony_ci a = LLVMBuildNeg(builder, a, ""); 1745bf215546Sopenharmony_ci 1746bf215546Sopenharmony_ci return a; 1747bf215546Sopenharmony_ci} 1748bf215546Sopenharmony_ci 1749bf215546Sopenharmony_ci 1750bf215546Sopenharmony_ci/** Return -1, 0 or +1 depending on the sign of a */ 1751bf215546Sopenharmony_ciLLVMValueRef 1752bf215546Sopenharmony_cilp_build_sgn(struct lp_build_context *bld, 1753bf215546Sopenharmony_ci LLVMValueRef a) 1754bf215546Sopenharmony_ci{ 1755bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 1756bf215546Sopenharmony_ci const struct lp_type type = bld->type; 1757bf215546Sopenharmony_ci LLVMValueRef cond; 1758bf215546Sopenharmony_ci LLVMValueRef res; 1759bf215546Sopenharmony_ci 1760bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 1761bf215546Sopenharmony_ci 1762bf215546Sopenharmony_ci /* Handle non-zero case */ 1763bf215546Sopenharmony_ci if (!type.sign) { 1764bf215546Sopenharmony_ci /* if not zero then sign must be positive */ 1765bf215546Sopenharmony_ci res = bld->one; 1766bf215546Sopenharmony_ci } 1767bf215546Sopenharmony_ci else if (type.floating) { 1768bf215546Sopenharmony_ci LLVMTypeRef vec_type; 1769bf215546Sopenharmony_ci LLVMTypeRef int_type; 
1770bf215546Sopenharmony_ci LLVMValueRef mask; 1771bf215546Sopenharmony_ci LLVMValueRef sign; 1772bf215546Sopenharmony_ci LLVMValueRef one; 1773bf215546Sopenharmony_ci unsigned long long maskBit = (unsigned long long)1 << (type.width - 1); 1774bf215546Sopenharmony_ci 1775bf215546Sopenharmony_ci int_type = lp_build_int_vec_type(bld->gallivm, type); 1776bf215546Sopenharmony_ci vec_type = lp_build_vec_type(bld->gallivm, type); 1777bf215546Sopenharmony_ci mask = lp_build_const_int_vec(bld->gallivm, type, maskBit); 1778bf215546Sopenharmony_ci 1779bf215546Sopenharmony_ci /* Take the sign bit and add it to 1 constant */ 1780bf215546Sopenharmony_ci sign = LLVMBuildBitCast(builder, a, int_type, ""); 1781bf215546Sopenharmony_ci sign = LLVMBuildAnd(builder, sign, mask, ""); 1782bf215546Sopenharmony_ci one = LLVMConstBitCast(bld->one, int_type); 1783bf215546Sopenharmony_ci res = LLVMBuildOr(builder, sign, one, ""); 1784bf215546Sopenharmony_ci res = LLVMBuildBitCast(builder, res, vec_type, ""); 1785bf215546Sopenharmony_ci } 1786bf215546Sopenharmony_ci else 1787bf215546Sopenharmony_ci { 1788bf215546Sopenharmony_ci /* signed int/norm/fixed point */ 1789bf215546Sopenharmony_ci /* could use psign with sse3 and appropriate vectors here */ 1790bf215546Sopenharmony_ci LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0); 1791bf215546Sopenharmony_ci cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero); 1792bf215546Sopenharmony_ci res = lp_build_select(bld, cond, bld->one, minus_one); 1793bf215546Sopenharmony_ci } 1794bf215546Sopenharmony_ci 1795bf215546Sopenharmony_ci /* Handle zero */ 1796bf215546Sopenharmony_ci cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero); 1797bf215546Sopenharmony_ci res = lp_build_select(bld, cond, bld->zero, res); 1798bf215546Sopenharmony_ci 1799bf215546Sopenharmony_ci return res; 1800bf215546Sopenharmony_ci} 1801bf215546Sopenharmony_ci 1802bf215546Sopenharmony_ci 1803bf215546Sopenharmony_ci/** 1804bf215546Sopenharmony_ci * Set the 
sign of float vector 'a' according to 'sign'. 1805bf215546Sopenharmony_ci * If sign==0, return abs(a). 1806bf215546Sopenharmony_ci * If sign==1, return -abs(a); 1807bf215546Sopenharmony_ci * Other values for sign produce undefined results. 1808bf215546Sopenharmony_ci */ 1809bf215546Sopenharmony_ciLLVMValueRef 1810bf215546Sopenharmony_cilp_build_set_sign(struct lp_build_context *bld, 1811bf215546Sopenharmony_ci LLVMValueRef a, LLVMValueRef sign) 1812bf215546Sopenharmony_ci{ 1813bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 1814bf215546Sopenharmony_ci const struct lp_type type = bld->type; 1815bf215546Sopenharmony_ci LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1816bf215546Sopenharmony_ci LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1817bf215546Sopenharmony_ci LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1); 1818bf215546Sopenharmony_ci LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 1819bf215546Sopenharmony_ci ~((unsigned long long) 1 << (type.width - 1))); 1820bf215546Sopenharmony_ci LLVMValueRef val, res; 1821bf215546Sopenharmony_ci 1822bf215546Sopenharmony_ci assert(type.floating); 1823bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 1824bf215546Sopenharmony_ci 1825bf215546Sopenharmony_ci /* val = reinterpret_cast<int>(a) */ 1826bf215546Sopenharmony_ci val = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1827bf215546Sopenharmony_ci /* val = val & mask */ 1828bf215546Sopenharmony_ci val = LLVMBuildAnd(builder, val, mask, ""); 1829bf215546Sopenharmony_ci /* sign = sign << shift */ 1830bf215546Sopenharmony_ci sign = LLVMBuildShl(builder, sign, shift, ""); 1831bf215546Sopenharmony_ci /* res = val | sign */ 1832bf215546Sopenharmony_ci res = LLVMBuildOr(builder, val, sign, ""); 1833bf215546Sopenharmony_ci /* res = reinterpret_cast<float>(res) */ 1834bf215546Sopenharmony_ci res = LLVMBuildBitCast(builder, res, vec_type, ""); 
1835bf215546Sopenharmony_ci 1836bf215546Sopenharmony_ci return res; 1837bf215546Sopenharmony_ci} 1838bf215546Sopenharmony_ci 1839bf215546Sopenharmony_ci 1840bf215546Sopenharmony_ci/** 1841bf215546Sopenharmony_ci * Convert vector of (or scalar) int to vector of (or scalar) float. 1842bf215546Sopenharmony_ci */ 1843bf215546Sopenharmony_ciLLVMValueRef 1844bf215546Sopenharmony_cilp_build_int_to_float(struct lp_build_context *bld, 1845bf215546Sopenharmony_ci LLVMValueRef a) 1846bf215546Sopenharmony_ci{ 1847bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 1848bf215546Sopenharmony_ci const struct lp_type type = bld->type; 1849bf215546Sopenharmony_ci LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1850bf215546Sopenharmony_ci 1851bf215546Sopenharmony_ci assert(type.floating); 1852bf215546Sopenharmony_ci 1853bf215546Sopenharmony_ci return LLVMBuildSIToFP(builder, a, vec_type, ""); 1854bf215546Sopenharmony_ci} 1855bf215546Sopenharmony_ci 1856bf215546Sopenharmony_ci 1857bf215546Sopenharmony_cistatic boolean 1858bf215546Sopenharmony_ciarch_rounding_available(const struct lp_type type) 1859bf215546Sopenharmony_ci{ 1860bf215546Sopenharmony_ci if ((util_get_cpu_caps()->has_sse4_1 && 1861bf215546Sopenharmony_ci (type.length == 1 || type.width*type.length == 128)) || 1862bf215546Sopenharmony_ci (util_get_cpu_caps()->has_avx && type.width*type.length == 256) || 1863bf215546Sopenharmony_ci (util_get_cpu_caps()->has_avx512f && type.width*type.length == 512)) 1864bf215546Sopenharmony_ci return TRUE; 1865bf215546Sopenharmony_ci else if ((util_get_cpu_caps()->has_altivec && 1866bf215546Sopenharmony_ci (type.width == 32 && type.length == 4))) 1867bf215546Sopenharmony_ci return TRUE; 1868bf215546Sopenharmony_ci else if (util_get_cpu_caps()->has_neon) 1869bf215546Sopenharmony_ci return TRUE; 1870bf215546Sopenharmony_ci else if (util_get_cpu_caps()->family == CPU_S390X) 1871bf215546Sopenharmony_ci return TRUE; 1872bf215546Sopenharmony_ci 
1873bf215546Sopenharmony_ci return FALSE; 1874bf215546Sopenharmony_ci} 1875bf215546Sopenharmony_ci 1876bf215546Sopenharmony_cienum lp_build_round_mode 1877bf215546Sopenharmony_ci{ 1878bf215546Sopenharmony_ci LP_BUILD_ROUND_NEAREST = 0, 1879bf215546Sopenharmony_ci LP_BUILD_ROUND_FLOOR = 1, 1880bf215546Sopenharmony_ci LP_BUILD_ROUND_CEIL = 2, 1881bf215546Sopenharmony_ci LP_BUILD_ROUND_TRUNCATE = 3 1882bf215546Sopenharmony_ci}; 1883bf215546Sopenharmony_ci 1884bf215546Sopenharmony_ci 1885bf215546Sopenharmony_cistatic inline LLVMValueRef 1886bf215546Sopenharmony_cilp_build_iround_nearest_sse2(struct lp_build_context *bld, 1887bf215546Sopenharmony_ci LLVMValueRef a) 1888bf215546Sopenharmony_ci{ 1889bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 1890bf215546Sopenharmony_ci const struct lp_type type = bld->type; 1891bf215546Sopenharmony_ci LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); 1892bf215546Sopenharmony_ci LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type); 1893bf215546Sopenharmony_ci const char *intrinsic; 1894bf215546Sopenharmony_ci LLVMValueRef res; 1895bf215546Sopenharmony_ci 1896bf215546Sopenharmony_ci assert(type.floating); 1897bf215546Sopenharmony_ci /* using the double precision conversions is a bit more complicated */ 1898bf215546Sopenharmony_ci assert(type.width == 32); 1899bf215546Sopenharmony_ci 1900bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 1901bf215546Sopenharmony_ci assert(util_get_cpu_caps()->has_sse2); 1902bf215546Sopenharmony_ci 1903bf215546Sopenharmony_ci /* This is relying on MXCSR rounding mode, which should always be nearest. 
*/ 1904bf215546Sopenharmony_ci if (type.length == 1) { 1905bf215546Sopenharmony_ci LLVMTypeRef vec_type; 1906bf215546Sopenharmony_ci LLVMValueRef undef; 1907bf215546Sopenharmony_ci LLVMValueRef arg; 1908bf215546Sopenharmony_ci LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); 1909bf215546Sopenharmony_ci 1910bf215546Sopenharmony_ci vec_type = LLVMVectorType(bld->elem_type, 4); 1911bf215546Sopenharmony_ci 1912bf215546Sopenharmony_ci intrinsic = "llvm.x86.sse.cvtss2si"; 1913bf215546Sopenharmony_ci 1914bf215546Sopenharmony_ci undef = LLVMGetUndef(vec_type); 1915bf215546Sopenharmony_ci 1916bf215546Sopenharmony_ci arg = LLVMBuildInsertElement(builder, undef, a, index0, ""); 1917bf215546Sopenharmony_ci 1918bf215546Sopenharmony_ci res = lp_build_intrinsic_unary(builder, intrinsic, 1919bf215546Sopenharmony_ci ret_type, arg); 1920bf215546Sopenharmony_ci } 1921bf215546Sopenharmony_ci else { 1922bf215546Sopenharmony_ci if (type.width* type.length == 128) { 1923bf215546Sopenharmony_ci intrinsic = "llvm.x86.sse2.cvtps2dq"; 1924bf215546Sopenharmony_ci } 1925bf215546Sopenharmony_ci else { 1926bf215546Sopenharmony_ci assert(type.width*type.length == 256); 1927bf215546Sopenharmony_ci assert(util_get_cpu_caps()->has_avx); 1928bf215546Sopenharmony_ci 1929bf215546Sopenharmony_ci intrinsic = "llvm.x86.avx.cvt.ps2dq.256"; 1930bf215546Sopenharmony_ci } 1931bf215546Sopenharmony_ci res = lp_build_intrinsic_unary(builder, intrinsic, 1932bf215546Sopenharmony_ci ret_type, a); 1933bf215546Sopenharmony_ci } 1934bf215546Sopenharmony_ci 1935bf215546Sopenharmony_ci return res; 1936bf215546Sopenharmony_ci} 1937bf215546Sopenharmony_ci 1938bf215546Sopenharmony_ci 1939bf215546Sopenharmony_ci/* 1940bf215546Sopenharmony_ci */ 1941bf215546Sopenharmony_cistatic inline LLVMValueRef 1942bf215546Sopenharmony_cilp_build_round_altivec(struct lp_build_context *bld, 1943bf215546Sopenharmony_ci LLVMValueRef a, 1944bf215546Sopenharmony_ci enum lp_build_round_mode mode) 1945bf215546Sopenharmony_ci{ 
1946bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 1947bf215546Sopenharmony_ci const struct lp_type type = bld->type; 1948bf215546Sopenharmony_ci const char *intrinsic = NULL; 1949bf215546Sopenharmony_ci 1950bf215546Sopenharmony_ci assert(type.floating); 1951bf215546Sopenharmony_ci 1952bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 1953bf215546Sopenharmony_ci assert(util_get_cpu_caps()->has_altivec); 1954bf215546Sopenharmony_ci 1955bf215546Sopenharmony_ci (void)type; 1956bf215546Sopenharmony_ci 1957bf215546Sopenharmony_ci switch (mode) { 1958bf215546Sopenharmony_ci case LP_BUILD_ROUND_NEAREST: 1959bf215546Sopenharmony_ci intrinsic = "llvm.ppc.altivec.vrfin"; 1960bf215546Sopenharmony_ci break; 1961bf215546Sopenharmony_ci case LP_BUILD_ROUND_FLOOR: 1962bf215546Sopenharmony_ci intrinsic = "llvm.ppc.altivec.vrfim"; 1963bf215546Sopenharmony_ci break; 1964bf215546Sopenharmony_ci case LP_BUILD_ROUND_CEIL: 1965bf215546Sopenharmony_ci intrinsic = "llvm.ppc.altivec.vrfip"; 1966bf215546Sopenharmony_ci break; 1967bf215546Sopenharmony_ci case LP_BUILD_ROUND_TRUNCATE: 1968bf215546Sopenharmony_ci intrinsic = "llvm.ppc.altivec.vrfiz"; 1969bf215546Sopenharmony_ci break; 1970bf215546Sopenharmony_ci } 1971bf215546Sopenharmony_ci 1972bf215546Sopenharmony_ci return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 1973bf215546Sopenharmony_ci} 1974bf215546Sopenharmony_ci 1975bf215546Sopenharmony_ci 1976bf215546Sopenharmony_cistatic inline LLVMValueRef 1977bf215546Sopenharmony_cilp_build_round_arch(struct lp_build_context *bld, 1978bf215546Sopenharmony_ci LLVMValueRef a, 1979bf215546Sopenharmony_ci enum lp_build_round_mode mode) 1980bf215546Sopenharmony_ci{ 1981bf215546Sopenharmony_ci if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon || 1982bf215546Sopenharmony_ci util_get_cpu_caps()->family == CPU_S390X) { 1983bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 1984bf215546Sopenharmony_ci const 
struct lp_type type = bld->type; 1985bf215546Sopenharmony_ci const char *intrinsic_root; 1986bf215546Sopenharmony_ci char intrinsic[32]; 1987bf215546Sopenharmony_ci 1988bf215546Sopenharmony_ci assert(type.floating); 1989bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 1990bf215546Sopenharmony_ci (void)type; 1991bf215546Sopenharmony_ci 1992bf215546Sopenharmony_ci switch (mode) { 1993bf215546Sopenharmony_ci case LP_BUILD_ROUND_NEAREST: 1994bf215546Sopenharmony_ci intrinsic_root = "llvm.nearbyint"; 1995bf215546Sopenharmony_ci break; 1996bf215546Sopenharmony_ci case LP_BUILD_ROUND_FLOOR: 1997bf215546Sopenharmony_ci intrinsic_root = "llvm.floor"; 1998bf215546Sopenharmony_ci break; 1999bf215546Sopenharmony_ci case LP_BUILD_ROUND_CEIL: 2000bf215546Sopenharmony_ci intrinsic_root = "llvm.ceil"; 2001bf215546Sopenharmony_ci break; 2002bf215546Sopenharmony_ci case LP_BUILD_ROUND_TRUNCATE: 2003bf215546Sopenharmony_ci intrinsic_root = "llvm.trunc"; 2004bf215546Sopenharmony_ci break; 2005bf215546Sopenharmony_ci default: 2006bf215546Sopenharmony_ci unreachable("unhandled lp_build_round_mode"); 2007bf215546Sopenharmony_ci } 2008bf215546Sopenharmony_ci 2009bf215546Sopenharmony_ci lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type); 2010bf215546Sopenharmony_ci return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2011bf215546Sopenharmony_ci } 2012bf215546Sopenharmony_ci else /* (util_get_cpu_caps()->has_altivec) */ 2013bf215546Sopenharmony_ci return lp_build_round_altivec(bld, a, mode); 2014bf215546Sopenharmony_ci} 2015bf215546Sopenharmony_ci 2016bf215546Sopenharmony_ci 2017bf215546Sopenharmony_ci/** 2018bf215546Sopenharmony_ci * Return the integer part of a float (vector) value (== round toward zero). 2019bf215546Sopenharmony_ci * The returned value is a float (vector). 
2020bf215546Sopenharmony_ci * Ex: trunc(-1.5) = -1.0 2021bf215546Sopenharmony_ci */ 2022bf215546Sopenharmony_ciLLVMValueRef 2023bf215546Sopenharmony_cilp_build_trunc(struct lp_build_context *bld, 2024bf215546Sopenharmony_ci LLVMValueRef a) 2025bf215546Sopenharmony_ci{ 2026bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 2027bf215546Sopenharmony_ci const struct lp_type type = bld->type; 2028bf215546Sopenharmony_ci 2029bf215546Sopenharmony_ci assert(type.floating); 2030bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 2031bf215546Sopenharmony_ci 2032bf215546Sopenharmony_ci if (type.width == 16) { 2033bf215546Sopenharmony_ci char intrinsic[64]; 2034bf215546Sopenharmony_ci lp_format_intrinsic(intrinsic, 64, "llvm.trunc", bld->vec_type); 2035bf215546Sopenharmony_ci return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2036bf215546Sopenharmony_ci } 2037bf215546Sopenharmony_ci 2038bf215546Sopenharmony_ci if (arch_rounding_available(type)) { 2039bf215546Sopenharmony_ci return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE); 2040bf215546Sopenharmony_ci } 2041bf215546Sopenharmony_ci else { 2042bf215546Sopenharmony_ci const struct lp_type type = bld->type; 2043bf215546Sopenharmony_ci struct lp_type inttype; 2044bf215546Sopenharmony_ci struct lp_build_context intbld; 2045bf215546Sopenharmony_ci LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24); 2046bf215546Sopenharmony_ci LLVMValueRef trunc, res, anosign, mask; 2047bf215546Sopenharmony_ci LLVMTypeRef int_vec_type = bld->int_vec_type; 2048bf215546Sopenharmony_ci LLVMTypeRef vec_type = bld->vec_type; 2049bf215546Sopenharmony_ci 2050bf215546Sopenharmony_ci inttype = type; 2051bf215546Sopenharmony_ci inttype.floating = 0; 2052bf215546Sopenharmony_ci lp_build_context_init(&intbld, bld->gallivm, inttype); 2053bf215546Sopenharmony_ci 2054bf215546Sopenharmony_ci /* round by truncation */ 2055bf215546Sopenharmony_ci trunc = LLVMBuildFPToSI(builder, a, int_vec_type, 
""); 2056bf215546Sopenharmony_ci res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc"); 2057bf215546Sopenharmony_ci 2058bf215546Sopenharmony_ci /* mask out sign bit */ 2059bf215546Sopenharmony_ci anosign = lp_build_abs(bld, a); 2060bf215546Sopenharmony_ci /* 2061bf215546Sopenharmony_ci * mask out all values if anosign > 2^24 2062bf215546Sopenharmony_ci * This should work both for large ints (all rounding is no-op for them 2063bf215546Sopenharmony_ci * because such floats are always exact) as well as special cases like 2064bf215546Sopenharmony_ci * NaNs, Infs (taking advantage of the fact they use max exponent). 2065bf215546Sopenharmony_ci * (2^24 is arbitrary anything between 2^24 and 2^31 should work.) 2066bf215546Sopenharmony_ci */ 2067bf215546Sopenharmony_ci anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, ""); 2068bf215546Sopenharmony_ci cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, ""); 2069bf215546Sopenharmony_ci mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval); 2070bf215546Sopenharmony_ci return lp_build_select(bld, mask, a, res); 2071bf215546Sopenharmony_ci } 2072bf215546Sopenharmony_ci} 2073bf215546Sopenharmony_ci 2074bf215546Sopenharmony_ci 2075bf215546Sopenharmony_ci/** 2076bf215546Sopenharmony_ci * Return float (vector) rounded to nearest integer (vector). The returned 2077bf215546Sopenharmony_ci * value is a float (vector). 
2078bf215546Sopenharmony_ci * Ex: round(0.9) = 1.0 2079bf215546Sopenharmony_ci * Ex: round(-1.5) = -2.0 2080bf215546Sopenharmony_ci */ 2081bf215546Sopenharmony_ciLLVMValueRef 2082bf215546Sopenharmony_cilp_build_round(struct lp_build_context *bld, 2083bf215546Sopenharmony_ci LLVMValueRef a) 2084bf215546Sopenharmony_ci{ 2085bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 2086bf215546Sopenharmony_ci const struct lp_type type = bld->type; 2087bf215546Sopenharmony_ci 2088bf215546Sopenharmony_ci assert(type.floating); 2089bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 2090bf215546Sopenharmony_ci 2091bf215546Sopenharmony_ci if (type.width == 16) { 2092bf215546Sopenharmony_ci char intrinsic[64]; 2093bf215546Sopenharmony_ci lp_format_intrinsic(intrinsic, 64, "llvm.round", bld->vec_type); 2094bf215546Sopenharmony_ci return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2095bf215546Sopenharmony_ci } 2096bf215546Sopenharmony_ci 2097bf215546Sopenharmony_ci if (arch_rounding_available(type)) { 2098bf215546Sopenharmony_ci return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST); 2099bf215546Sopenharmony_ci } 2100bf215546Sopenharmony_ci else { 2101bf215546Sopenharmony_ci const struct lp_type type = bld->type; 2102bf215546Sopenharmony_ci struct lp_type inttype; 2103bf215546Sopenharmony_ci struct lp_build_context intbld; 2104bf215546Sopenharmony_ci LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24); 2105bf215546Sopenharmony_ci LLVMValueRef res, anosign, mask; 2106bf215546Sopenharmony_ci LLVMTypeRef int_vec_type = bld->int_vec_type; 2107bf215546Sopenharmony_ci LLVMTypeRef vec_type = bld->vec_type; 2108bf215546Sopenharmony_ci 2109bf215546Sopenharmony_ci inttype = type; 2110bf215546Sopenharmony_ci inttype.floating = 0; 2111bf215546Sopenharmony_ci lp_build_context_init(&intbld, bld->gallivm, inttype); 2112bf215546Sopenharmony_ci 2113bf215546Sopenharmony_ci res = lp_build_iround(bld, a); 2114bf215546Sopenharmony_ci res 
= LLVMBuildSIToFP(builder, res, vec_type, ""); 2115bf215546Sopenharmony_ci 2116bf215546Sopenharmony_ci /* mask out sign bit */ 2117bf215546Sopenharmony_ci anosign = lp_build_abs(bld, a); 2118bf215546Sopenharmony_ci /* 2119bf215546Sopenharmony_ci * mask out all values if anosign > 2^24 2120bf215546Sopenharmony_ci * This should work both for large ints (all rounding is no-op for them 2121bf215546Sopenharmony_ci * because such floats are always exact) as well as special cases like 2122bf215546Sopenharmony_ci * NaNs, Infs (taking advantage of the fact they use max exponent). 2123bf215546Sopenharmony_ci * (2^24 is arbitrary anything between 2^24 and 2^31 should work.) 2124bf215546Sopenharmony_ci */ 2125bf215546Sopenharmony_ci anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, ""); 2126bf215546Sopenharmony_ci cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, ""); 2127bf215546Sopenharmony_ci mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval); 2128bf215546Sopenharmony_ci return lp_build_select(bld, mask, a, res); 2129bf215546Sopenharmony_ci } 2130bf215546Sopenharmony_ci} 2131bf215546Sopenharmony_ci 2132bf215546Sopenharmony_ci 2133bf215546Sopenharmony_ci/** 2134bf215546Sopenharmony_ci * Return floor of float (vector), result is a float (vector) 2135bf215546Sopenharmony_ci * Ex: floor(1.1) = 1.0 2136bf215546Sopenharmony_ci * Ex: floor(-1.1) = -2.0 2137bf215546Sopenharmony_ci */ 2138bf215546Sopenharmony_ciLLVMValueRef 2139bf215546Sopenharmony_cilp_build_floor(struct lp_build_context *bld, 2140bf215546Sopenharmony_ci LLVMValueRef a) 2141bf215546Sopenharmony_ci{ 2142bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 2143bf215546Sopenharmony_ci const struct lp_type type = bld->type; 2144bf215546Sopenharmony_ci 2145bf215546Sopenharmony_ci assert(type.floating); 2146bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 2147bf215546Sopenharmony_ci 2148bf215546Sopenharmony_ci if (arch_rounding_available(type)) { 
2149bf215546Sopenharmony_ci return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR); 2150bf215546Sopenharmony_ci } 2151bf215546Sopenharmony_ci else { 2152bf215546Sopenharmony_ci const struct lp_type type = bld->type; 2153bf215546Sopenharmony_ci struct lp_type inttype; 2154bf215546Sopenharmony_ci struct lp_build_context intbld; 2155bf215546Sopenharmony_ci LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24); 2156bf215546Sopenharmony_ci LLVMValueRef trunc, res, anosign, mask; 2157bf215546Sopenharmony_ci LLVMTypeRef int_vec_type = bld->int_vec_type; 2158bf215546Sopenharmony_ci LLVMTypeRef vec_type = bld->vec_type; 2159bf215546Sopenharmony_ci 2160bf215546Sopenharmony_ci if (type.width != 32) { 2161bf215546Sopenharmony_ci char intrinsic[32]; 2162bf215546Sopenharmony_ci lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type); 2163bf215546Sopenharmony_ci return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 2164bf215546Sopenharmony_ci } 2165bf215546Sopenharmony_ci 2166bf215546Sopenharmony_ci assert(type.width == 32); /* might want to handle doubles at some point */ 2167bf215546Sopenharmony_ci 2168bf215546Sopenharmony_ci inttype = type; 2169bf215546Sopenharmony_ci inttype.floating = 0; 2170bf215546Sopenharmony_ci lp_build_context_init(&intbld, bld->gallivm, inttype); 2171bf215546Sopenharmony_ci 2172bf215546Sopenharmony_ci /* round by truncation */ 2173bf215546Sopenharmony_ci trunc = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2174bf215546Sopenharmony_ci res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc"); 2175bf215546Sopenharmony_ci 2176bf215546Sopenharmony_ci if (type.sign) { 2177bf215546Sopenharmony_ci LLVMValueRef tmp; 2178bf215546Sopenharmony_ci 2179bf215546Sopenharmony_ci /* 2180bf215546Sopenharmony_ci * fix values if rounding is wrong (for non-special cases) 2181bf215546Sopenharmony_ci * - this is the case if trunc > a 2182bf215546Sopenharmony_ci */ 2183bf215546Sopenharmony_ci mask = lp_build_cmp(bld, 
PIPE_FUNC_GREATER, res, a); 2184bf215546Sopenharmony_ci /* tmp = trunc > a ? 1.0 : 0.0 */ 2185bf215546Sopenharmony_ci tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, ""); 2186bf215546Sopenharmony_ci tmp = lp_build_and(&intbld, mask, tmp); 2187bf215546Sopenharmony_ci tmp = LLVMBuildBitCast(builder, tmp, vec_type, ""); 2188bf215546Sopenharmony_ci res = lp_build_sub(bld, res, tmp); 2189bf215546Sopenharmony_ci } 2190bf215546Sopenharmony_ci 2191bf215546Sopenharmony_ci /* mask out sign bit */ 2192bf215546Sopenharmony_ci anosign = lp_build_abs(bld, a); 2193bf215546Sopenharmony_ci /* 2194bf215546Sopenharmony_ci * mask out all values if anosign > 2^24 2195bf215546Sopenharmony_ci * This should work both for large ints (all rounding is no-op for them 2196bf215546Sopenharmony_ci * because such floats are always exact) as well as special cases like 2197bf215546Sopenharmony_ci * NaNs, Infs (taking advantage of the fact they use max exponent). 2198bf215546Sopenharmony_ci * (2^24 is arbitrary anything between 2^24 and 2^31 should work.) 2199bf215546Sopenharmony_ci */ 2200bf215546Sopenharmony_ci anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, ""); 2201bf215546Sopenharmony_ci cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, ""); 2202bf215546Sopenharmony_ci mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval); 2203bf215546Sopenharmony_ci return lp_build_select(bld, mask, a, res); 2204bf215546Sopenharmony_ci } 2205bf215546Sopenharmony_ci} 2206bf215546Sopenharmony_ci 2207bf215546Sopenharmony_ci 2208bf215546Sopenharmony_ci/** 2209bf215546Sopenharmony_ci * Return ceiling of float (vector), returning float (vector). 
2210bf215546Sopenharmony_ci * Ex: ceil( 1.1) = 2.0 2211bf215546Sopenharmony_ci * Ex: ceil(-1.1) = -1.0 2212bf215546Sopenharmony_ci */ 2213bf215546Sopenharmony_ciLLVMValueRef 2214bf215546Sopenharmony_cilp_build_ceil(struct lp_build_context *bld, 2215bf215546Sopenharmony_ci LLVMValueRef a) 2216bf215546Sopenharmony_ci{ 2217bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 2218bf215546Sopenharmony_ci const struct lp_type type = bld->type; 2219bf215546Sopenharmony_ci 2220bf215546Sopenharmony_ci assert(type.floating); 2221bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 2222bf215546Sopenharmony_ci 2223bf215546Sopenharmony_ci if (arch_rounding_available(type)) { 2224bf215546Sopenharmony_ci return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL); 2225bf215546Sopenharmony_ci } 2226bf215546Sopenharmony_ci else { 2227bf215546Sopenharmony_ci const struct lp_type type = bld->type; 2228bf215546Sopenharmony_ci struct lp_type inttype; 2229bf215546Sopenharmony_ci struct lp_build_context intbld; 2230bf215546Sopenharmony_ci LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24); 2231bf215546Sopenharmony_ci LLVMValueRef trunc, res, anosign, mask, tmp; 2232bf215546Sopenharmony_ci LLVMTypeRef int_vec_type = bld->int_vec_type; 2233bf215546Sopenharmony_ci LLVMTypeRef vec_type = bld->vec_type; 2234bf215546Sopenharmony_ci 2235bf215546Sopenharmony_ci if (type.width != 32) { 2236bf215546Sopenharmony_ci char intrinsic[32]; 2237bf215546Sopenharmony_ci lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type); 2238bf215546Sopenharmony_ci return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 2239bf215546Sopenharmony_ci } 2240bf215546Sopenharmony_ci 2241bf215546Sopenharmony_ci assert(type.width == 32); /* might want to handle doubles at some point */ 2242bf215546Sopenharmony_ci 2243bf215546Sopenharmony_ci inttype = type; 2244bf215546Sopenharmony_ci inttype.floating = 0; 2245bf215546Sopenharmony_ci lp_build_context_init(&intbld, 
bld->gallivm, inttype); 2246bf215546Sopenharmony_ci 2247bf215546Sopenharmony_ci /* round by truncation */ 2248bf215546Sopenharmony_ci trunc = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2249bf215546Sopenharmony_ci trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc"); 2250bf215546Sopenharmony_ci 2251bf215546Sopenharmony_ci /* 2252bf215546Sopenharmony_ci * fix values if rounding is wrong (for non-special cases) 2253bf215546Sopenharmony_ci * - this is the case if trunc < a 2254bf215546Sopenharmony_ci */ 2255bf215546Sopenharmony_ci mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a); 2256bf215546Sopenharmony_ci /* tmp = trunc < a ? 1.0 : 0.0 */ 2257bf215546Sopenharmony_ci tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, ""); 2258bf215546Sopenharmony_ci tmp = lp_build_and(&intbld, mask, tmp); 2259bf215546Sopenharmony_ci tmp = LLVMBuildBitCast(builder, tmp, vec_type, ""); 2260bf215546Sopenharmony_ci res = lp_build_add(bld, trunc, tmp); 2261bf215546Sopenharmony_ci 2262bf215546Sopenharmony_ci /* mask out sign bit */ 2263bf215546Sopenharmony_ci anosign = lp_build_abs(bld, a); 2264bf215546Sopenharmony_ci /* 2265bf215546Sopenharmony_ci * mask out all values if anosign > 2^24 2266bf215546Sopenharmony_ci * This should work both for large ints (all rounding is no-op for them 2267bf215546Sopenharmony_ci * because such floats are always exact) as well as special cases like 2268bf215546Sopenharmony_ci * NaNs, Infs (taking advantage of the fact they use max exponent). 2269bf215546Sopenharmony_ci * (2^24 is arbitrary anything between 2^24 and 2^31 should work.) 
2270bf215546Sopenharmony_ci */ 2271bf215546Sopenharmony_ci anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, ""); 2272bf215546Sopenharmony_ci cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, ""); 2273bf215546Sopenharmony_ci mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval); 2274bf215546Sopenharmony_ci return lp_build_select(bld, mask, a, res); 2275bf215546Sopenharmony_ci } 2276bf215546Sopenharmony_ci} 2277bf215546Sopenharmony_ci 2278bf215546Sopenharmony_ci 2279bf215546Sopenharmony_ci/** 2280bf215546Sopenharmony_ci * Return fractional part of 'a' computed as a - floor(a) 2281bf215546Sopenharmony_ci * Typically used in texture coord arithmetic. 2282bf215546Sopenharmony_ci */ 2283bf215546Sopenharmony_ciLLVMValueRef 2284bf215546Sopenharmony_cilp_build_fract(struct lp_build_context *bld, 2285bf215546Sopenharmony_ci LLVMValueRef a) 2286bf215546Sopenharmony_ci{ 2287bf215546Sopenharmony_ci assert(bld->type.floating); 2288bf215546Sopenharmony_ci return lp_build_sub(bld, a, lp_build_floor(bld, a)); 2289bf215546Sopenharmony_ci} 2290bf215546Sopenharmony_ci 2291bf215546Sopenharmony_ci 2292bf215546Sopenharmony_ci/** 2293bf215546Sopenharmony_ci * Prevent returning 1.0 for very small negative values of 'a' by clamping 2294bf215546Sopenharmony_ci * against 0.99999(9). (Will also return that value for NaNs.) 
2295bf215546Sopenharmony_ci */ 2296bf215546Sopenharmony_cistatic inline LLVMValueRef 2297bf215546Sopenharmony_ciclamp_fract(struct lp_build_context *bld, LLVMValueRef fract) 2298bf215546Sopenharmony_ci{ 2299bf215546Sopenharmony_ci LLVMValueRef max; 2300bf215546Sopenharmony_ci 2301bf215546Sopenharmony_ci /* this is the largest number smaller than 1.0 representable as float */ 2302bf215546Sopenharmony_ci max = lp_build_const_vec(bld->gallivm, bld->type, 2303bf215546Sopenharmony_ci 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1))); 2304bf215546Sopenharmony_ci return lp_build_min_ext(bld, fract, max, 2305bf215546Sopenharmony_ci GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); 2306bf215546Sopenharmony_ci} 2307bf215546Sopenharmony_ci 2308bf215546Sopenharmony_ci 2309bf215546Sopenharmony_ci/** 2310bf215546Sopenharmony_ci * Same as lp_build_fract, but guarantees that the result is always smaller 2311bf215546Sopenharmony_ci * than one. Will also return the smaller-than-one value for infs, NaNs. 2312bf215546Sopenharmony_ci */ 2313bf215546Sopenharmony_ciLLVMValueRef 2314bf215546Sopenharmony_cilp_build_fract_safe(struct lp_build_context *bld, 2315bf215546Sopenharmony_ci LLVMValueRef a) 2316bf215546Sopenharmony_ci{ 2317bf215546Sopenharmony_ci return clamp_fract(bld, lp_build_fract(bld, a)); 2318bf215546Sopenharmony_ci} 2319bf215546Sopenharmony_ci 2320bf215546Sopenharmony_ci 2321bf215546Sopenharmony_ci/** 2322bf215546Sopenharmony_ci * Return the integer part of a float (vector) value (== round toward zero). 2323bf215546Sopenharmony_ci * The returned value is an integer (vector). 
2324bf215546Sopenharmony_ci * Ex: itrunc(-1.5) = -1 2325bf215546Sopenharmony_ci */ 2326bf215546Sopenharmony_ciLLVMValueRef 2327bf215546Sopenharmony_cilp_build_itrunc(struct lp_build_context *bld, 2328bf215546Sopenharmony_ci LLVMValueRef a) 2329bf215546Sopenharmony_ci{ 2330bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 2331bf215546Sopenharmony_ci const struct lp_type type = bld->type; 2332bf215546Sopenharmony_ci LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 2333bf215546Sopenharmony_ci 2334bf215546Sopenharmony_ci assert(type.floating); 2335bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 2336bf215546Sopenharmony_ci 2337bf215546Sopenharmony_ci return LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2338bf215546Sopenharmony_ci} 2339bf215546Sopenharmony_ci 2340bf215546Sopenharmony_ci 2341bf215546Sopenharmony_ci/** 2342bf215546Sopenharmony_ci * Return float (vector) rounded to nearest integer (vector). The returned 2343bf215546Sopenharmony_ci * value is an integer (vector). 
2344bf215546Sopenharmony_ci * Ex: iround(0.9) = 1 2345bf215546Sopenharmony_ci * Ex: iround(-1.5) = -2 2346bf215546Sopenharmony_ci */ 2347bf215546Sopenharmony_ciLLVMValueRef 2348bf215546Sopenharmony_cilp_build_iround(struct lp_build_context *bld, 2349bf215546Sopenharmony_ci LLVMValueRef a) 2350bf215546Sopenharmony_ci{ 2351bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 2352bf215546Sopenharmony_ci const struct lp_type type = bld->type; 2353bf215546Sopenharmony_ci LLVMTypeRef int_vec_type = bld->int_vec_type; 2354bf215546Sopenharmony_ci LLVMValueRef res; 2355bf215546Sopenharmony_ci 2356bf215546Sopenharmony_ci assert(type.floating); 2357bf215546Sopenharmony_ci 2358bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 2359bf215546Sopenharmony_ci 2360bf215546Sopenharmony_ci if ((util_get_cpu_caps()->has_sse2 && 2361bf215546Sopenharmony_ci ((type.width == 32) && (type.length == 1 || type.length == 4))) || 2362bf215546Sopenharmony_ci (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) { 2363bf215546Sopenharmony_ci return lp_build_iround_nearest_sse2(bld, a); 2364bf215546Sopenharmony_ci } 2365bf215546Sopenharmony_ci if (arch_rounding_available(type)) { 2366bf215546Sopenharmony_ci res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST); 2367bf215546Sopenharmony_ci } 2368bf215546Sopenharmony_ci else { 2369bf215546Sopenharmony_ci LLVMValueRef half; 2370bf215546Sopenharmony_ci 2371bf215546Sopenharmony_ci half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0)); 2372bf215546Sopenharmony_ci 2373bf215546Sopenharmony_ci if (type.sign) { 2374bf215546Sopenharmony_ci LLVMTypeRef vec_type = bld->vec_type; 2375bf215546Sopenharmony_ci LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 2376bf215546Sopenharmony_ci (unsigned long long)1 << (type.width - 1)); 2377bf215546Sopenharmony_ci LLVMValueRef sign; 2378bf215546Sopenharmony_ci 2379bf215546Sopenharmony_ci /* get sign bit */ 2380bf215546Sopenharmony_ci sign = 
LLVMBuildBitCast(builder, a, int_vec_type, ""); 2381bf215546Sopenharmony_ci sign = LLVMBuildAnd(builder, sign, mask, ""); 2382bf215546Sopenharmony_ci 2383bf215546Sopenharmony_ci /* sign * 0.5 */ 2384bf215546Sopenharmony_ci half = LLVMBuildBitCast(builder, half, int_vec_type, ""); 2385bf215546Sopenharmony_ci half = LLVMBuildOr(builder, sign, half, ""); 2386bf215546Sopenharmony_ci half = LLVMBuildBitCast(builder, half, vec_type, ""); 2387bf215546Sopenharmony_ci } 2388bf215546Sopenharmony_ci 2389bf215546Sopenharmony_ci res = LLVMBuildFAdd(builder, a, half, ""); 2390bf215546Sopenharmony_ci } 2391bf215546Sopenharmony_ci 2392bf215546Sopenharmony_ci res = LLVMBuildFPToSI(builder, res, int_vec_type, ""); 2393bf215546Sopenharmony_ci 2394bf215546Sopenharmony_ci return res; 2395bf215546Sopenharmony_ci} 2396bf215546Sopenharmony_ci 2397bf215546Sopenharmony_ci 2398bf215546Sopenharmony_ci/** 2399bf215546Sopenharmony_ci * Return floor of float (vector), result is an int (vector) 2400bf215546Sopenharmony_ci * Ex: ifloor(1.1) = 1.0 2401bf215546Sopenharmony_ci * Ex: ifloor(-1.1) = -2.0 2402bf215546Sopenharmony_ci */ 2403bf215546Sopenharmony_ciLLVMValueRef 2404bf215546Sopenharmony_cilp_build_ifloor(struct lp_build_context *bld, 2405bf215546Sopenharmony_ci LLVMValueRef a) 2406bf215546Sopenharmony_ci{ 2407bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 2408bf215546Sopenharmony_ci const struct lp_type type = bld->type; 2409bf215546Sopenharmony_ci LLVMTypeRef int_vec_type = bld->int_vec_type; 2410bf215546Sopenharmony_ci LLVMValueRef res; 2411bf215546Sopenharmony_ci 2412bf215546Sopenharmony_ci assert(type.floating); 2413bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 2414bf215546Sopenharmony_ci 2415bf215546Sopenharmony_ci res = a; 2416bf215546Sopenharmony_ci if (type.sign) { 2417bf215546Sopenharmony_ci if (arch_rounding_available(type)) { 2418bf215546Sopenharmony_ci res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR); 2419bf215546Sopenharmony_ci } 
2420bf215546Sopenharmony_ci else { 2421bf215546Sopenharmony_ci struct lp_type inttype; 2422bf215546Sopenharmony_ci struct lp_build_context intbld; 2423bf215546Sopenharmony_ci LLVMValueRef trunc, itrunc, mask; 2424bf215546Sopenharmony_ci 2425bf215546Sopenharmony_ci assert(type.floating); 2426bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 2427bf215546Sopenharmony_ci 2428bf215546Sopenharmony_ci inttype = type; 2429bf215546Sopenharmony_ci inttype.floating = 0; 2430bf215546Sopenharmony_ci lp_build_context_init(&intbld, bld->gallivm, inttype); 2431bf215546Sopenharmony_ci 2432bf215546Sopenharmony_ci /* round by truncation */ 2433bf215546Sopenharmony_ci itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2434bf215546Sopenharmony_ci trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc"); 2435bf215546Sopenharmony_ci 2436bf215546Sopenharmony_ci /* 2437bf215546Sopenharmony_ci * fix values if rounding is wrong (for non-special cases) 2438bf215546Sopenharmony_ci * - this is the case if trunc > a 2439bf215546Sopenharmony_ci * The results of doing this with NaNs, very large values etc. 2440bf215546Sopenharmony_ci * are undefined but this seems to be the case anyway. 
2441bf215546Sopenharmony_ci */ 2442bf215546Sopenharmony_ci mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a); 2443bf215546Sopenharmony_ci /* cheapie minus one with mask since the mask is minus one / zero */ 2444bf215546Sopenharmony_ci return lp_build_add(&intbld, itrunc, mask); 2445bf215546Sopenharmony_ci } 2446bf215546Sopenharmony_ci } 2447bf215546Sopenharmony_ci 2448bf215546Sopenharmony_ci /* round to nearest (toward zero) */ 2449bf215546Sopenharmony_ci res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res"); 2450bf215546Sopenharmony_ci 2451bf215546Sopenharmony_ci return res; 2452bf215546Sopenharmony_ci} 2453bf215546Sopenharmony_ci 2454bf215546Sopenharmony_ci 2455bf215546Sopenharmony_ci/** 2456bf215546Sopenharmony_ci * Return ceiling of float (vector), returning int (vector). 2457bf215546Sopenharmony_ci * Ex: iceil( 1.1) = 2 2458bf215546Sopenharmony_ci * Ex: iceil(-1.1) = -1 2459bf215546Sopenharmony_ci */ 2460bf215546Sopenharmony_ciLLVMValueRef 2461bf215546Sopenharmony_cilp_build_iceil(struct lp_build_context *bld, 2462bf215546Sopenharmony_ci LLVMValueRef a) 2463bf215546Sopenharmony_ci{ 2464bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 2465bf215546Sopenharmony_ci const struct lp_type type = bld->type; 2466bf215546Sopenharmony_ci LLVMTypeRef int_vec_type = bld->int_vec_type; 2467bf215546Sopenharmony_ci LLVMValueRef res; 2468bf215546Sopenharmony_ci 2469bf215546Sopenharmony_ci assert(type.floating); 2470bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 2471bf215546Sopenharmony_ci 2472bf215546Sopenharmony_ci if (arch_rounding_available(type)) { 2473bf215546Sopenharmony_ci res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL); 2474bf215546Sopenharmony_ci } 2475bf215546Sopenharmony_ci else { 2476bf215546Sopenharmony_ci struct lp_type inttype; 2477bf215546Sopenharmony_ci struct lp_build_context intbld; 2478bf215546Sopenharmony_ci LLVMValueRef trunc, itrunc, mask; 2479bf215546Sopenharmony_ci 2480bf215546Sopenharmony_ci 
assert(type.floating); 2481bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 2482bf215546Sopenharmony_ci 2483bf215546Sopenharmony_ci inttype = type; 2484bf215546Sopenharmony_ci inttype.floating = 0; 2485bf215546Sopenharmony_ci lp_build_context_init(&intbld, bld->gallivm, inttype); 2486bf215546Sopenharmony_ci 2487bf215546Sopenharmony_ci /* round by truncation */ 2488bf215546Sopenharmony_ci itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2489bf215546Sopenharmony_ci trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc"); 2490bf215546Sopenharmony_ci 2491bf215546Sopenharmony_ci /* 2492bf215546Sopenharmony_ci * fix values if rounding is wrong (for non-special cases) 2493bf215546Sopenharmony_ci * - this is the case if trunc < a 2494bf215546Sopenharmony_ci * The results of doing this with NaNs, very large values etc. 2495bf215546Sopenharmony_ci * are undefined but this seems to be the case anyway. 2496bf215546Sopenharmony_ci */ 2497bf215546Sopenharmony_ci mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a); 2498bf215546Sopenharmony_ci /* cheapie plus one with mask since the mask is minus one / zero */ 2499bf215546Sopenharmony_ci return lp_build_sub(&intbld, itrunc, mask); 2500bf215546Sopenharmony_ci } 2501bf215546Sopenharmony_ci 2502bf215546Sopenharmony_ci /* round to nearest (toward zero) */ 2503bf215546Sopenharmony_ci res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res"); 2504bf215546Sopenharmony_ci 2505bf215546Sopenharmony_ci return res; 2506bf215546Sopenharmony_ci} 2507bf215546Sopenharmony_ci 2508bf215546Sopenharmony_ci 2509bf215546Sopenharmony_ci/** 2510bf215546Sopenharmony_ci * Combined ifloor() & fract(). 2511bf215546Sopenharmony_ci * 2512bf215546Sopenharmony_ci * Preferred to calling the functions separately, as it will ensure that the 2513bf215546Sopenharmony_ci * strategy (floor() vs ifloor()) that results in less redundant work is used. 
2514bf215546Sopenharmony_ci */ 2515bf215546Sopenharmony_civoid 2516bf215546Sopenharmony_cilp_build_ifloor_fract(struct lp_build_context *bld, 2517bf215546Sopenharmony_ci LLVMValueRef a, 2518bf215546Sopenharmony_ci LLVMValueRef *out_ipart, 2519bf215546Sopenharmony_ci LLVMValueRef *out_fpart) 2520bf215546Sopenharmony_ci{ 2521bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 2522bf215546Sopenharmony_ci const struct lp_type type = bld->type; 2523bf215546Sopenharmony_ci LLVMValueRef ipart; 2524bf215546Sopenharmony_ci 2525bf215546Sopenharmony_ci assert(type.floating); 2526bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 2527bf215546Sopenharmony_ci 2528bf215546Sopenharmony_ci if (arch_rounding_available(type)) { 2529bf215546Sopenharmony_ci /* 2530bf215546Sopenharmony_ci * floor() is easier. 2531bf215546Sopenharmony_ci */ 2532bf215546Sopenharmony_ci 2533bf215546Sopenharmony_ci ipart = lp_build_floor(bld, a); 2534bf215546Sopenharmony_ci *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart"); 2535bf215546Sopenharmony_ci *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart"); 2536bf215546Sopenharmony_ci } 2537bf215546Sopenharmony_ci else { 2538bf215546Sopenharmony_ci /* 2539bf215546Sopenharmony_ci * ifloor() is easier. 2540bf215546Sopenharmony_ci */ 2541bf215546Sopenharmony_ci 2542bf215546Sopenharmony_ci *out_ipart = lp_build_ifloor(bld, a); 2543bf215546Sopenharmony_ci ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart"); 2544bf215546Sopenharmony_ci *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart"); 2545bf215546Sopenharmony_ci } 2546bf215546Sopenharmony_ci} 2547bf215546Sopenharmony_ci 2548bf215546Sopenharmony_ci 2549bf215546Sopenharmony_ci/** 2550bf215546Sopenharmony_ci * Same as lp_build_ifloor_fract, but guarantees that the fractional part is 2551bf215546Sopenharmony_ci * always smaller than one. 
2552bf215546Sopenharmony_ci */ 2553bf215546Sopenharmony_civoid 2554bf215546Sopenharmony_cilp_build_ifloor_fract_safe(struct lp_build_context *bld, 2555bf215546Sopenharmony_ci LLVMValueRef a, 2556bf215546Sopenharmony_ci LLVMValueRef *out_ipart, 2557bf215546Sopenharmony_ci LLVMValueRef *out_fpart) 2558bf215546Sopenharmony_ci{ 2559bf215546Sopenharmony_ci lp_build_ifloor_fract(bld, a, out_ipart, out_fpart); 2560bf215546Sopenharmony_ci *out_fpart = clamp_fract(bld, *out_fpart); 2561bf215546Sopenharmony_ci} 2562bf215546Sopenharmony_ci 2563bf215546Sopenharmony_ci 2564bf215546Sopenharmony_ciLLVMValueRef 2565bf215546Sopenharmony_cilp_build_sqrt(struct lp_build_context *bld, 2566bf215546Sopenharmony_ci LLVMValueRef a) 2567bf215546Sopenharmony_ci{ 2568bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 2569bf215546Sopenharmony_ci const struct lp_type type = bld->type; 2570bf215546Sopenharmony_ci LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 2571bf215546Sopenharmony_ci char intrinsic[32]; 2572bf215546Sopenharmony_ci 2573bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 2574bf215546Sopenharmony_ci 2575bf215546Sopenharmony_ci assert(type.floating); 2576bf215546Sopenharmony_ci lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type); 2577bf215546Sopenharmony_ci 2578bf215546Sopenharmony_ci return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 2579bf215546Sopenharmony_ci} 2580bf215546Sopenharmony_ci 2581bf215546Sopenharmony_ci 2582bf215546Sopenharmony_ci/** 2583bf215546Sopenharmony_ci * Do one Newton-Raphson step to improve reciprocate precision: 2584bf215546Sopenharmony_ci * 2585bf215546Sopenharmony_ci * x_{i+1} = x_i + x_i * (1 - a * x_i) 2586bf215546Sopenharmony_ci * 2587bf215546Sopenharmony_ci * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or 2588bf215546Sopenharmony_ci * +/-Inf, giving NaN instead. 
Certain applications rely on this behavior, 2589bf215546Sopenharmony_ci * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's 2590bf215546Sopenharmony_ci * halo. It would be necessary to clamp the argument to prevent this. 2591bf215546Sopenharmony_ci * 2592bf215546Sopenharmony_ci * See also: 2593bf215546Sopenharmony_ci * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division 2594bf215546Sopenharmony_ci * - http://softwarecommunity.intel.com/articles/eng/1818.htm 2595bf215546Sopenharmony_ci */ 2596bf215546Sopenharmony_cistatic inline LLVMValueRef 2597bf215546Sopenharmony_cilp_build_rcp_refine(struct lp_build_context *bld, 2598bf215546Sopenharmony_ci LLVMValueRef a, 2599bf215546Sopenharmony_ci LLVMValueRef rcp_a) 2600bf215546Sopenharmony_ci{ 2601bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 2602bf215546Sopenharmony_ci LLVMValueRef neg_a; 2603bf215546Sopenharmony_ci LLVMValueRef res; 2604bf215546Sopenharmony_ci 2605bf215546Sopenharmony_ci neg_a = LLVMBuildFNeg(builder, a, ""); 2606bf215546Sopenharmony_ci res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one); 2607bf215546Sopenharmony_ci res = lp_build_fmuladd(builder, res, rcp_a, rcp_a); 2608bf215546Sopenharmony_ci 2609bf215546Sopenharmony_ci return res; 2610bf215546Sopenharmony_ci} 2611bf215546Sopenharmony_ci 2612bf215546Sopenharmony_ci 2613bf215546Sopenharmony_ciLLVMValueRef 2614bf215546Sopenharmony_cilp_build_rcp(struct lp_build_context *bld, 2615bf215546Sopenharmony_ci LLVMValueRef a) 2616bf215546Sopenharmony_ci{ 2617bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 2618bf215546Sopenharmony_ci const struct lp_type type = bld->type; 2619bf215546Sopenharmony_ci 2620bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 2621bf215546Sopenharmony_ci 2622bf215546Sopenharmony_ci if (a == bld->zero) 2623bf215546Sopenharmony_ci return bld->undef; 2624bf215546Sopenharmony_ci if (a == bld->one) 2625bf215546Sopenharmony_ci 
return bld->one; 2626bf215546Sopenharmony_ci if (a == bld->undef) 2627bf215546Sopenharmony_ci return bld->undef; 2628bf215546Sopenharmony_ci 2629bf215546Sopenharmony_ci assert(type.floating); 2630bf215546Sopenharmony_ci 2631bf215546Sopenharmony_ci if (LLVMIsConstant(a)) 2632bf215546Sopenharmony_ci return LLVMBuildFDiv(builder, bld->one, a, ""); 2633bf215546Sopenharmony_ci 2634bf215546Sopenharmony_ci /* 2635bf215546Sopenharmony_ci * We don't use RCPPS because: 2636bf215546Sopenharmony_ci * - it only has 10bits of precision 2637bf215546Sopenharmony_ci * - it doesn't even get the reciprocate of 1.0 exactly 2638bf215546Sopenharmony_ci * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf 2639bf215546Sopenharmony_ci * - for recent processors the benefit over DIVPS is marginal, a case 2640bf215546Sopenharmony_ci * dependent 2641bf215546Sopenharmony_ci * 2642bf215546Sopenharmony_ci * We could still use it on certain processors if benchmarks show that the 2643bf215546Sopenharmony_ci * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for 2644bf215546Sopenharmony_ci * particular uses that require less workarounds. 
2645bf215546Sopenharmony_ci */ 2646bf215546Sopenharmony_ci 2647bf215546Sopenharmony_ci if (FALSE && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) || 2648bf215546Sopenharmony_ci (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))){ 2649bf215546Sopenharmony_ci const unsigned num_iterations = 0; 2650bf215546Sopenharmony_ci LLVMValueRef res; 2651bf215546Sopenharmony_ci unsigned i; 2652bf215546Sopenharmony_ci const char *intrinsic = NULL; 2653bf215546Sopenharmony_ci 2654bf215546Sopenharmony_ci if (type.length == 4) { 2655bf215546Sopenharmony_ci intrinsic = "llvm.x86.sse.rcp.ps"; 2656bf215546Sopenharmony_ci } 2657bf215546Sopenharmony_ci else { 2658bf215546Sopenharmony_ci intrinsic = "llvm.x86.avx.rcp.ps.256"; 2659bf215546Sopenharmony_ci } 2660bf215546Sopenharmony_ci 2661bf215546Sopenharmony_ci res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2662bf215546Sopenharmony_ci 2663bf215546Sopenharmony_ci for (i = 0; i < num_iterations; ++i) { 2664bf215546Sopenharmony_ci res = lp_build_rcp_refine(bld, a, res); 2665bf215546Sopenharmony_ci } 2666bf215546Sopenharmony_ci 2667bf215546Sopenharmony_ci return res; 2668bf215546Sopenharmony_ci } 2669bf215546Sopenharmony_ci 2670bf215546Sopenharmony_ci return LLVMBuildFDiv(builder, bld->one, a, ""); 2671bf215546Sopenharmony_ci} 2672bf215546Sopenharmony_ci 2673bf215546Sopenharmony_ci 2674bf215546Sopenharmony_ci/** 2675bf215546Sopenharmony_ci * Do one Newton-Raphson step to improve rsqrt precision: 2676bf215546Sopenharmony_ci * 2677bf215546Sopenharmony_ci * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i) 2678bf215546Sopenharmony_ci * 2679bf215546Sopenharmony_ci * See also Intel 64 and IA-32 Architectures Optimization Manual. 
2680bf215546Sopenharmony_ci */ 2681bf215546Sopenharmony_cistatic inline LLVMValueRef 2682bf215546Sopenharmony_cilp_build_rsqrt_refine(struct lp_build_context *bld, 2683bf215546Sopenharmony_ci LLVMValueRef a, 2684bf215546Sopenharmony_ci LLVMValueRef rsqrt_a) 2685bf215546Sopenharmony_ci{ 2686bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 2687bf215546Sopenharmony_ci LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5); 2688bf215546Sopenharmony_ci LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0); 2689bf215546Sopenharmony_ci LLVMValueRef res; 2690bf215546Sopenharmony_ci 2691bf215546Sopenharmony_ci res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, ""); 2692bf215546Sopenharmony_ci res = LLVMBuildFMul(builder, a, res, ""); 2693bf215546Sopenharmony_ci res = LLVMBuildFSub(builder, three, res, ""); 2694bf215546Sopenharmony_ci res = LLVMBuildFMul(builder, rsqrt_a, res, ""); 2695bf215546Sopenharmony_ci res = LLVMBuildFMul(builder, half, res, ""); 2696bf215546Sopenharmony_ci 2697bf215546Sopenharmony_ci return res; 2698bf215546Sopenharmony_ci} 2699bf215546Sopenharmony_ci 2700bf215546Sopenharmony_ci 2701bf215546Sopenharmony_ci/** 2702bf215546Sopenharmony_ci * Generate 1/sqrt(a). 2703bf215546Sopenharmony_ci * Result is undefined for values < 0, infinity for +0. 2704bf215546Sopenharmony_ci */ 2705bf215546Sopenharmony_ciLLVMValueRef 2706bf215546Sopenharmony_cilp_build_rsqrt(struct lp_build_context *bld, 2707bf215546Sopenharmony_ci LLVMValueRef a) 2708bf215546Sopenharmony_ci{ 2709bf215546Sopenharmony_ci const struct lp_type type = bld->type; 2710bf215546Sopenharmony_ci 2711bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 2712bf215546Sopenharmony_ci 2713bf215546Sopenharmony_ci assert(type.floating); 2714bf215546Sopenharmony_ci 2715bf215546Sopenharmony_ci /* 2716bf215546Sopenharmony_ci * This should be faster but all denormals will end up as infinity. 
2717bf215546Sopenharmony_ci */ 2718bf215546Sopenharmony_ci if (0 && lp_build_fast_rsqrt_available(type)) { 2719bf215546Sopenharmony_ci const unsigned num_iterations = 1; 2720bf215546Sopenharmony_ci LLVMValueRef res; 2721bf215546Sopenharmony_ci unsigned i; 2722bf215546Sopenharmony_ci 2723bf215546Sopenharmony_ci /* rsqrt(1.0) != 1.0 here */ 2724bf215546Sopenharmony_ci res = lp_build_fast_rsqrt(bld, a); 2725bf215546Sopenharmony_ci 2726bf215546Sopenharmony_ci if (num_iterations) { 2727bf215546Sopenharmony_ci /* 2728bf215546Sopenharmony_ci * Newton-Raphson will result in NaN instead of infinity for zero, 2729bf215546Sopenharmony_ci * and NaN instead of zero for infinity. 2730bf215546Sopenharmony_ci * Also, need to ensure rsqrt(1.0) == 1.0. 2731bf215546Sopenharmony_ci * All numbers smaller than FLT_MIN will result in +infinity 2732bf215546Sopenharmony_ci * (rsqrtps treats all denormals as zero). 2733bf215546Sopenharmony_ci */ 2734bf215546Sopenharmony_ci LLVMValueRef cmp; 2735bf215546Sopenharmony_ci LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN); 2736bf215546Sopenharmony_ci LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY); 2737bf215546Sopenharmony_ci 2738bf215546Sopenharmony_ci for (i = 0; i < num_iterations; ++i) { 2739bf215546Sopenharmony_ci res = lp_build_rsqrt_refine(bld, a, res); 2740bf215546Sopenharmony_ci } 2741bf215546Sopenharmony_ci cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min); 2742bf215546Sopenharmony_ci res = lp_build_select(bld, cmp, inf, res); 2743bf215546Sopenharmony_ci cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf); 2744bf215546Sopenharmony_ci res = lp_build_select(bld, cmp, bld->zero, res); 2745bf215546Sopenharmony_ci cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one); 2746bf215546Sopenharmony_ci res = lp_build_select(bld, cmp, bld->one, res); 2747bf215546Sopenharmony_ci } 2748bf215546Sopenharmony_ci 2749bf215546Sopenharmony_ci return res; 
2750bf215546Sopenharmony_ci } 2751bf215546Sopenharmony_ci 2752bf215546Sopenharmony_ci return lp_build_rcp(bld, lp_build_sqrt(bld, a)); 2753bf215546Sopenharmony_ci} 2754bf215546Sopenharmony_ci 2755bf215546Sopenharmony_ci 2756bf215546Sopenharmony_ci/** 2757bf215546Sopenharmony_ci * If there's a fast (inaccurate) rsqrt instruction available 2758bf215546Sopenharmony_ci * (caller may want to avoid to call rsqrt_fast if it's not available, 2759bf215546Sopenharmony_ci * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if 2760bf215546Sopenharmony_ci * unavailable it would result in sqrt/div/mul so obviously 2761bf215546Sopenharmony_ci * much better to just call sqrt, skipping both div and mul). 2762bf215546Sopenharmony_ci */ 2763bf215546Sopenharmony_ciboolean 2764bf215546Sopenharmony_cilp_build_fast_rsqrt_available(struct lp_type type) 2765bf215546Sopenharmony_ci{ 2766bf215546Sopenharmony_ci assert(type.floating); 2767bf215546Sopenharmony_ci 2768bf215546Sopenharmony_ci if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) || 2769bf215546Sopenharmony_ci (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) { 2770bf215546Sopenharmony_ci return true; 2771bf215546Sopenharmony_ci } 2772bf215546Sopenharmony_ci return false; 2773bf215546Sopenharmony_ci} 2774bf215546Sopenharmony_ci 2775bf215546Sopenharmony_ci 2776bf215546Sopenharmony_ci/** 2777bf215546Sopenharmony_ci * Generate 1/sqrt(a). 2778bf215546Sopenharmony_ci * Result is undefined for values < 0, infinity for +0. 2779bf215546Sopenharmony_ci * Precision is limited, only ~10 bits guaranteed 2780bf215546Sopenharmony_ci * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0). 
2781bf215546Sopenharmony_ci */ 2782bf215546Sopenharmony_ciLLVMValueRef 2783bf215546Sopenharmony_cilp_build_fast_rsqrt(struct lp_build_context *bld, 2784bf215546Sopenharmony_ci LLVMValueRef a) 2785bf215546Sopenharmony_ci{ 2786bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 2787bf215546Sopenharmony_ci const struct lp_type type = bld->type; 2788bf215546Sopenharmony_ci 2789bf215546Sopenharmony_ci assert(lp_check_value(type, a)); 2790bf215546Sopenharmony_ci 2791bf215546Sopenharmony_ci if (lp_build_fast_rsqrt_available(type)) { 2792bf215546Sopenharmony_ci const char *intrinsic = NULL; 2793bf215546Sopenharmony_ci 2794bf215546Sopenharmony_ci if (type.length == 4) { 2795bf215546Sopenharmony_ci intrinsic = "llvm.x86.sse.rsqrt.ps"; 2796bf215546Sopenharmony_ci } 2797bf215546Sopenharmony_ci else { 2798bf215546Sopenharmony_ci intrinsic = "llvm.x86.avx.rsqrt.ps.256"; 2799bf215546Sopenharmony_ci } 2800bf215546Sopenharmony_ci return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2801bf215546Sopenharmony_ci } 2802bf215546Sopenharmony_ci else { 2803bf215546Sopenharmony_ci debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__); 2804bf215546Sopenharmony_ci } 2805bf215546Sopenharmony_ci return lp_build_rcp(bld, lp_build_sqrt(bld, a)); 2806bf215546Sopenharmony_ci} 2807bf215546Sopenharmony_ci 2808bf215546Sopenharmony_ci 2809bf215546Sopenharmony_ci/** 2810bf215546Sopenharmony_ci * Generate sin(a) or cos(a) using polynomial approximation. 2811bf215546Sopenharmony_ci * TODO: it might be worth recognizing sin and cos using same source 2812bf215546Sopenharmony_ci * (i.e. d3d10 sincos opcode). Obviously doing both at the same time 2813bf215546Sopenharmony_ci * would be way cheaper than calculating (nearly) everything twice... 2814bf215546Sopenharmony_ci * Not sure it's common enough to be worth bothering however, scs 2815bf215546Sopenharmony_ci * opcode could also benefit from calculating both though. 
2816bf215546Sopenharmony_ci */ 2817bf215546Sopenharmony_cistatic LLVMValueRef 2818bf215546Sopenharmony_cilp_build_sin_or_cos(struct lp_build_context *bld, 2819bf215546Sopenharmony_ci LLVMValueRef a, 2820bf215546Sopenharmony_ci boolean cos) 2821bf215546Sopenharmony_ci{ 2822bf215546Sopenharmony_ci struct gallivm_state *gallivm = bld->gallivm; 2823bf215546Sopenharmony_ci LLVMBuilderRef b = gallivm->builder; 2824bf215546Sopenharmony_ci struct lp_type int_type = lp_int_type(bld->type); 2825bf215546Sopenharmony_ci 2826bf215546Sopenharmony_ci /* 2827bf215546Sopenharmony_ci * take the absolute value, 2828bf215546Sopenharmony_ci * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); 2829bf215546Sopenharmony_ci */ 2830bf215546Sopenharmony_ci 2831bf215546Sopenharmony_ci LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000); 2832bf215546Sopenharmony_ci LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si"); 2833bf215546Sopenharmony_ci 2834bf215546Sopenharmony_ci LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi"); 2835bf215546Sopenharmony_ci LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs"); 2836bf215546Sopenharmony_ci 2837bf215546Sopenharmony_ci /* 2838bf215546Sopenharmony_ci * scale by 4/Pi 2839bf215546Sopenharmony_ci * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); 2840bf215546Sopenharmony_ci */ 2841bf215546Sopenharmony_ci 2842bf215546Sopenharmony_ci LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516); 2843bf215546Sopenharmony_ci LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y"); 2844bf215546Sopenharmony_ci 2845bf215546Sopenharmony_ci /* 2846bf215546Sopenharmony_ci * store the integer part of y in mm0 2847bf215546Sopenharmony_ci * emm2 = _mm_cvttps_epi32(y); 2848bf215546Sopenharmony_ci */ 2849bf215546Sopenharmony_ci 2850bf215546Sopenharmony_ci LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i"); 2851bf215546Sopenharmony_ci 
2852bf215546Sopenharmony_ci /* 2853bf215546Sopenharmony_ci * j=(j+1) & (~1) (see the cephes sources) 2854bf215546Sopenharmony_ci * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); 2855bf215546Sopenharmony_ci */ 2856bf215546Sopenharmony_ci 2857bf215546Sopenharmony_ci LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1); 2858bf215546Sopenharmony_ci LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add"); 2859bf215546Sopenharmony_ci /* 2860bf215546Sopenharmony_ci * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); 2861bf215546Sopenharmony_ci */ 2862bf215546Sopenharmony_ci LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1); 2863bf215546Sopenharmony_ci LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and"); 2864bf215546Sopenharmony_ci 2865bf215546Sopenharmony_ci /* 2866bf215546Sopenharmony_ci * y = _mm_cvtepi32_ps(emm2); 2867bf215546Sopenharmony_ci */ 2868bf215546Sopenharmony_ci LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2"); 2869bf215546Sopenharmony_ci 2870bf215546Sopenharmony_ci LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2); 2871bf215546Sopenharmony_ci LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4); 2872bf215546Sopenharmony_ci LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29); 2873bf215546Sopenharmony_ci LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000); 2874bf215546Sopenharmony_ci 2875bf215546Sopenharmony_ci /* 2876bf215546Sopenharmony_ci * Argument used for poly selection and sign bit determination 2877bf215546Sopenharmony_ci * is different for sin vs. cos. 2878bf215546Sopenharmony_ci */ 2879bf215546Sopenharmony_ci LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") : 2880bf215546Sopenharmony_ci emm2_and; 2881bf215546Sopenharmony_ci 2882bf215546Sopenharmony_ci LLVMValueRef sign_bit = cos ? 
LLVMBuildShl(b, LLVMBuildAnd(b, const_4, 2883bf215546Sopenharmony_ci LLVMBuildNot(b, emm2_2, ""), ""), 2884bf215546Sopenharmony_ci const_29, "sign_bit") : 2885bf215546Sopenharmony_ci LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si, 2886bf215546Sopenharmony_ci LLVMBuildShl(b, emm2_add, 2887bf215546Sopenharmony_ci const_29, ""), ""), 2888bf215546Sopenharmony_ci sign_mask, "sign_bit"); 2889bf215546Sopenharmony_ci 2890bf215546Sopenharmony_ci /* 2891bf215546Sopenharmony_ci * get the polynom selection mask 2892bf215546Sopenharmony_ci * there is one polynom for 0 <= x <= Pi/4 2893bf215546Sopenharmony_ci * and another one for Pi/4<x<=Pi/2 2894bf215546Sopenharmony_ci * Both branches will be computed. 2895bf215546Sopenharmony_ci * 2896bf215546Sopenharmony_ci * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2); 2897bf215546Sopenharmony_ci * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); 2898bf215546Sopenharmony_ci */ 2899bf215546Sopenharmony_ci 2900bf215546Sopenharmony_ci LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3"); 2901bf215546Sopenharmony_ci LLVMValueRef poly_mask = lp_build_compare(gallivm, 2902bf215546Sopenharmony_ci int_type, PIPE_FUNC_EQUAL, 2903bf215546Sopenharmony_ci emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0)); 2904bf215546Sopenharmony_ci 2905bf215546Sopenharmony_ci /* 2906bf215546Sopenharmony_ci * _PS_CONST(minus_cephes_DP1, -0.78515625); 2907bf215546Sopenharmony_ci * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); 2908bf215546Sopenharmony_ci * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); 2909bf215546Sopenharmony_ci */ 2910bf215546Sopenharmony_ci LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625); 2911bf215546Sopenharmony_ci LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4); 2912bf215546Sopenharmony_ci LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8); 2913bf215546Sopenharmony_ci 2914bf215546Sopenharmony_ci /* 2915bf215546Sopenharmony_ci * The 
magic pass: "Extended precision modular arithmetic" 2916bf215546Sopenharmony_ci * x = ((x - y * DP1) - y * DP2) - y * DP3; 2917bf215546Sopenharmony_ci */ 2918bf215546Sopenharmony_ci LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs); 2919bf215546Sopenharmony_ci LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1); 2920bf215546Sopenharmony_ci LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2); 2921bf215546Sopenharmony_ci 2922bf215546Sopenharmony_ci /* 2923bf215546Sopenharmony_ci * Evaluate the first polynom (0 <= x <= Pi/4) 2924bf215546Sopenharmony_ci * 2925bf215546Sopenharmony_ci * z = _mm_mul_ps(x,x); 2926bf215546Sopenharmony_ci */ 2927bf215546Sopenharmony_ci LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z"); 2928bf215546Sopenharmony_ci 2929bf215546Sopenharmony_ci /* 2930bf215546Sopenharmony_ci * _PS_CONST(coscof_p0, 2.443315711809948E-005); 2931bf215546Sopenharmony_ci * _PS_CONST(coscof_p1, -1.388731625493765E-003); 2932bf215546Sopenharmony_ci * _PS_CONST(coscof_p2, 4.166664568298827E-002); 2933bf215546Sopenharmony_ci */ 2934bf215546Sopenharmony_ci LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005); 2935bf215546Sopenharmony_ci LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003); 2936bf215546Sopenharmony_ci LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002); 2937bf215546Sopenharmony_ci 2938bf215546Sopenharmony_ci /* 2939bf215546Sopenharmony_ci * y = *(v4sf*)_ps_coscof_p0; 2940bf215546Sopenharmony_ci * y = _mm_mul_ps(y, z); 2941bf215546Sopenharmony_ci */ 2942bf215546Sopenharmony_ci LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1); 2943bf215546Sopenharmony_ci LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2); 2944bf215546Sopenharmony_ci LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7"); 2945bf215546Sopenharmony_ci LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8"); 2946bf215546Sopenharmony_ci 2947bf215546Sopenharmony_ci 
2948bf215546Sopenharmony_ci /* 2949bf215546Sopenharmony_ci * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); 2950bf215546Sopenharmony_ci * y = _mm_sub_ps(y, tmp); 2951bf215546Sopenharmony_ci * y = _mm_add_ps(y, *(v4sf*)_ps_1); 2952bf215546Sopenharmony_ci */ 2953bf215546Sopenharmony_ci LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5); 2954bf215546Sopenharmony_ci LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp"); 2955bf215546Sopenharmony_ci LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8"); 2956bf215546Sopenharmony_ci LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0); 2957bf215546Sopenharmony_ci LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9"); 2958bf215546Sopenharmony_ci 2959bf215546Sopenharmony_ci /* 2960bf215546Sopenharmony_ci * _PS_CONST(sincof_p0, -1.9515295891E-4); 2961bf215546Sopenharmony_ci * _PS_CONST(sincof_p1, 8.3321608736E-3); 2962bf215546Sopenharmony_ci * _PS_CONST(sincof_p2, -1.6666654611E-1); 2963bf215546Sopenharmony_ci */ 2964bf215546Sopenharmony_ci LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4); 2965bf215546Sopenharmony_ci LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3); 2966bf215546Sopenharmony_ci LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1); 2967bf215546Sopenharmony_ci 2968bf215546Sopenharmony_ci /* 2969bf215546Sopenharmony_ci * Evaluate the second polynom (Pi/4 <= x <= 0) 2970bf215546Sopenharmony_ci * 2971bf215546Sopenharmony_ci * y2 = *(v4sf*)_ps_sincof_p0; 2972bf215546Sopenharmony_ci * y2 = _mm_mul_ps(y2, z); 2973bf215546Sopenharmony_ci * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); 2974bf215546Sopenharmony_ci * y2 = _mm_mul_ps(y2, z); 2975bf215546Sopenharmony_ci * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); 2976bf215546Sopenharmony_ci * y2 = _mm_mul_ps(y2, z); 2977bf215546Sopenharmony_ci * y2 = _mm_mul_ps(y2, x); 2978bf215546Sopenharmony_ci * y2 = _mm_add_ps(y2, x); 2979bf215546Sopenharmony_ci */ 
2980bf215546Sopenharmony_ci 2981bf215546Sopenharmony_ci LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1); 2982bf215546Sopenharmony_ci LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2); 2983bf215546Sopenharmony_ci LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7"); 2984bf215546Sopenharmony_ci LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3); 2985bf215546Sopenharmony_ci 2986bf215546Sopenharmony_ci /* 2987bf215546Sopenharmony_ci * select the correct result from the two polynoms 2988bf215546Sopenharmony_ci * xmm3 = poly_mask; 2989bf215546Sopenharmony_ci * y2 = _mm_and_ps(xmm3, y2); //, xmm3); 2990bf215546Sopenharmony_ci * y = _mm_andnot_ps(xmm3, y); 2991bf215546Sopenharmony_ci * y = _mm_or_ps(y,y2); 2992bf215546Sopenharmony_ci */ 2993bf215546Sopenharmony_ci LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i"); 2994bf215546Sopenharmony_ci LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i"); 2995bf215546Sopenharmony_ci LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and"); 2996bf215546Sopenharmony_ci LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv"); 2997bf215546Sopenharmony_ci LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and"); 2998bf215546Sopenharmony_ci LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine"); 2999bf215546Sopenharmony_ci 3000bf215546Sopenharmony_ci /* 3001bf215546Sopenharmony_ci * update the sign 3002bf215546Sopenharmony_ci * y = _mm_xor_ps(y, sign_bit); 3003bf215546Sopenharmony_ci */ 3004bf215546Sopenharmony_ci LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign"); 3005bf215546Sopenharmony_ci LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result"); 3006bf215546Sopenharmony_ci 3007bf215546Sopenharmony_ci LLVMValueRef isfinite = lp_build_isfinite(bld, a); 3008bf215546Sopenharmony_ci 3009bf215546Sopenharmony_ci /* clamp output to be within [-1, 1] */ 3010bf215546Sopenharmony_ci 
y_result = lp_build_clamp(bld, y_result, 3011bf215546Sopenharmony_ci lp_build_const_vec(bld->gallivm, bld->type, -1.f), 3012bf215546Sopenharmony_ci lp_build_const_vec(bld->gallivm, bld->type, 1.f)); 3013bf215546Sopenharmony_ci /* If a is -inf, inf or NaN then return NaN */ 3014bf215546Sopenharmony_ci y_result = lp_build_select(bld, isfinite, y_result, 3015bf215546Sopenharmony_ci lp_build_const_vec(bld->gallivm, bld->type, NAN)); 3016bf215546Sopenharmony_ci return y_result; 3017bf215546Sopenharmony_ci} 3018bf215546Sopenharmony_ci 3019bf215546Sopenharmony_ci 3020bf215546Sopenharmony_ci/** 3021bf215546Sopenharmony_ci * Generate sin(a) 3022bf215546Sopenharmony_ci */ 3023bf215546Sopenharmony_ciLLVMValueRef 3024bf215546Sopenharmony_cilp_build_sin(struct lp_build_context *bld, 3025bf215546Sopenharmony_ci LLVMValueRef a) 3026bf215546Sopenharmony_ci{ 3027bf215546Sopenharmony_ci const struct lp_type type = bld->type; 3028bf215546Sopenharmony_ci 3029bf215546Sopenharmony_ci if (type.width == 16) { 3030bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 3031bf215546Sopenharmony_ci LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 3032bf215546Sopenharmony_ci char intrinsic[32]; 3033bf215546Sopenharmony_ci lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sin", vec_type); 3034bf215546Sopenharmony_ci LLVMValueRef args[] = { a }; 3035bf215546Sopenharmony_ci return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0); 3036bf215546Sopenharmony_ci } 3037bf215546Sopenharmony_ci 3038bf215546Sopenharmony_ci return lp_build_sin_or_cos(bld, a, FALSE); 3039bf215546Sopenharmony_ci} 3040bf215546Sopenharmony_ci 3041bf215546Sopenharmony_ci 3042bf215546Sopenharmony_ci/** 3043bf215546Sopenharmony_ci * Generate cos(a) 3044bf215546Sopenharmony_ci */ 3045bf215546Sopenharmony_ciLLVMValueRef 3046bf215546Sopenharmony_cilp_build_cos(struct lp_build_context *bld, 3047bf215546Sopenharmony_ci LLVMValueRef a) 3048bf215546Sopenharmony_ci{ 
3049bf215546Sopenharmony_ci const struct lp_type type = bld->type; 3050bf215546Sopenharmony_ci 3051bf215546Sopenharmony_ci if (type.width == 16) { 3052bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 3053bf215546Sopenharmony_ci LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 3054bf215546Sopenharmony_ci char intrinsic[32]; 3055bf215546Sopenharmony_ci lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.cos", vec_type); 3056bf215546Sopenharmony_ci LLVMValueRef args[] = { a }; 3057bf215546Sopenharmony_ci return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0); 3058bf215546Sopenharmony_ci } 3059bf215546Sopenharmony_ci 3060bf215546Sopenharmony_ci return lp_build_sin_or_cos(bld, a, TRUE); 3061bf215546Sopenharmony_ci} 3062bf215546Sopenharmony_ci 3063bf215546Sopenharmony_ci 3064bf215546Sopenharmony_ci/** 3065bf215546Sopenharmony_ci * Generate pow(x, y) 3066bf215546Sopenharmony_ci */ 3067bf215546Sopenharmony_ciLLVMValueRef 3068bf215546Sopenharmony_cilp_build_pow(struct lp_build_context *bld, 3069bf215546Sopenharmony_ci LLVMValueRef x, 3070bf215546Sopenharmony_ci LLVMValueRef y) 3071bf215546Sopenharmony_ci{ 3072bf215546Sopenharmony_ci /* TODO: optimize the constant case */ 3073bf215546Sopenharmony_ci if (gallivm_debug & GALLIVM_DEBUG_PERF && 3074bf215546Sopenharmony_ci LLVMIsConstant(x) && LLVMIsConstant(y)) { 3075bf215546Sopenharmony_ci debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3076bf215546Sopenharmony_ci __FUNCTION__); 3077bf215546Sopenharmony_ci } 3078bf215546Sopenharmony_ci 3079bf215546Sopenharmony_ci LLVMValueRef cmp = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x, lp_build_const_vec(bld->gallivm, bld->type, 0.0f)); 3080bf215546Sopenharmony_ci LLVMValueRef res = lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2_safe(bld, x), y)); 3081bf215546Sopenharmony_ci 3082bf215546Sopenharmony_ci res = lp_build_select(bld, cmp, lp_build_const_vec(bld->gallivm, bld->type, 0.0f), res); 3083bf215546Sopenharmony_ci return 
res; 3084bf215546Sopenharmony_ci} 3085bf215546Sopenharmony_ci 3086bf215546Sopenharmony_ci 3087bf215546Sopenharmony_ci/** 3088bf215546Sopenharmony_ci * Generate exp(x) 3089bf215546Sopenharmony_ci */ 3090bf215546Sopenharmony_ciLLVMValueRef 3091bf215546Sopenharmony_cilp_build_exp(struct lp_build_context *bld, 3092bf215546Sopenharmony_ci LLVMValueRef x) 3093bf215546Sopenharmony_ci{ 3094bf215546Sopenharmony_ci /* log2(e) = 1/log(2) */ 3095bf215546Sopenharmony_ci LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type, 3096bf215546Sopenharmony_ci 1.4426950408889634); 3097bf215546Sopenharmony_ci 3098bf215546Sopenharmony_ci assert(lp_check_value(bld->type, x)); 3099bf215546Sopenharmony_ci 3100bf215546Sopenharmony_ci return lp_build_exp2(bld, lp_build_mul(bld, log2e, x)); 3101bf215546Sopenharmony_ci} 3102bf215546Sopenharmony_ci 3103bf215546Sopenharmony_ci 3104bf215546Sopenharmony_ci/** 3105bf215546Sopenharmony_ci * Generate log(x) 3106bf215546Sopenharmony_ci * Behavior is undefined with infs, 0s and nans 3107bf215546Sopenharmony_ci */ 3108bf215546Sopenharmony_ciLLVMValueRef 3109bf215546Sopenharmony_cilp_build_log(struct lp_build_context *bld, 3110bf215546Sopenharmony_ci LLVMValueRef x) 3111bf215546Sopenharmony_ci{ 3112bf215546Sopenharmony_ci /* log(2) */ 3113bf215546Sopenharmony_ci LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type, 3114bf215546Sopenharmony_ci 0.69314718055994529); 3115bf215546Sopenharmony_ci 3116bf215546Sopenharmony_ci assert(lp_check_value(bld->type, x)); 3117bf215546Sopenharmony_ci 3118bf215546Sopenharmony_ci return lp_build_mul(bld, log2, lp_build_log2(bld, x)); 3119bf215546Sopenharmony_ci} 3120bf215546Sopenharmony_ci 3121bf215546Sopenharmony_ci 3122bf215546Sopenharmony_ci/** 3123bf215546Sopenharmony_ci * Generate log(x) that handles edge cases (infs, 0s and nans) 3124bf215546Sopenharmony_ci */ 3125bf215546Sopenharmony_ciLLVMValueRef 3126bf215546Sopenharmony_cilp_build_log_safe(struct lp_build_context *bld, 
3127bf215546Sopenharmony_ci LLVMValueRef x) 3128bf215546Sopenharmony_ci{ 3129bf215546Sopenharmony_ci /* log(2) */ 3130bf215546Sopenharmony_ci LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type, 3131bf215546Sopenharmony_ci 0.69314718055994529); 3132bf215546Sopenharmony_ci 3133bf215546Sopenharmony_ci assert(lp_check_value(bld->type, x)); 3134bf215546Sopenharmony_ci 3135bf215546Sopenharmony_ci return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x)); 3136bf215546Sopenharmony_ci} 3137bf215546Sopenharmony_ci 3138bf215546Sopenharmony_ci 3139bf215546Sopenharmony_ci/** 3140bf215546Sopenharmony_ci * Generate polynomial. 3141bf215546Sopenharmony_ci * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2]. 3142bf215546Sopenharmony_ci */ 3143bf215546Sopenharmony_ciLLVMValueRef 3144bf215546Sopenharmony_cilp_build_polynomial(struct lp_build_context *bld, 3145bf215546Sopenharmony_ci LLVMValueRef x, 3146bf215546Sopenharmony_ci const double *coeffs, 3147bf215546Sopenharmony_ci unsigned num_coeffs) 3148bf215546Sopenharmony_ci{ 3149bf215546Sopenharmony_ci const struct lp_type type = bld->type; 3150bf215546Sopenharmony_ci LLVMValueRef even = NULL, odd = NULL; 3151bf215546Sopenharmony_ci LLVMValueRef x2; 3152bf215546Sopenharmony_ci unsigned i; 3153bf215546Sopenharmony_ci 3154bf215546Sopenharmony_ci assert(lp_check_value(bld->type, x)); 3155bf215546Sopenharmony_ci 3156bf215546Sopenharmony_ci /* TODO: optimize the constant case */ 3157bf215546Sopenharmony_ci if (gallivm_debug & GALLIVM_DEBUG_PERF && 3158bf215546Sopenharmony_ci LLVMIsConstant(x)) { 3159bf215546Sopenharmony_ci debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3160bf215546Sopenharmony_ci __FUNCTION__); 3161bf215546Sopenharmony_ci } 3162bf215546Sopenharmony_ci 3163bf215546Sopenharmony_ci /* 3164bf215546Sopenharmony_ci * Calculate odd and even terms seperately to decrease data dependency 3165bf215546Sopenharmony_ci * Ex: 3166bf215546Sopenharmony_ci * c[0] + x^2 * c[2] + x^4 * c[4] ... 
3167bf215546Sopenharmony_ci * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ... 3168bf215546Sopenharmony_ci */ 3169bf215546Sopenharmony_ci x2 = lp_build_mul(bld, x, x); 3170bf215546Sopenharmony_ci 3171bf215546Sopenharmony_ci for (i = num_coeffs; i--; ) { 3172bf215546Sopenharmony_ci LLVMValueRef coeff; 3173bf215546Sopenharmony_ci 3174bf215546Sopenharmony_ci coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]); 3175bf215546Sopenharmony_ci 3176bf215546Sopenharmony_ci if (i % 2 == 0) { 3177bf215546Sopenharmony_ci if (even) 3178bf215546Sopenharmony_ci even = lp_build_mad(bld, x2, even, coeff); 3179bf215546Sopenharmony_ci else 3180bf215546Sopenharmony_ci even = coeff; 3181bf215546Sopenharmony_ci } else { 3182bf215546Sopenharmony_ci if (odd) 3183bf215546Sopenharmony_ci odd = lp_build_mad(bld, x2, odd, coeff); 3184bf215546Sopenharmony_ci else 3185bf215546Sopenharmony_ci odd = coeff; 3186bf215546Sopenharmony_ci } 3187bf215546Sopenharmony_ci } 3188bf215546Sopenharmony_ci 3189bf215546Sopenharmony_ci if (odd) 3190bf215546Sopenharmony_ci return lp_build_mad(bld, odd, x, even); 3191bf215546Sopenharmony_ci else if (even) 3192bf215546Sopenharmony_ci return even; 3193bf215546Sopenharmony_ci else 3194bf215546Sopenharmony_ci return bld->undef; 3195bf215546Sopenharmony_ci} 3196bf215546Sopenharmony_ci 3197bf215546Sopenharmony_ci 3198bf215546Sopenharmony_ci/** 3199bf215546Sopenharmony_ci * Minimax polynomial fit of 2**x, in range [0, 1[ 3200bf215546Sopenharmony_ci */ 3201bf215546Sopenharmony_cistatic const double lp_build_exp2_polynomial[] = { 3202bf215546Sopenharmony_ci#if EXP_POLY_DEGREE == 5 3203bf215546Sopenharmony_ci 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */ 3204bf215546Sopenharmony_ci 0.693153073200168932794, 3205bf215546Sopenharmony_ci 0.240153617044375388211, 3206bf215546Sopenharmony_ci 0.0558263180532956664775, 3207bf215546Sopenharmony_ci 0.00898934009049466391101, 3208bf215546Sopenharmony_ci 0.00187757667519147912699 
3209bf215546Sopenharmony_ci#elif EXP_POLY_DEGREE == 4 3210bf215546Sopenharmony_ci 1.00000259337069434683, 3211bf215546Sopenharmony_ci 0.693003834469974940458, 3212bf215546Sopenharmony_ci 0.24144275689150793076, 3213bf215546Sopenharmony_ci 0.0520114606103070150235, 3214bf215546Sopenharmony_ci 0.0135341679161270268764 3215bf215546Sopenharmony_ci#elif EXP_POLY_DEGREE == 3 3216bf215546Sopenharmony_ci 0.999925218562710312959, 3217bf215546Sopenharmony_ci 0.695833540494823811697, 3218bf215546Sopenharmony_ci 0.226067155427249155588, 3219bf215546Sopenharmony_ci 0.0780245226406372992967 3220bf215546Sopenharmony_ci#elif EXP_POLY_DEGREE == 2 3221bf215546Sopenharmony_ci 1.00172476321474503578, 3222bf215546Sopenharmony_ci 0.657636275736077639316, 3223bf215546Sopenharmony_ci 0.33718943461968720704 3224bf215546Sopenharmony_ci#else 3225bf215546Sopenharmony_ci#error 3226bf215546Sopenharmony_ci#endif 3227bf215546Sopenharmony_ci}; 3228bf215546Sopenharmony_ci 3229bf215546Sopenharmony_ci 3230bf215546Sopenharmony_ciLLVMValueRef 3231bf215546Sopenharmony_cilp_build_exp2(struct lp_build_context *bld, 3232bf215546Sopenharmony_ci LLVMValueRef x) 3233bf215546Sopenharmony_ci{ 3234bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 3235bf215546Sopenharmony_ci const struct lp_type type = bld->type; 3236bf215546Sopenharmony_ci LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 3237bf215546Sopenharmony_ci LLVMValueRef ipart = NULL; 3238bf215546Sopenharmony_ci LLVMValueRef fpart = NULL; 3239bf215546Sopenharmony_ci LLVMValueRef expipart = NULL; 3240bf215546Sopenharmony_ci LLVMValueRef expfpart = NULL; 3241bf215546Sopenharmony_ci LLVMValueRef res = NULL; 3242bf215546Sopenharmony_ci 3243bf215546Sopenharmony_ci if (type.floating && type.width == 16) { 3244bf215546Sopenharmony_ci char intrinsic[32]; 3245bf215546Sopenharmony_ci lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.exp2", vec_type); 3246bf215546Sopenharmony_ci LLVMValueRef args[] = { x }; 
3247bf215546Sopenharmony_ci return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0); 3248bf215546Sopenharmony_ci } 3249bf215546Sopenharmony_ci 3250bf215546Sopenharmony_ci assert(lp_check_value(bld->type, x)); 3251bf215546Sopenharmony_ci 3252bf215546Sopenharmony_ci /* TODO: optimize the constant case */ 3253bf215546Sopenharmony_ci if (gallivm_debug & GALLIVM_DEBUG_PERF && 3254bf215546Sopenharmony_ci LLVMIsConstant(x)) { 3255bf215546Sopenharmony_ci debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3256bf215546Sopenharmony_ci __FUNCTION__); 3257bf215546Sopenharmony_ci } 3258bf215546Sopenharmony_ci 3259bf215546Sopenharmony_ci assert(type.floating && type.width == 32); 3260bf215546Sopenharmony_ci 3261bf215546Sopenharmony_ci /* We want to preserve NaN and make sure than for exp2 if x > 128, 3262bf215546Sopenharmony_ci * the result is INF and if it's smaller than -126.9 the result is 0 */ 3263bf215546Sopenharmony_ci x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x, 3264bf215546Sopenharmony_ci GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN); 3265bf215546Sopenharmony_ci x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999), 3266bf215546Sopenharmony_ci x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN); 3267bf215546Sopenharmony_ci 3268bf215546Sopenharmony_ci /* ipart = floor(x) */ 3269bf215546Sopenharmony_ci /* fpart = x - ipart */ 3270bf215546Sopenharmony_ci lp_build_ifloor_fract(bld, x, &ipart, &fpart); 3271bf215546Sopenharmony_ci 3272bf215546Sopenharmony_ci /* expipart = (float) (1 << ipart) */ 3273bf215546Sopenharmony_ci expipart = LLVMBuildAdd(builder, ipart, 3274bf215546Sopenharmony_ci lp_build_const_int_vec(bld->gallivm, type, 127), ""); 3275bf215546Sopenharmony_ci expipart = LLVMBuildShl(builder, expipart, 3276bf215546Sopenharmony_ci lp_build_const_int_vec(bld->gallivm, type, 23), ""); 3277bf215546Sopenharmony_ci expipart = LLVMBuildBitCast(builder, expipart, vec_type, ""); 3278bf215546Sopenharmony_ci 
3279bf215546Sopenharmony_ci expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial, 3280bf215546Sopenharmony_ci ARRAY_SIZE(lp_build_exp2_polynomial)); 3281bf215546Sopenharmony_ci 3282bf215546Sopenharmony_ci res = LLVMBuildFMul(builder, expipart, expfpart, ""); 3283bf215546Sopenharmony_ci 3284bf215546Sopenharmony_ci return res; 3285bf215546Sopenharmony_ci} 3286bf215546Sopenharmony_ci 3287bf215546Sopenharmony_ci 3288bf215546Sopenharmony_ci/** 3289bf215546Sopenharmony_ci * Extract the exponent of a IEEE-754 floating point value. 3290bf215546Sopenharmony_ci * 3291bf215546Sopenharmony_ci * Optionally apply an integer bias. 3292bf215546Sopenharmony_ci * 3293bf215546Sopenharmony_ci * Result is an integer value with 3294bf215546Sopenharmony_ci * 3295bf215546Sopenharmony_ci * ifloor(log2(x)) + bias 3296bf215546Sopenharmony_ci */ 3297bf215546Sopenharmony_ciLLVMValueRef 3298bf215546Sopenharmony_cilp_build_extract_exponent(struct lp_build_context *bld, 3299bf215546Sopenharmony_ci LLVMValueRef x, 3300bf215546Sopenharmony_ci int bias) 3301bf215546Sopenharmony_ci{ 3302bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 3303bf215546Sopenharmony_ci const struct lp_type type = bld->type; 3304bf215546Sopenharmony_ci unsigned mantissa = lp_mantissa(type); 3305bf215546Sopenharmony_ci LLVMValueRef res; 3306bf215546Sopenharmony_ci 3307bf215546Sopenharmony_ci assert(type.floating); 3308bf215546Sopenharmony_ci 3309bf215546Sopenharmony_ci assert(lp_check_value(bld->type, x)); 3310bf215546Sopenharmony_ci 3311bf215546Sopenharmony_ci x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 3312bf215546Sopenharmony_ci 3313bf215546Sopenharmony_ci res = LLVMBuildLShr(builder, x, 3314bf215546Sopenharmony_ci lp_build_const_int_vec(bld->gallivm, type, mantissa), ""); 3315bf215546Sopenharmony_ci res = LLVMBuildAnd(builder, res, 3316bf215546Sopenharmony_ci lp_build_const_int_vec(bld->gallivm, type, 255), ""); 3317bf215546Sopenharmony_ci res = LLVMBuildSub(builder, res, 
3318bf215546Sopenharmony_ci lp_build_const_int_vec(bld->gallivm, type, 127 - bias), ""); 3319bf215546Sopenharmony_ci 3320bf215546Sopenharmony_ci return res; 3321bf215546Sopenharmony_ci} 3322bf215546Sopenharmony_ci 3323bf215546Sopenharmony_ci 3324bf215546Sopenharmony_ci/** 3325bf215546Sopenharmony_ci * Extract the mantissa of the a floating. 3326bf215546Sopenharmony_ci * 3327bf215546Sopenharmony_ci * Result is a floating point value with 3328bf215546Sopenharmony_ci * 3329bf215546Sopenharmony_ci * x / floor(log2(x)) 3330bf215546Sopenharmony_ci */ 3331bf215546Sopenharmony_ciLLVMValueRef 3332bf215546Sopenharmony_cilp_build_extract_mantissa(struct lp_build_context *bld, 3333bf215546Sopenharmony_ci LLVMValueRef x) 3334bf215546Sopenharmony_ci{ 3335bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 3336bf215546Sopenharmony_ci const struct lp_type type = bld->type; 3337bf215546Sopenharmony_ci unsigned mantissa = lp_mantissa(type); 3338bf215546Sopenharmony_ci LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 3339bf215546Sopenharmony_ci (1ULL << mantissa) - 1); 3340bf215546Sopenharmony_ci LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type); 3341bf215546Sopenharmony_ci LLVMValueRef res; 3342bf215546Sopenharmony_ci 3343bf215546Sopenharmony_ci assert(lp_check_value(bld->type, x)); 3344bf215546Sopenharmony_ci 3345bf215546Sopenharmony_ci assert(type.floating); 3346bf215546Sopenharmony_ci 3347bf215546Sopenharmony_ci x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 3348bf215546Sopenharmony_ci 3349bf215546Sopenharmony_ci /* res = x / 2**ipart */ 3350bf215546Sopenharmony_ci res = LLVMBuildAnd(builder, x, mantmask, ""); 3351bf215546Sopenharmony_ci res = LLVMBuildOr(builder, res, one, ""); 3352bf215546Sopenharmony_ci res = LLVMBuildBitCast(builder, res, bld->vec_type, ""); 3353bf215546Sopenharmony_ci 3354bf215546Sopenharmony_ci return res; 3355bf215546Sopenharmony_ci} 3356bf215546Sopenharmony_ci 3357bf215546Sopenharmony_ci 
3358bf215546Sopenharmony_ci 3359bf215546Sopenharmony_ci/** 3360bf215546Sopenharmony_ci * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[ 3361bf215546Sopenharmony_ci * These coefficients can be generate with 3362bf215546Sopenharmony_ci * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html 3363bf215546Sopenharmony_ci */ 3364bf215546Sopenharmony_cistatic const double lp_build_log2_polynomial[] = { 3365bf215546Sopenharmony_ci#if LOG_POLY_DEGREE == 5 3366bf215546Sopenharmony_ci 2.88539008148777786488L, 3367bf215546Sopenharmony_ci 0.961796878841293367824L, 3368bf215546Sopenharmony_ci 0.577058946784739859012L, 3369bf215546Sopenharmony_ci 0.412914355135828735411L, 3370bf215546Sopenharmony_ci 0.308591899232910175289L, 3371bf215546Sopenharmony_ci 0.352376952300281371868L, 3372bf215546Sopenharmony_ci#elif LOG_POLY_DEGREE == 4 3373bf215546Sopenharmony_ci 2.88539009343309178325L, 3374bf215546Sopenharmony_ci 0.961791550404184197881L, 3375bf215546Sopenharmony_ci 0.577440339438736392009L, 3376bf215546Sopenharmony_ci 0.403343858251329912514L, 3377bf215546Sopenharmony_ci 0.406718052498846252698L, 3378bf215546Sopenharmony_ci#elif LOG_POLY_DEGREE == 3 3379bf215546Sopenharmony_ci 2.88538959748872753838L, 3380bf215546Sopenharmony_ci 0.961932915889597772928L, 3381bf215546Sopenharmony_ci 0.571118517972136195241L, 3382bf215546Sopenharmony_ci 0.493997535084709500285L, 3383bf215546Sopenharmony_ci#else 3384bf215546Sopenharmony_ci#error 3385bf215546Sopenharmony_ci#endif 3386bf215546Sopenharmony_ci}; 3387bf215546Sopenharmony_ci 3388bf215546Sopenharmony_ci 3389bf215546Sopenharmony_ci/** 3390bf215546Sopenharmony_ci * See http://www.devmaster.net/forums/showthread.php?p=43580 3391bf215546Sopenharmony_ci * http://en.wikipedia.org/wiki/Logarithm#Calculation 3392bf215546Sopenharmony_ci * http://www.nezumi.demon.co.uk/consult/logx.htm 3393bf215546Sopenharmony_ci * 
3394bf215546Sopenharmony_ci * If handle_edge_cases is true the function will perform computations 3395bf215546Sopenharmony_ci * to match the required D3D10+ behavior for each of the edge cases. 3396bf215546Sopenharmony_ci * That means that if input is: 3397bf215546Sopenharmony_ci * - less than zero (to and including -inf) then NaN will be returned 3398bf215546Sopenharmony_ci * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned 3399bf215546Sopenharmony_ci * - +infinity, then +infinity will be returned 3400bf215546Sopenharmony_ci * - NaN, then NaN will be returned 3401bf215546Sopenharmony_ci * 3402bf215546Sopenharmony_ci * Those checks are fairly expensive so if you don't need them make sure 3403bf215546Sopenharmony_ci * handle_edge_cases is false. 3404bf215546Sopenharmony_ci */ 3405bf215546Sopenharmony_civoid 3406bf215546Sopenharmony_cilp_build_log2_approx(struct lp_build_context *bld, 3407bf215546Sopenharmony_ci LLVMValueRef x, 3408bf215546Sopenharmony_ci LLVMValueRef *p_exp, 3409bf215546Sopenharmony_ci LLVMValueRef *p_floor_log2, 3410bf215546Sopenharmony_ci LLVMValueRef *p_log2, 3411bf215546Sopenharmony_ci boolean handle_edge_cases) 3412bf215546Sopenharmony_ci{ 3413bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 3414bf215546Sopenharmony_ci const struct lp_type type = bld->type; 3415bf215546Sopenharmony_ci LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 3416bf215546Sopenharmony_ci LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 3417bf215546Sopenharmony_ci 3418bf215546Sopenharmony_ci LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000); 3419bf215546Sopenharmony_ci LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff); 3420bf215546Sopenharmony_ci LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type); 3421bf215546Sopenharmony_ci 3422bf215546Sopenharmony_ci LLVMValueRef i = NULL; 3423bf215546Sopenharmony_ci LLVMValueRef y = NULL; 
3424bf215546Sopenharmony_ci LLVMValueRef z = NULL; 3425bf215546Sopenharmony_ci LLVMValueRef exp = NULL; 3426bf215546Sopenharmony_ci LLVMValueRef mant = NULL; 3427bf215546Sopenharmony_ci LLVMValueRef logexp = NULL; 3428bf215546Sopenharmony_ci LLVMValueRef p_z = NULL; 3429bf215546Sopenharmony_ci LLVMValueRef res = NULL; 3430bf215546Sopenharmony_ci 3431bf215546Sopenharmony_ci if (bld->type.width == 16) { 3432bf215546Sopenharmony_ci char intrinsic[32]; 3433bf215546Sopenharmony_ci lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.log2", bld->vec_type); 3434bf215546Sopenharmony_ci LLVMValueRef args[] = { x }; 3435bf215546Sopenharmony_ci if (p_log2) 3436bf215546Sopenharmony_ci *p_log2 = lp_build_intrinsic(builder, intrinsic, bld->vec_type, args, 1, 0); 3437bf215546Sopenharmony_ci return; 3438bf215546Sopenharmony_ci } 3439bf215546Sopenharmony_ci 3440bf215546Sopenharmony_ci assert(lp_check_value(bld->type, x)); 3441bf215546Sopenharmony_ci 3442bf215546Sopenharmony_ci if (p_exp || p_floor_log2 || p_log2) { 3443bf215546Sopenharmony_ci /* TODO: optimize the constant case */ 3444bf215546Sopenharmony_ci if (gallivm_debug & GALLIVM_DEBUG_PERF && 3445bf215546Sopenharmony_ci LLVMIsConstant(x)) { 3446bf215546Sopenharmony_ci debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3447bf215546Sopenharmony_ci __FUNCTION__); 3448bf215546Sopenharmony_ci } 3449bf215546Sopenharmony_ci 3450bf215546Sopenharmony_ci assert(type.floating && type.width == 32); 3451bf215546Sopenharmony_ci 3452bf215546Sopenharmony_ci /* 3453bf215546Sopenharmony_ci * We don't explicitly handle denormalized numbers. They will yield a 3454bf215546Sopenharmony_ci * result in the neighbourhood of -127, which appears to be adequate 3455bf215546Sopenharmony_ci * enough. 
3456bf215546Sopenharmony_ci */ 3457bf215546Sopenharmony_ci 3458bf215546Sopenharmony_ci i = LLVMBuildBitCast(builder, x, int_vec_type, ""); 3459bf215546Sopenharmony_ci 3460bf215546Sopenharmony_ci /* exp = (float) exponent(x) */ 3461bf215546Sopenharmony_ci exp = LLVMBuildAnd(builder, i, expmask, ""); 3462bf215546Sopenharmony_ci } 3463bf215546Sopenharmony_ci 3464bf215546Sopenharmony_ci if (p_floor_log2 || p_log2) { 3465bf215546Sopenharmony_ci logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), ""); 3466bf215546Sopenharmony_ci logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), ""); 3467bf215546Sopenharmony_ci logexp = LLVMBuildSIToFP(builder, logexp, vec_type, ""); 3468bf215546Sopenharmony_ci } 3469bf215546Sopenharmony_ci 3470bf215546Sopenharmony_ci if (p_log2) { 3471bf215546Sopenharmony_ci /* mant = 1 + (float) mantissa(x) */ 3472bf215546Sopenharmony_ci mant = LLVMBuildAnd(builder, i, mantmask, ""); 3473bf215546Sopenharmony_ci mant = LLVMBuildOr(builder, mant, one, ""); 3474bf215546Sopenharmony_ci mant = LLVMBuildBitCast(builder, mant, vec_type, ""); 3475bf215546Sopenharmony_ci 3476bf215546Sopenharmony_ci /* y = (mant - 1) / (mant + 1) */ 3477bf215546Sopenharmony_ci y = lp_build_div(bld, 3478bf215546Sopenharmony_ci lp_build_sub(bld, mant, bld->one), 3479bf215546Sopenharmony_ci lp_build_add(bld, mant, bld->one)); 3480bf215546Sopenharmony_ci 3481bf215546Sopenharmony_ci /* z = y^2 */ 3482bf215546Sopenharmony_ci z = lp_build_mul(bld, y, y); 3483bf215546Sopenharmony_ci 3484bf215546Sopenharmony_ci /* compute P(z) */ 3485bf215546Sopenharmony_ci p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial, 3486bf215546Sopenharmony_ci ARRAY_SIZE(lp_build_log2_polynomial)); 3487bf215546Sopenharmony_ci 3488bf215546Sopenharmony_ci /* y * P(z) + logexp */ 3489bf215546Sopenharmony_ci res = lp_build_mad(bld, y, p_z, logexp); 3490bf215546Sopenharmony_ci 3491bf215546Sopenharmony_ci if (type.floating && 
handle_edge_cases) { 3492bf215546Sopenharmony_ci LLVMValueRef negmask, infmask, zmask; 3493bf215546Sopenharmony_ci negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x, 3494bf215546Sopenharmony_ci lp_build_const_vec(bld->gallivm, type, 0.0f)); 3495bf215546Sopenharmony_ci zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x, 3496bf215546Sopenharmony_ci lp_build_const_vec(bld->gallivm, type, 0.0f)); 3497bf215546Sopenharmony_ci infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x, 3498bf215546Sopenharmony_ci lp_build_const_vec(bld->gallivm, type, INFINITY)); 3499bf215546Sopenharmony_ci 3500bf215546Sopenharmony_ci /* If x is qual to inf make sure we return inf */ 3501bf215546Sopenharmony_ci res = lp_build_select(bld, infmask, 3502bf215546Sopenharmony_ci lp_build_const_vec(bld->gallivm, type, INFINITY), 3503bf215546Sopenharmony_ci res); 3504bf215546Sopenharmony_ci /* If x is qual to 0, return -inf */ 3505bf215546Sopenharmony_ci res = lp_build_select(bld, zmask, 3506bf215546Sopenharmony_ci lp_build_const_vec(bld->gallivm, type, -INFINITY), 3507bf215546Sopenharmony_ci res); 3508bf215546Sopenharmony_ci /* If x is nan or less than 0, return nan */ 3509bf215546Sopenharmony_ci res = lp_build_select(bld, negmask, 3510bf215546Sopenharmony_ci lp_build_const_vec(bld->gallivm, type, NAN), 3511bf215546Sopenharmony_ci res); 3512bf215546Sopenharmony_ci } 3513bf215546Sopenharmony_ci } 3514bf215546Sopenharmony_ci 3515bf215546Sopenharmony_ci if (p_exp) { 3516bf215546Sopenharmony_ci exp = LLVMBuildBitCast(builder, exp, vec_type, ""); 3517bf215546Sopenharmony_ci *p_exp = exp; 3518bf215546Sopenharmony_ci } 3519bf215546Sopenharmony_ci 3520bf215546Sopenharmony_ci if (p_floor_log2) 3521bf215546Sopenharmony_ci *p_floor_log2 = logexp; 3522bf215546Sopenharmony_ci 3523bf215546Sopenharmony_ci if (p_log2) 3524bf215546Sopenharmony_ci *p_log2 = res; 3525bf215546Sopenharmony_ci} 3526bf215546Sopenharmony_ci 3527bf215546Sopenharmony_ci 3528bf215546Sopenharmony_ci/* 3529bf215546Sopenharmony_ci * log2 implementation which 
doesn't have special code to 3530bf215546Sopenharmony_ci * handle edge cases (-inf, 0, inf, NaN). It's faster but 3531bf215546Sopenharmony_ci * the results for those cases are undefined. 3532bf215546Sopenharmony_ci */ 3533bf215546Sopenharmony_ciLLVMValueRef 3534bf215546Sopenharmony_cilp_build_log2(struct lp_build_context *bld, 3535bf215546Sopenharmony_ci LLVMValueRef x) 3536bf215546Sopenharmony_ci{ 3537bf215546Sopenharmony_ci LLVMValueRef res; 3538bf215546Sopenharmony_ci lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE); 3539bf215546Sopenharmony_ci return res; 3540bf215546Sopenharmony_ci} 3541bf215546Sopenharmony_ci 3542bf215546Sopenharmony_ci 3543bf215546Sopenharmony_ci/* 3544bf215546Sopenharmony_ci * Version of log2 which handles all edge cases. 3545bf215546Sopenharmony_ci * Look at documentation of lp_build_log2_approx for 3546bf215546Sopenharmony_ci * description of the behavior for each of the edge cases. 3547bf215546Sopenharmony_ci */ 3548bf215546Sopenharmony_ciLLVMValueRef 3549bf215546Sopenharmony_cilp_build_log2_safe(struct lp_build_context *bld, 3550bf215546Sopenharmony_ci LLVMValueRef x) 3551bf215546Sopenharmony_ci{ 3552bf215546Sopenharmony_ci LLVMValueRef res; 3553bf215546Sopenharmony_ci lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE); 3554bf215546Sopenharmony_ci return res; 3555bf215546Sopenharmony_ci} 3556bf215546Sopenharmony_ci 3557bf215546Sopenharmony_ci 3558bf215546Sopenharmony_ci/** 3559bf215546Sopenharmony_ci * Faster (and less accurate) log2. 3560bf215546Sopenharmony_ci * 3561bf215546Sopenharmony_ci * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x)) 3562bf215546Sopenharmony_ci * 3563bf215546Sopenharmony_ci * Piece-wise linear approximation, with exact results when x is a 3564bf215546Sopenharmony_ci * power of two. 
3565bf215546Sopenharmony_ci * 3566bf215546Sopenharmony_ci * See http://www.flipcode.com/archives/Fast_log_Function.shtml 3567bf215546Sopenharmony_ci */ 3568bf215546Sopenharmony_ciLLVMValueRef 3569bf215546Sopenharmony_cilp_build_fast_log2(struct lp_build_context *bld, 3570bf215546Sopenharmony_ci LLVMValueRef x) 3571bf215546Sopenharmony_ci{ 3572bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 3573bf215546Sopenharmony_ci LLVMValueRef ipart; 3574bf215546Sopenharmony_ci LLVMValueRef fpart; 3575bf215546Sopenharmony_ci 3576bf215546Sopenharmony_ci assert(lp_check_value(bld->type, x)); 3577bf215546Sopenharmony_ci 3578bf215546Sopenharmony_ci assert(bld->type.floating); 3579bf215546Sopenharmony_ci 3580bf215546Sopenharmony_ci /* ipart = floor(log2(x)) - 1 */ 3581bf215546Sopenharmony_ci ipart = lp_build_extract_exponent(bld, x, -1); 3582bf215546Sopenharmony_ci ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, ""); 3583bf215546Sopenharmony_ci 3584bf215546Sopenharmony_ci /* fpart = x / 2**ipart */ 3585bf215546Sopenharmony_ci fpart = lp_build_extract_mantissa(bld, x); 3586bf215546Sopenharmony_ci 3587bf215546Sopenharmony_ci /* ipart + fpart */ 3588bf215546Sopenharmony_ci return LLVMBuildFAdd(builder, ipart, fpart, ""); 3589bf215546Sopenharmony_ci} 3590bf215546Sopenharmony_ci 3591bf215546Sopenharmony_ci 3592bf215546Sopenharmony_ci/** 3593bf215546Sopenharmony_ci * Fast implementation of iround(log2(x)). 3594bf215546Sopenharmony_ci * 3595bf215546Sopenharmony_ci * Not an approximation -- it should give accurate results all the time. 
3596bf215546Sopenharmony_ci */ 3597bf215546Sopenharmony_ciLLVMValueRef 3598bf215546Sopenharmony_cilp_build_ilog2(struct lp_build_context *bld, 3599bf215546Sopenharmony_ci LLVMValueRef x) 3600bf215546Sopenharmony_ci{ 3601bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 3602bf215546Sopenharmony_ci LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2); 3603bf215546Sopenharmony_ci LLVMValueRef ipart; 3604bf215546Sopenharmony_ci 3605bf215546Sopenharmony_ci assert(bld->type.floating); 3606bf215546Sopenharmony_ci 3607bf215546Sopenharmony_ci assert(lp_check_value(bld->type, x)); 3608bf215546Sopenharmony_ci 3609bf215546Sopenharmony_ci /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */ 3610bf215546Sopenharmony_ci x = LLVMBuildFMul(builder, x, sqrt2, ""); 3611bf215546Sopenharmony_ci 3612bf215546Sopenharmony_ci /* ipart = floor(log2(x) + 0.5) */ 3613bf215546Sopenharmony_ci ipart = lp_build_extract_exponent(bld, x, 0); 3614bf215546Sopenharmony_ci 3615bf215546Sopenharmony_ci return ipart; 3616bf215546Sopenharmony_ci} 3617bf215546Sopenharmony_ci 3618bf215546Sopenharmony_ciLLVMValueRef 3619bf215546Sopenharmony_cilp_build_mod(struct lp_build_context *bld, 3620bf215546Sopenharmony_ci LLVMValueRef x, 3621bf215546Sopenharmony_ci LLVMValueRef y) 3622bf215546Sopenharmony_ci{ 3623bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 3624bf215546Sopenharmony_ci LLVMValueRef res; 3625bf215546Sopenharmony_ci const struct lp_type type = bld->type; 3626bf215546Sopenharmony_ci 3627bf215546Sopenharmony_ci assert(lp_check_value(type, x)); 3628bf215546Sopenharmony_ci assert(lp_check_value(type, y)); 3629bf215546Sopenharmony_ci 3630bf215546Sopenharmony_ci if (type.floating) 3631bf215546Sopenharmony_ci res = LLVMBuildFRem(builder, x, y, ""); 3632bf215546Sopenharmony_ci else if (type.sign) 3633bf215546Sopenharmony_ci res = LLVMBuildSRem(builder, x, y, ""); 3634bf215546Sopenharmony_ci else 3635bf215546Sopenharmony_ci res = 
LLVMBuildURem(builder, x, y, ""); 3636bf215546Sopenharmony_ci return res; 3637bf215546Sopenharmony_ci} 3638bf215546Sopenharmony_ci 3639bf215546Sopenharmony_ci 3640bf215546Sopenharmony_ci/* 3641bf215546Sopenharmony_ci * For floating inputs it creates and returns a mask 3642bf215546Sopenharmony_ci * which is all 1's for channels which are NaN. 3643bf215546Sopenharmony_ci * Channels inside x which are not NaN will be 0. 3644bf215546Sopenharmony_ci */ 3645bf215546Sopenharmony_ciLLVMValueRef 3646bf215546Sopenharmony_cilp_build_isnan(struct lp_build_context *bld, 3647bf215546Sopenharmony_ci LLVMValueRef x) 3648bf215546Sopenharmony_ci{ 3649bf215546Sopenharmony_ci LLVMValueRef mask; 3650bf215546Sopenharmony_ci LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type); 3651bf215546Sopenharmony_ci 3652bf215546Sopenharmony_ci assert(bld->type.floating); 3653bf215546Sopenharmony_ci assert(lp_check_value(bld->type, x)); 3654bf215546Sopenharmony_ci 3655bf215546Sopenharmony_ci mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x, 3656bf215546Sopenharmony_ci "isnotnan"); 3657bf215546Sopenharmony_ci mask = LLVMBuildNot(bld->gallivm->builder, mask, ""); 3658bf215546Sopenharmony_ci mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan"); 3659bf215546Sopenharmony_ci return mask; 3660bf215546Sopenharmony_ci} 3661bf215546Sopenharmony_ci 3662bf215546Sopenharmony_ci 3663bf215546Sopenharmony_ci/* Returns all 1's for floating point numbers that are 3664bf215546Sopenharmony_ci * finite numbers and returns all zeros for -inf, 3665bf215546Sopenharmony_ci * inf and nan's */ 3666bf215546Sopenharmony_ciLLVMValueRef 3667bf215546Sopenharmony_cilp_build_isfinite(struct lp_build_context *bld, 3668bf215546Sopenharmony_ci LLVMValueRef x) 3669bf215546Sopenharmony_ci{ 3670bf215546Sopenharmony_ci LLVMBuilderRef builder = bld->gallivm->builder; 3671bf215546Sopenharmony_ci LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type); 
3672bf215546Sopenharmony_ci struct lp_type int_type = lp_int_type(bld->type); 3673bf215546Sopenharmony_ci LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, ""); 3674bf215546Sopenharmony_ci LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type, 3675bf215546Sopenharmony_ci 0x7f800000); 3676bf215546Sopenharmony_ci 3677bf215546Sopenharmony_ci if (!bld->type.floating) { 3678bf215546Sopenharmony_ci return lp_build_const_int_vec(bld->gallivm, bld->type, 0); 3679bf215546Sopenharmony_ci } 3680bf215546Sopenharmony_ci assert(bld->type.floating); 3681bf215546Sopenharmony_ci assert(lp_check_value(bld->type, x)); 3682bf215546Sopenharmony_ci assert(bld->type.width == 32); 3683bf215546Sopenharmony_ci 3684bf215546Sopenharmony_ci intx = LLVMBuildAnd(builder, intx, infornan32, ""); 3685bf215546Sopenharmony_ci return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL, 3686bf215546Sopenharmony_ci intx, infornan32); 3687bf215546Sopenharmony_ci} 3688bf215546Sopenharmony_ci 3689bf215546Sopenharmony_ci 3690bf215546Sopenharmony_ci/* 3691bf215546Sopenharmony_ci * Returns true if the number is nan or inf and false otherwise. 3692bf215546Sopenharmony_ci * The input has to be a floating point vector. 
3693bf215546Sopenharmony_ci */ 3694bf215546Sopenharmony_ciLLVMValueRef 3695bf215546Sopenharmony_cilp_build_is_inf_or_nan(struct gallivm_state *gallivm, 3696bf215546Sopenharmony_ci const struct lp_type type, 3697bf215546Sopenharmony_ci LLVMValueRef x) 3698bf215546Sopenharmony_ci{ 3699bf215546Sopenharmony_ci LLVMBuilderRef builder = gallivm->builder; 3700bf215546Sopenharmony_ci struct lp_type int_type = lp_int_type(type); 3701bf215546Sopenharmony_ci LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type, 3702bf215546Sopenharmony_ci 0x7f800000); 3703bf215546Sopenharmony_ci LLVMValueRef ret; 3704bf215546Sopenharmony_ci 3705bf215546Sopenharmony_ci assert(type.floating); 3706bf215546Sopenharmony_ci 3707bf215546Sopenharmony_ci ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), ""); 3708bf215546Sopenharmony_ci ret = LLVMBuildAnd(builder, ret, const0, ""); 3709bf215546Sopenharmony_ci ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL, 3710bf215546Sopenharmony_ci ret, const0); 3711bf215546Sopenharmony_ci 3712bf215546Sopenharmony_ci return ret; 3713bf215546Sopenharmony_ci} 3714bf215546Sopenharmony_ci 3715bf215546Sopenharmony_ci 3716bf215546Sopenharmony_ciLLVMValueRef 3717bf215546Sopenharmony_cilp_build_fpstate_get(struct gallivm_state *gallivm) 3718bf215546Sopenharmony_ci{ 3719bf215546Sopenharmony_ci if (util_get_cpu_caps()->has_sse) { 3720bf215546Sopenharmony_ci LLVMBuilderRef builder = gallivm->builder; 3721bf215546Sopenharmony_ci LLVMValueRef mxcsr_ptr = lp_build_alloca( 3722bf215546Sopenharmony_ci gallivm, 3723bf215546Sopenharmony_ci LLVMInt32TypeInContext(gallivm->context), 3724bf215546Sopenharmony_ci "mxcsr_ptr"); 3725bf215546Sopenharmony_ci LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr, 3726bf215546Sopenharmony_ci LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), ""); 3727bf215546Sopenharmony_ci lp_build_intrinsic(builder, 3728bf215546Sopenharmony_ci "llvm.x86.sse.stmxcsr", 3729bf215546Sopenharmony_ci 
LLVMVoidTypeInContext(gallivm->context), 3730bf215546Sopenharmony_ci &mxcsr_ptr8, 1, 0); 3731bf215546Sopenharmony_ci return mxcsr_ptr; 3732bf215546Sopenharmony_ci } 3733bf215546Sopenharmony_ci return 0; 3734bf215546Sopenharmony_ci} 3735bf215546Sopenharmony_ci 3736bf215546Sopenharmony_civoid 3737bf215546Sopenharmony_cilp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm, 3738bf215546Sopenharmony_ci boolean zero) 3739bf215546Sopenharmony_ci{ 3740bf215546Sopenharmony_ci if (util_get_cpu_caps()->has_sse) { 3741bf215546Sopenharmony_ci /* turn on DAZ (64) | FTZ (32768) = 32832 if available */ 3742bf215546Sopenharmony_ci int daz_ftz = _MM_FLUSH_ZERO_MASK; 3743bf215546Sopenharmony_ci 3744bf215546Sopenharmony_ci LLVMBuilderRef builder = gallivm->builder; 3745bf215546Sopenharmony_ci LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm); 3746bf215546Sopenharmony_ci LLVMValueRef mxcsr = 3747bf215546Sopenharmony_ci LLVMBuildLoad2(builder, LLVMInt32TypeInContext(gallivm->context), mxcsr_ptr, "mxcsr"); 3748bf215546Sopenharmony_ci 3749bf215546Sopenharmony_ci if (util_get_cpu_caps()->has_daz) { 3750bf215546Sopenharmony_ci /* Enable denormals are zero mode */ 3751bf215546Sopenharmony_ci daz_ftz |= _MM_DENORMALS_ZERO_MASK; 3752bf215546Sopenharmony_ci } 3753bf215546Sopenharmony_ci if (zero) { 3754bf215546Sopenharmony_ci mxcsr = LLVMBuildOr(builder, mxcsr, 3755bf215546Sopenharmony_ci LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), ""); 3756bf215546Sopenharmony_ci } else { 3757bf215546Sopenharmony_ci mxcsr = LLVMBuildAnd(builder, mxcsr, 3758bf215546Sopenharmony_ci LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), ""); 3759bf215546Sopenharmony_ci } 3760bf215546Sopenharmony_ci 3761bf215546Sopenharmony_ci LLVMBuildStore(builder, mxcsr, mxcsr_ptr); 3762bf215546Sopenharmony_ci lp_build_fpstate_set(gallivm, mxcsr_ptr); 3763bf215546Sopenharmony_ci } 3764bf215546Sopenharmony_ci} 3765bf215546Sopenharmony_ci 3766bf215546Sopenharmony_ci 3767bf215546Sopenharmony_civoid 
3768bf215546Sopenharmony_cilp_build_fpstate_set(struct gallivm_state *gallivm, 3769bf215546Sopenharmony_ci LLVMValueRef mxcsr_ptr) 3770bf215546Sopenharmony_ci{ 3771bf215546Sopenharmony_ci if (util_get_cpu_caps()->has_sse) { 3772bf215546Sopenharmony_ci LLVMBuilderRef builder = gallivm->builder; 3773bf215546Sopenharmony_ci mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr, 3774bf215546Sopenharmony_ci LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), ""); 3775bf215546Sopenharmony_ci lp_build_intrinsic(builder, 3776bf215546Sopenharmony_ci "llvm.x86.sse.ldmxcsr", 3777bf215546Sopenharmony_ci LLVMVoidTypeInContext(gallivm->context), 3778bf215546Sopenharmony_ci &mxcsr_ptr, 1, 0); 3779bf215546Sopenharmony_ci } 3780bf215546Sopenharmony_ci} 3781