1bf215546Sopenharmony_ci/************************************************************************** 2bf215546Sopenharmony_ci * 3bf215546Sopenharmony_ci * Copyright 2008 VMware, Inc. 4bf215546Sopenharmony_ci * All Rights Reserved. 5bf215546Sopenharmony_ci * 6bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 7bf215546Sopenharmony_ci * copy of this software and associated documentation files (the 8bf215546Sopenharmony_ci * "Software"), to deal in the Software without restriction, including 9bf215546Sopenharmony_ci * without limitation the rights to use, copy, modify, merge, publish, 10bf215546Sopenharmony_ci * distribute, sub license, and/or sell copies of the Software, and to 11bf215546Sopenharmony_ci * permit persons to whom the Software is furnished to do so, subject to 12bf215546Sopenharmony_ci * the following conditions: 13bf215546Sopenharmony_ci * 14bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the 15bf215546Sopenharmony_ci * next paragraph) shall be included in all copies or substantial portions 16bf215546Sopenharmony_ci * of the Software. 17bf215546Sopenharmony_ci * 18bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19bf215546Sopenharmony_ci * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20bf215546Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21bf215546Sopenharmony_ci * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22bf215546Sopenharmony_ci * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23bf215546Sopenharmony_ci * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24bf215546Sopenharmony_ci * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25bf215546Sopenharmony_ci * 26bf215546Sopenharmony_ci **************************************************************************/ 27bf215546Sopenharmony_ci 28bf215546Sopenharmony_ci 29bf215546Sopenharmony_ci 30bf215546Sopenharmony_ci#include "pipe/p_config.h" 31bf215546Sopenharmony_ci#include "util/u_math.h" 32bf215546Sopenharmony_ci#include "util/u_cpu_detect.h" 33bf215546Sopenharmony_ci 34bf215546Sopenharmony_ci#if defined(PIPE_ARCH_SSE) 35bf215546Sopenharmony_ci#include <xmmintrin.h> 36bf215546Sopenharmony_ci/* This is defined in pmmintrin.h, but it can only be included when -msse3 is 37bf215546Sopenharmony_ci * used, so just define it here to avoid further. */ 38bf215546Sopenharmony_ci#ifndef _MM_DENORMALS_ZERO_MASK 39bf215546Sopenharmony_ci#define _MM_DENORMALS_ZERO_MASK 0x0040 40bf215546Sopenharmony_ci#endif 41bf215546Sopenharmony_ci#endif 42bf215546Sopenharmony_ci 43bf215546Sopenharmony_ci 44bf215546Sopenharmony_ci/** log2(x), for x in [1.0, 2.0) */ 45bf215546Sopenharmony_cifloat log2_table[LOG2_TABLE_SIZE]; 46bf215546Sopenharmony_ci 47bf215546Sopenharmony_ci 48bf215546Sopenharmony_cistatic void 49bf215546Sopenharmony_ciinit_log2_table(void) 50bf215546Sopenharmony_ci{ 51bf215546Sopenharmony_ci unsigned i; 52bf215546Sopenharmony_ci for (i = 0; i < LOG2_TABLE_SIZE; i++) 53bf215546Sopenharmony_ci log2_table[i] = (float) log2(1.0 + i * (1.0 / LOG2_TABLE_SCALE)); 54bf215546Sopenharmony_ci} 55bf215546Sopenharmony_ci 56bf215546Sopenharmony_ci 57bf215546Sopenharmony_ci/** 58bf215546Sopenharmony_ci * One time init for math utilities. 59bf215546Sopenharmony_ci */ 60bf215546Sopenharmony_civoid 61bf215546Sopenharmony_ciutil_init_math(void) 62bf215546Sopenharmony_ci{ 63bf215546Sopenharmony_ci static bool initialized = false; 64bf215546Sopenharmony_ci if (!initialized) { 65bf215546Sopenharmony_ci init_log2_table(); 66bf215546Sopenharmony_ci initialized = true; 67bf215546Sopenharmony_ci } 68bf215546Sopenharmony_ci} 69bf215546Sopenharmony_ci 70bf215546Sopenharmony_ci/** 71bf215546Sopenharmony_ci * Fetches the contents of the fpstate (mxcsr on x86) register. 72bf215546Sopenharmony_ci * 73bf215546Sopenharmony_ci * On platforms without support for it just returns 0. 74bf215546Sopenharmony_ci */ 75bf215546Sopenharmony_ciunsigned 76bf215546Sopenharmony_ciutil_fpstate_get(void) 77bf215546Sopenharmony_ci{ 78bf215546Sopenharmony_ci unsigned mxcsr = 0; 79bf215546Sopenharmony_ci 80bf215546Sopenharmony_ci#if defined(PIPE_ARCH_SSE) 81bf215546Sopenharmony_ci if (util_get_cpu_caps()->has_sse) { 82bf215546Sopenharmony_ci mxcsr = _mm_getcsr(); 83bf215546Sopenharmony_ci } 84bf215546Sopenharmony_ci#endif 85bf215546Sopenharmony_ci 86bf215546Sopenharmony_ci return mxcsr; 87bf215546Sopenharmony_ci} 88bf215546Sopenharmony_ci 89bf215546Sopenharmony_ci/** 90bf215546Sopenharmony_ci * Make sure that the fp treats the denormalized floating 91bf215546Sopenharmony_ci * point numbers as zero. 92bf215546Sopenharmony_ci * 93bf215546Sopenharmony_ci * This is the behavior required by D3D10. OpenGL doesn't care. 94bf215546Sopenharmony_ci */ 95bf215546Sopenharmony_ciunsigned 96bf215546Sopenharmony_ciutil_fpstate_set_denorms_to_zero(unsigned current_mxcsr) 97bf215546Sopenharmony_ci{ 98bf215546Sopenharmony_ci#if defined(PIPE_ARCH_SSE) 99bf215546Sopenharmony_ci if (util_get_cpu_caps()->has_sse) { 100bf215546Sopenharmony_ci /* Enable flush to zero mode */ 101bf215546Sopenharmony_ci current_mxcsr |= _MM_FLUSH_ZERO_MASK; 102bf215546Sopenharmony_ci if (util_get_cpu_caps()->has_daz) { 103bf215546Sopenharmony_ci /* Enable denormals are zero mode */ 104bf215546Sopenharmony_ci current_mxcsr |= _MM_DENORMALS_ZERO_MASK; 105bf215546Sopenharmony_ci } 106bf215546Sopenharmony_ci util_fpstate_set(current_mxcsr); 107bf215546Sopenharmony_ci } 108bf215546Sopenharmony_ci#endif 109bf215546Sopenharmony_ci return current_mxcsr; 110bf215546Sopenharmony_ci} 111bf215546Sopenharmony_ci 112bf215546Sopenharmony_ci/** 113bf215546Sopenharmony_ci * Set the state of the fpstate (mxcsr on x86) register. 114bf215546Sopenharmony_ci * 115bf215546Sopenharmony_ci * On platforms without support for it's a noop. 116bf215546Sopenharmony_ci */ 117bf215546Sopenharmony_civoid 118bf215546Sopenharmony_ciutil_fpstate_set(unsigned mxcsr) 119bf215546Sopenharmony_ci{ 120bf215546Sopenharmony_ci#if defined(PIPE_ARCH_SSE) 121bf215546Sopenharmony_ci if (util_get_cpu_caps()->has_sse) { 122bf215546Sopenharmony_ci _mm_setcsr(mxcsr); 123bf215546Sopenharmony_ci } 124bf215546Sopenharmony_ci#endif 125bf215546Sopenharmony_ci} 126bf215546Sopenharmony_ci 127bf215546Sopenharmony_ci/** 128bf215546Sopenharmony_ci * Compute inverse of 4x4 matrix. 129bf215546Sopenharmony_ci * 130bf215546Sopenharmony_ci * \return false if the source matrix is singular. 131bf215546Sopenharmony_ci * 132bf215546Sopenharmony_ci * \author 133bf215546Sopenharmony_ci * Code contributed by Jacques Leroy jle@star.be 134bf215546Sopenharmony_ci * 135bf215546Sopenharmony_ci * Calculates the inverse matrix by performing the gaussian matrix reduction 136bf215546Sopenharmony_ci * with partial pivoting followed by back/substitution with the loops manually 137bf215546Sopenharmony_ci * unrolled. 138bf215546Sopenharmony_ci */ 139bf215546Sopenharmony_cibool 140bf215546Sopenharmony_ciutil_invert_mat4x4(float *out, const float *m) 141bf215546Sopenharmony_ci{ 142bf215546Sopenharmony_ci float wtmp[4][8]; 143bf215546Sopenharmony_ci float m0, m1, m2, m3, s; 144bf215546Sopenharmony_ci float *r0, *r1, *r2, *r3; 145bf215546Sopenharmony_ci 146bf215546Sopenharmony_ci#define MAT(m, r, c) (m)[(c)*4 + (r)] 147bf215546Sopenharmony_ci#define SWAP_ROWS(a, b) \ 148bf215546Sopenharmony_ci { \ 149bf215546Sopenharmony_ci float *_tmp = a; \ 150bf215546Sopenharmony_ci (a) = (b); \ 151bf215546Sopenharmony_ci (b) = _tmp; \ 152bf215546Sopenharmony_ci } 153bf215546Sopenharmony_ci 154bf215546Sopenharmony_ci r0 = wtmp[0], r1 = wtmp[1], r2 = wtmp[2], r3 = wtmp[3]; 155bf215546Sopenharmony_ci 156bf215546Sopenharmony_ci r0[0] = MAT(m, 0, 0), r0[1] = MAT(m, 0, 1), r0[2] = MAT(m, 0, 2), r0[3] = MAT(m, 0, 3), 157bf215546Sopenharmony_ci r0[4] = 1.0, r0[5] = r0[6] = r0[7] = 0.0, 158bf215546Sopenharmony_ci 159bf215546Sopenharmony_ci r1[0] = MAT(m, 1, 0), r1[1] = MAT(m, 1, 1), r1[2] = MAT(m, 1, 2), r1[3] = MAT(m, 1, 3), 160bf215546Sopenharmony_ci r1[5] = 1.0, r1[4] = r1[6] = r1[7] = 0.0, 161bf215546Sopenharmony_ci 162bf215546Sopenharmony_ci r2[0] = MAT(m, 2, 0), r2[1] = MAT(m, 2, 1), r2[2] = MAT(m, 2, 2), r2[3] = MAT(m, 2, 3), 163bf215546Sopenharmony_ci r2[6] = 1.0, r2[4] = r2[5] = r2[7] = 0.0, 164bf215546Sopenharmony_ci 165bf215546Sopenharmony_ci r3[0] = MAT(m, 3, 0), r3[1] = MAT(m, 3, 1), r3[2] = MAT(m, 3, 2), r3[3] = MAT(m, 3, 3), 166bf215546Sopenharmony_ci r3[7] = 1.0, r3[4] = r3[5] = r3[6] = 0.0; 167bf215546Sopenharmony_ci 168bf215546Sopenharmony_ci /* choose pivot - or die */ 169bf215546Sopenharmony_ci if (fabsf(r3[0]) > fabsf(r2[0])) 170bf215546Sopenharmony_ci SWAP_ROWS(r3, r2); 171bf215546Sopenharmony_ci if (fabsf(r2[0]) > fabsf(r1[0])) 172bf215546Sopenharmony_ci SWAP_ROWS(r2, r1); 173bf215546Sopenharmony_ci if (fabsf(r1[0]) > fabsf(r0[0])) 174bf215546Sopenharmony_ci SWAP_ROWS(r1, r0); 175bf215546Sopenharmony_ci if (0.0F == r0[0]) 176bf215546Sopenharmony_ci return false; 177bf215546Sopenharmony_ci 178bf215546Sopenharmony_ci /* eliminate first variable */ 179bf215546Sopenharmony_ci m1 = r1[0] / r0[0]; 180bf215546Sopenharmony_ci m2 = r2[0] / r0[0]; 181bf215546Sopenharmony_ci m3 = r3[0] / r0[0]; 182bf215546Sopenharmony_ci s = r0[1]; 183bf215546Sopenharmony_ci r1[1] -= m1 * s; 184bf215546Sopenharmony_ci r2[1] -= m2 * s; 185bf215546Sopenharmony_ci r3[1] -= m3 * s; 186bf215546Sopenharmony_ci s = r0[2]; 187bf215546Sopenharmony_ci r1[2] -= m1 * s; 188bf215546Sopenharmony_ci r2[2] -= m2 * s; 189bf215546Sopenharmony_ci r3[2] -= m3 * s; 190bf215546Sopenharmony_ci s = r0[3]; 191bf215546Sopenharmony_ci r1[3] -= m1 * s; 192bf215546Sopenharmony_ci r2[3] -= m2 * s; 193bf215546Sopenharmony_ci r3[3] -= m3 * s; 194bf215546Sopenharmony_ci s = r0[4]; 195bf215546Sopenharmony_ci if (s != 0.0F) { 196bf215546Sopenharmony_ci r1[4] -= m1 * s; 197bf215546Sopenharmony_ci r2[4] -= m2 * s; 198bf215546Sopenharmony_ci r3[4] -= m3 * s; 199bf215546Sopenharmony_ci } 200bf215546Sopenharmony_ci s = r0[5]; 201bf215546Sopenharmony_ci if (s != 0.0F) { 202bf215546Sopenharmony_ci r1[5] -= m1 * s; 203bf215546Sopenharmony_ci r2[5] -= m2 * s; 204bf215546Sopenharmony_ci r3[5] -= m3 * s; 205bf215546Sopenharmony_ci } 206bf215546Sopenharmony_ci s = r0[6]; 207bf215546Sopenharmony_ci if (s != 0.0F) { 208bf215546Sopenharmony_ci r1[6] -= m1 * s; 209bf215546Sopenharmony_ci r2[6] -= m2 * s; 210bf215546Sopenharmony_ci r3[6] -= m3 * s; 211bf215546Sopenharmony_ci } 212bf215546Sopenharmony_ci s = r0[7]; 213bf215546Sopenharmony_ci if (s != 0.0F) { 214bf215546Sopenharmony_ci r1[7] -= m1 * s; 215bf215546Sopenharmony_ci r2[7] -= m2 * s; 216bf215546Sopenharmony_ci r3[7] -= m3 * s; 217bf215546Sopenharmony_ci } 218bf215546Sopenharmony_ci 219bf215546Sopenharmony_ci /* choose pivot - or die */ 220bf215546Sopenharmony_ci if (fabsf(r3[1]) > fabsf(r2[1])) 221bf215546Sopenharmony_ci SWAP_ROWS(r3, r2); 222bf215546Sopenharmony_ci if (fabsf(r2[1]) > fabsf(r1[1])) 223bf215546Sopenharmony_ci SWAP_ROWS(r2, r1); 224bf215546Sopenharmony_ci if (0.0F == r1[1]) 225bf215546Sopenharmony_ci return false; 226bf215546Sopenharmony_ci 227bf215546Sopenharmony_ci /* eliminate second variable */ 228bf215546Sopenharmony_ci m2 = r2[1] / r1[1]; 229bf215546Sopenharmony_ci m3 = r3[1] / r1[1]; 230bf215546Sopenharmony_ci r2[2] -= m2 * r1[2]; 231bf215546Sopenharmony_ci r3[2] -= m3 * r1[2]; 232bf215546Sopenharmony_ci r2[3] -= m2 * r1[3]; 233bf215546Sopenharmony_ci r3[3] -= m3 * r1[3]; 234bf215546Sopenharmony_ci s = r1[4]; 235bf215546Sopenharmony_ci if (0.0F != s) { 236bf215546Sopenharmony_ci r2[4] -= m2 * s; 237bf215546Sopenharmony_ci r3[4] -= m3 * s; 238bf215546Sopenharmony_ci } 239bf215546Sopenharmony_ci s = r1[5]; 240bf215546Sopenharmony_ci if (0.0F != s) { 241bf215546Sopenharmony_ci r2[5] -= m2 * s; 242bf215546Sopenharmony_ci r3[5] -= m3 * s; 243bf215546Sopenharmony_ci } 244bf215546Sopenharmony_ci s = r1[6]; 245bf215546Sopenharmony_ci if (0.0F != s) { 246bf215546Sopenharmony_ci r2[6] -= m2 * s; 247bf215546Sopenharmony_ci r3[6] -= m3 * s; 248bf215546Sopenharmony_ci } 249bf215546Sopenharmony_ci s = r1[7]; 250bf215546Sopenharmony_ci if (0.0F != s) { 251bf215546Sopenharmony_ci r2[7] -= m2 * s; 252bf215546Sopenharmony_ci r3[7] -= m3 * s; 253bf215546Sopenharmony_ci } 254bf215546Sopenharmony_ci 255bf215546Sopenharmony_ci /* choose pivot - or die */ 256bf215546Sopenharmony_ci if (fabsf(r3[2]) > fabsf(r2[2])) 257bf215546Sopenharmony_ci SWAP_ROWS(r3, r2); 258bf215546Sopenharmony_ci if (0.0F == r2[2]) 259bf215546Sopenharmony_ci return false; 260bf215546Sopenharmony_ci 261bf215546Sopenharmony_ci /* eliminate third variable */ 262bf215546Sopenharmony_ci m3 = r3[2] / r2[2]; 263bf215546Sopenharmony_ci r3[3] -= m3 * r2[3], r3[4] -= m3 * r2[4], r3[5] -= m3 * r2[5], r3[6] -= m3 * r2[6], 264bf215546Sopenharmony_ci r3[7] -= m3 * r2[7]; 265bf215546Sopenharmony_ci 266bf215546Sopenharmony_ci /* last check */ 267bf215546Sopenharmony_ci if (0.0F == r3[3]) 268bf215546Sopenharmony_ci return false; 269bf215546Sopenharmony_ci 270bf215546Sopenharmony_ci s = 1.0F / r3[3]; /* now back substitute row 3 */ 271bf215546Sopenharmony_ci r3[4] *= s; 272bf215546Sopenharmony_ci r3[5] *= s; 273bf215546Sopenharmony_ci r3[6] *= s; 274bf215546Sopenharmony_ci r3[7] *= s; 275bf215546Sopenharmony_ci 276bf215546Sopenharmony_ci m2 = r2[3]; /* now back substitute row 2 */ 277bf215546Sopenharmony_ci s = 1.0F / r2[2]; 278bf215546Sopenharmony_ci r2[4] = s * (r2[4] - r3[4] * m2), r2[5] = s * (r2[5] - r3[5] * m2), 279bf215546Sopenharmony_ci r2[6] = s * (r2[6] - r3[6] * m2), r2[7] = s * (r2[7] - r3[7] * m2); 280bf215546Sopenharmony_ci m1 = r1[3]; 281bf215546Sopenharmony_ci r1[4] -= r3[4] * m1, r1[5] -= r3[5] * m1, r1[6] -= r3[6] * m1, r1[7] -= r3[7] * m1; 282bf215546Sopenharmony_ci m0 = r0[3]; 283bf215546Sopenharmony_ci r0[4] -= r3[4] * m0, r0[5] -= r3[5] * m0, r0[6] -= r3[6] * m0, r0[7] -= r3[7] * m0; 284bf215546Sopenharmony_ci 285bf215546Sopenharmony_ci m1 = r1[2]; /* now back substitute row 1 */ 286bf215546Sopenharmony_ci s = 1.0F / r1[1]; 287bf215546Sopenharmony_ci r1[4] = s * (r1[4] - r2[4] * m1), r1[5] = s * (r1[5] - r2[5] * m1), 288bf215546Sopenharmony_ci r1[6] = s * (r1[6] - r2[6] * m1), r1[7] = s * (r1[7] - r2[7] * m1); 289bf215546Sopenharmony_ci m0 = r0[2]; 290bf215546Sopenharmony_ci r0[4] -= r2[4] * m0, r0[5] -= r2[5] * m0, r0[6] -= r2[6] * m0, r0[7] -= r2[7] * m0; 291bf215546Sopenharmony_ci 292bf215546Sopenharmony_ci m0 = r0[1]; /* now back substitute row 0 */ 293bf215546Sopenharmony_ci s = 1.0F / r0[0]; 294bf215546Sopenharmony_ci r0[4] = s * (r0[4] - r1[4] * m0), r0[5] = s * (r0[5] - r1[5] * m0), 295bf215546Sopenharmony_ci r0[6] = s * (r0[6] - r1[6] * m0), r0[7] = s * (r0[7] - r1[7] * m0); 296bf215546Sopenharmony_ci 297bf215546Sopenharmony_ci MAT(out, 0, 0) = r0[4]; 298bf215546Sopenharmony_ci MAT(out, 0, 1) = r0[5], MAT(out, 0, 2) = r0[6]; 299bf215546Sopenharmony_ci MAT(out, 0, 3) = r0[7], MAT(out, 1, 0) = r1[4]; 300bf215546Sopenharmony_ci MAT(out, 1, 1) = r1[5], MAT(out, 1, 2) = r1[6]; 301bf215546Sopenharmony_ci MAT(out, 1, 3) = r1[7], MAT(out, 2, 0) = r2[4]; 302bf215546Sopenharmony_ci MAT(out, 2, 1) = r2[5], MAT(out, 2, 2) = r2[6]; 303bf215546Sopenharmony_ci MAT(out, 2, 3) = r2[7], MAT(out, 3, 0) = r3[4]; 304bf215546Sopenharmony_ci MAT(out, 3, 1) = r3[5], MAT(out, 3, 2) = r3[6]; 305bf215546Sopenharmony_ci MAT(out, 3, 3) = r3[7]; 306bf215546Sopenharmony_ci 307bf215546Sopenharmony_ci#undef MAT 308bf215546Sopenharmony_ci#undef SWAP_ROWS 309bf215546Sopenharmony_ci 310bf215546Sopenharmony_ci return true; 311bf215546Sopenharmony_ci} 312