1/************************************************************************** 2 * 3 * Copyright 2010-2021 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, 18 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 20 * USE OR OTHER DEALINGS IN THE SOFTWARE. 21 * 22 * The above copyright notice and this permission notice (including the 23 * next paragraph) shall be included in all copies or substantial portions 24 * of the Software. 25 * 26 **************************************************************************/ 27 28 29#include "pipe/p_config.h" 30 31#include "util/u_math.h" 32#include "util/u_cpu_detect.h" 33#include "util/u_pack_color.h" 34#include "util/u_rect.h" 35#include "util/u_sse.h" 36 37#include "lp_jit.h" 38#include "lp_rast.h" 39#include "lp_debug.h" 40#include "lp_state_fs.h" 41#include "lp_linear_priv.h" 42 43 44#if defined(PIPE_ARCH_SSE) 45 46#define FIXED15_ONE 0x7fff 47 48/* Translate floating point value to 1.15 unsigned fixed-point. 49 */ 50static inline ushort 51float_to_ufixed_1_15(float f) 52{ 53 return CLAMP((unsigned)(f * (float)FIXED15_ONE), 0, FIXED15_ONE); 54} 55 56 57/* Translate floating point value to 1.15 signed fixed-point. 58 */ 59static inline int16_t 60float_to_sfixed_1_15(float f) 61{ 62 return CLAMP((signed)(f * (float)FIXED15_ONE), -FIXED15_ONE, FIXED15_ONE); 63} 64 65 66/* Interpolate in 1.15 space, but produce a packed row of 0.8 values. 67 */ 68static const uint32_t * 69interp_0_8(struct lp_linear_elem *elem) 70{ 71 struct lp_linear_interp *interp = (struct lp_linear_interp *)elem; 72 uint32_t *row = interp->row; 73 __m128i a0 = interp->a0; 74 const __m128i dadx = interp->dadx; 75 const int width = (interp->width + 3) & ~3; 76 77 for (int i = 0; i < width; i += 4) { 78 __m128i l = _mm_srai_epi16(a0, 7); // l = a0 >> 7 79 a0 = _mm_add_epi16(a0, dadx); // a0 += dadx 80 81 __m128i h = _mm_srai_epi16(a0, 7); // h = a0 >> 7 82 a0 = _mm_add_epi16(a0, dadx); // a0 += dadx 83 84 // pack l[0..7] and h[0..7] as 16 bytes 85 *(__m128i *)&row[i] = _mm_packus_epi16(l, h); 86 } 87 88 // advance to next row 89 interp->a0 = _mm_add_epi16(interp->a0, interp->dady); 90 return interp->row; 91} 92 93static const uint32_t * 94interp_noop(struct lp_linear_elem *elem) 95{ 96 struct lp_linear_interp *interp = (struct lp_linear_interp *)elem; 97 return interp->row; 98} 99 100 101static const uint32_t * 102interp_check(struct lp_linear_elem *elem) 103{ 104 struct lp_linear_interp *interp = (struct lp_linear_interp *)elem; 105 interp->row[0] = 1; 106 return interp->row; 107} 108 109/* Not quite a noop - we use row[0] to track whether this gets called 110 * or not, so we can optimize which interpolants we care about. 111 */ 112void 113lp_linear_init_noop_interp(struct lp_linear_interp *interp) 114{ 115 interp->row[0] = 0; 116 interp->base.fetch = interp_check; 117} 118 119boolean 120lp_linear_init_interp(struct lp_linear_interp *interp, 121 int x, int y, int width, int height, 122 unsigned usage_mask, 123 boolean perspective, 124 float oow, 125 const float *a0, 126 const float *dadx, 127 const float *dady) 128{ 129 float s0[4]; 130 float dsdx[4]; 131 float dsdy[4]; 132 int16_t s0_fp[8]; 133 int16_t dsdx_fp[4]; 134 int16_t dsdy_fp[4]; 135 136 /* Zero coefficients to avoid using uninitialised values */ 137 memset(s0, 0, sizeof(s0)); 138 memset(dsdx, 0, sizeof(dsdx)); 139 memset(dsdy, 0, sizeof(dsdy)); 140 memset(s0_fp, 0, sizeof(s0_fp)); 141 memset(dsdx_fp, 0, sizeof(dsdx_fp)); 142 memset(dsdy_fp, 0, sizeof(dsdy_fp)); 143 144 if (perspective && oow != 1.0f) { 145 for (unsigned j = 0; j < 4; j++) { 146 if (usage_mask & (1<<j)) { 147 s0[j] = a0[j] * oow; 148 dsdx[j] = dadx[j] * oow; 149 dsdy[j] = dady[j] * oow; 150 } 151 } 152 } else { 153 for (unsigned j = 0; j < 4; j++) { 154 if (usage_mask & (1<<j)) { 155 s0[j] = a0[j]; 156 dsdx[j] = dadx[j]; 157 dsdy[j] = dady[j]; 158 } 159 } 160 } 161 162 s0[0] += x * dsdx[0] + y * dsdy[0]; 163 s0[1] += x * dsdx[1] + y * dsdy[1]; 164 s0[2] += x * dsdx[2] + y * dsdy[2]; 165 s0[3] += x * dsdx[3] + y * dsdy[3]; 166 167 /* XXX: lift all of this into the rectangle setup code. 168 * 169 * For rectangles with linear shaders, at setup time: 170 * - if w is constant (else mark as non-fastpath) 171 * - premultiply perspective interpolants by w 172 * - set w = 1 in position 173 * - check all interpolants for min/max 0..1 (else mark as 174 * non-fastpath) 175 */ 176 for (unsigned j = 0; j < 4; j++) { 177 if (usage_mask & (1<<j)) { 178 // compute texcoords at rect corners 179 float a = s0[j]; 180 float b = s0[j] + (width - 1) * dsdx[j]; 181 float c = s0[j] + (height - 1) * dsdy[j]; 182 float d = s0[j] + (height - 1) * dsdy[j] + (width - 1) * dsdx[j]; 183 184 if (MIN4(a,b,c,d) < 0.0) 185 FAIL("min < 0.0"); // out of bounds 186 187 if (MAX4(a,b,c,d) > 1.0) 188 FAIL("max > 1.0"); // out of bounds 189 190 dsdx_fp[j] = float_to_sfixed_1_15(dsdx[j]); 191 dsdy_fp[j] = float_to_sfixed_1_15(dsdy[j]); 192 193 s0_fp[j] = float_to_ufixed_1_15(s0[j]); // first pixel 194 s0_fp[j + 4] = s0_fp[j] + dsdx_fp[j]; // second pixel 195 196 dsdx_fp[j] *= 2; 197 } 198 } 199 200 interp->width = align(width, 4); 201 /* RGBA->BGRA swizzle here */ 202 interp->a0 = _mm_setr_epi16(s0_fp[2], s0_fp[1], s0_fp[0], s0_fp[3], 203 s0_fp[6], s0_fp[5], s0_fp[4], s0_fp[7]); 204 205 interp->dadx = _mm_setr_epi16(dsdx_fp[2], dsdx_fp[1], dsdx_fp[0], dsdx_fp[3], 206 dsdx_fp[2], dsdx_fp[1], dsdx_fp[0], dsdx_fp[3]); 207 208 interp->dady = _mm_setr_epi16(dsdy_fp[2], dsdy_fp[1], dsdy_fp[0], dsdy_fp[3], 209 dsdy_fp[2], dsdy_fp[1], dsdy_fp[0], dsdy_fp[3]); 210 211 /* If the value is y-invariant, eagerly calculate it here and then 212 * always return the precalculated value. 213 */ 214 if (dsdy[0] == 0 && 215 dsdy[1] == 0 && 216 dsdy[2] == 0 && 217 dsdy[3] == 0) { 218 interp_0_8(&interp->base); 219 interp->base.fetch = interp_noop; 220 } else { 221 interp->base.fetch = interp_0_8; 222 } 223 224 return TRUE; 225} 226 227#else 228boolean 229lp_linear_init_interp(struct lp_linear_interp *interp, 230 int x, int y, int width, int height, 231 unsigned usage_mask, 232 boolean perspective, 233 float oow, 234 const float *a0, 235 const float *dadx, 236 const float *dady) 237{ 238 return FALSE; 239} 240#endif 241