1/************************************************************************** 2 * 3 * Copyright 2010-2021 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, 18 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 20 * USE OR OTHER DEALINGS IN THE SOFTWARE. 21 * 22 * The above copyright notice and this permission notice (including the 23 * next paragraph) shall be included in all copies or substantial portions 24 * of the Software. 25 * 26 **************************************************************************/ 27 28 29#include "pipe/p_config.h" 30 31#include "util/u_math.h" 32#include "util/u_cpu_detect.h" 33#include "util/u_pack_color.h" 34#include "util/u_rect.h" 35#include "util/u_sse.h" 36 37#include "lp_jit.h" 38#include "lp_debug.h" 39#include "lp_state_fs.h" 40#include "lp_linear_priv.h" 41 42#if defined(PIPE_ARCH_SSE) 43 44#define FIXED16_SHIFT 16 45#define FIXED16_ONE (1<<16) 46#define FIXED16_HALF (1<<15) 47 48/* 49 * Color tolerance. Allow 1 bit of error in 8 bit unorm colors. 
50 */ 51#define FIXED16_TOL (FIXED16_ONE >> 7) 52 53/* 54 * Tolerance for texture coordinate derivatives when doing linear filtering. 55 * 56 * (Note that extra care needs to be taken when doing linear filtering as 57 * coordinates may snap up to neighbour texels inside the tile). 58 */ 59#define FIXED16_TOL_DERIV (FIXED16_TOL / TILE_SIZE) 60 61static inline int 62float_to_fixed16(float f) 63{ 64 return f * (float)FIXED16_ONE; 65} 66 67static inline int 68fixed16_frac(int x) 69{ 70 return x & (FIXED16_ONE - 1); 71} 72 73static inline int 74fixed16_approx(int x, int y, int tol) 75{ 76 return y - tol <= x && x <= y + tol; 77} 78 79 80/* 81 * Unstretched blit of a bgra texture. 82 */ 83static const uint32_t * 84fetch_bgra_memcpy(struct lp_linear_elem *elem) 85{ 86 struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem; 87 const struct lp_jit_texture *texture = samp->texture; 88 const uint32_t *src_row = 89 (const uint32_t *)((const uint8_t *)texture->base + 90 (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]); 91 const int s = samp->s; 92 const int width = samp->width; 93 const uint32_t *row; 94 95 src_row = &src_row[s >> FIXED16_SHIFT]; 96 97 if (((uintptr_t)src_row & 0xf) == 0) { 98 /* The source texels are already aligned. Return them */ 99 row = src_row; 100 } else { 101 memcpy(samp->row, src_row, width * sizeof *row); 102 row = samp->row; 103 } 104 105 samp->t += samp->dtdy; 106 return row; 107} 108 109 110/* 111 * Unstretched blit of a bgrx texture. 
112 */ 113static const uint32_t * 114fetch_bgrx_memcpy(struct lp_linear_elem *elem) 115{ 116 struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem; 117 const struct lp_jit_texture *texture = samp->texture; 118 const uint32_t *src_row = 119 (const uint32_t *)((const uint8_t *)texture->base + 120 (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]); 121 const int s = samp->s; 122 const int width = samp->width; 123 uint32_t *row = samp->row; 124 125 src_row = &src_row[s >> FIXED16_SHIFT]; 126 127 for (int i = 0; i < width; i++) { 128 row[i] = src_row[i] | 0xff000000; 129 } 130 131 samp->t += samp->dtdy; 132 return row; 133} 134 135 136/* 137 * Perform nearest filtered lookup of a row of texels. Texture lookup 138 * is assumed to be axis aligned but with arbitrary scaling. 139 * 140 * Texture coordinate interpolation is performed in 16.16 fixed point, 141 * not to be confused with the 1.15 format used by the interpolants. 142 * 143 * After 64 pixels (ie. in the next tile), the starting point will be 144 * recalculated with floating point arithmetic. 
 */
static const uint32_t *
fetch_bgra_axis_aligned(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   /* Row selected once per call from the integer part of t. */
   const uint32_t *src_row =
      (const uint32_t *)((const uint8_t *)texture->base +
                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
   const int dsdx = samp->dsdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;

   for (int i = 0; i < width; i++) {
      row[i] = src_row[s>>FIXED16_SHIFT];
      s += dsdx;
   }

   samp->t += samp->dtdy;
   return row;
}


/* As fetch_bgra_axis_aligned, but forcing alpha to 0xff (bgrx). */
static const uint32_t *
fetch_bgrx_axis_aligned(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *src_row =
      (const uint32_t *)((const uint8_t *)texture->base +
                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
   const int dsdx = samp->dsdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;

   for (int i = 0; i < width; i++) {
      row[i] = src_row[s>>FIXED16_SHIFT] | 0xff000000;
      s += dsdx;
   }

   samp->t += samp->dtdy;
   return row;
}


/* Non-axis aligned, but no clamping or wrapping required.
 * Both s and t step per-pixel; the per-row derivatives are applied at
 * the end of each call.
 */
static const uint32_t *
fetch_bgra(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint8_t *src = texture->base;
   const int stride = texture->row_stride[0];
   const int dsdx = samp->dsdx;
   const int dtdx = samp->dtdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;
   int t = samp->t;

   for (int i = 0; i < width; i++) {
      /* 4 == bytes per bgra texel */
      const uint8_t *texel = (src +
                              (t>>FIXED16_SHIFT) * stride +
                              (s>>FIXED16_SHIFT) * 4);

      row[i] = *(const uint32_t *)texel;

      s += dsdx;
      t += dtdx;
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}


/* As fetch_bgra, but forcing alpha to 0xff (bgrx). */
static const uint32_t *
fetch_bgrx(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint8_t *src = texture->base;
   const int stride = texture->row_stride[0];
   const int dsdx = samp->dsdx;
   const int dtdx = samp->dtdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;
   int t = samp->t;

   for (int i = 0; i < width; i++) {
      const uint8_t *texel = (src +
                              (t>>FIXED16_SHIFT) * stride +
                              (s>>FIXED16_SHIFT) * 4);

      row[i] = (*(const uint32_t *)texel) | 0xff000000;

      s += dsdx;
      t += dtdx;
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}

/* Non-axis aligned, clamped.  Texel coordinates are clamped to the
 * texture bounds (clamp-to-edge behaviour).
 */
static const uint32_t *
fetch_bgra_clamp(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint8_t *src = texture->base;
   const int stride = texture->row_stride[0];
   const int tex_height = texture->height - 1;   /* max valid t index */
   const int tex_width = texture->width - 1;     /* max valid s index */
   const int dsdx = samp->dsdx;
   const int dtdx = samp->dtdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;
   int t = samp->t;

   for (int i = 0; i < width; i++) {
      int ct = CLAMP(t>>FIXED16_SHIFT, 0, tex_height);
      int cs = CLAMP(s>>FIXED16_SHIFT, 0, tex_width);

      const uint8_t *texel = src + ct * stride + cs * 4;

      row[i] = *(const uint32_t *)texel;

      s += dsdx;
      t += dtdx;
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}

/* As fetch_bgra_clamp, but forcing alpha to 0xff (bgrx). */
static const uint32_t *
fetch_bgrx_clamp(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint8_t *src = texture->base;
   const int stride = texture->row_stride[0];
   const int tex_height = texture->height - 1;
   const int tex_width = texture->width - 1;
   const int dsdx = samp->dsdx;
   const int dtdx = samp->dtdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;
   int t = samp->t;

   for (int i = 0; i < width; i++) {
      int ct = CLAMP(t>>FIXED16_SHIFT, 0, tex_height);
      int cs = CLAMP(s>>FIXED16_SHIFT, 0, tex_width);

      const uint8_t *texel = src + ct * stride + cs * 4;

      row[i] = (*(const uint32_t *)texel) | 0xff000000;

      s += dsdx;
      t += dtdx;
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}

/**
 * Fetch and stretch one row.
 *
 * Results are cached in samp->stretched_row[] (a two-entry cache keyed
 * by y in stretched_row_y[]), so that the two rows blended by the
 * axis-aligned linear path are each stretched only once.
 */
static inline const uint32_t *
fetch_and_stretch_bgra_row(struct lp_linear_sampler *samp,
                           int y)
{
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *data = (const uint32_t *)texture->base;
   const int stride = texture->row_stride[0] / sizeof(uint32_t);
   const int width = samp->width;

   /*
    * Search the stretched row cache first.  On a hit, also point
    * stretched_row_index at the *other* entry so it gets replaced next.
    */

   if (y == samp->stretched_row_y[0]) {
      samp->stretched_row_index = 1;
      return samp->stretched_row[0];
   }

   if (y == samp->stretched_row_y[1]) {
      samp->stretched_row_index = 0;
      return samp->stretched_row[1];
   }

   /*
    * Replace one entry.
    */

   const uint32_t * restrict src_row = data + y * stride;
   uint32_t * restrict dst_row = samp->stretched_row[samp->stretched_row_index];

   if (fixed16_frac(samp->s) == 0 &&
       samp->dsdx == FIXED16_ONE) { // TODO: could be relaxed
      /*
       * 1:1 blit on the x direction.
       */
      src_row += samp->s >> FIXED16_SHIFT;

      if (((uintptr_t)src_row & 0xf) == 0) {
         /* The source texture is already aligned. Return it */
         return src_row;
      }

      /* Copy the source texture (unaligned loads, aligned stores) */
      for (int i = 0; i < width; i += 4) {
         __m128i src = _mm_loadu_si128((const __m128i *)&src_row[i]);
         *(__m128i *)&dst_row[i] = src;
      }
   }
   else {
      util_sse2_stretch_row_8unorm((__m128i *)dst_row,
                                   align(width, 4),
                                   src_row, samp->s, samp->dsdx);
   }

   samp->stretched_row_y[samp->stretched_row_index] = y;
   samp->stretched_row_index ^= 1;

   return dst_row;
}


/* Maximise only as we fetch unscaled pixels linearly into a size-64
 * temporary.  For minimise, we will want to either have a bigger
 * temporary or fetch sparsely.
 */
static const uint32_t *
fetch_bgra_axis_aligned_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const int width = samp->width;
   uint32_t * restrict row = samp->row;
   const int y = samp->t >> FIXED16_SHIFT;
   /* 8-bit blend weight taken from the top of t's fractional part. */
   const int w = (samp->t >> 8) & 0xff;

   samp->t += samp->dtdy;

   const uint32_t * restrict src_row0 = fetch_and_stretch_bgra_row(samp, y);

   if (w == 0) {
      /* Exactly on row y: no vertical blend needed. */
      return src_row0;
   }

   const uint32_t * restrict src_row1 = fetch_and_stretch_bgra_row(samp, y + 1);

   __m128i wt = _mm_set1_epi16(w);

   /* Combine the two rows using a constant weight.
    */
   for (int i = 0; i < width; i += 4) {
      __m128i srca = _mm_load_si128((const __m128i *)&src_row0[i]);
      __m128i srcb = _mm_load_si128((const __m128i *)&src_row1[i]);

      *(__m128i *)&row[i] = util_sse2_lerp_epi8_fixed88(srca, srcb, &wt, &wt);
   }

   return row;
}


/* Non-axis-aligned version.  Don't try to take advantage of
 * maximize.
 */
static const uint32_t *
fetch_bgra_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const int stride = texture->row_stride[0] / sizeof(uint32_t);
   const uint32_t *data = (const uint32_t *)texture->base;
   const int dsdx = samp->dsdx;
   const int dtdx = samp->dtdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;
   int t = samp->t;

   /* Process four destination pixels per iteration. */
   for (int i = 0; i < width; i += 4) {
      union m128i si0, si1, si2, si3, ws, wt;
      __m128i si02, si13;

      /* Scalar gather of the four 2x2 texel neighbourhoods plus the
       * 8-bit bilinear weights (top byte of the fractional part).
       */
      for (int j = 0; j < 4; j++) {
         const uint32_t *src = data + (t >> 16) * stride + (s >> 16);

         si0.ui[j] = src[0];            /* top-left */
         si1.ui[j] = src[1];            /* top-right */
         si2.ui[j] = src[stride + 0];   /* bottom-left */
         si3.ui[j] = src[stride + 1];   /* bottom-right */

         ws.ui[j] = (s>>8) & 0xff;
         wt.ui[j] = (t>>8) & 0xff;

         s += dsdx;
         t += dtdx;
      }

      /* Replicate each weight byte across its 32-bit lane. */
      ws.m = _mm_or_si128(ws.m, _mm_slli_epi32(ws.m, 16));
      ws.m = _mm_or_si128(ws.m, _mm_slli_epi32(ws.m, 8));

      wt.m = _mm_or_si128(wt.m, _mm_slli_epi32(wt.m, 16));
      wt.m = _mm_or_si128(wt.m, _mm_slli_epi32(wt.m, 8));

      /* Lerp vertically, then horizontally. */
      si02 = util_sse2_lerp_epi8_fixed08(si0.m, si2.m, wt.m);
      si13 = util_sse2_lerp_epi8_fixed08(si1.m, si3.m, wt.m);

      *(__m128i *)&row[i] = util_sse2_lerp_epi8_fixed08(si02, si13, ws.m);
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}


/* Clamped, non-axis-aligned version.  Don't try to take advantage of
 * maximize.
 */
static const uint32_t *
fetch_bgra_clamp_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *data = (const uint32_t *)texture->base;
   const int stride = texture->row_stride[0] / sizeof(uint32_t);
   const int tex_height = texture->height - 1;
   const int tex_width = texture->width - 1;
   const int dsdx = samp->dsdx;
   const int dtdx = samp->dtdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;
   int t = samp->t;

   /* width, height, stride (in pixels) must be smaller than 32768
    * (the signed 16-bit min/max and madd operations below rely on it).
    */
   __m128i dsdx4, dtdx4, s4, t4, stride4, w4, h4, zero, one;
   s4 = _mm_set1_epi32(s);
   t4 = _mm_set1_epi32(t);
   /* Stagger the four lanes by 0..3 pixel steps. */
   s4 = _mm_add_epi32(s4, _mm_set_epi32(3*dsdx, 2*dsdx, dsdx, 0));
   t4 = _mm_add_epi32(t4, _mm_set_epi32(3*dtdx, 2*dtdx, dtdx, 0));
   dsdx4 = _mm_set1_epi32(4*dsdx);
   dtdx4 = _mm_set1_epi32(4*dtdx);
   stride4 = _mm_set1_epi32(stride);
   w4 = _mm_set1_epi32(tex_width);
   h4 = _mm_set1_epi32(tex_height);
   zero = _mm_setzero_si128();
   one = _mm_set1_epi32(1);

   for (int i = 0; i < width; i += 4) {
      union m128i addr[4];
      __m128i ws, wt, wsl, wsh, wtl, wth;
      __m128i s4s, t4s, cs0, cs1, ct0, ct1, tmp, si[4];

      /* Integer texel coordinates, clamped to [0, w-1] / [0, h-1],
       * for both neighbours in each direction.
       */
      s4s = _mm_srli_epi32(s4, 16);
      t4s = _mm_srli_epi32(t4, 16);
      cs0 = _mm_min_epi16(_mm_max_epi16(s4s, zero), w4);
      cs1 = _mm_add_epi16(s4s, one);
      cs1 = _mm_min_epi16(_mm_max_epi16(cs1, zero), w4);
      ct0 = _mm_min_epi16(_mm_max_epi16(t4s, zero), h4);
      ct1 = _mm_add_epi16(t4s, one);
      ct1 = _mm_min_epi16(_mm_max_epi16(ct1, zero), h4);
      /* Linear texel indices: t * stride + s for each 2x2 corner. */
      tmp = _mm_madd_epi16(ct0, stride4);
      addr[0].m = _mm_add_epi32(tmp, cs0);
      addr[1].m = _mm_add_epi32(tmp, cs1);
      tmp = _mm_madd_epi16(ct1, stride4);
      addr[2].m = _mm_add_epi32(tmp, cs0);
      addr[3].m = _mm_add_epi32(tmp, cs1);

      /* Gather the four texels for each corner into si[]. */
      for (int j = 0; j < 4; j++) {
         __m128i ld1, ld2, ld3;
         si[j] = _mm_cvtsi32_si128(data[addr[j].ui[0]]);
         ld1 = _mm_cvtsi32_si128(data[addr[j].ui[1]]);
         si[j] = _mm_unpacklo_epi32(si[j], ld1);
         ld2 = _mm_cvtsi32_si128(data[addr[j].ui[2]]);
         ld3 = _mm_cvtsi32_si128(data[addr[j].ui[3]]);
         ld2 = _mm_unpacklo_epi32(ld2, ld3);
         si[j] = _mm_unpacklo_epi64(si[j], ld2);
      }

      /* 8-bit bilinear weights from the top of the fractional parts. */
      ws = _mm_srli_epi32(s4, 8);
      ws = _mm_and_si128(ws, _mm_set1_epi32(0xFF));
      wt = _mm_srli_epi32(t4, 8);
      wt = _mm_and_si128(wt, _mm_set1_epi32(0xFF));

      s4 = _mm_add_epi32(s4, dsdx4);
      t4 = _mm_add_epi32(t4, dtdx4);

#if 0
/* scalar code for reference */
      for (int j = 0; j < 4; j++) {
         int s0 = s >> FIXED16_SHIFT;
         int t0 = t >> FIXED16_SHIFT;
         int cs0 = CLAMP(s0    , 0, tex_width);
         int cs1 = CLAMP(s0 + 1, 0, tex_width);
         int ct0 = CLAMP(t0    , 0, tex_height);
         int ct1 = CLAMP(t0 + 1, 0, tex_height);

         si0.ui[j] = data[ct0 * stride + cs0];
         si1.ui[j] = data[ct0 * stride + cs1];
         si2.ui[j] = data[ct1 * stride + cs0];
         si3.ui[j] = data[ct1 * stride + cs1];

         ws.ui[j] = (s>>8) & 0xff;
         wt.ui[j] = (t>>8) & 0xff;

         s += dsdx;
         t += dtdx;
      }
#endif

      /* Expand weights to 16-bit lane pairs for the fixed88 lerp. */
      ws = _mm_or_si128(ws, _mm_slli_epi32(ws, 16));
      wsl = _mm_shuffle_epi32(ws, _MM_SHUFFLE(1,1,0,0));
      wsh = _mm_shuffle_epi32(ws, _MM_SHUFFLE(3,3,2,2));

      wt = _mm_or_si128(wt, _mm_slli_epi32(wt, 16));
      wtl = _mm_shuffle_epi32(wt, _MM_SHUFFLE(1,1,0,0));
      wth = _mm_shuffle_epi32(wt, _MM_SHUFFLE(3,3,2,2));

      *(__m128i *)&row[i] = util_sse2_lerp_2d_epi8_fixed88(si[0], si[2],
                                                           &si[1], &si[3],
                                                           &wtl, &wth,
                                                           &wsl, &wsh);
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;

   return row;
}


/* bgrx wrapper: run the bgra axis-aligned linear fetch, then force
 * alpha to 0xff.  Note the bgra fetch may return its own (possibly
 * cached/aligned) row, so the result is OR'd into samp->row.
 */
static const uint32_t *
fetch_bgrx_axis_aligned_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const __m128i mask = _mm_set1_epi32(0xff000000);
   uint32_t *dst_row = samp->row;
   const uint32_t *src_row = fetch_bgra_axis_aligned_linear(&samp->base);
   const int width = samp->width;

   for (int i = 0; i < width; i += 4) {
      __m128i bgra = *(__m128i *)&src_row[i];
      __m128i bgrx = _mm_or_si128(bgra, mask);
      *(__m128i *)&dst_row[i] = bgrx;
   }

   return dst_row;
}


/* bgrx wrapper over fetch_bgra_clamp_linear: force alpha to 0xff
 * in place (the bgra fetch always writes samp->row).
 */
static const uint32_t *
fetch_bgrx_clamp_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const __m128i mask = _mm_set1_epi32(0xff000000);
   uint32_t *row = samp->row;
   const int width = samp->width;

   fetch_bgra_clamp_linear(&samp->base);

   for (int i = 0; i < width; i += 4) {
      __m128i bgra = *(__m128i *)&row[i];
      __m128i bgrx = _mm_or_si128(bgra, mask);
      *(__m128i *)&row[i] = bgrx;
   }

   return row;
}


/* bgrx wrapper over fetch_bgra_linear: force alpha to 0xff in place. */
static const uint32_t *
fetch_bgrx_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const __m128i mask = _mm_set1_epi32(0xff000000);
   uint32_t *row = samp->row;
   const int width = samp->width;

   fetch_bgra_linear(&samp->base);

   for (int i = 0; i < width; i += 4) {
      __m128i bgra = *(__m128i *)&row[i];
      __m128i bgrx = _mm_or_si128(bgra, mask);
      *(__m128i *)&row[i] = bgrx;
   }

   return row;
}


/* Decide whether the sampler can be treated as nearest-filtered,
 * either because it is declared nearest or because a linear sampler
 * degenerates to nearest for these interpolants.
 */
static boolean
sampler_is_nearest(const struct lp_linear_sampler *samp,
                   const struct lp_sampler_static_state *sampler_state,
                   boolean minify)
{
   unsigned img_filter;

   if (minify)
      img_filter = sampler_state->sampler_state.min_img_filter;
   else
      img_filter = sampler_state->sampler_state.mag_img_filter;

   /* Is it obviously nearest?
    */
   if (img_filter == PIPE_TEX_FILTER_NEAREST)
      return TRUE;

   /* Otherwise look for linear samplers which devolve to nearest.
    */

   /* Needs to be axis aligned.
    */
   if (!samp->axis_aligned)
      return FALSE;

   /* NOTE(review): disabled heuristics below.  The comparisons look
    * wrong as written: "dsdx < -HALF && dsdx < HALF" presumably meant
    * "-HALF < dsdx && dsdx < HALF", and the minify test's
    * "< 2*ONE || > 2*ONE" is nearly always true — likely meant
    * "< -2*ONE || > 2*ONE".  Verify before ever enabling this block.
    */
   if (0) {
      /* For maximizing shaders, revert to nearest
       */
      if (samp->dsdx < -FIXED16_HALF && samp->dsdx < FIXED16_HALF &&
          samp->dtdy < -FIXED16_HALF && samp->dtdy < FIXED16_HALF)
         return TRUE;

      /* For severely minimising shaders, revert to nearest:
       */
      if ((samp->dsdx < 2 * FIXED16_ONE || samp->dsdx > 2 * FIXED16_ONE) &&
          (samp->dtdy < 2 * FIXED16_ONE || samp->dtdy > 2 * FIXED16_ONE))
         return TRUE;
   }

   /*
    * Must be near a pixel center:
    */
   if (!fixed16_approx(fixed16_frac(samp->s), FIXED16_HALF, FIXED16_TOL) ||
       !fixed16_approx(fixed16_frac(samp->t), FIXED16_HALF, FIXED16_TOL))
      return FALSE;

   /*
    * Must make a full step between pixels:
    */
   if (!fixed16_approx(samp->dsdx, FIXED16_ONE, FIXED16_TOL_DERIV) ||
       !fixed16_approx(samp->dtdy, FIXED16_ONE, FIXED16_TOL_DERIV))
      return FALSE;

   /* Treat it as nearest!
    */
   return TRUE;
}

/* XXX: Lots of static-state parameters being passed in here but very
 * little info is extracted from each one.  Consolidate it all down to
 * something succinct in the prepare phase?
 */
boolean
lp_linear_init_sampler(struct lp_linear_sampler *samp,
                       const struct lp_tgsi_texture_info *info,
                       const struct lp_sampler_static_state *sampler_state,
                       const struct lp_jit_texture *texture,
                       int x0, int y0, int width, int height,
                       const float (*a0)[4],
                       const float (*dadx)[4],
                       const float (*dady)[4])
{
   const struct lp_tgsi_channel_info *schan = &info->coord[0];
   const struct lp_tgsi_channel_info *tchan = &info->coord[1];

   assert(schan->file == TGSI_FILE_INPUT);
   assert(tchan->file == TGSI_FILE_INPUT);

   /* Interpolant slot 0 carries position; its .w is read here. */
   float w0 = a0[0][3];

   /* Offset past the position slot: shader inputs start at slot 1. */
   int foo = 1;
   float s0 = a0[schan->u.index+foo][schan->swizzle];
   float dsdx = dadx[schan->u.index+foo][schan->swizzle];
   float dsdy = dady[schan->u.index+foo][schan->swizzle];

   float t0 = a0[tchan->u.index+foo][tchan->swizzle];
   float dtdx = dadx[tchan->u.index+foo][tchan->swizzle];
   float dtdy = dady[tchan->u.index+foo][tchan->swizzle];

   int mins, mint, maxs, maxt;
   /* Perspective divide by the constant w, scaled to texel space. */
   float oow = 1.0f / w0;
   float width_oow = texture->width * oow;
   float height_oow = texture->height * oow;
   float fdsdx = dsdx * width_oow;
   float fdsdy = dsdy * width_oow;
   float fdtdx = dtdx * height_oow;
   float fdtdy = dtdy * height_oow;
   int fetch_width;
   int fetch_height;
   boolean minify;
   boolean need_wrap;
   boolean is_nearest;

   samp->texture = texture;
   samp->width = width;

   /* Starting coordinates at (x0, y0), in 16.16 fixed point texels. */
   samp->s = float_to_fixed16(fdsdx * x0 +
                              fdsdy * y0 +
                              s0 * width_oow);

   samp->t = float_to_fixed16(fdtdx * x0 +
                              fdtdy * y0 +
                              t0 * height_oow);

   samp->dsdx = float_to_fixed16(fdsdx);
   samp->dsdy = float_to_fixed16(fdsdy);
   samp->dtdx = float_to_fixed16(fdtdx);
   samp->dtdy = float_to_fixed16(fdtdy);


   samp->axis_aligned = (samp->dsdy == 0 &&
                         samp->dtdx == 0); // TODO: could be relaxed

   /* Rho: largest absolute coordinate derivative; more than one texel
    * per pixel means we are minifying.
    */
   {
      int dsdx = samp->dsdx >= 0 ? samp->dsdx : -samp->dsdx;
      int dsdy = samp->dsdy >= 0 ? samp->dsdy : -samp->dsdy;
      int dtdx = samp->dtdx >= 0 ? samp->dtdx : -samp->dtdx;
      int dtdy = samp->dtdy >= 0 ? samp->dtdy : -samp->dtdy;
      int rho = MAX4(dsdx, dsdy, dtdx, dtdy);

      minify = (rho > FIXED16_ONE);
   }

   is_nearest = sampler_is_nearest(samp, sampler_state, minify);

   if (!is_nearest) {
      /* Linear filtering samples around the half-texel offset. */
      samp->s -= FIXED16_HALF;
      samp->t -= FIXED16_HALF;
   }

   /* Check for clamping.  This rarely happens as we're rejecting interpolants
    * which fall outside the 0..1 range.
    */

   if (is_nearest) {
      /* Nearest fetch routines don't employ SSE and always operate one pixel
       * at a time.
       */
      fetch_width = width - 1;
   }
   else {
      /* Linear fetch routines employ SSE, and always fetch groups of four
       * texels.
       */
      fetch_width = align(width, 4) - 1;
   }
   fetch_height = height - 1;

   /* Coordinate extrema over the fetched region (note: the int s0/t0
    * here shadow the float s0/t0 above).
    */
   if (samp->axis_aligned) {
      int s0 = samp->s;
      int s1 = samp->s + fetch_width * samp->dsdx;
      int t0 = samp->t;
      int t1 = samp->t + fetch_height * samp->dtdy;

      mins = MIN2(s0, s1);
      mint = MIN2(t0, t1);
      maxs = MAX2(s0, s1);
      maxt = MAX2(t0, t1);
   }
   else {
      /* Non-axis-aligned: check all four corners of the region. */
      int s0 = samp->s;
      int s1 = samp->s + fetch_width * samp->dsdx;
      int s2 = samp->s + fetch_height * samp->dsdy;
      int s3 = samp->s + fetch_width * samp->dsdx + fetch_height * samp->dsdy;
      int t0 = samp->t;
      int t1 = samp->t + fetch_width * samp->dtdx;
      int t2 = samp->t + fetch_height * samp->dtdy;
      int t3 = samp->t + fetch_width * samp->dtdx + fetch_height * samp->dtdy;

      mins = MIN4(s0, s1, s2, s3);
      mint = MIN4(t0, t1, t2, t3);
      maxs = MAX4(s0, s1, s2, s3);
      maxt = MAX4(t0, t1, t2, t3);
   }

   if (is_nearest) {
      need_wrap = (mins < 0 ||
                   mint < 0 ||
                   maxs >= (texture->width << FIXED16_SHIFT) ||
                   maxt >= (texture->height << FIXED16_SHIFT));
   } else {
      /* Linear also reads the +1 neighbour texel. */
      need_wrap = (mins < 0 ||
                   mint < 0 ||
                   maxs + FIXED16_ONE >= (texture->width << FIXED16_SHIFT) ||
                   maxt + FIXED16_ONE >= (texture->height << FIXED16_SHIFT));
   }

   /* Debug dump of the computed coordinate range (disabled). */
   if (0 && need_wrap) {
      debug_printf("%u x %u %s\n",
                   texture->width, texture->height,
                   is_nearest ? "nearest" : "linear");
      debug_printf("mins = %f\n", mins*1.0f/FIXED16_ONE);
      debug_printf("mint = %f\n", mint*1.0f/FIXED16_ONE);
      debug_printf("maxs = %f\n", maxs*1.0f/FIXED16_ONE);
      debug_printf("maxt = %f\n", maxt*1.0f/FIXED16_ONE);
      debug_printf("\n");
   }

   /* We accept any mode below, but we only implement clamping.
    */
   if (need_wrap &&
       (sampler_state->sampler_state.wrap_s != PIPE_TEX_WRAP_CLAMP_TO_EDGE ||
        sampler_state->sampler_state.wrap_t != PIPE_TEX_WRAP_CLAMP_TO_EDGE)) {
      return FALSE;
   }

   /* Pick the cheapest fetch routine matching filter, format,
    * alignment and wrapping requirements.
    */
   if (is_nearest) {
      switch (sampler_state->texture_state.format) {
      case PIPE_FORMAT_B8G8R8A8_UNORM:
         if (need_wrap)
            samp->base.fetch = fetch_bgra_clamp;
         else if (!samp->axis_aligned)
            samp->base.fetch = fetch_bgra;
         else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
            samp->base.fetch = fetch_bgra_axis_aligned;
         else
            samp->base.fetch = fetch_bgra_memcpy;
         return TRUE;
      case PIPE_FORMAT_B8G8R8X8_UNORM:
         if (need_wrap)
            samp->base.fetch = fetch_bgrx_clamp;
         else if (!samp->axis_aligned)
            samp->base.fetch = fetch_bgrx;
         else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
            samp->base.fetch = fetch_bgrx_axis_aligned;
         else
            samp->base.fetch = fetch_bgrx_memcpy;
         return TRUE;
      default:
         break;
      }

      FAIL("unknown format for nearest");
   }
   else {
      /* Invalidate the stretched-row cache. */
      samp->stretched_row_y[0] = -1;
      samp->stretched_row_y[1] = -1;
      samp->stretched_row_index = 0;

      switch (sampler_state->texture_state.format) {
      case PIPE_FORMAT_B8G8R8A8_UNORM:
         if (need_wrap)
            samp->base.fetch = fetch_bgra_clamp_linear;
         else if (!samp->axis_aligned)
            samp->base.fetch = fetch_bgra_linear;
         else
            samp->base.fetch = fetch_bgra_axis_aligned_linear;
         return TRUE;
      case PIPE_FORMAT_B8G8R8X8_UNORM:
         if (need_wrap)
            samp->base.fetch = fetch_bgrx_clamp_linear;
         else if (!samp->axis_aligned)
            samp->base.fetch = fetch_bgrx_linear;
         else
            samp->base.fetch = fetch_bgrx_axis_aligned_linear;
         return TRUE;
      default:
         break;
      }

      FAIL("unknown format");
   }
}


/* Fetch routine that just returns the (unmodified) row buffer. */
static const uint32_t *
fetch_noop(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   return samp->row;
}


/* Install the no-op fetch routine. */
void
lp_linear_init_noop_sampler(struct lp_linear_sampler *samp)
{
   samp->base.fetch = fetch_noop;
}


/*
 * Check the given sampler and texture info for linear path compatibility.
 */
boolean
lp_linear_check_sampler(const struct lp_sampler_static_state *sampler,
                        const struct lp_tgsi_texture_info *tex)
{
   if (tex->modifier != LP_BLD_TEX_MODIFIER_NONE)
      return FALSE;

   if (tex->target != TGSI_TEXTURE_2D)
      return FALSE;

   if (tex->coord[0].file != TGSI_FILE_INPUT ||
       tex->coord[1].file != TGSI_FILE_INPUT)
      return FALSE;

   /* These are the only sampling modes we support at the moment.
    *
    * Actually we'll accept any mode as we're failing on any
    * interpolant which exceeds 0..1.  Clamping is applied only to
    * avoid invalid reads.
    */
   if (!is_nearest_sampler(sampler) &&
       !is_linear_sampler(sampler))
      return FALSE;

   /* These are the only texture formats we support at the moment
    */
   if (sampler->texture_state.format != PIPE_FORMAT_B8G8R8A8_UNORM &&
       sampler->texture_state.format != PIPE_FORMAT_B8G8R8X8_UNORM)
      return FALSE;

   /* We don't support sampler view swizzling on the linear path */
   if (sampler->texture_state.swizzle_r != PIPE_SWIZZLE_X ||
       sampler->texture_state.swizzle_g != PIPE_SWIZZLE_Y ||
       sampler->texture_state.swizzle_b != PIPE_SWIZZLE_Z ||
       sampler->texture_state.swizzle_a != PIPE_SWIZZLE_W) {
      return FALSE;
   }

   return TRUE;
}

#else
/* Non-SSE build: the linear path is never compatible. */
boolean
lp_linear_check_sampler(const struct lp_sampler_static_state *sampler,
                        const struct lp_tgsi_texture_info *tex)
{
   return FALSE;
}
#endif