1/**************************************************************************
2 *
3 * Copyright 2010-2021 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 * USE OR OTHER DEALINGS IN THE SOFTWARE.
21 *
22 * The above copyright notice and this permission notice (including the
23 * next paragraph) shall be included in all copies or substantial portions
24 * of the Software.
25 *
26 **************************************************************************/
27
28
29#include "pipe/p_config.h"
30
31#include "util/u_math.h"
32#include "util/u_cpu_detect.h"
33#include "util/u_pack_color.h"
34#include "util/u_rect.h"
35#include "util/u_sse.h"
36
37#include "lp_jit.h"
38#include "lp_debug.h"
39#include "lp_state_fs.h"
40#include "lp_linear_priv.h"
41
42#if defined(PIPE_ARCH_SSE)
43
44#define FIXED16_SHIFT  16
45#define FIXED16_ONE    (1<<16)
46#define FIXED16_HALF   (1<<15)
47
48/*
49 * Color tolerance.  Allow 1 bit of error in 8 bit unorm colors.
50 */
51#define FIXED16_TOL (FIXED16_ONE >> 7)
52
53/*
54 * Tolerance for texture coordinate derivatives when doing linear filtering.
55 *
56 * (Note that extra care needs to be taken when doing linear filtering as
57 * coordinates may snap up to neighbour texels inside the tile).
58 */
59#define FIXED16_TOL_DERIV (FIXED16_TOL / TILE_SIZE)
60
61static inline int
62float_to_fixed16(float f)
63{
64   return f * (float)FIXED16_ONE;
65}
66
67static inline int
68fixed16_frac(int x)
69{
70   return x & (FIXED16_ONE - 1);
71}
72
/*
 * Return non-zero if x lies within +/- tol of y.
 */
static inline int
fixed16_approx(int x, int y, int tol)
{
   return (x >= y - tol) && (x <= y + tol);
}
78
79
80/*
81 * Unstretched blit of a bgra texture.
82 */
83static const uint32_t *
84fetch_bgra_memcpy(struct lp_linear_elem *elem)
85{
86   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
87   const struct lp_jit_texture *texture = samp->texture;
88   const uint32_t *src_row =
89      (const uint32_t *)((const uint8_t *)texture->base +
90                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
91   const int s     = samp->s;
92   const int width = samp->width;
93   const uint32_t *row;
94
95   src_row = &src_row[s >> FIXED16_SHIFT];
96
97   if (((uintptr_t)src_row & 0xf) == 0) {
98      /* The source texels are already aligned. Return them */
99      row = src_row;
100   } else {
101      memcpy(samp->row, src_row, width * sizeof *row);
102      row = samp->row;
103   }
104
105   samp->t += samp->dtdy;
106   return row;
107}
108
109
110/*
111 * Unstretched blit of a bgrx texture.
112 */
113static const uint32_t *
114fetch_bgrx_memcpy(struct lp_linear_elem *elem)
115{
116   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
117   const struct lp_jit_texture *texture = samp->texture;
118   const uint32_t *src_row =
119      (const uint32_t *)((const uint8_t *)texture->base +
120                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
121   const int s     = samp->s;
122   const int width = samp->width;
123   uint32_t *row   = samp->row;
124
125   src_row = &src_row[s >> FIXED16_SHIFT];
126
127   for (int i = 0; i < width; i++) {
128      row[i] = src_row[i] | 0xff000000;
129   }
130
131   samp->t += samp->dtdy;
132   return row;
133}
134
135
136/*
137 * Perform nearest filtered lookup of a row of texels.  Texture lookup
138 * is assumed to be axis aligned but with arbitrary scaling.
139 *
140 * Texture coordinate interpolation is performed in 16.16 fixed point,
141 * not to be confused with the 1.15 format used by the interpolants.
142 *
143 * After 64 pixels (ie. in the next tile), the starting point will be
144 * recalculated with floating point arithmetic.
145 */
146static const uint32_t *
147fetch_bgra_axis_aligned(struct lp_linear_elem *elem)
148{
149   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
150   const struct lp_jit_texture *texture = samp->texture;
151   const uint32_t *src_row =
152      (const uint32_t *)((const uint8_t *)texture->base +
153                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
154   const int dsdx  = samp->dsdx;
155   const int width = samp->width;
156   uint32_t *row   = samp->row;
157   int s = samp->s;
158
159   for (int i = 0; i < width; i++) {
160      row[i] = src_row[s>>FIXED16_SHIFT];
161      s += dsdx;
162   }
163
164   samp->t += samp->dtdy;
165   return row;
166}
167
168
169static const uint32_t *
170fetch_bgrx_axis_aligned(struct lp_linear_elem *elem)
171{
172   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
173   const struct lp_jit_texture *texture = samp->texture;
174   const uint32_t *src_row =
175      (const uint32_t *)((const uint8_t *)texture->base +
176                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
177   const int dsdx  = samp->dsdx;
178   const int width = samp->width;
179   uint32_t *row   = samp->row;
180   int s = samp->s;
181
182   for (int i = 0; i < width; i++) {
183      row[i] = src_row[s>>FIXED16_SHIFT] | 0xff000000;
184      s += dsdx;
185   }
186
187   samp->t += samp->dtdy;
188   return row;
189}
190
191
192/* Non-axis aligned, but no clamping or wrapping required
193 */
194static const uint32_t *
195fetch_bgra(struct lp_linear_elem *elem)
196{
197   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
198   const struct lp_jit_texture *texture = samp->texture;
199   const uint8_t *src = texture->base;
200   const int stride = texture->row_stride[0];
201   const int dsdx  = samp->dsdx;
202   const int dtdx  = samp->dtdx;
203   const int width = samp->width;
204   uint32_t *row   = samp->row;
205   int s = samp->s;
206   int t = samp->t;
207
208   for (int i = 0; i < width; i++) {
209      const uint8_t *texel = (src +
210                              (t>>FIXED16_SHIFT) * stride +
211                              (s>>FIXED16_SHIFT) * 4);
212
213      row[i] = *(const uint32_t *)texel;
214
215      s += dsdx;
216      t += dtdx;
217   }
218
219   samp->s += samp->dsdy;
220   samp->t += samp->dtdy;
221   return row;
222}
223
224
225static const uint32_t *
226fetch_bgrx(struct lp_linear_elem *elem)
227{
228   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
229   const struct lp_jit_texture *texture = samp->texture;
230   const uint8_t *src = texture->base;
231   const int stride = texture->row_stride[0];
232   const int dsdx  = samp->dsdx;
233   const int dtdx  = samp->dtdx;
234   const int width = samp->width;
235   uint32_t *row   = samp->row;
236   int s = samp->s;
237   int t = samp->t;
238
239   for (int i = 0; i < width; i++) {
240      const uint8_t *texel = (src +
241                              (t>>FIXED16_SHIFT) * stride +
242                              (s>>FIXED16_SHIFT) * 4);
243
244      row[i] = (*(const uint32_t *)texel) | 0xff000000;
245
246      s += dsdx;
247      t += dtdx;
248   }
249
250   samp->s += samp->dsdy;
251   samp->t += samp->dtdy;
252   return row;
253}
254
255/* Non-axis aligned, clamped.
256 */
257static const uint32_t *
258fetch_bgra_clamp(struct lp_linear_elem *elem)
259{
260   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
261   const struct lp_jit_texture *texture = samp->texture;
262   const uint8_t *src   = texture->base;
263   const int stride     = texture->row_stride[0];
264   const int tex_height = texture->height - 1;
265   const int tex_width  = texture->width - 1;
266   const int dsdx  = samp->dsdx;
267   const int dtdx  = samp->dtdx;
268   const int width = samp->width;
269   uint32_t *row   = samp->row;
270   int s = samp->s;
271   int t = samp->t;
272
273   for (int i = 0; i < width; i++) {
274      int ct = CLAMP(t>>FIXED16_SHIFT, 0, tex_height);
275      int cs = CLAMP(s>>FIXED16_SHIFT, 0, tex_width);
276
277      const uint8_t *texel = src + ct * stride + cs * 4;
278
279      row[i] = *(const uint32_t *)texel;
280
281      s += dsdx;
282      t += dtdx;
283   }
284
285   samp->s += samp->dsdy;
286   samp->t += samp->dtdy;
287   return row;
288}
289
290static const uint32_t *
291fetch_bgrx_clamp(struct lp_linear_elem *elem)
292{
293   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
294   const struct lp_jit_texture *texture = samp->texture;
295   const uint8_t *src   = texture->base;
296   const int stride     = texture->row_stride[0];
297   const int tex_height = texture->height - 1;
298   const int tex_width  = texture->width - 1;
299   const int dsdx  = samp->dsdx;
300   const int dtdx  = samp->dtdx;
301   const int width = samp->width;
302   uint32_t *row   = samp->row;
303   int s = samp->s;
304   int t = samp->t;
305
306   for (int i = 0; i < width; i++) {
307      int ct = CLAMP(t>>FIXED16_SHIFT, 0, tex_height);
308      int cs = CLAMP(s>>FIXED16_SHIFT, 0, tex_width);
309
310      const uint8_t *texel = src + ct * stride + cs * 4;
311
312      row[i] = (*(const uint32_t *)texel) | 0xff000000;
313
314      s += dsdx;
315      t += dtdx;
316   }
317
318   samp->s += samp->dsdy;
319   samp->t += samp->dtdy;
320   return row;
321}
322
323/**
324 * Fetch and stretch one row.
325 */
326static inline const uint32_t *
327fetch_and_stretch_bgra_row(struct lp_linear_sampler *samp,
328                           int y)
329{
330   const struct lp_jit_texture *texture = samp->texture;
331   const uint32_t *data = (const uint32_t *)texture->base;
332   const int stride = texture->row_stride[0] / sizeof(uint32_t);
333   const int width = samp->width;
334
335   /*
336    * Search the stretched row cache first.
337    */
338
339   if (y == samp->stretched_row_y[0]) {
340      samp->stretched_row_index = 1;
341      return samp->stretched_row[0];
342   }
343
344   if (y == samp->stretched_row_y[1]) {
345      samp->stretched_row_index = 0;
346      return samp->stretched_row[1];
347   }
348
349   /*
350    * Replace one entry.
351    */
352
353   const uint32_t * restrict src_row = data + y * stride;
354   uint32_t * restrict dst_row = samp->stretched_row[samp->stretched_row_index];
355
356   if (fixed16_frac(samp->s) == 0 &&
357       samp->dsdx == FIXED16_ONE) { // TODO: could be relaxed
358      /*
359       * 1:1 blit on the x direction.
360       */
361      src_row += samp->s >> FIXED16_SHIFT;
362
363      if (((uintptr_t)src_row & 0xf) == 0) {
364         /* The source texture is already aligned. Return it */
365         return src_row;
366      }
367
368      /* Copy the source texture */
369      for (int i = 0; i < width; i += 4) {
370         __m128i src = _mm_loadu_si128((const __m128i *)&src_row[i]);
371         *(__m128i *)&dst_row[i] = src;
372      }
373   }
374   else {
375      util_sse2_stretch_row_8unorm((__m128i *)dst_row,
376                                   align(width, 4),
377                                   src_row, samp->s, samp->dsdx);
378   }
379
380   samp->stretched_row_y[samp->stretched_row_index] = y;
381   samp->stretched_row_index ^= 1;
382
383   return dst_row;
384}
385
386
387/* Maximise only as we fetch unscaled pixels linearly into a size-64
388 * temporary.  For minimise, we will want to either have a bigger
389 * temporary or fetch sparsely.
390 */
static const uint32_t *
fetch_bgra_axis_aligned_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const int width = samp->width;
   uint32_t * restrict row = samp->row;
   const int y = samp->t >> FIXED16_SHIFT;
   /* Top 8 bits of the 16-bit t fraction: the inter-row blend weight. */
   const int w = (samp->t >> 8) & 0xff;

   samp->t += samp->dtdy;

   const uint32_t * restrict src_row0 = fetch_and_stretch_bgra_row(samp, y);

   /* Weight zero means we sample exactly on row y: no vertical blend. */
   if (w == 0) {
      return src_row0;
   }

   const uint32_t * restrict src_row1 = fetch_and_stretch_bgra_row(samp, y + 1);

   __m128i wt = _mm_set1_epi16(w);

   /* Combine the two rows using a constant weight.
    */
   /* Aligned loads: both rows come from fetch_and_stretch_bgra_row,
    * which returns 16-byte-aligned data on every path.
    */
   for (int i = 0; i < width; i += 4) {
      __m128i srca = _mm_load_si128((const __m128i *)&src_row0[i]);
      __m128i srcb = _mm_load_si128((const __m128i *)&src_row1[i]);

      *(__m128i *)&row[i] = util_sse2_lerp_epi8_fixed88(srca, srcb, &wt, &wt);
   }

   return row;
}
423
424
425/* Non-axis-aligned version.  Don't try to take advantage of
426 * maximize.
427 */
static const uint32_t *
fetch_bgra_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   /* Row stride in texels rather than bytes. */
   const int stride     = texture->row_stride[0] / sizeof(uint32_t);
   const uint32_t *data  = (const uint32_t *)texture->base;
   const int dsdx  = samp->dsdx;
   const int dtdx  = samp->dtdx;
   const int width = samp->width;
   uint32_t *row   = samp->row;
   int s = samp->s;
   int t = samp->t;

   /* Produce four output pixels per iteration. */
   for (int i = 0; i < width; i += 4) {
      union m128i si0, si1, si2, si3, ws, wt;
      __m128i si02, si13;

      /* Gather the 2x2 texel neighbourhood and the 8-bit filter
       * weights for each of the four pixels.
       */
      for (int j = 0; j < 4; j++) {
         const uint32_t *src = data + (t >> 16) * stride + (s >> 16);

         si0.ui[j] = src[0];
         si1.ui[j] = src[1];
         si2.ui[j] = src[stride + 0];
         si3.ui[j] = src[stride + 1];

         /* Top 8 bits of each coordinate's fraction. */
         ws.ui[j] = (s>>8) & 0xff;
         wt.ui[j] = (t>>8) & 0xff;

         s += dsdx;
         t += dtdx;
      }

      /* Broadcast each 8-bit weight into all four bytes of its lane. */
      ws.m = _mm_or_si128(ws.m, _mm_slli_epi32(ws.m, 16));
      ws.m = _mm_or_si128(ws.m, _mm_slli_epi32(ws.m, 8));

      wt.m = _mm_or_si128(wt.m, _mm_slli_epi32(wt.m, 16));
      wt.m = _mm_or_si128(wt.m, _mm_slli_epi32(wt.m, 8));

      /* Lerp vertically between the two rows, then horizontally. */
      si02 = util_sse2_lerp_epi8_fixed08(si0.m, si2.m, wt.m);
      si13 = util_sse2_lerp_epi8_fixed08(si1.m, si3.m, wt.m);

      *(__m128i *)&row[i] = util_sse2_lerp_epi8_fixed08(si02, si13, ws.m);
   }

   /* Step the interpolants to the next scanline. */
   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}
477
478
479/* Clamped, non-axis-aligned version.  Don't try to take advantage of
480 * maximize.
481 */
static const uint32_t *
fetch_bgra_clamp_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *data  = (const uint32_t *)texture->base;
   const int stride     = texture->row_stride[0] / sizeof(uint32_t);
   const int tex_height = texture->height - 1;
   const int tex_width  = texture->width - 1;
   const int dsdx  = samp->dsdx;
   const int dtdx  = samp->dtdx;
   const int width = samp->width;
   uint32_t *row   = samp->row;
   int s = samp->s;
   int t = samp->t;

   /* width, height, stride (in pixels) must be smaller than 32768 */
   /* (the address computation below uses 16-bit min/max/madd ops). */
   __m128i dsdx4, dtdx4, s4, t4, stride4, w4, h4, zero, one;
   /* Coordinates for four consecutive pixels in one vector. */
   s4 = _mm_set1_epi32(s);
   t4 = _mm_set1_epi32(t);
   s4 = _mm_add_epi32(s4, _mm_set_epi32(3*dsdx, 2*dsdx, dsdx, 0));
   t4 =  _mm_add_epi32(t4, _mm_set_epi32(3*dtdx, 2*dtdx, dtdx, 0));
   dsdx4 = _mm_set1_epi32(4*dsdx);
   dtdx4 = _mm_set1_epi32(4*dtdx);
   stride4 = _mm_set1_epi32(stride);
   w4 = _mm_set1_epi32(tex_width);
   h4 = _mm_set1_epi32(tex_height);
   zero = _mm_setzero_si128();
   one = _mm_set1_epi32(1);

   for (int i = 0; i < width; i += 4) {
      union m128i addr[4];
      __m128i ws, wt, wsl, wsh, wtl, wth;
      __m128i s4s, t4s, cs0, cs1, ct0, ct1, tmp, si[4];

      /* Integer texel coordinates, clamped to the edges, for both
       * columns (cs0/cs1) and rows (ct0/ct1) of the 2x2 footprint.
       */
      s4s = _mm_srli_epi32(s4, 16);
      t4s = _mm_srli_epi32(t4, 16);
      cs0 = _mm_min_epi16(_mm_max_epi16(s4s, zero), w4);
      cs1 = _mm_add_epi16(s4s, one);
      cs1 = _mm_min_epi16(_mm_max_epi16(cs1, zero), w4);
      ct0 = _mm_min_epi16(_mm_max_epi16(t4s, zero), h4);
      ct1 = _mm_add_epi16(t4s, one);
      ct1 = _mm_min_epi16(_mm_max_epi16(ct1, zero), h4);
      /* Linear texel indices (row * stride + column) per corner. */
      tmp = _mm_madd_epi16(ct0, stride4);
      addr[0].m = _mm_add_epi32(tmp, cs0);
      addr[1].m = _mm_add_epi32(tmp, cs1);
      tmp = _mm_madd_epi16(ct1, stride4);
      addr[2].m = _mm_add_epi32(tmp, cs0);
      addr[3].m = _mm_add_epi32(tmp, cs1);

      /* Gather four texels per corner into one vector each. */
      for (int j = 0; j < 4; j++) {
         __m128i ld1, ld2, ld3;
         si[j] = _mm_cvtsi32_si128(data[addr[j].ui[0]]);
         ld1 = _mm_cvtsi32_si128(data[addr[j].ui[1]]);
         si[j] = _mm_unpacklo_epi32(si[j], ld1);
         ld2 = _mm_cvtsi32_si128(data[addr[j].ui[2]]);
         ld3 = _mm_cvtsi32_si128(data[addr[j].ui[3]]);
         ld2 = _mm_unpacklo_epi32(ld2, ld3);
         si[j] =  _mm_unpacklo_epi64(si[j], ld2);
      }

      /* 8-bit filter weights: top 8 bits of each coordinate fraction. */
      ws = _mm_srli_epi32(s4, 8);
      ws = _mm_and_si128(ws, _mm_set1_epi32(0xFF));
      wt = _mm_srli_epi32(t4, 8);
      wt = _mm_and_si128(wt, _mm_set1_epi32(0xFF));

      s4 = _mm_add_epi32(s4, dsdx4);
      t4 = _mm_add_epi32(t4, dtdx4);

#if 0
/* scalar code for reference */
      for (int j = 0; j < 4; j++) {
         int s0 = s >> FIXED16_SHIFT;
         int t0 = t >> FIXED16_SHIFT;
         int cs0 = CLAMP(s0    , 0, tex_width);
         int cs1 = CLAMP(s0 + 1, 0, tex_width);
         int ct0 = CLAMP(t0    , 0, tex_height);
         int ct1 = CLAMP(t0 + 1, 0, tex_height);

         si0.ui[j] = data[ct0 * stride + cs0];
         si1.ui[j] = data[ct0 * stride + cs1];
         si2.ui[j] = data[ct1 * stride + cs0];
         si3.ui[j] = data[ct1 * stride + cs1];

         ws.ui[j] = (s>>8) & 0xff;
         wt.ui[j] = (t>>8) & 0xff;

         s += dsdx;
         t += dtdx;
      }
#endif

      /* Expand the weights to 16-bit lane pairs for the 2D lerp. */
      ws = _mm_or_si128(ws, _mm_slli_epi32(ws, 16));
      wsl = _mm_shuffle_epi32(ws, _MM_SHUFFLE(1,1,0,0));
      wsh = _mm_shuffle_epi32(ws, _MM_SHUFFLE(3,3,2,2));

      wt = _mm_or_si128(wt, _mm_slli_epi32(wt, 16));
      wtl = _mm_shuffle_epi32(wt, _MM_SHUFFLE(1,1,0,0));
      wth = _mm_shuffle_epi32(wt, _MM_SHUFFLE(3,3,2,2));

      *(__m128i *)&row[i] = util_sse2_lerp_2d_epi8_fixed88(si[0], si[2],
                                                           &si[1], &si[3],
                                                           &wtl, &wth,
                                                           &wsl, &wsh);
   }

   /* Step the interpolants to the next scanline. */
   samp->s += samp->dsdy;
   samp->t += samp->dtdy;

   return row;
}
593
594
595static const uint32_t *
596fetch_bgrx_axis_aligned_linear(struct lp_linear_elem *elem)
597{
598   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
599   const __m128i mask = _mm_set1_epi32(0xff000000);
600   uint32_t *dst_row = samp->row;
601   const uint32_t *src_row = fetch_bgra_axis_aligned_linear(&samp->base);
602   const int width = samp->width;
603
604   for (int i = 0; i < width; i += 4) {
605      __m128i bgra = *(__m128i *)&src_row[i];
606      __m128i bgrx = _mm_or_si128(bgra, mask);
607      *(__m128i *)&dst_row[i] = bgrx;
608   }
609
610   return dst_row;
611}
612
613
614static const uint32_t *
615fetch_bgrx_clamp_linear(struct lp_linear_elem *elem)
616{
617   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
618   const __m128i mask = _mm_set1_epi32(0xff000000);
619   uint32_t *row = samp->row;
620   const int width = samp->width;
621
622   fetch_bgra_clamp_linear(&samp->base);
623
624   for (int i = 0; i < width; i += 4) {
625      __m128i bgra = *(__m128i *)&row[i];
626      __m128i bgrx = _mm_or_si128(bgra, mask);
627      *(__m128i *)&row[i] = bgrx;
628   }
629
630   return row;
631}
632
633
634static const uint32_t *
635fetch_bgrx_linear(struct lp_linear_elem *elem)
636{
637   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
638   const __m128i mask = _mm_set1_epi32(0xff000000);
639   uint32_t *row = samp->row;
640   const int width = samp->width;
641
642   fetch_bgra_linear(&samp->base);
643
644   for (int i = 0; i < width; i += 4) {
645      __m128i bgra = *(__m128i *)&row[i];
646      __m128i bgrx = _mm_or_si128(bgra, mask);
647      *(__m128i *)&row[i] = bgrx;
648   }
649
650   return row;
651}
652
653
654static boolean
655sampler_is_nearest(const struct lp_linear_sampler *samp,
656                   const struct lp_sampler_static_state *sampler_state,
657                   boolean minify)
658{
659   unsigned img_filter;
660
661   if (minify)
662      img_filter = sampler_state->sampler_state.min_img_filter;
663   else
664      img_filter = sampler_state->sampler_state.mag_img_filter;
665
666   /* Is it obviously nearest?
667    */
668   if (img_filter == PIPE_TEX_FILTER_NEAREST)
669      return TRUE;
670
671   /* Otherwise look for linear samplers which devolve to nearest.
672    */
673
674   /* Needs to be axis aligned.
675    */
676   if (!samp->axis_aligned)
677      return FALSE;
678
679   if (0) {
680      /* For maximizing shaders, revert to nearest
681       */
682      if (samp->dsdx < -FIXED16_HALF && samp->dsdx < FIXED16_HALF &&
683          samp->dtdy < -FIXED16_HALF && samp->dtdy < FIXED16_HALF)
684         return TRUE;
685
686      /* For severely minimising shaders, revert to nearest:
687       */
688      if ((samp->dsdx < 2 * FIXED16_ONE || samp->dsdx > 2 * FIXED16_ONE) &&
689          (samp->dtdy < 2 * FIXED16_ONE || samp->dtdy > 2 * FIXED16_ONE))
690         return TRUE;
691   }
692
693   /*
694    * Must be near a pixel center:
695    */
696   if (!fixed16_approx(fixed16_frac(samp->s), FIXED16_HALF, FIXED16_TOL) ||
697       !fixed16_approx(fixed16_frac(samp->t), FIXED16_HALF, FIXED16_TOL))
698      return FALSE;
699
700   /*
701    * Must make a full step between pixels:
702    */
703   if (!fixed16_approx(samp->dsdx, FIXED16_ONE, FIXED16_TOL_DERIV) ||
704       !fixed16_approx(samp->dtdy, FIXED16_ONE, FIXED16_TOL_DERIV))
705      return FALSE;
706
707   /* Treat it as nearest!
708    */
709   return TRUE;
710}
711
712/* XXX: Lots of static-state parameters being passed in here but very
713 * little info is extracted from each one.  Consolidate it all down to
714 * something succinct in the prepare phase?
715 */
716boolean
717lp_linear_init_sampler(struct lp_linear_sampler *samp,
718                       const struct lp_tgsi_texture_info *info,
719                       const struct lp_sampler_static_state *sampler_state,
720                       const struct lp_jit_texture *texture,
721                       int x0, int y0, int width, int height,
722                       const float (*a0)[4],
723                       const float (*dadx)[4],
724                       const float (*dady)[4])
725{
726   const struct lp_tgsi_channel_info *schan = &info->coord[0];
727   const struct lp_tgsi_channel_info *tchan = &info->coord[1];
728
729   assert(schan->file == TGSI_FILE_INPUT);
730   assert(tchan->file == TGSI_FILE_INPUT);
731
732   float w0   =   a0[0][3];
733
734   int foo = 1;
735   float s0   =   a0[schan->u.index+foo][schan->swizzle];
736   float dsdx = dadx[schan->u.index+foo][schan->swizzle];
737   float dsdy = dady[schan->u.index+foo][schan->swizzle];
738
739   float t0   =   a0[tchan->u.index+foo][tchan->swizzle];
740   float dtdx = dadx[tchan->u.index+foo][tchan->swizzle];
741   float dtdy = dady[tchan->u.index+foo][tchan->swizzle];
742
743   int mins, mint, maxs, maxt;
744   float oow = 1.0f / w0;
745   float width_oow = texture->width * oow;
746   float height_oow = texture->height * oow;
747   float fdsdx = dsdx * width_oow;
748   float fdsdy = dsdy * width_oow;
749   float fdtdx = dtdx * height_oow;
750   float fdtdy = dtdy * height_oow;
751   int fetch_width;
752   int fetch_height;
753   boolean minify;
754   boolean need_wrap;
755   boolean is_nearest;
756
757   samp->texture = texture;
758   samp->width = width;
759
760   samp->s = float_to_fixed16(fdsdx * x0 +
761                              fdsdy * y0 +
762                              s0 * width_oow);
763
764   samp->t = float_to_fixed16(fdtdx * x0 +
765                              fdtdy * y0 +
766                              t0 * height_oow);
767
768   samp->dsdx = float_to_fixed16(fdsdx);
769   samp->dsdy = float_to_fixed16(fdsdy);
770   samp->dtdx = float_to_fixed16(fdtdx);
771   samp->dtdy = float_to_fixed16(fdtdy);
772
773
774   samp->axis_aligned = (samp->dsdy == 0 &&
775                         samp->dtdx == 0); // TODO: could be relaxed
776
777   {
778      int dsdx = samp->dsdx >= 0 ? samp->dsdx : -samp->dsdx;
779      int dsdy = samp->dsdy >= 0 ? samp->dsdy : -samp->dsdy;
780      int dtdx = samp->dtdx >= 0 ? samp->dtdx : -samp->dtdx;
781      int dtdy = samp->dtdy >= 0 ? samp->dtdy : -samp->dtdy;
782      int rho = MAX4(dsdx, dsdy, dtdx, dtdy);
783
784      minify = (rho > FIXED16_ONE);
785   }
786
787   is_nearest = sampler_is_nearest(samp, sampler_state, minify);
788
789   if (!is_nearest) {
790      samp->s -= FIXED16_HALF;
791      samp->t -= FIXED16_HALF;
792   }
793
794   /* Check for clamping.  This rarely happens as we're rejecting interpolants
795    * which fall outside the 0..1 range.
796    */
797
798   if (is_nearest) {
799      /* Nearest fetch routines don't employ SSE and always operate one pixel
800       * at a time.
801       */
802      fetch_width = width - 1;
803   }
804   else {
805      /* Linear fetch routines employ SSE, and always fetch groups of four
806       * texels.
807       */
808      fetch_width = align(width, 4) - 1;
809   }
810   fetch_height = height - 1;
811
812   if (samp->axis_aligned) {
813      int s0 = samp->s;
814      int s1 = samp->s + fetch_width  * samp->dsdx;
815      int t0 = samp->t;
816      int t1 = samp->t + fetch_height * samp->dtdy;
817
818      mins = MIN2(s0, s1);
819      mint = MIN2(t0, t1);
820      maxs = MAX2(s0, s1);
821      maxt = MAX2(t0, t1);
822   }
823   else {
824      int s0 = samp->s;
825      int s1 = samp->s + fetch_width  * samp->dsdx;
826      int s2 = samp->s + fetch_height * samp->dsdy;
827      int s3 = samp->s + fetch_width  * samp->dsdx + fetch_height * samp->dsdy;
828      int t0 = samp->t;
829      int t1 = samp->t + fetch_width  * samp->dtdx;
830      int t2 = samp->t + fetch_height * samp->dtdy;
831      int t3 = samp->t + fetch_width  * samp->dtdx + fetch_height * samp->dtdy;
832
833      mins = MIN4(s0, s1, s2, s3);
834      mint = MIN4(t0, t1, t2, t3);
835      maxs = MAX4(s0, s1, s2, s3);
836      maxt = MAX4(t0, t1, t2, t3);
837   }
838
839   if (is_nearest) {
840      need_wrap = (mins < 0 ||
841                   mint < 0 ||
842                   maxs >= (texture->width  << FIXED16_SHIFT) ||
843                   maxt >= (texture->height << FIXED16_SHIFT));
844   } else {
845      need_wrap = (mins < 0 ||
846                   mint < 0 ||
847                   maxs + FIXED16_ONE >= (texture->width  << FIXED16_SHIFT) ||
848                   maxt + FIXED16_ONE >= (texture->height << FIXED16_SHIFT));
849   }
850
851   if (0 && need_wrap) {
852      debug_printf("%u x %u %s\n",
853                   texture->width, texture->height,
854                   is_nearest ? "nearest" : "linear");
855      debug_printf("mins = %f\n", mins*1.0f/FIXED16_ONE);
856      debug_printf("mint = %f\n", mint*1.0f/FIXED16_ONE);
857      debug_printf("maxs = %f\n", maxs*1.0f/FIXED16_ONE);
858      debug_printf("maxt = %f\n", maxt*1.0f/FIXED16_ONE);
859      debug_printf("\n");
860   }
861
862   /* We accept any mode below, but we only implement clamping.
863    */
864   if (need_wrap &&
865       (sampler_state->sampler_state.wrap_s != PIPE_TEX_WRAP_CLAMP_TO_EDGE ||
866        sampler_state->sampler_state.wrap_t != PIPE_TEX_WRAP_CLAMP_TO_EDGE)) {
867       return FALSE;
868   }
869
870   if (is_nearest) {
871      switch (sampler_state->texture_state.format) {
872      case PIPE_FORMAT_B8G8R8A8_UNORM:
873         if (need_wrap)
874            samp->base.fetch = fetch_bgra_clamp;
875         else if (!samp->axis_aligned)
876            samp->base.fetch = fetch_bgra;
877         else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
878            samp->base.fetch = fetch_bgra_axis_aligned;
879         else
880            samp->base.fetch = fetch_bgra_memcpy;
881         return TRUE;
882      case PIPE_FORMAT_B8G8R8X8_UNORM:
883         if (need_wrap)
884            samp->base.fetch = fetch_bgrx_clamp;
885         else if (!samp->axis_aligned)
886            samp->base.fetch = fetch_bgrx;
887         else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
888            samp->base.fetch = fetch_bgrx_axis_aligned;
889         else
890            samp->base.fetch = fetch_bgrx_memcpy;
891         return TRUE;
892      default:
893         break;
894      }
895
896      FAIL("unknown format for nearest");
897   }
898   else {
899      samp->stretched_row_y[0] = -1;
900      samp->stretched_row_y[1] = -1;
901      samp->stretched_row_index = 0;
902
903      switch (sampler_state->texture_state.format) {
904      case PIPE_FORMAT_B8G8R8A8_UNORM:
905         if (need_wrap)
906            samp->base.fetch = fetch_bgra_clamp_linear;
907         else if (!samp->axis_aligned)
908            samp->base.fetch = fetch_bgra_linear;
909         else
910            samp->base.fetch = fetch_bgra_axis_aligned_linear;
911         return TRUE;
912      case PIPE_FORMAT_B8G8R8X8_UNORM:
913         if (need_wrap)
914            samp->base.fetch = fetch_bgrx_clamp_linear;
915         else if (!samp->axis_aligned)
916            samp->base.fetch = fetch_bgrx_linear;
917         else
918            samp->base.fetch = fetch_bgrx_axis_aligned_linear;
919         return TRUE;
920      default:
921         break;
922      }
923
924      FAIL("unknown format");
925   }
926}
927
928
929static const uint32_t *
930fetch_noop(struct lp_linear_elem *elem)
931{
932   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
933   return samp->row;
934}
935
936
/*
 * Install the no-op fetch routine (fetch_noop returns the sampler's
 * row buffer without sampling any texture).
 */
void
lp_linear_init_noop_sampler(struct lp_linear_sampler *samp)
{
   samp->base.fetch = fetch_noop;
}
942
943
944/*
945 * Check the given sampler and texture info for linear path compatibility.
946 */
947boolean
948lp_linear_check_sampler(const struct lp_sampler_static_state *sampler,
949                        const struct lp_tgsi_texture_info *tex)
950{
951   if (tex->modifier != LP_BLD_TEX_MODIFIER_NONE)
952      return FALSE;
953
954   if (tex->target != TGSI_TEXTURE_2D)
955      return FALSE;
956
957   if (tex->coord[0].file != TGSI_FILE_INPUT ||
958       tex->coord[1].file != TGSI_FILE_INPUT)
959      return FALSE;
960
961   /* These are the only sampling modes we support at the moment.
962    *
963    * Actually we'll accept any mode as we're failing on any
964    * interpolant which exceeds 0..1.  Clamping is applied only to
965    * avoid invalid reads.
966    */
967   if (!is_nearest_sampler(sampler) &&
968       !is_linear_sampler(sampler))
969      return FALSE;
970
971   /* These are the only texture formats we support at the moment
972    */
973   if (sampler->texture_state.format != PIPE_FORMAT_B8G8R8A8_UNORM &&
974       sampler->texture_state.format != PIPE_FORMAT_B8G8R8X8_UNORM)
975      return FALSE;
976
977   /* We don't support sampler view swizzling on the linear path */
978   if (sampler->texture_state.swizzle_r != PIPE_SWIZZLE_X ||
979       sampler->texture_state.swizzle_g != PIPE_SWIZZLE_Y ||
980       sampler->texture_state.swizzle_b != PIPE_SWIZZLE_Z ||
981       sampler->texture_state.swizzle_a != PIPE_SWIZZLE_W) {
982      return FALSE;
983   }
984
985   return TRUE;
986}
987
988#else
/* Stub for builds without PIPE_ARCH_SSE: the linear path is compiled
 * out, so no sampler ever qualifies.
 */
boolean
lp_linear_check_sampler(const struct lp_sampler_static_state *sampler,
                        const struct lp_tgsi_texture_info *tex)
{
   return FALSE;
}
995#endif
996