1 /**************************************************************************
2  *
3  * Copyright 2010-2021 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20  * USE OR OTHER DEALINGS IN THE SOFTWARE.
21  *
22  * The above copyright notice and this permission notice (including the
23  * next paragraph) shall be included in all copies or substantial portions
24  * of the Software.
25  *
26  **************************************************************************/
27 
28 
29 #include "pipe/p_config.h"
30 
31 #include "util/u_math.h"
32 #include "util/u_cpu_detect.h"
33 #include "util/u_pack_color.h"
34 #include "util/u_surface.h"
35 #include "util/u_sse.h"
36 
37 #include "lp_jit.h"
38 #include "lp_rast.h"
39 #include "lp_debug.h"
40 #include "lp_state_fs.h"
41 #include "lp_linear_priv.h"
42 
43 
44 #if defined(PIPE_ARCH_SSE)
45 
46 #include <emmintrin.h>
47 
48 
/* State for row-at-a-time nearest-filtered texture fetching.  One of
 * the fetch_row* variants below is chosen at init time and stored in
 * the fetch hook; each call fetches one row and bumps y.
 */
struct nearest_sampler {
   alignas(16) uint32_t out[64];   /* one row of fetched texels, SSE aligned */

   const struct lp_jit_texture *texture;
   float fsrc_x;                /* src_x0: starting s coordinate, in texels */
   float fsrc_y;                /* src_y0: starting t coordinate, in texels */
   float fdsdx;              /* ds/dx, in texels */
   float fdsdy;              /* ds/dy, in texels */
   float fdtdx;              /* dt/dx, in texels */
   float fdtdy;              /* dt/dy, in texels */
   int width;                /* pixels per row */
   int y;                    /* current row; incremented by each fetch */

   const uint32_t *(*fetch)(struct nearest_sampler *samp);
};
64 
65 
/* State for 4-wide fixed-point interpolation of a packed color.
 * NOTE(review): not referenced in this part of the file — presumably
 * used by other linear-path shaders; verify against the rest of the
 * file.
 */
struct linear_interp {
   alignas(16) uint32_t out[64];   /* one row of interpolated values */
   __m128i a0;                     /* presumably the start value — confirm */
   __m128i dadx;                   /* presumably step per pixel in x — confirm */
   __m128i dady;                   /* presumably step per row in y — confirm */
   int width;                   /* rounded up to multiple of 4 */
   boolean is_constant;
};
74 
/* Organize all the information needed for blending in one place.
 * Could have blend function pointer here, but we currently always
 * know which one we want to call.
 */
struct color_blend {
   const uint32_t *src;         /* incoming row of shaded pixels */
   uint8_t *color;              /* destination row; advanced by stride on each blend */
   int stride;
   int width;                   /* the exact width */
};
85 
86 
/* Organize all the information needed for running each of the shaders
 * in one place.
 */
struct shader {
   alignas(16) uint32_t out0[64];  /* shader output row, SSE aligned */
   const uint32_t *src0;           /* input row (e.g. fetched texels) */
   const uint32_t *src1;           /* second input row (unused in this part of the file) */
   __m128i const0;
   int width;                   /* rounded up to multiple of 4 */
};
97 
98 
99 /* For a row of pixels, perform add/one/inv_src_alpha (ie
100  * premultiplied alpha) blending between the incoming pixels and the
101  * destination buffer.
102  *
103  * Used to implement the BLIT_RGBA + blend shader, there are no
104  * operations from the pixel shader left to implement at this level -
105  * effectively the pixel shader was just a texture fetch which has
106  * already been performed.  This routine then purely implements
107  * blending.
108  */
109 static void
blend_premul(struct color_blend *blend)110 blend_premul(struct color_blend *blend)
111 {
112    const uint32_t *src = blend->src;  /* aligned */
113    uint32_t *dst = (uint32_t *)blend->color;      /* unaligned */
114    const int width = blend->width;
115    int i;
116    union { __m128i m128; uint ui[4]; } dstreg;
117 
118    blend->color += blend->stride;
119 
120    for (i = 0; i + 3 < width; i += 4) {
121       __m128i tmp;
122       tmp = _mm_loadu_si128((const __m128i *)&dst[i]);  /* UNALIGNED READ */
123       dstreg.m128 = util_sse2_blend_premul_4(*(const __m128i *)&src[i],
124                                              tmp);
125       _mm_storeu_si128((__m128i *)&dst[i], dstreg.m128); /* UNALIGNED WRITE */
126    }
127 
128    if (i < width) {
129       int j;
130       for (j = 0; j < width - i ; j++) {
131          dstreg.ui[j] = dst[i+j];
132       }
133       dstreg.m128 = util_sse2_blend_premul_4(*(const __m128i *)&src[i],
134                                              dstreg.m128);
135       for (; i < width; i++)
136          dst[i] = dstreg.ui[i&3];
137    }
138 }
139 
140 
141 static void
blend_noop(struct color_blend *blend)142 blend_noop(struct color_blend *blend)
143 {
144    memcpy(blend->color, blend->src, blend->width * sizeof(unsigned));
145    blend->color += blend->stride;
146 }
147 
148 
149 static void
init_blend(struct color_blend *blend, int x, int y, int width, int height, uint8_t *color, int stride)150 init_blend(struct color_blend *blend,
151            int x, int y, int width, int height,
152            uint8_t *color,
153            int stride)
154 {
155    blend->color = color + x * 4 + y * stride;
156    blend->stride = stride;
157    blend->width = width;
158 }
159 
160 
161 /*
162  * Perform nearest filtered lookup of a row of texels.  Texture lookup
163  * is assumed to be axis aligned but with arbitrary scaling.
164  *
165  * Texture coordinate interpolation is performed in 24.8 fixed point.
166  * Note that the longest span we will encounter is 64 pixels long,
167  * meaning that 8 fractional bits is more than sufficient to represent
168  * the shallowest gradient possible within this span.
169  *
170  * After 64 pixels (ie. in the next tile), the starting point will be
171  * recalculated with floating point arithmetic.
172  *
173  * XXX: migrate this to use Jose's quad blitter texture fetch routines.
174  */
175 static const uint32_t *
fetch_row(struct nearest_sampler *samp)176 fetch_row(struct nearest_sampler *samp)
177 {
178    const int y = samp->y++;
179    uint32_t *row = samp->out;
180    const struct lp_jit_texture *texture = samp->texture;
181    const int yy = util_iround(samp->fsrc_y + samp->fdtdy * y);
182    const uint32_t *src_row =
183       (const uint32_t *)((const uint8_t *)texture->base +
184                          yy * texture->row_stride[0]);
185    const int iscale_x = samp->fdsdx * 256;
186    const int width = samp->width;
187    int acc = samp->fsrc_x * 256 + 128;
188 
189    for (int i = 0; i < width; i++) {
190       row[i] = src_row[acc>>8];
191       acc += iscale_x;
192    }
193 
194    return row;
195 }
196 
197 
198 /* Version of fetch_row which can cope with texture edges.  In
199  * practise, aero never triggers this.
200  */
201 static const uint32_t *
fetch_row_clamped(struct nearest_sampler *samp)202 fetch_row_clamped(struct nearest_sampler *samp)
203 {
204    const int y = samp->y++;
205    uint32_t *row = samp->out;
206    const struct lp_jit_texture *texture = samp->texture;
207    const int yy = util_iround(samp->fsrc_y + samp->fdtdy * y);
208    const uint32_t *src_row =
209       (const uint32_t *)((const uint8_t *)texture->base +
210                          CLAMP(yy, 0, texture->height-1) *
211                          texture->row_stride[0]);
212    const float src_x0 = samp->fsrc_x;
213    const float scale_x = samp->fdsdx;
214    const int width = samp->width;
215 
216    for (int i = 0; i < width; i++) {
217       row[i] = src_row[CLAMP(util_iround(src_x0 + i * scale_x),
218                              0, texture->width - 1)];
219    }
220 
221    return row;
222 }
223 
224 /* It vary rarely happens that some non-axis-aligned texturing creeps
225  * into the linear path.  Handle it here.  The alternative would be
226  * more pre-checking or an option to fallback by returning false from
227  * jit_linear.
228  */
229 static const uint32_t *
fetch_row_xy_clamped(struct nearest_sampler *samp)230 fetch_row_xy_clamped(struct nearest_sampler *samp)
231 {
232    const int y = samp->y++;
233    uint32_t *row = samp->out;
234    const struct lp_jit_texture *texture = samp->texture;
235    const float yrow = samp->fsrc_y + samp->fdtdy * y;
236    const float xrow = samp->fsrc_x + samp->fdsdy * y;
237    const int width  = samp->width;
238 
239    for (int i = 0; i < width; i++) {
240       int yy = util_iround(yrow + samp->fdtdx * i);
241       int xx = util_iround(xrow + samp->fdsdx * i);
242 
243       const uint32_t *src_row =
244          (const uint32_t *)((const uint8_t *) texture->base +
245                             CLAMP(yy, 0, texture->height-1) *
246                             texture->row_stride[0]);
247 
248       row[i] = src_row[CLAMP(xx, 0, texture->width - 1)];
249    }
250 
251    return row;
252 }
253 
254 
/* Set up the nearest sampler for a rectangle starting at (x0, y0) of
 * size width x height, with attribute start values and derivatives in
 * normalized texture coordinates.  Chooses the cheapest fetch_row*
 * variant that is safe for the span.  Returns FALSE (caller must fall
 * back) for projective texturing (w varies).
 */
static boolean
init_nearest_sampler(struct nearest_sampler *samp,
                     const struct lp_jit_texture *texture,
                     int x0, int y0,
                     int width, int height,
                     float s0, float dsdx, float dsdy,
                     float t0, float dtdx, float dtdy,
                     float w0, float dwdx, float dwdy)
{
   const float oow = 1.0f / w0;

   /* Only non-projective texturing is handled here. */
   if (dwdx != 0.0 || dwdy != 0.0)
      return FALSE;

   samp->texture = texture;
   samp->width = width;
   /* Scale the normalized-coordinate derivatives to texel units,
    * folding in the constant 1/w.
    */
   samp->fdsdx = dsdx * texture->width * oow;
   samp->fdsdy = dsdy * texture->width * oow;
   samp->fdtdx = dtdx * texture->height * oow;
   samp->fdtdy = dtdy * texture->height * oow;
   /* Texel-space start coordinates at (x0, y0); the -0.5 shifts from
    * pixel centres to texel indices for nearest filtering.
    */
   samp->fsrc_x = (samp->fdsdx * x0 +
                   samp->fdsdy * y0 +
                   s0 * texture->width * oow - 0.5f);

   samp->fsrc_y = (samp->fdtdx * x0 +
                   samp->fdtdy * y0 +
                   t0 * texture->height * oow - 0.5f);
   samp->y = 0;

   /* Because we want to permit consumers of this data to round up to
    * the next multiple of 4, and because we don't want valgrind to
    * complain about uninitialized reads, set the last bit of the
    * buffer to zero:
    */
   for (int i = width; i & 3; i++)
      samp->out[i] = 0;

   if (dsdy != 0 || dtdx != 0) {
      /* Arbitrary texture lookup:
       */
      samp->fetch = fetch_row_xy_clamped;
   } else {
      /* Axis aligned stretch blit, abitrary scaling factors including
       * flipped, minifying and magnifying:
       */
      int isrc_x = util_iround(samp->fsrc_x);
      int isrc_y = util_iround(samp->fsrc_y);
      int isrc_x1 = util_iround(samp->fsrc_x + width * samp->fdsdx);
      int isrc_y1 = util_iround(samp->fsrc_y + height * samp->fdtdy);

      /* Look at the maximum and minimum texture coordinates we will be
       * fetching and figure out if we need to use clamping.  There is
       * similar code in u_blit_sw.c which takes a better approach to
       * this which could be substituted later.
       */
      if (isrc_x  <= texture->width  && isrc_x  >= 0 &&
          isrc_y  <= texture->height && isrc_y  >= 0 &&
          isrc_x1 <= texture->width  && isrc_x1 >= 0 &&
          isrc_y1 <= texture->height && isrc_y1 >= 0) {
         samp->fetch = fetch_row;
      } else {
         samp->fetch = fetch_row_clamped;
      }
   }

   return TRUE;
}
322 
323 
324 static const uint32_t *
shade_rgb1(struct shader *shader)325 shade_rgb1(struct shader *shader)
326 {
327    const __m128i rgb1 = _mm_set1_epi32(0xff000000);
328    const uint32_t *src0 = shader->src0;
329    uint32_t *dst = shader->out0;
330    int width = shader->width;
331    int i;
332 
333    for (i = 0; i + 3 < width; i += 4) {
334       __m128i s = *(const __m128i *)&src0[i];
335       *(__m128i *)&dst[i] = _mm_or_si128(s, rgb1);
336    }
337 
338    return shader->out0;
339 }
340 
341 
/* Initialize the shader scratch state for a span.  Only the width is
 * used; it is rounded up so the 4-wide SSE shader loops can process
 * whole groups.  x/y/height are accepted for symmetry with the other
 * init_* helpers but currently unused.
 */
static void
init_shader(struct shader *shader,
           int x, int y, int width, int height)
{
   shader->width = align(width, 4);
}
348 
349 
350 /* Linear shader which implements the BLIT_RGBA shader with the
351  * additional constraints imposed by lp_setup_is_blit().
352  */
353 static boolean
blit_rgba_blit(const struct lp_rast_state *state, unsigned x, unsigned y, unsigned width, unsigned height, const float (*a0)[4], const float (*dadx)[4], const float (*dady)[4], uint8_t *color, unsigned stride)354 blit_rgba_blit(const struct lp_rast_state *state,
355                unsigned x, unsigned y,
356                unsigned width, unsigned height,
357                const float (*a0)[4],
358                const float (*dadx)[4],
359                const float (*dady)[4],
360                uint8_t *color,
361                unsigned stride)
362 {
363    const struct lp_jit_context *context = &state->jit_context;
364    const struct lp_jit_texture *texture = &context->textures[0];
365    const uint8_t *src;
366    unsigned src_stride;
367    int src_x, src_y;
368 
369    LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);
370 
371    /* Require w==1.0:
372     */
373    if (a0[0][3] != 1.0 ||
374        dadx[0][3] != 0.0 ||
375        dady[0][3] != 0.0)
376       return FALSE;
377 
378    src_x = x + util_iround(a0[1][0]*texture->width - 0.5f);
379    src_y = y + util_iround(a0[1][1]*texture->height - 0.5f);
380 
381    src = texture->base;
382    src_stride = texture->row_stride[0];
383 
384    /* Fall back to blit_rgba() if clamping required:
385     */
386    if (src_x < 0 ||
387        src_y < 0 ||
388        src_x + width > texture->width ||
389        src_y + height > texture->height)
390       return FALSE;
391 
392    util_copy_rect(color, PIPE_FORMAT_B8G8R8A8_UNORM, stride,
393                   x, y,
394                   width, height,
395                   src, src_stride,
396                   src_x, src_y);
397 
398    return TRUE;
399 }
400 
401 
402 /* Linear shader which implements the BLIT_RGB1 shader, with the
403  * additional constraints imposed by lp_setup_is_blit().
404  */
405 static boolean
blit_rgb1_blit(const struct lp_rast_state *state, unsigned x, unsigned y, unsigned width, unsigned height, const float (*a0)[4], const float (*dadx)[4], const float (*dady)[4], uint8_t *color, unsigned stride)406 blit_rgb1_blit(const struct lp_rast_state *state,
407                unsigned x, unsigned y,
408                unsigned width, unsigned height,
409                const float (*a0)[4],
410                const float (*dadx)[4],
411                const float (*dady)[4],
412                uint8_t *color,
413                unsigned stride)
414 {
415    const struct lp_jit_context *context = &state->jit_context;
416    const struct lp_jit_texture *texture = &context->textures[0];
417    const uint8_t *src;
418    unsigned src_stride;
419    int src_x, src_y;
420 
421    LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);
422 
423    /* Require w==1.0:
424     */
425    if (a0[0][3] != 1.0 ||
426        dadx[0][3] != 0.0 ||
427        dady[0][3] != 0.0)
428       return FALSE;
429 
430    color += x * 4 + y * stride;
431 
432    src_x = x + util_iround(a0[1][0]*texture->width - 0.5f);
433    src_y = y + util_iround(a0[1][1]*texture->height - 0.5f);
434 
435    src = texture->base;
436    src_stride = texture->row_stride[0];
437    src += src_x * 4;
438    src += src_y * src_stride;
439 
440    if (src_x < 0 ||
441        src_y < 0 ||
442        src_x + width > texture->width ||
443        src_y + height > texture->height)
444       return FALSE;
445 
446    for (y = 0; y < height; y++) {
447       const uint32_t *src_row = (const uint32_t *)src;
448       uint32_t *dst_row = (uint32_t *)color;
449 
450       for (x = 0; x < width; x++) {
451          *dst_row++ = *src_row++ | 0xff000000;
452       }
453 
454       color += stride;
455       src += src_stride;
456    }
457 
458    return TRUE;
459 }
460 
461 
462 /* Linear shader variant implementing the BLIT_RGBA shader without
463  * blending.
464  */
465 static boolean
blit_rgba(const struct lp_rast_state *state, unsigned x, unsigned y, unsigned width, unsigned height, const float (*a0)[4], const float (*dadx)[4], const float (*dady)[4], uint8_t *color, unsigned stride)466 blit_rgba(const struct lp_rast_state *state,
467           unsigned x, unsigned y,
468           unsigned width, unsigned height,
469           const float (*a0)[4],
470           const float (*dadx)[4],
471           const float (*dady)[4],
472           uint8_t *color,
473           unsigned stride)
474 {
475    const struct lp_jit_context *context = &state->jit_context;
476    struct nearest_sampler samp;
477    struct color_blend blend;
478 
479    LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);
480 
481    if (!init_nearest_sampler(&samp,
482                              &context->textures[0],
483                              x, y, width, height,
484                              a0[1][0], dadx[1][0], dady[1][0],
485                              a0[1][1], dadx[1][1], dady[1][1],
486                              a0[0][3], dadx[0][3], dady[0][3]))
487       return FALSE;
488 
489    init_blend(&blend,
490               x, y, width, height,
491               color, stride);
492 
493    /* Rasterize the rectangle and run the shader:
494     */
495    for (y = 0; y < height; y++) {
496       blend.src = samp.fetch(&samp);
497       blend_noop(&blend);
498    }
499 
500    return TRUE;
501 }
502 
503 
504 static boolean
blit_rgb1(const struct lp_rast_state *state, unsigned x, unsigned y, unsigned width, unsigned height, const float (*a0)[4], const float (*dadx)[4], const float (*dady)[4], uint8_t *color, unsigned stride)505 blit_rgb1(const struct lp_rast_state *state,
506           unsigned x, unsigned y,
507           unsigned width, unsigned height,
508           const float (*a0)[4],
509           const float (*dadx)[4],
510           const float (*dady)[4],
511           uint8_t *color,
512           unsigned stride)
513 {
514    const struct lp_jit_context *context = &state->jit_context;
515    struct nearest_sampler samp;
516    struct color_blend blend;
517    struct shader shader;
518 
519    LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);
520 
521    if (!init_nearest_sampler(&samp,
522                              &context->textures[0],
523                              x, y, width, height,
524                              a0[1][0], dadx[1][0], dady[1][0],
525                              a0[1][1], dadx[1][1], dady[1][1],
526                              a0[0][3], dadx[0][3], dady[0][3]))
527       return FALSE;
528 
529    init_blend(&blend, x, y, width, height, color, stride);
530 
531    init_shader(&shader, x, y, width, height);
532 
533    /* Rasterize the rectangle and run the shader:
534     */
535    for (y = 0; y < height; y++) {
536       shader.src0 = samp.fetch(&samp);
537       blend.src = shade_rgb1(&shader);
538       blend_noop(&blend);
539    }
540 
541    return TRUE;
542 }
543 
544 
545 /* Linear shader variant implementing the BLIT_RGBA shader with
546  * one/inv_src_alpha blending.
547  */
548 static boolean
blit_rgba_blend_premul(const struct lp_rast_state *state, unsigned x, unsigned y, unsigned width, unsigned height, const float (*a0)[4], const float (*dadx)[4], const float (*dady)[4], uint8_t *color, unsigned stride)549 blit_rgba_blend_premul(const struct lp_rast_state *state,
550                        unsigned x, unsigned y,
551                        unsigned width, unsigned height,
552                        const float (*a0)[4],
553                        const float (*dadx)[4],
554                        const float (*dady)[4],
555                        uint8_t *color,
556                        unsigned stride)
557 {
558    const struct lp_jit_context *context = &state->jit_context;
559    struct nearest_sampler samp;
560    struct color_blend blend;
561 
562    LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);
563 
564    if (!init_nearest_sampler(&samp,
565                              &context->textures[0],
566                              x, y, width, height,
567                              a0[1][0], dadx[1][0], dady[1][0],
568                              a0[1][1], dadx[1][1], dady[1][1],
569                              a0[0][3], dadx[0][3], dady[0][3]))
570       return FALSE;
571 
572    init_blend(&blend, x, y, width, height, color, stride);
573 
574    /* Rasterize the rectangle and run the shader:
575     */
576    for (y = 0; y < height; y++) {
577       blend.src = samp.fetch(&samp);
578       blend_premul(&blend);
579    }
580 
581    return TRUE;
582 }
583 
584 
585 /* Linear shader which always emits red.  Used for debugging.
586  */
587 static boolean
linear_red(const struct lp_rast_state *state, unsigned x, unsigned y, unsigned width, unsigned height, const float (*a0)[4], const float (*dadx)[4], const float (*dady)[4], uint8_t *color, unsigned stride)588 linear_red(const struct lp_rast_state *state,
589            unsigned x, unsigned y,
590            unsigned width, unsigned height,
591            const float (*a0)[4],
592            const float (*dadx)[4],
593            const float (*dady)[4],
594            uint8_t *color,
595            unsigned stride)
596 {
597    union util_color uc;
598 
599    util_pack_color_ub(0xff, 0, 0, 0xff,
600                       PIPE_FORMAT_B8G8R8A8_UNORM, &uc);
601 
602    util_fill_rect(color,
603                   PIPE_FORMAT_B8G8R8A8_UNORM,
604                   stride,
605                   x,
606                   y,
607                   width,
608                   height,
609                   &uc);
610 
611    return TRUE;
612 }
613 
614 
/* Noop linear shader variant, for debugging.
 */
static boolean
linear_no_op(const struct lp_rast_state *state,
             unsigned x, unsigned y,
             unsigned width, unsigned height,
             const float (*a0)[4],
             const float (*dadx)[4],
             const float (*dady)[4],
             uint8_t *color,
             unsigned stride)
{
   /* Claim success without touching the color buffer. */
   return TRUE;
}
629 
630 
631 /* Check for ADD/ONE/INV_SRC_ALPHA, ie premultiplied-alpha blending.
632  */
633 static boolean
is_one_inv_src_alpha_blend(const struct lp_fragment_shader_variant *variant)634 is_one_inv_src_alpha_blend(const struct lp_fragment_shader_variant *variant)
635 {
636    return
637       !variant->key.blend.logicop_enable &&
638       variant->key.blend.rt[0].blend_enable &&
639       variant->key.blend.rt[0].rgb_func == PIPE_BLEND_ADD &&
640       variant->key.blend.rt[0].rgb_src_factor == PIPE_BLENDFACTOR_ONE &&
641       variant->key.blend.rt[0].rgb_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA &&
642       variant->key.blend.rt[0].alpha_func == PIPE_BLEND_ADD &&
643       variant->key.blend.rt[0].alpha_src_factor == PIPE_BLENDFACTOR_ONE &&
644       variant->key.blend.rt[0].alpha_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA &&
645       variant->key.blend.rt[0].colormask == 0xf;
646 }
647 
648 
649 /* Examine the fragment shader variant and determine whether we can
650  * substitute a fastpath linear shader implementation.
651  */
652 void
llvmpipe_fs_variant_linear_fastpath(struct lp_fragment_shader_variant *variant)653 llvmpipe_fs_variant_linear_fastpath(struct lp_fragment_shader_variant *variant)
654 {
655    if (LP_PERF & PERF_NO_SHADE) {
656       variant->jit_linear = linear_red;
657       return;
658    }
659 
660    struct lp_sampler_static_state *samp0 =
661       lp_fs_variant_key_sampler_idx(&variant->key, 0);
662    if (!samp0)
663       return;
664 
665    enum pipe_format tex_format = samp0->texture_state.format;
666    if (variant->shader->kind == LP_FS_KIND_BLIT_RGBA &&
667        tex_format == PIPE_FORMAT_B8G8R8A8_UNORM &&
668        is_nearest_clamp_sampler(samp0)) {
669       if (variant->opaque) {
670          variant->jit_linear_blit = blit_rgba_blit;
671          variant->jit_linear = blit_rgba;
672       } else if (is_one_inv_src_alpha_blend(variant) &&
673                  util_get_cpu_caps()->has_sse2) {
674          variant->jit_linear = blit_rgba_blend_premul;
675       }
676       return;
677    }
678 
679    if (variant->shader->kind == LP_FS_KIND_BLIT_RGB1 &&
680        variant->opaque &&
681        (tex_format == PIPE_FORMAT_B8G8R8A8_UNORM ||
682         tex_format == PIPE_FORMAT_B8G8R8X8_UNORM) &&
683        is_nearest_clamp_sampler(samp0)) {
684       variant->jit_linear_blit = blit_rgb1_blit;
685       variant->jit_linear = blit_rgb1;
686       return;
687    }
688 
689    if (0) {
690       variant->jit_linear = linear_no_op;
691       return;
692    }
693 }
694 #else
void
llvmpipe_fs_variant_linear_fastpath(struct lp_fragment_shader_variant *variant)
{
   /* don't bother if there is no SSE: leave the jit_linear hooks
    * unset so the generic shader path is used instead.
    */
}
700 #endif
701 
702