1bf215546Sopenharmony_ci/**************************************************************************
2bf215546Sopenharmony_ci *
3bf215546Sopenharmony_ci * Copyright 2007-2009 VMware, Inc.
4bf215546Sopenharmony_ci * All Rights Reserved.
5bf215546Sopenharmony_ci *
6bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
7bf215546Sopenharmony_ci * copy of this software and associated documentation files (the
8bf215546Sopenharmony_ci * "Software"), to deal in the Software without restriction, including
9bf215546Sopenharmony_ci * without limitation the rights to use, copy, modify, merge, publish,
10bf215546Sopenharmony_ci * distribute, sub license, and/or sell copies of the Software, and to
11bf215546Sopenharmony_ci * permit persons to whom the Software is furnished to do so, subject to
12bf215546Sopenharmony_ci * the following conditions:
13bf215546Sopenharmony_ci *
14bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the
15bf215546Sopenharmony_ci * next paragraph) shall be included in all copies or substantial portions
16bf215546Sopenharmony_ci * of the Software.
17bf215546Sopenharmony_ci *
18bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19bf215546Sopenharmony_ci * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20bf215546Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21bf215546Sopenharmony_ci * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22bf215546Sopenharmony_ci * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23bf215546Sopenharmony_ci * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24bf215546Sopenharmony_ci * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25bf215546Sopenharmony_ci *
26bf215546Sopenharmony_ci **************************************************************************/
27bf215546Sopenharmony_ci
28bf215546Sopenharmony_ci/*
29bf215546Sopenharmony_ci * Rasterization for binned triangles within a tile
30bf215546Sopenharmony_ci */
31bf215546Sopenharmony_ci
32bf215546Sopenharmony_ci#include <limits.h>
33bf215546Sopenharmony_ci#include "util/u_math.h"
34bf215546Sopenharmony_ci#include "lp_debug.h"
35bf215546Sopenharmony_ci#include "lp_perf.h"
36bf215546Sopenharmony_ci#include "lp_rast_priv.h"
37bf215546Sopenharmony_ci
38bf215546Sopenharmony_ci/**
39bf215546Sopenharmony_ci * Shade all pixels in a 4x4 block.
40bf215546Sopenharmony_ci */
41bf215546Sopenharmony_cistatic void
42bf215546Sopenharmony_ciblock_full_4(struct lp_rasterizer_task *task,
43bf215546Sopenharmony_ci             const struct lp_rast_triangle *tri,
44bf215546Sopenharmony_ci             int x, int y)
45bf215546Sopenharmony_ci{
46bf215546Sopenharmony_ci   lp_rast_shade_quads_all(task, &tri->inputs, x, y);
47bf215546Sopenharmony_ci}
48bf215546Sopenharmony_ci
49bf215546Sopenharmony_ci
/**
 * Shade every pixel of one 16x16 block.
 *
 * The block is split into sixteen 4x4 sub-blocks which are shaded one
 * after another, left to right within a row and rows from top to bottom.
 * Both x and y must be multiples of 16 (asserted).
 */
static void
block_full_16(struct lp_rasterizer_task *task,
              const struct lp_rast_triangle *tri,
              int x, int y)
{
   assert(x % 16 == 0);
   assert(y % 16 == 0);

   /* n enumerates the sub-blocks row-major: low two bits select the
    * column, the next two bits select the row.
    */
   for (unsigned n = 0; n < 16; n++) {
      const unsigned ix = 4 * (n & 3);
      const unsigned iy = 4 * (n >> 2);

      block_full_4(task, tri, x + ix, y + iy);
   }
}
64bf215546Sopenharmony_ci
/**
 * Build a 16-bit sign mask for a linear function sampled on a 4x4 grid.
 *
 * The sample at column ix, row iy has the value c + ix * dcdx + iy * dcdy.
 * Bit (iy * 4 + ix) of the result is set when that value is negative.
 *
 * \param c     value at column 0, row 0
 * \param dcdx  increment from one column to the next
 * \param dcdy  increment from one row to the next
 * \return the 16 sign bits, row-major, bit 0 being column 0 of row 0
 */
static inline unsigned
build_mask_linear(int32_t c, int32_t dcdx, int32_t dcdy)
{
   /* Value at the start of each row, accumulated one dcdy at a time. */
   const int32_t row_start[4] = {
      c,
      c + dcdy,
      c + dcdy + dcdy,
      c + dcdy + dcdy + dcdy,
   };
   unsigned mask = 0;

   for (unsigned iy = 0; iy < 4; iy++) {
      for (unsigned ix = 0; ix < 4; ix++) {
         const int32_t value = row_start[iy] + (int32_t)ix * dcdx;

         /* Take the sign bit through an unsigned shift: right-shifting a
          * negative signed value is implementation-defined in C, so do not
          * rely on it producing an all-ones pattern.
          */
         mask |= (unsigned)((uint32_t)value >> 31) << (iy * 4 + ix);
      }
   }

   return mask;
}
94bf215546Sopenharmony_ci
95bf215546Sopenharmony_ci
/**
 * Accumulate two 4x4 sign masks for the same linear function.
 *
 * The mask for the function starting at c is OR'ed into *outmask, and the
 * mask for the function starting at c + cdiff is OR'ed into *partmask.
 * Both use the same dcdx / dcdy steps; see build_mask_linear() for the
 * bit layout.  Existing bits in either destination are preserved.
 */
static inline void
build_masks(int32_t c,
            int32_t cdiff,
            int32_t dcdx,
            int32_t dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   const unsigned out_bits = build_mask_linear(c, dcdx, dcdy);
   *outmask |= out_bits;

   const unsigned part_bits = build_mask_linear(c + cdiff, dcdx, dcdy);
   *partmask |= part_bits;
}
107bf215546Sopenharmony_ci
108bf215546Sopenharmony_civoid
109bf215546Sopenharmony_cilp_rast_triangle_3_16(struct lp_rasterizer_task *task,
110bf215546Sopenharmony_ci                      const union lp_rast_cmd_arg arg)
111bf215546Sopenharmony_ci{
112bf215546Sopenharmony_ci   union lp_rast_cmd_arg arg2;
113bf215546Sopenharmony_ci   arg2.triangle.tri = arg.triangle.tri;
114bf215546Sopenharmony_ci   arg2.triangle.plane_mask = (1<<3)-1;
115bf215546Sopenharmony_ci   lp_rast_triangle_3(task, arg2);
116bf215546Sopenharmony_ci}
117bf215546Sopenharmony_ci
118bf215546Sopenharmony_civoid
119bf215546Sopenharmony_cilp_rast_triangle_3_4(struct lp_rasterizer_task *task,
120bf215546Sopenharmony_ci                      const union lp_rast_cmd_arg arg)
121bf215546Sopenharmony_ci{
122bf215546Sopenharmony_ci   lp_rast_triangle_3_16(task, arg);
123bf215546Sopenharmony_ci}
124bf215546Sopenharmony_ci
125bf215546Sopenharmony_civoid
126bf215546Sopenharmony_cilp_rast_triangle_4_16(struct lp_rasterizer_task *task,
127bf215546Sopenharmony_ci                      const union lp_rast_cmd_arg arg)
128bf215546Sopenharmony_ci{
129bf215546Sopenharmony_ci   union lp_rast_cmd_arg arg2;
130bf215546Sopenharmony_ci   arg2.triangle.tri = arg.triangle.tri;
131bf215546Sopenharmony_ci   arg2.triangle.plane_mask = (1<<4)-1;
132bf215546Sopenharmony_ci   lp_rast_triangle_4(task, arg2);
133bf215546Sopenharmony_ci}
134bf215546Sopenharmony_ci
135bf215546Sopenharmony_civoid
136bf215546Sopenharmony_cilp_rast_triangle_ms_3_16(struct lp_rasterizer_task *task,
137bf215546Sopenharmony_ci                      const union lp_rast_cmd_arg arg)
138bf215546Sopenharmony_ci{
139bf215546Sopenharmony_ci   union lp_rast_cmd_arg arg2;
140bf215546Sopenharmony_ci   arg2.triangle.tri = arg.triangle.tri;
141bf215546Sopenharmony_ci   arg2.triangle.plane_mask = (1<<3)-1;
142bf215546Sopenharmony_ci   lp_rast_triangle_ms_3(task, arg2);
143bf215546Sopenharmony_ci}
144bf215546Sopenharmony_ci
145bf215546Sopenharmony_civoid
146bf215546Sopenharmony_cilp_rast_triangle_ms_3_4(struct lp_rasterizer_task *task,
147bf215546Sopenharmony_ci                      const union lp_rast_cmd_arg arg)
148bf215546Sopenharmony_ci{
149bf215546Sopenharmony_ci   lp_rast_triangle_ms_3_16(task, arg);
150bf215546Sopenharmony_ci}
151bf215546Sopenharmony_ci
152bf215546Sopenharmony_civoid
153bf215546Sopenharmony_cilp_rast_triangle_ms_4_16(struct lp_rasterizer_task *task,
154bf215546Sopenharmony_ci                      const union lp_rast_cmd_arg arg)
155bf215546Sopenharmony_ci{
156bf215546Sopenharmony_ci   union lp_rast_cmd_arg arg2;
157bf215546Sopenharmony_ci   arg2.triangle.tri = arg.triangle.tri;
158bf215546Sopenharmony_ci   arg2.triangle.plane_mask = (1<<4)-1;
159bf215546Sopenharmony_ci   lp_rast_triangle_ms_4(task, arg2);
160bf215546Sopenharmony_ci}
161bf215546Sopenharmony_ci
162bf215546Sopenharmony_ci#if defined(PIPE_ARCH_SSE)
163bf215546Sopenharmony_ci
164bf215546Sopenharmony_ci#include <emmintrin.h>
165bf215546Sopenharmony_ci#include "util/u_sse.h"
166bf215546Sopenharmony_ci
167bf215546Sopenharmony_ci
/**
 * SSE2 version of build_masks(): accumulate two 4x4 sign masks.
 *
 * Each of the four vectors below holds one row of the grid, i.e. the
 * values c + ix * dcdx + iy * dcdy for ix = 0..3.  The sign bits of the
 * 16 values are OR'ed into *outmask; the sign bits of the same values
 * offset by cdiff are OR'ed into *partmask.  Bit (iy * 4 + ix) belongs to
 * column ix of row iy.
 */
static inline void
build_masks_sse(int c,
                int cdiff,
                int dcdx,
                int dcdy,
                unsigned *outmask,
                unsigned *partmask)
{
   const __m128i row_step = _mm_set1_epi32(dcdy);
   const __m128i offset = _mm_set1_epi32(cdiff);

   /* One vector per row of the 4x4 grid. */
   const __m128i row0 = _mm_setr_epi32(c, c + dcdx, c + dcdx * 2, c + dcdx * 3);
   const __m128i row1 = _mm_add_epi32(row0, row_step);
   const __m128i row2 = _mm_add_epi32(row1, row_step);
   const __m128i row3 = _mm_add_epi32(row2, row_step);

   /* Saturating packs keep the sign of every value while narrowing
    * 32 -> 16 -> 8 bits, so movemask yields one sign bit per sample.
    */
   const __m128i out_bytes =
      _mm_packs_epi16(_mm_packs_epi32(row0, row1),
                      _mm_packs_epi32(row2, row3));
   *outmask |= _mm_movemask_epi8(out_bytes);

   /* Same again for the values shifted by cdiff. */
   const __m128i part_bytes =
      _mm_packs_epi16(_mm_packs_epi32(_mm_add_epi32(row0, offset),
                                      _mm_add_epi32(row1, offset)),
                      _mm_packs_epi32(_mm_add_epi32(row2, offset),
                                      _mm_add_epi32(row3, offset)));
   *partmask |= _mm_movemask_epi8(part_bytes);
}
212bf215546Sopenharmony_ci
213bf215546Sopenharmony_ci
/**
 * SSE2 version of build_mask_linear().
 *
 * Evaluates c + ix * dcdx + iy * dcdy on a 4x4 grid and returns the 16
 * sign bits, with bit (iy * 4 + ix) set when the value at column ix of
 * row iy is negative.
 */
static inline unsigned
build_mask_linear_sse(int c, int dcdx, int dcdy)
{
   const __m128i row_step = _mm_set1_epi32(dcdy);

   /* One vector per row of the grid, each row one dcdy below the last. */
   const __m128i row0 = _mm_setr_epi32(c, c + dcdx, c + dcdx * 2, c + dcdx * 3);
   const __m128i row1 = _mm_add_epi32(row0, row_step);
   const __m128i row2 = _mm_add_epi32(row1, row_step);
   const __m128i row3 = _mm_add_epi32(row2, row_step);

   /* Narrow 32 -> 16 -> 8 bits with signed saturation; the sign of every
    * value survives, leaving one byte per sample in row-major order.
    */
   const __m128i top_rows = _mm_packs_epi32(row0, row1);
   const __m128i bottom_rows = _mm_packs_epi32(row2, row3);
   const __m128i sample_bytes = _mm_packs_epi16(top_rows, bottom_rows);

   /* The top bit of each byte is the sample's sign bit. */
   return _mm_movemask_epi8(sample_bytes);
}
239bf215546Sopenharmony_ci
/**
 * Return the 16 sign bits of four vectors after adding cdiff to every lane.
 *
 * \param cstep  array of four vectors, four 32-bit lanes each
 * \param cdiff  value added to all 16 lanes before the sign test
 * \return mask where bit (v * 4 + lane) is set when lane `lane` of
 *         cstep[v], plus cdiff, is negative
 */
static inline unsigned
sign_bits4(const __m128i *cstep, int cdiff)
{
   const __m128i offset = _mm_set1_epi32(cdiff);
   __m128i halves[2];

   /* Offset the vectors two at a time and narrow each pair to 16 bits;
    * signed saturation keeps the sign of every lane.
    */
   for (unsigned pair = 0; pair < 2; pair++) {
      const __m128i first = _mm_add_epi32(cstep[2 * pair], offset);
      const __m128i second = _mm_add_epi32(cstep[2 * pair + 1], offset);

      halves[pair] = _mm_packs_epi32(first, second);
   }

   /* Narrow to bytes and collect the top bit of each one. */
   return _mm_movemask_epi8(_mm_packs_epi16(halves[0], halves[1]));
}
262bf215546Sopenharmony_ci
/*
 * Bit masks over a 4x4 stamp.  The 16 samples are numbered row-major, so
 * the sample at column ix of row iy is bit (iy * 4 + ix).
 */
#define COLUMN0 ((1<<0)|(1<<4)|(1<<8) |(1<<12))
#define COLUMN1 ((1<<1)|(1<<5)|(1<<9) |(1<<13))
#define COLUMN2 ((1<<2)|(1<<6)|(1<<10)|(1<<14))
#define COLUMN3 ((1<<3)|(1<<7)|(1<<11)|(1<<15))

#define ROW0 ((1<<0) |(1<<1) |(1<<2) |(1<<3))
#define ROW1 ((1<<4) |(1<<5) |(1<<6) |(1<<7))
#define ROW2 ((1<<8) |(1<<9) |(1<<10)|(1<<11))
#define ROW3 ((1<<12)|(1<<13)|(1<<14)|(1<<15))

#define STAMP_SIZE 4

/* Entry n selects the last n + 1 rows of the stamp (row 3 upwards).
 * Read-only lookup data, hence const.
 */
static const unsigned bottom_mask_tab[STAMP_SIZE] = {
   ROW3,
   ROW3 | ROW2,
   ROW3 | ROW2 | ROW1,
   ROW3 | ROW2 | ROW1 | ROW0,
};

/* Entry n selects the last n + 1 columns of the stamp (column 3 leftwards).
 * Read-only lookup data, hence const.
 */
static const unsigned right_mask_tab[STAMP_SIZE] = {
   COLUMN3,
   COLUMN3 | COLUMN2,
   COLUMN3 | COLUMN2 | COLUMN1,
   COLUMN3 | COLUMN2 | COLUMN1 | COLUMN0,
};
287bf215546Sopenharmony_ci
288bf215546Sopenharmony_ci
289bf215546Sopenharmony_ci#define NR_PLANES 3
290bf215546Sopenharmony_ci
291bf215546Sopenharmony_civoid
292bf215546Sopenharmony_cilp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
293bf215546Sopenharmony_ci                         const union lp_rast_cmd_arg arg)
294bf215546Sopenharmony_ci{
295bf215546Sopenharmony_ci   const struct lp_rast_triangle *tri = arg.triangle.tri;
296bf215546Sopenharmony_ci   const struct lp_rast_plane *plane = GET_PLANES(tri);
297bf215546Sopenharmony_ci   const int x = (arg.triangle.plane_mask & 0xff) + task->x;
298bf215546Sopenharmony_ci   const int y = (arg.triangle.plane_mask >> 8) + task->y;
299bf215546Sopenharmony_ci
300bf215546Sopenharmony_ci   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
301bf215546Sopenharmony_ci   unsigned nr = 0;
302bf215546Sopenharmony_ci
303bf215546Sopenharmony_ci   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
304bf215546Sopenharmony_ci   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
305bf215546Sopenharmony_ci   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
306bf215546Sopenharmony_ci   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
307bf215546Sopenharmony_ci   __m128i zero = _mm_setzero_si128();
308bf215546Sopenharmony_ci
309bf215546Sopenharmony_ci   __m128i c, dcdx, dcdy, rej4;
310bf215546Sopenharmony_ci   __m128i dcdx_neg_mask, dcdy_neg_mask;
311bf215546Sopenharmony_ci   __m128i dcdx2, dcdx3;
312bf215546Sopenharmony_ci
313bf215546Sopenharmony_ci   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
314bf215546Sopenharmony_ci   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
315bf215546Sopenharmony_ci   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
316bf215546Sopenharmony_ci   __m128i unused;
317bf215546Sopenharmony_ci
318bf215546Sopenharmony_ci   transpose4_epi32(&p0, &p1, &p2, &zero,
319bf215546Sopenharmony_ci                    &c, &unused, &dcdx, &dcdy);
320bf215546Sopenharmony_ci
321bf215546Sopenharmony_ci   /* recalc eo - easier than trying to load as scalars / shuffle... */
322bf215546Sopenharmony_ci   dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
323bf215546Sopenharmony_ci   dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
324bf215546Sopenharmony_ci   rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
325bf215546Sopenharmony_ci                        _mm_and_si128(dcdx_neg_mask, dcdx));
326bf215546Sopenharmony_ci
327bf215546Sopenharmony_ci   /* Adjust dcdx;
328bf215546Sopenharmony_ci    */
329bf215546Sopenharmony_ci   dcdx = _mm_sub_epi32(zero, dcdx);
330bf215546Sopenharmony_ci
331bf215546Sopenharmony_ci   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
332bf215546Sopenharmony_ci   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
333bf215546Sopenharmony_ci   rej4 = _mm_slli_epi32(rej4, 2);
334bf215546Sopenharmony_ci
335bf215546Sopenharmony_ci   /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
336bf215546Sopenharmony_ci   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
337bf215546Sopenharmony_ci   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));
338bf215546Sopenharmony_ci
339bf215546Sopenharmony_ci   dcdx2 = _mm_add_epi32(dcdx, dcdx);
340bf215546Sopenharmony_ci   dcdx3 = _mm_add_epi32(dcdx2, dcdx);
341bf215546Sopenharmony_ci
342bf215546Sopenharmony_ci   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
343bf215546Sopenharmony_ci                    &span_0, &span_1, &span_2, &unused);
344bf215546Sopenharmony_ci
345bf215546Sopenharmony_ci   for (unsigned i = 0; i < 4; i++) {
346bf215546Sopenharmony_ci      __m128i cx = c;
347bf215546Sopenharmony_ci
348bf215546Sopenharmony_ci      for (unsigned j = 0; j < 4; j++) {
349bf215546Sopenharmony_ci         __m128i c4rej = _mm_add_epi32(cx, rej4);
350bf215546Sopenharmony_ci         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);
351bf215546Sopenharmony_ci
352bf215546Sopenharmony_ci         /* if (is_zero(rej_masks)) */
353bf215546Sopenharmony_ci         if (_mm_movemask_epi8(rej_masks) == 0) {
354bf215546Sopenharmony_ci            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
355bf215546Sopenharmony_ci            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
356bf215546Sopenharmony_ci            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);
357bf215546Sopenharmony_ci
358bf215546Sopenharmony_ci            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);
359bf215546Sopenharmony_ci
360bf215546Sopenharmony_ci            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
361bf215546Sopenharmony_ci            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
362bf215546Sopenharmony_ci            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));
363bf215546Sopenharmony_ci
364bf215546Sopenharmony_ci            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
365bf215546Sopenharmony_ci            __m128i c_01 = _mm_packs_epi32(c_0, c_1);
366bf215546Sopenharmony_ci
367bf215546Sopenharmony_ci            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
368bf215546Sopenharmony_ci            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
369bf215546Sopenharmony_ci            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));
370bf215546Sopenharmony_ci
371bf215546Sopenharmony_ci            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);
372bf215546Sopenharmony_ci
373bf215546Sopenharmony_ci            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
374bf215546Sopenharmony_ci            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
375bf215546Sopenharmony_ci            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));
376bf215546Sopenharmony_ci
377bf215546Sopenharmony_ci            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
378bf215546Sopenharmony_ci            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
379bf215546Sopenharmony_ci            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);
380bf215546Sopenharmony_ci
381bf215546Sopenharmony_ci            unsigned mask = _mm_movemask_epi8(c_0123);
382bf215546Sopenharmony_ci
383bf215546Sopenharmony_ci            out[nr].i = i;
384bf215546Sopenharmony_ci            out[nr].j = j;
385bf215546Sopenharmony_ci            out[nr].mask = mask;
386bf215546Sopenharmony_ci            if (mask != 0xffff)
387bf215546Sopenharmony_ci               nr++;
388bf215546Sopenharmony_ci         }
389bf215546Sopenharmony_ci         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
390bf215546Sopenharmony_ci      }
391bf215546Sopenharmony_ci
392bf215546Sopenharmony_ci      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
393bf215546Sopenharmony_ci   }
394bf215546Sopenharmony_ci
395bf215546Sopenharmony_ci   for (unsigned i = 0; i < nr; i++)
396bf215546Sopenharmony_ci      lp_rast_shade_quads_mask(task,
397bf215546Sopenharmony_ci                               &tri->inputs,
398bf215546Sopenharmony_ci                               x + 4 * out[i].j,
399bf215546Sopenharmony_ci                               y + 4 * out[i].i,
400bf215546Sopenharmony_ci                               0xffff & ~out[i].mask);
401bf215546Sopenharmony_ci}
402bf215546Sopenharmony_ci
403bf215546Sopenharmony_civoid
404bf215546Sopenharmony_cilp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
405bf215546Sopenharmony_ci                        const union lp_rast_cmd_arg arg)
406bf215546Sopenharmony_ci{
407bf215546Sopenharmony_ci   const struct lp_rast_triangle *tri = arg.triangle.tri;
408bf215546Sopenharmony_ci   const struct lp_rast_plane *plane = GET_PLANES(tri);
409bf215546Sopenharmony_ci   const unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
410bf215546Sopenharmony_ci   const unsigned y = (arg.triangle.plane_mask >> 8) + task->y;
411bf215546Sopenharmony_ci
412bf215546Sopenharmony_ci   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
413bf215546Sopenharmony_ci   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
414bf215546Sopenharmony_ci   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
415bf215546Sopenharmony_ci   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
416bf215546Sopenharmony_ci   __m128i zero = _mm_setzero_si128();
417bf215546Sopenharmony_ci
418bf215546Sopenharmony_ci   __m128i c, dcdx, dcdy;
419bf215546Sopenharmony_ci   __m128i dcdx2, dcdx3;
420bf215546Sopenharmony_ci
421bf215546Sopenharmony_ci   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
422bf215546Sopenharmony_ci   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
423bf215546Sopenharmony_ci   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
424bf215546Sopenharmony_ci   __m128i unused;
425bf215546Sopenharmony_ci
426bf215546Sopenharmony_ci   transpose4_epi32(&p0, &p1, &p2, &zero,
427bf215546Sopenharmony_ci                    &c, &unused, &dcdx, &dcdy);
428bf215546Sopenharmony_ci
429bf215546Sopenharmony_ci   /* Adjust dcdx;
430bf215546Sopenharmony_ci    */
431bf215546Sopenharmony_ci   dcdx = _mm_sub_epi32(zero, dcdx);
432bf215546Sopenharmony_ci
433bf215546Sopenharmony_ci   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
434bf215546Sopenharmony_ci   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
435bf215546Sopenharmony_ci
436bf215546Sopenharmony_ci   /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
437bf215546Sopenharmony_ci   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
438bf215546Sopenharmony_ci
439bf215546Sopenharmony_ci   dcdx2 = _mm_add_epi32(dcdx, dcdx);
440bf215546Sopenharmony_ci   dcdx3 = _mm_add_epi32(dcdx2, dcdx);
441bf215546Sopenharmony_ci
442bf215546Sopenharmony_ci   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
443bf215546Sopenharmony_ci                    &span_0, &span_1, &span_2, &unused);
444bf215546Sopenharmony_ci
445bf215546Sopenharmony_ci
446bf215546Sopenharmony_ci   {
447bf215546Sopenharmony_ci      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
448bf215546Sopenharmony_ci      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
449bf215546Sopenharmony_ci      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);
450bf215546Sopenharmony_ci
451bf215546Sopenharmony_ci      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);
452bf215546Sopenharmony_ci
453bf215546Sopenharmony_ci      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
454bf215546Sopenharmony_ci      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
455bf215546Sopenharmony_ci      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));
456bf215546Sopenharmony_ci
457bf215546Sopenharmony_ci      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
458bf215546Sopenharmony_ci      __m128i c_01 = _mm_packs_epi32(c_0, c_1);
459bf215546Sopenharmony_ci
460bf215546Sopenharmony_ci      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
461bf215546Sopenharmony_ci      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
462bf215546Sopenharmony_ci      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));
463bf215546Sopenharmony_ci
464bf215546Sopenharmony_ci      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);
465bf215546Sopenharmony_ci
466bf215546Sopenharmony_ci      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
467bf215546Sopenharmony_ci      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
468bf215546Sopenharmony_ci      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));
469bf215546Sopenharmony_ci
470bf215546Sopenharmony_ci      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
471bf215546Sopenharmony_ci      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
472bf215546Sopenharmony_ci      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);
473bf215546Sopenharmony_ci
474bf215546Sopenharmony_ci      unsigned mask = _mm_movemask_epi8(c_0123);
475bf215546Sopenharmony_ci
476bf215546Sopenharmony_ci      if (mask != 0xffff)
477bf215546Sopenharmony_ci         lp_rast_shade_quads_mask(task,
478bf215546Sopenharmony_ci                                  &tri->inputs,
479bf215546Sopenharmony_ci                                  x,
480bf215546Sopenharmony_ci                                  y,
481bf215546Sopenharmony_ci                                  0xffff & ~mask);
482bf215546Sopenharmony_ci   }
483bf215546Sopenharmony_ci}
484bf215546Sopenharmony_ci
485bf215546Sopenharmony_ci#undef NR_PLANES
486bf215546Sopenharmony_ci
487bf215546Sopenharmony_ci#else
488bf215546Sopenharmony_ci
489bf215546Sopenharmony_ci#if defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN
490bf215546Sopenharmony_ci
491bf215546Sopenharmony_ci#include <altivec.h>
492bf215546Sopenharmony_ci#include "util/u_pwr8.h"
493bf215546Sopenharmony_ci
494bf215546Sopenharmony_cistatic inline void
495bf215546Sopenharmony_cibuild_masks_ppc(int c,
496bf215546Sopenharmony_ci                int cdiff,
497bf215546Sopenharmony_ci                int dcdx,
498bf215546Sopenharmony_ci                int dcdy,
499bf215546Sopenharmony_ci                unsigned *outmask,
500bf215546Sopenharmony_ci                unsigned *partmask)
501bf215546Sopenharmony_ci{
502bf215546Sopenharmony_ci   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
503bf215546Sopenharmony_ci   __m128i xdcdy = (__m128i) vec_splats(dcdy);
504bf215546Sopenharmony_ci
505bf215546Sopenharmony_ci   /* Get values across the quad
506bf215546Sopenharmony_ci    */
507bf215546Sopenharmony_ci   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
508bf215546Sopenharmony_ci   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
509bf215546Sopenharmony_ci   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);
510bf215546Sopenharmony_ci
511bf215546Sopenharmony_ci   {
512bf215546Sopenharmony_ci      __m128i cstep01, cstep23, result;
513bf215546Sopenharmony_ci
514bf215546Sopenharmony_ci      cstep01 = vec_packs_epi32(cstep0, cstep1);
515bf215546Sopenharmony_ci      cstep23 = vec_packs_epi32(cstep2, cstep3);
516bf215546Sopenharmony_ci      result = vec_packs_epi16(cstep01, cstep23);
517bf215546Sopenharmony_ci
518bf215546Sopenharmony_ci      *outmask |= vec_movemask_epi8(result);
519bf215546Sopenharmony_ci   }
520bf215546Sopenharmony_ci
521bf215546Sopenharmony_ci
522bf215546Sopenharmony_ci   {
523bf215546Sopenharmony_ci      __m128i cio4 = (__m128i) vec_splats(cdiff);
524bf215546Sopenharmony_ci      __m128i cstep01, cstep23, result;
525bf215546Sopenharmony_ci
526bf215546Sopenharmony_ci      cstep0 = vec_add_epi32(cstep0, cio4);
527bf215546Sopenharmony_ci      cstep1 = vec_add_epi32(cstep1, cio4);
528bf215546Sopenharmony_ci      cstep2 = vec_add_epi32(cstep2, cio4);
529bf215546Sopenharmony_ci      cstep3 = vec_add_epi32(cstep3, cio4);
530bf215546Sopenharmony_ci
531bf215546Sopenharmony_ci      cstep01 = vec_packs_epi32(cstep0, cstep1);
532bf215546Sopenharmony_ci      cstep23 = vec_packs_epi32(cstep2, cstep3);
533bf215546Sopenharmony_ci      result = vec_packs_epi16(cstep01, cstep23);
534bf215546Sopenharmony_ci
535bf215546Sopenharmony_ci      *partmask |= vec_movemask_epi8(result);
536bf215546Sopenharmony_ci   }
537bf215546Sopenharmony_ci}
538bf215546Sopenharmony_ci
539bf215546Sopenharmony_cistatic inline unsigned
540bf215546Sopenharmony_cibuild_mask_linear_ppc(int c, int dcdx, int dcdy)
541bf215546Sopenharmony_ci{
542bf215546Sopenharmony_ci   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
543bf215546Sopenharmony_ci   __m128i xdcdy = (__m128i) vec_splats(dcdy);
544bf215546Sopenharmony_ci
545bf215546Sopenharmony_ci   /* Get values across the quad
546bf215546Sopenharmony_ci    */
547bf215546Sopenharmony_ci   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
548bf215546Sopenharmony_ci   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
549bf215546Sopenharmony_ci   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);
550bf215546Sopenharmony_ci
551bf215546Sopenharmony_ci   /* pack pairs of results into epi16
552bf215546Sopenharmony_ci    */
553bf215546Sopenharmony_ci   __m128i cstep01 = vec_packs_epi32(cstep0, cstep1);
554bf215546Sopenharmony_ci   __m128i cstep23 = vec_packs_epi32(cstep2, cstep3);
555bf215546Sopenharmony_ci
556bf215546Sopenharmony_ci   /* pack into epi8, preserving sign bits
557bf215546Sopenharmony_ci    */
558bf215546Sopenharmony_ci   __m128i result = vec_packs_epi16(cstep01, cstep23);
559bf215546Sopenharmony_ci
560bf215546Sopenharmony_ci   /* extract sign bits to create mask
561bf215546Sopenharmony_ci    */
562bf215546Sopenharmony_ci   return vec_movemask_epi8(result);
563bf215546Sopenharmony_ci}
564bf215546Sopenharmony_ci
565bf215546Sopenharmony_cistatic inline __m128i
566bf215546Sopenharmony_cilp_plane_to_m128i(const struct lp_rast_plane *plane)
567bf215546Sopenharmony_ci{
568bf215546Sopenharmony_ci   return vec_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
569bf215546Sopenharmony_ci                         (int32_t)plane->dcdy, (int32_t)plane->eo);
570bf215546Sopenharmony_ci}
571bf215546Sopenharmony_ci
572bf215546Sopenharmony_ci#define NR_PLANES 3
573bf215546Sopenharmony_ci
574bf215546Sopenharmony_civoid
575bf215546Sopenharmony_cilp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
576bf215546Sopenharmony_ci                      const union lp_rast_cmd_arg arg)
577bf215546Sopenharmony_ci{
578bf215546Sopenharmony_ci   const struct lp_rast_triangle *tri = arg.triangle.tri;
579bf215546Sopenharmony_ci   const struct lp_rast_plane *plane = GET_PLANES(tri);
580bf215546Sopenharmony_ci   const int x = (arg.triangle.plane_mask & 0xff) + task->x;
581bf215546Sopenharmony_ci   const int y = (arg.triangle.plane_mask >> 8) + task->y;
582bf215546Sopenharmony_ci
583bf215546Sopenharmony_ci   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
584bf215546Sopenharmony_ci   unsigned nr = 0;
585bf215546Sopenharmony_ci
586bf215546Sopenharmony_ci   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
587bf215546Sopenharmony_ci   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
588bf215546Sopenharmony_ci   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
589bf215546Sopenharmony_ci   __m128i zero = vec_splats((unsigned char) 0);
590bf215546Sopenharmony_ci
591bf215546Sopenharmony_ci   __m128i c;
592bf215546Sopenharmony_ci   __m128i dcdx;
593bf215546Sopenharmony_ci   __m128i dcdy;
594bf215546Sopenharmony_ci   __m128i rej4;
595bf215546Sopenharmony_ci
596bf215546Sopenharmony_ci   __m128i dcdx2;
597bf215546Sopenharmony_ci   __m128i dcdx3;
598bf215546Sopenharmony_ci
599bf215546Sopenharmony_ci   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
600bf215546Sopenharmony_ci   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
601bf215546Sopenharmony_ci   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
602bf215546Sopenharmony_ci   __m128i unused;
603bf215546Sopenharmony_ci
604bf215546Sopenharmony_ci   __m128i vshuf_mask0;
605bf215546Sopenharmony_ci   __m128i vshuf_mask1;
606bf215546Sopenharmony_ci   __m128i vshuf_mask2;
607bf215546Sopenharmony_ci
608bf215546Sopenharmony_ci#if UTIL_ARCH_LITTLE_ENDIAN
609bf215546Sopenharmony_ci   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100);
610bf215546Sopenharmony_ci   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504);
611bf215546Sopenharmony_ci   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908);
612bf215546Sopenharmony_ci#else
613bf215546Sopenharmony_ci   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F);
614bf215546Sopenharmony_ci   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B);
615bf215546Sopenharmony_ci   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607);
616bf215546Sopenharmony_ci#endif
617bf215546Sopenharmony_ci
618bf215546Sopenharmony_ci   transpose4_epi32(&p0, &p1, &p2, &zero,
619bf215546Sopenharmony_ci                    &c, &dcdx, &dcdy, &rej4);
620bf215546Sopenharmony_ci
621bf215546Sopenharmony_ci   /* Adjust dcdx;
622bf215546Sopenharmony_ci    */
623bf215546Sopenharmony_ci   dcdx = vec_sub_epi32(zero, dcdx);
624bf215546Sopenharmony_ci
625bf215546Sopenharmony_ci   c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x)));
626bf215546Sopenharmony_ci   c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y)));
627bf215546Sopenharmony_ci   rej4 = vec_slli_epi32(rej4, 2);
628bf215546Sopenharmony_ci
629bf215546Sopenharmony_ci   /*
630bf215546Sopenharmony_ci    * Adjust so we can just check the sign bit (< 0 comparison),
631bf215546Sopenharmony_ci    * instead of having to do a less efficient <= 0 comparison
632bf215546Sopenharmony_ci    */
633bf215546Sopenharmony_ci   c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1));
634bf215546Sopenharmony_ci   rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1));
635bf215546Sopenharmony_ci
636bf215546Sopenharmony_ci   dcdx2 = vec_add_epi32(dcdx, dcdx);
637bf215546Sopenharmony_ci   dcdx3 = vec_add_epi32(dcdx2, dcdx);
638bf215546Sopenharmony_ci
639bf215546Sopenharmony_ci   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
640bf215546Sopenharmony_ci                    &span_0, &span_1, &span_2, &unused);
641bf215546Sopenharmony_ci
642bf215546Sopenharmony_ci   for (unsigned i = 0; i < 4; i++) {
643bf215546Sopenharmony_ci      __m128i cx = c;
644bf215546Sopenharmony_ci
645bf215546Sopenharmony_ci      for (unsigned j = 0; j < 4; j++) {
646bf215546Sopenharmony_ci         __m128i c4rej = vec_add_epi32(cx, rej4);
647bf215546Sopenharmony_ci         __m128i rej_masks = vec_srai_epi32(c4rej, 31);
648bf215546Sopenharmony_ci
649bf215546Sopenharmony_ci         /* if (is_zero(rej_masks)) */
650bf215546Sopenharmony_ci         if (vec_movemask_epi8(rej_masks) == 0) {
651bf215546Sopenharmony_ci            __m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0);
652bf215546Sopenharmony_ci            __m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1);
653bf215546Sopenharmony_ci            __m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2);
654bf215546Sopenharmony_ci
655bf215546Sopenharmony_ci            __m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0);
656bf215546Sopenharmony_ci
657bf215546Sopenharmony_ci            __m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0));
658bf215546Sopenharmony_ci            __m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1));
659bf215546Sopenharmony_ci            __m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2));
660bf215546Sopenharmony_ci
661bf215546Sopenharmony_ci            __m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1);
662bf215546Sopenharmony_ci            __m128i c_01 = vec_packs_epi32(c_0, c_1);
663bf215546Sopenharmony_ci
664bf215546Sopenharmony_ci            __m128i c0_2 = vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, vshuf_mask0));
665bf215546Sopenharmony_ci            __m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1));
666bf215546Sopenharmony_ci            __m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2));
667bf215546Sopenharmony_ci
668bf215546Sopenharmony_ci            __m128i c_2 = vec_or(vec_or(c0_2, c1_2), c2_2);
669bf215546Sopenharmony_ci
670bf215546Sopenharmony_ci            __m128i c0_3 = vec_add_epi32(c0_2, vec_perm(dcdy, dcdy, vshuf_mask0));
671bf215546Sopenharmony_ci            __m128i c1_3 = vec_add_epi32(c1_2, vec_perm(dcdy, dcdy, vshuf_mask1));
672bf215546Sopenharmony_ci            __m128i c2_3 = vec_add_epi32(c2_2, vec_perm(dcdy, dcdy, vshuf_mask2));
673bf215546Sopenharmony_ci
674bf215546Sopenharmony_ci            __m128i c_3 = vec_or(vec_or(c0_3, c1_3), c2_3);
675bf215546Sopenharmony_ci            __m128i c_23 = vec_packs_epi32(c_2, c_3);
676bf215546Sopenharmony_ci            __m128i c_0123 = vec_packs_epi16(c_01, c_23);
677bf215546Sopenharmony_ci
678bf215546Sopenharmony_ci            unsigned mask = vec_movemask_epi8(c_0123);
679bf215546Sopenharmony_ci
680bf215546Sopenharmony_ci            out[nr].i = i;
681bf215546Sopenharmony_ci            out[nr].j = j;
682bf215546Sopenharmony_ci            out[nr].mask = mask;
683bf215546Sopenharmony_ci            if (mask != 0xffff)
684bf215546Sopenharmony_ci               nr++;
685bf215546Sopenharmony_ci         }
686bf215546Sopenharmony_ci         cx = vec_add_epi32(cx, vec_slli_epi32(dcdx, 2));
687bf215546Sopenharmony_ci      }
688bf215546Sopenharmony_ci
689bf215546Sopenharmony_ci      c = vec_add_epi32(c, vec_slli_epi32(dcdy, 2));
690bf215546Sopenharmony_ci   }
691bf215546Sopenharmony_ci
692bf215546Sopenharmony_ci   for (unsigned i = 0; i < nr; i++)
693bf215546Sopenharmony_ci      lp_rast_shade_quads_mask(task,
694bf215546Sopenharmony_ci                               &tri->inputs,
695bf215546Sopenharmony_ci                               x + 4 * out[i].j,
696bf215546Sopenharmony_ci                               y + 4 * out[i].i,
697bf215546Sopenharmony_ci                               0xffff & ~out[i].mask);
698bf215546Sopenharmony_ci}
699bf215546Sopenharmony_ci
700bf215546Sopenharmony_ci#undef NR_PLANES
701bf215546Sopenharmony_ci
702bf215546Sopenharmony_ci#else
703bf215546Sopenharmony_ci
704bf215546Sopenharmony_civoid
705bf215546Sopenharmony_cilp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
706bf215546Sopenharmony_ci                         const union lp_rast_cmd_arg arg)
707bf215546Sopenharmony_ci{
708bf215546Sopenharmony_ci   union lp_rast_cmd_arg arg2;
709bf215546Sopenharmony_ci   arg2.triangle.tri = arg.triangle.tri;
710bf215546Sopenharmony_ci   arg2.triangle.plane_mask = (1<<3)-1;
711bf215546Sopenharmony_ci   lp_rast_triangle_32_3(task, arg2);
712bf215546Sopenharmony_ci}
713bf215546Sopenharmony_ci
714bf215546Sopenharmony_ci#endif /* _ARCH_PWR8 && UTIL_ARCH_LITTLE_ENDIAN */
715bf215546Sopenharmony_ci
716bf215546Sopenharmony_civoid
717bf215546Sopenharmony_cilp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
718bf215546Sopenharmony_ci                         const union lp_rast_cmd_arg arg)
719bf215546Sopenharmony_ci{
720bf215546Sopenharmony_ci   union lp_rast_cmd_arg arg2;
721bf215546Sopenharmony_ci   arg2.triangle.tri = arg.triangle.tri;
722bf215546Sopenharmony_ci   arg2.triangle.plane_mask = (1<<4)-1;
723bf215546Sopenharmony_ci   lp_rast_triangle_32_4(task, arg2);
724bf215546Sopenharmony_ci}
725bf215546Sopenharmony_ci
726bf215546Sopenharmony_civoid
727bf215546Sopenharmony_cilp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
728bf215546Sopenharmony_ci                      const union lp_rast_cmd_arg arg)
729bf215546Sopenharmony_ci{
730bf215546Sopenharmony_ci   lp_rast_triangle_32_3_16(task, arg);
731bf215546Sopenharmony_ci}
732bf215546Sopenharmony_ci
733bf215546Sopenharmony_ci#endif
734bf215546Sopenharmony_ci
/*
 * Select the mask-building helpers for this build: the *_sse versions when
 * PIPE_ARCH_SSE is defined, the *_ppc versions on little-endian POWER8, and
 * the generic build_masks() / build_mask_linear() otherwise.
 *
 * For the SSE and PowerPC helpers c and cdiff are cast to int.  The macro
 * parameters are parenthesized under the cast so that an expression
 * argument (for example "a + b" or "v >> n") is converted as a whole,
 * instead of the cast binding to its first operand only.
 */
#if defined PIPE_ARCH_SSE
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) \
   build_masks_sse((int)(c), (int)(cdiff), dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) \
   build_mask_linear_sse((int)(c), dcdx, dcdy)
#elif (defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN)
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) \
   build_masks_ppc((int)(c), (int)(cdiff), dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) \
   build_mask_linear_ppc((int)(c), dcdx, dcdy)
#else
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) \
   build_masks(c, cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) \
   build_mask_linear(c, dcdx, dcdy)
#endif
745bf215546Sopenharmony_ci
746bf215546Sopenharmony_ci#define RASTER_64 1
747bf215546Sopenharmony_ci
748bf215546Sopenharmony_ci#define TAG(x) x##_1
749bf215546Sopenharmony_ci#define NR_PLANES 1
750bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
751bf215546Sopenharmony_ci
752bf215546Sopenharmony_ci#define TAG(x) x##_2
753bf215546Sopenharmony_ci#define NR_PLANES 2
754bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
755bf215546Sopenharmony_ci
756bf215546Sopenharmony_ci#define TAG(x) x##_3
757bf215546Sopenharmony_ci#define NR_PLANES 3
758bf215546Sopenharmony_ci/*#define TRI_4 lp_rast_triangle_3_4*/
759bf215546Sopenharmony_ci/*#define TRI_16 lp_rast_triangle_3_16*/
760bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
761bf215546Sopenharmony_ci
762bf215546Sopenharmony_ci#define TAG(x) x##_4
763bf215546Sopenharmony_ci#define NR_PLANES 4
764bf215546Sopenharmony_ci/*#define TRI_16 lp_rast_triangle_4_16*/
765bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
766bf215546Sopenharmony_ci
767bf215546Sopenharmony_ci#define TAG(x) x##_5
768bf215546Sopenharmony_ci#define NR_PLANES 5
769bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
770bf215546Sopenharmony_ci
771bf215546Sopenharmony_ci#define TAG(x) x##_6
772bf215546Sopenharmony_ci#define NR_PLANES 6
773bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
774bf215546Sopenharmony_ci
775bf215546Sopenharmony_ci#define TAG(x) x##_7
776bf215546Sopenharmony_ci#define NR_PLANES 7
777bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
778bf215546Sopenharmony_ci
779bf215546Sopenharmony_ci#define TAG(x) x##_8
780bf215546Sopenharmony_ci#define NR_PLANES 8
781bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
782bf215546Sopenharmony_ci
783bf215546Sopenharmony_ci#undef RASTER_64
784bf215546Sopenharmony_ci
785bf215546Sopenharmony_ci#define TAG(x) x##_32_1
786bf215546Sopenharmony_ci#define NR_PLANES 1
787bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
788bf215546Sopenharmony_ci
789bf215546Sopenharmony_ci#define TAG(x) x##_32_2
790bf215546Sopenharmony_ci#define NR_PLANES 2
791bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
792bf215546Sopenharmony_ci
793bf215546Sopenharmony_ci#define TAG(x) x##_32_3
794bf215546Sopenharmony_ci#define NR_PLANES 3
795bf215546Sopenharmony_ci/*#define TRI_4 lp_rast_triangle_3_4*/
796bf215546Sopenharmony_ci/*#define TRI_16 lp_rast_triangle_3_16*/
797bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
798bf215546Sopenharmony_ci
799bf215546Sopenharmony_ci#define TAG(x) x##_32_4
800bf215546Sopenharmony_ci#define NR_PLANES 4
801bf215546Sopenharmony_ci#ifdef PIPE_ARCH_SSE
802bf215546Sopenharmony_ci#define TRI_16 lp_rast_triangle_32_4_16
803bf215546Sopenharmony_ci#endif
804bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
805bf215546Sopenharmony_ci
806bf215546Sopenharmony_ci#define TAG(x) x##_32_5
807bf215546Sopenharmony_ci#define NR_PLANES 5
808bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
809bf215546Sopenharmony_ci
810bf215546Sopenharmony_ci#define TAG(x) x##_32_6
811bf215546Sopenharmony_ci#define NR_PLANES 6
812bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
813bf215546Sopenharmony_ci
814bf215546Sopenharmony_ci#define TAG(x) x##_32_7
815bf215546Sopenharmony_ci#define NR_PLANES 7
816bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
817bf215546Sopenharmony_ci
818bf215546Sopenharmony_ci#define TAG(x) x##_32_8
819bf215546Sopenharmony_ci#define NR_PLANES 8
820bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
821bf215546Sopenharmony_ci
822bf215546Sopenharmony_ci#define MULTISAMPLE 1
823bf215546Sopenharmony_ci#define RASTER_64 1
824bf215546Sopenharmony_ci
825bf215546Sopenharmony_ci#define TAG(x) x##_ms_1
826bf215546Sopenharmony_ci#define NR_PLANES 1
827bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
828bf215546Sopenharmony_ci
829bf215546Sopenharmony_ci#define TAG(x) x##_ms_2
830bf215546Sopenharmony_ci#define NR_PLANES 2
831bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
832bf215546Sopenharmony_ci
833bf215546Sopenharmony_ci#define TAG(x) x##_ms_3
834bf215546Sopenharmony_ci#define NR_PLANES 3
835bf215546Sopenharmony_ci/*#define TRI_4 lp_rast_triangle_3_4*/
836bf215546Sopenharmony_ci/*#define TRI_16 lp_rast_triangle_3_16*/
837bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
838bf215546Sopenharmony_ci
839bf215546Sopenharmony_ci#define TAG(x) x##_ms_4
840bf215546Sopenharmony_ci#define NR_PLANES 4
841bf215546Sopenharmony_ci/*#define TRI_16 lp_rast_triangle_4_16*/
842bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
843bf215546Sopenharmony_ci
844bf215546Sopenharmony_ci#define TAG(x) x##_ms_5
845bf215546Sopenharmony_ci#define NR_PLANES 5
846bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
847bf215546Sopenharmony_ci
848bf215546Sopenharmony_ci#define TAG(x) x##_ms_6
849bf215546Sopenharmony_ci#define NR_PLANES 6
850bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
851bf215546Sopenharmony_ci
852bf215546Sopenharmony_ci#define TAG(x) x##_ms_7
853bf215546Sopenharmony_ci#define NR_PLANES 7
854bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
855bf215546Sopenharmony_ci
856bf215546Sopenharmony_ci#define TAG(x) x##_ms_8
857bf215546Sopenharmony_ci#define NR_PLANES 8
858bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h"
859bf215546Sopenharmony_ci
860bf215546Sopenharmony_ci#undef RASTER_64
861