1bf215546Sopenharmony_ci/************************************************************************** 2bf215546Sopenharmony_ci * 3bf215546Sopenharmony_ci * Copyright 2007-2009 VMware, Inc. 4bf215546Sopenharmony_ci * All Rights Reserved. 5bf215546Sopenharmony_ci * 6bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 7bf215546Sopenharmony_ci * copy of this software and associated documentation files (the 8bf215546Sopenharmony_ci * "Software"), to deal in the Software without restriction, including 9bf215546Sopenharmony_ci * without limitation the rights to use, copy, modify, merge, publish, 10bf215546Sopenharmony_ci * distribute, sub license, and/or sell copies of the Software, and to 11bf215546Sopenharmony_ci * permit persons to whom the Software is furnished to do so, subject to 12bf215546Sopenharmony_ci * the following conditions: 13bf215546Sopenharmony_ci * 14bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the 15bf215546Sopenharmony_ci * next paragraph) shall be included in all copies or substantial portions 16bf215546Sopenharmony_ci * of the Software. 17bf215546Sopenharmony_ci * 18bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19bf215546Sopenharmony_ci * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20bf215546Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21bf215546Sopenharmony_ci * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22bf215546Sopenharmony_ci * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23bf215546Sopenharmony_ci * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24bf215546Sopenharmony_ci * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
25bf215546Sopenharmony_ci * 26bf215546Sopenharmony_ci **************************************************************************/ 27bf215546Sopenharmony_ci 28bf215546Sopenharmony_ci/* 29bf215546Sopenharmony_ci * Rasterization for binned triangles within a tile 30bf215546Sopenharmony_ci */ 31bf215546Sopenharmony_ci 32bf215546Sopenharmony_ci#include <limits.h> 33bf215546Sopenharmony_ci#include "util/u_math.h" 34bf215546Sopenharmony_ci#include "lp_debug.h" 35bf215546Sopenharmony_ci#include "lp_perf.h" 36bf215546Sopenharmony_ci#include "lp_rast_priv.h" 37bf215546Sopenharmony_ci 38bf215546Sopenharmony_ci/** 39bf215546Sopenharmony_ci * Shade all pixels in a 4x4 block. 40bf215546Sopenharmony_ci */ 41bf215546Sopenharmony_cistatic void 42bf215546Sopenharmony_ciblock_full_4(struct lp_rasterizer_task *task, 43bf215546Sopenharmony_ci const struct lp_rast_triangle *tri, 44bf215546Sopenharmony_ci int x, int y) 45bf215546Sopenharmony_ci{ 46bf215546Sopenharmony_ci lp_rast_shade_quads_all(task, &tri->inputs, x, y); 47bf215546Sopenharmony_ci} 48bf215546Sopenharmony_ci 49bf215546Sopenharmony_ci 50bf215546Sopenharmony_ci/** 51bf215546Sopenharmony_ci * Shade all pixels in a 16x16 block. 
52bf215546Sopenharmony_ci */ 53bf215546Sopenharmony_cistatic void 54bf215546Sopenharmony_ciblock_full_16(struct lp_rasterizer_task *task, 55bf215546Sopenharmony_ci const struct lp_rast_triangle *tri, 56bf215546Sopenharmony_ci int x, int y) 57bf215546Sopenharmony_ci{ 58bf215546Sopenharmony_ci assert(x % 16 == 0); 59bf215546Sopenharmony_ci assert(y % 16 == 0); 60bf215546Sopenharmony_ci for (unsigned iy = 0; iy < 16; iy += 4) 61bf215546Sopenharmony_ci for (unsigned ix = 0; ix < 16; ix += 4) 62bf215546Sopenharmony_ci block_full_4(task, tri, x + ix, y + iy); 63bf215546Sopenharmony_ci} 64bf215546Sopenharmony_ci 65bf215546Sopenharmony_cistatic inline unsigned 66bf215546Sopenharmony_cibuild_mask_linear(int32_t c, int32_t dcdx, int32_t dcdy) 67bf215546Sopenharmony_ci{ 68bf215546Sopenharmony_ci unsigned mask = 0; 69bf215546Sopenharmony_ci 70bf215546Sopenharmony_ci int32_t c0 = c; 71bf215546Sopenharmony_ci int32_t c1 = c0 + dcdy; 72bf215546Sopenharmony_ci int32_t c2 = c1 + dcdy; 73bf215546Sopenharmony_ci int32_t c3 = c2 + dcdy; 74bf215546Sopenharmony_ci 75bf215546Sopenharmony_ci mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0); 76bf215546Sopenharmony_ci mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1); 77bf215546Sopenharmony_ci mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2); 78bf215546Sopenharmony_ci mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3); 79bf215546Sopenharmony_ci mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4); 80bf215546Sopenharmony_ci mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5); 81bf215546Sopenharmony_ci mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6); 82bf215546Sopenharmony_ci mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7); 83bf215546Sopenharmony_ci mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8); 84bf215546Sopenharmony_ci mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9); 85bf215546Sopenharmony_ci mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10); 86bf215546Sopenharmony_ci mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11); 87bf215546Sopenharmony_ci mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12); 88bf215546Sopenharmony_ci 
mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13); 89bf215546Sopenharmony_ci mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14); 90bf215546Sopenharmony_ci mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15); 91bf215546Sopenharmony_ci 92bf215546Sopenharmony_ci return mask; 93bf215546Sopenharmony_ci} 94bf215546Sopenharmony_ci 95bf215546Sopenharmony_ci 96bf215546Sopenharmony_cistatic inline void 97bf215546Sopenharmony_cibuild_masks(int32_t c, 98bf215546Sopenharmony_ci int32_t cdiff, 99bf215546Sopenharmony_ci int32_t dcdx, 100bf215546Sopenharmony_ci int32_t dcdy, 101bf215546Sopenharmony_ci unsigned *outmask, 102bf215546Sopenharmony_ci unsigned *partmask) 103bf215546Sopenharmony_ci{ 104bf215546Sopenharmony_ci *outmask |= build_mask_linear(c, dcdx, dcdy); 105bf215546Sopenharmony_ci *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy); 106bf215546Sopenharmony_ci} 107bf215546Sopenharmony_ci 108bf215546Sopenharmony_civoid 109bf215546Sopenharmony_cilp_rast_triangle_3_16(struct lp_rasterizer_task *task, 110bf215546Sopenharmony_ci const union lp_rast_cmd_arg arg) 111bf215546Sopenharmony_ci{ 112bf215546Sopenharmony_ci union lp_rast_cmd_arg arg2; 113bf215546Sopenharmony_ci arg2.triangle.tri = arg.triangle.tri; 114bf215546Sopenharmony_ci arg2.triangle.plane_mask = (1<<3)-1; 115bf215546Sopenharmony_ci lp_rast_triangle_3(task, arg2); 116bf215546Sopenharmony_ci} 117bf215546Sopenharmony_ci 118bf215546Sopenharmony_civoid 119bf215546Sopenharmony_cilp_rast_triangle_3_4(struct lp_rasterizer_task *task, 120bf215546Sopenharmony_ci const union lp_rast_cmd_arg arg) 121bf215546Sopenharmony_ci{ 122bf215546Sopenharmony_ci lp_rast_triangle_3_16(task, arg); 123bf215546Sopenharmony_ci} 124bf215546Sopenharmony_ci 125bf215546Sopenharmony_civoid 126bf215546Sopenharmony_cilp_rast_triangle_4_16(struct lp_rasterizer_task *task, 127bf215546Sopenharmony_ci const union lp_rast_cmd_arg arg) 128bf215546Sopenharmony_ci{ 129bf215546Sopenharmony_ci union lp_rast_cmd_arg arg2; 130bf215546Sopenharmony_ci arg2.triangle.tri = 
arg.triangle.tri; 131bf215546Sopenharmony_ci arg2.triangle.plane_mask = (1<<4)-1; 132bf215546Sopenharmony_ci lp_rast_triangle_4(task, arg2); 133bf215546Sopenharmony_ci} 134bf215546Sopenharmony_ci 135bf215546Sopenharmony_civoid 136bf215546Sopenharmony_cilp_rast_triangle_ms_3_16(struct lp_rasterizer_task *task, 137bf215546Sopenharmony_ci const union lp_rast_cmd_arg arg) 138bf215546Sopenharmony_ci{ 139bf215546Sopenharmony_ci union lp_rast_cmd_arg arg2; 140bf215546Sopenharmony_ci arg2.triangle.tri = arg.triangle.tri; 141bf215546Sopenharmony_ci arg2.triangle.plane_mask = (1<<3)-1; 142bf215546Sopenharmony_ci lp_rast_triangle_ms_3(task, arg2); 143bf215546Sopenharmony_ci} 144bf215546Sopenharmony_ci 145bf215546Sopenharmony_civoid 146bf215546Sopenharmony_cilp_rast_triangle_ms_3_4(struct lp_rasterizer_task *task, 147bf215546Sopenharmony_ci const union lp_rast_cmd_arg arg) 148bf215546Sopenharmony_ci{ 149bf215546Sopenharmony_ci lp_rast_triangle_ms_3_16(task, arg); 150bf215546Sopenharmony_ci} 151bf215546Sopenharmony_ci 152bf215546Sopenharmony_civoid 153bf215546Sopenharmony_cilp_rast_triangle_ms_4_16(struct lp_rasterizer_task *task, 154bf215546Sopenharmony_ci const union lp_rast_cmd_arg arg) 155bf215546Sopenharmony_ci{ 156bf215546Sopenharmony_ci union lp_rast_cmd_arg arg2; 157bf215546Sopenharmony_ci arg2.triangle.tri = arg.triangle.tri; 158bf215546Sopenharmony_ci arg2.triangle.plane_mask = (1<<4)-1; 159bf215546Sopenharmony_ci lp_rast_triangle_ms_4(task, arg2); 160bf215546Sopenharmony_ci} 161bf215546Sopenharmony_ci 162bf215546Sopenharmony_ci#if defined(PIPE_ARCH_SSE) 163bf215546Sopenharmony_ci 164bf215546Sopenharmony_ci#include <emmintrin.h> 165bf215546Sopenharmony_ci#include "util/u_sse.h" 166bf215546Sopenharmony_ci 167bf215546Sopenharmony_ci 168bf215546Sopenharmony_cistatic inline void 169bf215546Sopenharmony_cibuild_masks_sse(int c, 170bf215546Sopenharmony_ci int cdiff, 171bf215546Sopenharmony_ci int dcdx, 172bf215546Sopenharmony_ci int dcdy, 173bf215546Sopenharmony_ci unsigned 
*outmask, 174bf215546Sopenharmony_ci unsigned *partmask) 175bf215546Sopenharmony_ci{ 176bf215546Sopenharmony_ci __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); 177bf215546Sopenharmony_ci __m128i xdcdy = _mm_set1_epi32(dcdy); 178bf215546Sopenharmony_ci 179bf215546Sopenharmony_ci /* Get values across the quad 180bf215546Sopenharmony_ci */ 181bf215546Sopenharmony_ci __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy); 182bf215546Sopenharmony_ci __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy); 183bf215546Sopenharmony_ci __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy); 184bf215546Sopenharmony_ci 185bf215546Sopenharmony_ci { 186bf215546Sopenharmony_ci __m128i cstep01, cstep23, result; 187bf215546Sopenharmony_ci 188bf215546Sopenharmony_ci cstep01 = _mm_packs_epi32(cstep0, cstep1); 189bf215546Sopenharmony_ci cstep23 = _mm_packs_epi32(cstep2, cstep3); 190bf215546Sopenharmony_ci result = _mm_packs_epi16(cstep01, cstep23); 191bf215546Sopenharmony_ci 192bf215546Sopenharmony_ci *outmask |= _mm_movemask_epi8(result); 193bf215546Sopenharmony_ci } 194bf215546Sopenharmony_ci 195bf215546Sopenharmony_ci 196bf215546Sopenharmony_ci { 197bf215546Sopenharmony_ci __m128i cio4 = _mm_set1_epi32(cdiff); 198bf215546Sopenharmony_ci __m128i cstep01, cstep23, result; 199bf215546Sopenharmony_ci 200bf215546Sopenharmony_ci cstep0 = _mm_add_epi32(cstep0, cio4); 201bf215546Sopenharmony_ci cstep1 = _mm_add_epi32(cstep1, cio4); 202bf215546Sopenharmony_ci cstep2 = _mm_add_epi32(cstep2, cio4); 203bf215546Sopenharmony_ci cstep3 = _mm_add_epi32(cstep3, cio4); 204bf215546Sopenharmony_ci 205bf215546Sopenharmony_ci cstep01 = _mm_packs_epi32(cstep0, cstep1); 206bf215546Sopenharmony_ci cstep23 = _mm_packs_epi32(cstep2, cstep3); 207bf215546Sopenharmony_ci result = _mm_packs_epi16(cstep01, cstep23); 208bf215546Sopenharmony_ci 209bf215546Sopenharmony_ci *partmask |= _mm_movemask_epi8(result); 210bf215546Sopenharmony_ci } 211bf215546Sopenharmony_ci} 212bf215546Sopenharmony_ci 213bf215546Sopenharmony_ci 

/**
 * Build a 16-bit sign mask for a 4x4 grid of linearly stepped values.
 *
 * Bit (4 * row + col) of the result is set when
 * c + col * dcdx + row * dcdy is negative.
 */
static inline unsigned
build_mask_linear_sse(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return _mm_movemask_epi8(result);
}

/**
 * Return the sign bits of four rows of four 32-bit values after adding
 * cdiff to every value.
 *
 * \param cstep  array of four vectors, one per row
 * \param cdiff  offset added to each of the sixteen values
 * \return 16-bit mask; bit (4 * row + lane) is set when the adjusted
 *         value is negative
 */
static inline unsigned
sign_bits4(const __m128i *cstep, int cdiff)
{

   /* Adjust the step values
    */
   __m128i cio4 = _mm_set1_epi32(cdiff);
   __m128i cstep0 = _mm_add_epi32(cstep[0], cio4);
   __m128i cstep1 = _mm_add_epi32(cstep[1], cio4);
   __m128i cstep2 = _mm_add_epi32(cstep[2], cio4);
   __m128i cstep3 = _mm_add_epi32(cstep[3], cio4);

   /* Pack down to epi8
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* Extract the sign bits
    */
   return _mm_movemask_epi8(result);
}

/* Bit sets selecting one column / one row of a 4x4 mask whose bit index
 * is 4 * row + column.
 */
#define COLUMN0 ((1<<0)|(1<<4)|(1<<8) |(1<<12))
#define COLUMN1 ((1<<1)|(1<<5)|(1<<9) |(1<<13))
#define COLUMN2 ((1<<2)|(1<<6)|(1<<10)|(1<<14))
#define COLUMN3 ((1<<3)|(1<<7)|(1<<11)|(1<<15))

#define ROW0 ((1<<0) |(1<<1) |(1<<2) |(1<<3))
#define ROW1 ((1<<4) |(1<<5) |(1<<6) |(1<<7))
#define ROW2 ((1<<8) |(1<<9) |(1<<10)|(1<<11))
#define ROW3 ((1<<12)|(1<<13)|(1<<14)|(1<<15))

#define STAMP_SIZE 4

/* Entry k selects the last k + 1 rows (ROW3 upwards). */
static unsigned bottom_mask_tab[STAMP_SIZE] = {
   ROW3,
   ROW3 | ROW2,
   ROW3 | ROW2 | ROW1,
   ROW3 | ROW2 | ROW1 | ROW0,
};

/* Entry k selects the last k + 1 columns (COLUMN3 leftwards). */
static unsigned right_mask_tab[STAMP_SIZE] = {
   COLUMN3,
   COLUMN3 | COLUMN2,
   COLUMN3 | COLUMN2 | COLUMN1,
   COLUMN3 | COLUMN2 | COLUMN1 | COLUMN0,
};


#define NR_PLANES 3

291bf215546Sopenharmony_civoid 292bf215546Sopenharmony_cilp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, 293bf215546Sopenharmony_ci const union lp_rast_cmd_arg arg) 294bf215546Sopenharmony_ci{ 295bf215546Sopenharmony_ci const struct lp_rast_triangle *tri = arg.triangle.tri; 296bf215546Sopenharmony_ci const struct lp_rast_plane *plane = GET_PLANES(tri); 297bf215546Sopenharmony_ci const int x = (arg.triangle.plane_mask & 0xff) + task->x; 298bf215546Sopenharmony_ci const int y = (arg.triangle.plane_mask >> 8) + task->y; 299bf215546Sopenharmony_ci 300bf215546Sopenharmony_ci struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16]; 301bf215546Sopenharmony_ci unsigned nr = 0; 302bf215546Sopenharmony_ci 303bf215546Sopenharmony_ci /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */ 304bf215546Sopenharmony_ci __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */ 305bf215546Sopenharmony_ci __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); 306bf215546Sopenharmony_ci __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); 307bf215546Sopenharmony_ci __m128i zero = _mm_setzero_si128(); 308bf215546Sopenharmony_ci 309bf215546Sopenharmony_ci __m128i c, dcdx, dcdy, rej4; 310bf215546Sopenharmony_ci __m128i dcdx_neg_mask, dcdy_neg_mask; 311bf215546Sopenharmony_ci __m128i dcdx2, dcdx3; 312bf215546Sopenharmony_ci 313bf215546Sopenharmony_ci __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ 314bf215546Sopenharmony_ci __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ 315bf215546Sopenharmony_ci __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ 316bf215546Sopenharmony_ci __m128i unused; 317bf215546Sopenharmony_ci 318bf215546Sopenharmony_ci transpose4_epi32(&p0, &p1, &p2, &zero, 319bf215546Sopenharmony_ci &c, &unused, &dcdx, &dcdy); 320bf215546Sopenharmony_ci 321bf215546Sopenharmony_ci /* recalc eo - easier than trying to load as scalars / shuffle... 
*/ 322bf215546Sopenharmony_ci dcdx_neg_mask = _mm_srai_epi32(dcdx, 31); 323bf215546Sopenharmony_ci dcdy_neg_mask = _mm_srai_epi32(dcdy, 31); 324bf215546Sopenharmony_ci rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy), 325bf215546Sopenharmony_ci _mm_and_si128(dcdx_neg_mask, dcdx)); 326bf215546Sopenharmony_ci 327bf215546Sopenharmony_ci /* Adjust dcdx; 328bf215546Sopenharmony_ci */ 329bf215546Sopenharmony_ci dcdx = _mm_sub_epi32(zero, dcdx); 330bf215546Sopenharmony_ci 331bf215546Sopenharmony_ci c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x))); 332bf215546Sopenharmony_ci c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y))); 333bf215546Sopenharmony_ci rej4 = _mm_slli_epi32(rej4, 2); 334bf215546Sopenharmony_ci 335bf215546Sopenharmony_ci /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */ 336bf215546Sopenharmony_ci c = _mm_sub_epi32(c, _mm_set1_epi32(1)); 337bf215546Sopenharmony_ci rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1)); 338bf215546Sopenharmony_ci 339bf215546Sopenharmony_ci dcdx2 = _mm_add_epi32(dcdx, dcdx); 340bf215546Sopenharmony_ci dcdx3 = _mm_add_epi32(dcdx2, dcdx); 341bf215546Sopenharmony_ci 342bf215546Sopenharmony_ci transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, 343bf215546Sopenharmony_ci &span_0, &span_1, &span_2, &unused); 344bf215546Sopenharmony_ci 345bf215546Sopenharmony_ci for (unsigned i = 0; i < 4; i++) { 346bf215546Sopenharmony_ci __m128i cx = c; 347bf215546Sopenharmony_ci 348bf215546Sopenharmony_ci for (unsigned j = 0; j < 4; j++) { 349bf215546Sopenharmony_ci __m128i c4rej = _mm_add_epi32(cx, rej4); 350bf215546Sopenharmony_ci __m128i rej_masks = _mm_srai_epi32(c4rej, 31); 351bf215546Sopenharmony_ci 352bf215546Sopenharmony_ci /* if (is_zero(rej_masks)) */ 353bf215546Sopenharmony_ci if (_mm_movemask_epi8(rej_masks) == 0) { 354bf215546Sopenharmony_ci __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0); 355bf215546Sopenharmony_ci __m128i c1_0 = 
_mm_add_epi32(SCALAR_EPI32(cx, 1), span_1); 356bf215546Sopenharmony_ci __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2); 357bf215546Sopenharmony_ci 358bf215546Sopenharmony_ci __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0); 359bf215546Sopenharmony_ci 360bf215546Sopenharmony_ci __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0)); 361bf215546Sopenharmony_ci __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1)); 362bf215546Sopenharmony_ci __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2)); 363bf215546Sopenharmony_ci 364bf215546Sopenharmony_ci __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1); 365bf215546Sopenharmony_ci __m128i c_01 = _mm_packs_epi32(c_0, c_1); 366bf215546Sopenharmony_ci 367bf215546Sopenharmony_ci __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0)); 368bf215546Sopenharmony_ci __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1)); 369bf215546Sopenharmony_ci __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2)); 370bf215546Sopenharmony_ci 371bf215546Sopenharmony_ci __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2); 372bf215546Sopenharmony_ci 373bf215546Sopenharmony_ci __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0)); 374bf215546Sopenharmony_ci __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1)); 375bf215546Sopenharmony_ci __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2)); 376bf215546Sopenharmony_ci 377bf215546Sopenharmony_ci __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3); 378bf215546Sopenharmony_ci __m128i c_23 = _mm_packs_epi32(c_2, c_3); 379bf215546Sopenharmony_ci __m128i c_0123 = _mm_packs_epi16(c_01, c_23); 380bf215546Sopenharmony_ci 381bf215546Sopenharmony_ci unsigned mask = _mm_movemask_epi8(c_0123); 382bf215546Sopenharmony_ci 383bf215546Sopenharmony_ci out[nr].i = i; 384bf215546Sopenharmony_ci out[nr].j = j; 385bf215546Sopenharmony_ci out[nr].mask = mask; 386bf215546Sopenharmony_ci if (mask != 0xffff) 387bf215546Sopenharmony_ci nr++; 
388bf215546Sopenharmony_ci } 389bf215546Sopenharmony_ci cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2)); 390bf215546Sopenharmony_ci } 391bf215546Sopenharmony_ci 392bf215546Sopenharmony_ci c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2)); 393bf215546Sopenharmony_ci } 394bf215546Sopenharmony_ci 395bf215546Sopenharmony_ci for (unsigned i = 0; i < nr; i++) 396bf215546Sopenharmony_ci lp_rast_shade_quads_mask(task, 397bf215546Sopenharmony_ci &tri->inputs, 398bf215546Sopenharmony_ci x + 4 * out[i].j, 399bf215546Sopenharmony_ci y + 4 * out[i].i, 400bf215546Sopenharmony_ci 0xffff & ~out[i].mask); 401bf215546Sopenharmony_ci} 402bf215546Sopenharmony_ci 403bf215546Sopenharmony_civoid 404bf215546Sopenharmony_cilp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, 405bf215546Sopenharmony_ci const union lp_rast_cmd_arg arg) 406bf215546Sopenharmony_ci{ 407bf215546Sopenharmony_ci const struct lp_rast_triangle *tri = arg.triangle.tri; 408bf215546Sopenharmony_ci const struct lp_rast_plane *plane = GET_PLANES(tri); 409bf215546Sopenharmony_ci const unsigned x = (arg.triangle.plane_mask & 0xff) + task->x; 410bf215546Sopenharmony_ci const unsigned y = (arg.triangle.plane_mask >> 8) + task->y; 411bf215546Sopenharmony_ci 412bf215546Sopenharmony_ci /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). 
*/ 413bf215546Sopenharmony_ci __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */ 414bf215546Sopenharmony_ci __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); 415bf215546Sopenharmony_ci __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); 416bf215546Sopenharmony_ci __m128i zero = _mm_setzero_si128(); 417bf215546Sopenharmony_ci 418bf215546Sopenharmony_ci __m128i c, dcdx, dcdy; 419bf215546Sopenharmony_ci __m128i dcdx2, dcdx3; 420bf215546Sopenharmony_ci 421bf215546Sopenharmony_ci __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ 422bf215546Sopenharmony_ci __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ 423bf215546Sopenharmony_ci __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ 424bf215546Sopenharmony_ci __m128i unused; 425bf215546Sopenharmony_ci 426bf215546Sopenharmony_ci transpose4_epi32(&p0, &p1, &p2, &zero, 427bf215546Sopenharmony_ci &c, &unused, &dcdx, &dcdy); 428bf215546Sopenharmony_ci 429bf215546Sopenharmony_ci /* Adjust dcdx; 430bf215546Sopenharmony_ci */ 431bf215546Sopenharmony_ci dcdx = _mm_sub_epi32(zero, dcdx); 432bf215546Sopenharmony_ci 433bf215546Sopenharmony_ci c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x))); 434bf215546Sopenharmony_ci c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y))); 435bf215546Sopenharmony_ci 436bf215546Sopenharmony_ci /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */ 437bf215546Sopenharmony_ci c = _mm_sub_epi32(c, _mm_set1_epi32(1)); 438bf215546Sopenharmony_ci 439bf215546Sopenharmony_ci dcdx2 = _mm_add_epi32(dcdx, dcdx); 440bf215546Sopenharmony_ci dcdx3 = _mm_add_epi32(dcdx2, dcdx); 441bf215546Sopenharmony_ci 442bf215546Sopenharmony_ci transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, 443bf215546Sopenharmony_ci &span_0, &span_1, &span_2, &unused); 444bf215546Sopenharmony_ci 445bf215546Sopenharmony_ci 446bf215546Sopenharmony_ci { 447bf215546Sopenharmony_ci __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), 
span_0); 448bf215546Sopenharmony_ci __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1); 449bf215546Sopenharmony_ci __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2); 450bf215546Sopenharmony_ci 451bf215546Sopenharmony_ci __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0); 452bf215546Sopenharmony_ci 453bf215546Sopenharmony_ci __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0)); 454bf215546Sopenharmony_ci __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1)); 455bf215546Sopenharmony_ci __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2)); 456bf215546Sopenharmony_ci 457bf215546Sopenharmony_ci __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1); 458bf215546Sopenharmony_ci __m128i c_01 = _mm_packs_epi32(c_0, c_1); 459bf215546Sopenharmony_ci 460bf215546Sopenharmony_ci __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0)); 461bf215546Sopenharmony_ci __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1)); 462bf215546Sopenharmony_ci __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2)); 463bf215546Sopenharmony_ci 464bf215546Sopenharmony_ci __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2); 465bf215546Sopenharmony_ci 466bf215546Sopenharmony_ci __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0)); 467bf215546Sopenharmony_ci __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1)); 468bf215546Sopenharmony_ci __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2)); 469bf215546Sopenharmony_ci 470bf215546Sopenharmony_ci __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3); 471bf215546Sopenharmony_ci __m128i c_23 = _mm_packs_epi32(c_2, c_3); 472bf215546Sopenharmony_ci __m128i c_0123 = _mm_packs_epi16(c_01, c_23); 473bf215546Sopenharmony_ci 474bf215546Sopenharmony_ci unsigned mask = _mm_movemask_epi8(c_0123); 475bf215546Sopenharmony_ci 476bf215546Sopenharmony_ci if (mask != 0xffff) 477bf215546Sopenharmony_ci lp_rast_shade_quads_mask(task, 478bf215546Sopenharmony_ci &tri->inputs, 479bf215546Sopenharmony_ci 
x, 480bf215546Sopenharmony_ci y, 481bf215546Sopenharmony_ci 0xffff & ~mask); 482bf215546Sopenharmony_ci } 483bf215546Sopenharmony_ci} 484bf215546Sopenharmony_ci 485bf215546Sopenharmony_ci#undef NR_PLANES 486bf215546Sopenharmony_ci 487bf215546Sopenharmony_ci#else 488bf215546Sopenharmony_ci 489bf215546Sopenharmony_ci#if defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN 490bf215546Sopenharmony_ci 491bf215546Sopenharmony_ci#include <altivec.h> 492bf215546Sopenharmony_ci#include "util/u_pwr8.h" 493bf215546Sopenharmony_ci 494bf215546Sopenharmony_cistatic inline void 495bf215546Sopenharmony_cibuild_masks_ppc(int c, 496bf215546Sopenharmony_ci int cdiff, 497bf215546Sopenharmony_ci int dcdx, 498bf215546Sopenharmony_ci int dcdy, 499bf215546Sopenharmony_ci unsigned *outmask, 500bf215546Sopenharmony_ci unsigned *partmask) 501bf215546Sopenharmony_ci{ 502bf215546Sopenharmony_ci __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); 503bf215546Sopenharmony_ci __m128i xdcdy = (__m128i) vec_splats(dcdy); 504bf215546Sopenharmony_ci 505bf215546Sopenharmony_ci /* Get values across the quad 506bf215546Sopenharmony_ci */ 507bf215546Sopenharmony_ci __m128i cstep1 = vec_add_epi32(cstep0, xdcdy); 508bf215546Sopenharmony_ci __m128i cstep2 = vec_add_epi32(cstep1, xdcdy); 509bf215546Sopenharmony_ci __m128i cstep3 = vec_add_epi32(cstep2, xdcdy); 510bf215546Sopenharmony_ci 511bf215546Sopenharmony_ci { 512bf215546Sopenharmony_ci __m128i cstep01, cstep23, result; 513bf215546Sopenharmony_ci 514bf215546Sopenharmony_ci cstep01 = vec_packs_epi32(cstep0, cstep1); 515bf215546Sopenharmony_ci cstep23 = vec_packs_epi32(cstep2, cstep3); 516bf215546Sopenharmony_ci result = vec_packs_epi16(cstep01, cstep23); 517bf215546Sopenharmony_ci 518bf215546Sopenharmony_ci *outmask |= vec_movemask_epi8(result); 519bf215546Sopenharmony_ci } 520bf215546Sopenharmony_ci 521bf215546Sopenharmony_ci 522bf215546Sopenharmony_ci { 523bf215546Sopenharmony_ci __m128i cio4 = (__m128i) vec_splats(cdiff); 
524bf215546Sopenharmony_ci __m128i cstep01, cstep23, result; 525bf215546Sopenharmony_ci 526bf215546Sopenharmony_ci cstep0 = vec_add_epi32(cstep0, cio4); 527bf215546Sopenharmony_ci cstep1 = vec_add_epi32(cstep1, cio4); 528bf215546Sopenharmony_ci cstep2 = vec_add_epi32(cstep2, cio4); 529bf215546Sopenharmony_ci cstep3 = vec_add_epi32(cstep3, cio4); 530bf215546Sopenharmony_ci 531bf215546Sopenharmony_ci cstep01 = vec_packs_epi32(cstep0, cstep1); 532bf215546Sopenharmony_ci cstep23 = vec_packs_epi32(cstep2, cstep3); 533bf215546Sopenharmony_ci result = vec_packs_epi16(cstep01, cstep23); 534bf215546Sopenharmony_ci 535bf215546Sopenharmony_ci *partmask |= vec_movemask_epi8(result); 536bf215546Sopenharmony_ci } 537bf215546Sopenharmony_ci} 538bf215546Sopenharmony_ci 539bf215546Sopenharmony_cistatic inline unsigned 540bf215546Sopenharmony_cibuild_mask_linear_ppc(int c, int dcdx, int dcdy) 541bf215546Sopenharmony_ci{ 542bf215546Sopenharmony_ci __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); 543bf215546Sopenharmony_ci __m128i xdcdy = (__m128i) vec_splats(dcdy); 544bf215546Sopenharmony_ci 545bf215546Sopenharmony_ci /* Get values across the quad 546bf215546Sopenharmony_ci */ 547bf215546Sopenharmony_ci __m128i cstep1 = vec_add_epi32(cstep0, xdcdy); 548bf215546Sopenharmony_ci __m128i cstep2 = vec_add_epi32(cstep1, xdcdy); 549bf215546Sopenharmony_ci __m128i cstep3 = vec_add_epi32(cstep2, xdcdy); 550bf215546Sopenharmony_ci 551bf215546Sopenharmony_ci /* pack pairs of results into epi16 552bf215546Sopenharmony_ci */ 553bf215546Sopenharmony_ci __m128i cstep01 = vec_packs_epi32(cstep0, cstep1); 554bf215546Sopenharmony_ci __m128i cstep23 = vec_packs_epi32(cstep2, cstep3); 555bf215546Sopenharmony_ci 556bf215546Sopenharmony_ci /* pack into epi8, preserving sign bits 557bf215546Sopenharmony_ci */ 558bf215546Sopenharmony_ci __m128i result = vec_packs_epi16(cstep01, cstep23); 559bf215546Sopenharmony_ci 560bf215546Sopenharmony_ci /* extract sign bits to create mask 
561bf215546Sopenharmony_ci */ 562bf215546Sopenharmony_ci return vec_movemask_epi8(result); 563bf215546Sopenharmony_ci} 564bf215546Sopenharmony_ci 565bf215546Sopenharmony_cistatic inline __m128i 566bf215546Sopenharmony_cilp_plane_to_m128i(const struct lp_rast_plane *plane) 567bf215546Sopenharmony_ci{ 568bf215546Sopenharmony_ci return vec_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx, 569bf215546Sopenharmony_ci (int32_t)plane->dcdy, (int32_t)plane->eo); 570bf215546Sopenharmony_ci} 571bf215546Sopenharmony_ci 572bf215546Sopenharmony_ci#define NR_PLANES 3 573bf215546Sopenharmony_ci 574bf215546Sopenharmony_civoid 575bf215546Sopenharmony_cilp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, 576bf215546Sopenharmony_ci const union lp_rast_cmd_arg arg) 577bf215546Sopenharmony_ci{ 578bf215546Sopenharmony_ci const struct lp_rast_triangle *tri = arg.triangle.tri; 579bf215546Sopenharmony_ci const struct lp_rast_plane *plane = GET_PLANES(tri); 580bf215546Sopenharmony_ci const int x = (arg.triangle.plane_mask & 0xff) + task->x; 581bf215546Sopenharmony_ci const int y = (arg.triangle.plane_mask >> 8) + task->y; 582bf215546Sopenharmony_ci 583bf215546Sopenharmony_ci struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16]; 584bf215546Sopenharmony_ci unsigned nr = 0; 585bf215546Sopenharmony_ci 586bf215546Sopenharmony_ci __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */ 587bf215546Sopenharmony_ci __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */ 588bf215546Sopenharmony_ci __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */ 589bf215546Sopenharmony_ci __m128i zero = vec_splats((unsigned char) 0); 590bf215546Sopenharmony_ci 591bf215546Sopenharmony_ci __m128i c; 592bf215546Sopenharmony_ci __m128i dcdx; 593bf215546Sopenharmony_ci __m128i dcdy; 594bf215546Sopenharmony_ci __m128i rej4; 595bf215546Sopenharmony_ci 596bf215546Sopenharmony_ci __m128i dcdx2; 597bf215546Sopenharmony_ci __m128i dcdx3; 598bf215546Sopenharmony_ci 
599bf215546Sopenharmony_ci __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ 600bf215546Sopenharmony_ci __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ 601bf215546Sopenharmony_ci __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ 602bf215546Sopenharmony_ci __m128i unused; 603bf215546Sopenharmony_ci 604bf215546Sopenharmony_ci __m128i vshuf_mask0; 605bf215546Sopenharmony_ci __m128i vshuf_mask1; 606bf215546Sopenharmony_ci __m128i vshuf_mask2; 607bf215546Sopenharmony_ci 608bf215546Sopenharmony_ci#if UTIL_ARCH_LITTLE_ENDIAN 609bf215546Sopenharmony_ci vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100); 610bf215546Sopenharmony_ci vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504); 611bf215546Sopenharmony_ci vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908); 612bf215546Sopenharmony_ci#else 613bf215546Sopenharmony_ci vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F); 614bf215546Sopenharmony_ci vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B); 615bf215546Sopenharmony_ci vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607); 616bf215546Sopenharmony_ci#endif 617bf215546Sopenharmony_ci 618bf215546Sopenharmony_ci transpose4_epi32(&p0, &p1, &p2, &zero, 619bf215546Sopenharmony_ci &c, &dcdx, &dcdy, &rej4); 620bf215546Sopenharmony_ci 621bf215546Sopenharmony_ci /* Adjust dcdx; 622bf215546Sopenharmony_ci */ 623bf215546Sopenharmony_ci dcdx = vec_sub_epi32(zero, dcdx); 624bf215546Sopenharmony_ci 625bf215546Sopenharmony_ci c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x))); 626bf215546Sopenharmony_ci c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y))); 627bf215546Sopenharmony_ci rej4 = vec_slli_epi32(rej4, 2); 628bf215546Sopenharmony_ci 629bf215546Sopenharmony_ci /* 630bf215546Sopenharmony_ci * Adjust so we can just check the sign bit (< 0 comparison), 631bf215546Sopenharmony_ci * instead of having to do a less efficient <= 0 comparison 632bf215546Sopenharmony_ci */ 
633bf215546Sopenharmony_ci c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1)); 634bf215546Sopenharmony_ci rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1)); 635bf215546Sopenharmony_ci 636bf215546Sopenharmony_ci dcdx2 = vec_add_epi32(dcdx, dcdx); 637bf215546Sopenharmony_ci dcdx3 = vec_add_epi32(dcdx2, dcdx); 638bf215546Sopenharmony_ci 639bf215546Sopenharmony_ci transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, 640bf215546Sopenharmony_ci &span_0, &span_1, &span_2, &unused); 641bf215546Sopenharmony_ci 642bf215546Sopenharmony_ci for (unsigned i = 0; i < 4; i++) { 643bf215546Sopenharmony_ci __m128i cx = c; 644bf215546Sopenharmony_ci 645bf215546Sopenharmony_ci for (unsigned j = 0; j < 4; j++) { 646bf215546Sopenharmony_ci __m128i c4rej = vec_add_epi32(cx, rej4); 647bf215546Sopenharmony_ci __m128i rej_masks = vec_srai_epi32(c4rej, 31); 648bf215546Sopenharmony_ci 649bf215546Sopenharmony_ci /* if (is_zero(rej_masks)) */ 650bf215546Sopenharmony_ci if (vec_movemask_epi8(rej_masks) == 0) { 651bf215546Sopenharmony_ci __m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0); 652bf215546Sopenharmony_ci __m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1); 653bf215546Sopenharmony_ci __m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2); 654bf215546Sopenharmony_ci 655bf215546Sopenharmony_ci __m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0); 656bf215546Sopenharmony_ci 657bf215546Sopenharmony_ci __m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0)); 658bf215546Sopenharmony_ci __m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1)); 659bf215546Sopenharmony_ci __m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2)); 660bf215546Sopenharmony_ci 661bf215546Sopenharmony_ci __m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1); 662bf215546Sopenharmony_ci __m128i c_01 = vec_packs_epi32(c_0, c_1); 663bf215546Sopenharmony_ci 664bf215546Sopenharmony_ci __m128i c0_2 = vec_add_epi32(c0_1, 
vec_perm(dcdy, dcdy, vshuf_mask0)); 665bf215546Sopenharmony_ci __m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1)); 666bf215546Sopenharmony_ci __m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2)); 667bf215546Sopenharmony_ci 668bf215546Sopenharmony_ci __m128i c_2 = vec_or(vec_or(c0_2, c1_2), c2_2); 669bf215546Sopenharmony_ci 670bf215546Sopenharmony_ci __m128i c0_3 = vec_add_epi32(c0_2, vec_perm(dcdy, dcdy, vshuf_mask0)); 671bf215546Sopenharmony_ci __m128i c1_3 = vec_add_epi32(c1_2, vec_perm(dcdy, dcdy, vshuf_mask1)); 672bf215546Sopenharmony_ci __m128i c2_3 = vec_add_epi32(c2_2, vec_perm(dcdy, dcdy, vshuf_mask2)); 673bf215546Sopenharmony_ci 674bf215546Sopenharmony_ci __m128i c_3 = vec_or(vec_or(c0_3, c1_3), c2_3); 675bf215546Sopenharmony_ci __m128i c_23 = vec_packs_epi32(c_2, c_3); 676bf215546Sopenharmony_ci __m128i c_0123 = vec_packs_epi16(c_01, c_23); 677bf215546Sopenharmony_ci 678bf215546Sopenharmony_ci unsigned mask = vec_movemask_epi8(c_0123); 679bf215546Sopenharmony_ci 680bf215546Sopenharmony_ci out[nr].i = i; 681bf215546Sopenharmony_ci out[nr].j = j; 682bf215546Sopenharmony_ci out[nr].mask = mask; 683bf215546Sopenharmony_ci if (mask != 0xffff) 684bf215546Sopenharmony_ci nr++; 685bf215546Sopenharmony_ci } 686bf215546Sopenharmony_ci cx = vec_add_epi32(cx, vec_slli_epi32(dcdx, 2)); 687bf215546Sopenharmony_ci } 688bf215546Sopenharmony_ci 689bf215546Sopenharmony_ci c = vec_add_epi32(c, vec_slli_epi32(dcdy, 2)); 690bf215546Sopenharmony_ci } 691bf215546Sopenharmony_ci 692bf215546Sopenharmony_ci for (unsigned i = 0; i < nr; i++) 693bf215546Sopenharmony_ci lp_rast_shade_quads_mask(task, 694bf215546Sopenharmony_ci &tri->inputs, 695bf215546Sopenharmony_ci x + 4 * out[i].j, 696bf215546Sopenharmony_ci y + 4 * out[i].i, 697bf215546Sopenharmony_ci 0xffff & ~out[i].mask); 698bf215546Sopenharmony_ci} 699bf215546Sopenharmony_ci 700bf215546Sopenharmony_ci#undef NR_PLANES 701bf215546Sopenharmony_ci 702bf215546Sopenharmony_ci#else 
703bf215546Sopenharmony_ci 704bf215546Sopenharmony_civoid 705bf215546Sopenharmony_cilp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, 706bf215546Sopenharmony_ci const union lp_rast_cmd_arg arg) 707bf215546Sopenharmony_ci{ 708bf215546Sopenharmony_ci union lp_rast_cmd_arg arg2; 709bf215546Sopenharmony_ci arg2.triangle.tri = arg.triangle.tri; 710bf215546Sopenharmony_ci arg2.triangle.plane_mask = (1<<3)-1; 711bf215546Sopenharmony_ci lp_rast_triangle_32_3(task, arg2); 712bf215546Sopenharmony_ci} 713bf215546Sopenharmony_ci 714bf215546Sopenharmony_ci#endif /* _ARCH_PWR8 && UTIL_ARCH_LITTLE_ENDIAN */ 715bf215546Sopenharmony_ci 716bf215546Sopenharmony_civoid 717bf215546Sopenharmony_cilp_rast_triangle_32_4_16(struct lp_rasterizer_task *task, 718bf215546Sopenharmony_ci const union lp_rast_cmd_arg arg) 719bf215546Sopenharmony_ci{ 720bf215546Sopenharmony_ci union lp_rast_cmd_arg arg2; 721bf215546Sopenharmony_ci arg2.triangle.tri = arg.triangle.tri; 722bf215546Sopenharmony_ci arg2.triangle.plane_mask = (1<<4)-1; 723bf215546Sopenharmony_ci lp_rast_triangle_32_4(task, arg2); 724bf215546Sopenharmony_ci} 725bf215546Sopenharmony_ci 726bf215546Sopenharmony_civoid 727bf215546Sopenharmony_cilp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, 728bf215546Sopenharmony_ci const union lp_rast_cmd_arg arg) 729bf215546Sopenharmony_ci{ 730bf215546Sopenharmony_ci lp_rast_triangle_32_3_16(task, arg); 731bf215546Sopenharmony_ci} 732bf215546Sopenharmony_ci 733bf215546Sopenharmony_ci#endif 734bf215546Sopenharmony_ci 735bf215546Sopenharmony_ci#if defined PIPE_ARCH_SSE 736bf215546Sopenharmony_ci#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_sse((int)c, (int)cdiff, dcdx, dcdy, omask, pmask) 737bf215546Sopenharmony_ci#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_sse((int)c, dcdx, dcdy) 738bf215546Sopenharmony_ci#elif (defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN) 739bf215546Sopenharmony_ci#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) 
build_masks_ppc((int)c, (int)cdiff, dcdx, dcdy, omask, pmask) 740bf215546Sopenharmony_ci#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_ppc((int)c, dcdx, dcdy) 741bf215546Sopenharmony_ci#else 742bf215546Sopenharmony_ci#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask) 743bf215546Sopenharmony_ci#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy) 744bf215546Sopenharmony_ci#endif 745bf215546Sopenharmony_ci 746bf215546Sopenharmony_ci#define RASTER_64 1 747bf215546Sopenharmony_ci 748bf215546Sopenharmony_ci#define TAG(x) x##_1 749bf215546Sopenharmony_ci#define NR_PLANES 1 750bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 751bf215546Sopenharmony_ci 752bf215546Sopenharmony_ci#define TAG(x) x##_2 753bf215546Sopenharmony_ci#define NR_PLANES 2 754bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 755bf215546Sopenharmony_ci 756bf215546Sopenharmony_ci#define TAG(x) x##_3 757bf215546Sopenharmony_ci#define NR_PLANES 3 758bf215546Sopenharmony_ci/*#define TRI_4 lp_rast_triangle_3_4*/ 759bf215546Sopenharmony_ci/*#define TRI_16 lp_rast_triangle_3_16*/ 760bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 761bf215546Sopenharmony_ci 762bf215546Sopenharmony_ci#define TAG(x) x##_4 763bf215546Sopenharmony_ci#define NR_PLANES 4 764bf215546Sopenharmony_ci/*#define TRI_16 lp_rast_triangle_4_16*/ 765bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 766bf215546Sopenharmony_ci 767bf215546Sopenharmony_ci#define TAG(x) x##_5 768bf215546Sopenharmony_ci#define NR_PLANES 5 769bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 770bf215546Sopenharmony_ci 771bf215546Sopenharmony_ci#define TAG(x) x##_6 772bf215546Sopenharmony_ci#define NR_PLANES 6 773bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 774bf215546Sopenharmony_ci 775bf215546Sopenharmony_ci#define TAG(x) x##_7 776bf215546Sopenharmony_ci#define NR_PLANES 7 777bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 778bf215546Sopenharmony_ci 
779bf215546Sopenharmony_ci#define TAG(x) x##_8 780bf215546Sopenharmony_ci#define NR_PLANES 8 781bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 782bf215546Sopenharmony_ci 783bf215546Sopenharmony_ci#undef RASTER_64 784bf215546Sopenharmony_ci 785bf215546Sopenharmony_ci#define TAG(x) x##_32_1 786bf215546Sopenharmony_ci#define NR_PLANES 1 787bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 788bf215546Sopenharmony_ci 789bf215546Sopenharmony_ci#define TAG(x) x##_32_2 790bf215546Sopenharmony_ci#define NR_PLANES 2 791bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 792bf215546Sopenharmony_ci 793bf215546Sopenharmony_ci#define TAG(x) x##_32_3 794bf215546Sopenharmony_ci#define NR_PLANES 3 795bf215546Sopenharmony_ci/*#define TRI_4 lp_rast_triangle_3_4*/ 796bf215546Sopenharmony_ci/*#define TRI_16 lp_rast_triangle_3_16*/ 797bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 798bf215546Sopenharmony_ci 799bf215546Sopenharmony_ci#define TAG(x) x##_32_4 800bf215546Sopenharmony_ci#define NR_PLANES 4 801bf215546Sopenharmony_ci#ifdef PIPE_ARCH_SSE 802bf215546Sopenharmony_ci#define TRI_16 lp_rast_triangle_32_4_16 803bf215546Sopenharmony_ci#endif 804bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 805bf215546Sopenharmony_ci 806bf215546Sopenharmony_ci#define TAG(x) x##_32_5 807bf215546Sopenharmony_ci#define NR_PLANES 5 808bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 809bf215546Sopenharmony_ci 810bf215546Sopenharmony_ci#define TAG(x) x##_32_6 811bf215546Sopenharmony_ci#define NR_PLANES 6 812bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 813bf215546Sopenharmony_ci 814bf215546Sopenharmony_ci#define TAG(x) x##_32_7 815bf215546Sopenharmony_ci#define NR_PLANES 7 816bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 817bf215546Sopenharmony_ci 818bf215546Sopenharmony_ci#define TAG(x) x##_32_8 819bf215546Sopenharmony_ci#define NR_PLANES 8 820bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 821bf215546Sopenharmony_ci 822bf215546Sopenharmony_ci#define MULTISAMPLE 1 
823bf215546Sopenharmony_ci#define RASTER_64 1 824bf215546Sopenharmony_ci 825bf215546Sopenharmony_ci#define TAG(x) x##_ms_1 826bf215546Sopenharmony_ci#define NR_PLANES 1 827bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 828bf215546Sopenharmony_ci 829bf215546Sopenharmony_ci#define TAG(x) x##_ms_2 830bf215546Sopenharmony_ci#define NR_PLANES 2 831bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 832bf215546Sopenharmony_ci 833bf215546Sopenharmony_ci#define TAG(x) x##_ms_3 834bf215546Sopenharmony_ci#define NR_PLANES 3 835bf215546Sopenharmony_ci/*#define TRI_4 lp_rast_triangle_3_4*/ 836bf215546Sopenharmony_ci/*#define TRI_16 lp_rast_triangle_3_16*/ 837bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 838bf215546Sopenharmony_ci 839bf215546Sopenharmony_ci#define TAG(x) x##_ms_4 840bf215546Sopenharmony_ci#define NR_PLANES 4 841bf215546Sopenharmony_ci/*#define TRI_16 lp_rast_triangle_4_16*/ 842bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 843bf215546Sopenharmony_ci 844bf215546Sopenharmony_ci#define TAG(x) x##_ms_5 845bf215546Sopenharmony_ci#define NR_PLANES 5 846bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 847bf215546Sopenharmony_ci 848bf215546Sopenharmony_ci#define TAG(x) x##_ms_6 849bf215546Sopenharmony_ci#define NR_PLANES 6 850bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 851bf215546Sopenharmony_ci 852bf215546Sopenharmony_ci#define TAG(x) x##_ms_7 853bf215546Sopenharmony_ci#define NR_PLANES 7 854bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 855bf215546Sopenharmony_ci 856bf215546Sopenharmony_ci#define TAG(x) x##_ms_8 857bf215546Sopenharmony_ci#define NR_PLANES 8 858bf215546Sopenharmony_ci#include "lp_rast_tri_tmp.h" 859bf215546Sopenharmony_ci 860bf215546Sopenharmony_ci#undef RASTER_64 861