/**************************************************************************
 *
 * Copyright 2007-2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/*
 * Rasterization for binned triangles within a tile
 */

#include <limits.h>
#include "util/u_math.h"
#include "lp_debug.h"
#include "lp_perf.h"
#include "lp_rast_priv.h"


/**
 * Shade all pixels in a 4x4 block.
 */
static void
block_full_4(struct lp_rasterizer_task *task,
             const struct lp_rast_triangle *tri,
             int x, int y)
{
   lp_rast_shade_quads_all(task, &tri->inputs, x, y);
}


/**
 * Shade all pixels in a 16x16 block.
 */
static void
block_full_16(struct lp_rasterizer_task *task,
              const struct lp_rast_triangle *tri,
              int x, int y)
{
   assert(x % 16 == 0);
   assert(y % 16 == 0);
   for (unsigned iy = 0; iy < 16; iy += 4)
      for (unsigned ix = 0; ix < 16; ix += 4)
         block_full_4(task, tri, x + ix, y + iy);
}

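/**
 * Evaluate a plane across a 4x4 block of pixels and return a 16-bit
 * mask with bit (iy*4 + ix) set wherever
 *
 *    c + ix*dcdx + iy*dcdy < 0,
 *
 * i.e. wherever pixel (ix, iy) lies on the negative side of the plane.
 */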
static inline unsigned
build_mask_linear(int32_t c, int32_t dcdx, int32_t dcdy)
{
   unsigned mask = 0;

   int32_t c0 = c;
   int32_t c1 = c0 + dcdy;
   int32_t c2 = c1 + dcdy;
   int32_t c3 = c2 + dcdy;

   mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
   mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
   mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
   mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
   mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
   mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
   mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
   mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);
   mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
   mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
   mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
   mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
   mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
   mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
   mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
   mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);

   return mask;
}

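/**
 * Build both classification masks for a 4x4 grid in one pass:
 * *outmask accumulates the sign bits of the plane evaluated at c,
 * *partmask the sign bits of the same grid offset by cdiff, letting
 * the caller tell fully-outside blocks apart from partially covered
 * ones.
 */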
static inline void
build_masks(int32_t c,
            int32_t cdiff,
            int32_t dcdx,
            int32_t dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   *outmask |= build_mask_linear(c, dcdx, dcdy);
   *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
}

void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_3(task, arg2);
}

void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_3_16(task, arg);
}

void
lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_4(task, arg2);
}

void
lp_rast_triangle_ms_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_ms_3(task, arg2);
}

void
lp_rast_triangle_ms_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_ms_3_16(task, arg);
}

void
lp_rast_triangle_ms_4_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_ms_4(task, arg2);
}

#if defined(PIPE_ARCH_SSE)

#include <emmintrin.h>
#include "util/u_sse.h"


static inline void
build_masks_sse(int c,
                int cdiff,
                int dcdx,
                int dcdy,
                unsigned *outmask,
                unsigned *partmask)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *outmask |= _mm_movemask_epi8(result);
   }


   {
      __m128i cio4 = _mm_set1_epi32(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = _mm_add_epi32(cstep0, cio4);
      cstep1 = _mm_add_epi32(cstep1, cio4);
      cstep2 = _mm_add_epi32(cstep2, cio4);
      cstep3 = _mm_add_epi32(cstep3, cio4);

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *partmask |= _mm_movemask_epi8(result);
   }
}


static inline unsigned
build_mask_linear_sse(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return _mm_movemask_epi8(result);
}

static inline unsigned
sign_bits4(const __m128i *cstep, int cdiff)
{

   /* Adjust the step values
    */
   __m128i cio4 = _mm_set1_epi32(cdiff);
   __m128i cstep0 = _mm_add_epi32(cstep[0], cio4);
   __m128i cstep1 = _mm_add_epi32(cstep[1], cio4);
   __m128i cstep2 = _mm_add_epi32(cstep[2], cio4);
   __m128i cstep3 = _mm_add_epi32(cstep[3], cio4);

   /* Pack down to epi8
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* Extract the sign bits
    */
   return _mm_movemask_epi8(result);
}

#define COLUMN0 ((1<<0)|(1<<4)|(1<<8) |(1<<12))
#define COLUMN1 ((1<<1)|(1<<5)|(1<<9) |(1<<13))
#define COLUMN2 ((1<<2)|(1<<6)|(1<<10)|(1<<14))
#define COLUMN3 ((1<<3)|(1<<7)|(1<<11)|(1<<15))

#define ROW0 ((1<<0) |(1<<1) |(1<<2) |(1<<3))
#define ROW1 ((1<<4) |(1<<5) |(1<<6) |(1<<7))
#define ROW2 ((1<<8) |(1<<9) |(1<<10)|(1<<11))
#define ROW3 ((1<<12)|(1<<13)|(1<<14)|(1<<15))

#define STAMP_SIZE 4
static unsigned bottom_mask_tab[STAMP_SIZE] = {
   ROW3,
   ROW3 | ROW2,
   ROW3 | ROW2 | ROW1,
   ROW3 | ROW2 | ROW1 | ROW0,
};

static unsigned right_mask_tab[STAMP_SIZE] = {
   COLUMN3,
   COLUMN3 | COLUMN2,
   COLUMN3 | COLUMN2 | COLUMN1,
   COLUMN3 | COLUMN2 | COLUMN1 | COLUMN0,
};


#define NR_PLANES 3

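/**
 * Rasterize a triangle with 3 planes (32-bit arithmetic) within a
 * 16x16 block, processing all three planes at once with SSE.
 *
 * The block's offset within the tile is packed into
 * arg.triangle.plane_mask (low byte = x, next byte = y).  The plane
 * data is transposed into SoA form so that c, dcdx and dcdy each hold
 * one value per plane; the block is then walked as a 4x4 grid of
 * 4x4-pixel quads, with rej4 giving a trivial-reject test per quad,
 * and the surviving partial coverage masks are shaded at the end.
 */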
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   const int x = (arg.triangle.plane_mask & 0xff) + task->x;
   const int y = (arg.triangle.plane_mask >> 8) + task->y;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
   __m128i zero = _mm_setzero_si128();

   __m128i c, dcdx, dcdy, rej4;
   __m128i dcdx_neg_mask, dcdy_neg_mask;
   __m128i dcdx2, dcdx3;

   __m128i span_0;   /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;   /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;   /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &unused, &dcdx, &dcdy);

   /* recalc eo - easier than trying to load as scalars / shuffle... */
   dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
   dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
   rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
                        _mm_and_si128(dcdx_neg_mask, dcdx));

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
   rej4 = _mm_slli_epi32(rej4, 2);

   /*
    * Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (unsigned i = 0; i < 4; i++) {
      __m128i cx = c;

      for (unsigned j = 0; j < 4; j++) {
         __m128i c4rej = _mm_add_epi32(cx, rej4);
         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (_mm_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);

            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
            __m128i c_01 = _mm_packs_epi32(c_0, c_1);

            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

            unsigned mask = _mm_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
      }

      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
   }

   for (unsigned i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}

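/**
 * Single 4x4 block variant of the above: evaluate all three planes at
 * the 16 pixel positions in one pass and issue at most one masked
 * shade call.
 */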
void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   const unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
   const unsigned y = (arg.triangle.plane_mask >> 8) + task->y;

   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
   __m128i zero = _mm_setzero_si128();

   __m128i c, dcdx, dcdy;
   __m128i dcdx2, dcdx3;

   __m128i span_0;   /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;   /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;   /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &unused, &dcdx, &dcdy);

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   /*
    * Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);


   {
      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);

      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
      __m128i c_01 = _mm_packs_epi32(c_0, c_1);

      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

      unsigned mask = _mm_movemask_epi8(c_0123);

      if (mask != 0xffff)
         lp_rast_shade_quads_mask(task,
                                  &tri->inputs,
                                  x,
                                  y,
                                  0xffff & ~mask);
   }
}

#undef NR_PLANES

#else

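/*
 * No SSE.  On little-endian POWER8 the wrappers in util/u_pwr8.h
 * provide SSE-style vector operations, so the specialized 3-plane
 * paths can be mirrored below; otherwise the generic versions built
 * from lp_rast_tri_tmp.h are used instead.
 */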
#if defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN

#include <altivec.h>
#include "util/u_pwr8.h"

static inline void
build_masks_ppc(int c,
                int cdiff,
                int dcdx,
                int dcdy,
                unsigned *outmask,
                unsigned *partmask)
{
   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = (__m128i) vec_splats(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = vec_packs_epi32(cstep0, cstep1);
      cstep23 = vec_packs_epi32(cstep2, cstep3);
      result = vec_packs_epi16(cstep01, cstep23);

      *outmask |= vec_movemask_epi8(result);
   }


   {
      __m128i cio4 = (__m128i) vec_splats(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = vec_add_epi32(cstep0, cio4);
      cstep1 = vec_add_epi32(cstep1, cio4);
      cstep2 = vec_add_epi32(cstep2, cio4);
      cstep3 = vec_add_epi32(cstep3, cio4);

      cstep01 = vec_packs_epi32(cstep0, cstep1);
      cstep23 = vec_packs_epi32(cstep2, cstep3);
      result = vec_packs_epi16(cstep01, cstep23);

      *partmask |= vec_movemask_epi8(result);
   }
}

static inline unsigned
build_mask_linear_ppc(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = (__m128i) vec_splats(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = vec_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = vec_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = vec_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return vec_movemask_epi8(result);
}

static inline __m128i
lp_plane_to_m128i(const struct lp_rast_plane *plane)
{
   return vec_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
                         (int32_t)plane->dcdy, (int32_t)plane->eo);
}

#define NR_PLANES 3

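/**
 * POWER8 version of lp_rast_triangle_32_3_16: same structure as the
 * SSE path, with vec_perm and the vshuf masks used to broadcast each
 * plane's scalar value across a vector.
 */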
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   const int x = (arg.triangle.plane_mask & 0xff) + task->x;
   const int y = (arg.triangle.plane_mask >> 8) + task->y;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = vec_splats((unsigned char) 0);

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i rej4;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;   /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;   /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;   /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   __m128i vshuf_mask0;
   __m128i vshuf_mask1;
   __m128i vshuf_mask2;

#if UTIL_ARCH_LITTLE_ENDIAN
   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100);
   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504);
   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908);
#else
   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F);
   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B);
   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607);
#endif

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &rej4);

   /* Adjust dcdx;
    */
   dcdx = vec_sub_epi32(zero, dcdx);

   c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x)));
   c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y)));
   rej4 = vec_slli_epi32(rej4, 2);

   /*
    * Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison
    */
   c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1));
   rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1));

   dcdx2 = vec_add_epi32(dcdx, dcdx);
   dcdx3 = vec_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (unsigned i = 0; i < 4; i++) {
      __m128i cx = c;

      for (unsigned j = 0; j < 4; j++) {
         __m128i c4rej = vec_add_epi32(cx, rej4);
         __m128i rej_masks = vec_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (vec_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0);
            __m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1);
            __m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2);

            __m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0);

            __m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1);
            __m128i c_01 = vec_packs_epi32(c_0, c_1);

            __m128i c0_2 = vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_2 = vec_or(vec_or(c0_2, c1_2), c2_2);

            __m128i c0_3 = vec_add_epi32(c0_2, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_3 = vec_add_epi32(c1_2, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_3 = vec_add_epi32(c2_2, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_3 = vec_or(vec_or(c0_3, c1_3), c2_3);
            __m128i c_23 = vec_packs_epi32(c_2, c_3);
            __m128i c_0123 = vec_packs_epi16(c_01, c_23);

            unsigned mask = vec_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = vec_add_epi32(cx, vec_slli_epi32(dcdx, 2));
      }

      c = vec_add_epi32(c, vec_slli_epi32(dcdy, 2));
   }

   for (unsigned i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}

#undef NR_PLANES

#else

void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_32_3(task, arg2);
}

#endif /* _ARCH_PWR8 && UTIL_ARCH_LITTLE_ENDIAN */

void
lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_32_4(task, arg2);
}

void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_32_3_16(task, arg);
}

#endif

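/*
 * Pick the fastest available mask builders for the generic template
 * code, then instantiate the per-plane-count rasterization functions
 * by repeatedly including lp_rast_tri_tmp.h with different TAG /
 * NR_PLANES (and RASTER_64 / MULTISAMPLE) settings.
 */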
#if defined PIPE_ARCH_SSE
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_sse((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_sse((int)c, dcdx, dcdy)
#elif (defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN)
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_ppc((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_ppc((int)c, dcdx, dcdy)
#else
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy)
#endif

#define RASTER_64 1

#define TAG(x) x##_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_4
#define NR_PLANES 4
/*#define TRI_16 lp_rast_triangle_4_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"

#undef RASTER_64

#define TAG(x) x##_32_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_4
#define NR_PLANES 4
#ifdef PIPE_ARCH_SSE
#define TRI_16 lp_rast_triangle_32_4_16
#endif
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"

#define MULTISAMPLE 1
#define RASTER_64 1

#define TAG(x) x##_ms_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_4
#define NR_PLANES 4
/*#define TRI_16 lp_rast_triangle_4_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"

#undef RASTER_64