1/************************************************************************** 2 * 3 * Copyright 2009-2010 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 * 26 **************************************************************************/ 27 28/** 29 * @file 30 * Depth/stencil testing to LLVM IR translation. 31 * 32 * To be done accurately/efficiently the depth/stencil test must be done with 33 * the same type/format of the depth/stencil buffer, which implies massaging 34 * the incoming depths to fit into place. Using a more straightforward 35 * type/format for depth/stencil values internally and only convert when 36 * flushing would avoid this, but it would most likely result in depth fighting 37 * artifacts. 
 *
 * Since we're using linear layout for everything, but we need to deal with
 * 2x2 quads, we need to load/store multiple values and swizzle them into
 * place (we could avoid this by doing depth/stencil testing in linear format,
 * which would be easy for late depth/stencil test as we could do that after
 * the fragment shader loop just as we do for color buffers, but more tricky
 * for early depth test as we'd need both masks and interpolated depth in
 * linear format).
 *
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 * @author Brian Paul <jfonseca@vmware.com>
 */

#include "pipe/p_state.h"
#include "util/format/u_format.h"
#include "util/u_cpu_detect.h"

#include "gallivm/lp_bld_type.h"
#include "gallivm/lp_bld_arit.h"
#include "gallivm/lp_bld_bitarit.h"
#include "gallivm/lp_bld_const.h"
#include "gallivm/lp_bld_conv.h"
#include "gallivm/lp_bld_logic.h"
#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_debug.h"
#include "gallivm/lp_bld_swizzle.h"
#include "gallivm/lp_bld_pack.h"

#include "lp_bld_depth.h"
#include "lp_state_fs.h"


/** Used to select fields from pipe_stencil_state */
enum stencil_op {
   S_FAIL_OP,   /* operator applied when the stencil test fails */
   Z_FAIL_OP,   /* operator applied when stencil passes but depth fails */
   Z_PASS_OP    /* operator applied when both stencil and depth pass */
};



/**
 * Do the stencil test comparison (compare FB stencil values against ref value).
 * This will be used twice when generating two-sided stencil code.
 * \param stencil  the front/back stencil state
 * \param stencilRef  the stencil reference value, replicated as a vector
 * \param stencilVals  vector of stencil values from framebuffer
 * \return vector mask of pass/fail values (~0 or 0)
 */
static LLVMValueRef
lp_build_stencil_test_single(struct lp_build_context *bld,
                             const struct pipe_stencil_state *stencil,
                             LLVMValueRef stencilRef,
                             LLVMValueRef stencilVals)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   /* Stencil values are 8-bit here, so the maximum valuemask is 0xff. */
   const unsigned stencilMax = 255; /* XXX fix */
   struct lp_type type = bld->type;
   LLVMValueRef res;

   /*
    * SSE2 has intrinsics for signed comparisons, but not unsigned ones. Values
    * are between 0..255 so ensure we generate the fastest comparisons for
    * wider elements.
    */
   if (type.width <= 8) {
      assert(!type.sign);
   } else {
      assert(type.sign);
   }

   assert(stencil->enabled);

   if (stencil->valuemask != stencilMax) {
      /* compute stencilRef = stencilRef & valuemask */
      LLVMValueRef valuemask = lp_build_const_int_vec(bld->gallivm, type, stencil->valuemask);
      stencilRef = LLVMBuildAnd(builder, stencilRef, valuemask, "");
      /* compute stencilVals = stencilVals & valuemask */
      stencilVals = LLVMBuildAnd(builder, stencilVals, valuemask, "");
   }

   /* Emit the per-channel comparison; result lanes are all-ones on pass. */
   res = lp_build_cmp(bld, stencil->func, stencilRef, stencilVals);

   return res;
}


/**
 * Do the one or two-sided stencil test comparison.
 * \sa lp_build_stencil_test_single
 * \param front_facing  an integer vector mask, indicating front (~0) or back
 *                      (0) facing polygon. If NULL, assume front-facing.
 */
static LLVMValueRef
lp_build_stencil_test(struct lp_build_context *bld,
                      const struct pipe_stencil_state stencil[2],
                      LLVMValueRef stencilRefs[2],
                      LLVMValueRef stencilVals,
                      LLVMValueRef front_facing)
{
   LLVMValueRef res;

   assert(stencil[0].enabled);

   /* do front face test */
   res = lp_build_stencil_test_single(bld, &stencil[0],
                                      stencilRefs[0], stencilVals);

   if (stencil[1].enabled && front_facing != NULL) {
      /* do back face test */
      LLVMValueRef back_res;

      back_res = lp_build_stencil_test_single(bld, &stencil[1],
                                              stencilRefs[1], stencilVals);

      /* per-lane pick front or back result according to facing mask */
      res = lp_build_select(bld, front_facing, res, back_res);
   }

   return res;
}


/**
 * Apply the stencil operator (add/sub/keep/etc) to the given vector
 * of stencil values.
 * \param stencil  the front or back stencil state to read the operator from
 * \param op  which of the three operator slots to apply (fail/zfail/zpass)
 * \param stencilRef  the (possibly masked) stencil reference vector
 * \param stencilVals  current framebuffer stencil values
 * \return  new stencil values vector
 */
static LLVMValueRef
lp_build_stencil_op_single(struct lp_build_context *bld,
                           const struct pipe_stencil_state *stencil,
                           enum stencil_op op,
                           LLVMValueRef stencilRef,
                           LLVMValueRef stencilVals)

{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_type type = bld->type;
   LLVMValueRef res;
   /* stencil values saturate / wrap at 8 bits */
   LLVMValueRef max = lp_build_const_int_vec(bld->gallivm, type, 0xff);
   unsigned stencil_op;

   assert(type.sign);

   /* select the operator for the requested pass/fail slot */
   switch (op) {
   case S_FAIL_OP:
      stencil_op = stencil->fail_op;
      break;
   case Z_FAIL_OP:
      stencil_op = stencil->zfail_op;
      break;
   case Z_PASS_OP:
      stencil_op = stencil->zpass_op;
      break;
   default:
      assert(0 && "Invalid stencil_op mode");
      stencil_op = PIPE_STENCIL_OP_KEEP;
   }

   switch (stencil_op) {
   case PIPE_STENCIL_OP_KEEP:
      res = stencilVals;
      /* we can return early for this case */
      return res;
   case PIPE_STENCIL_OP_ZERO:
      res = bld->zero;
      break;
   case PIPE_STENCIL_OP_REPLACE:
      res = stencilRef;
      break;
   case PIPE_STENCIL_OP_INCR:
      /* saturating increment: clamp at 0xff */
      res = lp_build_add(bld, stencilVals, bld->one);
      res = lp_build_min(bld, res, max);
      break;
   case PIPE_STENCIL_OP_DECR:
      /* saturating decrement: clamp at 0 */
      res = lp_build_sub(bld, stencilVals, bld->one);
      res = lp_build_max(bld, res, bld->zero);
      break;
   case PIPE_STENCIL_OP_INCR_WRAP:
      /* wrap modulo 256 by masking */
      res = lp_build_add(bld, stencilVals, bld->one);
      res = LLVMBuildAnd(builder, res, max, "");
      break;
   case PIPE_STENCIL_OP_DECR_WRAP:
      res = lp_build_sub(bld, stencilVals, bld->one);
      res = LLVMBuildAnd(builder, res, max, "");
      break;
   case PIPE_STENCIL_OP_INVERT:
      /* bitwise NOT, restricted to the low 8 bits */
      res = LLVMBuildNot(builder, stencilVals, "");
      res = LLVMBuildAnd(builder, res, max, "");
      break;
   default:
      assert(0 && "bad stencil op mode");
      res = bld->undef;
   }

   return res;
}


/**
 * Do the one or two-sided stencil test op/update.
 * Applies the selected operator for front and (optionally) back faces,
 * honors the per-face writemasks, and merges the result with the
 * existing stencil values under the given pixel mask.
 */
static LLVMValueRef
lp_build_stencil_op(struct lp_build_context *bld,
                    const struct pipe_stencil_state stencil[2],
                    enum stencil_op op,
                    LLVMValueRef stencilRefs[2],
                    LLVMValueRef stencilVals,
                    LLVMValueRef mask,
                    LLVMValueRef front_facing)

{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;

   assert(stencil[0].enabled);

   /* do front face op */
   res = lp_build_stencil_op_single(bld, &stencil[0], op,
                                    stencilRefs[0], stencilVals);

   if (stencil[1].enabled && front_facing != NULL) {
      /* do back face op */
      LLVMValueRef back_res;

      back_res = lp_build_stencil_op_single(bld, &stencil[1], op,
                                            stencilRefs[1], stencilVals);

      res = lp_build_select(bld, front_facing, res, back_res);
   }

   /* Partial writemask: bitwise-merge new and old values per bit.
    * Full writemask (0xff): a plain per-lane select suffices.
    */
   if (stencil[0].writemask != 0xff ||
       (stencil[1].enabled && front_facing != NULL &&
        stencil[1].writemask != 0xff)) {
      /* mask &= stencil[0].writemask */
      LLVMValueRef writemask = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                      stencil[0].writemask);
      if (stencil[1].enabled &&
          stencil[1].writemask != stencil[0].writemask &&
          front_facing != NULL) {
         /* two-sided with differing writemasks: pick per-lane */
         LLVMValueRef back_writemask =
            lp_build_const_int_vec(bld->gallivm, bld->type,
                                   stencil[1].writemask);
         writemask = lp_build_select(bld, front_facing,
                                     writemask, back_writemask);
      }

      mask = LLVMBuildAnd(builder, mask, writemask, "");
      /* res = (res & mask) | (stencilVals & ~mask) */
      res = lp_build_select_bitwise(bld, mask, res, stencilVals);
   }
   else {
      /* res = mask ? res : stencilVals */
      res = lp_build_select(bld, mask, res, stencilVals);
   }

   return res;
}



/**
 * Return a type that matches the depth/stencil format.
 * Floating Z (Z32_FLOAT...) yields a float type; unsigned normalized Z
 * yields an integer type, signed when the Z channel is narrower than the
 * block so that faster signed SIMD comparisons can be used.
 */
struct lp_type
lp_depth_type(const struct util_format_description *format_desc,
              unsigned length)
{
   struct lp_type type;
   unsigned z_swizzle;

   assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
   assert(format_desc->block.width == 1);
   assert(format_desc->block.height == 1);

   memset(&type, 0, sizeof type);
   type.width = format_desc->block.bits;

   z_swizzle = format_desc->swizzle[0];
   if (z_swizzle < 4) {
      if (format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
         type.floating = TRUE;
         assert(z_swizzle == 0);
         assert(format_desc->channel[z_swizzle].size == 32);
      }
      else if (format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
         assert(format_desc->block.bits <= 32);
         assert(format_desc->channel[z_swizzle].normalized);
         if (format_desc->channel[z_swizzle].size < format_desc->block.bits) {
            /* Prefer signed integers when possible, as SSE has less support
             * for unsigned comparison;
             */
            type.sign = TRUE;
         }
      }
      else
         assert(0);
   }

   type.length = length;

   return type;
}


/**
 * Compute bitmask and bit shift to apply to the incoming fragment Z values
 * and the Z buffer values needed before doing the Z comparison.
347 * 348 * Note that we leave the Z bits in the position that we find them 349 * in the Z buffer (typically 0xffffff00 or 0x00ffffff). That lets us 350 * get by with fewer bit twiddling steps. 351 */ 352static boolean 353get_z_shift_and_mask(const struct util_format_description *format_desc, 354 unsigned *shift, unsigned *width, unsigned *mask) 355{ 356 unsigned total_bits; 357 unsigned z_swizzle; 358 359 assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS); 360 assert(format_desc->block.width == 1); 361 assert(format_desc->block.height == 1); 362 363 /* 64bit d/s format is special already extracted 32 bits */ 364 total_bits = format_desc->block.bits > 32 ? 32 : format_desc->block.bits; 365 366 z_swizzle = format_desc->swizzle[0]; 367 368 if (z_swizzle == PIPE_SWIZZLE_NONE) 369 return FALSE; 370 371 *width = format_desc->channel[z_swizzle].size; 372 /* & 31 is for the same reason as the 32-bit limit above */ 373 *shift = format_desc->channel[z_swizzle].shift & 31; 374 375 if (*width == total_bits) { 376 *mask = 0xffffffff; 377 } else { 378 *mask = ((1 << *width) - 1) << *shift; 379 } 380 381 return TRUE; 382} 383 384 385/** 386 * Compute bitmask and bit shift to apply to the framebuffer pixel values 387 * to put the stencil bits in the least significant position. 388 * (i.e. 0x000000ff) 389 */ 390static boolean 391get_s_shift_and_mask(const struct util_format_description *format_desc, 392 unsigned *shift, unsigned *mask) 393{ 394 const unsigned s_swizzle = format_desc->swizzle[1]; 395 396 if (s_swizzle == PIPE_SWIZZLE_NONE) 397 return FALSE; 398 399 /* just special case 64bit d/s format */ 400 if (format_desc->block.bits > 32) { 401 /* XXX big-endian? 
*/ 402 assert(format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); 403 *shift = 0; 404 *mask = 0xff; 405 return TRUE; 406 } 407 408 *shift = format_desc->channel[s_swizzle].shift; 409 const unsigned sz = format_desc->channel[s_swizzle].size; 410 *mask = (1U << sz) - 1U; 411 412 return TRUE; 413} 414 415 416/** 417 * Perform the occlusion test and increase the counter. 418 * Test the depth mask. Add the number of channel which has none zero mask 419 * into the occlusion counter. e.g. maskvalue is {-1, -1, -1, -1}. 420 * The counter will add 4. 421 * TODO: could get that out of the fs loop. 422 * 423 * \param type holds element type of the mask vector. 424 * \param maskvalue is the depth test mask. 425 * \param counter is a pointer of the uint32 counter. 426 */ 427void 428lp_build_occlusion_count(struct gallivm_state *gallivm, 429 struct lp_type type, 430 LLVMValueRef maskvalue, 431 LLVMValueRef counter) 432{ 433 LLVMBuilderRef builder = gallivm->builder; 434 LLVMContextRef context = gallivm->context; 435 LLVMValueRef countmask = lp_build_const_int_vec(gallivm, type, 1); 436 LLVMValueRef count, newcount; 437 438 assert(type.length <= 16); 439 assert(type.floating); 440 441 if (util_get_cpu_caps()->has_sse && type.length == 4) { 442 const char *movmskintr = "llvm.x86.sse.movmsk.ps"; 443 const char *popcntintr = "llvm.ctpop.i32"; 444 LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue, 445 lp_build_vec_type(gallivm, type), ""); 446 bits = lp_build_intrinsic_unary(builder, movmskintr, 447 LLVMInt32TypeInContext(context), bits); 448 count = lp_build_intrinsic_unary(builder, popcntintr, 449 LLVMInt32TypeInContext(context), bits); 450 count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), ""); 451 } 452 else if (util_get_cpu_caps()->has_avx && type.length == 8) { 453 const char *movmskintr = "llvm.x86.avx.movmsk.ps.256"; 454 const char *popcntintr = "llvm.ctpop.i32"; 455 LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue, 456 
lp_build_vec_type(gallivm, type), ""); 457 bits = lp_build_intrinsic_unary(builder, movmskintr, 458 LLVMInt32TypeInContext(context), bits); 459 count = lp_build_intrinsic_unary(builder, popcntintr, 460 LLVMInt32TypeInContext(context), bits); 461 count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), ""); 462 } 463 else { 464 LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv"); 465 LLVMTypeRef counttype = LLVMIntTypeInContext(context, type.length * 8); 466 LLVMTypeRef i8vntype = LLVMVectorType(LLVMInt8TypeInContext(context), type.length * 4); 467 LLVMValueRef shufflev, countd; 468 LLVMValueRef shuffles[16]; 469 const char *popcntintr = NULL; 470 471 countv = LLVMBuildBitCast(builder, countv, i8vntype, ""); 472 473 for (unsigned i = 0; i < type.length; i++) { 474#if UTIL_ARCH_LITTLE_ENDIAN 475 shuffles[i] = lp_build_const_int32(gallivm, 4*i); 476#else 477 shuffles[i] = lp_build_const_int32(gallivm, (4*i) + 3); 478#endif 479 } 480 481 shufflev = LLVMConstVector(shuffles, type.length); 482 countd = LLVMBuildShuffleVector(builder, countv, LLVMGetUndef(i8vntype), shufflev, ""); 483 countd = LLVMBuildBitCast(builder, countd, counttype, "countd"); 484 485 /* 486 * XXX FIXME 487 * this is bad on cpus without popcount (on x86 supported by intel 488 * nehalem, amd barcelona, and up - not tied to sse42). 489 * Would be much faster to just sum the 4 elements of the vector with 490 * some horizontal add (shuffle/add/shuffle/add after the initial and). 
491 */ 492 switch (type.length) { 493 case 4: 494 popcntintr = "llvm.ctpop.i32"; 495 break; 496 case 8: 497 popcntintr = "llvm.ctpop.i64"; 498 break; 499 case 16: 500 popcntintr = "llvm.ctpop.i128"; 501 break; 502 default: 503 assert(0); 504 } 505 count = lp_build_intrinsic_unary(builder, popcntintr, counttype, countd); 506 507 if (type.length > 8) { 508 count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 64), ""); 509 } 510 else if (type.length < 8) { 511 count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), ""); 512 } 513 } 514 newcount = LLVMBuildLoad2(builder, LLVMTypeOf(count), counter, "origcount"); 515 newcount = LLVMBuildAdd(builder, newcount, count, "newcount"); 516 LLVMBuildStore(builder, newcount, counter); 517} 518 519 520/** 521 * Load depth/stencil values. 522 * The stored values are linear, swizzle them. 523 * 524 * \param type the data type of the fragment depth/stencil values 525 * \param format_desc description of the depth/stencil surface 526 * \param is_1d whether this resource has only one dimension 527 * \param loop_counter the current loop iteration 528 * \param depth_ptr pointer to the depth/stencil values of this 4x4 block 529 * \param depth_stride stride of the depth/stencil buffer 530 * \param z_fb contains z values loaded from fb (may include padding) 531 * \param s_fb contains s values loaded from fb (may include padding) 532 */ 533void 534lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm, 535 struct lp_type z_src_type, 536 const struct util_format_description *format_desc, 537 boolean is_1d, 538 LLVMValueRef depth_ptr, 539 LLVMValueRef depth_stride, 540 LLVMValueRef *z_fb, 541 LLVMValueRef *s_fb, 542 LLVMValueRef loop_counter) 543{ 544 LLVMBuilderRef builder = gallivm->builder; 545 LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4]; 546 LLVMValueRef depth_offset1, depth_offset2; 547 const unsigned depth_bytes = format_desc->block.bits / 8; 548 struct lp_type zs_type = 
lp_depth_type(format_desc, z_src_type.length); 549 550 struct lp_type zs_load_type = zs_type; 551 zs_load_type.length = zs_load_type.length / 2; 552 553 LLVMTypeRef zs_dst_type = lp_build_vec_type(gallivm, zs_load_type); 554 555 if (z_src_type.length == 4) { 556 LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter, 557 lp_build_const_int32(gallivm, 1), ""); 558 LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter, 559 lp_build_const_int32(gallivm, 2), ""); 560 LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb, 561 depth_stride, ""); 562 depth_offset1 = LLVMBuildMul(builder, looplsb, 563 lp_build_const_int32(gallivm, depth_bytes * 2), ""); 564 depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, ""); 565 566 /* just concatenate the loaded 2x2 values into 4-wide vector */ 567 for (unsigned i = 0; i < 4; i++) { 568 shuffles[i] = lp_build_const_int32(gallivm, i); 569 } 570 } 571 else { 572 unsigned i; 573 LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter, 574 lp_build_const_int32(gallivm, 1), ""); 575 assert(z_src_type.length == 8); 576 depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, ""); 577 /* 578 * We load 2x4 values, and need to swizzle them (order 579 * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately. 
580 */ 581 for (i = 0; i < 8; i++) { 582 shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2); 583 } 584 } 585 586 depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, ""); 587 588 /* Load current z/stencil values from z/stencil buffer */ 589 LLVMTypeRef load_ptr_type = LLVMPointerType(zs_dst_type, 0); 590 LLVMValueRef zs_dst_ptr = 591 LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, ""); 592 zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, ""); 593 LLVMValueRef zs_dst1 = LLVMBuildLoad2(builder, zs_dst_type, zs_dst_ptr, ""); 594 LLVMValueRef zs_dst2; 595 if (is_1d) { 596 zs_dst2 = lp_build_undef(gallivm, zs_load_type); 597 } 598 else { 599 zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, ""); 600 zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, ""); 601 zs_dst2 = LLVMBuildLoad2(builder, zs_dst_type, zs_dst_ptr, ""); 602 } 603 604 *z_fb = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2, 605 LLVMConstVector(shuffles, zs_type.length), ""); 606 *s_fb = *z_fb; 607 608 if (format_desc->block.bits == 8) { 609 /* Extend stencil-only 8 bit values (S8_UINT) */ 610 *s_fb = LLVMBuildZExt(builder, *s_fb, 611 lp_build_int_vec_type(gallivm, z_src_type), ""); 612 } 613 614 if (format_desc->block.bits < z_src_type.width) { 615 /* Extend destination ZS values (e.g., when reading from Z16_UNORM) */ 616 *z_fb = LLVMBuildZExt(builder, *z_fb, 617 lp_build_int_vec_type(gallivm, z_src_type), ""); 618 } 619 620 else if (format_desc->block.bits > 32) { 621 /* rely on llvm to handle too wide vector we have here nicely */ 622 struct lp_type typex2 = zs_type; 623 struct lp_type s_type = zs_type; 624 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 4]; 625 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 4]; 626 LLVMValueRef tmp; 627 628 typex2.width = typex2.width / 2; 629 typex2.length = typex2.length * 2; 630 s_type.width = s_type.width / 2; 631 s_type.floating = 0; 632 633 tmp = LLVMBuildBitCast(builder, 
*z_fb, 634 lp_build_vec_type(gallivm, typex2), ""); 635 636 for (unsigned i = 0; i < zs_type.length; i++) { 637 shuffles1[i] = lp_build_const_int32(gallivm, i * 2); 638 shuffles2[i] = lp_build_const_int32(gallivm, i * 2 + 1); 639 } 640 *z_fb = LLVMBuildShuffleVector(builder, tmp, tmp, 641 LLVMConstVector(shuffles1, zs_type.length), ""); 642 *s_fb = LLVMBuildShuffleVector(builder, tmp, tmp, 643 LLVMConstVector(shuffles2, zs_type.length), ""); 644 *s_fb = LLVMBuildBitCast(builder, *s_fb, 645 lp_build_vec_type(gallivm, s_type), ""); 646 lp_build_name(*s_fb, "s_dst"); 647 } 648 649 lp_build_name(*z_fb, "z_dst"); 650 lp_build_name(*s_fb, "s_dst"); 651 lp_build_name(*z_fb, "z_dst"); 652} 653 654 655/** 656 * Store depth/stencil values. 657 * Incoming values are swizzled (typically n 2x2 quads), stored linear. 658 * If there's a mask it will do select/store otherwise just store. 659 * 660 * \param type the data type of the fragment depth/stencil values 661 * \param format_desc description of the depth/stencil surface 662 * \param is_1d whether this resource has only one dimension 663 * \param mask_value the alive/dead pixel mask for the quad (vector) 664 * \param z_fb z values read from fb (with padding) 665 * \param s_fb s values read from fb (with padding) 666 * \param loop_counter the current loop iteration 667 * \param depth_ptr pointer to the depth/stencil values of this 4x4 block 668 * \param depth_stride stride of the depth/stencil buffer 669 * \param z_value the depth values to store (with padding) 670 * \param s_value the stencil values to store (with padding) 671 */ 672void 673lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm, 674 struct lp_type z_src_type, 675 const struct util_format_description *format_desc, 676 boolean is_1d, 677 LLVMValueRef mask_value, 678 LLVMValueRef z_fb, 679 LLVMValueRef s_fb, 680 LLVMValueRef loop_counter, 681 LLVMValueRef depth_ptr, 682 LLVMValueRef depth_stride, 683 LLVMValueRef z_value, 684 LLVMValueRef s_value) 
685{ 686 struct lp_build_context z_bld; 687 LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4]; 688 LLVMBuilderRef builder = gallivm->builder; 689 LLVMValueRef zs_dst1, zs_dst2; 690 LLVMValueRef zs_dst_ptr1, zs_dst_ptr2; 691 LLVMValueRef depth_offset1, depth_offset2; 692 LLVMTypeRef load_ptr_type; 693 unsigned depth_bytes = format_desc->block.bits / 8; 694 struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length); 695 struct lp_type z_type = zs_type; 696 struct lp_type zs_load_type = zs_type; 697 698 zs_load_type.length = zs_load_type.length / 2; 699 load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0); 700 701 z_type.width = z_src_type.width; 702 703 lp_build_context_init(&z_bld, gallivm, z_type); 704 705 /* 706 * This is far from ideal, at least for late depth write we should do this 707 * outside the fs loop to avoid all the swizzle stuff. 708 */ 709 if (z_src_type.length == 4) { 710 LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter, 711 lp_build_const_int32(gallivm, 1), ""); 712 LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter, 713 lp_build_const_int32(gallivm, 2), ""); 714 LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb, 715 depth_stride, ""); 716 depth_offset1 = LLVMBuildMul(builder, looplsb, 717 lp_build_const_int32(gallivm, depth_bytes * 2), ""); 718 depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, ""); 719 } 720 else { 721 LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter, 722 lp_build_const_int32(gallivm, 1), ""); 723 assert(z_src_type.length == 8); 724 depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, ""); 725 /* 726 * We load 2x4 values, and need to swizzle them (order 727 * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately. 
728 */ 729 for (unsigned i = 0; i < 8; i++) { 730 shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2); 731 } 732 } 733 734 depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, ""); 735 736 zs_dst_ptr1 = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, ""); 737 zs_dst_ptr1 = LLVMBuildBitCast(builder, zs_dst_ptr1, load_ptr_type, ""); 738 zs_dst_ptr2 = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, ""); 739 zs_dst_ptr2 = LLVMBuildBitCast(builder, zs_dst_ptr2, load_ptr_type, ""); 740 741 if (format_desc->block.bits > 32) { 742 s_value = LLVMBuildBitCast(builder, s_value, z_bld.vec_type, ""); 743 } 744 745 if (mask_value) { 746 z_value = lp_build_select(&z_bld, mask_value, z_value, z_fb); 747 if (format_desc->block.bits > 32) { 748 s_fb = LLVMBuildBitCast(builder, s_fb, z_bld.vec_type, ""); 749 s_value = lp_build_select(&z_bld, mask_value, s_value, s_fb); 750 } 751 } 752 753 if (zs_type.width < z_src_type.width) { 754 /* Truncate ZS values (e.g., when writing to Z16_UNORM) */ 755 z_value = LLVMBuildTrunc(builder, z_value, 756 lp_build_int_vec_type(gallivm, zs_type), ""); 757 } 758 759 if (format_desc->block.bits <= 32) { 760 if (z_src_type.length == 4) { 761 zs_dst1 = lp_build_extract_range(gallivm, z_value, 0, 2); 762 zs_dst2 = lp_build_extract_range(gallivm, z_value, 2, 2); 763 } 764 else { 765 assert(z_src_type.length == 8); 766 zs_dst1 = LLVMBuildShuffleVector(builder, z_value, z_value, 767 LLVMConstVector(&shuffles[0], 768 zs_load_type.length), ""); 769 zs_dst2 = LLVMBuildShuffleVector(builder, z_value, z_value, 770 LLVMConstVector(&shuffles[4], 771 zs_load_type.length), ""); 772 } 773 } 774 else { 775 if (z_src_type.length == 4) { 776 zs_dst1 = lp_build_interleave2(gallivm, z_type, 777 z_value, s_value, 0); 778 zs_dst2 = lp_build_interleave2(gallivm, z_type, 779 z_value, s_value, 1); 780 } 781 else { 782 LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 2]; 783 assert(z_src_type.length == 8); 784 for (unsigned i = 0; i < 8; 
i++) { 785 shuffles[i*2] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2); 786 shuffles[i*2+1] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2 + 787 z_src_type.length); 788 } 789 zs_dst1 = LLVMBuildShuffleVector(builder, z_value, s_value, 790 LLVMConstVector(&shuffles[0], 791 z_src_type.length), ""); 792 zs_dst2 = LLVMBuildShuffleVector(builder, z_value, s_value, 793 LLVMConstVector(&shuffles[8], 794 z_src_type.length), ""); 795 } 796 zs_dst1 = LLVMBuildBitCast(builder, zs_dst1, 797 lp_build_vec_type(gallivm, zs_load_type), ""); 798 zs_dst2 = LLVMBuildBitCast(builder, zs_dst2, 799 lp_build_vec_type(gallivm, zs_load_type), ""); 800 } 801 802 LLVMBuildStore(builder, zs_dst1, zs_dst_ptr1); 803 if (!is_1d) { 804 LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2); 805 } 806} 807 808 809/** 810 * Generate code for performing depth and/or stencil tests. 811 * We operate on a vector of values (typically n 2x2 quads). 812 * 813 * \param depth the depth test state 814 * \param stencil the front/back stencil state 815 * \param type the data type of the fragment depth/stencil values 816 * \param format_desc description of the depth/stencil surface 817 * \param mask the alive/dead pixel mask for the quad (vector) 818 * \param cov_mask coverage mask 819 * \param stencil_refs the front/back stencil ref values (scalar) 820 * \param z_src the incoming depth/stencil values (n 2x2 quad values, float32) 821 * \param zs_dst the depth/stencil values in framebuffer 822 * \param face contains boolean value indicating front/back facing polygon 823 */ 824void 825lp_build_depth_stencil_test(struct gallivm_state *gallivm, 826 const struct lp_depth_state *depth, 827 const struct pipe_stencil_state stencil[2], 828 struct lp_type z_src_type, 829 const struct util_format_description *format_desc, 830 struct lp_build_mask_context *mask, 831 LLVMValueRef *cov_mask, 832 LLVMValueRef stencil_refs[2], 833 LLVMValueRef z_src, 834 LLVMValueRef z_fb, 835 LLVMValueRef s_fb, 836 
                            LLVMValueRef face,
                            LLVMValueRef *z_value,
                            LLVMValueRef *s_value,
                            boolean do_branch,
                            bool restrict_depth)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type z_type;
   struct lp_build_context z_bld;
   struct lp_build_context s_bld;
   struct lp_type s_type;
   unsigned z_shift = 0, z_width = 0, z_mask = 0;
   LLVMValueRef z_dst = NULL;
   LLVMValueRef stencil_vals = NULL;
   LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
   LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
   /* Live-fragment mask: taken from the mask context when one is given,
    * otherwise from the caller-supplied coverage mask (multisample path,
    * see the mask/cov_mask split at the end of this function).
    */
   LLVMValueRef current_mask = mask ? lp_build_mask_value(mask) : *cov_mask;
   LLVMValueRef front_facing = NULL;
   boolean have_z, have_s;

   /*
    * Depths are expected to be between 0 and 1, even if they are stored in
    * floats. Setting these bits here will ensure that the lp_build_conv() call
    * below won't try to unnecessarily clamp the incoming values.
    * If depths are expected outside 0..1 don't set these bits.
    */
   if (z_src_type.floating) {
      if (restrict_depth) {
         z_src_type.sign = FALSE;
         z_src_type.norm = TRUE;
      }
   }
   else {
      /* Integer source depths must already be unsigned normalized. */
      assert(!z_src_type.sign);
      assert(z_src_type.norm);
   }

   /* Pick the type matching the depth-stencil format. */
   z_type = lp_depth_type(format_desc, z_src_type.length);

   /* Pick the intermediate type for depth operations. */
   z_type.width = z_src_type.width;
   assert(z_type.length == z_src_type.length);

   /* FIXME: for non-float depth/stencil might generate better code
    * if we'd always split it up to use 128bit operations.
    * For stencil we'd almost certainly want to pack to 8xi16 values,
    * for z just run twice.
    */

   /* Sanity checking */
   {
      ASSERTED const unsigned z_swizzle = format_desc->swizzle[0];
      ASSERTED const unsigned s_swizzle = format_desc->swizzle[1];

      assert(z_swizzle != PIPE_SWIZZLE_NONE ||
             s_swizzle != PIPE_SWIZZLE_NONE);

      assert(depth->enabled || stencil[0].enabled);

      assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
      assert(format_desc->block.width == 1);
      assert(format_desc->block.height == 1);

      if (stencil[0].enabled) {
         assert(s_swizzle < 4);
         assert(format_desc->channel[s_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
         assert(format_desc->channel[s_swizzle].pure_integer);
         assert(!format_desc->channel[s_swizzle].normalized);
         assert(format_desc->channel[s_swizzle].size == 8);
      }

      if (depth->enabled) {
         assert(z_swizzle < 4);
         if (z_type.floating) {
            assert(z_swizzle == 0);
            assert(format_desc->channel[z_swizzle].type ==
                   UTIL_FORMAT_TYPE_FLOAT);
            assert(format_desc->channel[z_swizzle].size == 32);
         }
         else {
            assert(format_desc->channel[z_swizzle].type ==
                   UTIL_FORMAT_TYPE_UNSIGNED);
            assert(format_desc->channel[z_swizzle].normalized);
            assert(!z_type.fixed);
         }
      }
   }


   /* Setup build context for Z vals */
   lp_build_context_init(&z_bld, gallivm, z_type);

   /* Setup build context for stencil vals.
    * Stencil ops are done in the integer type of the same width/length as Z.
    */
   s_type = lp_int_type(z_type);
   lp_build_context_init(&s_bld, gallivm, s_type);

   /* Compute and apply the Z/stencil bitmasks and shifts.
    */
   {
      unsigned s_shift, s_mask;

      z_dst = z_fb;
      stencil_vals = s_fb;

      have_z = get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask);
      have_s = get_s_shift_and_mask(format_desc, &s_shift, &s_mask);

      if (have_z) {
         if (z_mask != 0xffffffff) {
            z_bitmask = lp_build_const_int_vec(gallivm, z_type, z_mask);
         }

         /*
          * Align the framebuffer Z's LSB to the right.
          */
         if (z_shift) {
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
            z_dst = LLVMBuildLShr(builder, z_dst, shift, "z_dst");
         } else if (z_bitmask) {
            z_dst = LLVMBuildAnd(builder, z_dst, z_bitmask, "z_dst");
         } else {
            lp_build_name(z_dst, "z_dst");
         }
      }

      if (have_s) {
         /* Move the stencil bits down to the LSB as well. */
         if (s_shift) {
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, s_type, s_shift);
            stencil_vals = LLVMBuildLShr(builder, stencil_vals, shift, "");
            stencil_shift = shift;  /* used below */
         }

         if (s_mask != 0xffffffff) {
            LLVMValueRef mask = lp_build_const_int_vec(gallivm, s_type, s_mask);
            stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");
         }

         lp_build_name(stencil_vals, "s_dst");
      }
   }

   if (stencil[0].enabled) {

      if (face) {
         if (0) {
            /*
             * XXX: the scalar expansion below produces atrocious code
             * (basically producing a 64bit scalar value, then moving the 2
             * 32bit pieces separately to simd, plus 4 shuffles, which is
             * seriously lame). But the scalar-simd transitions are always
             * tricky, so no big surprise there.
             * This here would be way better, however llvm has some serious
             * trouble later using it in the select, probably because it will
             * recognize the expression as constant and move the simd value
             * away (out of the loop) - and then it will suddenly try
             * constructing i1 high-bit masks out of it later...
             * (Try piglit stencil-twoside.)
             * Note this is NOT due to using SExt/Trunc, it fails exactly the
             * same even when using native compare/select.
             * I cannot reproduce this problem when using stand-alone compiler
             * though, suggesting some problem with optimization passes...
             * (With stand-alone compilation, the construction of this mask
             * value, no matter if the easy 3 instruction here or the complex
             * 16+ one below, never gets separated from where it's used.)
             * The scalar code still has the same problem, but the generated
             * code looks a bit better at least for some reason, even if
             * mostly by luck (the fundamental issue clearly is the same).
             */
            front_facing = lp_build_broadcast(gallivm, s_bld.vec_type, face);
            /* front_facing = face != 0 ? ~0 : 0 */
            front_facing = lp_build_compare(gallivm, s_bld.type,
                                            PIPE_FUNC_NOTEQUAL,
                                            front_facing, s_bld.zero);
         } else {
            LLVMValueRef zero = lp_build_const_int32(gallivm, 0);

            /* front_facing = face != 0 ? ~0 : 0 */
            front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
            front_facing = LLVMBuildSExt(builder, front_facing,
                                         LLVMIntTypeInContext(gallivm->context,
                                                s_bld.type.length*s_bld.type.width),
                                         "");
            front_facing = LLVMBuildBitCast(builder, front_facing,
                                            s_bld.int_vec_type, "");

         }
      }

      s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
                                          stencil_refs, stencil_vals,
                                          front_facing);

      /* apply stencil-fail operator */
      {
         LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, current_mask, s_pass_mask);
         stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP,
                                            stencil_refs, stencil_vals,
                                            s_fail_mask, front_facing);
      }
   }

   if (depth->enabled) {
      /*
       * Convert fragment Z to the desired type, aligning the LSB to the right.
       */

      assert(z_type.width == z_src_type.width);
      assert(z_type.length == z_src_type.length);
      assert(lp_check_value(z_src_type, z_src));
      if (z_src_type.floating) {
         /*
          * Convert from floating point values
          */

         if (!z_type.floating) {
            z_src = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                            z_src_type,
                                                            z_width,
                                                            z_src);
         }
      } else {
         /*
          * Convert from unsigned normalized values.
          */

         assert(!z_src_type.sign);
         assert(!z_src_type.fixed);
         assert(z_src_type.norm);
         assert(!z_type.floating);
         if (z_src_type.width > z_width) {
            /* Drop the excess source bits to match the buffer's Z width. */
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_src_type,
                                                        z_src_type.width - z_width);
            z_src = LLVMBuildLShr(builder, z_src, shift, "");
         }
      }
      assert(lp_check_value(z_type, z_src));

      lp_build_name(z_src, "z_src");

      /* compare src Z to dst Z, returning 'pass' mask */
      z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);

      /* mask off bits that failed stencil test */
      if (s_pass_mask) {
         current_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, "");
      }

      if (!stencil[0].enabled && mask) {
         /* We can potentially skip all remaining operations here, but only
          * if stencil is disabled because we still need to update the stencil
          * buffer values.  Don't need to update Z buffer values.
          */
         lp_build_mask_update(mask, z_pass);

         if (do_branch) {
            lp_build_mask_check(mask);
         }
      }

      if (depth->writemask) {
         LLVMValueRef z_pass_mask;

         /* mask off bits that failed Z test */
         z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, "");

         /* Mix the old and new Z buffer values.
          * z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i]
          */
         z_dst = lp_build_select(&z_bld, z_pass_mask, z_src, z_dst);
      }

      if (stencil[0].enabled) {
         /* update stencil buffer values according to z pass/fail result */
         LLVMValueRef z_fail_mask, z_pass_mask;

         /* apply Z-fail operator */
         z_fail_mask = lp_build_andnot(&s_bld, current_mask, z_pass);
         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP,
                                            stencil_refs, stencil_vals,
                                            z_fail_mask, front_facing);

         /* apply Z-pass operator */
         z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, "");
         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                            stencil_refs, stencil_vals,
                                            z_pass_mask, front_facing);
      }
   }
   else {
      /* No depth test: apply Z-pass operator to stencil buffer values which
       * passed the stencil test.
       */
      s_pass_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, "");
      stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                         stencil_refs, stencil_vals,
                                         s_pass_mask, front_facing);
   }

   /* Put Z and stencil bits in the right place */
   if (have_z && z_shift) {
      LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
      z_dst = LLVMBuildShl(builder, z_dst, shift, "");
   }
   if (stencil_vals && stencil_shift)
      stencil_vals = LLVMBuildShl(builder, stencil_vals,
                                  stencil_shift, "");

   /* Finally, merge the z/stencil values.
    * For combined formats of at most 32 bits, Z and stencil share one word,
    * so OR them together and return the same value through both outputs.
    */
   if (format_desc->block.bits <= 32) {
      if (have_z && have_s)
         *z_value = LLVMBuildOr(builder, z_dst, stencil_vals, "");
      else if (have_z)
         *z_value = z_dst;
      else
         *z_value = stencil_vals;
      *s_value = *z_value;
   }
   else {
      *z_value = z_dst;
      *s_value = stencil_vals;
   }

   if (mask) {
      /* Fold the stencil/depth pass results into the execution mask. */
      if (s_pass_mask)
         lp_build_mask_update(mask, s_pass_mask);

      if (depth->enabled && stencil[0].enabled)
         lp_build_mask_update(mask, z_pass);
   }
   else {
      LLVMValueRef tmp_mask = *cov_mask;
      if (s_pass_mask)
         tmp_mask = LLVMBuildAnd(builder, tmp_mask, s_pass_mask, "");

      /* for multisample we don't do the stencil optimisation so update always */
      if (depth->enabled)
         tmp_mask = LLVMBuildAnd(builder, tmp_mask, z_pass, "");
      *cov_mask = tmp_mask;
   }
}