1/************************************************************************** 2 * 3 * Copyright 2009 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
25 * 26 **************************************************************************/ 27 28#include <limits.h> 29#include "util/u_memory.h" 30#include "util/u_math.h" 31#include "util/u_rect.h" 32#include "util/u_surface.h" 33#include "util/u_pack_color.h" 34#include "util/u_string.h" 35#include "util/u_thread.h" 36#include "util/u_memset.h" 37#include "util/os_time.h" 38 39#include "lp_scene_queue.h" 40#include "lp_context.h" 41#include "lp_debug.h" 42#include "lp_fence.h" 43#include "lp_perf.h" 44#include "lp_query.h" 45#include "lp_rast.h" 46#include "lp_rast_priv.h" 47#include "gallivm/lp_bld_format.h" 48#include "gallivm/lp_bld_debug.h" 49#include "lp_scene.h" 50#include "lp_screen.h" 51#include "lp_tex_sample.h" 52 53 54#ifdef DEBUG 55int jit_line = 0; 56const struct lp_rast_state *jit_state = NULL; 57const struct lp_rasterizer_task *jit_task = NULL; 58#endif 59 60const float lp_sample_pos_4x[4][2] = { { 0.375, 0.125 }, 61 { 0.875, 0.375 }, 62 { 0.125, 0.625 }, 63 { 0.625, 0.875 } }; 64 65/** 66 * Begin rasterizing a scene. 67 * Called once per scene by one thread. 68 */ 69static void 70lp_rast_begin(struct lp_rasterizer *rast, 71 struct lp_scene *scene) 72{ 73 rast->curr_scene = scene; 74 75 LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__); 76 77 lp_scene_begin_rasterization(scene); 78 lp_scene_bin_iter_begin(scene); 79} 80 81 82static void 83lp_rast_end(struct lp_rasterizer *rast) 84{ 85 rast->curr_scene = NULL; 86} 87 88 89/** 90 * Beginning rasterization of a tile. 91 * \param x window X position of the tile, in pixels 92 * \param y window Y position of the tile, in pixels 93 */ 94static void 95lp_rast_tile_begin(struct lp_rasterizer_task *task, 96 const struct cmd_bin *bin, 97 int x, int y) 98{ 99 struct lp_scene *scene = task->scene; 100 101 LP_DBG(DEBUG_RAST, "%s %d,%d\n", __FUNCTION__, x, y); 102 103 task->bin = bin; 104 task->x = x * TILE_SIZE; 105 task->y = y * TILE_SIZE; 106 task->width = TILE_SIZE + x * TILE_SIZE > scene->fb.width ? 
107 scene->fb.width - x * TILE_SIZE : TILE_SIZE; 108 task->height = TILE_SIZE + y * TILE_SIZE > scene->fb.height ? 109 scene->fb.height - y * TILE_SIZE : TILE_SIZE; 110 111 task->thread_data.vis_counter = 0; 112 task->thread_data.ps_invocations = 0; 113 114 for (unsigned i = 0; i < scene->fb.nr_cbufs; i++) { 115 if (scene->fb.cbufs[i]) { 116 task->color_tiles[i] = scene->cbufs[i].map + 117 scene->cbufs[i].stride * task->y + 118 scene->cbufs[i].format_bytes * task->x; 119 } 120 } 121 if (scene->fb.zsbuf) { 122 task->depth_tile = scene->zsbuf.map + 123 scene->zsbuf.stride * task->y + 124 scene->zsbuf.format_bytes * task->x; 125 } 126} 127 128 129/** 130 * Clear the rasterizer's current color tile. 131 * This is a bin command called during bin processing. 132 * Clear commands always clear all bound layers. 133 */ 134static void 135lp_rast_clear_color(struct lp_rasterizer_task *task, 136 const union lp_rast_cmd_arg arg) 137{ 138 const struct lp_scene *scene = task->scene; 139 const unsigned cbuf = arg.clear_rb->cbuf; 140 141 /* we never bin clear commands for non-existing buffers */ 142 assert(cbuf < scene->fb.nr_cbufs); 143 assert(scene->fb.cbufs[cbuf]); 144 145 const enum pipe_format format = scene->fb.cbufs[cbuf]->format; 146 union util_color uc = arg.clear_rb->color_val; 147 148 /* 149 * this is pretty rough since we have target format (bunch of bytes...) 150 * here. dump it as raw 4 dwords. 
151 */ 152 LP_DBG(DEBUG_RAST, 153 "%s clear value (target format %d) raw 0x%x,0x%x,0x%x,0x%x\n", 154 __FUNCTION__, format, uc.ui[0], uc.ui[1], uc.ui[2], uc.ui[3]); 155 156 for (unsigned s = 0; s < scene->cbufs[cbuf].nr_samples; s++) { 157 void *map = (char *) scene->cbufs[cbuf].map 158 + scene->cbufs[cbuf].sample_stride * s; 159 util_fill_box(map, 160 format, 161 scene->cbufs[cbuf].stride, 162 scene->cbufs[cbuf].layer_stride, 163 task->x, 164 task->y, 165 0, 166 task->width, 167 task->height, 168 scene->fb_max_layer + 1, 169 &uc); 170 } 171 172 /* this will increase for each rb which probably doesn't mean much */ 173 LP_COUNT(nr_color_tile_clear); 174} 175 176 177/** 178 * Clear the rasterizer's current z/stencil tile. 179 * This is a bin command called during bin processing. 180 * Clear commands always clear all bound layers. 181 */ 182static void 183lp_rast_clear_zstencil(struct lp_rasterizer_task *task, 184 const union lp_rast_cmd_arg arg) 185{ 186 const struct lp_scene *scene = task->scene; 187 uint64_t clear_value64 = arg.clear_zstencil.value; 188 uint64_t clear_mask64 = arg.clear_zstencil.mask; 189 uint32_t clear_value = (uint32_t) clear_value64; 190 uint32_t clear_mask = (uint32_t) clear_mask64; 191 const unsigned height = task->height; 192 const unsigned width = task->width; 193 const unsigned dst_stride = scene->zsbuf.stride; 194 195 LP_DBG(DEBUG_RAST, "%s: value=0x%08x, mask=0x%08x\n", 196 __FUNCTION__, clear_value, clear_mask); 197 198 /* 199 * Clear the area of the depth/depth buffer matching this tile. 
200 */ 201 202 if (scene->fb.zsbuf) { 203 for (unsigned s = 0; s < scene->zsbuf.nr_samples; s++) { 204 uint8_t *dst_layer = 205 task->depth_tile + (s * scene->zsbuf.sample_stride); 206 const unsigned block_size = 207 util_format_get_blocksize(scene->fb.zsbuf->format); 208 209 clear_value &= clear_mask; 210 211 for (unsigned layer = 0; layer <= scene->fb_max_layer; layer++) { 212 uint8_t *dst = dst_layer; 213 214 switch (block_size) { 215 case 1: 216 assert(clear_mask == 0xff); 217 for (unsigned i = 0; i < height; i++) { 218 uint8_t *row = (uint8_t *)dst; 219 memset(row, (uint8_t) clear_value, width); 220 dst += dst_stride; 221 } 222 break; 223 case 2: 224 if (clear_mask == 0xffff) { 225 for (unsigned i = 0; i < height; i++) { 226 uint16_t *row = (uint16_t *)dst; 227 for (unsigned j = 0; j < width; j++) 228 *row++ = (uint16_t) clear_value; 229 dst += dst_stride; 230 } 231 } 232 else { 233 for (unsigned i = 0; i < height; i++) { 234 uint16_t *row = (uint16_t *)dst; 235 for (unsigned j = 0; j < width; j++) { 236 uint16_t tmp = ~clear_mask & *row; 237 *row++ = clear_value | tmp; 238 } 239 dst += dst_stride; 240 } 241 } 242 break; 243 case 4: 244 if (clear_mask == 0xffffffff) { 245 for (unsigned i = 0; i < height; i++) { 246 util_memset32(dst, clear_value, width); 247 dst += dst_stride; 248 } 249 } 250 else { 251 for (unsigned i = 0; i < height; i++) { 252 uint32_t *row = (uint32_t *)dst; 253 for (unsigned j = 0; j < width; j++) { 254 uint32_t tmp = ~clear_mask & *row; 255 *row++ = clear_value | tmp; 256 } 257 dst += dst_stride; 258 } 259 } 260 break; 261 case 8: 262 clear_value64 &= clear_mask64; 263 if (clear_mask64 == 0xffffffffffULL) { 264 for (unsigned i = 0; i < height; i++) { 265 util_memset64(dst, clear_value64, width); 266 dst += dst_stride; 267 } 268 } 269 else { 270 for (unsigned i = 0; i < height; i++) { 271 uint64_t *row = (uint64_t *)dst; 272 for (unsigned j = 0; j < width; j++) { 273 uint64_t tmp = ~clear_mask64 & *row; 274 *row++ = clear_value64 | tmp; 
275 } 276 dst += dst_stride; 277 } 278 } 279 break; 280 281 default: 282 assert(0); 283 break; 284 } 285 dst_layer += scene->zsbuf.layer_stride; 286 } 287 } 288 } 289} 290 291 292/** 293 * Run the shader on all blocks in a tile. This is used when a tile is 294 * completely contained inside a triangle. 295 * This is a bin command called during bin processing. 296 */ 297static void 298lp_rast_shade_tile(struct lp_rasterizer_task *task, 299 const union lp_rast_cmd_arg arg) 300{ 301 const struct lp_scene *scene = task->scene; 302 const struct lp_rast_shader_inputs *inputs = arg.shade_tile; 303 const unsigned tile_x = task->x, tile_y = task->y; 304 305 if (inputs->disable) { 306 /* This command was partially binned and has been disabled */ 307 return; 308 } 309 310 LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__); 311 312 const struct lp_rast_state *state = task->state; 313 assert(state); 314 if (!state) { 315 return; 316 } 317 318 const struct lp_fragment_shader_variant *variant = state->variant; 319 320 /* render the whole 64x64 tile in 4x4 chunks */ 321 for (unsigned y = 0; y < task->height; y += 4){ 322 for (unsigned x = 0; x < task->width; x += 4) { 323 /* color buffer */ 324 uint8_t *color[PIPE_MAX_COLOR_BUFS]; 325 unsigned stride[PIPE_MAX_COLOR_BUFS]; 326 unsigned sample_stride[PIPE_MAX_COLOR_BUFS]; 327 for (unsigned i = 0; i < scene->fb.nr_cbufs; i++){ 328 if (scene->fb.cbufs[i]) { 329 stride[i] = scene->cbufs[i].stride; 330 sample_stride[i] = scene->cbufs[i].sample_stride; 331 color[i] = lp_rast_get_color_block_pointer(task, i, tile_x + x, 332 tile_y + y, 333 inputs->layer + inputs->view_index); 334 } 335 else { 336 stride[i] = 0; 337 sample_stride[i] = 0; 338 color[i] = NULL; 339 } 340 } 341 342 /* depth buffer */ 343 uint8_t *depth = NULL; 344 unsigned depth_stride = 0; 345 unsigned depth_sample_stride = 0; 346 if (scene->zsbuf.map) { 347 depth = lp_rast_get_depth_block_pointer(task, tile_x + x, 348 tile_y + y, 349 inputs->layer + inputs->view_index); 350 
depth_stride = scene->zsbuf.stride; 351 depth_sample_stride = scene->zsbuf.sample_stride; 352 } 353 354 uint64_t mask = 0; 355 for (unsigned i = 0; i < scene->fb_max_samples; i++) 356 mask |= (uint64_t)(0xffff) << (16 * i); 357 358 /* Propagate non-interpolated raster state. */ 359 task->thread_data.raster_state.viewport_index = inputs->viewport_index; 360 task->thread_data.raster_state.view_index = inputs->view_index; 361 362 /* run shader on 4x4 block */ 363 BEGIN_JIT_CALL(state, task); 364 variant->jit_function[RAST_WHOLE](&state->jit_context, 365 tile_x + x, tile_y + y, 366 inputs->frontfacing, 367 GET_A0(inputs), 368 GET_DADX(inputs), 369 GET_DADY(inputs), 370 color, 371 depth, 372 mask, 373 &task->thread_data, 374 stride, 375 depth_stride, 376 sample_stride, 377 depth_sample_stride); 378 END_JIT_CALL(); 379 } 380 } 381} 382 383 384/** 385 * Run the shader on all blocks in a tile. This is used when a tile is 386 * completely contained inside a triangle, and the shader is opaque. 387 * This is a bin command called during bin processing. 388 */ 389static void 390lp_rast_shade_tile_opaque(struct lp_rasterizer_task *task, 391 const union lp_rast_cmd_arg arg) 392{ 393 LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__); 394 395 assert(task->state); 396 if (!task->state) { 397 return; 398 } 399 400 lp_rast_shade_tile(task, arg); 401} 402 403 404/** 405 * Compute shading for a 4x4 block of pixels inside a triangle. 406 * This is a bin command called during bin processing. 
407 * \param x X position of quad in window coords 408 * \param y Y position of quad in window coords 409 */ 410void 411lp_rast_shade_quads_mask_sample(struct lp_rasterizer_task *task, 412 const struct lp_rast_shader_inputs *inputs, 413 unsigned x, unsigned y, 414 uint64_t mask) 415{ 416 const struct lp_rast_state *state = task->state; 417 const struct lp_fragment_shader_variant *variant = state->variant; 418 const struct lp_scene *scene = task->scene; 419 420 assert(state); 421 422 /* Sanity checks */ 423 assert(x < scene->tiles_x * TILE_SIZE); 424 assert(y < scene->tiles_y * TILE_SIZE); 425 assert(x % TILE_VECTOR_WIDTH == 0); 426 assert(y % TILE_VECTOR_HEIGHT == 0); 427 428 assert((x % 4) == 0); 429 assert((y % 4) == 0); 430 431 /* color buffer */ 432 uint8_t *color[PIPE_MAX_COLOR_BUFS]; 433 unsigned stride[PIPE_MAX_COLOR_BUFS]; 434 unsigned sample_stride[PIPE_MAX_COLOR_BUFS]; 435 for (unsigned i = 0; i < scene->fb.nr_cbufs; i++) { 436 if (scene->fb.cbufs[i]) { 437 stride[i] = scene->cbufs[i].stride; 438 sample_stride[i] = scene->cbufs[i].sample_stride; 439 color[i] = lp_rast_get_color_block_pointer(task, i, x, y, 440 inputs->layer + inputs->view_index); 441 } 442 else { 443 stride[i] = 0; 444 sample_stride[i] = 0; 445 color[i] = NULL; 446 } 447 } 448 449 /* depth buffer */ 450 uint8_t *depth = NULL; 451 unsigned depth_stride = 0; 452 unsigned depth_sample_stride = 0; 453 if (scene->zsbuf.map) { 454 depth_stride = scene->zsbuf.stride; 455 depth_sample_stride = scene->zsbuf.sample_stride; 456 depth = lp_rast_get_depth_block_pointer(task, x, y, inputs->layer + inputs->view_index); 457 } 458 459 assert(lp_check_alignment(state->jit_context.u8_blend_color, 16)); 460 461 /* 462 * The rasterizer may produce fragments outside our 463 * allocated 4x4 blocks hence need to filter them out here. 464 */ 465 if ((x % TILE_SIZE) < task->width && (y % TILE_SIZE) < task->height) { 466 /* Propagate non-interpolated raster state. 
*/ 467 task->thread_data.raster_state.viewport_index = inputs->viewport_index; 468 task->thread_data.raster_state.view_index = inputs->view_index; 469 470 /* run shader on 4x4 block */ 471 BEGIN_JIT_CALL(state, task); 472 variant->jit_function[RAST_EDGE_TEST](&state->jit_context, 473 x, y, 474 inputs->frontfacing, 475 GET_A0(inputs), 476 GET_DADX(inputs), 477 GET_DADY(inputs), 478 color, 479 depth, 480 mask, 481 &task->thread_data, 482 stride, 483 depth_stride, 484 sample_stride, 485 depth_sample_stride); 486 END_JIT_CALL(); 487 } 488} 489 490 491void 492lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, 493 const struct lp_rast_shader_inputs *inputs, 494 unsigned x, unsigned y, 495 unsigned mask) 496{ 497 uint64_t new_mask = 0; 498 for (unsigned i = 0; i < task->scene->fb_max_samples; i++) 499 new_mask |= ((uint64_t)mask) << (16 * i); 500 lp_rast_shade_quads_mask_sample(task, inputs, x, y, new_mask); 501} 502 503 504/** 505 * Directly copy pixels from a texture to the destination color buffer. 506 * This is a bin command called during bin processing. 
507 */ 508static void 509lp_rast_blit_tile_to_dest(struct lp_rasterizer_task *task, 510 const union lp_rast_cmd_arg arg) 511{ 512 const struct lp_scene *scene = task->scene; 513 const struct lp_rast_shader_inputs *inputs = arg.shade_tile; 514 const struct lp_rast_state *state = task->state; 515 struct lp_fragment_shader_variant *variant = state->variant; 516 const struct lp_jit_texture *texture = &state->jit_context.textures[0]; 517 struct pipe_surface *cbuf = scene->fb.cbufs[0]; 518 const unsigned face_slice = cbuf->u.tex.first_layer; 519 const unsigned level = cbuf->u.tex.level; 520 struct llvmpipe_resource *lpt = llvmpipe_resource(cbuf->texture); 521 522 LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__); 523 524 if (inputs->disable) { 525 /* This command was partially binned and has been disabled */ 526 return; 527 } 528 529 uint8_t *dst = llvmpipe_get_texture_image_address(lpt, face_slice, level); 530 if (!dst) 531 return; 532 533 const unsigned dst_stride = lpt->row_stride[level]; 534 535 const uint8_t *src = texture->base; 536 const unsigned src_stride = texture->row_stride[0]; 537 538 int src_x = util_iround(GET_A0(inputs)[1][0]*texture->width - 0.5f); 539 int src_y = util_iround(GET_A0(inputs)[1][1]*texture->height - 0.5f); 540 541 src_x += task->x; 542 src_y += task->y; 543 544 if (0) { 545 union util_color uc; 546 uc.ui[0] = 0xff0000ff; 547 util_fill_rect(dst, 548 cbuf->format, 549 dst_stride, 550 task->x, 551 task->y, 552 task->width, 553 task->height, 554 &uc); 555 return; 556 } 557 558 if (src_x >= 0 && 559 src_y >= 0 && 560 src_x + task->width <= texture->width && 561 src_y + task->height <= texture->height) { 562 563 if (variant->shader->kind == LP_FS_KIND_BLIT_RGBA || 564 (variant->shader->kind == LP_FS_KIND_BLIT_RGB1 && 565 cbuf->format == PIPE_FORMAT_B8G8R8X8_UNORM)) { 566 util_copy_rect(dst, 567 cbuf->format, 568 dst_stride, 569 task->x, task->y, 570 task->width, task->height, 571 src, src_stride, 572 src_x, src_y); 573 return; 574 } 575 576 if 
(variant->shader->kind == LP_FS_KIND_BLIT_RGB1) { 577 if (cbuf->format == PIPE_FORMAT_B8G8R8A8_UNORM) { 578 dst += task->x * 4; 579 src += src_x * 4; 580 dst += task->y * dst_stride; 581 src += src_y * src_stride; 582 583 for (int y = 0; y < task->height; ++y) { 584 const uint32_t *src_row = (const uint32_t *)src; 585 uint32_t *dst_row = (uint32_t *)dst; 586 587 for (int x = 0; x < task->width; ++x) { 588 *dst_row++ = *src_row++ | 0xff000000; 589 } 590 dst += dst_stride; 591 src += src_stride; 592 } 593 594 return; 595 } 596 } 597 598 } 599 600 /* 601 * Fall back to the jit shaders. 602 */ 603 604 lp_rast_shade_tile_opaque(task, arg); 605} 606 607 608static void 609lp_rast_blit_tile(struct lp_rasterizer_task *task, 610 const union lp_rast_cmd_arg arg) 611{ 612 /* This kindof just works, but isn't efficient: 613 */ 614 lp_rast_blit_tile_to_dest(task, arg); 615} 616 617 618/** 619 * Begin a new occlusion query. 620 * This is a bin command put in all bins. 621 * Called per thread. 622 */ 623static void 624lp_rast_begin_query(struct lp_rasterizer_task *task, 625 const union lp_rast_cmd_arg arg) 626{ 627 struct llvmpipe_query *pq = arg.query_obj; 628 629 switch (pq->type) { 630 case PIPE_QUERY_OCCLUSION_COUNTER: 631 case PIPE_QUERY_OCCLUSION_PREDICATE: 632 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: 633 pq->start[task->thread_index] = task->thread_data.vis_counter; 634 break; 635 case PIPE_QUERY_PIPELINE_STATISTICS: 636 pq->start[task->thread_index] = task->thread_data.ps_invocations; 637 break; 638 case PIPE_QUERY_TIME_ELAPSED: 639 pq->start[task->thread_index] = os_time_get_nano(); 640 break; 641 default: 642 assert(0); 643 break; 644 } 645} 646 647 648/** 649 * End the current occlusion query. 650 * This is a bin command put in all bins. 651 * Called per thread. 
652 */ 653static void 654lp_rast_end_query(struct lp_rasterizer_task *task, 655 const union lp_rast_cmd_arg arg) 656{ 657 struct llvmpipe_query *pq = arg.query_obj; 658 659 switch (pq->type) { 660 case PIPE_QUERY_OCCLUSION_COUNTER: 661 case PIPE_QUERY_OCCLUSION_PREDICATE: 662 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: 663 pq->end[task->thread_index] += 664 task->thread_data.vis_counter - pq->start[task->thread_index]; 665 pq->start[task->thread_index] = 0; 666 break; 667 case PIPE_QUERY_TIMESTAMP: 668 case PIPE_QUERY_TIME_ELAPSED: 669 pq->end[task->thread_index] = os_time_get_nano(); 670 break; 671 case PIPE_QUERY_PIPELINE_STATISTICS: 672 pq->end[task->thread_index] += 673 task->thread_data.ps_invocations - pq->start[task->thread_index]; 674 pq->start[task->thread_index] = 0; 675 break; 676 default: 677 assert(0); 678 break; 679 } 680} 681 682 683void 684lp_rast_set_state(struct lp_rasterizer_task *task, 685 const union lp_rast_cmd_arg arg) 686{ 687 task->state = arg.set_state; 688} 689 690 691/** 692 * Called when we're done writing to a color tile. 693 */ 694static void 695lp_rast_tile_end(struct lp_rasterizer_task *task) 696{ 697 698 for (unsigned i = 0; i < task->scene->num_active_queries; ++i) { 699 lp_rast_end_query(task, 700 lp_rast_arg_query(task->scene->active_queries[i])); 701 } 702 703 /* debug */ 704 memset(task->color_tiles, 0, sizeof(task->color_tiles)); 705 task->depth_tile = NULL; 706 task->bin = NULL; 707} 708 709 710/* Currently have two rendering paths only - the general case triangle 711 * path and the super-specialized blit/clear path. 
 *
 * Each mask below is (flag << 1) - 1.  Assuming every LP_RAST_FLAGS_*
 * value is a single bit (TODO confirm in the header), that sets the
 * flag's own bit plus every lower bit.  lp_characterize_bin() ANDs the
 * masks of all commands in a bin and rasterize_bin() tests the result
 * against LP_RAST_FLAGS_BLIT / LP_RAST_FLAGS_RECT to pick a path.
 */
#define TRI  ((LP_RAST_FLAGS_TRI <<1)-1)     /* general case */
#define RECT ((LP_RAST_FLAGS_RECT<<1)-1)     /* direct rectangle rasterizer */
#define BLIT ((LP_RAST_FLAGS_BLIT<<1)-1)     /* write direct-to-dest */

/* Path mask for every bin command, indexed by command opcode.  The
 * number of entries is checked against LP_RAST_OP_MAX in
 * lp_characterize_bin(); the order must match the dispatch tables.
 */
static const unsigned
rast_flags[] = {
   BLIT,                        /* clear color */
   TRI,                         /* clear zstencil */
   TRI,                         /* triangle_1 */
   TRI,                         /* triangle_2 */
   TRI,                         /* triangle_3 */
   TRI,                         /* triangle_4 */
   TRI,                         /* triangle_5 */
   TRI,                         /* triangle_6 */
   TRI,                         /* triangle_7 */
   TRI,                         /* triangle_8 */
   TRI,                         /* triangle_3_4 */
   TRI,                         /* triangle_3_16 */
   TRI,                         /* triangle_4_16 */
   RECT,                        /* shade_tile */
   RECT,                        /* shade_tile_opaque */
   TRI,                         /* begin_query */
   TRI,                         /* end_query */
   BLIT,                        /* set_state, */
   TRI,                         /* lp_rast_triangle_32_1 */
   TRI,                         /* lp_rast_triangle_32_2 */
   TRI,                         /* lp_rast_triangle_32_3 */
   TRI,                         /* lp_rast_triangle_32_4 */
   TRI,                         /* lp_rast_triangle_32_5 */
   TRI,                         /* lp_rast_triangle_32_6 */
   TRI,                         /* lp_rast_triangle_32_7 */
   TRI,                         /* lp_rast_triangle_32_8 */
   TRI,                         /* lp_rast_triangle_32_3_4 */
   TRI,                         /* lp_rast_triangle_32_3_16 */
   TRI,                         /* lp_rast_triangle_32_4_16 */
   TRI,                         /* lp_rast_triangle_ms_1 */
   TRI,                         /* lp_rast_triangle_ms_2 */
   TRI,                         /* lp_rast_triangle_ms_3 */
   TRI,                         /* lp_rast_triangle_ms_4 */
   TRI,                         /* lp_rast_triangle_ms_5 */
   TRI,                         /* lp_rast_triangle_ms_6 */
   TRI,                         /* lp_rast_triangle_ms_7 */
   TRI,                         /* lp_rast_triangle_ms_8 */
   TRI,                         /* lp_rast_triangle_ms_3_4 */
   TRI,                         /* lp_rast_triangle_ms_3_16 */
   TRI,                         /* lp_rast_triangle_ms_4_16 */
   RECT,                        /* rectangle */
   BLIT,                        /* blit */
};

/* Handlers used by blit_rasterize_bin().  Only the commands marked BLIT
 * in rast_flags[] (clear color, set_state, blit) have a handler here;
 * every other slot is NULL and must never be dispatched through this
 * table.
 */
static const lp_rast_cmd_func
dispatch_blit[] = {
   lp_rast_clear_color,
   NULL,                        /* clear_zstencil */
   NULL,                        /* triangle_1 */
   NULL,                        /* triangle_2 */
   NULL,                        /* triangle_3 */
   NULL,                        /* triangle_4 */
   NULL,                        /* triangle_5 */
   NULL,                        /* triangle_6 */
   NULL,                        /* triangle_7 */
   NULL,                        /* triangle_8 */
   NULL,                        /* triangle_3_4 */
   NULL,                        /* triangle_3_16 */
   NULL,                        /* triangle_4_16 */
   NULL,                        /* shade_tile */
   NULL,                        /* shade_tile_opaque */
   NULL,                        /* begin_query */
   NULL,                        /* end_query */
   lp_rast_set_state,           /* set_state */
   NULL,                        /* lp_rast_triangle_32_1 */
   NULL,                        /* lp_rast_triangle_32_2 */
   NULL,                        /* lp_rast_triangle_32_3 */
   NULL,                        /* lp_rast_triangle_32_4 */
   NULL,                        /* lp_rast_triangle_32_5 */
   NULL,                        /* lp_rast_triangle_32_6 */
   NULL,                        /* lp_rast_triangle_32_7 */
   NULL,                        /* lp_rast_triangle_32_8 */
   NULL,                        /* lp_rast_triangle_32_3_4 */
   NULL,                        /* lp_rast_triangle_32_3_16 */
   NULL,                        /* lp_rast_triangle_32_4_16 */
   NULL,                        /* lp_rast_triangle_ms_1 */
   NULL,                        /* lp_rast_triangle_ms_2 */
   NULL,                        /* lp_rast_triangle_ms_3 */
   NULL,                        /* lp_rast_triangle_ms_4 */
   NULL,                        /* lp_rast_triangle_ms_5 */
   NULL,                        /* lp_rast_triangle_ms_6 */
   NULL,                        /* lp_rast_triangle_ms_7 */
   NULL,                        /* lp_rast_triangle_ms_8 */
   NULL,                        /* lp_rast_triangle_ms_3_4 */
   NULL,                        /* lp_rast_triangle_ms_3_16 */
   NULL,                        /* lp_rast_triangle_ms_4_16 */
   NULL,                        /* rectangle */
   lp_rast_blit_tile_to_dest,
};



/* Triangle and general case rasterization: Use the SOA llvm shaders,
 * an active swizzled tile for each color buf, etc.  Don't blit/clear
 * directly to destination surface as we know there are swizzled
 * operations coming.
 *
 * Indexed by command opcode; used by tri_rasterize_bin().  Every opcode
 * has a handler in this table.
 */
static const lp_rast_cmd_func
dispatch_tri[] = {
   lp_rast_clear_color,
   lp_rast_clear_zstencil,
   lp_rast_triangle_1,
   lp_rast_triangle_2,
   lp_rast_triangle_3,
   lp_rast_triangle_4,
   lp_rast_triangle_5,
   lp_rast_triangle_6,
   lp_rast_triangle_7,
   lp_rast_triangle_8,
   lp_rast_triangle_3_4,
   lp_rast_triangle_3_16,
   lp_rast_triangle_4_16,
   lp_rast_shade_tile,
   lp_rast_shade_tile_opaque,
   lp_rast_begin_query,
   lp_rast_end_query,
   lp_rast_set_state,
   lp_rast_triangle_32_1,
   lp_rast_triangle_32_2,
   lp_rast_triangle_32_3,
   lp_rast_triangle_32_4,
   lp_rast_triangle_32_5,
   lp_rast_triangle_32_6,
   lp_rast_triangle_32_7,
   lp_rast_triangle_32_8,
   lp_rast_triangle_32_3_4,
   lp_rast_triangle_32_3_16,
   lp_rast_triangle_32_4_16,
   lp_rast_triangle_ms_1,
   lp_rast_triangle_ms_2,
   lp_rast_triangle_ms_3,
   lp_rast_triangle_ms_4,
   lp_rast_triangle_ms_5,
   lp_rast_triangle_ms_6,
   lp_rast_triangle_ms_7,
   lp_rast_triangle_ms_8,
   lp_rast_triangle_ms_3_4,
   lp_rast_triangle_ms_3_16,
   lp_rast_triangle_ms_4_16,
   lp_rast_rectangle,
   lp_rast_blit_tile,
};


/* Debug rasterization with most fastpaths disabled.
866 */ 867static const lp_rast_cmd_func 868dispatch_tri_debug[] = 869{ 870 lp_rast_clear_color, 871 lp_rast_clear_zstencil, 872 lp_rast_triangle_1, 873 lp_rast_triangle_2, 874 lp_rast_triangle_3, 875 lp_rast_triangle_4, 876 lp_rast_triangle_5, 877 lp_rast_triangle_6, 878 lp_rast_triangle_7, 879 lp_rast_triangle_8, 880 lp_rast_triangle_3_4, 881 lp_rast_triangle_3_16, 882 lp_rast_triangle_4_16, 883 lp_rast_shade_tile, 884 lp_rast_shade_tile, 885 lp_rast_begin_query, 886 lp_rast_end_query, 887 lp_rast_set_state, 888 lp_rast_triangle_32_1, 889 lp_rast_triangle_32_2, 890 lp_rast_triangle_32_3, 891 lp_rast_triangle_32_4, 892 lp_rast_triangle_32_5, 893 lp_rast_triangle_32_6, 894 lp_rast_triangle_32_7, 895 lp_rast_triangle_32_8, 896 lp_rast_triangle_32_3_4, 897 lp_rast_triangle_32_3_16, 898 lp_rast_triangle_32_4_16, 899 lp_rast_triangle_ms_1, 900 lp_rast_triangle_ms_2, 901 lp_rast_triangle_ms_3, 902 lp_rast_triangle_ms_4, 903 lp_rast_triangle_ms_5, 904 lp_rast_triangle_ms_6, 905 lp_rast_triangle_ms_7, 906 lp_rast_triangle_ms_8, 907 lp_rast_triangle_ms_3_4, 908 lp_rast_triangle_ms_3_16, 909 lp_rast_triangle_ms_4_16, 910 lp_rast_rectangle, 911 lp_rast_shade_tile, 912}; 913 914 915struct lp_bin_info 916lp_characterize_bin(const struct cmd_bin *bin) 917{ 918 unsigned andflags = ~0, j = 0; 919 920 STATIC_ASSERT(ARRAY_SIZE(rast_flags) == LP_RAST_OP_MAX); 921 922 for (const struct cmd_block *block = bin->head; block; block = block->next) { 923 for (unsigned k = 0; k < block->count; k++, j++) { 924 andflags &= rast_flags[block->cmd[k]]; 925 } 926 } 927 928 struct lp_bin_info info; 929 info.type = andflags; 930 info.count = j; 931 932 return info; 933} 934 935 936static void 937blit_rasterize_bin(struct lp_rasterizer_task *task, 938 const struct cmd_bin *bin) 939{ 940 STATIC_ASSERT(ARRAY_SIZE(dispatch_blit) == LP_RAST_OP_MAX); 941 942 if (0) debug_printf("%s\n", __FUNCTION__); 943 for (const struct cmd_block *block = bin->head; block; block = block->next) { 944 for (unsigned k = 0; 
k < block->count; k++) { 945 dispatch_blit[block->cmd[k]](task, block->arg[k]); 946 } 947 } 948} 949 950 951static void 952tri_rasterize_bin(struct lp_rasterizer_task *task, 953 const struct cmd_bin *bin, 954 int x, int y) 955{ 956 STATIC_ASSERT(ARRAY_SIZE(dispatch_tri) == LP_RAST_OP_MAX); 957 958 for (const struct cmd_block *block = bin->head; block; block = block->next) { 959 for (unsigned k = 0; k < block->count; k++) { 960 dispatch_tri[block->cmd[k]](task, block->arg[k]); 961 } 962 } 963} 964 965 966static void 967debug_rasterize_bin(struct lp_rasterizer_task *task, 968 const struct cmd_bin *bin) 969{ 970 STATIC_ASSERT(ARRAY_SIZE(dispatch_tri_debug) == LP_RAST_OP_MAX); 971 972 for (const struct cmd_block *block = bin->head; block; block = block->next) { 973 for (unsigned k = 0; k < block->count; k++) { 974 dispatch_tri_debug[block->cmd[k]](task, block->arg[k]); 975 } 976 } 977} 978 979 980/** 981 * Rasterize commands for a single bin. 982 * \param x, y position of the bin's tile in the framebuffer 983 * Must be called between lp_rast_begin() and lp_rast_end(). 984 * Called per thread. 
985 */ 986static void 987rasterize_bin(struct lp_rasterizer_task *task, 988 const struct cmd_bin *bin, int x, int y) 989{ 990 struct lp_bin_info info = lp_characterize_bin(bin); 991 992 lp_rast_tile_begin(task, bin, x, y); 993 994 if (LP_DEBUG & DEBUG_NO_FASTPATH) { 995 debug_rasterize_bin(task, bin); 996 } else if (info.type & LP_RAST_FLAGS_BLIT) { 997 blit_rasterize_bin(task, bin); 998 } else if (task->scene->permit_linear_rasterizer && 999 !(LP_PERF & PERF_NO_RAST_LINEAR) && 1000 (info.type & LP_RAST_FLAGS_RECT)) { 1001 lp_linear_rasterize_bin(task, bin); 1002 } else { 1003 tri_rasterize_bin(task, bin, x, y); 1004 } 1005 1006 lp_rast_tile_end(task); 1007 1008#ifdef DEBUG 1009 /* Debug/Perf flags: 1010 */ 1011 if (bin->head->count == 1) { 1012 if (bin->head->cmd[0] == LP_RAST_OP_BLIT) 1013 LP_COUNT(nr_pure_blit_64); 1014 else if (bin->head->cmd[0] == LP_RAST_OP_SHADE_TILE_OPAQUE) 1015 LP_COUNT(nr_pure_shade_opaque_64); 1016 else if (bin->head->cmd[0] == LP_RAST_OP_SHADE_TILE) 1017 LP_COUNT(nr_pure_shade_64); 1018 } 1019#endif 1020} 1021 1022 1023/* An empty bin is one that just loads the contents of the tile and 1024 * stores them again unchanged. This typically happens when bins have 1025 * been flushed for some reason in the middle of a frame, or when 1026 * incremental updates are being made to a render target. 1027 * 1028 * Try to avoid doing pointless work in this case. 1029 */ 1030static boolean 1031is_empty_bin(const struct cmd_bin *bin) 1032{ 1033 return bin->head == NULL; 1034} 1035 1036 1037/** 1038 * Rasterize/execute all bins within a scene. 1039 * Called per thread. 1040 */ 1041static void 1042rasterize_scene(struct lp_rasterizer_task *task, 1043 struct lp_scene *scene) 1044{ 1045 task->scene = scene; 1046 1047 /* Clear the cache tags. This should not always be necessary but 1048 simpler for now. 
*/ 1049#if LP_USE_TEXTURE_CACHE 1050 memset(task->thread_data.cache->cache_tags, 0, 1051 sizeof(task->thread_data.cache->cache_tags)); 1052#if LP_BUILD_FORMAT_CACHE_DEBUG 1053 task->thread_data.cache->cache_access_total = 0; 1054 task->thread_data.cache->cache_access_miss = 0; 1055#endif 1056#endif 1057 1058 if (!task->rast->no_rast) { 1059 /* loop over scene bins, rasterize each */ 1060 { 1061 struct cmd_bin *bin; 1062 int i, j; 1063 1064 assert(scene); 1065 while ((bin = lp_scene_bin_iter_next(scene, &i, &j))) { 1066 if (!is_empty_bin(bin)) 1067 rasterize_bin(task, bin, i, j); 1068 } 1069 } 1070 } 1071 1072 1073#if LP_BUILD_FORMAT_CACHE_DEBUG 1074 { 1075 uint64_t total, miss; 1076 total = task->thread_data.cache->cache_access_total; 1077 miss = task->thread_data.cache->cache_access_miss; 1078 if (total) { 1079 debug_printf("thread %d cache access %llu miss %llu hit rate %f\n", 1080 task->thread_index, (long long unsigned)total, 1081 (long long unsigned)miss, 1082 (float)(total - miss)/(float)total); 1083 } 1084 } 1085#endif 1086 1087 if (scene->fence) { 1088 lp_fence_signal(scene->fence); 1089 } 1090 1091 task->scene = NULL; 1092} 1093 1094 1095/** 1096 * Called by setup module when it has something for us to render. 1097 */ 1098void 1099lp_rast_queue_scene(struct lp_rasterizer *rast, 1100 struct lp_scene *scene) 1101{ 1102 LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); 1103 1104 lp_fence_reference(&rast->last_fence, scene->fence); 1105 if (rast->last_fence) 1106 rast->last_fence->issued = TRUE; 1107 1108 if (rast->num_threads == 0) { 1109 /* no threading */ 1110 unsigned fpstate = util_fpstate_get(); 1111 1112 /* Make sure that denorms are treated like zeros. This is 1113 * the behavior required by D3D10. OpenGL doesn't care. 
1114 */ 1115 util_fpstate_set_denorms_to_zero(fpstate); 1116 1117 lp_rast_begin(rast, scene); 1118 1119 rasterize_scene(&rast->tasks[0], scene); 1120 1121 lp_rast_end(rast); 1122 1123 util_fpstate_set(fpstate); 1124 1125 rast->curr_scene = NULL; 1126 } 1127 else { 1128 /* threaded rendering! */ 1129 unsigned i; 1130 1131 lp_scene_enqueue(rast->full_scenes, scene); 1132 1133 /* signal the threads that there's work to do */ 1134 for (i = 0; i < rast->num_threads; i++) { 1135 pipe_semaphore_signal(&rast->tasks[i].work_ready); 1136 } 1137 } 1138 1139 LP_DBG(DEBUG_SETUP, "%s done \n", __FUNCTION__); 1140} 1141 1142 1143void 1144lp_rast_finish(struct lp_rasterizer *rast) 1145{ 1146 if (rast->num_threads == 0) { 1147 /* nothing to do */ 1148 } 1149 else { 1150 int i; 1151 1152 /* wait for work to complete */ 1153 for (i = 0; i < rast->num_threads; i++) { 1154 pipe_semaphore_wait(&rast->tasks[i].work_done); 1155 } 1156 } 1157} 1158 1159 1160/** 1161 * This is the thread's main entrypoint. 1162 * It's a simple loop: 1163 * 1. wait for work 1164 * 2. do work 1165 * 3. signal that we're done 1166 */ 1167static int 1168thread_function(void *init_data) 1169{ 1170 struct lp_rasterizer_task *task = (struct lp_rasterizer_task *) init_data; 1171 struct lp_rasterizer *rast = task->rast; 1172 boolean debug = false; 1173 char thread_name[16]; 1174 1175 snprintf(thread_name, sizeof thread_name, "llvmpipe-%u", task->thread_index); 1176 u_thread_setname(thread_name); 1177 1178 /* Make sure that denorms are treated like zeros. This is 1179 * the behavior required by D3D10. OpenGL doesn't care. 
1180 */ 1181 unsigned fpstate = util_fpstate_get(); 1182 util_fpstate_set_denorms_to_zero(fpstate); 1183 1184 while (1) { 1185 /* wait for work */ 1186 if (debug) 1187 debug_printf("thread %d waiting for work\n", task->thread_index); 1188 pipe_semaphore_wait(&task->work_ready); 1189 1190 if (rast->exit_flag) 1191 break; 1192 1193 if (task->thread_index == 0) { 1194 /* thread[0]: 1195 * - get next scene to rasterize 1196 * - map the framebuffer surfaces 1197 */ 1198 lp_rast_begin(rast, lp_scene_dequeue(rast->full_scenes, TRUE)); 1199 } 1200 1201 /* Wait for all threads to get here so that threads[1+] don't 1202 * get a null rast->curr_scene pointer. 1203 */ 1204 util_barrier_wait(&rast->barrier); 1205 1206 /* do work */ 1207 if (debug) 1208 debug_printf("thread %d doing work\n", task->thread_index); 1209 1210 rasterize_scene(task, rast->curr_scene); 1211 1212 /* wait for all threads to finish with this scene */ 1213 util_barrier_wait(&rast->barrier); 1214 1215 /* XXX: shouldn't be necessary: 1216 */ 1217 if (task->thread_index == 0) { 1218 lp_rast_end(rast); 1219 } 1220 1221 /* signal done with work */ 1222 if (debug) 1223 debug_printf("thread %d done working\n", task->thread_index); 1224 1225 pipe_semaphore_signal(&task->work_done); 1226 } 1227 1228#ifdef _WIN32 1229 pipe_semaphore_signal(&task->work_done); 1230#endif 1231 1232 return 0; 1233} 1234 1235 1236/** 1237 * Initialize semaphores and spawn the threads. 
1238 */ 1239static void 1240create_rast_threads(struct lp_rasterizer *rast) 1241{ 1242 /* NOTE: if num_threads is zero, we won't use any threads */ 1243 for (unsigned i = 0; i < rast->num_threads; i++) { 1244 pipe_semaphore_init(&rast->tasks[i].work_ready, 0); 1245 pipe_semaphore_init(&rast->tasks[i].work_done, 0); 1246 if (thrd_success != u_thread_create(rast->threads + i, thread_function, 1247 (void *) &rast->tasks[i])) { 1248 rast->num_threads = i; /* previous thread is max */ 1249 break; 1250 } 1251 } 1252} 1253 1254 1255/** 1256 * Create new lp_rasterizer. If num_threads is zero, don't create any 1257 * new threads, do rendering synchronously. 1258 * \param num_threads number of rasterizer threads to create 1259 */ 1260struct lp_rasterizer * 1261lp_rast_create(unsigned num_threads) 1262{ 1263 struct lp_rasterizer *rast; 1264 unsigned i; 1265 1266 rast = CALLOC_STRUCT(lp_rasterizer); 1267 if (!rast) { 1268 goto no_rast; 1269 } 1270 1271 rast->full_scenes = lp_scene_queue_create(); 1272 if (!rast->full_scenes) { 1273 goto no_full_scenes; 1274 } 1275 1276 for (i = 0; i < MAX2(1, num_threads); i++) { 1277 struct lp_rasterizer_task *task = &rast->tasks[i]; 1278 task->rast = rast; 1279 task->thread_index = i; 1280 task->thread_data.cache = 1281 align_malloc(sizeof(struct lp_build_format_cache), 16); 1282 if (!task->thread_data.cache) { 1283 goto no_thread_data_cache; 1284 } 1285 } 1286 1287 rast->num_threads = num_threads; 1288 1289 rast->no_rast = debug_get_bool_option("LP_NO_RAST", FALSE); 1290 1291 create_rast_threads(rast); 1292 1293 /* for synchronizing rasterization threads */ 1294 if (rast->num_threads > 0) { 1295 util_barrier_init(&rast->barrier, rast->num_threads); 1296 } 1297 1298 memset(lp_dummy_tile, 0, sizeof lp_dummy_tile); 1299 1300 return rast; 1301 1302no_thread_data_cache: 1303 for (i = 0; i < MAX2(1, rast->num_threads); i++) { 1304 if (rast->tasks[i].thread_data.cache) { 1305 align_free(rast->tasks[i].thread_data.cache); 1306 } 1307 } 1308 1309 
lp_scene_queue_destroy(rast->full_scenes); 1310no_full_scenes: 1311 FREE(rast); 1312no_rast: 1313 return NULL; 1314} 1315 1316 1317/* Shutdown: 1318 */ 1319void 1320lp_rast_destroy(struct lp_rasterizer *rast) 1321{ 1322 /* Set exit_flag and signal each thread's work_ready semaphore. 1323 * Each thread will be woken up, notice that the exit_flag is set and 1324 * break out of its main loop. The thread will then exit. 1325 */ 1326 rast->exit_flag = TRUE; 1327 for (unsigned i = 0; i < rast->num_threads; i++) { 1328 pipe_semaphore_signal(&rast->tasks[i].work_ready); 1329 } 1330 1331 /* Wait for threads to terminate before cleaning up per-thread data. 1332 * We don't actually call pipe_thread_wait to avoid dead lock on Windows 1333 * per https://bugs.freedesktop.org/show_bug.cgi?id=76252 */ 1334 for (unsigned i = 0; i < rast->num_threads; i++) { 1335#ifdef _WIN32 1336 /* Threads might already be dead - Windows apparently terminates 1337 * other threads when returning from main. 1338 */ 1339 DWORD exit_code = STILL_ACTIVE; 1340 if (GetExitCodeThread(rast->threads[i], &exit_code) && 1341 exit_code == STILL_ACTIVE) { 1342 pipe_semaphore_wait(&rast->tasks[i].work_done); 1343 } 1344#else 1345 thrd_join(rast->threads[i], NULL); 1346#endif 1347 } 1348 1349 /* Clean up per-thread data */ 1350 for (unsigned i = 0; i < rast->num_threads; i++) { 1351 pipe_semaphore_destroy(&rast->tasks[i].work_ready); 1352 pipe_semaphore_destroy(&rast->tasks[i].work_done); 1353 } 1354 for (unsigned i = 0; i < MAX2(1, rast->num_threads); i++) { 1355 align_free(rast->tasks[i].thread_data.cache); 1356 } 1357 1358 lp_fence_reference(&rast->last_fence, NULL); 1359 1360 /* for synchronizing rasterization threads */ 1361 if (rast->num_threads > 0) { 1362 util_barrier_destroy(&rast->barrier); 1363 } 1364 1365 lp_scene_queue_destroy(rast->full_scenes); 1366 1367 FREE(rast); 1368} 1369 1370void lp_rast_fence(struct lp_rasterizer *rast, 1371 struct lp_fence **fence) 1372{ 1373 if (fence) 1374 
lp_fence_reference((struct lp_fence **)fence, rast->last_fence); 1375} 1376