/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_parse.h"
#include "util/format/u_format.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_string.h"

#include "nir/tgsi_to_nir.h"
#include "nir_serialize.h"

#include "freedreno_context.h"
#include "freedreno_util.h"

#include "ir3/ir3_cache.h"
#include "ir3/ir3_compiler.h"
#include "ir3/ir3_gallium.h"
#include "ir3/ir3_nir.h"
#include "ir3/ir3_shader.h"

/**
 * The hardware cso for shader state
 *
 * Initially just a container for the ir3_shader, but this is where we'll
 * plumb in async compile.
 */
struct ir3_shader_state {
   struct ir3_shader *shader;

   /* Fence signalled when async compile is completed: */
   struct util_queue_fence ready;
};

/**
 * Should initial variants be compiled synchronously?
 *
 * The only case where util_debug_message() is used in the initial-variants
 * path is with FD_MESA_DEBUG=shaderdb. So if either debug is disabled (ie.
 * debug.debug_message==NULL), or shaderdb stats are not enabled, we can
 * compile the initial shader variant asynchronously.
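 *
 * FD_MESA_DEBUG=serialc likewise forces the initial variants to be
 * compiled synchronously.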
 */
static bool
initial_variants_synchronous(struct fd_context *ctx)
{
   return unlikely(ctx->debug.debug_message) || FD_DBG(SHADERDB) ||
          FD_DBG(SERIALC);
}

static void
dump_shader_info(struct ir3_shader_variant *v,
                 struct util_debug_callback *debug)
{
   if (!FD_DBG(SHADERDB))
      return;

   util_debug_message(
      debug, SHADER_INFO,
      "%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, "
      "%u dwords, %u last-baryf, %u half, %u full, %u constlen, "
      "%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, "
      "%u stp, %u ldp, %u sstall, %u (ss), %u systall, %u (sy), %d waves, "
      "%d loops\n",
      ir3_shader_stage(v), v->info.instrs_count, v->info.nops_count,
      v->info.instrs_count - v->info.nops_count, v->info.mov_count,
      v->info.cov_count, v->info.sizedwords, v->info.last_baryf,
      v->info.max_half_reg + 1, v->info.max_reg + 1, v->constlen,
      v->info.instrs_per_cat[0], v->info.instrs_per_cat[1],
      v->info.instrs_per_cat[2], v->info.instrs_per_cat[3],
      v->info.instrs_per_cat[4], v->info.instrs_per_cat[5],
      v->info.instrs_per_cat[6], v->info.instrs_per_cat[7],
      v->info.stp_count, v->info.ldp_count, v->info.sstall,
      v->info.ss, v->info.systall, v->info.sy, v->info.max_waves, v->loops);
}

static void
upload_shader_variant(struct ir3_shader_variant *v)
{
   struct ir3_compiler *compiler = v->compiler;

   assert(!v->bo);

   v->bo =
      fd_bo_new(compiler->dev, v->info.size, FD_BO_NOMAP,
                "%s:%s", ir3_shader_stage(v), v->name);

   /* Always include shaders in kernel crash dumps. */
   fd_bo_mark_for_dump(v->bo);

   fd_bo_upload(v->bo, v->bin, v->info.size);
}

struct ir3_shader_variant *
ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key,
                   bool binning_pass, struct util_debug_callback *debug)
{
   struct ir3_shader_variant *v;
   bool created = false;

   /* Some shader key values may not be used by a given ir3_shader (for
    * example, fragment shader saturates in the vertex shader), so clean out
    * those flags to avoid recompiling.
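    *
    * Otherwise two keys that differ only in bits this stage ignores would
    * look like distinct variants and trigger needless recompiles.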
    */
   ir3_key_clear_unused(&key, shader);

   v = ir3_shader_get_variant(shader, &key, binning_pass, false, &created);

   if (created) {
      if (shader->initial_variants_done) {
         perf_debug_message(debug, SHADER_INFO,
                            "%s shader: recompiling at draw time: global "
                            "0x%08x, vfsamples %x/%x, astc %x/%x\n",
                            ir3_shader_stage(v), key.global, key.vsamples,
                            key.fsamples, key.vastc_srgb, key.fastc_srgb);
      }

      dump_shader_info(v, debug);
      upload_shader_variant(v);

      if (v->binning) {
         upload_shader_variant(v->binning);
         dump_shader_info(v->binning, debug);
      }
   }

   return v;
}

static void
copy_stream_out(struct ir3_stream_output_info *i,
                const struct pipe_stream_output_info *p)
{
   STATIC_ASSERT(ARRAY_SIZE(i->stride) == ARRAY_SIZE(p->stride));
   STATIC_ASSERT(ARRAY_SIZE(i->output) == ARRAY_SIZE(p->output));

   i->num_outputs = p->num_outputs;
   for (int n = 0; n < ARRAY_SIZE(i->stride); n++)
      i->stride[n] = p->stride[n];

   for (int n = 0; n < ARRAY_SIZE(i->output); n++) {
      i->output[n].register_index = p->output[n].register_index;
      i->output[n].start_component = p->output[n].start_component;
      i->output[n].num_components = p->output[n].num_components;
      i->output[n].output_buffer = p->output[n].output_buffer;
      i->output[n].dst_offset = p->output[n].dst_offset;
      i->output[n].stream = p->output[n].stream;
   }
}

static void
create_initial_variants(struct ir3_shader_state *hwcso,
                        struct util_debug_callback *debug)
{
   struct ir3_shader *shader = hwcso->shader;
   struct ir3_compiler *compiler = shader->compiler;
   nir_shader *nir = shader->nir;

   /* Compile standard variants immediately to try to avoid draw-time stalls
    * to run the compiler.
    */
   struct ir3_shader_key key = {
      .tessellation = IR3_TESS_NONE,
      .ucp_enables = MASK(nir->info.clip_distance_array_size),
      .msaa = true,
   };

   switch (nir->info.stage) {
   case MESA_SHADER_TESS_EVAL:
      key.tessellation = ir3_tess_mode(nir->info.tess._primitive_mode);
      break;

   case MESA_SHADER_TESS_CTRL:
      /* The primitive_mode field, while it exists for TCS, is not
       * populated (since separable shaders between TCS/TES are legal,
       * so TCS wouldn't have access to TES's declaration). Make a
       * guess so that we shader-db something plausible for TCS.
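       *
       * Isoline domains only use the outer tess levels, so a TCS that
       * writes TESS_LEVEL_INNER is presumably paired with a triangle
       * (or quad) TES.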
       */
      if (nir->info.outputs_written & VARYING_BIT_TESS_LEVEL_INNER)
         key.tessellation = IR3_TESS_TRIANGLES;
      else
         key.tessellation = IR3_TESS_ISOLINES;
      break;

   case MESA_SHADER_GEOMETRY:
      key.has_gs = true;
      break;

   default:
      break;
   }

   key.safe_constlen = false;
   struct ir3_shader_variant *v = ir3_shader_variant(shader, key, false, debug);
   if (!v)
      return;

   if (v->constlen > compiler->max_const_safe) {
      key.safe_constlen = true;
      ir3_shader_variant(shader, key, false, debug);
   }

   /* For vertex shaders, also compile initial binning pass shader: */
   if (nir->info.stage == MESA_SHADER_VERTEX) {
      key.safe_constlen = false;
      v = ir3_shader_variant(shader, key, true, debug);
      if (!v)
         return;

      if (v->constlen > compiler->max_const_safe) {
         key.safe_constlen = true;
         ir3_shader_variant(shader, key, true, debug);
      }
   }

   shader->initial_variants_done = true;
}

static void
create_initial_variants_async(void *job, void *gdata, int thread_index)
{
   struct ir3_shader_state *hwcso = job;
   struct util_debug_callback debug = {};

   create_initial_variants(hwcso, &debug);
}

static void
create_initial_compute_variants_async(void *job, void *gdata, int thread_index)
{
   struct ir3_shader_state *hwcso = job;
   struct ir3_shader *shader = hwcso->shader;
   struct util_debug_callback debug = {};
   static struct ir3_shader_key key; /* static is implicitly zeroed */

   ir3_shader_variant(shader, key, false, &debug);
   shader->initial_variants_done = true;
}

/* a bit annoying that compute-shader and normal shader state objects
 * aren't a bit more aligned.
 */
void *
ir3_shader_compute_state_create(struct pipe_context *pctx,
                                const struct pipe_compute_state *cso)
{
   struct fd_context *ctx = fd_context(pctx);

   /* req_input_mem will only be non-zero for cl kernels (ie. clover).
    * This isn't a perfect test because I guess it is possible (but
    * uncommon) for none of the kernel parameters to be a global,
    * but ctx->set_global_bindings() can't fail, so this is the next
    * best place to fail if we need a newer version of kernel driver:
    */
   if ((cso->req_input_mem > 0) &&
       fd_device_version(ctx->dev) < FD_VERSION_BO_IOVA) {
      return NULL;
   }

   struct ir3_compiler *compiler = ctx->screen->compiler;
   nir_shader *nir;

   if (cso->ir_type == PIPE_SHADER_IR_NIR) {
      /* we take ownership of the reference: */
      nir = (nir_shader *)cso->prog;
   } else if (cso->ir_type == PIPE_SHADER_IR_NIR_SERIALIZED) {
      const nir_shader_compiler_options *options =
         ir3_get_compiler_options(compiler);
      const struct pipe_binary_program_header *hdr = cso->prog;
      struct blob_reader reader;

      blob_reader_init(&reader, hdr->blob, hdr->num_bytes);
      nir = nir_deserialize(NULL, options, &reader);

      ir3_finalize_nir(compiler, nir);
   } else {
      assert(cso->ir_type == PIPE_SHADER_IR_TGSI);
      if (ir3_shader_debug & IR3_DBG_DISASM) {
         tgsi_dump(cso->prog, 0);
      }
      nir = tgsi_to_nir(cso->prog, pctx->screen, false);
   }

   struct ir3_shader *shader =
      ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){
                             /* TODO: force to single on a6xx with legacy
                              * ballot extension that uses 64-bit masks
                              */
                             .api_wavesize = IR3_SINGLE_OR_DOUBLE,
                             .real_wavesize = IR3_SINGLE_OR_DOUBLE,
                          }, NULL);
   shader->cs.req_input_mem = align(cso->req_input_mem, 4) / 4; /* byte->dword */
   shader->cs.req_local_mem = cso->req_local_mem;

   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));

   util_queue_fence_init(&hwcso->ready);
   hwcso->shader = shader;

   /* Immediately compile a standard variant. We have so few variants in our
    * shaders, that doing so almost eliminates draw-time recompiles.
    * (This is also how we get data from shader-db's ./run)
    */

   if (initial_variants_synchronous(ctx)) {
      static struct ir3_shader_key key; /* static is implicitly zeroed */
      ir3_shader_variant(shader, key, false, &ctx->debug);
      shader->initial_variants_done = true;
   } else {
      struct fd_screen *screen = ctx->screen;
      util_queue_add_job(&screen->compile_queue, hwcso, &hwcso->ready,
                         create_initial_compute_variants_async, NULL, 0);
   }

   return hwcso;
}

void *
ir3_shader_state_create(struct pipe_context *pctx,
                        const struct pipe_shader_state *cso)
{
   struct fd_context *ctx = fd_context(pctx);
   struct ir3_compiler *compiler = ctx->screen->compiler;
   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));

   /*
    * Convert to nir (if necessary):
    */

   nir_shader *nir;
   if (cso->type == PIPE_SHADER_IR_NIR) {
      /* we take ownership of the reference: */
      nir = cso->ir.nir;
   } else {
      assert(cso->type == PIPE_SHADER_IR_TGSI);
      if (ir3_shader_debug & IR3_DBG_DISASM) {
         tgsi_dump(cso->tokens, 0);
      }
      nir = tgsi_to_nir(cso->tokens, pctx->screen, false);
   }

   /*
    * Create ir3_shader:
    *
    * This part is cheap, it doesn't compile initial variants
    */

   struct ir3_stream_output_info stream_output = {};
   copy_stream_out(&stream_output, &cso->stream_output);

   hwcso->shader =
      ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){
                             /* TODO: force to single on a6xx with legacy
                              * ballot extension that uses 64-bit masks
                              */
                             .api_wavesize = IR3_SINGLE_OR_DOUBLE,
                             .real_wavesize = IR3_SINGLE_OR_DOUBLE,
                          },
                          &stream_output);

   /*
    * Create initial variants to avoid draw-time stalls. This is
    * normally done asynchronously, unless debug is enabled (which
    * will be the case for shader-db)
    */

   util_queue_fence_init(&hwcso->ready);

   if (initial_variants_synchronous(ctx)) {
      create_initial_variants(hwcso, &ctx->debug);
   } else {
      util_queue_add_job(&ctx->screen->compile_queue, hwcso, &hwcso->ready,
                         create_initial_variants_async, NULL, 0);
   }

   return hwcso;
}

void
ir3_shader_state_delete(struct pipe_context *pctx, void *_hwcso)
{
   struct fd_context *ctx = fd_context(pctx);
   struct fd_screen *screen = ctx->screen;
   struct ir3_shader_state *hwcso = _hwcso;
   struct ir3_shader *so = hwcso->shader;

   ir3_cache_invalidate(ctx->shader_cache, hwcso);

   /* util_queue_drop_job() guarantees that either:
    *  1) job did not execute
    *  2) job completed
    *
    * In either case the fence is signaled
    */
   util_queue_drop_job(&screen->compile_queue, &hwcso->ready);

   /* free the uploaded shaders, since this is handled outside of the
    * shared ir3 code (ie. not used by turnip):
    */
   for (struct ir3_shader_variant *v = so->variants; v; v = v->next) {
      fd_bo_del(v->bo);
      v->bo = NULL;

      if (v->binning && v->binning->bo) {
         fd_bo_del(v->binning->bo);
         v->binning->bo = NULL;
      }
   }

   ir3_shader_destroy(so);
   util_queue_fence_destroy(&hwcso->ready);
   free(hwcso);
}

struct ir3_shader *
ir3_get_shader(struct ir3_shader_state *hwcso)
{
   if (!hwcso)
      return NULL;

   struct ir3_shader *shader = hwcso->shader;
   perf_time (1000, "waited for %s:%s:%s variants",
              _mesa_shader_stage_to_abbrev(shader->type),
              shader->nir->info.name,
              shader->nir->info.label) {
      /* wait for initial variants to compile: */
      util_queue_fence_wait(&hwcso->ready);
   }

   return shader;
}

struct shader_info *
ir3_get_shader_info(struct ir3_shader_state *hwcso)
{
   if (!hwcso)
      return NULL;
   return &hwcso->shader->nir->info;
}

/* fixup dirty shader state in case some "unrelated" (from the state-
 * tracker's perspective) state change causes us to switch to a
 * different variant.
 */
void
ir3_fixup_shader_state(struct pipe_context *pctx, struct ir3_shader_key *key)
{
   struct fd_context *ctx = fd_context(pctx);

   if (!ir3_shader_key_equal(ctx->last.key, key)) {
      if (ir3_shader_key_changes_fs(ctx->last.key, key)) {
         fd_context_dirty_shader(ctx, PIPE_SHADER_FRAGMENT,
                                 FD_DIRTY_SHADER_PROG);
      }

      if (ir3_shader_key_changes_vs(ctx->last.key, key)) {
         fd_context_dirty_shader(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_PROG);
      }

      /* NOTE: currently only a6xx has gs/tess, but needs no
       * gs/tess specific lowering.
       */

      *ctx->last.key = *key;
   }
}

static char *
ir3_screen_finalize_nir(struct pipe_screen *pscreen, void *nir)
{
   struct fd_screen *screen = fd_screen(pscreen);

   ir3_nir_lower_io_to_temporaries(nir);
   ir3_finalize_nir(screen->compiler, nir);

   return NULL;
}

static void
ir3_set_max_shader_compiler_threads(struct pipe_screen *pscreen,
                                    unsigned max_threads)
{
   struct fd_screen *screen = fd_screen(pscreen);

   /* This function doesn't allow a greater number of threads than
    * the queue had at its creation.
    */
   util_queue_adjust_num_threads(&screen->compile_queue, max_threads);
}

static bool
ir3_is_parallel_shader_compilation_finished(struct pipe_screen *pscreen,
                                            void *shader,
                                            enum pipe_shader_type shader_type)
{
   struct ir3_shader_state *hwcso = (struct ir3_shader_state *)shader;

   return util_queue_fence_is_signalled(&hwcso->ready);
}

void
ir3_prog_init(struct pipe_context *pctx)
{
   pctx->create_vs_state = ir3_shader_state_create;
   pctx->delete_vs_state = ir3_shader_state_delete;

   pctx->create_tcs_state = ir3_shader_state_create;
   pctx->delete_tcs_state = ir3_shader_state_delete;

   pctx->create_tes_state = ir3_shader_state_create;
   pctx->delete_tes_state = ir3_shader_state_delete;

   pctx->create_gs_state = ir3_shader_state_create;
   pctx->delete_gs_state = ir3_shader_state_delete;

   pctx->create_fs_state = ir3_shader_state_create;
   pctx->delete_fs_state = ir3_shader_state_delete;
}

void
ir3_screen_init(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);

   screen->compiler = ir3_compiler_create(screen->dev, screen->dev_id,
                                          &(struct ir3_compiler_options) {});

   /* TODO do we want to limit things to # of fast cores, or just limit
    * based on total # of both big and little cores. The little cores
    * tend to be in-order and probably much slower for compiling than
    * big cores. OTOH if they are sitting idle, maybe it is useful to
    * use them?
    */
   unsigned num_threads = sysconf(_SC_NPROCESSORS_ONLN) - 1;

   /* Create at least one thread - even on single core CPU systems. */
   num_threads = MAX2(1, num_threads);

   util_queue_init(&screen->compile_queue, "ir3q", 64, num_threads,
                   UTIL_QUEUE_INIT_RESIZE_IF_FULL |
                      UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL);

   pscreen->finalize_nir = ir3_screen_finalize_nir;
   pscreen->set_max_shader_compiler_threads =
      ir3_set_max_shader_compiler_threads;
   pscreen->is_parallel_shader_compilation_finished =
      ir3_is_parallel_shader_compilation_finished;
}

void
ir3_screen_fini(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);

   util_queue_destroy(&screen->compile_queue);
   ir3_compiler_destroy(screen->compiler);
   screen->compiler = NULL;
}

void
ir3_update_max_tf_vtx(struct fd_context *ctx,
                      const struct ir3_shader_variant *v)
{
   struct fd_streamout_stateobj *so = &ctx->streamout;
   const struct ir3_stream_output_info *info = &v->stream_output;
   uint32_t maxvtxcnt = 0x7fffffff;

   if (v->stream_output.num_outputs == 0)
      maxvtxcnt = 0;
   if (so->num_targets == 0)
      maxvtxcnt = 0;

   /* offset to write to is:
    *
    *   total_vtxcnt = vtxcnt + offsets[i]
    *   offset = total_vtxcnt * stride[i]
    *
    *   offset =  vtxcnt * stride[i]       ; calculated in shader
    *           + offsets[i] * stride[i]   ; calculated at emit_tfbos()
    *
    * assuming for each vtx, each target buffer will have data written
    * up to 'offset + stride[i]', that leaves maxvtxcnt as:
    *
    *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
    *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
    *
    * but shader is actually doing a less-than (rather than less-than-
    * equal) check, so we can drop the -stride[i].
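    *
    * For example, with stride[i] of 4 dwords (16 bytes), offsets[i] == 0
    * and a 64 byte buffer, maxvtxcnt = 64 / 16 = 4: the shader emits
    * vertices only while vtxcnt < 4, so the last write ends exactly at
    * byte 64.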
    *
    * TODO is assumption about `offset + stride[i]` legit?
    */
   for (unsigned i = 0; i < so->num_targets; i++) {
      struct pipe_stream_output_target *target = so->targets[i];
      unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */
      if (target) {
         uint32_t max = target->buffer_size / stride;
         maxvtxcnt = MIN2(maxvtxcnt, max);
      }
   }

   ctx->streamout.max_tf_vtx = maxvtxcnt;
}