/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"

/* This pass provides a way to move computations that are always the same for
 * an entire draw/compute dispatch into a "preamble" that runs before the main
 * entrypoint.
 *
 * We also expose a separate API to get or construct the preamble of a shader
 * in case backends want to insert their own code.
 */

nir_function_impl *
nir_shader_get_preamble(nir_shader *shader)
{
   nir_function_impl *entrypoint = nir_shader_get_entrypoint(shader);
   if (entrypoint->preamble) {
      return entrypoint->preamble->impl;
   } else {
      nir_function *preamble = nir_function_create(shader, "@preamble");
      preamble->is_preamble = true;
      nir_function_impl *impl = nir_function_impl_create(preamble);
      entrypoint->preamble = preamble;
      return impl;
   }
}
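/* Usage sketch (illustrative, not part of this file; the "my_*" callbacks
 * are hypothetical backend functions): a backend with, say, 64 slots of
 * preamble storage might drive the pass roughly like this, using only the
 * option fields consumed below:
 *
 *    const nir_opt_preamble_options preamble_options = {
 *       .drawid_uniform = true,
 *       .subgroup_size_uniform = true,
 *       .def_size = my_def_size,
 *       .preamble_storage_size = 64,
 *       .instr_cost_cb = my_instr_cost,
 *       .avoid_instr_cb = my_avoid_instr,
 *       .rewrite_cost_cb = my_rewrite_cost,
 *       .cb_data = NULL,
 *    };
 *
 *    unsigned size;
 *    if (nir_opt_preamble(shader, &preamble_options, &size)) {
 *       // "size" is the amount of preamble storage actually used; the
 *       // backend then lowers load_preamble/store_preamble accordingly.
 *    }
 */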
typedef struct {
   bool can_move;
   bool candidate;
   bool must_stay;
   bool replace;

   /* Number of can_move uses, counted in step 2 */
   unsigned can_move_users;

   /* Storage size/alignment reported by the def_size callback */
   unsigned size, align;

   /* Assigned offset in preamble storage */
   unsigned offset;

   /* Average the cost of a value among its users, to try to account for
    * values that have multiple can_move uses.
    */
   float value;

   /* Overall benefit, i.e. the value minus any cost of inserting
    * load_preamble.
    */
   float benefit;
} def_state;

typedef struct {
   /* Per-definition array of states */
   def_state *states;

   /* The def currently being processed by update_src_value */
   nir_ssa_def *def;

   const nir_opt_preamble_options *options;
} opt_preamble_ctx;

static float
get_instr_cost(nir_instr *instr, const nir_opt_preamble_options *options)
{
   /* No backend will want to hoist load_const or undef by itself, so handle
    * this for them.
    */
   if (instr->type == nir_instr_type_load_const ||
       instr->type == nir_instr_type_ssa_undef)
      return 0;

   return options->instr_cost_cb(instr, options->cb_data);
}
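/* As a concrete illustration (hypothetical backend; the signature is assumed
 * to match the instr_cost_cb call above), a minimal cost callback might
 * weight everything uniformly except texture ops:
 *
 *    static float
 *    my_instr_cost(nir_instr *instr, const void *data)
 *    {
 *       return instr->type == nir_instr_type_tex ? 5.0f : 1.0f;
 *    }
 *
 * Returning a larger cost for expensive instructions makes them more
 * attractive to hoist into the preamble.
 */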
static bool
can_move_src(nir_src *src, void *state)
{
   opt_preamble_ctx *ctx = state;

   assert(src->is_ssa);
   return ctx->states[src->ssa->index].can_move;
}

static bool
can_move_srcs(nir_instr *instr, opt_preamble_ctx *ctx)
{
   return nir_foreach_src(instr, can_move_src, ctx);
}

static bool
can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx)
{
   switch (instr->intrinsic) {
   /* Intrinsics which can always be moved */
   case nir_intrinsic_load_push_constant:
   case nir_intrinsic_load_work_dim:
   case nir_intrinsic_load_num_workgroups:
   case nir_intrinsic_load_workgroup_size:
   case nir_intrinsic_load_ray_launch_size:
   case nir_intrinsic_load_ray_launch_size_addr_amd:
   case nir_intrinsic_load_sbt_base_amd:
   case nir_intrinsic_load_is_indexed_draw:
   case nir_intrinsic_load_viewport_scale:
   case nir_intrinsic_load_user_clip_plane:
   case nir_intrinsic_load_viewport_x_scale:
   case nir_intrinsic_load_viewport_y_scale:
   case nir_intrinsic_load_viewport_z_scale:
   case nir_intrinsic_load_viewport_offset:
   case nir_intrinsic_load_viewport_x_offset:
   case nir_intrinsic_load_viewport_y_offset:
   case nir_intrinsic_load_viewport_z_offset:
   case nir_intrinsic_load_blend_const_color_a_float:
   case nir_intrinsic_load_blend_const_color_b_float:
   case nir_intrinsic_load_blend_const_color_g_float:
   case nir_intrinsic_load_blend_const_color_r_float:
   case nir_intrinsic_load_blend_const_color_rgba:
   case nir_intrinsic_load_blend_const_color_aaaa8888_unorm:
   case nir_intrinsic_load_blend_const_color_rgba8888_unorm:
   case nir_intrinsic_load_line_width:
   case nir_intrinsic_load_aa_line_width:
   case nir_intrinsic_load_fb_layers_v3d:
   case nir_intrinsic_load_tcs_num_patches_amd:
   case nir_intrinsic_load_sample_positions_pan:
   case nir_intrinsic_load_shader_query_enabled_amd:
   case nir_intrinsic_load_cull_front_face_enabled_amd:
   case nir_intrinsic_load_cull_back_face_enabled_amd:
   case nir_intrinsic_load_cull_ccw_amd:
   case nir_intrinsic_load_cull_small_primitives_enabled_amd:
   case nir_intrinsic_load_cull_any_enabled_amd:
   case nir_intrinsic_load_cull_small_prim_precision_amd:
      return true;

   /* Intrinsics which can be moved depending on hardware */
   case nir_intrinsic_load_base_instance:
   case nir_intrinsic_load_base_vertex:
   case nir_intrinsic_load_first_vertex:
   case nir_intrinsic_load_draw_id:
      return ctx->options->drawid_uniform;

   case nir_intrinsic_load_subgroup_size:
   case nir_intrinsic_load_num_subgroups:
      return ctx->options->subgroup_size_uniform;

   /* Intrinsics which can be moved if the sources can */
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_get_ubo_size:
   case nir_intrinsic_get_ssbo_size:
   case nir_intrinsic_ballot_bitfield_extract:
   case nir_intrinsic_ballot_find_lsb:
   case nir_intrinsic_ballot_find_msb:
   case nir_intrinsic_ballot_bit_count_reduce:
   case nir_intrinsic_load_deref:
   case nir_intrinsic_load_global_constant:
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_constant:
   case nir_intrinsic_load_sample_pos_from_id:
   case nir_intrinsic_load_kernel_input:
   case nir_intrinsic_load_buffer_amd:
   case nir_intrinsic_image_samples:
   case nir_intrinsic_image_deref_samples:
   case nir_intrinsic_bindless_image_samples:
   case nir_intrinsic_image_size:
   case nir_intrinsic_image_deref_size:
   case nir_intrinsic_bindless_image_size:
   case nir_intrinsic_vulkan_resource_index:
   case nir_intrinsic_vulkan_resource_reindex:
   case nir_intrinsic_load_vulkan_descriptor:
   case nir_intrinsic_quad_swizzle_amd:
   case nir_intrinsic_masked_swizzle_amd:
   case nir_intrinsic_load_ssbo_address:
   case nir_intrinsic_bindless_resource_ir3:
      return can_move_srcs(&instr->instr, ctx);

   /* Image/SSBO loads can be moved if they are CAN_REORDER and their
    * sources can be moved.
    */
   case nir_intrinsic_image_load:
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ssbo_ir3:
      return (nir_intrinsic_access(instr) & ACCESS_CAN_REORDER) &&
             can_move_srcs(&instr->instr, ctx);

   default:
      return false;
   }
}

static bool
can_move_instr(nir_instr *instr, opt_preamble_ctx *ctx)
{
   switch (instr->type) {
   case nir_instr_type_tex: {
      nir_tex_instr *tex = nir_instr_as_tex(instr);
      /* See note below about derivatives. We have special code to convert tex
       * to txd, though, because it's a common case.
       */
      if (nir_tex_instr_has_implicit_derivative(tex) &&
          tex->op != nir_texop_tex) {
         return false;
      }
      return can_move_srcs(instr, ctx);
   }
   case nir_instr_type_alu: {
      /* The preamble is presumably run with only one thread, so we can't run
       * derivatives in it.
       * TODO: Replace derivatives with 0 instead, if real apps hit this.
       */
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      switch (alu->op) {
      case nir_op_fddx:
      case nir_op_fddy:
      case nir_op_fddx_fine:
      case nir_op_fddy_fine:
      case nir_op_fddx_coarse:
      case nir_op_fddy_coarse:
         return false;
      default:
         return can_move_srcs(instr, ctx);
      }
   }
   case nir_instr_type_intrinsic:
      return can_move_intrinsic(nir_instr_as_intrinsic(instr), ctx);

   case nir_instr_type_load_const:
   case nir_instr_type_ssa_undef:
      return true;

   case nir_instr_type_deref: {
      nir_deref_instr *deref = nir_instr_as_deref(instr);
      if (deref->deref_type == nir_deref_type_var) {
         switch (deref->modes) {
         case nir_var_uniform:
         case nir_var_mem_ubo:
            return true;
         default:
            return false;
         }
      } else {
         return can_move_srcs(instr, ctx);
      }
   }

   case nir_instr_type_phi:
      /* TODO: we could move an if-statement if everything inside it is
       * moveable.
       */
      return false;

   default:
      return false;
   }
}
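/* For example (illustrative IR, not from any real shader): in
 *
 *    %c = load_const (0x0)
 *    %u = load_ubo (%c, %c)
 *    %f = fadd %u, %in
 *
 * where %in is a varying input, %c and %u are can_move, but %f is not
 * because can_move_srcs() fails on its %in source. The whole movable
 * prefix of an expression tree gets marked this way.
 */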
/* True if we should avoid making this a candidate. This is only called on
 * instructions we already determined we can move; it just makes it so that
 * uses of this instruction cannot be rewritten. Typically this happens
 * because of static constraints on the IR, for example some deref chains
 * cannot be broken.
 */
static bool
avoid_instr(nir_instr *instr, const nir_opt_preamble_options *options)
{
   if (instr->type == nir_instr_type_deref)
      return true;

   return options->avoid_instr_cb(instr, options->cb_data);
}

static bool
update_src_value(nir_src *src, void *data)
{
   opt_preamble_ctx *ctx = data;

   def_state *state = &ctx->states[ctx->def->index];
   def_state *src_state = &ctx->states[src->ssa->index];

   assert(src_state->can_move);

   /* If an instruction has can_move and non-can_move users, it becomes a
    * candidate and its value shouldn't propagate downwards. For example,
    * imagine a chain like this:
    *
    *           -- F (cannot move)
    *          /
    *  A <-- B <-- C <-- D <-- E (cannot move)
    *
    * B and D are marked candidates. Picking B removes A and B, picking D
    * removes C and D, and picking both removes all 4. Therefore B and D are
    * independent and B's value shouldn't flow into D.
    *
    * A similar argument holds for must_stay values.
    */
   if (!src_state->must_stay && !src_state->candidate)
      state->value += src_state->value;
   return true;
}

static int
candidate_sort(const void *data1, const void *data2)
{
   const def_state *state1 = *(def_state **)data1;
   const def_state *state2 = *(def_state **)data2;

   float value1 = state1->value / state1->size;
   float value2 = state2->value / state2->size;
   if (value1 < value2)
      return 1;
   else if (value1 > value2)
      return -1;
   else
      return 0;
}
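/* Worked example of candidate_sort() above (illustrative numbers): a
 * candidate with value 4.0 occupying 1 slot scores 4.0 per slot and sorts
 * ahead of one with value 6.0 occupying 2 slots (3.0 per slot), even though
 * the latter has more absolute value. qsort() therefore produces a
 * descending value-density order.
 */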
bool
nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options,
                 unsigned *size)
{
   opt_preamble_ctx ctx = {
      .options = options,
   };

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   ctx.states = calloc(impl->ssa_alloc, sizeof(*ctx.states));

   /* Step 1: Calculate can_move */
   nir_foreach_block (block, impl) {
      nir_foreach_instr (instr, block) {
         nir_ssa_def *def = nir_instr_ssa_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];

         state->can_move = can_move_instr(instr, &ctx);
      }
   }

   /* Step 2: Calculate is_candidate. This is complicated by the presence of
    * non-candidate instructions like derefs whose users cannot be rewritten.
    * If a deref chain is used at all by a non-can_move thing, then any offset
    * sources anywhere along the chain should be considered candidates because
    * the entire deref chain will never be deleted, but if it's only used by
    * can_move things then it becomes subsumed by its users and none of the
    * offset sources should be considered candidates as they will be removed
    * when the users of the deref chain are moved. We need to replace "are
    * there any non-can_move users" with "are there any non-can_move users,
    * *recursing through non-candidate users*". We do this by walking backward
    * and marking when a non-candidate instruction must stay in the final
    * program because it has a non-can_move user, including recursively.
    */
   unsigned num_candidates = 0;
   nir_foreach_block_reverse (block, impl) {
      nir_foreach_instr_reverse (instr, block) {
         nir_ssa_def *def = nir_instr_ssa_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->can_move)
            continue;

         state->value = get_instr_cost(instr, options);
         bool is_candidate = !avoid_instr(instr, options);
         state->candidate = false;
         state->must_stay = false;
         nir_foreach_use (use, def) {
            nir_ssa_def *use_def = nir_instr_ssa_def(use->parent_instr);
            if (!use_def || !ctx.states[use_def->index].can_move ||
                ctx.states[use_def->index].must_stay) {
               if (is_candidate)
                  state->candidate = true;
               else
                  state->must_stay = true;
            } else {
               state->can_move_users++;
            }
         }

         nir_foreach_if_use (use, def) {
            if (is_candidate)
               state->candidate = true;
            else
               state->must_stay = true;
            break;
         }

         if (state->candidate)
            num_candidates++;
      }
   }

   if (num_candidates == 0) {
      *size = 0;
      free(ctx.states);
      return false;
   }

   def_state **candidates = malloc(sizeof(*candidates) * num_candidates);
   unsigned candidate_idx = 0;
   unsigned total_size = 0;

   /* Step 3: Calculate value of candidates by propagating downwards. We try
    * to share the value amongst can_move uses, in case there are multiple.
    * This won't always find the most optimal solution, but is hopefully a
    * good heuristic.
    *
    * Note that we use the can_move_users count adjusted in the last pass,
    * because if a can_move instruction cannot actually be moved (it's not a
    * candidate and it has a non-can_move user, so it must stay) then we
    * don't want to count it as a use.
    *
    * While we're here, also collect an array of candidates.
    */
   nir_foreach_block (block, impl) {
      nir_foreach_instr (instr, block) {
         nir_ssa_def *def = nir_instr_ssa_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->can_move || state->must_stay)
            continue;

         ctx.def = def;
         nir_foreach_src(instr, update_src_value, &ctx);

         /* If this instruction is a candidate, its value shouldn't be
          * propagated so we skip dividing it.
          *
          * Note: if it's can_move but not a candidate, then all its users
          * must be can_move, so if there are no users then it must be dead.
          */
         if (!state->candidate && !state->must_stay) {
            if (state->can_move_users > 0)
               state->value /= state->can_move_users;
            else
               state->value = 0;
         }

         if (state->candidate) {
            state->benefit = state->value -
               options->rewrite_cost_cb(def, options->cb_data);

            if (state->benefit > 0) {
               options->def_size(def, &state->size, &state->align);
               total_size = ALIGN_POT(total_size, state->align);
               total_size += state->size;
               candidates[candidate_idx++] = state;
            }
         }
      }
   }

   assert(candidate_idx <= num_candidates);
   num_candidates = candidate_idx;

   if (num_candidates == 0) {
      *size = 0;
      free(ctx.states);
      free(candidates);
      return false;
   }

   /* Step 4: Figure out which candidates we're going to replace and assign an
    * offset. Assuming there is no expression sharing, this is similar to the
    * 0-1 knapsack problem, except when there is a gap introduced by
    * alignment. We use a well-known greedy approximation, sorting by value
    * divided by size.
    */
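   /* For example (illustrative numbers): with preamble_storage_size = 3 and
    * sorted candidates A (size 2), B (size 2), C (size 1), the loop below
    * places A at offset 0, then B fails the bounds check at offset 2 and the
    * loop stops, so C is never placed even though it would still fit. The
    * greedy cut is deliberately simple rather than optimal.
    */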
   if (total_size > options->preamble_storage_size) {
      qsort(candidates, num_candidates, sizeof(*candidates), candidate_sort);
   }

   unsigned offset = 0;
   for (unsigned i = 0; i < num_candidates; i++) {
      def_state *state = candidates[i];
      offset = ALIGN_POT(offset, state->align);

      if (offset + state->size > options->preamble_storage_size)
         break;

      state->replace = true;
      state->offset = offset;

      offset += state->size;
   }

   *size = offset;

   free(candidates);

   /* Step 5: Actually do the replacement. */
   struct hash_table *remap_table =
      _mesa_pointer_hash_table_create(NULL);
   nir_function_impl *preamble =
      nir_shader_get_preamble(impl->function->shader);
   nir_builder _b;
   nir_builder *b = &_b;
   nir_builder_init(b, preamble);
   b->cursor = nir_before_cf_list(&preamble->body);

   nir_foreach_block (block, impl) {
      nir_foreach_instr (instr, block) {
         nir_ssa_def *def = nir_instr_ssa_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->can_move)
            continue;

         nir_instr *clone = nir_instr_clone_deep(impl->function->shader,
                                                 instr, remap_table);

         nir_builder_instr_insert(b, clone);

         if (clone->type == nir_instr_type_tex) {
            nir_tex_instr *tex = nir_instr_as_tex(clone);
            if (tex->op == nir_texop_tex) {
               /* For maximum compatibility, replace plain tex with
                * textureGrad (txd) with a gradient of 0.
                * TODO: Handle txb somehow.
                */
               b->cursor = nir_before_instr(clone);

               nir_ssa_def *zero =
                  nir_imm_zero(b, tex->coord_components - tex->is_array, 32);
               nir_tex_instr_add_src(tex, nir_tex_src_ddx, nir_src_for_ssa(zero));
               nir_tex_instr_add_src(tex, nir_tex_src_ddy, nir_src_for_ssa(zero));
               tex->op = nir_texop_txd;

               b->cursor = nir_after_instr(clone);
            }
         }

         if (state->replace) {
            nir_ssa_def *clone_def = nir_instr_ssa_def(clone);
            nir_store_preamble(b, clone_def, .base = state->offset);
         }
      }
   }

   nir_builder_init(b, impl);

   nir_foreach_block (block, impl) {
      nir_foreach_instr_safe (instr, block) {
         nir_ssa_def *def = nir_instr_ssa_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->replace)
            continue;

         b->cursor = nir_before_instr(instr);

         nir_ssa_def *new_def =
            nir_load_preamble(b, def->num_components, def->bit_size,
                              .base = state->offset);

         nir_ssa_def_rewrite_uses(def, new_def);
         nir_instr_free_and_dce(instr);
      }
   }

   nir_metadata_preserve(impl,
                         nir_metadata_block_index |
                         nir_metadata_dominance);

   ralloc_free(remap_table);
   free(ctx.states);
   return true;
}