/*
 * Copyright © 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler/nir/nir_builder.h"
#include "ir3_compiler.h"
#include "ir3_nir.h"

struct state {
   uint32_t topology;

   struct primitive_map {
      /* +POSITION, +PSIZE, ... - see shader_io_get_unique_index */
      unsigned loc[12 + 32];
      unsigned stride;
   } map;

   nir_ssa_def *header;

   nir_variable *vertex_count_var;
   nir_variable *emitted_vertex_var;
   nir_variable *vertex_flags_out;

   struct exec_list old_outputs;
   struct exec_list new_outputs;
   struct exec_list emit_outputs;

   /* tess ctrl shader on a650 gets the local primitive id at a different
    * bit position:
    */
   unsigned local_primitive_id_start;
};

static nir_ssa_def *
bitfield_extract(nir_builder *b, nir_ssa_def *v, uint32_t start, uint32_t mask)
{
   return nir_iand(b, nir_ushr(b, v, nir_imm_int(b, start)),
                   nir_imm_int(b, mask));
}

static nir_ssa_def *
build_invocation_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, 11, 31);
}

static nir_ssa_def *
build_vertex_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, 6, 31);
}

static nir_ssa_def *
build_local_primitive_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, state->local_primitive_id_start,
                           63);
}

static bool
is_tess_levels(gl_varying_slot slot)
{
   return (slot == VARYING_SLOT_PRIMITIVE_ID ||
           slot == VARYING_SLOT_TESS_LEVEL_OUTER ||
           slot == VARYING_SLOT_TESS_LEVEL_INNER);
}

/* Return a deterministic index for varyings. We can't rely on driver_location
 * to be correct without linking the different stages first, so we create
 * "primitive maps" where the producer decides on the location of each varying
 * slot and then exports a per-slot array to the consumer. This compacts the
 * gl_varying_slot space down a bit so that the primitive maps aren't too
 * large.
 *
 * Note: per-patch varyings are currently handled separately, without any
 * compacting.
 *
 * TODO: We could probably use the driver_locations directly in the non-SSO
 * (Vulkan) case.
 */
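
/* Worked example (illustrative): a VS that writes POS, PSIZ and VAR0 gets
 * unique indices 0, 1 and 12 from shader_io_get_unique_index() below, and
 * build_primitive_map() would then assign loc[0] = 0, loc[1] = 16 and
 * loc[12] = 32, with a stride of 48 bytes (12 dwords), since every interface
 * except TCS <-> TES packs one 16-byte vec4 per slot.
 */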

static unsigned
shader_io_get_unique_index(gl_varying_slot slot)
{
   switch (slot) {
   case VARYING_SLOT_POS: return 0;
   case VARYING_SLOT_PSIZ: return 1;
   case VARYING_SLOT_COL0: return 2;
   case VARYING_SLOT_COL1: return 3;
   case VARYING_SLOT_BFC0: return 4;
   case VARYING_SLOT_BFC1: return 5;
   case VARYING_SLOT_FOGC: return 6;
   case VARYING_SLOT_CLIP_DIST0: return 7;
   case VARYING_SLOT_CLIP_DIST1: return 8;
   case VARYING_SLOT_CLIP_VERTEX: return 9;
   case VARYING_SLOT_LAYER: return 10;
   case VARYING_SLOT_VIEWPORT: return 11;
   case VARYING_SLOT_VAR0 ... VARYING_SLOT_VAR31: {
      struct state state = {};
      STATIC_ASSERT(ARRAY_SIZE(state.map.loc) - 1 ==
                    (12 + VARYING_SLOT_VAR31 - VARYING_SLOT_VAR0));
      struct ir3_shader_variant v = {};
      STATIC_ASSERT(ARRAY_SIZE(v.output_loc) - 1 ==
                    (12 + VARYING_SLOT_VAR31 - VARYING_SLOT_VAR0));
      return 12 + (slot - VARYING_SLOT_VAR0);
   }
   default:
      unreachable("illegal slot in get unique index\n");
   }
}

static nir_ssa_def *
build_local_offset(nir_builder *b, struct state *state, nir_ssa_def *vertex,
                   uint32_t location, uint32_t comp, nir_ssa_def *offset)
{
   nir_ssa_def *primitive_stride = nir_load_vs_primitive_stride_ir3(b);
   nir_ssa_def *primitive_offset =
      nir_imul24(b, build_local_primitive_id(b, state), primitive_stride);
   nir_ssa_def *attr_offset;
   nir_ssa_def *vertex_stride;
   unsigned index = shader_io_get_unique_index(location);

   switch (b->shader->info.stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      vertex_stride = nir_imm_int(b, state->map.stride * 4);
      attr_offset = nir_imm_int(b, state->map.loc[index] + 4 * comp);
      break;
   case MESA_SHADER_TESS_CTRL:
   case MESA_SHADER_GEOMETRY:
      vertex_stride = nir_load_vs_vertex_stride_ir3(b);
      attr_offset = nir_iadd(b, nir_load_primitive_location_ir3(b, index),
                             nir_imm_int(b, comp * 4));
      break;
   default:
      unreachable("bad shader stage");
   }

   nir_ssa_def *vertex_offset = nir_imul24(b, vertex, vertex_stride);

   return nir_iadd(
      b, nir_iadd(b, primitive_offset, vertex_offset),
      nir_iadd(b, attr_offset, nir_ishl(b, offset, nir_imm_int(b, 4))));
}
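
/* Put differently, build_local_offset() above produces a byte offset of
 *
 *    local_primitive_id * primitive_stride +
 *    vertex * vertex_stride +
 *    attr_offset + (offset << 4)
 *
 * where "offset" is the indirect vec4-slot offset from the original I/O
 * intrinsic (hence the shift by 4 to convert it to bytes).
 */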

static nir_intrinsic_instr *
replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
                  nir_intrinsic_op op, nir_ssa_def *src0, nir_ssa_def *src1,
                  nir_ssa_def *src2)
{
   nir_intrinsic_instr *new_intr = nir_intrinsic_instr_create(b->shader, op);

   new_intr->src[0] = nir_src_for_ssa(src0);
   if (src1)
      new_intr->src[1] = nir_src_for_ssa(src1);
   if (src2)
      new_intr->src[2] = nir_src_for_ssa(src2);

   new_intr->num_components = intr->num_components;

   if (nir_intrinsic_infos[op].has_dest)
      nir_ssa_dest_init(&new_intr->instr, &new_intr->dest, intr->num_components,
                        intr->dest.ssa.bit_size, NULL);

   nir_builder_instr_insert(b, &new_intr->instr);

   if (nir_intrinsic_infos[op].has_dest)
      nir_ssa_def_rewrite_uses(&intr->dest.ssa, &new_intr->dest.ssa);

   nir_instr_remove(&intr->instr);

   return new_intr;
}

static void
build_primitive_map(nir_shader *shader, struct primitive_map *map)
{
   /* All interfaces except the TCS <-> TES interface use ldlw, which takes
    * an offset in bytes, so each vec4 slot is 16 bytes. TCS <-> TES uses
    * ldg, which takes an offset in dwords, but each per-vertex slot has
    * space for every vertex, and there's space at the beginning for
    * per-patch varyings.
    */
   unsigned slot_size = 16, start = 0;
   if (shader->info.stage == MESA_SHADER_TESS_CTRL) {
      slot_size = shader->info.tess.tcs_vertices_out * 4;
      start = util_last_bit(shader->info.patch_outputs_written) * 4;
   }

   uint64_t mask = shader->info.outputs_written;
   unsigned loc = start;
   while (mask) {
      int location = u_bit_scan64(&mask);
      if (is_tess_levels(location))
         continue;

      unsigned index = shader_io_get_unique_index(location);
      map->loc[index] = loc;
      loc += slot_size;
   }

   map->stride = loc;
   /* Use units of dwords for the stride. */
   if (shader->info.stage != MESA_SHADER_TESS_CTRL)
      map->stride /= 4;
}
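
/* Worked example (illustrative): a TCS with tcs_vertices_out = 3 whose
 * highest written per-patch output is slot 1 gets start = 8 dwords and a
 * per-vertex slot size of 12 dwords, so two per-vertex outputs would land at
 * loc = 8 and loc = 20 with map->stride = 32 dwords per patch.
 */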

/* For shader stages that receive a primitive map, calculate how big it should
 * be.
 */

static unsigned
calc_primitive_map_size(nir_shader *shader)
{
   uint64_t mask = shader->info.inputs_read;
   unsigned max_index = 0;
   while (mask) {
      int location = u_bit_scan64(&mask);

      if (is_tess_levels(location))
         continue;

      unsigned index = shader_io_get_unique_index(location);
      max_index = MAX2(max_index, index + 1);
   }

   return max_index;
}

static void
lower_block_to_explicit_output(nir_block *block, nir_builder *b,
                               struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* nir_lower_io_to_temporaries replaces all access to output
          * variables with temp variables and then emits a nir_copy_var at
          * the end of the shader. Thus, we should always get a full wrmask
          * here.
          */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         b->cursor = nir_instr_remove(&intr->instr);

         nir_ssa_def *vertex_id = build_vertex_id(b, state);
         nir_ssa_def *offset = build_local_offset(
            b, state, vertex_id, nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         nir_store_shared_ir3(b, intr->src[0].ssa, offset);
         break;
      }

      default:
         break;
      }
   }
}

static nir_ssa_def *
local_thread_id(nir_builder *b)
{
   return bitfield_extract(b, nir_load_gs_header_ir3(b), 16, 1023);
}
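
/* As encoded by the shifts and masks in the bitfield_extract() calls above,
 * the packed tcs/gs header holds the local primitive id in 6 bits (at bit 0,
 * or at bit 16 when local_primitive_id_start is 16), the vertex id in 5 bits
 * at bit 6 and the invocation id in 5 bits at bit 11, while the gs header
 * also carries a 10-bit local thread id at bit 16.
 */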

void
ir3_nir_lower_to_explicit_output(nir_shader *shader,
                                 struct ir3_shader_variant *v,
                                 unsigned topology)
{
   struct state state = {};

   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   if (v->type == MESA_SHADER_VERTEX && topology != IR3_TESS_NONE)
      state.header = nir_load_tcs_header_ir3(&b);
   else
      state.header = nir_load_gs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_output(block, &b, &state);

   nir_metadata_preserve(impl,
                         nir_metadata_block_index | nir_metadata_dominance);

   v->output_size = state.map.stride;
}

static void
lower_block_to_explicit_input(nir_block *block, nir_builder *b,
                              struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *offset = build_local_offset(
            b, state,
            intr->src[0].ssa, // this is typically gl_InvocationID
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL,
                           NULL);
         break;
      }

      case nir_intrinsic_load_invocation_id: {
         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *iid = build_invocation_id(b, state);
         nir_ssa_def_rewrite_uses(&intr->dest.ssa, iid);
         nir_instr_remove(&intr->instr);
         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_to_explicit_input(nir_shader *shader,
                                struct ir3_shader_variant *v)
{
   struct state state = {};

   /* when using stl/ldl (instead of stlw/ldlw) for linking VS and HS,
    * HS uses a different primitive id, which starts at bit 16 in the header
    */
   if (shader->info.stage == MESA_SHADER_TESS_CTRL &&
       v->compiler->tess_use_shared)
      state.local_primitive_id_start = 16;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   if (shader->info.stage == MESA_SHADER_GEOMETRY)
      state.header = nir_load_gs_header_ir3(&b);
   else
      state.header = nir_load_tcs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_input(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);
}

static nir_ssa_def *
build_tcs_out_vertices(nir_builder *b)
{
   if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
      return nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
   else
      return nir_load_patch_vertices_in(b);
}
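
/* build_per_vertex_offset() below computes an offset in components (dwords)
 * into the tess param BO: rel_patch_id * hs_patch_stride selects the patch,
 * attr_offset selects the slot (with any indirect slot offset multiplied by
 * the TCS vertex count, since a per-vertex slot stores all vertices back to
 * back), and vertex * 4 selects the vec4 within the slot.  Per-patch
 * varyings pass vertex == NULL and index directly from VARYING_SLOT_PATCH0.
 */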

static nir_ssa_def *
build_per_vertex_offset(nir_builder *b, struct state *state,
                        nir_ssa_def *vertex, uint32_t location, uint32_t comp,
                        nir_ssa_def *offset)
{
   nir_ssa_def *patch_id = nir_load_rel_patch_id_ir3(b);
   nir_ssa_def *patch_stride = nir_load_hs_patch_stride_ir3(b);
   nir_ssa_def *patch_offset = nir_imul24(b, patch_id, patch_stride);
   nir_ssa_def *attr_offset;

   if (nir_src_is_const(nir_src_for_ssa(offset))) {
      location += nir_src_as_uint(nir_src_for_ssa(offset));
      offset = nir_imm_int(b, 0);
   } else {
      /* Offset is in vec4's, but we need it in unit of components for the
       * load/store_global_ir3 offset.
       */
      offset = nir_ishl(b, offset, nir_imm_int(b, 2));
   }

   nir_ssa_def *vertex_offset;
   if (vertex) {
      unsigned index = shader_io_get_unique_index(location);
      switch (b->shader->info.stage) {
      case MESA_SHADER_TESS_CTRL:
         attr_offset = nir_imm_int(b, state->map.loc[index] + comp);
         break;
      case MESA_SHADER_TESS_EVAL:
         attr_offset = nir_iadd(b, nir_load_primitive_location_ir3(b, index),
                                nir_imm_int(b, comp));
         break;
      default:
         unreachable("bad shader stage");
      }

      attr_offset = nir_iadd(b, attr_offset,
                             nir_imul24(b, offset, build_tcs_out_vertices(b)));
      vertex_offset = nir_ishl(b, vertex, nir_imm_int(b, 2));
   } else {
      assert(location >= VARYING_SLOT_PATCH0 &&
             location <= VARYING_SLOT_TESS_MAX);
      unsigned index = location - VARYING_SLOT_PATCH0;
      attr_offset = nir_iadd(b, nir_imm_int(b, index * 4 + comp), offset);
      vertex_offset = nir_imm_int(b, 0);
   }

   return nir_iadd(b, nir_iadd(b, patch_offset, attr_offset), vertex_offset);
}

static nir_ssa_def *
build_patch_offset(nir_builder *b, struct state *state, uint32_t base,
                   uint32_t comp, nir_ssa_def *offset)
{
   return build_per_vertex_offset(b, state, NULL, base, comp, offset);
}

static void
tess_level_components(struct state *state, uint32_t *inner, uint32_t *outer)
{
   switch (state->topology) {
   case IR3_TESS_TRIANGLES:
      *inner = 1;
      *outer = 3;
      break;
   case IR3_TESS_QUADS:
      *inner = 2;
      *outer = 4;
      break;
   case IR3_TESS_ISOLINES:
      *inner = 0;
      *outer = 2;
      break;
   default:
      unreachable("bad");
   }
}

static nir_ssa_def *
build_tessfactor_base(nir_builder *b, gl_varying_slot slot, uint32_t comp,
                      struct state *state)
{
   uint32_t inner_levels, outer_levels;
   tess_level_components(state, &inner_levels, &outer_levels);

   const uint32_t patch_stride = 1 + inner_levels + outer_levels;

   nir_ssa_def *patch_id = nir_load_rel_patch_id_ir3(b);

   nir_ssa_def *patch_offset =
      nir_imul24(b, patch_id, nir_imm_int(b, patch_stride));

   uint32_t offset;
   switch (slot) {
   case VARYING_SLOT_PRIMITIVE_ID:
      offset = 0;
      break;
   case VARYING_SLOT_TESS_LEVEL_OUTER:
      offset = 1;
      break;
   case VARYING_SLOT_TESS_LEVEL_INNER:
      offset = 1 + outer_levels;
      break;
   default:
      unreachable("bad");
   }

   return nir_iadd(b, patch_offset, nir_imm_int(b, offset + comp));
}
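
/* Per patch, the tess factor BO is thus laid out (as computed by
 * build_tessfactor_base() above) as one dword of primitive id, then the
 * outer levels, then the inner levels: e.g. 5 dwords per patch for
 * triangles (primitive id, outer[0..2], inner[0]), matching the
 * patch_stride of 1 + inner_levels + outer_levels.
 */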

static void
lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_output: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
         nir_ssa_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_per_vertex_output: {
         // src[] = { value, vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         nir_ssa_def *value = intr->src[0].ssa;
         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
         nir_ssa_def *offset = build_per_vertex_offset(
            b, state, intr->src[1].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[2].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value,
                           address, offset);

         break;
      }

      case nir_intrinsic_load_output: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            assert(intr->dest.ssa.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* write patch output to bo */

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            uint32_t inner_levels, outer_levels, levels;
            tess_level_components(state, &inner_levels, &outer_levels);

            assert(intr->src[0].ssa->num_components == 1);

            nir_if *nif = NULL;
            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               /* The tess levels are defined as float[4] and float[2],
                * but the tess factor BO has smaller sizes for tris/isolines,
                * so we have to discard any writes beyond the number of
                * components for inner/outer levels.
                */
               if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
                  levels = outer_levels;
               else
                  levels = inner_levels;

               nir_ssa_def *offset = nir_iadd_imm(
                  b, intr->src[1].ssa, nir_intrinsic_component(intr));
               nif = nir_push_if(b, nir_ult(b, offset, nir_imm_int(b, levels)));
            }

            nir_ssa_def *offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa,
                              nir_load_tess_factor_base_ir3(b),
                              nir_iadd(b, intr->src[1].ssa, offset));

            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               nir_pop_if(b, nif);
            }
         } else {
            nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
            nir_ssa_def *offset = build_patch_offset(
               b, state, location, nir_intrinsic_component(intr),
               intr->src[1].ssa);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa, address, offset);
         }
         break;
      }

      default:
         break;
      }
   }
}

static void
emit_tess_epilogue(nir_builder *b, struct state *state)
{
   /* Insert endpatch instruction:
    *
    * TODO we should re-work this to use normal flow control.
    */

   nir_end_patch_ir3(b);
}
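
/* ir3_nir_lower_tess_ctrl() below rewrites all TCS I/O into
 * load_global_ir3/store_global_ir3 access to the tess param and tess factor
 * BOs, wraps the original body in an "if (gl_InvocationID < tcs_vertices_out)"
 * block, and ends the patch from the invocation-0 thread only.
 */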

void
ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
   v->output_size = state.map.stride;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   state.header = nir_load_tcs_header_ir3(&b);

   /* If required, store gl_PrimitiveID. */
   if (v->key.tcs_store_primid) {
      b.cursor = nir_after_cf_list(&impl->body);

      nir_store_output(&b, nir_load_primitive_id(&b), nir_imm_int(&b, 0),
                       .io_semantics = {
                          .location = VARYING_SLOT_PRIMITIVE_ID,
                          .num_slots = 1
                       });

      b.cursor = nir_before_cf_list(&impl->body);
   }

   nir_foreach_block_safe (block, impl)
      lower_tess_ctrl_block(block, &b, &state);

   /* Now move the body of the TCS into a conditional:
    *
    *   if (gl_InvocationID < num_vertices)
    *     // body
    *
    */

   nir_cf_list body;
   nir_cf_extract(&body, nir_before_cf_list(&impl->body),
                  nir_after_cf_list(&impl->body));

   b.cursor = nir_after_cf_list(&impl->body);

   /* Re-emit the header, since the old one got moved into the if branch */
   state.header = nir_load_tcs_header_ir3(&b);
   nir_ssa_def *iid = build_invocation_id(&b, &state);

   const uint32_t nvertices = shader->info.tess.tcs_vertices_out;
   nir_ssa_def *cond = nir_ult(&b, iid, nir_imm_int(&b, nvertices));

   nir_if *nif = nir_push_if(&b, cond);

   nir_cf_reinsert(&body, b.cursor);

   b.cursor = nir_after_cf_list(&nif->then_list);

   /* Insert conditional exit for threads with invocation id != 0 */
   nir_ssa_def *iid0_cond = nir_ieq_imm(&b, iid, 0);
   nir_cond_end_ir3(&b, iid0_cond);

   emit_tess_epilogue(&b, &state);

   nir_pop_if(&b, nif);

   nir_metadata_preserve(impl, nir_metadata_none);
}
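
/* TES lowering: per-vertex and per-patch inputs become load_global_ir3 from
 * the tess param BO (tess levels from the tess factor BO), and for triangles
 * gl_TessCoord.z is reconstructed as 1.0 - x - y, the third barycentric
 * coordinate.
 */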

static void
lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_tess_coord: {
         b->cursor = nir_after_instr(&intr->instr);
         nir_ssa_def *x = nir_channel(b, &intr->dest.ssa, 0);
         nir_ssa_def *y = nir_channel(b, &intr->dest.ssa, 1);
         nir_ssa_def *z;

         if (state->topology == IR3_TESS_TRIANGLES)
            z = nir_fsub(b, nir_fsub(b, nir_imm_float(b, 1.0f), y), x);
         else
            z = nir_imm_float(b, 0.0f);

         nir_ssa_def *coord = nir_vec3(b, x, y, z);

         nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, coord,
                                        b->cursor.instr);
         break;
      }

      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
         nir_ssa_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_load_input: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            assert(intr->dest.ssa.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);

   nir_foreach_block_safe (block, impl)
      lower_tess_eval_block(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);

   nir_metadata_preserve(impl, nir_metadata_none);
}

static void
copy_vars(nir_builder *b, struct exec_list *dests, struct exec_list *srcs)
{
   foreach_two_lists (dest_node, dests, src_node, srcs) {
      nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
      nir_variable *src = exec_node_data(nir_variable, src_node, node);
      nir_copy_var(b, dest, src);
   }
}
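
/* GS lowering: EmitVertex()/EndPrimitive() are turned into stores of shadow
 * output variables plus a vertex_flags output, and only the invocation whose
 * local thread id matches the running vertex count latches its outputs (see
 * the shadow-variable setup in ir3_nir_lower_gs() below).
 */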

static void
lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_end_primitive: {
         /* Note: This ignores the stream, which seems to match the blob
          * behavior. I'm guessing the HW ignores any extraneous cut
          * signals from an EndPrimitive() that doesn't correspond to the
          * rasterized stream.
          */
         b->cursor = nir_before_instr(&intr->instr);
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 4), 0x1);
         nir_instr_remove(&intr->instr);
         break;
      }

      case nir_intrinsic_emit_vertex: {
         /* Load the vertex count */
         b->cursor = nir_before_instr(&intr->instr);
         nir_ssa_def *count = nir_load_var(b, state->vertex_count_var);

         nir_push_if(b, nir_ieq(b, count, local_thread_id(b)));

         unsigned stream = nir_intrinsic_stream_id(intr);
         /* vertex_flags_out |= stream */
         nir_store_var(b, state->vertex_flags_out,
                       nir_ior(b, nir_load_var(b, state->vertex_flags_out),
                               nir_imm_int(b, stream)),
                       0x1 /* .x */);

         copy_vars(b, &state->emit_outputs, &state->old_outputs);

         nir_instr_remove(&intr->instr);

         nir_store_var(b, state->emitted_vertex_var,
                       nir_iadd(b, nir_load_var(b, state->emitted_vertex_var),
                                nir_imm_int(b, 1)),
                       0x1);

         nir_pop_if(b, NULL);

         /* Increment the vertex count by 1 */
         nir_store_var(b, state->vertex_count_var,
                       nir_iadd(b, count, nir_imm_int(b, 1)), 0x1); /* .x */
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 0), 0x1);

         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_gs(nir_shader *shader)
{
   struct state state = {};

   /* Don't lower multiple times: */
   nir_foreach_shader_out_variable (var, shader)
      if (var->data.location == VARYING_SLOT_GS_VERTEX_FLAGS_IR3)
         return;

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (before gs lowering):");
      nir_log_shaderi(shader);
   }

   /* Create an output var for vertex_flags. This will be shadowed below,
    * same way regular outputs get shadowed, and this variable will become a
    * temporary.
    */
   state.vertex_flags_out = nir_variable_create(
      shader, nir_var_shader_out, glsl_uint_type(), "vertex_flags");
   state.vertex_flags_out->data.driver_location = shader->num_outputs++;
   state.vertex_flags_out->data.location = VARYING_SLOT_GS_VERTEX_FLAGS_IR3;
   state.vertex_flags_out->data.interpolation = INTERP_MODE_NONE;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   state.header = nir_load_gs_header_ir3(&b);

   /* Generate two sets of shadow vars for the output variables. The first
    * set replaces the real outputs and the second set (emit_outputs) we'll
    * assign in the emit_vertex conditionals. Then at the end of the shader
    * we copy the emit_outputs to the real outputs, so that we get
    * store_output in uniform control flow.
    */
   exec_list_make_empty(&state.old_outputs);
   nir_foreach_shader_out_variable_safe (var, shader) {
      exec_node_remove(&var->node);
      exec_list_push_tail(&state.old_outputs, &var->node);
   }
   exec_list_make_empty(&state.new_outputs);
   exec_list_make_empty(&state.emit_outputs);
   nir_foreach_variable_in_list (var, &state.old_outputs) {
      /* Create a new output var by cloning the original output var and
       * stealing the name.
       */
      nir_variable *output = nir_variable_clone(var, shader);
      exec_list_push_tail(&state.new_outputs, &output->node);

      /* Rewrite the original output to be a shadow variable. */
      var->name = ralloc_asprintf(var, "%s@gs-temp", output->name);
      var->data.mode = nir_var_shader_temp;

      /* Clone the shadow variable to create the emit shadow variable that
       * we'll assign in the emit conditionals.
       */
      nir_variable *emit_output = nir_variable_clone(var, shader);
      emit_output->name = ralloc_asprintf(var, "%s@emit-temp", output->name);
      exec_list_push_tail(&state.emit_outputs, &emit_output->node);
   }
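
   /* At this point new_outputs holds the real shader_out variables,
    * old_outputs holds the now-temporary variables the original code still
    * writes, and emit_outputs holds the temporaries that get latched per
    * EmitVertex().
    */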

   /* During the shader we'll keep track of which vertex we're currently
    * emitting for the EmitVertex test and how many vertices we emitted, so we
    * know to discard if we didn't emit any. In most simple shaders, this can
    * all be statically determined and gets optimized away.
    */
   state.vertex_count_var =
      nir_local_variable_create(impl, glsl_uint_type(), "vertex_count");
   state.emitted_vertex_var =
      nir_local_variable_create(impl, glsl_uint_type(), "emitted_vertex");

   /* Initialize to 0. */
   b.cursor = nir_before_cf_list(&impl->body);
   nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.emitted_vertex_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);

   nir_foreach_block_safe (block, impl)
      lower_gs_block(block, &b, &state);

   /* Note: returns are lowered, so there should be only one block before the
    * end block. If we had real returns, we would probably want to redirect
    * them to this new if statement, rather than emitting this code at every
    * return statement.
    */
   assert(impl->end_block->predecessors->entries == 1);
   nir_block *block = nir_impl_last_block(impl);
   b.cursor = nir_after_block_before_jump(block);

   /* If we haven't emitted any vertices, we need to copy the shadow (old)
    * outputs to the emit outputs here.
    *
    * Also some piglit GS tests[1] don't have EndPrimitive() so throw
    * in an extra vertex_flags write for good measure. If unneeded it
    * will be optimized out.
    *
    * [1] ex, tests/spec/glsl-1.50/execution/compatibility/clipping/gs-clip-vertex-const-accept.shader_test
    */
   nir_ssa_def *cond =
      nir_ieq_imm(&b, nir_load_var(&b, state.emitted_vertex_var), 0);
   nir_push_if(&b, cond);
   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);
   copy_vars(&b, &state.emit_outputs, &state.old_outputs);
   nir_pop_if(&b, NULL);

   nir_discard_if(&b, cond);

   copy_vars(&b, &state.new_outputs, &state.emit_outputs);

   exec_list_append(&shader->variables, &state.old_outputs);
   exec_list_append(&shader->variables, &state.emit_outputs);
   exec_list_append(&shader->variables, &state.new_outputs);

   nir_metadata_preserve(impl, nir_metadata_none);

   nir_lower_global_vars_to_local(shader);
   nir_split_var_copies(shader);
   nir_lower_var_copies(shader);

   nir_fixup_deref_modes(shader);

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (after gs lowering):");
      nir_log_shaderi(shader);
   }
}