/*
 * Copyright © 2022 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Timur Kristóf
 *
 */

#include "nir.h"
#include "nir_builder.h"
#include "util/u_math.h"

typedef struct {
   uint32_t task_count_shared_addr;
} lower_task_nv_state;

typedef struct {
   /* If true, lower all task_payload I/O to use shared memory. */
   bool payload_in_shared;

   /* Shared memory address where task_payload will be located. */
   uint32_t payload_shared_addr;
} lower_task_state;

static bool
lower_nv_task_output(nir_builder *b,
                     nir_instr *instr,
                     void *state)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   lower_task_nv_state *s = (lower_task_nv_state *) state;
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   switch (intrin->intrinsic) {
   case nir_intrinsic_load_output: {
      b->cursor = nir_after_instr(instr);
      nir_ssa_def *load =
         nir_load_shared(b, 1, 32, nir_imm_int(b, s->task_count_shared_addr));
      nir_ssa_def_rewrite_uses(&intrin->dest.ssa, load);
      nir_instr_remove(instr);
      return true;
   }

   case nir_intrinsic_store_output: {
      b->cursor = nir_after_instr(instr);
      nir_ssa_def *store_val = intrin->src[0].ssa;
      nir_store_shared(b, store_val, nir_imm_int(b, s->task_count_shared_addr));
      nir_instr_remove(instr);
      return true;
   }

   default:
      return false;
   }
}
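/*
 * Initializes the shared task count to 0 at the beginning of the shader,
 * then reads it back at the end and emits launch_mesh_workgroups with a
 * 1-dimensional dispatch size based on it.
 */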
static void
append_launch_mesh_workgroups_to_nv_task(nir_builder *b,
                                         lower_task_nv_state *s)
{
   /* At the beginning of the shader, write 0 to the task count.
    * This ensures that 0 mesh workgroups are launched when the
    * shader doesn't write the TASK_COUNT output.
    */
   b->cursor = nir_before_cf_list(&b->impl->body);
   nir_ssa_def *zero = nir_imm_int(b, 0);
   nir_store_shared(b, zero, nir_imm_int(b, s->task_count_shared_addr));

   nir_scoped_barrier(b, .execution_scope = NIR_SCOPE_WORKGROUP,
                         .memory_scope = NIR_SCOPE_WORKGROUP,
                         .memory_semantics = NIR_MEMORY_RELEASE,
                         .memory_modes = nir_var_mem_shared);

   /* At the end of the shader, read the task count from shared memory
    * and emit launch_mesh_workgroups.
    */
   b->cursor = nir_after_cf_list(&b->impl->body);

   nir_scoped_barrier(b, .execution_scope = NIR_SCOPE_WORKGROUP,
                         .memory_scope = NIR_SCOPE_WORKGROUP,
                         .memory_semantics = NIR_MEMORY_ACQUIRE,
                         .memory_modes = nir_var_mem_shared);

   nir_ssa_def *task_count =
      nir_load_shared(b, 1, 32, nir_imm_int(b, s->task_count_shared_addr));

   /* NV_mesh_shader doesn't offer a way to choose which task_payload
    * variables should be passed to mesh shaders, so we just pass all of them.
    */
   uint32_t range = b->shader->info.task_payload_size;

   nir_ssa_def *one = nir_imm_int(b, 1);
   nir_ssa_def *dispatch_3d = nir_vec3(b, task_count, one, one);
   nir_launch_mesh_workgroups(b, dispatch_3d, .base = 0, .range = range);
}

/**
 * For NV_mesh_shader:
 * Task shaders have only one output, TASK_COUNT, which is a 32-bit
 * unsigned int that contains the 1-dimensional mesh dispatch size.
 * This output should behave like a shared variable.
 *
 * We lower this output to a shared variable and then we emit
 * the new launch_mesh_workgroups intrinsic at the end of the shader.
 */
static void
nir_lower_nv_task_count(nir_shader *shader)
{
   lower_task_nv_state state = {
      .task_count_shared_addr = ALIGN(shader->info.shared_size, 4),
   };

   shader->info.shared_size += 4;
   nir_shader_instructions_pass(shader, lower_nv_task_output,
                                nir_metadata_none, &state);

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   nir_builder builder;
   nir_builder_init(&builder, impl);

   append_launch_mesh_workgroups_to_nv_task(&builder, &state);
   nir_metadata_preserve(impl, nir_metadata_none);
}

static nir_intrinsic_op
shared_opcode_for_task_payload(nir_intrinsic_op task_payload_op)
{
   switch (task_payload_op) {
#define OP(O) case nir_intrinsic_task_payload_##O: return nir_intrinsic_shared_##O;
   OP(atomic_exchange)
   OP(atomic_comp_swap)
   OP(atomic_add)
   OP(atomic_imin)
   OP(atomic_umin)
   OP(atomic_imax)
   OP(atomic_umax)
   OP(atomic_and)
   OP(atomic_or)
   OP(atomic_xor)
   OP(atomic_fadd)
   OP(atomic_fmin)
   OP(atomic_fmax)
   OP(atomic_fcomp_swap)
#undef OP
   case nir_intrinsic_load_task_payload:
      return nir_intrinsic_load_shared;
   case nir_intrinsic_store_task_payload:
      return nir_intrinsic_store_shared;
   default:
      unreachable("Invalid task payload atomic");
   }
}

static bool
lower_task_payload_to_shared(nir_builder *b,
                             nir_intrinsic_instr *intrin,
                             lower_task_state *s)
{
   /* This assumes that shared and task_payload intrinsics
    * have the same number of sources and same indices.
    */
   unsigned base = nir_intrinsic_base(intrin);
   intrin->intrinsic = shared_opcode_for_task_payload(intrin->intrinsic);
   nir_intrinsic_set_base(intrin, base + s->payload_shared_addr);

   return true;
}
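/*
 * Copies the task payload from its shared memory location to the
 * task_payload output, spreading 16-byte copies across all invocations
 * of the workgroup.
 */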
static void
emit_shared_to_payload_copy(nir_builder *b,
                            uint32_t payload_addr,
                            uint32_t payload_size,
                            lower_task_state *s)
{
   const unsigned invocations = b->shader->info.workgroup_size[0] *
                                b->shader->info.workgroup_size[1] *
                                b->shader->info.workgroup_size[2];

   const unsigned bytes_per_copy = 16;
   const unsigned copies_needed = DIV_ROUND_UP(payload_size, bytes_per_copy);
   const unsigned copies_per_invocation = DIV_ROUND_UP(copies_needed, invocations);
   const unsigned base_shared_addr = s->payload_shared_addr + payload_addr;

   nir_ssa_def *invocation_index = nir_load_local_invocation_index(b);
   nir_ssa_def *addr = nir_imul_imm(b, invocation_index, bytes_per_copy);

   /* Wait for all previous shared stores to finish.
    * This is necessary because we placed the payload in shared memory.
    */
   nir_scoped_barrier(b, .execution_scope = NIR_SCOPE_WORKGROUP,
                         .memory_scope = NIR_SCOPE_WORKGROUP,
                         .memory_semantics = NIR_MEMORY_ACQ_REL,
                         .memory_modes = nir_var_mem_shared);

   for (unsigned i = 0; i < copies_per_invocation; ++i) {
      unsigned const_off = bytes_per_copy * invocations * i;

      /* Read from shared memory. */
      nir_ssa_def *copy =
         nir_load_shared(b, 4, 32, addr, .align_mul = 16,
                         .base = base_shared_addr + const_off);

      /* Write to task payload memory. */
      nir_store_task_payload(b, copy, addr, .base = const_off);
   }
}
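/*
 * Lowers launch_mesh_workgroups: copies the payload out of shared memory
 * when it was lowered there, deletes all code after the intrinsic, and
 * terminates the shader, so that launch_mesh_workgroups always behaves
 * as a terminating instruction.
 */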
static bool
lower_task_launch_mesh_workgroups(nir_builder *b,
                                  nir_intrinsic_instr *intrin,
                                  lower_task_state *s)
{
   if (s->payload_in_shared) {
      /* Copy the payload from shared memory.
       * Because launch_mesh_workgroups may only occur in
       * workgroup-uniform control flow, here we assume that
       * all invocations in the workgroup are active and therefore
       * they can all participate in the copy.
       *
       * TODO: Skip the copy when the mesh dispatch size is (0, 0, 0).
       * This is problematic because the dispatch size can be divergent,
       * and may differ across subgroups.
       */

      uint32_t payload_addr = nir_intrinsic_base(intrin);
      uint32_t payload_size = nir_intrinsic_range(intrin);

      b->cursor = nir_before_instr(&intrin->instr);
      emit_shared_to_payload_copy(b, payload_addr, payload_size, s);
   }

   /* The launch_mesh_workgroups intrinsic is a terminating instruction,
    * so let's delete everything after it.
    */
   b->cursor = nir_after_instr(&intrin->instr);
   nir_block *current_block = nir_cursor_current_block(b->cursor);

   /* Delete following instructions in the current block. */
   nir_foreach_instr_reverse_safe(instr, current_block) {
      if (instr == &intrin->instr)
         break;
      nir_instr_remove(instr);
   }

   /* Delete following CF at the same level. */
   b->cursor = nir_after_instr(&intrin->instr);
   nir_cf_list extracted;
   nir_cf_node *end_node = &current_block->cf_node;
   while (!nir_cf_node_is_last(end_node))
      end_node = nir_cf_node_next(end_node);
   nir_cf_extract(&extracted, b->cursor, nir_after_cf_node(end_node));
   nir_cf_delete(&extracted);

   /* Terminate the task shader. */
   b->cursor = nir_after_instr(&intrin->instr);
   nir_jump(b, nir_jump_return);

   return true;
}

static bool
lower_task_intrin(nir_builder *b,
                  nir_instr *instr,
                  void *state)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   lower_task_state *s = (lower_task_state *) state;
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   switch (intrin->intrinsic) {
   case nir_intrinsic_task_payload_atomic_add:
   case nir_intrinsic_task_payload_atomic_imin:
   case nir_intrinsic_task_payload_atomic_umin:
   case nir_intrinsic_task_payload_atomic_imax:
   case nir_intrinsic_task_payload_atomic_umax:
   case nir_intrinsic_task_payload_atomic_and:
   case nir_intrinsic_task_payload_atomic_or:
   case nir_intrinsic_task_payload_atomic_xor:
   case nir_intrinsic_task_payload_atomic_exchange:
   case nir_intrinsic_task_payload_atomic_comp_swap:
   case nir_intrinsic_task_payload_atomic_fadd:
   case nir_intrinsic_task_payload_atomic_fmin:
   case nir_intrinsic_task_payload_atomic_fmax:
   case nir_intrinsic_task_payload_atomic_fcomp_swap:
   case nir_intrinsic_store_task_payload:
   case nir_intrinsic_load_task_payload:
      if (s->payload_in_shared)
         return lower_task_payload_to_shared(b, intrin, s);
      return false;
   case nir_intrinsic_launch_mesh_workgroups:
      return lower_task_launch_mesh_workgroups(b, intrin, s);
   default:
      return false;
   }
}

static bool
uses_task_payload_atomics(nir_shader *shader)
{
   nir_foreach_function(func, shader) {
      if (!func->impl)
         continue;

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            switch (intrin->intrinsic) {
            case nir_intrinsic_task_payload_atomic_add:
            case nir_intrinsic_task_payload_atomic_imin:
            case nir_intrinsic_task_payload_atomic_umin:
            case nir_intrinsic_task_payload_atomic_imax:
            case nir_intrinsic_task_payload_atomic_umax:
            case nir_intrinsic_task_payload_atomic_and:
            case nir_intrinsic_task_payload_atomic_or:
            case nir_intrinsic_task_payload_atomic_xor:
            case nir_intrinsic_task_payload_atomic_exchange:
            case nir_intrinsic_task_payload_atomic_comp_swap:
            case nir_intrinsic_task_payload_atomic_fadd:
            case nir_intrinsic_task_payload_atomic_fmin:
            case nir_intrinsic_task_payload_atomic_fmax:
            case nir_intrinsic_task_payload_atomic_fcomp_swap:
               return true;
            default:
               break;
            }
         }
      }
   }

   return false;
}

/**
 * Common Task Shader lowering to make the job of the backends easier.
 *
 * - Lowers NV_mesh_shader TASK_COUNT output to launch_mesh_workgroups.
 * - Removes all code after launch_mesh_workgroups, enforcing the
 *   fact that it's a terminating instruction.
 * - Ensures that task shaders always have at least one
 *   launch_mesh_workgroups instruction, so the backend doesn't
 *   need to implement a special case when the shader doesn't have it.
 * - Optionally, implements task_payload using shared memory when
 *   task_payload atomics are used.
 *   This is useful when the backend doesn't support the same atomic
 *   operations on the task payload as it does on shared memory.
 *   If this is used, the backend only has to implement the basic
 *   load/store operations for task_payload.
 *
 * Note that this pass operates on lowered explicit I/O intrinsics, so
 * it should be called after nir_lower_io + nir_lower_explicit_io.
 */
bool
nir_lower_task_shader(nir_shader *shader,
                      nir_lower_task_shader_options options)
{
   if (shader->info.stage != MESA_SHADER_TASK)
      return false;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   nir_builder builder;
   nir_builder_init(&builder, impl);

   if (shader->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_TASK_COUNT)) {
      /* NV_mesh_shader:
       * If the shader writes TASK_COUNT, lower that to emit
       * the new launch_mesh_workgroups intrinsic instead.
       */
      nir_lower_nv_task_count(shader);
   } else {
      /* To make sure that task shaders always have a code path that
       * executes a launch_mesh_workgroups, let's add one at the end.
       * If the shader already had a launch_mesh_workgroups by any chance,
       * the one we add here will be removed.
       */
      nir_block *last_block = nir_impl_last_block(impl);
      builder.cursor = nir_after_block_before_jump(last_block);
      nir_launch_mesh_workgroups(&builder, nir_imm_zero(&builder, 3, 32));
   }

   bool payload_in_shared = options.payload_to_shared_for_atomics &&
                            uses_task_payload_atomics(shader);

   lower_task_state state = {
      .payload_shared_addr = ALIGN(shader->info.shared_size, 16),
      .payload_in_shared = payload_in_shared,
   };

   if (payload_in_shared)
      shader->info.shared_size =
         state.payload_shared_addr + shader->info.task_payload_size;

   nir_shader_instructions_pass(shader, lower_task_intrin,
                                nir_metadata_none, &state);

   /* Delete all code that potentially can't be reached due to
    * launch_mesh_workgroups being a terminating instruction.
    */
   nir_lower_returns(shader);

   bool progress;
   do {
      progress = false;
      NIR_PASS(progress, shader, nir_opt_dead_cf);
      NIR_PASS(progress, shader, nir_opt_dce);
   } while (progress);

   return true;
}
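/*
 * Example of how a backend might invoke this pass (illustrative sketch only;
 * "hw_has_payload_atomics" is a hypothetical capability flag, not part of
 * this file):
 *
 *    nir_lower_task_shader_options opts = {
 *       .payload_to_shared_for_atomics = !hw_has_payload_atomics,
 *    };
 *    nir_lower_task_shader(nir, opts);
 */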