/*
 * Copyright 2017 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "nir_builder.h"
#include "si_pipe.h"

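/* Filter for nir_lower_alu_to_scalar: on chips with packed 16-bit math,
 * keep 16-bit vec2 ALU results vectorized so they can use packed
 * instructions; scalarize everything else.
 */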
static bool si_alu_to_scalar_filter(const nir_instr *instr, const void *data)
{
   struct si_screen *sscreen = (struct si_screen *)data;

   if (sscreen->info.has_packed_math_16bit && instr->type == nir_instr_type_alu) {
      nir_alu_instr *alu = nir_instr_as_alu(instr);

      if (alu->dest.dest.is_ssa &&
          alu->dest.dest.ssa.bit_size == 16 &&
          alu->dest.dest.ssa.num_components == 2)
         return false;
   }

   return true;
}

static uint8_t si_vectorize_callback(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_alu)
      return 0;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   if (nir_dest_bit_size(alu->dest.dest) == 16)
      return 2;

   return 1;
}

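/* Run the standard NIR optimization loop until no pass makes progress.
 * "first" enables the array-variable splitting passes that are only run
 * the first time this is called for a shader.
 */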
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
{
   bool progress;

   do {
      progress = false;
      bool lower_alu_to_scalar = false;
      bool lower_phis_to_scalar = false;

      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
      NIR_PASS(progress, nir, nir_lower_alu_to_scalar, si_alu_to_scalar_filter, sscreen);
      NIR_PASS(progress, nir, nir_lower_phis_to_scalar, false);

      if (first) {
         NIR_PASS(progress, nir, nir_split_array_vars, nir_var_function_temp);
         NIR_PASS(lower_alu_to_scalar, nir, nir_shrink_vec_array_vars, nir_var_function_temp);
         NIR_PASS(progress, nir, nir_opt_find_array_copies);
      }
      NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
      NIR_PASS(progress, nir, nir_opt_dead_write_vars);

      NIR_PASS(lower_alu_to_scalar, nir, nir_opt_trivial_continues);
      /* (Constant) copy propagation is needed for txf with offsets. */
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_remove_phis);
      NIR_PASS(progress, nir, nir_opt_dce);
      /* nir_opt_if_optimize_phi_true_false is disabled on LLVM14 (#6976) */
      NIR_PASS(lower_phis_to_scalar, nir, nir_opt_if,
               nir_opt_if_aggressive_last_continue |
                  (LLVM_VERSION_MAJOR == 14 ? 0 : nir_opt_if_optimize_phi_true_false));
      NIR_PASS(progress, nir, nir_opt_dead_cf);

      if (lower_alu_to_scalar)
         NIR_PASS_V(nir, nir_lower_alu_to_scalar, si_alu_to_scalar_filter, sscreen);
      if (lower_phis_to_scalar)
         NIR_PASS_V(nir, nir_lower_phis_to_scalar, false);
      progress |= lower_alu_to_scalar | lower_phis_to_scalar;

      NIR_PASS(progress, nir, nir_opt_cse);
      NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);

      /* Needed for algebraic lowering */
      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_constant_folding);

      if (!nir->info.flrp_lowered) {
         unsigned lower_flrp = (nir->options->lower_flrp16 ? 16 : 0) |
                               (nir->options->lower_flrp32 ? 32 : 0) |
                               (nir->options->lower_flrp64 ? 64 : 0);
         assert(lower_flrp);
         bool lower_flrp_progress = false;

         NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, lower_flrp, false /* always_precise */);
         if (lower_flrp_progress) {
            NIR_PASS(progress, nir, nir_opt_constant_folding);
            progress = true;
         }

         /* Nothing should rematerialize any flrps, so we only
          * need to do this lowering once.
          */
         nir->info.flrp_lowered = true;
      }

      NIR_PASS(progress, nir, nir_opt_undef);
      NIR_PASS(progress, nir, nir_opt_conditional_discard);
      if (nir->options->max_unroll_iterations) {
         NIR_PASS(progress, nir, nir_opt_loop_unroll);
      }

      if (nir->info.stage == MESA_SHADER_FRAGMENT)
         NIR_PASS_V(nir, nir_opt_move_discards_to_top);

      if (sscreen->info.has_packed_math_16bit)
         NIR_PASS(progress, nir, nir_opt_vectorize, si_vectorize_callback, NULL);
   } while (progress);

   NIR_PASS_V(nir, nir_lower_var_copies);
}

void si_nir_late_opts(nir_shader *nir)
{
   bool more_late_algebraic = true;
   while (more_late_algebraic) {
      more_late_algebraic = false;
      NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
      NIR_PASS_V(nir, nir_opt_constant_folding);

      /* We should run this after constant folding for stages that support indirect
       * inputs/outputs.
       */
      if (nir->options->support_indirect_inputs & BITFIELD_BIT(nir->info.stage) ||
          nir->options->support_indirect_outputs & BITFIELD_BIT(nir->info.stage))
         NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in | nir_var_shader_out);

      NIR_PASS_V(nir, nir_copy_prop);
      NIR_PASS_V(nir, nir_opt_dce);
      NIR_PASS_V(nir, nir_opt_cse);
   }
}

static void si_late_optimize_16bit_samplers(struct si_screen *sscreen, nir_shader *nir)
{
   /* Optimize types of image_sample sources and destinations.
    *
    * The image_sample sources bit sizes are:
    *   nir_tex_src_coord:       a16 ? 16 : 32
    *   nir_tex_src_comparator:  32
    *   nir_tex_src_offset:      32
    *   nir_tex_src_bias:        a16 ? 16 : 32
    *   nir_tex_src_lod:         a16 ? 16 : 32
    *   nir_tex_src_min_lod:     a16 ? 16 : 32
    *   nir_tex_src_ms_index:    a16 ? 16 : 32
    *   nir_tex_src_ddx:         has_g16 ? (g16 ? 16 : 32) : (a16 ? 16 : 32)
    *   nir_tex_src_ddy:         has_g16 ? (g16 ? 16 : 32) : (a16 ? 16 : 32)
    *
    * We only use a16/g16 if all of the affected sources are 16bit.
    */
   bool has_g16 = sscreen->info.gfx_level >= GFX10 && LLVM_VERSION_MAJOR >= 12;
   struct nir_fold_tex_srcs_options fold_srcs_options[] = {
      {
         .sampler_dims =
            ~(BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE) | BITFIELD_BIT(GLSL_SAMPLER_DIM_BUF)),
         .src_types = (1 << nir_tex_src_coord) | (1 << nir_tex_src_lod) |
                      (1 << nir_tex_src_bias) | (1 << nir_tex_src_min_lod) |
                      (1 << nir_tex_src_ms_index) |
                      (has_g16 ? 0 : (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy)),
      },
      {
         .sampler_dims = ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE),
         .src_types = (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy),
      },
   };
   struct nir_fold_16bit_tex_image_options fold_16bit_options = {
      .rounding_mode = nir_rounding_mode_rtne,
      .fold_tex_dest = true,
      .fold_image_load_store_data = true,
      .fold_srcs_options_count = has_g16 ? 2 : 1,
      .fold_srcs_options = fold_srcs_options,
   };
   bool changed = false;
   NIR_PASS(changed, nir, nir_fold_16bit_tex_image, &fold_16bit_options);

   if (changed) {
      si_nir_opts(sscreen, nir, false);
      si_nir_late_opts(nir);
   }
}

static bool
lower_intrinsic_filter(const nir_instr *instr, const void *dummy)
{
   return instr->type == nir_instr_type_intrinsic;
}

static nir_ssa_def *
lower_intrinsic_instr(nir_builder *b, nir_instr *instr, void *dummy)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   switch (intrin->intrinsic) {
   case nir_intrinsic_is_sparse_texels_resident:
      /* code==0 means sparse texels are resident */
      return nir_ieq_imm(b, intrin->src[0].ssa, 0);
   case nir_intrinsic_sparse_residency_code_and:
      return nir_ior(b, intrin->src[0].ssa, intrin->src[1].ssa);
   default:
      return NULL;
   }
}

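/* Lower the sparse-residency intrinsics to plain ALU operations on the
 * residency code returned by sparse texture fetches.
 */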
static bool si_lower_intrinsics(nir_shader *nir)
{
   return nir_shader_lower_instructions(nir,
                                        lower_intrinsic_filter,
                                        lower_intrinsic_instr,
                                        NULL);
}

/**
 * Perform "lowering" operations on the NIR that are run once when the shader
 * selector is created.
 */
static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
{
   /* Perform lowerings (and optimizations) of code.
    *
    * Performance considerations aside, we must:
    * - lower certain ALU operations
    * - ensure constant offsets for texture instructions are folded
    *   and copy-propagated
    */

   static const struct nir_lower_tex_options lower_tex_options = {
      .lower_txp = ~0u,
      .lower_txs_cube_array = true,
      .lower_invalid_implicit_lod = true,
      .lower_tg4_offsets = true,
   };
   NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);

   static const struct nir_lower_image_options lower_image_options = {
      .lower_cube_size = true,
   };
   NIR_PASS_V(nir, nir_lower_image, &lower_image_options);

   NIR_PASS_V(nir, si_lower_intrinsics);

   const nir_lower_subgroups_options subgroups_options = {
      .subgroup_size = 64,
      .ballot_bit_size = 64,
      .ballot_components = 1,
      .lower_to_scalar = true,
      .lower_subgroup_masks = true,
      .lower_vote_trivial = false,
      .lower_vote_eq = true,
   };
   NIR_PASS_V(nir, nir_lower_subgroups, &subgroups_options);

   NIR_PASS_V(nir, nir_lower_discard_or_demote,
              (sscreen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) ||
                 nir->info.use_legacy_math_rules);

   /* Lower load constants to scalar and then clean up the mess */
   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
   NIR_PASS_V(nir, nir_lower_var_copies);
   NIR_PASS_V(nir, nir_opt_intrinsics);
   NIR_PASS_V(nir, nir_lower_system_values);
   NIR_PASS_V(nir, nir_lower_compute_system_values, NULL);

   /* si_nir_kill_outputs and ac_nir_optimize_outputs require outputs to be scalar. */
   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_TESS_EVAL ||
       nir->info.stage == MESA_SHADER_GEOMETRY)
      NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out);

   if (nir->info.stage == MESA_SHADER_COMPUTE) {
      if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
         /* If we are shuffling local_invocation_id for quad derivatives, we
          * need to derive local_invocation_index from local_invocation_id
          * first, so that the value corresponds to the shuffled
          * local_invocation_id.
          */
         nir_lower_compute_system_values_options options = {0};
         options.lower_local_invocation_index = true;
         NIR_PASS_V(nir, nir_lower_compute_system_values, &options);
      }

      nir_opt_cse(nir); /* CSE load_local_invocation_id */
      nir_lower_compute_system_values_options options = {0};
      options.shuffle_local_ids_for_quad_derivatives = true;
      NIR_PASS_V(nir, nir_lower_compute_system_values, &options);
   }

   if (sscreen->b.get_shader_param(&sscreen->b, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_FP16)) {
      NIR_PASS_V(nir, nir_lower_mediump_io,
                 /* TODO: LLVM fails to compile this test if VS inputs are 16-bit:
                  * dEQP-GLES31.functional.shaders.builtin_functions.integer.bitfieldinsert.uvec3_lowp_geometry
                  */
                 (nir->info.stage != MESA_SHADER_VERTEX ? nir_var_shader_in : 0) | nir_var_shader_out,
                 BITFIELD64_BIT(VARYING_SLOT_PNTC) | BITFIELD64_RANGE(VARYING_SLOT_VAR0, 32),
                 true);
   }

   si_nir_opts(sscreen, nir, true);
   /* Run late optimizations to fuse ffma and eliminate 16-bit conversions. */
   si_nir_late_opts(nir);

   if (sscreen->b.get_shader_param(&sscreen->b, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_FP16))
      si_late_optimize_16bit_samplers(sscreen, nir);

   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
}

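/* One-time NIR finalization for a shader: lowers I/O, removes uniforms that
 * were already lowered to UBOs, runs si_lower_nir, and gathers shader info.
 * Returns NULL because no error string is produced.
 */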
char *si_finalize_nir(struct pipe_screen *screen, void *nirptr)
{
   struct si_screen *sscreen = (struct si_screen *)screen;
   struct nir_shader *nir = (struct nir_shader *)nirptr;

   nir_lower_io_passes(nir);

   /* Remove dead derefs, so that we can remove uniforms. */
   NIR_PASS_V(nir, nir_opt_dce);

   /* Remove uniforms because those should have been lowered to UBOs already. */
   nir_foreach_variable_with_modes_safe(var, nir, nir_var_uniform) {
      if (!glsl_type_get_image_count(var->type) &&
          !glsl_type_get_sampler_count(var->type))
         exec_node_remove(&var->node);
   }

   si_lower_nir(sscreen, nir);
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   if (sscreen->options.inline_uniforms)
      nir_find_inlinable_uniforms(nir);

   /* Lower large variables that are always constant with load_constant intrinsics, which
    * get turned into PC-relative loads from a data section next to the shader.
    *
    * Run this once before lcssa because the added phis may prevent this
    * pass from operating correctly.
    *
    * nir_opt_large_constants may use op_amul (see nir_build_deref_offset),
    * or may create unneeded code, so run si_nir_opts if needed.
    */
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
   bool progress = false;
   NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
   if (progress)
      si_nir_opts(sscreen, nir, false);

   NIR_PASS_V(nir, nir_convert_to_lcssa, true, true); /* required by divergence analysis */
   NIR_PASS_V(nir, nir_divergence_analysis); /* to find divergent loops */

   return NULL;
}