/*
 * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/ralloc.h"

#include "freedreno_dev_info.h"

#include "ir3_compiler.h"

static const struct debug_named_value shader_debug_options[] = {
   /* clang-format off */
   {"vs",         IR3_DBG_SHADER_VS,  "Print shader disasm for vertex shaders"},
   {"tcs",        IR3_DBG_SHADER_TCS, "Print shader disasm for tess ctrl shaders"},
   {"tes",        IR3_DBG_SHADER_TES, "Print shader disasm for tess eval shaders"},
   {"gs",         IR3_DBG_SHADER_GS,  "Print shader disasm for geometry shaders"},
   {"fs",         IR3_DBG_SHADER_FS,  "Print shader disasm for fragment shaders"},
   {"cs",         IR3_DBG_SHADER_CS,  "Print shader disasm for compute shaders"},
   {"disasm",     IR3_DBG_DISASM,     "Dump NIR and adreno shader disassembly"},
   {"optmsgs",    IR3_DBG_OPTMSGS,    "Enable optimizer debug messages"},
   {"forces2en",  IR3_DBG_FORCES2EN,  "Force s2en mode for tex sampler instructions"},
   {"nouboopt",   IR3_DBG_NOUBOOPT,   "Disable lowering UBO to uniform"},
   {"nofp16",     IR3_DBG_NOFP16,     "Don't lower mediump to fp16"},
   {"nocache",    IR3_DBG_NOCACHE,    "Disable shader cache"},
   {"spillall",   IR3_DBG_SPILLALL,   "Spill as much as possible to test the spiller"},
   {"nopreamble", IR3_DBG_NOPREAMBLE, "Disable the preamble pass"},
#ifdef DEBUG
   /* DEBUG-only options: */
   {"schedmsgs",  IR3_DBG_SCHEDMSGS,  "Enable scheduler debug messages"},
   {"ramsgs",     IR3_DBG_RAMSGS,     "Enable register-allocation debug messages"},
#endif
   DEBUG_NAMED_VALUE_END
   /* clang-format on */
};

DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG",
                            shader_debug_options, 0)
DEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH",
                      NULL)

enum ir3_shader_debug ir3_shader_debug = 0;
const char *ir3_shader_override_path = NULL;

void
ir3_compiler_destroy(struct ir3_compiler *compiler)
{
   disk_cache_destroy(compiler->disk_cache);
   ralloc_free(compiler);
}

#define COMMON_OPTIONS \
   .lower_fpow = true, \
   .lower_scmp = true, \
   .lower_flrp16 = true, \
   .lower_flrp32 = true, \
   .lower_flrp64 = true, \
   .lower_ffract = true, \
   .lower_fmod = true, \
   .lower_fdiv = true, \
   .lower_isign = true, \
   .lower_ldexp = true, \
   .lower_uadd_carry = true, \
   .lower_usub_borrow = true, \
   .lower_mul_high = true, \
   .lower_mul_2x32_64 = true, \
   .fuse_ffma16 = true, \
   .fuse_ffma32 = true, \
   .fuse_ffma64 = true, \
   .vertex_id_zero_based = false, \
   .lower_extract_byte = true, \
   .lower_extract_word = true, \
   .lower_insert_byte = true, \
   .lower_insert_word = true, \
   .lower_helper_invocation = true, \
   .lower_bitfield_insert_to_shifts = true, \
   .lower_bitfield_extract_to_shifts = true, \
   .lower_pack_half_2x16 = true, \
   .lower_pack_snorm_4x8 = true, \
   .lower_pack_snorm_2x16 = true, \
   .lower_pack_unorm_4x8 = true, \
   .lower_pack_unorm_2x16 = true, \
   .lower_unpack_half_2x16 = true, \
   .lower_unpack_snorm_4x8 = true, \
   .lower_unpack_snorm_2x16 = true, \
   .lower_unpack_unorm_4x8 = true, \
   .lower_unpack_unorm_2x16 = true, \
   .lower_pack_split = true, \
   .use_interpolated_input_intrinsics = true, \
   .lower_rotate = true, \
   .lower_to_scalar = true, \
   .has_imul24 = true, \
   .has_fsub = true, \
   .has_isub = true, \
   .force_indirect_unrolling_sampler = true, \
   .lower_uniforms_to_ubo = true, \
   .use_scoped_barrier = true, \
   .max_unroll_iterations = 32

static const nir_shader_compiler_options nir_options = {
   COMMON_OPTIONS,
   .lower_wpos_pntc = true,
   .lower_cs_local_index_to_id = true,

   /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c
    * but that should be harmless for GL since 64b is not
    * supported there.
    */
   .lower_int64_options = (nir_lower_int64_options)~0,
};

/* we don't want to lower vertex_id to _zero_based on newer GPUs: */
static const nir_shader_compiler_options nir_options_a6xx = {
   COMMON_OPTIONS,
   .vectorize_io = true,
   .force_indirect_unrolling = nir_var_all,
   .lower_wpos_pntc = true,
   .lower_cs_local_index_to_id = true,

   /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c
    * but that should be harmless for GL since 64b is not
    * supported there.
    */
   .lower_int64_options = (nir_lower_int64_options)~0,
   .lower_device_index_to_zero = true,
   .has_udot_4x8 = true,
   .has_sudot_4x8 = true,
};

struct ir3_compiler *
ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
                    const struct ir3_compiler_options *options)
{
   struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);

   ir3_shader_debug = debug_get_option_ir3_shader_debug();
   ir3_shader_override_path =
      !__check_suid() ? debug_get_option_ir3_shader_override_path() : NULL;

   if (ir3_shader_override_path) {
      ir3_shader_debug |= IR3_DBG_NOCACHE;
   }

   compiler->dev = dev;
   compiler->dev_id = dev_id;
   compiler->gen = fd_dev_gen(dev_id);
   compiler->robust_buffer_access2 = options->robust_buffer_access2;

   /* All known GPUs have 32k local memory (aka shared) */
   compiler->local_mem_size = 32 * 1024;
   /* TODO see if older GPUs were different here */
   compiler->branchstack_size = 64;
   compiler->wave_granularity = 2;
   compiler->max_waves = 16;

   compiler->max_variable_workgroup_size = 1024;

   const struct fd_dev_info *dev_info = fd_dev_info(compiler->dev_id);

   if (compiler->gen >= 6) {
      compiler->samgq_workaround = true;
      /* a6xx split the pipeline state into geometry and fragment state, in
       * order to let the VS run ahead of the FS. As a result there are now
       * separate const files for the fragment shader and everything
       * else, and separate limits. There seems to be a shared limit, but
       * it's higher than the vert or frag limits.
       *
       * Also, based on observations on a630/a650/a660, max_const_pipeline
       * has to be 512 when all geometry stages are present, otherwise a
       * GPU hang happens. Accordingly, the maximum safe size for each stage
       * should be below (max_const_pipeline / 5 stages), taking the 4-vec4
       * alignment of const files into account.
       *
       * When only the VS and FS stages are present, the limit is 640.
       *
       * TODO: The shared limit seems to be different on different models.
       */
      compiler->max_const_pipeline = 512;
      compiler->max_const_frag = 512;
      compiler->max_const_geom = 512;
      compiler->max_const_safe = 100;

      /* Compute shaders don't share a const file with the FS. Instead they
       * have their own file, which is smaller than the FS one.
       *
       * TODO: is this true on earlier gens?
       */
      compiler->max_const_compute = 256;

      /* TODO: implement clip+cull distances on earlier gens */
      compiler->has_clip_cull = true;

      /* TODO: implement private memory on earlier gens */
      compiler->has_pvtmem = true;

      compiler->has_preamble = true;

      compiler->tess_use_shared = dev_info->a6xx.tess_use_shared;

      compiler->storage_16bit = dev_info->a6xx.storage_16bit;

      compiler->has_getfiberid = dev_info->a6xx.has_getfiberid;

      compiler->has_dp2acc = dev_info->a6xx.has_dp2acc;
      compiler->has_dp4acc = dev_info->a6xx.has_dp4acc;

      compiler->shared_consts_base_offset = 504;
      compiler->shared_consts_size = 8;
      compiler->geom_shared_consts_size_quirk = 16;
   } else {
      compiler->max_const_pipeline = 512;
      compiler->max_const_geom = 512;
      compiler->max_const_frag = 512;
      compiler->max_const_compute = 512;

      /* Note: this will have to change if/when we support tess+GS on
       * earlier gens.
       */
      compiler->max_const_safe = 256;
   }

   if (compiler->gen >= 6) {
      compiler->reg_size_vec4 = dev_info->a6xx.reg_size_vec4;
   } else if (compiler->gen >= 4) {
      /* On a4xx-a5xx, using r24.x and above requires using the smallest
       * threadsize.
       */
      compiler->reg_size_vec4 = 48;
   } else {
      /* TODO: confirm this */
      compiler->reg_size_vec4 = 96;
   }

   if (compiler->gen >= 6) {
      compiler->threadsize_base = 64;
   } else if (compiler->gen >= 4) {
      /* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
       * 1.1 subgroupSize which is 32.
       */
      compiler->threadsize_base = 32;
   } else {
      compiler->threadsize_base = 8;
   }

   if (compiler->gen >= 4) {
      /* need special handling for "flat" */
      compiler->flat_bypass = true;
      compiler->levels_add_one = false;
      compiler->unminify_coords = false;
      compiler->txf_ms_with_isaml = false;
      compiler->array_index_add_half = true;
      compiler->instr_align = 16;
      compiler->const_upload_unit = 4;
   } else {
      /* no special handling for "flat" */
      compiler->flat_bypass = false;
      compiler->levels_add_one = true;
      compiler->unminify_coords = true;
      compiler->txf_ms_with_isaml = true;
      compiler->array_index_add_half = false;
      compiler->instr_align = 4;
      compiler->const_upload_unit = 8;
   }

   compiler->bool_type = (compiler->gen >= 5) ? TYPE_U16 : TYPE_U32;
   compiler->has_shared_regfile = compiler->gen >= 5;

   compiler->push_ubo_with_preamble = options->push_ubo_with_preamble;

   /* The driver can't request this unless preambles are supported. */
   if (options->push_ubo_with_preamble)
      assert(compiler->has_preamble);

   if (compiler->gen >= 6) {
      compiler->nir_options = nir_options_a6xx;
      compiler->nir_options.has_udot_4x8 = dev_info->a6xx.has_dp2acc;
      compiler->nir_options.has_sudot_4x8 = dev_info->a6xx.has_dp2acc;
   } else {
      compiler->nir_options = nir_options;
      /* a2xx compiler doesn't handle indirect: */
      if (compiler->gen <= 2)
         compiler->nir_options.force_indirect_unrolling = nir_var_all;
   }

   /* 16-bit ALU op generation is mostly controlled by frontend compiler
    * options, but this core NIR option enables some optimizations of
    * 16-bit operations.
    */
   if (compiler->gen >= 5 && !(ir3_shader_debug & IR3_DBG_NOFP16))
      compiler->nir_options.support_16bit_alu = true;

   if (!options->disable_cache)
      ir3_disk_cache_init(compiler);

   return compiler;
}

const nir_shader_compiler_options *
ir3_get_compiler_options(struct ir3_compiler *compiler)
{
   return &compiler->nir_options;
}
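
/* Illustrative usage sketch (editor's note, not part of the driver): a
 * frontend would typically create one compiler per device, hand the NIR
 * options to its shader compilation path, and destroy the compiler on
 * teardown. The "dev" and "dev_id" values are assumed to come from the
 * driver's device layer; only fields referenced above are shown.
 *
 *    struct ir3_compiler_options opts = {
 *       .disable_cache = false,
 *    };
 *    struct ir3_compiler *c = ir3_compiler_create(dev, dev_id, &opts);
 *    const nir_shader_compiler_options *nir_opts = ir3_get_compiler_options(c);
 *    ...compile shaders against nir_opts...
 *    ir3_compiler_destroy(c);
 */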