1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org> 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20bf215546Sopenharmony_ci * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21bf215546Sopenharmony_ci * SOFTWARE. 22bf215546Sopenharmony_ci * 23bf215546Sopenharmony_ci * Authors: 24bf215546Sopenharmony_ci * Rob Clark <robclark@freedesktop.org> 25bf215546Sopenharmony_ci */ 26bf215546Sopenharmony_ci 27bf215546Sopenharmony_ci#include "util/ralloc.h" 28bf215546Sopenharmony_ci 29bf215546Sopenharmony_ci#include "freedreno_dev_info.h" 30bf215546Sopenharmony_ci 31bf215546Sopenharmony_ci#include "ir3_compiler.h" 32bf215546Sopenharmony_ci 33bf215546Sopenharmony_cistatic const struct debug_named_value shader_debug_options[] = { 34bf215546Sopenharmony_ci /* clang-format off */ 35bf215546Sopenharmony_ci {"vs", IR3_DBG_SHADER_VS, "Print shader disasm for vertex shaders"}, 36bf215546Sopenharmony_ci {"tcs", IR3_DBG_SHADER_TCS, "Print shader disasm for tess ctrl shaders"}, 37bf215546Sopenharmony_ci {"tes", IR3_DBG_SHADER_TES, "Print shader disasm for tess eval shaders"}, 38bf215546Sopenharmony_ci {"gs", IR3_DBG_SHADER_GS, "Print shader disasm for geometry shaders"}, 39bf215546Sopenharmony_ci {"fs", IR3_DBG_SHADER_FS, "Print shader disasm for fragment shaders"}, 40bf215546Sopenharmony_ci {"cs", IR3_DBG_SHADER_CS, "Print shader disasm for compute shaders"}, 41bf215546Sopenharmony_ci {"disasm", IR3_DBG_DISASM, "Dump NIR and adreno shader disassembly"}, 42bf215546Sopenharmony_ci {"optmsgs", IR3_DBG_OPTMSGS, "Enable optimizer debug messages"}, 43bf215546Sopenharmony_ci {"forces2en", IR3_DBG_FORCES2EN, "Force s2en mode for tex sampler instructions"}, 44bf215546Sopenharmony_ci {"nouboopt", IR3_DBG_NOUBOOPT, "Disable lowering UBO to uniform"}, 45bf215546Sopenharmony_ci {"nofp16", IR3_DBG_NOFP16, "Don't lower mediump to fp16"}, 46bf215546Sopenharmony_ci {"nocache", IR3_DBG_NOCACHE, "Disable shader cache"}, 47bf215546Sopenharmony_ci {"spillall", IR3_DBG_SPILLALL, "Spill as much as possible to test the spiller"}, 48bf215546Sopenharmony_ci {"nopreamble", IR3_DBG_NOPREAMBLE, "Disable the preamble pass"}, 49bf215546Sopenharmony_ci#ifdef DEBUG 50bf215546Sopenharmony_ci /* DEBUG-only options: */ 51bf215546Sopenharmony_ci {"schedmsgs", IR3_DBG_SCHEDMSGS, "Enable scheduler debug messages"}, 52bf215546Sopenharmony_ci {"ramsgs", IR3_DBG_RAMSGS, "Enable register-allocation debug messages"}, 53bf215546Sopenharmony_ci#endif 54bf215546Sopenharmony_ci DEBUG_NAMED_VALUE_END 55bf215546Sopenharmony_ci /* clang-format on */ 56bf215546Sopenharmony_ci}; 57bf215546Sopenharmony_ci 58bf215546Sopenharmony_ciDEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG", 59bf215546Sopenharmony_ci shader_debug_options, 0) 60bf215546Sopenharmony_ciDEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH", 61bf215546Sopenharmony_ci NULL) 62bf215546Sopenharmony_ci 63bf215546Sopenharmony_cienum ir3_shader_debug ir3_shader_debug = 0; 64bf215546Sopenharmony_ciconst char *ir3_shader_override_path = NULL; 65bf215546Sopenharmony_ci 66bf215546Sopenharmony_civoid 67bf215546Sopenharmony_ciir3_compiler_destroy(struct ir3_compiler *compiler) 68bf215546Sopenharmony_ci{ 69bf215546Sopenharmony_ci disk_cache_destroy(compiler->disk_cache); 70bf215546Sopenharmony_ci ralloc_free(compiler); 71bf215546Sopenharmony_ci} 72bf215546Sopenharmony_ci 73bf215546Sopenharmony_ci#define COMMON_OPTIONS \ 74bf215546Sopenharmony_ci .lower_fpow = true, \ 75bf215546Sopenharmony_ci .lower_scmp = true, \ 76bf215546Sopenharmony_ci .lower_flrp16 = true, \ 77bf215546Sopenharmony_ci .lower_flrp32 = true, \ 78bf215546Sopenharmony_ci .lower_flrp64 = true, \ 79bf215546Sopenharmony_ci .lower_ffract = true, \ 80bf215546Sopenharmony_ci .lower_fmod = true, \ 81bf215546Sopenharmony_ci .lower_fdiv = true, \ 82bf215546Sopenharmony_ci .lower_isign = true, \ 83bf215546Sopenharmony_ci .lower_ldexp = true, \ 84bf215546Sopenharmony_ci .lower_uadd_carry = true, \ 85bf215546Sopenharmony_ci .lower_usub_borrow = true, \ 86bf215546Sopenharmony_ci .lower_mul_high = true, \ 87bf215546Sopenharmony_ci .lower_mul_2x32_64 = true, \ 88bf215546Sopenharmony_ci .fuse_ffma16 = true, \ 89bf215546Sopenharmony_ci .fuse_ffma32 = true, \ 90bf215546Sopenharmony_ci .fuse_ffma64 = true, \ 91bf215546Sopenharmony_ci .vertex_id_zero_based = false, \ 92bf215546Sopenharmony_ci .lower_extract_byte = true, \ 93bf215546Sopenharmony_ci .lower_extract_word = true, \ 94bf215546Sopenharmony_ci .lower_insert_byte = true, \ 95bf215546Sopenharmony_ci .lower_insert_word = true, \ 96bf215546Sopenharmony_ci .lower_helper_invocation = true, \ 97bf215546Sopenharmony_ci .lower_bitfield_insert_to_shifts = true, \ 98bf215546Sopenharmony_ci .lower_bitfield_extract_to_shifts = true, \ 99bf215546Sopenharmony_ci .lower_pack_half_2x16 = true, \ 100bf215546Sopenharmony_ci .lower_pack_snorm_4x8 = true, \ 101bf215546Sopenharmony_ci .lower_pack_snorm_2x16 = true, \ 102bf215546Sopenharmony_ci .lower_pack_unorm_4x8 = true, \ 103bf215546Sopenharmony_ci .lower_pack_unorm_2x16 = true, \ 104bf215546Sopenharmony_ci .lower_unpack_half_2x16 = true, \ 105bf215546Sopenharmony_ci .lower_unpack_snorm_4x8 = true, \ 106bf215546Sopenharmony_ci .lower_unpack_snorm_2x16 = true, \ 107bf215546Sopenharmony_ci .lower_unpack_unorm_4x8 = true, \ 108bf215546Sopenharmony_ci .lower_unpack_unorm_2x16 = true, \ 109bf215546Sopenharmony_ci .lower_pack_split = true, \ 110bf215546Sopenharmony_ci .use_interpolated_input_intrinsics = true, \ 111bf215546Sopenharmony_ci .lower_rotate = true, \ 112bf215546Sopenharmony_ci .lower_to_scalar = true, \ 113bf215546Sopenharmony_ci .has_imul24 = true, \ 114bf215546Sopenharmony_ci .has_fsub = true, \ 115bf215546Sopenharmony_ci .has_isub = true, \ 116bf215546Sopenharmony_ci .force_indirect_unrolling_sampler = true, \ 117bf215546Sopenharmony_ci .lower_uniforms_to_ubo = true, \ 118bf215546Sopenharmony_ci .use_scoped_barrier = true, \ 119bf215546Sopenharmony_ci .max_unroll_iterations = 32 120bf215546Sopenharmony_ci 121bf215546Sopenharmony_cistatic const nir_shader_compiler_options nir_options = { 122bf215546Sopenharmony_ci COMMON_OPTIONS, 123bf215546Sopenharmony_ci .lower_wpos_pntc = true, 124bf215546Sopenharmony_ci .lower_cs_local_index_to_id = true, 125bf215546Sopenharmony_ci 126bf215546Sopenharmony_ci /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c 127bf215546Sopenharmony_ci * but that should be harmless for GL since 64b is not 128bf215546Sopenharmony_ci * supported there. 129bf215546Sopenharmony_ci */ 130bf215546Sopenharmony_ci .lower_int64_options = (nir_lower_int64_options)~0, 131bf215546Sopenharmony_ci}; 132bf215546Sopenharmony_ci 133bf215546Sopenharmony_ci/* we don't want to lower vertex_id to _zero_based on newer gpus: */ 134bf215546Sopenharmony_cistatic const nir_shader_compiler_options nir_options_a6xx = { 135bf215546Sopenharmony_ci COMMON_OPTIONS, 136bf215546Sopenharmony_ci .vectorize_io = true, 137bf215546Sopenharmony_ci .force_indirect_unrolling = nir_var_all, 138bf215546Sopenharmony_ci .lower_wpos_pntc = true, 139bf215546Sopenharmony_ci .lower_cs_local_index_to_id = true, 140bf215546Sopenharmony_ci 141bf215546Sopenharmony_ci /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c 142bf215546Sopenharmony_ci * but that should be harmless for GL since 64b is not 143bf215546Sopenharmony_ci * supported there. 144bf215546Sopenharmony_ci */ 145bf215546Sopenharmony_ci .lower_int64_options = (nir_lower_int64_options)~0, 146bf215546Sopenharmony_ci .lower_device_index_to_zero = true, 147bf215546Sopenharmony_ci .has_udot_4x8 = true, 148bf215546Sopenharmony_ci .has_sudot_4x8 = true, 149bf215546Sopenharmony_ci}; 150bf215546Sopenharmony_ci 151bf215546Sopenharmony_cistruct ir3_compiler * 152bf215546Sopenharmony_ciir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, 153bf215546Sopenharmony_ci const struct ir3_compiler_options *options) 154bf215546Sopenharmony_ci{ 155bf215546Sopenharmony_ci struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler); 156bf215546Sopenharmony_ci 157bf215546Sopenharmony_ci ir3_shader_debug = debug_get_option_ir3_shader_debug(); 158bf215546Sopenharmony_ci ir3_shader_override_path = 159bf215546Sopenharmony_ci !__check_suid() ? debug_get_option_ir3_shader_override_path() : NULL; 160bf215546Sopenharmony_ci 161bf215546Sopenharmony_ci if (ir3_shader_override_path) { 162bf215546Sopenharmony_ci ir3_shader_debug |= IR3_DBG_NOCACHE; 163bf215546Sopenharmony_ci } 164bf215546Sopenharmony_ci 165bf215546Sopenharmony_ci compiler->dev = dev; 166bf215546Sopenharmony_ci compiler->dev_id = dev_id; 167bf215546Sopenharmony_ci compiler->gen = fd_dev_gen(dev_id); 168bf215546Sopenharmony_ci compiler->robust_buffer_access2 = options->robust_buffer_access2; 169bf215546Sopenharmony_ci 170bf215546Sopenharmony_ci /* All known GPU's have 32k local memory (aka shared) */ 171bf215546Sopenharmony_ci compiler->local_mem_size = 32 * 1024; 172bf215546Sopenharmony_ci /* TODO see if older GPU's were different here */ 173bf215546Sopenharmony_ci compiler->branchstack_size = 64; 174bf215546Sopenharmony_ci compiler->wave_granularity = 2; 175bf215546Sopenharmony_ci compiler->max_waves = 16; 176bf215546Sopenharmony_ci 177bf215546Sopenharmony_ci compiler->max_variable_workgroup_size = 1024; 178bf215546Sopenharmony_ci 179bf215546Sopenharmony_ci const struct fd_dev_info *dev_info = fd_dev_info(compiler->dev_id); 180bf215546Sopenharmony_ci 181bf215546Sopenharmony_ci if (compiler->gen >= 6) { 182bf215546Sopenharmony_ci compiler->samgq_workaround = true; 183bf215546Sopenharmony_ci /* a6xx split the pipeline state into geometry and fragment state, in 184bf215546Sopenharmony_ci * order to let the VS run ahead of the FS. As a result there are now 185bf215546Sopenharmony_ci * separate const files for the the fragment shader and everything 186bf215546Sopenharmony_ci * else, and separate limits. There seems to be a shared limit, but 187bf215546Sopenharmony_ci * it's higher than the vert or frag limits. 188bf215546Sopenharmony_ci * 189bf215546Sopenharmony_ci * Also, according to the observation on a630/a650/a660, max_const_pipeline 190bf215546Sopenharmony_ci * has to be 512 when all geometry stages are present. Otherwise a gpu hang 191bf215546Sopenharmony_ci * happens. Accordingly maximum safe size for each stage should be under 192bf215546Sopenharmony_ci * (max_const_pipeline / 5 (stages)) with 4 vec4's alignment considered for 193bf215546Sopenharmony_ci * const files. 194bf215546Sopenharmony_ci * 195bf215546Sopenharmony_ci * Only when VS and FS stages are present, the limit is 640. 196bf215546Sopenharmony_ci * 197bf215546Sopenharmony_ci * TODO: The shared limit seems to be different on different models. 198bf215546Sopenharmony_ci */ 199bf215546Sopenharmony_ci compiler->max_const_pipeline = 512; 200bf215546Sopenharmony_ci compiler->max_const_frag = 512; 201bf215546Sopenharmony_ci compiler->max_const_geom = 512; 202bf215546Sopenharmony_ci compiler->max_const_safe = 100; 203bf215546Sopenharmony_ci 204bf215546Sopenharmony_ci /* Compute shaders don't share a const file with the FS. Instead they 205bf215546Sopenharmony_ci * have their own file, which is smaller than the FS one. 206bf215546Sopenharmony_ci * 207bf215546Sopenharmony_ci * TODO: is this true on earlier gen's? 208bf215546Sopenharmony_ci */ 209bf215546Sopenharmony_ci compiler->max_const_compute = 256; 210bf215546Sopenharmony_ci 211bf215546Sopenharmony_ci /* TODO: implement clip+cull distances on earlier gen's */ 212bf215546Sopenharmony_ci compiler->has_clip_cull = true; 213bf215546Sopenharmony_ci 214bf215546Sopenharmony_ci /* TODO: implement private memory on earlier gen's */ 215bf215546Sopenharmony_ci compiler->has_pvtmem = true; 216bf215546Sopenharmony_ci 217bf215546Sopenharmony_ci compiler->has_preamble = true; 218bf215546Sopenharmony_ci 219bf215546Sopenharmony_ci compiler->tess_use_shared = dev_info->a6xx.tess_use_shared; 220bf215546Sopenharmony_ci 221bf215546Sopenharmony_ci compiler->storage_16bit = dev_info->a6xx.storage_16bit; 222bf215546Sopenharmony_ci 223bf215546Sopenharmony_ci compiler->has_getfiberid = dev_info->a6xx.has_getfiberid; 224bf215546Sopenharmony_ci 225bf215546Sopenharmony_ci compiler->has_dp2acc = dev_info->a6xx.has_dp2acc; 226bf215546Sopenharmony_ci compiler->has_dp4acc = dev_info->a6xx.has_dp4acc; 227bf215546Sopenharmony_ci 228bf215546Sopenharmony_ci compiler->shared_consts_base_offset = 504; 229bf215546Sopenharmony_ci compiler->shared_consts_size = 8; 230bf215546Sopenharmony_ci compiler->geom_shared_consts_size_quirk = 16; 231bf215546Sopenharmony_ci } else { 232bf215546Sopenharmony_ci compiler->max_const_pipeline = 512; 233bf215546Sopenharmony_ci compiler->max_const_geom = 512; 234bf215546Sopenharmony_ci compiler->max_const_frag = 512; 235bf215546Sopenharmony_ci compiler->max_const_compute = 512; 236bf215546Sopenharmony_ci 237bf215546Sopenharmony_ci /* Note: this will have to change if/when we support tess+GS on 238bf215546Sopenharmony_ci * earlier gen's. 239bf215546Sopenharmony_ci */ 240bf215546Sopenharmony_ci compiler->max_const_safe = 256; 241bf215546Sopenharmony_ci } 242bf215546Sopenharmony_ci 243bf215546Sopenharmony_ci if (compiler->gen >= 6) { 244bf215546Sopenharmony_ci compiler->reg_size_vec4 = dev_info->a6xx.reg_size_vec4; 245bf215546Sopenharmony_ci } else if (compiler->gen >= 4) { 246bf215546Sopenharmony_ci /* On a4xx-a5xx, using r24.x and above requires using the smallest 247bf215546Sopenharmony_ci * threadsize. 248bf215546Sopenharmony_ci */ 249bf215546Sopenharmony_ci compiler->reg_size_vec4 = 48; 250bf215546Sopenharmony_ci } else { 251bf215546Sopenharmony_ci /* TODO: confirm this */ 252bf215546Sopenharmony_ci compiler->reg_size_vec4 = 96; 253bf215546Sopenharmony_ci } 254bf215546Sopenharmony_ci 255bf215546Sopenharmony_ci if (compiler->gen >= 6) { 256bf215546Sopenharmony_ci compiler->threadsize_base = 64; 257bf215546Sopenharmony_ci } else if (compiler->gen >= 4) { 258bf215546Sopenharmony_ci /* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan 259bf215546Sopenharmony_ci * 1.1 subgroupSize which is 32. 260bf215546Sopenharmony_ci */ 261bf215546Sopenharmony_ci compiler->threadsize_base = 32; 262bf215546Sopenharmony_ci } else { 263bf215546Sopenharmony_ci compiler->threadsize_base = 8; 264bf215546Sopenharmony_ci } 265bf215546Sopenharmony_ci 266bf215546Sopenharmony_ci if (compiler->gen >= 4) { 267bf215546Sopenharmony_ci /* need special handling for "flat" */ 268bf215546Sopenharmony_ci compiler->flat_bypass = true; 269bf215546Sopenharmony_ci compiler->levels_add_one = false; 270bf215546Sopenharmony_ci compiler->unminify_coords = false; 271bf215546Sopenharmony_ci compiler->txf_ms_with_isaml = false; 272bf215546Sopenharmony_ci compiler->array_index_add_half = true; 273bf215546Sopenharmony_ci compiler->instr_align = 16; 274bf215546Sopenharmony_ci compiler->const_upload_unit = 4; 275bf215546Sopenharmony_ci } else { 276bf215546Sopenharmony_ci /* no special handling for "flat" */ 277bf215546Sopenharmony_ci compiler->flat_bypass = false; 278bf215546Sopenharmony_ci compiler->levels_add_one = true; 279bf215546Sopenharmony_ci compiler->unminify_coords = true; 280bf215546Sopenharmony_ci compiler->txf_ms_with_isaml = true; 281bf215546Sopenharmony_ci compiler->array_index_add_half = false; 282bf215546Sopenharmony_ci compiler->instr_align = 4; 283bf215546Sopenharmony_ci compiler->const_upload_unit = 8; 284bf215546Sopenharmony_ci } 285bf215546Sopenharmony_ci 286bf215546Sopenharmony_ci compiler->bool_type = (compiler->gen >= 5) ? TYPE_U16 : TYPE_U32; 287bf215546Sopenharmony_ci compiler->has_shared_regfile = compiler->gen >= 5; 288bf215546Sopenharmony_ci 289bf215546Sopenharmony_ci compiler->push_ubo_with_preamble = options->push_ubo_with_preamble; 290bf215546Sopenharmony_ci 291bf215546Sopenharmony_ci /* The driver can't request this unless preambles are supported. */ 292bf215546Sopenharmony_ci if (options->push_ubo_with_preamble) 293bf215546Sopenharmony_ci assert(compiler->has_preamble); 294bf215546Sopenharmony_ci 295bf215546Sopenharmony_ci if (compiler->gen >= 6) { 296bf215546Sopenharmony_ci compiler->nir_options = nir_options_a6xx; 297bf215546Sopenharmony_ci compiler->nir_options.has_udot_4x8 = dev_info->a6xx.has_dp2acc; 298bf215546Sopenharmony_ci compiler->nir_options.has_sudot_4x8 = dev_info->a6xx.has_dp2acc; 299bf215546Sopenharmony_ci } else { 300bf215546Sopenharmony_ci compiler->nir_options = nir_options; 301bf215546Sopenharmony_ci /* a2xx compiler doesn't handle indirect: */ 302bf215546Sopenharmony_ci if (compiler->gen <= 2) 303bf215546Sopenharmony_ci compiler->nir_options.force_indirect_unrolling = nir_var_all; 304bf215546Sopenharmony_ci } 305bf215546Sopenharmony_ci 306bf215546Sopenharmony_ci /* 16-bit ALU op generation is mostly controlled by frontend compiler options, but 307bf215546Sopenharmony_ci * this core NIR option enables some optimizations of 16-bit operations. 308bf215546Sopenharmony_ci */ 309bf215546Sopenharmony_ci if (compiler->gen >= 5 && !(ir3_shader_debug & IR3_DBG_NOFP16)) 310bf215546Sopenharmony_ci compiler->nir_options.support_16bit_alu = true; 311bf215546Sopenharmony_ci 312bf215546Sopenharmony_ci if (!options->disable_cache) 313bf215546Sopenharmony_ci ir3_disk_cache_init(compiler); 314bf215546Sopenharmony_ci 315bf215546Sopenharmony_ci return compiler; 316bf215546Sopenharmony_ci} 317bf215546Sopenharmony_ci 318bf215546Sopenharmony_ciconst nir_shader_compiler_options * 319bf215546Sopenharmony_ciir3_get_compiler_options(struct ir3_compiler *compiler) 320bf215546Sopenharmony_ci{ 321bf215546Sopenharmony_ci return &compiler->nir_options; 322bf215546Sopenharmony_ci} 323