freedreno/ir3/ir3_compiler.c

/*
 * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci#include "util/ralloc.h"
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci#include "freedreno_dev_info.h"
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci#include "ir3_compiler.h"
bf215546Sopenharmony_ci
bf215546Sopenharmony_cistatic const struct debug_named_value shader_debug_options[] = {
bf215546Sopenharmony_ci   /* clang-format off */
bf215546Sopenharmony_ci   {"vs",         IR3_DBG_SHADER_VS,  "Print shader disasm for vertex shaders"},
bf215546Sopenharmony_ci   {"tcs",        IR3_DBG_SHADER_TCS, "Print shader disasm for tess ctrl shaders"},
bf215546Sopenharmony_ci   {"tes",        IR3_DBG_SHADER_TES, "Print shader disasm for tess eval shaders"},
bf215546Sopenharmony_ci   {"gs",         IR3_DBG_SHADER_GS,  "Print shader disasm for geometry shaders"},
bf215546Sopenharmony_ci   {"fs",         IR3_DBG_SHADER_FS,  "Print shader disasm for fragment shaders"},
bf215546Sopenharmony_ci   {"cs",         IR3_DBG_SHADER_CS,  "Print shader disasm for compute shaders"},
bf215546Sopenharmony_ci   {"disasm",     IR3_DBG_DISASM,     "Dump NIR and adreno shader disassembly"},
bf215546Sopenharmony_ci   {"optmsgs",    IR3_DBG_OPTMSGS,    "Enable optimizer debug messages"},
bf215546Sopenharmony_ci   {"forces2en",  IR3_DBG_FORCES2EN,  "Force s2en mode for tex sampler instructions"},
bf215546Sopenharmony_ci   {"nouboopt",   IR3_DBG_NOUBOOPT,   "Disable lowering UBO to uniform"},
bf215546Sopenharmony_ci   {"nofp16",     IR3_DBG_NOFP16,     "Don't lower mediump to fp16"},
bf215546Sopenharmony_ci   {"nocache",    IR3_DBG_NOCACHE,    "Disable shader cache"},
bf215546Sopenharmony_ci   {"spillall",   IR3_DBG_SPILLALL,   "Spill as much as possible to test the spiller"},
bf215546Sopenharmony_ci   {"nopreamble", IR3_DBG_NOPREAMBLE, "Disable the preamble pass"},
bf215546Sopenharmony_ci#ifdef DEBUG
bf215546Sopenharmony_ci   /* DEBUG-only options: */
bf215546Sopenharmony_ci   {"schedmsgs",  IR3_DBG_SCHEDMSGS,  "Enable scheduler debug messages"},
bf215546Sopenharmony_ci   {"ramsgs",     IR3_DBG_RAMSGS,     "Enable register-allocation debug messages"},
bf215546Sopenharmony_ci#endif
bf215546Sopenharmony_ci   DEBUG_NAMED_VALUE_END
bf215546Sopenharmony_ci   /* clang-format on */
bf215546Sopenharmony_ci};
bf215546Sopenharmony_ci
/*
 * Accessors for the IR3_SHADER_DEBUG and IR3_SHADER_OVERRIDE_PATH options:
 * debug_get_option_ir3_shader_debug() and
 * debug_get_option_ir3_shader_override_path().  The flags option is parsed
 * against shader_debug_options and defaults to 0; the path defaults to NULL.
 */
DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG",
                            shader_debug_options, 0)
DEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH",
                      NULL)

/* Global debug state; assigned each time ir3_compiler_create() runs. */
enum ir3_shader_debug ir3_shader_debug = 0;
const char *ir3_shader_override_path = NULL;
bf215546Sopenharmony_ci
bf215546Sopenharmony_civoid
bf215546Sopenharmony_ciir3_compiler_destroy(struct ir3_compiler *compiler)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   disk_cache_destroy(compiler->disk_cache);
bf215546Sopenharmony_ci   ralloc_free(compiler);
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
/*
 * Designated initializers shared by every nir_shader_compiler_options table
 * in this file.  Expands to a comma-separated initializer list WITHOUT a
 * trailing comma, so users write `COMMON_OPTIONS,` followed by their own
 * per-generation fields.
 */
#define COMMON_OPTIONS                                                        \
   .lower_fpow = true,                                                        \
   .lower_scmp = true,                                                        \
   .lower_flrp16 = true,                                                      \
   .lower_flrp32 = true,                                                      \
   .lower_flrp64 = true,                                                      \
   .lower_ffract = true,                                                      \
   .lower_fmod = true,                                                        \
   .lower_fdiv = true,                                                        \
   .lower_isign = true,                                                       \
   .lower_ldexp = true,                                                       \
   .lower_uadd_carry = true,                                                  \
   .lower_usub_borrow = true,                                                 \
   .lower_mul_high = true,                                                    \
   .lower_mul_2x32_64 = true,                                                 \
   .fuse_ffma16 = true,                                                       \
   .fuse_ffma32 = true,                                                       \
   .fuse_ffma64 = true,                                                       \
   .vertex_id_zero_based = false,                                             \
   .lower_extract_byte = true,                                                \
   .lower_extract_word = true,                                                \
   .lower_insert_byte = true,                                                 \
   .lower_insert_word = true,                                                 \
   .lower_helper_invocation = true,                                           \
   .lower_bitfield_insert_to_shifts = true,                                   \
   .lower_bitfield_extract_to_shifts = true,                                  \
   .lower_pack_half_2x16 = true,                                              \
   .lower_pack_snorm_4x8 = true,                                              \
   .lower_pack_snorm_2x16 = true,                                             \
   .lower_pack_unorm_4x8 = true,                                              \
   .lower_pack_unorm_2x16 = true,                                             \
   .lower_unpack_half_2x16 = true,                                            \
   .lower_unpack_snorm_4x8 = true,                                            \
   .lower_unpack_snorm_2x16 = true,                                           \
   .lower_unpack_unorm_4x8 = true,                                            \
   .lower_unpack_unorm_2x16 = true,                                           \
   .lower_pack_split = true,                                                  \
   .use_interpolated_input_intrinsics = true,                                 \
   .lower_rotate = true,                                                      \
   .lower_to_scalar = true,                                                   \
   .has_imul24 = true,                                                        \
   .has_fsub = true,                                                          \
   .has_isub = true,                                                          \
   .force_indirect_unrolling_sampler = true,                                  \
   .lower_uniforms_to_ubo = true,                                             \
   .use_scoped_barrier = true,                                                \
   .max_unroll_iterations = 32
bf215546Sopenharmony_ci
bf215546Sopenharmony_cistatic const nir_shader_compiler_options nir_options = {
bf215546Sopenharmony_ci   COMMON_OPTIONS,
bf215546Sopenharmony_ci   .lower_wpos_pntc = true,
bf215546Sopenharmony_ci   .lower_cs_local_index_to_id = true,
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c
bf215546Sopenharmony_ci    * but that should be harmless for GL since 64b is not
bf215546Sopenharmony_ci    * supported there.
bf215546Sopenharmony_ci    */
bf215546Sopenharmony_ci   .lower_int64_options = (nir_lower_int64_options)~0,
bf215546Sopenharmony_ci};
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/* we don't want to lower vertex_id to _zero_based on newer gpus: */
bf215546Sopenharmony_cistatic const nir_shader_compiler_options nir_options_a6xx = {
bf215546Sopenharmony_ci   COMMON_OPTIONS,
bf215546Sopenharmony_ci   .vectorize_io = true,
bf215546Sopenharmony_ci   .force_indirect_unrolling = nir_var_all,
bf215546Sopenharmony_ci   .lower_wpos_pntc = true,
bf215546Sopenharmony_ci   .lower_cs_local_index_to_id = true,
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c
bf215546Sopenharmony_ci    * but that should be harmless for GL since 64b is not
bf215546Sopenharmony_ci    * supported there.
bf215546Sopenharmony_ci    */
bf215546Sopenharmony_ci   .lower_int64_options = (nir_lower_int64_options)~0,
bf215546Sopenharmony_ci   .lower_device_index_to_zero = true,
bf215546Sopenharmony_ci   .has_udot_4x8 = true,
bf215546Sopenharmony_ci   .has_sudot_4x8 = true,
bf215546Sopenharmony_ci};
bf215546Sopenharmony_ci
bf215546Sopenharmony_cistruct ir3_compiler *
bf215546Sopenharmony_ciir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
bf215546Sopenharmony_ci                    const struct ir3_compiler_options *options)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   ir3_shader_debug = debug_get_option_ir3_shader_debug();
bf215546Sopenharmony_ci   ir3_shader_override_path =
bf215546Sopenharmony_ci      !__check_suid() ? debug_get_option_ir3_shader_override_path() : NULL;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (ir3_shader_override_path) {
bf215546Sopenharmony_ci      ir3_shader_debug |= IR3_DBG_NOCACHE;
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   compiler->dev = dev;
bf215546Sopenharmony_ci   compiler->dev_id = dev_id;
bf215546Sopenharmony_ci   compiler->gen = fd_dev_gen(dev_id);
bf215546Sopenharmony_ci   compiler->robust_buffer_access2 = options->robust_buffer_access2;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* All known GPU's have 32k local memory (aka shared) */
bf215546Sopenharmony_ci   compiler->local_mem_size = 32 * 1024;
bf215546Sopenharmony_ci   /* TODO see if older GPU's were different here */
bf215546Sopenharmony_ci   compiler->branchstack_size = 64;
bf215546Sopenharmony_ci   compiler->wave_granularity = 2;
bf215546Sopenharmony_ci   compiler->max_waves = 16;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   compiler->max_variable_workgroup_size = 1024;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   const struct fd_dev_info *dev_info = fd_dev_info(compiler->dev_id);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (compiler->gen >= 6) {
bf215546Sopenharmony_ci      compiler->samgq_workaround = true;
bf215546Sopenharmony_ci      /* a6xx split the pipeline state into geometry and fragment state, in
bf215546Sopenharmony_ci       * order to let the VS run ahead of the FS. As a result there are now
bf215546Sopenharmony_ci       * separate const files for the the fragment shader and everything
bf215546Sopenharmony_ci       * else, and separate limits. There seems to be a shared limit, but
bf215546Sopenharmony_ci       * it's higher than the vert or frag limits.
bf215546Sopenharmony_ci       *
bf215546Sopenharmony_ci       * Also, according to the observation on a630/a650/a660, max_const_pipeline
bf215546Sopenharmony_ci       * has to be 512 when all geometry stages are present. Otherwise a gpu hang
bf215546Sopenharmony_ci       * happens. Accordingly maximum safe size for each stage should be under
bf215546Sopenharmony_ci       * (max_const_pipeline / 5 (stages)) with 4 vec4's alignment considered for
bf215546Sopenharmony_ci       * const files.
bf215546Sopenharmony_ci       *
bf215546Sopenharmony_ci       * Only when VS and FS stages are present, the limit is 640.
bf215546Sopenharmony_ci       *
bf215546Sopenharmony_ci       * TODO: The shared limit seems to be different on different models.
bf215546Sopenharmony_ci       */
bf215546Sopenharmony_ci      compiler->max_const_pipeline = 512;
bf215546Sopenharmony_ci      compiler->max_const_frag = 512;
bf215546Sopenharmony_ci      compiler->max_const_geom = 512;
bf215546Sopenharmony_ci      compiler->max_const_safe = 100;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      /* Compute shaders don't share a const file with the FS. Instead they
bf215546Sopenharmony_ci       * have their own file, which is smaller than the FS one.
bf215546Sopenharmony_ci       *
bf215546Sopenharmony_ci       * TODO: is this true on earlier gen's?
bf215546Sopenharmony_ci       */
bf215546Sopenharmony_ci      compiler->max_const_compute = 256;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      /* TODO: implement clip+cull distances on earlier gen's */
bf215546Sopenharmony_ci      compiler->has_clip_cull = true;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      /* TODO: implement private memory on earlier gen's */
bf215546Sopenharmony_ci      compiler->has_pvtmem = true;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      compiler->has_preamble = true;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      compiler->tess_use_shared = dev_info->a6xx.tess_use_shared;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      compiler->storage_16bit = dev_info->a6xx.storage_16bit;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      compiler->has_getfiberid = dev_info->a6xx.has_getfiberid;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      compiler->has_dp2acc = dev_info->a6xx.has_dp2acc;
bf215546Sopenharmony_ci      compiler->has_dp4acc = dev_info->a6xx.has_dp4acc;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      compiler->shared_consts_base_offset = 504;
bf215546Sopenharmony_ci      compiler->shared_consts_size = 8;
bf215546Sopenharmony_ci      compiler->geom_shared_consts_size_quirk = 16;
bf215546Sopenharmony_ci   } else {
bf215546Sopenharmony_ci      compiler->max_const_pipeline = 512;
bf215546Sopenharmony_ci      compiler->max_const_geom = 512;
bf215546Sopenharmony_ci      compiler->max_const_frag = 512;
bf215546Sopenharmony_ci      compiler->max_const_compute = 512;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      /* Note: this will have to change if/when we support tess+GS on
bf215546Sopenharmony_ci       * earlier gen's.
bf215546Sopenharmony_ci       */
bf215546Sopenharmony_ci      compiler->max_const_safe = 256;
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (compiler->gen >= 6) {
bf215546Sopenharmony_ci      compiler->reg_size_vec4 = dev_info->a6xx.reg_size_vec4;
bf215546Sopenharmony_ci   } else if (compiler->gen >= 4) {
bf215546Sopenharmony_ci      /* On a4xx-a5xx, using r24.x and above requires using the smallest
bf215546Sopenharmony_ci       * threadsize.
bf215546Sopenharmony_ci       */
bf215546Sopenharmony_ci      compiler->reg_size_vec4 = 48;
bf215546Sopenharmony_ci   } else {
bf215546Sopenharmony_ci      /* TODO: confirm this */
bf215546Sopenharmony_ci      compiler->reg_size_vec4 = 96;
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (compiler->gen >= 6) {
bf215546Sopenharmony_ci      compiler->threadsize_base = 64;
bf215546Sopenharmony_ci   } else if (compiler->gen >= 4) {
bf215546Sopenharmony_ci      /* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
bf215546Sopenharmony_ci       * 1.1 subgroupSize which is 32.
bf215546Sopenharmony_ci       */
bf215546Sopenharmony_ci      compiler->threadsize_base = 32;
bf215546Sopenharmony_ci   } else {
bf215546Sopenharmony_ci      compiler->threadsize_base = 8;
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (compiler->gen >= 4) {
bf215546Sopenharmony_ci      /* need special handling for "flat" */
bf215546Sopenharmony_ci      compiler->flat_bypass = true;
bf215546Sopenharmony_ci      compiler->levels_add_one = false;
bf215546Sopenharmony_ci      compiler->unminify_coords = false;
bf215546Sopenharmony_ci      compiler->txf_ms_with_isaml = false;
bf215546Sopenharmony_ci      compiler->array_index_add_half = true;
bf215546Sopenharmony_ci      compiler->instr_align = 16;
bf215546Sopenharmony_ci      compiler->const_upload_unit = 4;
bf215546Sopenharmony_ci   } else {
bf215546Sopenharmony_ci      /* no special handling for "flat" */
bf215546Sopenharmony_ci      compiler->flat_bypass = false;
bf215546Sopenharmony_ci      compiler->levels_add_one = true;
bf215546Sopenharmony_ci      compiler->unminify_coords = true;
bf215546Sopenharmony_ci      compiler->txf_ms_with_isaml = true;
bf215546Sopenharmony_ci      compiler->array_index_add_half = false;
bf215546Sopenharmony_ci      compiler->instr_align = 4;
bf215546Sopenharmony_ci      compiler->const_upload_unit = 8;
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   compiler->bool_type = (compiler->gen >= 5) ? TYPE_U16 : TYPE_U32;
bf215546Sopenharmony_ci   compiler->has_shared_regfile = compiler->gen >= 5;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   compiler->push_ubo_with_preamble = options->push_ubo_with_preamble;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* The driver can't request this unless preambles are supported. */
bf215546Sopenharmony_ci   if (options->push_ubo_with_preamble)
bf215546Sopenharmony_ci      assert(compiler->has_preamble);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (compiler->gen >= 6) {
bf215546Sopenharmony_ci      compiler->nir_options = nir_options_a6xx;
bf215546Sopenharmony_ci      compiler->nir_options.has_udot_4x8 = dev_info->a6xx.has_dp2acc;
bf215546Sopenharmony_ci      compiler->nir_options.has_sudot_4x8 = dev_info->a6xx.has_dp2acc;
bf215546Sopenharmony_ci   } else {
bf215546Sopenharmony_ci      compiler->nir_options = nir_options;
bf215546Sopenharmony_ci      /* a2xx compiler doesn't handle indirect: */
bf215546Sopenharmony_ci      if (compiler->gen <= 2)
bf215546Sopenharmony_ci         compiler->nir_options.force_indirect_unrolling = nir_var_all;
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* 16-bit ALU op generation is mostly controlled by frontend compiler options, but
bf215546Sopenharmony_ci    * this core NIR option enables some optimizations of 16-bit operations.
bf215546Sopenharmony_ci    */
bf215546Sopenharmony_ci   if (compiler->gen >= 5 && !(ir3_shader_debug & IR3_DBG_NOFP16))
bf215546Sopenharmony_ci      compiler->nir_options.support_16bit_alu = true;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (!options->disable_cache)
bf215546Sopenharmony_ci      ir3_disk_cache_init(compiler);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   return compiler;
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ciconst nir_shader_compiler_options *
bf215546Sopenharmony_ciir3_get_compiler_options(struct ir3_compiler *compiler)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   return &compiler->nir_options;
bf215546Sopenharmony_ci}