1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright 2012 Advanced Micro Devices, Inc. 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21bf215546Sopenharmony_ci * IN THE SOFTWARE. 22bf215546Sopenharmony_ci */ 23bf215546Sopenharmony_ci 24bf215546Sopenharmony_ci#include "ac_shader_util.h" 25bf215546Sopenharmony_ci#include "ac_gpu_info.h" 26bf215546Sopenharmony_ci 27bf215546Sopenharmony_ci#include "sid.h" 28bf215546Sopenharmony_ci#include "u_math.h" 29bf215546Sopenharmony_ci 30bf215546Sopenharmony_ci#include <assert.h> 31bf215546Sopenharmony_ci#include <stdlib.h> 32bf215546Sopenharmony_ci#include <string.h> 33bf215546Sopenharmony_ci 34bf215546Sopenharmony_ciunsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask, 35bf215546Sopenharmony_ci bool writes_mrt0_alpha) 36bf215546Sopenharmony_ci{ 37bf215546Sopenharmony_ci /* If writes_mrt0_alpha is true, one other flag must be true too. */ 38bf215546Sopenharmony_ci assert(!writes_mrt0_alpha || writes_z || writes_stencil || writes_samplemask); 39bf215546Sopenharmony_ci 40bf215546Sopenharmony_ci if (writes_z || writes_mrt0_alpha) { 41bf215546Sopenharmony_ci /* Z needs 32 bits. */ 42bf215546Sopenharmony_ci if (writes_samplemask || writes_mrt0_alpha) 43bf215546Sopenharmony_ci return V_028710_SPI_SHADER_32_ABGR; 44bf215546Sopenharmony_ci else if (writes_stencil) 45bf215546Sopenharmony_ci return V_028710_SPI_SHADER_32_GR; 46bf215546Sopenharmony_ci else 47bf215546Sopenharmony_ci return V_028710_SPI_SHADER_32_R; 48bf215546Sopenharmony_ci } else if (writes_stencil || writes_samplemask) { 49bf215546Sopenharmony_ci /* Both stencil and sample mask need only 16 bits. */ 50bf215546Sopenharmony_ci return V_028710_SPI_SHADER_UINT16_ABGR; 51bf215546Sopenharmony_ci } else { 52bf215546Sopenharmony_ci return V_028710_SPI_SHADER_ZERO; 53bf215546Sopenharmony_ci } 54bf215546Sopenharmony_ci} 55bf215546Sopenharmony_ci 56bf215546Sopenharmony_ciunsigned ac_get_cb_shader_mask(unsigned spi_shader_col_format) 57bf215546Sopenharmony_ci{ 58bf215546Sopenharmony_ci unsigned i, cb_shader_mask = 0; 59bf215546Sopenharmony_ci 60bf215546Sopenharmony_ci for (i = 0; i < 8; i++) { 61bf215546Sopenharmony_ci switch ((spi_shader_col_format >> (i * 4)) & 0xf) { 62bf215546Sopenharmony_ci case V_028714_SPI_SHADER_ZERO: 63bf215546Sopenharmony_ci break; 64bf215546Sopenharmony_ci case V_028714_SPI_SHADER_32_R: 65bf215546Sopenharmony_ci cb_shader_mask |= 0x1 << (i * 4); 66bf215546Sopenharmony_ci break; 67bf215546Sopenharmony_ci case V_028714_SPI_SHADER_32_GR: 68bf215546Sopenharmony_ci cb_shader_mask |= 0x3 << (i * 4); 69bf215546Sopenharmony_ci break; 70bf215546Sopenharmony_ci case V_028714_SPI_SHADER_32_AR: 71bf215546Sopenharmony_ci cb_shader_mask |= 0x9u << (i * 4); 72bf215546Sopenharmony_ci break; 73bf215546Sopenharmony_ci case V_028714_SPI_SHADER_FP16_ABGR: 74bf215546Sopenharmony_ci case V_028714_SPI_SHADER_UNORM16_ABGR: 75bf215546Sopenharmony_ci case V_028714_SPI_SHADER_SNORM16_ABGR: 76bf215546Sopenharmony_ci case V_028714_SPI_SHADER_UINT16_ABGR: 77bf215546Sopenharmony_ci case V_028714_SPI_SHADER_SINT16_ABGR: 78bf215546Sopenharmony_ci case V_028714_SPI_SHADER_32_ABGR: 79bf215546Sopenharmony_ci cb_shader_mask |= 0xfu << (i * 4); 80bf215546Sopenharmony_ci break; 81bf215546Sopenharmony_ci default: 82bf215546Sopenharmony_ci assert(0); 83bf215546Sopenharmony_ci } 84bf215546Sopenharmony_ci } 85bf215546Sopenharmony_ci return cb_shader_mask; 86bf215546Sopenharmony_ci} 87bf215546Sopenharmony_ci 88bf215546Sopenharmony_ci/** 89bf215546Sopenharmony_ci * Calculate the appropriate setting of VGT_GS_MODE when \p shader is a 90bf215546Sopenharmony_ci * geometry shader. 91bf215546Sopenharmony_ci */ 92bf215546Sopenharmony_ciuint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum amd_gfx_level gfx_level) 93bf215546Sopenharmony_ci{ 94bf215546Sopenharmony_ci unsigned cut_mode; 95bf215546Sopenharmony_ci 96bf215546Sopenharmony_ci assert (gfx_level < GFX11); 97bf215546Sopenharmony_ci 98bf215546Sopenharmony_ci if (gs_max_vert_out <= 128) { 99bf215546Sopenharmony_ci cut_mode = V_028A40_GS_CUT_128; 100bf215546Sopenharmony_ci } else if (gs_max_vert_out <= 256) { 101bf215546Sopenharmony_ci cut_mode = V_028A40_GS_CUT_256; 102bf215546Sopenharmony_ci } else if (gs_max_vert_out <= 512) { 103bf215546Sopenharmony_ci cut_mode = V_028A40_GS_CUT_512; 104bf215546Sopenharmony_ci } else { 105bf215546Sopenharmony_ci assert(gs_max_vert_out <= 1024); 106bf215546Sopenharmony_ci cut_mode = V_028A40_GS_CUT_1024; 107bf215546Sopenharmony_ci } 108bf215546Sopenharmony_ci 109bf215546Sopenharmony_ci return S_028A40_MODE(V_028A40_GS_SCENARIO_G) | S_028A40_CUT_MODE(cut_mode) | 110bf215546Sopenharmony_ci S_028A40_ES_WRITE_OPTIMIZE(gfx_level <= GFX8) | S_028A40_GS_WRITE_OPTIMIZE(1) | 111bf215546Sopenharmony_ci S_028A40_ONCHIP(gfx_level >= GFX9 ? 1 : 0); 112bf215546Sopenharmony_ci} 113bf215546Sopenharmony_ci 114bf215546Sopenharmony_ci/// Translate a (dfmt, nfmt) pair into a chip-appropriate combined format 115bf215546Sopenharmony_ci/// value for LLVM8+ tbuffer intrinsics. 116bf215546Sopenharmony_ciunsigned ac_get_tbuffer_format(enum amd_gfx_level gfx_level, unsigned dfmt, unsigned nfmt) 117bf215546Sopenharmony_ci{ 118bf215546Sopenharmony_ci // Some games try to access vertex buffers without a valid format. 119bf215546Sopenharmony_ci // This is a game bug, but we should still handle it gracefully. 120bf215546Sopenharmony_ci if (dfmt == V_008F0C_GFX10_FORMAT_INVALID) 121bf215546Sopenharmony_ci return V_008F0C_GFX10_FORMAT_INVALID; 122bf215546Sopenharmony_ci 123bf215546Sopenharmony_ci if (gfx_level >= GFX11) { 124bf215546Sopenharmony_ci switch (dfmt) { 125bf215546Sopenharmony_ci default: 126bf215546Sopenharmony_ci unreachable("bad dfmt"); 127bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_INVALID: 128bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_INVALID; 129bf215546Sopenharmony_ci 130bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_8: 131bf215546Sopenharmony_ci switch (nfmt) { 132bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UNORM: 133bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_8_UNORM; 134bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SNORM: 135bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_8_SNORM; 136bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_USCALED: 137bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_8_USCALED; 138bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SSCALED: 139bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_8_SSCALED; 140bf215546Sopenharmony_ci default: 141bf215546Sopenharmony_ci unreachable("bad nfmt"); 142bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UINT: 143bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_8_UINT; 144bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SINT: 145bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_8_SINT; 146bf215546Sopenharmony_ci } 147bf215546Sopenharmony_ci 148bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_8_8: 149bf215546Sopenharmony_ci switch (nfmt) { 150bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UNORM: 151bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_8_8_UNORM; 152bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SNORM: 153bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_8_8_SNORM; 154bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_USCALED: 155bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_8_8_USCALED; 156bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SSCALED: 157bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_8_8_SSCALED; 158bf215546Sopenharmony_ci default: 159bf215546Sopenharmony_ci unreachable("bad nfmt"); 160bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UINT: 161bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_8_8_UINT; 162bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SINT: 163bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_8_8_SINT; 164bf215546Sopenharmony_ci } 165bf215546Sopenharmony_ci 166bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: 167bf215546Sopenharmony_ci switch (nfmt) { 168bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UNORM: 169bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_8_8_8_8_UNORM; 170bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SNORM: 171bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_8_8_8_8_SNORM; 172bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_USCALED: 173bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_8_8_8_8_USCALED; 174bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SSCALED: 175bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_8_8_8_8_SSCALED; 176bf215546Sopenharmony_ci default: 177bf215546Sopenharmony_ci unreachable("bad nfmt"); 178bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UINT: 179bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_8_8_8_8_UINT; 180bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SINT: 181bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_8_8_8_8_SINT; 182bf215546Sopenharmony_ci } 183bf215546Sopenharmony_ci 184bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_16: 185bf215546Sopenharmony_ci switch (nfmt) { 186bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UNORM: 187bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_UNORM; 188bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SNORM: 189bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_SNORM; 190bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_USCALED: 191bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_USCALED; 192bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SSCALED: 193bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_SSCALED; 194bf215546Sopenharmony_ci default: 195bf215546Sopenharmony_ci unreachable("bad nfmt"); 196bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UINT: 197bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_UINT; 198bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SINT: 199bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_SINT; 200bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_FLOAT: 201bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_FLOAT; 202bf215546Sopenharmony_ci } 203bf215546Sopenharmony_ci 204bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_16_16: 205bf215546Sopenharmony_ci switch (nfmt) { 206bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UNORM: 207bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_16_UNORM; 208bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SNORM: 209bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_16_SNORM; 210bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_USCALED: 211bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_16_USCALED; 212bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SSCALED: 213bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_16_SSCALED; 214bf215546Sopenharmony_ci default: 215bf215546Sopenharmony_ci unreachable("bad nfmt"); 216bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UINT: 217bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_16_UINT; 218bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SINT: 219bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_16_SINT; 220bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_FLOAT: 221bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_16_FLOAT; 222bf215546Sopenharmony_ci } 223bf215546Sopenharmony_ci 224bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: 225bf215546Sopenharmony_ci switch (nfmt) { 226bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UNORM: 227bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_16_16_16_UNORM; 228bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SNORM: 229bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_16_16_16_SNORM; 230bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_USCALED: 231bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_16_16_16_USCALED; 232bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SSCALED: 233bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_16_16_16_SSCALED; 234bf215546Sopenharmony_ci default: 235bf215546Sopenharmony_ci unreachable("bad nfmt"); 236bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UINT: 237bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_16_16_16_UINT; 238bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SINT: 239bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_16_16_16_SINT; 240bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_FLOAT: 241bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_16_16_16_16_FLOAT; 242bf215546Sopenharmony_ci } 243bf215546Sopenharmony_ci 244bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_32: 245bf215546Sopenharmony_ci switch (nfmt) { 246bf215546Sopenharmony_ci default: 247bf215546Sopenharmony_ci unreachable("bad nfmt"); 248bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UINT: 249bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_32_UINT; 250bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SINT: 251bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_32_SINT; 252bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_FLOAT: 253bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_32_FLOAT; 254bf215546Sopenharmony_ci } 255bf215546Sopenharmony_ci 256bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_32_32: 257bf215546Sopenharmony_ci switch (nfmt) { 258bf215546Sopenharmony_ci default: 259bf215546Sopenharmony_ci unreachable("bad nfmt"); 260bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UINT: 261bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_32_32_UINT; 262bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SINT: 263bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_32_32_SINT; 264bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_FLOAT: 265bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_32_32_FLOAT; 266bf215546Sopenharmony_ci } 267bf215546Sopenharmony_ci 268bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_32_32_32: 269bf215546Sopenharmony_ci switch (nfmt) { 270bf215546Sopenharmony_ci default: 271bf215546Sopenharmony_ci unreachable("bad nfmt"); 272bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UINT: 273bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_32_32_32_UINT; 274bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SINT: 275bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_32_32_32_SINT; 276bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_FLOAT: 277bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_32_32_32_FLOAT; 278bf215546Sopenharmony_ci } 279bf215546Sopenharmony_ci 280bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: 281bf215546Sopenharmony_ci switch (nfmt) { 282bf215546Sopenharmony_ci default: 283bf215546Sopenharmony_ci unreachable("bad nfmt"); 284bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UINT: 285bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_32_32_32_32_UINT; 286bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SINT: 287bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_32_32_32_32_SINT; 288bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_FLOAT: 289bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_32_32_32_32_FLOAT; 290bf215546Sopenharmony_ci } 291bf215546Sopenharmony_ci 292bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: 293bf215546Sopenharmony_ci switch (nfmt) { 294bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UNORM: 295bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_2_10_10_10_UNORM; 296bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SNORM: 297bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_2_10_10_10_SNORM; 298bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_USCALED: 299bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_2_10_10_10_USCALED; 300bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SSCALED: 301bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_2_10_10_10_SSCALED; 302bf215546Sopenharmony_ci default: 303bf215546Sopenharmony_ci unreachable("bad nfmt"); 304bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UINT: 305bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_2_10_10_10_UINT; 306bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SINT: 307bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_2_10_10_10_SINT; 308bf215546Sopenharmony_ci } 309bf215546Sopenharmony_ci 310bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_10_11_11: 311bf215546Sopenharmony_ci switch (nfmt) { 312bf215546Sopenharmony_ci default: 313bf215546Sopenharmony_ci unreachable("bad nfmt"); 314bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_FLOAT: 315bf215546Sopenharmony_ci return V_008F0C_GFX11_FORMAT_10_11_11_FLOAT; 316bf215546Sopenharmony_ci } 317bf215546Sopenharmony_ci } 318bf215546Sopenharmony_ci } else if (gfx_level >= GFX10) { 319bf215546Sopenharmony_ci unsigned format; 320bf215546Sopenharmony_ci switch (dfmt) { 321bf215546Sopenharmony_ci default: 322bf215546Sopenharmony_ci unreachable("bad dfmt"); 323bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_INVALID: 324bf215546Sopenharmony_ci format = V_008F0C_GFX10_FORMAT_INVALID; 325bf215546Sopenharmony_ci break; 326bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_8: 327bf215546Sopenharmony_ci format = V_008F0C_GFX10_FORMAT_8_UINT; 328bf215546Sopenharmony_ci break; 329bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_8_8: 330bf215546Sopenharmony_ci format = V_008F0C_GFX10_FORMAT_8_8_UINT; 331bf215546Sopenharmony_ci break; 332bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: 333bf215546Sopenharmony_ci format = V_008F0C_GFX10_FORMAT_8_8_8_8_UINT; 334bf215546Sopenharmony_ci break; 335bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_16: 336bf215546Sopenharmony_ci format = V_008F0C_GFX10_FORMAT_16_UINT; 337bf215546Sopenharmony_ci break; 338bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_16_16: 339bf215546Sopenharmony_ci format = V_008F0C_GFX10_FORMAT_16_16_UINT; 340bf215546Sopenharmony_ci break; 341bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: 342bf215546Sopenharmony_ci format = V_008F0C_GFX10_FORMAT_16_16_16_16_UINT; 343bf215546Sopenharmony_ci break; 344bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_32: 345bf215546Sopenharmony_ci format = V_008F0C_GFX10_FORMAT_32_UINT; 346bf215546Sopenharmony_ci break; 347bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_32_32: 348bf215546Sopenharmony_ci format = V_008F0C_GFX10_FORMAT_32_32_UINT; 349bf215546Sopenharmony_ci break; 350bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_32_32_32: 351bf215546Sopenharmony_ci format = V_008F0C_GFX10_FORMAT_32_32_32_UINT; 352bf215546Sopenharmony_ci break; 353bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: 354bf215546Sopenharmony_ci format = V_008F0C_GFX10_FORMAT_32_32_32_32_UINT; 355bf215546Sopenharmony_ci break; 356bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: 357bf215546Sopenharmony_ci format = V_008F0C_GFX10_FORMAT_2_10_10_10_UINT; 358bf215546Sopenharmony_ci break; 359bf215546Sopenharmony_ci case V_008F0C_BUF_DATA_FORMAT_10_11_11: 360bf215546Sopenharmony_ci format = V_008F0C_GFX10_FORMAT_10_11_11_UINT; 361bf215546Sopenharmony_ci break; 362bf215546Sopenharmony_ci } 363bf215546Sopenharmony_ci 364bf215546Sopenharmony_ci // Use the regularity properties of the combined format enum. 365bf215546Sopenharmony_ci // 366bf215546Sopenharmony_ci // Note: float is incompatible with 8-bit data formats, 367bf215546Sopenharmony_ci // [us]{norm,scaled} are incomparible with 32-bit data formats. 368bf215546Sopenharmony_ci // [us]scaled are not writable. 369bf215546Sopenharmony_ci switch (nfmt) { 370bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UNORM: 371bf215546Sopenharmony_ci format -= 4; 372bf215546Sopenharmony_ci break; 373bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SNORM: 374bf215546Sopenharmony_ci format -= 3; 375bf215546Sopenharmony_ci break; 376bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_USCALED: 377bf215546Sopenharmony_ci format -= 2; 378bf215546Sopenharmony_ci break; 379bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SSCALED: 380bf215546Sopenharmony_ci format -= 1; 381bf215546Sopenharmony_ci break; 382bf215546Sopenharmony_ci default: 383bf215546Sopenharmony_ci unreachable("bad nfmt"); 384bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_UINT: 385bf215546Sopenharmony_ci break; 386bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_SINT: 387bf215546Sopenharmony_ci format += 1; 388bf215546Sopenharmony_ci break; 389bf215546Sopenharmony_ci case V_008F0C_BUF_NUM_FORMAT_FLOAT: 390bf215546Sopenharmony_ci format += 2; 391bf215546Sopenharmony_ci break; 392bf215546Sopenharmony_ci } 393bf215546Sopenharmony_ci 394bf215546Sopenharmony_ci return format; 395bf215546Sopenharmony_ci } else { 396bf215546Sopenharmony_ci return dfmt | (nfmt << 4); 397bf215546Sopenharmony_ci } 398bf215546Sopenharmony_ci} 399bf215546Sopenharmony_ci 400bf215546Sopenharmony_cistatic const struct ac_data_format_info data_format_table[] = { 401bf215546Sopenharmony_ci [V_008F0C_BUF_DATA_FORMAT_INVALID] = {0, 4, 0, V_008F0C_BUF_DATA_FORMAT_INVALID}, 402bf215546Sopenharmony_ci [V_008F0C_BUF_DATA_FORMAT_8] = {1, 1, 1, V_008F0C_BUF_DATA_FORMAT_8}, 403bf215546Sopenharmony_ci [V_008F0C_BUF_DATA_FORMAT_16] = {2, 1, 2, V_008F0C_BUF_DATA_FORMAT_16}, 404bf215546Sopenharmony_ci [V_008F0C_BUF_DATA_FORMAT_8_8] = {2, 2, 1, V_008F0C_BUF_DATA_FORMAT_8}, 405bf215546Sopenharmony_ci [V_008F0C_BUF_DATA_FORMAT_32] = {4, 1, 4, V_008F0C_BUF_DATA_FORMAT_32}, 406bf215546Sopenharmony_ci [V_008F0C_BUF_DATA_FORMAT_16_16] = {4, 2, 2, V_008F0C_BUF_DATA_FORMAT_16}, 407bf215546Sopenharmony_ci [V_008F0C_BUF_DATA_FORMAT_10_11_11] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_10_11_11}, 408bf215546Sopenharmony_ci [V_008F0C_BUF_DATA_FORMAT_11_11_10] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_11_11_10}, 409bf215546Sopenharmony_ci [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_10_10_10_2}, 410bf215546Sopenharmony_ci [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_2_10_10_10}, 411bf215546Sopenharmony_ci [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = {4, 4, 1, V_008F0C_BUF_DATA_FORMAT_8}, 412bf215546Sopenharmony_ci [V_008F0C_BUF_DATA_FORMAT_32_32] = {8, 2, 4, V_008F0C_BUF_DATA_FORMAT_32}, 413bf215546Sopenharmony_ci [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = {8, 4, 2, V_008F0C_BUF_DATA_FORMAT_16}, 414bf215546Sopenharmony_ci [V_008F0C_BUF_DATA_FORMAT_32_32_32] = {12, 3, 4, V_008F0C_BUF_DATA_FORMAT_32}, 415bf215546Sopenharmony_ci [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = {16, 4, 4, V_008F0C_BUF_DATA_FORMAT_32}, 416bf215546Sopenharmony_ci}; 417bf215546Sopenharmony_ci 418bf215546Sopenharmony_ciconst struct ac_data_format_info *ac_get_data_format_info(unsigned dfmt) 419bf215546Sopenharmony_ci{ 420bf215546Sopenharmony_ci assert(dfmt < ARRAY_SIZE(data_format_table)); 421bf215546Sopenharmony_ci return &data_format_table[dfmt]; 422bf215546Sopenharmony_ci} 423bf215546Sopenharmony_ci 424bf215546Sopenharmony_cienum ac_image_dim ac_get_sampler_dim(enum amd_gfx_level gfx_level, enum glsl_sampler_dim dim, 425bf215546Sopenharmony_ci bool is_array) 426bf215546Sopenharmony_ci{ 427bf215546Sopenharmony_ci switch (dim) { 428bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_1D: 429bf215546Sopenharmony_ci if (gfx_level == GFX9) 430bf215546Sopenharmony_ci return is_array ? ac_image_2darray : ac_image_2d; 431bf215546Sopenharmony_ci return is_array ? ac_image_1darray : ac_image_1d; 432bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_2D: 433bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_RECT: 434bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_EXTERNAL: 435bf215546Sopenharmony_ci return is_array ? ac_image_2darray : ac_image_2d; 436bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_3D: 437bf215546Sopenharmony_ci return ac_image_3d; 438bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_CUBE: 439bf215546Sopenharmony_ci return ac_image_cube; 440bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_MS: 441bf215546Sopenharmony_ci return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa; 442bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_SUBPASS: 443bf215546Sopenharmony_ci return ac_image_2darray; 444bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_SUBPASS_MS: 445bf215546Sopenharmony_ci return ac_image_2darraymsaa; 446bf215546Sopenharmony_ci default: 447bf215546Sopenharmony_ci unreachable("bad sampler dim"); 448bf215546Sopenharmony_ci } 449bf215546Sopenharmony_ci} 450bf215546Sopenharmony_ci 451bf215546Sopenharmony_cienum ac_image_dim ac_get_image_dim(enum amd_gfx_level gfx_level, enum glsl_sampler_dim sdim, 452bf215546Sopenharmony_ci bool is_array) 453bf215546Sopenharmony_ci{ 454bf215546Sopenharmony_ci enum ac_image_dim dim = ac_get_sampler_dim(gfx_level, sdim, is_array); 455bf215546Sopenharmony_ci 456bf215546Sopenharmony_ci /* Match the resource type set in the descriptor. */ 457bf215546Sopenharmony_ci if (dim == ac_image_cube || (gfx_level <= GFX8 && dim == ac_image_3d)) 458bf215546Sopenharmony_ci dim = ac_image_2darray; 459bf215546Sopenharmony_ci else if (sdim == GLSL_SAMPLER_DIM_2D && !is_array && gfx_level == GFX9) { 460bf215546Sopenharmony_ci /* When a single layer of a 3D texture is bound, the shader 461bf215546Sopenharmony_ci * will refer to a 2D target, but the descriptor has a 3D type. 462bf215546Sopenharmony_ci * Since the HW ignores BASE_ARRAY in this case, we need to 463bf215546Sopenharmony_ci * send 3 coordinates. This doesn't hurt when the underlying 464bf215546Sopenharmony_ci * texture is non-3D. 465bf215546Sopenharmony_ci */ 466bf215546Sopenharmony_ci dim = ac_image_3d; 467bf215546Sopenharmony_ci } 468bf215546Sopenharmony_ci 469bf215546Sopenharmony_ci return dim; 470bf215546Sopenharmony_ci} 471bf215546Sopenharmony_ci 472bf215546Sopenharmony_ciunsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config, 473bf215546Sopenharmony_ci signed char *face_vgpr_index_ptr, 474bf215546Sopenharmony_ci signed char *ancillary_vgpr_index_ptr, 475bf215546Sopenharmony_ci signed char *sample_coverage_vgpr_index_ptr) 476bf215546Sopenharmony_ci{ 477bf215546Sopenharmony_ci unsigned num_input_vgprs = 0; 478bf215546Sopenharmony_ci signed char face_vgpr_index = -1; 479bf215546Sopenharmony_ci signed char ancillary_vgpr_index = -1; 480bf215546Sopenharmony_ci signed char sample_coverage_vgpr_index = -1; 481bf215546Sopenharmony_ci 482bf215546Sopenharmony_ci if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr)) 483bf215546Sopenharmony_ci num_input_vgprs += 2; 484bf215546Sopenharmony_ci if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr)) 485bf215546Sopenharmony_ci num_input_vgprs += 2; 486bf215546Sopenharmony_ci if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr)) 487bf215546Sopenharmony_ci num_input_vgprs += 2; 488bf215546Sopenharmony_ci if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr)) 489bf215546Sopenharmony_ci num_input_vgprs += 3; 490bf215546Sopenharmony_ci if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr)) 491bf215546Sopenharmony_ci num_input_vgprs += 2; 492bf215546Sopenharmony_ci if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr)) 493bf215546Sopenharmony_ci num_input_vgprs += 2; 494bf215546Sopenharmony_ci if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr)) 495bf215546Sopenharmony_ci num_input_vgprs += 2; 496bf215546Sopenharmony_ci if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr)) 497bf215546Sopenharmony_ci num_input_vgprs += 1; 498bf215546Sopenharmony_ci if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr)) 499bf215546Sopenharmony_ci num_input_vgprs += 1; 500bf215546Sopenharmony_ci if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr)) 501bf215546Sopenharmony_ci num_input_vgprs += 1; 502bf215546Sopenharmony_ci if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr)) 503bf215546Sopenharmony_ci num_input_vgprs += 1; 504bf215546Sopenharmony_ci if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr)) 505bf215546Sopenharmony_ci num_input_vgprs += 1; 506bf215546Sopenharmony_ci if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr)) { 507bf215546Sopenharmony_ci face_vgpr_index = num_input_vgprs; 508bf215546Sopenharmony_ci num_input_vgprs += 1; 509bf215546Sopenharmony_ci } 510bf215546Sopenharmony_ci if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr)) { 511bf215546Sopenharmony_ci ancillary_vgpr_index = num_input_vgprs; 512bf215546Sopenharmony_ci num_input_vgprs += 1; 513bf215546Sopenharmony_ci } 514bf215546Sopenharmony_ci if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr)) { 515bf215546Sopenharmony_ci sample_coverage_vgpr_index = num_input_vgprs; 516bf215546Sopenharmony_ci num_input_vgprs += 1; 517bf215546Sopenharmony_ci } 518bf215546Sopenharmony_ci if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr)) 519bf215546Sopenharmony_ci num_input_vgprs += 1; 520bf215546Sopenharmony_ci 521bf215546Sopenharmony_ci if (face_vgpr_index_ptr) 522bf215546Sopenharmony_ci *face_vgpr_index_ptr = face_vgpr_index; 523bf215546Sopenharmony_ci if (ancillary_vgpr_index_ptr) 524bf215546Sopenharmony_ci *ancillary_vgpr_index_ptr = ancillary_vgpr_index; 525bf215546Sopenharmony_ci if (sample_coverage_vgpr_index_ptr) 526bf215546Sopenharmony_ci *sample_coverage_vgpr_index_ptr = sample_coverage_vgpr_index; 527bf215546Sopenharmony_ci 528bf215546Sopenharmony_ci return num_input_vgprs; 529bf215546Sopenharmony_ci} 530bf215546Sopenharmony_ci 531bf215546Sopenharmony_civoid ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype, 532bf215546Sopenharmony_ci bool is_depth, bool use_rbplus, 533bf215546Sopenharmony_ci struct ac_spi_color_formats *formats) 534bf215546Sopenharmony_ci{ 535bf215546Sopenharmony_ci /* Alpha is needed for alpha-to-coverage. 536bf215546Sopenharmony_ci * Blending may be with or without alpha. 537bf215546Sopenharmony_ci */ 538bf215546Sopenharmony_ci unsigned normal = 0; /* most optimal, may not support blending or export alpha */ 539bf215546Sopenharmony_ci unsigned alpha = 0; /* exports alpha, but may not support blending */ 540bf215546Sopenharmony_ci unsigned blend = 0; /* supports blending, but may not export alpha */ 541bf215546Sopenharmony_ci unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */ 542bf215546Sopenharmony_ci 543bf215546Sopenharmony_ci /* Choose the SPI color formats. These are required values for RB+. 544bf215546Sopenharmony_ci * Other chips have multiple choices, though they are not necessarily better. 545bf215546Sopenharmony_ci */ 546bf215546Sopenharmony_ci switch (format) { 547bf215546Sopenharmony_ci case V_028C70_COLOR_5_6_5: 548bf215546Sopenharmony_ci case V_028C70_COLOR_1_5_5_5: 549bf215546Sopenharmony_ci case V_028C70_COLOR_5_5_5_1: 550bf215546Sopenharmony_ci case V_028C70_COLOR_4_4_4_4: 551bf215546Sopenharmony_ci case V_028C70_COLOR_10_11_11: 552bf215546Sopenharmony_ci case V_028C70_COLOR_11_11_10: 553bf215546Sopenharmony_ci case V_028C70_COLOR_5_9_9_9: 554bf215546Sopenharmony_ci case V_028C70_COLOR_8: 555bf215546Sopenharmony_ci case V_028C70_COLOR_8_8: 556bf215546Sopenharmony_ci case V_028C70_COLOR_8_8_8_8: 557bf215546Sopenharmony_ci case V_028C70_COLOR_10_10_10_2: 558bf215546Sopenharmony_ci case V_028C70_COLOR_2_10_10_10: 559bf215546Sopenharmony_ci if (ntype == V_028C70_NUMBER_UINT) 560bf215546Sopenharmony_ci alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; 561bf215546Sopenharmony_ci else if (ntype == V_028C70_NUMBER_SINT) 562bf215546Sopenharmony_ci alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; 563bf215546Sopenharmony_ci else 564bf215546Sopenharmony_ci alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; 565bf215546Sopenharmony_ci 566bf215546Sopenharmony_ci if (!use_rbplus && format == V_028C70_COLOR_8 && 567bf215546Sopenharmony_ci ntype != V_028C70_NUMBER_SRGB && swap == V_028C70_SWAP_STD) /* R */ { 568bf215546Sopenharmony_ci /* When RB+ is enabled, R8_UNORM should use FP16_ABGR for 2x 569bf215546Sopenharmony_ci * exporting performance. Otherwise, use 32_R to remove useless 570bf215546Sopenharmony_ci * instructions needed for 16-bit compressed exports. 571bf215546Sopenharmony_ci */ 572bf215546Sopenharmony_ci blend = normal = V_028714_SPI_SHADER_32_R; 573bf215546Sopenharmony_ci } 574bf215546Sopenharmony_ci break; 575bf215546Sopenharmony_ci 576bf215546Sopenharmony_ci case V_028C70_COLOR_16: 577bf215546Sopenharmony_ci case V_028C70_COLOR_16_16: 578bf215546Sopenharmony_ci case V_028C70_COLOR_16_16_16_16: 579bf215546Sopenharmony_ci if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM) { 580bf215546Sopenharmony_ci /* UNORM16 and SNORM16 don't support blending */ 581bf215546Sopenharmony_ci if (ntype == V_028C70_NUMBER_UNORM) 582bf215546Sopenharmony_ci normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR; 583bf215546Sopenharmony_ci else 584bf215546Sopenharmony_ci normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR; 585bf215546Sopenharmony_ci 586bf215546Sopenharmony_ci /* Use 32 bits per channel for blending. */ 587bf215546Sopenharmony_ci if (format == V_028C70_COLOR_16) { 588bf215546Sopenharmony_ci if (swap == V_028C70_SWAP_STD) { /* R */ 589bf215546Sopenharmony_ci blend = V_028714_SPI_SHADER_32_R; 590bf215546Sopenharmony_ci blend_alpha = V_028714_SPI_SHADER_32_AR; 591bf215546Sopenharmony_ci } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ 592bf215546Sopenharmony_ci blend = blend_alpha = V_028714_SPI_SHADER_32_AR; 593bf215546Sopenharmony_ci else 594bf215546Sopenharmony_ci assert(0); 595bf215546Sopenharmony_ci } else if (format == V_028C70_COLOR_16_16) { 596bf215546Sopenharmony_ci if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV) { /* RG or GR */ 597bf215546Sopenharmony_ci blend = V_028714_SPI_SHADER_32_GR; 598bf215546Sopenharmony_ci blend_alpha = V_028714_SPI_SHADER_32_ABGR; 599bf215546Sopenharmony_ci } else if (swap == V_028C70_SWAP_ALT) /* RA */ 600bf215546Sopenharmony_ci blend = blend_alpha = V_028714_SPI_SHADER_32_AR; 601bf215546Sopenharmony_ci else 602bf215546Sopenharmony_ci assert(0); 603bf215546Sopenharmony_ci } else /* 16_16_16_16 */ 604bf215546Sopenharmony_ci blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR; 605bf215546Sopenharmony_ci } else if (ntype == V_028C70_NUMBER_UINT) 606bf215546Sopenharmony_ci alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; 607bf215546Sopenharmony_ci else if (ntype == V_028C70_NUMBER_SINT) 608bf215546Sopenharmony_ci alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; 609bf215546Sopenharmony_ci else if (ntype == V_028C70_NUMBER_FLOAT) 610bf215546Sopenharmony_ci alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; 611bf215546Sopenharmony_ci else 612bf215546Sopenharmony_ci assert(0); 613bf215546Sopenharmony_ci break; 614bf215546Sopenharmony_ci 615bf215546Sopenharmony_ci case V_028C70_COLOR_32: 616bf215546Sopenharmony_ci if (swap == V_028C70_SWAP_STD) { /* R */ 617bf215546Sopenharmony_ci blend = normal = V_028714_SPI_SHADER_32_R; 618bf215546Sopenharmony_ci alpha = blend_alpha = V_028714_SPI_SHADER_32_AR; 619bf215546Sopenharmony_ci } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ 620bf215546Sopenharmony_ci alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; 621bf215546Sopenharmony_ci else 622bf215546Sopenharmony_ci assert(0); 623bf215546Sopenharmony_ci break; 624bf215546Sopenharmony_ci 625bf215546Sopenharmony_ci case V_028C70_COLOR_32_32: 626bf215546Sopenharmony_ci if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV) { /* RG or GR */ 627bf215546Sopenharmony_ci blend = normal = V_028714_SPI_SHADER_32_GR; 628bf215546Sopenharmony_ci alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR; 629bf215546Sopenharmony_ci } else if (swap == V_028C70_SWAP_ALT) /* RA */ 630bf215546Sopenharmony_ci alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; 631bf215546Sopenharmony_ci else 632bf215546Sopenharmony_ci assert(0); 633bf215546Sopenharmony_ci break; 634bf215546Sopenharmony_ci 635bf215546Sopenharmony_ci case V_028C70_COLOR_32_32_32_32: 636bf215546Sopenharmony_ci case V_028C70_COLOR_8_24: 637bf215546Sopenharmony_ci case V_028C70_COLOR_24_8: 638bf215546Sopenharmony_ci case V_028C70_COLOR_X24_8_32_FLOAT: 639bf215546Sopenharmony_ci alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; 640bf215546Sopenharmony_ci break; 641bf215546Sopenharmony_ci 642bf215546Sopenharmony_ci default: 643bf215546Sopenharmony_ci assert(0); 644bf215546Sopenharmony_ci return; 645bf215546Sopenharmony_ci } 646bf215546Sopenharmony_ci 647bf215546Sopenharmony_ci /* The DB->CB copy needs 32_ABGR. */ 648bf215546Sopenharmony_ci if (is_depth) 649bf215546Sopenharmony_ci alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; 650bf215546Sopenharmony_ci 651bf215546Sopenharmony_ci formats->normal = normal; 652bf215546Sopenharmony_ci formats->alpha = alpha; 653bf215546Sopenharmony_ci formats->blend = blend; 654bf215546Sopenharmony_ci formats->blend_alpha = blend_alpha; 655bf215546Sopenharmony_ci} 656bf215546Sopenharmony_ci 657bf215546Sopenharmony_civoid ac_compute_late_alloc(const struct radeon_info *info, bool ngg, bool ngg_culling, 658bf215546Sopenharmony_ci bool uses_scratch, unsigned *late_alloc_wave64, unsigned *cu_mask) 659bf215546Sopenharmony_ci{ 660bf215546Sopenharmony_ci *late_alloc_wave64 = 0; /* The limit is per SA. */ 661bf215546Sopenharmony_ci *cu_mask = 0xffff; 662bf215546Sopenharmony_ci 663bf215546Sopenharmony_ci /* CU masking can decrease performance and cause a hang with <= 2 CUs per SA. */ 664bf215546Sopenharmony_ci if (info->min_good_cu_per_sa <= 2) 665bf215546Sopenharmony_ci return; 666bf215546Sopenharmony_ci 667bf215546Sopenharmony_ci /* If scratch is used with late alloc, the GPU could deadlock if PS uses scratch too. A more 668bf215546Sopenharmony_ci * complicated computation is needed to enable late alloc with scratch (see PAL). 669bf215546Sopenharmony_ci */ 670bf215546Sopenharmony_ci if (uses_scratch) 671bf215546Sopenharmony_ci return; 672bf215546Sopenharmony_ci 673bf215546Sopenharmony_ci /* Late alloc is not used for NGG on Navi14 due to a hw bug. */ 674bf215546Sopenharmony_ci if (ngg && info->family == CHIP_NAVI14) 675bf215546Sopenharmony_ci return; 676bf215546Sopenharmony_ci 677bf215546Sopenharmony_ci if (info->gfx_level >= GFX10) { 678bf215546Sopenharmony_ci /* For Wave32, the hw will launch twice the number of late alloc waves, so 1 == 2x wave32. 679bf215546Sopenharmony_ci * These limits are estimated because they are all safe but they vary in performance. 680bf215546Sopenharmony_ci */ 681bf215546Sopenharmony_ci if (ngg_culling) 682bf215546Sopenharmony_ci *late_alloc_wave64 = info->min_good_cu_per_sa * 10; 683bf215546Sopenharmony_ci else 684bf215546Sopenharmony_ci *late_alloc_wave64 = info->min_good_cu_per_sa * 4; 685bf215546Sopenharmony_ci 686bf215546Sopenharmony_ci /* Limit LATE_ALLOC_GS to prevent a hang (hw bug) on gfx10. */ 687bf215546Sopenharmony_ci if (info->gfx_level == GFX10 && ngg) 688bf215546Sopenharmony_ci *late_alloc_wave64 = MIN2(*late_alloc_wave64, 64); 689bf215546Sopenharmony_ci 690bf215546Sopenharmony_ci /* Gfx10: CU2 & CU3 must be disabled to prevent a hw deadlock. 691bf215546Sopenharmony_ci * Others: CU1 must be disabled to prevent a hw deadlock. 692bf215546Sopenharmony_ci * 693bf215546Sopenharmony_ci * The deadlock is caused by late alloc, which usually increases performance. 694bf215546Sopenharmony_ci */ 695bf215546Sopenharmony_ci *cu_mask &= info->gfx_level == GFX10 ? ~BITFIELD_RANGE(2, 2) : 696bf215546Sopenharmony_ci ~BITFIELD_RANGE(1, 1); 697bf215546Sopenharmony_ci } else { 698bf215546Sopenharmony_ci if (info->min_good_cu_per_sa <= 4) { 699bf215546Sopenharmony_ci /* Too few available compute units per SA. Disallowing VS to run on one CU could hurt us 700bf215546Sopenharmony_ci * more than late VS allocation would help. 701bf215546Sopenharmony_ci * 702bf215546Sopenharmony_ci * 2 is the highest safe number that allows us to keep all CUs enabled. 703bf215546Sopenharmony_ci */ 704bf215546Sopenharmony_ci *late_alloc_wave64 = 2; 705bf215546Sopenharmony_ci } else { 706bf215546Sopenharmony_ci /* This is a good initial value, allowing 1 late_alloc wave per SIMD on num_cu - 2. 707bf215546Sopenharmony_ci */ 708bf215546Sopenharmony_ci *late_alloc_wave64 = (info->min_good_cu_per_sa - 2) * 4; 709bf215546Sopenharmony_ci } 710bf215546Sopenharmony_ci 711bf215546Sopenharmony_ci /* VS can't execute on one CU if the limit is > 2. */ 712bf215546Sopenharmony_ci if (*late_alloc_wave64 > 2) 713bf215546Sopenharmony_ci *cu_mask = 0xfffe; /* 1 CU disabled */ 714bf215546Sopenharmony_ci } 715bf215546Sopenharmony_ci 716bf215546Sopenharmony_ci /* Max number that fits into the register field. */ 717bf215546Sopenharmony_ci if (ngg) /* GS */ 718bf215546Sopenharmony_ci *late_alloc_wave64 = MIN2(*late_alloc_wave64, G_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(~0u)); 719bf215546Sopenharmony_ci else /* VS */ 720bf215546Sopenharmony_ci *late_alloc_wave64 = MIN2(*late_alloc_wave64, G_00B11C_LIMIT(~0u)); 721bf215546Sopenharmony_ci} 722bf215546Sopenharmony_ci 723bf215546Sopenharmony_ciunsigned ac_compute_cs_workgroup_size(uint16_t sizes[3], bool variable, unsigned max) 724bf215546Sopenharmony_ci{ 725bf215546Sopenharmony_ci if (variable) 726bf215546Sopenharmony_ci return max; 727bf215546Sopenharmony_ci 728bf215546Sopenharmony_ci return sizes[0] * sizes[1] * sizes[2]; 729bf215546Sopenharmony_ci} 730bf215546Sopenharmony_ci 731bf215546Sopenharmony_ciunsigned ac_compute_lshs_workgroup_size(enum amd_gfx_level gfx_level, gl_shader_stage stage, 732bf215546Sopenharmony_ci unsigned tess_num_patches, 733bf215546Sopenharmony_ci unsigned tess_patch_in_vtx, 734bf215546Sopenharmony_ci unsigned tess_patch_out_vtx) 735bf215546Sopenharmony_ci{ 736bf215546Sopenharmony_ci /* When tessellation is used, API VS runs on HW LS, API TCS runs on HW HS. 737bf215546Sopenharmony_ci * These two HW stages are merged on GFX9+. 738bf215546Sopenharmony_ci */ 739bf215546Sopenharmony_ci 740bf215546Sopenharmony_ci bool merged_shaders = gfx_level >= GFX9; 741bf215546Sopenharmony_ci unsigned ls_workgroup_size = tess_num_patches * tess_patch_in_vtx; 742bf215546Sopenharmony_ci unsigned hs_workgroup_size = tess_num_patches * tess_patch_out_vtx; 743bf215546Sopenharmony_ci 744bf215546Sopenharmony_ci if (merged_shaders) 745bf215546Sopenharmony_ci return MAX2(ls_workgroup_size, hs_workgroup_size); 746bf215546Sopenharmony_ci else if (stage == MESA_SHADER_VERTEX) 747bf215546Sopenharmony_ci return ls_workgroup_size; 748bf215546Sopenharmony_ci else if (stage == MESA_SHADER_TESS_CTRL) 749bf215546Sopenharmony_ci return hs_workgroup_size; 750bf215546Sopenharmony_ci else 751bf215546Sopenharmony_ci unreachable("invalid LSHS shader stage"); 752bf215546Sopenharmony_ci} 753bf215546Sopenharmony_ci 754bf215546Sopenharmony_ciunsigned ac_compute_esgs_workgroup_size(enum amd_gfx_level gfx_level, unsigned wave_size, 755bf215546Sopenharmony_ci unsigned es_verts, unsigned gs_inst_prims) 756bf215546Sopenharmony_ci{ 757bf215546Sopenharmony_ci /* ESGS may operate in workgroups if on-chip GS (LDS rings) are enabled. 758bf215546Sopenharmony_ci * 759bf215546Sopenharmony_ci * GFX6: Not possible in the HW. 760bf215546Sopenharmony_ci * GFX7-8 (unmerged): possible in the HW, but not implemented in Mesa. 761bf215546Sopenharmony_ci * GFX9+ (merged): implemented in Mesa. 762bf215546Sopenharmony_ci */ 763bf215546Sopenharmony_ci 764bf215546Sopenharmony_ci if (gfx_level <= GFX8) 765bf215546Sopenharmony_ci return wave_size; 766bf215546Sopenharmony_ci 767bf215546Sopenharmony_ci unsigned workgroup_size = MAX2(es_verts, gs_inst_prims); 768bf215546Sopenharmony_ci return CLAMP(workgroup_size, 1, 256); 769bf215546Sopenharmony_ci} 770bf215546Sopenharmony_ci 771bf215546Sopenharmony_ciunsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims, 772bf215546Sopenharmony_ci unsigned max_vtx_out, unsigned prim_amp_factor) 773bf215546Sopenharmony_ci{ 774bf215546Sopenharmony_ci /* NGG always operates in workgroups. 775bf215546Sopenharmony_ci * 776bf215546Sopenharmony_ci * For API VS/TES/GS: 777bf215546Sopenharmony_ci * - 1 invocation per input vertex 778bf215546Sopenharmony_ci * - 1 invocation per input primitive 779bf215546Sopenharmony_ci * 780bf215546Sopenharmony_ci * The same invocation can process both an input vertex and primitive, 781bf215546Sopenharmony_ci * however 1 invocation can only output up to 1 vertex and 1 primitive. 782bf215546Sopenharmony_ci */ 783bf215546Sopenharmony_ci 784bf215546Sopenharmony_ci unsigned max_vtx_in = es_verts < 256 ? es_verts : 3 * gs_inst_prims; 785bf215546Sopenharmony_ci unsigned max_prim_in = gs_inst_prims; 786bf215546Sopenharmony_ci unsigned max_prim_out = gs_inst_prims * prim_amp_factor; 787bf215546Sopenharmony_ci unsigned workgroup_size = MAX4(max_vtx_in, max_vtx_out, max_prim_in, max_prim_out); 788bf215546Sopenharmony_ci 789bf215546Sopenharmony_ci return CLAMP(workgroup_size, 1, 256); 790bf215546Sopenharmony_ci} 791bf215546Sopenharmony_ci 792bf215546Sopenharmony_civoid ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t clear_mask, 793bf215546Sopenharmony_ci unsigned value_shift, const struct radeon_info *info, 794bf215546Sopenharmony_ci void set_sh_reg(void*, unsigned, uint32_t)) 795bf215546Sopenharmony_ci{ 796bf215546Sopenharmony_ci /* Register field position and mask. */ 797bf215546Sopenharmony_ci uint32_t cu_en_mask = ~clear_mask; 798bf215546Sopenharmony_ci unsigned cu_en_shift = ffs(cu_en_mask) - 1; 799bf215546Sopenharmony_ci /* The value being set. */ 800bf215546Sopenharmony_ci uint32_t cu_en = (value & cu_en_mask) >> cu_en_shift; 801bf215546Sopenharmony_ci 802bf215546Sopenharmony_ci /* AND the field by spi_cu_en. */ 803bf215546Sopenharmony_ci uint32_t spi_cu_en = info->spi_cu_en >> value_shift; 804bf215546Sopenharmony_ci uint32_t new_value = (value & ~cu_en_mask) | 805bf215546Sopenharmony_ci (((cu_en & spi_cu_en) << cu_en_shift) & cu_en_mask); 806bf215546Sopenharmony_ci 807bf215546Sopenharmony_ci set_sh_reg(cs, reg_offset, new_value); 808bf215546Sopenharmony_ci} 809bf215546Sopenharmony_ci 810bf215546Sopenharmony_ci/* Return the register value and tune bytes_per_wave to increase scratch performance. */ 811bf215546Sopenharmony_civoid ac_get_scratch_tmpring_size(const struct radeon_info *info, 812bf215546Sopenharmony_ci unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave, 813bf215546Sopenharmony_ci uint32_t *tmpring_size) 814bf215546Sopenharmony_ci{ 815bf215546Sopenharmony_ci /* SPI_TMPRING_SIZE and COMPUTE_TMPRING_SIZE are essentially scratch buffer descriptors. 816bf215546Sopenharmony_ci * WAVES means NUM_RECORDS. WAVESIZE is the size of each element, meaning STRIDE. 817bf215546Sopenharmony_ci * Thus, WAVESIZE must be constant while the scratch buffer is being used by the GPU. 818bf215546Sopenharmony_ci * 819bf215546Sopenharmony_ci * If you want to increase WAVESIZE without waiting for idle, you need to allocate a new 820bf215546Sopenharmony_ci * scratch buffer and use it instead. This will result in multiple scratch buffers being 821bf215546Sopenharmony_ci * used at the same time, each with a different WAVESIZE. 822bf215546Sopenharmony_ci * 823bf215546Sopenharmony_ci * If you want to decrease WAVESIZE, you don't have to. There is no advantage in decreasing 824bf215546Sopenharmony_ci * WAVESIZE after it's been increased. 825bf215546Sopenharmony_ci * 826bf215546Sopenharmony_ci * Shaders with SCRATCH_EN=0 don't allocate scratch space. 827bf215546Sopenharmony_ci */ 828bf215546Sopenharmony_ci const unsigned size_shift = info->gfx_level >= GFX11 ? 8 : 10; 829bf215546Sopenharmony_ci const unsigned min_size_per_wave = BITFIELD_BIT(size_shift); 830bf215546Sopenharmony_ci 831bf215546Sopenharmony_ci /* The LLVM shader backend should be reporting aligned scratch_sizes. */ 832bf215546Sopenharmony_ci assert((bytes_per_wave & BITFIELD_MASK(size_shift)) == 0 && 833bf215546Sopenharmony_ci "scratch size per wave should be aligned"); 834bf215546Sopenharmony_ci 835bf215546Sopenharmony_ci /* Add 1 scratch item to make the number of items odd. This should improve scratch 836bf215546Sopenharmony_ci * performance by more randomly distributing scratch waves among memory channels. 837bf215546Sopenharmony_ci */ 838bf215546Sopenharmony_ci if (bytes_per_wave) 839bf215546Sopenharmony_ci bytes_per_wave |= min_size_per_wave; 840bf215546Sopenharmony_ci 841bf215546Sopenharmony_ci *max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave); 842bf215546Sopenharmony_ci 843bf215546Sopenharmony_ci unsigned max_scratch_waves = info->max_scratch_waves; 844bf215546Sopenharmony_ci if (info->gfx_level >= GFX11) 845bf215546Sopenharmony_ci max_scratch_waves /= info->num_se; /* WAVES is per SE */ 846bf215546Sopenharmony_ci 847bf215546Sopenharmony_ci /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */ 848bf215546Sopenharmony_ci *tmpring_size = S_0286E8_WAVES(max_scratch_waves) | 849bf215546Sopenharmony_ci S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> size_shift); 850bf215546Sopenharmony_ci} 851