1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright 2012 Advanced Micro Devices, Inc.
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
13bf215546Sopenharmony_ci * Software.
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21bf215546Sopenharmony_ci * IN THE SOFTWARE.
22bf215546Sopenharmony_ci */
23bf215546Sopenharmony_ci
24bf215546Sopenharmony_ci#include "ac_shader_util.h"
25bf215546Sopenharmony_ci#include "ac_gpu_info.h"
26bf215546Sopenharmony_ci
27bf215546Sopenharmony_ci#include "sid.h"
28bf215546Sopenharmony_ci#include "u_math.h"
29bf215546Sopenharmony_ci
30bf215546Sopenharmony_ci#include <assert.h>
31bf215546Sopenharmony_ci#include <stdlib.h>
32bf215546Sopenharmony_ci#include <string.h>
33bf215546Sopenharmony_ci
34bf215546Sopenharmony_ciunsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask,
35bf215546Sopenharmony_ci                                    bool writes_mrt0_alpha)
36bf215546Sopenharmony_ci{
37bf215546Sopenharmony_ci   /* If writes_mrt0_alpha is true, one other flag must be true too. */
38bf215546Sopenharmony_ci   assert(!writes_mrt0_alpha || writes_z || writes_stencil || writes_samplemask);
39bf215546Sopenharmony_ci
40bf215546Sopenharmony_ci   if (writes_z || writes_mrt0_alpha) {
41bf215546Sopenharmony_ci      /* Z needs 32 bits. */
42bf215546Sopenharmony_ci      if (writes_samplemask || writes_mrt0_alpha)
43bf215546Sopenharmony_ci         return V_028710_SPI_SHADER_32_ABGR;
44bf215546Sopenharmony_ci      else if (writes_stencil)
45bf215546Sopenharmony_ci         return V_028710_SPI_SHADER_32_GR;
46bf215546Sopenharmony_ci      else
47bf215546Sopenharmony_ci         return V_028710_SPI_SHADER_32_R;
48bf215546Sopenharmony_ci   } else if (writes_stencil || writes_samplemask) {
49bf215546Sopenharmony_ci      /* Both stencil and sample mask need only 16 bits. */
50bf215546Sopenharmony_ci      return V_028710_SPI_SHADER_UINT16_ABGR;
51bf215546Sopenharmony_ci   } else {
52bf215546Sopenharmony_ci      return V_028710_SPI_SHADER_ZERO;
53bf215546Sopenharmony_ci   }
54bf215546Sopenharmony_ci}
55bf215546Sopenharmony_ci
56bf215546Sopenharmony_ciunsigned ac_get_cb_shader_mask(unsigned spi_shader_col_format)
57bf215546Sopenharmony_ci{
58bf215546Sopenharmony_ci   unsigned i, cb_shader_mask = 0;
59bf215546Sopenharmony_ci
60bf215546Sopenharmony_ci   for (i = 0; i < 8; i++) {
61bf215546Sopenharmony_ci      switch ((spi_shader_col_format >> (i * 4)) & 0xf) {
62bf215546Sopenharmony_ci      case V_028714_SPI_SHADER_ZERO:
63bf215546Sopenharmony_ci         break;
64bf215546Sopenharmony_ci      case V_028714_SPI_SHADER_32_R:
65bf215546Sopenharmony_ci         cb_shader_mask |= 0x1 << (i * 4);
66bf215546Sopenharmony_ci         break;
67bf215546Sopenharmony_ci      case V_028714_SPI_SHADER_32_GR:
68bf215546Sopenharmony_ci         cb_shader_mask |= 0x3 << (i * 4);
69bf215546Sopenharmony_ci         break;
70bf215546Sopenharmony_ci      case V_028714_SPI_SHADER_32_AR:
71bf215546Sopenharmony_ci         cb_shader_mask |= 0x9u << (i * 4);
72bf215546Sopenharmony_ci         break;
73bf215546Sopenharmony_ci      case V_028714_SPI_SHADER_FP16_ABGR:
74bf215546Sopenharmony_ci      case V_028714_SPI_SHADER_UNORM16_ABGR:
75bf215546Sopenharmony_ci      case V_028714_SPI_SHADER_SNORM16_ABGR:
76bf215546Sopenharmony_ci      case V_028714_SPI_SHADER_UINT16_ABGR:
77bf215546Sopenharmony_ci      case V_028714_SPI_SHADER_SINT16_ABGR:
78bf215546Sopenharmony_ci      case V_028714_SPI_SHADER_32_ABGR:
79bf215546Sopenharmony_ci         cb_shader_mask |= 0xfu << (i * 4);
80bf215546Sopenharmony_ci         break;
81bf215546Sopenharmony_ci      default:
82bf215546Sopenharmony_ci         assert(0);
83bf215546Sopenharmony_ci      }
84bf215546Sopenharmony_ci   }
85bf215546Sopenharmony_ci   return cb_shader_mask;
86bf215546Sopenharmony_ci}
87bf215546Sopenharmony_ci
88bf215546Sopenharmony_ci/**
89bf215546Sopenharmony_ci * Calculate the appropriate setting of VGT_GS_MODE when \p shader is a
90bf215546Sopenharmony_ci * geometry shader.
91bf215546Sopenharmony_ci */
92bf215546Sopenharmony_ciuint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum amd_gfx_level gfx_level)
93bf215546Sopenharmony_ci{
94bf215546Sopenharmony_ci   unsigned cut_mode;
95bf215546Sopenharmony_ci
96bf215546Sopenharmony_ci   assert (gfx_level < GFX11);
97bf215546Sopenharmony_ci
98bf215546Sopenharmony_ci   if (gs_max_vert_out <= 128) {
99bf215546Sopenharmony_ci      cut_mode = V_028A40_GS_CUT_128;
100bf215546Sopenharmony_ci   } else if (gs_max_vert_out <= 256) {
101bf215546Sopenharmony_ci      cut_mode = V_028A40_GS_CUT_256;
102bf215546Sopenharmony_ci   } else if (gs_max_vert_out <= 512) {
103bf215546Sopenharmony_ci      cut_mode = V_028A40_GS_CUT_512;
104bf215546Sopenharmony_ci   } else {
105bf215546Sopenharmony_ci      assert(gs_max_vert_out <= 1024);
106bf215546Sopenharmony_ci      cut_mode = V_028A40_GS_CUT_1024;
107bf215546Sopenharmony_ci   }
108bf215546Sopenharmony_ci
109bf215546Sopenharmony_ci   return S_028A40_MODE(V_028A40_GS_SCENARIO_G) | S_028A40_CUT_MODE(cut_mode) |
110bf215546Sopenharmony_ci          S_028A40_ES_WRITE_OPTIMIZE(gfx_level <= GFX8) | S_028A40_GS_WRITE_OPTIMIZE(1) |
111bf215546Sopenharmony_ci          S_028A40_ONCHIP(gfx_level >= GFX9 ? 1 : 0);
112bf215546Sopenharmony_ci}
113bf215546Sopenharmony_ci
114bf215546Sopenharmony_ci/// Translate a (dfmt, nfmt) pair into a chip-appropriate combined format
115bf215546Sopenharmony_ci/// value for LLVM8+ tbuffer intrinsics.
116bf215546Sopenharmony_ciunsigned ac_get_tbuffer_format(enum amd_gfx_level gfx_level, unsigned dfmt, unsigned nfmt)
117bf215546Sopenharmony_ci{
118bf215546Sopenharmony_ci   // Some games try to access vertex buffers without a valid format.
119bf215546Sopenharmony_ci   // This is a game bug, but we should still handle it gracefully.
120bf215546Sopenharmony_ci   if (dfmt == V_008F0C_GFX10_FORMAT_INVALID)
121bf215546Sopenharmony_ci      return V_008F0C_GFX10_FORMAT_INVALID;
122bf215546Sopenharmony_ci
123bf215546Sopenharmony_ci   if (gfx_level >= GFX11) {
124bf215546Sopenharmony_ci      switch (dfmt) {
125bf215546Sopenharmony_ci      default:
126bf215546Sopenharmony_ci         unreachable("bad dfmt");
127bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_INVALID:
128bf215546Sopenharmony_ci         return V_008F0C_GFX11_FORMAT_INVALID;
129bf215546Sopenharmony_ci
130bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_8:
131bf215546Sopenharmony_ci         switch (nfmt) {
132bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_UNORM:
133bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_8_UNORM;
134bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SNORM:
135bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_8_SNORM;
136bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_USCALED:
137bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_8_USCALED;
138bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SSCALED:
139bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_8_SSCALED;
140bf215546Sopenharmony_ci         default:
141bf215546Sopenharmony_ci            unreachable("bad nfmt");
142bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_UINT:
143bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_8_UINT;
144bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SINT:
145bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_8_SINT;
146bf215546Sopenharmony_ci         }
147bf215546Sopenharmony_ci
148bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_8_8:
149bf215546Sopenharmony_ci         switch (nfmt) {
150bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_UNORM:
151bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_8_8_UNORM;
152bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SNORM:
153bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_8_8_SNORM;
154bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_USCALED:
155bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_8_8_USCALED;
156bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SSCALED:
157bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_8_8_SSCALED;
158bf215546Sopenharmony_ci         default:
159bf215546Sopenharmony_ci            unreachable("bad nfmt");
160bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_UINT:
161bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_8_8_UINT;
162bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SINT:
163bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_8_8_SINT;
164bf215546Sopenharmony_ci         }
165bf215546Sopenharmony_ci
166bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
167bf215546Sopenharmony_ci         switch (nfmt) {
168bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_UNORM:
169bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_8_8_8_8_UNORM;
170bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SNORM:
171bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_8_8_8_8_SNORM;
172bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_USCALED:
173bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_8_8_8_8_USCALED;
174bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SSCALED:
175bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_8_8_8_8_SSCALED;
176bf215546Sopenharmony_ci         default:
177bf215546Sopenharmony_ci            unreachable("bad nfmt");
178bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_UINT:
179bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_8_8_8_8_UINT;
180bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SINT:
181bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_8_8_8_8_SINT;
182bf215546Sopenharmony_ci         }
183bf215546Sopenharmony_ci
184bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_16:
185bf215546Sopenharmony_ci         switch (nfmt) {
186bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_UNORM:
187bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_UNORM;
188bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SNORM:
189bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_SNORM;
190bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_USCALED:
191bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_USCALED;
192bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SSCALED:
193bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_SSCALED;
194bf215546Sopenharmony_ci         default:
195bf215546Sopenharmony_ci            unreachable("bad nfmt");
196bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_UINT:
197bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_UINT;
198bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SINT:
199bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_SINT;
200bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_FLOAT:
201bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_FLOAT;
202bf215546Sopenharmony_ci         }
203bf215546Sopenharmony_ci
204bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_16_16:
205bf215546Sopenharmony_ci         switch (nfmt) {
206bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_UNORM:
207bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_16_UNORM;
208bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SNORM:
209bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_16_SNORM;
210bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_USCALED:
211bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_16_USCALED;
212bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SSCALED:
213bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_16_SSCALED;
214bf215546Sopenharmony_ci         default:
215bf215546Sopenharmony_ci            unreachable("bad nfmt");
216bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_UINT:
217bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_16_UINT;
218bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SINT:
219bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_16_SINT;
220bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_FLOAT:
221bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_16_FLOAT;
222bf215546Sopenharmony_ci         }
223bf215546Sopenharmony_ci
224bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
225bf215546Sopenharmony_ci         switch (nfmt) {
226bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_UNORM:
227bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_16_16_16_UNORM;
228bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SNORM:
229bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_16_16_16_SNORM;
230bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_USCALED:
231bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_16_16_16_USCALED;
232bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SSCALED:
233bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_16_16_16_SSCALED;
234bf215546Sopenharmony_ci         default:
235bf215546Sopenharmony_ci            unreachable("bad nfmt");
236bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_UINT:
237bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_16_16_16_UINT;
238bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SINT:
239bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_16_16_16_SINT;
240bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_FLOAT:
241bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_16_16_16_16_FLOAT;
242bf215546Sopenharmony_ci         }
243bf215546Sopenharmony_ci
244bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_32:
245bf215546Sopenharmony_ci         switch (nfmt) {
246bf215546Sopenharmony_ci         default:
247bf215546Sopenharmony_ci            unreachable("bad nfmt");
248bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_UINT:
249bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_32_UINT;
250bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SINT:
251bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_32_SINT;
252bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_FLOAT:
253bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_32_FLOAT;
254bf215546Sopenharmony_ci         }
255bf215546Sopenharmony_ci
256bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_32_32:
257bf215546Sopenharmony_ci         switch (nfmt) {
258bf215546Sopenharmony_ci         default:
259bf215546Sopenharmony_ci            unreachable("bad nfmt");
260bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_UINT:
261bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_32_32_UINT;
262bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SINT:
263bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_32_32_SINT;
264bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_FLOAT:
265bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_32_32_FLOAT;
266bf215546Sopenharmony_ci         }
267bf215546Sopenharmony_ci
268bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_32_32_32:
269bf215546Sopenharmony_ci         switch (nfmt) {
270bf215546Sopenharmony_ci         default:
271bf215546Sopenharmony_ci            unreachable("bad nfmt");
272bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_UINT:
273bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_32_32_32_UINT;
274bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SINT:
275bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_32_32_32_SINT;
276bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_FLOAT:
277bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_32_32_32_FLOAT;
278bf215546Sopenharmony_ci         }
279bf215546Sopenharmony_ci
280bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
281bf215546Sopenharmony_ci         switch (nfmt) {
282bf215546Sopenharmony_ci         default:
283bf215546Sopenharmony_ci            unreachable("bad nfmt");
284bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_UINT:
285bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_32_32_32_32_UINT;
286bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SINT:
287bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_32_32_32_32_SINT;
288bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_FLOAT:
289bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_32_32_32_32_FLOAT;
290bf215546Sopenharmony_ci         }
291bf215546Sopenharmony_ci
292bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
293bf215546Sopenharmony_ci         switch (nfmt) {
294bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_UNORM:
295bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_2_10_10_10_UNORM;
296bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SNORM:
297bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_2_10_10_10_SNORM;
298bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_USCALED:
299bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_2_10_10_10_USCALED;
300bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SSCALED:
301bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_2_10_10_10_SSCALED;
302bf215546Sopenharmony_ci         default:
303bf215546Sopenharmony_ci            unreachable("bad nfmt");
304bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_UINT:
305bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_2_10_10_10_UINT;
306bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_SINT:
307bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_2_10_10_10_SINT;
308bf215546Sopenharmony_ci         }
309bf215546Sopenharmony_ci
310bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_10_11_11:
311bf215546Sopenharmony_ci         switch (nfmt) {
312bf215546Sopenharmony_ci         default:
313bf215546Sopenharmony_ci            unreachable("bad nfmt");
314bf215546Sopenharmony_ci         case V_008F0C_BUF_NUM_FORMAT_FLOAT:
315bf215546Sopenharmony_ci            return V_008F0C_GFX11_FORMAT_10_11_11_FLOAT;
316bf215546Sopenharmony_ci         }
317bf215546Sopenharmony_ci      }
318bf215546Sopenharmony_ci   } else if (gfx_level >= GFX10) {
319bf215546Sopenharmony_ci      unsigned format;
320bf215546Sopenharmony_ci      switch (dfmt) {
321bf215546Sopenharmony_ci      default:
322bf215546Sopenharmony_ci         unreachable("bad dfmt");
323bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_INVALID:
324bf215546Sopenharmony_ci         format = V_008F0C_GFX10_FORMAT_INVALID;
325bf215546Sopenharmony_ci         break;
326bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_8:
327bf215546Sopenharmony_ci         format = V_008F0C_GFX10_FORMAT_8_UINT;
328bf215546Sopenharmony_ci         break;
329bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_8_8:
330bf215546Sopenharmony_ci         format = V_008F0C_GFX10_FORMAT_8_8_UINT;
331bf215546Sopenharmony_ci         break;
332bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
333bf215546Sopenharmony_ci         format = V_008F0C_GFX10_FORMAT_8_8_8_8_UINT;
334bf215546Sopenharmony_ci         break;
335bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_16:
336bf215546Sopenharmony_ci         format = V_008F0C_GFX10_FORMAT_16_UINT;
337bf215546Sopenharmony_ci         break;
338bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_16_16:
339bf215546Sopenharmony_ci         format = V_008F0C_GFX10_FORMAT_16_16_UINT;
340bf215546Sopenharmony_ci         break;
341bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
342bf215546Sopenharmony_ci         format = V_008F0C_GFX10_FORMAT_16_16_16_16_UINT;
343bf215546Sopenharmony_ci         break;
344bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_32:
345bf215546Sopenharmony_ci         format = V_008F0C_GFX10_FORMAT_32_UINT;
346bf215546Sopenharmony_ci         break;
347bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_32_32:
348bf215546Sopenharmony_ci         format = V_008F0C_GFX10_FORMAT_32_32_UINT;
349bf215546Sopenharmony_ci         break;
350bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_32_32_32:
351bf215546Sopenharmony_ci         format = V_008F0C_GFX10_FORMAT_32_32_32_UINT;
352bf215546Sopenharmony_ci         break;
353bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
354bf215546Sopenharmony_ci         format = V_008F0C_GFX10_FORMAT_32_32_32_32_UINT;
355bf215546Sopenharmony_ci         break;
356bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
357bf215546Sopenharmony_ci         format = V_008F0C_GFX10_FORMAT_2_10_10_10_UINT;
358bf215546Sopenharmony_ci         break;
359bf215546Sopenharmony_ci      case V_008F0C_BUF_DATA_FORMAT_10_11_11:
360bf215546Sopenharmony_ci         format = V_008F0C_GFX10_FORMAT_10_11_11_UINT;
361bf215546Sopenharmony_ci         break;
362bf215546Sopenharmony_ci      }
363bf215546Sopenharmony_ci
364bf215546Sopenharmony_ci      // Use the regularity properties of the combined format enum.
365bf215546Sopenharmony_ci      //
366bf215546Sopenharmony_ci      // Note: float is incompatible with 8-bit data formats,
367bf215546Sopenharmony_ci      //       [us]{norm,scaled} are incomparible with 32-bit data formats.
368bf215546Sopenharmony_ci      //       [us]scaled are not writable.
369bf215546Sopenharmony_ci      switch (nfmt) {
370bf215546Sopenharmony_ci      case V_008F0C_BUF_NUM_FORMAT_UNORM:
371bf215546Sopenharmony_ci         format -= 4;
372bf215546Sopenharmony_ci         break;
373bf215546Sopenharmony_ci      case V_008F0C_BUF_NUM_FORMAT_SNORM:
374bf215546Sopenharmony_ci         format -= 3;
375bf215546Sopenharmony_ci         break;
376bf215546Sopenharmony_ci      case V_008F0C_BUF_NUM_FORMAT_USCALED:
377bf215546Sopenharmony_ci         format -= 2;
378bf215546Sopenharmony_ci         break;
379bf215546Sopenharmony_ci      case V_008F0C_BUF_NUM_FORMAT_SSCALED:
380bf215546Sopenharmony_ci         format -= 1;
381bf215546Sopenharmony_ci         break;
382bf215546Sopenharmony_ci      default:
383bf215546Sopenharmony_ci         unreachable("bad nfmt");
384bf215546Sopenharmony_ci      case V_008F0C_BUF_NUM_FORMAT_UINT:
385bf215546Sopenharmony_ci         break;
386bf215546Sopenharmony_ci      case V_008F0C_BUF_NUM_FORMAT_SINT:
387bf215546Sopenharmony_ci         format += 1;
388bf215546Sopenharmony_ci         break;
389bf215546Sopenharmony_ci      case V_008F0C_BUF_NUM_FORMAT_FLOAT:
390bf215546Sopenharmony_ci         format += 2;
391bf215546Sopenharmony_ci         break;
392bf215546Sopenharmony_ci      }
393bf215546Sopenharmony_ci
394bf215546Sopenharmony_ci      return format;
395bf215546Sopenharmony_ci   } else {
396bf215546Sopenharmony_ci      return dfmt | (nfmt << 4);
397bf215546Sopenharmony_ci   }
398bf215546Sopenharmony_ci}
399bf215546Sopenharmony_ci
400bf215546Sopenharmony_cistatic const struct ac_data_format_info data_format_table[] = {
401bf215546Sopenharmony_ci   [V_008F0C_BUF_DATA_FORMAT_INVALID] = {0, 4, 0, V_008F0C_BUF_DATA_FORMAT_INVALID},
402bf215546Sopenharmony_ci   [V_008F0C_BUF_DATA_FORMAT_8] = {1, 1, 1, V_008F0C_BUF_DATA_FORMAT_8},
403bf215546Sopenharmony_ci   [V_008F0C_BUF_DATA_FORMAT_16] = {2, 1, 2, V_008F0C_BUF_DATA_FORMAT_16},
404bf215546Sopenharmony_ci   [V_008F0C_BUF_DATA_FORMAT_8_8] = {2, 2, 1, V_008F0C_BUF_DATA_FORMAT_8},
405bf215546Sopenharmony_ci   [V_008F0C_BUF_DATA_FORMAT_32] = {4, 1, 4, V_008F0C_BUF_DATA_FORMAT_32},
406bf215546Sopenharmony_ci   [V_008F0C_BUF_DATA_FORMAT_16_16] = {4, 2, 2, V_008F0C_BUF_DATA_FORMAT_16},
407bf215546Sopenharmony_ci   [V_008F0C_BUF_DATA_FORMAT_10_11_11] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_10_11_11},
408bf215546Sopenharmony_ci   [V_008F0C_BUF_DATA_FORMAT_11_11_10] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_11_11_10},
409bf215546Sopenharmony_ci   [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_10_10_10_2},
410bf215546Sopenharmony_ci   [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_2_10_10_10},
411bf215546Sopenharmony_ci   [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = {4, 4, 1, V_008F0C_BUF_DATA_FORMAT_8},
412bf215546Sopenharmony_ci   [V_008F0C_BUF_DATA_FORMAT_32_32] = {8, 2, 4, V_008F0C_BUF_DATA_FORMAT_32},
413bf215546Sopenharmony_ci   [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = {8, 4, 2, V_008F0C_BUF_DATA_FORMAT_16},
414bf215546Sopenharmony_ci   [V_008F0C_BUF_DATA_FORMAT_32_32_32] = {12, 3, 4, V_008F0C_BUF_DATA_FORMAT_32},
415bf215546Sopenharmony_ci   [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = {16, 4, 4, V_008F0C_BUF_DATA_FORMAT_32},
416bf215546Sopenharmony_ci};
417bf215546Sopenharmony_ci
418bf215546Sopenharmony_ciconst struct ac_data_format_info *ac_get_data_format_info(unsigned dfmt)
419bf215546Sopenharmony_ci{
420bf215546Sopenharmony_ci   assert(dfmt < ARRAY_SIZE(data_format_table));
421bf215546Sopenharmony_ci   return &data_format_table[dfmt];
422bf215546Sopenharmony_ci}
423bf215546Sopenharmony_ci
424bf215546Sopenharmony_cienum ac_image_dim ac_get_sampler_dim(enum amd_gfx_level gfx_level, enum glsl_sampler_dim dim,
425bf215546Sopenharmony_ci                                     bool is_array)
426bf215546Sopenharmony_ci{
427bf215546Sopenharmony_ci   switch (dim) {
428bf215546Sopenharmony_ci   case GLSL_SAMPLER_DIM_1D:
429bf215546Sopenharmony_ci      if (gfx_level == GFX9)
430bf215546Sopenharmony_ci         return is_array ? ac_image_2darray : ac_image_2d;
431bf215546Sopenharmony_ci      return is_array ? ac_image_1darray : ac_image_1d;
432bf215546Sopenharmony_ci   case GLSL_SAMPLER_DIM_2D:
433bf215546Sopenharmony_ci   case GLSL_SAMPLER_DIM_RECT:
434bf215546Sopenharmony_ci   case GLSL_SAMPLER_DIM_EXTERNAL:
435bf215546Sopenharmony_ci      return is_array ? ac_image_2darray : ac_image_2d;
436bf215546Sopenharmony_ci   case GLSL_SAMPLER_DIM_3D:
437bf215546Sopenharmony_ci      return ac_image_3d;
438bf215546Sopenharmony_ci   case GLSL_SAMPLER_DIM_CUBE:
439bf215546Sopenharmony_ci      return ac_image_cube;
440bf215546Sopenharmony_ci   case GLSL_SAMPLER_DIM_MS:
441bf215546Sopenharmony_ci      return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa;
442bf215546Sopenharmony_ci   case GLSL_SAMPLER_DIM_SUBPASS:
443bf215546Sopenharmony_ci      return ac_image_2darray;
444bf215546Sopenharmony_ci   case GLSL_SAMPLER_DIM_SUBPASS_MS:
445bf215546Sopenharmony_ci      return ac_image_2darraymsaa;
446bf215546Sopenharmony_ci   default:
447bf215546Sopenharmony_ci      unreachable("bad sampler dim");
448bf215546Sopenharmony_ci   }
449bf215546Sopenharmony_ci}
450bf215546Sopenharmony_ci
451bf215546Sopenharmony_cienum ac_image_dim ac_get_image_dim(enum amd_gfx_level gfx_level, enum glsl_sampler_dim sdim,
452bf215546Sopenharmony_ci                                   bool is_array)
453bf215546Sopenharmony_ci{
454bf215546Sopenharmony_ci   enum ac_image_dim dim = ac_get_sampler_dim(gfx_level, sdim, is_array);
455bf215546Sopenharmony_ci
456bf215546Sopenharmony_ci   /* Match the resource type set in the descriptor. */
457bf215546Sopenharmony_ci   if (dim == ac_image_cube || (gfx_level <= GFX8 && dim == ac_image_3d))
458bf215546Sopenharmony_ci      dim = ac_image_2darray;
459bf215546Sopenharmony_ci   else if (sdim == GLSL_SAMPLER_DIM_2D && !is_array && gfx_level == GFX9) {
460bf215546Sopenharmony_ci      /* When a single layer of a 3D texture is bound, the shader
461bf215546Sopenharmony_ci       * will refer to a 2D target, but the descriptor has a 3D type.
462bf215546Sopenharmony_ci       * Since the HW ignores BASE_ARRAY in this case, we need to
463bf215546Sopenharmony_ci       * send 3 coordinates. This doesn't hurt when the underlying
464bf215546Sopenharmony_ci       * texture is non-3D.
465bf215546Sopenharmony_ci       */
466bf215546Sopenharmony_ci      dim = ac_image_3d;
467bf215546Sopenharmony_ci   }
468bf215546Sopenharmony_ci
469bf215546Sopenharmony_ci   return dim;
470bf215546Sopenharmony_ci}
471bf215546Sopenharmony_ci
472bf215546Sopenharmony_ciunsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
473bf215546Sopenharmony_ci                                  signed char *face_vgpr_index_ptr,
474bf215546Sopenharmony_ci                                  signed char *ancillary_vgpr_index_ptr,
475bf215546Sopenharmony_ci                                  signed char *sample_coverage_vgpr_index_ptr)
476bf215546Sopenharmony_ci{
477bf215546Sopenharmony_ci   unsigned num_input_vgprs = 0;
478bf215546Sopenharmony_ci   signed char face_vgpr_index = -1;
479bf215546Sopenharmony_ci   signed char ancillary_vgpr_index = -1;
480bf215546Sopenharmony_ci   signed char sample_coverage_vgpr_index = -1;
481bf215546Sopenharmony_ci
482bf215546Sopenharmony_ci   if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr))
483bf215546Sopenharmony_ci      num_input_vgprs += 2;
484bf215546Sopenharmony_ci   if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr))
485bf215546Sopenharmony_ci      num_input_vgprs += 2;
486bf215546Sopenharmony_ci   if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr))
487bf215546Sopenharmony_ci      num_input_vgprs += 2;
488bf215546Sopenharmony_ci   if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr))
489bf215546Sopenharmony_ci      num_input_vgprs += 3;
490bf215546Sopenharmony_ci   if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr))
491bf215546Sopenharmony_ci      num_input_vgprs += 2;
492bf215546Sopenharmony_ci   if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr))
493bf215546Sopenharmony_ci      num_input_vgprs += 2;
494bf215546Sopenharmony_ci   if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr))
495bf215546Sopenharmony_ci      num_input_vgprs += 2;
496bf215546Sopenharmony_ci   if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr))
497bf215546Sopenharmony_ci      num_input_vgprs += 1;
498bf215546Sopenharmony_ci   if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr))
499bf215546Sopenharmony_ci      num_input_vgprs += 1;
500bf215546Sopenharmony_ci   if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr))
501bf215546Sopenharmony_ci      num_input_vgprs += 1;
502bf215546Sopenharmony_ci   if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr))
503bf215546Sopenharmony_ci      num_input_vgprs += 1;
504bf215546Sopenharmony_ci   if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr))
505bf215546Sopenharmony_ci      num_input_vgprs += 1;
506bf215546Sopenharmony_ci   if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr)) {
507bf215546Sopenharmony_ci      face_vgpr_index = num_input_vgprs;
508bf215546Sopenharmony_ci      num_input_vgprs += 1;
509bf215546Sopenharmony_ci   }
510bf215546Sopenharmony_ci   if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr)) {
511bf215546Sopenharmony_ci      ancillary_vgpr_index = num_input_vgprs;
512bf215546Sopenharmony_ci      num_input_vgprs += 1;
513bf215546Sopenharmony_ci   }
514bf215546Sopenharmony_ci   if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr)) {
515bf215546Sopenharmony_ci      sample_coverage_vgpr_index = num_input_vgprs;
516bf215546Sopenharmony_ci      num_input_vgprs += 1;
517bf215546Sopenharmony_ci   }
518bf215546Sopenharmony_ci   if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr))
519bf215546Sopenharmony_ci      num_input_vgprs += 1;
520bf215546Sopenharmony_ci
521bf215546Sopenharmony_ci   if (face_vgpr_index_ptr)
522bf215546Sopenharmony_ci      *face_vgpr_index_ptr = face_vgpr_index;
523bf215546Sopenharmony_ci   if (ancillary_vgpr_index_ptr)
524bf215546Sopenharmony_ci      *ancillary_vgpr_index_ptr = ancillary_vgpr_index;
525bf215546Sopenharmony_ci   if (sample_coverage_vgpr_index_ptr)
526bf215546Sopenharmony_ci      *sample_coverage_vgpr_index_ptr = sample_coverage_vgpr_index;
527bf215546Sopenharmony_ci
528bf215546Sopenharmony_ci   return num_input_vgprs;
529bf215546Sopenharmony_ci}
530bf215546Sopenharmony_ci
531bf215546Sopenharmony_civoid ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype,
532bf215546Sopenharmony_ci                                 bool is_depth, bool use_rbplus,
533bf215546Sopenharmony_ci                                 struct ac_spi_color_formats *formats)
534bf215546Sopenharmony_ci{
535bf215546Sopenharmony_ci   /* Alpha is needed for alpha-to-coverage.
536bf215546Sopenharmony_ci    * Blending may be with or without alpha.
537bf215546Sopenharmony_ci    */
538bf215546Sopenharmony_ci   unsigned normal = 0;      /* most optimal, may not support blending or export alpha */
539bf215546Sopenharmony_ci   unsigned alpha = 0;       /* exports alpha, but may not support blending */
540bf215546Sopenharmony_ci   unsigned blend = 0;       /* supports blending, but may not export alpha */
541bf215546Sopenharmony_ci   unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */
542bf215546Sopenharmony_ci
543bf215546Sopenharmony_ci   /* Choose the SPI color formats. These are required values for RB+.
544bf215546Sopenharmony_ci    * Other chips have multiple choices, though they are not necessarily better.
545bf215546Sopenharmony_ci    */
546bf215546Sopenharmony_ci   switch (format) {
547bf215546Sopenharmony_ci   case V_028C70_COLOR_5_6_5:
548bf215546Sopenharmony_ci   case V_028C70_COLOR_1_5_5_5:
549bf215546Sopenharmony_ci   case V_028C70_COLOR_5_5_5_1:
550bf215546Sopenharmony_ci   case V_028C70_COLOR_4_4_4_4:
551bf215546Sopenharmony_ci   case V_028C70_COLOR_10_11_11:
552bf215546Sopenharmony_ci   case V_028C70_COLOR_11_11_10:
553bf215546Sopenharmony_ci   case V_028C70_COLOR_5_9_9_9:
554bf215546Sopenharmony_ci   case V_028C70_COLOR_8:
555bf215546Sopenharmony_ci   case V_028C70_COLOR_8_8:
556bf215546Sopenharmony_ci   case V_028C70_COLOR_8_8_8_8:
557bf215546Sopenharmony_ci   case V_028C70_COLOR_10_10_10_2:
558bf215546Sopenharmony_ci   case V_028C70_COLOR_2_10_10_10:
559bf215546Sopenharmony_ci      if (ntype == V_028C70_NUMBER_UINT)
560bf215546Sopenharmony_ci         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
561bf215546Sopenharmony_ci      else if (ntype == V_028C70_NUMBER_SINT)
562bf215546Sopenharmony_ci         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
563bf215546Sopenharmony_ci      else
564bf215546Sopenharmony_ci         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
565bf215546Sopenharmony_ci
566bf215546Sopenharmony_ci      if (!use_rbplus && format == V_028C70_COLOR_8 &&
567bf215546Sopenharmony_ci          ntype != V_028C70_NUMBER_SRGB && swap == V_028C70_SWAP_STD) /* R */ {
568bf215546Sopenharmony_ci         /* When RB+ is enabled, R8_UNORM should use FP16_ABGR for 2x
569bf215546Sopenharmony_ci          * exporting performance. Otherwise, use 32_R to remove useless
570bf215546Sopenharmony_ci          * instructions needed for 16-bit compressed exports.
571bf215546Sopenharmony_ci          */
572bf215546Sopenharmony_ci         blend = normal = V_028714_SPI_SHADER_32_R;
573bf215546Sopenharmony_ci      }
574bf215546Sopenharmony_ci      break;
575bf215546Sopenharmony_ci
576bf215546Sopenharmony_ci   case V_028C70_COLOR_16:
577bf215546Sopenharmony_ci   case V_028C70_COLOR_16_16:
578bf215546Sopenharmony_ci   case V_028C70_COLOR_16_16_16_16:
579bf215546Sopenharmony_ci      if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM) {
580bf215546Sopenharmony_ci         /* UNORM16 and SNORM16 don't support blending */
581bf215546Sopenharmony_ci         if (ntype == V_028C70_NUMBER_UNORM)
582bf215546Sopenharmony_ci            normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR;
583bf215546Sopenharmony_ci         else
584bf215546Sopenharmony_ci            normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR;
585bf215546Sopenharmony_ci
586bf215546Sopenharmony_ci         /* Use 32 bits per channel for blending. */
587bf215546Sopenharmony_ci         if (format == V_028C70_COLOR_16) {
588bf215546Sopenharmony_ci            if (swap == V_028C70_SWAP_STD) { /* R */
589bf215546Sopenharmony_ci               blend = V_028714_SPI_SHADER_32_R;
590bf215546Sopenharmony_ci               blend_alpha = V_028714_SPI_SHADER_32_AR;
591bf215546Sopenharmony_ci            } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
592bf215546Sopenharmony_ci               blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
593bf215546Sopenharmony_ci            else
594bf215546Sopenharmony_ci               assert(0);
595bf215546Sopenharmony_ci         } else if (format == V_028C70_COLOR_16_16) {
596bf215546Sopenharmony_ci            if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV) { /* RG or GR */
597bf215546Sopenharmony_ci               blend = V_028714_SPI_SHADER_32_GR;
598bf215546Sopenharmony_ci               blend_alpha = V_028714_SPI_SHADER_32_ABGR;
599bf215546Sopenharmony_ci            } else if (swap == V_028C70_SWAP_ALT) /* RA */
600bf215546Sopenharmony_ci               blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
601bf215546Sopenharmony_ci            else
602bf215546Sopenharmony_ci               assert(0);
603bf215546Sopenharmony_ci         } else /* 16_16_16_16 */
604bf215546Sopenharmony_ci            blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
605bf215546Sopenharmony_ci      } else if (ntype == V_028C70_NUMBER_UINT)
606bf215546Sopenharmony_ci         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
607bf215546Sopenharmony_ci      else if (ntype == V_028C70_NUMBER_SINT)
608bf215546Sopenharmony_ci         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
609bf215546Sopenharmony_ci      else if (ntype == V_028C70_NUMBER_FLOAT)
610bf215546Sopenharmony_ci         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
611bf215546Sopenharmony_ci      else
612bf215546Sopenharmony_ci         assert(0);
613bf215546Sopenharmony_ci      break;
614bf215546Sopenharmony_ci
615bf215546Sopenharmony_ci   case V_028C70_COLOR_32:
616bf215546Sopenharmony_ci      if (swap == V_028C70_SWAP_STD) { /* R */
617bf215546Sopenharmony_ci         blend = normal = V_028714_SPI_SHADER_32_R;
618bf215546Sopenharmony_ci         alpha = blend_alpha = V_028714_SPI_SHADER_32_AR;
619bf215546Sopenharmony_ci      } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
620bf215546Sopenharmony_ci         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
621bf215546Sopenharmony_ci      else
622bf215546Sopenharmony_ci         assert(0);
623bf215546Sopenharmony_ci      break;
624bf215546Sopenharmony_ci
625bf215546Sopenharmony_ci   case V_028C70_COLOR_32_32:
626bf215546Sopenharmony_ci      if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV) { /* RG or GR */
627bf215546Sopenharmony_ci         blend = normal = V_028714_SPI_SHADER_32_GR;
628bf215546Sopenharmony_ci         alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
629bf215546Sopenharmony_ci      } else if (swap == V_028C70_SWAP_ALT) /* RA */
630bf215546Sopenharmony_ci         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
631bf215546Sopenharmony_ci      else
632bf215546Sopenharmony_ci         assert(0);
633bf215546Sopenharmony_ci      break;
634bf215546Sopenharmony_ci
635bf215546Sopenharmony_ci   case V_028C70_COLOR_32_32_32_32:
636bf215546Sopenharmony_ci   case V_028C70_COLOR_8_24:
637bf215546Sopenharmony_ci   case V_028C70_COLOR_24_8:
638bf215546Sopenharmony_ci   case V_028C70_COLOR_X24_8_32_FLOAT:
639bf215546Sopenharmony_ci      alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
640bf215546Sopenharmony_ci      break;
641bf215546Sopenharmony_ci
642bf215546Sopenharmony_ci   default:
643bf215546Sopenharmony_ci      assert(0);
644bf215546Sopenharmony_ci      return;
645bf215546Sopenharmony_ci   }
646bf215546Sopenharmony_ci
647bf215546Sopenharmony_ci   /* The DB->CB copy needs 32_ABGR. */
648bf215546Sopenharmony_ci   if (is_depth)
649bf215546Sopenharmony_ci      alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
650bf215546Sopenharmony_ci
651bf215546Sopenharmony_ci   formats->normal = normal;
652bf215546Sopenharmony_ci   formats->alpha = alpha;
653bf215546Sopenharmony_ci   formats->blend = blend;
654bf215546Sopenharmony_ci   formats->blend_alpha = blend_alpha;
655bf215546Sopenharmony_ci}
656bf215546Sopenharmony_ci
657bf215546Sopenharmony_civoid ac_compute_late_alloc(const struct radeon_info *info, bool ngg, bool ngg_culling,
658bf215546Sopenharmony_ci                           bool uses_scratch, unsigned *late_alloc_wave64, unsigned *cu_mask)
659bf215546Sopenharmony_ci{
660bf215546Sopenharmony_ci   *late_alloc_wave64 = 0; /* The limit is per SA. */
661bf215546Sopenharmony_ci   *cu_mask = 0xffff;
662bf215546Sopenharmony_ci
663bf215546Sopenharmony_ci   /* CU masking can decrease performance and cause a hang with <= 2 CUs per SA. */
664bf215546Sopenharmony_ci   if (info->min_good_cu_per_sa <= 2)
665bf215546Sopenharmony_ci      return;
666bf215546Sopenharmony_ci
667bf215546Sopenharmony_ci   /* If scratch is used with late alloc, the GPU could deadlock if PS uses scratch too. A more
668bf215546Sopenharmony_ci    * complicated computation is needed to enable late alloc with scratch (see PAL).
669bf215546Sopenharmony_ci    */
670bf215546Sopenharmony_ci   if (uses_scratch)
671bf215546Sopenharmony_ci      return;
672bf215546Sopenharmony_ci
673bf215546Sopenharmony_ci   /* Late alloc is not used for NGG on Navi14 due to a hw bug. */
674bf215546Sopenharmony_ci   if (ngg && info->family == CHIP_NAVI14)
675bf215546Sopenharmony_ci      return;
676bf215546Sopenharmony_ci
677bf215546Sopenharmony_ci   if (info->gfx_level >= GFX10) {
678bf215546Sopenharmony_ci      /* For Wave32, the hw will launch twice the number of late alloc waves, so 1 == 2x wave32.
679bf215546Sopenharmony_ci       * These limits are estimated because they are all safe but they vary in performance.
680bf215546Sopenharmony_ci       */
681bf215546Sopenharmony_ci      if (ngg_culling)
682bf215546Sopenharmony_ci         *late_alloc_wave64 = info->min_good_cu_per_sa * 10;
683bf215546Sopenharmony_ci      else
684bf215546Sopenharmony_ci         *late_alloc_wave64 = info->min_good_cu_per_sa * 4;
685bf215546Sopenharmony_ci
686bf215546Sopenharmony_ci      /* Limit LATE_ALLOC_GS to prevent a hang (hw bug) on gfx10. */
687bf215546Sopenharmony_ci      if (info->gfx_level == GFX10 && ngg)
688bf215546Sopenharmony_ci         *late_alloc_wave64 = MIN2(*late_alloc_wave64, 64);
689bf215546Sopenharmony_ci
690bf215546Sopenharmony_ci      /* Gfx10: CU2 & CU3 must be disabled to prevent a hw deadlock.
691bf215546Sopenharmony_ci       * Others: CU1 must be disabled to prevent a hw deadlock.
692bf215546Sopenharmony_ci       *
693bf215546Sopenharmony_ci       * The deadlock is caused by late alloc, which usually increases performance.
694bf215546Sopenharmony_ci       */
695bf215546Sopenharmony_ci      *cu_mask &= info->gfx_level == GFX10 ? ~BITFIELD_RANGE(2, 2) :
696bf215546Sopenharmony_ci                                              ~BITFIELD_RANGE(1, 1);
697bf215546Sopenharmony_ci   } else {
698bf215546Sopenharmony_ci      if (info->min_good_cu_per_sa <= 4) {
699bf215546Sopenharmony_ci         /* Too few available compute units per SA. Disallowing VS to run on one CU could hurt us
700bf215546Sopenharmony_ci          * more than late VS allocation would help.
701bf215546Sopenharmony_ci          *
702bf215546Sopenharmony_ci          * 2 is the highest safe number that allows us to keep all CUs enabled.
703bf215546Sopenharmony_ci          */
704bf215546Sopenharmony_ci         *late_alloc_wave64 = 2;
705bf215546Sopenharmony_ci      } else {
706bf215546Sopenharmony_ci         /* This is a good initial value, allowing 1 late_alloc wave per SIMD on num_cu - 2.
707bf215546Sopenharmony_ci          */
708bf215546Sopenharmony_ci         *late_alloc_wave64 = (info->min_good_cu_per_sa - 2) * 4;
709bf215546Sopenharmony_ci      }
710bf215546Sopenharmony_ci
711bf215546Sopenharmony_ci      /* VS can't execute on one CU if the limit is > 2. */
712bf215546Sopenharmony_ci      if (*late_alloc_wave64 > 2)
713bf215546Sopenharmony_ci         *cu_mask = 0xfffe; /* 1 CU disabled */
714bf215546Sopenharmony_ci   }
715bf215546Sopenharmony_ci
716bf215546Sopenharmony_ci   /* Max number that fits into the register field. */
717bf215546Sopenharmony_ci   if (ngg) /* GS */
718bf215546Sopenharmony_ci      *late_alloc_wave64 = MIN2(*late_alloc_wave64, G_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(~0u));
719bf215546Sopenharmony_ci   else /* VS */
720bf215546Sopenharmony_ci      *late_alloc_wave64 = MIN2(*late_alloc_wave64, G_00B11C_LIMIT(~0u));
721bf215546Sopenharmony_ci}
722bf215546Sopenharmony_ci
723bf215546Sopenharmony_ciunsigned ac_compute_cs_workgroup_size(uint16_t sizes[3], bool variable, unsigned max)
724bf215546Sopenharmony_ci{
725bf215546Sopenharmony_ci   if (variable)
726bf215546Sopenharmony_ci      return max;
727bf215546Sopenharmony_ci
728bf215546Sopenharmony_ci   return sizes[0] * sizes[1] * sizes[2];
729bf215546Sopenharmony_ci}
730bf215546Sopenharmony_ci
731bf215546Sopenharmony_ciunsigned ac_compute_lshs_workgroup_size(enum amd_gfx_level gfx_level, gl_shader_stage stage,
732bf215546Sopenharmony_ci                                        unsigned tess_num_patches,
733bf215546Sopenharmony_ci                                        unsigned tess_patch_in_vtx,
734bf215546Sopenharmony_ci                                        unsigned tess_patch_out_vtx)
735bf215546Sopenharmony_ci{
736bf215546Sopenharmony_ci   /* When tessellation is used, API VS runs on HW LS, API TCS runs on HW HS.
737bf215546Sopenharmony_ci    * These two HW stages are merged on GFX9+.
738bf215546Sopenharmony_ci    */
739bf215546Sopenharmony_ci
740bf215546Sopenharmony_ci   bool merged_shaders = gfx_level >= GFX9;
741bf215546Sopenharmony_ci   unsigned ls_workgroup_size = tess_num_patches * tess_patch_in_vtx;
742bf215546Sopenharmony_ci   unsigned hs_workgroup_size = tess_num_patches * tess_patch_out_vtx;
743bf215546Sopenharmony_ci
744bf215546Sopenharmony_ci   if (merged_shaders)
745bf215546Sopenharmony_ci      return MAX2(ls_workgroup_size, hs_workgroup_size);
746bf215546Sopenharmony_ci   else if (stage == MESA_SHADER_VERTEX)
747bf215546Sopenharmony_ci      return ls_workgroup_size;
748bf215546Sopenharmony_ci   else if (stage == MESA_SHADER_TESS_CTRL)
749bf215546Sopenharmony_ci      return hs_workgroup_size;
750bf215546Sopenharmony_ci   else
751bf215546Sopenharmony_ci      unreachable("invalid LSHS shader stage");
752bf215546Sopenharmony_ci}
753bf215546Sopenharmony_ci
754bf215546Sopenharmony_ciunsigned ac_compute_esgs_workgroup_size(enum amd_gfx_level gfx_level, unsigned wave_size,
755bf215546Sopenharmony_ci                                        unsigned es_verts, unsigned gs_inst_prims)
756bf215546Sopenharmony_ci{
757bf215546Sopenharmony_ci   /* ESGS may operate in workgroups if on-chip GS (LDS rings) are enabled.
758bf215546Sopenharmony_ci    *
759bf215546Sopenharmony_ci    * GFX6: Not possible in the HW.
760bf215546Sopenharmony_ci    * GFX7-8 (unmerged): possible in the HW, but not implemented in Mesa.
761bf215546Sopenharmony_ci    * GFX9+ (merged): implemented in Mesa.
762bf215546Sopenharmony_ci    */
763bf215546Sopenharmony_ci
764bf215546Sopenharmony_ci   if (gfx_level <= GFX8)
765bf215546Sopenharmony_ci      return wave_size;
766bf215546Sopenharmony_ci
767bf215546Sopenharmony_ci   unsigned workgroup_size = MAX2(es_verts, gs_inst_prims);
768bf215546Sopenharmony_ci   return CLAMP(workgroup_size, 1, 256);
769bf215546Sopenharmony_ci}
770bf215546Sopenharmony_ci
771bf215546Sopenharmony_ciunsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims,
772bf215546Sopenharmony_ci                                       unsigned max_vtx_out, unsigned prim_amp_factor)
773bf215546Sopenharmony_ci{
774bf215546Sopenharmony_ci   /* NGG always operates in workgroups.
775bf215546Sopenharmony_ci    *
776bf215546Sopenharmony_ci    * For API VS/TES/GS:
777bf215546Sopenharmony_ci    * - 1 invocation per input vertex
778bf215546Sopenharmony_ci    * - 1 invocation per input primitive
779bf215546Sopenharmony_ci    *
780bf215546Sopenharmony_ci    * The same invocation can process both an input vertex and primitive,
781bf215546Sopenharmony_ci    * however 1 invocation can only output up to 1 vertex and 1 primitive.
782bf215546Sopenharmony_ci    */
783bf215546Sopenharmony_ci
784bf215546Sopenharmony_ci   unsigned max_vtx_in = es_verts < 256 ? es_verts : 3 * gs_inst_prims;
785bf215546Sopenharmony_ci   unsigned max_prim_in = gs_inst_prims;
786bf215546Sopenharmony_ci   unsigned max_prim_out = gs_inst_prims * prim_amp_factor;
787bf215546Sopenharmony_ci   unsigned workgroup_size = MAX4(max_vtx_in, max_vtx_out, max_prim_in, max_prim_out);
788bf215546Sopenharmony_ci
789bf215546Sopenharmony_ci   return CLAMP(workgroup_size, 1, 256);
790bf215546Sopenharmony_ci}
791bf215546Sopenharmony_ci
792bf215546Sopenharmony_civoid ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t clear_mask,
793bf215546Sopenharmony_ci                      unsigned value_shift, const struct radeon_info *info,
794bf215546Sopenharmony_ci                      void set_sh_reg(void*, unsigned, uint32_t))
795bf215546Sopenharmony_ci{
796bf215546Sopenharmony_ci   /* Register field position and mask. */
797bf215546Sopenharmony_ci   uint32_t cu_en_mask = ~clear_mask;
798bf215546Sopenharmony_ci   unsigned cu_en_shift = ffs(cu_en_mask) - 1;
799bf215546Sopenharmony_ci   /* The value being set. */
800bf215546Sopenharmony_ci   uint32_t cu_en = (value & cu_en_mask) >> cu_en_shift;
801bf215546Sopenharmony_ci
802bf215546Sopenharmony_ci   /* AND the field by spi_cu_en. */
803bf215546Sopenharmony_ci   uint32_t spi_cu_en = info->spi_cu_en >> value_shift;
804bf215546Sopenharmony_ci   uint32_t new_value = (value & ~cu_en_mask) |
805bf215546Sopenharmony_ci                        (((cu_en & spi_cu_en) << cu_en_shift) & cu_en_mask);
806bf215546Sopenharmony_ci
807bf215546Sopenharmony_ci   set_sh_reg(cs, reg_offset, new_value);
808bf215546Sopenharmony_ci}
809bf215546Sopenharmony_ci
810bf215546Sopenharmony_ci/* Return the register value and tune bytes_per_wave to increase scratch performance. */
811bf215546Sopenharmony_civoid ac_get_scratch_tmpring_size(const struct radeon_info *info,
812bf215546Sopenharmony_ci                                 unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
813bf215546Sopenharmony_ci                                 uint32_t *tmpring_size)
814bf215546Sopenharmony_ci{
815bf215546Sopenharmony_ci   /* SPI_TMPRING_SIZE and COMPUTE_TMPRING_SIZE are essentially scratch buffer descriptors.
816bf215546Sopenharmony_ci    * WAVES means NUM_RECORDS. WAVESIZE is the size of each element, meaning STRIDE.
817bf215546Sopenharmony_ci    * Thus, WAVESIZE must be constant while the scratch buffer is being used by the GPU.
818bf215546Sopenharmony_ci    *
819bf215546Sopenharmony_ci    * If you want to increase WAVESIZE without waiting for idle, you need to allocate a new
820bf215546Sopenharmony_ci    * scratch buffer and use it instead. This will result in multiple scratch buffers being
821bf215546Sopenharmony_ci    * used at the same time, each with a different WAVESIZE.
822bf215546Sopenharmony_ci    *
823bf215546Sopenharmony_ci    * If you want to decrease WAVESIZE, you don't have to. There is no advantage in decreasing
824bf215546Sopenharmony_ci    * WAVESIZE after it's been increased.
825bf215546Sopenharmony_ci    *
826bf215546Sopenharmony_ci    * Shaders with SCRATCH_EN=0 don't allocate scratch space.
827bf215546Sopenharmony_ci    */
828bf215546Sopenharmony_ci   const unsigned size_shift = info->gfx_level >= GFX11 ? 8 : 10;
829bf215546Sopenharmony_ci   const unsigned min_size_per_wave = BITFIELD_BIT(size_shift);
830bf215546Sopenharmony_ci
831bf215546Sopenharmony_ci   /* The LLVM shader backend should be reporting aligned scratch_sizes. */
832bf215546Sopenharmony_ci   assert((bytes_per_wave & BITFIELD_MASK(size_shift)) == 0 &&
833bf215546Sopenharmony_ci          "scratch size per wave should be aligned");
834bf215546Sopenharmony_ci
835bf215546Sopenharmony_ci   /* Add 1 scratch item to make the number of items odd. This should improve scratch
836bf215546Sopenharmony_ci    * performance by more randomly distributing scratch waves among memory channels.
837bf215546Sopenharmony_ci    */
838bf215546Sopenharmony_ci   if (bytes_per_wave)
839bf215546Sopenharmony_ci      bytes_per_wave |= min_size_per_wave;
840bf215546Sopenharmony_ci
841bf215546Sopenharmony_ci   *max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave);
842bf215546Sopenharmony_ci
843bf215546Sopenharmony_ci   unsigned max_scratch_waves = info->max_scratch_waves;
844bf215546Sopenharmony_ci   if (info->gfx_level >= GFX11)
845bf215546Sopenharmony_ci      max_scratch_waves /= info->num_se; /* WAVES is per SE */
846bf215546Sopenharmony_ci
847bf215546Sopenharmony_ci   /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */
848bf215546Sopenharmony_ci   *tmpring_size = S_0286E8_WAVES(max_scratch_waves) |
849bf215546Sopenharmony_ci                   S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> size_shift);
850bf215546Sopenharmony_ci}
851