/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
24bf215546Sopenharmony_ci
25bf215546Sopenharmony_ci#include "si_pipe.h"
26bf215546Sopenharmony_ci#include "tgsi/tgsi_text.h"
27bf215546Sopenharmony_ci#include "tgsi/tgsi_ureg.h"
28bf215546Sopenharmony_ci
29bf215546Sopenharmony_civoid *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers)
30bf215546Sopenharmony_ci{
31bf215546Sopenharmony_ci   unsigned vs_blit_property;
32bf215546Sopenharmony_ci   void **vs;
33bf215546Sopenharmony_ci
34bf215546Sopenharmony_ci   switch (type) {
35bf215546Sopenharmony_ci   case UTIL_BLITTER_ATTRIB_NONE:
36bf215546Sopenharmony_ci      vs = num_layers > 1 ? &sctx->vs_blit_pos_layered : &sctx->vs_blit_pos;
37bf215546Sopenharmony_ci      vs_blit_property = SI_VS_BLIT_SGPRS_POS;
38bf215546Sopenharmony_ci      break;
39bf215546Sopenharmony_ci   case UTIL_BLITTER_ATTRIB_COLOR:
40bf215546Sopenharmony_ci      vs = num_layers > 1 ? &sctx->vs_blit_color_layered : &sctx->vs_blit_color;
41bf215546Sopenharmony_ci      vs_blit_property = SI_VS_BLIT_SGPRS_POS_COLOR;
42bf215546Sopenharmony_ci      break;
43bf215546Sopenharmony_ci   case UTIL_BLITTER_ATTRIB_TEXCOORD_XY:
44bf215546Sopenharmony_ci   case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW:
45bf215546Sopenharmony_ci      assert(num_layers == 1);
46bf215546Sopenharmony_ci      vs = &sctx->vs_blit_texcoord;
47bf215546Sopenharmony_ci      vs_blit_property = SI_VS_BLIT_SGPRS_POS_TEXCOORD;
48bf215546Sopenharmony_ci      break;
49bf215546Sopenharmony_ci   default:
50bf215546Sopenharmony_ci      assert(0);
51bf215546Sopenharmony_ci      return NULL;
52bf215546Sopenharmony_ci   }
53bf215546Sopenharmony_ci   if (*vs)
54bf215546Sopenharmony_ci      return *vs;
55bf215546Sopenharmony_ci
56bf215546Sopenharmony_ci   struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX);
57bf215546Sopenharmony_ci   if (!ureg)
58bf215546Sopenharmony_ci      return NULL;
59bf215546Sopenharmony_ci
60bf215546Sopenharmony_ci   /* Tell the shader to load VS inputs from SGPRs: */
61bf215546Sopenharmony_ci   ureg_property(ureg, TGSI_PROPERTY_VS_BLIT_SGPRS_AMD, vs_blit_property);
62bf215546Sopenharmony_ci   ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, true);
63bf215546Sopenharmony_ci
64bf215546Sopenharmony_ci   /* This is just a pass-through shader with 1-3 MOV instructions. */
65bf215546Sopenharmony_ci   ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0), ureg_DECL_vs_input(ureg, 0));
66bf215546Sopenharmony_ci
67bf215546Sopenharmony_ci   if (type != UTIL_BLITTER_ATTRIB_NONE) {
68bf215546Sopenharmony_ci      ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0), ureg_DECL_vs_input(ureg, 1));
69bf215546Sopenharmony_ci   }
70bf215546Sopenharmony_ci
71bf215546Sopenharmony_ci   if (num_layers > 1) {
72bf215546Sopenharmony_ci      struct ureg_src instance_id = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_INSTANCEID, 0);
73bf215546Sopenharmony_ci      struct ureg_dst layer = ureg_DECL_output(ureg, TGSI_SEMANTIC_LAYER, 0);
74bf215546Sopenharmony_ci
75bf215546Sopenharmony_ci      ureg_MOV(ureg, ureg_writemask(layer, TGSI_WRITEMASK_X),
76bf215546Sopenharmony_ci               ureg_scalar(instance_id, TGSI_SWIZZLE_X));
77bf215546Sopenharmony_ci   }
78bf215546Sopenharmony_ci   ureg_END(ureg);
79bf215546Sopenharmony_ci
80bf215546Sopenharmony_ci   *vs = ureg_create_shader_and_destroy(ureg, &sctx->b);
81bf215546Sopenharmony_ci   return *vs;
82bf215546Sopenharmony_ci}
83bf215546Sopenharmony_ci
84bf215546Sopenharmony_ci/* Create a compute shader implementing clear_buffer or copy_buffer. */
85bf215546Sopenharmony_civoid *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread,
86bf215546Sopenharmony_ci                                   bool dst_stream_cache_policy, bool is_copy)
87bf215546Sopenharmony_ci{
88bf215546Sopenharmony_ci   struct si_screen *sscreen = (struct si_screen *)ctx->screen;
89bf215546Sopenharmony_ci   assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
90bf215546Sopenharmony_ci
91bf215546Sopenharmony_ci   unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT;
92bf215546Sopenharmony_ci   if (dst_stream_cache_policy)
93bf215546Sopenharmony_ci      store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY;
94bf215546Sopenharmony_ci
95bf215546Sopenharmony_ci   /* Don't cache loads, because there is no reuse. */
96bf215546Sopenharmony_ci   unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY;
97bf215546Sopenharmony_ci
98bf215546Sopenharmony_ci   unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
99bf215546Sopenharmony_ci   unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
100bf215546Sopenharmony_ci
101bf215546Sopenharmony_ci   for (unsigned i = 0; i < num_mem_ops; i++) {
102bf215546Sopenharmony_ci      if (i * 4 < num_dwords_per_thread)
103bf215546Sopenharmony_ci         inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4);
104bf215546Sopenharmony_ci   }
105bf215546Sopenharmony_ci
106bf215546Sopenharmony_ci   struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
107bf215546Sopenharmony_ci   if (!ureg)
108bf215546Sopenharmony_ci      return NULL;
109bf215546Sopenharmony_ci
110bf215546Sopenharmony_ci   unsigned default_wave_size = si_determine_wave_size(sscreen, NULL);
111bf215546Sopenharmony_ci
112bf215546Sopenharmony_ci   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, default_wave_size);
113bf215546Sopenharmony_ci   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
114bf215546Sopenharmony_ci   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
115bf215546Sopenharmony_ci
116bf215546Sopenharmony_ci   struct ureg_src value;
117bf215546Sopenharmony_ci   if (!is_copy) {
118bf215546Sopenharmony_ci      ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD, inst_dwords[0]);
119bf215546Sopenharmony_ci      value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA_AMD, 0);
120bf215546Sopenharmony_ci   }
121bf215546Sopenharmony_ci
122bf215546Sopenharmony_ci   struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
123bf215546Sopenharmony_ci   struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
124bf215546Sopenharmony_ci   struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
125bf215546Sopenharmony_ci   struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
126bf215546Sopenharmony_ci   struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false));
127bf215546Sopenharmony_ci   struct ureg_src srcbuf;
128bf215546Sopenharmony_ci   struct ureg_src *values = NULL;
129bf215546Sopenharmony_ci
130bf215546Sopenharmony_ci   if (is_copy) {
131bf215546Sopenharmony_ci      srcbuf = ureg_DECL_buffer(ureg, 1, false);
132bf215546Sopenharmony_ci      values = malloc(num_mem_ops * sizeof(struct ureg_src));
133bf215546Sopenharmony_ci   }
134bf215546Sopenharmony_ci
135bf215546Sopenharmony_ci   /* If there are multiple stores, the first store writes into 0*wavesize+tid,
136bf215546Sopenharmony_ci    * the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc.
137bf215546Sopenharmony_ci    */
138bf215546Sopenharmony_ci   ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, default_wave_size * num_mem_ops),
139bf215546Sopenharmony_ci             tid);
140bf215546Sopenharmony_ci   /* Convert from a "store size unit" into bytes. */
141bf215546Sopenharmony_ci   ureg_UMUL(ureg, store_addr, ureg_src(store_addr), ureg_imm1u(ureg, 4 * inst_dwords[0]));
142bf215546Sopenharmony_ci   ureg_MOV(ureg, load_addr, ureg_src(store_addr));
143bf215546Sopenharmony_ci
144bf215546Sopenharmony_ci   /* Distance between a load and a store for latency hiding. */
145bf215546Sopenharmony_ci   unsigned load_store_distance = is_copy ? 8 : 0;
146bf215546Sopenharmony_ci
147bf215546Sopenharmony_ci   for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
148bf215546Sopenharmony_ci      int d = i - load_store_distance;
149bf215546Sopenharmony_ci
150bf215546Sopenharmony_ci      if (is_copy && i < num_mem_ops) {
151bf215546Sopenharmony_ci         if (i) {
152bf215546Sopenharmony_ci            ureg_UADD(ureg, load_addr, ureg_src(load_addr),
153bf215546Sopenharmony_ci                      ureg_imm1u(ureg, 4 * inst_dwords[i] * default_wave_size));
154bf215546Sopenharmony_ci         }
155bf215546Sopenharmony_ci
156bf215546Sopenharmony_ci         values[i] = ureg_src(ureg_DECL_temporary(ureg));
157bf215546Sopenharmony_ci         struct ureg_dst dst =
158bf215546Sopenharmony_ci            ureg_writemask(ureg_dst(values[i]), u_bit_consecutive(0, inst_dwords[i]));
159bf215546Sopenharmony_ci         struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)};
160bf215546Sopenharmony_ci         ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2, load_qualifier,
161bf215546Sopenharmony_ci                          TGSI_TEXTURE_BUFFER, 0);
162bf215546Sopenharmony_ci      }
163bf215546Sopenharmony_ci
164bf215546Sopenharmony_ci      if (d >= 0) {
165bf215546Sopenharmony_ci         if (d) {
166bf215546Sopenharmony_ci            ureg_UADD(ureg, store_addr, ureg_src(store_addr),
167bf215546Sopenharmony_ci                      ureg_imm1u(ureg, 4 * inst_dwords[d] * default_wave_size));
168bf215546Sopenharmony_ci         }
169bf215546Sopenharmony_ci
170bf215546Sopenharmony_ci         struct ureg_dst dst = ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d]));
171bf215546Sopenharmony_ci         struct ureg_src srcs[] = {ureg_src(store_addr), is_copy ? values[d] : value};
172bf215546Sopenharmony_ci         ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2, store_qualifier,
173bf215546Sopenharmony_ci                          TGSI_TEXTURE_BUFFER, 0);
174bf215546Sopenharmony_ci      }
175bf215546Sopenharmony_ci   }
176bf215546Sopenharmony_ci   ureg_END(ureg);
177bf215546Sopenharmony_ci
178bf215546Sopenharmony_ci   struct pipe_compute_state state = {};
179bf215546Sopenharmony_ci   state.ir_type = PIPE_SHADER_IR_TGSI;
180bf215546Sopenharmony_ci   state.prog = ureg_get_tokens(ureg, NULL);
181bf215546Sopenharmony_ci
182bf215546Sopenharmony_ci   void *cs = ctx->create_compute_state(ctx, &state);
183bf215546Sopenharmony_ci   ureg_destroy(ureg);
184bf215546Sopenharmony_ci   ureg_free_tokens(state.prog);
185bf215546Sopenharmony_ci
186bf215546Sopenharmony_ci   free(values);
187bf215546Sopenharmony_ci   return cs;
188bf215546Sopenharmony_ci}
189bf215546Sopenharmony_ci
190bf215546Sopenharmony_ci/* Create the compute shader that is used to collect the results.
191bf215546Sopenharmony_ci *
192bf215546Sopenharmony_ci * One compute grid with a single thread is launched for every query result
193bf215546Sopenharmony_ci * buffer. The thread (optionally) reads a previous summary buffer, then
194bf215546Sopenharmony_ci * accumulates data from the query result buffer, and writes the result either
195bf215546Sopenharmony_ci * to a summary buffer to be consumed by the next grid invocation or to the
196bf215546Sopenharmony_ci * user-supplied buffer.
197bf215546Sopenharmony_ci *
198bf215546Sopenharmony_ci * Data layout:
199bf215546Sopenharmony_ci *
200bf215546Sopenharmony_ci * CONST
201bf215546Sopenharmony_ci *  0.x = end_offset
202bf215546Sopenharmony_ci *  0.y = result_stride
203bf215546Sopenharmony_ci *  0.z = result_count
204bf215546Sopenharmony_ci *  0.w = bit field:
205bf215546Sopenharmony_ci *          1: read previously accumulated values
206bf215546Sopenharmony_ci *          2: write accumulated values for chaining
207bf215546Sopenharmony_ci *          4: write result available
208bf215546Sopenharmony_ci *          8: convert result to boolean (0/1)
209bf215546Sopenharmony_ci *         16: only read one dword and use that as result
210bf215546Sopenharmony_ci *         32: apply timestamp conversion
211bf215546Sopenharmony_ci *         64: store full 64 bits result
212bf215546Sopenharmony_ci *        128: store signed 32 bits result
213bf215546Sopenharmony_ci *        256: SO_OVERFLOW mode: take the difference of two successive half-pairs
214bf215546Sopenharmony_ci *  1.x = fence_offset
215bf215546Sopenharmony_ci *  1.y = pair_stride
216bf215546Sopenharmony_ci *  1.z = pair_count
217bf215546Sopenharmony_ci *
218bf215546Sopenharmony_ci * BUFFER[0] = query result buffer
219bf215546Sopenharmony_ci * BUFFER[1] = previous summary buffer
220bf215546Sopenharmony_ci * BUFFER[2] = next summary buffer or user-supplied buffer
221bf215546Sopenharmony_ci */
222bf215546Sopenharmony_civoid *si_create_query_result_cs(struct si_context *sctx)
223bf215546Sopenharmony_ci{
224bf215546Sopenharmony_ci   /* TEMP[0].xy = accumulated result so far
225bf215546Sopenharmony_ci    * TEMP[0].z = result not available
226bf215546Sopenharmony_ci    *
227bf215546Sopenharmony_ci    * TEMP[1].x = current result index
228bf215546Sopenharmony_ci    * TEMP[1].y = current pair index
229bf215546Sopenharmony_ci    */
230bf215546Sopenharmony_ci   static const char text_tmpl[] =
231bf215546Sopenharmony_ci      "COMP\n"
232bf215546Sopenharmony_ci      "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
233bf215546Sopenharmony_ci      "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
234bf215546Sopenharmony_ci      "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
235bf215546Sopenharmony_ci      "DCL BUFFER[0]\n"
236bf215546Sopenharmony_ci      "DCL BUFFER[1]\n"
237bf215546Sopenharmony_ci      "DCL BUFFER[2]\n"
238bf215546Sopenharmony_ci      "DCL CONST[0][0..1]\n"
239bf215546Sopenharmony_ci      "DCL TEMP[0..5]\n"
240bf215546Sopenharmony_ci      "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
241bf215546Sopenharmony_ci      "IMM[1] UINT32 {1, 2, 4, 8}\n"
242bf215546Sopenharmony_ci      "IMM[2] UINT32 {16, 32, 64, 128}\n"
243bf215546Sopenharmony_ci      "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
244bf215546Sopenharmony_ci      "IMM[4] UINT32 {256, 0, 0, 0}\n"
245bf215546Sopenharmony_ci
246bf215546Sopenharmony_ci      "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"
247bf215546Sopenharmony_ci      "UIF TEMP[5]\n"
248bf215546Sopenharmony_ci      /* Check result availability. */
249bf215546Sopenharmony_ci      "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"
250bf215546Sopenharmony_ci      "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
251bf215546Sopenharmony_ci      "MOV TEMP[1], TEMP[0].zzzz\n"
252bf215546Sopenharmony_ci      "NOT TEMP[0].z, TEMP[0].zzzz\n"
253bf215546Sopenharmony_ci
254bf215546Sopenharmony_ci      /* Load result if available. */
255bf215546Sopenharmony_ci      "UIF TEMP[1]\n"
256bf215546Sopenharmony_ci      "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
257bf215546Sopenharmony_ci      "ENDIF\n"
258bf215546Sopenharmony_ci      "ELSE\n"
259bf215546Sopenharmony_ci      /* Load previously accumulated result if requested. */
260bf215546Sopenharmony_ci      "MOV TEMP[0], IMM[0].xxxx\n"
261bf215546Sopenharmony_ci      "AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"
262bf215546Sopenharmony_ci      "UIF TEMP[4]\n"
263bf215546Sopenharmony_ci      "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
264bf215546Sopenharmony_ci      "ENDIF\n"
265bf215546Sopenharmony_ci
266bf215546Sopenharmony_ci      "MOV TEMP[1].x, IMM[0].xxxx\n"
267bf215546Sopenharmony_ci      "BGNLOOP\n"
268bf215546Sopenharmony_ci      /* Break if accumulated result so far is not available. */
269bf215546Sopenharmony_ci      "UIF TEMP[0].zzzz\n"
270bf215546Sopenharmony_ci      "BRK\n"
271bf215546Sopenharmony_ci      "ENDIF\n"
272bf215546Sopenharmony_ci
273bf215546Sopenharmony_ci      /* Break if result_index >= result_count. */
274bf215546Sopenharmony_ci      "USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"
275bf215546Sopenharmony_ci      "UIF TEMP[5]\n"
276bf215546Sopenharmony_ci      "BRK\n"
277bf215546Sopenharmony_ci      "ENDIF\n"
278bf215546Sopenharmony_ci
279bf215546Sopenharmony_ci      /* Load fence and check result availability */
280bf215546Sopenharmony_ci      "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"
281bf215546Sopenharmony_ci      "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
282bf215546Sopenharmony_ci      "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
283bf215546Sopenharmony_ci      "NOT TEMP[0].z, TEMP[0].zzzz\n"
284bf215546Sopenharmony_ci      "UIF TEMP[0].zzzz\n"
285bf215546Sopenharmony_ci      "BRK\n"
286bf215546Sopenharmony_ci      "ENDIF\n"
287bf215546Sopenharmony_ci
288bf215546Sopenharmony_ci      "MOV TEMP[1].y, IMM[0].xxxx\n"
289bf215546Sopenharmony_ci      "BGNLOOP\n"
290bf215546Sopenharmony_ci      /* Load start and end. */
291bf215546Sopenharmony_ci      "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"
292bf215546Sopenharmony_ci      "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"
293bf215546Sopenharmony_ci      "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
294bf215546Sopenharmony_ci
295bf215546Sopenharmony_ci      "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"
296bf215546Sopenharmony_ci      "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
297bf215546Sopenharmony_ci
298bf215546Sopenharmony_ci      "U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"
299bf215546Sopenharmony_ci
300bf215546Sopenharmony_ci      "AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"
301bf215546Sopenharmony_ci      "UIF TEMP[5].zzzz\n"
302bf215546Sopenharmony_ci      /* Load second start/end half-pair and
303bf215546Sopenharmony_ci       * take the difference
304bf215546Sopenharmony_ci       */
305bf215546Sopenharmony_ci      "UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"
306bf215546Sopenharmony_ci      "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
307bf215546Sopenharmony_ci      "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
308bf215546Sopenharmony_ci
309bf215546Sopenharmony_ci      "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
310bf215546Sopenharmony_ci      "U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"
311bf215546Sopenharmony_ci      "ENDIF\n"
312bf215546Sopenharmony_ci
313bf215546Sopenharmony_ci      "U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"
314bf215546Sopenharmony_ci
315bf215546Sopenharmony_ci      /* Increment pair index */
316bf215546Sopenharmony_ci      "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
317bf215546Sopenharmony_ci      "USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"
318bf215546Sopenharmony_ci      "UIF TEMP[5]\n"
319bf215546Sopenharmony_ci      "BRK\n"
320bf215546Sopenharmony_ci      "ENDIF\n"
321bf215546Sopenharmony_ci      "ENDLOOP\n"
322bf215546Sopenharmony_ci
323bf215546Sopenharmony_ci      /* Increment result index */
324bf215546Sopenharmony_ci      "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
325bf215546Sopenharmony_ci      "ENDLOOP\n"
326bf215546Sopenharmony_ci      "ENDIF\n"
327bf215546Sopenharmony_ci
328bf215546Sopenharmony_ci      "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"
329bf215546Sopenharmony_ci      "UIF TEMP[4]\n"
330bf215546Sopenharmony_ci      /* Store accumulated data for chaining. */
331bf215546Sopenharmony_ci      "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
332bf215546Sopenharmony_ci      "ELSE\n"
333bf215546Sopenharmony_ci      "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"
334bf215546Sopenharmony_ci      "UIF TEMP[4]\n"
335bf215546Sopenharmony_ci      /* Store result availability. */
336bf215546Sopenharmony_ci      "NOT TEMP[0].z, TEMP[0]\n"
337bf215546Sopenharmony_ci      "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
338bf215546Sopenharmony_ci      "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"
339bf215546Sopenharmony_ci
340bf215546Sopenharmony_ci      "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
341bf215546Sopenharmony_ci      "UIF TEMP[4]\n"
342bf215546Sopenharmony_ci      "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
343bf215546Sopenharmony_ci      "ENDIF\n"
344bf215546Sopenharmony_ci      "ELSE\n"
345bf215546Sopenharmony_ci      /* Store result if it is available. */
346bf215546Sopenharmony_ci      "NOT TEMP[4], TEMP[0].zzzz\n"
347bf215546Sopenharmony_ci      "UIF TEMP[4]\n"
348bf215546Sopenharmony_ci      /* Apply timestamp conversion */
349bf215546Sopenharmony_ci      "AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"
350bf215546Sopenharmony_ci      "UIF TEMP[4]\n"
351bf215546Sopenharmony_ci      "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
352bf215546Sopenharmony_ci      "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
353bf215546Sopenharmony_ci      "ENDIF\n"
354bf215546Sopenharmony_ci
355bf215546Sopenharmony_ci      /* Convert to boolean */
356bf215546Sopenharmony_ci      "AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"
357bf215546Sopenharmony_ci      "UIF TEMP[4]\n"
358bf215546Sopenharmony_ci      "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
359bf215546Sopenharmony_ci      "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
360bf215546Sopenharmony_ci      "MOV TEMP[0].y, IMM[0].xxxx\n"
361bf215546Sopenharmony_ci      "ENDIF\n"
362bf215546Sopenharmony_ci
363bf215546Sopenharmony_ci      "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
364bf215546Sopenharmony_ci      "UIF TEMP[4]\n"
365bf215546Sopenharmony_ci      "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
366bf215546Sopenharmony_ci      "ELSE\n"
367bf215546Sopenharmony_ci      /* Clamping */
368bf215546Sopenharmony_ci      "UIF TEMP[0].yyyy\n"
369bf215546Sopenharmony_ci      "MOV TEMP[0].x, IMM[0].wwww\n"
370bf215546Sopenharmony_ci      "ENDIF\n"
371bf215546Sopenharmony_ci
372bf215546Sopenharmony_ci      "AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"
373bf215546Sopenharmony_ci      "UIF TEMP[4]\n"
374bf215546Sopenharmony_ci      "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
375bf215546Sopenharmony_ci      "ENDIF\n"
376bf215546Sopenharmony_ci
377bf215546Sopenharmony_ci      "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
378bf215546Sopenharmony_ci      "ENDIF\n"
379bf215546Sopenharmony_ci      "ENDIF\n"
380bf215546Sopenharmony_ci      "ENDIF\n"
381bf215546Sopenharmony_ci      "ENDIF\n"
382bf215546Sopenharmony_ci
383bf215546Sopenharmony_ci      "END\n";
384bf215546Sopenharmony_ci
385bf215546Sopenharmony_ci   char text[sizeof(text_tmpl) + 32];
386bf215546Sopenharmony_ci   struct tgsi_token tokens[1024];
387bf215546Sopenharmony_ci   struct pipe_compute_state state = {};
388bf215546Sopenharmony_ci
389bf215546Sopenharmony_ci   /* Hard code the frequency into the shader so that the backend can
390bf215546Sopenharmony_ci    * use the full range of optimizations for divide-by-constant.
391bf215546Sopenharmony_ci    */
392bf215546Sopenharmony_ci   snprintf(text, sizeof(text), text_tmpl, sctx->screen->info.clock_crystal_freq);
393bf215546Sopenharmony_ci
394bf215546Sopenharmony_ci   if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
395bf215546Sopenharmony_ci      assert(false);
396bf215546Sopenharmony_ci      return NULL;
397bf215546Sopenharmony_ci   }
398bf215546Sopenharmony_ci
399bf215546Sopenharmony_ci   state.ir_type = PIPE_SHADER_IR_TGSI;
400bf215546Sopenharmony_ci   state.prog = tokens;
401bf215546Sopenharmony_ci
402bf215546Sopenharmony_ci   return sctx->b.create_compute_state(&sctx->b, &state);
403bf215546Sopenharmony_ci}
404bf215546Sopenharmony_ci
405bf215546Sopenharmony_civoid *si_clear_render_target_shader(struct pipe_context *ctx)
406bf215546Sopenharmony_ci{
407bf215546Sopenharmony_ci   static const char text[] =
408bf215546Sopenharmony_ci      "COMP\n"
409bf215546Sopenharmony_ci      "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"
410bf215546Sopenharmony_ci      "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"
411bf215546Sopenharmony_ci      "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
412bf215546Sopenharmony_ci      "DCL SV[0], THREAD_ID\n"
413bf215546Sopenharmony_ci      "DCL SV[1], BLOCK_ID\n"
414bf215546Sopenharmony_ci      "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
415bf215546Sopenharmony_ci      "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
416bf215546Sopenharmony_ci      "DCL TEMP[0..3], LOCAL\n"
417bf215546Sopenharmony_ci      "IMM[0] UINT32 {8, 1, 0, 0}\n"
418bf215546Sopenharmony_ci      "MOV TEMP[0].xyz, CONST[0][0].xyzw\n"
419bf215546Sopenharmony_ci      "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n"
420bf215546Sopenharmony_ci      "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n"
421bf215546Sopenharmony_ci      "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n"
422bf215546Sopenharmony_ci      "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
423bf215546Sopenharmony_ci      "END\n";
424bf215546Sopenharmony_ci
425bf215546Sopenharmony_ci   struct tgsi_token tokens[1024];
426bf215546Sopenharmony_ci   struct pipe_compute_state state = {0};
427bf215546Sopenharmony_ci
428bf215546Sopenharmony_ci   if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
429bf215546Sopenharmony_ci      assert(false);
430bf215546Sopenharmony_ci      return NULL;
431bf215546Sopenharmony_ci   }
432bf215546Sopenharmony_ci
433bf215546Sopenharmony_ci   state.ir_type = PIPE_SHADER_IR_TGSI;
434bf215546Sopenharmony_ci   state.prog = tokens;
435bf215546Sopenharmony_ci
436bf215546Sopenharmony_ci   return ctx->create_compute_state(ctx, &state);
437bf215546Sopenharmony_ci}
438bf215546Sopenharmony_ci
439bf215546Sopenharmony_ci/* TODO: Didn't really test 1D_ARRAY */
440bf215546Sopenharmony_civoid *si_clear_render_target_shader_1d_array(struct pipe_context *ctx)
441bf215546Sopenharmony_ci{
442bf215546Sopenharmony_ci   static const char text[] =
443bf215546Sopenharmony_ci      "COMP\n"
444bf215546Sopenharmony_ci      "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
445bf215546Sopenharmony_ci      "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
446bf215546Sopenharmony_ci      "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
447bf215546Sopenharmony_ci      "DCL SV[0], THREAD_ID\n"
448bf215546Sopenharmony_ci      "DCL SV[1], BLOCK_ID\n"
449bf215546Sopenharmony_ci      "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
450bf215546Sopenharmony_ci      "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
451bf215546Sopenharmony_ci      "DCL TEMP[0..3], LOCAL\n"
452bf215546Sopenharmony_ci      "IMM[0] UINT32 {64, 1, 0, 0}\n"
453bf215546Sopenharmony_ci      "MOV TEMP[0].xy, CONST[0][0].xzzw\n"
454bf215546Sopenharmony_ci      "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
455bf215546Sopenharmony_ci      "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n"
456bf215546Sopenharmony_ci      "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n"
457bf215546Sopenharmony_ci      "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
458bf215546Sopenharmony_ci      "END\n";
459bf215546Sopenharmony_ci
460bf215546Sopenharmony_ci   struct tgsi_token tokens[1024];
461bf215546Sopenharmony_ci   struct pipe_compute_state state = {0};
462bf215546Sopenharmony_ci
463bf215546Sopenharmony_ci   if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
464bf215546Sopenharmony_ci      assert(false);
465bf215546Sopenharmony_ci      return NULL;
466bf215546Sopenharmony_ci   }
467bf215546Sopenharmony_ci
468bf215546Sopenharmony_ci   state.ir_type = PIPE_SHADER_IR_TGSI;
469bf215546Sopenharmony_ci   state.prog = tokens;
470bf215546Sopenharmony_ci
471bf215546Sopenharmony_ci   return ctx->create_compute_state(ctx, &state);
472bf215546Sopenharmony_ci}
473bf215546Sopenharmony_ci
474bf215546Sopenharmony_civoid *si_clear_12bytes_buffer_shader(struct pipe_context *ctx)
475bf215546Sopenharmony_ci{
476bf215546Sopenharmony_ci   static const char text[] = "COMP\n"
477bf215546Sopenharmony_ci                              "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
478bf215546Sopenharmony_ci                              "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
479bf215546Sopenharmony_ci                              "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
480bf215546Sopenharmony_ci                              "PROPERTY CS_USER_DATA_COMPONENTS_AMD 3\n"
481bf215546Sopenharmony_ci                              "DCL SV[0], THREAD_ID\n"
482bf215546Sopenharmony_ci                              "DCL SV[1], BLOCK_ID\n"
483bf215546Sopenharmony_ci                              "DCL SV[2], CS_USER_DATA_AMD\n"
484bf215546Sopenharmony_ci                              "DCL BUFFER[0]\n"
485bf215546Sopenharmony_ci                              "DCL TEMP[0..0]\n"
486bf215546Sopenharmony_ci                              "IMM[0] UINT32 {64, 1, 12, 0}\n"
487bf215546Sopenharmony_ci                              "UMAD TEMP[0].x, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
488bf215546Sopenharmony_ci                              "UMUL TEMP[0].x, TEMP[0].xyzz, IMM[0].zzzz\n" // 12 bytes
489bf215546Sopenharmony_ci                              "STORE BUFFER[0].xyz, TEMP[0].xxxx, SV[2].xyzz%s\n"
490bf215546Sopenharmony_ci                              "END\n";
491bf215546Sopenharmony_ci   char final_text[2048];
492bf215546Sopenharmony_ci   struct tgsi_token tokens[1024];
493bf215546Sopenharmony_ci   struct pipe_compute_state state = {0};
494bf215546Sopenharmony_ci
495bf215546Sopenharmony_ci   snprintf(final_text, sizeof(final_text), text,
496bf215546Sopenharmony_ci            SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ", STREAM_CACHE_POLICY" : "");
497bf215546Sopenharmony_ci
498bf215546Sopenharmony_ci   if (!tgsi_text_translate(final_text, tokens, ARRAY_SIZE(tokens))) {
499bf215546Sopenharmony_ci      assert(false);
500bf215546Sopenharmony_ci      return NULL;
501bf215546Sopenharmony_ci   }
502bf215546Sopenharmony_ci
503bf215546Sopenharmony_ci   state.ir_type = PIPE_SHADER_IR_TGSI;
504bf215546Sopenharmony_ci   state.prog = tokens;
505bf215546Sopenharmony_ci
506bf215546Sopenharmony_ci   return ctx->create_compute_state(ctx, &state);
507bf215546Sopenharmony_ci}
508bf215546Sopenharmony_ci
/* Load samples from the image, and copy them to the same image. This looks like
 * a no-op, but it's not. Loads use FMASK, while stores don't, so samples are
 * reordered to match expanded FMASK.
 *
 * After the shader finishes, FMASK should be cleared to identity.
 */
515bf215546Sopenharmony_civoid *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, bool is_array)
516bf215546Sopenharmony_ci{
517bf215546Sopenharmony_ci   enum tgsi_texture_type target = is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
518bf215546Sopenharmony_ci   struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
519bf215546Sopenharmony_ci   if (!ureg)
520bf215546Sopenharmony_ci      return NULL;
521bf215546Sopenharmony_ci
522bf215546Sopenharmony_ci   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 8);
523bf215546Sopenharmony_ci   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 8);
524bf215546Sopenharmony_ci   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
525bf215546Sopenharmony_ci
526bf215546Sopenharmony_ci   /* Compute the image coordinates. */
527bf215546Sopenharmony_ci   struct ureg_src image = ureg_DECL_image(ureg, 0, target, 0, true, false);
528bf215546Sopenharmony_ci   struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
529bf215546Sopenharmony_ci   struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
530bf215546Sopenharmony_ci   struct ureg_dst coord = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZW);
531bf215546Sopenharmony_ci   ureg_UMAD(ureg, ureg_writemask(coord, TGSI_WRITEMASK_XY), ureg_swizzle(blk, 0, 1, 1, 1),
532bf215546Sopenharmony_ci             ureg_imm2u(ureg, 8, 8), ureg_swizzle(tid, 0, 1, 1, 1));
533bf215546Sopenharmony_ci   if (is_array) {
534bf215546Sopenharmony_ci      ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_Z), ureg_scalar(blk, TGSI_SWIZZLE_Z));
535bf215546Sopenharmony_ci   }
536bf215546Sopenharmony_ci
537bf215546Sopenharmony_ci   /* Load samples, resolving FMASK. */
538bf215546Sopenharmony_ci   struct ureg_dst sample[8];
539bf215546Sopenharmony_ci   assert(num_samples <= ARRAY_SIZE(sample));
540bf215546Sopenharmony_ci
541bf215546Sopenharmony_ci   for (unsigned i = 0; i < num_samples; i++) {
542bf215546Sopenharmony_ci      sample[i] = ureg_DECL_temporary(ureg);
543bf215546Sopenharmony_ci
544bf215546Sopenharmony_ci      ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), ureg_imm1u(ureg, i));
545bf215546Sopenharmony_ci
546bf215546Sopenharmony_ci      struct ureg_src srcs[] = {image, ureg_src(coord)};
547bf215546Sopenharmony_ci      ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &sample[i], 1, srcs, 2, TGSI_MEMORY_RESTRICT, target,
548bf215546Sopenharmony_ci                       0);
549bf215546Sopenharmony_ci   }
550bf215546Sopenharmony_ci
551bf215546Sopenharmony_ci   /* Store samples, ignoring FMASK. */
552bf215546Sopenharmony_ci   for (unsigned i = 0; i < num_samples; i++) {
553bf215546Sopenharmony_ci      ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), ureg_imm1u(ureg, i));
554bf215546Sopenharmony_ci
555bf215546Sopenharmony_ci      struct ureg_dst dst_image = ureg_dst(image);
556bf215546Sopenharmony_ci      struct ureg_src srcs[] = {ureg_src(coord), ureg_src(sample[i])};
557bf215546Sopenharmony_ci      ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst_image, 1, srcs, 2, TGSI_MEMORY_RESTRICT,
558bf215546Sopenharmony_ci                       target, 0);
559bf215546Sopenharmony_ci   }
560bf215546Sopenharmony_ci   ureg_END(ureg);
561bf215546Sopenharmony_ci
562bf215546Sopenharmony_ci   struct pipe_compute_state state = {};
563bf215546Sopenharmony_ci   state.ir_type = PIPE_SHADER_IR_TGSI;
564bf215546Sopenharmony_ci   state.prog = ureg_get_tokens(ureg, NULL);
565bf215546Sopenharmony_ci
566bf215546Sopenharmony_ci   void *cs = ctx->create_compute_state(ctx, &state);
567bf215546Sopenharmony_ci   ureg_destroy(ureg);
568bf215546Sopenharmony_ci   return cs;
569bf215546Sopenharmony_ci}
570bf215546Sopenharmony_ci
571bf215546Sopenharmony_ci/* Create the compute shader that is used to collect the results of gfx10+
572bf215546Sopenharmony_ci * shader queries.
573bf215546Sopenharmony_ci *
574bf215546Sopenharmony_ci * One compute grid with a single thread is launched for every query result
575bf215546Sopenharmony_ci * buffer. The thread (optionally) reads a previous summary buffer, then
576bf215546Sopenharmony_ci * accumulates data from the query result buffer, and writes the result either
577bf215546Sopenharmony_ci * to a summary buffer to be consumed by the next grid invocation or to the
578bf215546Sopenharmony_ci * user-supplied buffer.
579bf215546Sopenharmony_ci *
580bf215546Sopenharmony_ci * Data layout:
581bf215546Sopenharmony_ci *
582bf215546Sopenharmony_ci * BUFFER[0] = query result buffer (layout is defined by gfx10_sh_query_buffer_mem)
583bf215546Sopenharmony_ci * BUFFER[1] = previous summary buffer
584bf215546Sopenharmony_ci * BUFFER[2] = next summary buffer or user-supplied buffer
585bf215546Sopenharmony_ci *
586bf215546Sopenharmony_ci * CONST
587bf215546Sopenharmony_ci *  0.x = config; the low 3 bits indicate the mode:
588bf215546Sopenharmony_ci *          0: sum up counts
589bf215546Sopenharmony_ci *          1: determine result availability and write it as a boolean
590bf215546Sopenharmony_ci *          2: SO_OVERFLOW
591bf215546Sopenharmony_ci *          3: SO_ANY_OVERFLOW
592bf215546Sopenharmony_ci *        the remaining bits form a bitfield:
593bf215546Sopenharmony_ci *          8: write result as a 64-bit value
594bf215546Sopenharmony_ci *  0.y = offset in bytes to counts or stream for SO_OVERFLOW mode
595bf215546Sopenharmony_ci *  0.z = chain bit field:
596bf215546Sopenharmony_ci *          1: have previous summary buffer
597bf215546Sopenharmony_ci *          2: write next summary buffer
598bf215546Sopenharmony_ci *  0.w = result_count
599bf215546Sopenharmony_ci */
600bf215546Sopenharmony_civoid *gfx10_create_sh_query_result_cs(struct si_context *sctx)
601bf215546Sopenharmony_ci{
602bf215546Sopenharmony_ci   /* TEMP[0].x = accumulated result so far
603bf215546Sopenharmony_ci    * TEMP[0].y = result missing
604bf215546Sopenharmony_ci    * TEMP[0].z = whether we're in overflow mode
605bf215546Sopenharmony_ci    */
606bf215546Sopenharmony_ci   static const char text_tmpl[] = "COMP\n"
607bf215546Sopenharmony_ci                                   "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
608bf215546Sopenharmony_ci                                   "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
609bf215546Sopenharmony_ci                                   "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
610bf215546Sopenharmony_ci                                   "DCL BUFFER[0]\n"
611bf215546Sopenharmony_ci                                   "DCL BUFFER[1]\n"
612bf215546Sopenharmony_ci                                   "DCL BUFFER[2]\n"
613bf215546Sopenharmony_ci                                   "DCL CONST[0][0..0]\n"
614bf215546Sopenharmony_ci                                   "DCL TEMP[0..5]\n"
615bf215546Sopenharmony_ci                                   "IMM[0] UINT32 {0, 7, 256, 4294967295}\n"
616bf215546Sopenharmony_ci                                   "IMM[1] UINT32 {1, 2, 4, 8}\n"
617bf215546Sopenharmony_ci                                   "IMM[2] UINT32 {16, 32, 64, 128}\n"
618bf215546Sopenharmony_ci
619bf215546Sopenharmony_ci                                   /*
620bf215546Sopenharmony_ci                                   acc_result = 0;
621bf215546Sopenharmony_ci                                   acc_missing = 0;
622bf215546Sopenharmony_ci                                   if (chain & 1) {
623bf215546Sopenharmony_ci                                           acc_result = buffer[1][0];
624bf215546Sopenharmony_ci                                           acc_missing = buffer[1][1];
625bf215546Sopenharmony_ci                                   }
626bf215546Sopenharmony_ci                                   */
627bf215546Sopenharmony_ci                                   "MOV TEMP[0].xy, IMM[0].xxxx\n"
628bf215546Sopenharmony_ci                                   "AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n"
629bf215546Sopenharmony_ci                                   "UIF TEMP[5]\n"
630bf215546Sopenharmony_ci                                   "LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n"
631bf215546Sopenharmony_ci                                   "ENDIF\n"
632bf215546Sopenharmony_ci
633bf215546Sopenharmony_ci                                   /*
634bf215546Sopenharmony_ci                                   is_overflow (TEMP[0].z) = (config & 7) >= 2;
635bf215546Sopenharmony_ci                                   result_remaining (TEMP[1].x) = (is_overflow && acc_result) ? 0 :
636bf215546Sopenharmony_ci                                   result_count; base_offset (TEMP[1].y) = 0; for (;;) { if
637bf215546Sopenharmony_ci                                   (!result_remaining) break; result_remaining--;
638bf215546Sopenharmony_ci                                   */
639bf215546Sopenharmony_ci                                   "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
640bf215546Sopenharmony_ci                                   "USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n"
641bf215546Sopenharmony_ci
642bf215546Sopenharmony_ci                                   "AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n"
643bf215546Sopenharmony_ci                                   "UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n"
644bf215546Sopenharmony_ci                                   "MOV TEMP[1].y, IMM[0].xxxx\n"
645bf215546Sopenharmony_ci
646bf215546Sopenharmony_ci                                   "BGNLOOP\n"
647bf215546Sopenharmony_ci                                   "USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n"
648bf215546Sopenharmony_ci                                   "UIF TEMP[5]\n"
649bf215546Sopenharmony_ci                                   "BRK\n"
650bf215546Sopenharmony_ci                                   "ENDIF\n"
651bf215546Sopenharmony_ci                                   "UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n"
652bf215546Sopenharmony_ci
653bf215546Sopenharmony_ci                                   /*
654bf215546Sopenharmony_ci                                   fence = buffer[0]@(base_offset + sizeof(gfx10_sh_query_buffer_mem.stream));
655bf215546Sopenharmony_ci                                   if (!fence) {
656bf215546Sopenharmony_ci                                           acc_missing = ~0u;
657bf215546Sopenharmony_ci                                           break;
658bf215546Sopenharmony_ci                                   }
659bf215546Sopenharmony_ci                                   */
660bf215546Sopenharmony_ci                                   "UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].wwww\n"
661bf215546Sopenharmony_ci                                   "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
662bf215546Sopenharmony_ci                                   "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
663bf215546Sopenharmony_ci                                   "UIF TEMP[5]\n"
664bf215546Sopenharmony_ci                                   "MOV TEMP[0].y, TEMP[5].xxxx\n"
665bf215546Sopenharmony_ci                                   "BRK\n"
666bf215546Sopenharmony_ci                                   "ENDIF\n"
667bf215546Sopenharmony_ci
668bf215546Sopenharmony_ci                                   /*
669bf215546Sopenharmony_ci                                   stream_offset (TEMP[2].x) = base_offset + offset;
670bf215546Sopenharmony_ci
671bf215546Sopenharmony_ci                                   if (!(config & 7)) {
672bf215546Sopenharmony_ci                                           acc_result += buffer[0]@stream_offset;
673bf215546Sopenharmony_ci                                   }
674bf215546Sopenharmony_ci                                   */
675bf215546Sopenharmony_ci                                   "UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n"
676bf215546Sopenharmony_ci
677bf215546Sopenharmony_ci                                   "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
678bf215546Sopenharmony_ci                                   "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
679bf215546Sopenharmony_ci                                   "UIF TEMP[5]\n"
680bf215546Sopenharmony_ci                                   "LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n"
681bf215546Sopenharmony_ci                                   "UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n"
682bf215546Sopenharmony_ci                                   "ENDIF\n"
683bf215546Sopenharmony_ci
684bf215546Sopenharmony_ci                                   /*
685bf215546Sopenharmony_ci                                   if ((config & 7) >= 2) {
686bf215546Sopenharmony_ci                                           count (TEMP[2].y) = (config & 1) ? 4 : 1;
687bf215546Sopenharmony_ci                                   */
688bf215546Sopenharmony_ci                                   "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
689bf215546Sopenharmony_ci                                   "USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n"
690bf215546Sopenharmony_ci                                   "UIF TEMP[5]\n"
691bf215546Sopenharmony_ci                                   "AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n"
692bf215546Sopenharmony_ci                                   "UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n"
693bf215546Sopenharmony_ci
694bf215546Sopenharmony_ci                                   /*
695bf215546Sopenharmony_ci                                   do {
696bf215546Sopenharmony_ci                                           generated = buffer[0]@(stream_offset + 2 * sizeof(uint64_t));
697bf215546Sopenharmony_ci                                           emitted = buffer[0]@(stream_offset + 3 * sizeof(uint64_t));
698bf215546Sopenharmony_ci                                           if (generated != emitted) {
699bf215546Sopenharmony_ci                                                   acc_result = 1;
700bf215546Sopenharmony_ci                                                   result_remaining = 0;
701bf215546Sopenharmony_ci                                                   break;
702bf215546Sopenharmony_ci                                           }
703bf215546Sopenharmony_ci
704bf215546Sopenharmony_ci                                           stream_offset += sizeof(gfx10_sh_query_buffer_mem.stream[0]);
705bf215546Sopenharmony_ci                                   } while (--count);
706bf215546Sopenharmony_ci                                   */
707bf215546Sopenharmony_ci                                   "BGNLOOP\n"
708bf215546Sopenharmony_ci                                   "UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n"
709bf215546Sopenharmony_ci                                   "LOAD TEMP[4].xyzw, BUFFER[0], TEMP[5].xxxx\n"
710bf215546Sopenharmony_ci                                   "USNE TEMP[5], TEMP[4].xyxy, TEMP[4].zwzw\n"
711bf215546Sopenharmony_ci                                   "UIF TEMP[5]\n"
712bf215546Sopenharmony_ci                                   "MOV TEMP[0].x, IMM[1].xxxx\n"
713bf215546Sopenharmony_ci                                   "MOV TEMP[1].y, IMM[0].xxxx\n"
714bf215546Sopenharmony_ci                                   "BRK\n"
715bf215546Sopenharmony_ci                                   "ENDIF\n"
716bf215546Sopenharmony_ci
717bf215546Sopenharmony_ci                                   "UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n"
718bf215546Sopenharmony_ci                                   "USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n"
719bf215546Sopenharmony_ci                                   "UIF TEMP[5]\n"
720bf215546Sopenharmony_ci                                   "BRK\n"
721bf215546Sopenharmony_ci                                   "ENDIF\n"
722bf215546Sopenharmony_ci                                   "UADD TEMP[2].x, TEMP[2].xxxx, IMM[2].yyyy\n"
723bf215546Sopenharmony_ci                                   "ENDLOOP\n"
724bf215546Sopenharmony_ci                                   "ENDIF\n"
725bf215546Sopenharmony_ci
726bf215546Sopenharmony_ci                                   /*
727bf215546Sopenharmony_ci                                           base_offset += sizeof(gfx10_sh_query_buffer_mem);
728bf215546Sopenharmony_ci                                   } // end outer loop
729bf215546Sopenharmony_ci                                   */
730bf215546Sopenharmony_ci                                   "UADD TEMP[1].y, TEMP[1].yyyy, IMM[0].zzzz\n"
731bf215546Sopenharmony_ci                                   "ENDLOOP\n"
732bf215546Sopenharmony_ci
733bf215546Sopenharmony_ci                                   /*
734bf215546Sopenharmony_ci                                   if (chain & 2) {
735bf215546Sopenharmony_ci                                           buffer[2][0] = acc_result;
736bf215546Sopenharmony_ci                                           buffer[2][1] = acc_missing;
737bf215546Sopenharmony_ci                                   } else {
738bf215546Sopenharmony_ci                                   */
739bf215546Sopenharmony_ci                                   "AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n"
740bf215546Sopenharmony_ci                                   "UIF TEMP[5]\n"
741bf215546Sopenharmony_ci                                   "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n"
742bf215546Sopenharmony_ci                                   "ELSE\n"
743bf215546Sopenharmony_ci
744bf215546Sopenharmony_ci                                   /*
745bf215546Sopenharmony_ci                                   if ((config & 7) == 1) {
746bf215546Sopenharmony_ci                                           acc_result = acc_missing ? 0 : 1;
747bf215546Sopenharmony_ci                                           acc_missing = 0;
748bf215546Sopenharmony_ci                                   }
749bf215546Sopenharmony_ci                                   */
750bf215546Sopenharmony_ci                                   "AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n"
751bf215546Sopenharmony_ci                                   "USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n"
752bf215546Sopenharmony_ci                                   "UIF TEMP[5]\n"
753bf215546Sopenharmony_ci                                   "UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n"
754bf215546Sopenharmony_ci                                   "MOV TEMP[0].y, IMM[0].xxxx\n"
755bf215546Sopenharmony_ci                                   "ENDIF\n"
756bf215546Sopenharmony_ci
757bf215546Sopenharmony_ci                                   /*
758bf215546Sopenharmony_ci                                   if (!acc_missing) {
759bf215546Sopenharmony_ci                                           buffer[2][0] = acc_result;
760bf215546Sopenharmony_ci                                           if (config & 8)
761bf215546Sopenharmony_ci                                                   buffer[2][1] = 0;
762bf215546Sopenharmony_ci                                   }
763bf215546Sopenharmony_ci                                   */
764bf215546Sopenharmony_ci                                   "USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n"
765bf215546Sopenharmony_ci                                   "UIF TEMP[5]\n"
766bf215546Sopenharmony_ci                                   "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
767bf215546Sopenharmony_ci
768bf215546Sopenharmony_ci                                   "AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n"
769bf215546Sopenharmony_ci                                   "UIF TEMP[5]\n"
770bf215546Sopenharmony_ci                                   "STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n"
771bf215546Sopenharmony_ci                                   "ENDIF\n"
772bf215546Sopenharmony_ci                                   "ENDIF\n"
773bf215546Sopenharmony_ci                                   "ENDIF\n"
774bf215546Sopenharmony_ci
775bf215546Sopenharmony_ci                                   "END\n";
776bf215546Sopenharmony_ci
777bf215546Sopenharmony_ci   struct tgsi_token tokens[1024];
778bf215546Sopenharmony_ci   struct pipe_compute_state state = {};
779bf215546Sopenharmony_ci
780bf215546Sopenharmony_ci   if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) {
781bf215546Sopenharmony_ci      assert(false);
782bf215546Sopenharmony_ci      return NULL;
783bf215546Sopenharmony_ci   }
784bf215546Sopenharmony_ci
785bf215546Sopenharmony_ci   state.ir_type = PIPE_SHADER_IR_TGSI;
786bf215546Sopenharmony_ci   state.prog = tokens;
787bf215546Sopenharmony_ci
788bf215546Sopenharmony_ci   return sctx->b.create_compute_state(&sctx->b, &state);
789bf215546Sopenharmony_ci}
790