1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright 2017 Advanced Micro Devices, Inc.
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * on the rights to use, copy, modify, merge, publish, distribute, sub
8bf215546Sopenharmony_ci * license, and/or sell copies of the Software, and to permit persons to whom
9bf215546Sopenharmony_ci * the Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
13bf215546Sopenharmony_ci * Software.
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18bf215546Sopenharmony_ci * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19bf215546Sopenharmony_ci * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20bf215546Sopenharmony_ci * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21bf215546Sopenharmony_ci * USE OR OTHER DEALINGS IN THE SOFTWARE.
22bf215546Sopenharmony_ci */
23bf215546Sopenharmony_ci
24bf215546Sopenharmony_ci#include "ac_llvm_cull.h"
25bf215546Sopenharmony_ci#include "si_pipe.h"
26bf215546Sopenharmony_ci#include "si_query.h"
27bf215546Sopenharmony_ci#include "si_shader_internal.h"
28bf215546Sopenharmony_ci#include "sid.h"
29bf215546Sopenharmony_ci#include "util/u_memory.h"
30bf215546Sopenharmony_ci#include "util/u_prim.h"
31bf215546Sopenharmony_ci
32bf215546Sopenharmony_cistatic LLVMValueRef get_wave_id_in_tg(struct si_shader_context *ctx)
33bf215546Sopenharmony_ci{
34bf215546Sopenharmony_ci   return si_unpack_param(ctx, ctx->args.merged_wave_info, 24, 4);
35bf215546Sopenharmony_ci}
36bf215546Sopenharmony_ci
37bf215546Sopenharmony_cistatic LLVMValueRef get_tgsize(struct si_shader_context *ctx)
38bf215546Sopenharmony_ci{
39bf215546Sopenharmony_ci   return si_unpack_param(ctx, ctx->args.merged_wave_info, 28, 4);
40bf215546Sopenharmony_ci}
41bf215546Sopenharmony_ci
42bf215546Sopenharmony_ciLLVMValueRef gfx10_get_thread_id_in_tg(struct si_shader_context *ctx)
43bf215546Sopenharmony_ci{
44bf215546Sopenharmony_ci   LLVMBuilderRef builder = ctx->ac.builder;
45bf215546Sopenharmony_ci   LLVMValueRef tmp;
46bf215546Sopenharmony_ci   tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
47bf215546Sopenharmony_ci                      LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
48bf215546Sopenharmony_ci   return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
49bf215546Sopenharmony_ci}
50bf215546Sopenharmony_ci
51bf215546Sopenharmony_cistatic LLVMValueRef ngg_get_vtx_cnt(struct si_shader_context *ctx)
52bf215546Sopenharmony_ci{
53bf215546Sopenharmony_ci   return si_unpack_param(ctx, ctx->args.gs_tg_info, 12, 9);
54bf215546Sopenharmony_ci}
55bf215546Sopenharmony_ci
56bf215546Sopenharmony_cistatic LLVMValueRef ngg_get_prim_cnt(struct si_shader_context *ctx)
57bf215546Sopenharmony_ci{
58bf215546Sopenharmony_ci   return si_unpack_param(ctx, ctx->args.gs_tg_info, 22, 9);
59bf215546Sopenharmony_ci}
60bf215546Sopenharmony_ci
61bf215546Sopenharmony_cistatic LLVMValueRef ngg_get_ordered_id(struct si_shader_context *ctx)
62bf215546Sopenharmony_ci{
63bf215546Sopenharmony_ci   return si_unpack_param(ctx, ctx->args.gs_tg_info, 0, 12);
64bf215546Sopenharmony_ci}
65bf215546Sopenharmony_ci
66bf215546Sopenharmony_cistatic LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx)
67bf215546Sopenharmony_ci{
68bf215546Sopenharmony_ci   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
69bf215546Sopenharmony_ci
70bf215546Sopenharmony_ci   return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
71bf215546Sopenharmony_ci                                LLVMConstInt(ctx->ac.i32, SI_GS_QUERY_BUF, false));
72bf215546Sopenharmony_ci}
73bf215546Sopenharmony_ci
74bf215546Sopenharmony_cistatic LLVMValueRef ngg_get_emulated_counters_buf(struct si_shader_context *ctx)
75bf215546Sopenharmony_ci{
76bf215546Sopenharmony_ci   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
77bf215546Sopenharmony_ci
78bf215546Sopenharmony_ci   return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
79bf215546Sopenharmony_ci                                LLVMConstInt(ctx->ac.i32, SI_GS_QUERY_EMULATED_COUNTERS_BUF, false));
80bf215546Sopenharmony_ci}
81bf215546Sopenharmony_ci
82bf215546Sopenharmony_ci/**
83bf215546Sopenharmony_ci * Return the number of vertices as a constant in \p num_vertices,
84bf215546Sopenharmony_ci * and return a more precise value as LLVMValueRef from the function.
85bf215546Sopenharmony_ci */
86bf215546Sopenharmony_cistatic LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, unsigned *num_vertices)
87bf215546Sopenharmony_ci{
88bf215546Sopenharmony_ci   const struct si_shader_info *info = &ctx->shader->selector->info;
89bf215546Sopenharmony_ci
90bf215546Sopenharmony_ci   if (ctx->stage == MESA_SHADER_GEOMETRY) {
91bf215546Sopenharmony_ci      *num_vertices = u_vertices_per_prim(info->base.gs.output_primitive);
92bf215546Sopenharmony_ci      return LLVMConstInt(ctx->ac.i32, *num_vertices, false);
93bf215546Sopenharmony_ci   } else if (ctx->stage == MESA_SHADER_VERTEX) {
94bf215546Sopenharmony_ci      if (info->base.vs.blit_sgprs_amd) {
95bf215546Sopenharmony_ci         /* Blits always use axis-aligned rectangles with 3 vertices. */
96bf215546Sopenharmony_ci         *num_vertices = 3;
97bf215546Sopenharmony_ci         return LLVMConstInt(ctx->ac.i32, 3, 0);
98bf215546Sopenharmony_ci      } else if (ctx->shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES) {
99bf215546Sopenharmony_ci         *num_vertices = 2;
100bf215546Sopenharmony_ci         return LLVMConstInt(ctx->ac.i32, 2, 0);
101bf215546Sopenharmony_ci      } else {
102bf215546Sopenharmony_ci         /* We always build up all three indices for the prim export
103bf215546Sopenharmony_ci          * independent of the primitive type. The additional garbage
104bf215546Sopenharmony_ci          * data shouldn't hurt. This is used by exports and streamout.
105bf215546Sopenharmony_ci          */
106bf215546Sopenharmony_ci         *num_vertices = 3;
107bf215546Sopenharmony_ci
108bf215546Sopenharmony_ci         /* Extract OUTPRIM field. */
109bf215546Sopenharmony_ci         LLVMValueRef num = GET_FIELD(ctx, GS_STATE_OUTPRIM);
110bf215546Sopenharmony_ci         return LLVMBuildAdd(ctx->ac.builder, num, ctx->ac.i32_1, "");
111bf215546Sopenharmony_ci      }
112bf215546Sopenharmony_ci   } else {
113bf215546Sopenharmony_ci      assert(ctx->stage == MESA_SHADER_TESS_EVAL);
114bf215546Sopenharmony_ci
115bf215546Sopenharmony_ci      if (info->base.tess.point_mode)
116bf215546Sopenharmony_ci         *num_vertices = 1;
117bf215546Sopenharmony_ci      else if (info->base.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
118bf215546Sopenharmony_ci         *num_vertices = 2;
119bf215546Sopenharmony_ci      else
120bf215546Sopenharmony_ci         *num_vertices = 3;
121bf215546Sopenharmony_ci
122bf215546Sopenharmony_ci      return LLVMConstInt(ctx->ac.i32, *num_vertices, false);
123bf215546Sopenharmony_ci   }
124bf215546Sopenharmony_ci}
125bf215546Sopenharmony_ci
126bf215546Sopenharmony_cibool gfx10_ngg_export_prim_early(struct si_shader *shader)
127bf215546Sopenharmony_ci{
128bf215546Sopenharmony_ci   struct si_shader_selector *sel = shader->selector;
129bf215546Sopenharmony_ci
130bf215546Sopenharmony_ci   assert(shader->key.ge.as_ngg && !shader->key.ge.as_es);
131bf215546Sopenharmony_ci
132bf215546Sopenharmony_ci   return sel->stage != MESA_SHADER_GEOMETRY &&
133bf215546Sopenharmony_ci          !gfx10_ngg_writes_user_edgeflags(shader);
134bf215546Sopenharmony_ci}
135bf215546Sopenharmony_ci
136bf215546Sopenharmony_civoid gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx)
137bf215546Sopenharmony_ci{
138bf215546Sopenharmony_ci   /* Newer chips can use PRIMGEN_PASSTHRU_NO_MSG to skip gs_alloc_req for NGG passthrough. */
139bf215546Sopenharmony_ci   if (gfx10_is_ngg_passthrough(ctx->shader) &&
140bf215546Sopenharmony_ci       ctx->screen->info.family >= CHIP_NAVI23)
141bf215546Sopenharmony_ci      return;
142bf215546Sopenharmony_ci
143bf215546Sopenharmony_ci   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ngg_get_vtx_cnt(ctx),
144bf215546Sopenharmony_ci                                 ngg_get_prim_cnt(ctx));
145bf215546Sopenharmony_ci}
146bf215546Sopenharmony_ci
147bf215546Sopenharmony_civoid gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3],
148bf215546Sopenharmony_ci                                 LLVMValueRef prim_passthrough)
149bf215546Sopenharmony_ci{
150bf215546Sopenharmony_ci   LLVMBuilderRef builder = ctx->ac.builder;
151bf215546Sopenharmony_ci
152bf215546Sopenharmony_ci   if (gfx10_is_ngg_passthrough(ctx->shader) || ctx->shader->key.ge.opt.ngg_culling) {
153bf215546Sopenharmony_ci      ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
154bf215546Sopenharmony_ci      {
155bf215546Sopenharmony_ci         struct ac_ngg_prim prim = {};
156bf215546Sopenharmony_ci
157bf215546Sopenharmony_ci         if (prim_passthrough)
158bf215546Sopenharmony_ci            prim.passthrough = prim_passthrough;
159bf215546Sopenharmony_ci         else
160bf215546Sopenharmony_ci            prim.passthrough = ac_get_arg(&ctx->ac, ctx->args.gs_vtx_offset[0]);
161bf215546Sopenharmony_ci
162bf215546Sopenharmony_ci         /* This is only used with NGG culling, which returns the NGG
163bf215546Sopenharmony_ci          * passthrough prim export encoding.
164bf215546Sopenharmony_ci          */
165bf215546Sopenharmony_ci         if (gfx10_ngg_writes_user_edgeflags(ctx->shader)) {
166bf215546Sopenharmony_ci            unsigned all_bits_no_edgeflags = ~SI_NGG_PRIM_EDGE_FLAG_BITS;
167bf215546Sopenharmony_ci            LLVMValueRef edgeflags = LLVMConstInt(ctx->ac.i32, all_bits_no_edgeflags, 0);
168bf215546Sopenharmony_ci
169bf215546Sopenharmony_ci            unsigned num_vertices;
170bf215546Sopenharmony_ci            ngg_get_vertices_per_prim(ctx, &num_vertices);
171bf215546Sopenharmony_ci
172bf215546Sopenharmony_ci            for (unsigned i = 0; i < num_vertices; i++) {
173bf215546Sopenharmony_ci               unsigned shift = 9 + i * 10;
174bf215546Sopenharmony_ci               LLVMValueRef edge;
175bf215546Sopenharmony_ci
176bf215546Sopenharmony_ci               edge = LLVMBuildLoad2(builder, ctx->ac.i1, user_edgeflags[i], "");
177bf215546Sopenharmony_ci               edge = LLVMBuildZExt(builder, edge, ctx->ac.i32, "");
178bf215546Sopenharmony_ci               edge = LLVMBuildShl(builder, edge, LLVMConstInt(ctx->ac.i32, shift, 0), "");
179bf215546Sopenharmony_ci               edgeflags = LLVMBuildOr(builder, edgeflags, edge, "");
180bf215546Sopenharmony_ci            }
181bf215546Sopenharmony_ci            prim.passthrough = LLVMBuildAnd(builder, prim.passthrough, edgeflags, "");
182bf215546Sopenharmony_ci         }
183bf215546Sopenharmony_ci
184bf215546Sopenharmony_ci         ac_build_export_prim(&ctx->ac, &prim);
185bf215546Sopenharmony_ci      }
186bf215546Sopenharmony_ci      ac_build_endif(&ctx->ac, 6001);
187bf215546Sopenharmony_ci      return;
188bf215546Sopenharmony_ci   }
189bf215546Sopenharmony_ci
190bf215546Sopenharmony_ci   ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
191bf215546Sopenharmony_ci   {
192bf215546Sopenharmony_ci      struct ac_ngg_prim prim = {};
193bf215546Sopenharmony_ci
194bf215546Sopenharmony_ci      ngg_get_vertices_per_prim(ctx, &prim.num_vertices);
195bf215546Sopenharmony_ci
196bf215546Sopenharmony_ci      prim.isnull = ctx->ac.i1false;
197bf215546Sopenharmony_ci
198bf215546Sopenharmony_ci      if (gfx10_edgeflags_have_effect(ctx->shader))
199bf215546Sopenharmony_ci         prim.edgeflags = ac_pack_edgeflags_for_export(&ctx->ac, &ctx->args);
200bf215546Sopenharmony_ci      else
201bf215546Sopenharmony_ci         prim.edgeflags = ctx->ac.i32_0;
202bf215546Sopenharmony_ci
203bf215546Sopenharmony_ci      for (unsigned i = 0; i < prim.num_vertices; ++i)
204bf215546Sopenharmony_ci         prim.index[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[i / 2], (i & 1) * 16, 16);
205bf215546Sopenharmony_ci
206bf215546Sopenharmony_ci      if (gfx10_ngg_writes_user_edgeflags(ctx->shader)) {
207bf215546Sopenharmony_ci         LLVMValueRef edgeflags = ctx->ac.i32_0;
208bf215546Sopenharmony_ci
209bf215546Sopenharmony_ci         for (unsigned i = 0; i < prim.num_vertices; ++i) {
210bf215546Sopenharmony_ci            LLVMValueRef edge;
211bf215546Sopenharmony_ci
212bf215546Sopenharmony_ci            edge = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.i1, user_edgeflags[i], "");
213bf215546Sopenharmony_ci            edge = LLVMBuildZExt(ctx->ac.builder, edge, ctx->ac.i32, "");
214bf215546Sopenharmony_ci            edge = LLVMBuildShl(ctx->ac.builder, edge, LLVMConstInt(ctx->ac.i32, 9 + i*10, 0), "");
215bf215546Sopenharmony_ci            edgeflags = LLVMBuildOr(ctx->ac.builder, edgeflags, edge, "");
216bf215546Sopenharmony_ci         }
217bf215546Sopenharmony_ci         prim.edgeflags = LLVMBuildAnd(ctx->ac.builder, prim.edgeflags, edgeflags, "");
218bf215546Sopenharmony_ci      }
219bf215546Sopenharmony_ci
220bf215546Sopenharmony_ci      ac_build_export_prim(&ctx->ac, &prim);
221bf215546Sopenharmony_ci   }
222bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, 6001);
223bf215546Sopenharmony_ci}
224bf215546Sopenharmony_ci
225bf215546Sopenharmony_cistatic void build_streamout_vertex(struct si_shader_context *ctx, LLVMValueRef *so_buffer,
226bf215546Sopenharmony_ci                                   LLVMValueRef *wg_offset_dw, unsigned stream,
227bf215546Sopenharmony_ci                                   LLVMValueRef offset_vtx, LLVMValueRef vertexptr)
228bf215546Sopenharmony_ci{
229bf215546Sopenharmony_ci   struct si_shader_info *info = &ctx->shader->selector->info;
230bf215546Sopenharmony_ci   struct pipe_stream_output_info *so = &ctx->so;
231bf215546Sopenharmony_ci   LLVMBuilderRef builder = ctx->ac.builder;
232bf215546Sopenharmony_ci   LLVMValueRef offset[4] = {};
233bf215546Sopenharmony_ci   LLVMValueRef tmp;
234bf215546Sopenharmony_ci
235bf215546Sopenharmony_ci   for (unsigned buffer = 0; buffer < 4; ++buffer) {
236bf215546Sopenharmony_ci      if (!wg_offset_dw[buffer])
237bf215546Sopenharmony_ci         continue;
238bf215546Sopenharmony_ci
239bf215546Sopenharmony_ci      tmp = LLVMBuildMul(builder, offset_vtx, LLVMConstInt(ctx->ac.i32, so->stride[buffer], false),
240bf215546Sopenharmony_ci                         "");
241bf215546Sopenharmony_ci      tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, "");
242bf215546Sopenharmony_ci      offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");
243bf215546Sopenharmony_ci   }
244bf215546Sopenharmony_ci
245bf215546Sopenharmony_ci   for (unsigned i = 0; i < so->num_outputs; ++i) {
246bf215546Sopenharmony_ci      if (so->output[i].stream != stream)
247bf215546Sopenharmony_ci         continue;
248bf215546Sopenharmony_ci
249bf215546Sopenharmony_ci      unsigned reg = so->output[i].register_index;
250bf215546Sopenharmony_ci      struct si_shader_output_values out;
251bf215546Sopenharmony_ci      out.semantic = info->output_semantic[reg];
252bf215546Sopenharmony_ci
253bf215546Sopenharmony_ci      for (unsigned comp = 0; comp < 4; comp++) {
254bf215546Sopenharmony_ci         tmp = ac_build_gep0(&ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, 4 * reg + comp, false));
255bf215546Sopenharmony_ci         out.values[comp] = LLVMBuildLoad(builder, tmp, "");
256bf215546Sopenharmony_ci         out.vertex_streams = info->output_streams[reg];
257bf215546Sopenharmony_ci      }
258bf215546Sopenharmony_ci
259bf215546Sopenharmony_ci      si_llvm_streamout_store_output(ctx, so_buffer, offset, &so->output[i], &out);
260bf215546Sopenharmony_ci   }
261bf215546Sopenharmony_ci}
262bf215546Sopenharmony_ci
263bf215546Sopenharmony_cistruct ngg_streamout {
264bf215546Sopenharmony_ci   LLVMValueRef num_vertices;
265bf215546Sopenharmony_ci
266bf215546Sopenharmony_ci   /* per-thread data */
267bf215546Sopenharmony_ci   LLVMValueRef prim_enable[4]; /* i1 per stream */
268bf215546Sopenharmony_ci   LLVMValueRef vertices[3];    /* [N x i32] addrspace(LDS)* */
269bf215546Sopenharmony_ci
270bf215546Sopenharmony_ci   /* Output */
271bf215546Sopenharmony_ci   LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */
272bf215546Sopenharmony_ci};
273bf215546Sopenharmony_ci
274bf215546Sopenharmony_ci/**
275bf215546Sopenharmony_ci * Build streamout logic.
276bf215546Sopenharmony_ci *
277bf215546Sopenharmony_ci * Implies a barrier.
278bf215546Sopenharmony_ci *
279bf215546Sopenharmony_ci * Writes number of emitted primitives to gs_ngg_scratch[4:8].
280bf215546Sopenharmony_ci *
281bf215546Sopenharmony_ci * Clobbers gs_ngg_scratch[8:].
282bf215546Sopenharmony_ci */
283bf215546Sopenharmony_cistatic void build_streamout(struct si_shader_context *ctx, struct ngg_streamout *nggso)
284bf215546Sopenharmony_ci{
285bf215546Sopenharmony_ci   struct si_shader_info *info = &ctx->shader->selector->info;
286bf215546Sopenharmony_ci   struct pipe_stream_output_info *so = &ctx->so;
287bf215546Sopenharmony_ci   LLVMBuilderRef builder = ctx->ac.builder;
288bf215546Sopenharmony_ci   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
289bf215546Sopenharmony_ci   LLVMValueRef tid = gfx10_get_thread_id_in_tg(ctx);
290bf215546Sopenharmony_ci   LLVMValueRef tmp, tmp2;
291bf215546Sopenharmony_ci   LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false);
292bf215546Sopenharmony_ci   LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false);
293bf215546Sopenharmony_ci   LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false);
294bf215546Sopenharmony_ci   LLVMValueRef so_buffer[4] = {};
295bf215546Sopenharmony_ci   unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + (nggso->vertices[2] ? 1 : 0);
296bf215546Sopenharmony_ci   LLVMValueRef prim_stride_dw[4] = {};
297bf215546Sopenharmony_ci   LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32);
298bf215546Sopenharmony_ci   int stream_for_buffer[4] = {-1, -1, -1, -1};
299bf215546Sopenharmony_ci   unsigned bufmask_for_stream[4] = {};
300bf215546Sopenharmony_ci   bool isgs = ctx->stage == MESA_SHADER_GEOMETRY;
301bf215546Sopenharmony_ci   unsigned scratch_emit_base = isgs ? 4 : 0;
302bf215546Sopenharmony_ci   LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0;
303bf215546Sopenharmony_ci   unsigned scratch_offset_base = isgs ? 8 : 4;
304bf215546Sopenharmony_ci   LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4;
305bf215546Sopenharmony_ci
306bf215546Sopenharmony_ci   /* Determine the mapping of streamout buffers to vertex streams. */
307bf215546Sopenharmony_ci   for (unsigned i = 0; i < so->num_outputs; ++i) {
308bf215546Sopenharmony_ci      unsigned buf = so->output[i].output_buffer;
309bf215546Sopenharmony_ci      unsigned stream = so->output[i].stream;
310bf215546Sopenharmony_ci      assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream);
311bf215546Sopenharmony_ci      stream_for_buffer[buf] = stream;
312bf215546Sopenharmony_ci      bufmask_for_stream[stream] |= 1 << buf;
313bf215546Sopenharmony_ci   }
314bf215546Sopenharmony_ci
315bf215546Sopenharmony_ci   for (unsigned buffer = 0; buffer < 4; ++buffer) {
316bf215546Sopenharmony_ci      if (stream_for_buffer[buffer] == -1)
317bf215546Sopenharmony_ci         continue;
318bf215546Sopenharmony_ci
319bf215546Sopenharmony_ci      assert(so->stride[buffer]);
320bf215546Sopenharmony_ci
321bf215546Sopenharmony_ci      tmp = LLVMConstInt(ctx->ac.i32, so->stride[buffer], false);
322bf215546Sopenharmony_ci      prim_stride_dw[buffer] = LLVMBuildMul(builder, tmp, nggso->num_vertices, "");
323bf215546Sopenharmony_ci      prim_stride_dw_vgpr =
324bf215546Sopenharmony_ci         ac_build_writelane(&ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer],
325bf215546Sopenharmony_ci                            LLVMConstInt(ctx->ac.i32, buffer, false));
326bf215546Sopenharmony_ci
327bf215546Sopenharmony_ci      so_buffer[buffer] = ac_build_load_to_sgpr(
328bf215546Sopenharmony_ci         &ctx->ac, buf_ptr, LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + buffer, false));
329bf215546Sopenharmony_ci   }
330bf215546Sopenharmony_ci
331bf215546Sopenharmony_ci   tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
332bf215546Sopenharmony_ci   ac_build_ifcc(&ctx->ac, tmp, 5200);
333bf215546Sopenharmony_ci   {
334bf215546Sopenharmony_ci      LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
335bf215546Sopenharmony_ci      LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");
336bf215546Sopenharmony_ci
337bf215546Sopenharmony_ci      /* Advance the streamout offsets in GDS. */
338bf215546Sopenharmony_ci      LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
339bf215546Sopenharmony_ci      LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
340bf215546Sopenharmony_ci
341bf215546Sopenharmony_ci      tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
342bf215546Sopenharmony_ci      ac_build_ifcc(&ctx->ac, tmp, 5210);
343bf215546Sopenharmony_ci      {
344bf215546Sopenharmony_ci         if (isgs) {
345bf215546Sopenharmony_ci            tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid);
346bf215546Sopenharmony_ci            tmp = LLVMBuildLoad2(builder, ctx->ac.i32, tmp, "");
347bf215546Sopenharmony_ci         } else {
348bf215546Sopenharmony_ci            tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, ngg_get_prim_cnt(ctx), ctx->ac.i32_0);
349bf215546Sopenharmony_ci         }
350bf215546Sopenharmony_ci         LLVMBuildStore(builder, tmp, generated_by_stream_vgpr);
351bf215546Sopenharmony_ci
352bf215546Sopenharmony_ci         unsigned swizzle[4];
353bf215546Sopenharmony_ci         int unused_stream = -1;
354bf215546Sopenharmony_ci         for (unsigned stream = 0; stream < 4; ++stream) {
355bf215546Sopenharmony_ci            if (!info->num_stream_output_components[stream]) {
356bf215546Sopenharmony_ci               unused_stream = stream;
357bf215546Sopenharmony_ci               break;
358bf215546Sopenharmony_ci            }
359bf215546Sopenharmony_ci         }
360bf215546Sopenharmony_ci         for (unsigned buffer = 0; buffer < 4; ++buffer) {
361bf215546Sopenharmony_ci            if (stream_for_buffer[buffer] >= 0) {
362bf215546Sopenharmony_ci               swizzle[buffer] = stream_for_buffer[buffer];
363bf215546Sopenharmony_ci            } else {
364bf215546Sopenharmony_ci               assert(unused_stream >= 0);
365bf215546Sopenharmony_ci               swizzle[buffer] = unused_stream;
366bf215546Sopenharmony_ci            }
367bf215546Sopenharmony_ci         }
368bf215546Sopenharmony_ci
369bf215546Sopenharmony_ci         tmp = ac_build_quad_swizzle(&ctx->ac, tmp, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
370bf215546Sopenharmony_ci         tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
371bf215546Sopenharmony_ci
372bf215546Sopenharmony_ci         LLVMValueRef args[8] = {
373bf215546Sopenharmony_ci            LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""),
374bf215546Sopenharmony_ci            ctx->ac.i32_0,                             /* value to add */
375bf215546Sopenharmony_ci            ctx->ac.i32_0,                             /* ordering */
376bf215546Sopenharmony_ci            ctx->ac.i32_0,                             /* scope */
377bf215546Sopenharmony_ci            ctx->ac.i1false,                           /* isVolatile */
378bf215546Sopenharmony_ci            LLVMConstInt(ctx->ac.i32, 1 << 24, false), /* OA index, bits 24+: lane count */
379bf215546Sopenharmony_ci            ctx->ac.i1true,                            /* wave release */
380bf215546Sopenharmony_ci            ctx->ac.i1true,                            /* wave done */
381bf215546Sopenharmony_ci         };
382bf215546Sopenharmony_ci
383bf215546Sopenharmony_ci         if (ctx->screen->info.gfx_level >= GFX11) {
384bf215546Sopenharmony_ci            /* Gfx11 GDS instructions only operate on the first active lane. All other lanes are
385bf215546Sopenharmony_ci             * ignored. So are their EXEC bits. This uses the mutex feature of ds_ordered_count
386bf215546Sopenharmony_ci             * to emulate a multi-dword atomic.
387bf215546Sopenharmony_ci             *
388bf215546Sopenharmony_ci             * This is the expected code:
389bf215546Sopenharmony_ci             *    ds_ordered_count release=0 done=0   // lock mutex
390bf215546Sopenharmony_ci             *    ds_add_rtn_u32 dwords_written0
391bf215546Sopenharmony_ci             *    ds_add_rtn_u32 dwords_written1
392bf215546Sopenharmony_ci             *    ds_add_rtn_u32 dwords_written2
393bf215546Sopenharmony_ci             *    ds_add_rtn_u32 dwords_written3
394bf215546Sopenharmony_ci             *    ds_ordered_count release=1 done=1   // unlock mutex
395bf215546Sopenharmony_ci             *
396bf215546Sopenharmony_ci             * TODO: Increment GDS_STRMOUT registers instead of GDS memory.
397bf215546Sopenharmony_ci             */
398bf215546Sopenharmony_ci            LLVMValueRef dwords_written[4] = {tmp, tmp, tmp, tmp};
399bf215546Sopenharmony_ci
400bf215546Sopenharmony_ci            /* Move all 4 VGPRs from other lanes to lane 0. */
401bf215546Sopenharmony_ci            for (unsigned i = 1; i < 4; i++) {
402bf215546Sopenharmony_ci               if (ctx->shader->selector->info.base.xfb_stride[i])
403bf215546Sopenharmony_ci                  dwords_written[i] = ac_build_quad_swizzle(&ctx->ac, tmp, i, i, i, i);
404bf215546Sopenharmony_ci            }
405bf215546Sopenharmony_ci
406bf215546Sopenharmony_ci            /* Set release=0 to start a GDS mutex. Set done=0 because it's not the last one. */
407bf215546Sopenharmony_ci            args[6] = args[7] = ctx->ac.i1false;
408bf215546Sopenharmony_ci            ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32,
409bf215546Sopenharmony_ci                               args, ARRAY_SIZE(args), 0);
410bf215546Sopenharmony_ci            ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
411bf215546Sopenharmony_ci
412bf215546Sopenharmony_ci            for (unsigned i = 0; i < 4; i++) {
413bf215546Sopenharmony_ci               if (ctx->shader->selector->info.base.xfb_stride[i]) {
414bf215546Sopenharmony_ci                  LLVMValueRef gds_ptr =
415bf215546Sopenharmony_ci                     ac_build_gep_ptr(&ctx->ac, gdsbase, LLVMConstInt(ctx->ac.i32, i, 0));
416bf215546Sopenharmony_ci
417bf215546Sopenharmony_ci                  dwords_written[i] = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
418bf215546Sopenharmony_ci                                                         gds_ptr, dwords_written[i],
419bf215546Sopenharmony_ci                                                         LLVMAtomicOrderingMonotonic, false);
420bf215546Sopenharmony_ci               }
421bf215546Sopenharmony_ci            }
422bf215546Sopenharmony_ci
423bf215546Sopenharmony_ci            /* TODO: This might not be needed if GDS executes instructions in order. */
424bf215546Sopenharmony_ci            ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
425bf215546Sopenharmony_ci
426bf215546Sopenharmony_ci            /* Set release=1 to end a GDS mutex. Set done=1 because it's the last one. */
427bf215546Sopenharmony_ci            args[6] = args[7] = ctx->ac.i1true;
428bf215546Sopenharmony_ci            ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32,
429bf215546Sopenharmony_ci                               args, ARRAY_SIZE(args), 0);
430bf215546Sopenharmony_ci
431bf215546Sopenharmony_ci            tmp = dwords_written[0];
432bf215546Sopenharmony_ci            for (unsigned i = 1; i < 4; i++) {
433bf215546Sopenharmony_ci               if (ctx->shader->selector->info.base.xfb_stride[i]) {
434bf215546Sopenharmony_ci                  dwords_written[i] = ac_build_readlane(&ctx->ac, dwords_written[i], ctx->ac.i32_0);
435bf215546Sopenharmony_ci                  tmp = ac_build_writelane(&ctx->ac, tmp, dwords_written[i], LLVMConstInt(ctx->ac.i32, i, 0));
436bf215546Sopenharmony_ci               }
437bf215546Sopenharmony_ci            }
438bf215546Sopenharmony_ci         } else {
439bf215546Sopenharmony_ci            args[1] = tmp; /* value to add */
440bf215546Sopenharmony_ci            args[5] = LLVMConstInt(ctx->ac.i32, 4 << 24, false), /* bits 24+: lane count */
441bf215546Sopenharmony_ci
442bf215546Sopenharmony_ci            tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32,
443bf215546Sopenharmony_ci                                     args, ARRAY_SIZE(args), 0);
444bf215546Sopenharmony_ci         }
445bf215546Sopenharmony_ci
446bf215546Sopenharmony_ci         /* Keep offsets in a VGPR for quick retrieval via readlane by
447bf215546Sopenharmony_ci          * the first wave for bounds checking, and also store in LDS
448bf215546Sopenharmony_ci          * for retrieval by all waves later. */
449bf215546Sopenharmony_ci         LLVMBuildStore(builder, tmp, offsets_vgpr);
450bf215546Sopenharmony_ci
451bf215546Sopenharmony_ci         tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_offset_basev, "");
452bf215546Sopenharmony_ci         tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2);
453bf215546Sopenharmony_ci         LLVMBuildStore(builder, tmp, tmp2);
454bf215546Sopenharmony_ci      }
455bf215546Sopenharmony_ci      ac_build_endif(&ctx->ac, 5210);
456bf215546Sopenharmony_ci
457bf215546Sopenharmony_ci      /* Determine the max emit per buffer. This is done via the SALU, in part
458bf215546Sopenharmony_ci       * because LLVM can't generate divide-by-multiply if we try to do this
459bf215546Sopenharmony_ci       * via VALU with one lane per buffer.
460bf215546Sopenharmony_ci       */
461bf215546Sopenharmony_ci      LLVMValueRef max_emit[4] = {};
462bf215546Sopenharmony_ci      for (unsigned buffer = 0; buffer < 4; ++buffer) {
463bf215546Sopenharmony_ci         if (stream_for_buffer[buffer] == -1)
464bf215546Sopenharmony_ci            continue;
465bf215546Sopenharmony_ci
466bf215546Sopenharmony_ci         LLVMValueRef bufsize_dw = LLVMBuildLShr(
467bf215546Sopenharmony_ci            builder, LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), i32_2, "");
468bf215546Sopenharmony_ci
469bf215546Sopenharmony_ci         tmp = LLVMBuildLoad2(builder, ctx->ac.i32, offsets_vgpr, "");
470bf215546Sopenharmony_ci         LLVMValueRef offset_dw =
471bf215546Sopenharmony_ci            ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, buffer, false));
472bf215546Sopenharmony_ci
473bf215546Sopenharmony_ci         tmp = LLVMBuildSub(builder, bufsize_dw, offset_dw, "");
474bf215546Sopenharmony_ci         tmp = LLVMBuildUDiv(builder, tmp, prim_stride_dw[buffer], "");
475bf215546Sopenharmony_ci
476bf215546Sopenharmony_ci         tmp2 = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, "");
477bf215546Sopenharmony_ci         max_emit[buffer] = LLVMBuildSelect(builder, tmp2, ctx->ac.i32_0, tmp, "");
478bf215546Sopenharmony_ci      }
479bf215546Sopenharmony_ci
480bf215546Sopenharmony_ci      /* Determine the number of emitted primitives per stream and fixup the
481bf215546Sopenharmony_ci       * GDS counter if necessary.
482bf215546Sopenharmony_ci       *
483bf215546Sopenharmony_ci       * This is complicated by the fact that a single stream can emit to
484bf215546Sopenharmony_ci       * multiple buffers (but luckily not vice versa).
485bf215546Sopenharmony_ci       */
486bf215546Sopenharmony_ci      LLVMValueRef emit_vgpr = ctx->ac.i32_0;
487bf215546Sopenharmony_ci
488bf215546Sopenharmony_ci      for (unsigned stream = 0; stream < 4; ++stream) {
489bf215546Sopenharmony_ci         if (!info->num_stream_output_components[stream])
490bf215546Sopenharmony_ci            continue;
491bf215546Sopenharmony_ci
492bf215546Sopenharmony_ci         tmp = LLVMBuildLoad2(builder, ctx->ac.i32, generated_by_stream_vgpr, "");
493bf215546Sopenharmony_ci         LLVMValueRef generated =
494bf215546Sopenharmony_ci            ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, stream, false));
495bf215546Sopenharmony_ci
496bf215546Sopenharmony_ci         LLVMValueRef emit = generated;
497bf215546Sopenharmony_ci         for (unsigned buffer = 0; buffer < 4; ++buffer) {
498bf215546Sopenharmony_ci            if (stream_for_buffer[buffer] == stream)
499bf215546Sopenharmony_ci               emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]);
500bf215546Sopenharmony_ci         }
501bf215546Sopenharmony_ci
502bf215546Sopenharmony_ci         emit_vgpr =
503bf215546Sopenharmony_ci            ac_build_writelane(&ctx->ac, emit_vgpr, emit, LLVMConstInt(ctx->ac.i32, stream, false));
504bf215546Sopenharmony_ci
505bf215546Sopenharmony_ci         /* Fixup the offset using a plain GDS atomic if we overflowed. */
506bf215546Sopenharmony_ci         tmp = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, "");
507bf215546Sopenharmony_ci         ac_build_ifcc(&ctx->ac, tmp, 5221); /* scalar branch */
508bf215546Sopenharmony_ci         tmp = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false),
509bf215546Sopenharmony_ci                             ac_get_thread_id(&ctx->ac), "");
510bf215546Sopenharmony_ci         tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
511bf215546Sopenharmony_ci         ac_build_ifcc(&ctx->ac, tmp, 5222);
512bf215546Sopenharmony_ci         {
513bf215546Sopenharmony_ci            tmp = LLVMBuildSub(builder, generated, emit, "");
514bf215546Sopenharmony_ci            tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
515bf215546Sopenharmony_ci
516bf215546Sopenharmony_ci            if (ctx->screen->info.gfx_level >= GFX11) {
517bf215546Sopenharmony_ci               /* Gfx11 GDS instructions only operate on the first active lane.
518bf215546Sopenharmony_ci                * This is an unrolled waterfall loop. We only get here when we overflow,
519bf215546Sopenharmony_ci                * so it doesn't have to be fast.
520bf215546Sopenharmony_ci                */
521bf215546Sopenharmony_ci               for (unsigned i = 0; i < 4; i++) {
522bf215546Sopenharmony_ci                  if (bufmask_for_stream[stream] & BITFIELD_BIT(i)) {
523bf215546Sopenharmony_ci                     LLVMValueRef index = LLVMConstInt(ctx->ac.i32, i, 0);
524bf215546Sopenharmony_ci
525bf215546Sopenharmony_ci                     ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ, tid, index, ""), 0);
526bf215546Sopenharmony_ci                     LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub,
527bf215546Sopenharmony_ci                                        LLVMBuildGEP(builder, gdsbase, &index, 1, ""),
528bf215546Sopenharmony_ci                                        tmp, LLVMAtomicOrderingMonotonic, false);
529bf215546Sopenharmony_ci                     ac_build_endif(&ctx->ac, 0);
530bf215546Sopenharmony_ci                  }
531bf215546Sopenharmony_ci               }
532bf215546Sopenharmony_ci            } else {
533bf215546Sopenharmony_ci               LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub,
534bf215546Sopenharmony_ci                                  LLVMBuildGEP(builder, gdsbase, &tid, 1, ""),
535bf215546Sopenharmony_ci                                  tmp, LLVMAtomicOrderingMonotonic, false);
536bf215546Sopenharmony_ci            }
537bf215546Sopenharmony_ci         }
538bf215546Sopenharmony_ci         ac_build_endif(&ctx->ac, 5222);
539bf215546Sopenharmony_ci         ac_build_endif(&ctx->ac, 5221);
540bf215546Sopenharmony_ci      }
541bf215546Sopenharmony_ci
542bf215546Sopenharmony_ci      tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
543bf215546Sopenharmony_ci      ac_build_ifcc(&ctx->ac, tmp, 5225);
544bf215546Sopenharmony_ci      {
545bf215546Sopenharmony_ci         tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_emit_basev, "");
546bf215546Sopenharmony_ci         tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp);
547bf215546Sopenharmony_ci         LLVMBuildStore(builder, emit_vgpr, tmp);
548bf215546Sopenharmony_ci      }
549bf215546Sopenharmony_ci      ac_build_endif(&ctx->ac, 5225);
550bf215546Sopenharmony_ci   }
551bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, 5200);
552bf215546Sopenharmony_ci
553bf215546Sopenharmony_ci   /* Determine the workgroup-relative per-thread / primitive offset into
554bf215546Sopenharmony_ci    * the streamout buffers */
555bf215546Sopenharmony_ci   struct ac_wg_scan primemit_scan[4] = {};
556bf215546Sopenharmony_ci
557bf215546Sopenharmony_ci   if (isgs) {
558bf215546Sopenharmony_ci      for (unsigned stream = 0; stream < 4; ++stream) {
559bf215546Sopenharmony_ci         if (!info->num_stream_output_components[stream])
560bf215546Sopenharmony_ci            continue;
561bf215546Sopenharmony_ci
562bf215546Sopenharmony_ci         primemit_scan[stream].stage = ctx->stage;
563bf215546Sopenharmony_ci         primemit_scan[stream].enable_exclusive = true;
564bf215546Sopenharmony_ci         primemit_scan[stream].op = nir_op_iadd;
565bf215546Sopenharmony_ci         primemit_scan[stream].src = nggso->prim_enable[stream];
566bf215546Sopenharmony_ci         primemit_scan[stream].scratch = ac_build_gep0(
567bf215546Sopenharmony_ci            &ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false));
568bf215546Sopenharmony_ci         primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx);
569bf215546Sopenharmony_ci         primemit_scan[stream].numwaves = get_tgsize(ctx);
570bf215546Sopenharmony_ci         if (ctx->stage == MESA_SHADER_GEOMETRY) {
571bf215546Sopenharmony_ci            /* ngg_subgroup_size is only the input size. GS can always generate up to 256 vertices. */
572bf215546Sopenharmony_ci            primemit_scan[stream].maxwaves = DIV_ROUND_UP(256, ctx->ac.wave_size);
573bf215546Sopenharmony_ci         } else {
574bf215546Sopenharmony_ci            primemit_scan[stream].maxwaves = DIV_ROUND_UP(ctx->screen->ngg_subgroup_size,
575bf215546Sopenharmony_ci                                                          ctx->ac.wave_size);
576bf215546Sopenharmony_ci         }
577bf215546Sopenharmony_ci         ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]);
578bf215546Sopenharmony_ci      }
579bf215546Sopenharmony_ci   }
580bf215546Sopenharmony_ci
581bf215546Sopenharmony_ci   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
582bf215546Sopenharmony_ci   ac_build_s_barrier(&ctx->ac, ctx->stage);
583bf215546Sopenharmony_ci
584bf215546Sopenharmony_ci   /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
585bf215546Sopenharmony_ci   LLVMValueRef wgoffset_dw[4] = {};
586bf215546Sopenharmony_ci
587bf215546Sopenharmony_ci   {
588bf215546Sopenharmony_ci      LLVMValueRef scratch_vgpr;
589bf215546Sopenharmony_ci
590bf215546Sopenharmony_ci      tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac));
591bf215546Sopenharmony_ci      scratch_vgpr = LLVMBuildLoad2(builder, ctx->ac.i32, tmp, "");
592bf215546Sopenharmony_ci
593bf215546Sopenharmony_ci      for (unsigned buffer = 0; buffer < 4; ++buffer) {
594bf215546Sopenharmony_ci         if (stream_for_buffer[buffer] >= 0) {
595bf215546Sopenharmony_ci            wgoffset_dw[buffer] =
596bf215546Sopenharmony_ci               ac_build_readlane(&ctx->ac, scratch_vgpr,
597bf215546Sopenharmony_ci                                 LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false));
598bf215546Sopenharmony_ci         }
599bf215546Sopenharmony_ci      }
600bf215546Sopenharmony_ci
601bf215546Sopenharmony_ci      for (unsigned stream = 0; stream < 4; ++stream) {
602bf215546Sopenharmony_ci         if (info->num_stream_output_components[stream]) {
603bf215546Sopenharmony_ci            nggso->emit[stream] =
604bf215546Sopenharmony_ci               ac_build_readlane(&ctx->ac, scratch_vgpr,
605bf215546Sopenharmony_ci                                 LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false));
606bf215546Sopenharmony_ci         }
607bf215546Sopenharmony_ci      }
608bf215546Sopenharmony_ci   }
609bf215546Sopenharmony_ci
610bf215546Sopenharmony_ci   /* Write out primitive data */
611bf215546Sopenharmony_ci   for (unsigned stream = 0; stream < 4; ++stream) {
612bf215546Sopenharmony_ci      if (!info->num_stream_output_components[stream])
613bf215546Sopenharmony_ci         continue;
614bf215546Sopenharmony_ci
615bf215546Sopenharmony_ci      if (isgs) {
616bf215546Sopenharmony_ci         ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]);
617bf215546Sopenharmony_ci      } else {
618bf215546Sopenharmony_ci         primemit_scan[stream].result_exclusive = tid;
619bf215546Sopenharmony_ci      }
620bf215546Sopenharmony_ci
621bf215546Sopenharmony_ci      tmp = LLVMBuildICmp(builder, LLVMIntULT, primemit_scan[stream].result_exclusive,
622bf215546Sopenharmony_ci                          nggso->emit[stream], "");
623bf215546Sopenharmony_ci      tmp = LLVMBuildAnd(builder, tmp, nggso->prim_enable[stream], "");
624bf215546Sopenharmony_ci      ac_build_ifcc(&ctx->ac, tmp, 5240);
625bf215546Sopenharmony_ci      {
626bf215546Sopenharmony_ci         LLVMValueRef offset_vtx =
627bf215546Sopenharmony_ci            LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, nggso->num_vertices, "");
628bf215546Sopenharmony_ci
629bf215546Sopenharmony_ci         for (unsigned i = 0; i < max_num_vertices; ++i) {
630bf215546Sopenharmony_ci            tmp = LLVMBuildICmp(builder, LLVMIntULT, LLVMConstInt(ctx->ac.i32, i, false),
631bf215546Sopenharmony_ci                                nggso->num_vertices, "");
632bf215546Sopenharmony_ci            ac_build_ifcc(&ctx->ac, tmp, 5241);
633bf215546Sopenharmony_ci            build_streamout_vertex(ctx, so_buffer, wgoffset_dw, stream, offset_vtx,
634bf215546Sopenharmony_ci                                   nggso->vertices[i]);
635bf215546Sopenharmony_ci            ac_build_endif(&ctx->ac, 5241);
636bf215546Sopenharmony_ci            offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, "");
637bf215546Sopenharmony_ci         }
638bf215546Sopenharmony_ci      }
639bf215546Sopenharmony_ci      ac_build_endif(&ctx->ac, 5240);
640bf215546Sopenharmony_ci   }
641bf215546Sopenharmony_ci}
642bf215546Sopenharmony_ci
/* LDS layout of ES vertex data for NGG culling. */
enum
{
   /* Byte offsets inside the packed element (lds_packed_data):
    *
    *   byte 0 - boolean "ES thread accepted" (i.e. not culled) flag
    *   byte 1 - new ES thread ID; GS loads it to prepare the prim export value
    *   byte 2 - TES rel patch ID
    *   byte 3 - 8-bit clip distance mask, where a set bit means that the clip
    *            distance is negative. The masks of all vertices are AND'ed
    *            and the primitive is culled if the result is non-zero.
    */
   lds_byte0_accept_flag = 0,
   lds_byte1_new_thread_id = 1,
   lds_byte2_tes_rel_patch_id = 2,
   lds_byte3_clipdist_neg_mask = 3,

   /* Element indices. Element 0 holds the lds_byteN_... fields above.
    * The lds_pos_cull_* entries share indices 1..3 with lds_pos_x/y/z.
    */
   lds_packed_data = 0,
   lds_pos_cull_x_div_w = 1,
   lds_pos_cull_y_div_w = 2,
   lds_pos_cull_w = 3,

   lds_pos_x = lds_packed_data + 1,
   lds_pos_y = lds_pos_x + 1,
   lds_pos_z = lds_pos_y + 1,
   lds_pos_w = lds_pos_z + 1,

   /* VS only: */
   lds_vertex_id = lds_pos_w + 1,
   lds_instance_id = lds_vertex_id + 1, /* optional */

   /* TES only (aliases the VS entries): */
   lds_tes_u = lds_vertex_id,
   lds_tes_v = lds_instance_id,
   lds_tes_patch_id = lds_tes_v + 1, /* optional */
};
675bf215546Sopenharmony_ci
676bf215546Sopenharmony_cistatic LLVMValueRef si_build_gep_i8_var(struct si_shader_context *ctx, LLVMValueRef ptr,
677bf215546Sopenharmony_ci                                        LLVMValueRef index)
678bf215546Sopenharmony_ci{
679bf215546Sopenharmony_ci   LLVMTypeRef pi8 = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
680bf215546Sopenharmony_ci
681bf215546Sopenharmony_ci   return LLVMBuildGEP(ctx->ac.builder, LLVMBuildPointerCast(ctx->ac.builder, ptr, pi8, ""), &index,
682bf215546Sopenharmony_ci                       1, "");
683bf215546Sopenharmony_ci}
684bf215546Sopenharmony_ci
685bf215546Sopenharmony_cistatic LLVMValueRef si_build_gep_i8(struct si_shader_context *ctx, LLVMValueRef ptr,
686bf215546Sopenharmony_ci                                    unsigned byte_index)
687bf215546Sopenharmony_ci{
688bf215546Sopenharmony_ci   assert(byte_index < 4);
689bf215546Sopenharmony_ci   return si_build_gep_i8_var(ctx, ptr, LLVMConstInt(ctx->ac.i32, byte_index, 0));
690bf215546Sopenharmony_ci}
691bf215546Sopenharmony_ci
692bf215546Sopenharmony_cistatic unsigned ngg_nogs_vertex_size(struct si_shader *shader)
693bf215546Sopenharmony_ci{
694bf215546Sopenharmony_ci   unsigned lds_vertex_size = 0;
695bf215546Sopenharmony_ci
696bf215546Sopenharmony_ci   /* The edgeflag is always stored in the last element that's also
697bf215546Sopenharmony_ci    * used for padding to reduce LDS bank conflicts. */
698bf215546Sopenharmony_ci   if (si_shader_uses_streamout(shader))
699bf215546Sopenharmony_ci      lds_vertex_size = 4 * shader->selector->info.num_outputs + 1;
700bf215546Sopenharmony_ci   if (gfx10_ngg_writes_user_edgeflags(shader))
701bf215546Sopenharmony_ci      lds_vertex_size = MAX2(lds_vertex_size, 1);
702bf215546Sopenharmony_ci
703bf215546Sopenharmony_ci   /* LDS size for passing data from GS to ES.
704bf215546Sopenharmony_ci    * GS stores Primitive IDs into LDS at the address corresponding
705bf215546Sopenharmony_ci    * to the ES thread of the provoking vertex. All ES threads
706bf215546Sopenharmony_ci    * load and export PrimitiveID for their thread.
707bf215546Sopenharmony_ci    */
708bf215546Sopenharmony_ci   if (shader->selector->stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id)
709bf215546Sopenharmony_ci      lds_vertex_size = MAX2(lds_vertex_size, 1);
710bf215546Sopenharmony_ci
711bf215546Sopenharmony_ci   if (shader->key.ge.opt.ngg_culling) {
712bf215546Sopenharmony_ci      if (shader->selector->stage == MESA_SHADER_VERTEX) {
713bf215546Sopenharmony_ci         STATIC_ASSERT(lds_instance_id + 1 == 7);
714bf215546Sopenharmony_ci         lds_vertex_size = MAX2(lds_vertex_size, 7);
715bf215546Sopenharmony_ci      } else {
716bf215546Sopenharmony_ci         assert(shader->selector->stage == MESA_SHADER_TESS_EVAL);
717bf215546Sopenharmony_ci
718bf215546Sopenharmony_ci         if (shader->selector->info.uses_primid || shader->key.ge.mono.u.vs_export_prim_id) {
719bf215546Sopenharmony_ci            STATIC_ASSERT(lds_tes_patch_id + 2 == 9); /* +1 for LDS padding */
720bf215546Sopenharmony_ci            lds_vertex_size = MAX2(lds_vertex_size, 9);
721bf215546Sopenharmony_ci         } else {
722bf215546Sopenharmony_ci            STATIC_ASSERT(lds_tes_v + 1 == 7);
723bf215546Sopenharmony_ci            lds_vertex_size = MAX2(lds_vertex_size, 7);
724bf215546Sopenharmony_ci         }
725bf215546Sopenharmony_ci      }
726bf215546Sopenharmony_ci   }
727bf215546Sopenharmony_ci
728bf215546Sopenharmony_ci   return lds_vertex_size;
729bf215546Sopenharmony_ci}
730bf215546Sopenharmony_ci
731bf215546Sopenharmony_ci/**
732bf215546Sopenharmony_ci * Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage
733bf215546Sopenharmony_ci * for the vertex outputs.
734bf215546Sopenharmony_ci */
735bf215546Sopenharmony_cistatic LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vtxid)
736bf215546Sopenharmony_ci{
737bf215546Sopenharmony_ci   /* The extra dword is used to avoid LDS bank conflicts. */
738bf215546Sopenharmony_ci   unsigned vertex_size = ngg_nogs_vertex_size(ctx->shader);
739bf215546Sopenharmony_ci   LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size);
740bf215546Sopenharmony_ci   LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS);
741bf215546Sopenharmony_ci   LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, "");
742bf215546Sopenharmony_ci   return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");
743bf215546Sopenharmony_ci}
744bf215546Sopenharmony_ci
745bf215546Sopenharmony_cistatic LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx, LLVMValueRef ret,
746bf215546Sopenharmony_ci                                          struct ac_arg param, unsigned return_index)
747bf215546Sopenharmony_ci{
748bf215546Sopenharmony_ci   LLVMValueRef v = ac_get_arg(&ctx->ac, param);
749bf215546Sopenharmony_ci
750bf215546Sopenharmony_ci   for (unsigned i = 0; i < 4; i++) {
751bf215546Sopenharmony_ci      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, ac_llvm_extract_elem(&ctx->ac, v, i),
752bf215546Sopenharmony_ci                                 return_index + i, "");
753bf215546Sopenharmony_ci   }
754bf215546Sopenharmony_ci   return ret;
755bf215546Sopenharmony_ci}
756bf215546Sopenharmony_ci
757bf215546Sopenharmony_cistatic void load_vertex_counts(struct si_shader_context *ctx, LLVMValueRef lds,
758bf215546Sopenharmony_ci                               unsigned max_waves, LLVMValueRef tid,
759bf215546Sopenharmony_ci                               LLVMValueRef *total_count,
760bf215546Sopenharmony_ci                               LLVMValueRef *prefix_sum)
761bf215546Sopenharmony_ci{
762bf215546Sopenharmony_ci   LLVMBuilderRef builder = ctx->ac.builder;
763bf215546Sopenharmony_ci   LLVMValueRef i8vec4_lane = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
764bf215546Sopenharmony_ci   unsigned num_i8vec4 = DIV_ROUND_UP(max_waves, 4);
765bf215546Sopenharmony_ci
766bf215546Sopenharmony_ci   /* If all threads loaded the vertex counts, it would cause many LDS bank conflicts
767bf215546Sopenharmony_ci    * and the performance could decrease up to WaveSize times (32x or 64x).
768bf215546Sopenharmony_ci    *
769bf215546Sopenharmony_ci    * Therefore, only load the i-th tuple of vertex counts in the i-th thread. Other threads will
770bf215546Sopenharmony_ci    * get them through readlane. 4 8-bit vertex counts are loaded per thread.
771bf215546Sopenharmony_ci    */
772bf215546Sopenharmony_ci   ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntULT, tid,
773bf215546Sopenharmony_ci                                         LLVMConstInt(ctx->ac.i32, num_i8vec4, 0), ""), 17771);
774bf215546Sopenharmony_ci   LLVMBuildStore(builder, LLVMBuildLoad2(builder, ctx->ac.i32, ac_build_gep0(&ctx->ac, lds, tid), ""), i8vec4_lane);
775bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, 17771);
776bf215546Sopenharmony_ci
777bf215546Sopenharmony_ci   /* Compute the number of ES waves. */
778bf215546Sopenharmony_ci   LLVMValueRef num_waves = get_tgsize(ctx);
779bf215546Sopenharmony_ci
780bf215546Sopenharmony_ci   /* Compute a byte mask where each byte is either 0 or 0xff depending on whether the wave
781bf215546Sopenharmony_ci    * exists. We need the mask to clear uninitialized bytes in LDS and to compute the prefix sum.
782bf215546Sopenharmony_ci    *
783bf215546Sopenharmony_ci    * 8 waves: valid_mask = ~0ull >> (64 - num_waves * 8)
784bf215546Sopenharmony_ci    * 4 waves: valid_mask = ~0 >> (32 - num_waves * 8)
785bf215546Sopenharmony_ci    */
786bf215546Sopenharmony_ci   LLVMValueRef num_waves8 = LLVMBuildShl(builder, num_waves, LLVMConstInt(ctx->ac.i32, 3, 0), "");
787bf215546Sopenharmony_ci   LLVMValueRef valid_mask;
788bf215546Sopenharmony_ci
789bf215546Sopenharmony_ci   if (max_waves > 4) {
790bf215546Sopenharmony_ci      LLVMValueRef num_waves8_rev = LLVMBuildSub(builder, LLVMConstInt(ctx->ac.i32, 64, 0),
791bf215546Sopenharmony_ci                                                 num_waves8, "");
792bf215546Sopenharmony_ci      valid_mask = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i64, ~0ull, 0),
793bf215546Sopenharmony_ci                                 LLVMBuildZExt(builder, num_waves8_rev, ctx->ac.i64, ""), "");
794bf215546Sopenharmony_ci   } else {
795bf215546Sopenharmony_ci      LLVMValueRef num_waves8_rev = LLVMBuildSub(builder, LLVMConstInt(ctx->ac.i32, 32, 0),
796bf215546Sopenharmony_ci                                                 num_waves8, "");
797bf215546Sopenharmony_ci      valid_mask = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i32, ~0, 0), num_waves8_rev, "");
798bf215546Sopenharmony_ci   }
799bf215546Sopenharmony_ci
800bf215546Sopenharmony_ci   /* Compute a byte mask where bytes below wave_id are 0xff, else they are 0.
801bf215546Sopenharmony_ci    *
802bf215546Sopenharmony_ci    * prefix_mask = ~(~0 << (wave_id * 8))
803bf215546Sopenharmony_ci    */
804bf215546Sopenharmony_ci   LLVMTypeRef type = max_waves > 4 ? ctx->ac.i64 : ctx->ac.i32;
805bf215546Sopenharmony_ci   LLVMValueRef wave_id8 = LLVMBuildShl(builder, get_wave_id_in_tg(ctx),
806bf215546Sopenharmony_ci                                        LLVMConstInt(ctx->ac.i32, 3, 0), "");
807bf215546Sopenharmony_ci   LLVMValueRef prefix_mask =
808bf215546Sopenharmony_ci      LLVMBuildNot(builder, LLVMBuildShl(builder, LLVMConstInt(type, ~0ull, 0),
809bf215546Sopenharmony_ci                                         LLVMBuildZExt(builder, wave_id8, type, ""), ""), "");
810bf215546Sopenharmony_ci
811bf215546Sopenharmony_ci   /* Compute the total vertex count and the vertex count of previous waves (prefix). */
812bf215546Sopenharmony_ci   *total_count = ctx->ac.i32_0;
813bf215546Sopenharmony_ci   *prefix_sum = ctx->ac.i32_0;
814bf215546Sopenharmony_ci
815bf215546Sopenharmony_ci   for (unsigned i = 0; i < num_i8vec4; i++) {
816bf215546Sopenharmony_ci      LLVMValueRef i8vec4;
817bf215546Sopenharmony_ci
818bf215546Sopenharmony_ci      i8vec4 = ac_build_readlane_no_opt_barrier(&ctx->ac, LLVMBuildLoad2(builder, ctx->ac.i32, i8vec4_lane, ""),
819bf215546Sopenharmony_ci                                                LLVMConstInt(ctx->ac.i32, i, 0));
820bf215546Sopenharmony_ci      /* Inactive waves have uninitialized vertex counts. Set them to 0 using this. */
821bf215546Sopenharmony_ci      i8vec4 = LLVMBuildAnd(builder, i8vec4,
822bf215546Sopenharmony_ci                            ac_unpack_param(&ctx->ac, valid_mask, 32 * i, 32), "");
823bf215546Sopenharmony_ci      /* Compute the sum of all i8vec4 components and add it to the result. */
824bf215546Sopenharmony_ci      *total_count = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.sad.u8", ctx->ac.i32,
825bf215546Sopenharmony_ci                                        (LLVMValueRef[]){i8vec4, ctx->ac.i32_0, *total_count},
826bf215546Sopenharmony_ci                                        3, AC_FUNC_ATTR_READNONE);
827bf215546Sopenharmony_ci      ac_set_range_metadata(&ctx->ac, *total_count, 0, 64*4 + 1); /* the result is at most 64*4 */
828bf215546Sopenharmony_ci
829bf215546Sopenharmony_ci      /* Compute the sum of the vertex counts of all previous waves. */
830bf215546Sopenharmony_ci      i8vec4 = LLVMBuildAnd(builder, i8vec4,
831bf215546Sopenharmony_ci                                ac_unpack_param(&ctx->ac, prefix_mask, 32 * i, 32), "");
832bf215546Sopenharmony_ci      *prefix_sum = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.sad.u8", ctx->ac.i32,
833bf215546Sopenharmony_ci                                       (LLVMValueRef[]){i8vec4, ctx->ac.i32_0, *prefix_sum},
834bf215546Sopenharmony_ci                                       3, AC_FUNC_ATTR_READNONE);
835bf215546Sopenharmony_ci      ac_set_range_metadata(&ctx->ac, *prefix_sum, 0, 64*4 + 1); /* the result is at most 64*4 */
836bf215546Sopenharmony_ci   }
837bf215546Sopenharmony_ci   *total_count = ac_build_readlane_no_opt_barrier(&ctx->ac, *total_count, NULL);
838bf215546Sopenharmony_ci}
839bf215546Sopenharmony_ci
840bf215546Sopenharmony_ci/**
841bf215546Sopenharmony_ci * Given a total thread count, update total and per-wave thread counts in input SGPRs
842bf215546Sopenharmony_ci * and return the per-wave thread count.
843bf215546Sopenharmony_ci *
844bf215546Sopenharmony_ci * \param new_num_threads    Total thread count on the input, per-wave thread count on the output.
845bf215546Sopenharmony_ci * \param tg_info            tg_info SGPR value
846bf215546Sopenharmony_ci * \param tg_info_num_bits   the bit size of thread count field in tg_info
847bf215546Sopenharmony_ci * \param tg_info_shift      the bit offset of the thread count field in tg_info
848bf215546Sopenharmony_ci * \param wave_info          merged_wave_info SGPR value
849bf215546Sopenharmony_ci * \param wave_info_num_bits the bit size of thread count field in merged_wave_info
850bf215546Sopenharmony_ci * \param wave_info_shift    the bit offset of the thread count field in merged_wave_info
851bf215546Sopenharmony_ci */
852bf215546Sopenharmony_cistatic void update_thread_counts(struct si_shader_context *ctx, LLVMValueRef *new_num_threads,
853bf215546Sopenharmony_ci                                 LLVMValueRef *tg_info, unsigned tg_info_num_bits,
854bf215546Sopenharmony_ci                                 unsigned tg_info_shift, LLVMValueRef *wave_info,
855bf215546Sopenharmony_ci                                 unsigned wave_info_num_bits, unsigned wave_info_shift)
856bf215546Sopenharmony_ci{
857bf215546Sopenharmony_ci   LLVMBuilderRef builder = ctx->ac.builder;
858bf215546Sopenharmony_ci
859bf215546Sopenharmony_ci   /* Update the total thread count. */
860bf215546Sopenharmony_ci   unsigned tg_info_mask = ~(u_bit_consecutive(0, tg_info_num_bits) << tg_info_shift);
861bf215546Sopenharmony_ci   *tg_info = LLVMBuildAnd(builder, *tg_info, LLVMConstInt(ctx->ac.i32, tg_info_mask, 0), "");
862bf215546Sopenharmony_ci   *tg_info = LLVMBuildOr(
863bf215546Sopenharmony_ci      builder, *tg_info,
864bf215546Sopenharmony_ci      LLVMBuildShl(builder, *new_num_threads, LLVMConstInt(ctx->ac.i32, tg_info_shift, 0), ""), "");
865bf215546Sopenharmony_ci
866bf215546Sopenharmony_ci   /* Update the per-wave thread count. */
867bf215546Sopenharmony_ci   LLVMValueRef prev_threads = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
868bf215546Sopenharmony_ci                                            LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), "");
869bf215546Sopenharmony_ci   *new_num_threads = LLVMBuildSub(builder, *new_num_threads, prev_threads, "");
870bf215546Sopenharmony_ci   *new_num_threads = ac_build_imax(&ctx->ac, *new_num_threads, ctx->ac.i32_0);
871bf215546Sopenharmony_ci   *new_num_threads =
872bf215546Sopenharmony_ci      ac_build_imin(&ctx->ac, *new_num_threads, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0));
873bf215546Sopenharmony_ci   unsigned wave_info_mask = ~(u_bit_consecutive(0, wave_info_num_bits) << wave_info_shift);
874bf215546Sopenharmony_ci   *wave_info = LLVMBuildAnd(builder, *wave_info, LLVMConstInt(ctx->ac.i32, wave_info_mask, 0), "");
875bf215546Sopenharmony_ci   *wave_info = LLVMBuildOr(
876bf215546Sopenharmony_ci      builder, *wave_info,
877bf215546Sopenharmony_ci      LLVMBuildShl(builder, *new_num_threads, LLVMConstInt(ctx->ac.i32, wave_info_shift, 0), ""),
878bf215546Sopenharmony_ci      "");
879bf215546Sopenharmony_ci}
880bf215546Sopenharmony_ci
881bf215546Sopenharmony_cistatic void gfx10_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValueRef accepted,
882bf215546Sopenharmony_ci                                           void *userdata)
883bf215546Sopenharmony_ci{
884bf215546Sopenharmony_ci   struct si_shader_context *ctx = container_of(ac, struct si_shader_context, ac);
885bf215546Sopenharmony_ci   LLVMValueRef *params = (LLVMValueRef *)userdata;
886bf215546Sopenharmony_ci   LLVMValueRef gs_accepted = params[0];
887bf215546Sopenharmony_ci   LLVMValueRef *gs_vtxptr = (LLVMValueRef *)params[1];
888bf215546Sopenharmony_ci
889bf215546Sopenharmony_ci   unsigned num_vertices;
890bf215546Sopenharmony_ci   ngg_get_vertices_per_prim(ctx, &num_vertices);
891bf215546Sopenharmony_ci
892bf215546Sopenharmony_ci   ac_build_ifcc(&ctx->ac, accepted, 0);
893bf215546Sopenharmony_ci   LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_1, gs_accepted);
894bf215546Sopenharmony_ci
895bf215546Sopenharmony_ci   if (gs_vtxptr) {
896bf215546Sopenharmony_ci      for (unsigned vtx = 0; vtx < num_vertices; vtx++) {
897bf215546Sopenharmony_ci         LLVMBuildStore(ctx->ac.builder, ctx->ac.i8_1,
898bf215546Sopenharmony_ci                        si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte0_accept_flag));
899bf215546Sopenharmony_ci      }
900bf215546Sopenharmony_ci   }
901bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, 0);
902bf215546Sopenharmony_ci}
903bf215546Sopenharmony_ci
904bf215546Sopenharmony_cistatic void add_clipdist_bit(struct si_shader_context *ctx, LLVMValueRef distance, unsigned i,
905bf215546Sopenharmony_ci                             LLVMValueRef *packed_data)
906bf215546Sopenharmony_ci{
907bf215546Sopenharmony_ci   LLVMValueRef neg = LLVMBuildFCmp(ctx->ac.builder, LLVMRealOLT, distance, ctx->ac.f32_0, "");
908bf215546Sopenharmony_ci   neg = LLVMBuildZExt(ctx->ac.builder, neg, ctx->ac.i32, "");
909bf215546Sopenharmony_ci   /* Put the negative distance flag into lds_byte3_clipdist_neg_mask. */
910bf215546Sopenharmony_ci   neg = LLVMBuildShl(ctx->ac.builder, neg, LLVMConstInt(ctx->ac.i32, 24 + i, 0), "");
911bf215546Sopenharmony_ci   *packed_data = LLVMBuildOr(ctx->ac.builder, *packed_data, neg, "");
912bf215546Sopenharmony_ci}
913bf215546Sopenharmony_ci
914bf215546Sopenharmony_cistatic bool add_clipdist_bits_for_clipvertex(struct si_shader_context *ctx,
915bf215546Sopenharmony_ci                                             unsigned clipdist_enable,
916bf215546Sopenharmony_ci                                             LLVMValueRef clipvertex[4],
917bf215546Sopenharmony_ci                                             LLVMValueRef *packed_data)
918bf215546Sopenharmony_ci{
919bf215546Sopenharmony_ci   struct ac_export_args clipdist[2];
920bf215546Sopenharmony_ci   bool added = false;
921bf215546Sopenharmony_ci
922bf215546Sopenharmony_ci   si_llvm_clipvertex_to_clipdist(ctx, clipdist, clipvertex);
923bf215546Sopenharmony_ci
924bf215546Sopenharmony_ci   for (unsigned j = 0; j < 8; j++) {
925bf215546Sopenharmony_ci      if (!(clipdist_enable & BITFIELD_BIT(j)))
926bf215546Sopenharmony_ci         continue;
927bf215546Sopenharmony_ci
928bf215546Sopenharmony_ci      LLVMValueRef distance = clipdist[j / 4].out[j % 4];
929bf215546Sopenharmony_ci      add_clipdist_bit(ctx, distance, j, packed_data);
930bf215546Sopenharmony_ci      added = true;
931bf215546Sopenharmony_ci   }
932bf215546Sopenharmony_ci   return added;
933bf215546Sopenharmony_ci}
934bf215546Sopenharmony_ci
935bf215546Sopenharmony_cistatic void cull_primitive(struct si_shader_context *ctx,
936bf215546Sopenharmony_ci                           LLVMValueRef pos[3][4], LLVMValueRef clipdist_accepted,
937bf215546Sopenharmony_ci                           LLVMValueRef out_prim_accepted, LLVMValueRef gs_vtxptr_accept[3])
938bf215546Sopenharmony_ci{
939bf215546Sopenharmony_ci   struct si_shader *shader = ctx->shader;
940bf215546Sopenharmony_ci   LLVMBuilderRef builder = ctx->ac.builder;
941bf215546Sopenharmony_ci
942bf215546Sopenharmony_ci   LLVMValueRef vp_scale[2] = {}, vp_translate[2] = {}, small_prim_precision = NULL;
943bf215546Sopenharmony_ci   LLVMValueRef clip_half_line_width[2] = {};
944bf215546Sopenharmony_ci
945bf215546Sopenharmony_ci   /* Load the viewport state for small prim culling. */
946bf215546Sopenharmony_ci   bool prim_is_lines = shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES;
947bf215546Sopenharmony_ci   LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->small_prim_cull_info);
948bf215546Sopenharmony_ci   /* Lines will always use the non-AA viewport transformation. */
949bf215546Sopenharmony_ci   LLVMValueRef vp = ac_build_load_to_sgpr(&ctx->ac, ptr,
950bf215546Sopenharmony_ci                                           prim_is_lines ? ctx->ac.i32_1 : ctx->ac.i32_0);
951bf215546Sopenharmony_ci   vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
952bf215546Sopenharmony_ci   vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
953bf215546Sopenharmony_ci   vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
954bf215546Sopenharmony_ci   vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
955bf215546Sopenharmony_ci   vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
956bf215546Sopenharmony_ci
957bf215546Sopenharmony_ci   /* Execute culling code. */
958bf215546Sopenharmony_ci   struct ac_cull_options options = {};
959bf215546Sopenharmony_ci   options.cull_view_xy = true;
960bf215546Sopenharmony_ci   options.cull_w = true;
961bf215546Sopenharmony_ci
962bf215546Sopenharmony_ci   if (prim_is_lines) {
963bf215546Sopenharmony_ci      ptr = LLVMBuildPointerCast(ctx->ac.builder, ptr,
964bf215546Sopenharmony_ci                                 LLVMPointerType(ctx->ac.v2f32, AC_ADDR_SPACE_CONST_32BIT), "");
965bf215546Sopenharmony_ci      LLVMValueRef terms = ac_build_load_to_sgpr(&ctx->ac, ptr, LLVMConstInt(ctx->ac.i32, 4, 0));
966bf215546Sopenharmony_ci      terms = LLVMBuildBitCast(builder, terms, ctx->ac.v2f32, "");
967bf215546Sopenharmony_ci      clip_half_line_width[0] = ac_llvm_extract_elem(&ctx->ac, terms, 0);
968bf215546Sopenharmony_ci      clip_half_line_width[1] = ac_llvm_extract_elem(&ctx->ac, terms, 1);
969bf215546Sopenharmony_ci      small_prim_precision = GET_FIELD(ctx, GS_STATE_SMALL_PRIM_PRECISION_NO_AA);
970bf215546Sopenharmony_ci
971bf215546Sopenharmony_ci      options.num_vertices = 2;
972bf215546Sopenharmony_ci      options.cull_small_prims = shader->key.ge.opt.ngg_culling & SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT;
973bf215546Sopenharmony_ci
974bf215546Sopenharmony_ci      assert(!(shader->key.ge.opt.ngg_culling & SI_NGG_CULL_BACK_FACE));
975bf215546Sopenharmony_ci      assert(!(shader->key.ge.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE));
976bf215546Sopenharmony_ci   } else {
977bf215546Sopenharmony_ci      /* Get the small prim filter precision. */
978bf215546Sopenharmony_ci      small_prim_precision = GET_FIELD(ctx, GS_STATE_SMALL_PRIM_PRECISION);
979bf215546Sopenharmony_ci
980bf215546Sopenharmony_ci      options.num_vertices = 3;
981bf215546Sopenharmony_ci      options.cull_front = shader->key.ge.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE;
982bf215546Sopenharmony_ci      options.cull_back = shader->key.ge.opt.ngg_culling & SI_NGG_CULL_BACK_FACE;
983bf215546Sopenharmony_ci      options.cull_small_prims = true; /* this would only be false with conservative rasterization */
984bf215546Sopenharmony_ci      options.cull_zero_area = options.cull_front || options.cull_back;
985bf215546Sopenharmony_ci   }
986bf215546Sopenharmony_ci
987bf215546Sopenharmony_ci   /* Extract the small prim precision. */
988bf215546Sopenharmony_ci   small_prim_precision =
989bf215546Sopenharmony_ci      LLVMBuildOr(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 0x70, 0), "");
990bf215546Sopenharmony_ci   small_prim_precision =
991bf215546Sopenharmony_ci      LLVMBuildShl(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 23, 0), "");
992bf215546Sopenharmony_ci   small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, "");
993bf215546Sopenharmony_ci
994bf215546Sopenharmony_ci   /* Tell ES threads whether their vertex survived. */
995bf215546Sopenharmony_ci   LLVMValueRef params[] = {
996bf215546Sopenharmony_ci      out_prim_accepted,
997bf215546Sopenharmony_ci      (void*)gs_vtxptr_accept,
998bf215546Sopenharmony_ci   };
999bf215546Sopenharmony_ci   ac_cull_primitive(&ctx->ac, pos, clipdist_accepted, vp_scale, vp_translate,
1000bf215546Sopenharmony_ci                     small_prim_precision, clip_half_line_width,
1001bf215546Sopenharmony_ci                     &options, gfx10_build_primitive_accepted, params);
1002bf215546Sopenharmony_ci}
1003bf215546Sopenharmony_ci
1004bf215546Sopenharmony_ci/**
1005bf215546Sopenharmony_ci * Cull primitives for NGG VS or TES, then compact vertices, which happens
1006bf215546Sopenharmony_ci * before the VS or TES main function. Return values for the main function.
1007bf215546Sopenharmony_ci * Also return the position, which is passed to the shader as an input,
1008bf215546Sopenharmony_ci * so that we don't compute it twice.
1009bf215546Sopenharmony_ci */
1010bf215546Sopenharmony_civoid gfx10_ngg_culling_build_end(struct si_shader_context *ctx)
1011bf215546Sopenharmony_ci{
1012bf215546Sopenharmony_ci   struct si_shader *shader = ctx->shader;
1013bf215546Sopenharmony_ci   struct si_shader_selector *sel = shader->selector;
1014bf215546Sopenharmony_ci   struct si_shader_info *info = &sel->info;
1015bf215546Sopenharmony_ci   LLVMBuilderRef builder = ctx->ac.builder;
1016bf215546Sopenharmony_ci   LLVMValueRef *addrs = ctx->abi.outputs;
1017bf215546Sopenharmony_ci   unsigned max_waves = DIV_ROUND_UP(ctx->screen->ngg_subgroup_size, ctx->ac.wave_size);
1018bf215546Sopenharmony_ci
1019bf215546Sopenharmony_ci   assert(shader->key.ge.opt.ngg_culling);
1020bf215546Sopenharmony_ci   assert(shader->key.ge.as_ngg);
1021bf215546Sopenharmony_ci   assert(sel->stage == MESA_SHADER_VERTEX ||
1022bf215546Sopenharmony_ci          (sel->stage == MESA_SHADER_TESS_EVAL && !shader->key.ge.as_es));
1023bf215546Sopenharmony_ci
1024bf215546Sopenharmony_ci   LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, gfx10_get_thread_id_in_tg(ctx));
1025bf215546Sopenharmony_ci   LLVMValueRef packed_data = ctx->ac.i32_0;
1026bf215546Sopenharmony_ci   LLVMValueRef position[4] = {};
1027bf215546Sopenharmony_ci   unsigned pos_index = 0;
1028bf215546Sopenharmony_ci   unsigned clip_plane_enable = SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(shader->key.ge.opt.ngg_culling);
1029bf215546Sopenharmony_ci   unsigned clipdist_enable = (sel->info.clipdist_mask & clip_plane_enable) | sel->info.culldist_mask;
1030bf215546Sopenharmony_ci   bool has_clipdist_mask = false;
1031bf215546Sopenharmony_ci
1032bf215546Sopenharmony_ci   for (unsigned i = 0; i < info->num_outputs; i++) {
1033bf215546Sopenharmony_ci      LLVMValueRef clipvertex[4];
1034bf215546Sopenharmony_ci      unsigned base;
1035bf215546Sopenharmony_ci
1036bf215546Sopenharmony_ci      switch (info->output_semantic[i]) {
1037bf215546Sopenharmony_ci      case VARYING_SLOT_POS:
1038bf215546Sopenharmony_ci         /* If we are going to cull everything (rasterizer_discard), discard
1039bf215546Sopenharmony_ci          * the position. This is useful for analyzing maximum theoretical
1040bf215546Sopenharmony_ci          * performance without VS input loads.
1041bf215546Sopenharmony_ci          */
1042bf215546Sopenharmony_ci         if (shader->key.ge.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE &&
1043bf215546Sopenharmony_ci             shader->key.ge.opt.ngg_culling & SI_NGG_CULL_BACK_FACE) {
1044bf215546Sopenharmony_ci            for (unsigned j = 0; j < 4; j++)
1045bf215546Sopenharmony_ci               LLVMBuildStore(builder, LLVMGetUndef(ctx->ac.f32), addrs[4 * i + j]);
1046bf215546Sopenharmony_ci            break;
1047bf215546Sopenharmony_ci         }
1048bf215546Sopenharmony_ci
1049bf215546Sopenharmony_ci         pos_index = i;
1050bf215546Sopenharmony_ci         for (unsigned j = 0; j < 4; j++) {
1051bf215546Sopenharmony_ci            position[j] = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.f32, addrs[4 * i + j], "");
1052bf215546Sopenharmony_ci         }
1053bf215546Sopenharmony_ci
1054bf215546Sopenharmony_ci         /* Store Position.W into LDS. */
1055bf215546Sopenharmony_ci         LLVMBuildStore(
1056bf215546Sopenharmony_ci            builder, ac_to_integer(&ctx->ac, position[3]),
1057bf215546Sopenharmony_ci            ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_cull_w, 0)));
1058bf215546Sopenharmony_ci
1059bf215546Sopenharmony_ci         /* Store Position.XY / W into LDS. */
1060bf215546Sopenharmony_ci         for (unsigned chan = 0; chan < 2; chan++) {
1061bf215546Sopenharmony_ci            LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]);
1062bf215546Sopenharmony_ci            LLVMBuildStore(
1063bf215546Sopenharmony_ci               builder, ac_to_integer(&ctx->ac, val),
1064bf215546Sopenharmony_ci               ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_cull_x_div_w + chan, 0)));
1065bf215546Sopenharmony_ci         }
1066bf215546Sopenharmony_ci         break;
1067bf215546Sopenharmony_ci
1068bf215546Sopenharmony_ci      case VARYING_SLOT_CLIP_DIST0:
1069bf215546Sopenharmony_ci      case VARYING_SLOT_CLIP_DIST1:
1070bf215546Sopenharmony_ci         base = info->output_semantic[i] == VARYING_SLOT_CLIP_DIST1 ? 4 : 0;
1071bf215546Sopenharmony_ci
1072bf215546Sopenharmony_ci         for (unsigned j = 0; j < 4; j++) {
1073bf215546Sopenharmony_ci            unsigned index = base + j;
1074bf215546Sopenharmony_ci
1075bf215546Sopenharmony_ci            if (!(clipdist_enable & BITFIELD_BIT(index)))
1076bf215546Sopenharmony_ci               continue;
1077bf215546Sopenharmony_ci
1078bf215546Sopenharmony_ci            LLVMValueRef distance = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.f32, addrs[4 * i + j], "");
1079bf215546Sopenharmony_ci            add_clipdist_bit(ctx, distance, index, &packed_data);
1080bf215546Sopenharmony_ci            has_clipdist_mask = true;
1081bf215546Sopenharmony_ci         }
1082bf215546Sopenharmony_ci         break;
1083bf215546Sopenharmony_ci
1084bf215546Sopenharmony_ci      case VARYING_SLOT_CLIP_VERTEX:
1085bf215546Sopenharmony_ci         for (unsigned j = 0; j < 4; j++)
1086bf215546Sopenharmony_ci            clipvertex[j] = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.f32, addrs[4 * i + j], "");
1087bf215546Sopenharmony_ci
1088bf215546Sopenharmony_ci         if (add_clipdist_bits_for_clipvertex(ctx, clipdist_enable, clipvertex, &packed_data))
1089bf215546Sopenharmony_ci            has_clipdist_mask = true;
1090bf215546Sopenharmony_ci         break;
1091bf215546Sopenharmony_ci      }
1092bf215546Sopenharmony_ci   }
1093bf215546Sopenharmony_ci
1094bf215546Sopenharmony_ci   if (clip_plane_enable && !sel->info.clipdist_mask) {
1095bf215546Sopenharmony_ci      /* When clip planes are enabled and there are no clip distance outputs,
1096bf215546Sopenharmony_ci       * we should use user clip planes and cull against the position.
1097bf215546Sopenharmony_ci       */
1098bf215546Sopenharmony_ci      assert(!has_clipdist_mask);
1099bf215546Sopenharmony_ci      if (add_clipdist_bits_for_clipvertex(ctx, clipdist_enable, position, &packed_data))
1100bf215546Sopenharmony_ci         has_clipdist_mask = true;
1101bf215546Sopenharmony_ci   }
1102bf215546Sopenharmony_ci
1103bf215546Sopenharmony_ci   /* Initialize the packed data. */
1104bf215546Sopenharmony_ci   LLVMBuildStore(
1105bf215546Sopenharmony_ci      builder, packed_data,
1106bf215546Sopenharmony_ci      ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0)));
1107bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
1108bf215546Sopenharmony_ci
1109bf215546Sopenharmony_ci   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
1110bf215546Sopenharmony_ci   ac_build_s_barrier(&ctx->ac, ctx->stage);
1111bf215546Sopenharmony_ci
1112bf215546Sopenharmony_ci   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
1113bf215546Sopenharmony_ci
1114bf215546Sopenharmony_ci   unsigned num_vertices;
1115bf215546Sopenharmony_ci   ngg_get_vertices_per_prim(ctx, &num_vertices);
1116bf215546Sopenharmony_ci
1117bf215546Sopenharmony_ci   /* The hardware requires that there are no holes between unculled vertices,
1118bf215546Sopenharmony_ci    * which means we have to pack ES threads, i.e. reduce the ES thread count
1119bf215546Sopenharmony_ci    * and move ES input VGPRs to lower threads. The upside is that varyings
1120bf215546Sopenharmony_ci    * are only fetched and computed for unculled vertices.
1121bf215546Sopenharmony_ci    *
1122bf215546Sopenharmony_ci    * Vertex compaction:
1123bf215546Sopenharmony_ci    *
1124bf215546Sopenharmony_ci    * Part 1: Store the surviving vertex count for each wave in LDS.
1125bf215546Sopenharmony_ci    *   - The GS culling code notifies ES threads which vertices were accepted.
1126bf215546Sopenharmony_ci    *   - Barrier
1127bf215546Sopenharmony_ci    *   - ES threads will compute the vertex count and store it in LDS.
1128bf215546Sopenharmony_ci    * - Barrier
1129bf215546Sopenharmony_ci    * - Each wave loads the vertex counts from LDS.
1130bf215546Sopenharmony_ci    *
1131bf215546Sopenharmony_ci    * Part 2: Compact ES threads:
1132bf215546Sopenharmony_ci    * - Compute the prefix sum for each surviving vertex. This is the new thread ID
1133bf215546Sopenharmony_ci    *   of the vertex.
1134bf215546Sopenharmony_ci    * - Write input VGPRs and vertex positions for each surviving vertex into the LDS
1135bf215546Sopenharmony_ci    *   address of the new thread ID.
1136bf215546Sopenharmony_ci    * - Now kill all waves that have inactive threads.
1137bf215546Sopenharmony_ci    * - Barrier
1138bf215546Sopenharmony_ci    * - Update vertex indices and null flag in the GS input VGPRs.
1139bf215546Sopenharmony_ci    *
    * Part 3: Update input GPRs
1141bf215546Sopenharmony_ci    * - For all waves, update per-wave thread counts in input SGPRs.
1142bf215546Sopenharmony_ci    * - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs).
1143bf215546Sopenharmony_ci    */
1144bf215546Sopenharmony_ci
1145bf215546Sopenharmony_ci   LLVMValueRef vtxindex[3];
1146bf215546Sopenharmony_ci   for (unsigned i = 0; i < num_vertices; ++i)
1147bf215546Sopenharmony_ci      vtxindex[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[i / 2], (i & 1) * 16, 16);
1148bf215546Sopenharmony_ci
1149bf215546Sopenharmony_ci   LLVMValueRef gs_vtxptr[3];
1150bf215546Sopenharmony_ci   for (unsigned i = 0; i < num_vertices; i++)
1151bf215546Sopenharmony_ci      gs_vtxptr[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
1152bf215546Sopenharmony_ci
1153bf215546Sopenharmony_ci   es_vtxptr = ngg_nogs_vertex_ptr(ctx, gfx10_get_thread_id_in_tg(ctx));
1154bf215546Sopenharmony_ci
1155bf215546Sopenharmony_ci   /* Adding these optimization barriers improves the generated code as follows. Crazy right?
1156bf215546Sopenharmony_ci    *
1157bf215546Sopenharmony_ci    * - s_mov_b32 s4, 0xffff
1158bf215546Sopenharmony_ci    * - v_lshrrev_b32_e32 v10, 16, v0
1159bf215546Sopenharmony_ci    * - v_and_b32_e32 v12, s4, v0
1160bf215546Sopenharmony_ci    * - v_and_b32_e32 v11, s4, v1
1161bf215546Sopenharmony_ci    *   s_bfe_u32 s4, s3, 0x80008
1162bf215546Sopenharmony_ci    * - s_mov_b64 s[8:9], 0
1163bf215546Sopenharmony_ci    * - v_mul_u32_u24_e32 v0, 28, v10
1164bf215546Sopenharmony_ci    * - v_mul_u32_u24_e32 v9, 28, v12
1165bf215546Sopenharmony_ci    * - v_mul_u32_u24_e32 v1, 28, v11
1166bf215546Sopenharmony_ci    * + v_mov_b32_e32 v11, 28
1167bf215546Sopenharmony_ci    *   v_cmp_gt_u32_e32 vcc, s4, v2
1168bf215546Sopenharmony_ci    * + s_mov_b64 s[8:9], 0
1169bf215546Sopenharmony_ci    *   s_waitcnt lgkmcnt(0)
1170bf215546Sopenharmony_ci    *   s_barrier
1171bf215546Sopenharmony_ci    * + v_mul_u32_u24_sdwa v10, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1172bf215546Sopenharmony_ci    * + v_mul_u32_u24_sdwa v23, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1173bf215546Sopenharmony_ci    * + v_mul_u32_u24_sdwa v0, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1174bf215546Sopenharmony_ci    *   s_and_saveexec_b64 s[44:45], vcc
1175bf215546Sopenharmony_ci    *   s_cbranch_execz BB2_8
1176bf215546Sopenharmony_ci    * - v_mul_u32_u24_e32 v16, 28, v12
1177bf215546Sopenharmony_ci    * - v_mul_u32_u24_e32 v17, 28, v11
1178bf215546Sopenharmony_ci    * - v_mul_u32_u24_e32 v18, 28, v10
1179bf215546Sopenharmony_ci    */
1180bf215546Sopenharmony_ci   for (unsigned i = 0; i < num_vertices; i++)
1181bf215546Sopenharmony_ci      ac_build_optimization_barrier(&ctx->ac, &gs_vtxptr[i], false);
1182bf215546Sopenharmony_ci
1183bf215546Sopenharmony_ci   LLVMValueRef gs_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
1184bf215546Sopenharmony_ci
1185bf215546Sopenharmony_ci   /* Do culling in GS threads. */
1186bf215546Sopenharmony_ci   ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 16002);
1187bf215546Sopenharmony_ci   {
1188bf215546Sopenharmony_ci      /* Load positions. */
1189bf215546Sopenharmony_ci      LLVMValueRef pos[3][4] = {};
1190bf215546Sopenharmony_ci      LLVMValueRef clipdist_neg_mask = NULL;
1191bf215546Sopenharmony_ci
1192bf215546Sopenharmony_ci      for (unsigned vtx = 0; vtx < num_vertices; vtx++) {
1193bf215546Sopenharmony_ci         for (unsigned chan = 0; chan < 4; chan++) {
1194bf215546Sopenharmony_ci            unsigned index;
1195bf215546Sopenharmony_ci            if (chan == 0 || chan == 1)
1196bf215546Sopenharmony_ci               index = lds_pos_cull_x_div_w + chan;
1197bf215546Sopenharmony_ci            else if (chan == 3)
1198bf215546Sopenharmony_ci               index = lds_pos_cull_w;
1199bf215546Sopenharmony_ci            else
1200bf215546Sopenharmony_ci               continue;
1201bf215546Sopenharmony_ci
1202bf215546Sopenharmony_ci            LLVMValueRef addr =
1203bf215546Sopenharmony_ci               ac_build_gep0(&ctx->ac, gs_vtxptr[vtx], LLVMConstInt(ctx->ac.i32, index, 0));
1204bf215546Sopenharmony_ci            pos[vtx][chan] = LLVMBuildLoad(builder, addr, "");
1205bf215546Sopenharmony_ci            pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]);
1206bf215546Sopenharmony_ci         }
1207bf215546Sopenharmony_ci
1208bf215546Sopenharmony_ci         if (has_clipdist_mask) {
1209bf215546Sopenharmony_ci            /* Load and AND clip distance masks. Each bit means whether that clip distance is
1210bf215546Sopenharmony_ci             * negative. If all masks are AND'ed and the result is 0, the primitive isn't culled
1211bf215546Sopenharmony_ci             * by clip distances.
1212bf215546Sopenharmony_ci             */
1213bf215546Sopenharmony_ci            LLVMValueRef addr = si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte3_clipdist_neg_mask);
1214bf215546Sopenharmony_ci            LLVMValueRef mask = LLVMBuildLoad2(builder, ctx->ac.i8, addr, "");
1215bf215546Sopenharmony_ci            if (!clipdist_neg_mask)
1216bf215546Sopenharmony_ci               clipdist_neg_mask = mask;
1217bf215546Sopenharmony_ci            else
1218bf215546Sopenharmony_ci               clipdist_neg_mask = LLVMBuildAnd(builder, clipdist_neg_mask, mask, "");
1219bf215546Sopenharmony_ci         }
1220bf215546Sopenharmony_ci      }
1221bf215546Sopenharmony_ci
1222bf215546Sopenharmony_ci      LLVMValueRef clipdist_accepted =
1223bf215546Sopenharmony_ci         has_clipdist_mask ? LLVMBuildICmp(builder, LLVMIntEQ, clipdist_neg_mask, ctx->ac.i8_0, "")
1224bf215546Sopenharmony_ci                           : ctx->ac.i1true;
1225bf215546Sopenharmony_ci
1226bf215546Sopenharmony_ci      cull_primitive(ctx, pos, clipdist_accepted, gs_accepted, gs_vtxptr);
1227bf215546Sopenharmony_ci   }
1228bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, 16002);
1229bf215546Sopenharmony_ci
1230bf215546Sopenharmony_ci   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
1231bf215546Sopenharmony_ci   ac_build_s_barrier(&ctx->ac, ctx->stage);
1232bf215546Sopenharmony_ci
1233bf215546Sopenharmony_ci   gs_accepted = LLVMBuildLoad2(builder, ctx->ac.i32, gs_accepted, "");
1234bf215546Sopenharmony_ci
1235bf215546Sopenharmony_ci   LLVMValueRef vertex_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i1, "");
1236bf215546Sopenharmony_ci   LLVMValueRef vertex_mask = ac_build_alloca(&ctx->ac, ctx->ac.iN_wavemask, "");
1237bf215546Sopenharmony_ci
1238bf215546Sopenharmony_ci   /* Convert the per-vertex accept flag to a vertex thread mask, store it in registers. */
1239bf215546Sopenharmony_ci   ac_build_ifcc(&ctx->ac, si_is_es_thread(ctx), 16007);
1240bf215546Sopenharmony_ci   {
1241bf215546Sopenharmony_ci      LLVMValueRef accepted =
1242bf215546Sopenharmony_ci         LLVMBuildLoad2(builder, ctx->ac.i8, si_build_gep_i8(ctx, es_vtxptr, lds_byte0_accept_flag), "");
1243bf215546Sopenharmony_ci      accepted = LLVMBuildICmp(builder, LLVMIntNE, accepted, ctx->ac.i8_0, "");
1244bf215546Sopenharmony_ci      LLVMValueRef mask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);
1245bf215546Sopenharmony_ci
1246bf215546Sopenharmony_ci      LLVMBuildStore(builder, accepted, vertex_accepted);
1247bf215546Sopenharmony_ci      LLVMBuildStore(builder, mask, vertex_mask);
1248bf215546Sopenharmony_ci   }
1249bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, 16007);
1250bf215546Sopenharmony_ci
1251bf215546Sopenharmony_ci   /* Store the per-wave vertex count to LDS. Non-ES waves store 0. */
1252bf215546Sopenharmony_ci   vertex_mask = LLVMBuildLoad2(builder, ctx->ac.iN_wavemask, vertex_mask, "");
1253bf215546Sopenharmony_ci   ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ, tid, ctx->ac.i32_0, ""), 16008);
1254bf215546Sopenharmony_ci   {
1255bf215546Sopenharmony_ci      LLVMValueRef vertex_count = ac_build_bit_count(&ctx->ac, vertex_mask);
1256bf215546Sopenharmony_ci      LLVMBuildStore(builder, LLVMBuildTrunc(builder, vertex_count, ctx->ac.i8, ""),
1257bf215546Sopenharmony_ci                     si_build_gep_i8_var(ctx, ctx->gs_ngg_scratch, get_wave_id_in_tg(ctx)));
1258bf215546Sopenharmony_ci   }
1259bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, 16008);
1260bf215546Sopenharmony_ci
1261bf215546Sopenharmony_ci   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
1262bf215546Sopenharmony_ci   ac_build_s_barrier(&ctx->ac, ctx->stage);
1263bf215546Sopenharmony_ci
1264bf215546Sopenharmony_ci   /* Load the vertex masks and compute the new ES thread count. */
1265bf215546Sopenharmony_ci   LLVMValueRef new_num_es_threads, prefix_sum, kill_wave;
1266bf215546Sopenharmony_ci   load_vertex_counts(ctx, ctx->gs_ngg_scratch, max_waves, tid, &new_num_es_threads,
1267bf215546Sopenharmony_ci                      &prefix_sum);
1268bf215546Sopenharmony_ci
1269bf215546Sopenharmony_ci   bool uses_instance_id = ctx->stage == MESA_SHADER_VERTEX &&
1270bf215546Sopenharmony_ci                           (sel->info.uses_instanceid ||
1271bf215546Sopenharmony_ci                            shader->key.ge.part.vs.prolog.instance_divisor_is_one ||
1272bf215546Sopenharmony_ci                            shader->key.ge.part.vs.prolog.instance_divisor_is_fetched);
1273bf215546Sopenharmony_ci   bool uses_tes_prim_id = ctx->stage == MESA_SHADER_TESS_EVAL &&
1274bf215546Sopenharmony_ci                           (sel->info.uses_primid || shader->key.ge.mono.u.vs_export_prim_id);
1275bf215546Sopenharmony_ci
1276bf215546Sopenharmony_ci   /* ES threads compute their prefix sum, which is the new ES thread ID.
1277bf215546Sopenharmony_ci    * Then they write the vertex position and input VGPRs into the LDS address
1278bf215546Sopenharmony_ci    * of the new thread ID. It will be used to load input VGPRs by compacted
1279bf215546Sopenharmony_ci    * threads.
1280bf215546Sopenharmony_ci    */
1281bf215546Sopenharmony_ci   vertex_accepted = LLVMBuildLoad2(builder, ctx->ac.i1, vertex_accepted, "");
1282bf215546Sopenharmony_ci   ac_build_ifcc(&ctx->ac, vertex_accepted, 16009);
1283bf215546Sopenharmony_ci   {
1284bf215546Sopenharmony_ci      /* Add the number of bits set in vertex_mask up to the current thread ID - 1
1285bf215546Sopenharmony_ci       * to get the prefix sum.
1286bf215546Sopenharmony_ci       */
1287bf215546Sopenharmony_ci      prefix_sum = LLVMBuildAdd(builder, prefix_sum, ac_build_mbcnt(&ctx->ac, vertex_mask), "");
1288bf215546Sopenharmony_ci
1289bf215546Sopenharmony_ci      LLVMValueRef new_id = prefix_sum;
1290bf215546Sopenharmony_ci      LLVMValueRef new_vtx = ngg_nogs_vertex_ptr(ctx, new_id);
1291bf215546Sopenharmony_ci
1292bf215546Sopenharmony_ci      LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""),
1293bf215546Sopenharmony_ci                     si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id));
1294bf215546Sopenharmony_ci
1295bf215546Sopenharmony_ci      /* Store Position.XYZW into LDS. */
1296bf215546Sopenharmony_ci      for (unsigned chan = 0; chan < 4; chan++) {
1297bf215546Sopenharmony_ci         LLVMBuildStore(
1298bf215546Sopenharmony_ci            builder, ac_to_integer(&ctx->ac,
1299bf215546Sopenharmony_ci                                   LLVMBuildLoad2(builder, ctx->ac.f32, addrs[4 * pos_index + chan], "")),
1300bf215546Sopenharmony_ci            ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0)));
1301bf215546Sopenharmony_ci      }
1302bf215546Sopenharmony_ci
1303bf215546Sopenharmony_ci      /* Store VertexID and InstanceID into LDS. ES threads will have to load them
1304bf215546Sopenharmony_ci       * from LDS after vertex compaction and use them instead of their own
1305bf215546Sopenharmony_ci       * system values.
1306bf215546Sopenharmony_ci       */
1307bf215546Sopenharmony_ci      if (ctx->stage == MESA_SHADER_VERTEX) {
1308bf215546Sopenharmony_ci         LLVMBuildStore(
1309bf215546Sopenharmony_ci            builder, ctx->abi.vertex_id,
1310bf215546Sopenharmony_ci            ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0)));
1311bf215546Sopenharmony_ci         if (uses_instance_id) {
1312bf215546Sopenharmony_ci            LLVMBuildStore(
1313bf215546Sopenharmony_ci               builder, ctx->abi.instance_id,
1314bf215546Sopenharmony_ci               ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_instance_id, 0)));
1315bf215546Sopenharmony_ci         }
1316bf215546Sopenharmony_ci      } else {
1317bf215546Sopenharmony_ci         assert(ctx->stage == MESA_SHADER_TESS_EVAL);
1318bf215546Sopenharmony_ci         LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tes_u)),
1319bf215546Sopenharmony_ci                        ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_tes_u, 0)));
1320bf215546Sopenharmony_ci         LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tes_v)),
1321bf215546Sopenharmony_ci                        ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_tes_v, 0)));
1322bf215546Sopenharmony_ci         LLVMBuildStore(builder, LLVMBuildTrunc(builder, ac_get_arg(&ctx->ac, ctx->args.tes_rel_patch_id), ctx->ac.i8, ""),
1323bf215546Sopenharmony_ci                        si_build_gep_i8(ctx, new_vtx, lds_byte2_tes_rel_patch_id));
1324bf215546Sopenharmony_ci         if (uses_tes_prim_id) {
1325bf215546Sopenharmony_ci            LLVMBuildStore(
1326bf215546Sopenharmony_ci               builder, ac_get_arg(&ctx->ac, ctx->args.tes_patch_id),
1327bf215546Sopenharmony_ci               ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)));
1328bf215546Sopenharmony_ci         }
1329bf215546Sopenharmony_ci      }
1330bf215546Sopenharmony_ci   }
1331bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, 16009);
1332bf215546Sopenharmony_ci
1333bf215546Sopenharmony_ci   /* If all vertices are culled, set the primitive count to 0, so that all waves are culled here. */
1334bf215546Sopenharmony_ci   LLVMValueRef num_primitives = ngg_get_prim_cnt(ctx);
1335bf215546Sopenharmony_ci   num_primitives = LLVMBuildSelect(builder,
1336bf215546Sopenharmony_ci                                    LLVMBuildICmp(builder, LLVMIntEQ, new_num_es_threads,
1337bf215546Sopenharmony_ci                                                  ctx->ac.i32_0, ""),
1338bf215546Sopenharmony_ci                                    ctx->ac.i32_0, num_primitives, "");
1339bf215546Sopenharmony_ci   /* Kill waves that have inactive threads. */
1340bf215546Sopenharmony_ci   kill_wave = LLVMBuildICmp(builder, LLVMIntULE,
1341bf215546Sopenharmony_ci                             ac_build_imax(&ctx->ac, new_num_es_threads, num_primitives),
1342bf215546Sopenharmony_ci                             LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
1343bf215546Sopenharmony_ci                                          LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""),
1344bf215546Sopenharmony_ci                             "");
1345bf215546Sopenharmony_ci   ac_build_ifcc(&ctx->ac, kill_wave, 19202);
1346bf215546Sopenharmony_ci   {
1347bf215546Sopenharmony_ci      /* If we are killing wave 0, send that there are no primitives
1348bf215546Sopenharmony_ci       * in this threadgroup.
1349bf215546Sopenharmony_ci       */
1350bf215546Sopenharmony_ci      ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ctx->ac.i32_0);
1351bf215546Sopenharmony_ci      ac_build_s_endpgm(&ctx->ac);
1352bf215546Sopenharmony_ci   }
1353bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, 19202);
1354bf215546Sopenharmony_ci
1355bf215546Sopenharmony_ci   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
1356bf215546Sopenharmony_ci   ac_build_s_barrier(&ctx->ac, ctx->stage);
1357bf215546Sopenharmony_ci
1358bf215546Sopenharmony_ci   /* Send the final vertex and primitive counts. */
1359bf215546Sopenharmony_ci   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), new_num_es_threads,
1360bf215546Sopenharmony_ci                                 ngg_get_prim_cnt(ctx));
1361bf215546Sopenharmony_ci
1362bf215546Sopenharmony_ci   /* Update thread counts in SGPRs. */
1363bf215546Sopenharmony_ci   LLVMValueRef new_gs_tg_info = ac_get_arg(&ctx->ac, ctx->args.gs_tg_info);
1364bf215546Sopenharmony_ci   LLVMValueRef new_merged_wave_info = ac_get_arg(&ctx->ac, ctx->args.merged_wave_info);
1365bf215546Sopenharmony_ci
1366bf215546Sopenharmony_ci   /* This also converts the thread count from the total count to the per-wave count. */
1367bf215546Sopenharmony_ci   update_thread_counts(ctx, &new_num_es_threads, &new_gs_tg_info, 9, 12, &new_merged_wave_info, 8,
1368bf215546Sopenharmony_ci                        0);
1369bf215546Sopenharmony_ci
1370bf215546Sopenharmony_ci   /* Update vertex indices in VGPR0 (same format as NGG passthrough).
1371bf215546Sopenharmony_ci    *
1372bf215546Sopenharmony_ci    * Set the null flag at the beginning (culled), and then
1373bf215546Sopenharmony_ci    * overwrite it for accepted primitives.
1374bf215546Sopenharmony_ci    */
1375bf215546Sopenharmony_ci   LLVMValueRef new_vgpr0 =
1376bf215546Sopenharmony_ci      ac_build_alloca_init(&ctx->ac, LLVMConstInt(ctx->ac.i32, 1u << 31, 0), "");
1377bf215546Sopenharmony_ci
1378bf215546Sopenharmony_ci   /* Get vertex indices after vertex compaction. */
1379bf215546Sopenharmony_ci   ac_build_ifcc(&ctx->ac, LLVMBuildTrunc(builder, gs_accepted, ctx->ac.i1, ""), 16011);
1380bf215546Sopenharmony_ci   {
1381bf215546Sopenharmony_ci      struct ac_ngg_prim prim = {};
1382bf215546Sopenharmony_ci      prim.num_vertices = num_vertices;
1383bf215546Sopenharmony_ci      prim.isnull = ctx->ac.i1false;
1384bf215546Sopenharmony_ci
1385bf215546Sopenharmony_ci      if (gfx10_edgeflags_have_effect(shader))
1386bf215546Sopenharmony_ci         prim.edgeflags = ac_pack_edgeflags_for_export(&ctx->ac, &ctx->args);
1387bf215546Sopenharmony_ci      else
1388bf215546Sopenharmony_ci         prim.edgeflags = ctx->ac.i32_0;
1389bf215546Sopenharmony_ci
1390bf215546Sopenharmony_ci      for (unsigned vtx = 0; vtx < num_vertices; vtx++) {
1391bf215546Sopenharmony_ci         prim.index[vtx] = LLVMBuildLoad2(
1392bf215546Sopenharmony_ci            builder, ctx->ac.i8, si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte1_new_thread_id), "");
1393bf215546Sopenharmony_ci         prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->ac.i32, "");
1394bf215546Sopenharmony_ci      }
1395bf215546Sopenharmony_ci
1396bf215546Sopenharmony_ci      /* Set the new GS input VGPR. */
1397bf215546Sopenharmony_ci      LLVMBuildStore(builder, ac_pack_prim_export(&ctx->ac, &prim), new_vgpr0);
1398bf215546Sopenharmony_ci   }
1399bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, 16011);
1400bf215546Sopenharmony_ci
1401bf215546Sopenharmony_ci   if (gfx10_ngg_export_prim_early(shader))
1402bf215546Sopenharmony_ci      gfx10_ngg_build_export_prim(ctx, NULL, LLVMBuildLoad2(builder, ctx->ac.i32, new_vgpr0, ""));
1403bf215546Sopenharmony_ci
1404bf215546Sopenharmony_ci   /* Prepare LDS addresses of the new ES input VGPRs. */
1405bf215546Sopenharmony_ci   LLVMValueRef input_vgpr_addresses[4] = {
1406bf215546Sopenharmony_ci      ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0)),
1407bf215546Sopenharmony_ci      ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_instance_id, 0)),
1408bf215546Sopenharmony_ci   };
1409bf215546Sopenharmony_ci   if (ctx->stage == MESA_SHADER_TESS_EVAL) {
1410bf215546Sopenharmony_ci      input_vgpr_addresses[2] = si_build_gep_i8(ctx, es_vtxptr, lds_byte2_tes_rel_patch_id);
1411bf215546Sopenharmony_ci      if (uses_tes_prim_id) {
1412bf215546Sopenharmony_ci         input_vgpr_addresses[3] = ac_build_gep0(&ctx->ac, es_vtxptr,
1413bf215546Sopenharmony_ci                                                 LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0));
1414bf215546Sopenharmony_ci      }
1415bf215546Sopenharmony_ci   }
1416bf215546Sopenharmony_ci
1417bf215546Sopenharmony_ci   /* Return values for the main function. */
1418bf215546Sopenharmony_ci   LLVMValueRef ret = ctx->return_value;
1419bf215546Sopenharmony_ci   LLVMValueRef val;
1420bf215546Sopenharmony_ci
1421bf215546Sopenharmony_ci   ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_gs_tg_info, 2, "");
1422bf215546Sopenharmony_ci   ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_merged_wave_info, 3, "");
1423bf215546Sopenharmony_ci   if (ctx->stage == MESA_SHADER_TESS_EVAL)
1424bf215546Sopenharmony_ci      ret = si_insert_input_ret(ctx, ret, ctx->args.tess_offchip_offset, 4);
1425bf215546Sopenharmony_ci   if (ctx->ac.gfx_level >= GFX11)
1426bf215546Sopenharmony_ci      ret = si_insert_input_ret(ctx, ret, ctx->args.gs_attr_offset, 5);
1427bf215546Sopenharmony_ci
1428bf215546Sopenharmony_ci   ret = si_insert_input_ptr(ctx, ret, ctx->internal_bindings, 8 + SI_SGPR_INTERNAL_BINDINGS);
1429bf215546Sopenharmony_ci   ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
1430bf215546Sopenharmony_ci                             8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
1431bf215546Sopenharmony_ci   ret = si_insert_input_ptr(ctx, ret, ctx->const_and_shader_buffers,
1432bf215546Sopenharmony_ci                             8 + SI_SGPR_CONST_AND_SHADER_BUFFERS);
1433bf215546Sopenharmony_ci   ret = si_insert_input_ptr(ctx, ret, ctx->samplers_and_images, 8 + SI_SGPR_SAMPLERS_AND_IMAGES);
1434bf215546Sopenharmony_ci   ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
1435bf215546Sopenharmony_ci   if (ctx->ac.gfx_level >= GFX11)
1436bf215546Sopenharmony_ci      ret = si_insert_input_ptr(ctx, ret, ctx->gs_attr_address, 8 + GFX9_SGPR_ATTRIBUTE_RING_ADDR);
1437bf215546Sopenharmony_ci
1438bf215546Sopenharmony_ci   if (ctx->stage == MESA_SHADER_VERTEX) {
1439bf215546Sopenharmony_ci      ret = si_insert_input_ptr(ctx, ret, ctx->args.base_vertex, 8 + SI_SGPR_BASE_VERTEX);
1440bf215546Sopenharmony_ci      ret = si_insert_input_ptr(ctx, ret, ctx->args.draw_id, 8 + SI_SGPR_DRAWID);
1441bf215546Sopenharmony_ci      ret = si_insert_input_ptr(ctx, ret, ctx->args.start_instance, 8 + SI_SGPR_START_INSTANCE);
1442bf215546Sopenharmony_ci      ret = si_insert_input_ptr(ctx, ret, ctx->args.vertex_buffers, 8 + GFX9_GS_NUM_USER_SGPR);
1443bf215546Sopenharmony_ci
1444bf215546Sopenharmony_ci      for (unsigned i = 0; i < shader->selector->info.num_vbos_in_user_sgprs; i++) {
1445bf215546Sopenharmony_ci         ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i],
1446bf215546Sopenharmony_ci                                     8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4);
1447bf215546Sopenharmony_ci      }
1448bf215546Sopenharmony_ci   } else {
1449bf215546Sopenharmony_ci      assert(ctx->stage == MESA_SHADER_TESS_EVAL);
1450bf215546Sopenharmony_ci      ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout, 8 + SI_SGPR_TES_OFFCHIP_LAYOUT);
1451bf215546Sopenharmony_ci      ret = si_insert_input_ptr(ctx, ret, ctx->tes_offchip_addr, 8 + SI_SGPR_TES_OFFCHIP_ADDR);
1452bf215546Sopenharmony_ci   }
1453bf215546Sopenharmony_ci
1454bf215546Sopenharmony_ci   unsigned vgpr;
1455bf215546Sopenharmony_ci   if (ctx->stage == MESA_SHADER_VERTEX) {
1456bf215546Sopenharmony_ci      if (shader->selector->info.num_vbos_in_user_sgprs) {
1457bf215546Sopenharmony_ci         vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->info.num_vbos_in_user_sgprs * 4;
1458bf215546Sopenharmony_ci      } else {
1459bf215546Sopenharmony_ci         vgpr = 8 + GFX9_GS_NUM_USER_SGPR + 1;
1460bf215546Sopenharmony_ci      }
1461bf215546Sopenharmony_ci   } else {
1462bf215546Sopenharmony_ci      vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
1463bf215546Sopenharmony_ci   }
1464bf215546Sopenharmony_ci
1465bf215546Sopenharmony_ci   val = LLVMBuildLoad2(builder, ctx->ac.i32, new_vgpr0, "");
1466bf215546Sopenharmony_ci   ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
1467bf215546Sopenharmony_ci   vgpr++; /* gs_vtx_offset[1] = offsets of vertices 2-3  */
1468bf215546Sopenharmony_ci
1469bf215546Sopenharmony_ci   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
1470bf215546Sopenharmony_ci   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
1471bf215546Sopenharmony_ci   vgpr++; /* gs_vtx_offset[2] = offsets of vertices 4-5 */
1472bf215546Sopenharmony_ci
1473bf215546Sopenharmony_ci   /* Set the input VGPRs to the corresponding LDS addresses where the VGPR values are
1474bf215546Sopenharmony_ci    * stored. The VS prolog will load them.
1475bf215546Sopenharmony_ci    */
1476bf215546Sopenharmony_ci   if (ctx->stage == MESA_SHADER_VERTEX) {
1477bf215546Sopenharmony_ci      val = LLVMBuildPtrToInt(builder, input_vgpr_addresses[0], ctx->ac.i32, "");
1478bf215546Sopenharmony_ci      ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++,
1479bf215546Sopenharmony_ci                                 ""); /* VGPR5 - VertexID */
1480bf215546Sopenharmony_ci      vgpr += 2;
1481bf215546Sopenharmony_ci      if (uses_instance_id) {
1482bf215546Sopenharmony_ci         val = LLVMBuildPtrToInt(builder, input_vgpr_addresses[1], ctx->ac.i32, "");
1483bf215546Sopenharmony_ci         ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++,
1484bf215546Sopenharmony_ci                                    ""); /* VGPR8 - InstanceID */
1485bf215546Sopenharmony_ci      } else {
1486bf215546Sopenharmony_ci         vgpr++;
1487bf215546Sopenharmony_ci      }
1488bf215546Sopenharmony_ci   } else {
1489bf215546Sopenharmony_ci      assert(ctx->stage == MESA_SHADER_TESS_EVAL);
1490bf215546Sopenharmony_ci      unsigned num_vgprs = uses_tes_prim_id ? 4 : 3;
1491bf215546Sopenharmony_ci      for (unsigned i = 0; i < num_vgprs; i++) {
1492bf215546Sopenharmony_ci         val = LLVMBuildPtrToInt(builder, input_vgpr_addresses[i], ctx->ac.i32, "");
1493bf215546Sopenharmony_ci         ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
1494bf215546Sopenharmony_ci      }
1495bf215546Sopenharmony_ci      if (num_vgprs == 3)
1496bf215546Sopenharmony_ci         vgpr++;
1497bf215546Sopenharmony_ci   }
1498bf215546Sopenharmony_ci
1499bf215546Sopenharmony_ci   /* These two also use LDS. */
1500bf215546Sopenharmony_ci   if (gfx10_ngg_writes_user_edgeflags(shader) ||
1501bf215546Sopenharmony_ci       (ctx->stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id)) {
1502bf215546Sopenharmony_ci      ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
1503bf215546Sopenharmony_ci      ac_build_s_barrier(&ctx->ac, ctx->stage);
1504bf215546Sopenharmony_ci   }
1505bf215546Sopenharmony_ci
1506bf215546Sopenharmony_ci   ctx->return_value = ret;
1507bf215546Sopenharmony_ci}
1508bf215546Sopenharmony_ci
1509bf215546Sopenharmony_ci/**
1510bf215546Sopenharmony_ci * Emit the end of an API VS or TES shader compiled as ESGS shader.
1511bf215546Sopenharmony_ci */
1512bf215546Sopenharmony_civoid gfx10_ngg_build_end(struct si_shader_context *ctx)
1513bf215546Sopenharmony_ci{
1514bf215546Sopenharmony_ci   struct si_shader_selector *sel = ctx->shader->selector;
1515bf215546Sopenharmony_ci   struct si_shader_info *info = &sel->info;
1516bf215546Sopenharmony_ci   struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
1517bf215546Sopenharmony_ci   LLVMBuilderRef builder = ctx->ac.builder;
1518bf215546Sopenharmony_ci   LLVMValueRef *addrs = ctx->abi.outputs;
1519bf215546Sopenharmony_ci   LLVMValueRef tmp, tmp2;
1520bf215546Sopenharmony_ci
1521bf215546Sopenharmony_ci   assert(!ctx->shader->is_gs_copy_shader);
1522bf215546Sopenharmony_ci   assert(info->num_outputs <= AC_LLVM_MAX_OUTPUTS);
1523bf215546Sopenharmony_ci
1524bf215546Sopenharmony_ci   LLVMValueRef vertex_ptr = NULL;
1525bf215546Sopenharmony_ci
1526bf215546Sopenharmony_ci   if (ctx->so.num_outputs || gfx10_ngg_writes_user_edgeflags(ctx->shader))
1527bf215546Sopenharmony_ci      vertex_ptr = ngg_nogs_vertex_ptr(ctx, gfx10_get_thread_id_in_tg(ctx));
1528bf215546Sopenharmony_ci
1529bf215546Sopenharmony_ci   for (unsigned i = 0; i < info->num_outputs; i++) {
1530bf215546Sopenharmony_ci      outputs[i].semantic = info->output_semantic[i];
1531bf215546Sopenharmony_ci
1532bf215546Sopenharmony_ci      for (unsigned j = 0; j < 4; j++) {
1533bf215546Sopenharmony_ci         outputs[i].vertex_streams = info->output_streams[i];
1534bf215546Sopenharmony_ci
1535bf215546Sopenharmony_ci         /* TODO: we may store more outputs than streamout needs,
1536bf215546Sopenharmony_ci          * but streamout performance isn't that important.
1537bf215546Sopenharmony_ci          */
1538bf215546Sopenharmony_ci         if (ctx->so.num_outputs) {
1539bf215546Sopenharmony_ci            tmp = ac_build_gep0(&ctx->ac, vertex_ptr, LLVMConstInt(ctx->ac.i32, 4 * i + j, false));
1540bf215546Sopenharmony_ci            tmp2 = LLVMBuildLoad2(builder, ctx->ac.f32, addrs[4 * i + j], "");
1541bf215546Sopenharmony_ci            LLVMTypeRef type = ac_to_integer_type(&ctx->ac, ctx->ac.f32);
1542bf215546Sopenharmony_ci            tmp2 = LLVMBuildBitCast(ctx->ac.builder, tmp2, type, "");
1543bf215546Sopenharmony_ci            LLVMBuildStore(builder, tmp2, tmp);
1544bf215546Sopenharmony_ci         }
1545bf215546Sopenharmony_ci      }
1546bf215546Sopenharmony_ci
1547bf215546Sopenharmony_ci      /* Store the edgeflag at the end (if streamout is enabled) */
1548bf215546Sopenharmony_ci      if (info->output_semantic[i] == VARYING_SLOT_EDGE && gfx10_ngg_writes_user_edgeflags(ctx->shader)) {
1549bf215546Sopenharmony_ci         LLVMValueRef edgeflag = LLVMBuildLoad2(builder, ctx->ac.f32, addrs[4 * i], "");
1550bf215546Sopenharmony_ci         /* The output is a float, but the hw expects a 1-bit integer. */
1551bf215546Sopenharmony_ci         edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->ac.i32, "");
1552bf215546Sopenharmony_ci         edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->ac.i32_1);
1553bf215546Sopenharmony_ci
1554bf215546Sopenharmony_ci         tmp = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
1555bf215546Sopenharmony_ci         tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
1556bf215546Sopenharmony_ci         LLVMBuildStore(builder, edgeflag, tmp);
1557bf215546Sopenharmony_ci      }
1558bf215546Sopenharmony_ci   }
1559bf215546Sopenharmony_ci
1560bf215546Sopenharmony_ci   bool unterminated_es_if_block =
1561bf215546Sopenharmony_ci      !ctx->so.num_outputs && !gfx10_ngg_writes_user_edgeflags(ctx->shader) &&
1562bf215546Sopenharmony_ci      !ctx->screen->use_ngg_streamout && /* no query buffer */
1563bf215546Sopenharmony_ci      (ctx->stage != MESA_SHADER_VERTEX || !ctx->shader->key.ge.mono.u.vs_export_prim_id);
1564bf215546Sopenharmony_ci
1565bf215546Sopenharmony_ci   if (!unterminated_es_if_block)
1566bf215546Sopenharmony_ci      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
1567bf215546Sopenharmony_ci
1568bf215546Sopenharmony_ci   LLVMValueRef is_gs_thread = si_is_gs_thread(ctx);
1569bf215546Sopenharmony_ci   LLVMValueRef is_es_thread = si_is_es_thread(ctx);
1570bf215546Sopenharmony_ci   LLVMValueRef vtxindex[3];
1571bf215546Sopenharmony_ci
1572bf215546Sopenharmony_ci   if (ctx->shader->key.ge.opt.ngg_culling || gfx10_is_ngg_passthrough(ctx->shader)) {
1573bf215546Sopenharmony_ci      for (unsigned i = 0; i < 3; ++i)
1574bf215546Sopenharmony_ci         vtxindex[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[0], 10 * i, 9);
1575bf215546Sopenharmony_ci   } else {
1576bf215546Sopenharmony_ci      for (unsigned i = 0; i < 3; ++i)
1577bf215546Sopenharmony_ci         vtxindex[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[i / 2], (i & 1) * 16, 16);
1578bf215546Sopenharmony_ci   }
1579bf215546Sopenharmony_ci
1580bf215546Sopenharmony_ci   /* Determine the number of vertices per primitive. */
1581bf215546Sopenharmony_ci   unsigned num_vertices;
1582bf215546Sopenharmony_ci   LLVMValueRef num_vertices_val = ngg_get_vertices_per_prim(ctx, &num_vertices);
1583bf215546Sopenharmony_ci
1584bf215546Sopenharmony_ci   /* Streamout */
1585bf215546Sopenharmony_ci   LLVMValueRef emitted_prims = NULL;
1586bf215546Sopenharmony_ci
1587bf215546Sopenharmony_ci   if (ctx->so.num_outputs) {
1588bf215546Sopenharmony_ci      assert(!unterminated_es_if_block);
1589bf215546Sopenharmony_ci
1590bf215546Sopenharmony_ci      struct ngg_streamout nggso = {};
1591bf215546Sopenharmony_ci      nggso.num_vertices = num_vertices_val;
1592bf215546Sopenharmony_ci      nggso.prim_enable[0] = is_gs_thread;
1593bf215546Sopenharmony_ci
1594bf215546Sopenharmony_ci      for (unsigned i = 0; i < num_vertices; ++i)
1595bf215546Sopenharmony_ci         nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
1596bf215546Sopenharmony_ci
1597bf215546Sopenharmony_ci      build_streamout(ctx, &nggso);
1598bf215546Sopenharmony_ci      emitted_prims = nggso.emit[0];
1599bf215546Sopenharmony_ci   }
1600bf215546Sopenharmony_ci
1601bf215546Sopenharmony_ci   LLVMValueRef user_edgeflags[3] = {};
1602bf215546Sopenharmony_ci
1603bf215546Sopenharmony_ci   if (gfx10_ngg_writes_user_edgeflags(ctx->shader)) {
1604bf215546Sopenharmony_ci      assert(!unterminated_es_if_block);
1605bf215546Sopenharmony_ci
1606bf215546Sopenharmony_ci      /* Streamout already inserted the barrier, so don't insert it again. */
1607bf215546Sopenharmony_ci      if (!ctx->so.num_outputs) {
1608bf215546Sopenharmony_ci         ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
1609bf215546Sopenharmony_ci         ac_build_s_barrier(&ctx->ac, ctx->stage);
1610bf215546Sopenharmony_ci      }
1611bf215546Sopenharmony_ci
1612bf215546Sopenharmony_ci      ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
1613bf215546Sopenharmony_ci      /* Load edge flags from ES threads and store them into VGPRs in GS threads. */
1614bf215546Sopenharmony_ci      for (unsigned i = 0; i < num_vertices; i++) {
1615bf215546Sopenharmony_ci         tmp = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
1616bf215546Sopenharmony_ci         tmp2 = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
1617bf215546Sopenharmony_ci         tmp = ac_build_gep0(&ctx->ac, tmp, tmp2);
1618bf215546Sopenharmony_ci         tmp = LLVMBuildLoad2(builder, ctx->ac.i32, tmp, "");
1619bf215546Sopenharmony_ci         tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1620bf215546Sopenharmony_ci
1621bf215546Sopenharmony_ci         user_edgeflags[i] = ac_build_alloca_init(&ctx->ac, tmp, "");
1622bf215546Sopenharmony_ci      }
1623bf215546Sopenharmony_ci      ac_build_endif(&ctx->ac, 5400);
1624bf215546Sopenharmony_ci   }
1625bf215546Sopenharmony_ci
1626bf215546Sopenharmony_ci   /* Copy Primitive IDs from GS threads to the LDS address corresponding
1627bf215546Sopenharmony_ci    * to the ES thread of the provoking vertex.
1628bf215546Sopenharmony_ci    */
1629bf215546Sopenharmony_ci   if (ctx->stage == MESA_SHADER_VERTEX && ctx->shader->key.ge.mono.u.vs_export_prim_id) {
1630bf215546Sopenharmony_ci      assert(!unterminated_es_if_block);
1631bf215546Sopenharmony_ci
1632bf215546Sopenharmony_ci      /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */
1633bf215546Sopenharmony_ci      if (ctx->so.num_outputs || gfx10_ngg_writes_user_edgeflags(ctx->shader)) {
1634bf215546Sopenharmony_ci         ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
1635bf215546Sopenharmony_ci         ac_build_s_barrier(&ctx->ac, ctx->stage);
1636bf215546Sopenharmony_ci      }
1637bf215546Sopenharmony_ci
1638bf215546Sopenharmony_ci      ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
1639bf215546Sopenharmony_ci      /* Extract the PROVOKING_VTX_INDEX field. */
1640bf215546Sopenharmony_ci      LLVMValueRef provoking_vtx_in_prim = GET_FIELD(ctx, GS_STATE_PROVOKING_VTX_INDEX);
1641bf215546Sopenharmony_ci
1642bf215546Sopenharmony_ci      /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
1643bf215546Sopenharmony_ci      LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
1644bf215546Sopenharmony_ci      LLVMValueRef provoking_vtx_index =
1645bf215546Sopenharmony_ci         LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");
1646bf215546Sopenharmony_ci      LLVMValueRef vertex_ptr = ngg_nogs_vertex_ptr(ctx, provoking_vtx_index);
1647bf215546Sopenharmony_ci
1648bf215546Sopenharmony_ci      LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.gs_prim_id),
1649bf215546Sopenharmony_ci                     ac_build_gep0(&ctx->ac, vertex_ptr, ctx->ac.i32_0));
1650bf215546Sopenharmony_ci      ac_build_endif(&ctx->ac, 5400);
1651bf215546Sopenharmony_ci   }
1652bf215546Sopenharmony_ci
1653bf215546Sopenharmony_ci   /* Update query buffer */
1654bf215546Sopenharmony_ci   if (ctx->screen->use_ngg_streamout && !info->base.vs.blit_sgprs_amd) {
1655bf215546Sopenharmony_ci      assert(!unterminated_es_if_block);
1656bf215546Sopenharmony_ci
1657bf215546Sopenharmony_ci      tmp = GET_FIELD(ctx, GS_STATE_STREAMOUT_QUERY_ENABLED);
1658bf215546Sopenharmony_ci      tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1659bf215546Sopenharmony_ci      ac_build_ifcc(&ctx->ac, tmp, 5029); /* if (STREAMOUT_QUERY_ENABLED) */
1660bf215546Sopenharmony_ci      tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
1661bf215546Sopenharmony_ci      ac_build_ifcc(&ctx->ac, tmp, 5030);
1662bf215546Sopenharmony_ci      tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac),
1663bf215546Sopenharmony_ci                          ctx->so.num_outputs ? ctx->ac.i32_1 : ctx->ac.i32_0, "");
1664bf215546Sopenharmony_ci      ac_build_ifcc(&ctx->ac, tmp, 5031);
1665bf215546Sopenharmony_ci      {
1666bf215546Sopenharmony_ci         LLVMValueRef args[] = {
1667bf215546Sopenharmony_ci            ngg_get_prim_cnt(ctx),
1668bf215546Sopenharmony_ci            ngg_get_query_buf(ctx),
1669bf215546Sopenharmony_ci            LLVMConstInt(ctx->ac.i32, 16, false), /* offset of stream[0].generated_primitives */
1670bf215546Sopenharmony_ci            ctx->ac.i32_0,                        /* soffset */
1671bf215546Sopenharmony_ci            ctx->ac.i32_0,                        /* cachepolicy */
1672bf215546Sopenharmony_ci         };
1673bf215546Sopenharmony_ci
1674bf215546Sopenharmony_ci         if (ctx->so.num_outputs) {
1675bf215546Sopenharmony_ci            args[0] = ac_build_writelane(&ctx->ac, args[0], emitted_prims, ctx->ac.i32_1);
1676bf215546Sopenharmony_ci            args[2] = ac_build_writelane(&ctx->ac, args[2], LLVMConstInt(ctx->ac.i32, 24, false),
1677bf215546Sopenharmony_ci                                         ctx->ac.i32_1);
1678bf215546Sopenharmony_ci         }
1679bf215546Sopenharmony_ci
1680bf215546Sopenharmony_ci         /* TODO: should this be 64-bit atomics? */
1681bf215546Sopenharmony_ci         ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5,
1682bf215546Sopenharmony_ci                            0);
1683bf215546Sopenharmony_ci      }
1684bf215546Sopenharmony_ci      ac_build_endif(&ctx->ac, 5031);
1685bf215546Sopenharmony_ci      ac_build_endif(&ctx->ac, 5030);
1686bf215546Sopenharmony_ci      ac_build_endif(&ctx->ac, 5029);
1687bf215546Sopenharmony_ci   }
1688bf215546Sopenharmony_ci
1689bf215546Sopenharmony_ci   /* Build the primitive export. */
1690bf215546Sopenharmony_ci   if (!gfx10_ngg_export_prim_early(ctx->shader)) {
1691bf215546Sopenharmony_ci      assert(!unterminated_es_if_block);
1692bf215546Sopenharmony_ci      gfx10_ngg_build_export_prim(ctx, user_edgeflags, NULL);
1693bf215546Sopenharmony_ci   }
1694bf215546Sopenharmony_ci
1695bf215546Sopenharmony_ci   /* Export per-vertex data (positions and parameters). */
1696bf215546Sopenharmony_ci   if (!unterminated_es_if_block)
1697bf215546Sopenharmony_ci      ac_build_ifcc(&ctx->ac, is_es_thread, 6002);
1698bf215546Sopenharmony_ci   {
1699bf215546Sopenharmony_ci      unsigned i;
1700bf215546Sopenharmony_ci
1701bf215546Sopenharmony_ci      /* Unconditionally (re-)load the values for proper SSA form. */
1702bf215546Sopenharmony_ci      for (i = 0; i < info->num_outputs; i++) {
1703bf215546Sopenharmony_ci         /* If the NGG cull shader part computed the position, don't
1704bf215546Sopenharmony_ci          * use the position from the current shader part. Instead,
1705bf215546Sopenharmony_ci          * load it from LDS.
1706bf215546Sopenharmony_ci          */
1707bf215546Sopenharmony_ci         if (info->output_semantic[i] == VARYING_SLOT_POS &&
1708bf215546Sopenharmony_ci             ctx->shader->key.ge.opt.ngg_culling) {
1709bf215546Sopenharmony_ci            vertex_ptr = ngg_nogs_vertex_ptr(ctx, gfx10_get_thread_id_in_tg(ctx));
1710bf215546Sopenharmony_ci
1711bf215546Sopenharmony_ci            for (unsigned j = 0; j < 4; j++) {
1712bf215546Sopenharmony_ci               tmp = LLVMConstInt(ctx->ac.i32, lds_pos_x + j, 0);
1713bf215546Sopenharmony_ci               tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
1714bf215546Sopenharmony_ci               tmp = LLVMBuildLoad2(builder, ctx->ac.i32, tmp, "");
1715bf215546Sopenharmony_ci               outputs[i].values[j] = LLVMBuildBitCast(ctx->ac.builder, tmp,
1716bf215546Sopenharmony_ci                                                       ac_to_float_type(&ctx->ac, ctx->ac.i32), "");
1717bf215546Sopenharmony_ci            }
1718bf215546Sopenharmony_ci         } else {
1719bf215546Sopenharmony_ci            for (unsigned j = 0; j < 4; j++) {
1720bf215546Sopenharmony_ci               outputs[i].values[j] = LLVMBuildLoad2(builder, ctx->ac.f32, addrs[4 * i + j], "");
1721bf215546Sopenharmony_ci            }
1722bf215546Sopenharmony_ci         }
1723bf215546Sopenharmony_ci      }
1724bf215546Sopenharmony_ci
1725bf215546Sopenharmony_ci      if (ctx->shader->key.ge.mono.u.vs_export_prim_id) {
1726bf215546Sopenharmony_ci         outputs[i].semantic = VARYING_SLOT_PRIMITIVE_ID;
1727bf215546Sopenharmony_ci         outputs[i].vertex_streams = 0;
1728bf215546Sopenharmony_ci
1729bf215546Sopenharmony_ci         if (ctx->stage == MESA_SHADER_VERTEX) {
1730bf215546Sopenharmony_ci            /* Wait for LDS stores to finish. */
1731bf215546Sopenharmony_ci            ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
1732bf215546Sopenharmony_ci            ac_build_s_barrier(&ctx->ac, ctx->stage);
1733bf215546Sopenharmony_ci
1734bf215546Sopenharmony_ci            tmp = ngg_nogs_vertex_ptr(ctx, gfx10_get_thread_id_in_tg(ctx));
1735bf215546Sopenharmony_ci            tmp = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
1736bf215546Sopenharmony_ci            outputs[i].values[0] = LLVMBuildLoad2(builder, ctx->ac.i32, tmp, "");
1737bf215546Sopenharmony_ci         } else {
1738bf215546Sopenharmony_ci            assert(ctx->stage == MESA_SHADER_TESS_EVAL);
1739bf215546Sopenharmony_ci            outputs[i].values[0] = si_get_primitive_id(ctx, 0);
1740bf215546Sopenharmony_ci         }
1741bf215546Sopenharmony_ci
1742bf215546Sopenharmony_ci         outputs[i].values[0] = LLVMBuildBitCast(ctx->ac.builder, outputs[i].values[0], ctx->ac.f32, "");
1743bf215546Sopenharmony_ci         for (unsigned j = 1; j < 4; j++)
1744bf215546Sopenharmony_ci            outputs[i].values[j] = LLVMGetUndef(ctx->ac.f32);
1745bf215546Sopenharmony_ci         i++;
1746bf215546Sopenharmony_ci      }
1747bf215546Sopenharmony_ci
1748bf215546Sopenharmony_ci      si_llvm_build_vs_exports(ctx, NULL, outputs, i);
1749bf215546Sopenharmony_ci   }
1750bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, 6002);
1751bf215546Sopenharmony_ci}
1752bf215546Sopenharmony_ci
1753bf215546Sopenharmony_cistatic LLVMValueRef ngg_gs_get_vertex_storage(struct si_shader_context *ctx)
1754bf215546Sopenharmony_ci{
1755bf215546Sopenharmony_ci   const struct si_shader_selector *sel = ctx->shader->selector;
1756bf215546Sopenharmony_ci   const struct si_shader_info *info = &sel->info;
1757bf215546Sopenharmony_ci
1758bf215546Sopenharmony_ci   LLVMTypeRef elements[2] = {
1759bf215546Sopenharmony_ci      LLVMArrayType(ctx->ac.i32, 4 * info->num_outputs),
1760bf215546Sopenharmony_ci      LLVMArrayType(ctx->ac.i8, 4),
1761bf215546Sopenharmony_ci   };
1762bf215546Sopenharmony_ci   LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
1763bf215546Sopenharmony_ci   type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
1764bf215546Sopenharmony_ci   return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
1765bf215546Sopenharmony_ci}
1766bf215546Sopenharmony_ci
1767bf215546Sopenharmony_ci/**
1768bf215546Sopenharmony_ci * Return a pointer to the LDS storage reserved for the N'th vertex, where N
1769bf215546Sopenharmony_ci * is in emit order; that is:
1770bf215546Sopenharmony_ci * - at the shader end, N is the threadidx (relative to the entire threadgroup)
1771bf215546Sopenharmony_ci * - during vertex emit, i.e. while the API GS shader invocation is running,
1772bf215546Sopenharmony_ci *   N = threadidx * gs.vertices_out + emitidx
1773bf215546Sopenharmony_ci *
1774bf215546Sopenharmony_ci * Goals of the LDS memory layout:
1775bf215546Sopenharmony_ci * 1. Eliminate bank conflicts on write for geometry shaders that have all emits
1776bf215546Sopenharmony_ci *    in uniform control flow
1777bf215546Sopenharmony_ci * 2. Eliminate bank conflicts on read for export if, additionally, there is no
1778bf215546Sopenharmony_ci *    culling
1779bf215546Sopenharmony_ci * 3. Agnostic to the number of waves (since we don't know it before compiling)
1780bf215546Sopenharmony_ci * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)
1781bf215546Sopenharmony_ci * 5. Avoid wasting memory.
1782bf215546Sopenharmony_ci *
1783bf215546Sopenharmony_ci * We use an AoS layout due to point 4 (this also helps point 3). In an AoS
1784bf215546Sopenharmony_ci * layout, elimination of bank conflicts requires that each vertex occupy an
1785bf215546Sopenharmony_ci * odd number of dwords. We use the additional dword to store the output stream
1786bf215546Sopenharmony_ci * index as well as a flag to indicate whether this vertex ends a primitive
1787bf215546Sopenharmony_ci * for rasterization.
1788bf215546Sopenharmony_ci *
1789bf215546Sopenharmony_ci * Swizzling is required to satisfy points 1 and 2 simultaneously.
1790bf215546Sopenharmony_ci *
1791bf215546Sopenharmony_ci * Vertices are stored in export order (gsthread * gs.vertices_out + emitidx).
1792bf215546Sopenharmony_ci * Indices are swizzled in groups of 32, which ensures point 1 without
1793bf215546Sopenharmony_ci * disturbing point 2.
1794bf215546Sopenharmony_ci *
1795bf215546Sopenharmony_ci * \return an LDS pointer to type {[N x i32], [4 x i8]}
1796bf215546Sopenharmony_ci */
1797bf215546Sopenharmony_cistatic LLVMValueRef ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vertexidx)
1798bf215546Sopenharmony_ci{
1799bf215546Sopenharmony_ci   struct si_shader_selector *sel = ctx->shader->selector;
1800bf215546Sopenharmony_ci   LLVMBuilderRef builder = ctx->ac.builder;
1801bf215546Sopenharmony_ci   LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);
1802bf215546Sopenharmony_ci
1803bf215546Sopenharmony_ci   /* gs.vertices_out = 2^(write_stride_2exp) * some odd number */
1804bf215546Sopenharmony_ci   unsigned write_stride_2exp = ffs(sel->info.base.gs.vertices_out) - 1;
1805bf215546Sopenharmony_ci   if (write_stride_2exp) {
1806bf215546Sopenharmony_ci      LLVMValueRef row = LLVMBuildLShr(builder, vertexidx, LLVMConstInt(ctx->ac.i32, 5, false), "");
1807bf215546Sopenharmony_ci      LLVMValueRef swizzle = LLVMBuildAnd(
1808bf215546Sopenharmony_ci         builder, row, LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, false), "");
1809bf215546Sopenharmony_ci      vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
1810bf215546Sopenharmony_ci   }
1811bf215546Sopenharmony_ci
1812bf215546Sopenharmony_ci   return ac_build_gep0(&ctx->ac, storage, vertexidx);
1813bf215546Sopenharmony_ci}
1814bf215546Sopenharmony_ci
1815bf215546Sopenharmony_cistatic LLVMValueRef ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef gsthread,
1816bf215546Sopenharmony_ci                                           LLVMValueRef emitidx)
1817bf215546Sopenharmony_ci{
1818bf215546Sopenharmony_ci   struct si_shader_selector *sel = ctx->shader->selector;
1819bf215546Sopenharmony_ci   LLVMBuilderRef builder = ctx->ac.builder;
1820bf215546Sopenharmony_ci   LLVMValueRef tmp;
1821bf215546Sopenharmony_ci
1822bf215546Sopenharmony_ci   tmp = LLVMConstInt(ctx->ac.i32, sel->info.base.gs.vertices_out, false);
1823bf215546Sopenharmony_ci   tmp = LLVMBuildMul(builder, tmp, gsthread, "");
1824bf215546Sopenharmony_ci   const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
1825bf215546Sopenharmony_ci   return ngg_gs_vertex_ptr(ctx, vertexidx);
1826bf215546Sopenharmony_ci}
1827bf215546Sopenharmony_ci
1828bf215546Sopenharmony_cistatic LLVMValueRef ngg_gs_get_emit_output_ptr(struct si_shader_context *ctx,
1829bf215546Sopenharmony_ci                                               LLVMValueRef vertexptr, unsigned out_idx)
1830bf215546Sopenharmony_ci{
1831bf215546Sopenharmony_ci   LLVMValueRef gep_idx[3] = {
1832bf215546Sopenharmony_ci      ctx->ac.i32_0, /* implied C-style array */
1833bf215546Sopenharmony_ci      ctx->ac.i32_0, /* first struct entry */
1834bf215546Sopenharmony_ci      LLVMConstInt(ctx->ac.i32, out_idx, false),
1835bf215546Sopenharmony_ci   };
1836bf215546Sopenharmony_ci   return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
1837bf215546Sopenharmony_ci}
1838bf215546Sopenharmony_ci
1839bf215546Sopenharmony_cistatic LLVMValueRef ngg_gs_get_emit_primflag_ptr(struct si_shader_context *ctx,
1840bf215546Sopenharmony_ci                                                 LLVMValueRef vertexptr, unsigned stream)
1841bf215546Sopenharmony_ci{
1842bf215546Sopenharmony_ci   LLVMValueRef gep_idx[3] = {
1843bf215546Sopenharmony_ci      ctx->ac.i32_0, /* implied C-style array */
1844bf215546Sopenharmony_ci      ctx->ac.i32_1, /* second struct entry */
1845bf215546Sopenharmony_ci      LLVMConstInt(ctx->ac.i32, stream, false),
1846bf215546Sopenharmony_ci   };
1847bf215546Sopenharmony_ci   return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
1848bf215546Sopenharmony_ci}
1849bf215546Sopenharmony_ci
1850bf215546Sopenharmony_civoid gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs)
1851bf215546Sopenharmony_ci{
1852bf215546Sopenharmony_ci   const struct si_shader_selector *sel = ctx->shader->selector;
1853bf215546Sopenharmony_ci   const struct si_shader_info *info = &sel->info;
1854bf215546Sopenharmony_ci   LLVMBuilderRef builder = ctx->ac.builder;
1855bf215546Sopenharmony_ci   LLVMValueRef tmp;
1856bf215546Sopenharmony_ci   const LLVMValueRef vertexidx = LLVMBuildLoad2(builder, ctx->ac.i32, ctx->gs_next_vertex[stream], "");
1857bf215546Sopenharmony_ci
1858bf215546Sopenharmony_ci   /* If this thread has already emitted the declared maximum number of
1859bf215546Sopenharmony_ci    * vertices, skip the write: excessive vertex emissions are not
1860bf215546Sopenharmony_ci    * supposed to have any effect.
1861bf215546Sopenharmony_ci    */
1862bf215546Sopenharmony_ci   const LLVMValueRef can_emit =
1863bf215546Sopenharmony_ci      LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
1864bf215546Sopenharmony_ci                    LLVMConstInt(ctx->ac.i32, sel->info.base.gs.vertices_out, false), "");
1865bf215546Sopenharmony_ci
1866bf215546Sopenharmony_ci   tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
1867bf215546Sopenharmony_ci   tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
1868bf215546Sopenharmony_ci   LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
1869bf215546Sopenharmony_ci
1870bf215546Sopenharmony_ci   ac_build_ifcc(&ctx->ac, can_emit, 9001);
1871bf215546Sopenharmony_ci
1872bf215546Sopenharmony_ci   const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, gfx10_get_thread_id_in_tg(ctx), vertexidx);
1873bf215546Sopenharmony_ci   unsigned out_idx = 0;
1874bf215546Sopenharmony_ci   for (unsigned i = 0; i < info->num_outputs; i++) {
1875bf215546Sopenharmony_ci      for (unsigned chan = 0; chan < 4; chan++, out_idx++) {
1876bf215546Sopenharmony_ci         if (!(info->output_usagemask[i] & (1 << chan)) ||
1877bf215546Sopenharmony_ci             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
1878bf215546Sopenharmony_ci            continue;
1879bf215546Sopenharmony_ci
1880bf215546Sopenharmony_ci         LLVMValueRef out_val = LLVMBuildLoad2(builder, ctx->ac.f32, addrs[4 * i + chan], "");
1881bf215546Sopenharmony_ci         LLVMTypeRef as_int = ac_to_integer_type(&ctx->ac, ctx->ac.f32);
1882bf215546Sopenharmony_ci         out_val = LLVMBuildBitCast(ctx->ac.builder, out_val, as_int, "");
1883bf215546Sopenharmony_ci         LLVMBuildStore(builder, out_val, ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx));
1884bf215546Sopenharmony_ci      }
1885bf215546Sopenharmony_ci   }
1886bf215546Sopenharmony_ci   assert(out_idx * 4 == info->gsvs_vertex_size);
1887bf215546Sopenharmony_ci
1888bf215546Sopenharmony_ci   /* Determine and store whether this vertex completed a primitive. */
1889bf215546Sopenharmony_ci   const LLVMValueRef curverts = LLVMBuildLoad2(builder, ctx->ac.i32, ctx->gs_curprim_verts[stream], "");
1890bf215546Sopenharmony_ci
1891bf215546Sopenharmony_ci   tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->info.base.gs.output_primitive) - 1, false);
1892bf215546Sopenharmony_ci   const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");
1893bf215546Sopenharmony_ci
1894bf215546Sopenharmony_ci   /* Since the geometry shader emits triangle strips, we need to
1895bf215546Sopenharmony_ci    * track which primitive is odd and swap vertex indices to get
1896bf215546Sopenharmony_ci    * the correct vertex order.
1897bf215546Sopenharmony_ci    */
1898bf215546Sopenharmony_ci   LLVMValueRef is_odd = ctx->ac.i1false;
1899bf215546Sopenharmony_ci   if (stream == 0 && u_vertices_per_prim(sel->info.base.gs.output_primitive) == 3) {
1900bf215546Sopenharmony_ci      tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, "");
1901bf215546Sopenharmony_ci      is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, "");
1902bf215546Sopenharmony_ci   }
1903bf215546Sopenharmony_ci
1904bf215546Sopenharmony_ci   tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
1905bf215546Sopenharmony_ci   LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);
1906bf215546Sopenharmony_ci
1907bf215546Sopenharmony_ci   /* The per-vertex primitive flag encoding:
1908bf215546Sopenharmony_ci    *   bit 0: whether this vertex finishes a primitive
1909bf215546Sopenharmony_ci    *   bit 1: whether the primitive is odd (if we are emitting triangle strips)
1910bf215546Sopenharmony_ci    */
1911bf215546Sopenharmony_ci   tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
1912bf215546Sopenharmony_ci   tmp = LLVMBuildOr(
1913bf215546Sopenharmony_ci      builder, tmp,
1914bf215546Sopenharmony_ci      LLVMBuildShl(builder, LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), ctx->ac.i8_1, ""), "");
1915bf215546Sopenharmony_ci   LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream));
1916bf215546Sopenharmony_ci
1917bf215546Sopenharmony_ci   tmp = LLVMBuildLoad2(builder, ctx->ac.i32, ctx->gs_generated_prims[stream], "");
1918bf215546Sopenharmony_ci   tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
1919bf215546Sopenharmony_ci   LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
1920bf215546Sopenharmony_ci
1921bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, 9001);
1922bf215546Sopenharmony_ci}
1923bf215546Sopenharmony_ci
1924bf215546Sopenharmony_civoid gfx10_ngg_gs_emit_begin(struct si_shader_context *ctx)
1925bf215546Sopenharmony_ci{
1926bf215546Sopenharmony_ci   /* Zero out the part of LDS scratch that is used to accumulate the
1927bf215546Sopenharmony_ci    * per-stream generated primitive count.
1928bf215546Sopenharmony_ci    */
1929bf215546Sopenharmony_ci   LLVMBuilderRef builder = ctx->ac.builder;
1930bf215546Sopenharmony_ci   LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
1931bf215546Sopenharmony_ci   LLVMValueRef tid = gfx10_get_thread_id_in_tg(ctx);
1932bf215546Sopenharmony_ci   LLVMValueRef tmp;
1933bf215546Sopenharmony_ci
1934bf215546Sopenharmony_ci   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
1935bf215546Sopenharmony_ci   ac_build_ifcc(&ctx->ac, tmp, 5090);
1936bf215546Sopenharmony_ci   {
1937bf215546Sopenharmony_ci      LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
1938bf215546Sopenharmony_ci      LLVMBuildStore(builder, ctx->ac.i32_0, ptr);
1939bf215546Sopenharmony_ci   }
1940bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, 5090);
1941bf215546Sopenharmony_ci
1942bf215546Sopenharmony_ci   if (ctx->screen->info.gfx_level < GFX11) {
1943bf215546Sopenharmony_ci      tmp = si_is_gs_thread(ctx);
1944bf215546Sopenharmony_ci      ac_build_ifcc(&ctx->ac, tmp, 15090);
1945bf215546Sopenharmony_ci         {
1946bf215546Sopenharmony_ci            tmp = GET_FIELD(ctx, GS_STATE_PIPELINE_STATS_EMU);
1947bf215546Sopenharmony_ci            tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1948bf215546Sopenharmony_ci            ac_build_ifcc(&ctx->ac, tmp, 5109); /* if (GS_PIPELINE_STATS_EMU) */
1949bf215546Sopenharmony_ci            LLVMValueRef args[] = {
1950bf215546Sopenharmony_ci               ctx->ac.i32_1,
1951bf215546Sopenharmony_ci               ngg_get_emulated_counters_buf(ctx),
1952bf215546Sopenharmony_ci               LLVMConstInt(ctx->ac.i32,
1953bf215546Sopenharmony_ci                            si_query_pipestat_end_dw_offset(ctx->screen, PIPE_STAT_QUERY_GS_INVOCATIONS) * 4,
1954bf215546Sopenharmony_ci                            false),
1955bf215546Sopenharmony_ci               ctx->ac.i32_0,                            /* soffset */
1956bf215546Sopenharmony_ci               ctx->ac.i32_0,                            /* cachepolicy */
1957bf215546Sopenharmony_ci            };
1958bf215546Sopenharmony_ci
1959bf215546Sopenharmony_ci            ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5, 0);
1960bf215546Sopenharmony_ci            ac_build_endif(&ctx->ac, 5109);
1961bf215546Sopenharmony_ci         }
1962bf215546Sopenharmony_ci      ac_build_endif(&ctx->ac, 15090);
1963bf215546Sopenharmony_ci   }
1964bf215546Sopenharmony_ci
1965bf215546Sopenharmony_ci   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
1966bf215546Sopenharmony_ci   ac_build_s_barrier(&ctx->ac, ctx->stage);
1967bf215546Sopenharmony_ci}
1968bf215546Sopenharmony_ci
1969bf215546Sopenharmony_civoid gfx10_ngg_gs_build_end(struct si_shader_context *ctx)
1970bf215546Sopenharmony_ci{
1971bf215546Sopenharmony_ci   const struct si_shader_selector *sel = ctx->shader->selector;
1972bf215546Sopenharmony_ci   const struct si_shader_info *info = &sel->info;
1973bf215546Sopenharmony_ci   const unsigned verts_per_prim = u_vertices_per_prim(sel->info.base.gs.output_primitive);
1974bf215546Sopenharmony_ci   LLVMBuilderRef builder = ctx->ac.builder;
1975bf215546Sopenharmony_ci   LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
1976bf215546Sopenharmony_ci   LLVMValueRef tmp, tmp2;
1977bf215546Sopenharmony_ci
1978bf215546Sopenharmony_ci   /* Zero out remaining (non-emitted) primitive flags.
1979bf215546Sopenharmony_ci    *
1980bf215546Sopenharmony_ci    * Note: Alternatively, we could pass the relevant gs_next_vertex to
1981bf215546Sopenharmony_ci    *       the emit threads via LDS. This is likely worse in the expected
1982bf215546Sopenharmony_ci    *       typical case where each GS thread emits the full set of
1983bf215546Sopenharmony_ci    *       vertices.
1984bf215546Sopenharmony_ci    */
1985bf215546Sopenharmony_ci   for (unsigned stream = 0; stream < 4; ++stream) {
1986bf215546Sopenharmony_ci      if (!info->num_stream_output_components[stream])
1987bf215546Sopenharmony_ci         continue;
1988bf215546Sopenharmony_ci
1989bf215546Sopenharmony_ci      const LLVMValueRef gsthread = gfx10_get_thread_id_in_tg(ctx);
1990bf215546Sopenharmony_ci
1991bf215546Sopenharmony_ci      ac_build_bgnloop(&ctx->ac, 5100);
1992bf215546Sopenharmony_ci
1993bf215546Sopenharmony_ci      const LLVMValueRef vertexidx = LLVMBuildLoad2(builder, ctx->ac.i32, ctx->gs_next_vertex[stream], "");
1994bf215546Sopenharmony_ci      tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
1995bf215546Sopenharmony_ci                          LLVMConstInt(ctx->ac.i32, sel->info.base.gs.vertices_out, false), "");
1996bf215546Sopenharmony_ci      ac_build_ifcc(&ctx->ac, tmp, 5101);
1997bf215546Sopenharmony_ci      ac_build_break(&ctx->ac);
1998bf215546Sopenharmony_ci      ac_build_endif(&ctx->ac, 5101);
1999bf215546Sopenharmony_ci
2000bf215546Sopenharmony_ci      tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
2001bf215546Sopenharmony_ci      LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
2002bf215546Sopenharmony_ci
2003bf215546Sopenharmony_ci      tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
2004bf215546Sopenharmony_ci      LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream));
2005bf215546Sopenharmony_ci
2006bf215546Sopenharmony_ci      ac_build_endloop(&ctx->ac, 5100);
2007bf215546Sopenharmony_ci   }
2008bf215546Sopenharmony_ci
2009bf215546Sopenharmony_ci   /* Accumulate generated primitives counts across the entire threadgroup. */
2010bf215546Sopenharmony_ci   for (unsigned stream = 0; stream < 4; ++stream) {
2011bf215546Sopenharmony_ci      if (!info->num_stream_output_components[stream])
2012bf215546Sopenharmony_ci         continue;
2013bf215546Sopenharmony_ci
2014bf215546Sopenharmony_ci      LLVMValueRef numprims = LLVMBuildLoad2(builder, ctx->ac.i32, ctx->gs_generated_prims[stream], "");
2015bf215546Sopenharmony_ci      numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size);
2016bf215546Sopenharmony_ci
2017bf215546Sopenharmony_ci      tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
2018bf215546Sopenharmony_ci      ac_build_ifcc(&ctx->ac, tmp, 5105);
2019bf215546Sopenharmony_ci      {
2020bf215546Sopenharmony_ci         LLVMBuildAtomicRMW(
2021bf215546Sopenharmony_ci            builder, LLVMAtomicRMWBinOpAdd,
2022bf215546Sopenharmony_ci            ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, stream, false)),
2023bf215546Sopenharmony_ci            numprims, LLVMAtomicOrderingMonotonic, false);
2024bf215546Sopenharmony_ci      }
2025bf215546Sopenharmony_ci      ac_build_endif(&ctx->ac, 5105);
2026bf215546Sopenharmony_ci   }
2027bf215546Sopenharmony_ci
2028bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
2029bf215546Sopenharmony_ci
2030bf215546Sopenharmony_ci   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
2031bf215546Sopenharmony_ci   ac_build_s_barrier(&ctx->ac, ctx->stage);
2032bf215546Sopenharmony_ci
2033bf215546Sopenharmony_ci   const LLVMValueRef tid = gfx10_get_thread_id_in_tg(ctx);
2034bf215546Sopenharmony_ci   LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
2035bf215546Sopenharmony_ci
2036bf215546Sopenharmony_ci   /* Streamout */
2037bf215546Sopenharmony_ci   if (ctx->so.num_outputs) {
2038bf215546Sopenharmony_ci      struct ngg_streamout nggso = {};
2039bf215546Sopenharmony_ci
2040bf215546Sopenharmony_ci      nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false);
2041bf215546Sopenharmony_ci
2042bf215546Sopenharmony_ci      LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid);
2043bf215546Sopenharmony_ci      for (unsigned stream = 0; stream < 4; ++stream) {
2044bf215546Sopenharmony_ci         if (!info->num_stream_output_components[stream])
2045bf215546Sopenharmony_ci            continue;
2046bf215546Sopenharmony_ci
2047bf215546Sopenharmony_ci         tmp = LLVMBuildLoad2(builder, ctx->ac.i8, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), "");
2048bf215546Sopenharmony_ci         tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
2049bf215546Sopenharmony_ci         tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
2050bf215546Sopenharmony_ci         nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, "");
2051bf215546Sopenharmony_ci      }
2052bf215546Sopenharmony_ci
2053bf215546Sopenharmony_ci      for (unsigned i = 0; i < verts_per_prim; ++i) {
2054bf215546Sopenharmony_ci         tmp = LLVMBuildSub(builder, tid, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false),
2055bf215546Sopenharmony_ci                            "");
2056bf215546Sopenharmony_ci         tmp = ngg_gs_vertex_ptr(ctx, tmp);
2057bf215546Sopenharmony_ci         nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
2058bf215546Sopenharmony_ci      }
2059bf215546Sopenharmony_ci
2060bf215546Sopenharmony_ci      build_streamout(ctx, &nggso);
2061bf215546Sopenharmony_ci   }
2062bf215546Sopenharmony_ci
2063bf215546Sopenharmony_ci   /* Write shader query data. */
2064bf215546Sopenharmony_ci   if (ctx->screen->use_ngg_streamout) {
2065bf215546Sopenharmony_ci      tmp = GET_FIELD(ctx, GS_STATE_STREAMOUT_QUERY_ENABLED);
2066bf215546Sopenharmony_ci      tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
2067bf215546Sopenharmony_ci      ac_build_ifcc(&ctx->ac, tmp, 5109); /* if (STREAMOUT_QUERY_ENABLED) */
2068bf215546Sopenharmony_ci      unsigned num_query_comps = ctx->so.num_outputs ? 8 : 4;
2069bf215546Sopenharmony_ci      tmp = LLVMBuildICmp(builder, LLVMIntULT, tid,
2070bf215546Sopenharmony_ci                          LLVMConstInt(ctx->ac.i32, num_query_comps, false), "");
2071bf215546Sopenharmony_ci      ac_build_ifcc(&ctx->ac, tmp, 5110);
2072bf215546Sopenharmony_ci      {
2073bf215546Sopenharmony_ci         LLVMValueRef offset;
2074bf215546Sopenharmony_ci         tmp = tid;
2075bf215546Sopenharmony_ci         if (ctx->so.num_outputs)
2076bf215546Sopenharmony_ci            tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->ac.i32, 3, false), "");
2077bf215546Sopenharmony_ci         offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 32, false), "");
2078bf215546Sopenharmony_ci         if (ctx->so.num_outputs) {
2079bf215546Sopenharmony_ci            tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->ac.i32, 2, false), "");
2080bf215546Sopenharmony_ci            tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 8, false), "");
2081bf215546Sopenharmony_ci            offset = LLVMBuildAdd(builder, offset, tmp, "");
2082bf215546Sopenharmony_ci         }
2083bf215546Sopenharmony_ci
2084bf215546Sopenharmony_ci         tmp = LLVMBuildLoad2(builder, ctx->ac.i32, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
2085bf215546Sopenharmony_ci         LLVMValueRef args[] = {
2086bf215546Sopenharmony_ci            tmp,           ngg_get_query_buf(ctx),
2087bf215546Sopenharmony_ci            offset,        LLVMConstInt(ctx->ac.i32, 16, false), /* soffset */
2088bf215546Sopenharmony_ci            ctx->ac.i32_0,                                       /* cachepolicy */
2089bf215546Sopenharmony_ci         };
2090bf215546Sopenharmony_ci         ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5,
2091bf215546Sopenharmony_ci                            0);
2092bf215546Sopenharmony_ci      }
2093bf215546Sopenharmony_ci      ac_build_endif(&ctx->ac, 5110);
2094bf215546Sopenharmony_ci      ac_build_endif(&ctx->ac, 5109);
2095bf215546Sopenharmony_ci   }
2096bf215546Sopenharmony_ci
2097bf215546Sopenharmony_ci   /* Cull primitives. */
2098bf215546Sopenharmony_ci   if (ctx->shader->key.ge.opt.ngg_culling) {
2099bf215546Sopenharmony_ci      assert(info->num_stream_output_components[0]);
2100bf215546Sopenharmony_ci
2101bf215546Sopenharmony_ci      LLVMValueRef gs_vtxptr = ngg_gs_vertex_ptr(ctx, tid);
2102bf215546Sopenharmony_ci      LLVMValueRef live = LLVMBuildLoad2(builder, ctx->ac.i8, ngg_gs_get_emit_primflag_ptr(ctx, gs_vtxptr, 0), "");
2103bf215546Sopenharmony_ci      live = LLVMBuildTrunc(builder, live, ctx->ac.i1, "");
2104bf215546Sopenharmony_ci      LLVMValueRef is_emit = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
2105bf215546Sopenharmony_ci      LLVMValueRef prim_enable = LLVMBuildAnd(builder, live, is_emit, "");
2106bf215546Sopenharmony_ci
2107bf215546Sopenharmony_ci      /* Wait for streamout to finish before we kill primitives. */
2108bf215546Sopenharmony_ci      if (ctx->so.num_outputs) {
2109bf215546Sopenharmony_ci         ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
2110bf215546Sopenharmony_ci         ac_build_s_barrier(&ctx->ac, ctx->stage);
2111bf215546Sopenharmony_ci      }
2112bf215546Sopenharmony_ci
2113bf215546Sopenharmony_ci      ac_build_ifcc(&ctx->ac, prim_enable, 0);
2114bf215546Sopenharmony_ci      {
2115bf215546Sopenharmony_ci         LLVMValueRef vtxptr[3] = {};
2116bf215546Sopenharmony_ci         LLVMValueRef pos[3][4] = {};
2117bf215546Sopenharmony_ci
2118bf215546Sopenharmony_ci         for (unsigned i = 0; i < verts_per_prim; i++) {
2119bf215546Sopenharmony_ci            tmp = LLVMBuildSub(builder, tid, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
2120bf215546Sopenharmony_ci            vtxptr[i] = ac_build_gep0(&ctx->ac, ngg_gs_vertex_ptr(ctx, tmp), ctx->ac.i32_0);
2121bf215546Sopenharmony_ci         }
2122bf215546Sopenharmony_ci
2123bf215546Sopenharmony_ci         for (unsigned i = 0; i < info->num_outputs; i++) {
2124bf215546Sopenharmony_ci            /* If the stream index is non-zero for all channels, skip the output. */
2125bf215546Sopenharmony_ci            if (info->output_streams[i] & 0x3 &&
2126bf215546Sopenharmony_ci                (info->output_streams[i] >> 2) & 0x3 &&
2127bf215546Sopenharmony_ci                (info->output_streams[i] >> 4) & 0x3 &&
2128bf215546Sopenharmony_ci                (info->output_streams[i] >> 6) & 0x3)
2129bf215546Sopenharmony_ci               continue;
2130bf215546Sopenharmony_ci
2131bf215546Sopenharmony_ci            switch (info->output_semantic[i]) {
2132bf215546Sopenharmony_ci            case VARYING_SLOT_POS:
2133bf215546Sopenharmony_ci               /* Load the positions from LDS. */
2134bf215546Sopenharmony_ci               for (unsigned vert = 0; vert < verts_per_prim; vert++) {
2135bf215546Sopenharmony_ci                  for (unsigned comp = 0; comp < 4; comp++) {
2136bf215546Sopenharmony_ci                     /* Z is not needed. */
2137bf215546Sopenharmony_ci                     if (comp == 2)
2138bf215546Sopenharmony_ci                        continue;
2139bf215546Sopenharmony_ci
2140bf215546Sopenharmony_ci                     tmp = ac_build_gep0(&ctx->ac, vtxptr[vert],
2141bf215546Sopenharmony_ci                                         LLVMConstInt(ctx->ac.i32, 4 * i + comp, false));
2142bf215546Sopenharmony_ci                     pos[vert][comp] = LLVMBuildLoad(builder, tmp, "");
2143bf215546Sopenharmony_ci                     pos[vert][comp] = ac_to_float(&ctx->ac, pos[vert][comp]);
2144bf215546Sopenharmony_ci                  }
2145bf215546Sopenharmony_ci               }
2146bf215546Sopenharmony_ci
2147bf215546Sopenharmony_ci               /* Divide XY by W. */
2148bf215546Sopenharmony_ci               for (unsigned vert = 0; vert < verts_per_prim; vert++) {
2149bf215546Sopenharmony_ci                  for (unsigned comp = 0; comp < 2; comp++)
2150bf215546Sopenharmony_ci                     pos[vert][comp] = ac_build_fdiv(&ctx->ac, pos[vert][comp], pos[vert][3]);
2151bf215546Sopenharmony_ci               }
2152bf215546Sopenharmony_ci               break;
2153bf215546Sopenharmony_ci            }
2154bf215546Sopenharmony_ci         }
2155bf215546Sopenharmony_ci
2156bf215546Sopenharmony_ci         LLVMValueRef clipdist_accepted = ctx->ac.i1true; /* TODO */
2157bf215546Sopenharmony_ci         LLVMValueRef accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
2158bf215546Sopenharmony_ci
2159bf215546Sopenharmony_ci         cull_primitive(ctx, pos, clipdist_accepted, accepted, NULL);
2160bf215546Sopenharmony_ci
2161bf215546Sopenharmony_ci         accepted = LLVMBuildLoad2(builder, ctx->ac.i32, accepted, "");
2162bf215546Sopenharmony_ci         LLVMValueRef rejected = LLVMBuildNot(builder, LLVMBuildTrunc(builder, accepted, ctx->ac.i1, ""), "");
2163bf215546Sopenharmony_ci
2164bf215546Sopenharmony_ci         ac_build_ifcc(&ctx->ac, rejected, 0);
2165bf215546Sopenharmony_ci         LLVMBuildStore(builder, ctx->ac.i8_0, ngg_gs_get_emit_primflag_ptr(ctx, gs_vtxptr, 0));
2166bf215546Sopenharmony_ci         ac_build_endif(&ctx->ac, 0);
2167bf215546Sopenharmony_ci      }
2168bf215546Sopenharmony_ci      ac_build_endif(&ctx->ac, 0);
2169bf215546Sopenharmony_ci
2170bf215546Sopenharmony_ci      ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
2171bf215546Sopenharmony_ci      ac_build_s_barrier(&ctx->ac, ctx->stage);
2172bf215546Sopenharmony_ci   }
2173bf215546Sopenharmony_ci
2174bf215546Sopenharmony_ci   /* Determine vertex liveness. */
2175bf215546Sopenharmony_ci   LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");
2176bf215546Sopenharmony_ci
2177bf215546Sopenharmony_ci   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
2178bf215546Sopenharmony_ci   ac_build_ifcc(&ctx->ac, tmp, 5120);
2179bf215546Sopenharmony_ci   {
2180bf215546Sopenharmony_ci      for (unsigned i = 0; i < verts_per_prim; ++i) {
2181bf215546Sopenharmony_ci         const LLVMValueRef primidx =
2182bf215546Sopenharmony_ci            LLVMBuildAdd(builder, tid, LLVMConstInt(ctx->ac.i32, i, false), "");
2183bf215546Sopenharmony_ci
2184bf215546Sopenharmony_ci         if (i > 0) {
2185bf215546Sopenharmony_ci            tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
2186bf215546Sopenharmony_ci            ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
2187bf215546Sopenharmony_ci         }
2188bf215546Sopenharmony_ci
2189bf215546Sopenharmony_ci         /* Load primitive liveness */
2190bf215546Sopenharmony_ci         tmp = ngg_gs_vertex_ptr(ctx, primidx);
2191bf215546Sopenharmony_ci         tmp = LLVMBuildLoad2(builder, ctx->ac.i8, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
2192bf215546Sopenharmony_ci         const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
2193bf215546Sopenharmony_ci
2194bf215546Sopenharmony_ci         tmp = LLVMBuildLoad2(builder, ctx->ac.i1, vertliveptr, "");
2195bf215546Sopenharmony_ci         tmp = LLVMBuildOr(builder, tmp, primlive, ""), LLVMBuildStore(builder, tmp, vertliveptr);
2196bf215546Sopenharmony_ci
2197bf215546Sopenharmony_ci         if (i > 0)
2198bf215546Sopenharmony_ci            ac_build_endif(&ctx->ac, 5121 + i);
2199bf215546Sopenharmony_ci      }
2200bf215546Sopenharmony_ci   }
2201bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, 5120);
2202bf215546Sopenharmony_ci
2203bf215546Sopenharmony_ci   /* Inclusive scan addition across the current wave. */
2204bf215546Sopenharmony_ci   LLVMValueRef vertlive = LLVMBuildLoad2(builder, ctx->ac.i1, vertliveptr, "");
2205bf215546Sopenharmony_ci   struct ac_wg_scan vertlive_scan = {};
2206bf215546Sopenharmony_ci   vertlive_scan.stage = ctx->stage;
2207bf215546Sopenharmony_ci   vertlive_scan.op = nir_op_iadd;
2208bf215546Sopenharmony_ci   vertlive_scan.enable_reduce = true;
2209bf215546Sopenharmony_ci   vertlive_scan.enable_exclusive = true;
2210bf215546Sopenharmony_ci   vertlive_scan.src = vertlive;
2211bf215546Sopenharmony_ci   vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0);
2212bf215546Sopenharmony_ci   vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
2213bf215546Sopenharmony_ci   vertlive_scan.numwaves = get_tgsize(ctx);
2214bf215546Sopenharmony_ci   vertlive_scan.maxwaves = DIV_ROUND_UP(256, ctx->ac.wave_size);
2215bf215546Sopenharmony_ci
2216bf215546Sopenharmony_ci   ac_build_wg_scan(&ctx->ac, &vertlive_scan);
2217bf215546Sopenharmony_ci
2218bf215546Sopenharmony_ci   /* Skip all exports (including index exports) when possible. */
2219bf215546Sopenharmony_ci   LLVMValueRef have_exports =
2220bf215546Sopenharmony_ci      LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
2221bf215546Sopenharmony_ci   num_emit_threads = LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");
2222bf215546Sopenharmony_ci
2223bf215546Sopenharmony_ci   /* Allocate export space. Send this message as early as possible, to
2224bf215546Sopenharmony_ci    * hide the latency of the SQ <-> SPI roundtrip.
2225bf215546Sopenharmony_ci    */
2226bf215546Sopenharmony_ci   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), vertlive_scan.result_reduce,
2227bf215546Sopenharmony_ci                                 num_emit_threads);
2228bf215546Sopenharmony_ci
2229bf215546Sopenharmony_ci   /* Setup the reverse vertex compaction permutation. We re-use stream 1
2230bf215546Sopenharmony_ci    * of the primitive liveness flags, relying on the fact that each
2231bf215546Sopenharmony_ci    * threadgroup can have at most 256 threads. */
2232bf215546Sopenharmony_ci   ac_build_ifcc(&ctx->ac, vertlive, 5130);
2233bf215546Sopenharmony_ci   {
2234bf215546Sopenharmony_ci      tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
2235bf215546Sopenharmony_ci      tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
2236bf215546Sopenharmony_ci      LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1));
2237bf215546Sopenharmony_ci   }
2238bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, 5130);
2239bf215546Sopenharmony_ci
2240bf215546Sopenharmony_ci   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
2241bf215546Sopenharmony_ci   ac_build_s_barrier(&ctx->ac, ctx->stage);
2242bf215546Sopenharmony_ci
2243bf215546Sopenharmony_ci   /* Export primitive data */
2244bf215546Sopenharmony_ci   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
2245bf215546Sopenharmony_ci   ac_build_ifcc(&ctx->ac, tmp, 5140);
2246bf215546Sopenharmony_ci   {
2247bf215546Sopenharmony_ci      LLVMValueRef flags;
2248bf215546Sopenharmony_ci      struct ac_ngg_prim prim = {};
2249bf215546Sopenharmony_ci      prim.num_vertices = verts_per_prim;
2250bf215546Sopenharmony_ci
2251bf215546Sopenharmony_ci      tmp = ngg_gs_vertex_ptr(ctx, tid);
2252bf215546Sopenharmony_ci      flags = LLVMBuildLoad2(builder, ctx->ac.i8, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
2253bf215546Sopenharmony_ci      prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), "");
2254bf215546Sopenharmony_ci      prim.edgeflags = ctx->ac.i32_0;
2255bf215546Sopenharmony_ci
2256bf215546Sopenharmony_ci      for (unsigned i = 0; i < verts_per_prim; ++i) {
2257bf215546Sopenharmony_ci         prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
2258bf215546Sopenharmony_ci                                      LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
2259bf215546Sopenharmony_ci      }
2260bf215546Sopenharmony_ci
2261bf215546Sopenharmony_ci      /* Geometry shaders output triangle strips, but NGG expects triangles. */
2262bf215546Sopenharmony_ci      if (verts_per_prim == 3) {
2263bf215546Sopenharmony_ci         LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, "");
2264bf215546Sopenharmony_ci         is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, "");
2265bf215546Sopenharmony_ci         LLVMValueRef flatshade_first = LLVMBuildICmp(
2266bf215546Sopenharmony_ci            builder, LLVMIntEQ, GET_FIELD(ctx, GS_STATE_PROVOKING_VTX_INDEX), ctx->ac.i32_0, "");
2267bf215546Sopenharmony_ci
2268bf215546Sopenharmony_ci         ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, prim.index);
2269bf215546Sopenharmony_ci      }
2270bf215546Sopenharmony_ci
2271bf215546Sopenharmony_ci      ac_build_export_prim(&ctx->ac, &prim);
2272bf215546Sopenharmony_ci
2273bf215546Sopenharmony_ci      if (ctx->screen->info.gfx_level < GFX11) {
2274bf215546Sopenharmony_ci         tmp = GET_FIELD(ctx, GS_STATE_PIPELINE_STATS_EMU);
2275bf215546Sopenharmony_ci         tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
2276bf215546Sopenharmony_ci         ac_build_ifcc(&ctx->ac, tmp, 5229); /* if (GS_PIPELINE_STATS_EMU) */
2277bf215546Sopenharmony_ci         ac_build_ifcc(&ctx->ac, LLVMBuildNot(builder, prim.isnull, ""), 5237);
2278bf215546Sopenharmony_ci         {
2279bf215546Sopenharmony_ci            LLVMValueRef args[] = {
2280bf215546Sopenharmony_ci               ctx->ac.i32_1,
2281bf215546Sopenharmony_ci               ngg_get_emulated_counters_buf(ctx),
2282bf215546Sopenharmony_ci               LLVMConstInt(ctx->ac.i32,
2283bf215546Sopenharmony_ci                            si_query_pipestat_end_dw_offset(ctx->screen, PIPE_STAT_QUERY_GS_PRIMITIVES) * 4,
2284bf215546Sopenharmony_ci                            false),
2285bf215546Sopenharmony_ci               ctx->ac.i32_0,                            /* soffset */
2286bf215546Sopenharmony_ci               ctx->ac.i32_0,                            /* cachepolicy */
2287bf215546Sopenharmony_ci            };
2288bf215546Sopenharmony_ci
2289bf215546Sopenharmony_ci            ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5, 0);
2290bf215546Sopenharmony_ci         }
2291bf215546Sopenharmony_ci         ac_build_endif(&ctx->ac, 5237);
2292bf215546Sopenharmony_ci         ac_build_endif(&ctx->ac, 5229);
2293bf215546Sopenharmony_ci      }
2294bf215546Sopenharmony_ci   }
2295bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, 5140);
2296bf215546Sopenharmony_ci
2297bf215546Sopenharmony_ci   /* Export position and parameter data */
2298bf215546Sopenharmony_ci   LLVMValueRef num_export_threads = vertlive_scan.result_reduce;
2299bf215546Sopenharmony_ci   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_export_threads, "");
2300bf215546Sopenharmony_ci   ac_build_ifcc(&ctx->ac, tmp, 5145);
2301bf215546Sopenharmony_ci   {
2302bf215546Sopenharmony_ci      struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
2303bf215546Sopenharmony_ci
2304bf215546Sopenharmony_ci      tmp = ngg_gs_vertex_ptr(ctx, tid);
2305bf215546Sopenharmony_ci      tmp = LLVMBuildLoad2(builder, ctx->ac.i8, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), "");
2306bf215546Sopenharmony_ci      tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
2307bf215546Sopenharmony_ci      const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);
2308bf215546Sopenharmony_ci
2309bf215546Sopenharmony_ci      unsigned out_idx = 0;
2310bf215546Sopenharmony_ci      for (unsigned i = 0; i < info->num_outputs; i++) {
2311bf215546Sopenharmony_ci         outputs[i].semantic = info->output_semantic[i];
2312bf215546Sopenharmony_ci
2313bf215546Sopenharmony_ci         for (unsigned j = 0; j < 4; j++, out_idx++) {
2314bf215546Sopenharmony_ci            tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx);
2315bf215546Sopenharmony_ci            tmp = LLVMBuildLoad2(builder, ctx->ac.i32, tmp, "");
2316bf215546Sopenharmony_ci            assert(LLVMGetTypeKind(LLVMTypeOf(tmp)) != LLVMPointerTypeKind);
2317bf215546Sopenharmony_ci            outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
2318bf215546Sopenharmony_ci            outputs[i].vertex_streams = info->output_streams[i];
2319bf215546Sopenharmony_ci         }
2320bf215546Sopenharmony_ci      }
2321bf215546Sopenharmony_ci
2322bf215546Sopenharmony_ci      si_llvm_build_vs_exports(ctx, num_export_threads, outputs, info->num_outputs);
2323bf215546Sopenharmony_ci   }
2324bf215546Sopenharmony_ci   ac_build_endif(&ctx->ac, 5145);
2325bf215546Sopenharmony_ci}
2326bf215546Sopenharmony_ci
/**
 * Lower a GS primitive limit so that it is consistent with an ES vertex limit.
 *
 * The new upper bound is 1 + (max_esverts - min_verts_per_prim), where the
 * difference is halved first when adjacency primitives are in use.
 * *max_gsprims is only ever reduced, never raised.
 *
 * \param max_gsprims         in/out primitive limit to clamp
 * \param max_esverts         ES vertex limit to clamp against
 * \param min_verts_per_prim  vertex count subtracted for the first primitive
 * \param use_adjacency       true to halve the remaining vertex budget
 *
 * NOTE: the subtraction is unsigned and wraps around if
 * max_esverts < min_verts_per_prim.
 */
static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
                                     unsigned min_verts_per_prim, bool use_adjacency)
{
   /* Vertices that remain after one primitive's worth has been set aside. */
   const unsigned spare_verts = max_esverts - min_verts_per_prim;
   const unsigned extra_prims = use_adjacency ? spare_verts / 2 : spare_verts;
   const unsigned prim_limit = extra_prims + 1;

   if (prim_limit < *max_gsprims)
      *max_gsprims = prim_limit;
}
2335bf215546Sopenharmony_ci
2336bf215546Sopenharmony_ciunsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader)
2337bf215546Sopenharmony_ci{
2338bf215546Sopenharmony_ci   const struct si_shader_selector *sel = shader->selector;
2339bf215546Sopenharmony_ci
2340bf215546Sopenharmony_ci   if (sel->stage == MESA_SHADER_GEOMETRY && si_shader_uses_streamout(shader))
2341bf215546Sopenharmony_ci      return 44;
2342bf215546Sopenharmony_ci
2343bf215546Sopenharmony_ci   return 8;
2344bf215546Sopenharmony_ci}
2345bf215546Sopenharmony_ci
2346bf215546Sopenharmony_ci/**
2347bf215546Sopenharmony_ci * Determine subgroup information like maximum number of vertices and prims.
2348bf215546Sopenharmony_ci *
2349bf215546Sopenharmony_ci * This happens before the shader is uploaded, since LDS relocations during
2350bf215546Sopenharmony_ci * upload depend on the subgroup size.
2351bf215546Sopenharmony_ci */
2352bf215546Sopenharmony_cibool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
2353bf215546Sopenharmony_ci{
2354bf215546Sopenharmony_ci   const struct si_shader_selector *gs_sel = shader->selector;
2355bf215546Sopenharmony_ci   const struct si_shader_selector *es_sel =
2356bf215546Sopenharmony_ci      shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel;
2357bf215546Sopenharmony_ci   const gl_shader_stage gs_stage = gs_sel->stage;
2358bf215546Sopenharmony_ci   const unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1);
2359bf215546Sopenharmony_ci   const unsigned input_prim = si_get_input_prim(gs_sel, &shader->key);
2360bf215546Sopenharmony_ci   const bool use_adjacency =
2361bf215546Sopenharmony_ci      input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
2362bf215546Sopenharmony_ci   const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim);
2363bf215546Sopenharmony_ci   const unsigned min_verts_per_prim = gs_stage == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
2364bf215546Sopenharmony_ci
2365bf215546Sopenharmony_ci   /* All these are in dwords: */
2366bf215546Sopenharmony_ci   /* GE can only use 8K dwords (32KB) of LDS per workgroup.
2367bf215546Sopenharmony_ci    */
2368bf215546Sopenharmony_ci   const unsigned max_lds_size = 8 * 1024 - gfx10_ngg_get_scratch_dw_size(shader);
2369bf215546Sopenharmony_ci   const unsigned target_lds_size = max_lds_size;
2370bf215546Sopenharmony_ci   unsigned esvert_lds_size = 0;
2371bf215546Sopenharmony_ci   unsigned gsprim_lds_size = 0;
2372bf215546Sopenharmony_ci
2373bf215546Sopenharmony_ci   /* All these are per subgroup: */
2374bf215546Sopenharmony_ci   const unsigned min_esverts =
2375bf215546Sopenharmony_ci      gs_sel->screen->info.gfx_level >= GFX11 ? 3 : /* gfx11 requires at least 1 primitive per TG */
2376bf215546Sopenharmony_ci      gs_sel->screen->info.gfx_level >= GFX10_3 ? 29 : (24 - 1 + max_verts_per_prim);
2377bf215546Sopenharmony_ci   bool max_vert_out_per_gs_instance = false;
2378bf215546Sopenharmony_ci   unsigned max_gsprims_base = gs_sel->screen->ngg_subgroup_size; /* default prim group size clamp */
2379bf215546Sopenharmony_ci   unsigned max_esverts_base = gs_sel->screen->ngg_subgroup_size;
2380bf215546Sopenharmony_ci
2381bf215546Sopenharmony_ci   if (gs_stage == MESA_SHADER_GEOMETRY) {
2382bf215546Sopenharmony_ci      bool force_multi_cycling = false;
2383bf215546Sopenharmony_ci      unsigned max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out * gs_num_invocations;
2384bf215546Sopenharmony_ci
2385bf215546Sopenharmony_ciretry_select_mode:
2386bf215546Sopenharmony_ci      if (max_out_verts_per_gsprim <= 256 && !force_multi_cycling) {
2387bf215546Sopenharmony_ci         if (max_out_verts_per_gsprim) {
2388bf215546Sopenharmony_ci            max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
2389bf215546Sopenharmony_ci         }
2390bf215546Sopenharmony_ci      } else {
2391bf215546Sopenharmony_ci         /* Use special multi-cycling mode in which each GS
2392bf215546Sopenharmony_ci          * instance gets its own subgroup. Does not work with
2393bf215546Sopenharmony_ci          * tessellation. */
2394bf215546Sopenharmony_ci         max_vert_out_per_gs_instance = true;
2395bf215546Sopenharmony_ci         max_gsprims_base = 1;
2396bf215546Sopenharmony_ci         max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out;
2397bf215546Sopenharmony_ci      }
2398bf215546Sopenharmony_ci
2399bf215546Sopenharmony_ci      esvert_lds_size = es_sel->info.esgs_itemsize / 4;
2400bf215546Sopenharmony_ci      gsprim_lds_size = (gs_sel->info.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
2401bf215546Sopenharmony_ci
2402bf215546Sopenharmony_ci      if (gsprim_lds_size > target_lds_size && !force_multi_cycling) {
2403bf215546Sopenharmony_ci         if (gs_sel->tess_turns_off_ngg || es_sel->stage != MESA_SHADER_TESS_EVAL) {
2404bf215546Sopenharmony_ci            force_multi_cycling = true;
2405bf215546Sopenharmony_ci            goto retry_select_mode;
2406bf215546Sopenharmony_ci         }
2407bf215546Sopenharmony_ci      }
2408bf215546Sopenharmony_ci   } else {
2409bf215546Sopenharmony_ci      /* VS and TES. */
2410bf215546Sopenharmony_ci      /* LDS size for passing data from ES to GS. */
2411bf215546Sopenharmony_ci      esvert_lds_size = ngg_nogs_vertex_size(shader);
2412bf215546Sopenharmony_ci   }
2413bf215546Sopenharmony_ci
2414bf215546Sopenharmony_ci   unsigned max_gsprims = max_gsprims_base;
2415bf215546Sopenharmony_ci   unsigned max_esverts = max_esverts_base;
2416bf215546Sopenharmony_ci
2417bf215546Sopenharmony_ci   if (esvert_lds_size)
2418bf215546Sopenharmony_ci      max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
2419bf215546Sopenharmony_ci   if (gsprim_lds_size)
2420bf215546Sopenharmony_ci      max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
2421bf215546Sopenharmony_ci
2422bf215546Sopenharmony_ci   max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2423bf215546Sopenharmony_ci   clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
2424bf215546Sopenharmony_ci   assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2425bf215546Sopenharmony_ci
2426bf215546Sopenharmony_ci   if (esvert_lds_size || gsprim_lds_size) {
2427bf215546Sopenharmony_ci      /* Now that we have a rough proportionality between esverts
2428bf215546Sopenharmony_ci       * and gsprims based on the primitive type, scale both of them
2429bf215546Sopenharmony_ci       * down simultaneously based on required LDS space.
2430bf215546Sopenharmony_ci       *
2431bf215546Sopenharmony_ci       * We could be smarter about this if we knew how much vertex
2432bf215546Sopenharmony_ci       * reuse to expect.
2433bf215546Sopenharmony_ci       */
2434bf215546Sopenharmony_ci      unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
2435bf215546Sopenharmony_ci      if (lds_total > target_lds_size) {
2436bf215546Sopenharmony_ci         max_esverts = max_esverts * target_lds_size / lds_total;
2437bf215546Sopenharmony_ci         max_gsprims = max_gsprims * target_lds_size / lds_total;
2438bf215546Sopenharmony_ci
2439bf215546Sopenharmony_ci         max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2440bf215546Sopenharmony_ci         clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
2441bf215546Sopenharmony_ci         assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2442bf215546Sopenharmony_ci      }
2443bf215546Sopenharmony_ci   }
2444bf215546Sopenharmony_ci
2445bf215546Sopenharmony_ci   /* Round up towards full wave sizes for better ALU utilization. */
2446bf215546Sopenharmony_ci   if (!max_vert_out_per_gs_instance) {
2447bf215546Sopenharmony_ci      unsigned orig_max_esverts;
2448bf215546Sopenharmony_ci      unsigned orig_max_gsprims;
2449bf215546Sopenharmony_ci      do {
2450bf215546Sopenharmony_ci         orig_max_esverts = max_esverts;
2451bf215546Sopenharmony_ci         orig_max_gsprims = max_gsprims;
2452bf215546Sopenharmony_ci
2453bf215546Sopenharmony_ci         max_esverts = align(max_esverts, shader->wave_size);
2454bf215546Sopenharmony_ci         max_esverts = MIN2(max_esverts, max_esverts_base);
2455bf215546Sopenharmony_ci         if (esvert_lds_size)
2456bf215546Sopenharmony_ci            max_esverts =
2457bf215546Sopenharmony_ci               MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
2458bf215546Sopenharmony_ci         max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2459bf215546Sopenharmony_ci
2460bf215546Sopenharmony_ci         /* Hardware restriction: minimum value of max_esverts */
2461bf215546Sopenharmony_ci         max_esverts = MAX2(max_esverts, min_esverts);
2462bf215546Sopenharmony_ci
2463bf215546Sopenharmony_ci         max_gsprims = align(max_gsprims, shader->wave_size);
2464bf215546Sopenharmony_ci         max_gsprims = MIN2(max_gsprims, max_gsprims_base);
2465bf215546Sopenharmony_ci         if (gsprim_lds_size) {
2466bf215546Sopenharmony_ci            /* Don't count unusable vertices to the LDS size. Those are vertices above
2467bf215546Sopenharmony_ci             * the maximum number of vertices that can occur in the workgroup,
2468bf215546Sopenharmony_ci             * which is e.g. max_gsprims * 3 for triangles.
2469bf215546Sopenharmony_ci             */
2470bf215546Sopenharmony_ci            unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2471bf215546Sopenharmony_ci            max_gsprims =
2472bf215546Sopenharmony_ci               MIN2(max_gsprims, (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
2473bf215546Sopenharmony_ci         }
2474bf215546Sopenharmony_ci         clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
2475bf215546Sopenharmony_ci         assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2476bf215546Sopenharmony_ci      } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
2477bf215546Sopenharmony_ci
2478bf215546Sopenharmony_ci      /* Verify the restriction. */
2479bf215546Sopenharmony_ci      assert(max_esverts >= min_esverts);
2480bf215546Sopenharmony_ci   } else {
2481bf215546Sopenharmony_ci      max_esverts = MAX2(max_esverts, min_esverts);
2482bf215546Sopenharmony_ci   }
2483bf215546Sopenharmony_ci
2484bf215546Sopenharmony_ci   unsigned max_out_vertices =
2485bf215546Sopenharmony_ci      max_vert_out_per_gs_instance
2486bf215546Sopenharmony_ci         ? gs_sel->info.base.gs.vertices_out
2487bf215546Sopenharmony_ci         : gs_stage == MESA_SHADER_GEOMETRY
2488bf215546Sopenharmony_ci              ? max_gsprims * gs_num_invocations * gs_sel->info.base.gs.vertices_out
2489bf215546Sopenharmony_ci              : max_esverts;
2490bf215546Sopenharmony_ci   assert(max_out_vertices <= 256);
2491bf215546Sopenharmony_ci
2492bf215546Sopenharmony_ci   unsigned prim_amp_factor = 1;
2493bf215546Sopenharmony_ci   if (gs_stage == MESA_SHADER_GEOMETRY) {
2494bf215546Sopenharmony_ci      /* Number of output primitives per GS input primitive after
2495bf215546Sopenharmony_ci       * GS instancing. */
2496bf215546Sopenharmony_ci      prim_amp_factor = gs_sel->info.base.gs.vertices_out;
2497bf215546Sopenharmony_ci   }
2498bf215546Sopenharmony_ci
2499bf215546Sopenharmony_ci   shader->ngg.hw_max_esverts = max_esverts;
2500bf215546Sopenharmony_ci   shader->ngg.max_gsprims = max_gsprims;
2501bf215546Sopenharmony_ci   shader->ngg.max_out_verts = max_out_vertices;
2502bf215546Sopenharmony_ci   shader->ngg.prim_amp_factor = prim_amp_factor;
2503bf215546Sopenharmony_ci   shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
2504bf215546Sopenharmony_ci
2505bf215546Sopenharmony_ci   /* Don't count unusable vertices. */
2506bf215546Sopenharmony_ci   shader->gs_info.esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) *
2507bf215546Sopenharmony_ci                                    esvert_lds_size;
2508bf215546Sopenharmony_ci   shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
2509bf215546Sopenharmony_ci
2510bf215546Sopenharmony_ci   assert(shader->ngg.hw_max_esverts >= min_esverts); /* HW limitation */
2511bf215546Sopenharmony_ci
2512bf215546Sopenharmony_ci   /* If asserts are disabled, we use the same conditions to return false */
2513bf215546Sopenharmony_ci   return max_esverts >= max_verts_per_prim && max_gsprims >= 1 &&
2514bf215546Sopenharmony_ci          max_out_vertices <= 256 &&
2515bf215546Sopenharmony_ci          shader->ngg.hw_max_esverts >= min_esverts;
2516bf215546Sopenharmony_ci}
2517