1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright 2013 Advanced Micro Devices, Inc.
3bf215546Sopenharmony_ci * All Rights Reserved.
4bf215546Sopenharmony_ci *
5bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
6bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
7bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
8bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
10bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
11bf215546Sopenharmony_ci *
12bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
13bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
14bf215546Sopenharmony_ci * Software.
15bf215546Sopenharmony_ci *
16bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21bf215546Sopenharmony_ci * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22bf215546Sopenharmony_ci * SOFTWARE.
23bf215546Sopenharmony_ci */
24bf215546Sopenharmony_ci
25bf215546Sopenharmony_ci#include "si_build_pm4.h"
26bf215546Sopenharmony_ci#include "util/u_memory.h"
27bf215546Sopenharmony_ci#include "util/u_suballoc.h"
28bf215546Sopenharmony_ci
29bf215546Sopenharmony_cistatic void si_set_streamout_enable(struct si_context *sctx, bool enable);
30bf215546Sopenharmony_ci
/* Typed wrapper: re-points a driver-side streamout target slot at "src" by
 * forwarding to pipe_so_target_reference().
 *
 * NOTE(review): the pointer cast presumably relies on the pipe base struct
 * being the first member of si_streamout_target - confirm in si_pipe.h.
 */
static inline void si_so_target_reference(struct si_streamout_target **dst,
                                          struct pipe_stream_output_target *src)
{
   struct pipe_stream_output_target **slot = (struct pipe_stream_output_target **)dst;

   pipe_so_target_reference(slot, src);
}
36bf215546Sopenharmony_ci
37bf215546Sopenharmony_cistatic struct pipe_stream_output_target *si_create_so_target(struct pipe_context *ctx,
38bf215546Sopenharmony_ci                                                             struct pipe_resource *buffer,
39bf215546Sopenharmony_ci                                                             unsigned buffer_offset,
40bf215546Sopenharmony_ci                                                             unsigned buffer_size)
41bf215546Sopenharmony_ci{
42bf215546Sopenharmony_ci   struct si_streamout_target *t;
43bf215546Sopenharmony_ci   struct si_resource *buf = si_resource(buffer);
44bf215546Sopenharmony_ci
45bf215546Sopenharmony_ci   t = CALLOC_STRUCT(si_streamout_target);
46bf215546Sopenharmony_ci   if (!t) {
47bf215546Sopenharmony_ci      return NULL;
48bf215546Sopenharmony_ci   }
49bf215546Sopenharmony_ci
50bf215546Sopenharmony_ci   t->b.reference.count = 1;
51bf215546Sopenharmony_ci   t->b.context = ctx;
52bf215546Sopenharmony_ci   pipe_resource_reference(&t->b.buffer, buffer);
53bf215546Sopenharmony_ci   t->b.buffer_offset = buffer_offset;
54bf215546Sopenharmony_ci   t->b.buffer_size = buffer_size;
55bf215546Sopenharmony_ci
56bf215546Sopenharmony_ci   util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset, buffer_offset + buffer_size);
57bf215546Sopenharmony_ci   return &t->b;
58bf215546Sopenharmony_ci}
59bf215546Sopenharmony_ci
60bf215546Sopenharmony_cistatic void si_so_target_destroy(struct pipe_context *ctx, struct pipe_stream_output_target *target)
61bf215546Sopenharmony_ci{
62bf215546Sopenharmony_ci   struct si_streamout_target *t = (struct si_streamout_target *)target;
63bf215546Sopenharmony_ci   pipe_resource_reference(&t->b.buffer, NULL);
64bf215546Sopenharmony_ci   si_resource_reference(&t->buf_filled_size, NULL);
65bf215546Sopenharmony_ci   FREE(t);
66bf215546Sopenharmony_ci}
67bf215546Sopenharmony_ci
68bf215546Sopenharmony_civoid si_streamout_buffers_dirty(struct si_context *sctx)
69bf215546Sopenharmony_ci{
70bf215546Sopenharmony_ci   if (!sctx->streamout.enabled_mask)
71bf215546Sopenharmony_ci      return;
72bf215546Sopenharmony_ci
73bf215546Sopenharmony_ci   si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
74bf215546Sopenharmony_ci   si_set_streamout_enable(sctx, true);
75bf215546Sopenharmony_ci}
76bf215546Sopenharmony_ci
77bf215546Sopenharmony_cistatic void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targets,
78bf215546Sopenharmony_ci                                     struct pipe_stream_output_target **targets,
79bf215546Sopenharmony_ci                                     const unsigned *offsets)
80bf215546Sopenharmony_ci{
81bf215546Sopenharmony_ci   struct si_context *sctx = (struct si_context *)ctx;
82bf215546Sopenharmony_ci   unsigned old_num_targets = sctx->streamout.num_targets;
83bf215546Sopenharmony_ci   unsigned i;
84bf215546Sopenharmony_ci   bool wait_now = false;
85bf215546Sopenharmony_ci
86bf215546Sopenharmony_ci   /* We are going to unbind the buffers. Mark which caches need to be flushed. */
87bf215546Sopenharmony_ci   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
88bf215546Sopenharmony_ci      /* Since streamout uses vector writes which go through TC L2
89bf215546Sopenharmony_ci       * and most other clients can use TC L2 as well, we don't need
90bf215546Sopenharmony_ci       * to flush it.
91bf215546Sopenharmony_ci       *
92bf215546Sopenharmony_ci       * The only cases which requires flushing it is VGT DMA index
93bf215546Sopenharmony_ci       * fetching (on <= GFX7) and indirect draw data, which are rare
94bf215546Sopenharmony_ci       * cases. Thus, flag the TC L2 dirtiness in the resource and
95bf215546Sopenharmony_ci       * handle it at draw call time.
96bf215546Sopenharmony_ci       */
97bf215546Sopenharmony_ci      for (i = 0; i < sctx->streamout.num_targets; i++)
98bf215546Sopenharmony_ci         if (sctx->streamout.targets[i])
99bf215546Sopenharmony_ci            si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true;
100bf215546Sopenharmony_ci
101bf215546Sopenharmony_ci      /* Invalidate the scalar cache in case a streamout buffer is
102bf215546Sopenharmony_ci       * going to be used as a constant buffer.
103bf215546Sopenharmony_ci       *
104bf215546Sopenharmony_ci       * Invalidate vL1, because streamout bypasses it (done by
105bf215546Sopenharmony_ci       * setting GLC=1 in the store instruction), but vL1 in other
106bf215546Sopenharmony_ci       * CUs can contain outdated data of streamout buffers.
107bf215546Sopenharmony_ci       *
108bf215546Sopenharmony_ci       * VS_PARTIAL_FLUSH is required if the buffers are going to be
109bf215546Sopenharmony_ci       * used as an input immediately.
110bf215546Sopenharmony_ci       */
111bf215546Sopenharmony_ci      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
112bf215546Sopenharmony_ci
113bf215546Sopenharmony_ci      /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
114bf215546Sopenharmony_ci      if (sctx->screen->use_ngg_streamout) {
115bf215546Sopenharmony_ci         sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
116bf215546Sopenharmony_ci
117bf215546Sopenharmony_ci         /* Wait now. This is needed to make sure that GDS is not
118bf215546Sopenharmony_ci          * busy at the end of IBs.
119bf215546Sopenharmony_ci          *
120bf215546Sopenharmony_ci          * Also, the next streamout operation will overwrite GDS,
121bf215546Sopenharmony_ci          * so we need to make sure that it's idle.
122bf215546Sopenharmony_ci          */
123bf215546Sopenharmony_ci         wait_now = true;
124bf215546Sopenharmony_ci      } else {
125bf215546Sopenharmony_ci         sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
126bf215546Sopenharmony_ci      }
127bf215546Sopenharmony_ci   }
128bf215546Sopenharmony_ci
129bf215546Sopenharmony_ci   /* All readers of the streamout targets need to be finished before we can
130bf215546Sopenharmony_ci    * start writing to the targets.
131bf215546Sopenharmony_ci    */
132bf215546Sopenharmony_ci   if (num_targets) {
133bf215546Sopenharmony_ci      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
134bf215546Sopenharmony_ci                     SI_CONTEXT_PFP_SYNC_ME;
135bf215546Sopenharmony_ci   }
136bf215546Sopenharmony_ci
137bf215546Sopenharmony_ci   /* Streamout buffers must be bound in 2 places:
138bf215546Sopenharmony_ci    * 1) in VGT by setting the VGT_STRMOUT registers
139bf215546Sopenharmony_ci    * 2) as shader resources
140bf215546Sopenharmony_ci    */
141bf215546Sopenharmony_ci
142bf215546Sopenharmony_ci   /* Stop streamout. */
143bf215546Sopenharmony_ci   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
144bf215546Sopenharmony_ci      si_emit_streamout_end(sctx);
145bf215546Sopenharmony_ci
146bf215546Sopenharmony_ci   /* Set the new targets. */
147bf215546Sopenharmony_ci   unsigned enabled_mask = 0, append_bitmask = 0;
148bf215546Sopenharmony_ci   for (i = 0; i < num_targets; i++) {
149bf215546Sopenharmony_ci      si_so_target_reference(&sctx->streamout.targets[i], targets[i]);
150bf215546Sopenharmony_ci      if (!targets[i])
151bf215546Sopenharmony_ci         continue;
152bf215546Sopenharmony_ci
153bf215546Sopenharmony_ci      si_context_add_resource_size(sctx, targets[i]->buffer);
154bf215546Sopenharmony_ci      enabled_mask |= 1 << i;
155bf215546Sopenharmony_ci
156bf215546Sopenharmony_ci      if (offsets[i] == ((unsigned)-1))
157bf215546Sopenharmony_ci         append_bitmask |= 1 << i;
158bf215546Sopenharmony_ci
159bf215546Sopenharmony_ci      /* Allocate space for the filled buffer size. */
160bf215546Sopenharmony_ci      struct si_streamout_target *t = sctx->streamout.targets[i];
161bf215546Sopenharmony_ci      if (!t->buf_filled_size) {
162bf215546Sopenharmony_ci         unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4;
163bf215546Sopenharmony_ci         u_suballocator_alloc(&sctx->allocator_zeroed_memory, buf_filled_size_size, 4,
164bf215546Sopenharmony_ci                              &t->buf_filled_size_offset,
165bf215546Sopenharmony_ci                              (struct pipe_resource **)&t->buf_filled_size);
166bf215546Sopenharmony_ci      }
167bf215546Sopenharmony_ci   }
168bf215546Sopenharmony_ci
169bf215546Sopenharmony_ci   for (; i < sctx->streamout.num_targets; i++)
170bf215546Sopenharmony_ci      si_so_target_reference(&sctx->streamout.targets[i], NULL);
171bf215546Sopenharmony_ci
172bf215546Sopenharmony_ci   if (!!sctx->streamout.enabled_mask != !!enabled_mask) {
173bf215546Sopenharmony_ci      sctx->streamout.enabled_mask = enabled_mask;
174bf215546Sopenharmony_ci      sctx->do_update_shaders = true; /* to keep/remove streamout shader code as an optimization */
175bf215546Sopenharmony_ci   }
176bf215546Sopenharmony_ci
177bf215546Sopenharmony_ci   sctx->streamout.num_targets = num_targets;
178bf215546Sopenharmony_ci   sctx->streamout.append_bitmask = append_bitmask;
179bf215546Sopenharmony_ci
180bf215546Sopenharmony_ci   /* Update dirty state bits. */
181bf215546Sopenharmony_ci   if (num_targets) {
182bf215546Sopenharmony_ci      si_streamout_buffers_dirty(sctx);
183bf215546Sopenharmony_ci   } else {
184bf215546Sopenharmony_ci      si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
185bf215546Sopenharmony_ci      si_set_streamout_enable(sctx, false);
186bf215546Sopenharmony_ci   }
187bf215546Sopenharmony_ci
188bf215546Sopenharmony_ci   /* Set the shader resources.*/
189bf215546Sopenharmony_ci   for (i = 0; i < num_targets; i++) {
190bf215546Sopenharmony_ci      if (targets[i]) {
191bf215546Sopenharmony_ci         struct pipe_shader_buffer sbuf;
192bf215546Sopenharmony_ci         sbuf.buffer = targets[i]->buffer;
193bf215546Sopenharmony_ci
194bf215546Sopenharmony_ci         if (sctx->screen->use_ngg_streamout) {
195bf215546Sopenharmony_ci            sbuf.buffer_offset = targets[i]->buffer_offset;
196bf215546Sopenharmony_ci            sbuf.buffer_size = targets[i]->buffer_size;
197bf215546Sopenharmony_ci         } else {
198bf215546Sopenharmony_ci            sbuf.buffer_offset = 0;
199bf215546Sopenharmony_ci            sbuf.buffer_size = targets[i]->buffer_offset + targets[i]->buffer_size;
200bf215546Sopenharmony_ci         }
201bf215546Sopenharmony_ci
202bf215546Sopenharmony_ci         si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
203bf215546Sopenharmony_ci         si_resource(targets[i]->buffer)->bind_history |= SI_BIND_STREAMOUT_BUFFER;
204bf215546Sopenharmony_ci      } else {
205bf215546Sopenharmony_ci         si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
206bf215546Sopenharmony_ci      }
207bf215546Sopenharmony_ci   }
208bf215546Sopenharmony_ci   for (; i < old_num_targets; i++)
209bf215546Sopenharmony_ci      si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
210bf215546Sopenharmony_ci
211bf215546Sopenharmony_ci   if (wait_now)
212bf215546Sopenharmony_ci      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
213bf215546Sopenharmony_ci}
214bf215546Sopenharmony_ci
215bf215546Sopenharmony_cistatic void si_flush_vgt_streamout(struct si_context *sctx)
216bf215546Sopenharmony_ci{
217bf215546Sopenharmony_ci   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
218bf215546Sopenharmony_ci   unsigned reg_strmout_cntl;
219bf215546Sopenharmony_ci
220bf215546Sopenharmony_ci   radeon_begin(cs);
221bf215546Sopenharmony_ci
222bf215546Sopenharmony_ci   /* The register is at different places on different ASICs. */
223bf215546Sopenharmony_ci   if (sctx->gfx_level >= GFX9) {
224bf215546Sopenharmony_ci      reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
225bf215546Sopenharmony_ci      radeon_emit(PKT3(PKT3_WRITE_DATA, 3, 0));
226bf215546Sopenharmony_ci      radeon_emit(S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) | S_370_ENGINE_SEL(V_370_ME));
227bf215546Sopenharmony_ci      radeon_emit(R_0300FC_CP_STRMOUT_CNTL >> 2);
228bf215546Sopenharmony_ci      radeon_emit(0);
229bf215546Sopenharmony_ci      radeon_emit(0);
230bf215546Sopenharmony_ci   } else if (sctx->gfx_level >= GFX7) {
231bf215546Sopenharmony_ci      reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
232bf215546Sopenharmony_ci      radeon_set_uconfig_reg(reg_strmout_cntl, 0);
233bf215546Sopenharmony_ci   } else {
234bf215546Sopenharmony_ci      reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
235bf215546Sopenharmony_ci      radeon_set_config_reg(reg_strmout_cntl, 0);
236bf215546Sopenharmony_ci   }
237bf215546Sopenharmony_ci
238bf215546Sopenharmony_ci   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
239bf215546Sopenharmony_ci   radeon_emit(EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
240bf215546Sopenharmony_ci
241bf215546Sopenharmony_ci   radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
242bf215546Sopenharmony_ci   radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
243bf215546Sopenharmony_ci   radeon_emit(reg_strmout_cntl >> 2); /* register */
244bf215546Sopenharmony_ci   radeon_emit(0);
245bf215546Sopenharmony_ci   radeon_emit(S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
246bf215546Sopenharmony_ci   radeon_emit(S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
247bf215546Sopenharmony_ci   radeon_emit(4);                              /* poll interval */
248bf215546Sopenharmony_ci   radeon_end();
249bf215546Sopenharmony_ci}
250bf215546Sopenharmony_ci
251bf215546Sopenharmony_cistatic void si_emit_streamout_begin(struct si_context *sctx)
252bf215546Sopenharmony_ci{
253bf215546Sopenharmony_ci   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
254bf215546Sopenharmony_ci   struct si_streamout_target **t = sctx->streamout.targets;
255bf215546Sopenharmony_ci
256bf215546Sopenharmony_ci   if (!sctx->screen->use_ngg_streamout)
257bf215546Sopenharmony_ci      si_flush_vgt_streamout(sctx);
258bf215546Sopenharmony_ci
259bf215546Sopenharmony_ci   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
260bf215546Sopenharmony_ci      if (!t[i])
261bf215546Sopenharmony_ci         continue;
262bf215546Sopenharmony_ci
263bf215546Sopenharmony_ci      t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];
264bf215546Sopenharmony_ci
265bf215546Sopenharmony_ci      if (sctx->screen->use_ngg_streamout) {
266bf215546Sopenharmony_ci         bool append = sctx->streamout.append_bitmask & (1 << i);
267bf215546Sopenharmony_ci         uint64_t va = 0;
268bf215546Sopenharmony_ci
269bf215546Sopenharmony_ci         if (append) {
270bf215546Sopenharmony_ci            radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
271bf215546Sopenharmony_ci                                      RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE);
272bf215546Sopenharmony_ci
273bf215546Sopenharmony_ci            va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
274bf215546Sopenharmony_ci         }
275bf215546Sopenharmony_ci
276bf215546Sopenharmony_ci         radeon_begin(cs);
277bf215546Sopenharmony_ci         radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0));
278bf215546Sopenharmony_ci         radeon_emit(S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
279bf215546Sopenharmony_ci                     S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(1));
280bf215546Sopenharmony_ci         radeon_emit(va);
281bf215546Sopenharmony_ci         radeon_emit(va >> 32);
282bf215546Sopenharmony_ci         radeon_emit(4 * i); /* destination in GDS */
283bf215546Sopenharmony_ci         radeon_emit(0);
284bf215546Sopenharmony_ci         radeon_emit(S_415_BYTE_COUNT_GFX9(4));
285bf215546Sopenharmony_ci         radeon_end();
286bf215546Sopenharmony_ci      } else {
287bf215546Sopenharmony_ci         /* Legacy streamout.
288bf215546Sopenharmony_ci          *
289bf215546Sopenharmony_ci          * The hw binds streamout buffers as shader resources. VGT only counts primitives
290bf215546Sopenharmony_ci          * and tells the shader through SGPRs what to do.
291bf215546Sopenharmony_ci          */
292bf215546Sopenharmony_ci         radeon_begin(cs);
293bf215546Sopenharmony_ci         radeon_set_context_reg_seq(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
294bf215546Sopenharmony_ci         radeon_emit((t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
295bf215546Sopenharmony_ci         radeon_emit(sctx->streamout.stride_in_dw[i]);                                    /* VTX_STRIDE (in DW) */
296bf215546Sopenharmony_ci
297bf215546Sopenharmony_ci         if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
298bf215546Sopenharmony_ci            uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
299bf215546Sopenharmony_ci
300bf215546Sopenharmony_ci            /* Append. */
301bf215546Sopenharmony_ci            radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
302bf215546Sopenharmony_ci            radeon_emit(STRMOUT_SELECT_BUFFER(i) |
303bf215546Sopenharmony_ci                        STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
304bf215546Sopenharmony_ci            radeon_emit(0);                                              /* unused */
305bf215546Sopenharmony_ci            radeon_emit(0);                                              /* unused */
306bf215546Sopenharmony_ci            radeon_emit(va);                                             /* src address lo */
307bf215546Sopenharmony_ci            radeon_emit(va >> 32);                                       /* src address hi */
308bf215546Sopenharmony_ci
309bf215546Sopenharmony_ci            radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
310bf215546Sopenharmony_ci                                      RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE);
311bf215546Sopenharmony_ci         } else {
312bf215546Sopenharmony_ci            /* Start from the beginning. */
313bf215546Sopenharmony_ci            radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
314bf215546Sopenharmony_ci            radeon_emit(STRMOUT_SELECT_BUFFER(i) |
315bf215546Sopenharmony_ci                        STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
316bf215546Sopenharmony_ci            radeon_emit(0);                                                 /* unused */
317bf215546Sopenharmony_ci            radeon_emit(0);                                                 /* unused */
318bf215546Sopenharmony_ci            radeon_emit(t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
319bf215546Sopenharmony_ci            radeon_emit(0);                          /* unused */
320bf215546Sopenharmony_ci         }
321bf215546Sopenharmony_ci         radeon_end_update_context_roll(sctx);
322bf215546Sopenharmony_ci      }
323bf215546Sopenharmony_ci   }
324bf215546Sopenharmony_ci
325bf215546Sopenharmony_ci   sctx->streamout.begin_emitted = true;
326bf215546Sopenharmony_ci}
327bf215546Sopenharmony_ci
328bf215546Sopenharmony_civoid si_emit_streamout_end(struct si_context *sctx)
329bf215546Sopenharmony_ci{
330bf215546Sopenharmony_ci   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
331bf215546Sopenharmony_ci   struct si_streamout_target **t = sctx->streamout.targets;
332bf215546Sopenharmony_ci
333bf215546Sopenharmony_ci   if (!sctx->screen->use_ngg_streamout)
334bf215546Sopenharmony_ci      si_flush_vgt_streamout(sctx);
335bf215546Sopenharmony_ci
336bf215546Sopenharmony_ci   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
337bf215546Sopenharmony_ci      if (!t[i])
338bf215546Sopenharmony_ci         continue;
339bf215546Sopenharmony_ci
340bf215546Sopenharmony_ci      uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
341bf215546Sopenharmony_ci
342bf215546Sopenharmony_ci      if (sctx->screen->use_ngg_streamout) {
343bf215546Sopenharmony_ci         /* TODO: PS_DONE doesn't ensure completion of VS if there are no PS waves. */
344bf215546Sopenharmony_ci         si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2,
345bf215546Sopenharmony_ci                           EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS,
346bf215546Sopenharmony_ci                           t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0);
347bf215546Sopenharmony_ci      } else {
348bf215546Sopenharmony_ci         radeon_begin(cs);
349bf215546Sopenharmony_ci         radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
350bf215546Sopenharmony_ci         radeon_emit(STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
351bf215546Sopenharmony_ci                     STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
352bf215546Sopenharmony_ci         radeon_emit(va);                                  /* dst address lo */
353bf215546Sopenharmony_ci         radeon_emit(va >> 32);                            /* dst address hi */
354bf215546Sopenharmony_ci         radeon_emit(0);                                   /* unused */
355bf215546Sopenharmony_ci         radeon_emit(0);                                   /* unused */
356bf215546Sopenharmony_ci
357bf215546Sopenharmony_ci         /* Zero the buffer size. The counters (primitives generated,
358bf215546Sopenharmony_ci          * primitives emitted) may be enabled even if there is not
359bf215546Sopenharmony_ci          * buffer bound. This ensures that the primitives-emitted query
360bf215546Sopenharmony_ci          * won't increment. */
361bf215546Sopenharmony_ci         radeon_set_context_reg(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
362bf215546Sopenharmony_ci         radeon_end_update_context_roll(sctx);
363bf215546Sopenharmony_ci
364bf215546Sopenharmony_ci         radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
365bf215546Sopenharmony_ci                                   RADEON_USAGE_WRITE | RADEON_PRIO_SO_FILLED_SIZE);
366bf215546Sopenharmony_ci      }
367bf215546Sopenharmony_ci
368bf215546Sopenharmony_ci      t[i]->buf_filled_size_valid = true;
369bf215546Sopenharmony_ci   }
370bf215546Sopenharmony_ci
371bf215546Sopenharmony_ci   sctx->streamout.begin_emitted = false;
372bf215546Sopenharmony_ci}
373bf215546Sopenharmony_ci
374bf215546Sopenharmony_ci/* STREAMOUT CONFIG DERIVED STATE
375bf215546Sopenharmony_ci *
376bf215546Sopenharmony_ci * Streamout must be enabled for the PRIMITIVES_GENERATED query to work.
377bf215546Sopenharmony_ci * The buffer mask is an independent state, so no writes occur if there
378bf215546Sopenharmony_ci * are no buffers bound.
379bf215546Sopenharmony_ci */
380bf215546Sopenharmony_ci
381bf215546Sopenharmony_cistatic void si_emit_streamout_enable(struct si_context *sctx)
382bf215546Sopenharmony_ci{
383bf215546Sopenharmony_ci   assert(!sctx->screen->use_ngg_streamout);
384bf215546Sopenharmony_ci
385bf215546Sopenharmony_ci   radeon_begin(&sctx->gfx_cs);
386bf215546Sopenharmony_ci   radeon_set_context_reg_seq(R_028B94_VGT_STRMOUT_CONFIG, 2);
387bf215546Sopenharmony_ci   radeon_emit(S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
388bf215546Sopenharmony_ci               S_028B94_RAST_STREAM(0) |
389bf215546Sopenharmony_ci               S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
390bf215546Sopenharmony_ci               S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
391bf215546Sopenharmony_ci               S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
392bf215546Sopenharmony_ci   radeon_emit(sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask);
393bf215546Sopenharmony_ci   radeon_end();
394bf215546Sopenharmony_ci}
395bf215546Sopenharmony_ci
396bf215546Sopenharmony_cistatic void si_set_streamout_enable(struct si_context *sctx, bool enable)
397bf215546Sopenharmony_ci{
398bf215546Sopenharmony_ci   bool old_strmout_en = si_get_strmout_en(sctx);
399bf215546Sopenharmony_ci   unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;
400bf215546Sopenharmony_ci
401bf215546Sopenharmony_ci   sctx->streamout.streamout_enabled = enable;
402bf215546Sopenharmony_ci
403bf215546Sopenharmony_ci   sctx->streamout.hw_enabled_mask =
404bf215546Sopenharmony_ci      sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) |
405bf215546Sopenharmony_ci      (sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12);
406bf215546Sopenharmony_ci
407bf215546Sopenharmony_ci   if (!sctx->screen->use_ngg_streamout &&
408bf215546Sopenharmony_ci       ((old_strmout_en != si_get_strmout_en(sctx)) ||
409bf215546Sopenharmony_ci        (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)))
410bf215546Sopenharmony_ci      si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
411bf215546Sopenharmony_ci}
412bf215546Sopenharmony_ci
413bf215546Sopenharmony_civoid si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff)
414bf215546Sopenharmony_ci{
415bf215546Sopenharmony_ci   if (!sctx->screen->use_ngg_streamout && type == PIPE_QUERY_PRIMITIVES_GENERATED) {
416bf215546Sopenharmony_ci      bool old_strmout_en = si_get_strmout_en(sctx);
417bf215546Sopenharmony_ci
418bf215546Sopenharmony_ci      sctx->streamout.num_prims_gen_queries += diff;
419bf215546Sopenharmony_ci      assert(sctx->streamout.num_prims_gen_queries >= 0);
420bf215546Sopenharmony_ci
421bf215546Sopenharmony_ci      sctx->streamout.prims_gen_query_enabled = sctx->streamout.num_prims_gen_queries != 0;
422bf215546Sopenharmony_ci
423bf215546Sopenharmony_ci      if (old_strmout_en != si_get_strmout_en(sctx))
424bf215546Sopenharmony_ci         si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
425bf215546Sopenharmony_ci
426bf215546Sopenharmony_ci      if (si_update_ngg(sctx)) {
427bf215546Sopenharmony_ci         si_shader_change_notify(sctx);
428bf215546Sopenharmony_ci         sctx->do_update_shaders = true;
429bf215546Sopenharmony_ci      }
430bf215546Sopenharmony_ci   }
431bf215546Sopenharmony_ci}
432bf215546Sopenharmony_ci
433bf215546Sopenharmony_civoid si_init_streamout_functions(struct si_context *sctx)
434bf215546Sopenharmony_ci{
435bf215546Sopenharmony_ci   sctx->b.create_stream_output_target = si_create_so_target;
436bf215546Sopenharmony_ci   sctx->b.stream_output_target_destroy = si_so_target_destroy;
437bf215546Sopenharmony_ci   sctx->b.set_stream_output_targets = si_set_streamout_targets;
438bf215546Sopenharmony_ci   sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
439bf215546Sopenharmony_ci
440bf215546Sopenharmony_ci   if (!sctx->screen->use_ngg_streamout)
441bf215546Sopenharmony_ci      sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
442bf215546Sopenharmony_ci}
443