1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright 2013 Advanced Micro Devices, Inc. 3bf215546Sopenharmony_ci * All Rights Reserved. 4bf215546Sopenharmony_ci * 5bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 6bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 7bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 8bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 10bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 11bf215546Sopenharmony_ci * 12bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 13bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 14bf215546Sopenharmony_ci * Software. 15bf215546Sopenharmony_ci * 16bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21bf215546Sopenharmony_ci * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22bf215546Sopenharmony_ci * SOFTWARE. 23bf215546Sopenharmony_ci */ 24bf215546Sopenharmony_ci 25bf215546Sopenharmony_ci#include "si_build_pm4.h" 26bf215546Sopenharmony_ci#include "util/u_memory.h" 27bf215546Sopenharmony_ci#include "util/u_suballoc.h" 28bf215546Sopenharmony_ci 29bf215546Sopenharmony_cistatic void si_set_streamout_enable(struct si_context *sctx, bool enable); 30bf215546Sopenharmony_ci 31bf215546Sopenharmony_cistatic inline void si_so_target_reference(struct si_streamout_target **dst, 32bf215546Sopenharmony_ci struct pipe_stream_output_target *src) 33bf215546Sopenharmony_ci{ 34bf215546Sopenharmony_ci pipe_so_target_reference((struct pipe_stream_output_target **)dst, src); 35bf215546Sopenharmony_ci} 36bf215546Sopenharmony_ci 37bf215546Sopenharmony_cistatic struct pipe_stream_output_target *si_create_so_target(struct pipe_context *ctx, 38bf215546Sopenharmony_ci struct pipe_resource *buffer, 39bf215546Sopenharmony_ci unsigned buffer_offset, 40bf215546Sopenharmony_ci unsigned buffer_size) 41bf215546Sopenharmony_ci{ 42bf215546Sopenharmony_ci struct si_streamout_target *t; 43bf215546Sopenharmony_ci struct si_resource *buf = si_resource(buffer); 44bf215546Sopenharmony_ci 45bf215546Sopenharmony_ci t = CALLOC_STRUCT(si_streamout_target); 46bf215546Sopenharmony_ci if (!t) { 47bf215546Sopenharmony_ci return NULL; 48bf215546Sopenharmony_ci } 49bf215546Sopenharmony_ci 50bf215546Sopenharmony_ci t->b.reference.count = 1; 51bf215546Sopenharmony_ci t->b.context = ctx; 52bf215546Sopenharmony_ci pipe_resource_reference(&t->b.buffer, buffer); 53bf215546Sopenharmony_ci t->b.buffer_offset = buffer_offset; 54bf215546Sopenharmony_ci t->b.buffer_size = buffer_size; 55bf215546Sopenharmony_ci 56bf215546Sopenharmony_ci util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset, buffer_offset + buffer_size); 57bf215546Sopenharmony_ci return &t->b; 58bf215546Sopenharmony_ci} 59bf215546Sopenharmony_ci 60bf215546Sopenharmony_cistatic void si_so_target_destroy(struct pipe_context *ctx, struct pipe_stream_output_target *target) 61bf215546Sopenharmony_ci{ 62bf215546Sopenharmony_ci struct si_streamout_target *t = (struct si_streamout_target *)target; 63bf215546Sopenharmony_ci pipe_resource_reference(&t->b.buffer, NULL); 64bf215546Sopenharmony_ci si_resource_reference(&t->buf_filled_size, NULL); 65bf215546Sopenharmony_ci FREE(t); 66bf215546Sopenharmony_ci} 67bf215546Sopenharmony_ci 68bf215546Sopenharmony_civoid si_streamout_buffers_dirty(struct si_context *sctx) 69bf215546Sopenharmony_ci{ 70bf215546Sopenharmony_ci if (!sctx->streamout.enabled_mask) 71bf215546Sopenharmony_ci return; 72bf215546Sopenharmony_ci 73bf215546Sopenharmony_ci si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin); 74bf215546Sopenharmony_ci si_set_streamout_enable(sctx, true); 75bf215546Sopenharmony_ci} 76bf215546Sopenharmony_ci 77bf215546Sopenharmony_cistatic void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targets, 78bf215546Sopenharmony_ci struct pipe_stream_output_target **targets, 79bf215546Sopenharmony_ci const unsigned *offsets) 80bf215546Sopenharmony_ci{ 81bf215546Sopenharmony_ci struct si_context *sctx = (struct si_context *)ctx; 82bf215546Sopenharmony_ci unsigned old_num_targets = sctx->streamout.num_targets; 83bf215546Sopenharmony_ci unsigned i; 84bf215546Sopenharmony_ci bool wait_now = false; 85bf215546Sopenharmony_ci 86bf215546Sopenharmony_ci /* We are going to unbind the buffers. Mark which caches need to be flushed. */ 87bf215546Sopenharmony_ci if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) { 88bf215546Sopenharmony_ci /* Since streamout uses vector writes which go through TC L2 89bf215546Sopenharmony_ci * and most other clients can use TC L2 as well, we don't need 90bf215546Sopenharmony_ci * to flush it. 91bf215546Sopenharmony_ci * 92bf215546Sopenharmony_ci * The only cases which requires flushing it is VGT DMA index 93bf215546Sopenharmony_ci * fetching (on <= GFX7) and indirect draw data, which are rare 94bf215546Sopenharmony_ci * cases. Thus, flag the TC L2 dirtiness in the resource and 95bf215546Sopenharmony_ci * handle it at draw call time. 96bf215546Sopenharmony_ci */ 97bf215546Sopenharmony_ci for (i = 0; i < sctx->streamout.num_targets; i++) 98bf215546Sopenharmony_ci if (sctx->streamout.targets[i]) 99bf215546Sopenharmony_ci si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true; 100bf215546Sopenharmony_ci 101bf215546Sopenharmony_ci /* Invalidate the scalar cache in case a streamout buffer is 102bf215546Sopenharmony_ci * going to be used as a constant buffer. 103bf215546Sopenharmony_ci * 104bf215546Sopenharmony_ci * Invalidate vL1, because streamout bypasses it (done by 105bf215546Sopenharmony_ci * setting GLC=1 in the store instruction), but vL1 in other 106bf215546Sopenharmony_ci * CUs can contain outdated data of streamout buffers. 107bf215546Sopenharmony_ci * 108bf215546Sopenharmony_ci * VS_PARTIAL_FLUSH is required if the buffers are going to be 109bf215546Sopenharmony_ci * used as an input immediately. 110bf215546Sopenharmony_ci */ 111bf215546Sopenharmony_ci sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE; 112bf215546Sopenharmony_ci 113bf215546Sopenharmony_ci /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */ 114bf215546Sopenharmony_ci if (sctx->screen->use_ngg_streamout) { 115bf215546Sopenharmony_ci sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME; 116bf215546Sopenharmony_ci 117bf215546Sopenharmony_ci /* Wait now. This is needed to make sure that GDS is not 118bf215546Sopenharmony_ci * busy at the end of IBs. 119bf215546Sopenharmony_ci * 120bf215546Sopenharmony_ci * Also, the next streamout operation will overwrite GDS, 121bf215546Sopenharmony_ci * so we need to make sure that it's idle. 122bf215546Sopenharmony_ci */ 123bf215546Sopenharmony_ci wait_now = true; 124bf215546Sopenharmony_ci } else { 125bf215546Sopenharmony_ci sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME; 126bf215546Sopenharmony_ci } 127bf215546Sopenharmony_ci } 128bf215546Sopenharmony_ci 129bf215546Sopenharmony_ci /* All readers of the streamout targets need to be finished before we can 130bf215546Sopenharmony_ci * start writing to the targets. 131bf215546Sopenharmony_ci */ 132bf215546Sopenharmony_ci if (num_targets) { 133bf215546Sopenharmony_ci sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | 134bf215546Sopenharmony_ci SI_CONTEXT_PFP_SYNC_ME; 135bf215546Sopenharmony_ci } 136bf215546Sopenharmony_ci 137bf215546Sopenharmony_ci /* Streamout buffers must be bound in 2 places: 138bf215546Sopenharmony_ci * 1) in VGT by setting the VGT_STRMOUT registers 139bf215546Sopenharmony_ci * 2) as shader resources 140bf215546Sopenharmony_ci */ 141bf215546Sopenharmony_ci 142bf215546Sopenharmony_ci /* Stop streamout. */ 143bf215546Sopenharmony_ci if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) 144bf215546Sopenharmony_ci si_emit_streamout_end(sctx); 145bf215546Sopenharmony_ci 146bf215546Sopenharmony_ci /* Set the new targets. */ 147bf215546Sopenharmony_ci unsigned enabled_mask = 0, append_bitmask = 0; 148bf215546Sopenharmony_ci for (i = 0; i < num_targets; i++) { 149bf215546Sopenharmony_ci si_so_target_reference(&sctx->streamout.targets[i], targets[i]); 150bf215546Sopenharmony_ci if (!targets[i]) 151bf215546Sopenharmony_ci continue; 152bf215546Sopenharmony_ci 153bf215546Sopenharmony_ci si_context_add_resource_size(sctx, targets[i]->buffer); 154bf215546Sopenharmony_ci enabled_mask |= 1 << i; 155bf215546Sopenharmony_ci 156bf215546Sopenharmony_ci if (offsets[i] == ((unsigned)-1)) 157bf215546Sopenharmony_ci append_bitmask |= 1 << i; 158bf215546Sopenharmony_ci 159bf215546Sopenharmony_ci /* Allocate space for the filled buffer size. */ 160bf215546Sopenharmony_ci struct si_streamout_target *t = sctx->streamout.targets[i]; 161bf215546Sopenharmony_ci if (!t->buf_filled_size) { 162bf215546Sopenharmony_ci unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4; 163bf215546Sopenharmony_ci u_suballocator_alloc(&sctx->allocator_zeroed_memory, buf_filled_size_size, 4, 164bf215546Sopenharmony_ci &t->buf_filled_size_offset, 165bf215546Sopenharmony_ci (struct pipe_resource **)&t->buf_filled_size); 166bf215546Sopenharmony_ci } 167bf215546Sopenharmony_ci } 168bf215546Sopenharmony_ci 169bf215546Sopenharmony_ci for (; i < sctx->streamout.num_targets; i++) 170bf215546Sopenharmony_ci si_so_target_reference(&sctx->streamout.targets[i], NULL); 171bf215546Sopenharmony_ci 172bf215546Sopenharmony_ci if (!!sctx->streamout.enabled_mask != !!enabled_mask) { 173bf215546Sopenharmony_ci sctx->streamout.enabled_mask = enabled_mask; 174bf215546Sopenharmony_ci sctx->do_update_shaders = true; /* to keep/remove streamout shader code as an optimization */ 175bf215546Sopenharmony_ci } 176bf215546Sopenharmony_ci 177bf215546Sopenharmony_ci sctx->streamout.num_targets = num_targets; 178bf215546Sopenharmony_ci sctx->streamout.append_bitmask = append_bitmask; 179bf215546Sopenharmony_ci 180bf215546Sopenharmony_ci /* Update dirty state bits. */ 181bf215546Sopenharmony_ci if (num_targets) { 182bf215546Sopenharmony_ci si_streamout_buffers_dirty(sctx); 183bf215546Sopenharmony_ci } else { 184bf215546Sopenharmony_ci si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false); 185bf215546Sopenharmony_ci si_set_streamout_enable(sctx, false); 186bf215546Sopenharmony_ci } 187bf215546Sopenharmony_ci 188bf215546Sopenharmony_ci /* Set the shader resources.*/ 189bf215546Sopenharmony_ci for (i = 0; i < num_targets; i++) { 190bf215546Sopenharmony_ci if (targets[i]) { 191bf215546Sopenharmony_ci struct pipe_shader_buffer sbuf; 192bf215546Sopenharmony_ci sbuf.buffer = targets[i]->buffer; 193bf215546Sopenharmony_ci 194bf215546Sopenharmony_ci if (sctx->screen->use_ngg_streamout) { 195bf215546Sopenharmony_ci sbuf.buffer_offset = targets[i]->buffer_offset; 196bf215546Sopenharmony_ci sbuf.buffer_size = targets[i]->buffer_size; 197bf215546Sopenharmony_ci } else { 198bf215546Sopenharmony_ci sbuf.buffer_offset = 0; 199bf215546Sopenharmony_ci sbuf.buffer_size = targets[i]->buffer_offset + targets[i]->buffer_size; 200bf215546Sopenharmony_ci } 201bf215546Sopenharmony_ci 202bf215546Sopenharmony_ci si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf); 203bf215546Sopenharmony_ci si_resource(targets[i]->buffer)->bind_history |= SI_BIND_STREAMOUT_BUFFER; 204bf215546Sopenharmony_ci } else { 205bf215546Sopenharmony_ci si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL); 206bf215546Sopenharmony_ci } 207bf215546Sopenharmony_ci } 208bf215546Sopenharmony_ci for (; i < old_num_targets; i++) 209bf215546Sopenharmony_ci si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL); 210bf215546Sopenharmony_ci 211bf215546Sopenharmony_ci if (wait_now) 212bf215546Sopenharmony_ci sctx->emit_cache_flush(sctx, &sctx->gfx_cs); 213bf215546Sopenharmony_ci} 214bf215546Sopenharmony_ci 215bf215546Sopenharmony_cistatic void si_flush_vgt_streamout(struct si_context *sctx) 216bf215546Sopenharmony_ci{ 217bf215546Sopenharmony_ci struct radeon_cmdbuf *cs = &sctx->gfx_cs; 218bf215546Sopenharmony_ci unsigned reg_strmout_cntl; 219bf215546Sopenharmony_ci 220bf215546Sopenharmony_ci radeon_begin(cs); 221bf215546Sopenharmony_ci 222bf215546Sopenharmony_ci /* The register is at different places on different ASICs. */ 223bf215546Sopenharmony_ci if (sctx->gfx_level >= GFX9) { 224bf215546Sopenharmony_ci reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL; 225bf215546Sopenharmony_ci radeon_emit(PKT3(PKT3_WRITE_DATA, 3, 0)); 226bf215546Sopenharmony_ci radeon_emit(S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) | S_370_ENGINE_SEL(V_370_ME)); 227bf215546Sopenharmony_ci radeon_emit(R_0300FC_CP_STRMOUT_CNTL >> 2); 228bf215546Sopenharmony_ci radeon_emit(0); 229bf215546Sopenharmony_ci radeon_emit(0); 230bf215546Sopenharmony_ci } else if (sctx->gfx_level >= GFX7) { 231bf215546Sopenharmony_ci reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL; 232bf215546Sopenharmony_ci radeon_set_uconfig_reg(reg_strmout_cntl, 0); 233bf215546Sopenharmony_ci } else { 234bf215546Sopenharmony_ci reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL; 235bf215546Sopenharmony_ci radeon_set_config_reg(reg_strmout_cntl, 0); 236bf215546Sopenharmony_ci } 237bf215546Sopenharmony_ci 238bf215546Sopenharmony_ci radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); 239bf215546Sopenharmony_ci radeon_emit(EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0)); 240bf215546Sopenharmony_ci 241bf215546Sopenharmony_ci radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0)); 242bf215546Sopenharmony_ci radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ 243bf215546Sopenharmony_ci radeon_emit(reg_strmout_cntl >> 2); /* register */ 244bf215546Sopenharmony_ci radeon_emit(0); 245bf215546Sopenharmony_ci radeon_emit(S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */ 246bf215546Sopenharmony_ci radeon_emit(S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */ 247bf215546Sopenharmony_ci radeon_emit(4); /* poll interval */ 248bf215546Sopenharmony_ci radeon_end(); 249bf215546Sopenharmony_ci} 250bf215546Sopenharmony_ci 251bf215546Sopenharmony_cistatic void si_emit_streamout_begin(struct si_context *sctx) 252bf215546Sopenharmony_ci{ 253bf215546Sopenharmony_ci struct radeon_cmdbuf *cs = &sctx->gfx_cs; 254bf215546Sopenharmony_ci struct si_streamout_target **t = sctx->streamout.targets; 255bf215546Sopenharmony_ci 256bf215546Sopenharmony_ci if (!sctx->screen->use_ngg_streamout) 257bf215546Sopenharmony_ci si_flush_vgt_streamout(sctx); 258bf215546Sopenharmony_ci 259bf215546Sopenharmony_ci for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { 260bf215546Sopenharmony_ci if (!t[i]) 261bf215546Sopenharmony_ci continue; 262bf215546Sopenharmony_ci 263bf215546Sopenharmony_ci t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i]; 264bf215546Sopenharmony_ci 265bf215546Sopenharmony_ci if (sctx->screen->use_ngg_streamout) { 266bf215546Sopenharmony_ci bool append = sctx->streamout.append_bitmask & (1 << i); 267bf215546Sopenharmony_ci uint64_t va = 0; 268bf215546Sopenharmony_ci 269bf215546Sopenharmony_ci if (append) { 270bf215546Sopenharmony_ci radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size, 271bf215546Sopenharmony_ci RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE); 272bf215546Sopenharmony_ci 273bf215546Sopenharmony_ci va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; 274bf215546Sopenharmony_ci } 275bf215546Sopenharmony_ci 276bf215546Sopenharmony_ci radeon_begin(cs); 277bf215546Sopenharmony_ci radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0)); 278bf215546Sopenharmony_ci radeon_emit(S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) | 279bf215546Sopenharmony_ci S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(1)); 280bf215546Sopenharmony_ci radeon_emit(va); 281bf215546Sopenharmony_ci radeon_emit(va >> 32); 282bf215546Sopenharmony_ci radeon_emit(4 * i); /* destination in GDS */ 283bf215546Sopenharmony_ci radeon_emit(0); 284bf215546Sopenharmony_ci radeon_emit(S_415_BYTE_COUNT_GFX9(4)); 285bf215546Sopenharmony_ci radeon_end(); 286bf215546Sopenharmony_ci } else { 287bf215546Sopenharmony_ci /* Legacy streamout. 288bf215546Sopenharmony_ci * 289bf215546Sopenharmony_ci * The hw binds streamout buffers as shader resources. VGT only counts primitives 290bf215546Sopenharmony_ci * and tells the shader through SGPRs what to do. 291bf215546Sopenharmony_ci */ 292bf215546Sopenharmony_ci radeon_begin(cs); 293bf215546Sopenharmony_ci radeon_set_context_reg_seq(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2); 294bf215546Sopenharmony_ci radeon_emit((t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ 295bf215546Sopenharmony_ci radeon_emit(sctx->streamout.stride_in_dw[i]); /* VTX_STRIDE (in DW) */ 296bf215546Sopenharmony_ci 297bf215546Sopenharmony_ci if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) { 298bf215546Sopenharmony_ci uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; 299bf215546Sopenharmony_ci 300bf215546Sopenharmony_ci /* Append. */ 301bf215546Sopenharmony_ci radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); 302bf215546Sopenharmony_ci radeon_emit(STRMOUT_SELECT_BUFFER(i) | 303bf215546Sopenharmony_ci STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */ 304bf215546Sopenharmony_ci radeon_emit(0); /* unused */ 305bf215546Sopenharmony_ci radeon_emit(0); /* unused */ 306bf215546Sopenharmony_ci radeon_emit(va); /* src address lo */ 307bf215546Sopenharmony_ci radeon_emit(va >> 32); /* src address hi */ 308bf215546Sopenharmony_ci 309bf215546Sopenharmony_ci radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size, 310bf215546Sopenharmony_ci RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE); 311bf215546Sopenharmony_ci } else { 312bf215546Sopenharmony_ci /* Start from the beginning. */ 313bf215546Sopenharmony_ci radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); 314bf215546Sopenharmony_ci radeon_emit(STRMOUT_SELECT_BUFFER(i) | 315bf215546Sopenharmony_ci STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */ 316bf215546Sopenharmony_ci radeon_emit(0); /* unused */ 317bf215546Sopenharmony_ci radeon_emit(0); /* unused */ 318bf215546Sopenharmony_ci radeon_emit(t[i]->b.buffer_offset >> 2); /* buffer offset in DW */ 319bf215546Sopenharmony_ci radeon_emit(0); /* unused */ 320bf215546Sopenharmony_ci } 321bf215546Sopenharmony_ci radeon_end_update_context_roll(sctx); 322bf215546Sopenharmony_ci } 323bf215546Sopenharmony_ci } 324bf215546Sopenharmony_ci 325bf215546Sopenharmony_ci sctx->streamout.begin_emitted = true; 326bf215546Sopenharmony_ci} 327bf215546Sopenharmony_ci 328bf215546Sopenharmony_civoid si_emit_streamout_end(struct si_context *sctx) 329bf215546Sopenharmony_ci{ 330bf215546Sopenharmony_ci struct radeon_cmdbuf *cs = &sctx->gfx_cs; 331bf215546Sopenharmony_ci struct si_streamout_target **t = sctx->streamout.targets; 332bf215546Sopenharmony_ci 333bf215546Sopenharmony_ci if (!sctx->screen->use_ngg_streamout) 334bf215546Sopenharmony_ci si_flush_vgt_streamout(sctx); 335bf215546Sopenharmony_ci 336bf215546Sopenharmony_ci for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { 337bf215546Sopenharmony_ci if (!t[i]) 338bf215546Sopenharmony_ci continue; 339bf215546Sopenharmony_ci 340bf215546Sopenharmony_ci uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; 341bf215546Sopenharmony_ci 342bf215546Sopenharmony_ci if (sctx->screen->use_ngg_streamout) { 343bf215546Sopenharmony_ci /* TODO: PS_DONE doesn't ensure completion of VS if there are no PS waves. */ 344bf215546Sopenharmony_ci si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2, 345bf215546Sopenharmony_ci EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS, 346bf215546Sopenharmony_ci t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0); 347bf215546Sopenharmony_ci } else { 348bf215546Sopenharmony_ci radeon_begin(cs); 349bf215546Sopenharmony_ci radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); 350bf215546Sopenharmony_ci radeon_emit(STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) | 351bf215546Sopenharmony_ci STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */ 352bf215546Sopenharmony_ci radeon_emit(va); /* dst address lo */ 353bf215546Sopenharmony_ci radeon_emit(va >> 32); /* dst address hi */ 354bf215546Sopenharmony_ci radeon_emit(0); /* unused */ 355bf215546Sopenharmony_ci radeon_emit(0); /* unused */ 356bf215546Sopenharmony_ci 357bf215546Sopenharmony_ci /* Zero the buffer size. The counters (primitives generated, 358bf215546Sopenharmony_ci * primitives emitted) may be enabled even if there is not 359bf215546Sopenharmony_ci * buffer bound. This ensures that the primitives-emitted query 360bf215546Sopenharmony_ci * won't increment. */ 361bf215546Sopenharmony_ci radeon_set_context_reg(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0); 362bf215546Sopenharmony_ci radeon_end_update_context_roll(sctx); 363bf215546Sopenharmony_ci 364bf215546Sopenharmony_ci radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size, 365bf215546Sopenharmony_ci RADEON_USAGE_WRITE | RADEON_PRIO_SO_FILLED_SIZE); 366bf215546Sopenharmony_ci } 367bf215546Sopenharmony_ci 368bf215546Sopenharmony_ci t[i]->buf_filled_size_valid = true; 369bf215546Sopenharmony_ci } 370bf215546Sopenharmony_ci 371bf215546Sopenharmony_ci sctx->streamout.begin_emitted = false; 372bf215546Sopenharmony_ci} 373bf215546Sopenharmony_ci 374bf215546Sopenharmony_ci/* STREAMOUT CONFIG DERIVED STATE 375bf215546Sopenharmony_ci * 376bf215546Sopenharmony_ci * Streamout must be enabled for the PRIMITIVES_GENERATED query to work. 377bf215546Sopenharmony_ci * The buffer mask is an independent state, so no writes occur if there 378bf215546Sopenharmony_ci * are no buffers bound. 379bf215546Sopenharmony_ci */ 380bf215546Sopenharmony_ci 381bf215546Sopenharmony_cistatic void si_emit_streamout_enable(struct si_context *sctx) 382bf215546Sopenharmony_ci{ 383bf215546Sopenharmony_ci assert(!sctx->screen->use_ngg_streamout); 384bf215546Sopenharmony_ci 385bf215546Sopenharmony_ci radeon_begin(&sctx->gfx_cs); 386bf215546Sopenharmony_ci radeon_set_context_reg_seq(R_028B94_VGT_STRMOUT_CONFIG, 2); 387bf215546Sopenharmony_ci radeon_emit(S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) | 388bf215546Sopenharmony_ci S_028B94_RAST_STREAM(0) | 389bf215546Sopenharmony_ci S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) | 390bf215546Sopenharmony_ci S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) | 391bf215546Sopenharmony_ci S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx))); 392bf215546Sopenharmony_ci radeon_emit(sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask); 393bf215546Sopenharmony_ci radeon_end(); 394bf215546Sopenharmony_ci} 395bf215546Sopenharmony_ci 396bf215546Sopenharmony_cistatic void si_set_streamout_enable(struct si_context *sctx, bool enable) 397bf215546Sopenharmony_ci{ 398bf215546Sopenharmony_ci bool old_strmout_en = si_get_strmout_en(sctx); 399bf215546Sopenharmony_ci unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask; 400bf215546Sopenharmony_ci 401bf215546Sopenharmony_ci sctx->streamout.streamout_enabled = enable; 402bf215546Sopenharmony_ci 403bf215546Sopenharmony_ci sctx->streamout.hw_enabled_mask = 404bf215546Sopenharmony_ci sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) | 405bf215546Sopenharmony_ci (sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12); 406bf215546Sopenharmony_ci 407bf215546Sopenharmony_ci if (!sctx->screen->use_ngg_streamout && 408bf215546Sopenharmony_ci ((old_strmout_en != si_get_strmout_en(sctx)) || 409bf215546Sopenharmony_ci (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))) 410bf215546Sopenharmony_ci si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable); 411bf215546Sopenharmony_ci} 412bf215546Sopenharmony_ci 413bf215546Sopenharmony_civoid si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff) 414bf215546Sopenharmony_ci{ 415bf215546Sopenharmony_ci if (!sctx->screen->use_ngg_streamout && type == PIPE_QUERY_PRIMITIVES_GENERATED) { 416bf215546Sopenharmony_ci bool old_strmout_en = si_get_strmout_en(sctx); 417bf215546Sopenharmony_ci 418bf215546Sopenharmony_ci sctx->streamout.num_prims_gen_queries += diff; 419bf215546Sopenharmony_ci assert(sctx->streamout.num_prims_gen_queries >= 0); 420bf215546Sopenharmony_ci 421bf215546Sopenharmony_ci sctx->streamout.prims_gen_query_enabled = sctx->streamout.num_prims_gen_queries != 0; 422bf215546Sopenharmony_ci 423bf215546Sopenharmony_ci if (old_strmout_en != si_get_strmout_en(sctx)) 424bf215546Sopenharmony_ci si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable); 425bf215546Sopenharmony_ci 426bf215546Sopenharmony_ci if (si_update_ngg(sctx)) { 427bf215546Sopenharmony_ci si_shader_change_notify(sctx); 428bf215546Sopenharmony_ci sctx->do_update_shaders = true; 429bf215546Sopenharmony_ci } 430bf215546Sopenharmony_ci } 431bf215546Sopenharmony_ci} 432bf215546Sopenharmony_ci 433bf215546Sopenharmony_civoid si_init_streamout_functions(struct si_context *sctx) 434bf215546Sopenharmony_ci{ 435bf215546Sopenharmony_ci sctx->b.create_stream_output_target = si_create_so_target; 436bf215546Sopenharmony_ci sctx->b.stream_output_target_destroy = si_so_target_destroy; 437bf215546Sopenharmony_ci sctx->b.set_stream_output_targets = si_set_streamout_targets; 438bf215546Sopenharmony_ci sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin; 439bf215546Sopenharmony_ci 440bf215546Sopenharmony_ci if (!sctx->screen->use_ngg_streamout) 441bf215546Sopenharmony_ci sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable; 442bf215546Sopenharmony_ci} 443