/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "si_build_pm4.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"

static void si_set_streamout_enable(struct si_context *sctx, bool enable);

static inline void si_so_target_reference(struct si_streamout_target **dst,
                                          struct pipe_stream_output_target *src)
{
   pipe_so_target_reference((struct pipe_stream_output_target **)dst, src);
}

static struct pipe_stream_output_target *si_create_so_target(struct pipe_context *ctx,
                                                             struct pipe_resource *buffer,
                                                             unsigned buffer_offset,
                                                             unsigned buffer_size)
{
   struct si_streamout_target *t;
   struct si_resource *buf = si_resource(buffer);

   t = CALLOC_STRUCT(si_streamout_target);
   if (!t) {
      return NULL;
   }

   t->b.reference.count = 1;
   t->b.context = ctx;
   pipe_resource_reference(&t->b.buffer, buffer);
   t->b.buffer_offset = buffer_offset;
   t->b.buffer_size = buffer_size;

   util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset, buffer_offset + buffer_size);
   return &t->b;
}

static void si_so_target_destroy(struct pipe_context *ctx, struct pipe_stream_output_target *target)
{
   struct si_streamout_target *t = (struct si_streamout_target *)target;
   pipe_resource_reference(&t->b.buffer, NULL);
   si_resource_reference(&t->buf_filled_size, NULL);
   FREE(t);
}

void si_streamout_buffers_dirty(struct si_context *sctx)
{
   if (!sctx->streamout.enabled_mask)
      return;

   si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
   si_set_streamout_enable(sctx, true);
}
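
/* Bind or unbind streamout targets.
 *
 * This backs pipe_context::set_stream_output_targets. An offset of
 * (unsigned)-1 means "append", i.e. continue at the previously saved filled
 * size rather than at the given offset. Each target is bound both in VGT
 * (the streamout registers) and as an internal shader buffer, since the
 * shader performs the actual streamout stores.
 */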
static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targets,
                                     struct pipe_stream_output_target **targets,
                                     const unsigned *offsets)
{
   struct si_context *sctx = (struct si_context *)ctx;
   unsigned old_num_targets = sctx->streamout.num_targets;
   unsigned i;
   bool wait_now = false;

   /* We are going to unbind the buffers. Mark which caches need to be flushed. */
   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
      /* Since streamout uses vector writes which go through TC L2
       * and most other clients can use TC L2 as well, we don't need
       * to flush it.
       *
       * The only cases that require flushing it are VGT DMA index
       * fetching (on <= GFX7) and indirect draw data, which are rare.
       * Thus, flag the TC L2 dirtiness in the resource and
       * handle it at draw call time.
       */
      for (i = 0; i < sctx->streamout.num_targets; i++)
         if (sctx->streamout.targets[i])
            si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true;

      /* Invalidate the scalar cache in case a streamout buffer is
       * going to be used as a constant buffer.
       *
       * Invalidate vL1, because streamout bypasses it (done by
       * setting GLC=1 in the store instruction), but vL1 in other
       * CUs can contain outdated data of streamout buffers.
       *
       * VS_PARTIAL_FLUSH is required if the buffers are going to be
       * used as an input immediately.
       */
      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;

      /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
      if (sctx->screen->use_ngg_streamout) {
         sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;

         /* Wait now. This is needed to make sure that GDS is not
          * busy at the end of IBs.
          *
          * Also, the next streamout operation will overwrite GDS,
          * so we need to make sure that it's idle.
          */
         wait_now = true;
      } else {
         sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
      }
   }

   /* All readers of the streamout targets need to be finished before we can
    * start writing to the targets.
    */
   if (num_targets) {
      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
                     SI_CONTEXT_PFP_SYNC_ME;
   }

   /* Streamout buffers must be bound in 2 places:
    * 1) in VGT by setting the VGT_STRMOUT registers
    * 2) as shader resources
    */

   /* Stop streamout. */
   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
      si_emit_streamout_end(sctx);

   /* Set the new targets. */
   unsigned enabled_mask = 0, append_bitmask = 0;
   for (i = 0; i < num_targets; i++) {
      si_so_target_reference(&sctx->streamout.targets[i], targets[i]);
      if (!targets[i])
         continue;

      si_context_add_resource_size(sctx, targets[i]->buffer);
      enabled_mask |= 1 << i;

      if (offsets[i] == ((unsigned)-1))
         append_bitmask |= 1 << i;

      /* Allocate space for the filled buffer size. */
      struct si_streamout_target *t = sctx->streamout.targets[i];
      if (!t->buf_filled_size) {
         unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4;
         u_suballocator_alloc(&sctx->allocator_zeroed_memory, buf_filled_size_size, 4,
                              &t->buf_filled_size_offset,
                              (struct pipe_resource **)&t->buf_filled_size);
      }
   }

   for (; i < sctx->streamout.num_targets; i++)
      si_so_target_reference(&sctx->streamout.targets[i], NULL);

   if (!!sctx->streamout.enabled_mask != !!enabled_mask) {
      sctx->streamout.enabled_mask = enabled_mask;
      sctx->do_update_shaders = true; /* to keep/remove streamout shader code as an optimization */
   }

   sctx->streamout.num_targets = num_targets;
   sctx->streamout.append_bitmask = append_bitmask;

   /* Update dirty state bits. */
   if (num_targets) {
      si_streamout_buffers_dirty(sctx);
   } else {
      si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
      si_set_streamout_enable(sctx, false);
   }

   /* Set the shader resources. */
   for (i = 0; i < num_targets; i++) {
      if (targets[i]) {
         struct pipe_shader_buffer sbuf;
         sbuf.buffer = targets[i]->buffer;

         if (sctx->screen->use_ngg_streamout) {
            sbuf.buffer_offset = targets[i]->buffer_offset;
            sbuf.buffer_size = targets[i]->buffer_size;
         } else {
            sbuf.buffer_offset = 0;
            sbuf.buffer_size = targets[i]->buffer_offset + targets[i]->buffer_size;
         }

         si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
         si_resource(targets[i]->buffer)->bind_history |= SI_BIND_STREAMOUT_BUFFER;
      } else {
         si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
      }
   }
   for (; i < old_num_targets; i++)
      si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);

   if (wait_now)
      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
}
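
/* Wait for the VGT streamout state update to finish.
 *
 * CP_STRMOUT_CNTL is cleared, the SO_VGTSTREAMOUT_FLUSH event is emitted,
 * and WAIT_REG_MEM then waits until the VGT sets OFFSET_UPDATE_DONE,
 * i.e. until the internal streamout offsets are safe to read or overwrite.
 */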
static void si_flush_vgt_streamout(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned reg_strmout_cntl;

   radeon_begin(cs);

   /* The register is at different places on different ASICs. */
   if (sctx->gfx_level >= GFX9) {
      reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
      radeon_emit(PKT3(PKT3_WRITE_DATA, 3, 0));
      radeon_emit(S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) | S_370_ENGINE_SEL(V_370_ME));
      radeon_emit(R_0300FC_CP_STRMOUT_CNTL >> 2);
      radeon_emit(0);
      radeon_emit(0);
   } else if (sctx->gfx_level >= GFX7) {
      reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
      radeon_set_uconfig_reg(reg_strmout_cntl, 0);
   } else {
      reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
      radeon_set_config_reg(reg_strmout_cntl, 0);
   }

   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));

   radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
   radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
   radeon_emit(reg_strmout_cntl >> 2); /* register */
   radeon_emit(0);
   radeon_emit(S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
   radeon_emit(S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
   radeon_emit(4); /* poll interval */
   radeon_end();
}
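
/* Emit the packets that start streamout on all bound targets.
 *
 * Legacy streamout programs VGT_STRMOUT_BUFFER_SIZE/VTX_STRIDE and loads the
 * initial buffer offset either from the packet (the target's start offset)
 * or from the saved filled size in memory (append). With NGG streamout the
 * filled size is tracked in GDS, so the saved value (or 0) is copied there
 * with DMA_DATA.
 */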
static void si_emit_streamout_begin(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   struct si_streamout_target **t = sctx->streamout.targets;

   if (!sctx->screen->use_ngg_streamout)
      si_flush_vgt_streamout(sctx);

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];

      if (sctx->screen->use_ngg_streamout) {
         bool append = sctx->streamout.append_bitmask & (1 << i);
         uint64_t va = 0;

         if (append) {
            radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
                                      RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE);

            va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
         }

         radeon_begin(cs);
         radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0));
         radeon_emit(S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
                     S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(1));
         radeon_emit(va);
         radeon_emit(va >> 32);
         radeon_emit(4 * i); /* destination in GDS */
         radeon_emit(0);
         radeon_emit(S_415_BYTE_COUNT_GFX9(4));
         radeon_end();
      } else {
         /* Legacy streamout.
          *
          * The hw binds streamout buffers as shader resources. VGT only counts primitives
          * and tells the shader through SGPRs what to do.
          */
         radeon_begin(cs);
         radeon_set_context_reg_seq(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
         radeon_emit((t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
         radeon_emit(sctx->streamout.stride_in_dw[i]);                    /* VTX_STRIDE (in DW) */

         if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
            uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;

            /* Append. */
            radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
            radeon_emit(STRMOUT_SELECT_BUFFER(i) |
                        STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
            radeon_emit(0);        /* unused */
            radeon_emit(0);        /* unused */
            radeon_emit(va);       /* src address lo */
            radeon_emit(va >> 32); /* src address hi */

            radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
                                      RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE);
         } else {
            /* Start from the beginning. */
            radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
            radeon_emit(STRMOUT_SELECT_BUFFER(i) |
                        STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
            radeon_emit(0);                          /* unused */
            radeon_emit(0);                          /* unused */
            radeon_emit(t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
            radeon_emit(0);                          /* unused */
         }
         radeon_end_update_context_roll(sctx);
      }
   }

   sctx->streamout.begin_emitted = true;
}

void si_emit_streamout_end(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   struct si_streamout_target **t = sctx->streamout.targets;

   if (!sctx->screen->use_ngg_streamout)
      si_flush_vgt_streamout(sctx);

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;

      if (sctx->screen->use_ngg_streamout) {
         /* TODO: PS_DONE doesn't ensure completion of VS if there are no PS waves. */
         si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2,
                           EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS,
                           t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0);
      } else {
         radeon_begin(cs);
         radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
                     STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
         radeon_emit(va);       /* dst address lo */
         radeon_emit(va >> 32); /* dst address hi */
         radeon_emit(0);        /* unused */
         radeon_emit(0);        /* unused */

         /* Zero the buffer size. The counters (primitives generated,
          * primitives emitted) may be enabled even if there is no
          * buffer bound. This ensures that the primitives-emitted query
          * won't increment.
          */
         radeon_set_context_reg(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
         radeon_end_update_context_roll(sctx);

         radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
                                   RADEON_USAGE_WRITE | RADEON_PRIO_SO_FILLED_SIZE);
      }

      t[i]->buf_filled_size_valid = true;
   }

   sctx->streamout.begin_emitted = false;
}

/* STREAMOUT CONFIG DERIVED STATE
 *
 * Streamout must be enabled for the PRIMITIVES_GENERATED query to work.
 * The buffer mask is an independent state, so no writes occur if there
 * are no buffers bound.
 */

static void si_emit_streamout_enable(struct si_context *sctx)
{
   assert(!sctx->screen->use_ngg_streamout);

   radeon_begin(&sctx->gfx_cs);
   radeon_set_context_reg_seq(R_028B94_VGT_STRMOUT_CONFIG, 2);
   radeon_emit(S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
               S_028B94_RAST_STREAM(0) |
               S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
               S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
               S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
   radeon_emit(sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask);
   radeon_end();
}
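
/* Compute the derived streamout enable state.
 *
 * hw_enabled_mask replicates the per-buffer enable mask into all four
 * 4-bit stream fields of VGT_STRMOUT_BUFFER_CONFIG; si_emit_streamout_enable
 * then ANDs it with the mask of buffers actually written by the enabled streams.
 */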
static void si_set_streamout_enable(struct si_context *sctx, bool enable)
{
   bool old_strmout_en = si_get_strmout_en(sctx);
   unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;

   sctx->streamout.streamout_enabled = enable;

   sctx->streamout.hw_enabled_mask =
      sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) |
      (sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12);

   if (!sctx->screen->use_ngg_streamout &&
       ((old_strmout_en != si_get_strmout_en(sctx)) ||
        (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)))
      si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
}

void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff)
{
   if (!sctx->screen->use_ngg_streamout && type == PIPE_QUERY_PRIMITIVES_GENERATED) {
      bool old_strmout_en = si_get_strmout_en(sctx);

      sctx->streamout.num_prims_gen_queries += diff;
      assert(sctx->streamout.num_prims_gen_queries >= 0);

      sctx->streamout.prims_gen_query_enabled = sctx->streamout.num_prims_gen_queries != 0;

      if (old_strmout_en != si_get_strmout_en(sctx))
         si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);

      if (si_update_ngg(sctx)) {
         si_shader_change_notify(sctx);
         sctx->do_update_shaders = true;
      }
   }
}

void si_init_streamout_functions(struct si_context *sctx)
{
   sctx->b.create_stream_output_target = si_create_so_target;
   sctx->b.stream_output_target_destroy = si_so_target_destroy;
   sctx->b.set_stream_output_targets = si_set_streamout_targets;
   sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;

   if (!sctx->screen->use_ngg_streamout)
      sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
}