/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_query.h"
#include "sid.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"

#include <stddef.h>

static void emit_shader_query(struct si_context *sctx)
{
   assert(!list_is_empty(&sctx->shader_query_buffers));

   struct gfx10_sh_query_buffer *qbuf =
      list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
}

static void gfx10_release_query_buffers(struct si_context *sctx,
                                        struct gfx10_sh_query_buffer *first,
                                        struct gfx10_sh_query_buffer *last)
{
   while (first) {
      struct gfx10_sh_query_buffer *qbuf = first;
      if (first != last)
         first = list_entry(qbuf->list.next, struct gfx10_sh_query_buffer, list);
      else
         first = NULL;

      qbuf->refcount--;
      if (qbuf->refcount)
         continue;

      if (qbuf->list.next == &sctx->shader_query_buffers)
         continue; /* keep the most recent buffer; it may not be full yet */
      if (qbuf->list.prev == &sctx->shader_query_buffers)
         continue; /* keep the oldest buffer for recycling */

      list_del(&qbuf->list);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}

static bool gfx10_alloc_query_buffer(struct si_context *sctx)
{
   if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
      return true;

   struct gfx10_sh_query_buffer *qbuf = NULL;

   if (!list_is_empty(&sctx->shader_query_buffers)) {
      qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
         goto success;

      qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      if (!qbuf->refcount &&
          !si_cs_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
          sctx->ws->buffer_wait(sctx->ws, qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
         /* Can immediately re-use the oldest buffer */
         list_del(&qbuf->list);
      } else {
         qbuf = NULL;
      }
   }

   if (!qbuf) {
      qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
      if (unlikely(!qbuf))
         return false;

      struct si_screen *screen = sctx->screen;
      unsigned buf_size =
         MAX2(sizeof(struct gfx10_sh_query_buffer_mem), screen->info.min_alloc_size);
      qbuf->buf =
         si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
      if (unlikely(!qbuf->buf)) {
         FREE(qbuf);
         return false;
      }
   }

   /* The buffer is currently unused by the GPU. Initialize it.
    *
    * We need to set the high bit of all the primitive counters for
    * compatibility with the SET_PREDICATION packet.
    */
   uint64_t *results = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL,
                                            PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
   assert(results);

   for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); i < e;
        ++i) {
      for (unsigned j = 0; j < 16; ++j)
         results[32 * i + j] = (uint64_t)1 << 63;
      results[32 * i + 16] = 0;
   }

   list_addtail(&qbuf->list, &sctx->shader_query_buffers);
   qbuf->head = 0;
   qbuf->refcount = sctx->num_active_shader_queries;

success:;
   struct pipe_shader_buffer sbuf;
   sbuf.buffer = &qbuf->buf->b.b;
   sbuf.buffer_offset = qbuf->head;
   sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
   si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, &sbuf);
   SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 1);

   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
   return true;
}

static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   gfx10_release_query_buffers(sctx, query->first, query->last);
   FREE(query);
}

static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   gfx10_release_query_buffers(sctx, query->first, query->last);
   query->first = query->last = NULL;

   if (unlikely(!gfx10_alloc_query_buffer(sctx)))
      return false;

   query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   query->first_begin = query->first->head;

   sctx->num_active_shader_queries++;
   query->first->refcount++;

   return true;
}

static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */

   query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   query->last_end = query->last->head;

   /* Signal the fence of the previous chunk */
   if (query->last_end != 0) {
      uint64_t fence_va = query->last->buf->gpu_address;
      fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
      fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
      si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
                        EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va,
                        0xffffffff, PIPE_QUERY_GPU_FINISHED);
   }

   sctx->num_active_shader_queries--;

   /* Only unbind the query buffer once no shader queries remain active;
    * unbinding it earlier would break other still-active queries.
    */
   if (sctx->num_active_shader_queries <= 0) {
      si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
      SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 0);

      /* If a query_begin is followed by a query_end without a draw
       * in-between, we need to clear the atom to ensure that the
       * next query_begin will re-initialize the shader buffer.
       */
      si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
   }

   return true;
}

static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
                                      struct gfx10_sh_query_buffer_mem *qmem,
                                      union pipe_query_result *result)
{
   static const uint64_t mask = ((uint64_t)1 << 63) - 1;

   switch (query->b.type) {
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      result->u64 += qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_STATISTICS:
      result->so_statistics.num_primitives_written +=
         qmem->stream[query->stream].emitted_primitives & mask;
      result->so_statistics.primitives_storage_needed +=
         qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      result->b |= qmem->stream[query->stream].emitted_primitives !=
                   qmem->stream[query->stream].generated_primitives;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
         result->b |= qmem->stream[stream].emitted_primitives !=
                      qmem->stream[stream].generated_primitives;
      }
      break;
   default:
      assert(0);
   }
}

static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait,
                                      union pipe_query_result *result)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   util_query_clear_result(result, query->b.type);

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */
   assert(query->last);

   for (struct gfx10_sh_query_buffer *qbuf = query->last;;
        qbuf = list_entry(qbuf->list.prev, struct gfx10_sh_query_buffer, list)) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      void *map;

      if (rquery->b.flushed)
         map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      unsigned results_begin = 0;
      unsigned results_end = qbuf->head;
      if (qbuf == query->first)
         results_begin = query->first_begin;
      if (qbuf == query->last)
         results_end = query->last_end;

      while (results_begin != results_end) {
         struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
         results_begin += sizeof(*qmem);

         gfx10_sh_query_add_result(query, qmem, result);
      }

      if (qbuf == query->first)
         break;
   }

   return true;
}

static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery,
                                               enum pipe_query_flags flags,
                                               enum pipe_query_value_type result_type,
                                               int index, struct pipe_resource *resource,
                                               unsigned offset)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   struct si_qbo_state saved_state = {};
   struct pipe_resource *tmp_buffer = NULL;
   unsigned tmp_buffer_offset = 0;

   if (!sctx->sh_query_result_shader) {
      sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
      if (!sctx->sh_query_result_shader)
         return;
   }

   if (query->first != query->last) {
      u_suballocator_alloc(&sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
      if (!tmp_buffer)
         return;
   }

   si_save_qbo_state(sctx, &saved_state);

   /* Pre-fill the constants configuring the shader behavior.
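    *
    * A rough summary of how these fields are consumed below (and by the
    * result shader built in gfx10_create_sh_query_result_cs): config selects
    * the mode (0 = accumulate counters, 1 = only check result availability,
    * 2 = overflow predicate for a single stream, 3 = overflow predicate for
    * all streams), and bit 3 (value 8) requests a 64-bit result; offset is
    * the byte offset of the counters to read; chain marks whether a previous
    * (bit 0) and/or next (bit 1) summary buffer is involved; result_count is
    * the number of gfx10_sh_query_buffer_mem entries to process.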
    */
   struct {
      uint32_t config;
      uint32_t offset;
      uint32_t chain;
      uint32_t result_count;
   } consts;
   struct pipe_constant_buffer constant_buffer = {};

   if (index >= 0) {
      switch (query->b.type) {
      case PIPE_QUERY_PRIMITIVES_GENERATED:
         consts.offset = 4 * sizeof(uint64_t) * query->stream + 2 * sizeof(uint64_t);
         consts.config = 0;
         break;
      case PIPE_QUERY_PRIMITIVES_EMITTED:
         consts.offset = 4 * sizeof(uint64_t) * query->stream + 3 * sizeof(uint64_t);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_STATISTICS:
         consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
         consts.offset = 4 * sizeof(uint64_t) * query->stream;
         consts.config = 2;
         break;
      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
         consts.offset = 0;
         consts.config = 3;
         break;
      default:
         unreachable("bad query type");
      }
   } else {
      /* Check result availability. */
      consts.offset = 0;
      consts.config = 1;
   }

   if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
      consts.config |= 8;

   constant_buffer.buffer_size = sizeof(consts);
   constant_buffer.user_buffer = &consts;

   /* Pre-fill the SSBOs and grid. */
   struct pipe_shader_buffer ssbo[3];
   struct pipe_grid_info grid = {};

   ssbo[1].buffer = tmp_buffer;
   ssbo[1].buffer_offset = tmp_buffer_offset;
   ssbo[1].buffer_size = 16;

   ssbo[2] = ssbo[1];

   grid.block[0] = 1;
   grid.block[1] = 1;
   grid.block[2] = 1;
   grid.grid[0] = 1;
   grid.grid[1] = 1;
   grid.grid[2] = 1;

   /* Advance qbuf in the loop header (like gfx10_sh_query_get_result does)
    * so that the `continue` for empty chunks still moves on to the next
    * buffer instead of spinning forever.
    */
   for (struct gfx10_sh_query_buffer *qbuf = query->first;;
        qbuf = list_entry(qbuf->list.next, struct gfx10_sh_query_buffer, list)) {
      unsigned begin = qbuf == query->first ? query->first_begin : 0;
      unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
      if (!end)
         continue;

      ssbo[0].buffer = &qbuf->buf->b.b;
      ssbo[0].buffer_offset = begin;
      ssbo[0].buffer_size = end - begin;

      consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
      consts.chain = 0;
      if (qbuf != query->first)
         consts.chain |= 1;
      if (qbuf != query->last)
         consts.chain |= 2;

      if (qbuf == query->last) {
         ssbo[2].buffer = resource;
         ssbo[2].buffer_offset = offset;
         ssbo[2].buffer_size = 8;
      }

      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);

      if (flags & PIPE_QUERY_WAIT) {
         uint64_t va;

         /* Wait for result availability. Wait only for readiness
          * of the last entry, since the fence writes should be
          * serialized in the CP.
          */
         va = qbuf->buf->gpu_address;
         va += end - sizeof(struct gfx10_sh_query_buffer_mem);
         va += offsetof(struct gfx10_sh_query_buffer_mem, fence);

         si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
      }

      /* ssbo[2] is either tmp_buffer or resource */
      assert(ssbo[2].buffer);
      si_launch_grid_internal_ssbos(sctx, &grid, sctx->sh_query_result_shader,
                                    SI_OP_SYNC_PS_BEFORE | SI_OP_SYNC_AFTER, SI_COHERENCY_SHADER,
                                    3, ssbo, (1 << 2) | (ssbo[1].buffer ?
                                                            1 << 1 : 0));

      if (qbuf == query->last)
         break;
   }

   si_restore_qbo_state(sctx, &saved_state);
   pipe_resource_reference(&tmp_buffer, NULL);
}

static const struct si_query_ops gfx10_sh_query_ops = {
   .destroy = gfx10_sh_query_destroy,
   .begin = gfx10_sh_query_begin,
   .end = gfx10_sh_query_end,
   .get_result = gfx10_sh_query_get_result,
   .get_result_resource = gfx10_sh_query_get_result_resource,
};

struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
                                         unsigned index)
{
   struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
   if (unlikely(!query))
      return NULL;

   query->b.ops = &gfx10_sh_query_ops;
   query->b.type = query_type;
   query->stream = index;

   return (struct pipe_query *)query;
}

void gfx10_init_query(struct si_context *sctx)
{
   list_inithead(&sctx->shader_query_buffers);
   sctx->atoms.s.shader_query.emit = emit_shader_query;
}

void gfx10_destroy_query(struct si_context *sctx)
{
   if (!sctx->shader_query_buffers.next)
      return;

   while (!list_is_empty(&sctx->shader_query_buffers)) {
      struct gfx10_sh_query_buffer *qbuf =
         list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      list_del(&qbuf->list);

      assert(!qbuf->refcount);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}