/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_query.h"
#include "sid.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"

#include <stddef.h>

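/* Emit callback for the shader_query atom: once the atom reaches the command
 * stream, the currently bound result slot has been committed, so advance the
 * write head of the most recent query buffer past that chunk. */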
static void emit_shader_query(struct si_context *sctx)
{
   assert(!list_is_empty(&sctx->shader_query_buffers));

   struct gfx10_sh_query_buffer *qbuf =
      list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
}

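/* Drop one reference from every buffer in the range [first, last] and free the
 * buffers whose reference count reaches zero, except for the most recent
 * buffer (it may not be full yet) and the oldest one (kept for recycling). */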
static void gfx10_release_query_buffers(struct si_context *sctx,
                                        struct gfx10_sh_query_buffer *first,
                                        struct gfx10_sh_query_buffer *last)
{
   while (first) {
      struct gfx10_sh_query_buffer *qbuf = first;
      if (first != last)
         first = list_entry(qbuf->list.next, struct gfx10_sh_query_buffer, list);
      else
         first = NULL;

      qbuf->refcount--;
      if (qbuf->refcount)
         continue;

      if (qbuf->list.next == &sctx->shader_query_buffers)
         continue; /* keep the most recent buffer; it may not be full yet */
      if (qbuf->list.prev == &sctx->shader_query_buffers)
         continue; /* keep the oldest buffer for recycling */

      list_del(&qbuf->list);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}

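/* Make sure the most recent query buffer has room for one more result chunk:
 * reuse the current buffer if it has space, recycle the oldest idle buffer,
 * or allocate a new one, then bind the next free chunk as the GS query
 * shader buffer. */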
static bool gfx10_alloc_query_buffer(struct si_context *sctx)
{
   if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
      return true;

   struct gfx10_sh_query_buffer *qbuf = NULL;

   if (!list_is_empty(&sctx->shader_query_buffers)) {
      qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
         goto success;

      qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      if (!qbuf->refcount &&
          !si_cs_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
          sctx->ws->buffer_wait(sctx->ws, qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
         /* Can immediately re-use the oldest buffer */
         list_del(&qbuf->list);
      } else {
         qbuf = NULL;
      }
   }

   if (!qbuf) {
      qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
      if (unlikely(!qbuf))
         return false;

      struct si_screen *screen = sctx->screen;
      unsigned buf_size =
         MAX2(sizeof(struct gfx10_sh_query_buffer_mem), screen->info.min_alloc_size);
      qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
      if (unlikely(!qbuf->buf)) {
         FREE(qbuf);
         return false;
      }
   }

   /* The buffer is currently unused by the GPU. Initialize it.
    *
    * We need to set the high bit of all the primitive counters for
    * compatibility with the SET_PREDICATION packet.
    */
   uint64_t *results = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL,
                                            PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
   assert(results);

   for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); i < e;
        ++i) {
      for (unsigned j = 0; j < 16; ++j)
         results[32 * i + j] = (uint64_t)1 << 63;
      results[32 * i + 16] = 0; /* fence */
   }

   list_addtail(&qbuf->list, &sctx->shader_query_buffers);
   qbuf->head = 0;
   qbuf->refcount = sctx->num_active_shader_queries;

success:;
   struct pipe_shader_buffer sbuf;
   sbuf.buffer = &qbuf->buf->b.b;
   sbuf.buffer_offset = qbuf->head;
   sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
   si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, &sbuf);
   SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 1);

   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
   return true;
}


static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   gfx10_release_query_buffers(sctx, query->first, query->last);
   FREE(query);
}

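/* Begin the query: make sure a result chunk is available and remember the
 * current position in the buffer chain as the start of the result range. */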
static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   gfx10_release_query_buffers(sctx, query->first, query->last);
   query->first = query->last = NULL;

   if (unlikely(!gfx10_alloc_query_buffer(sctx)))
      return false;

   query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   query->first_begin = query->first->head;

   sctx->num_active_shader_queries++;
   query->first->refcount++;

   return true;
}

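/* End the query: record the end of the result range and emit a bottom-of-pipe
 * fence for the last chunk so that readers can wait for the results. */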
static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */

   query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   query->last_end = query->last->head;

   /* Signal the fence of the previous chunk */
   if (query->last_end != 0) {
      uint64_t fence_va = query->last->buf->gpu_address;
      fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
      fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
      si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
                        EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va,
                        0xffffffff, PIPE_QUERY_GPU_FINISHED);
   }

   sctx->num_active_shader_queries--;

   /* Only unbind the query buffer once no query is active anymore; unbinding
    * it while other queries are still running would lose their results. */
   if (sctx->num_active_shader_queries <= 0) {
      si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
      SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 0);

      /* If a query_begin is followed by a query_end without a draw
       * in-between, we need to clear the atom to ensure that the
       * next query_begin will re-initialize the shader buffer. */
      si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
   }

   return true;
}

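/* Accumulate the counters of one result chunk into *result. The mask strips
 * bit 63, which the buffer initialization sets on every primitive counter for
 * compatibility with the SET_PREDICATION packet. */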
static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
                                      struct gfx10_sh_query_buffer_mem *qmem,
                                      union pipe_query_result *result)
{
   static const uint64_t mask = ((uint64_t)1 << 63) - 1;

   switch (query->b.type) {
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      result->u64 += qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_STATISTICS:
      result->so_statistics.num_primitives_written +=
         qmem->stream[query->stream].emitted_primitives & mask;
      result->so_statistics.primitives_storage_needed +=
         qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      result->b |= qmem->stream[query->stream].emitted_primitives !=
                   qmem->stream[query->stream].generated_primitives;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
         result->b |= qmem->stream[stream].emitted_primitives !=
                      qmem->stream[stream].generated_primitives;
      }
      break;
   default:
      assert(0);
   }
}

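/* CPU readback path: map each buffer in the chain (from last to first) and
 * accumulate all chunks that fall within the query's result range. */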
static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait,
                                      union pipe_query_result *result)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   util_query_clear_result(result, query->b.type);

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */
   assert(query->last);

   for (struct gfx10_sh_query_buffer *qbuf = query->last;;
        qbuf = list_entry(qbuf->list.prev, struct gfx10_sh_query_buffer, list)) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      void *map;

      if (rquery->b.flushed)
         map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      unsigned results_begin = 0;
      unsigned results_end = qbuf->head;
      if (qbuf == query->first)
         results_begin = query->first_begin;
      if (qbuf == query->last)
         results_end = query->last_end;

      while (results_begin != results_end) {
         struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
         results_begin += sizeof(*qmem);

         gfx10_sh_query_add_result(query, qmem, result);
      }

      if (qbuf == query->first)
         break;
   }

   return true;
}

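/* GPU-side readback: reduce the result chunks with a compute shader and write
 * the final value (or, for index < 0, the availability flag) into the given
 * destination resource. */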
static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery,
                                               enum pipe_query_flags flags,
                                               enum pipe_query_value_type result_type,
                                               int index, struct pipe_resource *resource,
                                               unsigned offset)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   struct si_qbo_state saved_state = {};
   struct pipe_resource *tmp_buffer = NULL;
   unsigned tmp_buffer_offset = 0;

   if (!sctx->sh_query_result_shader) {
      sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
      if (!sctx->sh_query_result_shader)
         return;
   }

   if (query->first != query->last) {
      u_suballocator_alloc(&sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
      if (!tmp_buffer)
         return;
   }

   si_save_qbo_state(sctx, &saved_state);

   /* Pre-fill the constants configuring the shader behavior. */
   struct {
      uint32_t config;
      uint32_t offset;
      uint32_t chain;
      uint32_t result_count;
   } consts;
   struct pipe_constant_buffer constant_buffer = {};

   if (index >= 0) {
      switch (query->b.type) {
      case PIPE_QUERY_PRIMITIVES_GENERATED:
         consts.offset = 4 * sizeof(uint64_t) * query->stream + 2 * sizeof(uint64_t);
         consts.config = 0;
         break;
      case PIPE_QUERY_PRIMITIVES_EMITTED:
         consts.offset = 4 * sizeof(uint64_t) * query->stream + 3 * sizeof(uint64_t);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_STATISTICS:
         consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
         consts.offset = 4 * sizeof(uint64_t) * query->stream;
         consts.config = 2;
         break;
      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
         consts.offset = 0;
         consts.config = 3;
         break;
      default:
         unreachable("bad query type");
      }
   } else {
      /* Check result availability. */
      consts.offset = 0;
      consts.config = 1;
   }

   if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
      consts.config |= 8;

   constant_buffer.buffer_size = sizeof(consts);
   constant_buffer.user_buffer = &consts;

   /* Pre-fill the SSBOs and grid. */
   struct pipe_shader_buffer ssbo[3];
   struct pipe_grid_info grid = {};

   ssbo[1].buffer = tmp_buffer;
   ssbo[1].buffer_offset = tmp_buffer_offset;
   ssbo[1].buffer_size = 16;

   ssbo[2] = ssbo[1];

   grid.block[0] = 1;
   grid.block[1] = 1;
   grid.block[2] = 1;
   grid.grid[0] = 1;
   grid.grid[1] = 1;
   grid.grid[2] = 1;

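   /* Walk the buffer chain from first to last. Each dispatch reduces one
    * buffer's chunks; intermediate results are carried through tmp_buffer
    * (ssbo[1]/ssbo[2] chaining) and the final dispatch writes to the
    * destination resource. */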
   struct gfx10_sh_query_buffer *qbuf = query->first;
   for (;;) {
      unsigned begin = qbuf == query->first ? query->first_begin : 0;
      unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;

      /* begin == end == 0 happens for a query with no draws in between;
       * still launch the shader so the result and its availability get
       * written (skipping with `continue` would never advance qbuf and
       * thus loop forever). */

      ssbo[0].buffer = &qbuf->buf->b.b;
      ssbo[0].buffer_offset = begin;
      ssbo[0].buffer_size = end - begin;

      consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
      consts.chain = 0;
      if (qbuf != query->first)
         consts.chain |= 1;
      if (qbuf != query->last)
         consts.chain |= 2;

      if (qbuf == query->last) {
         ssbo[2].buffer = resource;
         ssbo[2].buffer_offset = offset;
         ssbo[2].buffer_size = 8;
      }

      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);

      /* An empty range has no chunk whose fence we could wait on, and no
       * fence was emitted for it. */
      if ((flags & PIPE_QUERY_WAIT) && end) {
         uint64_t va;

         /* Wait for result availability. Wait only for readiness
          * of the last entry, since the fence writes should be
          * serialized in the CP.
          */
         va = qbuf->buf->gpu_address;
         va += end - sizeof(struct gfx10_sh_query_buffer_mem);
         va += offsetof(struct gfx10_sh_query_buffer_mem, fence);

         si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
      }

      /* ssbo[2] is either tmp_buffer or resource */
      assert(ssbo[2].buffer);
      si_launch_grid_internal_ssbos(sctx, &grid, sctx->sh_query_result_shader,
                                    SI_OP_SYNC_PS_BEFORE | SI_OP_SYNC_AFTER, SI_COHERENCY_SHADER,
                                    3, ssbo, (1 << 2) | (ssbo[1].buffer ? 1 << 1 : 0));

      if (qbuf == query->last)
         break;
      qbuf = list_entry(qbuf->list.next, struct gfx10_sh_query_buffer, list);
   }

   si_restore_qbo_state(sctx, &saved_state);
   pipe_resource_reference(&tmp_buffer, NULL);
}

static const struct si_query_ops gfx10_sh_query_ops = {
   .destroy = gfx10_sh_query_destroy,
   .begin = gfx10_sh_query_begin,
   .end = gfx10_sh_query_end,
   .get_result = gfx10_sh_query_get_result,
   .get_result_resource = gfx10_sh_query_get_result_resource,
};

struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
                                         unsigned index)
{
   struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
   if (unlikely(!query))
      return NULL;

   query->b.ops = &gfx10_sh_query_ops;
   query->b.type = query_type;
   query->stream = index;

   return (struct pipe_query *)query;
}

void gfx10_init_query(struct si_context *sctx)
{
   list_inithead(&sctx->shader_query_buffers);
   sctx->atoms.s.shader_query.emit = emit_shader_query;
}

void gfx10_destroy_query(struct si_context *sctx)
{
   if (!sctx->shader_query_buffers.next)
      return;

   while (!list_is_empty(&sctx->shader_query_buffers)) {
      struct gfx10_sh_query_buffer *qbuf =
         list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      list_del(&qbuf->list);

      assert(!qbuf->refcount);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}