/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 * Copyright 2014 Marek Olšák <marek.olsak@amd.com>
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_query.h"
#include "si_build_pm4.h"

#include "amd/common/sid.h"
#include "si_pipe.h"
#include "util/os_time.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"
#include "util/u_upload_mgr.h"

static const struct si_query_ops query_hw_ops;

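/* Offsets of the begin/end values and the fence within one HW query result sample,
 * plus the stride and count of begin/end pairs (e.g. one pair per render backend
 * or per stream). Filled in by si_get_hw_query_params. */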
struct si_hw_query_params {
   unsigned start_offset;
   unsigned end_offset;
   unsigned fence_offset;
   unsigned pair_stride;
   unsigned pair_count;
};

/* Queries without buffer handling or suspend/resume. */
struct si_query_sw {
   struct si_query b;

   uint64_t begin_result;
   uint64_t end_result;

   uint64_t begin_time;
   uint64_t end_time;

   /* Fence for GPU_FINISHED. */
   struct pipe_fence_handle *fence;
};

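/* Free a software query and drop the GPU_FINISHED fence reference, if any. */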
static void si_query_sw_destroy(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_sw *query = (struct si_query_sw *)squery;

   sctx->b.screen->fence_reference(sctx->b.screen, &query->fence, NULL);
   FREE(query);
}

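/* Map a driver-specific software query type to the corresponding winsys counter. */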
static enum radeon_value_id winsys_id_from_type(unsigned type)
{
   switch (type) {
   case SI_QUERY_REQUESTED_VRAM:
      return RADEON_REQUESTED_VRAM_MEMORY;
   case SI_QUERY_REQUESTED_GTT:
      return RADEON_REQUESTED_GTT_MEMORY;
   case SI_QUERY_MAPPED_VRAM:
      return RADEON_MAPPED_VRAM;
   case SI_QUERY_MAPPED_GTT:
      return RADEON_MAPPED_GTT;
   case SI_QUERY_SLAB_WASTED_VRAM:
      return RADEON_SLAB_WASTED_VRAM;
   case SI_QUERY_SLAB_WASTED_GTT:
      return RADEON_SLAB_WASTED_GTT;
   case SI_QUERY_BUFFER_WAIT_TIME:
      return RADEON_BUFFER_WAIT_TIME_NS;
   case SI_QUERY_NUM_MAPPED_BUFFERS:
      return RADEON_NUM_MAPPED_BUFFERS;
   case SI_QUERY_NUM_GFX_IBS:
      return RADEON_NUM_GFX_IBS;
   case SI_QUERY_GFX_BO_LIST_SIZE:
      return RADEON_GFX_BO_LIST_COUNTER;
   case SI_QUERY_GFX_IB_SIZE:
      return RADEON_GFX_IB_SIZE_COUNTER;
   case SI_QUERY_NUM_BYTES_MOVED:
      return RADEON_NUM_BYTES_MOVED;
   case SI_QUERY_NUM_EVICTIONS:
      return RADEON_NUM_EVICTIONS;
   case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS:
      return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
   case SI_QUERY_VRAM_USAGE:
      return RADEON_VRAM_USAGE;
   case SI_QUERY_VRAM_VIS_USAGE:
      return RADEON_VRAM_VIS_USAGE;
   case SI_QUERY_GTT_USAGE:
      return RADEON_GTT_USAGE;
   case SI_QUERY_GPU_TEMPERATURE:
      return RADEON_GPU_TEMPERATURE;
   case SI_QUERY_CURRENT_GPU_SCLK:
      return RADEON_CURRENT_SCLK;
   case SI_QUERY_CURRENT_GPU_MCLK:
      return RADEON_CURRENT_MCLK;
   case SI_QUERY_CS_THREAD_BUSY:
      return RADEON_CS_THREAD_TIME;
   default:
      unreachable("query type does not correspond to winsys id");
   }
}

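/* Record the counter value at query begin; most results are later computed as end - begin. */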
static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_sw *query = (struct si_query_sw *)squery;
   enum radeon_value_id ws_id;

   switch (query->b.type) {
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
   case PIPE_QUERY_GPU_FINISHED:
      break;
   case SI_QUERY_DRAW_CALLS:
      query->begin_result = sctx->num_draw_calls;
      break;
   case SI_QUERY_DECOMPRESS_CALLS:
      query->begin_result = sctx->num_decompress_calls;
      break;
   case SI_QUERY_PRIM_RESTART_CALLS:
      query->begin_result = sctx->num_prim_restart_calls;
      break;
   case SI_QUERY_COMPUTE_CALLS:
      query->begin_result = sctx->num_compute_calls;
      break;
   case SI_QUERY_CP_DMA_CALLS:
      query->begin_result = sctx->num_cp_dma_calls;
      break;
   case SI_QUERY_NUM_VS_FLUSHES:
      query->begin_result = sctx->num_vs_flushes;
      break;
   case SI_QUERY_NUM_PS_FLUSHES:
      query->begin_result = sctx->num_ps_flushes;
      break;
   case SI_QUERY_NUM_CS_FLUSHES:
      query->begin_result = sctx->num_cs_flushes;
      break;
   case SI_QUERY_NUM_CB_CACHE_FLUSHES:
      query->begin_result = sctx->num_cb_cache_flushes;
      break;
   case SI_QUERY_NUM_DB_CACHE_FLUSHES:
      query->begin_result = sctx->num_db_cache_flushes;
      break;
   case SI_QUERY_NUM_L2_INVALIDATES:
      query->begin_result = sctx->num_L2_invalidates;
      break;
   case SI_QUERY_NUM_L2_WRITEBACKS:
      query->begin_result = sctx->num_L2_writebacks;
      break;
   case SI_QUERY_NUM_RESIDENT_HANDLES:
      query->begin_result = sctx->num_resident_handles;
      break;
   case SI_QUERY_TC_OFFLOADED_SLOTS:
      query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
      break;
   case SI_QUERY_TC_DIRECT_SLOTS:
      query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
      break;
   case SI_QUERY_TC_NUM_SYNCS:
      query->begin_result = sctx->tc ? sctx->tc->num_syncs : 0;
      break;
   case SI_QUERY_REQUESTED_VRAM:
   case SI_QUERY_REQUESTED_GTT:
   case SI_QUERY_MAPPED_VRAM:
   case SI_QUERY_MAPPED_GTT:
   case SI_QUERY_SLAB_WASTED_VRAM:
   case SI_QUERY_SLAB_WASTED_GTT:
   case SI_QUERY_VRAM_USAGE:
   case SI_QUERY_VRAM_VIS_USAGE:
   case SI_QUERY_GTT_USAGE:
   case SI_QUERY_GPU_TEMPERATURE:
   case SI_QUERY_CURRENT_GPU_SCLK:
   case SI_QUERY_CURRENT_GPU_MCLK:
   case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
   case SI_QUERY_NUM_MAPPED_BUFFERS:
      query->begin_result = 0;
      break;
   case SI_QUERY_BUFFER_WAIT_TIME:
   case SI_QUERY_GFX_IB_SIZE:
   case SI_QUERY_NUM_GFX_IBS:
   case SI_QUERY_NUM_BYTES_MOVED:
   case SI_QUERY_NUM_EVICTIONS:
   case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
      enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
      query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
      break;
   }
   case SI_QUERY_GFX_BO_LIST_SIZE:
      ws_id = winsys_id_from_type(query->b.type);
      query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
      query->begin_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
      break;
   case SI_QUERY_CS_THREAD_BUSY:
      ws_id = winsys_id_from_type(query->b.type);
      query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
      query->begin_time = os_time_get_nano();
      break;
   case SI_QUERY_GALLIUM_THREAD_BUSY:
      query->begin_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
      query->begin_time = os_time_get_nano();
      break;
   case SI_QUERY_GPU_LOAD:
   case SI_QUERY_GPU_SHADERS_BUSY:
   case SI_QUERY_GPU_TA_BUSY:
   case SI_QUERY_GPU_GDS_BUSY:
   case SI_QUERY_GPU_VGT_BUSY:
   case SI_QUERY_GPU_IA_BUSY:
   case SI_QUERY_GPU_SX_BUSY:
   case SI_QUERY_GPU_WD_BUSY:
   case SI_QUERY_GPU_BCI_BUSY:
   case SI_QUERY_GPU_SC_BUSY:
   case SI_QUERY_GPU_PA_BUSY:
   case SI_QUERY_GPU_DB_BUSY:
   case SI_QUERY_GPU_CP_BUSY:
   case SI_QUERY_GPU_CB_BUSY:
   case SI_QUERY_GPU_SDMA_BUSY:
   case SI_QUERY_GPU_PFP_BUSY:
   case SI_QUERY_GPU_MEQ_BUSY:
   case SI_QUERY_GPU_ME_BUSY:
   case SI_QUERY_GPU_SURF_SYNC_BUSY:
   case SI_QUERY_GPU_CP_DMA_BUSY:
   case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
      query->begin_result = si_begin_counter(sctx->screen, query->b.type);
      break;
   case SI_QUERY_NUM_COMPILATIONS:
      query->begin_result = p_atomic_read(&sctx->screen->num_compilations);
      break;
   case SI_QUERY_NUM_SHADERS_CREATED:
      query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created);
      break;
   case SI_QUERY_LIVE_SHADER_CACHE_HITS:
      query->begin_result = sctx->screen->live_shader_cache.hits;
      break;
   case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
      query->begin_result = sctx->screen->live_shader_cache.misses;
      break;
   case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
      query->begin_result = sctx->screen->num_memory_shader_cache_hits;
      break;
   case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
      query->begin_result = sctx->screen->num_memory_shader_cache_misses;
      break;
   case SI_QUERY_DISK_SHADER_CACHE_HITS:
      query->begin_result = sctx->screen->num_disk_shader_cache_hits;
      break;
   case SI_QUERY_DISK_SHADER_CACHE_MISSES:
      query->begin_result = sctx->screen->num_disk_shader_cache_misses;
      break;
   case SI_QUERY_GPIN_ASIC_ID:
   case SI_QUERY_GPIN_NUM_SIMD:
   case SI_QUERY_GPIN_NUM_RB:
   case SI_QUERY_GPIN_NUM_SPI:
   case SI_QUERY_GPIN_NUM_SE:
      break;
   default:
      unreachable("si_query_sw_begin: bad query type");
   }

   return true;
}

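/* Record the counter value at query end; GPU_FINISHED issues a deferred flush and keeps the fence. */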
static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_sw *query = (struct si_query_sw *)squery;
   enum radeon_value_id ws_id;

   switch (query->b.type) {
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      break;
   case PIPE_QUERY_GPU_FINISHED:
      sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
      break;
   case SI_QUERY_DRAW_CALLS:
      query->end_result = sctx->num_draw_calls;
      break;
   case SI_QUERY_DECOMPRESS_CALLS:
      query->end_result = sctx->num_decompress_calls;
      break;
   case SI_QUERY_PRIM_RESTART_CALLS:
      query->end_result = sctx->num_prim_restart_calls;
      break;
   case SI_QUERY_COMPUTE_CALLS:
      query->end_result = sctx->num_compute_calls;
      break;
   case SI_QUERY_CP_DMA_CALLS:
      query->end_result = sctx->num_cp_dma_calls;
      break;
   case SI_QUERY_NUM_VS_FLUSHES:
      query->end_result = sctx->num_vs_flushes;
      break;
   case SI_QUERY_NUM_PS_FLUSHES:
      query->end_result = sctx->num_ps_flushes;
      break;
   case SI_QUERY_NUM_CS_FLUSHES:
      query->end_result = sctx->num_cs_flushes;
      break;
   case SI_QUERY_NUM_CB_CACHE_FLUSHES:
      query->end_result = sctx->num_cb_cache_flushes;
      break;
   case SI_QUERY_NUM_DB_CACHE_FLUSHES:
      query->end_result = sctx->num_db_cache_flushes;
      break;
   case SI_QUERY_NUM_L2_INVALIDATES:
      query->end_result = sctx->num_L2_invalidates;
      break;
   case SI_QUERY_NUM_L2_WRITEBACKS:
      query->end_result = sctx->num_L2_writebacks;
      break;
   case SI_QUERY_NUM_RESIDENT_HANDLES:
      query->end_result = sctx->num_resident_handles;
      break;
   case SI_QUERY_TC_OFFLOADED_SLOTS:
      query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
      break;
   case SI_QUERY_TC_DIRECT_SLOTS:
      query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
      break;
   case SI_QUERY_TC_NUM_SYNCS:
      query->end_result = sctx->tc ? sctx->tc->num_syncs : 0;
      break;
   case SI_QUERY_REQUESTED_VRAM:
   case SI_QUERY_REQUESTED_GTT:
   case SI_QUERY_MAPPED_VRAM:
   case SI_QUERY_MAPPED_GTT:
   case SI_QUERY_SLAB_WASTED_VRAM:
   case SI_QUERY_SLAB_WASTED_GTT:
   case SI_QUERY_VRAM_USAGE:
   case SI_QUERY_VRAM_VIS_USAGE:
   case SI_QUERY_GTT_USAGE:
   case SI_QUERY_GPU_TEMPERATURE:
   case SI_QUERY_CURRENT_GPU_SCLK:
   case SI_QUERY_CURRENT_GPU_MCLK:
   case SI_QUERY_BUFFER_WAIT_TIME:
   case SI_QUERY_GFX_IB_SIZE:
   case SI_QUERY_NUM_MAPPED_BUFFERS:
   case SI_QUERY_NUM_GFX_IBS:
   case SI_QUERY_NUM_BYTES_MOVED:
   case SI_QUERY_NUM_EVICTIONS:
   case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
      enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
      query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
      break;
   }
   case SI_QUERY_GFX_BO_LIST_SIZE:
      ws_id = winsys_id_from_type(query->b.type);
      query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
      query->end_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
      break;
   case SI_QUERY_CS_THREAD_BUSY:
      ws_id = winsys_id_from_type(query->b.type);
      query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
      query->end_time = os_time_get_nano();
      break;
   case SI_QUERY_GALLIUM_THREAD_BUSY:
      query->end_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
      query->end_time = os_time_get_nano();
      break;
   case SI_QUERY_GPU_LOAD:
   case SI_QUERY_GPU_SHADERS_BUSY:
   case SI_QUERY_GPU_TA_BUSY:
   case SI_QUERY_GPU_GDS_BUSY:
   case SI_QUERY_GPU_VGT_BUSY:
   case SI_QUERY_GPU_IA_BUSY:
   case SI_QUERY_GPU_SX_BUSY:
   case SI_QUERY_GPU_WD_BUSY:
   case SI_QUERY_GPU_BCI_BUSY:
   case SI_QUERY_GPU_SC_BUSY:
   case SI_QUERY_GPU_PA_BUSY:
   case SI_QUERY_GPU_DB_BUSY:
   case SI_QUERY_GPU_CP_BUSY:
   case SI_QUERY_GPU_CB_BUSY:
   case SI_QUERY_GPU_SDMA_BUSY:
   case SI_QUERY_GPU_PFP_BUSY:
   case SI_QUERY_GPU_MEQ_BUSY:
   case SI_QUERY_GPU_ME_BUSY:
   case SI_QUERY_GPU_SURF_SYNC_BUSY:
   case SI_QUERY_GPU_CP_DMA_BUSY:
   case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
      query->end_result = si_end_counter(sctx->screen, query->b.type, query->begin_result);
      query->begin_result = 0;
      break;
   case SI_QUERY_NUM_COMPILATIONS:
      query->end_result = p_atomic_read(&sctx->screen->num_compilations);
      break;
   case SI_QUERY_NUM_SHADERS_CREATED:
      query->end_result = p_atomic_read(&sctx->screen->num_shaders_created);
      break;
   case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
      query->end_result = sctx->last_tex_ps_draw_ratio;
      break;
   case SI_QUERY_LIVE_SHADER_CACHE_HITS:
      query->end_result = sctx->screen->live_shader_cache.hits;
      break;
   case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
      query->end_result = sctx->screen->live_shader_cache.misses;
      break;
   case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
      query->end_result = sctx->screen->num_memory_shader_cache_hits;
      break;
   case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
      query->end_result = sctx->screen->num_memory_shader_cache_misses;
      break;
   case SI_QUERY_DISK_SHADER_CACHE_HITS:
      query->end_result = sctx->screen->num_disk_shader_cache_hits;
      break;
   case SI_QUERY_DISK_SHADER_CACHE_MISSES:
      query->end_result = sctx->screen->num_disk_shader_cache_misses;
      break;
   case SI_QUERY_GPIN_ASIC_ID:
   case SI_QUERY_GPIN_NUM_SIMD:
   case SI_QUERY_GPIN_NUM_RB:
   case SI_QUERY_GPIN_NUM_SPI:
   case SI_QUERY_GPIN_NUM_SE:
      break;
   default:
      unreachable("si_query_sw_end: bad query type");
   }

   return true;
}

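/* Compute the final software query result, converting to the expected units where needed. */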
static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
                                   union pipe_query_result *result)
{
   struct si_query_sw *query = (struct si_query_sw *)squery;

   switch (query->b.type) {
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      /* Convert from cycles per millisecond to cycles per second (Hz). */
      result->timestamp_disjoint.frequency = (uint64_t)sctx->screen->info.clock_crystal_freq * 1000;
      result->timestamp_disjoint.disjoint = false;
      return true;
   case PIPE_QUERY_GPU_FINISHED: {
      struct pipe_screen *screen = sctx->b.screen;
      struct pipe_context *ctx = squery->b.flushed ? NULL : &sctx->b;

      result->b = screen->fence_finish(screen, ctx, query->fence, wait ? PIPE_TIMEOUT_INFINITE : 0);
      return result->b;
   }

   case SI_QUERY_GFX_BO_LIST_SIZE:
      result->u64 =
         (query->end_result - query->begin_result) / (query->end_time - query->begin_time);
      return true;
   case SI_QUERY_CS_THREAD_BUSY:
   case SI_QUERY_GALLIUM_THREAD_BUSY:
      result->u64 =
         (query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time);
      return true;
   case SI_QUERY_GPIN_ASIC_ID:
      result->u32 = 0;
      return true;
   case SI_QUERY_GPIN_NUM_SIMD:
      result->u32 = sctx->screen->info.num_cu;
      return true;
   case SI_QUERY_GPIN_NUM_RB:
      result->u32 = sctx->screen->info.max_render_backends;
      return true;
   case SI_QUERY_GPIN_NUM_SPI:
      result->u32 = 1; /* all supported chips have one SPI per SE */
      return true;
   case SI_QUERY_GPIN_NUM_SE:
      result->u32 = sctx->screen->info.max_se;
      return true;
   }

   result->u64 = query->end_result - query->begin_result;

   switch (query->b.type) {
   case SI_QUERY_BUFFER_WAIT_TIME:
   case SI_QUERY_GPU_TEMPERATURE:
      result->u64 /= 1000;
      break;
   case SI_QUERY_CURRENT_GPU_SCLK:
   case SI_QUERY_CURRENT_GPU_MCLK:
      result->u64 *= 1000000;
      break;
   }

   return true;
}

static const struct si_query_ops sw_query_ops = {.destroy = si_query_sw_destroy,
                                                 .begin = si_query_sw_begin,
                                                 .end = si_query_sw_end,
                                                 .get_result = si_query_sw_get_result,
                                                 .get_result_resource = NULL};

static struct pipe_query *si_query_sw_create(unsigned query_type)
{
   struct si_query_sw *query;

   query = CALLOC_STRUCT(si_query_sw);
   if (!query)
      return NULL;

   query->b.type = query_type;
   query->b.ops = &sw_query_ops;

   return (struct pipe_query *)query;
}

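/* Release the whole chain of query result buffers. */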
void si_query_buffer_destroy(struct si_screen *sscreen, struct si_query_buffer *buffer)
{
   struct si_query_buffer *prev = buffer->previous;

   /* Release all query buffers. */
   while (prev) {
      struct si_query_buffer *qbuf = prev;
      prev = prev->previous;
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }

   si_resource_reference(&buffer->buf, NULL);
}

void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer)
{
   /* Discard all query buffers except for the oldest. */
   while (buffer->previous) {
      struct si_query_buffer *qbuf = buffer->previous;
      buffer->previous = qbuf->previous;

      si_resource_reference(&buffer->buf, NULL);
      buffer->buf = qbuf->buf; /* move ownership */
      FREE(qbuf);
   }
   buffer->results_end = 0;

   if (!buffer->buf)
      return;

   /* Discard even the oldest buffer if it can't be mapped without a stall. */
   if (si_cs_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) ||
       !sctx->ws->buffer_wait(sctx->ws, buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) {
      si_resource_reference(&buffer->buf, NULL);
   } else {
      buffer->unprepared = true;
   }
}

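/* Make sure the query buffer has room for 'size' more bytes of results; allocate
 * (and optionally prepare) a new buffer when the current one is full or missing. */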
bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer,
                           bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *),
                           unsigned size)
{
   bool unprepared = buffer->unprepared;
   buffer->unprepared = false;

   if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) {
      if (buffer->buf) {
         struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
         memcpy(qbuf, buffer, sizeof(*qbuf));
         buffer->previous = qbuf;
      }
      buffer->results_end = 0;

      /* Queries are normally read by the CPU after
       * being written by the GPU, so staging is probably a good
       * usage pattern.
       */
      struct si_screen *screen = sctx->screen;
      unsigned buf_size = MAX2(size, screen->info.min_alloc_size);
      buffer->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
      if (unlikely(!buffer->buf))
         return false;
      unprepared = true;
   }

   if (unprepared && prepare_buffer) {
      if (unlikely(!prepare_buffer(sctx, buffer))) {
         si_resource_reference(&buffer->buf, NULL);
         return false;
      }
   }

   return true;
}

void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_hw *query = (struct si_query_hw *)squery;

   si_query_buffer_destroy(sctx->screen, &query->buffer);
   si_resource_reference(&query->workaround_buf, NULL);
   FREE(squery);
}

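/* Clear a freshly allocated query buffer; for occlusion queries, pre-set the
 * availability bits of disabled render backends so their results read as complete. */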
static bool si_query_hw_prepare_buffer(struct si_context *sctx, struct si_query_buffer *qbuf)
{
   struct si_query_hw *query = container_of(qbuf, struct si_query_hw, buffer);
   struct si_screen *screen = sctx->screen;

   /* The caller ensures that the buffer is currently unused by the GPU. */
   uint32_t *results = screen->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL,
                                              PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
   if (!results)
      return false;

   memset(results, 0, qbuf->buf->b.b.width0);

   if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
       query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
       query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
      unsigned max_rbs = screen->info.max_render_backends;
      unsigned enabled_rb_mask = screen->info.enabled_rb_mask;
      unsigned num_results;
      unsigned i, j;

      /* Set top bits for unused backends. */
      num_results = qbuf->buf->b.b.width0 / query->result_size;
      for (j = 0; j < num_results; j++) {
         for (i = 0; i < max_rbs; i++) {
            if (!(enabled_rb_mask & (1 << i))) {
               results[(i * 4) + 1] = 0x80000000;
               results[(i * 4) + 3] = 0x80000000;
            }
         }
         results += 4 * max_rbs;
      }
   }

   return true;
}

static unsigned si_query_pipestats_num_results(struct si_screen *sscreen)
{
   return sscreen->info.gfx_level >= GFX11 ? 14 : 11;
}

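/* Dword offset of a pipeline-statistics counter within the sample written by SAMPLE_PIPELINESTAT. */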
static unsigned si_query_pipestat_dw_offset(enum pipe_statistics_query_index index)
{
   switch (index) {
   case PIPE_STAT_QUERY_PS_INVOCATIONS: return 0;
   case PIPE_STAT_QUERY_C_PRIMITIVES: return 2;
   case PIPE_STAT_QUERY_C_INVOCATIONS: return 4;
   case PIPE_STAT_QUERY_VS_INVOCATIONS: return 6;
   case PIPE_STAT_QUERY_GS_INVOCATIONS: return 8;
   case PIPE_STAT_QUERY_GS_PRIMITIVES: return 10;
   case PIPE_STAT_QUERY_IA_PRIMITIVES: return 12;
   case PIPE_STAT_QUERY_IA_VERTICES: return 14;
   case PIPE_STAT_QUERY_HS_INVOCATIONS: return 16;
   case PIPE_STAT_QUERY_DS_INVOCATIONS: return 18;
   case PIPE_STAT_QUERY_CS_INVOCATIONS: return 20;
   /* gfx11: MS_INVOCATIONS */
   /* gfx11: MS_PRIMITIVES */
   /* gfx11: TS_INVOCATIONS */
   default:
      assert(false);
   }
   return ~0;
}

unsigned si_query_pipestat_end_dw_offset(struct si_screen *sscreen,
                                         enum pipe_statistics_query_index index)
{
   return si_query_pipestats_num_results(sscreen) * 2 + si_query_pipestat_dw_offset(index);
}

static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
                                            enum pipe_query_flags flags,
                                            enum pipe_query_value_type result_type,
                                            int index, struct pipe_resource *resource,
                                            unsigned offset);

static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
                                      struct si_resource *buffer, uint64_t va);
static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
                                     struct si_resource *buffer, uint64_t va);
static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *, void *buffer,
                                   union pipe_query_result *result);
static void si_query_hw_clear_result(struct si_query_hw *, union pipe_query_result *);

static struct si_query_hw_ops query_hw_default_hw_ops = {
   .prepare_buffer = si_query_hw_prepare_buffer,
   .emit_start = si_query_hw_do_emit_start,
   .emit_stop = si_query_hw_do_emit_stop,
   .clear_result = si_query_hw_clear_result,
   .add_result = si_query_hw_add_result,
};

static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned query_type,
                                             unsigned index)
{
   struct si_query_hw *query = CALLOC_STRUCT(si_query_hw);
   if (!query)
      return NULL;

   query->b.type = query_type;
   query->b.ops = &query_hw_ops;
   query->ops = &query_hw_default_hw_ops;

   switch (query_type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      query->result_size = 16 * sscreen->info.max_render_backends;
      query->result_size += 16; /* for the fence + alignment */
      query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      query->result_size = 24;
      query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
      break;
   case PIPE_QUERY_TIMESTAMP:
      query->result_size = 16;
      query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
      query->flags = SI_QUERY_HW_FLAG_NO_START;
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_SO_STATISTICS:
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
      query->result_size = 32;
      query->b.num_cs_dw_suspend = 6;
      query->stream = index;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
      query->result_size = 32 * SI_MAX_STREAMS;
      query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS;
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      query->result_size = si_query_pipestats_num_results(sscreen) * 16;
      query->result_size += 8; /* for the fence + alignment */
      query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
      query->index = index;
      if ((index == PIPE_STAT_QUERY_GS_PRIMITIVES || index == PIPE_STAT_QUERY_GS_INVOCATIONS) &&
          sscreen->use_ngg && (sscreen->info.gfx_level >= GFX10 && sscreen->info.gfx_level <= GFX10_3))
         query->flags |= SI_QUERY_EMULATE_GS_COUNTERS;
      break;
   default:
      assert(0);
      FREE(query);
      return NULL;
   }

   return (struct pipe_query *)query;
}

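/* Track the number of active (perfect) occlusion queries and update the DB state
 * when the enable status changes. */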
static void si_update_occlusion_query_state(struct si_context *sctx, unsigned type, int diff)
{
   if (type == PIPE_QUERY_OCCLUSION_COUNTER || type == PIPE_QUERY_OCCLUSION_PREDICATE ||
       type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
      bool old_enable = sctx->num_occlusion_queries != 0;
      bool old_perfect_enable = sctx->num_perfect_occlusion_queries != 0;
      bool enable, perfect_enable;

      sctx->num_occlusion_queries += diff;
      assert(sctx->num_occlusion_queries >= 0);

      if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
         sctx->num_perfect_occlusion_queries += diff;
         assert(sctx->num_perfect_occlusion_queries >= 0);
      }

      enable = sctx->num_occlusion_queries != 0;
      perfect_enable = sctx->num_perfect_occlusion_queries != 0;

      if (enable != old_enable || perfect_enable != old_perfect_enable) {
         si_set_occlusion_query_state(sctx, old_perfect_enable);
      }
   }
}

static unsigned event_type_for_stream(unsigned stream)
{
   switch (stream) {
   default:
   case 0:
      return V_028A90_SAMPLE_STREAMOUTSTATS;
   case 1:
      return V_028A90_SAMPLE_STREAMOUTSTATS1;
   case 2:
      return V_028A90_SAMPLE_STREAMOUTSTATS2;
   case 3:
      return V_028A90_SAMPLE_STREAMOUTSTATS3;
   }
}

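/* Emit an EVENT_WRITE that samples the streamout statistics of the given stream into memory at 'va'. */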
static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, unsigned stream)
{
   radeon_begin(cs);
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
   radeon_emit(EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
   radeon_emit(va);
   radeon_emit(va >> 32);
   radeon_end();
}

static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
                                      struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   switch (query->b.type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
      radeon_begin(cs);
      if (sctx->gfx_level >= GFX11) {
         uint64_t rb_mask = BITFIELD64_MASK(sctx->screen->info.max_render_backends);

         radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
         radeon_emit(EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_CONTROL) | EVENT_INDEX(1));
         radeon_emit(PIXEL_PIPE_STATE_CNTL_COUNTER_ID(0) |
                     PIXEL_PIPE_STATE_CNTL_STRIDE(2) |
                     PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_LO(rb_mask));
         radeon_emit(PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_HI(rb_mask));
      }

      radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
      if (sctx->gfx_level >= GFX11)
         radeon_emit(EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
      else
         radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
      radeon_emit(va);
      radeon_emit(va >> 32);
      radeon_end();
      break;
   }
   case PIPE_QUERY_PRIMITIVES_EMITTED:
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_SO_STATISTICS:
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      emit_sample_streamout(cs, va, query->stream);
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
         emit_sample_streamout(cs, va + 32 * stream, stream);
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                        EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS: {
      if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) {
         /* The hw GS primitive counter doesn't work when ngg is active.
          * So if use_ngg is true, we don't use the hw version but instead
          * emulate it in the GS shader.
          * The value is written at the same position, so we don't need to
          * change anything else.
          * If ngg is enabled for the draw, the primitive count is written in
          * gfx10_ngg_gs_emit_epilogue. If ngg is disabled, the number of exported
          * vertices is stored in gs_emitted_vertices and the number of prim
          * is computed based on the output prim type in emit_gs_epilogue.
          */
         struct pipe_shader_buffer sbuf;
         sbuf.buffer = &buffer->b.b;
         sbuf.buffer_offset = query->buffer.results_end;
         sbuf.buffer_size = buffer->bo_size;
         si_set_internal_shader_buffer(sctx, SI_GS_QUERY_EMULATED_COUNTERS_BUF, &sbuf);
         SET_FIELD(sctx->current_gs_state, GS_STATE_PIPELINE_STATS_EMU, 1);

         const uint32_t zero = 0;
         radeon_begin(cs);
         /* Clear the emulated counter end value. We don't clear start because it's unused. */
         va += si_query_pipestat_end_dw_offset(sctx->screen, query->index) * 4;
         radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + 1, 0));
         radeon_emit(S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
         radeon_emit(va);
         radeon_emit(va >> 32);
         radeon_emit(zero);
         radeon_end();

         sctx->num_pipeline_stat_emulated_queries++;
      } else {
         radeon_begin(cs);
         radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
         radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
         radeon_emit(va);
         radeon_emit(va >> 32);
         radeon_end();
      }
      break;
   }
   default:
      assert(0);
   }
   radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, query->buffer.buf,
                             RADEON_USAGE_WRITE | RADEON_PRIO_QUERY);
}

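/* Allocate result space if needed, update query-related state and emit the begin-of-query packets. */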
static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *query)
{
   uint64_t va;

   if (!query->buffer.buf && query->flags & SI_QUERY_EMULATE_GS_COUNTERS)
      si_resource_reference(&query->buffer.buf, sctx->pipeline_stats_query_buf);

   /* Don't realloc pipeline_stats_query_buf */
   if ((!(query->flags & SI_QUERY_EMULATE_GS_COUNTERS) || !sctx->pipeline_stats_query_buf) &&
       !si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, query->result_size))
      return;

   if (query->flags & SI_QUERY_EMULATE_GS_COUNTERS)
      si_resource_reference(&sctx->pipeline_stats_query_buf, query->buffer.buf);

   si_update_occlusion_query_state(sctx, query->b.type, 1);
   si_update_prims_generated_query_state(sctx, query->b.type, 1);

   if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
      sctx->num_pipeline_stat_queries++;

   si_need_gfx_cs_space(sctx, 0);

   va = query->buffer.buf->gpu_address + query->buffer.results_end;
   query->ops->emit_start(sctx, query, query->buffer.buf, va);
}

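/* Emit the end-of-query packets and, where applicable, a fence write that marks the results as available. */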
static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
                                     struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   uint64_t fence_va = 0;

   switch (query->b.type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
      va += 8;
      radeon_begin(cs);
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
      if (sctx->gfx_level >= GFX11)
         radeon_emit(EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
      else
         radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
      radeon_emit(va);
      radeon_emit(va >> 32);
      radeon_end();

      fence_va = va + sctx->screen->info.max_render_backends * 16 - 8;
      break;
   }
   case PIPE_QUERY_PRIMITIVES_EMITTED:
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_SO_STATISTICS:
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      va += 16;
      emit_sample_streamout(cs, va, query->stream);
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      va += 16;
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
         emit_sample_streamout(cs, va + 32 * stream, stream);
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      va += 8;
      FALLTHROUGH;
   case PIPE_QUERY_TIMESTAMP:
      si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                        EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
      fence_va = va + 8;
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS: {
      unsigned sample_size = (query->result_size - 8) / 2;

      va += sample_size;

      radeon_begin(cs);
      if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) {
         radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));

         if (--sctx->num_pipeline_stat_emulated_queries == 0) {
            si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
            SET_FIELD(sctx->current_gs_state, GS_STATE_PIPELINE_STATS_EMU, 0);
         }
      } else {
         radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
         radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
         radeon_emit(va);
         radeon_emit(va >> 32);
      }
      radeon_end();

      fence_va = va + sample_size;
      break;
   }
   default:
      assert(0);
   }
   radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, query->buffer.buf,
                             RADEON_USAGE_WRITE | RADEON_PRIO_QUERY);

   if (fence_va) {
      si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                        EOP_DATA_SEL_VALUE_32BIT, query->buffer.buf, fence_va, 0x80000000,
                        query->b.type);
   }
}

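/* Emit the end of a query, allocating the buffer first for NO_START queries, and advance results_end. */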
static void si_query_hw_emit_stop(struct si_context *sctx, struct si_query_hw *query)
{
   uint64_t va;

   /* Queries that have a begin already allocated their buffer in begin_query. */
   if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
      si_need_gfx_cs_space(sctx, 0);
      if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer,
                                 query->result_size))
         return;
   }

   if (!query->buffer.buf)
      return; // previous buffer allocation failure

   /* emit end query */
   va = query->buffer.buf->gpu_address + query->buffer.results_end;

   query->ops->emit_stop(sctx, query, query->buffer.buf, va);

   query->buffer.results_end += query->result_size;

   si_update_occlusion_query_state(sctx, query->b.type, -1);
   si_update_prims_generated_query_state(sctx, query->b.type, -1);

   if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
      sctx->num_pipeline_stat_queries--;
}

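/* Emit a SET_PREDICATION packet that points at the query result at 'va'. */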
static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf, uint64_t va,
                               uint32_t op)
{
   struct radeon_cmdbuf *cs = &ctx->gfx_cs;

   radeon_begin(cs);

   if (ctx->gfx_level >= GFX9) {
      radeon_emit(PKT3(PKT3_SET_PREDICATION, 2, 0));
      radeon_emit(op);
      radeon_emit(va);
      radeon_emit(va >> 32);
   } else {
      radeon_emit(PKT3(PKT3_SET_PREDICATION, 1, 0));
      radeon_emit(va);
      radeon_emit(op | ((va >> 32) & 0xFF));
   }
   radeon_end();

   radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, buf, RADEON_USAGE_READ | RADEON_PRIO_QUERY);
}

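/* Emit render-condition predication packets covering every result buffer of the bound query. */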
static void si_emit_query_predication(struct si_context *ctx)
{
   uint32_t op;
   bool flag_wait, invert;

   struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
   if (!query)
      return;

   invert = ctx->render_cond_invert;
   flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
               ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;

   if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
                                          query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
      struct gfx10_sh_query *gfx10_query = (struct gfx10_sh_query *)query;
      struct gfx10_sh_query_buffer *qbuf, *first, *last;

      op = PRED_OP(PREDICATION_OP_PRIMCOUNT);

      /* if true then invert, see GL_ARB_conditional_render_inverted */
      if (!invert)
         op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
      else
         op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */

      op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;

      first = gfx10_query->first;
      last = gfx10_query->last;

      while (first) {
         qbuf = first;
         if (first != last)
            first = list_entry(qbuf->list.next, struct gfx10_sh_query_buffer, list);
         else
            first = NULL;

         unsigned results_base = gfx10_query->first_begin;
         uint64_t va_base = qbuf->buf->gpu_address;
         uint64_t va = va_base + results_base;

         unsigned begin = qbuf == gfx10_query->first ? gfx10_query->first_begin : 0;
         unsigned end = qbuf == gfx10_query->last ? gfx10_query->last_end : qbuf->buf->b.b.width0;

         unsigned count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
         do {
            if (gfx10_query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
               for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
                  emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * stream, op);

                  /* set CONTINUE bit for all packets except the first */
                  op |= PREDICATION_CONTINUE;
               }
            } else {
               emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * gfx10_query->stream, op);
               op |= PREDICATION_CONTINUE;
            }

            results_base += sizeof(struct gfx10_sh_query_buffer_mem);
         } while (count--);
      }
   } else {
      struct si_query_buffer *qbuf;

      if (query->workaround_buf) {
         op = PRED_OP(PREDICATION_OP_BOOL64);
      } else {
         switch (query->b.type) {
         case PIPE_QUERY_OCCLUSION_COUNTER:
         case PIPE_QUERY_OCCLUSION_PREDICATE:
         case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
            op = PRED_OP(PREDICATION_OP_ZPASS);
            break;
         case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
         case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
            op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
            invert = !invert;
            break;
         default:
            assert(0);
            return;
         }
      }

      /* if true then invert, see GL_ARB_conditional_render_inverted */
      if (invert)
         op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
      else
         op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */

      /* Use the value written by compute shader as a workaround. Note that
       * the wait flag does not apply in this predication mode.
       *
       * The shader outputs the result value to L2. Workarounds only affect GFX8
       * and later, where the CP reads data from L2, so we don't need an
       * additional flush.
       */
      if (query->workaround_buf) {
         uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
         emit_set_predicate(ctx, query->workaround_buf, va, op);
         return;
      }

      op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;

      /* emit predicate packets for all data blocks */
      for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
         unsigned results_base = 0;
         uint64_t va_base = qbuf->buf->gpu_address;

         while (results_base < qbuf->results_end) {
            uint64_t va = va_base + results_base;

            if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
               for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
                  emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);

                  /* set CONTINUE bit for all packets except the first */
                  op |= PREDICATION_CONTINUE;
               }
            } else {
               emit_set_predicate(ctx, qbuf->buf, va, op);
               op |= PREDICATION_CONTINUE;
            }

            results_base += query->result_size;
         }
      }
   }
}

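/* Create a software, NGG-streamout or generic HW query depending on the type. */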
static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type,
                                          unsigned index)
{
   struct si_screen *sscreen = (struct si_screen *)ctx->screen;

   if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED ||
       (query_type >= PIPE_QUERY_DRIVER_SPECIFIC))
      return si_query_sw_create(query_type);

   if (sscreen->use_ngg_streamout &&
       (query_type == PIPE_QUERY_PRIMITIVES_EMITTED ||
        query_type == PIPE_QUERY_PRIMITIVES_GENERATED || query_type == PIPE_QUERY_SO_STATISTICS ||
        query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
        query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE))
      return gfx10_sh_query_create(sscreen, query_type, index);

   return si_query_hw_create(sscreen, query_type, index);
}

static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_query *squery = (struct si_query *)query;

   squery->ops->destroy(sctx, squery);
}

static bool si_begin_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_query *squery = (struct si_query *)query;

   return squery->ops->begin(sctx, squery);
}

bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_hw *query = (struct si_query_hw *)squery;

   if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
      assert(0);
      return false;
   }

   if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES))
      si_query_buffer_reset(sctx, &query->buffer);

   si_resource_reference(&query->workaround_buf, NULL);

   si_query_hw_emit_start(sctx, query);
   if (!query->buffer.buf)
      return false;

   list_addtail(&query->b.active_list, &sctx->active_queries);
   sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
   return true;
}

static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_query *squery = (struct si_query *)query;

   return squery->ops->end(sctx, squery);
}

bool si_query_hw_end(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_hw *query = (struct si_query_hw *)squery;

   if (query->flags & SI_QUERY_HW_FLAG_NO_START)
      si_query_buffer_reset(sctx, &query->buffer);

   si_query_hw_emit_stop(sctx, query);

   if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) {
      list_delinit(&query->b.active_list);
      sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend;
   }

   if (!query->buffer.buf)
      return false;

   return true;
}

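/* Fill in the buffer offsets needed to read one HW query result pair, for CPU readback
 * and the result shader. */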
static void si_get_hw_query_params(struct si_context *sctx, struct si_query_hw *squery, int index,
                                   struct si_hw_query_params *params)
{
   unsigned max_rbs = sctx->screen->info.max_render_backends;

   params->pair_stride = 0;
   params->pair_count = 1;

   switch (squery->b.type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      params->start_offset = 0;
      params->end_offset = 8;
      params->fence_offset = max_rbs * 16;
      params->pair_stride = 16;
      params->pair_count = max_rbs;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      params->start_offset = 0;
      params->end_offset = 8;
      params->fence_offset = 16;
      break;
   case PIPE_QUERY_TIMESTAMP:
      params->start_offset = 0;
      params->end_offset = 0;
      params->fence_offset = 8;
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      params->start_offset = 8;
      params->end_offset = 24;
      params->fence_offset = params->end_offset + 4;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      params->start_offset = 0;
      params->end_offset = 16;
      params->fence_offset = params->end_offset + 4;
      break;
   case PIPE_QUERY_SO_STATISTICS:
      params->start_offset = 8 - index * 8;
      params->end_offset = 24 - index * 8;
      params->fence_offset = params->end_offset + 4;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      params->pair_count = SI_MAX_STREAMS;
      params->pair_stride = 32;
      FALLTHROUGH;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      params->start_offset = 0;
      params->end_offset = 16;

      /* We can re-use the high dword of the last 64-bit value as a
       * fence: it is initialized as 0, and the high bit is set by
       * the write of the streamout stats event.
       */
      params->fence_offset = squery->result_size - 4;
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS: {
      params->start_offset = si_query_pipestat_dw_offset(index) * 4;
      params->end_offset = si_query_pipestat_end_dw_offset(sctx->screen, index) * 4;
      params->fence_offset = si_query_pipestats_num_results(sctx->screen) * 16;
      break;
   }
   default:
      unreachable("si_get_hw_query_params unsupported");
   }
}

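/* Read a begin/end pair of 64-bit values and return end - start; with test_status_bit,
 * the pair only counts when the high (availability) bit is set in both values. */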
static unsigned si_query_read_result(void *map, unsigned start_index, unsigned end_index,
                                     bool test_status_bit)
{
   uint32_t *current_result = (uint32_t *)map;
   uint64_t start, end;

   start = (uint64_t)current_result[start_index] | (uint64_t)current_result[start_index + 1] << 32;
   end = (uint64_t)current_result[end_index] | (uint64_t)current_result[end_index + 1] << 32;

   if (!test_status_bit || ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
      return end - start;
   }
   return 0;
}

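/* Accumulate one sample from the mapped results buffer into the query result. */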
1360static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *query,
1361                                   void *buffer, union pipe_query_result *result)
1362{
1363   unsigned max_rbs = sscreen->info.max_render_backends;
1364
1365   switch (query->b.type) {
1366   case PIPE_QUERY_OCCLUSION_COUNTER: {
1367      for (unsigned i = 0; i < max_rbs; ++i) {
1368         unsigned results_base = i * 16;
1369         result->u64 += si_query_read_result(buffer + results_base, 0, 2, true);
1370      }
1371      break;
1372   }
1373   case PIPE_QUERY_OCCLUSION_PREDICATE:
1374   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
1375      for (unsigned i = 0; i < max_rbs; ++i) {
1376         unsigned results_base = i * 16;
1377         result->b = result->b || si_query_read_result(buffer + results_base, 0, 2, true) != 0;
1378      }
1379      break;
1380   }
1381   case PIPE_QUERY_TIME_ELAPSED:
1382      result->u64 += si_query_read_result(buffer, 0, 2, false);
1383      break;
1384   case PIPE_QUERY_TIMESTAMP:
1385      result->u64 = *(uint64_t *)buffer;
1386      break;
1387   case PIPE_QUERY_PRIMITIVES_EMITTED:
1388      /* SAMPLE_STREAMOUTSTATS stores this structure:
1389       * {
1390       *    u64 NumPrimitivesWritten;
1391       *    u64 PrimitiveStorageNeeded;
1392       * }
1393       * We only need NumPrimitivesWritten here. */
1394      result->u64 += si_query_read_result(buffer, 2, 6, true);
1395      break;
1396   case PIPE_QUERY_PRIMITIVES_GENERATED:
1397      /* Here we read PrimitiveStorageNeeded. */
1398      result->u64 += si_query_read_result(buffer, 0, 4, true);
1399      break;
1400   case PIPE_QUERY_SO_STATISTICS:
1401      result->so_statistics.num_primitives_written += si_query_read_result(buffer, 2, 6, true);
1402      result->so_statistics.primitives_storage_needed += si_query_read_result(buffer, 0, 4, true);
1403      break;
1404   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1405      result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
1406                                  si_query_read_result(buffer, 0, 4, true);
1407      break;
1408   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1409      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1410         result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
1411                                     si_query_read_result(buffer, 0, 4, true);
1412         buffer = (char *)buffer + 32;
1413      }
1414      break;
1415   case PIPE_QUERY_PIPELINE_STATISTICS:
1416      for (int i = 0; i < 11; i++) {
1417         result->pipeline_statistics.counters[i] +=
1418            si_query_read_result(buffer, si_query_pipestat_dw_offset(i),
1419                                 si_query_pipestat_end_dw_offset(sscreen, i), false);
1420      }
1421#if 0 /* for testing */
1422      printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
1423             "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
1424             "Clipper prims=%llu, PS=%llu, CS=%llu\n",
1425             result->pipeline_statistics.ia_vertices,
1426             result->pipeline_statistics.ia_primitives,
1427             result->pipeline_statistics.vs_invocations,
1428             result->pipeline_statistics.hs_invocations,
1429             result->pipeline_statistics.ds_invocations,
1430             result->pipeline_statistics.gs_invocations,
1431             result->pipeline_statistics.gs_primitives,
1432             result->pipeline_statistics.c_invocations,
1433             result->pipeline_statistics.c_primitives,
1434             result->pipeline_statistics.ps_invocations,
1435             result->pipeline_statistics.cs_invocations);
1436#endif
1437      break;
1438   default:
1439      assert(0);
1440   }
1441}
1442
1443void si_query_hw_suspend(struct si_context *sctx, struct si_query *query)
1444{
1445   si_query_hw_emit_stop(sctx, (struct si_query_hw *)query);
1446}
1447
1448void si_query_hw_resume(struct si_context *sctx, struct si_query *query)
1449{
1450   si_query_hw_emit_start(sctx, (struct si_query_hw *)query);
1451}
1452
1453static const struct si_query_ops query_hw_ops = {
1454   .destroy = si_query_hw_destroy,
1455   .begin = si_query_hw_begin,
1456   .end = si_query_hw_end,
1457   .get_result = si_query_hw_get_result,
1458   .get_result_resource = si_query_hw_get_result_resource,
1459
1460   .suspend = si_query_hw_suspend,
1461   .resume = si_query_hw_resume,
1462};
1463
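/* pipe_context::get_query_result entry point; dispatches to the query's ops. */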
1464static bool si_get_query_result(struct pipe_context *ctx, struct pipe_query *query, bool wait,
1465                                union pipe_query_result *result)
1466{
1467   struct si_context *sctx = (struct si_context *)ctx;
1468   struct si_query *squery = (struct si_query *)query;
1469
1470   return squery->ops->get_result(sctx, squery, wait, result);
1471}
1472
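/* pipe_context::get_query_result_resource entry point; writes the result into
 * a GPU buffer instead of returning it to the CPU. */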
1473static void si_get_query_result_resource(struct pipe_context *ctx, struct pipe_query *query,
1474                                         enum pipe_query_flags flags, enum pipe_query_value_type result_type,
1475                                         int index, struct pipe_resource *resource, unsigned offset)
1476{
1477   struct si_context *sctx = (struct si_context *)ctx;
1478   struct si_query *squery = (struct si_query *)query;
1479
1480   squery->ops->get_result_resource(sctx, squery, flags, result_type, index, resource, offset);
1481}
1482
1483static void si_query_hw_clear_result(struct si_query_hw *query, union pipe_query_result *result)
1484{
1485   util_query_clear_result(result, query->b.type);
1486}
1487
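/* CPU readback path: map every buffer in the query's chain (optionally
 * waiting for the GPU) and accumulate the raw result slots with the query's
 * add_result hook. */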
1488bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
1489                            union pipe_query_result *result)
1490{
1491   struct si_screen *sscreen = sctx->screen;
1492   struct si_query_hw *query = (struct si_query_hw *)squery;
1493   struct si_query_buffer *qbuf;
1494
1495   query->ops->clear_result(query, result);
1496
1497   for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1498      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
1499      unsigned results_base = 0;
1500      void *map;
1501
1502      if (squery->b.flushed)
1503         map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
1504      else
1505         map = si_buffer_map(sctx, qbuf->buf, usage);
1506
1507      if (!map)
1508         return false;
1509
1510      while (results_base != qbuf->results_end) {
1511         query->ops->add_result(sscreen, query, map + results_base, result);
1512         results_base += query->result_size;
1513      }
1514   }
1515
1516   /* Convert GPU ticks to nanoseconds. */
1517   if (squery->type == PIPE_QUERY_TIME_ELAPSED ||
1518       squery->type == PIPE_QUERY_TIMESTAMP) {
1519      result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
1520   }
1521   return true;
1522}
1523
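/* GPU path: run the query-result compute shader over each buffer in the
 * query's chain; it accumulates the raw results and writes the converted
 * value (and optionally an availability flag) to "resource" at "offset". */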
1524static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
1525                                            enum pipe_query_flags flags,
1526                                            enum pipe_query_value_type result_type,
1527                                            int index, struct pipe_resource *resource,
1528                                            unsigned offset)
1529{
1530   struct si_query_hw *query = (struct si_query_hw *)squery;
1531   struct si_query_buffer *qbuf;
1532   struct si_query_buffer *qbuf_prev;
1533   struct pipe_resource *tmp_buffer = NULL;
1534   unsigned tmp_buffer_offset = 0;
1535   struct si_qbo_state saved_state = {};
1536   struct pipe_grid_info grid = {};
1537   struct pipe_constant_buffer constant_buffer = {};
1538   struct pipe_shader_buffer ssbo[3];
1539   struct si_hw_query_params params;
1540   struct {
1541      uint32_t end_offset;
1542      uint32_t result_stride;
1543      uint32_t result_count;
1544      uint32_t config;
1545      uint32_t fence_offset;
1546      uint32_t pair_stride;
1547      uint32_t pair_count;
1548   } consts;
1549
1550   if (!sctx->query_result_shader) {
1551      sctx->query_result_shader = si_create_query_result_cs(sctx);
1552      if (!sctx->query_result_shader)
1553         return;
1554   }
1555
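   /* Chained query buffers need a small zero-initialized scratch buffer to
    * carry the running total between the per-buffer compute dispatches. */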
1556   if (query->buffer.previous) {
1557      u_suballocator_alloc(&sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
1558      if (!tmp_buffer)
1559         return;
1560   }
1561
1562   si_save_qbo_state(sctx, &saved_state);
1563
1564   si_get_hw_query_params(sctx, query, index >= 0 ? index : 0, &params);
1565   consts.end_offset = params.end_offset - params.start_offset;
1566   consts.fence_offset = params.fence_offset - params.start_offset;
1567   consts.result_stride = query->result_size;
1568   consts.pair_stride = params.pair_stride;
1569   consts.pair_count = params.pair_count;
1570
1571   constant_buffer.buffer_size = sizeof(consts);
1572   constant_buffer.user_buffer = &consts;
1573
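   /* Shader buffer bindings: ssbo[0] = source query buffer, ssbo[1] = scratch
    * accumulator, ssbo[2] = destination (redirected to the user's resource for
    * the last dispatch in the chain). */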
1574   ssbo[1].buffer = tmp_buffer;
1575   ssbo[1].buffer_offset = tmp_buffer_offset;
1576   ssbo[1].buffer_size = 16;
1577
1578   ssbo[2] = ssbo[1];
1579
1580   grid.block[0] = 1;
1581   grid.block[1] = 1;
1582   grid.block[2] = 1;
1583   grid.grid[0] = 1;
1584   grid.grid[1] = 1;
1585   grid.grid[2] = 1;
1586
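   /* "config" is a bit field consumed by the query-result compute shader
    * (si_create_query_result_cs); the meanings below follow from how the bits
    * are set in this function:
    *    1: read the previously accumulated value from the scratch buffer
    *    2: write the accumulated value for chaining
    *    4: write the "result available" flag (requested with index < 0)
    *    8: convert the result to a boolean (predicates)
    *   16: read a single result only (timestamps)
    *   32: apply the timestamp conversion
    *   64: store a full 64-bit result
    *  128: store a signed 32-bit result
    *  256: handle streamout-overflow pairs (SO_OVERFLOW predicates) */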
1587   consts.config = 0;
1588   if (index < 0)
1589      consts.config |= 4;
1590   if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
1591       query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
1592      consts.config |= 8;
1593   else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1594            query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
1595      consts.config |= 8 | 256;
1596   else if (query->b.type == PIPE_QUERY_TIMESTAMP || query->b.type == PIPE_QUERY_TIME_ELAPSED)
1597      consts.config |= 32;
1598
1599   switch (result_type) {
1600   case PIPE_QUERY_TYPE_U64:
1601   case PIPE_QUERY_TYPE_I64:
1602      consts.config |= 64;
1603      break;
1604   case PIPE_QUERY_TYPE_I32:
1605      consts.config |= 128;
1606      break;
1607   case PIPE_QUERY_TYPE_U32:
1608      break;
1609   }
1610
1611   sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;
1612
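   /* Walk the buffer chain from newest to oldest, launching one compute
    * dispatch per buffer. */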
1613   for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
1614      if (query->b.type != PIPE_QUERY_TIMESTAMP) {
1615         qbuf_prev = qbuf->previous;
1616         consts.result_count = qbuf->results_end / query->result_size;
1617         consts.config &= ~3;
1618         if (qbuf != &query->buffer)
1619            consts.config |= 1;
1620         if (qbuf->previous)
1621            consts.config |= 2;
1622      } else {
1623         /* Only read the last timestamp. */
1624         qbuf_prev = NULL;
1625         consts.result_count = 0;
1626         consts.config |= 16;
1627         params.start_offset += qbuf->results_end - query->result_size;
1628      }
1629
1630      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);
1631
1632      ssbo[0].buffer = &qbuf->buf->b.b;
1633      ssbo[0].buffer_offset = params.start_offset;
1634      ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
1635
1636      if (!qbuf->previous) {
1637         ssbo[2].buffer = resource;
1638         ssbo[2].buffer_offset = offset;
1639         ssbo[2].buffer_size = resource->width0 - offset;
1640         /* TODO: assert that the destination buffer is large enough for result_type. */
1641
1642         si_resource(resource)->TC_L2_dirty = true;
1643      }
1644
1645      if ((flags & PIPE_QUERY_WAIT) && qbuf == &query->buffer) {
1646         uint64_t va;
1647
1648         /* Wait for result availability. Wait only for readiness
1649          * of the last entry, since the fence writes should be
1650          * serialized in the CP.
1651          */
1652         va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
1653         va += params.fence_offset;
1654
1655         si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x80000000, 0x80000000, WAIT_REG_MEM_EQUAL);
1656      }
1657      si_launch_grid_internal_ssbos(sctx, &grid, sctx->query_result_shader,
1658                                    SI_OP_SYNC_AFTER, SI_COHERENCY_SHADER,
1659                                    3, ssbo, 0x4);
1660   }
1661
1662   si_restore_qbo_state(sctx, &saved_state);
1663   pipe_resource_reference(&tmp_buffer, NULL);
1664}
1665
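/* pipe_context::render_condition hook: make subsequent rendering conditional
 * on the query result; the render_cond atom emits the actual predication
 * packets. */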
1666static void si_render_condition(struct pipe_context *ctx, struct pipe_query *query, bool condition,
1667                                enum pipe_render_cond_flag mode)
1668{
1669   struct si_context *sctx = (struct si_context *)ctx;
1670   struct si_query_hw *squery = (struct si_query_hw *)query;
1671   struct si_atom *atom = &sctx->atoms.s.render_cond;
1672
1673   if (query) {
1674      bool needs_workaround = false;
1675
1676      /* There was a firmware regression in GFX8 which causes successive
1677       * SET_PREDICATION packets to give the wrong answer for
1678       * non-inverted stream overflow predication.
1679       */
1680      if (((sctx->gfx_level == GFX8 && sctx->screen->info.pfp_fw_feature < 49) ||
1681           (sctx->gfx_level == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) &&
1682          !condition &&
1683          (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
1684           (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
1685            (squery->buffer.previous || squery->buffer.results_end > squery->result_size)))) {
1686         needs_workaround = true;
1687      }
1688
1689      if (needs_workaround && !squery->workaround_buf) {
1690         bool old_render_cond_enabled = sctx->render_cond_enabled;
1691         sctx->render_cond_enabled = false;
1692
1693         u_suballocator_alloc(&sctx->allocator_zeroed_memory, 8, 8, &squery->workaround_offset,
1694                              (struct pipe_resource **)&squery->workaround_buf);
1695
1696         /* Reset to NULL to avoid a redundant SET_PREDICATION
1697          * from launching the compute grid.
1698          */
1699         sctx->render_cond = NULL;
1700
1701         ctx->get_query_result_resource(ctx, query, PIPE_QUERY_WAIT, PIPE_QUERY_TYPE_U64, 0,
1702                                        &squery->workaround_buf->b.b, squery->workaround_offset);
1703
1704         /* Setting this in the render cond atom is too late,
1705          * so set it here. */
1706         sctx->flags |= sctx->screen->barrier_flags.L2_to_cp | SI_CONTEXT_FLUSH_FOR_RENDER_COND;
1707
1708         sctx->render_cond_enabled = old_render_cond_enabled;
1709      }
1710   }
1711
1712   sctx->render_cond = query;
1713   sctx->render_cond_invert = condition;
1714   sctx->render_cond_mode = mode;
1715   sctx->render_cond_enabled = query;
1716
1717   si_set_atom_dirty(sctx, atom, query != NULL);
1718}
1719
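/* Suspend/resume all queries on the active list, e.g. across a gfx CS flush. */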
1720void si_suspend_queries(struct si_context *sctx)
1721{
1722   struct si_query *query;
1723
1724   LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
1725      query->ops->suspend(sctx, query);
1726}
1727
1728void si_resume_queries(struct si_context *sctx)
1729{
1730   struct si_query *query;
1731
1732   /* Check CS space here. Resuming must not be interrupted by flushes. */
1733   si_need_gfx_cs_space(sctx, 0);
1734
1735   LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
1736      query->ops->resume(sctx, query);
1737}
1738
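/* Helpers for declaring entries of the driver query list below. */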
1739#define XFULL(name_, query_type_, type_, result_type_, group_id_)                                  \
1740   {                                                                                               \
1741      .name = name_, .query_type = SI_QUERY_##query_type_, .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
1742      .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, .group_id = group_id_           \
1743   }
1744
1745#define X(name_, query_type_, type_, result_type_)                                                 \
1746   XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
1747
1748#define XG(group_, name_, query_type_, type_, result_type_)                                        \
1749   XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_)
1750
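/* Software and winsys counters exposed via pipe_screen::get_driver_query_info
 * (e.g. for GALLIUM_HUD). */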
1751static struct pipe_driver_query_info si_driver_query_list[] = {
1752   X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
1753   X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
1754   X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
1755   X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE),
1756   X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE),
1757   X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
1758   X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
1759   X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
1760   X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
1761   X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
1762   X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE),
1763   X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE),
1764   X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE),
1765   X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE),
1766   X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE),
1767   X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE),
1768   X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE),
1769   X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE),
1770   X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),
1771   X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE),
1772   X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
1773   X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
1774   X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
1775   X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
1776   X("slab-wasted-VRAM", SLAB_WASTED_VRAM, BYTES, AVERAGE),
1777   X("slab-wasted-GTT", SLAB_WASTED_GTT, BYTES, AVERAGE),
1778   X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
1779   X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
1780   X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
1781   X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE),
1782   X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE),
1783   X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
1784   X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
1785   X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
1786   X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
1787   X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),
1788   X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
1789   X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
1790   X("live-shader-cache-hits", LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1791   X("live-shader-cache-misses", LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1792   X("memory-shader-cache-hits", MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1793   X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1794   X("disk-shader-cache-hits", DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1795   X("disk-shader-cache-misses", DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1796
1797   /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
1798    * which use them as a fallback path to detect the GPU type.
1799    *
1800    * Note: The names of these queries are significant for GPUPerfStudio
1801    * (and possibly their order as well). */
1802   XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
1803   XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
1804   XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
1805   XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
1806   XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),
1807
1808   X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
1809   X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
1810   X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
1811
1812   /* The following queries must be at the end of the list because their
1813    * availability is adjusted dynamically based on the DRM version. */
1814   X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
1815   X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE),
1816   X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE),
1817   X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE),
1818   X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE),
1819   X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE),
1820   X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE),
1821   X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE),
1822   X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE),
1823   X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE),
1824   X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE),
1825   X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE),
1826   X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE),
1827   X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE),
1828
1829   /* SRBM_STATUS2 */
1830   X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE),
1831
1832   /* CP_STAT */
1833   X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE),
1834   X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),
1835   X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),
1836   X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
1837   X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
1838   X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
1839};
1840
1841#undef X
1842#undef XG
1843#undef XFULL
1844
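/* Number of si_driver_query_list entries exposed for this screen.  The
 * trailing GPU-load/busy queries are trimmed when the kernel driver cannot
 * sample the status registers backing them (the has_read_registers_query
 * guard below is assumed to be the winsys flag reporting that capability). */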
1845static unsigned si_get_num_queries(struct si_screen *sscreen)
1846{
1847   /* amdgpu */
1848   if (sscreen->info.is_amdgpu) {
1849      if (sscreen->info.gfx_level >= GFX8)
1850         return ARRAY_SIZE(si_driver_query_list);
1851      else
1852         return ARRAY_SIZE(si_driver_query_list) - 7;
1853   }
1854
1855   /* radeon */
1856   if (!sscreen->info.has_read_registers_query)
1857      return ARRAY_SIZE(si_driver_query_list) - 21;
1858
1859   if (sscreen->info.gfx_level == GFX7)
1860      return ARRAY_SIZE(si_driver_query_list) - 6;
1861   return ARRAY_SIZE(si_driver_query_list) - 7;
1862}
1863
1864static int si_get_driver_query_info(struct pipe_screen *screen, unsigned index,
1865                                    struct pipe_driver_query_info *info)
1866{
1867   struct si_screen *sscreen = (struct si_screen *)screen;
1868   unsigned num_queries = si_get_num_queries(sscreen);
1869
1870   if (!info) {
1871      unsigned num_perfcounters = si_get_perfcounter_info(sscreen, 0, NULL);
1872
1873      return num_queries + num_perfcounters;
1874   }
1875
1876   if (index >= num_queries)
1877      return si_get_perfcounter_info(sscreen, index - num_queries, info);
1878
1879   *info = si_driver_query_list[index];
1880
1881   switch (info->query_type) {
1882   case SI_QUERY_REQUESTED_VRAM:
1883   case SI_QUERY_VRAM_USAGE:
1884   case SI_QUERY_MAPPED_VRAM:
1885   case SI_QUERY_SLAB_WASTED_VRAM:
1886      info->max_value.u64 = (uint64_t)sscreen->info.vram_size_kb * 1024;
1887      break;
1888   case SI_QUERY_REQUESTED_GTT:
1889   case SI_QUERY_GTT_USAGE:
1890   case SI_QUERY_MAPPED_GTT:
1891   case SI_QUERY_SLAB_WASTED_GTT:
1892      info->max_value.u64 = (uint64_t)sscreen->info.gart_size_kb * 1024;
1893      break;
1894   case SI_QUERY_GPU_TEMPERATURE:
1895      info->max_value.u64 = 125;
1896      break;
1897   case SI_QUERY_VRAM_VIS_USAGE:
1898      info->max_value.u64 = (uint64_t)sscreen->info.vram_vis_size_kb * 1024;
1899      break;
1900   }
1901
1902   if (info->group_id != ~(unsigned)0 && sscreen->perfcounters)
1903      info->group_id += sscreen->perfcounters->base.num_groups;
1904
1905   return 1;
1906}
1907
1908/* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
1909 * performance counter groups, so be careful when changing this and related
1910 * functions.
1911 */
1912static int si_get_driver_query_group_info(struct pipe_screen *screen, unsigned index,
1913                                          struct pipe_driver_query_group_info *info)
1914{
1915   struct si_screen *sscreen = (struct si_screen *)screen;
1916   unsigned num_pc_groups = 0;
1917
1918   if (sscreen->perfcounters)
1919      num_pc_groups = sscreen->perfcounters->base.num_groups;
1920
1921   if (!info)
1922      return num_pc_groups + SI_NUM_SW_QUERY_GROUPS;
1923
1924   if (index < num_pc_groups)
1925      return si_get_perfcounter_group_info(sscreen, index, info);
1926
1927   index -= num_pc_groups;
1928   if (index >= SI_NUM_SW_QUERY_GROUPS)
1929      return 0;
1930
1931   info->name = "GPIN";
1932   info->max_active_queries = 5;
1933   info->num_queries = 5;
1934   return 1;
1935}
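/* Hook the query entry points into the context. */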
1936
1937void si_init_query_functions(struct si_context *sctx)
1938{
1939   sctx->b.create_query = si_create_query;
1940   sctx->b.create_batch_query = si_create_batch_query;
1941   sctx->b.destroy_query = si_destroy_query;
1942   sctx->b.begin_query = si_begin_query;
1943   sctx->b.end_query = si_end_query;
1944   sctx->b.get_query_result = si_get_query_result;
1945   sctx->b.get_query_result_resource = si_get_query_result_resource;
1946
1947   if (sctx->has_graphics) {
1948      sctx->atoms.s.render_cond.emit = si_emit_query_predication;
1949      sctx->b.render_condition = si_render_condition;
1950   }
1951
1952   list_inithead(&sctx->active_queries);
1953}
1954
1955void si_init_screen_query_functions(struct si_screen *sscreen)
1956{
1957   sscreen->b.get_driver_query_info = si_get_driver_query_info;
1958   sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info;
1959}
1960