/*
 * Copyright 2016 Red Hat Inc.
 * SPDX-License-Identifier: MIT
 *
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 */
8bf215546Sopenharmony_ci
9bf215546Sopenharmony_ci#include "tu_query.h"
10bf215546Sopenharmony_ci
11bf215546Sopenharmony_ci#include <fcntl.h>
12bf215546Sopenharmony_ci
13bf215546Sopenharmony_ci#include "nir/nir_builder.h"
14bf215546Sopenharmony_ci#include "util/os_time.h"
15bf215546Sopenharmony_ci
16bf215546Sopenharmony_ci#include "vk_util.h"
17bf215546Sopenharmony_ci
18bf215546Sopenharmony_ci#include "tu_cmd_buffer.h"
19bf215546Sopenharmony_ci#include "tu_cs.h"
20bf215546Sopenharmony_ci#include "tu_device.h"
21bf215546Sopenharmony_ci
/* Number of nanoseconds in one second. */
#define NSEC_PER_SEC 1000000000ull
/* Multiplied by NSEC_PER_SEC to form the timeout used when polling for a
 * query to become available. */
#define WAIT_TIMEOUT 5
/* Number of 64-bit pipeline statistics values kept per slot, derived from
 * the distance between the RBBM_PRIMCTR_0_LO and RBBM_PRIMCTR_10_LO
 * register offsets (two registers per counter). */
#define STAT_COUNT ((REG_A6XX_RBBM_PRIMCTR_10_LO - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2 + 1)
25bf215546Sopenharmony_ci
26bf215546Sopenharmony_cistruct PACKED query_slot {
27bf215546Sopenharmony_ci   uint64_t available;
28bf215546Sopenharmony_ci};
29bf215546Sopenharmony_ci
30bf215546Sopenharmony_cistruct PACKED occlusion_slot_value {
31bf215546Sopenharmony_ci   /* Seems sample counters are placed to be 16-byte aligned
32bf215546Sopenharmony_ci    * even though this query needs an 8-byte slot. */
33bf215546Sopenharmony_ci   uint64_t value;
34bf215546Sopenharmony_ci   uint64_t _padding;
35bf215546Sopenharmony_ci};
36bf215546Sopenharmony_ci
37bf215546Sopenharmony_cistruct PACKED occlusion_query_slot {
38bf215546Sopenharmony_ci   struct query_slot common;
39bf215546Sopenharmony_ci   uint64_t result;
40bf215546Sopenharmony_ci
41bf215546Sopenharmony_ci   struct occlusion_slot_value begin;
42bf215546Sopenharmony_ci   struct occlusion_slot_value end;
43bf215546Sopenharmony_ci};
44bf215546Sopenharmony_ci
45bf215546Sopenharmony_cistruct PACKED timestamp_query_slot {
46bf215546Sopenharmony_ci   struct query_slot common;
47bf215546Sopenharmony_ci   uint64_t result;
48bf215546Sopenharmony_ci};
49bf215546Sopenharmony_ci
50bf215546Sopenharmony_cistruct PACKED primitive_slot_value {
51bf215546Sopenharmony_ci   uint64_t values[2];
52bf215546Sopenharmony_ci};
53bf215546Sopenharmony_ci
54bf215546Sopenharmony_cistruct PACKED pipeline_stat_query_slot {
55bf215546Sopenharmony_ci   struct query_slot common;
56bf215546Sopenharmony_ci   uint64_t results[STAT_COUNT];
57bf215546Sopenharmony_ci
58bf215546Sopenharmony_ci   uint64_t begin[STAT_COUNT];
59bf215546Sopenharmony_ci   uint64_t end[STAT_COUNT];
60bf215546Sopenharmony_ci};
61bf215546Sopenharmony_ci
62bf215546Sopenharmony_cistruct PACKED primitive_query_slot {
63bf215546Sopenharmony_ci   struct query_slot common;
64bf215546Sopenharmony_ci   /* The result of transform feedback queries is two integer values:
65bf215546Sopenharmony_ci    *   results[0] is the count of primitives written,
66bf215546Sopenharmony_ci    *   results[1] is the count of primitives generated.
67bf215546Sopenharmony_ci    * Also a result for each stream is stored at 4 slots respectively.
68bf215546Sopenharmony_ci    */
69bf215546Sopenharmony_ci   uint64_t results[2];
70bf215546Sopenharmony_ci
71bf215546Sopenharmony_ci   /* Primitive counters also need to be 16-byte aligned. */
72bf215546Sopenharmony_ci   uint64_t _padding;
73bf215546Sopenharmony_ci
74bf215546Sopenharmony_ci   struct primitive_slot_value begin[4];
75bf215546Sopenharmony_ci   struct primitive_slot_value end[4];
76bf215546Sopenharmony_ci};
77bf215546Sopenharmony_ci
78bf215546Sopenharmony_cistruct PACKED perfcntr_query_slot {
79bf215546Sopenharmony_ci   uint64_t result;
80bf215546Sopenharmony_ci   uint64_t begin;
81bf215546Sopenharmony_ci   uint64_t end;
82bf215546Sopenharmony_ci};
83bf215546Sopenharmony_ci
84bf215546Sopenharmony_cistruct PACKED perf_query_slot {
85bf215546Sopenharmony_ci   struct query_slot common;
86bf215546Sopenharmony_ci   struct perfcntr_query_slot perfcntr;
87bf215546Sopenharmony_ci};
88bf215546Sopenharmony_ci
89bf215546Sopenharmony_cistruct PACKED primitives_generated_query_slot {
90bf215546Sopenharmony_ci   struct query_slot common;
91bf215546Sopenharmony_ci   uint64_t result;
92bf215546Sopenharmony_ci   uint64_t begin;
93bf215546Sopenharmony_ci   uint64_t end;
94bf215546Sopenharmony_ci};
95bf215546Sopenharmony_ci
/* Address helpers for query pool slots.
 *
 * Every macro takes a pointer to the query pool (anything with `bo->iova`,
 * `bo->map` and `stride` members) and a query index. The expansions and the
 * `pool`/`slot` arguments are fully parenthesized so they can be used inside
 * larger expressions and with non-trivial argument expressions.
 */

/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field)                         \
   ((pool)->bo->iova + (pool)->stride * (query) + offsetof(type, field))

#define occlusion_query_iova(pool, query, field)                     \
   query_iova(struct occlusion_query_slot, pool, query, field)

#define pipeline_stat_query_iova(pool, query, field)                 \
   query_iova(struct pipeline_stat_query_slot, pool, query, field)

/* IOVA of values[i] inside the primitive_slot_value named by `field`. */
#define primitive_query_iova(pool, query, field, i)                  \
   (query_iova(struct primitive_query_slot, pool, query, field) +    \
    offsetof(struct primitive_slot_value, values[i]))

/* IOVA of `field` inside the i-th perfcntr_query_slot of a performance
 * query slot; the counter entries start right after the common header. */
#define perf_query_iova(pool, query, field, i)                       \
   ((pool)->bo->iova + (pool)->stride * (query) +                    \
    sizeof(struct query_slot) +                                      \
    sizeof(struct perfcntr_query_slot) * (i) +                       \
    offsetof(struct perfcntr_query_slot, field))

#define primitives_generated_query_iova(pool, query, field)          \
   query_iova(struct primitives_generated_query_slot, pool, query, field)

/* IOVA of the availability word of a slot. */
#define query_available_iova(pool, query)                            \
   query_iova(struct query_slot, pool, query, available)

/* IOVA of the i-th result of element type `type`; results start right after
 * the common header. */
#define query_result_iova(pool, query, type, i)                      \
   ((pool)->bo->iova + (pool)->stride * (query) +                    \
    sizeof(struct query_slot) + sizeof(type) * (i))

/* CPU address counterpart of query_result_iova, based on the BO mapping. */
#define query_result_addr(pool, query, type, i)                      \
   ((pool)->bo->map + (pool)->stride * (query) +                     \
    sizeof(struct query_slot) + sizeof(type) * (i))

/* Reads the availability word of a slot (non-zero means available). */
#define query_is_available(slot) ((slot)->available)
133bf215546Sopenharmony_ci
134bf215546Sopenharmony_cistatic const VkPerformanceCounterUnitKHR
135bf215546Sopenharmony_cifd_perfcntr_type_to_vk_unit[] = {
136bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_UINT]         = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
137bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_UINT64]       = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
138bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_FLOAT]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
139bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_PERCENTAGE]   = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR,
140bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_BYTES]        = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR,
141bf215546Sopenharmony_ci   /* TODO. can be UNIT_NANOSECONDS_KHR with a logic to compute */
142bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
143bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_HZ]           = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR,
144bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_DBM]          = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
145bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_TEMPERATURE]  = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
146bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_VOLTS]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
147bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_AMPS]         = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
148bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_WATTS]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
149bf215546Sopenharmony_ci};
150bf215546Sopenharmony_ci
151bf215546Sopenharmony_ci/* TODO. Basically this comes from the freedreno implementation where
152bf215546Sopenharmony_ci * only UINT64 is used. We'd better confirm this by the blob vulkan driver
153bf215546Sopenharmony_ci * when it starts supporting perf query.
154bf215546Sopenharmony_ci */
155bf215546Sopenharmony_cistatic const VkPerformanceCounterStorageKHR
156bf215546Sopenharmony_cifd_perfcntr_type_to_vk_storage[] = {
157bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_UINT]         = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR,
158bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_UINT64]       = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
159bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_FLOAT]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
160bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_PERCENTAGE]   = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
161bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_BYTES]        = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
162bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
163bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_HZ]           = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
164bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_DBM]          = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
165bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_TEMPERATURE]  = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
166bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_VOLTS]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
167bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_AMPS]         = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
168bf215546Sopenharmony_ci   [FD_PERFCNTR_TYPE_WATTS]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
169bf215546Sopenharmony_ci};
170bf215546Sopenharmony_ci
171bf215546Sopenharmony_ci/*
172bf215546Sopenharmony_ci * Returns a pointer to a given slot in a query pool.
173bf215546Sopenharmony_ci */
174bf215546Sopenharmony_cistatic void* slot_address(struct tu_query_pool *pool, uint32_t query)
175bf215546Sopenharmony_ci{
176bf215546Sopenharmony_ci   return (char*)pool->bo->map + query * pool->stride;
177bf215546Sopenharmony_ci}
178bf215546Sopenharmony_ci
179bf215546Sopenharmony_cistatic void
180bf215546Sopenharmony_ciperfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
181bf215546Sopenharmony_ci               uint32_t index, uint32_t *gid, uint32_t *cid)
182bf215546Sopenharmony_ci
183bf215546Sopenharmony_ci{
184bf215546Sopenharmony_ci   uint32_t i;
185bf215546Sopenharmony_ci
186bf215546Sopenharmony_ci   for (i = 0; i < group_count; i++) {
187bf215546Sopenharmony_ci      if (group[i].num_countables > index) {
188bf215546Sopenharmony_ci         *gid = i;
189bf215546Sopenharmony_ci         *cid = index;
190bf215546Sopenharmony_ci         break;
191bf215546Sopenharmony_ci      }
192bf215546Sopenharmony_ci      index -= group[i].num_countables;
193bf215546Sopenharmony_ci   }
194bf215546Sopenharmony_ci
195bf215546Sopenharmony_ci   assert(i < group_count);
196bf215546Sopenharmony_ci}
197bf215546Sopenharmony_ci
198bf215546Sopenharmony_cistatic int
199bf215546Sopenharmony_cicompare_perfcntr_pass(const void *a, const void *b)
200bf215546Sopenharmony_ci{
201bf215546Sopenharmony_ci   return ((struct tu_perf_query_data *)a)->pass -
202bf215546Sopenharmony_ci          ((struct tu_perf_query_data *)b)->pass;
203bf215546Sopenharmony_ci}
204bf215546Sopenharmony_ci
205bf215546Sopenharmony_ciVKAPI_ATTR VkResult VKAPI_CALL
206bf215546Sopenharmony_citu_CreateQueryPool(VkDevice _device,
207bf215546Sopenharmony_ci                   const VkQueryPoolCreateInfo *pCreateInfo,
208bf215546Sopenharmony_ci                   const VkAllocationCallbacks *pAllocator,
209bf215546Sopenharmony_ci                   VkQueryPool *pQueryPool)
210bf215546Sopenharmony_ci{
211bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_device, device, _device);
212bf215546Sopenharmony_ci   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
213bf215546Sopenharmony_ci   assert(pCreateInfo->queryCount > 0);
214bf215546Sopenharmony_ci
215bf215546Sopenharmony_ci   uint32_t pool_size, slot_size;
216bf215546Sopenharmony_ci   const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
217bf215546Sopenharmony_ci
218bf215546Sopenharmony_ci   pool_size = sizeof(struct tu_query_pool);
219bf215546Sopenharmony_ci
220bf215546Sopenharmony_ci   switch (pCreateInfo->queryType) {
221bf215546Sopenharmony_ci   case VK_QUERY_TYPE_OCCLUSION:
222bf215546Sopenharmony_ci      slot_size = sizeof(struct occlusion_query_slot);
223bf215546Sopenharmony_ci      break;
224bf215546Sopenharmony_ci   case VK_QUERY_TYPE_TIMESTAMP:
225bf215546Sopenharmony_ci      slot_size = sizeof(struct timestamp_query_slot);
226bf215546Sopenharmony_ci      break;
227bf215546Sopenharmony_ci   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
228bf215546Sopenharmony_ci      slot_size = sizeof(struct primitive_query_slot);
229bf215546Sopenharmony_ci      break;
230bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
231bf215546Sopenharmony_ci      slot_size = sizeof(struct primitives_generated_query_slot);
232bf215546Sopenharmony_ci      break;
233bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
234bf215546Sopenharmony_ci      perf_query_info =
235bf215546Sopenharmony_ci            vk_find_struct_const(pCreateInfo->pNext,
236bf215546Sopenharmony_ci                                 QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
237bf215546Sopenharmony_ci      assert(perf_query_info);
238bf215546Sopenharmony_ci
239bf215546Sopenharmony_ci      slot_size = sizeof(struct perf_query_slot) +
240bf215546Sopenharmony_ci                  sizeof(struct perfcntr_query_slot) *
241bf215546Sopenharmony_ci                  (perf_query_info->counterIndexCount - 1);
242bf215546Sopenharmony_ci
243bf215546Sopenharmony_ci      /* Size of the array pool->tu_perf_query_data */
244bf215546Sopenharmony_ci      pool_size += sizeof(struct tu_perf_query_data) *
245bf215546Sopenharmony_ci                   perf_query_info->counterIndexCount;
246bf215546Sopenharmony_ci      break;
247bf215546Sopenharmony_ci   }
248bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
249bf215546Sopenharmony_ci      slot_size = sizeof(struct pipeline_stat_query_slot);
250bf215546Sopenharmony_ci      break;
251bf215546Sopenharmony_ci   default:
252bf215546Sopenharmony_ci      unreachable("Invalid query type");
253bf215546Sopenharmony_ci   }
254bf215546Sopenharmony_ci
255bf215546Sopenharmony_ci   struct tu_query_pool *pool =
256bf215546Sopenharmony_ci         vk_object_alloc(&device->vk, pAllocator, pool_size,
257bf215546Sopenharmony_ci                         VK_OBJECT_TYPE_QUERY_POOL);
258bf215546Sopenharmony_ci   if (!pool)
259bf215546Sopenharmony_ci      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
260bf215546Sopenharmony_ci
261bf215546Sopenharmony_ci   if (pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
262bf215546Sopenharmony_ci      pool->perf_group = fd_perfcntrs(&device->physical_device->dev_id,
263bf215546Sopenharmony_ci                                      &pool->perf_group_count);
264bf215546Sopenharmony_ci
265bf215546Sopenharmony_ci      pool->counter_index_count = perf_query_info->counterIndexCount;
266bf215546Sopenharmony_ci
267bf215546Sopenharmony_ci      /* Build all perf counters data that is requested, so we could get
268bf215546Sopenharmony_ci       * correct group id, countable id, counter register and pass index with
269bf215546Sopenharmony_ci       * only a counter index provided by applications at each command submit.
270bf215546Sopenharmony_ci       *
271bf215546Sopenharmony_ci       * Also, since this built data will be sorted by pass index later, we
272bf215546Sopenharmony_ci       * should keep the original indices and store perfcntrs results according
273bf215546Sopenharmony_ci       * to them so apps can get correct results with their own indices.
274bf215546Sopenharmony_ci       */
275bf215546Sopenharmony_ci      uint32_t regs[pool->perf_group_count], pass[pool->perf_group_count];
276bf215546Sopenharmony_ci      memset(regs, 0x00, pool->perf_group_count * sizeof(regs[0]));
277bf215546Sopenharmony_ci      memset(pass, 0x00, pool->perf_group_count * sizeof(pass[0]));
278bf215546Sopenharmony_ci
279bf215546Sopenharmony_ci      for (uint32_t i = 0; i < pool->counter_index_count; i++) {
280bf215546Sopenharmony_ci         uint32_t gid = 0, cid = 0;
281bf215546Sopenharmony_ci
282bf215546Sopenharmony_ci         perfcntr_index(pool->perf_group, pool->perf_group_count,
283bf215546Sopenharmony_ci                        perf_query_info->pCounterIndices[i], &gid, &cid);
284bf215546Sopenharmony_ci
285bf215546Sopenharmony_ci         pool->perf_query_data[i].gid = gid;
286bf215546Sopenharmony_ci         pool->perf_query_data[i].cid = cid;
287bf215546Sopenharmony_ci         pool->perf_query_data[i].app_idx = i;
288bf215546Sopenharmony_ci
289bf215546Sopenharmony_ci         /* When a counter register is over the capacity(num_counters),
290bf215546Sopenharmony_ci          * reset it for next pass.
291bf215546Sopenharmony_ci          */
292bf215546Sopenharmony_ci         if (regs[gid] < pool->perf_group[gid].num_counters) {
293bf215546Sopenharmony_ci            pool->perf_query_data[i].cntr_reg = regs[gid]++;
294bf215546Sopenharmony_ci            pool->perf_query_data[i].pass = pass[gid];
295bf215546Sopenharmony_ci         } else {
296bf215546Sopenharmony_ci            pool->perf_query_data[i].pass = ++pass[gid];
297bf215546Sopenharmony_ci            pool->perf_query_data[i].cntr_reg = regs[gid] = 0;
298bf215546Sopenharmony_ci            regs[gid]++;
299bf215546Sopenharmony_ci         }
300bf215546Sopenharmony_ci      }
301bf215546Sopenharmony_ci
302bf215546Sopenharmony_ci      /* Sort by pass index so we could easily prepare a command stream
303bf215546Sopenharmony_ci       * with the ascending order of pass index.
304bf215546Sopenharmony_ci       */
305bf215546Sopenharmony_ci      qsort(pool->perf_query_data, pool->counter_index_count,
306bf215546Sopenharmony_ci            sizeof(pool->perf_query_data[0]),
307bf215546Sopenharmony_ci            compare_perfcntr_pass);
308bf215546Sopenharmony_ci   }
309bf215546Sopenharmony_ci
310bf215546Sopenharmony_ci   VkResult result = tu_bo_init_new(device, &pool->bo,
311bf215546Sopenharmony_ci         pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS);
312bf215546Sopenharmony_ci   if (result != VK_SUCCESS) {
313bf215546Sopenharmony_ci      vk_object_free(&device->vk, pAllocator, pool);
314bf215546Sopenharmony_ci      return result;
315bf215546Sopenharmony_ci   }
316bf215546Sopenharmony_ci
317bf215546Sopenharmony_ci   result = tu_bo_map(device, pool->bo);
318bf215546Sopenharmony_ci   if (result != VK_SUCCESS) {
319bf215546Sopenharmony_ci      tu_bo_finish(device, pool->bo);
320bf215546Sopenharmony_ci      vk_object_free(&device->vk, pAllocator, pool);
321bf215546Sopenharmony_ci      return result;
322bf215546Sopenharmony_ci   }
323bf215546Sopenharmony_ci
324bf215546Sopenharmony_ci   /* Initialize all query statuses to unavailable */
325bf215546Sopenharmony_ci   memset(pool->bo->map, 0, pool->bo->size);
326bf215546Sopenharmony_ci
327bf215546Sopenharmony_ci   pool->type = pCreateInfo->queryType;
328bf215546Sopenharmony_ci   pool->stride = slot_size;
329bf215546Sopenharmony_ci   pool->size = pCreateInfo->queryCount;
330bf215546Sopenharmony_ci   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
331bf215546Sopenharmony_ci   *pQueryPool = tu_query_pool_to_handle(pool);
332bf215546Sopenharmony_ci
333bf215546Sopenharmony_ci   return VK_SUCCESS;
334bf215546Sopenharmony_ci}
335bf215546Sopenharmony_ci
336bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL
337bf215546Sopenharmony_citu_DestroyQueryPool(VkDevice _device,
338bf215546Sopenharmony_ci                    VkQueryPool _pool,
339bf215546Sopenharmony_ci                    const VkAllocationCallbacks *pAllocator)
340bf215546Sopenharmony_ci{
341bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_device, device, _device);
342bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_query_pool, pool, _pool);
343bf215546Sopenharmony_ci
344bf215546Sopenharmony_ci   if (!pool)
345bf215546Sopenharmony_ci      return;
346bf215546Sopenharmony_ci
347bf215546Sopenharmony_ci   tu_bo_finish(device, pool->bo);
348bf215546Sopenharmony_ci   vk_object_free(&device->vk, pAllocator, pool);
349bf215546Sopenharmony_ci}
350bf215546Sopenharmony_ci
351bf215546Sopenharmony_cistatic uint32_t
352bf215546Sopenharmony_ciget_result_count(struct tu_query_pool *pool)
353bf215546Sopenharmony_ci{
354bf215546Sopenharmony_ci   switch (pool->type) {
355bf215546Sopenharmony_ci   /* Occulusion and timestamp queries write one integer value */
356bf215546Sopenharmony_ci   case VK_QUERY_TYPE_OCCLUSION:
357bf215546Sopenharmony_ci   case VK_QUERY_TYPE_TIMESTAMP:
358bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
359bf215546Sopenharmony_ci      return 1;
360bf215546Sopenharmony_ci   /* Transform feedback queries write two integer values */
361bf215546Sopenharmony_ci   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
362bf215546Sopenharmony_ci      return 2;
363bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
364bf215546Sopenharmony_ci      return util_bitcount(pool->pipeline_statistics);
365bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
366bf215546Sopenharmony_ci      return pool->counter_index_count;
367bf215546Sopenharmony_ci   default:
368bf215546Sopenharmony_ci      assert(!"Invalid query type");
369bf215546Sopenharmony_ci      return 0;
370bf215546Sopenharmony_ci   }
371bf215546Sopenharmony_ci}
372bf215546Sopenharmony_ci
373bf215546Sopenharmony_cistatic uint32_t
374bf215546Sopenharmony_cistatistics_index(uint32_t *statistics)
375bf215546Sopenharmony_ci{
376bf215546Sopenharmony_ci   uint32_t stat;
377bf215546Sopenharmony_ci   stat = u_bit_scan(statistics);
378bf215546Sopenharmony_ci
379bf215546Sopenharmony_ci   switch (1 << stat) {
380bf215546Sopenharmony_ci   case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT:
381bf215546Sopenharmony_ci   case VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT:
382bf215546Sopenharmony_ci      return 0;
383bf215546Sopenharmony_ci   case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT:
384bf215546Sopenharmony_ci      return 1;
385bf215546Sopenharmony_ci   case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT:
386bf215546Sopenharmony_ci      return 2;
387bf215546Sopenharmony_ci   case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT:
388bf215546Sopenharmony_ci      return 4;
389bf215546Sopenharmony_ci   case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT:
390bf215546Sopenharmony_ci      return 5;
391bf215546Sopenharmony_ci   case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT:
392bf215546Sopenharmony_ci      return 6;
393bf215546Sopenharmony_ci   case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT:
394bf215546Sopenharmony_ci      return 7;
395bf215546Sopenharmony_ci   case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT:
396bf215546Sopenharmony_ci      return 8;
397bf215546Sopenharmony_ci   case VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT:
398bf215546Sopenharmony_ci      return 9;
399bf215546Sopenharmony_ci   case VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT:
400bf215546Sopenharmony_ci      return 10;
401bf215546Sopenharmony_ci   default:
402bf215546Sopenharmony_ci      return 0;
403bf215546Sopenharmony_ci   }
404bf215546Sopenharmony_ci}
405bf215546Sopenharmony_ci
406bf215546Sopenharmony_cistatic bool
407bf215546Sopenharmony_ciis_pipeline_query_with_vertex_stage(uint32_t pipeline_statistics)
408bf215546Sopenharmony_ci{
409bf215546Sopenharmony_ci   return pipeline_statistics &
410bf215546Sopenharmony_ci          (VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT |
411bf215546Sopenharmony_ci           VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT |
412bf215546Sopenharmony_ci           VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT |
413bf215546Sopenharmony_ci           VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT |
414bf215546Sopenharmony_ci           VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
415bf215546Sopenharmony_ci           VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT |
416bf215546Sopenharmony_ci           VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT |
417bf215546Sopenharmony_ci           VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT |
418bf215546Sopenharmony_ci           VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT);
419bf215546Sopenharmony_ci}
420bf215546Sopenharmony_ci
421bf215546Sopenharmony_cistatic bool
422bf215546Sopenharmony_ciis_pipeline_query_with_fragment_stage(uint32_t pipeline_statistics)
423bf215546Sopenharmony_ci{
424bf215546Sopenharmony_ci   return pipeline_statistics &
425bf215546Sopenharmony_ci          VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT;
426bf215546Sopenharmony_ci}
427bf215546Sopenharmony_ci
428bf215546Sopenharmony_cistatic bool
429bf215546Sopenharmony_ciis_pipeline_query_with_compute_stage(uint32_t pipeline_statistics)
430bf215546Sopenharmony_ci{
431bf215546Sopenharmony_ci   return pipeline_statistics &
432bf215546Sopenharmony_ci          VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
433bf215546Sopenharmony_ci}
434bf215546Sopenharmony_ci
435bf215546Sopenharmony_ci/* Wait on the the availability status of a query up until a timeout. */
436bf215546Sopenharmony_cistatic VkResult
437bf215546Sopenharmony_ciwait_for_available(struct tu_device *device, struct tu_query_pool *pool,
438bf215546Sopenharmony_ci                   uint32_t query)
439bf215546Sopenharmony_ci{
440bf215546Sopenharmony_ci   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
441bf215546Sopenharmony_ci    * scheduler friendly way instead of busy polling once the patch has landed
442bf215546Sopenharmony_ci    * upstream. */
443bf215546Sopenharmony_ci   struct query_slot *slot = slot_address(pool, query);
444bf215546Sopenharmony_ci   uint64_t abs_timeout = os_time_get_absolute_timeout(
445bf215546Sopenharmony_ci         WAIT_TIMEOUT * NSEC_PER_SEC);
446bf215546Sopenharmony_ci   while(os_time_get_nano() < abs_timeout) {
447bf215546Sopenharmony_ci      if (query_is_available(slot))
448bf215546Sopenharmony_ci         return VK_SUCCESS;
449bf215546Sopenharmony_ci   }
450bf215546Sopenharmony_ci   return vk_error(device, VK_TIMEOUT);
451bf215546Sopenharmony_ci}
452bf215546Sopenharmony_ci
453bf215546Sopenharmony_ci/* Writes a query value to a buffer from the CPU. */
454bf215546Sopenharmony_cistatic void
455bf215546Sopenharmony_ciwrite_query_value_cpu(char* base,
456bf215546Sopenharmony_ci                      uint32_t offset,
457bf215546Sopenharmony_ci                      uint64_t value,
458bf215546Sopenharmony_ci                      VkQueryResultFlags flags)
459bf215546Sopenharmony_ci{
460bf215546Sopenharmony_ci   if (flags & VK_QUERY_RESULT_64_BIT) {
461bf215546Sopenharmony_ci      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
462bf215546Sopenharmony_ci   } else {
463bf215546Sopenharmony_ci      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
464bf215546Sopenharmony_ci   }
465bf215546Sopenharmony_ci}
466bf215546Sopenharmony_ci
467bf215546Sopenharmony_cistatic VkResult
468bf215546Sopenharmony_ciget_query_pool_results(struct tu_device *device,
469bf215546Sopenharmony_ci                       struct tu_query_pool *pool,
470bf215546Sopenharmony_ci                       uint32_t firstQuery,
471bf215546Sopenharmony_ci                       uint32_t queryCount,
472bf215546Sopenharmony_ci                       size_t dataSize,
473bf215546Sopenharmony_ci                       void *pData,
474bf215546Sopenharmony_ci                       VkDeviceSize stride,
475bf215546Sopenharmony_ci                       VkQueryResultFlags flags)
476bf215546Sopenharmony_ci{
477bf215546Sopenharmony_ci   assert(dataSize >= stride * queryCount);
478bf215546Sopenharmony_ci
479bf215546Sopenharmony_ci   char *result_base = pData;
480bf215546Sopenharmony_ci   VkResult result = VK_SUCCESS;
481bf215546Sopenharmony_ci   for (uint32_t i = 0; i < queryCount; i++) {
482bf215546Sopenharmony_ci      uint32_t query = firstQuery + i;
483bf215546Sopenharmony_ci      struct query_slot *slot = slot_address(pool, query);
484bf215546Sopenharmony_ci      bool available = query_is_available(slot);
485bf215546Sopenharmony_ci      uint32_t result_count = get_result_count(pool);
486bf215546Sopenharmony_ci      uint32_t statistics = pool->pipeline_statistics;
487bf215546Sopenharmony_ci
488bf215546Sopenharmony_ci      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
489bf215546Sopenharmony_ci         VkResult wait_result = wait_for_available(device, pool, query);
490bf215546Sopenharmony_ci         if (wait_result != VK_SUCCESS)
491bf215546Sopenharmony_ci            return wait_result;
492bf215546Sopenharmony_ci         available = true;
493bf215546Sopenharmony_ci      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
494bf215546Sopenharmony_ci         /* From the Vulkan 1.1.130 spec:
495bf215546Sopenharmony_ci          *
496bf215546Sopenharmony_ci          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
497bf215546Sopenharmony_ci          *    both not set then no result values are written to pData for
498bf215546Sopenharmony_ci          *    queries that are in the unavailable state at the time of the
499bf215546Sopenharmony_ci          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
500bf215546Sopenharmony_ci          *    availability state is still written to pData for those queries
501bf215546Sopenharmony_ci          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
502bf215546Sopenharmony_ci          */
503bf215546Sopenharmony_ci         result = VK_NOT_READY;
504bf215546Sopenharmony_ci         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
505bf215546Sopenharmony_ci            result_base += stride;
506bf215546Sopenharmony_ci            continue;
507bf215546Sopenharmony_ci         }
508bf215546Sopenharmony_ci      }
509bf215546Sopenharmony_ci
510bf215546Sopenharmony_ci      for (uint32_t k = 0; k < result_count; k++) {
511bf215546Sopenharmony_ci         if (available) {
512bf215546Sopenharmony_ci            uint64_t *result;
513bf215546Sopenharmony_ci
514bf215546Sopenharmony_ci            if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
515bf215546Sopenharmony_ci               uint32_t stat_idx = statistics_index(&statistics);
516bf215546Sopenharmony_ci               result = query_result_addr(pool, query, uint64_t, stat_idx);
517bf215546Sopenharmony_ci            } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
518bf215546Sopenharmony_ci               result = query_result_addr(pool, query, struct perfcntr_query_slot, k);
519bf215546Sopenharmony_ci            } else {
520bf215546Sopenharmony_ci               result = query_result_addr(pool, query, uint64_t, k);
521bf215546Sopenharmony_ci            }
522bf215546Sopenharmony_ci
523bf215546Sopenharmony_ci            write_query_value_cpu(result_base, k, *result, flags);
524bf215546Sopenharmony_ci         } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
525bf215546Sopenharmony_ci             /* From the Vulkan 1.1.130 spec:
526bf215546Sopenharmony_ci              *
527bf215546Sopenharmony_ci              *   If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
528bf215546Sopenharmony_ci              *   is not set, and the query’s status is unavailable, an
529bf215546Sopenharmony_ci              *   intermediate result value between zero and the final result
530bf215546Sopenharmony_ci              *   value is written to pData for that query.
531bf215546Sopenharmony_ci              *
532bf215546Sopenharmony_ci              * Just return 0 here for simplicity since it's a valid result.
533bf215546Sopenharmony_ci              */
534bf215546Sopenharmony_ci            write_query_value_cpu(result_base, k, 0, flags);
535bf215546Sopenharmony_ci      }
536bf215546Sopenharmony_ci
537bf215546Sopenharmony_ci      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
538bf215546Sopenharmony_ci         /* From the Vulkan 1.1.130 spec:
539bf215546Sopenharmony_ci          *
540bf215546Sopenharmony_ci          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
541bf215546Sopenharmony_ci          *    integer value written for each query is non-zero if the query’s
542bf215546Sopenharmony_ci          *    status was available or zero if the status was unavailable.
543bf215546Sopenharmony_ci          */
544bf215546Sopenharmony_ci         write_query_value_cpu(result_base, result_count, available, flags);
545bf215546Sopenharmony_ci
546bf215546Sopenharmony_ci      result_base += stride;
547bf215546Sopenharmony_ci   }
548bf215546Sopenharmony_ci   return result;
549bf215546Sopenharmony_ci}
550bf215546Sopenharmony_ci
551bf215546Sopenharmony_ciVKAPI_ATTR VkResult VKAPI_CALL
552bf215546Sopenharmony_citu_GetQueryPoolResults(VkDevice _device,
553bf215546Sopenharmony_ci                       VkQueryPool queryPool,
554bf215546Sopenharmony_ci                       uint32_t firstQuery,
555bf215546Sopenharmony_ci                       uint32_t queryCount,
556bf215546Sopenharmony_ci                       size_t dataSize,
557bf215546Sopenharmony_ci                       void *pData,
558bf215546Sopenharmony_ci                       VkDeviceSize stride,
559bf215546Sopenharmony_ci                       VkQueryResultFlags flags)
560bf215546Sopenharmony_ci{
561bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_device, device, _device);
562bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
563bf215546Sopenharmony_ci   assert(firstQuery + queryCount <= pool->size);
564bf215546Sopenharmony_ci
565bf215546Sopenharmony_ci   if (vk_device_is_lost(&device->vk))
566bf215546Sopenharmony_ci      return VK_ERROR_DEVICE_LOST;
567bf215546Sopenharmony_ci
568bf215546Sopenharmony_ci   switch (pool->type) {
569bf215546Sopenharmony_ci   case VK_QUERY_TYPE_OCCLUSION:
570bf215546Sopenharmony_ci   case VK_QUERY_TYPE_TIMESTAMP:
571bf215546Sopenharmony_ci   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
572bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
573bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
574bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
575bf215546Sopenharmony_ci      return get_query_pool_results(device, pool, firstQuery, queryCount,
576bf215546Sopenharmony_ci                                    dataSize, pData, stride, flags);
577bf215546Sopenharmony_ci   default:
578bf215546Sopenharmony_ci      assert(!"Invalid query type");
579bf215546Sopenharmony_ci   }
580bf215546Sopenharmony_ci   return VK_SUCCESS;
581bf215546Sopenharmony_ci}
582bf215546Sopenharmony_ci
583bf215546Sopenharmony_ci/* Copies a query value from one buffer to another from the GPU. */
584bf215546Sopenharmony_cistatic void
585bf215546Sopenharmony_cicopy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
586bf215546Sopenharmony_ci                     struct tu_cs *cs,
587bf215546Sopenharmony_ci                     uint64_t src_iova,
588bf215546Sopenharmony_ci                     uint64_t base_write_iova,
589bf215546Sopenharmony_ci                     uint32_t offset,
590bf215546Sopenharmony_ci                     VkQueryResultFlags flags) {
591bf215546Sopenharmony_ci   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
592bf215546Sopenharmony_ci         sizeof(uint64_t) : sizeof(uint32_t);
593bf215546Sopenharmony_ci   uint64_t write_iova = base_write_iova + (offset * element_size);
594bf215546Sopenharmony_ci
595bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
596bf215546Sopenharmony_ci   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
597bf215546Sopenharmony_ci         CP_MEM_TO_MEM_0_DOUBLE : 0;
598bf215546Sopenharmony_ci   tu_cs_emit(cs, mem_to_mem_flags);
599bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, write_iova);
600bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, src_iova);
601bf215546Sopenharmony_ci}
602bf215546Sopenharmony_ci
603bf215546Sopenharmony_cistatic void
604bf215546Sopenharmony_ciemit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
605bf215546Sopenharmony_ci                             struct tu_cs *cs,
606bf215546Sopenharmony_ci                             struct tu_query_pool *pool,
607bf215546Sopenharmony_ci                             uint32_t firstQuery,
608bf215546Sopenharmony_ci                             uint32_t queryCount,
609bf215546Sopenharmony_ci                             struct tu_buffer *buffer,
610bf215546Sopenharmony_ci                             VkDeviceSize dstOffset,
611bf215546Sopenharmony_ci                             VkDeviceSize stride,
612bf215546Sopenharmony_ci                             VkQueryResultFlags flags)
613bf215546Sopenharmony_ci{
614bf215546Sopenharmony_ci   /* From the Vulkan 1.1.130 spec:
615bf215546Sopenharmony_ci    *
616bf215546Sopenharmony_ci    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
617bf215546Sopenharmony_ci    *    uses of vkCmdResetQueryPool in the same queue, without any additional
618bf215546Sopenharmony_ci    *    synchronization.
619bf215546Sopenharmony_ci    *
620bf215546Sopenharmony_ci    * To ensure that previous writes to the available bit are coherent, first
621bf215546Sopenharmony_ci    * wait for all writes to complete.
622bf215546Sopenharmony_ci    */
623bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
624bf215546Sopenharmony_ci
625bf215546Sopenharmony_ci   for (uint32_t i = 0; i < queryCount; i++) {
626bf215546Sopenharmony_ci      uint32_t query = firstQuery + i;
627bf215546Sopenharmony_ci      uint64_t available_iova = query_available_iova(pool, query);
628bf215546Sopenharmony_ci      uint64_t buffer_iova = buffer->iova + dstOffset + i * stride;
629bf215546Sopenharmony_ci      uint32_t result_count = get_result_count(pool);
630bf215546Sopenharmony_ci      uint32_t statistics = pool->pipeline_statistics;
631bf215546Sopenharmony_ci
632bf215546Sopenharmony_ci      /* Wait for the available bit to be set if executed with the
633bf215546Sopenharmony_ci       * VK_QUERY_RESULT_WAIT_BIT flag. */
634bf215546Sopenharmony_ci      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
635bf215546Sopenharmony_ci         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
636bf215546Sopenharmony_ci         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
637bf215546Sopenharmony_ci                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
638bf215546Sopenharmony_ci         tu_cs_emit_qw(cs, available_iova);
639bf215546Sopenharmony_ci         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
640bf215546Sopenharmony_ci         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
641bf215546Sopenharmony_ci         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
642bf215546Sopenharmony_ci      }
643bf215546Sopenharmony_ci
644bf215546Sopenharmony_ci      for (uint32_t k = 0; k < result_count; k++) {
645bf215546Sopenharmony_ci         uint64_t result_iova;
646bf215546Sopenharmony_ci
647bf215546Sopenharmony_ci         if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
648bf215546Sopenharmony_ci            uint32_t stat_idx = statistics_index(&statistics);
649bf215546Sopenharmony_ci            result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
650bf215546Sopenharmony_ci         } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
651bf215546Sopenharmony_ci            result_iova = query_result_iova(pool, query,
652bf215546Sopenharmony_ci                                            struct perfcntr_query_slot, k);
653bf215546Sopenharmony_ci         } else {
654bf215546Sopenharmony_ci            result_iova = query_result_iova(pool, query, uint64_t, k);
655bf215546Sopenharmony_ci         }
656bf215546Sopenharmony_ci
657bf215546Sopenharmony_ci         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
658bf215546Sopenharmony_ci            /* Unconditionally copying the bo->result into the buffer here is
659bf215546Sopenharmony_ci             * valid because we only set bo->result on vkCmdEndQuery. Thus, even
660bf215546Sopenharmony_ci             * if the query is unavailable, this will copy the correct partial
661bf215546Sopenharmony_ci             * value of 0.
662bf215546Sopenharmony_ci             */
663bf215546Sopenharmony_ci            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
664bf215546Sopenharmony_ci                                 k /* offset */, flags);
665bf215546Sopenharmony_ci         } else {
666bf215546Sopenharmony_ci            /* Conditionally copy bo->result into the buffer based on whether the
667bf215546Sopenharmony_ci             * query is available.
668bf215546Sopenharmony_ci             *
669bf215546Sopenharmony_ci             * NOTE: For the conditional packets to be executed, CP_COND_EXEC
670bf215546Sopenharmony_ci             * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
671bf215546Sopenharmony_ci             * that 0 < available < 2, aka available == 1.
672bf215546Sopenharmony_ci             */
673bf215546Sopenharmony_ci            tu_cs_reserve(cs, 7 + 6);
674bf215546Sopenharmony_ci            tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
675bf215546Sopenharmony_ci            tu_cs_emit_qw(cs, available_iova);
676bf215546Sopenharmony_ci            tu_cs_emit_qw(cs, available_iova);
677bf215546Sopenharmony_ci            tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
678bf215546Sopenharmony_ci            tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */
679bf215546Sopenharmony_ci
680bf215546Sopenharmony_ci            /* Start of conditional execution */
681bf215546Sopenharmony_ci            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
682bf215546Sopenharmony_ci                              k /* offset */, flags);
683bf215546Sopenharmony_ci            /* End of conditional execution */
684bf215546Sopenharmony_ci         }
685bf215546Sopenharmony_ci      }
686bf215546Sopenharmony_ci
687bf215546Sopenharmony_ci      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
688bf215546Sopenharmony_ci         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
689bf215546Sopenharmony_ci                              result_count /* offset */, flags);
690bf215546Sopenharmony_ci      }
691bf215546Sopenharmony_ci   }
692bf215546Sopenharmony_ci}
693bf215546Sopenharmony_ci
694bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL
695bf215546Sopenharmony_citu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
696bf215546Sopenharmony_ci                           VkQueryPool queryPool,
697bf215546Sopenharmony_ci                           uint32_t firstQuery,
698bf215546Sopenharmony_ci                           uint32_t queryCount,
699bf215546Sopenharmony_ci                           VkBuffer dstBuffer,
700bf215546Sopenharmony_ci                           VkDeviceSize dstOffset,
701bf215546Sopenharmony_ci                           VkDeviceSize stride,
702bf215546Sopenharmony_ci                           VkQueryResultFlags flags)
703bf215546Sopenharmony_ci{
704bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
705bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
706bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
707bf215546Sopenharmony_ci   struct tu_cs *cs = &cmdbuf->cs;
708bf215546Sopenharmony_ci   assert(firstQuery + queryCount <= pool->size);
709bf215546Sopenharmony_ci
710bf215546Sopenharmony_ci   switch (pool->type) {
711bf215546Sopenharmony_ci   case VK_QUERY_TYPE_OCCLUSION:
712bf215546Sopenharmony_ci   case VK_QUERY_TYPE_TIMESTAMP:
713bf215546Sopenharmony_ci   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
714bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
715bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
716bf215546Sopenharmony_ci      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
717bf215546Sopenharmony_ci               queryCount, buffer, dstOffset, stride, flags);
718bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
719bf215546Sopenharmony_ci      unreachable("allowCommandBufferQueryCopies is false");
720bf215546Sopenharmony_ci   default:
721bf215546Sopenharmony_ci      assert(!"Invalid query type");
722bf215546Sopenharmony_ci   }
723bf215546Sopenharmony_ci}
724bf215546Sopenharmony_ci
725bf215546Sopenharmony_cistatic void
726bf215546Sopenharmony_ciemit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
727bf215546Sopenharmony_ci                      struct tu_query_pool *pool,
728bf215546Sopenharmony_ci                      uint32_t firstQuery,
729bf215546Sopenharmony_ci                      uint32_t queryCount)
730bf215546Sopenharmony_ci{
731bf215546Sopenharmony_ci   struct tu_cs *cs = &cmdbuf->cs;
732bf215546Sopenharmony_ci
733bf215546Sopenharmony_ci   for (uint32_t i = 0; i < queryCount; i++) {
734bf215546Sopenharmony_ci      uint32_t query = firstQuery + i;
735bf215546Sopenharmony_ci      uint32_t statistics = pool->pipeline_statistics;
736bf215546Sopenharmony_ci
737bf215546Sopenharmony_ci      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
738bf215546Sopenharmony_ci      tu_cs_emit_qw(cs, query_available_iova(pool, query));
739bf215546Sopenharmony_ci      tu_cs_emit_qw(cs, 0x0);
740bf215546Sopenharmony_ci
741bf215546Sopenharmony_ci      for (uint32_t k = 0; k < get_result_count(pool); k++) {
742bf215546Sopenharmony_ci         uint64_t result_iova;
743bf215546Sopenharmony_ci
744bf215546Sopenharmony_ci         if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
745bf215546Sopenharmony_ci            uint32_t stat_idx = statistics_index(&statistics);
746bf215546Sopenharmony_ci            result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
747bf215546Sopenharmony_ci         } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
748bf215546Sopenharmony_ci            result_iova = query_result_iova(pool, query,
749bf215546Sopenharmony_ci                                            struct perfcntr_query_slot, k);
750bf215546Sopenharmony_ci         } else {
751bf215546Sopenharmony_ci            result_iova = query_result_iova(pool, query, uint64_t, k);
752bf215546Sopenharmony_ci         }
753bf215546Sopenharmony_ci
754bf215546Sopenharmony_ci         tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
755bf215546Sopenharmony_ci         tu_cs_emit_qw(cs, result_iova);
756bf215546Sopenharmony_ci         tu_cs_emit_qw(cs, 0x0);
757bf215546Sopenharmony_ci      }
758bf215546Sopenharmony_ci   }
759bf215546Sopenharmony_ci
760bf215546Sopenharmony_ci}
761bf215546Sopenharmony_ci
762bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL
763bf215546Sopenharmony_citu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
764bf215546Sopenharmony_ci                     VkQueryPool queryPool,
765bf215546Sopenharmony_ci                     uint32_t firstQuery,
766bf215546Sopenharmony_ci                     uint32_t queryCount)
767bf215546Sopenharmony_ci{
768bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
769bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
770bf215546Sopenharmony_ci
771bf215546Sopenharmony_ci   switch (pool->type) {
772bf215546Sopenharmony_ci   case VK_QUERY_TYPE_TIMESTAMP:
773bf215546Sopenharmony_ci   case VK_QUERY_TYPE_OCCLUSION:
774bf215546Sopenharmony_ci   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
775bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
776bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
777bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
778bf215546Sopenharmony_ci      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
779bf215546Sopenharmony_ci      break;
780bf215546Sopenharmony_ci   default:
781bf215546Sopenharmony_ci      assert(!"Invalid query type");
782bf215546Sopenharmony_ci   }
783bf215546Sopenharmony_ci}
784bf215546Sopenharmony_ci
785bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL
786bf215546Sopenharmony_citu_ResetQueryPool(VkDevice device,
787bf215546Sopenharmony_ci                  VkQueryPool queryPool,
788bf215546Sopenharmony_ci                  uint32_t firstQuery,
789bf215546Sopenharmony_ci                  uint32_t queryCount)
790bf215546Sopenharmony_ci{
791bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
792bf215546Sopenharmony_ci
793bf215546Sopenharmony_ci   for (uint32_t i = 0; i < queryCount; i++) {
794bf215546Sopenharmony_ci      struct query_slot *slot = slot_address(pool, i + firstQuery);
795bf215546Sopenharmony_ci      slot->available = 0;
796bf215546Sopenharmony_ci
797bf215546Sopenharmony_ci      for (uint32_t k = 0; k < get_result_count(pool); k++) {
798bf215546Sopenharmony_ci         uint64_t *res;
799bf215546Sopenharmony_ci
800bf215546Sopenharmony_ci         if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
801bf215546Sopenharmony_ci            res = query_result_addr(pool, i + firstQuery,
802bf215546Sopenharmony_ci                                    struct perfcntr_query_slot, k);
803bf215546Sopenharmony_ci         } else {
804bf215546Sopenharmony_ci            res = query_result_addr(pool, i + firstQuery, uint64_t, k);
805bf215546Sopenharmony_ci         }
806bf215546Sopenharmony_ci
807bf215546Sopenharmony_ci         *res = 0;
808bf215546Sopenharmony_ci      }
809bf215546Sopenharmony_ci   }
810bf215546Sopenharmony_ci}
811bf215546Sopenharmony_ci
812bf215546Sopenharmony_cistatic void
813bf215546Sopenharmony_ciemit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
814bf215546Sopenharmony_ci                           struct tu_query_pool *pool,
815bf215546Sopenharmony_ci                           uint32_t query)
816bf215546Sopenharmony_ci{
817bf215546Sopenharmony_ci   /* From the Vulkan 1.1.130 spec:
818bf215546Sopenharmony_ci    *
819bf215546Sopenharmony_ci    *    A query must begin and end inside the same subpass of a render pass
820bf215546Sopenharmony_ci    *    instance, or must both begin and end outside of a render pass
821bf215546Sopenharmony_ci    *    instance.
822bf215546Sopenharmony_ci    *
823bf215546Sopenharmony_ci    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
824bf215546Sopenharmony_ci    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
825bf215546Sopenharmony_ci    * query begins/ends inside the same subpass of a render pass, we need to
826bf215546Sopenharmony_ci    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
827bf215546Sopenharmony_ci    * is then run on every tile during render, so we just need to accumulate
828bf215546Sopenharmony_ci    * sample counts in slot->result to compute the query result.
829bf215546Sopenharmony_ci    */
830bf215546Sopenharmony_ci   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
831bf215546Sopenharmony_ci
832bf215546Sopenharmony_ci   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
833bf215546Sopenharmony_ci
834bf215546Sopenharmony_ci   tu_cs_emit_regs(cs,
835bf215546Sopenharmony_ci                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
836bf215546Sopenharmony_ci
837bf215546Sopenharmony_ci   tu_cs_emit_regs(cs,
838bf215546Sopenharmony_ci                   A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova));
839bf215546Sopenharmony_ci
840bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
841bf215546Sopenharmony_ci   tu_cs_emit(cs, ZPASS_DONE);
842bf215546Sopenharmony_ci}
843bf215546Sopenharmony_ci
844bf215546Sopenharmony_cistatic void
845bf215546Sopenharmony_ciemit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
846bf215546Sopenharmony_ci                      struct tu_query_pool *pool,
847bf215546Sopenharmony_ci                      uint32_t query)
848bf215546Sopenharmony_ci{
849bf215546Sopenharmony_ci   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
850bf215546Sopenharmony_ci   uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin);
851bf215546Sopenharmony_ci
852bf215546Sopenharmony_ci   if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) {
853bf215546Sopenharmony_ci      bool need_cond_exec = cmdbuf->state.pass && cmdbuf->state.prim_counters_running;
854bf215546Sopenharmony_ci      cmdbuf->state.prim_counters_running++;
855bf215546Sopenharmony_ci
856bf215546Sopenharmony_ci      /* Prevent starting primitive counters when it is supposed to be stopped
857bf215546Sopenharmony_ci       * for outer VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query.
858bf215546Sopenharmony_ci       */
859bf215546Sopenharmony_ci      if (need_cond_exec) {
860bf215546Sopenharmony_ci         tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
861bf215546Sopenharmony_ci                        CP_COND_REG_EXEC_0_SYSMEM |
862bf215546Sopenharmony_ci                        CP_COND_REG_EXEC_0_BINNING);
863bf215546Sopenharmony_ci      }
864bf215546Sopenharmony_ci
865bf215546Sopenharmony_ci      tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS);
866bf215546Sopenharmony_ci
867bf215546Sopenharmony_ci      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
868bf215546Sopenharmony_ci      tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
869bf215546Sopenharmony_ci      tu_cs_emit(cs, 0);
870bf215546Sopenharmony_ci
871bf215546Sopenharmony_ci      if (need_cond_exec) {
872bf215546Sopenharmony_ci         tu_cond_exec_end(cs);
873bf215546Sopenharmony_ci      }
874bf215546Sopenharmony_ci   }
875bf215546Sopenharmony_ci
876bf215546Sopenharmony_ci   if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) {
877bf215546Sopenharmony_ci      tu6_emit_event_write(cmdbuf, cs, START_FRAGMENT_CTRS);
878bf215546Sopenharmony_ci   }
879bf215546Sopenharmony_ci
880bf215546Sopenharmony_ci   if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) {
881bf215546Sopenharmony_ci      tu6_emit_event_write(cmdbuf, cs, START_COMPUTE_CTRS);
882bf215546Sopenharmony_ci   }
883bf215546Sopenharmony_ci
884bf215546Sopenharmony_ci   tu_cs_emit_wfi(cs);
885bf215546Sopenharmony_ci
886bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
887bf215546Sopenharmony_ci   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
888bf215546Sopenharmony_ci                  CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
889bf215546Sopenharmony_ci                  CP_REG_TO_MEM_0_64B);
890bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, begin_iova);
891bf215546Sopenharmony_ci}
892bf215546Sopenharmony_ci
893bf215546Sopenharmony_cistatic void
894bf215546Sopenharmony_ciemit_perfcntrs_pass_start(struct tu_cs *cs, uint32_t pass)
895bf215546Sopenharmony_ci{
896bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
897bf215546Sopenharmony_ci   tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
898bf215546Sopenharmony_ci                        REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG)) |
899bf215546Sopenharmony_ci                  A6XX_CP_REG_TEST_0_BIT(pass) |
900bf215546Sopenharmony_ci                  A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
901bf215546Sopenharmony_ci   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
902bf215546Sopenharmony_ci}
903bf215546Sopenharmony_ci
904bf215546Sopenharmony_cistatic void
905bf215546Sopenharmony_ciemit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
906bf215546Sopenharmony_ci                           struct tu_query_pool *pool,
907bf215546Sopenharmony_ci                           uint32_t query)
908bf215546Sopenharmony_ci{
909bf215546Sopenharmony_ci   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
910bf215546Sopenharmony_ci   uint32_t last_pass = ~0;
911bf215546Sopenharmony_ci
912bf215546Sopenharmony_ci   if (cmdbuf->state.pass) {
913bf215546Sopenharmony_ci      cmdbuf->state.rp.draw_cs_writes_to_cond_pred = true;
914bf215546Sopenharmony_ci   }
915bf215546Sopenharmony_ci
916bf215546Sopenharmony_ci   /* Querying perf counters happens in these steps:
917bf215546Sopenharmony_ci    *
918bf215546Sopenharmony_ci    *  0) There's a scratch reg to set a pass index for perf counters query.
919bf215546Sopenharmony_ci    *     Prepare cmd streams to set each pass index to the reg at device
920bf215546Sopenharmony_ci    *     creation time. See tu_CreateDevice in tu_device.c
921bf215546Sopenharmony_ci    *  1) Emit command streams to read all requested perf counters at all
922bf215546Sopenharmony_ci    *     passes in begin/end query with CP_REG_TEST/CP_COND_REG_EXEC, which
923bf215546Sopenharmony_ci    *     reads the scratch reg where pass index is set.
924bf215546Sopenharmony_ci    *     See emit_perfcntrs_pass_start.
925bf215546Sopenharmony_ci    *  2) Pick the right cs setting proper pass index to the reg and prepend
926bf215546Sopenharmony_ci    *     it to the command buffer at each submit time.
927bf215546Sopenharmony_ci    *     See tu_QueueSubmit in tu_drm.c
928bf215546Sopenharmony_ci    *  3) If the pass index in the reg is true, then executes the command
929bf215546Sopenharmony_ci    *     stream below CP_COND_REG_EXEC.
930bf215546Sopenharmony_ci    */
931bf215546Sopenharmony_ci
932bf215546Sopenharmony_ci   tu_cs_emit_wfi(cs);
933bf215546Sopenharmony_ci
934bf215546Sopenharmony_ci   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
935bf215546Sopenharmony_ci      struct tu_perf_query_data *data = &pool->perf_query_data[i];
936bf215546Sopenharmony_ci
937bf215546Sopenharmony_ci      if (last_pass != data->pass) {
938bf215546Sopenharmony_ci         last_pass = data->pass;
939bf215546Sopenharmony_ci
940bf215546Sopenharmony_ci         if (data->pass != 0)
941bf215546Sopenharmony_ci            tu_cond_exec_end(cs);
942bf215546Sopenharmony_ci         emit_perfcntrs_pass_start(cs, data->pass);
943bf215546Sopenharmony_ci      }
944bf215546Sopenharmony_ci
945bf215546Sopenharmony_ci      const struct fd_perfcntr_counter *counter =
946bf215546Sopenharmony_ci            &pool->perf_group[data->gid].counters[data->cntr_reg];
947bf215546Sopenharmony_ci      const struct fd_perfcntr_countable *countable =
948bf215546Sopenharmony_ci            &pool->perf_group[data->gid].countables[data->cid];
949bf215546Sopenharmony_ci
950bf215546Sopenharmony_ci      tu_cs_emit_pkt4(cs, counter->select_reg, 1);
951bf215546Sopenharmony_ci      tu_cs_emit(cs, countable->selector);
952bf215546Sopenharmony_ci   }
953bf215546Sopenharmony_ci   tu_cond_exec_end(cs);
954bf215546Sopenharmony_ci
955bf215546Sopenharmony_ci   last_pass = ~0;
956bf215546Sopenharmony_ci   tu_cs_emit_wfi(cs);
957bf215546Sopenharmony_ci
958bf215546Sopenharmony_ci   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
959bf215546Sopenharmony_ci      struct tu_perf_query_data *data = &pool->perf_query_data[i];
960bf215546Sopenharmony_ci
961bf215546Sopenharmony_ci      if (last_pass != data->pass) {
962bf215546Sopenharmony_ci         last_pass = data->pass;
963bf215546Sopenharmony_ci
964bf215546Sopenharmony_ci         if (data->pass != 0)
965bf215546Sopenharmony_ci            tu_cond_exec_end(cs);
966bf215546Sopenharmony_ci         emit_perfcntrs_pass_start(cs, data->pass);
967bf215546Sopenharmony_ci      }
968bf215546Sopenharmony_ci
969bf215546Sopenharmony_ci      const struct fd_perfcntr_counter *counter =
970bf215546Sopenharmony_ci            &pool->perf_group[data->gid].counters[data->cntr_reg];
971bf215546Sopenharmony_ci
972bf215546Sopenharmony_ci      uint64_t begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
973bf215546Sopenharmony_ci
974bf215546Sopenharmony_ci      tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
975bf215546Sopenharmony_ci      tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
976bf215546Sopenharmony_ci                     CP_REG_TO_MEM_0_64B);
977bf215546Sopenharmony_ci      tu_cs_emit_qw(cs, begin_iova);
978bf215546Sopenharmony_ci   }
979bf215546Sopenharmony_ci   tu_cond_exec_end(cs);
980bf215546Sopenharmony_ci}
981bf215546Sopenharmony_ci
982bf215546Sopenharmony_cistatic void
983bf215546Sopenharmony_ciemit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
984bf215546Sopenharmony_ci                     struct tu_query_pool *pool,
985bf215546Sopenharmony_ci                     uint32_t query,
986bf215546Sopenharmony_ci                     uint32_t stream_id)
987bf215546Sopenharmony_ci{
988bf215546Sopenharmony_ci   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
989bf215546Sopenharmony_ci   uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);
990bf215546Sopenharmony_ci
991bf215546Sopenharmony_ci   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = begin_iova));
992bf215546Sopenharmony_ci   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
993bf215546Sopenharmony_ci}
994bf215546Sopenharmony_ci
995bf215546Sopenharmony_cistatic void
996bf215546Sopenharmony_ciemit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
997bf215546Sopenharmony_ci                                struct tu_query_pool *pool,
998bf215546Sopenharmony_ci                                uint32_t query)
999bf215546Sopenharmony_ci{
1000bf215546Sopenharmony_ci   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1001bf215546Sopenharmony_ci   uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
1002bf215546Sopenharmony_ci
1003bf215546Sopenharmony_ci   if (cmdbuf->state.pass) {
1004bf215546Sopenharmony_ci      cmdbuf->state.rp.has_prim_generated_query_in_rp = true;
1005bf215546Sopenharmony_ci   } else {
1006bf215546Sopenharmony_ci      cmdbuf->state.prim_generated_query_running_before_rp = true;
1007bf215546Sopenharmony_ci   }
1008bf215546Sopenharmony_ci
1009bf215546Sopenharmony_ci   cmdbuf->state.prim_counters_running++;
1010bf215546Sopenharmony_ci
1011bf215546Sopenharmony_ci   if (cmdbuf->state.pass) {
1012bf215546Sopenharmony_ci      /* Primitives that passed all tests are still counted in in each
1013bf215546Sopenharmony_ci       * tile even with HW binning beforehand. Do not permit it.
1014bf215546Sopenharmony_ci       */
1015bf215546Sopenharmony_ci      tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1016bf215546Sopenharmony_ci                           CP_COND_REG_EXEC_0_SYSMEM |
1017bf215546Sopenharmony_ci                           CP_COND_REG_EXEC_0_BINNING);
1018bf215546Sopenharmony_ci   }
1019bf215546Sopenharmony_ci
1020bf215546Sopenharmony_ci   tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS);
1021bf215546Sopenharmony_ci
1022bf215546Sopenharmony_ci   tu_cs_emit_wfi(cs);
1023bf215546Sopenharmony_ci
1024bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1025bf215546Sopenharmony_ci   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
1026bf215546Sopenharmony_ci                  CP_REG_TO_MEM_0_CNT(2) |
1027bf215546Sopenharmony_ci                  CP_REG_TO_MEM_0_64B);
1028bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, begin_iova);
1029bf215546Sopenharmony_ci
1030bf215546Sopenharmony_ci   if (cmdbuf->state.pass) {
1031bf215546Sopenharmony_ci      tu_cond_exec_end(cs);
1032bf215546Sopenharmony_ci   }
1033bf215546Sopenharmony_ci}
1034bf215546Sopenharmony_ci
1035bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL
1036bf215546Sopenharmony_citu_CmdBeginQuery(VkCommandBuffer commandBuffer,
1037bf215546Sopenharmony_ci                 VkQueryPool queryPool,
1038bf215546Sopenharmony_ci                 uint32_t query,
1039bf215546Sopenharmony_ci                 VkQueryControlFlags flags)
1040bf215546Sopenharmony_ci{
1041bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1042bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1043bf215546Sopenharmony_ci   assert(query < pool->size);
1044bf215546Sopenharmony_ci
1045bf215546Sopenharmony_ci   switch (pool->type) {
1046bf215546Sopenharmony_ci   case VK_QUERY_TYPE_OCCLUSION:
1047bf215546Sopenharmony_ci      /* In freedreno, there is no implementation difference between
1048bf215546Sopenharmony_ci       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
1049bf215546Sopenharmony_ci       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
1050bf215546Sopenharmony_ci       */
1051bf215546Sopenharmony_ci      emit_begin_occlusion_query(cmdbuf, pool, query);
1052bf215546Sopenharmony_ci      break;
1053bf215546Sopenharmony_ci   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1054bf215546Sopenharmony_ci      emit_begin_xfb_query(cmdbuf, pool, query, 0);
1055bf215546Sopenharmony_ci      break;
1056bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1057bf215546Sopenharmony_ci      emit_begin_prim_generated_query(cmdbuf, pool, query);
1058bf215546Sopenharmony_ci      break;
1059bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1060bf215546Sopenharmony_ci      emit_begin_perf_query(cmdbuf, pool, query);
1061bf215546Sopenharmony_ci      break;
1062bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1063bf215546Sopenharmony_ci      emit_begin_stat_query(cmdbuf, pool, query);
1064bf215546Sopenharmony_ci      break;
1065bf215546Sopenharmony_ci   case VK_QUERY_TYPE_TIMESTAMP:
1066bf215546Sopenharmony_ci      unreachable("Unimplemented query type");
1067bf215546Sopenharmony_ci   default:
1068bf215546Sopenharmony_ci      assert(!"Invalid query type");
1069bf215546Sopenharmony_ci   }
1070bf215546Sopenharmony_ci}
1071bf215546Sopenharmony_ci
1072bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL
1073bf215546Sopenharmony_citu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
1074bf215546Sopenharmony_ci                           VkQueryPool queryPool,
1075bf215546Sopenharmony_ci                           uint32_t query,
1076bf215546Sopenharmony_ci                           VkQueryControlFlags flags,
1077bf215546Sopenharmony_ci                           uint32_t index)
1078bf215546Sopenharmony_ci{
1079bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1080bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1081bf215546Sopenharmony_ci   assert(query < pool->size);
1082bf215546Sopenharmony_ci
1083bf215546Sopenharmony_ci   switch (pool->type) {
1084bf215546Sopenharmony_ci   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1085bf215546Sopenharmony_ci      emit_begin_xfb_query(cmdbuf, pool, query, index);
1086bf215546Sopenharmony_ci      break;
1087bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1088bf215546Sopenharmony_ci      emit_begin_prim_generated_query(cmdbuf, pool, query);
1089bf215546Sopenharmony_ci      break;
1090bf215546Sopenharmony_ci   default:
1091bf215546Sopenharmony_ci      assert(!"Invalid query type");
1092bf215546Sopenharmony_ci   }
1093bf215546Sopenharmony_ci}
1094bf215546Sopenharmony_ci
1095bf215546Sopenharmony_cistatic void
1096bf215546Sopenharmony_ciemit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
1097bf215546Sopenharmony_ci                         struct tu_query_pool *pool,
1098bf215546Sopenharmony_ci                         uint32_t query)
1099bf215546Sopenharmony_ci{
1100bf215546Sopenharmony_ci   /* Ending an occlusion query happens in a few steps:
1101bf215546Sopenharmony_ci    *    1) Set the slot->end to UINT64_MAX.
1102bf215546Sopenharmony_ci    *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
1103bf215546Sopenharmony_ci    *       write the current sample count value into slot->end.
1104bf215546Sopenharmony_ci    *    3) Since (2) is asynchronous, wait until slot->end is not equal to
1105bf215546Sopenharmony_ci    *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
1106bf215546Sopenharmony_ci    *    4) Accumulate the results of the query (slot->end - slot->begin) into
1107bf215546Sopenharmony_ci    *       slot->result.
1108bf215546Sopenharmony_ci    *    5) If vkCmdEndQuery is *not* called from within the scope of a render
1109bf215546Sopenharmony_ci    *       pass, set the slot's available bit since the query is now done.
1110bf215546Sopenharmony_ci    *    6) If vkCmdEndQuery *is* called from within the scope of a render
1111bf215546Sopenharmony_ci    *       pass, we cannot mark as available yet since the commands in
1112bf215546Sopenharmony_ci    *       draw_cs are not run until vkCmdEndRenderPass.
1113bf215546Sopenharmony_ci    */
1114bf215546Sopenharmony_ci   const struct tu_render_pass *pass = cmdbuf->state.pass;
1115bf215546Sopenharmony_ci   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1116bf215546Sopenharmony_ci
1117bf215546Sopenharmony_ci   uint64_t available_iova = query_available_iova(pool, query);
1118bf215546Sopenharmony_ci   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
1119bf215546Sopenharmony_ci   uint64_t end_iova = occlusion_query_iova(pool, query, end);
1120bf215546Sopenharmony_ci   uint64_t result_iova = query_result_iova(pool, query, uint64_t, 0);
1121bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1122bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, end_iova);
1123bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, 0xffffffffffffffffull);
1124bf215546Sopenharmony_ci
1125bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1126bf215546Sopenharmony_ci
1127bf215546Sopenharmony_ci   tu_cs_emit_regs(cs,
1128bf215546Sopenharmony_ci                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
1129bf215546Sopenharmony_ci
1130bf215546Sopenharmony_ci   tu_cs_emit_regs(cs,
1131bf215546Sopenharmony_ci                   A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova));
1132bf215546Sopenharmony_ci
1133bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1134bf215546Sopenharmony_ci   tu_cs_emit(cs, ZPASS_DONE);
1135bf215546Sopenharmony_ci
1136bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
1137bf215546Sopenharmony_ci   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
1138bf215546Sopenharmony_ci                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
1139bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, end_iova);
1140bf215546Sopenharmony_ci   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
1141bf215546Sopenharmony_ci   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
1142bf215546Sopenharmony_ci   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
1143bf215546Sopenharmony_ci
1144bf215546Sopenharmony_ci   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
1145bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1146bf215546Sopenharmony_ci   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
1147bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, result_iova);
1148bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, result_iova);
1149bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, end_iova);
1150bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, begin_iova);
1151bf215546Sopenharmony_ci
1152bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1153bf215546Sopenharmony_ci
1154bf215546Sopenharmony_ci   if (pass)
1155bf215546Sopenharmony_ci      /* Technically, queries should be tracked per-subpass, but here we track
1156bf215546Sopenharmony_ci       * at the render pass level to simply the code a bit. This is safe
1157bf215546Sopenharmony_ci       * because the only commands that use the available bit are
1158bf215546Sopenharmony_ci       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
1159bf215546Sopenharmony_ci       * cannot be invoked from inside a render pass scope.
1160bf215546Sopenharmony_ci       */
1161bf215546Sopenharmony_ci      cs = &cmdbuf->draw_epilogue_cs;
1162bf215546Sopenharmony_ci
1163bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1164bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, available_iova);
1165bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, 0x1);
1166bf215546Sopenharmony_ci}
1167bf215546Sopenharmony_ci
1168bf215546Sopenharmony_ci/* PRIMITIVE_CTRS is used for two distinct queries:
1169bf215546Sopenharmony_ci * - VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT
1170bf215546Sopenharmony_ci * - VK_QUERY_TYPE_PIPELINE_STATISTICS
1171bf215546Sopenharmony_ci * If one is nested inside other - STOP_PRIMITIVE_CTRS should be emitted
1172bf215546Sopenharmony_ci * only for outer query.
1173bf215546Sopenharmony_ci *
1174bf215546Sopenharmony_ci * Also, pipeline stat query could run outside of renderpass and prim gen
1175bf215546Sopenharmony_ci * query inside of secondary cmd buffer - for such case we ought to track
1176bf215546Sopenharmony_ci * the status of pipeline stats query.
1177bf215546Sopenharmony_ci */
1178bf215546Sopenharmony_cistatic void
1179bf215546Sopenharmony_ciemit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf,
1180bf215546Sopenharmony_ci                         struct tu_cs *cs,
1181bf215546Sopenharmony_ci                         enum VkQueryType query_type)
1182bf215546Sopenharmony_ci{
1183bf215546Sopenharmony_ci   bool is_secondary = cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY;
1184bf215546Sopenharmony_ci   cmdbuf->state.prim_counters_running--;
1185bf215546Sopenharmony_ci   if (cmdbuf->state.prim_counters_running == 0) {
1186bf215546Sopenharmony_ci      bool need_cond_exec =
1187bf215546Sopenharmony_ci         is_secondary &&
1188bf215546Sopenharmony_ci         query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT &&
1189bf215546Sopenharmony_ci         is_pipeline_query_with_vertex_stage(cmdbuf->inherited_pipeline_statistics);
1190bf215546Sopenharmony_ci
1191bf215546Sopenharmony_ci      if (!need_cond_exec) {
1192bf215546Sopenharmony_ci         tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
1193bf215546Sopenharmony_ci      } else {
1194bf215546Sopenharmony_ci         tu_cs_reserve(cs, 7 + 2);
1195bf215546Sopenharmony_ci         /* Check that pipeline stats query is not running, only then
1196bf215546Sopenharmony_ci          * we count stop the counter.
1197bf215546Sopenharmony_ci          */
1198bf215546Sopenharmony_ci         tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
1199bf215546Sopenharmony_ci         tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1200bf215546Sopenharmony_ci         tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1201bf215546Sopenharmony_ci         tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
1202bf215546Sopenharmony_ci         tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */
1203bf215546Sopenharmony_ci
1204bf215546Sopenharmony_ci         tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
1205bf215546Sopenharmony_ci      }
1206bf215546Sopenharmony_ci   }
1207bf215546Sopenharmony_ci
1208bf215546Sopenharmony_ci   if (query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1209bf215546Sopenharmony_ci      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
1210bf215546Sopenharmony_ci      tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1211bf215546Sopenharmony_ci      tu_cs_emit(cs, 1);
1212bf215546Sopenharmony_ci   }
1213bf215546Sopenharmony_ci}
1214bf215546Sopenharmony_ci
1215bf215546Sopenharmony_cistatic void
1216bf215546Sopenharmony_ciemit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
1217bf215546Sopenharmony_ci                    struct tu_query_pool *pool,
1218bf215546Sopenharmony_ci                    uint32_t query)
1219bf215546Sopenharmony_ci{
1220bf215546Sopenharmony_ci   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1221bf215546Sopenharmony_ci   uint64_t end_iova = pipeline_stat_query_iova(pool, query, end);
1222bf215546Sopenharmony_ci   uint64_t available_iova = query_available_iova(pool, query);
1223bf215546Sopenharmony_ci   uint64_t result_iova;
1224bf215546Sopenharmony_ci   uint64_t stat_start_iova;
1225bf215546Sopenharmony_ci   uint64_t stat_stop_iova;
1226bf215546Sopenharmony_ci
1227bf215546Sopenharmony_ci   if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) {
1228bf215546Sopenharmony_ci      /* No need to conditionally execute STOP_PRIMITIVE_CTRS when
1229bf215546Sopenharmony_ci       * we are inside VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT inside of a
1230bf215546Sopenharmony_ci       * renderpass, because it is already stopped.
1231bf215546Sopenharmony_ci       */
1232bf215546Sopenharmony_ci      emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PIPELINE_STATISTICS);
1233bf215546Sopenharmony_ci   }
1234bf215546Sopenharmony_ci
1235bf215546Sopenharmony_ci   if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) {
1236bf215546Sopenharmony_ci      tu6_emit_event_write(cmdbuf, cs, STOP_FRAGMENT_CTRS);
1237bf215546Sopenharmony_ci   }
1238bf215546Sopenharmony_ci
1239bf215546Sopenharmony_ci   if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) {
1240bf215546Sopenharmony_ci      tu6_emit_event_write(cmdbuf, cs, STOP_COMPUTE_CTRS);
1241bf215546Sopenharmony_ci   }
1242bf215546Sopenharmony_ci
1243bf215546Sopenharmony_ci   tu_cs_emit_wfi(cs);
1244bf215546Sopenharmony_ci
1245bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1246bf215546Sopenharmony_ci   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
1247bf215546Sopenharmony_ci                  CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
1248bf215546Sopenharmony_ci                  CP_REG_TO_MEM_0_64B);
1249bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, end_iova);
1250bf215546Sopenharmony_ci
1251bf215546Sopenharmony_ci   for (int i = 0; i < STAT_COUNT; i++) {
1252bf215546Sopenharmony_ci      result_iova = query_result_iova(pool, query, uint64_t, i);
1253bf215546Sopenharmony_ci      stat_start_iova = pipeline_stat_query_iova(pool, query, begin[i]);
1254bf215546Sopenharmony_ci      stat_stop_iova = pipeline_stat_query_iova(pool, query, end[i]);
1255bf215546Sopenharmony_ci
1256bf215546Sopenharmony_ci      tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1257bf215546Sopenharmony_ci      tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
1258bf215546Sopenharmony_ci                     CP_MEM_TO_MEM_0_DOUBLE |
1259bf215546Sopenharmony_ci                     CP_MEM_TO_MEM_0_NEG_C);
1260bf215546Sopenharmony_ci
1261bf215546Sopenharmony_ci      tu_cs_emit_qw(cs, result_iova);
1262bf215546Sopenharmony_ci      tu_cs_emit_qw(cs, result_iova);
1263bf215546Sopenharmony_ci      tu_cs_emit_qw(cs, stat_stop_iova);
1264bf215546Sopenharmony_ci      tu_cs_emit_qw(cs, stat_start_iova);
1265bf215546Sopenharmony_ci   }
1266bf215546Sopenharmony_ci
1267bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1268bf215546Sopenharmony_ci
1269bf215546Sopenharmony_ci   if (cmdbuf->state.pass)
1270bf215546Sopenharmony_ci      cs = &cmdbuf->draw_epilogue_cs;
1271bf215546Sopenharmony_ci
1272bf215546Sopenharmony_ci   /* Set the availability to 1 */
1273bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1274bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, available_iova);
1275bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, 0x1);
1276bf215546Sopenharmony_ci}
1277bf215546Sopenharmony_ci
1278bf215546Sopenharmony_cistatic void
1279bf215546Sopenharmony_ciemit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
1280bf215546Sopenharmony_ci                         struct tu_query_pool *pool,
1281bf215546Sopenharmony_ci                         uint32_t query)
1282bf215546Sopenharmony_ci{
1283bf215546Sopenharmony_ci   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1284bf215546Sopenharmony_ci   uint64_t available_iova = query_available_iova(pool, query);
1285bf215546Sopenharmony_ci   uint64_t end_iova;
1286bf215546Sopenharmony_ci   uint64_t begin_iova;
1287bf215546Sopenharmony_ci   uint64_t result_iova;
1288bf215546Sopenharmony_ci   uint32_t last_pass = ~0;
1289bf215546Sopenharmony_ci
1290bf215546Sopenharmony_ci   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1291bf215546Sopenharmony_ci      struct tu_perf_query_data *data = &pool->perf_query_data[i];
1292bf215546Sopenharmony_ci
1293bf215546Sopenharmony_ci      if (last_pass != data->pass) {
1294bf215546Sopenharmony_ci         last_pass = data->pass;
1295bf215546Sopenharmony_ci
1296bf215546Sopenharmony_ci         if (data->pass != 0)
1297bf215546Sopenharmony_ci            tu_cond_exec_end(cs);
1298bf215546Sopenharmony_ci         emit_perfcntrs_pass_start(cs, data->pass);
1299bf215546Sopenharmony_ci      }
1300bf215546Sopenharmony_ci
1301bf215546Sopenharmony_ci      const struct fd_perfcntr_counter *counter =
1302bf215546Sopenharmony_ci            &pool->perf_group[data->gid].counters[data->cntr_reg];
1303bf215546Sopenharmony_ci
1304bf215546Sopenharmony_ci      end_iova = perf_query_iova(pool, 0, end, data->app_idx);
1305bf215546Sopenharmony_ci
1306bf215546Sopenharmony_ci      tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1307bf215546Sopenharmony_ci      tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
1308bf215546Sopenharmony_ci                     CP_REG_TO_MEM_0_64B);
1309bf215546Sopenharmony_ci      tu_cs_emit_qw(cs, end_iova);
1310bf215546Sopenharmony_ci   }
1311bf215546Sopenharmony_ci   tu_cond_exec_end(cs);
1312bf215546Sopenharmony_ci
1313bf215546Sopenharmony_ci   last_pass = ~0;
1314bf215546Sopenharmony_ci   tu_cs_emit_wfi(cs);
1315bf215546Sopenharmony_ci
1316bf215546Sopenharmony_ci   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1317bf215546Sopenharmony_ci      struct tu_perf_query_data *data = &pool->perf_query_data[i];
1318bf215546Sopenharmony_ci
1319bf215546Sopenharmony_ci      if (last_pass != data->pass) {
1320bf215546Sopenharmony_ci         last_pass = data->pass;
1321bf215546Sopenharmony_ci
1322bf215546Sopenharmony_ci
1323bf215546Sopenharmony_ci         if (data->pass != 0)
1324bf215546Sopenharmony_ci            tu_cond_exec_end(cs);
1325bf215546Sopenharmony_ci         emit_perfcntrs_pass_start(cs, data->pass);
1326bf215546Sopenharmony_ci      }
1327bf215546Sopenharmony_ci
1328bf215546Sopenharmony_ci      result_iova = query_result_iova(pool, 0, struct perfcntr_query_slot,
1329bf215546Sopenharmony_ci             data->app_idx);
1330bf215546Sopenharmony_ci      begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
1331bf215546Sopenharmony_ci      end_iova = perf_query_iova(pool, 0, end, data->app_idx);
1332bf215546Sopenharmony_ci
1333bf215546Sopenharmony_ci      /* result += end - begin */
1334bf215546Sopenharmony_ci      tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1335bf215546Sopenharmony_ci      tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
1336bf215546Sopenharmony_ci                     CP_MEM_TO_MEM_0_DOUBLE |
1337bf215546Sopenharmony_ci                     CP_MEM_TO_MEM_0_NEG_C);
1338bf215546Sopenharmony_ci
1339bf215546Sopenharmony_ci      tu_cs_emit_qw(cs, result_iova);
1340bf215546Sopenharmony_ci      tu_cs_emit_qw(cs, result_iova);
1341bf215546Sopenharmony_ci      tu_cs_emit_qw(cs, end_iova);
1342bf215546Sopenharmony_ci      tu_cs_emit_qw(cs, begin_iova);
1343bf215546Sopenharmony_ci   }
1344bf215546Sopenharmony_ci   tu_cond_exec_end(cs);
1345bf215546Sopenharmony_ci
1346bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1347bf215546Sopenharmony_ci
1348bf215546Sopenharmony_ci   if (cmdbuf->state.pass)
1349bf215546Sopenharmony_ci      cs = &cmdbuf->draw_epilogue_cs;
1350bf215546Sopenharmony_ci
1351bf215546Sopenharmony_ci   /* Set the availability to 1 */
1352bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1353bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, available_iova);
1354bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, 0x1);
1355bf215546Sopenharmony_ci}
1356bf215546Sopenharmony_ci
1357bf215546Sopenharmony_cistatic void
1358bf215546Sopenharmony_ciemit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
1359bf215546Sopenharmony_ci                   struct tu_query_pool *pool,
1360bf215546Sopenharmony_ci                   uint32_t query,
1361bf215546Sopenharmony_ci                   uint32_t stream_id)
1362bf215546Sopenharmony_ci{
1363bf215546Sopenharmony_ci   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1364bf215546Sopenharmony_ci
1365bf215546Sopenharmony_ci   uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
1366bf215546Sopenharmony_ci   uint64_t result_written_iova = query_result_iova(pool, query, uint64_t, 0);
1367bf215546Sopenharmony_ci   uint64_t result_generated_iova = query_result_iova(pool, query, uint64_t, 1);
1368bf215546Sopenharmony_ci   uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
1369bf215546Sopenharmony_ci   uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
1370bf215546Sopenharmony_ci   uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
1371bf215546Sopenharmony_ci   uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1);
1372bf215546Sopenharmony_ci   uint64_t available_iova = query_available_iova(pool, query);
1373bf215546Sopenharmony_ci
1374bf215546Sopenharmony_ci   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = end_iova));
1375bf215546Sopenharmony_ci   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
1376bf215546Sopenharmony_ci
1377bf215546Sopenharmony_ci   tu_cs_emit_wfi(cs);
1378bf215546Sopenharmony_ci   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);
1379bf215546Sopenharmony_ci
1380bf215546Sopenharmony_ci   /* Set the count of written primitives */
1381bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1382bf215546Sopenharmony_ci   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1383bf215546Sopenharmony_ci                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
1384bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, result_written_iova);
1385bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, result_written_iova);
1386bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, end_written_iova);
1387bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, begin_written_iova);
1388bf215546Sopenharmony_ci
1389bf215546Sopenharmony_ci   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);
1390bf215546Sopenharmony_ci
1391bf215546Sopenharmony_ci   /* Set the count of generated primitives */
1392bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1393bf215546Sopenharmony_ci   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1394bf215546Sopenharmony_ci                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
1395bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, result_generated_iova);
1396bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, result_generated_iova);
1397bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, end_generated_iova);
1398bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, begin_generated_iova);
1399bf215546Sopenharmony_ci
1400bf215546Sopenharmony_ci   /* Set the availability to 1 */
1401bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1402bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, available_iova);
1403bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, 0x1);
1404bf215546Sopenharmony_ci}
1405bf215546Sopenharmony_ci
1406bf215546Sopenharmony_cistatic void
1407bf215546Sopenharmony_ciemit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
1408bf215546Sopenharmony_ci                              struct tu_query_pool *pool,
1409bf215546Sopenharmony_ci                              uint32_t query)
1410bf215546Sopenharmony_ci{
1411bf215546Sopenharmony_ci   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1412bf215546Sopenharmony_ci
1413bf215546Sopenharmony_ci   if (!cmdbuf->state.pass) {
1414bf215546Sopenharmony_ci      cmdbuf->state.prim_generated_query_running_before_rp = false;
1415bf215546Sopenharmony_ci   }
1416bf215546Sopenharmony_ci
1417bf215546Sopenharmony_ci   uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
1418bf215546Sopenharmony_ci   uint64_t end_iova = primitives_generated_query_iova(pool, query, end);
1419bf215546Sopenharmony_ci   uint64_t result_iova = primitives_generated_query_iova(pool, query, result);
1420bf215546Sopenharmony_ci   uint64_t available_iova = query_available_iova(pool, query);
1421bf215546Sopenharmony_ci
1422bf215546Sopenharmony_ci   if (cmdbuf->state.pass) {
1423bf215546Sopenharmony_ci      tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1424bf215546Sopenharmony_ci                             CP_COND_REG_EXEC_0_SYSMEM |
1425bf215546Sopenharmony_ci                             CP_COND_REG_EXEC_0_BINNING);
1426bf215546Sopenharmony_ci   }
1427bf215546Sopenharmony_ci
1428bf215546Sopenharmony_ci   tu_cs_emit_wfi(cs);
1429bf215546Sopenharmony_ci
1430bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1431bf215546Sopenharmony_ci   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
1432bf215546Sopenharmony_ci                  CP_REG_TO_MEM_0_CNT(2) |
1433bf215546Sopenharmony_ci                  CP_REG_TO_MEM_0_64B);
1434bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, end_iova);
1435bf215546Sopenharmony_ci
1436bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1437bf215546Sopenharmony_ci   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1438bf215546Sopenharmony_ci                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES);
1439bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, result_iova);
1440bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, result_iova);
1441bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, end_iova);
1442bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, begin_iova);
1443bf215546Sopenharmony_ci
1444bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1445bf215546Sopenharmony_ci
1446bf215546Sopenharmony_ci   /* Should be after waiting for mem writes to have up to date info
1447bf215546Sopenharmony_ci    * about which query is running.
1448bf215546Sopenharmony_ci    */
1449bf215546Sopenharmony_ci   emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);
1450bf215546Sopenharmony_ci
1451bf215546Sopenharmony_ci   if (cmdbuf->state.pass) {
1452bf215546Sopenharmony_ci      tu_cond_exec_end(cs);
1453bf215546Sopenharmony_ci   }
1454bf215546Sopenharmony_ci
1455bf215546Sopenharmony_ci   if (cmdbuf->state.pass)
1456bf215546Sopenharmony_ci      cs = &cmdbuf->draw_epilogue_cs;
1457bf215546Sopenharmony_ci
1458bf215546Sopenharmony_ci   /* Set the availability to 1 */
1459bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1460bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, available_iova);
1461bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, 0x1);
1462bf215546Sopenharmony_ci}
1463bf215546Sopenharmony_ci
1464bf215546Sopenharmony_ci/* Implement this bit of spec text from section 17.2 "Query Operation":
1465bf215546Sopenharmony_ci *
1466bf215546Sopenharmony_ci *     If queries are used while executing a render pass instance that has
1467bf215546Sopenharmony_ci *     multiview enabled, the query uses N consecutive query indices in the
1468bf215546Sopenharmony_ci *     query pool (starting at query) where N is the number of bits set in the
1469bf215546Sopenharmony_ci *     view mask in the subpass the query is used in. How the numerical
1470bf215546Sopenharmony_ci *     results of the query are distributed among the queries is
1471bf215546Sopenharmony_ci *     implementation-dependent. For example, some implementations may write
1472bf215546Sopenharmony_ci *     each view’s results to a distinct query, while other implementations
1473bf215546Sopenharmony_ci *     may write the total result to the first query and write zero to the
1474bf215546Sopenharmony_ci *     other queries. However, the sum of the results in all the queries must
1475bf215546Sopenharmony_ci *     accurately reflect the total result of the query summed over all views.
1476bf215546Sopenharmony_ci *     Applications can sum the results from all the queries to compute the
1477bf215546Sopenharmony_ci *     total result.
1478bf215546Sopenharmony_ci *
1479bf215546Sopenharmony_ci * Since we execute all views at once, we write zero to the other queries.
1480bf215546Sopenharmony_ci * Furthermore, because queries must be reset before use, and we set the
1481bf215546Sopenharmony_ci * result to 0 in vkCmdResetQueryPool(), we just need to mark it as available.
1482bf215546Sopenharmony_ci */
1483bf215546Sopenharmony_ci
1484bf215546Sopenharmony_cistatic void
1485bf215546Sopenharmony_cihandle_multiview_queries(struct tu_cmd_buffer *cmd,
1486bf215546Sopenharmony_ci                         struct tu_query_pool *pool,
1487bf215546Sopenharmony_ci                         uint32_t query)
1488bf215546Sopenharmony_ci{
1489bf215546Sopenharmony_ci   if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
1490bf215546Sopenharmony_ci      return;
1491bf215546Sopenharmony_ci
1492bf215546Sopenharmony_ci   unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
1493bf215546Sopenharmony_ci   struct tu_cs *cs = &cmd->draw_epilogue_cs;
1494bf215546Sopenharmony_ci
1495bf215546Sopenharmony_ci   for (uint32_t i = 1; i < views; i++) {
1496bf215546Sopenharmony_ci      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1497bf215546Sopenharmony_ci      tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
1498bf215546Sopenharmony_ci      tu_cs_emit_qw(cs, 0x1);
1499bf215546Sopenharmony_ci   }
1500bf215546Sopenharmony_ci}
1501bf215546Sopenharmony_ci
1502bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL
1503bf215546Sopenharmony_citu_CmdEndQuery(VkCommandBuffer commandBuffer,
1504bf215546Sopenharmony_ci               VkQueryPool queryPool,
1505bf215546Sopenharmony_ci               uint32_t query)
1506bf215546Sopenharmony_ci{
1507bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1508bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1509bf215546Sopenharmony_ci   assert(query < pool->size);
1510bf215546Sopenharmony_ci
1511bf215546Sopenharmony_ci   switch (pool->type) {
1512bf215546Sopenharmony_ci   case VK_QUERY_TYPE_OCCLUSION:
1513bf215546Sopenharmony_ci      emit_end_occlusion_query(cmdbuf, pool, query);
1514bf215546Sopenharmony_ci      break;
1515bf215546Sopenharmony_ci   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1516bf215546Sopenharmony_ci      emit_end_xfb_query(cmdbuf, pool, query, 0);
1517bf215546Sopenharmony_ci      break;
1518bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1519bf215546Sopenharmony_ci      emit_end_prim_generated_query(cmdbuf, pool, query);
1520bf215546Sopenharmony_ci      break;
1521bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1522bf215546Sopenharmony_ci      emit_end_perf_query(cmdbuf, pool, query);
1523bf215546Sopenharmony_ci      break;
1524bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1525bf215546Sopenharmony_ci      emit_end_stat_query(cmdbuf, pool, query);
1526bf215546Sopenharmony_ci      break;
1527bf215546Sopenharmony_ci   case VK_QUERY_TYPE_TIMESTAMP:
1528bf215546Sopenharmony_ci      unreachable("Unimplemented query type");
1529bf215546Sopenharmony_ci   default:
1530bf215546Sopenharmony_ci      assert(!"Invalid query type");
1531bf215546Sopenharmony_ci   }
1532bf215546Sopenharmony_ci
1533bf215546Sopenharmony_ci   handle_multiview_queries(cmdbuf, pool, query);
1534bf215546Sopenharmony_ci}
1535bf215546Sopenharmony_ci
1536bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL
1537bf215546Sopenharmony_citu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
1538bf215546Sopenharmony_ci                         VkQueryPool queryPool,
1539bf215546Sopenharmony_ci                         uint32_t query,
1540bf215546Sopenharmony_ci                         uint32_t index)
1541bf215546Sopenharmony_ci{
1542bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1543bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1544bf215546Sopenharmony_ci   assert(query < pool->size);
1545bf215546Sopenharmony_ci
1546bf215546Sopenharmony_ci   switch (pool->type) {
1547bf215546Sopenharmony_ci   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1548bf215546Sopenharmony_ci      assert(index <= 4);
1549bf215546Sopenharmony_ci      emit_end_xfb_query(cmdbuf, pool, query, index);
1550bf215546Sopenharmony_ci      break;
1551bf215546Sopenharmony_ci   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1552bf215546Sopenharmony_ci      emit_end_prim_generated_query(cmdbuf, pool, query);
1553bf215546Sopenharmony_ci      break;
1554bf215546Sopenharmony_ci   default:
1555bf215546Sopenharmony_ci      assert(!"Invalid query type");
1556bf215546Sopenharmony_ci   }
1557bf215546Sopenharmony_ci}
1558bf215546Sopenharmony_ci
1559bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL
1560bf215546Sopenharmony_citu_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
1561bf215546Sopenharmony_ci                      VkPipelineStageFlagBits2 pipelineStage,
1562bf215546Sopenharmony_ci                      VkQueryPool queryPool,
1563bf215546Sopenharmony_ci                      uint32_t query)
1564bf215546Sopenharmony_ci{
1565bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1566bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1567bf215546Sopenharmony_ci
1568bf215546Sopenharmony_ci   /* Inside a render pass, just write the timestamp multiple times so that
1569bf215546Sopenharmony_ci    * the user gets the last one if we use GMEM. There isn't really much
1570bf215546Sopenharmony_ci    * better we can do, and this seems to be what the blob does too.
1571bf215546Sopenharmony_ci    */
1572bf215546Sopenharmony_ci   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
1573bf215546Sopenharmony_ci
1574bf215546Sopenharmony_ci   /* Stages that will already have been executed by the time the CP executes
1575bf215546Sopenharmony_ci    * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
1576bf215546Sopenharmony_ci    * indirect stage counts as top-of-pipe too.
1577bf215546Sopenharmony_ci    */
1578bf215546Sopenharmony_ci   VkPipelineStageFlags2 top_of_pipe_flags =
1579bf215546Sopenharmony_ci      VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
1580bf215546Sopenharmony_ci      VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT;
1581bf215546Sopenharmony_ci
1582bf215546Sopenharmony_ci   if (pipelineStage & ~top_of_pipe_flags) {
1583bf215546Sopenharmony_ci      /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
1584bf215546Sopenharmony_ci       * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
1585bf215546Sopenharmony_ci       * complete.
1586bf215546Sopenharmony_ci       *
1587bf215546Sopenharmony_ci       * Stalling the CP like this is really unfortunate, but I don't think
1588bf215546Sopenharmony_ci       * there's a better solution that allows all 48 bits of precision
1589bf215546Sopenharmony_ci       * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
1590bf215546Sopenharmony_ci       */
1591bf215546Sopenharmony_ci      tu_cs_emit_wfi(cs);
1592bf215546Sopenharmony_ci   }
1593bf215546Sopenharmony_ci
1594bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1595bf215546Sopenharmony_ci   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER) |
1596bf215546Sopenharmony_ci                  CP_REG_TO_MEM_0_CNT(2) |
1597bf215546Sopenharmony_ci                  CP_REG_TO_MEM_0_64B);
1598bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0));
1599bf215546Sopenharmony_ci
1600bf215546Sopenharmony_ci   /* Only flag availability once the entire renderpass is done, similar to
1601bf215546Sopenharmony_ci    * the begin/end path.
1602bf215546Sopenharmony_ci    */
1603bf215546Sopenharmony_ci   cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;
1604bf215546Sopenharmony_ci
1605bf215546Sopenharmony_ci   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1606bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, query_available_iova(pool, query));
1607bf215546Sopenharmony_ci   tu_cs_emit_qw(cs, 0x1);
1608bf215546Sopenharmony_ci
1609bf215546Sopenharmony_ci   /* From the spec for vkCmdWriteTimestamp:
1610bf215546Sopenharmony_ci    *
1611bf215546Sopenharmony_ci    *    If vkCmdWriteTimestamp is called while executing a render pass
1612bf215546Sopenharmony_ci    *    instance that has multiview enabled, the timestamp uses N consecutive
1613bf215546Sopenharmony_ci    *    query indices in the query pool (starting at query) where N is the
1614bf215546Sopenharmony_ci    *    number of bits set in the view mask of the subpass the command is
1615bf215546Sopenharmony_ci    *    executed in. The resulting query values are determined by an
1616bf215546Sopenharmony_ci    *    implementation-dependent choice of one of the following behaviors:
1617bf215546Sopenharmony_ci    *
1618bf215546Sopenharmony_ci    *    -   The first query is a timestamp value and (if more than one bit is
1619bf215546Sopenharmony_ci    *        set in the view mask) zero is written to the remaining queries.
1620bf215546Sopenharmony_ci    *        If two timestamps are written in the same subpass, the sum of the
1621bf215546Sopenharmony_ci    *        execution time of all views between those commands is the
1622bf215546Sopenharmony_ci    *        difference between the first query written by each command.
1623bf215546Sopenharmony_ci    *
1624bf215546Sopenharmony_ci    *    -   All N queries are timestamp values. If two timestamps are written
1625bf215546Sopenharmony_ci    *        in the same subpass, the sum of the execution time of all views
1626bf215546Sopenharmony_ci    *        between those commands is the sum of the difference between
1627bf215546Sopenharmony_ci    *        corresponding queries written by each command. The difference
1628bf215546Sopenharmony_ci    *        between corresponding queries may be the execution time of a
1629bf215546Sopenharmony_ci    *        single view.
1630bf215546Sopenharmony_ci    *
1631bf215546Sopenharmony_ci    * We execute all views in the same draw call, so we implement the first
1632bf215546Sopenharmony_ci    * option, the same as regular queries.
1633bf215546Sopenharmony_ci    */
1634bf215546Sopenharmony_ci   handle_multiview_queries(cmd, pool, query);
1635bf215546Sopenharmony_ci}
1636bf215546Sopenharmony_ci
1637bf215546Sopenharmony_ciVKAPI_ATTR VkResult VKAPI_CALL
1638bf215546Sopenharmony_citu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
1639bf215546Sopenharmony_ci    VkPhysicalDevice                            physicalDevice,
1640bf215546Sopenharmony_ci    uint32_t                                    queueFamilyIndex,
1641bf215546Sopenharmony_ci    uint32_t*                                   pCounterCount,
1642bf215546Sopenharmony_ci    VkPerformanceCounterKHR*                    pCounters,
1643bf215546Sopenharmony_ci    VkPerformanceCounterDescriptionKHR*         pCounterDescriptions)
1644bf215546Sopenharmony_ci{
1645bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
1646bf215546Sopenharmony_ci
1647bf215546Sopenharmony_ci   uint32_t desc_count = *pCounterCount;
1648bf215546Sopenharmony_ci   uint32_t group_count;
1649bf215546Sopenharmony_ci   const struct fd_perfcntr_group *group =
1650bf215546Sopenharmony_ci         fd_perfcntrs(&phydev->dev_id, &group_count);
1651bf215546Sopenharmony_ci
1652bf215546Sopenharmony_ci   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, out, pCounters, pCounterCount);
1653bf215546Sopenharmony_ci   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, out_desc,
1654bf215546Sopenharmony_ci                          pCounterDescriptions, &desc_count);
1655bf215546Sopenharmony_ci
1656bf215546Sopenharmony_ci   for (int i = 0; i < group_count; i++) {
1657bf215546Sopenharmony_ci      for (int j = 0; j < group[i].num_countables; j++) {
1658bf215546Sopenharmony_ci
1659bf215546Sopenharmony_ci         vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
1660bf215546Sopenharmony_ci            counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_BUFFER_KHR;
1661bf215546Sopenharmony_ci            counter->unit =
1662bf215546Sopenharmony_ci                  fd_perfcntr_type_to_vk_unit[group[i].countables[j].query_type];
1663bf215546Sopenharmony_ci            counter->storage =
1664bf215546Sopenharmony_ci                  fd_perfcntr_type_to_vk_storage[group[i].countables[j].query_type];
1665bf215546Sopenharmony_ci
1666bf215546Sopenharmony_ci            unsigned char sha1_result[20];
1667bf215546Sopenharmony_ci            _mesa_sha1_compute(group[i].countables[j].name,
1668bf215546Sopenharmony_ci                               strlen(group[i].countables[j].name),
1669bf215546Sopenharmony_ci                               sha1_result);
1670bf215546Sopenharmony_ci            memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
1671bf215546Sopenharmony_ci         }
1672bf215546Sopenharmony_ci
1673bf215546Sopenharmony_ci         vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) {
1674bf215546Sopenharmony_ci            desc->flags = 0;
1675bf215546Sopenharmony_ci
1676bf215546Sopenharmony_ci            snprintf(desc->name, sizeof(desc->name),
1677bf215546Sopenharmony_ci                     "%s", group[i].countables[j].name);
1678bf215546Sopenharmony_ci            snprintf(desc->category, sizeof(desc->category), "%s", group[i].name);
1679bf215546Sopenharmony_ci            snprintf(desc->description, sizeof(desc->description),
1680bf215546Sopenharmony_ci                     "%s: %s performance counter",
1681bf215546Sopenharmony_ci                     group[i].name, group[i].countables[j].name);
1682bf215546Sopenharmony_ci         }
1683bf215546Sopenharmony_ci      }
1684bf215546Sopenharmony_ci   }
1685bf215546Sopenharmony_ci
1686bf215546Sopenharmony_ci   return vk_outarray_status(&out);
1687bf215546Sopenharmony_ci}
1688bf215546Sopenharmony_ci
1689bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL
1690bf215546Sopenharmony_citu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
1691bf215546Sopenharmony_ci      VkPhysicalDevice                            physicalDevice,
1692bf215546Sopenharmony_ci      const VkQueryPoolPerformanceCreateInfoKHR*  pPerformanceQueryCreateInfo,
1693bf215546Sopenharmony_ci      uint32_t*                                   pNumPasses)
1694bf215546Sopenharmony_ci{
1695bf215546Sopenharmony_ci   TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
1696bf215546Sopenharmony_ci   uint32_t group_count = 0;
1697bf215546Sopenharmony_ci   uint32_t gid = 0, cid = 0, n_passes;
1698bf215546Sopenharmony_ci   const struct fd_perfcntr_group *group =
1699bf215546Sopenharmony_ci         fd_perfcntrs(&phydev->dev_id, &group_count);
1700bf215546Sopenharmony_ci
1701bf215546Sopenharmony_ci   uint32_t counters_requested[group_count];
1702bf215546Sopenharmony_ci   memset(counters_requested, 0x0, sizeof(counters_requested));
1703bf215546Sopenharmony_ci   *pNumPasses = 1;
1704bf215546Sopenharmony_ci
1705bf215546Sopenharmony_ci   for (unsigned i = 0; i < pPerformanceQueryCreateInfo->counterIndexCount; i++) {
1706bf215546Sopenharmony_ci      perfcntr_index(group, group_count,
1707bf215546Sopenharmony_ci                     pPerformanceQueryCreateInfo->pCounterIndices[i],
1708bf215546Sopenharmony_ci                     &gid, &cid);
1709bf215546Sopenharmony_ci
1710bf215546Sopenharmony_ci      counters_requested[gid]++;
1711bf215546Sopenharmony_ci   }
1712bf215546Sopenharmony_ci
1713bf215546Sopenharmony_ci   for (uint32_t i = 0; i < group_count; i++) {
1714bf215546Sopenharmony_ci      n_passes = DIV_ROUND_UP(counters_requested[i], group[i].num_counters);
1715bf215546Sopenharmony_ci      *pNumPasses = MAX2(*pNumPasses, n_passes);
1716bf215546Sopenharmony_ci   }
1717bf215546Sopenharmony_ci}
1718bf215546Sopenharmony_ci
1719bf215546Sopenharmony_ciVKAPI_ATTR VkResult VKAPI_CALL
1720bf215546Sopenharmony_citu_AcquireProfilingLockKHR(VkDevice device,
1721bf215546Sopenharmony_ci                           const VkAcquireProfilingLockInfoKHR* pInfo)
1722bf215546Sopenharmony_ci{
1723bf215546Sopenharmony_ci   /* TODO. Probably there's something to do for kgsl. */
1724bf215546Sopenharmony_ci   return VK_SUCCESS;
1725bf215546Sopenharmony_ci}
1726bf215546Sopenharmony_ci
1727bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL
1728bf215546Sopenharmony_citu_ReleaseProfilingLockKHR(VkDevice device)
1729bf215546Sopenharmony_ci{
1730bf215546Sopenharmony_ci   /* TODO. Probably there's something to do for kgsl. */
1731bf215546Sopenharmony_ci   return;
1732bf215546Sopenharmony_ci}
1733