/*
 * Copyright 2016 Red Hat Inc.
 * SPDX-License-Identifier: MIT
 *
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 */
8
9#include "tu_query.h"
10
11#include <fcntl.h>
12
13#include "nir/nir_builder.h"
14#include "util/os_time.h"
15
16#include "vk_util.h"
17
18#include "tu_cmd_buffer.h"
19#include "tu_cs.h"
20#include "tu_device.h"
21
22#define NSEC_PER_SEC 1000000000ull
23#define WAIT_TIMEOUT 5
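/* Number of RBBM_PRIMCTR counters captured for pipeline statistics queries.
 * Assuming the usual layout where each counter is a 64-bit value exposed as
 * a LO/HI register pair, consecutive counters are two register offsets
 * apart, so the expression below counts the inclusive range
 * PRIMCTR_0..PRIMCTR_10.
 */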
24#define STAT_COUNT ((REG_A6XX_RBBM_PRIMCTR_10_LO - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2 + 1)
25
26struct PACKED query_slot {
27   uint64_t available;
28};
29
30struct PACKED occlusion_slot_value {
   /* The sample counters appear to be written 16-byte aligned, even though
    * this query only needs an 8-byte slot. */
33   uint64_t value;
34   uint64_t _padding;
35};
36
37struct PACKED occlusion_query_slot {
38   struct query_slot common;
39   uint64_t result;
40
41   struct occlusion_slot_value begin;
42   struct occlusion_slot_value end;
43};
44
45struct PACKED timestamp_query_slot {
46   struct query_slot common;
47   uint64_t result;
48};
49
50struct PACKED primitive_slot_value {
51   uint64_t values[2];
52};
53
54struct PACKED pipeline_stat_query_slot {
55   struct query_slot common;
56   uint64_t results[STAT_COUNT];
57
58   uint64_t begin[STAT_COUNT];
59   uint64_t end[STAT_COUNT];
60};
61
62struct PACKED primitive_query_slot {
63   struct query_slot common;
   /* The result of transform feedback queries is two integer values:
    *   results[0] is the count of primitives written,
    *   results[1] is the count of primitives generated.
    * The begin/end counters below hold one such pair of values for each of
    * the four streams.
    */
69   uint64_t results[2];
70
71   /* Primitive counters also need to be 16-byte aligned. */
72   uint64_t _padding;
73
74   struct primitive_slot_value begin[4];
75   struct primitive_slot_value end[4];
76};
77
78struct PACKED perfcntr_query_slot {
79   uint64_t result;
80   uint64_t begin;
81   uint64_t end;
82};
83
84struct PACKED perf_query_slot {
85   struct query_slot common;
86   struct perfcntr_query_slot perfcntr;
87};
88
89struct PACKED primitives_generated_query_slot {
90   struct query_slot common;
91   uint64_t result;
92   uint64_t begin;
93   uint64_t end;
94};
95
96/* Returns the IOVA of a given uint64_t field in a given slot of a query
97 * pool. */
98#define query_iova(type, pool, query, field)                         \
99   pool->bo->iova + pool->stride * (query) + offsetof(type, field)
100
101#define occlusion_query_iova(pool, query, field)                     \
102   query_iova(struct occlusion_query_slot, pool, query, field)
103
104#define pipeline_stat_query_iova(pool, query, field)                 \
105   pool->bo->iova + pool->stride * (query) +                            \
106   offsetof(struct pipeline_stat_query_slot, field)
107
108#define primitive_query_iova(pool, query, field, i)                  \
109   query_iova(struct primitive_query_slot, pool, query, field) +     \
110   offsetof(struct primitive_slot_value, values[i])
111
112#define perf_query_iova(pool, query, field, i)                          \
113   pool->bo->iova + pool->stride * (query) +                             \
114   sizeof(struct query_slot) +                                   \
115   sizeof(struct perfcntr_query_slot) * (i) +                          \
116   offsetof(struct perfcntr_query_slot, field)
117
118#define primitives_generated_query_iova(pool, query, field)               \
119   query_iova(struct primitives_generated_query_slot, pool, query, field)
120
121#define query_available_iova(pool, query)                            \
122   query_iova(struct query_slot, pool, query, available)
123
124#define query_result_iova(pool, query, type, i)                            \
125   pool->bo->iova + pool->stride * (query) +                          \
126   sizeof(struct query_slot) + sizeof(type) * (i)
127
128#define query_result_addr(pool, query, type, i)                            \
129   pool->bo->map + pool->stride * (query) +                             \
130   sizeof(struct query_slot) + sizeof(type) * (i)
131
132#define query_is_available(slot) slot->available
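/* Illustrative example of the helpers above: for an occlusion query, the
 * begin counter of query q lives at
 *
 *    occlusion_query_iova(pool, q, begin)
 *       == pool->bo->iova + pool->stride * q +
 *          offsetof(struct occlusion_query_slot, begin)
 *
 * i.e. each query occupies one slot of pool->stride bytes in the pool BO,
 * and the field offset selects a value within that slot.
 */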
133
134static const VkPerformanceCounterUnitKHR
135fd_perfcntr_type_to_vk_unit[] = {
136   [FD_PERFCNTR_TYPE_UINT]         = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
137   [FD_PERFCNTR_TYPE_UINT64]       = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
138   [FD_PERFCNTR_TYPE_FLOAT]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
139   [FD_PERFCNTR_TYPE_PERCENTAGE]   = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR,
140   [FD_PERFCNTR_TYPE_BYTES]        = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR,
   /* TODO: could be UNIT_NANOSECONDS_KHR with logic to convert */
142   [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
143   [FD_PERFCNTR_TYPE_HZ]           = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR,
144   [FD_PERFCNTR_TYPE_DBM]          = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
145   [FD_PERFCNTR_TYPE_TEMPERATURE]  = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
146   [FD_PERFCNTR_TYPE_VOLTS]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
147   [FD_PERFCNTR_TYPE_AMPS]         = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
148   [FD_PERFCNTR_TYPE_WATTS]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
149};
150
/* TODO: This mapping comes from the freedreno implementation, where only
 * UINT64 is used. It should be confirmed against the blob Vulkan driver once
 * that starts supporting perf queries.
 */
155static const VkPerformanceCounterStorageKHR
156fd_perfcntr_type_to_vk_storage[] = {
157   [FD_PERFCNTR_TYPE_UINT]         = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR,
158   [FD_PERFCNTR_TYPE_UINT64]       = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
159   [FD_PERFCNTR_TYPE_FLOAT]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
160   [FD_PERFCNTR_TYPE_PERCENTAGE]   = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
161   [FD_PERFCNTR_TYPE_BYTES]        = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
162   [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
163   [FD_PERFCNTR_TYPE_HZ]           = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
164   [FD_PERFCNTR_TYPE_DBM]          = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
165   [FD_PERFCNTR_TYPE_TEMPERATURE]  = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
166   [FD_PERFCNTR_TYPE_VOLTS]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
167   [FD_PERFCNTR_TYPE_AMPS]         = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
168   [FD_PERFCNTR_TYPE_WATTS]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
169};
170
171/*
172 * Returns a pointer to a given slot in a query pool.
173 */
174static void* slot_address(struct tu_query_pool *pool, uint32_t query)
175{
176   return (char*)pool->bo->map + query * pool->stride;
177}
178
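/* Maps a flat counter index, as provided by the application, to a group id
 * and a countable id within that group. For example, with two groups
 * exposing 4 and 6 countables respectively, index 5 maps to *gid = 1,
 * *cid = 1.
 */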
179static void
180perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
181               uint32_t index, uint32_t *gid, uint32_t *cid)
182
183{
184   uint32_t i;
185
186   for (i = 0; i < group_count; i++) {
187      if (group[i].num_countables > index) {
188         *gid = i;
189         *cid = index;
190         break;
191      }
192      index -= group[i].num_countables;
193   }
194
195   assert(i < group_count);
196}
197
198static int
199compare_perfcntr_pass(const void *a, const void *b)
200{
201   return ((struct tu_perf_query_data *)a)->pass -
202          ((struct tu_perf_query_data *)b)->pass;
203}
204
205VKAPI_ATTR VkResult VKAPI_CALL
206tu_CreateQueryPool(VkDevice _device,
207                   const VkQueryPoolCreateInfo *pCreateInfo,
208                   const VkAllocationCallbacks *pAllocator,
209                   VkQueryPool *pQueryPool)
210{
211   TU_FROM_HANDLE(tu_device, device, _device);
212   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
213   assert(pCreateInfo->queryCount > 0);
214
215   uint32_t pool_size, slot_size;
216   const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
217
218   pool_size = sizeof(struct tu_query_pool);
219
220   switch (pCreateInfo->queryType) {
221   case VK_QUERY_TYPE_OCCLUSION:
222      slot_size = sizeof(struct occlusion_query_slot);
223      break;
224   case VK_QUERY_TYPE_TIMESTAMP:
225      slot_size = sizeof(struct timestamp_query_slot);
226      break;
227   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
228      slot_size = sizeof(struct primitive_query_slot);
229      break;
230   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
231      slot_size = sizeof(struct primitives_generated_query_slot);
232      break;
233   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
234      perf_query_info =
235            vk_find_struct_const(pCreateInfo->pNext,
236                                 QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
237      assert(perf_query_info);
238
239      slot_size = sizeof(struct perf_query_slot) +
240                  sizeof(struct perfcntr_query_slot) *
241                  (perf_query_info->counterIndexCount - 1);
242
      /* Size of the array pool->perf_query_data */
244      pool_size += sizeof(struct tu_perf_query_data) *
245                   perf_query_info->counterIndexCount;
246      break;
247   }
248   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
249      slot_size = sizeof(struct pipeline_stat_query_slot);
250      break;
251   default:
252      unreachable("Invalid query type");
253   }
254
255   struct tu_query_pool *pool =
256         vk_object_alloc(&device->vk, pAllocator, pool_size,
257                         VK_OBJECT_TYPE_QUERY_POOL);
258   if (!pool)
259      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
260
261   if (pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
262      pool->perf_group = fd_perfcntrs(&device->physical_device->dev_id,
263                                      &pool->perf_group_count);
264
265      pool->counter_index_count = perf_query_info->counterIndexCount;
266
      /* Build the data for every requested perf counter, so that the correct
       * group id, countable id, counter register and pass index can be
       * derived from just the counter index the application provides at each
       * command submission.
       *
       * Since this data is later sorted by pass index, keep the original
       * application indices and store the perfcntr results according to
       * them, so apps get correct results for their own indices.
       */
275      uint32_t regs[pool->perf_group_count], pass[pool->perf_group_count];
276      memset(regs, 0x00, pool->perf_group_count * sizeof(regs[0]));
277      memset(pass, 0x00, pool->perf_group_count * sizeof(pass[0]));
278
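      /* Example of the assignment below: if a group exposes
       * num_counters == 4 and six of its countables are requested, the first
       * four get cntr_reg 0..3 in pass 0 and the remaining two get
       * cntr_reg 0..1 in pass 1.
       */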
279      for (uint32_t i = 0; i < pool->counter_index_count; i++) {
280         uint32_t gid = 0, cid = 0;
281
282         perfcntr_index(pool->perf_group, pool->perf_group_count,
283                        perf_query_info->pCounterIndices[i], &gid, &cid);
284
285         pool->perf_query_data[i].gid = gid;
286         pool->perf_query_data[i].cid = cid;
287         pool->perf_query_data[i].app_idx = i;
288
         /* When a group's counter registers are exhausted (num_counters),
          * wrap around and assign the counter to the next pass.
          */
292         if (regs[gid] < pool->perf_group[gid].num_counters) {
293            pool->perf_query_data[i].cntr_reg = regs[gid]++;
294            pool->perf_query_data[i].pass = pass[gid];
295         } else {
296            pool->perf_query_data[i].pass = ++pass[gid];
297            pool->perf_query_data[i].cntr_reg = regs[gid] = 0;
298            regs[gid]++;
299         }
300      }
301
      /* Sort by pass index so that a command stream can easily be prepared
       * in ascending order of pass index.
       */
305      qsort(pool->perf_query_data, pool->counter_index_count,
306            sizeof(pool->perf_query_data[0]),
307            compare_perfcntr_pass);
308   }
309
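   /* The backing BO holds one slot of slot_size bytes per query; slot_size
    * doubles as the pool stride used by the *_iova helpers above.
    */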
310   VkResult result = tu_bo_init_new(device, &pool->bo,
311         pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS);
312   if (result != VK_SUCCESS) {
313      vk_object_free(&device->vk, pAllocator, pool);
314      return result;
315   }
316
317   result = tu_bo_map(device, pool->bo);
318   if (result != VK_SUCCESS) {
319      tu_bo_finish(device, pool->bo);
320      vk_object_free(&device->vk, pAllocator, pool);
321      return result;
322   }
323
324   /* Initialize all query statuses to unavailable */
325   memset(pool->bo->map, 0, pool->bo->size);
326
327   pool->type = pCreateInfo->queryType;
328   pool->stride = slot_size;
329   pool->size = pCreateInfo->queryCount;
330   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
331   *pQueryPool = tu_query_pool_to_handle(pool);
332
333   return VK_SUCCESS;
334}
335
336VKAPI_ATTR void VKAPI_CALL
337tu_DestroyQueryPool(VkDevice _device,
338                    VkQueryPool _pool,
339                    const VkAllocationCallbacks *pAllocator)
340{
341   TU_FROM_HANDLE(tu_device, device, _device);
342   TU_FROM_HANDLE(tu_query_pool, pool, _pool);
343
344   if (!pool)
345      return;
346
347   tu_bo_finish(device, pool->bo);
348   vk_object_free(&device->vk, pAllocator, pool);
349}
350
351static uint32_t
352get_result_count(struct tu_query_pool *pool)
353{
354   switch (pool->type) {
   /* Occlusion and timestamp queries write one integer value */
356   case VK_QUERY_TYPE_OCCLUSION:
357   case VK_QUERY_TYPE_TIMESTAMP:
358   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
359      return 1;
360   /* Transform feedback queries write two integer values */
361   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
362      return 2;
363   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
364      return util_bitcount(pool->pipeline_statistics);
365   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
366      return pool->counter_index_count;
367   default:
368      assert(!"Invalid query type");
369      return 0;
370   }
371}
372
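/* Returns the index into the per-query begin/end/results arrays (i.e. which
 * RBBM_PRIMCTR counter pair backs the statistic) for the lowest set bit in
 * *statistics, and consumes that bit. Note that IA vertices and VS
 * invocations share counter 0, and that index 3 is not used by any of these
 * statistics.
 */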
373static uint32_t
374statistics_index(uint32_t *statistics)
375{
376   uint32_t stat;
377   stat = u_bit_scan(statistics);
378
379   switch (1 << stat) {
380   case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT:
381   case VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT:
382      return 0;
383   case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT:
384      return 1;
385   case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT:
386      return 2;
387   case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT:
388      return 4;
389   case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT:
390      return 5;
391   case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT:
392      return 6;
393   case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT:
394      return 7;
395   case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT:
396      return 8;
397   case VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT:
398      return 9;
399   case VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT:
400      return 10;
401   default:
402      return 0;
403   }
404}
405
406static bool
407is_pipeline_query_with_vertex_stage(uint32_t pipeline_statistics)
408{
409   return pipeline_statistics &
410          (VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT |
411           VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT |
412           VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT |
413           VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT |
414           VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
415           VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT |
416           VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT |
417           VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT |
418           VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT);
419}
420
421static bool
422is_pipeline_query_with_fragment_stage(uint32_t pipeline_statistics)
423{
424   return pipeline_statistics &
425          VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT;
426}
427
428static bool
429is_pipeline_query_with_compute_stage(uint32_t pipeline_statistics)
430{
431   return pipeline_statistics &
432          VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
433}
434
/* Wait on the availability status of a query up until a timeout. */
436static VkResult
437wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
438                   uint32_t query)
439{
440   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
441    * scheduler friendly way instead of busy polling once the patch has landed
442    * upstream. */
443   struct query_slot *slot = slot_address(pool, query);
444   uint64_t abs_timeout = os_time_get_absolute_timeout(
445         WAIT_TIMEOUT * NSEC_PER_SEC);
   while (os_time_get_nano() < abs_timeout) {
447      if (query_is_available(slot))
448         return VK_SUCCESS;
449   }
450   return vk_error(device, VK_TIMEOUT);
451}
452
453/* Writes a query value to a buffer from the CPU. */
454static void
455write_query_value_cpu(char* base,
456                      uint32_t offset,
457                      uint64_t value,
458                      VkQueryResultFlags flags)
459{
460   if (flags & VK_QUERY_RESULT_64_BIT) {
461      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
462   } else {
463      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
464   }
465}
466
467static VkResult
468get_query_pool_results(struct tu_device *device,
469                       struct tu_query_pool *pool,
470                       uint32_t firstQuery,
471                       uint32_t queryCount,
472                       size_t dataSize,
473                       void *pData,
474                       VkDeviceSize stride,
475                       VkQueryResultFlags flags)
476{
477   assert(dataSize >= stride * queryCount);
478
479   char *result_base = pData;
480   VkResult result = VK_SUCCESS;
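
   /* Each query writes one row of `stride` bytes into pData: result_count
    * values, each 32 or 64 bits wide depending on the flags, optionally
    * followed by the availability value when
    * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
    */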
481   for (uint32_t i = 0; i < queryCount; i++) {
482      uint32_t query = firstQuery + i;
483      struct query_slot *slot = slot_address(pool, query);
484      bool available = query_is_available(slot);
485      uint32_t result_count = get_result_count(pool);
486      uint32_t statistics = pool->pipeline_statistics;
487
488      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
489         VkResult wait_result = wait_for_available(device, pool, query);
490         if (wait_result != VK_SUCCESS)
491            return wait_result;
492         available = true;
493      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
494         /* From the Vulkan 1.1.130 spec:
495          *
496          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
497          *    both not set then no result values are written to pData for
498          *    queries that are in the unavailable state at the time of the
499          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
500          *    availability state is still written to pData for those queries
501          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
502          */
503         result = VK_NOT_READY;
504         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
505            result_base += stride;
506            continue;
507         }
508      }
509
510      for (uint32_t k = 0; k < result_count; k++) {
511         if (available) {
512            uint64_t *result;
513
514            if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
515               uint32_t stat_idx = statistics_index(&statistics);
516               result = query_result_addr(pool, query, uint64_t, stat_idx);
517            } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
518               result = query_result_addr(pool, query, struct perfcntr_query_slot, k);
519            } else {
520               result = query_result_addr(pool, query, uint64_t, k);
521            }
522
523            write_query_value_cpu(result_base, k, *result, flags);
524         } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
525             /* From the Vulkan 1.1.130 spec:
526              *
527              *   If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
528              *   is not set, and the query’s status is unavailable, an
529              *   intermediate result value between zero and the final result
530              *   value is written to pData for that query.
531              *
532              * Just return 0 here for simplicity since it's a valid result.
533              */
534            write_query_value_cpu(result_base, k, 0, flags);
535      }
536
537      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
538         /* From the Vulkan 1.1.130 spec:
539          *
540          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
541          *    integer value written for each query is non-zero if the query’s
542          *    status was available or zero if the status was unavailable.
543          */
544         write_query_value_cpu(result_base, result_count, available, flags);
545
546      result_base += stride;
547   }
548   return result;
549}
550
551VKAPI_ATTR VkResult VKAPI_CALL
552tu_GetQueryPoolResults(VkDevice _device,
553                       VkQueryPool queryPool,
554                       uint32_t firstQuery,
555                       uint32_t queryCount,
556                       size_t dataSize,
557                       void *pData,
558                       VkDeviceSize stride,
559                       VkQueryResultFlags flags)
560{
561   TU_FROM_HANDLE(tu_device, device, _device);
562   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
563   assert(firstQuery + queryCount <= pool->size);
564
565   if (vk_device_is_lost(&device->vk))
566      return VK_ERROR_DEVICE_LOST;
567
568   switch (pool->type) {
569   case VK_QUERY_TYPE_OCCLUSION:
570   case VK_QUERY_TYPE_TIMESTAMP:
571   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
572   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
573   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
574   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
575      return get_query_pool_results(device, pool, firstQuery, queryCount,
576                                    dataSize, pData, stride, flags);
577   default:
578      assert(!"Invalid query type");
579   }
580   return VK_SUCCESS;
581}
582
/* Copies a query value from one buffer to another, on the GPU. */
584static void
585copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
586                     struct tu_cs *cs,
587                     uint64_t src_iova,
588                     uint64_t base_write_iova,
589                     uint32_t offset,
                     VkQueryResultFlags flags)
{
591   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
592         sizeof(uint64_t) : sizeof(uint32_t);
593   uint64_t write_iova = base_write_iova + (offset * element_size);
594
595   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
596   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
597         CP_MEM_TO_MEM_0_DOUBLE : 0;
598   tu_cs_emit(cs, mem_to_mem_flags);
599   tu_cs_emit_qw(cs, write_iova);
600   tu_cs_emit_qw(cs, src_iova);
601}
602
603static void
604emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
605                             struct tu_cs *cs,
606                             struct tu_query_pool *pool,
607                             uint32_t firstQuery,
608                             uint32_t queryCount,
609                             struct tu_buffer *buffer,
610                             VkDeviceSize dstOffset,
611                             VkDeviceSize stride,
612                             VkQueryResultFlags flags)
613{
614   /* From the Vulkan 1.1.130 spec:
615    *
616    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
617    *    uses of vkCmdResetQueryPool in the same queue, without any additional
618    *    synchronization.
619    *
620    * To ensure that previous writes to the available bit are coherent, first
621    * wait for all writes to complete.
622    */
623   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
624
625   for (uint32_t i = 0; i < queryCount; i++) {
626      uint32_t query = firstQuery + i;
627      uint64_t available_iova = query_available_iova(pool, query);
628      uint64_t buffer_iova = buffer->iova + dstOffset + i * stride;
629      uint32_t result_count = get_result_count(pool);
630      uint32_t statistics = pool->pipeline_statistics;
631
632      /* Wait for the available bit to be set if executed with the
633       * VK_QUERY_RESULT_WAIT_BIT flag. */
634      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
635         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
636         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
637                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
638         tu_cs_emit_qw(cs, available_iova);
639         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
640         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
641         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
642      }
643
644      for (uint32_t k = 0; k < result_count; k++) {
645         uint64_t result_iova;
646
647         if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
648            uint32_t stat_idx = statistics_index(&statistics);
649            result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
650         } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
651            result_iova = query_result_iova(pool, query,
652                                            struct perfcntr_query_slot, k);
653         } else {
654            result_iova = query_result_iova(pool, query, uint64_t, k);
655         }
656
657         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
658            /* Unconditionally copying the bo->result into the buffer here is
659             * valid because we only set bo->result on vkCmdEndQuery. Thus, even
660             * if the query is unavailable, this will copy the correct partial
661             * value of 0.
662             */
663            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
664                                 k /* offset */, flags);
665         } else {
666            /* Conditionally copy bo->result into the buffer based on whether the
667             * query is available.
668             *
669             * NOTE: For the conditional packets to be executed, CP_COND_EXEC
670             * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
671             * that 0 < available < 2, aka available == 1.
672             */
673            tu_cs_reserve(cs, 7 + 6);
674            tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
675            tu_cs_emit_qw(cs, available_iova);
676            tu_cs_emit_qw(cs, available_iova);
677            tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
678            tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */
679
680            /* Start of conditional execution */
681            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
682                              k /* offset */, flags);
683            /* End of conditional execution */
684         }
685      }
686
687      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
688         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
689                              result_count /* offset */, flags);
690      }
691   }
692}
693
694VKAPI_ATTR void VKAPI_CALL
695tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
696                           VkQueryPool queryPool,
697                           uint32_t firstQuery,
698                           uint32_t queryCount,
699                           VkBuffer dstBuffer,
700                           VkDeviceSize dstOffset,
701                           VkDeviceSize stride,
702                           VkQueryResultFlags flags)
703{
704   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
705   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
706   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
707   struct tu_cs *cs = &cmdbuf->cs;
708   assert(firstQuery + queryCount <= pool->size);
709
710   switch (pool->type) {
711   case VK_QUERY_TYPE_OCCLUSION:
712   case VK_QUERY_TYPE_TIMESTAMP:
713   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
714   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
715   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
716      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
717               queryCount, buffer, dstOffset, stride, flags);
718   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
719      unreachable("allowCommandBufferQueryCopies is false");
720   default:
721      assert(!"Invalid query type");
722   }
723}
724
725static void
726emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
727                      struct tu_query_pool *pool,
728                      uint32_t firstQuery,
729                      uint32_t queryCount)
730{
731   struct tu_cs *cs = &cmdbuf->cs;
732
733   for (uint32_t i = 0; i < queryCount; i++) {
734      uint32_t query = firstQuery + i;
735      uint32_t statistics = pool->pipeline_statistics;
736
737      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
738      tu_cs_emit_qw(cs, query_available_iova(pool, query));
739      tu_cs_emit_qw(cs, 0x0);
740
741      for (uint32_t k = 0; k < get_result_count(pool); k++) {
742         uint64_t result_iova;
743
744         if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
745            uint32_t stat_idx = statistics_index(&statistics);
746            result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
747         } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
748            result_iova = query_result_iova(pool, query,
749                                            struct perfcntr_query_slot, k);
750         } else {
751            result_iova = query_result_iova(pool, query, uint64_t, k);
752         }
753
754         tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
755         tu_cs_emit_qw(cs, result_iova);
756         tu_cs_emit_qw(cs, 0x0);
757      }
758   }
}
761
762VKAPI_ATTR void VKAPI_CALL
763tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
764                     VkQueryPool queryPool,
765                     uint32_t firstQuery,
766                     uint32_t queryCount)
767{
768   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
769   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
770
771   switch (pool->type) {
772   case VK_QUERY_TYPE_TIMESTAMP:
773   case VK_QUERY_TYPE_OCCLUSION:
774   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
775   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
776   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
777   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
778      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
779      break;
780   default:
781      assert(!"Invalid query type");
782   }
783}
784
785VKAPI_ATTR void VKAPI_CALL
786tu_ResetQueryPool(VkDevice device,
787                  VkQueryPool queryPool,
788                  uint32_t firstQuery,
789                  uint32_t queryCount)
790{
791   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
792
793   for (uint32_t i = 0; i < queryCount; i++) {
794      struct query_slot *slot = slot_address(pool, i + firstQuery);
795      slot->available = 0;
796
797      for (uint32_t k = 0; k < get_result_count(pool); k++) {
798         uint64_t *res;
799
800         if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
801            res = query_result_addr(pool, i + firstQuery,
802                                    struct perfcntr_query_slot, k);
803         } else {
804            res = query_result_addr(pool, i + firstQuery, uint64_t, k);
805         }
806
807         *res = 0;
808      }
809   }
810}
811
812static void
813emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
814                           struct tu_query_pool *pool,
815                           uint32_t query)
816{
817   /* From the Vulkan 1.1.130 spec:
818    *
819    *    A query must begin and end inside the same subpass of a render pass
820    *    instance, or must both begin and end outside of a render pass
821    *    instance.
822    *
823    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
824    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
825    * query begins/ends inside the same subpass of a render pass, we need to
826    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
827    * is then run on every tile during render, so we just need to accumulate
828    * sample counts in slot->result to compute the query result.
829    */
830   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
831
832   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
833
834   tu_cs_emit_regs(cs,
835                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
836
837   tu_cs_emit_regs(cs,
838                   A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova));
839
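   /* The ZPASS_DONE event below makes the HW write the current sample count
    * to RB_SAMPLE_COUNT_ADDR, which points at slot->begin here.
    */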
840   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
841   tu_cs_emit(cs, ZPASS_DONE);
842}
843
844static void
845emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
846                      struct tu_query_pool *pool,
847                      uint32_t query)
848{
849   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
850   uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin);
851
852   if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) {
853      bool need_cond_exec = cmdbuf->state.pass && cmdbuf->state.prim_counters_running;
854      cmdbuf->state.prim_counters_running++;
855
      /* Don't start the primitive counters if they are supposed to be
       * stopped for an outer VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query.
       */
859      if (need_cond_exec) {
860         tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
861                        CP_COND_REG_EXEC_0_SYSMEM |
862                        CP_COND_REG_EXEC_0_BINNING);
863      }
864
865      tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS);
866
867      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
868      tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
869      tu_cs_emit(cs, 0);
870
871      if (need_cond_exec) {
872         tu_cond_exec_end(cs);
873      }
874   }
875
876   if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) {
877      tu6_emit_event_write(cmdbuf, cs, START_FRAGMENT_CTRS);
878   }
879
880   if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) {
881      tu6_emit_event_write(cmdbuf, cs, START_COMPUTE_CTRS);
882   }
883
884   tu_cs_emit_wfi(cs);
885
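   /* Snapshot all RBBM_PRIMCTR counters (STAT_COUNT 64-bit LO/HI pairs) into
    * slot->begin; the matching end snapshot minus this one yields the
    * statistics.
    */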
886   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
887   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
888                  CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
889                  CP_REG_TO_MEM_0_64B);
890   tu_cs_emit_qw(cs, begin_iova);
891}
892
893static void
894emit_perfcntrs_pass_start(struct tu_cs *cs, uint32_t pass)
895{
896   tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
897   tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
898                        REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG)) |
899                  A6XX_CP_REG_TEST_0_BIT(pass) |
900                  A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
901   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
902}
903
904static void
905emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
906                           struct tu_query_pool *pool,
907                           uint32_t query)
908{
909   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
910   uint32_t last_pass = ~0;
911
912   if (cmdbuf->state.pass) {
913      cmdbuf->state.rp.draw_cs_writes_to_cond_pred = true;
914   }
915
916   /* Querying perf counters happens in these steps:
917    *
918    *  0) There's a scratch reg to set a pass index for perf counters query.
919    *     Prepare cmd streams to set each pass index to the reg at device
920    *     creation time. See tu_CreateDevice in tu_device.c
921    *  1) Emit command streams to read all requested perf counters at all
922    *     passes in begin/end query with CP_REG_TEST/CP_COND_REG_EXEC, which
923    *     reads the scratch reg where pass index is set.
924    *     See emit_perfcntrs_pass_start.
925    *  2) Pick the right cs setting proper pass index to the reg and prepend
926    *     it to the command buffer at each submit time.
927    *     See tu_QueueSubmit in tu_drm.c
    *  3) If the bit for the pass index is set in the reg, the command
    *     stream below CP_COND_REG_EXEC is executed.
930    */
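
   /* In Vulkan terms this means the application replays the command buffer
    * once per pass, supplying the pass index via
    * VkPerformanceQuerySubmitInfoKHR::counterPassIndex; only the
    * CP_COND_REG_EXEC blocks whose pass bit matches the scratch register
    * value actually execute (illustrative summary of the steps above).
    */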
931
932   tu_cs_emit_wfi(cs);
933
934   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
935      struct tu_perf_query_data *data = &pool->perf_query_data[i];
936
937      if (last_pass != data->pass) {
938         last_pass = data->pass;
939
940         if (data->pass != 0)
941            tu_cond_exec_end(cs);
942         emit_perfcntrs_pass_start(cs, data->pass);
943      }
944
945      const struct fd_perfcntr_counter *counter =
946            &pool->perf_group[data->gid].counters[data->cntr_reg];
947      const struct fd_perfcntr_countable *countable =
948            &pool->perf_group[data->gid].countables[data->cid];
949
950      tu_cs_emit_pkt4(cs, counter->select_reg, 1);
951      tu_cs_emit(cs, countable->selector);
952   }
953   tu_cond_exec_end(cs);
954
955   last_pass = ~0;
956   tu_cs_emit_wfi(cs);
957
958   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
959      struct tu_perf_query_data *data = &pool->perf_query_data[i];
960
961      if (last_pass != data->pass) {
962         last_pass = data->pass;
963
964         if (data->pass != 0)
965            tu_cond_exec_end(cs);
966         emit_perfcntrs_pass_start(cs, data->pass);
967      }
968
969      const struct fd_perfcntr_counter *counter =
970            &pool->perf_group[data->gid].counters[data->cntr_reg];
971
972      uint64_t begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
973
974      tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
975      tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
976                     CP_REG_TO_MEM_0_64B);
977      tu_cs_emit_qw(cs, begin_iova);
978   }
979   tu_cond_exec_end(cs);
980}
981
982static void
983emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
984                     struct tu_query_pool *pool,
985                     uint32_t query,
986                     uint32_t stream_id)
987{
988   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
989   uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);
990
991   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = begin_iova));
992   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
993}
994
995static void
996emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
997                                struct tu_query_pool *pool,
998                                uint32_t query)
999{
1000   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1001   uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
1002
1003   if (cmdbuf->state.pass) {
1004      cmdbuf->state.rp.has_prim_generated_query_in_rp = true;
1005   } else {
1006      cmdbuf->state.prim_generated_query_running_before_rp = true;
1007   }
1008
1009   cmdbuf->state.prim_counters_running++;
1010
1011   if (cmdbuf->state.pass) {
      /* Primitives that passed all tests are still counted in each tile,
       * even with HW binning beforehand, which would count them more than
       * once. Don't permit that.
       */
1015      tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1016                           CP_COND_REG_EXEC_0_SYSMEM |
1017                           CP_COND_REG_EXEC_0_BINNING);
1018   }
1019
1020   tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS);
1021
1022   tu_cs_emit_wfi(cs);
1023
1024   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1025   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
1026                  CP_REG_TO_MEM_0_CNT(2) |
1027                  CP_REG_TO_MEM_0_64B);
1028   tu_cs_emit_qw(cs, begin_iova);
1029
1030   if (cmdbuf->state.pass) {
1031      tu_cond_exec_end(cs);
1032   }
1033}
1034
1035VKAPI_ATTR void VKAPI_CALL
1036tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
1037                 VkQueryPool queryPool,
1038                 uint32_t query,
1039                 VkQueryControlFlags flags)
1040{
1041   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1042   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1043   assert(query < pool->size);
1044
1045   switch (pool->type) {
1046   case VK_QUERY_TYPE_OCCLUSION:
1047      /* In freedreno, there is no implementation difference between
1048       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
1049       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
1050       */
1051      emit_begin_occlusion_query(cmdbuf, pool, query);
1052      break;
1053   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1054      emit_begin_xfb_query(cmdbuf, pool, query, 0);
1055      break;
1056   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1057      emit_begin_prim_generated_query(cmdbuf, pool, query);
1058      break;
1059   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1060      emit_begin_perf_query(cmdbuf, pool, query);
1061      break;
1062   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1063      emit_begin_stat_query(cmdbuf, pool, query);
1064      break;
1065   case VK_QUERY_TYPE_TIMESTAMP:
1066      unreachable("Unimplemented query type");
1067   default:
1068      assert(!"Invalid query type");
1069   }
1070}
1071
1072VKAPI_ATTR void VKAPI_CALL
1073tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
1074                           VkQueryPool queryPool,
1075                           uint32_t query,
1076                           VkQueryControlFlags flags,
1077                           uint32_t index)
1078{
1079   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1080   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1081   assert(query < pool->size);
1082
1083   switch (pool->type) {
1084   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1085      emit_begin_xfb_query(cmdbuf, pool, query, index);
1086      break;
1087   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1088      emit_begin_prim_generated_query(cmdbuf, pool, query);
1089      break;
1090   default:
1091      assert(!"Invalid query type");
1092   }
1093}
1094
1095static void
1096emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
1097                         struct tu_query_pool *pool,
1098                         uint32_t query)
1099{
1100   /* Ending an occlusion query happens in a few steps:
1101    *    1) Set the slot->end to UINT64_MAX.
1102    *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
1103    *       write the current sample count value into slot->end.
1104    *    3) Since (2) is asynchronous, wait until slot->end is not equal to
1105    *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
1106    *    4) Accumulate the results of the query (slot->end - slot->begin) into
1107    *       slot->result.
1108    *    5) If vkCmdEndQuery is *not* called from within the scope of a render
1109    *       pass, set the slot's available bit since the query is now done.
1110    *    6) If vkCmdEndQuery *is* called from within the scope of a render
1111    *       pass, we cannot mark as available yet since the commands in
1112    *       draw_cs are not run until vkCmdEndRenderPass.
1113    */
1114   const struct tu_render_pass *pass = cmdbuf->state.pass;
1115   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1116
1117   uint64_t available_iova = query_available_iova(pool, query);
1118   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
1119   uint64_t end_iova = occlusion_query_iova(pool, query, end);
1120   uint64_t result_iova = query_result_iova(pool, query, uint64_t, 0);
1121   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1122   tu_cs_emit_qw(cs, end_iova);
1123   tu_cs_emit_qw(cs, 0xffffffffffffffffull);
1124
1125   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1126
1127   tu_cs_emit_regs(cs,
1128                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
1129
1130   tu_cs_emit_regs(cs,
1131                   A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova));
1132
1133   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1134   tu_cs_emit(cs, ZPASS_DONE);
1135
1136   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
1137   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
1138                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
1139   tu_cs_emit_qw(cs, end_iova);
1140   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
1141   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
1142   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
1143
1144   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
1145   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1146   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
1147   tu_cs_emit_qw(cs, result_iova);
1148   tu_cs_emit_qw(cs, result_iova);
1149   tu_cs_emit_qw(cs, end_iova);
1150   tu_cs_emit_qw(cs, begin_iova);
1151
1152   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1153
1154   if (pass)
1155      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
1157       * because the only commands that use the available bit are
1158       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
1159       * cannot be invoked from inside a render pass scope.
1160       */
1161      cs = &cmdbuf->draw_epilogue_cs;
1162
1163   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1164   tu_cs_emit_qw(cs, available_iova);
1165   tu_cs_emit_qw(cs, 0x1);
1166}
1167
/* PRIMITIVE_CTRS is used for two distinct queries:
 * - VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT
 * - VK_QUERY_TYPE_PIPELINE_STATISTICS
 * If one is nested inside the other, STOP_PRIMITIVE_CTRS should be emitted
 * only for the outer query.
 *
 * Also, a pipeline statistics query may run outside of a renderpass while a
 * primitives generated query runs inside a secondary command buffer; for
 * that case we have to track the status of the pipeline statistics query.
 */
1178static void
1179emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf,
1180                         struct tu_cs *cs,
1181                         enum VkQueryType query_type)
1182{
1183   bool is_secondary = cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY;
1184   cmdbuf->state.prim_counters_running--;
1185   if (cmdbuf->state.prim_counters_running == 0) {
1186      bool need_cond_exec =
1187         is_secondary &&
1188         query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT &&
1189         is_pipeline_query_with_vertex_stage(cmdbuf->inherited_pipeline_statistics);
1190
1191      if (!need_cond_exec) {
1192         tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
1193      } else {
1194         tu_cs_reserve(cs, 7 + 2);
         /* Check that the pipeline stats query is not running; only then
          * do we stop the counter.
          */
1198         tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
1199         tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1200         tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1201         tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
1202         tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */
1203
1204         tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
1205      }
1206   }
1207
1208   if (query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1209      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
1210      tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1211      tu_cs_emit(cs, 1);
1212   }
1213}
1214
1215static void
1216emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
1217                    struct tu_query_pool *pool,
1218                    uint32_t query)
1219{
1220   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1221   uint64_t end_iova = pipeline_stat_query_iova(pool, query, end);
1222   uint64_t available_iova = query_available_iova(pool, query);
1223   uint64_t result_iova;
1224   uint64_t stat_start_iova;
1225   uint64_t stat_stop_iova;
1226
1227   if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) {
      /* There is no need to conditionally emit STOP_PRIMITIVE_CTRS when we
       * are inside a VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query within a
       * renderpass, because the counters are already stopped there.
       */
1232      emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PIPELINE_STATISTICS);
1233   }
1234
1235   if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) {
1236      tu6_emit_event_write(cmdbuf, cs, STOP_FRAGMENT_CTRS);
1237   }
1238
1239   if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) {
1240      tu6_emit_event_write(cmdbuf, cs, STOP_COMPUTE_CTRS);
1241   }
1242
1243   tu_cs_emit_wfi(cs);
1244
1245   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1246   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
1247                  CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
1248                  CP_REG_TO_MEM_0_64B);
1249   tu_cs_emit_qw(cs, end_iova);
1250
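   /* For each captured counter: results[i] += end[i] - begin[i], done on the
    * CP with a 64-bit CP_MEM_TO_MEM (DOUBLE) where NEG_C negates the begin
    * value.
    */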
1251   for (int i = 0; i < STAT_COUNT; i++) {
1252      result_iova = query_result_iova(pool, query, uint64_t, i);
1253      stat_start_iova = pipeline_stat_query_iova(pool, query, begin[i]);
1254      stat_stop_iova = pipeline_stat_query_iova(pool, query, end[i]);
1255
1256      tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1257      tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
1258                     CP_MEM_TO_MEM_0_DOUBLE |
1259                     CP_MEM_TO_MEM_0_NEG_C);
1260
1261      tu_cs_emit_qw(cs, result_iova);
1262      tu_cs_emit_qw(cs, result_iova);
1263      tu_cs_emit_qw(cs, stat_stop_iova);
1264      tu_cs_emit_qw(cs, stat_start_iova);
1265   }
1266
1267   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1268
1269   if (cmdbuf->state.pass)
1270      cs = &cmdbuf->draw_epilogue_cs;
1271
1272   /* Set the availability to 1 */
1273   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1274   tu_cs_emit_qw(cs, available_iova);
1275   tu_cs_emit_qw(cs, 0x1);
1276}
1277
1278static void
1279emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
1280                         struct tu_query_pool *pool,
1281                         uint32_t query)
1282{
1283   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1284   uint64_t available_iova = query_available_iova(pool, query);
1285   uint64_t end_iova;
1286   uint64_t begin_iova;
1287   uint64_t result_iova;
1288   uint32_t last_pass = ~0;
1289
1290   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1291      struct tu_perf_query_data *data = &pool->perf_query_data[i];
1292
1293      if (last_pass != data->pass) {
1294         last_pass = data->pass;
1295
1296         if (data->pass != 0)
1297            tu_cond_exec_end(cs);
1298         emit_perfcntrs_pass_start(cs, data->pass);
1299      }
1300
1301      const struct fd_perfcntr_counter *counter =
1302            &pool->perf_group[data->gid].counters[data->cntr_reg];
1303
1304      end_iova = perf_query_iova(pool, 0, end, data->app_idx);
1305
1306      tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1307      tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
1308                     CP_REG_TO_MEM_0_64B);
1309      tu_cs_emit_qw(cs, end_iova);
1310   }
1311   tu_cond_exec_end(cs);
1312
1313   last_pass = ~0;
1314   tu_cs_emit_wfi(cs);
1315
1316   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1317      struct tu_perf_query_data *data = &pool->perf_query_data[i];
1318
1319      if (last_pass != data->pass) {
1320         last_pass = data->pass;
1321
1323         if (data->pass != 0)
1324            tu_cond_exec_end(cs);
1325         emit_perfcntrs_pass_start(cs, data->pass);
1326      }
1327
1328      result_iova = query_result_iova(pool, 0, struct perfcntr_query_slot,
1329             data->app_idx);
1330      begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
1331      end_iova = perf_query_iova(pool, 0, end, data->app_idx);
1332
1333      /* result += end - begin */
1334      tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1335      tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
1336                     CP_MEM_TO_MEM_0_DOUBLE |
1337                     CP_MEM_TO_MEM_0_NEG_C);
1338
1339      tu_cs_emit_qw(cs, result_iova);
1340      tu_cs_emit_qw(cs, result_iova);
1341      tu_cs_emit_qw(cs, end_iova);
1342      tu_cs_emit_qw(cs, begin_iova);
1343   }
1344   tu_cond_exec_end(cs);
1345
1346   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1347
1348   if (cmdbuf->state.pass)
1349      cs = &cmdbuf->draw_epilogue_cs;
1350
1351   /* Set the availability to 1 */
1352   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1353   tu_cs_emit_qw(cs, available_iova);
1354   tu_cs_emit_qw(cs, 0x1);
1355}
1356
1357static void
1358emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
1359                   struct tu_query_pool *pool,
1360                   uint32_t query,
1361                   uint32_t stream_id)
1362{
1363   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1364
1365   uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
1366   uint64_t result_written_iova = query_result_iova(pool, query, uint64_t, 0);
1367   uint64_t result_generated_iova = query_result_iova(pool, query, uint64_t, 1);
1368   uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
1369   uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
1370   uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
1371   uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1);
1372   uint64_t available_iova = query_available_iova(pool, query);
1373
1374   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = end_iova));
1375   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
1376
1377   tu_cs_emit_wfi(cs);
1378   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);
1379
1380   /* Set the count of written primitives */
1381   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1382   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1383                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
1384   tu_cs_emit_qw(cs, result_written_iova);
1385   tu_cs_emit_qw(cs, result_written_iova);
1386   tu_cs_emit_qw(cs, end_written_iova);
1387   tu_cs_emit_qw(cs, begin_written_iova);
1388
1389   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);
1390
1391   /* Set the count of generated primitives */
1392   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1393   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1394                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
1395   tu_cs_emit_qw(cs, result_generated_iova);
1396   tu_cs_emit_qw(cs, result_generated_iova);
1397   tu_cs_emit_qw(cs, end_generated_iova);
1398   tu_cs_emit_qw(cs, begin_generated_iova);
1399
1400   /* Set the availability to 1 */
1401   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1402   tu_cs_emit_qw(cs, available_iova);
1403   tu_cs_emit_qw(cs, 0x1);
1404}
1405
1406static void
1407emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
1408                              struct tu_query_pool *pool,
1409                              uint32_t query)
1410{
1411   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1412
1413   if (!cmdbuf->state.pass) {
1414      cmdbuf->state.prim_generated_query_running_before_rp = false;
1415   }
1416
1417   uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
1418   uint64_t end_iova = primitives_generated_query_iova(pool, query, end);
1419   uint64_t result_iova = primitives_generated_query_iova(pool, query, result);
1420   uint64_t available_iova = query_available_iova(pool, query);
1421
1422   if (cmdbuf->state.pass) {
1423      tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1424                             CP_COND_REG_EXEC_0_SYSMEM |
1425                             CP_COND_REG_EXEC_0_BINNING);
1426   }
1427
1428   tu_cs_emit_wfi(cs);
1429
   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, end_iova);

   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   /* This must come after waiting for the mem writes so that we have
    * up-to-date information about which query is running.
    */
   emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);

   if (cmdbuf->state.pass) {
      tu_cond_exec_end(cs);
   }

   if (cmdbuf->state.pass)
      cs = &cmdbuf->draw_epilogue_cs;

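   /* Only flag availability once the entire renderpass is done, which is why
    * the write below goes into the draw epilogue.
    */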
   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

/* Implement this bit of spec text from section 17.2 "Query Operation":
 *
 *     If queries are used while executing a render pass instance that has
 *     multiview enabled, the query uses N consecutive query indices in the
 *     query pool (starting at query) where N is the number of bits set in the
 *     view mask in the subpass the query is used in. How the numerical
 *     results of the query are distributed among the queries is
 *     implementation-dependent. For example, some implementations may write
 *     each view’s results to a distinct query, while other implementations
 *     may write the total result to the first query and write zero to the
 *     other queries. However, the sum of the results in all the queries must
 *     accurately reflect the total result of the query summed over all views.
 *     Applications can sum the results from all the queries to compute the
 *     total result.
 *
 * Since we execute all views at once, we write the total result to the first
 * query and zero to the other queries. Furthermore, because queries must be
 * reset before use and vkCmdResetQueryPool() already sets the results to 0,
 * we only need to mark the other queries as available.
 */

static void
handle_multiview_queries(struct tu_cmd_buffer *cmd,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
      return;

   unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
   struct tu_cs *cs = &cmd->draw_epilogue_cs;

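   /* The first query already holds the real result; mark the remaining
    * queries as available and leave their results at the zero written by
    * vkCmdResetQueryPool().
    */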
   for (uint32_t i = 1; i < views; i++) {
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
      tu_cs_emit_qw(cs, 0x1);
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_end_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      emit_end_prim_generated_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      emit_end_perf_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      emit_end_stat_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   handle_multiview_queries(cmdbuf, pool, query);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
                         VkQueryPool queryPool,
                         uint32_t query,
                         uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* There are only 4 streams, so the stream index must be below 4. */
      assert(index < 4);
      emit_end_xfb_query(cmdbuf, pool, query, index);
      break;
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      emit_end_prim_generated_query(cmdbuf, pool, query);
      break;
   default:
      assert(!"Invalid query type");
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
                      VkPipelineStageFlagBits2 pipelineStage,
                      VkQueryPool queryPool,
                      uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   /* Inside a render pass, just write the timestamp multiple times so that
    * the user gets the last one if we use GMEM. There isn't really much
    * better we can do, and this seems to be what the blob does too.
    */
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;

   /* Stages that will already have been executed by the time the CP executes
    * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
    * indirect stage counts as top-of-pipe too.
    */
   VkPipelineStageFlags2 top_of_pipe_flags =
      VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
      VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT;

   if (pipelineStage & ~top_of_pipe_flags) {
      /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
       * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
       * complete.
       *
       * Stalling the CP like this is really unfortunate, but I don't think
       * there's a better solution that allows all 48 bits of precision
       * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
       */
      tu_cs_emit_wfi(cs);
   }

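   /* Copy both halves of the always-on counter into the result slot as a
    * single 64-bit timestamp value.
    */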
   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0));

   /* Only flag availability once the entire renderpass is done, similar to
    * the begin/end path.
    */
   cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   /* From the spec for vkCmdWriteTimestamp:
    *
    *    If vkCmdWriteTimestamp is called while executing a render pass
    *    instance that has multiview enabled, the timestamp uses N consecutive
    *    query indices in the query pool (starting at query) where N is the
    *    number of bits set in the view mask of the subpass the command is
    *    executed in. The resulting query values are determined by an
    *    implementation-dependent choice of one of the following behaviors:
    *
    *    -   The first query is a timestamp value and (if more than one bit is
    *        set in the view mask) zero is written to the remaining queries.
    *        If two timestamps are written in the same subpass, the sum of the
    *        execution time of all views between those commands is the
    *        difference between the first query written by each command.
    *
    *    -   All N queries are timestamp values. If two timestamps are written
    *        in the same subpass, the sum of the execution time of all views
    *        between those commands is the sum of the difference between
    *        corresponding queries written by each command. The difference
    *        between corresponding queries may be the execution time of a
    *        single view.
    *
    * We execute all views in the same draw call, so we implement the first
    * option, the same as regular queries.
    */
   handle_multiview_queries(cmd, pool, query);
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
    VkPhysicalDevice                            physicalDevice,
    uint32_t                                    queueFamilyIndex,
    uint32_t*                                   pCounterCount,
    VkPerformanceCounterKHR*                    pCounters,
    VkPerformanceCounterDescriptionKHR*         pCounterDescriptions)
{
   TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);

   uint32_t desc_count = *pCounterCount;
   uint32_t group_count;
   const struct fd_perfcntr_group *group =
         fd_perfcntrs(&phydev->dev_id, &group_count);

   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, out, pCounters, pCounterCount);
   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, out_desc,
                          pCounterDescriptions, &desc_count);

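   /* Expose one counter per countable in every perfcntr group. The UUID is
    * derived from a SHA-1 of the countable name, so it should stay stable as
    * long as the name does not change.
    */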
   for (int i = 0; i < group_count; i++) {
      for (int j = 0; j < group[i].num_countables; j++) {

         vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
            counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_BUFFER_KHR;
            counter->unit =
                  fd_perfcntr_type_to_vk_unit[group[i].countables[j].query_type];
            counter->storage =
                  fd_perfcntr_type_to_vk_storage[group[i].countables[j].query_type];

            unsigned char sha1_result[20];
            _mesa_sha1_compute(group[i].countables[j].name,
                               strlen(group[i].countables[j].name),
                               sha1_result);
            memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
         }

         vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) {
            desc->flags = 0;

            snprintf(desc->name, sizeof(desc->name),
                     "%s", group[i].countables[j].name);
            snprintf(desc->category, sizeof(desc->category), "%s", group[i].name);
            snprintf(desc->description, sizeof(desc->description),
                     "%s: %s performance counter",
                     group[i].name, group[i].countables[j].name);
         }
      }
   }

   return vk_outarray_status(&out);
}

VKAPI_ATTR void VKAPI_CALL
tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
      VkPhysicalDevice                            physicalDevice,
      const VkQueryPoolPerformanceCreateInfoKHR*  pPerformanceQueryCreateInfo,
      uint32_t*                                   pNumPasses)
{
   TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
   uint32_t group_count = 0;
   uint32_t gid = 0, cid = 0, n_passes;
   const struct fd_perfcntr_group *group =
         fd_perfcntrs(&phydev->dev_id, &group_count);

   uint32_t counters_requested[group_count];
   memset(counters_requested, 0x0, sizeof(counters_requested));
   *pNumPasses = 1;

   for (unsigned i = 0; i < pPerformanceQueryCreateInfo->counterIndexCount; i++) {
      perfcntr_index(group, group_count,
                     pPerformanceQueryCreateInfo->pCounterIndices[i],
                     &gid, &cid);

      counters_requested[gid]++;
   }

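   /* A single pass can only sample as many countables per group as the group
    * has hardware counters, so the pass count is the maximum over all groups
    * of ceil(requested / available).
    */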
   for (uint32_t i = 0; i < group_count; i++) {
      n_passes = DIV_ROUND_UP(counters_requested[i], group[i].num_counters);
      *pNumPasses = MAX2(*pNumPasses, n_passes);
   }
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_AcquireProfilingLockKHR(VkDevice device,
                           const VkAcquireProfilingLockInfoKHR* pInfo)
{
   /* TODO: there is probably something to do here for kgsl. */
   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
tu_ReleaseProfilingLockKHR(VkDevice device)
{
   /* TODO: there is probably something to do here for kgsl. */
}
