/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "util/os_time.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

/* We reserve:
 *    - GPR 14 for perf queries
 *    - GPR 15 for conditional rendering
 */
#define MI_BUILDER_NUM_ALLOC_GPRS 14
#define MI_BUILDER_CAN_WRITE_BATCH GFX_VER >= 8
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
#include "common/mi_builder.h"
#include "perf/intel_perf.h"
#include "perf/intel_perf_mdapi.h"
#include "perf/intel_perf_regs.h"

#include "vk_util.h"

static struct anv_address
anv_query_address(struct anv_query_pool *pool, uint32_t query)
{
   return (struct anv_address) {
      .bo = pool->bo,
      .offset = query * pool->stride,
   };
}

VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = device->physical;
#if GFX_VER >= 8
   const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
   struct intel_perf_counter_pass *counter_pass;
   struct intel_perf_query_info **pass_query;
   uint32_t n_passes = 0;
#endif
   uint32_t data_offset = 0;
   VK_MULTIALLOC(ma);
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. For most query types, the first 64-bit value is the
    * "available" bit, which is 0 when the query is unavailable and 1 when
    * it is available. The 64-bit values that follow are determined by the
    * type of query.
    *
    * Performance queries are the exception: OA reports must be aligned to
    * 64 bytes, so we put those first and pack the "available" bit behind
    * them together with some other counters.
    */
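   /* As an illustration (not additional state): an occlusion query slot
    * under this scheme is three uint64_t values, { availability, begin
    * PS_DEPTH_COUNT, end PS_DEPTH_COUNT }, giving a 24-byte stride, and the
    * reported result is simply end - begin.
    */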
   uint32_t uint64s_per_slot = 0;

   VK_MULTIALLOC_DECL(&ma, struct anv_query_pool, pool, 1);

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot = 1 + 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot = 1 + 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a begin and end value for every statistic */
      uint64s_per_slot = 1 + 2 * util_bitcount(pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* Transform feedback queries are 4 values, begin/end for
       * primitives written and primitives needed.
       */
      uint64s_per_slot = 1 + 4;
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      const struct intel_perf_query_field_layout *layout =
         &pdevice->perf->query_layout;

      uint64s_per_slot = 2; /* availability + marker */
      /* Align to the requirement of the layout */
      uint64s_per_slot = align_u32(uint64s_per_slot,
                                   DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
      data_offset = uint64s_per_slot * sizeof(uint64_t);
      /* Add the query data for begin & end commands */
      uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
      break;
   }
#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      const struct intel_perf_query_field_layout *layout =
         &pdevice->perf->query_layout;

      perf_query_info = vk_find_struct_const(pCreateInfo->pNext,
                                             QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
      n_passes = intel_perf_get_n_passes(pdevice->perf,
                                         perf_query_info->pCounterIndices,
                                         perf_query_info->counterIndexCount,
                                         NULL);
      vk_multialloc_add(&ma, &counter_pass, struct intel_perf_counter_pass,
                             perf_query_info->counterIndexCount);
      vk_multialloc_add(&ma, &pass_query, struct intel_perf_query_info *,
                             n_passes);
      uint64s_per_slot = 4 /* availability + small batch */;
      /* Align to the requirement of the layout */
      uint64s_per_slot = align_u32(uint64s_per_slot,
                                   DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
      data_offset = uint64s_per_slot * sizeof(uint64_t);
      /* Add the query data for begin & end commands */
      uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
      /* Multiply by the number of passes */
      uint64s_per_slot *= n_passes;
      break;
   }
#endif
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      /* Query has two values: begin and end. */
      uint64s_per_slot = 1 + 2;
      break;
   default:
      assert(!"Invalid query type");
   }

   if (!vk_object_multialloc(&device->vk, &ma, pAllocator,
                             VK_OBJECT_TYPE_QUERY_POOL))
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
      pool->data_offset = data_offset;
      pool->snapshot_size = (pool->stride - data_offset) / 2;
   }
#if GFX_VER >= 8
   else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      pool->pass_size = pool->stride / n_passes;
      pool->data_offset = data_offset;
      pool->snapshot_size = (pool->pass_size - data_offset) / 2;
      pool->n_counters = perf_query_info->counterIndexCount;
      pool->counter_pass = counter_pass;
      intel_perf_get_counters_passes(pdevice->perf,
                                     perf_query_info->pCounterIndices,
                                     perf_query_info->counterIndexCount,
                                     pool->counter_pass);
      pool->n_passes = n_passes;
      pool->pass_query = pass_query;
      intel_perf_get_n_passes(pdevice->perf,
                              perf_query_info->pCounterIndices,
                              perf_query_info->counterIndexCount,
                              pool->pass_query);
   }
#endif

   uint64_t size = pool->slots * (uint64_t)pool->stride;
   result = anv_device_alloc_bo(device, "query-pool", size,
                                ANV_BO_ALLOC_MAPPED |
                                ANV_BO_ALLOC_SNOOPED,
                                0 /* explicit_address */,
                                &pool->bo);
   if (result != VK_SUCCESS)
      goto fail;

#if GFX_VER >= 8
   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      for (uint32_t p = 0; p < pool->n_passes; p++) {
         struct mi_builder b;
         struct anv_batch batch = {
            .start = pool->bo->map + khr_perf_query_preamble_offset(pool, p),
            .end = pool->bo->map + khr_perf_query_preamble_offset(pool, p) + pool->data_offset,
         };
         batch.next = batch.start;

         mi_builder_init(&b, &device->info, &batch);
         mi_store(&b, mi_reg64(ANV_PERF_QUERY_OFFSET_REG),
                      mi_imm(p * (uint64_t)pool->pass_size));
         anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
      }
   }
#endif

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->vk.alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_device_release_bo(device, pool->bo);
   vk_object_free(&device->vk, pAllocator, pool);
}

#if GFX_VER >= 8
/**
 * VK_KHR_performance_query layout:
 *
 * --------------------------------------------
 * |       availability (8b)       | |        |
 * |-------------------------------| |        |
 * |      Small batch loading      | |        |
 * |   ANV_PERF_QUERY_OFFSET_REG   | |        |
 * |            (24b)              | | Pass 0 |
 * |-------------------------------| |        |
 * |       some padding (see       | |        |
 * | query_field_layout:alignment) | |        |
 * |-------------------------------| |        |
 * |           query data          | |        |
 * | (2 * query_field_layout:size) | |        |
 * |-------------------------------|--        | Query 0
 * |       availability (8b)       | |        |
 * |-------------------------------| |        |
 * |      Small batch loading      | |        |
 * |   ANV_PERF_QUERY_OFFSET_REG   | |        |
 * |            (24b)              | | Pass 1 |
 * |-------------------------------| |        |
 * |       some padding (see       | |        |
 * | query_field_layout:alignment) | |        |
 * |-------------------------------| |        |
 * |           query data          | |        |
 * | (2 * query_field_layout:size) | |        |
 * |-------------------------------|-----------
 * |       availability (8b)       | |        |
 * |-------------------------------| |        |
 * |      Small batch loading      | |        |
 * |   ANV_PERF_QUERY_OFFSET_REG   | |        |
 * |            (24b)              | | Pass 0 |
 * |-------------------------------| |        |
 * |       some padding (see       | |        |
 * | query_field_layout:alignment) | |        |
 * |-------------------------------| |        |
 * |           query data          | |        |
 * | (2 * query_field_layout:size) | |        |
 * |-------------------------------|--        | Query 1
 * |               ...             | |        |
 * --------------------------------------------
 */
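/* Illustrative summary of the diagram above in terms of the helpers below
 * (no additional state): the region for a given (query, pass) pair starts at
 * query * stride + pass * pass_size; the availability qword sits at that
 * base offset, and the begin/end data snapshots follow at data_offset and
 * data_offset + snapshot_size respectively.
 */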

static uint64_t
khr_perf_query_availability_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
{
   return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size;
}

static uint64_t
khr_perf_query_data_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
{
   return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size +
      pool->data_offset + (end ? pool->snapshot_size : 0);
}

static struct anv_address
khr_perf_query_availability_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
{
   return anv_address_add(
      (struct anv_address) { .bo = pool->bo, },
      khr_perf_query_availability_offset(pool, query, pass));
}

static struct anv_address
khr_perf_query_data_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
{
   return anv_address_add(
      (struct anv_address) { .bo = pool->bo, },
      khr_perf_query_data_offset(pool, query, pass, end));
}

static bool
khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer)
{
   if (anv_batch_has_error(&cmd_buffer->batch))
      return false;

   if (cmd_buffer->self_mod_locations)
      return true;

   struct anv_device *device = cmd_buffer->device;
   const struct anv_physical_device *pdevice = device->physical;

   cmd_buffer->self_mod_locations =
      vk_alloc(&cmd_buffer->vk.pool->alloc,
               pdevice->n_perf_query_commands * sizeof(*cmd_buffer->self_mod_locations), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (!cmd_buffer->self_mod_locations) {
      anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return false;
   }

   return true;
}
#endif

/**
 * VK_INTEL_performance_query layout:
 *
 * ---------------------------------
 * |       availability (8b)       |
 * |-------------------------------|
 * |          marker (8b)          |
 * |-------------------------------|
 * |       some padding (see       |
 * | query_field_layout:alignment) |
 * |-------------------------------|
 * |           query data          |
 * | (2 * query_field_layout:size) |
 * ---------------------------------
 */
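/* For illustration: with this layout the marker lives at byte offset 8 of
 * the slot (see intel_perf_marker_offset() below), and the begin/end data
 * snapshots start at pool->data_offset and pool->data_offset +
 * pool->snapshot_size (see intel_perf_query_data_offset()).
 */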

static uint32_t
intel_perf_marker_offset(void)
{
   return 8;
}

static uint32_t
intel_perf_query_data_offset(struct anv_query_pool *pool, bool end)
{
   return pool->data_offset + (end ? pool->snapshot_size : 0);
}

static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

static void *
query_slot(struct anv_query_pool *pool, uint32_t query)
{
   return pool->bo->map + query * pool->stride;
}

static bool
query_is_available(struct anv_query_pool *pool, uint32_t query)
{
#if GFX_VER >= 8
   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      for (uint32_t p = 0; p < pool->n_passes; p++) {
         volatile uint64_t *slot =
            pool->bo->map + khr_perf_query_availability_offset(pool, query, p);
         if (!slot[0])
            return false;
      }
      return true;
   }
#endif

   return *(volatile uint64_t *)query_slot(pool, query);
}

static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint32_t query)
{
   uint64_t abs_timeout_ns = os_time_get_absolute_timeout(2 * NSEC_PER_SEC);

   while (os_time_get_nano() < abs_timeout_ns) {
      if (query_is_available(pool, query))
         return VK_SUCCESS;
      VkResult status = vk_device_check_status(&device->vk);
      if (status != VK_SUCCESS)
         return status;
   }

   return vk_device_set_lost(&device->vk, "query timeout");
}

VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP ||
          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL ||
          pool->type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);

   if (vk_device_is_lost(&device->vk))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      bool available = query_is_available(pool, firstQuery + i);

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, firstQuery + i);
         if (status != VK_SUCCESS) {
            return status;
         }

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       *
       * From VK_KHR_performance_query:
       *
       *    "VK_QUERY_RESULT_PERFORMANCE_QUERY_RECORDED_COUNTERS_BIT_KHR specifies
       *     that the result should contain the number of counters that were recorded
       *     into a query pool of type ename:VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR"
       */
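      /* Concretely (as an illustration): with VK_QUERY_RESULT_64_BIT |
       * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT, each destination slot ends up
       * holding the result value(s) followed by one extra 64-bit word that is
       * 0 or 1 depending on availability; an unavailable query without
       * PARTIAL/WAIT only gets that trailing availability word written.
       */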
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
      case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results) {
            /* From the Vulkan 1.2.132 spec:
             *
             *    "If VK_QUERY_RESULT_PARTIAL_BIT is set,
             *    VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status
             *    is unavailable, an intermediate result value between zero and
             *    the final result value is written to pData for that query."
             */
            uint64_t result = available ? slot[2] - slot[1] : 0;
            cpu_write_query_result(pData, flags, idx, result);
         }
         idx++;
         break;
      }

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);
            if (write_results) {
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.ver == 8 || device->info.verx10 == 75) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);
            }
            idx++;
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
         idx++;
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[1]);
         idx++;
         break;
      }

#if GFX_VER >= 8
      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
         const struct anv_physical_device *pdevice = device->physical;
         assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT |
                          VK_QUERY_RESULT_PARTIAL_BIT)) == 0);
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            const struct intel_perf_query_info *query = pool->pass_query[p];
            struct intel_perf_query_result result;
            intel_perf_query_result_clear(&result);
            intel_perf_query_result_accumulate_fields(&result, query,
                                                      pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, false),
                                                      pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, true),
                                                      false /* no_oa_accumulate */);
            anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData);
         }
         break;
      }
#endif

      case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
         if (!write_results)
            break;
         const void *query_data = query_slot(pool, firstQuery + i);
         const struct intel_perf_query_info *query = &device->physical->perf->queries[0];
         struct intel_perf_query_result result;
         intel_perf_query_result_clear(&result);
         intel_perf_query_result_accumulate_fields(&result, query,
                                                   query_data + intel_perf_query_data_offset(pool, false),
                                                   query_data + intel_perf_query_data_offset(pool, true),
                                                   false /* no_oa_accumulate */);
         intel_perf_query_result_write_mdapi(pData, stride,
                                             &device->info,
                                             query, &result);
         const uint64_t *marker = query_data + intel_perf_marker_offset();
         intel_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
         break;
      }

      default:
         unreachable("invalid pool type");
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, idx, available);

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_address addr)
{
   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WritePSDepthCount;
      pc.DepthStallEnable        = true;
      pc.Address                 = addr;

      if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

static void
emit_query_mi_availability(struct mi_builder *b,
                           struct anv_address addr,
                           bool available)
{
   mi_store(b, mi_mem64(addr), mi_imm(available));
}

static void
emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
                           struct anv_address addr,
                           bool available)
{
   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WriteImmediateData;
      pc.Address                 = addr;
      pc.ImmediateData           = available;
   }
}

/**
 * Walks through a series of consecutive query indices in the given pool,
 * setting all element values to 0 and marking them as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct mi_builder *b, struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      /* These queries are written with a PIPE_CONTROL so clear them using the
       * PIPE_CONTROL as well so we don't have to synchronize between 2 types
       * of operations.
       */
      assert((pool->stride % 8) == 0);
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);

         for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
            emit_query_pc_availability(cmd_buffer,
                                       anv_address_add(slot_addr, qword * 8),
                                       false);
         }
         emit_query_pc_availability(cmd_buffer, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      for (uint32_t i = 0; i < num_queries; i++) {
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            mi_memset(b, khr_perf_query_data_address(pool, first_index + i, p, false),
                         0, 2 * pool->snapshot_size);
            emit_query_mi_availability(b,
                                       khr_perf_query_availability_address(pool, first_index + i, p),
                                       true);
         }
      }
      break;
   }
#endif

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

   default:
      unreachable("Unsupported query type");
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      for (uint32_t i = 0; i < queryCount; i++) {
         emit_query_pc_availability(cmd_buffer,
                                    anv_query_address(pool, firstQuery + i),
                                    false);
      }
      break;

   case VK_QUERY_TYPE_TIMESTAMP: {
      for (uint32_t i = 0; i < queryCount; i++) {
         emit_query_pc_availability(cmd_buffer,
                                    anv_query_address(pool, firstQuery + i),
                                    false);
      }

      /* Add a CS stall here to make sure the PIPE_CONTROL above has
       * completed. Otherwise some timestamps written later with MI_STORE_*
       * commands might race with the PIPE_CONTROL in the loop above.
       */
      anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_CS_STALL_BIT,
                                "vkCmdResetQueryPool of timestamps");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
      break;
   }

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
      struct mi_builder b;
      mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      struct mi_builder b;
      mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++) {
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            emit_query_mi_availability(
               &b,
               khr_perf_query_availability_address(pool, firstQuery + i, p),
               false);
         }
      }
      break;
   }
#endif

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      struct mi_builder b;
      mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

   default:
      unreachable("Unsupported query type");
   }
}

void genX(ResetQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
#if GFX_VER >= 8
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            uint64_t *pass_slot = pool->bo->map +
               khr_perf_query_availability_offset(pool, firstQuery + i, p);
            *pass_slot = 0;
         }
#endif
      } else {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         *slot = 0;
      }
   }
}

static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

static void
emit_pipeline_stat(struct mi_builder *b, uint32_t stat,
                   struct anv_address addr)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   mi_store(b, mi_mem64(addr), mi_reg64(vk_pipeline_stat_to_reg[stat]));
}

static void
emit_xfb_query(struct mi_builder *b, uint32_t stream,
               struct anv_address addr)
{
   assert(stream < MAX_XFB_STREAMS);

   mi_store(b, mi_mem64(anv_address_add(addr, 0)),
               mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
   mi_store(b, mi_mem64(anv_address_add(addr, 16)),
               mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
}

static void
emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer,
                      struct anv_query_pool *pool,
                      struct mi_builder *b,
                      struct anv_address query_addr,
                      bool end)
{
   const struct intel_perf_query_field_layout *layout =
      &cmd_buffer->device->physical->perf->query_layout;
   struct anv_address data_addr =
      anv_address_add(query_addr, intel_perf_query_data_offset(pool, end));

   for (uint32_t f = 0; f < layout->n_fields; f++) {
      const struct intel_perf_query_field *field =
         &layout->fields[end ? f : (layout->n_fields - 1 - f)];

      switch (field->type) {
      case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
            rpc.MemoryAddress = anv_address_add(data_addr, field->location);
         }
         break;

      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: {
         struct anv_address addr = anv_address_add(data_addr, field->location);
         struct mi_value src = field->size == 8 ?
            mi_reg64(field->mmio_offset) :
            mi_reg32(field->mmio_offset);
         struct mi_value dst = field->size == 8 ?
            mi_mem64(addr) : mi_mem32(addr);
         mi_store(b, dst, src);
         break;
      }

      default:
         unreachable("Invalid query field");
         break;
      }
   }
}

void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
}

void genX(CmdBeginQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)),
                   mi_reg64(GENX(CL_INVOCATION_COUNT_num)));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
      break;

#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      if (!khr_perf_query_ensure_relocs(cmd_buffer))
         return;

      const struct anv_physical_device *pdevice = cmd_buffer->device->physical;
      const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout;

      uint32_t reloc_idx = 0;
      for (uint32_t end = 0; end < 2; end++) {
         for (uint32_t r = 0; r < layout->n_fields; r++) {
            const struct intel_perf_query_field *field =
               &layout->fields[end ? r : (layout->n_fields - 1 - r)];
            struct mi_value reg_addr =
               mi_iadd(
                  &b,
                  mi_imm(intel_canonical_address(pool->bo->offset +
                                                 khr_perf_query_data_offset(pool, query, 0, end) +
                                                 field->location)),
                  mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
            cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr);

            if (field->type != INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC &&
                field->size == 8) {
               reg_addr =
                  mi_iadd(
                     &b,
                     mi_imm(intel_canonical_address(pool->bo->offset +
                                                    khr_perf_query_data_offset(pool, query, 0, end) +
                                                    field->location + 4)),
                     mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
               cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr);
            }
         }
      }

      struct mi_value availability_write_offset =
         mi_iadd(
            &b,
            mi_imm(
               intel_canonical_address(
                  pool->bo->offset +
                  khr_perf_query_availability_offset(pool, query, 0 /* pass */))),
            mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
      cmd_buffer->self_mod_locations[reloc_idx++] =
         mi_store_address(&b, availability_write_offset);

      assert(reloc_idx == pdevice->n_perf_query_commands);

      mi_self_mod_barrier(&b);

      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      cmd_buffer->perf_query_pool = pool;

      cmd_buffer->perf_reloc_idx = 0;
      for (uint32_t r = 0; r < layout->n_fields; r++) {
         const struct intel_perf_query_field *field =
            &layout->fields[layout->n_fields - 1 - r];
         void *dws;

         switch (field->type) {
         case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
            dws = anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_REPORT_PERF_COUNT_length),
                                  GENX(MI_REPORT_PERF_COUNT),
                                  .MemoryAddress = query_addr /* Will be overwritten */);
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
            break;

         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
            dws =
               anv_batch_emitn(&cmd_buffer->batch,
                               GENX(MI_STORE_REGISTER_MEM_length),
                               GENX(MI_STORE_REGISTER_MEM),
                               .RegisterAddress = field->mmio_offset,
                               .MemoryAddress = query_addr /* Will be overwritten */ );
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            if (field->size == 8) {
               dws =
                  anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_STORE_REGISTER_MEM_length),
                                  GENX(MI_STORE_REGISTER_MEM),
                                  .RegisterAddress = field->mmio_offset + 4,
                                  .MemoryAddress = query_addr /* Will be overwritten */ );
               _mi_resolve_address_token(&b,
                                         cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                         dws +
                                         GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            }
            break;

         default:
            unreachable("Invalid query field");
            break;
         }
      }
      break;
   }
#endif

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, false);
      break;
   }

   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
}

void genX(CmdEndQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
      emit_query_pc_availability(cmd_buffer, query_addr, true);
      break;

   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      /* Ensure previous commands have completed before capturing the register
       * value.
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      mi_store(&b, mi_mem64(anv_address_add(query_addr, 16)),
                   mi_reg64(GENX(CL_INVOCATION_COUNT_num)));
      emit_query_mi_availability(&b, query_addr, true);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }

      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
      emit_query_mi_availability(&b, query_addr, true);
      break;

#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      cmd_buffer->perf_query_pool = pool;

      if (!khr_perf_query_ensure_relocs(cmd_buffer))
         return;

      const struct anv_physical_device *pdevice = cmd_buffer->device->physical;
      const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout;

      void *dws;
      for (uint32_t r = 0; r < layout->n_fields; r++) {
         const struct intel_perf_query_field *field = &layout->fields[r];

         switch (field->type) {
         case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
            dws = anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_REPORT_PERF_COUNT_length),
                                  GENX(MI_REPORT_PERF_COUNT),
                                  .MemoryAddress = query_addr /* Will be overwritten */);
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
            break;

         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
            dws =
               anv_batch_emitn(&cmd_buffer->batch,
                               GENX(MI_STORE_REGISTER_MEM_length),
                               GENX(MI_STORE_REGISTER_MEM),
                               .RegisterAddress = field->mmio_offset,
                               .MemoryAddress = query_addr /* Will be overwritten */ );
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            if (field->size == 8) {
               dws =
                  anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_STORE_REGISTER_MEM_length),
                                  GENX(MI_STORE_REGISTER_MEM),
                                  .RegisterAddress = field->mmio_offset + 4,
                                  .MemoryAddress = query_addr /* Will be overwritten */ );
               _mi_resolve_address_token(&b,
                                         cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                         dws +
                                         GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            }
            break;

         default:
            unreachable("Invalid query field");
            break;
         }
      }

      dws =
         anv_batch_emitn(&cmd_buffer->batch,
                         GENX(MI_STORE_DATA_IMM_length),
                         GENX(MI_STORE_DATA_IMM),
                         .ImmediateData = true);
      _mi_resolve_address_token(&b,
                                cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                dws +
                                GENX(MI_STORE_DATA_IMM_Address_start) / 8);

      assert(cmd_buffer->perf_reloc_idx == pdevice->n_perf_query_commands);
      break;
   }
#endif

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      uint32_t marker_offset = intel_perf_marker_offset();
      mi_store(&b, mi_mem64(anv_address_add(query_addr, marker_offset)),
                   mi_imm(cmd_buffer->intel_perf_marker));
      emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, true);
      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   default:
      unreachable("");
   }

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows us to write the results to only one of the queries,
    * but we still need to manage result availability for all the query
    * indices. Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
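   /* For example (illustrative only): with view_mask = 0b0101 there are two
    * active views, so the application must have allocated queries `query`
    * and `query + 1`; the code below zeroes and marks available the one
    * extra index (num_queries - 1 == 1) starting at query + 1.
    */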
   if (cmd_buffer->state.gfx.view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.gfx.view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp2)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlags2                       stage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) {
      mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)),
                   mi_reg64(TIMESTAMP));
      emit_query_mi_availability(&b, query_addr, true);
   } else {
      /* Everything else is bottom-of-pipe */
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType  = DAT_PPGTT;
         pc.PostSyncOperation       = WriteTimestamp;
         pc.Address                 = anv_address_add(query_addr, 8);

         if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      emit_query_pc_availability(cmd_buffer, query_addr, true);
   }

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows us to write the results to only one of the queries,
    * but we still need to manage result availability for all the query
    * indices. Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.gfx.view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.gfx.view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

#if GFX_VERx10 >= 75

#define MI_PREDICATE_SRC0    0x2400
#define MI_PREDICATE_SRC1    0x2408
#define MI_PREDICATE_RESULT  0x2418

/**
 * Writes the results of a query to dst_addr if the value at poll_addr is
 * equal to the reference value.
 */
static void
gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer,
                            struct mi_builder *b,
                            struct anv_address poll_addr,
                            struct anv_address dst_addr,
                            uint64_t ref_value,
                            VkQueryResultFlags flags,
                            uint32_t value_index,
                            struct mi_value query_result)
{
   mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem64(poll_addr));
   mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(ref_value));
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOAD;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
      mi_store_if(b, mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      mi_store_if(b, mi_mem32(res_addr), query_result);
   }
}

static void
gpu_write_query_result(struct mi_builder *b,
                       struct anv_address dst_addr,
                       VkQueryResultFlags flags,
                       uint32_t value_index,
                       struct mi_value query_result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
      mi_store(b, mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      mi_store(b, mi_mem32(res_addr), query_result);
   }
}

static struct mi_value
compute_query_result(struct mi_builder *b, struct anv_address addr)
{
   return mi_isub(b, mi_mem64(anv_address_add(addr, 8)),
                     mi_mem64(anv_address_add(addr, 0)));
}

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
   struct mi_value result;

   /* If render target writes are ongoing, request a render target cache flush
    * to ensure proper ordering of the commands from the 3d pipe and the
    * command streamer.
    */
   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_TILE_CACHE_FLUSH_BIT |
                                ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
                                "CopyQueryPoolResults");
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
       /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
        * because we're about to copy values from MI commands, we need to
        * stall the command streamer to make sure the PIPE_CONTROL values have
        * landed, otherwise we could see inconsistent values & availability.
        *
        *  From the vulkan spec:
        *
        *     "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
        *     previous uses of vkCmdResetQueryPool in the same queue, without
        *     any additional synchronization."
        */
       pool->type == VK_QUERY_TYPE_OCCLUSION ||
       pool->type == VK_QUERY_TYPE_TIMESTAMP) {
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CS_STALL_BIT,
                                "CopyQueryPoolResults");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }

   struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
   for (uint32_t i = 0; i < queryCount; i++) {
      struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
      case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         /* Like in the case of vkGetQueryPoolResults, if the query is
          * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
          * conservatively write 0 as the query result. If the
          * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
          */
         gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
               1 /* available */, flags, idx, result);
         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
                  0 /* unavailable */, flags, idx, mi_imm(0));
         }
         idx++;
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            result = compute_query_result(&b, anv_address_add(query_addr,
                                                              idx * 16 + 8));

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.ver == 8 ||
                 cmd_buffer->device->info.verx10 == 75) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               result = mi_ushr32_imm(&b, result, 2);
            }

            gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         result = compute_query_result(&b, anv_address_add(query_addr, 24));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         result = mi_mem64(anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

#if GFX_VER >= 8
      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
         unreachable("Copy KHR performance query results not implemented");
         break;
#endif

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         gpu_write_query_result(&b, dest_addr, flags, idx,
                                mi_mem64(query_addr));
      }

      dest_addr = anv_address_add(dest_addr, destStride);
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif