1 /*
2  * Copyright © 2020 Raspberry Pi Ltd
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
#include "v3dv_private.h"

#include <errno.h>
#include <string.h>

#include "util/timespec.h"
27 
/* Static description of every performance counter exposed by the driver.
 * Each row is { category, counter name, human-readable description }, and
 * the row index is the counter index used by the performance-query code
 * below (presumably also what gets programmed into the kernel perfmons —
 * verify against the enumeration entry point).
 *
 * NOTE(review): "PTB-primitives-discared-reversed" contains a typo
 * ("discared"), but the string is part of the counter name exposed to
 * applications, so renaming it would be an ABI-visible change.
 */
static const char *v3dv_counters[][3] = {
   {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"},
   {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"},
   {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"},
   {"FEP", "FEP-valid-quads", "[FEP] Valid quads"},
   {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"},
   {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"},
   {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"},
   {"TLB", "TLB-quads-with-zero-coverage", "[TLB] Quads with all pixels having zero coverage"},
   {"TLB", "TLB-quads-with-non-zero-coverage", "[TLB] Quads with any pixels having non-zero coverage"},
   {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"},
   {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"},
   {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"},
   {"PTB", "PTB-primitives-discared-reversed", "[PTB] Primitives that are discarded because they are reversed"},
   {"QPU", "QPU-total-idle-clk-cycles", "[QPU] Total idle clock cycles for all QPUs"},
   {"QPU", "QPU-total-active-clk-cycles-vertex-coord-shading", "[QPU] Total active clock cycles for all QPUs doing vertex/coordinate/user shading (counts only when QPU is not stalled)"},
   {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"},
   {"QPU", "QPU-total-clk-cycles-executing-valid-instr", "[QPU] Total clock cycles for all QPUs executing valid instructions"},
   {"QPU", "QPU-total-clk-cycles-waiting-TMU", "[QPU] Total clock cycles for all QPUs stalled waiting for TMUs only (counter won't increment if QPU also stalling for another reason)"},
   {"QPU", "QPU-total-clk-cycles-waiting-scoreboard", "[QPU] Total clock cycles for all QPUs stalled waiting for Scoreboard only (counter won't increment if QPU also stalling for another reason)"},
   {"QPU", "QPU-total-clk-cycles-waiting-varyings", "[QPU] Total clock cycles for all QPUs stalled waiting for Varyings only (counter won't increment if QPU also stalling for another reason)"},
   {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"},
   {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"},
   {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"},
   {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"},
   {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"},
   {"TMU", "TMU-total-text-cache-miss", "[TMU] Total texture cache misses (number of fetches from memory/L2cache)"},
   {"VPM", "VPM-total-clk-cycles-VDW-stalled", "[VPM] Total clock cycles VDW is stalled waiting for VPM access"},
   {"VPM", "VPM-total-clk-cycles-VCD-stalled", "[VPM] Total clock cycles VCD is stalled waiting for VPM access"},
   {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"},
   {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"},
   {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"},
   {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"},
   {"CORE", "cycle-count", "[CORE] Cycle counter"},
   {"QPU", "QPU-total-clk-cycles-waiting-vertex-coord-shading", "[QPU] Total stalled clock cycles for all QPUs doing vertex/coordinate/user shading"},
   {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"},
   {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"},
   {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"},
   {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"},
   {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"},
   {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"},
   {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"},
   {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"},
   {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"},
   {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"},
   {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"},
   {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"},
   {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"},
   {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"},
   {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"},
   {"TMU", "TMU-total-config-access", "[TMU] Total config accesses"},
   {"L2T", "L2T-no-id-stalled", "[L2T] No ID stall"},
   {"L2T", "L2T-command-queue-stalled", "[L2T] Command queue full stall"},
   {"L2T", "L2T-TMU-writes", "[L2T] TMU write accesses"},
   {"TMU", "TMU-active-cycles", "[TMU] Active cycles"},
   {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"},
   {"CLE", "CLE-thread-active-cycles", "[CLE] Bin or render thread active cycles"},
   {"L2T", "L2T-TMU-reads", "[L2T] TMU read accesses"},
   {"L2T", "L2T-CLE-reads", "[L2T] CLE read accesses"},
   {"L2T", "L2T-VCD-reads", "[L2T] VCD read accesses"},
   {"L2T", "L2T-TMU-config-reads", "[L2T] TMU CFG read accesses"},
   {"L2T", "L2T-SLC0-reads", "[L2T] SLC0 read accesses"},
   {"L2T", "L2T-SLC1-reads", "[L2T] SLC1 read accesses"},
   {"L2T", "L2T-SLC2-reads", "[L2T] SLC2 read accesses"},
   {"L2T", "L2T-TMU-write-miss", "[L2T] TMU write misses"},
   {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"},
   {"L2T", "L2T-CLE-read-miss", "[L2T] CLE read misses"},
   {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"},
   {"L2T", "L2T-TMU-config-read-miss", "[L2T] TMU CFG read misses"},
   {"L2T", "L2T-SLC0-read-miss", "[L2T] SLC0 read misses"},
   {"L2T", "L2T-SLC1-read-miss", "[L2T] SLC1 read misses"},
   {"L2T", "L2T-SLC2-read-miss", "[L2T] SLC2 read misses"},
   {"CORE", "core-memory-writes", "[CORE] Total memory writes"},
   {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"},
   {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"},
   {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"},
   {"CORE", "core-memory-reads", "[CORE] Total memory reads"},
   {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"},
   {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"},
   {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"},
   {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"},
   {"GMP", "GMP-memory-reads", "[GMP] Total memory reads"},
   {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"},
   {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"},
   {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"},
   {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"},
   {"TMU", "TMU-MRU-hits", "[TMU] Total MRU hits"},
   {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"},
};
117 
118 static void
kperfmon_create(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t query)119 kperfmon_create(struct v3dv_device *device,
120                 struct v3dv_query_pool *pool,
121                 uint32_t query)
122 {
123    for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
124       assert(i * DRM_V3D_MAX_PERF_COUNTERS < pool->perfmon.ncounters);
125 
126       struct drm_v3d_perfmon_create req = {
127          .ncounters = MIN2(pool->perfmon.ncounters -
128                            i * DRM_V3D_MAX_PERF_COUNTERS,
129                            DRM_V3D_MAX_PERF_COUNTERS),
130       };
131       memcpy(req.counters,
132              &pool->perfmon.counters[i * DRM_V3D_MAX_PERF_COUNTERS],
133              req.ncounters);
134 
135       int ret = v3dv_ioctl(device->pdevice->render_fd,
136                            DRM_IOCTL_V3D_PERFMON_CREATE,
137                            &req);
138       if (ret)
139          fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret));
140 
141       pool->queries[query].perf.kperfmon_ids[i] = req.id;
142    }
143 }
144 
145 static void
kperfmon_destroy(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t query)146 kperfmon_destroy(struct v3dv_device *device,
147                  struct v3dv_query_pool *pool,
148                  uint32_t query)
149 {
150    /* Skip destroying if never created */
151    if (!pool->queries[query].perf.kperfmon_ids[0])
152       return;
153 
154    for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
155       struct drm_v3d_perfmon_destroy req = {
156          .id = pool->queries[query].perf.kperfmon_ids[i]
157       };
158 
159       int ret = v3dv_ioctl(device->pdevice->render_fd,
160                            DRM_IOCTL_V3D_PERFMON_DESTROY,
161                            &req);
162 
163       if (ret) {
164          fprintf(stderr, "Failed to destroy perfmon %u: %s\n",
165                  req.id, strerror(ret));
166       }
167    }
168 }
169 
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_CreateQueryPool(VkDevice _device,
                     const VkQueryPoolCreateInfo *pCreateInfo,
                     const VkAllocationCallbacks *pAllocator,
                     VkQueryPool *pQueryPool)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);

   /* These are the only query types this function handles; anything else
    * hits the unreachable() below.
    */
   assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION ||
          pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP ||
          pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
   assert(pCreateInfo->queryCount > 0);

   /* zalloc so pool->bo / pool->queries are NULL until allocated, which the
    * fail path below relies on.
    */
   struct v3dv_query_pool *pool =
      vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool),
                       VK_OBJECT_TYPE_QUERY_POOL);
   if (pool == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->query_type = pCreateInfo->queryType;
   pool->query_count = pCreateInfo->queryCount;

   /* Tracks how many per-query syncs were successfully created so the fail
    * path only destroys what was actually initialized.
    */
   uint32_t query_idx = 0;
   VkResult result;

   const uint32_t pool_bytes = sizeof(struct v3dv_query) * pool->query_count;
   pool->queries = vk_alloc2(&device->vk.alloc, pAllocator, pool_bytes, 8,
                             VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool->queries == NULL) {
      result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto fail;
   }

   /* Per-type pool setup: occlusion pools get a single BO shared by all
    * queries; performance pools record the selected counter list; timestamp
    * pools need no extra backing storage.
    */
   switch (pool->query_type) {
   case VK_QUERY_TYPE_OCCLUSION: {
      /* The hardware allows us to setup groups of 16 queries in consecutive
       * 4-byte addresses, requiring only that each group of 16 queries is
       * aligned to a 1024 byte boundary.
       */
      const uint32_t query_groups = DIV_ROUND_UP(pool->query_count, 16);
      const uint32_t bo_size = query_groups * 1024;
      pool->bo = v3dv_bo_alloc(device, bo_size, "query", true);
      if (!pool->bo) {
         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
         goto fail;
      }
      if (!v3dv_bo_map(device, pool->bo, bo_size)) {
         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
         goto fail;
      }
      break;
   }
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      const VkQueryPoolPerformanceCreateInfoKHR *pq_info =
         vk_find_struct_const(pCreateInfo->pNext,
                              QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);

      assert(pq_info);
      assert(pq_info->counterIndexCount <= V3D_PERFCNT_NUM);

      pool->perfmon.ncounters = pq_info->counterIndexCount;
      for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
         pool->perfmon.counters[i] = pq_info->pCounterIndices[i];

      /* One kernel perfmon per DRM_V3D_MAX_PERF_COUNTERS counters */
      pool->perfmon.nperfmons = DIV_ROUND_UP(pool->perfmon.ncounters,
                                             DRM_V3D_MAX_PERF_COUNTERS);

      assert(pool->perfmon.nperfmons <= V3DV_MAX_PERFMONS);
      break;
   }
   case VK_QUERY_TYPE_TIMESTAMP:
      break;
   default:
      unreachable("Unsupported query type");
   }

   /* Per-query initialization. Queries start unavailable, matching the
    * "unavailable" initial state required for a freshly created pool.
    */
   for (; query_idx < pool->query_count; query_idx++) {
      pool->queries[query_idx].maybe_available = false;
      switch (pool->query_type) {
      case VK_QUERY_TYPE_OCCLUSION: {
         /* 16 consecutive 4-byte counters per 1024-byte group (see above) */
         const uint32_t query_group = query_idx / 16;
         const uint32_t query_offset = query_group * 1024 + (query_idx % 16) * 4;
         pool->queries[query_idx].bo = pool->bo;
         pool->queries[query_idx].offset = query_offset;
         break;
         }
      case VK_QUERY_TYPE_TIMESTAMP:
         pool->queries[query_idx].value = 0;
         break;
      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
         /* Sync used to wait for the last job that wrote this query */
         result = vk_sync_create(&device->vk,
                                 &device->pdevice->drm_syncobj_type, 0, 0,
                                 &pool->queries[query_idx].perf.last_job_sync);
         if (result != VK_SUCCESS)
            goto fail;

         /* kperfmon ids are created lazily on reset; 0 means "not created" */
         for (uint32_t j = 0; j < pool->perfmon.nperfmons; j++)
            pool->queries[query_idx].perf.kperfmon_ids[j] = 0;
         break;
         }
      default:
         unreachable("Unsupported query type");
      }
   }

   *pQueryPool = v3dv_query_pool_to_handle(pool);

   return VK_SUCCESS;

fail:
   /* Only destroy the syncs created so far (query_idx of them) */
   if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      for (uint32_t j = 0; j < query_idx; j++)
         vk_sync_destroy(&device->vk, pool->queries[j].perf.last_job_sync);
   }

   if (pool->bo)
      v3dv_bo_free(device, pool->bo);
   if (pool->queries)
      vk_free2(&device->vk.alloc, pAllocator, pool->queries);
   vk_object_free(&device->vk, pAllocator, pool);

   return result;
}
293 
294 VKAPI_ATTR void VKAPI_CALL
v3dv_DestroyQueryPool(VkDevice _device, VkQueryPool queryPool, const VkAllocationCallbacks *pAllocator)295 v3dv_DestroyQueryPool(VkDevice _device,
296                       VkQueryPool queryPool,
297                       const VkAllocationCallbacks *pAllocator)
298 {
299    V3DV_FROM_HANDLE(v3dv_device, device, _device);
300    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
301 
302    if (!pool)
303       return;
304 
305    if (pool->bo)
306       v3dv_bo_free(device, pool->bo);
307 
308    if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
309       for (uint32_t i = 0; i < pool->query_count; i++) {
310          kperfmon_destroy(device, pool, i);
311          vk_sync_destroy(&device->vk, pool->queries[i].perf.last_job_sync);
312       }
313    }
314 
315    if (pool->queries)
316       vk_free2(&device->vk.alloc, pAllocator, pool->queries);
317 
318    vk_object_free(&device->vk, pAllocator, pool);
319 }
320 
/* Store one query result into the caller-provided output buffer at the
 * given slot, as either a 64-bit or (truncated) 32-bit value depending on
 * the result width the application requested.
 */
static void
write_to_buffer(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
{
   if (do_64bit)
      ((uint64_t *)dst)[idx] = value;
   else
      ((uint32_t *)dst)[idx] = (uint32_t)value;
}
332 
333 static VkResult
query_wait_available(struct v3dv_device *device, struct v3dv_query *q, VkQueryType query_type)334 query_wait_available(struct v3dv_device *device,
335                      struct v3dv_query *q,
336                      VkQueryType query_type)
337 {
338    if (!q->maybe_available) {
339       struct timespec timeout;
340       timespec_get(&timeout, TIME_UTC);
341       timespec_add_msec(&timeout, &timeout, 2000);
342 
343       VkResult result = VK_SUCCESS;
344 
345       mtx_lock(&device->query_mutex);
346       while (!q->maybe_available) {
347          if (vk_device_is_lost(&device->vk)) {
348             result = VK_ERROR_DEVICE_LOST;
349             break;
350          }
351 
352          int ret = cnd_timedwait(&device->query_ended,
353                                  &device->query_mutex,
354                                  &timeout);
355          if (ret != thrd_success) {
356             mtx_unlock(&device->query_mutex);
357             result = vk_device_set_lost(&device->vk, "Query wait failed");
358             break;
359          }
360       }
361       mtx_unlock(&device->query_mutex);
362 
363       if (result != VK_SUCCESS)
364          return result;
365    }
366 
367    if (query_type == VK_QUERY_TYPE_OCCLUSION &&
368        !v3dv_bo_wait(device, q->bo, 0xffffffffffffffffull))
369       return vk_device_set_lost(&device->vk, "Query BO wait failed: %m");
370 
371    if (query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
372        vk_sync_wait(&device->vk, q->perf.last_job_sync,
373                     0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS)
374       return vk_device_set_lost(&device->vk, "Query job wait failed");
375 
376    return VK_SUCCESS;
377 }
378 
379 static VkResult
write_occlusion_query_result(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t query, bool do_64bit, void *data, uint32_t slot)380 write_occlusion_query_result(struct v3dv_device *device,
381                              struct v3dv_query_pool *pool,
382                              uint32_t query,
383                              bool do_64bit,
384                              void *data,
385                              uint32_t slot)
386 {
387    assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION);
388 
389    if (vk_device_is_lost(&device->vk))
390       return VK_ERROR_DEVICE_LOST;
391 
392    struct v3dv_query *q = &pool->queries[query];
393    assert(q->bo && q->bo->map);
394 
395    const uint8_t *query_addr = ((uint8_t *) q->bo->map) + q->offset;
396    write_to_buffer(data, slot, do_64bit, (uint64_t) *((uint32_t *)query_addr));
397    return VK_SUCCESS;
398 }
399 
400 static VkResult
write_timestamp_query_result(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t query, bool do_64bit, void *data, uint32_t slot)401 write_timestamp_query_result(struct v3dv_device *device,
402                              struct v3dv_query_pool *pool,
403                              uint32_t query,
404                              bool do_64bit,
405                              void *data,
406                              uint32_t slot)
407 {
408    assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
409 
410    struct v3dv_query *q = &pool->queries[query];
411 
412    write_to_buffer(data, slot, do_64bit, q->value);
413    return VK_SUCCESS;
414 }
415 
416 static VkResult
write_performance_query_result(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t query, bool do_64bit, void *data, uint32_t slot)417 write_performance_query_result(struct v3dv_device *device,
418                                struct v3dv_query_pool *pool,
419                                uint32_t query,
420                                bool do_64bit,
421                                void *data,
422                                uint32_t slot)
423 {
424    assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
425 
426    struct v3dv_query *q = &pool->queries[query];
427    uint64_t counter_values[V3D_PERFCNT_NUM];
428 
429    for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
430       struct drm_v3d_perfmon_get_values req = {
431          .id = q->perf.kperfmon_ids[i],
432          .values_ptr = (uintptr_t)(&counter_values[i *
433                                    DRM_V3D_MAX_PERF_COUNTERS])
434       };
435 
436       int ret = v3dv_ioctl(device->pdevice->render_fd,
437                            DRM_IOCTL_V3D_PERFMON_GET_VALUES,
438                            &req);
439 
440       if (ret) {
441          fprintf(stderr, "failed to get perfmon values: %s\n", strerror(ret));
442          return vk_error(device, VK_ERROR_DEVICE_LOST);
443       }
444    }
445 
446    for (uint32_t i = 0; i < pool->perfmon.ncounters; i++)
447       write_to_buffer(data, slot + i, do_64bit, counter_values[i]);
448 
449    return VK_SUCCESS;
450 }
451 
452 static VkResult
query_check_available(struct v3dv_device *device, struct v3dv_query *q, VkQueryType query_type)453 query_check_available(struct v3dv_device *device,
454                       struct v3dv_query *q,
455                       VkQueryType query_type)
456 {
457    if (!q->maybe_available)
458       return VK_NOT_READY;
459 
460    if (query_type == VK_QUERY_TYPE_OCCLUSION &&
461        !v3dv_bo_wait(device, q->bo, 0))
462       return VK_NOT_READY;
463 
464    if (query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
465        vk_sync_wait(&device->vk, q->perf.last_job_sync,
466                     0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS)
467       return VK_NOT_READY;
468 
469    return VK_SUCCESS;
470 }
471 
472 static VkResult
write_query_result(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t query, bool do_64bit, void *data, uint32_t slot)473 write_query_result(struct v3dv_device *device,
474                    struct v3dv_query_pool *pool,
475                    uint32_t query,
476                    bool do_64bit,
477                    void *data,
478                    uint32_t slot)
479 {
480    switch (pool->query_type) {
481    case VK_QUERY_TYPE_OCCLUSION:
482       return write_occlusion_query_result(device, pool, query, do_64bit,
483                                           data, slot);
484    case VK_QUERY_TYPE_TIMESTAMP:
485       return write_timestamp_query_result(device, pool, query, do_64bit,
486                                           data, slot);
487    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
488       return write_performance_query_result(device, pool, query, do_64bit,
489                                             data, slot);
490    default:
491       unreachable("Unsupported query type");
492    }
493 }
494 
495 static VkResult
query_is_available(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t query, bool do_wait, bool *available)496 query_is_available(struct v3dv_device *device,
497                    struct v3dv_query_pool *pool,
498                    uint32_t query,
499                    bool do_wait,
500                    bool *available)
501 {
502    struct v3dv_query *q = &pool->queries[query];
503 
504    assert(pool->query_type != VK_QUERY_TYPE_OCCLUSION ||
505           (q->bo && q->bo->map));
506 
507    if (do_wait) {
508       VkResult result = query_wait_available(device, q, pool->query_type);
509       if (result != VK_SUCCESS) {
510          *available = false;
511          return result;
512       }
513 
514       *available = true;
515    } else {
516       VkResult result = query_check_available(device, q, pool->query_type);
517       assert(result == VK_SUCCESS || result == VK_NOT_READY);
518       *available = (result == VK_SUCCESS);
519    }
520 
521    return VK_SUCCESS;
522 }
523 
524 static uint32_t
get_query_result_count(struct v3dv_query_pool *pool)525 get_query_result_count(struct v3dv_query_pool *pool)
526 {
527    switch (pool->query_type) {
528    case VK_QUERY_TYPE_OCCLUSION:
529    case VK_QUERY_TYPE_TIMESTAMP:
530       return 1;
531    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
532       return pool->perfmon.ncounters;
533    default:
534       unreachable("Unsupported query type");
535    }
536 }
537 
/* Core of vkGetQueryPoolResults: gather results for [first, first+count)
 * into 'data', honoring the WAIT/PARTIAL/WITH_AVAILABILITY/64_BIT flags.
 * Returns VK_NOT_READY if any unavailable query was skipped, and
 * VK_ERROR_DEVICE_LOST takes precedence over VK_NOT_READY.
 */
VkResult
v3dv_get_query_pool_results(struct v3dv_device *device,
                            struct v3dv_query_pool *pool,
                            uint32_t first,
                            uint32_t count,
                            void *data,
                            VkDeviceSize stride,
                            VkQueryResultFlags flags)
{
   assert(first < pool->query_count);
   assert(first + count <= pool->query_count);
   assert(data);

   /* Performance query results are always 64-bit regardless of flags */
   const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT ||
      pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR;
   const bool do_wait = flags & VK_QUERY_RESULT_WAIT_BIT;
   const bool do_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;

   /* Elements written per query (1, or ncounters for perf queries) */
   uint32_t result_count = get_query_result_count(pool);

   VkResult result = VK_SUCCESS;
   for (uint32_t i = first; i < first + count; i++) {
      bool available = false;
      /* With do_wait this blocks; device-lost overrides any later
       * VK_NOT_READY we might accumulate.
       */
      VkResult query_result =
         query_is_available(device, pool, i, do_wait, &available);
      if (query_result == VK_ERROR_DEVICE_LOST)
         result = VK_ERROR_DEVICE_LOST;

      /**
       * From the Vulkan 1.0 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *     both not set then no result values are written to pData for queries
       *     that are in the unavailable state at the time of the call, and
       *     vkGetQueryPoolResults returns VK_NOT_READY. However, availability
       *     state is still written to pData for those queries if
       *     VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      uint32_t slot = 0;

      const bool write_result = available || do_partial;
      if (write_result)
         write_query_result(device, pool, i, do_64bit, data, slot);
      /* The availability slot follows the result slots even when the
       * results themselves were not written.
       */
      slot += result_count;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         write_to_buffer(data, slot++, do_64bit, available ? 1u : 0u);

      if (!write_result && result != VK_ERROR_DEVICE_LOST)
         result = VK_NOT_READY;

      /* NOTE(review): arithmetic on void* is a GNU C extension (treated as
       * byte-sized), which Mesa builds rely on.
       */
      data += stride;
   }

   return result;
}
594 
595 VKAPI_ATTR VkResult VKAPI_CALL
v3dv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount, size_t dataSize, void *pData, VkDeviceSize stride, VkQueryResultFlags flags)596 v3dv_GetQueryPoolResults(VkDevice _device,
597                          VkQueryPool queryPool,
598                          uint32_t firstQuery,
599                          uint32_t queryCount,
600                          size_t dataSize,
601                          void *pData,
602                          VkDeviceSize stride,
603                          VkQueryResultFlags flags)
604 {
605    V3DV_FROM_HANDLE(v3dv_device, device, _device);
606    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
607 
608    return v3dv_get_query_pool_results(device, pool, firstQuery, queryCount,
609                                       pData, stride, flags);
610 }
611 
612 VKAPI_ATTR void VKAPI_CALL
v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount)613 v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer,
614                        VkQueryPool queryPool,
615                        uint32_t firstQuery,
616                        uint32_t queryCount)
617 {
618    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
619    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
620 
621    v3dv_cmd_buffer_reset_queries(cmd_buffer, pool, firstQuery, queryCount);
622 }
623 
624 VKAPI_ATTR void VKAPI_CALL
v3dv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize stride, VkQueryResultFlags flags)625 v3dv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
626                              VkQueryPool queryPool,
627                              uint32_t firstQuery,
628                              uint32_t queryCount,
629                              VkBuffer dstBuffer,
630                              VkDeviceSize dstOffset,
631                              VkDeviceSize stride,
632                              VkQueryResultFlags flags)
633 {
634    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
635    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
636    V3DV_FROM_HANDLE(v3dv_buffer, dst, dstBuffer);
637 
638    v3dv_cmd_buffer_copy_query_results(cmd_buffer, pool,
639                                       firstQuery, queryCount,
640                                       dst, dstOffset, stride, flags);
641 }
642 
643 VKAPI_ATTR void VKAPI_CALL
v3dv_CmdBeginQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query, VkQueryControlFlags flags)644 v3dv_CmdBeginQuery(VkCommandBuffer commandBuffer,
645                    VkQueryPool queryPool,
646                    uint32_t query,
647                    VkQueryControlFlags flags)
648 {
649    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
650    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
651 
652    v3dv_cmd_buffer_begin_query(cmd_buffer, pool, query, flags);
653 }
654 
655 VKAPI_ATTR void VKAPI_CALL
v3dv_CmdEndQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query)656 v3dv_CmdEndQuery(VkCommandBuffer commandBuffer,
657                  VkQueryPool queryPool,
658                  uint32_t query)
659 {
660    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
661    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
662 
663    v3dv_cmd_buffer_end_query(cmd_buffer, pool, query);
664 }
665 
666 void
v3dv_reset_query_pools(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t first, uint32_t count)667 v3dv_reset_query_pools(struct v3dv_device *device,
668                        struct v3dv_query_pool *pool,
669                        uint32_t first,
670                        uint32_t count)
671 {
672    mtx_lock(&device->query_mutex);
673 
674    for (uint32_t i = first; i < first + count; i++) {
675       assert(i < pool->query_count);
676       struct v3dv_query *q = &pool->queries[i];
677       q->maybe_available = false;
678       switch (pool->query_type) {
679       case VK_QUERY_TYPE_OCCLUSION: {
680          const uint8_t *q_addr = ((uint8_t *) q->bo->map) + q->offset;
681          uint32_t *counter = (uint32_t *) q_addr;
682          *counter = 0;
683          break;
684       }
685       case VK_QUERY_TYPE_TIMESTAMP:
686          q->value = 0;
687          break;
688       case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
689          kperfmon_destroy(device, pool, i);
690          kperfmon_create(device, pool, i);
691          if (vk_sync_reset(&device->vk, q->perf.last_job_sync) != VK_SUCCESS)
692             fprintf(stderr, "Failed to reset sync");
693          break;
694       default:
695          unreachable("Unsupported query type");
696       }
697    }
698 
699    mtx_unlock(&device->query_mutex);
700 }
701 
702 VKAPI_ATTR void VKAPI_CALL
v3dv_ResetQueryPool(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount)703 v3dv_ResetQueryPool(VkDevice _device,
704                     VkQueryPool queryPool,
705                     uint32_t firstQuery,
706                     uint32_t queryCount)
707 {
708    V3DV_FROM_HANDLE(v3dv_device, device, _device);
709    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
710 
711    v3dv_reset_query_pools(device, pool, firstQuery, queryCount);
712 }
713 
714 VKAPI_ATTR VkResult VKAPI_CALL
v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, uint32_t *pCounterCount, VkPerformanceCounterKHR *pCounters, VkPerformanceCounterDescriptionKHR *pCounterDescriptions)715 v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
716    VkPhysicalDevice physicalDevice,
717    uint32_t queueFamilyIndex,
718    uint32_t *pCounterCount,
719    VkPerformanceCounterKHR *pCounters,
720    VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
721 {
722    uint32_t desc_count = *pCounterCount;
723 
724    VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
725                           out, pCounters, pCounterCount);
726    VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
727                           out_desc, pCounterDescriptions, &desc_count);
728 
729    for (int i = 0; i < ARRAY_SIZE(v3dv_counters); i++) {
730       vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
731          counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
732          counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
733          counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
734 
735          unsigned char sha1_result[20];
736          _mesa_sha1_compute(v3dv_counters[i][1], strlen(v3dv_counters[i][1]),
737                             sha1_result);
738 
739          memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
740       }
741 
742       vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
743                                &out_desc, desc) {
744          desc->flags = 0;
745          snprintf(desc->name, sizeof(desc->name), "%s",
746             v3dv_counters[i][1]);
747          snprintf(desc->category, sizeof(desc->category), "%s",
748             v3dv_counters[i][0]);
749          snprintf(desc->description, sizeof(desc->description), "%s",
750             v3dv_counters[i][2]);
751       }
752    }
753 
754    return vk_outarray_status(&out);
755 }
756 
757 VKAPI_ATTR void VKAPI_CALL
v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR( VkPhysicalDevice physicalDevice, const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo, uint32_t *pNumPasses)758 v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
759    VkPhysicalDevice physicalDevice,
760    const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo,
761    uint32_t *pNumPasses)
762 {
763    *pNumPasses = DIV_ROUND_UP(pPerformanceQueryCreateInfo->counterIndexCount,
764                               DRM_V3D_MAX_PERF_COUNTERS);
765 }
766 
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_AcquireProfilingLockKHR(
   VkDevice _device,
   const VkAcquireProfilingLockInfoKHR *pInfo)
{
   /* No driver-side state needs to be locked for profiling here, so this
    * always succeeds.
    */
   return VK_SUCCESS;
}
774 
VKAPI_ATTR void VKAPI_CALL
v3dv_ReleaseProfilingLockKHR(VkDevice device)
{
   /* Nothing to release: AcquireProfilingLockKHR is a no-op in this driver. */
}
779