1 /*
2 * Copyright © 2020 Raspberry Pi Ltd
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "v3dv_private.h"
25
26 #include "util/timespec.h"
27
/* Performance counter table: { category, name, description } per counter.
 *
 * NOTE(review): the row index is used directly as the counter id passed to
 * DRM_IOCTL_V3D_PERFMON_CREATE, so the ordering must match the kernel v3d
 * perfmon counter enumeration — confirm against the v3d UAPI before
 * reordering or inserting rows.
 *
 * NOTE(review): "PTB-primitives-discared-reversed" contains a typo
 * ("discared"), but the name is exposed to applications through
 * VkPerformanceCounterDescriptionKHR; renaming it would break consumers
 * that match counters by name — confirm before fixing.
 */
static const char *v3dv_counters[][3] = {
   {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"},
   {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"},
   {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"},
   {"FEP", "FEP-valid-quads", "[FEP] Valid quads"},
   {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"},
   {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"},
   {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"},
   {"TLB", "TLB-quads-with-zero-coverage", "[TLB] Quads with all pixels having zero coverage"},
   {"TLB", "TLB-quads-with-non-zero-coverage", "[TLB] Quads with any pixels having non-zero coverage"},
   {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"},
   {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"},
   {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"},
   {"PTB", "PTB-primitives-discared-reversed", "[PTB] Primitives that are discarded because they are reversed"},
   {"QPU", "QPU-total-idle-clk-cycles", "[QPU] Total idle clock cycles for all QPUs"},
   {"QPU", "QPU-total-active-clk-cycles-vertex-coord-shading", "[QPU] Total active clock cycles for all QPUs doing vertex/coordinate/user shading (counts only when QPU is not stalled)"},
   {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"},
   {"QPU", "QPU-total-clk-cycles-executing-valid-instr", "[QPU] Total clock cycles for all QPUs executing valid instructions"},
   {"QPU", "QPU-total-clk-cycles-waiting-TMU", "[QPU] Total clock cycles for all QPUs stalled waiting for TMUs only (counter won't increment if QPU also stalling for another reason)"},
   {"QPU", "QPU-total-clk-cycles-waiting-scoreboard", "[QPU] Total clock cycles for all QPUs stalled waiting for Scoreboard only (counter won't increment if QPU also stalling for another reason)"},
   {"QPU", "QPU-total-clk-cycles-waiting-varyings", "[QPU] Total clock cycles for all QPUs stalled waiting for Varyings only (counter won't increment if QPU also stalling for another reason)"},
   {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"},
   {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"},
   {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"},
   {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"},
   {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"},
   {"TMU", "TMU-total-text-cache-miss", "[TMU] Total texture cache misses (number of fetches from memory/L2cache)"},
   {"VPM", "VPM-total-clk-cycles-VDW-stalled", "[VPM] Total clock cycles VDW is stalled waiting for VPM access"},
   {"VPM", "VPM-total-clk-cycles-VCD-stalled", "[VPM] Total clock cycles VCD is stalled waiting for VPM access"},
   {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"},
   {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"},
   {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"},
   {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"},
   {"CORE", "cycle-count", "[CORE] Cycle counter"},
   {"QPU", "QPU-total-clk-cycles-waiting-vertex-coord-shading", "[QPU] Total stalled clock cycles for all QPUs doing vertex/coordinate/user shading"},
   {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"},
   {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"},
   {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"},
   {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"},
   {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"},
   {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"},
   {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"},
   {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"},
   {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"},
   {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"},
   {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"},
   {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"},
   {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"},
   {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"},
   {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"},
   {"TMU", "TMU-total-config-access", "[TMU] Total config accesses"},
   {"L2T", "L2T-no-id-stalled", "[L2T] No ID stall"},
   {"L2T", "L2T-command-queue-stalled", "[L2T] Command queue full stall"},
   {"L2T", "L2T-TMU-writes", "[L2T] TMU write accesses"},
   {"TMU", "TMU-active-cycles", "[TMU] Active cycles"},
   {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"},
   {"CLE", "CLE-thread-active-cycles", "[CLE] Bin or render thread active cycles"},
   {"L2T", "L2T-TMU-reads", "[L2T] TMU read accesses"},
   {"L2T", "L2T-CLE-reads", "[L2T] CLE read accesses"},
   {"L2T", "L2T-VCD-reads", "[L2T] VCD read accesses"},
   {"L2T", "L2T-TMU-config-reads", "[L2T] TMU CFG read accesses"},
   {"L2T", "L2T-SLC0-reads", "[L2T] SLC0 read accesses"},
   {"L2T", "L2T-SLC1-reads", "[L2T] SLC1 read accesses"},
   {"L2T", "L2T-SLC2-reads", "[L2T] SLC2 read accesses"},
   {"L2T", "L2T-TMU-write-miss", "[L2T] TMU write misses"},
   {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"},
   {"L2T", "L2T-CLE-read-miss", "[L2T] CLE read misses"},
   {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"},
   {"L2T", "L2T-TMU-config-read-miss", "[L2T] TMU CFG read misses"},
   {"L2T", "L2T-SLC0-read-miss", "[L2T] SLC0 read misses"},
   {"L2T", "L2T-SLC1-read-miss", "[L2T] SLC1 read misses"},
   {"L2T", "L2T-SLC2-read-miss", "[L2T] SLC2 read misses"},
   {"CORE", "core-memory-writes", "[CORE] Total memory writes"},
   {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"},
   {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"},
   {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"},
   {"CORE", "core-memory-reads", "[CORE] Total memory reads"},
   {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"},
   {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"},
   {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"},
   {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"},
   {"GMP", "GMP-memory-reads", "[GMP] Total memory reads"},
   {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"},
   {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"},
   {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"},
   {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"},
   {"TMU", "TMU-MRU-hits", "[TMU] Total MRU hits"},
   {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"},
};
117
118 static void
kperfmon_create(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t query)119 kperfmon_create(struct v3dv_device *device,
120 struct v3dv_query_pool *pool,
121 uint32_t query)
122 {
123 for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
124 assert(i * DRM_V3D_MAX_PERF_COUNTERS < pool->perfmon.ncounters);
125
126 struct drm_v3d_perfmon_create req = {
127 .ncounters = MIN2(pool->perfmon.ncounters -
128 i * DRM_V3D_MAX_PERF_COUNTERS,
129 DRM_V3D_MAX_PERF_COUNTERS),
130 };
131 memcpy(req.counters,
132 &pool->perfmon.counters[i * DRM_V3D_MAX_PERF_COUNTERS],
133 req.ncounters);
134
135 int ret = v3dv_ioctl(device->pdevice->render_fd,
136 DRM_IOCTL_V3D_PERFMON_CREATE,
137 &req);
138 if (ret)
139 fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret));
140
141 pool->queries[query].perf.kperfmon_ids[i] = req.id;
142 }
143 }
144
145 static void
kperfmon_destroy(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t query)146 kperfmon_destroy(struct v3dv_device *device,
147 struct v3dv_query_pool *pool,
148 uint32_t query)
149 {
150 /* Skip destroying if never created */
151 if (!pool->queries[query].perf.kperfmon_ids[0])
152 return;
153
154 for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
155 struct drm_v3d_perfmon_destroy req = {
156 .id = pool->queries[query].perf.kperfmon_ids[i]
157 };
158
159 int ret = v3dv_ioctl(device->pdevice->render_fd,
160 DRM_IOCTL_V3D_PERFMON_DESTROY,
161 &req);
162
163 if (ret) {
164 fprintf(stderr, "Failed to destroy perfmon %u: %s\n",
165 req.id, strerror(ret));
166 }
167 }
168 }
169
/* vkCreateQueryPool implementation.
 *
 * Allocates the pool object and its per-query array, then performs the
 * per-type setup:
 *  - OCCLUSION: one shared BO holds all counters; groups of 16 queries are
 *    packed in consecutive 4-byte slots, each group 1024-byte aligned.
 *  - PERFORMANCE: copies the requested counter indices and computes how many
 *    kernel perfmons are needed (kperfmons themselves are created lazily at
 *    reset time).
 *  - TIMESTAMP: values are stored CPU-side per query; nothing extra needed.
 *
 * On failure, everything allocated so far (syncs, BO, query array, pool) is
 * released before returning the error.
 */
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_CreateQueryPool(VkDevice _device,
                     const VkQueryPoolCreateInfo *pCreateInfo,
                     const VkAllocationCallbacks *pAllocator,
                     VkQueryPool *pQueryPool)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);

   assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION ||
          pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP ||
          pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
   assert(pCreateInfo->queryCount > 0);

   /* zalloc so the fail path can safely test pool->bo / pool->queries. */
   struct v3dv_query_pool *pool =
      vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool),
                       VK_OBJECT_TYPE_QUERY_POOL);
   if (pool == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->query_type = pCreateInfo->queryType;
   pool->query_count = pCreateInfo->queryCount;

   /* query_idx tracks how many perf syncs were created, so the fail path
    * knows how many to destroy.
    */
   uint32_t query_idx = 0;
   VkResult result;

   const uint32_t pool_bytes = sizeof(struct v3dv_query) * pool->query_count;
   pool->queries = vk_alloc2(&device->vk.alloc, pAllocator, pool_bytes, 8,
                             VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool->queries == NULL) {
      result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto fail;
   }

   switch (pool->query_type) {
   case VK_QUERY_TYPE_OCCLUSION: {
      /* The hardware allows us to setup groups of 16 queries in consecutive
       * 4-byte addresses, requiring only that each group of 16 queries is
       * aligned to a 1024 byte boundary.
       */
      const uint32_t query_groups = DIV_ROUND_UP(pool->query_count, 16);
      const uint32_t bo_size = query_groups * 1024;
      pool->bo = v3dv_bo_alloc(device, bo_size, "query", true);
      if (!pool->bo) {
         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
         goto fail;
      }
      /* Map up front: results are read on the CPU in GetQueryPoolResults. */
      if (!v3dv_bo_map(device, pool->bo, bo_size)) {
         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
         goto fail;
      }
      break;
   }
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      const VkQueryPoolPerformanceCreateInfoKHR *pq_info =
         vk_find_struct_const(pCreateInfo->pNext,
                              QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);

      assert(pq_info);
      assert(pq_info->counterIndexCount <= V3D_PERFCNT_NUM);

      pool->perfmon.ncounters = pq_info->counterIndexCount;
      for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
         pool->perfmon.counters[i] = pq_info->pCounterIndices[i];

      /* Each kernel perfmon can track DRM_V3D_MAX_PERF_COUNTERS counters. */
      pool->perfmon.nperfmons = DIV_ROUND_UP(pool->perfmon.ncounters,
                                             DRM_V3D_MAX_PERF_COUNTERS);

      assert(pool->perfmon.nperfmons <= V3DV_MAX_PERFMONS);
      break;
   }
   case VK_QUERY_TYPE_TIMESTAMP:
      break;
   default:
      unreachable("Unsupported query type");
   }

   /* Per-query initialization. */
   for (; query_idx < pool->query_count; query_idx++) {
      pool->queries[query_idx].maybe_available = false;
      switch (pool->query_type) {
      case VK_QUERY_TYPE_OCCLUSION: {
         const uint32_t query_group = query_idx / 16;
         /* 4-byte slot inside the 1024-byte aligned group. */
         const uint32_t query_offset = query_group * 1024 + (query_idx % 16) * 4;
         pool->queries[query_idx].bo = pool->bo;
         pool->queries[query_idx].offset = query_offset;
         break;
      }
      case VK_QUERY_TYPE_TIMESTAMP:
         pool->queries[query_idx].value = 0;
         break;
      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
         /* Sync used to wait for the last job that used this query. */
         result = vk_sync_create(&device->vk,
                                 &device->pdevice->drm_syncobj_type, 0, 0,
                                 &pool->queries[query_idx].perf.last_job_sync);
         if (result != VK_SUCCESS)
            goto fail;

         for (uint32_t j = 0; j < pool->perfmon.nperfmons; j++)
            pool->queries[query_idx].perf.kperfmon_ids[j] = 0;
         break;
      }
      default:
         unreachable("Unsupported query type");
      }
   }

   *pQueryPool = v3dv_query_pool_to_handle(pool);

   return VK_SUCCESS;

fail:
   /* Only syncs for queries [0, query_idx) were created successfully. */
   if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      for (uint32_t j = 0; j < query_idx; j++)
         vk_sync_destroy(&device->vk, pool->queries[j].perf.last_job_sync);
   }

   if (pool->bo)
      v3dv_bo_free(device, pool->bo);
   if (pool->queries)
      vk_free2(&device->vk.alloc, pAllocator, pool->queries);
   vk_object_free(&device->vk, pAllocator, pool);

   return result;
}
293
294 VKAPI_ATTR void VKAPI_CALL
v3dv_DestroyQueryPool(VkDevice _device, VkQueryPool queryPool, const VkAllocationCallbacks *pAllocator)295 v3dv_DestroyQueryPool(VkDevice _device,
296 VkQueryPool queryPool,
297 const VkAllocationCallbacks *pAllocator)
298 {
299 V3DV_FROM_HANDLE(v3dv_device, device, _device);
300 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
301
302 if (!pool)
303 return;
304
305 if (pool->bo)
306 v3dv_bo_free(device, pool->bo);
307
308 if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
309 for (uint32_t i = 0; i < pool->query_count; i++) {
310 kperfmon_destroy(device, pool, i);
311 vk_sync_destroy(&device->vk, pool->queries[i].perf.last_job_sync);
312 }
313 }
314
315 if (pool->queries)
316 vk_free2(&device->vk.alloc, pAllocator, pool->queries);
317
318 vk_object_free(&device->vk, pAllocator, pool);
319 }
320
/* Store a query result into slot 'idx' of the output buffer, either as a
 * full 64-bit value or truncated to 32 bits, as requested by the caller.
 */
static void
write_to_buffer(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
{
   if (!do_64bit) {
      ((uint32_t *) dst)[idx] = (uint32_t) value;
      return;
   }
   ((uint64_t *) dst)[idx] = value;
}
332
333 static VkResult
query_wait_available(struct v3dv_device *device, struct v3dv_query *q, VkQueryType query_type)334 query_wait_available(struct v3dv_device *device,
335 struct v3dv_query *q,
336 VkQueryType query_type)
337 {
338 if (!q->maybe_available) {
339 struct timespec timeout;
340 timespec_get(&timeout, TIME_UTC);
341 timespec_add_msec(&timeout, &timeout, 2000);
342
343 VkResult result = VK_SUCCESS;
344
345 mtx_lock(&device->query_mutex);
346 while (!q->maybe_available) {
347 if (vk_device_is_lost(&device->vk)) {
348 result = VK_ERROR_DEVICE_LOST;
349 break;
350 }
351
352 int ret = cnd_timedwait(&device->query_ended,
353 &device->query_mutex,
354 &timeout);
355 if (ret != thrd_success) {
356 mtx_unlock(&device->query_mutex);
357 result = vk_device_set_lost(&device->vk, "Query wait failed");
358 break;
359 }
360 }
361 mtx_unlock(&device->query_mutex);
362
363 if (result != VK_SUCCESS)
364 return result;
365 }
366
367 if (query_type == VK_QUERY_TYPE_OCCLUSION &&
368 !v3dv_bo_wait(device, q->bo, 0xffffffffffffffffull))
369 return vk_device_set_lost(&device->vk, "Query BO wait failed: %m");
370
371 if (query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
372 vk_sync_wait(&device->vk, q->perf.last_job_sync,
373 0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS)
374 return vk_device_set_lost(&device->vk, "Query job wait failed");
375
376 return VK_SUCCESS;
377 }
378
379 static VkResult
write_occlusion_query_result(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t query, bool do_64bit, void *data, uint32_t slot)380 write_occlusion_query_result(struct v3dv_device *device,
381 struct v3dv_query_pool *pool,
382 uint32_t query,
383 bool do_64bit,
384 void *data,
385 uint32_t slot)
386 {
387 assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION);
388
389 if (vk_device_is_lost(&device->vk))
390 return VK_ERROR_DEVICE_LOST;
391
392 struct v3dv_query *q = &pool->queries[query];
393 assert(q->bo && q->bo->map);
394
395 const uint8_t *query_addr = ((uint8_t *) q->bo->map) + q->offset;
396 write_to_buffer(data, slot, do_64bit, (uint64_t) *((uint32_t *)query_addr));
397 return VK_SUCCESS;
398 }
399
400 static VkResult
write_timestamp_query_result(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t query, bool do_64bit, void *data, uint32_t slot)401 write_timestamp_query_result(struct v3dv_device *device,
402 struct v3dv_query_pool *pool,
403 uint32_t query,
404 bool do_64bit,
405 void *data,
406 uint32_t slot)
407 {
408 assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
409
410 struct v3dv_query *q = &pool->queries[query];
411
412 write_to_buffer(data, slot, do_64bit, q->value);
413 return VK_SUCCESS;
414 }
415
/* Read back all counter values for one performance query and write them to
 * the output buffer starting at 'slot' (one slot per enabled counter).
 *
 * The values are fetched from the kernel with one GET_VALUES ioctl per
 * perfmon; each perfmon contributes up to DRM_V3D_MAX_PERF_COUNTERS values,
 * laid out contiguously in counter_values in perfmon order, matching the
 * counter order recorded at pool creation.
 */
static VkResult
write_performance_query_result(struct v3dv_device *device,
                               struct v3dv_query_pool *pool,
                               uint32_t query,
                               bool do_64bit,
                               void *data,
                               uint32_t slot)
{
   assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);

   struct v3dv_query *q = &pool->queries[query];
   uint64_t counter_values[V3D_PERFCNT_NUM];

   for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
      struct drm_v3d_perfmon_get_values req = {
         .id = q->perf.kperfmon_ids[i],
         /* The kernel writes this perfmon's values into its slice of the
          * counter_values array.
          */
         .values_ptr = (uintptr_t)(&counter_values[i *
                                   DRM_V3D_MAX_PERF_COUNTERS])
      };

      int ret = v3dv_ioctl(device->pdevice->render_fd,
                           DRM_IOCTL_V3D_PERFMON_GET_VALUES,
                           &req);

      if (ret) {
         fprintf(stderr, "failed to get perfmon values: %s\n", strerror(ret));
         return vk_error(device, VK_ERROR_DEVICE_LOST);
      }
   }

   /* Performance counters are always returned as 64-bit values
    * (do_64bit is forced on by the caller for this query type).
    */
   for (uint32_t i = 0; i < pool->perfmon.ncounters; i++)
      write_to_buffer(data, slot + i, do_64bit, counter_values[i]);

   return VK_SUCCESS;
}
451
452 static VkResult
query_check_available(struct v3dv_device *device, struct v3dv_query *q, VkQueryType query_type)453 query_check_available(struct v3dv_device *device,
454 struct v3dv_query *q,
455 VkQueryType query_type)
456 {
457 if (!q->maybe_available)
458 return VK_NOT_READY;
459
460 if (query_type == VK_QUERY_TYPE_OCCLUSION &&
461 !v3dv_bo_wait(device, q->bo, 0))
462 return VK_NOT_READY;
463
464 if (query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
465 vk_sync_wait(&device->vk, q->perf.last_job_sync,
466 0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS)
467 return VK_NOT_READY;
468
469 return VK_SUCCESS;
470 }
471
472 static VkResult
write_query_result(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t query, bool do_64bit, void *data, uint32_t slot)473 write_query_result(struct v3dv_device *device,
474 struct v3dv_query_pool *pool,
475 uint32_t query,
476 bool do_64bit,
477 void *data,
478 uint32_t slot)
479 {
480 switch (pool->query_type) {
481 case VK_QUERY_TYPE_OCCLUSION:
482 return write_occlusion_query_result(device, pool, query, do_64bit,
483 data, slot);
484 case VK_QUERY_TYPE_TIMESTAMP:
485 return write_timestamp_query_result(device, pool, query, do_64bit,
486 data, slot);
487 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
488 return write_performance_query_result(device, pool, query, do_64bit,
489 data, slot);
490 default:
491 unreachable("Unsupported query type");
492 }
493 }
494
495 static VkResult
query_is_available(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t query, bool do_wait, bool *available)496 query_is_available(struct v3dv_device *device,
497 struct v3dv_query_pool *pool,
498 uint32_t query,
499 bool do_wait,
500 bool *available)
501 {
502 struct v3dv_query *q = &pool->queries[query];
503
504 assert(pool->query_type != VK_QUERY_TYPE_OCCLUSION ||
505 (q->bo && q->bo->map));
506
507 if (do_wait) {
508 VkResult result = query_wait_available(device, q, pool->query_type);
509 if (result != VK_SUCCESS) {
510 *available = false;
511 return result;
512 }
513
514 *available = true;
515 } else {
516 VkResult result = query_check_available(device, q, pool->query_type);
517 assert(result == VK_SUCCESS || result == VK_NOT_READY);
518 *available = (result == VK_SUCCESS);
519 }
520
521 return VK_SUCCESS;
522 }
523
524 static uint32_t
get_query_result_count(struct v3dv_query_pool *pool)525 get_query_result_count(struct v3dv_query_pool *pool)
526 {
527 switch (pool->query_type) {
528 case VK_QUERY_TYPE_OCCLUSION:
529 case VK_QUERY_TYPE_TIMESTAMP:
530 return 1;
531 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
532 return pool->perfmon.ncounters;
533 default:
534 unreachable("Unsupported query type");
535 }
536 }
537
538 VkResult
v3dv_get_query_pool_results(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t first, uint32_t count, void *data, VkDeviceSize stride, VkQueryResultFlags flags)539 v3dv_get_query_pool_results(struct v3dv_device *device,
540 struct v3dv_query_pool *pool,
541 uint32_t first,
542 uint32_t count,
543 void *data,
544 VkDeviceSize stride,
545 VkQueryResultFlags flags)
546 {
547 assert(first < pool->query_count);
548 assert(first + count <= pool->query_count);
549 assert(data);
550
551 const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT ||
552 pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR;
553 const bool do_wait = flags & VK_QUERY_RESULT_WAIT_BIT;
554 const bool do_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;
555
556 uint32_t result_count = get_query_result_count(pool);
557
558 VkResult result = VK_SUCCESS;
559 for (uint32_t i = first; i < first + count; i++) {
560 bool available = false;
561 VkResult query_result =
562 query_is_available(device, pool, i, do_wait, &available);
563 if (query_result == VK_ERROR_DEVICE_LOST)
564 result = VK_ERROR_DEVICE_LOST;
565
566 /**
567 * From the Vulkan 1.0 spec:
568 *
569 * "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
570 * both not set then no result values are written to pData for queries
571 * that are in the unavailable state at the time of the call, and
572 * vkGetQueryPoolResults returns VK_NOT_READY. However, availability
573 * state is still written to pData for those queries if
574 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
575 */
576 uint32_t slot = 0;
577
578 const bool write_result = available || do_partial;
579 if (write_result)
580 write_query_result(device, pool, i, do_64bit, data, slot);
581 slot += result_count;
582
583 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
584 write_to_buffer(data, slot++, do_64bit, available ? 1u : 0u);
585
586 if (!write_result && result != VK_ERROR_DEVICE_LOST)
587 result = VK_NOT_READY;
588
589 data += stride;
590 }
591
592 return result;
593 }
594
595 VKAPI_ATTR VkResult VKAPI_CALL
v3dv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount, size_t dataSize, void *pData, VkDeviceSize stride, VkQueryResultFlags flags)596 v3dv_GetQueryPoolResults(VkDevice _device,
597 VkQueryPool queryPool,
598 uint32_t firstQuery,
599 uint32_t queryCount,
600 size_t dataSize,
601 void *pData,
602 VkDeviceSize stride,
603 VkQueryResultFlags flags)
604 {
605 V3DV_FROM_HANDLE(v3dv_device, device, _device);
606 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
607
608 return v3dv_get_query_pool_results(device, pool, firstQuery, queryCount,
609 pData, stride, flags);
610 }
611
612 VKAPI_ATTR void VKAPI_CALL
v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount)613 v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer,
614 VkQueryPool queryPool,
615 uint32_t firstQuery,
616 uint32_t queryCount)
617 {
618 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
619 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
620
621 v3dv_cmd_buffer_reset_queries(cmd_buffer, pool, firstQuery, queryCount);
622 }
623
624 VKAPI_ATTR void VKAPI_CALL
v3dv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize stride, VkQueryResultFlags flags)625 v3dv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
626 VkQueryPool queryPool,
627 uint32_t firstQuery,
628 uint32_t queryCount,
629 VkBuffer dstBuffer,
630 VkDeviceSize dstOffset,
631 VkDeviceSize stride,
632 VkQueryResultFlags flags)
633 {
634 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
635 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
636 V3DV_FROM_HANDLE(v3dv_buffer, dst, dstBuffer);
637
638 v3dv_cmd_buffer_copy_query_results(cmd_buffer, pool,
639 firstQuery, queryCount,
640 dst, dstOffset, stride, flags);
641 }
642
643 VKAPI_ATTR void VKAPI_CALL
v3dv_CmdBeginQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query, VkQueryControlFlags flags)644 v3dv_CmdBeginQuery(VkCommandBuffer commandBuffer,
645 VkQueryPool queryPool,
646 uint32_t query,
647 VkQueryControlFlags flags)
648 {
649 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
650 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
651
652 v3dv_cmd_buffer_begin_query(cmd_buffer, pool, query, flags);
653 }
654
655 VKAPI_ATTR void VKAPI_CALL
v3dv_CmdEndQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query)656 v3dv_CmdEndQuery(VkCommandBuffer commandBuffer,
657 VkQueryPool queryPool,
658 uint32_t query)
659 {
660 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
661 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
662
663 v3dv_cmd_buffer_end_query(cmd_buffer, pool, query);
664 }
665
666 void
v3dv_reset_query_pools(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t first, uint32_t count)667 v3dv_reset_query_pools(struct v3dv_device *device,
668 struct v3dv_query_pool *pool,
669 uint32_t first,
670 uint32_t count)
671 {
672 mtx_lock(&device->query_mutex);
673
674 for (uint32_t i = first; i < first + count; i++) {
675 assert(i < pool->query_count);
676 struct v3dv_query *q = &pool->queries[i];
677 q->maybe_available = false;
678 switch (pool->query_type) {
679 case VK_QUERY_TYPE_OCCLUSION: {
680 const uint8_t *q_addr = ((uint8_t *) q->bo->map) + q->offset;
681 uint32_t *counter = (uint32_t *) q_addr;
682 *counter = 0;
683 break;
684 }
685 case VK_QUERY_TYPE_TIMESTAMP:
686 q->value = 0;
687 break;
688 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
689 kperfmon_destroy(device, pool, i);
690 kperfmon_create(device, pool, i);
691 if (vk_sync_reset(&device->vk, q->perf.last_job_sync) != VK_SUCCESS)
692 fprintf(stderr, "Failed to reset sync");
693 break;
694 default:
695 unreachable("Unsupported query type");
696 }
697 }
698
699 mtx_unlock(&device->query_mutex);
700 }
701
702 VKAPI_ATTR void VKAPI_CALL
v3dv_ResetQueryPool(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount)703 v3dv_ResetQueryPool(VkDevice _device,
704 VkQueryPool queryPool,
705 uint32_t firstQuery,
706 uint32_t queryCount)
707 {
708 V3DV_FROM_HANDLE(v3dv_device, device, _device);
709 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
710
711 v3dv_reset_query_pools(device, pool, firstQuery, queryCount);
712 }
713
714 VKAPI_ATTR VkResult VKAPI_CALL
v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, uint32_t *pCounterCount, VkPerformanceCounterKHR *pCounters, VkPerformanceCounterDescriptionKHR *pCounterDescriptions)715 v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
716 VkPhysicalDevice physicalDevice,
717 uint32_t queueFamilyIndex,
718 uint32_t *pCounterCount,
719 VkPerformanceCounterKHR *pCounters,
720 VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
721 {
722 uint32_t desc_count = *pCounterCount;
723
724 VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
725 out, pCounters, pCounterCount);
726 VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
727 out_desc, pCounterDescriptions, &desc_count);
728
729 for (int i = 0; i < ARRAY_SIZE(v3dv_counters); i++) {
730 vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
731 counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
732 counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
733 counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
734
735 unsigned char sha1_result[20];
736 _mesa_sha1_compute(v3dv_counters[i][1], strlen(v3dv_counters[i][1]),
737 sha1_result);
738
739 memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
740 }
741
742 vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
743 &out_desc, desc) {
744 desc->flags = 0;
745 snprintf(desc->name, sizeof(desc->name), "%s",
746 v3dv_counters[i][1]);
747 snprintf(desc->category, sizeof(desc->category), "%s",
748 v3dv_counters[i][0]);
749 snprintf(desc->description, sizeof(desc->description), "%s",
750 v3dv_counters[i][2]);
751 }
752 }
753
754 return vk_outarray_status(&out);
755 }
756
757 VKAPI_ATTR void VKAPI_CALL
v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR( VkPhysicalDevice physicalDevice, const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo, uint32_t *pNumPasses)758 v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
759 VkPhysicalDevice physicalDevice,
760 const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo,
761 uint32_t *pNumPasses)
762 {
763 *pNumPasses = DIV_ROUND_UP(pPerformanceQueryCreateInfo->counterIndexCount,
764 DRM_V3D_MAX_PERF_COUNTERS);
765 }
766
/* vkAcquireProfilingLockKHR: no-op on this driver — performance counters
 * need no exclusive system-wide lock here, so acquisition always succeeds.
 */
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_AcquireProfilingLockKHR(
   VkDevice _device,
   const VkAcquireProfilingLockInfoKHR *pInfo)
{
   return VK_SUCCESS;
}
774
/* vkReleaseProfilingLockKHR: no-op, matching the no-op acquire above. */
VKAPI_ATTR void VKAPI_CALL
v3dv_ReleaseProfilingLockKHR(VkDevice device)
{
}
779