/*
 * Copyright 2016 Red Hat Inc.
 * SPDX-License-Identifier: MIT
 *
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 */

#include "tu_query.h"

#include <fcntl.h>

#include "nir/nir_builder.h"
#include "util/os_time.h"

#include "vk_util.h"

#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_device.h"

#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5
#define STAT_COUNT ((REG_A6XX_RBBM_PRIMCTR_10_LO - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2 + 1)

struct PACKED query_slot {
   uint64_t available;
};

struct PACKED occlusion_slot_value {
   /* Sample counters seem to be placed with 16-byte alignment
    * even though this query only needs an 8-byte slot. */
   uint64_t value;
   uint64_t _padding;
};

struct PACKED occlusion_query_slot {
   struct query_slot common;
   uint64_t result;

   struct occlusion_slot_value begin;
   struct occlusion_slot_value end;
};

struct PACKED timestamp_query_slot {
   struct query_slot common;
   uint64_t result;
};

struct PACKED primitive_slot_value {
   uint64_t values[2];
};

struct PACKED pipeline_stat_query_slot {
   struct query_slot common;
   uint64_t results[STAT_COUNT];

   uint64_t begin[STAT_COUNT];
   uint64_t end[STAT_COUNT];
};

struct PACKED primitive_query_slot {
   struct query_slot common;
   /* The result of transform feedback queries is two integer values:
    *   results[0] is the count of primitives written,
    *   results[1] is the count of primitives generated.
    * In addition, a begin/end counter pair is stored for each of the four
    * streams (see begin[]/end[] below).
    */
   uint64_t results[2];

   /* Primitive counters also need to be 16-byte aligned. */
   uint64_t _padding;

   struct primitive_slot_value begin[4];
   struct primitive_slot_value end[4];
};

struct PACKED perfcntr_query_slot {
   uint64_t result;
   uint64_t begin;
   uint64_t end;
};

struct PACKED perf_query_slot {
   struct query_slot common;
   struct perfcntr_query_slot perfcntr;
};

struct PACKED primitives_generated_query_slot {
   struct query_slot common;
   uint64_t result;
   uint64_t begin;
   uint64_t end;
};

/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool.
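 * The address is computed relative to the slot base
 * (pool->bo->iova + pool->stride * query) plus the field's offset within
 * the slot struct.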
 */
#define query_iova(type, pool, query, field) \
   pool->bo->iova + pool->stride * (query) + offsetof(type, field)

#define occlusion_query_iova(pool, query, field) \
   query_iova(struct occlusion_query_slot, pool, query, field)

#define pipeline_stat_query_iova(pool, query, field) \
   pool->bo->iova + pool->stride * (query) + \
   offsetof(struct pipeline_stat_query_slot, field)

#define primitive_query_iova(pool, query, field, i) \
   query_iova(struct primitive_query_slot, pool, query, field) + \
   offsetof(struct primitive_slot_value, values[i])

#define perf_query_iova(pool, query, field, i) \
   pool->bo->iova + pool->stride * (query) + \
   sizeof(struct query_slot) + \
   sizeof(struct perfcntr_query_slot) * (i) + \
   offsetof(struct perfcntr_query_slot, field)

#define primitives_generated_query_iova(pool, query, field) \
   query_iova(struct primitives_generated_query_slot, pool, query, field)

#define query_available_iova(pool, query) \
   query_iova(struct query_slot, pool, query, available)

#define query_result_iova(pool, query, type, i) \
   pool->bo->iova + pool->stride * (query) + \
   sizeof(struct query_slot) + sizeof(type) * (i)

#define query_result_addr(pool, query, type, i) \
   pool->bo->map + pool->stride * (query) + \
   sizeof(struct query_slot) + sizeof(type) * (i)

#define query_is_available(slot) slot->available

static const VkPerformanceCounterUnitKHR
fd_perfcntr_type_to_vk_unit[] = {
   [FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR,
   [FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR,
   /* TODO. Could be UNIT_NANOSECONDS_KHR with logic to convert. */
   [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_HZ] = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR,
   [FD_PERFCNTR_TYPE_DBM] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_TEMPERATURE] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_VOLTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_AMPS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_WATTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
};

/* TODO. Basically this comes from the freedreno implementation where
 * only UINT64 is used. We should confirm this against the blob Vulkan
 * driver once it supports performance queries.
 */
static const VkPerformanceCounterStorageKHR
fd_perfcntr_type_to_vk_storage[] = {
   [FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR,
   [FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
   [FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
   [FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
   [FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
   [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
   [FD_PERFCNTR_TYPE_HZ] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
   [FD_PERFCNTR_TYPE_DBM] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
   [FD_PERFCNTR_TYPE_TEMPERATURE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
   [FD_PERFCNTR_TYPE_VOLTS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
   [FD_PERFCNTR_TYPE_AMPS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
   [FD_PERFCNTR_TYPE_WATTS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
};

/*
 * Returns a pointer to a given slot in a query pool.
 */
static void* slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char*)pool->bo->map + query * pool->stride;
}

static void
perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
               uint32_t index, uint32_t *gid, uint32_t *cid)
{
   uint32_t i;

   for (i = 0; i < group_count; i++) {
      if (group[i].num_countables > index) {
         *gid = i;
         *cid = index;
         break;
      }
      index -= group[i].num_countables;
   }

   assert(i < group_count);
}

static int
compare_perfcntr_pass(const void *a, const void *b)
{
   return ((struct tu_perf_query_data *)a)->pass -
          ((struct tu_perf_query_data *)b)->pass;
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t pool_size, slot_size;
   const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;

   pool_size = sizeof(struct tu_query_pool);

   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      slot_size = sizeof(struct timestamp_query_slot);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      slot_size = sizeof(struct primitive_query_slot);
      break;
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      slot_size = sizeof(struct primitives_generated_query_slot);
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      perf_query_info =
         vk_find_struct_const(pCreateInfo->pNext,
                              QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
      assert(perf_query_info);

      slot_size = sizeof(struct perf_query_slot) +
                  sizeof(struct perfcntr_query_slot) *
                  (perf_query_info->counterIndexCount - 1);

      /* Size of the array pool->perf_query_data */
      pool_size += sizeof(struct tu_perf_query_data) *
                   perf_query_info->counterIndexCount;
      break;
   }
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      slot_size = sizeof(struct pipeline_stat_query_slot);
      break;
   default:
      unreachable("Invalid query type");
   }

   struct tu_query_pool *pool =
      vk_object_alloc(&device->vk, pAllocator, pool_size,
                      VK_OBJECT_TYPE_QUERY_POOL);
   if (!pool)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   if (pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      pool->perf_group = fd_perfcntrs(&device->physical_device->dev_id,
                                      &pool->perf_group_count);

      pool->counter_index_count = perf_query_info->counterIndexCount;

      /* Build the data for every requested perf counter, so we can map the
       * counter index the application provides at submit time to the
       * correct group id, countable id, counter register and pass index.
       *
       * Also, since this data will be sorted by pass index later, keep the
       * original application indices and store the perfcntr results
       * according to them, so apps get correct results for their own
       * indices.
       */
      uint32_t regs[pool->perf_group_count], pass[pool->perf_group_count];
      memset(regs, 0x00, pool->perf_group_count * sizeof(regs[0]));
      memset(pass, 0x00, pool->perf_group_count * sizeof(pass[0]));

      for (uint32_t i = 0; i < pool->counter_index_count; i++) {
         uint32_t gid = 0, cid = 0;

         perfcntr_index(pool->perf_group, pool->perf_group_count,
                        perf_query_info->pCounterIndices[i], &gid, &cid);

         pool->perf_query_data[i].gid = gid;
         pool->perf_query_data[i].cid = cid;
         pool->perf_query_data[i].app_idx = i;

         /* When the counter registers of a group are exhausted
          * (num_counters), start over on the next pass.
          */
         if (regs[gid] < pool->perf_group[gid].num_counters) {
            pool->perf_query_data[i].cntr_reg = regs[gid]++;
            pool->perf_query_data[i].pass = pass[gid];
         } else {
            pool->perf_query_data[i].pass = ++pass[gid];
            pool->perf_query_data[i].cntr_reg = regs[gid] = 0;
            regs[gid]++;
         }
      }

      /* Sort by pass index so we can easily prepare command streams in
       * ascending pass order.
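       * (emit_begin_perf_query and emit_end_perf_query walk this sorted
       * array and only need to start a new CP_REG_TEST/CP_COND_REG_EXEC
       * block when the pass index changes.)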
       */
      qsort(pool->perf_query_data, pool->counter_index_count,
            sizeof(pool->perf_query_data[0]),
            compare_perfcntr_pass);
   }

   VkResult result = tu_bo_init_new(device, &pool->bo,
         pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS);
   if (result != VK_SUCCESS) {
      vk_object_free(&device->vk, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, pool->bo);
      vk_object_free(&device->vk, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo->map, 0, pool->bo->size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, pool->bo);
   vk_object_free(&device->vk, pAllocator, pool);
}

static uint32_t
get_result_count(struct tu_query_pool *pool)
{
   switch (pool->type) {
   /* Occlusion and timestamp queries write one integer value */
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      return 1;
   /* Transform feedback queries write two integer values */
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return 2;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      return util_bitcount(pool->pipeline_statistics);
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      return pool->counter_index_count;
   default:
      assert(!"Invalid query type");
      return 0;
   }
}

static uint32_t
statistics_index(uint32_t *statistics)
{
   uint32_t stat;
   stat = u_bit_scan(statistics);

   switch (1 << stat) {
   case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT:
   case VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT:
      return 0;
   case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT:
      return 1;
   case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT:
      return 2;
   case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT:
      return 4;
   case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT:
      return 5;
   case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT:
      return 6;
   case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT:
      return 7;
   case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT:
      return 8;
   case VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT:
      return 9;
   case VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT:
      return 10;
   default:
      return 0;
   }
}

static bool
is_pipeline_query_with_vertex_stage(uint32_t pipeline_statistics)
{
   return pipeline_statistics &
          (VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT |
           VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT |
           VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT |
           VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT |
           VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
           VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT |
           VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT |
           VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT |
           VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT);
}

static bool
is_pipeline_query_with_fragment_stage(uint32_t pipeline_statistics)
{
   return pipeline_statistics &
          VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT;
}

static bool
is_pipeline_query_with_compute_stage(uint32_t pipeline_statistics)
{
   return pipeline_statistics &
          VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
}

/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler friendly way instead of busy polling once the patch has landed
    * upstream. */
   struct query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while (os_time_get_nano() < abs_timeout) {
      if (query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device, VK_TIMEOUT);
}

/* Writes a query value to a buffer from the CPU. */
static void
write_query_value_cpu(char* base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
   }
}

static VkResult
get_query_pool_results(struct tu_device *device,
                       struct tu_query_pool *pool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct query_slot *slot = slot_address(pool, query);
      bool available = query_is_available(slot);
      uint32_t result_count = get_result_count(pool);
      uint32_t statistics = pool->pipeline_statistics;

      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          *    both not set then no result values are written to pData for
          *    queries that are in the unavailable state at the time of the
          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          *    availability state is still written to pData for those queries
          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      for (uint32_t k = 0; k < result_count; k++) {
         if (available) {
            uint64_t *result;

            if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
               uint32_t stat_idx = statistics_index(&statistics);
               result = query_result_addr(pool, query, uint64_t, stat_idx);
            } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
               result = query_result_addr(pool, query, struct perfcntr_query_slot, k);
            } else {
               result = query_result_addr(pool, query, uint64_t, k);
            }

            write_query_value_cpu(result_base, k, *result, flags);
         } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
            /* From the Vulkan 1.1.130 spec:
             *
             *    If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
             *    is not set, and the query's status is unavailable, an
             *    intermediate result value between zero and the final result
             *    value is written to pData for that query.
             *
             * Just return 0 here for simplicity since it's a valid result.
             */
            write_query_value_cpu(result_base, k, 0, flags);
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the query's
          *    status was available or zero if the status was unavailable.
          */
         write_query_value_cpu(result_base, result_count, available, flags);

      result_base += stride;
   }
   return result;
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   if (vk_device_is_lost(&device->vk))
      return VK_ERROR_DEVICE_LOST;

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      return get_query_pool_results(device, pool, firstQuery, queryCount,
                                    dataSize, pData, stride, flags);
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}

/* Copies a query value from one buffer to another from the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags) {
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
                           sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);

   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
                               CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}

static void
emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                             struct tu_cs *cs,
                             struct tu_query_pool *pool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             struct tu_buffer *buffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
    *    uses of vkCmdResetQueryPool in the same queue, without any additional
    *    synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = query_available_iova(pool, query);
      uint64_t buffer_iova = buffer->iova + dstOffset + i * stride;
      uint32_t result_count = get_result_count(pool);
      uint32_t statistics = pool->pipeline_statistics;

      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      for (uint32_t k = 0; k < result_count; k++) {
         uint64_t result_iova;

         if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
            uint32_t stat_idx = statistics_index(&statistics);
            result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
         } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
            result_iova = query_result_iova(pool, query,
                                            struct perfcntr_query_slot, k);
         } else {
            result_iova = query_result_iova(pool, query, uint64_t, k);
         }

         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            /* Unconditionally copying the bo->result into the buffer here is
             * valid because we only set bo->result on vkCmdEndQuery. Thus, even
             * if the query is unavailable, this will copy the correct partial
             * value of 0.
             */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
         } else {
            /* Conditionally copy bo->result into the buffer based on whether the
             * query is available.
             *
             * NOTE: For the conditional packets to be executed, CP_COND_EXEC
             * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
             * that 0 < available < 2, aka available == 1.
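             * (Both addresses below point at the same availability dword and
             * REF is set to 2, so the copy runs exactly when available == 1.)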
             */
            tu_cs_reserve(cs, 7 + 6);
            tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
            tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */

            /* Start of conditional execution */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
            /* End of conditional execution */
         }
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              result_count /* offset */, flags);
      }
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
                                          queryCount, buffer, dstOffset, stride, flags);
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      unreachable("allowCommandBufferQueryCopies is false");
   default:
      assert(!"Invalid query type");
   }
}

static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint32_t statistics = pool->pipeline_statistics;

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);

      for (uint32_t k = 0; k < get_result_count(pool); k++) {
         uint64_t result_iova;

         if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
            uint32_t stat_idx = statistics_index(&statistics);
            result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
         } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
            result_iova = query_result_iova(pool, query,
                                            struct perfcntr_query_slot, k);
         } else {
            result_iova = query_result_iova(pool, query, uint64_t, k);
         }

         tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
         tu_cs_emit_qw(cs, result_iova);
         tu_cs_emit_qw(cs, 0x0);
      }
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   default:
      assert(!"Invalid query type");
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_ResetQueryPool(VkDevice device,
                  VkQueryPool queryPool,
                  uint32_t firstQuery,
                  uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      struct query_slot *slot = slot_address(pool, i + firstQuery);
      slot->available = 0;

      for (uint32_t k = 0; k < get_result_count(pool); k++) {
         uint64_t *res;

         if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
            res = query_result_addr(pool, i + firstQuery,
                                    struct perfcntr_query_slot, k);
         } else {
            res = query_result_addr(pool, i + firstQuery, uint64_t, k);
         }

         *res = 0;
      }
   }
}

static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

static void
emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t query)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin);

   if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) {
      bool need_cond_exec = cmdbuf->state.pass && cmdbuf->state.prim_counters_running;
      cmdbuf->state.prim_counters_running++;

      /* Prevent starting the primitive counters when they are supposed to be
       * stopped for an outer VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query.
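       * (need_cond_exec is only set inside a render pass when the counters
       * are already running for such an outer query.)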
       */
      if (need_cond_exec) {
         tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
                                CP_COND_REG_EXEC_0_SYSMEM |
                                CP_COND_REG_EXEC_0_BINNING);
      }

      tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS);

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
      tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
      tu_cs_emit(cs, 0);

      if (need_cond_exec) {
         tu_cond_exec_end(cs);
      }
   }

   if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) {
      tu6_emit_event_write(cmdbuf, cs, START_FRAGMENT_CTRS);
   }

   if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) {
      tu6_emit_event_write(cmdbuf, cs, START_COMPUTE_CTRS);
   }

   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
                  CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, begin_iova);
}

static void
emit_perfcntrs_pass_start(struct tu_cs *cs, uint32_t pass)
{
   tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
   tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
                     REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG)) |
                  A6XX_CP_REG_TEST_0_BIT(pass) |
                  A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
}

static void
emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t query)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint32_t last_pass = ~0;

   if (cmdbuf->state.pass) {
      cmdbuf->state.rp.draw_cs_writes_to_cond_pred = true;
   }

   /* Querying perf counters happens in these steps:
    *
    *  0) There's a scratch reg to set a pass index for the perf counters
    *     query. Prepare cmd streams to set each pass index to the reg at
    *     device creation time. See tu_CreateDevice in tu_device.c
    *  1) Emit command streams to read all requested perf counters at all
    *     passes in begin/end query with CP_REG_TEST/CP_COND_REG_EXEC, which
    *     reads the scratch reg where the pass index is set.
    *     See emit_perfcntrs_pass_start.
    *  2) Pick the cs that sets the proper pass index to the reg and prepend
    *     it to the command buffer at each submit time.
    *     See tu_QueueSubmit in tu_drm.c
    *  3) If the pass index bit in the reg is set, the command stream below
    *     CP_COND_REG_EXEC is executed.
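    *
    * This way a single recorded command buffer works for every pass; only
    * the scratch register value chosen at submit time selects which
    * counters are actually programmed and sampled.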
    */

   tu_cs_emit_wfi(cs);

   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
      struct tu_perf_query_data *data = &pool->perf_query_data[i];

      if (last_pass != data->pass) {
         last_pass = data->pass;

         if (data->pass != 0)
            tu_cond_exec_end(cs);
         emit_perfcntrs_pass_start(cs, data->pass);
      }

      const struct fd_perfcntr_counter *counter =
         &pool->perf_group[data->gid].counters[data->cntr_reg];
      const struct fd_perfcntr_countable *countable =
         &pool->perf_group[data->gid].countables[data->cid];

      tu_cs_emit_pkt4(cs, counter->select_reg, 1);
      tu_cs_emit(cs, countable->selector);
   }
   tu_cond_exec_end(cs);

   last_pass = ~0;
   tu_cs_emit_wfi(cs);

   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
      struct tu_perf_query_data *data = &pool->perf_query_data[i];

      if (last_pass != data->pass) {
         last_pass = data->pass;

         if (data->pass != 0)
            tu_cond_exec_end(cs);
         emit_perfcntrs_pass_start(cs, data->pass);
      }

      const struct fd_perfcntr_counter *counter =
         &pool->perf_group[data->gid].counters[data->cntr_reg];

      uint64_t begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);

      tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
      tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
                     CP_REG_TO_MEM_0_64B);
      tu_cs_emit_qw(cs, begin_iova);
   }
   tu_cond_exec_end(cs);
}

static void
emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
                     struct tu_query_pool *pool,
                     uint32_t query,
                     uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = begin_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
}

static void
emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
                                struct tu_query_pool *pool,
                                uint32_t query)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);

   if (cmdbuf->state.pass) {
      cmdbuf->state.rp.has_prim_generated_query_in_rp = true;
   } else {
      cmdbuf->state.prim_generated_query_running_before_rp = true;
   }

   cmdbuf->state.prim_counters_running++;

   if (cmdbuf->state.pass) {
      /* Primitives that passed all tests are still counted in each
       * tile even with HW binning beforehand. Do not permit it.
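       * (The counter start and read below are therefore limited to the
       * sysmem and binning passes via CP_COND_REG_EXEC, so per-tile GMEM
       * rendering does not add to the count a second time.)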
       */
      tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
                             CP_COND_REG_EXEC_0_SYSMEM |
                             CP_COND_REG_EXEC_0_BINNING);
   }

   tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS);

   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, begin_iova);

   if (cmdbuf->state.pass) {
      tu_cond_exec_end(cs);
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      emit_begin_prim_generated_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      emit_begin_perf_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      emit_begin_stat_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t query,
                           VkQueryControlFlags flags,
                           uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, index);
      break;
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      emit_begin_prim_generated_query(cmdbuf, pool, query);
      break;
   default:
      assert(!"Invalid query type");
   }
}

static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    *    1) Set the slot->end to UINT64_MAX.
    *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *       write the current sample count value into slot->end.
    *    3) Since (2) is asynchronous, wait until slot->end is not equal to
    *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    *    4) Accumulate the results of the query (slot->end - slot->begin) into
    *       slot->result.
    *    5) If vkCmdEndQuery is *not* called from within the scope of a render
    *       pass, set the slot's available bit since the query is now done.
    *    6) If vkCmdEndQuery *is* called from within the scope of a render
    *       pass, we cannot mark as available yet since the commands in
    *       draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = query_result_iova(pool, query, uint64_t, 0);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
       * because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

/* PRIMITIVE_CTRS is used for two distinct queries:
 * - VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT
 * - VK_QUERY_TYPE_PIPELINE_STATISTICS
 * If one is nested inside the other, STOP_PRIMITIVE_CTRS should be emitted
 * only for the outer query.
 *
 * Also, a pipeline stats query could run outside of a renderpass while a
 * prim gen query runs inside a secondary cmd buffer - for such a case we
 * ought to track the status of the pipeline stats query.
 */
static void
emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf,
                         struct tu_cs *cs,
                         enum VkQueryType query_type)
{
   bool is_secondary = cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY;
   cmdbuf->state.prim_counters_running--;
   if (cmdbuf->state.prim_counters_running == 0) {
      bool need_cond_exec =
         is_secondary &&
         query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT &&
         is_pipeline_query_with_vertex_stage(cmdbuf->inherited_pipeline_statistics);

      if (!need_cond_exec) {
         tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
      } else {
         tu_cs_reserve(cs, 7 + 2);
         /* Check that the pipeline stats query is not running; only then
          * can we stop the counters.
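          * (vtx_stats_query_not_running is 1 when no pipeline statistics
          * query is active; see emit_begin_stat_query and the write at the
          * end of this function.)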
          */
         tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
         tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
         tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
         tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
         tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */

         tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
      }
   }

   if (query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
      tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
      tu_cs_emit(cs, 1);
   }
}

static void
emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
                    struct tu_query_pool *pool,
                    uint32_t query)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t end_iova = pipeline_stat_query_iova(pool, query, end);
   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t result_iova;
   uint64_t stat_start_iova;
   uint64_t stat_stop_iova;

   if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) {
      /* No need to conditionally execute STOP_PRIMITIVE_CTRS when
       * we are inside VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT inside of a
       * renderpass, because it is already stopped.
       */
      emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PIPELINE_STATISTICS);
   }

   if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) {
      tu6_emit_event_write(cmdbuf, cs, STOP_FRAGMENT_CTRS);
   }

   if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) {
      tu6_emit_event_write(cmdbuf, cs, STOP_COMPUTE_CTRS);
   }

   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
                  CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, end_iova);

   for (int i = 0; i < STAT_COUNT; i++) {
      result_iova = query_result_iova(pool, query, uint64_t, i);
      stat_start_iova = pipeline_stat_query_iova(pool, query, begin[i]);
      stat_stop_iova = pipeline_stat_query_iova(pool, query, end[i]);

      tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
      tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
                     CP_MEM_TO_MEM_0_DOUBLE |
                     CP_MEM_TO_MEM_0_NEG_C);

      tu_cs_emit_qw(cs, result_iova);
      tu_cs_emit_qw(cs, result_iova);
      tu_cs_emit_qw(cs, stat_stop_iova);
      tu_cs_emit_qw(cs, stat_start_iova);
   }

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (cmdbuf->state.pass)
      cs = &cmdbuf->draw_epilogue_cs;

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

static void
emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
                    struct tu_query_pool *pool,
                    uint32_t query)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t end_iova;
   uint64_t begin_iova;
   uint64_t result_iova;
   uint32_t last_pass = ~0;

   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
      struct tu_perf_query_data *data = &pool->perf_query_data[i];

      if (last_pass != data->pass) {
         last_pass = data->pass;

         if (data->pass != 0)
            tu_cond_exec_end(cs);
         emit_perfcntrs_pass_start(cs, data->pass);
      }

      const struct fd_perfcntr_counter *counter =
         &pool->perf_group[data->gid].counters[data->cntr_reg];

      end_iova = perf_query_iova(pool, 0, end, data->app_idx);

      tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
      tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
                     CP_REG_TO_MEM_0_64B);
      tu_cs_emit_qw(cs, end_iova);
   }
   tu_cond_exec_end(cs);

   last_pass = ~0;
   tu_cs_emit_wfi(cs);

   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
      struct tu_perf_query_data *data = &pool->perf_query_data[i];

      if (last_pass != data->pass) {
         last_pass = data->pass;

         if (data->pass != 0)
            tu_cond_exec_end(cs);
         emit_perfcntrs_pass_start(cs, data->pass);
      }

      result_iova = query_result_iova(pool, 0, struct perfcntr_query_slot,
                                      data->app_idx);
      begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
      end_iova = perf_query_iova(pool, 0, end, data->app_idx);

      /* result += end - begin */
      tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
      tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
                     CP_MEM_TO_MEM_0_DOUBLE |
                     CP_MEM_TO_MEM_0_NEG_C);

      tu_cs_emit_qw(cs, result_iova);
      tu_cs_emit_qw(cs, result_iova);
      tu_cs_emit_qw(cs, end_iova);
      tu_cs_emit_qw(cs, begin_iova);
   }
   tu_cond_exec_end(cs);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (cmdbuf->state.pass)
      cs = &cmdbuf->draw_epilogue_cs;

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

static void
emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
                   struct tu_query_pool *pool,
                   uint32_t query,
                   uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
   uint64_t result_written_iova = query_result_iova(pool, query, uint64_t, 0);
   uint64_t result_generated_iova = query_result_iova(pool, query, uint64_t, 1);
   uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
   uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
   uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
   uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1);
   uint64_t available_iova = query_available_iova(pool, query);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = end_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);

   tu_cs_emit_wfi(cs);
   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of written primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, end_written_iova);
   tu_cs_emit_qw(cs, begin_written_iova);

   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of generated primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, end_generated_iova);
   tu_cs_emit_qw(cs, begin_generated_iova);

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

static void
emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
                              struct tu_query_pool *pool,
                              uint32_t query)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   if (!cmdbuf->state.pass) {
      cmdbuf->state.prim_generated_query_running_before_rp = false;
   }

   uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
   uint64_t end_iova = primitives_generated_query_iova(pool, query, end);
   uint64_t result_iova = primitives_generated_query_iova(pool, query, result);
   uint64_t available_iova = query_available_iova(pool, query);

   if (cmdbuf->state.pass) {
      tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
                             CP_COND_REG_EXEC_0_SYSMEM |
                             CP_COND_REG_EXEC_0_BINNING);
   }

   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, end_iova);

   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   /* This must come after waiting for mem writes, so that the information
    * about which query is running is up to date.
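    * (emit_stop_primitive_ctrs conditionally executes based on the
    * vtx_stats_query_not_running global written by the pipeline stats
    * query path.)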
    */
   emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);

   if (cmdbuf->state.pass) {
      tu_cond_exec_end(cs);
   }

   if (cmdbuf->state.pass)
      cs = &cmdbuf->draw_epilogue_cs;

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

/* Implement this bit of spec text from section 17.2 "Query Operation":
 *
 *    If queries are used while executing a render pass instance that has
 *    multiview enabled, the query uses N consecutive query indices in the
 *    query pool (starting at query) where N is the number of bits set in the
 *    view mask in the subpass the query is used in. How the numerical
 *    results of the query are distributed among the queries is
 *    implementation-dependent. For example, some implementations may write
 *    each view's results to a distinct query, while other implementations
 *    may write the total result to the first query and write zero to the
 *    other queries. However, the sum of the results in all the queries must
 *    accurately reflect the total result of the query summed over all views.
 *    Applications can sum the results from all the queries to compute the
 *    total result.
 *
 * Since we execute all views at once, we write zero to the other queries.
 * Furthermore, because queries must be reset before use, and we set the
 * result to 0 in vkCmdResetQueryPool(), we just need to mark it as available.
 */

static void
handle_multiview_queries(struct tu_cmd_buffer *cmd,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
      return;

   unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
   struct tu_cs *cs = &cmd->draw_epilogue_cs;

   for (uint32_t i = 1; i < views; i++) {
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
      tu_cs_emit_qw(cs, 0x1);
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_end_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      emit_end_prim_generated_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      emit_end_perf_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      emit_end_stat_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   handle_multiview_queries(cmdbuf, pool, query);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
                         VkQueryPool queryPool,
                         uint32_t query,
                         uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      assert(index <= 4);
      emit_end_xfb_query(cmdbuf, pool, query, index);
      break;
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      emit_end_prim_generated_query(cmdbuf, pool, query);
      break;
   default:
      assert(!"Invalid query type");
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
                      VkPipelineStageFlagBits2 pipelineStage,
                      VkQueryPool queryPool,
                      uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   /* Inside a render pass, just write the timestamp multiple times so that
    * the user gets the last one if we use GMEM. There isn't really much
    * better we can do, and this seems to be what the blob does too.
    */
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;

   /* Stages that will already have been executed by the time the CP executes
    * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
    * indirect stage counts as top-of-pipe too.
    */
   VkPipelineStageFlags2 top_of_pipe_flags =
      VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
      VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT;

   if (pipelineStage & ~top_of_pipe_flags) {
      /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
       * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
       * complete.
       *
       * Stalling the CP like this is really unfortunate, but I don't think
       * there's a better solution that allows all 48 bits of precision
       * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
       */
      tu_cs_emit_wfi(cs);
   }

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0));

   /* Only flag availability once the entire renderpass is done, similar to
    * the begin/end path.
    */
   cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   /* From the spec for vkCmdWriteTimestamp:
    *
    *    If vkCmdWriteTimestamp is called while executing a render pass
    *    instance that has multiview enabled, the timestamp uses N consecutive
    *    query indices in the query pool (starting at query) where N is the
    *    number of bits set in the view mask of the subpass the command is
    *    executed in. The resulting query values are determined by an
    *    implementation-dependent choice of one of the following behaviors:
    *
    *    -   The first query is a timestamp value and (if more than one bit is
    *        set in the view mask) zero is written to the remaining queries.
    *        If two timestamps are written in the same subpass, the sum of the
    *        execution time of all views between those commands is the
    *        difference between the first query written by each command.
    *
    *    -   All N queries are timestamp values. If two timestamps are written
    *        in the same subpass, the sum of the execution time of all views
    *        between those commands is the sum of the difference between
    *        corresponding queries written by each command. The difference
    *        between corresponding queries may be the execution time of a
    *        single view.
    *
    * We execute all views in the same draw call, so we implement the first
    * option, the same as regular queries.
    */
   handle_multiview_queries(cmd, pool, query);
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
   VkPhysicalDevice physicalDevice,
   uint32_t queueFamilyIndex,
   uint32_t* pCounterCount,
   VkPerformanceCounterKHR* pCounters,
   VkPerformanceCounterDescriptionKHR* pCounterDescriptions)
{
   TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);

   uint32_t desc_count = *pCounterCount;
   uint32_t group_count;
   const struct fd_perfcntr_group *group =
      fd_perfcntrs(&phydev->dev_id, &group_count);

   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, out, pCounters, pCounterCount);
   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, out_desc,
                          pCounterDescriptions, &desc_count);

   for (int i = 0; i < group_count; i++) {
      for (int j = 0; j < group[i].num_countables; j++) {

         vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
            counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_BUFFER_KHR;
            counter->unit =
               fd_perfcntr_type_to_vk_unit[group[i].countables[j].query_type];
            counter->storage =
               fd_perfcntr_type_to_vk_storage[group[i].countables[j].query_type];

            unsigned char sha1_result[20];
            _mesa_sha1_compute(group[i].countables[j].name,
                               strlen(group[i].countables[j].name),
                               sha1_result);
            memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
         }

         vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) {
            desc->flags = 0;

            snprintf(desc->name, sizeof(desc->name),
                     "%s", group[i].countables[j].name);
            snprintf(desc->category, sizeof(desc->category), "%s", group[i].name);
            snprintf(desc->description, sizeof(desc->description),
                     "%s: %s performance counter",
                     group[i].name, group[i].countables[j].name);
         }
      }
   }

   return vk_outarray_status(&out);
}

VKAPI_ATTR void VKAPI_CALL
tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
   VkPhysicalDevice physicalDevice,
   const VkQueryPoolPerformanceCreateInfoKHR* pPerformanceQueryCreateInfo,
   uint32_t* pNumPasses)
{
   TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
   uint32_t group_count = 0;
   uint32_t gid = 0, cid = 0, n_passes;
   const struct fd_perfcntr_group *group =
      fd_perfcntrs(&phydev->dev_id, &group_count);

   uint32_t counters_requested[group_count];
   memset(counters_requested, 0x0, sizeof(counters_requested));
   *pNumPasses = 1;

   for (unsigned i = 0; i < pPerformanceQueryCreateInfo->counterIndexCount; i++) {
      perfcntr_index(group, group_count,
                     pPerformanceQueryCreateInfo->pCounterIndices[i],
                     &gid, &cid);

      counters_requested[gid]++;
   }

   for (uint32_t i = 0; i < group_count; i++) {
      n_passes = DIV_ROUND_UP(counters_requested[i], group[i].num_counters);
      *pNumPasses = MAX2(*pNumPasses, n_passes);
   }
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_AcquireProfilingLockKHR(VkDevice device,
                           const VkAcquireProfilingLockInfoKHR* pInfo)
{
   /* TODO. Probably there's something to do for kgsl. */
   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
tu_ReleaseProfilingLockKHR(VkDevice device)
{
   /* TODO. Probably there's something to do for kgsl.
    */
   return;
}