1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyrigh 2016 Red Hat Inc. 3bf215546Sopenharmony_ci * SPDX-License-Identifier: MIT 4bf215546Sopenharmony_ci * 5bf215546Sopenharmony_ci * Based on anv: 6bf215546Sopenharmony_ci * Copyright © 2015 Intel Corporation 7bf215546Sopenharmony_ci */ 8bf215546Sopenharmony_ci 9bf215546Sopenharmony_ci#include "tu_query.h" 10bf215546Sopenharmony_ci 11bf215546Sopenharmony_ci#include <fcntl.h> 12bf215546Sopenharmony_ci 13bf215546Sopenharmony_ci#include "nir/nir_builder.h" 14bf215546Sopenharmony_ci#include "util/os_time.h" 15bf215546Sopenharmony_ci 16bf215546Sopenharmony_ci#include "vk_util.h" 17bf215546Sopenharmony_ci 18bf215546Sopenharmony_ci#include "tu_cmd_buffer.h" 19bf215546Sopenharmony_ci#include "tu_cs.h" 20bf215546Sopenharmony_ci#include "tu_device.h" 21bf215546Sopenharmony_ci 22bf215546Sopenharmony_ci#define NSEC_PER_SEC 1000000000ull 23bf215546Sopenharmony_ci#define WAIT_TIMEOUT 5 24bf215546Sopenharmony_ci#define STAT_COUNT ((REG_A6XX_RBBM_PRIMCTR_10_LO - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2 + 1) 25bf215546Sopenharmony_ci 26bf215546Sopenharmony_cistruct PACKED query_slot { 27bf215546Sopenharmony_ci uint64_t available; 28bf215546Sopenharmony_ci}; 29bf215546Sopenharmony_ci 30bf215546Sopenharmony_cistruct PACKED occlusion_slot_value { 31bf215546Sopenharmony_ci /* Seems sample counters are placed to be 16-byte aligned 32bf215546Sopenharmony_ci * even though this query needs an 8-byte slot. */ 33bf215546Sopenharmony_ci uint64_t value; 34bf215546Sopenharmony_ci uint64_t _padding; 35bf215546Sopenharmony_ci}; 36bf215546Sopenharmony_ci 37bf215546Sopenharmony_cistruct PACKED occlusion_query_slot { 38bf215546Sopenharmony_ci struct query_slot common; 39bf215546Sopenharmony_ci uint64_t result; 40bf215546Sopenharmony_ci 41bf215546Sopenharmony_ci struct occlusion_slot_value begin; 42bf215546Sopenharmony_ci struct occlusion_slot_value end; 43bf215546Sopenharmony_ci}; 44bf215546Sopenharmony_ci 45bf215546Sopenharmony_cistruct PACKED timestamp_query_slot { 46bf215546Sopenharmony_ci struct query_slot common; 47bf215546Sopenharmony_ci uint64_t result; 48bf215546Sopenharmony_ci}; 49bf215546Sopenharmony_ci 50bf215546Sopenharmony_cistruct PACKED primitive_slot_value { 51bf215546Sopenharmony_ci uint64_t values[2]; 52bf215546Sopenharmony_ci}; 53bf215546Sopenharmony_ci 54bf215546Sopenharmony_cistruct PACKED pipeline_stat_query_slot { 55bf215546Sopenharmony_ci struct query_slot common; 56bf215546Sopenharmony_ci uint64_t results[STAT_COUNT]; 57bf215546Sopenharmony_ci 58bf215546Sopenharmony_ci uint64_t begin[STAT_COUNT]; 59bf215546Sopenharmony_ci uint64_t end[STAT_COUNT]; 60bf215546Sopenharmony_ci}; 61bf215546Sopenharmony_ci 62bf215546Sopenharmony_cistruct PACKED primitive_query_slot { 63bf215546Sopenharmony_ci struct query_slot common; 64bf215546Sopenharmony_ci /* The result of transform feedback queries is two integer values: 65bf215546Sopenharmony_ci * results[0] is the count of primitives written, 66bf215546Sopenharmony_ci * results[1] is the count of primitives generated. 67bf215546Sopenharmony_ci * Also a result for each stream is stored at 4 slots respectively. 68bf215546Sopenharmony_ci */ 69bf215546Sopenharmony_ci uint64_t results[2]; 70bf215546Sopenharmony_ci 71bf215546Sopenharmony_ci /* Primitive counters also need to be 16-byte aligned. */ 72bf215546Sopenharmony_ci uint64_t _padding; 73bf215546Sopenharmony_ci 74bf215546Sopenharmony_ci struct primitive_slot_value begin[4]; 75bf215546Sopenharmony_ci struct primitive_slot_value end[4]; 76bf215546Sopenharmony_ci}; 77bf215546Sopenharmony_ci 78bf215546Sopenharmony_cistruct PACKED perfcntr_query_slot { 79bf215546Sopenharmony_ci uint64_t result; 80bf215546Sopenharmony_ci uint64_t begin; 81bf215546Sopenharmony_ci uint64_t end; 82bf215546Sopenharmony_ci}; 83bf215546Sopenharmony_ci 84bf215546Sopenharmony_cistruct PACKED perf_query_slot { 85bf215546Sopenharmony_ci struct query_slot common; 86bf215546Sopenharmony_ci struct perfcntr_query_slot perfcntr; 87bf215546Sopenharmony_ci}; 88bf215546Sopenharmony_ci 89bf215546Sopenharmony_cistruct PACKED primitives_generated_query_slot { 90bf215546Sopenharmony_ci struct query_slot common; 91bf215546Sopenharmony_ci uint64_t result; 92bf215546Sopenharmony_ci uint64_t begin; 93bf215546Sopenharmony_ci uint64_t end; 94bf215546Sopenharmony_ci}; 95bf215546Sopenharmony_ci 96bf215546Sopenharmony_ci/* Returns the IOVA of a given uint64_t field in a given slot of a query 97bf215546Sopenharmony_ci * pool. */ 98bf215546Sopenharmony_ci#define query_iova(type, pool, query, field) \ 99bf215546Sopenharmony_ci pool->bo->iova + pool->stride * (query) + offsetof(type, field) 100bf215546Sopenharmony_ci 101bf215546Sopenharmony_ci#define occlusion_query_iova(pool, query, field) \ 102bf215546Sopenharmony_ci query_iova(struct occlusion_query_slot, pool, query, field) 103bf215546Sopenharmony_ci 104bf215546Sopenharmony_ci#define pipeline_stat_query_iova(pool, query, field) \ 105bf215546Sopenharmony_ci pool->bo->iova + pool->stride * (query) + \ 106bf215546Sopenharmony_ci offsetof(struct pipeline_stat_query_slot, field) 107bf215546Sopenharmony_ci 108bf215546Sopenharmony_ci#define primitive_query_iova(pool, query, field, i) \ 109bf215546Sopenharmony_ci query_iova(struct primitive_query_slot, pool, query, field) + \ 110bf215546Sopenharmony_ci offsetof(struct primitive_slot_value, values[i]) 111bf215546Sopenharmony_ci 112bf215546Sopenharmony_ci#define perf_query_iova(pool, query, field, i) \ 113bf215546Sopenharmony_ci pool->bo->iova + pool->stride * (query) + \ 114bf215546Sopenharmony_ci sizeof(struct query_slot) + \ 115bf215546Sopenharmony_ci sizeof(struct perfcntr_query_slot) * (i) + \ 116bf215546Sopenharmony_ci offsetof(struct perfcntr_query_slot, field) 117bf215546Sopenharmony_ci 118bf215546Sopenharmony_ci#define primitives_generated_query_iova(pool, query, field) \ 119bf215546Sopenharmony_ci query_iova(struct primitives_generated_query_slot, pool, query, field) 120bf215546Sopenharmony_ci 121bf215546Sopenharmony_ci#define query_available_iova(pool, query) \ 122bf215546Sopenharmony_ci query_iova(struct query_slot, pool, query, available) 123bf215546Sopenharmony_ci 124bf215546Sopenharmony_ci#define query_result_iova(pool, query, type, i) \ 125bf215546Sopenharmony_ci pool->bo->iova + pool->stride * (query) + \ 126bf215546Sopenharmony_ci sizeof(struct query_slot) + sizeof(type) * (i) 127bf215546Sopenharmony_ci 128bf215546Sopenharmony_ci#define query_result_addr(pool, query, type, i) \ 129bf215546Sopenharmony_ci pool->bo->map + pool->stride * (query) + \ 130bf215546Sopenharmony_ci sizeof(struct query_slot) + sizeof(type) * (i) 131bf215546Sopenharmony_ci 132bf215546Sopenharmony_ci#define query_is_available(slot) slot->available 133bf215546Sopenharmony_ci 134bf215546Sopenharmony_cistatic const VkPerformanceCounterUnitKHR 135bf215546Sopenharmony_cifd_perfcntr_type_to_vk_unit[] = { 136bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, 137bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, 138bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, 139bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR, 140bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR, 141bf215546Sopenharmony_ci /* TODO. can be UNIT_NANOSECONDS_KHR with a logic to compute */ 142bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, 143bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_HZ] = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR, 144bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_DBM] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, 145bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_TEMPERATURE] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, 146bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_VOLTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, 147bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_AMPS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, 148bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_WATTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, 149bf215546Sopenharmony_ci}; 150bf215546Sopenharmony_ci 151bf215546Sopenharmony_ci/* TODO. Basically this comes from the freedreno implementation where 152bf215546Sopenharmony_ci * only UINT64 is used. We'd better confirm this by the blob vulkan driver 153bf215546Sopenharmony_ci * when it starts supporting perf query. 154bf215546Sopenharmony_ci */ 155bf215546Sopenharmony_cistatic const VkPerformanceCounterStorageKHR 156bf215546Sopenharmony_cifd_perfcntr_type_to_vk_storage[] = { 157bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR, 158bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR, 159bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR, 160bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR, 161bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR, 162bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR, 163bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_HZ] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR, 164bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_DBM] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR, 165bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_TEMPERATURE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR, 166bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_VOLTS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR, 167bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_AMPS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR, 168bf215546Sopenharmony_ci [FD_PERFCNTR_TYPE_WATTS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR, 169bf215546Sopenharmony_ci}; 170bf215546Sopenharmony_ci 171bf215546Sopenharmony_ci/* 172bf215546Sopenharmony_ci * Returns a pointer to a given slot in a query pool. 173bf215546Sopenharmony_ci */ 174bf215546Sopenharmony_cistatic void* slot_address(struct tu_query_pool *pool, uint32_t query) 175bf215546Sopenharmony_ci{ 176bf215546Sopenharmony_ci return (char*)pool->bo->map + query * pool->stride; 177bf215546Sopenharmony_ci} 178bf215546Sopenharmony_ci 179bf215546Sopenharmony_cistatic void 180bf215546Sopenharmony_ciperfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count, 181bf215546Sopenharmony_ci uint32_t index, uint32_t *gid, uint32_t *cid) 182bf215546Sopenharmony_ci 183bf215546Sopenharmony_ci{ 184bf215546Sopenharmony_ci uint32_t i; 185bf215546Sopenharmony_ci 186bf215546Sopenharmony_ci for (i = 0; i < group_count; i++) { 187bf215546Sopenharmony_ci if (group[i].num_countables > index) { 188bf215546Sopenharmony_ci *gid = i; 189bf215546Sopenharmony_ci *cid = index; 190bf215546Sopenharmony_ci break; 191bf215546Sopenharmony_ci } 192bf215546Sopenharmony_ci index -= group[i].num_countables; 193bf215546Sopenharmony_ci } 194bf215546Sopenharmony_ci 195bf215546Sopenharmony_ci assert(i < group_count); 196bf215546Sopenharmony_ci} 197bf215546Sopenharmony_ci 198bf215546Sopenharmony_cistatic int 199bf215546Sopenharmony_cicompare_perfcntr_pass(const void *a, const void *b) 200bf215546Sopenharmony_ci{ 201bf215546Sopenharmony_ci return ((struct tu_perf_query_data *)a)->pass - 202bf215546Sopenharmony_ci ((struct tu_perf_query_data *)b)->pass; 203bf215546Sopenharmony_ci} 204bf215546Sopenharmony_ci 205bf215546Sopenharmony_ciVKAPI_ATTR VkResult VKAPI_CALL 206bf215546Sopenharmony_citu_CreateQueryPool(VkDevice _device, 207bf215546Sopenharmony_ci const VkQueryPoolCreateInfo *pCreateInfo, 208bf215546Sopenharmony_ci const VkAllocationCallbacks *pAllocator, 209bf215546Sopenharmony_ci VkQueryPool *pQueryPool) 210bf215546Sopenharmony_ci{ 211bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_device, device, _device); 212bf215546Sopenharmony_ci assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO); 213bf215546Sopenharmony_ci assert(pCreateInfo->queryCount > 0); 214bf215546Sopenharmony_ci 215bf215546Sopenharmony_ci uint32_t pool_size, slot_size; 216bf215546Sopenharmony_ci const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL; 217bf215546Sopenharmony_ci 218bf215546Sopenharmony_ci pool_size = sizeof(struct tu_query_pool); 219bf215546Sopenharmony_ci 220bf215546Sopenharmony_ci switch (pCreateInfo->queryType) { 221bf215546Sopenharmony_ci case VK_QUERY_TYPE_OCCLUSION: 222bf215546Sopenharmony_ci slot_size = sizeof(struct occlusion_query_slot); 223bf215546Sopenharmony_ci break; 224bf215546Sopenharmony_ci case VK_QUERY_TYPE_TIMESTAMP: 225bf215546Sopenharmony_ci slot_size = sizeof(struct timestamp_query_slot); 226bf215546Sopenharmony_ci break; 227bf215546Sopenharmony_ci case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 228bf215546Sopenharmony_ci slot_size = sizeof(struct primitive_query_slot); 229bf215546Sopenharmony_ci break; 230bf215546Sopenharmony_ci case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: 231bf215546Sopenharmony_ci slot_size = sizeof(struct primitives_generated_query_slot); 232bf215546Sopenharmony_ci break; 233bf215546Sopenharmony_ci case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { 234bf215546Sopenharmony_ci perf_query_info = 235bf215546Sopenharmony_ci vk_find_struct_const(pCreateInfo->pNext, 236bf215546Sopenharmony_ci QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR); 237bf215546Sopenharmony_ci assert(perf_query_info); 238bf215546Sopenharmony_ci 239bf215546Sopenharmony_ci slot_size = sizeof(struct perf_query_slot) + 240bf215546Sopenharmony_ci sizeof(struct perfcntr_query_slot) * 241bf215546Sopenharmony_ci (perf_query_info->counterIndexCount - 1); 242bf215546Sopenharmony_ci 243bf215546Sopenharmony_ci /* Size of the array pool->tu_perf_query_data */ 244bf215546Sopenharmony_ci pool_size += sizeof(struct tu_perf_query_data) * 245bf215546Sopenharmony_ci perf_query_info->counterIndexCount; 246bf215546Sopenharmony_ci break; 247bf215546Sopenharmony_ci } 248bf215546Sopenharmony_ci case VK_QUERY_TYPE_PIPELINE_STATISTICS: 249bf215546Sopenharmony_ci slot_size = sizeof(struct pipeline_stat_query_slot); 250bf215546Sopenharmony_ci break; 251bf215546Sopenharmony_ci default: 252bf215546Sopenharmony_ci unreachable("Invalid query type"); 253bf215546Sopenharmony_ci } 254bf215546Sopenharmony_ci 255bf215546Sopenharmony_ci struct tu_query_pool *pool = 256bf215546Sopenharmony_ci vk_object_alloc(&device->vk, pAllocator, pool_size, 257bf215546Sopenharmony_ci VK_OBJECT_TYPE_QUERY_POOL); 258bf215546Sopenharmony_ci if (!pool) 259bf215546Sopenharmony_ci return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); 260bf215546Sopenharmony_ci 261bf215546Sopenharmony_ci if (pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { 262bf215546Sopenharmony_ci pool->perf_group = fd_perfcntrs(&device->physical_device->dev_id, 263bf215546Sopenharmony_ci &pool->perf_group_count); 264bf215546Sopenharmony_ci 265bf215546Sopenharmony_ci pool->counter_index_count = perf_query_info->counterIndexCount; 266bf215546Sopenharmony_ci 267bf215546Sopenharmony_ci /* Build all perf counters data that is requested, so we could get 268bf215546Sopenharmony_ci * correct group id, countable id, counter register and pass index with 269bf215546Sopenharmony_ci * only a counter index provided by applications at each command submit. 270bf215546Sopenharmony_ci * 271bf215546Sopenharmony_ci * Also, since this built data will be sorted by pass index later, we 272bf215546Sopenharmony_ci * should keep the original indices and store perfcntrs results according 273bf215546Sopenharmony_ci * to them so apps can get correct results with their own indices. 274bf215546Sopenharmony_ci */ 275bf215546Sopenharmony_ci uint32_t regs[pool->perf_group_count], pass[pool->perf_group_count]; 276bf215546Sopenharmony_ci memset(regs, 0x00, pool->perf_group_count * sizeof(regs[0])); 277bf215546Sopenharmony_ci memset(pass, 0x00, pool->perf_group_count * sizeof(pass[0])); 278bf215546Sopenharmony_ci 279bf215546Sopenharmony_ci for (uint32_t i = 0; i < pool->counter_index_count; i++) { 280bf215546Sopenharmony_ci uint32_t gid = 0, cid = 0; 281bf215546Sopenharmony_ci 282bf215546Sopenharmony_ci perfcntr_index(pool->perf_group, pool->perf_group_count, 283bf215546Sopenharmony_ci perf_query_info->pCounterIndices[i], &gid, &cid); 284bf215546Sopenharmony_ci 285bf215546Sopenharmony_ci pool->perf_query_data[i].gid = gid; 286bf215546Sopenharmony_ci pool->perf_query_data[i].cid = cid; 287bf215546Sopenharmony_ci pool->perf_query_data[i].app_idx = i; 288bf215546Sopenharmony_ci 289bf215546Sopenharmony_ci /* When a counter register is over the capacity(num_counters), 290bf215546Sopenharmony_ci * reset it for next pass. 291bf215546Sopenharmony_ci */ 292bf215546Sopenharmony_ci if (regs[gid] < pool->perf_group[gid].num_counters) { 293bf215546Sopenharmony_ci pool->perf_query_data[i].cntr_reg = regs[gid]++; 294bf215546Sopenharmony_ci pool->perf_query_data[i].pass = pass[gid]; 295bf215546Sopenharmony_ci } else { 296bf215546Sopenharmony_ci pool->perf_query_data[i].pass = ++pass[gid]; 297bf215546Sopenharmony_ci pool->perf_query_data[i].cntr_reg = regs[gid] = 0; 298bf215546Sopenharmony_ci regs[gid]++; 299bf215546Sopenharmony_ci } 300bf215546Sopenharmony_ci } 301bf215546Sopenharmony_ci 302bf215546Sopenharmony_ci /* Sort by pass index so we could easily prepare a command stream 303bf215546Sopenharmony_ci * with the ascending order of pass index. 304bf215546Sopenharmony_ci */ 305bf215546Sopenharmony_ci qsort(pool->perf_query_data, pool->counter_index_count, 306bf215546Sopenharmony_ci sizeof(pool->perf_query_data[0]), 307bf215546Sopenharmony_ci compare_perfcntr_pass); 308bf215546Sopenharmony_ci } 309bf215546Sopenharmony_ci 310bf215546Sopenharmony_ci VkResult result = tu_bo_init_new(device, &pool->bo, 311bf215546Sopenharmony_ci pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS); 312bf215546Sopenharmony_ci if (result != VK_SUCCESS) { 313bf215546Sopenharmony_ci vk_object_free(&device->vk, pAllocator, pool); 314bf215546Sopenharmony_ci return result; 315bf215546Sopenharmony_ci } 316bf215546Sopenharmony_ci 317bf215546Sopenharmony_ci result = tu_bo_map(device, pool->bo); 318bf215546Sopenharmony_ci if (result != VK_SUCCESS) { 319bf215546Sopenharmony_ci tu_bo_finish(device, pool->bo); 320bf215546Sopenharmony_ci vk_object_free(&device->vk, pAllocator, pool); 321bf215546Sopenharmony_ci return result; 322bf215546Sopenharmony_ci } 323bf215546Sopenharmony_ci 324bf215546Sopenharmony_ci /* Initialize all query statuses to unavailable */ 325bf215546Sopenharmony_ci memset(pool->bo->map, 0, pool->bo->size); 326bf215546Sopenharmony_ci 327bf215546Sopenharmony_ci pool->type = pCreateInfo->queryType; 328bf215546Sopenharmony_ci pool->stride = slot_size; 329bf215546Sopenharmony_ci pool->size = pCreateInfo->queryCount; 330bf215546Sopenharmony_ci pool->pipeline_statistics = pCreateInfo->pipelineStatistics; 331bf215546Sopenharmony_ci *pQueryPool = tu_query_pool_to_handle(pool); 332bf215546Sopenharmony_ci 333bf215546Sopenharmony_ci return VK_SUCCESS; 334bf215546Sopenharmony_ci} 335bf215546Sopenharmony_ci 336bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL 337bf215546Sopenharmony_citu_DestroyQueryPool(VkDevice _device, 338bf215546Sopenharmony_ci VkQueryPool _pool, 339bf215546Sopenharmony_ci const VkAllocationCallbacks *pAllocator) 340bf215546Sopenharmony_ci{ 341bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_device, device, _device); 342bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_query_pool, pool, _pool); 343bf215546Sopenharmony_ci 344bf215546Sopenharmony_ci if (!pool) 345bf215546Sopenharmony_ci return; 346bf215546Sopenharmony_ci 347bf215546Sopenharmony_ci tu_bo_finish(device, pool->bo); 348bf215546Sopenharmony_ci vk_object_free(&device->vk, pAllocator, pool); 349bf215546Sopenharmony_ci} 350bf215546Sopenharmony_ci 351bf215546Sopenharmony_cistatic uint32_t 352bf215546Sopenharmony_ciget_result_count(struct tu_query_pool *pool) 353bf215546Sopenharmony_ci{ 354bf215546Sopenharmony_ci switch (pool->type) { 355bf215546Sopenharmony_ci /* Occulusion and timestamp queries write one integer value */ 356bf215546Sopenharmony_ci case VK_QUERY_TYPE_OCCLUSION: 357bf215546Sopenharmony_ci case VK_QUERY_TYPE_TIMESTAMP: 358bf215546Sopenharmony_ci case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: 359bf215546Sopenharmony_ci return 1; 360bf215546Sopenharmony_ci /* Transform feedback queries write two integer values */ 361bf215546Sopenharmony_ci case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 362bf215546Sopenharmony_ci return 2; 363bf215546Sopenharmony_ci case VK_QUERY_TYPE_PIPELINE_STATISTICS: 364bf215546Sopenharmony_ci return util_bitcount(pool->pipeline_statistics); 365bf215546Sopenharmony_ci case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: 366bf215546Sopenharmony_ci return pool->counter_index_count; 367bf215546Sopenharmony_ci default: 368bf215546Sopenharmony_ci assert(!"Invalid query type"); 369bf215546Sopenharmony_ci return 0; 370bf215546Sopenharmony_ci } 371bf215546Sopenharmony_ci} 372bf215546Sopenharmony_ci 373bf215546Sopenharmony_cistatic uint32_t 374bf215546Sopenharmony_cistatistics_index(uint32_t *statistics) 375bf215546Sopenharmony_ci{ 376bf215546Sopenharmony_ci uint32_t stat; 377bf215546Sopenharmony_ci stat = u_bit_scan(statistics); 378bf215546Sopenharmony_ci 379bf215546Sopenharmony_ci switch (1 << stat) { 380bf215546Sopenharmony_ci case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT: 381bf215546Sopenharmony_ci case VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT: 382bf215546Sopenharmony_ci return 0; 383bf215546Sopenharmony_ci case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT: 384bf215546Sopenharmony_ci return 1; 385bf215546Sopenharmony_ci case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT: 386bf215546Sopenharmony_ci return 2; 387bf215546Sopenharmony_ci case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT: 388bf215546Sopenharmony_ci return 4; 389bf215546Sopenharmony_ci case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT: 390bf215546Sopenharmony_ci return 5; 391bf215546Sopenharmony_ci case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT: 392bf215546Sopenharmony_ci return 6; 393bf215546Sopenharmony_ci case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT: 394bf215546Sopenharmony_ci return 7; 395bf215546Sopenharmony_ci case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT: 396bf215546Sopenharmony_ci return 8; 397bf215546Sopenharmony_ci case VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT: 398bf215546Sopenharmony_ci return 9; 399bf215546Sopenharmony_ci case VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT: 400bf215546Sopenharmony_ci return 10; 401bf215546Sopenharmony_ci default: 402bf215546Sopenharmony_ci return 0; 403bf215546Sopenharmony_ci } 404bf215546Sopenharmony_ci} 405bf215546Sopenharmony_ci 406bf215546Sopenharmony_cistatic bool 407bf215546Sopenharmony_ciis_pipeline_query_with_vertex_stage(uint32_t pipeline_statistics) 408bf215546Sopenharmony_ci{ 409bf215546Sopenharmony_ci return pipeline_statistics & 410bf215546Sopenharmony_ci (VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT | 411bf215546Sopenharmony_ci VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT | 412bf215546Sopenharmony_ci VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT | 413bf215546Sopenharmony_ci VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT | 414bf215546Sopenharmony_ci VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT | 415bf215546Sopenharmony_ci VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT | 416bf215546Sopenharmony_ci VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT | 417bf215546Sopenharmony_ci VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT | 418bf215546Sopenharmony_ci VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT); 419bf215546Sopenharmony_ci} 420bf215546Sopenharmony_ci 421bf215546Sopenharmony_cistatic bool 422bf215546Sopenharmony_ciis_pipeline_query_with_fragment_stage(uint32_t pipeline_statistics) 423bf215546Sopenharmony_ci{ 424bf215546Sopenharmony_ci return pipeline_statistics & 425bf215546Sopenharmony_ci VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT; 426bf215546Sopenharmony_ci} 427bf215546Sopenharmony_ci 428bf215546Sopenharmony_cistatic bool 429bf215546Sopenharmony_ciis_pipeline_query_with_compute_stage(uint32_t pipeline_statistics) 430bf215546Sopenharmony_ci{ 431bf215546Sopenharmony_ci return pipeline_statistics & 432bf215546Sopenharmony_ci VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT; 433bf215546Sopenharmony_ci} 434bf215546Sopenharmony_ci 435bf215546Sopenharmony_ci/* Wait on the the availability status of a query up until a timeout. */ 436bf215546Sopenharmony_cistatic VkResult 437bf215546Sopenharmony_ciwait_for_available(struct tu_device *device, struct tu_query_pool *pool, 438bf215546Sopenharmony_ci uint32_t query) 439bf215546Sopenharmony_ci{ 440bf215546Sopenharmony_ci /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a 441bf215546Sopenharmony_ci * scheduler friendly way instead of busy polling once the patch has landed 442bf215546Sopenharmony_ci * upstream. */ 443bf215546Sopenharmony_ci struct query_slot *slot = slot_address(pool, query); 444bf215546Sopenharmony_ci uint64_t abs_timeout = os_time_get_absolute_timeout( 445bf215546Sopenharmony_ci WAIT_TIMEOUT * NSEC_PER_SEC); 446bf215546Sopenharmony_ci while(os_time_get_nano() < abs_timeout) { 447bf215546Sopenharmony_ci if (query_is_available(slot)) 448bf215546Sopenharmony_ci return VK_SUCCESS; 449bf215546Sopenharmony_ci } 450bf215546Sopenharmony_ci return vk_error(device, VK_TIMEOUT); 451bf215546Sopenharmony_ci} 452bf215546Sopenharmony_ci 453bf215546Sopenharmony_ci/* Writes a query value to a buffer from the CPU. */ 454bf215546Sopenharmony_cistatic void 455bf215546Sopenharmony_ciwrite_query_value_cpu(char* base, 456bf215546Sopenharmony_ci uint32_t offset, 457bf215546Sopenharmony_ci uint64_t value, 458bf215546Sopenharmony_ci VkQueryResultFlags flags) 459bf215546Sopenharmony_ci{ 460bf215546Sopenharmony_ci if (flags & VK_QUERY_RESULT_64_BIT) { 461bf215546Sopenharmony_ci *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value; 462bf215546Sopenharmony_ci } else { 463bf215546Sopenharmony_ci *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value; 464bf215546Sopenharmony_ci } 465bf215546Sopenharmony_ci} 466bf215546Sopenharmony_ci 467bf215546Sopenharmony_cistatic VkResult 468bf215546Sopenharmony_ciget_query_pool_results(struct tu_device *device, 469bf215546Sopenharmony_ci struct tu_query_pool *pool, 470bf215546Sopenharmony_ci uint32_t firstQuery, 471bf215546Sopenharmony_ci uint32_t queryCount, 472bf215546Sopenharmony_ci size_t dataSize, 473bf215546Sopenharmony_ci void *pData, 474bf215546Sopenharmony_ci VkDeviceSize stride, 475bf215546Sopenharmony_ci VkQueryResultFlags flags) 476bf215546Sopenharmony_ci{ 477bf215546Sopenharmony_ci assert(dataSize >= stride * queryCount); 478bf215546Sopenharmony_ci 479bf215546Sopenharmony_ci char *result_base = pData; 480bf215546Sopenharmony_ci VkResult result = VK_SUCCESS; 481bf215546Sopenharmony_ci for (uint32_t i = 0; i < queryCount; i++) { 482bf215546Sopenharmony_ci uint32_t query = firstQuery + i; 483bf215546Sopenharmony_ci struct query_slot *slot = slot_address(pool, query); 484bf215546Sopenharmony_ci bool available = query_is_available(slot); 485bf215546Sopenharmony_ci uint32_t result_count = get_result_count(pool); 486bf215546Sopenharmony_ci uint32_t statistics = pool->pipeline_statistics; 487bf215546Sopenharmony_ci 488bf215546Sopenharmony_ci if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) { 489bf215546Sopenharmony_ci VkResult wait_result = wait_for_available(device, pool, query); 490bf215546Sopenharmony_ci if (wait_result != VK_SUCCESS) 491bf215546Sopenharmony_ci return wait_result; 492bf215546Sopenharmony_ci available = true; 493bf215546Sopenharmony_ci } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) { 494bf215546Sopenharmony_ci /* From the Vulkan 1.1.130 spec: 495bf215546Sopenharmony_ci * 496bf215546Sopenharmony_ci * If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are 497bf215546Sopenharmony_ci * both not set then no result values are written to pData for 498bf215546Sopenharmony_ci * queries that are in the unavailable state at the time of the 499bf215546Sopenharmony_ci * call, and vkGetQueryPoolResults returns VK_NOT_READY. However, 500bf215546Sopenharmony_ci * availability state is still written to pData for those queries 501bf215546Sopenharmony_ci * if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set. 502bf215546Sopenharmony_ci */ 503bf215546Sopenharmony_ci result = VK_NOT_READY; 504bf215546Sopenharmony_ci if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) { 505bf215546Sopenharmony_ci result_base += stride; 506bf215546Sopenharmony_ci continue; 507bf215546Sopenharmony_ci } 508bf215546Sopenharmony_ci } 509bf215546Sopenharmony_ci 510bf215546Sopenharmony_ci for (uint32_t k = 0; k < result_count; k++) { 511bf215546Sopenharmony_ci if (available) { 512bf215546Sopenharmony_ci uint64_t *result; 513bf215546Sopenharmony_ci 514bf215546Sopenharmony_ci if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { 515bf215546Sopenharmony_ci uint32_t stat_idx = statistics_index(&statistics); 516bf215546Sopenharmony_ci result = query_result_addr(pool, query, uint64_t, stat_idx); 517bf215546Sopenharmony_ci } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { 518bf215546Sopenharmony_ci result = query_result_addr(pool, query, struct perfcntr_query_slot, k); 519bf215546Sopenharmony_ci } else { 520bf215546Sopenharmony_ci result = query_result_addr(pool, query, uint64_t, k); 521bf215546Sopenharmony_ci } 522bf215546Sopenharmony_ci 523bf215546Sopenharmony_ci write_query_value_cpu(result_base, k, *result, flags); 524bf215546Sopenharmony_ci } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) 525bf215546Sopenharmony_ci /* From the Vulkan 1.1.130 spec: 526bf215546Sopenharmony_ci * 527bf215546Sopenharmony_ci * If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT 528bf215546Sopenharmony_ci * is not set, and the query’s status is unavailable, an 529bf215546Sopenharmony_ci * intermediate result value between zero and the final result 530bf215546Sopenharmony_ci * value is written to pData for that query. 531bf215546Sopenharmony_ci * 532bf215546Sopenharmony_ci * Just return 0 here for simplicity since it's a valid result. 533bf215546Sopenharmony_ci */ 534bf215546Sopenharmony_ci write_query_value_cpu(result_base, k, 0, flags); 535bf215546Sopenharmony_ci } 536bf215546Sopenharmony_ci 537bf215546Sopenharmony_ci if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) 538bf215546Sopenharmony_ci /* From the Vulkan 1.1.130 spec: 539bf215546Sopenharmony_ci * 540bf215546Sopenharmony_ci * If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final 541bf215546Sopenharmony_ci * integer value written for each query is non-zero if the query’s 542bf215546Sopenharmony_ci * status was available or zero if the status was unavailable. 543bf215546Sopenharmony_ci */ 544bf215546Sopenharmony_ci write_query_value_cpu(result_base, result_count, available, flags); 545bf215546Sopenharmony_ci 546bf215546Sopenharmony_ci result_base += stride; 547bf215546Sopenharmony_ci } 548bf215546Sopenharmony_ci return result; 549bf215546Sopenharmony_ci} 550bf215546Sopenharmony_ci 551bf215546Sopenharmony_ciVKAPI_ATTR VkResult VKAPI_CALL 552bf215546Sopenharmony_citu_GetQueryPoolResults(VkDevice _device, 553bf215546Sopenharmony_ci VkQueryPool queryPool, 554bf215546Sopenharmony_ci uint32_t firstQuery, 555bf215546Sopenharmony_ci uint32_t queryCount, 556bf215546Sopenharmony_ci size_t dataSize, 557bf215546Sopenharmony_ci void *pData, 558bf215546Sopenharmony_ci VkDeviceSize stride, 559bf215546Sopenharmony_ci VkQueryResultFlags flags) 560bf215546Sopenharmony_ci{ 561bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_device, device, _device); 562bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_query_pool, pool, queryPool); 563bf215546Sopenharmony_ci assert(firstQuery + queryCount <= pool->size); 564bf215546Sopenharmony_ci 565bf215546Sopenharmony_ci if (vk_device_is_lost(&device->vk)) 566bf215546Sopenharmony_ci return VK_ERROR_DEVICE_LOST; 567bf215546Sopenharmony_ci 568bf215546Sopenharmony_ci switch (pool->type) { 569bf215546Sopenharmony_ci case VK_QUERY_TYPE_OCCLUSION: 570bf215546Sopenharmony_ci case VK_QUERY_TYPE_TIMESTAMP: 571bf215546Sopenharmony_ci case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 572bf215546Sopenharmony_ci case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: 573bf215546Sopenharmony_ci case VK_QUERY_TYPE_PIPELINE_STATISTICS: 574bf215546Sopenharmony_ci case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: 575bf215546Sopenharmony_ci return get_query_pool_results(device, pool, firstQuery, queryCount, 576bf215546Sopenharmony_ci dataSize, pData, stride, flags); 577bf215546Sopenharmony_ci default: 578bf215546Sopenharmony_ci assert(!"Invalid query type"); 579bf215546Sopenharmony_ci } 580bf215546Sopenharmony_ci return VK_SUCCESS; 581bf215546Sopenharmony_ci} 582bf215546Sopenharmony_ci 583bf215546Sopenharmony_ci/* Copies a query value from one buffer to another from the GPU. */ 584bf215546Sopenharmony_cistatic void 585bf215546Sopenharmony_cicopy_query_value_gpu(struct tu_cmd_buffer *cmdbuf, 586bf215546Sopenharmony_ci struct tu_cs *cs, 587bf215546Sopenharmony_ci uint64_t src_iova, 588bf215546Sopenharmony_ci uint64_t base_write_iova, 589bf215546Sopenharmony_ci uint32_t offset, 590bf215546Sopenharmony_ci VkQueryResultFlags flags) { 591bf215546Sopenharmony_ci uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ? 592bf215546Sopenharmony_ci sizeof(uint64_t) : sizeof(uint32_t); 593bf215546Sopenharmony_ci uint64_t write_iova = base_write_iova + (offset * element_size); 594bf215546Sopenharmony_ci 595bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5); 596bf215546Sopenharmony_ci uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ? 597bf215546Sopenharmony_ci CP_MEM_TO_MEM_0_DOUBLE : 0; 598bf215546Sopenharmony_ci tu_cs_emit(cs, mem_to_mem_flags); 599bf215546Sopenharmony_ci tu_cs_emit_qw(cs, write_iova); 600bf215546Sopenharmony_ci tu_cs_emit_qw(cs, src_iova); 601bf215546Sopenharmony_ci} 602bf215546Sopenharmony_ci 603bf215546Sopenharmony_cistatic void 604bf215546Sopenharmony_ciemit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf, 605bf215546Sopenharmony_ci struct tu_cs *cs, 606bf215546Sopenharmony_ci struct tu_query_pool *pool, 607bf215546Sopenharmony_ci uint32_t firstQuery, 608bf215546Sopenharmony_ci uint32_t queryCount, 609bf215546Sopenharmony_ci struct tu_buffer *buffer, 610bf215546Sopenharmony_ci VkDeviceSize dstOffset, 611bf215546Sopenharmony_ci VkDeviceSize stride, 612bf215546Sopenharmony_ci VkQueryResultFlags flags) 613bf215546Sopenharmony_ci{ 614bf215546Sopenharmony_ci /* From the Vulkan 1.1.130 spec: 615bf215546Sopenharmony_ci * 616bf215546Sopenharmony_ci * vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous 617bf215546Sopenharmony_ci * uses of vkCmdResetQueryPool in the same queue, without any additional 618bf215546Sopenharmony_ci * synchronization. 619bf215546Sopenharmony_ci * 620bf215546Sopenharmony_ci * To ensure that previous writes to the available bit are coherent, first 621bf215546Sopenharmony_ci * wait for all writes to complete. 622bf215546Sopenharmony_ci */ 623bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); 624bf215546Sopenharmony_ci 625bf215546Sopenharmony_ci for (uint32_t i = 0; i < queryCount; i++) { 626bf215546Sopenharmony_ci uint32_t query = firstQuery + i; 627bf215546Sopenharmony_ci uint64_t available_iova = query_available_iova(pool, query); 628bf215546Sopenharmony_ci uint64_t buffer_iova = buffer->iova + dstOffset + i * stride; 629bf215546Sopenharmony_ci uint32_t result_count = get_result_count(pool); 630bf215546Sopenharmony_ci uint32_t statistics = pool->pipeline_statistics; 631bf215546Sopenharmony_ci 632bf215546Sopenharmony_ci /* Wait for the available bit to be set if executed with the 633bf215546Sopenharmony_ci * VK_QUERY_RESULT_WAIT_BIT flag. */ 634bf215546Sopenharmony_ci if (flags & VK_QUERY_RESULT_WAIT_BIT) { 635bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6); 636bf215546Sopenharmony_ci tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) | 637bf215546Sopenharmony_ci CP_WAIT_REG_MEM_0_POLL_MEMORY); 638bf215546Sopenharmony_ci tu_cs_emit_qw(cs, available_iova); 639bf215546Sopenharmony_ci tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1)); 640bf215546Sopenharmony_ci tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0)); 641bf215546Sopenharmony_ci tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16)); 642bf215546Sopenharmony_ci } 643bf215546Sopenharmony_ci 644bf215546Sopenharmony_ci for (uint32_t k = 0; k < result_count; k++) { 645bf215546Sopenharmony_ci uint64_t result_iova; 646bf215546Sopenharmony_ci 647bf215546Sopenharmony_ci if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { 648bf215546Sopenharmony_ci uint32_t stat_idx = statistics_index(&statistics); 649bf215546Sopenharmony_ci result_iova = query_result_iova(pool, query, uint64_t, stat_idx); 650bf215546Sopenharmony_ci } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { 651bf215546Sopenharmony_ci result_iova = query_result_iova(pool, query, 652bf215546Sopenharmony_ci struct perfcntr_query_slot, k); 653bf215546Sopenharmony_ci } else { 654bf215546Sopenharmony_ci result_iova = query_result_iova(pool, query, uint64_t, k); 655bf215546Sopenharmony_ci } 656bf215546Sopenharmony_ci 657bf215546Sopenharmony_ci if (flags & VK_QUERY_RESULT_PARTIAL_BIT) { 658bf215546Sopenharmony_ci /* Unconditionally copying the bo->result into the buffer here is 659bf215546Sopenharmony_ci * valid because we only set bo->result on vkCmdEndQuery. Thus, even 660bf215546Sopenharmony_ci * if the query is unavailable, this will copy the correct partial 661bf215546Sopenharmony_ci * value of 0. 662bf215546Sopenharmony_ci */ 663bf215546Sopenharmony_ci copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova, 664bf215546Sopenharmony_ci k /* offset */, flags); 665bf215546Sopenharmony_ci } else { 666bf215546Sopenharmony_ci /* Conditionally copy bo->result into the buffer based on whether the 667bf215546Sopenharmony_ci * query is available. 668bf215546Sopenharmony_ci * 669bf215546Sopenharmony_ci * NOTE: For the conditional packets to be executed, CP_COND_EXEC 670bf215546Sopenharmony_ci * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests 671bf215546Sopenharmony_ci * that 0 < available < 2, aka available == 1. 672bf215546Sopenharmony_ci */ 673bf215546Sopenharmony_ci tu_cs_reserve(cs, 7 + 6); 674bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6); 675bf215546Sopenharmony_ci tu_cs_emit_qw(cs, available_iova); 676bf215546Sopenharmony_ci tu_cs_emit_qw(cs, available_iova); 677bf215546Sopenharmony_ci tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2)); 678bf215546Sopenharmony_ci tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */ 679bf215546Sopenharmony_ci 680bf215546Sopenharmony_ci /* Start of conditional execution */ 681bf215546Sopenharmony_ci copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova, 682bf215546Sopenharmony_ci k /* offset */, flags); 683bf215546Sopenharmony_ci /* End of conditional execution */ 684bf215546Sopenharmony_ci } 685bf215546Sopenharmony_ci } 686bf215546Sopenharmony_ci 687bf215546Sopenharmony_ci if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { 688bf215546Sopenharmony_ci copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova, 689bf215546Sopenharmony_ci result_count /* offset */, flags); 690bf215546Sopenharmony_ci } 691bf215546Sopenharmony_ci } 692bf215546Sopenharmony_ci} 693bf215546Sopenharmony_ci 694bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL 695bf215546Sopenharmony_citu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, 696bf215546Sopenharmony_ci VkQueryPool queryPool, 697bf215546Sopenharmony_ci uint32_t firstQuery, 698bf215546Sopenharmony_ci uint32_t queryCount, 699bf215546Sopenharmony_ci VkBuffer dstBuffer, 700bf215546Sopenharmony_ci VkDeviceSize dstOffset, 701bf215546Sopenharmony_ci VkDeviceSize stride, 702bf215546Sopenharmony_ci VkQueryResultFlags flags) 703bf215546Sopenharmony_ci{ 704bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); 705bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_query_pool, pool, queryPool); 706bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer); 707bf215546Sopenharmony_ci struct tu_cs *cs = &cmdbuf->cs; 708bf215546Sopenharmony_ci assert(firstQuery + queryCount <= pool->size); 709bf215546Sopenharmony_ci 710bf215546Sopenharmony_ci switch (pool->type) { 711bf215546Sopenharmony_ci case VK_QUERY_TYPE_OCCLUSION: 712bf215546Sopenharmony_ci case VK_QUERY_TYPE_TIMESTAMP: 713bf215546Sopenharmony_ci case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 714bf215546Sopenharmony_ci case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: 715bf215546Sopenharmony_ci case VK_QUERY_TYPE_PIPELINE_STATISTICS: 716bf215546Sopenharmony_ci return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery, 717bf215546Sopenharmony_ci queryCount, buffer, dstOffset, stride, flags); 718bf215546Sopenharmony_ci case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: 719bf215546Sopenharmony_ci unreachable("allowCommandBufferQueryCopies is false"); 720bf215546Sopenharmony_ci default: 721bf215546Sopenharmony_ci assert(!"Invalid query type"); 722bf215546Sopenharmony_ci } 723bf215546Sopenharmony_ci} 724bf215546Sopenharmony_ci 725bf215546Sopenharmony_cistatic void 726bf215546Sopenharmony_ciemit_reset_query_pool(struct tu_cmd_buffer *cmdbuf, 727bf215546Sopenharmony_ci struct tu_query_pool *pool, 728bf215546Sopenharmony_ci uint32_t firstQuery, 729bf215546Sopenharmony_ci uint32_t queryCount) 730bf215546Sopenharmony_ci{ 731bf215546Sopenharmony_ci struct tu_cs *cs = &cmdbuf->cs; 732bf215546Sopenharmony_ci 733bf215546Sopenharmony_ci for (uint32_t i = 0; i < queryCount; i++) { 734bf215546Sopenharmony_ci uint32_t query = firstQuery + i; 735bf215546Sopenharmony_ci uint32_t statistics = pool->pipeline_statistics; 736bf215546Sopenharmony_ci 737bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); 738bf215546Sopenharmony_ci tu_cs_emit_qw(cs, query_available_iova(pool, query)); 739bf215546Sopenharmony_ci tu_cs_emit_qw(cs, 0x0); 740bf215546Sopenharmony_ci 741bf215546Sopenharmony_ci for (uint32_t k = 0; k < get_result_count(pool); k++) { 742bf215546Sopenharmony_ci uint64_t result_iova; 743bf215546Sopenharmony_ci 744bf215546Sopenharmony_ci if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { 745bf215546Sopenharmony_ci uint32_t stat_idx = statistics_index(&statistics); 746bf215546Sopenharmony_ci result_iova = query_result_iova(pool, query, uint64_t, stat_idx); 747bf215546Sopenharmony_ci } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { 748bf215546Sopenharmony_ci result_iova = query_result_iova(pool, query, 749bf215546Sopenharmony_ci struct perfcntr_query_slot, k); 750bf215546Sopenharmony_ci } else { 751bf215546Sopenharmony_ci result_iova = query_result_iova(pool, query, uint64_t, k); 752bf215546Sopenharmony_ci } 753bf215546Sopenharmony_ci 754bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); 755bf215546Sopenharmony_ci tu_cs_emit_qw(cs, result_iova); 756bf215546Sopenharmony_ci tu_cs_emit_qw(cs, 0x0); 757bf215546Sopenharmony_ci } 758bf215546Sopenharmony_ci } 759bf215546Sopenharmony_ci 760bf215546Sopenharmony_ci} 761bf215546Sopenharmony_ci 762bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL 763bf215546Sopenharmony_citu_CmdResetQueryPool(VkCommandBuffer commandBuffer, 764bf215546Sopenharmony_ci VkQueryPool queryPool, 765bf215546Sopenharmony_ci uint32_t firstQuery, 766bf215546Sopenharmony_ci uint32_t queryCount) 767bf215546Sopenharmony_ci{ 768bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); 769bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_query_pool, pool, queryPool); 770bf215546Sopenharmony_ci 771bf215546Sopenharmony_ci switch (pool->type) { 772bf215546Sopenharmony_ci case VK_QUERY_TYPE_TIMESTAMP: 773bf215546Sopenharmony_ci case VK_QUERY_TYPE_OCCLUSION: 774bf215546Sopenharmony_ci case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 775bf215546Sopenharmony_ci case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: 776bf215546Sopenharmony_ci case VK_QUERY_TYPE_PIPELINE_STATISTICS: 777bf215546Sopenharmony_ci case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: 778bf215546Sopenharmony_ci emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount); 779bf215546Sopenharmony_ci break; 780bf215546Sopenharmony_ci default: 781bf215546Sopenharmony_ci assert(!"Invalid query type"); 782bf215546Sopenharmony_ci } 783bf215546Sopenharmony_ci} 784bf215546Sopenharmony_ci 785bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL 786bf215546Sopenharmony_citu_ResetQueryPool(VkDevice device, 787bf215546Sopenharmony_ci VkQueryPool queryPool, 788bf215546Sopenharmony_ci uint32_t firstQuery, 789bf215546Sopenharmony_ci uint32_t queryCount) 790bf215546Sopenharmony_ci{ 791bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_query_pool, pool, queryPool); 792bf215546Sopenharmony_ci 793bf215546Sopenharmony_ci for (uint32_t i = 0; i < queryCount; i++) { 794bf215546Sopenharmony_ci struct query_slot *slot = slot_address(pool, i + firstQuery); 795bf215546Sopenharmony_ci slot->available = 0; 796bf215546Sopenharmony_ci 797bf215546Sopenharmony_ci for (uint32_t k = 0; k < get_result_count(pool); k++) { 798bf215546Sopenharmony_ci uint64_t *res; 799bf215546Sopenharmony_ci 800bf215546Sopenharmony_ci if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { 801bf215546Sopenharmony_ci res = query_result_addr(pool, i + firstQuery, 802bf215546Sopenharmony_ci struct perfcntr_query_slot, k); 803bf215546Sopenharmony_ci } else { 804bf215546Sopenharmony_ci res = query_result_addr(pool, i + firstQuery, uint64_t, k); 805bf215546Sopenharmony_ci } 806bf215546Sopenharmony_ci 807bf215546Sopenharmony_ci *res = 0; 808bf215546Sopenharmony_ci } 809bf215546Sopenharmony_ci } 810bf215546Sopenharmony_ci} 811bf215546Sopenharmony_ci 812bf215546Sopenharmony_cistatic void 813bf215546Sopenharmony_ciemit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf, 814bf215546Sopenharmony_ci struct tu_query_pool *pool, 815bf215546Sopenharmony_ci uint32_t query) 816bf215546Sopenharmony_ci{ 817bf215546Sopenharmony_ci /* From the Vulkan 1.1.130 spec: 818bf215546Sopenharmony_ci * 819bf215546Sopenharmony_ci * A query must begin and end inside the same subpass of a render pass 820bf215546Sopenharmony_ci * instance, or must both begin and end outside of a render pass 821bf215546Sopenharmony_ci * instance. 822bf215546Sopenharmony_ci * 823bf215546Sopenharmony_ci * Unlike on an immediate-mode renderer, Turnip renders all tiles on 824bf215546Sopenharmony_ci * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a 825bf215546Sopenharmony_ci * query begins/ends inside the same subpass of a render pass, we need to 826bf215546Sopenharmony_ci * record the packets on the secondary draw command stream. cmdbuf->draw_cs 827bf215546Sopenharmony_ci * is then run on every tile during render, so we just need to accumulate 828bf215546Sopenharmony_ci * sample counts in slot->result to compute the query result. 829bf215546Sopenharmony_ci */ 830bf215546Sopenharmony_ci struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs; 831bf215546Sopenharmony_ci 832bf215546Sopenharmony_ci uint64_t begin_iova = occlusion_query_iova(pool, query, begin); 833bf215546Sopenharmony_ci 834bf215546Sopenharmony_ci tu_cs_emit_regs(cs, 835bf215546Sopenharmony_ci A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true)); 836bf215546Sopenharmony_ci 837bf215546Sopenharmony_ci tu_cs_emit_regs(cs, 838bf215546Sopenharmony_ci A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova)); 839bf215546Sopenharmony_ci 840bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); 841bf215546Sopenharmony_ci tu_cs_emit(cs, ZPASS_DONE); 842bf215546Sopenharmony_ci} 843bf215546Sopenharmony_ci 844bf215546Sopenharmony_cistatic void 845bf215546Sopenharmony_ciemit_begin_stat_query(struct tu_cmd_buffer *cmdbuf, 846bf215546Sopenharmony_ci struct tu_query_pool *pool, 847bf215546Sopenharmony_ci uint32_t query) 848bf215546Sopenharmony_ci{ 849bf215546Sopenharmony_ci struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs; 850bf215546Sopenharmony_ci uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin); 851bf215546Sopenharmony_ci 852bf215546Sopenharmony_ci if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) { 853bf215546Sopenharmony_ci bool need_cond_exec = cmdbuf->state.pass && cmdbuf->state.prim_counters_running; 854bf215546Sopenharmony_ci cmdbuf->state.prim_counters_running++; 855bf215546Sopenharmony_ci 856bf215546Sopenharmony_ci /* Prevent starting primitive counters when it is supposed to be stopped 857bf215546Sopenharmony_ci * for outer VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query. 858bf215546Sopenharmony_ci */ 859bf215546Sopenharmony_ci if (need_cond_exec) { 860bf215546Sopenharmony_ci tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | 861bf215546Sopenharmony_ci CP_COND_REG_EXEC_0_SYSMEM | 862bf215546Sopenharmony_ci CP_COND_REG_EXEC_0_BINNING); 863bf215546Sopenharmony_ci } 864bf215546Sopenharmony_ci 865bf215546Sopenharmony_ci tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS); 866bf215546Sopenharmony_ci 867bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3); 868bf215546Sopenharmony_ci tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running)); 869bf215546Sopenharmony_ci tu_cs_emit(cs, 0); 870bf215546Sopenharmony_ci 871bf215546Sopenharmony_ci if (need_cond_exec) { 872bf215546Sopenharmony_ci tu_cond_exec_end(cs); 873bf215546Sopenharmony_ci } 874bf215546Sopenharmony_ci } 875bf215546Sopenharmony_ci 876bf215546Sopenharmony_ci if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) { 877bf215546Sopenharmony_ci tu6_emit_event_write(cmdbuf, cs, START_FRAGMENT_CTRS); 878bf215546Sopenharmony_ci } 879bf215546Sopenharmony_ci 880bf215546Sopenharmony_ci if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) { 881bf215546Sopenharmony_ci tu6_emit_event_write(cmdbuf, cs, START_COMPUTE_CTRS); 882bf215546Sopenharmony_ci } 883bf215546Sopenharmony_ci 884bf215546Sopenharmony_ci tu_cs_emit_wfi(cs); 885bf215546Sopenharmony_ci 886bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); 887bf215546Sopenharmony_ci tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) | 888bf215546Sopenharmony_ci CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) | 889bf215546Sopenharmony_ci CP_REG_TO_MEM_0_64B); 890bf215546Sopenharmony_ci tu_cs_emit_qw(cs, begin_iova); 891bf215546Sopenharmony_ci} 892bf215546Sopenharmony_ci 893bf215546Sopenharmony_cistatic void 894bf215546Sopenharmony_ciemit_perfcntrs_pass_start(struct tu_cs *cs, uint32_t pass) 895bf215546Sopenharmony_ci{ 896bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_REG_TEST, 1); 897bf215546Sopenharmony_ci tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG( 898bf215546Sopenharmony_ci REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG)) | 899bf215546Sopenharmony_ci A6XX_CP_REG_TEST_0_BIT(pass) | 900bf215546Sopenharmony_ci A6XX_CP_REG_TEST_0_WAIT_FOR_ME); 901bf215546Sopenharmony_ci tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST)); 902bf215546Sopenharmony_ci} 903bf215546Sopenharmony_ci 904bf215546Sopenharmony_cistatic void 905bf215546Sopenharmony_ciemit_begin_perf_query(struct tu_cmd_buffer *cmdbuf, 906bf215546Sopenharmony_ci struct tu_query_pool *pool, 907bf215546Sopenharmony_ci uint32_t query) 908bf215546Sopenharmony_ci{ 909bf215546Sopenharmony_ci struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs; 910bf215546Sopenharmony_ci uint32_t last_pass = ~0; 911bf215546Sopenharmony_ci 912bf215546Sopenharmony_ci if (cmdbuf->state.pass) { 913bf215546Sopenharmony_ci cmdbuf->state.rp.draw_cs_writes_to_cond_pred = true; 914bf215546Sopenharmony_ci } 915bf215546Sopenharmony_ci 916bf215546Sopenharmony_ci /* Querying perf counters happens in these steps: 917bf215546Sopenharmony_ci * 918bf215546Sopenharmony_ci * 0) There's a scratch reg to set a pass index for perf counters query. 919bf215546Sopenharmony_ci * Prepare cmd streams to set each pass index to the reg at device 920bf215546Sopenharmony_ci * creation time. See tu_CreateDevice in tu_device.c 921bf215546Sopenharmony_ci * 1) Emit command streams to read all requested perf counters at all 922bf215546Sopenharmony_ci * passes in begin/end query with CP_REG_TEST/CP_COND_REG_EXEC, which 923bf215546Sopenharmony_ci * reads the scratch reg where pass index is set. 924bf215546Sopenharmony_ci * See emit_perfcntrs_pass_start. 925bf215546Sopenharmony_ci * 2) Pick the right cs setting proper pass index to the reg and prepend 926bf215546Sopenharmony_ci * it to the command buffer at each submit time. 927bf215546Sopenharmony_ci * See tu_QueueSubmit in tu_drm.c 928bf215546Sopenharmony_ci * 3) If the pass index in the reg is true, then executes the command 929bf215546Sopenharmony_ci * stream below CP_COND_REG_EXEC. 930bf215546Sopenharmony_ci */ 931bf215546Sopenharmony_ci 932bf215546Sopenharmony_ci tu_cs_emit_wfi(cs); 933bf215546Sopenharmony_ci 934bf215546Sopenharmony_ci for (uint32_t i = 0; i < pool->counter_index_count; i++) { 935bf215546Sopenharmony_ci struct tu_perf_query_data *data = &pool->perf_query_data[i]; 936bf215546Sopenharmony_ci 937bf215546Sopenharmony_ci if (last_pass != data->pass) { 938bf215546Sopenharmony_ci last_pass = data->pass; 939bf215546Sopenharmony_ci 940bf215546Sopenharmony_ci if (data->pass != 0) 941bf215546Sopenharmony_ci tu_cond_exec_end(cs); 942bf215546Sopenharmony_ci emit_perfcntrs_pass_start(cs, data->pass); 943bf215546Sopenharmony_ci } 944bf215546Sopenharmony_ci 945bf215546Sopenharmony_ci const struct fd_perfcntr_counter *counter = 946bf215546Sopenharmony_ci &pool->perf_group[data->gid].counters[data->cntr_reg]; 947bf215546Sopenharmony_ci const struct fd_perfcntr_countable *countable = 948bf215546Sopenharmony_ci &pool->perf_group[data->gid].countables[data->cid]; 949bf215546Sopenharmony_ci 950bf215546Sopenharmony_ci tu_cs_emit_pkt4(cs, counter->select_reg, 1); 951bf215546Sopenharmony_ci tu_cs_emit(cs, countable->selector); 952bf215546Sopenharmony_ci } 953bf215546Sopenharmony_ci tu_cond_exec_end(cs); 954bf215546Sopenharmony_ci 955bf215546Sopenharmony_ci last_pass = ~0; 956bf215546Sopenharmony_ci tu_cs_emit_wfi(cs); 957bf215546Sopenharmony_ci 958bf215546Sopenharmony_ci for (uint32_t i = 0; i < pool->counter_index_count; i++) { 959bf215546Sopenharmony_ci struct tu_perf_query_data *data = &pool->perf_query_data[i]; 960bf215546Sopenharmony_ci 961bf215546Sopenharmony_ci if (last_pass != data->pass) { 962bf215546Sopenharmony_ci last_pass = data->pass; 963bf215546Sopenharmony_ci 964bf215546Sopenharmony_ci if (data->pass != 0) 965bf215546Sopenharmony_ci tu_cond_exec_end(cs); 966bf215546Sopenharmony_ci emit_perfcntrs_pass_start(cs, data->pass); 967bf215546Sopenharmony_ci } 968bf215546Sopenharmony_ci 969bf215546Sopenharmony_ci const struct fd_perfcntr_counter *counter = 970bf215546Sopenharmony_ci &pool->perf_group[data->gid].counters[data->cntr_reg]; 971bf215546Sopenharmony_ci 972bf215546Sopenharmony_ci uint64_t begin_iova = perf_query_iova(pool, 0, begin, data->app_idx); 973bf215546Sopenharmony_ci 974bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); 975bf215546Sopenharmony_ci tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) | 976bf215546Sopenharmony_ci CP_REG_TO_MEM_0_64B); 977bf215546Sopenharmony_ci tu_cs_emit_qw(cs, begin_iova); 978bf215546Sopenharmony_ci } 979bf215546Sopenharmony_ci tu_cond_exec_end(cs); 980bf215546Sopenharmony_ci} 981bf215546Sopenharmony_ci 982bf215546Sopenharmony_cistatic void 983bf215546Sopenharmony_ciemit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf, 984bf215546Sopenharmony_ci struct tu_query_pool *pool, 985bf215546Sopenharmony_ci uint32_t query, 986bf215546Sopenharmony_ci uint32_t stream_id) 987bf215546Sopenharmony_ci{ 988bf215546Sopenharmony_ci struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs; 989bf215546Sopenharmony_ci uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0); 990bf215546Sopenharmony_ci 991bf215546Sopenharmony_ci tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = begin_iova)); 992bf215546Sopenharmony_ci tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS); 993bf215546Sopenharmony_ci} 994bf215546Sopenharmony_ci 995bf215546Sopenharmony_cistatic void 996bf215546Sopenharmony_ciemit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf, 997bf215546Sopenharmony_ci struct tu_query_pool *pool, 998bf215546Sopenharmony_ci uint32_t query) 999bf215546Sopenharmony_ci{ 1000bf215546Sopenharmony_ci struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs; 1001bf215546Sopenharmony_ci uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin); 1002bf215546Sopenharmony_ci 1003bf215546Sopenharmony_ci if (cmdbuf->state.pass) { 1004bf215546Sopenharmony_ci cmdbuf->state.rp.has_prim_generated_query_in_rp = true; 1005bf215546Sopenharmony_ci } else { 1006bf215546Sopenharmony_ci cmdbuf->state.prim_generated_query_running_before_rp = true; 1007bf215546Sopenharmony_ci } 1008bf215546Sopenharmony_ci 1009bf215546Sopenharmony_ci cmdbuf->state.prim_counters_running++; 1010bf215546Sopenharmony_ci 1011bf215546Sopenharmony_ci if (cmdbuf->state.pass) { 1012bf215546Sopenharmony_ci /* Primitives that passed all tests are still counted in in each 1013bf215546Sopenharmony_ci * tile even with HW binning beforehand. Do not permit it. 1014bf215546Sopenharmony_ci */ 1015bf215546Sopenharmony_ci tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | 1016bf215546Sopenharmony_ci CP_COND_REG_EXEC_0_SYSMEM | 1017bf215546Sopenharmony_ci CP_COND_REG_EXEC_0_BINNING); 1018bf215546Sopenharmony_ci } 1019bf215546Sopenharmony_ci 1020bf215546Sopenharmony_ci tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS); 1021bf215546Sopenharmony_ci 1022bf215546Sopenharmony_ci tu_cs_emit_wfi(cs); 1023bf215546Sopenharmony_ci 1024bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); 1025bf215546Sopenharmony_ci tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) | 1026bf215546Sopenharmony_ci CP_REG_TO_MEM_0_CNT(2) | 1027bf215546Sopenharmony_ci CP_REG_TO_MEM_0_64B); 1028bf215546Sopenharmony_ci tu_cs_emit_qw(cs, begin_iova); 1029bf215546Sopenharmony_ci 1030bf215546Sopenharmony_ci if (cmdbuf->state.pass) { 1031bf215546Sopenharmony_ci tu_cond_exec_end(cs); 1032bf215546Sopenharmony_ci } 1033bf215546Sopenharmony_ci} 1034bf215546Sopenharmony_ci 1035bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL 1036bf215546Sopenharmony_citu_CmdBeginQuery(VkCommandBuffer commandBuffer, 1037bf215546Sopenharmony_ci VkQueryPool queryPool, 1038bf215546Sopenharmony_ci uint32_t query, 1039bf215546Sopenharmony_ci VkQueryControlFlags flags) 1040bf215546Sopenharmony_ci{ 1041bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); 1042bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_query_pool, pool, queryPool); 1043bf215546Sopenharmony_ci assert(query < pool->size); 1044bf215546Sopenharmony_ci 1045bf215546Sopenharmony_ci switch (pool->type) { 1046bf215546Sopenharmony_ci case VK_QUERY_TYPE_OCCLUSION: 1047bf215546Sopenharmony_ci /* In freedreno, there is no implementation difference between 1048bf215546Sopenharmony_ci * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly 1049bf215546Sopenharmony_ci * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here. 1050bf215546Sopenharmony_ci */ 1051bf215546Sopenharmony_ci emit_begin_occlusion_query(cmdbuf, pool, query); 1052bf215546Sopenharmony_ci break; 1053bf215546Sopenharmony_ci case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 1054bf215546Sopenharmony_ci emit_begin_xfb_query(cmdbuf, pool, query, 0); 1055bf215546Sopenharmony_ci break; 1056bf215546Sopenharmony_ci case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: 1057bf215546Sopenharmony_ci emit_begin_prim_generated_query(cmdbuf, pool, query); 1058bf215546Sopenharmony_ci break; 1059bf215546Sopenharmony_ci case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: 1060bf215546Sopenharmony_ci emit_begin_perf_query(cmdbuf, pool, query); 1061bf215546Sopenharmony_ci break; 1062bf215546Sopenharmony_ci case VK_QUERY_TYPE_PIPELINE_STATISTICS: 1063bf215546Sopenharmony_ci emit_begin_stat_query(cmdbuf, pool, query); 1064bf215546Sopenharmony_ci break; 1065bf215546Sopenharmony_ci case VK_QUERY_TYPE_TIMESTAMP: 1066bf215546Sopenharmony_ci unreachable("Unimplemented query type"); 1067bf215546Sopenharmony_ci default: 1068bf215546Sopenharmony_ci assert(!"Invalid query type"); 1069bf215546Sopenharmony_ci } 1070bf215546Sopenharmony_ci} 1071bf215546Sopenharmony_ci 1072bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL 1073bf215546Sopenharmony_citu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, 1074bf215546Sopenharmony_ci VkQueryPool queryPool, 1075bf215546Sopenharmony_ci uint32_t query, 1076bf215546Sopenharmony_ci VkQueryControlFlags flags, 1077bf215546Sopenharmony_ci uint32_t index) 1078bf215546Sopenharmony_ci{ 1079bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); 1080bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_query_pool, pool, queryPool); 1081bf215546Sopenharmony_ci assert(query < pool->size); 1082bf215546Sopenharmony_ci 1083bf215546Sopenharmony_ci switch (pool->type) { 1084bf215546Sopenharmony_ci case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 1085bf215546Sopenharmony_ci emit_begin_xfb_query(cmdbuf, pool, query, index); 1086bf215546Sopenharmony_ci break; 1087bf215546Sopenharmony_ci case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: 1088bf215546Sopenharmony_ci emit_begin_prim_generated_query(cmdbuf, pool, query); 1089bf215546Sopenharmony_ci break; 1090bf215546Sopenharmony_ci default: 1091bf215546Sopenharmony_ci assert(!"Invalid query type"); 1092bf215546Sopenharmony_ci } 1093bf215546Sopenharmony_ci} 1094bf215546Sopenharmony_ci 1095bf215546Sopenharmony_cistatic void 1096bf215546Sopenharmony_ciemit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf, 1097bf215546Sopenharmony_ci struct tu_query_pool *pool, 1098bf215546Sopenharmony_ci uint32_t query) 1099bf215546Sopenharmony_ci{ 1100bf215546Sopenharmony_ci /* Ending an occlusion query happens in a few steps: 1101bf215546Sopenharmony_ci * 1) Set the slot->end to UINT64_MAX. 1102bf215546Sopenharmony_ci * 2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to 1103bf215546Sopenharmony_ci * write the current sample count value into slot->end. 1104bf215546Sopenharmony_ci * 3) Since (2) is asynchronous, wait until slot->end is not equal to 1105bf215546Sopenharmony_ci * UINT64_MAX before continuing via CP_WAIT_REG_MEM. 1106bf215546Sopenharmony_ci * 4) Accumulate the results of the query (slot->end - slot->begin) into 1107bf215546Sopenharmony_ci * slot->result. 1108bf215546Sopenharmony_ci * 5) If vkCmdEndQuery is *not* called from within the scope of a render 1109bf215546Sopenharmony_ci * pass, set the slot's available bit since the query is now done. 1110bf215546Sopenharmony_ci * 6) If vkCmdEndQuery *is* called from within the scope of a render 1111bf215546Sopenharmony_ci * pass, we cannot mark as available yet since the commands in 1112bf215546Sopenharmony_ci * draw_cs are not run until vkCmdEndRenderPass. 1113bf215546Sopenharmony_ci */ 1114bf215546Sopenharmony_ci const struct tu_render_pass *pass = cmdbuf->state.pass; 1115bf215546Sopenharmony_ci struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs; 1116bf215546Sopenharmony_ci 1117bf215546Sopenharmony_ci uint64_t available_iova = query_available_iova(pool, query); 1118bf215546Sopenharmony_ci uint64_t begin_iova = occlusion_query_iova(pool, query, begin); 1119bf215546Sopenharmony_ci uint64_t end_iova = occlusion_query_iova(pool, query, end); 1120bf215546Sopenharmony_ci uint64_t result_iova = query_result_iova(pool, query, uint64_t, 0); 1121bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); 1122bf215546Sopenharmony_ci tu_cs_emit_qw(cs, end_iova); 1123bf215546Sopenharmony_ci tu_cs_emit_qw(cs, 0xffffffffffffffffull); 1124bf215546Sopenharmony_ci 1125bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); 1126bf215546Sopenharmony_ci 1127bf215546Sopenharmony_ci tu_cs_emit_regs(cs, 1128bf215546Sopenharmony_ci A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true)); 1129bf215546Sopenharmony_ci 1130bf215546Sopenharmony_ci tu_cs_emit_regs(cs, 1131bf215546Sopenharmony_ci A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova)); 1132bf215546Sopenharmony_ci 1133bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); 1134bf215546Sopenharmony_ci tu_cs_emit(cs, ZPASS_DONE); 1135bf215546Sopenharmony_ci 1136bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6); 1137bf215546Sopenharmony_ci tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) | 1138bf215546Sopenharmony_ci CP_WAIT_REG_MEM_0_POLL_MEMORY); 1139bf215546Sopenharmony_ci tu_cs_emit_qw(cs, end_iova); 1140bf215546Sopenharmony_ci tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff)); 1141bf215546Sopenharmony_ci tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0)); 1142bf215546Sopenharmony_ci tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16)); 1143bf215546Sopenharmony_ci 1144bf215546Sopenharmony_ci /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */ 1145bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9); 1146bf215546Sopenharmony_ci tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C); 1147bf215546Sopenharmony_ci tu_cs_emit_qw(cs, result_iova); 1148bf215546Sopenharmony_ci tu_cs_emit_qw(cs, result_iova); 1149bf215546Sopenharmony_ci tu_cs_emit_qw(cs, end_iova); 1150bf215546Sopenharmony_ci tu_cs_emit_qw(cs, begin_iova); 1151bf215546Sopenharmony_ci 1152bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); 1153bf215546Sopenharmony_ci 1154bf215546Sopenharmony_ci if (pass) 1155bf215546Sopenharmony_ci /* Technically, queries should be tracked per-subpass, but here we track 1156bf215546Sopenharmony_ci * at the render pass level to simply the code a bit. This is safe 1157bf215546Sopenharmony_ci * because the only commands that use the available bit are 1158bf215546Sopenharmony_ci * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which 1159bf215546Sopenharmony_ci * cannot be invoked from inside a render pass scope. 1160bf215546Sopenharmony_ci */ 1161bf215546Sopenharmony_ci cs = &cmdbuf->draw_epilogue_cs; 1162bf215546Sopenharmony_ci 1163bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); 1164bf215546Sopenharmony_ci tu_cs_emit_qw(cs, available_iova); 1165bf215546Sopenharmony_ci tu_cs_emit_qw(cs, 0x1); 1166bf215546Sopenharmony_ci} 1167bf215546Sopenharmony_ci 1168bf215546Sopenharmony_ci/* PRIMITIVE_CTRS is used for two distinct queries: 1169bf215546Sopenharmony_ci * - VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT 1170bf215546Sopenharmony_ci * - VK_QUERY_TYPE_PIPELINE_STATISTICS 1171bf215546Sopenharmony_ci * If one is nested inside other - STOP_PRIMITIVE_CTRS should be emitted 1172bf215546Sopenharmony_ci * only for outer query. 1173bf215546Sopenharmony_ci * 1174bf215546Sopenharmony_ci * Also, pipeline stat query could run outside of renderpass and prim gen 1175bf215546Sopenharmony_ci * query inside of secondary cmd buffer - for such case we ought to track 1176bf215546Sopenharmony_ci * the status of pipeline stats query. 1177bf215546Sopenharmony_ci */ 1178bf215546Sopenharmony_cistatic void 1179bf215546Sopenharmony_ciemit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf, 1180bf215546Sopenharmony_ci struct tu_cs *cs, 1181bf215546Sopenharmony_ci enum VkQueryType query_type) 1182bf215546Sopenharmony_ci{ 1183bf215546Sopenharmony_ci bool is_secondary = cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY; 1184bf215546Sopenharmony_ci cmdbuf->state.prim_counters_running--; 1185bf215546Sopenharmony_ci if (cmdbuf->state.prim_counters_running == 0) { 1186bf215546Sopenharmony_ci bool need_cond_exec = 1187bf215546Sopenharmony_ci is_secondary && 1188bf215546Sopenharmony_ci query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT && 1189bf215546Sopenharmony_ci is_pipeline_query_with_vertex_stage(cmdbuf->inherited_pipeline_statistics); 1190bf215546Sopenharmony_ci 1191bf215546Sopenharmony_ci if (!need_cond_exec) { 1192bf215546Sopenharmony_ci tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS); 1193bf215546Sopenharmony_ci } else { 1194bf215546Sopenharmony_ci tu_cs_reserve(cs, 7 + 2); 1195bf215546Sopenharmony_ci /* Check that pipeline stats query is not running, only then 1196bf215546Sopenharmony_ci * we count stop the counter. 1197bf215546Sopenharmony_ci */ 1198bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6); 1199bf215546Sopenharmony_ci tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running)); 1200bf215546Sopenharmony_ci tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running)); 1201bf215546Sopenharmony_ci tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2)); 1202bf215546Sopenharmony_ci tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */ 1203bf215546Sopenharmony_ci 1204bf215546Sopenharmony_ci tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS); 1205bf215546Sopenharmony_ci } 1206bf215546Sopenharmony_ci } 1207bf215546Sopenharmony_ci 1208bf215546Sopenharmony_ci if (query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { 1209bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3); 1210bf215546Sopenharmony_ci tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running)); 1211bf215546Sopenharmony_ci tu_cs_emit(cs, 1); 1212bf215546Sopenharmony_ci } 1213bf215546Sopenharmony_ci} 1214bf215546Sopenharmony_ci 1215bf215546Sopenharmony_cistatic void 1216bf215546Sopenharmony_ciemit_end_stat_query(struct tu_cmd_buffer *cmdbuf, 1217bf215546Sopenharmony_ci struct tu_query_pool *pool, 1218bf215546Sopenharmony_ci uint32_t query) 1219bf215546Sopenharmony_ci{ 1220bf215546Sopenharmony_ci struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs; 1221bf215546Sopenharmony_ci uint64_t end_iova = pipeline_stat_query_iova(pool, query, end); 1222bf215546Sopenharmony_ci uint64_t available_iova = query_available_iova(pool, query); 1223bf215546Sopenharmony_ci uint64_t result_iova; 1224bf215546Sopenharmony_ci uint64_t stat_start_iova; 1225bf215546Sopenharmony_ci uint64_t stat_stop_iova; 1226bf215546Sopenharmony_ci 1227bf215546Sopenharmony_ci if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) { 1228bf215546Sopenharmony_ci /* No need to conditionally execute STOP_PRIMITIVE_CTRS when 1229bf215546Sopenharmony_ci * we are inside VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT inside of a 1230bf215546Sopenharmony_ci * renderpass, because it is already stopped. 1231bf215546Sopenharmony_ci */ 1232bf215546Sopenharmony_ci emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PIPELINE_STATISTICS); 1233bf215546Sopenharmony_ci } 1234bf215546Sopenharmony_ci 1235bf215546Sopenharmony_ci if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) { 1236bf215546Sopenharmony_ci tu6_emit_event_write(cmdbuf, cs, STOP_FRAGMENT_CTRS); 1237bf215546Sopenharmony_ci } 1238bf215546Sopenharmony_ci 1239bf215546Sopenharmony_ci if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) { 1240bf215546Sopenharmony_ci tu6_emit_event_write(cmdbuf, cs, STOP_COMPUTE_CTRS); 1241bf215546Sopenharmony_ci } 1242bf215546Sopenharmony_ci 1243bf215546Sopenharmony_ci tu_cs_emit_wfi(cs); 1244bf215546Sopenharmony_ci 1245bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); 1246bf215546Sopenharmony_ci tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) | 1247bf215546Sopenharmony_ci CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) | 1248bf215546Sopenharmony_ci CP_REG_TO_MEM_0_64B); 1249bf215546Sopenharmony_ci tu_cs_emit_qw(cs, end_iova); 1250bf215546Sopenharmony_ci 1251bf215546Sopenharmony_ci for (int i = 0; i < STAT_COUNT; i++) { 1252bf215546Sopenharmony_ci result_iova = query_result_iova(pool, query, uint64_t, i); 1253bf215546Sopenharmony_ci stat_start_iova = pipeline_stat_query_iova(pool, query, begin[i]); 1254bf215546Sopenharmony_ci stat_stop_iova = pipeline_stat_query_iova(pool, query, end[i]); 1255bf215546Sopenharmony_ci 1256bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9); 1257bf215546Sopenharmony_ci tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 1258bf215546Sopenharmony_ci CP_MEM_TO_MEM_0_DOUBLE | 1259bf215546Sopenharmony_ci CP_MEM_TO_MEM_0_NEG_C); 1260bf215546Sopenharmony_ci 1261bf215546Sopenharmony_ci tu_cs_emit_qw(cs, result_iova); 1262bf215546Sopenharmony_ci tu_cs_emit_qw(cs, result_iova); 1263bf215546Sopenharmony_ci tu_cs_emit_qw(cs, stat_stop_iova); 1264bf215546Sopenharmony_ci tu_cs_emit_qw(cs, stat_start_iova); 1265bf215546Sopenharmony_ci } 1266bf215546Sopenharmony_ci 1267bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); 1268bf215546Sopenharmony_ci 1269bf215546Sopenharmony_ci if (cmdbuf->state.pass) 1270bf215546Sopenharmony_ci cs = &cmdbuf->draw_epilogue_cs; 1271bf215546Sopenharmony_ci 1272bf215546Sopenharmony_ci /* Set the availability to 1 */ 1273bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); 1274bf215546Sopenharmony_ci tu_cs_emit_qw(cs, available_iova); 1275bf215546Sopenharmony_ci tu_cs_emit_qw(cs, 0x1); 1276bf215546Sopenharmony_ci} 1277bf215546Sopenharmony_ci 1278bf215546Sopenharmony_cistatic void 1279bf215546Sopenharmony_ciemit_end_perf_query(struct tu_cmd_buffer *cmdbuf, 1280bf215546Sopenharmony_ci struct tu_query_pool *pool, 1281bf215546Sopenharmony_ci uint32_t query) 1282bf215546Sopenharmony_ci{ 1283bf215546Sopenharmony_ci struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs; 1284bf215546Sopenharmony_ci uint64_t available_iova = query_available_iova(pool, query); 1285bf215546Sopenharmony_ci uint64_t end_iova; 1286bf215546Sopenharmony_ci uint64_t begin_iova; 1287bf215546Sopenharmony_ci uint64_t result_iova; 1288bf215546Sopenharmony_ci uint32_t last_pass = ~0; 1289bf215546Sopenharmony_ci 1290bf215546Sopenharmony_ci for (uint32_t i = 0; i < pool->counter_index_count; i++) { 1291bf215546Sopenharmony_ci struct tu_perf_query_data *data = &pool->perf_query_data[i]; 1292bf215546Sopenharmony_ci 1293bf215546Sopenharmony_ci if (last_pass != data->pass) { 1294bf215546Sopenharmony_ci last_pass = data->pass; 1295bf215546Sopenharmony_ci 1296bf215546Sopenharmony_ci if (data->pass != 0) 1297bf215546Sopenharmony_ci tu_cond_exec_end(cs); 1298bf215546Sopenharmony_ci emit_perfcntrs_pass_start(cs, data->pass); 1299bf215546Sopenharmony_ci } 1300bf215546Sopenharmony_ci 1301bf215546Sopenharmony_ci const struct fd_perfcntr_counter *counter = 1302bf215546Sopenharmony_ci &pool->perf_group[data->gid].counters[data->cntr_reg]; 1303bf215546Sopenharmony_ci 1304bf215546Sopenharmony_ci end_iova = perf_query_iova(pool, 0, end, data->app_idx); 1305bf215546Sopenharmony_ci 1306bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); 1307bf215546Sopenharmony_ci tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) | 1308bf215546Sopenharmony_ci CP_REG_TO_MEM_0_64B); 1309bf215546Sopenharmony_ci tu_cs_emit_qw(cs, end_iova); 1310bf215546Sopenharmony_ci } 1311bf215546Sopenharmony_ci tu_cond_exec_end(cs); 1312bf215546Sopenharmony_ci 1313bf215546Sopenharmony_ci last_pass = ~0; 1314bf215546Sopenharmony_ci tu_cs_emit_wfi(cs); 1315bf215546Sopenharmony_ci 1316bf215546Sopenharmony_ci for (uint32_t i = 0; i < pool->counter_index_count; i++) { 1317bf215546Sopenharmony_ci struct tu_perf_query_data *data = &pool->perf_query_data[i]; 1318bf215546Sopenharmony_ci 1319bf215546Sopenharmony_ci if (last_pass != data->pass) { 1320bf215546Sopenharmony_ci last_pass = data->pass; 1321bf215546Sopenharmony_ci 1322bf215546Sopenharmony_ci 1323bf215546Sopenharmony_ci if (data->pass != 0) 1324bf215546Sopenharmony_ci tu_cond_exec_end(cs); 1325bf215546Sopenharmony_ci emit_perfcntrs_pass_start(cs, data->pass); 1326bf215546Sopenharmony_ci } 1327bf215546Sopenharmony_ci 1328bf215546Sopenharmony_ci result_iova = query_result_iova(pool, 0, struct perfcntr_query_slot, 1329bf215546Sopenharmony_ci data->app_idx); 1330bf215546Sopenharmony_ci begin_iova = perf_query_iova(pool, 0, begin, data->app_idx); 1331bf215546Sopenharmony_ci end_iova = perf_query_iova(pool, 0, end, data->app_idx); 1332bf215546Sopenharmony_ci 1333bf215546Sopenharmony_ci /* result += end - begin */ 1334bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9); 1335bf215546Sopenharmony_ci tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 1336bf215546Sopenharmony_ci CP_MEM_TO_MEM_0_DOUBLE | 1337bf215546Sopenharmony_ci CP_MEM_TO_MEM_0_NEG_C); 1338bf215546Sopenharmony_ci 1339bf215546Sopenharmony_ci tu_cs_emit_qw(cs, result_iova); 1340bf215546Sopenharmony_ci tu_cs_emit_qw(cs, result_iova); 1341bf215546Sopenharmony_ci tu_cs_emit_qw(cs, end_iova); 1342bf215546Sopenharmony_ci tu_cs_emit_qw(cs, begin_iova); 1343bf215546Sopenharmony_ci } 1344bf215546Sopenharmony_ci tu_cond_exec_end(cs); 1345bf215546Sopenharmony_ci 1346bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); 1347bf215546Sopenharmony_ci 1348bf215546Sopenharmony_ci if (cmdbuf->state.pass) 1349bf215546Sopenharmony_ci cs = &cmdbuf->draw_epilogue_cs; 1350bf215546Sopenharmony_ci 1351bf215546Sopenharmony_ci /* Set the availability to 1 */ 1352bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); 1353bf215546Sopenharmony_ci tu_cs_emit_qw(cs, available_iova); 1354bf215546Sopenharmony_ci tu_cs_emit_qw(cs, 0x1); 1355bf215546Sopenharmony_ci} 1356bf215546Sopenharmony_ci 1357bf215546Sopenharmony_cistatic void 1358bf215546Sopenharmony_ciemit_end_xfb_query(struct tu_cmd_buffer *cmdbuf, 1359bf215546Sopenharmony_ci struct tu_query_pool *pool, 1360bf215546Sopenharmony_ci uint32_t query, 1361bf215546Sopenharmony_ci uint32_t stream_id) 1362bf215546Sopenharmony_ci{ 1363bf215546Sopenharmony_ci struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs; 1364bf215546Sopenharmony_ci 1365bf215546Sopenharmony_ci uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0); 1366bf215546Sopenharmony_ci uint64_t result_written_iova = query_result_iova(pool, query, uint64_t, 0); 1367bf215546Sopenharmony_ci uint64_t result_generated_iova = query_result_iova(pool, query, uint64_t, 1); 1368bf215546Sopenharmony_ci uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0); 1369bf215546Sopenharmony_ci uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1); 1370bf215546Sopenharmony_ci uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0); 1371bf215546Sopenharmony_ci uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1); 1372bf215546Sopenharmony_ci uint64_t available_iova = query_available_iova(pool, query); 1373bf215546Sopenharmony_ci 1374bf215546Sopenharmony_ci tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = end_iova)); 1375bf215546Sopenharmony_ci tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS); 1376bf215546Sopenharmony_ci 1377bf215546Sopenharmony_ci tu_cs_emit_wfi(cs); 1378bf215546Sopenharmony_ci tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS); 1379bf215546Sopenharmony_ci 1380bf215546Sopenharmony_ci /* Set the count of written primitives */ 1381bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9); 1382bf215546Sopenharmony_ci tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 1383bf215546Sopenharmony_ci CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000); 1384bf215546Sopenharmony_ci tu_cs_emit_qw(cs, result_written_iova); 1385bf215546Sopenharmony_ci tu_cs_emit_qw(cs, result_written_iova); 1386bf215546Sopenharmony_ci tu_cs_emit_qw(cs, end_written_iova); 1387bf215546Sopenharmony_ci tu_cs_emit_qw(cs, begin_written_iova); 1388bf215546Sopenharmony_ci 1389bf215546Sopenharmony_ci tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS); 1390bf215546Sopenharmony_ci 1391bf215546Sopenharmony_ci /* Set the count of generated primitives */ 1392bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9); 1393bf215546Sopenharmony_ci tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 1394bf215546Sopenharmony_ci CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000); 1395bf215546Sopenharmony_ci tu_cs_emit_qw(cs, result_generated_iova); 1396bf215546Sopenharmony_ci tu_cs_emit_qw(cs, result_generated_iova); 1397bf215546Sopenharmony_ci tu_cs_emit_qw(cs, end_generated_iova); 1398bf215546Sopenharmony_ci tu_cs_emit_qw(cs, begin_generated_iova); 1399bf215546Sopenharmony_ci 1400bf215546Sopenharmony_ci /* Set the availability to 1 */ 1401bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); 1402bf215546Sopenharmony_ci tu_cs_emit_qw(cs, available_iova); 1403bf215546Sopenharmony_ci tu_cs_emit_qw(cs, 0x1); 1404bf215546Sopenharmony_ci} 1405bf215546Sopenharmony_ci 1406bf215546Sopenharmony_cistatic void 1407bf215546Sopenharmony_ciemit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf, 1408bf215546Sopenharmony_ci struct tu_query_pool *pool, 1409bf215546Sopenharmony_ci uint32_t query) 1410bf215546Sopenharmony_ci{ 1411bf215546Sopenharmony_ci struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs; 1412bf215546Sopenharmony_ci 1413bf215546Sopenharmony_ci if (!cmdbuf->state.pass) { 1414bf215546Sopenharmony_ci cmdbuf->state.prim_generated_query_running_before_rp = false; 1415bf215546Sopenharmony_ci } 1416bf215546Sopenharmony_ci 1417bf215546Sopenharmony_ci uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin); 1418bf215546Sopenharmony_ci uint64_t end_iova = primitives_generated_query_iova(pool, query, end); 1419bf215546Sopenharmony_ci uint64_t result_iova = primitives_generated_query_iova(pool, query, result); 1420bf215546Sopenharmony_ci uint64_t available_iova = query_available_iova(pool, query); 1421bf215546Sopenharmony_ci 1422bf215546Sopenharmony_ci if (cmdbuf->state.pass) { 1423bf215546Sopenharmony_ci tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | 1424bf215546Sopenharmony_ci CP_COND_REG_EXEC_0_SYSMEM | 1425bf215546Sopenharmony_ci CP_COND_REG_EXEC_0_BINNING); 1426bf215546Sopenharmony_ci } 1427bf215546Sopenharmony_ci 1428bf215546Sopenharmony_ci tu_cs_emit_wfi(cs); 1429bf215546Sopenharmony_ci 1430bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); 1431bf215546Sopenharmony_ci tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) | 1432bf215546Sopenharmony_ci CP_REG_TO_MEM_0_CNT(2) | 1433bf215546Sopenharmony_ci CP_REG_TO_MEM_0_64B); 1434bf215546Sopenharmony_ci tu_cs_emit_qw(cs, end_iova); 1435bf215546Sopenharmony_ci 1436bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9); 1437bf215546Sopenharmony_ci tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 1438bf215546Sopenharmony_ci CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES); 1439bf215546Sopenharmony_ci tu_cs_emit_qw(cs, result_iova); 1440bf215546Sopenharmony_ci tu_cs_emit_qw(cs, result_iova); 1441bf215546Sopenharmony_ci tu_cs_emit_qw(cs, end_iova); 1442bf215546Sopenharmony_ci tu_cs_emit_qw(cs, begin_iova); 1443bf215546Sopenharmony_ci 1444bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); 1445bf215546Sopenharmony_ci 1446bf215546Sopenharmony_ci /* Should be after waiting for mem writes to have up to date info 1447bf215546Sopenharmony_ci * about which query is running. 1448bf215546Sopenharmony_ci */ 1449bf215546Sopenharmony_ci emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT); 1450bf215546Sopenharmony_ci 1451bf215546Sopenharmony_ci if (cmdbuf->state.pass) { 1452bf215546Sopenharmony_ci tu_cond_exec_end(cs); 1453bf215546Sopenharmony_ci } 1454bf215546Sopenharmony_ci 1455bf215546Sopenharmony_ci if (cmdbuf->state.pass) 1456bf215546Sopenharmony_ci cs = &cmdbuf->draw_epilogue_cs; 1457bf215546Sopenharmony_ci 1458bf215546Sopenharmony_ci /* Set the availability to 1 */ 1459bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); 1460bf215546Sopenharmony_ci tu_cs_emit_qw(cs, available_iova); 1461bf215546Sopenharmony_ci tu_cs_emit_qw(cs, 0x1); 1462bf215546Sopenharmony_ci} 1463bf215546Sopenharmony_ci 1464bf215546Sopenharmony_ci/* Implement this bit of spec text from section 17.2 "Query Operation": 1465bf215546Sopenharmony_ci * 1466bf215546Sopenharmony_ci * If queries are used while executing a render pass instance that has 1467bf215546Sopenharmony_ci * multiview enabled, the query uses N consecutive query indices in the 1468bf215546Sopenharmony_ci * query pool (starting at query) where N is the number of bits set in the 1469bf215546Sopenharmony_ci * view mask in the subpass the query is used in. How the numerical 1470bf215546Sopenharmony_ci * results of the query are distributed among the queries is 1471bf215546Sopenharmony_ci * implementation-dependent. For example, some implementations may write 1472bf215546Sopenharmony_ci * each view’s results to a distinct query, while other implementations 1473bf215546Sopenharmony_ci * may write the total result to the first query and write zero to the 1474bf215546Sopenharmony_ci * other queries. However, the sum of the results in all the queries must 1475bf215546Sopenharmony_ci * accurately reflect the total result of the query summed over all views. 1476bf215546Sopenharmony_ci * Applications can sum the results from all the queries to compute the 1477bf215546Sopenharmony_ci * total result. 1478bf215546Sopenharmony_ci * 1479bf215546Sopenharmony_ci * Since we execute all views at once, we write zero to the other queries. 1480bf215546Sopenharmony_ci * Furthermore, because queries must be reset before use, and we set the 1481bf215546Sopenharmony_ci * result to 0 in vkCmdResetQueryPool(), we just need to mark it as available. 1482bf215546Sopenharmony_ci */ 1483bf215546Sopenharmony_ci 1484bf215546Sopenharmony_cistatic void 1485bf215546Sopenharmony_cihandle_multiview_queries(struct tu_cmd_buffer *cmd, 1486bf215546Sopenharmony_ci struct tu_query_pool *pool, 1487bf215546Sopenharmony_ci uint32_t query) 1488bf215546Sopenharmony_ci{ 1489bf215546Sopenharmony_ci if (!cmd->state.pass || !cmd->state.subpass->multiview_mask) 1490bf215546Sopenharmony_ci return; 1491bf215546Sopenharmony_ci 1492bf215546Sopenharmony_ci unsigned views = util_bitcount(cmd->state.subpass->multiview_mask); 1493bf215546Sopenharmony_ci struct tu_cs *cs = &cmd->draw_epilogue_cs; 1494bf215546Sopenharmony_ci 1495bf215546Sopenharmony_ci for (uint32_t i = 1; i < views; i++) { 1496bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); 1497bf215546Sopenharmony_ci tu_cs_emit_qw(cs, query_available_iova(pool, query + i)); 1498bf215546Sopenharmony_ci tu_cs_emit_qw(cs, 0x1); 1499bf215546Sopenharmony_ci } 1500bf215546Sopenharmony_ci} 1501bf215546Sopenharmony_ci 1502bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL 1503bf215546Sopenharmony_citu_CmdEndQuery(VkCommandBuffer commandBuffer, 1504bf215546Sopenharmony_ci VkQueryPool queryPool, 1505bf215546Sopenharmony_ci uint32_t query) 1506bf215546Sopenharmony_ci{ 1507bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); 1508bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_query_pool, pool, queryPool); 1509bf215546Sopenharmony_ci assert(query < pool->size); 1510bf215546Sopenharmony_ci 1511bf215546Sopenharmony_ci switch (pool->type) { 1512bf215546Sopenharmony_ci case VK_QUERY_TYPE_OCCLUSION: 1513bf215546Sopenharmony_ci emit_end_occlusion_query(cmdbuf, pool, query); 1514bf215546Sopenharmony_ci break; 1515bf215546Sopenharmony_ci case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 1516bf215546Sopenharmony_ci emit_end_xfb_query(cmdbuf, pool, query, 0); 1517bf215546Sopenharmony_ci break; 1518bf215546Sopenharmony_ci case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: 1519bf215546Sopenharmony_ci emit_end_prim_generated_query(cmdbuf, pool, query); 1520bf215546Sopenharmony_ci break; 1521bf215546Sopenharmony_ci case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: 1522bf215546Sopenharmony_ci emit_end_perf_query(cmdbuf, pool, query); 1523bf215546Sopenharmony_ci break; 1524bf215546Sopenharmony_ci case VK_QUERY_TYPE_PIPELINE_STATISTICS: 1525bf215546Sopenharmony_ci emit_end_stat_query(cmdbuf, pool, query); 1526bf215546Sopenharmony_ci break; 1527bf215546Sopenharmony_ci case VK_QUERY_TYPE_TIMESTAMP: 1528bf215546Sopenharmony_ci unreachable("Unimplemented query type"); 1529bf215546Sopenharmony_ci default: 1530bf215546Sopenharmony_ci assert(!"Invalid query type"); 1531bf215546Sopenharmony_ci } 1532bf215546Sopenharmony_ci 1533bf215546Sopenharmony_ci handle_multiview_queries(cmdbuf, pool, query); 1534bf215546Sopenharmony_ci} 1535bf215546Sopenharmony_ci 1536bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL 1537bf215546Sopenharmony_citu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, 1538bf215546Sopenharmony_ci VkQueryPool queryPool, 1539bf215546Sopenharmony_ci uint32_t query, 1540bf215546Sopenharmony_ci uint32_t index) 1541bf215546Sopenharmony_ci{ 1542bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); 1543bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_query_pool, pool, queryPool); 1544bf215546Sopenharmony_ci assert(query < pool->size); 1545bf215546Sopenharmony_ci 1546bf215546Sopenharmony_ci switch (pool->type) { 1547bf215546Sopenharmony_ci case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 1548bf215546Sopenharmony_ci assert(index <= 4); 1549bf215546Sopenharmony_ci emit_end_xfb_query(cmdbuf, pool, query, index); 1550bf215546Sopenharmony_ci break; 1551bf215546Sopenharmony_ci case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: 1552bf215546Sopenharmony_ci emit_end_prim_generated_query(cmdbuf, pool, query); 1553bf215546Sopenharmony_ci break; 1554bf215546Sopenharmony_ci default: 1555bf215546Sopenharmony_ci assert(!"Invalid query type"); 1556bf215546Sopenharmony_ci } 1557bf215546Sopenharmony_ci} 1558bf215546Sopenharmony_ci 1559bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL 1560bf215546Sopenharmony_citu_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, 1561bf215546Sopenharmony_ci VkPipelineStageFlagBits2 pipelineStage, 1562bf215546Sopenharmony_ci VkQueryPool queryPool, 1563bf215546Sopenharmony_ci uint32_t query) 1564bf215546Sopenharmony_ci{ 1565bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 1566bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_query_pool, pool, queryPool); 1567bf215546Sopenharmony_ci 1568bf215546Sopenharmony_ci /* Inside a render pass, just write the timestamp multiple times so that 1569bf215546Sopenharmony_ci * the user gets the last one if we use GMEM. There isn't really much 1570bf215546Sopenharmony_ci * better we can do, and this seems to be what the blob does too. 1571bf215546Sopenharmony_ci */ 1572bf215546Sopenharmony_ci struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs; 1573bf215546Sopenharmony_ci 1574bf215546Sopenharmony_ci /* Stages that will already have been executed by the time the CP executes 1575bf215546Sopenharmony_ci * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw 1576bf215546Sopenharmony_ci * indirect stage counts as top-of-pipe too. 1577bf215546Sopenharmony_ci */ 1578bf215546Sopenharmony_ci VkPipelineStageFlags2 top_of_pipe_flags = 1579bf215546Sopenharmony_ci VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | 1580bf215546Sopenharmony_ci VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT; 1581bf215546Sopenharmony_ci 1582bf215546Sopenharmony_ci if (pipelineStage & ~top_of_pipe_flags) { 1583bf215546Sopenharmony_ci /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM 1584bf215546Sopenharmony_ci * does CP_WAIT_FOR_ME internally, which will wait for the WFI to 1585bf215546Sopenharmony_ci * complete. 1586bf215546Sopenharmony_ci * 1587bf215546Sopenharmony_ci * Stalling the CP like this is really unfortunate, but I don't think 1588bf215546Sopenharmony_ci * there's a better solution that allows all 48 bits of precision 1589bf215546Sopenharmony_ci * because CP_EVENT_WRITE doesn't support 64-bit timestamps. 1590bf215546Sopenharmony_ci */ 1591bf215546Sopenharmony_ci tu_cs_emit_wfi(cs); 1592bf215546Sopenharmony_ci } 1593bf215546Sopenharmony_ci 1594bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); 1595bf215546Sopenharmony_ci tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER) | 1596bf215546Sopenharmony_ci CP_REG_TO_MEM_0_CNT(2) | 1597bf215546Sopenharmony_ci CP_REG_TO_MEM_0_64B); 1598bf215546Sopenharmony_ci tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0)); 1599bf215546Sopenharmony_ci 1600bf215546Sopenharmony_ci /* Only flag availability once the entire renderpass is done, similar to 1601bf215546Sopenharmony_ci * the begin/end path. 1602bf215546Sopenharmony_ci */ 1603bf215546Sopenharmony_ci cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs; 1604bf215546Sopenharmony_ci 1605bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); 1606bf215546Sopenharmony_ci tu_cs_emit_qw(cs, query_available_iova(pool, query)); 1607bf215546Sopenharmony_ci tu_cs_emit_qw(cs, 0x1); 1608bf215546Sopenharmony_ci 1609bf215546Sopenharmony_ci /* From the spec for vkCmdWriteTimestamp: 1610bf215546Sopenharmony_ci * 1611bf215546Sopenharmony_ci * If vkCmdWriteTimestamp is called while executing a render pass 1612bf215546Sopenharmony_ci * instance that has multiview enabled, the timestamp uses N consecutive 1613bf215546Sopenharmony_ci * query indices in the query pool (starting at query) where N is the 1614bf215546Sopenharmony_ci * number of bits set in the view mask of the subpass the command is 1615bf215546Sopenharmony_ci * executed in. The resulting query values are determined by an 1616bf215546Sopenharmony_ci * implementation-dependent choice of one of the following behaviors: 1617bf215546Sopenharmony_ci * 1618bf215546Sopenharmony_ci * - The first query is a timestamp value and (if more than one bit is 1619bf215546Sopenharmony_ci * set in the view mask) zero is written to the remaining queries. 1620bf215546Sopenharmony_ci * If two timestamps are written in the same subpass, the sum of the 1621bf215546Sopenharmony_ci * execution time of all views between those commands is the 1622bf215546Sopenharmony_ci * difference between the first query written by each command. 1623bf215546Sopenharmony_ci * 1624bf215546Sopenharmony_ci * - All N queries are timestamp values. If two timestamps are written 1625bf215546Sopenharmony_ci * in the same subpass, the sum of the execution time of all views 1626bf215546Sopenharmony_ci * between those commands is the sum of the difference between 1627bf215546Sopenharmony_ci * corresponding queries written by each command. The difference 1628bf215546Sopenharmony_ci * between corresponding queries may be the execution time of a 1629bf215546Sopenharmony_ci * single view. 1630bf215546Sopenharmony_ci * 1631bf215546Sopenharmony_ci * We execute all views in the same draw call, so we implement the first 1632bf215546Sopenharmony_ci * option, the same as regular queries. 1633bf215546Sopenharmony_ci */ 1634bf215546Sopenharmony_ci handle_multiview_queries(cmd, pool, query); 1635bf215546Sopenharmony_ci} 1636bf215546Sopenharmony_ci 1637bf215546Sopenharmony_ciVKAPI_ATTR VkResult VKAPI_CALL 1638bf215546Sopenharmony_citu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( 1639bf215546Sopenharmony_ci VkPhysicalDevice physicalDevice, 1640bf215546Sopenharmony_ci uint32_t queueFamilyIndex, 1641bf215546Sopenharmony_ci uint32_t* pCounterCount, 1642bf215546Sopenharmony_ci VkPerformanceCounterKHR* pCounters, 1643bf215546Sopenharmony_ci VkPerformanceCounterDescriptionKHR* pCounterDescriptions) 1644bf215546Sopenharmony_ci{ 1645bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice); 1646bf215546Sopenharmony_ci 1647bf215546Sopenharmony_ci uint32_t desc_count = *pCounterCount; 1648bf215546Sopenharmony_ci uint32_t group_count; 1649bf215546Sopenharmony_ci const struct fd_perfcntr_group *group = 1650bf215546Sopenharmony_ci fd_perfcntrs(&phydev->dev_id, &group_count); 1651bf215546Sopenharmony_ci 1652bf215546Sopenharmony_ci VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, out, pCounters, pCounterCount); 1653bf215546Sopenharmony_ci VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, out_desc, 1654bf215546Sopenharmony_ci pCounterDescriptions, &desc_count); 1655bf215546Sopenharmony_ci 1656bf215546Sopenharmony_ci for (int i = 0; i < group_count; i++) { 1657bf215546Sopenharmony_ci for (int j = 0; j < group[i].num_countables; j++) { 1658bf215546Sopenharmony_ci 1659bf215546Sopenharmony_ci vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) { 1660bf215546Sopenharmony_ci counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_BUFFER_KHR; 1661bf215546Sopenharmony_ci counter->unit = 1662bf215546Sopenharmony_ci fd_perfcntr_type_to_vk_unit[group[i].countables[j].query_type]; 1663bf215546Sopenharmony_ci counter->storage = 1664bf215546Sopenharmony_ci fd_perfcntr_type_to_vk_storage[group[i].countables[j].query_type]; 1665bf215546Sopenharmony_ci 1666bf215546Sopenharmony_ci unsigned char sha1_result[20]; 1667bf215546Sopenharmony_ci _mesa_sha1_compute(group[i].countables[j].name, 1668bf215546Sopenharmony_ci strlen(group[i].countables[j].name), 1669bf215546Sopenharmony_ci sha1_result); 1670bf215546Sopenharmony_ci memcpy(counter->uuid, sha1_result, sizeof(counter->uuid)); 1671bf215546Sopenharmony_ci } 1672bf215546Sopenharmony_ci 1673bf215546Sopenharmony_ci vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) { 1674bf215546Sopenharmony_ci desc->flags = 0; 1675bf215546Sopenharmony_ci 1676bf215546Sopenharmony_ci snprintf(desc->name, sizeof(desc->name), 1677bf215546Sopenharmony_ci "%s", group[i].countables[j].name); 1678bf215546Sopenharmony_ci snprintf(desc->category, sizeof(desc->category), "%s", group[i].name); 1679bf215546Sopenharmony_ci snprintf(desc->description, sizeof(desc->description), 1680bf215546Sopenharmony_ci "%s: %s performance counter", 1681bf215546Sopenharmony_ci group[i].name, group[i].countables[j].name); 1682bf215546Sopenharmony_ci } 1683bf215546Sopenharmony_ci } 1684bf215546Sopenharmony_ci } 1685bf215546Sopenharmony_ci 1686bf215546Sopenharmony_ci return vk_outarray_status(&out); 1687bf215546Sopenharmony_ci} 1688bf215546Sopenharmony_ci 1689bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL 1690bf215546Sopenharmony_citu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR( 1691bf215546Sopenharmony_ci VkPhysicalDevice physicalDevice, 1692bf215546Sopenharmony_ci const VkQueryPoolPerformanceCreateInfoKHR* pPerformanceQueryCreateInfo, 1693bf215546Sopenharmony_ci uint32_t* pNumPasses) 1694bf215546Sopenharmony_ci{ 1695bf215546Sopenharmony_ci TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice); 1696bf215546Sopenharmony_ci uint32_t group_count = 0; 1697bf215546Sopenharmony_ci uint32_t gid = 0, cid = 0, n_passes; 1698bf215546Sopenharmony_ci const struct fd_perfcntr_group *group = 1699bf215546Sopenharmony_ci fd_perfcntrs(&phydev->dev_id, &group_count); 1700bf215546Sopenharmony_ci 1701bf215546Sopenharmony_ci uint32_t counters_requested[group_count]; 1702bf215546Sopenharmony_ci memset(counters_requested, 0x0, sizeof(counters_requested)); 1703bf215546Sopenharmony_ci *pNumPasses = 1; 1704bf215546Sopenharmony_ci 1705bf215546Sopenharmony_ci for (unsigned i = 0; i < pPerformanceQueryCreateInfo->counterIndexCount; i++) { 1706bf215546Sopenharmony_ci perfcntr_index(group, group_count, 1707bf215546Sopenharmony_ci pPerformanceQueryCreateInfo->pCounterIndices[i], 1708bf215546Sopenharmony_ci &gid, &cid); 1709bf215546Sopenharmony_ci 1710bf215546Sopenharmony_ci counters_requested[gid]++; 1711bf215546Sopenharmony_ci } 1712bf215546Sopenharmony_ci 1713bf215546Sopenharmony_ci for (uint32_t i = 0; i < group_count; i++) { 1714bf215546Sopenharmony_ci n_passes = DIV_ROUND_UP(counters_requested[i], group[i].num_counters); 1715bf215546Sopenharmony_ci *pNumPasses = MAX2(*pNumPasses, n_passes); 1716bf215546Sopenharmony_ci } 1717bf215546Sopenharmony_ci} 1718bf215546Sopenharmony_ci 1719bf215546Sopenharmony_ciVKAPI_ATTR VkResult VKAPI_CALL 1720bf215546Sopenharmony_citu_AcquireProfilingLockKHR(VkDevice device, 1721bf215546Sopenharmony_ci const VkAcquireProfilingLockInfoKHR* pInfo) 1722bf215546Sopenharmony_ci{ 1723bf215546Sopenharmony_ci /* TODO. Probably there's something to do for kgsl. */ 1724bf215546Sopenharmony_ci return VK_SUCCESS; 1725bf215546Sopenharmony_ci} 1726bf215546Sopenharmony_ci 1727bf215546Sopenharmony_ciVKAPI_ATTR void VKAPI_CALL 1728bf215546Sopenharmony_citu_ReleaseProfilingLockKHR(VkDevice device) 1729bf215546Sopenharmony_ci{ 1730bf215546Sopenharmony_ci /* TODO. Probably there's something to do for kgsl. */ 1731bf215546Sopenharmony_ci return; 1732bf215546Sopenharmony_ci} 1733