1/* 2 * Copyright © 2015 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include <assert.h> 25#include <stdbool.h> 26#include <string.h> 27#include <unistd.h> 28#include <fcntl.h> 29 30#include "anv_private.h" 31 32#include "util/os_time.h" 33 34#include "genxml/gen_macros.h" 35#include "genxml/genX_pack.h" 36 37/* We reserve : 38 * - GPR 14 for perf queries 39 * - GPR 15 for conditional rendering 40 */ 41#define MI_BUILDER_NUM_ALLOC_GPRS 14 42#define MI_BUILDER_CAN_WRITE_BATCH GFX_VER >= 8 43#define __gen_get_batch_dwords anv_batch_emit_dwords 44#define __gen_address_offset anv_address_add 45#define __gen_get_batch_address(b, a) anv_batch_address(b, a) 46#include "common/mi_builder.h" 47#include "perf/intel_perf.h" 48#include "perf/intel_perf_mdapi.h" 49#include "perf/intel_perf_regs.h" 50 51#include "vk_util.h" 52 53static struct anv_address 54anv_query_address(struct anv_query_pool *pool, uint32_t query) 55{ 56 return (struct anv_address) { 57 .bo = pool->bo, 58 .offset = query * pool->stride, 59 }; 60} 61 62VkResult genX(CreateQueryPool)( 63 VkDevice _device, 64 const VkQueryPoolCreateInfo* pCreateInfo, 65 const VkAllocationCallbacks* pAllocator, 66 VkQueryPool* pQueryPool) 67{ 68 ANV_FROM_HANDLE(anv_device, device, _device); 69 const struct anv_physical_device *pdevice = device->physical; 70#if GFX_VER >= 8 71 const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL; 72 struct intel_perf_counter_pass *counter_pass; 73 struct intel_perf_query_info **pass_query; 74 uint32_t n_passes = 0; 75#endif 76 uint32_t data_offset = 0; 77 VK_MULTIALLOC(ma); 78 VkResult result; 79 80 assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO); 81 82 /* Query pool slots are made up of some number of 64-bit values packed 83 * tightly together. For most query types have the first 64-bit value is 84 * the "available" bit which is 0 when the query is unavailable and 1 when 85 * it is available. The 64-bit values that follow are determined by the 86 * type of query. 87 * 88 * For performance queries, we have a requirement to align OA reports at 89 * 64bytes so we put those first and have the "available" bit behind 90 * together with some other counters. 91 */ 92 uint32_t uint64s_per_slot = 0; 93 94 VK_MULTIALLOC_DECL(&ma, struct anv_query_pool, pool, 1); 95 96 VkQueryPipelineStatisticFlags pipeline_statistics = 0; 97 switch (pCreateInfo->queryType) { 98 case VK_QUERY_TYPE_OCCLUSION: 99 /* Occlusion queries have two values: begin and end. */ 100 uint64s_per_slot = 1 + 2; 101 break; 102 case VK_QUERY_TYPE_TIMESTAMP: 103 /* Timestamps just have the one timestamp value */ 104 uint64s_per_slot = 1 + 1; 105 break; 106 case VK_QUERY_TYPE_PIPELINE_STATISTICS: 107 pipeline_statistics = pCreateInfo->pipelineStatistics; 108 /* We're going to trust this field implicitly so we need to ensure that 109 * no unhandled extension bits leak in. 110 */ 111 pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK; 112 113 /* Statistics queries have a min and max for every statistic */ 114 uint64s_per_slot = 1 + 2 * util_bitcount(pipeline_statistics); 115 break; 116 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 117 /* Transform feedback queries are 4 values, begin/end for 118 * written/available. 119 */ 120 uint64s_per_slot = 1 + 4; 121 break; 122 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { 123 const struct intel_perf_query_field_layout *layout = 124 &pdevice->perf->query_layout; 125 126 uint64s_per_slot = 2; /* availability + marker */ 127 /* Align to the requirement of the layout */ 128 uint64s_per_slot = align_u32(uint64s_per_slot, 129 DIV_ROUND_UP(layout->alignment, sizeof(uint64_t))); 130 data_offset = uint64s_per_slot * sizeof(uint64_t); 131 /* Add the query data for begin & end commands */ 132 uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t)); 133 break; 134 } 135#if GFX_VER >= 8 136 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { 137 const struct intel_perf_query_field_layout *layout = 138 &pdevice->perf->query_layout; 139 140 perf_query_info = vk_find_struct_const(pCreateInfo->pNext, 141 QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR); 142 n_passes = intel_perf_get_n_passes(pdevice->perf, 143 perf_query_info->pCounterIndices, 144 perf_query_info->counterIndexCount, 145 NULL); 146 vk_multialloc_add(&ma, &counter_pass, struct intel_perf_counter_pass, 147 perf_query_info->counterIndexCount); 148 vk_multialloc_add(&ma, &pass_query, struct intel_perf_query_info *, 149 n_passes); 150 uint64s_per_slot = 4 /* availability + small batch */; 151 /* Align to the requirement of the layout */ 152 uint64s_per_slot = align_u32(uint64s_per_slot, 153 DIV_ROUND_UP(layout->alignment, sizeof(uint64_t))); 154 data_offset = uint64s_per_slot * sizeof(uint64_t); 155 /* Add the query data for begin & end commands */ 156 uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t)); 157 /* Multiply by the number of passes */ 158 uint64s_per_slot *= n_passes; 159 break; 160 } 161#endif 162 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: 163 /* Query has two values: begin and end. */ 164 uint64s_per_slot = 1 + 2; 165 break; 166 default: 167 assert(!"Invalid query type"); 168 } 169 170 if (!vk_object_multialloc(&device->vk, &ma, pAllocator, 171 VK_OBJECT_TYPE_QUERY_POOL)) 172 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); 173 174 pool->type = pCreateInfo->queryType; 175 pool->pipeline_statistics = pipeline_statistics; 176 pool->stride = uint64s_per_slot * sizeof(uint64_t); 177 pool->slots = pCreateInfo->queryCount; 178 179 if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) { 180 pool->data_offset = data_offset; 181 pool->snapshot_size = (pool->stride - data_offset) / 2; 182 } 183#if GFX_VER >= 8 184 else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { 185 pool->pass_size = pool->stride / n_passes; 186 pool->data_offset = data_offset; 187 pool->snapshot_size = (pool->pass_size - data_offset) / 2; 188 pool->n_counters = perf_query_info->counterIndexCount; 189 pool->counter_pass = counter_pass; 190 intel_perf_get_counters_passes(pdevice->perf, 191 perf_query_info->pCounterIndices, 192 perf_query_info->counterIndexCount, 193 pool->counter_pass); 194 pool->n_passes = n_passes; 195 pool->pass_query = pass_query; 196 intel_perf_get_n_passes(pdevice->perf, 197 perf_query_info->pCounterIndices, 198 perf_query_info->counterIndexCount, 199 pool->pass_query); 200 } 201#endif 202 203 uint64_t size = pool->slots * (uint64_t)pool->stride; 204 result = anv_device_alloc_bo(device, "query-pool", size, 205 ANV_BO_ALLOC_MAPPED | 206 ANV_BO_ALLOC_SNOOPED, 207 0 /* explicit_address */, 208 &pool->bo); 209 if (result != VK_SUCCESS) 210 goto fail; 211 212#if GFX_VER >= 8 213 if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { 214 for (uint32_t p = 0; p < pool->n_passes; p++) { 215 struct mi_builder b; 216 struct anv_batch batch = { 217 .start = pool->bo->map + khr_perf_query_preamble_offset(pool, p), 218 .end = pool->bo->map + khr_perf_query_preamble_offset(pool, p) + pool->data_offset, 219 }; 220 batch.next = batch.start; 221 222 mi_builder_init(&b, &device->info, &batch); 223 mi_store(&b, mi_reg64(ANV_PERF_QUERY_OFFSET_REG), 224 mi_imm(p * (uint64_t)pool->pass_size)); 225 anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe); 226 } 227 } 228#endif 229 230 *pQueryPool = anv_query_pool_to_handle(pool); 231 232 return VK_SUCCESS; 233 234 fail: 235 vk_free2(&device->vk.alloc, pAllocator, pool); 236 237 return result; 238} 239 240void genX(DestroyQueryPool)( 241 VkDevice _device, 242 VkQueryPool _pool, 243 const VkAllocationCallbacks* pAllocator) 244{ 245 ANV_FROM_HANDLE(anv_device, device, _device); 246 ANV_FROM_HANDLE(anv_query_pool, pool, _pool); 247 248 if (!pool) 249 return; 250 251 anv_device_release_bo(device, pool->bo); 252 vk_object_free(&device->vk, pAllocator, pool); 253} 254 255#if GFX_VER >= 8 256/** 257 * VK_KHR_performance_query layout : 258 * 259 * -------------------------------------------- 260 * | availability (8b) | | | 261 * |-------------------------------| | | 262 * | Small batch loading | | | 263 * | ANV_PERF_QUERY_OFFSET_REG | | | 264 * | (24b) | | Pass 0 | 265 * |-------------------------------| | | 266 * | some padding (see | | | 267 * | query_field_layout:alignment) | | | 268 * |-------------------------------| | | 269 * | query data | | | 270 * | (2 * query_field_layout:size) | | | 271 * |-------------------------------|-- | Query 0 272 * | availability (8b) | | | 273 * |-------------------------------| | | 274 * | Small batch loading | | | 275 * | ANV_PERF_QUERY_OFFSET_REG | | | 276 * | (24b) | | Pass 1 | 277 * |-------------------------------| | | 278 * | some padding (see | | | 279 * | query_field_layout:alignment) | | | 280 * |-------------------------------| | | 281 * | query data | | | 282 * | (2 * query_field_layout:size) | | | 283 * |-------------------------------|----------- 284 * | availability (8b) | | | 285 * |-------------------------------| | | 286 * | Small batch loading | | | 287 * | ANV_PERF_QUERY_OFFSET_REG | | | 288 * | (24b) | | Pass 0 | 289 * |-------------------------------| | | 290 * | some padding (see | | | 291 * | query_field_layout:alignment) | | | 292 * |-------------------------------| | | 293 * | query data | | | 294 * | (2 * query_field_layout:size) | | | 295 * |-------------------------------|-- | Query 1 296 * | ... | | | 297 * -------------------------------------------- 298 */ 299 300static uint64_t 301khr_perf_query_availability_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass) 302{ 303 return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size; 304} 305 306static uint64_t 307khr_perf_query_data_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end) 308{ 309 return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size + 310 pool->data_offset + (end ? pool->snapshot_size : 0); 311} 312 313static struct anv_address 314khr_perf_query_availability_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass) 315{ 316 return anv_address_add( 317 (struct anv_address) { .bo = pool->bo, }, 318 khr_perf_query_availability_offset(pool, query, pass)); 319} 320 321static struct anv_address 322khr_perf_query_data_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end) 323{ 324 return anv_address_add( 325 (struct anv_address) { .bo = pool->bo, }, 326 khr_perf_query_data_offset(pool, query, pass, end)); 327} 328 329static bool 330khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer) 331{ 332 if (anv_batch_has_error(&cmd_buffer->batch)) 333 return false; 334 335 if (cmd_buffer->self_mod_locations) 336 return true; 337 338 struct anv_device *device = cmd_buffer->device; 339 const struct anv_physical_device *pdevice = device->physical; 340 341 cmd_buffer->self_mod_locations = 342 vk_alloc(&cmd_buffer->vk.pool->alloc, 343 pdevice->n_perf_query_commands * sizeof(*cmd_buffer->self_mod_locations), 8, 344 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 345 346 if (!cmd_buffer->self_mod_locations) { 347 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY); 348 return false; 349 } 350 351 return true; 352} 353#endif 354 355/** 356 * VK_INTEL_performance_query layout : 357 * 358 * --------------------------------- 359 * | availability (8b) | 360 * |-------------------------------| 361 * | marker (8b) | 362 * |-------------------------------| 363 * | some padding (see | 364 * | query_field_layout:alignment) | 365 * |-------------------------------| 366 * | query data | 367 * | (2 * query_field_layout:size) | 368 * --------------------------------- 369 */ 370 371static uint32_t 372intel_perf_marker_offset(void) 373{ 374 return 8; 375} 376 377static uint32_t 378intel_perf_query_data_offset(struct anv_query_pool *pool, bool end) 379{ 380 return pool->data_offset + (end ? pool->snapshot_size : 0); 381} 382 383static void 384cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags, 385 uint32_t value_index, uint64_t result) 386{ 387 if (flags & VK_QUERY_RESULT_64_BIT) { 388 uint64_t *dst64 = dst_slot; 389 dst64[value_index] = result; 390 } else { 391 uint32_t *dst32 = dst_slot; 392 dst32[value_index] = result; 393 } 394} 395 396static void * 397query_slot(struct anv_query_pool *pool, uint32_t query) 398{ 399 return pool->bo->map + query * pool->stride; 400} 401 402static bool 403query_is_available(struct anv_query_pool *pool, uint32_t query) 404{ 405#if GFX_VER >= 8 406 if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { 407 for (uint32_t p = 0; p < pool->n_passes; p++) { 408 volatile uint64_t *slot = 409 pool->bo->map + khr_perf_query_availability_offset(pool, query, p); 410 if (!slot[0]) 411 return false; 412 } 413 return true; 414 } 415#endif 416 417 return *(volatile uint64_t *)query_slot(pool, query); 418} 419 420static VkResult 421wait_for_available(struct anv_device *device, 422 struct anv_query_pool *pool, uint32_t query) 423{ 424 uint64_t abs_timeout_ns = os_time_get_absolute_timeout(2 * NSEC_PER_SEC); 425 426 while (os_time_get_nano() < abs_timeout_ns) { 427 if (query_is_available(pool, query)) 428 return VK_SUCCESS; 429 VkResult status = vk_device_check_status(&device->vk); 430 if (status != VK_SUCCESS) 431 return status; 432 } 433 434 return vk_device_set_lost(&device->vk, "query timeout"); 435} 436 437VkResult genX(GetQueryPoolResults)( 438 VkDevice _device, 439 VkQueryPool queryPool, 440 uint32_t firstQuery, 441 uint32_t queryCount, 442 size_t dataSize, 443 void* pData, 444 VkDeviceSize stride, 445 VkQueryResultFlags flags) 446{ 447 ANV_FROM_HANDLE(anv_device, device, _device); 448 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 449 450 assert(pool->type == VK_QUERY_TYPE_OCCLUSION || 451 pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS || 452 pool->type == VK_QUERY_TYPE_TIMESTAMP || 453 pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT || 454 pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR || 455 pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL || 456 pool->type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT); 457 458 if (vk_device_is_lost(&device->vk)) 459 return VK_ERROR_DEVICE_LOST; 460 461 if (pData == NULL) 462 return VK_SUCCESS; 463 464 void *data_end = pData + dataSize; 465 466 VkResult status = VK_SUCCESS; 467 for (uint32_t i = 0; i < queryCount; i++) { 468 bool available = query_is_available(pool, firstQuery + i); 469 470 if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) { 471 status = wait_for_available(device, pool, firstQuery + i); 472 if (status != VK_SUCCESS) { 473 return status; 474 } 475 476 available = true; 477 } 478 479 /* From the Vulkan 1.0.42 spec: 480 * 481 * "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are 482 * both not set then no result values are written to pData for 483 * queries that are in the unavailable state at the time of the call, 484 * and vkGetQueryPoolResults returns VK_NOT_READY. However, 485 * availability state is still written to pData for those queries if 486 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set." 487 * 488 * From VK_KHR_performance_query : 489 * 490 * "VK_QUERY_RESULT_PERFORMANCE_QUERY_RECORDED_COUNTERS_BIT_KHR specifies 491 * that the result should contain the number of counters that were recorded 492 * into a query pool of type ename:VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR" 493 */ 494 bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT); 495 496 uint32_t idx = 0; 497 switch (pool->type) { 498 case VK_QUERY_TYPE_OCCLUSION: 499 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { 500 uint64_t *slot = query_slot(pool, firstQuery + i); 501 if (write_results) { 502 /* From the Vulkan 1.2.132 spec: 503 * 504 * "If VK_QUERY_RESULT_PARTIAL_BIT is set, 505 * VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status 506 * is unavailable, an intermediate result value between zero and 507 * the final result value is written to pData for that query." 508 */ 509 uint64_t result = available ? slot[2] - slot[1] : 0; 510 cpu_write_query_result(pData, flags, idx, result); 511 } 512 idx++; 513 break; 514 } 515 516 case VK_QUERY_TYPE_PIPELINE_STATISTICS: { 517 uint64_t *slot = query_slot(pool, firstQuery + i); 518 uint32_t statistics = pool->pipeline_statistics; 519 while (statistics) { 520 uint32_t stat = u_bit_scan(&statistics); 521 if (write_results) { 522 uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1]; 523 524 /* WaDividePSInvocationCountBy4:HSW,BDW */ 525 if ((device->info.ver == 8 || device->info.verx10 == 75) && 526 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) 527 result >>= 2; 528 529 cpu_write_query_result(pData, flags, idx, result); 530 } 531 idx++; 532 } 533 assert(idx == util_bitcount(pool->pipeline_statistics)); 534 break; 535 } 536 537 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { 538 uint64_t *slot = query_slot(pool, firstQuery + i); 539 if (write_results) 540 cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]); 541 idx++; 542 if (write_results) 543 cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]); 544 idx++; 545 break; 546 } 547 548 case VK_QUERY_TYPE_TIMESTAMP: { 549 uint64_t *slot = query_slot(pool, firstQuery + i); 550 if (write_results) 551 cpu_write_query_result(pData, flags, idx, slot[1]); 552 idx++; 553 break; 554 } 555 556#if GFX_VER >= 8 557 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { 558 const struct anv_physical_device *pdevice = device->physical; 559 assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT | 560 VK_QUERY_RESULT_PARTIAL_BIT)) == 0); 561 for (uint32_t p = 0; p < pool->n_passes; p++) { 562 const struct intel_perf_query_info *query = pool->pass_query[p]; 563 struct intel_perf_query_result result; 564 intel_perf_query_result_clear(&result); 565 intel_perf_query_result_accumulate_fields(&result, query, 566 pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, false), 567 pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, true), 568 false /* no_oa_accumulate */); 569 anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData); 570 } 571 break; 572 } 573#endif 574 575 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { 576 if (!write_results) 577 break; 578 const void *query_data = query_slot(pool, firstQuery + i); 579 const struct intel_perf_query_info *query = &device->physical->perf->queries[0]; 580 struct intel_perf_query_result result; 581 intel_perf_query_result_clear(&result); 582 intel_perf_query_result_accumulate_fields(&result, query, 583 query_data + intel_perf_query_data_offset(pool, false), 584 query_data + intel_perf_query_data_offset(pool, true), 585 false /* no_oa_accumulate */); 586 intel_perf_query_result_write_mdapi(pData, stride, 587 &device->info, 588 query, &result); 589 const uint64_t *marker = query_data + intel_perf_marker_offset(); 590 intel_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker); 591 break; 592 } 593 594 default: 595 unreachable("invalid pool type"); 596 } 597 598 if (!write_results) 599 status = VK_NOT_READY; 600 601 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) 602 cpu_write_query_result(pData, flags, idx, available); 603 604 pData += stride; 605 if (pData >= data_end) 606 break; 607 } 608 609 return status; 610} 611 612static void 613emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer, 614 struct anv_address addr) 615{ 616 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; 617 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 618 619 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 620 pc.DestinationAddressType = DAT_PPGTT; 621 pc.PostSyncOperation = WritePSDepthCount; 622 pc.DepthStallEnable = true; 623 pc.Address = addr; 624 625 if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4) 626 pc.CommandStreamerStallEnable = true; 627 } 628} 629 630static void 631emit_query_mi_availability(struct mi_builder *b, 632 struct anv_address addr, 633 bool available) 634{ 635 mi_store(b, mi_mem64(addr), mi_imm(available)); 636} 637 638static void 639emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer, 640 struct anv_address addr, 641 bool available) 642{ 643 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; 644 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 645 646 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 647 pc.DestinationAddressType = DAT_PPGTT; 648 pc.PostSyncOperation = WriteImmediateData; 649 pc.Address = addr; 650 pc.ImmediateData = available; 651 } 652} 653 654/** 655 * Goes through a series of consecutive query indices in the given pool 656 * setting all element values to 0 and emitting them as available. 657 */ 658static void 659emit_zero_queries(struct anv_cmd_buffer *cmd_buffer, 660 struct mi_builder *b, struct anv_query_pool *pool, 661 uint32_t first_index, uint32_t num_queries) 662{ 663 switch (pool->type) { 664 case VK_QUERY_TYPE_OCCLUSION: 665 case VK_QUERY_TYPE_TIMESTAMP: 666 /* These queries are written with a PIPE_CONTROL so clear them using the 667 * PIPE_CONTROL as well so we don't have to synchronize between 2 types 668 * of operations. 669 */ 670 assert((pool->stride % 8) == 0); 671 for (uint32_t i = 0; i < num_queries; i++) { 672 struct anv_address slot_addr = 673 anv_query_address(pool, first_index + i); 674 675 for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) { 676 emit_query_pc_availability(cmd_buffer, 677 anv_address_add(slot_addr, qword * 8), 678 false); 679 } 680 emit_query_pc_availability(cmd_buffer, slot_addr, true); 681 } 682 break; 683 684 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: 685 case VK_QUERY_TYPE_PIPELINE_STATISTICS: 686 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 687 for (uint32_t i = 0; i < num_queries; i++) { 688 struct anv_address slot_addr = 689 anv_query_address(pool, first_index + i); 690 mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8); 691 emit_query_mi_availability(b, slot_addr, true); 692 } 693 break; 694 695#if GFX_VER >= 8 696 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { 697 for (uint32_t i = 0; i < num_queries; i++) { 698 for (uint32_t p = 0; p < pool->n_passes; p++) { 699 mi_memset(b, khr_perf_query_data_address(pool, first_index + i, p, false), 700 0, 2 * pool->snapshot_size); 701 emit_query_mi_availability(b, 702 khr_perf_query_availability_address(pool, first_index + i, p), 703 true); 704 } 705 } 706 break; 707 } 708#endif 709 710 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: 711 for (uint32_t i = 0; i < num_queries; i++) { 712 struct anv_address slot_addr = 713 anv_query_address(pool, first_index + i); 714 mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8); 715 emit_query_mi_availability(b, slot_addr, true); 716 } 717 break; 718 719 default: 720 unreachable("Unsupported query type"); 721 } 722} 723 724void genX(CmdResetQueryPool)( 725 VkCommandBuffer commandBuffer, 726 VkQueryPool queryPool, 727 uint32_t firstQuery, 728 uint32_t queryCount) 729{ 730 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 731 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 732 733 switch (pool->type) { 734 case VK_QUERY_TYPE_OCCLUSION: 735 for (uint32_t i = 0; i < queryCount; i++) { 736 emit_query_pc_availability(cmd_buffer, 737 anv_query_address(pool, firstQuery + i), 738 false); 739 } 740 break; 741 742 case VK_QUERY_TYPE_TIMESTAMP: { 743 for (uint32_t i = 0; i < queryCount; i++) { 744 emit_query_pc_availability(cmd_buffer, 745 anv_query_address(pool, firstQuery + i), 746 false); 747 } 748 749 /* Add a CS stall here to make sure the PIPE_CONTROL above has 750 * completed. Otherwise some timestamps written later with MI_STORE_* 751 * commands might race with the PIPE_CONTROL in the loop above. 752 */ 753 anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_CS_STALL_BIT, 754 "vkCmdResetQueryPool of timestamps"); 755 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 756 break; 757 } 758 759 case VK_QUERY_TYPE_PIPELINE_STATISTICS: 760 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 761 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { 762 struct mi_builder b; 763 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 764 765 for (uint32_t i = 0; i < queryCount; i++) 766 emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false); 767 break; 768 } 769 770#if GFX_VER >= 8 771 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { 772 struct mi_builder b; 773 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 774 775 for (uint32_t i = 0; i < queryCount; i++) { 776 for (uint32_t p = 0; p < pool->n_passes; p++) { 777 emit_query_mi_availability( 778 &b, 779 khr_perf_query_availability_address(pool, firstQuery + i, p), 780 false); 781 } 782 } 783 break; 784 } 785#endif 786 787 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { 788 struct mi_builder b; 789 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 790 791 for (uint32_t i = 0; i < queryCount; i++) 792 emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false); 793 break; 794 } 795 796 default: 797 unreachable("Unsupported query type"); 798 } 799} 800 801void genX(ResetQueryPool)( 802 VkDevice _device, 803 VkQueryPool queryPool, 804 uint32_t firstQuery, 805 uint32_t queryCount) 806{ 807 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 808 809 for (uint32_t i = 0; i < queryCount; i++) { 810 if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { 811#if GFX_VER >= 8 812 for (uint32_t p = 0; p < pool->n_passes; p++) { 813 uint64_t *pass_slot = pool->bo->map + 814 khr_perf_query_availability_offset(pool, firstQuery + i, p); 815 *pass_slot = 0; 816 } 817#endif 818 } else { 819 uint64_t *slot = query_slot(pool, firstQuery + i); 820 *slot = 0; 821 } 822 } 823} 824 825static const uint32_t vk_pipeline_stat_to_reg[] = { 826 GENX(IA_VERTICES_COUNT_num), 827 GENX(IA_PRIMITIVES_COUNT_num), 828 GENX(VS_INVOCATION_COUNT_num), 829 GENX(GS_INVOCATION_COUNT_num), 830 GENX(GS_PRIMITIVES_COUNT_num), 831 GENX(CL_INVOCATION_COUNT_num), 832 GENX(CL_PRIMITIVES_COUNT_num), 833 GENX(PS_INVOCATION_COUNT_num), 834 GENX(HS_INVOCATION_COUNT_num), 835 GENX(DS_INVOCATION_COUNT_num), 836 GENX(CS_INVOCATION_COUNT_num), 837}; 838 839static void 840emit_pipeline_stat(struct mi_builder *b, uint32_t stat, 841 struct anv_address addr) 842{ 843 STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK == 844 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1); 845 846 assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg)); 847 mi_store(b, mi_mem64(addr), mi_reg64(vk_pipeline_stat_to_reg[stat])); 848} 849 850static void 851emit_xfb_query(struct mi_builder *b, uint32_t stream, 852 struct anv_address addr) 853{ 854 assert(stream < MAX_XFB_STREAMS); 855 856 mi_store(b, mi_mem64(anv_address_add(addr, 0)), 857 mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8)); 858 mi_store(b, mi_mem64(anv_address_add(addr, 16)), 859 mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8)); 860} 861 862static void 863emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer, 864 struct anv_query_pool *pool, 865 struct mi_builder *b, 866 struct anv_address query_addr, 867 bool end) 868{ 869 const struct intel_perf_query_field_layout *layout = 870 &cmd_buffer->device->physical->perf->query_layout; 871 struct anv_address data_addr = 872 anv_address_add(query_addr, intel_perf_query_data_offset(pool, end)); 873 874 for (uint32_t f = 0; f < layout->n_fields; f++) { 875 const struct intel_perf_query_field *field = 876 &layout->fields[end ? f : (layout->n_fields - 1 - f)]; 877 878 switch (field->type) { 879 case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC: 880 anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) { 881 rpc.MemoryAddress = anv_address_add(data_addr, field->location); 882 } 883 break; 884 885 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: 886 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT: 887 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A: 888 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B: 889 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: { 890 struct anv_address addr = anv_address_add(data_addr, field->location); 891 struct mi_value src = field->size == 8 ? 892 mi_reg64(field->mmio_offset) : 893 mi_reg32(field->mmio_offset); 894 struct mi_value dst = field->size == 8 ? 895 mi_mem64(addr) : mi_mem32(addr); 896 mi_store(b, dst, src); 897 break; 898 } 899 900 default: 901 unreachable("Invalid query field"); 902 break; 903 } 904 } 905} 906 907void genX(CmdBeginQuery)( 908 VkCommandBuffer commandBuffer, 909 VkQueryPool queryPool, 910 uint32_t query, 911 VkQueryControlFlags flags) 912{ 913 genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0); 914} 915 916void genX(CmdBeginQueryIndexedEXT)( 917 VkCommandBuffer commandBuffer, 918 VkQueryPool queryPool, 919 uint32_t query, 920 VkQueryControlFlags flags, 921 uint32_t index) 922{ 923 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 924 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 925 struct anv_address query_addr = anv_query_address(pool, query); 926 927 struct mi_builder b; 928 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 929 930 switch (pool->type) { 931 case VK_QUERY_TYPE_OCCLUSION: 932 emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8)); 933 break; 934 935 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: 936 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 937 pc.CommandStreamerStallEnable = true; 938 pc.StallAtPixelScoreboard = true; 939 } 940 mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)), 941 mi_reg64(GENX(CL_INVOCATION_COUNT_num))); 942 break; 943 944 case VK_QUERY_TYPE_PIPELINE_STATISTICS: { 945 /* TODO: This might only be necessary for certain stats */ 946 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 947 pc.CommandStreamerStallEnable = true; 948 pc.StallAtPixelScoreboard = true; 949 } 950 951 uint32_t statistics = pool->pipeline_statistics; 952 uint32_t offset = 8; 953 while (statistics) { 954 uint32_t stat = u_bit_scan(&statistics); 955 emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset)); 956 offset += 16; 957 } 958 break; 959 } 960 961 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 962 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 963 pc.CommandStreamerStallEnable = true; 964 pc.StallAtPixelScoreboard = true; 965 } 966 emit_xfb_query(&b, index, anv_address_add(query_addr, 8)); 967 break; 968 969#if GFX_VER >= 8 970 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { 971 if (!khr_perf_query_ensure_relocs(cmd_buffer)) 972 return; 973 974 const struct anv_physical_device *pdevice = cmd_buffer->device->physical; 975 const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout; 976 977 uint32_t reloc_idx = 0; 978 for (uint32_t end = 0; end < 2; end++) { 979 for (uint32_t r = 0; r < layout->n_fields; r++) { 980 const struct intel_perf_query_field *field = 981 &layout->fields[end ? r : (layout->n_fields - 1 - r)]; 982 struct mi_value reg_addr = 983 mi_iadd( 984 &b, 985 mi_imm(intel_canonical_address(pool->bo->offset + 986 khr_perf_query_data_offset(pool, query, 0, end) + 987 field->location)), 988 mi_reg64(ANV_PERF_QUERY_OFFSET_REG)); 989 cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr); 990 991 if (field->type != INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC && 992 field->size == 8) { 993 reg_addr = 994 mi_iadd( 995 &b, 996 mi_imm(intel_canonical_address(pool->bo->offset + 997 khr_perf_query_data_offset(pool, query, 0, end) + 998 field->location + 4)), 999 mi_reg64(ANV_PERF_QUERY_OFFSET_REG)); 1000 cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr); 1001 } 1002 } 1003 } 1004 1005 struct mi_value availability_write_offset = 1006 mi_iadd( 1007 &b, 1008 mi_imm( 1009 intel_canonical_address( 1010 pool->bo->offset + 1011 khr_perf_query_availability_offset(pool, query, 0 /* pass */))), 1012 mi_reg64(ANV_PERF_QUERY_OFFSET_REG)); 1013 cmd_buffer->self_mod_locations[reloc_idx++] = 1014 mi_store_address(&b, availability_write_offset); 1015 1016 assert(reloc_idx == pdevice->n_perf_query_commands); 1017 1018 mi_self_mod_barrier(&b); 1019 1020 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 1021 pc.CommandStreamerStallEnable = true; 1022 pc.StallAtPixelScoreboard = true; 1023 } 1024 cmd_buffer->perf_query_pool = pool; 1025 1026 cmd_buffer->perf_reloc_idx = 0; 1027 for (uint32_t r = 0; r < layout->n_fields; r++) { 1028 const struct intel_perf_query_field *field = 1029 &layout->fields[layout->n_fields - 1 - r]; 1030 void *dws; 1031 1032 switch (field->type) { 1033 case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC: 1034 dws = anv_batch_emitn(&cmd_buffer->batch, 1035 GENX(MI_REPORT_PERF_COUNT_length), 1036 GENX(MI_REPORT_PERF_COUNT), 1037 .MemoryAddress = query_addr /* Will be overwritten */); 1038 _mi_resolve_address_token(&b, 1039 cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], 1040 dws + 1041 GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8); 1042 break; 1043 1044 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: 1045 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT: 1046 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A: 1047 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B: 1048 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: 1049 dws = 1050 anv_batch_emitn(&cmd_buffer->batch, 1051 GENX(MI_STORE_REGISTER_MEM_length), 1052 GENX(MI_STORE_REGISTER_MEM), 1053 .RegisterAddress = field->mmio_offset, 1054 .MemoryAddress = query_addr /* Will be overwritten */ ); 1055 _mi_resolve_address_token(&b, 1056 cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], 1057 dws + 1058 GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); 1059 if (field->size == 8) { 1060 dws = 1061 anv_batch_emitn(&cmd_buffer->batch, 1062 GENX(MI_STORE_REGISTER_MEM_length), 1063 GENX(MI_STORE_REGISTER_MEM), 1064 .RegisterAddress = field->mmio_offset + 4, 1065 .MemoryAddress = query_addr /* Will be overwritten */ ); 1066 _mi_resolve_address_token(&b, 1067 cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], 1068 dws + 1069 GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); 1070 } 1071 break; 1072 1073 default: 1074 unreachable("Invalid query field"); 1075 break; 1076 } 1077 } 1078 break; 1079 } 1080#endif 1081 1082 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { 1083 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 1084 pc.CommandStreamerStallEnable = true; 1085 pc.StallAtPixelScoreboard = true; 1086 } 1087 emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, false); 1088 break; 1089 } 1090 1091 default: 1092 unreachable(""); 1093 } 1094} 1095 1096void genX(CmdEndQuery)( 1097 VkCommandBuffer commandBuffer, 1098 VkQueryPool queryPool, 1099 uint32_t query) 1100{ 1101 genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0); 1102} 1103 1104void genX(CmdEndQueryIndexedEXT)( 1105 VkCommandBuffer commandBuffer, 1106 VkQueryPool queryPool, 1107 uint32_t query, 1108 uint32_t index) 1109{ 1110 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 1111 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 1112 struct anv_address query_addr = anv_query_address(pool, query); 1113 1114 struct mi_builder b; 1115 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 1116 1117 switch (pool->type) { 1118 case VK_QUERY_TYPE_OCCLUSION: 1119 emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16)); 1120 emit_query_pc_availability(cmd_buffer, query_addr, true); 1121 break; 1122 1123 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: 1124 /* Ensure previous commands have completed before capturing the register 1125 * value. 1126 */ 1127 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 1128 pc.CommandStreamerStallEnable = true; 1129 pc.StallAtPixelScoreboard = true; 1130 } 1131 1132 mi_store(&b, mi_mem64(anv_address_add(query_addr, 16)), 1133 mi_reg64(GENX(CL_INVOCATION_COUNT_num))); 1134 emit_query_mi_availability(&b, query_addr, true); 1135 break; 1136 1137 case VK_QUERY_TYPE_PIPELINE_STATISTICS: { 1138 /* TODO: This might only be necessary for certain stats */ 1139 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 1140 pc.CommandStreamerStallEnable = true; 1141 pc.StallAtPixelScoreboard = true; 1142 } 1143 1144 uint32_t statistics = pool->pipeline_statistics; 1145 uint32_t offset = 16; 1146 while (statistics) { 1147 uint32_t stat = u_bit_scan(&statistics); 1148 emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset)); 1149 offset += 16; 1150 } 1151 1152 emit_query_mi_availability(&b, query_addr, true); 1153 break; 1154 } 1155 1156 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 1157 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 1158 pc.CommandStreamerStallEnable = true; 1159 pc.StallAtPixelScoreboard = true; 1160 } 1161 1162 emit_xfb_query(&b, index, anv_address_add(query_addr, 16)); 1163 emit_query_mi_availability(&b, query_addr, true); 1164 break; 1165 1166#if GFX_VER >= 8 1167 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { 1168 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 1169 pc.CommandStreamerStallEnable = true; 1170 pc.StallAtPixelScoreboard = true; 1171 } 1172 cmd_buffer->perf_query_pool = pool; 1173 1174 if (!khr_perf_query_ensure_relocs(cmd_buffer)) 1175 return; 1176 1177 const struct anv_physical_device *pdevice = cmd_buffer->device->physical; 1178 const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout; 1179 1180 void *dws; 1181 for (uint32_t r = 0; r < layout->n_fields; r++) { 1182 const struct intel_perf_query_field *field = &layout->fields[r]; 1183 1184 switch (field->type) { 1185 case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC: 1186 dws = anv_batch_emitn(&cmd_buffer->batch, 1187 GENX(MI_REPORT_PERF_COUNT_length), 1188 GENX(MI_REPORT_PERF_COUNT), 1189 .MemoryAddress = query_addr /* Will be overwritten */); 1190 _mi_resolve_address_token(&b, 1191 cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], 1192 dws + 1193 GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8); 1194 break; 1195 1196 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: 1197 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT: 1198 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A: 1199 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B: 1200 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: 1201 dws = 1202 anv_batch_emitn(&cmd_buffer->batch, 1203 GENX(MI_STORE_REGISTER_MEM_length), 1204 GENX(MI_STORE_REGISTER_MEM), 1205 .RegisterAddress = field->mmio_offset, 1206 .MemoryAddress = query_addr /* Will be overwritten */ ); 1207 _mi_resolve_address_token(&b, 1208 cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], 1209 dws + 1210 GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); 1211 if (field->size == 8) { 1212 dws = 1213 anv_batch_emitn(&cmd_buffer->batch, 1214 GENX(MI_STORE_REGISTER_MEM_length), 1215 GENX(MI_STORE_REGISTER_MEM), 1216 .RegisterAddress = field->mmio_offset + 4, 1217 .MemoryAddress = query_addr /* Will be overwritten */ ); 1218 _mi_resolve_address_token(&b, 1219 cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], 1220 dws + 1221 GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); 1222 } 1223 break; 1224 1225 default: 1226 unreachable("Invalid query field"); 1227 break; 1228 } 1229 } 1230 1231 dws = 1232 anv_batch_emitn(&cmd_buffer->batch, 1233 GENX(MI_STORE_DATA_IMM_length), 1234 GENX(MI_STORE_DATA_IMM), 1235 .ImmediateData = true); 1236 _mi_resolve_address_token(&b, 1237 cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], 1238 dws + 1239 GENX(MI_STORE_DATA_IMM_Address_start) / 8); 1240 1241 assert(cmd_buffer->perf_reloc_idx == pdevice->n_perf_query_commands); 1242 break; 1243 } 1244#endif 1245 1246 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { 1247 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 1248 pc.CommandStreamerStallEnable = true; 1249 pc.StallAtPixelScoreboard = true; 1250 } 1251 uint32_t marker_offset = intel_perf_marker_offset(); 1252 mi_store(&b, mi_mem64(anv_address_add(query_addr, marker_offset)), 1253 mi_imm(cmd_buffer->intel_perf_marker)); 1254 emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, true); 1255 emit_query_mi_availability(&b, query_addr, true); 1256 break; 1257 } 1258 1259 default: 1260 unreachable(""); 1261 } 1262 1263 /* When multiview is active the spec requires that N consecutive query 1264 * indices are used, where N is the number of active views in the subpass. 1265 * The spec allows that we only write the results to one of the queries 1266 * but we still need to manage result availability for all the query indices. 1267 * Since we only emit a single query for all active views in the 1268 * first index, mark the other query indices as being already available 1269 * with result 0. 1270 */ 1271 if (cmd_buffer->state.gfx.view_mask) { 1272 const uint32_t num_queries = 1273 util_bitcount(cmd_buffer->state.gfx.view_mask); 1274 if (num_queries > 1) 1275 emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1); 1276 } 1277} 1278 1279#define TIMESTAMP 0x2358 1280 1281void genX(CmdWriteTimestamp2)( 1282 VkCommandBuffer commandBuffer, 1283 VkPipelineStageFlags2 stage, 1284 VkQueryPool queryPool, 1285 uint32_t query) 1286{ 1287 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 1288 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 1289 struct anv_address query_addr = anv_query_address(pool, query); 1290 1291 assert(pool->type == VK_QUERY_TYPE_TIMESTAMP); 1292 1293 struct mi_builder b; 1294 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 1295 1296 if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) { 1297 mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)), 1298 mi_reg64(TIMESTAMP)); 1299 emit_query_mi_availability(&b, query_addr, true); 1300 } else { 1301 /* Everything else is bottom-of-pipe */ 1302 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; 1303 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 1304 1305 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 1306 pc.DestinationAddressType = DAT_PPGTT; 1307 pc.PostSyncOperation = WriteTimestamp; 1308 pc.Address = anv_address_add(query_addr, 8); 1309 1310 if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4) 1311 pc.CommandStreamerStallEnable = true; 1312 } 1313 emit_query_pc_availability(cmd_buffer, query_addr, true); 1314 } 1315 1316 1317 /* When multiview is active the spec requires that N consecutive query 1318 * indices are used, where N is the number of active views in the subpass. 1319 * The spec allows that we only write the results to one of the queries 1320 * but we still need to manage result availability for all the query indices. 1321 * Since we only emit a single query for all active views in the 1322 * first index, mark the other query indices as being already available 1323 * with result 0. 1324 */ 1325 if (cmd_buffer->state.gfx.view_mask) { 1326 const uint32_t num_queries = 1327 util_bitcount(cmd_buffer->state.gfx.view_mask); 1328 if (num_queries > 1) 1329 emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1); 1330 } 1331} 1332 1333#if GFX_VERx10 >= 75 1334 1335#define MI_PREDICATE_SRC0 0x2400 1336#define MI_PREDICATE_SRC1 0x2408 1337#define MI_PREDICATE_RESULT 0x2418 1338 1339/** 1340 * Writes the results of a query to dst_addr is the value at poll_addr is equal 1341 * to the reference value. 1342 */ 1343static void 1344gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer, 1345 struct mi_builder *b, 1346 struct anv_address poll_addr, 1347 struct anv_address dst_addr, 1348 uint64_t ref_value, 1349 VkQueryResultFlags flags, 1350 uint32_t value_index, 1351 struct mi_value query_result) 1352{ 1353 mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem64(poll_addr)); 1354 mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(ref_value)); 1355 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { 1356 mip.LoadOperation = LOAD_LOAD; 1357 mip.CombineOperation = COMBINE_SET; 1358 mip.CompareOperation = COMPARE_SRCS_EQUAL; 1359 } 1360 1361 if (flags & VK_QUERY_RESULT_64_BIT) { 1362 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8); 1363 mi_store_if(b, mi_mem64(res_addr), query_result); 1364 } else { 1365 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4); 1366 mi_store_if(b, mi_mem32(res_addr), query_result); 1367 } 1368} 1369 1370static void 1371gpu_write_query_result(struct mi_builder *b, 1372 struct anv_address dst_addr, 1373 VkQueryResultFlags flags, 1374 uint32_t value_index, 1375 struct mi_value query_result) 1376{ 1377 if (flags & VK_QUERY_RESULT_64_BIT) { 1378 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8); 1379 mi_store(b, mi_mem64(res_addr), query_result); 1380 } else { 1381 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4); 1382 mi_store(b, mi_mem32(res_addr), query_result); 1383 } 1384} 1385 1386static struct mi_value 1387compute_query_result(struct mi_builder *b, struct anv_address addr) 1388{ 1389 return mi_isub(b, mi_mem64(anv_address_add(addr, 8)), 1390 mi_mem64(anv_address_add(addr, 0))); 1391} 1392 1393void genX(CmdCopyQueryPoolResults)( 1394 VkCommandBuffer commandBuffer, 1395 VkQueryPool queryPool, 1396 uint32_t firstQuery, 1397 uint32_t queryCount, 1398 VkBuffer destBuffer, 1399 VkDeviceSize destOffset, 1400 VkDeviceSize destStride, 1401 VkQueryResultFlags flags) 1402{ 1403 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 1404 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 1405 ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer); 1406 1407 struct mi_builder b; 1408 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 1409 struct mi_value result; 1410 1411 /* If render target writes are ongoing, request a render target cache flush 1412 * to ensure proper ordering of the commands from the 3d pipe and the 1413 * command streamer. 1414 */ 1415 if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) { 1416 anv_add_pending_pipe_bits(cmd_buffer, 1417 ANV_PIPE_TILE_CACHE_FLUSH_BIT | 1418 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT, 1419 "CopyQueryPoolResults"); 1420 } 1421 1422 if ((flags & VK_QUERY_RESULT_WAIT_BIT) || 1423 (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) || 1424 /* Occlusion & timestamp queries are written using a PIPE_CONTROL and 1425 * because we're about to copy values from MI commands, we need to 1426 * stall the command streamer to make sure the PIPE_CONTROL values have 1427 * landed, otherwise we could see inconsistent values & availability. 1428 * 1429 * From the vulkan spec: 1430 * 1431 * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of 1432 * previous uses of vkCmdResetQueryPool in the same queue, without 1433 * any additional synchronization." 1434 */ 1435 pool->type == VK_QUERY_TYPE_OCCLUSION || 1436 pool->type == VK_QUERY_TYPE_TIMESTAMP) { 1437 anv_add_pending_pipe_bits(cmd_buffer, 1438 ANV_PIPE_CS_STALL_BIT, 1439 "CopyQueryPoolResults"); 1440 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 1441 } 1442 1443 struct anv_address dest_addr = anv_address_add(buffer->address, destOffset); 1444 for (uint32_t i = 0; i < queryCount; i++) { 1445 struct anv_address query_addr = anv_query_address(pool, firstQuery + i); 1446 uint32_t idx = 0; 1447 switch (pool->type) { 1448 case VK_QUERY_TYPE_OCCLUSION: 1449 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: 1450 result = compute_query_result(&b, anv_address_add(query_addr, 8)); 1451 /* Like in the case of vkGetQueryPoolResults, if the query is 1452 * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set, 1453 * conservatively write 0 as the query result. If the 1454 * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value. 1455 */ 1456 gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr, 1457 1 /* available */, flags, idx, result); 1458 if (flags & VK_QUERY_RESULT_PARTIAL_BIT) { 1459 gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr, 1460 0 /* unavailable */, flags, idx, mi_imm(0)); 1461 } 1462 idx++; 1463 break; 1464 1465 case VK_QUERY_TYPE_PIPELINE_STATISTICS: { 1466 uint32_t statistics = pool->pipeline_statistics; 1467 while (statistics) { 1468 uint32_t stat = u_bit_scan(&statistics); 1469 1470 result = compute_query_result(&b, anv_address_add(query_addr, 1471 idx * 16 + 8)); 1472 1473 /* WaDividePSInvocationCountBy4:HSW,BDW */ 1474 if ((cmd_buffer->device->info.ver == 8 || 1475 cmd_buffer->device->info.verx10 == 75) && 1476 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) { 1477 result = mi_ushr32_imm(&b, result, 2); 1478 } 1479 1480 gpu_write_query_result(&b, dest_addr, flags, idx++, result); 1481 } 1482 assert(idx == util_bitcount(pool->pipeline_statistics)); 1483 break; 1484 } 1485 1486 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 1487 result = compute_query_result(&b, anv_address_add(query_addr, 8)); 1488 gpu_write_query_result(&b, dest_addr, flags, idx++, result); 1489 result = compute_query_result(&b, anv_address_add(query_addr, 24)); 1490 gpu_write_query_result(&b, dest_addr, flags, idx++, result); 1491 break; 1492 1493 case VK_QUERY_TYPE_TIMESTAMP: 1494 result = mi_mem64(anv_address_add(query_addr, 8)); 1495 gpu_write_query_result(&b, dest_addr, flags, idx++, result); 1496 break; 1497 1498#if GFX_VER >= 8 1499 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: 1500 unreachable("Copy KHR performance query results not implemented"); 1501 break; 1502#endif 1503 1504 default: 1505 unreachable("unhandled query type"); 1506 } 1507 1508 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { 1509 gpu_write_query_result(&b, dest_addr, flags, idx, 1510 mi_mem64(query_addr)); 1511 } 1512 1513 dest_addr = anv_address_add(dest_addr, destStride); 1514 } 1515} 1516 1517#else 1518void genX(CmdCopyQueryPoolResults)( 1519 VkCommandBuffer commandBuffer, 1520 VkQueryPool queryPool, 1521 uint32_t firstQuery, 1522 uint32_t queryCount, 1523 VkBuffer destBuffer, 1524 VkDeviceSize destOffset, 1525 VkDeviceSize destStride, 1526 VkQueryResultFlags flags) 1527{ 1528 anv_finishme("Queries not yet supported on Ivy Bridge"); 1529} 1530#endif 1531