1/* 2 * Copyright © 2021 Valve Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include <inttypes.h> 25 26#include "ac_perfcounter.h" 27#include "amdgfxregs.h" 28#include "radv_cs.h" 29#include "radv_private.h" 30#include "sid.h" 31 32void 33radv_perfcounter_emit_shaders(struct radeon_cmdbuf *cs, unsigned shaders) 34{ 35 radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2); 36 radeon_emit(cs, shaders & 0x7f); 37 radeon_emit(cs, 0xffffffff); 38} 39 40static void 41radv_emit_windowed_counters(struct radv_device *device, struct radeon_cmdbuf *cs, int family, 42 bool enable) 43{ 44 if (family == RADV_QUEUE_GENERAL) { 45 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 46 radeon_emit(cs, EVENT_TYPE(enable ? V_028A90_PERFCOUNTER_START : V_028A90_PERFCOUNTER_STOP) | 47 EVENT_INDEX(0)); 48 } 49 50 radeon_set_sh_reg(cs, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(enable)); 51} 52 53void 54radv_perfcounter_emit_spm_reset(struct radeon_cmdbuf *cs) 55{ 56 radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, 57 S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) | 58 S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_DISABLE_AND_RESET)); 59} 60 61void 62radv_perfcounter_emit_spm_start(struct radv_device *device, struct radeon_cmdbuf *cs, int family) 63{ 64 /* Start SPM counters. */ 65 radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, 66 S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) | 67 S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_START_COUNTING)); 68 69 radv_emit_windowed_counters(device, cs, family, true); 70} 71 72void 73radv_perfcounter_emit_spm_stop(struct radv_device *device, struct radeon_cmdbuf *cs, int family) 74{ 75 radv_emit_windowed_counters(device, cs, family, false); 76 77 /* Stop SPM counters. */ 78 radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, 79 S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) | 80 S_036020_SPM_PERFMON_STATE(device->physical_device->rad_info.never_stop_sq_perf_counters ? 81 V_036020_STRM_PERFMON_STATE_START_COUNTING : 82 V_036020_STRM_PERFMON_STATE_STOP_COUNTING)); 83} 84 85enum radv_perfcounter_op { 86 RADV_PC_OP_SUM, 87 RADV_PC_OP_MAX, 88 RADV_PC_OP_RATIO_DIVSCALE, 89 RADV_PC_OP_REVERSE_RATIO, /* (reg1 - reg0) / reg1 */ 90 RADV_PC_OP_SUM_WEIGHTED_4, 91}; 92 93#define S_REG_SEL(x) ((x)&0xFFFF) 94#define G_REG_SEL(x) ((x)&0xFFFF) 95#define S_REG_BLOCK(x) ((x) << 16) 96#define G_REG_BLOCK(x) (((x) >> 16) & 0x7FFF) 97 98#define S_REG_OFFSET(x) ((x)&0xFFFF) 99#define G_REG_OFFSET(x) ((x)&0xFFFF) 100#define S_REG_INSTANCES(x) ((x) << 16) 101#define G_REG_INSTANCES(x) (((x) >> 16) & 0x7FFF) 102#define S_REG_CONSTANT(x) ((x) << 31) 103#define G_REG_CONSTANT(x) ((x) >> 31) 104 105struct radv_perfcounter_impl { 106 enum radv_perfcounter_op op; 107 uint32_t regs[8]; 108}; 109 110/* Only append to this list, never insert into the middle or remove (but can rename). 111 * 112 * The invariant we're trying to get here is counters that have the same meaning, so 113 * these can be shared between counters that have different implementations on different 114 * GPUs, but should be unique within a GPU. 115 */ 116enum radv_perfcounter_uuid { 117 RADV_PC_UUID_GPU_CYCLES, 118 RADV_PC_UUID_SHADER_WAVES, 119 RADV_PC_UUID_SHADER_INSTRUCTIONS, 120 RADV_PC_UUID_SHADER_INSTRUCTIONS_VALU, 121 RADV_PC_UUID_SHADER_INSTRUCTIONS_SALU, 122 RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_LOAD, 123 RADV_PC_UUID_SHADER_INSTRUCTIONS_SMEM_LOAD, 124 RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_STORE, 125 RADV_PC_UUID_SHADER_INSTRUCTIONS_LDS, 126 RADV_PC_UUID_SHADER_INSTRUCTIONS_GDS, 127 RADV_PC_UUID_SHADER_VALU_BUSY, 128 RADV_PC_UUID_SHADER_SALU_BUSY, 129 RADV_PC_UUID_VRAM_READ_SIZE, 130 RADV_PC_UUID_VRAM_WRITE_SIZE, 131 RADV_PC_UUID_L0_CACHE_HIT_RATIO, 132 RADV_PC_UUID_L1_CACHE_HIT_RATIO, 133 RADV_PC_UUID_L2_CACHE_HIT_RATIO, 134}; 135 136struct radv_perfcounter_desc { 137 struct radv_perfcounter_impl impl; 138 139 VkPerformanceCounterUnitKHR unit; 140 141 char name[VK_MAX_DESCRIPTION_SIZE]; 142 char category[VK_MAX_DESCRIPTION_SIZE]; 143 char description[VK_MAX_DESCRIPTION_SIZE]; 144 enum radv_perfcounter_uuid uuid; 145}; 146 147#define PC_DESC(arg_op, arg_unit, arg_name, arg_category, arg_description, arg_uuid, ...) \ 148 (struct radv_perfcounter_desc) \ 149 { \ 150 .impl = {.op = arg_op, .regs = {__VA_ARGS__}}, \ 151 .unit = VK_PERFORMANCE_COUNTER_UNIT_##arg_unit##_KHR, .name = arg_name, \ 152 .category = arg_category, .description = arg_description, .uuid = RADV_PC_UUID_##arg_uuid \ 153 } 154 155#define ADD_PC(op, unit, name, category, description, uuid, ...) \ 156 do { \ 157 if (descs) { \ 158 descs[*count] = PC_DESC((op), unit, name, category, description, uuid, __VA_ARGS__); \ 159 } \ 160 ++*count; \ 161 } while (0) 162#define CTR(block, ctr) (S_REG_BLOCK(block) | S_REG_SEL(ctr)) 163#define CONSTANT(v) (S_REG_CONSTANT(1) | (uint32_t)(v)) 164 165enum { GRBM_PERF_SEL_GUI_ACTIVE = CTR(GRBM, 2) }; 166 167enum { CPF_PERF_SEL_CPF_STAT_BUSY_GFX10 = CTR(CPF, 0x18) }; 168 169enum { 170 GL1C_PERF_SEL_REQ = CTR(GL1C, 0xe), 171 GL1C_PERF_SEL_REQ_MISS = CTR(GL1C, 0x12), 172}; 173 174enum { 175 GL2C_PERF_SEL_REQ = CTR(GL2C, 0x3), 176 177 GL2C_PERF_SEL_MISS_GFX101 = CTR(GL2C, 0x23), 178 GL2C_PERF_SEL_MC_WRREQ_GFX101 = CTR(GL2C, 0x4b), 179 GL2C_PERF_SEL_EA_WRREQ_64B_GFX101 = CTR(GL2C, 0x4c), 180 GL2C_PERF_SEL_EA_RDREQ_32B_GFX101 = CTR(GL2C, 0x59), 181 GL2C_PERF_SEL_EA_RDREQ_64B_GFX101 = CTR(GL2C, 0x5a), 182 GL2C_PERF_SEL_EA_RDREQ_96B_GFX101 = CTR(GL2C, 0x5b), 183 GL2C_PERF_SEL_EA_RDREQ_128B_GFX101 = CTR(GL2C, 0x5c), 184 185 GL2C_PERF_SEL_MISS_GFX103 = CTR(GL2C, 0x2b), 186 GL2C_PERF_SEL_MC_WRREQ_GFX103 = CTR(GL2C, 0x53), 187 GL2C_PERF_SEL_EA_WRREQ_64B_GFX103 = CTR(GL2C, 0x55), 188 GL2C_PERF_SEL_EA_RDREQ_32B_GFX103 = CTR(GL2C, 0x63), 189 GL2C_PERF_SEL_EA_RDREQ_64B_GFX103 = CTR(GL2C, 0x64), 190 GL2C_PERF_SEL_EA_RDREQ_96B_GFX103 = CTR(GL2C, 0x65), 191 GL2C_PERF_SEL_EA_RDREQ_128B_GFX103 = CTR(GL2C, 0x66), 192}; 193 194enum { 195 SQ_PERF_SEL_WAVES = CTR(SQ, 0x4), 196 SQ_PERF_SEL_INSTS_ALL_GFX10 = CTR(SQ, 0x31), 197 SQ_PERF_SEL_INSTS_GDS_GFX10 = CTR(SQ, 0x37), 198 SQ_PERF_SEL_INSTS_LDS_GFX10 = CTR(SQ, 0x3b), 199 SQ_PERF_SEL_INSTS_SALU_GFX10 = CTR(SQ, 0x3c), 200 SQ_PERF_SEL_INSTS_SMEM_GFX10 = CTR(SQ, 0x3d), 201 SQ_PERF_SEL_INSTS_VALU_GFX10 = CTR(SQ, 0x40), 202 SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10 = CTR(SQ, 0x45), 203 SQ_PERF_SEL_INSTS_TEX_STORE_GFX10 = CTR(SQ, 0x46), 204 SQ_PERF_SEL_INST_CYCLES_VALU_GFX10 = CTR(SQ, 0x75), 205}; 206 207enum { 208 TCP_PERF_SEL_REQ_GFX10 = CTR(TCP, 0x9), 209 TCP_PERF_SEL_REQ_MISS_GFX10 = CTR(TCP, 0x12), 210}; 211 212#define CTR_NUM_SIMD \ 213 CONSTANT(pdev->rad_info.num_simd_per_compute_unit * pdev->rad_info.num_cu) 214#define CTR_NUM_CUS CONSTANT(pdev->rad_info.num_cu) 215 216static void 217radv_query_perfcounter_descs(struct radv_physical_device *pdev, uint32_t *count, 218 struct radv_perfcounter_desc *descs) 219{ 220 *count = 0; 221 222 ADD_PC(RADV_PC_OP_MAX, CYCLES, "GPU active cycles", "GRBM", 223 "cycles the GPU is active processing a command buffer.", GPU_CYCLES, 224 GRBM_PERF_SEL_GUI_ACTIVE); 225 226 ADD_PC(RADV_PC_OP_SUM, GENERIC, "Waves", "Shaders", "Number of waves executed", SHADER_WAVES, 227 SQ_PERF_SEL_WAVES); 228 ADD_PC(RADV_PC_OP_SUM, GENERIC, "Instructions", "Shaders", "Number of Instructions executed", 229 SHADER_INSTRUCTIONS, SQ_PERF_SEL_INSTS_ALL_GFX10); 230 ADD_PC(RADV_PC_OP_SUM, GENERIC, "VALU Instructions", "Shaders", 231 "Number of VALU Instructions executed", SHADER_INSTRUCTIONS_VALU, 232 SQ_PERF_SEL_INSTS_VALU_GFX10); 233 ADD_PC(RADV_PC_OP_SUM, GENERIC, "SALU Instructions", "Shaders", 234 "Number of SALU Instructions executed", SHADER_INSTRUCTIONS_SALU, 235 SQ_PERF_SEL_INSTS_SALU_GFX10); 236 ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Load Instructions", "Shaders", 237 "Number of VMEM load instructions executed", SHADER_INSTRUCTIONS_VMEM_LOAD, 238 SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10); 239 ADD_PC(RADV_PC_OP_SUM, GENERIC, "SMEM Load Instructions", "Shaders", 240 "Number of SMEM load instructions executed", SHADER_INSTRUCTIONS_SMEM_LOAD, 241 SQ_PERF_SEL_INSTS_SMEM_GFX10); 242 ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Store Instructions", "Shaders", 243 "Number of VMEM store instructions executed", SHADER_INSTRUCTIONS_VMEM_STORE, 244 SQ_PERF_SEL_INSTS_TEX_STORE_GFX10); 245 ADD_PC(RADV_PC_OP_SUM, GENERIC, "LDS Instructions", "Shaders", 246 "Number of LDS Instructions executed", SHADER_INSTRUCTIONS_LDS, 247 SQ_PERF_SEL_INSTS_LDS_GFX10); 248 ADD_PC(RADV_PC_OP_SUM, GENERIC, "GDS Instructions", "Shaders", 249 "Number of GDS Instructions executed", SHADER_INSTRUCTIONS_GDS, 250 SQ_PERF_SEL_INSTS_GDS_GFX10); 251 252 ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "VALU Busy", "Shader Utilization", 253 "Percentage of time the VALU units are busy", SHADER_VALU_BUSY, 254 SQ_PERF_SEL_INST_CYCLES_VALU_GFX10, CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_SIMD); 255 ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "SALU Busy", "Shader Utilization", 256 "Percentage of time the SALU units are busy", SHADER_SALU_BUSY, 257 SQ_PERF_SEL_INSTS_SALU_GFX10, CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_CUS); 258 259 if (pdev->rad_info.gfx_level >= GFX10_3) { 260 ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory", 261 "Number of bytes read from VRAM", VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX103, 262 CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX103, CONSTANT(64), 263 GL2C_PERF_SEL_EA_RDREQ_96B_GFX103, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX103, 264 CONSTANT(128)); 265 ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory", 266 "Number of bytes written to VRAM", VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX103, 267 CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX103, CONSTANT(64), CONSTANT(0), 268 CONSTANT(0), CONSTANT(0), CONSTANT(0)); 269 } else { 270 ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory", 271 "Number of bytes read from VRAM", VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX101, 272 CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX101, CONSTANT(64), 273 GL2C_PERF_SEL_EA_RDREQ_96B_GFX101, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX101, 274 CONSTANT(128)); 275 ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory", 276 "Number of bytes written to VRAM", VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX101, 277 CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX101, CONSTANT(32), CONSTANT(0), 278 CONSTANT(0), CONSTANT(0), CONSTANT(0)); 279 } 280 281 ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L0 cache hit ratio", "Memory", "Hit ratio of L0 cache", 282 L0_CACHE_HIT_RATIO, TCP_PERF_SEL_REQ_MISS_GFX10, TCP_PERF_SEL_REQ_GFX10); 283 ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L1 cache hit ratio", "Memory", "Hit ratio of L1 cache", 284 L1_CACHE_HIT_RATIO, GL1C_PERF_SEL_REQ_MISS, GL1C_PERF_SEL_REQ); 285 if (pdev->rad_info.gfx_level >= GFX10_3) { 286 ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory", 287 "Hit ratio of L2 cache", L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX103, 288 GL2C_PERF_SEL_REQ); 289 } else { 290 ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory", 291 "Hit ratio of L2 cache", L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX101, 292 GL2C_PERF_SEL_REQ); 293 } 294} 295 296static bool 297radv_init_perfcounter_descs(struct radv_physical_device *pdev) 298{ 299 if (pdev->perfcounters) 300 return true; 301 302 uint32_t count; 303 radv_query_perfcounter_descs(pdev, &count, NULL); 304 305 struct radv_perfcounter_desc *descs = malloc(sizeof(*descs) * count); 306 if (!descs) 307 return false; 308 309 radv_query_perfcounter_descs(pdev, &count, descs); 310 pdev->num_perfcounters = count; 311 pdev->perfcounters = descs; 312 313 return true; 314} 315 316static int 317cmp_uint32_t(const void *a, const void *b) 318{ 319 uint32_t l = *(const uint32_t *)a; 320 uint32_t r = *(const uint32_t *)b; 321 322 return (l < r) ? -1 : (l > r) ? 1 : 0; 323} 324 325static VkResult 326radv_get_counter_registers(const struct radv_physical_device *pdevice, uint32_t num_indices, 327 const uint32_t *indices, unsigned *out_num_regs, uint32_t **out_regs) 328{ 329 ASSERTED uint32_t num_counters = pdevice->num_perfcounters; 330 const struct radv_perfcounter_desc *descs = pdevice->perfcounters; 331 332 unsigned full_reg_cnt = num_indices * ARRAY_SIZE(descs->impl.regs); 333 uint32_t *regs = malloc(full_reg_cnt * sizeof(uint32_t)); 334 if (!regs) 335 return VK_ERROR_OUT_OF_HOST_MEMORY; 336 337 unsigned reg_cnt = 0; 338 for (unsigned i = 0; i < num_indices; ++i) { 339 uint32_t index = indices[i]; 340 assert(index < num_counters); 341 for (unsigned j = 0; j < ARRAY_SIZE(descs[index].impl.regs) && descs[index].impl.regs[j]; 342 ++j) { 343 if (!G_REG_CONSTANT(descs[index].impl.regs[j])) 344 regs[reg_cnt++] = descs[index].impl.regs[j]; 345 } 346 } 347 348 qsort(regs, reg_cnt, sizeof(uint32_t), cmp_uint32_t); 349 350 unsigned deduped_reg_cnt = 0; 351 for (unsigned i = 1; i < reg_cnt; ++i) { 352 if (regs[i] != regs[deduped_reg_cnt]) 353 regs[++deduped_reg_cnt] = regs[i]; 354 } 355 ++deduped_reg_cnt; 356 357 *out_num_regs = deduped_reg_cnt; 358 *out_regs = regs; 359 return VK_SUCCESS; 360} 361 362static unsigned 363radv_pc_get_num_instances(const struct radv_physical_device *pdevice, struct ac_pc_block *ac_block) 364{ 365 return ac_block->num_instances * 366 ((ac_block->b->b->flags & AC_PC_BLOCK_SE) ? pdevice->rad_info.max_se : 1); 367} 368 369static unsigned 370radv_get_num_counter_passes(const struct radv_physical_device *pdevice, unsigned num_regs, 371 const uint32_t *regs) 372{ 373 enum ac_pc_gpu_block prev_block = NUM_GPU_BLOCK; 374 unsigned block_reg_count = 0; 375 struct ac_pc_block *ac_block = NULL; 376 unsigned passes_needed = 1; 377 378 for (unsigned i = 0; i < num_regs; ++i) { 379 enum ac_pc_gpu_block block = G_REG_BLOCK(regs[i]); 380 381 if (block != prev_block) { 382 block_reg_count = 0; 383 prev_block = block; 384 ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block); 385 } 386 387 ++block_reg_count; 388 389 passes_needed = 390 MAX2(passes_needed, DIV_ROUND_UP(block_reg_count, ac_block->b->b->num_counters)); 391 } 392 393 return passes_needed; 394} 395 396void 397radv_pc_deinit_query_pool(struct radv_pc_query_pool *pool) 398{ 399 free(pool->counters); 400 free(pool->pc_regs); 401} 402 403VkResult 404radv_pc_init_query_pool(struct radv_physical_device *pdevice, 405 const VkQueryPoolCreateInfo *pCreateInfo, struct radv_pc_query_pool *pool) 406{ 407 const VkQueryPoolPerformanceCreateInfoKHR *perf_info = 408 vk_find_struct_const(pCreateInfo->pNext, QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR); 409 VkResult result; 410 411 if (!radv_init_perfcounter_descs(pdevice)) 412 return VK_ERROR_OUT_OF_HOST_MEMORY; 413 414 result = 415 radv_get_counter_registers(pdevice, perf_info->counterIndexCount, perf_info->pCounterIndices, 416 &pool->num_pc_regs, &pool->pc_regs); 417 if (result != VK_SUCCESS) 418 return result; 419 420 pool->num_passes = radv_get_num_counter_passes(pdevice, pool->num_pc_regs, pool->pc_regs); 421 422 uint32_t *pc_reg_offsets = malloc(pool->num_pc_regs * sizeof(uint32_t)); 423 if (!pc_reg_offsets) 424 return VK_ERROR_OUT_OF_HOST_MEMORY; 425 426 unsigned offset = 0; 427 for (unsigned i = 0; i < pool->num_pc_regs; ++i) { 428 enum ac_pc_gpu_block block = pool->pc_regs[i] >> 16; 429 struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block); 430 unsigned num_instances = radv_pc_get_num_instances(pdevice, ac_block); 431 432 pc_reg_offsets[i] = S_REG_OFFSET(offset) | S_REG_INSTANCES(num_instances); 433 offset += sizeof(uint64_t) * 2 * num_instances; 434 } 435 436 /* allow an uint32_t per pass to signal completion. */ 437 pool->b.stride = offset + 8 * pool->num_passes; 438 439 pool->num_counters = perf_info->counterIndexCount; 440 pool->counters = malloc(pool->num_counters * sizeof(struct radv_perfcounter_impl)); 441 if (!pool->counters) { 442 free(pc_reg_offsets); 443 return VK_ERROR_OUT_OF_HOST_MEMORY; 444 } 445 446 for (unsigned i = 0; i < pool->num_counters; ++i) { 447 pool->counters[i] = pdevice->perfcounters[perf_info->pCounterIndices[i]].impl; 448 449 for (unsigned j = 0; j < ARRAY_SIZE(pool->counters[i].regs); ++j) { 450 uint32_t reg = pool->counters[i].regs[j]; 451 if (!reg || G_REG_CONSTANT(reg)) 452 continue; 453 454 unsigned k; 455 for (k = 0; k < pool->num_pc_regs; ++k) 456 if (pool->pc_regs[k] == reg) 457 break; 458 pool->counters[i].regs[j] = pc_reg_offsets[k]; 459 } 460 } 461 462 free(pc_reg_offsets); 463 return VK_SUCCESS; 464} 465 466static void 467radv_emit_instance(struct radv_cmd_buffer *cmd_buffer, int se, int instance) 468{ 469 struct radeon_cmdbuf *cs = cmd_buffer->cs; 470 unsigned value = S_030800_SH_BROADCAST_WRITES(1); 471 472 if (se >= 0) { 473 value |= S_030800_SE_INDEX(se); 474 } else { 475 value |= S_030800_SE_BROADCAST_WRITES(1); 476 } 477 478 if (instance >= 0) { 479 value |= S_030800_INSTANCE_INDEX(instance); 480 } else { 481 value |= S_030800_INSTANCE_BROADCAST_WRITES(1); 482 } 483 484 radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value); 485} 486 487static void 488radv_emit_select(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count, 489 unsigned *selectors) 490{ 491 struct ac_pc_block_base *regs = block->b->b; 492 struct radeon_cmdbuf *cs = cmd_buffer->cs; 493 unsigned idx; 494 495 assert(count <= regs->num_counters); 496 497 /* Fake counters. */ 498 if (!regs->select0) 499 return; 500 501 for (idx = 0; idx < count; ++idx) { 502 radeon_set_perfctr_reg(cmd_buffer, regs->select0[idx], 503 G_REG_SEL(selectors[idx]) | regs->select_or); 504 } 505 506 for (idx = 0; idx < regs->num_spm_counters; idx++) { 507 radeon_set_uconfig_reg_seq(cs, regs->select1[idx], 1); 508 radeon_emit(cs, 0); 509 } 510} 511 512static void 513radv_pc_emit_block_instance_read(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, 514 unsigned count, uint64_t va) 515{ 516 struct ac_pc_block_base *regs = block->b->b; 517 struct radeon_cmdbuf *cs = cmd_buffer->cs; 518 unsigned reg = regs->counter0_lo; 519 unsigned reg_delta = 8; 520 521 assert(regs->select0); 522 for (unsigned idx = 0; idx < count; ++idx) { 523 if (regs->counters) 524 reg = regs->counters[idx]; 525 526 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 527 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | 528 COPY_DATA_WR_CONFIRM | COPY_DATA_COUNT_SEL); /* 64 bits */ 529 radeon_emit(cs, reg >> 2); 530 radeon_emit(cs, 0); /* unused */ 531 radeon_emit(cs, va); 532 radeon_emit(cs, va >> 32); 533 534 va += sizeof(uint64_t) * 2 * 535 radv_pc_get_num_instances(cmd_buffer->device->physical_device, block); 536 reg += reg_delta; 537 } 538} 539 540static void 541radv_pc_sample_block(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count, 542 uint64_t va) 543{ 544 unsigned se_end = 1; 545 if (block->b->b->flags & AC_PC_BLOCK_SE) 546 se_end = cmd_buffer->device->physical_device->rad_info.max_se; 547 548 for (unsigned se = 0; se < se_end; ++se) { 549 for (unsigned instance = 0; instance < block->num_instances; ++instance) { 550 radv_emit_instance(cmd_buffer, se, instance); 551 radv_pc_emit_block_instance_read(cmd_buffer, block, count, va); 552 va += sizeof(uint64_t) * 2; 553 } 554 } 555} 556 557static void 558radv_pc_wait_idle(struct radv_cmd_buffer *cmd_buffer) 559{ 560 struct radeon_cmdbuf *cs = cmd_buffer->cs; 561 562 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 563 radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4))); 564 565 radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); 566 radeon_emit(cs, 0); /* CP_COHER_CNTL */ 567 radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ 568 radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ 569 radeon_emit(cs, 0); /* CP_COHER_BASE */ 570 radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ 571 radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ 572 radeon_emit(cs, 0); /* GCR_CNTL */ 573 574 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); 575 radeon_emit(cs, 0); 576} 577 578static void 579radv_pc_stop_and_sample(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, 580 uint64_t va, bool end) 581{ 582 struct radeon_cmdbuf *cs = cmd_buffer->cs; 583 struct radv_physical_device *pdevice = cmd_buffer->device->physical_device; 584 585 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 586 radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0)); 587 588 radv_pc_wait_idle(cmd_buffer); 589 590 radv_emit_instance(cmd_buffer, -1, -1); 591 radv_emit_windowed_counters(cmd_buffer->device, cs, cmd_buffer->qf, false); 592 593 radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, 594 S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING) | 595 S_036020_PERFMON_SAMPLE_ENABLE(1)); 596 597 for (unsigned pass = 0; pass < pool->num_passes; ++pass) { 598 uint64_t pred_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + 599 PERF_CTR_BO_PASS_OFFSET + 8 * pass; 600 uint64_t reg_va = va + (end ? 8 : 0); 601 602 radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0)); 603 radeon_emit(cs, pred_va); 604 radeon_emit(cs, pred_va >> 32); 605 radeon_emit(cs, 0); /* Cache policy */ 606 607 uint32_t *skip_dwords = cs->buf + cs->cdw; 608 radeon_emit(cs, 0); 609 610 for (unsigned i = 0; i < pool->num_pc_regs;) { 611 enum ac_pc_gpu_block block = G_REG_BLOCK(pool->pc_regs[i]); 612 struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block); 613 unsigned offset = ac_block->num_instances * pass; 614 unsigned num_instances = radv_pc_get_num_instances(pdevice, ac_block); 615 616 unsigned cnt = 1; 617 while (cnt < pool->num_pc_regs - i && block == G_REG_BLOCK(pool->pc_regs[i + cnt])) 618 ++cnt; 619 620 if (offset < cnt) { 621 unsigned pass_reg_cnt = MIN2(cnt - offset, ac_block->b->b->num_counters); 622 radv_pc_sample_block(cmd_buffer, ac_block, pass_reg_cnt, 623 reg_va + offset * num_instances * sizeof(uint64_t)); 624 } 625 626 i += cnt; 627 reg_va += num_instances * sizeof(uint64_t) * 2 * cnt; 628 } 629 630 if (end) { 631 uint64_t signal_va = va + pool->b.stride - 8 - 8 * pass; 632 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); 633 radeon_emit(cs, 634 S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME)); 635 radeon_emit(cs, signal_va); 636 radeon_emit(cs, signal_va >> 32); 637 radeon_emit(cs, 1); /* value */ 638 } 639 640 *skip_dwords = cs->buf + cs->cdw - skip_dwords - 1; 641 } 642 643 radv_emit_instance(cmd_buffer, -1, -1); 644} 645 646void 647radv_pc_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, 648 uint64_t va) 649{ 650 struct radeon_cmdbuf *cs = cmd_buffer->cs; 651 struct radv_physical_device *pdevice = cmd_buffer->device->physical_device; 652 ASSERTED unsigned cdw_max; 653 654 cmd_buffer->state.uses_perf_counters = true; 655 656 cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 657 256 + /* Random one time stuff */ 658 10 * pool->num_passes + /* COND_EXECs */ 659 pool->b.stride / 8 * (5 + 8)); 660 661 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->b.bo); 662 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->device->perf_counter_bo); 663 664 uint64_t perf_ctr_va = 665 radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_FENCE_OFFSET; 666 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); 667 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME)); 668 radeon_emit(cs, perf_ctr_va); 669 radeon_emit(cs, perf_ctr_va >> 32); 670 radeon_emit(cs, 0); /* value */ 671 672 radv_pc_wait_idle(cmd_buffer); 673 674 radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, 675 S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET)); 676 677 radv_emit_inhibit_clockgating(cmd_buffer->device, cs, true); 678 radv_emit_spi_config_cntl(cmd_buffer->device, cs, true); 679 radv_perfcounter_emit_shaders(cs, 0x7f); 680 681 for (unsigned pass = 0; pass < pool->num_passes; ++pass) { 682 uint64_t pred_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + 683 PERF_CTR_BO_PASS_OFFSET + 8 * pass; 684 685 radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0)); 686 radeon_emit(cs, pred_va); 687 radeon_emit(cs, pred_va >> 32); 688 radeon_emit(cs, 0); /* Cache policy */ 689 690 uint32_t *skip_dwords = cs->buf + cs->cdw; 691 radeon_emit(cs, 0); 692 693 for (unsigned i = 0; i < pool->num_pc_regs;) { 694 enum ac_pc_gpu_block block = G_REG_BLOCK(pool->pc_regs[i]); 695 struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block); 696 unsigned offset = ac_block->num_instances * pass; 697 698 unsigned cnt = 1; 699 while (cnt < pool->num_pc_regs - i && block == G_REG_BLOCK(pool->pc_regs[i + cnt])) 700 ++cnt; 701 702 if (offset < cnt) { 703 unsigned pass_reg_cnt = MIN2(cnt - offset, ac_block->b->b->num_counters); 704 radv_emit_select(cmd_buffer, ac_block, pass_reg_cnt, pool->pc_regs + i + offset); 705 } 706 707 i += cnt; 708 } 709 710 *skip_dwords = cs->buf + cs->cdw - skip_dwords - 1; 711 } 712 713 radv_emit_instance(cmd_buffer, -1, -1); 714 715 /* The following sequence actually starts the perfcounters. */ 716 717 radv_pc_stop_and_sample(cmd_buffer, pool, va, false); 718 719 radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, 720 S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING)); 721 722 radv_emit_windowed_counters(cmd_buffer->device, cs, cmd_buffer->qf, true); 723 724 assert(cmd_buffer->cs->cdw <= cdw_max); 725} 726 727void 728radv_pc_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, uint64_t va) 729{ 730 struct radeon_cmdbuf *cs = cmd_buffer->cs; 731 ASSERTED unsigned cdw_max; 732 733 cdw_max = 734 radeon_check_space(cmd_buffer->device->ws, cs, 735 256 + /* Reserved for things that don't scale with passes/counters */ 736 5 * pool->num_passes + /* COND_EXECs */ 737 pool->b.stride / 8 * 8); 738 739 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->b.bo); 740 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->device->perf_counter_bo); 741 742 uint64_t perf_ctr_va = 743 radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_FENCE_OFFSET; 744 si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, 745 radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS, 0, 746 EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, perf_ctr_va, 1, 747 cmd_buffer->gfx9_fence_va); 748 radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, perf_ctr_va, 1, 0xffffffff); 749 750 radv_pc_wait_idle(cmd_buffer); 751 radv_pc_stop_and_sample(cmd_buffer, pool, va, true); 752 753 radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, 754 S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET)); 755 radv_emit_spi_config_cntl(cmd_buffer->device, cs, false); 756 radv_emit_inhibit_clockgating(cmd_buffer->device, cs, false); 757 758 assert(cmd_buffer->cs->cdw <= cdw_max); 759} 760 761static uint64_t 762radv_pc_sum_reg(uint32_t reg, const uint64_t *data) 763{ 764 unsigned instances = G_REG_INSTANCES(reg); 765 unsigned offset = G_REG_OFFSET(reg) / 8; 766 uint64_t result = 0; 767 768 if (G_REG_CONSTANT(reg)) 769 return reg & 0x7fffffffu; 770 771 for (unsigned i = 0; i < instances; ++i) { 772 result += data[offset + 2 * i + 1] - data[offset + 2 * i]; 773 } 774 775 return result; 776} 777 778static uint64_t 779radv_pc_max_reg(uint32_t reg, const uint64_t *data) 780{ 781 unsigned instances = G_REG_INSTANCES(reg); 782 unsigned offset = G_REG_OFFSET(reg) / 8; 783 uint64_t result = 0; 784 785 if (G_REG_CONSTANT(reg)) 786 return reg & 0x7fffffffu; 787 788 for (unsigned i = 0; i < instances; ++i) { 789 result = MAX2(result, data[offset + 2 * i + 1]); 790 } 791 792 return result; 793} 794 795static union VkPerformanceCounterResultKHR 796radv_pc_get_result(const struct radv_perfcounter_impl *impl, const uint64_t *data) 797{ 798 union VkPerformanceCounterResultKHR result; 799 800 switch (impl->op) { 801 case RADV_PC_OP_MAX: 802 result.float64 = radv_pc_max_reg(impl->regs[0], data); 803 break; 804 case RADV_PC_OP_SUM: 805 result.float64 = radv_pc_sum_reg(impl->regs[0], data); 806 break; 807 case RADV_PC_OP_RATIO_DIVSCALE: 808 result.float64 = radv_pc_sum_reg(impl->regs[0], data) / 809 (double)radv_pc_sum_reg(impl->regs[1], data) / 810 radv_pc_sum_reg(impl->regs[2], data) * 100.0; 811 break; 812 case RADV_PC_OP_REVERSE_RATIO: { 813 double tmp = radv_pc_sum_reg(impl->regs[1], data); 814 result.float64 = (tmp - radv_pc_sum_reg(impl->regs[0], data)) / tmp * 100.0; 815 break; 816 } 817 case RADV_PC_OP_SUM_WEIGHTED_4: 818 result.float64 = 0.0; 819 for (unsigned i = 0; i < 4; ++i) 820 result.float64 += 821 radv_pc_sum_reg(impl->regs[2 * i], data) * radv_pc_sum_reg(impl->regs[2 * i + 1], data); 822 break; 823 default: 824 unreachable("unhandled performance counter operation"); 825 } 826 return result; 827} 828 829void 830radv_pc_get_results(const struct radv_pc_query_pool *pc_pool, const uint64_t *data, void *out) 831{ 832 union VkPerformanceCounterResultKHR *pc_result = out; 833 834 for (unsigned i = 0; i < pc_pool->num_counters; ++i) { 835 pc_result[i] = radv_pc_get_result(pc_pool->counters + i, data); 836 } 837} 838 839VkResult 840radv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( 841 VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, uint32_t *pCounterCount, 842 VkPerformanceCounterKHR *pCounters, VkPerformanceCounterDescriptionKHR *pCounterDescriptions) 843{ 844 RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); 845 846 if (vk_queue_to_radv(pdevice, queueFamilyIndex) != RADV_QUEUE_GENERAL) { 847 *pCounterCount = 0; 848 return VK_SUCCESS; 849 } 850 851 if (!radv_init_perfcounter_descs(pdevice)) 852 return VK_ERROR_OUT_OF_HOST_MEMORY; 853 854 uint32_t counter_cnt = pdevice->num_perfcounters; 855 const struct radv_perfcounter_desc *descs = pdevice->perfcounters; 856 857 if (!pCounters && !pCounterDescriptions) { 858 *pCounterCount = counter_cnt; 859 return VK_SUCCESS; 860 } 861 862 VkResult result = counter_cnt > *pCounterCount ? VK_INCOMPLETE : VK_SUCCESS; 863 counter_cnt = MIN2(counter_cnt, *pCounterCount); 864 *pCounterCount = counter_cnt; 865 866 for (uint32_t i = 0; i < counter_cnt; ++i) { 867 if (pCounters) { 868 pCounters[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_KHR; 869 pCounters[i].unit = descs[i].unit; 870 pCounters[i].scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR; 871 pCounters[i].storage = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT64_KHR; 872 873 memset(&pCounters[i].uuid, 0, sizeof(pCounters[i].uuid)); 874 strcpy((char*)&pCounters[i].uuid, "RADV"); 875 876 const uint32_t uuid = descs[i].uuid; 877 memcpy(&pCounters[i].uuid[12], &uuid, sizeof(uuid)); 878 } 879 880 if (pCounterDescriptions) { 881 pCounterDescriptions[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_DESCRIPTION_KHR; 882 pCounterDescriptions[i].flags = 883 VK_PERFORMANCE_COUNTER_DESCRIPTION_CONCURRENTLY_IMPACTED_BIT_KHR; 884 strcpy(pCounterDescriptions[i].name, descs[i].name); 885 strcpy(pCounterDescriptions[i].category, descs[i].category); 886 strcpy(pCounterDescriptions[i].description, descs[i].description); 887 } 888 } 889 return result; 890} 891 892void 893radv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR( 894 VkPhysicalDevice physicalDevice, 895 const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo, uint32_t *pNumPasses) 896{ 897 RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); 898 899 if (pPerformanceQueryCreateInfo->counterIndexCount == 0) { 900 *pNumPasses = 0; 901 return; 902 } 903 904 if (!radv_init_perfcounter_descs(pdevice)) { 905 /* Can't return an error, so log */ 906 fprintf(stderr, "radv: Failed to init perf counters\n"); 907 *pNumPasses = 1; 908 return; 909 } 910 911 assert(vk_queue_to_radv(pdevice, pPerformanceQueryCreateInfo->queueFamilyIndex) == 912 RADV_QUEUE_GENERAL); 913 914 unsigned num_regs = 0; 915 uint32_t *regs = NULL; 916 VkResult result = 917 radv_get_counter_registers(pdevice, pPerformanceQueryCreateInfo->counterIndexCount, 918 pPerformanceQueryCreateInfo->pCounterIndices, &num_regs, ®s); 919 if (result != VK_SUCCESS) { 920 /* Can't return an error, so log */ 921 fprintf(stderr, "radv: Failed to allocate memory for perf counters\n"); 922 } 923 924 *pNumPasses = radv_get_num_counter_passes(pdevice, num_regs, regs); 925 free(regs); 926} 927