/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "ac_perfcounter.h"
#include "amdgfxregs.h"
#include "radv_cs.h"
#include "radv_private.h"
#include "sid.h"

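/* Select which shader stages contribute to the SQ performance counters: the low 7 bits written
 * to SQ_PERFCOUNTER_CTRL are the per-stage enable mask, and the register that follows it in the
 * sequence is fully enabled.
 */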
void
radv_perfcounter_emit_shaders(struct radeon_cmdbuf *cs, unsigned shaders)
{
   radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
   radeon_emit(cs, shaders & 0x7f);
   radeon_emit(cs, 0xffffffff);
}

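/* Start or stop the "windowed" counters: on the gfx queue this is done with the
 * PERFCOUNTER_START/STOP events, and the compute counters are gated through
 * COMPUTE_PERFCOUNT_ENABLE on all queues.
 */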
static void
radv_emit_windowed_counters(struct radv_device *device, struct radeon_cmdbuf *cs, int family,
                            bool enable)
{
   if (family == RADV_QUEUE_GENERAL) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(enable ? V_028A90_PERFCOUNTER_START : V_028A90_PERFCOUNTER_STOP) |
                         EVENT_INDEX(0));
   }

   radeon_set_sh_reg(cs, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(enable));
}

void
radv_perfcounter_emit_spm_reset(struct radeon_cmdbuf *cs)
{
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                              S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                              S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_DISABLE_AND_RESET));
}

void
radv_perfcounter_emit_spm_start(struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   /* Start SPM counters. */
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                              S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                              S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_START_COUNTING));

   radv_emit_windowed_counters(device, cs, family, true);
}

void
radv_perfcounter_emit_spm_stop(struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   radv_emit_windowed_counters(device, cs, family, false);

   /* Stop SPM counters. */
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                              S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                              S_036020_SPM_PERFMON_STATE(device->physical_device->rad_info.never_stop_sq_perf_counters ?
                                                            V_036020_STRM_PERFMON_STATE_START_COUNTING :
                                                            V_036020_STRM_PERFMON_STATE_STOP_COUNTING));
}

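/* How the hardware registers listed in radv_perfcounter_impl::regs are combined into a single
 * counter value: a plain sum or max over regs[0], a ratio of regs[0] to regs[1] (optionally
 * divided by a scale in regs[2]), or a weighted sum of four (counter, weight) pairs.
 */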
enum radv_perfcounter_op {
   RADV_PC_OP_SUM,
   RADV_PC_OP_MAX,
   RADV_PC_OP_RATIO_DIVSCALE,
   RADV_PC_OP_REVERSE_RATIO, /* (reg1 - reg0) / reg1 */
   RADV_PC_OP_SUM_WEIGHTED_4,
};

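/* Register descriptions are packed into 32 bits. Before query-pool creation a value holds the
 * counter selector (bits 0-15) and the hardware block (bits 16-30); at pool creation time the
 * registers of each counter are rewritten to hold the result-buffer offset (bits 0-15) and the
 * number of instances to accumulate (bits 16-30). Bit 31 marks an inline constant instead of a
 * register.
 */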
#define S_REG_SEL(x)   ((x)&0xFFFF)
#define G_REG_SEL(x)   ((x)&0xFFFF)
#define S_REG_BLOCK(x) ((x) << 16)
#define G_REG_BLOCK(x) (((x) >> 16) & 0x7FFF)

#define S_REG_OFFSET(x)    ((x)&0xFFFF)
#define G_REG_OFFSET(x)    ((x)&0xFFFF)
#define S_REG_INSTANCES(x) ((x) << 16)
#define G_REG_INSTANCES(x) (((x) >> 16) & 0x7FFF)
#define S_REG_CONSTANT(x)  ((x) << 31)
#define G_REG_CONSTANT(x)  ((x) >> 31)

struct radv_perfcounter_impl {
   enum radv_perfcounter_op op;
   uint32_t regs[8];
};

/* Only append to this list, never insert into the middle or remove entries (renaming is fine).
 *
 * The invariant we want is that a UUID always identifies a counter with the same meaning, so
 * the same UUID can be shared between counters that have different implementations on
 * different GPUs, but it must be unique within a GPU.
 */
enum radv_perfcounter_uuid {
   RADV_PC_UUID_GPU_CYCLES,
   RADV_PC_UUID_SHADER_WAVES,
   RADV_PC_UUID_SHADER_INSTRUCTIONS,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VALU,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_SALU,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_LOAD,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_SMEM_LOAD,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_STORE,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_LDS,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_GDS,
   RADV_PC_UUID_SHADER_VALU_BUSY,
   RADV_PC_UUID_SHADER_SALU_BUSY,
   RADV_PC_UUID_VRAM_READ_SIZE,
   RADV_PC_UUID_VRAM_WRITE_SIZE,
   RADV_PC_UUID_L0_CACHE_HIT_RATIO,
   RADV_PC_UUID_L1_CACHE_HIT_RATIO,
   RADV_PC_UUID_L2_CACHE_HIT_RATIO,
};

struct radv_perfcounter_desc {
   struct radv_perfcounter_impl impl;

   VkPerformanceCounterUnitKHR unit;

   char name[VK_MAX_DESCRIPTION_SIZE];
   char category[VK_MAX_DESCRIPTION_SIZE];
   char description[VK_MAX_DESCRIPTION_SIZE];
   enum radv_perfcounter_uuid uuid;
};

#define PC_DESC(arg_op, arg_unit, arg_name, arg_category, arg_description, arg_uuid, ...)          \
   (struct radv_perfcounter_desc)                                                                  \
   {                                                                                               \
      .impl = {.op = arg_op, .regs = {__VA_ARGS__}},                                               \
      .unit = VK_PERFORMANCE_COUNTER_UNIT_##arg_unit##_KHR, .name = arg_name,                      \
      .category = arg_category, .description = arg_description, .uuid = RADV_PC_UUID_##arg_uuid    \
   }

#define ADD_PC(op, unit, name, category, description, uuid, ...)                                   \
   do {                                                                                            \
      if (descs) {                                                                                 \
         descs[*count] = PC_DESC((op), unit, name, category, description, uuid, __VA_ARGS__);      \
      }                                                                                            \
      ++*count;                                                                                    \
   } while (0)
#define CTR(block, ctr) (S_REG_BLOCK(block) | S_REG_SEL(ctr))
#define CONSTANT(v)     (S_REG_CONSTANT(1) | (uint32_t)(v))

enum { GRBM_PERF_SEL_GUI_ACTIVE = CTR(GRBM, 2) };

enum { CPF_PERF_SEL_CPF_STAT_BUSY_GFX10 = CTR(CPF, 0x18) };

enum {
   GL1C_PERF_SEL_REQ = CTR(GL1C, 0xe),
   GL1C_PERF_SEL_REQ_MISS = CTR(GL1C, 0x12),
};

enum {
   GL2C_PERF_SEL_REQ = CTR(GL2C, 0x3),

   GL2C_PERF_SEL_MISS_GFX101 = CTR(GL2C, 0x23),
   GL2C_PERF_SEL_MC_WRREQ_GFX101 = CTR(GL2C, 0x4b),
   GL2C_PERF_SEL_EA_WRREQ_64B_GFX101 = CTR(GL2C, 0x4c),
   GL2C_PERF_SEL_EA_RDREQ_32B_GFX101 = CTR(GL2C, 0x59),
   GL2C_PERF_SEL_EA_RDREQ_64B_GFX101 = CTR(GL2C, 0x5a),
   GL2C_PERF_SEL_EA_RDREQ_96B_GFX101 = CTR(GL2C, 0x5b),
   GL2C_PERF_SEL_EA_RDREQ_128B_GFX101 = CTR(GL2C, 0x5c),

   GL2C_PERF_SEL_MISS_GFX103 = CTR(GL2C, 0x2b),
   GL2C_PERF_SEL_MC_WRREQ_GFX103 = CTR(GL2C, 0x53),
   GL2C_PERF_SEL_EA_WRREQ_64B_GFX103 = CTR(GL2C, 0x55),
   GL2C_PERF_SEL_EA_RDREQ_32B_GFX103 = CTR(GL2C, 0x63),
   GL2C_PERF_SEL_EA_RDREQ_64B_GFX103 = CTR(GL2C, 0x64),
   GL2C_PERF_SEL_EA_RDREQ_96B_GFX103 = CTR(GL2C, 0x65),
   GL2C_PERF_SEL_EA_RDREQ_128B_GFX103 = CTR(GL2C, 0x66),
};

enum {
   SQ_PERF_SEL_WAVES = CTR(SQ, 0x4),
   SQ_PERF_SEL_INSTS_ALL_GFX10 = CTR(SQ, 0x31),
   SQ_PERF_SEL_INSTS_GDS_GFX10 = CTR(SQ, 0x37),
   SQ_PERF_SEL_INSTS_LDS_GFX10 = CTR(SQ, 0x3b),
   SQ_PERF_SEL_INSTS_SALU_GFX10 = CTR(SQ, 0x3c),
   SQ_PERF_SEL_INSTS_SMEM_GFX10 = CTR(SQ, 0x3d),
   SQ_PERF_SEL_INSTS_VALU_GFX10 = CTR(SQ, 0x40),
   SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10 = CTR(SQ, 0x45),
   SQ_PERF_SEL_INSTS_TEX_STORE_GFX10 = CTR(SQ, 0x46),
   SQ_PERF_SEL_INST_CYCLES_VALU_GFX10 = CTR(SQ, 0x75),
};

enum {
   TCP_PERF_SEL_REQ_GFX10 = CTR(TCP, 0x9),
   TCP_PERF_SEL_REQ_MISS_GFX10 = CTR(TCP, 0x12),
};

#define CTR_NUM_SIMD                                                                               \
   CONSTANT(pdev->rad_info.num_simd_per_compute_unit * pdev->rad_info.num_cu)
#define CTR_NUM_CUS CONSTANT(pdev->rad_info.num_cu)

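/* Build the list of exposed performance counters. Called with descs == NULL to query the number
 * of counters, and a second time with a large enough array to fill in the descriptions.
 */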
static void
radv_query_perfcounter_descs(struct radv_physical_device *pdev, uint32_t *count,
                             struct radv_perfcounter_desc *descs)
{
   *count = 0;

   ADD_PC(RADV_PC_OP_MAX, CYCLES, "GPU active cycles", "GRBM",
          "cycles the GPU is active processing a command buffer.", GPU_CYCLES,
          GRBM_PERF_SEL_GUI_ACTIVE);

   ADD_PC(RADV_PC_OP_SUM, GENERIC, "Waves", "Shaders", "Number of waves executed", SHADER_WAVES,
          SQ_PERF_SEL_WAVES);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "Instructions", "Shaders", "Number of Instructions executed",
          SHADER_INSTRUCTIONS, SQ_PERF_SEL_INSTS_ALL_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VALU Instructions", "Shaders",
          "Number of VALU Instructions executed", SHADER_INSTRUCTIONS_VALU,
          SQ_PERF_SEL_INSTS_VALU_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "SALU Instructions", "Shaders",
          "Number of SALU Instructions executed", SHADER_INSTRUCTIONS_SALU,
          SQ_PERF_SEL_INSTS_SALU_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Load Instructions", "Shaders",
          "Number of VMEM load instructions executed", SHADER_INSTRUCTIONS_VMEM_LOAD,
          SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "SMEM Load Instructions", "Shaders",
          "Number of SMEM load instructions executed", SHADER_INSTRUCTIONS_SMEM_LOAD,
          SQ_PERF_SEL_INSTS_SMEM_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Store Instructions", "Shaders",
          "Number of VMEM store instructions executed", SHADER_INSTRUCTIONS_VMEM_STORE,
          SQ_PERF_SEL_INSTS_TEX_STORE_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "LDS Instructions", "Shaders",
          "Number of LDS Instructions executed", SHADER_INSTRUCTIONS_LDS,
          SQ_PERF_SEL_INSTS_LDS_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "GDS Instructions", "Shaders",
          "Number of GDS Instructions executed", SHADER_INSTRUCTIONS_GDS,
          SQ_PERF_SEL_INSTS_GDS_GFX10);

   ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "VALU Busy", "Shader Utilization",
          "Percentage of time the VALU units are busy", SHADER_VALU_BUSY,
          SQ_PERF_SEL_INST_CYCLES_VALU_GFX10, CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_SIMD);
   ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "SALU Busy", "Shader Utilization",
          "Percentage of time the SALU units are busy", SHADER_SALU_BUSY,
          SQ_PERF_SEL_INSTS_SALU_GFX10, CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_CUS);

   if (pdev->rad_info.gfx_level >= GFX10_3) {
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory",
             "Number of bytes read from VRAM", VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX103,
             CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX103, CONSTANT(64),
             GL2C_PERF_SEL_EA_RDREQ_96B_GFX103, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX103,
             CONSTANT(128));
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory",
             "Number of bytes written to VRAM", VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX103,
             CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX103, CONSTANT(64), CONSTANT(0),
             CONSTANT(0), CONSTANT(0), CONSTANT(0));
   } else {
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory",
             "Number of bytes read from VRAM", VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX101,
             CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX101, CONSTANT(64),
             GL2C_PERF_SEL_EA_RDREQ_96B_GFX101, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX101,
             CONSTANT(128));
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory",
             "Number of bytes written to VRAM", VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX101,
             CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX101, CONSTANT(32), CONSTANT(0),
             CONSTANT(0), CONSTANT(0), CONSTANT(0));
   }

   ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L0 cache hit ratio", "Memory", "Hit ratio of L0 cache",
          L0_CACHE_HIT_RATIO, TCP_PERF_SEL_REQ_MISS_GFX10, TCP_PERF_SEL_REQ_GFX10);
   ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L1 cache hit ratio", "Memory", "Hit ratio of L1 cache",
          L1_CACHE_HIT_RATIO, GL1C_PERF_SEL_REQ_MISS, GL1C_PERF_SEL_REQ);
   if (pdev->rad_info.gfx_level >= GFX10_3) {
      ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory",
             "Hit ratio of L2 cache", L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX103,
             GL2C_PERF_SEL_REQ);
   } else {
      ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory",
             "Hit ratio of L2 cache", L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX101,
             GL2C_PERF_SEL_REQ);
   }
}

static bool
radv_init_perfcounter_descs(struct radv_physical_device *pdev)
{
   if (pdev->perfcounters)
      return true;

   uint32_t count;
   radv_query_perfcounter_descs(pdev, &count, NULL);

   struct radv_perfcounter_desc *descs = malloc(sizeof(*descs) * count);
   if (!descs)
      return false;

   radv_query_perfcounter_descs(pdev, &count, descs);
   pdev->num_perfcounters = count;
   pdev->perfcounters = descs;

   return true;
}

static int
cmp_uint32_t(const void *a, const void *b)
{
   uint32_t l = *(const uint32_t *)a;
   uint32_t r = *(const uint32_t *)b;

   return (l < r) ? -1 : (l > r) ? 1 : 0;
}

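/* Gather the hardware counter registers needed by the selected counter indices, skipping inline
 * constants and de-duplicating registers that are shared by multiple counters.
 */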
static VkResult
radv_get_counter_registers(const struct radv_physical_device *pdevice, uint32_t num_indices,
                           const uint32_t *indices, unsigned *out_num_regs, uint32_t **out_regs)
{
   ASSERTED uint32_t num_counters = pdevice->num_perfcounters;
   const struct radv_perfcounter_desc *descs = pdevice->perfcounters;

   unsigned full_reg_cnt = num_indices * ARRAY_SIZE(descs->impl.regs);
   uint32_t *regs = malloc(full_reg_cnt * sizeof(uint32_t));
   if (!regs)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   unsigned reg_cnt = 0;
   for (unsigned i = 0; i < num_indices; ++i) {
      uint32_t index = indices[i];
      assert(index < num_counters);
      for (unsigned j = 0; j < ARRAY_SIZE(descs[index].impl.regs) && descs[index].impl.regs[j];
           ++j) {
         if (!G_REG_CONSTANT(descs[index].impl.regs[j]))
            regs[reg_cnt++] = descs[index].impl.regs[j];
      }
   }

   qsort(regs, reg_cnt, sizeof(uint32_t), cmp_uint32_t);

   unsigned deduped_reg_cnt = 0;
   for (unsigned i = 1; i < reg_cnt; ++i) {
      if (regs[i] != regs[deduped_reg_cnt])
         regs[++deduped_reg_cnt] = regs[i];
   }
   ++deduped_reg_cnt;

   *out_num_regs = deduped_reg_cnt;
   *out_regs = regs;
   return VK_SUCCESS;
}

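/* Number of hardware instances to read for a block, including one copy per shader engine for
 * per-SE blocks.
 */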
static unsigned
radv_pc_get_num_instances(const struct radv_physical_device *pdevice, struct ac_pc_block *ac_block)
{
   return ac_block->num_instances *
          ((ac_block->b->b->flags & AC_PC_BLOCK_SE) ? pdevice->rad_info.max_se : 1);
}

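/* A block can only be programmed with a limited number of counters at a time, so when more
 * registers of one block are needed the query has to be replayed over multiple passes. The
 * register list is sorted by block, so counting consecutive registers per block is sufficient.
 */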
static unsigned
radv_get_num_counter_passes(const struct radv_physical_device *pdevice, unsigned num_regs,
                            const uint32_t *regs)
{
   enum ac_pc_gpu_block prev_block = NUM_GPU_BLOCK;
   unsigned block_reg_count = 0;
   struct ac_pc_block *ac_block = NULL;
   unsigned passes_needed = 1;

   for (unsigned i = 0; i < num_regs; ++i) {
      enum ac_pc_gpu_block block = G_REG_BLOCK(regs[i]);

      if (block != prev_block) {
         block_reg_count = 0;
         prev_block = block;
         ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
      }

      ++block_reg_count;

      passes_needed =
         MAX2(passes_needed, DIV_ROUND_UP(block_reg_count, ac_block->b->b->num_counters));
   }

   return passes_needed;
}

void
radv_pc_deinit_query_pool(struct radv_pc_query_pool *pool)
{
   free(pool->counters);
   free(pool->pc_regs);
}

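/* Set up a performance query pool: resolve the selected counters to hardware registers, compute
 * the number of passes, and lay out the result buffer. Each register gets a begin/end pair of
 * 64-bit values per instance, followed by one 8-byte completion slot per pass.
 */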
VkResult
radv_pc_init_query_pool(struct radv_physical_device *pdevice,
                        const VkQueryPoolCreateInfo *pCreateInfo, struct radv_pc_query_pool *pool)
{
   const VkQueryPoolPerformanceCreateInfoKHR *perf_info =
      vk_find_struct_const(pCreateInfo->pNext, QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
   VkResult result;

   if (!radv_init_perfcounter_descs(pdevice))
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   result =
      radv_get_counter_registers(pdevice, perf_info->counterIndexCount, perf_info->pCounterIndices,
                                 &pool->num_pc_regs, &pool->pc_regs);
   if (result != VK_SUCCESS)
      return result;

   pool->num_passes = radv_get_num_counter_passes(pdevice, pool->num_pc_regs, pool->pc_regs);

   uint32_t *pc_reg_offsets = malloc(pool->num_pc_regs * sizeof(uint32_t));
   if (!pc_reg_offsets)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   unsigned offset = 0;
   for (unsigned i = 0; i < pool->num_pc_regs; ++i) {
      enum ac_pc_gpu_block block = pool->pc_regs[i] >> 16;
      struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
      unsigned num_instances = radv_pc_get_num_instances(pdevice, ac_block);

      pc_reg_offsets[i] = S_REG_OFFSET(offset) | S_REG_INSTANCES(num_instances);
      offset += sizeof(uint64_t) * 2 * num_instances;
   }

   /* Reserve 8 bytes per pass to signal completion of that pass. */
   pool->b.stride = offset + 8 * pool->num_passes;

   pool->num_counters = perf_info->counterIndexCount;
   pool->counters = malloc(pool->num_counters * sizeof(struct radv_perfcounter_impl));
   if (!pool->counters) {
      free(pc_reg_offsets);
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   for (unsigned i = 0; i < pool->num_counters; ++i) {
      pool->counters[i] = pdevice->perfcounters[perf_info->pCounterIndices[i]].impl;

      for (unsigned j = 0; j < ARRAY_SIZE(pool->counters[i].regs); ++j) {
         uint32_t reg = pool->counters[i].regs[j];
         if (!reg || G_REG_CONSTANT(reg))
            continue;

         unsigned k;
         for (k = 0; k < pool->num_pc_regs; ++k)
            if (pool->pc_regs[k] == reg)
               break;
         pool->counters[i].regs[j] = pc_reg_offsets[k];
      }
   }

   free(pc_reg_offsets);
   return VK_SUCCESS;
}

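/* Point GRBM_GFX_INDEX at a single shader engine/instance, or broadcast when the index is
 * negative.
 */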
static void
radv_emit_instance(struct radv_cmd_buffer *cmd_buffer, int se, int instance)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned value = S_030800_SH_BROADCAST_WRITES(1);

   if (se >= 0) {
      value |= S_030800_SE_INDEX(se);
   } else {
      value |= S_030800_SE_BROADCAST_WRITES(1);
   }

   if (instance >= 0) {
      value |= S_030800_INSTANCE_INDEX(instance);
   } else {
      value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
   }

   radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
}

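/* Program the per-counter selector registers of a block with the chosen event selects, and clear
 * the SPM (select1) registers.
 */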
static void
radv_emit_select(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count,
                 unsigned *selectors)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned idx;

   assert(count <= regs->num_counters);

   /* Fake counters. */
   if (!regs->select0)
      return;

   for (idx = 0; idx < count; ++idx) {
      radeon_set_perfctr_reg(cmd_buffer, regs->select0[idx],
                             G_REG_SEL(selectors[idx]) | regs->select_or);
   }

   for (idx = 0; idx < regs->num_spm_counters; idx++) {
      radeon_set_uconfig_reg_seq(cs, regs->select1[idx], 1);
      radeon_emit(cs, 0);
   }
}

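/* Copy the counter values of the currently selected block instance to memory, one 64-bit
 * COPY_DATA per counter. The destination stride between counters is one begin/end pair per
 * instance of the block.
 */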
static void
radv_pc_emit_block_instance_read(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block,
                                 unsigned count, uint64_t va)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned reg = regs->counter0_lo;
   unsigned reg_delta = 8;

   assert(regs->select0);
   for (unsigned idx = 0; idx < count; ++idx) {
      if (regs->counters)
         reg = regs->counters[idx];

      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) |
                         COPY_DATA_WR_CONFIRM | COPY_DATA_COUNT_SEL); /* 64 bits */
      radeon_emit(cs, reg >> 2);
      radeon_emit(cs, 0); /* unused */
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);

      va += sizeof(uint64_t) * 2 *
            radv_pc_get_num_instances(cmd_buffer->device->physical_device, block);
      reg += reg_delta;
   }
}

static void
radv_pc_sample_block(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count,
                     uint64_t va)
{
   unsigned se_end = 1;
   if (block->b->b->flags & AC_PC_BLOCK_SE)
      se_end = cmd_buffer->device->physical_device->rad_info.max_se;

   for (unsigned se = 0; se < se_end; ++se) {
      for (unsigned instance = 0; instance < block->num_instances; ++instance) {
         radv_emit_instance(cmd_buffer, se, instance);
         radv_pc_emit_block_instance_read(cmd_buffer, block, count, va);
         va += sizeof(uint64_t) * 2;
      }
   }
}

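/* Wait for the GPU to go idle (CS_PARTIAL_FLUSH plus a full-range ACQUIRE_MEM) and for the PFP
 * to catch up before sampling or reprogramming the counters.
 */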
static void
radv_pc_wait_idle(struct radv_cmd_buffer *cmd_buffer)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

   radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
   radeon_emit(cs, 0);          /* CP_COHER_CNTL */
   radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
   radeon_emit(cs, 0xffffff);   /* CP_COHER_SIZE_HI */
   radeon_emit(cs, 0);          /* CP_COHER_BASE */
   radeon_emit(cs, 0);          /* CP_COHER_BASE_HI */
   radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
   radeon_emit(cs, 0);          /* GCR_CNTL */

   radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
   radeon_emit(cs, 0);
}

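/* Freeze the counters and read them back into the query buffer. The reads for each pass are
 * wrapped in a COND_EXEC on the per-pass predicate, so only the registers programmed for the
 * pass that is currently being replayed get sampled.
 */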
static void
radv_pc_stop_and_sample(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool,
                        uint64_t va, bool end)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));

   radv_pc_wait_idle(cmd_buffer);

   radv_emit_instance(cmd_buffer, -1, -1);
   radv_emit_windowed_counters(cmd_buffer->device, cs, cmd_buffer->qf, false);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING) |
                             S_036020_PERFMON_SAMPLE_ENABLE(1));

   for (unsigned pass = 0; pass < pool->num_passes; ++pass) {
      uint64_t pred_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) +
                         PERF_CTR_BO_PASS_OFFSET + 8 * pass;
      uint64_t reg_va = va + (end ? 8 : 0);

      radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(cs, pred_va);
      radeon_emit(cs, pred_va >> 32);
      radeon_emit(cs, 0); /* Cache policy */

      uint32_t *skip_dwords = cs->buf + cs->cdw;
      radeon_emit(cs, 0);

      for (unsigned i = 0; i < pool->num_pc_regs;) {
         enum ac_pc_gpu_block block = G_REG_BLOCK(pool->pc_regs[i]);
         struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
         unsigned offset = ac_block->num_instances * pass;
         unsigned num_instances = radv_pc_get_num_instances(pdevice, ac_block);

         unsigned cnt = 1;
         while (cnt < pool->num_pc_regs - i && block == G_REG_BLOCK(pool->pc_regs[i + cnt]))
            ++cnt;

         if (offset < cnt) {
            unsigned pass_reg_cnt = MIN2(cnt - offset, ac_block->b->b->num_counters);
            radv_pc_sample_block(cmd_buffer, ac_block, pass_reg_cnt,
                                 reg_va + offset * num_instances * sizeof(uint64_t));
         }

         i += cnt;
         reg_va += num_instances * sizeof(uint64_t) * 2 * cnt;
      }

      if (end) {
         uint64_t signal_va = va + pool->b.stride - 8 - 8 * pass;
         radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
         radeon_emit(cs,
                     S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
         radeon_emit(cs, signal_va);
         radeon_emit(cs, signal_va >> 32);
         radeon_emit(cs, 1); /* value */
      }

      *skip_dwords = cs->buf + cs->cdw - skip_dwords - 1;
   }

   radv_emit_instance(cmd_buffer, -1, -1);
}

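/* Begin a performance query: reset the perfmon state, program the counter selectors for every
 * pass (guarded by a COND_EXEC on the pass currently being replayed), take the "begin" sample
 * and finally start the counters.
 */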
void
radv_pc_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool,
                    uint64_t va)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
   ASSERTED unsigned cdw_max;

   cmd_buffer->state.uses_perf_counters = true;

   cdw_max = radeon_check_space(cmd_buffer->device->ws, cs,
                                256 +                      /* Random one time stuff */
                                   10 * pool->num_passes + /* COND_EXECs */
                                   pool->b.stride / 8 * (5 + 8));

   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->b.bo);
   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->device->perf_counter_bo);

   uint64_t perf_ctr_va =
      radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_FENCE_OFFSET;
   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
   radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
   radeon_emit(cs, perf_ctr_va);
   radeon_emit(cs, perf_ctr_va >> 32);
   radeon_emit(cs, 0); /* value */

   radv_pc_wait_idle(cmd_buffer);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));

   radv_emit_inhibit_clockgating(cmd_buffer->device, cs, true);
   radv_emit_spi_config_cntl(cmd_buffer->device, cs, true);
   radv_perfcounter_emit_shaders(cs, 0x7f);

   for (unsigned pass = 0; pass < pool->num_passes; ++pass) {
      uint64_t pred_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) +
                         PERF_CTR_BO_PASS_OFFSET + 8 * pass;

      radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(cs, pred_va);
      radeon_emit(cs, pred_va >> 32);
      radeon_emit(cs, 0); /* Cache policy */

      uint32_t *skip_dwords = cs->buf + cs->cdw;
      radeon_emit(cs, 0);

      for (unsigned i = 0; i < pool->num_pc_regs;) {
         enum ac_pc_gpu_block block = G_REG_BLOCK(pool->pc_regs[i]);
         struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
         unsigned offset = ac_block->num_instances * pass;

         unsigned cnt = 1;
         while (cnt < pool->num_pc_regs - i && block == G_REG_BLOCK(pool->pc_regs[i + cnt]))
            ++cnt;

         if (offset < cnt) {
            unsigned pass_reg_cnt = MIN2(cnt - offset, ac_block->b->b->num_counters);
            radv_emit_select(cmd_buffer, ac_block, pass_reg_cnt, pool->pc_regs + i + offset);
         }

         i += cnt;
      }

      *skip_dwords = cs->buf + cs->cdw - skip_dwords - 1;
   }

   radv_emit_instance(cmd_buffer, -1, -1);

   /* The following sequence actually starts the perfcounters. */

   radv_pc_stop_and_sample(cmd_buffer, pool, va, false);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));

   radv_emit_windowed_counters(cmd_buffer->device, cs, cmd_buffer->qf, true);

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

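/* End a performance query: wait for all prior work via an EOP fence, take the "end" sample,
 * then disable the counters and restore SPI config and clock gating.
 */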
void
radv_pc_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, uint64_t va)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   ASSERTED unsigned cdw_max;

   cdw_max =
      radeon_check_space(cmd_buffer->device->ws, cs,
                         256 + /* Reserved for things that don't scale with passes/counters */
                            5 * pool->num_passes + /* COND_EXECs */
                            pool->b.stride / 8 * 8);

   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->b.bo);
   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->device->perf_counter_bo);

   uint64_t perf_ctr_va =
      radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_FENCE_OFFSET;
   si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
                              radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS, 0,
                              EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, perf_ctr_va, 1,
                              cmd_buffer->gfx9_fence_va);
   radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, perf_ctr_va, 1, 0xffffffff);

   radv_pc_wait_idle(cmd_buffer);
   radv_pc_stop_and_sample(cmd_buffer, pool, va, true);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
   radv_emit_spi_config_cntl(cmd_buffer->device, cs, false);
   radv_emit_inhibit_clockgating(cmd_buffer->device, cs, false);

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

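/* Result accumulation helpers. The query buffer contains a (begin, end) pair of 64-bit values
 * per instance for each register; constants encoded with S_REG_CONSTANT are returned directly.
 */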
static uint64_t
radv_pc_sum_reg(uint32_t reg, const uint64_t *data)
{
   unsigned instances = G_REG_INSTANCES(reg);
   unsigned offset = G_REG_OFFSET(reg) / 8;
   uint64_t result = 0;

   if (G_REG_CONSTANT(reg))
      return reg & 0x7fffffffu;

   for (unsigned i = 0; i < instances; ++i) {
      result += data[offset + 2 * i + 1] - data[offset + 2 * i];
   }

   return result;
}

static uint64_t
radv_pc_max_reg(uint32_t reg, const uint64_t *data)
{
   unsigned instances = G_REG_INSTANCES(reg);
   unsigned offset = G_REG_OFFSET(reg) / 8;
   uint64_t result = 0;

   if (G_REG_CONSTANT(reg))
      return reg & 0x7fffffffu;

   for (unsigned i = 0; i < instances; ++i) {
      result = MAX2(result, data[offset + 2 * i + 1]);
   }

   return result;
}

static union VkPerformanceCounterResultKHR
radv_pc_get_result(const struct radv_perfcounter_impl *impl, const uint64_t *data)
{
   union VkPerformanceCounterResultKHR result;

   switch (impl->op) {
   case RADV_PC_OP_MAX:
      result.float64 = radv_pc_max_reg(impl->regs[0], data);
      break;
   case RADV_PC_OP_SUM:
      result.float64 = radv_pc_sum_reg(impl->regs[0], data);
      break;
   case RADV_PC_OP_RATIO_DIVSCALE:
      result.float64 = radv_pc_sum_reg(impl->regs[0], data) /
                       (double)radv_pc_sum_reg(impl->regs[1], data) /
                       radv_pc_sum_reg(impl->regs[2], data) * 100.0;
      break;
   case RADV_PC_OP_REVERSE_RATIO: {
      double tmp = radv_pc_sum_reg(impl->regs[1], data);
      result.float64 = (tmp - radv_pc_sum_reg(impl->regs[0], data)) / tmp * 100.0;
      break;
   }
   case RADV_PC_OP_SUM_WEIGHTED_4:
      result.float64 = 0.0;
      for (unsigned i = 0; i < 4; ++i)
         result.float64 +=
            radv_pc_sum_reg(impl->regs[2 * i], data) * radv_pc_sum_reg(impl->regs[2 * i + 1], data);
      break;
   default:
      unreachable("unhandled performance counter operation");
   }
   return result;
}

void
radv_pc_get_results(const struct radv_pc_query_pool *pc_pool, const uint64_t *data, void *out)
{
   union VkPerformanceCounterResultKHR *pc_result = out;

   for (unsigned i = 0; i < pc_pool->num_counters; ++i) {
      pc_result[i] = radv_pc_get_result(pc_pool->counters + i, data);
   }
}

VkResult
radv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
   VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, uint32_t *pCounterCount,
   VkPerformanceCounterKHR *pCounters, VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
{
   RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);

   if (vk_queue_to_radv(pdevice, queueFamilyIndex) != RADV_QUEUE_GENERAL) {
      *pCounterCount = 0;
      return VK_SUCCESS;
   }

   if (!radv_init_perfcounter_descs(pdevice))
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   uint32_t counter_cnt = pdevice->num_perfcounters;
   const struct radv_perfcounter_desc *descs = pdevice->perfcounters;

   if (!pCounters && !pCounterDescriptions) {
      *pCounterCount = counter_cnt;
      return VK_SUCCESS;
   }

   VkResult result = counter_cnt > *pCounterCount ? VK_INCOMPLETE : VK_SUCCESS;
   counter_cnt = MIN2(counter_cnt, *pCounterCount);
   *pCounterCount = counter_cnt;

   for (uint32_t i = 0; i < counter_cnt; ++i) {
      if (pCounters) {
         pCounters[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_KHR;
         pCounters[i].unit = descs[i].unit;
         pCounters[i].scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
         pCounters[i].storage = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT64_KHR;

         memset(&pCounters[i].uuid, 0, sizeof(pCounters[i].uuid));
         strcpy((char*)&pCounters[i].uuid, "RADV");

         const uint32_t uuid = descs[i].uuid;
         memcpy(&pCounters[i].uuid[12], &uuid, sizeof(uuid));
      }

      if (pCounterDescriptions) {
         pCounterDescriptions[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_DESCRIPTION_KHR;
         pCounterDescriptions[i].flags =
            VK_PERFORMANCE_COUNTER_DESCRIPTION_CONCURRENTLY_IMPACTED_BIT_KHR;
         strcpy(pCounterDescriptions[i].name, descs[i].name);
         strcpy(pCounterDescriptions[i].category, descs[i].category);
         strcpy(pCounterDescriptions[i].description, descs[i].description);
      }
   }
   return result;
}

void
radv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
   VkPhysicalDevice physicalDevice,
   const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo, uint32_t *pNumPasses)
{
   RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);

   if (pPerformanceQueryCreateInfo->counterIndexCount == 0) {
      *pNumPasses = 0;
      return;
   }

   if (!radv_init_perfcounter_descs(pdevice)) {
      /* Can't return an error, so log */
      fprintf(stderr, "radv: Failed to init perf counters\n");
      *pNumPasses = 1;
      return;
   }

   assert(vk_queue_to_radv(pdevice, pPerformanceQueryCreateInfo->queueFamilyIndex) ==
          RADV_QUEUE_GENERAL);

   unsigned num_regs = 0;
   uint32_t *regs = NULL;
   VkResult result =
      radv_get_counter_registers(pdevice, pPerformanceQueryCreateInfo->counterIndexCount,
                                 pPerformanceQueryCreateInfo->pCounterIndices, &num_regs, &regs);
   if (result != VK_SUCCESS) {
      /* Can't return an error, so log */
      fprintf(stderr, "radv: Failed to allocate memory for perf counters\n");
   }

   *pNumPasses = radv_get_num_counter_passes(pdevice, num_regs, regs);
   free(regs);
}