1/* 2 * Copyright 2021 Valve Corporation 3 * All Rights Reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * on the rights to use, copy, modify, merge, publish, distribute, sub 9 * license, and/or sell copies of the Software, and to permit persons to whom 10 * the Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22 * USE OR OTHER DEALINGS IN THE SOFTWARE. 23 */ 24 25#include "ac_spm.h" 26 27#include "util/bitscan.h" 28#include "util/u_memory.h" 29#include "ac_perfcounter.h" 30 31static struct ac_spm_block_select * 32ac_spm_get_block_select(struct ac_spm_trace_data *spm_trace, 33 const struct ac_pc_block *block) 34{ 35 struct ac_spm_block_select *block_sel, *new_block_sel; 36 uint32_t num_block_sel; 37 38 for (uint32_t i = 0; i < spm_trace->num_block_sel; i++) { 39 if (spm_trace->block_sel[i].b->b->b->gpu_block == block->b->b->gpu_block) 40 return &spm_trace->block_sel[i]; 41 } 42 43 /* Allocate a new select block if it doesn't already exist. */ 44 num_block_sel = spm_trace->num_block_sel + 1; 45 block_sel = realloc(spm_trace->block_sel, num_block_sel * sizeof(*block_sel)); 46 if (!block_sel) 47 return NULL; 48 49 spm_trace->num_block_sel = num_block_sel; 50 spm_trace->block_sel = block_sel; 51 52 /* Initialize the new select block. */ 53 new_block_sel = &spm_trace->block_sel[spm_trace->num_block_sel - 1]; 54 memset(new_block_sel, 0, sizeof(*new_block_sel)); 55 56 new_block_sel->b = block; 57 new_block_sel->num_counters = block->b->b->num_spm_counters; 58 59 /* Broadcast global block writes to SEs and SAs */ 60 if (!(block->b->b->flags & (AC_PC_BLOCK_SE | AC_PC_BLOCK_SHADER))) 61 new_block_sel->grbm_gfx_index = S_030800_SE_BROADCAST_WRITES(1) | 62 S_030800_SH_BROADCAST_WRITES(1); 63 /* Broadcast per SE block writes to SAs */ 64 else if (block->b->b->flags & AC_PC_BLOCK_SE) 65 new_block_sel->grbm_gfx_index = S_030800_SH_BROADCAST_WRITES(1); 66 67 return new_block_sel; 68} 69 70static void 71ac_spm_init_muxsel(const struct ac_pc_block *block, 72 struct ac_spm_counter_info *counter, 73 uint32_t spm_wire) 74{ 75 struct ac_spm_muxsel *muxsel = &counter->muxsel; 76 77 muxsel->counter = 2 * spm_wire + (counter->is_even ? 0 : 1); 78 muxsel->block = block->b->b->spm_block_select; 79 muxsel->shader_array = 0; 80 muxsel->instance = 0; 81} 82 83static bool 84ac_spm_map_counter(struct ac_spm_trace_data *spm_trace, 85 struct ac_spm_block_select *block_sel, 86 struct ac_spm_counter_info *counter, 87 uint32_t *spm_wire) 88{ 89 if (block_sel->b->b->b->gpu_block == SQ) { 90 for (unsigned i = 0; i < ARRAY_SIZE(spm_trace->sq_block_sel); i++) { 91 struct ac_spm_block_select *sq_block_sel = &spm_trace->sq_block_sel[i]; 92 struct ac_spm_counter_select *cntr_sel = &sq_block_sel->counters[0]; 93 if (i < spm_trace->num_used_sq_block_sel) 94 continue; 95 96 /* SQ doesn't support 16-bit counters. */ 97 cntr_sel->sel0 |= S_036700_PERF_SEL(counter->event_id) | 98 S_036700_SPM_MODE(3) | /* 32-bit clamp */ 99 S_036700_PERF_MODE(0); 100 cntr_sel->active |= 0x3; 101 102 /* 32-bits counter are always even. */ 103 counter->is_even = true; 104 105 /* One wire per SQ module. */ 106 *spm_wire = i; 107 108 spm_trace->num_used_sq_block_sel++; 109 return true; 110 } 111 } else { 112 /* Generic blocks. */ 113 for (unsigned i = 0; i < block_sel->num_counters; i++) { 114 struct ac_spm_counter_select *cntr_sel = &block_sel->counters[i]; 115 int index = ffs(~cntr_sel->active) - 1; 116 117 switch (index) { 118 case 0: /* use S_037004_PERF_SEL */ 119 cntr_sel->sel0 |= S_037004_PERF_SEL(counter->event_id) | 120 S_037004_CNTR_MODE(1) | /* 16-bit clamp */ 121 S_037004_PERF_MODE(0); /* accum */ 122 break; 123 case 1: /* use S_037004_PERF_SEL1 */ 124 cntr_sel->sel0 |= S_037004_PERF_SEL1(counter->event_id) | 125 S_037004_PERF_MODE1(0); 126 break; 127 case 2: /* use S_037004_PERF_SEL2 */ 128 cntr_sel->sel1 |= S_037008_PERF_SEL2(counter->event_id) | 129 S_037008_PERF_MODE2(0); 130 break; 131 case 3: /* use S_037004_PERF_SEL3 */ 132 cntr_sel->sel1 |= S_037008_PERF_SEL3(counter->event_id) | 133 S_037008_PERF_MODE3(0); 134 break; 135 default: 136 return false; 137 } 138 139 /* Mark this 16-bit counter as used. */ 140 cntr_sel->active |= 1 << index; 141 142 /* Determine if the counter is even or odd. */ 143 counter->is_even = !(index % 2); 144 145 /* Determine the SPM wire (one wire holds two 16-bit counters). */ 146 *spm_wire = !!(index >= 2); 147 148 return true; 149 } 150 } 151 152 return false; 153} 154 155static bool 156ac_spm_add_counter(const struct ac_perfcounters *pc, 157 struct ac_spm_trace_data *spm_trace, 158 const struct ac_spm_counter_create_info *info) 159{ 160 struct ac_spm_counter_info *counter; 161 struct ac_spm_block_select *block_sel; 162 struct ac_pc_block *block; 163 uint32_t spm_wire; 164 165 /* Check if the GPU block is valid. */ 166 block = ac_pc_get_block(pc, info->gpu_block); 167 if (!block) { 168 fprintf(stderr, "ac/spm: Invalid GPU block.\n"); 169 return false; 170 } 171 172 /* Check if the number of instances is valid. */ 173 if (info->instance > block->num_instances) { 174 fprintf(stderr, "ac/spm: Invalid instance ID.\n"); 175 return false; 176 } 177 178 /* Check if the event ID is valid. */ 179 if (info->event_id > block->b->selectors) { 180 fprintf(stderr, "ac/spm: Invalid event ID.\n"); 181 return false; 182 } 183 184 counter = &spm_trace->counters[spm_trace->num_counters]; 185 spm_trace->num_counters++; 186 187 counter->gpu_block = info->gpu_block; 188 counter->instance = info->instance; 189 counter->event_id = info->event_id; 190 191 /* Get the select block used to configure the counter. */ 192 block_sel = ac_spm_get_block_select(spm_trace, block); 193 if (!block_sel) 194 return false; 195 196 /* Map the counter to the select block. */ 197 if (!ac_spm_map_counter(spm_trace, block_sel, counter, &spm_wire)) { 198 fprintf(stderr, "ac/spm: No free slots available!\n"); 199 return false; 200 } 201 202 /* Determine the counter segment type. */ 203 if (block->b->b->flags & AC_PC_BLOCK_SE) { 204 counter->segment_type = AC_SPM_SEGMENT_TYPE_SE0; // XXX 205 } else { 206 counter->segment_type = AC_SPM_SEGMENT_TYPE_GLOBAL; 207 } 208 209 /* Configure the muxsel for SPM. */ 210 ac_spm_init_muxsel(block, counter, spm_wire); 211 212 return true; 213} 214 215bool ac_init_spm(const struct radeon_info *info, 216 const struct ac_perfcounters *pc, 217 unsigned num_counters, 218 const struct ac_spm_counter_create_info *counters, 219 struct ac_spm_trace_data *spm_trace) 220{ 221 spm_trace->counters = CALLOC(num_counters, sizeof(*spm_trace->counters)); 222 if (!spm_trace->counters) 223 return false; 224 225 for (unsigned i = 0; i < num_counters; i++) { 226 if (!ac_spm_add_counter(pc, spm_trace, &counters[i])) { 227 fprintf(stderr, "ac/spm: Failed to add SPM counter (%d).\n", i); 228 return false; 229 } 230 } 231 232 /* Determine the segment size and create a muxsel ram for every segment. */ 233 for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) { 234 unsigned num_even_counters = 0, num_odd_counters = 0; 235 236 if (s == AC_SPM_SEGMENT_TYPE_GLOBAL) { 237 /* The global segment always start with a 64-bit timestamp. */ 238 num_even_counters += AC_SPM_GLOBAL_TIMESTAMP_COUNTERS; 239 } 240 241 /* Count the number of even/odd counters for this segment. */ 242 for (unsigned c = 0; c < spm_trace->num_counters; c++) { 243 struct ac_spm_counter_info *counter = &spm_trace->counters[c]; 244 245 if (counter->segment_type != s) 246 continue; 247 248 if (counter->is_even) { 249 num_even_counters++; 250 } else { 251 num_odd_counters++; 252 } 253 } 254 255 /* Compute the number of lines. */ 256 unsigned even_lines = 257 DIV_ROUND_UP(num_even_counters, AC_SPM_NUM_COUNTER_PER_MUXSEL); 258 unsigned odd_lines = 259 DIV_ROUND_UP(num_odd_counters, AC_SPM_NUM_COUNTER_PER_MUXSEL); 260 unsigned num_lines = (even_lines > odd_lines) ? (2 * even_lines - 1) : (2 * odd_lines); 261 262 spm_trace->muxsel_lines[s] = CALLOC(num_lines, sizeof(*spm_trace->muxsel_lines[s])); 263 if (!spm_trace->muxsel_lines[s]) 264 return false; 265 spm_trace->num_muxsel_lines[s] = num_lines; 266 } 267 268 /* RLC uses the following order: Global, SE0, SE1, SE2, SE3. */ 269 const enum ac_spm_segment_type ordered_segment[AC_SPM_SEGMENT_TYPE_COUNT] = 270 { 271 AC_SPM_SEGMENT_TYPE_GLOBAL, 272 AC_SPM_SEGMENT_TYPE_SE0, 273 AC_SPM_SEGMENT_TYPE_SE1, 274 AC_SPM_SEGMENT_TYPE_SE2, 275 AC_SPM_SEGMENT_TYPE_SE3, 276 }; 277 278 for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) { 279 if (!spm_trace->muxsel_lines[s]) 280 continue; 281 282 uint32_t segment_offset = 0; 283 for (unsigned i = 0; s != ordered_segment[i]; i++) { 284 segment_offset += spm_trace->num_muxsel_lines[ordered_segment[i]] * 285 AC_SPM_NUM_COUNTER_PER_MUXSEL; 286 } 287 288 uint32_t even_counter_idx = 0, even_line_idx = 0; 289 uint32_t odd_counter_idx = 0, odd_line_idx = 1; 290 291 /* Add the global timestamps first. */ 292 if (s == AC_SPM_SEGMENT_TYPE_GLOBAL) { 293 struct ac_spm_muxsel global_timestamp_muxsel = { 294 .counter = 0x30, 295 .block = 0x3, 296 .shader_array = 0, 297 .instance = 0x1e, 298 }; 299 300 for (unsigned i = 0; i < 4; i++) { 301 spm_trace->muxsel_lines[s][even_line_idx].muxsel[even_counter_idx++] = global_timestamp_muxsel; 302 } 303 } 304 305 for (unsigned i = 0; i < spm_trace->num_counters; i++) { 306 struct ac_spm_counter_info *counter = &spm_trace->counters[i]; 307 308 if (counter->segment_type != s) 309 continue; 310 311 if (counter->is_even) { 312 counter->offset = segment_offset + even_line_idx * 313 AC_SPM_NUM_COUNTER_PER_MUXSEL + even_counter_idx; 314 315 spm_trace->muxsel_lines[s][even_line_idx].muxsel[even_counter_idx] = spm_trace->counters[i].muxsel; 316 if (++even_counter_idx == AC_SPM_NUM_COUNTER_PER_MUXSEL) { 317 even_counter_idx = 0; 318 even_line_idx += 2; 319 } 320 } else { 321 counter->offset = segment_offset + odd_line_idx * 322 AC_SPM_NUM_COUNTER_PER_MUXSEL + odd_counter_idx; 323 324 spm_trace->muxsel_lines[s][odd_line_idx].muxsel[odd_counter_idx] = spm_trace->counters[i].muxsel; 325 if (++odd_counter_idx == AC_SPM_NUM_COUNTER_PER_MUXSEL) { 326 odd_counter_idx = 0; 327 odd_line_idx += 2; 328 } 329 } 330 } 331 } 332 333 return true; 334} 335 336void ac_destroy_spm(struct ac_spm_trace_data *spm_trace) 337{ 338 for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) { 339 FREE(spm_trace->muxsel_lines[s]); 340 } 341 FREE(spm_trace->block_sel); 342 FREE(spm_trace->counters); 343} 344 345uint32_t ac_spm_get_sample_size(const struct ac_spm_trace_data *spm_trace) 346{ 347 uint32_t sample_size = 0; /* in bytes */ 348 349 for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) { 350 sample_size += spm_trace->num_muxsel_lines[s] * AC_SPM_MUXSEL_LINE_SIZE * 4; 351 } 352 353 return sample_size; 354} 355 356uint32_t ac_spm_get_num_samples(const struct ac_spm_trace_data *spm_trace) 357{ 358 uint32_t sample_size = ac_spm_get_sample_size(spm_trace); 359 uint32_t *ptr = (uint32_t *)spm_trace->ptr; 360 uint32_t data_size, num_lines_written; 361 uint32_t num_samples = 0; 362 363 /* Get the data size (in bytes) written by the hw to the ring buffer. */ 364 data_size = ptr[0]; 365 366 /* Compute the number of 256 bits (16 * 16-bits counters) lines written. */ 367 num_lines_written = data_size / (2 * AC_SPM_NUM_COUNTER_PER_MUXSEL); 368 369 /* Check for overflow. */ 370 if (num_lines_written % (sample_size / 32)) { 371 abort(); 372 } else { 373 num_samples = num_lines_written / (sample_size / 32); 374 } 375 376 return num_samples; 377} 378