1/*
2 * Copyright 2015 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 *  Nicolai Hähnle <nicolai.haehnle@amd.com>
25 *
26 */
27
28#include "util/u_memory.h"
29#include "r600_query.h"
30#include "r600_pipe_common.h"
31#include "r600d_common.h"
32
33/* Max counters per HW block */
34#define R600_QUERY_MAX_COUNTERS 16
35
36static struct r600_perfcounter_block *
37lookup_counter(struct r600_perfcounters *pc, unsigned index,
38	       unsigned *base_gid, unsigned *sub_index)
39{
40	struct r600_perfcounter_block *block = pc->blocks;
41	unsigned bid;
42
43	*base_gid = 0;
44	for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
45		unsigned total = block->num_groups * block->num_selectors;
46
47		if (index < total) {
48			*sub_index = index;
49			return block;
50		}
51
52		index -= total;
53		*base_gid += block->num_groups;
54	}
55
56	return NULL;
57}
58
59static struct r600_perfcounter_block *
60lookup_group(struct r600_perfcounters *pc, unsigned *index)
61{
62	unsigned bid;
63	struct r600_perfcounter_block *block = pc->blocks;
64
65	for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
66		if (*index < block->num_groups)
67			return block;
68		*index -= block->num_groups;
69	}
70
71	return NULL;
72}
73
/* Selection state for one (block, shader/SE/instance) group of a batch
 * query; nodes form a singly-linked list headed at r600_query_pc::groups. */
struct r600_pc_group {
	struct r600_pc_group *next;
	struct r600_perfcounter_block *block;
	unsigned sub_gid; /* only used during init */
	unsigned result_base; /* only used during init */
	int se; /* shader engine index, or -1 to broadcast to all SEs */
	int instance; /* block instance index, or -1 for all instances */
	unsigned num_counters; /* number of entries used in selectors[] */
	unsigned selectors[R600_QUERY_MAX_COUNTERS];
};
84
/* Maps one user-visible counter to its slot(s) in the result buffer. */
struct r600_pc_counter {
	unsigned base; /* index of the first uint64 belonging to this counter */
	unsigned qwords; /* number of hardware values summed into the result */
	unsigned stride; /* in uint64s */
};
90
91#define R600_PC_SHADERS_WINDOWING (1 << 31)
92
/* A batch performance-counter query, layered on the generic HW query. */
struct r600_query_pc {
	struct r600_query_hw b; /* base; must be first for downcasts */

	unsigned shaders; /* shader selection mask, may carry R600_PC_SHADERS_WINDOWING */
	unsigned num_counters; /* number of user-supplied query types */
	struct r600_pc_counter *counters; /* one entry per user-supplied query type */
	struct r600_pc_group *groups; /* linked list of per-group selection state */
};
101
102static void r600_pc_query_destroy(struct r600_common_screen *rscreen,
103				  struct r600_query *rquery)
104{
105	struct r600_query_pc *query = (struct r600_query_pc *)rquery;
106
107	while (query->groups) {
108		struct r600_pc_group *group = query->groups;
109		query->groups = group->next;
110		FREE(group);
111	}
112
113	FREE(query->counters);
114
115	r600_query_hw_destroy(rscreen, rquery);
116}
117
118static bool r600_pc_query_prepare_buffer(struct r600_common_screen *screen,
119					 struct r600_query_hw *hwquery,
120					 struct r600_resource *buffer)
121{
122	/* no-op */
123	return true;
124}
125
126static void r600_pc_query_emit_start(struct r600_common_context *ctx,
127				     struct r600_query_hw *hwquery,
128				     struct r600_resource *buffer, uint64_t va)
129{
130	struct r600_perfcounters *pc = ctx->screen->perfcounters;
131	struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
132	struct r600_pc_group *group;
133	int current_se = -1;
134	int current_instance = -1;
135
136	if (query->shaders)
137		pc->emit_shaders(ctx, query->shaders);
138
139	for (group = query->groups; group; group = group->next) {
140		struct r600_perfcounter_block *block = group->block;
141
142		if (group->se != current_se || group->instance != current_instance) {
143			current_se = group->se;
144			current_instance = group->instance;
145			pc->emit_instance(ctx, group->se, group->instance);
146		}
147
148		pc->emit_select(ctx, block, group->num_counters, group->selectors);
149	}
150
151	if (current_se != -1 || current_instance != -1)
152		pc->emit_instance(ctx, -1, -1);
153
154	pc->emit_start(ctx, buffer, va);
155}
156
/* Emit commands that stop the counters and read their values back into
 * the query buffer.
 *
 * Each group's counters are read once per selected SE and instance; a
 * group with se < 0 (resp. instance < 0) is read for every SE (resp.
 * every instance), advancing va by one uint64 per counter per read.
 * This layout must match the result mapping set up at query creation.
 */
static void r600_pc_query_emit_stop(struct r600_common_context *ctx,
				    struct r600_query_hw *hwquery,
				    struct r600_resource *buffer, uint64_t va)
{
	struct r600_perfcounters *pc = ctx->screen->perfcounters;
	struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
	struct r600_pc_group *group;

	pc->emit_stop(ctx, buffer, va);

	for (group = query->groups; group; group = group->next) {
		struct r600_perfcounter_block *block = group->block;
		unsigned se = group->se >= 0 ? group->se : 0;
		unsigned se_end = se + 1;

		/* A per-SE block bound to "all SEs" iterates over every SE. */
		if ((block->flags & R600_PC_BLOCK_SE) && (group->se < 0))
			se_end = ctx->screen->info.max_se;

		do {
			unsigned instance = group->instance >= 0 ? group->instance : 0;

			do {
				/* Select the (se, instance) window, then read
				 * this group's counters into the buffer. */
				pc->emit_instance(ctx, se, instance);
				pc->emit_read(ctx, block,
					      group->num_counters, group->selectors,
					      buffer, va);
				va += sizeof(uint64_t) * group->num_counters;
			} while (group->instance < 0 && ++instance < block->num_instances);
		} while (++se < se_end);
	}

	/* Restore broadcast to all SEs and instances. */
	pc->emit_instance(ctx, -1, -1);
}
190
191static void r600_pc_query_clear_result(struct r600_query_hw *hwquery,
192				       union pipe_query_result *result)
193{
194	struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
195
196	memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
197}
198
/* Accumulate values read back from the hardware into the query result.
 *
 * Each user-visible counter sums `qwords` values spaced `stride`
 * uint64s apart, starting at `base`, so a counter broadcast over
 * several SEs/instances reports the total across all of them.
 *
 * NOTE(review): only the low 32 bits of each 64-bit slot are used —
 * presumably the hardware writes 32-bit counter values into 64-bit
 * slots; confirm against the emit_read implementation before changing.
 */
static void r600_pc_query_add_result(struct r600_common_screen *rscreen,
				     struct r600_query_hw *hwquery,
				     void *buffer,
				     union pipe_query_result *result)
{
	struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
	uint64_t *results = buffer;
	unsigned i, j;

	for (i = 0; i < query->num_counters; ++i) {
		struct r600_pc_counter *counter = &query->counters[i];

		for (j = 0; j < counter->qwords; ++j) {
			uint32_t value = results[counter->base + j * counter->stride];
			result->batch[i].u64 += value;
		}
	}
}
217
/* Query-level vtable for batch (perfcounter) queries; begin/end/result
 * handling is shared with the generic hardware-query implementation. */
static struct r600_query_ops batch_query_ops = {
	.destroy = r600_pc_query_destroy,
	.begin = r600_query_hw_begin,
	.end = r600_query_hw_end,
	.get_result = r600_query_hw_get_result
};

/* Hardware-query hooks specific to batch perfcounter queries. */
static struct r600_query_hw_ops batch_query_hw_ops = {
	.prepare_buffer = r600_pc_query_prepare_buffer,
	.emit_start = r600_pc_query_emit_start,
	.emit_stop = r600_pc_query_emit_stop,
	.clear_result = r600_pc_query_clear_result,
	.add_result = r600_pc_query_add_result,
};
232
/* Find the group state for (block, sub_gid) in the query, creating and
 * linking a new entry when none exists yet.
 *
 * sub_gid encodes (shader type, SE, instance) in mixed radix; the
 * components the block's flags declare as separate groups are peeled
 * off here to fill in group->se and group->instance.  Returns NULL on
 * allocation failure or when incompatible shader groups are mixed in
 * one query.
 */
static struct r600_pc_group *get_group_state(struct r600_common_screen *screen,
					     struct r600_query_pc *query,
					     struct r600_perfcounter_block *block,
					     unsigned sub_gid)
{
	struct r600_pc_group *group = query->groups;

	/* Reuse an existing group for the same (block, sub_gid). */
	while (group) {
		if (group->block == block && group->sub_gid == sub_gid)
			return group;
		group = group->next;
	}

	group = CALLOC_STRUCT(r600_pc_group);
	if (!group)
		return NULL;

	group->block = block;
	group->sub_gid = sub_gid;

	if (block->flags & R600_PC_BLOCK_SHADER) {
		/* Peel the shader type off the top of sub_gid. */
		unsigned sub_gids = block->num_instances;
		unsigned shader_id;
		unsigned shaders;
		unsigned query_shaders;

		if (block->flags & R600_PC_BLOCK_SE_GROUPS)
			sub_gids = sub_gids * screen->info.max_se;
		shader_id = sub_gid / sub_gids;
		sub_gid = sub_gid % sub_gids;

		shaders = screen->perfcounters->shader_type_bits[shader_id];

		/* All shader-typed groups in one query must agree on the
		 * shader selection mask. */
		query_shaders = query->shaders & ~R600_PC_SHADERS_WINDOWING;
		if (query_shaders && query_shaders != shaders) {
			fprintf(stderr, "r600_perfcounter: incompatible shader groups\n");
			FREE(group);
			return NULL;
		}
		query->shaders = shaders;
	}

	if (block->flags & R600_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
		// A non-zero value in query->shaders ensures that the shader
		// masking is reset unless the user explicitly requests one.
		query->shaders = R600_PC_SHADERS_WINDOWING;
	}

	/* Peel off the SE, then whatever remains is the instance. */
	if (block->flags & R600_PC_BLOCK_SE_GROUPS) {
		group->se = sub_gid / block->num_instances;
		sub_gid = sub_gid % block->num_instances;
	} else {
		group->se = -1;
	}

	if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) {
		group->instance = sub_gid;
	} else {
		group->instance = -1;
	}

	/* Prepend to the query's group list. */
	group->next = query->groups;
	query->groups = group;

	return group;
}
299
300struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
301					   unsigned num_queries,
302					   unsigned *query_types)
303{
304	struct r600_common_screen *screen =
305		(struct r600_common_screen *)ctx->screen;
306	struct r600_perfcounters *pc = screen->perfcounters;
307	struct r600_perfcounter_block *block;
308	struct r600_pc_group *group;
309	struct r600_query_pc *query;
310	unsigned base_gid, sub_gid, sub_index;
311	unsigned i, j;
312
313	if (!pc)
314		return NULL;
315
316	query = CALLOC_STRUCT(r600_query_pc);
317	if (!query)
318		return NULL;
319
320	query->b.b.ops = &batch_query_ops;
321	query->b.ops = &batch_query_hw_ops;
322
323	query->num_counters = num_queries;
324
325	/* Collect selectors per group */
326	for (i = 0; i < num_queries; ++i) {
327		unsigned sub_gid;
328
329		if (query_types[i] < R600_QUERY_FIRST_PERFCOUNTER)
330			goto error;
331
332		block = lookup_counter(pc, query_types[i] - R600_QUERY_FIRST_PERFCOUNTER,
333				       &base_gid, &sub_index);
334		if (!block)
335			goto error;
336
337		sub_gid = sub_index / block->num_selectors;
338		sub_index = sub_index % block->num_selectors;
339
340		group = get_group_state(screen, query, block, sub_gid);
341		if (!group)
342			goto error;
343
344		if (group->num_counters >= block->num_counters) {
345			fprintf(stderr,
346				"perfcounter group %s: too many selected\n",
347				block->basename);
348			goto error;
349		}
350		group->selectors[group->num_counters] = sub_index;
351		++group->num_counters;
352	}
353
354	/* Compute result bases and CS size per group */
355	query->b.num_cs_dw_begin = pc->num_start_cs_dwords;
356	query->b.num_cs_dw_end = pc->num_stop_cs_dwords;
357
358	query->b.num_cs_dw_begin += pc->num_instance_cs_dwords; /* conservative */
359	query->b.num_cs_dw_end += pc->num_instance_cs_dwords;
360
361	i = 0;
362	for (group = query->groups; group; group = group->next) {
363		struct r600_perfcounter_block *block = group->block;
364		unsigned select_dw, read_dw;
365		unsigned instances = 1;
366
367		if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0)
368			instances = screen->info.max_se;
369		if (group->instance < 0)
370			instances *= block->num_instances;
371
372		group->result_base = i;
373		query->b.result_size += sizeof(uint64_t) * instances * group->num_counters;
374		i += instances * group->num_counters;
375
376		pc->get_size(block, group->num_counters, group->selectors,
377			     &select_dw, &read_dw);
378		query->b.num_cs_dw_begin += select_dw;
379		query->b.num_cs_dw_end += instances * read_dw;
380		query->b.num_cs_dw_begin += pc->num_instance_cs_dwords; /* conservative */
381		query->b.num_cs_dw_end += instances * pc->num_instance_cs_dwords;
382	}
383
384	if (query->shaders) {
385		if (query->shaders == R600_PC_SHADERS_WINDOWING)
386			query->shaders = 0xffffffff;
387		query->b.num_cs_dw_begin += pc->num_shaders_cs_dwords;
388	}
389
390	/* Map user-supplied query array to result indices */
391	query->counters = CALLOC(num_queries, sizeof(*query->counters));
392	for (i = 0; i < num_queries; ++i) {
393		struct r600_pc_counter *counter = &query->counters[i];
394		struct r600_perfcounter_block *block;
395
396		block = lookup_counter(pc, query_types[i] - R600_QUERY_FIRST_PERFCOUNTER,
397				       &base_gid, &sub_index);
398
399		sub_gid = sub_index / block->num_selectors;
400		sub_index = sub_index % block->num_selectors;
401
402		group = get_group_state(screen, query, block, sub_gid);
403		assert(group != NULL);
404
405		for (j = 0; j < group->num_counters; ++j) {
406			if (group->selectors[j] == sub_index)
407				break;
408		}
409
410		counter->base = group->result_base + j;
411		counter->stride = group->num_counters;
412
413		counter->qwords = 1;
414		if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0)
415			counter->qwords = screen->info.max_se;
416		if (group->instance < 0)
417			counter->qwords *= block->num_instances;
418	}
419
420	if (!r600_query_hw_init(screen, &query->b))
421		goto error;
422
423	return (struct pipe_query *)query;
424
425error:
426	r600_pc_query_destroy(screen, &query->b.b);
427	return NULL;
428}
429
/* Lazily build the group and selector name tables for a block.
 *
 * Group names have the form basename[shader_suffix][se]["_"][instance],
 * stored as a flat array with fixed stride group_name_stride; selector
 * names are "<group>_NNN" with stride selector_name_stride.  Returns
 * false on allocation failure (partial tables are freed at screen
 * destruction).
 */
static bool r600_init_block_names(struct r600_common_screen *screen,
				  struct r600_perfcounter_block *block)
{
	unsigned i, j, k;
	unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
	unsigned namelen;
	char *groupname;
	char *p;

	if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS)
		groups_instance = block->num_instances;
	if (block->flags & R600_PC_BLOCK_SE_GROUPS)
		groups_se = screen->info.max_se;
	if (block->flags & R600_PC_BLOCK_SHADER)
		groups_shader = screen->perfcounters->num_shader_types;

	/* Fixed per-name stride: basename + NUL, plus up to 3 chars of
	 * shader suffix, 1 SE digit, optional '_' separator, and up to
	 * 2 instance digits (the asserts below bound the digit counts). */
	namelen = strlen(block->basename);
	block->group_name_stride = namelen + 1;
	if (block->flags & R600_PC_BLOCK_SHADER)
		block->group_name_stride += 3;
	if (block->flags & R600_PC_BLOCK_SE_GROUPS) {
		assert(groups_se <= 10);
		block->group_name_stride += 1;

		if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS)
			block->group_name_stride += 1;
	}
	if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) {
		assert(groups_instance <= 100);
		block->group_name_stride += 2;
	}

	block->group_names = MALLOC(block->num_groups * block->group_name_stride);
	if (!block->group_names)
		return false;

	/* Fill in one name per (shader, se, instance) combination, in the
	 * same order that group ids are assigned. */
	groupname = block->group_names;
	for (i = 0; i < groups_shader; ++i) {
		const char *shader_suffix = screen->perfcounters->shader_type_suffixes[i];
		unsigned shaderlen = strlen(shader_suffix);
		for (j = 0; j < groups_se; ++j) {
			for (k = 0; k < groups_instance; ++k) {
				strcpy(groupname, block->basename);
				p = groupname + namelen;

				if (block->flags & R600_PC_BLOCK_SHADER) {
					strcpy(p, shader_suffix);
					p += shaderlen;
				}

				if (block->flags & R600_PC_BLOCK_SE_GROUPS) {
					p += sprintf(p, "%d", j);
					if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS)
						*p++ = '_';
				}

				if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS)
					p += sprintf(p, "%d", k);

				groupname += block->group_name_stride;
			}
		}
	}

	/* Selector names: "<group>_NNN" — '_' plus three fixed digits. */
	assert(block->num_selectors <= 1000);
	block->selector_name_stride = block->group_name_stride + 4;
	block->selector_names = MALLOC(block->num_groups * block->num_selectors *
				       block->selector_name_stride);
	if (!block->selector_names)
		return false;

	groupname = block->group_names;
	p = block->selector_names;
	for (i = 0; i < block->num_groups; ++i) {
		for (j = 0; j < block->num_selectors; ++j) {
			sprintf(p, "%s_%03d", groupname, j);
			p += block->selector_name_stride;
		}
		groupname += block->group_name_stride;
	}

	return true;
}
513
514int r600_get_perfcounter_info(struct r600_common_screen *screen,
515			      unsigned index,
516			      struct pipe_driver_query_info *info)
517{
518	struct r600_perfcounters *pc = screen->perfcounters;
519	struct r600_perfcounter_block *block;
520	unsigned base_gid, sub;
521
522	if (!pc)
523		return 0;
524
525	if (!info) {
526		unsigned bid, num_queries = 0;
527
528		for (bid = 0; bid < pc->num_blocks; ++bid) {
529			num_queries += pc->blocks[bid].num_selectors *
530				       pc->blocks[bid].num_groups;
531		}
532
533		return num_queries;
534	}
535
536	block = lookup_counter(pc, index, &base_gid, &sub);
537	if (!block)
538		return 0;
539
540	if (!block->selector_names) {
541		if (!r600_init_block_names(screen, block))
542			return 0;
543	}
544	info->name = block->selector_names + sub * block->selector_name_stride;
545	info->query_type = R600_QUERY_FIRST_PERFCOUNTER + index;
546	info->max_value.u64 = 0;
547	info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
548	info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
549	info->group_id = base_gid + sub / block->num_selectors;
550	info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
551	if (sub > 0 && sub + 1 < block->num_selectors * block->num_groups)
552		info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
553	return 1;
554}
555
556int r600_get_perfcounter_group_info(struct r600_common_screen *screen,
557				    unsigned index,
558				    struct pipe_driver_query_group_info *info)
559{
560	struct r600_perfcounters *pc = screen->perfcounters;
561	struct r600_perfcounter_block *block;
562
563	if (!pc)
564		return 0;
565
566	if (!info)
567		return pc->num_groups;
568
569	block = lookup_group(pc, &index);
570	if (!block)
571		return 0;
572
573	if (!block->group_names) {
574		if (!r600_init_block_names(screen, block))
575			return 0;
576	}
577	info->name = block->group_names + index * block->group_name_stride;
578	info->num_queries = block->num_selectors;
579	info->max_active_queries = block->num_counters;
580	return 1;
581}
582
583void r600_perfcounters_destroy(struct r600_common_screen *rscreen)
584{
585	if (rscreen->perfcounters)
586		rscreen->perfcounters->cleanup(rscreen);
587}
588
589bool r600_perfcounters_init(struct r600_perfcounters *pc,
590			    unsigned num_blocks)
591{
592	pc->blocks = CALLOC(num_blocks, sizeof(struct r600_perfcounter_block));
593	if (!pc->blocks)
594		return false;
595
596	pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
597	pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
598
599	return true;
600}
601
602void r600_perfcounters_add_block(struct r600_common_screen *rscreen,
603				 struct r600_perfcounters *pc,
604				 const char *name, unsigned flags,
605				 unsigned counters, unsigned selectors,
606				 unsigned instances, void *data)
607{
608	struct r600_perfcounter_block *block = &pc->blocks[pc->num_blocks];
609
610	assert(counters <= R600_QUERY_MAX_COUNTERS);
611
612	block->basename = name;
613	block->flags = flags;
614	block->num_counters = counters;
615	block->num_selectors = selectors;
616	block->num_instances = MAX2(instances, 1);
617	block->data = data;
618
619	if (pc->separate_se && (block->flags & R600_PC_BLOCK_SE))
620		block->flags |= R600_PC_BLOCK_SE_GROUPS;
621	if (pc->separate_instance && block->num_instances > 1)
622		block->flags |= R600_PC_BLOCK_INSTANCE_GROUPS;
623
624	if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) {
625		block->num_groups = block->num_instances;
626	} else {
627		block->num_groups = 1;
628	}
629
630	if (block->flags & R600_PC_BLOCK_SE_GROUPS)
631		block->num_groups *= rscreen->info.max_se;
632	if (block->flags & R600_PC_BLOCK_SHADER)
633		block->num_groups *= pc->num_shader_types;
634
635	++pc->num_blocks;
636	pc->num_groups += block->num_groups;
637}
638
639void r600_perfcounters_do_destroy(struct r600_perfcounters *pc)
640{
641	unsigned i;
642
643	for (i = 0; i < pc->num_blocks; ++i) {
644		FREE(pc->blocks[i].group_names);
645		FREE(pc->blocks[i].selector_names);
646	}
647	FREE(pc->blocks);
648	FREE(pc);
649}
650