1/*
2 * Copyright (C) 2017 Rob Clark <robclark@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 *    Rob Clark <robclark@freedesktop.org>
25 */
26
27/* NOTE: see https://github.com/freedreno/freedreno/wiki/A5xx-Queries */
28
29#include "freedreno_query_acc.h"
30#include "freedreno_resource.h"
31
32#include "fd5_context.h"
33#include "fd5_emit.h"
34#include "fd5_format.h"
35#include "fd5_query.h"
36
/* Layout of a single query sample in the GPU-visible query buffer.
 * PACKED so the layout matches what the HW/CP writes.  Batch (perfcntr)
 * queries use an array of these, one per tracked counter.
 */
struct PACKED fd5_query_sample {
   uint64_t start;  /* counter value snapshotted at resume */
   uint64_t result; /* accumulated (stop - start) across resume/pause pairs */
   uint64_t stop;   /* counter value snapshotted at pause */
};
42
/* offset of a single field of an array of fd5_query_sample; expands to
 * the (bo, offset, 0, 0) argument list passed to OUT_RELOC():
 */
#define query_sample_idx(aq, idx, field)                                       \
   fd_resource((aq)->prsc)->bo,                                                \
      ((idx) * sizeof(struct fd5_query_sample)) +                              \
         offsetof(struct fd5_query_sample, field),                             \
      0, 0

/* offset of a single field of fd5_query_sample: */
#define query_sample(aq, field) query_sample_idx(aq, 0, field)
52
53/*
54 * Occlusion Query:
55 *
56 * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
57 * interpret results
58 */
59
/* Begin (or resume after a batch boundary) an occlusion query:
 * snapshot the current HW sample-count into 'start' via a ZPASS_DONE
 * event, so pause can later compute the delta.
 */
static void
occlusion_resume(struct fd_acc_query *aq, struct fd_batch *batch)
{
   struct fd_ringbuffer *ring = batch->draw;

   /* tell RB to copy the sample-counter to memory on ZPASS_DONE: */
   OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_CONTROL, 1);
   OUT_RING(ring, A5XX_RB_SAMPLE_COUNT_CONTROL_COPY);

   /* destination address for the copy (LO/HI pair): */
   OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_ADDR_LO, 2);
   OUT_RELOC(ring, query_sample(aq, start));

   fd5_event_write(batch, ring, ZPASS_DONE, false);
   fd_reset_wfi(batch);

   /* bookkeeping: count of currently-active samples-passed queries: */
   fd5_context(batch->ctx)->samples_passed_queries++;
}
76
/* Pause an occlusion query: seed 'stop' with a sentinel, have the HW
 * overwrite it with the real sample count via ZPASS_DONE, make the CP
 * wait until that write lands, then accumulate (stop - start) into
 * 'result' on the CP.
 */
static void
occlusion_pause(struct fd_acc_query *aq, struct fd_batch *batch)
{
   struct fd_ringbuffer *ring = batch->draw;

   /* write a sentinel (~0/~0) into 'stop' so we can poll below for the
    * ZPASS_DONE copy overwriting it:
    */
   OUT_PKT7(ring, CP_MEM_WRITE, 4);
   OUT_RELOC(ring, query_sample(aq, stop));
   OUT_RING(ring, 0xffffffff);
   OUT_RING(ring, 0xffffffff);

   OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0);

   OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_CONTROL, 1);
   OUT_RING(ring, A5XX_RB_SAMPLE_COUNT_CONTROL_COPY);

   OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_ADDR_LO, 2);
   OUT_RELOC(ring, query_sample(aq, stop));

   fd5_event_write(batch, ring, ZPASS_DONE, false);
   fd_reset_wfi(batch);

   /* stall the CP until 'stop' no longer holds the sentinel, ie. until
    * the sample-count copy has landed (magic packet fields are poll
    * mode/function + interval; exact decode unconfirmed, hence XXX):
    */
   OUT_PKT7(ring, CP_WAIT_REG_MEM, 6);
   OUT_RING(ring, 0x00000014); // XXX
   OUT_RELOC(ring, query_sample(aq, stop));
   OUT_RING(ring, 0xffffffff);
   OUT_RING(ring, 0xffffffff);
   OUT_RING(ring, 0x00000010); // XXX

   /* result += stop - start: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   OUT_RELOC(ring, query_sample(aq, result)); /* dst */
   OUT_RELOC(ring, query_sample(aq, result)); /* srcA */
   OUT_RELOC(ring, query_sample(aq, stop));   /* srcB */
   OUT_RELOC(ring, query_sample(aq, start));  /* srcC */

   /* matches the increment in occlusion_resume(): */
   fd5_context(batch->ctx)->samples_passed_queries--;
}
115
116static void
117occlusion_counter_result(struct fd_acc_query *aq, void *buf,
118                         union pipe_query_result *result)
119{
120   struct fd5_query_sample *sp = buf;
121   result->u64 = sp->result;
122}
123
124static void
125occlusion_predicate_result(struct fd_acc_query *aq, void *buf,
126                           union pipe_query_result *result)
127{
128   struct fd5_query_sample *sp = buf;
129   result->b = !!sp->result;
130}
131
/* PIPE_QUERY_OCCLUSION_COUNTER: exact count of passing samples: */
static const struct fd_acc_sample_provider occlusion_counter = {
   .query_type = PIPE_QUERY_OCCLUSION_COUNTER,
   .size = sizeof(struct fd5_query_sample),
   .resume = occlusion_resume,
   .pause = occlusion_pause,
   .result = occlusion_counter_result,
};
139
/* PIPE_QUERY_OCCLUSION_PREDICATE: same counting, boolean result: */
static const struct fd_acc_sample_provider occlusion_predicate = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
   .size = sizeof(struct fd5_query_sample),
   .resume = occlusion_resume,
   .pause = occlusion_pause,
   .result = occlusion_predicate_result,
};
147
/* Conservative predicate: implemented identically to the exact
 * predicate (exact counting is trivially a valid conservative answer):
 */
static const struct fd_acc_sample_provider occlusion_predicate_conservative = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE,
   .size = sizeof(struct fd5_query_sample),
   .resume = occlusion_resume,
   .pause = occlusion_pause,
   .result = occlusion_predicate_result,
};
155
156/*
157 * Timestamp Queries:
158 */
159
/* Capture a 64b GPU timestamp into 'start' when preceding work (RB)
 * completes:
 */
static void
timestamp_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   /* CP_EVENT_WRITE w/ TIMESTAMP bit: HW writes the timestamp to the
    * reloc'd address when the RB_DONE_TS event fires:
    */
   OUT_PKT7(ring, CP_EVENT_WRITE, 4);
   OUT_RING(ring,
            CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP);
   OUT_RELOC(ring, query_sample(aq, start));
   OUT_RING(ring, 0x00000000);

   fd_reset_wfi(batch);
}
173
/* Capture the ending timestamp into 'stop' and accumulate
 * (stop - start) ticks into 'result' on the CP:
 */
static void
timestamp_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   OUT_PKT7(ring, CP_EVENT_WRITE, 4);
   OUT_RING(ring,
            CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP);
   OUT_RELOC(ring, query_sample(aq, stop));
   OUT_RING(ring, 0x00000000);

   /* NOTE(review): wfi presumably ensures the timestamp write has
    * landed before CP_MEM_TO_MEM below reads it — confirm:
    */
   fd_reset_wfi(batch);
   fd_wfi(batch, ring);

   /* result += stop - start: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   OUT_RELOC(ring, query_sample(aq, result)); /* dst */
   OUT_RELOC(ring, query_sample(aq, result)); /* srcA */
   OUT_RELOC(ring, query_sample(aq, stop));   /* srcB */
   OUT_RELOC(ring, query_sample(aq, start));  /* srcC */
}
196
/* Convert always-on timer ticks to nanoseconds. */
static uint64_t
ticks_to_ns(uint32_t ts)
{
   /* This is based on the 19.2MHz always-on rbbm timer.
    *
    * TODO we should probably query this value from kernel..
    */
   /* Widen before multiplying: the previous `ts * (1000000000/19200000)`
    * (a) pre-divided to an integer ratio of 52, losing ~0.16% (the true
    * ratio is 52.083..), and (b) did the multiply in 32 bits, wrapping
    * for ts > ~82.6M ticks (~4.3s).  Multiply first, divide last, all
    * in 64 bits; max operand 2^32 * 1e9 still fits in a uint64_t.
    */
   return (uint64_t)ts * 1000000000 / 19200000;
}
206
207static void
208time_elapsed_accumulate_result(struct fd_acc_query *aq, void *buf,
209                               union pipe_query_result *result)
210{
211   struct fd5_query_sample *sp = buf;
212   result->u64 = ticks_to_ns(sp->result);
213}
214
215static void
216timestamp_accumulate_result(struct fd_acc_query *aq, void *buf,
217                            union pipe_query_result *result)
218{
219   struct fd5_query_sample *sp = buf;
220   result->u64 = ticks_to_ns(sp->result);
221}
222
/* PIPE_QUERY_TIME_ELAPSED: accumulates (stop - start) ticks across
 * resume/pause intervals; converted to ns when results are read:
 */
static const struct fd_acc_sample_provider time_elapsed = {
   .query_type = PIPE_QUERY_TIME_ELAPSED,
   .always = true,
   .size = sizeof(struct fd5_query_sample),
   .resume = timestamp_resume,
   .pause = timestamp_pause,
   .result = timestamp_accumulate_result,   /* NOTE: identical body to time_elapsed_accumulate_result */
};
231
232/* NOTE: timestamp query isn't going to give terribly sensible results
233 * on a tiler.  But it is needed by qapitrace profile heatmap.  If you
234 * add in a binning pass, the results get even more non-sensical.  So
235 * we just return the timestamp on the first tile and hope that is
236 * kind of good enough.
237 */
238
/* PIPE_QUERY_TIMESTAMP: reuses the time-elapsed resume/pause machinery
 * (see NOTE above about tiler caveats):
 */
static const struct fd_acc_sample_provider timestamp = {
   .query_type = PIPE_QUERY_TIMESTAMP,
   .always = true,
   .size = sizeof(struct fd5_query_sample),
   .resume = timestamp_resume,
   .pause = timestamp_pause,
   .result = timestamp_accumulate_result,
};
247
248/*
249 * Performance Counter (batch) queries:
250 *
251 * Only one of these is active at a time, per design of the gallium
252 * batch_query API design.  On perfcntr query tracks N query_types,
253 * each of which has a 'fd_batch_query_entry' that maps it back to
254 * the associated group and counter.
255 */
256
/* Maps one tracked query_type back to its HW counter group/countable: */
struct fd_batch_query_entry {
   uint8_t gid; /* group-id */
   uint8_t cid; /* countable-id within the group */
};
261
/* Per batch-query payload, hung off fd_acc_query::query_data: */
struct fd_batch_query_data {
   struct fd_screen *screen;
   unsigned num_query_entries;          /* # of valid query_entries[] */
   struct fd_batch_query_entry query_entries[];  /* flexible array, one per tracked query */
};
267
/* Program the selected countables into each group's counter-select
 * registers, then snapshot the starting counter values into the
 * per-query sample buffer:
 */
static void
perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_batch_query_data *data = aq->query_data;
   struct fd_screen *screen = data->screen;
   struct fd_ringbuffer *ring = batch->draw;

   /* running per-group tally (VLA, one slot per group), used to hand
    * each query the next free counter within its group:
    */
   unsigned counters_per_group[screen->num_perfcntr_groups];
   memset(counters_per_group, 0, sizeof(counters_per_group));

   fd_wfi(batch, ring);

   /* configure performance counters for the requested queries: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;

      /* creation-time validation guarantees we don't run out: */
      assert(counter_idx < g->num_counters);

      OUT_PKT4(ring, g->counters[counter_idx].select_reg, 1);
      OUT_RING(ring, g->countables[entry->cid].selector);
   }

   /* reset the tally so the second pass assigns identical counter
    * indices to each query:
    */
   memset(counters_per_group, 0, sizeof(counters_per_group));

   /* and snapshot the start values */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;
      const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];

      /* copy the 64b counter register pair into sample[i].start: */
      OUT_PKT7(ring, CP_REG_TO_MEM, 3);
      OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                        CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
      OUT_RELOC(ring, query_sample_idx(aq, i, start));
   }
}
307
/* Snapshot the ending counter values and accumulate the per-counter
 * deltas into each sample's 'result' on the CP:
 */
static void
perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_batch_query_data *data = aq->query_data;
   struct fd_screen *screen = data->screen;
   struct fd_ringbuffer *ring = batch->draw;

   /* same per-group tally as in perfcntr_resume(), so each query maps
    * to the same counter index it was assigned there:
    */
   unsigned counters_per_group[screen->num_perfcntr_groups];
   memset(counters_per_group, 0, sizeof(counters_per_group));

   fd_wfi(batch, ring);

   /* TODO do we need to bother to turn anything off? */

   /* snapshot the end values: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;
      const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];

      /* copy the 64b counter register pair into sample[i].stop: */
      OUT_PKT7(ring, CP_REG_TO_MEM, 3);
      OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                        CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
      OUT_RELOC(ring, query_sample_idx(aq, i, stop));
   }

   /* and compute the result: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      /* result += stop - start: */
      OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
      OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
      OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* dst */
      OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* srcA */
      OUT_RELOC(ring, query_sample_idx(aq, i, stop));   /* srcB */
      OUT_RELOC(ring, query_sample_idx(aq, i, start));  /* srcC */
   }
}
346
347static void
348perfcntr_accumulate_result(struct fd_acc_query *aq, void *buf,
349                           union pipe_query_result *result)
350{
351   struct fd_batch_query_data *data = aq->query_data;
352   struct fd5_query_sample *sp = buf;
353
354   for (unsigned i = 0; i < data->num_query_entries; i++) {
355      result->batch[i].u64 = sp[i].result;
356   }
357}
358
/* Batch-query provider.  Note there is no .size here: the sample
 * buffer size depends on the number of tracked queries and is set
 * per-query in fd5_create_batch_query():
 */
static const struct fd_acc_sample_provider perfcntr = {
   .query_type = FD_QUERY_FIRST_PERFCNTR,
   .always = true,
   .resume = perfcntr_resume,
   .pause = perfcntr_pause,
   .result = perfcntr_accumulate_result,
};
366
367static struct pipe_query *
368fd5_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
369                       unsigned *query_types)
370{
371   struct fd_context *ctx = fd_context(pctx);
372   struct fd_screen *screen = ctx->screen;
373   struct fd_query *q;
374   struct fd_acc_query *aq;
375   struct fd_batch_query_data *data;
376
377   data = CALLOC_VARIANT_LENGTH_STRUCT(
378      fd_batch_query_data, num_queries * sizeof(data->query_entries[0]));
379
380   data->screen = screen;
381   data->num_query_entries = num_queries;
382
383   /* validate the requested query_types and ensure we don't try
384    * to request more query_types of a given group than we have
385    * counters:
386    */
387   unsigned counters_per_group[screen->num_perfcntr_groups];
388   memset(counters_per_group, 0, sizeof(counters_per_group));
389
390   for (unsigned i = 0; i < num_queries; i++) {
391      unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR;
392
393      /* verify valid query_type, ie. is it actually a perfcntr? */
394      if ((query_types[i] < FD_QUERY_FIRST_PERFCNTR) ||
395          (idx >= screen->num_perfcntr_queries)) {
396         mesa_loge("invalid batch query query_type: %u", query_types[i]);
397         goto error;
398      }
399
400      struct fd_batch_query_entry *entry = &data->query_entries[i];
401      struct pipe_driver_query_info *pq = &screen->perfcntr_queries[idx];
402
403      entry->gid = pq->group_id;
404
405      /* the perfcntr_queries[] table flattens all the countables
406       * for each group in series, ie:
407       *
408       *   (G0,C0), .., (G0,Cn), (G1,C0), .., (G1,Cm), ...
409       *
410       * So to find the countable index just step back through the
411       * table to find the first entry with the same group-id.
412       */
413      while (pq > screen->perfcntr_queries) {
414         pq--;
415         if (pq->group_id == entry->gid)
416            entry->cid++;
417      }
418
419      if (counters_per_group[entry->gid] >=
420          screen->perfcntr_groups[entry->gid].num_counters) {
421         mesa_loge("too many counters for group %u\n", entry->gid);
422         goto error;
423      }
424
425      counters_per_group[entry->gid]++;
426   }
427
428   q = fd_acc_create_query2(ctx, 0, 0, &perfcntr);
429   aq = fd_acc_query(q);
430
431   /* sample buffer size is based on # of queries: */
432   aq->size = num_queries * sizeof(struct fd5_query_sample);
433   aq->query_data = data;
434
435   return (struct pipe_query *)q;
436
437error:
438   free(data);
439   return NULL;
440}
441
/* Context-creation entry point: plug the shared acc-query machinery
 * into the context and register all a5xx sample providers:
 */
void
fd5_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis
{
   struct fd_context *ctx = fd_context(pctx);

   ctx->create_query = fd_acc_create_query;
   ctx->query_update_batch = fd_acc_query_update_batch;

   pctx->create_batch_query = fd5_create_batch_query;

   fd_acc_query_register_provider(pctx, &occlusion_counter);
   fd_acc_query_register_provider(pctx, &occlusion_predicate);
   fd_acc_query_register_provider(pctx, &occlusion_predicate_conservative);

   fd_acc_query_register_provider(pctx, &time_elapsed);
   fd_acc_query_register_provider(pctx, &timestamp);
}
459