1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
13bf215546Sopenharmony_ci * Software.
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20bf215546Sopenharmony_ci * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21bf215546Sopenharmony_ci * SOFTWARE.
22bf215546Sopenharmony_ci *
23bf215546Sopenharmony_ci * Authors:
24bf215546Sopenharmony_ci *    Rob Clark <robclark@freedesktop.org>
25bf215546Sopenharmony_ci */
26bf215546Sopenharmony_ci
27bf215546Sopenharmony_ci#include "freedreno_context.h"
28bf215546Sopenharmony_ci#include "freedreno_query_hw.h"
29bf215546Sopenharmony_ci#include "freedreno_util.h"
30bf215546Sopenharmony_ci
31bf215546Sopenharmony_ci#include "fd4_context.h"
32bf215546Sopenharmony_ci#include "fd4_draw.h"
33bf215546Sopenharmony_ci#include "fd4_format.h"
34bf215546Sopenharmony_ci#include "fd4_query.h"
35bf215546Sopenharmony_ci
/* Layout of one occlusion-query sample: sixteen 64-bit counters.
 * occlusion_get_sample() sizes its sample with this struct, and
 * count_samples() only ever reads ctr[0].
 */
struct fd_rb_samp_ctrs {
   uint64_t ctr[16];
};
39bf215546Sopenharmony_ci
40bf215546Sopenharmony_ci/*
41bf215546Sopenharmony_ci * Occlusion Query:
42bf215546Sopenharmony_ci *
43bf215546Sopenharmony_ci * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
44bf215546Sopenharmony_ci * interpret results
45bf215546Sopenharmony_ci */
46bf215546Sopenharmony_ci
/* Emit into `ring` the commands that capture one occlusion-query sample.
 *
 * A sample of sizeof(struct fd_rb_samp_ctrs) bytes is obtained from
 * fd_hw_sample_init(), RB_SAMPLE_COUNT_CONTROL is programmed with the
 * sample's offset, a zero-index draw is issued and a ZPASS_DONE event is
 * written.  Returns the sample from fd_hw_sample_init().
 */
static struct fd_hw_sample *
occlusion_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
   struct fd_hw_sample *samp =
      fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs));

   /* low bits of sample addr should be zero (since they are control
    * flags in RB_SAMPLE_COUNT_CONTROL):
    */
   assert((samp->offset & 0x3) == 0);

   /* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of
    * HW_QUERY_BASE_REG register:
    *
    * (the register actually written here is RB_SAMPLE_COUNT_CONTROL, with
    * the COPY flag or'd into the offset)
    */
   OUT_PKT3(ring, CP_SET_CONSTANT, 3);
   OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000);
   OUT_RING(ring, HW_QUERY_BASE_REG);
   OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY | samp->offset);

   /* Point-list draw with one instance and zero indices.
    * NOTE(review): presumably needed so the sample-count setup above
    * takes effect -- confirm against hw documentation.
    */
   OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3);
   OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX,
                        INDEX4_SIZE_32_BIT, USE_VISIBILITY));
   OUT_RING(ring, 1); /* NumInstances */
   OUT_RING(ring, 0); /* NumIndices */

   fd_event_write(batch, ring, ZPASS_DONE);

   return samp;
}
76bf215546Sopenharmony_ci
77bf215546Sopenharmony_cistatic uint64_t
78bf215546Sopenharmony_cicount_samples(const struct fd_rb_samp_ctrs *start,
79bf215546Sopenharmony_ci              const struct fd_rb_samp_ctrs *end)
80bf215546Sopenharmony_ci{
81bf215546Sopenharmony_ci   return end->ctr[0] - start->ctr[0];
82bf215546Sopenharmony_ci}
83bf215546Sopenharmony_ci
84bf215546Sopenharmony_cistatic void
85bf215546Sopenharmony_ciocclusion_counter_accumulate_result(struct fd_context *ctx, const void *start,
86bf215546Sopenharmony_ci                                    const void *end,
87bf215546Sopenharmony_ci                                    union pipe_query_result *result)
88bf215546Sopenharmony_ci{
89bf215546Sopenharmony_ci   uint64_t n = count_samples(start, end);
90bf215546Sopenharmony_ci   result->u64 += n;
91bf215546Sopenharmony_ci}
92bf215546Sopenharmony_ci
93bf215546Sopenharmony_cistatic void
94bf215546Sopenharmony_ciocclusion_predicate_accumulate_result(struct fd_context *ctx, const void *start,
95bf215546Sopenharmony_ci                                      const void *end,
96bf215546Sopenharmony_ci                                      union pipe_query_result *result)
97bf215546Sopenharmony_ci{
98bf215546Sopenharmony_ci   uint64_t n = count_samples(start, end);
99bf215546Sopenharmony_ci   result->b |= (n > 0);
100bf215546Sopenharmony_ci}
101bf215546Sopenharmony_ci
102bf215546Sopenharmony_ci/*
103bf215546Sopenharmony_ci * Time Elapsed Query:
104bf215546Sopenharmony_ci *
 * Note: timestamp queries reuse this sampling path, but they
 * won't give sensible results for tilers.
107bf215546Sopenharmony_ci */
108bf215546Sopenharmony_ci
/* Enable callback shared by the time-elapsed and timestamp providers.
 *
 * Takes the context's batch via fd_context_batch_locked(), emits a WFI
 * followed by a write of CP_ALWAYS_COUNT to CP_PERFCTR_CP_SEL_0 into
 * `ring`, then calls fd_batch_unlock_submit() and drops the batch
 * reference.
 */
static void
time_elapsed_enable(struct fd_context *ctx,
                    struct fd_ringbuffer *ring) assert_dt
{
   /* Right now, the assignment of countable to counter register is
    * just hard coded.  If we start exposing more countables than we
    * have counters, we will need to be more clever.
    */
   struct fd_batch *batch = fd_context_batch_locked(ctx);
   fd_wfi(batch, ring);
   OUT_PKT0(ring, REG_A4XX_CP_PERFCTR_CP_SEL_0, 1);
   OUT_RING(ring, CP_ALWAYS_COUNT);
   fd_batch_unlock_submit(batch);
   /* release the reference held in the local `batch` pointer: */
   fd_batch_reference(&batch, NULL);
}
124bf215546Sopenharmony_ci
/* Emit into `ring` the commands that snapshot the 64-bit CP perf counter
 * (RBBM_PERFCTR_CP_0_LO/_HI) into a new sizeof(uint64_t) sample.
 *
 * The counter value and the computed destination address are staged in
 * the context's vsc_size_mem buffer (at byte offsets 128 and 136) before
 * being pushed through CP_ME_NRT_ADDR / CP_ME_NRT_DATA; see the long
 * comment in the body for the reasoning.  Returns the sample from
 * fd_hw_sample_init().
 */
static struct fd_hw_sample *
time_elapsed_get_sample(struct fd_batch *batch,
                        struct fd_ringbuffer *ring) assert_dt
{
   struct fd_hw_sample *samp = fd_hw_sample_init(batch, sizeof(uint64_t));

   /* use unused part of vsc_size_mem as scratch space, to avoid
    * extra allocation:
    */
   struct fd_bo *scratch_bo = fd4_context(batch->ctx)->vsc_size_mem;
   const int sample_off = 128;            /* 8 bytes: counter _LO, _HI */
   const int addr_off = sample_off + 8;   /* destination address */

   /* the accumulate_result callbacks divide by max_freq: */
   assert(batch->ctx->screen->max_freq > 0);

   /* Basic issue is that we need to read counter value to a relative
    * destination (with per-tile offset) rather than absolute dest
    * addr.  But there is no pm4 packet that can do that.  This is
    * where it would be *really* nice if we could write our own fw
    * since afaict implementing the sort of packet we need would be
    * trivial.
    *
    * Instead, we:
    * (1) CP_REG_TO_MEM to do a 64b copy of counter to scratch buffer
    * (2) CP_MEM_WRITE to write per-sample offset to scratch buffer
    * (3) CP_REG_TO_MEM w/ accumulate flag to add the per-tile base
    *     address to the per-sample offset in the scratch buffer
    * (4) CP_MEM_TO_REG to copy resulting address from steps #2 and #3
    *     to CP_ME_NRT_ADDR
    * (5) CP_MEM_TO_REG's to copy saved counter value from scratch
    *     buffer to CP_ME_NRT_DATA to trigger the write out to query
    *     result buffer
    *
    * Straightforward, right?
    *
    * Maybe could swap the order of things in the scratch buffer to
    * put address first, and copy back to CP_ME_NRT_ADDR+DATA in one
    * shot, but that's really just polishing a turd..
    */

   fd_wfi(batch, ring);

   /* (1) copy sample counter _LO and _HI to scratch: */
   OUT_PKT3(ring, CP_REG_TO_MEM, 2);
   OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) |
                     CP_REG_TO_MEM_0_64B |
                     CP_REG_TO_MEM_0_CNT(2)); /* write 2 regs to mem */
   OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);

   /* ok... here we really *would* like to use the CP_SET_CONSTANT
    * mode which can add a constant to value in reg2 and write to
    * reg1... *but* that only works for banked/context registers,
    * and CP_ME_NRT_DATA isn't one of those.. so we need to do some
    * CP math to the scratch buffer instead:
    *
    * (note first 8 bytes are counter value, use offset 0x8 for
    * address calculation)
    */

   /* (2) per-sample offset to scratch bo: */
   OUT_PKT3(ring, CP_MEM_WRITE, 2);
   OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);
   OUT_RING(ring, samp->offset);

   /* (3) now add to that the per-tile base: */
   OUT_PKT3(ring, CP_REG_TO_MEM, 2);
   OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) |
                     CP_REG_TO_MEM_0_ACCUMULATE |
                     CP_REG_TO_MEM_0_CNT(0)); /* readback 1 regs */
   OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);

   /* (4) now copy that back to CP_ME_NRT_ADDR: */
   OUT_PKT3(ring, CP_MEM_TO_REG, 2);
   OUT_RING(ring, REG_A4XX_CP_ME_NRT_ADDR);
   OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);

   /* (5) and finally, copy sample from scratch buffer to CP_ME_NRT_DATA
    * to trigger the write to result buffer
    */
   OUT_PKT3(ring, CP_MEM_TO_REG, 2);
   OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
   OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);

   /* and again to get the value of the _HI reg from scratch: */
   OUT_PKT3(ring, CP_MEM_TO_REG, 2);
   OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
   OUT_RELOC(ring, scratch_bo, sample_off + 0x4, 0, 0);

   /* Sigh.. */

   return samp;
}
217bf215546Sopenharmony_ci
218bf215546Sopenharmony_cistatic void
219bf215546Sopenharmony_citime_elapsed_accumulate_result(struct fd_context *ctx, const void *start,
220bf215546Sopenharmony_ci                               const void *end, union pipe_query_result *result)
221bf215546Sopenharmony_ci{
222bf215546Sopenharmony_ci   uint64_t n = *(uint64_t *)end - *(uint64_t *)start;
223bf215546Sopenharmony_ci   /* max_freq is in Hz, convert cycle count to ns: */
224bf215546Sopenharmony_ci   result->u64 += n * 1000000000 / ctx->screen->max_freq;
225bf215546Sopenharmony_ci}
226bf215546Sopenharmony_ci
227bf215546Sopenharmony_cistatic void
228bf215546Sopenharmony_citimestamp_accumulate_result(struct fd_context *ctx, const void *start,
229bf215546Sopenharmony_ci                            const void *end, union pipe_query_result *result)
230bf215546Sopenharmony_ci{
231bf215546Sopenharmony_ci   /* just return the value from fist tile: */
232bf215546Sopenharmony_ci   if (result->u64 != 0)
233bf215546Sopenharmony_ci      return;
234bf215546Sopenharmony_ci   uint64_t n = *(uint64_t *)start;
235bf215546Sopenharmony_ci   /* max_freq is in Hz, convert cycle count to ns: */
236bf215546Sopenharmony_ci   result->u64 = n * 1000000000 / ctx->screen->max_freq;
237bf215546Sopenharmony_ci}
238bf215546Sopenharmony_ci
/* PIPE_QUERY_OCCLUSION_COUNTER: result is the accumulated sample count. */
static const struct fd_hw_sample_provider occlusion_counter = {
   .query_type = PIPE_QUERY_OCCLUSION_COUNTER,
   .get_sample = occlusion_get_sample,
   .accumulate_result = occlusion_counter_accumulate_result,
};
244bf215546Sopenharmony_ci
/* PIPE_QUERY_OCCLUSION_PREDICATE: same sampling as the counter, but the
 * result is a boolean (true if any samples were counted).
 */
static const struct fd_hw_sample_provider occlusion_predicate = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
   .get_sample = occlusion_get_sample,
   .accumulate_result = occlusion_predicate_accumulate_result,
};
250bf215546Sopenharmony_ci
/* PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: handled with exactly the
 * same callbacks as the non-conservative predicate.
 */
static const struct fd_hw_sample_provider occlusion_predicate_conservative = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE,
   .get_sample = occlusion_get_sample,
   .accumulate_result = occlusion_predicate_accumulate_result,
};
256bf215546Sopenharmony_ci
/* PIPE_QUERY_TIME_ELAPSED: result is the accumulated counter delta
 * converted to nanoseconds.
 *
 * NOTE(review): `.always` presumably keeps the query sampling outside of
 * draws -- confirm against the fd_hw_sample_provider declaration.
 */
static const struct fd_hw_sample_provider time_elapsed = {
   .query_type = PIPE_QUERY_TIME_ELAPSED,
   .always = true,
   .enable = time_elapsed_enable,
   .get_sample = time_elapsed_get_sample,
   .accumulate_result = time_elapsed_accumulate_result,
};
264bf215546Sopenharmony_ci
/* NOTE: timestamp query isn't going to give terribly sensible results
 * on a tiler.  But it is needed by qapitrace profile heatmap.  If you
 * add in a binning pass, the results get even more nonsensical.  So
 * we just return the timestamp on the first tile and hope that is
 * kind of good enough.
 *
 * Sampling is shared with the time-elapsed provider; only the
 * accumulate_result callback differs.
 */
static const struct fd_hw_sample_provider timestamp = {
   .query_type = PIPE_QUERY_TIMESTAMP,
   .always = true,
   .enable = time_elapsed_enable,
   .get_sample = time_elapsed_get_sample,
   .accumulate_result = timestamp_accumulate_result,
};
278bf215546Sopenharmony_ci
279bf215546Sopenharmony_civoid
280bf215546Sopenharmony_cifd4_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis
281bf215546Sopenharmony_ci{
282bf215546Sopenharmony_ci   struct fd_context *ctx = fd_context(pctx);
283bf215546Sopenharmony_ci
284bf215546Sopenharmony_ci   ctx->create_query = fd_hw_create_query;
285bf215546Sopenharmony_ci   ctx->query_prepare = fd_hw_query_prepare;
286bf215546Sopenharmony_ci   ctx->query_prepare_tile = fd_hw_query_prepare_tile;
287bf215546Sopenharmony_ci   ctx->query_update_batch = fd_hw_query_update_batch;
288bf215546Sopenharmony_ci
289bf215546Sopenharmony_ci   fd_hw_query_register_provider(pctx, &occlusion_counter);
290bf215546Sopenharmony_ci   fd_hw_query_register_provider(pctx, &occlusion_predicate);
291bf215546Sopenharmony_ci   fd_hw_query_register_provider(pctx, &occlusion_predicate_conservative);
292bf215546Sopenharmony_ci   fd_hw_query_register_provider(pctx, &time_elapsed);
293bf215546Sopenharmony_ci   fd_hw_query_register_provider(pctx, &timestamp);
294bf215546Sopenharmony_ci}
295