/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "freedreno_context.h"
#include "freedreno_query_hw.h"
#include "freedreno_util.h"

#include "fd4_context.h"
#include "fd4_draw.h"
#include "fd4_format.h"
#include "fd4_query.h"

struct fd_rb_samp_ctrs {
   uint64_t ctr[16];
};
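
/* Note: ZPASS_DONE is assumed to dump the RB's block of sample-count
 * counters into this struct; count_samples() below only consumes
 * ctr[0], taken to be the accumulated count of samples passing the
 * Z test.
 */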

/*
 * Occlusion Query:
 *
 * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
 * interpret results
 */
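
/* Roughly how this works: the hw-query core (freedreno_query_hw.c)
 * calls get_sample() to emit a counter snapshot into the sample buffer
 * at the relevant points in each tile, and accumulate_result() is then
 * called with start/end snapshot pairs, so the per-tile deltas sum up
 * to the full result.
 */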

static struct fd_hw_sample *
occlusion_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
   struct fd_hw_sample *samp =
      fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs));

   /* low bits of sample addr should be zero (since they are control
    * flags in RB_SAMPLE_COUNT_CONTROL):
    */
   assert((samp->offset & 0x3) == 0);

   /* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of
    * HW_QUERY_BASE_REG register:
    */
   OUT_PKT3(ring, CP_SET_CONSTANT, 3);
   OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000);
   OUT_RING(ring, HW_QUERY_BASE_REG);
   OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY | samp->offset);

   OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3);
   OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX,
                        INDEX4_SIZE_32_BIT, USE_VISIBILITY));
   OUT_RING(ring, 1); /* NumInstances */
   OUT_RING(ring, 0); /* NumIndices */

   fd_event_write(batch, ring, ZPASS_DONE);

   return samp;
}

static uint64_t
count_samples(const struct fd_rb_samp_ctrs *start,
              const struct fd_rb_samp_ctrs *end)
{
   return end->ctr[0] - start->ctr[0];
}

static void
occlusion_counter_accumulate_result(struct fd_context *ctx, const void *start,
                                    const void *end,
                                    union pipe_query_result *result)
{
   uint64_t n = count_samples(start, end);
   result->u64 += n;
}

static void
occlusion_predicate_accumulate_result(struct fd_context *ctx, const void *start,
                                      const void *end,
                                      union pipe_query_result *result)
{
   uint64_t n = count_samples(start, end);
   result->b |= (n > 0);
}

/*
 * Time Elapsed Query:
 *
 * Note: we could in theory support timestamp queries, but they
 * won't give sensible results for tilers.
 */

static void
time_elapsed_enable(struct fd_context *ctx,
                    struct fd_ringbuffer *ring) assert_dt
{
   /* Right now, the assignment of countable to counter register is
    * just hard coded.  If we start exposing more countables than we
    * have counters, we will need to be more clever.
    */
   struct fd_batch *batch = fd_context_batch_locked(ctx);
   fd_wfi(batch, ring);
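
   /* CP_ALWAYS_COUNT is (as the name suggests) a countable that ticks
    * every CP clock cycle; selecting it on CP counter 0 is what lets
    * time_elapsed_accumulate_result() convert counter deltas to ns
    * via max_freq:
    */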
   OUT_PKT0(ring, REG_A4XX_CP_PERFCTR_CP_SEL_0, 1);
   OUT_RING(ring, CP_ALWAYS_COUNT);
   fd_batch_unlock_submit(batch);
   fd_batch_reference(&batch, NULL);
}

static struct fd_hw_sample *
time_elapsed_get_sample(struct fd_batch *batch,
                        struct fd_ringbuffer *ring) assert_dt
{
   struct fd_hw_sample *samp = fd_hw_sample_init(batch, sizeof(uint64_t));

   /* use unused part of vsc_size_mem as scratch space, to avoid
    * extra allocation:
    */
   struct fd_bo *scratch_bo = fd4_context(batch->ctx)->vsc_size_mem;
   const int sample_off = 128;
   const int addr_off = sample_off + 8;
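
   /* Scratch layout within vsc_size_mem (sample_off is presumably just
    * picked to stay clear of the part the VSC actually uses):
    *   sample_off + 0: saved counter value _LO
    *   sample_off + 4: saved counter value _HI
    *   addr_off   + 0: computed per-tile result address
    */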

   assert(batch->ctx->screen->max_freq > 0);

   /* Basic issue is that we need to read counter value to a relative
    * destination (with per-tile offset) rather than absolute dest
    * addr.  But there is no pm4 packet that can do that.  This is
    * where it would be *really* nice if we could write our own fw
    * since afaict implementing the sort of packet we need would be
    * trivial.
    *
    * Instead, we:
    * (1) CP_REG_TO_MEM to do a 64b copy of counter to scratch buffer
    * (2) CP_MEM_WRITE to write per-sample offset to scratch buffer
    * (3) CP_REG_TO_MEM w/ accumulate flag to add the per-tile base
    *     address to the per-sample offset in the scratch buffer
    * (4) CP_MEM_TO_REG to copy resulting address from steps #2 and #3
    *     to CP_ME_NRT_ADDR
    * (5) CP_MEM_TO_REG's to copy saved counter value from scratch
    *     buffer to CP_ME_NRT_DATA to trigger the write out to query
    *     result buffer
    *
    * Straightforward, right?
    *
    * Maybe could swap the order of things in the scratch buffer to
    * put address first, and copy back to CP_ME_NRT_ADDR+DATA in one
    * shot, but that's really just polishing a turd..
    */

   fd_wfi(batch, ring);

   /* (1) copy sample counter _LO and _HI to scratch: */
   OUT_PKT3(ring, CP_REG_TO_MEM, 2);
   OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) |
                  CP_REG_TO_MEM_0_64B |
                  CP_REG_TO_MEM_0_CNT(2-1)); /* write 2 regs to mem */
   OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);

   /* ok... here we really *would* like to use the CP_SET_CONSTANT
    * mode which can add a constant to value in reg2 and write to
    * reg1... *but* that only works for banked/context registers,
    * and CP_ME_NRT_DATA isn't one of those.. so we need to do some
    * CP math to the scratch buffer instead:
    *
    * (note first 8 bytes are counter value, use offset 0x8 for
    * address calculation)
    */

   /* (2) per-sample offset to scratch bo: */
   OUT_PKT3(ring, CP_MEM_WRITE, 2);
   OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);
   OUT_RING(ring, samp->offset);

   /* (3) now add to that the per-tile base: */
   OUT_PKT3(ring, CP_REG_TO_MEM, 2);
   OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) |
                  CP_REG_TO_MEM_0_ACCUMULATE |
                  CP_REG_TO_MEM_0_CNT(1-1)); /* read back 1 reg */
   OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);

   /* (4) now copy that back to CP_ME_NRT_ADDR: */
   OUT_PKT3(ring, CP_MEM_TO_REG, 2);
   OUT_RING(ring, REG_A4XX_CP_ME_NRT_ADDR);
   OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);

   /* (5) and finally, copy sample from scratch buffer to CP_ME_NRT_DATA
    * to trigger the write to result buffer
    */
   OUT_PKT3(ring, CP_MEM_TO_REG, 2);
   OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
   OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);

   /* and again to get the value of the _HI reg from scratch: */
   OUT_PKT3(ring, CP_MEM_TO_REG, 2);
   OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
   OUT_RELOC(ring, scratch_bo, sample_off + 0x4, 0, 0);

   /* Sigh.. */

   return samp;
}

static void
time_elapsed_accumulate_result(struct fd_context *ctx, const void *start,
                               const void *end, union pipe_query_result *result)
{
   uint64_t n = *(uint64_t *)end - *(uint64_t *)start;
   /* max_freq is in Hz, convert cycle count to ns (eg. at 500MHz,
    * each cycle is 2ns):
    */
   result->u64 += n * 1000000000 / ctx->screen->max_freq;
}

static void
timestamp_accumulate_result(struct fd_context *ctx, const void *start,
                            const void *end, union pipe_query_result *result)
{
   /* just return the value from the first tile: */
   if (result->u64 != 0)
      return;
   uint64_t n = *(uint64_t *)start;
   /* max_freq is in Hz, convert cycle count to ns: */
   result->u64 = n * 1000000000 / ctx->screen->max_freq;
}

static const struct fd_hw_sample_provider occlusion_counter = {
   .query_type = PIPE_QUERY_OCCLUSION_COUNTER,
   .get_sample = occlusion_get_sample,
   .accumulate_result = occlusion_counter_accumulate_result,
};

static const struct fd_hw_sample_provider occlusion_predicate = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
   .get_sample = occlusion_get_sample,
   .accumulate_result = occlusion_predicate_accumulate_result,
};

static const struct fd_hw_sample_provider occlusion_predicate_conservative = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE,
   .get_sample = occlusion_get_sample,
   .accumulate_result = occlusion_predicate_accumulate_result,
};

static const struct fd_hw_sample_provider time_elapsed = {
   .query_type = PIPE_QUERY_TIME_ELAPSED,
   .always = true,
   .enable = time_elapsed_enable,
   .get_sample = time_elapsed_get_sample,
   .accumulate_result = time_elapsed_accumulate_result,
};

/* NOTE: timestamp query isn't going to give terribly sensible results
 * on a tiler.  But it is needed by qapitrace profile heatmap.  If you
 * add in a binning pass, the results get even more non-sensical.  So
 * we just return the timestamp on the first tile and hope that is
 * kind of good enough.
 */
static const struct fd_hw_sample_provider timestamp = {
   .query_type = PIPE_QUERY_TIMESTAMP,
   .always = true,
   .enable = time_elapsed_enable,
   .get_sample = time_elapsed_get_sample,
   .accumulate_result = timestamp_accumulate_result,
};

void
fd4_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis
{
   struct fd_context *ctx = fd_context(pctx);

   ctx->create_query = fd_hw_create_query;
   ctx->query_prepare = fd_hw_query_prepare;
   ctx->query_prepare_tile = fd_hw_query_prepare_tile;
   ctx->query_update_batch = fd_hw_query_update_batch;

   fd_hw_query_register_provider(pctx, &occlusion_counter);
   fd_hw_query_register_provider(pctx, &occlusion_predicate);
   fd_hw_query_register_provider(pctx, &occlusion_predicate_conservative);
   fd_hw_query_register_provider(pctx, &time_elapsed);
   fd_hw_query_register_provider(pctx, &timestamp);
}