1/* 2 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 * 23 * Authors: 24 * Rob Clark <robclark@freedesktop.org> 25 */ 26 27#include "freedreno_context.h" 28#include "freedreno_query_hw.h" 29#include "freedreno_util.h" 30 31#include "fd4_context.h" 32#include "fd4_draw.h" 33#include "fd4_format.h" 34#include "fd4_query.h" 35 36struct fd_rb_samp_ctrs { 37 uint64_t ctr[16]; 38}; 39 40/* 41 * Occlusion Query: 42 * 43 * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they 44 * interpret results 45 */ 46 47static struct fd_hw_sample * 48occlusion_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring) 49{ 50 struct fd_hw_sample *samp = 51 fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs)); 52 53 /* low bits of sample addr should be zero (since they are control 54 * flags in RB_SAMPLE_COUNT_CONTROL): 55 */ 56 assert((samp->offset & 0x3) == 0); 57 58 /* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of 59 * HW_QUERY_BASE_REG register: 60 */ 61 OUT_PKT3(ring, CP_SET_CONSTANT, 3); 62 OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000); 63 OUT_RING(ring, HW_QUERY_BASE_REG); 64 OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY | samp->offset); 65 66 OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3); 67 OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX, 68 INDEX4_SIZE_32_BIT, USE_VISIBILITY)); 69 OUT_RING(ring, 1); /* NumInstances */ 70 OUT_RING(ring, 0); /* NumIndices */ 71 72 fd_event_write(batch, ring, ZPASS_DONE); 73 74 return samp; 75} 76 77static uint64_t 78count_samples(const struct fd_rb_samp_ctrs *start, 79 const struct fd_rb_samp_ctrs *end) 80{ 81 return end->ctr[0] - start->ctr[0]; 82} 83 84static void 85occlusion_counter_accumulate_result(struct fd_context *ctx, const void *start, 86 const void *end, 87 union pipe_query_result *result) 88{ 89 uint64_t n = count_samples(start, end); 90 result->u64 += n; 91} 92 93static void 94occlusion_predicate_accumulate_result(struct fd_context *ctx, const void *start, 95 const void *end, 96 union pipe_query_result *result) 97{ 98 uint64_t n = count_samples(start, end); 99 result->b |= (n > 0); 100} 101 102/* 103 * Time Elapsed Query: 104 * 105 * Note: we could in theory support timestamp queries, but they 106 * won't give sensible results for tilers. 107 */ 108 109static void 110time_elapsed_enable(struct fd_context *ctx, 111 struct fd_ringbuffer *ring) assert_dt 112{ 113 /* Right now, the assignment of countable to counter register is 114 * just hard coded. If we start exposing more countables than we 115 * have counters, we will need to be more clever. 116 */ 117 struct fd_batch *batch = fd_context_batch_locked(ctx); 118 fd_wfi(batch, ring); 119 OUT_PKT0(ring, REG_A4XX_CP_PERFCTR_CP_SEL_0, 1); 120 OUT_RING(ring, CP_ALWAYS_COUNT); 121 fd_batch_unlock_submit(batch); 122 fd_batch_reference(&batch, NULL); 123} 124 125static struct fd_hw_sample * 126time_elapsed_get_sample(struct fd_batch *batch, 127 struct fd_ringbuffer *ring) assert_dt 128{ 129 struct fd_hw_sample *samp = fd_hw_sample_init(batch, sizeof(uint64_t)); 130 131 /* use unused part of vsc_size_mem as scratch space, to avoid 132 * extra allocation: 133 */ 134 struct fd_bo *scratch_bo = fd4_context(batch->ctx)->vsc_size_mem; 135 const int sample_off = 128; 136 const int addr_off = sample_off + 8; 137 138 assert(batch->ctx->screen->max_freq > 0); 139 140 /* Basic issue is that we need to read counter value to a relative 141 * destination (with per-tile offset) rather than absolute dest 142 * addr. But there is no pm4 packet that can do that. This is 143 * where it would be *really* nice if we could write our own fw 144 * since afaict implementing the sort of packet we need would be 145 * trivial. 146 * 147 * Instead, we: 148 * (1) CP_REG_TO_MEM to do a 64b copy of counter to scratch buffer 149 * (2) CP_MEM_WRITE to write per-sample offset to scratch buffer 150 * (3) CP_REG_TO_MEM w/ accumulate flag to add the per-tile base 151 * address to the per-sample offset in the scratch buffer 152 * (4) CP_MEM_TO_REG to copy resulting address from steps #2 and #3 153 * to CP_ME_NRT_ADDR 154 * (5) CP_MEM_TO_REG's to copy saved counter value from scratch 155 * buffer to CP_ME_NRT_DATA to trigger the write out to query 156 * result buffer 157 * 158 * Straightforward, right? 159 * 160 * Maybe could swap the order of things in the scratch buffer to 161 * put address first, and copy back to CP_ME_NRT_ADDR+DATA in one 162 * shot, but that's really just polishing a turd.. 163 */ 164 165 fd_wfi(batch, ring); 166 167 /* copy sample counter _LO and _HI to scratch: */ 168 OUT_PKT3(ring, CP_REG_TO_MEM, 2); 169 OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) | 170 CP_REG_TO_MEM_0_64B | 171 CP_REG_TO_MEM_0_CNT(2)); /* write 2 regs to mem */ 172 OUT_RELOC(ring, scratch_bo, sample_off, 0, 0); 173 174 /* ok... here we really *would* like to use the CP_SET_CONSTANT 175 * mode which can add a constant to value in reg2 and write to 176 * reg1... *but* that only works for banked/context registers, 177 * and CP_ME_NRT_DATA isn't one of those.. so we need to do some 178 * CP math to the scratch buffer instead: 179 * 180 * (note first 8 bytes are counter value, use offset 0x8 for 181 * address calculation) 182 */ 183 184 /* per-sample offset to scratch bo: */ 185 OUT_PKT3(ring, CP_MEM_WRITE, 2); 186 OUT_RELOC(ring, scratch_bo, addr_off, 0, 0); 187 OUT_RING(ring, samp->offset); 188 189 /* now add to that the per-tile base: */ 190 OUT_PKT3(ring, CP_REG_TO_MEM, 2); 191 OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) | 192 CP_REG_TO_MEM_0_ACCUMULATE | 193 CP_REG_TO_MEM_0_CNT(0)); /* readback 1 regs */ 194 OUT_RELOC(ring, scratch_bo, addr_off, 0, 0); 195 196 /* now copy that back to CP_ME_NRT_ADDR: */ 197 OUT_PKT3(ring, CP_MEM_TO_REG, 2); 198 OUT_RING(ring, REG_A4XX_CP_ME_NRT_ADDR); 199 OUT_RELOC(ring, scratch_bo, addr_off, 0, 0); 200 201 /* and finally, copy sample from scratch buffer to CP_ME_NRT_DATA 202 * to trigger the write to result buffer 203 */ 204 OUT_PKT3(ring, CP_MEM_TO_REG, 2); 205 OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA); 206 OUT_RELOC(ring, scratch_bo, sample_off, 0, 0); 207 208 /* and again to get the value of the _HI reg from scratch: */ 209 OUT_PKT3(ring, CP_MEM_TO_REG, 2); 210 OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA); 211 OUT_RELOC(ring, scratch_bo, sample_off + 0x4, 0, 0); 212 213 /* Sigh.. */ 214 215 return samp; 216} 217 218static void 219time_elapsed_accumulate_result(struct fd_context *ctx, const void *start, 220 const void *end, union pipe_query_result *result) 221{ 222 uint64_t n = *(uint64_t *)end - *(uint64_t *)start; 223 /* max_freq is in Hz, convert cycle count to ns: */ 224 result->u64 += n * 1000000000 / ctx->screen->max_freq; 225} 226 227static void 228timestamp_accumulate_result(struct fd_context *ctx, const void *start, 229 const void *end, union pipe_query_result *result) 230{ 231 /* just return the value from fist tile: */ 232 if (result->u64 != 0) 233 return; 234 uint64_t n = *(uint64_t *)start; 235 /* max_freq is in Hz, convert cycle count to ns: */ 236 result->u64 = n * 1000000000 / ctx->screen->max_freq; 237} 238 239static const struct fd_hw_sample_provider occlusion_counter = { 240 .query_type = PIPE_QUERY_OCCLUSION_COUNTER, 241 .get_sample = occlusion_get_sample, 242 .accumulate_result = occlusion_counter_accumulate_result, 243}; 244 245static const struct fd_hw_sample_provider occlusion_predicate = { 246 .query_type = PIPE_QUERY_OCCLUSION_PREDICATE, 247 .get_sample = occlusion_get_sample, 248 .accumulate_result = occlusion_predicate_accumulate_result, 249}; 250 251static const struct fd_hw_sample_provider occlusion_predicate_conservative = { 252 .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE, 253 .get_sample = occlusion_get_sample, 254 .accumulate_result = occlusion_predicate_accumulate_result, 255}; 256 257static const struct fd_hw_sample_provider time_elapsed = { 258 .query_type = PIPE_QUERY_TIME_ELAPSED, 259 .always = true, 260 .enable = time_elapsed_enable, 261 .get_sample = time_elapsed_get_sample, 262 .accumulate_result = time_elapsed_accumulate_result, 263}; 264 265/* NOTE: timestamp query isn't going to give terribly sensible results 266 * on a tiler. But it is needed by qapitrace profile heatmap. If you 267 * add in a binning pass, the results get even more non-sensical. So 268 * we just return the timestamp on the first tile and hope that is 269 * kind of good enough. 270 */ 271static const struct fd_hw_sample_provider timestamp = { 272 .query_type = PIPE_QUERY_TIMESTAMP, 273 .always = true, 274 .enable = time_elapsed_enable, 275 .get_sample = time_elapsed_get_sample, 276 .accumulate_result = timestamp_accumulate_result, 277}; 278 279void 280fd4_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis 281{ 282 struct fd_context *ctx = fd_context(pctx); 283 284 ctx->create_query = fd_hw_create_query; 285 ctx->query_prepare = fd_hw_query_prepare; 286 ctx->query_prepare_tile = fd_hw_query_prepare_tile; 287 ctx->query_update_batch = fd_hw_query_update_batch; 288 289 fd_hw_query_register_provider(pctx, &occlusion_counter); 290 fd_hw_query_register_provider(pctx, &occlusion_predicate); 291 fd_hw_query_register_provider(pctx, &occlusion_predicate_conservative); 292 fd_hw_query_register_provider(pctx, &time_elapsed); 293 fd_hw_query_register_provider(pctx, ×tamp); 294} 295