1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org> 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20bf215546Sopenharmony_ci * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21bf215546Sopenharmony_ci * SOFTWARE. 22bf215546Sopenharmony_ci * 23bf215546Sopenharmony_ci * Authors: 24bf215546Sopenharmony_ci * Rob Clark <robclark@freedesktop.org> 25bf215546Sopenharmony_ci */ 26bf215546Sopenharmony_ci 27bf215546Sopenharmony_ci#include "freedreno_context.h" 28bf215546Sopenharmony_ci#include "freedreno_query_hw.h" 29bf215546Sopenharmony_ci#include "freedreno_util.h" 30bf215546Sopenharmony_ci 31bf215546Sopenharmony_ci#include "fd4_context.h" 32bf215546Sopenharmony_ci#include "fd4_draw.h" 33bf215546Sopenharmony_ci#include "fd4_format.h" 34bf215546Sopenharmony_ci#include "fd4_query.h" 35bf215546Sopenharmony_ci 36bf215546Sopenharmony_cistruct fd_rb_samp_ctrs { 37bf215546Sopenharmony_ci uint64_t ctr[16]; 38bf215546Sopenharmony_ci}; 39bf215546Sopenharmony_ci 40bf215546Sopenharmony_ci/* 41bf215546Sopenharmony_ci * Occlusion Query: 42bf215546Sopenharmony_ci * 43bf215546Sopenharmony_ci * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they 44bf215546Sopenharmony_ci * interpret results 45bf215546Sopenharmony_ci */ 46bf215546Sopenharmony_ci 47bf215546Sopenharmony_cistatic struct fd_hw_sample * 48bf215546Sopenharmony_ciocclusion_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring) 49bf215546Sopenharmony_ci{ 50bf215546Sopenharmony_ci struct fd_hw_sample *samp = 51bf215546Sopenharmony_ci fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs)); 52bf215546Sopenharmony_ci 53bf215546Sopenharmony_ci /* low bits of sample addr should be zero (since they are control 54bf215546Sopenharmony_ci * flags in RB_SAMPLE_COUNT_CONTROL): 55bf215546Sopenharmony_ci */ 56bf215546Sopenharmony_ci assert((samp->offset & 0x3) == 0); 57bf215546Sopenharmony_ci 58bf215546Sopenharmony_ci /* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of 59bf215546Sopenharmony_ci * HW_QUERY_BASE_REG register: 60bf215546Sopenharmony_ci */ 61bf215546Sopenharmony_ci OUT_PKT3(ring, CP_SET_CONSTANT, 3); 62bf215546Sopenharmony_ci OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000); 63bf215546Sopenharmony_ci OUT_RING(ring, HW_QUERY_BASE_REG); 64bf215546Sopenharmony_ci OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY | samp->offset); 65bf215546Sopenharmony_ci 66bf215546Sopenharmony_ci OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3); 67bf215546Sopenharmony_ci OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX, 68bf215546Sopenharmony_ci INDEX4_SIZE_32_BIT, USE_VISIBILITY)); 69bf215546Sopenharmony_ci OUT_RING(ring, 1); /* NumInstances */ 70bf215546Sopenharmony_ci OUT_RING(ring, 0); /* NumIndices */ 71bf215546Sopenharmony_ci 72bf215546Sopenharmony_ci fd_event_write(batch, ring, ZPASS_DONE); 73bf215546Sopenharmony_ci 74bf215546Sopenharmony_ci return samp; 75bf215546Sopenharmony_ci} 76bf215546Sopenharmony_ci 77bf215546Sopenharmony_cistatic uint64_t 78bf215546Sopenharmony_cicount_samples(const struct fd_rb_samp_ctrs *start, 79bf215546Sopenharmony_ci const struct fd_rb_samp_ctrs *end) 80bf215546Sopenharmony_ci{ 81bf215546Sopenharmony_ci return end->ctr[0] - start->ctr[0]; 82bf215546Sopenharmony_ci} 83bf215546Sopenharmony_ci 84bf215546Sopenharmony_cistatic void 85bf215546Sopenharmony_ciocclusion_counter_accumulate_result(struct fd_context *ctx, const void *start, 86bf215546Sopenharmony_ci const void *end, 87bf215546Sopenharmony_ci union pipe_query_result *result) 88bf215546Sopenharmony_ci{ 89bf215546Sopenharmony_ci uint64_t n = count_samples(start, end); 90bf215546Sopenharmony_ci result->u64 += n; 91bf215546Sopenharmony_ci} 92bf215546Sopenharmony_ci 93bf215546Sopenharmony_cistatic void 94bf215546Sopenharmony_ciocclusion_predicate_accumulate_result(struct fd_context *ctx, const void *start, 95bf215546Sopenharmony_ci const void *end, 96bf215546Sopenharmony_ci union pipe_query_result *result) 97bf215546Sopenharmony_ci{ 98bf215546Sopenharmony_ci uint64_t n = count_samples(start, end); 99bf215546Sopenharmony_ci result->b |= (n > 0); 100bf215546Sopenharmony_ci} 101bf215546Sopenharmony_ci 102bf215546Sopenharmony_ci/* 103bf215546Sopenharmony_ci * Time Elapsed Query: 104bf215546Sopenharmony_ci * 105bf215546Sopenharmony_ci * Note: we could in theory support timestamp queries, but they 106bf215546Sopenharmony_ci * won't give sensible results for tilers. 107bf215546Sopenharmony_ci */ 108bf215546Sopenharmony_ci 109bf215546Sopenharmony_cistatic void 110bf215546Sopenharmony_citime_elapsed_enable(struct fd_context *ctx, 111bf215546Sopenharmony_ci struct fd_ringbuffer *ring) assert_dt 112bf215546Sopenharmony_ci{ 113bf215546Sopenharmony_ci /* Right now, the assignment of countable to counter register is 114bf215546Sopenharmony_ci * just hard coded. If we start exposing more countables than we 115bf215546Sopenharmony_ci * have counters, we will need to be more clever. 116bf215546Sopenharmony_ci */ 117bf215546Sopenharmony_ci struct fd_batch *batch = fd_context_batch_locked(ctx); 118bf215546Sopenharmony_ci fd_wfi(batch, ring); 119bf215546Sopenharmony_ci OUT_PKT0(ring, REG_A4XX_CP_PERFCTR_CP_SEL_0, 1); 120bf215546Sopenharmony_ci OUT_RING(ring, CP_ALWAYS_COUNT); 121bf215546Sopenharmony_ci fd_batch_unlock_submit(batch); 122bf215546Sopenharmony_ci fd_batch_reference(&batch, NULL); 123bf215546Sopenharmony_ci} 124bf215546Sopenharmony_ci 125bf215546Sopenharmony_cistatic struct fd_hw_sample * 126bf215546Sopenharmony_citime_elapsed_get_sample(struct fd_batch *batch, 127bf215546Sopenharmony_ci struct fd_ringbuffer *ring) assert_dt 128bf215546Sopenharmony_ci{ 129bf215546Sopenharmony_ci struct fd_hw_sample *samp = fd_hw_sample_init(batch, sizeof(uint64_t)); 130bf215546Sopenharmony_ci 131bf215546Sopenharmony_ci /* use unused part of vsc_size_mem as scratch space, to avoid 132bf215546Sopenharmony_ci * extra allocation: 133bf215546Sopenharmony_ci */ 134bf215546Sopenharmony_ci struct fd_bo *scratch_bo = fd4_context(batch->ctx)->vsc_size_mem; 135bf215546Sopenharmony_ci const int sample_off = 128; 136bf215546Sopenharmony_ci const int addr_off = sample_off + 8; 137bf215546Sopenharmony_ci 138bf215546Sopenharmony_ci assert(batch->ctx->screen->max_freq > 0); 139bf215546Sopenharmony_ci 140bf215546Sopenharmony_ci /* Basic issue is that we need to read counter value to a relative 141bf215546Sopenharmony_ci * destination (with per-tile offset) rather than absolute dest 142bf215546Sopenharmony_ci * addr. But there is no pm4 packet that can do that. This is 143bf215546Sopenharmony_ci * where it would be *really* nice if we could write our own fw 144bf215546Sopenharmony_ci * since afaict implementing the sort of packet we need would be 145bf215546Sopenharmony_ci * trivial. 146bf215546Sopenharmony_ci * 147bf215546Sopenharmony_ci * Instead, we: 148bf215546Sopenharmony_ci * (1) CP_REG_TO_MEM to do a 64b copy of counter to scratch buffer 149bf215546Sopenharmony_ci * (2) CP_MEM_WRITE to write per-sample offset to scratch buffer 150bf215546Sopenharmony_ci * (3) CP_REG_TO_MEM w/ accumulate flag to add the per-tile base 151bf215546Sopenharmony_ci * address to the per-sample offset in the scratch buffer 152bf215546Sopenharmony_ci * (4) CP_MEM_TO_REG to copy resulting address from steps #2 and #3 153bf215546Sopenharmony_ci * to CP_ME_NRT_ADDR 154bf215546Sopenharmony_ci * (5) CP_MEM_TO_REG's to copy saved counter value from scratch 155bf215546Sopenharmony_ci * buffer to CP_ME_NRT_DATA to trigger the write out to query 156bf215546Sopenharmony_ci * result buffer 157bf215546Sopenharmony_ci * 158bf215546Sopenharmony_ci * Straightforward, right? 159bf215546Sopenharmony_ci * 160bf215546Sopenharmony_ci * Maybe could swap the order of things in the scratch buffer to 161bf215546Sopenharmony_ci * put address first, and copy back to CP_ME_NRT_ADDR+DATA in one 162bf215546Sopenharmony_ci * shot, but that's really just polishing a turd.. 163bf215546Sopenharmony_ci */ 164bf215546Sopenharmony_ci 165bf215546Sopenharmony_ci fd_wfi(batch, ring); 166bf215546Sopenharmony_ci 167bf215546Sopenharmony_ci /* copy sample counter _LO and _HI to scratch: */ 168bf215546Sopenharmony_ci OUT_PKT3(ring, CP_REG_TO_MEM, 2); 169bf215546Sopenharmony_ci OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) | 170bf215546Sopenharmony_ci CP_REG_TO_MEM_0_64B | 171bf215546Sopenharmony_ci CP_REG_TO_MEM_0_CNT(2)); /* write 2 regs to mem */ 172bf215546Sopenharmony_ci OUT_RELOC(ring, scratch_bo, sample_off, 0, 0); 173bf215546Sopenharmony_ci 174bf215546Sopenharmony_ci /* ok... here we really *would* like to use the CP_SET_CONSTANT 175bf215546Sopenharmony_ci * mode which can add a constant to value in reg2 and write to 176bf215546Sopenharmony_ci * reg1... *but* that only works for banked/context registers, 177bf215546Sopenharmony_ci * and CP_ME_NRT_DATA isn't one of those.. so we need to do some 178bf215546Sopenharmony_ci * CP math to the scratch buffer instead: 179bf215546Sopenharmony_ci * 180bf215546Sopenharmony_ci * (note first 8 bytes are counter value, use offset 0x8 for 181bf215546Sopenharmony_ci * address calculation) 182bf215546Sopenharmony_ci */ 183bf215546Sopenharmony_ci 184bf215546Sopenharmony_ci /* per-sample offset to scratch bo: */ 185bf215546Sopenharmony_ci OUT_PKT3(ring, CP_MEM_WRITE, 2); 186bf215546Sopenharmony_ci OUT_RELOC(ring, scratch_bo, addr_off, 0, 0); 187bf215546Sopenharmony_ci OUT_RING(ring, samp->offset); 188bf215546Sopenharmony_ci 189bf215546Sopenharmony_ci /* now add to that the per-tile base: */ 190bf215546Sopenharmony_ci OUT_PKT3(ring, CP_REG_TO_MEM, 2); 191bf215546Sopenharmony_ci OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) | 192bf215546Sopenharmony_ci CP_REG_TO_MEM_0_ACCUMULATE | 193bf215546Sopenharmony_ci CP_REG_TO_MEM_0_CNT(0)); /* readback 1 regs */ 194bf215546Sopenharmony_ci OUT_RELOC(ring, scratch_bo, addr_off, 0, 0); 195bf215546Sopenharmony_ci 196bf215546Sopenharmony_ci /* now copy that back to CP_ME_NRT_ADDR: */ 197bf215546Sopenharmony_ci OUT_PKT3(ring, CP_MEM_TO_REG, 2); 198bf215546Sopenharmony_ci OUT_RING(ring, REG_A4XX_CP_ME_NRT_ADDR); 199bf215546Sopenharmony_ci OUT_RELOC(ring, scratch_bo, addr_off, 0, 0); 200bf215546Sopenharmony_ci 201bf215546Sopenharmony_ci /* and finally, copy sample from scratch buffer to CP_ME_NRT_DATA 202bf215546Sopenharmony_ci * to trigger the write to result buffer 203bf215546Sopenharmony_ci */ 204bf215546Sopenharmony_ci OUT_PKT3(ring, CP_MEM_TO_REG, 2); 205bf215546Sopenharmony_ci OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA); 206bf215546Sopenharmony_ci OUT_RELOC(ring, scratch_bo, sample_off, 0, 0); 207bf215546Sopenharmony_ci 208bf215546Sopenharmony_ci /* and again to get the value of the _HI reg from scratch: */ 209bf215546Sopenharmony_ci OUT_PKT3(ring, CP_MEM_TO_REG, 2); 210bf215546Sopenharmony_ci OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA); 211bf215546Sopenharmony_ci OUT_RELOC(ring, scratch_bo, sample_off + 0x4, 0, 0); 212bf215546Sopenharmony_ci 213bf215546Sopenharmony_ci /* Sigh.. */ 214bf215546Sopenharmony_ci 215bf215546Sopenharmony_ci return samp; 216bf215546Sopenharmony_ci} 217bf215546Sopenharmony_ci 218bf215546Sopenharmony_cistatic void 219bf215546Sopenharmony_citime_elapsed_accumulate_result(struct fd_context *ctx, const void *start, 220bf215546Sopenharmony_ci const void *end, union pipe_query_result *result) 221bf215546Sopenharmony_ci{ 222bf215546Sopenharmony_ci uint64_t n = *(uint64_t *)end - *(uint64_t *)start; 223bf215546Sopenharmony_ci /* max_freq is in Hz, convert cycle count to ns: */ 224bf215546Sopenharmony_ci result->u64 += n * 1000000000 / ctx->screen->max_freq; 225bf215546Sopenharmony_ci} 226bf215546Sopenharmony_ci 227bf215546Sopenharmony_cistatic void 228bf215546Sopenharmony_citimestamp_accumulate_result(struct fd_context *ctx, const void *start, 229bf215546Sopenharmony_ci const void *end, union pipe_query_result *result) 230bf215546Sopenharmony_ci{ 231bf215546Sopenharmony_ci /* just return the value from fist tile: */ 232bf215546Sopenharmony_ci if (result->u64 != 0) 233bf215546Sopenharmony_ci return; 234bf215546Sopenharmony_ci uint64_t n = *(uint64_t *)start; 235bf215546Sopenharmony_ci /* max_freq is in Hz, convert cycle count to ns: */ 236bf215546Sopenharmony_ci result->u64 = n * 1000000000 / ctx->screen->max_freq; 237bf215546Sopenharmony_ci} 238bf215546Sopenharmony_ci 239bf215546Sopenharmony_cistatic const struct fd_hw_sample_provider occlusion_counter = { 240bf215546Sopenharmony_ci .query_type = PIPE_QUERY_OCCLUSION_COUNTER, 241bf215546Sopenharmony_ci .get_sample = occlusion_get_sample, 242bf215546Sopenharmony_ci .accumulate_result = occlusion_counter_accumulate_result, 243bf215546Sopenharmony_ci}; 244bf215546Sopenharmony_ci 245bf215546Sopenharmony_cistatic const struct fd_hw_sample_provider occlusion_predicate = { 246bf215546Sopenharmony_ci .query_type = PIPE_QUERY_OCCLUSION_PREDICATE, 247bf215546Sopenharmony_ci .get_sample = occlusion_get_sample, 248bf215546Sopenharmony_ci .accumulate_result = occlusion_predicate_accumulate_result, 249bf215546Sopenharmony_ci}; 250bf215546Sopenharmony_ci 251bf215546Sopenharmony_cistatic const struct fd_hw_sample_provider occlusion_predicate_conservative = { 252bf215546Sopenharmony_ci .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE, 253bf215546Sopenharmony_ci .get_sample = occlusion_get_sample, 254bf215546Sopenharmony_ci .accumulate_result = occlusion_predicate_accumulate_result, 255bf215546Sopenharmony_ci}; 256bf215546Sopenharmony_ci 257bf215546Sopenharmony_cistatic const struct fd_hw_sample_provider time_elapsed = { 258bf215546Sopenharmony_ci .query_type = PIPE_QUERY_TIME_ELAPSED, 259bf215546Sopenharmony_ci .always = true, 260bf215546Sopenharmony_ci .enable = time_elapsed_enable, 261bf215546Sopenharmony_ci .get_sample = time_elapsed_get_sample, 262bf215546Sopenharmony_ci .accumulate_result = time_elapsed_accumulate_result, 263bf215546Sopenharmony_ci}; 264bf215546Sopenharmony_ci 265bf215546Sopenharmony_ci/* NOTE: timestamp query isn't going to give terribly sensible results 266bf215546Sopenharmony_ci * on a tiler. But it is needed by qapitrace profile heatmap. If you 267bf215546Sopenharmony_ci * add in a binning pass, the results get even more non-sensical. So 268bf215546Sopenharmony_ci * we just return the timestamp on the first tile and hope that is 269bf215546Sopenharmony_ci * kind of good enough. 270bf215546Sopenharmony_ci */ 271bf215546Sopenharmony_cistatic const struct fd_hw_sample_provider timestamp = { 272bf215546Sopenharmony_ci .query_type = PIPE_QUERY_TIMESTAMP, 273bf215546Sopenharmony_ci .always = true, 274bf215546Sopenharmony_ci .enable = time_elapsed_enable, 275bf215546Sopenharmony_ci .get_sample = time_elapsed_get_sample, 276bf215546Sopenharmony_ci .accumulate_result = timestamp_accumulate_result, 277bf215546Sopenharmony_ci}; 278bf215546Sopenharmony_ci 279bf215546Sopenharmony_civoid 280bf215546Sopenharmony_cifd4_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis 281bf215546Sopenharmony_ci{ 282bf215546Sopenharmony_ci struct fd_context *ctx = fd_context(pctx); 283bf215546Sopenharmony_ci 284bf215546Sopenharmony_ci ctx->create_query = fd_hw_create_query; 285bf215546Sopenharmony_ci ctx->query_prepare = fd_hw_query_prepare; 286bf215546Sopenharmony_ci ctx->query_prepare_tile = fd_hw_query_prepare_tile; 287bf215546Sopenharmony_ci ctx->query_update_batch = fd_hw_query_update_batch; 288bf215546Sopenharmony_ci 289bf215546Sopenharmony_ci fd_hw_query_register_provider(pctx, &occlusion_counter); 290bf215546Sopenharmony_ci fd_hw_query_register_provider(pctx, &occlusion_predicate); 291bf215546Sopenharmony_ci fd_hw_query_register_provider(pctx, &occlusion_predicate_conservative); 292bf215546Sopenharmony_ci fd_hw_query_register_provider(pctx, &time_elapsed); 293bf215546Sopenharmony_ci fd_hw_query_register_provider(pctx, ×tamp); 294bf215546Sopenharmony_ci} 295