1/* 2 * Copyright © 2020 Google, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 */ 23 24#ifndef _U_TRACE_H 25#define _U_TRACE_H 26 27#include <stdbool.h> 28#include <stdint.h> 29#include <stdio.h> 30 31#include "util/u_queue.h" 32 33#ifdef __cplusplus 34extern "C" { 35#endif 36 37/* A trace mechanism (very) loosely inspired by the linux kernel tracepoint 38 * mechanism, in that it allows for defining driver specific (or common) 39 * tracepoints, which generate 'trace_$name()' functions that can be 40 * called at various points in commandstream emit. 41 * 42 * Currently a printf backend is implemented, but the expectation is to 43 * also implement a perfetto backend for shipping out traces to a tool like 44 * AGI. 45 * 46 * Notable differences: 47 * 48 * - GPU timestamps! A driver provided callback is used to emit timestamps 49 * to a buffer. At a later point in time (when stalling to wait for the 50 * GPU is not required), the timestamps are re-united with the trace 51 * payload. This makes the trace mechanism suitable for profiling. 52 * 53 * - Instead of a systemwide trace ringbuffer, buffering of un-retired 54 * tracepoints is split into two stages. Traces are emitted to a 55 * 'u_trace' instance, and at a later time flushed to a 'u_trace_context' 56 * instance. This avoids the requirement that commandstream containing 57 * tracepoints is emitted in the same order as it is generated. 58 * 59 * If the hw has multiple parallel "engines" (for example, 3d/blit/compute) 60 * then a `u_trace_context` per-engine should be used. 61 * 62 * - Unlike kernel tracepoints, u_trace tracepoints are defined in py 63 * from which header and src files are generated. Since we already have 64 * a build dependency on python+mako, this gives more flexibility than 65 * clunky preprocessor macro magic. 66 * 67 */ 68 69struct u_trace_context; 70struct u_trace; 71struct u_trace_chunk; 72struct u_trace_printer; 73 74/** 75 * Special reserved value to indicate that no timestamp was captured, 76 * and that the timestamp of the previous trace should be reused. 77 */ 78#define U_TRACE_NO_TIMESTAMP ((uint64_t)0) 79 80/** 81 * Driver provided callback to create a timestamp buffer which will be 82 * read by u_trace_read_ts function. 83 */ 84typedef void* (*u_trace_create_ts_buffer)(struct u_trace_context *utctx, 85 uint32_t timestamps_count); 86 87/** 88 * Driver provided callback to delete a timestamp buffer. 89 */ 90typedef void (*u_trace_delete_ts_buffer)(struct u_trace_context *utctx, 91 void *timestamps); 92 93/** 94 * Driver provided callback to emit commands into the soecified command 95 * stream to capture a 64b timestamp into the specified timestamps buffer, 96 * at the specified index. 97 * 98 * The hw counter that the driver records should be something that runs at 99 * a fixed rate, even as the GPU freq changes. The same source used for 100 * GL_TIMESTAMP queries should be appropriate. 101 */ 102typedef void (*u_trace_record_ts)(struct u_trace *ut, void *cs, 103 void *timestamps, unsigned idx, 104 bool end_of_pipe); 105 106/** 107 * Driver provided callback to read back a previously recorded timestamp. 108 * If necessary, this should block until the GPU has finished writing back 109 * the timestamps. (The timestamps will be read back in order, so it is 110 * safe to only synchronize on idx==0.) 111 * 112 * flush_data is data provided by the driver via u_trace_flush. 113 * 114 * The returned timestamp should be in units of nanoseconds. The same 115 * timebase as GL_TIMESTAMP queries should be used. 116 * 117 * The driver can return the special U_TRACE_NO_TIMESTAMP value to indicate 118 * that no timestamp was captured and the timestamp from the previous trace 119 * will be re-used. (The first trace in the u_trace buf may not do this.) 120 * This allows the driver to detect cases where multiple tracepoints are 121 * emitted with no other intervening cmdstream, to avoid pointlessly 122 * capturing the same timestamp multiple times in a row. 123 */ 124typedef uint64_t (*u_trace_read_ts)(struct u_trace_context *utctx, 125 void *timestamps, unsigned idx, void *flush_data); 126 127/** 128 * Driver provided callback to delete flush data. 129 */ 130typedef void (*u_trace_delete_flush_data)(struct u_trace_context *utctx, 131 void *flush_data); 132 133/** 134 * The trace context provides tracking for "in-flight" traces, once the 135 * cmdstream that records timestamps has been flushed. 136 */ 137struct u_trace_context { 138 void *pctx; 139 140 u_trace_create_ts_buffer create_timestamp_buffer; 141 u_trace_delete_ts_buffer delete_timestamp_buffer; 142 u_trace_record_ts record_timestamp; 143 u_trace_read_ts read_timestamp; 144 u_trace_delete_flush_data delete_flush_data; 145 146 FILE *out; 147 struct u_trace_printer *out_printer; 148 149 /* Once u_trace_flush() is called u_trace_chunk's are queued up to 150 * render tracepoints on a queue. The per-chunk queue jobs block until 151 * timestamps are available. 152 */ 153 struct util_queue queue; 154 155#ifdef HAVE_PERFETTO 156 /* node in global list of trace contexts. */ 157 struct list_head node; 158#endif 159 160 /* State to accumulate time across N chunks associated with a single 161 * batch (u_trace). 162 */ 163 uint64_t last_time_ns; 164 uint64_t first_time_ns; 165 166 uint32_t frame_nr; 167 uint32_t batch_nr; 168 uint32_t event_nr; 169 bool start_of_frame; 170 171 /* list of unprocessed trace chunks in fifo order: */ 172 struct list_head flushed_trace_chunks; 173}; 174 175/** 176 * The u_trace ptr is passed as the first arg to generated tracepoints. 177 * It provides buffering for tracepoint payload until the corresponding 178 * driver cmdstream containing the emitted commands to capture is 179 * flushed. 180 * 181 * Individual tracepoints emitted to u_trace are expected to be "executed" 182 * (ie. timestamp captured) in FIFO order with respect to other tracepoints 183 * emitted to the same u_trace. But the order WRT other u_trace instances 184 * is undefined util u_trace_flush(). 185 */ 186struct u_trace { 187 struct u_trace_context *utctx; 188 189 struct list_head trace_chunks; /* list of unflushed trace chunks in fifo order */ 190 191 bool enabled; 192}; 193 194void u_trace_context_init(struct u_trace_context *utctx, 195 void *pctx, 196 u_trace_create_ts_buffer create_timestamp_buffer, 197 u_trace_delete_ts_buffer delete_timestamp_buffer, 198 u_trace_record_ts record_timestamp, 199 u_trace_read_ts read_timestamp, 200 u_trace_delete_flush_data delete_flush_data); 201void u_trace_context_fini(struct u_trace_context *utctx); 202 203/** 204 * Flush (trigger processing) of traces previously flushed to the trace-context 205 * by u_trace_flush(). 206 * 207 * This should typically be called in the driver's pctx->flush(). 208 */ 209void u_trace_context_process(struct u_trace_context *utctx, bool eof); 210 211void u_trace_init(struct u_trace *ut, struct u_trace_context *utctx); 212void u_trace_fini(struct u_trace *ut); 213 214bool u_trace_has_points(struct u_trace *ut); 215 216struct u_trace_iterator 217{ 218 struct u_trace *ut; 219 struct u_trace_chunk *chunk; 220 uint32_t event_idx; 221}; 222 223struct u_trace_iterator 224u_trace_begin_iterator(struct u_trace *ut); 225 226struct u_trace_iterator 227u_trace_end_iterator(struct u_trace *ut); 228 229bool 230u_trace_iterator_equal(struct u_trace_iterator a, 231 struct u_trace_iterator b); 232 233typedef void (*u_trace_copy_ts_buffer)(struct u_trace_context *utctx, 234 void *cmdstream, 235 void *ts_from, uint32_t from_offset, 236 void *ts_to, uint32_t to_offset, 237 uint32_t count); 238 239/** 240 * Clones tracepoints range into target u_trace. 241 * Provides callback for driver to copy timestamps on GPU from 242 * one buffer to another. 243 * 244 * It allows: 245 * - Tracing re-usable command buffer in Vulkan, by copying tracepoints 246 * each time it is submitted. 247 * - Per-tile tracing for tiling GPUs, by copying a range of tracepoints 248 * corresponding to a tile. 249 */ 250void u_trace_clone_append(struct u_trace_iterator begin_it, 251 struct u_trace_iterator end_it, 252 struct u_trace *into, 253 void *cmdstream, 254 u_trace_copy_ts_buffer copy_ts_buffer); 255 256void u_trace_disable_event_range(struct u_trace_iterator begin_it, 257 struct u_trace_iterator end_it); 258 259/** 260 * Flush traces to the parent trace-context. At this point, the expectation 261 * is that all the tracepoints are "executed" by the GPU following any previously 262 * flushed u_trace batch. 263 * 264 * flush_data is a way for driver to pass additional data, which becomes available 265 * only at the point of flush, to the u_trace_read_ts callback and perfetto. 266 * The typical example of such data would be a fence to wait on in u_trace_read_ts, 267 * and a submission_id to pass into perfetto. 268 * The destruction of the data is done via u_trace_delete_flush_data. 269 * 270 * This should typically be called when the corresponding cmdstream (containing 271 * the timestamp reads) is flushed to the kernel. 272 */ 273void u_trace_flush(struct u_trace *ut, void *flush_data, bool free_data); 274 275/** 276 * Whether command buffers should be instrumented even if not collecting 277 * traces. 278 */ 279extern bool ut_trace_instrument; 280 281#ifdef HAVE_PERFETTO 282extern int ut_perfetto_enabled; 283 284void u_trace_perfetto_start(void); 285void u_trace_perfetto_stop(void); 286#else 287# define ut_perfetto_enabled 0 288#endif 289 290static inline bool 291u_trace_context_actively_tracing(struct u_trace_context *utctx) 292{ 293 return !!utctx->out || (ut_perfetto_enabled > 0); 294} 295 296static inline bool 297u_trace_context_instrumenting(struct u_trace_context *utctx) 298{ 299 return !!utctx->out || ut_trace_instrument || (ut_perfetto_enabled > 0); 300} 301 302#ifdef __cplusplus 303} 304#endif 305 306#endif /* _U_TRACE_H */ 307