xref: /third_party/mesa3d/src/util/perf/u_trace.h (revision bf215546)
1/*
2 * Copyright © 2020 Google, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24#ifndef _U_TRACE_H
25#define _U_TRACE_H
26
27#include <stdbool.h>
28#include <stdint.h>
29#include <stdio.h>
30
31#include "util/u_queue.h"
32
33#ifdef __cplusplus
34extern "C" {
35#endif
36
37/* A trace mechanism (very) loosely inspired by the linux kernel tracepoint
38 * mechanism, in that it allows for defining driver specific (or common)
39 * tracepoints, which generate 'trace_$name()' functions that can be
40 * called at various points in commandstream emit.
41 *
42 * Currently a printf backend is implemented, but the expectation is to
43 * also implement a perfetto backend for shipping out traces to a tool like
44 * AGI.
45 *
46 * Notable differences:
47 *
48 *  - GPU timestamps!  A driver provided callback is used to emit timestamps
49 *    to a buffer.  At a later point in time (when stalling to wait for the
50 *    GPU is not required), the timestamps are re-united with the trace
51 *    payload.  This makes the trace mechanism suitable for profiling.
52 *
53 *  - Instead of a systemwide trace ringbuffer, buffering of un-retired
54 *    tracepoints is split into two stages.  Traces are emitted to a
55 *    'u_trace' instance, and at a later time flushed to a 'u_trace_context'
56 *    instance.  This avoids the requirement that commandstream containing
57 *    tracepoints is emitted in the same order as it is generated.
58 *
59 *    If the hw has multiple parallel "engines" (for example, 3d/blit/compute)
60 *    then a `u_trace_context` per-engine should be used.
61 *
62 *  - Unlike kernel tracepoints, u_trace tracepoints are defined in py
63 *    from which header and src files are generated.  Since we already have
64 *    a build dependency on python+mako, this gives more flexibility than
65 *    clunky preprocessor macro magic.
66 *
67 */
68
/* Forward declarations.  u_trace_context and u_trace are defined further
 * down in this header; u_trace_chunk and u_trace_printer are only used
 * through pointers here and are not defined in this header.
 */
69struct u_trace_context;
70struct u_trace;
71struct u_trace_chunk;
72struct u_trace_printer;
73
74/**
75 * Special reserved value to indicate that no timestamp was captured,
76 * and that the timestamp of the previous trace should be reused.
 *
 * The sentinel is the value 0, so a u_trace_read_ts callback cannot
 * report a genuine timestamp of exactly 0 without it being
 * indistinguishable from "no timestamp".
77 */
78#define U_TRACE_NO_TIMESTAMP ((uint64_t)0)
79
80/**
81 * Driver provided callback to create a timestamp buffer which will be
82 * read by u_trace_read_ts function.
 *
 * timestamps_count is presumably the number of timestamp slots the buffer
 * must be able to hold (TODO confirm against the implementation).  The
 * returned pointer is opaque to this header; the other timestamp callbacks
 * take such a buffer as their 'timestamps' argument.
83 */
84typedef void* (*u_trace_create_ts_buffer)(struct u_trace_context *utctx,
85      uint32_t timestamps_count);
86
87/**
88 * Driver provided callback to delete a timestamp buffer.
 *
 * 'timestamps' is presumably a buffer previously returned by the
 * u_trace_create_ts_buffer callback (TODO confirm).
89 */
90typedef void (*u_trace_delete_ts_buffer)(struct u_trace_context *utctx,
91      void *timestamps);
92
93/**
94 * Driver provided callback to emit commands into the specified command
95 * stream to capture a 64b timestamp into the specified timestamps buffer,
96 * at the specified index.
97 *
98 * The hw counter that the driver records should be something that runs at
99 * a fixed rate, even as the GPU freq changes.  The same source used for
100 * GL_TIMESTAMP queries should be appropriate.
 *
 * 'cs' is the command stream, 'timestamps' the timestamp buffer and 'idx'
 * the index within that buffer.
 *
 * NOTE(review): 'end_of_pipe' is not described here; presumably it selects
 * whether the timestamp is captured at the end of the pipeline -- confirm
 * against the driver implementations.
101 */
102typedef void (*u_trace_record_ts)(struct u_trace *ut, void *cs,
103                                  void *timestamps, unsigned idx,
104                                  bool end_of_pipe);
105
106/**
107 * Driver provided callback to read back a previously recorded timestamp.
108 * If necessary, this should block until the GPU has finished writing back
109 * the timestamps.  (The timestamps will be read back in order, so it is
110 * safe to only synchronize on idx==0.)
111 *
 * 'timestamps' and 'idx' name the buffer and the slot to read; presumably
 * the same buffer/index pair that was earlier handed to the
 * u_trace_record_ts callback (TODO confirm).
 *
112 * flush_data is data provided by the driver via u_trace_flush.
113 *
114 * The returned timestamp should be in units of nanoseconds.  The same
115 * timebase as GL_TIMESTAMP queries should be used.
116 *
117 * The driver can return the special U_TRACE_NO_TIMESTAMP value to indicate
118 * that no timestamp was captured and the timestamp from the previous trace
119 * will be re-used.  (The first trace in the u_trace buf may not do this.)
120 * This allows the driver to detect cases where multiple tracepoints are
121 * emitted with no other intervening cmdstream, to avoid pointlessly
122 * capturing the same timestamp multiple times in a row.
123 */
124typedef uint64_t (*u_trace_read_ts)(struct u_trace_context *utctx,
125      void *timestamps, unsigned idx, void *flush_data);
126
127/**
128 * Driver provided callback to delete flush data.
 *
 * 'flush_data' is the driver data that was handed to u_trace_flush(); the
 * documentation of u_trace_flush() names this callback as the way that
 * data gets destroyed.
129 */
130typedef void (*u_trace_delete_flush_data)(struct u_trace_context *utctx,
131      void *flush_data);
132
133/**
134 * The trace context provides tracking for "in-flight" traces, once the
135 * cmdstream that records timestamps has been flushed.
136 */
137struct u_trace_context {
   /* Opaque pointer, not interpreted by this header.  Presumably the
    * driver context that was passed to u_trace_context_init() (TODO
    * confirm).
    */
138   void *pctx;
139
   /* Driver callbacks, one per callback typedef declared above. */
140   u_trace_create_ts_buffer  create_timestamp_buffer;
141   u_trace_delete_ts_buffer  delete_timestamp_buffer;
142   u_trace_record_ts         record_timestamp;
143   u_trace_read_ts           read_timestamp;
144   u_trace_delete_flush_data delete_flush_data;
145
   /* Output FILE stream.  When non-NULL, both
    * u_trace_context_actively_tracing() and
    * u_trace_context_instrumenting() return true.
    */
146   FILE *out;
   /* NOTE(review): presumably the printer used to format output written
    * to 'out'; struct u_trace_printer is not defined in this header.
    */
147   struct u_trace_printer *out_printer;
148
149   /* Once u_trace_flush() is called u_trace_chunk's are queued up to
150    * render tracepoints on a queue.  The per-chunk queue jobs block until
151    * timestamps are available.
152    */
153   struct util_queue queue;
154
155#ifdef HAVE_PERFETTO
156   /* node in global list of trace contexts. */
157   struct list_head node;
158#endif
159
160   /* State to accumulate time across N chunks associated with a single
161    * batch (u_trace).
162    */
163   uint64_t last_time_ns;
164   uint64_t first_time_ns;
165
   /* NOTE(review): presumably running frame / batch / event counters and
    * a flag marking the first batch of a frame; their exact update rules
    * live in the implementation -- confirm there.
    */
166   uint32_t frame_nr;
167   uint32_t batch_nr;
168   uint32_t event_nr;
169   bool start_of_frame;
170
171   /* list of unprocessed trace chunks in fifo order: */
172   struct list_head flushed_trace_chunks;
173};
174
175/**
176 * The u_trace ptr is passed as the first arg to generated tracepoints.
177 * It provides buffering for tracepoint payload until the corresponding
178 * driver cmdstream containing the emitted commands to capture is
179 * flushed.
180 *
181 * Individual tracepoints emitted to u_trace are expected to be "executed"
182 * (ie. timestamp captured) in FIFO order with respect to other tracepoints
183 * emitted to the same u_trace.  But the order WRT other u_trace instances
184 * is undefined until u_trace_flush().
185 */
186struct u_trace {
   /* Parent trace-context that u_trace_flush() flushes this u_trace to. */
187   struct u_trace_context *utctx;
188
189   struct list_head trace_chunks;  /* list of unflushed trace chunks in fifo order */
190
   /* NOTE(review): not documented in this header; presumably gates whether
    * tracepoints are recorded into this u_trace -- confirm against the
    * implementation.
    */
191   bool enabled;
192};
193
/**
 * Initialize / tear down a trace context.
 *
 * u_trace_context_init() takes an opaque 'pctx' pointer plus one driver
 * callback of each callback type.  The parameter names match the fields
 * of struct u_trace_context, where they are presumably stored (the
 * implementation is not part of this header -- TODO confirm).
 *
 * u_trace_context_fini() is the matching teardown for a context set up
 * with u_trace_context_init().
 */
194void u_trace_context_init(struct u_trace_context *utctx,
195      void *pctx,
196      u_trace_create_ts_buffer   create_timestamp_buffer,
197      u_trace_delete_ts_buffer   delete_timestamp_buffer,
198      u_trace_record_ts          record_timestamp,
199      u_trace_read_ts            read_timestamp,
200      u_trace_delete_flush_data  delete_flush_data);
201void u_trace_context_fini(struct u_trace_context *utctx);
202
203/**
204 * Flush (trigger processing) of traces previously flushed to the trace-context
205 * by u_trace_flush().
206 *
207 * This should typically be called in the driver's pctx->flush().
 *
 * NOTE(review): 'eof' is not described here; presumably it marks the end
 * of a frame -- confirm against the implementation.
208 */
209void u_trace_context_process(struct u_trace_context *utctx, bool eof);
210
/**
 * Initialize / tear down a u_trace.  u_trace_init() is given the trace
 * context the u_trace is associated with.
 */
211void u_trace_init(struct u_trace *ut, struct u_trace_context *utctx);
212void u_trace_fini(struct u_trace *ut);
213
/**
 * NOTE(review): presumably reports whether any tracepoints have been
 * emitted to 'ut' -- confirm against the implementation.
 */
214bool u_trace_has_points(struct u_trace *ut);
215
/**
 * A position within a u_trace, passed by value.  Pairs of iterators are
 * used to delimit tracepoint ranges for u_trace_clone_append() and
 * u_trace_disable_event_range().
 */
216struct u_trace_iterator
217{
   /* The u_trace this iterator refers to. */
218   struct u_trace *ut;
   /* NOTE(review): presumably the chunk within 'ut' and the index of the
    * event inside that chunk -- confirm against the implementation.
    */
219   struct u_trace_chunk *chunk;
220   uint32_t event_idx;
221};
222
/**
 * Iterator constructors and comparison.  Each constructor returns a
 * u_trace_iterator by value for the given u_trace.
 *
 * NOTE(review): presumably "begin" is the first recorded event and "end"
 * is the position after the last one -- confirm against the
 * implementation.
 */
223struct u_trace_iterator
224u_trace_begin_iterator(struct u_trace *ut);
225
226struct u_trace_iterator
227u_trace_end_iterator(struct u_trace *ut);
228
/* Compares two iterators, both passed by value. */
229bool
230u_trace_iterator_equal(struct u_trace_iterator a,
231                       struct u_trace_iterator b);
232
/**
 * Driver provided callback, passed to u_trace_clone_append(), to copy
 * timestamps from one timestamp buffer to another.
 *
 * Presumably copies 'count' timestamps starting at 'from_offset' in
 * 'ts_from' to 'to_offset' in 'ts_to', emitting the copy into 'cmdstream'.
 * The units of the offsets are not stated in this header -- TODO confirm.
 */
233typedef void (*u_trace_copy_ts_buffer)(struct u_trace_context *utctx,
234      void *cmdstream,
235      void *ts_from, uint32_t from_offset,
236      void *ts_to, uint32_t to_offset,
237      uint32_t count);
238
239/**
240 * Clones tracepoints range into target u_trace.
241 * Provides callback for driver to copy timestamps on GPU from
242 * one buffer to another.
243 *
244 * It allows:
245 * - Tracing re-usable command buffer in Vulkan, by copying tracepoints
246 *   each time it is submitted.
247 * - Per-tile tracing for tiling GPUs, by copying a range of tracepoints
248 *   corresponding to a tile.
 *
 * begin_it / end_it delimit the range to clone, 'into' is the target
 * u_trace, and copy_ts_buffer is the driver callback that copies the
 * timestamps.  'cmdstream' is presumably forwarded to copy_ts_buffer
 * (TODO confirm).
 *
 * NOTE(review): whether end_it itself is included in the range is not
 * stated here -- confirm against the implementation.
249 */
250void u_trace_clone_append(struct u_trace_iterator begin_it,
251                          struct u_trace_iterator end_it,
252                          struct u_trace *into,
253                          void *cmdstream,
254                          u_trace_copy_ts_buffer copy_ts_buffer);
255
/**
 * NOTE(review): undocumented in this header.  Presumably marks the
 * tracepoints between begin_it and end_it as disabled so that they are
 * not reported -- confirm against the implementation, including whether
 * end_it itself is part of the range.
 */
256void u_trace_disable_event_range(struct u_trace_iterator begin_it,
257                                 struct u_trace_iterator end_it);
258
259/**
260 * Flush traces to the parent trace-context.  At this point, the expectation
261 * is that all the tracepoints are "executed" by the GPU following any previously
262 * flushed u_trace batch.
263 *
264 * flush_data is a way for driver to pass additional data, which becomes available
265 * only at the point of flush, to the u_trace_read_ts callback and perfetto.
266 * The typical example of such data would be a fence to wait on in u_trace_read_ts,
267 * and a submission_id to pass into perfetto.
268 * The destruction of the data is done via u_trace_delete_flush_data.
269 *
270 * This should typically be called when the corresponding cmdstream (containing
271 * the timestamp reads) is flushed to the kernel.
 *
 * NOTE(review): 'free_data' is not described here; presumably it controls
 * whether flush_data is destroyed through the u_trace_delete_flush_data
 * callback once it is no longer needed -- confirm against the
 * implementation.
272 */
273void u_trace_flush(struct u_trace *ut, void *flush_data, bool free_data);
274
275/**
276 * Whether command buffers should be instrumented even if not collecting
277 * traces.
 *
 * When true, u_trace_context_instrumenting() returns true regardless of
 * the context's output stream or the perfetto state.  Declared here only;
 * the variable is defined elsewhere.
278 */
279extern bool ut_trace_instrument;
280
/* Perfetto integration.  With HAVE_PERFETTO, ut_perfetto_enabled is an int
 * defined elsewhere and the start/stop entry points are declared.  Without
 * it, ut_perfetto_enabled is the constant 0, so the
 * "ut_perfetto_enabled > 0" checks in the inline helpers of this header
 * are always false.
 */
281#ifdef HAVE_PERFETTO
282extern int ut_perfetto_enabled;
283
284void u_trace_perfetto_start(void);
285void u_trace_perfetto_stop(void);
286#else
287#  define ut_perfetto_enabled 0
288#endif
289
290static inline bool
291u_trace_context_actively_tracing(struct u_trace_context *utctx)
292{
293   return !!utctx->out || (ut_perfetto_enabled > 0);
294}
295
296static inline bool
297u_trace_context_instrumenting(struct u_trace_context *utctx)
298{
299   return !!utctx->out || ut_trace_instrument || (ut_perfetto_enabled > 0);
300}
301
302#ifdef __cplusplus
303}
304#endif
305
306#endif  /* _U_TRACE_H */
307