1/*
2 * Copyright © 2019 Google LLC
3 * SPDX-License-Identifier: MIT
4 */
5
6#ifndef TU_CS_H
7#define TU_CS_H
8
9#include "tu_common.h"
10
11#include "freedreno_pm4.h"
12
13#include "tu_drm.h"
14
15/* For breadcrumbs we may open a network socket based on the envvar,
16 * it's not something that should be enabled by default.
17 */
18#define TU_BREADCRUMBS_ENABLED 0
19
enum tu_cs_mode
{

   /*
    * A command stream in TU_CS_MODE_GROW mode grows automatically whenever it
    * is full.  tu_cs_begin must be called before command packet emission and
    * tu_cs_end must be called after.
    *
    * This mode may create multiple entries internally.  The entries must be
    * submitted together.
    */
   TU_CS_MODE_GROW,

   /*
    * A command stream in TU_CS_MODE_EXTERNAL mode wraps an external,
    * fixed-size buffer.  tu_cs_begin and tu_cs_end are optional and have no
    * effect on it.
    *
    * This mode does not create any entry or any BO.
    */
   TU_CS_MODE_EXTERNAL,

   /*
    * A command stream in TU_CS_MODE_SUB_STREAM mode does not support direct
    * command packet emission.  tu_cs_begin_sub_stream must be called to get a
    * sub-stream to emit command packets to.  When done with the sub-stream,
    * tu_cs_end_sub_stream must be called.
    *
    * This mode does not create any entry internally.
    */
   TU_CS_MODE_SUB_STREAM,
};
52
/* A contiguous range of emitted commands within a BO, suitable for
 * submission as an indirect buffer (see tu_cs_emit_ib()).
 */
struct tu_cs_entry
{
   /* No ownership */
   const struct tu_bo *bo;

   /* size and offset of the range within the BO, in bytes; both must be
    * dword-aligned (asserted in tu_cs_emit_ib())
    */
   uint32_t size;
   uint32_t offset;
};
61
/* A chunk of command-stream memory handed out by tu_cs_alloc(): the CPU
 * mapping and the corresponding GPU address.
 */
struct tu_cs_memory {
   uint32_t *map;
   uint64_t iova;
};
66
/* Compact reference to a range of emitted state: GPU address plus size in
 * dwords, packed into a single 64-bit value.
 */
struct tu_draw_state {
   uint64_t iova : 48;
   uint32_t size : 16;
};
71
72#define TU_COND_EXEC_STACK_SIZE 4
73
/* A command stream.  Which of the bookkeeping fields below are used depends
 * on the tu_cs_mode the stream was initialized with.
 */
struct tu_cs
{
   uint32_t *start;        /* first dword of the current buffer */
   uint32_t *cur;          /* write pointer */
   uint32_t *reserved_end; /* end of the range reserved via tu_cs_reserve() */
   uint32_t *end;          /* end of the current buffer */

   struct tu_device *device;
   enum tu_cs_mode mode;
   /* size to use for the next BO allocation -- presumably in dwords;
    * confirm against tu_cs.c
    */
   uint32_t next_bo_size;

   /* growable array of finished entries (TU_CS_MODE_GROW) */
   struct tu_cs_entry *entries;
   uint32_t entry_count;
   uint32_t entry_capacity;

   /* growable array of BOs backing the stream */
   struct tu_bo **bos;
   uint32_t bo_count;
   uint32_t bo_capacity;

   /* Optional BO that this CS is sub-allocated from for TU_CS_MODE_SUB_STREAM */
   struct tu_bo *refcount_bo;

   /* state for cond_exec_start/cond_exec_end */
   uint32_t cond_stack_depth;
   uint32_t cond_flags[TU_COND_EXEC_STACK_SIZE];
   uint32_t *cond_dwords[TU_COND_EXEC_STACK_SIZE];

   /* Countdown decremented by tu_cs_emit(); a sync breadcrumb is emitted
    * when it hits zero (only when TU_BREADCRUMBS_ENABLED).
    */
   uint32_t breadcrumb_emit_after;
};
103
/* Set up / tear down per-device breadcrumb state (only meaningful when
 * TU_BREADCRUMBS_ENABLED).
 */
void
tu_breadcrumbs_init(struct tu_device *device);

void
tu_breadcrumbs_finish(struct tu_device *device);

/* Initialize a CS in the given mode; initial_size sizes the first buffer. */
void
tu_cs_init(struct tu_cs *cs,
           struct tu_device *device,
           enum tu_cs_mode mode,
           uint32_t initial_size);

/* Initialize a TU_CS_MODE_EXTERNAL CS wrapping the buffer [start, end). */
void
tu_cs_init_external(struct tu_cs *cs, struct tu_device *device,
                    uint32_t *start, uint32_t *end);

/* Initialize a TU_CS_MODE_SUB_STREAM CS backed by a suballocated BO. */
void
tu_cs_init_suballoc(struct tu_cs *cs, struct tu_device *device,
                    struct tu_suballoc_bo *bo);

/* Release everything owned by the CS. */
void
tu_cs_finish(struct tu_cs *cs);

/* Bracket command packet emission for TU_CS_MODE_GROW streams. */
void
tu_cs_begin(struct tu_cs *cs);

void
tu_cs_end(struct tu_cs *cs);

/* Get a fixed-size sub-stream from a TU_CS_MODE_SUB_STREAM CS to emit
 * packets into; pair with tu_cs_end_sub_stream().
 */
VkResult
tu_cs_begin_sub_stream(struct tu_cs *cs, uint32_t size, struct tu_cs *sub_cs);

/* Allocate CS memory sized by count and size (exact count-vs-size semantics
 * live in tu_cs.c -- see also the count/size swap in tu_cs_draw_state()).
 */
VkResult
tu_cs_alloc(struct tu_cs *cs,
            uint32_t count,
            uint32_t size,
            struct tu_cs_memory *memory);

/* Finish a sub-stream and return an entry describing what was emitted. */
struct tu_cs_entry
tu_cs_end_sub_stream(struct tu_cs *cs, struct tu_cs *sub_cs);
144
145static inline struct tu_draw_state
146tu_cs_end_draw_state(struct tu_cs *cs, struct tu_cs *sub_cs)
147{
148   struct tu_cs_entry entry = tu_cs_end_sub_stream(cs, sub_cs);
149   return (struct tu_draw_state) {
150      .iova = entry.bo->iova + entry.offset,
151      .size = entry.size / sizeof(uint32_t),
152   };
153}
154
/* Slow path of tu_cs_reserve(): make room so that at least reserved_size
 * dwords can be emitted without bounds checks.
 */
VkResult
tu_cs_reserve_space(struct tu_cs *cs, uint32_t reserved_size);
157
/* Allocate \a size dwords from \a sub_cs, point \a cs at that memory as an
 * external CS ready for emission, and return a draw state referencing it.
 *
 * NOTE(review): the tu_cs_alloc()/tu_cs_reserve_space() results are ignored
 * (see TODO below); subsequent emission into \a cs assumes the allocation
 * succeeded.
 */
static inline struct tu_draw_state
tu_cs_draw_state(struct tu_cs *sub_cs, struct tu_cs *cs, uint32_t size)
{
   struct tu_cs_memory memory;

   /* TODO: clean this up */
   tu_cs_alloc(sub_cs, size, 1, &memory);
   tu_cs_init_external(cs, sub_cs->device, memory.map, memory.map + size);
   tu_cs_begin(cs);
   tu_cs_reserve_space(cs, size);

   return (struct tu_draw_state) {
      .iova = memory.iova,
      .size = size,
   };
}
174
/* Reset the CS to empty so it can be reused. */
void
tu_cs_reset(struct tu_cs *cs);

/* Append the entries of \a target to \a cs -- presumably both must be in
 * TU_CS_MODE_GROW; confirm against tu_cs.c.
 */
VkResult
tu_cs_add_entries(struct tu_cs *cs, struct tu_cs *target);
180
181/**
182 * Get the size of the command packets emitted since the last call to
183 * tu_cs_add_entry.
184 */
185static inline uint32_t
186tu_cs_get_size(const struct tu_cs *cs)
187{
188   return cs->cur - cs->start;
189}
190
191/**
192 * Return true if there is no command packet emitted since the last call to
193 * tu_cs_add_entry.
194 */
195static inline uint32_t
196tu_cs_is_empty(const struct tu_cs *cs)
197{
198   return tu_cs_get_size(cs) == 0;
199}
200
201/**
202 * Discard all entries.  This allows \a cs to be reused while keeping the
203 * existing BOs and command packets intact.
204 */
205static inline void
206tu_cs_discard_entries(struct tu_cs *cs)
207{
208   assert(cs->mode == TU_CS_MODE_GROW);
209   cs->entry_count = 0;
210}
211
212/**
213 * Get the size needed for tu_cs_emit_call.
214 */
215static inline uint32_t
216tu_cs_get_call_size(const struct tu_cs *cs)
217{
218   assert(cs->mode == TU_CS_MODE_GROW);
219   /* each CP_INDIRECT_BUFFER needs 4 dwords */
220   return cs->entry_count * 4;
221}
222
223/**
224 * Assert that we did not exceed the reserved space.
225 */
226static inline void
227tu_cs_sanity_check(const struct tu_cs *cs)
228{
229   assert(cs->start <= cs->cur);
230   assert(cs->cur <= cs->reserved_end);
231   assert(cs->reserved_end <= cs->end);
232}
233
/* Emit a sync breadcrumb for a packet with the given opcode and dword
 * count; called from the emit helpers below when TU_BREADCRUMBS_ENABLED.
 */
void
tu_cs_emit_sync_breadcrumb(struct tu_cs *cs, uint8_t opcode, uint16_t cnt);
236
237/**
238 * Emit a uint32_t value into a command stream, without boundary checking.
239 */
240static inline void
241tu_cs_emit(struct tu_cs *cs, uint32_t value)
242{
243   assert(cs->cur < cs->reserved_end);
244   *cs->cur = value;
245   ++cs->cur;
246
247#if TU_BREADCRUMBS_ENABLED
248   cs->breadcrumb_emit_after--;
249   if (cs->breadcrumb_emit_after == 0)
250      tu_cs_emit_sync_breadcrumb(cs, -1, 0);
251#endif
252}
253
254/**
255 * Emit an array of uint32_t into a command stream, without boundary checking.
256 */
257static inline void
258tu_cs_emit_array(struct tu_cs *cs, const uint32_t *values, uint32_t length)
259{
260   assert(cs->cur + length <= cs->reserved_end);
261   memcpy(cs->cur, values, sizeof(uint32_t) * length);
262   cs->cur += length;
263}
264
265/**
266 * Get the size of the remaining space in the current BO.
267 */
268static inline uint32_t
269tu_cs_get_space(const struct tu_cs *cs)
270{
271   return cs->end - cs->cur;
272}
273
/* Ensure at least \a reserved_size dwords can be emitted without bounds
 * checks.  Non-GROW streams are fixed-size, so this only asserts that the
 * buffer has room.  GROW streams take a fast path when the current BO has
 * space and an entry slot is free, and otherwise fall back to
 * tu_cs_reserve_space().
 */
static inline void
tu_cs_reserve(struct tu_cs *cs, uint32_t reserved_size)
{
   if (cs->mode != TU_CS_MODE_GROW) {
      assert(tu_cs_get_space(cs) >= reserved_size);
      assert(cs->reserved_end == cs->end);
      return;
   }

   if (tu_cs_get_space(cs) >= reserved_size &&
       cs->entry_count < cs->entry_capacity) {
      cs->reserved_end = cs->cur + reserved_size;
      return;
   }

   ASSERTED VkResult result = tu_cs_reserve_space(cs, reserved_size);
   /* TODO: set this error in tu_cs and use it */
   assert(result == VK_SUCCESS);
}
293
294/**
295 * Emit a type-4 command packet header into a command stream.
296 */
297static inline void
298tu_cs_emit_pkt4(struct tu_cs *cs, uint16_t regindx, uint16_t cnt)
299{
300   tu_cs_reserve(cs, cnt + 1);
301   tu_cs_emit(cs, pm4_pkt4_hdr(regindx, cnt));
302}
303
304/**
305 * Emit a type-7 command packet header into a command stream.
306 */
307static inline void
308tu_cs_emit_pkt7(struct tu_cs *cs, uint8_t opcode, uint16_t cnt)
309{
310#if TU_BREADCRUMBS_ENABLED
311   tu_cs_emit_sync_breadcrumb(cs, opcode, cnt + 1);
312#endif
313
314   tu_cs_reserve(cs, cnt + 1);
315   tu_cs_emit(cs, pm4_pkt7_hdr(opcode, cnt));
316}
317
/* Emit a CP_WAIT_FOR_IDLE packet (no payload). */
static inline void
tu_cs_emit_wfi(struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_WAIT_FOR_IDLE, 0);
}
323
324static inline void
325tu_cs_emit_qw(struct tu_cs *cs, uint64_t value)
326{
327   tu_cs_emit(cs, (uint32_t) value);
328   tu_cs_emit(cs, (uint32_t) (value >> 32));
329}
330
/* Emit a single register write: a pkt4 header followed by one value. */
static inline void
tu_cs_emit_write_reg(struct tu_cs *cs, uint16_t reg, uint32_t value)
{
   tu_cs_emit_pkt4(cs, reg, 1);
   tu_cs_emit(cs, value);
}
337
338/**
339 * Emit a CP_INDIRECT_BUFFER command packet.
340 */
341static inline void
342tu_cs_emit_ib(struct tu_cs *cs, const struct tu_cs_entry *entry)
343{
344   assert(entry->bo);
345   assert(entry->size && entry->offset + entry->size <= entry->bo->size);
346   assert(entry->size % sizeof(uint32_t) == 0);
347   assert(entry->offset % sizeof(uint32_t) == 0);
348
349   tu_cs_emit_pkt7(cs, CP_INDIRECT_BUFFER, 3);
350   tu_cs_emit_qw(cs, entry->bo->iova + entry->offset);
351   tu_cs_emit(cs, entry->size / sizeof(uint32_t));
352}
353
354/* for compute which isn't using SET_DRAW_STATE */
355static inline void
356tu_cs_emit_state_ib(struct tu_cs *cs, struct tu_draw_state state)
357{
358   if (state.size) {
359      tu_cs_emit_pkt7(cs, CP_INDIRECT_BUFFER, 3);
360      tu_cs_emit_qw(cs, state.iova);
361      tu_cs_emit(cs, state.size);
362   }
363}
364
365/**
366 * Emit a CP_INDIRECT_BUFFER command packet for each entry in the target
367 * command stream.
368 */
369static inline void
370tu_cs_emit_call(struct tu_cs *cs, const struct tu_cs *target)
371{
372   assert(target->mode == TU_CS_MODE_GROW);
373   for (uint32_t i = 0; i < target->entry_count; i++)
374      tu_cs_emit_ib(cs, target->entries + i);
375}
376
/* Helpers for bracketing a large sequence of commands of unknown size inside
 * a CP_COND_REG_EXEC packet.
 */
static inline void
tu_cond_exec_start(struct tu_cs *cs, uint32_t cond_flags)
{
   assert(cs->mode == TU_CS_MODE_GROW);
   assert(cs->cond_stack_depth < TU_COND_EXEC_STACK_SIZE);

   tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
   tu_cs_emit(cs, cond_flags);

   /* Remember where the DWORDS field lives so tu_cond_exec_end() can patch
    * in the real length once it is known.
    */
   cs->cond_flags[cs->cond_stack_depth] = cond_flags;
   cs->cond_dwords[cs->cond_stack_depth] = cs->cur;

   /* Emit dummy DWORD field here */
   tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(0));

   cs->cond_stack_depth++;
}
/* Convenience cond_exec flag combinations selecting the GMEM vs sysmem
 * render paths.
 */
#define CP_COND_EXEC_0_RENDER_MODE_GMEM \
   (CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | CP_COND_REG_EXEC_0_GMEM)
#define CP_COND_EXEC_0_RENDER_MODE_SYSMEM \
   (CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | CP_COND_REG_EXEC_0_SYSMEM)
401
/* Close the innermost CP_COND_REG_EXEC and patch its DWORDS field with the
 * number of dwords emitted since the matching tu_cond_exec_start().
 *
 * NOTE(review): the count is a raw pointer difference, so this appears to
 * assume the stream did not grow into a new BO inside the cond block --
 * confirm that growth cannot happen here.
 */
static inline void
tu_cond_exec_end(struct tu_cs *cs)
{
   assert(cs->cond_stack_depth > 0);
   cs->cond_stack_depth--;

   cs->cond_flags[cs->cond_stack_depth] = 0;
   /* Subtract one here to account for the DWORD field itself. */
   *cs->cond_dwords[cs->cond_stack_depth] =
      cs->cur - cs->cond_dwords[cs->cond_stack_depth] - 1;
}
413
/* Temporary struct for tracking a register state to be written, used by
 * a6xx-pack.h and tu_cs_emit_regs()
 */
struct tu_reg_value {
   uint32_t reg;
   uint64_t value;
   /* When is_address is set (and bo is not), value is emitted as two
    * dwords.  When bo is set, the emitted value is
    * ((bo->iova + bo_offset) >> bo_shift) | value, also as two dwords
    * (see __ONE_REG below).
    */
   bool is_address;
   struct tu_bo *bo;
   /* presumably marks the BO as GPU-writable -- confirm in a6xx-pack.h */
   bool bo_write;
   uint32_t bo_offset;
   uint32_t bo_shift;
};
426
427#define fd_reg_pair tu_reg_value
428#define __bo_type struct tu_bo *
429
430#include "a6xx-pack.xml.h"
431
/* assert()-with-diagnostics: prints both values before asserting so a
 * mismatch is debuggable.  Arguments are evaluated more than once; pass
 * side-effect-free expressions only.
 */
#define __assert_eq(a, b)                                               \
   do {                                                                 \
      if ((a) != (b)) {                                                 \
         fprintf(stderr, "assert failed: " #a " (0x%x) != " #b " (0x%x)\n", a, b); \
         assert((a) == (b));                                            \
      }                                                                 \
   } while (0)
439
/* Emit register i of a tu_cs_emit_regs() pack, if present.  Verifies that
 * register offsets are consecutive, and expands BO-relative or 64-bit
 * address values into two dwords.  `p` is the emit cursor declared by
 * tu_cs_emit_regs().
 */
#define __ONE_REG(i, regs)                                      \
   do {                                                         \
      if (i < ARRAY_SIZE(regs) && regs[i].reg > 0) {            \
         __assert_eq(regs[0].reg + i, regs[i].reg);             \
         if (regs[i].bo) {                                      \
            uint64_t v = regs[i].bo->iova + regs[i].bo_offset;  \
            v >>= regs[i].bo_shift;                             \
            v |= regs[i].value;                                 \
                                                                \
            *p++ = v;                                           \
            *p++ = v >> 32;                                     \
         } else {                                               \
            *p++ = regs[i].value;                               \
            if (regs[i].is_address)                             \
               *p++ = regs[i].value >> 32;                      \
         }                                                      \
      }                                                         \
   } while (0)
458
/* Emits a sequence of register writes in order using a pkt4.  This will check
 * (at runtime on a !NDEBUG build) that the registers were actually set up in
 * order in the code.
 *
 * Note that references to buffers aren't automatically added to the CS,
 * unlike in freedreno.  We are clever in various places to avoid duplicating
 * the reference add work.
 *
 * Also, 64-bit address registers don't have a way (currently) to set a 64-bit
 * address without having a reference to a BO, since the .dword field in the
 * register's struct is only 32-bit wide.  We should fix this in the pack
 * codegen later.
 *
 * The 16-register limit below matches the fixed set of __ONE_REG
 * expansions; both must change together.
 */
#define tu_cs_emit_regs(cs, ...) do {                   \
   const struct fd_reg_pair regs[] = { __VA_ARGS__ };   \
   unsigned count = ARRAY_SIZE(regs);                   \
                                                        \
   STATIC_ASSERT(ARRAY_SIZE(regs) > 0);                 \
   STATIC_ASSERT(ARRAY_SIZE(regs) <= 16);               \
                                                        \
   tu_cs_emit_pkt4((cs), regs[0].reg, count);             \
   uint32_t *p = (cs)->cur;                               \
   __ONE_REG( 0, regs);                                 \
   __ONE_REG( 1, regs);                                 \
   __ONE_REG( 2, regs);                                 \
   __ONE_REG( 3, regs);                                 \
   __ONE_REG( 4, regs);                                 \
   __ONE_REG( 5, regs);                                 \
   __ONE_REG( 6, regs);                                 \
   __ONE_REG( 7, regs);                                 \
   __ONE_REG( 8, regs);                                 \
   __ONE_REG( 9, regs);                                 \
   __ONE_REG(10, regs);                                 \
   __ONE_REG(11, regs);                                 \
   __ONE_REG(12, regs);                                 \
   __ONE_REG(13, regs);                                 \
   __ONE_REG(14, regs);                                 \
   __ONE_REG(15, regs);                                 \
   (cs)->cur = p;                                         \
   } while (0)
499
500#endif /* TU_CS_H */
501