/*
 * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#ifndef FREEDRENO_RINGBUFFER_H_
#define FREEDRENO_RINGBUFFER_H_

#include <stdio.h>
#include "util/u_atomic.h"
#include "util/u_debug.h"
#include "util/u_queue.h"

#include "adreno_common.xml.h"
#include "adreno_pm4.xml.h"
#include "freedreno_drmif.h"
#include "freedreno_pm4.h"

#ifdef __cplusplus
extern "C" {
#endif

struct fd_submit;
struct fd_ringbuffer;

enum fd_ringbuffer_flags {

   /* Primary ringbuffer for a submit, i.e. an IB1-level rb for
    * which the kernel must set up RB->IB1 CP_INDIRECT_BRANCH
    * packets.
    */
   FD_RINGBUFFER_PRIMARY = 0x1,

   /* Hint that the stateobj will be used for streaming state
    * that is used once or a few times and then discarded.
    *
    * For sub-allocation, non-streaming stateobjs should be
    * sub-allocated from a page sized buffer, so one long-lived
    * state obj doesn't prevent other pages from being freed.
    * (I.e. it would be no worse than allocating a page sized
    * bo for each small non-streaming stateobj).
    *
    * But streaming stateobjs could be sub-allocated from a
    * larger buffer to reduce the alloc/del overhead.
    */
   FD_RINGBUFFER_STREAMING = 0x2,

   /* Indicates that "growable" cmdstream can be used,
    * consisting of multiple physical cmdstream buffers.
    */
   FD_RINGBUFFER_GROWABLE = 0x4,

   /* Internal use only: */
   _FD_RINGBUFFER_OBJECT = 0x8,
};

/* A submit object manages/tracks all the state buildup for a "submit"
 * ioctl to the kernel.  Additionally, with the exception of long-lived
 * non-STREAMING stateobj rbs, rbs are allocated from the submit.
 */
struct fd_submit *fd_submit_new(struct fd_pipe *pipe);

/* NOTE: all ringbuffers created from the submit should be unref'd
 * before destroying the submit.
 */
void fd_submit_del(struct fd_submit *submit);

struct fd_submit *fd_submit_ref(struct fd_submit *submit);

/* Allocate a new rb from the submit. */
struct fd_ringbuffer *fd_submit_new_ringbuffer(struct fd_submit *submit,
                                               uint32_t size,
                                               enum fd_ringbuffer_flags flags);
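
/* Example usage (a minimal sketch; error handling omitted, and the
 * struct fd_pipe is assumed to have been created elsewhere, e.g. via
 * fd_pipe_new() from freedreno_drmif.h):
 *
 *    struct fd_submit *submit = fd_submit_new(pipe);
 *    struct fd_ringbuffer *ring =
 *       fd_submit_new_ringbuffer(submit, 0x1000,
 *                                FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE);
 *
 *    // ... emit cmdstream into ring, then flush (see fd_submit_flush()
 *    // below) ...
 *
 *    fd_ringbuffer_del(ring);   // unref rbs before destroying the submit
 *    fd_submit_del(submit);
 */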

/**
 * Encapsulates submit out-fence(s), which consist of a 'timestamp'
 * (per-pipe (submitqueue) sequence number) and, if requested, an
 * out-fence-fd.
 */
struct fd_submit_fence {
   /**
    * The ready fence is signaled once the submit is actually flushed down
    * to the kernel, and fence/fence_fd are populated.  You must wait for
    * this fence to be signaled before reading fence/fence_fd.
    */
   struct util_queue_fence ready;

   struct fd_fence fence;

   /**
    * Optional dma_fence fd, returned by the submit if use_fence_fd is true.
    */
   int fence_fd;
   bool use_fence_fd;
};

/* in_fence_fd: -1 for no in-fence, else the fence fd
 * out_fence may be NULL if no output fence is required.
 */
int fd_submit_flush(struct fd_submit *submit, int in_fence_fd,
                    struct fd_submit_fence *out_fence);
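
/* Example (a sketch; it assumes the caller initializes the ready fence
 * with util_queue_fence_init() and that fd_submit_flush() handles the
 * reset/signal handshake -- check the backend for the exact protocol):
 *
 *    struct fd_submit_fence out = {
 *       .use_fence_fd = true,             // also request a dma_fence fd
 *    };
 *    util_queue_fence_init(&out.ready);
 *
 *    fd_submit_flush(submit, -1, &out);   // -1: no in-fence
 *
 *    util_queue_fence_wait(&out.ready);   // fence/fence_fd valid after this
 *    // ... share out.fence_fd with another device, then close() it ...
 */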

struct fd_ringbuffer;
struct fd_reloc;

struct fd_ringbuffer_funcs {
   void (*grow)(struct fd_ringbuffer *ring, uint32_t size);
   void (*emit_reloc)(struct fd_ringbuffer *ring, const struct fd_reloc *reloc);
   uint32_t (*emit_reloc_ring)(struct fd_ringbuffer *ring,
                               struct fd_ringbuffer *target, uint32_t cmd_idx);
   uint32_t (*cmd_count)(struct fd_ringbuffer *ring);
   bool (*check_size)(struct fd_ringbuffer *ring);
   void (*destroy)(struct fd_ringbuffer *ring);
};

/* The ringbuffer object is not opaque so that OUT_RING() type stuff
 * can be inlined.  Note that users should not make assumptions about
 * the size of this struct.
 */
struct fd_ringbuffer {
   uint32_t *cur, *end, *start;
   const struct fd_ringbuffer_funcs *funcs;

   // size or end could probably go away
   int size;
   int32_t refcnt;
   enum fd_ringbuffer_flags flags;
};

/* Allocate a new long-lived state object, not associated with
 * a submit:
 */
struct fd_ringbuffer *fd_ringbuffer_new_object(struct fd_pipe *pipe,
                                               uint32_t size);

static inline void
fd_ringbuffer_del(struct fd_ringbuffer *ring)
{
   if (!p_atomic_dec_zero(&ring->refcnt))
      return;

   ring->funcs->destroy(ring);
}

static inline struct fd_ringbuffer *
fd_ringbuffer_ref(struct fd_ringbuffer *ring)
{
   p_atomic_inc(&ring->refcnt);
   return ring;
}

static inline void
fd_ringbuffer_grow(struct fd_ringbuffer *ring, uint32_t ndwords)
{
   assert(ring->funcs->grow); /* unsupported on kgsl */

   /* there is an upper bound on IB size, which appears to be 0x0fffff */
   ring->size = MIN2(ring->size << 1, 0x0fffff);

   ring->funcs->grow(ring, ring->size);
}

static inline bool
fd_ringbuffer_check_size(struct fd_ringbuffer *ring)
{
   return ring->funcs->check_size(ring);
}

static inline void
fd_ringbuffer_emit(struct fd_ringbuffer *ring, uint32_t data)
{
   (*ring->cur++) = data;
}

struct fd_reloc {
   struct fd_bo *bo;
   uint64_t iova;
   uint64_t orval;
#define FD_RELOC_READ  0x0001
#define FD_RELOC_WRITE 0x0002
#define FD_RELOC_DUMP  0x0004
   uint32_t offset;
   int32_t shift;
};

/* We always mark BOs for write, instead of tracking it across reloc
 * sources in userspace.  On the kernel side, this means we track a single
 * excl fence in the BO instead of a set of read fences, which is cheaper.
 * The downside is that a dmabuf-shared device won't be able to read in
 * parallel with a read-only access by freedreno, but most other drivers
 * have likewise decided that this usecase isn't important enough to
 * justify the extra tracking.
 */
#define FD_RELOC_FLAGS_INIT (FD_RELOC_READ | FD_RELOC_WRITE)

/* NOTE: relocs are 2 dwords on a5xx+ */

static inline void
fd_ringbuffer_reloc(struct fd_ringbuffer *ring, const struct fd_reloc *reloc)
{
   ring->funcs->emit_reloc(ring, reloc);
}

static inline uint32_t
fd_ringbuffer_cmd_count(struct fd_ringbuffer *ring)
{
   if (!ring->funcs->cmd_count)
      return 1;
   return ring->funcs->cmd_count(ring);
}

static inline uint32_t
fd_ringbuffer_emit_reloc_ring_full(struct fd_ringbuffer *ring,
                                   struct fd_ringbuffer *target,
                                   uint32_t cmd_idx)
{
   return ring->funcs->emit_reloc_ring(ring, target, cmd_idx);
}

static inline uint32_t
offset_bytes(void *end, void *start)
{
   return ((char *)end) - ((char *)start);
}

static inline uint32_t
fd_ringbuffer_size(struct fd_ringbuffer *ring)
{
   /* Only really needed for stateobj ringbuffers, and won't really
    * do what you expect for growable rbs... so let's just restrict
    * this to stateobjs for now:
    */
   assert(!(ring->flags & FD_RINGBUFFER_GROWABLE));
   return offset_bytes(ring->cur, ring->start);
}

static inline bool
fd_ringbuffer_empty(struct fd_ringbuffer *ring)
{
   return (fd_ringbuffer_cmd_count(ring) == 1) &&
          (offset_bytes(ring->cur, ring->start) == 0);
}

#define LOG_DWORDS 0

static inline void
OUT_RING(struct fd_ringbuffer *ring, uint32_t data)
{
   if (LOG_DWORDS) {
      fprintf(stderr, "ring[%p]: OUT_RING   %04x:  %08x\n", ring,
              (uint32_t)(ring->cur - ring->start), data);
   }
   fd_ringbuffer_emit(ring, data);
}

/*
 * NOTE: OUT_RELOC() is 2 dwords (64b) on a5xx+
 */
static inline void
OUT_RELOC(struct fd_ringbuffer *ring, struct fd_bo *bo, uint32_t offset,
          uint64_t orval, int32_t shift)
{
   if (LOG_DWORDS) {
      fprintf(stderr, "ring[%p]: OUT_RELOC   %04x:  %p+%u << %d\n", ring,
              (uint32_t)(ring->cur - ring->start), bo, offset, shift);
   }
   assert(offset < fd_bo_size(bo));

   uint64_t iova = fd_bo_get_iova(bo) + offset;

   if (shift < 0)
      iova >>= -shift;
   else
      iova <<= shift;

   iova |= orval;

   struct fd_reloc reloc = {
         .bo = bo,
         .iova = iova,
         .orval = orval,
         .offset = offset,
         .shift = shift,
   };

   fd_ringbuffer_reloc(ring, &reloc);
}
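
/* Example (a sketch): emitting CP_MEM_WRITE to write one dword into the
 * start of 'bo'.  On a5xx+ the OUT_RELOC() takes 2 dwords (address
 * lo/hi), hence the packet count of 3 (OUT_PKT7() is defined below):
 *
 *    OUT_PKT7(ring, CP_MEM_WRITE, 3);
 *    OUT_RELOC(ring, bo, 0, 0, 0);   // bo+0, no or-value, no shift
 *    OUT_RING(ring, 0x12345678);     // payload dword
 */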

static inline void
OUT_RB(struct fd_ringbuffer *ring, struct fd_ringbuffer *target)
{
   fd_ringbuffer_emit_reloc_ring_full(ring, target, 0);
}

static inline void
BEGIN_RING(struct fd_ringbuffer *ring, uint32_t ndwords)
{
   if (unlikely(ring->cur + ndwords > ring->end))
      fd_ringbuffer_grow(ring, ndwords);
}

static inline void
OUT_PKT0(struct fd_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
{
   BEGIN_RING(ring, cnt + 1);
   OUT_RING(ring, pm4_pkt0_hdr(regindx, cnt));
}

static inline void
OUT_PKT2(struct fd_ringbuffer *ring)
{
   BEGIN_RING(ring, 1);
   OUT_RING(ring, CP_TYPE2_PKT);
}

static inline void
OUT_PKT3(struct fd_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
{
   BEGIN_RING(ring, cnt + 1);
   OUT_RING(ring, CP_TYPE3_PKT | ((cnt - 1) << 16) | ((opcode & 0xFF) << 8));
}

/*
 * Starting with a5xx, pkt4/pkt7 are used instead of pkt0/pkt3
 */

static inline void
OUT_PKT4(struct fd_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
{
   BEGIN_RING(ring, cnt + 1);
   OUT_RING(ring, pm4_pkt4_hdr(regindx, cnt));
}

static inline void
OUT_PKT7(struct fd_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
{
   BEGIN_RING(ring, cnt + 1);
   OUT_RING(ring, pm4_pkt7_hdr(opcode, cnt));
}
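
/* Example (a sketch): writing two consecutive registers with one type-4
 * packet.  REG_EXAMPLE_BASE is a hypothetical offset standing in for a
 * generated REG_A6XX_* define:
 *
 *    OUT_PKT4(ring, REG_EXAMPLE_BASE, 2);
 *    OUT_RING(ring, 0x00000001);   // REG_EXAMPLE_BASE + 0
 *    OUT_RING(ring, 0x00000002);   // REG_EXAMPLE_BASE + 1
 */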

static inline void
OUT_WFI(struct fd_ringbuffer *ring)
{
   OUT_PKT3(ring, CP_WAIT_FOR_IDLE, 1);
   OUT_RING(ring, 0x00000000);
}

static inline void
OUT_WFI5(struct fd_ringbuffer *ring)
{
   OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);
}

#ifdef __cplusplus
} /* end of extern "C" */
#endif

#endif /* FREEDRENO_RINGBUFFER_H_ */