/*
 * Copyright © 2014-2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file v3dx_simulator.c
 *
 * Implements the actual HW interaction between the GL driver's V3D simulator
 * and the simulator.
 *
 * The register headers between V3D versions have conflicting defines, so all
 * register interactions appear in this file and are compiled once per V3D
 * version we support.
 */

#ifdef USE_V3D_SIMULATOR

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#include "v3d_simulator.h"
#include "v3d_simulator_wrapper.h"

#include "util/macros.h"
#include "util/bitscan.h"
#include "drm-uapi/v3d_drm.h"

#define HW_REGISTER_RO(x) (x)
#define HW_REGISTER_RW(x) (x)
#if V3D_VERSION >= 41
#include "libs/core/v3d/registers/4.1.35.0/v3d.h"
#else
#include "libs/core/v3d/registers/3.3.0.0/v3d.h"
#endif

#define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d, reg, val)
#define V3D_READ(reg) v3d_hw_read_reg(v3d, reg)

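/* Invalidates the L3 cache (only present before V3D 4.x) by toggling the
 * flush bit in the GCA cache control register.
 */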
static void
v3d_invalidate_l3(struct v3d_hw *v3d)
{
#if V3D_VERSION < 40
        uint32_t gca_ctrl = V3D_READ(V3D_GCA_CACHE_CTRL);

        V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl | V3D_GCA_CACHE_CTRL_FLUSH_SET);
        V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl & ~V3D_GCA_CACHE_CTRL_FLUSH_SET);
#endif
}

/* Invalidates the L2C cache.  This is a read-only cache for uniforms and
 * instructions.
 */
static void
v3d_invalidate_l2c(struct v3d_hw *v3d)
{
        if (V3D_VERSION >= 33)
                return;

        V3D_WRITE(V3D_CTL_0_L2CACTL,
                  V3D_CTL_0_L2CACTL_L2CCLR_SET |
                  V3D_CTL_0_L2CACTL_L2CENA_SET);
}

enum v3d_l2t_cache_flush_mode {
        V3D_CACHE_FLUSH_MODE_FLUSH,
        V3D_CACHE_FLUSH_MODE_CLEAR,
        V3D_CACHE_FLUSH_MODE_CLEAN,
};

/* Invalidates texture L2 cachelines */
static void
v3d_invalidate_l2t(struct v3d_hw *v3d)
{
        V3D_WRITE(V3D_CTL_0_L2TFLSTA, 0);
        V3D_WRITE(V3D_CTL_0_L2TFLEND, ~0);
        V3D_WRITE(V3D_CTL_0_L2TCACTL,
                  V3D_CTL_0_L2TCACTL_L2TFLS_SET |
                  (V3D_CACHE_FLUSH_MODE_FLUSH << V3D_CTL_0_L2TCACTL_L2TFLM_LSB));
}

/*
 * Waits for V3D_CTL_0_L2TCACTL, used for flushes.
 *
 * FIXME: in a multicore scenario we should pass the core here. The whole
 * wrapper assumes a single core, so it would be better to handle that there.
 */
static UNUSED void v3d_core_wait_l2tcactl(struct v3d_hw *v3d,
                                          uint32_t ctrl)
{
        assert(!(ctrl & ~(V3D_CTL_0_L2TCACTL_TMUWCF_SET | V3D_CTL_0_L2TCACTL_L2TFLS_SET)));

        while (V3D_READ(V3D_CTL_0_L2TCACTL) & ctrl) {
                v3d_hw_tick(v3d);
        }
}

/* Flushes dirty texture cachelines from the L1 write combiner */
static void
v3d_flush_l1td(struct v3d_hw *v3d)
{
        V3D_WRITE(V3D_CTL_0_L2TCACTL,
                  V3D_CTL_0_L2TCACTL_TMUWCF_SET);

        /* Note: the kernel (and previous versions of the simulator wrapper)
         * waits on V3D_CTL_0_L2TCACTL_L2TFLS_SET here, as for the l2t flush.
         * Waiting on TMUWCF seems to make more sense for this flush, but we
         * still need to confirm which one is correct. So far things work
         * fine on the simulator this way.
         */
        v3d_core_wait_l2tcactl(v3d, V3D_CTL_0_L2TCACTL_TMUWCF_SET);
}

/* Flushes dirty texture L2 cachelines */
static void
v3d_flush_l2t(struct v3d_hw *v3d)
{
        V3D_WRITE(V3D_CTL_0_L2TFLSTA, 0);
        V3D_WRITE(V3D_CTL_0_L2TFLEND, ~0);
        V3D_WRITE(V3D_CTL_0_L2TCACTL,
                  V3D_CTL_0_L2TCACTL_L2TFLS_SET |
                  (V3D_CACHE_FLUSH_MODE_CLEAN << V3D_CTL_0_L2TCACTL_L2TFLM_LSB));

        v3d_core_wait_l2tcactl(v3d, V3D_CTL_0_L2TCACTL_L2TFLS_SET);
}

/* Invalidates the slice caches.  These are read-only caches. */
static void
v3d_invalidate_slices(struct v3d_hw *v3d)
{
        V3D_WRITE(V3D_CTL_0_SLCACTL, ~0);
}

static void
v3d_invalidate_caches(struct v3d_hw *v3d)
{
        v3d_invalidate_l3(v3d);
        v3d_invalidate_l2c(v3d);
        v3d_invalidate_l2t(v3d);
        v3d_invalidate_slices(v3d);
}

static uint32_t g_gmp_ofs;
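
/* Reprograms the GMP with the protection table at g_gmp_ofs and busy-waits
 * until the new configuration has finished loading.
 */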
static void
v3d_reload_gmp(struct v3d_hw *v3d)
{
        /* Completely reset the GMP. */
        V3D_WRITE(V3D_GMP_CFG,
                  V3D_GMP_CFG_PROTENABLE_SET);
        V3D_WRITE(V3D_GMP_TABLE_ADDR, g_gmp_ofs);
        V3D_WRITE(V3D_GMP_CLEAR_LOAD, ~0);
        while (V3D_READ(V3D_GMP_STATUS) &
               V3D_GMP_STATUS_CFG_BUSY_SET) {
                ;
        }
}

static UNUSED void
v3d_flush_caches(struct v3d_hw *v3d)
{
        v3d_flush_l1td(v3d);
        v3d_flush_l2t(v3d);
}

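/* Implements the DRM_V3D_SUBMIT_TFU ioctl on the simulator: programs the TFU
 * job registers and ticks the simulator until the conversion count in
 * V3D_TFU_CS advances.
 */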
int
v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d,
                                 struct drm_v3d_submit_tfu *args)
{
        int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET;

        V3D_WRITE(V3D_TFU_IIA, args->iia);
        V3D_WRITE(V3D_TFU_IIS, args->iis);
        V3D_WRITE(V3D_TFU_ICA, args->ica);
        V3D_WRITE(V3D_TFU_IUA, args->iua);
        V3D_WRITE(V3D_TFU_IOA, args->ioa);
        V3D_WRITE(V3D_TFU_IOS, args->ios);
        V3D_WRITE(V3D_TFU_COEF0, args->coef[0]);
        V3D_WRITE(V3D_TFU_COEF1, args->coef[1]);
        V3D_WRITE(V3D_TFU_COEF2, args->coef[2]);
        V3D_WRITE(V3D_TFU_COEF3, args->coef[3]);

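        /* Writing ICFG kicks off the job, so it is written last. */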
        V3D_WRITE(V3D_TFU_ICFG, args->icfg);

        while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) {
                v3d_hw_tick(v3d);
        }

        return 0;
}

#if V3D_VERSION >= 41
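/* Implements the DRM_V3D_SUBMIT_CSD ioctl on the simulator: queues a compute
 * dispatch and ticks the simulator until it completes.
 */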
int
v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
                                 struct drm_v3d_submit_csd *args,
                                 uint32_t gmp_ofs)
{
        int last_completed_jobs = (V3D_READ(V3D_CSD_0_STATUS) &
                                   V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET);
        g_gmp_ofs = gmp_ofs;
        v3d_reload_gmp(v3d);

        v3d_invalidate_caches(v3d);

        V3D_WRITE(V3D_CSD_0_QUEUED_CFG1, args->cfg[1]);
        V3D_WRITE(V3D_CSD_0_QUEUED_CFG2, args->cfg[2]);
        V3D_WRITE(V3D_CSD_0_QUEUED_CFG3, args->cfg[3]);
        V3D_WRITE(V3D_CSD_0_QUEUED_CFG4, args->cfg[4]);
        V3D_WRITE(V3D_CSD_0_QUEUED_CFG5, args->cfg[5]);
        V3D_WRITE(V3D_CSD_0_QUEUED_CFG6, args->cfg[6]);
        /* CFG0 kicks off the job */
        V3D_WRITE(V3D_CSD_0_QUEUED_CFG0, args->cfg[0]);

        /* Now we wait for the dispatch to finish. The safest way is to check
         * whether NUM_COMPLETED_JOBS has increased. Note that despite its
         * name, that register field counts completed dispatches.
         */
        while ((V3D_READ(V3D_CSD_0_STATUS) &
                V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET) == last_completed_jobs) {
                v3d_hw_tick(v3d);
        }

        v3d_flush_caches(v3d);

        return 0;
}
#endif

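/* Implements the DRM_V3D_GET_PARAM ioctl on the simulator: feature queries
 * are answered directly, while register-backed params are read from the
 * simulated hardware.
 */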
int
v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d,
                                struct drm_v3d_get_param *args)
{
        static const uint32_t reg_map[] = {
                [DRM_V3D_PARAM_V3D_UIFCFG] = V3D_HUB_CTL_UIFCFG,
                [DRM_V3D_PARAM_V3D_HUB_IDENT1] = V3D_HUB_CTL_IDENT1,
                [DRM_V3D_PARAM_V3D_HUB_IDENT2] = V3D_HUB_CTL_IDENT2,
                [DRM_V3D_PARAM_V3D_HUB_IDENT3] = V3D_HUB_CTL_IDENT3,
                [DRM_V3D_PARAM_V3D_CORE0_IDENT0] = V3D_CTL_0_IDENT0,
                [DRM_V3D_PARAM_V3D_CORE0_IDENT1] = V3D_CTL_0_IDENT1,
                [DRM_V3D_PARAM_V3D_CORE0_IDENT2] = V3D_CTL_0_IDENT2,
        };

        switch (args->param) {
        case DRM_V3D_PARAM_SUPPORTS_TFU:
                args->value = 1;
                return 0;
        case DRM_V3D_PARAM_SUPPORTS_CSD:
                args->value = V3D_VERSION >= 41;
                return 0;
        case DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH:
                args->value = 1;
                return 0;
        case DRM_V3D_PARAM_SUPPORTS_PERFMON:
                args->value = V3D_VERSION >= 41;
                return 0;
        case DRM_V3D_PARAM_SUPPORTS_MULTISYNC_EXT:
                args->value = 1;
                return 0;
        }

        if (args->param < ARRAY_SIZE(reg_map) && reg_map[args->param]) {
                args->value = V3D_READ(reg_map[args->param]);
                return 0;
        }

        fprintf(stderr, "Unknown DRM_IOCTL_V3D_GET_PARAM(%lld)\n",
                (long long)args->param);
        abort();
}

static struct v3d_hw *v3d_isr_hw;

static void
v3d_isr_core(struct v3d_hw *v3d,
             unsigned core)
{
        /* FIXME: so far we assume just one core, and use only the _0_
         * registers. If we add multi-core support to the simulator, we will
         * need to pass the core as a parameter and choose the proper
         * registers.
         */
        assert(core == 0);
        uint32_t core_status = V3D_READ(V3D_CTL_0_INT_STS);
        V3D_WRITE(V3D_CTL_0_INT_CLR, core_status);

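        /* The binner ran out of memory: allocate a new spill buffer and hand
         * it to the PTB as additional binning memory.
         */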
        if (core_status & V3D_CTL_0_INT_STS_INT_OUTOMEM_SET) {
                uint32_t size = 256 * 1024;
                uint32_t offset = v3d_simulator_get_spill(size);

                v3d_reload_gmp(v3d);

                V3D_WRITE(V3D_PTB_0_BPOA, offset);
                V3D_WRITE(V3D_PTB_0_BPOS, size);
                return;
        }

        if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) {
                fprintf(stderr, "GMP violation at 0x%08x\n",
                        V3D_READ(V3D_GMP_VIO_ADDR));
                abort();
        } else {
                fprintf(stderr,
                        "Unexpected ISR with core status 0x%08x\n",
                        core_status);
        }
        abort();
}

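/* Decodes and logs MMU violation interrupts (write violation, invalid page
 * table entry, CAP exceeded) from the hub status, then aborts.
 */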
static void
handle_mmu_interruptions(struct v3d_hw *v3d,
                         uint32_t hub_status)
{
        bool wrv = hub_status & V3D_HUB_CTL_INT_STS_INT_MMU_WRV_SET;
        bool pti = hub_status & V3D_HUB_CTL_INT_STS_INT_MMU_PTI_SET;
        bool cap = hub_status & V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET;

        if (!(pti || cap || wrv))
                return;

        const char *client = "?";
        uint32_t axi_id = V3D_READ(V3D_MMU_VIO_ID);
        uint32_t va_width = 30;

#if V3D_VERSION >= 41
        static const char *const v3d41_axi_ids[] = {
                "L2T",
                "PTB",
                "PSE",
                "TLB",
                "CLE",
                "TFU",
                "MMU",
                "GMP",
        };

        axi_id = axi_id >> 5;
        if (axi_id < ARRAY_SIZE(v3d41_axi_ids))
                client = v3d41_axi_ids[axi_id];

        uint32_t mmu_debug = V3D_READ(V3D_MMU_DEBUG_INFO);

        va_width += ((mmu_debug & V3D_MMU_DEBUG_INFO_VA_WIDTH_SET)
                     >> V3D_MMU_DEBUG_INFO_VA_WIDTH_LSB);
#endif
        /* Only the top bits (the exact number depends on the generation) of
         * the virtual address are reported in the MMU VIO_ADDR register.
         */
        uint64_t vio_addr = ((uint64_t)V3D_READ(V3D_MMU_VIO_ADDR) <<
                             (va_width - 32));

        /* Difference from the kernel: since we are going to abort after
         * logging, we skip some things the kernel does, like restoring the
         * MMU control bits.
         */

        fprintf(stderr, "MMU error from client %s (%d) at 0x%llx%s%s%s\n",
                client, axi_id, (long long) vio_addr,
                wrv ? ", write violation" : "",
                pti ? ", pte invalid" : "",
                cap ? ", cap exceeded" : "");

        abort();
}

static void
v3d_isr_hub(struct v3d_hw *v3d)
{
        uint32_t hub_status = V3D_READ(V3D_HUB_CTL_INT_STS);

        /* Acknowledge the interrupts we're handling here. */
        V3D_WRITE(V3D_HUB_CTL_INT_CLR, hub_status);

        if (hub_status & V3D_HUB_CTL_INT_STS_INT_TFUC_SET) {
                /* FIXME: we have not been able to trigger this interrupt so
                 * far. We leave the unreachable() here so we notice if it is
                 * ever raised in the future. In any case, handling it would
                 * only involve debug logging.
                 */
                unreachable("TFU Conversion Complete interrupt not handled");
        }

        handle_mmu_interruptions(v3d, hub_status);
}

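/* Top-level simulator interrupt handler: each bit set in hub_status
 * identifies a core (or the hub itself) with a pending interrupt.
 */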
static void
v3d_isr(uint32_t hub_status)
{
        struct v3d_hw *v3d = v3d_isr_hw;
        uint32_t mask = hub_status;

        /* Check the hub_status bits */
        while (mask) {
                unsigned core = u_bit_scan(&mask);

                if (core == v3d_hw_get_hub_core())
                        v3d_isr_hub(v3d);
                else
                        v3d_isr_core(v3d, core);
        }
}

void
v3dX(simulator_init_regs)(struct v3d_hw *v3d)
{
#if V3D_VERSION == 33
        /* Set OVRTMUOUT to match kernel behavior.
         *
         * This means that the texture sampler uniform configuration's tmu
         * output type field is used, instead of using the hardware default
         * behavior based on the texture type.  If you want the default
         * behavior, you can still put "2" in the indirect texture state's
         * output_type field.
         */
        V3D_WRITE(V3D_CTL_0_MISCCFG, V3D_CTL_1_MISCCFG_OVRTMUOUT_SET);
#endif

        /* FIXME: the kernel captures some additional core interrupts here,
         * for tracing. Perhaps we should evaluate doing the same and adding
         * some debug options.
         */
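        /* Writes to INT_MSK_SET mask (disable) interrupts and writes to
         * INT_MSK_CLR unmask them: enable only the interrupts we handle.
         */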
        uint32_t core_interrupts = (V3D_CTL_0_INT_STS_INT_GMPV_SET |
                                    V3D_CTL_0_INT_STS_INT_OUTOMEM_SET);
        V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts);
        V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts);

        uint32_t hub_interrupts =
                (V3D_HUB_CTL_INT_STS_INT_MMU_WRV_SET | /* write violation */
                 V3D_HUB_CTL_INT_STS_INT_MMU_PTI_SET | /* page table invalid */
                 V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET | /* CAP exceeded */
                 V3D_HUB_CTL_INT_STS_INT_TFUC_SET);    /* TFU conversion */

        V3D_WRITE(V3D_HUB_CTL_INT_MSK_SET, ~hub_interrupts);
        V3D_WRITE(V3D_HUB_CTL_INT_MSK_CLR, hub_interrupts);

        v3d_isr_hw = v3d;
        v3d_hw_set_isr(v3d, v3d_isr);
}

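/* Implements the DRM_V3D_SUBMIT_CL ioctl on the simulator: kicks the binner
 * on CT0, waits for the bin frame count to advance, then kicks the renderer
 * on CT1 and waits for the render frame count.
 */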
void
v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d,
                                struct drm_v3d_submit_cl *submit,
                                uint32_t gmp_ofs)
{
        int last_bfc = (V3D_READ(V3D_CLE_0_BFC) &
                        V3D_CLE_0_BFC_BMFCT_SET);

        int last_rfc = (V3D_READ(V3D_CLE_0_RFC) &
                        V3D_CLE_0_RFC_RMFCT_SET);

        g_gmp_ofs = gmp_ofs;
        v3d_reload_gmp(v3d);

        v3d_invalidate_caches(v3d);

        if (submit->qma) {
                V3D_WRITE(V3D_CLE_0_CT0QMA, submit->qma);
                V3D_WRITE(V3D_CLE_0_CT0QMS, submit->qms);
        }
#if V3D_VERSION >= 41
        if (submit->qts) {
                V3D_WRITE(V3D_CLE_0_CT0QTS,
                          V3D_CLE_0_CT0QTS_CTQTSEN_SET |
                          submit->qts);
        }
#endif
        V3D_WRITE(V3D_CLE_0_CT0QBA, submit->bcl_start);
        V3D_WRITE(V3D_CLE_0_CT0QEA, submit->bcl_end);

        /* Wait for bin to complete before firing render.  The kernel's
         * scheduler implements this using the GPU scheduler blocking on the
         * bin fence completing.  (We don't use HW semaphores).
         */
        while ((V3D_READ(V3D_CLE_0_BFC) &
                V3D_CLE_0_BFC_BMFCT_SET) == last_bfc) {
                v3d_hw_tick(v3d);
        }

        v3d_invalidate_caches(v3d);

        V3D_WRITE(V3D_CLE_0_CT1QBA, submit->rcl_start);
        V3D_WRITE(V3D_CLE_0_CT1QEA, submit->rcl_end);

        while ((V3D_READ(V3D_CLE_0_RFC) &
                V3D_CLE_0_RFC_RMFCT_SET) == last_rfc) {
                v3d_hw_tick(v3d);
        }
}

#if V3D_VERSION >= 41
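/* Performance counter register helpers: the counters are 32-bit, and each
 * SRC register packs the event sources for four counters, one byte per
 * counter.
 */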
#define V3D_PCTR_0_PCTR_N(x) (V3D_PCTR_0_PCTR0 + 4 * (x))
#define V3D_PCTR_0_SRC_N(x) (V3D_PCTR_0_SRC_0_3 + 4 * (x))
#define V3D_PCTR_0_SRC_N_SHIFT(x) ((x) * 8)
#define V3D_PCTR_0_SRC_N_MASK(x) (BITFIELD_RANGE(V3D_PCTR_0_SRC_N_SHIFT(x), \
                                                 V3D_PCTR_0_SRC_N_SHIFT(x) + 6))
#endif

void
v3dX(simulator_perfmon_start)(struct v3d_hw *v3d,
                              uint32_t ncounters,
                              uint8_t *events)
{
#if V3D_VERSION >= 41
        int i, j;
        uint32_t source;
        uint32_t mask = BITFIELD_RANGE(0, ncounters);

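        /* Program the event source for each counter, four sources per SRC
         * register, then clear and enable the counters.
         */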
        for (i = 0; i < ncounters; i += 4) {
                source = i / 4;
                uint32_t channels = 0;
                for (j = 0; j < 4 && (i + j) < ncounters; j++)
                        channels |= events[i + j] << V3D_PCTR_0_SRC_N_SHIFT(j);
                V3D_WRITE(V3D_PCTR_0_SRC_N(source), channels);
        }
        V3D_WRITE(V3D_PCTR_0_CLR, mask);
        V3D_WRITE(V3D_PCTR_0_OVERFLOW, mask);
        V3D_WRITE(V3D_PCTR_0_EN, mask);
#endif
}

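/* Reads back the accumulated counter values and disables the counters. */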
void v3dX(simulator_perfmon_stop)(struct v3d_hw *v3d,
                                  uint32_t ncounters,
                                  uint64_t *values)
{
#if V3D_VERSION >= 41
        int i;

        for (i = 0; i < ncounters; i++)
                values[i] += V3D_READ(V3D_PCTR_0_PCTR_N(i));

        V3D_WRITE(V3D_PCTR_0_EN, 0);
#endif
}

#endif /* USE_V3D_SIMULATOR */