/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include "vc4_qpu.h"

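/**
 * Prints the failure message and the disassembly of the offending
 * instruction to stderr, then aborts.
 */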
static void
fail_instr(uint64_t inst, const char *msg)
{
        fprintf(stderr, "vc4_qpu_validate: %s: ", msg);
        vc4_qpu_disasm(&inst, 1);
        fprintf(stderr, "\n");
        abort();
}

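/**
 * Returns true if either the add or the mul ALU writes its result to write
 * address @w.
 */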
static bool
writes_reg(uint64_t inst, uint32_t w)
{
        return (QPU_GET_FIELD(inst, QPU_WADDR_ADD) == w ||
                QPU_GET_FIELD(inst, QPU_WADDR_MUL) == w);
}

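/**
 * Returns true if any of the four ALU input muxes reads register address @r
 * from physical regfile A or B.  The ignore_a/ignore_b flags restrict the
 * check to a single regfile, and raddr_b is skipped when it holds a small
 * immediate instead of a register address.
 */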
static bool
_reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b)
{
        struct {
                uint32_t mux, addr;
        } src_regs[] = {
                { QPU_GET_FIELD(inst, QPU_ADD_A) },
                { QPU_GET_FIELD(inst, QPU_ADD_B) },
                { QPU_GET_FIELD(inst, QPU_MUL_A) },
                { QPU_GET_FIELD(inst, QPU_MUL_B) },
        };

        /* Branches only reference raddr_a (no mux), and we don't use that
         * feature of branching.
         */
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH)
                return false;

        /* Load immediates don't read any registers. */
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
                return false;

        for (int i = 0; i < ARRAY_SIZE(src_regs); i++) {
                if (!ignore_a &&
                    src_regs[i].mux == QPU_MUX_A &&
                    (QPU_GET_FIELD(inst, QPU_RADDR_A) == r))
                        return true;

                if (!ignore_b &&
                    QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM &&
                    src_regs[i].mux == QPU_MUX_B &&
                    (QPU_GET_FIELD(inst, QPU_RADDR_B) == r))
                        return true;
        }

        return false;
}

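/* Convenience wrappers: check reads of @r in both physical regfiles, in
 * regfile A only, or in regfile B only.
 */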
static bool
reads_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, false, false);
}

static bool
reads_a_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, false, true);
}

static bool
reads_b_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, true, false);
}

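/**
 * Returns true if the instruction writes any of the SFU (special function
 * unit) registers, which result in r4 being written.
 */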
static bool
writes_sfu(uint64_t inst)
{
        return (writes_reg(inst, QPU_W_SFU_RECIP) ||
                writes_reg(inst, QPU_W_SFU_RECIPSQRT) ||
                writes_reg(inst, QPU_W_SFU_EXP) ||
                writes_reg(inst, QPU_W_SFU_LOG));
}

/**
 * Checks for the instruction restrictions from page 37 ("Summary of
 * Instruction Restrictions") of the VideoCore IV 3D Architecture Reference
 * Guide.
 */
void
vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
{
        bool scoreboard_locked = false;
        bool threaded = false;

        /* We don't want to do validation in release builds, but we want to
         * keep compiling the validation code to make sure it doesn't get
         * broken.
         */
#ifndef DEBUG
        return;
#endif

        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                if (sig != QPU_SIG_PROG_END) {
                        if (qpu_inst_is_tlb(inst))
                                scoreboard_locked = true;

                        if (sig == QPU_SIG_THREAD_SWITCH ||
                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
                                threaded = true;
                        }

                        continue;
                }

                /* "The Thread End instruction must not write to either physical
                 *  regfile A or B."
                 */
                if (QPU_GET_FIELD(inst, QPU_WADDR_ADD) < 32 ||
                    QPU_GET_FIELD(inst, QPU_WADDR_MUL) < 32) {
                        fail_instr(inst, "write to phys reg in thread end");
                }

                /* Can't trigger an implicit wait on scoreboard in the program
                 * end instruction.
                 */
                if (qpu_inst_is_tlb(inst) && !scoreboard_locked)
                        fail_instr(inst, "implicit sb wait in program end");

                /* Two delay slots will be executed. */
                assert(i + 2 < num_inst);

                for (int j = i; j < i + 3; j++) {
                        /* "The last three instructions of any program
                         *  (Thread End plus the following two delay-slot
                         *  instructions) must not do varyings read, uniforms
                         *  read or any kind of VPM, VDR, or VDW read or
                         *  write."
                         */
                        if (writes_reg(insts[j], QPU_W_VPM) ||
                            reads_reg(insts[j], QPU_R_VARY) ||
                            reads_reg(insts[j], QPU_R_UNIF) ||
                            reads_reg(insts[j], QPU_R_VPM)) {
                                fail_instr(insts[j], "last 3 instructions "
                                           "using fixed functions");
                        }

                        /* "The Thread End instruction and the following two
                         *  delay slot instructions must not write or read
                         *  address 14 in either regfile A or B."
                         */
                        if (writes_reg(insts[j], 14) ||
                            reads_reg(insts[j], 14)) {
                                fail_instr(insts[j], "last 3 instructions "
                                           "must not use r14");
                        }
                }

                /* "The final program instruction (the second delay slot
                 *  instruction) must not do a TLB Z write."
                 */
                if (writes_reg(insts[i + 2], QPU_W_TLB_Z)) {
                        fail_instr(insts[i + 2], "final instruction doing "
                                   "Z write");
                }
        }

        /* "A scoreboard wait must not occur in the first two instructions of
         *  a fragment shader. This is either the explicit Wait for Scoreboard
         *  signal or an implicit wait with the first tile-buffer read or
         *  write instruction."
         */
        for (int i = 0; i < 2; i++) {
                uint64_t inst = insts[i];

                if (qpu_inst_is_tlb(inst))
                        fail_instr(inst, "sb wait in first two insts");
        }

        /* "If TMU_NOSWAP is written, the write must be three instructions
         *  before the first TMU write instruction.  For example, if
         *  TMU_NOSWAP is written in the first shader instruction, the first
         *  TMU write cannot occur before the 4th shader instruction."
         */
        int last_tmu_noswap = -10;
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];

                if ((i - last_tmu_noswap) <= 3 &&
                    (writes_reg(inst, QPU_W_TMU0_S) ||
                     writes_reg(inst, QPU_W_TMU1_S))) {
                        fail_instr(inst, "TMU write too soon after TMU_NOSWAP");
                }

                if (writes_reg(inst, QPU_W_TMU_NOSWAP))
                        last_tmu_noswap = i;
        }

        /* "An instruction must not read from a location in physical regfile A
         *  or B that was written to by the previous instruction."
         */
        for (int i = 0; i < num_inst - 1; i++) {
                uint64_t inst = insts[i];
                uint32_t add_waddr = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
                uint32_t mul_waddr = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
                uint32_t waddr_a, waddr_b;

                if (inst & QPU_WS) {
                        waddr_b = add_waddr;
                        waddr_a = mul_waddr;
                } else {
                        waddr_a = add_waddr;
                        waddr_b = mul_waddr;
                }

                if ((waddr_a < 32 && reads_a_reg(insts[i + 1], waddr_a)) ||
                    (waddr_b < 32 && reads_b_reg(insts[i + 1], waddr_b))) {
                        fail_instr(insts[i + 1],
                                   "Reads physical reg too soon after write");
                }
        }

        /* "After an SFU lookup instruction, accumulator r4 must not be read
         *  in the following two instructions. Any other instruction that
         *  results in r4 being written (that is, TMU read, TLB read, SFU
         *  lookup) cannot occur in the two instructions following an SFU
         *  lookup."
         */
        int last_sfu_inst = -10;
        for (int i = 0; i < num_inst - 1; i++) {
                uint64_t inst = insts[i];
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                if (i - last_sfu_inst <= 2 &&
                    (writes_sfu(inst) ||
                     sig == QPU_SIG_LOAD_TMU0 ||
                     sig == QPU_SIG_LOAD_TMU1 ||
                     sig == QPU_SIG_COLOR_LOAD)) {
                        fail_instr(inst, "R4 write too soon after SFU write");
                }

                if (writes_sfu(inst))
                        last_sfu_inst = i;
        }

        for (int i = 0; i < num_inst - 1; i++) {
                uint64_t inst = insts[i];

                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM &&
                    QPU_GET_FIELD(inst, QPU_SMALL_IMM) >=
                    QPU_SMALL_IMM_MUL_ROT) {
                        uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
                        uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);

                        /* "The full horizontal vector rotate is only
                         *  available when both of the mul ALU input arguments
                         *  are taken from accumulators r0-r3."
                         */
                        if (mux_a > QPU_MUX_R3 || mux_b > QPU_MUX_R3) {
                                fail_instr(inst,
                                           "MUL rotate using non-accumulator "
                                           "input");
                        }

                        if (QPU_GET_FIELD(inst, QPU_SMALL_IMM) ==
                            QPU_SMALL_IMM_MUL_ROT) {
                                /* "An instruction that does a vector rotate
                                 *  by r5 must not immediately follow an
                                 *  instruction that writes to r5."
                                 */
                                if (i > 0 &&
                                    writes_reg(insts[i - 1], QPU_W_ACC5)) {
                                        fail_instr(inst,
                                                   "vector rotate by r5 "
                                                   "immediately after r5 write");
                                }
                        }

                        /* "An instruction that does a vector rotate must not
                         *  immediately follow an instruction that writes to the
                         *  accumulator that is being rotated."
                         */
                        if (i > 0 &&
                            (writes_reg(insts[i - 1], QPU_W_ACC0 + mux_a) ||
                             writes_reg(insts[i - 1], QPU_W_ACC0 + mux_b))) {
                                fail_instr(inst,
                                           "vector rotate of value "
                                           "written in previous instruction");
                        }
                }
        }

        /* "After an instruction that does a TLB Z write, the multisample mask
         *  must not be read as an instruction input argument in the following
         *  two instructions. The TLB Z write instruction can, however, be
         *  followed immediately by a TLB color write."
         */
        for (int i = 0; i < num_inst - 1; i++) {
                uint64_t inst = insts[i];
                if (writes_reg(inst, QPU_W_TLB_Z) &&
                    (reads_a_reg(insts[i + 1], QPU_R_MS_REV_FLAGS) ||
                     (i + 2 < num_inst &&
                      reads_a_reg(insts[i + 2], QPU_R_MS_REV_FLAGS)))) {
                        fail_instr(inst, "TLB Z write followed by MS mask read");
                }
        }

        /*
         * "A single instruction can only perform a maximum of one of the
         *  following closely coupled peripheral accesses in a single
         *  instruction: TMU write, TMU read, TLB write, TLB read, TLB
         *  combined color read and write, SFU write, Mutex read or Semaphore
         *  access."
         */
        for (int i = 0; i < num_inst - 1; i++) {
                uint64_t inst = insts[i];

                if (qpu_num_sf_accesses(inst) > 1)
                        fail_instr(inst, "Single instruction does multiple "
                                   "peripheral accesses");
        }

        /* "The uniform base pointer can be written (from SIMD element 0) by
         *  the processor to reset the stream. There must be at least two
         *  nonuniform-accessing instructions following a pointer change
         *  before uniforms can be accessed once more."
         */
        int last_unif_pointer_update = -3;
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];
                uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
                uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

                if (reads_reg(inst, QPU_R_UNIF) &&
                    i - last_unif_pointer_update <= 2) {
                        fail_instr(inst,
                                   "uniform read too soon after pointer update");
                }

                if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
                    waddr_mul == QPU_W_UNIFORMS_ADDRESS)
                        last_unif_pointer_update = i;
        }

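        /* Extra restrictions for threaded fragment shaders, where two
         * programs share a QPU and alternate at each thread switch.
         */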
        if (threaded) {
                bool last_thrsw_found = false;
                bool scoreboard_locked = false;
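                /* Track how many texture requests have been issued since the
                 * last thread switch point, and how many from before the
                 * previous switch still have results to be read, so that the
                 * TMU result FIFO is drained in order across switches.
                 */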
                int tex_samples_outstanding = 0;
                int last_tex_samples_outstanding = 0;
                int thrsw_ip = -1;

                for (int i = 0; i < num_inst; i++) {
                        uint64_t inst = insts[i];
                        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                        if (i == thrsw_ip) {
                                /* In order to get texture results back in the
                                 * correct order, before a new thrsw we have
                                 * to read all the texture results from before
                                 * the previous thrsw.
                                 *
                                 * FIXME: Is collecting the remaining results
                                 * during the delay slots OK, or should we do
                                 * this at THRSW signal time?
                                 */
                                if (last_tex_samples_outstanding != 0) {
                                        fail_instr(inst, "THRSW with texture "
                                                   "results from the previous "
                                                   "THRSW still in the FIFO.");
                                }

                                last_tex_samples_outstanding =
                                        tex_samples_outstanding;
                                tex_samples_outstanding = 0;
                        }

                        if (qpu_inst_is_tlb(inst))
                                scoreboard_locked = true;

                        switch (sig) {
                        case QPU_SIG_THREAD_SWITCH:
                        case QPU_SIG_LAST_THREAD_SWITCH:
                                /* No thread switching with the scoreboard
                                 * locked.  Doing so means we may deadlock
                                 * when the other thread tries to lock the
                                 * scoreboard.
                                 */
                                if (scoreboard_locked) {
                                        fail_instr(inst, "THRSW with the "
                                                   "scoreboard locked.");
                                }

                                /* No thread switching after lthrsw, since
                                 * lthrsw means that we get delayed until the
                                 * other shader is ready for us to terminate.
                                 */
                                if (last_thrsw_found) {
                                        fail_instr(inst, "THRSW after a "
                                                   "previous LTHRSW");
                                }

                                if (sig == QPU_SIG_LAST_THREAD_SWITCH)
                                        last_thrsw_found = true;

                                /* No THRSW while we already have a THRSW
                                 * queued.
                                 */
                                if (i < thrsw_ip) {
                                        fail_instr(inst,
                                                   "THRSW with a THRSW queued.");
                                }

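                                /* The switch doesn't take effect until the
                                 * three delay-slot instructions after the
                                 * THRSW signal have issued.
                                 */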
                                thrsw_ip = i + 3;
                                break;

                        case QPU_SIG_LOAD_TMU0:
                        case QPU_SIG_LOAD_TMU1:
                                if (last_tex_samples_outstanding == 0) {
                                        fail_instr(inst, "TMU load with nothing "
                                                   "in the results fifo from "
                                                   "the previous THRSW.");
                                }

                                last_tex_samples_outstanding--;
                                break;
                        }

                        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
                        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
                        if (waddr_add == QPU_W_TMU0_S ||
                            waddr_add == QPU_W_TMU1_S ||
                            waddr_mul == QPU_W_TMU0_S ||
                            waddr_mul == QPU_W_TMU1_S) {
                                tex_samples_outstanding++;
                        }
                }
        }
}