/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
        fprintf(stderr, "\n");
}

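/**
 * Appends a QPU instruction to the block's list of generated instructions.
 *
 * last_inst() below returns a pointer to the most recently queued
 * instruction, so the set_last_*() helpers in this file can patch its
 * condition, pack, or signal fields in place.
 */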
static void
queue(struct qblock *block, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(block, struct queued_qpu_inst);
        q->inst = inst;
        list_addtail(&q->link, &block->qpu_inst_list);
}

static uint64_t *
last_inst(struct qblock *block)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)block->qpu_inst_list.prev;
        return &q->inst;
}

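/* Patches the condition code of the ADD (or, below, MUL) operation in the
 * most recently queued instruction.
 */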
static void
set_last_cond_add(struct qblock *block, uint32_t cond)
{
        *last_inst(block) = qpu_set_cond_add(*last_inst(block), cond);
}

static void
set_last_cond_mul(struct qblock *block, uint32_t cond)
{
        *last_inst(block) = qpu_set_cond_mul(*last_inst(block), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}

/**
 * Sets up the VPM read FIFO before we do any VPM read.
 *
 * VPM reads (vertex attribute input) and VPM writes (varyings output) from
 * the QPU reuse the VRI (varying interpolation) block's FIFOs to talk to the
 * VPM block.  In the VS/CS (unlike in the FS), the block starts out
 * uninitialized, and you need to emit setup to the block before any VPM
 * reads/writes.
 *
 * VRI has a FIFO in each direction, with each FIFO able to hold four
 * 32-bit-per-vertex values.  VPM reads come through the read FIFO and VPM
 * writes go through the write FIFO.  The read/write setup values from QPU go
 * through the write FIFO as well, with a sideband signal indicating that
 * they're setup values.  Once a read setup reaches the other side of the
 * FIFO, the VPM block will start asynchronously reading vertex attributes and
 * filling the read FIFO -- that way hopefully the QPU doesn't have to block
 * on reads later.
 *
 * VPM read setup can configure 16 32-bit-per-vertex values to be read at a
 * time, which is 4 vec4s.  If more than that is being read (since we support
 * 8 vec4 vertex attributes), then multiple read setup writes need to be done.
 *
 * The existence of the FIFO makes it seem like you should be able to emit
 * both setups for the 5-8 attribute cases and then do all the attribute
 * reads.  However, once the setup value makes it to the other end of the
 * write FIFO, it will immediately update the VPM block's setup register.
 * That updated setup register would be used for read FIFO fills from then on,
 * breaking whatever remaining VPM values were supposed to be read into the
 * read FIFO from the previous attribute set.
 *
 * As a result, we need to emit the read setup, pull every VPM read value from
 * that setup, and only then emit the second setup if applicable.
 */
static void
setup_for_vpm_read(struct vc4_compile *c, struct qblock *block)
{
        if (c->num_inputs_in_fifo) {
                c->num_inputs_in_fifo--;
                return;
        }

        c->num_inputs_in_fifo = MIN2(c->num_inputs_remaining, 16);

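        /* Queue the read setup word.  A sketch of the encoding, assuming the
         * VideoCore IV generic-block read setup layout: the low bits hold the
         * VPM base address, the 0x00001a00 constant selects horizontal mode
         * with a stride of 1 and 32-bit elements, and bits 23:20 hold the
         * number of elements to read (16 wraps to 0).
         */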
        queue(block,
              qpu_load_imm_ui(qpu_vrsetup(),
                              c->vpm_read_offset |
                              0x00001a00 |
                              ((c->num_inputs_in_fifo & 0xf) << 20)));
        c->num_inputs_remaining -= c->num_inputs_in_fifo;
        c->vpm_read_offset += c->num_inputs_in_fifo;

        c->num_inputs_in_fifo--;
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.  For example, if both operands land in the A file at different
 * addresses, the single raddr field can only name one of them.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.  We reserve ra14/rb14 for this purpose.
 */
static void
fixup_raddr_conflict(struct qblock *block,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     struct qinst *inst, uint64_t *unpack)
{
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                /* Make sure we use the same type of MOV as the instruction,
                 * in case of unpacks.
                 */
                if (qir_is_float_input(inst))
                        queue(block, qpu_a_FMAX(qpu_rb(14), *src0, *src0));
                else
                        queue(block, qpu_a_MOV(qpu_rb(14), *src0));

                /* If we had an unpack on this A-file source, we need to put
                 * it into this MOV, not into the later move from regfile B.
                 */
                if (inst->src[0].pack) {
                        *last_inst(block) |= *unpack;
                        *unpack = 0;
                }
                *src0 = qpu_rb(14);
        } else {
                queue(block, qpu_a_MOV(qpu_ra(14), *src0));
                *src0 = qpu_ra(14);
        }
}

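/**
 * Applies the destination's pack field to the last queued instruction.
 *
 * A MUL-unit pack is requested by also setting the PM bit, which may only be
 * combined with an r4-style unpack.  An add-unit pack requires the
 * destination to be in the A file, so WS (write swap) must not be set.
 */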
static void
set_last_dst_pack(struct qblock *block, struct qinst *inst)
{
        ASSERTED bool had_pm = *last_inst(block) & QPU_PM;
        ASSERTED bool had_ws = *last_inst(block) & QPU_WS;
        ASSERTED uint32_t unpack = QPU_GET_FIELD(*last_inst(block), QPU_UNPACK);

        if (!inst->dst.pack)
                return;

        *last_inst(block) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);

        if (qir_is_mul(inst)) {
                assert(!unpack || had_pm);
                *last_inst(block) |= QPU_PM;
        } else {
                assert(!unpack || !had_pm);
                assert(!had_ws); /* dst must be a-file to pack. */
        }
}

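/**
 * Emits the move of an SFU/TLB-color/TMU result out of the r4 accumulator.
 *
 * These results always land in r4.  If the destination isn't r4 itself, we
 * copy the value out with a MOV, which is also where any condition on the
 * instruction is applied.  If the value stays in r4, there is no ALU op to
 * carry a condition, and we only emit a NOP-destination MOV when the flags
 * still need to be set.
 */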
static void
handle_r4_qpu_write(struct qblock *block, struct qinst *qinst,
                    struct qpu_reg dst)
{
        if (dst.mux != QPU_MUX_R4) {
                queue(block, qpu_a_MOV(dst, qpu_r4()));
                set_last_cond_add(block, qinst->cond);
        } else {
                assert(qinst->cond == QPU_COND_ALWAYS);
                if (qinst->sf)
                        queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
        }
}

static void
vc4_generate_code_block(struct vc4_compile *c,
                        struct qblock *block,
                        struct qpu_reg *temp_registers)
{
        int last_vpm_read_index = -1;

        qir_for_each_inst(qinst, block) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                        [QOP_MMOV] = { QPU_M_V8MIN },

                        [QOP_MIN_NOIMM] = { QPU_A_MIN },
                };

                uint64_t unpack = 0;
                struct qpu_reg src[ARRAY_SIZE(qinst->src)];
                for (int i = 0; i < qir_get_nsrc(qinst); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                        case QFILE_LOAD_IMM:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                setup_for_vpm_read(c, block);
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;

                        case QFILE_FRAG_X:
                                src[i] = qpu_ra(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_Y:
                                src[i] = qpu_rb(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_REV_FLAG:
                                src[i] = qpu_rb(QPU_R_MS_REV_FLAGS);
                                break;
                        case QFILE_QPU_ELEMENT:
                                src[i] = qpu_ra(QPU_R_ELEM_QPU);
                                break;

                        case QFILE_TLB_COLOR_WRITE:
                        case QFILE_TLB_COLOR_WRITE_MS:
                        case QFILE_TLB_Z_WRITE:
                        case QFILE_TLB_STENCIL_SETUP:
                        case QFILE_TEX_S:
                        case QFILE_TEX_S_DIRECT:
                        case QFILE_TEX_T:
                        case QFILE_TEX_R:
                        case QFILE_TEX_B:
                                unreachable("bad qir src file");
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;

                case QFILE_TLB_COLOR_WRITE:
                        dst = qpu_tlbc();
                        break;

                case QFILE_TLB_COLOR_WRITE_MS:
                        dst = qpu_tlbc_ms();
                        break;

                case QFILE_TLB_Z_WRITE:
                        dst = qpu_ra(QPU_W_TLB_Z);
                        break;

                case QFILE_TLB_STENCIL_SETUP:
                        dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
                        break;

                case QFILE_TEX_S:
                case QFILE_TEX_S_DIRECT:
                        dst = qpu_rb(QPU_W_TMU0_S);
                        break;

                case QFILE_TEX_T:
                        dst = qpu_rb(QPU_W_TMU0_T);
                        break;

                case QFILE_TEX_R:
                        dst = qpu_rb(QPU_W_TMU0_R);
                        break;

                case QFILE_TEX_B:
                        dst = qpu_rb(QPU_W_TMU0_B);
                        break;

                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                case QFILE_LOAD_IMM:
                case QFILE_FRAG_X:
                case QFILE_FRAG_Y:
                case QFILE_FRAG_REV_FLAG:
                case QFILE_QPU_ELEMENT:
                        assert(!"not reached");
                        break;
                }

                ASSERTED bool handled_qinst_cond = false;

                switch (qinst->op) {
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                       src[0]) | unpack);
                                break;
                        case QOP_RSQ:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                       src[0]) | unpack);
                                break;
                        case QOP_EXP2:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                       src[0]) | unpack);
                                break;
                        case QOP_LOG2:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                       src[0]) | unpack);
                                break;
                        default:
                                abort();
                        }

                        handle_r4_qpu_write(block, qinst, dst);
                        handled_qinst_cond = true;

                        break;

                case QOP_LOAD_IMM:
                        assert(qinst->src[0].file == QFILE_LOAD_IMM);
                        queue(block, qpu_load_imm_ui(dst, qinst->src[0].index));
                        break;

                case QOP_LOAD_IMM_U2:
                        queue(block, qpu_load_imm_u2(dst, qinst->src[0].index));
                        break;

                case QOP_LOAD_IMM_I2:
                        queue(block, qpu_load_imm_i2(dst, qinst->src[0].index));
                        break;

                case QOP_ROT_MUL:
                        /* Rotation at the hardware level occurs on the inputs
                         * to the MUL unit, and they must be accumulators in
                         * order to have the time necessary to move things.
                         */
                        assert(src[0].mux <= QPU_MUX_R3);

                        queue(block,
                              qpu_m_rot(dst, src[0], qinst->src[1].index -
                                        QPU_SMALL_IMM_MUL_ROT) | unpack);
                        set_last_cond_mul(block, qinst->cond);
                        handled_qinst_cond = true;
                        set_last_dst_pack(block, qinst);
                        break;

                case QOP_MS_MASK:
                        src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(block, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
                                               src[0], src[1]) | unpack);
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_COLOR_LOAD);
                        handle_r4_qpu_write(block, qinst, dst);
                        handled_qinst_cond = true;
                        break;

                case QOP_VARY_ADD_C:
                        queue(block, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;

                case QOP_TEX_RESULT:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_LOAD_TMU0);
                        handle_r4_qpu_write(block, qinst, dst);
                        handled_qinst_cond = true;
                        break;

                case QOP_THRSW:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_THREAD_SWITCH);
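                        /* Remember the last THRSW so vc4_generate_code() can
                         * promote it to LAST_THRSW.
                         */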
                        c->last_thrsw = last_inst(block);
                        break;

                case QOP_BRANCH:
                        /* The branch target will be updated at QPU scheduling
                         * time.
                         */
                        queue(block, (qpu_branch(qinst->cond, 0) |
                                      QPU_BRANCH_REL));
                        handled_qinst_cond = true;
                        break;

                case QOP_UNIFORMS_RESET:
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        queue(block, qpu_a_ADD(qpu_ra(QPU_W_UNIFORMS_ADDRESS),
                                               src[0], src[1]));
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_non_sideband_nsrc(qinst) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        if (qir_is_mul(qinst)) {
                                queue(block, qpu_m_alu2(translate[qinst->op].op,
                                                        dst,
                                                        src[0], src[1]) | unpack);
                                set_last_cond_mul(block, qinst->cond);
                        } else {
                                queue(block, qpu_a_alu2(translate[qinst->op].op,
                                                        dst,
                                                        src[0], src[1]) | unpack);
                                set_last_cond_add(block, qinst->cond);
                        }
                        handled_qinst_cond = true;
                        set_last_dst_pack(block, qinst);

                        break;
                }

                assert(qinst->cond == QPU_COND_ALWAYS ||
                       handled_qinst_cond);

                if (qinst->sf)
                        *last_inst(block) |= QPU_SF;
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qblock *start_block = list_first_entry(&c->blocks,
                                                      struct qblock, link);

        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        if (!temp_registers)
                return;

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                c->num_inputs_remaining = c->num_inputs;
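                /* Prime the VPM write FIFO for varyings output.  Assuming
                 * the same setup-word layout as the read setup above,
                 * 0x00001a00 selects horizontal mode, stride 1, and 32-bit
                 * elements, starting at VPM address 0.
                 */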
                queue(start_block, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        qir_for_each_block(block, c)
                vc4_generate_code_block(c, block, temp_registers);

        /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW.
         *
         * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi)
         * that ensures that a later thread doesn't try to lock the scoreboard
         * and terminate before an earlier-spawned thread on the same QPU, by
         * delaying switching back to the later shader until the earlier one
         * has finished.  Otherwise, if the earlier thread was hitting the
         * same quad, the scoreboard would deadlock.
         */
        if (c->last_thrsw) {
                assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) ==
                       QPU_SIG_THREAD_SWITCH);
                *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) |
                                  QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH,
                                                QPU_SIG));
        }

        uint32_t cycles = qpu_schedule_instructions(c);
        uint32_t inst_count_at_schedule_time = c->qpu_inst_count;

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        /* Make sure there's no existing signal set (like for a small
         * immediate)
         */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_SIG) != QPU_SIG_NONE) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
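        /* The PROG_END signal takes effect after two delay slots, so pad
         * with NOPs for the thread to execute (the fragment-shader
         * scoreboard unlock below rides on the last of them).
         */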
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        cycles += c->qpu_inst_count - inst_count_at_schedule_time;

        if (vc4_debug & VC4_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        cycles);
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}