1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright © 2010 Intel Corporation
3bf215546Sopenharmony_ci * Copyright © 2014-2017 Broadcom
4bf215546Sopenharmony_ci *
5bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
6bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
7bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
8bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
10bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
11bf215546Sopenharmony_ci *
12bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
13bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
14bf215546Sopenharmony_ci * Software.
15bf215546Sopenharmony_ci *
16bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22bf215546Sopenharmony_ci * IN THE SOFTWARE.
23bf215546Sopenharmony_ci */
24bf215546Sopenharmony_ci
25bf215546Sopenharmony_ci/**
26bf215546Sopenharmony_ci * @file
27bf215546Sopenharmony_ci *
28bf215546Sopenharmony_ci * The basic model of the list scheduler is to take a basic block, compute a
29bf215546Sopenharmony_ci * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
30bf215546Sopenharmony_ci * pick a DAG head, then put all the children that are now DAG heads into the
31bf215546Sopenharmony_ci * list of things to schedule.
32bf215546Sopenharmony_ci *
33bf215546Sopenharmony_ci * The goal of scheduling here is to pack pairs of operations together in a
34bf215546Sopenharmony_ci * single QPU instruction.
35bf215546Sopenharmony_ci */
36bf215546Sopenharmony_ci
37bf215546Sopenharmony_ci#include "qpu/qpu_disasm.h"
38bf215546Sopenharmony_ci#include "v3d_compiler.h"
39bf215546Sopenharmony_ci#include "util/ralloc.h"
40bf215546Sopenharmony_ci#include "util/dag.h"
41bf215546Sopenharmony_ci
/* File-local debug switch.
 *
 * NOTE(review): nothing in the visible part of this file reads or writes it;
 * presumably it gates scheduler debug output further down -- confirm.
 */
static bool debug;

/* Forward declaration only.
 *
 * NOTE(review): no definition or use is visible in this part of the file.
 */
struct schedule_node_child;
45bf215546Sopenharmony_ci
/**
 * One node of the scheduling DAG, wrapping a single instruction.
 */
struct schedule_node {
        /* DAG bookkeeping for this node; add_dep() attaches edges to it. */
        struct dag_node dag;
        /* Link in the schedule list that calculate_forward_deps() and
         * calculate_reverse_deps() iterate over.
         */
        struct list_head link;
        /* The instruction this node stands for. */
        struct qinst *inst;

        /* Longest cycles + instruction_latency() of any parent of this node. */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * Cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;
};
67bf215546Sopenharmony_ci
/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 *
 * F is the direction set by calculate_forward_deps(), R the one set by
 * calculate_reverse_deps().
 */
enum direction { F, R };
72bf215546Sopenharmony_ci
/**
 * Dependency-tracking state for one walk over a schedule list.
 *
 * Each last_* pointer names the node most recently recorded against a
 * resource during the current walk (see add_write_dep()); NULL means no such
 * node has been seen yet.  "Most recent" is relative to the walk direction.
 */
struct schedule_state {
        const struct v3d_device_info *devinfo;
        struct dag *dag;
        /* Accumulators, indexed by (mux - V3D_QPU_MUX_R0) or
         * (waddr - V3D_QPU_WADDR_R0).
         */
        struct schedule_node *last_r[6];
        /* Indexed by raddr_a/raddr_b for reads and by a non-magic waddr for
         * writes.
         */
        struct schedule_node *last_rf[64];
        /* Flags: updated by flag writers and thrsw, consulted by flag
         * readers and conditional branches.
         */
        struct schedule_node *last_sf;
        /* Updated by VPMSETUP and by the ldvpm signal. */
        struct schedule_node *last_vpm_read;
        /* Updated by non-reorderable TMU register writes, SYNC* writes and
         * thrsw.
         */
        struct schedule_node *last_tmu_write;
        /* Updated by TMU sequence terminators and thrsw; consulted by
         * wrtmuc and by instructions that wait on the TMU.
         */
        struct schedule_node *last_tmu_config;
        /* Updated by instructions that wait on the TMU and by SYNC* writes. */
        struct schedule_node *last_tmu_read;
        /* Updated by TLB/TLBU writes, ldtlb/ldtlbu, SETMSF/SETREVF and thrsw;
         * consulted by MSF.
         */
        struct schedule_node *last_tlb;
        /* Updated by VPM stores/loads/setup; consulted by VPMWT. */
        struct schedule_node *last_vpm;
        /* Updated by branches and by instructions with a uniform. */
        struct schedule_node *last_unif;
        /* Updated by MULTOP/UMUL24 and thrsw. */
        struct schedule_node *last_rtop;
        /* Updated by UNIFA writes (ver >= 40) and ldunifa/ldunifarf. */
        struct schedule_node *last_unifa;
        /* Walk direction; selects the edge orientation in add_dep(). */
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};
92bf215546Sopenharmony_ci
93bf215546Sopenharmony_cistatic void
94bf215546Sopenharmony_ciadd_dep(struct schedule_state *state,
95bf215546Sopenharmony_ci        struct schedule_node *before,
96bf215546Sopenharmony_ci        struct schedule_node *after,
97bf215546Sopenharmony_ci        bool write)
98bf215546Sopenharmony_ci{
99bf215546Sopenharmony_ci        bool write_after_read = !write && state->dir == R;
100bf215546Sopenharmony_ci        uintptr_t edge_data = write_after_read;
101bf215546Sopenharmony_ci
102bf215546Sopenharmony_ci        if (!before || !after)
103bf215546Sopenharmony_ci                return;
104bf215546Sopenharmony_ci
105bf215546Sopenharmony_ci        assert(before != after);
106bf215546Sopenharmony_ci
107bf215546Sopenharmony_ci        if (state->dir == F)
108bf215546Sopenharmony_ci                dag_add_edge(&before->dag, &after->dag, edge_data);
109bf215546Sopenharmony_ci        else
110bf215546Sopenharmony_ci                dag_add_edge(&after->dag, &before->dag, edge_data);
111bf215546Sopenharmony_ci}
112bf215546Sopenharmony_ci
/**
 * Records that \p after reads something last recorded against \p before.
 *
 * Unlike add_write_dep(), this leaves the tracked "last" node untouched.
 */
static void
add_read_dep(struct schedule_state *state,
             struct schedule_node *before,
             struct schedule_node *after)
{
        const bool is_write = false;

        add_dep(state, before, after, is_write);
}
120bf215546Sopenharmony_ci
/**
 * Records that \p after writes the resource tracked by \p *before, then makes
 * \p after the new tracked node for that resource.
 */
static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        struct schedule_node *previous = *before;

        add_dep(state, previous, after, true);
        *before = after;
}
129bf215546Sopenharmony_ci
130bf215546Sopenharmony_cistatic bool
131bf215546Sopenharmony_ciqpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
132bf215546Sopenharmony_ci{
133bf215546Sopenharmony_ci        if (inst->sig.ldtlb || inst->sig.ldtlbu)
134bf215546Sopenharmony_ci                return true;
135bf215546Sopenharmony_ci
136bf215546Sopenharmony_ci        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
137bf215546Sopenharmony_ci                return false;
138bf215546Sopenharmony_ci
139bf215546Sopenharmony_ci        if (inst->alu.add.magic_write &&
140bf215546Sopenharmony_ci            (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
141bf215546Sopenharmony_ci             inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
142bf215546Sopenharmony_ci                return true;
143bf215546Sopenharmony_ci
144bf215546Sopenharmony_ci        if (inst->alu.mul.magic_write &&
145bf215546Sopenharmony_ci            (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
146bf215546Sopenharmony_ci             inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
147bf215546Sopenharmony_ci                return true;
148bf215546Sopenharmony_ci
149bf215546Sopenharmony_ci        return false;
150bf215546Sopenharmony_ci}
151bf215546Sopenharmony_ci
152bf215546Sopenharmony_cistatic void
153bf215546Sopenharmony_ciprocess_mux_deps(struct schedule_state *state, struct schedule_node *n,
154bf215546Sopenharmony_ci                 enum v3d_qpu_mux mux)
155bf215546Sopenharmony_ci{
156bf215546Sopenharmony_ci        switch (mux) {
157bf215546Sopenharmony_ci        case V3D_QPU_MUX_A:
158bf215546Sopenharmony_ci                add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
159bf215546Sopenharmony_ci                break;
160bf215546Sopenharmony_ci        case V3D_QPU_MUX_B:
161bf215546Sopenharmony_ci                if (!n->inst->qpu.sig.small_imm) {
162bf215546Sopenharmony_ci                        add_read_dep(state,
163bf215546Sopenharmony_ci                                     state->last_rf[n->inst->qpu.raddr_b], n);
164bf215546Sopenharmony_ci                }
165bf215546Sopenharmony_ci                break;
166bf215546Sopenharmony_ci        default:
167bf215546Sopenharmony_ci                add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
168bf215546Sopenharmony_ci                break;
169bf215546Sopenharmony_ci        }
170bf215546Sopenharmony_ci}
171bf215546Sopenharmony_ci
172bf215546Sopenharmony_cistatic bool
173bf215546Sopenharmony_citmu_write_is_sequence_terminator(uint32_t waddr)
174bf215546Sopenharmony_ci{
175bf215546Sopenharmony_ci        switch (waddr) {
176bf215546Sopenharmony_ci        case V3D_QPU_WADDR_TMUS:
177bf215546Sopenharmony_ci        case V3D_QPU_WADDR_TMUSCM:
178bf215546Sopenharmony_ci        case V3D_QPU_WADDR_TMUSF:
179bf215546Sopenharmony_ci        case V3D_QPU_WADDR_TMUSLOD:
180bf215546Sopenharmony_ci        case V3D_QPU_WADDR_TMUA:
181bf215546Sopenharmony_ci        case V3D_QPU_WADDR_TMUAU:
182bf215546Sopenharmony_ci                return true;
183bf215546Sopenharmony_ci        default:
184bf215546Sopenharmony_ci                return false;
185bf215546Sopenharmony_ci        }
186bf215546Sopenharmony_ci}
187bf215546Sopenharmony_ci
188bf215546Sopenharmony_cistatic bool
189bf215546Sopenharmony_cican_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr)
190bf215546Sopenharmony_ci{
191bf215546Sopenharmony_ci        if (devinfo->ver < 40)
192bf215546Sopenharmony_ci                return false;
193bf215546Sopenharmony_ci
194bf215546Sopenharmony_ci        if (tmu_write_is_sequence_terminator(waddr))
195bf215546Sopenharmony_ci                return false;
196bf215546Sopenharmony_ci
197bf215546Sopenharmony_ci        if (waddr == V3D_QPU_WADDR_TMUD)
198bf215546Sopenharmony_ci                return false;
199bf215546Sopenharmony_ci
200bf215546Sopenharmony_ci        return true;
201bf215546Sopenharmony_ci}
202bf215546Sopenharmony_ci
203bf215546Sopenharmony_cistatic void
204bf215546Sopenharmony_ciprocess_waddr_deps(struct schedule_state *state, struct schedule_node *n,
205bf215546Sopenharmony_ci                   uint32_t waddr, bool magic)
206bf215546Sopenharmony_ci{
207bf215546Sopenharmony_ci        if (!magic) {
208bf215546Sopenharmony_ci                add_write_dep(state, &state->last_rf[waddr], n);
209bf215546Sopenharmony_ci        } else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) {
210bf215546Sopenharmony_ci                if (can_reorder_tmu_write(state->devinfo, waddr))
211bf215546Sopenharmony_ci                        add_read_dep(state, state->last_tmu_write, n);
212bf215546Sopenharmony_ci                else
213bf215546Sopenharmony_ci                        add_write_dep(state, &state->last_tmu_write, n);
214bf215546Sopenharmony_ci
215bf215546Sopenharmony_ci                if (tmu_write_is_sequence_terminator(waddr))
216bf215546Sopenharmony_ci                        add_write_dep(state, &state->last_tmu_config, n);
217bf215546Sopenharmony_ci        } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
218bf215546Sopenharmony_ci                /* Handled by v3d_qpu_writes_r4() check. */
219bf215546Sopenharmony_ci        } else {
220bf215546Sopenharmony_ci                switch (waddr) {
221bf215546Sopenharmony_ci                case V3D_QPU_WADDR_R0:
222bf215546Sopenharmony_ci                case V3D_QPU_WADDR_R1:
223bf215546Sopenharmony_ci                case V3D_QPU_WADDR_R2:
224bf215546Sopenharmony_ci                        add_write_dep(state,
225bf215546Sopenharmony_ci                                      &state->last_r[waddr - V3D_QPU_WADDR_R0],
226bf215546Sopenharmony_ci                                      n);
227bf215546Sopenharmony_ci                        break;
228bf215546Sopenharmony_ci                case V3D_QPU_WADDR_R3:
229bf215546Sopenharmony_ci                case V3D_QPU_WADDR_R4:
230bf215546Sopenharmony_ci                case V3D_QPU_WADDR_R5:
231bf215546Sopenharmony_ci                        /* Handled by v3d_qpu_writes_r*() checks below. */
232bf215546Sopenharmony_ci                        break;
233bf215546Sopenharmony_ci
234bf215546Sopenharmony_ci                case V3D_QPU_WADDR_VPM:
235bf215546Sopenharmony_ci                case V3D_QPU_WADDR_VPMU:
236bf215546Sopenharmony_ci                        add_write_dep(state, &state->last_vpm, n);
237bf215546Sopenharmony_ci                        break;
238bf215546Sopenharmony_ci
239bf215546Sopenharmony_ci                case V3D_QPU_WADDR_TLB:
240bf215546Sopenharmony_ci                case V3D_QPU_WADDR_TLBU:
241bf215546Sopenharmony_ci                        add_write_dep(state, &state->last_tlb, n);
242bf215546Sopenharmony_ci                        break;
243bf215546Sopenharmony_ci
244bf215546Sopenharmony_ci                case V3D_QPU_WADDR_SYNC:
245bf215546Sopenharmony_ci                case V3D_QPU_WADDR_SYNCB:
246bf215546Sopenharmony_ci                case V3D_QPU_WADDR_SYNCU:
247bf215546Sopenharmony_ci                        /* For CS barrier(): Sync against any other memory
248bf215546Sopenharmony_ci                         * accesses.  There doesn't appear to be any need for
249bf215546Sopenharmony_ci                         * barriers to affect ALU operations.
250bf215546Sopenharmony_ci                         */
251bf215546Sopenharmony_ci                        add_write_dep(state, &state->last_tmu_write, n);
252bf215546Sopenharmony_ci                        add_write_dep(state, &state->last_tmu_read, n);
253bf215546Sopenharmony_ci                        break;
254bf215546Sopenharmony_ci
255bf215546Sopenharmony_ci                case V3D_QPU_WADDR_UNIFA:
256bf215546Sopenharmony_ci                        if (state->devinfo->ver >= 40)
257bf215546Sopenharmony_ci                                add_write_dep(state, &state->last_unifa, n);
258bf215546Sopenharmony_ci                        break;
259bf215546Sopenharmony_ci
260bf215546Sopenharmony_ci                case V3D_QPU_WADDR_NOP:
261bf215546Sopenharmony_ci                        break;
262bf215546Sopenharmony_ci
263bf215546Sopenharmony_ci                default:
264bf215546Sopenharmony_ci                        fprintf(stderr, "Unknown waddr %d\n", waddr);
265bf215546Sopenharmony_ci                        abort();
266bf215546Sopenharmony_ci                }
267bf215546Sopenharmony_ci        }
268bf215546Sopenharmony_ci}
269bf215546Sopenharmony_ci
270bf215546Sopenharmony_ci/**
271bf215546Sopenharmony_ci * Common code for dependencies that need to be tracked both forward and
272bf215546Sopenharmony_ci * backward.
273bf215546Sopenharmony_ci *
274bf215546Sopenharmony_ci * This is for things like "all reads of r4 have to happen between the r4
275bf215546Sopenharmony_ci * writes that surround them".
276bf215546Sopenharmony_ci */
277bf215546Sopenharmony_cistatic void
278bf215546Sopenharmony_cicalculate_deps(struct schedule_state *state, struct schedule_node *n)
279bf215546Sopenharmony_ci{
280bf215546Sopenharmony_ci        const struct v3d_device_info *devinfo = state->devinfo;
281bf215546Sopenharmony_ci        struct qinst *qinst = n->inst;
282bf215546Sopenharmony_ci        struct v3d_qpu_instr *inst = &qinst->qpu;
283bf215546Sopenharmony_ci        /* If the input and output segments are shared, then all VPM reads to
284bf215546Sopenharmony_ci         * a location need to happen before all writes.  We handle this by
285bf215546Sopenharmony_ci         * serializing all VPM operations for now.
286bf215546Sopenharmony_ci         */
287bf215546Sopenharmony_ci        bool separate_vpm_segment = false;
288bf215546Sopenharmony_ci
289bf215546Sopenharmony_ci        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
290bf215546Sopenharmony_ci                if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
291bf215546Sopenharmony_ci                        add_read_dep(state, state->last_sf, n);
292bf215546Sopenharmony_ci
293bf215546Sopenharmony_ci                /* XXX: BDI */
294bf215546Sopenharmony_ci                /* XXX: BDU */
295bf215546Sopenharmony_ci                /* XXX: ub */
296bf215546Sopenharmony_ci                /* XXX: raddr_a */
297bf215546Sopenharmony_ci
298bf215546Sopenharmony_ci                add_write_dep(state, &state->last_unif, n);
299bf215546Sopenharmony_ci                return;
300bf215546Sopenharmony_ci        }
301bf215546Sopenharmony_ci
302bf215546Sopenharmony_ci        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
303bf215546Sopenharmony_ci
304bf215546Sopenharmony_ci        /* XXX: LOAD_IMM */
305bf215546Sopenharmony_ci
306bf215546Sopenharmony_ci        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
307bf215546Sopenharmony_ci                process_mux_deps(state, n, inst->alu.add.a);
308bf215546Sopenharmony_ci        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
309bf215546Sopenharmony_ci                process_mux_deps(state, n, inst->alu.add.b);
310bf215546Sopenharmony_ci
311bf215546Sopenharmony_ci        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
312bf215546Sopenharmony_ci                process_mux_deps(state, n, inst->alu.mul.a);
313bf215546Sopenharmony_ci        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
314bf215546Sopenharmony_ci                process_mux_deps(state, n, inst->alu.mul.b);
315bf215546Sopenharmony_ci
316bf215546Sopenharmony_ci        switch (inst->alu.add.op) {
317bf215546Sopenharmony_ci        case V3D_QPU_A_VPMSETUP:
318bf215546Sopenharmony_ci                /* Could distinguish read/write by unpacking the uniform. */
319bf215546Sopenharmony_ci                add_write_dep(state, &state->last_vpm, n);
320bf215546Sopenharmony_ci                add_write_dep(state, &state->last_vpm_read, n);
321bf215546Sopenharmony_ci                break;
322bf215546Sopenharmony_ci
323bf215546Sopenharmony_ci        case V3D_QPU_A_STVPMV:
324bf215546Sopenharmony_ci        case V3D_QPU_A_STVPMD:
325bf215546Sopenharmony_ci        case V3D_QPU_A_STVPMP:
326bf215546Sopenharmony_ci                add_write_dep(state, &state->last_vpm, n);
327bf215546Sopenharmony_ci                break;
328bf215546Sopenharmony_ci
329bf215546Sopenharmony_ci        case V3D_QPU_A_LDVPMV_IN:
330bf215546Sopenharmony_ci        case V3D_QPU_A_LDVPMD_IN:
331bf215546Sopenharmony_ci        case V3D_QPU_A_LDVPMG_IN:
332bf215546Sopenharmony_ci        case V3D_QPU_A_LDVPMP:
333bf215546Sopenharmony_ci                if (!separate_vpm_segment)
334bf215546Sopenharmony_ci                        add_write_dep(state, &state->last_vpm, n);
335bf215546Sopenharmony_ci                break;
336bf215546Sopenharmony_ci
337bf215546Sopenharmony_ci        case V3D_QPU_A_VPMWT:
338bf215546Sopenharmony_ci                add_read_dep(state, state->last_vpm, n);
339bf215546Sopenharmony_ci                break;
340bf215546Sopenharmony_ci
341bf215546Sopenharmony_ci        case V3D_QPU_A_MSF:
342bf215546Sopenharmony_ci                add_read_dep(state, state->last_tlb, n);
343bf215546Sopenharmony_ci                break;
344bf215546Sopenharmony_ci
345bf215546Sopenharmony_ci        case V3D_QPU_A_SETMSF:
346bf215546Sopenharmony_ci        case V3D_QPU_A_SETREVF:
347bf215546Sopenharmony_ci                add_write_dep(state, &state->last_tlb, n);
348bf215546Sopenharmony_ci                break;
349bf215546Sopenharmony_ci
350bf215546Sopenharmony_ci        default:
351bf215546Sopenharmony_ci                break;
352bf215546Sopenharmony_ci        }
353bf215546Sopenharmony_ci
354bf215546Sopenharmony_ci        switch (inst->alu.mul.op) {
355bf215546Sopenharmony_ci        case V3D_QPU_M_MULTOP:
356bf215546Sopenharmony_ci        case V3D_QPU_M_UMUL24:
357bf215546Sopenharmony_ci                /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
358bf215546Sopenharmony_ci                 * resets it to 0.  We could possibly reorder umul24s relative
359bf215546Sopenharmony_ci                 * to each other, but for now just keep all the MUL parts in
360bf215546Sopenharmony_ci                 * order.
361bf215546Sopenharmony_ci                 */
362bf215546Sopenharmony_ci                add_write_dep(state, &state->last_rtop, n);
363bf215546Sopenharmony_ci                break;
364bf215546Sopenharmony_ci        default:
365bf215546Sopenharmony_ci                break;
366bf215546Sopenharmony_ci        }
367bf215546Sopenharmony_ci
368bf215546Sopenharmony_ci        if (inst->alu.add.op != V3D_QPU_A_NOP) {
369bf215546Sopenharmony_ci                process_waddr_deps(state, n, inst->alu.add.waddr,
370bf215546Sopenharmony_ci                                   inst->alu.add.magic_write);
371bf215546Sopenharmony_ci        }
372bf215546Sopenharmony_ci        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
373bf215546Sopenharmony_ci                process_waddr_deps(state, n, inst->alu.mul.waddr,
374bf215546Sopenharmony_ci                                   inst->alu.mul.magic_write);
375bf215546Sopenharmony_ci        }
376bf215546Sopenharmony_ci        if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
377bf215546Sopenharmony_ci                process_waddr_deps(state, n, inst->sig_addr,
378bf215546Sopenharmony_ci                                   inst->sig_magic);
379bf215546Sopenharmony_ci        }
380bf215546Sopenharmony_ci
381bf215546Sopenharmony_ci        if (v3d_qpu_writes_r3(devinfo, inst))
382bf215546Sopenharmony_ci                add_write_dep(state, &state->last_r[3], n);
383bf215546Sopenharmony_ci        if (v3d_qpu_writes_r4(devinfo, inst))
384bf215546Sopenharmony_ci                add_write_dep(state, &state->last_r[4], n);
385bf215546Sopenharmony_ci        if (v3d_qpu_writes_r5(devinfo, inst))
386bf215546Sopenharmony_ci                add_write_dep(state, &state->last_r[5], n);
387bf215546Sopenharmony_ci
388bf215546Sopenharmony_ci        /* If we add any more dependencies here we should consider whether we
389bf215546Sopenharmony_ci         * also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
390bf215546Sopenharmony_ci         */
391bf215546Sopenharmony_ci        if (inst->sig.thrsw) {
392bf215546Sopenharmony_ci                /* All accumulator contents and flags are undefined after the
393bf215546Sopenharmony_ci                 * switch.
394bf215546Sopenharmony_ci                 */
395bf215546Sopenharmony_ci                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
396bf215546Sopenharmony_ci                        add_write_dep(state, &state->last_r[i], n);
397bf215546Sopenharmony_ci                add_write_dep(state, &state->last_sf, n);
398bf215546Sopenharmony_ci                add_write_dep(state, &state->last_rtop, n);
399bf215546Sopenharmony_ci
400bf215546Sopenharmony_ci                /* Scoreboard-locking operations have to stay after the last
401bf215546Sopenharmony_ci                 * thread switch.
402bf215546Sopenharmony_ci                 */
403bf215546Sopenharmony_ci                add_write_dep(state, &state->last_tlb, n);
404bf215546Sopenharmony_ci
405bf215546Sopenharmony_ci                add_write_dep(state, &state->last_tmu_write, n);
406bf215546Sopenharmony_ci                add_write_dep(state, &state->last_tmu_config, n);
407bf215546Sopenharmony_ci        }
408bf215546Sopenharmony_ci
409bf215546Sopenharmony_ci        if (v3d_qpu_waits_on_tmu(inst)) {
410bf215546Sopenharmony_ci                /* TMU loads are coming from a FIFO, so ordering is important.
411bf215546Sopenharmony_ci                 */
412bf215546Sopenharmony_ci                add_write_dep(state, &state->last_tmu_read, n);
413bf215546Sopenharmony_ci                /* Keep TMU loads after their TMU lookup terminator */
414bf215546Sopenharmony_ci                add_read_dep(state, state->last_tmu_config, n);
415bf215546Sopenharmony_ci        }
416bf215546Sopenharmony_ci
417bf215546Sopenharmony_ci        /* Allow wrtmuc to be reordered with other instructions in the
418bf215546Sopenharmony_ci         * same TMU sequence by using a read dependency on the last TMU
419bf215546Sopenharmony_ci         * sequence terminator.
420bf215546Sopenharmony_ci         */
421bf215546Sopenharmony_ci        if (inst->sig.wrtmuc)
422bf215546Sopenharmony_ci                add_read_dep(state, state->last_tmu_config, n);
423bf215546Sopenharmony_ci
424bf215546Sopenharmony_ci        if (inst->sig.ldtlb | inst->sig.ldtlbu)
425bf215546Sopenharmony_ci                add_write_dep(state, &state->last_tlb, n);
426bf215546Sopenharmony_ci
427bf215546Sopenharmony_ci        if (inst->sig.ldvpm) {
428bf215546Sopenharmony_ci                add_write_dep(state, &state->last_vpm_read, n);
429bf215546Sopenharmony_ci
430bf215546Sopenharmony_ci                /* At least for now, we're doing shared I/O segments, so queue
431bf215546Sopenharmony_ci                 * all writes after all reads.
432bf215546Sopenharmony_ci                 */
433bf215546Sopenharmony_ci                if (!separate_vpm_segment)
434bf215546Sopenharmony_ci                        add_write_dep(state, &state->last_vpm, n);
435bf215546Sopenharmony_ci        }
436bf215546Sopenharmony_ci
437bf215546Sopenharmony_ci        /* inst->sig.ldunif or sideband uniform read */
438bf215546Sopenharmony_ci        if (vir_has_uniform(qinst))
439bf215546Sopenharmony_ci                add_write_dep(state, &state->last_unif, n);
440bf215546Sopenharmony_ci
441bf215546Sopenharmony_ci        /* Both unifa and ldunifa must preserve ordering */
442bf215546Sopenharmony_ci        if (inst->sig.ldunifa || inst->sig.ldunifarf)
443bf215546Sopenharmony_ci                add_write_dep(state, &state->last_unifa, n);
444bf215546Sopenharmony_ci
445bf215546Sopenharmony_ci        if (v3d_qpu_reads_flags(inst))
446bf215546Sopenharmony_ci                add_read_dep(state, state->last_sf, n);
447bf215546Sopenharmony_ci        if (v3d_qpu_writes_flags(inst))
448bf215546Sopenharmony_ci                add_write_dep(state, &state->last_sf, n);
449bf215546Sopenharmony_ci}
450bf215546Sopenharmony_ci
451bf215546Sopenharmony_cistatic void
452bf215546Sopenharmony_cicalculate_forward_deps(struct v3d_compile *c, struct dag *dag,
453bf215546Sopenharmony_ci                       struct list_head *schedule_list)
454bf215546Sopenharmony_ci{
455bf215546Sopenharmony_ci        struct schedule_state state;
456bf215546Sopenharmony_ci
457bf215546Sopenharmony_ci        memset(&state, 0, sizeof(state));
458bf215546Sopenharmony_ci        state.dag = dag;
459bf215546Sopenharmony_ci        state.devinfo = c->devinfo;
460bf215546Sopenharmony_ci        state.dir = F;
461bf215546Sopenharmony_ci
462bf215546Sopenharmony_ci        list_for_each_entry(struct schedule_node, node, schedule_list, link)
463bf215546Sopenharmony_ci                calculate_deps(&state, node);
464bf215546Sopenharmony_ci}
465bf215546Sopenharmony_ci
466bf215546Sopenharmony_cistatic void
467bf215546Sopenharmony_cicalculate_reverse_deps(struct v3d_compile *c, struct dag *dag,
468bf215546Sopenharmony_ci                       struct list_head *schedule_list)
469bf215546Sopenharmony_ci{
470bf215546Sopenharmony_ci        struct schedule_state state;
471bf215546Sopenharmony_ci
472bf215546Sopenharmony_ci        memset(&state, 0, sizeof(state));
473bf215546Sopenharmony_ci        state.dag = dag;
474bf215546Sopenharmony_ci        state.devinfo = c->devinfo;
475bf215546Sopenharmony_ci        state.dir = R;
476bf215546Sopenharmony_ci
477bf215546Sopenharmony_ci        list_for_each_entry_rev(struct schedule_node, node, schedule_list,
478bf215546Sopenharmony_ci                                link) {
479bf215546Sopenharmony_ci                calculate_deps(&state, (struct schedule_node *)node);
480bf215546Sopenharmony_ci        }
481bf215546Sopenharmony_ci}
482bf215546Sopenharmony_ci
/**
 * Bookkeeping consulted while choosing the next instruction.
 *
 * NOTE(review): only tick, last_magic_sfu_write_tick and last_ldvary_tick are
 * used in the visible part of this file; the remaining fields are used further
 * down and their meaning should be confirmed there.
 */
struct choose_scoreboard {
        struct dag *dag;
        /* Current tick; the last_*_tick fields are compared against it. */
        int tick;
        /* An R4 read is rejected while tick - this value is <= 2
         * (see mux_reads_too_soon()).
         */
        int last_magic_sfu_write_tick;
        int last_stallable_sfu_reg;
        int last_stallable_sfu_tick;
        /* An R5 read is rejected while tick - this value is <= 1
         * (see mux_reads_too_soon()).
         */
        int last_ldvary_tick;
        int last_unifa_write_tick;
        int last_uniforms_reset_tick;
        int last_thrsw_tick;
        int last_branch_tick;
        int last_setmsf_tick;
        bool first_thrsw_emitted;
        bool last_thrsw_emitted;
        bool fixup_ldvary;
        int ldvary_count;
};
500bf215546Sopenharmony_ci
501bf215546Sopenharmony_cistatic bool
502bf215546Sopenharmony_cimux_reads_too_soon(struct choose_scoreboard *scoreboard,
503bf215546Sopenharmony_ci                   const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
504bf215546Sopenharmony_ci{
505bf215546Sopenharmony_ci        switch (mux) {
506bf215546Sopenharmony_ci        case V3D_QPU_MUX_R4:
507bf215546Sopenharmony_ci                if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2)
508bf215546Sopenharmony_ci                        return true;
509bf215546Sopenharmony_ci                break;
510bf215546Sopenharmony_ci
511bf215546Sopenharmony_ci        case V3D_QPU_MUX_R5:
512bf215546Sopenharmony_ci                if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
513bf215546Sopenharmony_ci                        return true;
514bf215546Sopenharmony_ci                break;
515bf215546Sopenharmony_ci        default:
516bf215546Sopenharmony_ci                break;
517bf215546Sopenharmony_ci        }
518bf215546Sopenharmony_ci
519bf215546Sopenharmony_ci        return false;
520bf215546Sopenharmony_ci}
521bf215546Sopenharmony_ci
522bf215546Sopenharmony_cistatic bool
523bf215546Sopenharmony_cireads_too_soon_after_write(struct choose_scoreboard *scoreboard,
524bf215546Sopenharmony_ci                           struct qinst *qinst)
525bf215546Sopenharmony_ci{
526bf215546Sopenharmony_ci        const struct v3d_qpu_instr *inst = &qinst->qpu;
527bf215546Sopenharmony_ci
528bf215546Sopenharmony_ci        /* XXX: Branching off of raddr. */
529bf215546Sopenharmony_ci        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
530bf215546Sopenharmony_ci                return false;
531bf215546Sopenharmony_ci
532bf215546Sopenharmony_ci        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
533bf215546Sopenharmony_ci
534bf215546Sopenharmony_ci        if (inst->alu.add.op != V3D_QPU_A_NOP) {
535bf215546Sopenharmony_ci                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
536bf215546Sopenharmony_ci                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
537bf215546Sopenharmony_ci                        return true;
538bf215546Sopenharmony_ci                }
539bf215546Sopenharmony_ci                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
540bf215546Sopenharmony_ci                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
541bf215546Sopenharmony_ci                        return true;
542bf215546Sopenharmony_ci                }
543bf215546Sopenharmony_ci        }
544bf215546Sopenharmony_ci
545bf215546Sopenharmony_ci        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
546bf215546Sopenharmony_ci                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
547bf215546Sopenharmony_ci                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
548bf215546Sopenharmony_ci                        return true;
549bf215546Sopenharmony_ci                }
550bf215546Sopenharmony_ci                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
551bf215546Sopenharmony_ci                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
552bf215546Sopenharmony_ci                        return true;
553bf215546Sopenharmony_ci                }
554bf215546Sopenharmony_ci        }
555bf215546Sopenharmony_ci
556bf215546Sopenharmony_ci        /* XXX: imm */
557bf215546Sopenharmony_ci
558bf215546Sopenharmony_ci        return false;
559bf215546Sopenharmony_ci}
560bf215546Sopenharmony_ci
561bf215546Sopenharmony_cistatic bool
562bf215546Sopenharmony_ciwrites_too_soon_after_write(const struct v3d_device_info *devinfo,
563bf215546Sopenharmony_ci                            struct choose_scoreboard *scoreboard,
564bf215546Sopenharmony_ci                            struct qinst *qinst)
565bf215546Sopenharmony_ci{
566bf215546Sopenharmony_ci        const struct v3d_qpu_instr *inst = &qinst->qpu;
567bf215546Sopenharmony_ci
568bf215546Sopenharmony_ci        /* Don't schedule any other r4 write too soon after an SFU write.
569bf215546Sopenharmony_ci         * This would normally be prevented by dependency tracking, but might
570bf215546Sopenharmony_ci         * occur if a dead SFU computation makes it to scheduling.
571bf215546Sopenharmony_ci         */
572bf215546Sopenharmony_ci        if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 &&
573bf215546Sopenharmony_ci            v3d_qpu_writes_r4(devinfo, inst))
574bf215546Sopenharmony_ci                return true;
575bf215546Sopenharmony_ci
576bf215546Sopenharmony_ci        return false;
577bf215546Sopenharmony_ci}
578bf215546Sopenharmony_ci
579bf215546Sopenharmony_cistatic bool
580bf215546Sopenharmony_ciscoreboard_is_locked(struct choose_scoreboard *scoreboard,
581bf215546Sopenharmony_ci                     bool lock_scoreboard_on_first_thrsw)
582bf215546Sopenharmony_ci{
583bf215546Sopenharmony_ci        if (lock_scoreboard_on_first_thrsw) {
584bf215546Sopenharmony_ci                return scoreboard->first_thrsw_emitted &&
585bf215546Sopenharmony_ci                       scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
586bf215546Sopenharmony_ci        }
587bf215546Sopenharmony_ci
588bf215546Sopenharmony_ci        return scoreboard->last_thrsw_emitted &&
589bf215546Sopenharmony_ci               scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
590bf215546Sopenharmony_ci}
591bf215546Sopenharmony_ci
592bf215546Sopenharmony_cistatic bool
593bf215546Sopenharmony_cipixel_scoreboard_too_soon(struct v3d_compile *c,
594bf215546Sopenharmony_ci                          struct choose_scoreboard *scoreboard,
595bf215546Sopenharmony_ci                          const struct v3d_qpu_instr *inst)
596bf215546Sopenharmony_ci{
597bf215546Sopenharmony_ci        return qpu_inst_is_tlb(inst) &&
598bf215546Sopenharmony_ci               !scoreboard_is_locked(scoreboard,
599bf215546Sopenharmony_ci                                     c->lock_scoreboard_on_first_thrsw);
600bf215546Sopenharmony_ci}
601bf215546Sopenharmony_ci
602bf215546Sopenharmony_cistatic bool
603bf215546Sopenharmony_ciqpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
604bf215546Sopenharmony_ci                        uint32_t waddr) {
605bf215546Sopenharmony_ci
606bf215546Sopenharmony_ci        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
607bf215546Sopenharmony_ci           return false;
608bf215546Sopenharmony_ci
609bf215546Sopenharmony_ci        if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
610bf215546Sopenharmony_ci            inst->raddr_a == waddr)
611bf215546Sopenharmony_ci              return true;
612bf215546Sopenharmony_ci
613bf215546Sopenharmony_ci        if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
614bf215546Sopenharmony_ci            !inst->sig.small_imm && (inst->raddr_b == waddr))
615bf215546Sopenharmony_ci              return true;
616bf215546Sopenharmony_ci
617bf215546Sopenharmony_ci        return false;
618bf215546Sopenharmony_ci}
619bf215546Sopenharmony_ci
620bf215546Sopenharmony_cistatic bool
621bf215546Sopenharmony_cimux_read_stalls(struct choose_scoreboard *scoreboard,
622bf215546Sopenharmony_ci                const struct v3d_qpu_instr *inst)
623bf215546Sopenharmony_ci{
624bf215546Sopenharmony_ci        return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
625bf215546Sopenharmony_ci                qpu_instruction_uses_rf(inst,
626bf215546Sopenharmony_ci                                        scoreboard->last_stallable_sfu_reg);
627bf215546Sopenharmony_ci}
628bf215546Sopenharmony_ci
/* Upper bound for schedule priorities.  Subtracting this maximum from the
 * priority of an instruction that stalls produces a negative priority, so
 * instructions that stall rank below regular instructions.
 */
#define MAX_SCHEDULE_PRIORITY 16

/**
 * Returns the base scheduling priority of @inst.
 *
 * TLB instructions get the lowest score (0); every other instruction gets
 * the baseline score.  @devinfo is only referenced by the disabled TMU
 * heuristics below.
 */
static int
get_instruction_priority(const struct v3d_device_info *devinfo,
                         const struct v3d_qpu_instr *inst)
{
        uint32_t score = 0;

        /* TLB operations go as late as possible, to get more parallelism
         * between shaders.
         */
        if (qpu_inst_is_tlb(inst))
                return score;
        score++;

        /* Empirical testing shows that using priorities to hide the latency
         * of TMU operations when scheduling QPU leads to slightly worse
         * performance, even at 2 threads.  We think this is because thread
         * switching is already quite effective at hiding latency, and NIR
         * scheduling (and possibly TMU pipelining too) are sufficient to
         * hide TMU latency, so piling up on that here doesn't provide any
         * benefits and instead may cause us to postpone critical paths that
         * depend on the TMU results.
         */
#if 0
        /* Schedule texture read results collection late to hide latency. */
        if (v3d_qpu_waits_on_tmu(inst))
                return score;
        score++;
#endif

        /* Score for everything that isn't otherwise special. */
        const uint32_t baseline = score;
        score++;

#if 0
        /* Schedule texture read setup early to hide their latency better. */
        if (v3d_qpu_writes_tmu(devinfo, inst))
                return score;
        score++;
#endif

        /* If this fires, MAX_SCHEDULE_PRIORITY needs to be raised. */
        assert(score < MAX_SCHEDULE_PRIORITY);

        return baseline;
}
680bf215546Sopenharmony_ci
/* Peripheral access flags, one bit each.  qpu_peripherals() ORs them into
 * the mask describing what a single instruction touches.
 */
enum {
        /* VPM */
        V3D_PERIPHERAL_VPM_READ           = 0x001,
        V3D_PERIPHERAL_VPM_WRITE          = 0x002,
        V3D_PERIPHERAL_VPM_WAIT           = 0x004,
        /* SFU */
        V3D_PERIPHERAL_SFU                = 0x008,
        /* TMU */
        V3D_PERIPHERAL_TMU_WRITE          = 0x010,
        V3D_PERIPHERAL_TMU_READ           = 0x020,
        V3D_PERIPHERAL_TMU_WAIT           = 0x040,
        V3D_PERIPHERAL_TMU_WRTMUC_SIG     = 0x080,
        /* TSY */
        V3D_PERIPHERAL_TSY                = 0x100,
        /* TLB */
        V3D_PERIPHERAL_TLB                = 0x200,
};
693bf215546Sopenharmony_ci
694bf215546Sopenharmony_cistatic uint32_t
695bf215546Sopenharmony_ciqpu_peripherals(const struct v3d_device_info *devinfo,
696bf215546Sopenharmony_ci                const struct v3d_qpu_instr *inst)
697bf215546Sopenharmony_ci{
698bf215546Sopenharmony_ci        uint32_t result = 0;
699bf215546Sopenharmony_ci        if (v3d_qpu_reads_vpm(inst))
700bf215546Sopenharmony_ci                result |= V3D_PERIPHERAL_VPM_READ;
701bf215546Sopenharmony_ci        if (v3d_qpu_writes_vpm(inst))
702bf215546Sopenharmony_ci                result |= V3D_PERIPHERAL_VPM_WRITE;
703bf215546Sopenharmony_ci        if (v3d_qpu_waits_vpm(inst))
704bf215546Sopenharmony_ci                result |= V3D_PERIPHERAL_VPM_WAIT;
705bf215546Sopenharmony_ci
706bf215546Sopenharmony_ci        if (v3d_qpu_writes_tmu(devinfo, inst))
707bf215546Sopenharmony_ci                result |= V3D_PERIPHERAL_TMU_WRITE;
708bf215546Sopenharmony_ci        if (inst->sig.ldtmu)
709bf215546Sopenharmony_ci                result |= V3D_PERIPHERAL_TMU_READ;
710bf215546Sopenharmony_ci        if (inst->sig.wrtmuc)
711bf215546Sopenharmony_ci                result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG;
712bf215546Sopenharmony_ci
713bf215546Sopenharmony_ci        if (v3d_qpu_uses_sfu(inst))
714bf215546Sopenharmony_ci                result |= V3D_PERIPHERAL_SFU;
715bf215546Sopenharmony_ci
716bf215546Sopenharmony_ci        if (v3d_qpu_uses_tlb(inst))
717bf215546Sopenharmony_ci                result |= V3D_PERIPHERAL_TLB;
718bf215546Sopenharmony_ci
719bf215546Sopenharmony_ci        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
720bf215546Sopenharmony_ci                if (inst->alu.add.op != V3D_QPU_A_NOP &&
721bf215546Sopenharmony_ci                    inst->alu.add.magic_write &&
722bf215546Sopenharmony_ci                    v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) {
723bf215546Sopenharmony_ci                        result |= V3D_PERIPHERAL_TSY;
724bf215546Sopenharmony_ci                }
725bf215546Sopenharmony_ci
726bf215546Sopenharmony_ci                if (inst->alu.add.op == V3D_QPU_A_TMUWT)
727bf215546Sopenharmony_ci                        result |= V3D_PERIPHERAL_TMU_WAIT;
728bf215546Sopenharmony_ci        }
729bf215546Sopenharmony_ci
730bf215546Sopenharmony_ci        return result;
731bf215546Sopenharmony_ci}
732bf215546Sopenharmony_ci
733bf215546Sopenharmony_cistatic bool
734bf215546Sopenharmony_ciqpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
735bf215546Sopenharmony_ci                                 const struct v3d_qpu_instr *a,
736bf215546Sopenharmony_ci                                 const struct v3d_qpu_instr *b)
737bf215546Sopenharmony_ci{
738bf215546Sopenharmony_ci        const uint32_t a_peripherals = qpu_peripherals(devinfo, a);
739bf215546Sopenharmony_ci        const uint32_t b_peripherals = qpu_peripherals(devinfo, b);
740bf215546Sopenharmony_ci
741bf215546Sopenharmony_ci        /* We can always do one peripheral access per instruction. */
742bf215546Sopenharmony_ci        if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1)
743bf215546Sopenharmony_ci                return true;
744bf215546Sopenharmony_ci
745bf215546Sopenharmony_ci        if (devinfo->ver < 41)
746bf215546Sopenharmony_ci                return false;
747bf215546Sopenharmony_ci
748bf215546Sopenharmony_ci        /* V3D 4.1+ allow WRTMUC signal with TMU register write (other than
749bf215546Sopenharmony_ci         * tmuc).
750bf215546Sopenharmony_ci         */
751bf215546Sopenharmony_ci        if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
752bf215546Sopenharmony_ci            b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
753bf215546Sopenharmony_ci                return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
754bf215546Sopenharmony_ci        }
755bf215546Sopenharmony_ci
756bf215546Sopenharmony_ci        if (a_peripherals == V3D_PERIPHERAL_TMU_WRITE &&
757bf215546Sopenharmony_ci            b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG) {
758bf215546Sopenharmony_ci                return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
759bf215546Sopenharmony_ci        }
760bf215546Sopenharmony_ci
761bf215546Sopenharmony_ci        /* V3D 4.1+ allows TMU read with VPM read/write. */
762bf215546Sopenharmony_ci        if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
763bf215546Sopenharmony_ci            (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
764bf215546Sopenharmony_ci             b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
765bf215546Sopenharmony_ci                return true;
766bf215546Sopenharmony_ci        }
767bf215546Sopenharmony_ci        if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
768bf215546Sopenharmony_ci            (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
769bf215546Sopenharmony_ci             a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
770bf215546Sopenharmony_ci                return true;
771bf215546Sopenharmony_ci        }
772bf215546Sopenharmony_ci
773bf215546Sopenharmony_ci        return false;
774bf215546Sopenharmony_ci}
775bf215546Sopenharmony_ci
776bf215546Sopenharmony_ci/* Compute a bitmask of which rf registers are used between
777bf215546Sopenharmony_ci * the two instructions.
778bf215546Sopenharmony_ci */
779bf215546Sopenharmony_cistatic uint64_t
780bf215546Sopenharmony_ciqpu_raddrs_used(const struct v3d_qpu_instr *a,
781bf215546Sopenharmony_ci                const struct v3d_qpu_instr *b)
782bf215546Sopenharmony_ci{
783bf215546Sopenharmony_ci        assert(a->type == V3D_QPU_INSTR_TYPE_ALU);
784bf215546Sopenharmony_ci        assert(b->type == V3D_QPU_INSTR_TYPE_ALU);
785bf215546Sopenharmony_ci
786bf215546Sopenharmony_ci        uint64_t raddrs_used = 0;
787bf215546Sopenharmony_ci        if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
788bf215546Sopenharmony_ci                raddrs_used |= (1ll << a->raddr_a);
789bf215546Sopenharmony_ci        if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
790bf215546Sopenharmony_ci                raddrs_used |= (1ll << a->raddr_b);
791bf215546Sopenharmony_ci        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
792bf215546Sopenharmony_ci                raddrs_used |= (1ll << b->raddr_a);
793bf215546Sopenharmony_ci        if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
794bf215546Sopenharmony_ci                raddrs_used |= (1ll << b->raddr_b);
795bf215546Sopenharmony_ci
796bf215546Sopenharmony_ci        return raddrs_used;
797bf215546Sopenharmony_ci}
798bf215546Sopenharmony_ci
799bf215546Sopenharmony_ci/* Take two instructions and attempt to merge their raddr fields
800bf215546Sopenharmony_ci * into one merged instruction. Returns false if the two instructions
801bf215546Sopenharmony_ci * access more than two different rf registers between them, or more
802bf215546Sopenharmony_ci * than one rf register and one small immediate.
803bf215546Sopenharmony_ci */
804bf215546Sopenharmony_cistatic bool
805bf215546Sopenharmony_ciqpu_merge_raddrs(struct v3d_qpu_instr *result,
806bf215546Sopenharmony_ci                 const struct v3d_qpu_instr *add_instr,
807bf215546Sopenharmony_ci                 const struct v3d_qpu_instr *mul_instr)
808bf215546Sopenharmony_ci{
809bf215546Sopenharmony_ci        uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
810bf215546Sopenharmony_ci        int naddrs = util_bitcount64(raddrs_used);
811bf215546Sopenharmony_ci
812bf215546Sopenharmony_ci        if (naddrs > 2)
813bf215546Sopenharmony_ci                return false;
814bf215546Sopenharmony_ci
815bf215546Sopenharmony_ci        if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) {
816bf215546Sopenharmony_ci                if (naddrs > 1)
817bf215546Sopenharmony_ci                        return false;
818bf215546Sopenharmony_ci
819bf215546Sopenharmony_ci                if (add_instr->sig.small_imm && mul_instr->sig.small_imm)
820bf215546Sopenharmony_ci                        if (add_instr->raddr_b != mul_instr->raddr_b)
821bf215546Sopenharmony_ci                                return false;
822bf215546Sopenharmony_ci
823bf215546Sopenharmony_ci                result->sig.small_imm = true;
824bf215546Sopenharmony_ci                result->raddr_b = add_instr->sig.small_imm ?
825bf215546Sopenharmony_ci                        add_instr->raddr_b : mul_instr->raddr_b;
826bf215546Sopenharmony_ci        }
827bf215546Sopenharmony_ci
828bf215546Sopenharmony_ci        if (naddrs == 0)
829bf215546Sopenharmony_ci                return true;
830bf215546Sopenharmony_ci
831bf215546Sopenharmony_ci        int raddr_a = ffsll(raddrs_used) - 1;
832bf215546Sopenharmony_ci        raddrs_used &= ~(1ll << raddr_a);
833bf215546Sopenharmony_ci        result->raddr_a = raddr_a;
834bf215546Sopenharmony_ci
835bf215546Sopenharmony_ci        if (!result->sig.small_imm) {
836bf215546Sopenharmony_ci                if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
837bf215546Sopenharmony_ci                    raddr_a == add_instr->raddr_b) {
838bf215546Sopenharmony_ci                        if (add_instr->alu.add.a == V3D_QPU_MUX_B)
839bf215546Sopenharmony_ci                                result->alu.add.a = V3D_QPU_MUX_A;
840bf215546Sopenharmony_ci                        if (add_instr->alu.add.b == V3D_QPU_MUX_B &&
841bf215546Sopenharmony_ci                            v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
842bf215546Sopenharmony_ci                                result->alu.add.b = V3D_QPU_MUX_A;
843bf215546Sopenharmony_ci                        }
844bf215546Sopenharmony_ci                }
845bf215546Sopenharmony_ci                if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
846bf215546Sopenharmony_ci                    raddr_a == mul_instr->raddr_b) {
847bf215546Sopenharmony_ci                        if (mul_instr->alu.mul.a == V3D_QPU_MUX_B)
848bf215546Sopenharmony_ci                                result->alu.mul.a = V3D_QPU_MUX_A;
849bf215546Sopenharmony_ci                        if (mul_instr->alu.mul.b == V3D_QPU_MUX_B &&
850bf215546Sopenharmony_ci                            v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
851bf215546Sopenharmony_ci                                result->alu.mul.b = V3D_QPU_MUX_A;
852bf215546Sopenharmony_ci                        }
853bf215546Sopenharmony_ci                }
854bf215546Sopenharmony_ci        }
855bf215546Sopenharmony_ci        if (!raddrs_used)
856bf215546Sopenharmony_ci                return true;
857bf215546Sopenharmony_ci
858bf215546Sopenharmony_ci        int raddr_b = ffsll(raddrs_used) - 1;
859bf215546Sopenharmony_ci        result->raddr_b = raddr_b;
860bf215546Sopenharmony_ci        if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
861bf215546Sopenharmony_ci            raddr_b == add_instr->raddr_a) {
862bf215546Sopenharmony_ci                if (add_instr->alu.add.a == V3D_QPU_MUX_A)
863bf215546Sopenharmony_ci                        result->alu.add.a = V3D_QPU_MUX_B;
864bf215546Sopenharmony_ci                if (add_instr->alu.add.b == V3D_QPU_MUX_A &&
865bf215546Sopenharmony_ci                    v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
866bf215546Sopenharmony_ci                        result->alu.add.b = V3D_QPU_MUX_B;
867bf215546Sopenharmony_ci                }
868bf215546Sopenharmony_ci        }
869bf215546Sopenharmony_ci        if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
870bf215546Sopenharmony_ci            raddr_b == mul_instr->raddr_a) {
871bf215546Sopenharmony_ci                if (mul_instr->alu.mul.a == V3D_QPU_MUX_A)
872bf215546Sopenharmony_ci                        result->alu.mul.a = V3D_QPU_MUX_B;
873bf215546Sopenharmony_ci                if (mul_instr->alu.mul.b == V3D_QPU_MUX_A &&
874bf215546Sopenharmony_ci                    v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
875bf215546Sopenharmony_ci                        result->alu.mul.b = V3D_QPU_MUX_B;
876bf215546Sopenharmony_ci                }
877bf215546Sopenharmony_ci        }
878bf215546Sopenharmony_ci
879bf215546Sopenharmony_ci        return true;
880bf215546Sopenharmony_ci}
881bf215546Sopenharmony_ci
882bf215546Sopenharmony_cistatic bool
883bf215546Sopenharmony_cican_do_add_as_mul(enum v3d_qpu_add_op op)
884bf215546Sopenharmony_ci{
885bf215546Sopenharmony_ci        switch (op) {
886bf215546Sopenharmony_ci        case V3D_QPU_A_ADD:
887bf215546Sopenharmony_ci        case V3D_QPU_A_SUB:
888bf215546Sopenharmony_ci                return true;
889bf215546Sopenharmony_ci        default:
890bf215546Sopenharmony_ci                return false;
891bf215546Sopenharmony_ci        }
892bf215546Sopenharmony_ci}
893bf215546Sopenharmony_ci
894bf215546Sopenharmony_cistatic enum v3d_qpu_mul_op
895bf215546Sopenharmony_ciadd_op_as_mul_op(enum v3d_qpu_add_op op)
896bf215546Sopenharmony_ci{
897bf215546Sopenharmony_ci        switch (op) {
898bf215546Sopenharmony_ci        case V3D_QPU_A_ADD:
899bf215546Sopenharmony_ci                return V3D_QPU_M_ADD;
900bf215546Sopenharmony_ci        case V3D_QPU_A_SUB:
901bf215546Sopenharmony_ci                return V3D_QPU_M_SUB;
902bf215546Sopenharmony_ci        default:
903bf215546Sopenharmony_ci                unreachable("unexpected add opcode");
904bf215546Sopenharmony_ci        }
905bf215546Sopenharmony_ci}
906bf215546Sopenharmony_ci
907bf215546Sopenharmony_cistatic void
908bf215546Sopenharmony_ciqpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
909bf215546Sopenharmony_ci{
910bf215546Sopenharmony_ci        STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
911bf215546Sopenharmony_ci        assert(inst->alu.add.op != V3D_QPU_A_NOP);
912bf215546Sopenharmony_ci        assert(inst->alu.mul.op == V3D_QPU_M_NOP);
913bf215546Sopenharmony_ci
914bf215546Sopenharmony_ci        memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul));
915bf215546Sopenharmony_ci        inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op);
916bf215546Sopenharmony_ci        inst->alu.add.op = V3D_QPU_A_NOP;
917bf215546Sopenharmony_ci
918bf215546Sopenharmony_ci        inst->flags.mc = inst->flags.ac;
919bf215546Sopenharmony_ci        inst->flags.mpf = inst->flags.apf;
920bf215546Sopenharmony_ci        inst->flags.muf = inst->flags.auf;
921bf215546Sopenharmony_ci        inst->flags.ac = V3D_QPU_COND_NONE;
922bf215546Sopenharmony_ci        inst->flags.apf = V3D_QPU_PF_NONE;
923bf215546Sopenharmony_ci        inst->flags.auf = V3D_QPU_UF_NONE;
924bf215546Sopenharmony_ci
925bf215546Sopenharmony_ci        inst->alu.mul.output_pack = inst->alu.add.output_pack;
926bf215546Sopenharmony_ci        inst->alu.mul.a_unpack = inst->alu.add.a_unpack;
927bf215546Sopenharmony_ci        inst->alu.mul.b_unpack = inst->alu.add.b_unpack;
928bf215546Sopenharmony_ci        inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
929bf215546Sopenharmony_ci        inst->alu.add.a_unpack = V3D_QPU_UNPACK_NONE;
930bf215546Sopenharmony_ci        inst->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
931bf215546Sopenharmony_ci}
932bf215546Sopenharmony_ci
933bf215546Sopenharmony_cistatic bool
934bf215546Sopenharmony_ciqpu_merge_inst(const struct v3d_device_info *devinfo,
935bf215546Sopenharmony_ci               struct v3d_qpu_instr *result,
936bf215546Sopenharmony_ci               const struct v3d_qpu_instr *a,
937bf215546Sopenharmony_ci               const struct v3d_qpu_instr *b)
938bf215546Sopenharmony_ci{
939bf215546Sopenharmony_ci        if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
940bf215546Sopenharmony_ci            b->type != V3D_QPU_INSTR_TYPE_ALU) {
941bf215546Sopenharmony_ci                return false;
942bf215546Sopenharmony_ci        }
943bf215546Sopenharmony_ci
944bf215546Sopenharmony_ci        if (!qpu_compatible_peripheral_access(devinfo, a, b))
945bf215546Sopenharmony_ci                return false;
946bf215546Sopenharmony_ci
947bf215546Sopenharmony_ci        struct v3d_qpu_instr merge = *a;
948bf215546Sopenharmony_ci        const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL;
949bf215546Sopenharmony_ci
950bf215546Sopenharmony_ci        struct v3d_qpu_instr mul_inst;
951bf215546Sopenharmony_ci        if (b->alu.add.op != V3D_QPU_A_NOP) {
952bf215546Sopenharmony_ci                if (a->alu.add.op == V3D_QPU_A_NOP) {
953bf215546Sopenharmony_ci                        merge.alu.add = b->alu.add;
954bf215546Sopenharmony_ci
955bf215546Sopenharmony_ci                        merge.flags.ac = b->flags.ac;
956bf215546Sopenharmony_ci                        merge.flags.apf = b->flags.apf;
957bf215546Sopenharmony_ci                        merge.flags.auf = b->flags.auf;
958bf215546Sopenharmony_ci
959bf215546Sopenharmony_ci                        add_instr = b;
960bf215546Sopenharmony_ci                        mul_instr = a;
961bf215546Sopenharmony_ci                }
962bf215546Sopenharmony_ci                /* If a's add op is used but its mul op is not, then see if we
963bf215546Sopenharmony_ci                 * can convert either a's add op or b's add op to a mul op
964bf215546Sopenharmony_ci                 * so we can merge.
965bf215546Sopenharmony_ci                 */
966bf215546Sopenharmony_ci                else if (a->alu.mul.op == V3D_QPU_M_NOP &&
967bf215546Sopenharmony_ci                         can_do_add_as_mul(b->alu.add.op)) {
968bf215546Sopenharmony_ci                        mul_inst = *b;
969bf215546Sopenharmony_ci                        qpu_convert_add_to_mul(&mul_inst);
970bf215546Sopenharmony_ci
971bf215546Sopenharmony_ci                        merge.alu.mul = mul_inst.alu.mul;
972bf215546Sopenharmony_ci
973bf215546Sopenharmony_ci                        merge.flags.mc = b->flags.ac;
974bf215546Sopenharmony_ci                        merge.flags.mpf = b->flags.apf;
975bf215546Sopenharmony_ci                        merge.flags.muf = b->flags.auf;
976bf215546Sopenharmony_ci
977bf215546Sopenharmony_ci                        add_instr = a;
978bf215546Sopenharmony_ci                        mul_instr = &mul_inst;
979bf215546Sopenharmony_ci                } else if (a->alu.mul.op == V3D_QPU_M_NOP &&
980bf215546Sopenharmony_ci                           can_do_add_as_mul(a->alu.add.op)) {
981bf215546Sopenharmony_ci                        mul_inst = *a;
982bf215546Sopenharmony_ci                        qpu_convert_add_to_mul(&mul_inst);
983bf215546Sopenharmony_ci
984bf215546Sopenharmony_ci                        merge = mul_inst;
985bf215546Sopenharmony_ci                        merge.alu.add = b->alu.add;
986bf215546Sopenharmony_ci
987bf215546Sopenharmony_ci                        merge.flags.ac = b->flags.ac;
988bf215546Sopenharmony_ci                        merge.flags.apf = b->flags.apf;
989bf215546Sopenharmony_ci                        merge.flags.auf = b->flags.auf;
990bf215546Sopenharmony_ci
991bf215546Sopenharmony_ci                        add_instr = b;
992bf215546Sopenharmony_ci                        mul_instr = &mul_inst;
993bf215546Sopenharmony_ci                } else {
994bf215546Sopenharmony_ci                        return false;
995bf215546Sopenharmony_ci                }
996bf215546Sopenharmony_ci        }
997bf215546Sopenharmony_ci
998bf215546Sopenharmony_ci        if (b->alu.mul.op != V3D_QPU_M_NOP) {
999bf215546Sopenharmony_ci                if (a->alu.mul.op != V3D_QPU_M_NOP)
1000bf215546Sopenharmony_ci                        return false;
1001bf215546Sopenharmony_ci                merge.alu.mul = b->alu.mul;
1002bf215546Sopenharmony_ci
1003bf215546Sopenharmony_ci                merge.flags.mc = b->flags.mc;
1004bf215546Sopenharmony_ci                merge.flags.mpf = b->flags.mpf;
1005bf215546Sopenharmony_ci                merge.flags.muf = b->flags.muf;
1006bf215546Sopenharmony_ci
1007bf215546Sopenharmony_ci                mul_instr = b;
1008bf215546Sopenharmony_ci                add_instr = a;
1009bf215546Sopenharmony_ci        }
1010bf215546Sopenharmony_ci
1011bf215546Sopenharmony_ci        if (add_instr && mul_instr &&
1012bf215546Sopenharmony_ci            !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
1013bf215546Sopenharmony_ci                        return false;
1014bf215546Sopenharmony_ci        }
1015bf215546Sopenharmony_ci
1016bf215546Sopenharmony_ci        merge.sig.thrsw |= b->sig.thrsw;
1017bf215546Sopenharmony_ci        merge.sig.ldunif |= b->sig.ldunif;
1018bf215546Sopenharmony_ci        merge.sig.ldunifrf |= b->sig.ldunifrf;
1019bf215546Sopenharmony_ci        merge.sig.ldunifa |= b->sig.ldunifa;
1020bf215546Sopenharmony_ci        merge.sig.ldunifarf |= b->sig.ldunifarf;
1021bf215546Sopenharmony_ci        merge.sig.ldtmu |= b->sig.ldtmu;
1022bf215546Sopenharmony_ci        merge.sig.ldvary |= b->sig.ldvary;
1023bf215546Sopenharmony_ci        merge.sig.ldvpm |= b->sig.ldvpm;
1024bf215546Sopenharmony_ci        merge.sig.small_imm |= b->sig.small_imm;
1025bf215546Sopenharmony_ci        merge.sig.ldtlb |= b->sig.ldtlb;
1026bf215546Sopenharmony_ci        merge.sig.ldtlbu |= b->sig.ldtlbu;
1027bf215546Sopenharmony_ci        merge.sig.ucb |= b->sig.ucb;
1028bf215546Sopenharmony_ci        merge.sig.rotate |= b->sig.rotate;
1029bf215546Sopenharmony_ci        merge.sig.wrtmuc |= b->sig.wrtmuc;
1030bf215546Sopenharmony_ci
1031bf215546Sopenharmony_ci        if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
1032bf215546Sopenharmony_ci            v3d_qpu_sig_writes_address(devinfo, &b->sig))
1033bf215546Sopenharmony_ci                return false;
1034bf215546Sopenharmony_ci        merge.sig_addr |= b->sig_addr;
1035bf215546Sopenharmony_ci        merge.sig_magic |= b->sig_magic;
1036bf215546Sopenharmony_ci
1037bf215546Sopenharmony_ci        uint64_t packed;
1038bf215546Sopenharmony_ci        bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);
1039bf215546Sopenharmony_ci
1040bf215546Sopenharmony_ci        *result = merge;
1041bf215546Sopenharmony_ci        /* No modifying the real instructions on failure. */
1042bf215546Sopenharmony_ci        assert(ok || (a != result && b != result));
1043bf215546Sopenharmony_ci
1044bf215546Sopenharmony_ci        return ok;
1045bf215546Sopenharmony_ci}
1046bf215546Sopenharmony_ci
1047bf215546Sopenharmony_cistatic inline bool
1048bf215546Sopenharmony_citry_skip_for_ldvary_pipelining(const struct v3d_qpu_instr *inst)
1049bf215546Sopenharmony_ci{
1050bf215546Sopenharmony_ci        return inst->sig.ldunif || inst->sig.ldunifrf;
1051bf215546Sopenharmony_ci}
1052bf215546Sopenharmony_ci
1053bf215546Sopenharmony_cistatic bool
1054bf215546Sopenharmony_ciqpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
1055bf215546Sopenharmony_ci                                         struct choose_scoreboard *scoreboard,
1056bf215546Sopenharmony_ci                                         const struct qinst *qinst);
1057bf215546Sopenharmony_ci
/**
 * Picks the next instruction to schedule from the current DAG heads.
 *
 * Every head in scoreboard->dag->heads is run through a series of hardware
 * and scheduling restrictions; heads that fail any of them are passed over.
 * Among the survivors the one with the highest priority wins, with ties
 * broken in favor of the larger delay value.  On a full tie the head seen
 * first is kept.
 *
 * @param c           Compile context (device info, shader stage, inputs).
 * @param scoreboard  Scheduling state for the current tick.
 * @param prev_inst   When non-NULL, an instruction already chosen for this
 *                    tick; the candidate must then be mergeable with it.
 *
 * @return The chosen node, or NULL when no head qualifies (always NULL when
 *         prev_inst carries a thrsw signal).
 *
 * Side effects: when the chosen instruction has the ldvary signal,
 * scoreboard->ldvary_count is incremented and, if we were pairing,
 * scoreboard->fixup_ldvary is set.
 */
static struct schedule_node *
choose_instruction_to_schedule(struct v3d_compile *c,
                               struct choose_scoreboard *scoreboard,
                               struct schedule_node *prev_inst)
{
        struct schedule_node *chosen = NULL;
        int chosen_prio = 0;

        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
         * will handle pairing it along with filling the delay slots.
         */
        if (prev_inst) {
                if (prev_inst->inst->qpu.sig.thrsw)
                        return NULL;
        }

        /* In fragment shaders that have scheduled fewer ldvary instructions
         * than c->num_inputs, the first pass over the heads skips uniform
         * loads (see try_skip_for_ldvary_pipelining()).
         */
        bool ldvary_pipelining = c->s->info.stage == MESA_SHADER_FRAGMENT &&
                                 scoreboard->ldvary_count < c->num_inputs;
        bool skipped_insts_for_ldvary_pipelining = false;
retry:
        list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
                            dag.link) {
                const struct v3d_qpu_instr *inst = &n->inst->qpu;

                /* Remember that we skipped something so we can retry without
                 * the pipelining preference if nothing else is found.
                 */
                if (ldvary_pipelining && try_skip_for_ldvary_pipelining(inst)) {
                        skipped_insts_for_ldvary_pipelining = true;
                        continue;
                }

                /* Don't choose the branch instruction until it's the last one
                 * left.  We'll move it up to fit its delay slots after we
                 * choose it.
                 */
                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
                    !list_is_singular(&scoreboard->dag->heads)) {
                        continue;
                }

                /* We need to have 3 delay slots between a write to unifa and
                 * a follow-up ldunifa.
                 */
                if ((inst->sig.ldunifa || inst->sig.ldunifarf) &&
                    scoreboard->tick - scoreboard->last_unifa_write_tick <= 3)
                        continue;

                /* "An instruction must not read from a location in physical
                 *  regfile A or B that was written to by the previous
                 *  instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, n->inst))
                        continue;

                if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
                        continue;

                /* "Before doing a TLB access a scoreboard wait must have been
                 *  done. This happens either on the first or last thread
                 *  switch, depending on a setting (scb_wait_on_first_thrsw) in
                 *  the shader state."
                 */
                if (pixel_scoreboard_too_soon(c, scoreboard, inst))
                        continue;

                /* ldunif and ldvary both write r5, but ldunif does so a tick
                 * sooner.  If the ldvary's r5 wasn't used, then ldunif might
                 * otherwise get scheduled so ldunif and ldvary try to update
                 * r5 in the same tick.
                 */
                if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                    scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
                        continue;
                }

                /* If we are in a thrsw delay slot check that this instruction
                 * is valid for that.
                 */
                if (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick &&
                    !qpu_inst_after_thrsw_valid_in_delay_slot(c, scoreboard,
                                                              n->inst)) {
                        continue;
                }

                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                        /* Don't try to put a branch in the delay slots of another
                         * branch or a unifa write.
                         */
                        if (scoreboard->last_branch_tick + 3 >= scoreboard->tick)
                                continue;
                        if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick)
                                continue;

                        /* No branch with cond != 0,2,3 and msfign != 0 after
                         * setmsf.
                         */
                        if (scoreboard->last_setmsf_tick == scoreboard->tick - 1 &&
                            inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
                            inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
                            inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
                            inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
                                continue;
                        }
                }

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst) {
                        /* Don't pair up a thread switch signal -- we'll
                         * handle pairing it when we pick it on its own.
                         */
                        if (inst->sig.thrsw)
                                continue;

                        /* Both instructions reference a uniform: can't pair. */
                        if (prev_inst->inst->uniform != -1 &&
                            n->inst->uniform != -1)
                                continue;

                        /* Simulator complains if we have two uniforms loaded in
                         * the same instruction, which could happen if we
                         * have a ldunif or sideband uniform and we pair that
                         * with ldunifa.
                         */
                        if (vir_has_uniform(prev_inst->inst) &&
                            (inst->sig.ldunifa || inst->sig.ldunifarf)) {
                                continue;
                        }

                        /* Same restriction with the roles swapped. */
                        if ((prev_inst->inst->qpu.sig.ldunifa ||
                             prev_inst->inst->qpu.sig.ldunifarf) &&
                            vir_has_uniform(n->inst)) {
                                continue;
                        }

                        /* Don't merge TLB instructions before we have acquired
                         * the scoreboard lock.
                         */
                        if (pixel_scoreboard_too_soon(c, scoreboard, inst))
                                continue;

                        /* When we successfully pair up an ldvary we then try
                         * to merge it into the previous instruction if
                         * possible to improve pipelining. Don't pick up the
                         * ldvary now if the follow-up fixup would place
                         * it in the delay slots of a thrsw, which is not
                         * allowed and would prevent the fixup from being
                         * successful.
                         */
                        if (inst->sig.ldvary &&
                            scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
                                continue;
                        }

                        /* Trial merge only: the merged result is discarded,
                         * we just need to know whether the pair can merge.
                         */
                        struct v3d_qpu_instr merged_inst;
                        if (!qpu_merge_inst(c->devinfo, &merged_inst,
                                            &prev_inst->inst->qpu, inst)) {
                                continue;
                        }
                }

                int prio = get_instruction_priority(c->devinfo, inst);

                if (mux_read_stalls(scoreboard, inst)) {
                        /* Don't merge an instruction that stalls */
                        if (prev_inst)
                                continue;
                        else {
                                /* Any instruction that doesn't stall will have
                                 * higher scheduling priority */
                                prio -= MAX_SCHEDULE_PRIORITY;
                                assert(prio < 0);
                        }
                }

                /* Found a valid instruction.  If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                /* Higher priority wins outright; lower priority loses. */
                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                /* Equal priority: prefer the node with the larger delay. */
                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        /* If we did not find any instruction to schedule but we discarded
         * some of them to prioritize ldvary pipelining, try again.
         */
        if (!chosen && !prev_inst && skipped_insts_for_ldvary_pipelining) {
                skipped_insts_for_ldvary_pipelining = false;
                ldvary_pipelining = false;
                goto retry;
        }

        if (chosen && chosen->inst->qpu.sig.ldvary) {
                scoreboard->ldvary_count++;
                /* If we are pairing an ldvary, flag it so we can fix it up for
                 * optimal pipelining of ldvary sequences.
                 */
                if (prev_inst)
                        scoreboard->fixup_ldvary = true;
        }

        return chosen;
}
1275bf215546Sopenharmony_ci
1276bf215546Sopenharmony_cistatic void
1277bf215546Sopenharmony_ciupdate_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
1278bf215546Sopenharmony_ci                                  enum v3d_qpu_waddr waddr,
1279bf215546Sopenharmony_ci                                  const struct v3d_device_info *devinfo)
1280bf215546Sopenharmony_ci{
1281bf215546Sopenharmony_ci        if (v3d_qpu_magic_waddr_is_sfu(waddr))
1282bf215546Sopenharmony_ci                scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
1283bf215546Sopenharmony_ci        else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA)
1284bf215546Sopenharmony_ci                scoreboard->last_unifa_write_tick = scoreboard->tick;
1285bf215546Sopenharmony_ci}
1286bf215546Sopenharmony_ci
1287bf215546Sopenharmony_cistatic void
1288bf215546Sopenharmony_ciupdate_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
1289bf215546Sopenharmony_ci                                      const struct v3d_qpu_instr *inst)
1290bf215546Sopenharmony_ci{
1291bf215546Sopenharmony_ci        if (v3d_qpu_instr_is_sfu(inst)) {
1292bf215546Sopenharmony_ci                scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr;
1293bf215546Sopenharmony_ci                scoreboard->last_stallable_sfu_tick = scoreboard->tick;
1294bf215546Sopenharmony_ci        }
1295bf215546Sopenharmony_ci}
1296bf215546Sopenharmony_ci
1297bf215546Sopenharmony_cistatic void
1298bf215546Sopenharmony_ciupdate_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
1299bf215546Sopenharmony_ci                             const struct v3d_qpu_instr *inst,
1300bf215546Sopenharmony_ci                             const struct v3d_device_info *devinfo)
1301bf215546Sopenharmony_ci{
1302bf215546Sopenharmony_ci        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
1303bf215546Sopenharmony_ci                return;
1304bf215546Sopenharmony_ci
1305bf215546Sopenharmony_ci        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
1306bf215546Sopenharmony_ci
1307bf215546Sopenharmony_ci        if (inst->alu.add.op != V3D_QPU_A_NOP)  {
1308bf215546Sopenharmony_ci                if (inst->alu.add.magic_write) {
1309bf215546Sopenharmony_ci                        update_scoreboard_for_magic_waddr(scoreboard,
1310bf215546Sopenharmony_ci                                                          inst->alu.add.waddr,
1311bf215546Sopenharmony_ci                                                          devinfo);
1312bf215546Sopenharmony_ci                } else {
1313bf215546Sopenharmony_ci                        update_scoreboard_for_sfu_stall_waddr(scoreboard,
1314bf215546Sopenharmony_ci                                                              inst);
1315bf215546Sopenharmony_ci                }
1316bf215546Sopenharmony_ci
1317bf215546Sopenharmony_ci                if (inst->alu.add.op == V3D_QPU_A_SETMSF)
1318bf215546Sopenharmony_ci                        scoreboard->last_setmsf_tick = scoreboard->tick;
1319bf215546Sopenharmony_ci        }
1320bf215546Sopenharmony_ci
1321bf215546Sopenharmony_ci        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
1322bf215546Sopenharmony_ci                if (inst->alu.mul.magic_write) {
1323bf215546Sopenharmony_ci                        update_scoreboard_for_magic_waddr(scoreboard,
1324bf215546Sopenharmony_ci                                                          inst->alu.mul.waddr,
1325bf215546Sopenharmony_ci                                                          devinfo);
1326bf215546Sopenharmony_ci                }
1327bf215546Sopenharmony_ci        }
1328bf215546Sopenharmony_ci
1329bf215546Sopenharmony_ci        if (inst->sig.ldvary)
1330bf215546Sopenharmony_ci                scoreboard->last_ldvary_tick = scoreboard->tick;
1331bf215546Sopenharmony_ci}
1332bf215546Sopenharmony_ci
1333bf215546Sopenharmony_cistatic void
1334bf215546Sopenharmony_cidump_state(const struct v3d_device_info *devinfo, struct dag *dag)
1335bf215546Sopenharmony_ci{
1336bf215546Sopenharmony_ci        list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
1337bf215546Sopenharmony_ci                fprintf(stderr, "         t=%4d: ", n->unblocked_time);
1338bf215546Sopenharmony_ci                v3d_qpu_dump(devinfo, &n->inst->qpu);
1339bf215546Sopenharmony_ci                fprintf(stderr, "\n");
1340bf215546Sopenharmony_ci
1341bf215546Sopenharmony_ci                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1342bf215546Sopenharmony_ci                        struct schedule_node *child =
1343bf215546Sopenharmony_ci                                (struct schedule_node *)edge->child;
1344bf215546Sopenharmony_ci                        if (!child)
1345bf215546Sopenharmony_ci                                continue;
1346bf215546Sopenharmony_ci
1347bf215546Sopenharmony_ci                        fprintf(stderr, "                 - ");
1348bf215546Sopenharmony_ci                        v3d_qpu_dump(devinfo, &child->inst->qpu);
1349bf215546Sopenharmony_ci                        fprintf(stderr, " (%d parents, %c)\n",
1350bf215546Sopenharmony_ci                                child->dag.parent_count,
1351bf215546Sopenharmony_ci                                edge->data ? 'w' : 'r');
1352bf215546Sopenharmony_ci                }
1353bf215546Sopenharmony_ci        }
1354bf215546Sopenharmony_ci}
1355bf215546Sopenharmony_ci
1356bf215546Sopenharmony_cistatic uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo,
1357bf215546Sopenharmony_ci                                    enum v3d_qpu_waddr waddr,
1358bf215546Sopenharmony_ci                                    const struct v3d_qpu_instr *after)
1359bf215546Sopenharmony_ci{
1360bf215546Sopenharmony_ci        /* Apply some huge latency between texture fetch requests and getting
1361bf215546Sopenharmony_ci         * their results back.
1362bf215546Sopenharmony_ci         *
1363bf215546Sopenharmony_ci         * FIXME: This is actually pretty bogus.  If we do:
1364bf215546Sopenharmony_ci         *
1365bf215546Sopenharmony_ci         * mov tmu0_s, a
1366bf215546Sopenharmony_ci         * <a bit of math>
1367bf215546Sopenharmony_ci         * mov tmu0_s, b
1368bf215546Sopenharmony_ci         * load_tmu0
1369bf215546Sopenharmony_ci         * <more math>
1370bf215546Sopenharmony_ci         * load_tmu0
1371bf215546Sopenharmony_ci         *
1372bf215546Sopenharmony_ci         * we count that as worse than
1373bf215546Sopenharmony_ci         *
1374bf215546Sopenharmony_ci         * mov tmu0_s, a
1375bf215546Sopenharmony_ci         * mov tmu0_s, b
1376bf215546Sopenharmony_ci         * <lots of math>
1377bf215546Sopenharmony_ci         * load_tmu0
1378bf215546Sopenharmony_ci         * <more math>
1379bf215546Sopenharmony_ci         * load_tmu0
1380bf215546Sopenharmony_ci         *
1381bf215546Sopenharmony_ci         * because we associate the first load_tmu0 with the *second* tmu0_s.
1382bf215546Sopenharmony_ci         */
1383bf215546Sopenharmony_ci        if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) &&
1384bf215546Sopenharmony_ci            v3d_qpu_waits_on_tmu(after)) {
1385bf215546Sopenharmony_ci                return 100;
1386bf215546Sopenharmony_ci        }
1387bf215546Sopenharmony_ci
1388bf215546Sopenharmony_ci        /* Assume that anything depending on us is consuming the SFU result. */
1389bf215546Sopenharmony_ci        if (v3d_qpu_magic_waddr_is_sfu(waddr))
1390bf215546Sopenharmony_ci                return 3;
1391bf215546Sopenharmony_ci
1392bf215546Sopenharmony_ci        return 1;
1393bf215546Sopenharmony_ci}
1394bf215546Sopenharmony_ci
1395bf215546Sopenharmony_cistatic uint32_t
1396bf215546Sopenharmony_ciinstruction_latency(const struct v3d_device_info *devinfo,
1397bf215546Sopenharmony_ci                    struct schedule_node *before, struct schedule_node *after)
1398bf215546Sopenharmony_ci{
1399bf215546Sopenharmony_ci        const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
1400bf215546Sopenharmony_ci        const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
1401bf215546Sopenharmony_ci        uint32_t latency = 1;
1402bf215546Sopenharmony_ci
1403bf215546Sopenharmony_ci        if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
1404bf215546Sopenharmony_ci            after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
1405bf215546Sopenharmony_ci                return latency;
1406bf215546Sopenharmony_ci
1407bf215546Sopenharmony_ci        if (before_inst->alu.add.magic_write) {
1408bf215546Sopenharmony_ci                latency = MAX2(latency,
1409bf215546Sopenharmony_ci                               magic_waddr_latency(devinfo,
1410bf215546Sopenharmony_ci                                                   before_inst->alu.add.waddr,
1411bf215546Sopenharmony_ci                                                   after_inst));
1412bf215546Sopenharmony_ci        }
1413bf215546Sopenharmony_ci
1414bf215546Sopenharmony_ci        if (before_inst->alu.mul.magic_write) {
1415bf215546Sopenharmony_ci                latency = MAX2(latency,
1416bf215546Sopenharmony_ci                               magic_waddr_latency(devinfo,
1417bf215546Sopenharmony_ci                                                   before_inst->alu.mul.waddr,
1418bf215546Sopenharmony_ci                                                   after_inst));
1419bf215546Sopenharmony_ci        }
1420bf215546Sopenharmony_ci
1421bf215546Sopenharmony_ci        if (v3d_qpu_instr_is_sfu(before_inst))
1422bf215546Sopenharmony_ci                return 2;
1423bf215546Sopenharmony_ci
1424bf215546Sopenharmony_ci        return latency;
1425bf215546Sopenharmony_ci}
1426bf215546Sopenharmony_ci
1427bf215546Sopenharmony_ci/** Recursive computation of the delay member of a node. */
1428bf215546Sopenharmony_cistatic void
1429bf215546Sopenharmony_cicompute_delay(struct dag_node *node, void *state)
1430bf215546Sopenharmony_ci{
1431bf215546Sopenharmony_ci        struct schedule_node *n = (struct schedule_node *)node;
1432bf215546Sopenharmony_ci        struct v3d_compile *c = (struct v3d_compile *) state;
1433bf215546Sopenharmony_ci
1434bf215546Sopenharmony_ci        n->delay = 1;
1435bf215546Sopenharmony_ci
1436bf215546Sopenharmony_ci        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1437bf215546Sopenharmony_ci                struct schedule_node *child =
1438bf215546Sopenharmony_ci                        (struct schedule_node *)edge->child;
1439bf215546Sopenharmony_ci
1440bf215546Sopenharmony_ci                n->delay = MAX2(n->delay, (child->delay +
1441bf215546Sopenharmony_ci                                           instruction_latency(c->devinfo, n,
1442bf215546Sopenharmony_ci                                                               child)));
1443bf215546Sopenharmony_ci        }
1444bf215546Sopenharmony_ci}
1445bf215546Sopenharmony_ci
1446bf215546Sopenharmony_ci/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head()
1447bf215546Sopenharmony_ci * should be called on it later to finish pruning the other edges).
1448bf215546Sopenharmony_ci */
1449bf215546Sopenharmony_cistatic void
1450bf215546Sopenharmony_cipre_remove_head(struct dag *dag, struct schedule_node *n)
1451bf215546Sopenharmony_ci{
1452bf215546Sopenharmony_ci        list_delinit(&n->dag.link);
1453bf215546Sopenharmony_ci
1454bf215546Sopenharmony_ci        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1455bf215546Sopenharmony_ci                if (edge->data)
1456bf215546Sopenharmony_ci                        dag_remove_edge(dag, edge);
1457bf215546Sopenharmony_ci        }
1458bf215546Sopenharmony_ci}
1459bf215546Sopenharmony_ci
1460bf215546Sopenharmony_cistatic void
1461bf215546Sopenharmony_cimark_instruction_scheduled(const struct v3d_device_info *devinfo,
1462bf215546Sopenharmony_ci                           struct dag *dag,
1463bf215546Sopenharmony_ci                           uint32_t time,
1464bf215546Sopenharmony_ci                           struct schedule_node *node)
1465bf215546Sopenharmony_ci{
1466bf215546Sopenharmony_ci        if (!node)
1467bf215546Sopenharmony_ci                return;
1468bf215546Sopenharmony_ci
1469bf215546Sopenharmony_ci        util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
1470bf215546Sopenharmony_ci                struct schedule_node *child =
1471bf215546Sopenharmony_ci                        (struct schedule_node *)edge->child;
1472bf215546Sopenharmony_ci
1473bf215546Sopenharmony_ci                if (!child)
1474bf215546Sopenharmony_ci                        continue;
1475bf215546Sopenharmony_ci
1476bf215546Sopenharmony_ci                uint32_t latency = instruction_latency(devinfo, node, child);
1477bf215546Sopenharmony_ci
1478bf215546Sopenharmony_ci                child->unblocked_time = MAX2(child->unblocked_time,
1479bf215546Sopenharmony_ci                                             time + latency);
1480bf215546Sopenharmony_ci        }
1481bf215546Sopenharmony_ci        dag_prune_head(dag, &node->dag);
1482bf215546Sopenharmony_ci}
1483bf215546Sopenharmony_ci
1484bf215546Sopenharmony_cistatic void
1485bf215546Sopenharmony_ciinsert_scheduled_instruction(struct v3d_compile *c,
1486bf215546Sopenharmony_ci                             struct qblock *block,
1487bf215546Sopenharmony_ci                             struct choose_scoreboard *scoreboard,
1488bf215546Sopenharmony_ci                             struct qinst *inst)
1489bf215546Sopenharmony_ci{
1490bf215546Sopenharmony_ci        list_addtail(&inst->link, &block->instructions);
1491bf215546Sopenharmony_ci
1492bf215546Sopenharmony_ci        update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo);
1493bf215546Sopenharmony_ci        c->qpu_inst_count++;
1494bf215546Sopenharmony_ci        scoreboard->tick++;
1495bf215546Sopenharmony_ci}
1496bf215546Sopenharmony_ci
1497bf215546Sopenharmony_cistatic struct qinst *
1498bf215546Sopenharmony_civir_nop()
1499bf215546Sopenharmony_ci{
1500bf215546Sopenharmony_ci        struct qreg undef = vir_nop_reg();
1501bf215546Sopenharmony_ci        struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
1502bf215546Sopenharmony_ci
1503bf215546Sopenharmony_ci        return qinst;
1504bf215546Sopenharmony_ci}
1505bf215546Sopenharmony_ci
/**
 * Schedules a freshly created NOP at the end of @p block, advancing the
 * scoreboard the same way as for any other scheduled instruction.
 */
static void
emit_nop(struct v3d_compile *c, struct qblock *block,
         struct choose_scoreboard *scoreboard)
{
        struct qinst *nop = vir_nop();

        insert_scheduled_instruction(c, block, scoreboard, nop);
}
1512bf215546Sopenharmony_ci
1513bf215546Sopenharmony_cistatic bool
1514bf215546Sopenharmony_ciqpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
1515bf215546Sopenharmony_ci                              const struct qinst *qinst, int slot)
1516bf215546Sopenharmony_ci{
1517bf215546Sopenharmony_ci        const struct v3d_qpu_instr *inst = &qinst->qpu;
1518bf215546Sopenharmony_ci
1519bf215546Sopenharmony_ci        if (slot == 2 && qinst->is_tlb_z_write)
1520bf215546Sopenharmony_ci                return false;
1521bf215546Sopenharmony_ci
1522bf215546Sopenharmony_ci        if (slot > 0 && qinst->uniform != ~0)
1523bf215546Sopenharmony_ci                return false;
1524bf215546Sopenharmony_ci
1525bf215546Sopenharmony_ci        if (v3d_qpu_waits_vpm(inst))
1526bf215546Sopenharmony_ci                return false;
1527bf215546Sopenharmony_ci
1528bf215546Sopenharmony_ci        if (inst->sig.ldvary)
1529bf215546Sopenharmony_ci                return false;
1530bf215546Sopenharmony_ci
1531bf215546Sopenharmony_ci        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
1532bf215546Sopenharmony_ci                /* GFXH-1625: TMUWT not allowed in the final instruction. */
1533bf215546Sopenharmony_ci                if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
1534bf215546Sopenharmony_ci                        return false;
1535bf215546Sopenharmony_ci
1536bf215546Sopenharmony_ci                /* No writing physical registers at the end. */
1537bf215546Sopenharmony_ci                if (!inst->alu.add.magic_write ||
1538bf215546Sopenharmony_ci                    !inst->alu.mul.magic_write) {
1539bf215546Sopenharmony_ci                        return false;
1540bf215546Sopenharmony_ci                }
1541bf215546Sopenharmony_ci
1542bf215546Sopenharmony_ci                if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
1543bf215546Sopenharmony_ci                    !inst->sig_magic) {
1544bf215546Sopenharmony_ci                        return false;
1545bf215546Sopenharmony_ci                }
1546bf215546Sopenharmony_ci
1547bf215546Sopenharmony_ci                if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
1548bf215546Sopenharmony_ci                        return false;
1549bf215546Sopenharmony_ci
1550bf215546Sopenharmony_ci                /* RF0-2 might be overwritten during the delay slots by
1551bf215546Sopenharmony_ci                 * fragment shader setup.
1552bf215546Sopenharmony_ci                 */
1553bf215546Sopenharmony_ci                if (inst->raddr_a < 3 &&
1554bf215546Sopenharmony_ci                    (inst->alu.add.a == V3D_QPU_MUX_A ||
1555bf215546Sopenharmony_ci                     inst->alu.add.b == V3D_QPU_MUX_A ||
1556bf215546Sopenharmony_ci                     inst->alu.mul.a == V3D_QPU_MUX_A ||
1557bf215546Sopenharmony_ci                     inst->alu.mul.b == V3D_QPU_MUX_A)) {
1558bf215546Sopenharmony_ci                        return false;
1559bf215546Sopenharmony_ci                }
1560bf215546Sopenharmony_ci
1561bf215546Sopenharmony_ci                if (inst->raddr_b < 3 &&
1562bf215546Sopenharmony_ci                    !inst->sig.small_imm &&
1563bf215546Sopenharmony_ci                    (inst->alu.add.a == V3D_QPU_MUX_B ||
1564bf215546Sopenharmony_ci                     inst->alu.add.b == V3D_QPU_MUX_B ||
1565bf215546Sopenharmony_ci                     inst->alu.mul.a == V3D_QPU_MUX_B ||
1566bf215546Sopenharmony_ci                     inst->alu.mul.b == V3D_QPU_MUX_B)) {
1567bf215546Sopenharmony_ci                        return false;
1568bf215546Sopenharmony_ci                }
1569bf215546Sopenharmony_ci        }
1570bf215546Sopenharmony_ci
1571bf215546Sopenharmony_ci        return true;
1572bf215546Sopenharmony_ci}
1573bf215546Sopenharmony_ci
1574bf215546Sopenharmony_ci/**
1575bf215546Sopenharmony_ci * This is called when trying to merge a thrsw back into the instruction stream
1576bf215546Sopenharmony_ci * of instructions that were scheduled *before* the thrsw signal to fill its
1577bf215546Sopenharmony_ci * delay slots. Because the actual execution of the thrsw happens after the
1578bf215546Sopenharmony_ci * delay slots, it is usually safe to do this, but there are some cases that
1579bf215546Sopenharmony_ci * need special care.
1580bf215546Sopenharmony_ci */
1581bf215546Sopenharmony_cistatic bool
1582bf215546Sopenharmony_ciqpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
1583bf215546Sopenharmony_ci                                          const struct qinst *qinst,
1584bf215546Sopenharmony_ci                                          uint32_t slot)
1585bf215546Sopenharmony_ci{
1586bf215546Sopenharmony_ci        /* No scheduling SFU when the result would land in the other
1587bf215546Sopenharmony_ci         * thread.  The simulator complains for safety, though it
1588bf215546Sopenharmony_ci         * would only occur for dead code in our case.
1589bf215546Sopenharmony_ci         */
1590bf215546Sopenharmony_ci        if (slot > 0 &&
1591bf215546Sopenharmony_ci            qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
1592bf215546Sopenharmony_ci            (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
1593bf215546Sopenharmony_ci             v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
1594bf215546Sopenharmony_ci                return false;
1595bf215546Sopenharmony_ci        }
1596bf215546Sopenharmony_ci
1597bf215546Sopenharmony_ci        if (slot > 0 && qinst->qpu.sig.ldvary)
1598bf215546Sopenharmony_ci                return false;
1599bf215546Sopenharmony_ci
1600bf215546Sopenharmony_ci        /* unifa and the following 3 instructions can't overlap a
1601bf215546Sopenharmony_ci         * thread switch/end. The docs further clarify that this means
1602bf215546Sopenharmony_ci         * the cycle at which the actual thread switch/end happens
1603bf215546Sopenharmony_ci         * and not when the thrsw instruction is processed, which would
1604bf215546Sopenharmony_ci         * be after the 2 delay slots following the thrsw instruction.
1605bf215546Sopenharmony_ci         * This means that we can move up a thrsw up to the instruction
1606bf215546Sopenharmony_ci         * right after unifa:
1607bf215546Sopenharmony_ci         *
1608bf215546Sopenharmony_ci         * unifa, r5
1609bf215546Sopenharmony_ci         * thrsw
1610bf215546Sopenharmony_ci         * delay slot 1
1611bf215546Sopenharmony_ci         * delay slot 2
1612bf215546Sopenharmony_ci         * Thread switch happens here, 4 instructions away from unifa
1613bf215546Sopenharmony_ci         */
1614bf215546Sopenharmony_ci        if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
1615bf215546Sopenharmony_ci                return false;
1616bf215546Sopenharmony_ci
1617bf215546Sopenharmony_ci        return true;
1618bf215546Sopenharmony_ci}
1619bf215546Sopenharmony_ci
1620bf215546Sopenharmony_ci/**
1621bf215546Sopenharmony_ci * This is called for instructions scheduled *after* a thrsw signal that may
1622bf215546Sopenharmony_ci * land in the delay slots of the thrsw. Because these instructions were
1623bf215546Sopenharmony_ci * scheduled after the thrsw, we need to be careful when placing them into
1624bf215546Sopenharmony_ci * the delay slots, since that means that we are moving them ahead of the
1625bf215546Sopenharmony_ci * thread switch and we need to ensure that is not a problem.
1626bf215546Sopenharmony_ci */
1627bf215546Sopenharmony_cistatic bool
1628bf215546Sopenharmony_ciqpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
1629bf215546Sopenharmony_ci                                         struct choose_scoreboard *scoreboard,
1630bf215546Sopenharmony_ci                                         const struct qinst *qinst)
1631bf215546Sopenharmony_ci{
1632bf215546Sopenharmony_ci        const uint32_t slot = scoreboard->tick - scoreboard->last_thrsw_tick;
1633bf215546Sopenharmony_ci        assert(slot <= 2);
1634bf215546Sopenharmony_ci
1635bf215546Sopenharmony_ci        /* We merge thrsw instructions back into the instruction stream
1636bf215546Sopenharmony_ci         * manually, so any instructions scheduled after a thrsw shold be
1637bf215546Sopenharmony_ci         * in the actual delay slots and not in the same slot as the thrsw.
1638bf215546Sopenharmony_ci         */
1639bf215546Sopenharmony_ci        assert(slot >= 1);
1640bf215546Sopenharmony_ci
1641bf215546Sopenharmony_ci        /* No emitting a thrsw while the previous thrsw hasn't happened yet. */
1642bf215546Sopenharmony_ci        if (qinst->qpu.sig.thrsw)
1643bf215546Sopenharmony_ci                return false;
1644bf215546Sopenharmony_ci
1645bf215546Sopenharmony_ci        /* The restrictions for instructions scheduled before the the thrsw
1646bf215546Sopenharmony_ci         * also apply to instructions scheduled after the thrsw that we want
1647bf215546Sopenharmony_ci         * to place in its delay slots.
1648bf215546Sopenharmony_ci         */
1649bf215546Sopenharmony_ci        if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
1650bf215546Sopenharmony_ci                return false;
1651bf215546Sopenharmony_ci
1652bf215546Sopenharmony_ci        /* TLB access is disallowed until scoreboard wait is executed, which
1653bf215546Sopenharmony_ci         * we do on the last thread switch.
1654bf215546Sopenharmony_ci         */
1655bf215546Sopenharmony_ci        if (qpu_inst_is_tlb(&qinst->qpu))
1656bf215546Sopenharmony_ci                return false;
1657bf215546Sopenharmony_ci
1658bf215546Sopenharmony_ci        /* Instruction sequence restrictions: Branch is not allowed in delay
1659bf215546Sopenharmony_ci         * slots of a thrsw.
1660bf215546Sopenharmony_ci         */
1661bf215546Sopenharmony_ci        if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
1662bf215546Sopenharmony_ci                return false;
1663bf215546Sopenharmony_ci
1664bf215546Sopenharmony_ci        /* Miscellaneous restrictions: At the point of a thrsw we need to have
1665bf215546Sopenharmony_ci         * at least one outstanding lookup or TSY wait.
1666bf215546Sopenharmony_ci         *
1667bf215546Sopenharmony_ci         * So avoid placing TMU instructions scheduled after the thrsw into
1668bf215546Sopenharmony_ci         * its delay slots or we may be compromising the integrity of our TMU
1669bf215546Sopenharmony_ci         * sequences. Also, notice that if we moved these instructions into
1670bf215546Sopenharmony_ci         * the delay slots of a previous thrsw we could overflow our TMU output
1671bf215546Sopenharmony_ci         * fifo, since we could be effectively pipelining a lookup scheduled
1672bf215546Sopenharmony_ci         * after the thrsw into the sequence before the thrsw.
1673bf215546Sopenharmony_ci         */
1674bf215546Sopenharmony_ci        if (v3d_qpu_writes_tmu(c->devinfo, &qinst->qpu) ||
1675bf215546Sopenharmony_ci            qinst->qpu.sig.wrtmuc) {
1676bf215546Sopenharmony_ci                return false;
1677bf215546Sopenharmony_ci        }
1678bf215546Sopenharmony_ci
1679bf215546Sopenharmony_ci        /* Don't move instructions that wait on the TMU before the thread switch
1680bf215546Sopenharmony_ci         * happens since that would make the current thread stall before the
1681bf215546Sopenharmony_ci         * switch, which is exactly what we want to avoid with the thrsw
1682bf215546Sopenharmony_ci         * instruction.
1683bf215546Sopenharmony_ci         */
1684bf215546Sopenharmony_ci        if (v3d_qpu_waits_on_tmu(&qinst->qpu))
1685bf215546Sopenharmony_ci                return false;
1686bf215546Sopenharmony_ci
1687bf215546Sopenharmony_ci        /* A thread switch invalidates all accumulators, so don't place any
1688bf215546Sopenharmony_ci         * instructions that write accumulators into the delay slots.
1689bf215546Sopenharmony_ci         */
1690bf215546Sopenharmony_ci        if (v3d_qpu_writes_accum(c->devinfo, &qinst->qpu))
1691bf215546Sopenharmony_ci                return false;
1692bf215546Sopenharmony_ci
1693bf215546Sopenharmony_ci        /* Multop has an implicit write to the rtop register which is an
1694bf215546Sopenharmony_ci         * specialized accumulator that is only used with this instruction.
1695bf215546Sopenharmony_ci         */
1696bf215546Sopenharmony_ci        if (qinst->qpu.alu.mul.op == V3D_QPU_M_MULTOP)
1697bf215546Sopenharmony_ci                return false;
1698bf215546Sopenharmony_ci
1699bf215546Sopenharmony_ci        /* Flags are invalidated across a thread switch, so dont' place
1700bf215546Sopenharmony_ci         * instructions that write flags into delay slots.
1701bf215546Sopenharmony_ci         */
1702bf215546Sopenharmony_ci        if (v3d_qpu_writes_flags(&qinst->qpu))
1703bf215546Sopenharmony_ci                return false;
1704bf215546Sopenharmony_ci
1705bf215546Sopenharmony_ci        /* TSY sync ops materialize at the point of the next thread switch,
1706bf215546Sopenharmony_ci         * therefore, if we have a TSY sync right after a thread switch, we
1707bf215546Sopenharmony_ci         * cannot place it in its delay slots, or we would be moving the sync
1708bf215546Sopenharmony_ci         * to the thrsw before it instead.
1709bf215546Sopenharmony_ci         */
1710bf215546Sopenharmony_ci        if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID)
1711bf215546Sopenharmony_ci                return false;
1712bf215546Sopenharmony_ci
1713bf215546Sopenharmony_ci        return true;
1714bf215546Sopenharmony_ci}
1715bf215546Sopenharmony_ci
1716bf215546Sopenharmony_cistatic bool
1717bf215546Sopenharmony_civalid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
1718bf215546Sopenharmony_ci                     struct qinst *qinst, int instructions_in_sequence,
1719bf215546Sopenharmony_ci                     bool is_thrend)
1720bf215546Sopenharmony_ci{
1721bf215546Sopenharmony_ci        /* No emitting our thrsw while the previous thrsw hasn't happened yet. */
1722bf215546Sopenharmony_ci        if (scoreboard->last_thrsw_tick + 3 >
1723bf215546Sopenharmony_ci            scoreboard->tick - instructions_in_sequence) {
1724bf215546Sopenharmony_ci                return false;
1725bf215546Sopenharmony_ci        }
1726bf215546Sopenharmony_ci
1727bf215546Sopenharmony_ci        for (int slot = 0; slot < instructions_in_sequence; slot++) {
1728bf215546Sopenharmony_ci                if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
1729bf215546Sopenharmony_ci                        return false;
1730bf215546Sopenharmony_ci
1731bf215546Sopenharmony_ci                if (is_thrend &&
1732bf215546Sopenharmony_ci                    !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
1733bf215546Sopenharmony_ci                        return false;
1734bf215546Sopenharmony_ci                }
1735bf215546Sopenharmony_ci
1736bf215546Sopenharmony_ci                /* Note that the list is circular, so we can only do this up
1737bf215546Sopenharmony_ci                 * to instructions_in_sequence.
1738bf215546Sopenharmony_ci                 */
1739bf215546Sopenharmony_ci                qinst = (struct qinst *)qinst->link.next;
1740bf215546Sopenharmony_ci        }
1741bf215546Sopenharmony_ci
1742bf215546Sopenharmony_ci        return true;
1743bf215546Sopenharmony_ci}
1744bf215546Sopenharmony_ci
1745bf215546Sopenharmony_ci/**
1746bf215546Sopenharmony_ci * Emits a THRSW signal in the stream, trying to move it up to pair with
1747bf215546Sopenharmony_ci * another instruction.
1748bf215546Sopenharmony_ci */
1749bf215546Sopenharmony_cistatic int
1750bf215546Sopenharmony_ciemit_thrsw(struct v3d_compile *c,
1751bf215546Sopenharmony_ci           struct qblock *block,
1752bf215546Sopenharmony_ci           struct choose_scoreboard *scoreboard,
1753bf215546Sopenharmony_ci           struct qinst *inst,
1754bf215546Sopenharmony_ci           bool is_thrend)
1755bf215546Sopenharmony_ci{
1756bf215546Sopenharmony_ci        int time = 0;
1757bf215546Sopenharmony_ci
1758bf215546Sopenharmony_ci        /* There should be nothing in a thrsw inst being scheduled other than
1759bf215546Sopenharmony_ci         * the signal bits.
1760bf215546Sopenharmony_ci         */
1761bf215546Sopenharmony_ci        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
1762bf215546Sopenharmony_ci        assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
1763bf215546Sopenharmony_ci        assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
1764bf215546Sopenharmony_ci
1765bf215546Sopenharmony_ci        /* Don't try to emit a thrsw in the delay slots of a previous thrsw
1766bf215546Sopenharmony_ci         * or branch.
1767bf215546Sopenharmony_ci         */
1768bf215546Sopenharmony_ci        while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
1769bf215546Sopenharmony_ci                emit_nop(c, block, scoreboard);
1770bf215546Sopenharmony_ci                time++;
1771bf215546Sopenharmony_ci        }
1772bf215546Sopenharmony_ci        while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) {
1773bf215546Sopenharmony_ci                emit_nop(c, block, scoreboard);
1774bf215546Sopenharmony_ci                time++;
1775bf215546Sopenharmony_ci        }
1776bf215546Sopenharmony_ci
1777bf215546Sopenharmony_ci        /* Find how far back into previous instructions we can put the THRSW. */
1778bf215546Sopenharmony_ci        int slots_filled = 0;
1779bf215546Sopenharmony_ci        int invalid_sig_count = 0;
1780bf215546Sopenharmony_ci        bool last_thrsw_after_invalid_ok = false;
1781bf215546Sopenharmony_ci        struct qinst *merge_inst = NULL;
1782bf215546Sopenharmony_ci        vir_for_each_inst_rev(prev_inst, block) {
1783bf215546Sopenharmony_ci                if (!valid_thrsw_sequence(c, scoreboard,
1784bf215546Sopenharmony_ci                                          prev_inst, slots_filled + 1,
1785bf215546Sopenharmony_ci                                          is_thrend)) {
1786bf215546Sopenharmony_ci                        break;
1787bf215546Sopenharmony_ci                }
1788bf215546Sopenharmony_ci
1789bf215546Sopenharmony_ci                struct v3d_qpu_sig sig = prev_inst->qpu.sig;
1790bf215546Sopenharmony_ci                sig.thrsw = true;
1791bf215546Sopenharmony_ci                uint32_t packed_sig;
1792bf215546Sopenharmony_ci                if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) {
1793bf215546Sopenharmony_ci                        /* If we can't merge the thrsw here because of signal
1794bf215546Sopenharmony_ci                         * incompatibility, keep going, we might be able to
1795bf215546Sopenharmony_ci                         * merge it in an earlier instruction.
1796bf215546Sopenharmony_ci                         */
1797bf215546Sopenharmony_ci                        invalid_sig_count++;
1798bf215546Sopenharmony_ci                        goto cont_block;
1799bf215546Sopenharmony_ci                }
1800bf215546Sopenharmony_ci
1801bf215546Sopenharmony_ci                /* For last thrsw we need 2 consecutive slots that are
1802bf215546Sopenharmony_ci                 * thrsw compatible, so if we have previously jumped over
1803bf215546Sopenharmony_ci                 * an incompatible signal, flag that we have found the first
1804bf215546Sopenharmony_ci                 * valid slot here and keep going.
1805bf215546Sopenharmony_ci                 */
1806bf215546Sopenharmony_ci                if (inst->is_last_thrsw && invalid_sig_count > 0 &&
1807bf215546Sopenharmony_ci                    !last_thrsw_after_invalid_ok) {
1808bf215546Sopenharmony_ci                        last_thrsw_after_invalid_ok = true;
1809bf215546Sopenharmony_ci                        invalid_sig_count++;
1810bf215546Sopenharmony_ci                        goto cont_block;
1811bf215546Sopenharmony_ci                }
1812bf215546Sopenharmony_ci
1813bf215546Sopenharmony_ci                last_thrsw_after_invalid_ok = false;
1814bf215546Sopenharmony_ci                invalid_sig_count = 0;
1815bf215546Sopenharmony_ci                merge_inst = prev_inst;
1816bf215546Sopenharmony_ci
1817bf215546Sopenharmony_cicont_block:
1818bf215546Sopenharmony_ci                if (++slots_filled == 3)
1819bf215546Sopenharmony_ci                        break;
1820bf215546Sopenharmony_ci        }
1821bf215546Sopenharmony_ci
1822bf215546Sopenharmony_ci        /* If we jumped over a signal incompatibility and did not manage to
1823bf215546Sopenharmony_ci         * merge the thrsw in the end, we need to adjust slots filled to match
1824bf215546Sopenharmony_ci         * the last valid merge point.
1825bf215546Sopenharmony_ci         */
1826bf215546Sopenharmony_ci        assert(invalid_sig_count == 0 || slots_filled >= invalid_sig_count);
1827bf215546Sopenharmony_ci        if (invalid_sig_count > 0)
1828bf215546Sopenharmony_ci                slots_filled -= invalid_sig_count;
1829bf215546Sopenharmony_ci
1830bf215546Sopenharmony_ci        bool needs_free = false;
1831bf215546Sopenharmony_ci        if (merge_inst) {
1832bf215546Sopenharmony_ci                merge_inst->qpu.sig.thrsw = true;
1833bf215546Sopenharmony_ci                needs_free = true;
1834bf215546Sopenharmony_ci                scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
1835bf215546Sopenharmony_ci        } else {
1836bf215546Sopenharmony_ci                scoreboard->last_thrsw_tick = scoreboard->tick;
1837bf215546Sopenharmony_ci                insert_scheduled_instruction(c, block, scoreboard, inst);
1838bf215546Sopenharmony_ci                time++;
1839bf215546Sopenharmony_ci                slots_filled++;
1840bf215546Sopenharmony_ci                merge_inst = inst;
1841bf215546Sopenharmony_ci        }
1842bf215546Sopenharmony_ci
1843bf215546Sopenharmony_ci        scoreboard->first_thrsw_emitted = true;
1844bf215546Sopenharmony_ci
1845bf215546Sopenharmony_ci        /* If we're emitting the last THRSW (other than program end), then
1846bf215546Sopenharmony_ci         * signal that to the HW by emitting two THRSWs in a row.
1847bf215546Sopenharmony_ci         */
1848bf215546Sopenharmony_ci        if (inst->is_last_thrsw) {
1849bf215546Sopenharmony_ci                if (slots_filled <= 1) {
1850bf215546Sopenharmony_ci                        emit_nop(c, block, scoreboard);
1851bf215546Sopenharmony_ci                        time++;
1852bf215546Sopenharmony_ci                }
1853bf215546Sopenharmony_ci                struct qinst *second_inst =
1854bf215546Sopenharmony_ci                        (struct qinst *)merge_inst->link.next;
1855bf215546Sopenharmony_ci                second_inst->qpu.sig.thrsw = true;
1856bf215546Sopenharmony_ci                scoreboard->last_thrsw_emitted = true;
1857bf215546Sopenharmony_ci        }
1858bf215546Sopenharmony_ci
1859bf215546Sopenharmony_ci        /* Make sure the thread end executes within the program lifespan */
1860bf215546Sopenharmony_ci        if (is_thrend) {
1861bf215546Sopenharmony_ci                for (int i = 0; i < 3 - slots_filled; i++) {
1862bf215546Sopenharmony_ci                        emit_nop(c, block, scoreboard);
1863bf215546Sopenharmony_ci                        time++;
1864bf215546Sopenharmony_ci                }
1865bf215546Sopenharmony_ci        }
1866bf215546Sopenharmony_ci
1867bf215546Sopenharmony_ci        /* If we put our THRSW into another instruction, free up the
1868bf215546Sopenharmony_ci         * instruction that didn't end up scheduled into the list.
1869bf215546Sopenharmony_ci         */
1870bf215546Sopenharmony_ci        if (needs_free)
1871bf215546Sopenharmony_ci                free(inst);
1872bf215546Sopenharmony_ci
1873bf215546Sopenharmony_ci        return time;
1874bf215546Sopenharmony_ci}
1875bf215546Sopenharmony_ci
1876bf215546Sopenharmony_cistatic bool
1877bf215546Sopenharmony_ciqpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst)
1878bf215546Sopenharmony_ci{
1879bf215546Sopenharmony_ci        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
1880bf215546Sopenharmony_ci                return false;
1881bf215546Sopenharmony_ci
1882bf215546Sopenharmony_ci        if (inst->qpu.sig.thrsw)
1883bf215546Sopenharmony_ci                return false;
1884bf215546Sopenharmony_ci
1885bf215546Sopenharmony_ci        if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu))
1886bf215546Sopenharmony_ci                return false;
1887bf215546Sopenharmony_ci
1888bf215546Sopenharmony_ci        if (vir_has_uniform(inst))
1889bf215546Sopenharmony_ci                return false;
1890bf215546Sopenharmony_ci
1891bf215546Sopenharmony_ci        return true;
1892bf215546Sopenharmony_ci}
1893bf215546Sopenharmony_ci
1894bf215546Sopenharmony_cistatic void
1895bf215546Sopenharmony_ciemit_branch(struct v3d_compile *c,
1896bf215546Sopenharmony_ci           struct qblock *block,
1897bf215546Sopenharmony_ci           struct choose_scoreboard *scoreboard,
1898bf215546Sopenharmony_ci           struct qinst *inst)
1899bf215546Sopenharmony_ci{
1900bf215546Sopenharmony_ci        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
1901bf215546Sopenharmony_ci
1902bf215546Sopenharmony_ci        /* We should've not picked up a branch for the delay slots of a previous
1903bf215546Sopenharmony_ci         * thrsw, branch or unifa write instruction.
1904bf215546Sopenharmony_ci         */
1905bf215546Sopenharmony_ci        int branch_tick = scoreboard->tick;
1906bf215546Sopenharmony_ci        assert(scoreboard->last_thrsw_tick + 2 < branch_tick);
1907bf215546Sopenharmony_ci        assert(scoreboard->last_branch_tick + 3 < branch_tick);
1908bf215546Sopenharmony_ci        assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
1909bf215546Sopenharmony_ci
1910bf215546Sopenharmony_ci        /* Can't place a branch with msfign != 0 and cond != 0,2,3 after
1911bf215546Sopenharmony_ci         * setmsf.
1912bf215546Sopenharmony_ci         */
1913bf215546Sopenharmony_ci        bool is_safe_msf_branch =
1914bf215546Sopenharmony_ci                inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
1915bf215546Sopenharmony_ci                inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
1916bf215546Sopenharmony_ci                inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
1917bf215546Sopenharmony_ci                inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_NA0;
1918bf215546Sopenharmony_ci        assert(scoreboard->last_setmsf_tick != branch_tick - 1 ||
1919bf215546Sopenharmony_ci               is_safe_msf_branch);
1920bf215546Sopenharmony_ci
1921bf215546Sopenharmony_ci        /* Insert the branch instruction */
1922bf215546Sopenharmony_ci        insert_scheduled_instruction(c, block, scoreboard, inst);
1923bf215546Sopenharmony_ci
1924bf215546Sopenharmony_ci        /* Now see if we can move the branch instruction back into the
1925bf215546Sopenharmony_ci         * instruction stream to fill its delay slots
1926bf215546Sopenharmony_ci         */
1927bf215546Sopenharmony_ci        int slots_filled = 0;
1928bf215546Sopenharmony_ci        while (slots_filled < 3 && block->instructions.next != &inst->link) {
1929bf215546Sopenharmony_ci                struct qinst *prev_inst = (struct qinst *) inst->link.prev;
1930bf215546Sopenharmony_ci                assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH);
1931bf215546Sopenharmony_ci
1932bf215546Sopenharmony_ci                /* Can't move the branch instruction if that would place it
1933bf215546Sopenharmony_ci                 * in the delay slots of other instructions.
1934bf215546Sopenharmony_ci                 */
1935bf215546Sopenharmony_ci                if (scoreboard->last_branch_tick + 3 >=
1936bf215546Sopenharmony_ci                    branch_tick - slots_filled - 1) {
1937bf215546Sopenharmony_ci                        break;
1938bf215546Sopenharmony_ci                }
1939bf215546Sopenharmony_ci
1940bf215546Sopenharmony_ci                if (scoreboard->last_thrsw_tick + 2 >=
1941bf215546Sopenharmony_ci                    branch_tick - slots_filled - 1) {
1942bf215546Sopenharmony_ci                        break;
1943bf215546Sopenharmony_ci                }
1944bf215546Sopenharmony_ci
1945bf215546Sopenharmony_ci                if (scoreboard->last_unifa_write_tick + 3 >=
1946bf215546Sopenharmony_ci                    branch_tick - slots_filled - 1) {
1947bf215546Sopenharmony_ci                        break;
1948bf215546Sopenharmony_ci                }
1949bf215546Sopenharmony_ci
1950bf215546Sopenharmony_ci                /* Do not move up a branch if it can disrupt an ldvary sequence
1951bf215546Sopenharmony_ci                 * as that can cause stomping of the r5 register.
1952bf215546Sopenharmony_ci                 */
1953bf215546Sopenharmony_ci                if (scoreboard->last_ldvary_tick + 2 >=
1954bf215546Sopenharmony_ci                    branch_tick - slots_filled) {
1955bf215546Sopenharmony_ci                       break;
1956bf215546Sopenharmony_ci                }
1957bf215546Sopenharmony_ci
1958bf215546Sopenharmony_ci                /* Can't move a conditional branch before the instruction
1959bf215546Sopenharmony_ci                 * that writes the flags for its condition.
1960bf215546Sopenharmony_ci                 */
1961bf215546Sopenharmony_ci                if (v3d_qpu_writes_flags(&prev_inst->qpu) &&
1962bf215546Sopenharmony_ci                    inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) {
1963bf215546Sopenharmony_ci                        break;
1964bf215546Sopenharmony_ci                }
1965bf215546Sopenharmony_ci
1966bf215546Sopenharmony_ci                if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst))
1967bf215546Sopenharmony_ci                        break;
1968bf215546Sopenharmony_ci
1969bf215546Sopenharmony_ci                if (!is_safe_msf_branch) {
1970bf215546Sopenharmony_ci                        struct qinst *prev_prev_inst =
1971bf215546Sopenharmony_ci                                (struct qinst *) prev_inst->link.prev;
1972bf215546Sopenharmony_ci                        if (prev_prev_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
1973bf215546Sopenharmony_ci                            prev_prev_inst->qpu.alu.add.op == V3D_QPU_A_SETMSF) {
1974bf215546Sopenharmony_ci                                break;
1975bf215546Sopenharmony_ci                        }
1976bf215546Sopenharmony_ci                }
1977bf215546Sopenharmony_ci
1978bf215546Sopenharmony_ci                list_del(&prev_inst->link);
1979bf215546Sopenharmony_ci                list_add(&prev_inst->link, &inst->link);
1980bf215546Sopenharmony_ci                slots_filled++;
1981bf215546Sopenharmony_ci        }
1982bf215546Sopenharmony_ci
1983bf215546Sopenharmony_ci        block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled;
1984bf215546Sopenharmony_ci        scoreboard->last_branch_tick = branch_tick - slots_filled;
1985bf215546Sopenharmony_ci
1986bf215546Sopenharmony_ci        /* Fill any remaining delay slots.
1987bf215546Sopenharmony_ci         *
1988bf215546Sopenharmony_ci         * For unconditional branches we'll try to fill these with the
1989bf215546Sopenharmony_ci         * first instructions in the successor block after scheduling
1990bf215546Sopenharmony_ci         * all blocks when setting up branch targets.
1991bf215546Sopenharmony_ci         */
1992bf215546Sopenharmony_ci        for (int i = 0; i < 3 - slots_filled; i++)
1993bf215546Sopenharmony_ci                emit_nop(c, block, scoreboard);
1994bf215546Sopenharmony_ci}
1995bf215546Sopenharmony_ci
1996bf215546Sopenharmony_cistatic bool
1997bf215546Sopenharmony_cialu_reads_register(struct v3d_qpu_instr *inst,
1998bf215546Sopenharmony_ci                   bool add, bool magic, uint32_t index)
1999bf215546Sopenharmony_ci{
2000bf215546Sopenharmony_ci        uint32_t num_src;
2001bf215546Sopenharmony_ci        enum v3d_qpu_mux mux_a, mux_b;
2002bf215546Sopenharmony_ci
2003bf215546Sopenharmony_ci        if (add) {
2004bf215546Sopenharmony_ci                num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
2005bf215546Sopenharmony_ci                mux_a = inst->alu.add.a;
2006bf215546Sopenharmony_ci                mux_b = inst->alu.add.b;
2007bf215546Sopenharmony_ci        } else {
2008bf215546Sopenharmony_ci                num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
2009bf215546Sopenharmony_ci                mux_a = inst->alu.mul.a;
2010bf215546Sopenharmony_ci                mux_b = inst->alu.mul.b;
2011bf215546Sopenharmony_ci        }
2012bf215546Sopenharmony_ci
2013bf215546Sopenharmony_ci        for (int i = 0; i < num_src; i++) {
2014bf215546Sopenharmony_ci                if (magic) {
2015bf215546Sopenharmony_ci                        if (i == 0 && mux_a == index)
2016bf215546Sopenharmony_ci                                return true;
2017bf215546Sopenharmony_ci                        if (i == 1 && mux_b == index)
2018bf215546Sopenharmony_ci                                return true;
2019bf215546Sopenharmony_ci                } else {
2020bf215546Sopenharmony_ci                        if (i == 0 && mux_a == V3D_QPU_MUX_A &&
2021bf215546Sopenharmony_ci                            inst->raddr_a == index) {
2022bf215546Sopenharmony_ci                                return true;
2023bf215546Sopenharmony_ci                        }
2024bf215546Sopenharmony_ci                        if (i == 0 && mux_a == V3D_QPU_MUX_B &&
2025bf215546Sopenharmony_ci                            inst->raddr_b == index) {
2026bf215546Sopenharmony_ci                                return true;
2027bf215546Sopenharmony_ci                        }
2028bf215546Sopenharmony_ci                        if (i == 1 && mux_b == V3D_QPU_MUX_A &&
2029bf215546Sopenharmony_ci                            inst->raddr_a == index) {
2030bf215546Sopenharmony_ci                                return true;
2031bf215546Sopenharmony_ci                        }
2032bf215546Sopenharmony_ci                        if (i == 1 && mux_b == V3D_QPU_MUX_B &&
2033bf215546Sopenharmony_ci                            inst->raddr_b == index) {
2034bf215546Sopenharmony_ci                                return true;
2035bf215546Sopenharmony_ci                        }
2036bf215546Sopenharmony_ci                }
2037bf215546Sopenharmony_ci        }
2038bf215546Sopenharmony_ci
2039bf215546Sopenharmony_ci        return false;
2040bf215546Sopenharmony_ci}
2041bf215546Sopenharmony_ci
2042bf215546Sopenharmony_ci/**
2043bf215546Sopenharmony_ci * This takes and ldvary signal merged into 'inst' and tries to move it up to
2044bf215546Sopenharmony_ci * the previous instruction to get good pipelining of ldvary sequences,
2045bf215546Sopenharmony_ci * transforming this:
2046bf215546Sopenharmony_ci *
2047bf215546Sopenharmony_ci * nop                  ; nop               ; ldvary.r4
2048bf215546Sopenharmony_ci * nop                  ; fmul  r0, r4, rf0 ;
2049bf215546Sopenharmony_ci * fadd  rf13, r0, r5   ; nop;              ; ldvary.r1  <-- inst
2050bf215546Sopenharmony_ci *
2051bf215546Sopenharmony_ci * into:
2052bf215546Sopenharmony_ci *
2053bf215546Sopenharmony_ci * nop                  ; nop               ; ldvary.r4
2054bf215546Sopenharmony_ci * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
2055bf215546Sopenharmony_ci * fadd  rf13, r0, r5   ; nop;              ;            <-- inst
2056bf215546Sopenharmony_ci *
2057bf215546Sopenharmony_ci * If we manage to do this successfully (we return true here), then flagging
2058bf215546Sopenharmony_ci * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
2059bf215546Sopenharmony_ci * we will be able to pick up to merge into 'inst', leading to code like this:
2060bf215546Sopenharmony_ci *
2061bf215546Sopenharmony_ci * nop                  ; nop               ; ldvary.r4
2062bf215546Sopenharmony_ci * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
2063bf215546Sopenharmony_ci * fadd  rf13, r0, r5   ; fmul  r2, r1, rf0 ;            <-- inst
2064bf215546Sopenharmony_ci */
2065bf215546Sopenharmony_cistatic bool
2066bf215546Sopenharmony_cifixup_pipelined_ldvary(struct v3d_compile *c,
2067bf215546Sopenharmony_ci                       struct choose_scoreboard *scoreboard,
2068bf215546Sopenharmony_ci                       struct qblock *block,
2069bf215546Sopenharmony_ci                       struct v3d_qpu_instr *inst)
2070bf215546Sopenharmony_ci{
2071bf215546Sopenharmony_ci        /* We only call this if we have successfuly merged an ldvary into a
2072bf215546Sopenharmony_ci         * previous instruction.
2073bf215546Sopenharmony_ci         */
2074bf215546Sopenharmony_ci        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
2075bf215546Sopenharmony_ci        assert(inst->sig.ldvary);
2076bf215546Sopenharmony_ci        uint32_t ldvary_magic = inst->sig_magic;
2077bf215546Sopenharmony_ci        uint32_t ldvary_index = inst->sig_addr;
2078bf215546Sopenharmony_ci
2079bf215546Sopenharmony_ci        /* The instruction in which we merged the ldvary cannot read
2080bf215546Sopenharmony_ci         * the ldvary destination, if it does, then moving the ldvary before
2081bf215546Sopenharmony_ci         * it would overwrite it.
2082bf215546Sopenharmony_ci         */
2083bf215546Sopenharmony_ci        if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
2084bf215546Sopenharmony_ci                return false;
2085bf215546Sopenharmony_ci        if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
2086bf215546Sopenharmony_ci                return false;
2087bf215546Sopenharmony_ci
2088bf215546Sopenharmony_ci        /* The implicit ldvary destination may not be written to by a signal
2089bf215546Sopenharmony_ci         * in the instruction following ldvary. Since we are planning to move
2090bf215546Sopenharmony_ci         * ldvary to the previous instruction, this means we need to check if
2091bf215546Sopenharmony_ci         * the current instruction has any other signal that could create this
2092bf215546Sopenharmony_ci         * conflict. The only other signal that can write to the implicit
2093bf215546Sopenharmony_ci         * ldvary destination that is compatible with ldvary in the same
2094bf215546Sopenharmony_ci         * instruction is ldunif.
2095bf215546Sopenharmony_ci         */
2096bf215546Sopenharmony_ci        if (inst->sig.ldunif)
2097bf215546Sopenharmony_ci                return false;
2098bf215546Sopenharmony_ci
2099bf215546Sopenharmony_ci        /* The previous instruction can't write to the same destination as the
2100bf215546Sopenharmony_ci         * ldvary.
2101bf215546Sopenharmony_ci         */
2102bf215546Sopenharmony_ci        struct qinst *prev = (struct qinst *) block->instructions.prev;
2103bf215546Sopenharmony_ci        if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
2104bf215546Sopenharmony_ci                return false;
2105bf215546Sopenharmony_ci
2106bf215546Sopenharmony_ci        if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) {
2107bf215546Sopenharmony_ci                if (prev->qpu.alu.add.magic_write == ldvary_magic &&
2108bf215546Sopenharmony_ci                    prev->qpu.alu.add.waddr == ldvary_index) {
2109bf215546Sopenharmony_ci                        return false;
2110bf215546Sopenharmony_ci                }
2111bf215546Sopenharmony_ci        }
2112bf215546Sopenharmony_ci
2113bf215546Sopenharmony_ci        if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) {
2114bf215546Sopenharmony_ci                if (prev->qpu.alu.mul.magic_write == ldvary_magic &&
2115bf215546Sopenharmony_ci                    prev->qpu.alu.mul.waddr == ldvary_index) {
2116bf215546Sopenharmony_ci                        return false;
2117bf215546Sopenharmony_ci                }
2118bf215546Sopenharmony_ci        }
2119bf215546Sopenharmony_ci
2120bf215546Sopenharmony_ci        /* The previous instruction cannot have a conflicting signal */
2121bf215546Sopenharmony_ci        if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
2122bf215546Sopenharmony_ci                return false;
2123bf215546Sopenharmony_ci
2124bf215546Sopenharmony_ci        uint32_t sig;
2125bf215546Sopenharmony_ci        struct v3d_qpu_sig new_sig = prev->qpu.sig;
2126bf215546Sopenharmony_ci        new_sig.ldvary = true;
2127bf215546Sopenharmony_ci        if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig))
2128bf215546Sopenharmony_ci                return false;
2129bf215546Sopenharmony_ci
2130bf215546Sopenharmony_ci        /* The previous instruction cannot use flags since ldvary uses the
2131bf215546Sopenharmony_ci         * 'cond' instruction field to store the destination.
2132bf215546Sopenharmony_ci         */
2133bf215546Sopenharmony_ci        if (v3d_qpu_writes_flags(&prev->qpu))
2134bf215546Sopenharmony_ci                return false;
2135bf215546Sopenharmony_ci        if (v3d_qpu_reads_flags(&prev->qpu))
2136bf215546Sopenharmony_ci                return false;
2137bf215546Sopenharmony_ci
2138bf215546Sopenharmony_ci        /* We can't put an ldvary in the delay slots of a thrsw. We should've
2139bf215546Sopenharmony_ci         * prevented this when pairing up the ldvary with another instruction
2140bf215546Sopenharmony_ci         * and flagging it for a fixup.
2141bf215546Sopenharmony_ci         */
2142bf215546Sopenharmony_ci        assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);
2143bf215546Sopenharmony_ci
2144bf215546Sopenharmony_ci        /* Move the ldvary to the previous instruction and remove it from the
2145bf215546Sopenharmony_ci         * current one.
2146bf215546Sopenharmony_ci         */
2147bf215546Sopenharmony_ci        prev->qpu.sig.ldvary = true;
2148bf215546Sopenharmony_ci        prev->qpu.sig_magic = ldvary_magic;
2149bf215546Sopenharmony_ci        prev->qpu.sig_addr = ldvary_index;
2150bf215546Sopenharmony_ci        scoreboard->last_ldvary_tick = scoreboard->tick - 1;
2151bf215546Sopenharmony_ci
2152bf215546Sopenharmony_ci        inst->sig.ldvary = false;
2153bf215546Sopenharmony_ci        inst->sig_magic = false;
2154bf215546Sopenharmony_ci        inst->sig_addr = 0;
2155bf215546Sopenharmony_ci
2156bf215546Sopenharmony_ci        /* By moving ldvary to the previous instruction we make it update
2157bf215546Sopenharmony_ci         * r5 in the current one, so nothing else in it should write r5.
2158bf215546Sopenharmony_ci         * This should've been prevented by our depedency tracking, which
2159bf215546Sopenharmony_ci         * would not allow ldvary to be paired up with an instruction that
2160bf215546Sopenharmony_ci         * writes r5 (since our dependency tracking doesn't know that the
2161bf215546Sopenharmony_ci         * ldvary write r5 happens in the next instruction).
2162bf215546Sopenharmony_ci         */
2163bf215546Sopenharmony_ci        assert(!v3d_qpu_writes_r5(c->devinfo, inst));
2164bf215546Sopenharmony_ci
2165bf215546Sopenharmony_ci        return true;
2166bf215546Sopenharmony_ci}
2167bf215546Sopenharmony_ci
2168bf215546Sopenharmony_cistatic uint32_t
2169bf215546Sopenharmony_cischedule_instructions(struct v3d_compile *c,
2170bf215546Sopenharmony_ci                      struct choose_scoreboard *scoreboard,
2171bf215546Sopenharmony_ci                      struct qblock *block,
2172bf215546Sopenharmony_ci                      enum quniform_contents *orig_uniform_contents,
2173bf215546Sopenharmony_ci                      uint32_t *orig_uniform_data,
2174bf215546Sopenharmony_ci                      uint32_t *next_uniform)
2175bf215546Sopenharmony_ci{
2176bf215546Sopenharmony_ci        const struct v3d_device_info *devinfo = c->devinfo;
2177bf215546Sopenharmony_ci        uint32_t time = 0;
2178bf215546Sopenharmony_ci
2179bf215546Sopenharmony_ci        while (!list_is_empty(&scoreboard->dag->heads)) {
2180bf215546Sopenharmony_ci                struct schedule_node *chosen =
2181bf215546Sopenharmony_ci                        choose_instruction_to_schedule(c, scoreboard, NULL);
2182bf215546Sopenharmony_ci                struct schedule_node *merge = NULL;
2183bf215546Sopenharmony_ci
2184bf215546Sopenharmony_ci                /* If there are no valid instructions to schedule, drop a NOP
2185bf215546Sopenharmony_ci                 * in.
2186bf215546Sopenharmony_ci                 */
2187bf215546Sopenharmony_ci                struct qinst *qinst = chosen ? chosen->inst : vir_nop();
2188bf215546Sopenharmony_ci                struct v3d_qpu_instr *inst = &qinst->qpu;
2189bf215546Sopenharmony_ci
2190bf215546Sopenharmony_ci                if (debug) {
2191bf215546Sopenharmony_ci                        fprintf(stderr, "t=%4d: current list:\n",
2192bf215546Sopenharmony_ci                                time);
2193bf215546Sopenharmony_ci                        dump_state(devinfo, scoreboard->dag);
2194bf215546Sopenharmony_ci                        fprintf(stderr, "t=%4d: chose:   ", time);
2195bf215546Sopenharmony_ci                        v3d_qpu_dump(devinfo, inst);
2196bf215546Sopenharmony_ci                        fprintf(stderr, "\n");
2197bf215546Sopenharmony_ci                }
2198bf215546Sopenharmony_ci
2199bf215546Sopenharmony_ci                /* We can't mark_instruction_scheduled() the chosen inst until
2200bf215546Sopenharmony_ci                 * we're done identifying instructions to merge, so put the
2201bf215546Sopenharmony_ci                 * merged instructions on a list for a moment.
2202bf215546Sopenharmony_ci                 */
2203bf215546Sopenharmony_ci                struct list_head merged_list;
2204bf215546Sopenharmony_ci                list_inithead(&merged_list);
2205bf215546Sopenharmony_ci
2206bf215546Sopenharmony_ci                /* Schedule this instruction onto the QPU list. Also try to
2207bf215546Sopenharmony_ci                 * find an instruction to pair with it.
2208bf215546Sopenharmony_ci                 */
2209bf215546Sopenharmony_ci                if (chosen) {
2210bf215546Sopenharmony_ci                        time = MAX2(chosen->unblocked_time, time);
2211bf215546Sopenharmony_ci                        pre_remove_head(scoreboard->dag, chosen);
2212bf215546Sopenharmony_ci
2213bf215546Sopenharmony_ci                        while ((merge =
2214bf215546Sopenharmony_ci                                choose_instruction_to_schedule(c, scoreboard,
2215bf215546Sopenharmony_ci                                                               chosen))) {
2216bf215546Sopenharmony_ci                                time = MAX2(merge->unblocked_time, time);
2217bf215546Sopenharmony_ci                                pre_remove_head(scoreboard->dag, merge);
2218bf215546Sopenharmony_ci                                list_addtail(&merge->link, &merged_list);
2219bf215546Sopenharmony_ci                                (void)qpu_merge_inst(devinfo, inst,
2220bf215546Sopenharmony_ci                                                     inst, &merge->inst->qpu);
2221bf215546Sopenharmony_ci                                if (merge->inst->uniform != -1) {
2222bf215546Sopenharmony_ci                                        chosen->inst->uniform =
2223bf215546Sopenharmony_ci                                                merge->inst->uniform;
2224bf215546Sopenharmony_ci                                }
2225bf215546Sopenharmony_ci
2226bf215546Sopenharmony_ci                                if (debug) {
2227bf215546Sopenharmony_ci                                        fprintf(stderr, "t=%4d: merging: ",
2228bf215546Sopenharmony_ci                                                time);
2229bf215546Sopenharmony_ci                                        v3d_qpu_dump(devinfo, &merge->inst->qpu);
2230bf215546Sopenharmony_ci                                        fprintf(stderr, "\n");
2231bf215546Sopenharmony_ci                                        fprintf(stderr, "         result: ");
2232bf215546Sopenharmony_ci                                        v3d_qpu_dump(devinfo, inst);
2233bf215546Sopenharmony_ci                                        fprintf(stderr, "\n");
2234bf215546Sopenharmony_ci                                }
2235bf215546Sopenharmony_ci
2236bf215546Sopenharmony_ci                                if (scoreboard->fixup_ldvary) {
2237bf215546Sopenharmony_ci                                        scoreboard->fixup_ldvary = false;
2238bf215546Sopenharmony_ci                                        if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
2239bf215546Sopenharmony_ci                                                /* Flag the ldvary as scheduled
2240bf215546Sopenharmony_ci                                                 * now so we can try to merge the
2241bf215546Sopenharmony_ci                                                 * follow-up instruction in the
2242bf215546Sopenharmony_ci                                                 * the ldvary sequence into the
2243bf215546Sopenharmony_ci                                                 * current instruction.
2244bf215546Sopenharmony_ci                                                 */
2245bf215546Sopenharmony_ci                                                mark_instruction_scheduled(
2246bf215546Sopenharmony_ci                                                        devinfo, scoreboard->dag,
2247bf215546Sopenharmony_ci                                                        time, merge);
2248bf215546Sopenharmony_ci                                        }
2249bf215546Sopenharmony_ci                                }
2250bf215546Sopenharmony_ci                        }
2251bf215546Sopenharmony_ci                        if (mux_read_stalls(scoreboard, inst))
2252bf215546Sopenharmony_ci                                c->qpu_inst_stalled_count++;
2253bf215546Sopenharmony_ci                }
2254bf215546Sopenharmony_ci
2255bf215546Sopenharmony_ci                /* Update the uniform index for the rewritten location --
2256bf215546Sopenharmony_ci                 * branch target updating will still need to change
2257bf215546Sopenharmony_ci                 * c->uniform_data[] using this index.
2258bf215546Sopenharmony_ci                 */
2259bf215546Sopenharmony_ci                if (qinst->uniform != -1) {
2260bf215546Sopenharmony_ci                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
2261bf215546Sopenharmony_ci                                block->branch_uniform = *next_uniform;
2262bf215546Sopenharmony_ci
2263bf215546Sopenharmony_ci                        c->uniform_data[*next_uniform] =
2264bf215546Sopenharmony_ci                                orig_uniform_data[qinst->uniform];
2265bf215546Sopenharmony_ci                        c->uniform_contents[*next_uniform] =
2266bf215546Sopenharmony_ci                                orig_uniform_contents[qinst->uniform];
2267bf215546Sopenharmony_ci                        qinst->uniform = *next_uniform;
2268bf215546Sopenharmony_ci                        (*next_uniform)++;
2269bf215546Sopenharmony_ci                }
2270bf215546Sopenharmony_ci
2271bf215546Sopenharmony_ci                if (debug) {
2272bf215546Sopenharmony_ci                        fprintf(stderr, "\n");
2273bf215546Sopenharmony_ci                }
2274bf215546Sopenharmony_ci
2275bf215546Sopenharmony_ci                /* Now that we've scheduled a new instruction, some of its
2276bf215546Sopenharmony_ci                 * children can be promoted to the list of instructions ready to
2277bf215546Sopenharmony_ci                 * be scheduled.  Update the children's unblocked time for this
2278bf215546Sopenharmony_ci                 * DAG edge as we do so.
2279bf215546Sopenharmony_ci                 */
2280bf215546Sopenharmony_ci                mark_instruction_scheduled(devinfo, scoreboard->dag, time, chosen);
2281bf215546Sopenharmony_ci                list_for_each_entry(struct schedule_node, merge, &merged_list,
2282bf215546Sopenharmony_ci                                    link) {
2283bf215546Sopenharmony_ci                        mark_instruction_scheduled(devinfo, scoreboard->dag, time, merge);
2284bf215546Sopenharmony_ci
2285bf215546Sopenharmony_ci                        /* The merged VIR instruction doesn't get re-added to the
2286bf215546Sopenharmony_ci                         * block, so free it now.
2287bf215546Sopenharmony_ci                         */
2288bf215546Sopenharmony_ci                        free(merge->inst);
2289bf215546Sopenharmony_ci                }
2290bf215546Sopenharmony_ci
2291bf215546Sopenharmony_ci                if (inst->sig.thrsw) {
2292bf215546Sopenharmony_ci                        time += emit_thrsw(c, block, scoreboard, qinst, false);
2293bf215546Sopenharmony_ci                } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
2294bf215546Sopenharmony_ci                        emit_branch(c, block, scoreboard, qinst);
2295bf215546Sopenharmony_ci                } else {
2296bf215546Sopenharmony_ci                        insert_scheduled_instruction(c, block,
2297bf215546Sopenharmony_ci                                                     scoreboard, qinst);
2298bf215546Sopenharmony_ci                }
2299bf215546Sopenharmony_ci        }
2300bf215546Sopenharmony_ci
2301bf215546Sopenharmony_ci        return time;
2302bf215546Sopenharmony_ci}
2303bf215546Sopenharmony_ci
2304bf215546Sopenharmony_cistatic uint32_t
2305bf215546Sopenharmony_ciqpu_schedule_instructions_block(struct v3d_compile *c,
2306bf215546Sopenharmony_ci                                struct choose_scoreboard *scoreboard,
2307bf215546Sopenharmony_ci                                struct qblock *block,
2308bf215546Sopenharmony_ci                                enum quniform_contents *orig_uniform_contents,
2309bf215546Sopenharmony_ci                                uint32_t *orig_uniform_data,
2310bf215546Sopenharmony_ci                                uint32_t *next_uniform)
2311bf215546Sopenharmony_ci{
2312bf215546Sopenharmony_ci        void *mem_ctx = ralloc_context(NULL);
2313bf215546Sopenharmony_ci        scoreboard->dag = dag_create(mem_ctx);
2314bf215546Sopenharmony_ci        struct list_head setup_list;
2315bf215546Sopenharmony_ci
2316bf215546Sopenharmony_ci        list_inithead(&setup_list);
2317bf215546Sopenharmony_ci
2318bf215546Sopenharmony_ci        /* Wrap each instruction in a scheduler structure. */
2319bf215546Sopenharmony_ci        while (!list_is_empty(&block->instructions)) {
2320bf215546Sopenharmony_ci                struct qinst *qinst = (struct qinst *)block->instructions.next;
2321bf215546Sopenharmony_ci                struct schedule_node *n =
2322bf215546Sopenharmony_ci                        rzalloc(mem_ctx, struct schedule_node);
2323bf215546Sopenharmony_ci
2324bf215546Sopenharmony_ci                dag_init_node(scoreboard->dag, &n->dag);
2325bf215546Sopenharmony_ci                n->inst = qinst;
2326bf215546Sopenharmony_ci
2327bf215546Sopenharmony_ci                list_del(&qinst->link);
2328bf215546Sopenharmony_ci                list_addtail(&n->link, &setup_list);
2329bf215546Sopenharmony_ci        }
2330bf215546Sopenharmony_ci
2331bf215546Sopenharmony_ci        calculate_forward_deps(c, scoreboard->dag, &setup_list);
2332bf215546Sopenharmony_ci        calculate_reverse_deps(c, scoreboard->dag, &setup_list);
2333bf215546Sopenharmony_ci
2334bf215546Sopenharmony_ci        dag_traverse_bottom_up(scoreboard->dag, compute_delay, c);
2335bf215546Sopenharmony_ci
2336bf215546Sopenharmony_ci        uint32_t cycles = schedule_instructions(c, scoreboard, block,
2337bf215546Sopenharmony_ci                                                orig_uniform_contents,
2338bf215546Sopenharmony_ci                                                orig_uniform_data,
2339bf215546Sopenharmony_ci                                                next_uniform);
2340bf215546Sopenharmony_ci
2341bf215546Sopenharmony_ci        ralloc_free(mem_ctx);
2342bf215546Sopenharmony_ci        scoreboard->dag = NULL;
2343bf215546Sopenharmony_ci
2344bf215546Sopenharmony_ci        return cycles;
2345bf215546Sopenharmony_ci}
2346bf215546Sopenharmony_ci
2347bf215546Sopenharmony_cistatic void
2348bf215546Sopenharmony_ciqpu_set_branch_targets(struct v3d_compile *c)
2349bf215546Sopenharmony_ci{
2350bf215546Sopenharmony_ci        vir_for_each_block(block, c) {
2351bf215546Sopenharmony_ci                /* The end block of the program has no branch. */
2352bf215546Sopenharmony_ci                if (!block->successors[0])
2353bf215546Sopenharmony_ci                        continue;
2354bf215546Sopenharmony_ci
2355bf215546Sopenharmony_ci                /* If there was no branch instruction, then the successor
2356bf215546Sopenharmony_ci                 * block must follow immediately after this one.
2357bf215546Sopenharmony_ci                 */
2358bf215546Sopenharmony_ci                if (block->branch_qpu_ip == ~0) {
2359bf215546Sopenharmony_ci                        assert(block->end_qpu_ip + 1 ==
2360bf215546Sopenharmony_ci                               block->successors[0]->start_qpu_ip);
2361bf215546Sopenharmony_ci                        continue;
2362bf215546Sopenharmony_ci                }
2363bf215546Sopenharmony_ci
2364bf215546Sopenharmony_ci                /* Walk back through the delay slots to find the branch
2365bf215546Sopenharmony_ci                 * instr.
2366bf215546Sopenharmony_ci                 */
2367bf215546Sopenharmony_ci                struct qinst *branch = NULL;
2368bf215546Sopenharmony_ci                struct list_head *entry = block->instructions.prev;
2369bf215546Sopenharmony_ci                int32_t delay_slot_count = -1;
2370bf215546Sopenharmony_ci                struct qinst *delay_slots_start = NULL;
2371bf215546Sopenharmony_ci                for (int i = 0; i < 3; i++) {
2372bf215546Sopenharmony_ci                        entry = entry->prev;
2373bf215546Sopenharmony_ci                        struct qinst *inst =
2374bf215546Sopenharmony_ci                                container_of(entry, struct qinst, link);
2375bf215546Sopenharmony_ci
2376bf215546Sopenharmony_ci                        if (delay_slot_count == -1) {
2377bf215546Sopenharmony_ci                                if (!v3d_qpu_is_nop(&inst->qpu))
2378bf215546Sopenharmony_ci                                        delay_slot_count = i;
2379bf215546Sopenharmony_ci                                else
2380bf215546Sopenharmony_ci                                        delay_slots_start = inst;
2381bf215546Sopenharmony_ci                        }
2382bf215546Sopenharmony_ci
2383bf215546Sopenharmony_ci                        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
2384bf215546Sopenharmony_ci                                branch = inst;
2385bf215546Sopenharmony_ci                                break;
2386bf215546Sopenharmony_ci                        }
2387bf215546Sopenharmony_ci                }
2388bf215546Sopenharmony_ci                assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
2389bf215546Sopenharmony_ci                assert(delay_slot_count >= 0 && delay_slot_count <= 3);
2390bf215546Sopenharmony_ci                assert(delay_slot_count == 0 || delay_slots_start != NULL);
2391bf215546Sopenharmony_ci
2392bf215546Sopenharmony_ci                /* Make sure that the if-we-don't-jump
2393bf215546Sopenharmony_ci                 * successor was scheduled just after the
2394bf215546Sopenharmony_ci                 * delay slots.
2395bf215546Sopenharmony_ci                 */
2396bf215546Sopenharmony_ci                assert(!block->successors[1] ||
2397bf215546Sopenharmony_ci                       block->successors[1]->start_qpu_ip ==
2398bf215546Sopenharmony_ci                       block->branch_qpu_ip + 4);
2399bf215546Sopenharmony_ci
2400bf215546Sopenharmony_ci                branch->qpu.branch.offset =
2401bf215546Sopenharmony_ci                        ((block->successors[0]->start_qpu_ip -
2402bf215546Sopenharmony_ci                          (block->branch_qpu_ip + 4)) *
2403bf215546Sopenharmony_ci                         sizeof(uint64_t));
2404bf215546Sopenharmony_ci
2405bf215546Sopenharmony_ci                /* Set up the relative offset to jump in the
2406bf215546Sopenharmony_ci                 * uniform stream.
2407bf215546Sopenharmony_ci                 *
2408bf215546Sopenharmony_ci                 * Use a temporary here, because
2409bf215546Sopenharmony_ci                 * uniform_data[inst->uniform] may be shared
2410bf215546Sopenharmony_ci                 * between multiple instructions.
2411bf215546Sopenharmony_ci                 */
2412bf215546Sopenharmony_ci                assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
2413bf215546Sopenharmony_ci                c->uniform_data[branch->uniform] =
2414bf215546Sopenharmony_ci                        (block->successors[0]->start_uniform -
2415bf215546Sopenharmony_ci                         (block->branch_uniform + 1)) * 4;
2416bf215546Sopenharmony_ci
2417bf215546Sopenharmony_ci                /* If this is an unconditional branch, try to fill any remaining
2418bf215546Sopenharmony_ci                 * delay slots with the initial instructions of the successor
2419bf215546Sopenharmony_ci                 * block.
2420bf215546Sopenharmony_ci                 *
2421bf215546Sopenharmony_ci                 * FIXME: we can do the same for conditional branches if we
2422bf215546Sopenharmony_ci                 * predicate the instructions to match the branch condition.
2423bf215546Sopenharmony_ci                 */
2424bf215546Sopenharmony_ci                if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) {
2425bf215546Sopenharmony_ci                        struct list_head *successor_insts =
2426bf215546Sopenharmony_ci                                &block->successors[0]->instructions;
2427bf215546Sopenharmony_ci                        delay_slot_count = MIN2(delay_slot_count,
2428bf215546Sopenharmony_ci                                                list_length(successor_insts));
2429bf215546Sopenharmony_ci                        struct qinst *s_inst =
2430bf215546Sopenharmony_ci                                (struct qinst *) successor_insts->next;
2431bf215546Sopenharmony_ci                        struct qinst *slot = delay_slots_start;
2432bf215546Sopenharmony_ci                        int slots_filled = 0;
2433bf215546Sopenharmony_ci                        while (slots_filled < delay_slot_count &&
2434bf215546Sopenharmony_ci                               qpu_inst_valid_in_branch_delay_slot(c, s_inst)) {
2435bf215546Sopenharmony_ci                                memcpy(&slot->qpu, &s_inst->qpu,
2436bf215546Sopenharmony_ci                                       sizeof(slot->qpu));
2437bf215546Sopenharmony_ci                                s_inst = (struct qinst *) s_inst->link.next;
2438bf215546Sopenharmony_ci                                slot = (struct qinst *) slot->link.next;
2439bf215546Sopenharmony_ci                                slots_filled++;
2440bf215546Sopenharmony_ci                        }
2441bf215546Sopenharmony_ci                        branch->qpu.branch.offset +=
2442bf215546Sopenharmony_ci                                slots_filled * sizeof(uint64_t);
2443bf215546Sopenharmony_ci                }
2444bf215546Sopenharmony_ci        }
2445bf215546Sopenharmony_ci}
2446bf215546Sopenharmony_ci
2447bf215546Sopenharmony_ciuint32_t
2448bf215546Sopenharmony_civ3d_qpu_schedule_instructions(struct v3d_compile *c)
2449bf215546Sopenharmony_ci{
2450bf215546Sopenharmony_ci        const struct v3d_device_info *devinfo = c->devinfo;
2451bf215546Sopenharmony_ci        struct qblock *end_block = list_last_entry(&c->blocks,
2452bf215546Sopenharmony_ci                                                   struct qblock, link);
2453bf215546Sopenharmony_ci
2454bf215546Sopenharmony_ci        /* We reorder the uniforms as we schedule instructions, so save the
2455bf215546Sopenharmony_ci         * old data off and replace it.
2456bf215546Sopenharmony_ci         */
2457bf215546Sopenharmony_ci        uint32_t *uniform_data = c->uniform_data;
2458bf215546Sopenharmony_ci        enum quniform_contents *uniform_contents = c->uniform_contents;
2459bf215546Sopenharmony_ci        c->uniform_contents = ralloc_array(c, enum quniform_contents,
2460bf215546Sopenharmony_ci                                           c->num_uniforms);
2461bf215546Sopenharmony_ci        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
2462bf215546Sopenharmony_ci        c->uniform_array_size = c->num_uniforms;
2463bf215546Sopenharmony_ci        uint32_t next_uniform = 0;
2464bf215546Sopenharmony_ci
2465bf215546Sopenharmony_ci        struct choose_scoreboard scoreboard;
2466bf215546Sopenharmony_ci        memset(&scoreboard, 0, sizeof(scoreboard));
2467bf215546Sopenharmony_ci        scoreboard.last_ldvary_tick = -10;
2468bf215546Sopenharmony_ci        scoreboard.last_unifa_write_tick = -10;
2469bf215546Sopenharmony_ci        scoreboard.last_magic_sfu_write_tick = -10;
2470bf215546Sopenharmony_ci        scoreboard.last_uniforms_reset_tick = -10;
2471bf215546Sopenharmony_ci        scoreboard.last_thrsw_tick = -10;
2472bf215546Sopenharmony_ci        scoreboard.last_branch_tick = -10;
2473bf215546Sopenharmony_ci        scoreboard.last_setmsf_tick = -10;
2474bf215546Sopenharmony_ci        scoreboard.last_stallable_sfu_tick = -10;
2475bf215546Sopenharmony_ci
2476bf215546Sopenharmony_ci        if (debug) {
2477bf215546Sopenharmony_ci                fprintf(stderr, "Pre-schedule instructions\n");
2478bf215546Sopenharmony_ci                vir_for_each_block(block, c) {
2479bf215546Sopenharmony_ci                        fprintf(stderr, "BLOCK %d\n", block->index);
2480bf215546Sopenharmony_ci                        list_for_each_entry(struct qinst, qinst,
2481bf215546Sopenharmony_ci                                            &block->instructions, link) {
2482bf215546Sopenharmony_ci                                v3d_qpu_dump(devinfo, &qinst->qpu);
2483bf215546Sopenharmony_ci                                fprintf(stderr, "\n");
2484bf215546Sopenharmony_ci                        }
2485bf215546Sopenharmony_ci                }
2486bf215546Sopenharmony_ci                fprintf(stderr, "\n");
2487bf215546Sopenharmony_ci        }
2488bf215546Sopenharmony_ci
2489bf215546Sopenharmony_ci        uint32_t cycles = 0;
2490bf215546Sopenharmony_ci        vir_for_each_block(block, c) {
2491bf215546Sopenharmony_ci                block->start_qpu_ip = c->qpu_inst_count;
2492bf215546Sopenharmony_ci                block->branch_qpu_ip = ~0;
2493bf215546Sopenharmony_ci                block->start_uniform = next_uniform;
2494bf215546Sopenharmony_ci
2495bf215546Sopenharmony_ci                cycles += qpu_schedule_instructions_block(c,
2496bf215546Sopenharmony_ci                                                          &scoreboard,
2497bf215546Sopenharmony_ci                                                          block,
2498bf215546Sopenharmony_ci                                                          uniform_contents,
2499bf215546Sopenharmony_ci                                                          uniform_data,
2500bf215546Sopenharmony_ci                                                          &next_uniform);
2501bf215546Sopenharmony_ci
2502bf215546Sopenharmony_ci                block->end_qpu_ip = c->qpu_inst_count - 1;
2503bf215546Sopenharmony_ci        }
2504bf215546Sopenharmony_ci
2505bf215546Sopenharmony_ci        /* Emit the program-end THRSW instruction. */;
2506bf215546Sopenharmony_ci        struct qinst *thrsw = vir_nop();
2507bf215546Sopenharmony_ci        thrsw->qpu.sig.thrsw = true;
2508bf215546Sopenharmony_ci        emit_thrsw(c, end_block, &scoreboard, thrsw, true);
2509bf215546Sopenharmony_ci
2510bf215546Sopenharmony_ci        qpu_set_branch_targets(c);
2511bf215546Sopenharmony_ci
2512bf215546Sopenharmony_ci        assert(next_uniform == c->num_uniforms);
2513bf215546Sopenharmony_ci
2514bf215546Sopenharmony_ci        return cycles;
2515bf215546Sopenharmony_ci}
2516