/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014-2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
 * pick a DAG head, then put all the children that are now DAG heads into the
 * list of things to schedule.
 *
 * The goal of scheduling here is to pack pairs of operations together in a
 * single QPU instruction.
 */

#include "qpu/qpu_disasm.h"
#include "v3d_compiler.h"
#include "util/ralloc.h"
#include "util/dag.h"

static bool debug;

struct schedule_node_child;

struct schedule_node {
        struct dag_node dag;
        struct list_head link;
        struct qinst *inst;

        /* Longest cycles + instruction_latency() of any parent of this node. */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * Cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

struct schedule_state {
        const struct v3d_device_info *devinfo;
        struct dag *dag;
        struct schedule_node *last_r[6];
        struct schedule_node *last_rf[64];
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_tmu_write;
        struct schedule_node *last_tmu_config;
        struct schedule_node *last_tmu_read;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        struct schedule_node *last_unif;
        struct schedule_node *last_rtop;
        struct schedule_node *last_unifa;
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};
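/* Adds a dependency edge between two nodes to the DAG, swapping before/after
 * when walking the block in reverse.  The edge data records whether this is
 * a write-after-read dependency, which dump_state() reports for each edge.
 */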
static void
add_dep(struct schedule_state *state,
        struct schedule_node *before,
        struct schedule_node *after,
        bool write)
{
        bool write_after_read = !write && state->dir == R;
        uintptr_t edge_data = write_after_read;

        if (!before || !after)
                return;

        assert(before != after);

        if (state->dir == F)
                dag_add_edge(&before->dag, &after->dag, edge_data);
        else
                dag_add_edge(&after->dag, &before->dag, edge_data);
}

static void
add_read_dep(struct schedule_state *state,
              struct schedule_node *before,
              struct schedule_node *after)
{
        add_dep(state, before, after, false);
}

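/* Adds a dependency on the last node to have accessed the tracked state,
 * then updates the tracking pointer so that later accesses are ordered
 * against this write.
 */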
static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(state, *before, after, true);
        *before = after;
}

static bool
qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
{
        if (inst->sig.ldtlb || inst->sig.ldtlbu)
                return true;

        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (inst->alu.add.magic_write &&
            (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
             inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
                return true;

        if (inst->alu.mul.magic_write &&
            (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
             inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
                return true;

        return false;
}

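/* Adds a read dependency on whatever the mux sources: the register file
 * through raddr_a or raddr_b (unless raddr_b holds a small immediate), or
 * one of the accumulators.
 */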
static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
                 enum v3d_qpu_mux mux)
{
        switch (mux) {
        case V3D_QPU_MUX_A:
                add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
                break;
        case V3D_QPU_MUX_B:
                if (!n->inst->qpu.sig.small_imm) {
                        add_read_dep(state,
                                     state->last_rf[n->inst->qpu.raddr_b], n);
                }
                break;
        default:
                add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
                break;
        }
}

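/* Returns whether a write to this TMU register fires off the lookup,
 * terminating the sequence of TMU register writes that configures it.
 */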
static bool
tmu_write_is_sequence_terminator(uint32_t waddr)
{
        switch (waddr) {
        case V3D_QPU_WADDR_TMUS:
        case V3D_QPU_WADDR_TMUSCM:
        case V3D_QPU_WADDR_TMUSF:
        case V3D_QPU_WADDR_TMUSLOD:
        case V3D_QPU_WADDR_TMUA:
        case V3D_QPU_WADDR_TMUAU:
                return true;
        default:
                return false;
        }
}

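/* Returns whether a TMU register write may be reordered against other TMU
 * writes.  On V3D 4.x, writes that just set up state for a sequence can be
 * reordered, but sequence terminators and TMUD data writes must stay in
 * order.
 */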
static bool
can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr)
{
        if (devinfo->ver < 40)
                return false;

        if (tmu_write_is_sequence_terminator(waddr))
                return false;

        if (waddr == V3D_QPU_WADDR_TMUD)
                return false;

        return true;
}

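/* Adds the dependencies implied by an instruction writing to waddr, which
 * may name either a physical regfile address or (with magic set) a
 * peripheral register.
 */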
static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t waddr, bool magic)
{
        if (!magic) {
                add_write_dep(state, &state->last_rf[waddr], n);
        } else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) {
                if (can_reorder_tmu_write(state->devinfo, waddr))
                        add_read_dep(state, state->last_tmu_write, n);
                else
                        add_write_dep(state, &state->last_tmu_write, n);

                if (tmu_write_is_sequence_terminator(waddr))
                        add_write_dep(state, &state->last_tmu_config, n);
        } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
                /* Handled by v3d_qpu_writes_r4() check. */
        } else {
                switch (waddr) {
                case V3D_QPU_WADDR_R0:
                case V3D_QPU_WADDR_R1:
                case V3D_QPU_WADDR_R2:
                        add_write_dep(state,
                                      &state->last_r[waddr - V3D_QPU_WADDR_R0],
                                      n);
                        break;
                case V3D_QPU_WADDR_R3:
                case V3D_QPU_WADDR_R4:
                case V3D_QPU_WADDR_R5:
                        /* Handled by v3d_qpu_writes_r*() checks below. */
                        break;

                case V3D_QPU_WADDR_VPM:
                case V3D_QPU_WADDR_VPMU:
                        add_write_dep(state, &state->last_vpm, n);
                        break;

                case V3D_QPU_WADDR_TLB:
                case V3D_QPU_WADDR_TLBU:
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case V3D_QPU_WADDR_SYNC:
                case V3D_QPU_WADDR_SYNCB:
                case V3D_QPU_WADDR_SYNCU:
                        /* For CS barrier(): Sync against any other memory
                         * accesses.  There doesn't appear to be any need for
                         * barriers to affect ALU operations.
                         */
                        add_write_dep(state, &state->last_tmu_write, n);
                        add_write_dep(state, &state->last_tmu_read, n);
                        break;

                case V3D_QPU_WADDR_UNIFA:
                        if (state->devinfo->ver >= 40)
                                add_write_dep(state, &state->last_unifa, n);
                        break;

                case V3D_QPU_WADDR_NOP:
                        break;

                default:
                        fprintf(stderr, "Unknown waddr %d\n", waddr);
                        abort();
                }
        }
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        const struct v3d_device_info *devinfo = state->devinfo;
        struct qinst *qinst = n->inst;
        struct v3d_qpu_instr *inst = &qinst->qpu;
        /* If the input and output segments are shared, then all VPM reads to
         * a location need to happen before all writes.  We handle this by
         * serializing all VPM operations for now.
         */
        bool separate_vpm_segment = false;

        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
                        add_read_dep(state, state->last_sf, n);

                /* XXX: BDI */
                /* XXX: BDU */
                /* XXX: ub */
                /* XXX: raddr_a */

                add_write_dep(state, &state->last_unif, n);
                return;
        }

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        /* XXX: LOAD_IMM */

        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
                process_mux_deps(state, n, inst->alu.add.a);
        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
                process_mux_deps(state, n, inst->alu.add.b);

        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
                process_mux_deps(state, n, inst->alu.mul.a);
        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
                process_mux_deps(state, n, inst->alu.mul.b);

        switch (inst->alu.add.op) {
        case V3D_QPU_A_VPMSETUP:
                /* Could distinguish read/write by unpacking the uniform. */
                add_write_dep(state, &state->last_vpm, n);
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case V3D_QPU_A_STVPMV:
        case V3D_QPU_A_STVPMD:
        case V3D_QPU_A_STVPMP:
                add_write_dep(state, &state->last_vpm, n);
                break;

        case V3D_QPU_A_LDVPMV_IN:
        case V3D_QPU_A_LDVPMD_IN:
        case V3D_QPU_A_LDVPMG_IN:
        case V3D_QPU_A_LDVPMP:
                if (!separate_vpm_segment)
                        add_write_dep(state, &state->last_vpm, n);
                break;

        case V3D_QPU_A_VPMWT:
                add_read_dep(state, state->last_vpm, n);
                break;

        case V3D_QPU_A_MSF:
                add_read_dep(state, state->last_tlb, n);
                break;

        case V3D_QPU_A_SETMSF:
        case V3D_QPU_A_SETREVF:
                add_write_dep(state, &state->last_tlb, n);
                break;

        default:
                break;
        }

        switch (inst->alu.mul.op) {
        case V3D_QPU_M_MULTOP:
        case V3D_QPU_M_UMUL24:
                /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
                 * resets it to 0.  We could possibly reorder umul24s relative
                 * to each other, but for now just keep all the MUL parts in
                 * order.
                 */
                add_write_dep(state, &state->last_rtop, n);
                break;
        default:
                break;
        }

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                process_waddr_deps(state, n, inst->alu.add.waddr,
                                   inst->alu.add.magic_write);
        }
        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                process_waddr_deps(state, n, inst->alu.mul.waddr,
                                   inst->alu.mul.magic_write);
        }
        if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
                process_waddr_deps(state, n, inst->sig_addr,
                                   inst->sig_magic);
        }

        if (v3d_qpu_writes_r3(devinfo, inst))
                add_write_dep(state, &state->last_r[3], n);
        if (v3d_qpu_writes_r4(devinfo, inst))
                add_write_dep(state, &state->last_r[4], n);
        if (v3d_qpu_writes_r5(devinfo, inst))
                add_write_dep(state, &state->last_r[5], n);

        /* If we add any more dependencies here we should consider whether we
         * also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
         */
        if (inst->sig.thrsw) {
                /* All accumulator contents and flags are undefined after the
                 * switch.
                 */
                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
                        add_write_dep(state, &state->last_r[i], n);
                add_write_dep(state, &state->last_sf, n);
                add_write_dep(state, &state->last_rtop, n);

                /* Scoreboard-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(state, &state->last_tlb, n);

                add_write_dep(state, &state->last_tmu_write, n);
                add_write_dep(state, &state->last_tmu_config, n);
        }

        if (v3d_qpu_waits_on_tmu(inst)) {
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_read, n);
                /* Keep TMU loads after their TMU lookup terminator */
                add_read_dep(state, state->last_tmu_config, n);
        }

        /* Allow wrtmuc to be reordered with other instructions in the
         * same TMU sequence by using a read dependency on the last TMU
         * sequence terminator.
         */
        if (inst->sig.wrtmuc)
                add_read_dep(state, state->last_tmu_config, n);

        if (inst->sig.ldtlb | inst->sig.ldtlbu)
                add_write_dep(state, &state->last_tlb, n);

        if (inst->sig.ldvpm) {
                add_write_dep(state, &state->last_vpm_read, n);

                /* At least for now, we're doing shared I/O segments, so queue
                 * all writes after all reads.
                 */
                if (!separate_vpm_segment)
                        add_write_dep(state, &state->last_vpm, n);
        }

        /* inst->sig.ldunif or sideband uniform read */
        if (vir_has_uniform(qinst))
                add_write_dep(state, &state->last_unif, n);

        /* Both unifa and ldunifa must preserve ordering */
        if (inst->sig.ldunifa || inst->sig.ldunifarf)
                add_write_dep(state, &state->last_unifa, n);

        if (v3d_qpu_reads_flags(inst))
                add_read_dep(state, state->last_sf, n);
        if (v3d_qpu_writes_flags(inst))
                add_write_dep(state, &state->last_sf, n);
}

static void
calculate_forward_deps(struct v3d_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.devinfo = c->devinfo;
        state.dir = F;

        list_for_each_entry(struct schedule_node, node, schedule_list, link)
                calculate_deps(&state, node);
}

static void
calculate_reverse_deps(struct v3d_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.devinfo = c->devinfo;
        state.dir = R;

        list_for_each_entry_rev(struct schedule_node, node, schedule_list,
                                link) {
                calculate_deps(&state, (struct schedule_node *)node);
        }
}

struct choose_scoreboard {
        struct dag *dag;
        int tick;
        int last_magic_sfu_write_tick;
        int last_stallable_sfu_reg;
        int last_stallable_sfu_tick;
        int last_ldvary_tick;
        int last_unifa_write_tick;
        int last_uniforms_reset_tick;
        int last_thrsw_tick;
        int last_branch_tick;
        int last_setmsf_tick;
        bool first_thrsw_emitted;
        bool last_thrsw_emitted;
        bool fixup_ldvary;
        int ldvary_count;
};

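/* Returns whether the mux would read an accumulator before its pending
 * result lands: r4 within two ticks of a magic SFU write, or r5 on the tick
 * right after an ldvary.
 */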
static bool
mux_reads_too_soon(struct choose_scoreboard *scoreboard,
                   const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
{
        switch (mux) {
        case V3D_QPU_MUX_R4:
                if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2)
                        return true;
                break;

        case V3D_QPU_MUX_R5:
                if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
                        return true;
                break;
        default:
                break;
        }

        return false;
}

static bool
reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
                           struct qinst *qinst)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* XXX: Branching off of raddr. */
        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                return false;

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
                        return true;
                }
                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
                        return true;
                }
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
                        return true;
                }
                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
                        return true;
                }
        }

        /* XXX: imm */

        return false;
}

static bool
writes_too_soon_after_write(const struct v3d_device_info *devinfo,
                            struct choose_scoreboard *scoreboard,
                            struct qinst *qinst)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* Don't schedule any other r4 write too soon after an SFU write.
         * This would normally be prevented by dependency tracking, but might
         * occur if a dead SFU computation makes it to scheduling.
         */
        if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 &&
            v3d_qpu_writes_r4(devinfo, inst))
                return true;

        return false;
}

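/* Returns whether the scoreboard lock has been acquired, which happens three
 * ticks after either the first or the last thread switch, depending on the
 * shader state's scb_wait_on_first_thrsw setting.
 */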
static bool
scoreboard_is_locked(struct choose_scoreboard *scoreboard,
                     bool lock_scoreboard_on_first_thrsw)
{
        if (lock_scoreboard_on_first_thrsw) {
                return scoreboard->first_thrsw_emitted &&
                       scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
        }

        return scoreboard->last_thrsw_emitted &&
               scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
}

static bool
pixel_scoreboard_too_soon(struct v3d_compile *c,
                          struct choose_scoreboard *scoreboard,
                          const struct v3d_qpu_instr *inst)
{
        return qpu_inst_is_tlb(inst) &&
               !scoreboard_is_locked(scoreboard,
                                     c->lock_scoreboard_on_first_thrsw);
}

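/* Returns whether the instruction reads the given physical regfile address
 * through either raddr mux.
 */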
static bool
qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
                        uint32_t waddr)
{
        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
            inst->raddr_a == waddr)
                return true;

        if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
            !inst->sig.small_imm && (inst->raddr_b == waddr))
                return true;

        return false;
}

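/* Returns whether the instruction would stall by reading, on the very next
 * tick, the regfile address written by the last stallable SFU instruction.
 */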
static bool
mux_read_stalls(struct choose_scoreboard *scoreboard,
                const struct v3d_qpu_instr *inst)
{
        return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
                qpu_instruction_uses_rf(inst,
                                        scoreboard->last_stallable_sfu_reg);
}

/* We define a max schedule priority to allow negative priorities as a result
 * of subtracting this max when an instruction stalls, so instructions that
 * stall have lower priority than regular instructions. */
#define MAX_SCHEDULE_PRIORITY 16

static int
get_instruction_priority(const struct v3d_device_info *devinfo,
                         const struct v3d_qpu_instr *inst)
{
        uint32_t baseline_score;
        uint32_t next_score = 0;

        /* Schedule TLB operations as late as possible, to get more
         * parallelism between shaders.
         */
        if (qpu_inst_is_tlb(inst))
                return next_score;
        next_score++;

        /* Empirical testing shows that using priorities to hide latency of
         * TMU operations when scheduling QPU leads to slightly worse
         * performance, even at 2 threads. We think this is because the thread
         * switching is already quite effective at hiding latency and NIR
         * scheduling (and possibly TMU pipelining too) are sufficient to hide
         * TMU latency, so piling up on that here doesn't provide any benefits
         * and instead may cause us to postpone critical paths that depend on
         * the TMU results.
         */
#if 0
        /* Schedule texture read results collection late to hide latency. */
        if (v3d_qpu_waits_on_tmu(inst))
                return next_score;
        next_score++;
#endif

        /* Default score for things that aren't otherwise special. */
        baseline_score = next_score;
        next_score++;

#if 0
        /* Schedule texture read setup early to hide their latency better. */
        if (v3d_qpu_writes_tmu(devinfo, inst))
                return next_score;
        next_score++;
#endif

        /* We should increase the maximum if we assert here */
        assert(next_score < MAX_SCHEDULE_PRIORITY);

        return baseline_score;
}

enum {
        V3D_PERIPHERAL_VPM_READ           = (1 << 0),
        V3D_PERIPHERAL_VPM_WRITE          = (1 << 1),
        V3D_PERIPHERAL_VPM_WAIT           = (1 << 2),
        V3D_PERIPHERAL_SFU                = (1 << 3),
        V3D_PERIPHERAL_TMU_WRITE          = (1 << 4),
        V3D_PERIPHERAL_TMU_READ           = (1 << 5),
        V3D_PERIPHERAL_TMU_WAIT           = (1 << 6),
        V3D_PERIPHERAL_TMU_WRTMUC_SIG     = (1 << 7),
        V3D_PERIPHERAL_TSY                = (1 << 8),
        V3D_PERIPHERAL_TLB                = (1 << 9),
};

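/* Returns a mask of the V3D_PERIPHERAL_* units the instruction accesses. */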
static uint32_t
qpu_peripherals(const struct v3d_device_info *devinfo,
                const struct v3d_qpu_instr *inst)
{
        uint32_t result = 0;
        if (v3d_qpu_reads_vpm(inst))
                result |= V3D_PERIPHERAL_VPM_READ;
        if (v3d_qpu_writes_vpm(inst))
                result |= V3D_PERIPHERAL_VPM_WRITE;
        if (v3d_qpu_waits_vpm(inst))
                result |= V3D_PERIPHERAL_VPM_WAIT;

        if (v3d_qpu_writes_tmu(devinfo, inst))
                result |= V3D_PERIPHERAL_TMU_WRITE;
        if (inst->sig.ldtmu)
                result |= V3D_PERIPHERAL_TMU_READ;
        if (inst->sig.wrtmuc)
                result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG;

        if (v3d_qpu_uses_sfu(inst))
                result |= V3D_PERIPHERAL_SFU;

        if (v3d_qpu_uses_tlb(inst))
                result |= V3D_PERIPHERAL_TLB;

        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                if (inst->alu.add.op != V3D_QPU_A_NOP &&
                    inst->alu.add.magic_write &&
                    v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) {
                        result |= V3D_PERIPHERAL_TSY;
                }

                if (inst->alu.add.op == V3D_QPU_A_TMUWT)
                        result |= V3D_PERIPHERAL_TMU_WAIT;
        }

        return result;
}

static bool
qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
                                 const struct v3d_qpu_instr *a,
                                 const struct v3d_qpu_instr *b)
{
        const uint32_t a_peripherals = qpu_peripherals(devinfo, a);
        const uint32_t b_peripherals = qpu_peripherals(devinfo, b);

        /* We can always do one peripheral access per instruction. */
        if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1)
                return true;

        if (devinfo->ver < 41)
                return false;

        /* V3D 4.1+ allows WRTMUC signal with TMU register write (other than
         * tmuc).
         */
        if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
            b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
                return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
        }

        if (a_peripherals == V3D_PERIPHERAL_TMU_WRITE &&
            b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG) {
                return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
        }

        /* V3D 4.1+ allows TMU read with VPM read/write. */
        if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
            (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
             b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
                return true;
        }
        if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
            (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
             a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
                return true;
        }

        return false;
}

/* Compute a bitmask of which rf registers are used between
 * the two instructions.
 */
static uint64_t
qpu_raddrs_used(const struct v3d_qpu_instr *a,
                const struct v3d_qpu_instr *b)
{
        assert(a->type == V3D_QPU_INSTR_TYPE_ALU);
        assert(b->type == V3D_QPU_INSTR_TYPE_ALU);

        uint64_t raddrs_used = 0;
        if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
                raddrs_used |= (1ll << a->raddr_a);
        if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
                raddrs_used |= (1ll << a->raddr_b);
        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
                raddrs_used |= (1ll << b->raddr_a);
        if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
                raddrs_used |= (1ll << b->raddr_b);

        return raddrs_used;
}

/* Take two instructions and attempt to merge their raddr fields
 * into one merged instruction. Returns false if the two instructions
 * access more than two different rf registers between them, or more
 * than one rf register and one small immediate.
 */
static bool
qpu_merge_raddrs(struct v3d_qpu_instr *result,
                 const struct v3d_qpu_instr *add_instr,
                 const struct v3d_qpu_instr *mul_instr)
{
        uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
        int naddrs = util_bitcount64(raddrs_used);

        if (naddrs > 2)
                return false;

        if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) {
                if (naddrs > 1)
                        return false;

                if (add_instr->sig.small_imm && mul_instr->sig.small_imm)
                        if (add_instr->raddr_b != mul_instr->raddr_b)
                                return false;

                result->sig.small_imm = true;
                result->raddr_b = add_instr->sig.small_imm ?
                        add_instr->raddr_b : mul_instr->raddr_b;
        }

        if (naddrs == 0)
                return true;

        int raddr_a = ffsll(raddrs_used) - 1;
        raddrs_used &= ~(1ll << raddr_a);
        result->raddr_a = raddr_a;

        if (!result->sig.small_imm) {
                if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
                    raddr_a == add_instr->raddr_b) {
                        if (add_instr->alu.add.a == V3D_QPU_MUX_B)
                                result->alu.add.a = V3D_QPU_MUX_A;
                        if (add_instr->alu.add.b == V3D_QPU_MUX_B &&
                            v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
                                result->alu.add.b = V3D_QPU_MUX_A;
                        }
                }
                if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
                    raddr_a == mul_instr->raddr_b) {
                        if (mul_instr->alu.mul.a == V3D_QPU_MUX_B)
                                result->alu.mul.a = V3D_QPU_MUX_A;
                        if (mul_instr->alu.mul.b == V3D_QPU_MUX_B &&
                            v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
                                result->alu.mul.b = V3D_QPU_MUX_A;
                        }
                }
        }
        if (!raddrs_used)
                return true;

        int raddr_b = ffsll(raddrs_used) - 1;
        result->raddr_b = raddr_b;
        if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
            raddr_b == add_instr->raddr_a) {
                if (add_instr->alu.add.a == V3D_QPU_MUX_A)
                        result->alu.add.a = V3D_QPU_MUX_B;
                if (add_instr->alu.add.b == V3D_QPU_MUX_A &&
                    v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
                        result->alu.add.b = V3D_QPU_MUX_B;
                }
        }
        if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
            raddr_b == mul_instr->raddr_a) {
                if (mul_instr->alu.mul.a == V3D_QPU_MUX_A)
                        result->alu.mul.a = V3D_QPU_MUX_B;
                if (mul_instr->alu.mul.b == V3D_QPU_MUX_A &&
                    v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
                        result->alu.mul.b = V3D_QPU_MUX_B;
                }
        }

        return true;
}

static bool
can_do_add_as_mul(enum v3d_qpu_add_op op)
{
        switch (op) {
        case V3D_QPU_A_ADD:
        case V3D_QPU_A_SUB:
                return true;
        default:
                return false;
        }
}

static enum v3d_qpu_mul_op
add_op_as_mul_op(enum v3d_qpu_add_op op)
{
        switch (op) {
        case V3D_QPU_A_ADD:
                return V3D_QPU_M_ADD;
        case V3D_QPU_A_SUB:
                return V3D_QPU_M_SUB;
        default:
                unreachable("unexpected add opcode");
        }
}

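/* Moves the instruction's add ALU op over to its unused mul slot, including
 * its condition/flag updates and pack/unpack modifiers, leaving a NOP add op
 * behind.  Only valid for ops that can_do_add_as_mul().
 */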
static void
qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
{
        STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
        assert(inst->alu.add.op != V3D_QPU_A_NOP);
        assert(inst->alu.mul.op == V3D_QPU_M_NOP);

        memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul));
        inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op);
        inst->alu.add.op = V3D_QPU_A_NOP;

        inst->flags.mc = inst->flags.ac;
        inst->flags.mpf = inst->flags.apf;
        inst->flags.muf = inst->flags.auf;
        inst->flags.ac = V3D_QPU_COND_NONE;
        inst->flags.apf = V3D_QPU_PF_NONE;
        inst->flags.auf = V3D_QPU_UF_NONE;

        inst->alu.mul.output_pack = inst->alu.add.output_pack;
        inst->alu.mul.a_unpack = inst->alu.add.a_unpack;
        inst->alu.mul.b_unpack = inst->alu.add.b_unpack;
        inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
        inst->alu.add.a_unpack = V3D_QPU_UNPACK_NONE;
        inst->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
}

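/* Attempts to merge two ALU instructions into a single QPU instruction that
 * issues both an add and a mul op, converting an add op to its mul
 * equivalent when that makes the pairing possible.  Returns false if the
 * two can't be encoded together.
 */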
static bool
qpu_merge_inst(const struct v3d_device_info *devinfo,
               struct v3d_qpu_instr *result,
               const struct v3d_qpu_instr *a,
               const struct v3d_qpu_instr *b)
{
        if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
            b->type != V3D_QPU_INSTR_TYPE_ALU) {
                return false;
        }

        if (!qpu_compatible_peripheral_access(devinfo, a, b))
                return false;

        struct v3d_qpu_instr merge = *a;
        const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL;

        struct v3d_qpu_instr mul_inst;
        if (b->alu.add.op != V3D_QPU_A_NOP) {
                if (a->alu.add.op == V3D_QPU_A_NOP) {
                        merge.alu.add = b->alu.add;

                        merge.flags.ac = b->flags.ac;
                        merge.flags.apf = b->flags.apf;
                        merge.flags.auf = b->flags.auf;

                        add_instr = b;
                        mul_instr = a;
                }
                /* If a's add op is used but its mul op is not, then see if we
                 * can convert either a's add op or b's add op to a mul op
                 * so we can merge.
                 */
                else if (a->alu.mul.op == V3D_QPU_M_NOP &&
                         can_do_add_as_mul(b->alu.add.op)) {
                        mul_inst = *b;
                        qpu_convert_add_to_mul(&mul_inst);

                        merge.alu.mul = mul_inst.alu.mul;

                        merge.flags.mc = b->flags.ac;
                        merge.flags.mpf = b->flags.apf;
                        merge.flags.muf = b->flags.auf;

                        add_instr = a;
                        mul_instr = &mul_inst;
                } else if (a->alu.mul.op == V3D_QPU_M_NOP &&
                           can_do_add_as_mul(a->alu.add.op)) {
                        mul_inst = *a;
                        qpu_convert_add_to_mul(&mul_inst);

                        merge = mul_inst;
                        merge.alu.add = b->alu.add;

                        merge.flags.ac = b->flags.ac;
                        merge.flags.apf = b->flags.apf;
                        merge.flags.auf = b->flags.auf;

                        add_instr = b;
                        mul_instr = &mul_inst;
                } else {
                        return false;
                }
        }

        if (b->alu.mul.op != V3D_QPU_M_NOP) {
                if (a->alu.mul.op != V3D_QPU_M_NOP)
                        return false;
                merge.alu.mul = b->alu.mul;

                merge.flags.mc = b->flags.mc;
                merge.flags.mpf = b->flags.mpf;
                merge.flags.muf = b->flags.muf;

                mul_instr = b;
                add_instr = a;
        }

        if (add_instr && mul_instr &&
            !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
                return false;
        }

        merge.sig.thrsw |= b->sig.thrsw;
        merge.sig.ldunif |= b->sig.ldunif;
        merge.sig.ldunifrf |= b->sig.ldunifrf;
        merge.sig.ldunifa |= b->sig.ldunifa;
        merge.sig.ldunifarf |= b->sig.ldunifarf;
        merge.sig.ldtmu |= b->sig.ldtmu;
        merge.sig.ldvary |= b->sig.ldvary;
        merge.sig.ldvpm |= b->sig.ldvpm;
        merge.sig.small_imm |= b->sig.small_imm;
        merge.sig.ldtlb |= b->sig.ldtlb;
        merge.sig.ldtlbu |= b->sig.ldtlbu;
        merge.sig.ucb |= b->sig.ucb;
        merge.sig.rotate |= b->sig.rotate;
        merge.sig.wrtmuc |= b->sig.wrtmuc;

        if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
            v3d_qpu_sig_writes_address(devinfo, &b->sig))
                return false;
        merge.sig_addr |= b->sig_addr;
        merge.sig_magic |= b->sig_magic;

        uint64_t packed;
        bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);

        *result = merge;
        /* No modifying the real instructions on failure. */
        assert(ok || (a != result && b != result));

        return ok;
}

static inline bool
try_skip_for_ldvary_pipelining(const struct v3d_qpu_instr *inst)
{
        return inst->sig.ldunif || inst->sig.ldunifrf;
}

static bool
qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
                                         struct choose_scoreboard *scoreboard,
                                         const struct qinst *qinst);

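/* Picks the best DAG head to schedule next, or to pair with prev_inst when
 * one is given.  Candidates that would violate a hardware or pairing rule
 * are skipped; among the rest, ties on instruction priority are broken by
 * the critical-path delay.
 */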
static struct schedule_node *
choose_instruction_to_schedule(struct v3d_compile *c,
                               struct choose_scoreboard *scoreboard,
                               struct schedule_node *prev_inst)
{
        struct schedule_node *chosen = NULL;
        int chosen_prio = 0;

        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
         * will handle pairing it along with filling the delay slots.
         */
        if (prev_inst) {
                if (prev_inst->inst->qpu.sig.thrsw)
                        return NULL;
        }

        bool ldvary_pipelining = c->s->info.stage == MESA_SHADER_FRAGMENT &&
                                 scoreboard->ldvary_count < c->num_inputs;
        bool skipped_insts_for_ldvary_pipelining = false;
retry:
        list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
                            dag.link) {
                const struct v3d_qpu_instr *inst = &n->inst->qpu;

                if (ldvary_pipelining && try_skip_for_ldvary_pipelining(inst)) {
                        skipped_insts_for_ldvary_pipelining = true;
                        continue;
                }

                /* Don't choose the branch instruction until it's the last one
                 * left.  We'll move it up to fit its delay slots after we
                 * choose it.
                 */
                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
                    !list_is_singular(&scoreboard->dag->heads)) {
                        continue;
                }

                /* We need to have 3 delay slots between a write to unifa and
                 * a follow-up ldunifa.
                 */
                if ((inst->sig.ldunifa || inst->sig.ldunifarf) &&
                    scoreboard->tick - scoreboard->last_unifa_write_tick <= 3)
                        continue;

                /* "An instruction must not read from a location in physical
                 *  regfile A or B that was written to by the previous
                 *  instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, n->inst))
                        continue;

                if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
                        continue;

                /* "Before doing a TLB access a scoreboard wait must have been
                 *  done. This happens either on the first or last thread
                 *  switch, depending on a setting (scb_wait_on_first_thrsw) in
                 *  the shader state."
                 */
                if (pixel_scoreboard_too_soon(c, scoreboard, inst))
                        continue;

                /* ldunif and ldvary both write r5, but ldunif does so a tick
                 * sooner.  If the ldvary's r5 wasn't used, then ldunif might
                 * otherwise get scheduled so ldunif and ldvary try to update
                 * r5 in the same tick.
                 */
                if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                    scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
                        continue;
                }

                /* If we are in a thrsw delay slot check that this instruction
                 * is valid for that.
                 */
                if (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick &&
                    !qpu_inst_after_thrsw_valid_in_delay_slot(c, scoreboard,
                                                              n->inst)) {
                        continue;
                }

                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                        /* Don't try to put a branch in the delay slots of another
                         * branch or a unifa write.
                         */
                        if (scoreboard->last_branch_tick + 3 >= scoreboard->tick)
                                continue;
                        if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick)
                                continue;

                        /* No branch with cond != 0,2,3 and msfign != 0 after
                         * setmsf.
                         */
                        if (scoreboard->last_setmsf_tick == scoreboard->tick - 1 &&
                            inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
                            inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
                            inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
                            inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
                                continue;
                        }
                }

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst) {
                        /* Don't pair up a thread switch signal -- we'll
                         * handle pairing it when we pick it on its own.
                         */
                        if (inst->sig.thrsw)
                                continue;

                        if (prev_inst->inst->uniform != -1 &&
                            n->inst->uniform != -1)
                                continue;

                        /* Simulator complains if we have two uniforms loaded
                         * in the same instruction, which could happen if we
                         * have a ldunif or sideband uniform and we pair that
                         * with ldunifa.
                         */
                        if (vir_has_uniform(prev_inst->inst) &&
                            (inst->sig.ldunifa || inst->sig.ldunifarf)) {
                                continue;
                        }

                        if ((prev_inst->inst->qpu.sig.ldunifa ||
                             prev_inst->inst->qpu.sig.ldunifarf) &&
                            vir_has_uniform(n->inst)) {
                                continue;
                        }

                        /* Don't merge TLB instructions before we have acquired
                         * the scoreboard lock.
                         */
                        if (pixel_scoreboard_too_soon(c, scoreboard, inst))
                                continue;

                        /* When we successfully pair up an ldvary we then try
                         * to merge it into the previous instruction if
                         * possible to improve pipelining. Don't pick up the
                         * ldvary now if the follow-up fixup would place
                         * it in the delay slots of a thrsw, which is not
                         * allowed and would prevent the fixup from being
                         * successful.
                         */
                        if (inst->sig.ldvary &&
                            scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
                                continue;
                        }

                        struct v3d_qpu_instr merged_inst;
                        if (!qpu_merge_inst(c->devinfo, &merged_inst,
                                            &prev_inst->inst->qpu, inst)) {
                                continue;
                        }
                }

                int prio = get_instruction_priority(c->devinfo, inst);

                if (mux_read_stalls(scoreboard, inst)) {
                        /* Don't merge an instruction that stalls */
                        if (prev_inst)
                                continue;
                        else {
                                /* Any instruction that doesn't stall will
                                 * have a higher scheduling priority */
1226                                 prio -= MAX_SCHEDULE_PRIORITY;
1227                                 assert(prio < 0);
1228                         }
1229                 }
1230 
1231                 /* Found a valid instruction.  If nothing better comes along,
1232                  * this one works.
1233                  */
1234                 if (!chosen) {
1235                         chosen = n;
1236                         chosen_prio = prio;
1237                         continue;
1238                 }
1239 
1240                 if (prio > chosen_prio) {
1241                         chosen = n;
1242                         chosen_prio = prio;
1243                 } else if (prio < chosen_prio) {
1244                         continue;
1245                 }
1246 
1247                 if (n->delay > chosen->delay) {
1248                         chosen = n;
1249                         chosen_prio = prio;
1250                 } else if (n->delay < chosen->delay) {
1251                         continue;
1252                 }
1253         }
1254 
1255         /* If we did not find any instruction to schedule but we discarded
1256          * some of them to prioritize ldvary pipelining, try again.
1257          */
1258         if (!chosen && !prev_inst && skipped_insts_for_ldvary_pipelining) {
1259                 skipped_insts_for_ldvary_pipelining = false;
1260                 ldvary_pipelining = false;
1261                 goto retry;
1262         }
1263 
1264         if (chosen && chosen->inst->qpu.sig.ldvary) {
1265                 scoreboard->ldvary_count++;
1266                 /* If we are pairing an ldvary, flag it so we can fix it up for
1267                  * optimal pipelining of ldvary sequences.
1268                  */
1269                 if (prev_inst)
1270                         scoreboard->fixup_ldvary = true;
1271         }
1272 
1273         return chosen;
1274 }
1275 
1276 static void
update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard, enum v3d_qpu_waddr waddr, const struct v3d_device_info *devinfo)1277 update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
1278                                   enum v3d_qpu_waddr waddr,
1279                                   const struct v3d_device_info *devinfo)
1280 {
1281         if (v3d_qpu_magic_waddr_is_sfu(waddr))
1282                 scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
1283         else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA)
1284                 scoreboard->last_unifa_write_tick = scoreboard->tick;
1285 }
1286 
1287 static void
update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard, const struct v3d_qpu_instr *inst)1288 update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
1289                                       const struct v3d_qpu_instr *inst)
1290 {
1291         if (v3d_qpu_instr_is_sfu(inst)) {
1292                 scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr;
1293                 scoreboard->last_stallable_sfu_tick = scoreboard->tick;
1294         }
1295 }

static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                             const struct v3d_qpu_instr *inst,
                             const struct v3d_device_info *devinfo)
{
        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                return;

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (inst->alu.add.magic_write) {
                        update_scoreboard_for_magic_waddr(scoreboard,
                                                          inst->alu.add.waddr,
                                                          devinfo);
                } else {
                        update_scoreboard_for_sfu_stall_waddr(scoreboard,
                                                              inst);
                }

                if (inst->alu.add.op == V3D_QPU_A_SETMSF)
                        scoreboard->last_setmsf_tick = scoreboard->tick;
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (inst->alu.mul.magic_write) {
                        update_scoreboard_for_magic_waddr(scoreboard,
                                                          inst->alu.mul.waddr,
                                                          devinfo);
                }
        }

        if (inst->sig.ldvary)
                scoreboard->last_ldvary_tick = scoreboard->tick;
}

static void
dump_state(const struct v3d_device_info *devinfo, struct dag *dag)
{
        list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
                fprintf(stderr, "         t=%4d: ", n->unblocked_time);
                v3d_qpu_dump(devinfo, &n->inst->qpu);
                fprintf(stderr, "\n");

                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                        struct schedule_node *child =
                                (struct schedule_node *)edge->child;
                        if (!child)
                                continue;

                        fprintf(stderr, "                 - ");
                        v3d_qpu_dump(devinfo, &child->inst->qpu);
                        fprintf(stderr, " (%d parents, %c)\n",
                                child->dag.parent_count,
                                edge->data ? 'w' : 'r');
                }
        }
}

static uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo,
                                    enum v3d_qpu_waddr waddr,
                                    const struct v3d_qpu_instr *after)
{
        /* Apply some huge latency between texture fetch requests and getting
         * their results back.
         *
         * FIXME: This is actually pretty bogus.  If we do:
         *
         * mov tmu0_s, a
         * <a bit of math>
         * mov tmu0_s, b
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * we count that as worse than
         *
         * mov tmu0_s, a
         * mov tmu0_s, b
         * <lots of math>
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * because we associate the first load_tmu0 with the *second* tmu0_s.
         */
        if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) &&
            v3d_qpu_waits_on_tmu(after)) {
                return 100;
        }

        /* Assume that anything depending on us is consuming the SFU result. */
        if (v3d_qpu_magic_waddr_is_sfu(waddr))
                return 3;

        return 1;
}

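/* Estimates the number of cycles between scheduling 'before' and being able
 * to schedule 'after' without waiting on 'before''s result. This feeds both
 * the unblocked_time bookkeeping and the delay priority of each node.
 */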
static uint32_t
instruction_latency(const struct v3d_device_info *devinfo,
                    struct schedule_node *before, struct schedule_node *after)
{
        const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
        const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
        uint32_t latency = 1;

        if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
            after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return latency;

        if (before_inst->alu.add.magic_write) {
                latency = MAX2(latency,
                               magic_waddr_latency(devinfo,
                                                   before_inst->alu.add.waddr,
                                                   after_inst));
        }

        if (before_inst->alu.mul.magic_write) {
                latency = MAX2(latency,
                               magic_waddr_latency(devinfo,
                                                   before_inst->alu.mul.waddr,
                                                   after_inst));
        }

        if (v3d_qpu_instr_is_sfu(before_inst))
                return 2;

        return latency;
}

/** Recursive computation of the delay member of a node. */
static void
compute_delay(struct dag_node *node, void *state)
{
        struct schedule_node *n = (struct schedule_node *)node;
        struct v3d_compile *c = (struct v3d_compile *) state;

        n->delay = 1;

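        /* We are called through dag_traverse_bottom_up(), so our children
         * have already had their delays computed by the time we get here.
         */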
        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;

                n->delay = MAX2(n->delay, (child->delay +
                                           instruction_latency(c->devinfo, n,
                                                               child)));
        }
}

/* Removes a DAG head, pruning only the WAR edges. (dag_prune_head() should
 * be called on it later to finish pruning the other edges.)
 */
static void
pre_remove_head(struct dag *dag, struct schedule_node *n)
{
        list_delinit(&n->dag.link);

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                if (edge->data)
                        dag_remove_edge(dag, edge);
        }
}

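/* Marks a node as scheduled at 'time': pushes back the unblocked time of
 * each child by the latency of the edge, then prunes the node from the DAG
 * so that any children left with no parents become new heads.
 */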
static void
mark_instruction_scheduled(const struct v3d_device_info *devinfo,
                           struct dag *dag,
                           uint32_t time,
                           struct schedule_node *node)
{
        if (!node)
                return;

        util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;

                if (!child)
                        continue;

                uint32_t latency = instruction_latency(devinfo, node, child);

                child->unblocked_time = MAX2(child->unblocked_time,
                                             time + latency);
        }
        dag_prune_head(dag, &node->dag);
}

static void
insert_scheduled_instruction(struct v3d_compile *c,
                             struct qblock *block,
                             struct choose_scoreboard *scoreboard,
                             struct qinst *inst)
{
        list_addtail(&inst->link, &block->instructions);

        update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo);
        c->qpu_inst_count++;
        scoreboard->tick++;
}

static struct qinst *
vir_nop()
{
        struct qreg undef = vir_nop_reg();
        struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);

        return qinst;
}

static void
emit_nop(struct v3d_compile *c, struct qblock *block,
         struct choose_scoreboard *scoreboard)
{
        insert_scheduled_instruction(c, block, scoreboard, vir_nop());
}

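/* Checks whether an instruction can go in one of the three final slots of a
 * thread-end sequence: slot 0 carries the thrsw signal, slots 1-2 are its
 * delay slots, and slot 2 is the last instruction of the program.
 */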
static bool
qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
                              const struct qinst *qinst, int slot)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        if (slot == 2 && qinst->is_tlb_z_write)
                return false;

        if (slot > 0 && qinst->uniform != ~0)
                return false;

        if (v3d_qpu_waits_vpm(inst))
                return false;

        if (inst->sig.ldvary)
                return false;

        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                /* GFXH-1625: TMUWT not allowed in the final instruction. */
                if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
                        return false;

                /* No writing physical registers at the end. */
                if (!inst->alu.add.magic_write ||
                    !inst->alu.mul.magic_write) {
                        return false;
                }

                if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
                    !inst->sig_magic) {
                        return false;
                }

                if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
                        return false;

                /* RF0-2 might be overwritten during the delay slots by
                 * fragment shader setup.
                 */
                if (inst->raddr_a < 3 &&
                    (inst->alu.add.a == V3D_QPU_MUX_A ||
                     inst->alu.add.b == V3D_QPU_MUX_A ||
                     inst->alu.mul.a == V3D_QPU_MUX_A ||
                     inst->alu.mul.b == V3D_QPU_MUX_A)) {
                        return false;
                }

                if (inst->raddr_b < 3 &&
                    !inst->sig.small_imm &&
                    (inst->alu.add.a == V3D_QPU_MUX_B ||
                     inst->alu.add.b == V3D_QPU_MUX_B ||
                     inst->alu.mul.a == V3D_QPU_MUX_B ||
                     inst->alu.mul.b == V3D_QPU_MUX_B)) {
                        return false;
                }
        }

        return true;
}

/**
 * This is called when trying to merge a thrsw back into the stream of
 * instructions that were scheduled *before* the thrsw signal to fill its
 * delay slots. Because the actual execution of the thrsw happens after the
 * delay slots, it is usually safe to do this, but there are some cases that
 * need special care.
 */
static bool
qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
                                          const struct qinst *qinst,
                                          uint32_t slot)
{
        /* No scheduling SFU when the result would land in the other
         * thread.  The simulator complains for safety, though it
         * would only occur for dead code in our case.
         */
        if (slot > 0 &&
            qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
            (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
             v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
                return false;
        }

        if (slot > 0 && qinst->qpu.sig.ldvary)
                return false;

        /* unifa and the following 3 instructions can't overlap a
         * thread switch/end. The docs further clarify that this means
         * the cycle at which the actual thread switch/end happens
         * and not when the thrsw instruction is processed, which would
         * be after the 2 delay slots following the thrsw instruction.
         * This means that we can move a thrsw up to the instruction
         * right after unifa:
         *
         * unifa, r5
         * thrsw
         * delay slot 1
         * delay slot 2
         * Thread switch happens here, 4 instructions away from unifa
         */
        if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
                return false;

        return true;
}

/**
 * This is called for instructions scheduled *after* a thrsw signal that may
 * land in the delay slots of the thrsw. Because these instructions were
 * scheduled after the thrsw, we need to be careful when placing them into
 * the delay slots, since that means that we are moving them ahead of the
 * thread switch and we need to ensure that is not a problem.
 */
static bool
qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
                                         struct choose_scoreboard *scoreboard,
                                         const struct qinst *qinst)
{
        const uint32_t slot = scoreboard->tick - scoreboard->last_thrsw_tick;
        assert(slot <= 2);

        /* We merge thrsw instructions back into the instruction stream
         * manually, so any instructions scheduled after a thrsw should be
         * in the actual delay slots and not in the same slot as the thrsw.
         */
        assert(slot >= 1);

        /* No emitting a thrsw while the previous thrsw hasn't happened yet. */
        if (qinst->qpu.sig.thrsw)
                return false;

        /* The restrictions for instructions scheduled before the thrsw
         * also apply to instructions scheduled after the thrsw that we want
         * to place in its delay slots.
         */
        if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
                return false;

        /* TLB access is disallowed until scoreboard wait is executed, which
         * we do on the last thread switch.
         */
        if (qpu_inst_is_tlb(&qinst->qpu))
                return false;

        /* Instruction sequence restrictions: Branch is not allowed in delay
         * slots of a thrsw.
         */
        if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
                return false;

        /* Miscellaneous restrictions: At the point of a thrsw we need to have
         * at least one outstanding lookup or TSY wait.
         *
         * So avoid placing TMU instructions scheduled after the thrsw into
         * its delay slots or we may be compromising the integrity of our TMU
         * sequences. Also, notice that if we moved these instructions into
         * the delay slots of a previous thrsw we could overflow our TMU output
         * fifo, since we could be effectively pipelining a lookup scheduled
         * after the thrsw into the sequence before the thrsw.
         */
        if (v3d_qpu_writes_tmu(c->devinfo, &qinst->qpu) ||
            qinst->qpu.sig.wrtmuc) {
                return false;
        }

        /* Don't move instructions that wait on the TMU before the thread switch
         * happens since that would make the current thread stall before the
         * switch, which is exactly what we want to avoid with the thrsw
         * instruction.
         */
        if (v3d_qpu_waits_on_tmu(&qinst->qpu))
                return false;

        /* A thread switch invalidates all accumulators, so don't place any
         * instructions that write accumulators into the delay slots.
         */
        if (v3d_qpu_writes_accum(c->devinfo, &qinst->qpu))
                return false;

        /* Multop has an implicit write to the rtop register, which is a
         * specialized accumulator that is only used with this instruction.
         */
        if (qinst->qpu.alu.mul.op == V3D_QPU_M_MULTOP)
                return false;

        /* Flags are invalidated across a thread switch, so don't place
         * instructions that write flags into delay slots.
         */
        if (v3d_qpu_writes_flags(&qinst->qpu))
                return false;

        /* TSY sync ops materialize at the point of the next thread switch,
         * therefore, if we have a TSY sync right after a thread switch, we
         * cannot place it in its delay slots, or we would be moving the sync
         * to the thrsw before it instead.
         */
        if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID)
                return false;

        return true;
}

static bool
valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
                     struct qinst *qinst, int instructions_in_sequence,
                     bool is_thrend)
{
        /* No emitting our thrsw while the previous thrsw hasn't happened yet. */
        if (scoreboard->last_thrsw_tick + 3 >
            scoreboard->tick - instructions_in_sequence) {
                return false;
        }

        for (int slot = 0; slot < instructions_in_sequence; slot++) {
                if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
                        return false;

                if (is_thrend &&
                    !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
                        return false;
                }

                /* Note that the list is circular, so we can only do this up
                 * to instructions_in_sequence.
                 */
                qinst = (struct qinst *)qinst->link.next;
        }

        return true;
}

/**
 * Emits a THRSW signal in the stream, trying to move it up to pair with
 * another instruction.
 */
static int
emit_thrsw(struct v3d_compile *c,
           struct qblock *block,
           struct choose_scoreboard *scoreboard,
           struct qinst *inst,
           bool is_thrend)
{
        int time = 0;

        /* There should be nothing in a thrsw inst being scheduled other than
         * the signal bits.
         */
        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
        assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
        assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);

        /* Don't try to emit a thrsw in the delay slots of a previous thrsw
         * or branch.
         */
        while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
                emit_nop(c, block, scoreboard);
                time++;
        }
        while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) {
                emit_nop(c, block, scoreboard);
                time++;
        }

        /* Find how far back into previous instructions we can put the THRSW. */
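        /* For example, if the previous three instructions form a valid
         * sequence, the signal is merged three instructions back and the
         * remaining two become its delay slots (hypothetical sequence):
         *
         *   fadd rf10, ...   <-- thrsw signal merged here
         *   fmul rf11, ...   <-- delay slot 1
         *   fadd rf12, ...   <-- delay slot 2, thread switch happens after
         */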
        int slots_filled = 0;
        int invalid_sig_count = 0;
        bool last_thrsw_after_invalid_ok = false;
        struct qinst *merge_inst = NULL;
        vir_for_each_inst_rev(prev_inst, block) {
                if (!valid_thrsw_sequence(c, scoreboard,
                                          prev_inst, slots_filled + 1,
                                          is_thrend)) {
                        break;
                }

                struct v3d_qpu_sig sig = prev_inst->qpu.sig;
                sig.thrsw = true;
                uint32_t packed_sig;
                if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) {
                        /* If we can't merge the thrsw here because of signal
                         * incompatibility, keep going, we might be able to
                         * merge it in an earlier instruction.
                         */
                        invalid_sig_count++;
                        goto cont_block;
                }

                /* For the last thrsw we need 2 consecutive slots that are
                 * thrsw compatible, so if we have previously jumped over
                 * an incompatible signal, flag that we have found the first
                 * valid slot here and keep going.
                 */
                if (inst->is_last_thrsw && invalid_sig_count > 0 &&
                    !last_thrsw_after_invalid_ok) {
                        last_thrsw_after_invalid_ok = true;
                        invalid_sig_count++;
                        goto cont_block;
                }

                last_thrsw_after_invalid_ok = false;
                invalid_sig_count = 0;
                merge_inst = prev_inst;

cont_block:
                if (++slots_filled == 3)
                        break;
        }

        /* If we jumped over a signal incompatibility and did not manage to
         * merge the thrsw in the end, we need to adjust slots filled to match
         * the last valid merge point.
         */
        assert(invalid_sig_count == 0 || slots_filled >= invalid_sig_count);
        if (invalid_sig_count > 0)
                slots_filled -= invalid_sig_count;

        bool needs_free = false;
        if (merge_inst) {
                merge_inst->qpu.sig.thrsw = true;
                needs_free = true;
                scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
        } else {
                scoreboard->last_thrsw_tick = scoreboard->tick;
                insert_scheduled_instruction(c, block, scoreboard, inst);
                time++;
                slots_filled++;
                merge_inst = inst;
        }

        scoreboard->first_thrsw_emitted = true;

        /* If we're emitting the last THRSW (other than program end), then
         * signal that to the HW by emitting two THRSWs in a row.
         */
        if (inst->is_last_thrsw) {
                if (slots_filled <= 1) {
                        emit_nop(c, block, scoreboard);
                        time++;
                }
                struct qinst *second_inst =
                        (struct qinst *)merge_inst->link.next;
                second_inst->qpu.sig.thrsw = true;
                scoreboard->last_thrsw_emitted = true;
        }

        /* Make sure the thread end executes within the program lifespan */
        if (is_thrend) {
                for (int i = 0; i < 3 - slots_filled; i++) {
                        emit_nop(c, block, scoreboard);
                        time++;
                }
        }

        /* If we put our THRSW into another instruction, free up the
         * instruction that didn't end up scheduled into the list.
         */
        if (needs_free)
                free(inst);

        return time;
}

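/* Checks whether an instruction can be placed in the delay slots of a
 * branch: branches, thrsw signals, unifa writes and uniform reads all
 * conflict with the branch itself.
 */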
static bool
qpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst)
{
        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
                return false;

        if (inst->qpu.sig.thrsw)
                return false;

        if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu))
                return false;

        if (vir_has_uniform(inst))
                return false;

        return true;
}

static void
emit_branch(struct v3d_compile *c,
            struct qblock *block,
            struct choose_scoreboard *scoreboard,
            struct qinst *inst)
{
        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);

        /* We should've not picked up a branch for the delay slots of a previous
         * thrsw, branch or unifa write instruction.
         */
        int branch_tick = scoreboard->tick;
        assert(scoreboard->last_thrsw_tick + 2 < branch_tick);
        assert(scoreboard->last_branch_tick + 3 < branch_tick);
        assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);

        /* Can't place a branch with msfign != 0 and cond != 0,2,3 after
         * setmsf.
         */
        bool is_safe_msf_branch =
                inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
                inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
                inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
                inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_NA0;
        assert(scoreboard->last_setmsf_tick != branch_tick - 1 ||
               is_safe_msf_branch);

        /* Insert the branch instruction */
        insert_scheduled_instruction(c, block, scoreboard, inst);

        /* Now see if we can move the branch instruction back into the
         * instruction stream to fill its delay slots
         */
        int slots_filled = 0;
        while (slots_filled < 3 && block->instructions.next != &inst->link) {
                struct qinst *prev_inst = (struct qinst *) inst->link.prev;
                assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH);

                /* Can't move the branch instruction if that would place it
                 * in the delay slots of other instructions.
                 */
                if (scoreboard->last_branch_tick + 3 >=
                    branch_tick - slots_filled - 1) {
                        break;
                }

                if (scoreboard->last_thrsw_tick + 2 >=
                    branch_tick - slots_filled - 1) {
                        break;
                }

                if (scoreboard->last_unifa_write_tick + 3 >=
                    branch_tick - slots_filled - 1) {
                        break;
                }

                /* Do not move up a branch if it can disrupt an ldvary sequence
                 * as that can cause stomping of the r5 register.
                 */
                if (scoreboard->last_ldvary_tick + 2 >=
                    branch_tick - slots_filled) {
                        break;
                }

                /* Can't move a conditional branch before the instruction
                 * that writes the flags for its condition.
                 */
                if (v3d_qpu_writes_flags(&prev_inst->qpu) &&
                    inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) {
                        break;
                }

                if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst))
                        break;

                if (!is_safe_msf_branch) {
                        struct qinst *prev_prev_inst =
                                (struct qinst *) prev_inst->link.prev;
                        if (prev_prev_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                            prev_prev_inst->qpu.alu.add.op == V3D_QPU_A_SETMSF) {
                                break;
                        }
                }

                list_del(&prev_inst->link);
                list_add(&prev_inst->link, &inst->link);
                slots_filled++;
        }

        block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled;
        scoreboard->last_branch_tick = branch_tick - slots_filled;

        /* Fill any remaining delay slots.
         *
         * For unconditional branches we'll try to fill these with the
         * first instructions in the successor block after scheduling
         * all blocks when setting up branch targets.
         */
        for (int i = 0; i < 3 - slots_filled; i++)
                emit_nop(c, block, scoreboard);
}

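/* Checks whether the add or mul ALU of 'inst' (selected by 'add') reads the
 * given register. With magic == true, 'index' is a magic register whose
 * encoding matches the r0-r5 mux values; otherwise it is a register file
 * address read through mux A or B.
 */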
static bool
alu_reads_register(struct v3d_qpu_instr *inst,
                   bool add, bool magic, uint32_t index)
{
        uint32_t num_src;
        enum v3d_qpu_mux mux_a, mux_b;

        if (add) {
                num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
                mux_a = inst->alu.add.a;
                mux_b = inst->alu.add.b;
        } else {
                num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
                mux_a = inst->alu.mul.a;
                mux_b = inst->alu.mul.b;
        }

        for (int i = 0; i < num_src; i++) {
                if (magic) {
                        if (i == 0 && mux_a == index)
                                return true;
                        if (i == 1 && mux_b == index)
                                return true;
                } else {
                        if (i == 0 && mux_a == V3D_QPU_MUX_A &&
                            inst->raddr_a == index) {
                                return true;
                        }
                        if (i == 0 && mux_a == V3D_QPU_MUX_B &&
                            inst->raddr_b == index) {
                                return true;
                        }
                        if (i == 1 && mux_b == V3D_QPU_MUX_A &&
                            inst->raddr_a == index) {
                                return true;
                        }
                        if (i == 1 && mux_b == V3D_QPU_MUX_B &&
                            inst->raddr_b == index) {
                                return true;
                        }
                }
        }

        return false;
}

/**
 * This takes an ldvary signal merged into 'inst' and tries to move it up to
 * the previous instruction to get good pipelining of ldvary sequences,
 * transforming this:
 *
 * nop                  ; nop               ; ldvary.r4
 * nop                  ; fmul  r0, r4, rf0 ;
 * fadd  rf13, r0, r5   ; nop               ; ldvary.r1  <-- inst
 *
 * into:
 *
 * nop                  ; nop               ; ldvary.r4
 * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
 * fadd  rf13, r0, r5   ; nop               ;            <-- inst
 *
 * If we manage to do this successfully (we return true here), then flagging
 * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
 * we will be able to pick up to merge into 'inst', leading to code like this:
 *
 * nop                  ; nop               ; ldvary.r4
 * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
 * fadd  rf13, r0, r5   ; fmul  r2, r1, rf0 ;            <-- inst
 */
static bool
fixup_pipelined_ldvary(struct v3d_compile *c,
                       struct choose_scoreboard *scoreboard,
                       struct qblock *block,
                       struct v3d_qpu_instr *inst)
{
        /* We only call this if we have successfully merged an ldvary into a
         * previous instruction.
         */
        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
        assert(inst->sig.ldvary);
        uint32_t ldvary_magic = inst->sig_magic;
        uint32_t ldvary_index = inst->sig_addr;

        /* The instruction in which we merged the ldvary cannot read
         * the ldvary destination: if it did, then moving the ldvary before
         * it would overwrite the value it reads.
         */
        if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
                return false;
        if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
                return false;

        /* The implicit ldvary destination may not be written to by a signal
         * in the instruction following ldvary. Since we are planning to move
         * ldvary to the previous instruction, this means we need to check if
         * the current instruction has any other signal that could create this
         * conflict. The only other signal that can write to the implicit
         * ldvary destination that is compatible with ldvary in the same
         * instruction is ldunif.
         */
        if (inst->sig.ldunif)
                return false;

        /* The previous instruction can't write to the same destination as the
         * ldvary.
         */
        struct qinst *prev = (struct qinst *) block->instructions.prev;
        if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) {
                if (prev->qpu.alu.add.magic_write == ldvary_magic &&
                    prev->qpu.alu.add.waddr == ldvary_index) {
                        return false;
                }
        }

        if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) {
                if (prev->qpu.alu.mul.magic_write == ldvary_magic &&
                    prev->qpu.alu.mul.waddr == ldvary_index) {
                        return false;
                }
        }

        /* The previous instruction cannot have a conflicting signal */
        if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
                return false;

        uint32_t sig;
        struct v3d_qpu_sig new_sig = prev->qpu.sig;
        new_sig.ldvary = true;
        if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig))
                return false;

        /* The previous instruction cannot use flags since ldvary uses the
         * 'cond' instruction field to store the destination.
         */
        if (v3d_qpu_writes_flags(&prev->qpu))
                return false;
        if (v3d_qpu_reads_flags(&prev->qpu))
                return false;

        /* We can't put an ldvary in the delay slots of a thrsw. We should've
         * prevented this when pairing up the ldvary with another instruction
         * and flagging it for a fixup.
         */
        assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);

        /* Move the ldvary to the previous instruction and remove it from the
         * current one.
         */
        prev->qpu.sig.ldvary = true;
        prev->qpu.sig_magic = ldvary_magic;
        prev->qpu.sig_addr = ldvary_index;
        scoreboard->last_ldvary_tick = scoreboard->tick - 1;

        inst->sig.ldvary = false;
        inst->sig_magic = false;
        inst->sig_addr = 0;

        /* By moving the ldvary to the previous instruction we make it update
         * r5 in the current one, so nothing else in it should write r5.
         * This should've been prevented by our dependency tracking, which
         * would not allow ldvary to be paired up with an instruction that
         * writes r5 (since our dependency tracking doesn't know that the
         * ldvary write to r5 happens in the next instruction).
         */
        assert(!v3d_qpu_writes_r5(c->devinfo, inst));

        return true;
}

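/* Schedules the DAG of one block: repeatedly picks the best ready
 * instruction, tries to merge further instructions into its unused ALU and
 * signal slots, and emits the result, handling thrsw and branch
 * instructions specially. Returns the estimated cycle count of the block.
 */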
static uint32_t
schedule_instructions(struct v3d_compile *c,
                      struct choose_scoreboard *scoreboard,
                      struct qblock *block,
                      enum quniform_contents *orig_uniform_contents,
                      uint32_t *orig_uniform_data,
                      uint32_t *next_uniform)
{
        const struct v3d_device_info *devinfo = c->devinfo;
        uint32_t time = 0;

        while (!list_is_empty(&scoreboard->dag->heads)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(c, scoreboard, NULL);
                struct schedule_node *merge = NULL;

                /* If there are no valid instructions to schedule, drop a NOP
                 * in.
                 */
                struct qinst *qinst = chosen ? chosen->inst : vir_nop();
                struct v3d_qpu_instr *inst = &qinst->qpu;

                if (debug) {
                        fprintf(stderr, "t=%4d: current list:\n",
                                time);
                        dump_state(devinfo, scoreboard->dag);
                        fprintf(stderr, "t=%4d: chose:   ", time);
                        v3d_qpu_dump(devinfo, inst);
                        fprintf(stderr, "\n");
                }

                /* We can't mark_instruction_scheduled() the chosen inst until
                 * we're done identifying instructions to merge, so put the
                 * merged instructions on a list for a moment.
                 */
                struct list_head merged_list;
                list_inithead(&merged_list);

                /* Schedule this instruction onto the QPU list. Also try to
                 * find an instruction to pair with it.
                 */
                if (chosen) {
                        time = MAX2(chosen->unblocked_time, time);
                        pre_remove_head(scoreboard->dag, chosen);

                        while ((merge =
                                choose_instruction_to_schedule(c, scoreboard,
                                                               chosen))) {
                                time = MAX2(merge->unblocked_time, time);
                                pre_remove_head(scoreboard->dag, merge);
                                list_addtail(&merge->link, &merged_list);
                                (void)qpu_merge_inst(devinfo, inst,
                                                     inst, &merge->inst->qpu);
                                if (merge->inst->uniform != -1) {
                                        chosen->inst->uniform =
                                                merge->inst->uniform;
                                }

                                if (debug) {
                                        fprintf(stderr, "t=%4d: merging: ",
                                                time);
                                        v3d_qpu_dump(devinfo, &merge->inst->qpu);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, "         result: ");
                                        v3d_qpu_dump(devinfo, inst);
                                        fprintf(stderr, "\n");
                                }

                                if (scoreboard->fixup_ldvary) {
                                        scoreboard->fixup_ldvary = false;
                                        if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
                                                /* Flag the ldvary as scheduled
                                                 * now so we can try to merge the
                                                 * follow-up instruction in the
                                                 * ldvary sequence into the
                                                 * current instruction.
                                                 */
                                                mark_instruction_scheduled(
                                                        devinfo, scoreboard->dag,
                                                        time, merge);
                                        }
                                }
                        }
                        if (mux_read_stalls(scoreboard, inst))
                                c->qpu_inst_stalled_count++;
                }

                /* Update the uniform index for the rewritten location --
                 * branch target updating will still need to change
                 * c->uniform_data[] using this index.
                 */
                if (qinst->uniform != -1) {
                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                                block->branch_uniform = *next_uniform;

                        c->uniform_data[*next_uniform] =
                                orig_uniform_data[qinst->uniform];
                        c->uniform_contents[*next_uniform] =
                                orig_uniform_contents[qinst->uniform];
                        qinst->uniform = *next_uniform;
                        (*next_uniform)++;
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready to
                 * be scheduled.  Update the children's unblocked time for this
                 * DAG edge as we do so.
                 */
                mark_instruction_scheduled(devinfo, scoreboard->dag, time, chosen);
                list_for_each_entry(struct schedule_node, merge, &merged_list,
                                    link) {
                        mark_instruction_scheduled(devinfo, scoreboard->dag, time, merge);

                        /* The merged VIR instruction doesn't get re-added to the
                         * block, so free it now.
                         */
                        free(merge->inst);
                }

                if (inst->sig.thrsw) {
                        time += emit_thrsw(c, block, scoreboard, qinst, false);
                } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                        emit_branch(c, block, scoreboard, qinst);
                } else {
                        insert_scheduled_instruction(c, block,
                                                     scoreboard, qinst);
                }
        }

        return time;
}

static uint32_t
qpu_schedule_instructions_block(struct v3d_compile *c,
                                struct choose_scoreboard *scoreboard,
                                struct qblock *block,
                                enum quniform_contents *orig_uniform_contents,
                                uint32_t *orig_uniform_data,
                                uint32_t *next_uniform)
{
        void *mem_ctx = ralloc_context(NULL);
        scoreboard->dag = dag_create(mem_ctx);
        struct list_head setup_list;

        list_inithead(&setup_list);

        /* Wrap each instruction in a scheduler structure. */
        while (!list_is_empty(&block->instructions)) {
                struct qinst *qinst = (struct qinst *)block->instructions.next;
                struct schedule_node *n =
                        rzalloc(mem_ctx, struct schedule_node);

                dag_init_node(scoreboard->dag, &n->dag);
                n->inst = qinst;

                list_del(&qinst->link);
                list_addtail(&n->link, &setup_list);
        }

        calculate_forward_deps(c, scoreboard->dag, &setup_list);
        calculate_reverse_deps(c, scoreboard->dag, &setup_list);

        dag_traverse_bottom_up(scoreboard->dag, compute_delay, c);

        uint32_t cycles = schedule_instructions(c, scoreboard, block,
                                                orig_uniform_contents,
                                                orig_uniform_data,
                                                next_uniform);

        ralloc_free(mem_ctx);
        scoreboard->dag = NULL;

        return cycles;
}

static void
qpu_set_branch_targets(struct v3d_compile *c)
{
        vir_for_each_block(block, c) {
                /* The end block of the program has no branch. */
                if (!block->successors[0])
                        continue;

                /* If there was no branch instruction, then the successor
                 * block must follow immediately after this one.
                 */
                if (block->branch_qpu_ip == ~0) {
                        assert(block->end_qpu_ip + 1 ==
                               block->successors[0]->start_qpu_ip);
                        continue;
                }

                /* Walk back through the delay slots to find the branch
                 * instr.
                 */
                struct qinst *branch = NULL;
                struct list_head *entry = block->instructions.prev;
                int32_t delay_slot_count = -1;
                struct qinst *delay_slots_start = NULL;
                for (int i = 0; i < 3; i++) {
                        entry = entry->prev;
                        struct qinst *inst =
                                container_of(entry, struct qinst, link);

                        if (delay_slot_count == -1) {
                                if (!v3d_qpu_is_nop(&inst->qpu))
                                        delay_slot_count = i;
                                else
                                        delay_slots_start = inst;
                        }

                        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
                                branch = inst;
                                break;
                        }
                }
                assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
                assert(delay_slot_count >= 0 && delay_slot_count <= 3);
                assert(delay_slot_count == 0 || delay_slots_start != NULL);

                /* Make sure that the if-we-don't-jump
                 * successor was scheduled just after the
                 * delay slots.
                 */
                assert(!block->successors[1] ||
                       block->successors[1]->start_qpu_ip ==
                       block->branch_qpu_ip + 4);

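                /* The branch offset is in bytes (one 64-bit instruction
                 * each), relative to the instruction four past the branch,
                 * i.e. past the branch itself and its three delay slots.
                 */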
                branch->qpu.branch.offset =
                        ((block->successors[0]->start_qpu_ip -
                          (block->branch_qpu_ip + 4)) *
                         sizeof(uint64_t));

                /* Set up the relative offset to jump in the
                 * uniform stream.
                 *
                 * Use a temporary here, because
                 * uniform_data[inst->uniform] may be shared
                 * between multiple instructions.
                 */
                assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
                c->uniform_data[branch->uniform] =
                        (block->successors[0]->start_uniform -
                         (block->branch_uniform + 1)) * 4;

                /* If this is an unconditional branch, try to fill any remaining
                 * delay slots with the initial instructions of the successor
                 * block.
                 *
                 * FIXME: we can do the same for conditional branches if we
                 * predicate the instructions to match the branch condition.
                 */
                if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) {
                        struct list_head *successor_insts =
                                &block->successors[0]->instructions;
                        delay_slot_count = MIN2(delay_slot_count,
                                                list_length(successor_insts));
                        struct qinst *s_inst =
                                (struct qinst *) successor_insts->next;
                        struct qinst *slot = delay_slots_start;
                        int slots_filled = 0;
                        while (slots_filled < delay_slot_count &&
                               qpu_inst_valid_in_branch_delay_slot(c, s_inst)) {
                                memcpy(&slot->qpu, &s_inst->qpu,
                                       sizeof(slot->qpu));
                                s_inst = (struct qinst *) s_inst->link.next;
                                slot = (struct qinst *) slot->link.next;
                                slots_filled++;
                        }
                        branch->qpu.branch.offset +=
                                slots_filled * sizeof(uint64_t);
                }
        }
}

uint32_t
v3d_qpu_schedule_instructions(struct v3d_compile *c)
{
        const struct v3d_device_info *devinfo = c->devinfo;
        struct qblock *end_block = list_last_entry(&c->blocks,
                                                   struct qblock, link);

        /* We reorder the uniforms as we schedule instructions, so save the
         * old data off and replace it.
         */
        uint32_t *uniform_data = c->uniform_data;
        enum quniform_contents *uniform_contents = c->uniform_contents;
        c->uniform_contents = ralloc_array(c, enum quniform_contents,
                                           c->num_uniforms);
        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
        c->uniform_array_size = c->num_uniforms;
        uint32_t next_uniform = 0;

        struct choose_scoreboard scoreboard;
        memset(&scoreboard, 0, sizeof(scoreboard));
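        /* Initialize the last-event ticks far enough in the past that none
         * of the hazard windows they guard are active at the start of the
         * program.
         */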
        scoreboard.last_ldvary_tick = -10;
        scoreboard.last_unifa_write_tick = -10;
        scoreboard.last_magic_sfu_write_tick = -10;
        scoreboard.last_uniforms_reset_tick = -10;
        scoreboard.last_thrsw_tick = -10;
        scoreboard.last_branch_tick = -10;
        scoreboard.last_setmsf_tick = -10;
        scoreboard.last_stallable_sfu_tick = -10;

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                vir_for_each_block(block, c) {
                        fprintf(stderr, "BLOCK %d\n", block->index);
                        list_for_each_entry(struct qinst, qinst,
                                            &block->instructions, link) {
                                v3d_qpu_dump(devinfo, &qinst->qpu);
                                fprintf(stderr, "\n");
                        }
                }
                fprintf(stderr, "\n");
        }

        uint32_t cycles = 0;
        vir_for_each_block(block, c) {
                block->start_qpu_ip = c->qpu_inst_count;
                block->branch_qpu_ip = ~0;
                block->start_uniform = next_uniform;

                cycles += qpu_schedule_instructions_block(c,
                                                          &scoreboard,
                                                          block,
                                                          uniform_contents,
                                                          uniform_data,
                                                          &next_uniform);

                block->end_qpu_ip = c->qpu_inst_count - 1;
        }

        /* Emit the program-end THRSW instruction. */
        struct qinst *thrsw = vir_nop();
        thrsw->qpu.sig.thrsw = true;
        emit_thrsw(c, end_block, &scoreboard, thrsw, true);

        qpu_set_branch_targets(c);

        assert(next_uniform == c->num_uniforms);

        return cycles;
}