/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014-2015 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file vc4_qir_schedule.c
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies from the bottom up, and make a list of the DAG
 * heads.  Heuristically pick a DAG head and schedule (remove) it, then put
 * all the children that are now DAG heads into the list of things to
 * schedule.
 *
 * The goal of scheduling here, before register allocation and conversion to
 * QPU instructions, is to reduce register pressure by reordering instructions
 * to consume values when possible.
 */

#include "vc4_qir.h"
#include "util/dag.h"

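/* Set to true to dump the dependency state and scheduling choices to stderr. */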
static bool debug;

struct schedule_node {
        struct dag_node dag;
        struct list_head link;
        struct qinst *inst;

        /* Length of the longest (latency) chain from a DAG head to this
         * instruction.
         */
        uint32_t delay;

        /* Longest time + latency_between(parent, this) of any parent of this
         * node.
         */
        uint32_t unblocked_time;
};

struct schedule_state {
        struct dag *dag;

        /* Current timestamp, advanced as each instruction is scheduled. */
        uint32_t time;

        /* Number of not-yet-scheduled writes remaining for each temp. */
        uint32_t *temp_writes;

        /* Bitset of temps read by an already-scheduled (later) instruction,
         * i.e. live at the current point of the bottom-up walk.
         */
        BITSET_WORD *temp_live;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

/**
 * Marks a dependency between two instructions, that \p after must appear
 * after \p before.
 *
 * Our dependencies are tracked as a DAG.  Since we're scheduling bottom-up,
 * the latest instructions with nothing left to schedule are the DAG heads,
 * and their inputs are their children.
 */
static void
add_dep(enum direction dir,
        struct schedule_node *before,
        struct schedule_node *after)
{
        if (!before || !after)
                return;

        assert(before != after);

        if (dir == R) {
                struct schedule_node *t = before;
                before = after;
                after = t;
        }

        dag_add_edge(&after->dag, &before->dag, 0);
}

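/**
 * Adds a dependency on the previous node to access this resource, and
 * records \p after as the new last accessor for future dependencies.
 */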
static void
add_write_dep(enum direction dir,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(dir, *before, after);
        *before = after;
}

struct schedule_setup_state {
        struct schedule_node **last_temp_write;
        struct schedule_node *last_sf;
        struct schedule_node *last_vary_read;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_vpm_write;
        struct schedule_node *last_tex_coord;
        struct schedule_node *last_tex_result;
        struct schedule_node *last_tlb;
        struct schedule_node *last_uniforms_reset;
        enum direction dir;

        /**
         * Texture FIFO tracking.  This is done top-to-bottom, and is used to
         * track the QOP_TEX_RESULTs and add dependencies on previous ones
         * when trying to submit texture coords with TFREQ full or new texture
         * fetches with TFRCV full.
         */
        struct {
                struct schedule_node *node;
                int coords;
        } tex_fifo[8];
        int tfreq_count; /**< Number of texture coords outstanding. */
        int tfrcv_count; /**< Number of texture results outstanding. */
        int tex_fifo_pos;
};

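/**
 * Retires the oldest outstanding texture fetch by making \p n depend on its
 * QOP_TEX_RESULT, then pops it from the FIFO tracking so its TFREQ/TFRCV
 * slots are counted as free.
 */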
static void
block_until_tex_result(struct schedule_setup_state *state, struct schedule_node *n)
{
        add_dep(state->dir, state->tex_fifo[0].node, n);

        state->tfreq_count -= state->tex_fifo[0].coords;
        state->tfrcv_count--;

        memmove(&state->tex_fifo[0],
                &state->tex_fifo[1],
                state->tex_fifo_pos * sizeof(state->tex_fifo[0]));
        state->tex_fifo_pos--;
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all VPM reads have to happen in order."
 */
static void
calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
{
        struct qinst *inst = n->inst;
        enum direction dir = state->dir;

        /* Add deps for temp registers and varying accesses.  Note that we
         * ignore uniform accesses, because qir_reorder_uniforms() happens
         * after this.
         */
        for (int i = 0; i < qir_get_nsrc(inst); i++) {
                switch (inst->src[i].file) {
                case QFILE_TEMP:
                        add_dep(dir,
                                state->last_temp_write[inst->src[i].index], n);
                        break;

                case QFILE_VARY:
                        add_write_dep(dir, &state->last_vary_read, n);
                        break;

                case QFILE_VPM:
                        add_write_dep(dir, &state->last_vpm_read, n);
                        break;

                default:
                        break;
                }
        }

        switch (inst->op) {
        case QOP_VARY_ADD_C:
                add_dep(dir, state->last_vary_read, n);
                break;

        case QOP_TEX_RESULT:
                /* Results have to be fetched in order. */
                add_write_dep(dir, &state->last_tex_result, n);
                break;

        case QOP_THRSW:
                /* After a new THRSW, one must collect all texture samples
                 * queued since the previous THRSW/program start.  For now,
                 * our input has one THRSW in between each texture setup and
                 * its results collection, and we just make sure that
                 * ordering is maintained.
                 */
                add_write_dep(dir, &state->last_tex_coord, n);
                add_write_dep(dir, &state->last_tex_result, n);

                /* Accumulators and flags are lost across thread switches. */
                add_write_dep(dir, &state->last_sf, n);

                /* Setup, like the varyings, will need to be drained before we
                 * thread switch.
                 */
                add_write_dep(dir, &state->last_vary_read, n);

                /* The TLB-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(dir, &state->last_tlb, n);
                break;

        case QOP_TLB_COLOR_READ:
        case QOP_MS_MASK:
                add_write_dep(dir, &state->last_tlb, n);
                break;

        default:
                break;
        }

        switch (inst->dst.file) {
        case QFILE_VPM:
                add_write_dep(dir, &state->last_vpm_write, n);
                break;

        case QFILE_TEMP:
                add_write_dep(dir, &state->last_temp_write[inst->dst.index], n);
                break;

        case QFILE_TLB_COLOR_WRITE:
        case QFILE_TLB_COLOR_WRITE_MS:
        case QFILE_TLB_Z_WRITE:
        case QFILE_TLB_STENCIL_SETUP:
                add_write_dep(dir, &state->last_tlb, n);
                break;

        case QFILE_TEX_S_DIRECT:
        case QFILE_TEX_S:
        case QFILE_TEX_T:
        case QFILE_TEX_R:
        case QFILE_TEX_B:
                /* Texturing setup gets scheduled in order, because
                 * the uniforms referenced by them have to land in a
                 * specific order.
                 */
                add_write_dep(dir, &state->last_tex_coord, n);
                break;

        default:
                break;
        }

        if (qir_depends_on_flags(inst))
                add_dep(dir, state->last_sf, n);

        if (inst->sf)
                add_write_dep(dir, &state->last_sf, n);
}

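/**
 * Walks the block top-to-bottom, adding the common dependencies from
 * calculate_deps() plus ordering against QOP_UNIFORMS_RESET and blocking on
 * an outstanding QOP_TEX_RESULT when the TFREQ/TFRCV FIFOs would overflow.
 */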
static void
calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
                       struct list_head *schedule_list)
{
        struct schedule_setup_state state;

        memset(&state, 0, sizeof(state));
        state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
                                              c->num_temps);
        state.dir = F;

        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                struct qinst *inst = n->inst;

                calculate_deps(&state, n);

                for (int i = 0; i < qir_get_nsrc(inst); i++) {
                        switch (inst->src[i].file) {
                        case QFILE_UNIF:
                                add_dep(state.dir, state.last_uniforms_reset, n);
                                break;
                        default:
                                break;
                        }
                }

                switch (inst->dst.file) {
                case QFILE_TEX_S_DIRECT:
                case QFILE_TEX_S:
                case QFILE_TEX_T:
                case QFILE_TEX_R:
                case QFILE_TEX_B:
                        /* From the VC4 spec:
                         *
                         *     "The TFREQ input FIFO holds two full lots of s,
                         *      t, r, b data, plus associated setup data, per
                         *      QPU, that is, there are eight data slots. For
                         *      each texture request, slots are only consumed
                         *      for the components of s, t, r, and b actually
                         *      written. Thus the FIFO can hold four requests
                         *      of just (s, t) data, or eight requests of just
                         *      s data (for direct addressed data lookups).
                         *
                         *      Note that there is one FIFO per QPU, and the
                         *      FIFO has no concept of threads - that is,
                         *      multi-threaded shaders must be careful to use
                         *      only 1/2 the FIFO depth before reading
                         *      back. Multi-threaded programs must also
                         *      therefore always thread switch on texture
                         *      fetch as the other thread may have data
                         *      waiting in the FIFO."
                         *
                         * If the texture coordinate fifo is full, block this
                         * on the last QOP_TEX_RESULT.
                         */
                        if (state.tfreq_count == (c->fs_threaded ? 4 : 8)) {
                                block_until_tex_result(&state, n);
                        }

                        /* From the VC4 spec:
                         *
                         *     "Since the maximum number of texture requests
                         *      in the input (TFREQ) FIFO is four lots of (s,
                         *      t) data, the output (TFRCV) FIFO is sized to
                         *      hold four lots of max-size color data per
                         *      QPU. For non-float color, reads are packed
                         *      RGBA8888 data (one read per pixel). For 16-bit
                         *      float color, two reads are necessary per
                         *      pixel, with reads packed as RG1616 then
                         *      BA1616. So per QPU there are eight color slots
                         *      in the TFRCV FIFO."
                         *
                         * If the texture result fifo is full, block adding
                         * any more to it until the last QOP_TEX_RESULT.
                         */
                        if (inst->dst.file == QFILE_TEX_S ||
                            inst->dst.file == QFILE_TEX_S_DIRECT) {
                                if (state.tfrcv_count ==
                                    (c->fs_threaded ? 2 : 4))
                                        block_until_tex_result(&state, n);
                                state.tfrcv_count++;
                        }

                        state.tex_fifo[state.tex_fifo_pos].coords++;
                        state.tfreq_count++;
                        break;

                default:
                        break;
                }

                switch (inst->op) {
                case QOP_TEX_RESULT:
                        /* Results have to be fetched after the
                         * coordinate setup.  Note that we're assuming
                         * here that our input shader has the texture
                         * coord setup and result fetch in order,
                         * which is true initially but not of our
                         * instruction stream after this pass.
                         */
                        add_dep(state.dir, state.last_tex_coord, n);

                        state.tex_fifo[state.tex_fifo_pos].node = n;

                        state.tex_fifo_pos++;
                        memset(&state.tex_fifo[state.tex_fifo_pos], 0,
                               sizeof(state.tex_fifo[0]));
                        break;

                case QOP_UNIFORMS_RESET:
                        add_write_dep(state.dir, &state.last_uniforms_reset, n);
                        break;

                default:
                        break;
                }
        }
}

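/**
 * Walks the block bottom-to-top with before/after swapped in add_dep(), so
 * that write-after-read orderings (a temp's readers before its next write)
 * are also captured in the DAG.
 */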
static void
calculate_reverse_deps(struct vc4_compile *c, void *mem_ctx,
                       struct list_head *schedule_list)
{
        struct schedule_setup_state state;

        memset(&state, 0, sizeof(state));
        state.dir = R;
        state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
                                              c->num_temps);

        list_for_each_entry_rev(struct schedule_node, n, schedule_list, link) {
                calculate_deps(&state, n);
        }
}

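/**
 * Estimates the change in the number of live temps if \p inst were scheduled
 * now: -1 when this is the only remaining write of its dst temp (scheduling
 * it ends the live range, since we schedule bottom-up), and +1 for each
 * distinct source temp that isn't live yet.
 */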
static int
get_register_pressure_cost(struct schedule_state *state, struct qinst *inst)
{
        int cost = 0;

        if (inst->dst.file == QFILE_TEMP &&
            state->temp_writes[inst->dst.index] == 1)
                cost--;

        for (int i = 0; i < qir_get_nsrc(inst); i++) {
                if (inst->src[i].file != QFILE_TEMP ||
                    BITSET_TEST(state->temp_live, inst->src[i].index)) {
                        continue;
                }

                bool already_counted = false;
                for (int j = 0; j < i; j++) {
                        if (inst->src[i].file == inst->src[j].file &&
                            inst->src[i].index == inst->src[j].index) {
                                already_counted = true;
                        }
                }
                if (!already_counted)
                        cost++;
        }

        return cost;
}

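/**
 * Returns true if the instruction accesses the TLB, which requires taking
 * the scoreboard lock on the fragment's tile.
 */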
static bool
locks_scoreboard(struct qinst *inst)
{
        if (inst->op == QOP_TLB_COLOR_READ)
                return true;

        switch (inst->dst.file) {
        case QFILE_TLB_Z_WRITE:
        case QFILE_TLB_COLOR_WRITE:
        case QFILE_TLB_COLOR_WRITE_MS:
                return true;
        default:
                return false;
        }
}

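/**
 * Heuristically picks the next DAG head to schedule: QOP_BRANCH is always
 * taken first so it stays at the end of the block, then we prefer (in order)
 * scoreboard-locking ops, less stalling, register pressure reduction, and
 * the deepest latency chain.
 */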
static struct schedule_node *
choose_instruction(struct schedule_state *state)
{
        struct schedule_node *chosen = NULL;

        list_for_each_entry(struct schedule_node, n, &state->dag->heads,
                            dag.link) {
                /* The branches aren't being tracked as dependencies.  Make
                 * sure that they stay scheduled as the last instruction of
                 * the block, which is to say the first one we choose to
                 * schedule.
                 */
                if (n->inst->op == QOP_BRANCH)
                        return n;

                if (!chosen) {
                        chosen = n;
                        continue;
                }

                /* Prefer scheduling things that lock the scoreboard, so that
                 * they appear late in the program and we get more parallelism
                 * between shaders on multiple QPUs hitting the same fragment.
                 */
                if (locks_scoreboard(n->inst) &&
                    !locks_scoreboard(chosen->inst)) {
                        chosen = n;
                        continue;
                } else if (!locks_scoreboard(n->inst) &&
                           locks_scoreboard(chosen->inst)) {
                        continue;
                }

                /* If we would block on the previously chosen node, but would
                 * block less on this one, then prefer it.
                 */
                if (chosen->unblocked_time > state->time &&
                    n->unblocked_time < chosen->unblocked_time) {
                        chosen = n;
                        continue;
                } else if (n->unblocked_time > state->time &&
                           n->unblocked_time > chosen->unblocked_time) {
                        continue;
                }

                /* If we can definitely reduce register pressure, do so
                 * immediately.
                 */
                int register_pressure_cost =
                        get_register_pressure_cost(state, n->inst);
                int chosen_register_pressure_cost =
                        get_register_pressure_cost(state, chosen->inst);

                if (register_pressure_cost < chosen_register_pressure_cost) {
                        chosen = n;
                        continue;
                } else if (register_pressure_cost >
                           chosen_register_pressure_cost) {
                        continue;
                }

                /* Otherwise, prefer instructions with the deepest chain to
                 * the end of the program.  This avoids the problem of
                 * "everything generates a temp, nothing finishes freeing one,
                 * guess I'll just keep emitting varying mul/adds".
                 */
                if (n->delay > chosen->delay) {
                        chosen = n;
                        continue;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        return chosen;
}

static void
dump_state(struct vc4_compile *c, struct schedule_state *state)
{
        uint32_t i = 0;
        list_for_each_entry(struct schedule_node, n, &state->dag->heads,
                            dag.link) {
                fprintf(stderr, "%3d: ", i++);
                qir_dump_inst(c, n->inst);
                fprintf(stderr, " (%d cost)\n",
                        get_register_pressure_cost(state, n->inst));

                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                        struct schedule_node *child =
                                (struct schedule_node *)edge->child;
                        fprintf(stderr, "   - ");
                        qir_dump_inst(c, child->inst);
                        fprintf(stderr, " (%d parents)\n",
                                child->dag.parent_count);
                }
        }
}

/* Estimate of how many instructions we should schedule between operations.
 *
 * These aren't in real cycle counts, because we're just estimating cycle
 * times anyway.  QIR instructions will get paired up when turned into QPU
 * instructions, or extra NOP delays will have to be added due to register
 * allocation choices.
 */
static uint32_t
latency_between(struct schedule_node *before, struct schedule_node *after)
{
        if ((before->inst->dst.file == QFILE_TEX_S ||
             before->inst->dst.file == QFILE_TEX_S_DIRECT) &&
            after->inst->op == QOP_TEX_RESULT)
                return 100;

        switch (before->inst->op) {
        case QOP_RCP:
        case QOP_RSQ:
        case QOP_EXP2:
        case QOP_LOG2:
                for (int i = 0; i < qir_get_nsrc(after->inst); i++) {
                        if (after->inst->src[i].file ==
                            before->inst->dst.file &&
                            after->inst->src[i].index ==
                            before->inst->dst.index) {
                                /* There are two QPU delay slots before we can
                                 * read a math result, which could be up to 4
                                 * QIR instructions if they packed well.
                                 */
                                return 4;
                        }
                }
                break;
        default:
                break;
        }

        return 1;
}

/** Computes the delay member of a node; run bottom-up over the DAG. */
static void
compute_delay(struct dag_node *node, void *state)
{
        struct schedule_node *n = (struct schedule_node *)node;

        /* The color read needs to be scheduled late, to avoid locking the
         * scoreboard early.  This is our best tool for encouraging that.  The
         * other scoreboard locking ops will have this happen by default,
         * since they are generally the DAG heads or close to them.
         */
        if (n->inst->op == QOP_TLB_COLOR_READ)
                n->delay = 1000;
        else
                n->delay = 1;

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;

                n->delay = MAX2(n->delay, (child->delay +
                                           latency_between(child, n)));
        }
}

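/**
 * The main scheduling loop: repeatedly picks a DAG head and emits it at the
 * head of the block's instruction list.  Since we schedule bottom-up, the
 * block ends up rebuilt in program order.
 */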
static void
schedule_instructions(struct vc4_compile *c,
                      struct qblock *block, struct schedule_state *state)
{
        if (debug) {
                fprintf(stderr, "initial deps:\n");
                dump_state(c, state);
        }

        state->time = 0;
        while (!list_is_empty(&state->dag->heads)) {
                struct schedule_node *chosen = choose_instruction(state);
                struct qinst *inst = chosen->inst;

                if (debug) {
                        fprintf(stderr, "current list:\n");
                        dump_state(c, state);
                        fprintf(stderr, "chose: ");
                        qir_dump_inst(c, inst);
                        fprintf(stderr, " (%d cost)\n",
                                get_register_pressure_cost(state, inst));
                }

                state->time = MAX2(state->time, chosen->unblocked_time);

                /* Schedule this instruction back onto the QIR list.  Since
                 * we're scheduling bottom-up, adding it at the block head
                 * places it before everything scheduled so far.
                 */
                list_add(&inst->link, &block->instructions);

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready to
                 * be scheduled.  Update the children's unblocked time for this
                 * DAG edge as we do so.
                 */
                util_dynarray_foreach(&chosen->dag.edges,
                                      struct dag_edge, edge) {
                        struct schedule_node *child =
                                (struct schedule_node *)edge->child;

                        child->unblocked_time = MAX2(child->unblocked_time,
                                                     state->time +
                                                     latency_between(child,
                                                                     chosen));
                }
                dag_prune_head(state->dag, &chosen->dag);

                /* Update our tracking of register pressure. */
                for (int i = 0; i < qir_get_nsrc(inst); i++) {
                        if (inst->src[i].file == QFILE_TEMP)
                                BITSET_SET(state->temp_live, inst->src[i].index);
                }
                if (inst->dst.file == QFILE_TEMP) {
                        state->temp_writes[inst->dst.index]--;
                        if (state->temp_writes[inst->dst.index] == 0)
                                BITSET_CLEAR(state->temp_live, inst->dst.index);
                }

                state->time++;
        }
}

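/**
 * List-schedules a single block: wraps each instruction in a schedule_node,
 * builds the dependency DAG, computes per-node delay priorities, and then
 * re-emits the instructions in the chosen order.
 */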
static void
qir_schedule_instructions_block(struct vc4_compile *c,
                                struct qblock *block)
{
        struct schedule_state *state = rzalloc(NULL, struct schedule_state);

        state->temp_writes = rzalloc_array(state, uint32_t, c->num_temps);
        state->temp_live = rzalloc_array(state, BITSET_WORD,
                                         BITSET_WORDS(c->num_temps));
        state->dag = dag_create(state);

        struct list_head setup_list;
        list_inithead(&setup_list);

        /* Wrap each instruction in a scheduler structure. */
        qir_for_each_inst_safe(inst, block) {
                struct schedule_node *n = rzalloc(state, struct schedule_node);

                n->inst = inst;
                list_del(&inst->link);
                list_addtail(&n->link, &setup_list);
                dag_init_node(state->dag, &n->dag);

                if (inst->dst.file == QFILE_TEMP)
                        state->temp_writes[inst->dst.index]++;
        }

        /* Dependencies tracked top-to-bottom. */
        calculate_forward_deps(c, state, &setup_list);
        /* Dependencies tracked bottom-to-top. */
        calculate_reverse_deps(c, state, &setup_list);

        dag_traverse_bottom_up(state->dag, compute_delay, NULL);

        schedule_instructions(c, block, state);

        ralloc_free(state);
}

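/**
 * Pass entry point: runs the list scheduler over each block of the program.
 */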
void
qir_schedule_instructions(struct vc4_compile *c)
{
        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                qir_dump(c);
        }

        qir_for_each_block(block, c)
                qir_schedule_instructions_block(c, block);

        if (debug) {
                fprintf(stderr, "Post-schedule instructions\n");
                qir_dump(c);
        }
}