/*
 * Copyright © 2019 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir_schedule.h"
#include "util/dag.h"
#include "util/u_dynarray.h"

/** @file
 *
 * Implements basic-block-level prepass instruction scheduling in NIR to
 * manage register pressure.
 *
 * This is based on the Goodman/Hsu paper (1988, cached copy at
 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf).  We
 * make up the DDG for NIR (which can be mostly done using the NIR def/use
 * chains for SSA instructions, plus some edges for ordering register writes
 * vs reads, and some more for ordering intrinsics).  Then we pick heads off
 * of the DDG using their heuristic to emit the NIR instructions back into the
 * block in their new order.
 *
 * The hard case for prepass scheduling on GPUs seems to always be consuming
 * texture/ubo results.  The register pressure heuristic doesn't want to pick
 * an instr that starts consuming texture results because it usually won't be
 * the only usage, so that instruction will increase pressure.
 *
 * If you try to force consumption of tex results always, then in a case where
 * a single sample is used for many outputs, you'll end up picking every other
 * user and expanding register pressure.  The partially_evaluated_path flag
 * helps tremendously, in that if you happen for whatever reason to pick a
 * texture sample's output, then you'll try to finish off that sample.  Future
 * work may include doing some local search before locking in a choice, to try
 * to more reliably find the case where just a few choices going against the
 * heuristic can manage to free the whole vector.
 */

static bool debug;

/**
 * Represents a node in the DDG for a NIR instruction.
 */
typedef struct {
   struct dag_node dag; /* must be first for our u_dynarray_foreach */
   nir_instr *instr;
   bool partially_evaluated_path;

   /* Approximate estimate of the delay between starting this instruction and
    * its results being available.
    *
    * Accuracy is not too important, given that we're prepass scheduling here
    * and just trying to reduce excess dependencies introduced by a register
    * allocator by stretching out the live intervals of expensive
    * instructions.
    */
   uint32_t delay;

   /* Cost of the maximum-delay path from this node to the leaves. */
   uint32_t max_delay;

   /* scoreboard->time value when this instruction can be scheduled without
    * any stalls expected.
    */
   uint32_t ready_time;
} nir_schedule_node;

typedef struct {
   struct dag *dag;

   nir_shader *shader;

   /* Mapping from nir_register * or nir_ssa_def * to a struct set of
    * instructions remaining to be scheduled that use the value.
    */
   struct hash_table *remaining_uses;

   /* Map from nir_instr to nir_schedule_node * */
   struct hash_table *instr_map;

   /* Set of nir_register * or nir_ssa_def * that have had any instruction
    * scheduled on them.
    */
   struct set *live_values;

   /* An abstract approximation of the number of nir_schedule_node->delay
    * units since the start of the shader.
    */
   uint32_t time;

   /* Number of channels currently used by the NIR instructions that have been
    * scheduled.
    */
   int pressure;

   /* Options specified by the backend */
   const nir_schedule_options *options;
} nir_schedule_scoreboard;

/* When walking the instructions in reverse, we use this flag to swap
 * before/after in add_dep().
 */
enum direction { F, R };

struct nir_schedule_class_dep {
   int klass;
   nir_schedule_node *node;
   struct nir_schedule_class_dep *next;
};

typedef struct {
   nir_schedule_scoreboard *scoreboard;

   /* Map from nir_register to nir_schedule_node * */
   struct hash_table *reg_map;

   /* Scheduler nodes for the last instruction involved in each class of
    * dependency.
    */
   nir_schedule_node *load_input;
   nir_schedule_node *store_shared;
   nir_schedule_node *unknown_intrinsic;
   nir_schedule_node *discard;
   nir_schedule_node *jump;

   struct nir_schedule_class_dep *class_deps;

   enum direction dir;
} nir_deps_state;

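/* Convenience wrapper returning a hash table entry's data for @key, or NULL
 * if the key is not present.
 */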
static void *
_mesa_hash_table_search_data(struct hash_table *ht, void *key)
{
   struct hash_entry *entry = _mesa_hash_table_search(ht, key);
   if (!entry)
      return NULL;
   return entry->data;
}

static nir_schedule_node *
nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
{
   return _mesa_hash_table_search_data(instr_map, instr);
}

static struct set *
nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
{
   if (src->is_ssa) {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
   } else {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses,
                                          src->reg.reg);
   }
}

static int
nir_schedule_def_pressure(nir_ssa_def *def)
{
   return def->num_components;
}

static int
nir_schedule_src_pressure(nir_src *src)
{
   if (src->is_ssa)
      return nir_schedule_def_pressure(src->ssa);
   else
      return src->reg.reg->num_components;
}

static int
nir_schedule_dest_pressure(nir_dest *dest)
{
   if (dest->is_ssa)
      return nir_schedule_def_pressure(&dest->ssa);
   else
      return dest->reg.reg->num_components;
}

/**
 * Adds a dependency such that @after must appear in the final program after
 * @before.
 *
 * We add @before as a child of @after, so that DAG heads are the outputs of
 * the program and we make our scheduling decisions bottom to top.
 */
static void
add_dep(nir_deps_state *state,
        nir_schedule_node *before,
        nir_schedule_node *after)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->dir == F)
      dag_add_edge(&before->dag, &after->dag, 0);
   else
      dag_add_edge(&after->dag, &before->dag, 0);
}

static void
add_read_dep(nir_deps_state *state,
             nir_schedule_node *before,
             nir_schedule_node *after)
{
   add_dep(state, before, after);
}

static void
add_write_dep(nir_deps_state *state,
              nir_schedule_node **before,
              nir_schedule_node *after)
{
   add_dep(state, *before, after);
   *before = after;
}

static bool
nir_schedule_reg_src_deps(nir_src *src, void *in_state)
{
   nir_deps_state *state = in_state;

   if (src->is_ssa)
      return true;

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      src->reg.reg);
   if (!entry)
      return true;
   nir_schedule_node *dst_n = entry->data;

   nir_schedule_node *src_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            src->parent_instr);

   add_dep(state, dst_n, src_n);

   return true;
}

static bool
nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
{
   nir_deps_state *state = in_state;

   if (dest->is_ssa)
      return true;

   nir_schedule_node *dest_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            dest->reg.parent_instr);

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      dest->reg.reg);
   if (!entry) {
      _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
      return true;
   }
   nir_schedule_node **before = (nir_schedule_node **)&entry->data;

   add_write_dep(state, before, dest_n);

   return true;
}

static bool
nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
{
   nir_deps_state *state = in_state;
   struct hash_table *instr_map = state->scoreboard->instr_map;
   nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);

   nir_foreach_use(src, def) {
      nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
                                                       src->parent_instr);

      add_read_dep(state, def_n, use_n);
   }

   return true;
}

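/**
 * Returns the dependency-tracking entry for the backend-defined dependency
 * class @klass, creating it (with no node recorded yet) on first use.
 */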
static struct nir_schedule_class_dep *
nir_schedule_get_class_dep(nir_deps_state *state,
                           int klass)
{
   for (struct nir_schedule_class_dep *class_dep = state->class_deps;
        class_dep != NULL;
        class_dep = class_dep->next) {
      if (class_dep->klass == klass)
         return class_dep;
   }

   struct nir_schedule_class_dep *class_dep =
      ralloc(state->reg_map, struct nir_schedule_class_dep);

   class_dep->klass = klass;
   class_dep->node = NULL;
   class_dep->next = state->class_deps;

   state->class_deps = class_dep;

   return class_dep;
}

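/**
 * Adds ordering dependencies for an intrinsic, either through the backend's
 * intrinsic_cb (which maps the intrinsic to a read or write dependency on a
 * backend-defined class) or through the per-category tracking below.
 *
 * A minimal sketch of such a callback, assuming only the call signature used
 * below (MY_CLASS_SSBO is a hypothetical backend-defined class number):
 *
 *    static bool
 *    my_intrinsic_cb(nir_intrinsic_instr *intr, nir_schedule_dependency *dep,
 *                    void *data)
 *    {
 *       if (intr->intrinsic == nir_intrinsic_load_ssbo) {
 *          dep->klass = MY_CLASS_SSBO;
 *          dep->type = NIR_SCHEDULE_READ_DEPENDENCY;
 *          return true;
 *       }
 *       return false;
 *    }
 */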
static void
nir_schedule_intrinsic_deps(nir_deps_state *state,
                            nir_intrinsic_instr *instr)
{
   nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
                                                &instr->instr);
   const nir_schedule_options *options = state->scoreboard->options;
   nir_schedule_dependency dep;

   if (options->intrinsic_cb &&
       options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) {
      struct nir_schedule_class_dep *class_dep =
         nir_schedule_get_class_dep(state, dep.klass);

      switch (dep.type) {
      case NIR_SCHEDULE_READ_DEPENDENCY:
         add_read_dep(state, class_dep->node, n);
         break;
      case NIR_SCHEDULE_WRITE_DEPENDENCY:
         add_write_dep(state, &class_dep->node, n);
         break;
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_front_face:
      break;

   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if:
   case nir_intrinsic_demote:
   case nir_intrinsic_demote_if:
   case nir_intrinsic_terminate:
   case nir_intrinsic_terminate_if:
      /* We are adding two dependencies:
       *
       * * An individual one that we could use to add a read_dep while
       *   handling nir_instr_type_tex
       *
       * * One on the unknown-intrinsic set, as we want discards to be
       *   serialized in the same order relative to intervening stores or
       *   atomic accesses to SSBOs and images
       */
      add_write_dep(state, &state->discard, n);
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   case nir_intrinsic_store_output:
      /* For some hardware and stages, output stores affect the same shared
       * memory as input loads.
       */
      if ((state->scoreboard->options->stages_with_shared_io_memory &
           (1 << state->scoreboard->shader->info.stage)))
         add_write_dep(state, &state->load_input, n);

      /* Make sure that preceding discards stay before the store_output */
      add_read_dep(state, state->discard, n);

      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
      add_read_dep(state, state->load_input, n);
      break;

   case nir_intrinsic_load_shared:
   case nir_intrinsic_load_shared2_amd:
      /* Don't move a load_shared beyond a following store_shared, as the
       * store could change the loaded value.
       */
      add_read_dep(state, state->store_shared, n);
      break;

   case nir_intrinsic_store_shared:
   case nir_intrinsic_store_shared2_amd:
      add_write_dep(state, &state->store_shared, n);
      break;

   case nir_intrinsic_control_barrier:
   case nir_intrinsic_memory_barrier_shared:
   case nir_intrinsic_group_memory_barrier:
   /* A generic memory barrier can be emitted when multiple synchronization
    * semantics are involved, including shared memory.
    */
   case nir_intrinsic_memory_barrier:
      add_write_dep(state, &state->store_shared, n);

      /* Serialize against ssbos/atomics/etc. */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   case nir_intrinsic_scoped_barrier: {
      const nir_variable_mode modes = nir_intrinsic_memory_modes(instr);

      if (modes & nir_var_mem_shared)
         add_write_dep(state, &state->store_shared, n);

      /* Serialize against other categories. */
      add_write_dep(state, &state->unknown_intrinsic, n);

      break;
   }

   default:
      /* Attempt to handle other intrinsics that we haven't individually
       * categorized by serializing them in the same order relative to each
       * other.
       */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;
   }
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
{
   nir_instr *instr = n->instr;

   /* For NIR SSA defs, we only need to do a single pass of making the uses
    * depend on the def.
    */
   if (state->dir == F)
      nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);

   /* For NIR regs, track the last writer in the scheduler state so that we
    * can keep the writes in order and let reads get reordered only between
    * each write.
    */
   nir_foreach_src(instr, nir_schedule_reg_src_deps, state);

   nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);

   /* Make sure any other instructions keep their positions relative to
    * jumps.
    */
   if (instr->type != nir_instr_type_jump)
      add_read_dep(state, state->jump, n);

   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
      break;

   case nir_instr_type_tex:
      /* Don't move texture ops before a discard, as that could increase
       * memory bandwidth for reading the discarded samples.
       */
      add_read_dep(state, state->discard, n);
      break;

   case nir_instr_type_jump:
      add_write_dep(state, &state->jump, n);
      break;

   case nir_instr_type_call:
      unreachable("Calls should have been lowered");
      break;

   case nir_instr_type_parallel_copy:
      unreachable("Parallel copies should have been lowered");
      break;

   case nir_instr_type_phi:
      unreachable("nir_schedule() should be called after lowering from SSA");
      break;

   case nir_instr_type_intrinsic:
      nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
      break;
   }
}

static void
calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = F,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

static void
calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = R,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr_reverse(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

typedef struct {
   nir_schedule_scoreboard *scoreboard;
   int regs_freed;
} nir_schedule_regs_freed_state;

static bool
nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   if (remaining_uses->entries == 1 &&
       _mesa_set_search(remaining_uses, src->parent_instr)) {
      state->regs_freed += nir_schedule_src_pressure(src);
   }

   return true;
}

static bool
nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;

   state->regs_freed -= nir_schedule_def_pressure(def);

   return true;
}

static bool
nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;

   if (dest->is_ssa)
      return true;

   nir_register *reg = dest->reg.reg;

   /* Only the first def of a reg counts against register pressure. */
   if (!_mesa_set_search(scoreboard->live_values, reg))
      state->regs_freed -= nir_schedule_dest_pressure(dest);

   return true;
}

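/**
 * Returns the net number of register channels freed if @n were scheduled
 * now: channels of sources whose last remaining use is this instruction,
 * minus channels newly made live by its definitions.
 */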
static int
nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
{
   nir_schedule_regs_freed_state state = {
      .scoreboard = scoreboard,
   };

   nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);

   nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);

   nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);

   return state.regs_freed;
}

/**
 * Chooses an instruction that will minimise the register pressure as much as
 * possible. This should only be used as a fallback when the regular scheduling
 * generates a shader whose register allocation fails.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the minimum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready fallback):          ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the minimum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader fallback):         ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
 * Scheduling for Parallelism) heuristic.
 *
 * Picks the ready-to-execute (no expected stalls) instruction with the
 * maximum cost, if there is one; otherwise picks the instruction on the
 * critical path.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the maximum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready):          ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the maximum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader):         ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
 * Scheduling for Register pressure) heuristic.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find a ready inst with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;

      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (freed+ready):    ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a leader with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs freed):     ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a partially evaluated path and try to finish it off */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->partially_evaluated_path &&
          (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (partial path):   ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Contra the paper, pick a leader with no effect on used regs.  This may
    * open up new opportunities, as otherwise a single-operand instr consuming
    * a value will tend to block us from reaching the instruction that frees
    * that value.  This had a massive effect on reducing spilling on V3D.
    *
    * XXX: Should this prioritize ready?
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (nir_schedule_regs_freed(scoreboard, n) != 0)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs no-op):     ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Pick the max delay of the remaining ready set. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready max delay):   ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }
      return chosen;
   }

   /* Pick the max delay of the remaining leaders. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (debug) {
      fprintf(stderr, "chose (max delay):         ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

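/* Debug helper: prints the current DAG heads along with their outgoing edges
 * and each child's remaining parent count.
 */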
static void
dump_state(nir_schedule_scoreboard *scoreboard)
{
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      fprintf(stderr, "maxdel %5d ", n->max_delay);
      nir_print_instr(n->instr, stderr);
      fprintf(stderr, "\n");

      util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
         nir_schedule_node *child = (nir_schedule_node *)edge->child;

         fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
         nir_print_instr(child->instr, stderr);
         fprintf(stderr, "\n");
      }
   }
}

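/**
 * Updates liveness and pressure tracking for one use of @reg_or_def by
 * @reg_or_def_parent: the value goes live (adding its channels to the
 * pressure) on the first scheduled use, and its channels are freed once the
 * last remaining use has been scheduled.
 */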
static void
nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
                      void *reg_or_def,
                      nir_instr *reg_or_def_parent,
                      int pressure)
{
   /* Make the value live if it's the first time it's been used. */
   if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
      _mesa_set_add(scoreboard->live_values, reg_or_def);
      scoreboard->pressure += pressure;
   }

   /* Make the value dead if it's the last remaining use.  Be careful when one
    * instruction uses a value twice to not decrement pressure twice.
    */
   struct set *remaining_uses =
      _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
   struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
   if (entry) {
      _mesa_set_remove(remaining_uses, entry);

      if (remaining_uses->entries == 0)
         scoreboard->pressure -= pressure;
   }
}

static bool
nir_schedule_mark_src_scheduled(nir_src *src, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   struct set_entry *entry = _mesa_set_search(remaining_uses,
                                              src->parent_instr);
   if (entry) {
      /* Once we've used an SSA value in one instruction, bump the priority of
       * the other uses so the SSA value can get fully consumed.
       *
       * We don't do this for registers, as it would be a hassle and it's
       * unclear whether it would help.  Also, skip it for constants, as
       * they're often folded as immediates into backend instructions and have
       * many unrelated instructions all referencing the same value (0).
       */
      if (src->is_ssa &&
          src->ssa->parent_instr->type != nir_instr_type_load_const) {
         nir_foreach_use(other_src, src->ssa) {
            if (other_src->parent_instr == src->parent_instr)
               continue;

            nir_schedule_node *n =
               nir_schedule_get_node(scoreboard->instr_map,
                                     other_src->parent_instr);

            if (n && !n->partially_evaluated_path) {
               if (debug) {
                  fprintf(stderr, "  New partially evaluated path: ");
                  nir_print_instr(n->instr, stderr);
                  fprintf(stderr, "\n");
               }

               n->partially_evaluated_path = true;
            }
         }
      }
   }

   nir_schedule_mark_use(scoreboard,
                         src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
                         src->parent_instr,
                         nir_schedule_src_pressure(src));

   return true;
}

static bool
nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   nir_schedule_mark_use(scoreboard, def, def->parent_instr,
                         nir_schedule_def_pressure(def));

   return true;
}

static bool
nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   /* SSA defs were handled in nir_schedule_mark_def_scheduled()
    */
   if (dest->is_ssa)
      return true;

   /* XXX: This is not actually accurate for regs -- the last use of a reg may
    * have a live interval that extends across control flow.  We should
    * calculate the live ranges of regs, and have scheduler nodes for the CF
    * nodes that also "use" the reg.
    */
   nir_schedule_mark_use(scoreboard, dest->reg.reg,
                         dest->reg.parent_instr,
                         nir_schedule_dest_pressure(dest));

   return true;
}

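/**
 * Accounts for a newly scheduled node: updates liveness/pressure for its
 * sources and definitions, propagates ready times to its DAG children, prunes
 * the node from the DAG, and advances the scoreboard time.
 */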
static void
nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
                                 nir_schedule_node *n)
{
   nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
   nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
   nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;

      child->ready_time = MAX2(child->ready_time,
                               scoreboard->time + n->delay);

      if (child->dag.parent_count == 1) {
         if (debug) {
            fprintf(stderr, "  New DAG head: ");
            nir_print_instr(child->instr, stderr);
            fprintf(stderr, "\n");
         }
      }
   }

   dag_prune_head(scoreboard->dag, &n->dag);

   scoreboard->time = MAX2(n->ready_time, scoreboard->time);
   scoreboard->time++;
}

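/**
 * Main scheduling loop for a block: repeatedly picks a DAG head using the
 * fallback, CSP, or CSR heuristic (depending on the options and current
 * pressure) and re-emits it at the end of the block's instruction list.
 */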
static void
nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   while (!list_is_empty(&scoreboard->dag->heads)) {
      if (debug) {
         fprintf(stderr, "current list:\n");
         dump_state(scoreboard);
      }

      nir_schedule_node *chosen;
      if (scoreboard->options->fallback)
         chosen = nir_schedule_choose_instruction_fallback(scoreboard);
      else if (scoreboard->pressure < scoreboard->options->threshold)
         chosen = nir_schedule_choose_instruction_csp(scoreboard);
      else
         chosen = nir_schedule_choose_instruction_csr(scoreboard);

      /* Now that we've scheduled a new instruction, some of its children may
       * be promoted to the list of instructions ready to be scheduled.
       */
      nir_schedule_mark_node_scheduled(scoreboard, chosen);

      /* Move the instruction to the end (so our first chosen instructions are
       * the start of the program).
       */
      exec_node_remove(&chosen->instr->node);
      exec_list_push_tail(&block->instr_list, &chosen->instr->node);

      if (debug)
         fprintf(stderr, "\n");
   }
}

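/**
 * Returns the estimated delay for @instr, either from the backend's
 * instr_delay_cb or from the rough defaults below.
 */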
static uint32_t
nir_schedule_get_delay(nir_schedule_scoreboard *scoreboard, nir_instr *instr)
{
   if (scoreboard->options->instr_delay_cb) {
      void *cb_data = scoreboard->options->instr_delay_cb_data;
      return scoreboard->options->instr_delay_cb(instr, cb_data);
   }

   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
   case nir_instr_type_jump:
   case nir_instr_type_parallel_copy:
   case nir_instr_type_call:
   case nir_instr_type_phi:
      return 1;

   case nir_instr_type_intrinsic:
      switch (nir_instr_as_intrinsic(instr)->intrinsic) {
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_scratch:
      case nir_intrinsic_load_shared:
      case nir_intrinsic_image_load:
         return 50;
      default:
         return 1;
      }
      break;

   case nir_instr_type_tex:
      /* Pick some large number to try to fetch textures early and sample them
       * late.
       */
      return 100;
   }

   return 0;
}

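/* Bottom-up DAG traversal callback: computes each node's max_delay as its own
 * delay plus the maximum max_delay among its children.
 */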
static void
nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
{
   nir_schedule_node *n = (nir_schedule_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

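/**
 * Schedules a single basic block: builds the DDG for its instructions,
 * computes forward/reverse dependencies and per-node max_delay, then re-emits
 * the instructions in their new order.
 */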
static void
nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   void *mem_ctx = ralloc_context(NULL);
   scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);

   scoreboard->dag = dag_create(mem_ctx);

   nir_foreach_instr(instr, block) {
      nir_schedule_node *n =
         rzalloc(mem_ctx, nir_schedule_node);

      n->instr = instr;
      n->delay = nir_schedule_get_delay(scoreboard, instr);
      dag_init_node(scoreboard->dag, &n->dag);

      _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
   }

   calculate_forward_deps(scoreboard, block);
   calculate_reverse_deps(scoreboard, block);

   dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);

   nir_schedule_instructions(scoreboard, block);

   ralloc_free(mem_ctx);
   scoreboard->instr_map = NULL;
}

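/* Sets up the remaining-uses set for an SSA def, seeded with the defining
 * instruction and all of its users.
 */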
static bool
nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *def_uses = _mesa_pointer_set_create(scoreboard);

   _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);

   _mesa_set_add(def_uses, def->parent_instr);

   nir_foreach_use(src, def) {
      _mesa_set_add(def_uses, src->parent_instr);
   }

   /* XXX: Handle if uses */

   return true;
}

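/**
 * Allocates the scoreboard and populates the remaining-uses sets for every
 * register and SSA def in the shader.
 */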
static nir_schedule_scoreboard *
nir_schedule_get_scoreboard(nir_shader *shader,
                            const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);

   scoreboard->shader = shader;
   scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
   scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
   scoreboard->options = options;
   scoreboard->pressure = 0;

   nir_foreach_function(function, shader) {
      nir_foreach_register(reg, &function->impl->registers) {
         struct set *register_uses =
            _mesa_pointer_set_create(scoreboard);

         _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);

         nir_foreach_use(src, reg) {
            _mesa_set_add(register_uses, src->parent_instr);
         }

         /* XXX: Handle if uses */

         nir_foreach_def(dest, reg) {
            _mesa_set_add(register_uses, dest->reg.parent_instr);
         }
      }

      nir_foreach_block(block, function->impl) {
         nir_foreach_instr(instr, block) {
            nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
                                scoreboard);
         }

         /* XXX: We're ignoring if uses, which may prioritize scheduling other
          * uses of the if src even when it doesn't help.  That's not many
          * values, though, so meh.
          */
      }
   }

   return scoreboard;
}

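/* Debug check that every tracked use was consumed during scheduling; any
 * leftovers indicate a bookkeeping bug and are printed before asserting.
 */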
static void
nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
{
#ifdef NDEBUG
   return;
#endif

   bool any_uses = false;

   hash_table_foreach(scoreboard->remaining_uses, entry) {
      struct set *remaining_uses = entry->data;

      set_foreach(remaining_uses, instr_entry) {
         if (!any_uses) {
            fprintf(stderr, "Tracked uses remain after scheduling.  "
                    "Affected instructions: \n");
            any_uses = true;
         }
         nir_print_instr(instr_entry->key, stderr);
         fprintf(stderr, "\n");
      }
   }

   assert(!any_uses);
}

/**
 * Schedules the NIR instructions to try to decrease stalls (for example,
 * delaying texture reads) while managing register pressure.
 *
 * The threshold represents "number of NIR register/SSA def channels live
 * before switching the scheduling heuristic to reduce register pressure",
 * since most of our GPU architectures are scalar (extending to vector with a
 * flag wouldn't be hard).  This number should be a bit below the number of
 * registers available (counting any that may be occupied by system value
 * payload values, for example), since the heuristic may not always be able to
 * free a register immediately.  The amount below the limit is up to you to
 * tune.
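 *
 * A minimal usage sketch (not from this file; the threshold value below is a
 * made-up placeholder that a backend would tune for its register file):
 *
 *    nir_schedule_options options = {
 *       .threshold = 64,
 *    };
 *    nir_schedule(shader, &options);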
 */
void
nir_schedule(nir_shader *shader,
             const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
                                                                     options);

   if (debug) {
      fprintf(stderr, "NIR shader before scheduling:\n");
      nir_print_shader(shader, stderr);
   }

   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;

      nir_foreach_block(block, function->impl) {
         nir_schedule_block(scoreboard, block);
      }
   }

   nir_schedule_validate_uses(scoreboard);

   ralloc_free(scoreboard);
}