/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_scoreboard.cpp
 *
 * Gfx12+ hardware lacks the register scoreboard logic that used to guarantee
 * data coherency between register reads and writes in previous generations.
 * This lowering pass runs after register allocation in order to make up for
 * it.
 *
 * It works by performing global dataflow analysis in order to determine the
 * set of potential dependencies of every instruction in the shader, and then
 * inserts any required SWSB annotations and additional SYNC instructions in
 * order to guarantee data coherency.
 *
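 * As a rough illustrative sketch (the assembly notation below is approximate
 * rather than exact disassembler output), a read-after-write hazard between
 * two in-order ALU instructions is resolved by annotating the consumer with
 * a RegDist count:
 *
 *   add(8)   r10  r2   r4            // producer
 *   mov(8)   r12  r8                 // unrelated in-order instruction
 *   mul(8)   r20  r10  r6   {@2}     // wait for the in-order instruction
 *                                    // issued two slots earlier (the add)
 *
 * whereas a dependency on an out-of-order instruction (e.g. a SEND message)
 * is resolved by allocating an SBID token for the producer and waiting on
 * its destination or sources ({$n.dst}/{$n.src}), possibly through an extra
 * SYNC.NOP instruction when the wait cannot be baked into the consumer's own
 * SWSB annotation.
 *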
 * WARNING - Accesses to the following (rarely used) ARF registers are not
 *           tracked here and require the RegDist SWSB annotation to be set
 *           to 1 by the generator in order to avoid data races:
 *
 *  - sp stack pointer
 *  - sr0 state register
 *  - cr0 control register
 *  - ip instruction pointer
 *  - tm0 timestamp register
 *  - dbg0 debug register
 *  - acc2-9 special accumulator registers on TGL
 *  - mme0-7 math macro extended accumulator registers
 *
 * The following ARF registers don't need to be tracked here because data
 * coherency is still provided transparently by the hardware:
 *
 *  - f0-1 flag registers
 *  - n0 notification register
 *  - tdr0 thread dependency register
 */

#include "brw_fs.h"
#include "brw_cfg.h"

using namespace brw;

namespace {
   /**
    * In-order instruction accounting.
    * @{
    */

   /**
    * Return the RegDist pipeline the hardware will synchronize with if no
    * pipeline information is provided in the SWSB annotation of an
    * instruction (e.g. when TGL_PIPE_NONE is specified in tgl_swsb).
    */
   tgl_pipe
   inferred_sync_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
   {
      if (devinfo->verx10 >= 125) {
         bool has_int_src = false, has_long_src = false;

         if (is_send(inst))
            return TGL_PIPE_NONE;

         for (unsigned i = 0; i < inst->sources; i++) {
            if (inst->src[i].file != BAD_FILE &&
                !inst->is_control_source(i)) {
               const brw_reg_type t = inst->src[i].type;
               has_int_src |= !brw_reg_type_is_floating_point(t);
               has_long_src |= type_sz(t) >= 8;
            }
         }

         return has_long_src ? TGL_PIPE_LONG :
                has_int_src ? TGL_PIPE_INT :
                TGL_PIPE_FLOAT;

      } else {
         return TGL_PIPE_FLOAT;
      }
   }

   /**
    * Return the RegDist pipeline that will execute an instruction, or
    * TGL_PIPE_NONE if the instruction is out-of-order and doesn't use the
    * RegDist synchronization mechanism.
    */
   tgl_pipe
   inferred_exec_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
   {
      const brw_reg_type t = get_exec_type(inst);
      const bool is_dword_multiply = !brw_reg_type_is_floating_point(t) &&
         ((inst->opcode == BRW_OPCODE_MUL &&
           MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) ||
          (inst->opcode == BRW_OPCODE_MAD &&
           MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4));

      if (is_unordered(inst))
         return TGL_PIPE_NONE;
      else if (devinfo->verx10 < 125)
         return TGL_PIPE_FLOAT;
      else if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
               type_sz(t) >= 8)
         return TGL_PIPE_INT;
      else if (inst->opcode == SHADER_OPCODE_BROADCAST &&
               !devinfo->has_64bit_float && type_sz(t) >= 8)
         return TGL_PIPE_INT;
      else if (inst->opcode == FS_OPCODE_PACK_HALF_2x16_SPLIT)
         return TGL_PIPE_FLOAT;
      else if (type_sz(inst->dst.type) >= 8 || type_sz(t) >= 8 ||
               is_dword_multiply) {
         assert(devinfo->has_64bit_float || devinfo->has_64bit_int ||
                devinfo->has_integer_dword_mul);
         return TGL_PIPE_LONG;
      } else if (brw_reg_type_is_floating_point(inst->dst.type))
         return TGL_PIPE_FLOAT;
      else
         return TGL_PIPE_INT;
   }

   /**
    * Index of the \p p pipeline counter in the ordered_address vector defined
    * below.
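    *
    * For reference, IDX(TGL_PIPE_FLOAT) == 0, IDX(TGL_PIPE_ALL) gives the
    * number of per-pipeline counters tracked, and evaluating IDX() on
    * TGL_PIPE_NONE aborts, which is why callers guard against that case
    * before using the macro.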
    */
#define IDX(p) (p >= TGL_PIPE_FLOAT ? unsigned(p - TGL_PIPE_FLOAT) :    \
                (abort(), ~0u))

   /**
    * Number of in-order hardware instructions for pipeline index \p p
    * contained in this IR instruction.  This determines the increment applied
    * to the RegDist counter calculated for any ordered dependency that
    * crosses this instruction.
    */
   unsigned
   ordered_unit(const struct intel_device_info *devinfo, const fs_inst *inst,
                unsigned p)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_SYNC:
      case BRW_OPCODE_DO:
      case SHADER_OPCODE_UNDEF:
      case SHADER_OPCODE_HALT_TARGET:
      case FS_OPCODE_SCHEDULING_FENCE:
         return 0;
      default:
         /* Note that the following is inaccurate for virtual instructions
          * that expand to more in-order instructions than assumed here, but
          * that can only lead to suboptimal execution ordering; data
          * coherency won't be impacted.  Providing exact RegDist counts for
          * each virtual instruction would allow better ALU performance, but
          * it would require keeping this switch statement in perfect sync
          * with the generator in order to avoid data corruption.  The lesson
          * is (again): don't use virtual instructions if you want optimal
          * scheduling.
          */
         if (!is_unordered(inst) && (p == IDX(inferred_exec_pipe(devinfo, inst)) ||
                                     p == IDX(TGL_PIPE_ALL)))
            return 1;
         else
            return 0;
      }
   }

   /**
    * Type for an instruction counter that increments for in-order
    * instructions only, arbitrarily denoted 'jp' throughout this lowering
    * pass in order to distinguish it from the regular instruction counter.
    * This is represented as a vector with an independent counter for each
    * asynchronous ALU pipeline in the EU.
    */
   struct ordered_address {
      /**
       * Construct the ordered address of a dependency known to execute on a
       * single specified pipeline \p p: every component of the vector
       * counter is initialized to INT_MIN (always satisfied) except for
       * component IDX(p), which is set to \p jp0.  If TGL_PIPE_NONE is
       * provided all components remain INT_MIN, and if TGL_PIPE_ALL is
       * provided all components are set to \p jp0.
       */
      ordered_address(tgl_pipe p = TGL_PIPE_NONE, int jp0 = INT_MIN) {
         for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++)
            jp[q] = (p == TGL_PIPE_NONE || (IDX(p) != q && p != TGL_PIPE_ALL) ?
                     INT_MIN : jp0);
      }

      int jp[IDX(TGL_PIPE_ALL)];

      friend bool
      operator==(const ordered_address &jp0, const ordered_address &jp1)
      {
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
            if (jp0.jp[p] != jp1.jp[p])
               return false;
         }

         return true;
      }
   };

   /**
    * Return true if the specified ordered address is trivially satisfied for
    * all pipelines except potentially for the specified pipeline \p p.
    */
   bool
   is_single_pipe(const ordered_address &jp, tgl_pipe p)
   {
      for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
         if ((p == TGL_PIPE_NONE || IDX(p) != q) && jp.jp[q] > INT_MIN)
            return false;
      }

      return true;
   }

   /**
    * Return the number of instructions in the program.
    */
   unsigned
   num_instructions(const backend_shader *shader)
   {
      return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
   }

   /**
    * Calculate the local ordered_address instruction counter at every
    * instruction of the shader for subsequent constant-time look-up.
    */
   ordered_address *
   ordered_inst_addresses(const fs_visitor *shader)
   {
      ordered_address *jps = new ordered_address[num_instructions(shader)];
      ordered_address jp(TGL_PIPE_ALL, 0);
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         jps[ip] = jp;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            jp.jp[p] += ordered_unit(shader->devinfo, inst, p);
         ip++;
      }

      return jps;
   }

   /**
    * Synchronization mode required for data manipulated by in-order
    * instructions.
    *
    * Similar to tgl_sbid_mode, but without SET mode.  Defined as a separate
    * enum for additional type safety.  The hardware doesn't provide control
    * over the synchronization mode for RegDist annotations; this is only used
    * internally in this pass in order to optimize out redundant read
    * dependencies where possible.
    */
   enum tgl_regdist_mode {
      TGL_REGDIST_NULL = 0,
      TGL_REGDIST_SRC = 1,
      TGL_REGDIST_DST = 2
   };

   /**
    * Allow bitwise arithmetic of tgl_regdist_mode enums.
    */
   tgl_regdist_mode
   operator|(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) | unsigned(y));
   }

   tgl_regdist_mode
   operator&(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) & unsigned(y));
   }

   tgl_regdist_mode &
   operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x | y;
   }

   tgl_regdist_mode &
   operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x & y;
   }

   /** @} */

   /**
    * Representation of an equivalence relation among the set of unsigned
    * integers.
    *
    * Its initial state is the identity relation '~' such that i ~ j if and
    * only if i == j for every pair of unsigned integers i and j.
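    *
    * This is essentially a union-find (disjoint-set) structure: link() merges
    * the equivalence classes of two elements, and assign() performs a partial
    * path compression along the way.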
    */
   struct equivalence_relation {
      equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
      {
         for (unsigned i = 0; i < n; i++)
            is[i] = i;
      }

      ~equivalence_relation()
      {
         delete[] is;
      }

      /**
       * Return equivalence class index of the specified element.  Effectively
       * this is the numeric value of an arbitrary representative from the
       * equivalence class.
       *
       * Allows the evaluation of the equivalence relation according to the
       * rule that i ~ j if and only if lookup(i) == lookup(j).
       */
      unsigned
      lookup(unsigned i) const
      {
         if (i < n && is[i] != i)
            return lookup(is[i]);
         else
            return i;
      }

      /**
       * Create an array with the results of the lookup() method for
       * constant-time evaluation.
       */
      unsigned *
      flatten() const
      {
         unsigned *ids = new unsigned[n];

         for (unsigned i = 0; i < n; i++)
            ids[i] = lookup(i);

         return ids;
      }

      /**
       * Mutate the existing equivalence relation minimally by imposing the
       * additional requirement that i ~ j.
       *
       * The algorithm updates the internal representation recursively in
       * order to guarantee transitivity while preserving the previously
       * specified equivalence requirements.
       */
      unsigned
      link(unsigned i, unsigned j)
      {
         const unsigned k = lookup(i);
         assign(i, k);
         assign(j, k);
         return k;
      }

   private:
      equivalence_relation(const equivalence_relation &);

      equivalence_relation &
      operator=(const equivalence_relation &);

      /**
       * Assign the representative of \p from to be equivalent to \p to.
       *
       * At the same time the data structure is partially flattened, as much
       * as possible without increasing the number of recursive calls.
       */
      void
      assign(unsigned from, unsigned to)
      {
         if (from != to) {
            assert(from < n);

            if (is[from] != from)
               assign(is[from], to);

            is[from] = to;
         }
      }

      unsigned *is;
      unsigned n;
   };

   /**
    * Representation of a data dependency between two instructions in the
    * program.
    * @{
    */
   struct dependency {
      /**
       * No dependency information.
       */
      dependency() : ordered(TGL_REGDIST_NULL), jp(),
                     unordered(TGL_SBID_NULL), id(0),
                     exec_all(false) {}

      /**
       * Construct a dependency on the in-order instruction with the provided
       * ordered_address instruction counter.
       */
      dependency(tgl_regdist_mode mode, const ordered_address &jp,
                 bool exec_all) :
         ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
         exec_all(exec_all) {}

      /**
       * Construct a dependency on the out-of-order instruction with the
       * specified synchronization token.
       */
      dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
         ordered(TGL_REGDIST_NULL), jp(), unordered(mode), id(id),
         exec_all(exec_all) {}

      /**
       * Synchronization mode of in-order dependency, or zero if no in-order
       * dependency is present.
       */
      tgl_regdist_mode ordered;

      /**
       * Instruction counter of in-order dependency.
       *
       * For a dependency part of a different block in the program, this is
       * relative to the specific control flow path taken between the
       * dependency and the current block: It is the ordered_address such that
       * the difference between it and the ordered_address of the first
       * instruction of the current block is exactly the number of in-order
       * instructions across that control flow path.  It is not guaranteed to
       * be equal to the local ordered_address of the generating instruction
       * [as returned by ordered_inst_addresses()], except for block-local
       * dependencies.
       */
      ordered_address jp;

      /**
       * Synchronization mode of unordered dependency, or zero if no unordered
       * dependency is present.
       */
      tgl_sbid_mode unordered;

      /** Synchronization token of out-of-order dependency. */
      unsigned id;

      /**
       * Whether the dependency could be run with execution masking disabled,
       * which might lead to the unwanted execution of the generating
       * instruction in cases where a BB is executed with all channels
       * disabled due to hardware bug Wa_1407528679.
       */
      bool exec_all;

      /**
       * Trivial in-order dependency that's always satisfied.
       *
       * Note that unlike a default-constructed dependency() which is also
       * trivially satisfied, this is considered to provide dependency
       * information and can be used to clear a previously pending dependency
       * via shadow().
       */
      static const dependency done;

      friend bool
      operator==(const dependency &dep0, const dependency &dep1)
      {
         return dep0.ordered == dep1.ordered &&
                dep0.jp == dep1.jp &&
                dep0.unordered == dep1.unordered &&
                dep0.id == dep1.id &&
                dep0.exec_all == dep1.exec_all;
      }

      friend bool
      operator!=(const dependency &dep0, const dependency &dep1)
      {
         return !(dep0 == dep1);
      }
   };

   const dependency dependency::done =
        dependency(TGL_REGDIST_DST, ordered_address(), false);

   /**
    * Return whether \p dep contains any dependency information.
    */
   bool
   is_valid(const dependency &dep)
   {
      return dep.ordered || dep.unordered;
   }

   /**
    * Combine \p dep0 and \p dep1 into a single dependency object that is only
    * satisfied when both original dependencies are satisfied.  This might
    * involve updating the equivalence relation \p eq in order to make sure
    * that both out-of-order dependencies are assigned the same hardware SBID
    * as synchronization token.
    */
   dependency
   merge(equivalence_relation &eq,
         const dependency &dep0, const dependency &dep1)
   {
      dependency dep;

      if (dep0.ordered || dep1.ordered) {
         dep.ordered = dep0.ordered | dep1.ordered;
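         /* Keep the largest (i.e. most recent) ordered_address seen for each
          * pipeline, which is the more restrictive of the two requirements.
          */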
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            dep.jp.jp[p] = MAX2(dep0.jp.jp[p], dep1.jp.jp[p]);
      }

      if (dep0.unordered || dep1.unordered) {
         dep.unordered = dep0.unordered | dep1.unordered;
         dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
                          dep1.unordered ? dep1.id : dep0.id);
      }

      dep.exec_all = dep0.exec_all || dep1.exec_all;

      return dep;
   }

   /**
    * Override dependency information of \p dep0 with that of \p dep1.
    */
   dependency
   shadow(const dependency &dep0, const dependency &dep1)
   {
      if (dep0.ordered == TGL_REGDIST_SRC &&
          is_valid(dep1) && !(dep1.unordered & TGL_SBID_DST) &&
                            !(dep1.ordered & TGL_REGDIST_DST)) {
         /* As an optimization (see dependency_for_read()),
          * instructions with a RaR dependency don't synchronize
          * against a previous in-order read, so we need to pass
          * through both ordered dependencies instead of simply
          * dropping the first one.  Otherwise we could encounter a
          * WaR data hazard between OP0 and OP2 in cases like:
          *
          *   OP0 r1:f r0:d
          *   OP1 r2:d r0:d
          *   OP2 r0:d r3:d
          *
          * since only the integer-pipeline r0 dependency from OP1
          * would be visible to OP2, even though OP0 could technically
          * execute after OP1 due to the floating-point and integer
          * pipelines being asynchronous on Gfx12.5+ platforms, so
          * synchronizing OP2 against OP1 would be insufficient.
          */
         dependency dep = dep1;

         dep.ordered |= dep0.ordered;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            dep.jp.jp[p] = MAX2(dep.jp.jp[p], dep0.jp.jp[p]);

         return dep;
      } else {
         return is_valid(dep1) ? dep1 : dep0;
      }
   }

   /**
    * Translate dependency information across the program.
    *
    * This returns a dependency on the same instruction translated to the
    * ordered_address space of a different block.  The correct shift for
    * transporting a dependency across an edge of the CFG is the difference
    * between the local ordered_address of the first instruction of the target
    * block and the local ordered_address of the instruction immediately after
    * the end of the origin block.
    */
   dependency
   transport(dependency dep, int delta[IDX(TGL_PIPE_ALL)])
   {
      if (dep.ordered) {
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
            if (dep.jp.jp[p] > INT_MIN)
               dep.jp.jp[p] += delta[p];
         }
      }

      return dep;
   }

   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction reading the same register location.
    */
   dependency
   dependency_for_read(dependency dep)
   {
      dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction \p inst writing the same register location.
    *
    * This clears any WaR dependency for writes performed from the same
    * pipeline as the read, since there is no possibility for a data hazard.
    */
   dependency
   dependency_for_write(const struct intel_device_info *devinfo,
                        const fs_inst *inst, dependency dep)
   {
      if (!is_unordered(inst) &&
          is_single_pipe(dep.jp, inferred_exec_pipe(devinfo, inst)))
         dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /** @} */

   /**
    * Scoreboard representation.  This keeps track of the data dependencies of
    * registers with GRF granularity.
    */
   class scoreboard {
   public:
      /**
       * Look up the most current data dependency for register \p r.
       */
      dependency
      get(const fs_reg &r) const
      {
         if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
            return *p;
         else
            return dependency();
      }

      /**
       * Specify the most current data dependency for register \p r.
       */
      void
      set(const fs_reg &r, const dependency &d)
      {
         if (dependency *p = dep(r))
            *p = d;
      }

      /**
       * Component-wise merge() of corresponding dependencies from two
       * scoreboard objects.  \sa merge().
       */
      friend scoreboard
      merge(equivalence_relation &eq,
            const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = merge(eq, sb0.accum_dep, sb1.accum_dep);

         return sb;
      }

      /**
       * Component-wise shadow() of corresponding dependencies from two
       * scoreboard objects.  \sa shadow().
       */
      friend scoreboard
      shadow(const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = shadow(sb0.accum_dep, sb1.accum_dep);

         return sb;
      }

      /**
       * Component-wise transport() of dependencies from a scoreboard
       * object.  \sa transport().
       */
      friend scoreboard
      transport(const scoreboard &sb0, int delta[IDX(TGL_PIPE_ALL)])
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

         sb.addr_dep = transport(sb0.addr_dep, delta);
         sb.accum_dep = transport(sb0.accum_dep, delta);

         return sb;
      }

      friend bool
      operator==(const scoreboard &sb0, const scoreboard &sb1)
      {
         for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
            if (sb0.grf_deps[i] != sb1.grf_deps[i])
               return false;
         }

         if (sb0.addr_dep != sb1.addr_dep)
            return false;

         if (sb0.accum_dep != sb1.accum_dep)
            return false;

         return true;
      }

      friend bool
      operator!=(const scoreboard &sb0, const scoreboard &sb1)
      {
         return !(sb0 == sb1);
      }

   private:
      dependency grf_deps[BRW_MAX_GRF];
      dependency addr_dep;
      dependency accum_dep;

      dependency *
      dep(const fs_reg &r)
      {
         const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                               reg_offset(r) / REG_SIZE);

         return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
                 r.file == MRF ? &grf_deps[GFX7_MRF_HACK_START + reg] :
                 r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                                  reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
                 r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                                  reg < BRW_ARF_FLAG ? &accum_dep :
                 NULL);
      }
   };

   /**
    * Dependency list handling.
    * @{
    */
   struct dependency_list {
      dependency_list() : deps(NULL), n(0) {}

      ~dependency_list()
      {
         free(deps);
      }

      void
      push_back(const dependency &dep)
      {
         deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
         deps[n++] = dep;
      }

      unsigned
      size() const
      {
         return n;
      }

      const dependency &
      operator[](unsigned i) const
      {
         assert(i < n);
         return deps[i];
      }

      dependency &
      operator[](unsigned i)
      {
         assert(i < n);
         return deps[i];
      }

   private:
      dependency_list(const dependency_list &);
      dependency_list &
      operator=(const dependency_list &);

      dependency *deps;
      unsigned n;
   };

   /**
    * Add dependency \p dep to the dependency list \p deps of an
    * instruction.
    */
   void
   add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
   {
      if (is_valid(dep)) {
         /* Translate the unordered dependency token first in order to keep
          * the list minimally redundant.
          */
         if (dep.unordered)
            dep.id = ids[dep.id];

         /* Try to combine the specified dependency with any existing ones. */
         for (unsigned i = 0; i < deps.size(); i++) {
            /* Don't combine otherwise matching dependencies if there is an
             * exec_all mismatch which would cause a SET dependency to gain an
             * exec_all flag, since that would prevent it from being baked
             * into the instruction we want to allocate an SBID for.
             */
            if (deps[i].exec_all != dep.exec_all &&
                (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
                (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
               continue;

            if (dep.ordered && deps[i].ordered) {
               for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                  deps[i].jp.jp[p] = MAX2(deps[i].jp.jp[p], dep.jp.jp[p]);

               deps[i].ordered |= dep.ordered;
               deps[i].exec_all |= dep.exec_all;
               dep.ordered = TGL_REGDIST_NULL;
            }

            if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
               deps[i].unordered |= dep.unordered;
               deps[i].exec_all |= dep.exec_all;
               dep.unordered = TGL_SBID_NULL;
            }
         }

         /* Add it to the end of the list if necessary. */
         if (is_valid(dep))
            deps.push_back(dep);
      }
   }

   /**
    * Construct a tgl_swsb annotation encoding any ordered dependencies from
    * the dependency list \p deps of an instruction with ordered_address \p
    * jp.  If \p exec_all is false only dependencies known to be executed with
    * channel masking applied will be considered in the calculation.
    */
   tgl_swsb
   ordered_dependency_swsb(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      tgl_pipe p = TGL_PIPE_NONE;
      unsigned min_dist = ~0u;

      for (unsigned i = 0; i < deps.size(); i++) {
         if (deps[i].ordered && exec_all >= deps[i].exec_all) {
            for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
               const unsigned dist = jp.jp[q] - int64_t(deps[i].jp.jp[q]);
               const unsigned max_dist = (q == IDX(TGL_PIPE_LONG) ? 14 : 10);
               assert(jp.jp[q] > deps[i].jp.jp[q]);
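               /* Dependencies more than max_dist in-order instructions away
                * are assumed to have already drained from the in-order
                * pipeline and need no annotation, and the RegDist field can
                * only encode distances up to 7, hence the clamping below.
                */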
               if (dist <= max_dist) {
                  p = (p && IDX(p) != q ? TGL_PIPE_ALL :
                       tgl_pipe(TGL_PIPE_FLOAT + q));
                  min_dist = MIN3(min_dist, dist, 7);
               }
            }
         }
      }

      return { p ? min_dist : 0, p };
   }

   /**
    * Return whether the dependency list \p deps of an instruction with
    * ordered_address \p jp has any non-trivial ordered dependencies.  If \p
    * exec_all is false only dependencies known to be executed with channel
    * masking applied will be considered in the calculation.
    */
   bool
   find_ordered_dependency(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      return ordered_dependency_swsb(deps, jp, exec_all).regdist;
   }

   /**
    * Return the full tgl_sbid_mode bitset for the first unordered dependency
    * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
    * no such dependency is present.  If \p exec_all is false only
    * dependencies known to be executed with channel masking applied will be
    * considered in the calculation.
    */
   tgl_sbid_mode
   find_unordered_dependency(const dependency_list &deps,
                             tgl_sbid_mode unordered,
                             bool exec_all)
   {
      if (unordered) {
         for (unsigned i = 0; i < deps.size(); i++) {
            if ((unordered & deps[i].unordered) &&
                exec_all >= deps[i].exec_all)
               return deps[i].unordered;
         }
      }

      return TGL_SBID_NULL;
   }

   /**
    * Return the tgl_sbid_mode bitset of an unordered dependency from the list
    * \p deps that can be represented directly in the SWSB annotation of the
    * instruction without additional SYNC instructions, or zero if no such
    * dependency is present.
    */
   tgl_sbid_mode
   baked_unordered_dependency_mode(const struct intel_device_info *devinfo,
                                   const fs_inst *inst,
                                   const dependency_list &deps,
                                   const ordered_address &jp)
   {
      const bool exec_all = inst->force_writemask_all;
      const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
      const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
                                                            exec_all).pipe;

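      /* A SET dependency, if present, corresponds to the SBID allocated for
       * this (out-of-order) instruction itself [see gather_inst_dependencies()]
       * and always takes precedence, since it has to be baked into the
       * instruction's own SWSB annotation rather than a separate SYNC.
       */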
      if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
         return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
      else if (has_ordered && is_unordered(inst))
         return TGL_SBID_NULL;
      else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
               (!has_ordered || ordered_pipe == inferred_sync_pipe(devinfo, inst)))
         return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
      else if (!has_ordered)
         return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
      else
         return TGL_SBID_NULL;
   }

   /**
    * Return whether an ordered dependency from the list \p deps can be
    * represented directly in the SWSB annotation of the instruction without
    * additional SYNC instructions.
    */
   bool
   baked_ordered_dependency_mode(const struct intel_device_info *devinfo,
                                 const fs_inst *inst,
                                 const dependency_list &deps,
                                 const ordered_address &jp)
   {
      const bool exec_all = inst->force_writemask_all;
      const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
      const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
                                                            exec_all).pipe;
      const tgl_sbid_mode unordered_mode =
         baked_unordered_dependency_mode(devinfo, inst, deps, jp);

      if (!has_ordered)
         return false;
      else if (!unordered_mode)
         return true;
      else
         return ordered_pipe == inferred_sync_pipe(devinfo, inst) &&
                unordered_mode == (is_unordered(inst) ? TGL_SBID_SET :
                                   TGL_SBID_DST);
   }

   /** @} */

   /**
    * Shader instruction dependency calculation.
    * @{
    */

   /**
    * Update scoreboard object \p sb to account for the execution of
    * instruction \p inst.
    */
   void
   update_inst_scoreboard(const fs_visitor *shader, const ordered_address *jps,
                          const fs_inst *inst, unsigned ip, scoreboard &sb)
   {
      const bool exec_all = inst->force_writemask_all;
      const struct intel_device_info *devinfo = shader->devinfo;
      const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
      const ordered_address jp = p ? ordered_address(p, jps[ip].jp[IDX(p)]) :
                                     ordered_address();
      const bool is_ordered = ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL));

      /* Track any source registers that may be fetched asynchronously by this
       * instruction, otherwise clear the dependency in order to avoid
       * subsequent redundant synchronization.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         const dependency rd_dep =
            (inst->is_payload(i) ||
             inst->is_math()) ? dependency(TGL_SBID_SRC, ip, exec_all) :
            is_ordered ? dependency(TGL_REGDIST_SRC, jp, exec_all) :
            dependency::done;

         for (unsigned j = 0; j < regs_read(inst, i); j++) {
            const fs_reg r = byte_offset(inst->src[i], REG_SIZE * j);
            sb.set(r, shadow(sb.get(r), rd_dep));
         }
      }

      if (inst->reads_accumulator_implicitly())
         sb.set(brw_acc_reg(8), dependency(TGL_REGDIST_SRC, jp, exec_all));

      if (is_send(inst) && inst->base_mrf != -1) {
         const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all);

         for (unsigned j = 0; j < inst->mlen; j++)
            sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
      }

      /* Track any destination registers of this instruction. */
      const dependency wr_dep =
         is_unordered(inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
         is_ordered ? dependency(TGL_REGDIST_DST, jp, exec_all) :
         dependency();

      if (inst->writes_accumulator_implicitly(devinfo))
         sb.set(brw_acc_reg(8), wr_dep);

      if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
          !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
      }
   }

   /**
    * Calculate scoreboard objects locally that represent any pending (and
    * unconditionally resolved) dependencies at the end of each block of the
    * program.
    */
   scoreboard *
   gather_block_scoreboards(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
         update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);

      return sbs;
   }

   /**
    * Propagate data dependencies globally through the control flow graph
    * until a fixed point is reached.
    *
    * Calculates the set of dependencies potentially pending at the beginning
    * of each block, and returns it as an array of scoreboard objects.
    */
   scoreboard *
   propagate_block_scoreboards(const fs_visitor *shader,
                               const ordered_address *jps,
                               equivalence_relation &eq)
   {
      const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
      scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
      scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];

      for (bool progress = true; progress;) {
         progress = false;

         foreach_block(block, shader->cfg) {
            const scoreboard sb = shadow(in_sbs[block->num],
                                         delta_sbs[block->num]);

            if (sb != out_sbs[block->num]) {
               foreach_list_typed(bblock_link, child_link, link,
                                  &block->children) {
                  scoreboard &in_sb = in_sbs[child_link->block->num];
                  int delta[IDX(TGL_PIPE_ALL)];

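                  /* Per-pipeline shift between the ordered_address space of
                   * this block and that of the child block, used to
                   * transport() dependencies across the CFG edge.
                   */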
                  for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                     delta[p] = jps[child_link->block->start_ip].jp[p]
                        - jps[block->end_ip].jp[p]
                        - ordered_unit(shader->devinfo,
                                       static_cast<const fs_inst *>(block->end()), p);

                  in_sb = merge(eq, in_sb, transport(sb, delta));
               }

               out_sbs[block->num] = sb;
               progress = true;
            }
         }
      }

      delete[] delta_sbs;
      delete[] out_sbs;

      return in_sbs;
   }

   /**
    * Return the list of potential dependencies of each instruction in the
    * shader based on the result of global dependency analysis.
    */
   dependency_list *
   gather_inst_dependencies(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      const struct intel_device_info *devinfo = shader->devinfo;
      equivalence_relation eq(num_instructions(shader));
      scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
      const unsigned *ids = eq.flatten();
      dependency_list *deps = new dependency_list[num_instructions(shader)];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
         scoreboard &sb = sbs[block->num];

         for (unsigned i = 0; i < inst->sources; i++) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
         }

         if (inst->reads_accumulator_implicitly()) {
            /* Wa_22012725308:
             *
             * "When the accumulator registers are used as source and/or
             *  destination, hardware does not ensure prevention of write
             *  after read hazard across execution pipes."
             */
            const dependency dep = sb.get(brw_acc_reg(8));
            if (dep.ordered && !is_single_pipe(dep.jp, p))
               add_dependency(ids, deps[ip], dep);
         }

         if (is_send(inst) && inst->base_mrf != -1) {
            for (unsigned j = 0; j < inst->mlen; j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
         }

         if (is_unordered(inst) && !inst->eot)
            add_dependency(ids, deps[ip],
                           dependency(TGL_SBID_SET, ip, exec_all));

         if (!inst->no_dd_check) {
            if (inst->dst.file != BAD_FILE && !inst->dst.is_null() &&
                !inst->dst.is_accumulator()) {
               for (unsigned j = 0; j < regs_written(inst); j++) {
                  add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst,
                     sb.get(byte_offset(inst->dst, REG_SIZE * j))));
               }
            }

            if (inst->writes_accumulator_implicitly(devinfo) ||
                inst->dst.is_accumulator()) {
               /* Wa_22012725308:
                *
                * "When the accumulator registers are used as source and/or
                *  destination, hardware does not ensure prevention of write
                *  after read hazard across execution pipes."
                */
               const dependency dep = sb.get(brw_acc_reg(8));
               if (dep.ordered && !is_single_pipe(dep.jp, p))
                  add_dependency(ids, deps[ip], dep);
            }

            if (is_send(inst) && inst->base_mrf != -1) {
               for (unsigned j = 0; j < inst->implied_mrf_writes(); j++)
                  add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst,
                     sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
            }
         }

         update_inst_scoreboard(shader, jps, inst, ip, sb);
         ip++;
      }

      delete[] sbs;
      delete[] ids;

      return deps;
   }

   /** @} */

   /**
    * Allocate SBID tokens to track the execution of every out-of-order
    * instruction of the shader.
    */
   dependency_list *
   allocate_inst_dependencies(const fs_visitor *shader,
                              const dependency_list *deps0)
   {
      /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
       *       shaders with a large number of SEND messages.
       */

      /* Allocate a translation table from unordered dependency IDs to
       * hardware SBIDs, with as many entries as there are instructions in
       * the shader, which is the maximum number of unordered IDs we can
       * find in the program.
       */
      unsigned *ids = new unsigned[num_instructions(shader)];
      for (unsigned ip = 0; ip < num_instructions(shader); ip++)
         ids[ip] = ~0u;

      dependency_list *deps1 = new dependency_list[num_instructions(shader)];
      unsigned next_id = 0;

      for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
         for (unsigned i = 0; i < deps0[ip].size(); i++) {
            const dependency &dep = deps0[ip][i];

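            /* Hand out hardware SBIDs round-robin from the 16-entry token
             * pool (hence the 0xf mask), reusing tokens once the pool is
             * exhausted.
             */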
            if (dep.unordered && ids[dep.id] == ~0u)
               ids[dep.id] = (next_id++) & 0xf;

            add_dependency(ids, deps1[ip], dep);
         }
      }

      delete[] ids;

      return deps1;
   }

   /**
    * Emit dependency information provided by \p deps into the shader,
    * inserting additional SYNC instructions for dependencies that can't be
    * represented directly by annotating existing instructions.
    */
   void
   emit_inst_dependencies(fs_visitor *shader,
                          const ordered_address *jps,
                          const dependency_list *deps)
   {
      const struct intel_device_info *devinfo = shader->devinfo;
      unsigned ip = 0;

      foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         const bool ordered_mode =
            baked_ordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
         const tgl_sbid_mode unordered_mode =
            baked_unordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
         tgl_swsb swsb = !ordered_mode ? tgl_swsb() :
            ordered_dependency_swsb(deps[ip], jps[ip], exec_all);

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.unordered) {
               if (unordered_mode == dep.unordered &&
                   exec_all >= dep.exec_all && !swsb.mode) {
                  /* Bake unordered dependency into the instruction's SWSB if
                   * possible, except in cases where the current instruction
                   * isn't marked NoMask but the dependency is, since that
                   * might lead to data coherency issues due to
                   * Wa_1407528679.
                   */
                  swsb.sbid = dep.id;
                  swsb.mode = dep.unordered;
               } else {
                  /* Emit dependency into the SWSB of an extra SYNC
                   * instruction.
                   */
                  const fs_builder ibld = fs_builder(shader, block, inst)
                                          .exec_all().group(1, 0);
                  fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                            brw_imm_ud(TGL_SYNC_NOP));
                  sync->sched.sbid = dep.id;
                  sync->sched.mode = dep.unordered;
                  assert(!(sync->sched.mode & TGL_SBID_SET));
               }
            }
         }

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.ordered &&
                find_ordered_dependency(deps[ip], jps[ip], true) &&
                (!ordered_mode || dep.exec_all > exec_all)) {
               /* If the current instruction is not marked NoMask but an
                * ordered dependency is, perform the synchronization as a
                * separate NoMask SYNC instruction in order to avoid data
                * coherency issues due to Wa_1407528679.  The similar
                * scenario with unordered dependencies should have been
                * handled above.
                */
               const fs_builder ibld = fs_builder(shader, block, inst)
                                       .exec_all().group(1, 0);
               fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                         brw_imm_ud(TGL_SYNC_NOP));
               sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
               break;
            }
         }

         /* Update the IR. */
         inst->sched = swsb;
         inst->no_dd_check = inst->no_dd_clear = false;
         ip++;
      }
   }
}

bool
fs_visitor::lower_scoreboard()
{
   if (devinfo->ver >= 12) {
      const ordered_address *jps = ordered_inst_addresses(this);
      const dependency_list *deps0 = gather_inst_dependencies(this, jps);
      const dependency_list *deps1 = allocate_inst_dependencies(this, deps0);
      emit_inst_dependencies(this, jps, deps1);
      delete[] deps1;
      delete[] deps0;
      delete[] jps;
   }

   return true;
}