1/* -*- c++ -*- */
2/*
3 * Copyright © 2010-2015 Intel Corporation
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25#ifndef BRW_IR_FS_H
26#define BRW_IR_FS_H
27
28#include "brw_shader.h"
29
30class fs_inst;
31
/**
 * Scalar-backend IR register, extending the common backend_reg with the
 * horizontal stride information used by the regioning helpers below.
 */
class fs_reg : public backend_reg {
public:
   DECLARE_RALLOC_CXX_OPERATORS(fs_reg)

   /* Common initialization shared by all constructors (defined out of line). */
   void init();

   fs_reg();
   fs_reg(struct ::brw_reg reg);
   fs_reg(enum brw_reg_file file, int nr);
   fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type);

   /* Whether this register is equal to \p r (defined out of line). */
   bool equals(const fs_reg &r) const;
   /* Equality modulo negation -- NOTE(review): presumably equal except for
    * the negate flag; confirm against the out-of-line definition. */
   bool negative_equals(const fs_reg &r) const;
   /* Whether the region's components are tightly packed (defined out of line). */
   bool is_contiguous() const;

   /**
    * Return the size in bytes of a single logical component of the
    * register assuming the given execution width.
    */
   unsigned component_size(unsigned width) const;

   /** Register region horizontal stride */
   uint8_t stride;
};
56
57static inline fs_reg
58negate(fs_reg reg)
59{
60   assert(reg.file != IMM);
61   reg.negate = !reg.negate;
62   return reg;
63}
64
65static inline fs_reg
66retype(fs_reg reg, enum brw_reg_type type)
67{
68   reg.type = type;
69   return reg;
70}
71
72static inline fs_reg
73byte_offset(fs_reg reg, unsigned delta)
74{
75   switch (reg.file) {
76   case BAD_FILE:
77      break;
78   case VGRF:
79   case ATTR:
80   case UNIFORM:
81      reg.offset += delta;
82      break;
83   case MRF: {
84      const unsigned suboffset = reg.offset + delta;
85      reg.nr += suboffset / REG_SIZE;
86      reg.offset = suboffset % REG_SIZE;
87      break;
88   }
89   case ARF:
90   case FIXED_GRF: {
91      const unsigned suboffset = reg.subnr + delta;
92      reg.nr += suboffset / REG_SIZE;
93      reg.subnr = suboffset % REG_SIZE;
94      break;
95   }
96   case IMM:
97   default:
98      assert(delta == 0);
99   }
100   return reg;
101}
102
103static inline fs_reg
104horiz_offset(const fs_reg &reg, unsigned delta)
105{
106   switch (reg.file) {
107   case BAD_FILE:
108   case UNIFORM:
109   case IMM:
110      /* These only have a single component that is implicitly splatted.  A
111       * horizontal offset should be a harmless no-op.
112       * XXX - Handle vector immediates correctly.
113       */
114      return reg;
115   case VGRF:
116   case MRF:
117   case ATTR:
118      return byte_offset(reg, delta * reg.stride * type_sz(reg.type));
119   case ARF:
120   case FIXED_GRF:
121      if (reg.is_null()) {
122         return reg;
123      } else {
124         const unsigned stride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
125         return byte_offset(reg, delta * stride * type_sz(reg.type));
126      }
127   }
128   unreachable("Invalid register file");
129}
130
131static inline fs_reg
132offset(fs_reg reg, unsigned width, unsigned delta)
133{
134   switch (reg.file) {
135   case BAD_FILE:
136      break;
137   case ARF:
138   case FIXED_GRF:
139   case MRF:
140   case VGRF:
141   case ATTR:
142   case UNIFORM:
143      return byte_offset(reg, delta * reg.component_size(width));
144   case IMM:
145      assert(delta == 0);
146   }
147   return reg;
148}
149
150/**
151 * Get the scalar channel of \p reg given by \p idx and replicate it to all
152 * channels of the result.
153 */
154static inline fs_reg
155component(fs_reg reg, unsigned idx)
156{
157   reg = horiz_offset(reg, idx);
158   reg.stride = 0;
159   return reg;
160}
161
162/**
163 * Return an integer identifying the discrete address space a register is
164 * contained in.  A register is by definition fully contained in the single
165 * reg_space it belongs to, so two registers with different reg_space ids are
166 * guaranteed not to overlap.  Most register files are a single reg_space of
167 * its own, only the VGRF file is composed of multiple discrete address
168 * spaces, one for each VGRF allocation.
169 */
170static inline uint32_t
171reg_space(const fs_reg &r)
172{
173   return r.file << 16 | (r.file == VGRF ? r.nr : 0);
174}
175
176/**
177 * Return the base offset in bytes of a register relative to the start of its
178 * reg_space().
179 */
180static inline unsigned
181reg_offset(const fs_reg &r)
182{
183   return (r.file == VGRF || r.file == IMM ? 0 : r.nr) *
184          (r.file == UNIFORM ? 4 : REG_SIZE) + r.offset +
185          (r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0);
186}
187
188/**
189 * Return the amount of padding in bytes left unused between individual
190 * components of register \p r due to a (horizontal) stride value greater than
191 * one, or zero if components are tightly packed in the register file.
192 */
193static inline unsigned
194reg_padding(const fs_reg &r)
195{
196   const unsigned stride = ((r.file != ARF && r.file != FIXED_GRF) ? r.stride :
197                            r.hstride == 0 ? 0 :
198                            1 << (r.hstride - 1));
199   return (MAX2(1, stride) - 1) * type_sz(r.type);
200}
201
202/**
203 * Return whether the register region starting at \p r and spanning \p dr
204 * bytes could potentially overlap the register region starting at \p s and
205 * spanning \p ds bytes.
206 */
207static inline bool
208regions_overlap(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
209{
210   if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) {
211      fs_reg t = r;
212      t.nr &= ~BRW_MRF_COMPR4;
213      /* COMPR4 regions are translated by the hardware during decompression
214       * into two separate half-regions 4 MRFs apart from each other.
215       */
216      return regions_overlap(t, dr / 2, s, ds) ||
217             regions_overlap(byte_offset(t, 4 * REG_SIZE), dr / 2, s, ds);
218
219   } else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) {
220      return regions_overlap(s, ds, r, dr);
221
222   } else {
223      return reg_space(r) == reg_space(s) &&
224             !(reg_offset(r) + dr <= reg_offset(s) ||
225               reg_offset(s) + ds <= reg_offset(r));
226   }
227}
228
229/**
230 * Check that the register region given by r [r.offset, r.offset + dr[
231 * is fully contained inside the register region given by s
232 * [s.offset, s.offset + ds[.
233 */
234static inline bool
235region_contained_in(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
236{
237   return reg_space(r) == reg_space(s) &&
238          reg_offset(r) >= reg_offset(s) &&
239          reg_offset(r) + dr <= reg_offset(s) + ds;
240}
241
242/**
243 * Return whether the given register region is n-periodic, i.e. whether the
244 * original region remains invariant after shifting it by \p n scalar
245 * channels.
246 */
247static inline bool
248is_periodic(const fs_reg &reg, unsigned n)
249{
250   if (reg.file == BAD_FILE || reg.is_null()) {
251      return true;
252
253   } else if (reg.file == IMM) {
254      const unsigned period = (reg.type == BRW_REGISTER_TYPE_UV ||
255                               reg.type == BRW_REGISTER_TYPE_V ? 8 :
256                               reg.type == BRW_REGISTER_TYPE_VF ? 4 :
257                               1);
258      return n % period == 0;
259
260   } else if (reg.file == ARF || reg.file == FIXED_GRF) {
261      const unsigned period = (reg.hstride == 0 && reg.vstride == 0 ? 1 :
262                               reg.vstride == 0 ? 1 << reg.width :
263                               ~0);
264      return n % period == 0;
265
266   } else {
267      return reg.stride == 0;
268   }
269}
270
/**
 * Return whether the region reads the same value in every enabled channel,
 * i.e. whether it is 1-periodic.
 */
static inline bool
is_uniform(const fs_reg &reg)
{
   return is_periodic(reg, 1);
}
276
277/**
278 * Get the specified 8-component quarter of a register.
279 */
280static inline fs_reg
281quarter(const fs_reg &reg, unsigned idx)
282{
283   assert(idx < 4);
284   return horiz_offset(reg, 8 * idx);
285}
286
287/**
288 * Reinterpret each channel of register \p reg as a vector of values of the
289 * given smaller type and take the i-th subcomponent from each.
290 */
291static inline fs_reg
292subscript(fs_reg reg, brw_reg_type type, unsigned i)
293{
294   assert((i + 1) * type_sz(type) <= type_sz(reg.type));
295
296   if (reg.file == ARF || reg.file == FIXED_GRF) {
297      /* The stride is encoded inconsistently for fixed GRF and ARF registers
298       * as the log2 of the actual vertical and horizontal strides.
299       */
300      const int delta = util_logbase2(type_sz(reg.type)) -
301                        util_logbase2(type_sz(type));
302      reg.hstride += (reg.hstride ? delta : 0);
303      reg.vstride += (reg.vstride ? delta : 0);
304
305   } else if (reg.file == IMM) {
306      unsigned bit_size = type_sz(type) * 8;
307      reg.u64 >>= i * bit_size;
308      reg.u64 &= BITFIELD64_MASK(bit_size);
309      if (bit_size <= 16)
310         reg.u64 |= reg.u64 << 16;
311      return retype(reg, type);
312   } else {
313      reg.stride *= type_sz(reg.type) / type_sz(type);
314   }
315
316   return byte_offset(retype(reg, type), i * type_sz(type));
317}
318
319static inline fs_reg
320horiz_stride(fs_reg reg, unsigned s)
321{
322   reg.stride *= s;
323   return reg;
324}
325
/* Handy constant for a default-constructed (unset) register. */
static const fs_reg reg_undef;
327
/**
 * Scalar-backend IR instruction, extending the common backend_instruction
 * with a variable-length array of fs_reg sources.
 */
class fs_inst : public backend_instruction {
   /* Disallow assignment (declared private; presumably left undefined). */
   fs_inst &operator=(const fs_inst &);

   /* Common initialization shared by all constructors. */
   void init(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
             const fs_reg *src, unsigned sources);

public:
   DECLARE_RALLOC_CXX_OPERATORS(fs_inst)

   fs_inst();
   fs_inst(enum opcode opcode, uint8_t exec_size);
   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst);
   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
           const fs_reg &src0);
   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
           const fs_reg &src0, const fs_reg &src1);
   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
           const fs_reg &src0, const fs_reg &src1, const fs_reg &src2);
   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
           const fs_reg src[], unsigned sources);
   fs_inst(const fs_inst &that);
   ~fs_inst();

   /* Grow or shrink the source array to \p num_sources entries. */
   void resize_sources(uint8_t num_sources);

   bool is_send_from_grf() const;
   /* Whether source \p arg is part of a send-like message payload --
    * NOTE(review): inferred from the name; confirm in the definition. */
   bool is_payload(unsigned arg) const;
   /* Whether the instruction writes only part of its destination region. */
   bool is_partial_write() const;
   /* Number of logical components read from source \p i. */
   unsigned components_read(unsigned i) const;
   /* Number of bytes read from source \p arg. */
   unsigned size_read(int arg) const;
   bool can_do_source_mods(const struct intel_device_info *devinfo) const;
   bool can_do_cmod();
   bool can_change_types() const;
   bool has_source_and_destination_hazard() const;
   unsigned implied_mrf_writes() const;

   /**
    * Return whether \p arg is a control source of a virtual instruction which
    * shouldn't contribute to the execution type and usual regioning
    * restriction calculations of arithmetic instructions.
    */
   bool is_control_source(unsigned arg) const;

   /**
    * Return the subset of flag registers read by the instruction as a bitset
    * with byte granularity.
    */
   unsigned flags_read(const intel_device_info *devinfo) const;

   /**
    * Return the subset of flag registers updated by the instruction (either
    * partially or fully) as a bitset with byte granularity.
    */
   unsigned flags_written(const intel_device_info *devinfo) const;

   fs_reg dst;    /**< Destination register region. */
   fs_reg *src;   /**< Array of \c sources source register regions. */

   uint8_t sources; /**< Number of fs_reg sources. */

   bool last_rt:1;
   bool pi_noperspective:1;   /**< Pixel interpolator noperspective flag */

   tgl_swsb sched; /**< Scheduling info. */
};
393
394/**
395 * Make the execution of \p inst dependent on the evaluation of a possibly
396 * inverted predicate.
397 */
398static inline fs_inst *
399set_predicate_inv(enum brw_predicate pred, bool inverse,
400                  fs_inst *inst)
401{
402   inst->predicate = pred;
403   inst->predicate_inverse = inverse;
404   return inst;
405}
406
407/**
408 * Make the execution of \p inst dependent on the evaluation of a predicate.
409 */
410static inline fs_inst *
411set_predicate(enum brw_predicate pred, fs_inst *inst)
412{
413   return set_predicate_inv(pred, false, inst);
414}
415
416/**
417 * Write the result of evaluating the condition given by \p mod to a flag
418 * register.
419 */
420static inline fs_inst *
421set_condmod(enum brw_conditional_mod mod, fs_inst *inst)
422{
423   inst->conditional_mod = mod;
424   return inst;
425}
426
427/**
428 * Clamp the result of \p inst to the saturation range of its destination
429 * datatype.
430 */
431static inline fs_inst *
432set_saturate(bool saturate, fs_inst *inst)
433{
434   inst->saturate = saturate;
435   return inst;
436}
437
438/**
439 * Return the number of dataflow registers written by the instruction (either
440 * fully or partially) counted from 'floor(reg_offset(inst->dst) /
441 * register_size)'.  The somewhat arbitrary register size unit is 4B for the
442 * UNIFORM and IMM files and 32B for all other files.
443 */
444inline unsigned
445regs_written(const fs_inst *inst)
446{
447   assert(inst->dst.file != UNIFORM && inst->dst.file != IMM);
448   return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE +
449                       inst->size_written -
450                       MIN2(inst->size_written, reg_padding(inst->dst)),
451                       REG_SIZE);
452}
453
454/**
455 * Return the number of dataflow registers read by the instruction (either
456 * fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
457 * register_size)'.  The somewhat arbitrary register size unit is 4B for the
458 * UNIFORM files and 32B for all other files.
459 */
460inline unsigned
461regs_read(const fs_inst *inst, unsigned i)
462{
463   if (inst->src[i].file == IMM)
464      return 1;
465
466   const unsigned reg_size = inst->src[i].file == UNIFORM ? 4 : REG_SIZE;
467   return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size +
468                       inst->size_read(i) -
469                       MIN2(inst->size_read(i), reg_padding(inst->src[i])),
470                       reg_size);
471}
472
/**
 * Return the execution type of \p inst: the widest base type among its data
 * (non-control) sources, with ties broken in favor of floating-point types,
 * falling back to the destination type when no source determines it, and
 * with special promotion rules for mixed half-float operations.
 */
static inline enum brw_reg_type
get_exec_type(const fs_inst *inst)
{
   brw_reg_type exec_type = BRW_REGISTER_TYPE_B;

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].file != BAD_FILE &&
          !inst->is_control_source(i)) {
         const brw_reg_type t = get_exec_type(inst->src[i].type);
         if (type_sz(t) > type_sz(exec_type))
            exec_type = t;
         else if (type_sz(t) == type_sz(exec_type) &&
                  brw_reg_type_is_floating_point(t))
            exec_type = t;
      }
   }

   /* No data source set a type wider than B: use the destination type. */
   if (exec_type == BRW_REGISTER_TYPE_B)
      exec_type = inst->dst.type;

   assert(exec_type != BRW_REGISTER_TYPE_B);

   /* Promotion of the execution type to 32-bit for conversions from or to
    * half-float seems to be consistent with the following text from the
    * Cherryview PRM Vol. 7, "Execution Data Type":
    *
    * "When single precision and half precision floats are mixed between
    *  source operands or between source and destination operand [..] single
    *  precision float is the execution datatype."
    *
    * and from "Register Region Restrictions":
    *
    * "Conversion between Integer and HF (Half Float) must be DWord aligned
    *  and strided by a DWord on the destination."
    */
   if (type_sz(exec_type) == 2 &&
       inst->dst.type != exec_type) {
      if (exec_type == BRW_REGISTER_TYPE_HF)
         exec_type = BRW_REGISTER_TYPE_F;
      else if (inst->dst.type == BRW_REGISTER_TYPE_HF)
         exec_type = BRW_REGISTER_TYPE_D;
   }

   return exec_type;
}
518
519static inline unsigned
520get_exec_type_size(const fs_inst *inst)
521{
522   return type_sz(get_exec_type(inst));
523}
524
525static inline bool
526is_send(const fs_inst *inst)
527{
528   return inst->mlen || inst->is_send_from_grf();
529}
530
531/**
532 * Return whether the instruction isn't an ALU instruction and cannot be
533 * assumed to complete in-order.
534 */
535static inline bool
536is_unordered(const fs_inst *inst)
537{
538   return is_send(inst) || inst->is_math();
539}
540
541/**
542 * Return whether the following regioning restriction applies to the specified
543 * instruction.  From the Cherryview PRM Vol 7. "Register Region
544 * Restrictions":
545 *
546 * "When source or destination datatype is 64b or operation is integer DWord
547 *  multiply, regioning in Align1 must follow these rules:
548 *
549 *  1. Source and Destination horizontal stride must be aligned to the same qword.
550 *  2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
551 *  3. Source and Destination offset must be the same, except the case of
552 *     scalar source."
553 */
554static inline bool
555has_dst_aligned_region_restriction(const intel_device_info *devinfo,
556                                   const fs_inst *inst,
557                                   brw_reg_type dst_type)
558{
559   const brw_reg_type exec_type = get_exec_type(inst);
560   /* Even though the hardware spec claims that "integer DWord multiply"
561    * operations are restricted, empirical evidence and the behavior of the
562    * simulator suggest that only 32x32-bit integer multiplication is
563    * restricted.
564    */
565   const bool is_dword_multiply = !brw_reg_type_is_floating_point(exec_type) &&
566      ((inst->opcode == BRW_OPCODE_MUL &&
567        MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) ||
568       (inst->opcode == BRW_OPCODE_MAD &&
569        MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4));
570
571   if (type_sz(dst_type) > 4 || type_sz(exec_type) > 4 ||
572       (type_sz(exec_type) == 4 && is_dword_multiply))
573      return devinfo->platform == INTEL_PLATFORM_CHV ||
574             intel_device_info_is_9lp(devinfo) ||
575             devinfo->verx10 >= 125;
576
577   else if (brw_reg_type_is_floating_point(dst_type))
578      return devinfo->verx10 >= 125;
579
580   else
581      return false;
582}
583
584static inline bool
585has_dst_aligned_region_restriction(const intel_device_info *devinfo,
586                                   const fs_inst *inst)
587{
588   return has_dst_aligned_region_restriction(devinfo, inst, inst->dst.type);
589}
590
591/**
592 * Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from
593 * the specified register file into a VGRF.
594 *
595 * This implies identity register regions without any source-destination
596 * overlap, but otherwise has no implications on the location of sources and
597 * destination in the register file: Gathering any number of portions from
598 * multiple virtual registers in any order is allowed.
599 */
600inline bool
601is_copy_payload(brw_reg_file file, const fs_inst *inst)
602{
603   if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD ||
604       inst->is_partial_write() || inst->saturate ||
605       inst->dst.file != VGRF)
606      return false;
607
608   for (unsigned i = 0; i < inst->sources; i++) {
609      if (inst->src[i].file != file ||
610          inst->src[i].abs || inst->src[i].negate)
611         return false;
612
613      if (!inst->src[i].is_contiguous())
614         return false;
615
616      if (regions_overlap(inst->dst, inst->size_written,
617                          inst->src[i], inst->size_read(i)))
618         return false;
619   }
620
621   return true;
622}
623
624/**
625 * Like is_copy_payload(), but the instruction is required to copy a single
626 * contiguous block of registers from the given register file into the
627 * destination without any reordering.
628 */
629inline bool
630is_identity_payload(brw_reg_file file, const fs_inst *inst) {
631   if (is_copy_payload(file, inst)) {
632      fs_reg reg = inst->src[0];
633
634      for (unsigned i = 0; i < inst->sources; i++) {
635         reg.type = inst->src[i].type;
636         if (!inst->src[i].equals(reg))
637            return false;
638
639         reg = byte_offset(reg, inst->size_read(i));
640      }
641
642      return true;
643   } else {
644      return false;
645   }
646}
647
648/**
649 * Like is_copy_payload(), but the instruction is required to source data from
650 * at least two disjoint VGRFs.
651 *
652 * This doesn't necessarily rule out the elimination of this instruction
653 * through register coalescing, but due to limitations of the register
654 * coalesce pass it might be impossible to do so directly until a later stage,
655 * when the LOAD_PAYLOAD instruction is unrolled into a sequence of MOV
656 * instructions.
657 */
658inline bool
659is_multi_copy_payload(const fs_inst *inst) {
660   if (is_copy_payload(VGRF, inst)) {
661      for (unsigned i = 0; i < inst->sources; i++) {
662            if (inst->src[i].nr != inst->src[0].nr)
663               return true;
664      }
665   }
666
667   return false;
668}
669
670/**
671 * Like is_identity_payload(), but the instruction is required to copy the
672 * whole contents of a single VGRF into the destination.
673 *
674 * This means that there is a good chance that the instruction will be
675 * eliminated through register coalescing, but it's neither a necessary nor a
676 * sufficient condition for that to happen -- E.g. consider the case where
677 * source and destination registers diverge due to other instructions in the
678 * program overwriting part of their contents, which isn't something we can
679 * predict up front based on a cheap strictly local test of the copy
680 * instruction.
681 */
682inline bool
683is_coalescing_payload(const brw::simple_allocator &alloc, const fs_inst *inst)
684{
685   return is_identity_payload(VGRF, inst) &&
686          inst->src[0].offset == 0 &&
687          alloc.sizes[inst->src[0].nr] * REG_SIZE == inst->size_written;
688}
689
/* Whether \p inst suffers a GRF bank conflict between its sources --
 * NOTE(review): declaration only; see the out-of-line definition for the
 * exact semantics.
 */
bool
has_bank_conflict(const struct brw_isa_info *isa, const fs_inst *inst);
692
693#endif
694