/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_IR_FS_H
#define BRW_IR_FS_H

#include "brw_shader.h"

class fs_inst;

class fs_reg : public backend_reg {
public:
   DECLARE_RALLOC_CXX_OPERATORS(fs_reg)

   void init();

   fs_reg();
   fs_reg(struct ::brw_reg reg);
   fs_reg(enum brw_reg_file file, int nr);
   fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type);

   bool equals(const fs_reg &r) const;
   bool negative_equals(const fs_reg &r) const;
   bool is_contiguous() const;

   /**
    * Return the size in bytes of a single logical component of the
    * register assuming the given execution width.
    */
   unsigned component_size(unsigned width) const;

   /** Register region horizontal stride */
   uint8_t stride;
};

static inline fs_reg
negate(fs_reg reg)
{
   assert(reg.file != IMM);
   reg.negate = !reg.negate;
   return reg;
}

static inline fs_reg
retype(fs_reg reg, enum brw_reg_type type)
{
   reg.type = type;
   return reg;
}

static inline fs_reg
byte_offset(fs_reg reg, unsigned delta)
{
   switch (reg.file) {
   case BAD_FILE:
      break;
   case VGRF:
   case ATTR:
   case UNIFORM:
      reg.offset += delta;
      break;
   case MRF: {
      const unsigned suboffset = reg.offset + delta;
      reg.nr += suboffset / REG_SIZE;
      reg.offset = suboffset % REG_SIZE;
      break;
   }
   case ARF:
   case FIXED_GRF: {
      const unsigned suboffset = reg.subnr + delta;
      reg.nr += suboffset / REG_SIZE;
      reg.subnr = suboffset % REG_SIZE;
      break;
   }
   case IMM:
   default:
      assert(delta == 0);
   }
   return reg;
}

static inline fs_reg
horiz_offset(const fs_reg &reg, unsigned delta)
{
   switch (reg.file) {
   case BAD_FILE:
   case UNIFORM:
   case IMM:
      /* These only have a single component that is implicitly splatted. A
       * horizontal offset should be a harmless no-op.
       * XXX - Handle vector immediates correctly.
       */
      return reg;
   case VGRF:
   case MRF:
   case ATTR:
      return byte_offset(reg, delta * reg.stride * type_sz(reg.type));
   case ARF:
   case FIXED_GRF:
      if (reg.is_null()) {
         return reg;
      } else {
         const unsigned stride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
         return byte_offset(reg, delta * stride * type_sz(reg.type));
      }
   }
   unreachable("Invalid register file");
}

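/* Illustrative usage sketch (hypothetical register, not part of this
 * header): horiz_offset() advances by whole channels while byte_offset()
 * advances by raw bytes, so for a dword-typed VGRF with the default stride
 * of 1 the following two expressions denote the same region:
 *
 *    fs_reg r(VGRF, 7, BRW_REGISTER_TYPE_D);
 *    horiz_offset(r, 4);                              // skip 4 channels
 *    byte_offset(r, 4 * r.stride * type_sz(r.type));  // same 16B offset
 */
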
static inline fs_reg
offset(fs_reg reg, unsigned width, unsigned delta)
{
   switch (reg.file) {
   case BAD_FILE:
      break;
   case ARF:
   case FIXED_GRF:
   case MRF:
   case VGRF:
   case ATTR:
   case UNIFORM:
      return byte_offset(reg, delta * reg.component_size(width));
   case IMM:
      assert(delta == 0);
   }
   return reg;
}

/**
 * Get the scalar channel of \p reg given by \p idx and replicate it to all
 * channels of the result.
 */
static inline fs_reg
component(fs_reg reg, unsigned idx)
{
   reg = horiz_offset(reg, idx);
   reg.stride = 0;
   return reg;
}

/**
 * Return an integer identifying the discrete address space a register is
 * contained in. A register is by definition fully contained in the single
 * reg_space it belongs to, so two registers with different reg_space ids
 * are guaranteed not to overlap. Most register files form a single
 * reg_space of their own; only the VGRF file is composed of multiple
 * discrete address spaces, one for each VGRF allocation.
 */
static inline uint32_t
reg_space(const fs_reg &r)
{
   return r.file << 16 | (r.file == VGRF ? r.nr : 0);
}

/**
 * Return the base offset in bytes of a register relative to the start of
 * its reg_space().
 */
static inline unsigned
reg_offset(const fs_reg &r)
{
   return (r.file == VGRF || r.file == IMM ? 0 : r.nr) *
          (r.file == UNIFORM ? 4 : REG_SIZE) + r.offset +
          (r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0);
}

/**
 * Return the amount of padding in bytes left unused between individual
 * components of register \p r due to a (horizontal) stride value greater
 * than one, or zero if components are tightly packed in the register file.
 */
static inline unsigned
reg_padding(const fs_reg &r)
{
   const unsigned stride = ((r.file != ARF && r.file != FIXED_GRF) ? r.stride :
                            r.hstride == 0 ? 0 :
                            1 << (r.hstride - 1));
   return (MAX2(1, stride) - 1) * type_sz(r.type);
}

/**
 * Return whether the register region starting at \p r and spanning \p dr
 * bytes could potentially overlap the register region starting at \p s and
 * spanning \p ds bytes.
 */
static inline bool
regions_overlap(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
{
   if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) {
      fs_reg t = r;
      t.nr &= ~BRW_MRF_COMPR4;
      /* COMPR4 regions are translated by the hardware during decompression
       * into two separate half-regions 4 MRFs apart from each other.
       */
      return regions_overlap(t, dr / 2, s, ds) ||
             regions_overlap(byte_offset(t, 4 * REG_SIZE), dr / 2, s, ds);

   } else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) {
      return regions_overlap(s, ds, r, dr);

   } else {
      return reg_space(r) == reg_space(s) &&
             !(reg_offset(r) + dr <= reg_offset(s) ||
               reg_offset(s) + ds <= reg_offset(r));
   }
}

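/* Illustrative example (hypothetical registers, not part of this header):
 * two distinct VGRFs never share a reg_space, so regions_overlap() can
 * reject them without comparing offsets, while two regions within the same
 * VGRF are compared as byte intervals:
 *
 *    fs_reg a(VGRF, 1, BRW_REGISTER_TYPE_F);
 *    fs_reg b(VGRF, 2, BRW_REGISTER_TYPE_F);
 *    regions_overlap(a, 32, b, 32);                  // false, disjoint spaces
 *    regions_overlap(a, 32, byte_offset(a, 16), 32); // true, [0, 32[ meets [16, 48[
 */
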
/**
 * Check that the register region given by r [r.offset, r.offset + dr[
 * is fully contained inside the register region given by s
 * [s.offset, s.offset + ds[.
 */
static inline bool
region_contained_in(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
{
   return reg_space(r) == reg_space(s) &&
          reg_offset(r) >= reg_offset(s) &&
          reg_offset(r) + dr <= reg_offset(s) + ds;
}

/**
 * Return whether the given register region is n-periodic, i.e. whether the
 * original region remains invariant after shifting it by \p n scalar
 * channels.
 */
static inline bool
is_periodic(const fs_reg &reg, unsigned n)
{
   if (reg.file == BAD_FILE || reg.is_null()) {
      return true;

   } else if (reg.file == IMM) {
      const unsigned period = (reg.type == BRW_REGISTER_TYPE_UV ||
                               reg.type == BRW_REGISTER_TYPE_V ? 8 :
                               reg.type == BRW_REGISTER_TYPE_VF ? 4 :
                               1);
      return n % period == 0;

   } else if (reg.file == ARF || reg.file == FIXED_GRF) {
      const unsigned period = (reg.hstride == 0 && reg.vstride == 0 ? 1 :
                               reg.vstride == 0 ? 1 << reg.width :
                               ~0);
      return n % period == 0;

   } else {
      return reg.stride == 0;
   }
}

static inline bool
is_uniform(const fs_reg &reg)
{
   return is_periodic(reg, 1);
}

/**
 * Get the specified 8-component quarter of a register.
 */
static inline fs_reg
quarter(const fs_reg &reg, unsigned idx)
{
   assert(idx < 4);
   return horiz_offset(reg, 8 * idx);
}

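/* Illustrative sketch (hypothetical register, not part of this header): in
 * a SIMD16 program quarter(reg, 1) selects channels 8-15, and component()
 * yields a stride-0 scalar region broadcast to every channel:
 *
 *    fs_reg v(VGRF, 3, BRW_REGISTER_TYPE_F);
 *    quarter(v, 1);     // second 8-channel quarter, i.e. horiz_offset(v, 8)
 *    component(v, 2);   // scalar channel 2 splatted, stride == 0
 *    assert(is_uniform(component(v, 2)));  // stride-0 regions are uniform
 */
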
/**
 * Reinterpret each channel of register \p reg as a vector of values of the
 * given smaller type and take the i-th subcomponent from each.
 */
static inline fs_reg
subscript(fs_reg reg, brw_reg_type type, unsigned i)
{
   assert((i + 1) * type_sz(type) <= type_sz(reg.type));

   if (reg.file == ARF || reg.file == FIXED_GRF) {
      /* The stride is encoded inconsistently for fixed GRF and ARF registers
       * as the log2 of the actual vertical and horizontal strides.
       */
      const int delta = util_logbase2(type_sz(reg.type)) -
                        util_logbase2(type_sz(type));
      reg.hstride += (reg.hstride ? delta : 0);
      reg.vstride += (reg.vstride ? delta : 0);

   } else if (reg.file == IMM) {
      unsigned bit_size = type_sz(type) * 8;
      reg.u64 >>= i * bit_size;
      reg.u64 &= BITFIELD64_MASK(bit_size);
      if (bit_size <= 16)
         reg.u64 |= reg.u64 << 16;
      return retype(reg, type);
   } else {
      reg.stride *= type_sz(reg.type) / type_sz(type);
   }

   return byte_offset(retype(reg, type), i * type_sz(type));
}

static inline fs_reg
horiz_stride(fs_reg reg, unsigned s)
{
   reg.stride *= s;
   return reg;
}

static const fs_reg reg_undef;

class fs_inst : public backend_instruction {
   fs_inst &operator=(const fs_inst &);

   void init(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
             const fs_reg *src, unsigned sources);

public:
   DECLARE_RALLOC_CXX_OPERATORS(fs_inst)

   fs_inst();
   fs_inst(enum opcode opcode, uint8_t exec_size);
   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst);
   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
           const fs_reg &src0);
   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
           const fs_reg &src0, const fs_reg &src1);
   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
           const fs_reg &src0, const fs_reg &src1, const fs_reg &src2);
   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
           const fs_reg src[], unsigned sources);
   fs_inst(const fs_inst &that);
   ~fs_inst();

   void resize_sources(uint8_t num_sources);

   bool is_send_from_grf() const;
   bool is_payload(unsigned arg) const;
   bool is_partial_write() const;
   unsigned components_read(unsigned i) const;
   unsigned size_read(int arg) const;
   bool can_do_source_mods(const struct intel_device_info *devinfo) const;
   bool can_do_cmod();
   bool can_change_types() const;
   bool has_source_and_destination_hazard() const;
   unsigned implied_mrf_writes() const;

   /**
    * Return whether \p arg is a control source of a virtual instruction
    * which shouldn't contribute to the execution type and usual regioning
    * restriction calculations of arithmetic instructions.
    */
   bool is_control_source(unsigned arg) const;

   /**
    * Return the subset of flag registers read by the instruction as a
    * bitset with byte granularity.
    */
   unsigned flags_read(const intel_device_info *devinfo) const;

   /**
    * Return the subset of flag registers updated by the instruction (either
    * partially or fully) as a bitset with byte granularity.
    */
   unsigned flags_written(const intel_device_info *devinfo) const;

   fs_reg dst;
   fs_reg *src;

   uint8_t sources; /**< Number of fs_reg sources. */

   bool last_rt:1;
   bool pi_noperspective:1; /**< Pixel interpolator noperspective flag */

   tgl_swsb sched; /**< Scheduling info. */
};

/**
 * Make the execution of \p inst dependent on the evaluation of a possibly
 * inverted predicate.
 */
static inline fs_inst *
set_predicate_inv(enum brw_predicate pred, bool inverse,
                  fs_inst *inst)
{
   inst->predicate = pred;
   inst->predicate_inverse = inverse;
   return inst;
}

/**
 * Make the execution of \p inst dependent on the evaluation of a predicate.
 */
static inline fs_inst *
set_predicate(enum brw_predicate pred, fs_inst *inst)
{
   return set_predicate_inv(pred, false, inst);
}

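/* Illustrative sketch (assumes an fs_builder "bld" in scope, which is not
 * part of this header): the set_*() helpers here return the instruction
 * they modify, so they can wrap an emit in place, e.g. predicating a MOV on
 * the inverse of the flag register:
 *
 *    set_predicate_inv(BRW_PREDICATE_NORMAL, true,
 *                      bld.MOV(dst, src));
 */
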
/**
 * Write the result of evaluating the condition given by \p mod to a flag
 * register.
 */
static inline fs_inst *
set_condmod(enum brw_conditional_mod mod, fs_inst *inst)
{
   inst->conditional_mod = mod;
   return inst;
}

/**
 * Clamp the result of \p inst to the saturation range of its destination
 * datatype.
 */
static inline fs_inst *
set_saturate(bool saturate, fs_inst *inst)
{
   inst->saturate = saturate;
   return inst;
}

/**
 * Return the number of dataflow registers written by the instruction
 * (either fully or partially) counted from 'floor(reg_offset(inst->dst) /
 * register_size)'. The somewhat arbitrary register size unit is 4B for the
 * UNIFORM and IMM files and 32B for all other files.
 */
inline unsigned
regs_written(const fs_inst *inst)
{
   assert(inst->dst.file != UNIFORM && inst->dst.file != IMM);
   return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE +
                       inst->size_written -
                       MIN2(inst->size_written, reg_padding(inst->dst)),
                       REG_SIZE);
}

/**
 * Return the number of dataflow registers read by the instruction (either
 * fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
 * register_size)'. The somewhat arbitrary register size unit is 4B for the
 * UNIFORM files and 32B for all other files.
 */
inline unsigned
regs_read(const fs_inst *inst, unsigned i)
{
   if (inst->src[i].file == IMM)
      return 1;

   const unsigned reg_size = inst->src[i].file == UNIFORM ? 4 : REG_SIZE;
   return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size +
                       inst->size_read(i) -
                       MIN2(inst->size_read(i), reg_padding(inst->src[i])),
                       reg_size);
}

static inline enum brw_reg_type
get_exec_type(const fs_inst *inst)
{
   brw_reg_type exec_type = BRW_REGISTER_TYPE_B;

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].file != BAD_FILE &&
          !inst->is_control_source(i)) {
         const brw_reg_type t = get_exec_type(inst->src[i].type);
         if (type_sz(t) > type_sz(exec_type))
            exec_type = t;
         else if (type_sz(t) == type_sz(exec_type) &&
                  brw_reg_type_is_floating_point(t))
            exec_type = t;
      }
   }

   if (exec_type == BRW_REGISTER_TYPE_B)
      exec_type = inst->dst.type;

   assert(exec_type != BRW_REGISTER_TYPE_B);

   /* Promotion of the execution type to 32-bit for conversions from or to
    * half-float seems to be consistent with the following text from the
    * Cherryview PRM Vol. 7, "Execution Data Type":
    *
    * "When single precision and half precision floats are mixed between
    *  source operands or between source and destination operand [..] single
    *  precision float is the execution datatype."
    *
    * and from "Register Region Restrictions":
    *
    * "Conversion between Integer and HF (Half Float) must be DWord aligned
    *  and strided by a DWord on the destination."
    */
   if (type_sz(exec_type) == 2 &&
       inst->dst.type != exec_type) {
      if (exec_type == BRW_REGISTER_TYPE_HF)
         exec_type = BRW_REGISTER_TYPE_F;
      else if (inst->dst.type == BRW_REGISTER_TYPE_HF)
         exec_type = BRW_REGISTER_TYPE_D;
   }

   return exec_type;
}

static inline unsigned
get_exec_type_size(const fs_inst *inst)
{
   return type_sz(get_exec_type(inst));
}

static inline bool
is_send(const fs_inst *inst)
{
   return inst->mlen || inst->is_send_from_grf();
}

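/* Illustrative example (hypothetical instruction, not part of this header):
 * for an ADD with one W-typed and one D-typed source the execution type is
 * the widest source type, D, so get_exec_type_size() returns 4. Likewise,
 * an HF execution type paired with a non-HF destination is promoted to
 * 32 bits as described in the comment in get_exec_type() above.
 */
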
/**
 * Return whether the instruction isn't an ALU instruction and cannot be
 * assumed to complete in-order.
 */
static inline bool
is_unordered(const fs_inst *inst)
{
   return is_send(inst) || inst->is_math();
}

/**
 * Return whether the following regioning restriction applies to the
 * specified instruction. From the Cherryview PRM Vol. 7, "Register Region
 * Restrictions":
 *
 * "When source or destination datatype is 64b or operation is integer DWord
 *  multiply, regioning in Align1 must follow these rules:
 *
 *  1. Source and Destination horizontal stride must be aligned to the same
 *     qword.
 *  2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
 *  3. Source and Destination offset must be the same, except the case of
 *     scalar source."
 */
static inline bool
has_dst_aligned_region_restriction(const intel_device_info *devinfo,
                                   const fs_inst *inst,
                                   brw_reg_type dst_type)
{
   const brw_reg_type exec_type = get_exec_type(inst);
   /* Even though the hardware spec claims that "integer DWord multiply"
    * operations are restricted, empirical evidence and the behavior of the
    * simulator suggest that only 32x32-bit integer multiplication is
    * restricted.
    */
   const bool is_dword_multiply = !brw_reg_type_is_floating_point(exec_type) &&
      ((inst->opcode == BRW_OPCODE_MUL &&
        MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) ||
       (inst->opcode == BRW_OPCODE_MAD &&
        MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4));

   if (type_sz(dst_type) > 4 || type_sz(exec_type) > 4 ||
       (type_sz(exec_type) == 4 && is_dword_multiply))
      return devinfo->platform == INTEL_PLATFORM_CHV ||
             intel_device_info_is_9lp(devinfo) ||
             devinfo->verx10 >= 125;

   else if (brw_reg_type_is_floating_point(dst_type))
      return devinfo->verx10 >= 125;

   else
      return false;
}

static inline bool
has_dst_aligned_region_restriction(const intel_device_info *devinfo,
                                   const fs_inst *inst)
{
   return has_dst_aligned_region_restriction(devinfo, inst, inst->dst.type);
}

/**
 * Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from
 * the specified register file into a VGRF.
 *
 * This implies identity register regions without any source-destination
 * overlap, but otherwise has no implications on the location of sources and
 * destination in the register file: Gathering any number of portions from
 * multiple virtual registers in any order is allowed.
 */
inline bool
is_copy_payload(brw_reg_file file, const fs_inst *inst)
{
   if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD ||
       inst->is_partial_write() || inst->saturate ||
       inst->dst.file != VGRF)
      return false;

   for (unsigned i = 0; i < inst->sources; i++) {
      if (inst->src[i].file != file ||
          inst->src[i].abs || inst->src[i].negate)
         return false;

      if (!inst->src[i].is_contiguous())
         return false;

      if (regions_overlap(inst->dst, inst->size_written,
                          inst->src[i], inst->size_read(i)))
         return false;
   }

   return true;
}

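/* Illustrative sketch (schematic notation, hypothetical payloads, not part
 * of this header): a LOAD_PAYLOAD gathering two consecutive pieces of a
 * single VGRF in order satisfies is_identity_payload() below, while one
 * gathering from two disjoint VGRFs is at most a multi-copy payload:
 *
 *    LOAD_PAYLOAD dst, { v, byte_offset(v, size_read(0)) }  // identity
 *    LOAD_PAYLOAD dst, { u, v }                             // multi-copy
 */
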
/**
 * Like is_copy_payload(), but the instruction is required to copy a single
 * contiguous block of registers from the given register file into the
 * destination without any reordering.
 */
inline bool
is_identity_payload(brw_reg_file file, const fs_inst *inst)
{
   if (is_copy_payload(file, inst)) {
      fs_reg reg = inst->src[0];

      for (unsigned i = 0; i < inst->sources; i++) {
         reg.type = inst->src[i].type;
         if (!inst->src[i].equals(reg))
            return false;

         reg = byte_offset(reg, inst->size_read(i));
      }

      return true;
   } else {
      return false;
   }
}

/**
 * Like is_copy_payload(), but the instruction is required to source data
 * from at least two disjoint VGRFs.
 *
 * This doesn't necessarily rule out the elimination of this instruction
 * through register coalescing, but due to limitations of the register
 * coalesce pass it might be impossible to do so directly until a later
 * stage, when the LOAD_PAYLOAD instruction is unrolled into a sequence of
 * MOV instructions.
 */
inline bool
is_multi_copy_payload(const fs_inst *inst)
{
   if (is_copy_payload(VGRF, inst)) {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].nr != inst->src[0].nr)
            return true;
      }
   }

   return false;
}

/**
 * Like is_identity_payload(), but the instruction is required to copy the
 * whole contents of a single VGRF into the destination.
 *
 * This means that there is a good chance that the instruction will be
 * eliminated through register coalescing, but it's neither a necessary nor
 * a sufficient condition for that to happen -- E.g. consider the case where
 * source and destination registers diverge due to other instructions in the
 * program overwriting part of their contents, which isn't something we can
 * predict up front based on a cheap strictly local test of the copy
 * instruction.
 */
inline bool
is_coalescing_payload(const brw::simple_allocator &alloc, const fs_inst *inst)
{
   return is_identity_payload(VGRF, inst) &&
          inst->src[0].offset == 0 &&
          alloc.sizes[inst->src[0].nr] * REG_SIZE == inst->size_written;
}

bool
has_bank_conflict(const struct brw_isa_info *isa, const fs_inst *inst);

#endif