1/* 2 * Copyright © 2011 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "brw_vec4.h" 25#include "brw_cfg.h" 26#include "brw_eu.h" 27#include "util/u_math.h" 28 29namespace brw { 30 31vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst, 32 const src_reg &src0, const src_reg &src1, 33 const src_reg &src2) 34{ 35 this->opcode = opcode; 36 this->dst = dst; 37 this->src[0] = src0; 38 this->src[1] = src1; 39 this->src[2] = src2; 40 this->saturate = false; 41 this->force_writemask_all = false; 42 this->no_dd_clear = false; 43 this->no_dd_check = false; 44 this->writes_accumulator = false; 45 this->conditional_mod = BRW_CONDITIONAL_NONE; 46 this->predicate = BRW_PREDICATE_NONE; 47 this->predicate_inverse = false; 48 this->target = 0; 49 this->shadow_compare = false; 50 this->eot = false; 51 this->ir = NULL; 52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS; 53 this->header_size = 0; 54 this->flag_subreg = 0; 55 this->mlen = 0; 56 this->base_mrf = 0; 57 this->offset = 0; 58 this->exec_size = 8; 59 this->group = 0; 60 this->size_written = (dst.file == BAD_FILE ? 61 0 : this->exec_size * type_sz(dst.type)); 62 this->annotation = NULL; 63} 64 65vec4_instruction * 66vec4_visitor::emit(vec4_instruction *inst) 67{ 68 inst->ir = this->base_ir; 69 inst->annotation = this->current_annotation; 70 71 this->instructions.push_tail(inst); 72 73 return inst; 74} 75 76vec4_instruction * 77vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst, 78 vec4_instruction *new_inst) 79{ 80 new_inst->ir = inst->ir; 81 new_inst->annotation = inst->annotation; 82 83 inst->insert_before(block, new_inst); 84 85 return inst; 86} 87 88vec4_instruction * 89vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, 90 const src_reg &src1, const src_reg &src2) 91{ 92 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2)); 93} 94 95 96vec4_instruction * 97vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, 98 const src_reg &src1) 99{ 100 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1)); 101} 102 103vec4_instruction * 104vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) 105{ 106 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0)); 107} 108 109vec4_instruction * 110vec4_visitor::emit(enum opcode opcode, const dst_reg &dst) 111{ 112 return emit(new(mem_ctx) vec4_instruction(opcode, dst)); 113} 114 115vec4_instruction * 116vec4_visitor::emit(enum opcode opcode) 117{ 118 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg())); 119} 120 121#define ALU1(op) \ 122 vec4_instruction * \ 123 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \ 124 { \ 125 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \ 126 } 127 128#define ALU2(op) \ 129 vec4_instruction * \ 130 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \ 131 const src_reg &src1) \ 132 { \ 133 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \ 134 src0, src1); \ 135 } 136 137#define ALU2_ACC(op) \ 138 vec4_instruction * \ 139 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \ 140 const src_reg &src1) \ 141 { \ 142 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \ 143 BRW_OPCODE_##op, dst, src0, src1); \ 144 inst->writes_accumulator = true; \ 145 return inst; \ 146 } 147 148#define ALU3(op) \ 149 vec4_instruction * \ 150 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \ 151 const src_reg &src1, const src_reg &src2) \ 152 { \ 153 assert(devinfo->ver >= 6); \ 154 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \ 155 src0, src1, src2); \ 156 } 157 158ALU1(NOT) 159ALU1(MOV) 160ALU1(FRC) 161ALU1(RNDD) 162ALU1(RNDE) 163ALU1(RNDZ) 164ALU1(F32TO16) 165ALU1(F16TO32) 166ALU2(ADD) 167ALU2(MUL) 168ALU2_ACC(MACH) 169ALU2(AND) 170ALU2(OR) 171ALU2(XOR) 172ALU2(DP3) 173ALU2(DP4) 174ALU2(DPH) 175ALU2(SHL) 176ALU2(SHR) 177ALU2(ASR) 178ALU3(LRP) 179ALU1(BFREV) 180ALU3(BFE) 181ALU2(BFI1) 182ALU3(BFI2) 183ALU1(FBH) 184ALU1(FBL) 185ALU1(CBIT) 186ALU3(MAD) 187ALU2_ACC(ADDC) 188ALU2_ACC(SUBB) 189ALU2(MAC) 190ALU1(DIM) 191 192/** Gfx4 predicated IF. */ 193vec4_instruction * 194vec4_visitor::IF(enum brw_predicate predicate) 195{ 196 vec4_instruction *inst; 197 198 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF); 199 inst->predicate = predicate; 200 201 return inst; 202} 203 204/** Gfx6 IF with embedded comparison. */ 205vec4_instruction * 206vec4_visitor::IF(src_reg src0, src_reg src1, 207 enum brw_conditional_mod condition) 208{ 209 assert(devinfo->ver == 6); 210 211 vec4_instruction *inst; 212 213 resolve_ud_negate(&src0); 214 resolve_ud_negate(&src1); 215 216 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(), 217 src0, src1); 218 inst->conditional_mod = condition; 219 220 return inst; 221} 222 223/** 224 * CMP: Sets the low bit of the destination channels with the result 225 * of the comparison, while the upper bits are undefined, and updates 226 * the flag register with the packed 16 bits of the result. 227 */ 228vec4_instruction * 229vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, 230 enum brw_conditional_mod condition) 231{ 232 vec4_instruction *inst; 233 234 /* Take the instruction: 235 * 236 * CMP null<d> src0<f> src1<f> 237 * 238 * Original gfx4 does type conversion to the destination type before 239 * comparison, producing garbage results for floating point comparisons. 240 * 241 * The destination type doesn't matter on newer generations, so we set the 242 * type to match src0 so we can compact the instruction. 243 */ 244 dst.type = src0.type; 245 246 resolve_ud_negate(&src0); 247 resolve_ud_negate(&src1); 248 249 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1); 250 inst->conditional_mod = condition; 251 252 return inst; 253} 254 255vec4_instruction * 256vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index) 257{ 258 vec4_instruction *inst; 259 260 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GFX4_SCRATCH_READ, 261 dst, index); 262 inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver) + 1; 263 inst->mlen = 2; 264 265 return inst; 266} 267 268vec4_instruction * 269vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src, 270 const src_reg &index) 271{ 272 vec4_instruction *inst; 273 274 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GFX4_SCRATCH_WRITE, 275 dst, src, index); 276 inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver); 277 inst->mlen = 3; 278 279 return inst; 280} 281 282src_reg 283vec4_visitor::fix_3src_operand(const src_reg &src) 284{ 285 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be 286 * able to use vertical stride of zero to replicate the vec4 uniform, like 287 * 288 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7] 289 * 290 * But you can't, since vertical stride is always four in three-source 291 * instructions. Instead, insert a MOV instruction to do the replication so 292 * that the three-source instruction can consume it. 293 */ 294 295 /* The MOV is only needed if the source is a uniform or immediate. */ 296 if (src.file != UNIFORM && src.file != IMM) 297 return src; 298 299 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle)) 300 return src; 301 302 dst_reg expanded = dst_reg(this, glsl_type::vec4_type); 303 expanded.type = src.type; 304 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src); 305 return src_reg(expanded); 306} 307 308src_reg 309vec4_visitor::fix_math_operand(const src_reg &src) 310{ 311 if (devinfo->ver < 6 || src.file == BAD_FILE) 312 return src; 313 314 /* The gfx6 math instruction ignores the source modifiers -- 315 * swizzle, abs, negate, and at least some parts of the register 316 * region description. 317 * 318 * Rather than trying to enumerate all these cases, *always* expand the 319 * operand to a temp GRF for gfx6. 320 * 321 * For gfx7, keep the operand as-is, except if immediate, which gfx7 still 322 * can't use. 323 */ 324 325 if (devinfo->ver == 7 && src.file != IMM) 326 return src; 327 328 dst_reg expanded = dst_reg(this, glsl_type::vec4_type); 329 expanded.type = src.type; 330 emit(MOV(expanded, src)); 331 return src_reg(expanded); 332} 333 334vec4_instruction * 335vec4_visitor::emit_math(enum opcode opcode, 336 const dst_reg &dst, 337 const src_reg &src0, const src_reg &src1) 338{ 339 vec4_instruction *math = 340 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1)); 341 342 if (devinfo->ver == 6 && dst.writemask != WRITEMASK_XYZW) { 343 /* MATH on Gfx6 must be align1, so we can't do writemasks. */ 344 math->dst = dst_reg(this, glsl_type::vec4_type); 345 math->dst.type = dst.type; 346 math = emit(MOV(dst, src_reg(math->dst))); 347 } else if (devinfo->ver < 6) { 348 math->base_mrf = 1; 349 math->mlen = src1.file == BAD_FILE ? 1 : 2; 350 } 351 352 return math; 353} 354 355void 356vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0) 357{ 358 if (devinfo->ver < 7) { 359 unreachable("ir_unop_pack_half_2x16 should be lowered"); 360 } 361 362 assert(dst.type == BRW_REGISTER_TYPE_UD); 363 assert(src0.type == BRW_REGISTER_TYPE_F); 364 365 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16: 366 * 367 * Because this instruction does not have a 16-bit floating-point type, 368 * the destination data type must be Word (W). 369 * 370 * The destination must be DWord-aligned and specify a horizontal stride 371 * (HorzStride) of 2. The 16-bit result is stored in the lower word of 372 * each destination channel and the upper word is not modified. 373 * 374 * The above restriction implies that the f32to16 instruction must use 375 * align1 mode, because only in align1 mode is it possible to specify 376 * horizontal stride. We choose here to defy the hardware docs and emit 377 * align16 instructions. 378 * 379 * (I [chadv] did attempt to emit align1 instructions for VS f32to16 380 * instructions. I was partially successful in that the code passed all 381 * tests. However, the code was dubiously correct and fragile, and the 382 * tests were not harsh enough to probe that frailty. Not trusting the 383 * code, I chose instead to remain in align16 mode in defiance of the hw 384 * docs). 385 * 386 * I've [chadv] experimentally confirmed that, on gfx7 hardware and the 387 * simulator, emitting a f32to16 in align16 mode with UD as destination 388 * data type is safe. The behavior differs from that specified in the PRM 389 * in that the upper word of each destination channel is cleared to 0. 390 */ 391 392 dst_reg tmp_dst(this, glsl_type::uvec2_type); 393 src_reg tmp_src(tmp_dst); 394 395#if 0 396 /* Verify the undocumented behavior on which the following instructions 397 * rely. If f32to16 fails to clear the upper word of the X and Y channels, 398 * then the result of the bit-or instruction below will be incorrect. 399 * 400 * You should inspect the disasm output in order to verify that the MOV is 401 * not optimized away. 402 */ 403 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u))); 404#endif 405 406 /* Give tmp the form below, where "." means untouched. 407 * 408 * w z y x w z y x 409 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll| 410 * 411 * That the upper word of each write-channel be 0 is required for the 412 * following bit-shift and bit-or instructions to work. Note that this 413 * relies on the undocumented hardware behavior mentioned above. 414 */ 415 tmp_dst.writemask = WRITEMASK_XY; 416 emit(F32TO16(tmp_dst, src0)); 417 418 /* Give the write-channels of dst the form: 419 * 0xhhhh0000 420 */ 421 tmp_src.swizzle = BRW_SWIZZLE_YYYY; 422 emit(SHL(dst, tmp_src, brw_imm_ud(16u))); 423 424 /* Finally, give the write-channels of dst the form of packHalf2x16's 425 * output: 426 * 0xhhhhllll 427 */ 428 tmp_src.swizzle = BRW_SWIZZLE_XXXX; 429 emit(OR(dst, src_reg(dst), tmp_src)); 430} 431 432void 433vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0) 434{ 435 if (devinfo->ver < 7) { 436 unreachable("ir_unop_unpack_half_2x16 should be lowered"); 437 } 438 439 assert(dst.type == BRW_REGISTER_TYPE_F); 440 assert(src0.type == BRW_REGISTER_TYPE_UD); 441 442 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32: 443 * 444 * Because this instruction does not have a 16-bit floating-point type, 445 * the source data type must be Word (W). The destination type must be 446 * F (Float). 447 * 448 * To use W as the source data type, we must adjust horizontal strides, 449 * which is only possible in align1 mode. All my [chadv] attempts at 450 * emitting align1 instructions for unpackHalf2x16 failed to pass the 451 * Piglit tests, so I gave up. 452 * 453 * I've verified that, on gfx7 hardware and the simulator, it is safe to 454 * emit f16to32 in align16 mode with UD as source data type. 455 */ 456 457 dst_reg tmp_dst(this, glsl_type::uvec2_type); 458 src_reg tmp_src(tmp_dst); 459 460 tmp_dst.writemask = WRITEMASK_X; 461 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu))); 462 463 tmp_dst.writemask = WRITEMASK_Y; 464 emit(SHR(tmp_dst, src0, brw_imm_ud(16u))); 465 466 dst.writemask = WRITEMASK_XY; 467 emit(F16TO32(dst, tmp_src)); 468} 469 470void 471vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0) 472{ 473 /* Instead of splitting the 32-bit integer, shifting, and ORing it back 474 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate 475 * is not suitable to generate the shift values, but we can use the packed 476 * vector float and a type-converting MOV. 477 */ 478 dst_reg shift(this, glsl_type::uvec4_type); 479 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78))); 480 481 dst_reg shifted(this, glsl_type::uvec4_type); 482 src0.swizzle = BRW_SWIZZLE_XXXX; 483 emit(SHR(shifted, src0, src_reg(shift))); 484 485 shifted.type = BRW_REGISTER_TYPE_UB; 486 dst_reg f(this, glsl_type::vec4_type); 487 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted)); 488 489 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f))); 490} 491 492void 493vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0) 494{ 495 /* Instead of splitting the 32-bit integer, shifting, and ORing it back 496 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate 497 * is not suitable to generate the shift values, but we can use the packed 498 * vector float and a type-converting MOV. 499 */ 500 dst_reg shift(this, glsl_type::uvec4_type); 501 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78))); 502 503 dst_reg shifted(this, glsl_type::uvec4_type); 504 src0.swizzle = BRW_SWIZZLE_XXXX; 505 emit(SHR(shifted, src0, src_reg(shift))); 506 507 shifted.type = BRW_REGISTER_TYPE_B; 508 dst_reg f(this, glsl_type::vec4_type); 509 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted)); 510 511 dst_reg scaled(this, glsl_type::vec4_type); 512 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f))); 513 514 dst_reg max(this, glsl_type::vec4_type); 515 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f)); 516 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f)); 517} 518 519void 520vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0) 521{ 522 dst_reg saturated(this, glsl_type::vec4_type); 523 vec4_instruction *inst = emit(MOV(saturated, src0)); 524 inst->saturate = true; 525 526 dst_reg scaled(this, glsl_type::vec4_type); 527 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f))); 528 529 dst_reg rounded(this, glsl_type::vec4_type); 530 emit(RNDE(rounded, src_reg(scaled))); 531 532 dst_reg u(this, glsl_type::uvec4_type); 533 emit(MOV(u, src_reg(rounded))); 534 535 src_reg bytes(u); 536 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes); 537} 538 539void 540vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0) 541{ 542 dst_reg max(this, glsl_type::vec4_type); 543 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f)); 544 545 dst_reg min(this, glsl_type::vec4_type); 546 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f)); 547 548 dst_reg scaled(this, glsl_type::vec4_type); 549 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f))); 550 551 dst_reg rounded(this, glsl_type::vec4_type); 552 emit(RNDE(rounded, src_reg(scaled))); 553 554 dst_reg i(this, glsl_type::ivec4_type); 555 emit(MOV(i, src_reg(rounded))); 556 557 src_reg bytes(i); 558 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes); 559} 560 561/* 562 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 == 563 * false) elements needed to pack a type. 564 */ 565static int 566type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless) 567{ 568 unsigned int i; 569 int size; 570 571 switch (type->base_type) { 572 case GLSL_TYPE_UINT: 573 case GLSL_TYPE_INT: 574 case GLSL_TYPE_FLOAT: 575 case GLSL_TYPE_FLOAT16: 576 case GLSL_TYPE_BOOL: 577 case GLSL_TYPE_DOUBLE: 578 case GLSL_TYPE_UINT16: 579 case GLSL_TYPE_INT16: 580 case GLSL_TYPE_UINT8: 581 case GLSL_TYPE_INT8: 582 case GLSL_TYPE_UINT64: 583 case GLSL_TYPE_INT64: 584 if (type->is_matrix()) { 585 const glsl_type *col_type = type->column_type(); 586 unsigned col_slots = 587 (as_vec4 && col_type->is_dual_slot()) ? 2 : 1; 588 return type->matrix_columns * col_slots; 589 } else { 590 /* Regardless of size of vector, it gets a vec4. This is bad 591 * packing for things like floats, but otherwise arrays become a 592 * mess. Hopefully a later pass over the code can pack scalars 593 * down if appropriate. 594 */ 595 return (as_vec4 && type->is_dual_slot()) ? 2 : 1; 596 } 597 case GLSL_TYPE_ARRAY: 598 assert(type->length > 0); 599 return type_size_xvec4(type->fields.array, as_vec4, bindless) * 600 type->length; 601 case GLSL_TYPE_STRUCT: 602 case GLSL_TYPE_INTERFACE: 603 size = 0; 604 for (i = 0; i < type->length; i++) { 605 size += type_size_xvec4(type->fields.structure[i].type, as_vec4, 606 bindless); 607 } 608 return size; 609 case GLSL_TYPE_SUBROUTINE: 610 return 1; 611 612 case GLSL_TYPE_SAMPLER: 613 case GLSL_TYPE_TEXTURE: 614 /* Samplers and textures take up no register space, since they're baked 615 * in at link time. 616 */ 617 return bindless ? 1 : 0; 618 case GLSL_TYPE_ATOMIC_UINT: 619 return 0; 620 case GLSL_TYPE_IMAGE: 621 return bindless ? 1 : DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4); 622 case GLSL_TYPE_VOID: 623 case GLSL_TYPE_ERROR: 624 case GLSL_TYPE_FUNCTION: 625 unreachable("not reached"); 626 } 627 628 return 0; 629} 630 631/** 632 * Returns the minimum number of vec4 elements needed to pack a type. 633 * 634 * For simple types, it will return 1 (a single vec4); for matrices, the 635 * number of columns; for array and struct, the sum of the vec4_size of 636 * each of its elements; and for sampler and atomic, zero. 637 * 638 * This method is useful to calculate how much register space is needed to 639 * store a particular type. 640 */ 641extern "C" int 642type_size_vec4(const struct glsl_type *type, bool bindless) 643{ 644 return type_size_xvec4(type, true, bindless); 645} 646 647/** 648 * Returns the minimum number of dvec4 elements needed to pack a type. 649 * 650 * For simple types, it will return 1 (a single dvec4); for matrices, the 651 * number of columns; for array and struct, the sum of the dvec4_size of 652 * each of its elements; and for sampler and atomic, zero. 653 * 654 * This method is useful to calculate how much register space is needed to 655 * store a particular type. 656 * 657 * Measuring double-precision vertex inputs as dvec4 is required because 658 * ARB_vertex_attrib_64bit states that these uses the same number of locations 659 * than the single-precision version. That is, two consecutives dvec4 would be 660 * located in location "x" and location "x+1", not "x+2". 661 * 662 * In order to map vec4/dvec4 vertex inputs in the proper ATTRs, 663 * remap_vs_attrs() will take in account both the location and also if the 664 * type fits in one or two vec4 slots. 665 */ 666extern "C" int 667type_size_dvec4(const struct glsl_type *type, bool bindless) 668{ 669 return type_size_xvec4(type, false, bindless); 670} 671 672src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type) 673{ 674 init(); 675 676 this->file = VGRF; 677 this->nr = v->alloc.allocate(type_size_vec4(type, false)); 678 679 if (type->is_array() || type->is_struct()) { 680 this->swizzle = BRW_SWIZZLE_NOOP; 681 } else { 682 this->swizzle = brw_swizzle_for_size(type->vector_elements); 683 } 684 685 this->type = brw_type_for_base_type(type); 686} 687 688src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size) 689{ 690 assert(size > 0); 691 692 init(); 693 694 this->file = VGRF; 695 this->nr = v->alloc.allocate(type_size_vec4(type, false) * size); 696 697 this->swizzle = BRW_SWIZZLE_NOOP; 698 699 this->type = brw_type_for_base_type(type); 700} 701 702dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type) 703{ 704 init(); 705 706 this->file = VGRF; 707 this->nr = v->alloc.allocate(type_size_vec4(type, false)); 708 709 if (type->is_array() || type->is_struct()) { 710 this->writemask = WRITEMASK_XYZW; 711 } else { 712 this->writemask = (1 << type->vector_elements) - 1; 713 } 714 715 this->type = brw_type_for_base_type(type); 716} 717 718vec4_instruction * 719vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst, 720 src_reg src0, src_reg src1) 721{ 722 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1); 723 inst->conditional_mod = conditionalmod; 724 return inst; 725} 726 727/** 728 * Emits the instructions needed to perform a pull constant load. before_block 729 * and before_inst can be NULL in which case the instruction will be appended 730 * to the end of the instruction list. 731 */ 732void 733vec4_visitor::emit_pull_constant_load_reg(dst_reg dst, 734 src_reg surf_index, 735 src_reg offset_reg, 736 bblock_t *before_block, 737 vec4_instruction *before_inst) 738{ 739 assert((before_inst == NULL && before_block == NULL) || 740 (before_inst && before_block)); 741 742 vec4_instruction *pull; 743 744 if (devinfo->ver >= 7) { 745 dst_reg grf_offset = dst_reg(this, glsl_type::uint_type); 746 747 grf_offset.type = offset_reg.type; 748 749 pull = MOV(grf_offset, offset_reg); 750 751 if (before_inst) 752 emit_before(before_block, before_inst, pull); 753 else 754 emit(pull); 755 756 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GFX7, 757 dst, 758 surf_index, 759 src_reg(grf_offset)); 760 pull->mlen = 1; 761 } else { 762 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD, 763 dst, 764 surf_index, 765 offset_reg); 766 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1; 767 pull->mlen = 1; 768 } 769 770 if (before_inst) 771 emit_before(before_block, before_inst, pull); 772 else 773 emit(pull); 774} 775 776src_reg 777vec4_visitor::emit_uniformize(const src_reg &src) 778{ 779 const src_reg chan_index(this, glsl_type::uint_type); 780 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type), 781 src.type); 782 783 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index)) 784 ->force_writemask_all = true; 785 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index) 786 ->force_writemask_all = true; 787 788 return src_reg(dst); 789} 790 791void 792vec4_visitor::gs_emit_vertex(int /* stream_id */) 793{ 794 unreachable("not reached"); 795} 796 797void 798vec4_visitor::gs_end_primitive() 799{ 800 unreachable("not reached"); 801} 802 803void 804vec4_visitor::emit_ndc_computation() 805{ 806 if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE) 807 return; 808 809 /* Get the position */ 810 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]); 811 812 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */ 813 dst_reg ndc = dst_reg(this, glsl_type::vec4_type); 814 output_reg[BRW_VARYING_SLOT_NDC][0] = ndc; 815 output_num_components[BRW_VARYING_SLOT_NDC][0] = 4; 816 817 current_annotation = "NDC"; 818 dst_reg ndc_w = ndc; 819 ndc_w.writemask = WRITEMASK_W; 820 src_reg pos_w = pos; 821 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W); 822 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w); 823 824 dst_reg ndc_xyz = ndc; 825 ndc_xyz.writemask = WRITEMASK_XYZ; 826 827 emit(MUL(ndc_xyz, pos, src_reg(ndc_w))); 828} 829 830void 831vec4_visitor::emit_psiz_and_flags(dst_reg reg) 832{ 833 if (devinfo->ver < 6 && 834 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) || 835 output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE || 836 devinfo->has_negative_rhw_bug)) { 837 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type); 838 dst_reg header1_w = header1; 839 header1_w.writemask = WRITEMASK_W; 840 841 emit(MOV(header1, brw_imm_ud(0u))); 842 843 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) { 844 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]); 845 846 current_annotation = "Point size"; 847 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11)))); 848 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8))); 849 } 850 851 if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) { 852 current_annotation = "Clipping flags"; 853 dst_reg flags0 = dst_reg(this, glsl_type::uint_type); 854 855 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L)); 856 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0)); 857 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0))); 858 } 859 860 if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) { 861 dst_reg flags1 = dst_reg(this, glsl_type::uint_type); 862 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L)); 863 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0)); 864 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4))); 865 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1))); 866 } 867 868 /* i965 clipping workaround: 869 * 1) Test for -ve rhw 870 * 2) If set, 871 * set ndc = (0,0,0,0) 872 * set ucp[6] = 1 873 * 874 * Later, clipping will detect ucp[6] and ensure the primitive is 875 * clipped against all fixed planes. 876 */ 877 if (devinfo->has_negative_rhw_bug && 878 output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) { 879 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]); 880 ndc_w.swizzle = BRW_SWIZZLE_WWWW; 881 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L)); 882 vec4_instruction *inst; 883 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6))); 884 inst->predicate = BRW_PREDICATE_NORMAL; 885 output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F; 886 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f))); 887 inst->predicate = BRW_PREDICATE_NORMAL; 888 } 889 890 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1))); 891 } else if (devinfo->ver < 6) { 892 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u))); 893 } else { 894 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0))); 895 if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) { 896 dst_reg reg_w = reg; 897 reg_w.writemask = WRITEMASK_W; 898 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]); 899 reg_as_src.type = reg_w.type; 900 reg_as_src.swizzle = brw_swizzle_for_size(1); 901 emit(MOV(reg_w, reg_as_src)); 902 } 903 if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) { 904 dst_reg reg_y = reg; 905 reg_y.writemask = WRITEMASK_Y; 906 reg_y.type = BRW_REGISTER_TYPE_D; 907 output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type; 908 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0]))); 909 } 910 if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) { 911 dst_reg reg_z = reg; 912 reg_z.writemask = WRITEMASK_Z; 913 reg_z.type = BRW_REGISTER_TYPE_D; 914 output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type; 915 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0]))); 916 } 917 } 918} 919 920vec4_instruction * 921vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component) 922{ 923 assert(varying < VARYING_SLOT_MAX); 924 925 unsigned num_comps = output_num_components[varying][component]; 926 if (num_comps == 0) 927 return NULL; 928 929 assert(output_reg[varying][component].type == reg.type); 930 current_annotation = output_reg_annotation[varying]; 931 if (output_reg[varying][component].file != BAD_FILE) { 932 src_reg src = src_reg(output_reg[varying][component]); 933 src.swizzle = BRW_SWZ_COMP_OUTPUT(component); 934 reg.writemask = 935 brw_writemask_for_component_packing(num_comps, component); 936 return emit(MOV(reg, src)); 937 } 938 return NULL; 939} 940 941void 942vec4_visitor::emit_urb_slot(dst_reg reg, int varying) 943{ 944 reg.type = BRW_REGISTER_TYPE_F; 945 output_reg[varying][0].type = reg.type; 946 947 switch (varying) { 948 case VARYING_SLOT_PSIZ: 949 { 950 /* PSIZ is always in slot 0, and is coupled with other flags. */ 951 current_annotation = "indices, point width, clip flags"; 952 emit_psiz_and_flags(reg); 953 break; 954 } 955 case BRW_VARYING_SLOT_NDC: 956 current_annotation = "NDC"; 957 if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) 958 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]))); 959 break; 960 case VARYING_SLOT_POS: 961 current_annotation = "gl_Position"; 962 if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE) 963 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0]))); 964 break; 965 case BRW_VARYING_SLOT_PAD: 966 /* No need to write to this slot */ 967 break; 968 default: 969 for (int i = 0; i < 4; i++) { 970 emit_generic_urb_slot(reg, varying, i); 971 } 972 break; 973 } 974} 975 976static unsigned 977align_interleaved_urb_mlen(const struct intel_device_info *devinfo, 978 unsigned mlen) 979{ 980 if (devinfo->ver >= 6) { 981 /* URB data written (does not include the message header reg) must 982 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5, 983 * section 5.4.3.2.2: URB_INTERLEAVED. 984 * 985 * URB entries are allocated on a multiple of 1024 bits, so an 986 * extra 128 bits written here to make the end align to 256 is 987 * no problem. 988 */ 989 if ((mlen % 2) != 1) 990 mlen++; 991 } 992 993 return mlen; 994} 995 996 997/** 998 * Generates the VUE payload plus the necessary URB write instructions to 999 * output it. 1000 * 1001 * The VUE layout is documented in Volume 2a. 1002 */ 1003void 1004vec4_visitor::emit_vertex() 1005{ 1006 /* MRF 0 is reserved for the debugger, so start with message header 1007 * in MRF 1. 1008 */ 1009 int base_mrf = 1; 1010 int mrf = base_mrf; 1011 /* In the process of generating our URB write message contents, we 1012 * may need to unspill a register or load from an array. Those 1013 * reads would use MRFs 14-15. 1014 */ 1015 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->ver); 1016 1017 /* The following assertion verifies that max_usable_mrf causes an 1018 * even-numbered amount of URB write data, which will meet gfx6's 1019 * requirements for length alignment. 1020 */ 1021 assert ((max_usable_mrf - base_mrf) % 2 == 0); 1022 1023 /* First mrf is the g0-based message header containing URB handles and 1024 * such. 1025 */ 1026 emit_urb_write_header(mrf++); 1027 1028 if (devinfo->ver < 6) { 1029 emit_ndc_computation(); 1030 } 1031 1032 /* We may need to split this up into several URB writes, so do them in a 1033 * loop. 1034 */ 1035 int slot = 0; 1036 bool complete = false; 1037 do { 1038 /* URB offset is in URB row increments, and each of our MRFs is half of 1039 * one of those, since we're doing interleaved writes. 1040 */ 1041 int offset = slot / 2; 1042 1043 mrf = base_mrf + 1; 1044 for (; slot < prog_data->vue_map.num_slots; ++slot) { 1045 emit_urb_slot(dst_reg(MRF, mrf++), 1046 prog_data->vue_map.slot_to_varying[slot]); 1047 1048 /* If this was max_usable_mrf, we can't fit anything more into this 1049 * URB WRITE. Same thing if we reached the maximum length available. 1050 */ 1051 if (mrf > max_usable_mrf || 1052 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) { 1053 slot++; 1054 break; 1055 } 1056 } 1057 1058 complete = slot >= prog_data->vue_map.num_slots; 1059 current_annotation = "URB write"; 1060 vec4_instruction *inst = emit_urb_write_opcode(complete); 1061 inst->base_mrf = base_mrf; 1062 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf); 1063 inst->offset += offset; 1064 } while(!complete); 1065} 1066 1067 1068src_reg 1069vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst, 1070 src_reg *reladdr, int reg_offset) 1071{ 1072 /* Because we store the values to scratch interleaved like our 1073 * vertex data, we need to scale the vec4 index by 2. 1074 */ 1075 int message_header_scale = 2; 1076 1077 /* Pre-gfx6, the message header uses byte offsets instead of vec4 1078 * (16-byte) offset units. 1079 */ 1080 if (devinfo->ver < 6) 1081 message_header_scale *= 16; 1082 1083 if (reladdr) { 1084 /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have 1085 * to multiply the reladdr by 2. Notice that the reg_offset part 1086 * is in units of 16 bytes and is used to select the low/high 16-byte 1087 * chunk of a full dvec4, so we don't want to multiply that part. 1088 */ 1089 src_reg index = src_reg(this, glsl_type::int_type); 1090 if (type_sz(inst->dst.type) < 8) { 1091 emit_before(block, inst, ADD(dst_reg(index), *reladdr, 1092 brw_imm_d(reg_offset))); 1093 emit_before(block, inst, MUL(dst_reg(index), index, 1094 brw_imm_d(message_header_scale))); 1095 } else { 1096 emit_before(block, inst, MUL(dst_reg(index), *reladdr, 1097 brw_imm_d(message_header_scale * 2))); 1098 emit_before(block, inst, ADD(dst_reg(index), index, 1099 brw_imm_d(reg_offset * message_header_scale))); 1100 } 1101 return index; 1102 } else { 1103 return brw_imm_d(reg_offset * message_header_scale); 1104 } 1105} 1106 1107/** 1108 * Emits an instruction before @inst to load the value named by @orig_src 1109 * from scratch space at @base_offset to @temp. 1110 * 1111 * @base_offset is measured in 32-byte units (the size of a register). 1112 */ 1113void 1114vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst, 1115 dst_reg temp, src_reg orig_src, 1116 int base_offset) 1117{ 1118 assert(orig_src.offset % REG_SIZE == 0); 1119 int reg_offset = base_offset + orig_src.offset / REG_SIZE; 1120 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr, 1121 reg_offset); 1122 1123 if (type_sz(orig_src.type) < 8) { 1124 emit_before(block, inst, SCRATCH_READ(temp, index)); 1125 } else { 1126 dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type); 1127 dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F); 1128 emit_before(block, inst, SCRATCH_READ(shuffled_float, index)); 1129 index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1); 1130 vec4_instruction *last_read = 1131 SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index); 1132 emit_before(block, inst, last_read); 1133 shuffle_64bit_data(temp, src_reg(shuffled), false, true, block, last_read); 1134 } 1135} 1136 1137/** 1138 * Emits an instruction after @inst to store the value to be written 1139 * to @orig_dst to scratch space at @base_offset, from @temp. 1140 * 1141 * @base_offset is measured in 32-byte units (the size of a register). 1142 */ 1143void 1144vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst, 1145 int base_offset) 1146{ 1147 assert(inst->dst.offset % REG_SIZE == 0); 1148 int reg_offset = base_offset + inst->dst.offset / REG_SIZE; 1149 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr, 1150 reg_offset); 1151 1152 /* Create a temporary register to store *inst's result in. 1153 * 1154 * We have to be careful in MOVing from our temporary result register in 1155 * the scratch write. If we swizzle from channels of the temporary that 1156 * weren't initialized, it will confuse live interval analysis, which will 1157 * make spilling fail to make progress. 1158 */ 1159 bool is_64bit = type_sz(inst->dst.type) == 8; 1160 const glsl_type *alloc_type = 1161 is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type; 1162 const src_reg temp = swizzle(retype(src_reg(this, alloc_type), 1163 inst->dst.type), 1164 brw_swizzle_for_mask(inst->dst.writemask)); 1165 1166 if (!is_64bit) { 1167 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), 1168 inst->dst.writemask)); 1169 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index); 1170 if (inst->opcode != BRW_OPCODE_SEL) 1171 write->predicate = inst->predicate; 1172 write->ir = inst->ir; 1173 write->annotation = inst->annotation; 1174 inst->insert_after(block, write); 1175 } else { 1176 dst_reg shuffled = dst_reg(this, alloc_type); 1177 vec4_instruction *last = 1178 shuffle_64bit_data(shuffled, temp, true, true, block, inst); 1179 src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F)); 1180 1181 uint8_t mask = 0; 1182 if (inst->dst.writemask & WRITEMASK_X) 1183 mask |= WRITEMASK_XY; 1184 if (inst->dst.writemask & WRITEMASK_Y) 1185 mask |= WRITEMASK_ZW; 1186 if (mask) { 1187 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask)); 1188 1189 vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index); 1190 if (inst->opcode != BRW_OPCODE_SEL) 1191 write->predicate = inst->predicate; 1192 write->ir = inst->ir; 1193 write->annotation = inst->annotation; 1194 last->insert_after(block, write); 1195 } 1196 1197 mask = 0; 1198 if (inst->dst.writemask & WRITEMASK_Z) 1199 mask |= WRITEMASK_XY; 1200 if (inst->dst.writemask & WRITEMASK_W) 1201 mask |= WRITEMASK_ZW; 1202 if (mask) { 1203 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask)); 1204 1205 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr, 1206 reg_offset + 1); 1207 vec4_instruction *write = 1208 SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index); 1209 if (inst->opcode != BRW_OPCODE_SEL) 1210 write->predicate = inst->predicate; 1211 write->ir = inst->ir; 1212 write->annotation = inst->annotation; 1213 last->insert_after(block, write); 1214 } 1215 } 1216 1217 inst->dst.file = temp.file; 1218 inst->dst.nr = temp.nr; 1219 inst->dst.offset %= REG_SIZE; 1220 inst->dst.reladdr = NULL; 1221} 1222 1223/** 1224 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so, 1225 * adds the scratch read(s) before \p inst. The function also checks for 1226 * recursive reladdr scratch accesses, issuing the corresponding scratch 1227 * loads and rewriting reladdr references accordingly. 1228 * 1229 * \return \p src if it did not require a scratch load, otherwise, the 1230 * register holding the result of the scratch load that the caller should 1231 * use to rewrite src. 1232 */ 1233src_reg 1234vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block, 1235 vec4_instruction *inst, src_reg src) 1236{ 1237 /* Resolve recursive reladdr scratch access by calling ourselves 1238 * with src.reladdr 1239 */ 1240 if (src.reladdr) 1241 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst, 1242 *src.reladdr); 1243 1244 /* Now handle scratch access on src */ 1245 if (src.file == VGRF && scratch_loc[src.nr] != -1) { 1246 dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ? 1247 glsl_type::dvec4_type : glsl_type::vec4_type); 1248 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]); 1249 src.nr = temp.nr; 1250 src.offset %= REG_SIZE; 1251 src.reladdr = NULL; 1252 } 1253 1254 return src; 1255} 1256 1257/** 1258 * We can't generally support array access in GRF space, because a 1259 * single instruction's destination can only span 2 contiguous 1260 * registers. So, we send all GRF arrays that get variable index 1261 * access to scratch space. 1262 */ 1263void 1264vec4_visitor::move_grf_array_access_to_scratch() 1265{ 1266 int scratch_loc[this->alloc.count]; 1267 memset(scratch_loc, -1, sizeof(scratch_loc)); 1268 1269 /* First, calculate the set of virtual GRFs that need to be punted 1270 * to scratch due to having any array access on them, and where in 1271 * scratch. 1272 */ 1273 foreach_block_and_inst(block, vec4_instruction, inst, cfg) { 1274 if (inst->dst.file == VGRF && inst->dst.reladdr) { 1275 if (scratch_loc[inst->dst.nr] == -1) { 1276 scratch_loc[inst->dst.nr] = last_scratch; 1277 last_scratch += this->alloc.sizes[inst->dst.nr]; 1278 } 1279 1280 for (src_reg *iter = inst->dst.reladdr; 1281 iter->reladdr; 1282 iter = iter->reladdr) { 1283 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) { 1284 scratch_loc[iter->nr] = last_scratch; 1285 last_scratch += this->alloc.sizes[iter->nr]; 1286 } 1287 } 1288 } 1289 1290 for (int i = 0 ; i < 3; i++) { 1291 for (src_reg *iter = &inst->src[i]; 1292 iter->reladdr; 1293 iter = iter->reladdr) { 1294 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) { 1295 scratch_loc[iter->nr] = last_scratch; 1296 last_scratch += this->alloc.sizes[iter->nr]; 1297 } 1298 } 1299 } 1300 } 1301 1302 /* Now, for anything that will be accessed through scratch, rewrite 1303 * it to load/store. Note that this is a _safe list walk, because 1304 * we may generate a new scratch_write instruction after the one 1305 * we're processing. 1306 */ 1307 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { 1308 /* Set up the annotation tracking for new generated instructions. */ 1309 base_ir = inst->ir; 1310 current_annotation = inst->annotation; 1311 1312 /* First handle scratch access on the dst. Notice we have to handle 1313 * the case where the dst's reladdr also points to scratch space. 1314 */ 1315 if (inst->dst.reladdr) 1316 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst, 1317 *inst->dst.reladdr); 1318 1319 /* Now that we have handled any (possibly recursive) reladdr scratch 1320 * accesses for dst we can safely do the scratch write for dst itself 1321 */ 1322 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1) 1323 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]); 1324 1325 /* Now handle scratch access on any src. In this case, since inst->src[i] 1326 * already is a src_reg, we can just call emit_resolve_reladdr with 1327 * inst->src[i] and it will take care of handling scratch loads for 1328 * both src and src.reladdr (recursively). 1329 */ 1330 for (int i = 0 ; i < 3; i++) { 1331 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst, 1332 inst->src[i]); 1333 } 1334 } 1335} 1336 1337void 1338vec4_visitor::resolve_ud_negate(src_reg *reg) 1339{ 1340 if (reg->type != BRW_REGISTER_TYPE_UD || 1341 !reg->negate) 1342 return; 1343 1344 src_reg temp = src_reg(this, glsl_type::uvec4_type); 1345 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg); 1346 *reg = temp; 1347} 1348 1349vec4_visitor::vec4_visitor(const struct brw_compiler *compiler, 1350 void *log_data, 1351 const struct brw_sampler_prog_key_data *key_tex, 1352 struct brw_vue_prog_data *prog_data, 1353 const nir_shader *shader, 1354 void *mem_ctx, 1355 bool no_spills, 1356 bool debug_enabled) 1357 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base, 1358 debug_enabled), 1359 key_tex(key_tex), 1360 prog_data(prog_data), 1361 fail_msg(NULL), 1362 first_non_payload_grf(0), 1363 ubo_push_start(), 1364 push_length(0), 1365 live_analysis(this), performance_analysis(this), 1366 need_all_constants_in_pull_buffer(false), 1367 no_spills(no_spills), 1368 last_scratch(0) 1369{ 1370 this->failed = false; 1371 1372 this->base_ir = NULL; 1373 this->current_annotation = NULL; 1374 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation)); 1375 1376 memset(this->output_num_components, 0, sizeof(this->output_num_components)); 1377 1378 this->max_grf = devinfo->ver >= 7 ? GFX7_MRF_HACK_START : BRW_MAX_GRF; 1379 1380 this->uniforms = 0; 1381 1382 this->nir_locals = NULL; 1383 this->nir_ssa_values = NULL; 1384} 1385 1386 1387void 1388vec4_visitor::fail(const char *format, ...) 1389{ 1390 va_list va; 1391 char *msg; 1392 1393 if (failed) 1394 return; 1395 1396 failed = true; 1397 1398 va_start(va, format); 1399 msg = ralloc_vasprintf(mem_ctx, format, va); 1400 va_end(va); 1401 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg); 1402 1403 this->fail_msg = msg; 1404 1405 if (unlikely(debug_enabled)) { 1406 fprintf(stderr, "%s", msg); 1407 } 1408} 1409 1410} /* namespace brw */ 1411