/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_nir.h"
#include "brw_vec4.h"
#include "brw_vec4_builder.h"
#include "brw_vec4_surface_builder.h"
#include "brw_eu.h"

using namespace brw;
using namespace brw::surface_access;

namespace brw {

void
vec4_visitor::emit_nir_code()
{
   if (nir->num_uniforms > 0)
      nir_setup_uniforms();

   nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));
}

void
vec4_visitor::nir_setup_uniforms()
{
   uniforms = nir->num_uniforms / 16;
}

void
vec4_visitor::nir_emit_impl(nir_function_impl *impl)
{
   nir_locals = ralloc_array(mem_ctx, dst_reg, impl->reg_alloc);
   for (unsigned i = 0; i < impl->reg_alloc; i++) {
      nir_locals[i] = dst_reg();
   }

   foreach_list_typed(nir_register, reg, node, &impl->registers) {
      unsigned array_elems =
         reg->num_array_elems == 0 ?
1 : reg->num_array_elems; 61 const unsigned num_regs = array_elems * DIV_ROUND_UP(reg->bit_size, 32); 62 nir_locals[reg->index] = dst_reg(VGRF, alloc.allocate(num_regs)); 63 64 if (reg->bit_size == 64) 65 nir_locals[reg->index].type = BRW_REGISTER_TYPE_DF; 66 } 67 68 nir_ssa_values = ralloc_array(mem_ctx, dst_reg, impl->ssa_alloc); 69 70 nir_emit_cf_list(&impl->body); 71} 72 73void 74vec4_visitor::nir_emit_cf_list(exec_list *list) 75{ 76 exec_list_validate(list); 77 foreach_list_typed(nir_cf_node, node, node, list) { 78 switch (node->type) { 79 case nir_cf_node_if: 80 nir_emit_if(nir_cf_node_as_if(node)); 81 break; 82 83 case nir_cf_node_loop: 84 nir_emit_loop(nir_cf_node_as_loop(node)); 85 break; 86 87 case nir_cf_node_block: 88 nir_emit_block(nir_cf_node_as_block(node)); 89 break; 90 91 default: 92 unreachable("Invalid CFG node block"); 93 } 94 } 95} 96 97void 98vec4_visitor::nir_emit_if(nir_if *if_stmt) 99{ 100 /* First, put the condition in f0 */ 101 src_reg condition = get_nir_src(if_stmt->condition, BRW_REGISTER_TYPE_D, 1); 102 vec4_instruction *inst = emit(MOV(dst_null_d(), condition)); 103 inst->conditional_mod = BRW_CONDITIONAL_NZ; 104 105 /* We can just predicate based on the X channel, as the condition only 106 * goes on its own line */ 107 emit(IF(BRW_PREDICATE_ALIGN16_REPLICATE_X)); 108 109 nir_emit_cf_list(&if_stmt->then_list); 110 111 /* note: if the else is empty, dead CF elimination will remove it */ 112 emit(BRW_OPCODE_ELSE); 113 114 nir_emit_cf_list(&if_stmt->else_list); 115 116 emit(BRW_OPCODE_ENDIF); 117} 118 119void 120vec4_visitor::nir_emit_loop(nir_loop *loop) 121{ 122 emit(BRW_OPCODE_DO); 123 124 nir_emit_cf_list(&loop->body); 125 126 emit(BRW_OPCODE_WHILE); 127} 128 129void 130vec4_visitor::nir_emit_block(nir_block *block) 131{ 132 nir_foreach_instr(instr, block) { 133 nir_emit_instr(instr); 134 } 135} 136 137void 138vec4_visitor::nir_emit_instr(nir_instr *instr) 139{ 140 base_ir = instr; 141 142 switch (instr->type) { 143 case nir_instr_type_load_const: 144 nir_emit_load_const(nir_instr_as_load_const(instr)); 145 break; 146 147 case nir_instr_type_intrinsic: 148 nir_emit_intrinsic(nir_instr_as_intrinsic(instr)); 149 break; 150 151 case nir_instr_type_alu: 152 nir_emit_alu(nir_instr_as_alu(instr)); 153 break; 154 155 case nir_instr_type_jump: 156 nir_emit_jump(nir_instr_as_jump(instr)); 157 break; 158 159 case nir_instr_type_tex: 160 nir_emit_texture(nir_instr_as_tex(instr)); 161 break; 162 163 case nir_instr_type_ssa_undef: 164 nir_emit_undef(nir_instr_as_ssa_undef(instr)); 165 break; 166 167 default: 168 unreachable("VS instruction not yet implemented by NIR->vec4"); 169 } 170} 171 172static dst_reg 173dst_reg_for_nir_reg(vec4_visitor *v, nir_register *nir_reg, 174 unsigned base_offset, nir_src *indirect) 175{ 176 dst_reg reg; 177 178 reg = v->nir_locals[nir_reg->index]; 179 if (nir_reg->bit_size == 64) 180 reg.type = BRW_REGISTER_TYPE_DF; 181 reg = offset(reg, 8, base_offset); 182 if (indirect) { 183 reg.reladdr = 184 new(v->mem_ctx) src_reg(v->get_nir_src(*indirect, 185 BRW_REGISTER_TYPE_D, 186 1)); 187 } 188 return reg; 189} 190 191dst_reg 192vec4_visitor::get_nir_dest(const nir_dest &dest) 193{ 194 if (dest.is_ssa) { 195 dst_reg dst = 196 dst_reg(VGRF, alloc.allocate(DIV_ROUND_UP(dest.ssa.bit_size, 32))); 197 if (dest.ssa.bit_size == 64) 198 dst.type = BRW_REGISTER_TYPE_DF; 199 nir_ssa_values[dest.ssa.index] = dst; 200 return dst; 201 } else { 202 return dst_reg_for_nir_reg(this, dest.reg.reg, dest.reg.base_offset, 203 dest.reg.indirect); 204 } 205} 206 
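/* Typed variants of get_nir_dest()/get_nir_src() follow: they retype the
 * register returned by the base helpers, either to an explicit brw_reg_type
 * or to the brw type corresponding to a nir_alu_type.  Illustrative usage
 * only (mirrors the calls made elsewhere in this file):
 *
 *    src_reg op = get_nir_src(instr->src[0].src, BRW_REGISTER_TYPE_F, 4);
 *    dst_reg dst = get_nir_dest(instr->dest.dest, nir_type_float32);
 */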
dst_reg
vec4_visitor::get_nir_dest(const nir_dest &dest, enum brw_reg_type type)
{
   return retype(get_nir_dest(dest), type);
}

dst_reg
vec4_visitor::get_nir_dest(const nir_dest &dest, nir_alu_type type)
{
   return get_nir_dest(dest, brw_type_for_nir_type(devinfo, type));
}

src_reg
vec4_visitor::get_nir_src(const nir_src &src, enum brw_reg_type type,
                          unsigned num_components)
{
   dst_reg reg;

   if (src.is_ssa) {
      assert(src.ssa != NULL);
      reg = nir_ssa_values[src.ssa->index];
   }
   else {
      reg = dst_reg_for_nir_reg(this, src.reg.reg, src.reg.base_offset,
                                src.reg.indirect);
   }

   reg = retype(reg, type);

   src_reg reg_as_src = src_reg(reg);
   reg_as_src.swizzle = brw_swizzle_for_size(num_components);
   return reg_as_src;
}

src_reg
vec4_visitor::get_nir_src(const nir_src &src, nir_alu_type type,
                          unsigned num_components)
{
   return get_nir_src(src, brw_type_for_nir_type(devinfo, type),
                      num_components);
}

src_reg
vec4_visitor::get_nir_src(const nir_src &src, unsigned num_components)
{
   /* if type is not specified, default to signed int */
   return get_nir_src(src, nir_type_int32, num_components);
}

src_reg
vec4_visitor::get_nir_src_imm(const nir_src &src)
{
   assert(nir_src_num_components(src) == 1);
   assert(nir_src_bit_size(src) == 32);
   return nir_src_is_const(src) ? src_reg(brw_imm_d(nir_src_as_int(src))) :
                                  get_nir_src(src, 1);
}

src_reg
vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
{
   nir_src *offset_src = nir_get_io_offset_src(instr);

   if (nir_src_is_const(*offset_src)) {
      /* The only constant offset we should find is 0.  brw_nir.c's
       * add_const_offset_to_base() will fold other constant offsets
       * into instr->const_index[0].
       */
      assert(nir_src_as_uint(*offset_src) == 0);
      return src_reg();
   }

   return get_nir_src(*offset_src, BRW_REGISTER_TYPE_UD, 1);
}

static src_reg
setup_imm_df(const vec4_builder &bld, double v)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->ver == 7);

   /* gfx7.5 does not support DF immediates directly, but the DIM
    * instruction allows us to set the 64-bit immediate value.
    */
   if (devinfo->verx10 == 75) {
      const vec4_builder ubld = bld.exec_all();
      const dst_reg dst = bld.vgrf(BRW_REGISTER_TYPE_DF);
      ubld.DIM(dst, brw_imm_df(v));
      return swizzle(src_reg(dst), BRW_SWIZZLE_XXXX);
   }

   /* gfx7 does not support DF immediates */
   union {
      double d;
      struct {
         uint32_t i1;
         uint32_t i2;
      };
   } di;

   di.d = v;

   /* Write the low 32 bits of the constant to the X:UD channel and the
    * high 32 bits to the Y:UD channel to build the constant in a VGRF.
    * We have to do this twice (offset 0 and offset 1), since a DF VGRF takes
    * two SIMD8 registers in SIMD4x2 execution.  Finally, return a swizzle
    * XXXX so any access to the VGRF only reads the constant data in these
    * channels.
315 */ 316 const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 317 for (unsigned n = 0; n < 2; n++) { 318 const vec4_builder ubld = bld.exec_all().group(4, n); 319 ubld.MOV(writemask(offset(tmp, 8, n), WRITEMASK_X), brw_imm_ud(di.i1)); 320 ubld.MOV(writemask(offset(tmp, 8, n), WRITEMASK_Y), brw_imm_ud(di.i2)); 321 } 322 323 return swizzle(src_reg(retype(tmp, BRW_REGISTER_TYPE_DF)), BRW_SWIZZLE_XXXX); 324} 325 326void 327vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr) 328{ 329 dst_reg reg; 330 331 if (instr->def.bit_size == 64) { 332 reg = dst_reg(VGRF, alloc.allocate(2)); 333 reg.type = BRW_REGISTER_TYPE_DF; 334 } else { 335 reg = dst_reg(VGRF, alloc.allocate(1)); 336 reg.type = BRW_REGISTER_TYPE_D; 337 } 338 339 const vec4_builder ibld = vec4_builder(this).at_end(); 340 unsigned remaining = brw_writemask_for_size(instr->def.num_components); 341 342 /* @FIXME: consider emitting vector operations to save some MOVs in 343 * cases where the components are representable in 8 bits. 344 * For now, we emit a MOV for each distinct value. 345 */ 346 for (unsigned i = 0; i < instr->def.num_components; i++) { 347 unsigned writemask = 1 << i; 348 349 if ((remaining & writemask) == 0) 350 continue; 351 352 for (unsigned j = i; j < instr->def.num_components; j++) { 353 if ((instr->def.bit_size == 32 && 354 instr->value[i].u32 == instr->value[j].u32) || 355 (instr->def.bit_size == 64 && 356 instr->value[i].f64 == instr->value[j].f64)) { 357 writemask |= 1 << j; 358 } 359 } 360 361 reg.writemask = writemask; 362 if (instr->def.bit_size == 64) { 363 emit(MOV(reg, setup_imm_df(ibld, instr->value[i].f64))); 364 } else { 365 emit(MOV(reg, brw_imm_d(instr->value[i].i32))); 366 } 367 368 remaining &= ~writemask; 369 } 370 371 /* Set final writemask */ 372 reg.writemask = brw_writemask_for_size(instr->def.num_components); 373 374 nir_ssa_values[instr->def.index] = reg; 375} 376 377src_reg 378vec4_visitor::get_nir_ssbo_intrinsic_index(nir_intrinsic_instr *instr) 379{ 380 /* SSBO stores are weird in that their index is in src[1] */ 381 const unsigned src = instr->intrinsic == nir_intrinsic_store_ssbo ? 
      1 : 0;

   if (nir_src_is_const(instr->src[src])) {
      return brw_imm_ud(nir_src_as_uint(instr->src[src]));
   } else {
      return emit_uniformize(get_nir_src(instr->src[src]));
   }
}

void
vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
   dst_reg dest;
   src_reg src;

   switch (instr->intrinsic) {

   case nir_intrinsic_load_input: {
      assert(nir_dest_bit_size(instr->dest) == 32);
      /* We set EmitNoIndirectInput for VS */
      unsigned load_offset = nir_src_as_uint(instr->src[0]);

      dest = get_nir_dest(instr->dest);
      dest.writemask = brw_writemask_for_size(instr->num_components);

      src = src_reg(ATTR, instr->const_index[0] + load_offset,
                    glsl_type::uvec4_type);
      src = retype(src, dest.type);

      /* Swizzle source based on component layout qualifier */
      src.swizzle = BRW_SWZ_COMP_INPUT(nir_intrinsic_component(instr));
      emit(MOV(dest, src));
      break;
   }

   case nir_intrinsic_store_output: {
      assert(nir_src_bit_size(instr->src[0]) == 32);
      unsigned store_offset = nir_src_as_uint(instr->src[1]);
      int varying = instr->const_index[0] + store_offset;
      src = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F,
                        instr->num_components);

      unsigned c = nir_intrinsic_component(instr);
      output_reg[varying][c] = dst_reg(src);
      output_num_components[varying][c] = instr->num_components;
      break;
   }

   case nir_intrinsic_get_ssbo_size: {
      assert(nir_src_num_components(instr->src[0]) == 1);
      unsigned ssbo_index = nir_src_is_const(instr->src[0]) ?
                            nir_src_as_uint(instr->src[0]) : 0;

      dst_reg result_dst = get_nir_dest(instr->dest);
      vec4_instruction *inst = new(mem_ctx)
         vec4_instruction(SHADER_OPCODE_GET_BUFFER_SIZE, result_dst);

      inst->base_mrf = 2;
      inst->mlen = 1; /* always at least one */
      inst->src[1] = brw_imm_ud(ssbo_index);

      /* MRF for the first parameter */
      src_reg lod = brw_imm_d(0);
      int param_base = inst->base_mrf;
      int writemask = WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, glsl_type::int_type, writemask), lod));

      emit(inst);
      break;
   }

   case nir_intrinsic_store_ssbo: {
      assert(devinfo->ver == 7);

      /* brw_nir_lower_mem_access_bit_sizes takes care of this */
      assert(nir_src_bit_size(instr->src[0]) == 32);
      assert(nir_intrinsic_write_mask(instr) ==
             (1u << instr->num_components) - 1);

      src_reg surf_index = get_nir_ssbo_intrinsic_index(instr);
      src_reg offset_reg = retype(get_nir_src_imm(instr->src[2]),
                                  BRW_REGISTER_TYPE_UD);

      /* Value */
      src_reg val_reg = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F, 4);

      /* IvyBridge does not have a native SIMD4x2 untyped write message, so
       * untyped writes will use SIMD8 mode.  In order to hide this and keep
       * symmetry across typed and untyped messages and across hardware
       * platforms, the current implementation of the untyped messages will
       * transparently convert the SIMD4x2 payload into an equivalent SIMD8
       * payload by transposing it and enabling only channel X on the SEND
       * instruction.
       *
       * The above works well for full vector writes, but not for partial
       * writes where we want to write some channels and not others, like
       * when we have code such as v.xyw = vec3(1,2,4).  Because the untyped
       * write messages are quite restrictive with regards to the channel
       * enables we can configure in the message descriptor (not all
       * combinations are allowed) we cannot simply implement these scenarios
       * with a single message while keeping the aforementioned symmetry in
       * the implementation.  For now we decided that it is better to keep
       * the symmetry to reduce complexity, so in situations such as the one
       * described we end up emitting two untyped write messages (one for xy
       * and another for w).
       *
       * The code below packs consecutive channels into a single write
       * message, detects gaps in the vector write and if needed, sends a
       * second message with the remaining channels.  If in the future we
       * decide that we want to emit a single message at the expense of
       * losing the symmetry in the implementation we can:
       *
       * 1) For IvyBridge: Only use the red channel of the untyped write
       *    SIMD8 message payload.  In this mode we can write up to 8 offsets
       *    and dwords to the red channel only (for the two vec4s in the
       *    SIMD4x2 execution) and select which of the 8 channels carry data
       *    to write by setting the appropriate writemask in the dst register
       *    of the SEND instruction.  It would require writing a new
       *    generator opcode specifically for IvyBridge, since we would need
       *    to prepare a SIMD8 payload that could use any channel, not just X.
       *
       * 2) For Haswell+: Simply send a single write message but set the
       *    writemask on the dst of the SEND instruction to select the
       *    channels we want to write.  It would require modifying the
       *    current messages to receive and honor the writemask provided.
       */
      const vec4_builder bld = vec4_builder(this).at_end()
                               .annotate(current_annotation, base_ir);

      emit_untyped_write(bld, surf_index, offset_reg, val_reg,
                         1 /* dims */, instr->num_components /* size */,
                         BRW_PREDICATE_NONE);
      break;
   }

   case nir_intrinsic_load_ssbo: {
      assert(devinfo->ver == 7);

      /* brw_nir_lower_mem_access_bit_sizes takes care of this */
      assert(nir_dest_bit_size(instr->dest) == 32);

      src_reg surf_index = get_nir_ssbo_intrinsic_index(instr);
      src_reg offset_reg = retype(get_nir_src_imm(instr->src[1]),
                                  BRW_REGISTER_TYPE_UD);

      /* Read the vector */
      const vec4_builder bld = vec4_builder(this).at_end()
                               .annotate(current_annotation, base_ir);

      src_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
                                              1 /* dims */, 4 /* size */,
                                              BRW_PREDICATE_NONE);
      dst_reg dest = get_nir_dest(instr->dest);
      read_result.type = dest.type;
      read_result.swizzle = brw_swizzle_for_size(instr->num_components);
      emit(MOV(dest, read_result));
      break;
   }

   case nir_intrinsic_ssbo_atomic_add:
   case nir_intrinsic_ssbo_atomic_imin:
   case nir_intrinsic_ssbo_atomic_umin:
   case nir_intrinsic_ssbo_atomic_imax:
   case nir_intrinsic_ssbo_atomic_umax:
   case nir_intrinsic_ssbo_atomic_and:
   case nir_intrinsic_ssbo_atomic_or:
   case nir_intrinsic_ssbo_atomic_xor:
   case nir_intrinsic_ssbo_atomic_exchange:
   case nir_intrinsic_ssbo_atomic_comp_swap:
      nir_emit_ssbo_atomic(brw_aop_for_nir_intrinsic(instr), instr);
      break;

   case nir_intrinsic_load_vertex_id:
      unreachable("should be lowered by lower_vertex_id()");

   case nir_intrinsic_load_vertex_id_zero_base:
   case nir_intrinsic_load_base_vertex:
   case nir_intrinsic_load_instance_id:
   case
nir_intrinsic_load_base_instance: 558 case nir_intrinsic_load_draw_id: 559 case nir_intrinsic_load_invocation_id: 560 unreachable("should be lowered by brw_nir_lower_vs_inputs()"); 561 562 case nir_intrinsic_load_uniform: { 563 /* Offsets are in bytes but they should always be multiples of 4 */ 564 assert(nir_intrinsic_base(instr) % 4 == 0); 565 566 dest = get_nir_dest(instr->dest); 567 568 src = src_reg(dst_reg(UNIFORM, nir_intrinsic_base(instr) / 16)); 569 src.type = dest.type; 570 571 /* Uniforms don't actually have to be vec4 aligned. In the case that 572 * it isn't, we have to use a swizzle to shift things around. They 573 * do still have the std140 alignment requirement that vec2's have to 574 * be vec2-aligned and vec3's and vec4's have to be vec4-aligned. 575 * 576 * The swizzle also works in the indirect case as the generator adds 577 * the swizzle to the offset for us. 578 */ 579 const int type_size = type_sz(src.type); 580 unsigned shift = (nir_intrinsic_base(instr) % 16) / type_size; 581 assert(shift + instr->num_components <= 4); 582 583 if (nir_src_is_const(instr->src[0])) { 584 const unsigned load_offset = nir_src_as_uint(instr->src[0]); 585 /* Offsets are in bytes but they should always be multiples of 4 */ 586 assert(load_offset % 4 == 0); 587 588 src.swizzle = brw_swizzle_for_size(instr->num_components); 589 dest.writemask = brw_writemask_for_size(instr->num_components); 590 unsigned offset = load_offset + shift * type_size; 591 src.offset = ROUND_DOWN_TO(offset, 16); 592 shift = (offset % 16) / type_size; 593 assert(shift + instr->num_components <= 4); 594 src.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift); 595 596 emit(MOV(dest, src)); 597 } else { 598 /* Uniform arrays are vec4 aligned, because of std140 alignment 599 * rules. 600 */ 601 assert(shift == 0); 602 603 src_reg indirect = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1); 604 605 /* MOV_INDIRECT is going to stomp the whole thing anyway */ 606 dest.writemask = WRITEMASK_XYZW; 607 608 emit(SHADER_OPCODE_MOV_INDIRECT, dest, src, 609 indirect, brw_imm_ud(instr->const_index[1])); 610 } 611 break; 612 } 613 614 case nir_intrinsic_load_ubo: { 615 src_reg surf_index; 616 617 dest = get_nir_dest(instr->dest); 618 619 if (nir_src_is_const(instr->src[0])) { 620 /* The block index is a constant, so just emit the binding table entry 621 * as an immediate. 622 */ 623 const unsigned index = nir_src_as_uint(instr->src[0]); 624 surf_index = brw_imm_ud(index); 625 } else { 626 /* The block index is not a constant. Evaluate the index expression 627 * per-channel and add the base UBO index; we have to select a value 628 * from any live channel. 
629 */ 630 surf_index = src_reg(this, glsl_type::uint_type); 631 emit(MOV(dst_reg(surf_index), get_nir_src(instr->src[0], nir_type_int32, 632 instr->num_components))); 633 surf_index = emit_uniformize(surf_index); 634 } 635 636 src_reg push_reg; 637 src_reg offset_reg; 638 if (nir_src_is_const(instr->src[1])) { 639 unsigned load_offset = nir_src_as_uint(instr->src[1]); 640 unsigned aligned_offset = load_offset & ~15; 641 offset_reg = brw_imm_ud(aligned_offset); 642 643 /* See if we've selected this as a push constant candidate */ 644 if (nir_src_is_const(instr->src[0])) { 645 const unsigned ubo_block = nir_src_as_uint(instr->src[0]); 646 const unsigned offset_256b = aligned_offset / 32; 647 648 for (int i = 0; i < 4; i++) { 649 const struct brw_ubo_range *range = &prog_data->base.ubo_ranges[i]; 650 if (range->block == ubo_block && 651 offset_256b >= range->start && 652 offset_256b < range->start + range->length) { 653 654 push_reg = src_reg(dst_reg(UNIFORM, UBO_START + i)); 655 push_reg.type = dest.type; 656 push_reg.offset = aligned_offset - 32 * range->start; 657 break; 658 } 659 } 660 } 661 } else { 662 offset_reg = src_reg(this, glsl_type::uint_type); 663 emit(MOV(dst_reg(offset_reg), 664 get_nir_src(instr->src[1], nir_type_uint32, 1))); 665 } 666 667 src_reg packed_consts; 668 if (push_reg.file != BAD_FILE) { 669 packed_consts = push_reg; 670 } else if (nir_dest_bit_size(instr->dest) == 32) { 671 packed_consts = src_reg(this, glsl_type::vec4_type); 672 emit_pull_constant_load_reg(dst_reg(packed_consts), 673 surf_index, 674 offset_reg, 675 NULL, NULL /* before_block/inst */); 676 prog_data->base.has_ubo_pull = true; 677 } else { 678 src_reg temp = src_reg(this, glsl_type::dvec4_type); 679 src_reg temp_float = retype(temp, BRW_REGISTER_TYPE_F); 680 681 emit_pull_constant_load_reg(dst_reg(temp_float), 682 surf_index, offset_reg, NULL, NULL); 683 if (offset_reg.file == IMM) 684 offset_reg.ud += 16; 685 else 686 emit(ADD(dst_reg(offset_reg), offset_reg, brw_imm_ud(16u))); 687 emit_pull_constant_load_reg(dst_reg(byte_offset(temp_float, REG_SIZE)), 688 surf_index, offset_reg, NULL, NULL); 689 prog_data->base.has_ubo_pull = true; 690 691 packed_consts = src_reg(this, glsl_type::dvec4_type); 692 shuffle_64bit_data(dst_reg(packed_consts), temp, false); 693 } 694 695 packed_consts.swizzle = brw_swizzle_for_size(instr->num_components); 696 if (nir_src_is_const(instr->src[1])) { 697 unsigned load_offset = nir_src_as_uint(instr->src[1]); 698 unsigned type_size = type_sz(dest.type); 699 packed_consts.swizzle += 700 BRW_SWIZZLE4(load_offset % 16 / type_size, 701 load_offset % 16 / type_size, 702 load_offset % 16 / type_size, 703 load_offset % 16 / type_size); 704 } 705 706 emit(MOV(dest, retype(packed_consts, dest.type))); 707 708 break; 709 } 710 711 case nir_intrinsic_scoped_barrier: 712 assert(nir_intrinsic_execution_scope(instr) == NIR_SCOPE_NONE); 713 FALLTHROUGH; 714 case nir_intrinsic_memory_barrier: { 715 const vec4_builder bld = 716 vec4_builder(this).at_end().annotate(current_annotation, base_ir); 717 const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 718 vec4_instruction *fence = 719 bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, brw_vec8_grf(0, 0)); 720 fence->sfid = GFX7_SFID_DATAPORT_DATA_CACHE; 721 break; 722 } 723 724 case nir_intrinsic_shader_clock: { 725 /* We cannot do anything if there is an event, so ignore it for now */ 726 const src_reg shader_clock = get_timestamp(); 727 const enum brw_reg_type type = brw_type_for_base_type(glsl_type::uvec2_type); 728 729 dest = 
         get_nir_dest(instr->dest, type);
      emit(MOV(dest, shader_clock));
      break;
   }

   default:
      unreachable("Unknown intrinsic");
   }
}

void
vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr)
{
   dst_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_dest(instr->dest);

   src_reg surface = get_nir_ssbo_intrinsic_index(instr);
   src_reg offset = get_nir_src(instr->src[1], 1);
   src_reg data1;
   if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
      data1 = get_nir_src(instr->src[2], 1);
   src_reg data2;
   if (op == BRW_AOP_CMPWR)
      data2 = get_nir_src(instr->src[3], 1);

   /* Emit the actual atomic operation */
   const vec4_builder bld =
      vec4_builder(this).at_end().annotate(current_annotation, base_ir);

   src_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
                                               data1, data2,
                                               1 /* dims */, 1 /* rsize */,
                                               op,
                                               BRW_PREDICATE_NONE);
   dest.type = atomic_result.type;
   bld.MOV(dest, atomic_result);
}

static unsigned
brw_swizzle_for_nir_swizzle(uint8_t swizzle[4])
{
   return BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
}

bool
vec4_visitor::optimize_predicate(nir_alu_instr *instr,
                                 enum brw_predicate *predicate)
{
   if (!instr->src[0].src.is_ssa ||
       instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *cmp_instr =
      nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);

   switch (cmp_instr->op) {
   case nir_op_b32any_fnequal2:
   case nir_op_b32any_inequal2:
   case nir_op_b32any_fnequal3:
   case nir_op_b32any_inequal3:
   case nir_op_b32any_fnequal4:
   case nir_op_b32any_inequal4:
      *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
      break;
   case nir_op_b32all_fequal2:
   case nir_op_b32all_iequal2:
   case nir_op_b32all_fequal3:
   case nir_op_b32all_iequal3:
   case nir_op_b32all_fequal4:
   case nir_op_b32all_iequal4:
      *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
      break;
   default:
      return false;
   }

   unsigned size_swizzle =
      brw_swizzle_for_size(nir_op_infos[cmp_instr->op].input_sizes[0]);

   src_reg op[2];
   assert(nir_op_infos[cmp_instr->op].num_inputs == 2);
   for (unsigned i = 0; i < 2; i++) {
      nir_alu_type type = nir_op_infos[cmp_instr->op].input_types[i];
      unsigned bit_size = nir_src_bit_size(cmp_instr->src[i].src);
      type = (nir_alu_type) (((unsigned) type) | bit_size);
      op[i] = get_nir_src(cmp_instr->src[i].src, type, 4);
      unsigned base_swizzle =
         brw_swizzle_for_nir_swizzle(cmp_instr->src[i].swizzle);
      op[i].swizzle = brw_compose_swizzle(size_swizzle, base_swizzle);
   }

   emit(CMP(dst_null_d(), op[0], op[1],
            brw_cmod_for_nir_comparison(cmp_instr->op)));

   return true;
}

static void
emit_find_msb_using_lzd(const vec4_builder &bld,
                        const dst_reg &dst,
                        const src_reg &src,
                        bool is_signed)
{
   vec4_instruction *inst;
   src_reg temp = src;

   if (is_signed) {
      /* LZD of an absolute value source almost always does the right
       * thing.  There are two problem values:
       *
       * * 0x80000000.  Since abs(0x80000000) == 0x80000000, LZD returns
       *   0.  However, findMSB(int(0x80000000)) == 30.
       *
       * * 0xffffffff.  Since abs(0xffffffff) == 1, LZD returns
       *   31.
Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 845 * 846 * For a value of zero or negative one, -1 will be returned. 847 * 848 * * Negative powers of two. LZD(abs(-(1<<x))) returns x, but 849 * findMSB(-(1<<x)) should return x-1. 850 * 851 * For all negative number cases, including 0x80000000 and 852 * 0xffffffff, the correct value is obtained from LZD if instead of 853 * negating the (already negative) value the logical-not is used. A 854 * conditional logical-not can be achieved in two instructions. 855 */ 856 temp = src_reg(bld.vgrf(BRW_REGISTER_TYPE_D)); 857 858 bld.ASR(dst_reg(temp), src, brw_imm_d(31)); 859 bld.XOR(dst_reg(temp), temp, src); 860 } 861 862 bld.LZD(retype(dst, BRW_REGISTER_TYPE_UD), 863 retype(temp, BRW_REGISTER_TYPE_UD)); 864 865 /* LZD counts from the MSB side, while GLSL's findMSB() wants the count 866 * from the LSB side. Subtract the result from 31 to convert the MSB count 867 * into an LSB count. If no bits are set, LZD will return 32. 31-32 = -1, 868 * which is exactly what findMSB() is supposed to return. 869 */ 870 inst = bld.ADD(dst, retype(src_reg(dst), BRW_REGISTER_TYPE_D), 871 brw_imm_d(31)); 872 inst->src[0].negate = true; 873} 874 875void 876vec4_visitor::emit_conversion_from_double(dst_reg dst, src_reg src) 877{ 878 enum opcode op; 879 switch (dst.type) { 880 case BRW_REGISTER_TYPE_D: 881 op = VEC4_OPCODE_DOUBLE_TO_D32; 882 break; 883 case BRW_REGISTER_TYPE_UD: 884 op = VEC4_OPCODE_DOUBLE_TO_U32; 885 break; 886 case BRW_REGISTER_TYPE_F: 887 op = VEC4_OPCODE_DOUBLE_TO_F32; 888 break; 889 default: 890 unreachable("Unknown conversion"); 891 } 892 893 dst_reg temp = dst_reg(this, glsl_type::dvec4_type); 894 emit(MOV(temp, src)); 895 dst_reg temp2 = dst_reg(this, glsl_type::dvec4_type); 896 emit(op, temp2, src_reg(temp)); 897 898 emit(VEC4_OPCODE_PICK_LOW_32BIT, retype(temp2, dst.type), src_reg(temp2)); 899 emit(MOV(dst, src_reg(retype(temp2, dst.type)))); 900} 901 902void 903vec4_visitor::emit_conversion_to_double(dst_reg dst, src_reg src) 904{ 905 dst_reg tmp_dst = dst_reg(src_reg(this, glsl_type::dvec4_type)); 906 src_reg tmp_src = retype(src_reg(this, glsl_type::vec4_type), src.type); 907 emit(MOV(dst_reg(tmp_src), src)); 908 emit(VEC4_OPCODE_TO_DOUBLE, tmp_dst, tmp_src); 909 emit(MOV(dst, src_reg(tmp_dst))); 910} 911 912/** 913 * Try to use an immediate value for a source 914 * 915 * In cases of flow control, constant propagation is sometimes unable to 916 * determine that a register contains a constant value. To work around this, 917 * try to emit a literal as one of the sources. If \c try_src0_also is set, 918 * \c op[0] will also be tried for an immediate value. 919 * 920 * If \c op[0] is modified, the operands will be exchanged so that \c op[1] 921 * will always be the immediate value. 922 * 923 * \return The index of the source that was modified, 0 or 1, if successful. 924 * Otherwise, -1. 925 * 926 * \param op - Operands to the instruction 927 * \param try_src0_also - True if \c op[0] should also be a candidate for 928 * getting an immediate value. This should only be set 929 * for commutative operations. 930 */ 931static int 932try_immediate_source(const nir_alu_instr *instr, src_reg *op, 933 bool try_src0_also) 934{ 935 unsigned idx; 936 937 /* MOV should be the only single-source instruction passed to this 938 * function. Any other unary instruction with a constant source should 939 * have been constant-folded away! 
940 */ 941 assert(nir_op_infos[instr->op].num_inputs > 1 || 942 instr->op == nir_op_mov); 943 944 if (instr->op != nir_op_mov && 945 nir_src_bit_size(instr->src[1].src) == 32 && 946 nir_src_is_const(instr->src[1].src)) { 947 idx = 1; 948 } else if (try_src0_also && 949 nir_src_bit_size(instr->src[0].src) == 32 && 950 nir_src_is_const(instr->src[0].src)) { 951 idx = 0; 952 } else { 953 return -1; 954 } 955 956 const enum brw_reg_type old_type = op[idx].type; 957 958 switch (old_type) { 959 case BRW_REGISTER_TYPE_D: 960 case BRW_REGISTER_TYPE_UD: { 961 int first_comp = -1; 962 int d = 0; 963 964 for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) { 965 if (nir_alu_instr_channel_used(instr, idx, i)) { 966 if (first_comp < 0) { 967 first_comp = i; 968 d = nir_src_comp_as_int(instr->src[idx].src, 969 instr->src[idx].swizzle[i]); 970 } else if (d != nir_src_comp_as_int(instr->src[idx].src, 971 instr->src[idx].swizzle[i])) { 972 return -1; 973 } 974 } 975 } 976 977 assert(first_comp >= 0); 978 979 if (op[idx].abs) 980 d = MAX2(-d, d); 981 982 if (op[idx].negate) 983 d = -d; 984 985 op[idx] = retype(src_reg(brw_imm_d(d)), old_type); 986 break; 987 } 988 989 case BRW_REGISTER_TYPE_F: { 990 int first_comp = -1; 991 float f[NIR_MAX_VEC_COMPONENTS] = { 0.0f }; 992 bool is_scalar = true; 993 994 for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) { 995 if (nir_alu_instr_channel_used(instr, idx, i)) { 996 f[i] = nir_src_comp_as_float(instr->src[idx].src, 997 instr->src[idx].swizzle[i]); 998 if (first_comp < 0) { 999 first_comp = i; 1000 } else if (f[first_comp] != f[i]) { 1001 is_scalar = false; 1002 } 1003 } 1004 } 1005 1006 if (is_scalar) { 1007 if (op[idx].abs) 1008 f[first_comp] = fabs(f[first_comp]); 1009 1010 if (op[idx].negate) 1011 f[first_comp] = -f[first_comp]; 1012 1013 op[idx] = src_reg(brw_imm_f(f[first_comp])); 1014 assert(op[idx].type == old_type); 1015 } else { 1016 uint8_t vf_values[4] = { 0, 0, 0, 0 }; 1017 1018 for (unsigned i = 0; i < ARRAY_SIZE(vf_values); i++) { 1019 1020 if (op[idx].abs) 1021 f[i] = fabs(f[i]); 1022 1023 if (op[idx].negate) 1024 f[i] = -f[i]; 1025 1026 const int vf = brw_float_to_vf(f[i]); 1027 if (vf == -1) 1028 return -1; 1029 1030 vf_values[i] = vf; 1031 } 1032 1033 op[idx] = src_reg(brw_imm_vf4(vf_values[0], vf_values[1], 1034 vf_values[2], vf_values[3])); 1035 } 1036 break; 1037 } 1038 1039 default: 1040 unreachable("Non-32bit type."); 1041 } 1042 1043 /* If the instruction has more than one source, the instruction format only 1044 * allows source 1 to be an immediate value. If the immediate value was 1045 * source 0, then the sources must be exchanged. 
1046 */ 1047 if (idx == 0 && instr->op != nir_op_mov) { 1048 src_reg tmp = op[0]; 1049 op[0] = op[1]; 1050 op[1] = tmp; 1051 } 1052 1053 return idx; 1054} 1055 1056void 1057vec4_visitor::fix_float_operands(src_reg op[3], nir_alu_instr *instr) 1058{ 1059 bool fixed[3] = { false, false, false }; 1060 1061 for (unsigned i = 0; i < 2; i++) { 1062 if (!nir_src_is_const(instr->src[i].src)) 1063 continue; 1064 1065 for (unsigned j = i + 1; j < 3; j++) { 1066 if (fixed[j]) 1067 continue; 1068 1069 if (!nir_src_is_const(instr->src[j].src)) 1070 continue; 1071 1072 if (nir_alu_srcs_equal(instr, instr, i, j)) { 1073 if (!fixed[i]) 1074 op[i] = fix_3src_operand(op[i]); 1075 1076 op[j] = op[i]; 1077 1078 fixed[i] = true; 1079 fixed[j] = true; 1080 } else if (nir_alu_srcs_negative_equal(instr, instr, i, j)) { 1081 if (!fixed[i]) 1082 op[i] = fix_3src_operand(op[i]); 1083 1084 op[j] = op[i]; 1085 op[j].negate = !op[j].negate; 1086 1087 fixed[i] = true; 1088 fixed[j] = true; 1089 } 1090 } 1091 } 1092 1093 for (unsigned i = 0; i < 3; i++) { 1094 if (!fixed[i]) 1095 op[i] = fix_3src_operand(op[i]); 1096 } 1097} 1098 1099static bool 1100const_src_fits_in_16_bits(const nir_src &src, brw_reg_type type) 1101{ 1102 assert(nir_src_is_const(src)); 1103 if (brw_reg_type_is_unsigned_integer(type)) { 1104 return nir_src_comp_as_uint(src, 0) <= UINT16_MAX; 1105 } else { 1106 const int64_t c = nir_src_comp_as_int(src, 0); 1107 return c <= INT16_MAX && c >= INT16_MIN; 1108 } 1109} 1110 1111void 1112vec4_visitor::nir_emit_alu(nir_alu_instr *instr) 1113{ 1114 vec4_instruction *inst; 1115 1116 nir_alu_type dst_type = (nir_alu_type) (nir_op_infos[instr->op].output_type | 1117 nir_dest_bit_size(instr->dest.dest)); 1118 dst_reg dst = get_nir_dest(instr->dest.dest, dst_type); 1119 dst.writemask = instr->dest.write_mask; 1120 1121 assert(!instr->dest.saturate); 1122 1123 src_reg op[4]; 1124 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 1125 /* We don't lower to source modifiers, so they shouldn't exist. */ 1126 assert(!instr->src[i].abs); 1127 assert(!instr->src[i].negate); 1128 1129 nir_alu_type src_type = (nir_alu_type) 1130 (nir_op_infos[instr->op].input_types[i] | 1131 nir_src_bit_size(instr->src[i].src)); 1132 op[i] = get_nir_src(instr->src[i].src, src_type, 4); 1133 op[i].swizzle = brw_swizzle_for_nir_swizzle(instr->src[i].swizzle); 1134 } 1135 1136#ifndef NDEBUG 1137 /* On Gen7 and earlier, no functionality is exposed that should allow 8-bit 1138 * integer types to ever exist. 
    */
   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
      assert(type_sz(op[i].type) > 1);
#endif

   switch (instr->op) {
   case nir_op_mov:
      try_immediate_source(instr, &op[0], true);
      inst = emit(MOV(dst, op[0]));
      break;

   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
      unreachable("not reached: should be handled by lower_vec_to_movs()");

   case nir_op_i2f32:
   case nir_op_u2f32:
      inst = emit(MOV(dst, op[0]));
      break;

   case nir_op_f2f32:
   case nir_op_f2i32:
   case nir_op_f2u32:
      if (nir_src_bit_size(instr->src[0].src) == 64)
         emit_conversion_from_double(dst, op[0]);
      else
         inst = emit(MOV(dst, op[0]));
      break;

   case nir_op_f2f64:
   case nir_op_i2f64:
   case nir_op_u2f64:
      emit_conversion_to_double(dst, op[0]);
      break;

   case nir_op_fsat:
      inst = emit(MOV(dst, op[0]));
      inst->saturate = true;
      break;

   case nir_op_fneg:
   case nir_op_ineg:
      op[0].negate = true;
      inst = emit(MOV(dst, op[0]));
      break;

   case nir_op_fabs:
   case nir_op_iabs:
      op[0].negate = false;
      op[0].abs = true;
      inst = emit(MOV(dst, op[0]));
      break;

   case nir_op_iadd:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      FALLTHROUGH;
   case nir_op_fadd:
      try_immediate_source(instr, op, true);
      inst = emit(ADD(dst, op[0], op[1]));
      break;

   case nir_op_uadd_sat:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      inst = emit(ADD(dst, op[0], op[1]));
      inst->saturate = true;
      break;

   case nir_op_fmul:
      try_immediate_source(instr, op, true);
      inst = emit(MUL(dst, op[0], op[1]));
      break;

   case nir_op_imul: {
      assert(nir_dest_bit_size(instr->dest.dest) < 64);

      /* For integer multiplication, the MUL uses the low 16 bits of one of
       * the operands (src0 through SNB, src1 on IVB and later).  The MACH
       * accumulates the contribution of the upper 16 bits of that operand.
       * If we can determine that one of the args is in the low 16 bits,
       * though, we can just emit a single MUL.
1220 */ 1221 if (nir_src_is_const(instr->src[0].src) && 1222 nir_alu_instr_src_read_mask(instr, 0) == 1 && 1223 const_src_fits_in_16_bits(instr->src[0].src, op[0].type)) { 1224 if (devinfo->ver < 7) 1225 emit(MUL(dst, op[0], op[1])); 1226 else 1227 emit(MUL(dst, op[1], op[0])); 1228 } else if (nir_src_is_const(instr->src[1].src) && 1229 nir_alu_instr_src_read_mask(instr, 1) == 1 && 1230 const_src_fits_in_16_bits(instr->src[1].src, op[1].type)) { 1231 if (devinfo->ver < 7) 1232 emit(MUL(dst, op[1], op[0])); 1233 else 1234 emit(MUL(dst, op[0], op[1])); 1235 } else { 1236 struct brw_reg acc = retype(brw_acc_reg(8), dst.type); 1237 1238 emit(MUL(acc, op[0], op[1])); 1239 emit(MACH(dst_null_d(), op[0], op[1])); 1240 emit(MOV(dst, src_reg(acc))); 1241 } 1242 break; 1243 } 1244 1245 case nir_op_imul_high: 1246 case nir_op_umul_high: { 1247 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1248 struct brw_reg acc = retype(brw_acc_reg(8), dst.type); 1249 1250 emit(MUL(acc, op[0], op[1])); 1251 emit(MACH(dst, op[0], op[1])); 1252 break; 1253 } 1254 1255 case nir_op_frcp: 1256 inst = emit_math(SHADER_OPCODE_RCP, dst, op[0]); 1257 break; 1258 1259 case nir_op_fexp2: 1260 inst = emit_math(SHADER_OPCODE_EXP2, dst, op[0]); 1261 break; 1262 1263 case nir_op_flog2: 1264 inst = emit_math(SHADER_OPCODE_LOG2, dst, op[0]); 1265 break; 1266 1267 case nir_op_fsin: 1268 inst = emit_math(SHADER_OPCODE_SIN, dst, op[0]); 1269 break; 1270 1271 case nir_op_fcos: 1272 inst = emit_math(SHADER_OPCODE_COS, dst, op[0]); 1273 break; 1274 1275 case nir_op_idiv: 1276 case nir_op_udiv: 1277 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1278 emit_math(SHADER_OPCODE_INT_QUOTIENT, dst, op[0], op[1]); 1279 break; 1280 1281 case nir_op_umod: 1282 case nir_op_irem: 1283 /* According to the sign table for INT DIV in the Ivy Bridge PRM, it 1284 * appears that our hardware just does the right thing for signed 1285 * remainder. 1286 */ 1287 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1288 emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]); 1289 break; 1290 1291 case nir_op_imod: { 1292 /* Get a regular C-style remainder. If a % b == 0, set the predicate. */ 1293 inst = emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]); 1294 1295 /* Math instructions don't support conditional mod */ 1296 inst = emit(MOV(dst_null_d(), src_reg(dst))); 1297 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1298 1299 /* Now, we need to determine if signs of the sources are different. 1300 * When we XOR the sources, the top bit is 0 if they are the same and 1 1301 * if they are different. We can then use a conditional modifier to 1302 * turn that into a predicate. This leads us to an XOR.l instruction. 1303 * 1304 * Technically, according to the PRM, you're not allowed to use .l on a 1305 * XOR instruction. However, empirical experiments and Curro's reading 1306 * of the simulator source both indicate that it's safe. 1307 */ 1308 src_reg tmp = src_reg(this, glsl_type::ivec4_type); 1309 inst = emit(XOR(dst_reg(tmp), op[0], op[1])); 1310 inst->predicate = BRW_PREDICATE_NORMAL; 1311 inst->conditional_mod = BRW_CONDITIONAL_L; 1312 1313 /* If the result of the initial remainder operation is non-zero and the 1314 * two sources have different signs, add in a copy of op[1] to get the 1315 * final integer modulus value. 
1316 */ 1317 inst = emit(ADD(dst, src_reg(dst), op[1])); 1318 inst->predicate = BRW_PREDICATE_NORMAL; 1319 break; 1320 } 1321 1322 case nir_op_ldexp: 1323 unreachable("not reached: should be handled by ldexp_to_arith()"); 1324 1325 case nir_op_fsqrt: 1326 inst = emit_math(SHADER_OPCODE_SQRT, dst, op[0]); 1327 break; 1328 1329 case nir_op_frsq: 1330 inst = emit_math(SHADER_OPCODE_RSQ, dst, op[0]); 1331 break; 1332 1333 case nir_op_fpow: 1334 inst = emit_math(SHADER_OPCODE_POW, dst, op[0], op[1]); 1335 break; 1336 1337 case nir_op_uadd_carry: { 1338 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1339 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD); 1340 1341 emit(ADDC(dst_null_ud(), op[0], op[1])); 1342 emit(MOV(dst, src_reg(acc))); 1343 break; 1344 } 1345 1346 case nir_op_usub_borrow: { 1347 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1348 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD); 1349 1350 emit(SUBB(dst_null_ud(), op[0], op[1])); 1351 emit(MOV(dst, src_reg(acc))); 1352 break; 1353 } 1354 1355 case nir_op_ftrunc: 1356 inst = emit(RNDZ(dst, op[0])); 1357 if (devinfo->ver < 6) { 1358 inst->conditional_mod = BRW_CONDITIONAL_R; 1359 inst = emit(ADD(dst, src_reg(dst), brw_imm_f(1.0f))); 1360 inst->predicate = BRW_PREDICATE_NORMAL; 1361 inst = emit(MOV(dst, src_reg(dst))); /* for potential saturation */ 1362 } 1363 break; 1364 1365 case nir_op_fceil: { 1366 src_reg tmp = src_reg(this, glsl_type::float_type); 1367 tmp.swizzle = 1368 brw_swizzle_for_size(instr->src[0].src.is_ssa ? 1369 instr->src[0].src.ssa->num_components : 1370 instr->src[0].src.reg.reg->num_components); 1371 1372 op[0].negate = !op[0].negate; 1373 emit(RNDD(dst_reg(tmp), op[0])); 1374 tmp.negate = true; 1375 inst = emit(MOV(dst, tmp)); 1376 break; 1377 } 1378 1379 case nir_op_ffloor: 1380 inst = emit(RNDD(dst, op[0])); 1381 break; 1382 1383 case nir_op_ffract: 1384 inst = emit(FRC(dst, op[0])); 1385 break; 1386 1387 case nir_op_fround_even: 1388 inst = emit(RNDE(dst, op[0])); 1389 if (devinfo->ver < 6) { 1390 inst->conditional_mod = BRW_CONDITIONAL_R; 1391 inst = emit(ADD(dst, src_reg(dst), brw_imm_f(1.0f))); 1392 inst->predicate = BRW_PREDICATE_NORMAL; 1393 inst = emit(MOV(dst, src_reg(dst))); /* for potential saturation */ 1394 } 1395 break; 1396 1397 case nir_op_fquantize2f16: { 1398 /* See also vec4_visitor::emit_pack_half_2x16() */ 1399 src_reg tmp16 = src_reg(this, glsl_type::uvec4_type); 1400 src_reg tmp32 = src_reg(this, glsl_type::vec4_type); 1401 src_reg zero = src_reg(this, glsl_type::vec4_type); 1402 1403 /* Check for denormal */ 1404 src_reg abs_src0 = op[0]; 1405 abs_src0.abs = true; 1406 emit(CMP(dst_null_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)), 1407 BRW_CONDITIONAL_L)); 1408 /* Get the appropriately signed zero */ 1409 emit(AND(retype(dst_reg(zero), BRW_REGISTER_TYPE_UD), 1410 retype(op[0], BRW_REGISTER_TYPE_UD), 1411 brw_imm_ud(0x80000000))); 1412 /* Do the actual F32 -> F16 -> F32 conversion */ 1413 emit(F32TO16(dst_reg(tmp16), op[0])); 1414 emit(F16TO32(dst_reg(tmp32), tmp16)); 1415 /* Select that or zero based on normal status */ 1416 inst = emit(BRW_OPCODE_SEL, dst, zero, tmp32); 1417 inst->predicate = BRW_PREDICATE_NORMAL; 1418 break; 1419 } 1420 1421 case nir_op_imin: 1422 case nir_op_umin: 1423 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1424 FALLTHROUGH; 1425 case nir_op_fmin: 1426 try_immediate_source(instr, op, true); 1427 inst = emit_minmax(BRW_CONDITIONAL_L, dst, op[0], op[1]); 1428 break; 1429 1430 case nir_op_imax: 1431 case nir_op_umax: 
1432 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1433 FALLTHROUGH; 1434 case nir_op_fmax: 1435 try_immediate_source(instr, op, true); 1436 inst = emit_minmax(BRW_CONDITIONAL_GE, dst, op[0], op[1]); 1437 break; 1438 1439 case nir_op_fddx: 1440 case nir_op_fddx_coarse: 1441 case nir_op_fddx_fine: 1442 case nir_op_fddy: 1443 case nir_op_fddy_coarse: 1444 case nir_op_fddy_fine: 1445 unreachable("derivatives are not valid in vertex shaders"); 1446 1447 case nir_op_ilt32: 1448 case nir_op_ult32: 1449 case nir_op_ige32: 1450 case nir_op_uge32: 1451 case nir_op_ieq32: 1452 case nir_op_ine32: 1453 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1454 FALLTHROUGH; 1455 case nir_op_flt32: 1456 case nir_op_fge32: 1457 case nir_op_feq32: 1458 case nir_op_fneu32: { 1459 enum brw_conditional_mod conditional_mod = 1460 brw_cmod_for_nir_comparison(instr->op); 1461 1462 if (nir_src_bit_size(instr->src[0].src) < 64) { 1463 /* If the order of the sources is changed due to an immediate value, 1464 * then the condition must also be changed. 1465 */ 1466 if (try_immediate_source(instr, op, true) == 0) 1467 conditional_mod = brw_swap_cmod(conditional_mod); 1468 1469 emit(CMP(dst, op[0], op[1], conditional_mod)); 1470 } else { 1471 /* Produce a 32-bit boolean result from the DF comparison by selecting 1472 * only the low 32-bit in each DF produced. Do this in a temporary 1473 * so we can then move from there to the result using align16 again 1474 * to honor the original writemask. 1475 */ 1476 dst_reg temp = dst_reg(this, glsl_type::dvec4_type); 1477 emit(CMP(temp, op[0], op[1], conditional_mod)); 1478 dst_reg result = dst_reg(this, glsl_type::bvec4_type); 1479 emit(VEC4_OPCODE_PICK_LOW_32BIT, result, src_reg(temp)); 1480 emit(MOV(dst, src_reg(result))); 1481 } 1482 break; 1483 } 1484 1485 case nir_op_b32all_iequal2: 1486 case nir_op_b32all_iequal3: 1487 case nir_op_b32all_iequal4: 1488 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1489 FALLTHROUGH; 1490 case nir_op_b32all_fequal2: 1491 case nir_op_b32all_fequal3: 1492 case nir_op_b32all_fequal4: { 1493 unsigned swiz = 1494 brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]); 1495 1496 emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz), 1497 brw_cmod_for_nir_comparison(instr->op))); 1498 emit(MOV(dst, brw_imm_d(0))); 1499 inst = emit(MOV(dst, brw_imm_d(~0))); 1500 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H; 1501 break; 1502 } 1503 1504 case nir_op_b32any_inequal2: 1505 case nir_op_b32any_inequal3: 1506 case nir_op_b32any_inequal4: 1507 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1508 FALLTHROUGH; 1509 case nir_op_b32any_fnequal2: 1510 case nir_op_b32any_fnequal3: 1511 case nir_op_b32any_fnequal4: { 1512 unsigned swiz = 1513 brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]); 1514 1515 emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz), 1516 brw_cmod_for_nir_comparison(instr->op))); 1517 1518 emit(MOV(dst, brw_imm_d(0))); 1519 inst = emit(MOV(dst, brw_imm_d(~0))); 1520 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H; 1521 break; 1522 } 1523 1524 case nir_op_inot: 1525 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1526 emit(NOT(dst, op[0])); 1527 break; 1528 1529 case nir_op_ixor: 1530 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1531 try_immediate_source(instr, op, true); 1532 emit(XOR(dst, op[0], op[1])); 1533 break; 1534 1535 case nir_op_ior: 1536 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1537 try_immediate_source(instr, op, true); 1538 emit(OR(dst, op[0], op[1])); 1539 break; 
1540 1541 case nir_op_iand: 1542 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1543 try_immediate_source(instr, op, true); 1544 emit(AND(dst, op[0], op[1])); 1545 break; 1546 1547 case nir_op_b2i32: 1548 case nir_op_b2f32: 1549 case nir_op_b2f64: 1550 if (nir_dest_bit_size(instr->dest.dest) > 32) { 1551 assert(dst.type == BRW_REGISTER_TYPE_DF); 1552 emit_conversion_to_double(dst, negate(op[0])); 1553 } else { 1554 emit(MOV(dst, negate(op[0]))); 1555 } 1556 break; 1557 1558 case nir_op_f2b32: 1559 if (nir_src_bit_size(instr->src[0].src) == 64) { 1560 /* We use a MOV with conditional_mod to check if the provided value is 1561 * 0.0. We want this to flush denormalized numbers to zero, so we set a 1562 * source modifier on the source operand to trigger this, as source 1563 * modifiers don't affect the result of the testing against 0.0. 1564 */ 1565 src_reg value = op[0]; 1566 value.abs = true; 1567 vec4_instruction *inst = emit(MOV(dst_null_df(), value)); 1568 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1569 1570 src_reg one = src_reg(this, glsl_type::ivec4_type); 1571 emit(MOV(dst_reg(one), brw_imm_d(~0))); 1572 inst = emit(BRW_OPCODE_SEL, dst, one, brw_imm_d(0)); 1573 inst->predicate = BRW_PREDICATE_NORMAL; 1574 } else { 1575 emit(CMP(dst, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ)); 1576 } 1577 break; 1578 1579 case nir_op_i2b32: 1580 emit(CMP(dst, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ)); 1581 break; 1582 1583 case nir_op_unpack_half_2x16_split_x: 1584 case nir_op_unpack_half_2x16_split_y: 1585 case nir_op_pack_half_2x16_split: 1586 unreachable("not reached: should not occur in vertex shader"); 1587 1588 case nir_op_unpack_snorm_2x16: 1589 case nir_op_unpack_unorm_2x16: 1590 case nir_op_pack_snorm_2x16: 1591 case nir_op_pack_unorm_2x16: 1592 unreachable("not reached: should be handled by lower_packing_builtins"); 1593 1594 case nir_op_pack_uvec4_to_uint: 1595 unreachable("not reached"); 1596 1597 case nir_op_pack_uvec2_to_uint: { 1598 dst_reg tmp1 = dst_reg(this, glsl_type::uint_type); 1599 tmp1.writemask = WRITEMASK_X; 1600 op[0].swizzle = BRW_SWIZZLE_YYYY; 1601 emit(SHL(tmp1, op[0], src_reg(brw_imm_ud(16u)))); 1602 1603 dst_reg tmp2 = dst_reg(this, glsl_type::uint_type); 1604 tmp2.writemask = WRITEMASK_X; 1605 op[0].swizzle = BRW_SWIZZLE_XXXX; 1606 emit(AND(tmp2, op[0], src_reg(brw_imm_ud(0xffffu)))); 1607 1608 emit(OR(dst, src_reg(tmp1), src_reg(tmp2))); 1609 break; 1610 } 1611 1612 case nir_op_pack_64_2x32_split: { 1613 dst_reg result = dst_reg(this, glsl_type::dvec4_type); 1614 dst_reg tmp = dst_reg(this, glsl_type::uvec4_type); 1615 emit(MOV(tmp, retype(op[0], BRW_REGISTER_TYPE_UD))); 1616 emit(VEC4_OPCODE_SET_LOW_32BIT, result, src_reg(tmp)); 1617 emit(MOV(tmp, retype(op[1], BRW_REGISTER_TYPE_UD))); 1618 emit(VEC4_OPCODE_SET_HIGH_32BIT, result, src_reg(tmp)); 1619 emit(MOV(dst, src_reg(result))); 1620 break; 1621 } 1622 1623 case nir_op_unpack_64_2x32_split_x: 1624 case nir_op_unpack_64_2x32_split_y: { 1625 enum opcode oper = (instr->op == nir_op_unpack_64_2x32_split_x) ? 
         VEC4_OPCODE_PICK_LOW_32BIT : VEC4_OPCODE_PICK_HIGH_32BIT;
      dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
      emit(MOV(tmp, op[0]));
      dst_reg tmp2 = dst_reg(this, glsl_type::uvec4_type);
      emit(oper, tmp2, src_reg(tmp));
      emit(MOV(dst, src_reg(tmp2)));
      break;
   }

   case nir_op_unpack_half_2x16:
      /* NIR does not guarantee that we have a correct swizzle outside the
       * boundaries of a vector, and emit_unpack_half_2x16 uses its source
       * operand in an operation with WRITEMASK_Y even though our source
       * operand only has size 1.  That made it read incorrect data and
       * caused regressions in Piglit.  Repeat the swizzle of the first
       * component on the rest of the components to avoid this.  The
       * vec4_visitor IR code path does not need this because the operand
       * already has the correct swizzle.
       */
      op[0].swizzle = brw_compose_swizzle(BRW_SWIZZLE_XXXX, op[0].swizzle);
      emit_unpack_half_2x16(dst, op[0]);
      break;

   case nir_op_pack_half_2x16:
      emit_pack_half_2x16(dst, op[0]);
      break;

   case nir_op_unpack_unorm_4x8:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      emit_unpack_unorm_4x8(dst, op[0]);
      break;

   case nir_op_pack_unorm_4x8:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      emit_pack_unorm_4x8(dst, op[0]);
      break;

   case nir_op_unpack_snorm_4x8:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      emit_unpack_snorm_4x8(dst, op[0]);
      break;

   case nir_op_pack_snorm_4x8:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      emit_pack_snorm_4x8(dst, op[0]);
      break;

   case nir_op_bitfield_reverse:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      emit(BFREV(dst, op[0]));
      break;

   case nir_op_bit_count:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      emit(CBIT(dst, op[0]));
      break;

   case nir_op_ufind_msb:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      emit_find_msb_using_lzd(vec4_builder(this).at_end(), dst, op[0], false);
      break;

   case nir_op_ifind_msb: {
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      vec4_builder bld = vec4_builder(this).at_end();
      src_reg src(dst);

      if (devinfo->ver < 7) {
         emit_find_msb_using_lzd(bld, dst, op[0], true);
      } else {
         emit(FBH(retype(dst, BRW_REGISTER_TYPE_UD), op[0]));

         /* FBH counts from the MSB side, while GLSL's findMSB() wants the
          * count from the LSB side.  If FBH didn't return an error
          * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
          * count into an LSB count.
          */
         bld.CMP(dst_null_d(), src, brw_imm_d(-1), BRW_CONDITIONAL_NZ);

         inst = bld.ADD(dst, src, brw_imm_d(31));
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst->src[0].negate = true;
      }
      break;
   }

   case nir_op_find_lsb: {
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      vec4_builder bld = vec4_builder(this).at_end();

      if (devinfo->ver < 7) {
         dst_reg temp = bld.vgrf(BRW_REGISTER_TYPE_D);

         /* (x & -x) generates a value that consists of only the LSB of x.
          * For all powers of 2, findMSB(y) == findLSB(y).
          */
         src_reg src = src_reg(retype(op[0], BRW_REGISTER_TYPE_D));
         src_reg negated_src = src;

         /* One must be negated, and the other must be non-negated.  It
          * doesn't matter which is which.
1726 */ 1727 negated_src.negate = true; 1728 src.negate = false; 1729 1730 bld.AND(temp, src, negated_src); 1731 emit_find_msb_using_lzd(bld, dst, src_reg(temp), false); 1732 } else { 1733 bld.FBL(dst, op[0]); 1734 } 1735 break; 1736 } 1737 1738 case nir_op_ubitfield_extract: 1739 case nir_op_ibitfield_extract: 1740 unreachable("should have been lowered"); 1741 case nir_op_ubfe: 1742 case nir_op_ibfe: 1743 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1744 op[0] = fix_3src_operand(op[0]); 1745 op[1] = fix_3src_operand(op[1]); 1746 op[2] = fix_3src_operand(op[2]); 1747 1748 emit(BFE(dst, op[2], op[1], op[0])); 1749 break; 1750 1751 case nir_op_bfm: 1752 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1753 emit(BFI1(dst, op[0], op[1])); 1754 break; 1755 1756 case nir_op_bfi: 1757 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1758 op[0] = fix_3src_operand(op[0]); 1759 op[1] = fix_3src_operand(op[1]); 1760 op[2] = fix_3src_operand(op[2]); 1761 1762 emit(BFI2(dst, op[0], op[1], op[2])); 1763 break; 1764 1765 case nir_op_bitfield_insert: 1766 unreachable("not reached: should have been lowered"); 1767 1768 case nir_op_fsign: 1769 if (type_sz(op[0].type) < 8) { 1770 /* AND(val, 0x80000000) gives the sign bit. 1771 * 1772 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not 1773 * zero. 1774 */ 1775 emit(CMP(dst_null_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ)); 1776 1777 op[0].type = BRW_REGISTER_TYPE_UD; 1778 dst.type = BRW_REGISTER_TYPE_UD; 1779 emit(AND(dst, op[0], brw_imm_ud(0x80000000u))); 1780 1781 inst = emit(OR(dst, src_reg(dst), brw_imm_ud(0x3f800000u))); 1782 inst->predicate = BRW_PREDICATE_NORMAL; 1783 dst.type = BRW_REGISTER_TYPE_F; 1784 } else { 1785 /* For doubles we do the same but we need to consider: 1786 * 1787 * - We use a MOV with conditional_mod instead of a CMP so that we can 1788 * skip loading a 0.0 immediate. We use a source modifier on the 1789 * source of the MOV so that we flush denormalized values to 0. 1790 * Since we want to compare against 0, this won't alter the result. 1791 * - We need to extract the high 32-bit of each DF where the sign 1792 * is stored. 1793 * - We need to produce a DF result. 
   case nir_op_fsign:
      if (type_sz(op[0].type) < 8) {
         /* AND(val, 0x80000000) gives the sign bit.
          *
          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
          * zero.
          */
         emit(CMP(dst_null_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ));

         op[0].type = BRW_REGISTER_TYPE_UD;
         dst.type = BRW_REGISTER_TYPE_UD;
         emit(AND(dst, op[0], brw_imm_ud(0x80000000u)));

         inst = emit(OR(dst, src_reg(dst), brw_imm_ud(0x3f800000u)));
         inst->predicate = BRW_PREDICATE_NORMAL;
         dst.type = BRW_REGISTER_TYPE_F;
      } else {
         /* For doubles we do the same, but we need to consider:
          *
          * - We use a MOV with conditional_mod instead of a CMP so that we
          *   can skip loading a 0.0 immediate.  We use a source modifier on
          *   the source of the MOV so that we flush denormalized values to
          *   0.  Since we want to compare against 0, this won't alter the
          *   result.
          * - We need to extract the high 32 bits of each DF, where the sign
          *   is stored.
          * - We need to produce a DF result.
          */

         /* Check for zero */
         src_reg value = op[0];
         value.abs = true;
         inst = emit(MOV(dst_null_df(), value));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;

         /* AND each high 32-bit channel with 0x80000000u */
         dst_reg tmp = dst_reg(this, glsl_type::uvec4_type);
         emit(VEC4_OPCODE_PICK_HIGH_32BIT, tmp, op[0]);
         emit(AND(tmp, src_reg(tmp), brw_imm_ud(0x80000000u)));

         /* OR in the bit pattern of 1.0 (0x3f800000), predicated to skip the
          * channels whose value was 0.
          */
         inst = emit(OR(tmp, src_reg(tmp), brw_imm_ud(0x3f800000u)));
         inst->predicate = BRW_PREDICATE_NORMAL;

         /* Now convert the result from float to double */
         emit_conversion_to_double(dst, retype(src_reg(tmp),
                                               BRW_REGISTER_TYPE_F));
      }
      break;

   case nir_op_ishl:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      try_immediate_source(instr, op, false);
      emit(SHL(dst, op[0], op[1]));
      break;

   case nir_op_ishr:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      try_immediate_source(instr, op, false);
      emit(ASR(dst, op[0], op[1]));
      break;

   case nir_op_ushr:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      try_immediate_source(instr, op, false);
      emit(SHR(dst, op[0], op[1]));
      break;

   case nir_op_ffma:
      if (type_sz(dst.type) == 8) {
         dst_reg mul_dst = dst_reg(this, glsl_type::dvec4_type);
         emit(MUL(mul_dst, op[1], op[0]));
         inst = emit(ADD(dst, src_reg(mul_dst), op[2]));
      } else {
         fix_float_operands(op, instr);
         inst = emit(MAD(dst, op[2], op[1], op[0]));
      }
      break;

   case nir_op_flrp:
      fix_float_operands(op, instr);
      inst = emit(LRP(dst, op[2], op[1], op[0]));
      break;

   case nir_op_b32csel:
      enum brw_predicate predicate;
      if (!optimize_predicate(instr, &predicate)) {
         emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ));
         switch (dst.writemask) {
         case WRITEMASK_X:
            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_X;
            break;
         case WRITEMASK_Y:
            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Y;
            break;
         case WRITEMASK_Z:
            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Z;
            break;
         case WRITEMASK_W:
            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_W;
            break;
         default:
            predicate = BRW_PREDICATE_NORMAL;
            break;
         }
      }
      inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]);
      inst->predicate = predicate;
      break;

   case nir_op_fdot2_replicated:
      try_immediate_source(instr, op, true);
      inst = emit(BRW_OPCODE_DP2, dst, op[0], op[1]);
      break;

   case nir_op_fdot3_replicated:
      try_immediate_source(instr, op, true);
      inst = emit(BRW_OPCODE_DP3, dst, op[0], op[1]);
      break;

   case nir_op_fdot4_replicated:
      try_immediate_source(instr, op, true);
      inst = emit(BRW_OPCODE_DP4, dst, op[0], op[1]);
      break;

   case nir_op_fdph_replicated:
      try_immediate_source(instr, op, false);
      inst = emit(BRW_OPCODE_DPH, dst, op[0], op[1]);
      break;

   case nir_op_fdiv:
      unreachable("not reached: should be lowered by lower_fdiv in the compiler");

   case nir_op_fmod:
      unreachable("not reached: should be lowered by lower_fmod in the compiler");

   case nir_op_fsub:
   case nir_op_isub:
      unreachable("not reached: should be handled by ir_sub_to_add_neg");

   default:
      unreachable("Unimplemented ALU operation");
   }
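   /* The -(x & 1) trick below relies on two's complement arithmetic: AND
    * with 1 isolates the low bit, and the negated source on the final MOV
    * maps 1 to ~0 while leaving 0 untouched, producing the 0/~0 boolean
    * encoding NIR consumers expect.
    */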
   /* If we need to do a boolean resolve, replace the result with -(x & 1)
    * to sign extend the low bit to 0/~0
    */
   if (devinfo->ver <= 5 &&
       (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) ==
       BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
      dst_reg masked = dst_reg(this, glsl_type::int_type);
      masked.writemask = dst.writemask;
      emit(AND(masked, src_reg(dst), brw_imm_d(1)));
      src_reg masked_neg = src_reg(masked);
      masked_neg.negate = true;
      emit(MOV(retype(dst, BRW_REGISTER_TYPE_D), masked_neg));
   }
}

void
vec4_visitor::nir_emit_jump(nir_jump_instr *instr)
{
   switch (instr->type) {
   case nir_jump_break:
      emit(BRW_OPCODE_BREAK);
      break;

   case nir_jump_continue:
      emit(BRW_OPCODE_CONTINUE);
      break;

   case nir_jump_return:
      FALLTHROUGH;
   default:
      unreachable("unknown jump");
   }
}

static bool
is_high_sampler(const struct intel_device_info *devinfo, src_reg sampler)
{
   if (devinfo->verx10 != 75)
      return false;

   return sampler.file != IMM || sampler.ud >= 16;
}

void
vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
{
   unsigned texture = instr->texture_index;
   unsigned sampler = instr->sampler_index;
   src_reg texture_reg = brw_imm_ud(texture);
   src_reg sampler_reg = brw_imm_ud(sampler);
   src_reg coordinate;
   const glsl_type *coord_type = NULL;
   src_reg shadow_comparator;
   src_reg offset_value;
   src_reg lod, lod2;
   src_reg sample_index;
   src_reg mcs;

   dst_reg dest = get_nir_dest(instr->dest, instr->dest_type);

   /* The hardware requires a LOD for buffer textures */
   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
      lod = brw_imm_d(0);

   /* Load the texture operation sources */
   uint32_t constant_offset = 0;
   for (unsigned i = 0; i < instr->num_srcs; i++) {
      switch (instr->src[i].src_type) {
      case nir_tex_src_comparator:
         shadow_comparator = get_nir_src(instr->src[i].src,
                                         BRW_REGISTER_TYPE_F, 1);
         break;

      case nir_tex_src_coord: {
         unsigned src_size = nir_tex_instr_src_size(instr, i);

         switch (instr->op) {
         case nir_texop_txf:
         case nir_texop_txf_ms:
         case nir_texop_samples_identical:
            coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D,
                                     src_size);
            coord_type = glsl_type::ivec(src_size);
            break;

         default:
            coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F,
                                     src_size);
            coord_type = glsl_type::vec(src_size);
            break;
         }
         break;
      }

      case nir_tex_src_ddx:
         lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F,
                           nir_tex_instr_src_size(instr, i));
         break;

      case nir_tex_src_ddy:
         lod2 = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F,
                            nir_tex_instr_src_size(instr, i));
         break;

      case nir_tex_src_lod:
         switch (instr->op) {
         case nir_texop_txs:
         case nir_texop_txf:
            lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1);
            break;

         default:
            lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, 1);
            break;
         }
         break;

      case nir_tex_src_ms_index: {
         sample_index = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1);
         break;
      }
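      /* Texel offsets: an immediate offset is packed by brw_texture_offset()
       * into constant_offset and ends up in the message header, so only a
       * non-constant offset needs to be kept in a register here, and the
       * gather (TG4_OFFSET) path below is its only consumer.
       */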
      case nir_tex_src_offset:
         if (!brw_texture_offset(instr, i, &constant_offset)) {
            offset_value =
               get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 2);
         }
         break;

      case nir_tex_src_texture_offset: {
         /* Emit code to evaluate the actual indexing expression */
         src_reg src = get_nir_src(instr->src[i].src, 1);
         src_reg temp(this, glsl_type::uint_type);
         emit(ADD(dst_reg(temp), src, brw_imm_ud(texture)));
         texture_reg = emit_uniformize(temp);
         break;
      }

      case nir_tex_src_sampler_offset: {
         /* Emit code to evaluate the actual indexing expression */
         src_reg src = get_nir_src(instr->src[i].src, 1);
         src_reg temp(this, glsl_type::uint_type);
         emit(ADD(dst_reg(temp), src, brw_imm_ud(sampler)));
         sampler_reg = emit_uniformize(temp);
         break;
      }

      case nir_tex_src_projector:
         unreachable("Should be lowered by nir_lower_tex");

      case nir_tex_src_bias:
         unreachable("LOD bias is not valid for vertex shaders.\n");

      default:
         unreachable("unknown texture source");
      }
   }

   if (instr->op == nir_texop_txf_ms ||
       instr->op == nir_texop_samples_identical) {
      assert(coord_type != NULL);
      if (devinfo->ver >= 7 &&
          key_tex->compressed_multisample_layout_mask & (1 << texture)) {
         mcs = emit_mcs_fetch(coord_type, coordinate, texture_reg);
      } else {
         mcs = brw_imm_ud(0u);
      }
   }

   /* Stuff the channel select bits in the top of the texture offset */
   if (instr->op == nir_texop_tg4) {
      if (instr->component == 1 &&
          (key_tex->gather_channel_quirk_mask & (1 << texture))) {
         /* gather4 sampler is broken for green channel on RG32F --
          * we must ask for blue instead.
          */
         constant_offset |= 2 << 16;
      } else {
         constant_offset |= instr->component << 16;
      }
   }

   enum opcode opcode;
   switch (instr->op) {
   case nir_texop_tex:             opcode = SHADER_OPCODE_TXL;        break;
   case nir_texop_txl:             opcode = SHADER_OPCODE_TXL;        break;
   case nir_texop_txd:             opcode = SHADER_OPCODE_TXD;        break;
   case nir_texop_txf:             opcode = SHADER_OPCODE_TXF;        break;
   case nir_texop_txf_ms:          opcode = SHADER_OPCODE_TXF_CMS;    break;
   case nir_texop_txs:             opcode = SHADER_OPCODE_TXS;        break;
   case nir_texop_query_levels:    opcode = SHADER_OPCODE_TXS;        break;
   case nir_texop_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
   case nir_texop_tg4:
      opcode = offset_value.file != BAD_FILE ? SHADER_OPCODE_TG4_OFFSET
                                             : SHADER_OPCODE_TG4;
      break;
   case nir_texop_samples_identical: {
      /* There are some challenges implementing this for vec4, and it seems
       * unlikely to be used anyway.  For now, just always return false.
       */
      emit(MOV(dest, brw_imm_ud(0u)));
      return;
   }
   case nir_texop_txb:
   case nir_texop_lod:
      unreachable("Implicit LOD is only valid inside fragment shaders.");
   default:
      unreachable("Unrecognized tex op");
   }
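   /* Everything from here on assembles the sampler message itself: src[1]
    * and src[2] carry the surface and sampler indices, and the texturing
    * parameters are copied into consecutive MRFs starting right after the
    * (optional) header, with mlen tracking how large the payload gets.
    */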
   vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);

   inst->offset = constant_offset;

   /* The message header is necessary for:
    * - Gfx4 (always)
    * - Texel offsets
    * - Gather channel selection
    * - Sampler indices too large to fit in a 4-bit value.
    * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
    */
   inst->header_size =
      (devinfo->ver < 5 ||
       inst->offset != 0 ||
       opcode == SHADER_OPCODE_TG4 ||
       opcode == SHADER_OPCODE_TG4_OFFSET ||
       opcode == SHADER_OPCODE_SAMPLEINFO ||
       is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
   inst->base_mrf = 2;
   inst->mlen = inst->header_size;
   inst->dst.writemask = WRITEMASK_XYZW;
   inst->shadow_compare = shadow_comparator.file != BAD_FILE;

   inst->src[1] = texture_reg;
   inst->src[2] = sampler_reg;

   /* MRF for the first parameter */
   int param_base = inst->base_mrf + inst->header_size;

   if (opcode == SHADER_OPCODE_TXS) {
      int writemask = devinfo->ver == 4 ? WRITEMASK_W : WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
      inst->mlen++;
   } else if (opcode == SHADER_OPCODE_SAMPLEINFO) {
      inst->dst.writemask = WRITEMASK_X;
   } else {
      /* Load the coordinate */
      /* FINISHME: gl_clamp_mask and saturate */
      int coord_mask = (1 << instr->coord_components) - 1;
      int zero_mask = 0xf & ~coord_mask;

      emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
               coordinate));
      inst->mlen++;

      if (zero_mask != 0) {
         emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
                  brw_imm_d(0)));
      }

      /* Load the shadow comparator */
      if (shadow_comparator.file != BAD_FILE &&
          opcode != SHADER_OPCODE_TXD &&
          opcode != SHADER_OPCODE_TG4_OFFSET) {
         emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
                          WRITEMASK_X),
                  shadow_comparator));
         inst->mlen++;
      }

      /* Load the LOD info */
      switch (opcode) {
      case SHADER_OPCODE_TXL: {
         int mrf, writemask;
         if (devinfo->ver >= 5) {
            mrf = param_base + 1;
            if (shadow_comparator.file != BAD_FILE) {
               writemask = WRITEMASK_Y;
               /* mlen already incremented */
            } else {
               writemask = WRITEMASK_X;
               inst->mlen++;
            }
         } else /* devinfo->ver == 4 */ {
            mrf = param_base;
            writemask = WRITEMASK_W;
         }
         emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
         break;
      }

      case SHADER_OPCODE_TXF:
         emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
         break;

      case SHADER_OPCODE_TXF_CMS:
         emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
                  sample_index));
         if (devinfo->ver >= 7) {
            /* MCS data is in the first channel of `mcs`, but we need to get
             * it into the .y channel of the second vec4 of params, so
             * replicate .x across the whole vec4 and then mask off everything
             * except .y
             */
            mcs.swizzle = BRW_SWIZZLE_XXXX;
            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type,
                             WRITEMASK_Y),
                     mcs));
         }
         inst->mlen++;
         break;
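      /* For TXD the gradients travel in the LOD slots: on Gfx5+ the X/Y
       * components of dPdx and dPdy are interleaved into one MRF as
       * (dPdx.x, dPdy.x, dPdx.y, dPdy.y), with a third component and the
       * shadow comparator spilling into the next MRF; Gfx4 simply uses one
       * full MRF per gradient vector.
       */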
      case SHADER_OPCODE_TXD: {
         const brw_reg_type type = lod.type;

         if (devinfo->ver >= 5) {
            lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
            inst->mlen++;

            if (nir_tex_instr_dest_size(instr) == 3 ||
                shadow_comparator.file != BAD_FILE) {
               lod.swizzle = BRW_SWIZZLE_ZZZZ;
               lod2.swizzle = BRW_SWIZZLE_ZZZZ;
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
               inst->mlen++;

               if (shadow_comparator.file != BAD_FILE) {
                  emit(MOV(dst_reg(MRF, param_base + 2,
                                   shadow_comparator.type, WRITEMASK_Z),
                           shadow_comparator));
               }
            }
         } else /* devinfo->ver == 4 */ {
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
            inst->mlen += 2;
         }
         break;
      }

      case SHADER_OPCODE_TG4_OFFSET:
         if (shadow_comparator.file != BAD_FILE) {
            emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type,
                             WRITEMASK_W),
                     shadow_comparator));
         }

         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type,
                          WRITEMASK_XY),
                  offset_value));
         inst->mlen++;
         break;

      default:
         break;
      }
   }

   emit(inst);

   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
    * spec requires layers.
    */
   if (instr->op == nir_texop_txs && devinfo->ver < 7) {
      /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */
      emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
                  src_reg(inst->dst), brw_imm_d(1));
   }

   if (instr->op == nir_texop_query_levels) {
      /* # levels is in .w */
      src_reg swizzled(dest);
      swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
                                      SWIZZLE_W, SWIZZLE_W);
      emit(MOV(dest, swizzled));
   }
}

src_reg
vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
                             src_reg coordinate, src_reg surface)
{
   vec4_instruction *inst =
      new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
                                    dst_reg(this, glsl_type::uvec4_type));
   inst->base_mrf = 2;
   inst->src[1] = surface;
   inst->src[2] = brw_imm_ud(0); /* sampler */
   inst->mlen = 1;

   const int param_base = inst->base_mrf;

   /* parameters are: u, v, r, lod; lod will always be zero due to API
    * restrictions.
    */
   int coord_mask = (1 << coordinate_type->vector_elements) - 1;
   int zero_mask = 0xf & ~coord_mask;

   emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
            coordinate));

   emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
            brw_imm_d(0)));

   emit(inst);
   return src_reg(inst->dst);
}

void
vec4_visitor::nir_emit_undef(nir_ssa_undef_instr *instr)
{
   nir_ssa_values[instr->def.index] =
      dst_reg(VGRF, alloc.allocate(DIV_ROUND_UP(instr->def.bit_size, 32)));
}

/* SIMD4x2 64-bit data is stored in register space like this:
 *
 * r0.0:DF  x0 y0 z0 w0
 * r1.0:DF  x1 y1 z1 w1
 *
 * When we need to write data such as this to memory using 32-bit write
 * messages we need to shuffle it in this fashion:
 *
 * r0.0:DF  x0 y0 x1 y1 (to be written at base offset)
 * r1.0:DF  z0 w0 z1 w1 (to be written at base offset + 16)
 *
 * We need to do the inverse operation when we read using 32-bit messages,
 * which we can do by applying the exact same shuffling to the 64-bit data
 * we read, except that, because the data for each vertex is positioned
 * differently, we need to apply different channel enables.
 *
 * This function takes 64-bit data and shuffles it as explained above.
 *
 * The @for_write parameter specifies whether we are shuffling proper SIMD4x2
 * 64-bit data prior to a 32-bit write message (for_write = true), or doing
 * the inverse operation on 64-bit data we have just read using 32-bit
 * messages, in order to recreate valid SIMD4x2 64-bit data
 * (for_write = false).
 *
 * If @block and @ref are non-NULL, then the shuffling is done after @ref,
 * otherwise the instructions are emitted normally at the end.  The function
 * returns the last instruction inserted.
 *
 * Notice that @src and @dst cannot be the same register.
 */
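/* Implementation note: the shuffle below is done with four SIMD4-wide MOVs.
 * The group index passed to group() selects which half of the SIMD4x2
 * execution (i.e. which vertex's channel enables) each MOV uses, and that is
 * the only thing that differs between the write (for_write = true) and read
 * (for_write = false) directions.
 */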
vec4_instruction *
vec4_visitor::shuffle_64bit_data(dst_reg dst, src_reg src, bool for_write,
                                 bool for_scratch,
                                 bblock_t *block, vec4_instruction *ref)
{
   assert(type_sz(src.type) == 8);
   assert(type_sz(dst.type) == 8);
   assert(!regions_overlap(dst, 2 * REG_SIZE, src, 2 * REG_SIZE));
   assert(!ref == !block);

   opcode mov_op = for_scratch ? VEC4_OPCODE_MOV_FOR_SCRATCH : BRW_OPCODE_MOV;

   const vec4_builder bld = !ref ? vec4_builder(this).at_end() :
                            vec4_builder(this).at(block, ref->next);

   /* Resolve swizzle in src */
   if (src.swizzle != BRW_SWIZZLE_XYZW) {
      dst_reg data = dst_reg(this, glsl_type::dvec4_type);
      bld.emit(mov_op, data, src);
      src = src_reg(data);
   }

   /* dst+0.XY = src+0.XY */
   bld.group(4, 0).emit(mov_op, writemask(dst, WRITEMASK_XY), src);

   /* dst+0.ZW = src+1.XY */
   bld.group(4, for_write ? 1 : 0)
      .emit(mov_op, writemask(dst, WRITEMASK_ZW),
            swizzle(byte_offset(src, REG_SIZE), BRW_SWIZZLE_XYXY));

   /* dst+1.XY = src+0.ZW */
   bld.group(4, for_write ? 0 : 1)
      .emit(mov_op, writemask(byte_offset(dst, REG_SIZE), WRITEMASK_XY),
            swizzle(src, BRW_SWIZZLE_ZWZW));

   /* dst+1.ZW = src+1.ZW */
   return bld.group(4, 1)
      .emit(mov_op, writemask(byte_offset(dst, REG_SIZE), WRITEMASK_ZW),
            byte_offset(src, REG_SIZE));
}

} /* namespace brw */