/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/glsl/ir.h"
#include "brw_fs.h"
#include "brw_nir.h"
#include "brw_rt.h"
#include "brw_eu.h"
#include "nir_search_helpers.h"
#include "util/u_math.h"
#include "util/bitscan.h"

using namespace brw;

void
fs_visitor::emit_nir_code()
{
   emit_shader_float_controls_execution_mode();

   /* emit the arrays used for inputs and outputs - load/store intrinsics will
    * be converted to reads/writes of these arrays
    */
   nir_setup_outputs();
   nir_setup_uniforms();
   nir_emit_system_values();
   last_scratch = ALIGN(nir->scratch_size, 4) * dispatch_width;

   nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));

   bld.emit(SHADER_OPCODE_HALT_TARGET);
}

void
fs_visitor::nir_setup_outputs()
{
   if (stage == MESA_SHADER_TESS_CTRL ||
       stage == MESA_SHADER_TASK ||
       stage == MESA_SHADER_MESH ||
       stage == MESA_SHADER_FRAGMENT)
      return;

   unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };

   /* Calculate the size of output registers in a separate pass, before
    * allocating them. With ARB_enhanced_layouts, multiple output variables
    * may occupy the same slot, but have different type sizes.
    */
   nir_foreach_shader_out_variable(var, nir) {
      const int loc = var->data.driver_location;
      const unsigned var_vec4s =
         var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
                           : type_size_vec4(var->type, true);
      vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
   }

   for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
      if (vec4s[loc] == 0) {
         loc++;
         continue;
      }

      unsigned reg_size = vec4s[loc];

      /* Check if there are any ranges that start within this range and extend
       * past it. If so, include them in this allocation.
       */
      for (unsigned i = 1; i < reg_size; i++) {
         assert(i + loc < ARRAY_SIZE(vec4s));
         reg_size = MAX2(vec4s[i + loc] + i, reg_size);
      }

      fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size);
      for (unsigned i = 0; i < reg_size; i++) {
         assert(loc + i < ARRAY_SIZE(outputs));
         outputs[loc + i] = offset(reg, bld, 4 * i);
      }

      loc += reg_size;
   }
}

void
fs_visitor::nir_setup_uniforms()
{
   /* Only the first compile gets to set up uniforms. */
   if (push_constant_loc)
      return;

   uniforms = nir->num_uniforms / 4;

   if (gl_shader_stage_is_compute(stage) && devinfo->verx10 < 125) {
      /* Add uniforms for builtins after regular NIR uniforms. */
      assert(uniforms == prog_data->nr_params);

      uint32_t *param;
      if (nir->info.workgroup_size_variable &&
          compiler->lower_variable_group_size) {
         param = brw_stage_prog_data_add_params(prog_data, 3);
         for (unsigned i = 0; i < 3; i++) {
            param[i] = (BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i);
            group_size[i] = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
         }
      }

      /* Subgroup ID must be the last uniform on the list. This will make it
       * easier later to split between cross thread and per thread
       * uniforms.
       */
      param = brw_stage_prog_data_add_params(prog_data, 1);
      *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
      subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
   }
}

static bool
emit_system_values_block(nir_block *block, fs_visitor *v)
{
   fs_reg *reg;

   nir_foreach_instr(instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_vertex_id:
      case nir_intrinsic_load_base_vertex:
         unreachable("should be lowered by nir_lower_system_values().");

      case nir_intrinsic_load_vertex_id_zero_base:
      case nir_intrinsic_load_is_indexed_draw:
      case nir_intrinsic_load_first_vertex:
      case nir_intrinsic_load_instance_id:
      case nir_intrinsic_load_base_instance:
         unreachable("should be lowered by brw_nir_lower_vs_inputs().");
         break;

      case nir_intrinsic_load_draw_id:
         /* For Task/Mesh, draw_id will be handled later in
          * nir_emit_mesh_task_intrinsic().
          */
         if (!gl_shader_stage_is_mesh(v->stage))
            unreachable("should be lowered by brw_nir_lower_vs_inputs().");
         break;

      case nir_intrinsic_load_invocation_id:
         if (v->stage == MESA_SHADER_TESS_CTRL)
            break;
         assert(v->stage == MESA_SHADER_GEOMETRY);
         reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
         if (reg->file == BAD_FILE) {
            const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
            fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
            fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
            abld.SHR(iid, g1, brw_imm_ud(27u));
            *reg = iid;
         }
         break;

      case nir_intrinsic_load_sample_pos:
      case nir_intrinsic_load_sample_pos_or_center:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
         if (reg->file == BAD_FILE)
            *reg = v->emit_samplepos_setup();
         break;

      case nir_intrinsic_load_sample_id:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
         if (reg->file == BAD_FILE)
            *reg = v->emit_sampleid_setup();
         break;

      case nir_intrinsic_load_sample_mask_in:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         assert(v->devinfo->ver >= 7);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
         if (reg->file == BAD_FILE)
            *reg = v->emit_samplemaskin_setup();
         break;

      case nir_intrinsic_load_workgroup_id:
         assert(gl_shader_stage_uses_workgroup(v->stage));
         reg = &v->nir_system_values[SYSTEM_VALUE_WORKGROUP_ID];
         if (reg->file == BAD_FILE)
            *reg = v->emit_work_group_id_setup();
         break;

      case nir_intrinsic_load_helper_invocation:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
         if (reg->file == BAD_FILE) {
            const fs_builder abld =
               v->bld.annotate("gl_HelperInvocation", NULL);

            /* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the
             * pixel mask is in g1.7 of the thread payload.
             *
             * We move the per-channel pixel enable bit to the low bit of each
             * channel by shifting the byte containing the pixel mask by the
             * vector immediate 0x76543210UV.
             *
             * The region of <1,8,0> reads only 1 byte (the pixel masks for
             * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
             * masks for 2 and 3) in SIMD16.
             */
            fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);

            for (unsigned i = 0; i < DIV_ROUND_UP(v->dispatch_width, 16); i++) {
               const fs_builder hbld = abld.group(MIN2(16, v->dispatch_width), i);
               hbld.SHR(offset(shifted, hbld, i),
                        stride(retype(brw_vec1_grf(1 + i, 7),
                                      BRW_REGISTER_TYPE_UB),
                               1, 8, 0),
                        brw_imm_v(0x76543210));
            }

            /* A set bit in the pixel mask means the channel is enabled, but
             * that is the opposite of gl_HelperInvocation so we need to invert
             * the mask.
             *
             * The negate source-modifier bit of logical instructions on Gfx8+
             * performs 1's complement negation, so we can use that instead of
             * a NOT instruction.
             */
            fs_reg inverted = negate(shifted);
            if (v->devinfo->ver < 8) {
               inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
               abld.NOT(inverted, shifted);
            }

            /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
             * with 1 and negating.
             */
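            /* For example, an enabled channel has its pixel-mask bit set, so
             * the inverted value has a low bit of 0, the AND below gives 0,
             * and negating leaves 0 (not a helper invocation).  A disabled
             * channel gives 1 after the AND, and negating yields ~0 (true).
             */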
            fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
            abld.AND(anded, inverted, brw_imm_uw(1));

            fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
            abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
            *reg = dst;
         }
         break;

      case nir_intrinsic_load_frag_shading_rate:
         reg = &v->nir_system_values[SYSTEM_VALUE_FRAG_SHADING_RATE];
         if (reg->file == BAD_FILE)
            *reg = v->emit_shading_rate_setup();
         break;

      default:
         break;
      }
   }

   return true;
}

void
fs_visitor::nir_emit_system_values()
{
   nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
   for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
      nir_system_values[i] = fs_reg();
   }

   /* Always emit SUBGROUP_INVOCATION. Dead code will clean it up if we
    * never end up using it.
    */
   {
      const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
      fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
      reg = abld.vgrf(BRW_REGISTER_TYPE_UW);

      const fs_builder allbld8 = abld.group(8, 0).exec_all();
      allbld8.MOV(reg, brw_imm_v(0x76543210));
      if (dispatch_width > 8)
         allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
      if (dispatch_width > 16) {
         const fs_builder allbld16 = abld.group(16, 0).exec_all();
         allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
      }
   }

   nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)nir);
   nir_foreach_block(block, impl)
      emit_system_values_block(block, this);
}

void
fs_visitor::nir_emit_impl(nir_function_impl *impl)
{
   nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
   for (unsigned i = 0; i < impl->reg_alloc; i++) {
      nir_locals[i] = fs_reg();
   }

   foreach_list_typed(nir_register, reg, node, &impl->registers) {
      unsigned array_elems =
         reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
      unsigned size = array_elems * reg->num_components;
      const brw_reg_type reg_type = reg->bit_size == 8 ? BRW_REGISTER_TYPE_B :
         brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
      nir_locals[reg->index] = bld.vgrf(reg_type, size);
   }

   nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
                             impl->ssa_alloc);

   nir_emit_cf_list(&impl->body);
}

void
fs_visitor::nir_emit_cf_list(exec_list *list)
{
   exec_list_validate(list);
   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_if:
         nir_emit_if(nir_cf_node_as_if(node));
         break;

      case nir_cf_node_loop:
         nir_emit_loop(nir_cf_node_as_loop(node));
         break;

      case nir_cf_node_block:
         nir_emit_block(nir_cf_node_as_block(node));
         break;

      default:
         unreachable("Invalid CFG node block");
      }
   }
}

void
fs_visitor::nir_emit_if(nir_if *if_stmt)
{
   bool invert;
   fs_reg cond_reg;

   /* If the condition has the form !other_condition, use other_condition as
    * the source, but invert the predicate on the if instruction.
    */
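   /* For example, "if (!cond)" is emitted as an IF predicated on the same
    * flag result with predicate_inverse set, rather than emitting a
    * separate NOT of the condition first.
    */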
   nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
   if (cond != NULL && cond->op == nir_op_inot) {
      invert = true;
      cond_reg = get_nir_src(cond->src[0].src);
      cond_reg = offset(cond_reg, bld, cond->src[0].swizzle[0]);
   } else {
      invert = false;
      cond_reg = get_nir_src(if_stmt->condition);
   }

   /* first, put the condition into f0 */
   fs_inst *inst = bld.MOV(bld.null_reg_d(),
                           retype(cond_reg, BRW_REGISTER_TYPE_D));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;

   bld.IF(BRW_PREDICATE_NORMAL)->predicate_inverse = invert;

   nir_emit_cf_list(&if_stmt->then_list);

   if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
      bld.emit(BRW_OPCODE_ELSE);
      nir_emit_cf_list(&if_stmt->else_list);
   }

   bld.emit(BRW_OPCODE_ENDIF);

   if (devinfo->ver < 7)
      limit_dispatch_width(16, "Non-uniform control flow unsupported "
                           "in SIMD32 mode.");
}

void
fs_visitor::nir_emit_loop(nir_loop *loop)
{
   bld.emit(BRW_OPCODE_DO);

   nir_emit_cf_list(&loop->body);

   bld.emit(BRW_OPCODE_WHILE);

   if (devinfo->ver < 7)
      limit_dispatch_width(16, "Non-uniform control flow unsupported "
                           "in SIMD32 mode.");
}

void
fs_visitor::nir_emit_block(nir_block *block)
{
   nir_foreach_instr(instr, block) {
      nir_emit_instr(instr);
   }
}

void
fs_visitor::nir_emit_instr(nir_instr *instr)
{
   const fs_builder abld = bld.annotate(NULL, instr);

   switch (instr->type) {
   case nir_instr_type_alu:
      nir_emit_alu(abld, nir_instr_as_alu(instr), true);
      break;

   case nir_instr_type_deref:
      unreachable("All derefs should've been lowered");
      break;

   case nir_instr_type_intrinsic:
      switch (stage) {
      case MESA_SHADER_VERTEX:
         nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_TESS_CTRL:
         nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_TESS_EVAL:
         nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_GEOMETRY:
         nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_FRAGMENT:
         nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_COMPUTE:
      case MESA_SHADER_KERNEL:
         nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_RAYGEN:
      case MESA_SHADER_ANY_HIT:
      case MESA_SHADER_CLOSEST_HIT:
      case MESA_SHADER_MISS:
      case MESA_SHADER_INTERSECTION:
      case MESA_SHADER_CALLABLE:
         nir_emit_bs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_TASK:
         nir_emit_task_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_MESH:
         nir_emit_mesh_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      default:
         unreachable("unsupported shader stage");
      }
      break;

   case nir_instr_type_tex:
      nir_emit_texture(abld, nir_instr_as_tex(instr));
      break;

   case nir_instr_type_load_const:
      nir_emit_load_const(abld, nir_instr_as_load_const(instr));
      break;

   case nir_instr_type_ssa_undef:
      /* We create a new VGRF for undefs on every use (by handling
       * them in get_nir_src()), rather than for each definition.
       * This helps register coalescing eliminate MOVs from undef.
       */
      break;

   case nir_instr_type_jump:
      nir_emit_jump(abld, nir_instr_as_jump(instr));
      break;

   default:
      unreachable("unknown instruction type");
   }
}

/**
 * Recognizes a parent instruction of nir_op_extract_* and changes the type to
 * match instr.
 */
bool
fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
                                      const fs_reg &result)
{
   if (!instr->src[0].src.is_ssa ||
       !instr->src[0].src.ssa->parent_instr)
      return false;

   if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *src0 =
      nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);

   if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
       src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
      return false;

   unsigned element = nir_src_as_uint(src0->src[1].src);

   /* Element type to extract. */
   const brw_reg_type type = brw_int_type(
      src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
      src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);

   fs_reg op0 = get_nir_src(src0->src[0].src);
   op0.type = brw_type_for_nir_type(devinfo,
      (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
                     nir_src_bit_size(src0->src[0].src)));
   op0 = offset(op0, bld, src0->src[0].swizzle[0]);

   bld.MOV(result, subscript(op0, type, element));
   return true;
}

bool
fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
                                         const fs_reg &result)
{
   nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
   if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
      return false;

   if (!nir_src_is_const(instr->src[1].src) ||
       !nir_src_is_const(instr->src[2].src))
      return false;

   const float value1 = nir_src_as_float(instr->src[1].src);
   const float value2 = nir_src_as_float(instr->src[2].src);
   if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
      return false;

   /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
   assert(value1 == -value2);

   fs_reg tmp = vgrf(glsl_type::int_type);

   if (devinfo->ver >= 12) {
      /* Bit 15 of g1.1 is 0 if the polygon is front facing. */
      fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));

      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
       *
       *    or(8)  tmp.1<2>W  g1.1<0,1,0>W  0x00003f80W
       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
       *
       * and negate g1.1<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
       */
      if (value1 == -1.0f)
         g1.negate = true;

      bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
             g1, brw_imm_uw(0x3f80));
   } else if (devinfo->ver >= 6) {
      /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));

      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
       *
       *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
       *
       * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
       *
       * This negation looks like it's safe in practice, because bits 0:4 will
       * surely be TRIANGLES
       */

      if (value1 == -1.0f) {
         g0.negate = true;
      }

      bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
             g0, brw_imm_uw(0x3f80));
   } else {
      /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));

      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
       *
       *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
       *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
       *
       * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
       *
       * This negation looks like it's safe in practice, because bits 0:4 will
       * surely be TRIANGLES
       */

      if (value1 == -1.0f) {
         g1_6.negate = true;
      }

      bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
   }
   bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));

   return true;
}

static void
emit_find_msb_using_lzd(const fs_builder &bld,
                        const fs_reg &result,
                        const fs_reg &src,
                        bool is_signed)
{
   fs_inst *inst;
   fs_reg temp = src;

   if (is_signed) {
      /* LZD of an absolute value source almost always does the right
       * thing. There are two problem values:
       *
       * * 0x80000000. Since abs(0x80000000) == 0x80000000, LZD returns
       *   0. However, findMSB(int(0x80000000)) == 30.
       *
       * * 0xffffffff. Since abs(0xffffffff) == 1, LZD returns
       *   31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
       *
       *      For a value of zero or negative one, -1 will be returned.
       *
       * * Negative powers of two. LZD(abs(-(1<<x))) returns x, but
       *   findMSB(-(1<<x)) should return x-1.
       *
       * For all negative number cases, including 0x80000000 and
       * 0xffffffff, the correct value is obtained from LZD if instead of
       * negating the (already negative) value the logical-not is used. A
       * conditional logical-not can be achieved in two instructions.
       */
      temp = bld.vgrf(BRW_REGISTER_TYPE_D);

      bld.ASR(temp, src, brw_imm_d(31));
      bld.XOR(temp, temp, src);
   }

   bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
           retype(temp, BRW_REGISTER_TYPE_UD));

   /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
    * from the LSB side. Subtract the result from 31 to convert the MSB
    * count into an LSB count. If no bits are set, LZD will return 32.
    * 31-32 = -1, which is exactly what findMSB() is supposed to return.
    */
   inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
   inst->src[0].negate = true;
}

static brw_rnd_mode
brw_rnd_mode_from_nir_op (const nir_op op) {
   switch (op) {
   case nir_op_f2f16_rtz:
      return BRW_RND_MODE_RTZ;
   case nir_op_f2f16_rtne:
      return BRW_RND_MODE_RTNE;
   default:
      unreachable("Operation doesn't support rounding mode");
   }
}

static brw_rnd_mode
brw_rnd_mode_from_execution_mode(unsigned execution_mode)
{
   if (nir_has_any_rounding_mode_rtne(execution_mode))
      return BRW_RND_MODE_RTNE;
   if (nir_has_any_rounding_mode_rtz(execution_mode))
      return BRW_RND_MODE_RTZ;
   return BRW_RND_MODE_UNSPECIFIED;
}

fs_reg
fs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld,
                                                nir_alu_instr *instr,
                                                fs_reg *op,
                                                bool need_dest)
{
   fs_reg result =
      need_dest ? get_nir_dest(instr->dest.dest) : bld.null_reg_ud();

   result.type = brw_type_for_nir_type(devinfo,
      (nir_alu_type)(nir_op_infos[instr->op].output_type |
                     nir_dest_bit_size(instr->dest.dest)));

   assert(!instr->dest.saturate);

   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
      /* We don't lower to source modifiers so they should not exist. */
      assert(!instr->src[i].abs);
      assert(!instr->src[i].negate);

      op[i] = get_nir_src(instr->src[i].src);
      op[i].type = brw_type_for_nir_type(devinfo,
         (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
                        nir_src_bit_size(instr->src[i].src)));
   }

   /* Move and vecN instructions may still be vectored. Return the raw,
    * vectored source and destination so that fs_visitor::nir_emit_alu can
    * handle it. Other callers should not have to handle these kinds of
    * instructions.
    */
   switch (instr->op) {
   case nir_op_mov:
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
   case nir_op_vec8:
   case nir_op_vec16:
      return result;
   default:
      break;
   }

   /* At this point, we have dealt with any instruction that operates on
    * more than a single channel. Therefore, we can just adjust the source
    * and destination registers for that channel and emit the instruction.
    */
   unsigned channel = 0;
   if (nir_op_infos[instr->op].output_size == 0) {
      /* Since NIR is doing the scalarizing for us, we should only ever see
       * vectorized operations with a single channel.
       */
      assert(util_bitcount(instr->dest.write_mask) == 1);
      channel = ffs(instr->dest.write_mask) - 1;

      result = offset(result, bld, channel);
   }

   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
      assert(nir_op_infos[instr->op].input_sizes[i] < 2);
      op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
   }

   return result;
}

void
fs_visitor::resolve_inot_sources(const fs_builder &bld, nir_alu_instr *instr,
                                 fs_reg *op)
{
   for (unsigned i = 0; i < 2; i++) {
      nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);

      if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
         /* The source of the inot is now the source of instr. */
         prepare_alu_destination_and_sources(bld, inot_instr, &op[i], false);

         assert(!op[i].negate);
         op[i].negate = true;
      } else {
         op[i] = resolve_source_modifiers(op[i]);
      }
   }
}

bool
fs_visitor::try_emit_b2fi_of_inot(const fs_builder &bld,
                                  fs_reg result,
                                  nir_alu_instr *instr)
{
   if (devinfo->ver < 6 || devinfo->verx10 >= 125)
      return false;

   nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);

   if (inot_instr == NULL || inot_instr->op != nir_op_inot)
      return false;

   /* HF is also possible as a destination on BDW+. For nir_op_b2i, the set
    * of valid size-changing combinations is a bit more complex.
    *
    * The source restriction is just because I was lazy about generating the
    * constant below.
    */
   if (nir_dest_bit_size(instr->dest.dest) != 32 ||
       nir_src_bit_size(inot_instr->src[0].src) != 32)
      return false;

   /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0. Since a can only be 0 or -1,
    * this is float(1 + a).
    */
   fs_reg op;

   prepare_alu_destination_and_sources(bld, inot_instr, &op, false);

   /* Ignore the saturate modifier, if there is one. The result of the
    * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
    */
   bld.ADD(result, op, brw_imm_d(1));

   return true;
}

/**
 * Emit code for nir_op_fsign possibly fused with a nir_op_fmul
 *
 * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
 * the source of \c instr that is a \c nir_op_fsign.
 */
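/* Roughly, the sequences emitted below compute copysign(1.0, val) for
 * nonzero val and 0.0 for val == 0: an AND extracts the sign bit, a
 * predicated OR merges in the bit pattern of 1.0, and in the fused fmul
 * case a predicated XOR applies that sign to the other multiplicand
 * instead.
 */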
void
fs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr,
                       fs_reg result, fs_reg *op, unsigned fsign_src)
{
   fs_inst *inst;

   assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
   assert(fsign_src < nir_op_infos[instr->op].num_inputs);

   if (instr->op != nir_op_fsign) {
      const nir_alu_instr *const fsign_instr =
         nir_src_as_alu_instr(instr->src[fsign_src].src);

      /* op[fsign_src] has the nominal result of the fsign, and
       * op[1 - fsign_src] has the other multiply source. This must be
       * rearranged so that op[0] is the source of the fsign and op[1] is
       * the other multiply source.
       */
      if (fsign_src != 0)
         op[1] = op[0];

      op[0] = get_nir_src(fsign_instr->src[0].src);

      const nir_alu_type t =
         (nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
                        nir_src_bit_size(fsign_instr->src[0].src));

      op[0].type = brw_type_for_nir_type(devinfo, t);

      unsigned channel = 0;
      if (nir_op_infos[instr->op].output_size == 0) {
         /* Since NIR is doing the scalarizing for us, we should only ever see
          * vectorized operations with a single channel.
          */
         assert(util_bitcount(instr->dest.write_mask) == 1);
         channel = ffs(instr->dest.write_mask) - 1;
      }

      op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
   }

   if (type_sz(op[0].type) == 2) {
      /* AND(val, 0x8000) gives the sign bit.
       *
       * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
       */
      fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);
      bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);

      op[0].type = BRW_REGISTER_TYPE_UW;
      result.type = BRW_REGISTER_TYPE_UW;
      bld.AND(result, op[0], brw_imm_uw(0x8000u));

      if (instr->op == nir_op_fsign)
         inst = bld.OR(result, result, brw_imm_uw(0x3c00u));
      else {
         /* Use XOR here to get the result sign correct. */
         inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UW));
      }

      inst->predicate = BRW_PREDICATE_NORMAL;
   } else if (type_sz(op[0].type) == 4) {
      /* AND(val, 0x80000000) gives the sign bit.
       *
       * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
       * zero.
       */
      bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);

      op[0].type = BRW_REGISTER_TYPE_UD;
      result.type = BRW_REGISTER_TYPE_UD;
      bld.AND(result, op[0], brw_imm_ud(0x80000000u));

      if (instr->op == nir_op_fsign)
         inst = bld.OR(result, result, brw_imm_ud(0x3f800000u));
      else {
         /* Use XOR here to get the result sign correct. */
         inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UD));
      }

      inst->predicate = BRW_PREDICATE_NORMAL;
   } else {
      /* For doubles we do the same but we need to consider:
       *
       * - 2-src instructions can't operate with 64-bit immediates
       * - The sign is encoded in the high 32-bit of each DF
       * - We need to produce a DF result.
       */

      fs_reg zero = vgrf(glsl_type::double_type);
      bld.MOV(zero, setup_imm_df(bld, 0.0));
      bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);

      bld.MOV(result, zero);

      fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
      bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
              brw_imm_ud(0x80000000u));

      if (instr->op == nir_op_fsign) {
         set_predicate(BRW_PREDICATE_NORMAL,
                       bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
      } else {
         /* This could be done better in some cases. If the scale is an
          * immediate with the low 32-bits all 0, emitting a separate XOR and
          * OR would allow an algebraic optimization to remove the OR. There
          * are currently zero instances of fsign(double(x))*IMM in shader-db
          * or any test suite, so it is hard to care at this time.
          */
         fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ);
         inst = bld.XOR(result_int64, result_int64,
                        retype(op[1], BRW_REGISTER_TYPE_UQ));
      }
   }
}

/**
 * Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
 *
 * Checks the operands of a \c nir_op_fmul to determine whether or not
 * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
 *
 * \param instr      The multiplication instruction
 *
 * \param fsign_src  The source of \c instr that may or may not be a
 *                   \c nir_op_fsign
 */
static bool
can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
{
   assert(instr->op == nir_op_fmul);

   nir_alu_instr *const fsign_instr =
      nir_src_as_alu_instr(instr->src[fsign_src].src);

   /* Rules:
    *
    * 1. instr->src[fsign_src] must be a nir_op_fsign.
    * 2. The nir_op_fsign can only be used by this multiplication.
    * 3. The source that is the nir_op_fsign does not have source modifiers.
    *    \c emit_fsign only examines the source modifiers of the source of
    *    the \c nir_op_fsign.
    *
    * The nir_op_fsign must also not have the saturate modifier, but steps
    * have already been taken (in nir_opt_algebraic) to ensure that.
    */
   return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
          is_used_once(fsign_instr);
}

void
fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
                         bool need_dest)
{
   fs_inst *inst;
   unsigned execution_mode =
      bld.shader->nir->info.float_controls_execution_mode;

   fs_reg op[NIR_MAX_VEC_COMPONENTS];
   fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, need_dest);

#ifndef NDEBUG
   /* Everything except raw moves, some type conversions, iabs, and ineg
    * should have 8-bit sources lowered by nir_lower_bit_size in
    * brw_preprocess_nir or by brw_nir_lower_conversions in
    * brw_postprocess_nir.
    */
   switch (instr->op) {
   case nir_op_mov:
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
   case nir_op_vec8:
   case nir_op_vec16:
   case nir_op_i2f16:
   case nir_op_i2f32:
   case nir_op_i2i16:
   case nir_op_i2i32:
   case nir_op_u2f16:
   case nir_op_u2f32:
   case nir_op_u2u16:
   case nir_op_u2u32:
   case nir_op_iabs:
   case nir_op_ineg:
   case nir_op_pack_32_4x8_split:
      break;

   default:
      for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
         assert(type_sz(op[i].type) > 1);
      }
   }
#endif

   switch (instr->op) {
   case nir_op_mov:
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
   case nir_op_vec8:
   case nir_op_vec16: {
      fs_reg temp = result;
      bool need_extra_copy = false;
      for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
         if (!instr->src[i].src.is_ssa &&
             instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
            need_extra_copy = true;
            temp = bld.vgrf(result.type, 4);
            break;
         }
      }

      for (unsigned i = 0; i < 4; i++) {
         if (!(instr->dest.write_mask & (1 << i)))
            continue;

         if (instr->op == nir_op_mov) {
            bld.MOV(offset(temp, bld, i),
                    offset(op[0], bld, instr->src[0].swizzle[i]));
         } else {
            bld.MOV(offset(temp, bld, i),
                    offset(op[i], bld, instr->src[i].swizzle[0]));
         }
      }

      /* In this case the source and destination registers were the same,
       * so we need to insert an extra set of moves in order to deal with
       * any swizzling.
       */
      if (need_extra_copy) {
         for (unsigned i = 0; i < 4; i++) {
            if (!(instr->dest.write_mask & (1 << i)))
               continue;

            bld.MOV(offset(result, bld, i), offset(temp, bld, i));
         }
      }
      return;
   }

   case nir_op_i2f32:
   case nir_op_u2f32:
      if (optimize_extract_to_float(instr, result))
         return;
      inst = bld.MOV(result, op[0]);
      break;

   case nir_op_f2f16_rtne:
   case nir_op_f2f16_rtz:
   case nir_op_f2f16: {
      brw_rnd_mode rnd = BRW_RND_MODE_UNSPECIFIED;

      if (nir_op_f2f16 == instr->op)
         rnd = brw_rnd_mode_from_execution_mode(execution_mode);
      else
         rnd = brw_rnd_mode_from_nir_op(instr->op);

      if (BRW_RND_MODE_UNSPECIFIED != rnd)
         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), brw_imm_d(rnd));

      /* In theory, it would be better to use BRW_OPCODE_F32TO16. Depending
       * on the HW gen, it is a special hw opcode or just a MOV, and
       * brw_F32TO16 (at brw_eu_emit) would do the work to choose.
       *
       * But if we want to use that opcode, we need to provide support for it
       * in the different optimizations and lowerings. Since HF support is
       * currently only for gfx8+, it is better to use the MOV directly, and
       * switch to BRW_OPCODE_F32TO16 when/if we work on HF support for gfx7.
       */
      assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
      inst = bld.MOV(result, op[0]);
      break;
   }

   case nir_op_b2i8:
   case nir_op_b2i16:
   case nir_op_b2i32:
   case nir_op_b2i64:
   case nir_op_b2f16:
   case nir_op_b2f32:
   case nir_op_b2f64:
      if (try_emit_b2fi_of_inot(bld, result, instr))
         break;
      op[0].type = BRW_REGISTER_TYPE_D;
      op[0].negate = !op[0].negate;
      FALLTHROUGH;
   case nir_op_i2f64:
   case nir_op_i2i64:
   case nir_op_u2f64:
   case nir_op_u2u64:
   case nir_op_f2f64:
   case nir_op_f2i64:
   case nir_op_f2u64:
   case nir_op_i2i32:
   case nir_op_u2u32:
   case nir_op_f2i32:
   case nir_op_f2u32:
   case nir_op_i2f16:
   case nir_op_u2f16:
   case nir_op_f2i16:
   case nir_op_f2u16:
   case nir_op_f2i8:
   case nir_op_f2u8:
      if (result.type == BRW_REGISTER_TYPE_B ||
          result.type == BRW_REGISTER_TYPE_UB ||
          result.type == BRW_REGISTER_TYPE_HF)
         assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */

      if (op[0].type == BRW_REGISTER_TYPE_B ||
          op[0].type == BRW_REGISTER_TYPE_UB ||
          op[0].type == BRW_REGISTER_TYPE_HF)
         assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */

      inst = bld.MOV(result, op[0]);
      break;

   case nir_op_i2i8:
   case nir_op_u2u8:
      assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
      FALLTHROUGH;
   case nir_op_i2i16:
   case nir_op_u2u16: {
      /* Emit better code for u2u8(extract_u8(a, b)) and similar patterns.
       * Emitting the instructions one by one results in two MOV instructions
       * that won't be propagated. By handling both instructions here, a
       * single MOV is emitted.
       */
      nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src);
      if (extract_instr != NULL) {
         if (extract_instr->op == nir_op_extract_u8 ||
             extract_instr->op == nir_op_extract_i8) {
            prepare_alu_destination_and_sources(bld, extract_instr, op, false);

            const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
            const brw_reg_type type =
               brw_int_type(1, extract_instr->op == nir_op_extract_i8);

            op[0] = subscript(op[0], type, byte);
         } else if (extract_instr->op == nir_op_extract_u16 ||
                    extract_instr->op == nir_op_extract_i16) {
            prepare_alu_destination_and_sources(bld, extract_instr, op, false);

            const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
            const brw_reg_type type =
               brw_int_type(2, extract_instr->op == nir_op_extract_i16);

            op[0] = subscript(op[0], type, word);
         }
      }

      inst = bld.MOV(result, op[0]);
      break;
   }

   case nir_op_fsat:
      inst = bld.MOV(result, op[0]);
      inst->saturate = true;
      break;

   case nir_op_fneg:
   case nir_op_ineg:
      op[0].negate = true;
      inst = bld.MOV(result, op[0]);
      break;

   case nir_op_fabs:
   case nir_op_iabs:
      op[0].negate = false;
      op[0].abs = true;
      inst = bld.MOV(result, op[0]);
      break;

   case nir_op_f2f32:
      if (nir_has_any_rounding_mode_enabled(execution_mode)) {
         brw_rnd_mode rnd =
            brw_rnd_mode_from_execution_mode(execution_mode);
         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
                  brw_imm_d(rnd));
      }

      if (op[0].type == BRW_REGISTER_TYPE_HF)
         assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */

      inst = bld.MOV(result, op[0]);
      break;

   case nir_op_fsign:
      emit_fsign(bld, instr, result, op, 0);
      break;

   case nir_op_frcp:
      inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
      break;

   case nir_op_fexp2:
      inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
      break;

   case nir_op_flog2:
      inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
      break;

   case nir_op_fsin:
      inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
      break;

   case nir_op_fcos:
      inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
      break;

   case nir_op_fddx_fine:
      inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
      break;
   case nir_op_fddx:
   case nir_op_fddx_coarse:
      inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
      break;
   case nir_op_fddy_fine:
      inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
      break;
   case nir_op_fddy:
   case nir_op_fddy_coarse:
      inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
      break;

   case nir_op_fadd:
      if (nir_has_any_rounding_mode_enabled(execution_mode)) {
         brw_rnd_mode rnd =
            brw_rnd_mode_from_execution_mode(execution_mode);
         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
                  brw_imm_d(rnd));
      }
      FALLTHROUGH;
   case nir_op_iadd:
      inst = bld.ADD(result, op[0], op[1]);
      break;

   case nir_op_iadd3:
      inst = bld.ADD3(result, op[0], op[1], op[2]);
      break;

   case nir_op_iadd_sat:
   case nir_op_uadd_sat:
      inst = bld.ADD(result, op[0], op[1]);
      inst->saturate = true;
      break;

   case nir_op_isub_sat:
      bld.emit(SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]);
      break;

   case nir_op_usub_sat:
      bld.emit(SHADER_OPCODE_USUB_SAT, result, op[0], op[1]);
      break;

   case nir_op_irhadd:
   case nir_op_urhadd:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      inst = bld.AVG(result, op[0], op[1]);
      break;

   case nir_op_ihadd:
   case nir_op_uhadd: {
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      fs_reg tmp = bld.vgrf(result.type);

      if (devinfo->ver >= 8) {
         op[0] = resolve_source_modifiers(op[0]);
         op[1] = resolve_source_modifiers(op[1]);
      }

      /* AVG(x, y) - ((x ^ y) & 1) */
      bld.XOR(tmp, op[0], op[1]);
      bld.AND(tmp, tmp, retype(brw_imm_ud(1), result.type));
      bld.AVG(result, op[0], op[1]);
      inst = bld.ADD(result, result, tmp);
      inst->src[1].negate = true;
      break;
   }

   case nir_op_fmul:
      for (unsigned i = 0; i < 2; i++) {
         if (can_fuse_fmul_fsign(instr, i)) {
            emit_fsign(bld, instr, result, op, i);
            return;
         }
      }

      /* We emit the rounding mode after the previous fsign optimization since
       * it won't result in a MUL, but will try to negate the value by other
       * means.
       */
      if (nir_has_any_rounding_mode_enabled(execution_mode)) {
         brw_rnd_mode rnd =
            brw_rnd_mode_from_execution_mode(execution_mode);
         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
                  brw_imm_d(rnd));
      }

      inst = bld.MUL(result, op[0], op[1]);
      break;

   case nir_op_imul_2x32_64:
   case nir_op_umul_2x32_64:
      bld.MUL(result, op[0], op[1]);
      break;

   case nir_op_imul_32x16:
   case nir_op_umul_32x16: {
      const bool ud = instr->op == nir_op_umul_32x16;

      assert(nir_dest_bit_size(instr->dest.dest) == 32);

      /* Before Gfx7, the order of the 32-bit source and the 16-bit source was
       * swapped. The extension isn't enabled on those platforms, so don't
       * pretend to support the differences.
       */
      assert(devinfo->ver >= 7);

      if (op[1].file == IMM)
         op[1] = ud ? brw_imm_uw(op[1].ud) : brw_imm_w(op[1].d);
      else {
         const enum brw_reg_type word_type =
            ud ? BRW_REGISTER_TYPE_UW : BRW_REGISTER_TYPE_W;

         op[1] = subscript(op[1], word_type, 0);
      }

      const enum brw_reg_type dword_type =
         ud ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;

      bld.MUL(result, retype(op[0], dword_type), op[1]);
      break;
   }

   case nir_op_imul:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.MUL(result, op[0], op[1]);
      break;

   case nir_op_imul_high:
   case nir_op_umul_high:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      if (nir_dest_bit_size(instr->dest.dest) == 32) {
         bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
      } else {
         fs_reg tmp = bld.vgrf(brw_reg_type_from_bit_size(32, op[0].type));
         bld.MUL(tmp, op[0], op[1]);
         bld.MOV(result, subscript(tmp, result.type, 1));
      }
      break;

   case nir_op_idiv:
   case nir_op_udiv:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
      break;

   case nir_op_uadd_carry:
      unreachable("Should have been lowered by carry_to_arith().");

   case nir_op_usub_borrow:
      unreachable("Should have been lowered by borrow_to_arith().");

   case nir_op_umod:
   case nir_op_irem:
      /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
       * appears that our hardware just does the right thing for signed
       * remainder.
       */
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
      break;

   case nir_op_imod: {
      /* Get a regular C-style remainder. If a % b != 0, set the predicate. */
      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);

      /* Math instructions don't support conditional mod */
      inst = bld.MOV(bld.null_reg_d(), result);
      inst->conditional_mod = BRW_CONDITIONAL_NZ;

      /* Now, we need to determine if the signs of the sources are different.
       * When we XOR the sources, the top bit is 0 if they are the same and 1
       * if they are different. We can then use a conditional modifier to
       * turn that into a predicate. This leads us to an XOR.l instruction.
       *
       * Technically, according to the PRM, you're not allowed to use .l on a
       * XOR instruction. However, empirical experiments and Curro's reading
       * of the simulator source both indicate that it's safe.
       */
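      /* For example, -7 imod 3: the C-style remainder is -1; the signs
       * differ and the remainder is nonzero, so op[1] (3) is added below to
       * give the GLSL result of 2.
       */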
      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
      inst = bld.XOR(tmp, op[0], op[1]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      inst->conditional_mod = BRW_CONDITIONAL_L;

      /* If the result of the initial remainder operation is non-zero and the
       * two sources have different signs, add in a copy of op[1] to get the
       * final integer modulus value.
       */
      inst = bld.ADD(result, result, op[1]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   }

   case nir_op_flt32:
   case nir_op_fge32:
   case nir_op_feq32:
   case nir_op_fneu32: {
      fs_reg dest = result;

      const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
      if (bit_size != 32)
         dest = bld.vgrf(op[0].type, 1);

      bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op));

      if (bit_size > 32) {
         bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
      } else if (bit_size < 32) {
         /* When we convert the result to 32-bit we need to be careful and do
          * it as a signed conversion to get sign extension (for 32-bit true)
          */
         const brw_reg_type src_type =
            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);

         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
      }
      break;
   }

   case nir_op_ilt32:
   case nir_op_ult32:
   case nir_op_ige32:
   case nir_op_uge32:
   case nir_op_ieq32:
   case nir_op_ine32: {
      fs_reg dest = result;

      const uint32_t bit_size = type_sz(op[0].type) * 8;
      if (bit_size != 32)
         dest = bld.vgrf(op[0].type, 1);

      bld.CMP(dest, op[0], op[1],
              brw_cmod_for_nir_comparison(instr->op));

      if (bit_size > 32) {
         bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
      } else if (bit_size < 32) {
         /* When we convert the result to 32-bit we need to be careful and do
          * it as a signed conversion to get sign extension (for 32-bit true)
          */
         const brw_reg_type src_type =
            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);

         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
      }
      break;
   }

   case nir_op_inot:
      if (devinfo->ver >= 8) {
         nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);

         if (inot_src_instr != NULL &&
             (inot_src_instr->op == nir_op_ior ||
              inot_src_instr->op == nir_op_ixor ||
              inot_src_instr->op == nir_op_iand)) {
            /* The sources of the source logical instruction are now the
             * sources of the instruction that will be generated.
             */
            prepare_alu_destination_and_sources(bld, inot_src_instr, op, false);
            resolve_inot_sources(bld, inot_src_instr, op);

            /* Smash all of the sources and destination to be signed. This
             * doesn't matter for the operation of the instruction, but cmod
             * propagation fails on unsigned sources with negation (due to
             * fs_inst::can_do_cmod returning false).
             */
            result.type =
               brw_type_for_nir_type(devinfo,
                                     (nir_alu_type)(nir_type_int |
                                                    nir_dest_bit_size(instr->dest.dest)));
            op[0].type =
               brw_type_for_nir_type(devinfo,
                                     (nir_alu_type)(nir_type_int |
                                                    nir_src_bit_size(inot_src_instr->src[0].src)));
            op[1].type =
               brw_type_for_nir_type(devinfo,
                                     (nir_alu_type)(nir_type_int |
                                                    nir_src_bit_size(inot_src_instr->src[1].src)));

            /* For XOR, only invert one of the sources. Arbitrarily choose
             * the first source.
             */
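            /* By De Morgan, ~(a | b) == ~a & ~b and ~(a & b) == ~a | ~b,
             * while ~(a ^ b) == ~a ^ b, which is why only one source is
             * inverted in the XOR case.
             */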
            op[0].negate = !op[0].negate;
            if (inot_src_instr->op != nir_op_ixor)
               op[1].negate = !op[1].negate;

            switch (inot_src_instr->op) {
            case nir_op_ior:
               bld.AND(result, op[0], op[1]);
               return;

            case nir_op_iand:
               bld.OR(result, op[0], op[1]);
               return;

            case nir_op_ixor:
               bld.XOR(result, op[0], op[1]);
               return;

            default:
               unreachable("impossible opcode");
            }
         }
         op[0] = resolve_source_modifiers(op[0]);
      }
      bld.NOT(result, op[0]);
      break;
   case nir_op_ixor:
      if (devinfo->ver >= 8) {
         resolve_inot_sources(bld, instr, op);
      }
      bld.XOR(result, op[0], op[1]);
      break;
   case nir_op_ior:
      if (devinfo->ver >= 8) {
         resolve_inot_sources(bld, instr, op);
      }
      bld.OR(result, op[0], op[1]);
      break;
   case nir_op_iand:
      if (devinfo->ver >= 8) {
         resolve_inot_sources(bld, instr, op);
      }
      bld.AND(result, op[0], op[1]);
      break;

   case nir_op_fdot2:
   case nir_op_fdot3:
   case nir_op_fdot4:
   case nir_op_b32all_fequal2:
   case nir_op_b32all_iequal2:
   case nir_op_b32all_fequal3:
   case nir_op_b32all_iequal3:
   case nir_op_b32all_fequal4:
   case nir_op_b32all_iequal4:
   case nir_op_b32any_fnequal2:
   case nir_op_b32any_inequal2:
   case nir_op_b32any_fnequal3:
   case nir_op_b32any_inequal3:
   case nir_op_b32any_fnequal4:
   case nir_op_b32any_inequal4:
      unreachable("Lowered by nir_lower_alu_reductions");

   case nir_op_ldexp:
      unreachable("not reached: should be handled by ldexp_to_arith()");

   case nir_op_fsqrt:
      inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
      break;

   case nir_op_frsq:
      inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
      break;

   case nir_op_i2b32:
   case nir_op_f2b32: {
      uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
      if (bit_size == 64) {
         /* two-argument instructions can't take 64-bit immediates */
         fs_reg zero;
         fs_reg tmp;

         if (instr->op == nir_op_f2b32) {
            zero = vgrf(glsl_type::double_type);
            tmp = vgrf(glsl_type::double_type);
            bld.MOV(zero, setup_imm_df(bld, 0.0));
         } else {
            zero = vgrf(glsl_type::int64_t_type);
            tmp = vgrf(glsl_type::int64_t_type);
            bld.MOV(zero, brw_imm_q(0));
         }

         /* A SIMD16 execution needs to be split in two instructions, so use
          * a vgrf instead of the flag register as dst so instruction splitting
          * works
          */
         bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
         bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
      } else {
         fs_reg zero;
         if (bit_size == 32) {
            zero = instr->op == nir_op_f2b32 ? brw_imm_f(0.0f) : brw_imm_d(0);
         } else {
            assert(bit_size == 16);
            zero = instr->op == nir_op_f2b32 ?
               retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0);
         }
         bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ);
      }
      break;
   }

   case nir_op_ftrunc:
      inst = bld.RNDZ(result, op[0]);
      if (devinfo->ver < 6) {
         set_condmod(BRW_CONDITIONAL_R, inst);
         set_predicate(BRW_PREDICATE_NORMAL,
                       bld.ADD(result, result, brw_imm_f(1.0f)));
         inst = bld.MOV(result, result); /* for potential saturation */
      }
      break;

   case nir_op_fceil: {
      op[0].negate = !op[0].negate;
      fs_reg temp = vgrf(glsl_type::float_type);
      bld.RNDD(temp, op[0]);
      temp.negate = true;
      inst = bld.MOV(result, temp);
      break;
   }
   case nir_op_ffloor:
      inst = bld.RNDD(result, op[0]);
      break;
   case nir_op_ffract:
      inst = bld.FRC(result, op[0]);
      break;
   case nir_op_fround_even:
      inst = bld.RNDE(result, op[0]);
      if (devinfo->ver < 6) {
         set_condmod(BRW_CONDITIONAL_R, inst);
         set_predicate(BRW_PREDICATE_NORMAL,
                       bld.ADD(result, result, brw_imm_f(1.0f)));
         inst = bld.MOV(result, result); /* for potential saturation */
      }
      break;

   case nir_op_fquantize2f16: {
      fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
      fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
      fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);

      /* The destination stride must be at least as big as the source stride. */
      tmp16.type = devinfo->ver > 7
                   ? BRW_REGISTER_TYPE_HF : BRW_REGISTER_TYPE_W;
      tmp16.stride = 2;

      /* Check for denormal */
      fs_reg abs_src0 = op[0];
      abs_src0.abs = true;
      bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
              BRW_CONDITIONAL_L);
      /* Get the appropriately signed zero */
      bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
              retype(op[0], BRW_REGISTER_TYPE_UD),
              brw_imm_ud(0x80000000));
      /* Do the actual F32 -> F16 -> F32 conversion */
      bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
      bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
      /* Select that or zero based on normal status */
      inst = bld.SEL(result, zero, tmp32);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   }

   case nir_op_imin:
   case nir_op_umin:
   case nir_op_fmin:
      inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
      break;

   case nir_op_imax:
   case nir_op_umax:
   case nir_op_fmax:
      inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
      break;

   case nir_op_pack_snorm_2x16:
   case nir_op_pack_snorm_4x8:
   case nir_op_pack_unorm_2x16:
   case nir_op_pack_unorm_4x8:
   case nir_op_unpack_snorm_2x16:
   case nir_op_unpack_snorm_4x8:
   case nir_op_unpack_unorm_2x16:
   case nir_op_unpack_unorm_4x8:
   case nir_op_unpack_half_2x16:
   case nir_op_pack_half_2x16:
      unreachable("not reached: should be handled by lower_packing_builtins");

   case nir_op_unpack_half_2x16_split_x_flush_to_zero:
      assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode);
      FALLTHROUGH;
   case nir_op_unpack_half_2x16_split_x:
      inst = bld.emit(BRW_OPCODE_F16TO32, result,
                      subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
      break;

   case nir_op_unpack_half_2x16_split_y_flush_to_zero:
      assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode);
      FALLTHROUGH;
   case nir_op_unpack_half_2x16_split_y:
      inst = bld.emit(BRW_OPCODE_F16TO32, result,
                      subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
      break;

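   /* The pack cases below rely on FS_OPCODE_PACK writing each source into
    * consecutive sub-words of the destination (e.g. the two 32-bit halves
    * of a 64-bit result for pack_64_2x32_split); it is presumably lowered
    * to per-component MOVs later in the FS backend.
    */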
   case nir_op_pack_64_2x32_split:
   case nir_op_pack_32_2x16_split:
      bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
      break;

   case nir_op_pack_32_4x8_split:
      bld.emit(FS_OPCODE_PACK, result, op, 4);
      break;

   case nir_op_unpack_64_2x32_split_x:
   case nir_op_unpack_64_2x32_split_y: {
      if (instr->op == nir_op_unpack_64_2x32_split_x)
         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
      else
         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
      break;
   }

   case nir_op_unpack_32_2x16_split_x:
   case nir_op_unpack_32_2x16_split_y: {
      if (instr->op == nir_op_unpack_32_2x16_split_x)
         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
      else
         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
      break;
   }

   case nir_op_fpow:
      inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
      break;

   case nir_op_bitfield_reverse:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.BFREV(result, op[0]);
      break;

   case nir_op_bit_count:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.CBIT(result, op[0]);
      break;

   case nir_op_ufind_msb: {
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      emit_find_msb_using_lzd(bld, result, op[0], false);
      break;
   }

   case nir_op_uclz:
      assert(nir_dest_bit_size(instr->dest.dest) == 32);
      bld.LZD(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
      break;

   case nir_op_ifind_msb: {
      assert(nir_dest_bit_size(instr->dest.dest) < 64);

      if (devinfo->ver < 7) {
         emit_find_msb_using_lzd(bld, result, op[0], true);
      } else {
         bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);

         /* FBH counts from the MSB side, while GLSL's findMSB() wants the
          * count from the LSB side. If FBH didn't return an error
          * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
          * count into an LSB count.
          */
         bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);

         inst = bld.ADD(result, result, brw_imm_d(31));
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst->src[0].negate = true;
      }
      break;
   }

   case nir_op_find_lsb:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);

      if (devinfo->ver < 7) {
         fs_reg temp = vgrf(glsl_type::int_type);

         /* (x & -x) generates a value that consists of only the LSB of x.
          * For all powers of 2, findMSB(y) == findLSB(y).
          */
         fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
         fs_reg negated_src = src;

         /* One must be negated, and the other must be non-negated. It
          * doesn't matter which is which.
          */
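         /* For example, x = 0b1100: x & -x == 0b0100, and the MSB of that
          * single set bit is also its LSB, so findMSB of it returns
          * findLSB(x) == 2.
          */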
1814 */
1815 negated_src.negate = true;
1816 src.negate = false;
1817
1818 bld.AND(temp, src, negated_src);
1819 emit_find_msb_using_lzd(bld, result, temp, false);
1820 } else {
1821 bld.FBL(result, op[0]);
1822 }
1823 break;
1824
1825 case nir_op_ubitfield_extract:
1826 case nir_op_ibitfield_extract:
1827 unreachable("should have been lowered");
1828 case nir_op_ubfe:
1829 case nir_op_ibfe:
1830 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1831 bld.BFE(result, op[2], op[1], op[0]);
1832 break;
1833 case nir_op_bfm:
1834 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1835 bld.BFI1(result, op[0], op[1]);
1836 break;
1837 case nir_op_bfi:
1838 assert(nir_dest_bit_size(instr->dest.dest) < 64);
1839 bld.BFI2(result, op[0], op[1], op[2]);
1840 break;
1841
1842 case nir_op_bitfield_insert:
1843 unreachable("not reached: should have been lowered");
1844
1845 /* For all shift operations:
1846 *
1847 * Gen4 - Gen7: After application of source modifiers, the low 5 bits of
1848 * src1 are used as an unsigned value for the shift count.
1849 *
1850 * Gen8: As with earlier platforms, but for Q and UQ types on src0, the low
1851 * 6 bits of src1 are used.
1852 *
1853 * Gen9+: The low bits of src1 matching the size of src0 (e.g., 4 bits for
1854 * W or UW src0) are used.
1855 *
1856 * The implication is that the following instruction will produce a
1857 * different result on Gen9+ than on previous platforms:
1858 *
1859 * shr(8) g4<1>UW g12<8,8,1>UW 0x0010UW
1860 *
1861 * where Gen9+ will shift by zero, and earlier platforms will shift by 16.
1862 *
1863 * In practice, this does not seem to be the case. Experimentally, it has been
1864 * determined that shifts of 16-bit values on Gen8 behave properly. Shifts
1865 * of 8-bit values on both Gen8 and Gen9 do not. Gen11+ lowers 8-bit
1866 * values, so those platforms were not tested. No features expose access
1867 * to 8- or 16-bit types on Gen7 or earlier, so those platforms were not
1868 * tested either. See
1869 * https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/76.
1870 *
1871 * This is part of the reason 8-bit values are lowered to 16-bit on all
1872 * platforms.
1873 */ 1874 case nir_op_ishl: 1875 bld.SHL(result, op[0], op[1]); 1876 break; 1877 case nir_op_ishr: 1878 bld.ASR(result, op[0], op[1]); 1879 break; 1880 case nir_op_ushr: 1881 bld.SHR(result, op[0], op[1]); 1882 break; 1883 1884 case nir_op_urol: 1885 bld.ROL(result, op[0], op[1]); 1886 break; 1887 case nir_op_uror: 1888 bld.ROR(result, op[0], op[1]); 1889 break; 1890 1891 case nir_op_pack_half_2x16_split: 1892 bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]); 1893 break; 1894 1895 case nir_op_sdot_4x8_iadd: 1896 case nir_op_sdot_4x8_iadd_sat: 1897 inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_D), 1898 retype(op[2], BRW_REGISTER_TYPE_D), 1899 retype(op[0], BRW_REGISTER_TYPE_D), 1900 retype(op[1], BRW_REGISTER_TYPE_D)); 1901 1902 if (instr->op == nir_op_sdot_4x8_iadd_sat) 1903 inst->saturate = true; 1904 break; 1905 1906 case nir_op_udot_4x8_uadd: 1907 case nir_op_udot_4x8_uadd_sat: 1908 inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_UD), 1909 retype(op[2], BRW_REGISTER_TYPE_UD), 1910 retype(op[0], BRW_REGISTER_TYPE_UD), 1911 retype(op[1], BRW_REGISTER_TYPE_UD)); 1912 1913 if (instr->op == nir_op_udot_4x8_uadd_sat) 1914 inst->saturate = true; 1915 break; 1916 1917 case nir_op_sudot_4x8_iadd: 1918 case nir_op_sudot_4x8_iadd_sat: 1919 inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_D), 1920 retype(op[2], BRW_REGISTER_TYPE_D), 1921 retype(op[0], BRW_REGISTER_TYPE_D), 1922 retype(op[1], BRW_REGISTER_TYPE_UD)); 1923 1924 if (instr->op == nir_op_sudot_4x8_iadd_sat) 1925 inst->saturate = true; 1926 break; 1927 1928 case nir_op_ffma: 1929 if (nir_has_any_rounding_mode_enabled(execution_mode)) { 1930 brw_rnd_mode rnd = 1931 brw_rnd_mode_from_execution_mode(execution_mode); 1932 bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), 1933 brw_imm_d(rnd)); 1934 } 1935 1936 inst = bld.MAD(result, op[2], op[1], op[0]); 1937 break; 1938 1939 case nir_op_flrp: 1940 if (nir_has_any_rounding_mode_enabled(execution_mode)) { 1941 brw_rnd_mode rnd = 1942 brw_rnd_mode_from_execution_mode(execution_mode); 1943 bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), 1944 brw_imm_d(rnd)); 1945 } 1946 1947 inst = bld.LRP(result, op[0], op[1], op[2]); 1948 break; 1949 1950 case nir_op_b32csel: 1951 if (optimize_frontfacing_ternary(instr, result)) 1952 return; 1953 1954 bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ); 1955 inst = bld.SEL(result, op[1], op[2]); 1956 inst->predicate = BRW_PREDICATE_NORMAL; 1957 break; 1958 1959 case nir_op_extract_u8: 1960 case nir_op_extract_i8: { 1961 unsigned byte = nir_src_as_uint(instr->src[1].src); 1962 1963 /* The PRMs say: 1964 * 1965 * BDW+ 1966 * There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB. 1967 * Use two instructions and a word or DWord intermediate integer type. 1968 */ 1969 if (nir_dest_bit_size(instr->dest.dest) == 64) { 1970 const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); 1971 1972 if (instr->op == nir_op_extract_i8) { 1973 /* If we need to sign extend, extract to a word first */ 1974 fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W); 1975 bld.MOV(w_temp, subscript(op[0], type, byte)); 1976 bld.MOV(result, w_temp); 1977 } else if (byte & 1) { 1978 /* Extract the high byte from the word containing the desired byte 1979 * offset. 
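 * For example, byte 3 lives in the high half of word 1, so reading word 1
 * as a UW and shifting right by 8 leaves that byte in the low bits.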
1980 */ 1981 bld.SHR(result, 1982 subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2), 1983 brw_imm_uw(8)); 1984 } else { 1985 /* Otherwise use an AND with 0xff and a word type */ 1986 bld.AND(result, 1987 subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2), 1988 brw_imm_uw(0xff)); 1989 } 1990 } else { 1991 const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); 1992 bld.MOV(result, subscript(op[0], type, byte)); 1993 } 1994 break; 1995 } 1996 1997 case nir_op_extract_u16: 1998 case nir_op_extract_i16: { 1999 const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16); 2000 unsigned word = nir_src_as_uint(instr->src[1].src); 2001 bld.MOV(result, subscript(op[0], type, word)); 2002 break; 2003 } 2004 2005 default: 2006 unreachable("unhandled instruction"); 2007 } 2008 2009 /* If we need to do a boolean resolve, replace the result with -(x & 1) 2010 * to sign extend the low bit to 0/~0 2011 */ 2012 if (devinfo->ver <= 5 && 2013 !result.is_null() && 2014 (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) { 2015 fs_reg masked = vgrf(glsl_type::int_type); 2016 bld.AND(masked, result, brw_imm_d(1)); 2017 masked.negate = true; 2018 bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked); 2019 } 2020} 2021 2022void 2023fs_visitor::nir_emit_load_const(const fs_builder &bld, 2024 nir_load_const_instr *instr) 2025{ 2026 const brw_reg_type reg_type = 2027 brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D); 2028 fs_reg reg = bld.vgrf(reg_type, instr->def.num_components); 2029 2030 switch (instr->def.bit_size) { 2031 case 8: 2032 for (unsigned i = 0; i < instr->def.num_components; i++) 2033 bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value[i].i8)); 2034 break; 2035 2036 case 16: 2037 for (unsigned i = 0; i < instr->def.num_components; i++) 2038 bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value[i].i16)); 2039 break; 2040 2041 case 32: 2042 for (unsigned i = 0; i < instr->def.num_components; i++) 2043 bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value[i].i32)); 2044 break; 2045 2046 case 64: 2047 assert(devinfo->ver >= 7); 2048 if (devinfo->ver == 7) { 2049 /* We don't get 64-bit integer types until gfx8 */ 2050 for (unsigned i = 0; i < instr->def.num_components; i++) { 2051 bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF), 2052 setup_imm_df(bld, instr->value[i].f64)); 2053 } 2054 } else { 2055 for (unsigned i = 0; i < instr->def.num_components; i++) 2056 bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value[i].i64)); 2057 } 2058 break; 2059 2060 default: 2061 unreachable("Invalid bit size"); 2062 } 2063 2064 nir_ssa_values[instr->def.index] = reg; 2065} 2066 2067fs_reg 2068fs_visitor::get_nir_src(const nir_src &src) 2069{ 2070 fs_reg reg; 2071 if (src.is_ssa) { 2072 if (nir_src_is_undef(src)) { 2073 const brw_reg_type reg_type = 2074 brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D); 2075 reg = bld.vgrf(reg_type, src.ssa->num_components); 2076 } else { 2077 reg = nir_ssa_values[src.ssa->index]; 2078 } 2079 } else { 2080 /* We don't handle indirects on locals */ 2081 assert(src.reg.indirect == NULL); 2082 reg = offset(nir_locals[src.reg.reg->index], bld, 2083 src.reg.base_offset * src.reg.reg->num_components); 2084 } 2085 2086 if (nir_src_bit_size(src) == 64 && devinfo->ver == 7) { 2087 /* The only 64-bit type available on gfx7 is DF, so use that. 
*/ 2088 reg.type = BRW_REGISTER_TYPE_DF; 2089 } else { 2090 /* To avoid floating-point denorm flushing problems, set the type by 2091 * default to an integer type - instructions that need floating point 2092 * semantics will set this to F if they need to 2093 */ 2094 reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src), 2095 BRW_REGISTER_TYPE_D); 2096 } 2097 2098 return reg; 2099} 2100 2101/** 2102 * Return an IMM for constants; otherwise call get_nir_src() as normal. 2103 * 2104 * This function should not be called on any value which may be 64 bits. 2105 * We could theoretically support 64-bit on gfx8+ but we choose not to 2106 * because it wouldn't work in general (no gfx7 support) and there are 2107 * enough restrictions in 64-bit immediates that you can't take the return 2108 * value and treat it the same as the result of get_nir_src(). 2109 */ 2110fs_reg 2111fs_visitor::get_nir_src_imm(const nir_src &src) 2112{ 2113 assert(nir_src_bit_size(src) == 32); 2114 return nir_src_is_const(src) ? 2115 fs_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(src); 2116} 2117 2118fs_reg 2119fs_visitor::get_nir_dest(const nir_dest &dest) 2120{ 2121 if (dest.is_ssa) { 2122 const brw_reg_type reg_type = 2123 brw_reg_type_from_bit_size(dest.ssa.bit_size, 2124 dest.ssa.bit_size == 8 ? 2125 BRW_REGISTER_TYPE_D : 2126 BRW_REGISTER_TYPE_F); 2127 nir_ssa_values[dest.ssa.index] = 2128 bld.vgrf(reg_type, dest.ssa.num_components); 2129 bld.UNDEF(nir_ssa_values[dest.ssa.index]); 2130 return nir_ssa_values[dest.ssa.index]; 2131 } else { 2132 /* We don't handle indirects on locals */ 2133 assert(dest.reg.indirect == NULL); 2134 return offset(nir_locals[dest.reg.reg->index], bld, 2135 dest.reg.base_offset * dest.reg.reg->num_components); 2136 } 2137} 2138 2139void 2140fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst, 2141 unsigned wr_mask) 2142{ 2143 for (unsigned i = 0; i < 4; i++) { 2144 if (!((wr_mask >> i) & 1)) 2145 continue; 2146 2147 fs_inst *new_inst = new(mem_ctx) fs_inst(inst); 2148 new_inst->dst = offset(new_inst->dst, bld, i); 2149 for (unsigned j = 0; j < new_inst->sources; j++) 2150 if (new_inst->src[j].file == VGRF) 2151 new_inst->src[j] = offset(new_inst->src[j], bld, i); 2152 2153 bld.emit(new_inst); 2154 } 2155} 2156 2157static fs_inst * 2158emit_pixel_interpolater_send(const fs_builder &bld, 2159 enum opcode opcode, 2160 const fs_reg &dst, 2161 const fs_reg &src, 2162 const fs_reg &desc, 2163 glsl_interp_mode interpolation) 2164{ 2165 struct brw_wm_prog_data *wm_prog_data = 2166 brw_wm_prog_data(bld.shader->stage_prog_data); 2167 2168 fs_inst *inst = bld.emit(opcode, dst, src, desc); 2169 /* 2 floats per slot returned */ 2170 inst->size_written = 2 * dst.component_size(inst->exec_size); 2171 if (interpolation == INTERP_MODE_NOPERSPECTIVE) { 2172 inst->pi_noperspective = true; 2173 /* TGL BSpec says: 2174 * This field cannot be set to "Linear Interpolation" 2175 * unless Non-Perspective Barycentric Enable in 3DSTATE_CLIP is enabled" 2176 */ 2177 wm_prog_data->uses_nonperspective_interp_modes = true; 2178 } 2179 2180 wm_prog_data->pulls_bary = true; 2181 2182 return inst; 2183} 2184 2185/** 2186 * Computes 1 << x, given a D/UD register containing some value x. 
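 * Implemented below as a MOV of the constant 1 followed by a SHL, so for
 * example x == 5 yields 32.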
2187 */ 2188static fs_reg 2189intexp2(const fs_builder &bld, const fs_reg &x) 2190{ 2191 assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D); 2192 2193 fs_reg result = bld.vgrf(x.type, 1); 2194 fs_reg one = bld.vgrf(x.type, 1); 2195 2196 bld.MOV(one, retype(brw_imm_d(1), one.type)); 2197 bld.SHL(result, one, x); 2198 return result; 2199} 2200 2201void 2202fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src) 2203{ 2204 assert(stage == MESA_SHADER_GEOMETRY); 2205 2206 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2207 2208 if (gs_compile->control_data_header_size_bits == 0) 2209 return; 2210 2211 /* We can only do EndPrimitive() functionality when the control data 2212 * consists of cut bits. Fortunately, the only time it isn't is when the 2213 * output type is points, in which case EndPrimitive() is a no-op. 2214 */ 2215 if (gs_prog_data->control_data_format != 2216 GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) { 2217 return; 2218 } 2219 2220 /* Cut bits use one bit per vertex. */ 2221 assert(gs_compile->control_data_bits_per_vertex == 1); 2222 2223 fs_reg vertex_count = get_nir_src(vertex_count_nir_src); 2224 vertex_count.type = BRW_REGISTER_TYPE_UD; 2225 2226 /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting 2227 * vertex n, 0 otherwise. So all we need to do here is mark bit 2228 * (vertex_count - 1) % 32 in the cut_bits register to indicate that 2229 * EndPrimitive() was called after emitting vertex (vertex_count - 1); 2230 * vec4_gs_visitor::emit_control_data_bits() will take care of the rest. 2231 * 2232 * Note that if EndPrimitive() is called before emitting any vertices, this 2233 * will cause us to set bit 31 of the control_data_bits register to 1. 2234 * That's fine because: 2235 * 2236 * - If max_vertices < 32, then vertex number 31 (zero-based) will never be 2237 * output, so the hardware will ignore cut bit 31. 2238 * 2239 * - If max_vertices == 32, then vertex number 31 is guaranteed to be the 2240 * last vertex, so setting cut bit 31 has no effect (since the primitive 2241 * is automatically ended when the GS terminates). 2242 * 2243 * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the 2244 * control_data_bits register to 0 when the first vertex is emitted. 2245 */ 2246 2247 const fs_builder abld = bld.annotate("end primitive"); 2248 2249 /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */ 2250 fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2251 abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); 2252 fs_reg mask = intexp2(abld, prev_count); 2253 /* Note: we're relying on the fact that the GEN SHL instruction only pays 2254 * attention to the lower 5 bits of its second source argument, so on this 2255 * architecture, 1 << (vertex_count - 1) is equivalent to 1 << 2256 * ((vertex_count - 1) % 32). 2257 */ 2258 abld.OR(this->control_data_bits, this->control_data_bits, mask); 2259} 2260 2261void 2262fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count) 2263{ 2264 assert(stage == MESA_SHADER_GEOMETRY); 2265 assert(gs_compile->control_data_bits_per_vertex != 0); 2266 2267 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2268 2269 const fs_builder abld = bld.annotate("emit control data bits"); 2270 const fs_builder fwa_bld = bld.exec_all(); 2271 2272 /* We use a single UD register to accumulate control data bits (32 bits 2273 * for each of the SIMD8 channels). So we need to write a DWord (32 bits) 2274 * at a time. 
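 * (With 2 bits per vertex a DWord covers 16 vertices; with 1 bit per
 * vertex it covers 32.)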
2275 *
2276 * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
2277 * We have to select a 128-bit group via the Global and Per-Slot Offsets, then
2278 * use the Channel Mask phase to enable/disable which DWord within that
2279 * group to write. (Remember, different SIMD8 channels may have emitted
2280 * different numbers of vertices, so we may need per-slot offsets.)
2281 *
2282 * Channel masking presents an annoying problem: we may have to replicate
2283 * the data up to 4 times:
2284 *
2285 * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
2286 *
2287 * To avoid penalizing shaders that emit a small number of vertices, we
2288 * can avoid these sometimes: if the size of the control data header is
2289 * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will land
2290 * in the same 128-bit group, so we can skip per-slot offsets.
2291 *
2292 * Similarly, if the control data header is <= 32 bits, there is only one
2293 * DWord, so we can skip channel masks.
2294 */
2295 fs_reg channel_mask, per_slot_offset;
2296
2297 if (gs_compile->control_data_header_size_bits > 32)
2298 channel_mask = vgrf(glsl_type::uint_type);
2299
2300 if (gs_compile->control_data_header_size_bits > 128)
2301 per_slot_offset = vgrf(glsl_type::uint_type);
2302
2303 /* Figure out which DWord we're trying to write to using the formula:
2304 *
2305 * dword_index = (vertex_count - 1) * bits_per_vertex / 32
2306 *
2307 * Since bits_per_vertex is a power of two, and is known at compile
2308 * time, this can be optimized to:
2309 *
2310 * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
2311 */
2312 if (channel_mask.file != BAD_FILE || per_slot_offset.file != BAD_FILE) {
2313 fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2314 fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2315 abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
2316 unsigned log2_bits_per_vertex =
2317 util_last_bit(gs_compile->control_data_bits_per_vertex);
2318 abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
2319
2320 if (per_slot_offset.file != BAD_FILE) {
2321 /* Set the per-slot offset to dword_index / 4, so that we'll write to
2322 * the appropriate OWord within the control data header.
2323 */
2324 abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
2325 }
2326
2327 /* Set the channel masks to 1 << (dword_index % 4), so that we'll
2328 * write to the appropriate DWORD within the OWORD.
2329 */
2330 fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2331 fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
2332 channel_mask = intexp2(fwa_bld, channel);
2333 /* Then the channel masks need to be in bits 23:16. */
2334 fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
2335 }
2336
2337 /* Store the control data bits in the message payload and send it. */
2338 const unsigned header_size = 1 + unsigned(channel_mask.file != BAD_FILE) +
2339 unsigned(per_slot_offset.file != BAD_FILE);
2340
2341 /* If there are channel masks, add 3 extra copies of the data.
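 * Each extra copy fills another "Data" slot of the message layout shown
 * above; all of them are sourced from control_data_bits.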
*/ 2342 const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE); 2343 2344 fs_reg sources[4]; 2345 2346 for (unsigned i = 0; i < ARRAY_SIZE(sources); i++) 2347 sources[i] = this->control_data_bits; 2348 2349 fs_reg srcs[URB_LOGICAL_NUM_SRCS]; 2350 srcs[URB_LOGICAL_SRC_HANDLE] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); 2351 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset; 2352 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask; 2353 srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length), 2354 BRW_REGISTER_TYPE_F); 2355 abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0); 2356 2357 fs_inst *inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, 2358 srcs, ARRAY_SIZE(srcs)); 2359 inst->mlen = header_size + length; 2360 /* We need to increment Global Offset by 256-bits to make room for 2361 * Broadwell's extra "Vertex Count" payload at the beginning of the 2362 * URB entry. Since this is an OWord message, Global Offset is counted 2363 * in 128-bit units, so we must set it to 2. 2364 */ 2365 if (gs_prog_data->static_vertex_count == -1) 2366 inst->offset = 2; 2367} 2368 2369void 2370fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count, 2371 unsigned stream_id) 2372{ 2373 /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */ 2374 2375 /* Note: we are calling this *before* increasing vertex_count, so 2376 * this->vertex_count == vertex_count - 1 in the formula above. 2377 */ 2378 2379 /* Stream mode uses 2 bits per vertex */ 2380 assert(gs_compile->control_data_bits_per_vertex == 2); 2381 2382 /* Must be a valid stream */ 2383 assert(stream_id < MAX_VERTEX_STREAMS); 2384 2385 /* Control data bits are initialized to 0 so we don't have to set any 2386 * bits when sending vertices to stream 0. 2387 */ 2388 if (stream_id == 0) 2389 return; 2390 2391 const fs_builder abld = bld.annotate("set stream control data bits", NULL); 2392 2393 /* reg::sid = stream_id */ 2394 fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2395 abld.MOV(sid, brw_imm_ud(stream_id)); 2396 2397 /* reg:shift_count = 2 * (vertex_count - 1) */ 2398 fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2399 abld.SHL(shift_count, vertex_count, brw_imm_ud(1u)); 2400 2401 /* Note: we're relying on the fact that the GEN SHL instruction only pays 2402 * attention to the lower 5 bits of its second source argument, so on this 2403 * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to 2404 * stream_id << ((2 * (vertex_count - 1)) % 32). 2405 */ 2406 fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2407 abld.SHL(mask, sid, shift_count); 2408 abld.OR(this->control_data_bits, this->control_data_bits, mask); 2409} 2410 2411void 2412fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src, 2413 unsigned stream_id) 2414{ 2415 assert(stage == MESA_SHADER_GEOMETRY); 2416 2417 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2418 2419 fs_reg vertex_count = get_nir_src(vertex_count_nir_src); 2420 vertex_count.type = BRW_REGISTER_TYPE_UD; 2421 2422 /* Haswell and later hardware ignores the "Render Stream Select" bits 2423 * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled, 2424 * and instead sends all primitives down the pipeline for rasterization. 2425 * If the SOL stage is enabled, "Render Stream Select" is honored and 2426 * primitives bound to non-zero streams are discarded after stream output. 
2427 *
2428 * Since the only purpose of primitives sent to non-zero streams is to
2429 * be recorded by transform feedback, we can simply discard all geometry
2430 * bound to these streams when transform feedback is disabled.
2431 */
2432 if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
2433 return;
2434
2435 /* If we're outputting 32 control data bits or less, then we can wait
2436 * until the shader is over to output them all. Otherwise we need to
2437 * output them as we go. Now is the time to do it, since we're about to
2438 * output the vertex_count'th vertex, so it's guaranteed that the
2439 * control data bits associated with the (vertex_count - 1)th vertex are
2440 * correct.
2441 */
2442 if (gs_compile->control_data_header_size_bits > 32) {
2443 const fs_builder abld =
2444 bld.annotate("emit vertex: emit control data bits");
2445
2446 /* Only emit control data bits if we've finished accumulating a batch
2447 * of 32 bits. This is the case when:
2448 *
2449 * (vertex_count * bits_per_vertex) % 32 == 0
2450 *
2451 * (in other words, when the last 5 bits of vertex_count *
2452 * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some
2453 * integer n (which is always the case, since bits_per_vertex is
2454 * always 1 or 2), this is equivalent to requiring that the last 5-n
2455 * bits of vertex_count are 0:
2456 *
2457 * vertex_count & (2^(5-n) - 1) == 0
2458 *
2459 * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
2460 * equivalent to:
2461 *
2462 * vertex_count & (32 / bits_per_vertex - 1) == 0
2463 *
2464 * TODO: If vertex_count is an immediate, we could do some of this math
2465 * at compile time...
2466 */
2467 fs_inst *inst =
2468 abld.AND(bld.null_reg_d(), vertex_count,
2469 brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
2470 inst->conditional_mod = BRW_CONDITIONAL_Z;
2471
2472 abld.IF(BRW_PREDICATE_NORMAL);
2473 /* If vertex_count is 0, then no control data bits have been
2474 * accumulated yet, so we can skip emitting them.
2475 */
2476 abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
2477 BRW_CONDITIONAL_NEQ);
2478 abld.IF(BRW_PREDICATE_NORMAL);
2479 emit_gs_control_data_bits(vertex_count);
2480 abld.emit(BRW_OPCODE_ENDIF);
2481
2482 /* Reset control_data_bits to 0 so we can start accumulating a new
2483 * batch.
2484 *
2485 * Note: in the case where vertex_count == 0, this neutralizes the
2486 * effect of any call to EndPrimitive() that the shader may have
2487 * made before outputting its first vertex.
2488 */
2489 inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
2490 inst->force_writemask_all = true;
2491 abld.emit(BRW_OPCODE_ENDIF);
2492 }
2493
2494 emit_urb_writes(vertex_count);
2495
2496 /* In stream mode we have to set control data bits for all vertices
2497 * unless we have disabled control data bits completely (which we do
2498 * for GL_POINTS outputs that don't use streams).
2499 */ 2500 if (gs_compile->control_data_header_size_bits > 0 && 2501 gs_prog_data->control_data_format == 2502 GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { 2503 set_gs_stream_control_data_bits(vertex_count, stream_id); 2504 } 2505} 2506 2507void 2508fs_visitor::emit_gs_input_load(const fs_reg &dst, 2509 const nir_src &vertex_src, 2510 unsigned base_offset, 2511 const nir_src &offset_src, 2512 unsigned num_components, 2513 unsigned first_component) 2514{ 2515 assert(type_sz(dst.type) == 4); 2516 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2517 const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8; 2518 2519 /* TODO: figure out push input layout for invocations == 1 */ 2520 if (gs_prog_data->invocations == 1 && 2521 nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) && 2522 4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) { 2523 int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 + 2524 nir_src_as_uint(vertex_src) * push_reg_count; 2525 for (unsigned i = 0; i < num_components; i++) { 2526 bld.MOV(offset(dst, bld, i), 2527 fs_reg(ATTR, imm_offset + i + first_component, dst.type)); 2528 } 2529 return; 2530 } 2531 2532 /* Resort to the pull model. Ensure the VUE handles are provided. */ 2533 assert(gs_prog_data->base.include_vue_handles); 2534 2535 unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2; 2536 fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2537 2538 if (gs_prog_data->invocations == 1) { 2539 if (nir_src_is_const(vertex_src)) { 2540 /* The vertex index is constant; just select the proper URB handle. */ 2541 icp_handle = 2542 retype(brw_vec8_grf(first_icp_handle + nir_src_as_uint(vertex_src), 0), 2543 BRW_REGISTER_TYPE_UD); 2544 } else { 2545 /* The vertex index is non-constant. We need to use indirect 2546 * addressing to fetch the proper URB handle. 2547 * 2548 * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0> 2549 * indicating that channel <n> should read the handle from 2550 * DWord <n>. We convert that to bytes by multiplying by 4. 2551 * 2552 * Next, we convert the vertex index to bytes by multiplying 2553 * by 32 (shifting by 5), and add the two together. This is 2554 * the final indirect byte offset. 2555 */ 2556 fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1); 2557 fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2558 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2559 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2560 2561 /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */ 2562 bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210))); 2563 /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */ 2564 bld.SHL(channel_offsets, sequence, brw_imm_ud(2u)); 2565 /* Convert vertex_index to bytes (multiply by 32) */ 2566 bld.SHL(vertex_offset_bytes, 2567 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2568 brw_imm_ud(5u)); 2569 bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets); 2570 2571 /* Use first_icp_handle as the base offset. There is one register 2572 * of URB handles per vertex, so inform the register allocator that 2573 * we might read up to nir->info.gs.vertices_in registers. 
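 * For example, a vertex index of 2 in SIMD channel 3 gives an indirect
 * offset of 2 * 32 + 3 * 4 = 76 bytes, i.e. DWord 3 of the URB handle
 * register for vertex 2.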
2574 */ 2575 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2576 retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), 2577 fs_reg(icp_offset_bytes), 2578 brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE)); 2579 } 2580 } else { 2581 assert(gs_prog_data->invocations > 1); 2582 2583 if (nir_src_is_const(vertex_src)) { 2584 unsigned vertex = nir_src_as_uint(vertex_src); 2585 assert(devinfo->ver >= 9 || vertex <= 5); 2586 bld.MOV(icp_handle, 2587 retype(brw_vec1_grf(first_icp_handle + vertex / 8, vertex % 8), 2588 BRW_REGISTER_TYPE_UD)); 2589 } else { 2590 /* The vertex index is non-constant. We need to use indirect 2591 * addressing to fetch the proper URB handle. 2592 * 2593 */ 2594 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2595 2596 /* Convert vertex_index to bytes (multiply by 4) */ 2597 bld.SHL(icp_offset_bytes, 2598 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2599 brw_imm_ud(2u)); 2600 2601 /* Use first_icp_handle as the base offset. There is one DWord 2602 * of URB handles per vertex, so inform the register allocator that 2603 * we might read up to ceil(nir->info.gs.vertices_in / 8) registers. 2604 */ 2605 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2606 retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), 2607 fs_reg(icp_offset_bytes), 2608 brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) * 2609 REG_SIZE)); 2610 } 2611 } 2612 2613 fs_inst *inst; 2614 fs_reg indirect_offset = get_nir_src(offset_src); 2615 2616 if (nir_src_is_const(offset_src)) { 2617 fs_reg srcs[URB_LOGICAL_NUM_SRCS]; 2618 srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle; 2619 2620 /* Constant indexing - use global offset. */ 2621 if (first_component != 0) { 2622 unsigned read_components = num_components + first_component; 2623 fs_reg tmp = bld.vgrf(dst.type, read_components); 2624 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs, 2625 ARRAY_SIZE(srcs)); 2626 inst->size_written = read_components * 2627 tmp.component_size(inst->exec_size); 2628 for (unsigned i = 0; i < num_components; i++) { 2629 bld.MOV(offset(dst, bld, i), 2630 offset(tmp, bld, i + first_component)); 2631 } 2632 } else { 2633 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs, 2634 ARRAY_SIZE(srcs)); 2635 inst->size_written = num_components * 2636 dst.component_size(inst->exec_size); 2637 } 2638 inst->offset = base_offset + nir_src_as_uint(offset_src); 2639 inst->mlen = 1; 2640 } else { 2641 /* Indirect indexing - use per-slot offsets as well. 
*/ 2642 unsigned read_components = num_components + first_component; 2643 fs_reg tmp = bld.vgrf(dst.type, read_components); 2644 2645 fs_reg srcs[URB_LOGICAL_NUM_SRCS]; 2646 srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle; 2647 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset; 2648 2649 if (first_component != 0) { 2650 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, 2651 srcs, ARRAY_SIZE(srcs)); 2652 inst->size_written = read_components * 2653 tmp.component_size(inst->exec_size); 2654 for (unsigned i = 0; i < num_components; i++) { 2655 bld.MOV(offset(dst, bld, i), 2656 offset(tmp, bld, i + first_component)); 2657 } 2658 } else { 2659 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, 2660 srcs, ARRAY_SIZE(srcs)); 2661 inst->size_written = num_components * 2662 dst.component_size(inst->exec_size); 2663 } 2664 inst->offset = base_offset; 2665 inst->mlen = 2; 2666 } 2667} 2668 2669fs_reg 2670fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr) 2671{ 2672 nir_src *offset_src = nir_get_io_offset_src(instr); 2673 2674 if (nir_src_is_const(*offset_src)) { 2675 /* The only constant offset we should find is 0. brw_nir.c's 2676 * add_const_offset_to_base() will fold other constant offsets 2677 * into instr->const_index[0]. 2678 */ 2679 assert(nir_src_as_uint(*offset_src) == 0); 2680 return fs_reg(); 2681 } 2682 2683 return get_nir_src(*offset_src); 2684} 2685 2686void 2687fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld, 2688 nir_intrinsic_instr *instr) 2689{ 2690 assert(stage == MESA_SHADER_VERTEX); 2691 2692 fs_reg dest; 2693 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2694 dest = get_nir_dest(instr->dest); 2695 2696 switch (instr->intrinsic) { 2697 case nir_intrinsic_load_vertex_id: 2698 case nir_intrinsic_load_base_vertex: 2699 unreachable("should be lowered by nir_lower_system_values()"); 2700 2701 case nir_intrinsic_load_input: { 2702 assert(nir_dest_bit_size(instr->dest) == 32); 2703 fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type); 2704 src = offset(src, bld, nir_intrinsic_component(instr)); 2705 src = offset(src, bld, nir_src_as_uint(instr->src[0])); 2706 2707 for (unsigned i = 0; i < instr->num_components; i++) 2708 bld.MOV(offset(dest, bld, i), offset(src, bld, i)); 2709 break; 2710 } 2711 2712 case nir_intrinsic_load_vertex_id_zero_base: 2713 case nir_intrinsic_load_instance_id: 2714 case nir_intrinsic_load_base_instance: 2715 case nir_intrinsic_load_draw_id: 2716 case nir_intrinsic_load_first_vertex: 2717 case nir_intrinsic_load_is_indexed_draw: 2718 unreachable("lowered by brw_nir_lower_vs_inputs"); 2719 2720 default: 2721 nir_emit_intrinsic(bld, instr); 2722 break; 2723 } 2724} 2725 2726fs_reg 2727fs_visitor::get_tcs_single_patch_icp_handle(const fs_builder &bld, 2728 nir_intrinsic_instr *instr) 2729{ 2730 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); 2731 const nir_src &vertex_src = instr->src[0]; 2732 nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src); 2733 fs_reg icp_handle; 2734 2735 if (nir_src_is_const(vertex_src)) { 2736 /* Emit a MOV to resolve <0,1,0> regioning. 
*/ 2737 icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2738 unsigned vertex = nir_src_as_uint(vertex_src); 2739 bld.MOV(icp_handle, 2740 retype(brw_vec1_grf(1 + (vertex >> 3), vertex & 7), 2741 BRW_REGISTER_TYPE_UD)); 2742 } else if (tcs_prog_data->instances == 1 && vertex_intrin && 2743 vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) { 2744 /* For the common case of only 1 instance, an array index of 2745 * gl_InvocationID means reading g1. Skip all the indirect work. 2746 */ 2747 icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD); 2748 } else { 2749 /* The vertex index is non-constant. We need to use indirect 2750 * addressing to fetch the proper URB handle. 2751 */ 2752 icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2753 2754 /* Each ICP handle is a single DWord (4 bytes) */ 2755 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2756 bld.SHL(vertex_offset_bytes, 2757 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2758 brw_imm_ud(2u)); 2759 2760 /* Start at g1. We might read up to 4 registers. */ 2761 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2762 retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes, 2763 brw_imm_ud(4 * REG_SIZE)); 2764 } 2765 2766 return icp_handle; 2767} 2768 2769fs_reg 2770fs_visitor::get_tcs_eight_patch_icp_handle(const fs_builder &bld, 2771 nir_intrinsic_instr *instr) 2772{ 2773 struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key; 2774 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); 2775 const nir_src &vertex_src = instr->src[0]; 2776 2777 unsigned first_icp_handle = tcs_prog_data->include_primitive_id ? 3 : 2; 2778 2779 if (nir_src_is_const(vertex_src)) { 2780 return fs_reg(retype(brw_vec8_grf(first_icp_handle + 2781 nir_src_as_uint(vertex_src), 0), 2782 BRW_REGISTER_TYPE_UD)); 2783 } 2784 2785 /* The vertex index is non-constant. We need to use indirect 2786 * addressing to fetch the proper URB handle. 2787 * 2788 * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0> 2789 * indicating that channel <n> should read the handle from 2790 * DWord <n>. We convert that to bytes by multiplying by 4. 2791 * 2792 * Next, we convert the vertex index to bytes by multiplying 2793 * by 32 (shifting by 5), and add the two together. This is 2794 * the final indirect byte offset. 2795 */ 2796 fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2797 fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1); 2798 fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2799 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2800 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2801 2802 /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */ 2803 bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210))); 2804 /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */ 2805 bld.SHL(channel_offsets, sequence, brw_imm_ud(2u)); 2806 /* Convert vertex_index to bytes (multiply by 32) */ 2807 bld.SHL(vertex_offset_bytes, 2808 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2809 brw_imm_ud(5u)); 2810 bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets); 2811 2812 /* Use first_icp_handle as the base offset. There is one register 2813 * of URB handles per vertex, so inform the register allocator that 2814 * we might read up to nir->info.gs.vertices_in registers. 
2815 */ 2816 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2817 retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), 2818 icp_offset_bytes, brw_imm_ud(tcs_key->input_vertices * REG_SIZE)); 2819 2820 return icp_handle; 2821} 2822 2823struct brw_reg 2824fs_visitor::get_tcs_output_urb_handle() 2825{ 2826 struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data); 2827 2828 if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) { 2829 return retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD); 2830 } else { 2831 assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH); 2832 return retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD); 2833 } 2834} 2835 2836void 2837fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, 2838 nir_intrinsic_instr *instr) 2839{ 2840 assert(stage == MESA_SHADER_TESS_CTRL); 2841 struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key; 2842 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); 2843 struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base; 2844 2845 bool eight_patch = 2846 vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH; 2847 2848 fs_reg dst; 2849 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2850 dst = get_nir_dest(instr->dest); 2851 2852 switch (instr->intrinsic) { 2853 case nir_intrinsic_load_primitive_id: 2854 bld.MOV(dst, fs_reg(eight_patch ? brw_vec8_grf(2, 0) 2855 : brw_vec1_grf(0, 1))); 2856 break; 2857 case nir_intrinsic_load_invocation_id: 2858 bld.MOV(retype(dst, invocation_id.type), invocation_id); 2859 break; 2860 case nir_intrinsic_load_patch_vertices_in: 2861 bld.MOV(retype(dst, BRW_REGISTER_TYPE_D), 2862 brw_imm_d(tcs_key->input_vertices)); 2863 break; 2864 2865 case nir_intrinsic_control_barrier: { 2866 if (tcs_prog_data->instances == 1) 2867 break; 2868 2869 fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2870 fs_reg m0_2 = component(m0, 2); 2871 2872 const fs_builder chanbld = bld.exec_all().group(1, 0); 2873 2874 /* Zero the message header */ 2875 bld.exec_all().MOV(m0, brw_imm_ud(0u)); 2876 2877 if (devinfo->verx10 >= 125) { 2878 /* From BSpec: 54006, mov r0.2[31:24] into m0.2[31:24] and m0.2[23:16] */ 2879 fs_reg m0_10ub = component(retype(m0, BRW_REGISTER_TYPE_UB), 10); 2880 fs_reg r0_11ub = 2881 stride(suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UB), 11), 2882 0, 1, 0); 2883 bld.exec_all().group(2, 0).MOV(m0_10ub, r0_11ub); 2884 } else if (devinfo->ver >= 11) { 2885 chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), 2886 brw_imm_ud(INTEL_MASK(30, 24))); 2887 2888 /* Set the Barrier Count and the enable bit */ 2889 chanbld.OR(m0_2, m0_2, 2890 brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15))); 2891 } else { 2892 /* Copy "Barrier ID" from r0.2, bits 16:13 */ 2893 chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), 2894 brw_imm_ud(INTEL_MASK(16, 13))); 2895 2896 /* Shift it up to bits 27:24. 
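 * (Bits 16:13 shifted left by 11 land in bits 27:24.)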
*/ 2897 chanbld.SHL(m0_2, m0_2, brw_imm_ud(11)); 2898 2899 /* Set the Barrier Count and the enable bit */ 2900 chanbld.OR(m0_2, m0_2, 2901 brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15))); 2902 } 2903 2904 bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0); 2905 break; 2906 } 2907 2908 case nir_intrinsic_load_input: 2909 unreachable("nir_lower_io should never give us these."); 2910 break; 2911 2912 case nir_intrinsic_load_per_vertex_input: { 2913 assert(nir_dest_bit_size(instr->dest) == 32); 2914 fs_reg indirect_offset = get_indirect_offset(instr); 2915 unsigned imm_offset = instr->const_index[0]; 2916 fs_inst *inst; 2917 2918 fs_reg icp_handle = 2919 eight_patch ? get_tcs_eight_patch_icp_handle(bld, instr) 2920 : get_tcs_single_patch_icp_handle(bld, instr); 2921 2922 /* We can only read two double components with each URB read, so 2923 * we send two read messages in that case, each one loading up to 2924 * two double components. 2925 */ 2926 unsigned num_components = instr->num_components; 2927 unsigned first_component = nir_intrinsic_component(instr); 2928 2929 fs_reg srcs[URB_LOGICAL_NUM_SRCS]; 2930 srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle; 2931 2932 if (indirect_offset.file == BAD_FILE) { 2933 /* Constant indexing - use global offset. */ 2934 if (first_component != 0) { 2935 unsigned read_components = num_components + first_component; 2936 fs_reg tmp = bld.vgrf(dst.type, read_components); 2937 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs, 2938 ARRAY_SIZE(srcs)); 2939 for (unsigned i = 0; i < num_components; i++) { 2940 bld.MOV(offset(dst, bld, i), 2941 offset(tmp, bld, i + first_component)); 2942 } 2943 } else { 2944 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs, 2945 ARRAY_SIZE(srcs)); 2946 } 2947 inst->offset = imm_offset; 2948 inst->mlen = 1; 2949 } else { 2950 /* Indirect indexing - use per-slot offsets as well. */ 2951 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset; 2952 2953 if (first_component != 0) { 2954 unsigned read_components = num_components + first_component; 2955 fs_reg tmp = bld.vgrf(dst.type, read_components); 2956 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, 2957 srcs, ARRAY_SIZE(srcs)); 2958 for (unsigned i = 0; i < num_components; i++) { 2959 bld.MOV(offset(dst, bld, i), 2960 offset(tmp, bld, i + first_component)); 2961 } 2962 } else { 2963 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, 2964 srcs, ARRAY_SIZE(srcs)); 2965 } 2966 inst->offset = imm_offset; 2967 inst->mlen = 2; 2968 } 2969 inst->size_written = (num_components + first_component) * 2970 inst->dst.component_size(inst->exec_size); 2971 2972 /* Copy the temporary to the destination to deal with writemasking. 2973 * 2974 * Also attempt to deal with gl_PointSize being in the .w component. 
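 * For a constant offset of 0 (the patch header), the code below redirects
 * the URB read into a fresh vec4 temporary and copies component 3 (.w)
 * into the destination.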
2975 */
2976 if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
2977 assert(type_sz(dst.type) == 4);
2978 inst->dst = bld.vgrf(dst.type, 4);
2979 inst->size_written = 4 * REG_SIZE;
2980 bld.MOV(dst, offset(inst->dst, bld, 3));
2981 }
2982 break;
2983 }
2984
2985 case nir_intrinsic_load_output:
2986 case nir_intrinsic_load_per_vertex_output: {
2987 assert(nir_dest_bit_size(instr->dest) == 32);
2988 fs_reg indirect_offset = get_indirect_offset(instr);
2989 unsigned imm_offset = instr->const_index[0];
2990 unsigned first_component = nir_intrinsic_component(instr);
2991
2992 struct brw_reg output_handles = get_tcs_output_urb_handle();
2993
2994 fs_inst *inst;
2995 if (indirect_offset.file == BAD_FILE) {
2996 /* This MOV replicates the output handle to all enabled channels
2997 * in SINGLE_PATCH mode.
2998 */
2999 fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
3000 bld.MOV(patch_handle, output_handles);
3001
3002 {
3003 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3004 srcs[URB_LOGICAL_SRC_HANDLE] = patch_handle;
3005
3006 if (first_component != 0) {
3007 unsigned read_components =
3008 instr->num_components + first_component;
3009 fs_reg tmp = bld.vgrf(dst.type, read_components);
3010 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3011 srcs, ARRAY_SIZE(srcs));
3012 inst->size_written = read_components * REG_SIZE;
3013 for (unsigned i = 0; i < instr->num_components; i++) {
3014 bld.MOV(offset(dst, bld, i),
3015 offset(tmp, bld, i + first_component));
3016 }
3017 } else {
3018 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
3019 srcs, ARRAY_SIZE(srcs));
3020 inst->size_written = instr->num_components * REG_SIZE;
3021 }
3022 inst->offset = imm_offset;
3023 inst->mlen = 1;
3024 }
3025 } else {
3026 /* Indirect indexing - use per-slot offsets as well.
*/ 3027 fs_reg srcs[URB_LOGICAL_NUM_SRCS]; 3028 srcs[URB_LOGICAL_SRC_HANDLE] = output_handles; 3029 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset; 3030 3031 if (first_component != 0) { 3032 unsigned read_components = 3033 instr->num_components + first_component; 3034 fs_reg tmp = bld.vgrf(dst.type, read_components); 3035 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, 3036 srcs, ARRAY_SIZE(srcs)); 3037 inst->size_written = read_components * REG_SIZE; 3038 for (unsigned i = 0; i < instr->num_components; i++) { 3039 bld.MOV(offset(dst, bld, i), 3040 offset(tmp, bld, i + first_component)); 3041 } 3042 } else { 3043 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, 3044 srcs, ARRAY_SIZE(srcs)); 3045 inst->size_written = instr->num_components * REG_SIZE; 3046 } 3047 inst->offset = imm_offset; 3048 inst->mlen = 2; 3049 } 3050 break; 3051 } 3052 3053 case nir_intrinsic_store_output: 3054 case nir_intrinsic_store_per_vertex_output: { 3055 assert(nir_src_bit_size(instr->src[0]) == 32); 3056 fs_reg value = get_nir_src(instr->src[0]); 3057 fs_reg indirect_offset = get_indirect_offset(instr); 3058 unsigned imm_offset = instr->const_index[0]; 3059 unsigned mask = instr->const_index[1]; 3060 3061 if (mask == 0) 3062 break; 3063 3064 unsigned num_components = util_last_bit(mask); 3065 3066 /* We can only pack two 64-bit components in a single message, so send 3067 * 2 messages if we have more components 3068 */ 3069 unsigned first_component = nir_intrinsic_component(instr); 3070 mask = mask << first_component; 3071 3072 fs_reg mask_reg; 3073 if (mask != WRITEMASK_XYZW) 3074 mask_reg = brw_imm_ud(mask << 16); 3075 3076 fs_reg sources[4]; 3077 3078 for (unsigned i = 0; i < num_components; i++) { 3079 if (!(mask & (1 << (i + first_component)))) 3080 continue; 3081 3082 sources[i + first_component] = offset(value, bld, i); 3083 } 3084 3085 unsigned header_size = 1 + unsigned(indirect_offset.file != BAD_FILE) + 3086 unsigned(mask != WRITEMASK_XYZW); 3087 const unsigned length = num_components + first_component; 3088 3089 fs_reg srcs[URB_LOGICAL_NUM_SRCS]; 3090 srcs[URB_LOGICAL_SRC_HANDLE] = get_tcs_output_urb_handle(); 3091 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset; 3092 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg; 3093 srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length), 3094 BRW_REGISTER_TYPE_F); 3095 bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0); 3096 3097 fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, 3098 srcs, ARRAY_SIZE(srcs)); 3099 inst->offset = imm_offset; 3100 inst->mlen = header_size + length; 3101 break; 3102 } 3103 3104 default: 3105 nir_emit_intrinsic(bld, instr); 3106 break; 3107 } 3108} 3109 3110void 3111fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, 3112 nir_intrinsic_instr *instr) 3113{ 3114 assert(stage == MESA_SHADER_TESS_EVAL); 3115 struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data); 3116 3117 fs_reg dest; 3118 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3119 dest = get_nir_dest(instr->dest); 3120 3121 switch (instr->intrinsic) { 3122 case nir_intrinsic_load_primitive_id: 3123 bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1))); 3124 break; 3125 case nir_intrinsic_load_tess_coord: 3126 /* gl_TessCoord is part of the payload in g1-3 */ 3127 for (unsigned i = 0; i < 3; i++) { 3128 bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0))); 3129 } 3130 break; 3131 3132 case nir_intrinsic_load_input: 3133 case nir_intrinsic_load_per_vertex_input: { 3134 
assert(nir_dest_bit_size(instr->dest) == 32); 3135 fs_reg indirect_offset = get_indirect_offset(instr); 3136 unsigned imm_offset = instr->const_index[0]; 3137 unsigned first_component = nir_intrinsic_component(instr); 3138 3139 fs_inst *inst; 3140 if (indirect_offset.file == BAD_FILE) { 3141 /* Arbitrarily only push up to 32 vec4 slots worth of data, 3142 * which is 16 registers (since each holds 2 vec4 slots). 3143 */ 3144 const unsigned max_push_slots = 32; 3145 if (imm_offset < max_push_slots) { 3146 fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type); 3147 for (int i = 0; i < instr->num_components; i++) { 3148 unsigned comp = 4 * (imm_offset % 2) + i + first_component; 3149 bld.MOV(offset(dest, bld, i), component(src, comp)); 3150 } 3151 3152 tes_prog_data->base.urb_read_length = 3153 MAX2(tes_prog_data->base.urb_read_length, 3154 (imm_offset / 2) + 1); 3155 } else { 3156 /* Replicate the patch handle to all enabled channels */ 3157 fs_reg srcs[URB_LOGICAL_NUM_SRCS]; 3158 srcs[URB_LOGICAL_SRC_HANDLE] = 3159 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD); 3160 3161 if (first_component != 0) { 3162 unsigned read_components = 3163 instr->num_components + first_component; 3164 fs_reg tmp = bld.vgrf(dest.type, read_components); 3165 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, 3166 srcs, ARRAY_SIZE(srcs)); 3167 inst->size_written = read_components * REG_SIZE; 3168 for (unsigned i = 0; i < instr->num_components; i++) { 3169 bld.MOV(offset(dest, bld, i), 3170 offset(tmp, bld, i + first_component)); 3171 } 3172 } else { 3173 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest, 3174 srcs, ARRAY_SIZE(srcs)); 3175 inst->size_written = instr->num_components * REG_SIZE; 3176 } 3177 inst->mlen = 1; 3178 inst->offset = imm_offset; 3179 } 3180 } else { 3181 /* Indirect indexing - use per-slot offsets as well. */ 3182 3183 /* We can only read two double components with each URB read, so 3184 * we send two read messages in that case, each one loading up to 3185 * two double components. 
3186 */ 3187 unsigned num_components = instr->num_components; 3188 3189 fs_reg srcs[URB_LOGICAL_NUM_SRCS]; 3190 srcs[URB_LOGICAL_SRC_HANDLE] = 3191 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD); 3192 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset; 3193 3194 if (first_component != 0) { 3195 unsigned read_components = 3196 num_components + first_component; 3197 fs_reg tmp = bld.vgrf(dest.type, read_components); 3198 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, 3199 srcs, ARRAY_SIZE(srcs)); 3200 for (unsigned i = 0; i < num_components; i++) { 3201 bld.MOV(offset(dest, bld, i), 3202 offset(tmp, bld, i + first_component)); 3203 } 3204 } else { 3205 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest, 3206 srcs, ARRAY_SIZE(srcs)); 3207 } 3208 inst->mlen = 2; 3209 inst->offset = imm_offset; 3210 inst->size_written = (num_components + first_component) * 3211 inst->dst.component_size(inst->exec_size); 3212 } 3213 break; 3214 } 3215 default: 3216 nir_emit_intrinsic(bld, instr); 3217 break; 3218 } 3219} 3220 3221void 3222fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld, 3223 nir_intrinsic_instr *instr) 3224{ 3225 assert(stage == MESA_SHADER_GEOMETRY); 3226 fs_reg indirect_offset; 3227 3228 fs_reg dest; 3229 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3230 dest = get_nir_dest(instr->dest); 3231 3232 switch (instr->intrinsic) { 3233 case nir_intrinsic_load_primitive_id: 3234 assert(stage == MESA_SHADER_GEOMETRY); 3235 assert(brw_gs_prog_data(prog_data)->include_primitive_id); 3236 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), 3237 retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD)); 3238 break; 3239 3240 case nir_intrinsic_load_input: 3241 unreachable("load_input intrinsics are invalid for the GS stage"); 3242 3243 case nir_intrinsic_load_per_vertex_input: 3244 emit_gs_input_load(dest, instr->src[0], instr->const_index[0], 3245 instr->src[1], instr->num_components, 3246 nir_intrinsic_component(instr)); 3247 break; 3248 3249 case nir_intrinsic_emit_vertex_with_counter: 3250 emit_gs_vertex(instr->src[0], instr->const_index[0]); 3251 break; 3252 3253 case nir_intrinsic_end_primitive_with_counter: 3254 emit_gs_end_primitive(instr->src[0]); 3255 break; 3256 3257 case nir_intrinsic_set_vertex_and_primitive_count: 3258 bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0])); 3259 break; 3260 3261 case nir_intrinsic_load_invocation_id: { 3262 fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID]; 3263 assert(val.file != BAD_FILE); 3264 dest.type = val.type; 3265 bld.MOV(dest, val); 3266 break; 3267 } 3268 3269 default: 3270 nir_emit_intrinsic(bld, instr); 3271 break; 3272 } 3273} 3274 3275/** 3276 * Fetch the current render target layer index. 3277 */ 3278static fs_reg 3279fetch_render_target_array_index(const fs_builder &bld) 3280{ 3281 if (bld.shader->devinfo->ver >= 12) { 3282 /* The render target array index is provided in the thread payload as 3283 * bits 26:16 of r1.1. 3284 */ 3285 const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); 3286 bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 3), 3287 brw_imm_uw(0x7ff)); 3288 return idx; 3289 } else if (bld.shader->devinfo->ver >= 6) { 3290 /* The render target array index is provided in the thread payload as 3291 * bits 26:16 of r0.0. 
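 * (The code below reads the high word of that DWord as a UW and masks it
 * with 0x7ff to keep the 11 bits 26:16.)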
3292 */ 3293 const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); 3294 bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1), 3295 brw_imm_uw(0x7ff)); 3296 return idx; 3297 } else { 3298 /* Pre-SNB we only ever render into the first layer of the framebuffer 3299 * since layered rendering is not implemented. 3300 */ 3301 return brw_imm_ud(0); 3302 } 3303} 3304 3305/** 3306 * Fake non-coherent framebuffer read implemented using TXF to fetch from the 3307 * framebuffer at the current fragment coordinates and sample index. 3308 */ 3309fs_inst * 3310fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, 3311 unsigned target) 3312{ 3313 const struct intel_device_info *devinfo = bld.shader->devinfo; 3314 3315 assert(bld.shader->stage == MESA_SHADER_FRAGMENT); 3316 const brw_wm_prog_key *wm_key = 3317 reinterpret_cast<const brw_wm_prog_key *>(key); 3318 assert(!wm_key->coherent_fb_fetch); 3319 3320 /* Calculate the fragment coordinates. */ 3321 const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); 3322 bld.MOV(offset(coords, bld, 0), pixel_x); 3323 bld.MOV(offset(coords, bld, 1), pixel_y); 3324 bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld)); 3325 3326 /* Calculate the sample index and MCS payload when multisampling. Luckily 3327 * the MCS fetch message behaves deterministically for UMS surfaces, so it 3328 * shouldn't be necessary to recompile based on whether the framebuffer is 3329 * CMS or UMS. 3330 */ 3331 if (wm_key->multisample_fbo && 3332 nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE) 3333 nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(); 3334 3335 const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID]; 3336 const fs_reg mcs = wm_key->multisample_fbo ? 3337 emit_mcs_fetch(coords, 3, brw_imm_ud(target), fs_reg()) : fs_reg(); 3338 3339 /* Use either a normal or a CMS texel fetch message depending on whether 3340 * the framebuffer is single or multisample. On SKL+ use the wide CMS 3341 * message just in case the framebuffer uses 16x multisampling, it should 3342 * be equivalent to the normal CMS fetch for lower multisampling modes. 3343 */ 3344 opcode op; 3345 if (wm_key->multisample_fbo) { 3346 /* On SKL+ use the wide CMS message just in case the framebuffer uses 16x 3347 * multisampling, it should be equivalent to the normal CMS fetch for 3348 * lower multisampling modes. 3349 * 3350 * On Gfx12HP, there is only CMS_W variant available. 3351 */ 3352 if (devinfo->verx10 >= 125) 3353 op = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL; 3354 else if (devinfo->ver >= 9) 3355 op = SHADER_OPCODE_TXF_CMS_W_LOGICAL; 3356 else 3357 op = SHADER_OPCODE_TXF_CMS_LOGICAL; 3358 } else { 3359 op = SHADER_OPCODE_TXF_LOGICAL; 3360 } 3361 3362 /* Emit the instruction. */ 3363 fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; 3364 srcs[TEX_LOGICAL_SRC_COORDINATE] = coords; 3365 srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0); 3366 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = sample; 3367 srcs[TEX_LOGICAL_SRC_MCS] = mcs; 3368 srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(target); 3369 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0); 3370 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3); 3371 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_ud(0); 3372 3373 fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs)); 3374 inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 3375 3376 return inst; 3377} 3378 3379/** 3380 * Actual coherent framebuffer read implemented using the native render target 3381 * read message. Requires SKL+. 
3382 */ 3383static fs_inst * 3384emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target) 3385{ 3386 assert(bld.shader->devinfo->ver >= 9); 3387 fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst); 3388 inst->target = target; 3389 inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 3390 3391 return inst; 3392} 3393 3394static fs_reg 3395alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n) 3396{ 3397 if (n && regs[0].file != BAD_FILE) { 3398 return regs[0]; 3399 3400 } else { 3401 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size); 3402 3403 for (unsigned i = 0; i < n; i++) 3404 regs[i] = tmp; 3405 3406 return tmp; 3407 } 3408} 3409 3410static fs_reg 3411alloc_frag_output(fs_visitor *v, unsigned location) 3412{ 3413 assert(v->stage == MESA_SHADER_FRAGMENT); 3414 const brw_wm_prog_key *const key = 3415 reinterpret_cast<const brw_wm_prog_key *>(v->key); 3416 const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION); 3417 const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX); 3418 3419 if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1)) 3420 return alloc_temporary(v->bld, 4, &v->dual_src_output, 1); 3421 3422 else if (l == FRAG_RESULT_COLOR) 3423 return alloc_temporary(v->bld, 4, v->outputs, 3424 MAX2(key->nr_color_regions, 1)); 3425 3426 else if (l == FRAG_RESULT_DEPTH) 3427 return alloc_temporary(v->bld, 1, &v->frag_depth, 1); 3428 3429 else if (l == FRAG_RESULT_STENCIL) 3430 return alloc_temporary(v->bld, 1, &v->frag_stencil, 1); 3431 3432 else if (l == FRAG_RESULT_SAMPLE_MASK) 3433 return alloc_temporary(v->bld, 1, &v->sample_mask, 1); 3434 3435 else if (l >= FRAG_RESULT_DATA0 && 3436 l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS) 3437 return alloc_temporary(v->bld, 4, 3438 &v->outputs[l - FRAG_RESULT_DATA0], 1); 3439 3440 else 3441 unreachable("Invalid location"); 3442} 3443 3444void 3445fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, 3446 nir_intrinsic_instr *instr) 3447{ 3448 assert(stage == MESA_SHADER_FRAGMENT); 3449 3450 fs_reg dest; 3451 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3452 dest = get_nir_dest(instr->dest); 3453 3454 switch (instr->intrinsic) { 3455 case nir_intrinsic_load_front_face: 3456 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), 3457 emit_frontfacing_interpolation()); 3458 break; 3459 3460 case nir_intrinsic_load_sample_pos: 3461 case nir_intrinsic_load_sample_pos_or_center: { 3462 fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS]; 3463 assert(sample_pos.file != BAD_FILE); 3464 dest.type = sample_pos.type; 3465 bld.MOV(dest, sample_pos); 3466 bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1)); 3467 break; 3468 } 3469 3470 case nir_intrinsic_load_layer_id: 3471 dest.type = BRW_REGISTER_TYPE_UD; 3472 bld.MOV(dest, fetch_render_target_array_index(bld)); 3473 break; 3474 3475 case nir_intrinsic_is_helper_invocation: 3476 emit_is_helper_invocation(dest); 3477 break; 3478 3479 case nir_intrinsic_load_helper_invocation: 3480 case nir_intrinsic_load_sample_mask_in: 3481 case nir_intrinsic_load_sample_id: 3482 case nir_intrinsic_load_frag_shading_rate: { 3483 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); 3484 fs_reg val = nir_system_values[sv]; 3485 assert(val.file != BAD_FILE); 3486 dest.type = val.type; 3487 bld.MOV(dest, val); 3488 break; 3489 } 3490 3491 case nir_intrinsic_store_output: { 3492 const fs_reg src = get_nir_src(instr->src[0]); 3493 const unsigned store_offset = 
nir_src_as_uint(instr->src[1]); 3494 const unsigned location = nir_intrinsic_base(instr) + 3495 SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION); 3496 const fs_reg new_dest = retype(alloc_frag_output(this, location), 3497 src.type); 3498 3499 for (unsigned j = 0; j < instr->num_components; j++) 3500 bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j), 3501 offset(src, bld, j)); 3502 3503 break; 3504 } 3505 3506 case nir_intrinsic_load_output: { 3507 const unsigned l = GET_FIELD(nir_intrinsic_base(instr), 3508 BRW_NIR_FRAG_OUTPUT_LOCATION); 3509 assert(l >= FRAG_RESULT_DATA0); 3510 const unsigned load_offset = nir_src_as_uint(instr->src[0]); 3511 const unsigned target = l - FRAG_RESULT_DATA0 + load_offset; 3512 const fs_reg tmp = bld.vgrf(dest.type, 4); 3513 3514 if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch) 3515 emit_coherent_fb_read(bld, tmp, target); 3516 else 3517 emit_non_coherent_fb_read(bld, tmp, target); 3518 3519 for (unsigned j = 0; j < instr->num_components; j++) { 3520 bld.MOV(offset(dest, bld, j), 3521 offset(tmp, bld, nir_intrinsic_component(instr) + j)); 3522 } 3523 3524 break; 3525 } 3526 3527 case nir_intrinsic_demote: 3528 case nir_intrinsic_discard: 3529 case nir_intrinsic_terminate: 3530 case nir_intrinsic_demote_if: 3531 case nir_intrinsic_discard_if: 3532 case nir_intrinsic_terminate_if: { 3533 /* We track our discarded pixels in f0.1/f1.0. By predicating on it, we 3534 * can update just the flag bits that aren't yet discarded. If there's 3535 * no condition, we emit a CMP of g0 != g0, so all currently executing 3536 * channels will get turned off. 3537 */ 3538 fs_inst *cmp = NULL; 3539 if (instr->intrinsic == nir_intrinsic_demote_if || 3540 instr->intrinsic == nir_intrinsic_discard_if || 3541 instr->intrinsic == nir_intrinsic_terminate_if) { 3542 nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]); 3543 3544 if (alu != NULL && 3545 alu->op != nir_op_bcsel && 3546 (devinfo->ver > 5 || 3547 (alu->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) != BRW_NIR_BOOLEAN_NEEDS_RESOLVE || 3548 alu->op == nir_op_fneu32 || alu->op == nir_op_feq32 || 3549 alu->op == nir_op_flt32 || alu->op == nir_op_fge32 || 3550 alu->op == nir_op_ine32 || alu->op == nir_op_ieq32 || 3551 alu->op == nir_op_ilt32 || alu->op == nir_op_ige32 || 3552 alu->op == nir_op_ult32 || alu->op == nir_op_uge32)) { 3553 /* Re-emit the instruction that generated the Boolean value, but 3554 * do not store it. Since this instruction will be conditional, 3555 * other instructions that want to use the real Boolean value may 3556 * get garbage. This was a problem for piglit's fs-discard-exit-2 3557 * test. 3558 * 3559 * Ideally we'd detect that the instruction cannot have a 3560 * conditional modifier before emitting the instructions. Alas, 3561 * that is nigh impossible. Instead, we're going to assume the 3562 * instruction (or last instruction) generated can have a 3563 * conditional modifier. If it cannot, fallback to the old-style 3564 * compare, and hope dead code elimination will clean up the 3565 * extra instructions generated. 3566 */ 3567 nir_emit_alu(bld, alu, false); 3568 3569 cmp = (fs_inst *) instructions.get_tail(); 3570 if (cmp->conditional_mod == BRW_CONDITIONAL_NONE) { 3571 if (cmp->can_do_cmod()) 3572 cmp->conditional_mod = BRW_CONDITIONAL_Z; 3573 else 3574 cmp = NULL; 3575 } else { 3576 /* The old sequence that would have been generated is, 3577 * basically, bool_result == false. This is equivalent to 3578 * !bool_result, so negate the old modifier. 
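 * E.g. a CMP that produced the Boolean with conditional mod .l (less than) is negated to .ge by brw_negate_cmod().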
3579 */ 3580 cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod); 3581 } 3582 } 3583 3584 if (cmp == NULL) { 3585 cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]), 3586 brw_imm_d(0), BRW_CONDITIONAL_Z); 3587 } 3588 } else { 3589 fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0), 3590 BRW_REGISTER_TYPE_UW)); 3591 cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ); 3592 } 3593 3594 cmp->predicate = BRW_PREDICATE_NORMAL; 3595 cmp->flag_subreg = sample_mask_flag_subreg(this); 3596 3597 fs_inst *jump = bld.emit(BRW_OPCODE_HALT); 3598 jump->flag_subreg = sample_mask_flag_subreg(this); 3599 jump->predicate_inverse = true; 3600 3601 if (instr->intrinsic == nir_intrinsic_terminate || 3602 instr->intrinsic == nir_intrinsic_terminate_if) { 3603 jump->predicate = BRW_PREDICATE_NORMAL; 3604 } else { 3605 /* Only jump when the whole quad is demoted. For historical 3606 * reasons this is also used for discard. 3607 */ 3608 jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H; 3609 } 3610 3611 if (devinfo->ver < 7) 3612 limit_dispatch_width( 3613 16, "Fragment discard/demote not implemented in SIMD32 mode.\n"); 3614 break; 3615 } 3616 3617 case nir_intrinsic_load_input: { 3618 /* In Fragment Shaders load_input is used either for flat inputs or 3619 * per-primitive inputs. 3620 */ 3621 assert(nir_dest_bit_size(instr->dest) == 32); 3622 unsigned base = nir_intrinsic_base(instr); 3623 unsigned comp = nir_intrinsic_component(instr); 3624 unsigned num_components = instr->num_components; 3625 3626 /* TODO(mesh): Multiview. Verify and handle these special cases for Mesh. */ 3627 3628 /* Special case fields in the VUE header */ 3629 if (base == VARYING_SLOT_LAYER) 3630 comp = 1; 3631 else if (base == VARYING_SLOT_VIEWPORT) 3632 comp = 2; 3633 3634 if (BITFIELD64_BIT(base) & nir->info.per_primitive_inputs) { 3635 assert(base != VARYING_SLOT_PRIMITIVE_INDICES); 3636 for (unsigned int i = 0; i < num_components; i++) { 3637 bld.MOV(offset(dest, bld, i), 3638 retype(component(per_primitive_reg(base), comp + i), dest.type)); 3639 } 3640 } else { 3641 for (unsigned int i = 0; i < num_components; i++) { 3642 bld.MOV(offset(dest, bld, i), 3643 retype(component(interp_reg(base, comp + i), 3), dest.type)); 3644 } 3645 } 3646 break; 3647 } 3648 3649 case nir_intrinsic_load_fs_input_interp_deltas: { 3650 assert(stage == MESA_SHADER_FRAGMENT); 3651 assert(nir_src_as_uint(instr->src[0]) == 0); 3652 fs_reg interp = interp_reg(nir_intrinsic_base(instr), 3653 nir_intrinsic_component(instr)); 3654 dest.type = BRW_REGISTER_TYPE_F; 3655 bld.MOV(offset(dest, bld, 0), component(interp, 3)); 3656 bld.MOV(offset(dest, bld, 1), component(interp, 1)); 3657 bld.MOV(offset(dest, bld, 2), component(interp, 0)); 3658 break; 3659 } 3660 3661 case nir_intrinsic_load_barycentric_pixel: 3662 case nir_intrinsic_load_barycentric_centroid: 3663 case nir_intrinsic_load_barycentric_sample: { 3664 /* Use the delta_xy values computed from the payload */ 3665 enum brw_barycentric_mode bary = brw_barycentric_mode(instr); 3666 const fs_reg srcs[] = { offset(this->delta_xy[bary], bld, 0), 3667 offset(this->delta_xy[bary], bld, 1) }; 3668 bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0); 3669 break; 3670 } 3671 3672 case nir_intrinsic_load_barycentric_at_sample: { 3673 const glsl_interp_mode interpolation = 3674 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); 3675 3676 if (nir_src_is_const(instr->src[0])) { 3677 unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4; 3678 3679 emit_pixel_interpolater_send(bld, 3680 
FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3681 dest, 3682 fs_reg(), /* src */ 3683 brw_imm_ud(msg_data), 3684 interpolation); 3685 } else { 3686 const fs_reg sample_src = retype(get_nir_src(instr->src[0]), 3687 BRW_REGISTER_TYPE_UD); 3688 3689 if (nir_src_is_always_uniform(instr->src[0])) { 3690 const fs_reg sample_id = bld.emit_uniformize(sample_src); 3691 const fs_reg msg_data = vgrf(glsl_type::uint_type); 3692 bld.exec_all().group(1, 0) 3693 .SHL(msg_data, sample_id, brw_imm_ud(4u)); 3694 emit_pixel_interpolater_send(bld, 3695 FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3696 dest, 3697 fs_reg(), /* src */ 3698 component(msg_data, 0), 3699 interpolation); 3700 } else { 3701 /* Make a loop that sends a message to the pixel interpolater 3702 * for the sample number in each live channel. If there are 3703 * multiple channels with the same sample number then these 3704 * will be handled simultaneously with a single iteration of 3705 * the loop. 3706 */ 3707 bld.emit(BRW_OPCODE_DO); 3708 3709 /* Get the next live sample number into sample_id_reg */ 3710 const fs_reg sample_id = bld.emit_uniformize(sample_src); 3711 3712 /* Set the flag register so that we can perform the send 3713 * message on all channels that have the same sample number 3714 */ 3715 bld.CMP(bld.null_reg_ud(), 3716 sample_src, sample_id, 3717 BRW_CONDITIONAL_EQ); 3718 const fs_reg msg_data = vgrf(glsl_type::uint_type); 3719 bld.exec_all().group(1, 0) 3720 .SHL(msg_data, sample_id, brw_imm_ud(4u)); 3721 fs_inst *inst = 3722 emit_pixel_interpolater_send(bld, 3723 FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3724 dest, 3725 fs_reg(), /* src */ 3726 component(msg_data, 0), 3727 interpolation); 3728 set_predicate(BRW_PREDICATE_NORMAL, inst); 3729 3730 /* Continue the loop if there are any live channels left */ 3731 set_predicate_inv(BRW_PREDICATE_NORMAL, 3732 true, /* inverse */ 3733 bld.emit(BRW_OPCODE_WHILE)); 3734 } 3735 } 3736 break; 3737 } 3738 3739 case nir_intrinsic_load_barycentric_at_offset: { 3740 const glsl_interp_mode interpolation = 3741 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); 3742 3743 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); 3744 3745 if (const_offset) { 3746 assert(nir_src_bit_size(instr->src[0]) == 32); 3747 unsigned off_x = const_offset[0].u32 & 0xf; 3748 unsigned off_y = const_offset[1].u32 & 0xf; 3749 3750 emit_pixel_interpolater_send(bld, 3751 FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, 3752 dest, 3753 fs_reg(), /* src */ 3754 brw_imm_ud(off_x | (off_y << 4)), 3755 interpolation); 3756 } else { 3757 fs_reg src = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_D); 3758 const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; 3759 emit_pixel_interpolater_send(bld, 3760 opcode, 3761 dest, 3762 src, 3763 brw_imm_ud(0u), 3764 interpolation); 3765 } 3766 break; 3767 } 3768 3769 case nir_intrinsic_load_frag_coord: 3770 emit_fragcoord_interpolation(dest); 3771 break; 3772 3773 case nir_intrinsic_load_interpolated_input: { 3774 assert(instr->src[0].ssa && 3775 instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic); 3776 nir_intrinsic_instr *bary_intrinsic = 3777 nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr); 3778 nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic; 3779 enum glsl_interp_mode interp_mode = 3780 (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic); 3781 fs_reg dst_xy; 3782 3783 if (bary_intrin == nir_intrinsic_load_barycentric_at_offset || 3784 bary_intrin == nir_intrinsic_load_barycentric_at_sample) { 3785 /* Use the result of the PI 
message. */ 3786 dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F); 3787 } else { 3788 /* Use the delta_xy values computed from the payload */ 3789 enum brw_barycentric_mode bary = brw_barycentric_mode(bary_intrinsic); 3790 dst_xy = this->delta_xy[bary]; 3791 } 3792 3793 for (unsigned int i = 0; i < instr->num_components; i++) { 3794 fs_reg interp = 3795 component(interp_reg(nir_intrinsic_base(instr), 3796 nir_intrinsic_component(instr) + i), 0); 3797 interp.type = BRW_REGISTER_TYPE_F; 3798 dest.type = BRW_REGISTER_TYPE_F; 3799 3800 if (devinfo->ver < 6 && interp_mode == INTERP_MODE_SMOOTH) { 3801 fs_reg tmp = vgrf(glsl_type::float_type); 3802 bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp); 3803 bld.MUL(offset(dest, bld, i), tmp, this->pixel_w); 3804 } else { 3805 bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp); 3806 } 3807 } 3808 break; 3809 } 3810 3811 default: 3812 nir_emit_intrinsic(bld, instr); 3813 break; 3814 } 3815} 3816 3817void 3818fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, 3819 nir_intrinsic_instr *instr) 3820{ 3821 assert(gl_shader_stage_uses_workgroup(stage)); 3822 struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); 3823 3824 fs_reg dest; 3825 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3826 dest = get_nir_dest(instr->dest); 3827 3828 switch (instr->intrinsic) { 3829 case nir_intrinsic_control_barrier: 3830 /* The whole workgroup fits in a single HW thread, so all the 3831 * invocations are already executed lock-step. Instead of an actual 3832 * barrier just emit a scheduling fence, that will generate no code. 3833 */ 3834 if (!nir->info.workgroup_size_variable && 3835 workgroup_size() <= dispatch_width) { 3836 bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE); 3837 break; 3838 } 3839 3840 emit_barrier(); 3841 cs_prog_data->uses_barrier = true; 3842 break; 3843 3844 case nir_intrinsic_load_subgroup_id: 3845 if (devinfo->verx10 >= 125) 3846 bld.AND(retype(dest, BRW_REGISTER_TYPE_UD), 3847 retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), 3848 brw_imm_ud(INTEL_MASK(7, 0))); 3849 else 3850 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id); 3851 break; 3852 3853 case nir_intrinsic_load_local_invocation_id: 3854 case nir_intrinsic_load_workgroup_id: { 3855 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); 3856 fs_reg val = nir_system_values[sv]; 3857 assert(val.file != BAD_FILE); 3858 dest.type = val.type; 3859 for (unsigned i = 0; i < 3; i++) 3860 bld.MOV(offset(dest, bld, i), offset(val, bld, i)); 3861 break; 3862 } 3863 3864 case nir_intrinsic_load_num_workgroups: { 3865 assert(nir_dest_bit_size(instr->dest) == 32); 3866 3867 cs_prog_data->uses_num_work_groups = true; 3868 3869 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 3870 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(0); 3871 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 3872 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(3); /* num components */ 3873 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(0); 3874 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 3875 fs_inst *inst = 3876 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 3877 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 3878 inst->size_written = 3 * dispatch_width * 4; 3879 break; 3880 } 3881 3882 case nir_intrinsic_shared_atomic_add: 3883 case nir_intrinsic_shared_atomic_imin: 3884 case nir_intrinsic_shared_atomic_umin: 3885 case nir_intrinsic_shared_atomic_imax: 3886 case nir_intrinsic_shared_atomic_umax: 3887 case 
nir_intrinsic_shared_atomic_and: 3888 case nir_intrinsic_shared_atomic_or: 3889 case nir_intrinsic_shared_atomic_xor: 3890 case nir_intrinsic_shared_atomic_exchange: 3891 case nir_intrinsic_shared_atomic_comp_swap: 3892 nir_emit_shared_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr); 3893 break; 3894 case nir_intrinsic_shared_atomic_fmin: 3895 case nir_intrinsic_shared_atomic_fmax: 3896 case nir_intrinsic_shared_atomic_fcomp_swap: 3897 nir_emit_shared_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr); 3898 break; 3899 3900 case nir_intrinsic_load_shared: { 3901 assert(devinfo->ver >= 7); 3902 assert(nir_intrinsic_base(instr) == 0); 3903 3904 const unsigned bit_size = nir_dest_bit_size(instr->dest); 3905 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 3906 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM); 3907 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[0]); 3908 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 3909 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 3910 3911 /* Make dest unsigned because that's what the temporary will be */ 3912 dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 3913 3914 /* Read the vector */ 3915 assert(nir_dest_bit_size(instr->dest) <= 32); 3916 assert(nir_intrinsic_align(instr) > 0); 3917 if (nir_dest_bit_size(instr->dest) == 32 && 3918 nir_intrinsic_align(instr) >= 4) { 3919 assert(nir_dest_num_components(instr->dest) <= 4); 3920 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 3921 fs_inst *inst = 3922 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 3923 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 3924 inst->size_written = instr->num_components * dispatch_width * 4; 3925 } else { 3926 assert(nir_dest_num_components(instr->dest) == 1); 3927 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 3928 3929 fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); 3930 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, 3931 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); 3932 bld.MOV(dest, subscript(read_result, dest.type, 0)); 3933 } 3934 break; 3935 } 3936 3937 case nir_intrinsic_store_shared: { 3938 assert(devinfo->ver >= 7); 3939 assert(nir_intrinsic_base(instr) == 0); 3940 3941 const unsigned bit_size = nir_src_bit_size(instr->src[0]); 3942 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 3943 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM); 3944 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 3945 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 3946 /* No point in masking with sample mask, here we're handling compute 3947 * intrinsics. 
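 * (Masking with the sample mask only matters for fragment shaders, where helper and discarded invocations must not perform data port writes.)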
3948 */ 3949 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 3950 3951 fs_reg data = get_nir_src(instr->src[0]); 3952 data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 3953 3954 assert(nir_src_bit_size(instr->src[0]) <= 32); 3955 assert(nir_intrinsic_write_mask(instr) == 3956 (1u << instr->num_components) - 1); 3957 assert(nir_intrinsic_align(instr) > 0); 3958 if (nir_src_bit_size(instr->src[0]) == 32 && 3959 nir_intrinsic_align(instr) >= 4) { 3960 assert(nir_src_num_components(instr->src[0]) <= 4); 3961 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 3962 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 3963 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 3964 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 3965 } else { 3966 assert(nir_src_num_components(instr->src[0]) == 1); 3967 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 3968 3969 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); 3970 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); 3971 3972 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, 3973 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 3974 } 3975 break; 3976 } 3977 3978 case nir_intrinsic_load_workgroup_size: { 3979 /* For non-variable case, this should've been lowered already. */ 3980 assert(nir->info.workgroup_size_variable); 3981 3982 assert(compiler->lower_variable_group_size); 3983 assert(gl_shader_stage_is_compute(stage)); 3984 3985 for (unsigned i = 0; i < 3; i++) { 3986 bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD), 3987 group_size[i]); 3988 } 3989 break; 3990 } 3991 3992 default: 3993 nir_emit_intrinsic(bld, instr); 3994 break; 3995 } 3996} 3997 3998static void 3999emit_rt_lsc_fence(const fs_builder &bld, 4000 enum lsc_fence_scope scope, 4001 enum lsc_flush_type flush_type) 4002{ 4003 const intel_device_info *devinfo = bld.shader->devinfo; 4004 4005 const fs_builder ubld = bld.exec_all().group(8, 0); 4006 fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD); 4007 fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, tmp, 4008 brw_imm_ud(0) /* desc */, 4009 brw_imm_ud(0) /* ex_desc */, 4010 brw_vec8_grf(0, 0) /* payload */); 4011 send->sfid = GFX12_SFID_UGM; 4012 send->desc = lsc_fence_msg_desc(devinfo, scope, flush_type, true); 4013 send->mlen = 1; /* g0 header */ 4014 send->ex_mlen = 0; 4015 send->size_written = REG_SIZE; /* Temp write for scheduling */ 4016 send->send_has_side_effects = true; 4017 4018 ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), tmp); 4019} 4020 4021 4022void 4023fs_visitor::nir_emit_bs_intrinsic(const fs_builder &bld, 4024 nir_intrinsic_instr *instr) 4025{ 4026 assert(brw_shader_stage_is_bindless(stage)); 4027 4028 fs_reg dest; 4029 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 4030 dest = get_nir_dest(instr->dest); 4031 4032 switch (instr->intrinsic) { 4033 case nir_intrinsic_load_btd_global_arg_addr_intel: 4034 bld.MOV(dest, retype(brw_vec1_grf(2, 0), dest.type)); 4035 break; 4036 4037 case nir_intrinsic_load_btd_local_arg_addr_intel: 4038 bld.MOV(dest, retype(brw_vec1_grf(2, 2), dest.type)); 4039 break; 4040 4041 case nir_intrinsic_load_btd_shader_type_intel: { 4042 fs_reg ud_dest = retype(dest, BRW_REGISTER_TYPE_UD); 4043 bld.MOV(ud_dest, retype(brw_vec1_grf(0, 3), ud_dest.type)); 4044 bld.AND(ud_dest, ud_dest, brw_imm_ud(0xf)); 4045 break; 4046 } 4047 4048 default: 4049 nir_emit_intrinsic(bld, instr); 4050 break; 4051 } 4052} 4053 4054static fs_reg 4055brw_nir_reduction_op_identity(const fs_builder &bld, 4056 nir_op op, brw_reg_type type) 4057{ 4058 
nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8); 4059 switch (type_sz(type)) { 4060 case 1: 4061 if (type == BRW_REGISTER_TYPE_UB) { 4062 return brw_imm_uw(value.u8); 4063 } else { 4064 assert(type == BRW_REGISTER_TYPE_B); 4065 return brw_imm_w(value.i8); 4066 } 4067 case 2: 4068 return retype(brw_imm_uw(value.u16), type); 4069 case 4: 4070 return retype(brw_imm_ud(value.u32), type); 4071 case 8: 4072 if (type == BRW_REGISTER_TYPE_DF) 4073 return setup_imm_df(bld, value.f64); 4074 else 4075 return retype(brw_imm_u64(value.u64), type); 4076 default: 4077 unreachable("Invalid type size"); 4078 } 4079 } 4080 4081 static opcode 4082 brw_op_for_nir_reduction_op(nir_op op) 4083 { 4084 switch (op) { 4085 case nir_op_iadd: return BRW_OPCODE_ADD; 4086 case nir_op_fadd: return BRW_OPCODE_ADD; 4087 case nir_op_imul: return BRW_OPCODE_MUL; 4088 case nir_op_fmul: return BRW_OPCODE_MUL; 4089 case nir_op_imin: return BRW_OPCODE_SEL; 4090 case nir_op_umin: return BRW_OPCODE_SEL; 4091 case nir_op_fmin: return BRW_OPCODE_SEL; 4092 case nir_op_imax: return BRW_OPCODE_SEL; 4093 case nir_op_umax: return BRW_OPCODE_SEL; 4094 case nir_op_fmax: return BRW_OPCODE_SEL; 4095 case nir_op_iand: return BRW_OPCODE_AND; 4096 case nir_op_ior: return BRW_OPCODE_OR; 4097 case nir_op_ixor: return BRW_OPCODE_XOR; 4098 default: 4099 unreachable("Invalid reduction operation"); 4100 } 4101 } 4102 4103 static brw_conditional_mod 4104 brw_cond_mod_for_nir_reduction_op(nir_op op) 4105 { 4106 switch (op) { 4107 case nir_op_iadd: return BRW_CONDITIONAL_NONE; 4108 case nir_op_fadd: return BRW_CONDITIONAL_NONE; 4109 case nir_op_imul: return BRW_CONDITIONAL_NONE; 4110 case nir_op_fmul: return BRW_CONDITIONAL_NONE; 4111 case nir_op_imin: return BRW_CONDITIONAL_L; 4112 case nir_op_umin: return BRW_CONDITIONAL_L; 4113 case nir_op_fmin: return BRW_CONDITIONAL_L; 4114 case nir_op_imax: return BRW_CONDITIONAL_GE; 4115 case nir_op_umax: return BRW_CONDITIONAL_GE; 4116 case nir_op_fmax: return BRW_CONDITIONAL_GE; 4117 case nir_op_iand: return BRW_CONDITIONAL_NONE; 4118 case nir_op_ior: return BRW_CONDITIONAL_NONE; 4119 case nir_op_ixor: return BRW_CONDITIONAL_NONE; 4120 default: 4121 unreachable("Invalid reduction operation"); 4122 } 4123 } 4124 4125 fs_reg 4126 fs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld, 4127 nir_intrinsic_instr *instr) 4128 { 4129 fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD); 4130 fs_reg surf_index = image; 4131 4132 return bld.emit_uniformize(surf_index); 4133 } 4134 4135 fs_reg 4136 fs_visitor::get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld, 4137 nir_intrinsic_instr *instr) 4138 { 4139 /* SSBO stores are weird in that their index is in src[1] */ 4140 const bool is_store = 4141 instr->intrinsic == nir_intrinsic_store_ssbo || 4142 instr->intrinsic == nir_intrinsic_store_ssbo_block_intel; 4143 const unsigned src = is_store ? 1 : 0; 4144 4145 if (nir_src_is_const(instr->src[src])) { 4146 return brw_imm_ud(nir_src_as_uint(instr->src[src])); 4147 } else { 4148 return bld.emit_uniformize(get_nir_src(instr->src[src])); 4149 } 4150 } 4151 4152 /** 4153 * The offsets we get from NIR act as if each SIMD channel has its own blob 4154 * of contiguous space. However, if we actually place each SIMD channel in 4155 * its own space, we end up with terrible cache performance because each SIMD 4156 * channel accesses a different cache line even when they're all accessing the 4157 * same byte offset.
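 * For instance, with a SIMD16 message and a per-channel scratch allocation of at least 64 bytes, sixteen reads of byte 0 would touch sixteen different cache lines.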
To deal with this problem, we swizzle the address using 4158 * a simple algorithm which ensures that any time a SIMD message reads or 4159 * writes the same address, it's all in the same cache line. We have to keep 4160 * the bottom two bits fixed so that we can read/write up to a dword at a time 4161 * and the individual element is contiguous. We do this by splitting the 4162 * address as follows: 4163 * 4164 * 31 4-6 2 0 4165 * +-------------------------------+------------+----------+ 4166 * | Hi address bits | chan index | addr low | 4167 * +-------------------------------+------------+----------+ 4168 * 4169 * In other words, the bottom two address bits stay, and the top 30 get 4170 * shifted up so that we can stick the SIMD channel index in the middle. This 4171 * way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit 4172 * element at the same logical offset, the scratch read/write instruction acts on 4173 * contiguous elements and we get good cache locality. 4174 */ 4175 fs_reg 4176 fs_visitor::swizzle_nir_scratch_addr(const brw::fs_builder &bld, 4177 const fs_reg &nir_addr, 4178 bool in_dwords) 4179 { 4180 const fs_reg &chan_index = 4181 nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]; 4182 const unsigned chan_index_bits = ffs(dispatch_width) - 1; 4183 4184 fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD); 4185 if (in_dwords) { 4186 /* In this case, we know the address is aligned to a DWORD and we want 4187 * the final address in DWORDs. 4188 */ 4189 bld.SHL(addr, nir_addr, brw_imm_ud(chan_index_bits - 2)); 4190 bld.OR(addr, addr, chan_index); 4191 } else { 4192 /* This case is substantially more annoying because we have to pay 4193 * attention to those pesky two bottom bits. 4194 */ 4195 fs_reg addr_hi = bld.vgrf(BRW_REGISTER_TYPE_UD); 4196 bld.AND(addr_hi, nir_addr, brw_imm_ud(~0x3u)); 4197 bld.SHL(addr_hi, addr_hi, brw_imm_ud(chan_index_bits)); 4198 fs_reg chan_addr = bld.vgrf(BRW_REGISTER_TYPE_UD); 4199 bld.SHL(chan_addr, chan_index, brw_imm_ud(2)); 4200 bld.AND(addr, nir_addr, brw_imm_ud(0x3u)); 4201 bld.OR(addr, addr, addr_hi); 4202 bld.OR(addr, addr, chan_addr); 4203 } 4204 return addr; 4205 } 4206 4207 static unsigned 4208 choose_oword_block_size_dwords(unsigned dwords) 4209 { 4210 unsigned block; 4211 if (dwords >= 32) { 4212 block = 32; 4213 } else if (dwords >= 16) { 4214 block = 16; 4215 } else { 4216 block = 8; 4217 } 4218 assert(block <= dwords); 4219 return block; 4220 } 4221 4222 static void 4223 increment_a64_address(const fs_builder &bld, fs_reg address, uint32_t v) 4224 { 4225 if (bld.shader->devinfo->has_64bit_int) { 4226 bld.ADD(address, address, brw_imm_ud(v)); 4227 } else { 4228 fs_reg low = retype(address, BRW_REGISTER_TYPE_UD); 4229 fs_reg high = offset(low, bld, 1); 4230 4231 /* Add low and if that overflows, add carry to high.
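 * The first ADD sets the flag with the overflow conditional modifier and the second ADD of 1 to the high dword is predicated on it, so the carry is only applied in channels whose low add wrapped.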
*/ 4232 bld.ADD(low, low, brw_imm_ud(v))->conditional_mod = BRW_CONDITIONAL_O; 4233 bld.ADD(high, high, brw_imm_ud(0x1))->predicate = BRW_PREDICATE_NORMAL; 4234 } 4235} 4236 4237static fs_reg 4238emit_fence(const fs_builder &bld, enum opcode opcode, 4239 uint8_t sfid, uint32_t desc, 4240 bool commit_enable, uint8_t bti) 4241{ 4242 assert(opcode == SHADER_OPCODE_INTERLOCK || 4243 opcode == SHADER_OPCODE_MEMORY_FENCE); 4244 4245 fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD); 4246 fs_inst *fence = bld.emit(opcode, dst, brw_vec8_grf(0, 0), 4247 brw_imm_ud(commit_enable), 4248 brw_imm_ud(bti)); 4249 fence->sfid = sfid; 4250 fence->desc = desc; 4251 4252 return dst; 4253} 4254 4255static uint32_t 4256lsc_fence_descriptor_for_intrinsic(const struct intel_device_info *devinfo, 4257 nir_intrinsic_instr *instr) 4258{ 4259 assert(devinfo->has_lsc); 4260 4261 enum lsc_fence_scope scope = LSC_FENCE_LOCAL; 4262 enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE; 4263 4264 if (nir_intrinsic_has_memory_scope(instr)) { 4265 switch (nir_intrinsic_memory_scope(instr)) { 4266 case NIR_SCOPE_DEVICE: 4267 case NIR_SCOPE_QUEUE_FAMILY: 4268 scope = LSC_FENCE_TILE; 4269 flush_type = LSC_FLUSH_TYPE_EVICT; 4270 break; 4271 case NIR_SCOPE_WORKGROUP: 4272 scope = LSC_FENCE_THREADGROUP; 4273 flush_type = LSC_FLUSH_TYPE_EVICT; 4274 break; 4275 case NIR_SCOPE_SHADER_CALL: 4276 case NIR_SCOPE_INVOCATION: 4277 case NIR_SCOPE_SUBGROUP: 4278 case NIR_SCOPE_NONE: 4279 break; 4280 } 4281 } else { 4282 /* No scope defined. */ 4283 scope = LSC_FENCE_TILE; 4284 flush_type = LSC_FLUSH_TYPE_EVICT; 4285 } 4286 return lsc_fence_msg_desc(devinfo, scope, flush_type, true); 4287} 4288 4289void 4290fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) 4291{ 4292 fs_reg dest; 4293 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 4294 dest = get_nir_dest(instr->dest); 4295 4296 switch (instr->intrinsic) { 4297 case nir_intrinsic_image_load: 4298 case nir_intrinsic_image_store: 4299 case nir_intrinsic_image_atomic_add: 4300 case nir_intrinsic_image_atomic_imin: 4301 case nir_intrinsic_image_atomic_umin: 4302 case nir_intrinsic_image_atomic_imax: 4303 case nir_intrinsic_image_atomic_umax: 4304 case nir_intrinsic_image_atomic_and: 4305 case nir_intrinsic_image_atomic_or: 4306 case nir_intrinsic_image_atomic_xor: 4307 case nir_intrinsic_image_atomic_exchange: 4308 case nir_intrinsic_image_atomic_comp_swap: 4309 case nir_intrinsic_bindless_image_load: 4310 case nir_intrinsic_bindless_image_store: 4311 case nir_intrinsic_bindless_image_atomic_add: 4312 case nir_intrinsic_bindless_image_atomic_imin: 4313 case nir_intrinsic_bindless_image_atomic_umin: 4314 case nir_intrinsic_bindless_image_atomic_imax: 4315 case nir_intrinsic_bindless_image_atomic_umax: 4316 case nir_intrinsic_bindless_image_atomic_and: 4317 case nir_intrinsic_bindless_image_atomic_or: 4318 case nir_intrinsic_bindless_image_atomic_xor: 4319 case nir_intrinsic_bindless_image_atomic_exchange: 4320 case nir_intrinsic_bindless_image_atomic_comp_swap: { 4321 /* Get some metadata from the image intrinsic. 
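 * For the BTI image_* forms src[0] is the surface index, while for the bindless_image_* forms it is the bindless surface handle; the switch below routes it into SURFACE or SURFACE_HANDLE accordingly.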
*/ 4322 const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; 4323 4324 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4325 4326 switch (instr->intrinsic) { 4327 case nir_intrinsic_image_load: 4328 case nir_intrinsic_image_store: 4329 case nir_intrinsic_image_atomic_add: 4330 case nir_intrinsic_image_atomic_imin: 4331 case nir_intrinsic_image_atomic_umin: 4332 case nir_intrinsic_image_atomic_imax: 4333 case nir_intrinsic_image_atomic_umax: 4334 case nir_intrinsic_image_atomic_and: 4335 case nir_intrinsic_image_atomic_or: 4336 case nir_intrinsic_image_atomic_xor: 4337 case nir_intrinsic_image_atomic_exchange: 4338 case nir_intrinsic_image_atomic_comp_swap: 4339 srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4340 get_nir_image_intrinsic_image(bld, instr); 4341 break; 4342 4343 default: 4344 /* Bindless */ 4345 srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = 4346 bld.emit_uniformize(get_nir_src(instr->src[0])); 4347 break; 4348 } 4349 4350 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 4351 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = 4352 brw_imm_ud(nir_image_intrinsic_coord_components(instr)); 4353 4354 /* Emit an image load, store or atomic op. */ 4355 if (instr->intrinsic == nir_intrinsic_image_load || 4356 instr->intrinsic == nir_intrinsic_bindless_image_load) { 4357 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4358 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 4359 fs_inst *inst = 4360 bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL, 4361 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 4362 inst->size_written = instr->num_components * dispatch_width * 4; 4363 } else if (instr->intrinsic == nir_intrinsic_image_store || 4364 instr->intrinsic == nir_intrinsic_bindless_image_store) { 4365 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4366 srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]); 4367 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 4368 bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, 4369 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 4370 } else { 4371 unsigned num_srcs = info->num_srcs; 4372 int op = brw_aop_for_nir_intrinsic(instr); 4373 if (op == BRW_AOP_INC || op == BRW_AOP_DEC) { 4374 assert(num_srcs == 4); 4375 num_srcs = 3; 4376 } 4377 4378 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 4379 4380 fs_reg data; 4381 if (num_srcs >= 4) 4382 data = get_nir_src(instr->src[3]); 4383 if (num_srcs >= 5) { 4384 fs_reg tmp = bld.vgrf(data.type, 2); 4385 fs_reg sources[2] = { data, get_nir_src(instr->src[4]) }; 4386 bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 4387 data = tmp; 4388 } 4389 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 4390 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 4391 4392 bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL, 4393 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 4394 } 4395 break; 4396 } 4397 4398 case nir_intrinsic_image_size: 4399 case nir_intrinsic_bindless_image_size: { 4400 /* Cube image sizes should have previously been lowered to a 2D array */ 4401 assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE); 4402 4403 /* Unlike the [un]typed load and store opcodes, the TXS that this turns 4404 * into will handle the binding table index for us in the generator. 4405 * Incidentally, this means that we can handle bindless with exactly the 4406 * same code.
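 * (The only difference is whether the uniformized value ends up in TEX_LOGICAL_SRC_SURFACE or TEX_LOGICAL_SRC_SURFACE_HANDLE below.)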
4407 */ 4408 fs_reg image = retype(get_nir_src_imm(instr->src[0]), 4409 BRW_REGISTER_TYPE_UD); 4410 image = bld.emit_uniformize(image); 4411 4412 assert(nir_src_as_uint(instr->src[1]) == 0); 4413 4414 fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; 4415 if (instr->intrinsic == nir_intrinsic_image_size) 4416 srcs[TEX_LOGICAL_SRC_SURFACE] = image; 4417 else 4418 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image; 4419 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0); 4420 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0); 4421 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0); 4422 4423 /* Since the image size is always uniform, we can just emit a SIMD8 4424 * query instruction and splat the result out. 4425 */ 4426 const fs_builder ubld = bld.exec_all().group(8, 0); 4427 4428 fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4); 4429 fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL, 4430 tmp, srcs, ARRAY_SIZE(srcs)); 4431 inst->size_written = 4 * REG_SIZE; 4432 4433 for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) { 4434 bld.MOV(offset(retype(dest, tmp.type), bld, c), 4435 component(offset(tmp, ubld, c), 0)); 4436 } 4437 break; 4438 } 4439 4440 case nir_intrinsic_image_load_raw_intel: { 4441 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4442 srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4443 get_nir_image_intrinsic_image(bld, instr); 4444 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 4445 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 4446 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4447 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 4448 4449 fs_inst *inst = 4450 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 4451 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 4452 inst->size_written = instr->num_components * dispatch_width * 4; 4453 break; 4454 } 4455 4456 case nir_intrinsic_image_store_raw_intel: { 4457 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4458 srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4459 get_nir_image_intrinsic_image(bld, instr); 4460 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 4461 srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[2]); 4462 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 4463 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4464 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 4465 4466 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 4467 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 4468 break; 4469 } 4470 4471 case nir_intrinsic_scoped_barrier: 4472 assert(nir_intrinsic_execution_scope(instr) == NIR_SCOPE_NONE); 4473 FALLTHROUGH; 4474 case nir_intrinsic_group_memory_barrier: 4475 case nir_intrinsic_memory_barrier_shared: 4476 case nir_intrinsic_memory_barrier_buffer: 4477 case nir_intrinsic_memory_barrier_image: 4478 case nir_intrinsic_memory_barrier: 4479 case nir_intrinsic_begin_invocation_interlock: 4480 case nir_intrinsic_end_invocation_interlock: { 4481 bool ugm_fence, slm_fence, tgm_fence, urb_fence; 4482 const enum opcode opcode = 4483 instr->intrinsic == nir_intrinsic_begin_invocation_interlock ? 
4484 SHADER_OPCODE_INTERLOCK : SHADER_OPCODE_MEMORY_FENCE; 4485 4486 switch (instr->intrinsic) { 4487 case nir_intrinsic_scoped_barrier: { 4488 nir_variable_mode modes = nir_intrinsic_memory_modes(instr); 4489 ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global); 4490 slm_fence = modes & nir_var_mem_shared; 4491 tgm_fence = modes & nir_var_image; 4492 urb_fence = modes & (nir_var_shader_out | nir_var_mem_task_payload); 4493 break; 4494 } 4495 4496 case nir_intrinsic_begin_invocation_interlock: 4497 case nir_intrinsic_end_invocation_interlock: 4498 /* For beginInvocationInterlockARB(), we will generate a memory fence 4499 * but with a different opcode so that generator can pick SENDC 4500 * instead of SEND. 4501 * 4502 * For endInvocationInterlockARB(), we need to insert a memory fence which 4503 * stalls in the shader until the memory transactions prior to that 4504 * fence are complete. This ensures that the shader does not end before 4505 * any writes from its critical section have landed. Otherwise, you can 4506 * end up with a case where the next invocation on that pixel properly 4507 * stalls for previous FS invocation on its pixel to complete but 4508 * doesn't actually wait for the dataport memory transactions from that 4509 * thread to land before submitting its own. 4510 * 4511 * Handling them here will allow the logic for IVB render cache (see 4512 * below) to be reused. 4513 */ 4514 assert(stage == MESA_SHADER_FRAGMENT); 4515 ugm_fence = tgm_fence = true; 4516 slm_fence = urb_fence = false; 4517 break; 4518 4519 default: 4520 ugm_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared && 4521 instr->intrinsic != nir_intrinsic_memory_barrier_image; 4522 slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier || 4523 instr->intrinsic == nir_intrinsic_memory_barrier || 4524 instr->intrinsic == nir_intrinsic_memory_barrier_shared; 4525 tgm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier || 4526 instr->intrinsic == nir_intrinsic_memory_barrier || 4527 instr->intrinsic == nir_intrinsic_memory_barrier_image; 4528 urb_fence = instr->intrinsic == nir_intrinsic_memory_barrier; 4529 break; 4530 } 4531 4532 if (nir->info.shared_size > 0) { 4533 assert(gl_shader_stage_uses_workgroup(stage)); 4534 } else { 4535 slm_fence = false; 4536 } 4537 4538 /* If the workgroup fits in a single HW thread, the messages for SLM are 4539 * processed in-order and the shader itself is already synchronized so 4540 * the memory fence is not necessary. 4541 * 4542 * TODO: Check if applies for many HW threads sharing same Data Port. 
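 * (This mirrors the nir_intrinsic_control_barrier case above, which degrades to a plain scheduling fence under the same single-HW-thread condition.)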
4543 */ 4544 if (!nir->info.workgroup_size_variable && 4545 slm_fence && workgroup_size() <= dispatch_width) 4546 slm_fence = false; 4547 4548 switch (stage) { 4549 case MESA_SHADER_TESS_CTRL: 4550 case MESA_SHADER_TASK: 4551 case MESA_SHADER_MESH: 4552 break; 4553 default: 4554 urb_fence = false; 4555 break; 4556 } 4557 4558 unsigned fence_regs_count = 0; 4559 fs_reg fence_regs[4] = {}; 4560 4561 const fs_builder ubld = bld.group(8, 0); 4562 4563 if (devinfo->has_lsc) { 4564 assert(devinfo->verx10 >= 125); 4565 uint32_t desc = 4566 lsc_fence_descriptor_for_intrinsic(devinfo, instr); 4567 if (ugm_fence) { 4568 fence_regs[fence_regs_count++] = 4569 emit_fence(ubld, opcode, GFX12_SFID_UGM, desc, 4570 true /* commit_enable */, 4571 0 /* bti; ignored for LSC */); 4572 } 4573 4574 if (tgm_fence) { 4575 fence_regs[fence_regs_count++] = 4576 emit_fence(ubld, opcode, GFX12_SFID_TGM, desc, 4577 true /* commit_enable */, 4578 0 /* bti; ignored for LSC */); 4579 } 4580 4581 if (slm_fence) { 4582 assert(opcode == SHADER_OPCODE_MEMORY_FENCE); 4583 fence_regs[fence_regs_count++] = 4584 emit_fence(ubld, opcode, GFX12_SFID_SLM, desc, 4585 true /* commit_enable */, 4586 0 /* BTI; ignored for LSC */); 4587 } 4588 4589 if (urb_fence) { 4590 assert(opcode == SHADER_OPCODE_MEMORY_FENCE); 4591 fence_regs[fence_regs_count++] = 4592 emit_fence(ubld, opcode, BRW_SFID_URB, desc, 4593 true /* commit_enable */, 4594 0 /* BTI; ignored for LSC */); 4595 } 4596 } else if (devinfo->ver >= 11) { 4597 if (tgm_fence || ugm_fence || urb_fence) { 4598 fence_regs[fence_regs_count++] = 4599 emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0, 4600 true /* commit_enable HSD ES # 1404612949 */, 4601 0 /* BTI = 0 means data cache */); 4602 } 4603 4604 if (slm_fence) { 4605 assert(opcode == SHADER_OPCODE_MEMORY_FENCE); 4606 fence_regs[fence_regs_count++] = 4607 emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0, 4608 true /* commit_enable HSD ES # 1404612949 */, 4609 GFX7_BTI_SLM); 4610 } 4611 } else { 4612 /* Prior to Icelake, they're all lumped into a single cache except on 4613 * Ivy Bridge and Bay Trail where typed messages actually go through 4614 * the render cache. There, we need both fences because we may 4615 * access storage images as either typed or untyped. 4616 */ 4617 const bool render_fence = tgm_fence && devinfo->verx10 == 70; 4618 4619 /* Simulation also complains on Gfx9 if we do not enable commit. 4620 */ 4621 const bool commit_enable = render_fence || 4622 instr->intrinsic == nir_intrinsic_end_invocation_interlock || 4623 devinfo->ver == 9; 4624 4625 if (tgm_fence || ugm_fence || slm_fence || urb_fence) { 4626 fence_regs[fence_regs_count++] = 4627 emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0, 4628 commit_enable, 0 /* BTI */); 4629 } 4630 4631 if (render_fence) { 4632 fence_regs[fence_regs_count++] = 4633 emit_fence(ubld, opcode, GFX6_SFID_DATAPORT_RENDER_CACHE, 0, 4634 commit_enable, /* bti */ 0); 4635 } 4636 } 4637 4638 assert(fence_regs_count <= ARRAY_SIZE(fence_regs)); 4639 4640 /* There are four cases where we want to insert a stall: 4641 * 4642 * 1. If we're a nir_intrinsic_end_invocation_interlock. This is 4643 * required to ensure that the shader EOT doesn't happen until 4644 * after the fence returns. Otherwise, we might end up with the 4645 * next shader invocation for that pixel not respecting our fence 4646 * because it may happen on a different HW thread. 4647 * 4648 * 2. If we have multiple fences. 
This is required to ensure that 4649 * they all complete and nothing gets weirdly out-of-order. 4650 * 4651 * 3. If we have no fences. In this case, we need at least a 4652 * scheduling barrier to keep the compiler from moving things 4653 * around in an invalid way. 4654 * 4655 * 4. On platforms with LSC. 4656 */ 4657 if (instr->intrinsic == nir_intrinsic_end_invocation_interlock || 4658 fence_regs_count != 1 || devinfo->has_lsc) { 4659 ubld.exec_all().group(1, 0).emit( 4660 FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), 4661 fence_regs, fence_regs_count); 4662 } 4663 4664 break; 4665 } 4666 4667 case nir_intrinsic_memory_barrier_tcs_patch: 4668 break; 4669 4670 case nir_intrinsic_shader_clock: { 4671 /* We cannot do anything if there is an event, so ignore it for now */ 4672 const fs_reg shader_clock = get_timestamp(bld); 4673 const fs_reg srcs[] = { component(shader_clock, 0), 4674 component(shader_clock, 1) }; 4675 bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0); 4676 break; 4677 } 4678 4679 case nir_intrinsic_image_samples: 4680 /* The driver does not support multi-sampled images. */ 4681 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1)); 4682 break; 4683 4684 case nir_intrinsic_load_reloc_const_intel: { 4685 uint32_t id = nir_intrinsic_param_idx(instr); 4686 bld.emit(SHADER_OPCODE_MOV_RELOC_IMM, 4687 dest, brw_imm_ud(id)); 4688 break; 4689 } 4690 4691 case nir_intrinsic_load_uniform: { 4692 /* Offsets are in bytes but they should always be aligned to 4693 * the type size 4694 */ 4695 assert(instr->const_index[0] % 4 == 0 || 4696 instr->const_index[0] % type_sz(dest.type) == 0); 4697 4698 fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type); 4699 4700 if (nir_src_is_const(instr->src[0])) { 4701 unsigned load_offset = nir_src_as_uint(instr->src[0]); 4702 assert(load_offset % type_sz(dest.type) == 0); 4703 /* For 16-bit types we add the modulo of the const_index[0] 4704 * offset so we can access the element that is not 32-bit aligned 4705 */ 4706 src.offset = load_offset + instr->const_index[0] % 4; 4707 4708 for (unsigned j = 0; j < instr->num_components; j++) { 4709 bld.MOV(offset(dest, bld, j), offset(src, bld, j)); 4710 } 4711 } else { 4712 fs_reg indirect = retype(get_nir_src(instr->src[0]), 4713 BRW_REGISTER_TYPE_UD); 4714 4715 /* We need to pass a size to the MOV_INDIRECT but we don't want it to 4716 * go past the end of the uniform. In order to keep the n'th 4717 * component from running past, we subtract off the size of all but 4718 * one component of the vector. 4719 */ 4720 assert(instr->const_index[1] >= 4721 instr->num_components * (int) type_sz(dest.type)); 4722 unsigned read_size = instr->const_index[1] - 4723 (instr->num_components - 1) * type_sz(dest.type); 4724 4725 bool supports_64bit_indirects = 4726 devinfo->platform != INTEL_PLATFORM_CHV && !intel_device_info_is_9lp(devinfo); 4727 4728 if (type_sz(dest.type) != 8 || supports_64bit_indirects) { 4729 for (unsigned j = 0; j < instr->num_components; j++) { 4730 bld.emit(SHADER_OPCODE_MOV_INDIRECT, 4731 offset(dest, bld, j), offset(src, bld, j), 4732 indirect, brw_imm_ud(read_size)); 4733 } 4734 } else { 4735 const unsigned num_mov_indirects = 4736 type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD); 4737 /* We read a little bit less per MOV INDIRECT, as they are now 4738 * 32-bit ones instead of 64-bit. Adjust read_size accordingly.
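 * For example, with a 64-bit destination type there are two 32-bit MOV_INDIRECTs per component, so read_size_32bit ends up as read_size - 4.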
4739 */ 4740 const unsigned read_size_32bit = read_size - 4741 (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD); 4742 for (unsigned j = 0; j < instr->num_components; j++) { 4743 for (unsigned i = 0; i < num_mov_indirects; i++) { 4744 bld.emit(SHADER_OPCODE_MOV_INDIRECT, 4745 subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i), 4746 subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i), 4747 indirect, brw_imm_ud(read_size_32bit)); 4748 } 4749 } 4750 } 4751 } 4752 break; 4753 } 4754 4755 case nir_intrinsic_load_ubo: { 4756 fs_reg surf_index; 4757 if (nir_src_is_const(instr->src[0])) { 4758 const unsigned index = nir_src_as_uint(instr->src[0]); 4759 surf_index = brw_imm_ud(index); 4760 } else { 4761 /* The block index is not a constant. Evaluate the index expression 4762 * per-channel and add the base UBO index; we have to select a value 4763 * from any live channel. 4764 */ 4765 surf_index = vgrf(glsl_type::uint_type); 4766 bld.MOV(surf_index, get_nir_src(instr->src[0])); 4767 surf_index = bld.emit_uniformize(surf_index); 4768 } 4769 4770 if (!nir_src_is_const(instr->src[1])) { 4771 fs_reg base_offset = retype(get_nir_src(instr->src[1]), 4772 BRW_REGISTER_TYPE_UD); 4773 4774 for (int i = 0; i < instr->num_components; i++) 4775 VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index, 4776 base_offset, i * type_sz(dest.type), 4777 nir_dest_bit_size(instr->dest) / 8); 4778 4779 prog_data->has_ubo_pull = true; 4780 } else { 4781 /* Even if we are loading doubles, a pull constant load will load 4782 * a 32-bit vec4, so should only reserve vgrf space for that. If we 4783 * need to load a full dvec4 we will have to emit 2 loads. This is 4784 * similar to demote_pull_constants(), except that in that case we 4785 * see individual accesses to each component of the vector and then 4786 * we let CSE deal with duplicate loads. Here we see a vector access 4787 * and we have to split it if necessary. 4788 */ 4789 const unsigned type_size = type_sz(dest.type); 4790 const unsigned load_offset = nir_src_as_uint(instr->src[1]); 4791 4792 /* See if we've selected this as a push constant candidate */ 4793 if (nir_src_is_const(instr->src[0])) { 4794 const unsigned ubo_block = nir_src_as_uint(instr->src[0]); 4795 const unsigned offset_256b = load_offset / 32; 4796 4797 fs_reg push_reg; 4798 for (int i = 0; i < 4; i++) { 4799 const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; 4800 if (range->block == ubo_block && 4801 offset_256b >= range->start && 4802 offset_256b < range->start + range->length) { 4803 4804 push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type); 4805 push_reg.offset = load_offset - 32 * range->start; 4806 break; 4807 } 4808 } 4809 4810 if (push_reg.file != BAD_FILE) { 4811 for (unsigned i = 0; i < instr->num_components; i++) { 4812 bld.MOV(offset(dest, bld, i), 4813 byte_offset(push_reg, i * type_size)); 4814 } 4815 break; 4816 } 4817 } 4818 4819 prog_data->has_ubo_pull = true; 4820 4821 const unsigned block_sz = 64; /* Fetch one cacheline at a time. */ 4822 const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0); 4823 const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD); 4824 4825 for (unsigned c = 0; c < instr->num_components;) { 4826 const unsigned base = load_offset + c * type_size; 4827 /* Number of usable components in the next block-aligned load. 
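 * E.g. with the 64-byte block size, a vec4 of floats at load_offset 56 is split into two loads: the block at offset 0 supplies the two components in bytes 56..63 and the block at offset 64 supplies the remaining two.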
*/ 4828 const unsigned count = MIN2(instr->num_components - c, 4829 (block_sz - base % block_sz) / type_size); 4830 4831 ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 4832 packed_consts, surf_index, 4833 brw_imm_ud(base & ~(block_sz - 1))); 4834 4835 const fs_reg consts = 4836 retype(byte_offset(packed_consts, base & (block_sz - 1)), 4837 dest.type); 4838 4839 for (unsigned d = 0; d < count; d++) 4840 bld.MOV(offset(dest, bld, c + d), component(consts, d)); 4841 4842 c += count; 4843 } 4844 } 4845 break; 4846 } 4847 4848 case nir_intrinsic_load_global: 4849 case nir_intrinsic_load_global_constant: { 4850 assert(devinfo->ver >= 8); 4851 4852 assert(nir_dest_bit_size(instr->dest) <= 32); 4853 assert(nir_intrinsic_align(instr) > 0); 4854 fs_reg srcs[A64_LOGICAL_NUM_SRCS]; 4855 srcs[A64_LOGICAL_ADDRESS] = get_nir_src(instr->src[0]); 4856 srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */ 4857 srcs[A64_LOGICAL_ENABLE_HELPERS] = 4858 brw_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS); 4859 4860 if (nir_dest_bit_size(instr->dest) == 32 && 4861 nir_intrinsic_align(instr) >= 4) { 4862 assert(nir_dest_num_components(instr->dest) <= 4); 4863 4864 srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components); 4865 4866 fs_inst *inst = 4867 bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, dest, 4868 srcs, A64_LOGICAL_NUM_SRCS); 4869 inst->size_written = instr->num_components * 4870 inst->dst.component_size(inst->exec_size); 4871 } else { 4872 const unsigned bit_size = nir_dest_bit_size(instr->dest); 4873 assert(nir_dest_num_components(instr->dest) == 1); 4874 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 4875 4876 srcs[A64_LOGICAL_ARG] = brw_imm_ud(bit_size); 4877 4878 bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, tmp, 4879 srcs, A64_LOGICAL_NUM_SRCS); 4880 bld.MOV(dest, subscript(tmp, dest.type, 0)); 4881 } 4882 break; 4883 } 4884 4885 case nir_intrinsic_store_global: { 4886 assert(devinfo->ver >= 8); 4887 4888 assert(nir_src_bit_size(instr->src[0]) <= 32); 4889 assert(nir_intrinsic_write_mask(instr) == 4890 (1u << instr->num_components) - 1); 4891 assert(nir_intrinsic_align(instr) > 0); 4892 4893 fs_reg srcs[A64_LOGICAL_NUM_SRCS]; 4894 srcs[A64_LOGICAL_ADDRESS] = get_nir_src(instr->src[1]); 4895 srcs[A64_LOGICAL_ENABLE_HELPERS] = 4896 brw_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS); 4897 4898 if (nir_src_bit_size(instr->src[0]) == 32 && 4899 nir_intrinsic_align(instr) >= 4) { 4900 assert(nir_src_num_components(instr->src[0]) <= 4); 4901 4902 srcs[A64_LOGICAL_SRC] = get_nir_src(instr->src[0]); /* Data */ 4903 srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components); 4904 4905 bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, fs_reg(), 4906 srcs, A64_LOGICAL_NUM_SRCS); 4907 } else { 4908 assert(nir_src_num_components(instr->src[0]) == 1); 4909 const unsigned bit_size = nir_src_bit_size(instr->src[0]); 4910 brw_reg_type data_type = 4911 brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 4912 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 4913 bld.MOV(tmp, retype(get_nir_src(instr->src[0]), data_type)); 4914 4915 srcs[A64_LOGICAL_SRC] = tmp; 4916 srcs[A64_LOGICAL_ARG] = brw_imm_ud(nir_src_bit_size(instr->src[0])); 4917 4918 bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, fs_reg(), 4919 srcs, A64_LOGICAL_NUM_SRCS); 4920 } 4921 break; 4922 } 4923 4924 case nir_intrinsic_global_atomic_add: 4925 case nir_intrinsic_global_atomic_imin: 4926 case nir_intrinsic_global_atomic_umin: 4927 case nir_intrinsic_global_atomic_imax: 4928 case 
nir_intrinsic_global_atomic_umax: 4929 case nir_intrinsic_global_atomic_and: 4930 case nir_intrinsic_global_atomic_or: 4931 case nir_intrinsic_global_atomic_xor: 4932 case nir_intrinsic_global_atomic_exchange: 4933 case nir_intrinsic_global_atomic_comp_swap: 4934 nir_emit_global_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr); 4935 break; 4936 case nir_intrinsic_global_atomic_fadd: 4937 case nir_intrinsic_global_atomic_fmin: 4938 case nir_intrinsic_global_atomic_fmax: 4939 case nir_intrinsic_global_atomic_fcomp_swap: 4940 nir_emit_global_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr); 4941 break; 4942 4943 case nir_intrinsic_load_global_const_block_intel: { 4944 assert(nir_dest_bit_size(instr->dest) == 32); 4945 assert(instr->num_components == 8 || instr->num_components == 16); 4946 4947 const fs_builder ubld = bld.exec_all().group(instr->num_components, 0); 4948 fs_reg load_val; 4949 4950 bool is_pred_const = nir_src_is_const(instr->src[1]); 4951 if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) { 4952 /* In this case, we don't want the UBO load at all. We really 4953 * shouldn't get here but it's possible. 4954 */ 4955 load_val = brw_imm_ud(0); 4956 } else { 4957 /* The uniform process may stomp the flag so do this first */ 4958 fs_reg addr = bld.emit_uniformize(get_nir_src(instr->src[0])); 4959 4960 load_val = ubld.vgrf(BRW_REGISTER_TYPE_UD); 4961 4962 /* If the predicate is constant and we got here, then it's non-zero 4963 * and we don't need the predicate at all. 4964 */ 4965 if (!is_pred_const) { 4966 /* Load the predicate */ 4967 fs_reg pred = bld.emit_uniformize(get_nir_src(instr->src[1])); 4968 fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred); 4969 mov->conditional_mod = BRW_CONDITIONAL_NZ; 4970 4971 /* Stomp the destination with 0 if we're OOB */ 4972 mov = ubld.MOV(load_val, brw_imm_ud(0)); 4973 mov->predicate = BRW_PREDICATE_NORMAL; 4974 mov->predicate_inverse = true; 4975 } 4976 4977 fs_reg srcs[A64_LOGICAL_NUM_SRCS]; 4978 srcs[A64_LOGICAL_ADDRESS] = addr; 4979 srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */ 4980 srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components); 4981 /* This intrinsic loads memory from a uniform address, sometimes 4982 * shared across lanes. We never need to mask it. 4983 */ 4984 srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); 4985 4986 fs_inst *load = ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL, 4987 load_val, srcs, A64_LOGICAL_NUM_SRCS); 4988 if (!is_pred_const) 4989 load->predicate = BRW_PREDICATE_NORMAL; 4990 } 4991 4992 /* From the HW perspective, we just did a single SIMD16 instruction 4993 * which loaded a dword in each SIMD channel. From NIR's perspective, 4994 * this instruction returns a vec16. Any users of this data in the 4995 * back-end will expect a vec16 per SIMD channel so we have to emit a 4996 * pile of MOVs to resolve this discrepancy. Fortunately, copy-prop 4997 * will generally clean them up for us. 
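 * (The block read above was emitted exec_all with a group size of num_components, so component i of the result lives in channel i of load_val; each MOV below then broadcasts that single channel across the full dispatch width.)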
4998 */ 4999 for (unsigned i = 0; i < instr->num_components; i++) { 5000 bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD), 5001 component(load_val, i)); 5002 } 5003 break; 5004 } 5005 5006 case nir_intrinsic_load_ssbo: { 5007 assert(devinfo->ver >= 7); 5008 5009 const unsigned bit_size = nir_dest_bit_size(instr->dest); 5010 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5011 srcs[SURFACE_LOGICAL_SRC_SURFACE] = 5012 get_nir_ssbo_intrinsic_index(bld, instr); 5013 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 5014 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 5015 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 5016 5017 /* Make dest unsigned because that's what the temporary will be */ 5018 dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 5019 5020 /* Read the vector */ 5021 assert(nir_dest_bit_size(instr->dest) <= 32); 5022 assert(nir_intrinsic_align(instr) > 0); 5023 if (nir_dest_bit_size(instr->dest) == 32 && 5024 nir_intrinsic_align(instr) >= 4) { 5025 assert(nir_dest_num_components(instr->dest) <= 4); 5026 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 5027 fs_inst *inst = 5028 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 5029 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 5030 inst->size_written = instr->num_components * dispatch_width * 4; 5031 } else { 5032 assert(nir_dest_num_components(instr->dest) == 1); 5033 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 5034 5035 fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); 5036 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, 5037 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); 5038 bld.MOV(dest, subscript(read_result, dest.type, 0)); 5039 } 5040 break; 5041 } 5042 5043 case nir_intrinsic_store_ssbo: { 5044 assert(devinfo->ver >= 7); 5045 5046 const unsigned bit_size = nir_src_bit_size(instr->src[0]); 5047 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5048 srcs[SURFACE_LOGICAL_SRC_SURFACE] = 5049 get_nir_ssbo_intrinsic_index(bld, instr); 5050 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[2]); 5051 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 5052 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 5053 5054 fs_reg data = get_nir_src(instr->src[0]); 5055 data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 5056 5057 assert(nir_src_bit_size(instr->src[0]) <= 32); 5058 assert(nir_intrinsic_write_mask(instr) == 5059 (1u << instr->num_components) - 1); 5060 assert(nir_intrinsic_align(instr) > 0); 5061 if (nir_src_bit_size(instr->src[0]) == 32 && 5062 nir_intrinsic_align(instr) >= 4) { 5063 assert(nir_src_num_components(instr->src[0]) <= 4); 5064 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 5065 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 5066 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 5067 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 5068 } else { 5069 assert(nir_src_num_components(instr->src[0]) == 1); 5070 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 5071 5072 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); 5073 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); 5074 5075 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, 5076 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 5077 } 5078 break; 5079 } 5080 5081 case nir_intrinsic_store_output: { 5082 assert(nir_src_bit_size(instr->src[0]) == 32); 5083 fs_reg src = get_nir_src(instr->src[0]); 5084 5085 unsigned store_offset = nir_src_as_uint(instr->src[1]); 5086 unsigned num_components = 
instr->num_components; 5087 unsigned first_component = nir_intrinsic_component(instr); 5088 5089 fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld, 5090 4 * store_offset), src.type); 5091 for (unsigned j = 0; j < num_components; j++) { 5092 bld.MOV(offset(new_dest, bld, j + first_component), 5093 offset(src, bld, j)); 5094 } 5095 break; 5096 } 5097 5098 case nir_intrinsic_ssbo_atomic_add: 5099 case nir_intrinsic_ssbo_atomic_imin: 5100 case nir_intrinsic_ssbo_atomic_umin: 5101 case nir_intrinsic_ssbo_atomic_imax: 5102 case nir_intrinsic_ssbo_atomic_umax: 5103 case nir_intrinsic_ssbo_atomic_and: 5104 case nir_intrinsic_ssbo_atomic_or: 5105 case nir_intrinsic_ssbo_atomic_xor: 5106 case nir_intrinsic_ssbo_atomic_exchange: 5107 case nir_intrinsic_ssbo_atomic_comp_swap: 5108 nir_emit_ssbo_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr); 5109 break; 5110 case nir_intrinsic_ssbo_atomic_fadd: 5111 case nir_intrinsic_ssbo_atomic_fmin: 5112 case nir_intrinsic_ssbo_atomic_fmax: 5113 case nir_intrinsic_ssbo_atomic_fcomp_swap: 5114 nir_emit_ssbo_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr); 5115 break; 5116 5117 case nir_intrinsic_get_ssbo_size: { 5118 assert(nir_src_num_components(instr->src[0]) == 1); 5119 unsigned ssbo_index = nir_src_is_const(instr->src[0]) ? 5120 nir_src_as_uint(instr->src[0]) : 0; 5121 5122 /* A resinfo's sampler message is used to get the buffer size. The 5123 * SIMD8's writeback message consists of four registers and SIMD16's 5124 * writeback message consists of 8 destination registers (two per each 5125 * component). Because we are only interested on the first channel of 5126 * the first returned component, where resinfo returns the buffer size 5127 * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of 5128 * the dispatch width. 5129 */ 5130 const fs_builder ubld = bld.exec_all().group(8, 0); 5131 fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD); 5132 fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4); 5133 5134 /* Set LOD = 0 */ 5135 ubld.MOV(src_payload, brw_imm_d(0)); 5136 5137 const unsigned index = ssbo_index; 5138 fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload, 5139 src_payload, brw_imm_ud(index)); 5140 inst->header_size = 0; 5141 inst->mlen = 1; 5142 inst->size_written = 4 * REG_SIZE; 5143 5144 /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting: 5145 * 5146 * "Out-of-bounds checking is always performed at a DWord granularity. If 5147 * any part of the DWord is out-of-bounds then the whole DWord is 5148 * considered out-of-bounds." 5149 * 5150 * This implies that types with size smaller than 4-bytes need to be 5151 * padded if they don't complete the last dword of the buffer. But as we 5152 * need to maintain the original size we need to reverse the padding 5153 * calculation to return the correct size to know the number of elements 5154 * of an unsized array. 
Since the padding is stored in the two low bits of the
       * surface size, we recover the original buffer_size here by reversing
       * the surface_size calculation:
       *
       *   surface_size = isl_align(buffer_size, 4) +
       *                  (isl_align(buffer_size, 4) - buffer_size)
       *
       *   buffer_size = (surface_size & ~3) - (surface_size & 3)
       */

      fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD);

      ubld.AND(size_padding, ret_payload, brw_imm_ud(3));
      ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3));
      ubld.ADD(buffer_size, size_aligned4, negate(size_padding));

      bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
      break;
   }

   case nir_intrinsic_load_scratch: {
      assert(devinfo->ver >= 7);

      assert(nir_dest_num_components(instr->dest) == 1);
      const unsigned bit_size = nir_dest_bit_size(instr->dest);
      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];

      if (devinfo->verx10 >= 125) {
         const fs_builder ubld = bld.exec_all().group(1, 0);
         fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0);
         ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
                  brw_imm_ud(~0x3ffu));
         srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle;
      } else if (devinfo->ver >= 8) {
         srcs[SURFACE_LOGICAL_SRC_SURFACE] =
            brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
      } else {
         srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS);
      }

      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
      srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
      srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
      const fs_reg nir_addr = get_nir_src(instr->src[0]);

      /* Make dest unsigned because that's what the temporary will be */
      dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);

      /* Read the vector */
      assert(nir_dest_num_components(instr->dest) == 1);
      assert(nir_dest_bit_size(instr->dest) <= 32);
      assert(nir_intrinsic_align(instr) > 0);
      if (nir_dest_bit_size(instr->dest) == 32 &&
          nir_intrinsic_align(instr) >= 4) {
         if (devinfo->verx10 >= 125) {
            assert(nir_dest_bit_size(instr->dest) == 32 &&
                   nir_intrinsic_align(instr) >= 4);

            srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
               swizzle_nir_scratch_addr(bld, nir_addr, false);
            srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1);

            bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
                     dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
         } else {
            /* The offset for a DWORD scattered message is in dwords.
*/ 5222 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 5223 swizzle_nir_scratch_addr(bld, nir_addr, true); 5224 5225 bld.emit(SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL, 5226 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 5227 } 5228 } else { 5229 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 5230 swizzle_nir_scratch_addr(bld, nir_addr, false); 5231 5232 fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); 5233 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, 5234 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); 5235 bld.MOV(dest, read_result); 5236 } 5237 5238 shader_stats.fill_count += DIV_ROUND_UP(dispatch_width, 16); 5239 break; 5240 } 5241 5242 case nir_intrinsic_store_scratch: { 5243 assert(devinfo->ver >= 7); 5244 5245 assert(nir_src_num_components(instr->src[0]) == 1); 5246 const unsigned bit_size = nir_src_bit_size(instr->src[0]); 5247 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5248 5249 if (devinfo->verx10 >= 125) { 5250 const fs_builder ubld = bld.exec_all().group(1, 0); 5251 fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0); 5252 ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), 5253 brw_imm_ud(~0x3ffu)); 5254 srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle; 5255 } else if (devinfo->ver >= 8) { 5256 srcs[SURFACE_LOGICAL_SRC_SURFACE] = 5257 brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT); 5258 } else { 5259 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS); 5260 } 5261 5262 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 5263 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 5264 /** 5265 * While this instruction has side-effects, it should not be predicated 5266 * on sample mask, because otherwise fs helper invocations would 5267 * load undefined values from scratch memory. And scratch memory 5268 * load-stores are produced from operations without side-effects, thus 5269 * they should not have different behaviour in the helper invocations. 5270 */ 5271 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 5272 const fs_reg nir_addr = get_nir_src(instr->src[1]); 5273 5274 fs_reg data = get_nir_src(instr->src[0]); 5275 data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 5276 5277 assert(nir_src_num_components(instr->src[0]) == 1); 5278 assert(nir_src_bit_size(instr->src[0]) <= 32); 5279 assert(nir_intrinsic_write_mask(instr) == 1); 5280 assert(nir_intrinsic_align(instr) > 0); 5281 if (nir_src_bit_size(instr->src[0]) == 32 && 5282 nir_intrinsic_align(instr) >= 4) { 5283 if (devinfo->verx10 >= 125) { 5284 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 5285 5286 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 5287 swizzle_nir_scratch_addr(bld, nir_addr, false); 5288 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); 5289 5290 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 5291 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 5292 } else { 5293 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 5294 5295 /* The offset for a DWORD scattered message is in dwords. 
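             * That is why the address below is built with the final
             * swizzle_nir_scratch_addr() argument set to true, unlike the
             * paths above and below, which pass false and use byte-based
             * addresses.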
*/ 5296 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 5297 swizzle_nir_scratch_addr(bld, nir_addr, true); 5298 5299 bld.emit(SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL, 5300 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 5301 } 5302 } else { 5303 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); 5304 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); 5305 5306 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 5307 swizzle_nir_scratch_addr(bld, nir_addr, false); 5308 5309 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, 5310 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 5311 } 5312 shader_stats.spill_count += DIV_ROUND_UP(dispatch_width, 16); 5313 break; 5314 } 5315 5316 case nir_intrinsic_load_subgroup_size: 5317 /* This should only happen for fragment shaders because every other case 5318 * is lowered in NIR so we can optimize on it. 5319 */ 5320 assert(stage == MESA_SHADER_FRAGMENT); 5321 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(dispatch_width)); 5322 break; 5323 5324 case nir_intrinsic_load_subgroup_invocation: 5325 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), 5326 nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]); 5327 break; 5328 5329 case nir_intrinsic_load_subgroup_eq_mask: 5330 case nir_intrinsic_load_subgroup_ge_mask: 5331 case nir_intrinsic_load_subgroup_gt_mask: 5332 case nir_intrinsic_load_subgroup_le_mask: 5333 case nir_intrinsic_load_subgroup_lt_mask: 5334 unreachable("not reached"); 5335 5336 case nir_intrinsic_vote_any: { 5337 const fs_builder ubld = bld.exec_all().group(1, 0); 5338 5339 /* The any/all predicates do not consider channel enables. To prevent 5340 * dead channels from affecting the result, we initialize the flag with 5341 * with the identity value for the logical operation. 5342 */ 5343 if (dispatch_width == 32) { 5344 /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */ 5345 ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD), 5346 brw_imm_ud(0)); 5347 } else { 5348 ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0)); 5349 } 5350 bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ); 5351 5352 /* For some reason, the any/all predicates don't work properly with 5353 * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H 5354 * doesn't read the correct subset of the flag register and you end up 5355 * getting garbage in the second half. Work around this by using a pair 5356 * of 1-wide MOVs and scattering the result. 5357 */ 5358 fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D); 5359 ubld.MOV(res1, brw_imm_d(0)); 5360 set_predicate(dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ANY8H : 5361 dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H : 5362 BRW_PREDICATE_ALIGN1_ANY32H, 5363 ubld.MOV(res1, brw_imm_d(-1))); 5364 5365 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); 5366 break; 5367 } 5368 case nir_intrinsic_vote_all: { 5369 const fs_builder ubld = bld.exec_all().group(1, 0); 5370 5371 /* The any/all predicates do not consider channel enables. To prevent 5372 * dead channels from affecting the result, we initialize the flag with 5373 * with the identity value for the logical operation. 5374 */ 5375 if (dispatch_width == 32) { 5376 /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. 
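          * All-ones is the identity for the ALL reduction here: disabled
          * channels keep their flag bits set, so they can never cause the
          * ALL predicate to fail.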
*/ 5377 ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD), 5378 brw_imm_ud(0xffffffff)); 5379 } else { 5380 ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff)); 5381 } 5382 bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ); 5383 5384 /* For some reason, the any/all predicates don't work properly with 5385 * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H 5386 * doesn't read the correct subset of the flag register and you end up 5387 * getting garbage in the second half. Work around this by using a pair 5388 * of 1-wide MOVs and scattering the result. 5389 */ 5390 fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D); 5391 ubld.MOV(res1, brw_imm_d(0)); 5392 set_predicate(dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H : 5393 dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H : 5394 BRW_PREDICATE_ALIGN1_ALL32H, 5395 ubld.MOV(res1, brw_imm_d(-1))); 5396 5397 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); 5398 break; 5399 } 5400 case nir_intrinsic_vote_feq: 5401 case nir_intrinsic_vote_ieq: { 5402 fs_reg value = get_nir_src(instr->src[0]); 5403 if (instr->intrinsic == nir_intrinsic_vote_feq) { 5404 const unsigned bit_size = nir_src_bit_size(instr->src[0]); 5405 value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B : 5406 brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F); 5407 } 5408 5409 fs_reg uniformized = bld.emit_uniformize(value); 5410 const fs_builder ubld = bld.exec_all().group(1, 0); 5411 5412 /* The any/all predicates do not consider channel enables. To prevent 5413 * dead channels from affecting the result, we initialize the flag with 5414 * with the identity value for the logical operation. 5415 */ 5416 if (dispatch_width == 32) { 5417 /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */ 5418 ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD), 5419 brw_imm_ud(0xffffffff)); 5420 } else { 5421 ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff)); 5422 } 5423 bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z); 5424 5425 /* For some reason, the any/all predicates don't work properly with 5426 * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H 5427 * doesn't read the correct subset of the flag register and you end up 5428 * getting garbage in the second half. Work around this by using a pair 5429 * of 1-wide MOVs and scattering the result. 5430 */ 5431 fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D); 5432 ubld.MOV(res1, brw_imm_d(0)); 5433 set_predicate(dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H : 5434 dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H : 5435 BRW_PREDICATE_ALIGN1_ALL32H, 5436 ubld.MOV(res1, brw_imm_d(-1))); 5437 5438 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); 5439 break; 5440 } 5441 5442 case nir_intrinsic_ballot: { 5443 const fs_reg value = retype(get_nir_src(instr->src[0]), 5444 BRW_REGISTER_TYPE_UD); 5445 struct brw_reg flag = brw_flag_reg(0, 0); 5446 /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well 5447 * as f0.0. This is a problem for fragment programs as we currently use 5448 * f0.1 for discards. Fortunately, we don't support SIMD32 fragment 5449 * programs yet so this isn't a problem. When we do, something will 5450 * have to change. 
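       * Note that retyping the flag register to UD below is also what lets
       * the final MOV read back both 16-bit halves of the ballot in SIMD32.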
5451 */ 5452 if (dispatch_width == 32) 5453 flag.type = BRW_REGISTER_TYPE_UD; 5454 5455 bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u)); 5456 bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ); 5457 5458 if (instr->dest.ssa.bit_size > 32) { 5459 dest.type = BRW_REGISTER_TYPE_UQ; 5460 } else { 5461 dest.type = BRW_REGISTER_TYPE_UD; 5462 } 5463 bld.MOV(dest, flag); 5464 break; 5465 } 5466 5467 case nir_intrinsic_read_invocation: { 5468 const fs_reg value = get_nir_src(instr->src[0]); 5469 const fs_reg invocation = get_nir_src(instr->src[1]); 5470 fs_reg tmp = bld.vgrf(value.type); 5471 5472 bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value, 5473 bld.emit_uniformize(invocation)); 5474 5475 bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0))); 5476 break; 5477 } 5478 5479 case nir_intrinsic_read_first_invocation: { 5480 const fs_reg value = get_nir_src(instr->src[0]); 5481 bld.MOV(retype(dest, value.type), bld.emit_uniformize(value)); 5482 break; 5483 } 5484 5485 case nir_intrinsic_shuffle: { 5486 const fs_reg value = get_nir_src(instr->src[0]); 5487 const fs_reg index = get_nir_src(instr->src[1]); 5488 5489 bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index); 5490 break; 5491 } 5492 5493 case nir_intrinsic_first_invocation: { 5494 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 5495 bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp); 5496 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), 5497 fs_reg(component(tmp, 0))); 5498 break; 5499 } 5500 5501 case nir_intrinsic_last_invocation: { 5502 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 5503 bld.exec_all().emit(SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL, tmp); 5504 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), 5505 fs_reg(component(tmp, 0))); 5506 break; 5507 } 5508 5509 case nir_intrinsic_quad_broadcast: { 5510 const fs_reg value = get_nir_src(instr->src[0]); 5511 const unsigned index = nir_src_as_uint(instr->src[1]); 5512 5513 bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type), 5514 value, brw_imm_ud(index), brw_imm_ud(4)); 5515 break; 5516 } 5517 5518 case nir_intrinsic_quad_swap_horizontal: { 5519 const fs_reg value = get_nir_src(instr->src[0]); 5520 const fs_reg tmp = bld.vgrf(value.type); 5521 if (devinfo->ver <= 7) { 5522 /* The hardware doesn't seem to support these crazy regions with 5523 * compressed instructions on gfx7 and earlier so we fall back to 5524 * using quad swizzles. Fortunately, we don't support 64-bit 5525 * anything in Vulkan on gfx7. 
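          * BRW_SWIZZLE4(1,0,3,2) swaps each pair of adjacent channels, so
          * lanes 0123 of a quad become 1032, which is exactly the horizontal
          * swap we need.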
5526 */ 5527 assert(nir_src_bit_size(instr->src[0]) == 32); 5528 const fs_builder ubld = bld.exec_all(); 5529 ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, 5530 brw_imm_ud(BRW_SWIZZLE4(1,0,3,2))); 5531 bld.MOV(retype(dest, value.type), tmp); 5532 } else { 5533 const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0); 5534 5535 const fs_reg src_left = horiz_stride(value, 2); 5536 const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2); 5537 const fs_reg tmp_left = horiz_stride(tmp, 2); 5538 const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2); 5539 5540 ubld.MOV(tmp_left, src_right); 5541 ubld.MOV(tmp_right, src_left); 5542 5543 } 5544 bld.MOV(retype(dest, value.type), tmp); 5545 break; 5546 } 5547 5548 case nir_intrinsic_quad_swap_vertical: { 5549 const fs_reg value = get_nir_src(instr->src[0]); 5550 if (nir_src_bit_size(instr->src[0]) == 32) { 5551 /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */ 5552 const fs_reg tmp = bld.vgrf(value.type); 5553 const fs_builder ubld = bld.exec_all(); 5554 ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, 5555 brw_imm_ud(BRW_SWIZZLE4(2,3,0,1))); 5556 bld.MOV(retype(dest, value.type), tmp); 5557 } else { 5558 /* For larger data types, we have to either emit dispatch_width many 5559 * MOVs or else fall back to doing indirects. 5560 */ 5561 fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); 5562 bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], 5563 brw_imm_w(0x2)); 5564 bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx); 5565 } 5566 break; 5567 } 5568 5569 case nir_intrinsic_quad_swap_diagonal: { 5570 const fs_reg value = get_nir_src(instr->src[0]); 5571 if (nir_src_bit_size(instr->src[0]) == 32) { 5572 /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */ 5573 const fs_reg tmp = bld.vgrf(value.type); 5574 const fs_builder ubld = bld.exec_all(); 5575 ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, 5576 brw_imm_ud(BRW_SWIZZLE4(3,2,1,0))); 5577 bld.MOV(retype(dest, value.type), tmp); 5578 } else { 5579 /* For larger data types, we have to either emit dispatch_width many 5580 * MOVs or else fall back to doing indirects. 5581 */ 5582 fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); 5583 bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], 5584 brw_imm_w(0x3)); 5585 bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx); 5586 } 5587 break; 5588 } 5589 5590 case nir_intrinsic_reduce: { 5591 fs_reg src = get_nir_src(instr->src[0]); 5592 nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr); 5593 unsigned cluster_size = nir_intrinsic_cluster_size(instr); 5594 if (cluster_size == 0 || cluster_size > dispatch_width) 5595 cluster_size = dispatch_width; 5596 5597 /* Figure out the source type */ 5598 src.type = brw_type_for_nir_type(devinfo, 5599 (nir_alu_type)(nir_op_infos[redop].input_types[0] | 5600 nir_src_bit_size(instr->src[0]))); 5601 5602 fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type); 5603 opcode brw_op = brw_op_for_nir_reduction_op(redop); 5604 brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop); 5605 5606 /* Set up a register for all of our scratching around and initialize it 5607 * to reduction operation's identity value. 
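       * (0 for an add, all-ones for an AND, the type's maximum for a min,
       * and so on). SEL_EXEC keeps src in the live channels and substitutes
       * the identity in the dead ones, so disabled channels cannot change
       * the result of the scan.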
5608 */ 5609 fs_reg scan = bld.vgrf(src.type); 5610 bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity); 5611 5612 bld.emit_scan(brw_op, scan, cluster_size, cond_mod); 5613 5614 dest.type = src.type; 5615 if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) { 5616 /* In this case, CLUSTER_BROADCAST instruction isn't needed because 5617 * the distance between clusters is at least 2 GRFs. In this case, 5618 * we don't need the weird striding of the CLUSTER_BROADCAST 5619 * instruction and can just do regular MOVs. 5620 */ 5621 assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0); 5622 const unsigned groups = 5623 (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2); 5624 const unsigned group_size = dispatch_width / groups; 5625 for (unsigned i = 0; i < groups; i++) { 5626 const unsigned cluster = (i * group_size) / cluster_size; 5627 const unsigned comp = cluster * cluster_size + (cluster_size - 1); 5628 bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size), 5629 component(scan, comp)); 5630 } 5631 } else { 5632 bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan, 5633 brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size)); 5634 } 5635 break; 5636 } 5637 5638 case nir_intrinsic_inclusive_scan: 5639 case nir_intrinsic_exclusive_scan: { 5640 fs_reg src = get_nir_src(instr->src[0]); 5641 nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr); 5642 5643 /* Figure out the source type */ 5644 src.type = brw_type_for_nir_type(devinfo, 5645 (nir_alu_type)(nir_op_infos[redop].input_types[0] | 5646 nir_src_bit_size(instr->src[0]))); 5647 5648 fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type); 5649 opcode brw_op = brw_op_for_nir_reduction_op(redop); 5650 brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop); 5651 5652 /* Set up a register for all of our scratching around and initialize it 5653 * to reduction operation's identity value. 5654 */ 5655 fs_reg scan = bld.vgrf(src.type); 5656 const fs_builder allbld = bld.exec_all(); 5657 allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity); 5658 5659 if (instr->intrinsic == nir_intrinsic_exclusive_scan) { 5660 /* Exclusive scan is a bit harder because we have to do an annoying 5661 * shift of the contents before we can begin. To make things worse, 5662 * we can't do this with a normal stride; we have to use indirects. 5663 */ 5664 fs_reg shifted = bld.vgrf(src.type); 5665 fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); 5666 allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], 5667 brw_imm_w(-1)); 5668 allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx); 5669 allbld.group(1, 0).MOV(component(shifted, 0), identity); 5670 scan = shifted; 5671 } 5672 5673 bld.emit_scan(brw_op, scan, dispatch_width, cond_mod); 5674 5675 bld.MOV(retype(dest, src.type), scan); 5676 break; 5677 } 5678 5679 case nir_intrinsic_load_global_block_intel: { 5680 assert(nir_dest_bit_size(instr->dest) == 32); 5681 5682 fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[0])); 5683 5684 const fs_builder ubld1 = bld.exec_all().group(1, 0); 5685 const fs_builder ubld8 = bld.exec_all().group(8, 0); 5686 const fs_builder ubld16 = bld.exec_all().group(16, 0); 5687 5688 const unsigned total = instr->num_components * dispatch_width; 5689 unsigned loaded = 0; 5690 5691 while (loaded < total) { 5692 const unsigned block = 5693 choose_oword_block_size_dwords(total - loaded); 5694 const unsigned block_bytes = block * 4; 5695 5696 const fs_builder &ubld = block == 8 ? 
ubld8 : ubld16; 5697 5698 fs_reg srcs[A64_LOGICAL_NUM_SRCS]; 5699 srcs[A64_LOGICAL_ADDRESS] = address; 5700 srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */ 5701 srcs[A64_LOGICAL_ARG] = brw_imm_ud(block); 5702 srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(1); 5703 ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, 5704 retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD), 5705 srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes; 5706 5707 increment_a64_address(ubld1, address, block_bytes); 5708 loaded += block; 5709 } 5710 5711 assert(loaded == total); 5712 break; 5713 } 5714 5715 case nir_intrinsic_store_global_block_intel: { 5716 assert(nir_src_bit_size(instr->src[0]) == 32); 5717 5718 fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[1])); 5719 fs_reg src = get_nir_src(instr->src[0]); 5720 5721 const fs_builder ubld1 = bld.exec_all().group(1, 0); 5722 const fs_builder ubld8 = bld.exec_all().group(8, 0); 5723 const fs_builder ubld16 = bld.exec_all().group(16, 0); 5724 5725 const unsigned total = instr->num_components * dispatch_width; 5726 unsigned written = 0; 5727 5728 while (written < total) { 5729 const unsigned block = 5730 choose_oword_block_size_dwords(total - written); 5731 5732 fs_reg srcs[A64_LOGICAL_NUM_SRCS]; 5733 srcs[A64_LOGICAL_ADDRESS] = address; 5734 srcs[A64_LOGICAL_SRC] = retype(byte_offset(src, written * 4), 5735 BRW_REGISTER_TYPE_UD); 5736 srcs[A64_LOGICAL_ARG] = brw_imm_ud(block); 5737 srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); 5738 5739 const fs_builder &ubld = block == 8 ? ubld8 : ubld16; 5740 ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL, fs_reg(), 5741 srcs, A64_LOGICAL_NUM_SRCS); 5742 5743 const unsigned block_bytes = block * 4; 5744 increment_a64_address(ubld1, address, block_bytes); 5745 written += block; 5746 } 5747 5748 assert(written == total); 5749 break; 5750 } 5751 5752 case nir_intrinsic_load_shared_block_intel: 5753 case nir_intrinsic_load_ssbo_block_intel: { 5754 assert(nir_dest_bit_size(instr->dest) == 32); 5755 5756 const bool is_ssbo = 5757 instr->intrinsic == nir_intrinsic_load_ssbo_block_intel; 5758 fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[is_ssbo ? 1 : 0])); 5759 5760 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5761 srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ? 5762 get_nir_ssbo_intrinsic_index(bld, instr) : fs_reg(brw_imm_ud(GFX7_BTI_SLM)); 5763 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address; 5764 5765 const fs_builder ubld1 = bld.exec_all().group(1, 0); 5766 const fs_builder ubld8 = bld.exec_all().group(8, 0); 5767 const fs_builder ubld16 = bld.exec_all().group(16, 0); 5768 5769 const unsigned total = instr->num_components * dispatch_width; 5770 unsigned loaded = 0; 5771 5772 while (loaded < total) { 5773 const unsigned block = 5774 choose_oword_block_size_dwords(total - loaded); 5775 const unsigned block_bytes = block * 4; 5776 5777 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block); 5778 5779 const fs_builder &ubld = block == 8 ? 
ubld8 : ubld16; 5780 ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, 5781 retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD), 5782 srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = block_bytes; 5783 5784 ubld1.ADD(address, address, brw_imm_ud(block_bytes)); 5785 loaded += block; 5786 } 5787 5788 assert(loaded == total); 5789 break; 5790 } 5791 5792 case nir_intrinsic_store_shared_block_intel: 5793 case nir_intrinsic_store_ssbo_block_intel: { 5794 assert(nir_src_bit_size(instr->src[0]) == 32); 5795 5796 const bool is_ssbo = 5797 instr->intrinsic == nir_intrinsic_store_ssbo_block_intel; 5798 5799 fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[is_ssbo ? 2 : 1])); 5800 fs_reg src = get_nir_src(instr->src[0]); 5801 5802 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5803 srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ? 5804 get_nir_ssbo_intrinsic_index(bld, instr) : fs_reg(brw_imm_ud(GFX7_BTI_SLM)); 5805 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address; 5806 5807 const fs_builder ubld1 = bld.exec_all().group(1, 0); 5808 const fs_builder ubld8 = bld.exec_all().group(8, 0); 5809 const fs_builder ubld16 = bld.exec_all().group(16, 0); 5810 5811 const unsigned total = instr->num_components * dispatch_width; 5812 unsigned written = 0; 5813 5814 while (written < total) { 5815 const unsigned block = 5816 choose_oword_block_size_dwords(total - written); 5817 5818 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block); 5819 srcs[SURFACE_LOGICAL_SRC_DATA] = 5820 retype(byte_offset(src, written * 4), BRW_REGISTER_TYPE_UD); 5821 5822 const fs_builder &ubld = block == 8 ? ubld8 : ubld16; 5823 ubld.emit(SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL, 5824 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 5825 5826 const unsigned block_bytes = block * 4; 5827 ubld1.ADD(address, address, brw_imm_ud(block_bytes)); 5828 written += block; 5829 } 5830 5831 assert(written == total); 5832 break; 5833 } 5834 5835 case nir_intrinsic_load_topology_id_intel: { 5836 /* These move around basically every hardware generation, so don' 5837 * do any >= checks and fail if the platform hasn't explicitly 5838 * been enabled here. 5839 */ 5840 assert(devinfo->ver == 12); 5841 5842 /* Here is what the layout of SR0 looks like on Gfx12 : 5843 * [13:11] : Slice ID. 
5844 * [10:9] : Dual-SubSlice ID 5845 * [8] : SubSlice ID 5846 * [7] : EUID[2] (aka EU Row ID) 5847 * [6] : Reserved 5848 * [5:4] : EUID[1:0] 5849 * [2:0] : Thread ID 5850 */ 5851 fs_reg raw_id = bld.vgrf(BRW_REGISTER_TYPE_UD); 5852 bld.emit(SHADER_OPCODE_READ_SR_REG, raw_id, brw_imm_ud(0)); 5853 switch (nir_intrinsic_base(instr)) { 5854 case BRW_TOPOLOGY_ID_DSS: 5855 bld.AND(raw_id, raw_id, brw_imm_ud(0x3fff)); 5856 /* Get rid of anything below dualsubslice */ 5857 bld.SHR(retype(dest, BRW_REGISTER_TYPE_UD), raw_id, brw_imm_ud(9)); 5858 break; 5859 case BRW_TOPOLOGY_ID_EU_THREAD_SIMD: { 5860 limit_dispatch_width(16, "Topology helper for Ray queries, " 5861 "not supported in SIMD32 mode."); 5862 fs_reg dst = retype(dest, BRW_REGISTER_TYPE_UD); 5863 5864 /* EU[3:0] << 7 5865 * 5866 * The 4bit EU[3:0] we need to build for ray query memory addresses 5867 * computations is a bit odd : 5868 * 5869 * EU[1:0] = raw_id[5:4] (identified as EUID[1:0]) 5870 * EU[2] = raw_id[8] (identified as SubSlice ID) 5871 * EU[3] = raw_id[7] (identified as EUID[2] or Row ID) 5872 */ 5873 { 5874 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 5875 bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(7, 7))); 5876 bld.SHL(dst, tmp, brw_imm_ud(3)); 5877 bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(8, 8))); 5878 bld.SHL(tmp, tmp, brw_imm_ud(1)); 5879 bld.OR(dst, dst, tmp); 5880 bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(5, 4))); 5881 bld.SHL(tmp, tmp, brw_imm_ud(3)); 5882 bld.OR(dst, dst, tmp); 5883 } 5884 5885 /* ThreadID[2:0] << 4 (ThreadID comes from raw_id[2:0]) */ 5886 { 5887 bld.AND(raw_id, raw_id, brw_imm_ud(INTEL_MASK(2, 0))); 5888 bld.SHL(raw_id, raw_id, brw_imm_ud(4)); 5889 bld.OR(dst, dst, raw_id); 5890 } 5891 5892 /* LaneID[0:3] << 0 (We build up LaneID by putting the right number 5893 * in each lane) 5894 */ 5895 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW); 5896 const fs_builder ubld8 = bld.exec_all().group(8, 0); 5897 ubld8.MOV(quarter(tmp, 0), brw_imm_v(0x76543210)); 5898 if (bld.dispatch_width() == 16) { 5899 /* Sets 0xfedcba98 to the upper part of the register. */ 5900 ubld8.ADD(quarter(tmp, 1), quarter(tmp, 0), brw_imm_ud(8)); 5901 } 5902 bld.ADD(dst, dst, tmp); 5903 break; 5904 } 5905 default: 5906 unreachable("Invalid topology id type"); 5907 } 5908 break; 5909 } 5910 5911 case nir_intrinsic_load_btd_stack_id_intel: 5912 if (stage == MESA_SHADER_COMPUTE) { 5913 assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids); 5914 } else { 5915 assert(brw_shader_stage_is_bindless(stage)); 5916 } 5917 /* Stack IDs are always in R1 regardless of whether we're coming from a 5918 * bindless shader or a regular compute shader. 5919 */ 5920 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), 5921 retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW)); 5922 break; 5923 5924 case nir_intrinsic_btd_spawn_intel: 5925 if (stage == MESA_SHADER_COMPUTE) { 5926 assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids); 5927 } else { 5928 assert(brw_shader_stage_is_bindless(stage)); 5929 } 5930 /* Make sure all the pointers to resume shaders have landed where other 5931 * threads can see them. 
5932 */ 5933 emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE); 5934 5935 bld.emit(SHADER_OPCODE_BTD_SPAWN_LOGICAL, bld.null_reg_ud(), 5936 bld.emit_uniformize(get_nir_src(instr->src[0])), 5937 get_nir_src(instr->src[1])); 5938 break; 5939 5940 case nir_intrinsic_btd_retire_intel: 5941 if (stage == MESA_SHADER_COMPUTE) { 5942 assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids); 5943 } else { 5944 assert(brw_shader_stage_is_bindless(stage)); 5945 } 5946 /* Make sure all the pointers to resume shaders have landed where other 5947 * threads can see them. 5948 */ 5949 emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE); 5950 bld.emit(SHADER_OPCODE_BTD_RETIRE_LOGICAL); 5951 break; 5952 5953 case nir_intrinsic_trace_ray_intel: { 5954 const bool synchronous = nir_intrinsic_synchronous(instr); 5955 assert(brw_shader_stage_is_bindless(stage) || synchronous); 5956 5957 /* Make sure all the previous RT structure writes are visible to the RT 5958 * fixed function within the DSS, as well as stack pointers to resume 5959 * shaders. 5960 */ 5961 emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE); 5962 5963 fs_reg srcs[RT_LOGICAL_NUM_SRCS]; 5964 5965 fs_reg globals = get_nir_src(instr->src[0]); 5966 srcs[RT_LOGICAL_SRC_GLOBALS] = bld.emit_uniformize(globals); 5967 srcs[RT_LOGICAL_SRC_BVH_LEVEL] = get_nir_src(instr->src[1]); 5968 srcs[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] = get_nir_src(instr->src[2]); 5969 srcs[RT_LOGICAL_SRC_SYNCHRONOUS] = brw_imm_ud(synchronous); 5970 bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL, bld.null_reg_ud(), 5971 srcs, RT_LOGICAL_NUM_SRCS); 5972 5973 /* There is no actual value to use in the destination register of the 5974 * synchronous trace instruction. All of the communication with the HW 5975 * unit happens through memory reads/writes. So to ensure that the 5976 * operation has completed before we go read the results in memory, we 5977 * need a barrier followed by an invalidate before accessing memory. 5978 */ 5979 if (synchronous) { 5980 bld.emit(BRW_OPCODE_SYNC, bld.null_reg_ud(), brw_imm_ud(TGL_SYNC_ALLWR)); 5981 emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_INVALIDATE); 5982 } 5983 break; 5984 } 5985 5986 default: 5987#ifndef NDEBUG 5988 assert(instr->intrinsic < nir_num_intrinsics); 5989 fprintf(stderr, "intrinsic: %s\n", nir_intrinsic_infos[instr->intrinsic].name); 5990#endif 5991 unreachable("unknown intrinsic"); 5992 } 5993} 5994 5995void 5996fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld, 5997 int op, nir_intrinsic_instr *instr) 5998{ 5999 /* The BTI untyped atomic messages only support 32-bit atomics. If you 6000 * just look at the big table of messages in the Vol 7 of the SKL PRM, they 6001 * appear to exist. However, if you look at Vol 2a, there are no message 6002 * descriptors provided for Qword atomic ops except for A64 messages. 
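    * Platforms with LSC do provide 64-bit untyped atomics, which is why the
    * assertion below also accepts 64-bit destinations when devinfo->has_lsc
    * is set.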
6003 */ 6004 assert(nir_dest_bit_size(instr->dest) == 32 || 6005 (nir_dest_bit_size(instr->dest) == 64 && devinfo->has_lsc)); 6006 6007 fs_reg dest; 6008 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 6009 dest = get_nir_dest(instr->dest); 6010 6011 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 6012 srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr); 6013 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 6014 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 6015 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 6016 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 6017 6018 fs_reg data; 6019 if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) 6020 data = get_nir_src(instr->src[2]); 6021 6022 if (op == BRW_AOP_CMPWR) { 6023 fs_reg tmp = bld.vgrf(data.type, 2); 6024 fs_reg sources[2] = { data, get_nir_src(instr->src[3]) }; 6025 bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 6026 data = tmp; 6027 } 6028 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 6029 6030 /* Emit the actual atomic operation */ 6031 6032 bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, 6033 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 6034} 6035 6036void 6037fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld, 6038 int op, nir_intrinsic_instr *instr) 6039{ 6040 fs_reg dest; 6041 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 6042 dest = get_nir_dest(instr->dest); 6043 6044 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 6045 srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr); 6046 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 6047 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 6048 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 6049 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 6050 6051 fs_reg data = get_nir_src(instr->src[2]); 6052 if (op == BRW_AOP_FCMPWR) { 6053 fs_reg tmp = bld.vgrf(data.type, 2); 6054 fs_reg sources[2] = { data, get_nir_src(instr->src[3]) }; 6055 bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 6056 data = tmp; 6057 } 6058 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 6059 6060 /* Emit the actual atomic operation */ 6061 6062 bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL, 6063 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 6064} 6065 6066void 6067fs_visitor::nir_emit_shared_atomic(const fs_builder &bld, 6068 int op, nir_intrinsic_instr *instr) 6069{ 6070 fs_reg dest; 6071 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 6072 dest = get_nir_dest(instr->dest); 6073 6074 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 6075 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM); 6076 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 6077 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 6078 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 6079 6080 fs_reg data; 6081 if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) 6082 data = get_nir_src(instr->src[1]); 6083 if (op == BRW_AOP_CMPWR) { 6084 fs_reg tmp = bld.vgrf(data.type, 2); 6085 fs_reg sources[2] = { data, get_nir_src(instr->src[2]) }; 6086 bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 6087 data = tmp; 6088 } 6089 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 6090 6091 /* Get the offset */ 6092 if (nir_src_is_const(instr->src[0])) { 6093 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 6094 brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0])); 6095 } else { 6096 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type); 6097 bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS], 6098 retype(get_nir_src(instr->src[0]), 
BRW_REGISTER_TYPE_UD), 6099 brw_imm_ud(instr->const_index[0])); 6100 } 6101 6102 /* Emit the actual atomic operation operation */ 6103 6104 bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, 6105 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 6106} 6107 6108void 6109fs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld, 6110 int op, nir_intrinsic_instr *instr) 6111{ 6112 fs_reg dest; 6113 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 6114 dest = get_nir_dest(instr->dest); 6115 6116 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 6117 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM); 6118 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 6119 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 6120 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 6121 6122 fs_reg data = get_nir_src(instr->src[1]); 6123 if (op == BRW_AOP_FCMPWR) { 6124 fs_reg tmp = bld.vgrf(data.type, 2); 6125 fs_reg sources[2] = { data, get_nir_src(instr->src[2]) }; 6126 bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 6127 data = tmp; 6128 } 6129 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 6130 6131 /* Get the offset */ 6132 if (nir_src_is_const(instr->src[0])) { 6133 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 6134 brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0])); 6135 } else { 6136 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type); 6137 bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS], 6138 retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD), 6139 brw_imm_ud(instr->const_index[0])); 6140 } 6141 6142 /* Emit the actual atomic operation operation */ 6143 6144 bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL, 6145 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 6146} 6147 6148static fs_reg 6149expand_to_32bit(const fs_builder &bld, const fs_reg &src) 6150{ 6151 if (type_sz(src.type) == 2) { 6152 fs_reg src32 = bld.vgrf(BRW_REGISTER_TYPE_UD); 6153 bld.MOV(src32, retype(src, BRW_REGISTER_TYPE_UW)); 6154 return src32; 6155 } else { 6156 return src; 6157 } 6158} 6159 6160void 6161fs_visitor::nir_emit_global_atomic(const fs_builder &bld, 6162 int op, nir_intrinsic_instr *instr) 6163{ 6164 fs_reg dest; 6165 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 6166 dest = get_nir_dest(instr->dest); 6167 6168 fs_reg addr = get_nir_src(instr->src[0]); 6169 6170 fs_reg data; 6171 if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) 6172 data = expand_to_32bit(bld, get_nir_src(instr->src[1])); 6173 6174 if (op == BRW_AOP_CMPWR) { 6175 fs_reg tmp = bld.vgrf(data.type, 2); 6176 fs_reg sources[2] = { 6177 data, 6178 expand_to_32bit(bld, get_nir_src(instr->src[2])) 6179 }; 6180 bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 6181 data = tmp; 6182 } 6183 6184 fs_reg srcs[A64_LOGICAL_NUM_SRCS]; 6185 srcs[A64_LOGICAL_ADDRESS] = addr; 6186 srcs[A64_LOGICAL_SRC] = data; 6187 srcs[A64_LOGICAL_ARG] = brw_imm_ud(op); 6188 srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); 6189 6190 switch (nir_dest_bit_size(instr->dest)) { 6191 case 16: { 6192 fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD); 6193 bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL, dest32, 6194 srcs, A64_LOGICAL_NUM_SRCS); 6195 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW), dest32); 6196 break; 6197 } 6198 case 32: 6199 bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest, 6200 srcs, A64_LOGICAL_NUM_SRCS); 6201 break; 6202 case 64: 6203 bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL, dest, 6204 srcs, A64_LOGICAL_NUM_SRCS); 6205 break; 6206 default: 6207 unreachable("Unsupported bit size"); 6208 } 6209} 6210 6211void 
6212fs_visitor::nir_emit_global_atomic_float(const fs_builder &bld, 6213 int op, nir_intrinsic_instr *instr) 6214{ 6215 assert(nir_intrinsic_infos[instr->intrinsic].has_dest); 6216 fs_reg dest = get_nir_dest(instr->dest); 6217 6218 fs_reg addr = get_nir_src(instr->src[0]); 6219 6220 assert(op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC); 6221 fs_reg data = expand_to_32bit(bld, get_nir_src(instr->src[1])); 6222 6223 if (op == BRW_AOP_FCMPWR) { 6224 fs_reg tmp = bld.vgrf(data.type, 2); 6225 fs_reg sources[2] = { 6226 data, 6227 expand_to_32bit(bld, get_nir_src(instr->src[2])) 6228 }; 6229 bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 6230 data = tmp; 6231 } 6232 6233 fs_reg srcs[A64_LOGICAL_NUM_SRCS]; 6234 srcs[A64_LOGICAL_ADDRESS] = addr; 6235 srcs[A64_LOGICAL_SRC] = data; 6236 srcs[A64_LOGICAL_ARG] = brw_imm_ud(op); 6237 srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); 6238 6239 switch (nir_dest_bit_size(instr->dest)) { 6240 case 16: { 6241 fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD); 6242 bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL, dest32, 6243 srcs, A64_LOGICAL_NUM_SRCS); 6244 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW), dest32); 6245 break; 6246 } 6247 case 32: 6248 bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL, dest, 6249 srcs, A64_LOGICAL_NUM_SRCS); 6250 break; 6251 case 64: 6252 bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL, dest, 6253 srcs, A64_LOGICAL_NUM_SRCS); 6254 break; 6255 default: 6256 unreachable("Unsupported bit size"); 6257 } 6258} 6259 6260void 6261fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) 6262{ 6263 unsigned texture = instr->texture_index; 6264 unsigned sampler = instr->sampler_index; 6265 6266 fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; 6267 6268 srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture); 6269 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler); 6270 6271 int lod_components = 0; 6272 6273 /* The hardware requires a LOD for buffer textures */ 6274 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) 6275 srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0); 6276 6277 uint32_t header_bits = 0; 6278 for (unsigned i = 0; i < instr->num_srcs; i++) { 6279 fs_reg src = get_nir_src(instr->src[i].src); 6280 switch (instr->src[i].src_type) { 6281 case nir_tex_src_bias: 6282 srcs[TEX_LOGICAL_SRC_LOD] = 6283 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F); 6284 break; 6285 case nir_tex_src_comparator: 6286 srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F); 6287 break; 6288 case nir_tex_src_coord: 6289 switch (instr->op) { 6290 case nir_texop_txf: 6291 case nir_texop_txf_ms: 6292 case nir_texop_txf_ms_mcs_intel: 6293 case nir_texop_samples_identical: 6294 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D); 6295 break; 6296 default: 6297 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F); 6298 break; 6299 } 6300 6301 /* Wa_14013363432: 6302 * 6303 * Compiler should send U,V,R parameters even if V,R are 0. 
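          * Here we only assert that the cube coordinate already carries all
          * three components on verx10 == 125; nothing is padded at this
          * point.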
6304 */ 6305 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && devinfo->verx10 == 125) 6306 assert(instr->coord_components >= 3u); 6307 break; 6308 case nir_tex_src_ddx: 6309 srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F); 6310 lod_components = nir_tex_instr_src_size(instr, i); 6311 break; 6312 case nir_tex_src_ddy: 6313 srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F); 6314 break; 6315 case nir_tex_src_lod: 6316 switch (instr->op) { 6317 case nir_texop_txs: 6318 srcs[TEX_LOGICAL_SRC_LOD] = 6319 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD); 6320 break; 6321 case nir_texop_txf: 6322 srcs[TEX_LOGICAL_SRC_LOD] = 6323 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D); 6324 break; 6325 default: 6326 srcs[TEX_LOGICAL_SRC_LOD] = 6327 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F); 6328 break; 6329 } 6330 break; 6331 case nir_tex_src_min_lod: 6332 srcs[TEX_LOGICAL_SRC_MIN_LOD] = 6333 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F); 6334 break; 6335 case nir_tex_src_ms_index: 6336 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD); 6337 break; 6338 6339 case nir_tex_src_offset: { 6340 uint32_t offset_bits = 0; 6341 if (brw_texture_offset(instr, i, &offset_bits)) { 6342 header_bits |= offset_bits; 6343 } else { 6344 /* On gfx12.5+, if the offsets are not both constant and in the 6345 * {-8,7} range, nir_lower_tex() will have already lowered the 6346 * source offset. So we should never reach this point. 6347 */ 6348 assert(devinfo->verx10 < 125); 6349 srcs[TEX_LOGICAL_SRC_TG4_OFFSET] = 6350 retype(src, BRW_REGISTER_TYPE_D); 6351 } 6352 break; 6353 } 6354 6355 case nir_tex_src_projector: 6356 unreachable("should be lowered"); 6357 6358 case nir_tex_src_texture_offset: { 6359 /* Emit code to evaluate the actual indexing expression */ 6360 fs_reg tmp = vgrf(glsl_type::uint_type); 6361 bld.ADD(tmp, src, brw_imm_ud(texture)); 6362 srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp); 6363 break; 6364 } 6365 6366 case nir_tex_src_sampler_offset: { 6367 /* Emit code to evaluate the actual indexing expression */ 6368 fs_reg tmp = vgrf(glsl_type::uint_type); 6369 bld.ADD(tmp, src, brw_imm_ud(sampler)); 6370 srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp); 6371 break; 6372 } 6373 6374 case nir_tex_src_texture_handle: 6375 assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1); 6376 srcs[TEX_LOGICAL_SRC_SURFACE] = fs_reg(); 6377 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src); 6378 break; 6379 6380 case nir_tex_src_sampler_handle: 6381 assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1); 6382 srcs[TEX_LOGICAL_SRC_SAMPLER] = fs_reg(); 6383 srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src); 6384 break; 6385 6386 case nir_tex_src_ms_mcs_intel: 6387 assert(instr->op == nir_texop_txf_ms); 6388 srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D); 6389 break; 6390 6391 default: 6392 unreachable("unknown texture source"); 6393 } 6394 } 6395 6396 if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE && 6397 (instr->op == nir_texop_txf_ms || 6398 instr->op == nir_texop_samples_identical)) { 6399 if (devinfo->ver >= 7 && 6400 key_tex->compressed_multisample_layout_mask & (1 << texture)) { 6401 srcs[TEX_LOGICAL_SRC_MCS] = 6402 emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE], 6403 instr->coord_components, 6404 srcs[TEX_LOGICAL_SRC_SURFACE], 6405 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]); 6406 } else { 6407 srcs[TEX_LOGICAL_SRC_MCS] 
= brw_imm_ud(0u); 6408 } 6409 } 6410 6411 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components); 6412 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components); 6413 6414 enum opcode opcode; 6415 switch (instr->op) { 6416 case nir_texop_tex: 6417 opcode = SHADER_OPCODE_TEX_LOGICAL; 6418 break; 6419 case nir_texop_txb: 6420 opcode = FS_OPCODE_TXB_LOGICAL; 6421 break; 6422 case nir_texop_txl: 6423 opcode = SHADER_OPCODE_TXL_LOGICAL; 6424 break; 6425 case nir_texop_txd: 6426 opcode = SHADER_OPCODE_TXD_LOGICAL; 6427 break; 6428 case nir_texop_txf: 6429 opcode = SHADER_OPCODE_TXF_LOGICAL; 6430 break; 6431 case nir_texop_txf_ms: 6432 /* On Gfx12HP there is only CMS_W available. From the Bspec: Shared 6433 * Functions - 3D Sampler - Messages - Message Format: 6434 * 6435 * ld2dms REMOVEDBY(GEN:HAS:1406788836) 6436 */ 6437 if (devinfo->verx10 >= 125) 6438 opcode = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL; 6439 else if ((key_tex->msaa_16 & (1 << sampler))) 6440 opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL; 6441 else 6442 opcode = SHADER_OPCODE_TXF_CMS_LOGICAL; 6443 break; 6444 case nir_texop_txf_ms_mcs_intel: 6445 opcode = SHADER_OPCODE_TXF_MCS_LOGICAL; 6446 break; 6447 case nir_texop_query_levels: 6448 case nir_texop_txs: 6449 opcode = SHADER_OPCODE_TXS_LOGICAL; 6450 break; 6451 case nir_texop_lod: 6452 opcode = SHADER_OPCODE_LOD_LOGICAL; 6453 break; 6454 case nir_texop_tg4: 6455 if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE) 6456 opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL; 6457 else 6458 opcode = SHADER_OPCODE_TG4_LOGICAL; 6459 break; 6460 case nir_texop_texture_samples: 6461 opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL; 6462 break; 6463 case nir_texop_samples_identical: { 6464 fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D); 6465 6466 /* If mcs is an immediate value, it means there is no MCS. In that case 6467 * just return false. 6468 */ 6469 if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) { 6470 bld.MOV(dst, brw_imm_ud(0u)); 6471 } else if ((key_tex->msaa_16 & (1 << sampler))) { 6472 fs_reg tmp = vgrf(glsl_type::uint_type); 6473 bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS], 6474 offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1)); 6475 bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ); 6476 } else { 6477 bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u), 6478 BRW_CONDITIONAL_EQ); 6479 } 6480 return; 6481 } 6482 default: 6483 unreachable("unknown texture opcode"); 6484 } 6485 6486 if (instr->op == nir_texop_tg4) { 6487 if (instr->component == 1 && 6488 key_tex->gather_channel_quirk_mask & (1 << texture)) { 6489 /* gather4 sampler is broken for green channel on RG32F -- 6490 * we must ask for blue instead. 6491 */ 6492 header_bits |= 2 << 16; 6493 } else { 6494 header_bits |= instr->component << 16; 6495 } 6496 } 6497 6498 fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4); 6499 fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs)); 6500 inst->offset = header_bits; 6501 6502 const unsigned dest_size = nir_tex_instr_dest_size(instr); 6503 if (devinfo->ver >= 9 && 6504 instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) { 6505 unsigned write_mask = instr->dest.is_ssa ? 
6506 nir_ssa_def_components_read(&instr->dest.ssa): 6507 (1 << dest_size) - 1; 6508 assert(write_mask != 0); /* dead code should have been eliminated */ 6509 inst->size_written = util_last_bit(write_mask) * 6510 inst->dst.component_size(inst->exec_size); 6511 } else { 6512 inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 6513 } 6514 6515 if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE) 6516 inst->shadow_compare = true; 6517 6518 fs_reg nir_dest[5]; 6519 for (unsigned i = 0; i < dest_size; i++) 6520 nir_dest[i] = offset(dst, bld, i); 6521 6522 if (instr->op == nir_texop_query_levels) { 6523 /* # levels is in .w */ 6524 if (devinfo->ver <= 9) { 6525 /** 6526 * Wa_1940217: 6527 * 6528 * When a surface of type SURFTYPE_NULL is accessed by resinfo, the 6529 * MIPCount returned is undefined instead of 0. 6530 */ 6531 fs_inst *mov = bld.MOV(bld.null_reg_d(), dst); 6532 mov->conditional_mod = BRW_CONDITIONAL_NZ; 6533 nir_dest[0] = bld.vgrf(BRW_REGISTER_TYPE_D); 6534 fs_inst *sel = bld.SEL(nir_dest[0], offset(dst, bld, 3), brw_imm_d(0)); 6535 sel->predicate = BRW_PREDICATE_NORMAL; 6536 } else { 6537 nir_dest[0] = offset(dst, bld, 3); 6538 } 6539 } else if (instr->op == nir_texop_txs && 6540 dest_size >= 3 && devinfo->ver < 7) { 6541 /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */ 6542 fs_reg depth = offset(dst, bld, 2); 6543 nir_dest[2] = vgrf(glsl_type::int_type); 6544 bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE); 6545 } 6546 6547 bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0); 6548} 6549 6550void 6551fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr) 6552{ 6553 switch (instr->type) { 6554 case nir_jump_break: 6555 bld.emit(BRW_OPCODE_BREAK); 6556 break; 6557 case nir_jump_continue: 6558 bld.emit(BRW_OPCODE_CONTINUE); 6559 break; 6560 case nir_jump_halt: 6561 bld.emit(BRW_OPCODE_HALT); 6562 break; 6563 case nir_jump_return: 6564 default: 6565 unreachable("unknown jump"); 6566 } 6567} 6568 6569/* 6570 * This helper takes a source register and un/shuffles it into the destination 6571 * register. 6572 * 6573 * If source type size is smaller than destination type size the operation 6574 * needed is a component shuffle. The opposite case would be an unshuffle. If 6575 * source/destination type size is equal a shuffle is done that would be 6576 * equivalent to a simple MOV. 6577 * 6578 * For example, if source is a 16-bit type and destination is 32-bit. A 3 6579 * components .xyz 16-bit vector on SIMD8 would be. 6580 * 6581 * |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8| 6582 * |z1|z2|z3|z4|z5|z6|z7|z8| | | | | | | | | 6583 * 6584 * This helper will return the following 2 32-bit components with the 16-bit 6585 * values shuffled: 6586 * 6587 * |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8| 6588 * |z1 |z2 |z3 |z4 |z5 |z6 |z7 |z8 | 6589 * 6590 * For unshuffle, the example would be the opposite, a 64-bit type source 6591 * and a 32-bit destination. 
 * A 2-component .xy 64-bit vector on SIMD8 would be:
 *
 *    | x1l x1h | x2l x2h | x3l x3h | x4l x4h |
 *    | x5l x5h | x6l x6h | x7l x7h | x8l x8h |
 *    | y1l y1h | y2l y2h | y3l y3h | y4l y4h |
 *    | y5l y5h | y6l y6h | y7l y7h | y8l y8h |
 *
 * The returned result would be the following 4 32-bit components unshuffled:
 *
 *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
 *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
 *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
 *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
 *
 * - The source and destination registers must not overlap.
 * - Component units are measured in terms of the smaller type between
 *   source and destination, because we are un/shuffling the smaller
 *   components from/into the bigger ones.
 * - The first_component parameter allows skipping source components.
 */
void
shuffle_src_to_dst(const fs_builder &bld,
                   const fs_reg &dst,
                   const fs_reg &src,
                   uint32_t first_component,
                   uint32_t components)
{
   if (type_sz(src.type) == type_sz(dst.type)) {
      assert(!regions_overlap(dst,
         type_sz(dst.type) * bld.dispatch_width() * components,
         offset(src, bld, first_component),
         type_sz(src.type) * bld.dispatch_width() * components));
      for (unsigned i = 0; i < components; i++) {
         bld.MOV(retype(offset(dst, bld, i), src.type),
                 offset(src, bld, i + first_component));
      }
   } else if (type_sz(src.type) < type_sz(dst.type)) {
      /* Source is shuffled into destination */
      unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
      assert(!regions_overlap(dst,
         type_sz(dst.type) * bld.dispatch_width() *
         DIV_ROUND_UP(components, size_ratio),
         offset(src, bld, first_component),
         type_sz(src.type) * bld.dispatch_width() * components));

      brw_reg_type shuffle_type =
         brw_reg_type_from_bit_size(8 * type_sz(src.type),
                                    BRW_REGISTER_TYPE_D);
      for (unsigned i = 0; i < components; i++) {
         fs_reg shuffle_component_i =
            subscript(offset(dst, bld, i / size_ratio),
                      shuffle_type, i % size_ratio);
         bld.MOV(shuffle_component_i,
                 retype(offset(src, bld, i + first_component), shuffle_type));
      }
   } else {
      /* Source is unshuffled into destination */
      unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
      assert(!regions_overlap(dst,
         type_sz(dst.type) * bld.dispatch_width() * components,
         offset(src, bld, first_component / size_ratio),
         type_sz(src.type) * bld.dispatch_width() *
         DIV_ROUND_UP(components + (first_component % size_ratio),
                      size_ratio)));

      brw_reg_type shuffle_type =
         brw_reg_type_from_bit_size(8 * type_sz(dst.type),
                                    BRW_REGISTER_TYPE_D);
      for (unsigned i = 0; i < components; i++) {
         fs_reg shuffle_component_i =
            subscript(offset(src, bld, (first_component + i) / size_ratio),
                      shuffle_type, (first_component + i) % size_ratio);
         bld.MOV(retype(offset(dst, bld, i), shuffle_type),
                 shuffle_component_i);
      }
   }
}

void
shuffle_from_32bit_read(const fs_builder &bld,
                        const fs_reg &dst,
                        const fs_reg &src,
                        uint32_t first_component,
                        uint32_t components)
{
   assert(type_sz(src.type) == 4);

   /* This function takes components in units of the destination type, while
    * shuffle_src_to_dst takes components in units of the smallest type.
    */
   if (type_sz(dst.type) > 4) {
      assert(type_sz(dst.type) == 8);
      first_component *= 2;
      components *= 2;
   }

   shuffle_src_to_dst(bld, dst, src, first_component, components);
}

fs_reg
setup_imm_df(const fs_builder &bld, double v)
{
   const struct intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->ver >= 7);

   if (devinfo->ver >= 8)
      return brw_imm_df(v);

   /* gfx7.5 does not support DF immediates directly, but the DIM instruction
    * allows setting a 64-bit immediate value.
    */
   if (devinfo->platform == INTEL_PLATFORM_HSW) {
      const fs_builder ubld = bld.exec_all().group(1, 0);
      fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
      ubld.DIM(dst, brw_imm_df(v));
      return component(dst, 0);
   }

   /* gfx7 does not support DF immediates, so we generate a 64-bit constant by
    * writing the low 32 bits of the constant to suboffset 0 of a VGRF and
    * the high 32 bits to suboffset 4, then applying a stride of 0.
    *
    * Alternatively, we could produce a normal VGRF (without stride 0) by
    * writing to all the channels in the VGRF.  However, that would hit the
    * gfx7 bug where we have to split writes that span more than 1 register
    * into instructions with a width of 4 (otherwise the write to the second
    * register runs into an execmask hardware bug), which isn't very nice.
    */
   union {
      double d;
      struct {
         uint32_t i1;
         uint32_t i2;
      };
   } di;

   di.d = v;

   const fs_builder ubld = bld.exec_all().group(1, 0);
   const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
   ubld.MOV(tmp, brw_imm_ud(di.i1));
   ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));

   return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
}

fs_reg
setup_imm_b(const fs_builder &bld, int8_t v)
{
   const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B);
   bld.MOV(tmp, brw_imm_w(v));
   return tmp;
}

fs_reg
setup_imm_ub(const fs_builder &bld, uint8_t v)
{
   const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB);
   bld.MOV(tmp, brw_imm_uw(v));
   return tmp;
}
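
/* Illustrative sketch (not part of the driver, guarded out of the build): a
 * minimal model of the index math in shuffle_src_to_dst for the "source is
 * shuffled into destination" case, ignoring the SIMD dimension and using a
 * plain array per register instead of fs_regs.  The helper and function
 * names below are hypothetical and exist only to make the mapping concrete:
 * small-type component i lands in sub-word (i % size_ratio) of destination
 * element (i / size_ratio), which is what the subscript()/offset() pair
 * expresses above.  The low sub-word is taken as the first component,
 * matching a little-endian register layout.
 */
#if 0
#include <assert.h>
#include <stdint.h>

/* Pack 'components' 16-bit values into 32-bit words, two per word. */
static void
example_shuffle_16_to_32(uint32_t *dst, const uint16_t *src,
                         unsigned first_component, unsigned components)
{
   const unsigned size_ratio = sizeof(*dst) / sizeof(*src); /* 2 */

   for (unsigned i = 0; i < components; i++) {
      const uint16_t value = src[first_component + i];
      const unsigned shift = 16 * (i % size_ratio);

      /* Start a fresh destination word when its first sub-word is written. */
      if (i % size_ratio == 0)
         dst[i / size_ratio] = 0;
      dst[i / size_ratio] |= (uint32_t)value << shift;
   }
}

static void
example_usage(void)
{
   const uint16_t xyz[3] = { 0x1111, 0x2222, 0x3333 };
   uint32_t packed[2];

   example_shuffle_16_to_32(packed, xyz, 0, 3);
   assert(packed[0] == 0x22221111u); /* .x in the low half, .y in the high */
   assert(packed[1] == 0x00003333u); /* .z alone in the second dword */
}
#endif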