/*
 * Copyright © 2010, 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file brw_lower_logical_sends.cpp
 */

#include "brw_eu.h"
#include "brw_fs.h"

using namespace brw;

static void
lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;

   assert(inst->size_written % REG_SIZE == 0);
   assert(inst->header_size == 0);

   fs_reg *payload_sources = new fs_reg[inst->mlen];
   fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(inst->mlen),
                           BRW_REGISTER_TYPE_F);

   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   bld.LOAD_PAYLOAD(payload, payload_sources, inst->mlen, header_size);

   delete [] payload_sources;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->header_size = header_size;

   inst->sfid = BRW_SFID_URB;
   inst->desc = brw_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_READ,
                             per_slot_present,
                             false,
                             inst->offset);

   inst->ex_desc = 0;
   inst->ex_mlen = 0;
   inst->send_is_volatile = true;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = brw_null_reg();
}

static void
lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;
   const bool channel_mask_present =
      inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE;

   assert(inst->header_size == 0);

   fs_reg *payload_sources = new fs_reg[inst->mlen];
   fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(inst->mlen),
                           BRW_REGISTER_TYPE_F);

   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   if (channel_mask_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];

   for (unsigned i = header_size, j = 0; i < inst->mlen; i++, j++)
      payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j);

   bld.LOAD_PAYLOAD(payload, payload_sources, inst->mlen, header_size);

   delete [] payload_sources;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->header_size = header_size;
   inst->dst = brw_null_reg();

   inst->sfid = BRW_SFID_URB;
   inst->desc = brw_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_WRITE,
                             per_slot_present,
                             channel_mask_present,
                             inst->offset);

   inst->ex_desc = 0;
   inst->ex_mlen = 0;
   inst->send_has_side_effects = true;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = brw_null_reg();
}

static void
setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
                    fs_reg *dst, fs_reg color, unsigned components)
{
   if (key->clamp_fragment_color) {
      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
      assert(color.type == BRW_REGISTER_TYPE_F);

      for (unsigned i = 0; i < components; i++)
         set_saturate(true,
                      bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));

      color = tmp;
   }

   for (unsigned i = 0; i < components; i++)
      dst[i] = offset(color, bld, i);
}

static void
lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
                            const struct brw_wm_prog_data *prog_data,
                            const brw_wm_prog_key *key,
                            const fs_visitor::thread_payload &payload)
{
   assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
   const intel_device_info *devinfo = bld.shader->devinfo;
   const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
   const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
   const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
   const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
   const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
   const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
   fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
   const unsigned components =
      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;

   assert(inst->target != 0 || src0_alpha.file == BAD_FILE);

   /* We can potentially have a message length of up to 15, so we have to set
    * base_mrf to either 0 or 1 in order to fit in m0..m15.
    */
   fs_reg sources[15];
   int header_size = 2, payload_header_size;
   unsigned length = 0;

   if (devinfo->ver < 6) {
      /* TODO: Support SIMD32 on gfx4-5 */
      assert(bld.group() < 16);

      /* For gfx4-5, we always have a header consisting of g0 and g1.  We have
       * an implied MOV from g0,g1 to the start of the message.  The MOV from
       * g0 is handled by the hardware and the MOV from g1 is provided by the
       * generator.  This is required because, on gfx4-5, the generator may
       * generate two write messages with different message lengths in order
       * to handle AA data properly.
       *
       * Also, since the pixel mask goes in the g0 portion of the message and
       * since render target writes are the last thing in the shader, we write
       * the pixel mask directly into g0 and it will get copied as part of the
       * implied write.
       */
      if (prog_data->uses_kill) {
         bld.exec_all().group(1, 0)
            .MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW),
                 brw_sample_mask_reg(bld));
      }

      assert(length == 0);
      length = 2;
   } else if ((devinfo->verx10 <= 70 &&
               prog_data->uses_kill) ||
              (devinfo->ver < 11 &&
               (color1.file != BAD_FILE || key->nr_color_regions > 1))) {
      /* From the Sandy Bridge PRM, volume 4, page 198:
       *
       *     "Dispatched Pixel Enables. One bit per pixel indicating
       *      which pixels were originally enabled when the thread was
       *      dispatched. This field is only required for the end-of-
       *      thread message and on all dual-source messages."
       */
      const fs_builder ubld = bld.exec_all().group(8, 0);

      fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
      if (bld.group() < 16) {
         /* The header starts off as g0 and g1 for the first half */
         ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
                                              BRW_REGISTER_TYPE_UD));
      } else {
         /* The header starts off as g0 and g2 for the second half */
         assert(bld.group() < 32);
         const fs_reg header_sources[2] = {
            retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
            retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD),
         };
         ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);

         /* Gfx12 will require additional fix-ups if we ever hit this path. */
         assert(devinfo->ver < 12);
      }

      uint32_t g00_bits = 0;

      /* Set "Source0 Alpha Present to RenderTarget" bit in message
       * header.
       */
      if (src0_alpha.file != BAD_FILE)
         g00_bits |= 1 << 11;

      /* Set computes stencil to render target */
      if (prog_data->computed_stencil)
         g00_bits |= 1 << 14;

      if (g00_bits) {
         /* OR extra bits into g0.0 */
         ubld.group(1, 0).OR(component(header, 0),
                             retype(brw_vec1_grf(0, 0),
                                    BRW_REGISTER_TYPE_UD),
                             brw_imm_ud(g00_bits));
      }

      /* Set the render target index for choosing BLEND_STATE. */
      if (inst->target > 0) {
         ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target));
      }

      if (prog_data->uses_kill) {
         ubld.group(1, 0).MOV(retype(component(header, 15),
                                     BRW_REGISTER_TYPE_UW),
                              brw_sample_mask_reg(bld));
      }

      assert(length == 0);
      sources[0] = header;
      sources[1] = horiz_offset(header, 8);
      length = 2;
   }
   assert(length == 0 || length == 2);
   header_size = length;

   if (payload.aa_dest_stencil_reg[0]) {
      assert(inst->group < 16);
      sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
         .MOV(sources[length],
              fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg[0], 0)));
      length++;
   }

   if (src0_alpha.file != BAD_FILE) {
      for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
         const fs_builder &ubld = bld.exec_all().group(8, i)
                                     .annotate("FB write src0 alpha");
         const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_F);
         ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
         setup_color_payload(ubld, key, &sources[length], tmp, 1);
         length++;
      }
   }

   if (sample_mask.file != BAD_FILE) {
      sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
                               BRW_REGISTER_TYPE_UD);

      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
       * relevant.  Since it's unsigned single words one vgrf is always
       * 16-wide, but only the lower or higher 8 channels will be used by the
       * hardware when doing a SIMD8 write depending on whether we have
       * selected the subspans for the first or second half respectively.
       */
      assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
      sample_mask.type = BRW_REGISTER_TYPE_UW;
      sample_mask.stride *= 2;

      bld.exec_all().annotate("FB write oMask")
         .MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW),
                           inst->group % 16),
              sample_mask);
      length++;
   }

   payload_header_size = length;

   setup_color_payload(bld, key, &sources[length], color0, components);
   length += 4;

   if (color1.file != BAD_FILE) {
      setup_color_payload(bld, key, &sources[length], color1, components);
      length += 4;
   }

   if (src_depth.file != BAD_FILE) {
      sources[length] = src_depth;
      length++;
   }

   if (dst_depth.file != BAD_FILE) {
      sources[length] = dst_depth;
      length++;
   }

   if (src_stencil.file != BAD_FILE) {
      assert(devinfo->ver >= 9);
      assert(bld.dispatch_width() == 8);

      /* XXX: src_stencil is only available on gfx9+. dst_depth is never
       * available on gfx9+. As such it's impossible to have both enabled at the
       * same time and therefore length cannot overrun the array.
       */
      assert(length < 15);

      sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.exec_all().annotate("FB write OS")
         .MOV(retype(sources[length], BRW_REGISTER_TYPE_UB),
              subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0));
      length++;
   }

   fs_inst *load;
   if (devinfo->ver >= 7) {
      /* Send from the GRF */
      fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
      load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
      payload.nr = bld.shader->alloc.allocate(regs_written(load));
      load->dst = payload;

      uint32_t msg_ctl = brw_fb_write_msg_control(inst, prog_data);

      inst->desc =
         (inst->group / 16) << 11 | /* rt slot group */
         brw_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,
                           prog_data->per_coarse_pixel_dispatch);

      uint32_t ex_desc = 0;
      if (devinfo->ver >= 11) {
         /* Set the "Render Target Index" and "Src0 Alpha Present" fields
          * in the extended message descriptor, in lieu of using a header.
          */
         ex_desc = inst->target << 12 | (src0_alpha.file != BAD_FILE) << 15;

         if (key->nr_color_regions == 0)
            ex_desc |= 1 << 20; /* Null Render Target */
      }
      inst->ex_desc = ex_desc;

      inst->opcode = SHADER_OPCODE_SEND;
      inst->resize_sources(3);
      inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0);
      inst->src[2] = payload;
      inst->mlen = regs_written(load);
      inst->ex_mlen = 0;
      inst->header_size = header_size;
      inst->check_tdr = true;
      inst->send_has_side_effects = true;
   } else {
      /* Send from the MRF */
      load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
                              sources, length, payload_header_size);

      /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
       * will do this for us if we just give it a COMPR4 destination.
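       * (With COMPR4, the second SIMD8 half of each SIMD16 color component
       * lands four MRFs after the first half, which matches the interleaved
       * layout the gfx4-5 render target write message expects.)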
       */
      if (devinfo->ver < 6 && bld.dispatch_width() == 16)
         load->dst.nr |= BRW_MRF_COMPR4;

      if (devinfo->ver < 6) {
         /* Set up src[0] for the implied MOV from grf0-1 */
         inst->resize_sources(1);
         inst->src[0] = brw_vec8_grf(0, 0);
      } else {
         inst->resize_sources(0);
      }
      inst->base_mrf = 1;
      inst->opcode = FS_OPCODE_FB_WRITE;
      inst->mlen = regs_written(load);
      inst->header_size = header_size;
   }
}

static void
lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const fs_builder &ubld = bld.exec_all().group(8, 0);
   const unsigned length = 2;
   const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length);

   if (bld.group() < 16) {
      ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
                                           BRW_REGISTER_TYPE_UD));
   } else {
      assert(bld.group() < 32);
      const fs_reg header_sources[] = {
         retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
         retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD)
      };
      ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);

      if (devinfo->ver >= 12) {
         /* On Gfx12 the Viewport and Render Target Array Index fields (AKA
          * Poly 0 Info) are provided in r1.1 instead of r0.0, and the render
          * target message header format was updated accordingly -- However
          * the updated format only works for the lower 16 channels in a
          * SIMD32 thread, since the higher 16 channels want the subspan data
          * from r2 instead of r1, so we need to copy over the contents of
          * r1.1 in order to fix things up.
          */
         ubld.group(1, 0).MOV(component(header, 9),
                              retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD));
      }
   }

   /* BSpec 12470 (Gfx8-11), BSpec 47842 (Gfx12+) :
    *
    *   "Must be zero for Render Target Read message."
    *
    * For bits :
    *   - 14 : Stencil Present to Render Target
    *   - 13 : Source Depth Present to Render Target
    *   - 12 : oMask to Render Target
    *   - 11 : Source0 Alpha Present to Render Target
    */
   ubld.group(1, 0).AND(component(header, 0),
                        component(header, 0),
                        brw_imm_ud(~INTEL_MASK(14, 11)));

   inst->resize_sources(1);
   inst->src[0] = header;
   inst->opcode = FS_OPCODE_FB_READ;
   inst->mlen = length;
   inst->header_size = length;
}

static void
lower_sampler_logical_send_gfx4(const fs_builder &bld, fs_inst *inst, opcode op,
                                const fs_reg &coordinate,
                                const fs_reg &shadow_c,
                                const fs_reg &lod, const fs_reg &lod2,
                                const fs_reg &surface,
                                const fs_reg &sampler,
                                unsigned coord_components,
                                unsigned grad_components)
{
   const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
                         op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
   fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
   fs_reg msg_end = msg_begin;

   /* g0 header. */
   msg_end = offset(msg_end, bld.group(8, 0), 1);

   for (unsigned i = 0; i < coord_components; i++)
      bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
              offset(coordinate, bld, i));

   msg_end = offset(msg_end, bld, coord_components);

   /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
    * require all three components to be present and zero if they are unused.
    */
   if (coord_components > 0 &&
       (has_lod || shadow_c.file != BAD_FILE ||
        (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
      assert(coord_components <= 3);
      for (unsigned i = 0; i < 3 - coord_components; i++)
         bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));

      msg_end = offset(msg_end, bld, 3 - coord_components);
   }

   if (op == SHADER_OPCODE_TXD) {
      /* TXD unsupported in SIMD16 mode. */
      assert(bld.dispatch_width() == 8);

      /* the slots for u and v are always present, but r is optional */
      if (coord_components < 2)
         msg_end = offset(msg_end, bld, 2 - coord_components);

      /*  P    = u, v, r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * 1-arg: Does not exist.
       *
       * 2-arg: dudx   dvdx   dudy   dvdy
       *        dPdx.x dPdx.y dPdy.x dPdy.y
       *        m4     m5     m6     m7
       *
       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
       *        m5     m6     m7     m8     m9     m10
       */
      for (unsigned i = 0; i < grad_components; i++)
         bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));

      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));

      for (unsigned i = 0; i < grad_components; i++)
         bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));

      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
   }

   if (has_lod) {
      /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
       * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
       */
      assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
             bld.dispatch_width() == 16);

      const brw_reg_type type =
         (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
          BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
      bld.MOV(retype(msg_end, type), lod);
      msg_end = offset(msg_end, bld, 1);
   }

   if (shadow_c.file != BAD_FILE) {
      if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
         /* There's no plain shadow compare message, so we use shadow
          * compare with a bias of 0.0.
          */
         bld.MOV(msg_end, brw_imm_f(0.0f));
         msg_end = offset(msg_end, bld, 1);
      }

      bld.MOV(msg_end, shadow_c);
      msg_end = offset(msg_end, bld, 1);
   }

   inst->opcode = op;
   inst->src[0] = reg_undef;
   inst->src[1] = surface;
   inst->src[2] = sampler;
   inst->resize_sources(3);
   inst->base_mrf = msg_begin.nr;
   inst->mlen = msg_end.nr - msg_begin.nr;
   inst->header_size = 1;
}

static void
lower_sampler_logical_send_gfx5(const fs_builder &bld, fs_inst *inst, opcode op,
                                const fs_reg &coordinate,
                                const fs_reg &shadow_c,
                                const fs_reg &lod, const fs_reg &lod2,
                                const fs_reg &sample_index,
                                const fs_reg &surface,
                                const fs_reg &sampler,
                                unsigned coord_components,
                                unsigned grad_components)
{
   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
   fs_reg msg_coords = message;
   unsigned header_size = 0;

   if (inst->offset != 0) {
      /* The offsets set up by the visitor are in the m1 header, so we can't
       * go headerless.
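       *
       * Moving the message start one register earlier (below) leaves m1 for
       * the header while the coordinates stay at m2 and up.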
       */
      header_size = 1;
      message.nr--;
   }

   for (unsigned i = 0; i < coord_components; i++)
      bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
              offset(coordinate, bld, i));

   fs_reg msg_end = offset(msg_coords, bld, coord_components);
   fs_reg msg_lod = offset(msg_coords, bld, 4);

   if (shadow_c.file != BAD_FILE) {
      fs_reg msg_shadow = msg_lod;
      bld.MOV(msg_shadow, shadow_c);
      msg_lod = offset(msg_shadow, bld, 1);
      msg_end = msg_lod;
   }

   switch (op) {
   case SHADER_OPCODE_TXL:
   case FS_OPCODE_TXB:
      bld.MOV(msg_lod, lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case SHADER_OPCODE_TXD:
      /**
       *  P    = u, v, r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * Load up these values:
       * - dudx   dudy   dvdx   dvdy   drdx   drdy
       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
       */
      msg_end = msg_lod;
      for (unsigned i = 0; i < grad_components; i++) {
         bld.MOV(msg_end, offset(lod, bld, i));
         msg_end = offset(msg_end, bld, 1);

         bld.MOV(msg_end, offset(lod2, bld, i));
         msg_end = offset(msg_end, bld, 1);
      }
      break;
   case SHADER_OPCODE_TXS:
      msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
      bld.MOV(msg_lod, lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case SHADER_OPCODE_TXF:
      msg_lod = offset(msg_coords, bld, 3);
      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case SHADER_OPCODE_TXF_CMS:
      msg_lod = offset(msg_coords, bld, 3);
      /* lod */
      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
      /* sample index */
      bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
      msg_end = offset(msg_lod, bld, 2);
      break;
   default:
      break;
   }

   inst->opcode = op;
   inst->src[0] = reg_undef;
   inst->src[1] = surface;
   inst->src[2] = sampler;
   inst->resize_sources(3);
   inst->base_mrf = message.nr;
   inst->mlen = msg_end.nr - message.nr;
   inst->header_size = header_size;

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
}

static bool
is_high_sampler(const struct intel_device_info *devinfo, const fs_reg &sampler)
{
   if (devinfo->verx10 <= 70)
      return false;

   return sampler.file != IMM || sampler.ud >= 16;
}

static unsigned
sampler_msg_type(const intel_device_info *devinfo,
                 opcode opcode, bool shadow_compare)
{
   assert(devinfo->ver >= 5);
   switch (opcode) {
   case SHADER_OPCODE_TEX:
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE;
   case FS_OPCODE_TXB:
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
   case SHADER_OPCODE_TXL:
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
   case SHADER_OPCODE_TXL_LZ:
      return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ :
                              GFX9_SAMPLER_MESSAGE_SAMPLE_LZ;
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
   case SHADER_OPCODE_TXD:
      assert(!shadow_compare || devinfo->verx10 >= 75);
      return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
   case SHADER_OPCODE_TXF:
      return GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   case SHADER_OPCODE_TXF_LZ:
      assert(devinfo->ver >= 9);
      return GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
   case SHADER_OPCODE_TXF_CMS_W:
      assert(devinfo->ver >= 9);
      return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
   case SHADER_OPCODE_TXF_CMS:
      return devinfo->ver >= 7 ? GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS :
                                 GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   case SHADER_OPCODE_TXF_UMS:
      assert(devinfo->ver >= 7);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
   case SHADER_OPCODE_TXF_MCS:
      assert(devinfo->ver >= 7);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
   case SHADER_OPCODE_LOD:
      return GFX5_SAMPLER_MESSAGE_LOD;
   case SHADER_OPCODE_TG4:
      assert(devinfo->ver >= 7);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
      break;
   case SHADER_OPCODE_TG4_OFFSET:
      assert(devinfo->ver >= 7);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
   case SHADER_OPCODE_SAMPLEINFO:
      return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
   default:
      unreachable("not reached");
   }
}

/**
 * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to
 * the given requested_alignment_sz.
 */
static fs_inst *
emit_load_payload_with_padding(const fs_builder &bld, const fs_reg &dst,
                               const fs_reg *src, unsigned sources,
                               unsigned header_size,
                               unsigned requested_alignment_sz)
{
   unsigned length = 0;
   unsigned num_srcs =
      sources * DIV_ROUND_UP(requested_alignment_sz, bld.dispatch_width());
   fs_reg *src_comps = new fs_reg[num_srcs];

   for (unsigned i = 0; i < header_size; i++)
      src_comps[length++] = src[i];

   for (unsigned i = header_size; i < sources; i++) {
      unsigned src_sz =
         retype(dst, src[i].type).component_size(bld.dispatch_width());
      const enum brw_reg_type padding_payload_type =
         brw_reg_type_from_bit_size(type_sz(src[i].type) * 8,
                                    BRW_REGISTER_TYPE_UD);

      src_comps[length++] = src[i];

      /* Expand the real sources if component of requested payload type is
       * larger than real source component.
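       *
       * The padding entries are default-constructed (BAD_FILE) registers of
       * the padding payload type, so LOAD_PAYLOAD reserves space for them
       * without copying any data.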
       */
      if (src_sz < requested_alignment_sz) {
         for (unsigned j = 0; j < (requested_alignment_sz / src_sz) - 1; j++) {
            src_comps[length++] = retype(fs_reg(), padding_payload_type);
         }
      }
   }

   fs_inst *inst = bld.LOAD_PAYLOAD(dst, src_comps, length, header_size);
   delete[] src_comps;

   return inst;
}

static void
lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
                                const fs_reg &coordinate,
                                const fs_reg &shadow_c,
                                fs_reg lod, const fs_reg &lod2,
                                const fs_reg &min_lod,
                                const fs_reg &sample_index,
                                const fs_reg &mcs,
                                const fs_reg &surface,
                                const fs_reg &sampler,
                                const fs_reg &surface_handle,
                                const fs_reg &sampler_handle,
                                const fs_reg &tg4_offset,
                                unsigned payload_type_bit_size,
                                unsigned coord_components,
                                unsigned grad_components)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const enum brw_reg_type payload_type =
      brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_F);
   const enum brw_reg_type payload_unsigned_type =
      brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_UD);
   const enum brw_reg_type payload_signed_type =
      brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_D);
   unsigned reg_width = bld.dispatch_width() / 8;
   unsigned header_size = 0, length = 0;
   fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
      sources[i] = bld.vgrf(payload_type);

   /* We must have exactly one of surface/sampler and surface/sampler_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
   assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));

   if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
       inst->offset != 0 || inst->eot ||
       op == SHADER_OPCODE_SAMPLEINFO ||
       sampler_handle.file != BAD_FILE ||
       is_high_sampler(devinfo, sampler)) {
      /* For general texture offsets (no txf workaround), we need a header to
       * put them in.
       *
       * TG4 needs to place its channel select in the header, for interaction
       * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
       * larger sampler numbers we need to offset the Sampler State Pointer in
       * the header.
       */
      fs_reg header = retype(sources[0], BRW_REGISTER_TYPE_UD);
      header_size = 1;
      length++;

      /* If we're requesting fewer than four channels worth of response,
       * and we have an explicit header, we need to set up the sampler
       * writemask.  It's reversed from normal: 1 means "don't write".
       */
      if (!inst->eot && regs_written(inst) != 4 * reg_width) {
         assert(regs_written(inst) % reg_width == 0);
         unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;
         inst->offset |= mask << 12;
      }

      /* Build the actual header */
      const fs_builder ubld = bld.exec_all().group(8, 0);
      const fs_builder ubld1 = ubld.group(1, 0);
      ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
      if (inst->offset) {
         ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
      } else if (bld.shader->stage != MESA_SHADER_VERTEX &&
                 bld.shader->stage != MESA_SHADER_FRAGMENT) {
         /* The vertex and fragment stages have g0.2 set to 0, so
          * header0.2 is 0 when g0 is copied.  Other stages may not, so we
          * must set it to 0 to avoid setting undesirable bits in the
          * message.
          */
         ubld1.MOV(component(header, 2), brw_imm_ud(0));
      }

      if (sampler_handle.file != BAD_FILE) {
         /* Bindless sampler handles aren't relative to the sampler state
          * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
          * Instead, it's an absolute pointer relative to dynamic state base
          * address.
          *
          * Sampler states are 16 bytes each and the pointer we give here has
          * to be 32-byte aligned.  In order to avoid more indirect messages
          * than required, we assume that all bindless sampler states are
          * 32-byte aligned.  This sacrifices a bit of general state base
          * address space but means we can do something more efficient in the
          * shader.
          */
         ubld1.MOV(component(header, 3), sampler_handle);
      } else if (is_high_sampler(devinfo, sampler)) {
         fs_reg sampler_state_ptr =
            retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD);

         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
          * with the ones included in g0.3 bits 4:0.  Mask them out.
          */
         if (devinfo->ver >= 11) {
            sampler_state_ptr = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
            ubld1.AND(sampler_state_ptr,
                      retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
                      brw_imm_ud(INTEL_MASK(31, 5)));
         }

         if (sampler.file == BRW_IMMEDIATE_VALUE) {
            assert(sampler.ud >= 16);
            const int sampler_state_size = 16; /* 16 bytes */

            ubld1.ADD(component(header, 3), sampler_state_ptr,
                      brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
         } else {
            fs_reg tmp = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
            ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
            ubld1.SHL(tmp, tmp, brw_imm_ud(4));
            ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
         }
      } else if (devinfo->ver >= 11) {
         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
          * with the ones included in g0.3 bits 4:0.  Mask them out.
          */
         ubld1.AND(component(header, 3),
                   retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
                   brw_imm_ud(INTEL_MASK(31, 5)));
      }
   }

   if (shadow_c.file != BAD_FILE) {
      bld.MOV(sources[length], shadow_c);
      length++;
   }

   bool coordinate_done = false;

   /* Set up the LOD info */
   switch (op) {
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXL:
      if (devinfo->ver >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
         op = SHADER_OPCODE_TXL_LZ;
         break;
      }
      bld.MOV(sources[length], lod);
      length++;
      break;
   case SHADER_OPCODE_TXD:
      /* TXD should have been lowered in SIMD16 mode. */
      assert(bld.dispatch_width() == 8);

      /* Load dPdx and the coordinate together:
       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
       */
      for (unsigned i = 0; i < coord_components; i++) {
         bld.MOV(sources[length++], offset(coordinate, bld, i));

         /* For cube map array, the coordinate is (u,v,r,ai) but there are
          * only derivatives for (u, v, r).
          */
         if (i < grad_components) {
            bld.MOV(sources[length++], offset(lod, bld, i));
            bld.MOV(sources[length++], offset(lod2, bld, i));
         }
      }

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TXS:
      bld.MOV(retype(sources[length], payload_unsigned_type), lod);
      length++;
      break;
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      /* We need an LOD; just use 0 */
      bld.MOV(retype(sources[length], payload_unsigned_type), brw_imm_ud(0));
      length++;
      break;
   case SHADER_OPCODE_TXF:
      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
       * On Gfx9 they are u, v, lod, r
       */
      bld.MOV(retype(sources[length++], payload_signed_type), coordinate);

      if (devinfo->ver >= 9) {
         if (coord_components >= 2) {
            bld.MOV(retype(sources[length], payload_signed_type),
                    offset(coordinate, bld, 1));
         } else {
            sources[length] = brw_imm_d(0);
         }
         length++;
      }

      if (devinfo->ver >= 9 && lod.is_zero()) {
         op = SHADER_OPCODE_TXF_LZ;
      } else {
         bld.MOV(retype(sources[length], payload_signed_type), lod);
         length++;
      }

      for (unsigned i = devinfo->ver >= 9 ? 2 : 1; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_signed_type),
                 offset(coordinate, bld, i));

      coordinate_done = true;
      break;

   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_CMS_W:
   case SHADER_OPCODE_TXF_UMS:
   case SHADER_OPCODE_TXF_MCS:
      if (op == SHADER_OPCODE_TXF_UMS ||
          op == SHADER_OPCODE_TXF_CMS ||
          op == SHADER_OPCODE_TXF_CMS_W) {
         bld.MOV(retype(sources[length++], payload_unsigned_type), sample_index);
      }

      /* Data from the multisample control surface. */
      if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
         unsigned num_mcs_components = 1;

         /* From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
          * Shared Functions - 3D Sampler - Messages - Message Format:
          *
          *    ld2dms_w   si  mcs0 mcs1 mcs2  mcs3  u  v  r
          */
         if (devinfo->verx10 >= 125 && op == SHADER_OPCODE_TXF_CMS_W)
            num_mcs_components = 4;
         else if (op == SHADER_OPCODE_TXF_CMS_W)
            num_mcs_components = 2;

         for (unsigned i = 0; i < num_mcs_components; ++i) {
            bld.MOV(retype(sources[length++], payload_unsigned_type),
                    mcs.file == IMM ? mcs : offset(mcs, bld, i));
         }
      }

      /* There is no offsetting for this message; just copy in the integer
       * texture coordinates.
       */
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_signed_type),
                 offset(coordinate, bld, i));

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TG4_OFFSET:
      /* More crazy intermixing */
      for (unsigned i = 0; i < 2; i++) /* u, v */
         bld.MOV(sources[length++], offset(coordinate, bld, i));

      for (unsigned i = 0; i < 2; i++) /* offu, offv */
         bld.MOV(retype(sources[length++], payload_signed_type),
                 offset(tg4_offset, bld, i));

      if (coord_components == 3) /* r if present */
         bld.MOV(sources[length++], offset(coordinate, bld, 2));

      coordinate_done = true;
      break;
   default:
      break;
   }

   /* Set up the coordinate (except for cases where it was done above) */
   if (!coordinate_done) {
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_type),
                 offset(coordinate, bld, i));
   }

   if (min_lod.file != BAD_FILE) {
      /* Account for all of the missing coordinate sources */
      if (op == SHADER_OPCODE_TXD && devinfo->verx10 >= 125) {
         /* On DG2 and newer platforms, sample_d can only be used with 1D and
          * 2D surfaces, so the maximum number of gradient components is 2.
          * In spite of this limitation, the Bspec lists a mysterious R
          * component before the min_lod, so the maximum coordinate components
          * is 3.
          *
          * Wa_1209978020
          */
         length += 3 - coord_components;
         length += (2 - grad_components) * 2;
      } else {
         length += 4 - coord_components;
         if (op == SHADER_OPCODE_TXD)
            length += (3 - grad_components) * 2;
      }

      bld.MOV(sources[length++], min_lod);
   }

   const fs_reg src_payload =
      fs_reg(VGRF, bld.shader->alloc.allocate(length * reg_width),
             BRW_REGISTER_TYPE_F);
   /* In case of 16-bit payload each component takes one full register in
    * both SIMD8H and SIMD16H modes.  In both cases one reg can hold 16
    * elements.  In SIMD8H case hardware simply expects the components to be
    * padded (i.e., aligned on reg boundary).
    */
   fs_inst *load_payload_inst =
      emit_load_payload_with_padding(bld, src_payload, sources, length,
                                     header_size, REG_SIZE);
   unsigned mlen = load_payload_inst->size_written / REG_SIZE;
   unsigned simd_mode = 0;
   if (payload_type_bit_size == 16) {
      assert(devinfo->ver >= 11);
      simd_mode = inst->exec_size <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H :
                                         GFX10_SAMPLER_SIMD_MODE_SIMD16H;
   } else {
      simd_mode = inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
                                         BRW_SAMPLER_SIMD_MODE_SIMD16;
   }

   /* Generate the SEND. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->header_size = header_size;

   const unsigned msg_type =
      sampler_msg_type(devinfo, op, inst->shadow_compare);

   inst->sfid = BRW_SFID_SAMPLER;
   if (surface.file == IMM &&
       (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
      inst->desc = brw_sampler_desc(devinfo, surface.ud,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0);
   } else if (surface_handle.file != BAD_FILE) {
      /* Bindless surface */
      assert(devinfo->ver >= 9);
      inst->desc = brw_sampler_desc(devinfo,
                                    GFX9_BTI_BINDLESS,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);

      /* For bindless samplers, the entire address is included in the message
       * header so we can leave the portion in the message descriptor 0.
       */
      if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
         inst->src[0] = brw_imm_ud(0);
      } else {
         const fs_builder ubld = bld.group(1, 0).exec_all();
         fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.SHL(desc, sampler, brw_imm_ud(8));
         inst->src[0] = desc;
      }

      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
   } else {
      /* Immediate portion of the descriptor */
      inst->desc = brw_sampler_desc(devinfo,
                                    0, /* surface */
                                    0, /* sampler */
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);
      const fs_builder ubld = bld.group(1, 0).exec_all();
      fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      if (surface.equals(sampler)) {
         /* This case is common in GL */
         ubld.MUL(desc, surface, brw_imm_ud(0x101));
      } else {
         if (sampler_handle.file != BAD_FILE) {
            ubld.MOV(desc, surface);
         } else if (sampler.file == IMM) {
            ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8));
         } else {
            ubld.SHL(desc, sampler, brw_imm_ud(8));
            ubld.OR(desc, desc, surface);
         }
      }
      ubld.AND(desc, desc, brw_imm_ud(0xfff));

      inst->src[0] = component(desc, 0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   }

   inst->ex_desc = 0;

   inst->src[2] = src_payload;
   inst->resize_sources(3);

   if (inst->eot) {
      /* EOT sampler messages don't make sense to split because it would
       * involve ending half of the thread early.
       */
      assert(inst->group == 0);
      /* We need to use SENDC for EOT sampler messages */
      inst->check_tdr = true;
      inst->send_has_side_effects = true;
   }

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
}

static unsigned
get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
                                      opcode op, const fs_reg *src)
{
   unsigned src_type_size = 0;

   /* All sources need to have the same size, therefore seek the first valid
    * and take the size from there.
    */
   for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
      if (src[i].file != BAD_FILE) {
         src_type_size = brw_reg_type_to_size(src[i].type);
         break;
      }
   }

   assert(src_type_size == 2 || src_type_size == 4);

#ifndef NDEBUG
   /* Make sure all sources agree. On gfx12 this doesn't hold when sampling
    * compressed multisampled surfaces. There the payload contains MCS data
    * which is already in 16-bits unlike the other parameters that need forced
    * conversion.
    */
   if (devinfo->verx10 < 125 ||
       (op != SHADER_OPCODE_TXF_CMS_W &&
        op != SHADER_OPCODE_TXF_CMS)) {
      for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
         assert(src[i].file == BAD_FILE ||
                brw_reg_type_to_size(src[i].type) == src_type_size);
      }
   }
#endif

   if (devinfo->verx10 < 125)
      return src_type_size * 8;

   /* Force conversion from 32-bit sources to 16-bit payload. From the XeHP Bspec:
    * 3D and GPGPU Programs - Shared Functions - 3D Sampler - Messages - Message
    * Format [GFX12:HAS:1209977870] *
    *
    *  ld2dms_w   SIMD8H and SIMD16H Only
    *  ld_mcs     SIMD8H and SIMD16H Only
    *  ld2dms     REMOVEDBY(GEN:HAS:1406788836)
    */

   if (op == SHADER_OPCODE_TXF_CMS_W ||
       op == SHADER_OPCODE_TXF_CMS ||
       op == SHADER_OPCODE_TXF_UMS ||
       op == SHADER_OPCODE_TXF_MCS)
      src_type_size = 2;

   return src_type_size * 8;
}

static void
lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
   const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
   const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD];
   const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
   const fs_reg &min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
   const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
   const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
   const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
   const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
   const fs_reg &surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
   const fs_reg &sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
   const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
   assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
   const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
   assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
   const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;

   if (devinfo->ver >= 7) {
      const unsigned msg_payload_type_bit_size =
         get_sampler_msg_payload_type_bit_size(devinfo, op, inst->src);

      /* 16-bit payloads are available only on gfx11+ */
      assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11);

      lower_sampler_logical_send_gfx7(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2, min_lod,
                                      sample_index,
                                      mcs, surface, sampler,
                                      surface_handle, sampler_handle,
                                      tg4_offset,
                                      msg_payload_type_bit_size,
                                      coord_components, grad_components);
   } else if (devinfo->ver >= 5) {
      lower_sampler_logical_send_gfx5(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2, sample_index,
                                      surface, sampler,
                                      coord_components, grad_components);
   } else {
      lower_sampler_logical_send_gfx4(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2,
                                      surface, sampler,
                                      coord_components, grad_components);
   }
}

/**
 * Predicate the specified instruction on the vector mask.
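 *
 * The vector mask is read from the sr0.3 state register into a flag
 * register, which is then used to predicate the instruction, combining with
 * any pre-existing normal predicate via the ALLV predication mode.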
 */
static void
emit_predicate_on_vector_mask(const fs_builder &bld, fs_inst *inst)
{
   assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
          bld.group() == inst->group &&
          bld.dispatch_width() == inst->exec_size);

   const fs_builder ubld = bld.exec_all().group(1, 0);

   const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader);
   const fs_reg vector_mask = ubld.vgrf(BRW_REGISTER_TYPE_UW);
   ubld.emit(SHADER_OPCODE_READ_SR_REG, vector_mask, brw_imm_ud(3));
   const unsigned subreg = sample_mask_flag_subreg(v);

   ubld.MOV(brw_flag_subreg(subreg + inst->group / 16), vector_mask);

   if (inst->predicate) {
      assert(inst->predicate == BRW_PREDICATE_NORMAL);
      assert(!inst->predicate_inverse);
      assert(inst->flag_subreg == 0);
      /* Combine the vector mask with the existing predicate by using a
       * vertical predication mode.
       */
      inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
   } else {
      inst->flag_subreg = subreg;
      inst->predicate = BRW_PREDICATE_NORMAL;
      inst->predicate_inverse = false;
   }
}

static void
setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc,
                          const fs_reg &surface, const fs_reg &surface_handle)
{
   const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;

   /* We must have exactly one of surface and surface_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));

   if (surface.file == IMM) {
      inst->desc = desc | (surface.ud & 0xff);
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   } else if (surface_handle.file != BAD_FILE) {
      /* Bindless surface */
      assert(devinfo->ver >= 9);
      inst->desc = desc | GFX9_BTI_BINDLESS;
      inst->src[0] = brw_imm_ud(0);

      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
   } else {
      inst->desc = desc;
      const fs_builder ubld = bld.exec_all().group(1, 0);
      fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.AND(tmp, surface, brw_imm_ud(0xff));
      inst->src[0] = component(tmp, 0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   }
}

static void
lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;

   /* Get the logical send arguments. */
   const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
   const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   const fs_reg &allow_sample_mask =
      inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
   assert(arg.file == IMM);
   assert(allow_sample_mask.file == IMM);

   /* Calculate the total number of components of the payload. */
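   /* The payload is an optional header, followed by the address components,
    * followed by the source data components, in that order.
    */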
   const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
   const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);

   const bool is_typed_access =
      inst->opcode == SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL ||
      inst->opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL ||
      inst->opcode == SHADER_OPCODE_TYPED_ATOMIC_LOGICAL;

   const bool is_surface_access = is_typed_access ||
      inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL ||
      inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL ||
      inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL;

   const bool is_stateless =
      surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
                              surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);

   const bool has_side_effects = inst->has_side_effects();

   fs_reg sample_mask = allow_sample_mask.ud ? brw_sample_mask_reg(bld) :
                                               fs_reg(brw_imm_d(0xffff));

   /* From the BDW PRM Volume 7, page 147:
    *
    *  "For the Data Cache Data Port*, the header must be present for the
    *   following message types: [...] Typed read/write/atomics"
    *
    * Earlier generations have a similar wording.  Because of this restriction
    * we don't attempt to implement sample masks via predication for such
    * messages prior to Gfx9, since we have to provide a header anyway.  On
    * Gfx11+ the header has been removed so we can only use predication.
    *
    * For all stateless A32 messages, we also need a header
    */
   fs_reg header;
   if ((devinfo->ver < 9 && is_typed_access) || is_stateless) {
      fs_builder ubld = bld.exec_all().group(8, 0);
      header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      if (is_stateless) {
         assert(!is_surface_access);
         ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
      } else {
         ubld.MOV(header, brw_imm_d(0));
         if (is_surface_access)
            ubld.group(1, 0).MOV(component(header, 7), sample_mask);
      }
   }
   const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;

   fs_reg payload, payload2;
   unsigned mlen, ex_mlen = 0;
   if (devinfo->ver >= 9 &&
       (src.file == BAD_FILE || header.file == BAD_FILE)) {
      /* We have split sends on gfx9 and above */
      if (header.file == BAD_FILE) {
         payload = bld.move_to_vgrf(addr, addr_sz);
         payload2 = bld.move_to_vgrf(src, src_sz);
         mlen = addr_sz * (inst->exec_size / 8);
         ex_mlen = src_sz * (inst->exec_size / 8);
      } else {
         assert(src.file == BAD_FILE);
         payload = header;
         payload2 = bld.move_to_vgrf(addr, addr_sz);
         mlen = header_sz;
         ex_mlen = addr_sz * (inst->exec_size / 8);
      }
   } else {
      /* Allocate space for the payload. */
      const unsigned sz = header_sz + addr_sz + src_sz;
      payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
      fs_reg *const components = new fs_reg[sz];
      unsigned n = 0;

      /* Construct the payload. */
      if (header.file != BAD_FILE)
         components[n++] = header;

      for (unsigned i = 0; i < addr_sz; i++)
         components[n++] = offset(addr, bld, i);

      for (unsigned i = 0; i < src_sz; i++)
         components[n++] = offset(src, bld, i);

      bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
      mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;

      delete[] components;
   }

   /* Predicate the instruction on the sample mask if no header is
    * provided.
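    * When a header is present for a surface access, the sample mask was
    * already written into DWord 7 of that header above, so predication is
    * unnecessary there.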
    */
   if ((header.file == BAD_FILE || !is_surface_access) &&
       sample_mask.file != BAD_FILE && sample_mask.file != IMM)
      brw_emit_predicate_on_sample_mask(bld, inst);

   uint32_t sfid;
   switch (inst->opcode) {
   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      /* Byte scattered opcodes go through the normal data cache */
      sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
      break;

   case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
   case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      sfid = devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
             devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
                                 BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
      break;

   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
      /* Untyped Surface messages go through the data cache but the SFID value
       * changed on Haswell.
       */
      sfid = (devinfo->verx10 >= 75 ?
              HSW_SFID_DATAPORT_DATA_CACHE_1 :
              GFX7_SFID_DATAPORT_DATA_CACHE);
      break;

   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
      /* Typed surface messages go through the render cache on IVB and the
       * data cache on HSW+.
       */
      sfid = (devinfo->verx10 >= 75 ?
              HSW_SFID_DATAPORT_DATA_CACHE_1 :
              GFX6_SFID_DATAPORT_RENDER_CACHE);
      break;

   default:
      unreachable("Unsupported surface opcode");
   }

   uint32_t desc;
   switch (inst->opcode) {
   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                            arg.ud, /* num_channels */
                                            false   /* write */);
      break;

   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                            arg.ud, /* num_channels */
                                            true    /* write */);
      break;

   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                           arg.ud, /* bit_size */
                                           false   /* write */);
      break;

   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
      desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                           arg.ud, /* bit_size */
                                           true    /* write */);
      break;

   case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
      assert(arg.ud == 32); /* bit_size */
      desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
                                            false  /* write */);
      break;

   case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      assert(arg.ud == 32); /* bit_size */
      desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
                                            true   /* write */);
      break;

   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size,
                                        arg.ud, /* atomic_op */
                                        !inst->dst.is_null());
      break;

   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
      desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,
                                              arg.ud, /* atomic_op */
                                              !inst->dst.is_null());
      break;

   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
                                          arg.ud, /* num_channels */
                                          false   /* write */);
      break;

   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
                                          arg.ud, /* num_channels */
                                          true    /* write */);
      break;

   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
      desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
                                      arg.ud, /* atomic_op */
                                      !inst->dst.is_null());
      break;

   default:
      unreachable("Unknown surface logical instruction");
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = header_sz;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Set up SFID and descriptors */
   inst->sfid = sfid;
   setup_surface_descriptors(bld, inst, desc, surface, surface_handle);

   inst->resize_sources(4);

   /* Finally, the payload */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

static enum lsc_opcode
brw_atomic_op_to_lsc_atomic_op(unsigned op)
{
   switch(op) {
   case BRW_AOP_AND:
      return LSC_OP_ATOMIC_AND;
   case BRW_AOP_OR:
      return LSC_OP_ATOMIC_OR;
   case BRW_AOP_XOR:
      return LSC_OP_ATOMIC_XOR;
   case BRW_AOP_MOV:
      return LSC_OP_ATOMIC_STORE;
   case BRW_AOP_INC:
      return LSC_OP_ATOMIC_INC;
   case BRW_AOP_DEC:
      return LSC_OP_ATOMIC_DEC;
   case BRW_AOP_ADD:
      return LSC_OP_ATOMIC_ADD;
   case BRW_AOP_SUB:
      return LSC_OP_ATOMIC_SUB;
   case BRW_AOP_IMAX:
      return LSC_OP_ATOMIC_MAX;
   case BRW_AOP_IMIN:
      return LSC_OP_ATOMIC_MIN;
   case BRW_AOP_UMAX:
      return LSC_OP_ATOMIC_UMAX;
   case BRW_AOP_UMIN:
      return LSC_OP_ATOMIC_UMIN;
   case BRW_AOP_CMPWR:
      return LSC_OP_ATOMIC_CMPXCHG;
   default:
      assert(false);
      unreachable("invalid atomic opcode");
   }
}

static enum lsc_opcode
brw_atomic_op_to_lsc_fatomic_op(uint32_t aop)
{
   switch(aop) {
   case BRW_AOP_FMAX:
      return LSC_OP_ATOMIC_FMAX;
   case BRW_AOP_FMIN:
      return LSC_OP_ATOMIC_FMIN;
   case BRW_AOP_FCMPWR:
      return LSC_OP_ATOMIC_FCMPXCHG;
   case BRW_AOP_FADD:
      return LSC_OP_ATOMIC_FADD;
   default:
      unreachable("Unsupported float atomic opcode");
   }
}

static enum lsc_data_size
lsc_bits_to_data_size(unsigned bit_size)
{
   switch (bit_size / 8) {
   case 1: return LSC_DATA_SIZE_D8U32;
   case 2: return LSC_DATA_SIZE_D16U32;
   case 4: return LSC_DATA_SIZE_D32;
   case 8: return LSC_DATA_SIZE_D64;
   default:
      unreachable("Unsupported data size.");
   }
}

static void
lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->has_lsc);

   /* Get the logical send arguments. */
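   /* Note that unlike the legacy data-port path above, the LSC path never
    * uses a message header: sample masking is handled purely through
    * predication and the surface information goes in the extended
    * descriptor.
    */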
*/ 1678 const fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS]; 1679 const fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA]; 1680 const fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE]; 1681 const fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE]; 1682 const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS]; 1683 const fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG]; 1684 const fs_reg allow_sample_mask = 1685 inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK]; 1686 assert(arg.file == IMM); 1687 assert(allow_sample_mask.file == IMM); 1688 1689 /* Calculate the total number of components of the payload. */ 1690 const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS); 1691 const unsigned src_comps = inst->components_read(SURFACE_LOGICAL_SRC_DATA); 1692 const unsigned src_sz = type_sz(src.type); 1693 1694 const bool has_side_effects = inst->has_side_effects(); 1695 1696 unsigned ex_mlen = 0; 1697 fs_reg payload, payload2; 1698 payload = bld.move_to_vgrf(addr, addr_sz); 1699 if (src.file != BAD_FILE) { 1700 payload2 = bld.move_to_vgrf(src, src_comps); 1701 ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE; 1702 } 1703 1704 /* Predicate the instruction on the sample mask if needed */ 1705 fs_reg sample_mask = allow_sample_mask.ud ? brw_sample_mask_reg(bld) : 1706 fs_reg(brw_imm_d(0xffff)); 1707 if (sample_mask.file != BAD_FILE && sample_mask.file != IMM) 1708 brw_emit_predicate_on_sample_mask(bld, inst); 1709 1710 if (surface.file == IMM && surface.ud == GFX7_BTI_SLM) 1711 inst->sfid = GFX12_SFID_SLM; 1712 else 1713 inst->sfid = GFX12_SFID_UGM; 1714 1715 /* We must have exactly one of surface and surface_handle */ 1716 assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE)); 1717 1718 enum lsc_addr_surface_type surf_type; 1719 if (surface_handle.file != BAD_FILE) 1720 surf_type = LSC_ADDR_SURFTYPE_BSS; 1721 else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM) 1722 surf_type = LSC_ADDR_SURFTYPE_FLAT; 1723 else 1724 surf_type = LSC_ADDR_SURFTYPE_BTI; 1725 1726 switch (inst->opcode) { 1727 case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: 1728 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size, 1729 surf_type, LSC_ADDR_SIZE_A32, 1730 1 /* num_coordinates */, 1731 LSC_DATA_SIZE_D32, arg.ud /* num_channels */, 1732 false /* transpose */, 1733 LSC_CACHE_LOAD_L1STATE_L3MOCS, 1734 true /* has_dest */); 1735 break; 1736 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: 1737 inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size, 1738 surf_type, LSC_ADDR_SIZE_A32, 1739 1 /* num_coordinates */, 1740 LSC_DATA_SIZE_D32, arg.ud /* num_channels */, 1741 false /* transpose */, 1742 LSC_CACHE_STORE_L1STATE_L3MOCS, 1743 false /* has_dest */); 1744 break; 1745 case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: 1746 case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: { 1747 /* Bspec: Atomic instruction -> Cache section: 1748 * 1749 * Atomic messages are always forced to "un-cacheable" in the L1 1750 * cache. 1751 */ 1752 enum lsc_opcode opcode = 1753 inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL ? 
1754 brw_atomic_op_to_lsc_fatomic_op(arg.ud) : 1755 brw_atomic_op_to_lsc_atomic_op(arg.ud); 1756 inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size, 1757 surf_type, LSC_ADDR_SIZE_A32, 1758 1 /* num_coordinates */, 1759 lsc_bits_to_data_size(src_sz * 8), 1760 1 /* num_channels */, 1761 false /* transpose */, 1762 LSC_CACHE_STORE_L1UC_L3WB, 1763 !inst->dst.is_null()); 1764 break; 1765 } 1766 case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: 1767 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size, 1768 surf_type, LSC_ADDR_SIZE_A32, 1769 1 /* num_coordinates */, 1770 lsc_bits_to_data_size(arg.ud), 1771 1 /* num_channels */, 1772 false /* transpose */, 1773 LSC_CACHE_LOAD_L1STATE_L3MOCS, 1774 true /* has_dest */); 1775 break; 1776 case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: 1777 inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size, 1778 surf_type, LSC_ADDR_SIZE_A32, 1779 1 /* num_coordinates */, 1780 lsc_bits_to_data_size(arg.ud), 1781 1 /* num_channels */, 1782 false /* transpose */, 1783 LSC_CACHE_STORE_L1STATE_L3MOCS, 1784 false /* has_dest */); 1785 break; 1786 default: 1787 unreachable("Unknown surface logical instruction"); 1788 } 1789 1790 inst->src[0] = brw_imm_ud(0); 1791 1792 /* Set up extended descriptors */ 1793 switch (surf_type) { 1794 case LSC_ADDR_SURFTYPE_FLAT: 1795 inst->src[1] = brw_imm_ud(0); 1796 break; 1797 case LSC_ADDR_SURFTYPE_BSS: 1798 /* We assume that the driver provided the handle in the top 20 bits so 1799 * we can use the surface handle directly as the extended descriptor. 1800 */ 1801 inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD); 1802 break; 1803 case LSC_ADDR_SURFTYPE_BTI: 1804 if (surface.file == IMM) { 1805 inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud)); 1806 } else { 1807 const fs_builder ubld = bld.exec_all().group(1, 0); 1808 fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD); 1809 ubld.SHL(tmp, surface, brw_imm_ud(24)); 1810 inst->src[1] = component(tmp, 0); 1811 } 1812 break; 1813 default: 1814 unreachable("Unknown surface type"); 1815 } 1816 1817 /* Update the original instruction. */ 1818 inst->opcode = SHADER_OPCODE_SEND; 1819 inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); 1820 inst->ex_mlen = ex_mlen; 1821 inst->header_size = 0; 1822 inst->send_has_side_effects = has_side_effects; 1823 inst->send_is_volatile = !has_side_effects; 1824 1825 inst->resize_sources(4); 1826 1827 /* Finally, the payload */ 1828 inst->src[2] = payload; 1829 inst->src[3] = payload2; 1830} 1831 1832static void 1833lower_surface_block_logical_send(const fs_builder &bld, fs_inst *inst) 1834{ 1835 const intel_device_info *devinfo = bld.shader->devinfo; 1836 assert(devinfo->ver >= 9); 1837 1838 /* Get the logical send arguments. 
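 *
 * Unlike the scattered/untyped messages handled above, the oword block
 * messages take a single scalar address that lives in the message header
 * rather than in a per-lane address payload: the code below writes it into
 * dword 2 of the header, shifted right by 4 into oword units when the
 * access is 16B-aligned (e.g. a byte offset of 0x40 becomes 0x4) and
 * copied as-is otherwise.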
*/
1839 const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
1840 const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];
1841 const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
1842 const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
1843 const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
1844 assert(arg.file == IMM);
1845 assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
1846 assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);
1847
1848 const bool is_stateless =
1849 surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
1850 surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
1851
1852 const bool has_side_effects = inst->has_side_effects();
1853
1854 const bool align_16B =
1855 inst->opcode != SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL;
1856
1857 const bool write = inst->opcode == SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;
1858
1859 /* The address is stored in the header. See MH_A32_GO and MH_BTS_GO. */
1860 fs_builder ubld = bld.exec_all().group(8, 0);
1861 fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
1862
1863 if (is_stateless)
1864 ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
1865 else
1866 ubld.MOV(header, brw_imm_d(0));
1867
1868 /* Address in OWord units when aligned to OWords. */
1869 if (align_16B)
1870 ubld.group(1, 0).SHR(component(header, 2), addr, brw_imm_ud(4));
1871 else
1872 ubld.group(1, 0).MOV(component(header, 2), addr);
1873
1874 fs_reg data;
1875 unsigned ex_mlen = 0;
1876 if (write) {
1877 const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
1878 data = retype(bld.move_to_vgrf(src, src_sz), BRW_REGISTER_TYPE_UD);
1879 ex_mlen = src_sz * type_sz(src.type) * inst->exec_size / REG_SIZE;
1880 }
1881
1882 inst->opcode = SHADER_OPCODE_SEND;
1883 inst->mlen = 1;
1884 inst->ex_mlen = ex_mlen;
1885 inst->header_size = 1;
1886 inst->send_has_side_effects = has_side_effects;
1887 inst->send_is_volatile = !has_side_effects;
1888
1889 inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
1890
1891 const uint32_t desc = brw_dp_oword_block_rw_desc(devinfo, align_16B,
1892 arg.ud, write);
1893 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
1894
1895 inst->resize_sources(4);
1896
1897 inst->src[2] = header;
1898 inst->src[3] = data;
1899}
1900
1901static fs_reg
1902emit_a64_oword_block_header(const fs_builder &bld, const fs_reg &addr)
1903{
1904 const fs_builder ubld = bld.exec_all().group(8, 0);
1905
1906 assert(type_sz(addr.type) == 8 && addr.stride == 0);
1907
1908 fs_reg expanded_addr = addr;
1909 if (addr.file == UNIFORM) {
1910 /* We can't do stride 1 with the UNIFORM file, it requires stride 0 */
1911 expanded_addr = ubld.vgrf(BRW_REGISTER_TYPE_UQ);
1912 expanded_addr.stride = 0;
1913 ubld.MOV(expanded_addr, retype(addr, BRW_REGISTER_TYPE_UQ));
1914 }
1915
1916 fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
1917 ubld.MOV(header, brw_imm_ud(0));
1918
1919 /* Use a 2-wide MOV to fill out the address */
1920 fs_reg addr_vec2 = expanded_addr;
1921 addr_vec2.type = BRW_REGISTER_TYPE_UD;
1922 addr_vec2.stride = 1;
1923 ubld.group(2, 0).MOV(header, addr_vec2);
1924
1925 return header;
1926}
1927
1928static void
1929emit_fragment_mask(const fs_builder &bld, fs_inst *inst)
1930{
1931 assert(inst->src[A64_LOGICAL_ENABLE_HELPERS].file == IMM);
1932 const bool enable_helpers = inst->src[A64_LOGICAL_ENABLE_HELPERS].ud;
1933
1934 /* If we're a fragment shader, we have to predicate with the sample mask to
1935 * avoid
helper invocations in instructions
1936 * with side effects, unless they are explicitly required.
1937 *
1938 * There are also special cases when we actually want to run on helpers
1939 * (ray queries).
1940 */
1941 assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
1942 if (enable_helpers)
1943 emit_predicate_on_vector_mask(bld, inst);
1944 else if (inst->has_side_effects())
1945 brw_emit_predicate_on_sample_mask(bld, inst);
1946}
1947
1948static void
1949lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
1950{
1951 const intel_device_info *devinfo = bld.shader->devinfo;
1952
1953 /* Get the logical send arguments. */
1954 const fs_reg &addr = inst->src[A64_LOGICAL_ADDRESS];
1955 const fs_reg &src = inst->src[A64_LOGICAL_SRC];
1956 const unsigned src_sz = type_sz(src.type);
1957
1958 const unsigned src_comps = inst->components_read(1);
1959 assert(inst->src[A64_LOGICAL_ARG].file == IMM);
1960 const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
1961 const bool has_side_effects = inst->has_side_effects();
1962
1963 fs_reg payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
1964 fs_reg payload2 = retype(bld.move_to_vgrf(src, src_comps),
1965 BRW_REGISTER_TYPE_UD);
1966 unsigned ex_mlen = src_comps * src_sz * inst->exec_size / REG_SIZE;
1967
1968 switch (inst->opcode) {
1969 case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
1970 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
1971 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
1972 1 /* num_coordinates */,
1973 LSC_DATA_SIZE_D32, arg /* num_channels */,
1974 false /* transpose */,
1975 LSC_CACHE_LOAD_L1STATE_L3MOCS,
1976 true /* has_dest */);
1977 break;
1978 case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
1979 inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
1980 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
1981 1 /* num_coordinates */,
1982 LSC_DATA_SIZE_D32, arg /* num_channels */,
1983 false /* transpose */,
1984 LSC_CACHE_STORE_L1STATE_L3MOCS,
1985 false /* has_dest */);
1986 break;
1987 case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
1988 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
1989 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
1990 1 /* num_coordinates */,
1991 lsc_bits_to_data_size(arg),
1992 1 /* num_channels */,
1993 false /* transpose */,
1994 LSC_CACHE_LOAD_L1STATE_L3MOCS,
1995 true /* has_dest */);
1996 break;
1997 case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
1998 inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
1999 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
2000 1 /* num_coordinates */,
2001 lsc_bits_to_data_size(arg),
2002 1 /* num_channels */,
2003 false /* transpose */,
2004 LSC_CACHE_STORE_L1STATE_L3MOCS,
2005 false /* has_dest */);
2006 break;
2007 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
2008 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
2009 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
2010 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
2011 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
2012 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL: {
2013 /* Bspec: Atomic instruction -> Cache section:
2014 *
2015 * Atomic messages are always forced to "un-cacheable" in the L1
2016 * cache.
2017 */
2018 enum lsc_opcode opcode =
2019 (inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL ||
2020 inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL ||
2021 inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL) ?
2022 brw_atomic_op_to_lsc_atomic_op(arg) : 2023 brw_atomic_op_to_lsc_fatomic_op(arg); 2024 inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size, 2025 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64, 2026 1 /* num_coordinates */, 2027 lsc_bits_to_data_size(src_sz * 8), 2028 1 /* num_channels */, 2029 false /* transpose */, 2030 LSC_CACHE_STORE_L1UC_L3WB, 2031 !inst->dst.is_null()); 2032 break; 2033 } 2034 default: 2035 unreachable("Unknown A64 logical instruction"); 2036 } 2037 2038 if (bld.shader->stage == MESA_SHADER_FRAGMENT) 2039 emit_fragment_mask(bld, inst); 2040 2041 /* Update the original instruction. */ 2042 inst->opcode = SHADER_OPCODE_SEND; 2043 inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); 2044 inst->ex_mlen = ex_mlen; 2045 inst->header_size = 0; 2046 inst->send_has_side_effects = has_side_effects; 2047 inst->send_is_volatile = !has_side_effects; 2048 2049 /* Set up SFID and descriptors */ 2050 inst->sfid = GFX12_SFID_UGM; 2051 inst->resize_sources(4); 2052 inst->src[0] = brw_imm_ud(0); /* desc */ 2053 inst->src[1] = brw_imm_ud(0); /* ex_desc */ 2054 inst->src[2] = payload; 2055 inst->src[3] = payload2; 2056} 2057 2058static void 2059lower_a64_logical_send(const fs_builder &bld, fs_inst *inst) 2060{ 2061 const intel_device_info *devinfo = bld.shader->devinfo; 2062 2063 const fs_reg &addr = inst->src[A64_LOGICAL_ADDRESS]; 2064 const fs_reg &src = inst->src[A64_LOGICAL_SRC]; 2065 const unsigned src_comps = inst->components_read(1); 2066 assert(inst->src[A64_LOGICAL_ARG].file == IMM); 2067 const unsigned arg = inst->src[A64_LOGICAL_ARG].ud; 2068 const bool has_side_effects = inst->has_side_effects(); 2069 2070 fs_reg payload, payload2; 2071 unsigned mlen, ex_mlen = 0, header_size = 0; 2072 if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL || 2073 inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL || 2074 inst->opcode == SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL) { 2075 assert(devinfo->ver >= 9); 2076 2077 /* OWORD messages only take a scalar address in a header */ 2078 mlen = 1; 2079 header_size = 1; 2080 payload = emit_a64_oword_block_header(bld, addr); 2081 2082 if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL) { 2083 ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE; 2084 payload2 = retype(bld.move_to_vgrf(src, src_comps), 2085 BRW_REGISTER_TYPE_UD); 2086 } 2087 } else if (devinfo->ver >= 9) { 2088 /* On Skylake and above, we have SENDS */ 2089 mlen = 2 * (inst->exec_size / 8); 2090 ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE; 2091 payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD); 2092 payload2 = retype(bld.move_to_vgrf(src, src_comps), 2093 BRW_REGISTER_TYPE_UD); 2094 } else { 2095 /* Add two because the address is 64-bit */ 2096 const unsigned dwords = 2 + src_comps; 2097 mlen = dwords * (inst->exec_size / 8); 2098 2099 fs_reg sources[5]; 2100 2101 sources[0] = addr; 2102 2103 for (unsigned i = 0; i < src_comps; i++) 2104 sources[1 + i] = offset(src, bld, i); 2105 2106 payload = bld.vgrf(BRW_REGISTER_TYPE_UD, dwords); 2107 bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0); 2108 } 2109 2110 uint32_t desc; 2111 switch (inst->opcode) { 2112 case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: 2113 desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size, 2114 arg, /* num_channels */ 2115 false /* write */); 2116 break; 2117 2118 case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: 2119 desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size, 2120 arg, 
/* num_channels */ 2121 true /* write */); 2122 break; 2123 2124 case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: 2125 desc = brw_dp_a64_oword_block_rw_desc(devinfo, 2126 true, /* align_16B */ 2127 arg, /* num_dwords */ 2128 false /* write */); 2129 break; 2130 2131 case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: 2132 desc = brw_dp_a64_oword_block_rw_desc(devinfo, 2133 false, /* align_16B */ 2134 arg, /* num_dwords */ 2135 false /* write */); 2136 break; 2137 2138 case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: 2139 desc = brw_dp_a64_oword_block_rw_desc(devinfo, 2140 true, /* align_16B */ 2141 arg, /* num_dwords */ 2142 true /* write */); 2143 break; 2144 2145 case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: 2146 desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size, 2147 arg, /* bit_size */ 2148 false /* write */); 2149 break; 2150 2151 case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL: 2152 desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size, 2153 arg, /* bit_size */ 2154 true /* write */); 2155 break; 2156 2157 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: 2158 desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 32, 2159 arg, /* atomic_op */ 2160 !inst->dst.is_null()); 2161 break; 2162 2163 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL: 2164 desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 16, 2165 arg, /* atomic_op */ 2166 !inst->dst.is_null()); 2167 break; 2168 2169 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL: 2170 desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 64, 2171 arg, /* atomic_op */ 2172 !inst->dst.is_null()); 2173 break; 2174 2175 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL: 2176 desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size, 2177 16, /* bit_size */ 2178 arg, /* atomic_op */ 2179 !inst->dst.is_null()); 2180 break; 2181 2182 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL: 2183 desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size, 2184 32, /* bit_size */ 2185 arg, /* atomic_op */ 2186 !inst->dst.is_null()); 2187 break; 2188 2189 default: 2190 unreachable("Unknown A64 logical instruction"); 2191 } 2192 2193 if (bld.shader->stage == MESA_SHADER_FRAGMENT) 2194 emit_fragment_mask(bld, inst); 2195 2196 /* Update the original instruction. */ 2197 inst->opcode = SHADER_OPCODE_SEND; 2198 inst->mlen = mlen; 2199 inst->ex_mlen = ex_mlen; 2200 inst->header_size = header_size; 2201 inst->send_has_side_effects = has_side_effects; 2202 inst->send_is_volatile = !has_side_effects; 2203 2204 /* Set up SFID and descriptors */ 2205 inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1; 2206 inst->desc = desc; 2207 inst->resize_sources(4); 2208 inst->src[0] = brw_imm_ud(0); /* desc */ 2209 inst->src[1] = brw_imm_ud(0); /* ex_desc */ 2210 inst->src[2] = payload; 2211 inst->src[3] = payload2; 2212} 2213 2214static void 2215lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld, 2216 fs_inst *inst) 2217{ 2218 const intel_device_info *devinfo = bld.shader->devinfo; 2219 ASSERTED const brw_compiler *compiler = bld.shader->compiler; 2220 2221 fs_reg index = inst->src[0]; 2222 2223 /* We are switching the instruction from an ALU-like instruction to a 2224 * send-from-grf instruction. Since sends can't handle strides or 2225 * source modifiers, we have to make a copy of the offset source. 
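 *
 * A sketch of the strategy that follows: when the load is known to be at
 * least vec4-aligned (alignment >= 4), a single LSC LOAD_CMASK with four
 * channels fetches the whole vec4 in one message; otherwise a one-channel
 * LOAD is emitted once per component, with the offset source bumped by
 * 4 bytes each time (ubo_offset + 4 * c for c = 1..3) and the destination
 * advanced by one SIMD-width component per iteration.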
2226 */ 2227 fs_reg ubo_offset = bld.move_to_vgrf(inst->src[1], 1); 2228 2229 assert(inst->src[2].file == BRW_IMMEDIATE_VALUE); 2230 unsigned alignment = inst->src[2].ud; 2231 2232 inst->opcode = SHADER_OPCODE_SEND; 2233 inst->sfid = GFX12_SFID_UGM; 2234 inst->resize_sources(3); 2235 inst->src[0] = brw_imm_ud(0); 2236 2237 if (index.file == IMM) { 2238 inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, index.ud)); 2239 } else { 2240 const fs_builder ubld = bld.exec_all().group(1, 0); 2241 fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD); 2242 ubld.SHL(tmp, index, brw_imm_ud(24)); 2243 inst->src[1] = component(tmp, 0); 2244 } 2245 2246 assert(!compiler->indirect_ubos_use_sampler); 2247 2248 inst->src[2] = ubo_offset; /* payload */ 2249 if (alignment >= 4) { 2250 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size, 2251 LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32, 2252 1 /* num_coordinates */, 2253 LSC_DATA_SIZE_D32, 2254 4 /* num_channels */, 2255 false /* transpose */, 2256 LSC_CACHE_LOAD_L1STATE_L3MOCS, 2257 true /* has_dest */); 2258 inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); 2259 } else { 2260 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size, 2261 LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32, 2262 1 /* num_coordinates */, 2263 LSC_DATA_SIZE_D32, 2264 1 /* num_channels */, 2265 false /* transpose */, 2266 LSC_CACHE_LOAD_L1STATE_L3MOCS, 2267 true /* has_dest */); 2268 inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); 2269 /* The byte scattered messages can only read one dword at a time so 2270 * we have to duplicate the message 4 times to read the full vec4. 2271 * Hopefully, dead code will clean up the mess if some of them aren't 2272 * needed. 2273 */ 2274 assert(inst->size_written == 16 * inst->exec_size); 2275 inst->size_written /= 4; 2276 for (unsigned c = 1; c < 4; c++) { 2277 /* Emit a copy of the instruction because we're about to modify 2278 * it. Because this loop starts at 1, we will emit copies for the 2279 * first 3 and the final one will be the modified instruction. 2280 */ 2281 bld.emit(*inst); 2282 2283 /* Offset the source */ 2284 inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD); 2285 bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4)); 2286 2287 /* Offset the destination */ 2288 inst->dst = offset(inst->dst, bld, 1); 2289 } 2290 } 2291} 2292 2293static void 2294lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst) 2295{ 2296 const intel_device_info *devinfo = bld.shader->devinfo; 2297 const brw_compiler *compiler = bld.shader->compiler; 2298 2299 if (devinfo->ver >= 7) { 2300 fs_reg index = inst->src[0]; 2301 /* We are switching the instruction from an ALU-like instruction to a 2302 * send-from-grf instruction. Since sends can't handle strides or 2303 * source modifiers, we have to make a copy of the offset source. 
2304 */ 2305 fs_reg ubo_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); 2306 bld.MOV(ubo_offset, inst->src[1]); 2307 2308 assert(inst->src[2].file == BRW_IMMEDIATE_VALUE); 2309 unsigned alignment = inst->src[2].ud; 2310 2311 inst->opcode = SHADER_OPCODE_SEND; 2312 inst->mlen = inst->exec_size / 8; 2313 inst->resize_sources(3); 2314 2315 if (index.file == IMM) { 2316 inst->desc = index.ud & 0xff; 2317 inst->src[0] = brw_imm_ud(0); 2318 } else { 2319 inst->desc = 0; 2320 const fs_builder ubld = bld.exec_all().group(1, 0); 2321 fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD); 2322 ubld.AND(tmp, index, brw_imm_ud(0xff)); 2323 inst->src[0] = component(tmp, 0); 2324 } 2325 inst->src[1] = brw_imm_ud(0); /* ex_desc */ 2326 inst->src[2] = ubo_offset; /* payload */ 2327 2328 if (compiler->indirect_ubos_use_sampler) { 2329 const unsigned simd_mode = 2330 inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 : 2331 BRW_SAMPLER_SIMD_MODE_SIMD16; 2332 2333 inst->sfid = BRW_SFID_SAMPLER; 2334 inst->desc |= brw_sampler_desc(devinfo, 0, 0, 2335 GFX5_SAMPLER_MESSAGE_SAMPLE_LD, 2336 simd_mode, 0); 2337 } else if (alignment >= 4) { 2338 inst->sfid = (devinfo->verx10 >= 75 ? 2339 HSW_SFID_DATAPORT_DATA_CACHE_1 : 2340 GFX7_SFID_DATAPORT_DATA_CACHE); 2341 inst->desc |= brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size, 2342 4, /* num_channels */ 2343 false /* write */); 2344 } else { 2345 inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE; 2346 inst->desc |= brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size, 2347 32, /* bit_size */ 2348 false /* write */); 2349 /* The byte scattered messages can only read one dword at a time so 2350 * we have to duplicate the message 4 times to read the full vec4. 2351 * Hopefully, dead code will clean up the mess if some of them aren't 2352 * needed. 2353 */ 2354 assert(inst->size_written == 16 * inst->exec_size); 2355 inst->size_written /= 4; 2356 for (unsigned c = 1; c < 4; c++) { 2357 /* Emit a copy of the instruction because we're about to modify 2358 * it. Because this loop starts at 1, we will emit copies for the 2359 * first 3 and the final one will be the modified instruction. 2360 */ 2361 bld.emit(*inst); 2362 2363 /* Offset the source */ 2364 inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD); 2365 bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4)); 2366 2367 /* Offset the destination */ 2368 inst->dst = offset(inst->dst, bld, 1); 2369 } 2370 } 2371 } else { 2372 const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->ver), 2373 BRW_REGISTER_TYPE_UD); 2374 2375 bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]); 2376 2377 inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4; 2378 inst->resize_sources(1); 2379 inst->base_mrf = payload.nr; 2380 inst->header_size = 1; 2381 inst->mlen = 1 + inst->exec_size / 8; 2382 } 2383} 2384 2385static void 2386lower_math_logical_send(const fs_builder &bld, fs_inst *inst) 2387{ 2388 assert(bld.shader->devinfo->ver < 6); 2389 2390 inst->base_mrf = 2; 2391 inst->mlen = inst->sources * inst->exec_size / 8; 2392 2393 if (inst->sources > 1) { 2394 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13 2395 * "Message Payload": 2396 * 2397 * "Operand0[7]. For the INT DIV functions, this operand is the 2398 * denominator." 2399 * ... 2400 * "Operand1[7]. For the INT DIV functions, this operand is the 2401 * numerator." 2402 */ 2403 const bool is_int_div = inst->opcode != SHADER_OPCODE_POW; 2404 const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0]; 2405 const fs_reg src1 = is_int_div ? 
inst->src[0] : inst->src[1]; 2406 2407 inst->resize_sources(1); 2408 inst->src[0] = src0; 2409 2410 assert(inst->exec_size == 8); 2411 bld.MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), src1); 2412 } 2413} 2414 2415static void 2416lower_btd_logical_send(const fs_builder &bld, fs_inst *inst) 2417{ 2418 const intel_device_info *devinfo = bld.shader->devinfo; 2419 fs_reg global_addr = inst->src[0]; 2420 const fs_reg &btd_record = inst->src[1]; 2421 2422 const unsigned mlen = 2; 2423 const fs_builder ubld = bld.exec_all().group(8, 0); 2424 fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); 2425 2426 ubld.MOV(header, brw_imm_ud(0)); 2427 switch (inst->opcode) { 2428 case SHADER_OPCODE_BTD_SPAWN_LOGICAL: 2429 assert(type_sz(global_addr.type) == 8 && global_addr.stride == 0); 2430 global_addr.type = BRW_REGISTER_TYPE_UD; 2431 global_addr.stride = 1; 2432 ubld.group(2, 0).MOV(header, global_addr); 2433 break; 2434 2435 case SHADER_OPCODE_BTD_RETIRE_LOGICAL: 2436 /* The bottom bit is the Stack ID release bit */ 2437 ubld.group(1, 0).MOV(header, brw_imm_ud(1)); 2438 break; 2439 2440 default: 2441 unreachable("Invalid BTD message"); 2442 } 2443 2444 /* Stack IDs are always in R1 regardless of whether we're coming from a 2445 * bindless shader or a regular compute shader. 2446 */ 2447 fs_reg stack_ids = 2448 retype(byte_offset(header, REG_SIZE), BRW_REGISTER_TYPE_UW); 2449 bld.MOV(stack_ids, retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW)); 2450 2451 unsigned ex_mlen = 0; 2452 fs_reg payload; 2453 if (inst->opcode == SHADER_OPCODE_BTD_SPAWN_LOGICAL) { 2454 ex_mlen = 2 * (inst->exec_size / 8); 2455 payload = bld.move_to_vgrf(btd_record, 1); 2456 } else { 2457 assert(inst->opcode == SHADER_OPCODE_BTD_RETIRE_LOGICAL); 2458 /* All these messages take a BTD and things complain if we don't provide 2459 * one for RETIRE. However, it shouldn't ever actually get used so fill 2460 * it with zero. 2461 */ 2462 ex_mlen = 2 * (inst->exec_size / 8); 2463 payload = bld.move_to_vgrf(brw_imm_uq(0), 1); 2464 } 2465 2466 /* Update the original instruction. */ 2467 inst->opcode = SHADER_OPCODE_SEND; 2468 inst->mlen = mlen; 2469 inst->ex_mlen = ex_mlen; 2470 inst->header_size = 0; /* HW docs require has_header = false */ 2471 inst->send_has_side_effects = true; 2472 inst->send_is_volatile = false; 2473 2474 /* Set up SFID and descriptors */ 2475 inst->sfid = GEN_RT_SFID_BINDLESS_THREAD_DISPATCH; 2476 inst->desc = brw_btd_spawn_desc(devinfo, inst->exec_size, 2477 GEN_RT_BTD_MESSAGE_SPAWN); 2478 inst->resize_sources(4); 2479 inst->src[0] = brw_imm_ud(0); /* desc */ 2480 inst->src[1] = brw_imm_ud(0); /* ex_desc */ 2481 inst->src[2] = header; 2482 inst->src[3] = payload; 2483} 2484 2485static void 2486lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst) 2487{ 2488 const intel_device_info *devinfo = bld.shader->devinfo; 2489 /* The emit_uniformize() in brw_fs_nir.cpp will generate an horizontal 2490 * stride of 0. Below we're doing a MOV() in SIMD2. Since we can't use UQ/Q 2491 * types in on Gfx12.5, we need to tweak the stride with a value of 1 dword 2492 * so that the MOV operates on 2 components rather than twice the same 2493 * component. 2494 */ 2495 fs_reg globals_addr = retype(inst->src[RT_LOGICAL_SRC_GLOBALS], BRW_REGISTER_TYPE_UD); 2496 globals_addr.stride = 1; 2497 const fs_reg &bvh_level = 2498 inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == BRW_IMMEDIATE_VALUE ? 
2499 inst->src[RT_LOGICAL_SRC_BVH_LEVEL] : 2500 bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL], 2501 inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL)); 2502 const fs_reg &trace_ray_control = 2503 inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == BRW_IMMEDIATE_VALUE ? 2504 inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] : 2505 bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL], 2506 inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL)); 2507 const fs_reg &synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS]; 2508 assert(synchronous_src.file == BRW_IMMEDIATE_VALUE); 2509 const bool synchronous = synchronous_src.ud; 2510 2511 const unsigned mlen = 1; 2512 const fs_builder ubld = bld.exec_all().group(8, 0); 2513 fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD); 2514 ubld.MOV(header, brw_imm_ud(0)); 2515 ubld.group(2, 0).MOV(header, globals_addr); 2516 if (synchronous) 2517 ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous)); 2518 2519 const unsigned ex_mlen = inst->exec_size / 8; 2520 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD); 2521 if (bvh_level.file == BRW_IMMEDIATE_VALUE && 2522 trace_ray_control.file == BRW_IMMEDIATE_VALUE) { 2523 bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, 9, 8) | 2524 (bvh_level.ud & 0x7))); 2525 } else { 2526 bld.SHL(payload, trace_ray_control, brw_imm_ud(8)); 2527 bld.OR(payload, payload, bvh_level); 2528 } 2529 2530 /* When doing synchronous traversal, the HW implicitly computes the 2531 * stack_id using the following formula : 2532 * 2533 * EUID[3:0] & THREAD_ID[2:0] & SIMD_LANE_ID[3:0] 2534 * 2535 * Only in the asynchronous case we need to set the stack_id given from the 2536 * payload register. 2537 */ 2538 if (!synchronous) { 2539 bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1), 2540 retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW), 2541 brw_imm_uw(0x7ff)); 2542 } 2543 2544 /* Update the original instruction. 
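 *
 * The resulting SEND, as set up above, has a one-GRF header holding the
 * RT globals address (plus the synchronous bit in dword 4) and a per-lane
 * payload packing the BVH level in bits 2:0 and the trace-ray control in
 * bits 9:8, with the stack ID placed in the upper word for the
 * asynchronous case.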
*/ 2545 inst->opcode = SHADER_OPCODE_SEND; 2546 inst->mlen = mlen; 2547 inst->ex_mlen = ex_mlen; 2548 inst->header_size = 0; /* HW docs require has_header = false */ 2549 inst->send_has_side_effects = true; 2550 inst->send_is_volatile = false; 2551 2552 /* Set up SFID and descriptors */ 2553 inst->sfid = GEN_RT_SFID_RAY_TRACE_ACCELERATOR; 2554 inst->desc = brw_rt_trace_ray_desc(devinfo, inst->exec_size); 2555 inst->resize_sources(4); 2556 inst->src[0] = brw_imm_ud(0); /* desc */ 2557 inst->src[1] = brw_imm_ud(0); /* ex_desc */ 2558 inst->src[2] = header; 2559 inst->src[3] = payload; 2560} 2561 2562bool 2563fs_visitor::lower_logical_sends() 2564{ 2565 bool progress = false; 2566 2567 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { 2568 const fs_builder ibld(this, block, inst); 2569 2570 switch (inst->opcode) { 2571 case FS_OPCODE_FB_WRITE_LOGICAL: 2572 assert(stage == MESA_SHADER_FRAGMENT); 2573 lower_fb_write_logical_send(ibld, inst, 2574 brw_wm_prog_data(prog_data), 2575 (const brw_wm_prog_key *)key, 2576 payload); 2577 break; 2578 2579 case FS_OPCODE_FB_READ_LOGICAL: 2580 lower_fb_read_logical_send(ibld, inst); 2581 break; 2582 2583 case SHADER_OPCODE_TEX_LOGICAL: 2584 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX); 2585 break; 2586 2587 case SHADER_OPCODE_TXD_LOGICAL: 2588 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD); 2589 break; 2590 2591 case SHADER_OPCODE_TXF_LOGICAL: 2592 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF); 2593 break; 2594 2595 case SHADER_OPCODE_TXL_LOGICAL: 2596 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL); 2597 break; 2598 2599 case SHADER_OPCODE_TXS_LOGICAL: 2600 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS); 2601 break; 2602 2603 case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: 2604 lower_sampler_logical_send(ibld, inst, 2605 SHADER_OPCODE_IMAGE_SIZE_LOGICAL); 2606 break; 2607 2608 case FS_OPCODE_TXB_LOGICAL: 2609 lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB); 2610 break; 2611 2612 case SHADER_OPCODE_TXF_CMS_LOGICAL: 2613 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS); 2614 break; 2615 2616 case SHADER_OPCODE_TXF_CMS_W_LOGICAL: 2617 case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: 2618 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W); 2619 break; 2620 2621 case SHADER_OPCODE_TXF_UMS_LOGICAL: 2622 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS); 2623 break; 2624 2625 case SHADER_OPCODE_TXF_MCS_LOGICAL: 2626 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS); 2627 break; 2628 2629 case SHADER_OPCODE_LOD_LOGICAL: 2630 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD); 2631 break; 2632 2633 case SHADER_OPCODE_TG4_LOGICAL: 2634 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4); 2635 break; 2636 2637 case SHADER_OPCODE_TG4_OFFSET_LOGICAL: 2638 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET); 2639 break; 2640 2641 case SHADER_OPCODE_SAMPLEINFO_LOGICAL: 2642 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO); 2643 break; 2644 2645 case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: 2646 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: 2647 case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: 2648 case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: 2649 case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: 2650 case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: 2651 if (devinfo->has_lsc) { 2652 lower_lsc_surface_logical_send(ibld, inst); 2653 break; 2654 } 2655 case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: 2656 case 
SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL: 2657 case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: 2658 case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: 2659 case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: 2660 lower_surface_logical_send(ibld, inst); 2661 break; 2662 2663 case SHADER_OPCODE_OWORD_BLOCK_READ_LOGICAL: 2664 case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: 2665 case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL: 2666 lower_surface_block_logical_send(ibld, inst); 2667 break; 2668 2669 case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: 2670 case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: 2671 case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL: 2672 case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: 2673 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: 2674 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL: 2675 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL: 2676 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL: 2677 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL: 2678 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL: 2679 if (devinfo->has_lsc) { 2680 lower_lsc_a64_logical_send(ibld, inst); 2681 break; 2682 } 2683 case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: 2684 case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: 2685 case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: 2686 lower_a64_logical_send(ibld, inst); 2687 break; 2688 2689 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: 2690 if (devinfo->has_lsc && !compiler->indirect_ubos_use_sampler) 2691 lower_lsc_varying_pull_constant_logical_send(ibld, inst); 2692 else 2693 lower_varying_pull_constant_logical_send(ibld, inst); 2694 break; 2695 2696 case SHADER_OPCODE_RCP: 2697 case SHADER_OPCODE_RSQ: 2698 case SHADER_OPCODE_SQRT: 2699 case SHADER_OPCODE_EXP2: 2700 case SHADER_OPCODE_LOG2: 2701 case SHADER_OPCODE_SIN: 2702 case SHADER_OPCODE_COS: 2703 case SHADER_OPCODE_POW: 2704 case SHADER_OPCODE_INT_QUOTIENT: 2705 case SHADER_OPCODE_INT_REMAINDER: 2706 /* The math opcodes are overloaded for the send-like and 2707 * expression-like instructions which seems kind of icky. Gfx6+ has 2708 * a native (but rather quirky) MATH instruction so we don't need to 2709 * do anything here. On Gfx4-5 we'll have to lower the Gfx6-like 2710 * logical instructions (which we can easily recognize because they 2711 * have mlen = 0) into send-like virtual instructions. 2712 */ 2713 if (devinfo->ver < 6 && inst->mlen == 0) { 2714 lower_math_logical_send(ibld, inst); 2715 break; 2716 2717 } else { 2718 continue; 2719 } 2720 2721 case SHADER_OPCODE_BTD_SPAWN_LOGICAL: 2722 case SHADER_OPCODE_BTD_RETIRE_LOGICAL: 2723 lower_btd_logical_send(ibld, inst); 2724 break; 2725 2726 case RT_OPCODE_TRACE_RAY_LOGICAL: 2727 lower_trace_ray_logical_send(ibld, inst); 2728 break; 2729 2730 case SHADER_OPCODE_URB_READ_LOGICAL: 2731 lower_urb_read_logical_send(ibld, inst); 2732 break; 2733 2734 case SHADER_OPCODE_URB_WRITE_LOGICAL: 2735 lower_urb_write_logical_send(ibld, inst); 2736 break; 2737 2738 default: 2739 continue; 2740 } 2741 2742 progress = true; 2743 } 2744 2745 if (progress) 2746 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); 2747 2748 return progress; 2749} 2750