/*
 * Copyright © 2010, 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file brw_lower_logical_sends.cpp
 */

#include "brw_eu.h"
#include "brw_fs.h"

using namespace brw;

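/**
 * Lower a logical URB read to a SEND message on the URB SFID.  The payload
 * consists of the URB handle, optionally followed by per-slot offsets; the
 * global offset and the per-slot-offset presence are encoded in the message
 * descriptor.
 */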
static void
lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;

   assert(inst->size_written % REG_SIZE == 0);
   assert(inst->header_size == 0);

   fs_reg *payload_sources = new fs_reg[inst->mlen];
   fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(inst->mlen),
                           BRW_REGISTER_TYPE_F);

   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   bld.LOAD_PAYLOAD(payload, payload_sources, inst->mlen, header_size);

   delete [] payload_sources;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->header_size = header_size;

   inst->sfid = BRW_SFID_URB;
   inst->desc = brw_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_READ,
                             per_slot_present,
                             false,
                             inst->offset);

   inst->ex_desc = 0;
   inst->ex_mlen = 0;
   inst->send_is_volatile = true;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = brw_null_reg();
}

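/**
 * Lower a logical URB write to a SEND message on the URB SFID.  The payload
 * starts with the URB handle, optionally followed by per-slot offsets and a
 * channel mask, and ends with the data to be written.
 */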
static void
lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;
   const bool channel_mask_present =
      inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE;

   assert(inst->header_size == 0);

   fs_reg *payload_sources = new fs_reg[inst->mlen];
   fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(inst->mlen),
                           BRW_REGISTER_TYPE_F);

   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   if (channel_mask_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];

   for (unsigned i = header_size, j = 0; i < inst->mlen; i++, j++)
      payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j);

   bld.LOAD_PAYLOAD(payload, payload_sources, inst->mlen, header_size);

   delete [] payload_sources;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->header_size = header_size;
   inst->dst = brw_null_reg();

   inst->sfid = BRW_SFID_URB;
   inst->desc = brw_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_WRITE,
                             per_slot_present,
                             channel_mask_present,
                             inst->offset);

   inst->ex_desc = 0;
   inst->ex_mlen = 0;
   inst->send_has_side_effects = true;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = brw_null_reg();
}

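/**
 * Copy up to four color components into consecutive payload sources,
 * saturating each component first when the key requests clamped fragment
 * colors.
 */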
static void
setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
                    fs_reg *dst, fs_reg color, unsigned components)
{
   if (key->clamp_fragment_color) {
      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
      assert(color.type == BRW_REGISTER_TYPE_F);

      for (unsigned i = 0; i < components; i++)
         set_saturate(true,
                      bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));

      color = tmp;
   }

   for (unsigned i = 0; i < components; i++)
      dst[i] = offset(color, bld, i);
}

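/**
 * Lower a logical framebuffer write to the render target write message:
 * assemble the optional two-register header (dispatched pixel enables,
 * render target index, etc.) and the color/depth/stencil payload, then turn
 * the instruction into a SEND from the GRF (gfx7+) or an MRF-based
 * FS_OPCODE_FB_WRITE on older platforms.
 */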
static void
lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
                            const struct brw_wm_prog_data *prog_data,
                            const brw_wm_prog_key *key,
                            const fs_visitor::thread_payload &payload)
{
   assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
   const intel_device_info *devinfo = bld.shader->devinfo;
   const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
   const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
   const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
   const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
   const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
   const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
   fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
   const unsigned components =
      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;

   assert(inst->target != 0 || src0_alpha.file == BAD_FILE);

   /* We can potentially have a message length of up to 15, so we have to set
    * base_mrf to either 0 or 1 in order to fit in m0..m15.
    */
   fs_reg sources[15];
   int header_size = 2, payload_header_size;
   unsigned length = 0;

   if (devinfo->ver < 6) {
      /* TODO: Support SIMD32 on gfx4-5 */
      assert(bld.group() < 16);

      /* For gfx4-5, we always have a header consisting of g0 and g1.  We have
       * an implied MOV from g0,g1 to the start of the message.  The MOV from
       * g0 is handled by the hardware and the MOV from g1 is provided by the
       * generator.  This is required because, on gfx4-5, the generator may
       * generate two write messages with different message lengths in order
       * to handle AA data properly.
       *
       * Also, since the pixel mask goes in the g0 portion of the message and
       * since render target writes are the last thing in the shader, we write
       * the pixel mask directly into g0 and it will get copied as part of the
       * implied write.
       */
      if (prog_data->uses_kill) {
         bld.exec_all().group(1, 0)
            .MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW),
                 brw_sample_mask_reg(bld));
      }

      assert(length == 0);
      length = 2;
   } else if ((devinfo->verx10 <= 70 &&
               prog_data->uses_kill) ||
              (devinfo->ver < 11 &&
               (color1.file != BAD_FILE || key->nr_color_regions > 1))) {
      /* From the Sandy Bridge PRM, volume 4, page 198:
       *
       *     "Dispatched Pixel Enables. One bit per pixel indicating
       *      which pixels were originally enabled when the thread was
       *      dispatched. This field is only required for the end-of-
       *      thread message and on all dual-source messages."
       */
      const fs_builder ubld = bld.exec_all().group(8, 0);

      fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
      if (bld.group() < 16) {
         /* The header starts off as g0 and g1 for the first half */
         ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
                                              BRW_REGISTER_TYPE_UD));
      } else {
         /* The header starts off as g0 and g2 for the second half */
         assert(bld.group() < 32);
         const fs_reg header_sources[2] = {
            retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
            retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD),
         };
         ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);

         /* Gfx12 will require additional fix-ups if we ever hit this path. */
         assert(devinfo->ver < 12);
      }

      uint32_t g00_bits = 0;

      /* Set "Source0 Alpha Present to RenderTarget" bit in message
       * header.
       */
      if (src0_alpha.file != BAD_FILE)
         g00_bits |= 1 << 11;

      /* Set computes stencil to render target */
      if (prog_data->computed_stencil)
         g00_bits |= 1 << 14;

      if (g00_bits) {
         /* OR extra bits into g0.0 */
         ubld.group(1, 0).OR(component(header, 0),
                             retype(brw_vec1_grf(0, 0),
                                    BRW_REGISTER_TYPE_UD),
                             brw_imm_ud(g00_bits));
      }

      /* Set the render target index for choosing BLEND_STATE. */
      if (inst->target > 0) {
         ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target));
      }

      if (prog_data->uses_kill) {
         ubld.group(1, 0).MOV(retype(component(header, 15),
                                     BRW_REGISTER_TYPE_UW),
                              brw_sample_mask_reg(bld));
      }

      assert(length == 0);
      sources[0] = header;
      sources[1] = horiz_offset(header, 8);
      length = 2;
   }
   assert(length == 0 || length == 2);
   header_size = length;

   if (payload.aa_dest_stencil_reg[0]) {
      assert(inst->group < 16);
      sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
         .MOV(sources[length],
              fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg[0], 0)));
      length++;
   }

   if (src0_alpha.file != BAD_FILE) {
      for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
         const fs_builder &ubld = bld.exec_all().group(8, i)
                                    .annotate("FB write src0 alpha");
         const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_F);
         ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
         setup_color_payload(ubld, key, &sources[length], tmp, 1);
         length++;
      }
   }

   if (sample_mask.file != BAD_FILE) {
      sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
                               BRW_REGISTER_TYPE_UD);

      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
       * relevant.  Since it's unsigned single words one vgrf is always
       * 16-wide, but only the lower or higher 8 channels will be used by the
       * hardware when doing a SIMD8 write depending on whether we have
       * selected the subspans for the first or second half respectively.
       */
      assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
      sample_mask.type = BRW_REGISTER_TYPE_UW;
      sample_mask.stride *= 2;

      bld.exec_all().annotate("FB write oMask")
         .MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW),
                           inst->group % 16),
              sample_mask);
      length++;
   }

   payload_header_size = length;

   setup_color_payload(bld, key, &sources[length], color0, components);
   length += 4;

   if (color1.file != BAD_FILE) {
      setup_color_payload(bld, key, &sources[length], color1, components);
      length += 4;
   }

   if (src_depth.file != BAD_FILE) {
      sources[length] = src_depth;
      length++;
   }

   if (dst_depth.file != BAD_FILE) {
      sources[length] = dst_depth;
      length++;
   }

   if (src_stencil.file != BAD_FILE) {
      assert(devinfo->ver >= 9);
      assert(bld.dispatch_width() == 8);

      /* XXX: src_stencil is only available on gfx9+. dst_depth is never
       * available on gfx9+. As such it's impossible to have both enabled at the
       * same time and therefore length cannot overrun the array.
       */
      assert(length < 15);

      sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.exec_all().annotate("FB write OS")
         .MOV(retype(sources[length], BRW_REGISTER_TYPE_UB),
              subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0));
      length++;
   }

   fs_inst *load;
   if (devinfo->ver >= 7) {
      /* Send from the GRF */
      fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
      load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
      payload.nr = bld.shader->alloc.allocate(regs_written(load));
      load->dst = payload;

      uint32_t msg_ctl = brw_fb_write_msg_control(inst, prog_data);

      inst->desc =
         (inst->group / 16) << 11 | /* rt slot group */
         brw_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,
                           prog_data->per_coarse_pixel_dispatch);

      uint32_t ex_desc = 0;
      if (devinfo->ver >= 11) {
         /* Set the "Render Target Index" and "Src0 Alpha Present" fields
          * in the extended message descriptor, in lieu of using a header.
          */
         ex_desc = inst->target << 12 | (src0_alpha.file != BAD_FILE) << 15;

         if (key->nr_color_regions == 0)
            ex_desc |= 1 << 20; /* Null Render Target */
      }
      inst->ex_desc = ex_desc;

      inst->opcode = SHADER_OPCODE_SEND;
      inst->resize_sources(3);
      inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0);
      inst->src[2] = payload;
      inst->mlen = regs_written(load);
      inst->ex_mlen = 0;
      inst->header_size = header_size;
      inst->check_tdr = true;
      inst->send_has_side_effects = true;
   } else {
      /* Send from the MRF */
      load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
                              sources, length, payload_header_size);

      /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
       * will do this for us if we just give it a COMPR4 destination.
       */
      if (devinfo->ver < 6 && bld.dispatch_width() == 16)
         load->dst.nr |= BRW_MRF_COMPR4;

      if (devinfo->ver < 6) {
         /* Set up src[0] for the implied MOV from grf0-1 */
         inst->resize_sources(1);
         inst->src[0] = brw_vec8_grf(0, 0);
      } else {
         inst->resize_sources(0);
      }
      inst->base_mrf = 1;
      inst->opcode = FS_OPCODE_FB_WRITE;
      inst->mlen = regs_written(load);
      inst->header_size = header_size;
   }
}

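/**
 * Lower a logical framebuffer read (render target read) by building the
 * two-register message header from g0/g1 (or g0/g2 for the second half of a
 * SIMD32 thread) and clearing the header bits that must be zero for this
 * message type.
 */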
static void
lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const fs_builder &ubld = bld.exec_all().group(8, 0);
   const unsigned length = 2;
   const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length);

   if (bld.group() < 16) {
      ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
                                           BRW_REGISTER_TYPE_UD));
   } else {
      assert(bld.group() < 32);
      const fs_reg header_sources[] = {
         retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
         retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD)
      };
      ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);

      if (devinfo->ver >= 12) {
         /* On Gfx12 the Viewport and Render Target Array Index fields (AKA
          * Poly 0 Info) are provided in r1.1 instead of r0.0, and the render
          * target message header format was updated accordingly -- However
          * the updated format only works for the lower 16 channels in a
          * SIMD32 thread, since the higher 16 channels want the subspan data
          * from r2 instead of r1, so we need to copy over the contents of
          * r1.1 in order to fix things up.
          */
         ubld.group(1, 0).MOV(component(header, 9),
                              retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD));
      }
   }

   /* BSpec 12470 (Gfx8-11), BSpec 47842 (Gfx12+) :
    *
    *   "Must be zero for Render Target Read message."
    *
    * For bits :
    *   - 14 : Stencil Present to Render Target
    *   - 13 : Source Depth Present to Render Target
    *   - 12 : oMask to Render Target
    *   - 11 : Source0 Alpha Present to Render Target
    */
   ubld.group(1, 0).AND(component(header, 0),
                        component(header, 0),
                        brw_imm_ud(~INTEL_MASK(14, 11)));

   inst->resize_sources(1);
   inst->src[0] = header;
   inst->opcode = FS_OPCODE_FB_READ;
   inst->mlen = length;
   inst->header_size = length;
}

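/**
 * Lower a logical sampler message for gfx4, laying out the coordinate,
 * gradient, LOD and shadow comparator slots in the fixed MRF payload layout
 * expected by the original sampler messages.
 */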
static void
lower_sampler_logical_send_gfx4(const fs_builder &bld, fs_inst *inst, opcode op,
                                const fs_reg &coordinate,
                                const fs_reg &shadow_c,
                                const fs_reg &lod, const fs_reg &lod2,
                                const fs_reg &surface,
                                const fs_reg &sampler,
                                unsigned coord_components,
                                unsigned grad_components)
{
   const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
                         op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
   fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
   fs_reg msg_end = msg_begin;

   /* g0 header. */
   msg_end = offset(msg_end, bld.group(8, 0), 1);

   for (unsigned i = 0; i < coord_components; i++)
      bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
              offset(coordinate, bld, i));

   msg_end = offset(msg_end, bld, coord_components);

   /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
    * require all three components to be present and zero if they are unused.
    */
   if (coord_components > 0 &&
       (has_lod || shadow_c.file != BAD_FILE ||
        (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
      assert(coord_components <= 3);
      for (unsigned i = 0; i < 3 - coord_components; i++)
         bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));

      msg_end = offset(msg_end, bld, 3 - coord_components);
   }

   if (op == SHADER_OPCODE_TXD) {
      /* TXD unsupported in SIMD16 mode. */
      assert(bld.dispatch_width() == 8);

      /* the slots for u and v are always present, but r is optional */
      if (coord_components < 2)
         msg_end = offset(msg_end, bld, 2 - coord_components);

      /*  P   = u, v, r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * 1-arg: Does not exist.
       *
       * 2-arg: dudx   dvdx   dudy   dvdy
       *        dPdx.x dPdx.y dPdy.x dPdy.y
       *        m4     m5     m6     m7
       *
       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
       *        m5     m6     m7     m8     m9     m10
       */
      for (unsigned i = 0; i < grad_components; i++)
         bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));

      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));

      for (unsigned i = 0; i < grad_components; i++)
         bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));

      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
   }

   if (has_lod) {
      /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
       * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
       */
      assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
             bld.dispatch_width() == 16);

      const brw_reg_type type =
         (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
          BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
      bld.MOV(retype(msg_end, type), lod);
      msg_end = offset(msg_end, bld, 1);
   }

   if (shadow_c.file != BAD_FILE) {
      if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
         /* There's no plain shadow compare message, so we use shadow
          * compare with a bias of 0.0.
          */
         bld.MOV(msg_end, brw_imm_f(0.0f));
         msg_end = offset(msg_end, bld, 1);
      }

      bld.MOV(msg_end, shadow_c);
      msg_end = offset(msg_end, bld, 1);
   }

   inst->opcode = op;
   inst->src[0] = reg_undef;
   inst->src[1] = surface;
   inst->src[2] = sampler;
   inst->resize_sources(3);
   inst->base_mrf = msg_begin.nr;
   inst->mlen = msg_end.nr - msg_begin.nr;
   inst->header_size = 1;
}

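/**
 * Lower a logical sampler message for gfx5-6, which use a fixed per-opcode
 * parameter layout in the MRFs and an optional m1 header when texel offsets
 * are present.
 */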
static void
lower_sampler_logical_send_gfx5(const fs_builder &bld, fs_inst *inst, opcode op,
                                const fs_reg &coordinate,
                                const fs_reg &shadow_c,
                                const fs_reg &lod, const fs_reg &lod2,
                                const fs_reg &sample_index,
                                const fs_reg &surface,
                                const fs_reg &sampler,
                                unsigned coord_components,
                                unsigned grad_components)
{
   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
   fs_reg msg_coords = message;
   unsigned header_size = 0;

   if (inst->offset != 0) {
      /* The offsets set up by the visitor are in the m1 header, so we can't
       * go headerless.
       */
      header_size = 1;
      message.nr--;
   }

   for (unsigned i = 0; i < coord_components; i++)
      bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
              offset(coordinate, bld, i));

   fs_reg msg_end = offset(msg_coords, bld, coord_components);
   fs_reg msg_lod = offset(msg_coords, bld, 4);

   if (shadow_c.file != BAD_FILE) {
      fs_reg msg_shadow = msg_lod;
      bld.MOV(msg_shadow, shadow_c);
      msg_lod = offset(msg_shadow, bld, 1);
      msg_end = msg_lod;
   }

   switch (op) {
   case SHADER_OPCODE_TXL:
   case FS_OPCODE_TXB:
      bld.MOV(msg_lod, lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case SHADER_OPCODE_TXD:
      /**
       *  P   =  u,    v,    r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * Load up these values:
       * - dudx   dudy   dvdx   dvdy   drdx   drdy
       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
       */
      msg_end = msg_lod;
      for (unsigned i = 0; i < grad_components; i++) {
         bld.MOV(msg_end, offset(lod, bld, i));
         msg_end = offset(msg_end, bld, 1);

         bld.MOV(msg_end, offset(lod2, bld, i));
         msg_end = offset(msg_end, bld, 1);
      }
      break;
   case SHADER_OPCODE_TXS:
      msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
      bld.MOV(msg_lod, lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case SHADER_OPCODE_TXF:
      msg_lod = offset(msg_coords, bld, 3);
      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case SHADER_OPCODE_TXF_CMS:
      msg_lod = offset(msg_coords, bld, 3);
      /* lod */
      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
      /* sample index */
      bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
      msg_end = offset(msg_lod, bld, 2);
      break;
   default:
      break;
   }

   inst->opcode = op;
   inst->src[0] = reg_undef;
   inst->src[1] = surface;
   inst->src[2] = sampler;
   inst->resize_sources(3);
   inst->base_mrf = message.nr;
   inst->mlen = msg_end.nr - message.nr;
   inst->header_size = header_size;

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
}

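/**
 * Return true if the sampler index may not fit in the 4-bit field of the
 * message descriptor, in which case the Sampler State Pointer in the message
 * header has to be offset instead.
 */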
static bool
is_high_sampler(const struct intel_device_info *devinfo, const fs_reg &sampler)
{
   if (devinfo->verx10 <= 70)
      return false;

   return sampler.file != IMM || sampler.ud >= 16;
}

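/**
 * Map a shader opcode to the hardware sampler message type for the target
 * generation, taking the shadow-compare variants into account.
 */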
static unsigned
sampler_msg_type(const intel_device_info *devinfo,
                 opcode opcode, bool shadow_compare)
{
   assert(devinfo->ver >= 5);
   switch (opcode) {
   case SHADER_OPCODE_TEX:
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE;
   case FS_OPCODE_TXB:
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
   case SHADER_OPCODE_TXL:
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
   case SHADER_OPCODE_TXL_LZ:
      return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ :
                              GFX9_SAMPLER_MESSAGE_SAMPLE_LZ;
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
   case SHADER_OPCODE_TXD:
      assert(!shadow_compare || devinfo->verx10 >= 75);
      return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
   case SHADER_OPCODE_TXF:
      return GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   case SHADER_OPCODE_TXF_LZ:
      assert(devinfo->ver >= 9);
      return GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
   case SHADER_OPCODE_TXF_CMS_W:
      assert(devinfo->ver >= 9);
      return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
   case SHADER_OPCODE_TXF_CMS:
      return devinfo->ver >= 7 ? GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS :
                                 GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   case SHADER_OPCODE_TXF_UMS:
      assert(devinfo->ver >= 7);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
   case SHADER_OPCODE_TXF_MCS:
      assert(devinfo->ver >= 7);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
   case SHADER_OPCODE_LOD:
      return GFX5_SAMPLER_MESSAGE_LOD;
   case SHADER_OPCODE_TG4:
      assert(devinfo->ver >= 7);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
   case SHADER_OPCODE_TG4_OFFSET:
      assert(devinfo->ver >= 7);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
   case SHADER_OPCODE_SAMPLEINFO:
      return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
   default:
      unreachable("not reached");
   }
}

/**
 * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to
 * the given requested_alignment_sz.
 */
static fs_inst *
emit_load_payload_with_padding(const fs_builder &bld, const fs_reg &dst,
                               const fs_reg *src, unsigned sources,
                               unsigned header_size,
                               unsigned requested_alignment_sz)
{
   unsigned length = 0;
   unsigned num_srcs =
      sources * DIV_ROUND_UP(requested_alignment_sz, bld.dispatch_width());
   fs_reg *src_comps = new fs_reg[num_srcs];

   for (unsigned i = 0; i < header_size; i++)
      src_comps[length++] = src[i];

   for (unsigned i = header_size; i < sources; i++) {
      unsigned src_sz =
         retype(dst, src[i].type).component_size(bld.dispatch_width());
      const enum brw_reg_type padding_payload_type =
         brw_reg_type_from_bit_size(type_sz(src[i].type) * 8,
                                    BRW_REGISTER_TYPE_UD);

      src_comps[length++] = src[i];

      /* Expand the real sources if component of requested payload type is
       * larger than real source component.
       */
      if (src_sz < requested_alignment_sz) {
         for (unsigned j = 0; j < (requested_alignment_sz / src_sz) - 1; j++) {
            src_comps[length++] = retype(fs_reg(), padding_payload_type);
         }
      }
   }

   fs_inst *inst = bld.LOAD_PAYLOAD(dst, src_comps, length, header_size);
   delete[] src_comps;

   return inst;
}

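/**
 * Lower a logical sampler message for gfx7+ into a SEND from the GRF:
 * build the optional header (texel offsets, TG4 channel select, bindless or
 * high sampler state pointers), emit the per-opcode parameter payload, and
 * fill in the sampler message descriptor.
 */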
static void
lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
                                const fs_reg &coordinate,
                                const fs_reg &shadow_c,
                                fs_reg lod, const fs_reg &lod2,
                                const fs_reg &min_lod,
                                const fs_reg &sample_index,
                                const fs_reg &mcs,
                                const fs_reg &surface,
                                const fs_reg &sampler,
                                const fs_reg &surface_handle,
                                const fs_reg &sampler_handle,
                                const fs_reg &tg4_offset,
                                unsigned payload_type_bit_size,
                                unsigned coord_components,
                                unsigned grad_components)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const enum brw_reg_type payload_type =
      brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_F);
   const enum brw_reg_type payload_unsigned_type =
      brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_UD);
   const enum brw_reg_type payload_signed_type =
      brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_D);
   unsigned reg_width = bld.dispatch_width() / 8;
   unsigned header_size = 0, length = 0;
   fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
      sources[i] = bld.vgrf(payload_type);

   /* We must have exactly one of surface/sampler and surface/sampler_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
   assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));

   if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
       inst->offset != 0 || inst->eot ||
       op == SHADER_OPCODE_SAMPLEINFO ||
       sampler_handle.file != BAD_FILE ||
       is_high_sampler(devinfo, sampler)) {
      /* For general texture offsets (no txf workaround), we need a header to
       * put them in.
       *
       * TG4 needs to place its channel select in the header, for interaction
       * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
       * larger sampler numbers we need to offset the Sampler State Pointer in
       * the header.
       */
      fs_reg header = retype(sources[0], BRW_REGISTER_TYPE_UD);
      header_size = 1;
      length++;

      /* If we're requesting fewer than four channels worth of response,
       * and we have an explicit header, we need to set up the sampler
       * writemask.  It's reversed from normal: 1 means "don't write".
       */
      if (!inst->eot && regs_written(inst) != 4 * reg_width) {
         assert(regs_written(inst) % reg_width == 0);
         unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;
         inst->offset |= mask << 12;
      }

      /* Build the actual header */
      const fs_builder ubld = bld.exec_all().group(8, 0);
      const fs_builder ubld1 = ubld.group(1, 0);
      ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
      if (inst->offset) {
         ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
      } else if (bld.shader->stage != MESA_SHADER_VERTEX &&
                 bld.shader->stage != MESA_SHADER_FRAGMENT) {
         /* The vertex and fragment stages have g0.2 set to 0, so
          * header0.2 is 0 when g0 is copied. Other stages may not, so we
          * must set it to 0 to avoid setting undesirable bits in the
          * message.
          */
         ubld1.MOV(component(header, 2), brw_imm_ud(0));
      }

      if (sampler_handle.file != BAD_FILE) {
         /* Bindless sampler handles aren't relative to the sampler state
          * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
          * Instead, it's an absolute pointer relative to dynamic state base
          * address.
          *
          * Sampler states are 16 bytes each and the pointer we give here has
          * to be 32-byte aligned.  In order to avoid more indirect messages
          * than required, we assume that all bindless sampler states are
          * 32-byte aligned.  This sacrifices a bit of general state base
          * address space but means we can do something more efficient in the
          * shader.
          */
         ubld1.MOV(component(header, 3), sampler_handle);
      } else if (is_high_sampler(devinfo, sampler)) {
         fs_reg sampler_state_ptr =
            retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD);

         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
          * with the ones included in g0.3 bits 4:0.  Mask them out.
          */
         if (devinfo->ver >= 11) {
            sampler_state_ptr = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
            ubld1.AND(sampler_state_ptr,
                      retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
                      brw_imm_ud(INTEL_MASK(31, 5)));
         }

         if (sampler.file == BRW_IMMEDIATE_VALUE) {
            assert(sampler.ud >= 16);
            const int sampler_state_size = 16; /* 16 bytes */

            ubld1.ADD(component(header, 3), sampler_state_ptr,
                      brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
         } else {
            fs_reg tmp = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
            ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
            ubld1.SHL(tmp, tmp, brw_imm_ud(4));
            ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
         }
      } else if (devinfo->ver >= 11) {
         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
          * with the ones included in g0.3 bits 4:0.  Mask them out.
          */
         ubld1.AND(component(header, 3),
                   retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
                   brw_imm_ud(INTEL_MASK(31, 5)));
      }
   }

   if (shadow_c.file != BAD_FILE) {
      bld.MOV(sources[length], shadow_c);
      length++;
   }

   bool coordinate_done = false;

   /* Set up the LOD info */
   switch (op) {
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXL:
      if (devinfo->ver >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
         op = SHADER_OPCODE_TXL_LZ;
         break;
      }
      bld.MOV(sources[length], lod);
      length++;
      break;
   case SHADER_OPCODE_TXD:
      /* TXD should have been lowered in SIMD16 mode. */
      assert(bld.dispatch_width() == 8);

      /* Load dPdx and the coordinate together:
       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
       */
      for (unsigned i = 0; i < coord_components; i++) {
         bld.MOV(sources[length++], offset(coordinate, bld, i));

         /* For cube map array, the coordinate is (u,v,r,ai) but there are
          * only derivatives for (u, v, r).
          */
         if (i < grad_components) {
            bld.MOV(sources[length++], offset(lod, bld, i));
            bld.MOV(sources[length++], offset(lod2, bld, i));
         }
      }

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TXS:
      bld.MOV(retype(sources[length], payload_unsigned_type), lod);
      length++;
      break;
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      /* We need an LOD; just use 0 */
      bld.MOV(retype(sources[length], payload_unsigned_type), brw_imm_ud(0));
      length++;
      break;
   case SHADER_OPCODE_TXF:
      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
       * On Gfx9 they are u, v, lod, r
       */
      bld.MOV(retype(sources[length++], payload_signed_type), coordinate);

      if (devinfo->ver >= 9) {
         if (coord_components >= 2) {
            bld.MOV(retype(sources[length], payload_signed_type),
                    offset(coordinate, bld, 1));
         } else {
            sources[length] = brw_imm_d(0);
         }
         length++;
      }

      if (devinfo->ver >= 9 && lod.is_zero()) {
         op = SHADER_OPCODE_TXF_LZ;
      } else {
         bld.MOV(retype(sources[length], payload_signed_type), lod);
         length++;
      }

      for (unsigned i = devinfo->ver >= 9 ? 2 : 1; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_signed_type),
                 offset(coordinate, bld, i));

      coordinate_done = true;
      break;

   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_CMS_W:
   case SHADER_OPCODE_TXF_UMS:
   case SHADER_OPCODE_TXF_MCS:
      if (op == SHADER_OPCODE_TXF_UMS ||
          op == SHADER_OPCODE_TXF_CMS ||
          op == SHADER_OPCODE_TXF_CMS_W) {
         bld.MOV(retype(sources[length++], payload_unsigned_type), sample_index);
      }

      /* Data from the multisample control surface. */
      if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
         unsigned num_mcs_components = 1;

         /* From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
          * Shared Functions - 3D Sampler - Messages - Message Format:
          *
          *    ld2dms_w   si  mcs0 mcs1 mcs2  mcs3  u  v  r
          */
         if (devinfo->verx10 >= 125 && op == SHADER_OPCODE_TXF_CMS_W)
            num_mcs_components = 4;
         else if (op == SHADER_OPCODE_TXF_CMS_W)
            num_mcs_components = 2;

         for (unsigned i = 0; i < num_mcs_components; ++i) {
            bld.MOV(retype(sources[length++], payload_unsigned_type),
                    mcs.file == IMM ? mcs : offset(mcs, bld, i));
         }
      }

      /* There is no offsetting for this message; just copy in the integer
       * texture coordinates.
       */
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_signed_type),
                 offset(coordinate, bld, i));

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TG4_OFFSET:
      /* More crazy intermixing */
      for (unsigned i = 0; i < 2; i++) /* u, v */
         bld.MOV(sources[length++], offset(coordinate, bld, i));

      for (unsigned i = 0; i < 2; i++) /* offu, offv */
         bld.MOV(retype(sources[length++], payload_signed_type),
                 offset(tg4_offset, bld, i));

      if (coord_components == 3) /* r if present */
         bld.MOV(sources[length++], offset(coordinate, bld, 2));

      coordinate_done = true;
      break;
   default:
      break;
   }

   /* Set up the coordinate (except for cases where it was done above) */
   if (!coordinate_done) {
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_type),
                 offset(coordinate, bld, i));
   }

   if (min_lod.file != BAD_FILE) {
      /* Account for all of the missing coordinate sources */
      if (op == SHADER_OPCODE_TXD && devinfo->verx10 >= 125) {
         /* On DG2 and newer platforms, sample_d can only be used with 1D and
          * 2D surfaces, so the maximum number of gradient components is 2.
          * In spite of this limitation, the Bspec lists a mysterious R
          * component before the min_lod, so the maximum coordinate components
          * is 3.
          *
          * Wa_1209978020
          */
         length += 3 - coord_components;
         length += (2 - grad_components) * 2;
      } else {
         length += 4 - coord_components;
         if (op == SHADER_OPCODE_TXD)
            length += (3 - grad_components) * 2;
      }

      bld.MOV(sources[length++], min_lod);
   }

   const fs_reg src_payload =
      fs_reg(VGRF, bld.shader->alloc.allocate(length * reg_width),
             BRW_REGISTER_TYPE_F);
   /* In case of 16-bit payload each component takes one full register in
    * both SIMD8H and SIMD16H modes. In both cases one reg can hold 16
    * elements. In SIMD8H case hardware simply expects the components to be
    * padded (i.e., aligned on reg boundary).
    */
   fs_inst *load_payload_inst =
      emit_load_payload_with_padding(bld, src_payload, sources, length,
                                     header_size, REG_SIZE);
   unsigned mlen = load_payload_inst->size_written / REG_SIZE;
   unsigned simd_mode = 0;
   if (payload_type_bit_size == 16) {
      assert(devinfo->ver >= 11);
      simd_mode = inst->exec_size <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H :
                                         GFX10_SAMPLER_SIMD_MODE_SIMD16H;
   } else {
      simd_mode = inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
                                         BRW_SAMPLER_SIMD_MODE_SIMD16;
   }

   /* Generate the SEND. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->header_size = header_size;

   const unsigned msg_type =
      sampler_msg_type(devinfo, op, inst->shadow_compare);

   inst->sfid = BRW_SFID_SAMPLER;
   if (surface.file == IMM &&
       (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
      inst->desc = brw_sampler_desc(devinfo, surface.ud,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0);
   } else if (surface_handle.file != BAD_FILE) {
      /* Bindless surface */
      assert(devinfo->ver >= 9);
      inst->desc = brw_sampler_desc(devinfo,
                                    GFX9_BTI_BINDLESS,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);

      /* For bindless samplers, the entire address is included in the message
       * header so we can leave the portion in the message descriptor 0.
       */
      if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
         inst->src[0] = brw_imm_ud(0);
      } else {
         const fs_builder ubld = bld.group(1, 0).exec_all();
         fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.SHL(desc, sampler, brw_imm_ud(8));
         inst->src[0] = desc;
      }

      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
   } else {
      /* Immediate portion of the descriptor */
      inst->desc = brw_sampler_desc(devinfo,
                                    0, /* surface */
                                    0, /* sampler */
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);
      const fs_builder ubld = bld.group(1, 0).exec_all();
      fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      if (surface.equals(sampler)) {
         /* This case is common in GL */
         ubld.MUL(desc, surface, brw_imm_ud(0x101));
      } else {
         if (sampler_handle.file != BAD_FILE) {
            ubld.MOV(desc, surface);
         } else if (sampler.file == IMM) {
            ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8));
         } else {
            ubld.SHL(desc, sampler, brw_imm_ud(8));
            ubld.OR(desc, desc, surface);
         }
      }
      ubld.AND(desc, desc, brw_imm_ud(0xfff));

      inst->src[0] = component(desc, 0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   }

   inst->ex_desc = 0;

   inst->src[2] = src_payload;
   inst->resize_sources(3);

   if (inst->eot) {
      /* EOT sampler messages don't make sense to split because it would
       * involve ending half of the thread early.
       */
      assert(inst->group == 0);
      /* We need to use SENDC for EOT sampler messages */
      inst->check_tdr = true;
      inst->send_has_side_effects = true;
   }

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
}

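/**
 * Determine the bit size of the sampler message payload from the sizes of
 * the logical sources, forcing 16-bit payloads on XeHP for the ld2dms_w,
 * ld2dms, ld2dss and ld_mcs messages as required by the Bspec.
 */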
static unsigned
get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
                                      opcode op, const fs_reg *src)
{
   unsigned src_type_size = 0;

   /* All sources need to have the same size, therefore seek the first valid
    * and take the size from there.
    */
   for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
      if (src[i].file != BAD_FILE) {
         src_type_size = brw_reg_type_to_size(src[i].type);
         break;
      }
   }

   assert(src_type_size == 2 || src_type_size == 4);

#ifndef NDEBUG
   /* Make sure all sources agree. On gfx12 this doesn't hold when sampling
    * compressed multisampled surfaces. There the payload contains MCS data
    * which is already in 16-bits unlike the other parameters that need forced
    * conversion.
    */
   if (devinfo->verx10 < 125 ||
       (op != SHADER_OPCODE_TXF_CMS_W &&
        op != SHADER_OPCODE_TXF_CMS)) {
      for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
         assert(src[i].file == BAD_FILE ||
                brw_reg_type_to_size(src[i].type) == src_type_size);
      }
   }
#endif

   if (devinfo->verx10 < 125)
      return src_type_size * 8;

   /* Force conversion from 32-bit sources to 16-bit payload. From the XeHP Bspec:
    * 3D and GPGPU Programs - Shared Functions - 3D Sampler - Messages - Message
    * Format [GFX12:HAS:1209977870] *
    *
    *  ld2dms_w       SIMD8H and SIMD16H Only
    *  ld_mcs         SIMD8H and SIMD16H Only
    *  ld2dms         REMOVEDBY(GEN:HAS:1406788836)
    */

   if (op == SHADER_OPCODE_TXF_CMS_W ||
       op == SHADER_OPCODE_TXF_CMS ||
       op == SHADER_OPCODE_TXF_UMS ||
       op == SHADER_OPCODE_TXF_MCS)
      src_type_size = 2;

   return src_type_size * 8;
}

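/**
 * Dispatch the lowering of a logical sampler message to the implementation
 * for the target generation.
 */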
static void
lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
   const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
   const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD];
   const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
   const fs_reg &min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
   const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
   const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
   const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
   const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
   const fs_reg &surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
   const fs_reg &sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
   const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
   assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
   const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
   assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
   const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;

   if (devinfo->ver >= 7) {
      const unsigned msg_payload_type_bit_size =
         get_sampler_msg_payload_type_bit_size(devinfo, op, inst->src);

      /* 16-bit payloads are available only on gfx11+ */
      assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11);

      lower_sampler_logical_send_gfx7(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2, min_lod,
                                      sample_index,
                                      mcs, surface, sampler,
                                      surface_handle, sampler_handle,
                                      tg4_offset,
                                      msg_payload_type_bit_size,
                                      coord_components, grad_components);
   } else if (devinfo->ver >= 5) {
      lower_sampler_logical_send_gfx5(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2, sample_index,
                                      surface, sampler,
                                      coord_components, grad_components);
   } else {
      lower_sampler_logical_send_gfx4(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2,
                                      surface, sampler,
                                      coord_components, grad_components);
   }
}

/**
 * Predicate the specified instruction on the vector mask.
 */
static void
emit_predicate_on_vector_mask(const fs_builder &bld, fs_inst *inst)
{
   assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
          bld.group() == inst->group &&
          bld.dispatch_width() == inst->exec_size);

   const fs_builder ubld = bld.exec_all().group(1, 0);

   const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader);
   const fs_reg vector_mask = ubld.vgrf(BRW_REGISTER_TYPE_UW);
   ubld.emit(SHADER_OPCODE_READ_SR_REG, vector_mask, brw_imm_ud(3));
   const unsigned subreg = sample_mask_flag_subreg(v);

   ubld.MOV(brw_flag_subreg(subreg + inst->group / 16), vector_mask);

   if (inst->predicate) {
      assert(inst->predicate == BRW_PREDICATE_NORMAL);
      assert(!inst->predicate_inverse);
      assert(inst->flag_subreg == 0);
      /* Combine the vector mask with the existing predicate by using a
       * vertical predication mode.
       */
      inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
   } else {
      inst->flag_subreg = subreg;
      inst->predicate = BRW_PREDICATE_NORMAL;
      inst->predicate_inverse = false;
   }
}

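/**
 * Fill in the descriptor sources of a surface access SEND: an immediate
 * binding table index is folded into inst->desc, a bindless surface handle
 * becomes the extended descriptor, and a dynamically indexed surface is
 * masked into a scalar descriptor register.
 */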
static void
setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc,
                          const fs_reg &surface, const fs_reg &surface_handle)
{
   const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;

   /* We must have exactly one of surface and surface_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));

   if (surface.file == IMM) {
      inst->desc = desc | (surface.ud & 0xff);
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   } else if (surface_handle.file != BAD_FILE) {
      /* Bindless surface */
      assert(devinfo->ver >= 9);
      inst->desc = desc | GFX9_BTI_BINDLESS;
      inst->src[0] = brw_imm_ud(0);

      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
   } else {
      inst->desc = desc;
      const fs_builder ubld = bld.exec_all().group(1, 0);
      fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.AND(tmp, surface, brw_imm_ud(0xff));
      inst->src[0] = component(tmp, 0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   }
}

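/**
 * Lower a logical surface access to a SEND: build the optional header
 * required for typed messages prior to Gfx9 and for stateless A32 messages,
 * assemble the address/data payload (using split sends on gfx9+ where
 * possible), and apply the sample mask either through the header or through
 * predication.
 */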
1358static void
1359lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
1360{
1361   const intel_device_info *devinfo = bld.shader->devinfo;
1362
1363   /* Get the logical send arguments. */
1364   const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
1365   const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];
1366   const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
1367   const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
1368   const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
1369   const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
1370   const fs_reg &allow_sample_mask =
1371      inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
1372   assert(arg.file == IMM);
1373   assert(allow_sample_mask.file == IMM);
1374
1375   /* Calculate the total number of components of the payload. */
1376   const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
1377   const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
1378
1379   const bool is_typed_access =
1380      inst->opcode == SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL ||
1381      inst->opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL ||
1382      inst->opcode == SHADER_OPCODE_TYPED_ATOMIC_LOGICAL;
1383
1384   const bool is_surface_access = is_typed_access ||
1385      inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL ||
1386      inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL ||
1387      inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL;
1388
1389   const bool is_stateless =
1390      surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
1391                              surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
1392
1393   const bool has_side_effects = inst->has_side_effects();
1394
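   /* An immediate all-ones mask means every lane is always enabled, so the
    * sample-mask predication below gets skipped (the IMM file is filtered
    * out there).
    */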
   fs_reg sample_mask = allow_sample_mask.ud ? brw_sample_mask_reg(bld) :
                                               fs_reg(brw_imm_d(0xffff));

   /* From the BDW PRM Volume 7, page 147:
    *
    *  "For the Data Cache Data Port*, the header must be present for the
    *   following message types: [...] Typed read/write/atomics"
    *
    * Earlier generations have a similar wording.  Because of this restriction
    * we don't attempt to implement sample masks via predication for such
    * messages prior to Gfx9, since we have to provide a header anyway.  On
    * Gfx11+ the header has been removed so we can only use predication.
    *
    * For all stateless A32 messages, we also need a header.
    */
   fs_reg header;
   if ((devinfo->ver < 9 && is_typed_access) || is_stateless) {
      fs_builder ubld = bld.exec_all().group(8, 0);
      header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      if (is_stateless) {
         assert(!is_surface_access);
         ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
      } else {
         ubld.MOV(header, brw_imm_d(0));
         if (is_surface_access)
            ubld.group(1, 0).MOV(component(header, 7), sample_mask);
      }
   }
   const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;

   fs_reg payload, payload2;
   unsigned mlen, ex_mlen = 0;
   if (devinfo->ver >= 9 &&
       (src.file == BAD_FILE || header.file == BAD_FILE)) {
      /* We have split sends on gfx9 and above */
      if (header.file == BAD_FILE) {
         payload = bld.move_to_vgrf(addr, addr_sz);
         payload2 = bld.move_to_vgrf(src, src_sz);
         mlen = addr_sz * (inst->exec_size / 8);
         ex_mlen = src_sz * (inst->exec_size / 8);
      } else {
         assert(src.file == BAD_FILE);
         payload = header;
         payload2 = bld.move_to_vgrf(addr, addr_sz);
         mlen = header_sz;
         ex_mlen = addr_sz * (inst->exec_size / 8);
      }
   } else {
      /* Allocate space for the payload. */
      const unsigned sz = header_sz + addr_sz + src_sz;
      payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
      fs_reg *const components = new fs_reg[sz];
      unsigned n = 0;

      /* Construct the payload. */
      if (header.file != BAD_FILE)
         components[n++] = header;

      for (unsigned i = 0; i < addr_sz; i++)
         components[n++] = offset(addr, bld, i);

      for (unsigned i = 0; i < src_sz; i++)
         components[n++] = offset(src, bld, i);

      bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
      mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;

      delete[] components;
   }

   /* Predicate the instruction on the sample mask if no header is
    * provided.
    */
   if ((header.file == BAD_FILE || !is_surface_access) &&
       sample_mask.file != BAD_FILE && sample_mask.file != IMM)
      brw_emit_predicate_on_sample_mask(bld, inst);

   uint32_t sfid;
   switch (inst->opcode) {
   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      /* Byte scattered opcodes go through the normal data cache */
      sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
      break;

   case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
   case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      sfid = devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
             devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
                                 BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
      break;

   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
      /* Untyped Surface messages go through the data cache but the SFID value
       * changed on Haswell.
       */
      sfid = (devinfo->verx10 >= 75 ?
              HSW_SFID_DATAPORT_DATA_CACHE_1 :
              GFX7_SFID_DATAPORT_DATA_CACHE);
      break;

   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
      /* Typed surface messages go through the render cache on IVB and the
       * data cache on HSW+.
       */
      sfid = (devinfo->verx10 >= 75 ?
              HSW_SFID_DATAPORT_DATA_CACHE_1 :
              GFX6_SFID_DATAPORT_RENDER_CACHE);
      break;

   default:
      unreachable("Unsupported surface opcode");
   }

   uint32_t desc;
   switch (inst->opcode) {
   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                            arg.ud, /* num_channels */
                                            false   /* write */);
      break;

   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                            arg.ud, /* num_channels */
                                            true    /* write */);
      break;

   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                           arg.ud, /* bit_size */
                                           false   /* write */);
      break;

   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
      desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                           arg.ud, /* bit_size */
                                           true    /* write */);
      break;

   case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
      assert(arg.ud == 32); /* bit_size */
      desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
                                            false  /* write */);
      break;

   case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      assert(arg.ud == 32); /* bit_size */
      desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
                                            true   /* write */);
      break;

   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size,
                                        arg.ud, /* atomic_op */
                                        !inst->dst.is_null());
      break;

   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
      desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,
                                              arg.ud, /* atomic_op */
                                              !inst->dst.is_null());
      break;

   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
                                          arg.ud, /* num_channels */
                                          false   /* write */);
      break;

   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
                                          arg.ud, /* num_channels */
                                          true    /* write */);
      break;

   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
      desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
                                      arg.ud, /* atomic_op */
                                      !inst->dst.is_null());
      break;

   default:
      unreachable("Unknown surface logical instruction");
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = header_sz;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Set up SFID and descriptors */
   inst->sfid = sfid;
   setup_surface_descriptors(bld, inst, desc, surface, surface_handle);

   inst->resize_sources(4);

   /* Finally, the payload */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

static enum lsc_opcode
brw_atomic_op_to_lsc_atomic_op(unsigned op)
{
   switch (op) {
   case BRW_AOP_AND:
      return LSC_OP_ATOMIC_AND;
   case BRW_AOP_OR:
      return LSC_OP_ATOMIC_OR;
   case BRW_AOP_XOR:
      return LSC_OP_ATOMIC_XOR;
   case BRW_AOP_MOV:
      return LSC_OP_ATOMIC_STORE;
   case BRW_AOP_INC:
      return LSC_OP_ATOMIC_INC;
   case BRW_AOP_DEC:
      return LSC_OP_ATOMIC_DEC;
   case BRW_AOP_ADD:
      return LSC_OP_ATOMIC_ADD;
   case BRW_AOP_SUB:
      return LSC_OP_ATOMIC_SUB;
   case BRW_AOP_IMAX:
      return LSC_OP_ATOMIC_MAX;
   case BRW_AOP_IMIN:
      return LSC_OP_ATOMIC_MIN;
   case BRW_AOP_UMAX:
      return LSC_OP_ATOMIC_UMAX;
   case BRW_AOP_UMIN:
      return LSC_OP_ATOMIC_UMIN;
   case BRW_AOP_CMPWR:
      return LSC_OP_ATOMIC_CMPXCHG;
   default:
      unreachable("invalid atomic opcode");
   }
}

static enum lsc_opcode
brw_atomic_op_to_lsc_fatomic_op(uint32_t aop)
{
   switch (aop) {
   case BRW_AOP_FMAX:
      return LSC_OP_ATOMIC_FMAX;
   case BRW_AOP_FMIN:
      return LSC_OP_ATOMIC_FMIN;
   case BRW_AOP_FCMPWR:
      return LSC_OP_ATOMIC_FCMPXCHG;
   case BRW_AOP_FADD:
      return LSC_OP_ATOMIC_FADD;
   default:
      unreachable("Unsupported float atomic opcode");
   }
}

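/* Convert a bit size into an LSC data size encoding.  The sub-dword
 * D8U32/D16U32 encodings mean the value occupies 8 or 16 bits in memory
 * but a full dword per channel in the register payload.
 */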
static enum lsc_data_size
lsc_bits_to_data_size(unsigned bit_size)
{
   switch (bit_size / 8) {
   case 1:  return LSC_DATA_SIZE_D8U32;
   case 2:  return LSC_DATA_SIZE_D16U32;
   case 4:  return LSC_DATA_SIZE_D32;
   case 8:  return LSC_DATA_SIZE_D64;
   default:
      unreachable("Unsupported data size.");
   }
}

static void
lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->has_lsc);

   /* Get the logical send arguments. */
   const fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
   const fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   const fs_reg allow_sample_mask =
      inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
   assert(arg.file == IMM);
   assert(allow_sample_mask.file == IMM);

   /* Calculate the total number of components of the payload. */
   const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
   const unsigned src_comps = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
   const unsigned src_sz = type_sz(src.type);

   const bool has_side_effects = inst->has_side_effects();

   unsigned ex_mlen = 0;
   fs_reg payload, payload2;
   payload = bld.move_to_vgrf(addr, addr_sz);
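   /* Any data payload goes in the second half of the split send; ex_mlen
    * counts its GRFs: components x bytes per component x lanes, divided by
    * the 32-byte register size.
    */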
   if (src.file != BAD_FILE) {
      payload2 = bld.move_to_vgrf(src, src_comps);
      ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE;
   }

   /* Predicate the instruction on the sample mask if needed */
   fs_reg sample_mask = allow_sample_mask.ud ? brw_sample_mask_reg(bld) :
                                               fs_reg(brw_imm_d(0xffff));
   if (sample_mask.file != BAD_FILE && sample_mask.file != IMM)
      brw_emit_predicate_on_sample_mask(bld, inst);

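   /* Shared-local-memory accesses use the dedicated SLM SFID on LSC
    * platforms; everything else goes through untyped global memory (UGM).
    */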
   if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
      inst->sfid = GFX12_SFID_SLM;
   else
      inst->sfid = GFX12_SFID_UGM;

   /* We must have exactly one of surface and surface_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));

   enum lsc_addr_surface_type surf_type;
   if (surface_handle.file != BAD_FILE)
      surf_type = LSC_ADDR_SURFTYPE_BSS;
   else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
      surf_type = LSC_ADDR_SURFTYPE_FLAT;
   else
      surf_type = LSC_ADDR_SURFTYPE_BTI;

   switch (inst->opcode) {
   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_LOAD_L1STATE_L3MOCS,
                                true /* has_dest */);
      break;
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_STORE_L1STATE_L3MOCS,
                                false /* has_dest */);
      break;
   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: {
      /* Bspec: Atomic instruction -> Cache section:
       *
       *    Atomic messages are always forced to "un-cacheable" in the L1
       *    cache.
       */
      enum lsc_opcode opcode =
         inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL ?
         brw_atomic_op_to_lsc_fatomic_op(arg.ud) :
         brw_atomic_op_to_lsc_atomic_op(arg.ud);
      inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(src_sz * 8),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_STORE_L1UC_L3WB,
                                !inst->dst.is_null());
      break;
   }
   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(arg.ud),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_LOAD_L1STATE_L3MOCS,
                                true /* has_dest */);
      break;
   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(arg.ud),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_STORE_L1STATE_L3MOCS,
                                false /* has_dest */);
      break;
   default:
      unreachable("Unknown surface logical instruction");
   }

   inst->src[0] = brw_imm_ud(0);

   /* Set up extended descriptors */
   switch (surf_type) {
   case LSC_ADDR_SURFTYPE_FLAT:
      inst->src[1] = brw_imm_ud(0);
      break;
   case LSC_ADDR_SURFTYPE_BSS:
      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
      break;
   case LSC_ADDR_SURFTYPE_BTI:
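      /* On LSC platforms the binding table index lives in the top byte
       * (bits 31:24) of the extended descriptor, hence the shift by 24 in
       * the dynamic case below.
       */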
      if (surface.file == IMM) {
         inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
      } else {
         const fs_builder ubld = bld.exec_all().group(1, 0);
         fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.SHL(tmp, surface, brw_imm_ud(24));
         inst->src[1] = component(tmp, 0);
      }
      break;
   default:
      unreachable("Unknown surface type");
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   inst->resize_sources(4);

   /* Finally, the payload */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

static void
lower_surface_block_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->ver >= 9);

   /* Get the logical send arguments. */
   const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   assert(arg.file == IMM);
   assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
   assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);

   const bool is_stateless =
      surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
                              surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);

   const bool has_side_effects = inst->has_side_effects();

   const bool align_16B =
      inst->opcode != SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL;

   const bool write = inst->opcode == SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;

   /* The address is stored in the header.  See MH_A32_GO and MH_BTS_GO. */
   fs_builder ubld = bld.exec_all().group(8, 0);
   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);

   if (is_stateless)
      ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
   else
      ubld.MOV(header, brw_imm_d(0));

   /* Address in OWord units when aligned to OWords. */
   if (align_16B)
      ubld.group(1, 0).SHR(component(header, 2), addr, brw_imm_ud(4));
   else
      ubld.group(1, 0).MOV(component(header, 2), addr);

   fs_reg data;
   unsigned ex_mlen = 0;
   if (write) {
      const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
      data = retype(bld.move_to_vgrf(src, src_sz), BRW_REGISTER_TYPE_UD);
      ex_mlen = src_sz * type_sz(src.type) * inst->exec_size / REG_SIZE;
   }

   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = 1;
   inst->ex_mlen = ex_mlen;
   inst->header_size = 1;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;

   const uint32_t desc = brw_dp_oword_block_rw_desc(devinfo, align_16B,
                                                    arg.ud, write);
   setup_surface_descriptors(bld, inst, desc, surface, surface_handle);

   inst->resize_sources(4);

   inst->src[2] = header;
   inst->src[3] = data;
}

static fs_reg
emit_a64_oword_block_header(const fs_builder &bld, const fs_reg &addr)
{
   const fs_builder ubld = bld.exec_all().group(8, 0);

   assert(type_sz(addr.type) == 8 && addr.stride == 0);

   fs_reg expanded_addr = addr;
   if (addr.file == UNIFORM) {
      /* We can't do stride 1 with the UNIFORM file; it requires stride 0. */
      expanded_addr = ubld.vgrf(BRW_REGISTER_TYPE_UQ);
      expanded_addr.stride = 0;
      ubld.MOV(expanded_addr, retype(addr, BRW_REGISTER_TYPE_UQ));
   }

   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
   ubld.MOV(header, brw_imm_ud(0));

   /* Use a 2-wide MOV to fill out the address */
   fs_reg addr_vec2 = expanded_addr;
   addr_vec2.type = BRW_REGISTER_TYPE_UD;
   addr_vec2.stride = 1;
   ubld.group(2, 0).MOV(header, addr_vec2);
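   /* Header components 0 and 1 now hold the low and high dwords of the
    * 64-bit address.
    */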

   return header;
}

static void
emit_fragment_mask(const fs_builder &bld, fs_inst *inst)
{
   assert(inst->src[A64_LOGICAL_ENABLE_HELPERS].file == IMM);
   const bool enable_helpers = inst->src[A64_LOGICAL_ENABLE_HELPERS].ud;

   /* If we're a fragment shader, we have to predicate with the sample mask
    * to avoid helper invocations in instructions with side effects, unless
    * they are explicitly required.
    *
    * There are also special cases when we actually want to run on helpers
    * (ray queries).
    */
   assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
   if (enable_helpers)
      emit_predicate_on_vector_mask(bld, inst);
   else if (inst->has_side_effects())
      brw_emit_predicate_on_sample_mask(bld, inst);
}

static void
lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;

   /* Get the logical send arguments. */
   const fs_reg &addr = inst->src[A64_LOGICAL_ADDRESS];
   const fs_reg &src = inst->src[A64_LOGICAL_SRC];
   const unsigned src_sz = type_sz(src.type);

   const unsigned src_comps = inst->components_read(1);
   assert(inst->src[A64_LOGICAL_ARG].file == IMM);
   const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
   const bool has_side_effects = inst->has_side_effects();

   fs_reg payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
   fs_reg payload2 = retype(bld.move_to_vgrf(src, src_comps),
                            BRW_REGISTER_TYPE_UD);
   unsigned ex_mlen = src_comps * src_sz * inst->exec_size / REG_SIZE;

   switch (inst->opcode) {
   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32, arg /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_LOAD_L1STATE_L3MOCS,
                                true /* has_dest */);
      break;
   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32, arg /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_STORE_L1STATE_L3MOCS,
                                false /* has_dest */);
      break;
   case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(arg),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_LOAD_L1STATE_L3MOCS,
                                true /* has_dest */);
      break;
   case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(arg),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_STORE_L1STATE_L3MOCS,
                                false /* has_dest */);
      break;
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL: {
      /* Bspec: Atomic instruction -> Cache section:
       *
       *    Atomic messages are always forced to "un-cacheable" in the L1
       *    cache.
       */
      enum lsc_opcode opcode =
         (inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL ||
          inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL ||
          inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL) ?
         brw_atomic_op_to_lsc_atomic_op(arg) :
         brw_atomic_op_to_lsc_fatomic_op(arg);
      inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(src_sz * 8),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_STORE_L1UC_L3WB,
                                !inst->dst.is_null());
      break;
   }
   default:
      unreachable("Unknown A64 logical instruction");
   }

   if (bld.shader->stage == MESA_SHADER_FRAGMENT)
      emit_fragment_mask(bld, inst);

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Set up SFID and descriptors */
   inst->sfid = GFX12_SFID_UGM;
   inst->resize_sources(4);
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

static void
lower_a64_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;

   const fs_reg &addr = inst->src[A64_LOGICAL_ADDRESS];
   const fs_reg &src = inst->src[A64_LOGICAL_SRC];
   const unsigned src_comps = inst->components_read(1);
   assert(inst->src[A64_LOGICAL_ARG].file == IMM);
   const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
   const bool has_side_effects = inst->has_side_effects();

   fs_reg payload, payload2;
   unsigned mlen, ex_mlen = 0, header_size = 0;
   if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL ||
       inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL ||
       inst->opcode == SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL) {
      assert(devinfo->ver >= 9);

      /* OWORD messages only take a scalar address in a header */
      mlen = 1;
      header_size = 1;
      payload = emit_a64_oword_block_header(bld, addr);

      if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL) {
         ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
         payload2 = retype(bld.move_to_vgrf(src, src_comps),
                           BRW_REGISTER_TYPE_UD);
      }
   } else if (devinfo->ver >= 9) {
      /* On Skylake and above, we have SENDS */
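      /* The 64-bit per-lane address payload occupies two GRFs per 8 lanes. */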
      mlen = 2 * (inst->exec_size / 8);
      ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
      payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
      payload2 = retype(bld.move_to_vgrf(src, src_comps),
                        BRW_REGISTER_TYPE_UD);
   } else {
      /* Add two because the address is 64-bit */
      const unsigned dwords = 2 + src_comps;
      mlen = dwords * (inst->exec_size / 8);

      fs_reg sources[5];

      sources[0] = addr;

      for (unsigned i = 0; i < src_comps; i++)
         sources[1 + i] = offset(src, bld, i);

      payload = bld.vgrf(BRW_REGISTER_TYPE_UD, dwords);
      bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0);
   }

   uint32_t desc;
   switch (inst->opcode) {
   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                                arg,   /* num_channels */
                                                false  /* write */);
      break;

   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                                arg,   /* num_channels */
                                                true   /* write */);
      break;

   case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
      desc = brw_dp_a64_oword_block_rw_desc(devinfo,
                                            true,    /* align_16B */
                                            arg,     /* num_dwords */
                                            false    /* write */);
      break;

   case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      desc = brw_dp_a64_oword_block_rw_desc(devinfo,
                                            false,   /* align_16B */
                                            arg,     /* num_dwords */
                                            false    /* write */);
      break;

   case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
      desc = brw_dp_a64_oword_block_rw_desc(devinfo,
                                            true,    /* align_16B */
                                            arg,     /* num_dwords */
                                            true     /* write */);
      break;

   case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                               arg,   /* bit_size */
                                               false  /* write */);
      break;

   case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                               arg,   /* bit_size */
                                               true   /* write */);
      break;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
      desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 32,
                                            arg,   /* atomic_op */
                                            !inst->dst.is_null());
      break;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
      desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 16,
                                            arg,   /* atomic_op */
                                            !inst->dst.is_null());
      break;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
      desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 64,
                                            arg,   /* atomic_op */
                                            !inst->dst.is_null());
      break;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
      desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
                                                  16, /* bit_size */
                                                  arg,   /* atomic_op */
                                                  !inst->dst.is_null());
      break;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
      desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
                                                  32, /* bit_size */
                                                  arg,   /* atomic_op */
                                                  !inst->dst.is_null());
      break;

   default:
      unreachable("Unknown A64 logical instruction");
   }

   if (bld.shader->stage == MESA_SHADER_FRAGMENT)
      emit_fragment_mask(bld, inst);

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = header_size;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Set up SFID and descriptors */
   inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
   inst->desc = desc;
   inst->resize_sources(4);
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

static void
lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld,
                                             fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   ASSERTED const brw_compiler *compiler = bld.shader->compiler;

   fs_reg index = inst->src[0];

   /* We are switching the instruction from an ALU-like instruction to a
    * send-from-grf instruction.  Since sends can't handle strides or
    * source modifiers, we have to make a copy of the offset source.
    */
   fs_reg ubo_offset = bld.move_to_vgrf(inst->src[1], 1);

   assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
   unsigned alignment = inst->src[2].ud;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->sfid = GFX12_SFID_UGM;
   inst->resize_sources(3);
   inst->src[0] = brw_imm_ud(0);

   if (index.file == IMM) {
      inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, index.ud));
   } else {
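      /* A dynamic binding table index gets shifted into bits 31:24 of the
       * extended descriptor, mirroring what lsc_bti_ex_desc() does for the
       * immediate case above.
       */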
      const fs_builder ubld = bld.exec_all().group(1, 0);
      fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.SHL(tmp, index, brw_imm_ud(24));
      inst->src[1] = component(tmp, 0);
   }

   assert(!compiler->indirect_ubos_use_sampler);

   inst->src[2] = ubo_offset; /* payload */
   if (alignment >= 4) {
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
                                LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32,
                                4 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_LOAD_L1STATE_L3MOCS,
                                true /* has_dest */);
      inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   } else {
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
                                LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32,
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_LOAD_L1STATE_L3MOCS,
                                true /* has_dest */);
      inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
      /* The byte scattered messages can only read one dword at a time so
       * we have to duplicate the message 4 times to read the full vec4.
       * Hopefully, dead code will clean up the mess if some of them aren't
       * needed.
       */
      assert(inst->size_written == 16 * inst->exec_size);
      inst->size_written /= 4;
      for (unsigned c = 1; c < 4; c++) {
         /* Emit a copy of the instruction because we're about to modify
          * it.  Because this loop starts at 1, we will emit copies for the
          * first 3 and the final one will be the modified instruction.
          */
         bld.emit(*inst);

         /* Offset the source */
         inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD);
         bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));

         /* Offset the destination */
         inst->dst = offset(inst->dst, bld, 1);
      }
   }
}

static void
lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const brw_compiler *compiler = bld.shader->compiler;

   if (devinfo->ver >= 7) {
      fs_reg index = inst->src[0];
      /* We are switching the instruction from an ALU-like instruction to a
       * send-from-grf instruction.  Since sends can't handle strides or
       * source modifiers, we have to make a copy of the offset source.
       */
      fs_reg ubo_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.MOV(ubo_offset, inst->src[1]);

      assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
      unsigned alignment = inst->src[2].ud;

      inst->opcode = SHADER_OPCODE_SEND;
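      /* The payload is one GRF of 32-bit offsets per 8 lanes. */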
      inst->mlen = inst->exec_size / 8;
      inst->resize_sources(3);

      if (index.file == IMM) {
         inst->desc = index.ud & 0xff;
         inst->src[0] = brw_imm_ud(0);
      } else {
         inst->desc = 0;
         const fs_builder ubld = bld.exec_all().group(1, 0);
         fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.AND(tmp, index, brw_imm_ud(0xff));
         inst->src[0] = component(tmp, 0);
      }
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
      inst->src[2] = ubo_offset; /* payload */

      if (compiler->indirect_ubos_use_sampler) {
         const unsigned simd_mode =
            inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
                                   BRW_SAMPLER_SIMD_MODE_SIMD16;

         inst->sfid = BRW_SFID_SAMPLER;
         inst->desc |= brw_sampler_desc(devinfo, 0, 0,
                                        GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
                                        simd_mode, 0);
      } else if (alignment >= 4) {
         inst->sfid = (devinfo->verx10 >= 75 ?
                       HSW_SFID_DATAPORT_DATA_CACHE_1 :
                       GFX7_SFID_DATAPORT_DATA_CACHE);
         inst->desc |= brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                                      4, /* num_channels */
                                                      false   /* write */);
      } else {
         inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
         inst->desc |= brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                                     32,     /* bit_size */
                                                     false   /* write */);
         /* The byte scattered messages can only read one dword at a time so
          * we have to duplicate the message 4 times to read the full vec4.
          * Hopefully, dead code will clean up the mess if some of them aren't
          * needed.
          */
         assert(inst->size_written == 16 * inst->exec_size);
         inst->size_written /= 4;
         for (unsigned c = 1; c < 4; c++) {
            /* Emit a copy of the instruction because we're about to modify
             * it.  Because this loop starts at 1, we will emit copies for the
             * first 3 and the final one will be the modified instruction.
             */
            bld.emit(*inst);

            /* Offset the source */
            inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD);
            bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));

            /* Offset the destination */
            inst->dst = offset(inst->dst, bld, 1);
         }
      }
   } else {
      const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->ver),
                           BRW_REGISTER_TYPE_UD);

      bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]);

      inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4;
      inst->resize_sources(1);
      inst->base_mrf = payload.nr;
      inst->header_size = 1;
      inst->mlen = 1 + inst->exec_size / 8;
   }
}

static void
lower_math_logical_send(const fs_builder &bld, fs_inst *inst)
{
   assert(bld.shader->devinfo->ver < 6);

   inst->base_mrf = 2;
   inst->mlen = inst->sources * inst->exec_size / 8;

   if (inst->sources > 1) {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
      const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
      const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];

      inst->resize_sources(1);
      inst->src[0] = src0;

      assert(inst->exec_size == 8);
      bld.MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), src1);
   }
}

static void
lower_btd_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   fs_reg global_addr = inst->src[0];
   const fs_reg &btd_record = inst->src[1];

   const unsigned mlen = 2;
   const fs_builder ubld = bld.exec_all().group(8, 0);
   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);

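   /* The message header is two GRFs: the global address (or the Stack ID
    * release bit) goes in the first and the per-lane stack IDs in the
    * second.
    */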
   ubld.MOV(header, brw_imm_ud(0));
   switch (inst->opcode) {
   case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
      assert(type_sz(global_addr.type) == 8 && global_addr.stride == 0);
      global_addr.type = BRW_REGISTER_TYPE_UD;
      global_addr.stride = 1;
      ubld.group(2, 0).MOV(header, global_addr);
      break;

   case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
      /* The bottom bit is the Stack ID release bit */
      ubld.group(1, 0).MOV(header, brw_imm_ud(1));
      break;

   default:
      unreachable("Invalid BTD message");
   }

   /* Stack IDs are always in R1 regardless of whether we're coming from a
    * bindless shader or a regular compute shader.
    */
   fs_reg stack_ids =
      retype(byte_offset(header, REG_SIZE), BRW_REGISTER_TYPE_UW);
   bld.MOV(stack_ids, retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW));

   unsigned ex_mlen = 0;
   fs_reg payload;
   if (inst->opcode == SHADER_OPCODE_BTD_SPAWN_LOGICAL) {
      ex_mlen = 2 * (inst->exec_size / 8);
      payload = bld.move_to_vgrf(btd_record, 1);
   } else {
      assert(inst->opcode == SHADER_OPCODE_BTD_RETIRE_LOGICAL);
      /* All these messages take a BTD and things complain if we don't provide
       * one for RETIRE.  However, it shouldn't ever actually get used so fill
       * it with zero.
       */
      ex_mlen = 2 * (inst->exec_size / 8);
      payload = bld.move_to_vgrf(brw_imm_uq(0), 1);
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0; /* HW docs require has_header = false */
   inst->send_has_side_effects = true;
   inst->send_is_volatile = false;

   /* Set up SFID and descriptors */
   inst->sfid = GEN_RT_SFID_BINDLESS_THREAD_DISPATCH;
   inst->desc = brw_btd_spawn_desc(devinfo, inst->exec_size,
                                   GEN_RT_BTD_MESSAGE_SPAWN);
   inst->resize_sources(4);
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = header;
   inst->src[3] = payload;
}

static void
lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   /* The emit_uniformize() in brw_fs_nir.cpp will generate a horizontal
    * stride of 0.  Below we're doing a MOV() in SIMD2.  Since we can't use
    * UQ/Q types on Gfx12.5, we need to tweak the stride with a value of
    * 1 dword so that the MOV operates on 2 components rather than twice on
    * the same component.
    */
   fs_reg globals_addr = retype(inst->src[RT_LOGICAL_SRC_GLOBALS], BRW_REGISTER_TYPE_UD);
   globals_addr.stride = 1;
   const fs_reg &bvh_level =
      inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == BRW_IMMEDIATE_VALUE ?
      inst->src[RT_LOGICAL_SRC_BVH_LEVEL] :
      bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL],
                       inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL));
   const fs_reg &trace_ray_control =
      inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == BRW_IMMEDIATE_VALUE ?
      inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] :
      bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL],
                       inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL));
   const fs_reg &synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS];
   assert(synchronous_src.file == BRW_IMMEDIATE_VALUE);
   const bool synchronous = synchronous_src.ud;

   const unsigned mlen = 1;
   const fs_builder ubld = bld.exec_all().group(8, 0);
   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
   ubld.MOV(header, brw_imm_ud(0));
   ubld.group(2, 0).MOV(header, globals_addr);
   if (synchronous)
      ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous));

   const unsigned ex_mlen = inst->exec_size / 8;
   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);
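   /* The payload packs bvh_level into bits 2:0 and trace_ray_control into
    * bits 9:8, matching both the immediate and the SHL+OR paths below.
    */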
   if (bvh_level.file == BRW_IMMEDIATE_VALUE &&
       trace_ray_control.file == BRW_IMMEDIATE_VALUE) {
      bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, 9, 8) |
                                  (bvh_level.ud & 0x7)));
   } else {
      bld.SHL(payload, trace_ray_control, brw_imm_ud(8));
      bld.OR(payload, payload, bvh_level);
   }

   /* When doing synchronous traversal, the HW implicitly computes the
    * stack_id using the following formula:
    *
    *    EUID[3:0] & THREAD_ID[2:0] & SIMD_LANE_ID[3:0]
    *
    * Only in the asynchronous case do we need to set the stack_id in the
    * payload register.
    */
   if (!synchronous) {
      bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
              retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW),
              brw_imm_uw(0x7ff));
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0; /* HW docs require has_header = false */
   inst->send_has_side_effects = true;
   inst->send_is_volatile = false;

   /* Set up SFID and descriptors */
   inst->sfid = GEN_RT_SFID_RAY_TRACE_ACCELERATOR;
   inst->desc = brw_rt_trace_ray_desc(devinfo, inst->exec_size);
   inst->resize_sources(4);
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = header;
   inst->src[3] = payload;
}

bool
fs_visitor::lower_logical_sends()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      const fs_builder ibld(this, block, inst);

      switch (inst->opcode) {
      case FS_OPCODE_FB_WRITE_LOGICAL:
         assert(stage == MESA_SHADER_FRAGMENT);
         lower_fb_write_logical_send(ibld, inst,
                                     brw_wm_prog_data(prog_data),
                                     (const brw_wm_prog_key *)key,
                                     payload);
         break;

      case FS_OPCODE_FB_READ_LOGICAL:
         lower_fb_read_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_TEX_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
         break;

      case SHADER_OPCODE_TXD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
         break;

      case SHADER_OPCODE_TXF_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
         break;

      case SHADER_OPCODE_TXL_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
         break;

      case SHADER_OPCODE_TXS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
         break;

      case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
         lower_sampler_logical_send(ibld, inst,
                                    SHADER_OPCODE_IMAGE_SIZE_LOGICAL);
         break;

      case FS_OPCODE_TXB_LOGICAL:
         lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
         break;

      case SHADER_OPCODE_TXF_CMS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
         break;

      case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
      case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
         break;

      case SHADER_OPCODE_TXF_UMS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
         break;

      case SHADER_OPCODE_TXF_MCS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
         break;

      case SHADER_OPCODE_LOD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
         break;

      case SHADER_OPCODE_TG4_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
         break;

      case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
         break;

      case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO);
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
      case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
         if (devinfo->has_lsc) {
            lower_lsc_surface_logical_send(ibld, inst);
            break;
         }
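         /* Fallthrough on platforms without LSC: handled by the legacy
          * surface lowering below.
          */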
      case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
         lower_surface_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
         lower_surface_block_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL:
         if (devinfo->has_lsc) {
            lower_lsc_a64_logical_send(ibld, inst);
            break;
         }
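         /* Fallthrough on platforms without LSC: handled by the legacy A64
          * lowering below.
          */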
      case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
         lower_a64_logical_send(ibld, inst);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
         if (devinfo->has_lsc && !compiler->indirect_ubos_use_sampler)
            lower_lsc_varying_pull_constant_logical_send(ibld, inst);
         else
            lower_varying_pull_constant_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
      case SHADER_OPCODE_POW:
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
         /* The math opcodes are overloaded for the send-like and
          * expression-like instructions which seems kind of icky.  Gfx6+ has
          * a native (but rather quirky) MATH instruction so we don't need to
          * do anything here.  On Gfx4-5 we'll have to lower the Gfx6-like
          * logical instructions (which we can easily recognize because they
          * have mlen = 0) into send-like virtual instructions.
          */
         if (devinfo->ver < 6 && inst->mlen == 0) {
            lower_math_logical_send(ibld, inst);
            break;

         } else {
            continue;
         }

      case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
      case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
         lower_btd_logical_send(ibld, inst);
         break;

      case RT_OPCODE_TRACE_RAY_LOGICAL:
         lower_trace_ray_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_URB_READ_LOGICAL:
         lower_urb_read_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_URB_WRITE_LOGICAL:
         lower_urb_write_logical_send(ibld, inst);
         break;

      default:
         continue;
      }

      progress = true;
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}