1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright © 2010 Intel Corporation
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
13bf215546Sopenharmony_ci * Software.
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21bf215546Sopenharmony_ci * IN THE SOFTWARE.
22bf215546Sopenharmony_ci */
23bf215546Sopenharmony_ci
24bf215546Sopenharmony_ci/** @file brw_fs_generator.cpp
25bf215546Sopenharmony_ci *
26bf215546Sopenharmony_ci * This file supports generating code from the FS LIR to the actual
27bf215546Sopenharmony_ci * native instructions.
28bf215546Sopenharmony_ci */
29bf215546Sopenharmony_ci
30bf215546Sopenharmony_ci#include "brw_eu.h"
31bf215546Sopenharmony_ci#include "brw_fs.h"
32bf215546Sopenharmony_ci#include "brw_cfg.h"
33bf215546Sopenharmony_ci#include "util/mesa-sha1.h"
34bf215546Sopenharmony_ci#include "util/half_float.h"
35bf215546Sopenharmony_ci
36bf215546Sopenharmony_cistatic enum brw_reg_file
37bf215546Sopenharmony_cibrw_file_from_reg(fs_reg *reg)
38bf215546Sopenharmony_ci{
39bf215546Sopenharmony_ci   switch (reg->file) {
40bf215546Sopenharmony_ci   case ARF:
41bf215546Sopenharmony_ci      return BRW_ARCHITECTURE_REGISTER_FILE;
42bf215546Sopenharmony_ci   case FIXED_GRF:
43bf215546Sopenharmony_ci   case VGRF:
44bf215546Sopenharmony_ci      return BRW_GENERAL_REGISTER_FILE;
45bf215546Sopenharmony_ci   case MRF:
46bf215546Sopenharmony_ci      return BRW_MESSAGE_REGISTER_FILE;
47bf215546Sopenharmony_ci   case IMM:
48bf215546Sopenharmony_ci      return BRW_IMMEDIATE_VALUE;
49bf215546Sopenharmony_ci   case BAD_FILE:
50bf215546Sopenharmony_ci   case ATTR:
51bf215546Sopenharmony_ci   case UNIFORM:
52bf215546Sopenharmony_ci      unreachable("not reached");
53bf215546Sopenharmony_ci   }
54bf215546Sopenharmony_ci   return BRW_ARCHITECTURE_REGISTER_FILE;
55bf215546Sopenharmony_ci}
56bf215546Sopenharmony_ci
57bf215546Sopenharmony_cistatic struct brw_reg
58bf215546Sopenharmony_cibrw_reg_from_fs_reg(const struct intel_device_info *devinfo, fs_inst *inst,
59bf215546Sopenharmony_ci                    fs_reg *reg, bool compressed)
60bf215546Sopenharmony_ci{
61bf215546Sopenharmony_ci   struct brw_reg brw_reg;
62bf215546Sopenharmony_ci
63bf215546Sopenharmony_ci   switch (reg->file) {
64bf215546Sopenharmony_ci   case MRF:
65bf215546Sopenharmony_ci      assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver));
66bf215546Sopenharmony_ci      FALLTHROUGH;
67bf215546Sopenharmony_ci   case VGRF:
68bf215546Sopenharmony_ci      if (reg->stride == 0) {
69bf215546Sopenharmony_ci         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
70bf215546Sopenharmony_ci      } else {
71bf215546Sopenharmony_ci         /* From the Haswell PRM:
72bf215546Sopenharmony_ci          *
73bf215546Sopenharmony_ci          *  "VertStride must be used to cross GRF register boundaries. This
74bf215546Sopenharmony_ci          *   rule implies that elements within a 'Width' cannot cross GRF
75bf215546Sopenharmony_ci          *   boundaries."
76bf215546Sopenharmony_ci          *
77bf215546Sopenharmony_ci          * The maximum width value that could satisfy this restriction is:
78bf215546Sopenharmony_ci          */
79bf215546Sopenharmony_ci         const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
80bf215546Sopenharmony_ci
81bf215546Sopenharmony_ci         /* Because the hardware can only split source regions at a whole
82bf215546Sopenharmony_ci          * multiple of width during decompression (i.e. vertically), clamp
83bf215546Sopenharmony_ci          * the value obtained above to the physical execution size of a
84bf215546Sopenharmony_ci          * single decompressed chunk of the instruction:
85bf215546Sopenharmony_ci          */
86bf215546Sopenharmony_ci         const unsigned phys_width = compressed ? inst->exec_size / 2 :
87bf215546Sopenharmony_ci                                     inst->exec_size;
88bf215546Sopenharmony_ci
89bf215546Sopenharmony_ci         const unsigned max_hw_width = 16;
90bf215546Sopenharmony_ci
91bf215546Sopenharmony_ci         /* XXX - The equation above is strictly speaking not correct on
92bf215546Sopenharmony_ci          *       hardware that supports unbalanced GRF writes -- On Gfx9+
93bf215546Sopenharmony_ci          *       each decompressed chunk of the instruction may have a
94bf215546Sopenharmony_ci          *       different execution size when the number of components
95bf215546Sopenharmony_ci          *       written to each destination GRF is not the same.
96bf215546Sopenharmony_ci          */
97bf215546Sopenharmony_ci         if (reg->stride > 4) {
98bf215546Sopenharmony_ci            assert(reg != &inst->dst);
99bf215546Sopenharmony_ci            assert(reg->stride * type_sz(reg->type) <= REG_SIZE);
100bf215546Sopenharmony_ci            brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0);
101bf215546Sopenharmony_ci            brw_reg = stride(brw_reg, reg->stride, 1, 0);
102bf215546Sopenharmony_ci         } else {
103bf215546Sopenharmony_ci            const unsigned width = MIN3(reg_width, phys_width, max_hw_width);
104bf215546Sopenharmony_ci            brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
105bf215546Sopenharmony_ci            brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
106bf215546Sopenharmony_ci         }
107bf215546Sopenharmony_ci
108bf215546Sopenharmony_ci         if (devinfo->verx10 == 70) {
109bf215546Sopenharmony_ci            /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13):
110bf215546Sopenharmony_ci             *  "Each DF (Double Float) operand uses an element size of 4 rather
111bf215546Sopenharmony_ci             *   than 8 and all regioning parameters are twice what the values
112bf215546Sopenharmony_ci             *   would be based on the true element size: ExecSize, Width,
113bf215546Sopenharmony_ci             *   HorzStride, and VertStride. Each DF operand uses a pair of
114bf215546Sopenharmony_ci             *   channels and all masking and swizzing should be adjusted
115bf215546Sopenharmony_ci             *   appropriately."
116bf215546Sopenharmony_ci             *
117bf215546Sopenharmony_ci             * From the IvyBridge PRM (Special Requirements for Handling Double
118bf215546Sopenharmony_ci             * Precision Data Types, page 71):
119bf215546Sopenharmony_ci             *  "In Align1 mode, all regioning parameters like stride, execution
120bf215546Sopenharmony_ci             *   size, and width must use the syntax of a pair of packed
121bf215546Sopenharmony_ci             *   floats. The offsets for these data types must be 64-bit
122bf215546Sopenharmony_ci             *   aligned. The execution size and regioning parameters are in terms
123bf215546Sopenharmony_ci             *   of floats."
124bf215546Sopenharmony_ci             *
125bf215546Sopenharmony_ci             * Summarized: when handling DF-typed arguments, ExecSize,
126bf215546Sopenharmony_ci             * VertStride, and Width must be doubled.
127bf215546Sopenharmony_ci             *
128bf215546Sopenharmony_ci             * It applies to BayTrail too.
129bf215546Sopenharmony_ci             */
130bf215546Sopenharmony_ci            if (type_sz(reg->type) == 8) {
131bf215546Sopenharmony_ci               brw_reg.width++;
132bf215546Sopenharmony_ci               if (brw_reg.vstride > 0)
133bf215546Sopenharmony_ci                  brw_reg.vstride++;
134bf215546Sopenharmony_ci               assert(brw_reg.hstride == BRW_HORIZONTAL_STRIDE_1);
135bf215546Sopenharmony_ci            }
136bf215546Sopenharmony_ci
137bf215546Sopenharmony_ci            /* When converting from DF->F, we set the destination stride to 2
138bf215546Sopenharmony_ci             * because each d2f conversion implicitly writes 2 floats, being
139bf215546Sopenharmony_ci             * the first one the converted value. IVB/BYT actually writes two
140bf215546Sopenharmony_ci             * F components per SIMD channel, and every other component is
141bf215546Sopenharmony_ci             * filled with garbage.
142bf215546Sopenharmony_ci             */
143bf215546Sopenharmony_ci            if (reg == &inst->dst && get_exec_type_size(inst) == 8 &&
144bf215546Sopenharmony_ci                type_sz(inst->dst.type) < 8) {
145bf215546Sopenharmony_ci               assert(brw_reg.hstride > BRW_HORIZONTAL_STRIDE_1);
146bf215546Sopenharmony_ci               brw_reg.hstride--;
147bf215546Sopenharmony_ci            }
148bf215546Sopenharmony_ci         }
149bf215546Sopenharmony_ci      }
150bf215546Sopenharmony_ci
151bf215546Sopenharmony_ci      brw_reg = retype(brw_reg, reg->type);
152bf215546Sopenharmony_ci      brw_reg = byte_offset(brw_reg, reg->offset);
153bf215546Sopenharmony_ci      brw_reg.abs = reg->abs;
154bf215546Sopenharmony_ci      brw_reg.negate = reg->negate;
155bf215546Sopenharmony_ci      break;
156bf215546Sopenharmony_ci   case ARF:
157bf215546Sopenharmony_ci   case FIXED_GRF:
158bf215546Sopenharmony_ci   case IMM:
159bf215546Sopenharmony_ci      assert(reg->offset == 0);
160bf215546Sopenharmony_ci      brw_reg = reg->as_brw_reg();
161bf215546Sopenharmony_ci      break;
162bf215546Sopenharmony_ci   case BAD_FILE:
163bf215546Sopenharmony_ci      /* Probably unused. */
164bf215546Sopenharmony_ci      brw_reg = brw_null_reg();
165bf215546Sopenharmony_ci      break;
166bf215546Sopenharmony_ci   case ATTR:
167bf215546Sopenharmony_ci   case UNIFORM:
168bf215546Sopenharmony_ci      unreachable("not reached");
169bf215546Sopenharmony_ci   }
170bf215546Sopenharmony_ci
171bf215546Sopenharmony_ci   /* On HSW+, scalar DF sources can be accessed using the normal <0,1,0>
172bf215546Sopenharmony_ci    * region, but on IVB and BYT DF regions must be programmed in terms of
173bf215546Sopenharmony_ci    * floats. A <0,2,1> region accomplishes this.
174bf215546Sopenharmony_ci    */
175bf215546Sopenharmony_ci   if (devinfo->verx10 == 70 &&
176bf215546Sopenharmony_ci       type_sz(reg->type) == 8 &&
177bf215546Sopenharmony_ci       brw_reg.vstride == BRW_VERTICAL_STRIDE_0 &&
178bf215546Sopenharmony_ci       brw_reg.width == BRW_WIDTH_1 &&
179bf215546Sopenharmony_ci       brw_reg.hstride == BRW_HORIZONTAL_STRIDE_0) {
180bf215546Sopenharmony_ci      brw_reg.width = BRW_WIDTH_2;
181bf215546Sopenharmony_ci      brw_reg.hstride = BRW_HORIZONTAL_STRIDE_1;
182bf215546Sopenharmony_ci   }
183bf215546Sopenharmony_ci
184bf215546Sopenharmony_ci   return brw_reg;
185bf215546Sopenharmony_ci}
186bf215546Sopenharmony_ci
187bf215546Sopenharmony_cifs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
188bf215546Sopenharmony_ci                           void *mem_ctx,
189bf215546Sopenharmony_ci                           struct brw_stage_prog_data *prog_data,
190bf215546Sopenharmony_ci                           bool runtime_check_aads_emit,
191bf215546Sopenharmony_ci                           gl_shader_stage stage)
192bf215546Sopenharmony_ci
193bf215546Sopenharmony_ci   : compiler(compiler), log_data(log_data),
194bf215546Sopenharmony_ci     devinfo(compiler->devinfo),
195bf215546Sopenharmony_ci     prog_data(prog_data), dispatch_width(0),
196bf215546Sopenharmony_ci     runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
197bf215546Sopenharmony_ci     shader_name(NULL), stage(stage), mem_ctx(mem_ctx)
198bf215546Sopenharmony_ci{
199bf215546Sopenharmony_ci   p = rzalloc(mem_ctx, struct brw_codegen);
200bf215546Sopenharmony_ci   brw_init_codegen(&compiler->isa, p, mem_ctx);
201bf215546Sopenharmony_ci
202bf215546Sopenharmony_ci   /* In the FS code generator, we are very careful to ensure that we always
203bf215546Sopenharmony_ci    * set the right execution size so we don't need the EU code to "help" us
204bf215546Sopenharmony_ci    * by trying to infer it.  Sometimes, it infers the wrong thing.
205bf215546Sopenharmony_ci    */
206bf215546Sopenharmony_ci   p->automatic_exec_sizes = false;
207bf215546Sopenharmony_ci}
208bf215546Sopenharmony_ci
/* The destructor body is intentionally empty; nothing is released here.
 * NOTE(review): the codegen state is rzalloc'ed against mem_ctx in the
 * constructor, so it is presumably freed together with that context --
 * confirm against the owner of mem_ctx.
 */
fs_generator::~fs_generator()
{
}
212bf215546Sopenharmony_ci
213bf215546Sopenharmony_ciclass ip_record : public exec_node {
214bf215546Sopenharmony_cipublic:
215bf215546Sopenharmony_ci   DECLARE_RALLOC_CXX_OPERATORS(ip_record)
216bf215546Sopenharmony_ci
217bf215546Sopenharmony_ci   ip_record(int ip)
218bf215546Sopenharmony_ci   {
219bf215546Sopenharmony_ci      this->ip = ip;
220bf215546Sopenharmony_ci   }
221bf215546Sopenharmony_ci
222bf215546Sopenharmony_ci   int ip;
223bf215546Sopenharmony_ci};
224bf215546Sopenharmony_ci
225bf215546Sopenharmony_cibool
226bf215546Sopenharmony_cifs_generator::patch_halt_jumps()
227bf215546Sopenharmony_ci{
228bf215546Sopenharmony_ci   if (this->discard_halt_patches.is_empty())
229bf215546Sopenharmony_ci      return false;
230bf215546Sopenharmony_ci
231bf215546Sopenharmony_ci   int scale = brw_jump_scale(p->devinfo);
232bf215546Sopenharmony_ci
233bf215546Sopenharmony_ci   if (devinfo->ver >= 6) {
234bf215546Sopenharmony_ci      /* There is a somewhat strange undocumented requirement of using
235bf215546Sopenharmony_ci       * HALT, according to the simulator.  If some channel has HALTed to
236bf215546Sopenharmony_ci       * a particular UIP, then by the end of the program, every channel
237bf215546Sopenharmony_ci       * must have HALTed to that UIP.  Furthermore, the tracking is a
238bf215546Sopenharmony_ci       * stack, so you can't do the final halt of a UIP after starting
239bf215546Sopenharmony_ci       * halting to a new UIP.
240bf215546Sopenharmony_ci       *
241bf215546Sopenharmony_ci       * Symptoms of not emitting this instruction on actual hardware
242bf215546Sopenharmony_ci       * included GPU hangs and sparkly rendering on the piglit discard
243bf215546Sopenharmony_ci       * tests.
244bf215546Sopenharmony_ci       */
245bf215546Sopenharmony_ci      brw_inst *last_halt = brw_HALT(p);
246bf215546Sopenharmony_ci      brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
247bf215546Sopenharmony_ci      brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
248bf215546Sopenharmony_ci   }
249bf215546Sopenharmony_ci
250bf215546Sopenharmony_ci   int ip = p->nr_insn;
251bf215546Sopenharmony_ci
252bf215546Sopenharmony_ci   foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
253bf215546Sopenharmony_ci      brw_inst *patch = &p->store[patch_ip->ip];
254bf215546Sopenharmony_ci
255bf215546Sopenharmony_ci      assert(brw_inst_opcode(p->isa, patch) == BRW_OPCODE_HALT);
256bf215546Sopenharmony_ci      if (devinfo->ver >= 6) {
257bf215546Sopenharmony_ci         /* HALT takes a half-instruction distance from the pre-incremented IP. */
258bf215546Sopenharmony_ci         brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
259bf215546Sopenharmony_ci      } else {
260bf215546Sopenharmony_ci         brw_set_src1(p, patch, brw_imm_d((ip - patch_ip->ip) * scale));
261bf215546Sopenharmony_ci      }
262bf215546Sopenharmony_ci   }
263bf215546Sopenharmony_ci
264bf215546Sopenharmony_ci   this->discard_halt_patches.make_empty();
265bf215546Sopenharmony_ci
266bf215546Sopenharmony_ci   if (devinfo->ver < 6) {
267bf215546Sopenharmony_ci      /* From the g965 PRM:
268bf215546Sopenharmony_ci       *
269bf215546Sopenharmony_ci       *    "As DMask is not automatically reloaded into AMask upon completion
270bf215546Sopenharmony_ci       *    of this instruction, software has to manually restore AMask upon
271bf215546Sopenharmony_ci       *    completion."
272bf215546Sopenharmony_ci       *
273bf215546Sopenharmony_ci       * DMask lives in the bottom 16 bits of sr0.1.
274bf215546Sopenharmony_ci       */
275bf215546Sopenharmony_ci      brw_inst *reset = brw_MOV(p, brw_mask_reg(BRW_AMASK),
276bf215546Sopenharmony_ci                                   retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW));
277bf215546Sopenharmony_ci      brw_inst_set_exec_size(devinfo, reset, BRW_EXECUTE_1);
278bf215546Sopenharmony_ci      brw_inst_set_mask_control(devinfo, reset, BRW_MASK_DISABLE);
279bf215546Sopenharmony_ci      brw_inst_set_qtr_control(devinfo, reset, BRW_COMPRESSION_NONE);
280bf215546Sopenharmony_ci      brw_inst_set_thread_control(devinfo, reset, BRW_THREAD_SWITCH);
281bf215546Sopenharmony_ci   }
282bf215546Sopenharmony_ci
283bf215546Sopenharmony_ci   if (devinfo->ver == 4 && devinfo->platform != INTEL_PLATFORM_G4X) {
284bf215546Sopenharmony_ci      /* From the g965 PRM:
285bf215546Sopenharmony_ci       *
286bf215546Sopenharmony_ci       *    "[DevBW, DevCL] Erratum: The subfields in mask stack register are
287bf215546Sopenharmony_ci       *    reset to zero during graphics reset, however, they are not
288bf215546Sopenharmony_ci       *    initialized at thread dispatch. These subfields will retain the
289bf215546Sopenharmony_ci       *    values from the previous thread. Software should make sure the
290bf215546Sopenharmony_ci       *    mask stack is empty (reset to zero) before terminating the thread.
291bf215546Sopenharmony_ci       *    In case that this is not practical, software may have to reset the
292bf215546Sopenharmony_ci       *    mask stack at the beginning of each kernel, which will impact the
293bf215546Sopenharmony_ci       *    performance."
294bf215546Sopenharmony_ci       *
295bf215546Sopenharmony_ci       * Luckily we can rely on:
296bf215546Sopenharmony_ci       *
297bf215546Sopenharmony_ci       *    "[DevBW, DevCL] This register access restriction is not
298bf215546Sopenharmony_ci       *    applicable, hardware does ensure execution pipeline coherency,
299bf215546Sopenharmony_ci       *    when a mask stack register is used as an explicit source and/or
300bf215546Sopenharmony_ci       *    destination."
301bf215546Sopenharmony_ci       */
302bf215546Sopenharmony_ci      brw_push_insn_state(p);
303bf215546Sopenharmony_ci      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
304bf215546Sopenharmony_ci      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
305bf215546Sopenharmony_ci
306bf215546Sopenharmony_ci      brw_set_default_exec_size(p, BRW_EXECUTE_2);
307bf215546Sopenharmony_ci      brw_MOV(p, vec2(brw_mask_stack_depth_reg(0)), brw_imm_uw(0));
308bf215546Sopenharmony_ci
309bf215546Sopenharmony_ci      brw_set_default_exec_size(p, BRW_EXECUTE_16);
310bf215546Sopenharmony_ci      /* Reset the if stack. */
311bf215546Sopenharmony_ci      brw_MOV(p, retype(brw_mask_stack_reg(0), BRW_REGISTER_TYPE_UW),
312bf215546Sopenharmony_ci              brw_imm_uw(0));
313bf215546Sopenharmony_ci
314bf215546Sopenharmony_ci      brw_pop_insn_state(p);
315bf215546Sopenharmony_ci   }
316bf215546Sopenharmony_ci
317bf215546Sopenharmony_ci   return true;
318bf215546Sopenharmony_ci}
319bf215546Sopenharmony_ci
320bf215546Sopenharmony_civoid
321bf215546Sopenharmony_cifs_generator::generate_send(fs_inst *inst,
322bf215546Sopenharmony_ci                            struct brw_reg dst,
323bf215546Sopenharmony_ci                            struct brw_reg desc,
324bf215546Sopenharmony_ci                            struct brw_reg ex_desc,
325bf215546Sopenharmony_ci                            struct brw_reg payload,
326bf215546Sopenharmony_ci                            struct brw_reg payload2)
327bf215546Sopenharmony_ci{
328bf215546Sopenharmony_ci   const bool dst_is_null = dst.file == BRW_ARCHITECTURE_REGISTER_FILE &&
329bf215546Sopenharmony_ci                            dst.nr == BRW_ARF_NULL;
330bf215546Sopenharmony_ci   const unsigned rlen = dst_is_null ? 0 : inst->size_written / REG_SIZE;
331bf215546Sopenharmony_ci
332bf215546Sopenharmony_ci   uint32_t desc_imm = inst->desc |
333bf215546Sopenharmony_ci      brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size);
334bf215546Sopenharmony_ci
335bf215546Sopenharmony_ci   uint32_t ex_desc_imm = inst->ex_desc |
336bf215546Sopenharmony_ci      brw_message_ex_desc(devinfo, inst->ex_mlen);
337bf215546Sopenharmony_ci
338bf215546Sopenharmony_ci   if (ex_desc.file != BRW_IMMEDIATE_VALUE || ex_desc.ud || ex_desc_imm) {
339bf215546Sopenharmony_ci      /* If we have any sort of extended descriptor, then we need SENDS.  This
340bf215546Sopenharmony_ci       * also covers the dual-payload case because ex_mlen goes in ex_desc.
341bf215546Sopenharmony_ci       */
342bf215546Sopenharmony_ci      brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2,
343bf215546Sopenharmony_ci                                      desc, desc_imm, ex_desc, ex_desc_imm,
344bf215546Sopenharmony_ci                                      inst->eot);
345bf215546Sopenharmony_ci      if (inst->check_tdr)
346bf215546Sopenharmony_ci         brw_inst_set_opcode(p->isa, brw_last_inst,
347bf215546Sopenharmony_ci                             devinfo->ver >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC);
348bf215546Sopenharmony_ci   } else {
349bf215546Sopenharmony_ci      brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm,
350bf215546Sopenharmony_ci                                   inst->eot);
351bf215546Sopenharmony_ci      if (inst->check_tdr)
352bf215546Sopenharmony_ci         brw_inst_set_opcode(p->isa, brw_last_inst, BRW_OPCODE_SENDC);
353bf215546Sopenharmony_ci   }
354bf215546Sopenharmony_ci}
355bf215546Sopenharmony_ci
356bf215546Sopenharmony_civoid
357bf215546Sopenharmony_cifs_generator::fire_fb_write(fs_inst *inst,
358bf215546Sopenharmony_ci                            struct brw_reg payload,
359bf215546Sopenharmony_ci                            struct brw_reg implied_header,
360bf215546Sopenharmony_ci                            GLuint nr)
361bf215546Sopenharmony_ci{
362bf215546Sopenharmony_ci   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
363bf215546Sopenharmony_ci
364bf215546Sopenharmony_ci   if (devinfo->ver < 6) {
365bf215546Sopenharmony_ci      brw_push_insn_state(p);
366bf215546Sopenharmony_ci      brw_set_default_exec_size(p, BRW_EXECUTE_8);
367bf215546Sopenharmony_ci      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
368bf215546Sopenharmony_ci      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
369bf215546Sopenharmony_ci      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
370bf215546Sopenharmony_ci      brw_MOV(p, offset(retype(payload, BRW_REGISTER_TYPE_UD), 1),
371bf215546Sopenharmony_ci              offset(retype(implied_header, BRW_REGISTER_TYPE_UD), 1));
372bf215546Sopenharmony_ci      brw_pop_insn_state(p);
373bf215546Sopenharmony_ci   }
374bf215546Sopenharmony_ci
375bf215546Sopenharmony_ci   uint32_t msg_control = brw_fb_write_msg_control(inst, prog_data);
376bf215546Sopenharmony_ci
377bf215546Sopenharmony_ci   /* We assume render targets start at 0, because headerless FB write
378bf215546Sopenharmony_ci    * messages set "Render Target Index" to 0.  Using a different binding
379bf215546Sopenharmony_ci    * table index would make it impossible to use headerless messages.
380bf215546Sopenharmony_ci    */
381bf215546Sopenharmony_ci   const uint32_t surf_index = inst->target;
382bf215546Sopenharmony_ci
383bf215546Sopenharmony_ci   brw_inst *insn = brw_fb_WRITE(p,
384bf215546Sopenharmony_ci                                 payload,
385bf215546Sopenharmony_ci                                 retype(implied_header, BRW_REGISTER_TYPE_UW),
386bf215546Sopenharmony_ci                                 msg_control,
387bf215546Sopenharmony_ci                                 surf_index,
388bf215546Sopenharmony_ci                                 nr,
389bf215546Sopenharmony_ci                                 0,
390bf215546Sopenharmony_ci                                 inst->eot,
391bf215546Sopenharmony_ci                                 inst->last_rt,
392bf215546Sopenharmony_ci                                 inst->header_size != 0);
393bf215546Sopenharmony_ci
394bf215546Sopenharmony_ci   if (devinfo->ver >= 6)
395bf215546Sopenharmony_ci      brw_inst_set_rt_slot_group(devinfo, insn, inst->group / 16);
396bf215546Sopenharmony_ci}
397bf215546Sopenharmony_ci
398bf215546Sopenharmony_civoid
399bf215546Sopenharmony_cifs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
400bf215546Sopenharmony_ci{
401bf215546Sopenharmony_ci   if (devinfo->verx10 <= 70) {
402bf215546Sopenharmony_ci      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
403bf215546Sopenharmony_ci      brw_set_default_flag_reg(p, 0, 0);
404bf215546Sopenharmony_ci   }
405bf215546Sopenharmony_ci
406bf215546Sopenharmony_ci   const struct brw_reg implied_header =
407bf215546Sopenharmony_ci      devinfo->ver < 6 ? payload : brw_null_reg();
408bf215546Sopenharmony_ci
409bf215546Sopenharmony_ci   if (inst->base_mrf >= 0)
410bf215546Sopenharmony_ci      payload = brw_message_reg(inst->base_mrf);
411bf215546Sopenharmony_ci
412bf215546Sopenharmony_ci   if (!runtime_check_aads_emit) {
413bf215546Sopenharmony_ci      fire_fb_write(inst, payload, implied_header, inst->mlen);
414bf215546Sopenharmony_ci   } else {
415bf215546Sopenharmony_ci      /* This can only happen in gen < 6 */
416bf215546Sopenharmony_ci      assert(devinfo->ver < 6);
417bf215546Sopenharmony_ci
418bf215546Sopenharmony_ci      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
419bf215546Sopenharmony_ci
420bf215546Sopenharmony_ci      /* Check runtime bit to detect if we have to send AA data or not */
421bf215546Sopenharmony_ci      brw_push_insn_state(p);
422bf215546Sopenharmony_ci      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
423bf215546Sopenharmony_ci      brw_set_default_exec_size(p, BRW_EXECUTE_1);
424bf215546Sopenharmony_ci      brw_AND(p,
425bf215546Sopenharmony_ci              v1_null_ud,
426bf215546Sopenharmony_ci              retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
427bf215546Sopenharmony_ci              brw_imm_ud(1<<26));
428bf215546Sopenharmony_ci      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
429bf215546Sopenharmony_ci
430bf215546Sopenharmony_ci      int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
431bf215546Sopenharmony_ci      brw_pop_insn_state(p);
432bf215546Sopenharmony_ci      {
433bf215546Sopenharmony_ci         /* Don't send AA data */
434bf215546Sopenharmony_ci         fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
435bf215546Sopenharmony_ci      }
436bf215546Sopenharmony_ci      brw_land_fwd_jump(p, jmp);
437bf215546Sopenharmony_ci      fire_fb_write(inst, payload, implied_header, inst->mlen);
438bf215546Sopenharmony_ci   }
439bf215546Sopenharmony_ci}
440bf215546Sopenharmony_ci
441bf215546Sopenharmony_civoid
442bf215546Sopenharmony_cifs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst,
443bf215546Sopenharmony_ci                               struct brw_reg payload)
444bf215546Sopenharmony_ci{
445bf215546Sopenharmony_ci   assert(inst->size_written % REG_SIZE == 0);
446bf215546Sopenharmony_ci   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
447bf215546Sopenharmony_ci   /* We assume that render targets start at binding table index 0. */
448bf215546Sopenharmony_ci   const unsigned surf_index = inst->target;
449bf215546Sopenharmony_ci
450bf215546Sopenharmony_ci   gfx9_fb_READ(p, dst, payload, surf_index,
451bf215546Sopenharmony_ci                inst->header_size, inst->size_written / REG_SIZE,
452bf215546Sopenharmony_ci                prog_data->persample_dispatch);
453bf215546Sopenharmony_ci}
454bf215546Sopenharmony_ci
455bf215546Sopenharmony_civoid
456bf215546Sopenharmony_cifs_generator::generate_mov_indirect(fs_inst *inst,
457bf215546Sopenharmony_ci                                    struct brw_reg dst,
458bf215546Sopenharmony_ci                                    struct brw_reg reg,
459bf215546Sopenharmony_ci                                    struct brw_reg indirect_byte_offset)
460bf215546Sopenharmony_ci{
461bf215546Sopenharmony_ci   assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
462bf215546Sopenharmony_ci   assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
463bf215546Sopenharmony_ci   assert(!reg.abs && !reg.negate);
464bf215546Sopenharmony_ci   assert(reg.type == dst.type);
465bf215546Sopenharmony_ci
466bf215546Sopenharmony_ci   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;
467bf215546Sopenharmony_ci
468bf215546Sopenharmony_ci   if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
469bf215546Sopenharmony_ci      imm_byte_offset += indirect_byte_offset.ud;
470bf215546Sopenharmony_ci
471bf215546Sopenharmony_ci      reg.nr = imm_byte_offset / REG_SIZE;
472bf215546Sopenharmony_ci      reg.subnr = imm_byte_offset % REG_SIZE;
473bf215546Sopenharmony_ci      if (type_sz(reg.type) > 4 && !devinfo->has_64bit_float) {
474bf215546Sopenharmony_ci         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
475bf215546Sopenharmony_ci                    subscript(reg, BRW_REGISTER_TYPE_D, 0));
476bf215546Sopenharmony_ci         brw_set_default_swsb(p, tgl_swsb_null());
477bf215546Sopenharmony_ci         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
478bf215546Sopenharmony_ci                    subscript(reg, BRW_REGISTER_TYPE_D, 1));
479bf215546Sopenharmony_ci      } else {
480bf215546Sopenharmony_ci         brw_MOV(p, dst, reg);
481bf215546Sopenharmony_ci      }
482bf215546Sopenharmony_ci   } else {
483bf215546Sopenharmony_ci      /* Prior to Broadwell, there are only 8 address registers. */
484bf215546Sopenharmony_ci      assert(inst->exec_size <= 8 || devinfo->ver >= 8);
485bf215546Sopenharmony_ci
486bf215546Sopenharmony_ci      /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
487bf215546Sopenharmony_ci      struct brw_reg addr = vec8(brw_address_reg(0));
488bf215546Sopenharmony_ci
489bf215546Sopenharmony_ci      /* Whether we can use destination dependency control without running the
490bf215546Sopenharmony_ci       * risk of a hang if an instruction gets shot down.
491bf215546Sopenharmony_ci       */
492bf215546Sopenharmony_ci      const bool use_dep_ctrl = !inst->predicate &&
493bf215546Sopenharmony_ci                                inst->exec_size == dispatch_width;
494bf215546Sopenharmony_ci      brw_inst *insn;
495bf215546Sopenharmony_ci
496bf215546Sopenharmony_ci      /* The destination stride of an instruction (in bytes) must be greater
497bf215546Sopenharmony_ci       * than or equal to the size of the rest of the instruction.  Since the
498bf215546Sopenharmony_ci       * address register is of type UW, we can't use a D-type instruction.
       * In order to get around this, we retype to UW and use a stride.
500bf215546Sopenharmony_ci       */
501bf215546Sopenharmony_ci      indirect_byte_offset =
502bf215546Sopenharmony_ci         retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
503bf215546Sopenharmony_ci
504bf215546Sopenharmony_ci      /* There are a number of reasons why we don't use the base offset here.
505bf215546Sopenharmony_ci       * One reason is that the field is only 9 bits which means we can only
506bf215546Sopenharmony_ci       * use it to access the first 16 GRFs.  Also, from the Haswell PRM
507bf215546Sopenharmony_ci       * section "Register Region Restrictions":
508bf215546Sopenharmony_ci       *
509bf215546Sopenharmony_ci       *    "The lower bits of the AddressImmediate must not overflow to
510bf215546Sopenharmony_ci       *    change the register address.  The lower 5 bits of Address
511bf215546Sopenharmony_ci       *    Immediate when added to lower 5 bits of address register gives
512bf215546Sopenharmony_ci       *    the sub-register offset. The upper bits of Address Immediate
513bf215546Sopenharmony_ci       *    when added to upper bits of address register gives the register
514bf215546Sopenharmony_ci       *    address. Any overflow from sub-register offset is dropped."
515bf215546Sopenharmony_ci       *
516bf215546Sopenharmony_ci       * Since the indirect may cause us to cross a register boundary, this
517bf215546Sopenharmony_ci       * makes the base offset almost useless.  We could try and do something
518bf215546Sopenharmony_ci       * clever where we use a actual base offset if base_offset % 32 == 0 but
519bf215546Sopenharmony_ci       * that would mean we were generating different code depending on the
520bf215546Sopenharmony_ci       * base offset.  Instead, for the sake of consistency, we'll just do the
521bf215546Sopenharmony_ci       * add ourselves.  This restriction is only listed in the Haswell PRM
522bf215546Sopenharmony_ci       * but empirical testing indicates that it applies on all older
523bf215546Sopenharmony_ci       * generations and is lifted on Broadwell.
524bf215546Sopenharmony_ci       *
525bf215546Sopenharmony_ci       * In the end, while base_offset is nice to look at in the generated
526bf215546Sopenharmony_ci       * code, using it saves us 0 instructions and would require quite a bit
527bf215546Sopenharmony_ci       * of case-by-case work.  It's just not worth it.
528bf215546Sopenharmony_ci       *
529bf215546Sopenharmony_ci       * Due to a hardware bug some platforms (particularly Gfx11+) seem to
530bf215546Sopenharmony_ci       * require the address components of all channels to be valid whether or
531bf215546Sopenharmony_ci       * not they're active, which causes issues if we use VxH addressing
532bf215546Sopenharmony_ci       * under non-uniform control-flow.  We can easily work around that by
533bf215546Sopenharmony_ci       * initializing the whole address register with a pipelined NoMask MOV
534bf215546Sopenharmony_ci       * instruction.
535bf215546Sopenharmony_ci       */
536bf215546Sopenharmony_ci      if (devinfo->ver >= 7) {
537bf215546Sopenharmony_ci         insn = brw_MOV(p, addr, brw_imm_uw(imm_byte_offset));
538bf215546Sopenharmony_ci         brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
539bf215546Sopenharmony_ci         brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
540bf215546Sopenharmony_ci         if (devinfo->ver >= 12)
541bf215546Sopenharmony_ci            brw_set_default_swsb(p, tgl_swsb_null());
542bf215546Sopenharmony_ci         else
543bf215546Sopenharmony_ci            brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl);
544bf215546Sopenharmony_ci      }
545bf215546Sopenharmony_ci
546bf215546Sopenharmony_ci      insn = brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
547bf215546Sopenharmony_ci      if (devinfo->ver >= 12)
548bf215546Sopenharmony_ci         brw_set_default_swsb(p, tgl_swsb_regdist(1));
549bf215546Sopenharmony_ci      else if (devinfo->ver >= 7)
550bf215546Sopenharmony_ci         brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl);
551bf215546Sopenharmony_ci
552bf215546Sopenharmony_ci      if (type_sz(reg.type) > 4 &&
553bf215546Sopenharmony_ci          ((devinfo->verx10 == 70) ||
554bf215546Sopenharmony_ci           devinfo->platform == INTEL_PLATFORM_CHV || intel_device_info_is_9lp(devinfo) ||
555bf215546Sopenharmony_ci           !devinfo->has_64bit_float || devinfo->verx10 >= 125)) {
556bf215546Sopenharmony_ci         /* IVB has an issue (which we found empirically) where it reads two
557bf215546Sopenharmony_ci          * address register components per channel for indirectly addressed
558bf215546Sopenharmony_ci          * 64-bit sources.
559bf215546Sopenharmony_ci          *
560bf215546Sopenharmony_ci          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
561bf215546Sopenharmony_ci          *
562bf215546Sopenharmony_ci          *    "When source or destination datatype is 64b or operation is
563bf215546Sopenharmony_ci          *    integer DWord multiply, indirect addressing must not be used."
564bf215546Sopenharmony_ci          *
565bf215546Sopenharmony_ci          * To work around both of these, we do two integer MOVs insead of one
566bf215546Sopenharmony_ci          * 64-bit MOV.  Because no double value should ever cross a register
567bf215546Sopenharmony_ci          * boundary, it's safe to use the immediate offset in the indirect
568bf215546Sopenharmony_ci          * here to handle adding 4 bytes to the offset and avoid the extra
569bf215546Sopenharmony_ci          * ADD to the register file.
570bf215546Sopenharmony_ci          */
571bf215546Sopenharmony_ci         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
572bf215546Sopenharmony_ci                    retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
573bf215546Sopenharmony_ci         brw_set_default_swsb(p, tgl_swsb_null());
574bf215546Sopenharmony_ci         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
575bf215546Sopenharmony_ci                    retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
576bf215546Sopenharmony_ci      } else {
577bf215546Sopenharmony_ci         struct brw_reg ind_src = brw_VxH_indirect(0, 0);
578bf215546Sopenharmony_ci
579bf215546Sopenharmony_ci         brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));
580bf215546Sopenharmony_ci
581bf215546Sopenharmony_ci         if (devinfo->ver == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
582bf215546Sopenharmony_ci             !inst->get_next()->is_tail_sentinel() &&
583bf215546Sopenharmony_ci             ((fs_inst *)inst->get_next())->mlen > 0) {
584bf215546Sopenharmony_ci            /* From the Sandybridge PRM:
585bf215546Sopenharmony_ci             *
586bf215546Sopenharmony_ci             *    "[Errata: DevSNB(SNB)] If MRF register is updated by any
587bf215546Sopenharmony_ci             *    instruction that “indexed/indirect” source AND is followed
588bf215546Sopenharmony_ci             *    by a send, the instruction requires a “Switch”. This is to
589bf215546Sopenharmony_ci             *    avoid race condition where send may dispatch before MRF is
590bf215546Sopenharmony_ci             *    updated."
591bf215546Sopenharmony_ci             */
592bf215546Sopenharmony_ci            brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
593bf215546Sopenharmony_ci         }
594bf215546Sopenharmony_ci      }
595bf215546Sopenharmony_ci   }
596bf215546Sopenharmony_ci}
597bf215546Sopenharmony_ci
/**
 * Emit code for a shuffle: every destination channel receives the component
 * of \p src selected by the corresponding channel of \p idx.
 *
 * The instruction is emitted in groups of lower_width channels.  For each
 * group one of two strategies is used:
 *
 *  - If \p src is already uniform (zero vertical and horizontal stride) or
 *    \p idx is an immediate, a single MOV from a scalar region is enough.
 *  - Otherwise the per-channel byte address is computed into the address
 *    register (a0) and the data is fetched with a VxH-indirect MOV.
 *
 * \param inst  IR instruction being lowered; supplies exec_size and predicate.
 * \param dst   Destination register.
 * \param src   Source register; must live in the GRF and carry no source
 *              modifiers.
 * \param idx   Per-channel index, either an immediate or a register of at
 *              most 4 bytes per component.
 */
void
fs_generator::generate_shuffle(fs_inst *inst,
                               struct brw_reg dst,
                               struct brw_reg src,
                               struct brw_reg idx)
{
   assert(src.file == BRW_GENERAL_REGISTER_FILE);
   assert(!src.abs && !src.negate);

   /* Ivy bridge has some strange behavior that makes this a real pain to
    * implement for 64-bit values so we just don't bother.
    */
   assert((devinfo->verx10 >= 75 && devinfo->has_64bit_float) ||
          type_sz(src.type) <= 4);

   /* Because we're using the address register, we're limited to 8-wide
    * execution on gfx7.  On gfx8, we're limited to 16-wide by the address
    * register file and 8-wide for 64-bit types.  We could try and make this
    * instruction splittable higher up in the compiler but that gets weird
    * because it reads all of the channels regardless of execution size.  It's
    * easier just to split it here.
    */
   const unsigned lower_width =
      devinfo->ver <= 7 || element_sz(src) > 4 || element_sz(dst) > 4 ? 8 :
      MIN2(16, inst->exec_size);

   brw_set_default_exec_size(p, cvt(lower_width) - 1);
   for (unsigned group = 0; group < inst->exec_size; group += lower_width) {
      /* Select the channel group that the instructions emitted in this
       * iteration apply to.
       */
      brw_set_default_group(p, group);

      if ((src.vstride == 0 && src.hstride == 0) ||
          idx.file == BRW_IMMEDIATE_VALUE) {
         /* Trivial, the source is already uniform or the index is a constant.
          * We will typically not get here if the optimizer is doing its job,
          * but asserting would be mean.
          */
         const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
         /* Read component i of the source through a scalar <0;1,0> region. */
         struct brw_reg group_src = stride(suboffset(src, i), 0, 1, 0);
         /* hstride - 1 is used as the log2 of the destination element stride
          * when converting the group's first channel into an element offset.
          */
         struct brw_reg group_dst = suboffset(dst, group << (dst.hstride - 1));
         brw_MOV(p, group_dst, group_src);
      } else {
         /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
         struct brw_reg addr = vec8(brw_address_reg(0));

         /* Indices belonging to the channels of this group. */
         struct brw_reg group_idx = suboffset(idx, group);

         if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) {
            /* Things get grumpy if the register is too wide. */
            group_idx.width--;
            group_idx.vstride--;
         }

         assert(type_sz(group_idx.type) <= 4);
         if (type_sz(group_idx.type) == 4) {
            /* The destination stride of an instruction (in bytes) must be
             * greater than or equal to the size of the rest of the
             * instruction.  Since the address register is of type UW, we
             * can't use a D-type instruction.  In order to get around this,
             * we retype the index to a word type (W) and use a stride of 2.
             */
            group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W);
         }

         /* Byte offset of the start of the source region within the GRF. */
         uint32_t src_start_offset = src.nr * REG_SIZE + src.subnr;

         /* From the Haswell PRM:
          *
          *    "When a sequence of NoDDChk and NoDDClr are used, the last
          *    instruction that completes the scoreboard clear must have a
          *    non-zero execution mask. This means, if any kind of predication
          *    can change the execution mask or channel enable of the last
          *    instruction, the optimization must be avoided.  This is to
          *    avoid instructions being shot down the pipeline when no writes
          *    are required."
          *
          * Whenever predication is enabled or the instructions being emitted
          * aren't the full width, it's possible that it will be run with zero
          * channels enabled so we can't use dependency control without
          * running the risk of a hang if an instruction gets shot down.
          */
         const bool use_dep_ctrl = !inst->predicate &&
                                   lower_width == dispatch_width;
         brw_inst *insn;

         /* Due to a hardware bug some platforms (particularly Gfx11+) seem
          * to require the address components of all channels to be valid
          * whether or not they're active, which causes issues if we use VxH
          * addressing under non-uniform control-flow.  We can easily work
          * around that by initializing the whole address register with a
          * pipelined NoMask MOV instruction.
          */
         insn = brw_MOV(p, addr, brw_imm_uw(src_start_offset));
         brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
         brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
         if (devinfo->ver >= 12)
            brw_set_default_swsb(p, tgl_swsb_null());
         else
            brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         insn = brw_SHL(p, addr, group_idx,
                        brw_imm_uw(util_logbase2(type_sz(src.type)) +
                                   src.hstride - 1));
         if (devinfo->ver >= 12)
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
         else
            brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl);

         /* Add on the register start offset */
         brw_ADD(p, addr, addr, brw_imm_uw(src_start_offset));
         /* Fetch the selected components through the address register. */
         brw_MOV(p, suboffset(dst, group << (dst.hstride - 1)),
                 retype(brw_VxH_indirect(0, 0), src.type));
      }

      /* Reset the default SWSB annotation before emitting the next group. */
      brw_set_default_swsb(p, tgl_swsb_null());
   }
}
716bf215546Sopenharmony_ci
717bf215546Sopenharmony_civoid
718bf215546Sopenharmony_cifs_generator::generate_quad_swizzle(const fs_inst *inst,
719bf215546Sopenharmony_ci                                    struct brw_reg dst, struct brw_reg src,
720bf215546Sopenharmony_ci                                    unsigned swiz)
721bf215546Sopenharmony_ci{
722bf215546Sopenharmony_ci   /* Requires a quad. */
723bf215546Sopenharmony_ci   assert(inst->exec_size >= 4);
724bf215546Sopenharmony_ci
725bf215546Sopenharmony_ci   if (src.file == BRW_IMMEDIATE_VALUE ||
726bf215546Sopenharmony_ci       has_scalar_region(src)) {
727bf215546Sopenharmony_ci      /* The value is uniform across all channels */
728bf215546Sopenharmony_ci      brw_MOV(p, dst, src);
729bf215546Sopenharmony_ci
730bf215546Sopenharmony_ci   } else if (devinfo->ver < 11 && type_sz(src.type) == 4) {
731bf215546Sopenharmony_ci      /* This only works on 8-wide 32-bit values */
732bf215546Sopenharmony_ci      assert(inst->exec_size == 8);
733bf215546Sopenharmony_ci      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
734bf215546Sopenharmony_ci      assert(src.vstride == src.width + 1);
735bf215546Sopenharmony_ci      brw_set_default_access_mode(p, BRW_ALIGN_16);
736bf215546Sopenharmony_ci      struct brw_reg swiz_src = stride(src, 4, 4, 1);
737bf215546Sopenharmony_ci      swiz_src.swizzle = swiz;
738bf215546Sopenharmony_ci      brw_MOV(p, dst, swiz_src);
739bf215546Sopenharmony_ci
740bf215546Sopenharmony_ci   } else {
741bf215546Sopenharmony_ci      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
742bf215546Sopenharmony_ci      assert(src.vstride == src.width + 1);
743bf215546Sopenharmony_ci      const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0));
744bf215546Sopenharmony_ci
745bf215546Sopenharmony_ci      switch (swiz) {
746bf215546Sopenharmony_ci      case BRW_SWIZZLE_XXXX:
747bf215546Sopenharmony_ci      case BRW_SWIZZLE_YYYY:
748bf215546Sopenharmony_ci      case BRW_SWIZZLE_ZZZZ:
749bf215546Sopenharmony_ci      case BRW_SWIZZLE_WWWW:
750bf215546Sopenharmony_ci         brw_MOV(p, dst, stride(src_0, 4, 4, 0));
751bf215546Sopenharmony_ci         break;
752bf215546Sopenharmony_ci
753bf215546Sopenharmony_ci      case BRW_SWIZZLE_XXZZ:
754bf215546Sopenharmony_ci      case BRW_SWIZZLE_YYWW:
755bf215546Sopenharmony_ci         brw_MOV(p, dst, stride(src_0, 2, 2, 0));
756bf215546Sopenharmony_ci         break;
757bf215546Sopenharmony_ci
758bf215546Sopenharmony_ci      case BRW_SWIZZLE_XYXY:
759bf215546Sopenharmony_ci      case BRW_SWIZZLE_ZWZW:
760bf215546Sopenharmony_ci         assert(inst->exec_size == 4);
761bf215546Sopenharmony_ci         brw_MOV(p, dst, stride(src_0, 0, 2, 1));
762bf215546Sopenharmony_ci         break;
763bf215546Sopenharmony_ci
764bf215546Sopenharmony_ci      default:
765bf215546Sopenharmony_ci         assert(inst->force_writemask_all);
766bf215546Sopenharmony_ci         brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1);
767bf215546Sopenharmony_ci
768bf215546Sopenharmony_ci         for (unsigned c = 0; c < 4; c++) {
769bf215546Sopenharmony_ci            brw_inst *insn = brw_MOV(
770bf215546Sopenharmony_ci               p, stride(suboffset(dst, c),
771bf215546Sopenharmony_ci                         4 * inst->dst.stride, 1, 4 * inst->dst.stride),
772bf215546Sopenharmony_ci               stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0));
773bf215546Sopenharmony_ci
774bf215546Sopenharmony_ci            if (devinfo->ver < 12) {
775bf215546Sopenharmony_ci               brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
776bf215546Sopenharmony_ci               brw_inst_set_no_dd_check(devinfo, insn, c > 0);
777bf215546Sopenharmony_ci            }
778bf215546Sopenharmony_ci
779bf215546Sopenharmony_ci            brw_set_default_swsb(p, tgl_swsb_null());
780bf215546Sopenharmony_ci         }
781bf215546Sopenharmony_ci
782bf215546Sopenharmony_ci         break;
783bf215546Sopenharmony_ci      }
784bf215546Sopenharmony_ci   }
785bf215546Sopenharmony_ci}
786bf215546Sopenharmony_ci
787bf215546Sopenharmony_civoid
788bf215546Sopenharmony_cifs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
789bf215546Sopenharmony_ci{
790bf215546Sopenharmony_ci   struct brw_inst *insn;
791bf215546Sopenharmony_ci
792bf215546Sopenharmony_ci   insn = brw_next_insn(p, BRW_OPCODE_SEND);
793bf215546Sopenharmony_ci
794bf215546Sopenharmony_ci   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
795bf215546Sopenharmony_ci   brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW));
796bf215546Sopenharmony_ci   if (devinfo->ver < 12)
797bf215546Sopenharmony_ci      brw_set_src1(p, insn, brw_imm_ud(0u));
798bf215546Sopenharmony_ci
799bf215546Sopenharmony_ci   /* For XeHP and newer send a message to the message gateway to terminate a
800bf215546Sopenharmony_ci    * compute shader. For older devices, a message is sent to the thread
801bf215546Sopenharmony_ci    * spawner.
802bf215546Sopenharmony_ci    */
803bf215546Sopenharmony_ci   if (devinfo->verx10 >= 125)
804bf215546Sopenharmony_ci      brw_inst_set_sfid(devinfo, insn, BRW_SFID_MESSAGE_GATEWAY);
805bf215546Sopenharmony_ci   else
806bf215546Sopenharmony_ci      brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
807bf215546Sopenharmony_ci   brw_inst_set_mlen(devinfo, insn, 1);
808bf215546Sopenharmony_ci   brw_inst_set_rlen(devinfo, insn, 0);
809bf215546Sopenharmony_ci   brw_inst_set_eot(devinfo, insn, inst->eot);
810bf215546Sopenharmony_ci   brw_inst_set_header_present(devinfo, insn, false);
811bf215546Sopenharmony_ci
812bf215546Sopenharmony_ci   brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
813bf215546Sopenharmony_ci
814bf215546Sopenharmony_ci   if (devinfo->ver < 11) {
815bf215546Sopenharmony_ci      brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */
816bf215546Sopenharmony_ci
817bf215546Sopenharmony_ci      /* Note that even though the thread has a URB resource associated with it,
818bf215546Sopenharmony_ci       * we set the "do not dereference URB" bit, because the URB resource is
819bf215546Sopenharmony_ci       * managed by the fixed-function unit, so it will free it automatically.
820bf215546Sopenharmony_ci       */
821bf215546Sopenharmony_ci      brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */
822bf215546Sopenharmony_ci   }
823bf215546Sopenharmony_ci
824bf215546Sopenharmony_ci   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
825bf215546Sopenharmony_ci}
826bf215546Sopenharmony_ci
827bf215546Sopenharmony_civoid
828bf215546Sopenharmony_cifs_generator::generate_barrier(fs_inst *, struct brw_reg src)
829bf215546Sopenharmony_ci{
830bf215546Sopenharmony_ci   brw_barrier(p, src);
831bf215546Sopenharmony_ci   if (devinfo->ver >= 12) {
832bf215546Sopenharmony_ci      brw_set_default_swsb(p, tgl_swsb_null());
833bf215546Sopenharmony_ci      brw_SYNC(p, TGL_SYNC_BAR);
834bf215546Sopenharmony_ci   } else {
835bf215546Sopenharmony_ci      brw_WAIT(p);
836bf215546Sopenharmony_ci   }
837bf215546Sopenharmony_ci}
838bf215546Sopenharmony_ci
839bf215546Sopenharmony_cibool
840bf215546Sopenharmony_cifs_generator::generate_linterp(fs_inst *inst,
841bf215546Sopenharmony_ci                               struct brw_reg dst, struct brw_reg *src)
842bf215546Sopenharmony_ci{
843bf215546Sopenharmony_ci   /* PLN reads:
844bf215546Sopenharmony_ci    *                      /   in SIMD16   \
845bf215546Sopenharmony_ci    *    -----------------------------------
846bf215546Sopenharmony_ci    *   | src1+0 | src1+1 | src1+2 | src1+3 |
847bf215546Sopenharmony_ci    *   |-----------------------------------|
848bf215546Sopenharmony_ci    *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
849bf215546Sopenharmony_ci    *    -----------------------------------
850bf215546Sopenharmony_ci    *
851bf215546Sopenharmony_ci    * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
852bf215546Sopenharmony_ci    *
853bf215546Sopenharmony_ci    *    -----------------------------------
854bf215546Sopenharmony_ci    *   | src1+0 | src1+1 | src1+2 | src1+3 |
855bf215546Sopenharmony_ci    *   |-----------------------------------|
856bf215546Sopenharmony_ci    *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
857bf215546Sopenharmony_ci    *   |-----------------------------------|
858bf215546Sopenharmony_ci    *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
859bf215546Sopenharmony_ci    *    -----------------------------------
860bf215546Sopenharmony_ci    *
861bf215546Sopenharmony_ci    * See also: emit_interpolation_setup_gfx4().
862bf215546Sopenharmony_ci    */
863bf215546Sopenharmony_ci   struct brw_reg delta_x = src[0];
864bf215546Sopenharmony_ci   struct brw_reg delta_y = offset(src[0], inst->exec_size / 8);
865bf215546Sopenharmony_ci   struct brw_reg interp = src[1];
866bf215546Sopenharmony_ci   brw_inst *i[2];
867bf215546Sopenharmony_ci
868bf215546Sopenharmony_ci   /* nir_lower_interpolation() will do the lowering to MAD instructions for
869bf215546Sopenharmony_ci    * us on gfx11+
870bf215546Sopenharmony_ci    */
871bf215546Sopenharmony_ci   assert(devinfo->ver < 11);
872bf215546Sopenharmony_ci
873bf215546Sopenharmony_ci   if (devinfo->has_pln) {
874bf215546Sopenharmony_ci      if (devinfo->ver <= 6 && (delta_x.nr & 1) != 0) {
875bf215546Sopenharmony_ci         /* From the Sandy Bridge PRM Vol. 4, Pt. 2, Section 8.3.53, "Plane":
876bf215546Sopenharmony_ci          *
877bf215546Sopenharmony_ci          *    "[DevSNB]:<src1> must be even register aligned.
878bf215546Sopenharmony_ci          *
879bf215546Sopenharmony_ci          * This restriction is lifted on Ivy Bridge.
880bf215546Sopenharmony_ci          *
881bf215546Sopenharmony_ci          * This means that we need to split PLN into LINE+MAC on-the-fly.
882bf215546Sopenharmony_ci          * Unfortunately, the inputs are laid out for PLN and not LINE+MAC so
883bf215546Sopenharmony_ci          * we have to split into SIMD8 pieces.  For gfx4 (!has_pln), the
884bf215546Sopenharmony_ci          * coordinate registers are laid out differently so we leave it as a
885bf215546Sopenharmony_ci          * SIMD16 instruction.
886bf215546Sopenharmony_ci          */
887bf215546Sopenharmony_ci         assert(inst->exec_size == 8 || inst->exec_size == 16);
888bf215546Sopenharmony_ci         assert(inst->group % 16 == 0);
889bf215546Sopenharmony_ci
890bf215546Sopenharmony_ci         brw_push_insn_state(p);
891bf215546Sopenharmony_ci         brw_set_default_exec_size(p, BRW_EXECUTE_8);
892bf215546Sopenharmony_ci
893bf215546Sopenharmony_ci         /* Thanks to two accumulators, we can emit all the LINEs and then all
894bf215546Sopenharmony_ci          * the MACs.  This improves parallelism a bit.
895bf215546Sopenharmony_ci          */
896bf215546Sopenharmony_ci         for (unsigned g = 0; g < inst->exec_size / 8; g++) {
897bf215546Sopenharmony_ci            brw_inst *line = brw_LINE(p, brw_null_reg(), interp,
898bf215546Sopenharmony_ci                                      offset(delta_x, g * 2));
899bf215546Sopenharmony_ci            brw_inst_set_group(devinfo, line, inst->group + g * 8);
900bf215546Sopenharmony_ci
901bf215546Sopenharmony_ci            /* LINE writes the accumulator automatically on gfx4-5.  On Sandy
902bf215546Sopenharmony_ci             * Bridge and later, we have to explicitly enable it.
903bf215546Sopenharmony_ci             */
904bf215546Sopenharmony_ci            if (devinfo->ver >= 6)
905bf215546Sopenharmony_ci               brw_inst_set_acc_wr_control(p->devinfo, line, true);
906bf215546Sopenharmony_ci
907bf215546Sopenharmony_ci            /* brw_set_default_saturate() is called before emitting
908bf215546Sopenharmony_ci             * instructions, so the saturate bit is set in each instruction,
909bf215546Sopenharmony_ci             * so we need to unset it on the LINE instructions.
910bf215546Sopenharmony_ci             */
911bf215546Sopenharmony_ci            brw_inst_set_saturate(p->devinfo, line, false);
912bf215546Sopenharmony_ci         }
913bf215546Sopenharmony_ci
914bf215546Sopenharmony_ci         for (unsigned g = 0; g < inst->exec_size / 8; g++) {
915bf215546Sopenharmony_ci            brw_inst *mac = brw_MAC(p, offset(dst, g), suboffset(interp, 1),
916bf215546Sopenharmony_ci                                    offset(delta_x, g * 2 + 1));
917bf215546Sopenharmony_ci            brw_inst_set_group(devinfo, mac, inst->group + g * 8);
918bf215546Sopenharmony_ci            brw_inst_set_cond_modifier(p->devinfo, mac, inst->conditional_mod);
919bf215546Sopenharmony_ci         }
920bf215546Sopenharmony_ci
921bf215546Sopenharmony_ci         brw_pop_insn_state(p);
922bf215546Sopenharmony_ci
923bf215546Sopenharmony_ci         return true;
924bf215546Sopenharmony_ci      } else {
925bf215546Sopenharmony_ci         brw_PLN(p, dst, interp, delta_x);
926bf215546Sopenharmony_ci
927bf215546Sopenharmony_ci         return false;
928bf215546Sopenharmony_ci      }
929bf215546Sopenharmony_ci   } else {
930bf215546Sopenharmony_ci      i[0] = brw_LINE(p, brw_null_reg(), interp, delta_x);
931bf215546Sopenharmony_ci      i[1] = brw_MAC(p, dst, suboffset(interp, 1), delta_y);
932bf215546Sopenharmony_ci
933bf215546Sopenharmony_ci      brw_inst_set_cond_modifier(p->devinfo, i[1], inst->conditional_mod);
934bf215546Sopenharmony_ci
935bf215546Sopenharmony_ci      /* brw_set_default_saturate() is called before emitting instructions, so
936bf215546Sopenharmony_ci       * the saturate bit is set in each instruction, so we need to unset it on
937bf215546Sopenharmony_ci       * the first instruction.
938bf215546Sopenharmony_ci       */
939bf215546Sopenharmony_ci      brw_inst_set_saturate(p->devinfo, i[0], false);
940bf215546Sopenharmony_ci
941bf215546Sopenharmony_ci      return true;
942bf215546Sopenharmony_ci   }
943bf215546Sopenharmony_ci}
944bf215546Sopenharmony_ci
945bf215546Sopenharmony_civoid
946bf215546Sopenharmony_cifs_generator::generate_get_buffer_size(fs_inst *inst,
947bf215546Sopenharmony_ci                                       struct brw_reg dst,
948bf215546Sopenharmony_ci                                       struct brw_reg src,
949bf215546Sopenharmony_ci                                       struct brw_reg surf_index)
950bf215546Sopenharmony_ci{
951bf215546Sopenharmony_ci   assert(devinfo->ver >= 7);
952bf215546Sopenharmony_ci   assert(surf_index.file == BRW_IMMEDIATE_VALUE);
953bf215546Sopenharmony_ci
954bf215546Sopenharmony_ci   uint32_t simd_mode;
955bf215546Sopenharmony_ci   int rlen = 4;
956bf215546Sopenharmony_ci
957bf215546Sopenharmony_ci   switch (inst->exec_size) {
958bf215546Sopenharmony_ci   case 8:
959bf215546Sopenharmony_ci      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
960bf215546Sopenharmony_ci      break;
961bf215546Sopenharmony_ci   case 16:
962bf215546Sopenharmony_ci      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
963bf215546Sopenharmony_ci      break;
964bf215546Sopenharmony_ci   default:
965bf215546Sopenharmony_ci      unreachable("Invalid width for texture instruction");
966bf215546Sopenharmony_ci   }
967bf215546Sopenharmony_ci
968bf215546Sopenharmony_ci   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
969bf215546Sopenharmony_ci      rlen = 8;
970bf215546Sopenharmony_ci      dst = vec16(dst);
971bf215546Sopenharmony_ci   }
972bf215546Sopenharmony_ci
973bf215546Sopenharmony_ci   uint32_t return_format =
974bf215546Sopenharmony_ci      devinfo->ver >= 8 ? GFX8_SAMPLER_RETURN_FORMAT_32BITS :
975bf215546Sopenharmony_ci                          BRW_SAMPLER_RETURN_FORMAT_SINT32;
976bf215546Sopenharmony_ci   brw_SAMPLE(p,
977bf215546Sopenharmony_ci              retype(dst, BRW_REGISTER_TYPE_UW),
978bf215546Sopenharmony_ci              inst->base_mrf,
979bf215546Sopenharmony_ci              src,
980bf215546Sopenharmony_ci              surf_index.ud,
981bf215546Sopenharmony_ci              0,
982bf215546Sopenharmony_ci              GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
983bf215546Sopenharmony_ci              rlen, /* response length */
984bf215546Sopenharmony_ci              inst->mlen,
985bf215546Sopenharmony_ci              inst->header_size > 0,
986bf215546Sopenharmony_ci              simd_mode,
987bf215546Sopenharmony_ci              return_format);
988bf215546Sopenharmony_ci}
989bf215546Sopenharmony_ci
990bf215546Sopenharmony_civoid
991bf215546Sopenharmony_cifs_generator::generate_tex(fs_inst *inst, struct brw_reg dst,
992bf215546Sopenharmony_ci                           struct brw_reg surface_index,
993bf215546Sopenharmony_ci                           struct brw_reg sampler_index)
994bf215546Sopenharmony_ci{
995bf215546Sopenharmony_ci   assert(devinfo->ver < 7);
996bf215546Sopenharmony_ci   assert(inst->size_written % REG_SIZE == 0);
997bf215546Sopenharmony_ci   int msg_type = -1;
998bf215546Sopenharmony_ci   uint32_t simd_mode;
999bf215546Sopenharmony_ci   uint32_t return_format;
1000bf215546Sopenharmony_ci
1001bf215546Sopenharmony_ci   /* Sampler EOT message of less than the dispatch width would kill the
1002bf215546Sopenharmony_ci    * thread prematurely.
1003bf215546Sopenharmony_ci    */
1004bf215546Sopenharmony_ci   assert(!inst->eot || inst->exec_size == dispatch_width);
1005bf215546Sopenharmony_ci
1006bf215546Sopenharmony_ci   switch (dst.type) {
1007bf215546Sopenharmony_ci   case BRW_REGISTER_TYPE_D:
1008bf215546Sopenharmony_ci      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
1009bf215546Sopenharmony_ci      break;
1010bf215546Sopenharmony_ci   case BRW_REGISTER_TYPE_UD:
1011bf215546Sopenharmony_ci      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
1012bf215546Sopenharmony_ci      break;
1013bf215546Sopenharmony_ci   default:
1014bf215546Sopenharmony_ci      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
1015bf215546Sopenharmony_ci      break;
1016bf215546Sopenharmony_ci   }
1017bf215546Sopenharmony_ci
1018bf215546Sopenharmony_ci   /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
1019bf215546Sopenharmony_ci    * is set as part of the message descriptor.  On gfx4, the PRM seems to
1020bf215546Sopenharmony_ci    * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
1021bf215546Sopenharmony_ci    * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
1022bf215546Sopenharmony_ci    * gone from the message descriptor entirely and you just get UINT32 all
1023bf215546Sopenharmony_ci    * the time regasrdless.  Since we can really only do non-UINT32 on gfx4,
1024bf215546Sopenharmony_ci    * just stomp it to UINT32 all the time.
1025bf215546Sopenharmony_ci    */
1026bf215546Sopenharmony_ci   if (inst->opcode == SHADER_OPCODE_TXS)
1027bf215546Sopenharmony_ci      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
1028bf215546Sopenharmony_ci
1029bf215546Sopenharmony_ci   switch (inst->exec_size) {
1030bf215546Sopenharmony_ci   case 8:
1031bf215546Sopenharmony_ci      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1032bf215546Sopenharmony_ci      break;
1033bf215546Sopenharmony_ci   case 16:
1034bf215546Sopenharmony_ci      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1035bf215546Sopenharmony_ci      break;
1036bf215546Sopenharmony_ci   default:
1037bf215546Sopenharmony_ci      unreachable("Invalid width for texture instruction");
1038bf215546Sopenharmony_ci   }
1039bf215546Sopenharmony_ci
1040bf215546Sopenharmony_ci   if (devinfo->ver >= 5) {
1041bf215546Sopenharmony_ci      switch (inst->opcode) {
1042bf215546Sopenharmony_ci      case SHADER_OPCODE_TEX:
1043bf215546Sopenharmony_ci	 if (inst->shadow_compare) {
1044bf215546Sopenharmony_ci	    msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
1045bf215546Sopenharmony_ci	 } else {
1046bf215546Sopenharmony_ci	    msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE;
1047bf215546Sopenharmony_ci	 }
1048bf215546Sopenharmony_ci	 break;
1049bf215546Sopenharmony_ci      case FS_OPCODE_TXB:
1050bf215546Sopenharmony_ci	 if (inst->shadow_compare) {
1051bf215546Sopenharmony_ci	    msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
1052bf215546Sopenharmony_ci	 } else {
1053bf215546Sopenharmony_ci	    msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
1054bf215546Sopenharmony_ci	 }
1055bf215546Sopenharmony_ci	 break;
1056bf215546Sopenharmony_ci      case SHADER_OPCODE_TXL:
1057bf215546Sopenharmony_ci	 if (inst->shadow_compare) {
1058bf215546Sopenharmony_ci	    msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
1059bf215546Sopenharmony_ci	 } else {
1060bf215546Sopenharmony_ci	    msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
1061bf215546Sopenharmony_ci	 }
1062bf215546Sopenharmony_ci	 break;
1063bf215546Sopenharmony_ci      case SHADER_OPCODE_TXS:
1064bf215546Sopenharmony_ci	 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
1065bf215546Sopenharmony_ci	 break;
1066bf215546Sopenharmony_ci      case SHADER_OPCODE_TXD:
1067bf215546Sopenharmony_ci         assert(!inst->shadow_compare);
1068bf215546Sopenharmony_ci         msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
1069bf215546Sopenharmony_ci	 break;
1070bf215546Sopenharmony_ci      case SHADER_OPCODE_TXF:
1071bf215546Sopenharmony_ci	 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
1072bf215546Sopenharmony_ci	 break;
1073bf215546Sopenharmony_ci      case SHADER_OPCODE_TXF_CMS:
1074bf215546Sopenharmony_ci         msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
1075bf215546Sopenharmony_ci         break;
1076bf215546Sopenharmony_ci      case SHADER_OPCODE_LOD:
1077bf215546Sopenharmony_ci         msg_type = GFX5_SAMPLER_MESSAGE_LOD;
1078bf215546Sopenharmony_ci         break;
1079bf215546Sopenharmony_ci      case SHADER_OPCODE_TG4:
1080bf215546Sopenharmony_ci         assert(devinfo->ver == 6);
1081bf215546Sopenharmony_ci         assert(!inst->shadow_compare);
1082bf215546Sopenharmony_ci         msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
1083bf215546Sopenharmony_ci         break;
1084bf215546Sopenharmony_ci      case SHADER_OPCODE_SAMPLEINFO:
1085bf215546Sopenharmony_ci         msg_type = GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
1086bf215546Sopenharmony_ci         break;
1087bf215546Sopenharmony_ci      default:
1088bf215546Sopenharmony_ci	 unreachable("not reached");
1089bf215546Sopenharmony_ci      }
1090bf215546Sopenharmony_ci   } else {
1091bf215546Sopenharmony_ci      switch (inst->opcode) {
1092bf215546Sopenharmony_ci      case SHADER_OPCODE_TEX:
1093bf215546Sopenharmony_ci	 /* Note that G45 and older determines shadow compare and dispatch width
1094bf215546Sopenharmony_ci	  * from message length for most messages.
1095bf215546Sopenharmony_ci	  */
1096bf215546Sopenharmony_ci         if (inst->exec_size == 8) {
1097bf215546Sopenharmony_ci            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
1098bf215546Sopenharmony_ci            if (inst->shadow_compare) {
1099bf215546Sopenharmony_ci               assert(inst->mlen == 6);
1100bf215546Sopenharmony_ci            } else {
1101bf215546Sopenharmony_ci               assert(inst->mlen <= 4);
1102bf215546Sopenharmony_ci            }
1103bf215546Sopenharmony_ci         } else {
1104bf215546Sopenharmony_ci            if (inst->shadow_compare) {
1105bf215546Sopenharmony_ci               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1106bf215546Sopenharmony_ci               assert(inst->mlen == 9);
1107bf215546Sopenharmony_ci            } else {
1108bf215546Sopenharmony_ci               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1109bf215546Sopenharmony_ci               assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
1110bf215546Sopenharmony_ci            }
1111bf215546Sopenharmony_ci         }
1112bf215546Sopenharmony_ci	 break;
1113bf215546Sopenharmony_ci      case FS_OPCODE_TXB:
1114bf215546Sopenharmony_ci	 if (inst->shadow_compare) {
1115bf215546Sopenharmony_ci            assert(inst->exec_size == 8);
1116bf215546Sopenharmony_ci	    assert(inst->mlen == 6);
1117bf215546Sopenharmony_ci	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
1118bf215546Sopenharmony_ci	 } else {
1119bf215546Sopenharmony_ci	    assert(inst->mlen == 9);
1120bf215546Sopenharmony_ci	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1121bf215546Sopenharmony_ci	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1122bf215546Sopenharmony_ci	 }
1123bf215546Sopenharmony_ci	 break;
1124bf215546Sopenharmony_ci      case SHADER_OPCODE_TXL:
1125bf215546Sopenharmony_ci	 if (inst->shadow_compare) {
1126bf215546Sopenharmony_ci            assert(inst->exec_size == 8);
1127bf215546Sopenharmony_ci	    assert(inst->mlen == 6);
1128bf215546Sopenharmony_ci	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
1129bf215546Sopenharmony_ci	 } else {
1130bf215546Sopenharmony_ci	    assert(inst->mlen == 9);
1131bf215546Sopenharmony_ci	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
1132bf215546Sopenharmony_ci	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1133bf215546Sopenharmony_ci	 }
1134bf215546Sopenharmony_ci	 break;
1135bf215546Sopenharmony_ci      case SHADER_OPCODE_TXD:
1136bf215546Sopenharmony_ci	 /* There is no sample_d_c message; comparisons are done manually */
1137bf215546Sopenharmony_ci         assert(inst->exec_size == 8);
1138bf215546Sopenharmony_ci	 assert(inst->mlen == 7 || inst->mlen == 10);
1139bf215546Sopenharmony_ci	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
1140bf215546Sopenharmony_ci	 break;
1141bf215546Sopenharmony_ci      case SHADER_OPCODE_TXF:
1142bf215546Sopenharmony_ci         assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
1143bf215546Sopenharmony_ci	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
1144bf215546Sopenharmony_ci	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1145bf215546Sopenharmony_ci	 break;
1146bf215546Sopenharmony_ci      case SHADER_OPCODE_TXS:
1147bf215546Sopenharmony_ci	 assert(inst->mlen == 3);
1148bf215546Sopenharmony_ci	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
1149bf215546Sopenharmony_ci	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1150bf215546Sopenharmony_ci	 break;
1151bf215546Sopenharmony_ci      default:
1152bf215546Sopenharmony_ci	 unreachable("not reached");
1153bf215546Sopenharmony_ci      }
1154bf215546Sopenharmony_ci   }
1155bf215546Sopenharmony_ci   assert(msg_type != -1);
1156bf215546Sopenharmony_ci
1157bf215546Sopenharmony_ci   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
1158bf215546Sopenharmony_ci      dst = vec16(dst);
1159bf215546Sopenharmony_ci   }
1160bf215546Sopenharmony_ci
1161bf215546Sopenharmony_ci   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
1162bf215546Sopenharmony_ci
1163bf215546Sopenharmony_ci   /* Load the message header if present.  If there's a texture offset,
1164bf215546Sopenharmony_ci    * we need to set it up explicitly and load the offset bitfield.
1165bf215546Sopenharmony_ci    * Otherwise, we can use an implied move from g0 to the first message reg.
1166bf215546Sopenharmony_ci    */
1167bf215546Sopenharmony_ci   struct brw_reg src = brw_null_reg();
1168bf215546Sopenharmony_ci   if (inst->header_size != 0) {
1169bf215546Sopenharmony_ci      if (devinfo->ver < 6 && !inst->offset) {
1170bf215546Sopenharmony_ci         /* Set up an implied move from g0 to the MRF. */
1171bf215546Sopenharmony_ci         src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
1172bf215546Sopenharmony_ci      } else {
1173bf215546Sopenharmony_ci         const tgl_swsb swsb = brw_get_default_swsb(p);
1174bf215546Sopenharmony_ci         assert(inst->base_mrf != -1);
1175bf215546Sopenharmony_ci         struct brw_reg header_reg = brw_message_reg(inst->base_mrf);
1176bf215546Sopenharmony_ci
1177bf215546Sopenharmony_ci         brw_push_insn_state(p);
1178bf215546Sopenharmony_ci         brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
1179bf215546Sopenharmony_ci         brw_set_default_exec_size(p, BRW_EXECUTE_8);
1180bf215546Sopenharmony_ci         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1181bf215546Sopenharmony_ci         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1182bf215546Sopenharmony_ci         /* Explicitly set up the message header by copying g0 to the MRF. */
1183bf215546Sopenharmony_ci         brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
1184bf215546Sopenharmony_ci         brw_set_default_swsb(p, tgl_swsb_regdist(1));
1185bf215546Sopenharmony_ci
1186bf215546Sopenharmony_ci         brw_set_default_exec_size(p, BRW_EXECUTE_1);
1187bf215546Sopenharmony_ci         if (inst->offset) {
1188bf215546Sopenharmony_ci            /* Set the offset bits in DWord 2. */
1189bf215546Sopenharmony_ci            brw_MOV(p, get_element_ud(header_reg, 2),
1190bf215546Sopenharmony_ci                       brw_imm_ud(inst->offset));
1191bf215546Sopenharmony_ci         }
1192bf215546Sopenharmony_ci
1193bf215546Sopenharmony_ci         brw_pop_insn_state(p);
1194bf215546Sopenharmony_ci         brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
1195bf215546Sopenharmony_ci      }
1196bf215546Sopenharmony_ci   }
1197bf215546Sopenharmony_ci
1198bf215546Sopenharmony_ci   assert(surface_index.file == BRW_IMMEDIATE_VALUE);
1199bf215546Sopenharmony_ci   assert(sampler_index.file == BRW_IMMEDIATE_VALUE);
1200bf215546Sopenharmony_ci
1201bf215546Sopenharmony_ci   brw_SAMPLE(p,
1202bf215546Sopenharmony_ci              retype(dst, BRW_REGISTER_TYPE_UW),
1203bf215546Sopenharmony_ci              inst->base_mrf,
1204bf215546Sopenharmony_ci              src,
1205bf215546Sopenharmony_ci              surface_index.ud,
1206bf215546Sopenharmony_ci              sampler_index.ud % 16,
1207bf215546Sopenharmony_ci              msg_type,
1208bf215546Sopenharmony_ci              inst->size_written / REG_SIZE,
1209bf215546Sopenharmony_ci              inst->mlen,
1210bf215546Sopenharmony_ci              inst->header_size != 0,
1211bf215546Sopenharmony_ci              simd_mode,
1212bf215546Sopenharmony_ci              return_format);
1213bf215546Sopenharmony_ci}
1214bf215546Sopenharmony_ci
1215bf215546Sopenharmony_ci
1216bf215546Sopenharmony_ci/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
1217bf215546Sopenharmony_ci * looking like:
1218bf215546Sopenharmony_ci *
1219bf215546Sopenharmony_ci * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
1220bf215546Sopenharmony_ci *
1221bf215546Sopenharmony_ci * Ideally, we want to produce:
1222bf215546Sopenharmony_ci *
1223bf215546Sopenharmony_ci *           DDX                     DDY
1224bf215546Sopenharmony_ci * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
1225bf215546Sopenharmony_ci *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
1226bf215546Sopenharmony_ci *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
1227bf215546Sopenharmony_ci *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
1228bf215546Sopenharmony_ci *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
1229bf215546Sopenharmony_ci *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
1230bf215546Sopenharmony_ci *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
1231bf215546Sopenharmony_ci *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
1232bf215546Sopenharmony_ci *
1233bf215546Sopenharmony_ci * and add another set of two more subspans if in 16-pixel dispatch mode.
1234bf215546Sopenharmony_ci *
1235bf215546Sopenharmony_ci * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
1236bf215546Sopenharmony_ci * for each pair, and vertstride = 2 jumps us 2 elements after processing a
1237bf215546Sopenharmony_ci * pair.  But the ideal approximation may impose a huge performance cost on
1238bf215546Sopenharmony_ci * sample_d.  On at least Haswell, sample_d instruction does some
1239bf215546Sopenharmony_ci * optimizations if the same LOD is used for all pixels in the subspan.
1240bf215546Sopenharmony_ci *
1241bf215546Sopenharmony_ci * For DDY, we need to use ALIGN16 mode since it's capable of doing the
1242bf215546Sopenharmony_ci * appropriate swizzling.
1243bf215546Sopenharmony_ci */
1244bf215546Sopenharmony_civoid
1245bf215546Sopenharmony_cifs_generator::generate_ddx(const fs_inst *inst,
1246bf215546Sopenharmony_ci                           struct brw_reg dst, struct brw_reg src)
1247bf215546Sopenharmony_ci{
1248bf215546Sopenharmony_ci   unsigned vstride, width;
1249bf215546Sopenharmony_ci
1250bf215546Sopenharmony_ci   if (devinfo->ver >= 8) {
1251bf215546Sopenharmony_ci      if (inst->opcode == FS_OPCODE_DDX_FINE) {
1252bf215546Sopenharmony_ci         /* produce accurate derivatives */
1253bf215546Sopenharmony_ci         vstride = BRW_VERTICAL_STRIDE_2;
1254bf215546Sopenharmony_ci         width = BRW_WIDTH_2;
1255bf215546Sopenharmony_ci      } else {
1256bf215546Sopenharmony_ci         /* replicate the derivative at the top-left pixel to other pixels */
1257bf215546Sopenharmony_ci         vstride = BRW_VERTICAL_STRIDE_4;
1258bf215546Sopenharmony_ci         width = BRW_WIDTH_4;
1259bf215546Sopenharmony_ci      }
1260bf215546Sopenharmony_ci
1261bf215546Sopenharmony_ci      struct brw_reg src0 = byte_offset(src, type_sz(src.type));;
1262bf215546Sopenharmony_ci      struct brw_reg src1 = src;
1263bf215546Sopenharmony_ci
1264bf215546Sopenharmony_ci      src0.vstride = vstride;
1265bf215546Sopenharmony_ci      src0.width   = width;
1266bf215546Sopenharmony_ci      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1267bf215546Sopenharmony_ci      src1.vstride = vstride;
1268bf215546Sopenharmony_ci      src1.width   = width;
1269bf215546Sopenharmony_ci      src1.hstride = BRW_HORIZONTAL_STRIDE_0;
1270bf215546Sopenharmony_ci
1271bf215546Sopenharmony_ci      brw_ADD(p, dst, src0, negate(src1));
1272bf215546Sopenharmony_ci   } else {
1273bf215546Sopenharmony_ci      /* On Haswell and earlier, the region used above appears to not work
1274bf215546Sopenharmony_ci       * correctly for compressed instructions.  At least on Haswell and
1275bf215546Sopenharmony_ci       * Iron Lake, compressed ALIGN16 instructions do work.  Since we
1276bf215546Sopenharmony_ci       * would have to split to SIMD8 no matter which method we choose, we
1277bf215546Sopenharmony_ci       * may as well use ALIGN16 on all platforms gfx7 and earlier.
1278bf215546Sopenharmony_ci       */
1279bf215546Sopenharmony_ci      struct brw_reg src0 = stride(src, 4, 4, 1);
1280bf215546Sopenharmony_ci      struct brw_reg src1 = stride(src, 4, 4, 1);
1281bf215546Sopenharmony_ci      if (inst->opcode == FS_OPCODE_DDX_FINE) {
1282bf215546Sopenharmony_ci         src0.swizzle = BRW_SWIZZLE_XXZZ;
1283bf215546Sopenharmony_ci         src1.swizzle = BRW_SWIZZLE_YYWW;
1284bf215546Sopenharmony_ci      } else {
1285bf215546Sopenharmony_ci         src0.swizzle = BRW_SWIZZLE_XXXX;
1286bf215546Sopenharmony_ci         src1.swizzle = BRW_SWIZZLE_YYYY;
1287bf215546Sopenharmony_ci      }
1288bf215546Sopenharmony_ci
1289bf215546Sopenharmony_ci      brw_push_insn_state(p);
1290bf215546Sopenharmony_ci      brw_set_default_access_mode(p, BRW_ALIGN_16);
1291bf215546Sopenharmony_ci      brw_ADD(p, dst, negate(src0), src1);
1292bf215546Sopenharmony_ci      brw_pop_insn_state(p);
1293bf215546Sopenharmony_ci   }
1294bf215546Sopenharmony_ci}
1295bf215546Sopenharmony_ci
/* Compute the Y derivative of src into dst.  With FS_OPCODE_DDY_FINE a
 * separate derivative is produced for each column of a subspan; otherwise
 * the left column's derivative is replicated to every pixel of the subspan.
 *
 * This function takes no negate flag: in the tl/tr/bl/br layout described
 * above generate_ddx() it always emits (bottom element - top element).
 */
1300bf215546Sopenharmony_civoid
1301bf215546Sopenharmony_cifs_generator::generate_ddy(const fs_inst *inst,
1302bf215546Sopenharmony_ci                           struct brw_reg dst, struct brw_reg src)
1303bf215546Sopenharmony_ci{
1304bf215546Sopenharmony_ci   const uint32_t type_size = type_sz(src.type);
1305bf215546Sopenharmony_ci
1306bf215546Sopenharmony_ci   if (inst->opcode == FS_OPCODE_DDY_FINE) {
1307bf215546Sopenharmony_ci      /* produce accurate derivatives.
1308bf215546Sopenharmony_ci       *
1309bf215546Sopenharmony_ci       * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU)
1310bf215546Sopenharmony_ci       * "Register Region Restrictions", Section "1. Special Restrictions":
1311bf215546Sopenharmony_ci       *
1312bf215546Sopenharmony_ci       *    "In Align16 mode, the channel selects and channel enables apply to
1313bf215546Sopenharmony_ci       *     a pair of half-floats, because these parameters are defined for
1314bf215546Sopenharmony_ci       *     DWord elements ONLY. This is applicable when both source and
1315bf215546Sopenharmony_ci       *     destination are half-floats."
1316bf215546Sopenharmony_ci       *
1317bf215546Sopenharmony_ci       * So for half-float operations we use the Gfx11+ Align1 path. CHV
1318bf215546Sopenharmony_ci       * inherits its FP16 hardware from SKL, so it is not affected.
1319bf215546Sopenharmony_ci       */
1320bf215546Sopenharmony_ci      if (devinfo->ver >= 11 ||
1321bf215546Sopenharmony_ci          (devinfo->platform == INTEL_PLATFORM_BDW && src.type == BRW_REGISTER_TYPE_HF)) {
1322bf215546Sopenharmony_ci         src = stride(src, 0, 2, 1);
1323bf215546Sopenharmony_ci
1324bf215546Sopenharmony_ci         brw_push_insn_state(p);
1325bf215546Sopenharmony_ci         brw_set_default_exec_size(p, BRW_EXECUTE_4);
1326bf215546Sopenharmony_ci         for (uint32_t g = 0; g < inst->exec_size; g += 4) {
1327bf215546Sopenharmony_ci            brw_set_default_group(p, inst->group + g);
1328bf215546Sopenharmony_ci            brw_ADD(p, byte_offset(dst, g * type_size),
1329bf215546Sopenharmony_ci                       negate(byte_offset(src,  g * type_size)),
1330bf215546Sopenharmony_ci                       byte_offset(src, (g + 2) * type_size));
1331bf215546Sopenharmony_ci            brw_set_default_swsb(p, tgl_swsb_null());
1332bf215546Sopenharmony_ci         }
1333bf215546Sopenharmony_ci         brw_pop_insn_state(p);
1334bf215546Sopenharmony_ci      } else {
1335bf215546Sopenharmony_ci         struct brw_reg src0 = stride(src, 4, 4, 1);
1336bf215546Sopenharmony_ci         struct brw_reg src1 = stride(src, 4, 4, 1);
1337bf215546Sopenharmony_ci         src0.swizzle = BRW_SWIZZLE_XYXY;
1338bf215546Sopenharmony_ci         src1.swizzle = BRW_SWIZZLE_ZWZW;
1339bf215546Sopenharmony_ci
1340bf215546Sopenharmony_ci         brw_push_insn_state(p);
1341bf215546Sopenharmony_ci         brw_set_default_access_mode(p, BRW_ALIGN_16);
1342bf215546Sopenharmony_ci         brw_ADD(p, dst, negate(src0), src1);
1343bf215546Sopenharmony_ci         brw_pop_insn_state(p);
1344bf215546Sopenharmony_ci      }
1345bf215546Sopenharmony_ci   } else {
1346bf215546Sopenharmony_ci      /* replicate the derivative at the top-left pixel to other pixels */
1347bf215546Sopenharmony_ci      if (devinfo->ver >= 8) {
1348bf215546Sopenharmony_ci         struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size);
1349bf215546Sopenharmony_ci         struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size);
1350bf215546Sopenharmony_ci
1351bf215546Sopenharmony_ci         brw_ADD(p, dst, negate(src0), src1);
1352bf215546Sopenharmony_ci      } else {
1353bf215546Sopenharmony_ci         /* On Haswell and earlier, the region used above appears to not work
1354bf215546Sopenharmony_ci          * correctly for compressed instructions.  At least on Haswell and
1355bf215546Sopenharmony_ci          * Iron Lake, compressed ALIGN16 instructions do work.  Since we
1356bf215546Sopenharmony_ci          * would have to split to SIMD8 no matter which method we choose, we
1357bf215546Sopenharmony_ci          * may as well use ALIGN16 on all platforms gfx7 and earlier.
1358bf215546Sopenharmony_ci          */
1359bf215546Sopenharmony_ci         struct brw_reg src0 = stride(src, 4, 4, 1);
1360bf215546Sopenharmony_ci         struct brw_reg src1 = stride(src, 4, 4, 1);
1361bf215546Sopenharmony_ci         src0.swizzle = BRW_SWIZZLE_XXXX;
1362bf215546Sopenharmony_ci         src1.swizzle = BRW_SWIZZLE_ZZZZ;
1363bf215546Sopenharmony_ci
1364bf215546Sopenharmony_ci         brw_push_insn_state(p);
1365bf215546Sopenharmony_ci         brw_set_default_access_mode(p, BRW_ALIGN_16);
1366bf215546Sopenharmony_ci         brw_ADD(p, dst, negate(src0), src1);
1367bf215546Sopenharmony_ci         brw_pop_insn_state(p);
1368bf215546Sopenharmony_ci      }
1369bf215546Sopenharmony_ci   }
1370bf215546Sopenharmony_ci}
1371bf215546Sopenharmony_ci
1372bf215546Sopenharmony_civoid
1373bf215546Sopenharmony_cifs_generator::generate_halt(fs_inst *)
1374bf215546Sopenharmony_ci{
1375bf215546Sopenharmony_ci   /* This HALT will be patched up at FB write time to point UIP at the end of
1376bf215546Sopenharmony_ci    * the program, and at brw_uip_jip() JIP will be set to the end of the
1377bf215546Sopenharmony_ci    * current block (or the program).
1378bf215546Sopenharmony_ci    */
1379bf215546Sopenharmony_ci   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
1380bf215546Sopenharmony_ci   brw_HALT(p);
1381bf215546Sopenharmony_ci}
1382bf215546Sopenharmony_ci
1383bf215546Sopenharmony_civoid
1384bf215546Sopenharmony_cifs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
1385bf215546Sopenharmony_ci{
1386bf215546Sopenharmony_ci   /* The 32-wide messages only respect the first 16-wide half of the channel
1387bf215546Sopenharmony_ci    * enable signals which are replicated identically for the second group of
1388bf215546Sopenharmony_ci    * 16 channels, so we cannot use them unless the write is marked
1389bf215546Sopenharmony_ci    * force_writemask_all.
1390bf215546Sopenharmony_ci    */
1391bf215546Sopenharmony_ci   const unsigned lower_size = inst->force_writemask_all ? inst->exec_size :
1392bf215546Sopenharmony_ci                               MIN2(16, inst->exec_size);
1393bf215546Sopenharmony_ci   const unsigned block_size = 4 * lower_size / REG_SIZE;
1394bf215546Sopenharmony_ci   const tgl_swsb swsb = brw_get_default_swsb(p);
1395bf215546Sopenharmony_ci   assert(inst->mlen != 0);
1396bf215546Sopenharmony_ci
1397bf215546Sopenharmony_ci   brw_push_insn_state(p);
1398bf215546Sopenharmony_ci   brw_set_default_exec_size(p, cvt(lower_size) - 1);
1399bf215546Sopenharmony_ci   brw_set_default_compression(p, lower_size > 8);
1400bf215546Sopenharmony_ci
1401bf215546Sopenharmony_ci   for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
1402bf215546Sopenharmony_ci      brw_set_default_group(p, inst->group + lower_size * i);
1403bf215546Sopenharmony_ci
1404bf215546Sopenharmony_ci      if (i > 0) {
1405bf215546Sopenharmony_ci         assert(swsb.mode & TGL_SBID_SET);
1406bf215546Sopenharmony_ci         brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_SRC, swsb.sbid));
1407bf215546Sopenharmony_ci      } else {
1408bf215546Sopenharmony_ci         brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
1409bf215546Sopenharmony_ci      }
1410bf215546Sopenharmony_ci
1411bf215546Sopenharmony_ci      brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0),
1412bf215546Sopenharmony_ci              retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));
1413bf215546Sopenharmony_ci
1414bf215546Sopenharmony_ci      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
1415bf215546Sopenharmony_ci      brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
1416bf215546Sopenharmony_ci                                    block_size,
1417bf215546Sopenharmony_ci                                    inst->offset + block_size * REG_SIZE * i);
1418bf215546Sopenharmony_ci   }
1419bf215546Sopenharmony_ci
1420bf215546Sopenharmony_ci   brw_pop_insn_state(p);
1421bf215546Sopenharmony_ci}
1422bf215546Sopenharmony_ci
1423bf215546Sopenharmony_civoid
1424bf215546Sopenharmony_cifs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
1425bf215546Sopenharmony_ci{
1426bf215546Sopenharmony_ci   assert(inst->exec_size <= 16 || inst->force_writemask_all);
1427bf215546Sopenharmony_ci   assert(inst->mlen != 0);
1428bf215546Sopenharmony_ci
1429bf215546Sopenharmony_ci   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
1430bf215546Sopenharmony_ci                                inst->exec_size / 8, inst->offset);
1431bf215546Sopenharmony_ci}
1432bf215546Sopenharmony_ci
1433bf215546Sopenharmony_civoid
1434bf215546Sopenharmony_cifs_generator::generate_scratch_read_gfx7(fs_inst *inst, struct brw_reg dst)
1435bf215546Sopenharmony_ci{
1436bf215546Sopenharmony_ci   assert(inst->exec_size <= 16 || inst->force_writemask_all);
1437bf215546Sopenharmony_ci
1438bf215546Sopenharmony_ci   gfx7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
1439bf215546Sopenharmony_ci}
1440bf215546Sopenharmony_ci
1441bf215546Sopenharmony_ci/* The A32 messages take a buffer base address in header.5:[31:0] (See
1442bf215546Sopenharmony_ci * MH1_A32_PSM for typed messages or MH_A32_GO for byte/dword scattered
1443bf215546Sopenharmony_ci * and OWord block messages in the SKL PRM Vol. 2d for more details.)
1444bf215546Sopenharmony_ci * Unfortunately, there are a number of subtle differences:
1445bf215546Sopenharmony_ci *
1446bf215546Sopenharmony_ci * For the block read/write messages:
1447bf215546Sopenharmony_ci *
1448bf215546Sopenharmony_ci *   - We always stomp header.2 to fill in the actual scratch address (in
1449bf215546Sopenharmony_ci *     units of OWORDs) so we don't care what's in there.
1450bf215546Sopenharmony_ci *
1451bf215546Sopenharmony_ci *   - They rely on per-thread scratch space value in header.3[3:0] to do
1452bf215546Sopenharmony_ci *     bounds checking so that needs to be valid.  The upper bits of
1453bf215546Sopenharmony_ci *     header.3 are ignored, though, so we can copy all of g0.3.
1454bf215546Sopenharmony_ci *
 *   - They ignore header.5[9:0] and assume the address is 1KB aligned.
1456bf215546Sopenharmony_ci *
1457bf215546Sopenharmony_ci *
1458bf215546Sopenharmony_ci * For the byte/dword scattered read/write messages:
1459bf215546Sopenharmony_ci *
1460bf215546Sopenharmony_ci *   - We want header.2 to be zero because that gets added to the per-channel
1461bf215546Sopenharmony_ci *     offset in the non-header portion of the message.
1462bf215546Sopenharmony_ci *
1463bf215546Sopenharmony_ci *   - Contrary to what the docs claim, they don't do any bounds checking so
1464bf215546Sopenharmony_ci *     the value of header.3[3:0] doesn't matter.
1465bf215546Sopenharmony_ci *
1466bf215546Sopenharmony_ci *   - They consider all of header.5 for the base address and header.5[9:0]
1467bf215546Sopenharmony_ci *     are not ignored.  This means that we can't copy g0.5 verbatim because
1468bf215546Sopenharmony_ci *     g0.5[9:0] contains the FFTID on most platforms.  Instead, we have to
1469bf215546Sopenharmony_ci *     use an AND to mask off the bottom 10 bits.
1470bf215546Sopenharmony_ci *
1471bf215546Sopenharmony_ci *
1472bf215546Sopenharmony_ci * For block messages, just copying g0 gives a valid header because all the
1473bf215546Sopenharmony_ci * garbage gets ignored except for header.2 which we stomp as part of message
1474bf215546Sopenharmony_ci * setup.  For byte/dword scattered messages, we can just zero out the header
1475bf215546Sopenharmony_ci * and copy over the bits we need from g0.5.  This opcode, however, tries to
1476bf215546Sopenharmony_ci * satisfy the requirements of both by starting with 0 and filling out the
1477bf215546Sopenharmony_ci * information required by either set of opcodes.
1478bf215546Sopenharmony_ci */
1479bf215546Sopenharmony_civoid
1480bf215546Sopenharmony_cifs_generator::generate_scratch_header(fs_inst *inst, struct brw_reg dst)
1481bf215546Sopenharmony_ci{
1482bf215546Sopenharmony_ci   assert(inst->exec_size == 8 && inst->force_writemask_all);
1483bf215546Sopenharmony_ci   assert(dst.file == BRW_GENERAL_REGISTER_FILE);
1484bf215546Sopenharmony_ci
1485bf215546Sopenharmony_ci   dst.type = BRW_REGISTER_TYPE_UD;
1486bf215546Sopenharmony_ci
1487bf215546Sopenharmony_ci   brw_inst *insn = brw_MOV(p, dst, brw_imm_ud(0));
1488bf215546Sopenharmony_ci   if (devinfo->ver >= 12)
1489bf215546Sopenharmony_ci      brw_set_default_swsb(p, tgl_swsb_null());
1490bf215546Sopenharmony_ci   else
1491bf215546Sopenharmony_ci      brw_inst_set_no_dd_clear(p->devinfo, insn, true);
1492bf215546Sopenharmony_ci
1493bf215546Sopenharmony_ci   /* Copy the per-thread scratch space size from g0.3[3:0] */
1494bf215546Sopenharmony_ci   brw_set_default_exec_size(p, BRW_EXECUTE_1);
1495bf215546Sopenharmony_ci   insn = brw_AND(p, suboffset(dst, 3),
1496bf215546Sopenharmony_ci                     retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
1497bf215546Sopenharmony_ci                     brw_imm_ud(INTEL_MASK(3, 0)));
1498bf215546Sopenharmony_ci   if (devinfo->ver < 12) {
1499bf215546Sopenharmony_ci      brw_inst_set_no_dd_clear(p->devinfo, insn, true);
1500bf215546Sopenharmony_ci      brw_inst_set_no_dd_check(p->devinfo, insn, true);
1501bf215546Sopenharmony_ci   }
1502bf215546Sopenharmony_ci
1503bf215546Sopenharmony_ci   /* Copy the scratch base address from g0.5[31:10] */
1504bf215546Sopenharmony_ci   insn = brw_AND(p, suboffset(dst, 5),
1505bf215546Sopenharmony_ci                     retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
1506bf215546Sopenharmony_ci                     brw_imm_ud(INTEL_MASK(31, 10)));
1507bf215546Sopenharmony_ci   if (devinfo->ver < 12)
1508bf215546Sopenharmony_ci      brw_inst_set_no_dd_check(p->devinfo, insn, true);
1509bf215546Sopenharmony_ci}
1510bf215546Sopenharmony_ci
1511bf215546Sopenharmony_civoid
1512bf215546Sopenharmony_cifs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
1513bf215546Sopenharmony_ci                                                  struct brw_reg dst,
1514bf215546Sopenharmony_ci                                                  struct brw_reg index,
1515bf215546Sopenharmony_ci                                                  struct brw_reg offset)
1516bf215546Sopenharmony_ci{
1517bf215546Sopenharmony_ci   assert(type_sz(dst.type) == 4);
1518bf215546Sopenharmony_ci   assert(inst->mlen != 0);
1519bf215546Sopenharmony_ci
1520bf215546Sopenharmony_ci   assert(index.file == BRW_IMMEDIATE_VALUE &&
1521bf215546Sopenharmony_ci	  index.type == BRW_REGISTER_TYPE_UD);
1522bf215546Sopenharmony_ci   uint32_t surf_index = index.ud;
1523bf215546Sopenharmony_ci
1524bf215546Sopenharmony_ci   assert(offset.file == BRW_IMMEDIATE_VALUE &&
1525bf215546Sopenharmony_ci	  offset.type == BRW_REGISTER_TYPE_UD);
1526bf215546Sopenharmony_ci   uint32_t read_offset = offset.ud;
1527bf215546Sopenharmony_ci
1528bf215546Sopenharmony_ci   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
1529bf215546Sopenharmony_ci			read_offset, surf_index);
1530bf215546Sopenharmony_ci}
1531bf215546Sopenharmony_ci
1532bf215546Sopenharmony_civoid
1533bf215546Sopenharmony_cifs_generator::generate_uniform_pull_constant_load_gfx7(fs_inst *inst,
1534bf215546Sopenharmony_ci                                                       struct brw_reg dst,
1535bf215546Sopenharmony_ci                                                       struct brw_reg index,
1536bf215546Sopenharmony_ci                                                       struct brw_reg payload)
1537bf215546Sopenharmony_ci{
1538bf215546Sopenharmony_ci   assert(index.type == BRW_REGISTER_TYPE_UD);
1539bf215546Sopenharmony_ci   assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1540bf215546Sopenharmony_ci   assert(type_sz(dst.type) == 4);
1541bf215546Sopenharmony_ci   assert(!devinfo->has_lsc);
1542bf215546Sopenharmony_ci
1543bf215546Sopenharmony_ci   if (index.file == BRW_IMMEDIATE_VALUE) {
1544bf215546Sopenharmony_ci      const uint32_t surf_index = index.ud;
1545bf215546Sopenharmony_ci
1546bf215546Sopenharmony_ci      brw_push_insn_state(p);
1547bf215546Sopenharmony_ci      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1548bf215546Sopenharmony_ci      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1549bf215546Sopenharmony_ci      brw_pop_insn_state(p);
1550bf215546Sopenharmony_ci
1551bf215546Sopenharmony_ci      brw_inst_set_sfid(devinfo, send, GFX6_SFID_DATAPORT_CONSTANT_CACHE);
1552bf215546Sopenharmony_ci      brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
1553bf215546Sopenharmony_ci      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
1554bf215546Sopenharmony_ci      brw_set_desc(p, send,
1555bf215546Sopenharmony_ci                   brw_message_desc(devinfo, 1, DIV_ROUND_UP(inst->size_written,
1556bf215546Sopenharmony_ci                                                             REG_SIZE), true) |
1557bf215546Sopenharmony_ci                   brw_dp_desc(devinfo, surf_index,
1558bf215546Sopenharmony_ci                               GFX7_DATAPORT_DC_OWORD_BLOCK_READ,
1559bf215546Sopenharmony_ci                               BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size)));
1560bf215546Sopenharmony_ci
1561bf215546Sopenharmony_ci   } else {
1562bf215546Sopenharmony_ci      const tgl_swsb swsb = brw_get_default_swsb(p);
1563bf215546Sopenharmony_ci      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
1564bf215546Sopenharmony_ci
1565bf215546Sopenharmony_ci      brw_push_insn_state(p);
1566bf215546Sopenharmony_ci      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1567bf215546Sopenharmony_ci
1568bf215546Sopenharmony_ci      /* a0.0 = surf_index & 0xff */
1569bf215546Sopenharmony_ci      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
1570bf215546Sopenharmony_ci      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
1571bf215546Sopenharmony_ci      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
1572bf215546Sopenharmony_ci      brw_set_dest(p, insn_and, addr);
1573bf215546Sopenharmony_ci      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
1574bf215546Sopenharmony_ci      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
1575bf215546Sopenharmony_ci
1576bf215546Sopenharmony_ci      /* dst = send(payload, a0.0 | <descriptor>) */
1577bf215546Sopenharmony_ci      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
1578bf215546Sopenharmony_ci      brw_send_indirect_message(
1579bf215546Sopenharmony_ci         p, GFX6_SFID_DATAPORT_CONSTANT_CACHE,
1580bf215546Sopenharmony_ci         retype(dst, BRW_REGISTER_TYPE_UD),
1581bf215546Sopenharmony_ci         retype(payload, BRW_REGISTER_TYPE_UD), addr,
1582bf215546Sopenharmony_ci         brw_message_desc(devinfo, 1,
1583bf215546Sopenharmony_ci                          DIV_ROUND_UP(inst->size_written, REG_SIZE), true) |
1584bf215546Sopenharmony_ci         brw_dp_desc(devinfo, 0 /* surface */,
1585bf215546Sopenharmony_ci                     GFX7_DATAPORT_DC_OWORD_BLOCK_READ,
1586bf215546Sopenharmony_ci                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size)),
1587bf215546Sopenharmony_ci         false /* EOT */);
1588bf215546Sopenharmony_ci
1589bf215546Sopenharmony_ci      brw_pop_insn_state(p);
1590bf215546Sopenharmony_ci   }
1591bf215546Sopenharmony_ci}
1592bf215546Sopenharmony_ci
1593bf215546Sopenharmony_civoid
1594bf215546Sopenharmony_cifs_generator::generate_varying_pull_constant_load_gfx4(fs_inst *inst,
1595bf215546Sopenharmony_ci                                                       struct brw_reg dst,
1596bf215546Sopenharmony_ci                                                       struct brw_reg index)
1597bf215546Sopenharmony_ci{
1598bf215546Sopenharmony_ci   assert(devinfo->ver < 7); /* Should use the gfx7 variant. */
1599bf215546Sopenharmony_ci   assert(inst->header_size != 0);
1600bf215546Sopenharmony_ci   assert(inst->mlen);
1601bf215546Sopenharmony_ci
1602bf215546Sopenharmony_ci   assert(index.file == BRW_IMMEDIATE_VALUE &&
1603bf215546Sopenharmony_ci	  index.type == BRW_REGISTER_TYPE_UD);
1604bf215546Sopenharmony_ci   uint32_t surf_index = index.ud;
1605bf215546Sopenharmony_ci
1606bf215546Sopenharmony_ci   uint32_t simd_mode, rlen, msg_type;
1607bf215546Sopenharmony_ci   if (inst->exec_size == 16) {
1608bf215546Sopenharmony_ci      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1609bf215546Sopenharmony_ci      rlen = 8;
1610bf215546Sopenharmony_ci   } else {
1611bf215546Sopenharmony_ci      assert(inst->exec_size == 8);
1612bf215546Sopenharmony_ci      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1613bf215546Sopenharmony_ci      rlen = 4;
1614bf215546Sopenharmony_ci   }
1615bf215546Sopenharmony_ci
1616bf215546Sopenharmony_ci   if (devinfo->ver >= 5)
1617bf215546Sopenharmony_ci      msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
1618bf215546Sopenharmony_ci   else {
1619bf215546Sopenharmony_ci      /* We always use the SIMD16 message so that we only have to load U, and
1620bf215546Sopenharmony_ci       * not V or R.
1621bf215546Sopenharmony_ci       */
1622bf215546Sopenharmony_ci      msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
1623bf215546Sopenharmony_ci      assert(inst->mlen == 3);
1624bf215546Sopenharmony_ci      assert(inst->size_written == 8 * REG_SIZE);
1625bf215546Sopenharmony_ci      rlen = 8;
1626bf215546Sopenharmony_ci      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1627bf215546Sopenharmony_ci   }
1628bf215546Sopenharmony_ci
1629bf215546Sopenharmony_ci   struct brw_reg header = brw_vec8_grf(0, 0);
1630bf215546Sopenharmony_ci   gfx6_resolve_implied_move(p, &header, inst->base_mrf);
1631bf215546Sopenharmony_ci
1632bf215546Sopenharmony_ci   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1633bf215546Sopenharmony_ci   brw_inst_set_compression(devinfo, send, false);
1634bf215546Sopenharmony_ci   brw_inst_set_sfid(devinfo, send, BRW_SFID_SAMPLER);
1635bf215546Sopenharmony_ci   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
1636bf215546Sopenharmony_ci   brw_set_src0(p, send, header);
1637bf215546Sopenharmony_ci   if (devinfo->ver < 6)
1638bf215546Sopenharmony_ci      brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);
1639bf215546Sopenharmony_ci
1640bf215546Sopenharmony_ci   /* Our surface is set up as floats, regardless of what actual data is
1641bf215546Sopenharmony_ci    * stored in it.
1642bf215546Sopenharmony_ci    */
1643bf215546Sopenharmony_ci   uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
1644bf215546Sopenharmony_ci   brw_set_desc(p, send,
1645bf215546Sopenharmony_ci                brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size) |
1646bf215546Sopenharmony_ci                brw_sampler_desc(devinfo, surf_index,
1647bf215546Sopenharmony_ci                                 0, /* sampler (unused) */
1648bf215546Sopenharmony_ci                                 msg_type, simd_mode, return_format));
1649bf215546Sopenharmony_ci}
1650bf215546Sopenharmony_ci
1651bf215546Sopenharmony_civoid
1652bf215546Sopenharmony_cifs_generator::generate_pixel_interpolator_query(fs_inst *inst,
1653bf215546Sopenharmony_ci                                                struct brw_reg dst,
1654bf215546Sopenharmony_ci                                                struct brw_reg src,
1655bf215546Sopenharmony_ci                                                struct brw_reg msg_data,
1656bf215546Sopenharmony_ci                                                unsigned msg_type)
1657bf215546Sopenharmony_ci{
1658bf215546Sopenharmony_ci   const bool has_payload = inst->src[0].file != BAD_FILE;
1659bf215546Sopenharmony_ci   assert(msg_data.type == BRW_REGISTER_TYPE_UD);
1660bf215546Sopenharmony_ci   assert(inst->size_written % REG_SIZE == 0);
1661bf215546Sopenharmony_ci
1662bf215546Sopenharmony_ci   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
1663bf215546Sopenharmony_ci
1664bf215546Sopenharmony_ci   brw_pixel_interpolator_query(p,
1665bf215546Sopenharmony_ci         retype(dst, BRW_REGISTER_TYPE_UW),
1666bf215546Sopenharmony_ci         /* If we don't have a payload, what we send doesn't matter */
1667bf215546Sopenharmony_ci         has_payload ? src : brw_vec8_grf(0, 0),
1668bf215546Sopenharmony_ci         inst->pi_noperspective,
1669bf215546Sopenharmony_ci         prog_data->per_coarse_pixel_dispatch,
1670bf215546Sopenharmony_ci         msg_type,
1671bf215546Sopenharmony_ci         msg_data,
1672bf215546Sopenharmony_ci         has_payload ? 2 * inst->exec_size / 8 : 1,
1673bf215546Sopenharmony_ci         inst->size_written / REG_SIZE);
1674bf215546Sopenharmony_ci}
1675bf215546Sopenharmony_ci
1676bf215546Sopenharmony_ci/* Sets vstride=1, width=4, hstride=0 of register src1 during
1677bf215546Sopenharmony_ci * the ADD instruction.
1678bf215546Sopenharmony_ci */
1679bf215546Sopenharmony_civoid
1680bf215546Sopenharmony_cifs_generator::generate_set_sample_id(fs_inst *inst,
1681bf215546Sopenharmony_ci                                     struct brw_reg dst,
1682bf215546Sopenharmony_ci                                     struct brw_reg src0,
1683bf215546Sopenharmony_ci                                     struct brw_reg src1)
1684bf215546Sopenharmony_ci{
1685bf215546Sopenharmony_ci   assert(dst.type == BRW_REGISTER_TYPE_D ||
1686bf215546Sopenharmony_ci          dst.type == BRW_REGISTER_TYPE_UD);
1687bf215546Sopenharmony_ci   assert(src0.type == BRW_REGISTER_TYPE_D ||
1688bf215546Sopenharmony_ci          src0.type == BRW_REGISTER_TYPE_UD);
1689bf215546Sopenharmony_ci
1690bf215546Sopenharmony_ci   const struct brw_reg reg = stride(src1, 1, 4, 0);
1691bf215546Sopenharmony_ci   const unsigned lower_size = MIN2(inst->exec_size,
1692bf215546Sopenharmony_ci                                    devinfo->ver >= 8 ? 16 : 8);
1693bf215546Sopenharmony_ci
1694bf215546Sopenharmony_ci   for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
1695bf215546Sopenharmony_ci      brw_inst *insn = brw_ADD(p, offset(dst, i * lower_size / 8),
1696bf215546Sopenharmony_ci                               offset(src0, (src0.vstride == 0 ? 0 : (1 << (src0.vstride - 1)) *
1697bf215546Sopenharmony_ci                                             (i * lower_size / (1 << src0.width))) *
1698bf215546Sopenharmony_ci                                            type_sz(src0.type) / REG_SIZE),
1699bf215546Sopenharmony_ci                               suboffset(reg, i * lower_size / 4));
1700bf215546Sopenharmony_ci      brw_inst_set_exec_size(devinfo, insn, cvt(lower_size) - 1);
1701bf215546Sopenharmony_ci      brw_inst_set_group(devinfo, insn, inst->group + lower_size * i);
1702bf215546Sopenharmony_ci      brw_inst_set_compression(devinfo, insn, lower_size > 8);
1703bf215546Sopenharmony_ci      brw_set_default_swsb(p, tgl_swsb_null());
1704bf215546Sopenharmony_ci   }
1705bf215546Sopenharmony_ci}
1706bf215546Sopenharmony_ci
1707bf215546Sopenharmony_civoid
1708bf215546Sopenharmony_cifs_generator::generate_pack_half_2x16_split(fs_inst *,
1709bf215546Sopenharmony_ci                                            struct brw_reg dst,
1710bf215546Sopenharmony_ci                                            struct brw_reg x,
1711bf215546Sopenharmony_ci                                            struct brw_reg y)
1712bf215546Sopenharmony_ci{
1713bf215546Sopenharmony_ci   assert(devinfo->ver >= 7);
1714bf215546Sopenharmony_ci   assert(dst.type == BRW_REGISTER_TYPE_UD);
1715bf215546Sopenharmony_ci   assert(x.type == BRW_REGISTER_TYPE_F);
1716bf215546Sopenharmony_ci   assert(y.type == BRW_REGISTER_TYPE_F);
1717bf215546Sopenharmony_ci
1718bf215546Sopenharmony_ci   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
1719bf215546Sopenharmony_ci    *
1720bf215546Sopenharmony_ci    *   Because this instruction does not have a 16-bit floating-point type,
1721bf215546Sopenharmony_ci    *   the destination data type must be Word (W).
1722bf215546Sopenharmony_ci    *
1723bf215546Sopenharmony_ci    *   The destination must be DWord-aligned and specify a horizontal stride
1724bf215546Sopenharmony_ci    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
1725bf215546Sopenharmony_ci    *   each destination channel and the upper word is not modified.
1726bf215546Sopenharmony_ci    */
1727bf215546Sopenharmony_ci   const enum brw_reg_type t = devinfo->ver > 7
1728bf215546Sopenharmony_ci      ? BRW_REGISTER_TYPE_HF : BRW_REGISTER_TYPE_W;
1729bf215546Sopenharmony_ci   struct brw_reg dst_w = spread(retype(dst, t), 2);
1730bf215546Sopenharmony_ci
1731bf215546Sopenharmony_ci   if (y.file == IMM) {
1732bf215546Sopenharmony_ci      const uint32_t hhhh0000 = _mesa_float_to_half(y.f) << 16;
1733bf215546Sopenharmony_ci
1734bf215546Sopenharmony_ci      brw_MOV(p, dst, brw_imm_ud(hhhh0000));
1735bf215546Sopenharmony_ci      brw_set_default_swsb(p, tgl_swsb_regdist(1));
1736bf215546Sopenharmony_ci   } else {
1737bf215546Sopenharmony_ci      /* Give each 32-bit channel of dst the form below, where "." means
1738bf215546Sopenharmony_ci       * unchanged.
1739bf215546Sopenharmony_ci       *   0x....hhhh
1740bf215546Sopenharmony_ci       */
1741bf215546Sopenharmony_ci      brw_F32TO16(p, dst_w, y);
1742bf215546Sopenharmony_ci
1743bf215546Sopenharmony_ci      /* Now the form:
1744bf215546Sopenharmony_ci       *   0xhhhh0000
1745bf215546Sopenharmony_ci       */
1746bf215546Sopenharmony_ci      brw_set_default_swsb(p, tgl_swsb_regdist(1));
1747bf215546Sopenharmony_ci      brw_SHL(p, dst, dst, brw_imm_ud(16u));
1748bf215546Sopenharmony_ci   }
1749bf215546Sopenharmony_ci
1750bf215546Sopenharmony_ci   /* And, finally the form of packHalf2x16's output:
1751bf215546Sopenharmony_ci    *   0xhhhhllll
1752bf215546Sopenharmony_ci    */
1753bf215546Sopenharmony_ci   brw_F32TO16(p, dst_w, x);
1754bf215546Sopenharmony_ci}
1755bf215546Sopenharmony_ci
1756bf215546Sopenharmony_civoid
1757bf215546Sopenharmony_cifs_generator::enable_debug(const char *shader_name)
1758bf215546Sopenharmony_ci{
1759bf215546Sopenharmony_ci   debug_flag = true;
1760bf215546Sopenharmony_ci   this->shader_name = shader_name;
1761bf215546Sopenharmony_ci}
1762bf215546Sopenharmony_ci
1763bf215546Sopenharmony_ciint
1764bf215546Sopenharmony_cifs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
1765bf215546Sopenharmony_ci                            struct shader_stats shader_stats,
1766bf215546Sopenharmony_ci                            const brw::performance &perf,
1767bf215546Sopenharmony_ci                            struct brw_compile_stats *stats)
1768bf215546Sopenharmony_ci{
1769bf215546Sopenharmony_ci   /* align to 64 byte boundary. */
1770bf215546Sopenharmony_ci   brw_realign(p, 64);
1771bf215546Sopenharmony_ci
1772bf215546Sopenharmony_ci   this->dispatch_width = dispatch_width;
1773bf215546Sopenharmony_ci
1774bf215546Sopenharmony_ci   int start_offset = p->next_insn_offset;
1775bf215546Sopenharmony_ci
1776bf215546Sopenharmony_ci   int loop_count = 0, send_count = 0, nop_count = 0;
1777bf215546Sopenharmony_ci   bool is_accum_used = false;
1778bf215546Sopenharmony_ci
1779bf215546Sopenharmony_ci   struct disasm_info *disasm_info = disasm_initialize(p->isa, cfg);
1780bf215546Sopenharmony_ci
1781bf215546Sopenharmony_ci   foreach_block_and_inst (block, fs_inst, inst, cfg) {
1782bf215546Sopenharmony_ci      if (inst->opcode == SHADER_OPCODE_UNDEF)
1783bf215546Sopenharmony_ci         continue;
1784bf215546Sopenharmony_ci
1785bf215546Sopenharmony_ci      struct brw_reg src[4], dst;
1786bf215546Sopenharmony_ci      unsigned int last_insn_offset = p->next_insn_offset;
1787bf215546Sopenharmony_ci      bool multiple_instructions_emitted = false;
1788bf215546Sopenharmony_ci      tgl_swsb swsb = inst->sched;
1789bf215546Sopenharmony_ci
1790bf215546Sopenharmony_ci      /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
1791bf215546Sopenharmony_ci       * "Register Region Restrictions" section: for BDW, SKL:
1792bf215546Sopenharmony_ci       *
1793bf215546Sopenharmony_ci       *    "A POW/FDIV operation must not be followed by an instruction
1794bf215546Sopenharmony_ci       *     that requires two destination registers."
1795bf215546Sopenharmony_ci       *
1796bf215546Sopenharmony_ci       * The documentation is often lacking annotations for Atom parts,
1797bf215546Sopenharmony_ci       * and empirically this affects CHV as well.
1798bf215546Sopenharmony_ci       */
1799bf215546Sopenharmony_ci      if (devinfo->ver >= 8 &&
1800bf215546Sopenharmony_ci          devinfo->ver <= 9 &&
1801bf215546Sopenharmony_ci          p->nr_insn > 1 &&
1802bf215546Sopenharmony_ci          brw_inst_opcode(p->isa, brw_last_inst) == BRW_OPCODE_MATH &&
1803bf215546Sopenharmony_ci          brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
1804bf215546Sopenharmony_ci          inst->dst.component_size(inst->exec_size) > REG_SIZE) {
1805bf215546Sopenharmony_ci         brw_NOP(p);
1806bf215546Sopenharmony_ci         last_insn_offset = p->next_insn_offset;
1807bf215546Sopenharmony_ci
1808bf215546Sopenharmony_ci         /* In order to avoid spurious instruction count differences when the
1809bf215546Sopenharmony_ci          * instruction schedule changes, keep track of the number of inserted
1810bf215546Sopenharmony_ci          * NOPs.
1811bf215546Sopenharmony_ci          */
1812bf215546Sopenharmony_ci         nop_count++;
1813bf215546Sopenharmony_ci      }
1814bf215546Sopenharmony_ci
1815bf215546Sopenharmony_ci      /* Wa_14010017096:
1816bf215546Sopenharmony_ci       *
1817bf215546Sopenharmony_ci       * Clear accumulator register before end of thread.
1818bf215546Sopenharmony_ci       */
1819bf215546Sopenharmony_ci      if (inst->eot && is_accum_used && devinfo->ver >= 12) {
1820bf215546Sopenharmony_ci         brw_set_default_exec_size(p, BRW_EXECUTE_16);
1821bf215546Sopenharmony_ci         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1822bf215546Sopenharmony_ci         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1823bf215546Sopenharmony_ci         brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
1824bf215546Sopenharmony_ci         brw_MOV(p, brw_acc_reg(8), brw_imm_f(0.0f));
1825bf215546Sopenharmony_ci         last_insn_offset = p->next_insn_offset;
1826bf215546Sopenharmony_ci         swsb = tgl_swsb_dst_dep(swsb, 1);
1827bf215546Sopenharmony_ci      }
1828bf215546Sopenharmony_ci
1829bf215546Sopenharmony_ci      if (!is_accum_used && !inst->eot) {
1830bf215546Sopenharmony_ci         is_accum_used = inst->writes_accumulator_implicitly(devinfo) ||
1831bf215546Sopenharmony_ci                         inst->dst.is_accumulator();
1832bf215546Sopenharmony_ci      }
1833bf215546Sopenharmony_ci
1834bf215546Sopenharmony_ci      /* Wa_14013745556:
1835bf215546Sopenharmony_ci       *
1836bf215546Sopenharmony_ci       * Always use @1 SWSB for EOT.
1837bf215546Sopenharmony_ci       */
1838bf215546Sopenharmony_ci      if (inst->eot && devinfo->ver >= 12) {
1839bf215546Sopenharmony_ci         if (tgl_swsb_src_dep(swsb).mode) {
1840bf215546Sopenharmony_ci            brw_set_default_exec_size(p, BRW_EXECUTE_1);
1841bf215546Sopenharmony_ci            brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1842bf215546Sopenharmony_ci            brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1843bf215546Sopenharmony_ci            brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
1844bf215546Sopenharmony_ci            brw_SYNC(p, TGL_SYNC_NOP);
1845bf215546Sopenharmony_ci            last_insn_offset = p->next_insn_offset;
1846bf215546Sopenharmony_ci         }
1847bf215546Sopenharmony_ci
1848bf215546Sopenharmony_ci         swsb = tgl_swsb_dst_dep(swsb, 1);
1849bf215546Sopenharmony_ci      }
1850bf215546Sopenharmony_ci
1851bf215546Sopenharmony_ci      if (unlikely(debug_flag))
1852bf215546Sopenharmony_ci         disasm_annotate(disasm_info, inst, p->next_insn_offset);
1853bf215546Sopenharmony_ci
1854bf215546Sopenharmony_ci      /* If the instruction writes to more than one register, it needs to be
1855bf215546Sopenharmony_ci       * explicitly marked as compressed on Gen <= 5.  On Gen >= 6 the
1856bf215546Sopenharmony_ci       * hardware figures out by itself what the right compression mode is,
1857bf215546Sopenharmony_ci       * but we still need to know whether the instruction is compressed to
1858bf215546Sopenharmony_ci       * set up the source register regions appropriately.
1859bf215546Sopenharmony_ci       *
1860bf215546Sopenharmony_ci       * XXX - This is wrong for instructions that write a single register but
1861bf215546Sopenharmony_ci       *       read more than one which should strictly speaking be treated as
1862bf215546Sopenharmony_ci       *       compressed.  For instructions that don't write any registers it
1863bf215546Sopenharmony_ci       *       relies on the destination being a null register of the correct
1864bf215546Sopenharmony_ci       *       type and regioning so the instruction is considered compressed
1865bf215546Sopenharmony_ci       *       or not accordingly.
1866bf215546Sopenharmony_ci       */
1867bf215546Sopenharmony_ci      const bool compressed =
1868bf215546Sopenharmony_ci           inst->dst.component_size(inst->exec_size) > REG_SIZE;
1869bf215546Sopenharmony_ci      brw_set_default_compression(p, compressed);
1870bf215546Sopenharmony_ci      brw_set_default_group(p, inst->group);
1871bf215546Sopenharmony_ci
1872bf215546Sopenharmony_ci      for (unsigned int i = 0; i < inst->sources; i++) {
1873bf215546Sopenharmony_ci         src[i] = brw_reg_from_fs_reg(devinfo, inst,
1874bf215546Sopenharmony_ci                                      &inst->src[i], compressed);
1875bf215546Sopenharmony_ci	 /* The accumulator result appears to get used for the
1876bf215546Sopenharmony_ci	  * conditional modifier generation.  When negating a UD
1877bf215546Sopenharmony_ci	  * value, there is a 33rd bit generated for the sign in the
1878bf215546Sopenharmony_ci	  * accumulator value, so now you can't check, for example,
1879bf215546Sopenharmony_ci	  * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
1880bf215546Sopenharmony_ci	  */
1881bf215546Sopenharmony_ci	 assert(!inst->conditional_mod ||
1882bf215546Sopenharmony_ci		inst->src[i].type != BRW_REGISTER_TYPE_UD ||
1883bf215546Sopenharmony_ci		!inst->src[i].negate);
1884bf215546Sopenharmony_ci      }
1885bf215546Sopenharmony_ci      dst = brw_reg_from_fs_reg(devinfo, inst,
1886bf215546Sopenharmony_ci                                &inst->dst, compressed);
1887bf215546Sopenharmony_ci
1888bf215546Sopenharmony_ci      brw_set_default_access_mode(p, BRW_ALIGN_1);
1889bf215546Sopenharmony_ci      brw_set_default_predicate_control(p, inst->predicate);
1890bf215546Sopenharmony_ci      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
1891bf215546Sopenharmony_ci      /* On gfx7 and above, hardware automatically adds the group onto the
1892bf215546Sopenharmony_ci       * flag subregister number.  On Sandy Bridge and older, we have to do it
1893bf215546Sopenharmony_ci       * ourselves.
1894bf215546Sopenharmony_ci       */
1895bf215546Sopenharmony_ci      const unsigned flag_subreg = inst->flag_subreg +
1896bf215546Sopenharmony_ci         (devinfo->ver >= 7 ? 0 : inst->group / 16);
1897bf215546Sopenharmony_ci      brw_set_default_flag_reg(p, flag_subreg / 2, flag_subreg % 2);
1898bf215546Sopenharmony_ci      brw_set_default_saturate(p, inst->saturate);
1899bf215546Sopenharmony_ci      brw_set_default_mask_control(p, inst->force_writemask_all);
1900bf215546Sopenharmony_ci      brw_set_default_acc_write_control(p, inst->writes_accumulator);
1901bf215546Sopenharmony_ci      brw_set_default_swsb(p, swsb);
1902bf215546Sopenharmony_ci
1903bf215546Sopenharmony_ci      unsigned exec_size = inst->exec_size;
1904bf215546Sopenharmony_ci      if (devinfo->verx10 == 70 &&
1905bf215546Sopenharmony_ci          (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8)) {
1906bf215546Sopenharmony_ci         exec_size *= 2;
1907bf215546Sopenharmony_ci      }
1908bf215546Sopenharmony_ci
1909bf215546Sopenharmony_ci      brw_set_default_exec_size(p, cvt(exec_size) - 1);
1910bf215546Sopenharmony_ci
1911bf215546Sopenharmony_ci      assert(inst->force_writemask_all || inst->exec_size >= 4);
1912bf215546Sopenharmony_ci      assert(inst->force_writemask_all || inst->group % inst->exec_size == 0);
1913bf215546Sopenharmony_ci      assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->ver));
1914bf215546Sopenharmony_ci      assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
1915bf215546Sopenharmony_ci
1916bf215546Sopenharmony_ci      switch (inst->opcode) {
1917bf215546Sopenharmony_ci      case BRW_OPCODE_SYNC:
1918bf215546Sopenharmony_ci         assert(src[0].file == BRW_IMMEDIATE_VALUE);
1919bf215546Sopenharmony_ci         brw_SYNC(p, tgl_sync_function(src[0].ud));
1920bf215546Sopenharmony_ci         break;
1921bf215546Sopenharmony_ci      case BRW_OPCODE_MOV:
1922bf215546Sopenharmony_ci	 brw_MOV(p, dst, src[0]);
1923bf215546Sopenharmony_ci	 break;
1924bf215546Sopenharmony_ci      case BRW_OPCODE_ADD:
1925bf215546Sopenharmony_ci	 brw_ADD(p, dst, src[0], src[1]);
1926bf215546Sopenharmony_ci	 break;
1927bf215546Sopenharmony_ci      case BRW_OPCODE_MUL:
1928bf215546Sopenharmony_ci	 brw_MUL(p, dst, src[0], src[1]);
1929bf215546Sopenharmony_ci	 break;
1930bf215546Sopenharmony_ci      case BRW_OPCODE_AVG:
1931bf215546Sopenharmony_ci	 brw_AVG(p, dst, src[0], src[1]);
1932bf215546Sopenharmony_ci	 break;
1933bf215546Sopenharmony_ci      case BRW_OPCODE_MACH:
1934bf215546Sopenharmony_ci	 brw_MACH(p, dst, src[0], src[1]);
1935bf215546Sopenharmony_ci	 break;
1936bf215546Sopenharmony_ci
1937bf215546Sopenharmony_ci      case BRW_OPCODE_DP4A:
1938bf215546Sopenharmony_ci         assert(devinfo->ver >= 12);
1939bf215546Sopenharmony_ci         brw_DP4A(p, dst, src[0], src[1], src[2]);
1940bf215546Sopenharmony_ci         break;
1941bf215546Sopenharmony_ci
1942bf215546Sopenharmony_ci      case BRW_OPCODE_LINE:
1943bf215546Sopenharmony_ci         brw_LINE(p, dst, src[0], src[1]);
1944bf215546Sopenharmony_ci         break;
1945bf215546Sopenharmony_ci
1946bf215546Sopenharmony_ci      case BRW_OPCODE_MAD:
1947bf215546Sopenharmony_ci         assert(devinfo->ver >= 6);
1948bf215546Sopenharmony_ci         if (devinfo->ver < 10)
1949bf215546Sopenharmony_ci            brw_set_default_access_mode(p, BRW_ALIGN_16);
1950bf215546Sopenharmony_ci         brw_MAD(p, dst, src[0], src[1], src[2]);
1951bf215546Sopenharmony_ci	 break;
1952bf215546Sopenharmony_ci
1953bf215546Sopenharmony_ci      case BRW_OPCODE_LRP:
1954bf215546Sopenharmony_ci         assert(devinfo->ver >= 6 && devinfo->ver <= 10);
1955bf215546Sopenharmony_ci         if (devinfo->ver < 10)
1956bf215546Sopenharmony_ci            brw_set_default_access_mode(p, BRW_ALIGN_16);
1957bf215546Sopenharmony_ci         brw_LRP(p, dst, src[0], src[1], src[2]);
1958bf215546Sopenharmony_ci	 break;
1959bf215546Sopenharmony_ci
1960bf215546Sopenharmony_ci      case BRW_OPCODE_ADD3:
1961bf215546Sopenharmony_ci         assert(devinfo->verx10 >= 125);
1962bf215546Sopenharmony_ci         brw_ADD3(p, dst, src[0], src[1], src[2]);
1963bf215546Sopenharmony_ci         break;
1964bf215546Sopenharmony_ci
1965bf215546Sopenharmony_ci      case BRW_OPCODE_FRC:
1966bf215546Sopenharmony_ci	 brw_FRC(p, dst, src[0]);
1967bf215546Sopenharmony_ci	 break;
1968bf215546Sopenharmony_ci      case BRW_OPCODE_RNDD:
1969bf215546Sopenharmony_ci	 brw_RNDD(p, dst, src[0]);
1970bf215546Sopenharmony_ci	 break;
1971bf215546Sopenharmony_ci      case BRW_OPCODE_RNDE:
1972bf215546Sopenharmony_ci	 brw_RNDE(p, dst, src[0]);
1973bf215546Sopenharmony_ci	 break;
1974bf215546Sopenharmony_ci      case BRW_OPCODE_RNDZ:
1975bf215546Sopenharmony_ci	 brw_RNDZ(p, dst, src[0]);
1976bf215546Sopenharmony_ci	 break;
1977bf215546Sopenharmony_ci
1978bf215546Sopenharmony_ci      case BRW_OPCODE_AND:
1979bf215546Sopenharmony_ci	 brw_AND(p, dst, src[0], src[1]);
1980bf215546Sopenharmony_ci	 break;
1981bf215546Sopenharmony_ci      case BRW_OPCODE_OR:
1982bf215546Sopenharmony_ci	 brw_OR(p, dst, src[0], src[1]);
1983bf215546Sopenharmony_ci	 break;
1984bf215546Sopenharmony_ci      case BRW_OPCODE_XOR:
1985bf215546Sopenharmony_ci	 brw_XOR(p, dst, src[0], src[1]);
1986bf215546Sopenharmony_ci	 break;
1987bf215546Sopenharmony_ci      case BRW_OPCODE_NOT:
1988bf215546Sopenharmony_ci	 brw_NOT(p, dst, src[0]);
1989bf215546Sopenharmony_ci	 break;
1990bf215546Sopenharmony_ci      case BRW_OPCODE_ASR:
1991bf215546Sopenharmony_ci	 brw_ASR(p, dst, src[0], src[1]);
1992bf215546Sopenharmony_ci	 break;
1993bf215546Sopenharmony_ci      case BRW_OPCODE_SHR:
1994bf215546Sopenharmony_ci	 brw_SHR(p, dst, src[0], src[1]);
1995bf215546Sopenharmony_ci	 break;
1996bf215546Sopenharmony_ci      case BRW_OPCODE_SHL:
1997bf215546Sopenharmony_ci	 brw_SHL(p, dst, src[0], src[1]);
1998bf215546Sopenharmony_ci	 break;
1999bf215546Sopenharmony_ci      case BRW_OPCODE_ROL:
2000bf215546Sopenharmony_ci	 assert(devinfo->ver >= 11);
2001bf215546Sopenharmony_ci	 assert(src[0].type == dst.type);
2002bf215546Sopenharmony_ci	 brw_ROL(p, dst, src[0], src[1]);
2003bf215546Sopenharmony_ci	 break;
2004bf215546Sopenharmony_ci      case BRW_OPCODE_ROR:
2005bf215546Sopenharmony_ci	 assert(devinfo->ver >= 11);
2006bf215546Sopenharmony_ci	 assert(src[0].type == dst.type);
2007bf215546Sopenharmony_ci	 brw_ROR(p, dst, src[0], src[1]);
2008bf215546Sopenharmony_ci	 break;
2009bf215546Sopenharmony_ci      case BRW_OPCODE_F32TO16:
2010bf215546Sopenharmony_ci         assert(devinfo->ver >= 7);
2011bf215546Sopenharmony_ci         brw_F32TO16(p, dst, src[0]);
2012bf215546Sopenharmony_ci         break;
2013bf215546Sopenharmony_ci      case BRW_OPCODE_F16TO32:
2014bf215546Sopenharmony_ci         assert(devinfo->ver >= 7);
2015bf215546Sopenharmony_ci         brw_F16TO32(p, dst, src[0]);
2016bf215546Sopenharmony_ci         break;
2017bf215546Sopenharmony_ci      case BRW_OPCODE_CMP:
2018bf215546Sopenharmony_ci         if (inst->exec_size >= 16 && devinfo->verx10 == 70 &&
2019bf215546Sopenharmony_ci             dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2020bf215546Sopenharmony_ci            /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
2021bf215546Sopenharmony_ci             * implemented in the compiler is not sufficient. Overriding the
2022bf215546Sopenharmony_ci             * type when the destination is the null register is necessary but
2023bf215546Sopenharmony_ci             * not sufficient by itself.
2024bf215546Sopenharmony_ci             */
2025bf215546Sopenharmony_ci            dst.type = BRW_REGISTER_TYPE_D;
2026bf215546Sopenharmony_ci         }
2027bf215546Sopenharmony_ci         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
2028bf215546Sopenharmony_ci	 break;
2029bf215546Sopenharmony_ci      case BRW_OPCODE_CMPN:
2030bf215546Sopenharmony_ci         if (inst->exec_size >= 16 && devinfo->verx10 == 70 &&
2031bf215546Sopenharmony_ci             dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2032bf215546Sopenharmony_ci            /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
2033bf215546Sopenharmony_ci             * implemented in the compiler is not sufficient. Overriding the
2034bf215546Sopenharmony_ci             * type when the destination is the null register is necessary but
2035bf215546Sopenharmony_ci             * not sufficient by itself.
2036bf215546Sopenharmony_ci             */
2037bf215546Sopenharmony_ci            dst.type = BRW_REGISTER_TYPE_D;
2038bf215546Sopenharmony_ci         }
2039bf215546Sopenharmony_ci         brw_CMPN(p, dst, inst->conditional_mod, src[0], src[1]);
2040bf215546Sopenharmony_ci         break;
2041bf215546Sopenharmony_ci      case BRW_OPCODE_SEL:
2042bf215546Sopenharmony_ci	 brw_SEL(p, dst, src[0], src[1]);
2043bf215546Sopenharmony_ci	 break;
2044bf215546Sopenharmony_ci      case BRW_OPCODE_CSEL:
2045bf215546Sopenharmony_ci         assert(devinfo->ver >= 8);
2046bf215546Sopenharmony_ci         if (devinfo->ver < 10)
2047bf215546Sopenharmony_ci            brw_set_default_access_mode(p, BRW_ALIGN_16);
2048bf215546Sopenharmony_ci         brw_CSEL(p, dst, src[0], src[1], src[2]);
2049bf215546Sopenharmony_ci         break;
2050bf215546Sopenharmony_ci      case BRW_OPCODE_BFREV:
2051bf215546Sopenharmony_ci         assert(devinfo->ver >= 7);
2052bf215546Sopenharmony_ci         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
2053bf215546Sopenharmony_ci                   retype(src[0], BRW_REGISTER_TYPE_UD));
2054bf215546Sopenharmony_ci         break;
2055bf215546Sopenharmony_ci      case BRW_OPCODE_FBH:
2056bf215546Sopenharmony_ci         assert(devinfo->ver >= 7);
2057bf215546Sopenharmony_ci         brw_FBH(p, retype(dst, src[0].type), src[0]);
2058bf215546Sopenharmony_ci         break;
2059bf215546Sopenharmony_ci      case BRW_OPCODE_FBL:
2060bf215546Sopenharmony_ci         assert(devinfo->ver >= 7);
2061bf215546Sopenharmony_ci         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD),
2062bf215546Sopenharmony_ci                 retype(src[0], BRW_REGISTER_TYPE_UD));
2063bf215546Sopenharmony_ci         break;
2064bf215546Sopenharmony_ci      case BRW_OPCODE_LZD:
2065bf215546Sopenharmony_ci         brw_LZD(p, dst, src[0]);
2066bf215546Sopenharmony_ci         break;
2067bf215546Sopenharmony_ci      case BRW_OPCODE_CBIT:
2068bf215546Sopenharmony_ci         assert(devinfo->ver >= 7);
2069bf215546Sopenharmony_ci         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD),
2070bf215546Sopenharmony_ci                  retype(src[0], BRW_REGISTER_TYPE_UD));
2071bf215546Sopenharmony_ci         break;
2072bf215546Sopenharmony_ci      case BRW_OPCODE_ADDC:
2073bf215546Sopenharmony_ci         assert(devinfo->ver >= 7);
2074bf215546Sopenharmony_ci         brw_ADDC(p, dst, src[0], src[1]);
2075bf215546Sopenharmony_ci         break;
2076bf215546Sopenharmony_ci      case BRW_OPCODE_SUBB:
2077bf215546Sopenharmony_ci         assert(devinfo->ver >= 7);
2078bf215546Sopenharmony_ci         brw_SUBB(p, dst, src[0], src[1]);
2079bf215546Sopenharmony_ci         break;
2080bf215546Sopenharmony_ci      case BRW_OPCODE_MAC:
2081bf215546Sopenharmony_ci         brw_MAC(p, dst, src[0], src[1]);
2082bf215546Sopenharmony_ci         break;
2083bf215546Sopenharmony_ci
2084bf215546Sopenharmony_ci      case BRW_OPCODE_BFE:
2085bf215546Sopenharmony_ci         assert(devinfo->ver >= 7);
2086bf215546Sopenharmony_ci         if (devinfo->ver < 10)
2087bf215546Sopenharmony_ci            brw_set_default_access_mode(p, BRW_ALIGN_16);
2088bf215546Sopenharmony_ci         brw_BFE(p, dst, src[0], src[1], src[2]);
2089bf215546Sopenharmony_ci         break;
2090bf215546Sopenharmony_ci
2091bf215546Sopenharmony_ci      case BRW_OPCODE_BFI1:
2092bf215546Sopenharmony_ci         assert(devinfo->ver >= 7);
2093bf215546Sopenharmony_ci         brw_BFI1(p, dst, src[0], src[1]);
2094bf215546Sopenharmony_ci         break;
2095bf215546Sopenharmony_ci      case BRW_OPCODE_BFI2:
2096bf215546Sopenharmony_ci         assert(devinfo->ver >= 7);
2097bf215546Sopenharmony_ci         if (devinfo->ver < 10)
2098bf215546Sopenharmony_ci            brw_set_default_access_mode(p, BRW_ALIGN_16);
2099bf215546Sopenharmony_ci         brw_BFI2(p, dst, src[0], src[1], src[2]);
2100bf215546Sopenharmony_ci         break;
2101bf215546Sopenharmony_ci
2102bf215546Sopenharmony_ci      case BRW_OPCODE_IF:
2103bf215546Sopenharmony_ci	 if (inst->src[0].file != BAD_FILE) {
2104bf215546Sopenharmony_ci	    /* The instruction has an embedded compare (only allowed on gfx6) */
2105bf215546Sopenharmony_ci	    assert(devinfo->ver == 6);
2106bf215546Sopenharmony_ci	    gfx6_IF(p, inst->conditional_mod, src[0], src[1]);
2107bf215546Sopenharmony_ci	 } else {
2108bf215546Sopenharmony_ci	    brw_IF(p, brw_get_default_exec_size(p));
2109bf215546Sopenharmony_ci	 }
2110bf215546Sopenharmony_ci	 break;
2111bf215546Sopenharmony_ci
2112bf215546Sopenharmony_ci      case BRW_OPCODE_ELSE:
2113bf215546Sopenharmony_ci	 brw_ELSE(p);
2114bf215546Sopenharmony_ci	 break;
2115bf215546Sopenharmony_ci      case BRW_OPCODE_ENDIF:
2116bf215546Sopenharmony_ci	 brw_ENDIF(p);
2117bf215546Sopenharmony_ci	 break;
2118bf215546Sopenharmony_ci
2119bf215546Sopenharmony_ci      case BRW_OPCODE_DO:
2120bf215546Sopenharmony_ci	 brw_DO(p, brw_get_default_exec_size(p));
2121bf215546Sopenharmony_ci	 break;
2122bf215546Sopenharmony_ci
2123bf215546Sopenharmony_ci      case BRW_OPCODE_BREAK:
2124bf215546Sopenharmony_ci	 brw_BREAK(p);
2125bf215546Sopenharmony_ci	 break;
2126bf215546Sopenharmony_ci      case BRW_OPCODE_CONTINUE:
2127bf215546Sopenharmony_ci         brw_CONT(p);
2128bf215546Sopenharmony_ci	 break;
2129bf215546Sopenharmony_ci
2130bf215546Sopenharmony_ci      case BRW_OPCODE_WHILE:
2131bf215546Sopenharmony_ci	 brw_WHILE(p);
2132bf215546Sopenharmony_ci         loop_count++;
2133bf215546Sopenharmony_ci	 break;
2134bf215546Sopenharmony_ci
2135bf215546Sopenharmony_ci      case SHADER_OPCODE_RCP:
2136bf215546Sopenharmony_ci      case SHADER_OPCODE_RSQ:
2137bf215546Sopenharmony_ci      case SHADER_OPCODE_SQRT:
2138bf215546Sopenharmony_ci      case SHADER_OPCODE_EXP2:
2139bf215546Sopenharmony_ci      case SHADER_OPCODE_LOG2:
2140bf215546Sopenharmony_ci      case SHADER_OPCODE_SIN:
2141bf215546Sopenharmony_ci      case SHADER_OPCODE_COS:
2142bf215546Sopenharmony_ci         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
2143bf215546Sopenharmony_ci	 if (devinfo->ver >= 6) {
2144bf215546Sopenharmony_ci            assert(inst->mlen == 0);
2145bf215546Sopenharmony_ci            assert(devinfo->ver >= 7 || inst->exec_size == 8);
2146bf215546Sopenharmony_ci            gfx6_math(p, dst, brw_math_function(inst->opcode),
2147bf215546Sopenharmony_ci                      src[0], brw_null_reg());
2148bf215546Sopenharmony_ci	 } else {
2149bf215546Sopenharmony_ci            assert(inst->mlen >= 1);
2150bf215546Sopenharmony_ci            assert(devinfo->ver == 5 || devinfo->platform == INTEL_PLATFORM_G4X || inst->exec_size == 8);
2151bf215546Sopenharmony_ci            gfx4_math(p, dst,
2152bf215546Sopenharmony_ci                      brw_math_function(inst->opcode),
2153bf215546Sopenharmony_ci                      inst->base_mrf, src[0],
2154bf215546Sopenharmony_ci                      BRW_MATH_PRECISION_FULL);
2155bf215546Sopenharmony_ci            send_count++;
2156bf215546Sopenharmony_ci	 }
2157bf215546Sopenharmony_ci	 break;
2158bf215546Sopenharmony_ci      case SHADER_OPCODE_INT_QUOTIENT:
2159bf215546Sopenharmony_ci      case SHADER_OPCODE_INT_REMAINDER:
2160bf215546Sopenharmony_ci      case SHADER_OPCODE_POW:
2161bf215546Sopenharmony_ci         assert(devinfo->verx10 < 125);
2162bf215546Sopenharmony_ci         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
2163bf215546Sopenharmony_ci         if (devinfo->ver >= 6) {
2164bf215546Sopenharmony_ci            assert(inst->mlen == 0);
2165bf215546Sopenharmony_ci            assert((devinfo->ver >= 7 && inst->opcode == SHADER_OPCODE_POW) ||
2166bf215546Sopenharmony_ci                   inst->exec_size == 8);
2167bf215546Sopenharmony_ci            gfx6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
2168bf215546Sopenharmony_ci         } else {
2169bf215546Sopenharmony_ci            assert(inst->mlen >= 1);
2170bf215546Sopenharmony_ci            assert(inst->exec_size == 8);
2171bf215546Sopenharmony_ci            gfx4_math(p, dst, brw_math_function(inst->opcode),
2172bf215546Sopenharmony_ci                      inst->base_mrf, src[0],
2173bf215546Sopenharmony_ci                      BRW_MATH_PRECISION_FULL);
2174bf215546Sopenharmony_ci            send_count++;
2175bf215546Sopenharmony_ci	 }
2176bf215546Sopenharmony_ci	 break;
2177bf215546Sopenharmony_ci      case FS_OPCODE_LINTERP:
2178bf215546Sopenharmony_ci	 multiple_instructions_emitted = generate_linterp(inst, dst, src);
2179bf215546Sopenharmony_ci	 break;
2180bf215546Sopenharmony_ci      case FS_OPCODE_PIXEL_X:
2181bf215546Sopenharmony_ci         assert(src[0].type == BRW_REGISTER_TYPE_UW);
2182bf215546Sopenharmony_ci         assert(src[1].type == BRW_REGISTER_TYPE_UW);
2183bf215546Sopenharmony_ci         src[0].subnr = 0 * type_sz(src[0].type);
2184bf215546Sopenharmony_ci         if (src[1].file == BRW_IMMEDIATE_VALUE) {
2185bf215546Sopenharmony_ci            assert(src[1].ud == 0);
2186bf215546Sopenharmony_ci            brw_MOV(p, dst, stride(src[0], 8, 4, 1));
2187bf215546Sopenharmony_ci         } else {
2188bf215546Sopenharmony_ci            /* Coarse pixel case */
2189bf215546Sopenharmony_ci            brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]);
2190bf215546Sopenharmony_ci         }
2191bf215546Sopenharmony_ci         break;
2192bf215546Sopenharmony_ci      case FS_OPCODE_PIXEL_Y:
2193bf215546Sopenharmony_ci         assert(src[0].type == BRW_REGISTER_TYPE_UW);
2194bf215546Sopenharmony_ci         assert(src[1].type == BRW_REGISTER_TYPE_UW);
2195bf215546Sopenharmony_ci         src[0].subnr = 4 * type_sz(src[0].type);
2196bf215546Sopenharmony_ci         if (src[1].file == BRW_IMMEDIATE_VALUE) {
2197bf215546Sopenharmony_ci            assert(src[1].ud == 0);
2198bf215546Sopenharmony_ci            brw_MOV(p, dst, stride(src[0], 8, 4, 1));
2199bf215546Sopenharmony_ci         } else {
2200bf215546Sopenharmony_ci            /* Coarse pixel case */
2201bf215546Sopenharmony_ci            brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]);
2202bf215546Sopenharmony_ci         }
2203bf215546Sopenharmony_ci         break;
2204bf215546Sopenharmony_ci
2205bf215546Sopenharmony_ci      case SHADER_OPCODE_SEND:
2206bf215546Sopenharmony_ci         generate_send(inst, dst, src[0], src[1], src[2],
2207bf215546Sopenharmony_ci                       inst->ex_mlen > 0 ? src[3] : brw_null_reg());
2208bf215546Sopenharmony_ci         send_count++;
2209bf215546Sopenharmony_ci         break;
2210bf215546Sopenharmony_ci
2211bf215546Sopenharmony_ci      case SHADER_OPCODE_GET_BUFFER_SIZE:
2212bf215546Sopenharmony_ci         generate_get_buffer_size(inst, dst, src[0], src[1]);
2213bf215546Sopenharmony_ci         send_count++;
2214bf215546Sopenharmony_ci         break;
2215bf215546Sopenharmony_ci      case SHADER_OPCODE_TEX:
2216bf215546Sopenharmony_ci      case FS_OPCODE_TXB:
2217bf215546Sopenharmony_ci      case SHADER_OPCODE_TXD:
2218bf215546Sopenharmony_ci      case SHADER_OPCODE_TXF:
2219bf215546Sopenharmony_ci      case SHADER_OPCODE_TXF_CMS:
2220bf215546Sopenharmony_ci      case SHADER_OPCODE_TXL:
2221bf215546Sopenharmony_ci      case SHADER_OPCODE_TXS:
2222bf215546Sopenharmony_ci      case SHADER_OPCODE_LOD:
2223bf215546Sopenharmony_ci      case SHADER_OPCODE_TG4:
2224bf215546Sopenharmony_ci      case SHADER_OPCODE_SAMPLEINFO:
2225bf215546Sopenharmony_ci         assert(inst->src[0].file == BAD_FILE);
2226bf215546Sopenharmony_ci         generate_tex(inst, dst, src[1], src[2]);
2227bf215546Sopenharmony_ci         send_count++;
2228bf215546Sopenharmony_ci         break;
2229bf215546Sopenharmony_ci
2230bf215546Sopenharmony_ci      case FS_OPCODE_DDX_COARSE:
2231bf215546Sopenharmony_ci      case FS_OPCODE_DDX_FINE:
2232bf215546Sopenharmony_ci         generate_ddx(inst, dst, src[0]);
2233bf215546Sopenharmony_ci         break;
2234bf215546Sopenharmony_ci      case FS_OPCODE_DDY_COARSE:
2235bf215546Sopenharmony_ci      case FS_OPCODE_DDY_FINE:
2236bf215546Sopenharmony_ci         generate_ddy(inst, dst, src[0]);
2237bf215546Sopenharmony_ci	 break;
2238bf215546Sopenharmony_ci
2239bf215546Sopenharmony_ci      case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
2240bf215546Sopenharmony_ci	 generate_scratch_write(inst, src[0]);
2241bf215546Sopenharmony_ci         send_count++;
2242bf215546Sopenharmony_ci	 break;
2243bf215546Sopenharmony_ci
2244bf215546Sopenharmony_ci      case SHADER_OPCODE_GFX4_SCRATCH_READ:
2245bf215546Sopenharmony_ci	 generate_scratch_read(inst, dst);
2246bf215546Sopenharmony_ci         send_count++;
2247bf215546Sopenharmony_ci	 break;
2248bf215546Sopenharmony_ci
2249bf215546Sopenharmony_ci      case SHADER_OPCODE_GFX7_SCRATCH_READ:
2250bf215546Sopenharmony_ci	 generate_scratch_read_gfx7(inst, dst);
2251bf215546Sopenharmony_ci         send_count++;
2252bf215546Sopenharmony_ci	 break;
2253bf215546Sopenharmony_ci
2254bf215546Sopenharmony_ci      case SHADER_OPCODE_SCRATCH_HEADER:
2255bf215546Sopenharmony_ci         generate_scratch_header(inst, dst);
2256bf215546Sopenharmony_ci         break;
2257bf215546Sopenharmony_ci
2258bf215546Sopenharmony_ci      case SHADER_OPCODE_MOV_INDIRECT:
2259bf215546Sopenharmony_ci         generate_mov_indirect(inst, dst, src[0], src[1]);
2260bf215546Sopenharmony_ci         break;
2261bf215546Sopenharmony_ci
2262bf215546Sopenharmony_ci      case SHADER_OPCODE_MOV_RELOC_IMM:
2263bf215546Sopenharmony_ci         assert(src[0].file == BRW_IMMEDIATE_VALUE);
2264bf215546Sopenharmony_ci         brw_MOV_reloc_imm(p, dst, dst.type, src[0].ud);
2265bf215546Sopenharmony_ci         break;
2266bf215546Sopenharmony_ci
2267bf215546Sopenharmony_ci      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
2268bf215546Sopenharmony_ci         assert(inst->force_writemask_all);
2269bf215546Sopenharmony_ci	 generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
2270bf215546Sopenharmony_ci         send_count++;
2271bf215546Sopenharmony_ci	 break;
2272bf215546Sopenharmony_ci
2273bf215546Sopenharmony_ci      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
2274bf215546Sopenharmony_ci         assert(inst->force_writemask_all);
2275bf215546Sopenharmony_ci	 generate_uniform_pull_constant_load_gfx7(inst, dst, src[0], src[1]);
2276bf215546Sopenharmony_ci         send_count++;
2277bf215546Sopenharmony_ci	 break;
2278bf215546Sopenharmony_ci
2279bf215546Sopenharmony_ci      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
2280bf215546Sopenharmony_ci	 generate_varying_pull_constant_load_gfx4(inst, dst, src[0]);
2281bf215546Sopenharmony_ci         send_count++;
2282bf215546Sopenharmony_ci	 break;
2283bf215546Sopenharmony_ci
2284bf215546Sopenharmony_ci      case FS_OPCODE_REP_FB_WRITE:
2285bf215546Sopenharmony_ci      case FS_OPCODE_FB_WRITE:
2286bf215546Sopenharmony_ci	 generate_fb_write(inst, src[0]);
2287bf215546Sopenharmony_ci         send_count++;
2288bf215546Sopenharmony_ci	 break;
2289bf215546Sopenharmony_ci
2290bf215546Sopenharmony_ci      case FS_OPCODE_FB_READ:
2291bf215546Sopenharmony_ci         generate_fb_read(inst, dst, src[0]);
2292bf215546Sopenharmony_ci         send_count++;
2293bf215546Sopenharmony_ci         break;
2294bf215546Sopenharmony_ci
2295bf215546Sopenharmony_ci      case BRW_OPCODE_HALT:
2296bf215546Sopenharmony_ci         generate_halt(inst);
2297bf215546Sopenharmony_ci         break;
2298bf215546Sopenharmony_ci
2299bf215546Sopenharmony_ci      case SHADER_OPCODE_INTERLOCK:
2300bf215546Sopenharmony_ci      case SHADER_OPCODE_MEMORY_FENCE: {
2301bf215546Sopenharmony_ci         assert(src[1].file == BRW_IMMEDIATE_VALUE);
2302bf215546Sopenharmony_ci         assert(src[2].file == BRW_IMMEDIATE_VALUE);
2303bf215546Sopenharmony_ci
2304bf215546Sopenharmony_ci         const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ?
2305bf215546Sopenharmony_ci            BRW_OPCODE_SENDC : BRW_OPCODE_SEND;
2306bf215546Sopenharmony_ci
2307bf215546Sopenharmony_ci         brw_memory_fence(p, dst, src[0], send_op,
2308bf215546Sopenharmony_ci                          brw_message_target(inst->sfid),
2309bf215546Sopenharmony_ci                          inst->desc,
2310bf215546Sopenharmony_ci                          /* commit_enable */ src[1].ud,
2311bf215546Sopenharmony_ci                          /* bti */ src[2].ud);
2312bf215546Sopenharmony_ci         send_count++;
2313bf215546Sopenharmony_ci         break;
2314bf215546Sopenharmony_ci      }
2315bf215546Sopenharmony_ci
2316bf215546Sopenharmony_ci      case FS_OPCODE_SCHEDULING_FENCE:
2317bf215546Sopenharmony_ci         if (inst->sources == 0 && swsb.regdist == 0 &&
2318bf215546Sopenharmony_ci                                   swsb.mode == TGL_SBID_NULL) {
2319bf215546Sopenharmony_ci            if (unlikely(debug_flag))
2320bf215546Sopenharmony_ci               disasm_info->use_tail = true;
2321bf215546Sopenharmony_ci            break;
2322bf215546Sopenharmony_ci         }
2323bf215546Sopenharmony_ci
2324bf215546Sopenharmony_ci         if (devinfo->ver >= 12) {
2325bf215546Sopenharmony_ci            /* Use the available SWSB information to stall.  A single SYNC is
2326bf215546Sopenharmony_ci             * sufficient since if there were multiple dependencies, the
2327bf215546Sopenharmony_ci             * scoreboard algorithm already injected other SYNCs before this
2328bf215546Sopenharmony_ci             * instruction.
2329bf215546Sopenharmony_ci             */
2330bf215546Sopenharmony_ci            brw_SYNC(p, TGL_SYNC_NOP);
2331bf215546Sopenharmony_ci         } else {
2332bf215546Sopenharmony_ci            for (unsigned i = 0; i < inst->sources; i++) {
2333bf215546Sopenharmony_ci               /* Emit a MOV to force a stall until the instruction producing the
2334bf215546Sopenharmony_ci                * registers finishes.
2335bf215546Sopenharmony_ci                */
2336bf215546Sopenharmony_ci               brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
2337bf215546Sopenharmony_ci                       retype(src[i], BRW_REGISTER_TYPE_UW));
2338bf215546Sopenharmony_ci            }
2339bf215546Sopenharmony_ci
2340bf215546Sopenharmony_ci            if (inst->sources > 1)
2341bf215546Sopenharmony_ci               multiple_instructions_emitted = true;
2342bf215546Sopenharmony_ci         }
2343bf215546Sopenharmony_ci
2344bf215546Sopenharmony_ci         break;
2345bf215546Sopenharmony_ci
2346bf215546Sopenharmony_ci      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2347bf215546Sopenharmony_ci         brw_find_live_channel(p, dst, false);
2348bf215546Sopenharmony_ci         break;
2349bf215546Sopenharmony_ci      case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
2350bf215546Sopenharmony_ci         brw_find_live_channel(p, dst, true);
2351bf215546Sopenharmony_ci         break;
2352bf215546Sopenharmony_ci
2353bf215546Sopenharmony_ci      case FS_OPCODE_LOAD_LIVE_CHANNELS: {
2354bf215546Sopenharmony_ci         assert(devinfo->ver >= 8);
2355bf215546Sopenharmony_ci         assert(inst->force_writemask_all && inst->group == 0);
2356bf215546Sopenharmony_ci         assert(inst->dst.file == BAD_FILE);
2357bf215546Sopenharmony_ci         brw_set_default_exec_size(p, BRW_EXECUTE_1);
2358bf215546Sopenharmony_ci         brw_MOV(p, retype(brw_flag_subreg(inst->flag_subreg),
2359bf215546Sopenharmony_ci                           BRW_REGISTER_TYPE_UD),
2360bf215546Sopenharmony_ci                 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));
2361bf215546Sopenharmony_ci         break;
2362bf215546Sopenharmony_ci      }
2363bf215546Sopenharmony_ci      case SHADER_OPCODE_BROADCAST:
2364bf215546Sopenharmony_ci         assert(inst->force_writemask_all);
2365bf215546Sopenharmony_ci         brw_broadcast(p, dst, src[0], src[1]);
2366bf215546Sopenharmony_ci         break;
2367bf215546Sopenharmony_ci
2368bf215546Sopenharmony_ci      case SHADER_OPCODE_SHUFFLE:
2369bf215546Sopenharmony_ci         generate_shuffle(inst, dst, src[0], src[1]);
2370bf215546Sopenharmony_ci         break;
2371bf215546Sopenharmony_ci
2372bf215546Sopenharmony_ci      case SHADER_OPCODE_SEL_EXEC:
2373bf215546Sopenharmony_ci         assert(inst->force_writemask_all);
2374bf215546Sopenharmony_ci         assert(devinfo->has_64bit_float || type_sz(dst.type) <= 4);
2375bf215546Sopenharmony_ci         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2376bf215546Sopenharmony_ci         brw_MOV(p, dst, src[1]);
2377bf215546Sopenharmony_ci         brw_set_default_mask_control(p, BRW_MASK_ENABLE);
2378bf215546Sopenharmony_ci         brw_set_default_swsb(p, tgl_swsb_null());
2379bf215546Sopenharmony_ci         brw_MOV(p, dst, src[0]);
2380bf215546Sopenharmony_ci         break;
2381bf215546Sopenharmony_ci
2382bf215546Sopenharmony_ci      case SHADER_OPCODE_QUAD_SWIZZLE:
2383bf215546Sopenharmony_ci         assert(src[1].file == BRW_IMMEDIATE_VALUE);
2384bf215546Sopenharmony_ci         assert(src[1].type == BRW_REGISTER_TYPE_UD);
2385bf215546Sopenharmony_ci         generate_quad_swizzle(inst, dst, src[0], src[1].ud);
2386bf215546Sopenharmony_ci         break;
2387bf215546Sopenharmony_ci
2388bf215546Sopenharmony_ci      case SHADER_OPCODE_CLUSTER_BROADCAST: {
2389bf215546Sopenharmony_ci         assert((devinfo->platform != INTEL_PLATFORM_CHV &&
2390bf215546Sopenharmony_ci                 !intel_device_info_is_9lp(devinfo) &&
2391bf215546Sopenharmony_ci                 devinfo->has_64bit_float) || type_sz(src[0].type) <= 4);
2392bf215546Sopenharmony_ci         assert(!src[0].negate && !src[0].abs);
2393bf215546Sopenharmony_ci         assert(src[1].file == BRW_IMMEDIATE_VALUE);
2394bf215546Sopenharmony_ci         assert(src[1].type == BRW_REGISTER_TYPE_UD);
2395bf215546Sopenharmony_ci         assert(src[2].file == BRW_IMMEDIATE_VALUE);
2396bf215546Sopenharmony_ci         assert(src[2].type == BRW_REGISTER_TYPE_UD);
2397bf215546Sopenharmony_ci         const unsigned component = src[1].ud;
2398bf215546Sopenharmony_ci         const unsigned cluster_size = src[2].ud;
2399bf215546Sopenharmony_ci         assert(inst->src[0].file != ARF && inst->src[0].file != FIXED_GRF);
2400bf215546Sopenharmony_ci         const unsigned s = inst->src[0].stride;
2401bf215546Sopenharmony_ci         unsigned vstride = cluster_size * s;
2402bf215546Sopenharmony_ci         unsigned width = cluster_size;
2403bf215546Sopenharmony_ci
2404bf215546Sopenharmony_ci         /* The maximum exec_size is 32, but the maximum width is only 16. */
2405bf215546Sopenharmony_ci         if (inst->exec_size == width) {
2406bf215546Sopenharmony_ci            vstride = 0;
2407bf215546Sopenharmony_ci            width = 1;
2408bf215546Sopenharmony_ci         }
2409bf215546Sopenharmony_ci
2410bf215546Sopenharmony_ci         struct brw_reg strided = stride(suboffset(src[0], component * s),
2411bf215546Sopenharmony_ci                                         vstride, width, 0);
2412bf215546Sopenharmony_ci         brw_MOV(p, dst, strided);
2413bf215546Sopenharmony_ci         break;
2414bf215546Sopenharmony_ci      }
2415bf215546Sopenharmony_ci
2416bf215546Sopenharmony_ci      case FS_OPCODE_SET_SAMPLE_ID:
2417bf215546Sopenharmony_ci         generate_set_sample_id(inst, dst, src[0], src[1]);
2418bf215546Sopenharmony_ci         break;
2419bf215546Sopenharmony_ci
2420bf215546Sopenharmony_ci      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
2421bf215546Sopenharmony_ci          generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
2422bf215546Sopenharmony_ci          break;
2423bf215546Sopenharmony_ci
2424bf215546Sopenharmony_ci      case SHADER_OPCODE_HALT_TARGET:
2425bf215546Sopenharmony_ci         /* This is the place where the final HALT needs to be inserted if
2426bf215546Sopenharmony_ci          * we've emitted any discards.  If not, this will emit no code.
2427bf215546Sopenharmony_ci          */
2428bf215546Sopenharmony_ci         if (!patch_halt_jumps()) {
2429bf215546Sopenharmony_ci            if (unlikely(debug_flag)) {
2430bf215546Sopenharmony_ci               disasm_info->use_tail = true;
2431bf215546Sopenharmony_ci            }
2432bf215546Sopenharmony_ci         }
2433bf215546Sopenharmony_ci         break;
2434bf215546Sopenharmony_ci
2435bf215546Sopenharmony_ci      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
2436bf215546Sopenharmony_ci         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2437bf215546Sopenharmony_ci                                           GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
2438bf215546Sopenharmony_ci         send_count++;
2439bf215546Sopenharmony_ci         break;
2440bf215546Sopenharmony_ci
2441bf215546Sopenharmony_ci      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
2442bf215546Sopenharmony_ci         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2443bf215546Sopenharmony_ci                                           GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
2444bf215546Sopenharmony_ci         send_count++;
2445bf215546Sopenharmony_ci         break;
2446bf215546Sopenharmony_ci
2447bf215546Sopenharmony_ci      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
2448bf215546Sopenharmony_ci         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2449bf215546Sopenharmony_ci                                           GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
2450bf215546Sopenharmony_ci         send_count++;
2451bf215546Sopenharmony_ci         break;
2452bf215546Sopenharmony_ci
2453bf215546Sopenharmony_ci      case CS_OPCODE_CS_TERMINATE:
2454bf215546Sopenharmony_ci         generate_cs_terminate(inst, src[0]);
2455bf215546Sopenharmony_ci         send_count++;
2456bf215546Sopenharmony_ci         break;
2457bf215546Sopenharmony_ci
2458bf215546Sopenharmony_ci      case SHADER_OPCODE_BARRIER:
2459bf215546Sopenharmony_ci	 generate_barrier(inst, src[0]);
2460bf215546Sopenharmony_ci         send_count++;
2461bf215546Sopenharmony_ci	 break;
2462bf215546Sopenharmony_ci
2463bf215546Sopenharmony_ci      case BRW_OPCODE_DIM:
2464bf215546Sopenharmony_ci         assert(devinfo->platform == INTEL_PLATFORM_HSW);
2465bf215546Sopenharmony_ci         assert(src[0].type == BRW_REGISTER_TYPE_DF);
2466bf215546Sopenharmony_ci         assert(dst.type == BRW_REGISTER_TYPE_DF);
2467bf215546Sopenharmony_ci         brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
2468bf215546Sopenharmony_ci         break;
2469bf215546Sopenharmony_ci
2470bf215546Sopenharmony_ci      case SHADER_OPCODE_RND_MODE: {
2471bf215546Sopenharmony_ci         assert(src[0].file == BRW_IMMEDIATE_VALUE);
2472bf215546Sopenharmony_ci         /*
2473bf215546Sopenharmony_ci          * Changes the floating point rounding mode updating the control
2474bf215546Sopenharmony_ci          * register field defined at cr0.0[5-6] bits.
2475bf215546Sopenharmony_ci          */
2476bf215546Sopenharmony_ci         enum brw_rnd_mode mode =
2477bf215546Sopenharmony_ci            (enum brw_rnd_mode) (src[0].d << BRW_CR0_RND_MODE_SHIFT);
2478bf215546Sopenharmony_ci         brw_float_controls_mode(p, mode, BRW_CR0_RND_MODE_MASK);
2479bf215546Sopenharmony_ci      }
2480bf215546Sopenharmony_ci         break;
2481bf215546Sopenharmony_ci
2482bf215546Sopenharmony_ci      case SHADER_OPCODE_FLOAT_CONTROL_MODE:
2483bf215546Sopenharmony_ci         assert(src[0].file == BRW_IMMEDIATE_VALUE);
2484bf215546Sopenharmony_ci         assert(src[1].file == BRW_IMMEDIATE_VALUE);
2485bf215546Sopenharmony_ci         brw_float_controls_mode(p, src[0].d, src[1].d);
2486bf215546Sopenharmony_ci         break;
2487bf215546Sopenharmony_ci
2488bf215546Sopenharmony_ci      case SHADER_OPCODE_READ_SR_REG:
2489bf215546Sopenharmony_ci         if (devinfo->ver >= 12) {
2490bf215546Sopenharmony_ci            /* There is a SWSB restriction that requires that any time sr0 is
2491bf215546Sopenharmony_ci             * accessed both the instruction doing the access and the next one
2492bf215546Sopenharmony_ci             * have SWSB set to RegDist(1).
2493bf215546Sopenharmony_ci             */
2494bf215546Sopenharmony_ci            if (brw_get_default_swsb(p).mode != TGL_SBID_NULL)
2495bf215546Sopenharmony_ci               brw_SYNC(p, TGL_SYNC_NOP);
2496bf215546Sopenharmony_ci            assert(src[0].file == BRW_IMMEDIATE_VALUE);
2497bf215546Sopenharmony_ci            brw_set_default_swsb(p, tgl_swsb_regdist(1));
2498bf215546Sopenharmony_ci            brw_MOV(p, dst, brw_sr0_reg(src[0].ud));
2499bf215546Sopenharmony_ci            brw_set_default_swsb(p, tgl_swsb_regdist(1));
2500bf215546Sopenharmony_ci            brw_AND(p, dst, dst, brw_imm_ud(0xffffffff));
2501bf215546Sopenharmony_ci         } else {
2502bf215546Sopenharmony_ci            brw_MOV(p, dst, brw_sr0_reg(src[0].ud));
2503bf215546Sopenharmony_ci         }
2504bf215546Sopenharmony_ci         break;
2505bf215546Sopenharmony_ci
2506bf215546Sopenharmony_ci      default:
2507bf215546Sopenharmony_ci         unreachable("Unsupported opcode");
2508bf215546Sopenharmony_ci
2509bf215546Sopenharmony_ci      case SHADER_OPCODE_LOAD_PAYLOAD:
2510bf215546Sopenharmony_ci         unreachable("Should be lowered by lower_load_payload()");
2511bf215546Sopenharmony_ci      }
2512bf215546Sopenharmony_ci
2513bf215546Sopenharmony_ci      if (multiple_instructions_emitted)
2514bf215546Sopenharmony_ci         continue;
2515bf215546Sopenharmony_ci
2516bf215546Sopenharmony_ci      if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
2517bf215546Sopenharmony_ci         assert(p->next_insn_offset == last_insn_offset + 16 ||
2518bf215546Sopenharmony_ci                !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
2519bf215546Sopenharmony_ci                 "emitting more than 1 instruction");
2520bf215546Sopenharmony_ci
2521bf215546Sopenharmony_ci         brw_inst *last = &p->store[last_insn_offset / 16];
2522bf215546Sopenharmony_ci
2523bf215546Sopenharmony_ci         if (inst->conditional_mod)
2524bf215546Sopenharmony_ci            brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
2525bf215546Sopenharmony_ci         if (devinfo->ver < 12) {
2526bf215546Sopenharmony_ci            brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
2527bf215546Sopenharmony_ci            brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
2528bf215546Sopenharmony_ci         }
2529bf215546Sopenharmony_ci      }
2530bf215546Sopenharmony_ci   }
2531bf215546Sopenharmony_ci
2532bf215546Sopenharmony_ci   brw_set_uip_jip(p, start_offset);
2533bf215546Sopenharmony_ci
2534bf215546Sopenharmony_ci   /* end of program sentinel */
2535bf215546Sopenharmony_ci   disasm_new_inst_group(disasm_info, p->next_insn_offset);
2536bf215546Sopenharmony_ci
2537bf215546Sopenharmony_ci   /* `send_count` explicitly does not include spills or fills, as we'd
2538bf215546Sopenharmony_ci    * like to use it as a metric for intentional memory access or other
2539bf215546Sopenharmony_ci    * shared function use.  Otherwise, subtle changes to scheduling or
2540bf215546Sopenharmony_ci    * register allocation could cause it to fluctuate wildly - and that
2541bf215546Sopenharmony_ci    * effect is already counted in spill/fill counts.
2542bf215546Sopenharmony_ci    */
2543bf215546Sopenharmony_ci   send_count -= shader_stats.spill_count;
2544bf215546Sopenharmony_ci   send_count -= shader_stats.fill_count;
2545bf215546Sopenharmony_ci
2546bf215546Sopenharmony_ci#ifndef NDEBUG
2547bf215546Sopenharmony_ci   bool validated =
2548bf215546Sopenharmony_ci#else
2549bf215546Sopenharmony_ci   if (unlikely(debug_flag))
2550bf215546Sopenharmony_ci#endif
2551bf215546Sopenharmony_ci      brw_validate_instructions(&compiler->isa, p->store,
2552bf215546Sopenharmony_ci                                start_offset,
2553bf215546Sopenharmony_ci                                p->next_insn_offset,
2554bf215546Sopenharmony_ci                                disasm_info);
2555bf215546Sopenharmony_ci
2556bf215546Sopenharmony_ci   int before_size = p->next_insn_offset - start_offset;
2557bf215546Sopenharmony_ci   brw_compact_instructions(p, start_offset, disasm_info);
2558bf215546Sopenharmony_ci   int after_size = p->next_insn_offset - start_offset;
2559bf215546Sopenharmony_ci
2560bf215546Sopenharmony_ci   if (unlikely(debug_flag)) {
2561bf215546Sopenharmony_ci      unsigned char sha1[21];
2562bf215546Sopenharmony_ci      char sha1buf[41];
2563bf215546Sopenharmony_ci
2564bf215546Sopenharmony_ci      _mesa_sha1_compute(p->store + start_offset / sizeof(brw_inst),
2565bf215546Sopenharmony_ci                         after_size, sha1);
2566bf215546Sopenharmony_ci      _mesa_sha1_format(sha1buf, sha1);
2567bf215546Sopenharmony_ci
2568bf215546Sopenharmony_ci      fprintf(stderr, "Native code for %s (sha1 %s)\n"
2569bf215546Sopenharmony_ci              "SIMD%d shader: %d instructions. %d loops. %u cycles. "
2570bf215546Sopenharmony_ci              "%d:%d spills:fills, %u sends, "
2571bf215546Sopenharmony_ci              "scheduled with mode %s. "
2572bf215546Sopenharmony_ci              "Promoted %u constants. "
2573bf215546Sopenharmony_ci              "Compacted %d to %d bytes (%.0f%%)\n",
2574bf215546Sopenharmony_ci              shader_name, sha1buf,
2575bf215546Sopenharmony_ci              dispatch_width, before_size / 16,
2576bf215546Sopenharmony_ci              loop_count, perf.latency,
2577bf215546Sopenharmony_ci              shader_stats.spill_count,
2578bf215546Sopenharmony_ci              shader_stats.fill_count,
2579bf215546Sopenharmony_ci              send_count,
2580bf215546Sopenharmony_ci              shader_stats.scheduler_mode,
2581bf215546Sopenharmony_ci              shader_stats.promoted_constants,
2582bf215546Sopenharmony_ci              before_size, after_size,
2583bf215546Sopenharmony_ci              100.0f * (before_size - after_size) / before_size);
2584bf215546Sopenharmony_ci
2585bf215546Sopenharmony_ci      /* overriding the shader makes disasm_info invalid */
2586bf215546Sopenharmony_ci      if (!brw_try_override_assembly(p, start_offset, sha1buf)) {
2587bf215546Sopenharmony_ci         dump_assembly(p->store, start_offset, p->next_insn_offset,
2588bf215546Sopenharmony_ci                       disasm_info, perf.block_latency);
2589bf215546Sopenharmony_ci      } else {
2590bf215546Sopenharmony_ci         fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf);
2591bf215546Sopenharmony_ci      }
2592bf215546Sopenharmony_ci   }
2593bf215546Sopenharmony_ci   ralloc_free(disasm_info);
2594bf215546Sopenharmony_ci#ifndef NDEBUG
2595bf215546Sopenharmony_ci   if (!validated && !debug_flag) {
2596bf215546Sopenharmony_ci      fprintf(stderr,
2597bf215546Sopenharmony_ci            "Validation failed. Rerun with INTEL_DEBUG=shaders to get more information.\n");
2598bf215546Sopenharmony_ci   }
2599bf215546Sopenharmony_ci#endif
2600bf215546Sopenharmony_ci   assert(validated);
2601bf215546Sopenharmony_ci
2602bf215546Sopenharmony_ci   brw_shader_debug_log(compiler, log_data,
2603bf215546Sopenharmony_ci                        "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
2604bf215546Sopenharmony_ci                        "%d:%d spills:fills, %u sends, "
2605bf215546Sopenharmony_ci                        "scheduled with mode %s, "
2606bf215546Sopenharmony_ci                        "Promoted %u constants, "
2607bf215546Sopenharmony_ci                        "compacted %d to %d bytes.\n",
2608bf215546Sopenharmony_ci                        _mesa_shader_stage_to_abbrev(stage),
2609bf215546Sopenharmony_ci                        dispatch_width, before_size / 16 - nop_count,
2610bf215546Sopenharmony_ci                        loop_count, perf.latency,
2611bf215546Sopenharmony_ci                        shader_stats.spill_count,
2612bf215546Sopenharmony_ci                        shader_stats.fill_count,
2613bf215546Sopenharmony_ci                        send_count,
2614bf215546Sopenharmony_ci                        shader_stats.scheduler_mode,
2615bf215546Sopenharmony_ci                        shader_stats.promoted_constants,
2616bf215546Sopenharmony_ci                        before_size, after_size);
2617bf215546Sopenharmony_ci   if (stats) {
2618bf215546Sopenharmony_ci      stats->dispatch_width = dispatch_width;
2619bf215546Sopenharmony_ci      stats->instructions = before_size / 16 - nop_count;
2620bf215546Sopenharmony_ci      stats->sends = send_count;
2621bf215546Sopenharmony_ci      stats->loops = loop_count;
2622bf215546Sopenharmony_ci      stats->cycles = perf.latency;
2623bf215546Sopenharmony_ci      stats->spills = shader_stats.spill_count;
2624bf215546Sopenharmony_ci      stats->fills = shader_stats.fill_count;
2625bf215546Sopenharmony_ci   }
2626bf215546Sopenharmony_ci
2627bf215546Sopenharmony_ci   return start_offset;
2628bf215546Sopenharmony_ci}
2629bf215546Sopenharmony_ci
2630bf215546Sopenharmony_civoid
2631bf215546Sopenharmony_cifs_generator::add_const_data(void *data, unsigned size)
2632bf215546Sopenharmony_ci{
2633bf215546Sopenharmony_ci   assert(prog_data->const_data_size == 0);
2634bf215546Sopenharmony_ci   if (size > 0) {
2635bf215546Sopenharmony_ci      prog_data->const_data_size = size;
2636bf215546Sopenharmony_ci      prog_data->const_data_offset = brw_append_data(p, data, size, 32);
2637bf215546Sopenharmony_ci   }
2638bf215546Sopenharmony_ci}
2639bf215546Sopenharmony_ci
2640bf215546Sopenharmony_civoid
2641bf215546Sopenharmony_cifs_generator::add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt)
2642bf215546Sopenharmony_ci{
2643bf215546Sopenharmony_ci   assert(brw_shader_stage_is_bindless(stage));
2644bf215546Sopenharmony_ci   struct brw_bs_prog_data *bs_prog_data = brw_bs_prog_data(prog_data);
2645bf215546Sopenharmony_ci   if (num_resume_shaders > 0) {
2646bf215546Sopenharmony_ci      bs_prog_data->resume_sbt_offset =
2647bf215546Sopenharmony_ci         brw_append_data(p, sbt, num_resume_shaders * sizeof(uint64_t), 32);
2648bf215546Sopenharmony_ci      for (unsigned i = 0; i < num_resume_shaders; i++) {
2649bf215546Sopenharmony_ci         size_t offset = bs_prog_data->resume_sbt_offset + i * sizeof(*sbt);
2650bf215546Sopenharmony_ci         assert(offset <= UINT32_MAX);
2651bf215546Sopenharmony_ci         brw_add_reloc(p, BRW_SHADER_RELOC_SHADER_START_OFFSET,
2652bf215546Sopenharmony_ci                       BRW_SHADER_RELOC_TYPE_U32,
2653bf215546Sopenharmony_ci                       (uint32_t)offset, (uint32_t)sbt[i]);
2654bf215546Sopenharmony_ci      }
2655bf215546Sopenharmony_ci   }
2656bf215546Sopenharmony_ci}
2657bf215546Sopenharmony_ci
2658bf215546Sopenharmony_ciconst unsigned *
2659bf215546Sopenharmony_cifs_generator::get_assembly()
2660bf215546Sopenharmony_ci{
2661bf215546Sopenharmony_ci   prog_data->relocs = brw_get_shader_relocs(p, &prog_data->num_relocs);
2662bf215546Sopenharmony_ci
2663bf215546Sopenharmony_ci   return brw_get_program(p, &prog_data->program_size);
2664bf215546Sopenharmony_ci}
2665