1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright © 2010 Intel Corporation 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21bf215546Sopenharmony_ci * IN THE SOFTWARE. 22bf215546Sopenharmony_ci */ 23bf215546Sopenharmony_ci 24bf215546Sopenharmony_ci/** @file brw_fs_generator.cpp 25bf215546Sopenharmony_ci * 26bf215546Sopenharmony_ci * This file supports generating code from the FS LIR to the actual 27bf215546Sopenharmony_ci * native instructions. 28bf215546Sopenharmony_ci */ 29bf215546Sopenharmony_ci 30bf215546Sopenharmony_ci#include "brw_eu.h" 31bf215546Sopenharmony_ci#include "brw_fs.h" 32bf215546Sopenharmony_ci#include "brw_cfg.h" 33bf215546Sopenharmony_ci#include "util/mesa-sha1.h" 34bf215546Sopenharmony_ci#include "util/half_float.h" 35bf215546Sopenharmony_ci 36bf215546Sopenharmony_cistatic enum brw_reg_file 37bf215546Sopenharmony_cibrw_file_from_reg(fs_reg *reg) 38bf215546Sopenharmony_ci{ 39bf215546Sopenharmony_ci switch (reg->file) { 40bf215546Sopenharmony_ci case ARF: 41bf215546Sopenharmony_ci return BRW_ARCHITECTURE_REGISTER_FILE; 42bf215546Sopenharmony_ci case FIXED_GRF: 43bf215546Sopenharmony_ci case VGRF: 44bf215546Sopenharmony_ci return BRW_GENERAL_REGISTER_FILE; 45bf215546Sopenharmony_ci case MRF: 46bf215546Sopenharmony_ci return BRW_MESSAGE_REGISTER_FILE; 47bf215546Sopenharmony_ci case IMM: 48bf215546Sopenharmony_ci return BRW_IMMEDIATE_VALUE; 49bf215546Sopenharmony_ci case BAD_FILE: 50bf215546Sopenharmony_ci case ATTR: 51bf215546Sopenharmony_ci case UNIFORM: 52bf215546Sopenharmony_ci unreachable("not reached"); 53bf215546Sopenharmony_ci } 54bf215546Sopenharmony_ci return BRW_ARCHITECTURE_REGISTER_FILE; 55bf215546Sopenharmony_ci} 56bf215546Sopenharmony_ci 57bf215546Sopenharmony_cistatic struct brw_reg 58bf215546Sopenharmony_cibrw_reg_from_fs_reg(const struct intel_device_info *devinfo, fs_inst *inst, 59bf215546Sopenharmony_ci fs_reg *reg, bool compressed) 60bf215546Sopenharmony_ci{ 61bf215546Sopenharmony_ci struct brw_reg brw_reg; 62bf215546Sopenharmony_ci 63bf215546Sopenharmony_ci switch (reg->file) { 64bf215546Sopenharmony_ci case MRF: 65bf215546Sopenharmony_ci assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver)); 66bf215546Sopenharmony_ci FALLTHROUGH; 67bf215546Sopenharmony_ci case VGRF: 68bf215546Sopenharmony_ci if (reg->stride == 0) { 69bf215546Sopenharmony_ci brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0); 70bf215546Sopenharmony_ci } else { 71bf215546Sopenharmony_ci /* From the Haswell PRM: 72bf215546Sopenharmony_ci * 73bf215546Sopenharmony_ci * "VertStride must be used to cross GRF register boundaries. This 74bf215546Sopenharmony_ci * rule implies that elements within a 'Width' cannot cross GRF 75bf215546Sopenharmony_ci * boundaries." 76bf215546Sopenharmony_ci * 77bf215546Sopenharmony_ci * The maximum width value that could satisfy this restriction is: 78bf215546Sopenharmony_ci */ 79bf215546Sopenharmony_ci const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type)); 80bf215546Sopenharmony_ci 81bf215546Sopenharmony_ci /* Because the hardware can only split source regions at a whole 82bf215546Sopenharmony_ci * multiple of width during decompression (i.e. vertically), clamp 83bf215546Sopenharmony_ci * the value obtained above to the physical execution size of a 84bf215546Sopenharmony_ci * single decompressed chunk of the instruction: 85bf215546Sopenharmony_ci */ 86bf215546Sopenharmony_ci const unsigned phys_width = compressed ? inst->exec_size / 2 : 87bf215546Sopenharmony_ci inst->exec_size; 88bf215546Sopenharmony_ci 89bf215546Sopenharmony_ci const unsigned max_hw_width = 16; 90bf215546Sopenharmony_ci 91bf215546Sopenharmony_ci /* XXX - The equation above is strictly speaking not correct on 92bf215546Sopenharmony_ci * hardware that supports unbalanced GRF writes -- On Gfx9+ 93bf215546Sopenharmony_ci * each decompressed chunk of the instruction may have a 94bf215546Sopenharmony_ci * different execution size when the number of components 95bf215546Sopenharmony_ci * written to each destination GRF is not the same. 96bf215546Sopenharmony_ci */ 97bf215546Sopenharmony_ci if (reg->stride > 4) { 98bf215546Sopenharmony_ci assert(reg != &inst->dst); 99bf215546Sopenharmony_ci assert(reg->stride * type_sz(reg->type) <= REG_SIZE); 100bf215546Sopenharmony_ci brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0); 101bf215546Sopenharmony_ci brw_reg = stride(brw_reg, reg->stride, 1, 0); 102bf215546Sopenharmony_ci } else { 103bf215546Sopenharmony_ci const unsigned width = MIN3(reg_width, phys_width, max_hw_width); 104bf215546Sopenharmony_ci brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0); 105bf215546Sopenharmony_ci brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride); 106bf215546Sopenharmony_ci } 107bf215546Sopenharmony_ci 108bf215546Sopenharmony_ci if (devinfo->verx10 == 70) { 109bf215546Sopenharmony_ci /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13): 110bf215546Sopenharmony_ci * "Each DF (Double Float) operand uses an element size of 4 rather 111bf215546Sopenharmony_ci * than 8 and all regioning parameters are twice what the values 112bf215546Sopenharmony_ci * would be based on the true element size: ExecSize, Width, 113bf215546Sopenharmony_ci * HorzStride, and VertStride. Each DF operand uses a pair of 114bf215546Sopenharmony_ci * channels and all masking and swizzing should be adjusted 115bf215546Sopenharmony_ci * appropriately." 116bf215546Sopenharmony_ci * 117bf215546Sopenharmony_ci * From the IvyBridge PRM (Special Requirements for Handling Double 118bf215546Sopenharmony_ci * Precision Data Types, page 71): 119bf215546Sopenharmony_ci * "In Align1 mode, all regioning parameters like stride, execution 120bf215546Sopenharmony_ci * size, and width must use the syntax of a pair of packed 121bf215546Sopenharmony_ci * floats. The offsets for these data types must be 64-bit 122bf215546Sopenharmony_ci * aligned. The execution size and regioning parameters are in terms 123bf215546Sopenharmony_ci * of floats." 124bf215546Sopenharmony_ci * 125bf215546Sopenharmony_ci * Summarized: when handling DF-typed arguments, ExecSize, 126bf215546Sopenharmony_ci * VertStride, and Width must be doubled. 127bf215546Sopenharmony_ci * 128bf215546Sopenharmony_ci * It applies to BayTrail too. 129bf215546Sopenharmony_ci */ 130bf215546Sopenharmony_ci if (type_sz(reg->type) == 8) { 131bf215546Sopenharmony_ci brw_reg.width++; 132bf215546Sopenharmony_ci if (brw_reg.vstride > 0) 133bf215546Sopenharmony_ci brw_reg.vstride++; 134bf215546Sopenharmony_ci assert(brw_reg.hstride == BRW_HORIZONTAL_STRIDE_1); 135bf215546Sopenharmony_ci } 136bf215546Sopenharmony_ci 137bf215546Sopenharmony_ci /* When converting from DF->F, we set the destination stride to 2 138bf215546Sopenharmony_ci * because each d2f conversion implicitly writes 2 floats, being 139bf215546Sopenharmony_ci * the first one the converted value. IVB/BYT actually writes two 140bf215546Sopenharmony_ci * F components per SIMD channel, and every other component is 141bf215546Sopenharmony_ci * filled with garbage. 142bf215546Sopenharmony_ci */ 143bf215546Sopenharmony_ci if (reg == &inst->dst && get_exec_type_size(inst) == 8 && 144bf215546Sopenharmony_ci type_sz(inst->dst.type) < 8) { 145bf215546Sopenharmony_ci assert(brw_reg.hstride > BRW_HORIZONTAL_STRIDE_1); 146bf215546Sopenharmony_ci brw_reg.hstride--; 147bf215546Sopenharmony_ci } 148bf215546Sopenharmony_ci } 149bf215546Sopenharmony_ci } 150bf215546Sopenharmony_ci 151bf215546Sopenharmony_ci brw_reg = retype(brw_reg, reg->type); 152bf215546Sopenharmony_ci brw_reg = byte_offset(brw_reg, reg->offset); 153bf215546Sopenharmony_ci brw_reg.abs = reg->abs; 154bf215546Sopenharmony_ci brw_reg.negate = reg->negate; 155bf215546Sopenharmony_ci break; 156bf215546Sopenharmony_ci case ARF: 157bf215546Sopenharmony_ci case FIXED_GRF: 158bf215546Sopenharmony_ci case IMM: 159bf215546Sopenharmony_ci assert(reg->offset == 0); 160bf215546Sopenharmony_ci brw_reg = reg->as_brw_reg(); 161bf215546Sopenharmony_ci break; 162bf215546Sopenharmony_ci case BAD_FILE: 163bf215546Sopenharmony_ci /* Probably unused. */ 164bf215546Sopenharmony_ci brw_reg = brw_null_reg(); 165bf215546Sopenharmony_ci break; 166bf215546Sopenharmony_ci case ATTR: 167bf215546Sopenharmony_ci case UNIFORM: 168bf215546Sopenharmony_ci unreachable("not reached"); 169bf215546Sopenharmony_ci } 170bf215546Sopenharmony_ci 171bf215546Sopenharmony_ci /* On HSW+, scalar DF sources can be accessed using the normal <0,1,0> 172bf215546Sopenharmony_ci * region, but on IVB and BYT DF regions must be programmed in terms of 173bf215546Sopenharmony_ci * floats. A <0,2,1> region accomplishes this. 174bf215546Sopenharmony_ci */ 175bf215546Sopenharmony_ci if (devinfo->verx10 == 70 && 176bf215546Sopenharmony_ci type_sz(reg->type) == 8 && 177bf215546Sopenharmony_ci brw_reg.vstride == BRW_VERTICAL_STRIDE_0 && 178bf215546Sopenharmony_ci brw_reg.width == BRW_WIDTH_1 && 179bf215546Sopenharmony_ci brw_reg.hstride == BRW_HORIZONTAL_STRIDE_0) { 180bf215546Sopenharmony_ci brw_reg.width = BRW_WIDTH_2; 181bf215546Sopenharmony_ci brw_reg.hstride = BRW_HORIZONTAL_STRIDE_1; 182bf215546Sopenharmony_ci } 183bf215546Sopenharmony_ci 184bf215546Sopenharmony_ci return brw_reg; 185bf215546Sopenharmony_ci} 186bf215546Sopenharmony_ci 187bf215546Sopenharmony_cifs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data, 188bf215546Sopenharmony_ci void *mem_ctx, 189bf215546Sopenharmony_ci struct brw_stage_prog_data *prog_data, 190bf215546Sopenharmony_ci bool runtime_check_aads_emit, 191bf215546Sopenharmony_ci gl_shader_stage stage) 192bf215546Sopenharmony_ci 193bf215546Sopenharmony_ci : compiler(compiler), log_data(log_data), 194bf215546Sopenharmony_ci devinfo(compiler->devinfo), 195bf215546Sopenharmony_ci prog_data(prog_data), dispatch_width(0), 196bf215546Sopenharmony_ci runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false), 197bf215546Sopenharmony_ci shader_name(NULL), stage(stage), mem_ctx(mem_ctx) 198bf215546Sopenharmony_ci{ 199bf215546Sopenharmony_ci p = rzalloc(mem_ctx, struct brw_codegen); 200bf215546Sopenharmony_ci brw_init_codegen(&compiler->isa, p, mem_ctx); 201bf215546Sopenharmony_ci 202bf215546Sopenharmony_ci /* In the FS code generator, we are very careful to ensure that we always 203bf215546Sopenharmony_ci * set the right execution size so we don't need the EU code to "help" us 204bf215546Sopenharmony_ci * by trying to infer it. Sometimes, it infers the wrong thing. 205bf215546Sopenharmony_ci */ 206bf215546Sopenharmony_ci p->automatic_exec_sizes = false; 207bf215546Sopenharmony_ci} 208bf215546Sopenharmony_ci 209bf215546Sopenharmony_cifs_generator::~fs_generator() 210bf215546Sopenharmony_ci{ 211bf215546Sopenharmony_ci} 212bf215546Sopenharmony_ci 213bf215546Sopenharmony_ciclass ip_record : public exec_node { 214bf215546Sopenharmony_cipublic: 215bf215546Sopenharmony_ci DECLARE_RALLOC_CXX_OPERATORS(ip_record) 216bf215546Sopenharmony_ci 217bf215546Sopenharmony_ci ip_record(int ip) 218bf215546Sopenharmony_ci { 219bf215546Sopenharmony_ci this->ip = ip; 220bf215546Sopenharmony_ci } 221bf215546Sopenharmony_ci 222bf215546Sopenharmony_ci int ip; 223bf215546Sopenharmony_ci}; 224bf215546Sopenharmony_ci 225bf215546Sopenharmony_cibool 226bf215546Sopenharmony_cifs_generator::patch_halt_jumps() 227bf215546Sopenharmony_ci{ 228bf215546Sopenharmony_ci if (this->discard_halt_patches.is_empty()) 229bf215546Sopenharmony_ci return false; 230bf215546Sopenharmony_ci 231bf215546Sopenharmony_ci int scale = brw_jump_scale(p->devinfo); 232bf215546Sopenharmony_ci 233bf215546Sopenharmony_ci if (devinfo->ver >= 6) { 234bf215546Sopenharmony_ci /* There is a somewhat strange undocumented requirement of using 235bf215546Sopenharmony_ci * HALT, according to the simulator. If some channel has HALTed to 236bf215546Sopenharmony_ci * a particular UIP, then by the end of the program, every channel 237bf215546Sopenharmony_ci * must have HALTed to that UIP. Furthermore, the tracking is a 238bf215546Sopenharmony_ci * stack, so you can't do the final halt of a UIP after starting 239bf215546Sopenharmony_ci * halting to a new UIP. 240bf215546Sopenharmony_ci * 241bf215546Sopenharmony_ci * Symptoms of not emitting this instruction on actual hardware 242bf215546Sopenharmony_ci * included GPU hangs and sparkly rendering on the piglit discard 243bf215546Sopenharmony_ci * tests. 244bf215546Sopenharmony_ci */ 245bf215546Sopenharmony_ci brw_inst *last_halt = brw_HALT(p); 246bf215546Sopenharmony_ci brw_inst_set_uip(p->devinfo, last_halt, 1 * scale); 247bf215546Sopenharmony_ci brw_inst_set_jip(p->devinfo, last_halt, 1 * scale); 248bf215546Sopenharmony_ci } 249bf215546Sopenharmony_ci 250bf215546Sopenharmony_ci int ip = p->nr_insn; 251bf215546Sopenharmony_ci 252bf215546Sopenharmony_ci foreach_in_list(ip_record, patch_ip, &discard_halt_patches) { 253bf215546Sopenharmony_ci brw_inst *patch = &p->store[patch_ip->ip]; 254bf215546Sopenharmony_ci 255bf215546Sopenharmony_ci assert(brw_inst_opcode(p->isa, patch) == BRW_OPCODE_HALT); 256bf215546Sopenharmony_ci if (devinfo->ver >= 6) { 257bf215546Sopenharmony_ci /* HALT takes a half-instruction distance from the pre-incremented IP. */ 258bf215546Sopenharmony_ci brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale); 259bf215546Sopenharmony_ci } else { 260bf215546Sopenharmony_ci brw_set_src1(p, patch, brw_imm_d((ip - patch_ip->ip) * scale)); 261bf215546Sopenharmony_ci } 262bf215546Sopenharmony_ci } 263bf215546Sopenharmony_ci 264bf215546Sopenharmony_ci this->discard_halt_patches.make_empty(); 265bf215546Sopenharmony_ci 266bf215546Sopenharmony_ci if (devinfo->ver < 6) { 267bf215546Sopenharmony_ci /* From the g965 PRM: 268bf215546Sopenharmony_ci * 269bf215546Sopenharmony_ci * "As DMask is not automatically reloaded into AMask upon completion 270bf215546Sopenharmony_ci * of this instruction, software has to manually restore AMask upon 271bf215546Sopenharmony_ci * completion." 272bf215546Sopenharmony_ci * 273bf215546Sopenharmony_ci * DMask lives in the bottom 16 bits of sr0.1. 274bf215546Sopenharmony_ci */ 275bf215546Sopenharmony_ci brw_inst *reset = brw_MOV(p, brw_mask_reg(BRW_AMASK), 276bf215546Sopenharmony_ci retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW)); 277bf215546Sopenharmony_ci brw_inst_set_exec_size(devinfo, reset, BRW_EXECUTE_1); 278bf215546Sopenharmony_ci brw_inst_set_mask_control(devinfo, reset, BRW_MASK_DISABLE); 279bf215546Sopenharmony_ci brw_inst_set_qtr_control(devinfo, reset, BRW_COMPRESSION_NONE); 280bf215546Sopenharmony_ci brw_inst_set_thread_control(devinfo, reset, BRW_THREAD_SWITCH); 281bf215546Sopenharmony_ci } 282bf215546Sopenharmony_ci 283bf215546Sopenharmony_ci if (devinfo->ver == 4 && devinfo->platform != INTEL_PLATFORM_G4X) { 284bf215546Sopenharmony_ci /* From the g965 PRM: 285bf215546Sopenharmony_ci * 286bf215546Sopenharmony_ci * "[DevBW, DevCL] Erratum: The subfields in mask stack register are 287bf215546Sopenharmony_ci * reset to zero during graphics reset, however, they are not 288bf215546Sopenharmony_ci * initialized at thread dispatch. These subfields will retain the 289bf215546Sopenharmony_ci * values from the previous thread. Software should make sure the 290bf215546Sopenharmony_ci * mask stack is empty (reset to zero) before terminating the thread. 291bf215546Sopenharmony_ci * In case that this is not practical, software may have to reset the 292bf215546Sopenharmony_ci * mask stack at the beginning of each kernel, which will impact the 293bf215546Sopenharmony_ci * performance." 294bf215546Sopenharmony_ci * 295bf215546Sopenharmony_ci * Luckily we can rely on: 296bf215546Sopenharmony_ci * 297bf215546Sopenharmony_ci * "[DevBW, DevCL] This register access restriction is not 298bf215546Sopenharmony_ci * applicable, hardware does ensure execution pipeline coherency, 299bf215546Sopenharmony_ci * when a mask stack register is used as an explicit source and/or 300bf215546Sopenharmony_ci * destination." 301bf215546Sopenharmony_ci */ 302bf215546Sopenharmony_ci brw_push_insn_state(p); 303bf215546Sopenharmony_ci brw_set_default_mask_control(p, BRW_MASK_DISABLE); 304bf215546Sopenharmony_ci brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 305bf215546Sopenharmony_ci 306bf215546Sopenharmony_ci brw_set_default_exec_size(p, BRW_EXECUTE_2); 307bf215546Sopenharmony_ci brw_MOV(p, vec2(brw_mask_stack_depth_reg(0)), brw_imm_uw(0)); 308bf215546Sopenharmony_ci 309bf215546Sopenharmony_ci brw_set_default_exec_size(p, BRW_EXECUTE_16); 310bf215546Sopenharmony_ci /* Reset the if stack. */ 311bf215546Sopenharmony_ci brw_MOV(p, retype(brw_mask_stack_reg(0), BRW_REGISTER_TYPE_UW), 312bf215546Sopenharmony_ci brw_imm_uw(0)); 313bf215546Sopenharmony_ci 314bf215546Sopenharmony_ci brw_pop_insn_state(p); 315bf215546Sopenharmony_ci } 316bf215546Sopenharmony_ci 317bf215546Sopenharmony_ci return true; 318bf215546Sopenharmony_ci} 319bf215546Sopenharmony_ci 320bf215546Sopenharmony_civoid 321bf215546Sopenharmony_cifs_generator::generate_send(fs_inst *inst, 322bf215546Sopenharmony_ci struct brw_reg dst, 323bf215546Sopenharmony_ci struct brw_reg desc, 324bf215546Sopenharmony_ci struct brw_reg ex_desc, 325bf215546Sopenharmony_ci struct brw_reg payload, 326bf215546Sopenharmony_ci struct brw_reg payload2) 327bf215546Sopenharmony_ci{ 328bf215546Sopenharmony_ci const bool dst_is_null = dst.file == BRW_ARCHITECTURE_REGISTER_FILE && 329bf215546Sopenharmony_ci dst.nr == BRW_ARF_NULL; 330bf215546Sopenharmony_ci const unsigned rlen = dst_is_null ? 0 : inst->size_written / REG_SIZE; 331bf215546Sopenharmony_ci 332bf215546Sopenharmony_ci uint32_t desc_imm = inst->desc | 333bf215546Sopenharmony_ci brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size); 334bf215546Sopenharmony_ci 335bf215546Sopenharmony_ci uint32_t ex_desc_imm = inst->ex_desc | 336bf215546Sopenharmony_ci brw_message_ex_desc(devinfo, inst->ex_mlen); 337bf215546Sopenharmony_ci 338bf215546Sopenharmony_ci if (ex_desc.file != BRW_IMMEDIATE_VALUE || ex_desc.ud || ex_desc_imm) { 339bf215546Sopenharmony_ci /* If we have any sort of extended descriptor, then we need SENDS. This 340bf215546Sopenharmony_ci * also covers the dual-payload case because ex_mlen goes in ex_desc. 341bf215546Sopenharmony_ci */ 342bf215546Sopenharmony_ci brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2, 343bf215546Sopenharmony_ci desc, desc_imm, ex_desc, ex_desc_imm, 344bf215546Sopenharmony_ci inst->eot); 345bf215546Sopenharmony_ci if (inst->check_tdr) 346bf215546Sopenharmony_ci brw_inst_set_opcode(p->isa, brw_last_inst, 347bf215546Sopenharmony_ci devinfo->ver >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC); 348bf215546Sopenharmony_ci } else { 349bf215546Sopenharmony_ci brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm, 350bf215546Sopenharmony_ci inst->eot); 351bf215546Sopenharmony_ci if (inst->check_tdr) 352bf215546Sopenharmony_ci brw_inst_set_opcode(p->isa, brw_last_inst, BRW_OPCODE_SENDC); 353bf215546Sopenharmony_ci } 354bf215546Sopenharmony_ci} 355bf215546Sopenharmony_ci 356bf215546Sopenharmony_civoid 357bf215546Sopenharmony_cifs_generator::fire_fb_write(fs_inst *inst, 358bf215546Sopenharmony_ci struct brw_reg payload, 359bf215546Sopenharmony_ci struct brw_reg implied_header, 360bf215546Sopenharmony_ci GLuint nr) 361bf215546Sopenharmony_ci{ 362bf215546Sopenharmony_ci struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); 363bf215546Sopenharmony_ci 364bf215546Sopenharmony_ci if (devinfo->ver < 6) { 365bf215546Sopenharmony_ci brw_push_insn_state(p); 366bf215546Sopenharmony_ci brw_set_default_exec_size(p, BRW_EXECUTE_8); 367bf215546Sopenharmony_ci brw_set_default_mask_control(p, BRW_MASK_DISABLE); 368bf215546Sopenharmony_ci brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 369bf215546Sopenharmony_ci brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 370bf215546Sopenharmony_ci brw_MOV(p, offset(retype(payload, BRW_REGISTER_TYPE_UD), 1), 371bf215546Sopenharmony_ci offset(retype(implied_header, BRW_REGISTER_TYPE_UD), 1)); 372bf215546Sopenharmony_ci brw_pop_insn_state(p); 373bf215546Sopenharmony_ci } 374bf215546Sopenharmony_ci 375bf215546Sopenharmony_ci uint32_t msg_control = brw_fb_write_msg_control(inst, prog_data); 376bf215546Sopenharmony_ci 377bf215546Sopenharmony_ci /* We assume render targets start at 0, because headerless FB write 378bf215546Sopenharmony_ci * messages set "Render Target Index" to 0. Using a different binding 379bf215546Sopenharmony_ci * table index would make it impossible to use headerless messages. 380bf215546Sopenharmony_ci */ 381bf215546Sopenharmony_ci const uint32_t surf_index = inst->target; 382bf215546Sopenharmony_ci 383bf215546Sopenharmony_ci brw_inst *insn = brw_fb_WRITE(p, 384bf215546Sopenharmony_ci payload, 385bf215546Sopenharmony_ci retype(implied_header, BRW_REGISTER_TYPE_UW), 386bf215546Sopenharmony_ci msg_control, 387bf215546Sopenharmony_ci surf_index, 388bf215546Sopenharmony_ci nr, 389bf215546Sopenharmony_ci 0, 390bf215546Sopenharmony_ci inst->eot, 391bf215546Sopenharmony_ci inst->last_rt, 392bf215546Sopenharmony_ci inst->header_size != 0); 393bf215546Sopenharmony_ci 394bf215546Sopenharmony_ci if (devinfo->ver >= 6) 395bf215546Sopenharmony_ci brw_inst_set_rt_slot_group(devinfo, insn, inst->group / 16); 396bf215546Sopenharmony_ci} 397bf215546Sopenharmony_ci 398bf215546Sopenharmony_civoid 399bf215546Sopenharmony_cifs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload) 400bf215546Sopenharmony_ci{ 401bf215546Sopenharmony_ci if (devinfo->verx10 <= 70) { 402bf215546Sopenharmony_ci brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 403bf215546Sopenharmony_ci brw_set_default_flag_reg(p, 0, 0); 404bf215546Sopenharmony_ci } 405bf215546Sopenharmony_ci 406bf215546Sopenharmony_ci const struct brw_reg implied_header = 407bf215546Sopenharmony_ci devinfo->ver < 6 ? payload : brw_null_reg(); 408bf215546Sopenharmony_ci 409bf215546Sopenharmony_ci if (inst->base_mrf >= 0) 410bf215546Sopenharmony_ci payload = brw_message_reg(inst->base_mrf); 411bf215546Sopenharmony_ci 412bf215546Sopenharmony_ci if (!runtime_check_aads_emit) { 413bf215546Sopenharmony_ci fire_fb_write(inst, payload, implied_header, inst->mlen); 414bf215546Sopenharmony_ci } else { 415bf215546Sopenharmony_ci /* This can only happen in gen < 6 */ 416bf215546Sopenharmony_ci assert(devinfo->ver < 6); 417bf215546Sopenharmony_ci 418bf215546Sopenharmony_ci struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); 419bf215546Sopenharmony_ci 420bf215546Sopenharmony_ci /* Check runtime bit to detect if we have to send AA data or not */ 421bf215546Sopenharmony_ci brw_push_insn_state(p); 422bf215546Sopenharmony_ci brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 423bf215546Sopenharmony_ci brw_set_default_exec_size(p, BRW_EXECUTE_1); 424bf215546Sopenharmony_ci brw_AND(p, 425bf215546Sopenharmony_ci v1_null_ud, 426bf215546Sopenharmony_ci retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD), 427bf215546Sopenharmony_ci brw_imm_ud(1<<26)); 428bf215546Sopenharmony_ci brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); 429bf215546Sopenharmony_ci 430bf215546Sopenharmony_ci int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store; 431bf215546Sopenharmony_ci brw_pop_insn_state(p); 432bf215546Sopenharmony_ci { 433bf215546Sopenharmony_ci /* Don't send AA data */ 434bf215546Sopenharmony_ci fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1); 435bf215546Sopenharmony_ci } 436bf215546Sopenharmony_ci brw_land_fwd_jump(p, jmp); 437bf215546Sopenharmony_ci fire_fb_write(inst, payload, implied_header, inst->mlen); 438bf215546Sopenharmony_ci } 439bf215546Sopenharmony_ci} 440bf215546Sopenharmony_ci 441bf215546Sopenharmony_civoid 442bf215546Sopenharmony_cifs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst, 443bf215546Sopenharmony_ci struct brw_reg payload) 444bf215546Sopenharmony_ci{ 445bf215546Sopenharmony_ci assert(inst->size_written % REG_SIZE == 0); 446bf215546Sopenharmony_ci struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); 447bf215546Sopenharmony_ci /* We assume that render targets start at binding table index 0. */ 448bf215546Sopenharmony_ci const unsigned surf_index = inst->target; 449bf215546Sopenharmony_ci 450bf215546Sopenharmony_ci gfx9_fb_READ(p, dst, payload, surf_index, 451bf215546Sopenharmony_ci inst->header_size, inst->size_written / REG_SIZE, 452bf215546Sopenharmony_ci prog_data->persample_dispatch); 453bf215546Sopenharmony_ci} 454bf215546Sopenharmony_ci 455bf215546Sopenharmony_civoid 456bf215546Sopenharmony_cifs_generator::generate_mov_indirect(fs_inst *inst, 457bf215546Sopenharmony_ci struct brw_reg dst, 458bf215546Sopenharmony_ci struct brw_reg reg, 459bf215546Sopenharmony_ci struct brw_reg indirect_byte_offset) 460bf215546Sopenharmony_ci{ 461bf215546Sopenharmony_ci assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD); 462bf215546Sopenharmony_ci assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE); 463bf215546Sopenharmony_ci assert(!reg.abs && !reg.negate); 464bf215546Sopenharmony_ci assert(reg.type == dst.type); 465bf215546Sopenharmony_ci 466bf215546Sopenharmony_ci unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr; 467bf215546Sopenharmony_ci 468bf215546Sopenharmony_ci if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) { 469bf215546Sopenharmony_ci imm_byte_offset += indirect_byte_offset.ud; 470bf215546Sopenharmony_ci 471bf215546Sopenharmony_ci reg.nr = imm_byte_offset / REG_SIZE; 472bf215546Sopenharmony_ci reg.subnr = imm_byte_offset % REG_SIZE; 473bf215546Sopenharmony_ci if (type_sz(reg.type) > 4 && !devinfo->has_64bit_float) { 474bf215546Sopenharmony_ci brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), 475bf215546Sopenharmony_ci subscript(reg, BRW_REGISTER_TYPE_D, 0)); 476bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_null()); 477bf215546Sopenharmony_ci brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), 478bf215546Sopenharmony_ci subscript(reg, BRW_REGISTER_TYPE_D, 1)); 479bf215546Sopenharmony_ci } else { 480bf215546Sopenharmony_ci brw_MOV(p, dst, reg); 481bf215546Sopenharmony_ci } 482bf215546Sopenharmony_ci } else { 483bf215546Sopenharmony_ci /* Prior to Broadwell, there are only 8 address registers. */ 484bf215546Sopenharmony_ci assert(inst->exec_size <= 8 || devinfo->ver >= 8); 485bf215546Sopenharmony_ci 486bf215546Sopenharmony_ci /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */ 487bf215546Sopenharmony_ci struct brw_reg addr = vec8(brw_address_reg(0)); 488bf215546Sopenharmony_ci 489bf215546Sopenharmony_ci /* Whether we can use destination dependency control without running the 490bf215546Sopenharmony_ci * risk of a hang if an instruction gets shot down. 491bf215546Sopenharmony_ci */ 492bf215546Sopenharmony_ci const bool use_dep_ctrl = !inst->predicate && 493bf215546Sopenharmony_ci inst->exec_size == dispatch_width; 494bf215546Sopenharmony_ci brw_inst *insn; 495bf215546Sopenharmony_ci 496bf215546Sopenharmony_ci /* The destination stride of an instruction (in bytes) must be greater 497bf215546Sopenharmony_ci * than or equal to the size of the rest of the instruction. Since the 498bf215546Sopenharmony_ci * address register is of type UW, we can't use a D-type instruction. 499bf215546Sopenharmony_ci * In order to get around this, re retype to UW and use a stride. 500bf215546Sopenharmony_ci */ 501bf215546Sopenharmony_ci indirect_byte_offset = 502bf215546Sopenharmony_ci retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW); 503bf215546Sopenharmony_ci 504bf215546Sopenharmony_ci /* There are a number of reasons why we don't use the base offset here. 505bf215546Sopenharmony_ci * One reason is that the field is only 9 bits which means we can only 506bf215546Sopenharmony_ci * use it to access the first 16 GRFs. Also, from the Haswell PRM 507bf215546Sopenharmony_ci * section "Register Region Restrictions": 508bf215546Sopenharmony_ci * 509bf215546Sopenharmony_ci * "The lower bits of the AddressImmediate must not overflow to 510bf215546Sopenharmony_ci * change the register address. The lower 5 bits of Address 511bf215546Sopenharmony_ci * Immediate when added to lower 5 bits of address register gives 512bf215546Sopenharmony_ci * the sub-register offset. The upper bits of Address Immediate 513bf215546Sopenharmony_ci * when added to upper bits of address register gives the register 514bf215546Sopenharmony_ci * address. Any overflow from sub-register offset is dropped." 515bf215546Sopenharmony_ci * 516bf215546Sopenharmony_ci * Since the indirect may cause us to cross a register boundary, this 517bf215546Sopenharmony_ci * makes the base offset almost useless. We could try and do something 518bf215546Sopenharmony_ci * clever where we use a actual base offset if base_offset % 32 == 0 but 519bf215546Sopenharmony_ci * that would mean we were generating different code depending on the 520bf215546Sopenharmony_ci * base offset. Instead, for the sake of consistency, we'll just do the 521bf215546Sopenharmony_ci * add ourselves. This restriction is only listed in the Haswell PRM 522bf215546Sopenharmony_ci * but empirical testing indicates that it applies on all older 523bf215546Sopenharmony_ci * generations and is lifted on Broadwell. 524bf215546Sopenharmony_ci * 525bf215546Sopenharmony_ci * In the end, while base_offset is nice to look at in the generated 526bf215546Sopenharmony_ci * code, using it saves us 0 instructions and would require quite a bit 527bf215546Sopenharmony_ci * of case-by-case work. It's just not worth it. 528bf215546Sopenharmony_ci * 529bf215546Sopenharmony_ci * Due to a hardware bug some platforms (particularly Gfx11+) seem to 530bf215546Sopenharmony_ci * require the address components of all channels to be valid whether or 531bf215546Sopenharmony_ci * not they're active, which causes issues if we use VxH addressing 532bf215546Sopenharmony_ci * under non-uniform control-flow. We can easily work around that by 533bf215546Sopenharmony_ci * initializing the whole address register with a pipelined NoMask MOV 534bf215546Sopenharmony_ci * instruction. 535bf215546Sopenharmony_ci */ 536bf215546Sopenharmony_ci if (devinfo->ver >= 7) { 537bf215546Sopenharmony_ci insn = brw_MOV(p, addr, brw_imm_uw(imm_byte_offset)); 538bf215546Sopenharmony_ci brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); 539bf215546Sopenharmony_ci brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); 540bf215546Sopenharmony_ci if (devinfo->ver >= 12) 541bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_null()); 542bf215546Sopenharmony_ci else 543bf215546Sopenharmony_ci brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl); 544bf215546Sopenharmony_ci } 545bf215546Sopenharmony_ci 546bf215546Sopenharmony_ci insn = brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset)); 547bf215546Sopenharmony_ci if (devinfo->ver >= 12) 548bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_regdist(1)); 549bf215546Sopenharmony_ci else if (devinfo->ver >= 7) 550bf215546Sopenharmony_ci brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl); 551bf215546Sopenharmony_ci 552bf215546Sopenharmony_ci if (type_sz(reg.type) > 4 && 553bf215546Sopenharmony_ci ((devinfo->verx10 == 70) || 554bf215546Sopenharmony_ci devinfo->platform == INTEL_PLATFORM_CHV || intel_device_info_is_9lp(devinfo) || 555bf215546Sopenharmony_ci !devinfo->has_64bit_float || devinfo->verx10 >= 125)) { 556bf215546Sopenharmony_ci /* IVB has an issue (which we found empirically) where it reads two 557bf215546Sopenharmony_ci * address register components per channel for indirectly addressed 558bf215546Sopenharmony_ci * 64-bit sources. 559bf215546Sopenharmony_ci * 560bf215546Sopenharmony_ci * From the Cherryview PRM Vol 7. "Register Region Restrictions": 561bf215546Sopenharmony_ci * 562bf215546Sopenharmony_ci * "When source or destination datatype is 64b or operation is 563bf215546Sopenharmony_ci * integer DWord multiply, indirect addressing must not be used." 564bf215546Sopenharmony_ci * 565bf215546Sopenharmony_ci * To work around both of these, we do two integer MOVs insead of one 566bf215546Sopenharmony_ci * 64-bit MOV. Because no double value should ever cross a register 567bf215546Sopenharmony_ci * boundary, it's safe to use the immediate offset in the indirect 568bf215546Sopenharmony_ci * here to handle adding 4 bytes to the offset and avoid the extra 569bf215546Sopenharmony_ci * ADD to the register file. 570bf215546Sopenharmony_ci */ 571bf215546Sopenharmony_ci brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), 572bf215546Sopenharmony_ci retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D)); 573bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_null()); 574bf215546Sopenharmony_ci brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), 575bf215546Sopenharmony_ci retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D)); 576bf215546Sopenharmony_ci } else { 577bf215546Sopenharmony_ci struct brw_reg ind_src = brw_VxH_indirect(0, 0); 578bf215546Sopenharmony_ci 579bf215546Sopenharmony_ci brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type)); 580bf215546Sopenharmony_ci 581bf215546Sopenharmony_ci if (devinfo->ver == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE && 582bf215546Sopenharmony_ci !inst->get_next()->is_tail_sentinel() && 583bf215546Sopenharmony_ci ((fs_inst *)inst->get_next())->mlen > 0) { 584bf215546Sopenharmony_ci /* From the Sandybridge PRM: 585bf215546Sopenharmony_ci * 586bf215546Sopenharmony_ci * "[Errata: DevSNB(SNB)] If MRF register is updated by any 587bf215546Sopenharmony_ci * instruction that “indexed/indirect” source AND is followed 588bf215546Sopenharmony_ci * by a send, the instruction requires a “Switch”. This is to 589bf215546Sopenharmony_ci * avoid race condition where send may dispatch before MRF is 590bf215546Sopenharmony_ci * updated." 591bf215546Sopenharmony_ci */ 592bf215546Sopenharmony_ci brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH); 593bf215546Sopenharmony_ci } 594bf215546Sopenharmony_ci } 595bf215546Sopenharmony_ci } 596bf215546Sopenharmony_ci} 597bf215546Sopenharmony_ci 598bf215546Sopenharmony_civoid 599bf215546Sopenharmony_cifs_generator::generate_shuffle(fs_inst *inst, 600bf215546Sopenharmony_ci struct brw_reg dst, 601bf215546Sopenharmony_ci struct brw_reg src, 602bf215546Sopenharmony_ci struct brw_reg idx) 603bf215546Sopenharmony_ci{ 604bf215546Sopenharmony_ci assert(src.file == BRW_GENERAL_REGISTER_FILE); 605bf215546Sopenharmony_ci assert(!src.abs && !src.negate); 606bf215546Sopenharmony_ci 607bf215546Sopenharmony_ci /* Ivy bridge has some strange behavior that makes this a real pain to 608bf215546Sopenharmony_ci * implement for 64-bit values so we just don't bother. 609bf215546Sopenharmony_ci */ 610bf215546Sopenharmony_ci assert((devinfo->verx10 >= 75 && devinfo->has_64bit_float) || 611bf215546Sopenharmony_ci type_sz(src.type) <= 4); 612bf215546Sopenharmony_ci 613bf215546Sopenharmony_ci /* Because we're using the address register, we're limited to 8-wide 614bf215546Sopenharmony_ci * execution on gfx7. On gfx8, we're limited to 16-wide by the address 615bf215546Sopenharmony_ci * register file and 8-wide for 64-bit types. We could try and make this 616bf215546Sopenharmony_ci * instruction splittable higher up in the compiler but that gets weird 617bf215546Sopenharmony_ci * because it reads all of the channels regardless of execution size. It's 618bf215546Sopenharmony_ci * easier just to split it here. 619bf215546Sopenharmony_ci */ 620bf215546Sopenharmony_ci const unsigned lower_width = 621bf215546Sopenharmony_ci devinfo->ver <= 7 || element_sz(src) > 4 || element_sz(dst) > 4 ? 8 : 622bf215546Sopenharmony_ci MIN2(16, inst->exec_size); 623bf215546Sopenharmony_ci 624bf215546Sopenharmony_ci brw_set_default_exec_size(p, cvt(lower_width) - 1); 625bf215546Sopenharmony_ci for (unsigned group = 0; group < inst->exec_size; group += lower_width) { 626bf215546Sopenharmony_ci brw_set_default_group(p, group); 627bf215546Sopenharmony_ci 628bf215546Sopenharmony_ci if ((src.vstride == 0 && src.hstride == 0) || 629bf215546Sopenharmony_ci idx.file == BRW_IMMEDIATE_VALUE) { 630bf215546Sopenharmony_ci /* Trivial, the source is already uniform or the index is a constant. 631bf215546Sopenharmony_ci * We will typically not get here if the optimizer is doing its job, 632bf215546Sopenharmony_ci * but asserting would be mean. 633bf215546Sopenharmony_ci */ 634bf215546Sopenharmony_ci const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0; 635bf215546Sopenharmony_ci struct brw_reg group_src = stride(suboffset(src, i), 0, 1, 0); 636bf215546Sopenharmony_ci struct brw_reg group_dst = suboffset(dst, group << (dst.hstride - 1)); 637bf215546Sopenharmony_ci brw_MOV(p, group_dst, group_src); 638bf215546Sopenharmony_ci } else { 639bf215546Sopenharmony_ci /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */ 640bf215546Sopenharmony_ci struct brw_reg addr = vec8(brw_address_reg(0)); 641bf215546Sopenharmony_ci 642bf215546Sopenharmony_ci struct brw_reg group_idx = suboffset(idx, group); 643bf215546Sopenharmony_ci 644bf215546Sopenharmony_ci if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) { 645bf215546Sopenharmony_ci /* Things get grumpy if the register is too wide. */ 646bf215546Sopenharmony_ci group_idx.width--; 647bf215546Sopenharmony_ci group_idx.vstride--; 648bf215546Sopenharmony_ci } 649bf215546Sopenharmony_ci 650bf215546Sopenharmony_ci assert(type_sz(group_idx.type) <= 4); 651bf215546Sopenharmony_ci if (type_sz(group_idx.type) == 4) { 652bf215546Sopenharmony_ci /* The destination stride of an instruction (in bytes) must be 653bf215546Sopenharmony_ci * greater than or equal to the size of the rest of the 654bf215546Sopenharmony_ci * instruction. Since the address register is of type UW, we 655bf215546Sopenharmony_ci * can't use a D-type instruction. In order to get around this, 656bf215546Sopenharmony_ci * re retype to UW and use a stride. 657bf215546Sopenharmony_ci */ 658bf215546Sopenharmony_ci group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W); 659bf215546Sopenharmony_ci } 660bf215546Sopenharmony_ci 661bf215546Sopenharmony_ci uint32_t src_start_offset = src.nr * REG_SIZE + src.subnr; 662bf215546Sopenharmony_ci 663bf215546Sopenharmony_ci /* From the Haswell PRM: 664bf215546Sopenharmony_ci * 665bf215546Sopenharmony_ci * "When a sequence of NoDDChk and NoDDClr are used, the last 666bf215546Sopenharmony_ci * instruction that completes the scoreboard clear must have a 667bf215546Sopenharmony_ci * non-zero execution mask. This means, if any kind of predication 668bf215546Sopenharmony_ci * can change the execution mask or channel enable of the last 669bf215546Sopenharmony_ci * instruction, the optimization must be avoided. This is to 670bf215546Sopenharmony_ci * avoid instructions being shot down the pipeline when no writes 671bf215546Sopenharmony_ci * are required." 672bf215546Sopenharmony_ci * 673bf215546Sopenharmony_ci * Whenever predication is enabled or the instructions being emitted 674bf215546Sopenharmony_ci * aren't the full width, it's possible that it will be run with zero 675bf215546Sopenharmony_ci * channels enabled so we can't use dependency control without 676bf215546Sopenharmony_ci * running the risk of a hang if an instruction gets shot down. 677bf215546Sopenharmony_ci */ 678bf215546Sopenharmony_ci const bool use_dep_ctrl = !inst->predicate && 679bf215546Sopenharmony_ci lower_width == dispatch_width; 680bf215546Sopenharmony_ci brw_inst *insn; 681bf215546Sopenharmony_ci 682bf215546Sopenharmony_ci /* Due to a hardware bug some platforms (particularly Gfx11+) seem 683bf215546Sopenharmony_ci * to require the address components of all channels to be valid 684bf215546Sopenharmony_ci * whether or not they're active, which causes issues if we use VxH 685bf215546Sopenharmony_ci * addressing under non-uniform control-flow. We can easily work 686bf215546Sopenharmony_ci * around that by initializing the whole address register with a 687bf215546Sopenharmony_ci * pipelined NoMask MOV instruction. 688bf215546Sopenharmony_ci */ 689bf215546Sopenharmony_ci insn = brw_MOV(p, addr, brw_imm_uw(src_start_offset)); 690bf215546Sopenharmony_ci brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); 691bf215546Sopenharmony_ci brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); 692bf215546Sopenharmony_ci if (devinfo->ver >= 12) 693bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_null()); 694bf215546Sopenharmony_ci else 695bf215546Sopenharmony_ci brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl); 696bf215546Sopenharmony_ci 697bf215546Sopenharmony_ci /* Take into account the component size and horizontal stride. */ 698bf215546Sopenharmony_ci assert(src.vstride == src.hstride + src.width); 699bf215546Sopenharmony_ci insn = brw_SHL(p, addr, group_idx, 700bf215546Sopenharmony_ci brw_imm_uw(util_logbase2(type_sz(src.type)) + 701bf215546Sopenharmony_ci src.hstride - 1)); 702bf215546Sopenharmony_ci if (devinfo->ver >= 12) 703bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_regdist(1)); 704bf215546Sopenharmony_ci else 705bf215546Sopenharmony_ci brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl); 706bf215546Sopenharmony_ci 707bf215546Sopenharmony_ci /* Add on the register start offset */ 708bf215546Sopenharmony_ci brw_ADD(p, addr, addr, brw_imm_uw(src_start_offset)); 709bf215546Sopenharmony_ci brw_MOV(p, suboffset(dst, group << (dst.hstride - 1)), 710bf215546Sopenharmony_ci retype(brw_VxH_indirect(0, 0), src.type)); 711bf215546Sopenharmony_ci } 712bf215546Sopenharmony_ci 713bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_null()); 714bf215546Sopenharmony_ci } 715bf215546Sopenharmony_ci} 716bf215546Sopenharmony_ci 717bf215546Sopenharmony_civoid 718bf215546Sopenharmony_cifs_generator::generate_quad_swizzle(const fs_inst *inst, 719bf215546Sopenharmony_ci struct brw_reg dst, struct brw_reg src, 720bf215546Sopenharmony_ci unsigned swiz) 721bf215546Sopenharmony_ci{ 722bf215546Sopenharmony_ci /* Requires a quad. */ 723bf215546Sopenharmony_ci assert(inst->exec_size >= 4); 724bf215546Sopenharmony_ci 725bf215546Sopenharmony_ci if (src.file == BRW_IMMEDIATE_VALUE || 726bf215546Sopenharmony_ci has_scalar_region(src)) { 727bf215546Sopenharmony_ci /* The value is uniform across all channels */ 728bf215546Sopenharmony_ci brw_MOV(p, dst, src); 729bf215546Sopenharmony_ci 730bf215546Sopenharmony_ci } else if (devinfo->ver < 11 && type_sz(src.type) == 4) { 731bf215546Sopenharmony_ci /* This only works on 8-wide 32-bit values */ 732bf215546Sopenharmony_ci assert(inst->exec_size == 8); 733bf215546Sopenharmony_ci assert(src.hstride == BRW_HORIZONTAL_STRIDE_1); 734bf215546Sopenharmony_ci assert(src.vstride == src.width + 1); 735bf215546Sopenharmony_ci brw_set_default_access_mode(p, BRW_ALIGN_16); 736bf215546Sopenharmony_ci struct brw_reg swiz_src = stride(src, 4, 4, 1); 737bf215546Sopenharmony_ci swiz_src.swizzle = swiz; 738bf215546Sopenharmony_ci brw_MOV(p, dst, swiz_src); 739bf215546Sopenharmony_ci 740bf215546Sopenharmony_ci } else { 741bf215546Sopenharmony_ci assert(src.hstride == BRW_HORIZONTAL_STRIDE_1); 742bf215546Sopenharmony_ci assert(src.vstride == src.width + 1); 743bf215546Sopenharmony_ci const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0)); 744bf215546Sopenharmony_ci 745bf215546Sopenharmony_ci switch (swiz) { 746bf215546Sopenharmony_ci case BRW_SWIZZLE_XXXX: 747bf215546Sopenharmony_ci case BRW_SWIZZLE_YYYY: 748bf215546Sopenharmony_ci case BRW_SWIZZLE_ZZZZ: 749bf215546Sopenharmony_ci case BRW_SWIZZLE_WWWW: 750bf215546Sopenharmony_ci brw_MOV(p, dst, stride(src_0, 4, 4, 0)); 751bf215546Sopenharmony_ci break; 752bf215546Sopenharmony_ci 753bf215546Sopenharmony_ci case BRW_SWIZZLE_XXZZ: 754bf215546Sopenharmony_ci case BRW_SWIZZLE_YYWW: 755bf215546Sopenharmony_ci brw_MOV(p, dst, stride(src_0, 2, 2, 0)); 756bf215546Sopenharmony_ci break; 757bf215546Sopenharmony_ci 758bf215546Sopenharmony_ci case BRW_SWIZZLE_XYXY: 759bf215546Sopenharmony_ci case BRW_SWIZZLE_ZWZW: 760bf215546Sopenharmony_ci assert(inst->exec_size == 4); 761bf215546Sopenharmony_ci brw_MOV(p, dst, stride(src_0, 0, 2, 1)); 762bf215546Sopenharmony_ci break; 763bf215546Sopenharmony_ci 764bf215546Sopenharmony_ci default: 765bf215546Sopenharmony_ci assert(inst->force_writemask_all); 766bf215546Sopenharmony_ci brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1); 767bf215546Sopenharmony_ci 768bf215546Sopenharmony_ci for (unsigned c = 0; c < 4; c++) { 769bf215546Sopenharmony_ci brw_inst *insn = brw_MOV( 770bf215546Sopenharmony_ci p, stride(suboffset(dst, c), 771bf215546Sopenharmony_ci 4 * inst->dst.stride, 1, 4 * inst->dst.stride), 772bf215546Sopenharmony_ci stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0)); 773bf215546Sopenharmony_ci 774bf215546Sopenharmony_ci if (devinfo->ver < 12) { 775bf215546Sopenharmony_ci brw_inst_set_no_dd_clear(devinfo, insn, c < 3); 776bf215546Sopenharmony_ci brw_inst_set_no_dd_check(devinfo, insn, c > 0); 777bf215546Sopenharmony_ci } 778bf215546Sopenharmony_ci 779bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_null()); 780bf215546Sopenharmony_ci } 781bf215546Sopenharmony_ci 782bf215546Sopenharmony_ci break; 783bf215546Sopenharmony_ci } 784bf215546Sopenharmony_ci } 785bf215546Sopenharmony_ci} 786bf215546Sopenharmony_ci 787bf215546Sopenharmony_civoid 788bf215546Sopenharmony_cifs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload) 789bf215546Sopenharmony_ci{ 790bf215546Sopenharmony_ci struct brw_inst *insn; 791bf215546Sopenharmony_ci 792bf215546Sopenharmony_ci insn = brw_next_insn(p, BRW_OPCODE_SEND); 793bf215546Sopenharmony_ci 794bf215546Sopenharmony_ci brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW)); 795bf215546Sopenharmony_ci brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW)); 796bf215546Sopenharmony_ci if (devinfo->ver < 12) 797bf215546Sopenharmony_ci brw_set_src1(p, insn, brw_imm_ud(0u)); 798bf215546Sopenharmony_ci 799bf215546Sopenharmony_ci /* For XeHP and newer send a message to the message gateway to terminate a 800bf215546Sopenharmony_ci * compute shader. For older devices, a message is sent to the thread 801bf215546Sopenharmony_ci * spawner. 802bf215546Sopenharmony_ci */ 803bf215546Sopenharmony_ci if (devinfo->verx10 >= 125) 804bf215546Sopenharmony_ci brw_inst_set_sfid(devinfo, insn, BRW_SFID_MESSAGE_GATEWAY); 805bf215546Sopenharmony_ci else 806bf215546Sopenharmony_ci brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER); 807bf215546Sopenharmony_ci brw_inst_set_mlen(devinfo, insn, 1); 808bf215546Sopenharmony_ci brw_inst_set_rlen(devinfo, insn, 0); 809bf215546Sopenharmony_ci brw_inst_set_eot(devinfo, insn, inst->eot); 810bf215546Sopenharmony_ci brw_inst_set_header_present(devinfo, insn, false); 811bf215546Sopenharmony_ci 812bf215546Sopenharmony_ci brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */ 813bf215546Sopenharmony_ci 814bf215546Sopenharmony_ci if (devinfo->ver < 11) { 815bf215546Sopenharmony_ci brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */ 816bf215546Sopenharmony_ci 817bf215546Sopenharmony_ci /* Note that even though the thread has a URB resource associated with it, 818bf215546Sopenharmony_ci * we set the "do not dereference URB" bit, because the URB resource is 819bf215546Sopenharmony_ci * managed by the fixed-function unit, so it will free it automatically. 820bf215546Sopenharmony_ci */ 821bf215546Sopenharmony_ci brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */ 822bf215546Sopenharmony_ci } 823bf215546Sopenharmony_ci 824bf215546Sopenharmony_ci brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); 825bf215546Sopenharmony_ci} 826bf215546Sopenharmony_ci 827bf215546Sopenharmony_civoid 828bf215546Sopenharmony_cifs_generator::generate_barrier(fs_inst *, struct brw_reg src) 829bf215546Sopenharmony_ci{ 830bf215546Sopenharmony_ci brw_barrier(p, src); 831bf215546Sopenharmony_ci if (devinfo->ver >= 12) { 832bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_null()); 833bf215546Sopenharmony_ci brw_SYNC(p, TGL_SYNC_BAR); 834bf215546Sopenharmony_ci } else { 835bf215546Sopenharmony_ci brw_WAIT(p); 836bf215546Sopenharmony_ci } 837bf215546Sopenharmony_ci} 838bf215546Sopenharmony_ci 839bf215546Sopenharmony_cibool 840bf215546Sopenharmony_cifs_generator::generate_linterp(fs_inst *inst, 841bf215546Sopenharmony_ci struct brw_reg dst, struct brw_reg *src) 842bf215546Sopenharmony_ci{ 843bf215546Sopenharmony_ci /* PLN reads: 844bf215546Sopenharmony_ci * / in SIMD16 \ 845bf215546Sopenharmony_ci * ----------------------------------- 846bf215546Sopenharmony_ci * | src1+0 | src1+1 | src1+2 | src1+3 | 847bf215546Sopenharmony_ci * |-----------------------------------| 848bf215546Sopenharmony_ci * |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)| 849bf215546Sopenharmony_ci * ----------------------------------- 850bf215546Sopenharmony_ci * 851bf215546Sopenharmony_ci * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys: 852bf215546Sopenharmony_ci * 853bf215546Sopenharmony_ci * ----------------------------------- 854bf215546Sopenharmony_ci * | src1+0 | src1+1 | src1+2 | src1+3 | 855bf215546Sopenharmony_ci * |-----------------------------------| 856bf215546Sopenharmony_ci * |(x0, x1)|(y0, y1)| | | in SIMD8 857bf215546Sopenharmony_ci * |-----------------------------------| 858bf215546Sopenharmony_ci * |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16 859bf215546Sopenharmony_ci * ----------------------------------- 860bf215546Sopenharmony_ci * 861bf215546Sopenharmony_ci * See also: emit_interpolation_setup_gfx4(). 862bf215546Sopenharmony_ci */ 863bf215546Sopenharmony_ci struct brw_reg delta_x = src[0]; 864bf215546Sopenharmony_ci struct brw_reg delta_y = offset(src[0], inst->exec_size / 8); 865bf215546Sopenharmony_ci struct brw_reg interp = src[1]; 866bf215546Sopenharmony_ci brw_inst *i[2]; 867bf215546Sopenharmony_ci 868bf215546Sopenharmony_ci /* nir_lower_interpolation() will do the lowering to MAD instructions for 869bf215546Sopenharmony_ci * us on gfx11+ 870bf215546Sopenharmony_ci */ 871bf215546Sopenharmony_ci assert(devinfo->ver < 11); 872bf215546Sopenharmony_ci 873bf215546Sopenharmony_ci if (devinfo->has_pln) { 874bf215546Sopenharmony_ci if (devinfo->ver <= 6 && (delta_x.nr & 1) != 0) { 875bf215546Sopenharmony_ci /* From the Sandy Bridge PRM Vol. 4, Pt. 2, Section 8.3.53, "Plane": 876bf215546Sopenharmony_ci * 877bf215546Sopenharmony_ci * "[DevSNB]:<src1> must be even register aligned. 878bf215546Sopenharmony_ci * 879bf215546Sopenharmony_ci * This restriction is lifted on Ivy Bridge. 880bf215546Sopenharmony_ci * 881bf215546Sopenharmony_ci * This means that we need to split PLN into LINE+MAC on-the-fly. 882bf215546Sopenharmony_ci * Unfortunately, the inputs are laid out for PLN and not LINE+MAC so 883bf215546Sopenharmony_ci * we have to split into SIMD8 pieces. For gfx4 (!has_pln), the 884bf215546Sopenharmony_ci * coordinate registers are laid out differently so we leave it as a 885bf215546Sopenharmony_ci * SIMD16 instruction. 886bf215546Sopenharmony_ci */ 887bf215546Sopenharmony_ci assert(inst->exec_size == 8 || inst->exec_size == 16); 888bf215546Sopenharmony_ci assert(inst->group % 16 == 0); 889bf215546Sopenharmony_ci 890bf215546Sopenharmony_ci brw_push_insn_state(p); 891bf215546Sopenharmony_ci brw_set_default_exec_size(p, BRW_EXECUTE_8); 892bf215546Sopenharmony_ci 893bf215546Sopenharmony_ci /* Thanks to two accumulators, we can emit all the LINEs and then all 894bf215546Sopenharmony_ci * the MACs. This improves parallelism a bit. 895bf215546Sopenharmony_ci */ 896bf215546Sopenharmony_ci for (unsigned g = 0; g < inst->exec_size / 8; g++) { 897bf215546Sopenharmony_ci brw_inst *line = brw_LINE(p, brw_null_reg(), interp, 898bf215546Sopenharmony_ci offset(delta_x, g * 2)); 899bf215546Sopenharmony_ci brw_inst_set_group(devinfo, line, inst->group + g * 8); 900bf215546Sopenharmony_ci 901bf215546Sopenharmony_ci /* LINE writes the accumulator automatically on gfx4-5. On Sandy 902bf215546Sopenharmony_ci * Bridge and later, we have to explicitly enable it. 903bf215546Sopenharmony_ci */ 904bf215546Sopenharmony_ci if (devinfo->ver >= 6) 905bf215546Sopenharmony_ci brw_inst_set_acc_wr_control(p->devinfo, line, true); 906bf215546Sopenharmony_ci 907bf215546Sopenharmony_ci /* brw_set_default_saturate() is called before emitting 908bf215546Sopenharmony_ci * instructions, so the saturate bit is set in each instruction, 909bf215546Sopenharmony_ci * so we need to unset it on the LINE instructions. 910bf215546Sopenharmony_ci */ 911bf215546Sopenharmony_ci brw_inst_set_saturate(p->devinfo, line, false); 912bf215546Sopenharmony_ci } 913bf215546Sopenharmony_ci 914bf215546Sopenharmony_ci for (unsigned g = 0; g < inst->exec_size / 8; g++) { 915bf215546Sopenharmony_ci brw_inst *mac = brw_MAC(p, offset(dst, g), suboffset(interp, 1), 916bf215546Sopenharmony_ci offset(delta_x, g * 2 + 1)); 917bf215546Sopenharmony_ci brw_inst_set_group(devinfo, mac, inst->group + g * 8); 918bf215546Sopenharmony_ci brw_inst_set_cond_modifier(p->devinfo, mac, inst->conditional_mod); 919bf215546Sopenharmony_ci } 920bf215546Sopenharmony_ci 921bf215546Sopenharmony_ci brw_pop_insn_state(p); 922bf215546Sopenharmony_ci 923bf215546Sopenharmony_ci return true; 924bf215546Sopenharmony_ci } else { 925bf215546Sopenharmony_ci brw_PLN(p, dst, interp, delta_x); 926bf215546Sopenharmony_ci 927bf215546Sopenharmony_ci return false; 928bf215546Sopenharmony_ci } 929bf215546Sopenharmony_ci } else { 930bf215546Sopenharmony_ci i[0] = brw_LINE(p, brw_null_reg(), interp, delta_x); 931bf215546Sopenharmony_ci i[1] = brw_MAC(p, dst, suboffset(interp, 1), delta_y); 932bf215546Sopenharmony_ci 933bf215546Sopenharmony_ci brw_inst_set_cond_modifier(p->devinfo, i[1], inst->conditional_mod); 934bf215546Sopenharmony_ci 935bf215546Sopenharmony_ci /* brw_set_default_saturate() is called before emitting instructions, so 936bf215546Sopenharmony_ci * the saturate bit is set in each instruction, so we need to unset it on 937bf215546Sopenharmony_ci * the first instruction. 938bf215546Sopenharmony_ci */ 939bf215546Sopenharmony_ci brw_inst_set_saturate(p->devinfo, i[0], false); 940bf215546Sopenharmony_ci 941bf215546Sopenharmony_ci return true; 942bf215546Sopenharmony_ci } 943bf215546Sopenharmony_ci} 944bf215546Sopenharmony_ci 945bf215546Sopenharmony_civoid 946bf215546Sopenharmony_cifs_generator::generate_get_buffer_size(fs_inst *inst, 947bf215546Sopenharmony_ci struct brw_reg dst, 948bf215546Sopenharmony_ci struct brw_reg src, 949bf215546Sopenharmony_ci struct brw_reg surf_index) 950bf215546Sopenharmony_ci{ 951bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 952bf215546Sopenharmony_ci assert(surf_index.file == BRW_IMMEDIATE_VALUE); 953bf215546Sopenharmony_ci 954bf215546Sopenharmony_ci uint32_t simd_mode; 955bf215546Sopenharmony_ci int rlen = 4; 956bf215546Sopenharmony_ci 957bf215546Sopenharmony_ci switch (inst->exec_size) { 958bf215546Sopenharmony_ci case 8: 959bf215546Sopenharmony_ci simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 960bf215546Sopenharmony_ci break; 961bf215546Sopenharmony_ci case 16: 962bf215546Sopenharmony_ci simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 963bf215546Sopenharmony_ci break; 964bf215546Sopenharmony_ci default: 965bf215546Sopenharmony_ci unreachable("Invalid width for texture instruction"); 966bf215546Sopenharmony_ci } 967bf215546Sopenharmony_ci 968bf215546Sopenharmony_ci if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) { 969bf215546Sopenharmony_ci rlen = 8; 970bf215546Sopenharmony_ci dst = vec16(dst); 971bf215546Sopenharmony_ci } 972bf215546Sopenharmony_ci 973bf215546Sopenharmony_ci uint32_t return_format = 974bf215546Sopenharmony_ci devinfo->ver >= 8 ? GFX8_SAMPLER_RETURN_FORMAT_32BITS : 975bf215546Sopenharmony_ci BRW_SAMPLER_RETURN_FORMAT_SINT32; 976bf215546Sopenharmony_ci brw_SAMPLE(p, 977bf215546Sopenharmony_ci retype(dst, BRW_REGISTER_TYPE_UW), 978bf215546Sopenharmony_ci inst->base_mrf, 979bf215546Sopenharmony_ci src, 980bf215546Sopenharmony_ci surf_index.ud, 981bf215546Sopenharmony_ci 0, 982bf215546Sopenharmony_ci GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO, 983bf215546Sopenharmony_ci rlen, /* response length */ 984bf215546Sopenharmony_ci inst->mlen, 985bf215546Sopenharmony_ci inst->header_size > 0, 986bf215546Sopenharmony_ci simd_mode, 987bf215546Sopenharmony_ci return_format); 988bf215546Sopenharmony_ci} 989bf215546Sopenharmony_ci 990bf215546Sopenharmony_civoid 991bf215546Sopenharmony_cifs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, 992bf215546Sopenharmony_ci struct brw_reg surface_index, 993bf215546Sopenharmony_ci struct brw_reg sampler_index) 994bf215546Sopenharmony_ci{ 995bf215546Sopenharmony_ci assert(devinfo->ver < 7); 996bf215546Sopenharmony_ci assert(inst->size_written % REG_SIZE == 0); 997bf215546Sopenharmony_ci int msg_type = -1; 998bf215546Sopenharmony_ci uint32_t simd_mode; 999bf215546Sopenharmony_ci uint32_t return_format; 1000bf215546Sopenharmony_ci 1001bf215546Sopenharmony_ci /* Sampler EOT message of less than the dispatch width would kill the 1002bf215546Sopenharmony_ci * thread prematurely. 1003bf215546Sopenharmony_ci */ 1004bf215546Sopenharmony_ci assert(!inst->eot || inst->exec_size == dispatch_width); 1005bf215546Sopenharmony_ci 1006bf215546Sopenharmony_ci switch (dst.type) { 1007bf215546Sopenharmony_ci case BRW_REGISTER_TYPE_D: 1008bf215546Sopenharmony_ci return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32; 1009bf215546Sopenharmony_ci break; 1010bf215546Sopenharmony_ci case BRW_REGISTER_TYPE_UD: 1011bf215546Sopenharmony_ci return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32; 1012bf215546Sopenharmony_ci break; 1013bf215546Sopenharmony_ci default: 1014bf215546Sopenharmony_ci return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32; 1015bf215546Sopenharmony_ci break; 1016bf215546Sopenharmony_ci } 1017bf215546Sopenharmony_ci 1018bf215546Sopenharmony_ci /* Stomp the resinfo output type to UINT32. On gens 4-5, the output type 1019bf215546Sopenharmony_ci * is set as part of the message descriptor. On gfx4, the PRM seems to 1020bf215546Sopenharmony_ci * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on 1021bf215546Sopenharmony_ci * later gens UINT32 is required. Once you hit Sandy Bridge, the bit is 1022bf215546Sopenharmony_ci * gone from the message descriptor entirely and you just get UINT32 all 1023bf215546Sopenharmony_ci * the time regasrdless. Since we can really only do non-UINT32 on gfx4, 1024bf215546Sopenharmony_ci * just stomp it to UINT32 all the time. 1025bf215546Sopenharmony_ci */ 1026bf215546Sopenharmony_ci if (inst->opcode == SHADER_OPCODE_TXS) 1027bf215546Sopenharmony_ci return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32; 1028bf215546Sopenharmony_ci 1029bf215546Sopenharmony_ci switch (inst->exec_size) { 1030bf215546Sopenharmony_ci case 8: 1031bf215546Sopenharmony_ci simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 1032bf215546Sopenharmony_ci break; 1033bf215546Sopenharmony_ci case 16: 1034bf215546Sopenharmony_ci simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1035bf215546Sopenharmony_ci break; 1036bf215546Sopenharmony_ci default: 1037bf215546Sopenharmony_ci unreachable("Invalid width for texture instruction"); 1038bf215546Sopenharmony_ci } 1039bf215546Sopenharmony_ci 1040bf215546Sopenharmony_ci if (devinfo->ver >= 5) { 1041bf215546Sopenharmony_ci switch (inst->opcode) { 1042bf215546Sopenharmony_ci case SHADER_OPCODE_TEX: 1043bf215546Sopenharmony_ci if (inst->shadow_compare) { 1044bf215546Sopenharmony_ci msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE; 1045bf215546Sopenharmony_ci } else { 1046bf215546Sopenharmony_ci msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE; 1047bf215546Sopenharmony_ci } 1048bf215546Sopenharmony_ci break; 1049bf215546Sopenharmony_ci case FS_OPCODE_TXB: 1050bf215546Sopenharmony_ci if (inst->shadow_compare) { 1051bf215546Sopenharmony_ci msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE; 1052bf215546Sopenharmony_ci } else { 1053bf215546Sopenharmony_ci msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS; 1054bf215546Sopenharmony_ci } 1055bf215546Sopenharmony_ci break; 1056bf215546Sopenharmony_ci case SHADER_OPCODE_TXL: 1057bf215546Sopenharmony_ci if (inst->shadow_compare) { 1058bf215546Sopenharmony_ci msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE; 1059bf215546Sopenharmony_ci } else { 1060bf215546Sopenharmony_ci msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD; 1061bf215546Sopenharmony_ci } 1062bf215546Sopenharmony_ci break; 1063bf215546Sopenharmony_ci case SHADER_OPCODE_TXS: 1064bf215546Sopenharmony_ci msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO; 1065bf215546Sopenharmony_ci break; 1066bf215546Sopenharmony_ci case SHADER_OPCODE_TXD: 1067bf215546Sopenharmony_ci assert(!inst->shadow_compare); 1068bf215546Sopenharmony_ci msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS; 1069bf215546Sopenharmony_ci break; 1070bf215546Sopenharmony_ci case SHADER_OPCODE_TXF: 1071bf215546Sopenharmony_ci msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD; 1072bf215546Sopenharmony_ci break; 1073bf215546Sopenharmony_ci case SHADER_OPCODE_TXF_CMS: 1074bf215546Sopenharmony_ci msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD; 1075bf215546Sopenharmony_ci break; 1076bf215546Sopenharmony_ci case SHADER_OPCODE_LOD: 1077bf215546Sopenharmony_ci msg_type = GFX5_SAMPLER_MESSAGE_LOD; 1078bf215546Sopenharmony_ci break; 1079bf215546Sopenharmony_ci case SHADER_OPCODE_TG4: 1080bf215546Sopenharmony_ci assert(devinfo->ver == 6); 1081bf215546Sopenharmony_ci assert(!inst->shadow_compare); 1082bf215546Sopenharmony_ci msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4; 1083bf215546Sopenharmony_ci break; 1084bf215546Sopenharmony_ci case SHADER_OPCODE_SAMPLEINFO: 1085bf215546Sopenharmony_ci msg_type = GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO; 1086bf215546Sopenharmony_ci break; 1087bf215546Sopenharmony_ci default: 1088bf215546Sopenharmony_ci unreachable("not reached"); 1089bf215546Sopenharmony_ci } 1090bf215546Sopenharmony_ci } else { 1091bf215546Sopenharmony_ci switch (inst->opcode) { 1092bf215546Sopenharmony_ci case SHADER_OPCODE_TEX: 1093bf215546Sopenharmony_ci /* Note that G45 and older determines shadow compare and dispatch width 1094bf215546Sopenharmony_ci * from message length for most messages. 1095bf215546Sopenharmony_ci */ 1096bf215546Sopenharmony_ci if (inst->exec_size == 8) { 1097bf215546Sopenharmony_ci msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE; 1098bf215546Sopenharmony_ci if (inst->shadow_compare) { 1099bf215546Sopenharmony_ci assert(inst->mlen == 6); 1100bf215546Sopenharmony_ci } else { 1101bf215546Sopenharmony_ci assert(inst->mlen <= 4); 1102bf215546Sopenharmony_ci } 1103bf215546Sopenharmony_ci } else { 1104bf215546Sopenharmony_ci if (inst->shadow_compare) { 1105bf215546Sopenharmony_ci msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE; 1106bf215546Sopenharmony_ci assert(inst->mlen == 9); 1107bf215546Sopenharmony_ci } else { 1108bf215546Sopenharmony_ci msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE; 1109bf215546Sopenharmony_ci assert(inst->mlen <= 7 && inst->mlen % 2 == 1); 1110bf215546Sopenharmony_ci } 1111bf215546Sopenharmony_ci } 1112bf215546Sopenharmony_ci break; 1113bf215546Sopenharmony_ci case FS_OPCODE_TXB: 1114bf215546Sopenharmony_ci if (inst->shadow_compare) { 1115bf215546Sopenharmony_ci assert(inst->exec_size == 8); 1116bf215546Sopenharmony_ci assert(inst->mlen == 6); 1117bf215546Sopenharmony_ci msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE; 1118bf215546Sopenharmony_ci } else { 1119bf215546Sopenharmony_ci assert(inst->mlen == 9); 1120bf215546Sopenharmony_ci msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; 1121bf215546Sopenharmony_ci simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1122bf215546Sopenharmony_ci } 1123bf215546Sopenharmony_ci break; 1124bf215546Sopenharmony_ci case SHADER_OPCODE_TXL: 1125bf215546Sopenharmony_ci if (inst->shadow_compare) { 1126bf215546Sopenharmony_ci assert(inst->exec_size == 8); 1127bf215546Sopenharmony_ci assert(inst->mlen == 6); 1128bf215546Sopenharmony_ci msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE; 1129bf215546Sopenharmony_ci } else { 1130bf215546Sopenharmony_ci assert(inst->mlen == 9); 1131bf215546Sopenharmony_ci msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD; 1132bf215546Sopenharmony_ci simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1133bf215546Sopenharmony_ci } 1134bf215546Sopenharmony_ci break; 1135bf215546Sopenharmony_ci case SHADER_OPCODE_TXD: 1136bf215546Sopenharmony_ci /* There is no sample_d_c message; comparisons are done manually */ 1137bf215546Sopenharmony_ci assert(inst->exec_size == 8); 1138bf215546Sopenharmony_ci assert(inst->mlen == 7 || inst->mlen == 10); 1139bf215546Sopenharmony_ci msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS; 1140bf215546Sopenharmony_ci break; 1141bf215546Sopenharmony_ci case SHADER_OPCODE_TXF: 1142bf215546Sopenharmony_ci assert(inst->mlen <= 9 && inst->mlen % 2 == 1); 1143bf215546Sopenharmony_ci msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD; 1144bf215546Sopenharmony_ci simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1145bf215546Sopenharmony_ci break; 1146bf215546Sopenharmony_ci case SHADER_OPCODE_TXS: 1147bf215546Sopenharmony_ci assert(inst->mlen == 3); 1148bf215546Sopenharmony_ci msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO; 1149bf215546Sopenharmony_ci simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1150bf215546Sopenharmony_ci break; 1151bf215546Sopenharmony_ci default: 1152bf215546Sopenharmony_ci unreachable("not reached"); 1153bf215546Sopenharmony_ci } 1154bf215546Sopenharmony_ci } 1155bf215546Sopenharmony_ci assert(msg_type != -1); 1156bf215546Sopenharmony_ci 1157bf215546Sopenharmony_ci if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) { 1158bf215546Sopenharmony_ci dst = vec16(dst); 1159bf215546Sopenharmony_ci } 1160bf215546Sopenharmony_ci 1161bf215546Sopenharmony_ci assert(sampler_index.type == BRW_REGISTER_TYPE_UD); 1162bf215546Sopenharmony_ci 1163bf215546Sopenharmony_ci /* Load the message header if present. If there's a texture offset, 1164bf215546Sopenharmony_ci * we need to set it up explicitly and load the offset bitfield. 1165bf215546Sopenharmony_ci * Otherwise, we can use an implied move from g0 to the first message reg. 1166bf215546Sopenharmony_ci */ 1167bf215546Sopenharmony_ci struct brw_reg src = brw_null_reg(); 1168bf215546Sopenharmony_ci if (inst->header_size != 0) { 1169bf215546Sopenharmony_ci if (devinfo->ver < 6 && !inst->offset) { 1170bf215546Sopenharmony_ci /* Set up an implied move from g0 to the MRF. */ 1171bf215546Sopenharmony_ci src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); 1172bf215546Sopenharmony_ci } else { 1173bf215546Sopenharmony_ci const tgl_swsb swsb = brw_get_default_swsb(p); 1174bf215546Sopenharmony_ci assert(inst->base_mrf != -1); 1175bf215546Sopenharmony_ci struct brw_reg header_reg = brw_message_reg(inst->base_mrf); 1176bf215546Sopenharmony_ci 1177bf215546Sopenharmony_ci brw_push_insn_state(p); 1178bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); 1179bf215546Sopenharmony_ci brw_set_default_exec_size(p, BRW_EXECUTE_8); 1180bf215546Sopenharmony_ci brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1181bf215546Sopenharmony_ci brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 1182bf215546Sopenharmony_ci /* Explicitly set up the message header by copying g0 to the MRF. */ 1183bf215546Sopenharmony_ci brw_MOV(p, header_reg, brw_vec8_grf(0, 0)); 1184bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_regdist(1)); 1185bf215546Sopenharmony_ci 1186bf215546Sopenharmony_ci brw_set_default_exec_size(p, BRW_EXECUTE_1); 1187bf215546Sopenharmony_ci if (inst->offset) { 1188bf215546Sopenharmony_ci /* Set the offset bits in DWord 2. */ 1189bf215546Sopenharmony_ci brw_MOV(p, get_element_ud(header_reg, 2), 1190bf215546Sopenharmony_ci brw_imm_ud(inst->offset)); 1191bf215546Sopenharmony_ci } 1192bf215546Sopenharmony_ci 1193bf215546Sopenharmony_ci brw_pop_insn_state(p); 1194bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); 1195bf215546Sopenharmony_ci } 1196bf215546Sopenharmony_ci } 1197bf215546Sopenharmony_ci 1198bf215546Sopenharmony_ci assert(surface_index.file == BRW_IMMEDIATE_VALUE); 1199bf215546Sopenharmony_ci assert(sampler_index.file == BRW_IMMEDIATE_VALUE); 1200bf215546Sopenharmony_ci 1201bf215546Sopenharmony_ci brw_SAMPLE(p, 1202bf215546Sopenharmony_ci retype(dst, BRW_REGISTER_TYPE_UW), 1203bf215546Sopenharmony_ci inst->base_mrf, 1204bf215546Sopenharmony_ci src, 1205bf215546Sopenharmony_ci surface_index.ud, 1206bf215546Sopenharmony_ci sampler_index.ud % 16, 1207bf215546Sopenharmony_ci msg_type, 1208bf215546Sopenharmony_ci inst->size_written / REG_SIZE, 1209bf215546Sopenharmony_ci inst->mlen, 1210bf215546Sopenharmony_ci inst->header_size != 0, 1211bf215546Sopenharmony_ci simd_mode, 1212bf215546Sopenharmony_ci return_format); 1213bf215546Sopenharmony_ci} 1214bf215546Sopenharmony_ci 1215bf215546Sopenharmony_ci 1216bf215546Sopenharmony_ci/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input 1217bf215546Sopenharmony_ci * looking like: 1218bf215546Sopenharmony_ci * 1219bf215546Sopenharmony_ci * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br 1220bf215546Sopenharmony_ci * 1221bf215546Sopenharmony_ci * Ideally, we want to produce: 1222bf215546Sopenharmony_ci * 1223bf215546Sopenharmony_ci * DDX DDY 1224bf215546Sopenharmony_ci * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) 1225bf215546Sopenharmony_ci * (ss0.tr - ss0.tl) (ss0.tr - ss0.br) 1226bf215546Sopenharmony_ci * (ss0.br - ss0.bl) (ss0.tl - ss0.bl) 1227bf215546Sopenharmony_ci * (ss0.br - ss0.bl) (ss0.tr - ss0.br) 1228bf215546Sopenharmony_ci * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl) 1229bf215546Sopenharmony_ci * (ss1.tr - ss1.tl) (ss1.tr - ss1.br) 1230bf215546Sopenharmony_ci * (ss1.br - ss1.bl) (ss1.tl - ss1.bl) 1231bf215546Sopenharmony_ci * (ss1.br - ss1.bl) (ss1.tr - ss1.br) 1232bf215546Sopenharmony_ci * 1233bf215546Sopenharmony_ci * and add another set of two more subspans if in 16-pixel dispatch mode. 1234bf215546Sopenharmony_ci * 1235bf215546Sopenharmony_ci * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result 1236bf215546Sopenharmony_ci * for each pair, and vertstride = 2 jumps us 2 elements after processing a 1237bf215546Sopenharmony_ci * pair. But the ideal approximation may impose a huge performance cost on 1238bf215546Sopenharmony_ci * sample_d. On at least Haswell, sample_d instruction does some 1239bf215546Sopenharmony_ci * optimizations if the same LOD is used for all pixels in the subspan. 1240bf215546Sopenharmony_ci * 1241bf215546Sopenharmony_ci * For DDY, we need to use ALIGN16 mode since it's capable of doing the 1242bf215546Sopenharmony_ci * appropriate swizzling. 1243bf215546Sopenharmony_ci */ 1244bf215546Sopenharmony_civoid 1245bf215546Sopenharmony_cifs_generator::generate_ddx(const fs_inst *inst, 1246bf215546Sopenharmony_ci struct brw_reg dst, struct brw_reg src) 1247bf215546Sopenharmony_ci{ 1248bf215546Sopenharmony_ci unsigned vstride, width; 1249bf215546Sopenharmony_ci 1250bf215546Sopenharmony_ci if (devinfo->ver >= 8) { 1251bf215546Sopenharmony_ci if (inst->opcode == FS_OPCODE_DDX_FINE) { 1252bf215546Sopenharmony_ci /* produce accurate derivatives */ 1253bf215546Sopenharmony_ci vstride = BRW_VERTICAL_STRIDE_2; 1254bf215546Sopenharmony_ci width = BRW_WIDTH_2; 1255bf215546Sopenharmony_ci } else { 1256bf215546Sopenharmony_ci /* replicate the derivative at the top-left pixel to other pixels */ 1257bf215546Sopenharmony_ci vstride = BRW_VERTICAL_STRIDE_4; 1258bf215546Sopenharmony_ci width = BRW_WIDTH_4; 1259bf215546Sopenharmony_ci } 1260bf215546Sopenharmony_ci 1261bf215546Sopenharmony_ci struct brw_reg src0 = byte_offset(src, type_sz(src.type));; 1262bf215546Sopenharmony_ci struct brw_reg src1 = src; 1263bf215546Sopenharmony_ci 1264bf215546Sopenharmony_ci src0.vstride = vstride; 1265bf215546Sopenharmony_ci src0.width = width; 1266bf215546Sopenharmony_ci src0.hstride = BRW_HORIZONTAL_STRIDE_0; 1267bf215546Sopenharmony_ci src1.vstride = vstride; 1268bf215546Sopenharmony_ci src1.width = width; 1269bf215546Sopenharmony_ci src1.hstride = BRW_HORIZONTAL_STRIDE_0; 1270bf215546Sopenharmony_ci 1271bf215546Sopenharmony_ci brw_ADD(p, dst, src0, negate(src1)); 1272bf215546Sopenharmony_ci } else { 1273bf215546Sopenharmony_ci /* On Haswell and earlier, the region used above appears to not work 1274bf215546Sopenharmony_ci * correctly for compressed instructions. At least on Haswell and 1275bf215546Sopenharmony_ci * Iron Lake, compressed ALIGN16 instructions do work. Since we 1276bf215546Sopenharmony_ci * would have to split to SIMD8 no matter which method we choose, we 1277bf215546Sopenharmony_ci * may as well use ALIGN16 on all platforms gfx7 and earlier. 1278bf215546Sopenharmony_ci */ 1279bf215546Sopenharmony_ci struct brw_reg src0 = stride(src, 4, 4, 1); 1280bf215546Sopenharmony_ci struct brw_reg src1 = stride(src, 4, 4, 1); 1281bf215546Sopenharmony_ci if (inst->opcode == FS_OPCODE_DDX_FINE) { 1282bf215546Sopenharmony_ci src0.swizzle = BRW_SWIZZLE_XXZZ; 1283bf215546Sopenharmony_ci src1.swizzle = BRW_SWIZZLE_YYWW; 1284bf215546Sopenharmony_ci } else { 1285bf215546Sopenharmony_ci src0.swizzle = BRW_SWIZZLE_XXXX; 1286bf215546Sopenharmony_ci src1.swizzle = BRW_SWIZZLE_YYYY; 1287bf215546Sopenharmony_ci } 1288bf215546Sopenharmony_ci 1289bf215546Sopenharmony_ci brw_push_insn_state(p); 1290bf215546Sopenharmony_ci brw_set_default_access_mode(p, BRW_ALIGN_16); 1291bf215546Sopenharmony_ci brw_ADD(p, dst, negate(src0), src1); 1292bf215546Sopenharmony_ci brw_pop_insn_state(p); 1293bf215546Sopenharmony_ci } 1294bf215546Sopenharmony_ci} 1295bf215546Sopenharmony_ci 1296bf215546Sopenharmony_ci/* The negate_value boolean is used to negate the derivative computation for 1297bf215546Sopenharmony_ci * FBOs, since they place the origin at the upper left instead of the lower 1298bf215546Sopenharmony_ci * left. 1299bf215546Sopenharmony_ci */ 1300bf215546Sopenharmony_civoid 1301bf215546Sopenharmony_cifs_generator::generate_ddy(const fs_inst *inst, 1302bf215546Sopenharmony_ci struct brw_reg dst, struct brw_reg src) 1303bf215546Sopenharmony_ci{ 1304bf215546Sopenharmony_ci const uint32_t type_size = type_sz(src.type); 1305bf215546Sopenharmony_ci 1306bf215546Sopenharmony_ci if (inst->opcode == FS_OPCODE_DDY_FINE) { 1307bf215546Sopenharmony_ci /* produce accurate derivatives. 1308bf215546Sopenharmony_ci * 1309bf215546Sopenharmony_ci * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU) 1310bf215546Sopenharmony_ci * "Register Region Restrictions", Section "1. Special Restrictions": 1311bf215546Sopenharmony_ci * 1312bf215546Sopenharmony_ci * "In Align16 mode, the channel selects and channel enables apply to 1313bf215546Sopenharmony_ci * a pair of half-floats, because these parameters are defined for 1314bf215546Sopenharmony_ci * DWord elements ONLY. This is applicable when both source and 1315bf215546Sopenharmony_ci * destination are half-floats." 1316bf215546Sopenharmony_ci * 1317bf215546Sopenharmony_ci * So for half-float operations we use the Gfx11+ Align1 path. CHV 1318bf215546Sopenharmony_ci * inherits its FP16 hardware from SKL, so it is not affected. 1319bf215546Sopenharmony_ci */ 1320bf215546Sopenharmony_ci if (devinfo->ver >= 11 || 1321bf215546Sopenharmony_ci (devinfo->platform == INTEL_PLATFORM_BDW && src.type == BRW_REGISTER_TYPE_HF)) { 1322bf215546Sopenharmony_ci src = stride(src, 0, 2, 1); 1323bf215546Sopenharmony_ci 1324bf215546Sopenharmony_ci brw_push_insn_state(p); 1325bf215546Sopenharmony_ci brw_set_default_exec_size(p, BRW_EXECUTE_4); 1326bf215546Sopenharmony_ci for (uint32_t g = 0; g < inst->exec_size; g += 4) { 1327bf215546Sopenharmony_ci brw_set_default_group(p, inst->group + g); 1328bf215546Sopenharmony_ci brw_ADD(p, byte_offset(dst, g * type_size), 1329bf215546Sopenharmony_ci negate(byte_offset(src, g * type_size)), 1330bf215546Sopenharmony_ci byte_offset(src, (g + 2) * type_size)); 1331bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_null()); 1332bf215546Sopenharmony_ci } 1333bf215546Sopenharmony_ci brw_pop_insn_state(p); 1334bf215546Sopenharmony_ci } else { 1335bf215546Sopenharmony_ci struct brw_reg src0 = stride(src, 4, 4, 1); 1336bf215546Sopenharmony_ci struct brw_reg src1 = stride(src, 4, 4, 1); 1337bf215546Sopenharmony_ci src0.swizzle = BRW_SWIZZLE_XYXY; 1338bf215546Sopenharmony_ci src1.swizzle = BRW_SWIZZLE_ZWZW; 1339bf215546Sopenharmony_ci 1340bf215546Sopenharmony_ci brw_push_insn_state(p); 1341bf215546Sopenharmony_ci brw_set_default_access_mode(p, BRW_ALIGN_16); 1342bf215546Sopenharmony_ci brw_ADD(p, dst, negate(src0), src1); 1343bf215546Sopenharmony_ci brw_pop_insn_state(p); 1344bf215546Sopenharmony_ci } 1345bf215546Sopenharmony_ci } else { 1346bf215546Sopenharmony_ci /* replicate the derivative at the top-left pixel to other pixels */ 1347bf215546Sopenharmony_ci if (devinfo->ver >= 8) { 1348bf215546Sopenharmony_ci struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size); 1349bf215546Sopenharmony_ci struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size); 1350bf215546Sopenharmony_ci 1351bf215546Sopenharmony_ci brw_ADD(p, dst, negate(src0), src1); 1352bf215546Sopenharmony_ci } else { 1353bf215546Sopenharmony_ci /* On Haswell and earlier, the region used above appears to not work 1354bf215546Sopenharmony_ci * correctly for compressed instructions. At least on Haswell and 1355bf215546Sopenharmony_ci * Iron Lake, compressed ALIGN16 instructions do work. Since we 1356bf215546Sopenharmony_ci * would have to split to SIMD8 no matter which method we choose, we 1357bf215546Sopenharmony_ci * may as well use ALIGN16 on all platforms gfx7 and earlier. 1358bf215546Sopenharmony_ci */ 1359bf215546Sopenharmony_ci struct brw_reg src0 = stride(src, 4, 4, 1); 1360bf215546Sopenharmony_ci struct brw_reg src1 = stride(src, 4, 4, 1); 1361bf215546Sopenharmony_ci src0.swizzle = BRW_SWIZZLE_XXXX; 1362bf215546Sopenharmony_ci src1.swizzle = BRW_SWIZZLE_ZZZZ; 1363bf215546Sopenharmony_ci 1364bf215546Sopenharmony_ci brw_push_insn_state(p); 1365bf215546Sopenharmony_ci brw_set_default_access_mode(p, BRW_ALIGN_16); 1366bf215546Sopenharmony_ci brw_ADD(p, dst, negate(src0), src1); 1367bf215546Sopenharmony_ci brw_pop_insn_state(p); 1368bf215546Sopenharmony_ci } 1369bf215546Sopenharmony_ci } 1370bf215546Sopenharmony_ci} 1371bf215546Sopenharmony_ci 1372bf215546Sopenharmony_civoid 1373bf215546Sopenharmony_cifs_generator::generate_halt(fs_inst *) 1374bf215546Sopenharmony_ci{ 1375bf215546Sopenharmony_ci /* This HALT will be patched up at FB write time to point UIP at the end of 1376bf215546Sopenharmony_ci * the program, and at brw_uip_jip() JIP will be set to the end of the 1377bf215546Sopenharmony_ci * current block (or the program). 1378bf215546Sopenharmony_ci */ 1379bf215546Sopenharmony_ci this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn)); 1380bf215546Sopenharmony_ci brw_HALT(p); 1381bf215546Sopenharmony_ci} 1382bf215546Sopenharmony_ci 1383bf215546Sopenharmony_civoid 1384bf215546Sopenharmony_cifs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src) 1385bf215546Sopenharmony_ci{ 1386bf215546Sopenharmony_ci /* The 32-wide messages only respect the first 16-wide half of the channel 1387bf215546Sopenharmony_ci * enable signals which are replicated identically for the second group of 1388bf215546Sopenharmony_ci * 16 channels, so we cannot use them unless the write is marked 1389bf215546Sopenharmony_ci * force_writemask_all. 1390bf215546Sopenharmony_ci */ 1391bf215546Sopenharmony_ci const unsigned lower_size = inst->force_writemask_all ? inst->exec_size : 1392bf215546Sopenharmony_ci MIN2(16, inst->exec_size); 1393bf215546Sopenharmony_ci const unsigned block_size = 4 * lower_size / REG_SIZE; 1394bf215546Sopenharmony_ci const tgl_swsb swsb = brw_get_default_swsb(p); 1395bf215546Sopenharmony_ci assert(inst->mlen != 0); 1396bf215546Sopenharmony_ci 1397bf215546Sopenharmony_ci brw_push_insn_state(p); 1398bf215546Sopenharmony_ci brw_set_default_exec_size(p, cvt(lower_size) - 1); 1399bf215546Sopenharmony_ci brw_set_default_compression(p, lower_size > 8); 1400bf215546Sopenharmony_ci 1401bf215546Sopenharmony_ci for (unsigned i = 0; i < inst->exec_size / lower_size; i++) { 1402bf215546Sopenharmony_ci brw_set_default_group(p, inst->group + lower_size * i); 1403bf215546Sopenharmony_ci 1404bf215546Sopenharmony_ci if (i > 0) { 1405bf215546Sopenharmony_ci assert(swsb.mode & TGL_SBID_SET); 1406bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_SRC, swsb.sbid)); 1407bf215546Sopenharmony_ci } else { 1408bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); 1409bf215546Sopenharmony_ci } 1410bf215546Sopenharmony_ci 1411bf215546Sopenharmony_ci brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0), 1412bf215546Sopenharmony_ci retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD)); 1413bf215546Sopenharmony_ci 1414bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); 1415bf215546Sopenharmony_ci brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1416bf215546Sopenharmony_ci block_size, 1417bf215546Sopenharmony_ci inst->offset + block_size * REG_SIZE * i); 1418bf215546Sopenharmony_ci } 1419bf215546Sopenharmony_ci 1420bf215546Sopenharmony_ci brw_pop_insn_state(p); 1421bf215546Sopenharmony_ci} 1422bf215546Sopenharmony_ci 1423bf215546Sopenharmony_civoid 1424bf215546Sopenharmony_cifs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst) 1425bf215546Sopenharmony_ci{ 1426bf215546Sopenharmony_ci assert(inst->exec_size <= 16 || inst->force_writemask_all); 1427bf215546Sopenharmony_ci assert(inst->mlen != 0); 1428bf215546Sopenharmony_ci 1429bf215546Sopenharmony_ci brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1430bf215546Sopenharmony_ci inst->exec_size / 8, inst->offset); 1431bf215546Sopenharmony_ci} 1432bf215546Sopenharmony_ci 1433bf215546Sopenharmony_civoid 1434bf215546Sopenharmony_cifs_generator::generate_scratch_read_gfx7(fs_inst *inst, struct brw_reg dst) 1435bf215546Sopenharmony_ci{ 1436bf215546Sopenharmony_ci assert(inst->exec_size <= 16 || inst->force_writemask_all); 1437bf215546Sopenharmony_ci 1438bf215546Sopenharmony_ci gfx7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset); 1439bf215546Sopenharmony_ci} 1440bf215546Sopenharmony_ci 1441bf215546Sopenharmony_ci/* The A32 messages take a buffer base address in header.5:[31:0] (See 1442bf215546Sopenharmony_ci * MH1_A32_PSM for typed messages or MH_A32_GO for byte/dword scattered 1443bf215546Sopenharmony_ci * and OWord block messages in the SKL PRM Vol. 2d for more details.) 1444bf215546Sopenharmony_ci * Unfortunately, there are a number of subtle differences: 1445bf215546Sopenharmony_ci * 1446bf215546Sopenharmony_ci * For the block read/write messages: 1447bf215546Sopenharmony_ci * 1448bf215546Sopenharmony_ci * - We always stomp header.2 to fill in the actual scratch address (in 1449bf215546Sopenharmony_ci * units of OWORDs) so we don't care what's in there. 1450bf215546Sopenharmony_ci * 1451bf215546Sopenharmony_ci * - They rely on per-thread scratch space value in header.3[3:0] to do 1452bf215546Sopenharmony_ci * bounds checking so that needs to be valid. The upper bits of 1453bf215546Sopenharmony_ci * header.3 are ignored, though, so we can copy all of g0.3. 1454bf215546Sopenharmony_ci * 1455bf215546Sopenharmony_ci * - They ignore header.5[9:0] and assumes the address is 1KB aligned. 1456bf215546Sopenharmony_ci * 1457bf215546Sopenharmony_ci * 1458bf215546Sopenharmony_ci * For the byte/dword scattered read/write messages: 1459bf215546Sopenharmony_ci * 1460bf215546Sopenharmony_ci * - We want header.2 to be zero because that gets added to the per-channel 1461bf215546Sopenharmony_ci * offset in the non-header portion of the message. 1462bf215546Sopenharmony_ci * 1463bf215546Sopenharmony_ci * - Contrary to what the docs claim, they don't do any bounds checking so 1464bf215546Sopenharmony_ci * the value of header.3[3:0] doesn't matter. 1465bf215546Sopenharmony_ci * 1466bf215546Sopenharmony_ci * - They consider all of header.5 for the base address and header.5[9:0] 1467bf215546Sopenharmony_ci * are not ignored. This means that we can't copy g0.5 verbatim because 1468bf215546Sopenharmony_ci * g0.5[9:0] contains the FFTID on most platforms. Instead, we have to 1469bf215546Sopenharmony_ci * use an AND to mask off the bottom 10 bits. 1470bf215546Sopenharmony_ci * 1471bf215546Sopenharmony_ci * 1472bf215546Sopenharmony_ci * For block messages, just copying g0 gives a valid header because all the 1473bf215546Sopenharmony_ci * garbage gets ignored except for header.2 which we stomp as part of message 1474bf215546Sopenharmony_ci * setup. For byte/dword scattered messages, we can just zero out the header 1475bf215546Sopenharmony_ci * and copy over the bits we need from g0.5. This opcode, however, tries to 1476bf215546Sopenharmony_ci * satisfy the requirements of both by starting with 0 and filling out the 1477bf215546Sopenharmony_ci * information required by either set of opcodes. 1478bf215546Sopenharmony_ci */ 1479bf215546Sopenharmony_civoid 1480bf215546Sopenharmony_cifs_generator::generate_scratch_header(fs_inst *inst, struct brw_reg dst) 1481bf215546Sopenharmony_ci{ 1482bf215546Sopenharmony_ci assert(inst->exec_size == 8 && inst->force_writemask_all); 1483bf215546Sopenharmony_ci assert(dst.file == BRW_GENERAL_REGISTER_FILE); 1484bf215546Sopenharmony_ci 1485bf215546Sopenharmony_ci dst.type = BRW_REGISTER_TYPE_UD; 1486bf215546Sopenharmony_ci 1487bf215546Sopenharmony_ci brw_inst *insn = brw_MOV(p, dst, brw_imm_ud(0)); 1488bf215546Sopenharmony_ci if (devinfo->ver >= 12) 1489bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_null()); 1490bf215546Sopenharmony_ci else 1491bf215546Sopenharmony_ci brw_inst_set_no_dd_clear(p->devinfo, insn, true); 1492bf215546Sopenharmony_ci 1493bf215546Sopenharmony_ci /* Copy the per-thread scratch space size from g0.3[3:0] */ 1494bf215546Sopenharmony_ci brw_set_default_exec_size(p, BRW_EXECUTE_1); 1495bf215546Sopenharmony_ci insn = brw_AND(p, suboffset(dst, 3), 1496bf215546Sopenharmony_ci retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD), 1497bf215546Sopenharmony_ci brw_imm_ud(INTEL_MASK(3, 0))); 1498bf215546Sopenharmony_ci if (devinfo->ver < 12) { 1499bf215546Sopenharmony_ci brw_inst_set_no_dd_clear(p->devinfo, insn, true); 1500bf215546Sopenharmony_ci brw_inst_set_no_dd_check(p->devinfo, insn, true); 1501bf215546Sopenharmony_ci } 1502bf215546Sopenharmony_ci 1503bf215546Sopenharmony_ci /* Copy the scratch base address from g0.5[31:10] */ 1504bf215546Sopenharmony_ci insn = brw_AND(p, suboffset(dst, 5), 1505bf215546Sopenharmony_ci retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), 1506bf215546Sopenharmony_ci brw_imm_ud(INTEL_MASK(31, 10))); 1507bf215546Sopenharmony_ci if (devinfo->ver < 12) 1508bf215546Sopenharmony_ci brw_inst_set_no_dd_check(p->devinfo, insn, true); 1509bf215546Sopenharmony_ci} 1510bf215546Sopenharmony_ci 1511bf215546Sopenharmony_civoid 1512bf215546Sopenharmony_cifs_generator::generate_uniform_pull_constant_load(fs_inst *inst, 1513bf215546Sopenharmony_ci struct brw_reg dst, 1514bf215546Sopenharmony_ci struct brw_reg index, 1515bf215546Sopenharmony_ci struct brw_reg offset) 1516bf215546Sopenharmony_ci{ 1517bf215546Sopenharmony_ci assert(type_sz(dst.type) == 4); 1518bf215546Sopenharmony_ci assert(inst->mlen != 0); 1519bf215546Sopenharmony_ci 1520bf215546Sopenharmony_ci assert(index.file == BRW_IMMEDIATE_VALUE && 1521bf215546Sopenharmony_ci index.type == BRW_REGISTER_TYPE_UD); 1522bf215546Sopenharmony_ci uint32_t surf_index = index.ud; 1523bf215546Sopenharmony_ci 1524bf215546Sopenharmony_ci assert(offset.file == BRW_IMMEDIATE_VALUE && 1525bf215546Sopenharmony_ci offset.type == BRW_REGISTER_TYPE_UD); 1526bf215546Sopenharmony_ci uint32_t read_offset = offset.ud; 1527bf215546Sopenharmony_ci 1528bf215546Sopenharmony_ci brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), 1529bf215546Sopenharmony_ci read_offset, surf_index); 1530bf215546Sopenharmony_ci} 1531bf215546Sopenharmony_ci 1532bf215546Sopenharmony_civoid 1533bf215546Sopenharmony_cifs_generator::generate_uniform_pull_constant_load_gfx7(fs_inst *inst, 1534bf215546Sopenharmony_ci struct brw_reg dst, 1535bf215546Sopenharmony_ci struct brw_reg index, 1536bf215546Sopenharmony_ci struct brw_reg payload) 1537bf215546Sopenharmony_ci{ 1538bf215546Sopenharmony_ci assert(index.type == BRW_REGISTER_TYPE_UD); 1539bf215546Sopenharmony_ci assert(payload.file == BRW_GENERAL_REGISTER_FILE); 1540bf215546Sopenharmony_ci assert(type_sz(dst.type) == 4); 1541bf215546Sopenharmony_ci assert(!devinfo->has_lsc); 1542bf215546Sopenharmony_ci 1543bf215546Sopenharmony_ci if (index.file == BRW_IMMEDIATE_VALUE) { 1544bf215546Sopenharmony_ci const uint32_t surf_index = index.ud; 1545bf215546Sopenharmony_ci 1546bf215546Sopenharmony_ci brw_push_insn_state(p); 1547bf215546Sopenharmony_ci brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1548bf215546Sopenharmony_ci brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 1549bf215546Sopenharmony_ci brw_pop_insn_state(p); 1550bf215546Sopenharmony_ci 1551bf215546Sopenharmony_ci brw_inst_set_sfid(devinfo, send, GFX6_SFID_DATAPORT_CONSTANT_CACHE); 1552bf215546Sopenharmony_ci brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD)); 1553bf215546Sopenharmony_ci brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD)); 1554bf215546Sopenharmony_ci brw_set_desc(p, send, 1555bf215546Sopenharmony_ci brw_message_desc(devinfo, 1, DIV_ROUND_UP(inst->size_written, 1556bf215546Sopenharmony_ci REG_SIZE), true) | 1557bf215546Sopenharmony_ci brw_dp_desc(devinfo, surf_index, 1558bf215546Sopenharmony_ci GFX7_DATAPORT_DC_OWORD_BLOCK_READ, 1559bf215546Sopenharmony_ci BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size))); 1560bf215546Sopenharmony_ci 1561bf215546Sopenharmony_ci } else { 1562bf215546Sopenharmony_ci const tgl_swsb swsb = brw_get_default_swsb(p); 1563bf215546Sopenharmony_ci struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); 1564bf215546Sopenharmony_ci 1565bf215546Sopenharmony_ci brw_push_insn_state(p); 1566bf215546Sopenharmony_ci brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1567bf215546Sopenharmony_ci 1568bf215546Sopenharmony_ci /* a0.0 = surf_index & 0xff */ 1569bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); 1570bf215546Sopenharmony_ci brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND); 1571bf215546Sopenharmony_ci brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1); 1572bf215546Sopenharmony_ci brw_set_dest(p, insn_and, addr); 1573bf215546Sopenharmony_ci brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD))); 1574bf215546Sopenharmony_ci brw_set_src1(p, insn_and, brw_imm_ud(0x0ff)); 1575bf215546Sopenharmony_ci 1576bf215546Sopenharmony_ci /* dst = send(payload, a0.0 | <descriptor>) */ 1577bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); 1578bf215546Sopenharmony_ci brw_send_indirect_message( 1579bf215546Sopenharmony_ci p, GFX6_SFID_DATAPORT_CONSTANT_CACHE, 1580bf215546Sopenharmony_ci retype(dst, BRW_REGISTER_TYPE_UD), 1581bf215546Sopenharmony_ci retype(payload, BRW_REGISTER_TYPE_UD), addr, 1582bf215546Sopenharmony_ci brw_message_desc(devinfo, 1, 1583bf215546Sopenharmony_ci DIV_ROUND_UP(inst->size_written, REG_SIZE), true) | 1584bf215546Sopenharmony_ci brw_dp_desc(devinfo, 0 /* surface */, 1585bf215546Sopenharmony_ci GFX7_DATAPORT_DC_OWORD_BLOCK_READ, 1586bf215546Sopenharmony_ci BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size)), 1587bf215546Sopenharmony_ci false /* EOT */); 1588bf215546Sopenharmony_ci 1589bf215546Sopenharmony_ci brw_pop_insn_state(p); 1590bf215546Sopenharmony_ci } 1591bf215546Sopenharmony_ci} 1592bf215546Sopenharmony_ci 1593bf215546Sopenharmony_civoid 1594bf215546Sopenharmony_cifs_generator::generate_varying_pull_constant_load_gfx4(fs_inst *inst, 1595bf215546Sopenharmony_ci struct brw_reg dst, 1596bf215546Sopenharmony_ci struct brw_reg index) 1597bf215546Sopenharmony_ci{ 1598bf215546Sopenharmony_ci assert(devinfo->ver < 7); /* Should use the gfx7 variant. */ 1599bf215546Sopenharmony_ci assert(inst->header_size != 0); 1600bf215546Sopenharmony_ci assert(inst->mlen); 1601bf215546Sopenharmony_ci 1602bf215546Sopenharmony_ci assert(index.file == BRW_IMMEDIATE_VALUE && 1603bf215546Sopenharmony_ci index.type == BRW_REGISTER_TYPE_UD); 1604bf215546Sopenharmony_ci uint32_t surf_index = index.ud; 1605bf215546Sopenharmony_ci 1606bf215546Sopenharmony_ci uint32_t simd_mode, rlen, msg_type; 1607bf215546Sopenharmony_ci if (inst->exec_size == 16) { 1608bf215546Sopenharmony_ci simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1609bf215546Sopenharmony_ci rlen = 8; 1610bf215546Sopenharmony_ci } else { 1611bf215546Sopenharmony_ci assert(inst->exec_size == 8); 1612bf215546Sopenharmony_ci simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 1613bf215546Sopenharmony_ci rlen = 4; 1614bf215546Sopenharmony_ci } 1615bf215546Sopenharmony_ci 1616bf215546Sopenharmony_ci if (devinfo->ver >= 5) 1617bf215546Sopenharmony_ci msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD; 1618bf215546Sopenharmony_ci else { 1619bf215546Sopenharmony_ci /* We always use the SIMD16 message so that we only have to load U, and 1620bf215546Sopenharmony_ci * not V or R. 1621bf215546Sopenharmony_ci */ 1622bf215546Sopenharmony_ci msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD; 1623bf215546Sopenharmony_ci assert(inst->mlen == 3); 1624bf215546Sopenharmony_ci assert(inst->size_written == 8 * REG_SIZE); 1625bf215546Sopenharmony_ci rlen = 8; 1626bf215546Sopenharmony_ci simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1627bf215546Sopenharmony_ci } 1628bf215546Sopenharmony_ci 1629bf215546Sopenharmony_ci struct brw_reg header = brw_vec8_grf(0, 0); 1630bf215546Sopenharmony_ci gfx6_resolve_implied_move(p, &header, inst->base_mrf); 1631bf215546Sopenharmony_ci 1632bf215546Sopenharmony_ci brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 1633bf215546Sopenharmony_ci brw_inst_set_compression(devinfo, send, false); 1634bf215546Sopenharmony_ci brw_inst_set_sfid(devinfo, send, BRW_SFID_SAMPLER); 1635bf215546Sopenharmony_ci brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW)); 1636bf215546Sopenharmony_ci brw_set_src0(p, send, header); 1637bf215546Sopenharmony_ci if (devinfo->ver < 6) 1638bf215546Sopenharmony_ci brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf); 1639bf215546Sopenharmony_ci 1640bf215546Sopenharmony_ci /* Our surface is set up as floats, regardless of what actual data is 1641bf215546Sopenharmony_ci * stored in it. 1642bf215546Sopenharmony_ci */ 1643bf215546Sopenharmony_ci uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32; 1644bf215546Sopenharmony_ci brw_set_desc(p, send, 1645bf215546Sopenharmony_ci brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size) | 1646bf215546Sopenharmony_ci brw_sampler_desc(devinfo, surf_index, 1647bf215546Sopenharmony_ci 0, /* sampler (unused) */ 1648bf215546Sopenharmony_ci msg_type, simd_mode, return_format)); 1649bf215546Sopenharmony_ci} 1650bf215546Sopenharmony_ci 1651bf215546Sopenharmony_civoid 1652bf215546Sopenharmony_cifs_generator::generate_pixel_interpolator_query(fs_inst *inst, 1653bf215546Sopenharmony_ci struct brw_reg dst, 1654bf215546Sopenharmony_ci struct brw_reg src, 1655bf215546Sopenharmony_ci struct brw_reg msg_data, 1656bf215546Sopenharmony_ci unsigned msg_type) 1657bf215546Sopenharmony_ci{ 1658bf215546Sopenharmony_ci const bool has_payload = inst->src[0].file != BAD_FILE; 1659bf215546Sopenharmony_ci assert(msg_data.type == BRW_REGISTER_TYPE_UD); 1660bf215546Sopenharmony_ci assert(inst->size_written % REG_SIZE == 0); 1661bf215546Sopenharmony_ci 1662bf215546Sopenharmony_ci struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); 1663bf215546Sopenharmony_ci 1664bf215546Sopenharmony_ci brw_pixel_interpolator_query(p, 1665bf215546Sopenharmony_ci retype(dst, BRW_REGISTER_TYPE_UW), 1666bf215546Sopenharmony_ci /* If we don't have a payload, what we send doesn't matter */ 1667bf215546Sopenharmony_ci has_payload ? src : brw_vec8_grf(0, 0), 1668bf215546Sopenharmony_ci inst->pi_noperspective, 1669bf215546Sopenharmony_ci prog_data->per_coarse_pixel_dispatch, 1670bf215546Sopenharmony_ci msg_type, 1671bf215546Sopenharmony_ci msg_data, 1672bf215546Sopenharmony_ci has_payload ? 2 * inst->exec_size / 8 : 1, 1673bf215546Sopenharmony_ci inst->size_written / REG_SIZE); 1674bf215546Sopenharmony_ci} 1675bf215546Sopenharmony_ci 1676bf215546Sopenharmony_ci/* Sets vstride=1, width=4, hstride=0 of register src1 during 1677bf215546Sopenharmony_ci * the ADD instruction. 1678bf215546Sopenharmony_ci */ 1679bf215546Sopenharmony_civoid 1680bf215546Sopenharmony_cifs_generator::generate_set_sample_id(fs_inst *inst, 1681bf215546Sopenharmony_ci struct brw_reg dst, 1682bf215546Sopenharmony_ci struct brw_reg src0, 1683bf215546Sopenharmony_ci struct brw_reg src1) 1684bf215546Sopenharmony_ci{ 1685bf215546Sopenharmony_ci assert(dst.type == BRW_REGISTER_TYPE_D || 1686bf215546Sopenharmony_ci dst.type == BRW_REGISTER_TYPE_UD); 1687bf215546Sopenharmony_ci assert(src0.type == BRW_REGISTER_TYPE_D || 1688bf215546Sopenharmony_ci src0.type == BRW_REGISTER_TYPE_UD); 1689bf215546Sopenharmony_ci 1690bf215546Sopenharmony_ci const struct brw_reg reg = stride(src1, 1, 4, 0); 1691bf215546Sopenharmony_ci const unsigned lower_size = MIN2(inst->exec_size, 1692bf215546Sopenharmony_ci devinfo->ver >= 8 ? 16 : 8); 1693bf215546Sopenharmony_ci 1694bf215546Sopenharmony_ci for (unsigned i = 0; i < inst->exec_size / lower_size; i++) { 1695bf215546Sopenharmony_ci brw_inst *insn = brw_ADD(p, offset(dst, i * lower_size / 8), 1696bf215546Sopenharmony_ci offset(src0, (src0.vstride == 0 ? 0 : (1 << (src0.vstride - 1)) * 1697bf215546Sopenharmony_ci (i * lower_size / (1 << src0.width))) * 1698bf215546Sopenharmony_ci type_sz(src0.type) / REG_SIZE), 1699bf215546Sopenharmony_ci suboffset(reg, i * lower_size / 4)); 1700bf215546Sopenharmony_ci brw_inst_set_exec_size(devinfo, insn, cvt(lower_size) - 1); 1701bf215546Sopenharmony_ci brw_inst_set_group(devinfo, insn, inst->group + lower_size * i); 1702bf215546Sopenharmony_ci brw_inst_set_compression(devinfo, insn, lower_size > 8); 1703bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_null()); 1704bf215546Sopenharmony_ci } 1705bf215546Sopenharmony_ci} 1706bf215546Sopenharmony_ci 1707bf215546Sopenharmony_civoid 1708bf215546Sopenharmony_cifs_generator::generate_pack_half_2x16_split(fs_inst *, 1709bf215546Sopenharmony_ci struct brw_reg dst, 1710bf215546Sopenharmony_ci struct brw_reg x, 1711bf215546Sopenharmony_ci struct brw_reg y) 1712bf215546Sopenharmony_ci{ 1713bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 1714bf215546Sopenharmony_ci assert(dst.type == BRW_REGISTER_TYPE_UD); 1715bf215546Sopenharmony_ci assert(x.type == BRW_REGISTER_TYPE_F); 1716bf215546Sopenharmony_ci assert(y.type == BRW_REGISTER_TYPE_F); 1717bf215546Sopenharmony_ci 1718bf215546Sopenharmony_ci /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16: 1719bf215546Sopenharmony_ci * 1720bf215546Sopenharmony_ci * Because this instruction does not have a 16-bit floating-point type, 1721bf215546Sopenharmony_ci * the destination data type must be Word (W). 1722bf215546Sopenharmony_ci * 1723bf215546Sopenharmony_ci * The destination must be DWord-aligned and specify a horizontal stride 1724bf215546Sopenharmony_ci * (HorzStride) of 2. The 16-bit result is stored in the lower word of 1725bf215546Sopenharmony_ci * each destination channel and the upper word is not modified. 1726bf215546Sopenharmony_ci */ 1727bf215546Sopenharmony_ci const enum brw_reg_type t = devinfo->ver > 7 1728bf215546Sopenharmony_ci ? BRW_REGISTER_TYPE_HF : BRW_REGISTER_TYPE_W; 1729bf215546Sopenharmony_ci struct brw_reg dst_w = spread(retype(dst, t), 2); 1730bf215546Sopenharmony_ci 1731bf215546Sopenharmony_ci if (y.file == IMM) { 1732bf215546Sopenharmony_ci const uint32_t hhhh0000 = _mesa_float_to_half(y.f) << 16; 1733bf215546Sopenharmony_ci 1734bf215546Sopenharmony_ci brw_MOV(p, dst, brw_imm_ud(hhhh0000)); 1735bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_regdist(1)); 1736bf215546Sopenharmony_ci } else { 1737bf215546Sopenharmony_ci /* Give each 32-bit channel of dst the form below, where "." means 1738bf215546Sopenharmony_ci * unchanged. 1739bf215546Sopenharmony_ci * 0x....hhhh 1740bf215546Sopenharmony_ci */ 1741bf215546Sopenharmony_ci brw_F32TO16(p, dst_w, y); 1742bf215546Sopenharmony_ci 1743bf215546Sopenharmony_ci /* Now the form: 1744bf215546Sopenharmony_ci * 0xhhhh0000 1745bf215546Sopenharmony_ci */ 1746bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_regdist(1)); 1747bf215546Sopenharmony_ci brw_SHL(p, dst, dst, brw_imm_ud(16u)); 1748bf215546Sopenharmony_ci } 1749bf215546Sopenharmony_ci 1750bf215546Sopenharmony_ci /* And, finally the form of packHalf2x16's output: 1751bf215546Sopenharmony_ci * 0xhhhhllll 1752bf215546Sopenharmony_ci */ 1753bf215546Sopenharmony_ci brw_F32TO16(p, dst_w, x); 1754bf215546Sopenharmony_ci} 1755bf215546Sopenharmony_ci 1756bf215546Sopenharmony_civoid 1757bf215546Sopenharmony_cifs_generator::enable_debug(const char *shader_name) 1758bf215546Sopenharmony_ci{ 1759bf215546Sopenharmony_ci debug_flag = true; 1760bf215546Sopenharmony_ci this->shader_name = shader_name; 1761bf215546Sopenharmony_ci} 1762bf215546Sopenharmony_ci 1763bf215546Sopenharmony_ciint 1764bf215546Sopenharmony_cifs_generator::generate_code(const cfg_t *cfg, int dispatch_width, 1765bf215546Sopenharmony_ci struct shader_stats shader_stats, 1766bf215546Sopenharmony_ci const brw::performance &perf, 1767bf215546Sopenharmony_ci struct brw_compile_stats *stats) 1768bf215546Sopenharmony_ci{ 1769bf215546Sopenharmony_ci /* align to 64 byte boundary. */ 1770bf215546Sopenharmony_ci brw_realign(p, 64); 1771bf215546Sopenharmony_ci 1772bf215546Sopenharmony_ci this->dispatch_width = dispatch_width; 1773bf215546Sopenharmony_ci 1774bf215546Sopenharmony_ci int start_offset = p->next_insn_offset; 1775bf215546Sopenharmony_ci 1776bf215546Sopenharmony_ci int loop_count = 0, send_count = 0, nop_count = 0; 1777bf215546Sopenharmony_ci bool is_accum_used = false; 1778bf215546Sopenharmony_ci 1779bf215546Sopenharmony_ci struct disasm_info *disasm_info = disasm_initialize(p->isa, cfg); 1780bf215546Sopenharmony_ci 1781bf215546Sopenharmony_ci foreach_block_and_inst (block, fs_inst, inst, cfg) { 1782bf215546Sopenharmony_ci if (inst->opcode == SHADER_OPCODE_UNDEF) 1783bf215546Sopenharmony_ci continue; 1784bf215546Sopenharmony_ci 1785bf215546Sopenharmony_ci struct brw_reg src[4], dst; 1786bf215546Sopenharmony_ci unsigned int last_insn_offset = p->next_insn_offset; 1787bf215546Sopenharmony_ci bool multiple_instructions_emitted = false; 1788bf215546Sopenharmony_ci tgl_swsb swsb = inst->sched; 1789bf215546Sopenharmony_ci 1790bf215546Sopenharmony_ci /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the 1791bf215546Sopenharmony_ci * "Register Region Restrictions" section: for BDW, SKL: 1792bf215546Sopenharmony_ci * 1793bf215546Sopenharmony_ci * "A POW/FDIV operation must not be followed by an instruction 1794bf215546Sopenharmony_ci * that requires two destination registers." 1795bf215546Sopenharmony_ci * 1796bf215546Sopenharmony_ci * The documentation is often lacking annotations for Atom parts, 1797bf215546Sopenharmony_ci * and empirically this affects CHV as well. 1798bf215546Sopenharmony_ci */ 1799bf215546Sopenharmony_ci if (devinfo->ver >= 8 && 1800bf215546Sopenharmony_ci devinfo->ver <= 9 && 1801bf215546Sopenharmony_ci p->nr_insn > 1 && 1802bf215546Sopenharmony_ci brw_inst_opcode(p->isa, brw_last_inst) == BRW_OPCODE_MATH && 1803bf215546Sopenharmony_ci brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW && 1804bf215546Sopenharmony_ci inst->dst.component_size(inst->exec_size) > REG_SIZE) { 1805bf215546Sopenharmony_ci brw_NOP(p); 1806bf215546Sopenharmony_ci last_insn_offset = p->next_insn_offset; 1807bf215546Sopenharmony_ci 1808bf215546Sopenharmony_ci /* In order to avoid spurious instruction count differences when the 1809bf215546Sopenharmony_ci * instruction schedule changes, keep track of the number of inserted 1810bf215546Sopenharmony_ci * NOPs. 1811bf215546Sopenharmony_ci */ 1812bf215546Sopenharmony_ci nop_count++; 1813bf215546Sopenharmony_ci } 1814bf215546Sopenharmony_ci 1815bf215546Sopenharmony_ci /* Wa_14010017096: 1816bf215546Sopenharmony_ci * 1817bf215546Sopenharmony_ci * Clear accumulator register before end of thread. 1818bf215546Sopenharmony_ci */ 1819bf215546Sopenharmony_ci if (inst->eot && is_accum_used && devinfo->ver >= 12) { 1820bf215546Sopenharmony_ci brw_set_default_exec_size(p, BRW_EXECUTE_16); 1821bf215546Sopenharmony_ci brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1822bf215546Sopenharmony_ci brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 1823bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); 1824bf215546Sopenharmony_ci brw_MOV(p, brw_acc_reg(8), brw_imm_f(0.0f)); 1825bf215546Sopenharmony_ci last_insn_offset = p->next_insn_offset; 1826bf215546Sopenharmony_ci swsb = tgl_swsb_dst_dep(swsb, 1); 1827bf215546Sopenharmony_ci } 1828bf215546Sopenharmony_ci 1829bf215546Sopenharmony_ci if (!is_accum_used && !inst->eot) { 1830bf215546Sopenharmony_ci is_accum_used = inst->writes_accumulator_implicitly(devinfo) || 1831bf215546Sopenharmony_ci inst->dst.is_accumulator(); 1832bf215546Sopenharmony_ci } 1833bf215546Sopenharmony_ci 1834bf215546Sopenharmony_ci /* Wa_14013745556: 1835bf215546Sopenharmony_ci * 1836bf215546Sopenharmony_ci * Always use @1 SWSB for EOT. 1837bf215546Sopenharmony_ci */ 1838bf215546Sopenharmony_ci if (inst->eot && devinfo->ver >= 12) { 1839bf215546Sopenharmony_ci if (tgl_swsb_src_dep(swsb).mode) { 1840bf215546Sopenharmony_ci brw_set_default_exec_size(p, BRW_EXECUTE_1); 1841bf215546Sopenharmony_ci brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1842bf215546Sopenharmony_ci brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 1843bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); 1844bf215546Sopenharmony_ci brw_SYNC(p, TGL_SYNC_NOP); 1845bf215546Sopenharmony_ci last_insn_offset = p->next_insn_offset; 1846bf215546Sopenharmony_ci } 1847bf215546Sopenharmony_ci 1848bf215546Sopenharmony_ci swsb = tgl_swsb_dst_dep(swsb, 1); 1849bf215546Sopenharmony_ci } 1850bf215546Sopenharmony_ci 1851bf215546Sopenharmony_ci if (unlikely(debug_flag)) 1852bf215546Sopenharmony_ci disasm_annotate(disasm_info, inst, p->next_insn_offset); 1853bf215546Sopenharmony_ci 1854bf215546Sopenharmony_ci /* If the instruction writes to more than one register, it needs to be 1855bf215546Sopenharmony_ci * explicitly marked as compressed on Gen <= 5. On Gen >= 6 the 1856bf215546Sopenharmony_ci * hardware figures out by itself what the right compression mode is, 1857bf215546Sopenharmony_ci * but we still need to know whether the instruction is compressed to 1858bf215546Sopenharmony_ci * set up the source register regions appropriately. 1859bf215546Sopenharmony_ci * 1860bf215546Sopenharmony_ci * XXX - This is wrong for instructions that write a single register but 1861bf215546Sopenharmony_ci * read more than one which should strictly speaking be treated as 1862bf215546Sopenharmony_ci * compressed. For instructions that don't write any registers it 1863bf215546Sopenharmony_ci * relies on the destination being a null register of the correct 1864bf215546Sopenharmony_ci * type and regioning so the instruction is considered compressed 1865bf215546Sopenharmony_ci * or not accordingly. 1866bf215546Sopenharmony_ci */ 1867bf215546Sopenharmony_ci const bool compressed = 1868bf215546Sopenharmony_ci inst->dst.component_size(inst->exec_size) > REG_SIZE; 1869bf215546Sopenharmony_ci brw_set_default_compression(p, compressed); 1870bf215546Sopenharmony_ci brw_set_default_group(p, inst->group); 1871bf215546Sopenharmony_ci 1872bf215546Sopenharmony_ci for (unsigned int i = 0; i < inst->sources; i++) { 1873bf215546Sopenharmony_ci src[i] = brw_reg_from_fs_reg(devinfo, inst, 1874bf215546Sopenharmony_ci &inst->src[i], compressed); 1875bf215546Sopenharmony_ci /* The accumulator result appears to get used for the 1876bf215546Sopenharmony_ci * conditional modifier generation. When negating a UD 1877bf215546Sopenharmony_ci * value, there is a 33rd bit generated for the sign in the 1878bf215546Sopenharmony_ci * accumulator value, so now you can't check, for example, 1879bf215546Sopenharmony_ci * equality with a 32-bit value. See piglit fs-op-neg-uvec4. 1880bf215546Sopenharmony_ci */ 1881bf215546Sopenharmony_ci assert(!inst->conditional_mod || 1882bf215546Sopenharmony_ci inst->src[i].type != BRW_REGISTER_TYPE_UD || 1883bf215546Sopenharmony_ci !inst->src[i].negate); 1884bf215546Sopenharmony_ci } 1885bf215546Sopenharmony_ci dst = brw_reg_from_fs_reg(devinfo, inst, 1886bf215546Sopenharmony_ci &inst->dst, compressed); 1887bf215546Sopenharmony_ci 1888bf215546Sopenharmony_ci brw_set_default_access_mode(p, BRW_ALIGN_1); 1889bf215546Sopenharmony_ci brw_set_default_predicate_control(p, inst->predicate); 1890bf215546Sopenharmony_ci brw_set_default_predicate_inverse(p, inst->predicate_inverse); 1891bf215546Sopenharmony_ci /* On gfx7 and above, hardware automatically adds the group onto the 1892bf215546Sopenharmony_ci * flag subregister number. On Sandy Bridge and older, we have to do it 1893bf215546Sopenharmony_ci * ourselves. 1894bf215546Sopenharmony_ci */ 1895bf215546Sopenharmony_ci const unsigned flag_subreg = inst->flag_subreg + 1896bf215546Sopenharmony_ci (devinfo->ver >= 7 ? 0 : inst->group / 16); 1897bf215546Sopenharmony_ci brw_set_default_flag_reg(p, flag_subreg / 2, flag_subreg % 2); 1898bf215546Sopenharmony_ci brw_set_default_saturate(p, inst->saturate); 1899bf215546Sopenharmony_ci brw_set_default_mask_control(p, inst->force_writemask_all); 1900bf215546Sopenharmony_ci brw_set_default_acc_write_control(p, inst->writes_accumulator); 1901bf215546Sopenharmony_ci brw_set_default_swsb(p, swsb); 1902bf215546Sopenharmony_ci 1903bf215546Sopenharmony_ci unsigned exec_size = inst->exec_size; 1904bf215546Sopenharmony_ci if (devinfo->verx10 == 70 && 1905bf215546Sopenharmony_ci (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8)) { 1906bf215546Sopenharmony_ci exec_size *= 2; 1907bf215546Sopenharmony_ci } 1908bf215546Sopenharmony_ci 1909bf215546Sopenharmony_ci brw_set_default_exec_size(p, cvt(exec_size) - 1); 1910bf215546Sopenharmony_ci 1911bf215546Sopenharmony_ci assert(inst->force_writemask_all || inst->exec_size >= 4); 1912bf215546Sopenharmony_ci assert(inst->force_writemask_all || inst->group % inst->exec_size == 0); 1913bf215546Sopenharmony_ci assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->ver)); 1914bf215546Sopenharmony_ci assert(inst->mlen <= BRW_MAX_MSG_LENGTH); 1915bf215546Sopenharmony_ci 1916bf215546Sopenharmony_ci switch (inst->opcode) { 1917bf215546Sopenharmony_ci case BRW_OPCODE_SYNC: 1918bf215546Sopenharmony_ci assert(src[0].file == BRW_IMMEDIATE_VALUE); 1919bf215546Sopenharmony_ci brw_SYNC(p, tgl_sync_function(src[0].ud)); 1920bf215546Sopenharmony_ci break; 1921bf215546Sopenharmony_ci case BRW_OPCODE_MOV: 1922bf215546Sopenharmony_ci brw_MOV(p, dst, src[0]); 1923bf215546Sopenharmony_ci break; 1924bf215546Sopenharmony_ci case BRW_OPCODE_ADD: 1925bf215546Sopenharmony_ci brw_ADD(p, dst, src[0], src[1]); 1926bf215546Sopenharmony_ci break; 1927bf215546Sopenharmony_ci case BRW_OPCODE_MUL: 1928bf215546Sopenharmony_ci brw_MUL(p, dst, src[0], src[1]); 1929bf215546Sopenharmony_ci break; 1930bf215546Sopenharmony_ci case BRW_OPCODE_AVG: 1931bf215546Sopenharmony_ci brw_AVG(p, dst, src[0], src[1]); 1932bf215546Sopenharmony_ci break; 1933bf215546Sopenharmony_ci case BRW_OPCODE_MACH: 1934bf215546Sopenharmony_ci brw_MACH(p, dst, src[0], src[1]); 1935bf215546Sopenharmony_ci break; 1936bf215546Sopenharmony_ci 1937bf215546Sopenharmony_ci case BRW_OPCODE_DP4A: 1938bf215546Sopenharmony_ci assert(devinfo->ver >= 12); 1939bf215546Sopenharmony_ci brw_DP4A(p, dst, src[0], src[1], src[2]); 1940bf215546Sopenharmony_ci break; 1941bf215546Sopenharmony_ci 1942bf215546Sopenharmony_ci case BRW_OPCODE_LINE: 1943bf215546Sopenharmony_ci brw_LINE(p, dst, src[0], src[1]); 1944bf215546Sopenharmony_ci break; 1945bf215546Sopenharmony_ci 1946bf215546Sopenharmony_ci case BRW_OPCODE_MAD: 1947bf215546Sopenharmony_ci assert(devinfo->ver >= 6); 1948bf215546Sopenharmony_ci if (devinfo->ver < 10) 1949bf215546Sopenharmony_ci brw_set_default_access_mode(p, BRW_ALIGN_16); 1950bf215546Sopenharmony_ci brw_MAD(p, dst, src[0], src[1], src[2]); 1951bf215546Sopenharmony_ci break; 1952bf215546Sopenharmony_ci 1953bf215546Sopenharmony_ci case BRW_OPCODE_LRP: 1954bf215546Sopenharmony_ci assert(devinfo->ver >= 6 && devinfo->ver <= 10); 1955bf215546Sopenharmony_ci if (devinfo->ver < 10) 1956bf215546Sopenharmony_ci brw_set_default_access_mode(p, BRW_ALIGN_16); 1957bf215546Sopenharmony_ci brw_LRP(p, dst, src[0], src[1], src[2]); 1958bf215546Sopenharmony_ci break; 1959bf215546Sopenharmony_ci 1960bf215546Sopenharmony_ci case BRW_OPCODE_ADD3: 1961bf215546Sopenharmony_ci assert(devinfo->verx10 >= 125); 1962bf215546Sopenharmony_ci brw_ADD3(p, dst, src[0], src[1], src[2]); 1963bf215546Sopenharmony_ci break; 1964bf215546Sopenharmony_ci 1965bf215546Sopenharmony_ci case BRW_OPCODE_FRC: 1966bf215546Sopenharmony_ci brw_FRC(p, dst, src[0]); 1967bf215546Sopenharmony_ci break; 1968bf215546Sopenharmony_ci case BRW_OPCODE_RNDD: 1969bf215546Sopenharmony_ci brw_RNDD(p, dst, src[0]); 1970bf215546Sopenharmony_ci break; 1971bf215546Sopenharmony_ci case BRW_OPCODE_RNDE: 1972bf215546Sopenharmony_ci brw_RNDE(p, dst, src[0]); 1973bf215546Sopenharmony_ci break; 1974bf215546Sopenharmony_ci case BRW_OPCODE_RNDZ: 1975bf215546Sopenharmony_ci brw_RNDZ(p, dst, src[0]); 1976bf215546Sopenharmony_ci break; 1977bf215546Sopenharmony_ci 1978bf215546Sopenharmony_ci case BRW_OPCODE_AND: 1979bf215546Sopenharmony_ci brw_AND(p, dst, src[0], src[1]); 1980bf215546Sopenharmony_ci break; 1981bf215546Sopenharmony_ci case BRW_OPCODE_OR: 1982bf215546Sopenharmony_ci brw_OR(p, dst, src[0], src[1]); 1983bf215546Sopenharmony_ci break; 1984bf215546Sopenharmony_ci case BRW_OPCODE_XOR: 1985bf215546Sopenharmony_ci brw_XOR(p, dst, src[0], src[1]); 1986bf215546Sopenharmony_ci break; 1987bf215546Sopenharmony_ci case BRW_OPCODE_NOT: 1988bf215546Sopenharmony_ci brw_NOT(p, dst, src[0]); 1989bf215546Sopenharmony_ci break; 1990bf215546Sopenharmony_ci case BRW_OPCODE_ASR: 1991bf215546Sopenharmony_ci brw_ASR(p, dst, src[0], src[1]); 1992bf215546Sopenharmony_ci break; 1993bf215546Sopenharmony_ci case BRW_OPCODE_SHR: 1994bf215546Sopenharmony_ci brw_SHR(p, dst, src[0], src[1]); 1995bf215546Sopenharmony_ci break; 1996bf215546Sopenharmony_ci case BRW_OPCODE_SHL: 1997bf215546Sopenharmony_ci brw_SHL(p, dst, src[0], src[1]); 1998bf215546Sopenharmony_ci break; 1999bf215546Sopenharmony_ci case BRW_OPCODE_ROL: 2000bf215546Sopenharmony_ci assert(devinfo->ver >= 11); 2001bf215546Sopenharmony_ci assert(src[0].type == dst.type); 2002bf215546Sopenharmony_ci brw_ROL(p, dst, src[0], src[1]); 2003bf215546Sopenharmony_ci break; 2004bf215546Sopenharmony_ci case BRW_OPCODE_ROR: 2005bf215546Sopenharmony_ci assert(devinfo->ver >= 11); 2006bf215546Sopenharmony_ci assert(src[0].type == dst.type); 2007bf215546Sopenharmony_ci brw_ROR(p, dst, src[0], src[1]); 2008bf215546Sopenharmony_ci break; 2009bf215546Sopenharmony_ci case BRW_OPCODE_F32TO16: 2010bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 2011bf215546Sopenharmony_ci brw_F32TO16(p, dst, src[0]); 2012bf215546Sopenharmony_ci break; 2013bf215546Sopenharmony_ci case BRW_OPCODE_F16TO32: 2014bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 2015bf215546Sopenharmony_ci brw_F16TO32(p, dst, src[0]); 2016bf215546Sopenharmony_ci break; 2017bf215546Sopenharmony_ci case BRW_OPCODE_CMP: 2018bf215546Sopenharmony_ci if (inst->exec_size >= 16 && devinfo->verx10 == 70 && 2019bf215546Sopenharmony_ci dst.file == BRW_ARCHITECTURE_REGISTER_FILE) { 2020bf215546Sopenharmony_ci /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround 2021bf215546Sopenharmony_ci * implemented in the compiler is not sufficient. Overriding the 2022bf215546Sopenharmony_ci * type when the destination is the null register is necessary but 2023bf215546Sopenharmony_ci * not sufficient by itself. 2024bf215546Sopenharmony_ci */ 2025bf215546Sopenharmony_ci dst.type = BRW_REGISTER_TYPE_D; 2026bf215546Sopenharmony_ci } 2027bf215546Sopenharmony_ci brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); 2028bf215546Sopenharmony_ci break; 2029bf215546Sopenharmony_ci case BRW_OPCODE_CMPN: 2030bf215546Sopenharmony_ci if (inst->exec_size >= 16 && devinfo->verx10 == 70 && 2031bf215546Sopenharmony_ci dst.file == BRW_ARCHITECTURE_REGISTER_FILE) { 2032bf215546Sopenharmony_ci /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround 2033bf215546Sopenharmony_ci * implemented in the compiler is not sufficient. Overriding the 2034bf215546Sopenharmony_ci * type when the destination is the null register is necessary but 2035bf215546Sopenharmony_ci * not sufficient by itself. 2036bf215546Sopenharmony_ci */ 2037bf215546Sopenharmony_ci dst.type = BRW_REGISTER_TYPE_D; 2038bf215546Sopenharmony_ci } 2039bf215546Sopenharmony_ci brw_CMPN(p, dst, inst->conditional_mod, src[0], src[1]); 2040bf215546Sopenharmony_ci break; 2041bf215546Sopenharmony_ci case BRW_OPCODE_SEL: 2042bf215546Sopenharmony_ci brw_SEL(p, dst, src[0], src[1]); 2043bf215546Sopenharmony_ci break; 2044bf215546Sopenharmony_ci case BRW_OPCODE_CSEL: 2045bf215546Sopenharmony_ci assert(devinfo->ver >= 8); 2046bf215546Sopenharmony_ci if (devinfo->ver < 10) 2047bf215546Sopenharmony_ci brw_set_default_access_mode(p, BRW_ALIGN_16); 2048bf215546Sopenharmony_ci brw_CSEL(p, dst, src[0], src[1], src[2]); 2049bf215546Sopenharmony_ci break; 2050bf215546Sopenharmony_ci case BRW_OPCODE_BFREV: 2051bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 2052bf215546Sopenharmony_ci brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD), 2053bf215546Sopenharmony_ci retype(src[0], BRW_REGISTER_TYPE_UD)); 2054bf215546Sopenharmony_ci break; 2055bf215546Sopenharmony_ci case BRW_OPCODE_FBH: 2056bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 2057bf215546Sopenharmony_ci brw_FBH(p, retype(dst, src[0].type), src[0]); 2058bf215546Sopenharmony_ci break; 2059bf215546Sopenharmony_ci case BRW_OPCODE_FBL: 2060bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 2061bf215546Sopenharmony_ci brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), 2062bf215546Sopenharmony_ci retype(src[0], BRW_REGISTER_TYPE_UD)); 2063bf215546Sopenharmony_ci break; 2064bf215546Sopenharmony_ci case BRW_OPCODE_LZD: 2065bf215546Sopenharmony_ci brw_LZD(p, dst, src[0]); 2066bf215546Sopenharmony_ci break; 2067bf215546Sopenharmony_ci case BRW_OPCODE_CBIT: 2068bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 2069bf215546Sopenharmony_ci brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), 2070bf215546Sopenharmony_ci retype(src[0], BRW_REGISTER_TYPE_UD)); 2071bf215546Sopenharmony_ci break; 2072bf215546Sopenharmony_ci case BRW_OPCODE_ADDC: 2073bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 2074bf215546Sopenharmony_ci brw_ADDC(p, dst, src[0], src[1]); 2075bf215546Sopenharmony_ci break; 2076bf215546Sopenharmony_ci case BRW_OPCODE_SUBB: 2077bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 2078bf215546Sopenharmony_ci brw_SUBB(p, dst, src[0], src[1]); 2079bf215546Sopenharmony_ci break; 2080bf215546Sopenharmony_ci case BRW_OPCODE_MAC: 2081bf215546Sopenharmony_ci brw_MAC(p, dst, src[0], src[1]); 2082bf215546Sopenharmony_ci break; 2083bf215546Sopenharmony_ci 2084bf215546Sopenharmony_ci case BRW_OPCODE_BFE: 2085bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 2086bf215546Sopenharmony_ci if (devinfo->ver < 10) 2087bf215546Sopenharmony_ci brw_set_default_access_mode(p, BRW_ALIGN_16); 2088bf215546Sopenharmony_ci brw_BFE(p, dst, src[0], src[1], src[2]); 2089bf215546Sopenharmony_ci break; 2090bf215546Sopenharmony_ci 2091bf215546Sopenharmony_ci case BRW_OPCODE_BFI1: 2092bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 2093bf215546Sopenharmony_ci brw_BFI1(p, dst, src[0], src[1]); 2094bf215546Sopenharmony_ci break; 2095bf215546Sopenharmony_ci case BRW_OPCODE_BFI2: 2096bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 2097bf215546Sopenharmony_ci if (devinfo->ver < 10) 2098bf215546Sopenharmony_ci brw_set_default_access_mode(p, BRW_ALIGN_16); 2099bf215546Sopenharmony_ci brw_BFI2(p, dst, src[0], src[1], src[2]); 2100bf215546Sopenharmony_ci break; 2101bf215546Sopenharmony_ci 2102bf215546Sopenharmony_ci case BRW_OPCODE_IF: 2103bf215546Sopenharmony_ci if (inst->src[0].file != BAD_FILE) { 2104bf215546Sopenharmony_ci /* The instruction has an embedded compare (only allowed on gfx6) */ 2105bf215546Sopenharmony_ci assert(devinfo->ver == 6); 2106bf215546Sopenharmony_ci gfx6_IF(p, inst->conditional_mod, src[0], src[1]); 2107bf215546Sopenharmony_ci } else { 2108bf215546Sopenharmony_ci brw_IF(p, brw_get_default_exec_size(p)); 2109bf215546Sopenharmony_ci } 2110bf215546Sopenharmony_ci break; 2111bf215546Sopenharmony_ci 2112bf215546Sopenharmony_ci case BRW_OPCODE_ELSE: 2113bf215546Sopenharmony_ci brw_ELSE(p); 2114bf215546Sopenharmony_ci break; 2115bf215546Sopenharmony_ci case BRW_OPCODE_ENDIF: 2116bf215546Sopenharmony_ci brw_ENDIF(p); 2117bf215546Sopenharmony_ci break; 2118bf215546Sopenharmony_ci 2119bf215546Sopenharmony_ci case BRW_OPCODE_DO: 2120bf215546Sopenharmony_ci brw_DO(p, brw_get_default_exec_size(p)); 2121bf215546Sopenharmony_ci break; 2122bf215546Sopenharmony_ci 2123bf215546Sopenharmony_ci case BRW_OPCODE_BREAK: 2124bf215546Sopenharmony_ci brw_BREAK(p); 2125bf215546Sopenharmony_ci break; 2126bf215546Sopenharmony_ci case BRW_OPCODE_CONTINUE: 2127bf215546Sopenharmony_ci brw_CONT(p); 2128bf215546Sopenharmony_ci break; 2129bf215546Sopenharmony_ci 2130bf215546Sopenharmony_ci case BRW_OPCODE_WHILE: 2131bf215546Sopenharmony_ci brw_WHILE(p); 2132bf215546Sopenharmony_ci loop_count++; 2133bf215546Sopenharmony_ci break; 2134bf215546Sopenharmony_ci 2135bf215546Sopenharmony_ci case SHADER_OPCODE_RCP: 2136bf215546Sopenharmony_ci case SHADER_OPCODE_RSQ: 2137bf215546Sopenharmony_ci case SHADER_OPCODE_SQRT: 2138bf215546Sopenharmony_ci case SHADER_OPCODE_EXP2: 2139bf215546Sopenharmony_ci case SHADER_OPCODE_LOG2: 2140bf215546Sopenharmony_ci case SHADER_OPCODE_SIN: 2141bf215546Sopenharmony_ci case SHADER_OPCODE_COS: 2142bf215546Sopenharmony_ci assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); 2143bf215546Sopenharmony_ci if (devinfo->ver >= 6) { 2144bf215546Sopenharmony_ci assert(inst->mlen == 0); 2145bf215546Sopenharmony_ci assert(devinfo->ver >= 7 || inst->exec_size == 8); 2146bf215546Sopenharmony_ci gfx6_math(p, dst, brw_math_function(inst->opcode), 2147bf215546Sopenharmony_ci src[0], brw_null_reg()); 2148bf215546Sopenharmony_ci } else { 2149bf215546Sopenharmony_ci assert(inst->mlen >= 1); 2150bf215546Sopenharmony_ci assert(devinfo->ver == 5 || devinfo->platform == INTEL_PLATFORM_G4X || inst->exec_size == 8); 2151bf215546Sopenharmony_ci gfx4_math(p, dst, 2152bf215546Sopenharmony_ci brw_math_function(inst->opcode), 2153bf215546Sopenharmony_ci inst->base_mrf, src[0], 2154bf215546Sopenharmony_ci BRW_MATH_PRECISION_FULL); 2155bf215546Sopenharmony_ci send_count++; 2156bf215546Sopenharmony_ci } 2157bf215546Sopenharmony_ci break; 2158bf215546Sopenharmony_ci case SHADER_OPCODE_INT_QUOTIENT: 2159bf215546Sopenharmony_ci case SHADER_OPCODE_INT_REMAINDER: 2160bf215546Sopenharmony_ci case SHADER_OPCODE_POW: 2161bf215546Sopenharmony_ci assert(devinfo->verx10 < 125); 2162bf215546Sopenharmony_ci assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); 2163bf215546Sopenharmony_ci if (devinfo->ver >= 6) { 2164bf215546Sopenharmony_ci assert(inst->mlen == 0); 2165bf215546Sopenharmony_ci assert((devinfo->ver >= 7 && inst->opcode == SHADER_OPCODE_POW) || 2166bf215546Sopenharmony_ci inst->exec_size == 8); 2167bf215546Sopenharmony_ci gfx6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]); 2168bf215546Sopenharmony_ci } else { 2169bf215546Sopenharmony_ci assert(inst->mlen >= 1); 2170bf215546Sopenharmony_ci assert(inst->exec_size == 8); 2171bf215546Sopenharmony_ci gfx4_math(p, dst, brw_math_function(inst->opcode), 2172bf215546Sopenharmony_ci inst->base_mrf, src[0], 2173bf215546Sopenharmony_ci BRW_MATH_PRECISION_FULL); 2174bf215546Sopenharmony_ci send_count++; 2175bf215546Sopenharmony_ci } 2176bf215546Sopenharmony_ci break; 2177bf215546Sopenharmony_ci case FS_OPCODE_LINTERP: 2178bf215546Sopenharmony_ci multiple_instructions_emitted = generate_linterp(inst, dst, src); 2179bf215546Sopenharmony_ci break; 2180bf215546Sopenharmony_ci case FS_OPCODE_PIXEL_X: 2181bf215546Sopenharmony_ci assert(src[0].type == BRW_REGISTER_TYPE_UW); 2182bf215546Sopenharmony_ci assert(src[1].type == BRW_REGISTER_TYPE_UW); 2183bf215546Sopenharmony_ci src[0].subnr = 0 * type_sz(src[0].type); 2184bf215546Sopenharmony_ci if (src[1].file == BRW_IMMEDIATE_VALUE) { 2185bf215546Sopenharmony_ci assert(src[1].ud == 0); 2186bf215546Sopenharmony_ci brw_MOV(p, dst, stride(src[0], 8, 4, 1)); 2187bf215546Sopenharmony_ci } else { 2188bf215546Sopenharmony_ci /* Coarse pixel case */ 2189bf215546Sopenharmony_ci brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]); 2190bf215546Sopenharmony_ci } 2191bf215546Sopenharmony_ci break; 2192bf215546Sopenharmony_ci case FS_OPCODE_PIXEL_Y: 2193bf215546Sopenharmony_ci assert(src[0].type == BRW_REGISTER_TYPE_UW); 2194bf215546Sopenharmony_ci assert(src[1].type == BRW_REGISTER_TYPE_UW); 2195bf215546Sopenharmony_ci src[0].subnr = 4 * type_sz(src[0].type); 2196bf215546Sopenharmony_ci if (src[1].file == BRW_IMMEDIATE_VALUE) { 2197bf215546Sopenharmony_ci assert(src[1].ud == 0); 2198bf215546Sopenharmony_ci brw_MOV(p, dst, stride(src[0], 8, 4, 1)); 2199bf215546Sopenharmony_ci } else { 2200bf215546Sopenharmony_ci /* Coarse pixel case */ 2201bf215546Sopenharmony_ci brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]); 2202bf215546Sopenharmony_ci } 2203bf215546Sopenharmony_ci break; 2204bf215546Sopenharmony_ci 2205bf215546Sopenharmony_ci case SHADER_OPCODE_SEND: 2206bf215546Sopenharmony_ci generate_send(inst, dst, src[0], src[1], src[2], 2207bf215546Sopenharmony_ci inst->ex_mlen > 0 ? src[3] : brw_null_reg()); 2208bf215546Sopenharmony_ci send_count++; 2209bf215546Sopenharmony_ci break; 2210bf215546Sopenharmony_ci 2211bf215546Sopenharmony_ci case SHADER_OPCODE_GET_BUFFER_SIZE: 2212bf215546Sopenharmony_ci generate_get_buffer_size(inst, dst, src[0], src[1]); 2213bf215546Sopenharmony_ci send_count++; 2214bf215546Sopenharmony_ci break; 2215bf215546Sopenharmony_ci case SHADER_OPCODE_TEX: 2216bf215546Sopenharmony_ci case FS_OPCODE_TXB: 2217bf215546Sopenharmony_ci case SHADER_OPCODE_TXD: 2218bf215546Sopenharmony_ci case SHADER_OPCODE_TXF: 2219bf215546Sopenharmony_ci case SHADER_OPCODE_TXF_CMS: 2220bf215546Sopenharmony_ci case SHADER_OPCODE_TXL: 2221bf215546Sopenharmony_ci case SHADER_OPCODE_TXS: 2222bf215546Sopenharmony_ci case SHADER_OPCODE_LOD: 2223bf215546Sopenharmony_ci case SHADER_OPCODE_TG4: 2224bf215546Sopenharmony_ci case SHADER_OPCODE_SAMPLEINFO: 2225bf215546Sopenharmony_ci assert(inst->src[0].file == BAD_FILE); 2226bf215546Sopenharmony_ci generate_tex(inst, dst, src[1], src[2]); 2227bf215546Sopenharmony_ci send_count++; 2228bf215546Sopenharmony_ci break; 2229bf215546Sopenharmony_ci 2230bf215546Sopenharmony_ci case FS_OPCODE_DDX_COARSE: 2231bf215546Sopenharmony_ci case FS_OPCODE_DDX_FINE: 2232bf215546Sopenharmony_ci generate_ddx(inst, dst, src[0]); 2233bf215546Sopenharmony_ci break; 2234bf215546Sopenharmony_ci case FS_OPCODE_DDY_COARSE: 2235bf215546Sopenharmony_ci case FS_OPCODE_DDY_FINE: 2236bf215546Sopenharmony_ci generate_ddy(inst, dst, src[0]); 2237bf215546Sopenharmony_ci break; 2238bf215546Sopenharmony_ci 2239bf215546Sopenharmony_ci case SHADER_OPCODE_GFX4_SCRATCH_WRITE: 2240bf215546Sopenharmony_ci generate_scratch_write(inst, src[0]); 2241bf215546Sopenharmony_ci send_count++; 2242bf215546Sopenharmony_ci break; 2243bf215546Sopenharmony_ci 2244bf215546Sopenharmony_ci case SHADER_OPCODE_GFX4_SCRATCH_READ: 2245bf215546Sopenharmony_ci generate_scratch_read(inst, dst); 2246bf215546Sopenharmony_ci send_count++; 2247bf215546Sopenharmony_ci break; 2248bf215546Sopenharmony_ci 2249bf215546Sopenharmony_ci case SHADER_OPCODE_GFX7_SCRATCH_READ: 2250bf215546Sopenharmony_ci generate_scratch_read_gfx7(inst, dst); 2251bf215546Sopenharmony_ci send_count++; 2252bf215546Sopenharmony_ci break; 2253bf215546Sopenharmony_ci 2254bf215546Sopenharmony_ci case SHADER_OPCODE_SCRATCH_HEADER: 2255bf215546Sopenharmony_ci generate_scratch_header(inst, dst); 2256bf215546Sopenharmony_ci break; 2257bf215546Sopenharmony_ci 2258bf215546Sopenharmony_ci case SHADER_OPCODE_MOV_INDIRECT: 2259bf215546Sopenharmony_ci generate_mov_indirect(inst, dst, src[0], src[1]); 2260bf215546Sopenharmony_ci break; 2261bf215546Sopenharmony_ci 2262bf215546Sopenharmony_ci case SHADER_OPCODE_MOV_RELOC_IMM: 2263bf215546Sopenharmony_ci assert(src[0].file == BRW_IMMEDIATE_VALUE); 2264bf215546Sopenharmony_ci brw_MOV_reloc_imm(p, dst, dst.type, src[0].ud); 2265bf215546Sopenharmony_ci break; 2266bf215546Sopenharmony_ci 2267bf215546Sopenharmony_ci case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: 2268bf215546Sopenharmony_ci assert(inst->force_writemask_all); 2269bf215546Sopenharmony_ci generate_uniform_pull_constant_load(inst, dst, src[0], src[1]); 2270bf215546Sopenharmony_ci send_count++; 2271bf215546Sopenharmony_ci break; 2272bf215546Sopenharmony_ci 2273bf215546Sopenharmony_ci case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: 2274bf215546Sopenharmony_ci assert(inst->force_writemask_all); 2275bf215546Sopenharmony_ci generate_uniform_pull_constant_load_gfx7(inst, dst, src[0], src[1]); 2276bf215546Sopenharmony_ci send_count++; 2277bf215546Sopenharmony_ci break; 2278bf215546Sopenharmony_ci 2279bf215546Sopenharmony_ci case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: 2280bf215546Sopenharmony_ci generate_varying_pull_constant_load_gfx4(inst, dst, src[0]); 2281bf215546Sopenharmony_ci send_count++; 2282bf215546Sopenharmony_ci break; 2283bf215546Sopenharmony_ci 2284bf215546Sopenharmony_ci case FS_OPCODE_REP_FB_WRITE: 2285bf215546Sopenharmony_ci case FS_OPCODE_FB_WRITE: 2286bf215546Sopenharmony_ci generate_fb_write(inst, src[0]); 2287bf215546Sopenharmony_ci send_count++; 2288bf215546Sopenharmony_ci break; 2289bf215546Sopenharmony_ci 2290bf215546Sopenharmony_ci case FS_OPCODE_FB_READ: 2291bf215546Sopenharmony_ci generate_fb_read(inst, dst, src[0]); 2292bf215546Sopenharmony_ci send_count++; 2293bf215546Sopenharmony_ci break; 2294bf215546Sopenharmony_ci 2295bf215546Sopenharmony_ci case BRW_OPCODE_HALT: 2296bf215546Sopenharmony_ci generate_halt(inst); 2297bf215546Sopenharmony_ci break; 2298bf215546Sopenharmony_ci 2299bf215546Sopenharmony_ci case SHADER_OPCODE_INTERLOCK: 2300bf215546Sopenharmony_ci case SHADER_OPCODE_MEMORY_FENCE: { 2301bf215546Sopenharmony_ci assert(src[1].file == BRW_IMMEDIATE_VALUE); 2302bf215546Sopenharmony_ci assert(src[2].file == BRW_IMMEDIATE_VALUE); 2303bf215546Sopenharmony_ci 2304bf215546Sopenharmony_ci const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ? 2305bf215546Sopenharmony_ci BRW_OPCODE_SENDC : BRW_OPCODE_SEND; 2306bf215546Sopenharmony_ci 2307bf215546Sopenharmony_ci brw_memory_fence(p, dst, src[0], send_op, 2308bf215546Sopenharmony_ci brw_message_target(inst->sfid), 2309bf215546Sopenharmony_ci inst->desc, 2310bf215546Sopenharmony_ci /* commit_enable */ src[1].ud, 2311bf215546Sopenharmony_ci /* bti */ src[2].ud); 2312bf215546Sopenharmony_ci send_count++; 2313bf215546Sopenharmony_ci break; 2314bf215546Sopenharmony_ci } 2315bf215546Sopenharmony_ci 2316bf215546Sopenharmony_ci case FS_OPCODE_SCHEDULING_FENCE: 2317bf215546Sopenharmony_ci if (inst->sources == 0 && swsb.regdist == 0 && 2318bf215546Sopenharmony_ci swsb.mode == TGL_SBID_NULL) { 2319bf215546Sopenharmony_ci if (unlikely(debug_flag)) 2320bf215546Sopenharmony_ci disasm_info->use_tail = true; 2321bf215546Sopenharmony_ci break; 2322bf215546Sopenharmony_ci } 2323bf215546Sopenharmony_ci 2324bf215546Sopenharmony_ci if (devinfo->ver >= 12) { 2325bf215546Sopenharmony_ci /* Use the available SWSB information to stall. A single SYNC is 2326bf215546Sopenharmony_ci * sufficient since if there were multiple dependencies, the 2327bf215546Sopenharmony_ci * scoreboard algorithm already injected other SYNCs before this 2328bf215546Sopenharmony_ci * instruction. 2329bf215546Sopenharmony_ci */ 2330bf215546Sopenharmony_ci brw_SYNC(p, TGL_SYNC_NOP); 2331bf215546Sopenharmony_ci } else { 2332bf215546Sopenharmony_ci for (unsigned i = 0; i < inst->sources; i++) { 2333bf215546Sopenharmony_ci /* Emit a MOV to force a stall until the instruction producing the 2334bf215546Sopenharmony_ci * registers finishes. 2335bf215546Sopenharmony_ci */ 2336bf215546Sopenharmony_ci brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), 2337bf215546Sopenharmony_ci retype(src[i], BRW_REGISTER_TYPE_UW)); 2338bf215546Sopenharmony_ci } 2339bf215546Sopenharmony_ci 2340bf215546Sopenharmony_ci if (inst->sources > 1) 2341bf215546Sopenharmony_ci multiple_instructions_emitted = true; 2342bf215546Sopenharmony_ci } 2343bf215546Sopenharmony_ci 2344bf215546Sopenharmony_ci break; 2345bf215546Sopenharmony_ci 2346bf215546Sopenharmony_ci case SHADER_OPCODE_FIND_LIVE_CHANNEL: 2347bf215546Sopenharmony_ci brw_find_live_channel(p, dst, false); 2348bf215546Sopenharmony_ci break; 2349bf215546Sopenharmony_ci case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: 2350bf215546Sopenharmony_ci brw_find_live_channel(p, dst, true); 2351bf215546Sopenharmony_ci break; 2352bf215546Sopenharmony_ci 2353bf215546Sopenharmony_ci case FS_OPCODE_LOAD_LIVE_CHANNELS: { 2354bf215546Sopenharmony_ci assert(devinfo->ver >= 8); 2355bf215546Sopenharmony_ci assert(inst->force_writemask_all && inst->group == 0); 2356bf215546Sopenharmony_ci assert(inst->dst.file == BAD_FILE); 2357bf215546Sopenharmony_ci brw_set_default_exec_size(p, BRW_EXECUTE_1); 2358bf215546Sopenharmony_ci brw_MOV(p, retype(brw_flag_subreg(inst->flag_subreg), 2359bf215546Sopenharmony_ci BRW_REGISTER_TYPE_UD), 2360bf215546Sopenharmony_ci retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)); 2361bf215546Sopenharmony_ci break; 2362bf215546Sopenharmony_ci } 2363bf215546Sopenharmony_ci case SHADER_OPCODE_BROADCAST: 2364bf215546Sopenharmony_ci assert(inst->force_writemask_all); 2365bf215546Sopenharmony_ci brw_broadcast(p, dst, src[0], src[1]); 2366bf215546Sopenharmony_ci break; 2367bf215546Sopenharmony_ci 2368bf215546Sopenharmony_ci case SHADER_OPCODE_SHUFFLE: 2369bf215546Sopenharmony_ci generate_shuffle(inst, dst, src[0], src[1]); 2370bf215546Sopenharmony_ci break; 2371bf215546Sopenharmony_ci 2372bf215546Sopenharmony_ci case SHADER_OPCODE_SEL_EXEC: 2373bf215546Sopenharmony_ci assert(inst->force_writemask_all); 2374bf215546Sopenharmony_ci assert(devinfo->has_64bit_float || type_sz(dst.type) <= 4); 2375bf215546Sopenharmony_ci brw_set_default_mask_control(p, BRW_MASK_DISABLE); 2376bf215546Sopenharmony_ci brw_MOV(p, dst, src[1]); 2377bf215546Sopenharmony_ci brw_set_default_mask_control(p, BRW_MASK_ENABLE); 2378bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_null()); 2379bf215546Sopenharmony_ci brw_MOV(p, dst, src[0]); 2380bf215546Sopenharmony_ci break; 2381bf215546Sopenharmony_ci 2382bf215546Sopenharmony_ci case SHADER_OPCODE_QUAD_SWIZZLE: 2383bf215546Sopenharmony_ci assert(src[1].file == BRW_IMMEDIATE_VALUE); 2384bf215546Sopenharmony_ci assert(src[1].type == BRW_REGISTER_TYPE_UD); 2385bf215546Sopenharmony_ci generate_quad_swizzle(inst, dst, src[0], src[1].ud); 2386bf215546Sopenharmony_ci break; 2387bf215546Sopenharmony_ci 2388bf215546Sopenharmony_ci case SHADER_OPCODE_CLUSTER_BROADCAST: { 2389bf215546Sopenharmony_ci assert((devinfo->platform != INTEL_PLATFORM_CHV && 2390bf215546Sopenharmony_ci !intel_device_info_is_9lp(devinfo) && 2391bf215546Sopenharmony_ci devinfo->has_64bit_float) || type_sz(src[0].type) <= 4); 2392bf215546Sopenharmony_ci assert(!src[0].negate && !src[0].abs); 2393bf215546Sopenharmony_ci assert(src[1].file == BRW_IMMEDIATE_VALUE); 2394bf215546Sopenharmony_ci assert(src[1].type == BRW_REGISTER_TYPE_UD); 2395bf215546Sopenharmony_ci assert(src[2].file == BRW_IMMEDIATE_VALUE); 2396bf215546Sopenharmony_ci assert(src[2].type == BRW_REGISTER_TYPE_UD); 2397bf215546Sopenharmony_ci const unsigned component = src[1].ud; 2398bf215546Sopenharmony_ci const unsigned cluster_size = src[2].ud; 2399bf215546Sopenharmony_ci assert(inst->src[0].file != ARF && inst->src[0].file != FIXED_GRF); 2400bf215546Sopenharmony_ci const unsigned s = inst->src[0].stride; 2401bf215546Sopenharmony_ci unsigned vstride = cluster_size * s; 2402bf215546Sopenharmony_ci unsigned width = cluster_size; 2403bf215546Sopenharmony_ci 2404bf215546Sopenharmony_ci /* The maximum exec_size is 32, but the maximum width is only 16. */ 2405bf215546Sopenharmony_ci if (inst->exec_size == width) { 2406bf215546Sopenharmony_ci vstride = 0; 2407bf215546Sopenharmony_ci width = 1; 2408bf215546Sopenharmony_ci } 2409bf215546Sopenharmony_ci 2410bf215546Sopenharmony_ci struct brw_reg strided = stride(suboffset(src[0], component * s), 2411bf215546Sopenharmony_ci vstride, width, 0); 2412bf215546Sopenharmony_ci brw_MOV(p, dst, strided); 2413bf215546Sopenharmony_ci break; 2414bf215546Sopenharmony_ci } 2415bf215546Sopenharmony_ci 2416bf215546Sopenharmony_ci case FS_OPCODE_SET_SAMPLE_ID: 2417bf215546Sopenharmony_ci generate_set_sample_id(inst, dst, src[0], src[1]); 2418bf215546Sopenharmony_ci break; 2419bf215546Sopenharmony_ci 2420bf215546Sopenharmony_ci case FS_OPCODE_PACK_HALF_2x16_SPLIT: 2421bf215546Sopenharmony_ci generate_pack_half_2x16_split(inst, dst, src[0], src[1]); 2422bf215546Sopenharmony_ci break; 2423bf215546Sopenharmony_ci 2424bf215546Sopenharmony_ci case SHADER_OPCODE_HALT_TARGET: 2425bf215546Sopenharmony_ci /* This is the place where the final HALT needs to be inserted if 2426bf215546Sopenharmony_ci * we've emitted any discards. If not, this will emit no code. 2427bf215546Sopenharmony_ci */ 2428bf215546Sopenharmony_ci if (!patch_halt_jumps()) { 2429bf215546Sopenharmony_ci if (unlikely(debug_flag)) { 2430bf215546Sopenharmony_ci disasm_info->use_tail = true; 2431bf215546Sopenharmony_ci } 2432bf215546Sopenharmony_ci } 2433bf215546Sopenharmony_ci break; 2434bf215546Sopenharmony_ci 2435bf215546Sopenharmony_ci case FS_OPCODE_INTERPOLATE_AT_SAMPLE: 2436bf215546Sopenharmony_ci generate_pixel_interpolator_query(inst, dst, src[0], src[1], 2437bf215546Sopenharmony_ci GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE); 2438bf215546Sopenharmony_ci send_count++; 2439bf215546Sopenharmony_ci break; 2440bf215546Sopenharmony_ci 2441bf215546Sopenharmony_ci case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: 2442bf215546Sopenharmony_ci generate_pixel_interpolator_query(inst, dst, src[0], src[1], 2443bf215546Sopenharmony_ci GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET); 2444bf215546Sopenharmony_ci send_count++; 2445bf215546Sopenharmony_ci break; 2446bf215546Sopenharmony_ci 2447bf215546Sopenharmony_ci case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: 2448bf215546Sopenharmony_ci generate_pixel_interpolator_query(inst, dst, src[0], src[1], 2449bf215546Sopenharmony_ci GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET); 2450bf215546Sopenharmony_ci send_count++; 2451bf215546Sopenharmony_ci break; 2452bf215546Sopenharmony_ci 2453bf215546Sopenharmony_ci case CS_OPCODE_CS_TERMINATE: 2454bf215546Sopenharmony_ci generate_cs_terminate(inst, src[0]); 2455bf215546Sopenharmony_ci send_count++; 2456bf215546Sopenharmony_ci break; 2457bf215546Sopenharmony_ci 2458bf215546Sopenharmony_ci case SHADER_OPCODE_BARRIER: 2459bf215546Sopenharmony_ci generate_barrier(inst, src[0]); 2460bf215546Sopenharmony_ci send_count++; 2461bf215546Sopenharmony_ci break; 2462bf215546Sopenharmony_ci 2463bf215546Sopenharmony_ci case BRW_OPCODE_DIM: 2464bf215546Sopenharmony_ci assert(devinfo->platform == INTEL_PLATFORM_HSW); 2465bf215546Sopenharmony_ci assert(src[0].type == BRW_REGISTER_TYPE_DF); 2466bf215546Sopenharmony_ci assert(dst.type == BRW_REGISTER_TYPE_DF); 2467bf215546Sopenharmony_ci brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F)); 2468bf215546Sopenharmony_ci break; 2469bf215546Sopenharmony_ci 2470bf215546Sopenharmony_ci case SHADER_OPCODE_RND_MODE: { 2471bf215546Sopenharmony_ci assert(src[0].file == BRW_IMMEDIATE_VALUE); 2472bf215546Sopenharmony_ci /* 2473bf215546Sopenharmony_ci * Changes the floating point rounding mode updating the control 2474bf215546Sopenharmony_ci * register field defined at cr0.0[5-6] bits. 2475bf215546Sopenharmony_ci */ 2476bf215546Sopenharmony_ci enum brw_rnd_mode mode = 2477bf215546Sopenharmony_ci (enum brw_rnd_mode) (src[0].d << BRW_CR0_RND_MODE_SHIFT); 2478bf215546Sopenharmony_ci brw_float_controls_mode(p, mode, BRW_CR0_RND_MODE_MASK); 2479bf215546Sopenharmony_ci } 2480bf215546Sopenharmony_ci break; 2481bf215546Sopenharmony_ci 2482bf215546Sopenharmony_ci case SHADER_OPCODE_FLOAT_CONTROL_MODE: 2483bf215546Sopenharmony_ci assert(src[0].file == BRW_IMMEDIATE_VALUE); 2484bf215546Sopenharmony_ci assert(src[1].file == BRW_IMMEDIATE_VALUE); 2485bf215546Sopenharmony_ci brw_float_controls_mode(p, src[0].d, src[1].d); 2486bf215546Sopenharmony_ci break; 2487bf215546Sopenharmony_ci 2488bf215546Sopenharmony_ci case SHADER_OPCODE_READ_SR_REG: 2489bf215546Sopenharmony_ci if (devinfo->ver >= 12) { 2490bf215546Sopenharmony_ci /* There is a SWSB restriction that requires that any time sr0 is 2491bf215546Sopenharmony_ci * accessed both the instruction doing the access and the next one 2492bf215546Sopenharmony_ci * have SWSB set to RegDist(1). 2493bf215546Sopenharmony_ci */ 2494bf215546Sopenharmony_ci if (brw_get_default_swsb(p).mode != TGL_SBID_NULL) 2495bf215546Sopenharmony_ci brw_SYNC(p, TGL_SYNC_NOP); 2496bf215546Sopenharmony_ci assert(src[0].file == BRW_IMMEDIATE_VALUE); 2497bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_regdist(1)); 2498bf215546Sopenharmony_ci brw_MOV(p, dst, brw_sr0_reg(src[0].ud)); 2499bf215546Sopenharmony_ci brw_set_default_swsb(p, tgl_swsb_regdist(1)); 2500bf215546Sopenharmony_ci brw_AND(p, dst, dst, brw_imm_ud(0xffffffff)); 2501bf215546Sopenharmony_ci } else { 2502bf215546Sopenharmony_ci brw_MOV(p, dst, brw_sr0_reg(src[0].ud)); 2503bf215546Sopenharmony_ci } 2504bf215546Sopenharmony_ci break; 2505bf215546Sopenharmony_ci 2506bf215546Sopenharmony_ci default: 2507bf215546Sopenharmony_ci unreachable("Unsupported opcode"); 2508bf215546Sopenharmony_ci 2509bf215546Sopenharmony_ci case SHADER_OPCODE_LOAD_PAYLOAD: 2510bf215546Sopenharmony_ci unreachable("Should be lowered by lower_load_payload()"); 2511bf215546Sopenharmony_ci } 2512bf215546Sopenharmony_ci 2513bf215546Sopenharmony_ci if (multiple_instructions_emitted) 2514bf215546Sopenharmony_ci continue; 2515bf215546Sopenharmony_ci 2516bf215546Sopenharmony_ci if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) { 2517bf215546Sopenharmony_ci assert(p->next_insn_offset == last_insn_offset + 16 || 2518bf215546Sopenharmony_ci !"conditional_mod, no_dd_check, or no_dd_clear set for IR " 2519bf215546Sopenharmony_ci "emitting more than 1 instruction"); 2520bf215546Sopenharmony_ci 2521bf215546Sopenharmony_ci brw_inst *last = &p->store[last_insn_offset / 16]; 2522bf215546Sopenharmony_ci 2523bf215546Sopenharmony_ci if (inst->conditional_mod) 2524bf215546Sopenharmony_ci brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod); 2525bf215546Sopenharmony_ci if (devinfo->ver < 12) { 2526bf215546Sopenharmony_ci brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear); 2527bf215546Sopenharmony_ci brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check); 2528bf215546Sopenharmony_ci } 2529bf215546Sopenharmony_ci } 2530bf215546Sopenharmony_ci } 2531bf215546Sopenharmony_ci 2532bf215546Sopenharmony_ci brw_set_uip_jip(p, start_offset); 2533bf215546Sopenharmony_ci 2534bf215546Sopenharmony_ci /* end of program sentinel */ 2535bf215546Sopenharmony_ci disasm_new_inst_group(disasm_info, p->next_insn_offset); 2536bf215546Sopenharmony_ci 2537bf215546Sopenharmony_ci /* `send_count` explicitly does not include spills or fills, as we'd 2538bf215546Sopenharmony_ci * like to use it as a metric for intentional memory access or other 2539bf215546Sopenharmony_ci * shared function use. Otherwise, subtle changes to scheduling or 2540bf215546Sopenharmony_ci * register allocation could cause it to fluctuate wildly - and that 2541bf215546Sopenharmony_ci * effect is already counted in spill/fill counts. 2542bf215546Sopenharmony_ci */ 2543bf215546Sopenharmony_ci send_count -= shader_stats.spill_count; 2544bf215546Sopenharmony_ci send_count -= shader_stats.fill_count; 2545bf215546Sopenharmony_ci 2546bf215546Sopenharmony_ci#ifndef NDEBUG 2547bf215546Sopenharmony_ci bool validated = 2548bf215546Sopenharmony_ci#else 2549bf215546Sopenharmony_ci if (unlikely(debug_flag)) 2550bf215546Sopenharmony_ci#endif 2551bf215546Sopenharmony_ci brw_validate_instructions(&compiler->isa, p->store, 2552bf215546Sopenharmony_ci start_offset, 2553bf215546Sopenharmony_ci p->next_insn_offset, 2554bf215546Sopenharmony_ci disasm_info); 2555bf215546Sopenharmony_ci 2556bf215546Sopenharmony_ci int before_size = p->next_insn_offset - start_offset; 2557bf215546Sopenharmony_ci brw_compact_instructions(p, start_offset, disasm_info); 2558bf215546Sopenharmony_ci int after_size = p->next_insn_offset - start_offset; 2559bf215546Sopenharmony_ci 2560bf215546Sopenharmony_ci if (unlikely(debug_flag)) { 2561bf215546Sopenharmony_ci unsigned char sha1[21]; 2562bf215546Sopenharmony_ci char sha1buf[41]; 2563bf215546Sopenharmony_ci 2564bf215546Sopenharmony_ci _mesa_sha1_compute(p->store + start_offset / sizeof(brw_inst), 2565bf215546Sopenharmony_ci after_size, sha1); 2566bf215546Sopenharmony_ci _mesa_sha1_format(sha1buf, sha1); 2567bf215546Sopenharmony_ci 2568bf215546Sopenharmony_ci fprintf(stderr, "Native code for %s (sha1 %s)\n" 2569bf215546Sopenharmony_ci "SIMD%d shader: %d instructions. %d loops. %u cycles. " 2570bf215546Sopenharmony_ci "%d:%d spills:fills, %u sends, " 2571bf215546Sopenharmony_ci "scheduled with mode %s. " 2572bf215546Sopenharmony_ci "Promoted %u constants. " 2573bf215546Sopenharmony_ci "Compacted %d to %d bytes (%.0f%%)\n", 2574bf215546Sopenharmony_ci shader_name, sha1buf, 2575bf215546Sopenharmony_ci dispatch_width, before_size / 16, 2576bf215546Sopenharmony_ci loop_count, perf.latency, 2577bf215546Sopenharmony_ci shader_stats.spill_count, 2578bf215546Sopenharmony_ci shader_stats.fill_count, 2579bf215546Sopenharmony_ci send_count, 2580bf215546Sopenharmony_ci shader_stats.scheduler_mode, 2581bf215546Sopenharmony_ci shader_stats.promoted_constants, 2582bf215546Sopenharmony_ci before_size, after_size, 2583bf215546Sopenharmony_ci 100.0f * (before_size - after_size) / before_size); 2584bf215546Sopenharmony_ci 2585bf215546Sopenharmony_ci /* overriding the shader makes disasm_info invalid */ 2586bf215546Sopenharmony_ci if (!brw_try_override_assembly(p, start_offset, sha1buf)) { 2587bf215546Sopenharmony_ci dump_assembly(p->store, start_offset, p->next_insn_offset, 2588bf215546Sopenharmony_ci disasm_info, perf.block_latency); 2589bf215546Sopenharmony_ci } else { 2590bf215546Sopenharmony_ci fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf); 2591bf215546Sopenharmony_ci } 2592bf215546Sopenharmony_ci } 2593bf215546Sopenharmony_ci ralloc_free(disasm_info); 2594bf215546Sopenharmony_ci#ifndef NDEBUG 2595bf215546Sopenharmony_ci if (!validated && !debug_flag) { 2596bf215546Sopenharmony_ci fprintf(stderr, 2597bf215546Sopenharmony_ci "Validation failed. Rerun with INTEL_DEBUG=shaders to get more information.\n"); 2598bf215546Sopenharmony_ci } 2599bf215546Sopenharmony_ci#endif 2600bf215546Sopenharmony_ci assert(validated); 2601bf215546Sopenharmony_ci 2602bf215546Sopenharmony_ci brw_shader_debug_log(compiler, log_data, 2603bf215546Sopenharmony_ci "%s SIMD%d shader: %d inst, %d loops, %u cycles, " 2604bf215546Sopenharmony_ci "%d:%d spills:fills, %u sends, " 2605bf215546Sopenharmony_ci "scheduled with mode %s, " 2606bf215546Sopenharmony_ci "Promoted %u constants, " 2607bf215546Sopenharmony_ci "compacted %d to %d bytes.\n", 2608bf215546Sopenharmony_ci _mesa_shader_stage_to_abbrev(stage), 2609bf215546Sopenharmony_ci dispatch_width, before_size / 16 - nop_count, 2610bf215546Sopenharmony_ci loop_count, perf.latency, 2611bf215546Sopenharmony_ci shader_stats.spill_count, 2612bf215546Sopenharmony_ci shader_stats.fill_count, 2613bf215546Sopenharmony_ci send_count, 2614bf215546Sopenharmony_ci shader_stats.scheduler_mode, 2615bf215546Sopenharmony_ci shader_stats.promoted_constants, 2616bf215546Sopenharmony_ci before_size, after_size); 2617bf215546Sopenharmony_ci if (stats) { 2618bf215546Sopenharmony_ci stats->dispatch_width = dispatch_width; 2619bf215546Sopenharmony_ci stats->instructions = before_size / 16 - nop_count; 2620bf215546Sopenharmony_ci stats->sends = send_count; 2621bf215546Sopenharmony_ci stats->loops = loop_count; 2622bf215546Sopenharmony_ci stats->cycles = perf.latency; 2623bf215546Sopenharmony_ci stats->spills = shader_stats.spill_count; 2624bf215546Sopenharmony_ci stats->fills = shader_stats.fill_count; 2625bf215546Sopenharmony_ci } 2626bf215546Sopenharmony_ci 2627bf215546Sopenharmony_ci return start_offset; 2628bf215546Sopenharmony_ci} 2629bf215546Sopenharmony_ci 2630bf215546Sopenharmony_civoid 2631bf215546Sopenharmony_cifs_generator::add_const_data(void *data, unsigned size) 2632bf215546Sopenharmony_ci{ 2633bf215546Sopenharmony_ci assert(prog_data->const_data_size == 0); 2634bf215546Sopenharmony_ci if (size > 0) { 2635bf215546Sopenharmony_ci prog_data->const_data_size = size; 2636bf215546Sopenharmony_ci prog_data->const_data_offset = brw_append_data(p, data, size, 32); 2637bf215546Sopenharmony_ci } 2638bf215546Sopenharmony_ci} 2639bf215546Sopenharmony_ci 2640bf215546Sopenharmony_civoid 2641bf215546Sopenharmony_cifs_generator::add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt) 2642bf215546Sopenharmony_ci{ 2643bf215546Sopenharmony_ci assert(brw_shader_stage_is_bindless(stage)); 2644bf215546Sopenharmony_ci struct brw_bs_prog_data *bs_prog_data = brw_bs_prog_data(prog_data); 2645bf215546Sopenharmony_ci if (num_resume_shaders > 0) { 2646bf215546Sopenharmony_ci bs_prog_data->resume_sbt_offset = 2647bf215546Sopenharmony_ci brw_append_data(p, sbt, num_resume_shaders * sizeof(uint64_t), 32); 2648bf215546Sopenharmony_ci for (unsigned i = 0; i < num_resume_shaders; i++) { 2649bf215546Sopenharmony_ci size_t offset = bs_prog_data->resume_sbt_offset + i * sizeof(*sbt); 2650bf215546Sopenharmony_ci assert(offset <= UINT32_MAX); 2651bf215546Sopenharmony_ci brw_add_reloc(p, BRW_SHADER_RELOC_SHADER_START_OFFSET, 2652bf215546Sopenharmony_ci BRW_SHADER_RELOC_TYPE_U32, 2653bf215546Sopenharmony_ci (uint32_t)offset, (uint32_t)sbt[i]); 2654bf215546Sopenharmony_ci } 2655bf215546Sopenharmony_ci } 2656bf215546Sopenharmony_ci} 2657bf215546Sopenharmony_ci 2658bf215546Sopenharmony_ciconst unsigned * 2659bf215546Sopenharmony_cifs_generator::get_assembly() 2660bf215546Sopenharmony_ci{ 2661bf215546Sopenharmony_ci prog_data->relocs = brw_get_shader_relocs(p, &prog_data->num_relocs); 2662bf215546Sopenharmony_ci 2663bf215546Sopenharmony_ci return brw_get_program(p, &prog_data->program_size); 2664bf215546Sopenharmony_ci} 2665