1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright © 2010 Intel Corporation 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21bf215546Sopenharmony_ci * IN THE SOFTWARE. 22bf215546Sopenharmony_ci */ 23bf215546Sopenharmony_ci 24bf215546Sopenharmony_ci#include "compiler/glsl/ir.h" 25bf215546Sopenharmony_ci#include "brw_fs.h" 26bf215546Sopenharmony_ci#include "brw_nir.h" 27bf215546Sopenharmony_ci#include "brw_rt.h" 28bf215546Sopenharmony_ci#include "brw_eu.h" 29bf215546Sopenharmony_ci#include "nir_search_helpers.h" 30bf215546Sopenharmony_ci#include "util/u_math.h" 31bf215546Sopenharmony_ci#include "util/bitscan.h" 32bf215546Sopenharmony_ci 33bf215546Sopenharmony_ciusing namespace brw; 34bf215546Sopenharmony_ci 35bf215546Sopenharmony_civoid 36bf215546Sopenharmony_cifs_visitor::emit_nir_code() 37bf215546Sopenharmony_ci{ 38bf215546Sopenharmony_ci emit_shader_float_controls_execution_mode(); 39bf215546Sopenharmony_ci 40bf215546Sopenharmony_ci /* emit the arrays used for inputs and outputs - load/store intrinsics will 41bf215546Sopenharmony_ci * be converted to reads/writes of these arrays 42bf215546Sopenharmony_ci */ 43bf215546Sopenharmony_ci nir_setup_outputs(); 44bf215546Sopenharmony_ci nir_setup_uniforms(); 45bf215546Sopenharmony_ci nir_emit_system_values(); 46bf215546Sopenharmony_ci last_scratch = ALIGN(nir->scratch_size, 4) * dispatch_width; 47bf215546Sopenharmony_ci 48bf215546Sopenharmony_ci nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir)); 49bf215546Sopenharmony_ci 50bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_HALT_TARGET); 51bf215546Sopenharmony_ci} 52bf215546Sopenharmony_ci 53bf215546Sopenharmony_civoid 54bf215546Sopenharmony_cifs_visitor::nir_setup_outputs() 55bf215546Sopenharmony_ci{ 56bf215546Sopenharmony_ci if (stage == MESA_SHADER_TESS_CTRL || 57bf215546Sopenharmony_ci stage == MESA_SHADER_TASK || 58bf215546Sopenharmony_ci stage == MESA_SHADER_MESH || 59bf215546Sopenharmony_ci stage == MESA_SHADER_FRAGMENT) 60bf215546Sopenharmony_ci return; 61bf215546Sopenharmony_ci 62bf215546Sopenharmony_ci unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, }; 63bf215546Sopenharmony_ci 64bf215546Sopenharmony_ci /* Calculate the size of output registers in a separate pass, before 65bf215546Sopenharmony_ci * allocating them. With ARB_enhanced_layouts, multiple output variables 66bf215546Sopenharmony_ci * may occupy the same slot, but have different type sizes. 67bf215546Sopenharmony_ci */ 68bf215546Sopenharmony_ci nir_foreach_shader_out_variable(var, nir) { 69bf215546Sopenharmony_ci const int loc = var->data.driver_location; 70bf215546Sopenharmony_ci const unsigned var_vec4s = 71bf215546Sopenharmony_ci var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4) 72bf215546Sopenharmony_ci : type_size_vec4(var->type, true); 73bf215546Sopenharmony_ci vec4s[loc] = MAX2(vec4s[loc], var_vec4s); 74bf215546Sopenharmony_ci } 75bf215546Sopenharmony_ci 76bf215546Sopenharmony_ci for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) { 77bf215546Sopenharmony_ci if (vec4s[loc] == 0) { 78bf215546Sopenharmony_ci loc++; 79bf215546Sopenharmony_ci continue; 80bf215546Sopenharmony_ci } 81bf215546Sopenharmony_ci 82bf215546Sopenharmony_ci unsigned reg_size = vec4s[loc]; 83bf215546Sopenharmony_ci 84bf215546Sopenharmony_ci /* Check if there are any ranges that start within this range and extend 85bf215546Sopenharmony_ci * past it. If so, include them in this allocation. 86bf215546Sopenharmony_ci */ 87bf215546Sopenharmony_ci for (unsigned i = 1; i < reg_size; i++) { 88bf215546Sopenharmony_ci assert(i + loc < ARRAY_SIZE(vec4s)); 89bf215546Sopenharmony_ci reg_size = MAX2(vec4s[i + loc] + i, reg_size); 90bf215546Sopenharmony_ci } 91bf215546Sopenharmony_ci 92bf215546Sopenharmony_ci fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size); 93bf215546Sopenharmony_ci for (unsigned i = 0; i < reg_size; i++) { 94bf215546Sopenharmony_ci assert(loc + i < ARRAY_SIZE(outputs)); 95bf215546Sopenharmony_ci outputs[loc + i] = offset(reg, bld, 4 * i); 96bf215546Sopenharmony_ci } 97bf215546Sopenharmony_ci 98bf215546Sopenharmony_ci loc += reg_size; 99bf215546Sopenharmony_ci } 100bf215546Sopenharmony_ci} 101bf215546Sopenharmony_ci 102bf215546Sopenharmony_civoid 103bf215546Sopenharmony_cifs_visitor::nir_setup_uniforms() 104bf215546Sopenharmony_ci{ 105bf215546Sopenharmony_ci /* Only the first compile gets to set up uniforms. */ 106bf215546Sopenharmony_ci if (push_constant_loc) 107bf215546Sopenharmony_ci return; 108bf215546Sopenharmony_ci 109bf215546Sopenharmony_ci uniforms = nir->num_uniforms / 4; 110bf215546Sopenharmony_ci 111bf215546Sopenharmony_ci if (gl_shader_stage_is_compute(stage) && devinfo->verx10 < 125) { 112bf215546Sopenharmony_ci /* Add uniforms for builtins after regular NIR uniforms. */ 113bf215546Sopenharmony_ci assert(uniforms == prog_data->nr_params); 114bf215546Sopenharmony_ci 115bf215546Sopenharmony_ci uint32_t *param; 116bf215546Sopenharmony_ci if (nir->info.workgroup_size_variable && 117bf215546Sopenharmony_ci compiler->lower_variable_group_size) { 118bf215546Sopenharmony_ci param = brw_stage_prog_data_add_params(prog_data, 3); 119bf215546Sopenharmony_ci for (unsigned i = 0; i < 3; i++) { 120bf215546Sopenharmony_ci param[i] = (BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i); 121bf215546Sopenharmony_ci group_size[i] = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD); 122bf215546Sopenharmony_ci } 123bf215546Sopenharmony_ci } 124bf215546Sopenharmony_ci 125bf215546Sopenharmony_ci /* Subgroup ID must be the last uniform on the list. This will make 126bf215546Sopenharmony_ci * easier later to split between cross thread and per thread 127bf215546Sopenharmony_ci * uniforms. 128bf215546Sopenharmony_ci */ 129bf215546Sopenharmony_ci param = brw_stage_prog_data_add_params(prog_data, 1); 130bf215546Sopenharmony_ci *param = BRW_PARAM_BUILTIN_SUBGROUP_ID; 131bf215546Sopenharmony_ci subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD); 132bf215546Sopenharmony_ci } 133bf215546Sopenharmony_ci} 134bf215546Sopenharmony_ci 135bf215546Sopenharmony_cistatic bool 136bf215546Sopenharmony_ciemit_system_values_block(nir_block *block, fs_visitor *v) 137bf215546Sopenharmony_ci{ 138bf215546Sopenharmony_ci fs_reg *reg; 139bf215546Sopenharmony_ci 140bf215546Sopenharmony_ci nir_foreach_instr(instr, block) { 141bf215546Sopenharmony_ci if (instr->type != nir_instr_type_intrinsic) 142bf215546Sopenharmony_ci continue; 143bf215546Sopenharmony_ci 144bf215546Sopenharmony_ci nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); 145bf215546Sopenharmony_ci switch (intrin->intrinsic) { 146bf215546Sopenharmony_ci case nir_intrinsic_load_vertex_id: 147bf215546Sopenharmony_ci case nir_intrinsic_load_base_vertex: 148bf215546Sopenharmony_ci unreachable("should be lowered by nir_lower_system_values()."); 149bf215546Sopenharmony_ci 150bf215546Sopenharmony_ci case nir_intrinsic_load_vertex_id_zero_base: 151bf215546Sopenharmony_ci case nir_intrinsic_load_is_indexed_draw: 152bf215546Sopenharmony_ci case nir_intrinsic_load_first_vertex: 153bf215546Sopenharmony_ci case nir_intrinsic_load_instance_id: 154bf215546Sopenharmony_ci case nir_intrinsic_load_base_instance: 155bf215546Sopenharmony_ci unreachable("should be lowered by brw_nir_lower_vs_inputs()."); 156bf215546Sopenharmony_ci break; 157bf215546Sopenharmony_ci 158bf215546Sopenharmony_ci case nir_intrinsic_load_draw_id: 159bf215546Sopenharmony_ci /* For Task/Mesh, draw_id will be handled later in 160bf215546Sopenharmony_ci * nir_emit_mesh_task_intrinsic(). 161bf215546Sopenharmony_ci */ 162bf215546Sopenharmony_ci if (!gl_shader_stage_is_mesh(v->stage)) 163bf215546Sopenharmony_ci unreachable("should be lowered by brw_nir_lower_vs_inputs()."); 164bf215546Sopenharmony_ci break; 165bf215546Sopenharmony_ci 166bf215546Sopenharmony_ci case nir_intrinsic_load_invocation_id: 167bf215546Sopenharmony_ci if (v->stage == MESA_SHADER_TESS_CTRL) 168bf215546Sopenharmony_ci break; 169bf215546Sopenharmony_ci assert(v->stage == MESA_SHADER_GEOMETRY); 170bf215546Sopenharmony_ci reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID]; 171bf215546Sopenharmony_ci if (reg->file == BAD_FILE) { 172bf215546Sopenharmony_ci const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL); 173bf215546Sopenharmony_ci fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); 174bf215546Sopenharmony_ci fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); 175bf215546Sopenharmony_ci abld.SHR(iid, g1, brw_imm_ud(27u)); 176bf215546Sopenharmony_ci *reg = iid; 177bf215546Sopenharmony_ci } 178bf215546Sopenharmony_ci break; 179bf215546Sopenharmony_ci 180bf215546Sopenharmony_ci case nir_intrinsic_load_sample_pos: 181bf215546Sopenharmony_ci case nir_intrinsic_load_sample_pos_or_center: 182bf215546Sopenharmony_ci assert(v->stage == MESA_SHADER_FRAGMENT); 183bf215546Sopenharmony_ci reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS]; 184bf215546Sopenharmony_ci if (reg->file == BAD_FILE) 185bf215546Sopenharmony_ci *reg = v->emit_samplepos_setup(); 186bf215546Sopenharmony_ci break; 187bf215546Sopenharmony_ci 188bf215546Sopenharmony_ci case nir_intrinsic_load_sample_id: 189bf215546Sopenharmony_ci assert(v->stage == MESA_SHADER_FRAGMENT); 190bf215546Sopenharmony_ci reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID]; 191bf215546Sopenharmony_ci if (reg->file == BAD_FILE) 192bf215546Sopenharmony_ci *reg = v->emit_sampleid_setup(); 193bf215546Sopenharmony_ci break; 194bf215546Sopenharmony_ci 195bf215546Sopenharmony_ci case nir_intrinsic_load_sample_mask_in: 196bf215546Sopenharmony_ci assert(v->stage == MESA_SHADER_FRAGMENT); 197bf215546Sopenharmony_ci assert(v->devinfo->ver >= 7); 198bf215546Sopenharmony_ci reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN]; 199bf215546Sopenharmony_ci if (reg->file == BAD_FILE) 200bf215546Sopenharmony_ci *reg = v->emit_samplemaskin_setup(); 201bf215546Sopenharmony_ci break; 202bf215546Sopenharmony_ci 203bf215546Sopenharmony_ci case nir_intrinsic_load_workgroup_id: 204bf215546Sopenharmony_ci assert(gl_shader_stage_uses_workgroup(v->stage)); 205bf215546Sopenharmony_ci reg = &v->nir_system_values[SYSTEM_VALUE_WORKGROUP_ID]; 206bf215546Sopenharmony_ci if (reg->file == BAD_FILE) 207bf215546Sopenharmony_ci *reg = v->emit_work_group_id_setup(); 208bf215546Sopenharmony_ci break; 209bf215546Sopenharmony_ci 210bf215546Sopenharmony_ci case nir_intrinsic_load_helper_invocation: 211bf215546Sopenharmony_ci assert(v->stage == MESA_SHADER_FRAGMENT); 212bf215546Sopenharmony_ci reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION]; 213bf215546Sopenharmony_ci if (reg->file == BAD_FILE) { 214bf215546Sopenharmony_ci const fs_builder abld = 215bf215546Sopenharmony_ci v->bld.annotate("gl_HelperInvocation", NULL); 216bf215546Sopenharmony_ci 217bf215546Sopenharmony_ci /* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the 218bf215546Sopenharmony_ci * pixel mask is in g1.7 of the thread payload. 219bf215546Sopenharmony_ci * 220bf215546Sopenharmony_ci * We move the per-channel pixel enable bit to the low bit of each 221bf215546Sopenharmony_ci * channel by shifting the byte containing the pixel mask by the 222bf215546Sopenharmony_ci * vector immediate 0x76543210UV. 223bf215546Sopenharmony_ci * 224bf215546Sopenharmony_ci * The region of <1,8,0> reads only 1 byte (the pixel masks for 225bf215546Sopenharmony_ci * subspans 0 and 1) in SIMD8 and an additional byte (the pixel 226bf215546Sopenharmony_ci * masks for 2 and 3) in SIMD16. 227bf215546Sopenharmony_ci */ 228bf215546Sopenharmony_ci fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1); 229bf215546Sopenharmony_ci 230bf215546Sopenharmony_ci for (unsigned i = 0; i < DIV_ROUND_UP(v->dispatch_width, 16); i++) { 231bf215546Sopenharmony_ci const fs_builder hbld = abld.group(MIN2(16, v->dispatch_width), i); 232bf215546Sopenharmony_ci hbld.SHR(offset(shifted, hbld, i), 233bf215546Sopenharmony_ci stride(retype(brw_vec1_grf(1 + i, 7), 234bf215546Sopenharmony_ci BRW_REGISTER_TYPE_UB), 235bf215546Sopenharmony_ci 1, 8, 0), 236bf215546Sopenharmony_ci brw_imm_v(0x76543210)); 237bf215546Sopenharmony_ci } 238bf215546Sopenharmony_ci 239bf215546Sopenharmony_ci /* A set bit in the pixel mask means the channel is enabled, but 240bf215546Sopenharmony_ci * that is the opposite of gl_HelperInvocation so we need to invert 241bf215546Sopenharmony_ci * the mask. 242bf215546Sopenharmony_ci * 243bf215546Sopenharmony_ci * The negate source-modifier bit of logical instructions on Gfx8+ 244bf215546Sopenharmony_ci * performs 1's complement negation, so we can use that instead of 245bf215546Sopenharmony_ci * a NOT instruction. 246bf215546Sopenharmony_ci */ 247bf215546Sopenharmony_ci fs_reg inverted = negate(shifted); 248bf215546Sopenharmony_ci if (v->devinfo->ver < 8) { 249bf215546Sopenharmony_ci inverted = abld.vgrf(BRW_REGISTER_TYPE_UW); 250bf215546Sopenharmony_ci abld.NOT(inverted, shifted); 251bf215546Sopenharmony_ci } 252bf215546Sopenharmony_ci 253bf215546Sopenharmony_ci /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing 254bf215546Sopenharmony_ci * with 1 and negating. 255bf215546Sopenharmony_ci */ 256bf215546Sopenharmony_ci fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); 257bf215546Sopenharmony_ci abld.AND(anded, inverted, brw_imm_uw(1)); 258bf215546Sopenharmony_ci 259bf215546Sopenharmony_ci fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1); 260bf215546Sopenharmony_ci abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D))); 261bf215546Sopenharmony_ci *reg = dst; 262bf215546Sopenharmony_ci } 263bf215546Sopenharmony_ci break; 264bf215546Sopenharmony_ci 265bf215546Sopenharmony_ci case nir_intrinsic_load_frag_shading_rate: 266bf215546Sopenharmony_ci reg = &v->nir_system_values[SYSTEM_VALUE_FRAG_SHADING_RATE]; 267bf215546Sopenharmony_ci if (reg->file == BAD_FILE) 268bf215546Sopenharmony_ci *reg = v->emit_shading_rate_setup(); 269bf215546Sopenharmony_ci break; 270bf215546Sopenharmony_ci 271bf215546Sopenharmony_ci default: 272bf215546Sopenharmony_ci break; 273bf215546Sopenharmony_ci } 274bf215546Sopenharmony_ci } 275bf215546Sopenharmony_ci 276bf215546Sopenharmony_ci return true; 277bf215546Sopenharmony_ci} 278bf215546Sopenharmony_ci 279bf215546Sopenharmony_civoid 280bf215546Sopenharmony_cifs_visitor::nir_emit_system_values() 281bf215546Sopenharmony_ci{ 282bf215546Sopenharmony_ci nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX); 283bf215546Sopenharmony_ci for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) { 284bf215546Sopenharmony_ci nir_system_values[i] = fs_reg(); 285bf215546Sopenharmony_ci } 286bf215546Sopenharmony_ci 287bf215546Sopenharmony_ci /* Always emit SUBGROUP_INVOCATION. Dead code will clean it up if we 288bf215546Sopenharmony_ci * never end up using it. 289bf215546Sopenharmony_ci */ 290bf215546Sopenharmony_ci { 291bf215546Sopenharmony_ci const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL); 292bf215546Sopenharmony_ci fs_reg ® = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]; 293bf215546Sopenharmony_ci reg = abld.vgrf(BRW_REGISTER_TYPE_UW); 294bf215546Sopenharmony_ci 295bf215546Sopenharmony_ci const fs_builder allbld8 = abld.group(8, 0).exec_all(); 296bf215546Sopenharmony_ci allbld8.MOV(reg, brw_imm_v(0x76543210)); 297bf215546Sopenharmony_ci if (dispatch_width > 8) 298bf215546Sopenharmony_ci allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u)); 299bf215546Sopenharmony_ci if (dispatch_width > 16) { 300bf215546Sopenharmony_ci const fs_builder allbld16 = abld.group(16, 0).exec_all(); 301bf215546Sopenharmony_ci allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u)); 302bf215546Sopenharmony_ci } 303bf215546Sopenharmony_ci } 304bf215546Sopenharmony_ci 305bf215546Sopenharmony_ci nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)nir); 306bf215546Sopenharmony_ci nir_foreach_block(block, impl) 307bf215546Sopenharmony_ci emit_system_values_block(block, this); 308bf215546Sopenharmony_ci} 309bf215546Sopenharmony_ci 310bf215546Sopenharmony_civoid 311bf215546Sopenharmony_cifs_visitor::nir_emit_impl(nir_function_impl *impl) 312bf215546Sopenharmony_ci{ 313bf215546Sopenharmony_ci nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc); 314bf215546Sopenharmony_ci for (unsigned i = 0; i < impl->reg_alloc; i++) { 315bf215546Sopenharmony_ci nir_locals[i] = fs_reg(); 316bf215546Sopenharmony_ci } 317bf215546Sopenharmony_ci 318bf215546Sopenharmony_ci foreach_list_typed(nir_register, reg, node, &impl->registers) { 319bf215546Sopenharmony_ci unsigned array_elems = 320bf215546Sopenharmony_ci reg->num_array_elems == 0 ? 1 : reg->num_array_elems; 321bf215546Sopenharmony_ci unsigned size = array_elems * reg->num_components; 322bf215546Sopenharmony_ci const brw_reg_type reg_type = reg->bit_size == 8 ? BRW_REGISTER_TYPE_B : 323bf215546Sopenharmony_ci brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F); 324bf215546Sopenharmony_ci nir_locals[reg->index] = bld.vgrf(reg_type, size); 325bf215546Sopenharmony_ci } 326bf215546Sopenharmony_ci 327bf215546Sopenharmony_ci nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg, 328bf215546Sopenharmony_ci impl->ssa_alloc); 329bf215546Sopenharmony_ci 330bf215546Sopenharmony_ci nir_emit_cf_list(&impl->body); 331bf215546Sopenharmony_ci} 332bf215546Sopenharmony_ci 333bf215546Sopenharmony_civoid 334bf215546Sopenharmony_cifs_visitor::nir_emit_cf_list(exec_list *list) 335bf215546Sopenharmony_ci{ 336bf215546Sopenharmony_ci exec_list_validate(list); 337bf215546Sopenharmony_ci foreach_list_typed(nir_cf_node, node, node, list) { 338bf215546Sopenharmony_ci switch (node->type) { 339bf215546Sopenharmony_ci case nir_cf_node_if: 340bf215546Sopenharmony_ci nir_emit_if(nir_cf_node_as_if(node)); 341bf215546Sopenharmony_ci break; 342bf215546Sopenharmony_ci 343bf215546Sopenharmony_ci case nir_cf_node_loop: 344bf215546Sopenharmony_ci nir_emit_loop(nir_cf_node_as_loop(node)); 345bf215546Sopenharmony_ci break; 346bf215546Sopenharmony_ci 347bf215546Sopenharmony_ci case nir_cf_node_block: 348bf215546Sopenharmony_ci nir_emit_block(nir_cf_node_as_block(node)); 349bf215546Sopenharmony_ci break; 350bf215546Sopenharmony_ci 351bf215546Sopenharmony_ci default: 352bf215546Sopenharmony_ci unreachable("Invalid CFG node block"); 353bf215546Sopenharmony_ci } 354bf215546Sopenharmony_ci } 355bf215546Sopenharmony_ci} 356bf215546Sopenharmony_ci 357bf215546Sopenharmony_civoid 358bf215546Sopenharmony_cifs_visitor::nir_emit_if(nir_if *if_stmt) 359bf215546Sopenharmony_ci{ 360bf215546Sopenharmony_ci bool invert; 361bf215546Sopenharmony_ci fs_reg cond_reg; 362bf215546Sopenharmony_ci 363bf215546Sopenharmony_ci /* If the condition has the form !other_condition, use other_condition as 364bf215546Sopenharmony_ci * the source, but invert the predicate on the if instruction. 365bf215546Sopenharmony_ci */ 366bf215546Sopenharmony_ci nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition); 367bf215546Sopenharmony_ci if (cond != NULL && cond->op == nir_op_inot) { 368bf215546Sopenharmony_ci invert = true; 369bf215546Sopenharmony_ci cond_reg = get_nir_src(cond->src[0].src); 370bf215546Sopenharmony_ci cond_reg = offset(cond_reg, bld, cond->src[0].swizzle[0]); 371bf215546Sopenharmony_ci } else { 372bf215546Sopenharmony_ci invert = false; 373bf215546Sopenharmony_ci cond_reg = get_nir_src(if_stmt->condition); 374bf215546Sopenharmony_ci } 375bf215546Sopenharmony_ci 376bf215546Sopenharmony_ci /* first, put the condition into f0 */ 377bf215546Sopenharmony_ci fs_inst *inst = bld.MOV(bld.null_reg_d(), 378bf215546Sopenharmony_ci retype(cond_reg, BRW_REGISTER_TYPE_D)); 379bf215546Sopenharmony_ci inst->conditional_mod = BRW_CONDITIONAL_NZ; 380bf215546Sopenharmony_ci 381bf215546Sopenharmony_ci bld.IF(BRW_PREDICATE_NORMAL)->predicate_inverse = invert; 382bf215546Sopenharmony_ci 383bf215546Sopenharmony_ci nir_emit_cf_list(&if_stmt->then_list); 384bf215546Sopenharmony_ci 385bf215546Sopenharmony_ci if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) { 386bf215546Sopenharmony_ci bld.emit(BRW_OPCODE_ELSE); 387bf215546Sopenharmony_ci nir_emit_cf_list(&if_stmt->else_list); 388bf215546Sopenharmony_ci } 389bf215546Sopenharmony_ci 390bf215546Sopenharmony_ci bld.emit(BRW_OPCODE_ENDIF); 391bf215546Sopenharmony_ci 392bf215546Sopenharmony_ci if (devinfo->ver < 7) 393bf215546Sopenharmony_ci limit_dispatch_width(16, "Non-uniform control flow unsupported " 394bf215546Sopenharmony_ci "in SIMD32 mode."); 395bf215546Sopenharmony_ci} 396bf215546Sopenharmony_ci 397bf215546Sopenharmony_civoid 398bf215546Sopenharmony_cifs_visitor::nir_emit_loop(nir_loop *loop) 399bf215546Sopenharmony_ci{ 400bf215546Sopenharmony_ci bld.emit(BRW_OPCODE_DO); 401bf215546Sopenharmony_ci 402bf215546Sopenharmony_ci nir_emit_cf_list(&loop->body); 403bf215546Sopenharmony_ci 404bf215546Sopenharmony_ci bld.emit(BRW_OPCODE_WHILE); 405bf215546Sopenharmony_ci 406bf215546Sopenharmony_ci if (devinfo->ver < 7) 407bf215546Sopenharmony_ci limit_dispatch_width(16, "Non-uniform control flow unsupported " 408bf215546Sopenharmony_ci "in SIMD32 mode."); 409bf215546Sopenharmony_ci} 410bf215546Sopenharmony_ci 411bf215546Sopenharmony_civoid 412bf215546Sopenharmony_cifs_visitor::nir_emit_block(nir_block *block) 413bf215546Sopenharmony_ci{ 414bf215546Sopenharmony_ci nir_foreach_instr(instr, block) { 415bf215546Sopenharmony_ci nir_emit_instr(instr); 416bf215546Sopenharmony_ci } 417bf215546Sopenharmony_ci} 418bf215546Sopenharmony_ci 419bf215546Sopenharmony_civoid 420bf215546Sopenharmony_cifs_visitor::nir_emit_instr(nir_instr *instr) 421bf215546Sopenharmony_ci{ 422bf215546Sopenharmony_ci const fs_builder abld = bld.annotate(NULL, instr); 423bf215546Sopenharmony_ci 424bf215546Sopenharmony_ci switch (instr->type) { 425bf215546Sopenharmony_ci case nir_instr_type_alu: 426bf215546Sopenharmony_ci nir_emit_alu(abld, nir_instr_as_alu(instr), true); 427bf215546Sopenharmony_ci break; 428bf215546Sopenharmony_ci 429bf215546Sopenharmony_ci case nir_instr_type_deref: 430bf215546Sopenharmony_ci unreachable("All derefs should've been lowered"); 431bf215546Sopenharmony_ci break; 432bf215546Sopenharmony_ci 433bf215546Sopenharmony_ci case nir_instr_type_intrinsic: 434bf215546Sopenharmony_ci switch (stage) { 435bf215546Sopenharmony_ci case MESA_SHADER_VERTEX: 436bf215546Sopenharmony_ci nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 437bf215546Sopenharmony_ci break; 438bf215546Sopenharmony_ci case MESA_SHADER_TESS_CTRL: 439bf215546Sopenharmony_ci nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 440bf215546Sopenharmony_ci break; 441bf215546Sopenharmony_ci case MESA_SHADER_TESS_EVAL: 442bf215546Sopenharmony_ci nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr)); 443bf215546Sopenharmony_ci break; 444bf215546Sopenharmony_ci case MESA_SHADER_GEOMETRY: 445bf215546Sopenharmony_ci nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 446bf215546Sopenharmony_ci break; 447bf215546Sopenharmony_ci case MESA_SHADER_FRAGMENT: 448bf215546Sopenharmony_ci nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 449bf215546Sopenharmony_ci break; 450bf215546Sopenharmony_ci case MESA_SHADER_COMPUTE: 451bf215546Sopenharmony_ci case MESA_SHADER_KERNEL: 452bf215546Sopenharmony_ci nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 453bf215546Sopenharmony_ci break; 454bf215546Sopenharmony_ci case MESA_SHADER_RAYGEN: 455bf215546Sopenharmony_ci case MESA_SHADER_ANY_HIT: 456bf215546Sopenharmony_ci case MESA_SHADER_CLOSEST_HIT: 457bf215546Sopenharmony_ci case MESA_SHADER_MISS: 458bf215546Sopenharmony_ci case MESA_SHADER_INTERSECTION: 459bf215546Sopenharmony_ci case MESA_SHADER_CALLABLE: 460bf215546Sopenharmony_ci nir_emit_bs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 461bf215546Sopenharmony_ci break; 462bf215546Sopenharmony_ci case MESA_SHADER_TASK: 463bf215546Sopenharmony_ci nir_emit_task_intrinsic(abld, nir_instr_as_intrinsic(instr)); 464bf215546Sopenharmony_ci break; 465bf215546Sopenharmony_ci case MESA_SHADER_MESH: 466bf215546Sopenharmony_ci nir_emit_mesh_intrinsic(abld, nir_instr_as_intrinsic(instr)); 467bf215546Sopenharmony_ci break; 468bf215546Sopenharmony_ci default: 469bf215546Sopenharmony_ci unreachable("unsupported shader stage"); 470bf215546Sopenharmony_ci } 471bf215546Sopenharmony_ci break; 472bf215546Sopenharmony_ci 473bf215546Sopenharmony_ci case nir_instr_type_tex: 474bf215546Sopenharmony_ci nir_emit_texture(abld, nir_instr_as_tex(instr)); 475bf215546Sopenharmony_ci break; 476bf215546Sopenharmony_ci 477bf215546Sopenharmony_ci case nir_instr_type_load_const: 478bf215546Sopenharmony_ci nir_emit_load_const(abld, nir_instr_as_load_const(instr)); 479bf215546Sopenharmony_ci break; 480bf215546Sopenharmony_ci 481bf215546Sopenharmony_ci case nir_instr_type_ssa_undef: 482bf215546Sopenharmony_ci /* We create a new VGRF for undefs on every use (by handling 483bf215546Sopenharmony_ci * them in get_nir_src()), rather than for each definition. 484bf215546Sopenharmony_ci * This helps register coalescing eliminate MOVs from undef. 485bf215546Sopenharmony_ci */ 486bf215546Sopenharmony_ci break; 487bf215546Sopenharmony_ci 488bf215546Sopenharmony_ci case nir_instr_type_jump: 489bf215546Sopenharmony_ci nir_emit_jump(abld, nir_instr_as_jump(instr)); 490bf215546Sopenharmony_ci break; 491bf215546Sopenharmony_ci 492bf215546Sopenharmony_ci default: 493bf215546Sopenharmony_ci unreachable("unknown instruction type"); 494bf215546Sopenharmony_ci } 495bf215546Sopenharmony_ci} 496bf215546Sopenharmony_ci 497bf215546Sopenharmony_ci/** 498bf215546Sopenharmony_ci * Recognizes a parent instruction of nir_op_extract_* and changes the type to 499bf215546Sopenharmony_ci * match instr. 500bf215546Sopenharmony_ci */ 501bf215546Sopenharmony_cibool 502bf215546Sopenharmony_cifs_visitor::optimize_extract_to_float(nir_alu_instr *instr, 503bf215546Sopenharmony_ci const fs_reg &result) 504bf215546Sopenharmony_ci{ 505bf215546Sopenharmony_ci if (!instr->src[0].src.is_ssa || 506bf215546Sopenharmony_ci !instr->src[0].src.ssa->parent_instr) 507bf215546Sopenharmony_ci return false; 508bf215546Sopenharmony_ci 509bf215546Sopenharmony_ci if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu) 510bf215546Sopenharmony_ci return false; 511bf215546Sopenharmony_ci 512bf215546Sopenharmony_ci nir_alu_instr *src0 = 513bf215546Sopenharmony_ci nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); 514bf215546Sopenharmony_ci 515bf215546Sopenharmony_ci if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 && 516bf215546Sopenharmony_ci src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16) 517bf215546Sopenharmony_ci return false; 518bf215546Sopenharmony_ci 519bf215546Sopenharmony_ci unsigned element = nir_src_as_uint(src0->src[1].src); 520bf215546Sopenharmony_ci 521bf215546Sopenharmony_ci /* Element type to extract.*/ 522bf215546Sopenharmony_ci const brw_reg_type type = brw_int_type( 523bf215546Sopenharmony_ci src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1, 524bf215546Sopenharmony_ci src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8); 525bf215546Sopenharmony_ci 526bf215546Sopenharmony_ci fs_reg op0 = get_nir_src(src0->src[0].src); 527bf215546Sopenharmony_ci op0.type = brw_type_for_nir_type(devinfo, 528bf215546Sopenharmony_ci (nir_alu_type)(nir_op_infos[src0->op].input_types[0] | 529bf215546Sopenharmony_ci nir_src_bit_size(src0->src[0].src))); 530bf215546Sopenharmony_ci op0 = offset(op0, bld, src0->src[0].swizzle[0]); 531bf215546Sopenharmony_ci 532bf215546Sopenharmony_ci bld.MOV(result, subscript(op0, type, element)); 533bf215546Sopenharmony_ci return true; 534bf215546Sopenharmony_ci} 535bf215546Sopenharmony_ci 536bf215546Sopenharmony_cibool 537bf215546Sopenharmony_cifs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr, 538bf215546Sopenharmony_ci const fs_reg &result) 539bf215546Sopenharmony_ci{ 540bf215546Sopenharmony_ci nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src); 541bf215546Sopenharmony_ci if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face) 542bf215546Sopenharmony_ci return false; 543bf215546Sopenharmony_ci 544bf215546Sopenharmony_ci if (!nir_src_is_const(instr->src[1].src) || 545bf215546Sopenharmony_ci !nir_src_is_const(instr->src[2].src)) 546bf215546Sopenharmony_ci return false; 547bf215546Sopenharmony_ci 548bf215546Sopenharmony_ci const float value1 = nir_src_as_float(instr->src[1].src); 549bf215546Sopenharmony_ci const float value2 = nir_src_as_float(instr->src[2].src); 550bf215546Sopenharmony_ci if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f) 551bf215546Sopenharmony_ci return false; 552bf215546Sopenharmony_ci 553bf215546Sopenharmony_ci /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */ 554bf215546Sopenharmony_ci assert(value1 == -value2); 555bf215546Sopenharmony_ci 556bf215546Sopenharmony_ci fs_reg tmp = vgrf(glsl_type::int_type); 557bf215546Sopenharmony_ci 558bf215546Sopenharmony_ci if (devinfo->ver >= 12) { 559bf215546Sopenharmony_ci /* Bit 15 of g1.1 is 0 if the polygon is front facing. */ 560bf215546Sopenharmony_ci fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W)); 561bf215546Sopenharmony_ci 562bf215546Sopenharmony_ci /* For (gl_FrontFacing ? 1.0 : -1.0), emit: 563bf215546Sopenharmony_ci * 564bf215546Sopenharmony_ci * or(8) tmp.1<2>W g1.1<0,1,0>W 0x00003f80W 565bf215546Sopenharmony_ci * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D 566bf215546Sopenharmony_ci * 567bf215546Sopenharmony_ci * and negate g1.1<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0). 568bf215546Sopenharmony_ci */ 569bf215546Sopenharmony_ci if (value1 == -1.0f) 570bf215546Sopenharmony_ci g1.negate = true; 571bf215546Sopenharmony_ci 572bf215546Sopenharmony_ci bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1), 573bf215546Sopenharmony_ci g1, brw_imm_uw(0x3f80)); 574bf215546Sopenharmony_ci } else if (devinfo->ver >= 6) { 575bf215546Sopenharmony_ci /* Bit 15 of g0.0 is 0 if the polygon is front facing. */ 576bf215546Sopenharmony_ci fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W)); 577bf215546Sopenharmony_ci 578bf215546Sopenharmony_ci /* For (gl_FrontFacing ? 1.0 : -1.0), emit: 579bf215546Sopenharmony_ci * 580bf215546Sopenharmony_ci * or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W 581bf215546Sopenharmony_ci * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D 582bf215546Sopenharmony_ci * 583bf215546Sopenharmony_ci * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0). 584bf215546Sopenharmony_ci * 585bf215546Sopenharmony_ci * This negation looks like it's safe in practice, because bits 0:4 will 586bf215546Sopenharmony_ci * surely be TRIANGLES 587bf215546Sopenharmony_ci */ 588bf215546Sopenharmony_ci 589bf215546Sopenharmony_ci if (value1 == -1.0f) { 590bf215546Sopenharmony_ci g0.negate = true; 591bf215546Sopenharmony_ci } 592bf215546Sopenharmony_ci 593bf215546Sopenharmony_ci bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1), 594bf215546Sopenharmony_ci g0, brw_imm_uw(0x3f80)); 595bf215546Sopenharmony_ci } else { 596bf215546Sopenharmony_ci /* Bit 31 of g1.6 is 0 if the polygon is front facing. */ 597bf215546Sopenharmony_ci fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D)); 598bf215546Sopenharmony_ci 599bf215546Sopenharmony_ci /* For (gl_FrontFacing ? 1.0 : -1.0), emit: 600bf215546Sopenharmony_ci * 601bf215546Sopenharmony_ci * or(8) tmp<1>D g1.6<0,1,0>D 0x3f800000D 602bf215546Sopenharmony_ci * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D 603bf215546Sopenharmony_ci * 604bf215546Sopenharmony_ci * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0). 605bf215546Sopenharmony_ci * 606bf215546Sopenharmony_ci * This negation looks like it's safe in practice, because bits 0:4 will 607bf215546Sopenharmony_ci * surely be TRIANGLES 608bf215546Sopenharmony_ci */ 609bf215546Sopenharmony_ci 610bf215546Sopenharmony_ci if (value1 == -1.0f) { 611bf215546Sopenharmony_ci g1_6.negate = true; 612bf215546Sopenharmony_ci } 613bf215546Sopenharmony_ci 614bf215546Sopenharmony_ci bld.OR(tmp, g1_6, brw_imm_d(0x3f800000)); 615bf215546Sopenharmony_ci } 616bf215546Sopenharmony_ci bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000)); 617bf215546Sopenharmony_ci 618bf215546Sopenharmony_ci return true; 619bf215546Sopenharmony_ci} 620bf215546Sopenharmony_ci 621bf215546Sopenharmony_cistatic void 622bf215546Sopenharmony_ciemit_find_msb_using_lzd(const fs_builder &bld, 623bf215546Sopenharmony_ci const fs_reg &result, 624bf215546Sopenharmony_ci const fs_reg &src, 625bf215546Sopenharmony_ci bool is_signed) 626bf215546Sopenharmony_ci{ 627bf215546Sopenharmony_ci fs_inst *inst; 628bf215546Sopenharmony_ci fs_reg temp = src; 629bf215546Sopenharmony_ci 630bf215546Sopenharmony_ci if (is_signed) { 631bf215546Sopenharmony_ci /* LZD of an absolute value source almost always does the right 632bf215546Sopenharmony_ci * thing. There are two problem values: 633bf215546Sopenharmony_ci * 634bf215546Sopenharmony_ci * * 0x80000000. Since abs(0x80000000) == 0x80000000, LZD returns 635bf215546Sopenharmony_ci * 0. However, findMSB(int(0x80000000)) == 30. 636bf215546Sopenharmony_ci * 637bf215546Sopenharmony_ci * * 0xffffffff. Since abs(0xffffffff) == 1, LZD returns 638bf215546Sopenharmony_ci * 31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 639bf215546Sopenharmony_ci * 640bf215546Sopenharmony_ci * For a value of zero or negative one, -1 will be returned. 641bf215546Sopenharmony_ci * 642bf215546Sopenharmony_ci * * Negative powers of two. LZD(abs(-(1<<x))) returns x, but 643bf215546Sopenharmony_ci * findMSB(-(1<<x)) should return x-1. 644bf215546Sopenharmony_ci * 645bf215546Sopenharmony_ci * For all negative number cases, including 0x80000000 and 646bf215546Sopenharmony_ci * 0xffffffff, the correct value is obtained from LZD if instead of 647bf215546Sopenharmony_ci * negating the (already negative) value the logical-not is used. A 648bf215546Sopenharmony_ci * conditional logical-not can be achieved in two instructions. 649bf215546Sopenharmony_ci */ 650bf215546Sopenharmony_ci temp = bld.vgrf(BRW_REGISTER_TYPE_D); 651bf215546Sopenharmony_ci 652bf215546Sopenharmony_ci bld.ASR(temp, src, brw_imm_d(31)); 653bf215546Sopenharmony_ci bld.XOR(temp, temp, src); 654bf215546Sopenharmony_ci } 655bf215546Sopenharmony_ci 656bf215546Sopenharmony_ci bld.LZD(retype(result, BRW_REGISTER_TYPE_UD), 657bf215546Sopenharmony_ci retype(temp, BRW_REGISTER_TYPE_UD)); 658bf215546Sopenharmony_ci 659bf215546Sopenharmony_ci /* LZD counts from the MSB side, while GLSL's findMSB() wants the count 660bf215546Sopenharmony_ci * from the LSB side. Subtract the result from 31 to convert the MSB 661bf215546Sopenharmony_ci * count into an LSB count. If no bits are set, LZD will return 32. 662bf215546Sopenharmony_ci * 31-32 = -1, which is exactly what findMSB() is supposed to return. 663bf215546Sopenharmony_ci */ 664bf215546Sopenharmony_ci inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31)); 665bf215546Sopenharmony_ci inst->src[0].negate = true; 666bf215546Sopenharmony_ci} 667bf215546Sopenharmony_ci 668bf215546Sopenharmony_cistatic brw_rnd_mode 669bf215546Sopenharmony_cibrw_rnd_mode_from_nir_op (const nir_op op) { 670bf215546Sopenharmony_ci switch (op) { 671bf215546Sopenharmony_ci case nir_op_f2f16_rtz: 672bf215546Sopenharmony_ci return BRW_RND_MODE_RTZ; 673bf215546Sopenharmony_ci case nir_op_f2f16_rtne: 674bf215546Sopenharmony_ci return BRW_RND_MODE_RTNE; 675bf215546Sopenharmony_ci default: 676bf215546Sopenharmony_ci unreachable("Operation doesn't support rounding mode"); 677bf215546Sopenharmony_ci } 678bf215546Sopenharmony_ci} 679bf215546Sopenharmony_ci 680bf215546Sopenharmony_cistatic brw_rnd_mode 681bf215546Sopenharmony_cibrw_rnd_mode_from_execution_mode(unsigned execution_mode) 682bf215546Sopenharmony_ci{ 683bf215546Sopenharmony_ci if (nir_has_any_rounding_mode_rtne(execution_mode)) 684bf215546Sopenharmony_ci return BRW_RND_MODE_RTNE; 685bf215546Sopenharmony_ci if (nir_has_any_rounding_mode_rtz(execution_mode)) 686bf215546Sopenharmony_ci return BRW_RND_MODE_RTZ; 687bf215546Sopenharmony_ci return BRW_RND_MODE_UNSPECIFIED; 688bf215546Sopenharmony_ci} 689bf215546Sopenharmony_ci 690bf215546Sopenharmony_cifs_reg 691bf215546Sopenharmony_cifs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld, 692bf215546Sopenharmony_ci nir_alu_instr *instr, 693bf215546Sopenharmony_ci fs_reg *op, 694bf215546Sopenharmony_ci bool need_dest) 695bf215546Sopenharmony_ci{ 696bf215546Sopenharmony_ci fs_reg result = 697bf215546Sopenharmony_ci need_dest ? get_nir_dest(instr->dest.dest) : bld.null_reg_ud(); 698bf215546Sopenharmony_ci 699bf215546Sopenharmony_ci result.type = brw_type_for_nir_type(devinfo, 700bf215546Sopenharmony_ci (nir_alu_type)(nir_op_infos[instr->op].output_type | 701bf215546Sopenharmony_ci nir_dest_bit_size(instr->dest.dest))); 702bf215546Sopenharmony_ci 703bf215546Sopenharmony_ci assert(!instr->dest.saturate); 704bf215546Sopenharmony_ci 705bf215546Sopenharmony_ci for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 706bf215546Sopenharmony_ci /* We don't lower to source modifiers so they should not exist. */ 707bf215546Sopenharmony_ci assert(!instr->src[i].abs); 708bf215546Sopenharmony_ci assert(!instr->src[i].negate); 709bf215546Sopenharmony_ci 710bf215546Sopenharmony_ci op[i] = get_nir_src(instr->src[i].src); 711bf215546Sopenharmony_ci op[i].type = brw_type_for_nir_type(devinfo, 712bf215546Sopenharmony_ci (nir_alu_type)(nir_op_infos[instr->op].input_types[i] | 713bf215546Sopenharmony_ci nir_src_bit_size(instr->src[i].src))); 714bf215546Sopenharmony_ci } 715bf215546Sopenharmony_ci 716bf215546Sopenharmony_ci /* Move and vecN instrutions may still be vectored. Return the raw, 717bf215546Sopenharmony_ci * vectored source and destination so that fs_visitor::nir_emit_alu can 718bf215546Sopenharmony_ci * handle it. Other callers should not have to handle these kinds of 719bf215546Sopenharmony_ci * instructions. 720bf215546Sopenharmony_ci */ 721bf215546Sopenharmony_ci switch (instr->op) { 722bf215546Sopenharmony_ci case nir_op_mov: 723bf215546Sopenharmony_ci case nir_op_vec2: 724bf215546Sopenharmony_ci case nir_op_vec3: 725bf215546Sopenharmony_ci case nir_op_vec4: 726bf215546Sopenharmony_ci case nir_op_vec8: 727bf215546Sopenharmony_ci case nir_op_vec16: 728bf215546Sopenharmony_ci return result; 729bf215546Sopenharmony_ci default: 730bf215546Sopenharmony_ci break; 731bf215546Sopenharmony_ci } 732bf215546Sopenharmony_ci 733bf215546Sopenharmony_ci /* At this point, we have dealt with any instruction that operates on 734bf215546Sopenharmony_ci * more than a single channel. Therefore, we can just adjust the source 735bf215546Sopenharmony_ci * and destination registers for that channel and emit the instruction. 736bf215546Sopenharmony_ci */ 737bf215546Sopenharmony_ci unsigned channel = 0; 738bf215546Sopenharmony_ci if (nir_op_infos[instr->op].output_size == 0) { 739bf215546Sopenharmony_ci /* Since NIR is doing the scalarizing for us, we should only ever see 740bf215546Sopenharmony_ci * vectorized operations with a single channel. 741bf215546Sopenharmony_ci */ 742bf215546Sopenharmony_ci assert(util_bitcount(instr->dest.write_mask) == 1); 743bf215546Sopenharmony_ci channel = ffs(instr->dest.write_mask) - 1; 744bf215546Sopenharmony_ci 745bf215546Sopenharmony_ci result = offset(result, bld, channel); 746bf215546Sopenharmony_ci } 747bf215546Sopenharmony_ci 748bf215546Sopenharmony_ci for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 749bf215546Sopenharmony_ci assert(nir_op_infos[instr->op].input_sizes[i] < 2); 750bf215546Sopenharmony_ci op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]); 751bf215546Sopenharmony_ci } 752bf215546Sopenharmony_ci 753bf215546Sopenharmony_ci return result; 754bf215546Sopenharmony_ci} 755bf215546Sopenharmony_ci 756bf215546Sopenharmony_civoid 757bf215546Sopenharmony_cifs_visitor::resolve_inot_sources(const fs_builder &bld, nir_alu_instr *instr, 758bf215546Sopenharmony_ci fs_reg *op) 759bf215546Sopenharmony_ci{ 760bf215546Sopenharmony_ci for (unsigned i = 0; i < 2; i++) { 761bf215546Sopenharmony_ci nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src); 762bf215546Sopenharmony_ci 763bf215546Sopenharmony_ci if (inot_instr != NULL && inot_instr->op == nir_op_inot) { 764bf215546Sopenharmony_ci /* The source of the inot is now the source of instr. */ 765bf215546Sopenharmony_ci prepare_alu_destination_and_sources(bld, inot_instr, &op[i], false); 766bf215546Sopenharmony_ci 767bf215546Sopenharmony_ci assert(!op[i].negate); 768bf215546Sopenharmony_ci op[i].negate = true; 769bf215546Sopenharmony_ci } else { 770bf215546Sopenharmony_ci op[i] = resolve_source_modifiers(op[i]); 771bf215546Sopenharmony_ci } 772bf215546Sopenharmony_ci } 773bf215546Sopenharmony_ci} 774bf215546Sopenharmony_ci 775bf215546Sopenharmony_cibool 776bf215546Sopenharmony_cifs_visitor::try_emit_b2fi_of_inot(const fs_builder &bld, 777bf215546Sopenharmony_ci fs_reg result, 778bf215546Sopenharmony_ci nir_alu_instr *instr) 779bf215546Sopenharmony_ci{ 780bf215546Sopenharmony_ci if (devinfo->ver < 6 || devinfo->verx10 >= 125) 781bf215546Sopenharmony_ci return false; 782bf215546Sopenharmony_ci 783bf215546Sopenharmony_ci nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src); 784bf215546Sopenharmony_ci 785bf215546Sopenharmony_ci if (inot_instr == NULL || inot_instr->op != nir_op_inot) 786bf215546Sopenharmony_ci return false; 787bf215546Sopenharmony_ci 788bf215546Sopenharmony_ci /* HF is also possible as a destination on BDW+. For nir_op_b2i, the set 789bf215546Sopenharmony_ci * of valid size-changing combinations is a bit more complex. 790bf215546Sopenharmony_ci * 791bf215546Sopenharmony_ci * The source restriction is just because I was lazy about generating the 792bf215546Sopenharmony_ci * constant below. 793bf215546Sopenharmony_ci */ 794bf215546Sopenharmony_ci if (nir_dest_bit_size(instr->dest.dest) != 32 || 795bf215546Sopenharmony_ci nir_src_bit_size(inot_instr->src[0].src) != 32) 796bf215546Sopenharmony_ci return false; 797bf215546Sopenharmony_ci 798bf215546Sopenharmony_ci /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0. Since a can only be 0 or -1, 799bf215546Sopenharmony_ci * this is float(1 + a). 800bf215546Sopenharmony_ci */ 801bf215546Sopenharmony_ci fs_reg op; 802bf215546Sopenharmony_ci 803bf215546Sopenharmony_ci prepare_alu_destination_and_sources(bld, inot_instr, &op, false); 804bf215546Sopenharmony_ci 805bf215546Sopenharmony_ci /* Ignore the saturate modifier, if there is one. The result of the 806bf215546Sopenharmony_ci * arithmetic can only be 0 or 1, so the clamping will do nothing anyway. 807bf215546Sopenharmony_ci */ 808bf215546Sopenharmony_ci bld.ADD(result, op, brw_imm_d(1)); 809bf215546Sopenharmony_ci 810bf215546Sopenharmony_ci return true; 811bf215546Sopenharmony_ci} 812bf215546Sopenharmony_ci 813bf215546Sopenharmony_ci/** 814bf215546Sopenharmony_ci * Emit code for nir_op_fsign possibly fused with a nir_op_fmul 815bf215546Sopenharmony_ci * 816bf215546Sopenharmony_ci * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of 817bf215546Sopenharmony_ci * the source of \c instr that is a \c nir_op_fsign. 818bf215546Sopenharmony_ci */ 819bf215546Sopenharmony_civoid 820bf215546Sopenharmony_cifs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr, 821bf215546Sopenharmony_ci fs_reg result, fs_reg *op, unsigned fsign_src) 822bf215546Sopenharmony_ci{ 823bf215546Sopenharmony_ci fs_inst *inst; 824bf215546Sopenharmony_ci 825bf215546Sopenharmony_ci assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul); 826bf215546Sopenharmony_ci assert(fsign_src < nir_op_infos[instr->op].num_inputs); 827bf215546Sopenharmony_ci 828bf215546Sopenharmony_ci if (instr->op != nir_op_fsign) { 829bf215546Sopenharmony_ci const nir_alu_instr *const fsign_instr = 830bf215546Sopenharmony_ci nir_src_as_alu_instr(instr->src[fsign_src].src); 831bf215546Sopenharmony_ci 832bf215546Sopenharmony_ci /* op[fsign_src] has the nominal result of the fsign, and op[1 - 833bf215546Sopenharmony_ci * fsign_src] has the other multiply source. This must be rearranged so 834bf215546Sopenharmony_ci * that op[0] is the source of the fsign op[1] is the other multiply 835bf215546Sopenharmony_ci * source. 836bf215546Sopenharmony_ci */ 837bf215546Sopenharmony_ci if (fsign_src != 0) 838bf215546Sopenharmony_ci op[1] = op[0]; 839bf215546Sopenharmony_ci 840bf215546Sopenharmony_ci op[0] = get_nir_src(fsign_instr->src[0].src); 841bf215546Sopenharmony_ci 842bf215546Sopenharmony_ci const nir_alu_type t = 843bf215546Sopenharmony_ci (nir_alu_type)(nir_op_infos[instr->op].input_types[0] | 844bf215546Sopenharmony_ci nir_src_bit_size(fsign_instr->src[0].src)); 845bf215546Sopenharmony_ci 846bf215546Sopenharmony_ci op[0].type = brw_type_for_nir_type(devinfo, t); 847bf215546Sopenharmony_ci 848bf215546Sopenharmony_ci unsigned channel = 0; 849bf215546Sopenharmony_ci if (nir_op_infos[instr->op].output_size == 0) { 850bf215546Sopenharmony_ci /* Since NIR is doing the scalarizing for us, we should only ever see 851bf215546Sopenharmony_ci * vectorized operations with a single channel. 852bf215546Sopenharmony_ci */ 853bf215546Sopenharmony_ci assert(util_bitcount(instr->dest.write_mask) == 1); 854bf215546Sopenharmony_ci channel = ffs(instr->dest.write_mask) - 1; 855bf215546Sopenharmony_ci } 856bf215546Sopenharmony_ci 857bf215546Sopenharmony_ci op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]); 858bf215546Sopenharmony_ci } 859bf215546Sopenharmony_ci 860bf215546Sopenharmony_ci if (type_sz(op[0].type) == 2) { 861bf215546Sopenharmony_ci /* AND(val, 0x8000) gives the sign bit. 862bf215546Sopenharmony_ci * 863bf215546Sopenharmony_ci * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero. 864bf215546Sopenharmony_ci */ 865bf215546Sopenharmony_ci fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF); 866bf215546Sopenharmony_ci bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ); 867bf215546Sopenharmony_ci 868bf215546Sopenharmony_ci op[0].type = BRW_REGISTER_TYPE_UW; 869bf215546Sopenharmony_ci result.type = BRW_REGISTER_TYPE_UW; 870bf215546Sopenharmony_ci bld.AND(result, op[0], brw_imm_uw(0x8000u)); 871bf215546Sopenharmony_ci 872bf215546Sopenharmony_ci if (instr->op == nir_op_fsign) 873bf215546Sopenharmony_ci inst = bld.OR(result, result, brw_imm_uw(0x3c00u)); 874bf215546Sopenharmony_ci else { 875bf215546Sopenharmony_ci /* Use XOR here to get the result sign correct. */ 876bf215546Sopenharmony_ci inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UW)); 877bf215546Sopenharmony_ci } 878bf215546Sopenharmony_ci 879bf215546Sopenharmony_ci inst->predicate = BRW_PREDICATE_NORMAL; 880bf215546Sopenharmony_ci } else if (type_sz(op[0].type) == 4) { 881bf215546Sopenharmony_ci /* AND(val, 0x80000000) gives the sign bit. 882bf215546Sopenharmony_ci * 883bf215546Sopenharmony_ci * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not 884bf215546Sopenharmony_ci * zero. 885bf215546Sopenharmony_ci */ 886bf215546Sopenharmony_ci bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ); 887bf215546Sopenharmony_ci 888bf215546Sopenharmony_ci op[0].type = BRW_REGISTER_TYPE_UD; 889bf215546Sopenharmony_ci result.type = BRW_REGISTER_TYPE_UD; 890bf215546Sopenharmony_ci bld.AND(result, op[0], brw_imm_ud(0x80000000u)); 891bf215546Sopenharmony_ci 892bf215546Sopenharmony_ci if (instr->op == nir_op_fsign) 893bf215546Sopenharmony_ci inst = bld.OR(result, result, brw_imm_ud(0x3f800000u)); 894bf215546Sopenharmony_ci else { 895bf215546Sopenharmony_ci /* Use XOR here to get the result sign correct. */ 896bf215546Sopenharmony_ci inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UD)); 897bf215546Sopenharmony_ci } 898bf215546Sopenharmony_ci 899bf215546Sopenharmony_ci inst->predicate = BRW_PREDICATE_NORMAL; 900bf215546Sopenharmony_ci } else { 901bf215546Sopenharmony_ci /* For doubles we do the same but we need to consider: 902bf215546Sopenharmony_ci * 903bf215546Sopenharmony_ci * - 2-src instructions can't operate with 64-bit immediates 904bf215546Sopenharmony_ci * - The sign is encoded in the high 32-bit of each DF 905bf215546Sopenharmony_ci * - We need to produce a DF result. 906bf215546Sopenharmony_ci */ 907bf215546Sopenharmony_ci 908bf215546Sopenharmony_ci fs_reg zero = vgrf(glsl_type::double_type); 909bf215546Sopenharmony_ci bld.MOV(zero, setup_imm_df(bld, 0.0)); 910bf215546Sopenharmony_ci bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ); 911bf215546Sopenharmony_ci 912bf215546Sopenharmony_ci bld.MOV(result, zero); 913bf215546Sopenharmony_ci 914bf215546Sopenharmony_ci fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1); 915bf215546Sopenharmony_ci bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1), 916bf215546Sopenharmony_ci brw_imm_ud(0x80000000u)); 917bf215546Sopenharmony_ci 918bf215546Sopenharmony_ci if (instr->op == nir_op_fsign) { 919bf215546Sopenharmony_ci set_predicate(BRW_PREDICATE_NORMAL, 920bf215546Sopenharmony_ci bld.OR(r, r, brw_imm_ud(0x3ff00000u))); 921bf215546Sopenharmony_ci } else { 922bf215546Sopenharmony_ci /* This could be done better in some cases. If the scale is an 923bf215546Sopenharmony_ci * immediate with the low 32-bits all 0, emitting a separate XOR and 924bf215546Sopenharmony_ci * OR would allow an algebraic optimization to remove the OR. There 925bf215546Sopenharmony_ci * are currently zero instances of fsign(double(x))*IMM in shader-db 926bf215546Sopenharmony_ci * or any test suite, so it is hard to care at this time. 927bf215546Sopenharmony_ci */ 928bf215546Sopenharmony_ci fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ); 929bf215546Sopenharmony_ci inst = bld.XOR(result_int64, result_int64, 930bf215546Sopenharmony_ci retype(op[1], BRW_REGISTER_TYPE_UQ)); 931bf215546Sopenharmony_ci } 932bf215546Sopenharmony_ci } 933bf215546Sopenharmony_ci} 934bf215546Sopenharmony_ci 935bf215546Sopenharmony_ci/** 936bf215546Sopenharmony_ci * Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign 937bf215546Sopenharmony_ci * 938bf215546Sopenharmony_ci * Checks the operands of a \c nir_op_fmul to determine whether or not 939bf215546Sopenharmony_ci * \c emit_fsign could fuse the multiplication with the \c sign() calculation. 940bf215546Sopenharmony_ci * 941bf215546Sopenharmony_ci * \param instr The multiplication instruction 942bf215546Sopenharmony_ci * 943bf215546Sopenharmony_ci * \param fsign_src The source of \c instr that may or may not be a 944bf215546Sopenharmony_ci * \c nir_op_fsign 945bf215546Sopenharmony_ci */ 946bf215546Sopenharmony_cistatic bool 947bf215546Sopenharmony_cican_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src) 948bf215546Sopenharmony_ci{ 949bf215546Sopenharmony_ci assert(instr->op == nir_op_fmul); 950bf215546Sopenharmony_ci 951bf215546Sopenharmony_ci nir_alu_instr *const fsign_instr = 952bf215546Sopenharmony_ci nir_src_as_alu_instr(instr->src[fsign_src].src); 953bf215546Sopenharmony_ci 954bf215546Sopenharmony_ci /* Rules: 955bf215546Sopenharmony_ci * 956bf215546Sopenharmony_ci * 1. instr->src[fsign_src] must be a nir_op_fsign. 957bf215546Sopenharmony_ci * 2. The nir_op_fsign can only be used by this multiplication. 958bf215546Sopenharmony_ci * 3. The source that is the nir_op_fsign does not have source modifiers. 959bf215546Sopenharmony_ci * \c emit_fsign only examines the source modifiers of the source of the 960bf215546Sopenharmony_ci * \c nir_op_fsign. 961bf215546Sopenharmony_ci * 962bf215546Sopenharmony_ci * The nir_op_fsign must also not have the saturate modifier, but steps 963bf215546Sopenharmony_ci * have already been taken (in nir_opt_algebraic) to ensure that. 964bf215546Sopenharmony_ci */ 965bf215546Sopenharmony_ci return fsign_instr != NULL && fsign_instr->op == nir_op_fsign && 966bf215546Sopenharmony_ci is_used_once(fsign_instr); 967bf215546Sopenharmony_ci} 968bf215546Sopenharmony_ci 969bf215546Sopenharmony_civoid 970bf215546Sopenharmony_cifs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, 971bf215546Sopenharmony_ci bool need_dest) 972bf215546Sopenharmony_ci{ 973bf215546Sopenharmony_ci fs_inst *inst; 974bf215546Sopenharmony_ci unsigned execution_mode = 975bf215546Sopenharmony_ci bld.shader->nir->info.float_controls_execution_mode; 976bf215546Sopenharmony_ci 977bf215546Sopenharmony_ci fs_reg op[NIR_MAX_VEC_COMPONENTS]; 978bf215546Sopenharmony_ci fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, need_dest); 979bf215546Sopenharmony_ci 980bf215546Sopenharmony_ci#ifndef NDEBUG 981bf215546Sopenharmony_ci /* Everything except raw moves, some type conversions, iabs, and ineg 982bf215546Sopenharmony_ci * should have 8-bit sources lowered by nir_lower_bit_size in 983bf215546Sopenharmony_ci * brw_preprocess_nir or by brw_nir_lower_conversions in 984bf215546Sopenharmony_ci * brw_postprocess_nir. 985bf215546Sopenharmony_ci */ 986bf215546Sopenharmony_ci switch (instr->op) { 987bf215546Sopenharmony_ci case nir_op_mov: 988bf215546Sopenharmony_ci case nir_op_vec2: 989bf215546Sopenharmony_ci case nir_op_vec3: 990bf215546Sopenharmony_ci case nir_op_vec4: 991bf215546Sopenharmony_ci case nir_op_vec8: 992bf215546Sopenharmony_ci case nir_op_vec16: 993bf215546Sopenharmony_ci case nir_op_i2f16: 994bf215546Sopenharmony_ci case nir_op_i2f32: 995bf215546Sopenharmony_ci case nir_op_i2i16: 996bf215546Sopenharmony_ci case nir_op_i2i32: 997bf215546Sopenharmony_ci case nir_op_u2f16: 998bf215546Sopenharmony_ci case nir_op_u2f32: 999bf215546Sopenharmony_ci case nir_op_u2u16: 1000bf215546Sopenharmony_ci case nir_op_u2u32: 1001bf215546Sopenharmony_ci case nir_op_iabs: 1002bf215546Sopenharmony_ci case nir_op_ineg: 1003bf215546Sopenharmony_ci case nir_op_pack_32_4x8_split: 1004bf215546Sopenharmony_ci break; 1005bf215546Sopenharmony_ci 1006bf215546Sopenharmony_ci default: 1007bf215546Sopenharmony_ci for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 1008bf215546Sopenharmony_ci assert(type_sz(op[i].type) > 1); 1009bf215546Sopenharmony_ci } 1010bf215546Sopenharmony_ci } 1011bf215546Sopenharmony_ci#endif 1012bf215546Sopenharmony_ci 1013bf215546Sopenharmony_ci switch (instr->op) { 1014bf215546Sopenharmony_ci case nir_op_mov: 1015bf215546Sopenharmony_ci case nir_op_vec2: 1016bf215546Sopenharmony_ci case nir_op_vec3: 1017bf215546Sopenharmony_ci case nir_op_vec4: 1018bf215546Sopenharmony_ci case nir_op_vec8: 1019bf215546Sopenharmony_ci case nir_op_vec16: { 1020bf215546Sopenharmony_ci fs_reg temp = result; 1021bf215546Sopenharmony_ci bool need_extra_copy = false; 1022bf215546Sopenharmony_ci for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 1023bf215546Sopenharmony_ci if (!instr->src[i].src.is_ssa && 1024bf215546Sopenharmony_ci instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) { 1025bf215546Sopenharmony_ci need_extra_copy = true; 1026bf215546Sopenharmony_ci temp = bld.vgrf(result.type, 4); 1027bf215546Sopenharmony_ci break; 1028bf215546Sopenharmony_ci } 1029bf215546Sopenharmony_ci } 1030bf215546Sopenharmony_ci 1031bf215546Sopenharmony_ci for (unsigned i = 0; i < 4; i++) { 1032bf215546Sopenharmony_ci if (!(instr->dest.write_mask & (1 << i))) 1033bf215546Sopenharmony_ci continue; 1034bf215546Sopenharmony_ci 1035bf215546Sopenharmony_ci if (instr->op == nir_op_mov) { 1036bf215546Sopenharmony_ci bld.MOV(offset(temp, bld, i), 1037bf215546Sopenharmony_ci offset(op[0], bld, instr->src[0].swizzle[i])); 1038bf215546Sopenharmony_ci } else { 1039bf215546Sopenharmony_ci bld.MOV(offset(temp, bld, i), 1040bf215546Sopenharmony_ci offset(op[i], bld, instr->src[i].swizzle[0])); 1041bf215546Sopenharmony_ci } 1042bf215546Sopenharmony_ci } 1043bf215546Sopenharmony_ci 1044bf215546Sopenharmony_ci /* In this case the source and destination registers were the same, 1045bf215546Sopenharmony_ci * so we need to insert an extra set of moves in order to deal with 1046bf215546Sopenharmony_ci * any swizzling. 1047bf215546Sopenharmony_ci */ 1048bf215546Sopenharmony_ci if (need_extra_copy) { 1049bf215546Sopenharmony_ci for (unsigned i = 0; i < 4; i++) { 1050bf215546Sopenharmony_ci if (!(instr->dest.write_mask & (1 << i))) 1051bf215546Sopenharmony_ci continue; 1052bf215546Sopenharmony_ci 1053bf215546Sopenharmony_ci bld.MOV(offset(result, bld, i), offset(temp, bld, i)); 1054bf215546Sopenharmony_ci } 1055bf215546Sopenharmony_ci } 1056bf215546Sopenharmony_ci return; 1057bf215546Sopenharmony_ci } 1058bf215546Sopenharmony_ci 1059bf215546Sopenharmony_ci case nir_op_i2f32: 1060bf215546Sopenharmony_ci case nir_op_u2f32: 1061bf215546Sopenharmony_ci if (optimize_extract_to_float(instr, result)) 1062bf215546Sopenharmony_ci return; 1063bf215546Sopenharmony_ci inst = bld.MOV(result, op[0]); 1064bf215546Sopenharmony_ci break; 1065bf215546Sopenharmony_ci 1066bf215546Sopenharmony_ci case nir_op_f2f16_rtne: 1067bf215546Sopenharmony_ci case nir_op_f2f16_rtz: 1068bf215546Sopenharmony_ci case nir_op_f2f16: { 1069bf215546Sopenharmony_ci brw_rnd_mode rnd = BRW_RND_MODE_UNSPECIFIED; 1070bf215546Sopenharmony_ci 1071bf215546Sopenharmony_ci if (nir_op_f2f16 == instr->op) 1072bf215546Sopenharmony_ci rnd = brw_rnd_mode_from_execution_mode(execution_mode); 1073bf215546Sopenharmony_ci else 1074bf215546Sopenharmony_ci rnd = brw_rnd_mode_from_nir_op(instr->op); 1075bf215546Sopenharmony_ci 1076bf215546Sopenharmony_ci if (BRW_RND_MODE_UNSPECIFIED != rnd) 1077bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), brw_imm_d(rnd)); 1078bf215546Sopenharmony_ci 1079bf215546Sopenharmony_ci /* In theory, it would be better to use BRW_OPCODE_F32TO16. Depending 1080bf215546Sopenharmony_ci * on the HW gen, it is a special hw opcode or just a MOV, and 1081bf215546Sopenharmony_ci * brw_F32TO16 (at brw_eu_emit) would do the work to chose. 1082bf215546Sopenharmony_ci * 1083bf215546Sopenharmony_ci * But if we want to use that opcode, we need to provide support on 1084bf215546Sopenharmony_ci * different optimizations and lowerings. As right now HF support is 1085bf215546Sopenharmony_ci * only for gfx8+, it will be better to use directly the MOV, and use 1086bf215546Sopenharmony_ci * BRW_OPCODE_F32TO16 when/if we work for HF support on gfx7. 1087bf215546Sopenharmony_ci */ 1088bf215546Sopenharmony_ci assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ 1089bf215546Sopenharmony_ci inst = bld.MOV(result, op[0]); 1090bf215546Sopenharmony_ci break; 1091bf215546Sopenharmony_ci } 1092bf215546Sopenharmony_ci 1093bf215546Sopenharmony_ci case nir_op_b2i8: 1094bf215546Sopenharmony_ci case nir_op_b2i16: 1095bf215546Sopenharmony_ci case nir_op_b2i32: 1096bf215546Sopenharmony_ci case nir_op_b2i64: 1097bf215546Sopenharmony_ci case nir_op_b2f16: 1098bf215546Sopenharmony_ci case nir_op_b2f32: 1099bf215546Sopenharmony_ci case nir_op_b2f64: 1100bf215546Sopenharmony_ci if (try_emit_b2fi_of_inot(bld, result, instr)) 1101bf215546Sopenharmony_ci break; 1102bf215546Sopenharmony_ci op[0].type = BRW_REGISTER_TYPE_D; 1103bf215546Sopenharmony_ci op[0].negate = !op[0].negate; 1104bf215546Sopenharmony_ci FALLTHROUGH; 1105bf215546Sopenharmony_ci case nir_op_i2f64: 1106bf215546Sopenharmony_ci case nir_op_i2i64: 1107bf215546Sopenharmony_ci case nir_op_u2f64: 1108bf215546Sopenharmony_ci case nir_op_u2u64: 1109bf215546Sopenharmony_ci case nir_op_f2f64: 1110bf215546Sopenharmony_ci case nir_op_f2i64: 1111bf215546Sopenharmony_ci case nir_op_f2u64: 1112bf215546Sopenharmony_ci case nir_op_i2i32: 1113bf215546Sopenharmony_ci case nir_op_u2u32: 1114bf215546Sopenharmony_ci case nir_op_f2i32: 1115bf215546Sopenharmony_ci case nir_op_f2u32: 1116bf215546Sopenharmony_ci case nir_op_i2f16: 1117bf215546Sopenharmony_ci case nir_op_u2f16: 1118bf215546Sopenharmony_ci case nir_op_f2i16: 1119bf215546Sopenharmony_ci case nir_op_f2u16: 1120bf215546Sopenharmony_ci case nir_op_f2i8: 1121bf215546Sopenharmony_ci case nir_op_f2u8: 1122bf215546Sopenharmony_ci if (result.type == BRW_REGISTER_TYPE_B || 1123bf215546Sopenharmony_ci result.type == BRW_REGISTER_TYPE_UB || 1124bf215546Sopenharmony_ci result.type == BRW_REGISTER_TYPE_HF) 1125bf215546Sopenharmony_ci assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ 1126bf215546Sopenharmony_ci 1127bf215546Sopenharmony_ci if (op[0].type == BRW_REGISTER_TYPE_B || 1128bf215546Sopenharmony_ci op[0].type == BRW_REGISTER_TYPE_UB || 1129bf215546Sopenharmony_ci op[0].type == BRW_REGISTER_TYPE_HF) 1130bf215546Sopenharmony_ci assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */ 1131bf215546Sopenharmony_ci 1132bf215546Sopenharmony_ci inst = bld.MOV(result, op[0]); 1133bf215546Sopenharmony_ci break; 1134bf215546Sopenharmony_ci 1135bf215546Sopenharmony_ci case nir_op_i2i8: 1136bf215546Sopenharmony_ci case nir_op_u2u8: 1137bf215546Sopenharmony_ci assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ 1138bf215546Sopenharmony_ci FALLTHROUGH; 1139bf215546Sopenharmony_ci case nir_op_i2i16: 1140bf215546Sopenharmony_ci case nir_op_u2u16: { 1141bf215546Sopenharmony_ci /* Emit better code for u2u8(extract_u8(a, b)) and similar patterns. 1142bf215546Sopenharmony_ci * Emitting the instructions one by one results in two MOV instructions 1143bf215546Sopenharmony_ci * that won't be propagated. By handling both instructions here, a 1144bf215546Sopenharmony_ci * single MOV is emitted. 1145bf215546Sopenharmony_ci */ 1146bf215546Sopenharmony_ci nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src); 1147bf215546Sopenharmony_ci if (extract_instr != NULL) { 1148bf215546Sopenharmony_ci if (extract_instr->op == nir_op_extract_u8 || 1149bf215546Sopenharmony_ci extract_instr->op == nir_op_extract_i8) { 1150bf215546Sopenharmony_ci prepare_alu_destination_and_sources(bld, extract_instr, op, false); 1151bf215546Sopenharmony_ci 1152bf215546Sopenharmony_ci const unsigned byte = nir_src_as_uint(extract_instr->src[1].src); 1153bf215546Sopenharmony_ci const brw_reg_type type = 1154bf215546Sopenharmony_ci brw_int_type(1, extract_instr->op == nir_op_extract_i8); 1155bf215546Sopenharmony_ci 1156bf215546Sopenharmony_ci op[0] = subscript(op[0], type, byte); 1157bf215546Sopenharmony_ci } else if (extract_instr->op == nir_op_extract_u16 || 1158bf215546Sopenharmony_ci extract_instr->op == nir_op_extract_i16) { 1159bf215546Sopenharmony_ci prepare_alu_destination_and_sources(bld, extract_instr, op, false); 1160bf215546Sopenharmony_ci 1161bf215546Sopenharmony_ci const unsigned word = nir_src_as_uint(extract_instr->src[1].src); 1162bf215546Sopenharmony_ci const brw_reg_type type = 1163bf215546Sopenharmony_ci brw_int_type(2, extract_instr->op == nir_op_extract_i16); 1164bf215546Sopenharmony_ci 1165bf215546Sopenharmony_ci op[0] = subscript(op[0], type, word); 1166bf215546Sopenharmony_ci } 1167bf215546Sopenharmony_ci } 1168bf215546Sopenharmony_ci 1169bf215546Sopenharmony_ci inst = bld.MOV(result, op[0]); 1170bf215546Sopenharmony_ci break; 1171bf215546Sopenharmony_ci } 1172bf215546Sopenharmony_ci 1173bf215546Sopenharmony_ci case nir_op_fsat: 1174bf215546Sopenharmony_ci inst = bld.MOV(result, op[0]); 1175bf215546Sopenharmony_ci inst->saturate = true; 1176bf215546Sopenharmony_ci break; 1177bf215546Sopenharmony_ci 1178bf215546Sopenharmony_ci case nir_op_fneg: 1179bf215546Sopenharmony_ci case nir_op_ineg: 1180bf215546Sopenharmony_ci op[0].negate = true; 1181bf215546Sopenharmony_ci inst = bld.MOV(result, op[0]); 1182bf215546Sopenharmony_ci break; 1183bf215546Sopenharmony_ci 1184bf215546Sopenharmony_ci case nir_op_fabs: 1185bf215546Sopenharmony_ci case nir_op_iabs: 1186bf215546Sopenharmony_ci op[0].negate = false; 1187bf215546Sopenharmony_ci op[0].abs = true; 1188bf215546Sopenharmony_ci inst = bld.MOV(result, op[0]); 1189bf215546Sopenharmony_ci break; 1190bf215546Sopenharmony_ci 1191bf215546Sopenharmony_ci case nir_op_f2f32: 1192bf215546Sopenharmony_ci if (nir_has_any_rounding_mode_enabled(execution_mode)) { 1193bf215546Sopenharmony_ci brw_rnd_mode rnd = 1194bf215546Sopenharmony_ci brw_rnd_mode_from_execution_mode(execution_mode); 1195bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), 1196bf215546Sopenharmony_ci brw_imm_d(rnd)); 1197bf215546Sopenharmony_ci } 1198bf215546Sopenharmony_ci 1199bf215546Sopenharmony_ci if (op[0].type == BRW_REGISTER_TYPE_HF) 1200bf215546Sopenharmony_ci assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */ 1201bf215546Sopenharmony_ci 1202bf215546Sopenharmony_ci inst = bld.MOV(result, op[0]); 1203bf215546Sopenharmony_ci break; 1204bf215546Sopenharmony_ci 1205bf215546Sopenharmony_ci case nir_op_fsign: 1206bf215546Sopenharmony_ci emit_fsign(bld, instr, result, op, 0); 1207bf215546Sopenharmony_ci break; 1208bf215546Sopenharmony_ci 1209bf215546Sopenharmony_ci case nir_op_frcp: 1210bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]); 1211bf215546Sopenharmony_ci break; 1212bf215546Sopenharmony_ci 1213bf215546Sopenharmony_ci case nir_op_fexp2: 1214bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]); 1215bf215546Sopenharmony_ci break; 1216bf215546Sopenharmony_ci 1217bf215546Sopenharmony_ci case nir_op_flog2: 1218bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]); 1219bf215546Sopenharmony_ci break; 1220bf215546Sopenharmony_ci 1221bf215546Sopenharmony_ci case nir_op_fsin: 1222bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]); 1223bf215546Sopenharmony_ci break; 1224bf215546Sopenharmony_ci 1225bf215546Sopenharmony_ci case nir_op_fcos: 1226bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_COS, result, op[0]); 1227bf215546Sopenharmony_ci break; 1228bf215546Sopenharmony_ci 1229bf215546Sopenharmony_ci case nir_op_fddx_fine: 1230bf215546Sopenharmony_ci inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]); 1231bf215546Sopenharmony_ci break; 1232bf215546Sopenharmony_ci case nir_op_fddx: 1233bf215546Sopenharmony_ci case nir_op_fddx_coarse: 1234bf215546Sopenharmony_ci inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]); 1235bf215546Sopenharmony_ci break; 1236bf215546Sopenharmony_ci case nir_op_fddy_fine: 1237bf215546Sopenharmony_ci inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]); 1238bf215546Sopenharmony_ci break; 1239bf215546Sopenharmony_ci case nir_op_fddy: 1240bf215546Sopenharmony_ci case nir_op_fddy_coarse: 1241bf215546Sopenharmony_ci inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]); 1242bf215546Sopenharmony_ci break; 1243bf215546Sopenharmony_ci 1244bf215546Sopenharmony_ci case nir_op_fadd: 1245bf215546Sopenharmony_ci if (nir_has_any_rounding_mode_enabled(execution_mode)) { 1246bf215546Sopenharmony_ci brw_rnd_mode rnd = 1247bf215546Sopenharmony_ci brw_rnd_mode_from_execution_mode(execution_mode); 1248bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), 1249bf215546Sopenharmony_ci brw_imm_d(rnd)); 1250bf215546Sopenharmony_ci } 1251bf215546Sopenharmony_ci FALLTHROUGH; 1252bf215546Sopenharmony_ci case nir_op_iadd: 1253bf215546Sopenharmony_ci inst = bld.ADD(result, op[0], op[1]); 1254bf215546Sopenharmony_ci break; 1255bf215546Sopenharmony_ci 1256bf215546Sopenharmony_ci case nir_op_iadd3: 1257bf215546Sopenharmony_ci inst = bld.ADD3(result, op[0], op[1], op[2]); 1258bf215546Sopenharmony_ci break; 1259bf215546Sopenharmony_ci 1260bf215546Sopenharmony_ci case nir_op_iadd_sat: 1261bf215546Sopenharmony_ci case nir_op_uadd_sat: 1262bf215546Sopenharmony_ci inst = bld.ADD(result, op[0], op[1]); 1263bf215546Sopenharmony_ci inst->saturate = true; 1264bf215546Sopenharmony_ci break; 1265bf215546Sopenharmony_ci 1266bf215546Sopenharmony_ci case nir_op_isub_sat: 1267bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]); 1268bf215546Sopenharmony_ci break; 1269bf215546Sopenharmony_ci 1270bf215546Sopenharmony_ci case nir_op_usub_sat: 1271bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_USUB_SAT, result, op[0], op[1]); 1272bf215546Sopenharmony_ci break; 1273bf215546Sopenharmony_ci 1274bf215546Sopenharmony_ci case nir_op_irhadd: 1275bf215546Sopenharmony_ci case nir_op_urhadd: 1276bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest.dest) < 64); 1277bf215546Sopenharmony_ci inst = bld.AVG(result, op[0], op[1]); 1278bf215546Sopenharmony_ci break; 1279bf215546Sopenharmony_ci 1280bf215546Sopenharmony_ci case nir_op_ihadd: 1281bf215546Sopenharmony_ci case nir_op_uhadd: { 1282bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest.dest) < 64); 1283bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(result.type); 1284bf215546Sopenharmony_ci 1285bf215546Sopenharmony_ci if (devinfo->ver >= 8) { 1286bf215546Sopenharmony_ci op[0] = resolve_source_modifiers(op[0]); 1287bf215546Sopenharmony_ci op[1] = resolve_source_modifiers(op[1]); 1288bf215546Sopenharmony_ci } 1289bf215546Sopenharmony_ci 1290bf215546Sopenharmony_ci /* AVG(x, y) - ((x ^ y) & 1) */ 1291bf215546Sopenharmony_ci bld.XOR(tmp, op[0], op[1]); 1292bf215546Sopenharmony_ci bld.AND(tmp, tmp, retype(brw_imm_ud(1), result.type)); 1293bf215546Sopenharmony_ci bld.AVG(result, op[0], op[1]); 1294bf215546Sopenharmony_ci inst = bld.ADD(result, result, tmp); 1295bf215546Sopenharmony_ci inst->src[1].negate = true; 1296bf215546Sopenharmony_ci break; 1297bf215546Sopenharmony_ci } 1298bf215546Sopenharmony_ci 1299bf215546Sopenharmony_ci case nir_op_fmul: 1300bf215546Sopenharmony_ci for (unsigned i = 0; i < 2; i++) { 1301bf215546Sopenharmony_ci if (can_fuse_fmul_fsign(instr, i)) { 1302bf215546Sopenharmony_ci emit_fsign(bld, instr, result, op, i); 1303bf215546Sopenharmony_ci return; 1304bf215546Sopenharmony_ci } 1305bf215546Sopenharmony_ci } 1306bf215546Sopenharmony_ci 1307bf215546Sopenharmony_ci /* We emit the rounding mode after the previous fsign optimization since 1308bf215546Sopenharmony_ci * it won't result in a MUL, but will try to negate the value by other 1309bf215546Sopenharmony_ci * means. 1310bf215546Sopenharmony_ci */ 1311bf215546Sopenharmony_ci if (nir_has_any_rounding_mode_enabled(execution_mode)) { 1312bf215546Sopenharmony_ci brw_rnd_mode rnd = 1313bf215546Sopenharmony_ci brw_rnd_mode_from_execution_mode(execution_mode); 1314bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), 1315bf215546Sopenharmony_ci brw_imm_d(rnd)); 1316bf215546Sopenharmony_ci } 1317bf215546Sopenharmony_ci 1318bf215546Sopenharmony_ci inst = bld.MUL(result, op[0], op[1]); 1319bf215546Sopenharmony_ci break; 1320bf215546Sopenharmony_ci 1321bf215546Sopenharmony_ci case nir_op_imul_2x32_64: 1322bf215546Sopenharmony_ci case nir_op_umul_2x32_64: 1323bf215546Sopenharmony_ci bld.MUL(result, op[0], op[1]); 1324bf215546Sopenharmony_ci break; 1325bf215546Sopenharmony_ci 1326bf215546Sopenharmony_ci case nir_op_imul_32x16: 1327bf215546Sopenharmony_ci case nir_op_umul_32x16: { 1328bf215546Sopenharmony_ci const bool ud = instr->op == nir_op_umul_32x16; 1329bf215546Sopenharmony_ci 1330bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest.dest) == 32); 1331bf215546Sopenharmony_ci 1332bf215546Sopenharmony_ci /* Before Gfx7, the order of the 32-bit source and the 16-bit source was 1333bf215546Sopenharmony_ci * swapped. The extension isn't enabled on those platforms, so don't 1334bf215546Sopenharmony_ci * pretend to support the differences. 1335bf215546Sopenharmony_ci */ 1336bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 1337bf215546Sopenharmony_ci 1338bf215546Sopenharmony_ci if (op[1].file == IMM) 1339bf215546Sopenharmony_ci op[1] = ud ? brw_imm_uw(op[1].ud) : brw_imm_w(op[1].d); 1340bf215546Sopenharmony_ci else { 1341bf215546Sopenharmony_ci const enum brw_reg_type word_type = 1342bf215546Sopenharmony_ci ud ? BRW_REGISTER_TYPE_UW : BRW_REGISTER_TYPE_W; 1343bf215546Sopenharmony_ci 1344bf215546Sopenharmony_ci op[1] = subscript(op[1], word_type, 0); 1345bf215546Sopenharmony_ci } 1346bf215546Sopenharmony_ci 1347bf215546Sopenharmony_ci const enum brw_reg_type dword_type = 1348bf215546Sopenharmony_ci ud ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D; 1349bf215546Sopenharmony_ci 1350bf215546Sopenharmony_ci bld.MUL(result, retype(op[0], dword_type), op[1]); 1351bf215546Sopenharmony_ci break; 1352bf215546Sopenharmony_ci } 1353bf215546Sopenharmony_ci 1354bf215546Sopenharmony_ci case nir_op_imul: 1355bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest.dest) < 64); 1356bf215546Sopenharmony_ci bld.MUL(result, op[0], op[1]); 1357bf215546Sopenharmony_ci break; 1358bf215546Sopenharmony_ci 1359bf215546Sopenharmony_ci case nir_op_imul_high: 1360bf215546Sopenharmony_ci case nir_op_umul_high: 1361bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest.dest) < 64); 1362bf215546Sopenharmony_ci if (nir_dest_bit_size(instr->dest.dest) == 32) { 1363bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]); 1364bf215546Sopenharmony_ci } else { 1365bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(brw_reg_type_from_bit_size(32, op[0].type)); 1366bf215546Sopenharmony_ci bld.MUL(tmp, op[0], op[1]); 1367bf215546Sopenharmony_ci bld.MOV(result, subscript(tmp, result.type, 1)); 1368bf215546Sopenharmony_ci } 1369bf215546Sopenharmony_ci break; 1370bf215546Sopenharmony_ci 1371bf215546Sopenharmony_ci case nir_op_idiv: 1372bf215546Sopenharmony_ci case nir_op_udiv: 1373bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest.dest) < 64); 1374bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]); 1375bf215546Sopenharmony_ci break; 1376bf215546Sopenharmony_ci 1377bf215546Sopenharmony_ci case nir_op_uadd_carry: 1378bf215546Sopenharmony_ci unreachable("Should have been lowered by carry_to_arith()."); 1379bf215546Sopenharmony_ci 1380bf215546Sopenharmony_ci case nir_op_usub_borrow: 1381bf215546Sopenharmony_ci unreachable("Should have been lowered by borrow_to_arith()."); 1382bf215546Sopenharmony_ci 1383bf215546Sopenharmony_ci case nir_op_umod: 1384bf215546Sopenharmony_ci case nir_op_irem: 1385bf215546Sopenharmony_ci /* According to the sign table for INT DIV in the Ivy Bridge PRM, it 1386bf215546Sopenharmony_ci * appears that our hardware just does the right thing for signed 1387bf215546Sopenharmony_ci * remainder. 1388bf215546Sopenharmony_ci */ 1389bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest.dest) < 64); 1390bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]); 1391bf215546Sopenharmony_ci break; 1392bf215546Sopenharmony_ci 1393bf215546Sopenharmony_ci case nir_op_imod: { 1394bf215546Sopenharmony_ci /* Get a regular C-style remainder. If a % b == 0, set the predicate. */ 1395bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]); 1396bf215546Sopenharmony_ci 1397bf215546Sopenharmony_ci /* Math instructions don't support conditional mod */ 1398bf215546Sopenharmony_ci inst = bld.MOV(bld.null_reg_d(), result); 1399bf215546Sopenharmony_ci inst->conditional_mod = BRW_CONDITIONAL_NZ; 1400bf215546Sopenharmony_ci 1401bf215546Sopenharmony_ci /* Now, we need to determine if signs of the sources are different. 1402bf215546Sopenharmony_ci * When we XOR the sources, the top bit is 0 if they are the same and 1 1403bf215546Sopenharmony_ci * if they are different. We can then use a conditional modifier to 1404bf215546Sopenharmony_ci * turn that into a predicate. This leads us to an XOR.l instruction. 1405bf215546Sopenharmony_ci * 1406bf215546Sopenharmony_ci * Technically, according to the PRM, you're not allowed to use .l on a 1407bf215546Sopenharmony_ci * XOR instruction. However, empirical experiments and Curro's reading 1408bf215546Sopenharmony_ci * of the simulator source both indicate that it's safe. 1409bf215546Sopenharmony_ci */ 1410bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D); 1411bf215546Sopenharmony_ci inst = bld.XOR(tmp, op[0], op[1]); 1412bf215546Sopenharmony_ci inst->predicate = BRW_PREDICATE_NORMAL; 1413bf215546Sopenharmony_ci inst->conditional_mod = BRW_CONDITIONAL_L; 1414bf215546Sopenharmony_ci 1415bf215546Sopenharmony_ci /* If the result of the initial remainder operation is non-zero and the 1416bf215546Sopenharmony_ci * two sources have different signs, add in a copy of op[1] to get the 1417bf215546Sopenharmony_ci * final integer modulus value. 1418bf215546Sopenharmony_ci */ 1419bf215546Sopenharmony_ci inst = bld.ADD(result, result, op[1]); 1420bf215546Sopenharmony_ci inst->predicate = BRW_PREDICATE_NORMAL; 1421bf215546Sopenharmony_ci break; 1422bf215546Sopenharmony_ci } 1423bf215546Sopenharmony_ci 1424bf215546Sopenharmony_ci case nir_op_flt32: 1425bf215546Sopenharmony_ci case nir_op_fge32: 1426bf215546Sopenharmony_ci case nir_op_feq32: 1427bf215546Sopenharmony_ci case nir_op_fneu32: { 1428bf215546Sopenharmony_ci fs_reg dest = result; 1429bf215546Sopenharmony_ci 1430bf215546Sopenharmony_ci const uint32_t bit_size = nir_src_bit_size(instr->src[0].src); 1431bf215546Sopenharmony_ci if (bit_size != 32) 1432bf215546Sopenharmony_ci dest = bld.vgrf(op[0].type, 1); 1433bf215546Sopenharmony_ci 1434bf215546Sopenharmony_ci bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op)); 1435bf215546Sopenharmony_ci 1436bf215546Sopenharmony_ci if (bit_size > 32) { 1437bf215546Sopenharmony_ci bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0)); 1438bf215546Sopenharmony_ci } else if(bit_size < 32) { 1439bf215546Sopenharmony_ci /* When we convert the result to 32-bit we need to be careful and do 1440bf215546Sopenharmony_ci * it as a signed conversion to get sign extension (for 32-bit true) 1441bf215546Sopenharmony_ci */ 1442bf215546Sopenharmony_ci const brw_reg_type src_type = 1443bf215546Sopenharmony_ci brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D); 1444bf215546Sopenharmony_ci 1445bf215546Sopenharmony_ci bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type)); 1446bf215546Sopenharmony_ci } 1447bf215546Sopenharmony_ci break; 1448bf215546Sopenharmony_ci } 1449bf215546Sopenharmony_ci 1450bf215546Sopenharmony_ci case nir_op_ilt32: 1451bf215546Sopenharmony_ci case nir_op_ult32: 1452bf215546Sopenharmony_ci case nir_op_ige32: 1453bf215546Sopenharmony_ci case nir_op_uge32: 1454bf215546Sopenharmony_ci case nir_op_ieq32: 1455bf215546Sopenharmony_ci case nir_op_ine32: { 1456bf215546Sopenharmony_ci fs_reg dest = result; 1457bf215546Sopenharmony_ci 1458bf215546Sopenharmony_ci const uint32_t bit_size = type_sz(op[0].type) * 8; 1459bf215546Sopenharmony_ci if (bit_size != 32) 1460bf215546Sopenharmony_ci dest = bld.vgrf(op[0].type, 1); 1461bf215546Sopenharmony_ci 1462bf215546Sopenharmony_ci bld.CMP(dest, op[0], op[1], 1463bf215546Sopenharmony_ci brw_cmod_for_nir_comparison(instr->op)); 1464bf215546Sopenharmony_ci 1465bf215546Sopenharmony_ci if (bit_size > 32) { 1466bf215546Sopenharmony_ci bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0)); 1467bf215546Sopenharmony_ci } else if (bit_size < 32) { 1468bf215546Sopenharmony_ci /* When we convert the result to 32-bit we need to be careful and do 1469bf215546Sopenharmony_ci * it as a signed conversion to get sign extension (for 32-bit true) 1470bf215546Sopenharmony_ci */ 1471bf215546Sopenharmony_ci const brw_reg_type src_type = 1472bf215546Sopenharmony_ci brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D); 1473bf215546Sopenharmony_ci 1474bf215546Sopenharmony_ci bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type)); 1475bf215546Sopenharmony_ci } 1476bf215546Sopenharmony_ci break; 1477bf215546Sopenharmony_ci } 1478bf215546Sopenharmony_ci 1479bf215546Sopenharmony_ci case nir_op_inot: 1480bf215546Sopenharmony_ci if (devinfo->ver >= 8) { 1481bf215546Sopenharmony_ci nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src); 1482bf215546Sopenharmony_ci 1483bf215546Sopenharmony_ci if (inot_src_instr != NULL && 1484bf215546Sopenharmony_ci (inot_src_instr->op == nir_op_ior || 1485bf215546Sopenharmony_ci inot_src_instr->op == nir_op_ixor || 1486bf215546Sopenharmony_ci inot_src_instr->op == nir_op_iand)) { 1487bf215546Sopenharmony_ci /* The sources of the source logical instruction are now the 1488bf215546Sopenharmony_ci * sources of the instruction that will be generated. 1489bf215546Sopenharmony_ci */ 1490bf215546Sopenharmony_ci prepare_alu_destination_and_sources(bld, inot_src_instr, op, false); 1491bf215546Sopenharmony_ci resolve_inot_sources(bld, inot_src_instr, op); 1492bf215546Sopenharmony_ci 1493bf215546Sopenharmony_ci /* Smash all of the sources and destination to be signed. This 1494bf215546Sopenharmony_ci * doesn't matter for the operation of the instruction, but cmod 1495bf215546Sopenharmony_ci * propagation fails on unsigned sources with negation (due to 1496bf215546Sopenharmony_ci * fs_inst::can_do_cmod returning false). 1497bf215546Sopenharmony_ci */ 1498bf215546Sopenharmony_ci result.type = 1499bf215546Sopenharmony_ci brw_type_for_nir_type(devinfo, 1500bf215546Sopenharmony_ci (nir_alu_type)(nir_type_int | 1501bf215546Sopenharmony_ci nir_dest_bit_size(instr->dest.dest))); 1502bf215546Sopenharmony_ci op[0].type = 1503bf215546Sopenharmony_ci brw_type_for_nir_type(devinfo, 1504bf215546Sopenharmony_ci (nir_alu_type)(nir_type_int | 1505bf215546Sopenharmony_ci nir_src_bit_size(inot_src_instr->src[0].src))); 1506bf215546Sopenharmony_ci op[1].type = 1507bf215546Sopenharmony_ci brw_type_for_nir_type(devinfo, 1508bf215546Sopenharmony_ci (nir_alu_type)(nir_type_int | 1509bf215546Sopenharmony_ci nir_src_bit_size(inot_src_instr->src[1].src))); 1510bf215546Sopenharmony_ci 1511bf215546Sopenharmony_ci /* For XOR, only invert one of the sources. Arbitrarily choose 1512bf215546Sopenharmony_ci * the first source. 1513bf215546Sopenharmony_ci */ 1514bf215546Sopenharmony_ci op[0].negate = !op[0].negate; 1515bf215546Sopenharmony_ci if (inot_src_instr->op != nir_op_ixor) 1516bf215546Sopenharmony_ci op[1].negate = !op[1].negate; 1517bf215546Sopenharmony_ci 1518bf215546Sopenharmony_ci switch (inot_src_instr->op) { 1519bf215546Sopenharmony_ci case nir_op_ior: 1520bf215546Sopenharmony_ci bld.AND(result, op[0], op[1]); 1521bf215546Sopenharmony_ci return; 1522bf215546Sopenharmony_ci 1523bf215546Sopenharmony_ci case nir_op_iand: 1524bf215546Sopenharmony_ci bld.OR(result, op[0], op[1]); 1525bf215546Sopenharmony_ci return; 1526bf215546Sopenharmony_ci 1527bf215546Sopenharmony_ci case nir_op_ixor: 1528bf215546Sopenharmony_ci bld.XOR(result, op[0], op[1]); 1529bf215546Sopenharmony_ci return; 1530bf215546Sopenharmony_ci 1531bf215546Sopenharmony_ci default: 1532bf215546Sopenharmony_ci unreachable("impossible opcode"); 1533bf215546Sopenharmony_ci } 1534bf215546Sopenharmony_ci } 1535bf215546Sopenharmony_ci op[0] = resolve_source_modifiers(op[0]); 1536bf215546Sopenharmony_ci } 1537bf215546Sopenharmony_ci bld.NOT(result, op[0]); 1538bf215546Sopenharmony_ci break; 1539bf215546Sopenharmony_ci case nir_op_ixor: 1540bf215546Sopenharmony_ci if (devinfo->ver >= 8) { 1541bf215546Sopenharmony_ci resolve_inot_sources(bld, instr, op); 1542bf215546Sopenharmony_ci } 1543bf215546Sopenharmony_ci bld.XOR(result, op[0], op[1]); 1544bf215546Sopenharmony_ci break; 1545bf215546Sopenharmony_ci case nir_op_ior: 1546bf215546Sopenharmony_ci if (devinfo->ver >= 8) { 1547bf215546Sopenharmony_ci resolve_inot_sources(bld, instr, op); 1548bf215546Sopenharmony_ci } 1549bf215546Sopenharmony_ci bld.OR(result, op[0], op[1]); 1550bf215546Sopenharmony_ci break; 1551bf215546Sopenharmony_ci case nir_op_iand: 1552bf215546Sopenharmony_ci if (devinfo->ver >= 8) { 1553bf215546Sopenharmony_ci resolve_inot_sources(bld, instr, op); 1554bf215546Sopenharmony_ci } 1555bf215546Sopenharmony_ci bld.AND(result, op[0], op[1]); 1556bf215546Sopenharmony_ci break; 1557bf215546Sopenharmony_ci 1558bf215546Sopenharmony_ci case nir_op_fdot2: 1559bf215546Sopenharmony_ci case nir_op_fdot3: 1560bf215546Sopenharmony_ci case nir_op_fdot4: 1561bf215546Sopenharmony_ci case nir_op_b32all_fequal2: 1562bf215546Sopenharmony_ci case nir_op_b32all_iequal2: 1563bf215546Sopenharmony_ci case nir_op_b32all_fequal3: 1564bf215546Sopenharmony_ci case nir_op_b32all_iequal3: 1565bf215546Sopenharmony_ci case nir_op_b32all_fequal4: 1566bf215546Sopenharmony_ci case nir_op_b32all_iequal4: 1567bf215546Sopenharmony_ci case nir_op_b32any_fnequal2: 1568bf215546Sopenharmony_ci case nir_op_b32any_inequal2: 1569bf215546Sopenharmony_ci case nir_op_b32any_fnequal3: 1570bf215546Sopenharmony_ci case nir_op_b32any_inequal3: 1571bf215546Sopenharmony_ci case nir_op_b32any_fnequal4: 1572bf215546Sopenharmony_ci case nir_op_b32any_inequal4: 1573bf215546Sopenharmony_ci unreachable("Lowered by nir_lower_alu_reductions"); 1574bf215546Sopenharmony_ci 1575bf215546Sopenharmony_ci case nir_op_ldexp: 1576bf215546Sopenharmony_ci unreachable("not reached: should be handled by ldexp_to_arith()"); 1577bf215546Sopenharmony_ci 1578bf215546Sopenharmony_ci case nir_op_fsqrt: 1579bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]); 1580bf215546Sopenharmony_ci break; 1581bf215546Sopenharmony_ci 1582bf215546Sopenharmony_ci case nir_op_frsq: 1583bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]); 1584bf215546Sopenharmony_ci break; 1585bf215546Sopenharmony_ci 1586bf215546Sopenharmony_ci case nir_op_i2b32: 1587bf215546Sopenharmony_ci case nir_op_f2b32: { 1588bf215546Sopenharmony_ci uint32_t bit_size = nir_src_bit_size(instr->src[0].src); 1589bf215546Sopenharmony_ci if (bit_size == 64) { 1590bf215546Sopenharmony_ci /* two-argument instructions can't take 64-bit immediates */ 1591bf215546Sopenharmony_ci fs_reg zero; 1592bf215546Sopenharmony_ci fs_reg tmp; 1593bf215546Sopenharmony_ci 1594bf215546Sopenharmony_ci if (instr->op == nir_op_f2b32) { 1595bf215546Sopenharmony_ci zero = vgrf(glsl_type::double_type); 1596bf215546Sopenharmony_ci tmp = vgrf(glsl_type::double_type); 1597bf215546Sopenharmony_ci bld.MOV(zero, setup_imm_df(bld, 0.0)); 1598bf215546Sopenharmony_ci } else { 1599bf215546Sopenharmony_ci zero = vgrf(glsl_type::int64_t_type); 1600bf215546Sopenharmony_ci tmp = vgrf(glsl_type::int64_t_type); 1601bf215546Sopenharmony_ci bld.MOV(zero, brw_imm_q(0)); 1602bf215546Sopenharmony_ci } 1603bf215546Sopenharmony_ci 1604bf215546Sopenharmony_ci /* A SIMD16 execution needs to be split in two instructions, so use 1605bf215546Sopenharmony_ci * a vgrf instead of the flag register as dst so instruction splitting 1606bf215546Sopenharmony_ci * works 1607bf215546Sopenharmony_ci */ 1608bf215546Sopenharmony_ci bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ); 1609bf215546Sopenharmony_ci bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0)); 1610bf215546Sopenharmony_ci } else { 1611bf215546Sopenharmony_ci fs_reg zero; 1612bf215546Sopenharmony_ci if (bit_size == 32) { 1613bf215546Sopenharmony_ci zero = instr->op == nir_op_f2b32 ? brw_imm_f(0.0f) : brw_imm_d(0); 1614bf215546Sopenharmony_ci } else { 1615bf215546Sopenharmony_ci assert(bit_size == 16); 1616bf215546Sopenharmony_ci zero = instr->op == nir_op_f2b32 ? 1617bf215546Sopenharmony_ci retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0); 1618bf215546Sopenharmony_ci } 1619bf215546Sopenharmony_ci bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ); 1620bf215546Sopenharmony_ci } 1621bf215546Sopenharmony_ci break; 1622bf215546Sopenharmony_ci } 1623bf215546Sopenharmony_ci 1624bf215546Sopenharmony_ci case nir_op_ftrunc: 1625bf215546Sopenharmony_ci inst = bld.RNDZ(result, op[0]); 1626bf215546Sopenharmony_ci if (devinfo->ver < 6) { 1627bf215546Sopenharmony_ci set_condmod(BRW_CONDITIONAL_R, inst); 1628bf215546Sopenharmony_ci set_predicate(BRW_PREDICATE_NORMAL, 1629bf215546Sopenharmony_ci bld.ADD(result, result, brw_imm_f(1.0f))); 1630bf215546Sopenharmony_ci inst = bld.MOV(result, result); /* for potential saturation */ 1631bf215546Sopenharmony_ci } 1632bf215546Sopenharmony_ci break; 1633bf215546Sopenharmony_ci 1634bf215546Sopenharmony_ci case nir_op_fceil: { 1635bf215546Sopenharmony_ci op[0].negate = !op[0].negate; 1636bf215546Sopenharmony_ci fs_reg temp = vgrf(glsl_type::float_type); 1637bf215546Sopenharmony_ci bld.RNDD(temp, op[0]); 1638bf215546Sopenharmony_ci temp.negate = true; 1639bf215546Sopenharmony_ci inst = bld.MOV(result, temp); 1640bf215546Sopenharmony_ci break; 1641bf215546Sopenharmony_ci } 1642bf215546Sopenharmony_ci case nir_op_ffloor: 1643bf215546Sopenharmony_ci inst = bld.RNDD(result, op[0]); 1644bf215546Sopenharmony_ci break; 1645bf215546Sopenharmony_ci case nir_op_ffract: 1646bf215546Sopenharmony_ci inst = bld.FRC(result, op[0]); 1647bf215546Sopenharmony_ci break; 1648bf215546Sopenharmony_ci case nir_op_fround_even: 1649bf215546Sopenharmony_ci inst = bld.RNDE(result, op[0]); 1650bf215546Sopenharmony_ci if (devinfo->ver < 6) { 1651bf215546Sopenharmony_ci set_condmod(BRW_CONDITIONAL_R, inst); 1652bf215546Sopenharmony_ci set_predicate(BRW_PREDICATE_NORMAL, 1653bf215546Sopenharmony_ci bld.ADD(result, result, brw_imm_f(1.0f))); 1654bf215546Sopenharmony_ci inst = bld.MOV(result, result); /* for potential saturation */ 1655bf215546Sopenharmony_ci } 1656bf215546Sopenharmony_ci break; 1657bf215546Sopenharmony_ci 1658bf215546Sopenharmony_ci case nir_op_fquantize2f16: { 1659bf215546Sopenharmony_ci fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D); 1660bf215546Sopenharmony_ci fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F); 1661bf215546Sopenharmony_ci fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F); 1662bf215546Sopenharmony_ci 1663bf215546Sopenharmony_ci /* The destination stride must be at least as big as the source stride. */ 1664bf215546Sopenharmony_ci tmp16.type = devinfo->ver > 7 1665bf215546Sopenharmony_ci ? BRW_REGISTER_TYPE_HF : BRW_REGISTER_TYPE_W; 1666bf215546Sopenharmony_ci tmp16.stride = 2; 1667bf215546Sopenharmony_ci 1668bf215546Sopenharmony_ci /* Check for denormal */ 1669bf215546Sopenharmony_ci fs_reg abs_src0 = op[0]; 1670bf215546Sopenharmony_ci abs_src0.abs = true; 1671bf215546Sopenharmony_ci bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)), 1672bf215546Sopenharmony_ci BRW_CONDITIONAL_L); 1673bf215546Sopenharmony_ci /* Get the appropriately signed zero */ 1674bf215546Sopenharmony_ci bld.AND(retype(zero, BRW_REGISTER_TYPE_UD), 1675bf215546Sopenharmony_ci retype(op[0], BRW_REGISTER_TYPE_UD), 1676bf215546Sopenharmony_ci brw_imm_ud(0x80000000)); 1677bf215546Sopenharmony_ci /* Do the actual F32 -> F16 -> F32 conversion */ 1678bf215546Sopenharmony_ci bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]); 1679bf215546Sopenharmony_ci bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16); 1680bf215546Sopenharmony_ci /* Select that or zero based on normal status */ 1681bf215546Sopenharmony_ci inst = bld.SEL(result, zero, tmp32); 1682bf215546Sopenharmony_ci inst->predicate = BRW_PREDICATE_NORMAL; 1683bf215546Sopenharmony_ci break; 1684bf215546Sopenharmony_ci } 1685bf215546Sopenharmony_ci 1686bf215546Sopenharmony_ci case nir_op_imin: 1687bf215546Sopenharmony_ci case nir_op_umin: 1688bf215546Sopenharmony_ci case nir_op_fmin: 1689bf215546Sopenharmony_ci inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L); 1690bf215546Sopenharmony_ci break; 1691bf215546Sopenharmony_ci 1692bf215546Sopenharmony_ci case nir_op_imax: 1693bf215546Sopenharmony_ci case nir_op_umax: 1694bf215546Sopenharmony_ci case nir_op_fmax: 1695bf215546Sopenharmony_ci inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE); 1696bf215546Sopenharmony_ci break; 1697bf215546Sopenharmony_ci 1698bf215546Sopenharmony_ci case nir_op_pack_snorm_2x16: 1699bf215546Sopenharmony_ci case nir_op_pack_snorm_4x8: 1700bf215546Sopenharmony_ci case nir_op_pack_unorm_2x16: 1701bf215546Sopenharmony_ci case nir_op_pack_unorm_4x8: 1702bf215546Sopenharmony_ci case nir_op_unpack_snorm_2x16: 1703bf215546Sopenharmony_ci case nir_op_unpack_snorm_4x8: 1704bf215546Sopenharmony_ci case nir_op_unpack_unorm_2x16: 1705bf215546Sopenharmony_ci case nir_op_unpack_unorm_4x8: 1706bf215546Sopenharmony_ci case nir_op_unpack_half_2x16: 1707bf215546Sopenharmony_ci case nir_op_pack_half_2x16: 1708bf215546Sopenharmony_ci unreachable("not reached: should be handled by lower_packing_builtins"); 1709bf215546Sopenharmony_ci 1710bf215546Sopenharmony_ci case nir_op_unpack_half_2x16_split_x_flush_to_zero: 1711bf215546Sopenharmony_ci assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode); 1712bf215546Sopenharmony_ci FALLTHROUGH; 1713bf215546Sopenharmony_ci case nir_op_unpack_half_2x16_split_x: 1714bf215546Sopenharmony_ci inst = bld.emit(BRW_OPCODE_F16TO32, result, 1715bf215546Sopenharmony_ci subscript(op[0], BRW_REGISTER_TYPE_UW, 0)); 1716bf215546Sopenharmony_ci break; 1717bf215546Sopenharmony_ci 1718bf215546Sopenharmony_ci case nir_op_unpack_half_2x16_split_y_flush_to_zero: 1719bf215546Sopenharmony_ci assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode); 1720bf215546Sopenharmony_ci FALLTHROUGH; 1721bf215546Sopenharmony_ci case nir_op_unpack_half_2x16_split_y: 1722bf215546Sopenharmony_ci inst = bld.emit(BRW_OPCODE_F16TO32, result, 1723bf215546Sopenharmony_ci subscript(op[0], BRW_REGISTER_TYPE_UW, 1)); 1724bf215546Sopenharmony_ci break; 1725bf215546Sopenharmony_ci 1726bf215546Sopenharmony_ci case nir_op_pack_64_2x32_split: 1727bf215546Sopenharmony_ci case nir_op_pack_32_2x16_split: 1728bf215546Sopenharmony_ci bld.emit(FS_OPCODE_PACK, result, op[0], op[1]); 1729bf215546Sopenharmony_ci break; 1730bf215546Sopenharmony_ci 1731bf215546Sopenharmony_ci case nir_op_pack_32_4x8_split: 1732bf215546Sopenharmony_ci bld.emit(FS_OPCODE_PACK, result, op, 4); 1733bf215546Sopenharmony_ci break; 1734bf215546Sopenharmony_ci 1735bf215546Sopenharmony_ci case nir_op_unpack_64_2x32_split_x: 1736bf215546Sopenharmony_ci case nir_op_unpack_64_2x32_split_y: { 1737bf215546Sopenharmony_ci if (instr->op == nir_op_unpack_64_2x32_split_x) 1738bf215546Sopenharmony_ci bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0)); 1739bf215546Sopenharmony_ci else 1740bf215546Sopenharmony_ci bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1)); 1741bf215546Sopenharmony_ci break; 1742bf215546Sopenharmony_ci } 1743bf215546Sopenharmony_ci 1744bf215546Sopenharmony_ci case nir_op_unpack_32_2x16_split_x: 1745bf215546Sopenharmony_ci case nir_op_unpack_32_2x16_split_y: { 1746bf215546Sopenharmony_ci if (instr->op == nir_op_unpack_32_2x16_split_x) 1747bf215546Sopenharmony_ci bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0)); 1748bf215546Sopenharmony_ci else 1749bf215546Sopenharmony_ci bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1)); 1750bf215546Sopenharmony_ci break; 1751bf215546Sopenharmony_ci } 1752bf215546Sopenharmony_ci 1753bf215546Sopenharmony_ci case nir_op_fpow: 1754bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]); 1755bf215546Sopenharmony_ci break; 1756bf215546Sopenharmony_ci 1757bf215546Sopenharmony_ci case nir_op_bitfield_reverse: 1758bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest.dest) < 64); 1759bf215546Sopenharmony_ci bld.BFREV(result, op[0]); 1760bf215546Sopenharmony_ci break; 1761bf215546Sopenharmony_ci 1762bf215546Sopenharmony_ci case nir_op_bit_count: 1763bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest.dest) < 64); 1764bf215546Sopenharmony_ci bld.CBIT(result, op[0]); 1765bf215546Sopenharmony_ci break; 1766bf215546Sopenharmony_ci 1767bf215546Sopenharmony_ci case nir_op_ufind_msb: { 1768bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest.dest) < 64); 1769bf215546Sopenharmony_ci emit_find_msb_using_lzd(bld, result, op[0], false); 1770bf215546Sopenharmony_ci break; 1771bf215546Sopenharmony_ci } 1772bf215546Sopenharmony_ci 1773bf215546Sopenharmony_ci case nir_op_uclz: 1774bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest.dest) == 32); 1775bf215546Sopenharmony_ci bld.LZD(retype(result, BRW_REGISTER_TYPE_UD), op[0]); 1776bf215546Sopenharmony_ci break; 1777bf215546Sopenharmony_ci 1778bf215546Sopenharmony_ci case nir_op_ifind_msb: { 1779bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest.dest) < 64); 1780bf215546Sopenharmony_ci 1781bf215546Sopenharmony_ci if (devinfo->ver < 7) { 1782bf215546Sopenharmony_ci emit_find_msb_using_lzd(bld, result, op[0], true); 1783bf215546Sopenharmony_ci } else { 1784bf215546Sopenharmony_ci bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]); 1785bf215546Sopenharmony_ci 1786bf215546Sopenharmony_ci /* FBH counts from the MSB side, while GLSL's findMSB() wants the 1787bf215546Sopenharmony_ci * count from the LSB side. If FBH didn't return an error 1788bf215546Sopenharmony_ci * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB 1789bf215546Sopenharmony_ci * count into an LSB count. 1790bf215546Sopenharmony_ci */ 1791bf215546Sopenharmony_ci bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ); 1792bf215546Sopenharmony_ci 1793bf215546Sopenharmony_ci inst = bld.ADD(result, result, brw_imm_d(31)); 1794bf215546Sopenharmony_ci inst->predicate = BRW_PREDICATE_NORMAL; 1795bf215546Sopenharmony_ci inst->src[0].negate = true; 1796bf215546Sopenharmony_ci } 1797bf215546Sopenharmony_ci break; 1798bf215546Sopenharmony_ci } 1799bf215546Sopenharmony_ci 1800bf215546Sopenharmony_ci case nir_op_find_lsb: 1801bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest.dest) < 64); 1802bf215546Sopenharmony_ci 1803bf215546Sopenharmony_ci if (devinfo->ver < 7) { 1804bf215546Sopenharmony_ci fs_reg temp = vgrf(glsl_type::int_type); 1805bf215546Sopenharmony_ci 1806bf215546Sopenharmony_ci /* (x & -x) generates a value that consists of only the LSB of x. 1807bf215546Sopenharmony_ci * For all powers of 2, findMSB(y) == findLSB(y). 1808bf215546Sopenharmony_ci */ 1809bf215546Sopenharmony_ci fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D); 1810bf215546Sopenharmony_ci fs_reg negated_src = src; 1811bf215546Sopenharmony_ci 1812bf215546Sopenharmony_ci /* One must be negated, and the other must be non-negated. It 1813bf215546Sopenharmony_ci * doesn't matter which is which. 1814bf215546Sopenharmony_ci */ 1815bf215546Sopenharmony_ci negated_src.negate = true; 1816bf215546Sopenharmony_ci src.negate = false; 1817bf215546Sopenharmony_ci 1818bf215546Sopenharmony_ci bld.AND(temp, src, negated_src); 1819bf215546Sopenharmony_ci emit_find_msb_using_lzd(bld, result, temp, false); 1820bf215546Sopenharmony_ci } else { 1821bf215546Sopenharmony_ci bld.FBL(result, op[0]); 1822bf215546Sopenharmony_ci } 1823bf215546Sopenharmony_ci break; 1824bf215546Sopenharmony_ci 1825bf215546Sopenharmony_ci case nir_op_ubitfield_extract: 1826bf215546Sopenharmony_ci case nir_op_ibitfield_extract: 1827bf215546Sopenharmony_ci unreachable("should have been lowered"); 1828bf215546Sopenharmony_ci case nir_op_ubfe: 1829bf215546Sopenharmony_ci case nir_op_ibfe: 1830bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest.dest) < 64); 1831bf215546Sopenharmony_ci bld.BFE(result, op[2], op[1], op[0]); 1832bf215546Sopenharmony_ci break; 1833bf215546Sopenharmony_ci case nir_op_bfm: 1834bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest.dest) < 64); 1835bf215546Sopenharmony_ci bld.BFI1(result, op[0], op[1]); 1836bf215546Sopenharmony_ci break; 1837bf215546Sopenharmony_ci case nir_op_bfi: 1838bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest.dest) < 64); 1839bf215546Sopenharmony_ci bld.BFI2(result, op[0], op[1], op[2]); 1840bf215546Sopenharmony_ci break; 1841bf215546Sopenharmony_ci 1842bf215546Sopenharmony_ci case nir_op_bitfield_insert: 1843bf215546Sopenharmony_ci unreachable("not reached: should have been lowered"); 1844bf215546Sopenharmony_ci 1845bf215546Sopenharmony_ci /* For all shift operations: 1846bf215546Sopenharmony_ci * 1847bf215546Sopenharmony_ci * Gen4 - Gen7: After application of source modifiers, the low 5-bits of 1848bf215546Sopenharmony_ci * src1 are used an unsigned value for the shift count. 1849bf215546Sopenharmony_ci * 1850bf215546Sopenharmony_ci * Gen8: As with earlier platforms, but for Q and UQ types on src0, the low 1851bf215546Sopenharmony_ci * 6-bit of src1 are used. 1852bf215546Sopenharmony_ci * 1853bf215546Sopenharmony_ci * Gen9+: The low bits of src1 matching the size of src0 (e.g., 4-bits for 1854bf215546Sopenharmony_ci * W or UW src0). 1855bf215546Sopenharmony_ci * 1856bf215546Sopenharmony_ci * The implication is that the following instruction will produce a 1857bf215546Sopenharmony_ci * different result on Gen9+ than on previous platforms: 1858bf215546Sopenharmony_ci * 1859bf215546Sopenharmony_ci * shr(8) g4<1>UW g12<8,8,1>UW 0x0010UW 1860bf215546Sopenharmony_ci * 1861bf215546Sopenharmony_ci * where Gen9+ will shift by zero, and earlier platforms will shift by 16. 1862bf215546Sopenharmony_ci * 1863bf215546Sopenharmony_ci * This does not seem to be the case. Experimentally, it has been 1864bf215546Sopenharmony_ci * determined that shifts of 16-bit values on Gen8 behave properly. Shifts 1865bf215546Sopenharmony_ci * of 8-bit values on both Gen8 and Gen9 do not. Gen11+ lowers 8-bit 1866bf215546Sopenharmony_ci * values, so those platforms were not tested. No features expose access 1867bf215546Sopenharmony_ci * to 8- or 16-bit types on Gen7 or earlier, so those platforms were not 1868bf215546Sopenharmony_ci * tested either. See 1869bf215546Sopenharmony_ci * https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/76. 1870bf215546Sopenharmony_ci * 1871bf215546Sopenharmony_ci * This is part of the reason 8-bit values are lowered to 16-bit on all 1872bf215546Sopenharmony_ci * platforms. 1873bf215546Sopenharmony_ci */ 1874bf215546Sopenharmony_ci case nir_op_ishl: 1875bf215546Sopenharmony_ci bld.SHL(result, op[0], op[1]); 1876bf215546Sopenharmony_ci break; 1877bf215546Sopenharmony_ci case nir_op_ishr: 1878bf215546Sopenharmony_ci bld.ASR(result, op[0], op[1]); 1879bf215546Sopenharmony_ci break; 1880bf215546Sopenharmony_ci case nir_op_ushr: 1881bf215546Sopenharmony_ci bld.SHR(result, op[0], op[1]); 1882bf215546Sopenharmony_ci break; 1883bf215546Sopenharmony_ci 1884bf215546Sopenharmony_ci case nir_op_urol: 1885bf215546Sopenharmony_ci bld.ROL(result, op[0], op[1]); 1886bf215546Sopenharmony_ci break; 1887bf215546Sopenharmony_ci case nir_op_uror: 1888bf215546Sopenharmony_ci bld.ROR(result, op[0], op[1]); 1889bf215546Sopenharmony_ci break; 1890bf215546Sopenharmony_ci 1891bf215546Sopenharmony_ci case nir_op_pack_half_2x16_split: 1892bf215546Sopenharmony_ci bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]); 1893bf215546Sopenharmony_ci break; 1894bf215546Sopenharmony_ci 1895bf215546Sopenharmony_ci case nir_op_sdot_4x8_iadd: 1896bf215546Sopenharmony_ci case nir_op_sdot_4x8_iadd_sat: 1897bf215546Sopenharmony_ci inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_D), 1898bf215546Sopenharmony_ci retype(op[2], BRW_REGISTER_TYPE_D), 1899bf215546Sopenharmony_ci retype(op[0], BRW_REGISTER_TYPE_D), 1900bf215546Sopenharmony_ci retype(op[1], BRW_REGISTER_TYPE_D)); 1901bf215546Sopenharmony_ci 1902bf215546Sopenharmony_ci if (instr->op == nir_op_sdot_4x8_iadd_sat) 1903bf215546Sopenharmony_ci inst->saturate = true; 1904bf215546Sopenharmony_ci break; 1905bf215546Sopenharmony_ci 1906bf215546Sopenharmony_ci case nir_op_udot_4x8_uadd: 1907bf215546Sopenharmony_ci case nir_op_udot_4x8_uadd_sat: 1908bf215546Sopenharmony_ci inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_UD), 1909bf215546Sopenharmony_ci retype(op[2], BRW_REGISTER_TYPE_UD), 1910bf215546Sopenharmony_ci retype(op[0], BRW_REGISTER_TYPE_UD), 1911bf215546Sopenharmony_ci retype(op[1], BRW_REGISTER_TYPE_UD)); 1912bf215546Sopenharmony_ci 1913bf215546Sopenharmony_ci if (instr->op == nir_op_udot_4x8_uadd_sat) 1914bf215546Sopenharmony_ci inst->saturate = true; 1915bf215546Sopenharmony_ci break; 1916bf215546Sopenharmony_ci 1917bf215546Sopenharmony_ci case nir_op_sudot_4x8_iadd: 1918bf215546Sopenharmony_ci case nir_op_sudot_4x8_iadd_sat: 1919bf215546Sopenharmony_ci inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_D), 1920bf215546Sopenharmony_ci retype(op[2], BRW_REGISTER_TYPE_D), 1921bf215546Sopenharmony_ci retype(op[0], BRW_REGISTER_TYPE_D), 1922bf215546Sopenharmony_ci retype(op[1], BRW_REGISTER_TYPE_UD)); 1923bf215546Sopenharmony_ci 1924bf215546Sopenharmony_ci if (instr->op == nir_op_sudot_4x8_iadd_sat) 1925bf215546Sopenharmony_ci inst->saturate = true; 1926bf215546Sopenharmony_ci break; 1927bf215546Sopenharmony_ci 1928bf215546Sopenharmony_ci case nir_op_ffma: 1929bf215546Sopenharmony_ci if (nir_has_any_rounding_mode_enabled(execution_mode)) { 1930bf215546Sopenharmony_ci brw_rnd_mode rnd = 1931bf215546Sopenharmony_ci brw_rnd_mode_from_execution_mode(execution_mode); 1932bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), 1933bf215546Sopenharmony_ci brw_imm_d(rnd)); 1934bf215546Sopenharmony_ci } 1935bf215546Sopenharmony_ci 1936bf215546Sopenharmony_ci inst = bld.MAD(result, op[2], op[1], op[0]); 1937bf215546Sopenharmony_ci break; 1938bf215546Sopenharmony_ci 1939bf215546Sopenharmony_ci case nir_op_flrp: 1940bf215546Sopenharmony_ci if (nir_has_any_rounding_mode_enabled(execution_mode)) { 1941bf215546Sopenharmony_ci brw_rnd_mode rnd = 1942bf215546Sopenharmony_ci brw_rnd_mode_from_execution_mode(execution_mode); 1943bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), 1944bf215546Sopenharmony_ci brw_imm_d(rnd)); 1945bf215546Sopenharmony_ci } 1946bf215546Sopenharmony_ci 1947bf215546Sopenharmony_ci inst = bld.LRP(result, op[0], op[1], op[2]); 1948bf215546Sopenharmony_ci break; 1949bf215546Sopenharmony_ci 1950bf215546Sopenharmony_ci case nir_op_b32csel: 1951bf215546Sopenharmony_ci if (optimize_frontfacing_ternary(instr, result)) 1952bf215546Sopenharmony_ci return; 1953bf215546Sopenharmony_ci 1954bf215546Sopenharmony_ci bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ); 1955bf215546Sopenharmony_ci inst = bld.SEL(result, op[1], op[2]); 1956bf215546Sopenharmony_ci inst->predicate = BRW_PREDICATE_NORMAL; 1957bf215546Sopenharmony_ci break; 1958bf215546Sopenharmony_ci 1959bf215546Sopenharmony_ci case nir_op_extract_u8: 1960bf215546Sopenharmony_ci case nir_op_extract_i8: { 1961bf215546Sopenharmony_ci unsigned byte = nir_src_as_uint(instr->src[1].src); 1962bf215546Sopenharmony_ci 1963bf215546Sopenharmony_ci /* The PRMs say: 1964bf215546Sopenharmony_ci * 1965bf215546Sopenharmony_ci * BDW+ 1966bf215546Sopenharmony_ci * There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB. 1967bf215546Sopenharmony_ci * Use two instructions and a word or DWord intermediate integer type. 1968bf215546Sopenharmony_ci */ 1969bf215546Sopenharmony_ci if (nir_dest_bit_size(instr->dest.dest) == 64) { 1970bf215546Sopenharmony_ci const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); 1971bf215546Sopenharmony_ci 1972bf215546Sopenharmony_ci if (instr->op == nir_op_extract_i8) { 1973bf215546Sopenharmony_ci /* If we need to sign extend, extract to a word first */ 1974bf215546Sopenharmony_ci fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W); 1975bf215546Sopenharmony_ci bld.MOV(w_temp, subscript(op[0], type, byte)); 1976bf215546Sopenharmony_ci bld.MOV(result, w_temp); 1977bf215546Sopenharmony_ci } else if (byte & 1) { 1978bf215546Sopenharmony_ci /* Extract the high byte from the word containing the desired byte 1979bf215546Sopenharmony_ci * offset. 1980bf215546Sopenharmony_ci */ 1981bf215546Sopenharmony_ci bld.SHR(result, 1982bf215546Sopenharmony_ci subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2), 1983bf215546Sopenharmony_ci brw_imm_uw(8)); 1984bf215546Sopenharmony_ci } else { 1985bf215546Sopenharmony_ci /* Otherwise use an AND with 0xff and a word type */ 1986bf215546Sopenharmony_ci bld.AND(result, 1987bf215546Sopenharmony_ci subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2), 1988bf215546Sopenharmony_ci brw_imm_uw(0xff)); 1989bf215546Sopenharmony_ci } 1990bf215546Sopenharmony_ci } else { 1991bf215546Sopenharmony_ci const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); 1992bf215546Sopenharmony_ci bld.MOV(result, subscript(op[0], type, byte)); 1993bf215546Sopenharmony_ci } 1994bf215546Sopenharmony_ci break; 1995bf215546Sopenharmony_ci } 1996bf215546Sopenharmony_ci 1997bf215546Sopenharmony_ci case nir_op_extract_u16: 1998bf215546Sopenharmony_ci case nir_op_extract_i16: { 1999bf215546Sopenharmony_ci const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16); 2000bf215546Sopenharmony_ci unsigned word = nir_src_as_uint(instr->src[1].src); 2001bf215546Sopenharmony_ci bld.MOV(result, subscript(op[0], type, word)); 2002bf215546Sopenharmony_ci break; 2003bf215546Sopenharmony_ci } 2004bf215546Sopenharmony_ci 2005bf215546Sopenharmony_ci default: 2006bf215546Sopenharmony_ci unreachable("unhandled instruction"); 2007bf215546Sopenharmony_ci } 2008bf215546Sopenharmony_ci 2009bf215546Sopenharmony_ci /* If we need to do a boolean resolve, replace the result with -(x & 1) 2010bf215546Sopenharmony_ci * to sign extend the low bit to 0/~0 2011bf215546Sopenharmony_ci */ 2012bf215546Sopenharmony_ci if (devinfo->ver <= 5 && 2013bf215546Sopenharmony_ci !result.is_null() && 2014bf215546Sopenharmony_ci (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) { 2015bf215546Sopenharmony_ci fs_reg masked = vgrf(glsl_type::int_type); 2016bf215546Sopenharmony_ci bld.AND(masked, result, brw_imm_d(1)); 2017bf215546Sopenharmony_ci masked.negate = true; 2018bf215546Sopenharmony_ci bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked); 2019bf215546Sopenharmony_ci } 2020bf215546Sopenharmony_ci} 2021bf215546Sopenharmony_ci 2022bf215546Sopenharmony_civoid 2023bf215546Sopenharmony_cifs_visitor::nir_emit_load_const(const fs_builder &bld, 2024bf215546Sopenharmony_ci nir_load_const_instr *instr) 2025bf215546Sopenharmony_ci{ 2026bf215546Sopenharmony_ci const brw_reg_type reg_type = 2027bf215546Sopenharmony_ci brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D); 2028bf215546Sopenharmony_ci fs_reg reg = bld.vgrf(reg_type, instr->def.num_components); 2029bf215546Sopenharmony_ci 2030bf215546Sopenharmony_ci switch (instr->def.bit_size) { 2031bf215546Sopenharmony_ci case 8: 2032bf215546Sopenharmony_ci for (unsigned i = 0; i < instr->def.num_components; i++) 2033bf215546Sopenharmony_ci bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value[i].i8)); 2034bf215546Sopenharmony_ci break; 2035bf215546Sopenharmony_ci 2036bf215546Sopenharmony_ci case 16: 2037bf215546Sopenharmony_ci for (unsigned i = 0; i < instr->def.num_components; i++) 2038bf215546Sopenharmony_ci bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value[i].i16)); 2039bf215546Sopenharmony_ci break; 2040bf215546Sopenharmony_ci 2041bf215546Sopenharmony_ci case 32: 2042bf215546Sopenharmony_ci for (unsigned i = 0; i < instr->def.num_components; i++) 2043bf215546Sopenharmony_ci bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value[i].i32)); 2044bf215546Sopenharmony_ci break; 2045bf215546Sopenharmony_ci 2046bf215546Sopenharmony_ci case 64: 2047bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 2048bf215546Sopenharmony_ci if (devinfo->ver == 7) { 2049bf215546Sopenharmony_ci /* We don't get 64-bit integer types until gfx8 */ 2050bf215546Sopenharmony_ci for (unsigned i = 0; i < instr->def.num_components; i++) { 2051bf215546Sopenharmony_ci bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF), 2052bf215546Sopenharmony_ci setup_imm_df(bld, instr->value[i].f64)); 2053bf215546Sopenharmony_ci } 2054bf215546Sopenharmony_ci } else { 2055bf215546Sopenharmony_ci for (unsigned i = 0; i < instr->def.num_components; i++) 2056bf215546Sopenharmony_ci bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value[i].i64)); 2057bf215546Sopenharmony_ci } 2058bf215546Sopenharmony_ci break; 2059bf215546Sopenharmony_ci 2060bf215546Sopenharmony_ci default: 2061bf215546Sopenharmony_ci unreachable("Invalid bit size"); 2062bf215546Sopenharmony_ci } 2063bf215546Sopenharmony_ci 2064bf215546Sopenharmony_ci nir_ssa_values[instr->def.index] = reg; 2065bf215546Sopenharmony_ci} 2066bf215546Sopenharmony_ci 2067bf215546Sopenharmony_cifs_reg 2068bf215546Sopenharmony_cifs_visitor::get_nir_src(const nir_src &src) 2069bf215546Sopenharmony_ci{ 2070bf215546Sopenharmony_ci fs_reg reg; 2071bf215546Sopenharmony_ci if (src.is_ssa) { 2072bf215546Sopenharmony_ci if (nir_src_is_undef(src)) { 2073bf215546Sopenharmony_ci const brw_reg_type reg_type = 2074bf215546Sopenharmony_ci brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D); 2075bf215546Sopenharmony_ci reg = bld.vgrf(reg_type, src.ssa->num_components); 2076bf215546Sopenharmony_ci } else { 2077bf215546Sopenharmony_ci reg = nir_ssa_values[src.ssa->index]; 2078bf215546Sopenharmony_ci } 2079bf215546Sopenharmony_ci } else { 2080bf215546Sopenharmony_ci /* We don't handle indirects on locals */ 2081bf215546Sopenharmony_ci assert(src.reg.indirect == NULL); 2082bf215546Sopenharmony_ci reg = offset(nir_locals[src.reg.reg->index], bld, 2083bf215546Sopenharmony_ci src.reg.base_offset * src.reg.reg->num_components); 2084bf215546Sopenharmony_ci } 2085bf215546Sopenharmony_ci 2086bf215546Sopenharmony_ci if (nir_src_bit_size(src) == 64 && devinfo->ver == 7) { 2087bf215546Sopenharmony_ci /* The only 64-bit type available on gfx7 is DF, so use that. */ 2088bf215546Sopenharmony_ci reg.type = BRW_REGISTER_TYPE_DF; 2089bf215546Sopenharmony_ci } else { 2090bf215546Sopenharmony_ci /* To avoid floating-point denorm flushing problems, set the type by 2091bf215546Sopenharmony_ci * default to an integer type - instructions that need floating point 2092bf215546Sopenharmony_ci * semantics will set this to F if they need to 2093bf215546Sopenharmony_ci */ 2094bf215546Sopenharmony_ci reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src), 2095bf215546Sopenharmony_ci BRW_REGISTER_TYPE_D); 2096bf215546Sopenharmony_ci } 2097bf215546Sopenharmony_ci 2098bf215546Sopenharmony_ci return reg; 2099bf215546Sopenharmony_ci} 2100bf215546Sopenharmony_ci 2101bf215546Sopenharmony_ci/** 2102bf215546Sopenharmony_ci * Return an IMM for constants; otherwise call get_nir_src() as normal. 2103bf215546Sopenharmony_ci * 2104bf215546Sopenharmony_ci * This function should not be called on any value which may be 64 bits. 2105bf215546Sopenharmony_ci * We could theoretically support 64-bit on gfx8+ but we choose not to 2106bf215546Sopenharmony_ci * because it wouldn't work in general (no gfx7 support) and there are 2107bf215546Sopenharmony_ci * enough restrictions in 64-bit immediates that you can't take the return 2108bf215546Sopenharmony_ci * value and treat it the same as the result of get_nir_src(). 2109bf215546Sopenharmony_ci */ 2110bf215546Sopenharmony_cifs_reg 2111bf215546Sopenharmony_cifs_visitor::get_nir_src_imm(const nir_src &src) 2112bf215546Sopenharmony_ci{ 2113bf215546Sopenharmony_ci assert(nir_src_bit_size(src) == 32); 2114bf215546Sopenharmony_ci return nir_src_is_const(src) ? 2115bf215546Sopenharmony_ci fs_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(src); 2116bf215546Sopenharmony_ci} 2117bf215546Sopenharmony_ci 2118bf215546Sopenharmony_cifs_reg 2119bf215546Sopenharmony_cifs_visitor::get_nir_dest(const nir_dest &dest) 2120bf215546Sopenharmony_ci{ 2121bf215546Sopenharmony_ci if (dest.is_ssa) { 2122bf215546Sopenharmony_ci const brw_reg_type reg_type = 2123bf215546Sopenharmony_ci brw_reg_type_from_bit_size(dest.ssa.bit_size, 2124bf215546Sopenharmony_ci dest.ssa.bit_size == 8 ? 2125bf215546Sopenharmony_ci BRW_REGISTER_TYPE_D : 2126bf215546Sopenharmony_ci BRW_REGISTER_TYPE_F); 2127bf215546Sopenharmony_ci nir_ssa_values[dest.ssa.index] = 2128bf215546Sopenharmony_ci bld.vgrf(reg_type, dest.ssa.num_components); 2129bf215546Sopenharmony_ci bld.UNDEF(nir_ssa_values[dest.ssa.index]); 2130bf215546Sopenharmony_ci return nir_ssa_values[dest.ssa.index]; 2131bf215546Sopenharmony_ci } else { 2132bf215546Sopenharmony_ci /* We don't handle indirects on locals */ 2133bf215546Sopenharmony_ci assert(dest.reg.indirect == NULL); 2134bf215546Sopenharmony_ci return offset(nir_locals[dest.reg.reg->index], bld, 2135bf215546Sopenharmony_ci dest.reg.base_offset * dest.reg.reg->num_components); 2136bf215546Sopenharmony_ci } 2137bf215546Sopenharmony_ci} 2138bf215546Sopenharmony_ci 2139bf215546Sopenharmony_civoid 2140bf215546Sopenharmony_cifs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst, 2141bf215546Sopenharmony_ci unsigned wr_mask) 2142bf215546Sopenharmony_ci{ 2143bf215546Sopenharmony_ci for (unsigned i = 0; i < 4; i++) { 2144bf215546Sopenharmony_ci if (!((wr_mask >> i) & 1)) 2145bf215546Sopenharmony_ci continue; 2146bf215546Sopenharmony_ci 2147bf215546Sopenharmony_ci fs_inst *new_inst = new(mem_ctx) fs_inst(inst); 2148bf215546Sopenharmony_ci new_inst->dst = offset(new_inst->dst, bld, i); 2149bf215546Sopenharmony_ci for (unsigned j = 0; j < new_inst->sources; j++) 2150bf215546Sopenharmony_ci if (new_inst->src[j].file == VGRF) 2151bf215546Sopenharmony_ci new_inst->src[j] = offset(new_inst->src[j], bld, i); 2152bf215546Sopenharmony_ci 2153bf215546Sopenharmony_ci bld.emit(new_inst); 2154bf215546Sopenharmony_ci } 2155bf215546Sopenharmony_ci} 2156bf215546Sopenharmony_ci 2157bf215546Sopenharmony_cistatic fs_inst * 2158bf215546Sopenharmony_ciemit_pixel_interpolater_send(const fs_builder &bld, 2159bf215546Sopenharmony_ci enum opcode opcode, 2160bf215546Sopenharmony_ci const fs_reg &dst, 2161bf215546Sopenharmony_ci const fs_reg &src, 2162bf215546Sopenharmony_ci const fs_reg &desc, 2163bf215546Sopenharmony_ci glsl_interp_mode interpolation) 2164bf215546Sopenharmony_ci{ 2165bf215546Sopenharmony_ci struct brw_wm_prog_data *wm_prog_data = 2166bf215546Sopenharmony_ci brw_wm_prog_data(bld.shader->stage_prog_data); 2167bf215546Sopenharmony_ci 2168bf215546Sopenharmony_ci fs_inst *inst = bld.emit(opcode, dst, src, desc); 2169bf215546Sopenharmony_ci /* 2 floats per slot returned */ 2170bf215546Sopenharmony_ci inst->size_written = 2 * dst.component_size(inst->exec_size); 2171bf215546Sopenharmony_ci if (interpolation == INTERP_MODE_NOPERSPECTIVE) { 2172bf215546Sopenharmony_ci inst->pi_noperspective = true; 2173bf215546Sopenharmony_ci /* TGL BSpec says: 2174bf215546Sopenharmony_ci * This field cannot be set to "Linear Interpolation" 2175bf215546Sopenharmony_ci * unless Non-Perspective Barycentric Enable in 3DSTATE_CLIP is enabled" 2176bf215546Sopenharmony_ci */ 2177bf215546Sopenharmony_ci wm_prog_data->uses_nonperspective_interp_modes = true; 2178bf215546Sopenharmony_ci } 2179bf215546Sopenharmony_ci 2180bf215546Sopenharmony_ci wm_prog_data->pulls_bary = true; 2181bf215546Sopenharmony_ci 2182bf215546Sopenharmony_ci return inst; 2183bf215546Sopenharmony_ci} 2184bf215546Sopenharmony_ci 2185bf215546Sopenharmony_ci/** 2186bf215546Sopenharmony_ci * Computes 1 << x, given a D/UD register containing some value x. 2187bf215546Sopenharmony_ci */ 2188bf215546Sopenharmony_cistatic fs_reg 2189bf215546Sopenharmony_ciintexp2(const fs_builder &bld, const fs_reg &x) 2190bf215546Sopenharmony_ci{ 2191bf215546Sopenharmony_ci assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D); 2192bf215546Sopenharmony_ci 2193bf215546Sopenharmony_ci fs_reg result = bld.vgrf(x.type, 1); 2194bf215546Sopenharmony_ci fs_reg one = bld.vgrf(x.type, 1); 2195bf215546Sopenharmony_ci 2196bf215546Sopenharmony_ci bld.MOV(one, retype(brw_imm_d(1), one.type)); 2197bf215546Sopenharmony_ci bld.SHL(result, one, x); 2198bf215546Sopenharmony_ci return result; 2199bf215546Sopenharmony_ci} 2200bf215546Sopenharmony_ci 2201bf215546Sopenharmony_civoid 2202bf215546Sopenharmony_cifs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src) 2203bf215546Sopenharmony_ci{ 2204bf215546Sopenharmony_ci assert(stage == MESA_SHADER_GEOMETRY); 2205bf215546Sopenharmony_ci 2206bf215546Sopenharmony_ci struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2207bf215546Sopenharmony_ci 2208bf215546Sopenharmony_ci if (gs_compile->control_data_header_size_bits == 0) 2209bf215546Sopenharmony_ci return; 2210bf215546Sopenharmony_ci 2211bf215546Sopenharmony_ci /* We can only do EndPrimitive() functionality when the control data 2212bf215546Sopenharmony_ci * consists of cut bits. Fortunately, the only time it isn't is when the 2213bf215546Sopenharmony_ci * output type is points, in which case EndPrimitive() is a no-op. 2214bf215546Sopenharmony_ci */ 2215bf215546Sopenharmony_ci if (gs_prog_data->control_data_format != 2216bf215546Sopenharmony_ci GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) { 2217bf215546Sopenharmony_ci return; 2218bf215546Sopenharmony_ci } 2219bf215546Sopenharmony_ci 2220bf215546Sopenharmony_ci /* Cut bits use one bit per vertex. */ 2221bf215546Sopenharmony_ci assert(gs_compile->control_data_bits_per_vertex == 1); 2222bf215546Sopenharmony_ci 2223bf215546Sopenharmony_ci fs_reg vertex_count = get_nir_src(vertex_count_nir_src); 2224bf215546Sopenharmony_ci vertex_count.type = BRW_REGISTER_TYPE_UD; 2225bf215546Sopenharmony_ci 2226bf215546Sopenharmony_ci /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting 2227bf215546Sopenharmony_ci * vertex n, 0 otherwise. So all we need to do here is mark bit 2228bf215546Sopenharmony_ci * (vertex_count - 1) % 32 in the cut_bits register to indicate that 2229bf215546Sopenharmony_ci * EndPrimitive() was called after emitting vertex (vertex_count - 1); 2230bf215546Sopenharmony_ci * vec4_gs_visitor::emit_control_data_bits() will take care of the rest. 2231bf215546Sopenharmony_ci * 2232bf215546Sopenharmony_ci * Note that if EndPrimitive() is called before emitting any vertices, this 2233bf215546Sopenharmony_ci * will cause us to set bit 31 of the control_data_bits register to 1. 2234bf215546Sopenharmony_ci * That's fine because: 2235bf215546Sopenharmony_ci * 2236bf215546Sopenharmony_ci * - If max_vertices < 32, then vertex number 31 (zero-based) will never be 2237bf215546Sopenharmony_ci * output, so the hardware will ignore cut bit 31. 2238bf215546Sopenharmony_ci * 2239bf215546Sopenharmony_ci * - If max_vertices == 32, then vertex number 31 is guaranteed to be the 2240bf215546Sopenharmony_ci * last vertex, so setting cut bit 31 has no effect (since the primitive 2241bf215546Sopenharmony_ci * is automatically ended when the GS terminates). 2242bf215546Sopenharmony_ci * 2243bf215546Sopenharmony_ci * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the 2244bf215546Sopenharmony_ci * control_data_bits register to 0 when the first vertex is emitted. 2245bf215546Sopenharmony_ci */ 2246bf215546Sopenharmony_ci 2247bf215546Sopenharmony_ci const fs_builder abld = bld.annotate("end primitive"); 2248bf215546Sopenharmony_ci 2249bf215546Sopenharmony_ci /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */ 2250bf215546Sopenharmony_ci fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2251bf215546Sopenharmony_ci abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); 2252bf215546Sopenharmony_ci fs_reg mask = intexp2(abld, prev_count); 2253bf215546Sopenharmony_ci /* Note: we're relying on the fact that the GEN SHL instruction only pays 2254bf215546Sopenharmony_ci * attention to the lower 5 bits of its second source argument, so on this 2255bf215546Sopenharmony_ci * architecture, 1 << (vertex_count - 1) is equivalent to 1 << 2256bf215546Sopenharmony_ci * ((vertex_count - 1) % 32). 2257bf215546Sopenharmony_ci */ 2258bf215546Sopenharmony_ci abld.OR(this->control_data_bits, this->control_data_bits, mask); 2259bf215546Sopenharmony_ci} 2260bf215546Sopenharmony_ci 2261bf215546Sopenharmony_civoid 2262bf215546Sopenharmony_cifs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count) 2263bf215546Sopenharmony_ci{ 2264bf215546Sopenharmony_ci assert(stage == MESA_SHADER_GEOMETRY); 2265bf215546Sopenharmony_ci assert(gs_compile->control_data_bits_per_vertex != 0); 2266bf215546Sopenharmony_ci 2267bf215546Sopenharmony_ci struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2268bf215546Sopenharmony_ci 2269bf215546Sopenharmony_ci const fs_builder abld = bld.annotate("emit control data bits"); 2270bf215546Sopenharmony_ci const fs_builder fwa_bld = bld.exec_all(); 2271bf215546Sopenharmony_ci 2272bf215546Sopenharmony_ci /* We use a single UD register to accumulate control data bits (32 bits 2273bf215546Sopenharmony_ci * for each of the SIMD8 channels). So we need to write a DWord (32 bits) 2274bf215546Sopenharmony_ci * at a time. 2275bf215546Sopenharmony_ci * 2276bf215546Sopenharmony_ci * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets. 2277bf215546Sopenharmony_ci * We have select a 128-bit group via the Global and Per-Slot Offsets, then 2278bf215546Sopenharmony_ci * use the Channel Mask phase to enable/disable which DWord within that 2279bf215546Sopenharmony_ci * group to write. (Remember, different SIMD8 channels may have emitted 2280bf215546Sopenharmony_ci * different numbers of vertices, so we may need per-slot offsets.) 2281bf215546Sopenharmony_ci * 2282bf215546Sopenharmony_ci * Channel masking presents an annoying problem: we may have to replicate 2283bf215546Sopenharmony_ci * the data up to 4 times: 2284bf215546Sopenharmony_ci * 2285bf215546Sopenharmony_ci * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data. 2286bf215546Sopenharmony_ci * 2287bf215546Sopenharmony_ci * To avoid penalizing shaders that emit a small number of vertices, we 2288bf215546Sopenharmony_ci * can avoid these sometimes: if the size of the control data header is 2289bf215546Sopenharmony_ci * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will land 2290bf215546Sopenharmony_ci * land in the same 128-bit group, so we can skip per-slot offsets. 2291bf215546Sopenharmony_ci * 2292bf215546Sopenharmony_ci * Similarly, if the control data header is <= 32 bits, there is only one 2293bf215546Sopenharmony_ci * DWord, so we can skip channel masks. 2294bf215546Sopenharmony_ci */ 2295bf215546Sopenharmony_ci fs_reg channel_mask, per_slot_offset; 2296bf215546Sopenharmony_ci 2297bf215546Sopenharmony_ci if (gs_compile->control_data_header_size_bits > 32) 2298bf215546Sopenharmony_ci channel_mask = vgrf(glsl_type::uint_type); 2299bf215546Sopenharmony_ci 2300bf215546Sopenharmony_ci if (gs_compile->control_data_header_size_bits > 128) 2301bf215546Sopenharmony_ci per_slot_offset = vgrf(glsl_type::uint_type); 2302bf215546Sopenharmony_ci 2303bf215546Sopenharmony_ci /* Figure out which DWord we're trying to write to using the formula: 2304bf215546Sopenharmony_ci * 2305bf215546Sopenharmony_ci * dword_index = (vertex_count - 1) * bits_per_vertex / 32 2306bf215546Sopenharmony_ci * 2307bf215546Sopenharmony_ci * Since bits_per_vertex is a power of two, and is known at compile 2308bf215546Sopenharmony_ci * time, this can be optimized to: 2309bf215546Sopenharmony_ci * 2310bf215546Sopenharmony_ci * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex)) 2311bf215546Sopenharmony_ci */ 2312bf215546Sopenharmony_ci if (channel_mask.file != BAD_FILE || per_slot_offset.file != BAD_FILE) { 2313bf215546Sopenharmony_ci fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2314bf215546Sopenharmony_ci fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2315bf215546Sopenharmony_ci abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); 2316bf215546Sopenharmony_ci unsigned log2_bits_per_vertex = 2317bf215546Sopenharmony_ci util_last_bit(gs_compile->control_data_bits_per_vertex); 2318bf215546Sopenharmony_ci abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex)); 2319bf215546Sopenharmony_ci 2320bf215546Sopenharmony_ci if (per_slot_offset.file != BAD_FILE) { 2321bf215546Sopenharmony_ci /* Set the per-slot offset to dword_index / 4, so that we'll write to 2322bf215546Sopenharmony_ci * the appropriate OWord within the control data header. 2323bf215546Sopenharmony_ci */ 2324bf215546Sopenharmony_ci abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u)); 2325bf215546Sopenharmony_ci } 2326bf215546Sopenharmony_ci 2327bf215546Sopenharmony_ci /* Set the channel masks to 1 << (dword_index % 4), so that we'll 2328bf215546Sopenharmony_ci * write to the appropriate DWORD within the OWORD. 2329bf215546Sopenharmony_ci */ 2330bf215546Sopenharmony_ci fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2331bf215546Sopenharmony_ci fwa_bld.AND(channel, dword_index, brw_imm_ud(3u)); 2332bf215546Sopenharmony_ci channel_mask = intexp2(fwa_bld, channel); 2333bf215546Sopenharmony_ci /* Then the channel masks need to be in bits 23:16. */ 2334bf215546Sopenharmony_ci fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u)); 2335bf215546Sopenharmony_ci } 2336bf215546Sopenharmony_ci 2337bf215546Sopenharmony_ci /* Store the control data bits in the message payload and send it. */ 2338bf215546Sopenharmony_ci const unsigned header_size = 1 + unsigned(channel_mask.file != BAD_FILE) + 2339bf215546Sopenharmony_ci unsigned(per_slot_offset.file != BAD_FILE); 2340bf215546Sopenharmony_ci 2341bf215546Sopenharmony_ci /* If there are channel masks, add 3 extra copies of the data. */ 2342bf215546Sopenharmony_ci const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE); 2343bf215546Sopenharmony_ci 2344bf215546Sopenharmony_ci fs_reg sources[4]; 2345bf215546Sopenharmony_ci 2346bf215546Sopenharmony_ci for (unsigned i = 0; i < ARRAY_SIZE(sources); i++) 2347bf215546Sopenharmony_ci sources[i] = this->control_data_bits; 2348bf215546Sopenharmony_ci 2349bf215546Sopenharmony_ci fs_reg srcs[URB_LOGICAL_NUM_SRCS]; 2350bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_HANDLE] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); 2351bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset; 2352bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask; 2353bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length), 2354bf215546Sopenharmony_ci BRW_REGISTER_TYPE_F); 2355bf215546Sopenharmony_ci abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0); 2356bf215546Sopenharmony_ci 2357bf215546Sopenharmony_ci fs_inst *inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, 2358bf215546Sopenharmony_ci srcs, ARRAY_SIZE(srcs)); 2359bf215546Sopenharmony_ci inst->mlen = header_size + length; 2360bf215546Sopenharmony_ci /* We need to increment Global Offset by 256-bits to make room for 2361bf215546Sopenharmony_ci * Broadwell's extra "Vertex Count" payload at the beginning of the 2362bf215546Sopenharmony_ci * URB entry. Since this is an OWord message, Global Offset is counted 2363bf215546Sopenharmony_ci * in 128-bit units, so we must set it to 2. 2364bf215546Sopenharmony_ci */ 2365bf215546Sopenharmony_ci if (gs_prog_data->static_vertex_count == -1) 2366bf215546Sopenharmony_ci inst->offset = 2; 2367bf215546Sopenharmony_ci} 2368bf215546Sopenharmony_ci 2369bf215546Sopenharmony_civoid 2370bf215546Sopenharmony_cifs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count, 2371bf215546Sopenharmony_ci unsigned stream_id) 2372bf215546Sopenharmony_ci{ 2373bf215546Sopenharmony_ci /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */ 2374bf215546Sopenharmony_ci 2375bf215546Sopenharmony_ci /* Note: we are calling this *before* increasing vertex_count, so 2376bf215546Sopenharmony_ci * this->vertex_count == vertex_count - 1 in the formula above. 2377bf215546Sopenharmony_ci */ 2378bf215546Sopenharmony_ci 2379bf215546Sopenharmony_ci /* Stream mode uses 2 bits per vertex */ 2380bf215546Sopenharmony_ci assert(gs_compile->control_data_bits_per_vertex == 2); 2381bf215546Sopenharmony_ci 2382bf215546Sopenharmony_ci /* Must be a valid stream */ 2383bf215546Sopenharmony_ci assert(stream_id < MAX_VERTEX_STREAMS); 2384bf215546Sopenharmony_ci 2385bf215546Sopenharmony_ci /* Control data bits are initialized to 0 so we don't have to set any 2386bf215546Sopenharmony_ci * bits when sending vertices to stream 0. 2387bf215546Sopenharmony_ci */ 2388bf215546Sopenharmony_ci if (stream_id == 0) 2389bf215546Sopenharmony_ci return; 2390bf215546Sopenharmony_ci 2391bf215546Sopenharmony_ci const fs_builder abld = bld.annotate("set stream control data bits", NULL); 2392bf215546Sopenharmony_ci 2393bf215546Sopenharmony_ci /* reg::sid = stream_id */ 2394bf215546Sopenharmony_ci fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2395bf215546Sopenharmony_ci abld.MOV(sid, brw_imm_ud(stream_id)); 2396bf215546Sopenharmony_ci 2397bf215546Sopenharmony_ci /* reg:shift_count = 2 * (vertex_count - 1) */ 2398bf215546Sopenharmony_ci fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2399bf215546Sopenharmony_ci abld.SHL(shift_count, vertex_count, brw_imm_ud(1u)); 2400bf215546Sopenharmony_ci 2401bf215546Sopenharmony_ci /* Note: we're relying on the fact that the GEN SHL instruction only pays 2402bf215546Sopenharmony_ci * attention to the lower 5 bits of its second source argument, so on this 2403bf215546Sopenharmony_ci * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to 2404bf215546Sopenharmony_ci * stream_id << ((2 * (vertex_count - 1)) % 32). 2405bf215546Sopenharmony_ci */ 2406bf215546Sopenharmony_ci fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2407bf215546Sopenharmony_ci abld.SHL(mask, sid, shift_count); 2408bf215546Sopenharmony_ci abld.OR(this->control_data_bits, this->control_data_bits, mask); 2409bf215546Sopenharmony_ci} 2410bf215546Sopenharmony_ci 2411bf215546Sopenharmony_civoid 2412bf215546Sopenharmony_cifs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src, 2413bf215546Sopenharmony_ci unsigned stream_id) 2414bf215546Sopenharmony_ci{ 2415bf215546Sopenharmony_ci assert(stage == MESA_SHADER_GEOMETRY); 2416bf215546Sopenharmony_ci 2417bf215546Sopenharmony_ci struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2418bf215546Sopenharmony_ci 2419bf215546Sopenharmony_ci fs_reg vertex_count = get_nir_src(vertex_count_nir_src); 2420bf215546Sopenharmony_ci vertex_count.type = BRW_REGISTER_TYPE_UD; 2421bf215546Sopenharmony_ci 2422bf215546Sopenharmony_ci /* Haswell and later hardware ignores the "Render Stream Select" bits 2423bf215546Sopenharmony_ci * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled, 2424bf215546Sopenharmony_ci * and instead sends all primitives down the pipeline for rasterization. 2425bf215546Sopenharmony_ci * If the SOL stage is enabled, "Render Stream Select" is honored and 2426bf215546Sopenharmony_ci * primitives bound to non-zero streams are discarded after stream output. 2427bf215546Sopenharmony_ci * 2428bf215546Sopenharmony_ci * Since the only purpose of primives sent to non-zero streams is to 2429bf215546Sopenharmony_ci * be recorded by transform feedback, we can simply discard all geometry 2430bf215546Sopenharmony_ci * bound to these streams when transform feedback is disabled. 2431bf215546Sopenharmony_ci */ 2432bf215546Sopenharmony_ci if (stream_id > 0 && !nir->info.has_transform_feedback_varyings) 2433bf215546Sopenharmony_ci return; 2434bf215546Sopenharmony_ci 2435bf215546Sopenharmony_ci /* If we're outputting 32 control data bits or less, then we can wait 2436bf215546Sopenharmony_ci * until the shader is over to output them all. Otherwise we need to 2437bf215546Sopenharmony_ci * output them as we go. Now is the time to do it, since we're about to 2438bf215546Sopenharmony_ci * output the vertex_count'th vertex, so it's guaranteed that the 2439bf215546Sopenharmony_ci * control data bits associated with the (vertex_count - 1)th vertex are 2440bf215546Sopenharmony_ci * correct. 2441bf215546Sopenharmony_ci */ 2442bf215546Sopenharmony_ci if (gs_compile->control_data_header_size_bits > 32) { 2443bf215546Sopenharmony_ci const fs_builder abld = 2444bf215546Sopenharmony_ci bld.annotate("emit vertex: emit control data bits"); 2445bf215546Sopenharmony_ci 2446bf215546Sopenharmony_ci /* Only emit control data bits if we've finished accumulating a batch 2447bf215546Sopenharmony_ci * of 32 bits. This is the case when: 2448bf215546Sopenharmony_ci * 2449bf215546Sopenharmony_ci * (vertex_count * bits_per_vertex) % 32 == 0 2450bf215546Sopenharmony_ci * 2451bf215546Sopenharmony_ci * (in other words, when the last 5 bits of vertex_count * 2452bf215546Sopenharmony_ci * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some 2453bf215546Sopenharmony_ci * integer n (which is always the case, since bits_per_vertex is 2454bf215546Sopenharmony_ci * always 1 or 2), this is equivalent to requiring that the last 5-n 2455bf215546Sopenharmony_ci * bits of vertex_count are 0: 2456bf215546Sopenharmony_ci * 2457bf215546Sopenharmony_ci * vertex_count & (2^(5-n) - 1) == 0 2458bf215546Sopenharmony_ci * 2459bf215546Sopenharmony_ci * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is 2460bf215546Sopenharmony_ci * equivalent to: 2461bf215546Sopenharmony_ci * 2462bf215546Sopenharmony_ci * vertex_count & (32 / bits_per_vertex - 1) == 0 2463bf215546Sopenharmony_ci * 2464bf215546Sopenharmony_ci * TODO: If vertex_count is an immediate, we could do some of this math 2465bf215546Sopenharmony_ci * at compile time... 2466bf215546Sopenharmony_ci */ 2467bf215546Sopenharmony_ci fs_inst *inst = 2468bf215546Sopenharmony_ci abld.AND(bld.null_reg_d(), vertex_count, 2469bf215546Sopenharmony_ci brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u)); 2470bf215546Sopenharmony_ci inst->conditional_mod = BRW_CONDITIONAL_Z; 2471bf215546Sopenharmony_ci 2472bf215546Sopenharmony_ci abld.IF(BRW_PREDICATE_NORMAL); 2473bf215546Sopenharmony_ci /* If vertex_count is 0, then no control data bits have been 2474bf215546Sopenharmony_ci * accumulated yet, so we can skip emitting them. 2475bf215546Sopenharmony_ci */ 2476bf215546Sopenharmony_ci abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u), 2477bf215546Sopenharmony_ci BRW_CONDITIONAL_NEQ); 2478bf215546Sopenharmony_ci abld.IF(BRW_PREDICATE_NORMAL); 2479bf215546Sopenharmony_ci emit_gs_control_data_bits(vertex_count); 2480bf215546Sopenharmony_ci abld.emit(BRW_OPCODE_ENDIF); 2481bf215546Sopenharmony_ci 2482bf215546Sopenharmony_ci /* Reset control_data_bits to 0 so we can start accumulating a new 2483bf215546Sopenharmony_ci * batch. 2484bf215546Sopenharmony_ci * 2485bf215546Sopenharmony_ci * Note: in the case where vertex_count == 0, this neutralizes the 2486bf215546Sopenharmony_ci * effect of any call to EndPrimitive() that the shader may have 2487bf215546Sopenharmony_ci * made before outputting its first vertex. 2488bf215546Sopenharmony_ci */ 2489bf215546Sopenharmony_ci inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u)); 2490bf215546Sopenharmony_ci inst->force_writemask_all = true; 2491bf215546Sopenharmony_ci abld.emit(BRW_OPCODE_ENDIF); 2492bf215546Sopenharmony_ci } 2493bf215546Sopenharmony_ci 2494bf215546Sopenharmony_ci emit_urb_writes(vertex_count); 2495bf215546Sopenharmony_ci 2496bf215546Sopenharmony_ci /* In stream mode we have to set control data bits for all vertices 2497bf215546Sopenharmony_ci * unless we have disabled control data bits completely (which we do 2498bf215546Sopenharmony_ci * do for GL_POINTS outputs that don't use streams). 2499bf215546Sopenharmony_ci */ 2500bf215546Sopenharmony_ci if (gs_compile->control_data_header_size_bits > 0 && 2501bf215546Sopenharmony_ci gs_prog_data->control_data_format == 2502bf215546Sopenharmony_ci GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { 2503bf215546Sopenharmony_ci set_gs_stream_control_data_bits(vertex_count, stream_id); 2504bf215546Sopenharmony_ci } 2505bf215546Sopenharmony_ci} 2506bf215546Sopenharmony_ci 2507bf215546Sopenharmony_civoid 2508bf215546Sopenharmony_cifs_visitor::emit_gs_input_load(const fs_reg &dst, 2509bf215546Sopenharmony_ci const nir_src &vertex_src, 2510bf215546Sopenharmony_ci unsigned base_offset, 2511bf215546Sopenharmony_ci const nir_src &offset_src, 2512bf215546Sopenharmony_ci unsigned num_components, 2513bf215546Sopenharmony_ci unsigned first_component) 2514bf215546Sopenharmony_ci{ 2515bf215546Sopenharmony_ci assert(type_sz(dst.type) == 4); 2516bf215546Sopenharmony_ci struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2517bf215546Sopenharmony_ci const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8; 2518bf215546Sopenharmony_ci 2519bf215546Sopenharmony_ci /* TODO: figure out push input layout for invocations == 1 */ 2520bf215546Sopenharmony_ci if (gs_prog_data->invocations == 1 && 2521bf215546Sopenharmony_ci nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) && 2522bf215546Sopenharmony_ci 4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) { 2523bf215546Sopenharmony_ci int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 + 2524bf215546Sopenharmony_ci nir_src_as_uint(vertex_src) * push_reg_count; 2525bf215546Sopenharmony_ci for (unsigned i = 0; i < num_components; i++) { 2526bf215546Sopenharmony_ci bld.MOV(offset(dst, bld, i), 2527bf215546Sopenharmony_ci fs_reg(ATTR, imm_offset + i + first_component, dst.type)); 2528bf215546Sopenharmony_ci } 2529bf215546Sopenharmony_ci return; 2530bf215546Sopenharmony_ci } 2531bf215546Sopenharmony_ci 2532bf215546Sopenharmony_ci /* Resort to the pull model. Ensure the VUE handles are provided. */ 2533bf215546Sopenharmony_ci assert(gs_prog_data->base.include_vue_handles); 2534bf215546Sopenharmony_ci 2535bf215546Sopenharmony_ci unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2; 2536bf215546Sopenharmony_ci fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2537bf215546Sopenharmony_ci 2538bf215546Sopenharmony_ci if (gs_prog_data->invocations == 1) { 2539bf215546Sopenharmony_ci if (nir_src_is_const(vertex_src)) { 2540bf215546Sopenharmony_ci /* The vertex index is constant; just select the proper URB handle. */ 2541bf215546Sopenharmony_ci icp_handle = 2542bf215546Sopenharmony_ci retype(brw_vec8_grf(first_icp_handle + nir_src_as_uint(vertex_src), 0), 2543bf215546Sopenharmony_ci BRW_REGISTER_TYPE_UD); 2544bf215546Sopenharmony_ci } else { 2545bf215546Sopenharmony_ci /* The vertex index is non-constant. We need to use indirect 2546bf215546Sopenharmony_ci * addressing to fetch the proper URB handle. 2547bf215546Sopenharmony_ci * 2548bf215546Sopenharmony_ci * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0> 2549bf215546Sopenharmony_ci * indicating that channel <n> should read the handle from 2550bf215546Sopenharmony_ci * DWord <n>. We convert that to bytes by multiplying by 4. 2551bf215546Sopenharmony_ci * 2552bf215546Sopenharmony_ci * Next, we convert the vertex index to bytes by multiplying 2553bf215546Sopenharmony_ci * by 32 (shifting by 5), and add the two together. This is 2554bf215546Sopenharmony_ci * the final indirect byte offset. 2555bf215546Sopenharmony_ci */ 2556bf215546Sopenharmony_ci fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1); 2557bf215546Sopenharmony_ci fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2558bf215546Sopenharmony_ci fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2559bf215546Sopenharmony_ci fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2560bf215546Sopenharmony_ci 2561bf215546Sopenharmony_ci /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */ 2562bf215546Sopenharmony_ci bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210))); 2563bf215546Sopenharmony_ci /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */ 2564bf215546Sopenharmony_ci bld.SHL(channel_offsets, sequence, brw_imm_ud(2u)); 2565bf215546Sopenharmony_ci /* Convert vertex_index to bytes (multiply by 32) */ 2566bf215546Sopenharmony_ci bld.SHL(vertex_offset_bytes, 2567bf215546Sopenharmony_ci retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2568bf215546Sopenharmony_ci brw_imm_ud(5u)); 2569bf215546Sopenharmony_ci bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets); 2570bf215546Sopenharmony_ci 2571bf215546Sopenharmony_ci /* Use first_icp_handle as the base offset. There is one register 2572bf215546Sopenharmony_ci * of URB handles per vertex, so inform the register allocator that 2573bf215546Sopenharmony_ci * we might read up to nir->info.gs.vertices_in registers. 2574bf215546Sopenharmony_ci */ 2575bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2576bf215546Sopenharmony_ci retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), 2577bf215546Sopenharmony_ci fs_reg(icp_offset_bytes), 2578bf215546Sopenharmony_ci brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE)); 2579bf215546Sopenharmony_ci } 2580bf215546Sopenharmony_ci } else { 2581bf215546Sopenharmony_ci assert(gs_prog_data->invocations > 1); 2582bf215546Sopenharmony_ci 2583bf215546Sopenharmony_ci if (nir_src_is_const(vertex_src)) { 2584bf215546Sopenharmony_ci unsigned vertex = nir_src_as_uint(vertex_src); 2585bf215546Sopenharmony_ci assert(devinfo->ver >= 9 || vertex <= 5); 2586bf215546Sopenharmony_ci bld.MOV(icp_handle, 2587bf215546Sopenharmony_ci retype(brw_vec1_grf(first_icp_handle + vertex / 8, vertex % 8), 2588bf215546Sopenharmony_ci BRW_REGISTER_TYPE_UD)); 2589bf215546Sopenharmony_ci } else { 2590bf215546Sopenharmony_ci /* The vertex index is non-constant. We need to use indirect 2591bf215546Sopenharmony_ci * addressing to fetch the proper URB handle. 2592bf215546Sopenharmony_ci * 2593bf215546Sopenharmony_ci */ 2594bf215546Sopenharmony_ci fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2595bf215546Sopenharmony_ci 2596bf215546Sopenharmony_ci /* Convert vertex_index to bytes (multiply by 4) */ 2597bf215546Sopenharmony_ci bld.SHL(icp_offset_bytes, 2598bf215546Sopenharmony_ci retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2599bf215546Sopenharmony_ci brw_imm_ud(2u)); 2600bf215546Sopenharmony_ci 2601bf215546Sopenharmony_ci /* Use first_icp_handle as the base offset. There is one DWord 2602bf215546Sopenharmony_ci * of URB handles per vertex, so inform the register allocator that 2603bf215546Sopenharmony_ci * we might read up to ceil(nir->info.gs.vertices_in / 8) registers. 2604bf215546Sopenharmony_ci */ 2605bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2606bf215546Sopenharmony_ci retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), 2607bf215546Sopenharmony_ci fs_reg(icp_offset_bytes), 2608bf215546Sopenharmony_ci brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) * 2609bf215546Sopenharmony_ci REG_SIZE)); 2610bf215546Sopenharmony_ci } 2611bf215546Sopenharmony_ci } 2612bf215546Sopenharmony_ci 2613bf215546Sopenharmony_ci fs_inst *inst; 2614bf215546Sopenharmony_ci fs_reg indirect_offset = get_nir_src(offset_src); 2615bf215546Sopenharmony_ci 2616bf215546Sopenharmony_ci if (nir_src_is_const(offset_src)) { 2617bf215546Sopenharmony_ci fs_reg srcs[URB_LOGICAL_NUM_SRCS]; 2618bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle; 2619bf215546Sopenharmony_ci 2620bf215546Sopenharmony_ci /* Constant indexing - use global offset. */ 2621bf215546Sopenharmony_ci if (first_component != 0) { 2622bf215546Sopenharmony_ci unsigned read_components = num_components + first_component; 2623bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(dst.type, read_components); 2624bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs, 2625bf215546Sopenharmony_ci ARRAY_SIZE(srcs)); 2626bf215546Sopenharmony_ci inst->size_written = read_components * 2627bf215546Sopenharmony_ci tmp.component_size(inst->exec_size); 2628bf215546Sopenharmony_ci for (unsigned i = 0; i < num_components; i++) { 2629bf215546Sopenharmony_ci bld.MOV(offset(dst, bld, i), 2630bf215546Sopenharmony_ci offset(tmp, bld, i + first_component)); 2631bf215546Sopenharmony_ci } 2632bf215546Sopenharmony_ci } else { 2633bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs, 2634bf215546Sopenharmony_ci ARRAY_SIZE(srcs)); 2635bf215546Sopenharmony_ci inst->size_written = num_components * 2636bf215546Sopenharmony_ci dst.component_size(inst->exec_size); 2637bf215546Sopenharmony_ci } 2638bf215546Sopenharmony_ci inst->offset = base_offset + nir_src_as_uint(offset_src); 2639bf215546Sopenharmony_ci inst->mlen = 1; 2640bf215546Sopenharmony_ci } else { 2641bf215546Sopenharmony_ci /* Indirect indexing - use per-slot offsets as well. */ 2642bf215546Sopenharmony_ci unsigned read_components = num_components + first_component; 2643bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(dst.type, read_components); 2644bf215546Sopenharmony_ci 2645bf215546Sopenharmony_ci fs_reg srcs[URB_LOGICAL_NUM_SRCS]; 2646bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle; 2647bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset; 2648bf215546Sopenharmony_ci 2649bf215546Sopenharmony_ci if (first_component != 0) { 2650bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, 2651bf215546Sopenharmony_ci srcs, ARRAY_SIZE(srcs)); 2652bf215546Sopenharmony_ci inst->size_written = read_components * 2653bf215546Sopenharmony_ci tmp.component_size(inst->exec_size); 2654bf215546Sopenharmony_ci for (unsigned i = 0; i < num_components; i++) { 2655bf215546Sopenharmony_ci bld.MOV(offset(dst, bld, i), 2656bf215546Sopenharmony_ci offset(tmp, bld, i + first_component)); 2657bf215546Sopenharmony_ci } 2658bf215546Sopenharmony_ci } else { 2659bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, 2660bf215546Sopenharmony_ci srcs, ARRAY_SIZE(srcs)); 2661bf215546Sopenharmony_ci inst->size_written = num_components * 2662bf215546Sopenharmony_ci dst.component_size(inst->exec_size); 2663bf215546Sopenharmony_ci } 2664bf215546Sopenharmony_ci inst->offset = base_offset; 2665bf215546Sopenharmony_ci inst->mlen = 2; 2666bf215546Sopenharmony_ci } 2667bf215546Sopenharmony_ci} 2668bf215546Sopenharmony_ci 2669bf215546Sopenharmony_cifs_reg 2670bf215546Sopenharmony_cifs_visitor::get_indirect_offset(nir_intrinsic_instr *instr) 2671bf215546Sopenharmony_ci{ 2672bf215546Sopenharmony_ci nir_src *offset_src = nir_get_io_offset_src(instr); 2673bf215546Sopenharmony_ci 2674bf215546Sopenharmony_ci if (nir_src_is_const(*offset_src)) { 2675bf215546Sopenharmony_ci /* The only constant offset we should find is 0. brw_nir.c's 2676bf215546Sopenharmony_ci * add_const_offset_to_base() will fold other constant offsets 2677bf215546Sopenharmony_ci * into instr->const_index[0]. 2678bf215546Sopenharmony_ci */ 2679bf215546Sopenharmony_ci assert(nir_src_as_uint(*offset_src) == 0); 2680bf215546Sopenharmony_ci return fs_reg(); 2681bf215546Sopenharmony_ci } 2682bf215546Sopenharmony_ci 2683bf215546Sopenharmony_ci return get_nir_src(*offset_src); 2684bf215546Sopenharmony_ci} 2685bf215546Sopenharmony_ci 2686bf215546Sopenharmony_civoid 2687bf215546Sopenharmony_cifs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld, 2688bf215546Sopenharmony_ci nir_intrinsic_instr *instr) 2689bf215546Sopenharmony_ci{ 2690bf215546Sopenharmony_ci assert(stage == MESA_SHADER_VERTEX); 2691bf215546Sopenharmony_ci 2692bf215546Sopenharmony_ci fs_reg dest; 2693bf215546Sopenharmony_ci if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2694bf215546Sopenharmony_ci dest = get_nir_dest(instr->dest); 2695bf215546Sopenharmony_ci 2696bf215546Sopenharmony_ci switch (instr->intrinsic) { 2697bf215546Sopenharmony_ci case nir_intrinsic_load_vertex_id: 2698bf215546Sopenharmony_ci case nir_intrinsic_load_base_vertex: 2699bf215546Sopenharmony_ci unreachable("should be lowered by nir_lower_system_values()"); 2700bf215546Sopenharmony_ci 2701bf215546Sopenharmony_ci case nir_intrinsic_load_input: { 2702bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest) == 32); 2703bf215546Sopenharmony_ci fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type); 2704bf215546Sopenharmony_ci src = offset(src, bld, nir_intrinsic_component(instr)); 2705bf215546Sopenharmony_ci src = offset(src, bld, nir_src_as_uint(instr->src[0])); 2706bf215546Sopenharmony_ci 2707bf215546Sopenharmony_ci for (unsigned i = 0; i < instr->num_components; i++) 2708bf215546Sopenharmony_ci bld.MOV(offset(dest, bld, i), offset(src, bld, i)); 2709bf215546Sopenharmony_ci break; 2710bf215546Sopenharmony_ci } 2711bf215546Sopenharmony_ci 2712bf215546Sopenharmony_ci case nir_intrinsic_load_vertex_id_zero_base: 2713bf215546Sopenharmony_ci case nir_intrinsic_load_instance_id: 2714bf215546Sopenharmony_ci case nir_intrinsic_load_base_instance: 2715bf215546Sopenharmony_ci case nir_intrinsic_load_draw_id: 2716bf215546Sopenharmony_ci case nir_intrinsic_load_first_vertex: 2717bf215546Sopenharmony_ci case nir_intrinsic_load_is_indexed_draw: 2718bf215546Sopenharmony_ci unreachable("lowered by brw_nir_lower_vs_inputs"); 2719bf215546Sopenharmony_ci 2720bf215546Sopenharmony_ci default: 2721bf215546Sopenharmony_ci nir_emit_intrinsic(bld, instr); 2722bf215546Sopenharmony_ci break; 2723bf215546Sopenharmony_ci } 2724bf215546Sopenharmony_ci} 2725bf215546Sopenharmony_ci 2726bf215546Sopenharmony_cifs_reg 2727bf215546Sopenharmony_cifs_visitor::get_tcs_single_patch_icp_handle(const fs_builder &bld, 2728bf215546Sopenharmony_ci nir_intrinsic_instr *instr) 2729bf215546Sopenharmony_ci{ 2730bf215546Sopenharmony_ci struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); 2731bf215546Sopenharmony_ci const nir_src &vertex_src = instr->src[0]; 2732bf215546Sopenharmony_ci nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src); 2733bf215546Sopenharmony_ci fs_reg icp_handle; 2734bf215546Sopenharmony_ci 2735bf215546Sopenharmony_ci if (nir_src_is_const(vertex_src)) { 2736bf215546Sopenharmony_ci /* Emit a MOV to resolve <0,1,0> regioning. */ 2737bf215546Sopenharmony_ci icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2738bf215546Sopenharmony_ci unsigned vertex = nir_src_as_uint(vertex_src); 2739bf215546Sopenharmony_ci bld.MOV(icp_handle, 2740bf215546Sopenharmony_ci retype(brw_vec1_grf(1 + (vertex >> 3), vertex & 7), 2741bf215546Sopenharmony_ci BRW_REGISTER_TYPE_UD)); 2742bf215546Sopenharmony_ci } else if (tcs_prog_data->instances == 1 && vertex_intrin && 2743bf215546Sopenharmony_ci vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) { 2744bf215546Sopenharmony_ci /* For the common case of only 1 instance, an array index of 2745bf215546Sopenharmony_ci * gl_InvocationID means reading g1. Skip all the indirect work. 2746bf215546Sopenharmony_ci */ 2747bf215546Sopenharmony_ci icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD); 2748bf215546Sopenharmony_ci } else { 2749bf215546Sopenharmony_ci /* The vertex index is non-constant. We need to use indirect 2750bf215546Sopenharmony_ci * addressing to fetch the proper URB handle. 2751bf215546Sopenharmony_ci */ 2752bf215546Sopenharmony_ci icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2753bf215546Sopenharmony_ci 2754bf215546Sopenharmony_ci /* Each ICP handle is a single DWord (4 bytes) */ 2755bf215546Sopenharmony_ci fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2756bf215546Sopenharmony_ci bld.SHL(vertex_offset_bytes, 2757bf215546Sopenharmony_ci retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2758bf215546Sopenharmony_ci brw_imm_ud(2u)); 2759bf215546Sopenharmony_ci 2760bf215546Sopenharmony_ci /* Start at g1. We might read up to 4 registers. */ 2761bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2762bf215546Sopenharmony_ci retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes, 2763bf215546Sopenharmony_ci brw_imm_ud(4 * REG_SIZE)); 2764bf215546Sopenharmony_ci } 2765bf215546Sopenharmony_ci 2766bf215546Sopenharmony_ci return icp_handle; 2767bf215546Sopenharmony_ci} 2768bf215546Sopenharmony_ci 2769bf215546Sopenharmony_cifs_reg 2770bf215546Sopenharmony_cifs_visitor::get_tcs_eight_patch_icp_handle(const fs_builder &bld, 2771bf215546Sopenharmony_ci nir_intrinsic_instr *instr) 2772bf215546Sopenharmony_ci{ 2773bf215546Sopenharmony_ci struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key; 2774bf215546Sopenharmony_ci struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); 2775bf215546Sopenharmony_ci const nir_src &vertex_src = instr->src[0]; 2776bf215546Sopenharmony_ci 2777bf215546Sopenharmony_ci unsigned first_icp_handle = tcs_prog_data->include_primitive_id ? 3 : 2; 2778bf215546Sopenharmony_ci 2779bf215546Sopenharmony_ci if (nir_src_is_const(vertex_src)) { 2780bf215546Sopenharmony_ci return fs_reg(retype(brw_vec8_grf(first_icp_handle + 2781bf215546Sopenharmony_ci nir_src_as_uint(vertex_src), 0), 2782bf215546Sopenharmony_ci BRW_REGISTER_TYPE_UD)); 2783bf215546Sopenharmony_ci } 2784bf215546Sopenharmony_ci 2785bf215546Sopenharmony_ci /* The vertex index is non-constant. We need to use indirect 2786bf215546Sopenharmony_ci * addressing to fetch the proper URB handle. 2787bf215546Sopenharmony_ci * 2788bf215546Sopenharmony_ci * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0> 2789bf215546Sopenharmony_ci * indicating that channel <n> should read the handle from 2790bf215546Sopenharmony_ci * DWord <n>. We convert that to bytes by multiplying by 4. 2791bf215546Sopenharmony_ci * 2792bf215546Sopenharmony_ci * Next, we convert the vertex index to bytes by multiplying 2793bf215546Sopenharmony_ci * by 32 (shifting by 5), and add the two together. This is 2794bf215546Sopenharmony_ci * the final indirect byte offset. 2795bf215546Sopenharmony_ci */ 2796bf215546Sopenharmony_ci fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2797bf215546Sopenharmony_ci fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1); 2798bf215546Sopenharmony_ci fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2799bf215546Sopenharmony_ci fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2800bf215546Sopenharmony_ci fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2801bf215546Sopenharmony_ci 2802bf215546Sopenharmony_ci /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */ 2803bf215546Sopenharmony_ci bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210))); 2804bf215546Sopenharmony_ci /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */ 2805bf215546Sopenharmony_ci bld.SHL(channel_offsets, sequence, brw_imm_ud(2u)); 2806bf215546Sopenharmony_ci /* Convert vertex_index to bytes (multiply by 32) */ 2807bf215546Sopenharmony_ci bld.SHL(vertex_offset_bytes, 2808bf215546Sopenharmony_ci retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2809bf215546Sopenharmony_ci brw_imm_ud(5u)); 2810bf215546Sopenharmony_ci bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets); 2811bf215546Sopenharmony_ci 2812bf215546Sopenharmony_ci /* Use first_icp_handle as the base offset. There is one register 2813bf215546Sopenharmony_ci * of URB handles per vertex, so inform the register allocator that 2814bf215546Sopenharmony_ci * we might read up to nir->info.gs.vertices_in registers. 2815bf215546Sopenharmony_ci */ 2816bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2817bf215546Sopenharmony_ci retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), 2818bf215546Sopenharmony_ci icp_offset_bytes, brw_imm_ud(tcs_key->input_vertices * REG_SIZE)); 2819bf215546Sopenharmony_ci 2820bf215546Sopenharmony_ci return icp_handle; 2821bf215546Sopenharmony_ci} 2822bf215546Sopenharmony_ci 2823bf215546Sopenharmony_cistruct brw_reg 2824bf215546Sopenharmony_cifs_visitor::get_tcs_output_urb_handle() 2825bf215546Sopenharmony_ci{ 2826bf215546Sopenharmony_ci struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data); 2827bf215546Sopenharmony_ci 2828bf215546Sopenharmony_ci if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) { 2829bf215546Sopenharmony_ci return retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD); 2830bf215546Sopenharmony_ci } else { 2831bf215546Sopenharmony_ci assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH); 2832bf215546Sopenharmony_ci return retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD); 2833bf215546Sopenharmony_ci } 2834bf215546Sopenharmony_ci} 2835bf215546Sopenharmony_ci 2836bf215546Sopenharmony_civoid 2837bf215546Sopenharmony_cifs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, 2838bf215546Sopenharmony_ci nir_intrinsic_instr *instr) 2839bf215546Sopenharmony_ci{ 2840bf215546Sopenharmony_ci assert(stage == MESA_SHADER_TESS_CTRL); 2841bf215546Sopenharmony_ci struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key; 2842bf215546Sopenharmony_ci struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); 2843bf215546Sopenharmony_ci struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base; 2844bf215546Sopenharmony_ci 2845bf215546Sopenharmony_ci bool eight_patch = 2846bf215546Sopenharmony_ci vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH; 2847bf215546Sopenharmony_ci 2848bf215546Sopenharmony_ci fs_reg dst; 2849bf215546Sopenharmony_ci if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2850bf215546Sopenharmony_ci dst = get_nir_dest(instr->dest); 2851bf215546Sopenharmony_ci 2852bf215546Sopenharmony_ci switch (instr->intrinsic) { 2853bf215546Sopenharmony_ci case nir_intrinsic_load_primitive_id: 2854bf215546Sopenharmony_ci bld.MOV(dst, fs_reg(eight_patch ? brw_vec8_grf(2, 0) 2855bf215546Sopenharmony_ci : brw_vec1_grf(0, 1))); 2856bf215546Sopenharmony_ci break; 2857bf215546Sopenharmony_ci case nir_intrinsic_load_invocation_id: 2858bf215546Sopenharmony_ci bld.MOV(retype(dst, invocation_id.type), invocation_id); 2859bf215546Sopenharmony_ci break; 2860bf215546Sopenharmony_ci case nir_intrinsic_load_patch_vertices_in: 2861bf215546Sopenharmony_ci bld.MOV(retype(dst, BRW_REGISTER_TYPE_D), 2862bf215546Sopenharmony_ci brw_imm_d(tcs_key->input_vertices)); 2863bf215546Sopenharmony_ci break; 2864bf215546Sopenharmony_ci 2865bf215546Sopenharmony_ci case nir_intrinsic_control_barrier: { 2866bf215546Sopenharmony_ci if (tcs_prog_data->instances == 1) 2867bf215546Sopenharmony_ci break; 2868bf215546Sopenharmony_ci 2869bf215546Sopenharmony_ci fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2870bf215546Sopenharmony_ci fs_reg m0_2 = component(m0, 2); 2871bf215546Sopenharmony_ci 2872bf215546Sopenharmony_ci const fs_builder chanbld = bld.exec_all().group(1, 0); 2873bf215546Sopenharmony_ci 2874bf215546Sopenharmony_ci /* Zero the message header */ 2875bf215546Sopenharmony_ci bld.exec_all().MOV(m0, brw_imm_ud(0u)); 2876bf215546Sopenharmony_ci 2877bf215546Sopenharmony_ci if (devinfo->verx10 >= 125) { 2878bf215546Sopenharmony_ci /* From BSpec: 54006, mov r0.2[31:24] into m0.2[31:24] and m0.2[23:16] */ 2879bf215546Sopenharmony_ci fs_reg m0_10ub = component(retype(m0, BRW_REGISTER_TYPE_UB), 10); 2880bf215546Sopenharmony_ci fs_reg r0_11ub = 2881bf215546Sopenharmony_ci stride(suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UB), 11), 2882bf215546Sopenharmony_ci 0, 1, 0); 2883bf215546Sopenharmony_ci bld.exec_all().group(2, 0).MOV(m0_10ub, r0_11ub); 2884bf215546Sopenharmony_ci } else if (devinfo->ver >= 11) { 2885bf215546Sopenharmony_ci chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), 2886bf215546Sopenharmony_ci brw_imm_ud(INTEL_MASK(30, 24))); 2887bf215546Sopenharmony_ci 2888bf215546Sopenharmony_ci /* Set the Barrier Count and the enable bit */ 2889bf215546Sopenharmony_ci chanbld.OR(m0_2, m0_2, 2890bf215546Sopenharmony_ci brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15))); 2891bf215546Sopenharmony_ci } else { 2892bf215546Sopenharmony_ci /* Copy "Barrier ID" from r0.2, bits 16:13 */ 2893bf215546Sopenharmony_ci chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), 2894bf215546Sopenharmony_ci brw_imm_ud(INTEL_MASK(16, 13))); 2895bf215546Sopenharmony_ci 2896bf215546Sopenharmony_ci /* Shift it up to bits 27:24. */ 2897bf215546Sopenharmony_ci chanbld.SHL(m0_2, m0_2, brw_imm_ud(11)); 2898bf215546Sopenharmony_ci 2899bf215546Sopenharmony_ci /* Set the Barrier Count and the enable bit */ 2900bf215546Sopenharmony_ci chanbld.OR(m0_2, m0_2, 2901bf215546Sopenharmony_ci brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15))); 2902bf215546Sopenharmony_ci } 2903bf215546Sopenharmony_ci 2904bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0); 2905bf215546Sopenharmony_ci break; 2906bf215546Sopenharmony_ci } 2907bf215546Sopenharmony_ci 2908bf215546Sopenharmony_ci case nir_intrinsic_load_input: 2909bf215546Sopenharmony_ci unreachable("nir_lower_io should never give us these."); 2910bf215546Sopenharmony_ci break; 2911bf215546Sopenharmony_ci 2912bf215546Sopenharmony_ci case nir_intrinsic_load_per_vertex_input: { 2913bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest) == 32); 2914bf215546Sopenharmony_ci fs_reg indirect_offset = get_indirect_offset(instr); 2915bf215546Sopenharmony_ci unsigned imm_offset = instr->const_index[0]; 2916bf215546Sopenharmony_ci fs_inst *inst; 2917bf215546Sopenharmony_ci 2918bf215546Sopenharmony_ci fs_reg icp_handle = 2919bf215546Sopenharmony_ci eight_patch ? get_tcs_eight_patch_icp_handle(bld, instr) 2920bf215546Sopenharmony_ci : get_tcs_single_patch_icp_handle(bld, instr); 2921bf215546Sopenharmony_ci 2922bf215546Sopenharmony_ci /* We can only read two double components with each URB read, so 2923bf215546Sopenharmony_ci * we send two read messages in that case, each one loading up to 2924bf215546Sopenharmony_ci * two double components. 2925bf215546Sopenharmony_ci */ 2926bf215546Sopenharmony_ci unsigned num_components = instr->num_components; 2927bf215546Sopenharmony_ci unsigned first_component = nir_intrinsic_component(instr); 2928bf215546Sopenharmony_ci 2929bf215546Sopenharmony_ci fs_reg srcs[URB_LOGICAL_NUM_SRCS]; 2930bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle; 2931bf215546Sopenharmony_ci 2932bf215546Sopenharmony_ci if (indirect_offset.file == BAD_FILE) { 2933bf215546Sopenharmony_ci /* Constant indexing - use global offset. */ 2934bf215546Sopenharmony_ci if (first_component != 0) { 2935bf215546Sopenharmony_ci unsigned read_components = num_components + first_component; 2936bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(dst.type, read_components); 2937bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs, 2938bf215546Sopenharmony_ci ARRAY_SIZE(srcs)); 2939bf215546Sopenharmony_ci for (unsigned i = 0; i < num_components; i++) { 2940bf215546Sopenharmony_ci bld.MOV(offset(dst, bld, i), 2941bf215546Sopenharmony_ci offset(tmp, bld, i + first_component)); 2942bf215546Sopenharmony_ci } 2943bf215546Sopenharmony_ci } else { 2944bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs, 2945bf215546Sopenharmony_ci ARRAY_SIZE(srcs)); 2946bf215546Sopenharmony_ci } 2947bf215546Sopenharmony_ci inst->offset = imm_offset; 2948bf215546Sopenharmony_ci inst->mlen = 1; 2949bf215546Sopenharmony_ci } else { 2950bf215546Sopenharmony_ci /* Indirect indexing - use per-slot offsets as well. */ 2951bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset; 2952bf215546Sopenharmony_ci 2953bf215546Sopenharmony_ci if (first_component != 0) { 2954bf215546Sopenharmony_ci unsigned read_components = num_components + first_component; 2955bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(dst.type, read_components); 2956bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, 2957bf215546Sopenharmony_ci srcs, ARRAY_SIZE(srcs)); 2958bf215546Sopenharmony_ci for (unsigned i = 0; i < num_components; i++) { 2959bf215546Sopenharmony_ci bld.MOV(offset(dst, bld, i), 2960bf215546Sopenharmony_ci offset(tmp, bld, i + first_component)); 2961bf215546Sopenharmony_ci } 2962bf215546Sopenharmony_ci } else { 2963bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, 2964bf215546Sopenharmony_ci srcs, ARRAY_SIZE(srcs)); 2965bf215546Sopenharmony_ci } 2966bf215546Sopenharmony_ci inst->offset = imm_offset; 2967bf215546Sopenharmony_ci inst->mlen = 2; 2968bf215546Sopenharmony_ci } 2969bf215546Sopenharmony_ci inst->size_written = (num_components + first_component) * 2970bf215546Sopenharmony_ci inst->dst.component_size(inst->exec_size); 2971bf215546Sopenharmony_ci 2972bf215546Sopenharmony_ci /* Copy the temporary to the destination to deal with writemasking. 2973bf215546Sopenharmony_ci * 2974bf215546Sopenharmony_ci * Also attempt to deal with gl_PointSize being in the .w component. 2975bf215546Sopenharmony_ci */ 2976bf215546Sopenharmony_ci if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { 2977bf215546Sopenharmony_ci assert(type_sz(dst.type) == 4); 2978bf215546Sopenharmony_ci inst->dst = bld.vgrf(dst.type, 4); 2979bf215546Sopenharmony_ci inst->size_written = 4 * REG_SIZE; 2980bf215546Sopenharmony_ci bld.MOV(dst, offset(inst->dst, bld, 3)); 2981bf215546Sopenharmony_ci } 2982bf215546Sopenharmony_ci break; 2983bf215546Sopenharmony_ci } 2984bf215546Sopenharmony_ci 2985bf215546Sopenharmony_ci case nir_intrinsic_load_output: 2986bf215546Sopenharmony_ci case nir_intrinsic_load_per_vertex_output: { 2987bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest) == 32); 2988bf215546Sopenharmony_ci fs_reg indirect_offset = get_indirect_offset(instr); 2989bf215546Sopenharmony_ci unsigned imm_offset = instr->const_index[0]; 2990bf215546Sopenharmony_ci unsigned first_component = nir_intrinsic_component(instr); 2991bf215546Sopenharmony_ci 2992bf215546Sopenharmony_ci struct brw_reg output_handles = get_tcs_output_urb_handle(); 2993bf215546Sopenharmony_ci 2994bf215546Sopenharmony_ci fs_inst *inst; 2995bf215546Sopenharmony_ci if (indirect_offset.file == BAD_FILE) { 2996bf215546Sopenharmony_ci /* This MOV replicates the output handle to all enabled channels 2997bf215546Sopenharmony_ci * is SINGLE_PATCH mode. 2998bf215546Sopenharmony_ci */ 2999bf215546Sopenharmony_ci fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 3000bf215546Sopenharmony_ci bld.MOV(patch_handle, output_handles); 3001bf215546Sopenharmony_ci 3002bf215546Sopenharmony_ci { 3003bf215546Sopenharmony_ci fs_reg srcs[URB_LOGICAL_NUM_SRCS]; 3004bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_HANDLE] = patch_handle; 3005bf215546Sopenharmony_ci 3006bf215546Sopenharmony_ci if (first_component != 0) { 3007bf215546Sopenharmony_ci unsigned read_components = 3008bf215546Sopenharmony_ci instr->num_components + first_component; 3009bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(dst.type, read_components); 3010bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, 3011bf215546Sopenharmony_ci srcs, ARRAY_SIZE(srcs)); 3012bf215546Sopenharmony_ci inst->size_written = read_components * REG_SIZE; 3013bf215546Sopenharmony_ci for (unsigned i = 0; i < instr->num_components; i++) { 3014bf215546Sopenharmony_ci bld.MOV(offset(dst, bld, i), 3015bf215546Sopenharmony_ci offset(tmp, bld, i + first_component)); 3016bf215546Sopenharmony_ci } 3017bf215546Sopenharmony_ci } else { 3018bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, 3019bf215546Sopenharmony_ci srcs, ARRAY_SIZE(srcs)); 3020bf215546Sopenharmony_ci inst->size_written = instr->num_components * REG_SIZE; 3021bf215546Sopenharmony_ci } 3022bf215546Sopenharmony_ci inst->offset = imm_offset; 3023bf215546Sopenharmony_ci inst->mlen = 1; 3024bf215546Sopenharmony_ci } 3025bf215546Sopenharmony_ci } else { 3026bf215546Sopenharmony_ci /* Indirect indexing - use per-slot offsets as well. */ 3027bf215546Sopenharmony_ci fs_reg srcs[URB_LOGICAL_NUM_SRCS]; 3028bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_HANDLE] = output_handles; 3029bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset; 3030bf215546Sopenharmony_ci 3031bf215546Sopenharmony_ci if (first_component != 0) { 3032bf215546Sopenharmony_ci unsigned read_components = 3033bf215546Sopenharmony_ci instr->num_components + first_component; 3034bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(dst.type, read_components); 3035bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, 3036bf215546Sopenharmony_ci srcs, ARRAY_SIZE(srcs)); 3037bf215546Sopenharmony_ci inst->size_written = read_components * REG_SIZE; 3038bf215546Sopenharmony_ci for (unsigned i = 0; i < instr->num_components; i++) { 3039bf215546Sopenharmony_ci bld.MOV(offset(dst, bld, i), 3040bf215546Sopenharmony_ci offset(tmp, bld, i + first_component)); 3041bf215546Sopenharmony_ci } 3042bf215546Sopenharmony_ci } else { 3043bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, 3044bf215546Sopenharmony_ci srcs, ARRAY_SIZE(srcs)); 3045bf215546Sopenharmony_ci inst->size_written = instr->num_components * REG_SIZE; 3046bf215546Sopenharmony_ci } 3047bf215546Sopenharmony_ci inst->offset = imm_offset; 3048bf215546Sopenharmony_ci inst->mlen = 2; 3049bf215546Sopenharmony_ci } 3050bf215546Sopenharmony_ci break; 3051bf215546Sopenharmony_ci } 3052bf215546Sopenharmony_ci 3053bf215546Sopenharmony_ci case nir_intrinsic_store_output: 3054bf215546Sopenharmony_ci case nir_intrinsic_store_per_vertex_output: { 3055bf215546Sopenharmony_ci assert(nir_src_bit_size(instr->src[0]) == 32); 3056bf215546Sopenharmony_ci fs_reg value = get_nir_src(instr->src[0]); 3057bf215546Sopenharmony_ci fs_reg indirect_offset = get_indirect_offset(instr); 3058bf215546Sopenharmony_ci unsigned imm_offset = instr->const_index[0]; 3059bf215546Sopenharmony_ci unsigned mask = instr->const_index[1]; 3060bf215546Sopenharmony_ci 3061bf215546Sopenharmony_ci if (mask == 0) 3062bf215546Sopenharmony_ci break; 3063bf215546Sopenharmony_ci 3064bf215546Sopenharmony_ci unsigned num_components = util_last_bit(mask); 3065bf215546Sopenharmony_ci 3066bf215546Sopenharmony_ci /* We can only pack two 64-bit components in a single message, so send 3067bf215546Sopenharmony_ci * 2 messages if we have more components 3068bf215546Sopenharmony_ci */ 3069bf215546Sopenharmony_ci unsigned first_component = nir_intrinsic_component(instr); 3070bf215546Sopenharmony_ci mask = mask << first_component; 3071bf215546Sopenharmony_ci 3072bf215546Sopenharmony_ci fs_reg mask_reg; 3073bf215546Sopenharmony_ci if (mask != WRITEMASK_XYZW) 3074bf215546Sopenharmony_ci mask_reg = brw_imm_ud(mask << 16); 3075bf215546Sopenharmony_ci 3076bf215546Sopenharmony_ci fs_reg sources[4]; 3077bf215546Sopenharmony_ci 3078bf215546Sopenharmony_ci for (unsigned i = 0; i < num_components; i++) { 3079bf215546Sopenharmony_ci if (!(mask & (1 << (i + first_component)))) 3080bf215546Sopenharmony_ci continue; 3081bf215546Sopenharmony_ci 3082bf215546Sopenharmony_ci sources[i + first_component] = offset(value, bld, i); 3083bf215546Sopenharmony_ci } 3084bf215546Sopenharmony_ci 3085bf215546Sopenharmony_ci unsigned header_size = 1 + unsigned(indirect_offset.file != BAD_FILE) + 3086bf215546Sopenharmony_ci unsigned(mask != WRITEMASK_XYZW); 3087bf215546Sopenharmony_ci const unsigned length = num_components + first_component; 3088bf215546Sopenharmony_ci 3089bf215546Sopenharmony_ci fs_reg srcs[URB_LOGICAL_NUM_SRCS]; 3090bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_HANDLE] = get_tcs_output_urb_handle(); 3091bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset; 3092bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg; 3093bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length), 3094bf215546Sopenharmony_ci BRW_REGISTER_TYPE_F); 3095bf215546Sopenharmony_ci bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0); 3096bf215546Sopenharmony_ci 3097bf215546Sopenharmony_ci fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, 3098bf215546Sopenharmony_ci srcs, ARRAY_SIZE(srcs)); 3099bf215546Sopenharmony_ci inst->offset = imm_offset; 3100bf215546Sopenharmony_ci inst->mlen = header_size + length; 3101bf215546Sopenharmony_ci break; 3102bf215546Sopenharmony_ci } 3103bf215546Sopenharmony_ci 3104bf215546Sopenharmony_ci default: 3105bf215546Sopenharmony_ci nir_emit_intrinsic(bld, instr); 3106bf215546Sopenharmony_ci break; 3107bf215546Sopenharmony_ci } 3108bf215546Sopenharmony_ci} 3109bf215546Sopenharmony_ci 3110bf215546Sopenharmony_civoid 3111bf215546Sopenharmony_cifs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, 3112bf215546Sopenharmony_ci nir_intrinsic_instr *instr) 3113bf215546Sopenharmony_ci{ 3114bf215546Sopenharmony_ci assert(stage == MESA_SHADER_TESS_EVAL); 3115bf215546Sopenharmony_ci struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data); 3116bf215546Sopenharmony_ci 3117bf215546Sopenharmony_ci fs_reg dest; 3118bf215546Sopenharmony_ci if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3119bf215546Sopenharmony_ci dest = get_nir_dest(instr->dest); 3120bf215546Sopenharmony_ci 3121bf215546Sopenharmony_ci switch (instr->intrinsic) { 3122bf215546Sopenharmony_ci case nir_intrinsic_load_primitive_id: 3123bf215546Sopenharmony_ci bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1))); 3124bf215546Sopenharmony_ci break; 3125bf215546Sopenharmony_ci case nir_intrinsic_load_tess_coord: 3126bf215546Sopenharmony_ci /* gl_TessCoord is part of the payload in g1-3 */ 3127bf215546Sopenharmony_ci for (unsigned i = 0; i < 3; i++) { 3128bf215546Sopenharmony_ci bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0))); 3129bf215546Sopenharmony_ci } 3130bf215546Sopenharmony_ci break; 3131bf215546Sopenharmony_ci 3132bf215546Sopenharmony_ci case nir_intrinsic_load_input: 3133bf215546Sopenharmony_ci case nir_intrinsic_load_per_vertex_input: { 3134bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest) == 32); 3135bf215546Sopenharmony_ci fs_reg indirect_offset = get_indirect_offset(instr); 3136bf215546Sopenharmony_ci unsigned imm_offset = instr->const_index[0]; 3137bf215546Sopenharmony_ci unsigned first_component = nir_intrinsic_component(instr); 3138bf215546Sopenharmony_ci 3139bf215546Sopenharmony_ci fs_inst *inst; 3140bf215546Sopenharmony_ci if (indirect_offset.file == BAD_FILE) { 3141bf215546Sopenharmony_ci /* Arbitrarily only push up to 32 vec4 slots worth of data, 3142bf215546Sopenharmony_ci * which is 16 registers (since each holds 2 vec4 slots). 3143bf215546Sopenharmony_ci */ 3144bf215546Sopenharmony_ci const unsigned max_push_slots = 32; 3145bf215546Sopenharmony_ci if (imm_offset < max_push_slots) { 3146bf215546Sopenharmony_ci fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type); 3147bf215546Sopenharmony_ci for (int i = 0; i < instr->num_components; i++) { 3148bf215546Sopenharmony_ci unsigned comp = 4 * (imm_offset % 2) + i + first_component; 3149bf215546Sopenharmony_ci bld.MOV(offset(dest, bld, i), component(src, comp)); 3150bf215546Sopenharmony_ci } 3151bf215546Sopenharmony_ci 3152bf215546Sopenharmony_ci tes_prog_data->base.urb_read_length = 3153bf215546Sopenharmony_ci MAX2(tes_prog_data->base.urb_read_length, 3154bf215546Sopenharmony_ci (imm_offset / 2) + 1); 3155bf215546Sopenharmony_ci } else { 3156bf215546Sopenharmony_ci /* Replicate the patch handle to all enabled channels */ 3157bf215546Sopenharmony_ci fs_reg srcs[URB_LOGICAL_NUM_SRCS]; 3158bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_HANDLE] = 3159bf215546Sopenharmony_ci retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD); 3160bf215546Sopenharmony_ci 3161bf215546Sopenharmony_ci if (first_component != 0) { 3162bf215546Sopenharmony_ci unsigned read_components = 3163bf215546Sopenharmony_ci instr->num_components + first_component; 3164bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(dest.type, read_components); 3165bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, 3166bf215546Sopenharmony_ci srcs, ARRAY_SIZE(srcs)); 3167bf215546Sopenharmony_ci inst->size_written = read_components * REG_SIZE; 3168bf215546Sopenharmony_ci for (unsigned i = 0; i < instr->num_components; i++) { 3169bf215546Sopenharmony_ci bld.MOV(offset(dest, bld, i), 3170bf215546Sopenharmony_ci offset(tmp, bld, i + first_component)); 3171bf215546Sopenharmony_ci } 3172bf215546Sopenharmony_ci } else { 3173bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest, 3174bf215546Sopenharmony_ci srcs, ARRAY_SIZE(srcs)); 3175bf215546Sopenharmony_ci inst->size_written = instr->num_components * REG_SIZE; 3176bf215546Sopenharmony_ci } 3177bf215546Sopenharmony_ci inst->mlen = 1; 3178bf215546Sopenharmony_ci inst->offset = imm_offset; 3179bf215546Sopenharmony_ci } 3180bf215546Sopenharmony_ci } else { 3181bf215546Sopenharmony_ci /* Indirect indexing - use per-slot offsets as well. */ 3182bf215546Sopenharmony_ci 3183bf215546Sopenharmony_ci /* We can only read two double components with each URB read, so 3184bf215546Sopenharmony_ci * we send two read messages in that case, each one loading up to 3185bf215546Sopenharmony_ci * two double components. 3186bf215546Sopenharmony_ci */ 3187bf215546Sopenharmony_ci unsigned num_components = instr->num_components; 3188bf215546Sopenharmony_ci 3189bf215546Sopenharmony_ci fs_reg srcs[URB_LOGICAL_NUM_SRCS]; 3190bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_HANDLE] = 3191bf215546Sopenharmony_ci retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD); 3192bf215546Sopenharmony_ci srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset; 3193bf215546Sopenharmony_ci 3194bf215546Sopenharmony_ci if (first_component != 0) { 3195bf215546Sopenharmony_ci unsigned read_components = 3196bf215546Sopenharmony_ci num_components + first_component; 3197bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(dest.type, read_components); 3198bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, 3199bf215546Sopenharmony_ci srcs, ARRAY_SIZE(srcs)); 3200bf215546Sopenharmony_ci for (unsigned i = 0; i < num_components; i++) { 3201bf215546Sopenharmony_ci bld.MOV(offset(dest, bld, i), 3202bf215546Sopenharmony_ci offset(tmp, bld, i + first_component)); 3203bf215546Sopenharmony_ci } 3204bf215546Sopenharmony_ci } else { 3205bf215546Sopenharmony_ci inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest, 3206bf215546Sopenharmony_ci srcs, ARRAY_SIZE(srcs)); 3207bf215546Sopenharmony_ci } 3208bf215546Sopenharmony_ci inst->mlen = 2; 3209bf215546Sopenharmony_ci inst->offset = imm_offset; 3210bf215546Sopenharmony_ci inst->size_written = (num_components + first_component) * 3211bf215546Sopenharmony_ci inst->dst.component_size(inst->exec_size); 3212bf215546Sopenharmony_ci } 3213bf215546Sopenharmony_ci break; 3214bf215546Sopenharmony_ci } 3215bf215546Sopenharmony_ci default: 3216bf215546Sopenharmony_ci nir_emit_intrinsic(bld, instr); 3217bf215546Sopenharmony_ci break; 3218bf215546Sopenharmony_ci } 3219bf215546Sopenharmony_ci} 3220bf215546Sopenharmony_ci 3221bf215546Sopenharmony_civoid 3222bf215546Sopenharmony_cifs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld, 3223bf215546Sopenharmony_ci nir_intrinsic_instr *instr) 3224bf215546Sopenharmony_ci{ 3225bf215546Sopenharmony_ci assert(stage == MESA_SHADER_GEOMETRY); 3226bf215546Sopenharmony_ci fs_reg indirect_offset; 3227bf215546Sopenharmony_ci 3228bf215546Sopenharmony_ci fs_reg dest; 3229bf215546Sopenharmony_ci if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3230bf215546Sopenharmony_ci dest = get_nir_dest(instr->dest); 3231bf215546Sopenharmony_ci 3232bf215546Sopenharmony_ci switch (instr->intrinsic) { 3233bf215546Sopenharmony_ci case nir_intrinsic_load_primitive_id: 3234bf215546Sopenharmony_ci assert(stage == MESA_SHADER_GEOMETRY); 3235bf215546Sopenharmony_ci assert(brw_gs_prog_data(prog_data)->include_primitive_id); 3236bf215546Sopenharmony_ci bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), 3237bf215546Sopenharmony_ci retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD)); 3238bf215546Sopenharmony_ci break; 3239bf215546Sopenharmony_ci 3240bf215546Sopenharmony_ci case nir_intrinsic_load_input: 3241bf215546Sopenharmony_ci unreachable("load_input intrinsics are invalid for the GS stage"); 3242bf215546Sopenharmony_ci 3243bf215546Sopenharmony_ci case nir_intrinsic_load_per_vertex_input: 3244bf215546Sopenharmony_ci emit_gs_input_load(dest, instr->src[0], instr->const_index[0], 3245bf215546Sopenharmony_ci instr->src[1], instr->num_components, 3246bf215546Sopenharmony_ci nir_intrinsic_component(instr)); 3247bf215546Sopenharmony_ci break; 3248bf215546Sopenharmony_ci 3249bf215546Sopenharmony_ci case nir_intrinsic_emit_vertex_with_counter: 3250bf215546Sopenharmony_ci emit_gs_vertex(instr->src[0], instr->const_index[0]); 3251bf215546Sopenharmony_ci break; 3252bf215546Sopenharmony_ci 3253bf215546Sopenharmony_ci case nir_intrinsic_end_primitive_with_counter: 3254bf215546Sopenharmony_ci emit_gs_end_primitive(instr->src[0]); 3255bf215546Sopenharmony_ci break; 3256bf215546Sopenharmony_ci 3257bf215546Sopenharmony_ci case nir_intrinsic_set_vertex_and_primitive_count: 3258bf215546Sopenharmony_ci bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0])); 3259bf215546Sopenharmony_ci break; 3260bf215546Sopenharmony_ci 3261bf215546Sopenharmony_ci case nir_intrinsic_load_invocation_id: { 3262bf215546Sopenharmony_ci fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID]; 3263bf215546Sopenharmony_ci assert(val.file != BAD_FILE); 3264bf215546Sopenharmony_ci dest.type = val.type; 3265bf215546Sopenharmony_ci bld.MOV(dest, val); 3266bf215546Sopenharmony_ci break; 3267bf215546Sopenharmony_ci } 3268bf215546Sopenharmony_ci 3269bf215546Sopenharmony_ci default: 3270bf215546Sopenharmony_ci nir_emit_intrinsic(bld, instr); 3271bf215546Sopenharmony_ci break; 3272bf215546Sopenharmony_ci } 3273bf215546Sopenharmony_ci} 3274bf215546Sopenharmony_ci 3275bf215546Sopenharmony_ci/** 3276bf215546Sopenharmony_ci * Fetch the current render target layer index. 3277bf215546Sopenharmony_ci */ 3278bf215546Sopenharmony_cistatic fs_reg 3279bf215546Sopenharmony_cifetch_render_target_array_index(const fs_builder &bld) 3280bf215546Sopenharmony_ci{ 3281bf215546Sopenharmony_ci if (bld.shader->devinfo->ver >= 12) { 3282bf215546Sopenharmony_ci /* The render target array index is provided in the thread payload as 3283bf215546Sopenharmony_ci * bits 26:16 of r1.1. 3284bf215546Sopenharmony_ci */ 3285bf215546Sopenharmony_ci const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); 3286bf215546Sopenharmony_ci bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 3), 3287bf215546Sopenharmony_ci brw_imm_uw(0x7ff)); 3288bf215546Sopenharmony_ci return idx; 3289bf215546Sopenharmony_ci } else if (bld.shader->devinfo->ver >= 6) { 3290bf215546Sopenharmony_ci /* The render target array index is provided in the thread payload as 3291bf215546Sopenharmony_ci * bits 26:16 of r0.0. 3292bf215546Sopenharmony_ci */ 3293bf215546Sopenharmony_ci const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); 3294bf215546Sopenharmony_ci bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1), 3295bf215546Sopenharmony_ci brw_imm_uw(0x7ff)); 3296bf215546Sopenharmony_ci return idx; 3297bf215546Sopenharmony_ci } else { 3298bf215546Sopenharmony_ci /* Pre-SNB we only ever render into the first layer of the framebuffer 3299bf215546Sopenharmony_ci * since layered rendering is not implemented. 3300bf215546Sopenharmony_ci */ 3301bf215546Sopenharmony_ci return brw_imm_ud(0); 3302bf215546Sopenharmony_ci } 3303bf215546Sopenharmony_ci} 3304bf215546Sopenharmony_ci 3305bf215546Sopenharmony_ci/** 3306bf215546Sopenharmony_ci * Fake non-coherent framebuffer read implemented using TXF to fetch from the 3307bf215546Sopenharmony_ci * framebuffer at the current fragment coordinates and sample index. 3308bf215546Sopenharmony_ci */ 3309bf215546Sopenharmony_cifs_inst * 3310bf215546Sopenharmony_cifs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, 3311bf215546Sopenharmony_ci unsigned target) 3312bf215546Sopenharmony_ci{ 3313bf215546Sopenharmony_ci const struct intel_device_info *devinfo = bld.shader->devinfo; 3314bf215546Sopenharmony_ci 3315bf215546Sopenharmony_ci assert(bld.shader->stage == MESA_SHADER_FRAGMENT); 3316bf215546Sopenharmony_ci const brw_wm_prog_key *wm_key = 3317bf215546Sopenharmony_ci reinterpret_cast<const brw_wm_prog_key *>(key); 3318bf215546Sopenharmony_ci assert(!wm_key->coherent_fb_fetch); 3319bf215546Sopenharmony_ci 3320bf215546Sopenharmony_ci /* Calculate the fragment coordinates. */ 3321bf215546Sopenharmony_ci const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); 3322bf215546Sopenharmony_ci bld.MOV(offset(coords, bld, 0), pixel_x); 3323bf215546Sopenharmony_ci bld.MOV(offset(coords, bld, 1), pixel_y); 3324bf215546Sopenharmony_ci bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld)); 3325bf215546Sopenharmony_ci 3326bf215546Sopenharmony_ci /* Calculate the sample index and MCS payload when multisampling. Luckily 3327bf215546Sopenharmony_ci * the MCS fetch message behaves deterministically for UMS surfaces, so it 3328bf215546Sopenharmony_ci * shouldn't be necessary to recompile based on whether the framebuffer is 3329bf215546Sopenharmony_ci * CMS or UMS. 3330bf215546Sopenharmony_ci */ 3331bf215546Sopenharmony_ci if (wm_key->multisample_fbo && 3332bf215546Sopenharmony_ci nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE) 3333bf215546Sopenharmony_ci nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(); 3334bf215546Sopenharmony_ci 3335bf215546Sopenharmony_ci const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID]; 3336bf215546Sopenharmony_ci const fs_reg mcs = wm_key->multisample_fbo ? 3337bf215546Sopenharmony_ci emit_mcs_fetch(coords, 3, brw_imm_ud(target), fs_reg()) : fs_reg(); 3338bf215546Sopenharmony_ci 3339bf215546Sopenharmony_ci /* Use either a normal or a CMS texel fetch message depending on whether 3340bf215546Sopenharmony_ci * the framebuffer is single or multisample. On SKL+ use the wide CMS 3341bf215546Sopenharmony_ci * message just in case the framebuffer uses 16x multisampling, it should 3342bf215546Sopenharmony_ci * be equivalent to the normal CMS fetch for lower multisampling modes. 3343bf215546Sopenharmony_ci */ 3344bf215546Sopenharmony_ci opcode op; 3345bf215546Sopenharmony_ci if (wm_key->multisample_fbo) { 3346bf215546Sopenharmony_ci /* On SKL+ use the wide CMS message just in case the framebuffer uses 16x 3347bf215546Sopenharmony_ci * multisampling, it should be equivalent to the normal CMS fetch for 3348bf215546Sopenharmony_ci * lower multisampling modes. 3349bf215546Sopenharmony_ci * 3350bf215546Sopenharmony_ci * On Gfx12HP, there is only CMS_W variant available. 3351bf215546Sopenharmony_ci */ 3352bf215546Sopenharmony_ci if (devinfo->verx10 >= 125) 3353bf215546Sopenharmony_ci op = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL; 3354bf215546Sopenharmony_ci else if (devinfo->ver >= 9) 3355bf215546Sopenharmony_ci op = SHADER_OPCODE_TXF_CMS_W_LOGICAL; 3356bf215546Sopenharmony_ci else 3357bf215546Sopenharmony_ci op = SHADER_OPCODE_TXF_CMS_LOGICAL; 3358bf215546Sopenharmony_ci } else { 3359bf215546Sopenharmony_ci op = SHADER_OPCODE_TXF_LOGICAL; 3360bf215546Sopenharmony_ci } 3361bf215546Sopenharmony_ci 3362bf215546Sopenharmony_ci /* Emit the instruction. */ 3363bf215546Sopenharmony_ci fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; 3364bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_COORDINATE] = coords; 3365bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0); 3366bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = sample; 3367bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_MCS] = mcs; 3368bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(target); 3369bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0); 3370bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3); 3371bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_ud(0); 3372bf215546Sopenharmony_ci 3373bf215546Sopenharmony_ci fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs)); 3374bf215546Sopenharmony_ci inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 3375bf215546Sopenharmony_ci 3376bf215546Sopenharmony_ci return inst; 3377bf215546Sopenharmony_ci} 3378bf215546Sopenharmony_ci 3379bf215546Sopenharmony_ci/** 3380bf215546Sopenharmony_ci * Actual coherent framebuffer read implemented using the native render target 3381bf215546Sopenharmony_ci * read message. Requires SKL+. 3382bf215546Sopenharmony_ci */ 3383bf215546Sopenharmony_cistatic fs_inst * 3384bf215546Sopenharmony_ciemit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target) 3385bf215546Sopenharmony_ci{ 3386bf215546Sopenharmony_ci assert(bld.shader->devinfo->ver >= 9); 3387bf215546Sopenharmony_ci fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst); 3388bf215546Sopenharmony_ci inst->target = target; 3389bf215546Sopenharmony_ci inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 3390bf215546Sopenharmony_ci 3391bf215546Sopenharmony_ci return inst; 3392bf215546Sopenharmony_ci} 3393bf215546Sopenharmony_ci 3394bf215546Sopenharmony_cistatic fs_reg 3395bf215546Sopenharmony_cialloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n) 3396bf215546Sopenharmony_ci{ 3397bf215546Sopenharmony_ci if (n && regs[0].file != BAD_FILE) { 3398bf215546Sopenharmony_ci return regs[0]; 3399bf215546Sopenharmony_ci 3400bf215546Sopenharmony_ci } else { 3401bf215546Sopenharmony_ci const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size); 3402bf215546Sopenharmony_ci 3403bf215546Sopenharmony_ci for (unsigned i = 0; i < n; i++) 3404bf215546Sopenharmony_ci regs[i] = tmp; 3405bf215546Sopenharmony_ci 3406bf215546Sopenharmony_ci return tmp; 3407bf215546Sopenharmony_ci } 3408bf215546Sopenharmony_ci} 3409bf215546Sopenharmony_ci 3410bf215546Sopenharmony_cistatic fs_reg 3411bf215546Sopenharmony_cialloc_frag_output(fs_visitor *v, unsigned location) 3412bf215546Sopenharmony_ci{ 3413bf215546Sopenharmony_ci assert(v->stage == MESA_SHADER_FRAGMENT); 3414bf215546Sopenharmony_ci const brw_wm_prog_key *const key = 3415bf215546Sopenharmony_ci reinterpret_cast<const brw_wm_prog_key *>(v->key); 3416bf215546Sopenharmony_ci const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION); 3417bf215546Sopenharmony_ci const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX); 3418bf215546Sopenharmony_ci 3419bf215546Sopenharmony_ci if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1)) 3420bf215546Sopenharmony_ci return alloc_temporary(v->bld, 4, &v->dual_src_output, 1); 3421bf215546Sopenharmony_ci 3422bf215546Sopenharmony_ci else if (l == FRAG_RESULT_COLOR) 3423bf215546Sopenharmony_ci return alloc_temporary(v->bld, 4, v->outputs, 3424bf215546Sopenharmony_ci MAX2(key->nr_color_regions, 1)); 3425bf215546Sopenharmony_ci 3426bf215546Sopenharmony_ci else if (l == FRAG_RESULT_DEPTH) 3427bf215546Sopenharmony_ci return alloc_temporary(v->bld, 1, &v->frag_depth, 1); 3428bf215546Sopenharmony_ci 3429bf215546Sopenharmony_ci else if (l == FRAG_RESULT_STENCIL) 3430bf215546Sopenharmony_ci return alloc_temporary(v->bld, 1, &v->frag_stencil, 1); 3431bf215546Sopenharmony_ci 3432bf215546Sopenharmony_ci else if (l == FRAG_RESULT_SAMPLE_MASK) 3433bf215546Sopenharmony_ci return alloc_temporary(v->bld, 1, &v->sample_mask, 1); 3434bf215546Sopenharmony_ci 3435bf215546Sopenharmony_ci else if (l >= FRAG_RESULT_DATA0 && 3436bf215546Sopenharmony_ci l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS) 3437bf215546Sopenharmony_ci return alloc_temporary(v->bld, 4, 3438bf215546Sopenharmony_ci &v->outputs[l - FRAG_RESULT_DATA0], 1); 3439bf215546Sopenharmony_ci 3440bf215546Sopenharmony_ci else 3441bf215546Sopenharmony_ci unreachable("Invalid location"); 3442bf215546Sopenharmony_ci} 3443bf215546Sopenharmony_ci 3444bf215546Sopenharmony_civoid 3445bf215546Sopenharmony_cifs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, 3446bf215546Sopenharmony_ci nir_intrinsic_instr *instr) 3447bf215546Sopenharmony_ci{ 3448bf215546Sopenharmony_ci assert(stage == MESA_SHADER_FRAGMENT); 3449bf215546Sopenharmony_ci 3450bf215546Sopenharmony_ci fs_reg dest; 3451bf215546Sopenharmony_ci if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3452bf215546Sopenharmony_ci dest = get_nir_dest(instr->dest); 3453bf215546Sopenharmony_ci 3454bf215546Sopenharmony_ci switch (instr->intrinsic) { 3455bf215546Sopenharmony_ci case nir_intrinsic_load_front_face: 3456bf215546Sopenharmony_ci bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), 3457bf215546Sopenharmony_ci emit_frontfacing_interpolation()); 3458bf215546Sopenharmony_ci break; 3459bf215546Sopenharmony_ci 3460bf215546Sopenharmony_ci case nir_intrinsic_load_sample_pos: 3461bf215546Sopenharmony_ci case nir_intrinsic_load_sample_pos_or_center: { 3462bf215546Sopenharmony_ci fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS]; 3463bf215546Sopenharmony_ci assert(sample_pos.file != BAD_FILE); 3464bf215546Sopenharmony_ci dest.type = sample_pos.type; 3465bf215546Sopenharmony_ci bld.MOV(dest, sample_pos); 3466bf215546Sopenharmony_ci bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1)); 3467bf215546Sopenharmony_ci break; 3468bf215546Sopenharmony_ci } 3469bf215546Sopenharmony_ci 3470bf215546Sopenharmony_ci case nir_intrinsic_load_layer_id: 3471bf215546Sopenharmony_ci dest.type = BRW_REGISTER_TYPE_UD; 3472bf215546Sopenharmony_ci bld.MOV(dest, fetch_render_target_array_index(bld)); 3473bf215546Sopenharmony_ci break; 3474bf215546Sopenharmony_ci 3475bf215546Sopenharmony_ci case nir_intrinsic_is_helper_invocation: 3476bf215546Sopenharmony_ci emit_is_helper_invocation(dest); 3477bf215546Sopenharmony_ci break; 3478bf215546Sopenharmony_ci 3479bf215546Sopenharmony_ci case nir_intrinsic_load_helper_invocation: 3480bf215546Sopenharmony_ci case nir_intrinsic_load_sample_mask_in: 3481bf215546Sopenharmony_ci case nir_intrinsic_load_sample_id: 3482bf215546Sopenharmony_ci case nir_intrinsic_load_frag_shading_rate: { 3483bf215546Sopenharmony_ci gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); 3484bf215546Sopenharmony_ci fs_reg val = nir_system_values[sv]; 3485bf215546Sopenharmony_ci assert(val.file != BAD_FILE); 3486bf215546Sopenharmony_ci dest.type = val.type; 3487bf215546Sopenharmony_ci bld.MOV(dest, val); 3488bf215546Sopenharmony_ci break; 3489bf215546Sopenharmony_ci } 3490bf215546Sopenharmony_ci 3491bf215546Sopenharmony_ci case nir_intrinsic_store_output: { 3492bf215546Sopenharmony_ci const fs_reg src = get_nir_src(instr->src[0]); 3493bf215546Sopenharmony_ci const unsigned store_offset = nir_src_as_uint(instr->src[1]); 3494bf215546Sopenharmony_ci const unsigned location = nir_intrinsic_base(instr) + 3495bf215546Sopenharmony_ci SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION); 3496bf215546Sopenharmony_ci const fs_reg new_dest = retype(alloc_frag_output(this, location), 3497bf215546Sopenharmony_ci src.type); 3498bf215546Sopenharmony_ci 3499bf215546Sopenharmony_ci for (unsigned j = 0; j < instr->num_components; j++) 3500bf215546Sopenharmony_ci bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j), 3501bf215546Sopenharmony_ci offset(src, bld, j)); 3502bf215546Sopenharmony_ci 3503bf215546Sopenharmony_ci break; 3504bf215546Sopenharmony_ci } 3505bf215546Sopenharmony_ci 3506bf215546Sopenharmony_ci case nir_intrinsic_load_output: { 3507bf215546Sopenharmony_ci const unsigned l = GET_FIELD(nir_intrinsic_base(instr), 3508bf215546Sopenharmony_ci BRW_NIR_FRAG_OUTPUT_LOCATION); 3509bf215546Sopenharmony_ci assert(l >= FRAG_RESULT_DATA0); 3510bf215546Sopenharmony_ci const unsigned load_offset = nir_src_as_uint(instr->src[0]); 3511bf215546Sopenharmony_ci const unsigned target = l - FRAG_RESULT_DATA0 + load_offset; 3512bf215546Sopenharmony_ci const fs_reg tmp = bld.vgrf(dest.type, 4); 3513bf215546Sopenharmony_ci 3514bf215546Sopenharmony_ci if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch) 3515bf215546Sopenharmony_ci emit_coherent_fb_read(bld, tmp, target); 3516bf215546Sopenharmony_ci else 3517bf215546Sopenharmony_ci emit_non_coherent_fb_read(bld, tmp, target); 3518bf215546Sopenharmony_ci 3519bf215546Sopenharmony_ci for (unsigned j = 0; j < instr->num_components; j++) { 3520bf215546Sopenharmony_ci bld.MOV(offset(dest, bld, j), 3521bf215546Sopenharmony_ci offset(tmp, bld, nir_intrinsic_component(instr) + j)); 3522bf215546Sopenharmony_ci } 3523bf215546Sopenharmony_ci 3524bf215546Sopenharmony_ci break; 3525bf215546Sopenharmony_ci } 3526bf215546Sopenharmony_ci 3527bf215546Sopenharmony_ci case nir_intrinsic_demote: 3528bf215546Sopenharmony_ci case nir_intrinsic_discard: 3529bf215546Sopenharmony_ci case nir_intrinsic_terminate: 3530bf215546Sopenharmony_ci case nir_intrinsic_demote_if: 3531bf215546Sopenharmony_ci case nir_intrinsic_discard_if: 3532bf215546Sopenharmony_ci case nir_intrinsic_terminate_if: { 3533bf215546Sopenharmony_ci /* We track our discarded pixels in f0.1/f1.0. By predicating on it, we 3534bf215546Sopenharmony_ci * can update just the flag bits that aren't yet discarded. If there's 3535bf215546Sopenharmony_ci * no condition, we emit a CMP of g0 != g0, so all currently executing 3536bf215546Sopenharmony_ci * channels will get turned off. 3537bf215546Sopenharmony_ci */ 3538bf215546Sopenharmony_ci fs_inst *cmp = NULL; 3539bf215546Sopenharmony_ci if (instr->intrinsic == nir_intrinsic_demote_if || 3540bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_discard_if || 3541bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_terminate_if) { 3542bf215546Sopenharmony_ci nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]); 3543bf215546Sopenharmony_ci 3544bf215546Sopenharmony_ci if (alu != NULL && 3545bf215546Sopenharmony_ci alu->op != nir_op_bcsel && 3546bf215546Sopenharmony_ci (devinfo->ver > 5 || 3547bf215546Sopenharmony_ci (alu->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) != BRW_NIR_BOOLEAN_NEEDS_RESOLVE || 3548bf215546Sopenharmony_ci alu->op == nir_op_fneu32 || alu->op == nir_op_feq32 || 3549bf215546Sopenharmony_ci alu->op == nir_op_flt32 || alu->op == nir_op_fge32 || 3550bf215546Sopenharmony_ci alu->op == nir_op_ine32 || alu->op == nir_op_ieq32 || 3551bf215546Sopenharmony_ci alu->op == nir_op_ilt32 || alu->op == nir_op_ige32 || 3552bf215546Sopenharmony_ci alu->op == nir_op_ult32 || alu->op == nir_op_uge32)) { 3553bf215546Sopenharmony_ci /* Re-emit the instruction that generated the Boolean value, but 3554bf215546Sopenharmony_ci * do not store it. Since this instruction will be conditional, 3555bf215546Sopenharmony_ci * other instructions that want to use the real Boolean value may 3556bf215546Sopenharmony_ci * get garbage. This was a problem for piglit's fs-discard-exit-2 3557bf215546Sopenharmony_ci * test. 3558bf215546Sopenharmony_ci * 3559bf215546Sopenharmony_ci * Ideally we'd detect that the instruction cannot have a 3560bf215546Sopenharmony_ci * conditional modifier before emitting the instructions. Alas, 3561bf215546Sopenharmony_ci * that is nigh impossible. Instead, we're going to assume the 3562bf215546Sopenharmony_ci * instruction (or last instruction) generated can have a 3563bf215546Sopenharmony_ci * conditional modifier. If it cannot, fallback to the old-style 3564bf215546Sopenharmony_ci * compare, and hope dead code elimination will clean up the 3565bf215546Sopenharmony_ci * extra instructions generated. 3566bf215546Sopenharmony_ci */ 3567bf215546Sopenharmony_ci nir_emit_alu(bld, alu, false); 3568bf215546Sopenharmony_ci 3569bf215546Sopenharmony_ci cmp = (fs_inst *) instructions.get_tail(); 3570bf215546Sopenharmony_ci if (cmp->conditional_mod == BRW_CONDITIONAL_NONE) { 3571bf215546Sopenharmony_ci if (cmp->can_do_cmod()) 3572bf215546Sopenharmony_ci cmp->conditional_mod = BRW_CONDITIONAL_Z; 3573bf215546Sopenharmony_ci else 3574bf215546Sopenharmony_ci cmp = NULL; 3575bf215546Sopenharmony_ci } else { 3576bf215546Sopenharmony_ci /* The old sequence that would have been generated is, 3577bf215546Sopenharmony_ci * basically, bool_result == false. This is equivalent to 3578bf215546Sopenharmony_ci * !bool_result, so negate the old modifier. 3579bf215546Sopenharmony_ci */ 3580bf215546Sopenharmony_ci cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod); 3581bf215546Sopenharmony_ci } 3582bf215546Sopenharmony_ci } 3583bf215546Sopenharmony_ci 3584bf215546Sopenharmony_ci if (cmp == NULL) { 3585bf215546Sopenharmony_ci cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]), 3586bf215546Sopenharmony_ci brw_imm_d(0), BRW_CONDITIONAL_Z); 3587bf215546Sopenharmony_ci } 3588bf215546Sopenharmony_ci } else { 3589bf215546Sopenharmony_ci fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0), 3590bf215546Sopenharmony_ci BRW_REGISTER_TYPE_UW)); 3591bf215546Sopenharmony_ci cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ); 3592bf215546Sopenharmony_ci } 3593bf215546Sopenharmony_ci 3594bf215546Sopenharmony_ci cmp->predicate = BRW_PREDICATE_NORMAL; 3595bf215546Sopenharmony_ci cmp->flag_subreg = sample_mask_flag_subreg(this); 3596bf215546Sopenharmony_ci 3597bf215546Sopenharmony_ci fs_inst *jump = bld.emit(BRW_OPCODE_HALT); 3598bf215546Sopenharmony_ci jump->flag_subreg = sample_mask_flag_subreg(this); 3599bf215546Sopenharmony_ci jump->predicate_inverse = true; 3600bf215546Sopenharmony_ci 3601bf215546Sopenharmony_ci if (instr->intrinsic == nir_intrinsic_terminate || 3602bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_terminate_if) { 3603bf215546Sopenharmony_ci jump->predicate = BRW_PREDICATE_NORMAL; 3604bf215546Sopenharmony_ci } else { 3605bf215546Sopenharmony_ci /* Only jump when the whole quad is demoted. For historical 3606bf215546Sopenharmony_ci * reasons this is also used for discard. 3607bf215546Sopenharmony_ci */ 3608bf215546Sopenharmony_ci jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H; 3609bf215546Sopenharmony_ci } 3610bf215546Sopenharmony_ci 3611bf215546Sopenharmony_ci if (devinfo->ver < 7) 3612bf215546Sopenharmony_ci limit_dispatch_width( 3613bf215546Sopenharmony_ci 16, "Fragment discard/demote not implemented in SIMD32 mode.\n"); 3614bf215546Sopenharmony_ci break; 3615bf215546Sopenharmony_ci } 3616bf215546Sopenharmony_ci 3617bf215546Sopenharmony_ci case nir_intrinsic_load_input: { 3618bf215546Sopenharmony_ci /* In Fragment Shaders load_input is used either for flat inputs or 3619bf215546Sopenharmony_ci * per-primitive inputs. 3620bf215546Sopenharmony_ci */ 3621bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest) == 32); 3622bf215546Sopenharmony_ci unsigned base = nir_intrinsic_base(instr); 3623bf215546Sopenharmony_ci unsigned comp = nir_intrinsic_component(instr); 3624bf215546Sopenharmony_ci unsigned num_components = instr->num_components; 3625bf215546Sopenharmony_ci 3626bf215546Sopenharmony_ci /* TODO(mesh): Multiview. Verify and handle these special cases for Mesh. */ 3627bf215546Sopenharmony_ci 3628bf215546Sopenharmony_ci /* Special case fields in the VUE header */ 3629bf215546Sopenharmony_ci if (base == VARYING_SLOT_LAYER) 3630bf215546Sopenharmony_ci comp = 1; 3631bf215546Sopenharmony_ci else if (base == VARYING_SLOT_VIEWPORT) 3632bf215546Sopenharmony_ci comp = 2; 3633bf215546Sopenharmony_ci 3634bf215546Sopenharmony_ci if (BITFIELD64_BIT(base) & nir->info.per_primitive_inputs) { 3635bf215546Sopenharmony_ci assert(base != VARYING_SLOT_PRIMITIVE_INDICES); 3636bf215546Sopenharmony_ci for (unsigned int i = 0; i < num_components; i++) { 3637bf215546Sopenharmony_ci bld.MOV(offset(dest, bld, i), 3638bf215546Sopenharmony_ci retype(component(per_primitive_reg(base), comp + i), dest.type)); 3639bf215546Sopenharmony_ci } 3640bf215546Sopenharmony_ci } else { 3641bf215546Sopenharmony_ci for (unsigned int i = 0; i < num_components; i++) { 3642bf215546Sopenharmony_ci bld.MOV(offset(dest, bld, i), 3643bf215546Sopenharmony_ci retype(component(interp_reg(base, comp + i), 3), dest.type)); 3644bf215546Sopenharmony_ci } 3645bf215546Sopenharmony_ci } 3646bf215546Sopenharmony_ci break; 3647bf215546Sopenharmony_ci } 3648bf215546Sopenharmony_ci 3649bf215546Sopenharmony_ci case nir_intrinsic_load_fs_input_interp_deltas: { 3650bf215546Sopenharmony_ci assert(stage == MESA_SHADER_FRAGMENT); 3651bf215546Sopenharmony_ci assert(nir_src_as_uint(instr->src[0]) == 0); 3652bf215546Sopenharmony_ci fs_reg interp = interp_reg(nir_intrinsic_base(instr), 3653bf215546Sopenharmony_ci nir_intrinsic_component(instr)); 3654bf215546Sopenharmony_ci dest.type = BRW_REGISTER_TYPE_F; 3655bf215546Sopenharmony_ci bld.MOV(offset(dest, bld, 0), component(interp, 3)); 3656bf215546Sopenharmony_ci bld.MOV(offset(dest, bld, 1), component(interp, 1)); 3657bf215546Sopenharmony_ci bld.MOV(offset(dest, bld, 2), component(interp, 0)); 3658bf215546Sopenharmony_ci break; 3659bf215546Sopenharmony_ci } 3660bf215546Sopenharmony_ci 3661bf215546Sopenharmony_ci case nir_intrinsic_load_barycentric_pixel: 3662bf215546Sopenharmony_ci case nir_intrinsic_load_barycentric_centroid: 3663bf215546Sopenharmony_ci case nir_intrinsic_load_barycentric_sample: { 3664bf215546Sopenharmony_ci /* Use the delta_xy values computed from the payload */ 3665bf215546Sopenharmony_ci enum brw_barycentric_mode bary = brw_barycentric_mode(instr); 3666bf215546Sopenharmony_ci const fs_reg srcs[] = { offset(this->delta_xy[bary], bld, 0), 3667bf215546Sopenharmony_ci offset(this->delta_xy[bary], bld, 1) }; 3668bf215546Sopenharmony_ci bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0); 3669bf215546Sopenharmony_ci break; 3670bf215546Sopenharmony_ci } 3671bf215546Sopenharmony_ci 3672bf215546Sopenharmony_ci case nir_intrinsic_load_barycentric_at_sample: { 3673bf215546Sopenharmony_ci const glsl_interp_mode interpolation = 3674bf215546Sopenharmony_ci (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); 3675bf215546Sopenharmony_ci 3676bf215546Sopenharmony_ci if (nir_src_is_const(instr->src[0])) { 3677bf215546Sopenharmony_ci unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4; 3678bf215546Sopenharmony_ci 3679bf215546Sopenharmony_ci emit_pixel_interpolater_send(bld, 3680bf215546Sopenharmony_ci FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3681bf215546Sopenharmony_ci dest, 3682bf215546Sopenharmony_ci fs_reg(), /* src */ 3683bf215546Sopenharmony_ci brw_imm_ud(msg_data), 3684bf215546Sopenharmony_ci interpolation); 3685bf215546Sopenharmony_ci } else { 3686bf215546Sopenharmony_ci const fs_reg sample_src = retype(get_nir_src(instr->src[0]), 3687bf215546Sopenharmony_ci BRW_REGISTER_TYPE_UD); 3688bf215546Sopenharmony_ci 3689bf215546Sopenharmony_ci if (nir_src_is_always_uniform(instr->src[0])) { 3690bf215546Sopenharmony_ci const fs_reg sample_id = bld.emit_uniformize(sample_src); 3691bf215546Sopenharmony_ci const fs_reg msg_data = vgrf(glsl_type::uint_type); 3692bf215546Sopenharmony_ci bld.exec_all().group(1, 0) 3693bf215546Sopenharmony_ci .SHL(msg_data, sample_id, brw_imm_ud(4u)); 3694bf215546Sopenharmony_ci emit_pixel_interpolater_send(bld, 3695bf215546Sopenharmony_ci FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3696bf215546Sopenharmony_ci dest, 3697bf215546Sopenharmony_ci fs_reg(), /* src */ 3698bf215546Sopenharmony_ci component(msg_data, 0), 3699bf215546Sopenharmony_ci interpolation); 3700bf215546Sopenharmony_ci } else { 3701bf215546Sopenharmony_ci /* Make a loop that sends a message to the pixel interpolater 3702bf215546Sopenharmony_ci * for the sample number in each live channel. If there are 3703bf215546Sopenharmony_ci * multiple channels with the same sample number then these 3704bf215546Sopenharmony_ci * will be handled simultaneously with a single iteration of 3705bf215546Sopenharmony_ci * the loop. 3706bf215546Sopenharmony_ci */ 3707bf215546Sopenharmony_ci bld.emit(BRW_OPCODE_DO); 3708bf215546Sopenharmony_ci 3709bf215546Sopenharmony_ci /* Get the next live sample number into sample_id_reg */ 3710bf215546Sopenharmony_ci const fs_reg sample_id = bld.emit_uniformize(sample_src); 3711bf215546Sopenharmony_ci 3712bf215546Sopenharmony_ci /* Set the flag register so that we can perform the send 3713bf215546Sopenharmony_ci * message on all channels that have the same sample number 3714bf215546Sopenharmony_ci */ 3715bf215546Sopenharmony_ci bld.CMP(bld.null_reg_ud(), 3716bf215546Sopenharmony_ci sample_src, sample_id, 3717bf215546Sopenharmony_ci BRW_CONDITIONAL_EQ); 3718bf215546Sopenharmony_ci const fs_reg msg_data = vgrf(glsl_type::uint_type); 3719bf215546Sopenharmony_ci bld.exec_all().group(1, 0) 3720bf215546Sopenharmony_ci .SHL(msg_data, sample_id, brw_imm_ud(4u)); 3721bf215546Sopenharmony_ci fs_inst *inst = 3722bf215546Sopenharmony_ci emit_pixel_interpolater_send(bld, 3723bf215546Sopenharmony_ci FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3724bf215546Sopenharmony_ci dest, 3725bf215546Sopenharmony_ci fs_reg(), /* src */ 3726bf215546Sopenharmony_ci component(msg_data, 0), 3727bf215546Sopenharmony_ci interpolation); 3728bf215546Sopenharmony_ci set_predicate(BRW_PREDICATE_NORMAL, inst); 3729bf215546Sopenharmony_ci 3730bf215546Sopenharmony_ci /* Continue the loop if there are any live channels left */ 3731bf215546Sopenharmony_ci set_predicate_inv(BRW_PREDICATE_NORMAL, 3732bf215546Sopenharmony_ci true, /* inverse */ 3733bf215546Sopenharmony_ci bld.emit(BRW_OPCODE_WHILE)); 3734bf215546Sopenharmony_ci } 3735bf215546Sopenharmony_ci } 3736bf215546Sopenharmony_ci break; 3737bf215546Sopenharmony_ci } 3738bf215546Sopenharmony_ci 3739bf215546Sopenharmony_ci case nir_intrinsic_load_barycentric_at_offset: { 3740bf215546Sopenharmony_ci const glsl_interp_mode interpolation = 3741bf215546Sopenharmony_ci (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); 3742bf215546Sopenharmony_ci 3743bf215546Sopenharmony_ci nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); 3744bf215546Sopenharmony_ci 3745bf215546Sopenharmony_ci if (const_offset) { 3746bf215546Sopenharmony_ci assert(nir_src_bit_size(instr->src[0]) == 32); 3747bf215546Sopenharmony_ci unsigned off_x = const_offset[0].u32 & 0xf; 3748bf215546Sopenharmony_ci unsigned off_y = const_offset[1].u32 & 0xf; 3749bf215546Sopenharmony_ci 3750bf215546Sopenharmony_ci emit_pixel_interpolater_send(bld, 3751bf215546Sopenharmony_ci FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, 3752bf215546Sopenharmony_ci dest, 3753bf215546Sopenharmony_ci fs_reg(), /* src */ 3754bf215546Sopenharmony_ci brw_imm_ud(off_x | (off_y << 4)), 3755bf215546Sopenharmony_ci interpolation); 3756bf215546Sopenharmony_ci } else { 3757bf215546Sopenharmony_ci fs_reg src = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_D); 3758bf215546Sopenharmony_ci const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; 3759bf215546Sopenharmony_ci emit_pixel_interpolater_send(bld, 3760bf215546Sopenharmony_ci opcode, 3761bf215546Sopenharmony_ci dest, 3762bf215546Sopenharmony_ci src, 3763bf215546Sopenharmony_ci brw_imm_ud(0u), 3764bf215546Sopenharmony_ci interpolation); 3765bf215546Sopenharmony_ci } 3766bf215546Sopenharmony_ci break; 3767bf215546Sopenharmony_ci } 3768bf215546Sopenharmony_ci 3769bf215546Sopenharmony_ci case nir_intrinsic_load_frag_coord: 3770bf215546Sopenharmony_ci emit_fragcoord_interpolation(dest); 3771bf215546Sopenharmony_ci break; 3772bf215546Sopenharmony_ci 3773bf215546Sopenharmony_ci case nir_intrinsic_load_interpolated_input: { 3774bf215546Sopenharmony_ci assert(instr->src[0].ssa && 3775bf215546Sopenharmony_ci instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic); 3776bf215546Sopenharmony_ci nir_intrinsic_instr *bary_intrinsic = 3777bf215546Sopenharmony_ci nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr); 3778bf215546Sopenharmony_ci nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic; 3779bf215546Sopenharmony_ci enum glsl_interp_mode interp_mode = 3780bf215546Sopenharmony_ci (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic); 3781bf215546Sopenharmony_ci fs_reg dst_xy; 3782bf215546Sopenharmony_ci 3783bf215546Sopenharmony_ci if (bary_intrin == nir_intrinsic_load_barycentric_at_offset || 3784bf215546Sopenharmony_ci bary_intrin == nir_intrinsic_load_barycentric_at_sample) { 3785bf215546Sopenharmony_ci /* Use the result of the PI message. */ 3786bf215546Sopenharmony_ci dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F); 3787bf215546Sopenharmony_ci } else { 3788bf215546Sopenharmony_ci /* Use the delta_xy values computed from the payload */ 3789bf215546Sopenharmony_ci enum brw_barycentric_mode bary = brw_barycentric_mode(bary_intrinsic); 3790bf215546Sopenharmony_ci dst_xy = this->delta_xy[bary]; 3791bf215546Sopenharmony_ci } 3792bf215546Sopenharmony_ci 3793bf215546Sopenharmony_ci for (unsigned int i = 0; i < instr->num_components; i++) { 3794bf215546Sopenharmony_ci fs_reg interp = 3795bf215546Sopenharmony_ci component(interp_reg(nir_intrinsic_base(instr), 3796bf215546Sopenharmony_ci nir_intrinsic_component(instr) + i), 0); 3797bf215546Sopenharmony_ci interp.type = BRW_REGISTER_TYPE_F; 3798bf215546Sopenharmony_ci dest.type = BRW_REGISTER_TYPE_F; 3799bf215546Sopenharmony_ci 3800bf215546Sopenharmony_ci if (devinfo->ver < 6 && interp_mode == INTERP_MODE_SMOOTH) { 3801bf215546Sopenharmony_ci fs_reg tmp = vgrf(glsl_type::float_type); 3802bf215546Sopenharmony_ci bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp); 3803bf215546Sopenharmony_ci bld.MUL(offset(dest, bld, i), tmp, this->pixel_w); 3804bf215546Sopenharmony_ci } else { 3805bf215546Sopenharmony_ci bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp); 3806bf215546Sopenharmony_ci } 3807bf215546Sopenharmony_ci } 3808bf215546Sopenharmony_ci break; 3809bf215546Sopenharmony_ci } 3810bf215546Sopenharmony_ci 3811bf215546Sopenharmony_ci default: 3812bf215546Sopenharmony_ci nir_emit_intrinsic(bld, instr); 3813bf215546Sopenharmony_ci break; 3814bf215546Sopenharmony_ci } 3815bf215546Sopenharmony_ci} 3816bf215546Sopenharmony_ci 3817bf215546Sopenharmony_civoid 3818bf215546Sopenharmony_cifs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, 3819bf215546Sopenharmony_ci nir_intrinsic_instr *instr) 3820bf215546Sopenharmony_ci{ 3821bf215546Sopenharmony_ci assert(gl_shader_stage_uses_workgroup(stage)); 3822bf215546Sopenharmony_ci struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); 3823bf215546Sopenharmony_ci 3824bf215546Sopenharmony_ci fs_reg dest; 3825bf215546Sopenharmony_ci if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3826bf215546Sopenharmony_ci dest = get_nir_dest(instr->dest); 3827bf215546Sopenharmony_ci 3828bf215546Sopenharmony_ci switch (instr->intrinsic) { 3829bf215546Sopenharmony_ci case nir_intrinsic_control_barrier: 3830bf215546Sopenharmony_ci /* The whole workgroup fits in a single HW thread, so all the 3831bf215546Sopenharmony_ci * invocations are already executed lock-step. Instead of an actual 3832bf215546Sopenharmony_ci * barrier just emit a scheduling fence, that will generate no code. 3833bf215546Sopenharmony_ci */ 3834bf215546Sopenharmony_ci if (!nir->info.workgroup_size_variable && 3835bf215546Sopenharmony_ci workgroup_size() <= dispatch_width) { 3836bf215546Sopenharmony_ci bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE); 3837bf215546Sopenharmony_ci break; 3838bf215546Sopenharmony_ci } 3839bf215546Sopenharmony_ci 3840bf215546Sopenharmony_ci emit_barrier(); 3841bf215546Sopenharmony_ci cs_prog_data->uses_barrier = true; 3842bf215546Sopenharmony_ci break; 3843bf215546Sopenharmony_ci 3844bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_id: 3845bf215546Sopenharmony_ci if (devinfo->verx10 >= 125) 3846bf215546Sopenharmony_ci bld.AND(retype(dest, BRW_REGISTER_TYPE_UD), 3847bf215546Sopenharmony_ci retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), 3848bf215546Sopenharmony_ci brw_imm_ud(INTEL_MASK(7, 0))); 3849bf215546Sopenharmony_ci else 3850bf215546Sopenharmony_ci bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id); 3851bf215546Sopenharmony_ci break; 3852bf215546Sopenharmony_ci 3853bf215546Sopenharmony_ci case nir_intrinsic_load_local_invocation_id: 3854bf215546Sopenharmony_ci case nir_intrinsic_load_workgroup_id: { 3855bf215546Sopenharmony_ci gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); 3856bf215546Sopenharmony_ci fs_reg val = nir_system_values[sv]; 3857bf215546Sopenharmony_ci assert(val.file != BAD_FILE); 3858bf215546Sopenharmony_ci dest.type = val.type; 3859bf215546Sopenharmony_ci for (unsigned i = 0; i < 3; i++) 3860bf215546Sopenharmony_ci bld.MOV(offset(dest, bld, i), offset(val, bld, i)); 3861bf215546Sopenharmony_ci break; 3862bf215546Sopenharmony_ci } 3863bf215546Sopenharmony_ci 3864bf215546Sopenharmony_ci case nir_intrinsic_load_num_workgroups: { 3865bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest) == 32); 3866bf215546Sopenharmony_ci 3867bf215546Sopenharmony_ci cs_prog_data->uses_num_work_groups = true; 3868bf215546Sopenharmony_ci 3869bf215546Sopenharmony_ci fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 3870bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(0); 3871bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 3872bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(3); /* num components */ 3873bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(0); 3874bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 3875bf215546Sopenharmony_ci fs_inst *inst = 3876bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 3877bf215546Sopenharmony_ci dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 3878bf215546Sopenharmony_ci inst->size_written = 3 * dispatch_width * 4; 3879bf215546Sopenharmony_ci break; 3880bf215546Sopenharmony_ci } 3881bf215546Sopenharmony_ci 3882bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_add: 3883bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_imin: 3884bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_umin: 3885bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_imax: 3886bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_umax: 3887bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_and: 3888bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_or: 3889bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_xor: 3890bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_exchange: 3891bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_comp_swap: 3892bf215546Sopenharmony_ci nir_emit_shared_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr); 3893bf215546Sopenharmony_ci break; 3894bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_fmin: 3895bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_fmax: 3896bf215546Sopenharmony_ci case nir_intrinsic_shared_atomic_fcomp_swap: 3897bf215546Sopenharmony_ci nir_emit_shared_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr); 3898bf215546Sopenharmony_ci break; 3899bf215546Sopenharmony_ci 3900bf215546Sopenharmony_ci case nir_intrinsic_load_shared: { 3901bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 3902bf215546Sopenharmony_ci assert(nir_intrinsic_base(instr) == 0); 3903bf215546Sopenharmony_ci 3904bf215546Sopenharmony_ci const unsigned bit_size = nir_dest_bit_size(instr->dest); 3905bf215546Sopenharmony_ci fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 3906bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM); 3907bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[0]); 3908bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 3909bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 3910bf215546Sopenharmony_ci 3911bf215546Sopenharmony_ci /* Make dest unsigned because that's what the temporary will be */ 3912bf215546Sopenharmony_ci dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 3913bf215546Sopenharmony_ci 3914bf215546Sopenharmony_ci /* Read the vector */ 3915bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest) <= 32); 3916bf215546Sopenharmony_ci assert(nir_intrinsic_align(instr) > 0); 3917bf215546Sopenharmony_ci if (nir_dest_bit_size(instr->dest) == 32 && 3918bf215546Sopenharmony_ci nir_intrinsic_align(instr) >= 4) { 3919bf215546Sopenharmony_ci assert(nir_dest_num_components(instr->dest) <= 4); 3920bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 3921bf215546Sopenharmony_ci fs_inst *inst = 3922bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 3923bf215546Sopenharmony_ci dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 3924bf215546Sopenharmony_ci inst->size_written = instr->num_components * dispatch_width * 4; 3925bf215546Sopenharmony_ci } else { 3926bf215546Sopenharmony_ci assert(nir_dest_num_components(instr->dest) == 1); 3927bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 3928bf215546Sopenharmony_ci 3929bf215546Sopenharmony_ci fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); 3930bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, 3931bf215546Sopenharmony_ci read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); 3932bf215546Sopenharmony_ci bld.MOV(dest, subscript(read_result, dest.type, 0)); 3933bf215546Sopenharmony_ci } 3934bf215546Sopenharmony_ci break; 3935bf215546Sopenharmony_ci } 3936bf215546Sopenharmony_ci 3937bf215546Sopenharmony_ci case nir_intrinsic_store_shared: { 3938bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 3939bf215546Sopenharmony_ci assert(nir_intrinsic_base(instr) == 0); 3940bf215546Sopenharmony_ci 3941bf215546Sopenharmony_ci const unsigned bit_size = nir_src_bit_size(instr->src[0]); 3942bf215546Sopenharmony_ci fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 3943bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM); 3944bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 3945bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 3946bf215546Sopenharmony_ci /* No point in masking with sample mask, here we're handling compute 3947bf215546Sopenharmony_ci * intrinsics. 3948bf215546Sopenharmony_ci */ 3949bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 3950bf215546Sopenharmony_ci 3951bf215546Sopenharmony_ci fs_reg data = get_nir_src(instr->src[0]); 3952bf215546Sopenharmony_ci data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 3953bf215546Sopenharmony_ci 3954bf215546Sopenharmony_ci assert(nir_src_bit_size(instr->src[0]) <= 32); 3955bf215546Sopenharmony_ci assert(nir_intrinsic_write_mask(instr) == 3956bf215546Sopenharmony_ci (1u << instr->num_components) - 1); 3957bf215546Sopenharmony_ci assert(nir_intrinsic_align(instr) > 0); 3958bf215546Sopenharmony_ci if (nir_src_bit_size(instr->src[0]) == 32 && 3959bf215546Sopenharmony_ci nir_intrinsic_align(instr) >= 4) { 3960bf215546Sopenharmony_ci assert(nir_src_num_components(instr->src[0]) <= 4); 3961bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_DATA] = data; 3962bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 3963bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 3964bf215546Sopenharmony_ci fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 3965bf215546Sopenharmony_ci } else { 3966bf215546Sopenharmony_ci assert(nir_src_num_components(instr->src[0]) == 1); 3967bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 3968bf215546Sopenharmony_ci 3969bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); 3970bf215546Sopenharmony_ci bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); 3971bf215546Sopenharmony_ci 3972bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, 3973bf215546Sopenharmony_ci fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 3974bf215546Sopenharmony_ci } 3975bf215546Sopenharmony_ci break; 3976bf215546Sopenharmony_ci } 3977bf215546Sopenharmony_ci 3978bf215546Sopenharmony_ci case nir_intrinsic_load_workgroup_size: { 3979bf215546Sopenharmony_ci /* For non-variable case, this should've been lowered already. */ 3980bf215546Sopenharmony_ci assert(nir->info.workgroup_size_variable); 3981bf215546Sopenharmony_ci 3982bf215546Sopenharmony_ci assert(compiler->lower_variable_group_size); 3983bf215546Sopenharmony_ci assert(gl_shader_stage_is_compute(stage)); 3984bf215546Sopenharmony_ci 3985bf215546Sopenharmony_ci for (unsigned i = 0; i < 3; i++) { 3986bf215546Sopenharmony_ci bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD), 3987bf215546Sopenharmony_ci group_size[i]); 3988bf215546Sopenharmony_ci } 3989bf215546Sopenharmony_ci break; 3990bf215546Sopenharmony_ci } 3991bf215546Sopenharmony_ci 3992bf215546Sopenharmony_ci default: 3993bf215546Sopenharmony_ci nir_emit_intrinsic(bld, instr); 3994bf215546Sopenharmony_ci break; 3995bf215546Sopenharmony_ci } 3996bf215546Sopenharmony_ci} 3997bf215546Sopenharmony_ci 3998bf215546Sopenharmony_cistatic void 3999bf215546Sopenharmony_ciemit_rt_lsc_fence(const fs_builder &bld, 4000bf215546Sopenharmony_ci enum lsc_fence_scope scope, 4001bf215546Sopenharmony_ci enum lsc_flush_type flush_type) 4002bf215546Sopenharmony_ci{ 4003bf215546Sopenharmony_ci const intel_device_info *devinfo = bld.shader->devinfo; 4004bf215546Sopenharmony_ci 4005bf215546Sopenharmony_ci const fs_builder ubld = bld.exec_all().group(8, 0); 4006bf215546Sopenharmony_ci fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD); 4007bf215546Sopenharmony_ci fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, tmp, 4008bf215546Sopenharmony_ci brw_imm_ud(0) /* desc */, 4009bf215546Sopenharmony_ci brw_imm_ud(0) /* ex_desc */, 4010bf215546Sopenharmony_ci brw_vec8_grf(0, 0) /* payload */); 4011bf215546Sopenharmony_ci send->sfid = GFX12_SFID_UGM; 4012bf215546Sopenharmony_ci send->desc = lsc_fence_msg_desc(devinfo, scope, flush_type, true); 4013bf215546Sopenharmony_ci send->mlen = 1; /* g0 header */ 4014bf215546Sopenharmony_ci send->ex_mlen = 0; 4015bf215546Sopenharmony_ci send->size_written = REG_SIZE; /* Temp write for scheduling */ 4016bf215546Sopenharmony_ci send->send_has_side_effects = true; 4017bf215546Sopenharmony_ci 4018bf215546Sopenharmony_ci ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), tmp); 4019bf215546Sopenharmony_ci} 4020bf215546Sopenharmony_ci 4021bf215546Sopenharmony_ci 4022bf215546Sopenharmony_civoid 4023bf215546Sopenharmony_cifs_visitor::nir_emit_bs_intrinsic(const fs_builder &bld, 4024bf215546Sopenharmony_ci nir_intrinsic_instr *instr) 4025bf215546Sopenharmony_ci{ 4026bf215546Sopenharmony_ci assert(brw_shader_stage_is_bindless(stage)); 4027bf215546Sopenharmony_ci 4028bf215546Sopenharmony_ci fs_reg dest; 4029bf215546Sopenharmony_ci if (nir_intrinsic_infos[instr->intrinsic].has_dest) 4030bf215546Sopenharmony_ci dest = get_nir_dest(instr->dest); 4031bf215546Sopenharmony_ci 4032bf215546Sopenharmony_ci switch (instr->intrinsic) { 4033bf215546Sopenharmony_ci case nir_intrinsic_load_btd_global_arg_addr_intel: 4034bf215546Sopenharmony_ci bld.MOV(dest, retype(brw_vec1_grf(2, 0), dest.type)); 4035bf215546Sopenharmony_ci break; 4036bf215546Sopenharmony_ci 4037bf215546Sopenharmony_ci case nir_intrinsic_load_btd_local_arg_addr_intel: 4038bf215546Sopenharmony_ci bld.MOV(dest, retype(brw_vec1_grf(2, 2), dest.type)); 4039bf215546Sopenharmony_ci break; 4040bf215546Sopenharmony_ci 4041bf215546Sopenharmony_ci case nir_intrinsic_load_btd_shader_type_intel: { 4042bf215546Sopenharmony_ci fs_reg ud_dest = retype(dest, BRW_REGISTER_TYPE_UD); 4043bf215546Sopenharmony_ci bld.MOV(ud_dest, retype(brw_vec1_grf(0, 3), ud_dest.type)); 4044bf215546Sopenharmony_ci bld.AND(ud_dest, ud_dest, brw_imm_ud(0xf)); 4045bf215546Sopenharmony_ci break; 4046bf215546Sopenharmony_ci } 4047bf215546Sopenharmony_ci 4048bf215546Sopenharmony_ci default: 4049bf215546Sopenharmony_ci nir_emit_intrinsic(bld, instr); 4050bf215546Sopenharmony_ci break; 4051bf215546Sopenharmony_ci } 4052bf215546Sopenharmony_ci} 4053bf215546Sopenharmony_ci 4054bf215546Sopenharmony_cistatic fs_reg 4055bf215546Sopenharmony_cibrw_nir_reduction_op_identity(const fs_builder &bld, 4056bf215546Sopenharmony_ci nir_op op, brw_reg_type type) 4057bf215546Sopenharmony_ci{ 4058bf215546Sopenharmony_ci nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8); 4059bf215546Sopenharmony_ci switch (type_sz(type)) { 4060bf215546Sopenharmony_ci case 1: 4061bf215546Sopenharmony_ci if (type == BRW_REGISTER_TYPE_UB) { 4062bf215546Sopenharmony_ci return brw_imm_uw(value.u8); 4063bf215546Sopenharmony_ci } else { 4064bf215546Sopenharmony_ci assert(type == BRW_REGISTER_TYPE_B); 4065bf215546Sopenharmony_ci return brw_imm_w(value.i8); 4066bf215546Sopenharmony_ci } 4067bf215546Sopenharmony_ci case 2: 4068bf215546Sopenharmony_ci return retype(brw_imm_uw(value.u16), type); 4069bf215546Sopenharmony_ci case 4: 4070bf215546Sopenharmony_ci return retype(brw_imm_ud(value.u32), type); 4071bf215546Sopenharmony_ci case 8: 4072bf215546Sopenharmony_ci if (type == BRW_REGISTER_TYPE_DF) 4073bf215546Sopenharmony_ci return setup_imm_df(bld, value.f64); 4074bf215546Sopenharmony_ci else 4075bf215546Sopenharmony_ci return retype(brw_imm_u64(value.u64), type); 4076bf215546Sopenharmony_ci default: 4077bf215546Sopenharmony_ci unreachable("Invalid type size"); 4078bf215546Sopenharmony_ci } 4079bf215546Sopenharmony_ci} 4080bf215546Sopenharmony_ci 4081bf215546Sopenharmony_cistatic opcode 4082bf215546Sopenharmony_cibrw_op_for_nir_reduction_op(nir_op op) 4083bf215546Sopenharmony_ci{ 4084bf215546Sopenharmony_ci switch (op) { 4085bf215546Sopenharmony_ci case nir_op_iadd: return BRW_OPCODE_ADD; 4086bf215546Sopenharmony_ci case nir_op_fadd: return BRW_OPCODE_ADD; 4087bf215546Sopenharmony_ci case nir_op_imul: return BRW_OPCODE_MUL; 4088bf215546Sopenharmony_ci case nir_op_fmul: return BRW_OPCODE_MUL; 4089bf215546Sopenharmony_ci case nir_op_imin: return BRW_OPCODE_SEL; 4090bf215546Sopenharmony_ci case nir_op_umin: return BRW_OPCODE_SEL; 4091bf215546Sopenharmony_ci case nir_op_fmin: return BRW_OPCODE_SEL; 4092bf215546Sopenharmony_ci case nir_op_imax: return BRW_OPCODE_SEL; 4093bf215546Sopenharmony_ci case nir_op_umax: return BRW_OPCODE_SEL; 4094bf215546Sopenharmony_ci case nir_op_fmax: return BRW_OPCODE_SEL; 4095bf215546Sopenharmony_ci case nir_op_iand: return BRW_OPCODE_AND; 4096bf215546Sopenharmony_ci case nir_op_ior: return BRW_OPCODE_OR; 4097bf215546Sopenharmony_ci case nir_op_ixor: return BRW_OPCODE_XOR; 4098bf215546Sopenharmony_ci default: 4099bf215546Sopenharmony_ci unreachable("Invalid reduction operation"); 4100bf215546Sopenharmony_ci } 4101bf215546Sopenharmony_ci} 4102bf215546Sopenharmony_ci 4103bf215546Sopenharmony_cistatic brw_conditional_mod 4104bf215546Sopenharmony_cibrw_cond_mod_for_nir_reduction_op(nir_op op) 4105bf215546Sopenharmony_ci{ 4106bf215546Sopenharmony_ci switch (op) { 4107bf215546Sopenharmony_ci case nir_op_iadd: return BRW_CONDITIONAL_NONE; 4108bf215546Sopenharmony_ci case nir_op_fadd: return BRW_CONDITIONAL_NONE; 4109bf215546Sopenharmony_ci case nir_op_imul: return BRW_CONDITIONAL_NONE; 4110bf215546Sopenharmony_ci case nir_op_fmul: return BRW_CONDITIONAL_NONE; 4111bf215546Sopenharmony_ci case nir_op_imin: return BRW_CONDITIONAL_L; 4112bf215546Sopenharmony_ci case nir_op_umin: return BRW_CONDITIONAL_L; 4113bf215546Sopenharmony_ci case nir_op_fmin: return BRW_CONDITIONAL_L; 4114bf215546Sopenharmony_ci case nir_op_imax: return BRW_CONDITIONAL_GE; 4115bf215546Sopenharmony_ci case nir_op_umax: return BRW_CONDITIONAL_GE; 4116bf215546Sopenharmony_ci case nir_op_fmax: return BRW_CONDITIONAL_GE; 4117bf215546Sopenharmony_ci case nir_op_iand: return BRW_CONDITIONAL_NONE; 4118bf215546Sopenharmony_ci case nir_op_ior: return BRW_CONDITIONAL_NONE; 4119bf215546Sopenharmony_ci case nir_op_ixor: return BRW_CONDITIONAL_NONE; 4120bf215546Sopenharmony_ci default: 4121bf215546Sopenharmony_ci unreachable("Invalid reduction operation"); 4122bf215546Sopenharmony_ci } 4123bf215546Sopenharmony_ci} 4124bf215546Sopenharmony_ci 4125bf215546Sopenharmony_cifs_reg 4126bf215546Sopenharmony_cifs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld, 4127bf215546Sopenharmony_ci nir_intrinsic_instr *instr) 4128bf215546Sopenharmony_ci{ 4129bf215546Sopenharmony_ci fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD); 4130bf215546Sopenharmony_ci fs_reg surf_index = image; 4131bf215546Sopenharmony_ci 4132bf215546Sopenharmony_ci return bld.emit_uniformize(surf_index); 4133bf215546Sopenharmony_ci} 4134bf215546Sopenharmony_ci 4135bf215546Sopenharmony_cifs_reg 4136bf215546Sopenharmony_cifs_visitor::get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld, 4137bf215546Sopenharmony_ci nir_intrinsic_instr *instr) 4138bf215546Sopenharmony_ci{ 4139bf215546Sopenharmony_ci /* SSBO stores are weird in that their index is in src[1] */ 4140bf215546Sopenharmony_ci const bool is_store = 4141bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_store_ssbo || 4142bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_store_ssbo_block_intel; 4143bf215546Sopenharmony_ci const unsigned src = is_store ? 1 : 0; 4144bf215546Sopenharmony_ci 4145bf215546Sopenharmony_ci if (nir_src_is_const(instr->src[src])) { 4146bf215546Sopenharmony_ci return brw_imm_ud(nir_src_as_uint(instr->src[src])); 4147bf215546Sopenharmony_ci } else { 4148bf215546Sopenharmony_ci return bld.emit_uniformize(get_nir_src(instr->src[src])); 4149bf215546Sopenharmony_ci } 4150bf215546Sopenharmony_ci} 4151bf215546Sopenharmony_ci 4152bf215546Sopenharmony_ci/** 4153bf215546Sopenharmony_ci * The offsets we get from NIR act as if each SIMD channel has it's own blob 4154bf215546Sopenharmony_ci * of contiguous space. However, if we actually place each SIMD channel in 4155bf215546Sopenharmony_ci * it's own space, we end up with terrible cache performance because each SIMD 4156bf215546Sopenharmony_ci * channel accesses a different cache line even when they're all accessing the 4157bf215546Sopenharmony_ci * same byte offset. To deal with this problem, we swizzle the address using 4158bf215546Sopenharmony_ci * a simple algorithm which ensures that any time a SIMD message reads or 4159bf215546Sopenharmony_ci * writes the same address, it's all in the same cache line. We have to keep 4160bf215546Sopenharmony_ci * the bottom two bits fixed so that we can read/write up to a dword at a time 4161bf215546Sopenharmony_ci * and the individual element is contiguous. We do this by splitting the 4162bf215546Sopenharmony_ci * address as follows: 4163bf215546Sopenharmony_ci * 4164bf215546Sopenharmony_ci * 31 4-6 2 0 4165bf215546Sopenharmony_ci * +-------------------------------+------------+----------+ 4166bf215546Sopenharmony_ci * | Hi address bits | chan index | addr low | 4167bf215546Sopenharmony_ci * +-------------------------------+------------+----------+ 4168bf215546Sopenharmony_ci * 4169bf215546Sopenharmony_ci * In other words, the bottom two address bits stay, and the top 30 get 4170bf215546Sopenharmony_ci * shifted up so that we can stick the SIMD channel index in the middle. This 4171bf215546Sopenharmony_ci * way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit 4172bf215546Sopenharmony_ci * at the same logical offset, the scratch read/write instruction acts on 4173bf215546Sopenharmony_ci * continuous elements and we get good cache locality. 4174bf215546Sopenharmony_ci */ 4175bf215546Sopenharmony_cifs_reg 4176bf215546Sopenharmony_cifs_visitor::swizzle_nir_scratch_addr(const brw::fs_builder &bld, 4177bf215546Sopenharmony_ci const fs_reg &nir_addr, 4178bf215546Sopenharmony_ci bool in_dwords) 4179bf215546Sopenharmony_ci{ 4180bf215546Sopenharmony_ci const fs_reg &chan_index = 4181bf215546Sopenharmony_ci nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]; 4182bf215546Sopenharmony_ci const unsigned chan_index_bits = ffs(dispatch_width) - 1; 4183bf215546Sopenharmony_ci 4184bf215546Sopenharmony_ci fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD); 4185bf215546Sopenharmony_ci if (in_dwords) { 4186bf215546Sopenharmony_ci /* In this case, we know the address is aligned to a DWORD and we want 4187bf215546Sopenharmony_ci * the final address in DWORDs. 4188bf215546Sopenharmony_ci */ 4189bf215546Sopenharmony_ci bld.SHL(addr, nir_addr, brw_imm_ud(chan_index_bits - 2)); 4190bf215546Sopenharmony_ci bld.OR(addr, addr, chan_index); 4191bf215546Sopenharmony_ci } else { 4192bf215546Sopenharmony_ci /* This case substantially more annoying because we have to pay 4193bf215546Sopenharmony_ci * attention to those pesky two bottom bits. 4194bf215546Sopenharmony_ci */ 4195bf215546Sopenharmony_ci fs_reg addr_hi = bld.vgrf(BRW_REGISTER_TYPE_UD); 4196bf215546Sopenharmony_ci bld.AND(addr_hi, nir_addr, brw_imm_ud(~0x3u)); 4197bf215546Sopenharmony_ci bld.SHL(addr_hi, addr_hi, brw_imm_ud(chan_index_bits)); 4198bf215546Sopenharmony_ci fs_reg chan_addr = bld.vgrf(BRW_REGISTER_TYPE_UD); 4199bf215546Sopenharmony_ci bld.SHL(chan_addr, chan_index, brw_imm_ud(2)); 4200bf215546Sopenharmony_ci bld.AND(addr, nir_addr, brw_imm_ud(0x3u)); 4201bf215546Sopenharmony_ci bld.OR(addr, addr, addr_hi); 4202bf215546Sopenharmony_ci bld.OR(addr, addr, chan_addr); 4203bf215546Sopenharmony_ci } 4204bf215546Sopenharmony_ci return addr; 4205bf215546Sopenharmony_ci} 4206bf215546Sopenharmony_ci 4207bf215546Sopenharmony_cistatic unsigned 4208bf215546Sopenharmony_cichoose_oword_block_size_dwords(unsigned dwords) 4209bf215546Sopenharmony_ci{ 4210bf215546Sopenharmony_ci unsigned block; 4211bf215546Sopenharmony_ci if (dwords >= 32) { 4212bf215546Sopenharmony_ci block = 32; 4213bf215546Sopenharmony_ci } else if (dwords >= 16) { 4214bf215546Sopenharmony_ci block = 16; 4215bf215546Sopenharmony_ci } else { 4216bf215546Sopenharmony_ci block = 8; 4217bf215546Sopenharmony_ci } 4218bf215546Sopenharmony_ci assert(block <= dwords); 4219bf215546Sopenharmony_ci return block; 4220bf215546Sopenharmony_ci} 4221bf215546Sopenharmony_ci 4222bf215546Sopenharmony_cistatic void 4223bf215546Sopenharmony_ciincrement_a64_address(const fs_builder &bld, fs_reg address, uint32_t v) 4224bf215546Sopenharmony_ci{ 4225bf215546Sopenharmony_ci if (bld.shader->devinfo->has_64bit_int) { 4226bf215546Sopenharmony_ci bld.ADD(address, address, brw_imm_ud(v)); 4227bf215546Sopenharmony_ci } else { 4228bf215546Sopenharmony_ci fs_reg low = retype(address, BRW_REGISTER_TYPE_UD); 4229bf215546Sopenharmony_ci fs_reg high = offset(low, bld, 1); 4230bf215546Sopenharmony_ci 4231bf215546Sopenharmony_ci /* Add low and if that overflows, add carry to high. */ 4232bf215546Sopenharmony_ci bld.ADD(low, low, brw_imm_ud(v))->conditional_mod = BRW_CONDITIONAL_O; 4233bf215546Sopenharmony_ci bld.ADD(high, high, brw_imm_ud(0x1))->predicate = BRW_PREDICATE_NORMAL; 4234bf215546Sopenharmony_ci } 4235bf215546Sopenharmony_ci} 4236bf215546Sopenharmony_ci 4237bf215546Sopenharmony_cistatic fs_reg 4238bf215546Sopenharmony_ciemit_fence(const fs_builder &bld, enum opcode opcode, 4239bf215546Sopenharmony_ci uint8_t sfid, uint32_t desc, 4240bf215546Sopenharmony_ci bool commit_enable, uint8_t bti) 4241bf215546Sopenharmony_ci{ 4242bf215546Sopenharmony_ci assert(opcode == SHADER_OPCODE_INTERLOCK || 4243bf215546Sopenharmony_ci opcode == SHADER_OPCODE_MEMORY_FENCE); 4244bf215546Sopenharmony_ci 4245bf215546Sopenharmony_ci fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD); 4246bf215546Sopenharmony_ci fs_inst *fence = bld.emit(opcode, dst, brw_vec8_grf(0, 0), 4247bf215546Sopenharmony_ci brw_imm_ud(commit_enable), 4248bf215546Sopenharmony_ci brw_imm_ud(bti)); 4249bf215546Sopenharmony_ci fence->sfid = sfid; 4250bf215546Sopenharmony_ci fence->desc = desc; 4251bf215546Sopenharmony_ci 4252bf215546Sopenharmony_ci return dst; 4253bf215546Sopenharmony_ci} 4254bf215546Sopenharmony_ci 4255bf215546Sopenharmony_cistatic uint32_t 4256bf215546Sopenharmony_cilsc_fence_descriptor_for_intrinsic(const struct intel_device_info *devinfo, 4257bf215546Sopenharmony_ci nir_intrinsic_instr *instr) 4258bf215546Sopenharmony_ci{ 4259bf215546Sopenharmony_ci assert(devinfo->has_lsc); 4260bf215546Sopenharmony_ci 4261bf215546Sopenharmony_ci enum lsc_fence_scope scope = LSC_FENCE_LOCAL; 4262bf215546Sopenharmony_ci enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE; 4263bf215546Sopenharmony_ci 4264bf215546Sopenharmony_ci if (nir_intrinsic_has_memory_scope(instr)) { 4265bf215546Sopenharmony_ci switch (nir_intrinsic_memory_scope(instr)) { 4266bf215546Sopenharmony_ci case NIR_SCOPE_DEVICE: 4267bf215546Sopenharmony_ci case NIR_SCOPE_QUEUE_FAMILY: 4268bf215546Sopenharmony_ci scope = LSC_FENCE_TILE; 4269bf215546Sopenharmony_ci flush_type = LSC_FLUSH_TYPE_EVICT; 4270bf215546Sopenharmony_ci break; 4271bf215546Sopenharmony_ci case NIR_SCOPE_WORKGROUP: 4272bf215546Sopenharmony_ci scope = LSC_FENCE_THREADGROUP; 4273bf215546Sopenharmony_ci flush_type = LSC_FLUSH_TYPE_EVICT; 4274bf215546Sopenharmony_ci break; 4275bf215546Sopenharmony_ci case NIR_SCOPE_SHADER_CALL: 4276bf215546Sopenharmony_ci case NIR_SCOPE_INVOCATION: 4277bf215546Sopenharmony_ci case NIR_SCOPE_SUBGROUP: 4278bf215546Sopenharmony_ci case NIR_SCOPE_NONE: 4279bf215546Sopenharmony_ci break; 4280bf215546Sopenharmony_ci } 4281bf215546Sopenharmony_ci } else { 4282bf215546Sopenharmony_ci /* No scope defined. */ 4283bf215546Sopenharmony_ci scope = LSC_FENCE_TILE; 4284bf215546Sopenharmony_ci flush_type = LSC_FLUSH_TYPE_EVICT; 4285bf215546Sopenharmony_ci } 4286bf215546Sopenharmony_ci return lsc_fence_msg_desc(devinfo, scope, flush_type, true); 4287bf215546Sopenharmony_ci} 4288bf215546Sopenharmony_ci 4289bf215546Sopenharmony_civoid 4290bf215546Sopenharmony_cifs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) 4291bf215546Sopenharmony_ci{ 4292bf215546Sopenharmony_ci fs_reg dest; 4293bf215546Sopenharmony_ci if (nir_intrinsic_infos[instr->intrinsic].has_dest) 4294bf215546Sopenharmony_ci dest = get_nir_dest(instr->dest); 4295bf215546Sopenharmony_ci 4296bf215546Sopenharmony_ci switch (instr->intrinsic) { 4297bf215546Sopenharmony_ci case nir_intrinsic_image_load: 4298bf215546Sopenharmony_ci case nir_intrinsic_image_store: 4299bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_add: 4300bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_imin: 4301bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_umin: 4302bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_imax: 4303bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_umax: 4304bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_and: 4305bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_or: 4306bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_xor: 4307bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_exchange: 4308bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_comp_swap: 4309bf215546Sopenharmony_ci case nir_intrinsic_bindless_image_load: 4310bf215546Sopenharmony_ci case nir_intrinsic_bindless_image_store: 4311bf215546Sopenharmony_ci case nir_intrinsic_bindless_image_atomic_add: 4312bf215546Sopenharmony_ci case nir_intrinsic_bindless_image_atomic_imin: 4313bf215546Sopenharmony_ci case nir_intrinsic_bindless_image_atomic_umin: 4314bf215546Sopenharmony_ci case nir_intrinsic_bindless_image_atomic_imax: 4315bf215546Sopenharmony_ci case nir_intrinsic_bindless_image_atomic_umax: 4316bf215546Sopenharmony_ci case nir_intrinsic_bindless_image_atomic_and: 4317bf215546Sopenharmony_ci case nir_intrinsic_bindless_image_atomic_or: 4318bf215546Sopenharmony_ci case nir_intrinsic_bindless_image_atomic_xor: 4319bf215546Sopenharmony_ci case nir_intrinsic_bindless_image_atomic_exchange: 4320bf215546Sopenharmony_ci case nir_intrinsic_bindless_image_atomic_comp_swap: { 4321bf215546Sopenharmony_ci /* Get some metadata from the image intrinsic. */ 4322bf215546Sopenharmony_ci const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; 4323bf215546Sopenharmony_ci 4324bf215546Sopenharmony_ci fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4325bf215546Sopenharmony_ci 4326bf215546Sopenharmony_ci switch (instr->intrinsic) { 4327bf215546Sopenharmony_ci case nir_intrinsic_image_load: 4328bf215546Sopenharmony_ci case nir_intrinsic_image_store: 4329bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_add: 4330bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_imin: 4331bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_umin: 4332bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_imax: 4333bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_umax: 4334bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_and: 4335bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_or: 4336bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_xor: 4337bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_exchange: 4338bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_comp_swap: 4339bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4340bf215546Sopenharmony_ci get_nir_image_intrinsic_image(bld, instr); 4341bf215546Sopenharmony_ci break; 4342bf215546Sopenharmony_ci 4343bf215546Sopenharmony_ci default: 4344bf215546Sopenharmony_ci /* Bindless */ 4345bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = 4346bf215546Sopenharmony_ci bld.emit_uniformize(get_nir_src(instr->src[0])); 4347bf215546Sopenharmony_ci break; 4348bf215546Sopenharmony_ci } 4349bf215546Sopenharmony_ci 4350bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 4351bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = 4352bf215546Sopenharmony_ci brw_imm_ud(nir_image_intrinsic_coord_components(instr)); 4353bf215546Sopenharmony_ci 4354bf215546Sopenharmony_ci /* Emit an image load, store or atomic op. */ 4355bf215546Sopenharmony_ci if (instr->intrinsic == nir_intrinsic_image_load || 4356bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_bindless_image_load) { 4357bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4358bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 4359bf215546Sopenharmony_ci fs_inst *inst = 4360bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL, 4361bf215546Sopenharmony_ci dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 4362bf215546Sopenharmony_ci inst->size_written = instr->num_components * dispatch_width * 4; 4363bf215546Sopenharmony_ci } else if (instr->intrinsic == nir_intrinsic_image_store || 4364bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_bindless_image_store) { 4365bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4366bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]); 4367bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 4368bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, 4369bf215546Sopenharmony_ci fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 4370bf215546Sopenharmony_ci } else { 4371bf215546Sopenharmony_ci unsigned num_srcs = info->num_srcs; 4372bf215546Sopenharmony_ci int op = brw_aop_for_nir_intrinsic(instr); 4373bf215546Sopenharmony_ci if (op == BRW_AOP_INC || op == BRW_AOP_DEC) { 4374bf215546Sopenharmony_ci assert(num_srcs == 4); 4375bf215546Sopenharmony_ci num_srcs = 3; 4376bf215546Sopenharmony_ci } 4377bf215546Sopenharmony_ci 4378bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 4379bf215546Sopenharmony_ci 4380bf215546Sopenharmony_ci fs_reg data; 4381bf215546Sopenharmony_ci if (num_srcs >= 4) 4382bf215546Sopenharmony_ci data = get_nir_src(instr->src[3]); 4383bf215546Sopenharmony_ci if (num_srcs >= 5) { 4384bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(data.type, 2); 4385bf215546Sopenharmony_ci fs_reg sources[2] = { data, get_nir_src(instr->src[4]) }; 4386bf215546Sopenharmony_ci bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 4387bf215546Sopenharmony_ci data = tmp; 4388bf215546Sopenharmony_ci } 4389bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_DATA] = data; 4390bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 4391bf215546Sopenharmony_ci 4392bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL, 4393bf215546Sopenharmony_ci dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 4394bf215546Sopenharmony_ci } 4395bf215546Sopenharmony_ci break; 4396bf215546Sopenharmony_ci } 4397bf215546Sopenharmony_ci 4398bf215546Sopenharmony_ci case nir_intrinsic_image_size: 4399bf215546Sopenharmony_ci case nir_intrinsic_bindless_image_size: { 4400bf215546Sopenharmony_ci /* Cube image sizes should have previously been lowered to a 2D array */ 4401bf215546Sopenharmony_ci assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE); 4402bf215546Sopenharmony_ci 4403bf215546Sopenharmony_ci /* Unlike the [un]typed load and store opcodes, the TXS that this turns 4404bf215546Sopenharmony_ci * into will handle the binding table index for us in the geneerator. 4405bf215546Sopenharmony_ci * Incidentally, this means that we can handle bindless with exactly the 4406bf215546Sopenharmony_ci * same code. 4407bf215546Sopenharmony_ci */ 4408bf215546Sopenharmony_ci fs_reg image = retype(get_nir_src_imm(instr->src[0]), 4409bf215546Sopenharmony_ci BRW_REGISTER_TYPE_UD); 4410bf215546Sopenharmony_ci image = bld.emit_uniformize(image); 4411bf215546Sopenharmony_ci 4412bf215546Sopenharmony_ci assert(nir_src_as_uint(instr->src[1]) == 0); 4413bf215546Sopenharmony_ci 4414bf215546Sopenharmony_ci fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; 4415bf215546Sopenharmony_ci if (instr->intrinsic == nir_intrinsic_image_size) 4416bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_SURFACE] = image; 4417bf215546Sopenharmony_ci else 4418bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image; 4419bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0); 4420bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0); 4421bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0); 4422bf215546Sopenharmony_ci 4423bf215546Sopenharmony_ci /* Since the image size is always uniform, we can just emit a SIMD8 4424bf215546Sopenharmony_ci * query instruction and splat the result out. 4425bf215546Sopenharmony_ci */ 4426bf215546Sopenharmony_ci const fs_builder ubld = bld.exec_all().group(8, 0); 4427bf215546Sopenharmony_ci 4428bf215546Sopenharmony_ci fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4); 4429bf215546Sopenharmony_ci fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL, 4430bf215546Sopenharmony_ci tmp, srcs, ARRAY_SIZE(srcs)); 4431bf215546Sopenharmony_ci inst->size_written = 4 * REG_SIZE; 4432bf215546Sopenharmony_ci 4433bf215546Sopenharmony_ci for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) { 4434bf215546Sopenharmony_ci bld.MOV(offset(retype(dest, tmp.type), bld, c), 4435bf215546Sopenharmony_ci component(offset(tmp, ubld, c), 0)); 4436bf215546Sopenharmony_ci } 4437bf215546Sopenharmony_ci break; 4438bf215546Sopenharmony_ci } 4439bf215546Sopenharmony_ci 4440bf215546Sopenharmony_ci case nir_intrinsic_image_load_raw_intel: { 4441bf215546Sopenharmony_ci fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4442bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4443bf215546Sopenharmony_ci get_nir_image_intrinsic_image(bld, instr); 4444bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 4445bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 4446bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4447bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 4448bf215546Sopenharmony_ci 4449bf215546Sopenharmony_ci fs_inst *inst = 4450bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 4451bf215546Sopenharmony_ci dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 4452bf215546Sopenharmony_ci inst->size_written = instr->num_components * dispatch_width * 4; 4453bf215546Sopenharmony_ci break; 4454bf215546Sopenharmony_ci } 4455bf215546Sopenharmony_ci 4456bf215546Sopenharmony_ci case nir_intrinsic_image_store_raw_intel: { 4457bf215546Sopenharmony_ci fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4458bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4459bf215546Sopenharmony_ci get_nir_image_intrinsic_image(bld, instr); 4460bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 4461bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[2]); 4462bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 4463bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4464bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 4465bf215546Sopenharmony_ci 4466bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 4467bf215546Sopenharmony_ci fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 4468bf215546Sopenharmony_ci break; 4469bf215546Sopenharmony_ci } 4470bf215546Sopenharmony_ci 4471bf215546Sopenharmony_ci case nir_intrinsic_scoped_barrier: 4472bf215546Sopenharmony_ci assert(nir_intrinsic_execution_scope(instr) == NIR_SCOPE_NONE); 4473bf215546Sopenharmony_ci FALLTHROUGH; 4474bf215546Sopenharmony_ci case nir_intrinsic_group_memory_barrier: 4475bf215546Sopenharmony_ci case nir_intrinsic_memory_barrier_shared: 4476bf215546Sopenharmony_ci case nir_intrinsic_memory_barrier_buffer: 4477bf215546Sopenharmony_ci case nir_intrinsic_memory_barrier_image: 4478bf215546Sopenharmony_ci case nir_intrinsic_memory_barrier: 4479bf215546Sopenharmony_ci case nir_intrinsic_begin_invocation_interlock: 4480bf215546Sopenharmony_ci case nir_intrinsic_end_invocation_interlock: { 4481bf215546Sopenharmony_ci bool ugm_fence, slm_fence, tgm_fence, urb_fence; 4482bf215546Sopenharmony_ci const enum opcode opcode = 4483bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_begin_invocation_interlock ? 4484bf215546Sopenharmony_ci SHADER_OPCODE_INTERLOCK : SHADER_OPCODE_MEMORY_FENCE; 4485bf215546Sopenharmony_ci 4486bf215546Sopenharmony_ci switch (instr->intrinsic) { 4487bf215546Sopenharmony_ci case nir_intrinsic_scoped_barrier: { 4488bf215546Sopenharmony_ci nir_variable_mode modes = nir_intrinsic_memory_modes(instr); 4489bf215546Sopenharmony_ci ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global); 4490bf215546Sopenharmony_ci slm_fence = modes & nir_var_mem_shared; 4491bf215546Sopenharmony_ci tgm_fence = modes & nir_var_image; 4492bf215546Sopenharmony_ci urb_fence = modes & (nir_var_shader_out | nir_var_mem_task_payload); 4493bf215546Sopenharmony_ci break; 4494bf215546Sopenharmony_ci } 4495bf215546Sopenharmony_ci 4496bf215546Sopenharmony_ci case nir_intrinsic_begin_invocation_interlock: 4497bf215546Sopenharmony_ci case nir_intrinsic_end_invocation_interlock: 4498bf215546Sopenharmony_ci /* For beginInvocationInterlockARB(), we will generate a memory fence 4499bf215546Sopenharmony_ci * but with a different opcode so that generator can pick SENDC 4500bf215546Sopenharmony_ci * instead of SEND. 4501bf215546Sopenharmony_ci * 4502bf215546Sopenharmony_ci * For endInvocationInterlockARB(), we need to insert a memory fence which 4503bf215546Sopenharmony_ci * stalls in the shader until the memory transactions prior to that 4504bf215546Sopenharmony_ci * fence are complete. This ensures that the shader does not end before 4505bf215546Sopenharmony_ci * any writes from its critical section have landed. Otherwise, you can 4506bf215546Sopenharmony_ci * end up with a case where the next invocation on that pixel properly 4507bf215546Sopenharmony_ci * stalls for previous FS invocation on its pixel to complete but 4508bf215546Sopenharmony_ci * doesn't actually wait for the dataport memory transactions from that 4509bf215546Sopenharmony_ci * thread to land before submitting its own. 4510bf215546Sopenharmony_ci * 4511bf215546Sopenharmony_ci * Handling them here will allow the logic for IVB render cache (see 4512bf215546Sopenharmony_ci * below) to be reused. 4513bf215546Sopenharmony_ci */ 4514bf215546Sopenharmony_ci assert(stage == MESA_SHADER_FRAGMENT); 4515bf215546Sopenharmony_ci ugm_fence = tgm_fence = true; 4516bf215546Sopenharmony_ci slm_fence = urb_fence = false; 4517bf215546Sopenharmony_ci break; 4518bf215546Sopenharmony_ci 4519bf215546Sopenharmony_ci default: 4520bf215546Sopenharmony_ci ugm_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared && 4521bf215546Sopenharmony_ci instr->intrinsic != nir_intrinsic_memory_barrier_image; 4522bf215546Sopenharmony_ci slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier || 4523bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_memory_barrier || 4524bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_memory_barrier_shared; 4525bf215546Sopenharmony_ci tgm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier || 4526bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_memory_barrier || 4527bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_memory_barrier_image; 4528bf215546Sopenharmony_ci urb_fence = instr->intrinsic == nir_intrinsic_memory_barrier; 4529bf215546Sopenharmony_ci break; 4530bf215546Sopenharmony_ci } 4531bf215546Sopenharmony_ci 4532bf215546Sopenharmony_ci if (nir->info.shared_size > 0) { 4533bf215546Sopenharmony_ci assert(gl_shader_stage_uses_workgroup(stage)); 4534bf215546Sopenharmony_ci } else { 4535bf215546Sopenharmony_ci slm_fence = false; 4536bf215546Sopenharmony_ci } 4537bf215546Sopenharmony_ci 4538bf215546Sopenharmony_ci /* If the workgroup fits in a single HW thread, the messages for SLM are 4539bf215546Sopenharmony_ci * processed in-order and the shader itself is already synchronized so 4540bf215546Sopenharmony_ci * the memory fence is not necessary. 4541bf215546Sopenharmony_ci * 4542bf215546Sopenharmony_ci * TODO: Check if applies for many HW threads sharing same Data Port. 4543bf215546Sopenharmony_ci */ 4544bf215546Sopenharmony_ci if (!nir->info.workgroup_size_variable && 4545bf215546Sopenharmony_ci slm_fence && workgroup_size() <= dispatch_width) 4546bf215546Sopenharmony_ci slm_fence = false; 4547bf215546Sopenharmony_ci 4548bf215546Sopenharmony_ci switch (stage) { 4549bf215546Sopenharmony_ci case MESA_SHADER_TESS_CTRL: 4550bf215546Sopenharmony_ci case MESA_SHADER_TASK: 4551bf215546Sopenharmony_ci case MESA_SHADER_MESH: 4552bf215546Sopenharmony_ci break; 4553bf215546Sopenharmony_ci default: 4554bf215546Sopenharmony_ci urb_fence = false; 4555bf215546Sopenharmony_ci break; 4556bf215546Sopenharmony_ci } 4557bf215546Sopenharmony_ci 4558bf215546Sopenharmony_ci unsigned fence_regs_count = 0; 4559bf215546Sopenharmony_ci fs_reg fence_regs[4] = {}; 4560bf215546Sopenharmony_ci 4561bf215546Sopenharmony_ci const fs_builder ubld = bld.group(8, 0); 4562bf215546Sopenharmony_ci 4563bf215546Sopenharmony_ci if (devinfo->has_lsc) { 4564bf215546Sopenharmony_ci assert(devinfo->verx10 >= 125); 4565bf215546Sopenharmony_ci uint32_t desc = 4566bf215546Sopenharmony_ci lsc_fence_descriptor_for_intrinsic(devinfo, instr); 4567bf215546Sopenharmony_ci if (ugm_fence) { 4568bf215546Sopenharmony_ci fence_regs[fence_regs_count++] = 4569bf215546Sopenharmony_ci emit_fence(ubld, opcode, GFX12_SFID_UGM, desc, 4570bf215546Sopenharmony_ci true /* commit_enable */, 4571bf215546Sopenharmony_ci 0 /* bti; ignored for LSC */); 4572bf215546Sopenharmony_ci } 4573bf215546Sopenharmony_ci 4574bf215546Sopenharmony_ci if (tgm_fence) { 4575bf215546Sopenharmony_ci fence_regs[fence_regs_count++] = 4576bf215546Sopenharmony_ci emit_fence(ubld, opcode, GFX12_SFID_TGM, desc, 4577bf215546Sopenharmony_ci true /* commit_enable */, 4578bf215546Sopenharmony_ci 0 /* bti; ignored for LSC */); 4579bf215546Sopenharmony_ci } 4580bf215546Sopenharmony_ci 4581bf215546Sopenharmony_ci if (slm_fence) { 4582bf215546Sopenharmony_ci assert(opcode == SHADER_OPCODE_MEMORY_FENCE); 4583bf215546Sopenharmony_ci fence_regs[fence_regs_count++] = 4584bf215546Sopenharmony_ci emit_fence(ubld, opcode, GFX12_SFID_SLM, desc, 4585bf215546Sopenharmony_ci true /* commit_enable */, 4586bf215546Sopenharmony_ci 0 /* BTI; ignored for LSC */); 4587bf215546Sopenharmony_ci } 4588bf215546Sopenharmony_ci 4589bf215546Sopenharmony_ci if (urb_fence) { 4590bf215546Sopenharmony_ci assert(opcode == SHADER_OPCODE_MEMORY_FENCE); 4591bf215546Sopenharmony_ci fence_regs[fence_regs_count++] = 4592bf215546Sopenharmony_ci emit_fence(ubld, opcode, BRW_SFID_URB, desc, 4593bf215546Sopenharmony_ci true /* commit_enable */, 4594bf215546Sopenharmony_ci 0 /* BTI; ignored for LSC */); 4595bf215546Sopenharmony_ci } 4596bf215546Sopenharmony_ci } else if (devinfo->ver >= 11) { 4597bf215546Sopenharmony_ci if (tgm_fence || ugm_fence || urb_fence) { 4598bf215546Sopenharmony_ci fence_regs[fence_regs_count++] = 4599bf215546Sopenharmony_ci emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0, 4600bf215546Sopenharmony_ci true /* commit_enable HSD ES # 1404612949 */, 4601bf215546Sopenharmony_ci 0 /* BTI = 0 means data cache */); 4602bf215546Sopenharmony_ci } 4603bf215546Sopenharmony_ci 4604bf215546Sopenharmony_ci if (slm_fence) { 4605bf215546Sopenharmony_ci assert(opcode == SHADER_OPCODE_MEMORY_FENCE); 4606bf215546Sopenharmony_ci fence_regs[fence_regs_count++] = 4607bf215546Sopenharmony_ci emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0, 4608bf215546Sopenharmony_ci true /* commit_enable HSD ES # 1404612949 */, 4609bf215546Sopenharmony_ci GFX7_BTI_SLM); 4610bf215546Sopenharmony_ci } 4611bf215546Sopenharmony_ci } else { 4612bf215546Sopenharmony_ci /* Prior to Icelake, they're all lumped into a single cache except on 4613bf215546Sopenharmony_ci * Ivy Bridge and Bay Trail where typed messages actually go through 4614bf215546Sopenharmony_ci * the render cache. There, we need both fences because we may 4615bf215546Sopenharmony_ci * access storage images as either typed or untyped. 4616bf215546Sopenharmony_ci */ 4617bf215546Sopenharmony_ci const bool render_fence = tgm_fence && devinfo->verx10 == 70; 4618bf215546Sopenharmony_ci 4619bf215546Sopenharmony_ci /* Simulation also complains on Gfx9 if we do not enable commit. 4620bf215546Sopenharmony_ci */ 4621bf215546Sopenharmony_ci const bool commit_enable = render_fence || 4622bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_end_invocation_interlock || 4623bf215546Sopenharmony_ci devinfo->ver == 9; 4624bf215546Sopenharmony_ci 4625bf215546Sopenharmony_ci if (tgm_fence || ugm_fence || slm_fence || urb_fence) { 4626bf215546Sopenharmony_ci fence_regs[fence_regs_count++] = 4627bf215546Sopenharmony_ci emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0, 4628bf215546Sopenharmony_ci commit_enable, 0 /* BTI */); 4629bf215546Sopenharmony_ci } 4630bf215546Sopenharmony_ci 4631bf215546Sopenharmony_ci if (render_fence) { 4632bf215546Sopenharmony_ci fence_regs[fence_regs_count++] = 4633bf215546Sopenharmony_ci emit_fence(ubld, opcode, GFX6_SFID_DATAPORT_RENDER_CACHE, 0, 4634bf215546Sopenharmony_ci commit_enable, /* bti */ 0); 4635bf215546Sopenharmony_ci } 4636bf215546Sopenharmony_ci } 4637bf215546Sopenharmony_ci 4638bf215546Sopenharmony_ci assert(fence_regs_count <= ARRAY_SIZE(fence_regs)); 4639bf215546Sopenharmony_ci 4640bf215546Sopenharmony_ci /* There are four cases where we want to insert a stall: 4641bf215546Sopenharmony_ci * 4642bf215546Sopenharmony_ci * 1. If we're a nir_intrinsic_end_invocation_interlock. This is 4643bf215546Sopenharmony_ci * required to ensure that the shader EOT doesn't happen until 4644bf215546Sopenharmony_ci * after the fence returns. Otherwise, we might end up with the 4645bf215546Sopenharmony_ci * next shader invocation for that pixel not respecting our fence 4646bf215546Sopenharmony_ci * because it may happen on a different HW thread. 4647bf215546Sopenharmony_ci * 4648bf215546Sopenharmony_ci * 2. If we have multiple fences. This is required to ensure that 4649bf215546Sopenharmony_ci * they all complete and nothing gets weirdly out-of-order. 4650bf215546Sopenharmony_ci * 4651bf215546Sopenharmony_ci * 3. If we have no fences. In this case, we need at least a 4652bf215546Sopenharmony_ci * scheduling barrier to keep the compiler from moving things 4653bf215546Sopenharmony_ci * around in an invalid way. 4654bf215546Sopenharmony_ci * 4655bf215546Sopenharmony_ci * 4. On platforms with LSC. 4656bf215546Sopenharmony_ci */ 4657bf215546Sopenharmony_ci if (instr->intrinsic == nir_intrinsic_end_invocation_interlock || 4658bf215546Sopenharmony_ci fence_regs_count != 1 || devinfo->has_lsc) { 4659bf215546Sopenharmony_ci ubld.exec_all().group(1, 0).emit( 4660bf215546Sopenharmony_ci FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), 4661bf215546Sopenharmony_ci fence_regs, fence_regs_count); 4662bf215546Sopenharmony_ci } 4663bf215546Sopenharmony_ci 4664bf215546Sopenharmony_ci break; 4665bf215546Sopenharmony_ci } 4666bf215546Sopenharmony_ci 4667bf215546Sopenharmony_ci case nir_intrinsic_memory_barrier_tcs_patch: 4668bf215546Sopenharmony_ci break; 4669bf215546Sopenharmony_ci 4670bf215546Sopenharmony_ci case nir_intrinsic_shader_clock: { 4671bf215546Sopenharmony_ci /* We cannot do anything if there is an event, so ignore it for now */ 4672bf215546Sopenharmony_ci const fs_reg shader_clock = get_timestamp(bld); 4673bf215546Sopenharmony_ci const fs_reg srcs[] = { component(shader_clock, 0), 4674bf215546Sopenharmony_ci component(shader_clock, 1) }; 4675bf215546Sopenharmony_ci bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0); 4676bf215546Sopenharmony_ci break; 4677bf215546Sopenharmony_ci } 4678bf215546Sopenharmony_ci 4679bf215546Sopenharmony_ci case nir_intrinsic_image_samples: 4680bf215546Sopenharmony_ci /* The driver does not support multi-sampled images. */ 4681bf215546Sopenharmony_ci bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1)); 4682bf215546Sopenharmony_ci break; 4683bf215546Sopenharmony_ci 4684bf215546Sopenharmony_ci case nir_intrinsic_load_reloc_const_intel: { 4685bf215546Sopenharmony_ci uint32_t id = nir_intrinsic_param_idx(instr); 4686bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_MOV_RELOC_IMM, 4687bf215546Sopenharmony_ci dest, brw_imm_ud(id)); 4688bf215546Sopenharmony_ci break; 4689bf215546Sopenharmony_ci } 4690bf215546Sopenharmony_ci 4691bf215546Sopenharmony_ci case nir_intrinsic_load_uniform: { 4692bf215546Sopenharmony_ci /* Offsets are in bytes but they should always aligned to 4693bf215546Sopenharmony_ci * the type size 4694bf215546Sopenharmony_ci */ 4695bf215546Sopenharmony_ci assert(instr->const_index[0] % 4 == 0 || 4696bf215546Sopenharmony_ci instr->const_index[0] % type_sz(dest.type) == 0); 4697bf215546Sopenharmony_ci 4698bf215546Sopenharmony_ci fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type); 4699bf215546Sopenharmony_ci 4700bf215546Sopenharmony_ci if (nir_src_is_const(instr->src[0])) { 4701bf215546Sopenharmony_ci unsigned load_offset = nir_src_as_uint(instr->src[0]); 4702bf215546Sopenharmony_ci assert(load_offset % type_sz(dest.type) == 0); 4703bf215546Sopenharmony_ci /* For 16-bit types we add the module of the const_index[0] 4704bf215546Sopenharmony_ci * offset to access to not 32-bit aligned element 4705bf215546Sopenharmony_ci */ 4706bf215546Sopenharmony_ci src.offset = load_offset + instr->const_index[0] % 4; 4707bf215546Sopenharmony_ci 4708bf215546Sopenharmony_ci for (unsigned j = 0; j < instr->num_components; j++) { 4709bf215546Sopenharmony_ci bld.MOV(offset(dest, bld, j), offset(src, bld, j)); 4710bf215546Sopenharmony_ci } 4711bf215546Sopenharmony_ci } else { 4712bf215546Sopenharmony_ci fs_reg indirect = retype(get_nir_src(instr->src[0]), 4713bf215546Sopenharmony_ci BRW_REGISTER_TYPE_UD); 4714bf215546Sopenharmony_ci 4715bf215546Sopenharmony_ci /* We need to pass a size to the MOV_INDIRECT but we don't want it to 4716bf215546Sopenharmony_ci * go past the end of the uniform. In order to keep the n'th 4717bf215546Sopenharmony_ci * component from running past, we subtract off the size of all but 4718bf215546Sopenharmony_ci * one component of the vector. 4719bf215546Sopenharmony_ci */ 4720bf215546Sopenharmony_ci assert(instr->const_index[1] >= 4721bf215546Sopenharmony_ci instr->num_components * (int) type_sz(dest.type)); 4722bf215546Sopenharmony_ci unsigned read_size = instr->const_index[1] - 4723bf215546Sopenharmony_ci (instr->num_components - 1) * type_sz(dest.type); 4724bf215546Sopenharmony_ci 4725bf215546Sopenharmony_ci bool supports_64bit_indirects = 4726bf215546Sopenharmony_ci devinfo->platform != INTEL_PLATFORM_CHV && !intel_device_info_is_9lp(devinfo); 4727bf215546Sopenharmony_ci 4728bf215546Sopenharmony_ci if (type_sz(dest.type) != 8 || supports_64bit_indirects) { 4729bf215546Sopenharmony_ci for (unsigned j = 0; j < instr->num_components; j++) { 4730bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_MOV_INDIRECT, 4731bf215546Sopenharmony_ci offset(dest, bld, j), offset(src, bld, j), 4732bf215546Sopenharmony_ci indirect, brw_imm_ud(read_size)); 4733bf215546Sopenharmony_ci } 4734bf215546Sopenharmony_ci } else { 4735bf215546Sopenharmony_ci const unsigned num_mov_indirects = 4736bf215546Sopenharmony_ci type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD); 4737bf215546Sopenharmony_ci /* We read a little bit less per MOV INDIRECT, as they are now 4738bf215546Sopenharmony_ci * 32-bits ones instead of 64-bit. Fix read_size then. 4739bf215546Sopenharmony_ci */ 4740bf215546Sopenharmony_ci const unsigned read_size_32bit = read_size - 4741bf215546Sopenharmony_ci (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD); 4742bf215546Sopenharmony_ci for (unsigned j = 0; j < instr->num_components; j++) { 4743bf215546Sopenharmony_ci for (unsigned i = 0; i < num_mov_indirects; i++) { 4744bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_MOV_INDIRECT, 4745bf215546Sopenharmony_ci subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i), 4746bf215546Sopenharmony_ci subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i), 4747bf215546Sopenharmony_ci indirect, brw_imm_ud(read_size_32bit)); 4748bf215546Sopenharmony_ci } 4749bf215546Sopenharmony_ci } 4750bf215546Sopenharmony_ci } 4751bf215546Sopenharmony_ci } 4752bf215546Sopenharmony_ci break; 4753bf215546Sopenharmony_ci } 4754bf215546Sopenharmony_ci 4755bf215546Sopenharmony_ci case nir_intrinsic_load_ubo: { 4756bf215546Sopenharmony_ci fs_reg surf_index; 4757bf215546Sopenharmony_ci if (nir_src_is_const(instr->src[0])) { 4758bf215546Sopenharmony_ci const unsigned index = nir_src_as_uint(instr->src[0]); 4759bf215546Sopenharmony_ci surf_index = brw_imm_ud(index); 4760bf215546Sopenharmony_ci } else { 4761bf215546Sopenharmony_ci /* The block index is not a constant. Evaluate the index expression 4762bf215546Sopenharmony_ci * per-channel and add the base UBO index; we have to select a value 4763bf215546Sopenharmony_ci * from any live channel. 4764bf215546Sopenharmony_ci */ 4765bf215546Sopenharmony_ci surf_index = vgrf(glsl_type::uint_type); 4766bf215546Sopenharmony_ci bld.MOV(surf_index, get_nir_src(instr->src[0])); 4767bf215546Sopenharmony_ci surf_index = bld.emit_uniformize(surf_index); 4768bf215546Sopenharmony_ci } 4769bf215546Sopenharmony_ci 4770bf215546Sopenharmony_ci if (!nir_src_is_const(instr->src[1])) { 4771bf215546Sopenharmony_ci fs_reg base_offset = retype(get_nir_src(instr->src[1]), 4772bf215546Sopenharmony_ci BRW_REGISTER_TYPE_UD); 4773bf215546Sopenharmony_ci 4774bf215546Sopenharmony_ci for (int i = 0; i < instr->num_components; i++) 4775bf215546Sopenharmony_ci VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index, 4776bf215546Sopenharmony_ci base_offset, i * type_sz(dest.type), 4777bf215546Sopenharmony_ci nir_dest_bit_size(instr->dest) / 8); 4778bf215546Sopenharmony_ci 4779bf215546Sopenharmony_ci prog_data->has_ubo_pull = true; 4780bf215546Sopenharmony_ci } else { 4781bf215546Sopenharmony_ci /* Even if we are loading doubles, a pull constant load will load 4782bf215546Sopenharmony_ci * a 32-bit vec4, so should only reserve vgrf space for that. If we 4783bf215546Sopenharmony_ci * need to load a full dvec4 we will have to emit 2 loads. This is 4784bf215546Sopenharmony_ci * similar to demote_pull_constants(), except that in that case we 4785bf215546Sopenharmony_ci * see individual accesses to each component of the vector and then 4786bf215546Sopenharmony_ci * we let CSE deal with duplicate loads. Here we see a vector access 4787bf215546Sopenharmony_ci * and we have to split it if necessary. 4788bf215546Sopenharmony_ci */ 4789bf215546Sopenharmony_ci const unsigned type_size = type_sz(dest.type); 4790bf215546Sopenharmony_ci const unsigned load_offset = nir_src_as_uint(instr->src[1]); 4791bf215546Sopenharmony_ci 4792bf215546Sopenharmony_ci /* See if we've selected this as a push constant candidate */ 4793bf215546Sopenharmony_ci if (nir_src_is_const(instr->src[0])) { 4794bf215546Sopenharmony_ci const unsigned ubo_block = nir_src_as_uint(instr->src[0]); 4795bf215546Sopenharmony_ci const unsigned offset_256b = load_offset / 32; 4796bf215546Sopenharmony_ci 4797bf215546Sopenharmony_ci fs_reg push_reg; 4798bf215546Sopenharmony_ci for (int i = 0; i < 4; i++) { 4799bf215546Sopenharmony_ci const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; 4800bf215546Sopenharmony_ci if (range->block == ubo_block && 4801bf215546Sopenharmony_ci offset_256b >= range->start && 4802bf215546Sopenharmony_ci offset_256b < range->start + range->length) { 4803bf215546Sopenharmony_ci 4804bf215546Sopenharmony_ci push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type); 4805bf215546Sopenharmony_ci push_reg.offset = load_offset - 32 * range->start; 4806bf215546Sopenharmony_ci break; 4807bf215546Sopenharmony_ci } 4808bf215546Sopenharmony_ci } 4809bf215546Sopenharmony_ci 4810bf215546Sopenharmony_ci if (push_reg.file != BAD_FILE) { 4811bf215546Sopenharmony_ci for (unsigned i = 0; i < instr->num_components; i++) { 4812bf215546Sopenharmony_ci bld.MOV(offset(dest, bld, i), 4813bf215546Sopenharmony_ci byte_offset(push_reg, i * type_size)); 4814bf215546Sopenharmony_ci } 4815bf215546Sopenharmony_ci break; 4816bf215546Sopenharmony_ci } 4817bf215546Sopenharmony_ci } 4818bf215546Sopenharmony_ci 4819bf215546Sopenharmony_ci prog_data->has_ubo_pull = true; 4820bf215546Sopenharmony_ci 4821bf215546Sopenharmony_ci const unsigned block_sz = 64; /* Fetch one cacheline at a time. */ 4822bf215546Sopenharmony_ci const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0); 4823bf215546Sopenharmony_ci const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD); 4824bf215546Sopenharmony_ci 4825bf215546Sopenharmony_ci for (unsigned c = 0; c < instr->num_components;) { 4826bf215546Sopenharmony_ci const unsigned base = load_offset + c * type_size; 4827bf215546Sopenharmony_ci /* Number of usable components in the next block-aligned load. */ 4828bf215546Sopenharmony_ci const unsigned count = MIN2(instr->num_components - c, 4829bf215546Sopenharmony_ci (block_sz - base % block_sz) / type_size); 4830bf215546Sopenharmony_ci 4831bf215546Sopenharmony_ci ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 4832bf215546Sopenharmony_ci packed_consts, surf_index, 4833bf215546Sopenharmony_ci brw_imm_ud(base & ~(block_sz - 1))); 4834bf215546Sopenharmony_ci 4835bf215546Sopenharmony_ci const fs_reg consts = 4836bf215546Sopenharmony_ci retype(byte_offset(packed_consts, base & (block_sz - 1)), 4837bf215546Sopenharmony_ci dest.type); 4838bf215546Sopenharmony_ci 4839bf215546Sopenharmony_ci for (unsigned d = 0; d < count; d++) 4840bf215546Sopenharmony_ci bld.MOV(offset(dest, bld, c + d), component(consts, d)); 4841bf215546Sopenharmony_ci 4842bf215546Sopenharmony_ci c += count; 4843bf215546Sopenharmony_ci } 4844bf215546Sopenharmony_ci } 4845bf215546Sopenharmony_ci break; 4846bf215546Sopenharmony_ci } 4847bf215546Sopenharmony_ci 4848bf215546Sopenharmony_ci case nir_intrinsic_load_global: 4849bf215546Sopenharmony_ci case nir_intrinsic_load_global_constant: { 4850bf215546Sopenharmony_ci assert(devinfo->ver >= 8); 4851bf215546Sopenharmony_ci 4852bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest) <= 32); 4853bf215546Sopenharmony_ci assert(nir_intrinsic_align(instr) > 0); 4854bf215546Sopenharmony_ci fs_reg srcs[A64_LOGICAL_NUM_SRCS]; 4855bf215546Sopenharmony_ci srcs[A64_LOGICAL_ADDRESS] = get_nir_src(instr->src[0]); 4856bf215546Sopenharmony_ci srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */ 4857bf215546Sopenharmony_ci srcs[A64_LOGICAL_ENABLE_HELPERS] = 4858bf215546Sopenharmony_ci brw_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS); 4859bf215546Sopenharmony_ci 4860bf215546Sopenharmony_ci if (nir_dest_bit_size(instr->dest) == 32 && 4861bf215546Sopenharmony_ci nir_intrinsic_align(instr) >= 4) { 4862bf215546Sopenharmony_ci assert(nir_dest_num_components(instr->dest) <= 4); 4863bf215546Sopenharmony_ci 4864bf215546Sopenharmony_ci srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components); 4865bf215546Sopenharmony_ci 4866bf215546Sopenharmony_ci fs_inst *inst = 4867bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, dest, 4868bf215546Sopenharmony_ci srcs, A64_LOGICAL_NUM_SRCS); 4869bf215546Sopenharmony_ci inst->size_written = instr->num_components * 4870bf215546Sopenharmony_ci inst->dst.component_size(inst->exec_size); 4871bf215546Sopenharmony_ci } else { 4872bf215546Sopenharmony_ci const unsigned bit_size = nir_dest_bit_size(instr->dest); 4873bf215546Sopenharmony_ci assert(nir_dest_num_components(instr->dest) == 1); 4874bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 4875bf215546Sopenharmony_ci 4876bf215546Sopenharmony_ci srcs[A64_LOGICAL_ARG] = brw_imm_ud(bit_size); 4877bf215546Sopenharmony_ci 4878bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, tmp, 4879bf215546Sopenharmony_ci srcs, A64_LOGICAL_NUM_SRCS); 4880bf215546Sopenharmony_ci bld.MOV(dest, subscript(tmp, dest.type, 0)); 4881bf215546Sopenharmony_ci } 4882bf215546Sopenharmony_ci break; 4883bf215546Sopenharmony_ci } 4884bf215546Sopenharmony_ci 4885bf215546Sopenharmony_ci case nir_intrinsic_store_global: { 4886bf215546Sopenharmony_ci assert(devinfo->ver >= 8); 4887bf215546Sopenharmony_ci 4888bf215546Sopenharmony_ci assert(nir_src_bit_size(instr->src[0]) <= 32); 4889bf215546Sopenharmony_ci assert(nir_intrinsic_write_mask(instr) == 4890bf215546Sopenharmony_ci (1u << instr->num_components) - 1); 4891bf215546Sopenharmony_ci assert(nir_intrinsic_align(instr) > 0); 4892bf215546Sopenharmony_ci 4893bf215546Sopenharmony_ci fs_reg srcs[A64_LOGICAL_NUM_SRCS]; 4894bf215546Sopenharmony_ci srcs[A64_LOGICAL_ADDRESS] = get_nir_src(instr->src[1]); 4895bf215546Sopenharmony_ci srcs[A64_LOGICAL_ENABLE_HELPERS] = 4896bf215546Sopenharmony_ci brw_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS); 4897bf215546Sopenharmony_ci 4898bf215546Sopenharmony_ci if (nir_src_bit_size(instr->src[0]) == 32 && 4899bf215546Sopenharmony_ci nir_intrinsic_align(instr) >= 4) { 4900bf215546Sopenharmony_ci assert(nir_src_num_components(instr->src[0]) <= 4); 4901bf215546Sopenharmony_ci 4902bf215546Sopenharmony_ci srcs[A64_LOGICAL_SRC] = get_nir_src(instr->src[0]); /* Data */ 4903bf215546Sopenharmony_ci srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components); 4904bf215546Sopenharmony_ci 4905bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, fs_reg(), 4906bf215546Sopenharmony_ci srcs, A64_LOGICAL_NUM_SRCS); 4907bf215546Sopenharmony_ci } else { 4908bf215546Sopenharmony_ci assert(nir_src_num_components(instr->src[0]) == 1); 4909bf215546Sopenharmony_ci const unsigned bit_size = nir_src_bit_size(instr->src[0]); 4910bf215546Sopenharmony_ci brw_reg_type data_type = 4911bf215546Sopenharmony_ci brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 4912bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 4913bf215546Sopenharmony_ci bld.MOV(tmp, retype(get_nir_src(instr->src[0]), data_type)); 4914bf215546Sopenharmony_ci 4915bf215546Sopenharmony_ci srcs[A64_LOGICAL_SRC] = tmp; 4916bf215546Sopenharmony_ci srcs[A64_LOGICAL_ARG] = brw_imm_ud(nir_src_bit_size(instr->src[0])); 4917bf215546Sopenharmony_ci 4918bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, fs_reg(), 4919bf215546Sopenharmony_ci srcs, A64_LOGICAL_NUM_SRCS); 4920bf215546Sopenharmony_ci } 4921bf215546Sopenharmony_ci break; 4922bf215546Sopenharmony_ci } 4923bf215546Sopenharmony_ci 4924bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_add: 4925bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_imin: 4926bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_umin: 4927bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_imax: 4928bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_umax: 4929bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_and: 4930bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_or: 4931bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_xor: 4932bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_exchange: 4933bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_comp_swap: 4934bf215546Sopenharmony_ci nir_emit_global_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr); 4935bf215546Sopenharmony_ci break; 4936bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_fadd: 4937bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_fmin: 4938bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_fmax: 4939bf215546Sopenharmony_ci case nir_intrinsic_global_atomic_fcomp_swap: 4940bf215546Sopenharmony_ci nir_emit_global_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr); 4941bf215546Sopenharmony_ci break; 4942bf215546Sopenharmony_ci 4943bf215546Sopenharmony_ci case nir_intrinsic_load_global_const_block_intel: { 4944bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest) == 32); 4945bf215546Sopenharmony_ci assert(instr->num_components == 8 || instr->num_components == 16); 4946bf215546Sopenharmony_ci 4947bf215546Sopenharmony_ci const fs_builder ubld = bld.exec_all().group(instr->num_components, 0); 4948bf215546Sopenharmony_ci fs_reg load_val; 4949bf215546Sopenharmony_ci 4950bf215546Sopenharmony_ci bool is_pred_const = nir_src_is_const(instr->src[1]); 4951bf215546Sopenharmony_ci if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) { 4952bf215546Sopenharmony_ci /* In this case, we don't want the UBO load at all. We really 4953bf215546Sopenharmony_ci * shouldn't get here but it's possible. 4954bf215546Sopenharmony_ci */ 4955bf215546Sopenharmony_ci load_val = brw_imm_ud(0); 4956bf215546Sopenharmony_ci } else { 4957bf215546Sopenharmony_ci /* The uniform process may stomp the flag so do this first */ 4958bf215546Sopenharmony_ci fs_reg addr = bld.emit_uniformize(get_nir_src(instr->src[0])); 4959bf215546Sopenharmony_ci 4960bf215546Sopenharmony_ci load_val = ubld.vgrf(BRW_REGISTER_TYPE_UD); 4961bf215546Sopenharmony_ci 4962bf215546Sopenharmony_ci /* If the predicate is constant and we got here, then it's non-zero 4963bf215546Sopenharmony_ci * and we don't need the predicate at all. 4964bf215546Sopenharmony_ci */ 4965bf215546Sopenharmony_ci if (!is_pred_const) { 4966bf215546Sopenharmony_ci /* Load the predicate */ 4967bf215546Sopenharmony_ci fs_reg pred = bld.emit_uniformize(get_nir_src(instr->src[1])); 4968bf215546Sopenharmony_ci fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred); 4969bf215546Sopenharmony_ci mov->conditional_mod = BRW_CONDITIONAL_NZ; 4970bf215546Sopenharmony_ci 4971bf215546Sopenharmony_ci /* Stomp the destination with 0 if we're OOB */ 4972bf215546Sopenharmony_ci mov = ubld.MOV(load_val, brw_imm_ud(0)); 4973bf215546Sopenharmony_ci mov->predicate = BRW_PREDICATE_NORMAL; 4974bf215546Sopenharmony_ci mov->predicate_inverse = true; 4975bf215546Sopenharmony_ci } 4976bf215546Sopenharmony_ci 4977bf215546Sopenharmony_ci fs_reg srcs[A64_LOGICAL_NUM_SRCS]; 4978bf215546Sopenharmony_ci srcs[A64_LOGICAL_ADDRESS] = addr; 4979bf215546Sopenharmony_ci srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */ 4980bf215546Sopenharmony_ci srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components); 4981bf215546Sopenharmony_ci /* This intrinsic loads memory from a uniform address, sometimes 4982bf215546Sopenharmony_ci * shared across lanes. We never need to mask it. 4983bf215546Sopenharmony_ci */ 4984bf215546Sopenharmony_ci srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); 4985bf215546Sopenharmony_ci 4986bf215546Sopenharmony_ci fs_inst *load = ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL, 4987bf215546Sopenharmony_ci load_val, srcs, A64_LOGICAL_NUM_SRCS); 4988bf215546Sopenharmony_ci if (!is_pred_const) 4989bf215546Sopenharmony_ci load->predicate = BRW_PREDICATE_NORMAL; 4990bf215546Sopenharmony_ci } 4991bf215546Sopenharmony_ci 4992bf215546Sopenharmony_ci /* From the HW perspective, we just did a single SIMD16 instruction 4993bf215546Sopenharmony_ci * which loaded a dword in each SIMD channel. From NIR's perspective, 4994bf215546Sopenharmony_ci * this instruction returns a vec16. Any users of this data in the 4995bf215546Sopenharmony_ci * back-end will expect a vec16 per SIMD channel so we have to emit a 4996bf215546Sopenharmony_ci * pile of MOVs to resolve this discrepancy. Fortunately, copy-prop 4997bf215546Sopenharmony_ci * will generally clean them up for us. 4998bf215546Sopenharmony_ci */ 4999bf215546Sopenharmony_ci for (unsigned i = 0; i < instr->num_components; i++) { 5000bf215546Sopenharmony_ci bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD), 5001bf215546Sopenharmony_ci component(load_val, i)); 5002bf215546Sopenharmony_ci } 5003bf215546Sopenharmony_ci break; 5004bf215546Sopenharmony_ci } 5005bf215546Sopenharmony_ci 5006bf215546Sopenharmony_ci case nir_intrinsic_load_ssbo: { 5007bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 5008bf215546Sopenharmony_ci 5009bf215546Sopenharmony_ci const unsigned bit_size = nir_dest_bit_size(instr->dest); 5010bf215546Sopenharmony_ci fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5011bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE] = 5012bf215546Sopenharmony_ci get_nir_ssbo_intrinsic_index(bld, instr); 5013bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 5014bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 5015bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 5016bf215546Sopenharmony_ci 5017bf215546Sopenharmony_ci /* Make dest unsigned because that's what the temporary will be */ 5018bf215546Sopenharmony_ci dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 5019bf215546Sopenharmony_ci 5020bf215546Sopenharmony_ci /* Read the vector */ 5021bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest) <= 32); 5022bf215546Sopenharmony_ci assert(nir_intrinsic_align(instr) > 0); 5023bf215546Sopenharmony_ci if (nir_dest_bit_size(instr->dest) == 32 && 5024bf215546Sopenharmony_ci nir_intrinsic_align(instr) >= 4) { 5025bf215546Sopenharmony_ci assert(nir_dest_num_components(instr->dest) <= 4); 5026bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 5027bf215546Sopenharmony_ci fs_inst *inst = 5028bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 5029bf215546Sopenharmony_ci dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 5030bf215546Sopenharmony_ci inst->size_written = instr->num_components * dispatch_width * 4; 5031bf215546Sopenharmony_ci } else { 5032bf215546Sopenharmony_ci assert(nir_dest_num_components(instr->dest) == 1); 5033bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 5034bf215546Sopenharmony_ci 5035bf215546Sopenharmony_ci fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); 5036bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, 5037bf215546Sopenharmony_ci read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); 5038bf215546Sopenharmony_ci bld.MOV(dest, subscript(read_result, dest.type, 0)); 5039bf215546Sopenharmony_ci } 5040bf215546Sopenharmony_ci break; 5041bf215546Sopenharmony_ci } 5042bf215546Sopenharmony_ci 5043bf215546Sopenharmony_ci case nir_intrinsic_store_ssbo: { 5044bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 5045bf215546Sopenharmony_ci 5046bf215546Sopenharmony_ci const unsigned bit_size = nir_src_bit_size(instr->src[0]); 5047bf215546Sopenharmony_ci fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5048bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE] = 5049bf215546Sopenharmony_ci get_nir_ssbo_intrinsic_index(bld, instr); 5050bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[2]); 5051bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 5052bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 5053bf215546Sopenharmony_ci 5054bf215546Sopenharmony_ci fs_reg data = get_nir_src(instr->src[0]); 5055bf215546Sopenharmony_ci data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 5056bf215546Sopenharmony_ci 5057bf215546Sopenharmony_ci assert(nir_src_bit_size(instr->src[0]) <= 32); 5058bf215546Sopenharmony_ci assert(nir_intrinsic_write_mask(instr) == 5059bf215546Sopenharmony_ci (1u << instr->num_components) - 1); 5060bf215546Sopenharmony_ci assert(nir_intrinsic_align(instr) > 0); 5061bf215546Sopenharmony_ci if (nir_src_bit_size(instr->src[0]) == 32 && 5062bf215546Sopenharmony_ci nir_intrinsic_align(instr) >= 4) { 5063bf215546Sopenharmony_ci assert(nir_src_num_components(instr->src[0]) <= 4); 5064bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_DATA] = data; 5065bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 5066bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 5067bf215546Sopenharmony_ci fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 5068bf215546Sopenharmony_ci } else { 5069bf215546Sopenharmony_ci assert(nir_src_num_components(instr->src[0]) == 1); 5070bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 5071bf215546Sopenharmony_ci 5072bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); 5073bf215546Sopenharmony_ci bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); 5074bf215546Sopenharmony_ci 5075bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, 5076bf215546Sopenharmony_ci fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 5077bf215546Sopenharmony_ci } 5078bf215546Sopenharmony_ci break; 5079bf215546Sopenharmony_ci } 5080bf215546Sopenharmony_ci 5081bf215546Sopenharmony_ci case nir_intrinsic_store_output: { 5082bf215546Sopenharmony_ci assert(nir_src_bit_size(instr->src[0]) == 32); 5083bf215546Sopenharmony_ci fs_reg src = get_nir_src(instr->src[0]); 5084bf215546Sopenharmony_ci 5085bf215546Sopenharmony_ci unsigned store_offset = nir_src_as_uint(instr->src[1]); 5086bf215546Sopenharmony_ci unsigned num_components = instr->num_components; 5087bf215546Sopenharmony_ci unsigned first_component = nir_intrinsic_component(instr); 5088bf215546Sopenharmony_ci 5089bf215546Sopenharmony_ci fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld, 5090bf215546Sopenharmony_ci 4 * store_offset), src.type); 5091bf215546Sopenharmony_ci for (unsigned j = 0; j < num_components; j++) { 5092bf215546Sopenharmony_ci bld.MOV(offset(new_dest, bld, j + first_component), 5093bf215546Sopenharmony_ci offset(src, bld, j)); 5094bf215546Sopenharmony_ci } 5095bf215546Sopenharmony_ci break; 5096bf215546Sopenharmony_ci } 5097bf215546Sopenharmony_ci 5098bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_add: 5099bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_imin: 5100bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_umin: 5101bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_imax: 5102bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_umax: 5103bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_and: 5104bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_or: 5105bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_xor: 5106bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_exchange: 5107bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_comp_swap: 5108bf215546Sopenharmony_ci nir_emit_ssbo_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr); 5109bf215546Sopenharmony_ci break; 5110bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_fadd: 5111bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_fmin: 5112bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_fmax: 5113bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_fcomp_swap: 5114bf215546Sopenharmony_ci nir_emit_ssbo_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr); 5115bf215546Sopenharmony_ci break; 5116bf215546Sopenharmony_ci 5117bf215546Sopenharmony_ci case nir_intrinsic_get_ssbo_size: { 5118bf215546Sopenharmony_ci assert(nir_src_num_components(instr->src[0]) == 1); 5119bf215546Sopenharmony_ci unsigned ssbo_index = nir_src_is_const(instr->src[0]) ? 5120bf215546Sopenharmony_ci nir_src_as_uint(instr->src[0]) : 0; 5121bf215546Sopenharmony_ci 5122bf215546Sopenharmony_ci /* A resinfo's sampler message is used to get the buffer size. The 5123bf215546Sopenharmony_ci * SIMD8's writeback message consists of four registers and SIMD16's 5124bf215546Sopenharmony_ci * writeback message consists of 8 destination registers (two per each 5125bf215546Sopenharmony_ci * component). Because we are only interested on the first channel of 5126bf215546Sopenharmony_ci * the first returned component, where resinfo returns the buffer size 5127bf215546Sopenharmony_ci * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of 5128bf215546Sopenharmony_ci * the dispatch width. 5129bf215546Sopenharmony_ci */ 5130bf215546Sopenharmony_ci const fs_builder ubld = bld.exec_all().group(8, 0); 5131bf215546Sopenharmony_ci fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD); 5132bf215546Sopenharmony_ci fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4); 5133bf215546Sopenharmony_ci 5134bf215546Sopenharmony_ci /* Set LOD = 0 */ 5135bf215546Sopenharmony_ci ubld.MOV(src_payload, brw_imm_d(0)); 5136bf215546Sopenharmony_ci 5137bf215546Sopenharmony_ci const unsigned index = ssbo_index; 5138bf215546Sopenharmony_ci fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload, 5139bf215546Sopenharmony_ci src_payload, brw_imm_ud(index)); 5140bf215546Sopenharmony_ci inst->header_size = 0; 5141bf215546Sopenharmony_ci inst->mlen = 1; 5142bf215546Sopenharmony_ci inst->size_written = 4 * REG_SIZE; 5143bf215546Sopenharmony_ci 5144bf215546Sopenharmony_ci /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting: 5145bf215546Sopenharmony_ci * 5146bf215546Sopenharmony_ci * "Out-of-bounds checking is always performed at a DWord granularity. If 5147bf215546Sopenharmony_ci * any part of the DWord is out-of-bounds then the whole DWord is 5148bf215546Sopenharmony_ci * considered out-of-bounds." 5149bf215546Sopenharmony_ci * 5150bf215546Sopenharmony_ci * This implies that types with size smaller than 4-bytes need to be 5151bf215546Sopenharmony_ci * padded if they don't complete the last dword of the buffer. But as we 5152bf215546Sopenharmony_ci * need to maintain the original size we need to reverse the padding 5153bf215546Sopenharmony_ci * calculation to return the correct size to know the number of elements 5154bf215546Sopenharmony_ci * of an unsized array. As we stored in the last two bits of the surface 5155bf215546Sopenharmony_ci * size the needed padding for the buffer, we calculate here the 5156bf215546Sopenharmony_ci * original buffer_size reversing the surface_size calculation: 5157bf215546Sopenharmony_ci * 5158bf215546Sopenharmony_ci * surface_size = isl_align(buffer_size, 4) + 5159bf215546Sopenharmony_ci * (isl_align(buffer_size) - buffer_size) 5160bf215546Sopenharmony_ci * 5161bf215546Sopenharmony_ci * buffer_size = surface_size & ~3 - surface_size & 3 5162bf215546Sopenharmony_ci */ 5163bf215546Sopenharmony_ci 5164bf215546Sopenharmony_ci fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD); 5165bf215546Sopenharmony_ci fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD); 5166bf215546Sopenharmony_ci fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD); 5167bf215546Sopenharmony_ci 5168bf215546Sopenharmony_ci ubld.AND(size_padding, ret_payload, brw_imm_ud(3)); 5169bf215546Sopenharmony_ci ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3)); 5170bf215546Sopenharmony_ci ubld.ADD(buffer_size, size_aligned4, negate(size_padding)); 5171bf215546Sopenharmony_ci 5172bf215546Sopenharmony_ci bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0)); 5173bf215546Sopenharmony_ci break; 5174bf215546Sopenharmony_ci } 5175bf215546Sopenharmony_ci 5176bf215546Sopenharmony_ci case nir_intrinsic_load_scratch: { 5177bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 5178bf215546Sopenharmony_ci 5179bf215546Sopenharmony_ci assert(nir_dest_num_components(instr->dest) == 1); 5180bf215546Sopenharmony_ci const unsigned bit_size = nir_dest_bit_size(instr->dest); 5181bf215546Sopenharmony_ci fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5182bf215546Sopenharmony_ci 5183bf215546Sopenharmony_ci if (devinfo->verx10 >= 125) { 5184bf215546Sopenharmony_ci const fs_builder ubld = bld.exec_all().group(1, 0); 5185bf215546Sopenharmony_ci fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0); 5186bf215546Sopenharmony_ci ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), 5187bf215546Sopenharmony_ci brw_imm_ud(~0x3ffu)); 5188bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle; 5189bf215546Sopenharmony_ci } else if (devinfo->ver >= 8) { 5190bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE] = 5191bf215546Sopenharmony_ci brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT); 5192bf215546Sopenharmony_ci } else { 5193bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS); 5194bf215546Sopenharmony_ci } 5195bf215546Sopenharmony_ci 5196bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 5197bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 5198bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 5199bf215546Sopenharmony_ci const fs_reg nir_addr = get_nir_src(instr->src[0]); 5200bf215546Sopenharmony_ci 5201bf215546Sopenharmony_ci /* Make dest unsigned because that's what the temporary will be */ 5202bf215546Sopenharmony_ci dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 5203bf215546Sopenharmony_ci 5204bf215546Sopenharmony_ci /* Read the vector */ 5205bf215546Sopenharmony_ci assert(nir_dest_num_components(instr->dest) == 1); 5206bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest) <= 32); 5207bf215546Sopenharmony_ci assert(nir_intrinsic_align(instr) > 0); 5208bf215546Sopenharmony_ci if (nir_dest_bit_size(instr->dest) == 32 && 5209bf215546Sopenharmony_ci nir_intrinsic_align(instr) >= 4) { 5210bf215546Sopenharmony_ci if (devinfo->verx10 >= 125) { 5211bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest) == 32 && 5212bf215546Sopenharmony_ci nir_intrinsic_align(instr) >= 4); 5213bf215546Sopenharmony_ci 5214bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 5215bf215546Sopenharmony_ci swizzle_nir_scratch_addr(bld, nir_addr, false); 5216bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); 5217bf215546Sopenharmony_ci 5218bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 5219bf215546Sopenharmony_ci dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 5220bf215546Sopenharmony_ci } else { 5221bf215546Sopenharmony_ci /* The offset for a DWORD scattered message is in dwords. */ 5222bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 5223bf215546Sopenharmony_ci swizzle_nir_scratch_addr(bld, nir_addr, true); 5224bf215546Sopenharmony_ci 5225bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL, 5226bf215546Sopenharmony_ci dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 5227bf215546Sopenharmony_ci } 5228bf215546Sopenharmony_ci } else { 5229bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 5230bf215546Sopenharmony_ci swizzle_nir_scratch_addr(bld, nir_addr, false); 5231bf215546Sopenharmony_ci 5232bf215546Sopenharmony_ci fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); 5233bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, 5234bf215546Sopenharmony_ci read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); 5235bf215546Sopenharmony_ci bld.MOV(dest, read_result); 5236bf215546Sopenharmony_ci } 5237bf215546Sopenharmony_ci 5238bf215546Sopenharmony_ci shader_stats.fill_count += DIV_ROUND_UP(dispatch_width, 16); 5239bf215546Sopenharmony_ci break; 5240bf215546Sopenharmony_ci } 5241bf215546Sopenharmony_ci 5242bf215546Sopenharmony_ci case nir_intrinsic_store_scratch: { 5243bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 5244bf215546Sopenharmony_ci 5245bf215546Sopenharmony_ci assert(nir_src_num_components(instr->src[0]) == 1); 5246bf215546Sopenharmony_ci const unsigned bit_size = nir_src_bit_size(instr->src[0]); 5247bf215546Sopenharmony_ci fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5248bf215546Sopenharmony_ci 5249bf215546Sopenharmony_ci if (devinfo->verx10 >= 125) { 5250bf215546Sopenharmony_ci const fs_builder ubld = bld.exec_all().group(1, 0); 5251bf215546Sopenharmony_ci fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0); 5252bf215546Sopenharmony_ci ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), 5253bf215546Sopenharmony_ci brw_imm_ud(~0x3ffu)); 5254bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle; 5255bf215546Sopenharmony_ci } else if (devinfo->ver >= 8) { 5256bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE] = 5257bf215546Sopenharmony_ci brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT); 5258bf215546Sopenharmony_ci } else { 5259bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS); 5260bf215546Sopenharmony_ci } 5261bf215546Sopenharmony_ci 5262bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 5263bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 5264bf215546Sopenharmony_ci /** 5265bf215546Sopenharmony_ci * While this instruction has side-effects, it should not be predicated 5266bf215546Sopenharmony_ci * on sample mask, because otherwise fs helper invocations would 5267bf215546Sopenharmony_ci * load undefined values from scratch memory. And scratch memory 5268bf215546Sopenharmony_ci * load-stores are produced from operations without side-effects, thus 5269bf215546Sopenharmony_ci * they should not have different behaviour in the helper invocations. 5270bf215546Sopenharmony_ci */ 5271bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 5272bf215546Sopenharmony_ci const fs_reg nir_addr = get_nir_src(instr->src[1]); 5273bf215546Sopenharmony_ci 5274bf215546Sopenharmony_ci fs_reg data = get_nir_src(instr->src[0]); 5275bf215546Sopenharmony_ci data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 5276bf215546Sopenharmony_ci 5277bf215546Sopenharmony_ci assert(nir_src_num_components(instr->src[0]) == 1); 5278bf215546Sopenharmony_ci assert(nir_src_bit_size(instr->src[0]) <= 32); 5279bf215546Sopenharmony_ci assert(nir_intrinsic_write_mask(instr) == 1); 5280bf215546Sopenharmony_ci assert(nir_intrinsic_align(instr) > 0); 5281bf215546Sopenharmony_ci if (nir_src_bit_size(instr->src[0]) == 32 && 5282bf215546Sopenharmony_ci nir_intrinsic_align(instr) >= 4) { 5283bf215546Sopenharmony_ci if (devinfo->verx10 >= 125) { 5284bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_DATA] = data; 5285bf215546Sopenharmony_ci 5286bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 5287bf215546Sopenharmony_ci swizzle_nir_scratch_addr(bld, nir_addr, false); 5288bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); 5289bf215546Sopenharmony_ci 5290bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 5291bf215546Sopenharmony_ci dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 5292bf215546Sopenharmony_ci } else { 5293bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_DATA] = data; 5294bf215546Sopenharmony_ci 5295bf215546Sopenharmony_ci /* The offset for a DWORD scattered message is in dwords. */ 5296bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 5297bf215546Sopenharmony_ci swizzle_nir_scratch_addr(bld, nir_addr, true); 5298bf215546Sopenharmony_ci 5299bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL, 5300bf215546Sopenharmony_ci fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 5301bf215546Sopenharmony_ci } 5302bf215546Sopenharmony_ci } else { 5303bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); 5304bf215546Sopenharmony_ci bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); 5305bf215546Sopenharmony_ci 5306bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 5307bf215546Sopenharmony_ci swizzle_nir_scratch_addr(bld, nir_addr, false); 5308bf215546Sopenharmony_ci 5309bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, 5310bf215546Sopenharmony_ci fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 5311bf215546Sopenharmony_ci } 5312bf215546Sopenharmony_ci shader_stats.spill_count += DIV_ROUND_UP(dispatch_width, 16); 5313bf215546Sopenharmony_ci break; 5314bf215546Sopenharmony_ci } 5315bf215546Sopenharmony_ci 5316bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_size: 5317bf215546Sopenharmony_ci /* This should only happen for fragment shaders because every other case 5318bf215546Sopenharmony_ci * is lowered in NIR so we can optimize on it. 5319bf215546Sopenharmony_ci */ 5320bf215546Sopenharmony_ci assert(stage == MESA_SHADER_FRAGMENT); 5321bf215546Sopenharmony_ci bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(dispatch_width)); 5322bf215546Sopenharmony_ci break; 5323bf215546Sopenharmony_ci 5324bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_invocation: 5325bf215546Sopenharmony_ci bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), 5326bf215546Sopenharmony_ci nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]); 5327bf215546Sopenharmony_ci break; 5328bf215546Sopenharmony_ci 5329bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_eq_mask: 5330bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_ge_mask: 5331bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_gt_mask: 5332bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_le_mask: 5333bf215546Sopenharmony_ci case nir_intrinsic_load_subgroup_lt_mask: 5334bf215546Sopenharmony_ci unreachable("not reached"); 5335bf215546Sopenharmony_ci 5336bf215546Sopenharmony_ci case nir_intrinsic_vote_any: { 5337bf215546Sopenharmony_ci const fs_builder ubld = bld.exec_all().group(1, 0); 5338bf215546Sopenharmony_ci 5339bf215546Sopenharmony_ci /* The any/all predicates do not consider channel enables. To prevent 5340bf215546Sopenharmony_ci * dead channels from affecting the result, we initialize the flag with 5341bf215546Sopenharmony_ci * with the identity value for the logical operation. 5342bf215546Sopenharmony_ci */ 5343bf215546Sopenharmony_ci if (dispatch_width == 32) { 5344bf215546Sopenharmony_ci /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */ 5345bf215546Sopenharmony_ci ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD), 5346bf215546Sopenharmony_ci brw_imm_ud(0)); 5347bf215546Sopenharmony_ci } else { 5348bf215546Sopenharmony_ci ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0)); 5349bf215546Sopenharmony_ci } 5350bf215546Sopenharmony_ci bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ); 5351bf215546Sopenharmony_ci 5352bf215546Sopenharmony_ci /* For some reason, the any/all predicates don't work properly with 5353bf215546Sopenharmony_ci * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H 5354bf215546Sopenharmony_ci * doesn't read the correct subset of the flag register and you end up 5355bf215546Sopenharmony_ci * getting garbage in the second half. Work around this by using a pair 5356bf215546Sopenharmony_ci * of 1-wide MOVs and scattering the result. 5357bf215546Sopenharmony_ci */ 5358bf215546Sopenharmony_ci fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D); 5359bf215546Sopenharmony_ci ubld.MOV(res1, brw_imm_d(0)); 5360bf215546Sopenharmony_ci set_predicate(dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ANY8H : 5361bf215546Sopenharmony_ci dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H : 5362bf215546Sopenharmony_ci BRW_PREDICATE_ALIGN1_ANY32H, 5363bf215546Sopenharmony_ci ubld.MOV(res1, brw_imm_d(-1))); 5364bf215546Sopenharmony_ci 5365bf215546Sopenharmony_ci bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); 5366bf215546Sopenharmony_ci break; 5367bf215546Sopenharmony_ci } 5368bf215546Sopenharmony_ci case nir_intrinsic_vote_all: { 5369bf215546Sopenharmony_ci const fs_builder ubld = bld.exec_all().group(1, 0); 5370bf215546Sopenharmony_ci 5371bf215546Sopenharmony_ci /* The any/all predicates do not consider channel enables. To prevent 5372bf215546Sopenharmony_ci * dead channels from affecting the result, we initialize the flag with 5373bf215546Sopenharmony_ci * with the identity value for the logical operation. 5374bf215546Sopenharmony_ci */ 5375bf215546Sopenharmony_ci if (dispatch_width == 32) { 5376bf215546Sopenharmony_ci /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */ 5377bf215546Sopenharmony_ci ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD), 5378bf215546Sopenharmony_ci brw_imm_ud(0xffffffff)); 5379bf215546Sopenharmony_ci } else { 5380bf215546Sopenharmony_ci ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff)); 5381bf215546Sopenharmony_ci } 5382bf215546Sopenharmony_ci bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ); 5383bf215546Sopenharmony_ci 5384bf215546Sopenharmony_ci /* For some reason, the any/all predicates don't work properly with 5385bf215546Sopenharmony_ci * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H 5386bf215546Sopenharmony_ci * doesn't read the correct subset of the flag register and you end up 5387bf215546Sopenharmony_ci * getting garbage in the second half. Work around this by using a pair 5388bf215546Sopenharmony_ci * of 1-wide MOVs and scattering the result. 5389bf215546Sopenharmony_ci */ 5390bf215546Sopenharmony_ci fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D); 5391bf215546Sopenharmony_ci ubld.MOV(res1, brw_imm_d(0)); 5392bf215546Sopenharmony_ci set_predicate(dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H : 5393bf215546Sopenharmony_ci dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H : 5394bf215546Sopenharmony_ci BRW_PREDICATE_ALIGN1_ALL32H, 5395bf215546Sopenharmony_ci ubld.MOV(res1, brw_imm_d(-1))); 5396bf215546Sopenharmony_ci 5397bf215546Sopenharmony_ci bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); 5398bf215546Sopenharmony_ci break; 5399bf215546Sopenharmony_ci } 5400bf215546Sopenharmony_ci case nir_intrinsic_vote_feq: 5401bf215546Sopenharmony_ci case nir_intrinsic_vote_ieq: { 5402bf215546Sopenharmony_ci fs_reg value = get_nir_src(instr->src[0]); 5403bf215546Sopenharmony_ci if (instr->intrinsic == nir_intrinsic_vote_feq) { 5404bf215546Sopenharmony_ci const unsigned bit_size = nir_src_bit_size(instr->src[0]); 5405bf215546Sopenharmony_ci value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B : 5406bf215546Sopenharmony_ci brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F); 5407bf215546Sopenharmony_ci } 5408bf215546Sopenharmony_ci 5409bf215546Sopenharmony_ci fs_reg uniformized = bld.emit_uniformize(value); 5410bf215546Sopenharmony_ci const fs_builder ubld = bld.exec_all().group(1, 0); 5411bf215546Sopenharmony_ci 5412bf215546Sopenharmony_ci /* The any/all predicates do not consider channel enables. To prevent 5413bf215546Sopenharmony_ci * dead channels from affecting the result, we initialize the flag with 5414bf215546Sopenharmony_ci * with the identity value for the logical operation. 5415bf215546Sopenharmony_ci */ 5416bf215546Sopenharmony_ci if (dispatch_width == 32) { 5417bf215546Sopenharmony_ci /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */ 5418bf215546Sopenharmony_ci ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD), 5419bf215546Sopenharmony_ci brw_imm_ud(0xffffffff)); 5420bf215546Sopenharmony_ci } else { 5421bf215546Sopenharmony_ci ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff)); 5422bf215546Sopenharmony_ci } 5423bf215546Sopenharmony_ci bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z); 5424bf215546Sopenharmony_ci 5425bf215546Sopenharmony_ci /* For some reason, the any/all predicates don't work properly with 5426bf215546Sopenharmony_ci * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H 5427bf215546Sopenharmony_ci * doesn't read the correct subset of the flag register and you end up 5428bf215546Sopenharmony_ci * getting garbage in the second half. Work around this by using a pair 5429bf215546Sopenharmony_ci * of 1-wide MOVs and scattering the result. 5430bf215546Sopenharmony_ci */ 5431bf215546Sopenharmony_ci fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D); 5432bf215546Sopenharmony_ci ubld.MOV(res1, brw_imm_d(0)); 5433bf215546Sopenharmony_ci set_predicate(dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H : 5434bf215546Sopenharmony_ci dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H : 5435bf215546Sopenharmony_ci BRW_PREDICATE_ALIGN1_ALL32H, 5436bf215546Sopenharmony_ci ubld.MOV(res1, brw_imm_d(-1))); 5437bf215546Sopenharmony_ci 5438bf215546Sopenharmony_ci bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); 5439bf215546Sopenharmony_ci break; 5440bf215546Sopenharmony_ci } 5441bf215546Sopenharmony_ci 5442bf215546Sopenharmony_ci case nir_intrinsic_ballot: { 5443bf215546Sopenharmony_ci const fs_reg value = retype(get_nir_src(instr->src[0]), 5444bf215546Sopenharmony_ci BRW_REGISTER_TYPE_UD); 5445bf215546Sopenharmony_ci struct brw_reg flag = brw_flag_reg(0, 0); 5446bf215546Sopenharmony_ci /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well 5447bf215546Sopenharmony_ci * as f0.0. This is a problem for fragment programs as we currently use 5448bf215546Sopenharmony_ci * f0.1 for discards. Fortunately, we don't support SIMD32 fragment 5449bf215546Sopenharmony_ci * programs yet so this isn't a problem. When we do, something will 5450bf215546Sopenharmony_ci * have to change. 5451bf215546Sopenharmony_ci */ 5452bf215546Sopenharmony_ci if (dispatch_width == 32) 5453bf215546Sopenharmony_ci flag.type = BRW_REGISTER_TYPE_UD; 5454bf215546Sopenharmony_ci 5455bf215546Sopenharmony_ci bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u)); 5456bf215546Sopenharmony_ci bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ); 5457bf215546Sopenharmony_ci 5458bf215546Sopenharmony_ci if (instr->dest.ssa.bit_size > 32) { 5459bf215546Sopenharmony_ci dest.type = BRW_REGISTER_TYPE_UQ; 5460bf215546Sopenharmony_ci } else { 5461bf215546Sopenharmony_ci dest.type = BRW_REGISTER_TYPE_UD; 5462bf215546Sopenharmony_ci } 5463bf215546Sopenharmony_ci bld.MOV(dest, flag); 5464bf215546Sopenharmony_ci break; 5465bf215546Sopenharmony_ci } 5466bf215546Sopenharmony_ci 5467bf215546Sopenharmony_ci case nir_intrinsic_read_invocation: { 5468bf215546Sopenharmony_ci const fs_reg value = get_nir_src(instr->src[0]); 5469bf215546Sopenharmony_ci const fs_reg invocation = get_nir_src(instr->src[1]); 5470bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(value.type); 5471bf215546Sopenharmony_ci 5472bf215546Sopenharmony_ci bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value, 5473bf215546Sopenharmony_ci bld.emit_uniformize(invocation)); 5474bf215546Sopenharmony_ci 5475bf215546Sopenharmony_ci bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0))); 5476bf215546Sopenharmony_ci break; 5477bf215546Sopenharmony_ci } 5478bf215546Sopenharmony_ci 5479bf215546Sopenharmony_ci case nir_intrinsic_read_first_invocation: { 5480bf215546Sopenharmony_ci const fs_reg value = get_nir_src(instr->src[0]); 5481bf215546Sopenharmony_ci bld.MOV(retype(dest, value.type), bld.emit_uniformize(value)); 5482bf215546Sopenharmony_ci break; 5483bf215546Sopenharmony_ci } 5484bf215546Sopenharmony_ci 5485bf215546Sopenharmony_ci case nir_intrinsic_shuffle: { 5486bf215546Sopenharmony_ci const fs_reg value = get_nir_src(instr->src[0]); 5487bf215546Sopenharmony_ci const fs_reg index = get_nir_src(instr->src[1]); 5488bf215546Sopenharmony_ci 5489bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index); 5490bf215546Sopenharmony_ci break; 5491bf215546Sopenharmony_ci } 5492bf215546Sopenharmony_ci 5493bf215546Sopenharmony_ci case nir_intrinsic_first_invocation: { 5494bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 5495bf215546Sopenharmony_ci bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp); 5496bf215546Sopenharmony_ci bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), 5497bf215546Sopenharmony_ci fs_reg(component(tmp, 0))); 5498bf215546Sopenharmony_ci break; 5499bf215546Sopenharmony_ci } 5500bf215546Sopenharmony_ci 5501bf215546Sopenharmony_ci case nir_intrinsic_last_invocation: { 5502bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 5503bf215546Sopenharmony_ci bld.exec_all().emit(SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL, tmp); 5504bf215546Sopenharmony_ci bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), 5505bf215546Sopenharmony_ci fs_reg(component(tmp, 0))); 5506bf215546Sopenharmony_ci break; 5507bf215546Sopenharmony_ci } 5508bf215546Sopenharmony_ci 5509bf215546Sopenharmony_ci case nir_intrinsic_quad_broadcast: { 5510bf215546Sopenharmony_ci const fs_reg value = get_nir_src(instr->src[0]); 5511bf215546Sopenharmony_ci const unsigned index = nir_src_as_uint(instr->src[1]); 5512bf215546Sopenharmony_ci 5513bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type), 5514bf215546Sopenharmony_ci value, brw_imm_ud(index), brw_imm_ud(4)); 5515bf215546Sopenharmony_ci break; 5516bf215546Sopenharmony_ci } 5517bf215546Sopenharmony_ci 5518bf215546Sopenharmony_ci case nir_intrinsic_quad_swap_horizontal: { 5519bf215546Sopenharmony_ci const fs_reg value = get_nir_src(instr->src[0]); 5520bf215546Sopenharmony_ci const fs_reg tmp = bld.vgrf(value.type); 5521bf215546Sopenharmony_ci if (devinfo->ver <= 7) { 5522bf215546Sopenharmony_ci /* The hardware doesn't seem to support these crazy regions with 5523bf215546Sopenharmony_ci * compressed instructions on gfx7 and earlier so we fall back to 5524bf215546Sopenharmony_ci * using quad swizzles. Fortunately, we don't support 64-bit 5525bf215546Sopenharmony_ci * anything in Vulkan on gfx7. 5526bf215546Sopenharmony_ci */ 5527bf215546Sopenharmony_ci assert(nir_src_bit_size(instr->src[0]) == 32); 5528bf215546Sopenharmony_ci const fs_builder ubld = bld.exec_all(); 5529bf215546Sopenharmony_ci ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, 5530bf215546Sopenharmony_ci brw_imm_ud(BRW_SWIZZLE4(1,0,3,2))); 5531bf215546Sopenharmony_ci bld.MOV(retype(dest, value.type), tmp); 5532bf215546Sopenharmony_ci } else { 5533bf215546Sopenharmony_ci const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0); 5534bf215546Sopenharmony_ci 5535bf215546Sopenharmony_ci const fs_reg src_left = horiz_stride(value, 2); 5536bf215546Sopenharmony_ci const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2); 5537bf215546Sopenharmony_ci const fs_reg tmp_left = horiz_stride(tmp, 2); 5538bf215546Sopenharmony_ci const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2); 5539bf215546Sopenharmony_ci 5540bf215546Sopenharmony_ci ubld.MOV(tmp_left, src_right); 5541bf215546Sopenharmony_ci ubld.MOV(tmp_right, src_left); 5542bf215546Sopenharmony_ci 5543bf215546Sopenharmony_ci } 5544bf215546Sopenharmony_ci bld.MOV(retype(dest, value.type), tmp); 5545bf215546Sopenharmony_ci break; 5546bf215546Sopenharmony_ci } 5547bf215546Sopenharmony_ci 5548bf215546Sopenharmony_ci case nir_intrinsic_quad_swap_vertical: { 5549bf215546Sopenharmony_ci const fs_reg value = get_nir_src(instr->src[0]); 5550bf215546Sopenharmony_ci if (nir_src_bit_size(instr->src[0]) == 32) { 5551bf215546Sopenharmony_ci /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */ 5552bf215546Sopenharmony_ci const fs_reg tmp = bld.vgrf(value.type); 5553bf215546Sopenharmony_ci const fs_builder ubld = bld.exec_all(); 5554bf215546Sopenharmony_ci ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, 5555bf215546Sopenharmony_ci brw_imm_ud(BRW_SWIZZLE4(2,3,0,1))); 5556bf215546Sopenharmony_ci bld.MOV(retype(dest, value.type), tmp); 5557bf215546Sopenharmony_ci } else { 5558bf215546Sopenharmony_ci /* For larger data types, we have to either emit dispatch_width many 5559bf215546Sopenharmony_ci * MOVs or else fall back to doing indirects. 5560bf215546Sopenharmony_ci */ 5561bf215546Sopenharmony_ci fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); 5562bf215546Sopenharmony_ci bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], 5563bf215546Sopenharmony_ci brw_imm_w(0x2)); 5564bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx); 5565bf215546Sopenharmony_ci } 5566bf215546Sopenharmony_ci break; 5567bf215546Sopenharmony_ci } 5568bf215546Sopenharmony_ci 5569bf215546Sopenharmony_ci case nir_intrinsic_quad_swap_diagonal: { 5570bf215546Sopenharmony_ci const fs_reg value = get_nir_src(instr->src[0]); 5571bf215546Sopenharmony_ci if (nir_src_bit_size(instr->src[0]) == 32) { 5572bf215546Sopenharmony_ci /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */ 5573bf215546Sopenharmony_ci const fs_reg tmp = bld.vgrf(value.type); 5574bf215546Sopenharmony_ci const fs_builder ubld = bld.exec_all(); 5575bf215546Sopenharmony_ci ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, 5576bf215546Sopenharmony_ci brw_imm_ud(BRW_SWIZZLE4(3,2,1,0))); 5577bf215546Sopenharmony_ci bld.MOV(retype(dest, value.type), tmp); 5578bf215546Sopenharmony_ci } else { 5579bf215546Sopenharmony_ci /* For larger data types, we have to either emit dispatch_width many 5580bf215546Sopenharmony_ci * MOVs or else fall back to doing indirects. 5581bf215546Sopenharmony_ci */ 5582bf215546Sopenharmony_ci fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); 5583bf215546Sopenharmony_ci bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], 5584bf215546Sopenharmony_ci brw_imm_w(0x3)); 5585bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx); 5586bf215546Sopenharmony_ci } 5587bf215546Sopenharmony_ci break; 5588bf215546Sopenharmony_ci } 5589bf215546Sopenharmony_ci 5590bf215546Sopenharmony_ci case nir_intrinsic_reduce: { 5591bf215546Sopenharmony_ci fs_reg src = get_nir_src(instr->src[0]); 5592bf215546Sopenharmony_ci nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr); 5593bf215546Sopenharmony_ci unsigned cluster_size = nir_intrinsic_cluster_size(instr); 5594bf215546Sopenharmony_ci if (cluster_size == 0 || cluster_size > dispatch_width) 5595bf215546Sopenharmony_ci cluster_size = dispatch_width; 5596bf215546Sopenharmony_ci 5597bf215546Sopenharmony_ci /* Figure out the source type */ 5598bf215546Sopenharmony_ci src.type = brw_type_for_nir_type(devinfo, 5599bf215546Sopenharmony_ci (nir_alu_type)(nir_op_infos[redop].input_types[0] | 5600bf215546Sopenharmony_ci nir_src_bit_size(instr->src[0]))); 5601bf215546Sopenharmony_ci 5602bf215546Sopenharmony_ci fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type); 5603bf215546Sopenharmony_ci opcode brw_op = brw_op_for_nir_reduction_op(redop); 5604bf215546Sopenharmony_ci brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop); 5605bf215546Sopenharmony_ci 5606bf215546Sopenharmony_ci /* Set up a register for all of our scratching around and initialize it 5607bf215546Sopenharmony_ci * to reduction operation's identity value. 5608bf215546Sopenharmony_ci */ 5609bf215546Sopenharmony_ci fs_reg scan = bld.vgrf(src.type); 5610bf215546Sopenharmony_ci bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity); 5611bf215546Sopenharmony_ci 5612bf215546Sopenharmony_ci bld.emit_scan(brw_op, scan, cluster_size, cond_mod); 5613bf215546Sopenharmony_ci 5614bf215546Sopenharmony_ci dest.type = src.type; 5615bf215546Sopenharmony_ci if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) { 5616bf215546Sopenharmony_ci /* In this case, CLUSTER_BROADCAST instruction isn't needed because 5617bf215546Sopenharmony_ci * the distance between clusters is at least 2 GRFs. In this case, 5618bf215546Sopenharmony_ci * we don't need the weird striding of the CLUSTER_BROADCAST 5619bf215546Sopenharmony_ci * instruction and can just do regular MOVs. 5620bf215546Sopenharmony_ci */ 5621bf215546Sopenharmony_ci assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0); 5622bf215546Sopenharmony_ci const unsigned groups = 5623bf215546Sopenharmony_ci (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2); 5624bf215546Sopenharmony_ci const unsigned group_size = dispatch_width / groups; 5625bf215546Sopenharmony_ci for (unsigned i = 0; i < groups; i++) { 5626bf215546Sopenharmony_ci const unsigned cluster = (i * group_size) / cluster_size; 5627bf215546Sopenharmony_ci const unsigned comp = cluster * cluster_size + (cluster_size - 1); 5628bf215546Sopenharmony_ci bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size), 5629bf215546Sopenharmony_ci component(scan, comp)); 5630bf215546Sopenharmony_ci } 5631bf215546Sopenharmony_ci } else { 5632bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan, 5633bf215546Sopenharmony_ci brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size)); 5634bf215546Sopenharmony_ci } 5635bf215546Sopenharmony_ci break; 5636bf215546Sopenharmony_ci } 5637bf215546Sopenharmony_ci 5638bf215546Sopenharmony_ci case nir_intrinsic_inclusive_scan: 5639bf215546Sopenharmony_ci case nir_intrinsic_exclusive_scan: { 5640bf215546Sopenharmony_ci fs_reg src = get_nir_src(instr->src[0]); 5641bf215546Sopenharmony_ci nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr); 5642bf215546Sopenharmony_ci 5643bf215546Sopenharmony_ci /* Figure out the source type */ 5644bf215546Sopenharmony_ci src.type = brw_type_for_nir_type(devinfo, 5645bf215546Sopenharmony_ci (nir_alu_type)(nir_op_infos[redop].input_types[0] | 5646bf215546Sopenharmony_ci nir_src_bit_size(instr->src[0]))); 5647bf215546Sopenharmony_ci 5648bf215546Sopenharmony_ci fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type); 5649bf215546Sopenharmony_ci opcode brw_op = brw_op_for_nir_reduction_op(redop); 5650bf215546Sopenharmony_ci brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop); 5651bf215546Sopenharmony_ci 5652bf215546Sopenharmony_ci /* Set up a register for all of our scratching around and initialize it 5653bf215546Sopenharmony_ci * to reduction operation's identity value. 5654bf215546Sopenharmony_ci */ 5655bf215546Sopenharmony_ci fs_reg scan = bld.vgrf(src.type); 5656bf215546Sopenharmony_ci const fs_builder allbld = bld.exec_all(); 5657bf215546Sopenharmony_ci allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity); 5658bf215546Sopenharmony_ci 5659bf215546Sopenharmony_ci if (instr->intrinsic == nir_intrinsic_exclusive_scan) { 5660bf215546Sopenharmony_ci /* Exclusive scan is a bit harder because we have to do an annoying 5661bf215546Sopenharmony_ci * shift of the contents before we can begin. To make things worse, 5662bf215546Sopenharmony_ci * we can't do this with a normal stride; we have to use indirects. 5663bf215546Sopenharmony_ci */ 5664bf215546Sopenharmony_ci fs_reg shifted = bld.vgrf(src.type); 5665bf215546Sopenharmony_ci fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); 5666bf215546Sopenharmony_ci allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], 5667bf215546Sopenharmony_ci brw_imm_w(-1)); 5668bf215546Sopenharmony_ci allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx); 5669bf215546Sopenharmony_ci allbld.group(1, 0).MOV(component(shifted, 0), identity); 5670bf215546Sopenharmony_ci scan = shifted; 5671bf215546Sopenharmony_ci } 5672bf215546Sopenharmony_ci 5673bf215546Sopenharmony_ci bld.emit_scan(brw_op, scan, dispatch_width, cond_mod); 5674bf215546Sopenharmony_ci 5675bf215546Sopenharmony_ci bld.MOV(retype(dest, src.type), scan); 5676bf215546Sopenharmony_ci break; 5677bf215546Sopenharmony_ci } 5678bf215546Sopenharmony_ci 5679bf215546Sopenharmony_ci case nir_intrinsic_load_global_block_intel: { 5680bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest) == 32); 5681bf215546Sopenharmony_ci 5682bf215546Sopenharmony_ci fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[0])); 5683bf215546Sopenharmony_ci 5684bf215546Sopenharmony_ci const fs_builder ubld1 = bld.exec_all().group(1, 0); 5685bf215546Sopenharmony_ci const fs_builder ubld8 = bld.exec_all().group(8, 0); 5686bf215546Sopenharmony_ci const fs_builder ubld16 = bld.exec_all().group(16, 0); 5687bf215546Sopenharmony_ci 5688bf215546Sopenharmony_ci const unsigned total = instr->num_components * dispatch_width; 5689bf215546Sopenharmony_ci unsigned loaded = 0; 5690bf215546Sopenharmony_ci 5691bf215546Sopenharmony_ci while (loaded < total) { 5692bf215546Sopenharmony_ci const unsigned block = 5693bf215546Sopenharmony_ci choose_oword_block_size_dwords(total - loaded); 5694bf215546Sopenharmony_ci const unsigned block_bytes = block * 4; 5695bf215546Sopenharmony_ci 5696bf215546Sopenharmony_ci const fs_builder &ubld = block == 8 ? ubld8 : ubld16; 5697bf215546Sopenharmony_ci 5698bf215546Sopenharmony_ci fs_reg srcs[A64_LOGICAL_NUM_SRCS]; 5699bf215546Sopenharmony_ci srcs[A64_LOGICAL_ADDRESS] = address; 5700bf215546Sopenharmony_ci srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */ 5701bf215546Sopenharmony_ci srcs[A64_LOGICAL_ARG] = brw_imm_ud(block); 5702bf215546Sopenharmony_ci srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(1); 5703bf215546Sopenharmony_ci ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, 5704bf215546Sopenharmony_ci retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD), 5705bf215546Sopenharmony_ci srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes; 5706bf215546Sopenharmony_ci 5707bf215546Sopenharmony_ci increment_a64_address(ubld1, address, block_bytes); 5708bf215546Sopenharmony_ci loaded += block; 5709bf215546Sopenharmony_ci } 5710bf215546Sopenharmony_ci 5711bf215546Sopenharmony_ci assert(loaded == total); 5712bf215546Sopenharmony_ci break; 5713bf215546Sopenharmony_ci } 5714bf215546Sopenharmony_ci 5715bf215546Sopenharmony_ci case nir_intrinsic_store_global_block_intel: { 5716bf215546Sopenharmony_ci assert(nir_src_bit_size(instr->src[0]) == 32); 5717bf215546Sopenharmony_ci 5718bf215546Sopenharmony_ci fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[1])); 5719bf215546Sopenharmony_ci fs_reg src = get_nir_src(instr->src[0]); 5720bf215546Sopenharmony_ci 5721bf215546Sopenharmony_ci const fs_builder ubld1 = bld.exec_all().group(1, 0); 5722bf215546Sopenharmony_ci const fs_builder ubld8 = bld.exec_all().group(8, 0); 5723bf215546Sopenharmony_ci const fs_builder ubld16 = bld.exec_all().group(16, 0); 5724bf215546Sopenharmony_ci 5725bf215546Sopenharmony_ci const unsigned total = instr->num_components * dispatch_width; 5726bf215546Sopenharmony_ci unsigned written = 0; 5727bf215546Sopenharmony_ci 5728bf215546Sopenharmony_ci while (written < total) { 5729bf215546Sopenharmony_ci const unsigned block = 5730bf215546Sopenharmony_ci choose_oword_block_size_dwords(total - written); 5731bf215546Sopenharmony_ci 5732bf215546Sopenharmony_ci fs_reg srcs[A64_LOGICAL_NUM_SRCS]; 5733bf215546Sopenharmony_ci srcs[A64_LOGICAL_ADDRESS] = address; 5734bf215546Sopenharmony_ci srcs[A64_LOGICAL_SRC] = retype(byte_offset(src, written * 4), 5735bf215546Sopenharmony_ci BRW_REGISTER_TYPE_UD); 5736bf215546Sopenharmony_ci srcs[A64_LOGICAL_ARG] = brw_imm_ud(block); 5737bf215546Sopenharmony_ci srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); 5738bf215546Sopenharmony_ci 5739bf215546Sopenharmony_ci const fs_builder &ubld = block == 8 ? ubld8 : ubld16; 5740bf215546Sopenharmony_ci ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL, fs_reg(), 5741bf215546Sopenharmony_ci srcs, A64_LOGICAL_NUM_SRCS); 5742bf215546Sopenharmony_ci 5743bf215546Sopenharmony_ci const unsigned block_bytes = block * 4; 5744bf215546Sopenharmony_ci increment_a64_address(ubld1, address, block_bytes); 5745bf215546Sopenharmony_ci written += block; 5746bf215546Sopenharmony_ci } 5747bf215546Sopenharmony_ci 5748bf215546Sopenharmony_ci assert(written == total); 5749bf215546Sopenharmony_ci break; 5750bf215546Sopenharmony_ci } 5751bf215546Sopenharmony_ci 5752bf215546Sopenharmony_ci case nir_intrinsic_load_shared_block_intel: 5753bf215546Sopenharmony_ci case nir_intrinsic_load_ssbo_block_intel: { 5754bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest) == 32); 5755bf215546Sopenharmony_ci 5756bf215546Sopenharmony_ci const bool is_ssbo = 5757bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_load_ssbo_block_intel; 5758bf215546Sopenharmony_ci fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[is_ssbo ? 1 : 0])); 5759bf215546Sopenharmony_ci 5760bf215546Sopenharmony_ci fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5761bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ? 5762bf215546Sopenharmony_ci get_nir_ssbo_intrinsic_index(bld, instr) : fs_reg(brw_imm_ud(GFX7_BTI_SLM)); 5763bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address; 5764bf215546Sopenharmony_ci 5765bf215546Sopenharmony_ci const fs_builder ubld1 = bld.exec_all().group(1, 0); 5766bf215546Sopenharmony_ci const fs_builder ubld8 = bld.exec_all().group(8, 0); 5767bf215546Sopenharmony_ci const fs_builder ubld16 = bld.exec_all().group(16, 0); 5768bf215546Sopenharmony_ci 5769bf215546Sopenharmony_ci const unsigned total = instr->num_components * dispatch_width; 5770bf215546Sopenharmony_ci unsigned loaded = 0; 5771bf215546Sopenharmony_ci 5772bf215546Sopenharmony_ci while (loaded < total) { 5773bf215546Sopenharmony_ci const unsigned block = 5774bf215546Sopenharmony_ci choose_oword_block_size_dwords(total - loaded); 5775bf215546Sopenharmony_ci const unsigned block_bytes = block * 4; 5776bf215546Sopenharmony_ci 5777bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block); 5778bf215546Sopenharmony_ci 5779bf215546Sopenharmony_ci const fs_builder &ubld = block == 8 ? ubld8 : ubld16; 5780bf215546Sopenharmony_ci ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, 5781bf215546Sopenharmony_ci retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD), 5782bf215546Sopenharmony_ci srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = block_bytes; 5783bf215546Sopenharmony_ci 5784bf215546Sopenharmony_ci ubld1.ADD(address, address, brw_imm_ud(block_bytes)); 5785bf215546Sopenharmony_ci loaded += block; 5786bf215546Sopenharmony_ci } 5787bf215546Sopenharmony_ci 5788bf215546Sopenharmony_ci assert(loaded == total); 5789bf215546Sopenharmony_ci break; 5790bf215546Sopenharmony_ci } 5791bf215546Sopenharmony_ci 5792bf215546Sopenharmony_ci case nir_intrinsic_store_shared_block_intel: 5793bf215546Sopenharmony_ci case nir_intrinsic_store_ssbo_block_intel: { 5794bf215546Sopenharmony_ci assert(nir_src_bit_size(instr->src[0]) == 32); 5795bf215546Sopenharmony_ci 5796bf215546Sopenharmony_ci const bool is_ssbo = 5797bf215546Sopenharmony_ci instr->intrinsic == nir_intrinsic_store_ssbo_block_intel; 5798bf215546Sopenharmony_ci 5799bf215546Sopenharmony_ci fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[is_ssbo ? 2 : 1])); 5800bf215546Sopenharmony_ci fs_reg src = get_nir_src(instr->src[0]); 5801bf215546Sopenharmony_ci 5802bf215546Sopenharmony_ci fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5803bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ? 5804bf215546Sopenharmony_ci get_nir_ssbo_intrinsic_index(bld, instr) : fs_reg(brw_imm_ud(GFX7_BTI_SLM)); 5805bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address; 5806bf215546Sopenharmony_ci 5807bf215546Sopenharmony_ci const fs_builder ubld1 = bld.exec_all().group(1, 0); 5808bf215546Sopenharmony_ci const fs_builder ubld8 = bld.exec_all().group(8, 0); 5809bf215546Sopenharmony_ci const fs_builder ubld16 = bld.exec_all().group(16, 0); 5810bf215546Sopenharmony_ci 5811bf215546Sopenharmony_ci const unsigned total = instr->num_components * dispatch_width; 5812bf215546Sopenharmony_ci unsigned written = 0; 5813bf215546Sopenharmony_ci 5814bf215546Sopenharmony_ci while (written < total) { 5815bf215546Sopenharmony_ci const unsigned block = 5816bf215546Sopenharmony_ci choose_oword_block_size_dwords(total - written); 5817bf215546Sopenharmony_ci 5818bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block); 5819bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_DATA] = 5820bf215546Sopenharmony_ci retype(byte_offset(src, written * 4), BRW_REGISTER_TYPE_UD); 5821bf215546Sopenharmony_ci 5822bf215546Sopenharmony_ci const fs_builder &ubld = block == 8 ? ubld8 : ubld16; 5823bf215546Sopenharmony_ci ubld.emit(SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL, 5824bf215546Sopenharmony_ci fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 5825bf215546Sopenharmony_ci 5826bf215546Sopenharmony_ci const unsigned block_bytes = block * 4; 5827bf215546Sopenharmony_ci ubld1.ADD(address, address, brw_imm_ud(block_bytes)); 5828bf215546Sopenharmony_ci written += block; 5829bf215546Sopenharmony_ci } 5830bf215546Sopenharmony_ci 5831bf215546Sopenharmony_ci assert(written == total); 5832bf215546Sopenharmony_ci break; 5833bf215546Sopenharmony_ci } 5834bf215546Sopenharmony_ci 5835bf215546Sopenharmony_ci case nir_intrinsic_load_topology_id_intel: { 5836bf215546Sopenharmony_ci /* These move around basically every hardware generation, so don' 5837bf215546Sopenharmony_ci * do any >= checks and fail if the platform hasn't explicitly 5838bf215546Sopenharmony_ci * been enabled here. 5839bf215546Sopenharmony_ci */ 5840bf215546Sopenharmony_ci assert(devinfo->ver == 12); 5841bf215546Sopenharmony_ci 5842bf215546Sopenharmony_ci /* Here is what the layout of SR0 looks like on Gfx12 : 5843bf215546Sopenharmony_ci * [13:11] : Slice ID. 5844bf215546Sopenharmony_ci * [10:9] : Dual-SubSlice ID 5845bf215546Sopenharmony_ci * [8] : SubSlice ID 5846bf215546Sopenharmony_ci * [7] : EUID[2] (aka EU Row ID) 5847bf215546Sopenharmony_ci * [6] : Reserved 5848bf215546Sopenharmony_ci * [5:4] : EUID[1:0] 5849bf215546Sopenharmony_ci * [2:0] : Thread ID 5850bf215546Sopenharmony_ci */ 5851bf215546Sopenharmony_ci fs_reg raw_id = bld.vgrf(BRW_REGISTER_TYPE_UD); 5852bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_READ_SR_REG, raw_id, brw_imm_ud(0)); 5853bf215546Sopenharmony_ci switch (nir_intrinsic_base(instr)) { 5854bf215546Sopenharmony_ci case BRW_TOPOLOGY_ID_DSS: 5855bf215546Sopenharmony_ci bld.AND(raw_id, raw_id, brw_imm_ud(0x3fff)); 5856bf215546Sopenharmony_ci /* Get rid of anything below dualsubslice */ 5857bf215546Sopenharmony_ci bld.SHR(retype(dest, BRW_REGISTER_TYPE_UD), raw_id, brw_imm_ud(9)); 5858bf215546Sopenharmony_ci break; 5859bf215546Sopenharmony_ci case BRW_TOPOLOGY_ID_EU_THREAD_SIMD: { 5860bf215546Sopenharmony_ci limit_dispatch_width(16, "Topology helper for Ray queries, " 5861bf215546Sopenharmony_ci "not supported in SIMD32 mode."); 5862bf215546Sopenharmony_ci fs_reg dst = retype(dest, BRW_REGISTER_TYPE_UD); 5863bf215546Sopenharmony_ci 5864bf215546Sopenharmony_ci /* EU[3:0] << 7 5865bf215546Sopenharmony_ci * 5866bf215546Sopenharmony_ci * The 4bit EU[3:0] we need to build for ray query memory addresses 5867bf215546Sopenharmony_ci * computations is a bit odd : 5868bf215546Sopenharmony_ci * 5869bf215546Sopenharmony_ci * EU[1:0] = raw_id[5:4] (identified as EUID[1:0]) 5870bf215546Sopenharmony_ci * EU[2] = raw_id[8] (identified as SubSlice ID) 5871bf215546Sopenharmony_ci * EU[3] = raw_id[7] (identified as EUID[2] or Row ID) 5872bf215546Sopenharmony_ci */ 5873bf215546Sopenharmony_ci { 5874bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 5875bf215546Sopenharmony_ci bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(7, 7))); 5876bf215546Sopenharmony_ci bld.SHL(dst, tmp, brw_imm_ud(3)); 5877bf215546Sopenharmony_ci bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(8, 8))); 5878bf215546Sopenharmony_ci bld.SHL(tmp, tmp, brw_imm_ud(1)); 5879bf215546Sopenharmony_ci bld.OR(dst, dst, tmp); 5880bf215546Sopenharmony_ci bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(5, 4))); 5881bf215546Sopenharmony_ci bld.SHL(tmp, tmp, brw_imm_ud(3)); 5882bf215546Sopenharmony_ci bld.OR(dst, dst, tmp); 5883bf215546Sopenharmony_ci } 5884bf215546Sopenharmony_ci 5885bf215546Sopenharmony_ci /* ThreadID[2:0] << 4 (ThreadID comes from raw_id[2:0]) */ 5886bf215546Sopenharmony_ci { 5887bf215546Sopenharmony_ci bld.AND(raw_id, raw_id, brw_imm_ud(INTEL_MASK(2, 0))); 5888bf215546Sopenharmony_ci bld.SHL(raw_id, raw_id, brw_imm_ud(4)); 5889bf215546Sopenharmony_ci bld.OR(dst, dst, raw_id); 5890bf215546Sopenharmony_ci } 5891bf215546Sopenharmony_ci 5892bf215546Sopenharmony_ci /* LaneID[0:3] << 0 (We build up LaneID by putting the right number 5893bf215546Sopenharmony_ci * in each lane) 5894bf215546Sopenharmony_ci */ 5895bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW); 5896bf215546Sopenharmony_ci const fs_builder ubld8 = bld.exec_all().group(8, 0); 5897bf215546Sopenharmony_ci ubld8.MOV(quarter(tmp, 0), brw_imm_v(0x76543210)); 5898bf215546Sopenharmony_ci if (bld.dispatch_width() == 16) { 5899bf215546Sopenharmony_ci /* Sets 0xfedcba98 to the upper part of the register. */ 5900bf215546Sopenharmony_ci ubld8.ADD(quarter(tmp, 1), quarter(tmp, 0), brw_imm_ud(8)); 5901bf215546Sopenharmony_ci } 5902bf215546Sopenharmony_ci bld.ADD(dst, dst, tmp); 5903bf215546Sopenharmony_ci break; 5904bf215546Sopenharmony_ci } 5905bf215546Sopenharmony_ci default: 5906bf215546Sopenharmony_ci unreachable("Invalid topology id type"); 5907bf215546Sopenharmony_ci } 5908bf215546Sopenharmony_ci break; 5909bf215546Sopenharmony_ci } 5910bf215546Sopenharmony_ci 5911bf215546Sopenharmony_ci case nir_intrinsic_load_btd_stack_id_intel: 5912bf215546Sopenharmony_ci if (stage == MESA_SHADER_COMPUTE) { 5913bf215546Sopenharmony_ci assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids); 5914bf215546Sopenharmony_ci } else { 5915bf215546Sopenharmony_ci assert(brw_shader_stage_is_bindless(stage)); 5916bf215546Sopenharmony_ci } 5917bf215546Sopenharmony_ci /* Stack IDs are always in R1 regardless of whether we're coming from a 5918bf215546Sopenharmony_ci * bindless shader or a regular compute shader. 5919bf215546Sopenharmony_ci */ 5920bf215546Sopenharmony_ci bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), 5921bf215546Sopenharmony_ci retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW)); 5922bf215546Sopenharmony_ci break; 5923bf215546Sopenharmony_ci 5924bf215546Sopenharmony_ci case nir_intrinsic_btd_spawn_intel: 5925bf215546Sopenharmony_ci if (stage == MESA_SHADER_COMPUTE) { 5926bf215546Sopenharmony_ci assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids); 5927bf215546Sopenharmony_ci } else { 5928bf215546Sopenharmony_ci assert(brw_shader_stage_is_bindless(stage)); 5929bf215546Sopenharmony_ci } 5930bf215546Sopenharmony_ci /* Make sure all the pointers to resume shaders have landed where other 5931bf215546Sopenharmony_ci * threads can see them. 5932bf215546Sopenharmony_ci */ 5933bf215546Sopenharmony_ci emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE); 5934bf215546Sopenharmony_ci 5935bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_BTD_SPAWN_LOGICAL, bld.null_reg_ud(), 5936bf215546Sopenharmony_ci bld.emit_uniformize(get_nir_src(instr->src[0])), 5937bf215546Sopenharmony_ci get_nir_src(instr->src[1])); 5938bf215546Sopenharmony_ci break; 5939bf215546Sopenharmony_ci 5940bf215546Sopenharmony_ci case nir_intrinsic_btd_retire_intel: 5941bf215546Sopenharmony_ci if (stage == MESA_SHADER_COMPUTE) { 5942bf215546Sopenharmony_ci assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids); 5943bf215546Sopenharmony_ci } else { 5944bf215546Sopenharmony_ci assert(brw_shader_stage_is_bindless(stage)); 5945bf215546Sopenharmony_ci } 5946bf215546Sopenharmony_ci /* Make sure all the pointers to resume shaders have landed where other 5947bf215546Sopenharmony_ci * threads can see them. 5948bf215546Sopenharmony_ci */ 5949bf215546Sopenharmony_ci emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE); 5950bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_BTD_RETIRE_LOGICAL); 5951bf215546Sopenharmony_ci break; 5952bf215546Sopenharmony_ci 5953bf215546Sopenharmony_ci case nir_intrinsic_trace_ray_intel: { 5954bf215546Sopenharmony_ci const bool synchronous = nir_intrinsic_synchronous(instr); 5955bf215546Sopenharmony_ci assert(brw_shader_stage_is_bindless(stage) || synchronous); 5956bf215546Sopenharmony_ci 5957bf215546Sopenharmony_ci /* Make sure all the previous RT structure writes are visible to the RT 5958bf215546Sopenharmony_ci * fixed function within the DSS, as well as stack pointers to resume 5959bf215546Sopenharmony_ci * shaders. 5960bf215546Sopenharmony_ci */ 5961bf215546Sopenharmony_ci emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE); 5962bf215546Sopenharmony_ci 5963bf215546Sopenharmony_ci fs_reg srcs[RT_LOGICAL_NUM_SRCS]; 5964bf215546Sopenharmony_ci 5965bf215546Sopenharmony_ci fs_reg globals = get_nir_src(instr->src[0]); 5966bf215546Sopenharmony_ci srcs[RT_LOGICAL_SRC_GLOBALS] = bld.emit_uniformize(globals); 5967bf215546Sopenharmony_ci srcs[RT_LOGICAL_SRC_BVH_LEVEL] = get_nir_src(instr->src[1]); 5968bf215546Sopenharmony_ci srcs[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] = get_nir_src(instr->src[2]); 5969bf215546Sopenharmony_ci srcs[RT_LOGICAL_SRC_SYNCHRONOUS] = brw_imm_ud(synchronous); 5970bf215546Sopenharmony_ci bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL, bld.null_reg_ud(), 5971bf215546Sopenharmony_ci srcs, RT_LOGICAL_NUM_SRCS); 5972bf215546Sopenharmony_ci 5973bf215546Sopenharmony_ci /* There is no actual value to use in the destination register of the 5974bf215546Sopenharmony_ci * synchronous trace instruction. All of the communication with the HW 5975bf215546Sopenharmony_ci * unit happens through memory reads/writes. So to ensure that the 5976bf215546Sopenharmony_ci * operation has completed before we go read the results in memory, we 5977bf215546Sopenharmony_ci * need a barrier followed by an invalidate before accessing memory. 5978bf215546Sopenharmony_ci */ 5979bf215546Sopenharmony_ci if (synchronous) { 5980bf215546Sopenharmony_ci bld.emit(BRW_OPCODE_SYNC, bld.null_reg_ud(), brw_imm_ud(TGL_SYNC_ALLWR)); 5981bf215546Sopenharmony_ci emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_INVALIDATE); 5982bf215546Sopenharmony_ci } 5983bf215546Sopenharmony_ci break; 5984bf215546Sopenharmony_ci } 5985bf215546Sopenharmony_ci 5986bf215546Sopenharmony_ci default: 5987bf215546Sopenharmony_ci#ifndef NDEBUG 5988bf215546Sopenharmony_ci assert(instr->intrinsic < nir_num_intrinsics); 5989bf215546Sopenharmony_ci fprintf(stderr, "intrinsic: %s\n", nir_intrinsic_infos[instr->intrinsic].name); 5990bf215546Sopenharmony_ci#endif 5991bf215546Sopenharmony_ci unreachable("unknown intrinsic"); 5992bf215546Sopenharmony_ci } 5993bf215546Sopenharmony_ci} 5994bf215546Sopenharmony_ci 5995bf215546Sopenharmony_civoid 5996bf215546Sopenharmony_cifs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld, 5997bf215546Sopenharmony_ci int op, nir_intrinsic_instr *instr) 5998bf215546Sopenharmony_ci{ 5999bf215546Sopenharmony_ci /* The BTI untyped atomic messages only support 32-bit atomics. If you 6000bf215546Sopenharmony_ci * just look at the big table of messages in the Vol 7 of the SKL PRM, they 6001bf215546Sopenharmony_ci * appear to exist. However, if you look at Vol 2a, there are no message 6002bf215546Sopenharmony_ci * descriptors provided for Qword atomic ops except for A64 messages. 6003bf215546Sopenharmony_ci */ 6004bf215546Sopenharmony_ci assert(nir_dest_bit_size(instr->dest) == 32 || 6005bf215546Sopenharmony_ci (nir_dest_bit_size(instr->dest) == 64 && devinfo->has_lsc)); 6006bf215546Sopenharmony_ci 6007bf215546Sopenharmony_ci fs_reg dest; 6008bf215546Sopenharmony_ci if (nir_intrinsic_infos[instr->intrinsic].has_dest) 6009bf215546Sopenharmony_ci dest = get_nir_dest(instr->dest); 6010bf215546Sopenharmony_ci 6011bf215546Sopenharmony_ci fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 6012bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr); 6013bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 6014bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 6015bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 6016bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 6017bf215546Sopenharmony_ci 6018bf215546Sopenharmony_ci fs_reg data; 6019bf215546Sopenharmony_ci if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) 6020bf215546Sopenharmony_ci data = get_nir_src(instr->src[2]); 6021bf215546Sopenharmony_ci 6022bf215546Sopenharmony_ci if (op == BRW_AOP_CMPWR) { 6023bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(data.type, 2); 6024bf215546Sopenharmony_ci fs_reg sources[2] = { data, get_nir_src(instr->src[3]) }; 6025bf215546Sopenharmony_ci bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 6026bf215546Sopenharmony_ci data = tmp; 6027bf215546Sopenharmony_ci } 6028bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_DATA] = data; 6029bf215546Sopenharmony_ci 6030bf215546Sopenharmony_ci /* Emit the actual atomic operation */ 6031bf215546Sopenharmony_ci 6032bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, 6033bf215546Sopenharmony_ci dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 6034bf215546Sopenharmony_ci} 6035bf215546Sopenharmony_ci 6036bf215546Sopenharmony_civoid 6037bf215546Sopenharmony_cifs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld, 6038bf215546Sopenharmony_ci int op, nir_intrinsic_instr *instr) 6039bf215546Sopenharmony_ci{ 6040bf215546Sopenharmony_ci fs_reg dest; 6041bf215546Sopenharmony_ci if (nir_intrinsic_infos[instr->intrinsic].has_dest) 6042bf215546Sopenharmony_ci dest = get_nir_dest(instr->dest); 6043bf215546Sopenharmony_ci 6044bf215546Sopenharmony_ci fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 6045bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr); 6046bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 6047bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 6048bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 6049bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 6050bf215546Sopenharmony_ci 6051bf215546Sopenharmony_ci fs_reg data = get_nir_src(instr->src[2]); 6052bf215546Sopenharmony_ci if (op == BRW_AOP_FCMPWR) { 6053bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(data.type, 2); 6054bf215546Sopenharmony_ci fs_reg sources[2] = { data, get_nir_src(instr->src[3]) }; 6055bf215546Sopenharmony_ci bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 6056bf215546Sopenharmony_ci data = tmp; 6057bf215546Sopenharmony_ci } 6058bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_DATA] = data; 6059bf215546Sopenharmony_ci 6060bf215546Sopenharmony_ci /* Emit the actual atomic operation */ 6061bf215546Sopenharmony_ci 6062bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL, 6063bf215546Sopenharmony_ci dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 6064bf215546Sopenharmony_ci} 6065bf215546Sopenharmony_ci 6066bf215546Sopenharmony_civoid 6067bf215546Sopenharmony_cifs_visitor::nir_emit_shared_atomic(const fs_builder &bld, 6068bf215546Sopenharmony_ci int op, nir_intrinsic_instr *instr) 6069bf215546Sopenharmony_ci{ 6070bf215546Sopenharmony_ci fs_reg dest; 6071bf215546Sopenharmony_ci if (nir_intrinsic_infos[instr->intrinsic].has_dest) 6072bf215546Sopenharmony_ci dest = get_nir_dest(instr->dest); 6073bf215546Sopenharmony_ci 6074bf215546Sopenharmony_ci fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 6075bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM); 6076bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 6077bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 6078bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 6079bf215546Sopenharmony_ci 6080bf215546Sopenharmony_ci fs_reg data; 6081bf215546Sopenharmony_ci if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) 6082bf215546Sopenharmony_ci data = get_nir_src(instr->src[1]); 6083bf215546Sopenharmony_ci if (op == BRW_AOP_CMPWR) { 6084bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(data.type, 2); 6085bf215546Sopenharmony_ci fs_reg sources[2] = { data, get_nir_src(instr->src[2]) }; 6086bf215546Sopenharmony_ci bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 6087bf215546Sopenharmony_ci data = tmp; 6088bf215546Sopenharmony_ci } 6089bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_DATA] = data; 6090bf215546Sopenharmony_ci 6091bf215546Sopenharmony_ci /* Get the offset */ 6092bf215546Sopenharmony_ci if (nir_src_is_const(instr->src[0])) { 6093bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 6094bf215546Sopenharmony_ci brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0])); 6095bf215546Sopenharmony_ci } else { 6096bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type); 6097bf215546Sopenharmony_ci bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS], 6098bf215546Sopenharmony_ci retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD), 6099bf215546Sopenharmony_ci brw_imm_ud(instr->const_index[0])); 6100bf215546Sopenharmony_ci } 6101bf215546Sopenharmony_ci 6102bf215546Sopenharmony_ci /* Emit the actual atomic operation operation */ 6103bf215546Sopenharmony_ci 6104bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, 6105bf215546Sopenharmony_ci dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 6106bf215546Sopenharmony_ci} 6107bf215546Sopenharmony_ci 6108bf215546Sopenharmony_civoid 6109bf215546Sopenharmony_cifs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld, 6110bf215546Sopenharmony_ci int op, nir_intrinsic_instr *instr) 6111bf215546Sopenharmony_ci{ 6112bf215546Sopenharmony_ci fs_reg dest; 6113bf215546Sopenharmony_ci if (nir_intrinsic_infos[instr->intrinsic].has_dest) 6114bf215546Sopenharmony_ci dest = get_nir_dest(instr->dest); 6115bf215546Sopenharmony_ci 6116bf215546Sopenharmony_ci fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 6117bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM); 6118bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 6119bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 6120bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 6121bf215546Sopenharmony_ci 6122bf215546Sopenharmony_ci fs_reg data = get_nir_src(instr->src[1]); 6123bf215546Sopenharmony_ci if (op == BRW_AOP_FCMPWR) { 6124bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(data.type, 2); 6125bf215546Sopenharmony_ci fs_reg sources[2] = { data, get_nir_src(instr->src[2]) }; 6126bf215546Sopenharmony_ci bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 6127bf215546Sopenharmony_ci data = tmp; 6128bf215546Sopenharmony_ci } 6129bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_DATA] = data; 6130bf215546Sopenharmony_ci 6131bf215546Sopenharmony_ci /* Get the offset */ 6132bf215546Sopenharmony_ci if (nir_src_is_const(instr->src[0])) { 6133bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 6134bf215546Sopenharmony_ci brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0])); 6135bf215546Sopenharmony_ci } else { 6136bf215546Sopenharmony_ci srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type); 6137bf215546Sopenharmony_ci bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS], 6138bf215546Sopenharmony_ci retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD), 6139bf215546Sopenharmony_ci brw_imm_ud(instr->const_index[0])); 6140bf215546Sopenharmony_ci } 6141bf215546Sopenharmony_ci 6142bf215546Sopenharmony_ci /* Emit the actual atomic operation operation */ 6143bf215546Sopenharmony_ci 6144bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL, 6145bf215546Sopenharmony_ci dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 6146bf215546Sopenharmony_ci} 6147bf215546Sopenharmony_ci 6148bf215546Sopenharmony_cistatic fs_reg 6149bf215546Sopenharmony_ciexpand_to_32bit(const fs_builder &bld, const fs_reg &src) 6150bf215546Sopenharmony_ci{ 6151bf215546Sopenharmony_ci if (type_sz(src.type) == 2) { 6152bf215546Sopenharmony_ci fs_reg src32 = bld.vgrf(BRW_REGISTER_TYPE_UD); 6153bf215546Sopenharmony_ci bld.MOV(src32, retype(src, BRW_REGISTER_TYPE_UW)); 6154bf215546Sopenharmony_ci return src32; 6155bf215546Sopenharmony_ci } else { 6156bf215546Sopenharmony_ci return src; 6157bf215546Sopenharmony_ci } 6158bf215546Sopenharmony_ci} 6159bf215546Sopenharmony_ci 6160bf215546Sopenharmony_civoid 6161bf215546Sopenharmony_cifs_visitor::nir_emit_global_atomic(const fs_builder &bld, 6162bf215546Sopenharmony_ci int op, nir_intrinsic_instr *instr) 6163bf215546Sopenharmony_ci{ 6164bf215546Sopenharmony_ci fs_reg dest; 6165bf215546Sopenharmony_ci if (nir_intrinsic_infos[instr->intrinsic].has_dest) 6166bf215546Sopenharmony_ci dest = get_nir_dest(instr->dest); 6167bf215546Sopenharmony_ci 6168bf215546Sopenharmony_ci fs_reg addr = get_nir_src(instr->src[0]); 6169bf215546Sopenharmony_ci 6170bf215546Sopenharmony_ci fs_reg data; 6171bf215546Sopenharmony_ci if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) 6172bf215546Sopenharmony_ci data = expand_to_32bit(bld, get_nir_src(instr->src[1])); 6173bf215546Sopenharmony_ci 6174bf215546Sopenharmony_ci if (op == BRW_AOP_CMPWR) { 6175bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(data.type, 2); 6176bf215546Sopenharmony_ci fs_reg sources[2] = { 6177bf215546Sopenharmony_ci data, 6178bf215546Sopenharmony_ci expand_to_32bit(bld, get_nir_src(instr->src[2])) 6179bf215546Sopenharmony_ci }; 6180bf215546Sopenharmony_ci bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 6181bf215546Sopenharmony_ci data = tmp; 6182bf215546Sopenharmony_ci } 6183bf215546Sopenharmony_ci 6184bf215546Sopenharmony_ci fs_reg srcs[A64_LOGICAL_NUM_SRCS]; 6185bf215546Sopenharmony_ci srcs[A64_LOGICAL_ADDRESS] = addr; 6186bf215546Sopenharmony_ci srcs[A64_LOGICAL_SRC] = data; 6187bf215546Sopenharmony_ci srcs[A64_LOGICAL_ARG] = brw_imm_ud(op); 6188bf215546Sopenharmony_ci srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); 6189bf215546Sopenharmony_ci 6190bf215546Sopenharmony_ci switch (nir_dest_bit_size(instr->dest)) { 6191bf215546Sopenharmony_ci case 16: { 6192bf215546Sopenharmony_ci fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD); 6193bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL, dest32, 6194bf215546Sopenharmony_ci srcs, A64_LOGICAL_NUM_SRCS); 6195bf215546Sopenharmony_ci bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW), dest32); 6196bf215546Sopenharmony_ci break; 6197bf215546Sopenharmony_ci } 6198bf215546Sopenharmony_ci case 32: 6199bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest, 6200bf215546Sopenharmony_ci srcs, A64_LOGICAL_NUM_SRCS); 6201bf215546Sopenharmony_ci break; 6202bf215546Sopenharmony_ci case 64: 6203bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL, dest, 6204bf215546Sopenharmony_ci srcs, A64_LOGICAL_NUM_SRCS); 6205bf215546Sopenharmony_ci break; 6206bf215546Sopenharmony_ci default: 6207bf215546Sopenharmony_ci unreachable("Unsupported bit size"); 6208bf215546Sopenharmony_ci } 6209bf215546Sopenharmony_ci} 6210bf215546Sopenharmony_ci 6211bf215546Sopenharmony_civoid 6212bf215546Sopenharmony_cifs_visitor::nir_emit_global_atomic_float(const fs_builder &bld, 6213bf215546Sopenharmony_ci int op, nir_intrinsic_instr *instr) 6214bf215546Sopenharmony_ci{ 6215bf215546Sopenharmony_ci assert(nir_intrinsic_infos[instr->intrinsic].has_dest); 6216bf215546Sopenharmony_ci fs_reg dest = get_nir_dest(instr->dest); 6217bf215546Sopenharmony_ci 6218bf215546Sopenharmony_ci fs_reg addr = get_nir_src(instr->src[0]); 6219bf215546Sopenharmony_ci 6220bf215546Sopenharmony_ci assert(op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC); 6221bf215546Sopenharmony_ci fs_reg data = expand_to_32bit(bld, get_nir_src(instr->src[1])); 6222bf215546Sopenharmony_ci 6223bf215546Sopenharmony_ci if (op == BRW_AOP_FCMPWR) { 6224bf215546Sopenharmony_ci fs_reg tmp = bld.vgrf(data.type, 2); 6225bf215546Sopenharmony_ci fs_reg sources[2] = { 6226bf215546Sopenharmony_ci data, 6227bf215546Sopenharmony_ci expand_to_32bit(bld, get_nir_src(instr->src[2])) 6228bf215546Sopenharmony_ci }; 6229bf215546Sopenharmony_ci bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 6230bf215546Sopenharmony_ci data = tmp; 6231bf215546Sopenharmony_ci } 6232bf215546Sopenharmony_ci 6233bf215546Sopenharmony_ci fs_reg srcs[A64_LOGICAL_NUM_SRCS]; 6234bf215546Sopenharmony_ci srcs[A64_LOGICAL_ADDRESS] = addr; 6235bf215546Sopenharmony_ci srcs[A64_LOGICAL_SRC] = data; 6236bf215546Sopenharmony_ci srcs[A64_LOGICAL_ARG] = brw_imm_ud(op); 6237bf215546Sopenharmony_ci srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); 6238bf215546Sopenharmony_ci 6239bf215546Sopenharmony_ci switch (nir_dest_bit_size(instr->dest)) { 6240bf215546Sopenharmony_ci case 16: { 6241bf215546Sopenharmony_ci fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD); 6242bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL, dest32, 6243bf215546Sopenharmony_ci srcs, A64_LOGICAL_NUM_SRCS); 6244bf215546Sopenharmony_ci bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW), dest32); 6245bf215546Sopenharmony_ci break; 6246bf215546Sopenharmony_ci } 6247bf215546Sopenharmony_ci case 32: 6248bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL, dest, 6249bf215546Sopenharmony_ci srcs, A64_LOGICAL_NUM_SRCS); 6250bf215546Sopenharmony_ci break; 6251bf215546Sopenharmony_ci case 64: 6252bf215546Sopenharmony_ci bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL, dest, 6253bf215546Sopenharmony_ci srcs, A64_LOGICAL_NUM_SRCS); 6254bf215546Sopenharmony_ci break; 6255bf215546Sopenharmony_ci default: 6256bf215546Sopenharmony_ci unreachable("Unsupported bit size"); 6257bf215546Sopenharmony_ci } 6258bf215546Sopenharmony_ci} 6259bf215546Sopenharmony_ci 6260bf215546Sopenharmony_civoid 6261bf215546Sopenharmony_cifs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) 6262bf215546Sopenharmony_ci{ 6263bf215546Sopenharmony_ci unsigned texture = instr->texture_index; 6264bf215546Sopenharmony_ci unsigned sampler = instr->sampler_index; 6265bf215546Sopenharmony_ci 6266bf215546Sopenharmony_ci fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; 6267bf215546Sopenharmony_ci 6268bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture); 6269bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler); 6270bf215546Sopenharmony_ci 6271bf215546Sopenharmony_ci int lod_components = 0; 6272bf215546Sopenharmony_ci 6273bf215546Sopenharmony_ci /* The hardware requires a LOD for buffer textures */ 6274bf215546Sopenharmony_ci if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) 6275bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0); 6276bf215546Sopenharmony_ci 6277bf215546Sopenharmony_ci uint32_t header_bits = 0; 6278bf215546Sopenharmony_ci for (unsigned i = 0; i < instr->num_srcs; i++) { 6279bf215546Sopenharmony_ci fs_reg src = get_nir_src(instr->src[i].src); 6280bf215546Sopenharmony_ci switch (instr->src[i].src_type) { 6281bf215546Sopenharmony_ci case nir_tex_src_bias: 6282bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_LOD] = 6283bf215546Sopenharmony_ci retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F); 6284bf215546Sopenharmony_ci break; 6285bf215546Sopenharmony_ci case nir_tex_src_comparator: 6286bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F); 6287bf215546Sopenharmony_ci break; 6288bf215546Sopenharmony_ci case nir_tex_src_coord: 6289bf215546Sopenharmony_ci switch (instr->op) { 6290bf215546Sopenharmony_ci case nir_texop_txf: 6291bf215546Sopenharmony_ci case nir_texop_txf_ms: 6292bf215546Sopenharmony_ci case nir_texop_txf_ms_mcs_intel: 6293bf215546Sopenharmony_ci case nir_texop_samples_identical: 6294bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D); 6295bf215546Sopenharmony_ci break; 6296bf215546Sopenharmony_ci default: 6297bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F); 6298bf215546Sopenharmony_ci break; 6299bf215546Sopenharmony_ci } 6300bf215546Sopenharmony_ci 6301bf215546Sopenharmony_ci /* Wa_14013363432: 6302bf215546Sopenharmony_ci * 6303bf215546Sopenharmony_ci * Compiler should send U,V,R parameters even if V,R are 0. 6304bf215546Sopenharmony_ci */ 6305bf215546Sopenharmony_ci if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && devinfo->verx10 == 125) 6306bf215546Sopenharmony_ci assert(instr->coord_components >= 3u); 6307bf215546Sopenharmony_ci break; 6308bf215546Sopenharmony_ci case nir_tex_src_ddx: 6309bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F); 6310bf215546Sopenharmony_ci lod_components = nir_tex_instr_src_size(instr, i); 6311bf215546Sopenharmony_ci break; 6312bf215546Sopenharmony_ci case nir_tex_src_ddy: 6313bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F); 6314bf215546Sopenharmony_ci break; 6315bf215546Sopenharmony_ci case nir_tex_src_lod: 6316bf215546Sopenharmony_ci switch (instr->op) { 6317bf215546Sopenharmony_ci case nir_texop_txs: 6318bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_LOD] = 6319bf215546Sopenharmony_ci retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD); 6320bf215546Sopenharmony_ci break; 6321bf215546Sopenharmony_ci case nir_texop_txf: 6322bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_LOD] = 6323bf215546Sopenharmony_ci retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D); 6324bf215546Sopenharmony_ci break; 6325bf215546Sopenharmony_ci default: 6326bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_LOD] = 6327bf215546Sopenharmony_ci retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F); 6328bf215546Sopenharmony_ci break; 6329bf215546Sopenharmony_ci } 6330bf215546Sopenharmony_ci break; 6331bf215546Sopenharmony_ci case nir_tex_src_min_lod: 6332bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_MIN_LOD] = 6333bf215546Sopenharmony_ci retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F); 6334bf215546Sopenharmony_ci break; 6335bf215546Sopenharmony_ci case nir_tex_src_ms_index: 6336bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD); 6337bf215546Sopenharmony_ci break; 6338bf215546Sopenharmony_ci 6339bf215546Sopenharmony_ci case nir_tex_src_offset: { 6340bf215546Sopenharmony_ci uint32_t offset_bits = 0; 6341bf215546Sopenharmony_ci if (brw_texture_offset(instr, i, &offset_bits)) { 6342bf215546Sopenharmony_ci header_bits |= offset_bits; 6343bf215546Sopenharmony_ci } else { 6344bf215546Sopenharmony_ci /* On gfx12.5+, if the offsets are not both constant and in the 6345bf215546Sopenharmony_ci * {-8,7} range, nir_lower_tex() will have already lowered the 6346bf215546Sopenharmony_ci * source offset. So we should never reach this point. 6347bf215546Sopenharmony_ci */ 6348bf215546Sopenharmony_ci assert(devinfo->verx10 < 125); 6349bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_TG4_OFFSET] = 6350bf215546Sopenharmony_ci retype(src, BRW_REGISTER_TYPE_D); 6351bf215546Sopenharmony_ci } 6352bf215546Sopenharmony_ci break; 6353bf215546Sopenharmony_ci } 6354bf215546Sopenharmony_ci 6355bf215546Sopenharmony_ci case nir_tex_src_projector: 6356bf215546Sopenharmony_ci unreachable("should be lowered"); 6357bf215546Sopenharmony_ci 6358bf215546Sopenharmony_ci case nir_tex_src_texture_offset: { 6359bf215546Sopenharmony_ci /* Emit code to evaluate the actual indexing expression */ 6360bf215546Sopenharmony_ci fs_reg tmp = vgrf(glsl_type::uint_type); 6361bf215546Sopenharmony_ci bld.ADD(tmp, src, brw_imm_ud(texture)); 6362bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp); 6363bf215546Sopenharmony_ci break; 6364bf215546Sopenharmony_ci } 6365bf215546Sopenharmony_ci 6366bf215546Sopenharmony_ci case nir_tex_src_sampler_offset: { 6367bf215546Sopenharmony_ci /* Emit code to evaluate the actual indexing expression */ 6368bf215546Sopenharmony_ci fs_reg tmp = vgrf(glsl_type::uint_type); 6369bf215546Sopenharmony_ci bld.ADD(tmp, src, brw_imm_ud(sampler)); 6370bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp); 6371bf215546Sopenharmony_ci break; 6372bf215546Sopenharmony_ci } 6373bf215546Sopenharmony_ci 6374bf215546Sopenharmony_ci case nir_tex_src_texture_handle: 6375bf215546Sopenharmony_ci assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1); 6376bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_SURFACE] = fs_reg(); 6377bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src); 6378bf215546Sopenharmony_ci break; 6379bf215546Sopenharmony_ci 6380bf215546Sopenharmony_ci case nir_tex_src_sampler_handle: 6381bf215546Sopenharmony_ci assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1); 6382bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_SAMPLER] = fs_reg(); 6383bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src); 6384bf215546Sopenharmony_ci break; 6385bf215546Sopenharmony_ci 6386bf215546Sopenharmony_ci case nir_tex_src_ms_mcs_intel: 6387bf215546Sopenharmony_ci assert(instr->op == nir_texop_txf_ms); 6388bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D); 6389bf215546Sopenharmony_ci break; 6390bf215546Sopenharmony_ci 6391bf215546Sopenharmony_ci default: 6392bf215546Sopenharmony_ci unreachable("unknown texture source"); 6393bf215546Sopenharmony_ci } 6394bf215546Sopenharmony_ci } 6395bf215546Sopenharmony_ci 6396bf215546Sopenharmony_ci if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE && 6397bf215546Sopenharmony_ci (instr->op == nir_texop_txf_ms || 6398bf215546Sopenharmony_ci instr->op == nir_texop_samples_identical)) { 6399bf215546Sopenharmony_ci if (devinfo->ver >= 7 && 6400bf215546Sopenharmony_ci key_tex->compressed_multisample_layout_mask & (1 << texture)) { 6401bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_MCS] = 6402bf215546Sopenharmony_ci emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE], 6403bf215546Sopenharmony_ci instr->coord_components, 6404bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_SURFACE], 6405bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]); 6406bf215546Sopenharmony_ci } else { 6407bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u); 6408bf215546Sopenharmony_ci } 6409bf215546Sopenharmony_ci } 6410bf215546Sopenharmony_ci 6411bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components); 6412bf215546Sopenharmony_ci srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components); 6413bf215546Sopenharmony_ci 6414bf215546Sopenharmony_ci enum opcode opcode; 6415bf215546Sopenharmony_ci switch (instr->op) { 6416bf215546Sopenharmony_ci case nir_texop_tex: 6417bf215546Sopenharmony_ci opcode = SHADER_OPCODE_TEX_LOGICAL; 6418bf215546Sopenharmony_ci break; 6419bf215546Sopenharmony_ci case nir_texop_txb: 6420bf215546Sopenharmony_ci opcode = FS_OPCODE_TXB_LOGICAL; 6421bf215546Sopenharmony_ci break; 6422bf215546Sopenharmony_ci case nir_texop_txl: 6423bf215546Sopenharmony_ci opcode = SHADER_OPCODE_TXL_LOGICAL; 6424bf215546Sopenharmony_ci break; 6425bf215546Sopenharmony_ci case nir_texop_txd: 6426bf215546Sopenharmony_ci opcode = SHADER_OPCODE_TXD_LOGICAL; 6427bf215546Sopenharmony_ci break; 6428bf215546Sopenharmony_ci case nir_texop_txf: 6429bf215546Sopenharmony_ci opcode = SHADER_OPCODE_TXF_LOGICAL; 6430bf215546Sopenharmony_ci break; 6431bf215546Sopenharmony_ci case nir_texop_txf_ms: 6432bf215546Sopenharmony_ci /* On Gfx12HP there is only CMS_W available. From the Bspec: Shared 6433bf215546Sopenharmony_ci * Functions - 3D Sampler - Messages - Message Format: 6434bf215546Sopenharmony_ci * 6435bf215546Sopenharmony_ci * ld2dms REMOVEDBY(GEN:HAS:1406788836) 6436bf215546Sopenharmony_ci */ 6437bf215546Sopenharmony_ci if (devinfo->verx10 >= 125) 6438bf215546Sopenharmony_ci opcode = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL; 6439bf215546Sopenharmony_ci else if ((key_tex->msaa_16 & (1 << sampler))) 6440bf215546Sopenharmony_ci opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL; 6441bf215546Sopenharmony_ci else 6442bf215546Sopenharmony_ci opcode = SHADER_OPCODE_TXF_CMS_LOGICAL; 6443bf215546Sopenharmony_ci break; 6444bf215546Sopenharmony_ci case nir_texop_txf_ms_mcs_intel: 6445bf215546Sopenharmony_ci opcode = SHADER_OPCODE_TXF_MCS_LOGICAL; 6446bf215546Sopenharmony_ci break; 6447bf215546Sopenharmony_ci case nir_texop_query_levels: 6448bf215546Sopenharmony_ci case nir_texop_txs: 6449bf215546Sopenharmony_ci opcode = SHADER_OPCODE_TXS_LOGICAL; 6450bf215546Sopenharmony_ci break; 6451bf215546Sopenharmony_ci case nir_texop_lod: 6452bf215546Sopenharmony_ci opcode = SHADER_OPCODE_LOD_LOGICAL; 6453bf215546Sopenharmony_ci break; 6454bf215546Sopenharmony_ci case nir_texop_tg4: 6455bf215546Sopenharmony_ci if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE) 6456bf215546Sopenharmony_ci opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL; 6457bf215546Sopenharmony_ci else 6458bf215546Sopenharmony_ci opcode = SHADER_OPCODE_TG4_LOGICAL; 6459bf215546Sopenharmony_ci break; 6460bf215546Sopenharmony_ci case nir_texop_texture_samples: 6461bf215546Sopenharmony_ci opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL; 6462bf215546Sopenharmony_ci break; 6463bf215546Sopenharmony_ci case nir_texop_samples_identical: { 6464bf215546Sopenharmony_ci fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D); 6465bf215546Sopenharmony_ci 6466bf215546Sopenharmony_ci /* If mcs is an immediate value, it means there is no MCS. In that case 6467bf215546Sopenharmony_ci * just return false. 6468bf215546Sopenharmony_ci */ 6469bf215546Sopenharmony_ci if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) { 6470bf215546Sopenharmony_ci bld.MOV(dst, brw_imm_ud(0u)); 6471bf215546Sopenharmony_ci } else if ((key_tex->msaa_16 & (1 << sampler))) { 6472bf215546Sopenharmony_ci fs_reg tmp = vgrf(glsl_type::uint_type); 6473bf215546Sopenharmony_ci bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS], 6474bf215546Sopenharmony_ci offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1)); 6475bf215546Sopenharmony_ci bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ); 6476bf215546Sopenharmony_ci } else { 6477bf215546Sopenharmony_ci bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u), 6478bf215546Sopenharmony_ci BRW_CONDITIONAL_EQ); 6479bf215546Sopenharmony_ci } 6480bf215546Sopenharmony_ci return; 6481bf215546Sopenharmony_ci } 6482bf215546Sopenharmony_ci default: 6483bf215546Sopenharmony_ci unreachable("unknown texture opcode"); 6484bf215546Sopenharmony_ci } 6485bf215546Sopenharmony_ci 6486bf215546Sopenharmony_ci if (instr->op == nir_texop_tg4) { 6487bf215546Sopenharmony_ci if (instr->component == 1 && 6488bf215546Sopenharmony_ci key_tex->gather_channel_quirk_mask & (1 << texture)) { 6489bf215546Sopenharmony_ci /* gather4 sampler is broken for green channel on RG32F -- 6490bf215546Sopenharmony_ci * we must ask for blue instead. 6491bf215546Sopenharmony_ci */ 6492bf215546Sopenharmony_ci header_bits |= 2 << 16; 6493bf215546Sopenharmony_ci } else { 6494bf215546Sopenharmony_ci header_bits |= instr->component << 16; 6495bf215546Sopenharmony_ci } 6496bf215546Sopenharmony_ci } 6497bf215546Sopenharmony_ci 6498bf215546Sopenharmony_ci fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4); 6499bf215546Sopenharmony_ci fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs)); 6500bf215546Sopenharmony_ci inst->offset = header_bits; 6501bf215546Sopenharmony_ci 6502bf215546Sopenharmony_ci const unsigned dest_size = nir_tex_instr_dest_size(instr); 6503bf215546Sopenharmony_ci if (devinfo->ver >= 9 && 6504bf215546Sopenharmony_ci instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) { 6505bf215546Sopenharmony_ci unsigned write_mask = instr->dest.is_ssa ? 6506bf215546Sopenharmony_ci nir_ssa_def_components_read(&instr->dest.ssa): 6507bf215546Sopenharmony_ci (1 << dest_size) - 1; 6508bf215546Sopenharmony_ci assert(write_mask != 0); /* dead code should have been eliminated */ 6509bf215546Sopenharmony_ci inst->size_written = util_last_bit(write_mask) * 6510bf215546Sopenharmony_ci inst->dst.component_size(inst->exec_size); 6511bf215546Sopenharmony_ci } else { 6512bf215546Sopenharmony_ci inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 6513bf215546Sopenharmony_ci } 6514bf215546Sopenharmony_ci 6515bf215546Sopenharmony_ci if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE) 6516bf215546Sopenharmony_ci inst->shadow_compare = true; 6517bf215546Sopenharmony_ci 6518bf215546Sopenharmony_ci fs_reg nir_dest[5]; 6519bf215546Sopenharmony_ci for (unsigned i = 0; i < dest_size; i++) 6520bf215546Sopenharmony_ci nir_dest[i] = offset(dst, bld, i); 6521bf215546Sopenharmony_ci 6522bf215546Sopenharmony_ci if (instr->op == nir_texop_query_levels) { 6523bf215546Sopenharmony_ci /* # levels is in .w */ 6524bf215546Sopenharmony_ci if (devinfo->ver <= 9) { 6525bf215546Sopenharmony_ci /** 6526bf215546Sopenharmony_ci * Wa_1940217: 6527bf215546Sopenharmony_ci * 6528bf215546Sopenharmony_ci * When a surface of type SURFTYPE_NULL is accessed by resinfo, the 6529bf215546Sopenharmony_ci * MIPCount returned is undefined instead of 0. 6530bf215546Sopenharmony_ci */ 6531bf215546Sopenharmony_ci fs_inst *mov = bld.MOV(bld.null_reg_d(), dst); 6532bf215546Sopenharmony_ci mov->conditional_mod = BRW_CONDITIONAL_NZ; 6533bf215546Sopenharmony_ci nir_dest[0] = bld.vgrf(BRW_REGISTER_TYPE_D); 6534bf215546Sopenharmony_ci fs_inst *sel = bld.SEL(nir_dest[0], offset(dst, bld, 3), brw_imm_d(0)); 6535bf215546Sopenharmony_ci sel->predicate = BRW_PREDICATE_NORMAL; 6536bf215546Sopenharmony_ci } else { 6537bf215546Sopenharmony_ci nir_dest[0] = offset(dst, bld, 3); 6538bf215546Sopenharmony_ci } 6539bf215546Sopenharmony_ci } else if (instr->op == nir_texop_txs && 6540bf215546Sopenharmony_ci dest_size >= 3 && devinfo->ver < 7) { 6541bf215546Sopenharmony_ci /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */ 6542bf215546Sopenharmony_ci fs_reg depth = offset(dst, bld, 2); 6543bf215546Sopenharmony_ci nir_dest[2] = vgrf(glsl_type::int_type); 6544bf215546Sopenharmony_ci bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE); 6545bf215546Sopenharmony_ci } 6546bf215546Sopenharmony_ci 6547bf215546Sopenharmony_ci bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0); 6548bf215546Sopenharmony_ci} 6549bf215546Sopenharmony_ci 6550bf215546Sopenharmony_civoid 6551bf215546Sopenharmony_cifs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr) 6552bf215546Sopenharmony_ci{ 6553bf215546Sopenharmony_ci switch (instr->type) { 6554bf215546Sopenharmony_ci case nir_jump_break: 6555bf215546Sopenharmony_ci bld.emit(BRW_OPCODE_BREAK); 6556bf215546Sopenharmony_ci break; 6557bf215546Sopenharmony_ci case nir_jump_continue: 6558bf215546Sopenharmony_ci bld.emit(BRW_OPCODE_CONTINUE); 6559bf215546Sopenharmony_ci break; 6560bf215546Sopenharmony_ci case nir_jump_halt: 6561bf215546Sopenharmony_ci bld.emit(BRW_OPCODE_HALT); 6562bf215546Sopenharmony_ci break; 6563bf215546Sopenharmony_ci case nir_jump_return: 6564bf215546Sopenharmony_ci default: 6565bf215546Sopenharmony_ci unreachable("unknown jump"); 6566bf215546Sopenharmony_ci } 6567bf215546Sopenharmony_ci} 6568bf215546Sopenharmony_ci 6569bf215546Sopenharmony_ci/* 6570bf215546Sopenharmony_ci * This helper takes a source register and un/shuffles it into the destination 6571bf215546Sopenharmony_ci * register. 6572bf215546Sopenharmony_ci * 6573bf215546Sopenharmony_ci * If source type size is smaller than destination type size the operation 6574bf215546Sopenharmony_ci * needed is a component shuffle. The opposite case would be an unshuffle. If 6575bf215546Sopenharmony_ci * source/destination type size is equal a shuffle is done that would be 6576bf215546Sopenharmony_ci * equivalent to a simple MOV. 6577bf215546Sopenharmony_ci * 6578bf215546Sopenharmony_ci * For example, if source is a 16-bit type and destination is 32-bit. A 3 6579bf215546Sopenharmony_ci * components .xyz 16-bit vector on SIMD8 would be. 6580bf215546Sopenharmony_ci * 6581bf215546Sopenharmony_ci * |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8| 6582bf215546Sopenharmony_ci * |z1|z2|z3|z4|z5|z6|z7|z8| | | | | | | | | 6583bf215546Sopenharmony_ci * 6584bf215546Sopenharmony_ci * This helper will return the following 2 32-bit components with the 16-bit 6585bf215546Sopenharmony_ci * values shuffled: 6586bf215546Sopenharmony_ci * 6587bf215546Sopenharmony_ci * |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8| 6588bf215546Sopenharmony_ci * |z1 |z2 |z3 |z4 |z5 |z6 |z7 |z8 | 6589bf215546Sopenharmony_ci * 6590bf215546Sopenharmony_ci * For unshuffle, the example would be the opposite, a 64-bit type source 6591bf215546Sopenharmony_ci * and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD8 6592bf215546Sopenharmony_ci * would be: 6593bf215546Sopenharmony_ci * 6594bf215546Sopenharmony_ci * | x1l x1h | x2l x2h | x3l x3h | x4l x4h | 6595bf215546Sopenharmony_ci * | x5l x5h | x6l x6h | x7l x7h | x8l x8h | 6596bf215546Sopenharmony_ci * | y1l y1h | y2l y2h | y3l y3h | y4l y4h | 6597bf215546Sopenharmony_ci * | y5l y5h | y6l y6h | y7l y7h | y8l y8h | 6598bf215546Sopenharmony_ci * 6599bf215546Sopenharmony_ci * The returned result would be the following 4 32-bit components unshuffled: 6600bf215546Sopenharmony_ci * 6601bf215546Sopenharmony_ci * | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l | 6602bf215546Sopenharmony_ci * | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h | 6603bf215546Sopenharmony_ci * | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l | 6604bf215546Sopenharmony_ci * | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h | 6605bf215546Sopenharmony_ci * 6606bf215546Sopenharmony_ci * - Source and destination register must not be overlapped. 6607bf215546Sopenharmony_ci * - components units are measured in terms of the smaller type between 6608bf215546Sopenharmony_ci * source and destination because we are un/shuffling the smaller 6609bf215546Sopenharmony_ci * components from/into the bigger ones. 6610bf215546Sopenharmony_ci * - first_component parameter allows skipping source components. 6611bf215546Sopenharmony_ci */ 6612bf215546Sopenharmony_civoid 6613bf215546Sopenharmony_cishuffle_src_to_dst(const fs_builder &bld, 6614bf215546Sopenharmony_ci const fs_reg &dst, 6615bf215546Sopenharmony_ci const fs_reg &src, 6616bf215546Sopenharmony_ci uint32_t first_component, 6617bf215546Sopenharmony_ci uint32_t components) 6618bf215546Sopenharmony_ci{ 6619bf215546Sopenharmony_ci if (type_sz(src.type) == type_sz(dst.type)) { 6620bf215546Sopenharmony_ci assert(!regions_overlap(dst, 6621bf215546Sopenharmony_ci type_sz(dst.type) * bld.dispatch_width() * components, 6622bf215546Sopenharmony_ci offset(src, bld, first_component), 6623bf215546Sopenharmony_ci type_sz(src.type) * bld.dispatch_width() * components)); 6624bf215546Sopenharmony_ci for (unsigned i = 0; i < components; i++) { 6625bf215546Sopenharmony_ci bld.MOV(retype(offset(dst, bld, i), src.type), 6626bf215546Sopenharmony_ci offset(src, bld, i + first_component)); 6627bf215546Sopenharmony_ci } 6628bf215546Sopenharmony_ci } else if (type_sz(src.type) < type_sz(dst.type)) { 6629bf215546Sopenharmony_ci /* Source is shuffled into destination */ 6630bf215546Sopenharmony_ci unsigned size_ratio = type_sz(dst.type) / type_sz(src.type); 6631bf215546Sopenharmony_ci assert(!regions_overlap(dst, 6632bf215546Sopenharmony_ci type_sz(dst.type) * bld.dispatch_width() * 6633bf215546Sopenharmony_ci DIV_ROUND_UP(components, size_ratio), 6634bf215546Sopenharmony_ci offset(src, bld, first_component), 6635bf215546Sopenharmony_ci type_sz(src.type) * bld.dispatch_width() * components)); 6636bf215546Sopenharmony_ci 6637bf215546Sopenharmony_ci brw_reg_type shuffle_type = 6638bf215546Sopenharmony_ci brw_reg_type_from_bit_size(8 * type_sz(src.type), 6639bf215546Sopenharmony_ci BRW_REGISTER_TYPE_D); 6640bf215546Sopenharmony_ci for (unsigned i = 0; i < components; i++) { 6641bf215546Sopenharmony_ci fs_reg shuffle_component_i = 6642bf215546Sopenharmony_ci subscript(offset(dst, bld, i / size_ratio), 6643bf215546Sopenharmony_ci shuffle_type, i % size_ratio); 6644bf215546Sopenharmony_ci bld.MOV(shuffle_component_i, 6645bf215546Sopenharmony_ci retype(offset(src, bld, i + first_component), shuffle_type)); 6646bf215546Sopenharmony_ci } 6647bf215546Sopenharmony_ci } else { 6648bf215546Sopenharmony_ci /* Source is unshuffled into destination */ 6649bf215546Sopenharmony_ci unsigned size_ratio = type_sz(src.type) / type_sz(dst.type); 6650bf215546Sopenharmony_ci assert(!regions_overlap(dst, 6651bf215546Sopenharmony_ci type_sz(dst.type) * bld.dispatch_width() * components, 6652bf215546Sopenharmony_ci offset(src, bld, first_component / size_ratio), 6653bf215546Sopenharmony_ci type_sz(src.type) * bld.dispatch_width() * 6654bf215546Sopenharmony_ci DIV_ROUND_UP(components + (first_component % size_ratio), 6655bf215546Sopenharmony_ci size_ratio))); 6656bf215546Sopenharmony_ci 6657bf215546Sopenharmony_ci brw_reg_type shuffle_type = 6658bf215546Sopenharmony_ci brw_reg_type_from_bit_size(8 * type_sz(dst.type), 6659bf215546Sopenharmony_ci BRW_REGISTER_TYPE_D); 6660bf215546Sopenharmony_ci for (unsigned i = 0; i < components; i++) { 6661bf215546Sopenharmony_ci fs_reg shuffle_component_i = 6662bf215546Sopenharmony_ci subscript(offset(src, bld, (first_component + i) / size_ratio), 6663bf215546Sopenharmony_ci shuffle_type, (first_component + i) % size_ratio); 6664bf215546Sopenharmony_ci bld.MOV(retype(offset(dst, bld, i), shuffle_type), 6665bf215546Sopenharmony_ci shuffle_component_i); 6666bf215546Sopenharmony_ci } 6667bf215546Sopenharmony_ci } 6668bf215546Sopenharmony_ci} 6669bf215546Sopenharmony_ci 6670bf215546Sopenharmony_civoid 6671bf215546Sopenharmony_cishuffle_from_32bit_read(const fs_builder &bld, 6672bf215546Sopenharmony_ci const fs_reg &dst, 6673bf215546Sopenharmony_ci const fs_reg &src, 6674bf215546Sopenharmony_ci uint32_t first_component, 6675bf215546Sopenharmony_ci uint32_t components) 6676bf215546Sopenharmony_ci{ 6677bf215546Sopenharmony_ci assert(type_sz(src.type) == 4); 6678bf215546Sopenharmony_ci 6679bf215546Sopenharmony_ci /* This function takes components in units of the destination type while 6680bf215546Sopenharmony_ci * shuffle_src_to_dst takes components in units of the smallest type 6681bf215546Sopenharmony_ci */ 6682bf215546Sopenharmony_ci if (type_sz(dst.type) > 4) { 6683bf215546Sopenharmony_ci assert(type_sz(dst.type) == 8); 6684bf215546Sopenharmony_ci first_component *= 2; 6685bf215546Sopenharmony_ci components *= 2; 6686bf215546Sopenharmony_ci } 6687bf215546Sopenharmony_ci 6688bf215546Sopenharmony_ci shuffle_src_to_dst(bld, dst, src, first_component, components); 6689bf215546Sopenharmony_ci} 6690bf215546Sopenharmony_ci 6691bf215546Sopenharmony_cifs_reg 6692bf215546Sopenharmony_cisetup_imm_df(const fs_builder &bld, double v) 6693bf215546Sopenharmony_ci{ 6694bf215546Sopenharmony_ci const struct intel_device_info *devinfo = bld.shader->devinfo; 6695bf215546Sopenharmony_ci assert(devinfo->ver >= 7); 6696bf215546Sopenharmony_ci 6697bf215546Sopenharmony_ci if (devinfo->ver >= 8) 6698bf215546Sopenharmony_ci return brw_imm_df(v); 6699bf215546Sopenharmony_ci 6700bf215546Sopenharmony_ci /* gfx7.5 does not support DF immediates straightforward but the DIM 6701bf215546Sopenharmony_ci * instruction allows to set the 64-bit immediate value. 6702bf215546Sopenharmony_ci */ 6703bf215546Sopenharmony_ci if (devinfo->platform == INTEL_PLATFORM_HSW) { 6704bf215546Sopenharmony_ci const fs_builder ubld = bld.exec_all().group(1, 0); 6705bf215546Sopenharmony_ci fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1); 6706bf215546Sopenharmony_ci ubld.DIM(dst, brw_imm_df(v)); 6707bf215546Sopenharmony_ci return component(dst, 0); 6708bf215546Sopenharmony_ci } 6709bf215546Sopenharmony_ci 6710bf215546Sopenharmony_ci /* gfx7 does not support DF immediates, so we generate a 64-bit constant by 6711bf215546Sopenharmony_ci * writing the low 32-bit of the constant to suboffset 0 of a VGRF and 6712bf215546Sopenharmony_ci * the high 32-bit to suboffset 4 and then applying a stride of 0. 6713bf215546Sopenharmony_ci * 6714bf215546Sopenharmony_ci * Alternatively, we could also produce a normal VGRF (without stride 0) 6715bf215546Sopenharmony_ci * by writing to all the channels in the VGRF, however, that would hit the 6716bf215546Sopenharmony_ci * gfx7 bug where we have to split writes that span more than 1 register 6717bf215546Sopenharmony_ci * into instructions with a width of 4 (otherwise the write to the second 6718bf215546Sopenharmony_ci * register written runs into an execmask hardware bug) which isn't very 6719bf215546Sopenharmony_ci * nice. 6720bf215546Sopenharmony_ci */ 6721bf215546Sopenharmony_ci union { 6722bf215546Sopenharmony_ci double d; 6723bf215546Sopenharmony_ci struct { 6724bf215546Sopenharmony_ci uint32_t i1; 6725bf215546Sopenharmony_ci uint32_t i2; 6726bf215546Sopenharmony_ci }; 6727bf215546Sopenharmony_ci } di; 6728bf215546Sopenharmony_ci 6729bf215546Sopenharmony_ci di.d = v; 6730bf215546Sopenharmony_ci 6731bf215546Sopenharmony_ci const fs_builder ubld = bld.exec_all().group(1, 0); 6732bf215546Sopenharmony_ci const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); 6733bf215546Sopenharmony_ci ubld.MOV(tmp, brw_imm_ud(di.i1)); 6734bf215546Sopenharmony_ci ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2)); 6735bf215546Sopenharmony_ci 6736bf215546Sopenharmony_ci return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0); 6737bf215546Sopenharmony_ci} 6738bf215546Sopenharmony_ci 6739bf215546Sopenharmony_cifs_reg 6740bf215546Sopenharmony_cisetup_imm_b(const fs_builder &bld, int8_t v) 6741bf215546Sopenharmony_ci{ 6742bf215546Sopenharmony_ci const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B); 6743bf215546Sopenharmony_ci bld.MOV(tmp, brw_imm_w(v)); 6744bf215546Sopenharmony_ci return tmp; 6745bf215546Sopenharmony_ci} 6746bf215546Sopenharmony_ci 6747bf215546Sopenharmony_cifs_reg 6748bf215546Sopenharmony_cisetup_imm_ub(const fs_builder &bld, uint8_t v) 6749bf215546Sopenharmony_ci{ 6750bf215546Sopenharmony_ci const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB); 6751bf215546Sopenharmony_ci bld.MOV(tmp, brw_imm_uw(v)); 6752bf215546Sopenharmony_ci return tmp; 6753bf215546Sopenharmony_ci} 6754