1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright (C) 2017-2018 Rob Clark <robclark@freedesktop.org> 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20bf215546Sopenharmony_ci * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21bf215546Sopenharmony_ci * SOFTWARE. 22bf215546Sopenharmony_ci * 23bf215546Sopenharmony_ci * Authors: 24bf215546Sopenharmony_ci * Rob Clark <robclark@freedesktop.org> 25bf215546Sopenharmony_ci */ 26bf215546Sopenharmony_ci 27bf215546Sopenharmony_ci/* 500 gets us LDIB but doesn't change any other a4xx instructions */ 28bf215546Sopenharmony_ci#define GPU 500 29bf215546Sopenharmony_ci 30bf215546Sopenharmony_ci#include "ir3_context.h" 31bf215546Sopenharmony_ci#include "ir3_image.h" 32bf215546Sopenharmony_ci 33bf215546Sopenharmony_ci/* SSBO data is available at this CB address, addressed like regular consts 34bf215546Sopenharmony_ci * containing the following data in each vec4: 35bf215546Sopenharmony_ci * 36bf215546Sopenharmony_ci * [ base address, pitch, array_pitch, cpp ] 37bf215546Sopenharmony_ci * 38bf215546Sopenharmony_ci * These mirror the values uploaded to A4XX_SSBO_0 state. For A5XX, these are 39bf215546Sopenharmony_ci * uploaded manually by the driver. 40bf215546Sopenharmony_ci */ 41bf215546Sopenharmony_ci#define A4XX_SSBO_CB_BASE(i) (0x700 + ((i) << 2)) 42bf215546Sopenharmony_ci 43bf215546Sopenharmony_ci/* 44bf215546Sopenharmony_ci * Handlers for instructions changed/added in a4xx: 45bf215546Sopenharmony_ci */ 46bf215546Sopenharmony_ci 47bf215546Sopenharmony_ci/* Convert byte offset to address of appropriate width for GPU */ 48bf215546Sopenharmony_cistatic struct ir3_instruction * 49bf215546Sopenharmony_cibyte_offset_to_address(struct ir3_context *ctx, 50bf215546Sopenharmony_ci nir_src *ssbo, 51bf215546Sopenharmony_ci struct ir3_instruction *byte_offset) 52bf215546Sopenharmony_ci{ 53bf215546Sopenharmony_ci struct ir3_block *b = ctx->block; 54bf215546Sopenharmony_ci 55bf215546Sopenharmony_ci if (ctx->compiler->gen == 4) { 56bf215546Sopenharmony_ci uint32_t index = nir_src_as_uint(*ssbo); 57bf215546Sopenharmony_ci unsigned cb = A4XX_SSBO_CB_BASE(index); 58bf215546Sopenharmony_ci byte_offset = ir3_ADD_U(b, create_uniform(b, cb), 0, byte_offset, 0); 59bf215546Sopenharmony_ci } 60bf215546Sopenharmony_ci 61bf215546Sopenharmony_ci if (fd_dev_64b(ctx->compiler->dev_id)) { 62bf215546Sopenharmony_ci return ir3_collect(b, byte_offset, create_immed(b, 0)); 63bf215546Sopenharmony_ci } else { 64bf215546Sopenharmony_ci return byte_offset; 65bf215546Sopenharmony_ci } 66bf215546Sopenharmony_ci} 67bf215546Sopenharmony_ci 68bf215546Sopenharmony_ci/* src[] = { buffer_index, offset }. No const_index */ 69bf215546Sopenharmony_cistatic void 70bf215546Sopenharmony_ciemit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr, 71bf215546Sopenharmony_ci struct ir3_instruction **dst) 72bf215546Sopenharmony_ci{ 73bf215546Sopenharmony_ci struct ir3_block *b = ctx->block; 74bf215546Sopenharmony_ci struct ir3_instruction *ldgb, *src0, *src1, *byte_offset, *offset; 75bf215546Sopenharmony_ci 76bf215546Sopenharmony_ci struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[0]); 77bf215546Sopenharmony_ci 78bf215546Sopenharmony_ci byte_offset = ir3_get_src(ctx, &intr->src[1])[0]; 79bf215546Sopenharmony_ci offset = ir3_get_src(ctx, &intr->src[2])[0]; 80bf215546Sopenharmony_ci 81bf215546Sopenharmony_ci /* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */ 82bf215546Sopenharmony_ci src0 = byte_offset_to_address(ctx, &intr->src[0], byte_offset); 83bf215546Sopenharmony_ci src1 = offset; 84bf215546Sopenharmony_ci 85bf215546Sopenharmony_ci ldgb = ir3_LDGB(b, ssbo, 0, src0, 0, src1, 0); 86bf215546Sopenharmony_ci ldgb->dsts[0]->wrmask = MASK(intr->num_components); 87bf215546Sopenharmony_ci ldgb->cat6.iim_val = intr->num_components; 88bf215546Sopenharmony_ci ldgb->cat6.d = 4; 89bf215546Sopenharmony_ci ldgb->cat6.type = TYPE_U32; 90bf215546Sopenharmony_ci ldgb->barrier_class = IR3_BARRIER_BUFFER_R; 91bf215546Sopenharmony_ci ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W; 92bf215546Sopenharmony_ci 93bf215546Sopenharmony_ci ir3_split_dest(b, dst, ldgb, 0, intr->num_components); 94bf215546Sopenharmony_ci} 95bf215546Sopenharmony_ci 96bf215546Sopenharmony_ci/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */ 97bf215546Sopenharmony_cistatic void 98bf215546Sopenharmony_ciemit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) 99bf215546Sopenharmony_ci{ 100bf215546Sopenharmony_ci struct ir3_block *b = ctx->block; 101bf215546Sopenharmony_ci struct ir3_instruction *stgb, *src0, *src1, *src2, *byte_offset, *offset; 102bf215546Sopenharmony_ci unsigned wrmask = nir_intrinsic_write_mask(intr); 103bf215546Sopenharmony_ci unsigned ncomp = ffs(~wrmask) - 1; 104bf215546Sopenharmony_ci 105bf215546Sopenharmony_ci assert(wrmask == BITFIELD_MASK(intr->num_components)); 106bf215546Sopenharmony_ci 107bf215546Sopenharmony_ci struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[1]); 108bf215546Sopenharmony_ci 109bf215546Sopenharmony_ci byte_offset = ir3_get_src(ctx, &intr->src[2])[0]; 110bf215546Sopenharmony_ci offset = ir3_get_src(ctx, &intr->src[3])[0]; 111bf215546Sopenharmony_ci 112bf215546Sopenharmony_ci /* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0).. 113bf215546Sopenharmony_ci * nir already *= 4: 114bf215546Sopenharmony_ci */ 115bf215546Sopenharmony_ci src0 = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), ncomp); 116bf215546Sopenharmony_ci src1 = offset; 117bf215546Sopenharmony_ci src2 = byte_offset_to_address(ctx, &intr->src[1], byte_offset); 118bf215546Sopenharmony_ci 119bf215546Sopenharmony_ci stgb = ir3_STGB(b, ssbo, 0, src0, 0, src1, 0, src2, 0); 120bf215546Sopenharmony_ci stgb->cat6.iim_val = ncomp; 121bf215546Sopenharmony_ci stgb->cat6.d = 4; 122bf215546Sopenharmony_ci stgb->cat6.type = TYPE_U32; 123bf215546Sopenharmony_ci stgb->barrier_class = IR3_BARRIER_BUFFER_W; 124bf215546Sopenharmony_ci stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; 125bf215546Sopenharmony_ci 126bf215546Sopenharmony_ci array_insert(b, b->keeps, stgb); 127bf215546Sopenharmony_ci} 128bf215546Sopenharmony_ci 129bf215546Sopenharmony_ci/* 130bf215546Sopenharmony_ci * SSBO atomic intrinsics 131bf215546Sopenharmony_ci * 132bf215546Sopenharmony_ci * All of the SSBO atomic memory operations read a value from memory, 133bf215546Sopenharmony_ci * compute a new value using one of the operations below, write the new 134bf215546Sopenharmony_ci * value to memory, and return the original value read. 135bf215546Sopenharmony_ci * 136bf215546Sopenharmony_ci * All operations take 3 sources except CompSwap that takes 4. These 137bf215546Sopenharmony_ci * sources represent: 138bf215546Sopenharmony_ci * 139bf215546Sopenharmony_ci * 0: The SSBO buffer index. 140bf215546Sopenharmony_ci * 1: The byte offset into the SSBO buffer of the variable that the atomic 141bf215546Sopenharmony_ci * operation will operate on. 142bf215546Sopenharmony_ci * 2: The data parameter to the atomic function (i.e. the value to add 143bf215546Sopenharmony_ci * in ssbo_atomic_add, etc). 144bf215546Sopenharmony_ci * 3: CompSwap: the second data parameter. 145bf215546Sopenharmony_ci * Non-CompSwap: The dword offset into the SSBO buffer variable. 146bf215546Sopenharmony_ci * 4: CompSwap: The dword offset into the SSBO buffer variable. 147bf215546Sopenharmony_ci * 148bf215546Sopenharmony_ci * We use custom ssbo_*_ir3 intrinsics generated by ir3_nir_lower_io_offsets() 149bf215546Sopenharmony_ci * so we can have the dword offset generated in NIR. 150bf215546Sopenharmony_ci */ 151bf215546Sopenharmony_cistatic struct ir3_instruction * 152bf215546Sopenharmony_ciemit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) 153bf215546Sopenharmony_ci{ 154bf215546Sopenharmony_ci struct ir3_block *b = ctx->block; 155bf215546Sopenharmony_ci struct ir3_instruction *atomic; 156bf215546Sopenharmony_ci type_t type = TYPE_U32; 157bf215546Sopenharmony_ci 158bf215546Sopenharmony_ci struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[0]); 159bf215546Sopenharmony_ci 160bf215546Sopenharmony_ci struct ir3_instruction *data = ir3_get_src(ctx, &intr->src[2])[0]; 161bf215546Sopenharmony_ci /* 64b byte offset */ 162bf215546Sopenharmony_ci struct ir3_instruction *byte_offset = 163bf215546Sopenharmony_ci byte_offset_to_address(ctx, &intr->src[0], ir3_get_src(ctx, &intr->src[1])[0]); 164bf215546Sopenharmony_ci /* dword offset for everything but comp_swap */ 165bf215546Sopenharmony_ci struct ir3_instruction *src3 = ir3_get_src(ctx, &intr->src[3])[0]; 166bf215546Sopenharmony_ci 167bf215546Sopenharmony_ci switch (intr->intrinsic) { 168bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_add_ir3: 169bf215546Sopenharmony_ci atomic = ir3_ATOMIC_S_ADD(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0); 170bf215546Sopenharmony_ci break; 171bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_imin_ir3: 172bf215546Sopenharmony_ci atomic = ir3_ATOMIC_S_MIN(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0); 173bf215546Sopenharmony_ci type = TYPE_S32; 174bf215546Sopenharmony_ci break; 175bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_umin_ir3: 176bf215546Sopenharmony_ci atomic = ir3_ATOMIC_S_MIN(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0); 177bf215546Sopenharmony_ci break; 178bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_imax_ir3: 179bf215546Sopenharmony_ci atomic = ir3_ATOMIC_S_MAX(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0); 180bf215546Sopenharmony_ci type = TYPE_S32; 181bf215546Sopenharmony_ci break; 182bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_umax_ir3: 183bf215546Sopenharmony_ci atomic = ir3_ATOMIC_S_MAX(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0); 184bf215546Sopenharmony_ci break; 185bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_and_ir3: 186bf215546Sopenharmony_ci atomic = ir3_ATOMIC_S_AND(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0); 187bf215546Sopenharmony_ci break; 188bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_or_ir3: 189bf215546Sopenharmony_ci atomic = ir3_ATOMIC_S_OR(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0); 190bf215546Sopenharmony_ci break; 191bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_xor_ir3: 192bf215546Sopenharmony_ci atomic = ir3_ATOMIC_S_XOR(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0); 193bf215546Sopenharmony_ci break; 194bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_exchange_ir3: 195bf215546Sopenharmony_ci atomic = ir3_ATOMIC_S_XCHG(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0); 196bf215546Sopenharmony_ci break; 197bf215546Sopenharmony_ci case nir_intrinsic_ssbo_atomic_comp_swap_ir3: 198bf215546Sopenharmony_ci /* for cmpxchg, src0 is [ui]vec2(data, compare): */ 199bf215546Sopenharmony_ci data = ir3_collect(b, src3, data); 200bf215546Sopenharmony_ci struct ir3_instruction *dword_offset = ir3_get_src(ctx, &intr->src[4])[0]; 201bf215546Sopenharmony_ci atomic = ir3_ATOMIC_S_CMPXCHG(b, ssbo, 0, data, 0, dword_offset, 0, 202bf215546Sopenharmony_ci byte_offset, 0); 203bf215546Sopenharmony_ci break; 204bf215546Sopenharmony_ci default: 205bf215546Sopenharmony_ci unreachable("boo"); 206bf215546Sopenharmony_ci } 207bf215546Sopenharmony_ci 208bf215546Sopenharmony_ci atomic->cat6.iim_val = 1; 209bf215546Sopenharmony_ci atomic->cat6.d = 4; 210bf215546Sopenharmony_ci atomic->cat6.type = type; 211bf215546Sopenharmony_ci atomic->barrier_class = IR3_BARRIER_BUFFER_W; 212bf215546Sopenharmony_ci atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; 213bf215546Sopenharmony_ci 214bf215546Sopenharmony_ci /* even if nothing consume the result, we can't DCE the instruction: */ 215bf215546Sopenharmony_ci array_insert(b, b->keeps, atomic); 216bf215546Sopenharmony_ci 217bf215546Sopenharmony_ci return atomic; 218bf215546Sopenharmony_ci} 219bf215546Sopenharmony_ci 220bf215546Sopenharmony_cistatic struct ir3_instruction * 221bf215546Sopenharmony_ciget_image_offset(struct ir3_context *ctx, const nir_intrinsic_instr *instr, 222bf215546Sopenharmony_ci struct ir3_instruction *const *coords, bool byteoff) 223bf215546Sopenharmony_ci{ 224bf215546Sopenharmony_ci struct ir3_block *b = ctx->block; 225bf215546Sopenharmony_ci struct ir3_instruction *offset; 226bf215546Sopenharmony_ci unsigned index = nir_src_as_uint(instr->src[0]); 227bf215546Sopenharmony_ci unsigned ncoords = ir3_get_image_coords(instr, NULL); 228bf215546Sopenharmony_ci 229bf215546Sopenharmony_ci /* to calculate the byte offset (yes, uggg) we need (up to) three 230bf215546Sopenharmony_ci * const values to know the bytes per pixel, and y and z stride: 231bf215546Sopenharmony_ci */ 232bf215546Sopenharmony_ci unsigned cb; 233bf215546Sopenharmony_ci if (ctx->compiler->gen > 4) { 234bf215546Sopenharmony_ci const struct ir3_const_state *const_state = ir3_const_state(ctx->so); 235bf215546Sopenharmony_ci assert(const_state->image_dims.mask & (1 << index)); 236bf215546Sopenharmony_ci 237bf215546Sopenharmony_ci cb = regid(const_state->offsets.image_dims, 0) + 238bf215546Sopenharmony_ci const_state->image_dims.off[index]; 239bf215546Sopenharmony_ci } else { 240bf215546Sopenharmony_ci index += ctx->s->info.num_ssbos; 241bf215546Sopenharmony_ci cb = A4XX_SSBO_CB_BASE(index); 242bf215546Sopenharmony_ci } 243bf215546Sopenharmony_ci 244bf215546Sopenharmony_ci /* offset = coords.x * bytes_per_pixel: */ 245bf215546Sopenharmony_ci if (ctx->compiler->gen == 4) 246bf215546Sopenharmony_ci offset = ir3_MUL_S24(b, coords[0], 0, create_uniform(b, cb + 3), 0); 247bf215546Sopenharmony_ci else 248bf215546Sopenharmony_ci offset = ir3_MUL_S24(b, coords[0], 0, create_uniform(b, cb + 0), 0); 249bf215546Sopenharmony_ci if (ncoords > 1) { 250bf215546Sopenharmony_ci /* offset += coords.y * y_pitch: */ 251bf215546Sopenharmony_ci offset = 252bf215546Sopenharmony_ci ir3_MAD_S24(b, create_uniform(b, cb + 1), 0, coords[1], 0, offset, 0); 253bf215546Sopenharmony_ci } 254bf215546Sopenharmony_ci if (ncoords > 2) { 255bf215546Sopenharmony_ci /* offset += coords.z * z_pitch: */ 256bf215546Sopenharmony_ci offset = 257bf215546Sopenharmony_ci ir3_MAD_S24(b, create_uniform(b, cb + 2), 0, coords[2], 0, offset, 0); 258bf215546Sopenharmony_ci } 259bf215546Sopenharmony_ci 260bf215546Sopenharmony_ci /* a4xx: must add in the base address: */ 261bf215546Sopenharmony_ci if (ctx->compiler->gen == 4) 262bf215546Sopenharmony_ci offset = ir3_ADD_U(b, offset, 0, create_uniform(b, cb + 0), 0); 263bf215546Sopenharmony_ci 264bf215546Sopenharmony_ci if (!byteoff) { 265bf215546Sopenharmony_ci /* Some cases, like atomics, seem to use dword offset instead 266bf215546Sopenharmony_ci * of byte offsets.. blob just puts an extra shr.b in there 267bf215546Sopenharmony_ci * in those cases: 268bf215546Sopenharmony_ci */ 269bf215546Sopenharmony_ci offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); 270bf215546Sopenharmony_ci } 271bf215546Sopenharmony_ci 272bf215546Sopenharmony_ci if (fd_dev_64b(ctx->compiler->dev_id)) 273bf215546Sopenharmony_ci return ir3_collect(b, offset, create_immed(b, 0)); 274bf215546Sopenharmony_ci else 275bf215546Sopenharmony_ci return offset; 276bf215546Sopenharmony_ci} 277bf215546Sopenharmony_ci 278bf215546Sopenharmony_ci/* src[] = { deref, coord, sample_index }. const_index[] = {} */ 279bf215546Sopenharmony_cistatic void 280bf215546Sopenharmony_ciemit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr, 281bf215546Sopenharmony_ci struct ir3_instruction **dst) 282bf215546Sopenharmony_ci{ 283bf215546Sopenharmony_ci struct ir3_block *b = ctx->block; 284bf215546Sopenharmony_ci struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]); 285bf215546Sopenharmony_ci struct ir3_instruction *ibo = ir3_image_to_ibo(ctx, intr->src[0]); 286bf215546Sopenharmony_ci struct ir3_instruction *offset = get_image_offset(ctx, intr, coords, true); 287bf215546Sopenharmony_ci unsigned ncoords = ir3_get_image_coords(intr, NULL); 288bf215546Sopenharmony_ci unsigned ncomp = 289bf215546Sopenharmony_ci ir3_get_num_components_for_image_format(nir_intrinsic_format(intr)); 290bf215546Sopenharmony_ci 291bf215546Sopenharmony_ci struct ir3_instruction *ldib; 292bf215546Sopenharmony_ci /* At least A420 does not have LDIB. Use LDGB and perform conversion 293bf215546Sopenharmony_ci * ourselves. 294bf215546Sopenharmony_ci * 295bf215546Sopenharmony_ci * TODO: Actually do the conversion. ES 3.1 only requires this for 296bf215546Sopenharmony_ci * single-component 32-bit types anyways. 297bf215546Sopenharmony_ci */ 298bf215546Sopenharmony_ci if (ctx->compiler->gen > 4) { 299bf215546Sopenharmony_ci ldib = ir3_LDIB( 300bf215546Sopenharmony_ci b, ibo, 0, offset, 0, ir3_create_collect(b, coords, ncoords), 0); 301bf215546Sopenharmony_ci } else { 302bf215546Sopenharmony_ci ldib = ir3_LDGB( 303bf215546Sopenharmony_ci b, ibo, 0, offset, 0, ir3_create_collect(b, coords, ncoords), 0); 304bf215546Sopenharmony_ci switch (nir_intrinsic_format(intr)) { 305bf215546Sopenharmony_ci case PIPE_FORMAT_R32_UINT: 306bf215546Sopenharmony_ci case PIPE_FORMAT_R32_SINT: 307bf215546Sopenharmony_ci case PIPE_FORMAT_R32_FLOAT: 308bf215546Sopenharmony_ci break; 309bf215546Sopenharmony_ci default: 310bf215546Sopenharmony_ci /* For some reason even more 32-bit components don't work. */ 311bf215546Sopenharmony_ci assert(0); 312bf215546Sopenharmony_ci break; 313bf215546Sopenharmony_ci } 314bf215546Sopenharmony_ci } 315bf215546Sopenharmony_ci ldib->dsts[0]->wrmask = MASK(intr->num_components); 316bf215546Sopenharmony_ci ldib->cat6.iim_val = ncomp; 317bf215546Sopenharmony_ci ldib->cat6.d = ncoords; 318bf215546Sopenharmony_ci ldib->cat6.type = ir3_get_type_for_image_intrinsic(intr); 319bf215546Sopenharmony_ci ldib->cat6.typed = true; 320bf215546Sopenharmony_ci ldib->barrier_class = IR3_BARRIER_IMAGE_R; 321bf215546Sopenharmony_ci ldib->barrier_conflict = IR3_BARRIER_IMAGE_W; 322bf215546Sopenharmony_ci 323bf215546Sopenharmony_ci ir3_split_dest(b, dst, ldib, 0, intr->num_components); 324bf215546Sopenharmony_ci} 325bf215546Sopenharmony_ci 326bf215546Sopenharmony_ci/* src[] = { index, coord, sample_index, value }. const_index[] = {} */ 327bf215546Sopenharmony_cistatic void 328bf215546Sopenharmony_ciemit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) 329bf215546Sopenharmony_ci{ 330bf215546Sopenharmony_ci struct ir3_block *b = ctx->block; 331bf215546Sopenharmony_ci struct ir3_instruction *stib, *offset; 332bf215546Sopenharmony_ci struct ir3_instruction *const *value = ir3_get_src(ctx, &intr->src[3]); 333bf215546Sopenharmony_ci struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]); 334bf215546Sopenharmony_ci struct ir3_instruction *ibo = ir3_image_to_ibo(ctx, intr->src[0]); 335bf215546Sopenharmony_ci unsigned ncoords = ir3_get_image_coords(intr, NULL); 336bf215546Sopenharmony_ci unsigned ncomp = 337bf215546Sopenharmony_ci ir3_get_num_components_for_image_format(nir_intrinsic_format(intr)); 338bf215546Sopenharmony_ci 339bf215546Sopenharmony_ci /* src0 is value 340bf215546Sopenharmony_ci * src1 is coords 341bf215546Sopenharmony_ci * src2 is 64b byte offset 342bf215546Sopenharmony_ci */ 343bf215546Sopenharmony_ci 344bf215546Sopenharmony_ci offset = get_image_offset(ctx, intr, coords, true); 345bf215546Sopenharmony_ci 346bf215546Sopenharmony_ci /* NOTE: stib seems to take byte offset, but stgb.typed can be used 347bf215546Sopenharmony_ci * too and takes a dword offset.. not quite sure yet why blob uses 348bf215546Sopenharmony_ci * one over the other in various cases. 349bf215546Sopenharmony_ci */ 350bf215546Sopenharmony_ci 351bf215546Sopenharmony_ci stib = ir3_STIB(b, ibo, 0, ir3_create_collect(b, value, ncomp), 0, 352bf215546Sopenharmony_ci ir3_create_collect(b, coords, ncoords), 0, offset, 0); 353bf215546Sopenharmony_ci stib->cat6.iim_val = ncomp; 354bf215546Sopenharmony_ci stib->cat6.d = ncoords; 355bf215546Sopenharmony_ci stib->cat6.type = ir3_get_type_for_image_intrinsic(intr); 356bf215546Sopenharmony_ci stib->cat6.typed = true; 357bf215546Sopenharmony_ci stib->barrier_class = IR3_BARRIER_IMAGE_W; 358bf215546Sopenharmony_ci stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W; 359bf215546Sopenharmony_ci 360bf215546Sopenharmony_ci array_insert(b, b->keeps, stib); 361bf215546Sopenharmony_ci} 362bf215546Sopenharmony_ci 363bf215546Sopenharmony_ci/* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */ 364bf215546Sopenharmony_cistatic struct ir3_instruction * 365bf215546Sopenharmony_ciemit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) 366bf215546Sopenharmony_ci{ 367bf215546Sopenharmony_ci struct ir3_block *b = ctx->block; 368bf215546Sopenharmony_ci struct ir3_instruction *atomic, *src0, *src1, *src2; 369bf215546Sopenharmony_ci struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]); 370bf215546Sopenharmony_ci struct ir3_instruction *image = ir3_image_to_ibo(ctx, intr->src[0]); 371bf215546Sopenharmony_ci unsigned ncoords = ir3_get_image_coords(intr, NULL); 372bf215546Sopenharmony_ci 373bf215546Sopenharmony_ci /* src0 is value (or uvec2(value, compare)) 374bf215546Sopenharmony_ci * src1 is coords 375bf215546Sopenharmony_ci * src2 is 64b byte offset 376bf215546Sopenharmony_ci */ 377bf215546Sopenharmony_ci src0 = ir3_get_src(ctx, &intr->src[3])[0]; 378bf215546Sopenharmony_ci src1 = ir3_create_collect(b, coords, ncoords); 379bf215546Sopenharmony_ci src2 = get_image_offset(ctx, intr, coords, ctx->compiler->gen == 4); 380bf215546Sopenharmony_ci 381bf215546Sopenharmony_ci switch (intr->intrinsic) { 382bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_add: 383bf215546Sopenharmony_ci atomic = ir3_ATOMIC_S_ADD(b, image, 0, src0, 0, src1, 0, src2, 0); 384bf215546Sopenharmony_ci break; 385bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_imin: 386bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_umin: 387bf215546Sopenharmony_ci atomic = ir3_ATOMIC_S_MIN(b, image, 0, src0, 0, src1, 0, src2, 0); 388bf215546Sopenharmony_ci break; 389bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_imax: 390bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_umax: 391bf215546Sopenharmony_ci atomic = ir3_ATOMIC_S_MAX(b, image, 0, src0, 0, src1, 0, src2, 0); 392bf215546Sopenharmony_ci break; 393bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_and: 394bf215546Sopenharmony_ci atomic = ir3_ATOMIC_S_AND(b, image, 0, src0, 0, src1, 0, src2, 0); 395bf215546Sopenharmony_ci break; 396bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_or: 397bf215546Sopenharmony_ci atomic = ir3_ATOMIC_S_OR(b, image, 0, src0, 0, src1, 0, src2, 0); 398bf215546Sopenharmony_ci break; 399bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_xor: 400bf215546Sopenharmony_ci atomic = ir3_ATOMIC_S_XOR(b, image, 0, src0, 0, src1, 0, src2, 0); 401bf215546Sopenharmony_ci break; 402bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_exchange: 403bf215546Sopenharmony_ci atomic = ir3_ATOMIC_S_XCHG(b, image, 0, src0, 0, src1, 0, src2, 0); 404bf215546Sopenharmony_ci break; 405bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_comp_swap: 406bf215546Sopenharmony_ci /* for cmpxchg, src0 is [ui]vec2(data, compare): */ 407bf215546Sopenharmony_ci src0 = ir3_collect(b, ir3_get_src(ctx, &intr->src[4])[0], src0); 408bf215546Sopenharmony_ci atomic = ir3_ATOMIC_S_CMPXCHG(b, image, 0, src0, 0, src1, 0, src2, 0); 409bf215546Sopenharmony_ci break; 410bf215546Sopenharmony_ci default: 411bf215546Sopenharmony_ci unreachable("boo"); 412bf215546Sopenharmony_ci } 413bf215546Sopenharmony_ci 414bf215546Sopenharmony_ci atomic->cat6.iim_val = 1; 415bf215546Sopenharmony_ci atomic->cat6.d = ncoords; 416bf215546Sopenharmony_ci atomic->cat6.type = ir3_get_type_for_image_intrinsic(intr); 417bf215546Sopenharmony_ci atomic->cat6.typed = ctx->compiler->gen == 5; 418bf215546Sopenharmony_ci atomic->barrier_class = IR3_BARRIER_IMAGE_W; 419bf215546Sopenharmony_ci atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W; 420bf215546Sopenharmony_ci 421bf215546Sopenharmony_ci /* even if nothing consume the result, we can't DCE the instruction: */ 422bf215546Sopenharmony_ci array_insert(b, b->keeps, atomic); 423bf215546Sopenharmony_ci 424bf215546Sopenharmony_ci return atomic; 425bf215546Sopenharmony_ci} 426bf215546Sopenharmony_ci 427bf215546Sopenharmony_cistatic struct ir3_instruction * 428bf215546Sopenharmony_ciemit_intrinsic_atomic_global(struct ir3_context *ctx, nir_intrinsic_instr *intr) 429bf215546Sopenharmony_ci{ 430bf215546Sopenharmony_ci unreachable("Global atomic are unimplemented on A5xx"); 431bf215546Sopenharmony_ci} 432bf215546Sopenharmony_ci 433bf215546Sopenharmony_ciconst struct ir3_context_funcs ir3_a4xx_funcs = { 434bf215546Sopenharmony_ci .emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo, 435bf215546Sopenharmony_ci .emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo, 436bf215546Sopenharmony_ci .emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo, 437bf215546Sopenharmony_ci .emit_intrinsic_load_image = emit_intrinsic_load_image, 438bf215546Sopenharmony_ci .emit_intrinsic_store_image = emit_intrinsic_store_image, 439bf215546Sopenharmony_ci .emit_intrinsic_atomic_image = emit_intrinsic_atomic_image, 440bf215546Sopenharmony_ci .emit_intrinsic_image_size = emit_intrinsic_image_size_tex, 441bf215546Sopenharmony_ci .emit_intrinsic_load_global_ir3 = NULL, 442bf215546Sopenharmony_ci .emit_intrinsic_store_global_ir3 = NULL, 443bf215546Sopenharmony_ci .emit_intrinsic_atomic_global = emit_intrinsic_atomic_global, 444bf215546Sopenharmony_ci}; 445