1/* 2 * Copyright © 2021 Ilia Mirkin 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 */ 23 24#include "ir3/ir3_compiler.h" 25 26#include "util/u_math.h" 27#include "util/u_queue.h" 28#include "util/half_float.h" 29 30#include "adreno_pm4.xml.h" 31#include "adreno_common.xml.h" 32#include "a4xx.xml.h" 33 34#include "ir3_asm.h" 35#include "main.h" 36 37struct a4xx_backend { 38 struct backend base; 39 40 struct ir3_compiler *compiler; 41 struct fd_device *dev; 42}; 43define_cast(backend, a4xx_backend); 44 45/* 46 * Backend implementation: 47 */ 48 49static struct kernel * 50a4xx_assemble(struct backend *b, FILE *in) 51{ 52 struct a4xx_backend *a4xx_backend = to_a4xx_backend(b); 53 struct ir3_kernel *ir3_kernel = ir3_asm_assemble(a4xx_backend->compiler, in); 54 ir3_kernel->backend = b; 55 return &ir3_kernel->base; 56} 57 58static void 59a4xx_disassemble(struct kernel *kernel, FILE *out) 60{ 61 ir3_asm_disassemble(to_ir3_kernel(kernel), out); 62} 63 64static void 65cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel) 66{ 67 struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel); 68 struct ir3_shader_variant *v = ir3_kernel->v; 69 const struct ir3_info *i = &v->info; 70 enum a3xx_threadsize thrsz = i->double_threadsize ? FOUR_QUADS : TWO_QUADS; 71 72 OUT_PKT0(ring, REG_A4XX_UCHE_INVALIDATE0, 2); 73 OUT_RING(ring, 0x00000000); 74 OUT_RING(ring, 0x00000012); 75 76 OUT_WFI(ring); 77 78 OUT_PKT0(ring, REG_A4XX_SP_MODE_CONTROL, 1); 79 OUT_RING(ring, 0x0000001e); 80 81 OUT_PKT0(ring, REG_A4XX_TPL1_TP_MODE_CONTROL, 1); 82 OUT_RING(ring, 0x00000038); 83 84 OUT_PKT0(ring, REG_A4XX_TPL1_TP_FS_TEX_COUNT, 1); 85 OUT_RING(ring, 0x00000000); 86 87 OUT_WFI(ring); 88 89 OUT_PKT0(ring, REG_A4XX_HLSQ_MODE_CONTROL, 1); 90 OUT_RING(ring, 0x00000003); 91 92 OUT_PKT0(ring, REG_A4XX_HLSQ_CONTROL_0_REG, 1); 93 OUT_RING(ring, 0x080005f0); 94 95 OUT_PKT0(ring, REG_A4XX_HLSQ_UPDATE_CONTROL, 1); 96 OUT_RING(ring, 0x00000038); 97 98 OUT_PKT0(ring, REG_A4XX_SP_SP_CTRL_REG, 1); 99 OUT_RING(ring, 0x00860010); 100 // OUT_RING(ring, 0x00920000); 101 102 OUT_PKT0(ring, REG_A4XX_SP_INSTR_CACHE_CTRL, 1); 103 OUT_RING(ring, 0x000004ff); 104 // OUT_RING(ring, 0x00000260); 105 106 OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG1, 1); 107 OUT_RING(ring, 0x80000000); 108 109 OUT_PKT0(ring, REG_A4XX_SP_CS_CTRL_REG0, 1); 110 OUT_RING(ring, 111 A4XX_SP_CS_CTRL_REG0_THREADSIZE(thrsz) | 112 A4XX_SP_CS_CTRL_REG0_SUPERTHREADMODE | 113 A4XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT(i->max_half_reg + 1) | 114 A4XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(i->max_reg + 1)); 115 116 OUT_PKT0(ring, REG_A4XX_HLSQ_CS_CONTROL_REG, 1); 117 OUT_RING(ring, A4XX_HLSQ_CS_CONTROL_REG_CONSTOBJECTOFFSET(0) | 118 A4XX_HLSQ_CS_CONTROL_REG_SHADEROBJOFFSET(0) | 119 A4XX_HLSQ_CS_CONTROL_REG_ENABLED | 120 A4XX_HLSQ_CS_CONTROL_REG_INSTRLENGTH(1) | 121 COND(v->has_ssbo, A4XX_HLSQ_CS_CONTROL_REG_SSBO_ENABLE) | 122 A4XX_HLSQ_CS_CONTROL_REG_CONSTLENGTH(v->constlen / 4)); 123 124 OUT_PKT0(ring, REG_A4XX_SP_CS_OBJ_START, 1); 125 OUT_RELOC(ring, v->bo, 0, 0, 0); /* SP_CS_OBJ_START */ 126 127 OUT_PKT0(ring, REG_A4XX_SP_CS_LENGTH_REG, 1); 128 OUT_RING(ring, v->instrlen); 129 130 uint32_t local_invocation_id, work_group_id, num_wg_id; 131 local_invocation_id = 132 ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID); 133 work_group_id = ir3_kernel->info.wgid; 134 num_wg_id = ir3_kernel->info.numwg; 135 136 OUT_PKT0(ring, REG_A4XX_HLSQ_CL_CONTROL_0, 2); 137 OUT_RING(ring, A4XX_HLSQ_CL_CONTROL_0_WGIDCONSTID(work_group_id) | 138 A4XX_HLSQ_CL_CONTROL_0_KERNELDIMCONSTID(regid(63, 0)) | 139 A4XX_HLSQ_CL_CONTROL_0_LOCALIDREGID(local_invocation_id)); 140 OUT_RING(ring, A4XX_HLSQ_CL_CONTROL_1_UNK0CONSTID(regid(63, 0)) | 141 A4XX_HLSQ_CL_CONTROL_1_WORKGROUPSIZECONSTID(regid(63, 0))); 142 143 OUT_PKT0(ring, REG_A4XX_HLSQ_CL_KERNEL_CONST, 1); 144 OUT_RING(ring, A4XX_HLSQ_CL_KERNEL_CONST_UNK0CONSTID(regid(63, 0)) | 145 A4XX_HLSQ_CL_KERNEL_CONST_NUMWGCONSTID(num_wg_id)); 146 147 OUT_PKT0(ring, REG_A4XX_HLSQ_CL_WG_OFFSET, 1); 148 OUT_RING(ring, A4XX_HLSQ_CL_WG_OFFSET_UNK0CONSTID(regid(63, 0))); 149 150 OUT_PKT3(ring, CP_LOAD_STATE4, 2); 151 OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | 152 CP_LOAD_STATE4_0_STATE_SRC(SS4_INDIRECT) | 153 CP_LOAD_STATE4_0_STATE_BLOCK(SB4_CS_SHADER) | 154 CP_LOAD_STATE4_0_NUM_UNIT(v->instrlen)); 155 OUT_RELOC(ring, v->bo, 0, CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER), 0); 156} 157 158static void 159emit_const(struct fd_ringbuffer *ring, struct kernel *kernel, uint32_t constid, uint32_t sizedwords, 160 const uint32_t *dwords) 161{ 162 uint32_t align_sz; 163 164 assert((constid % 4) == 0); 165 166 /* Overwrite appropriate entries with buffer addresses */ 167 struct fd_bo **replacements = calloc(sizedwords, sizeof(struct fd_bo *)); 168 for (int i = 0; i < MAX_BUFS; i++) { 169 if (kernel->buf_addr_regs[i] != INVALID_REG) { 170 int idx = kernel->buf_addr_regs[i]; 171 assert(idx < sizedwords); 172 173 replacements[idx] = kernel->bufs[i]; 174 } 175 } 176 177 align_sz = align(sizedwords, 4); 178 179 OUT_PKT3(ring, CP_LOAD_STATE4, 2 + align_sz); 180 OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(constid / 4) | 181 CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | 182 CP_LOAD_STATE4_0_STATE_BLOCK(SB4_CS_SHADER) | 183 CP_LOAD_STATE4_0_NUM_UNIT(DIV_ROUND_UP(sizedwords, 4))); 184 OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) | 185 CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS)); 186 for (unsigned i = 0; i < sizedwords; i++) { 187 if (replacements[i]) 188 OUT_RELOC(ring, replacements[i], 0, 0, 0); 189 else 190 OUT_RING(ring, dwords[i]); 191 } 192 193 /* Zero-pad to multiple of 4 dwords */ 194 for (uint32_t i = sizedwords; i < align_sz; i++) { 195 OUT_RING(ring, 0); 196 } 197 198 free(replacements); 199} 200 201static void 202cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel, 203 uint32_t grid[3]) 204{ 205 struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel); 206 struct ir3_shader_variant *v = ir3_kernel->v; 207 208 const struct ir3_const_state *const_state = ir3_const_state(v); 209 uint32_t base = const_state->offsets.immediate; 210 int size = DIV_ROUND_UP(const_state->immediates_count, 4); 211 212 /* truncate size to avoid writing constants that shader 213 * does not use: 214 */ 215 size = MIN2(size + base, v->constlen) - base; 216 217 /* convert out of vec4: */ 218 base *= 4; 219 size *= 4; 220 221 if (size > 0) { 222 emit_const(ring, kernel, base, size, const_state->immediates); 223 } 224} 225 226static void 227cs_ibo_emit(struct fd_ringbuffer *ring, struct fd_submit *submit, 228 struct kernel *kernel) 229{ 230 OUT_PKT3(ring, CP_LOAD_STATE4, 2 + (4 * kernel->num_bufs)); 231 OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | 232 CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | 233 CP_LOAD_STATE4_0_STATE_BLOCK(SB4_CS_SSBO) | 234 CP_LOAD_STATE4_0_NUM_UNIT(kernel->num_bufs)); 235 OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER) | 236 CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); 237 for (unsigned i = 0; i < kernel->num_bufs; i++) { 238 OUT_RELOC(ring, kernel->bufs[i], 0, 0, 0); 239#if 1 240 OUT_RING(ring, 0); 241 OUT_RING(ring, 0); 242 OUT_RING(ring, 0); 243#else 244 OUT_RING(ring, kernel->buf_sizes[i]); 245 OUT_RING(ring, kernel->buf_sizes[i]); 246 OUT_RING(ring, 0x00000004); 247#endif 248 } 249 250 OUT_PKT3(ring, CP_LOAD_STATE4, 2 + (2 * kernel->num_bufs)); 251 OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | 252 CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | 253 CP_LOAD_STATE4_0_STATE_BLOCK(SB4_CS_SSBO) | 254 CP_LOAD_STATE4_0_NUM_UNIT(kernel->num_bufs)); 255 OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS) | 256 CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); 257 for (unsigned i = 0; i < kernel->num_bufs; i++) { 258 unsigned sz = kernel->buf_sizes[i]; 259 260 /* width is in dwords, overflows into height: */ 261 sz /= 4; 262 263#if 1 264 OUT_RING(ring, A4XX_SSBO_1_0_WIDTH(sz)); 265 OUT_RING(ring, A4XX_SSBO_1_1_HEIGHT(sz >> 16)); 266#else 267 OUT_RING(ring, A4XX_SSBO_1_0_WIDTH(sz) | 268 A4XX_SSBO_1_0_FMT(RB4_R32_UINT) | 269 A4XX_SSBO_1_0_CPP(4)); 270 OUT_RING(ring, A4XX_SSBO_1_1_HEIGHT(DIV_ROUND_UP(sz, 1 << 16)) | 271 A4XX_SSBO_1_1_DEPTH(1)); 272#endif 273 } 274} 275 276static void 277a4xx_emit_grid(struct kernel *kernel, uint32_t grid[3], 278 struct fd_submit *submit) 279{ 280 struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( 281 submit, 0, FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE); 282 283 cs_program_emit(ring, kernel); 284 cs_const_emit(ring, kernel, grid); 285 cs_ibo_emit(ring, submit, kernel); 286 287 const unsigned *local_size = kernel->local_size; 288 const unsigned *num_groups = grid; 289 290 unsigned work_dim = 0; 291 for (int i = 0; i < 3; i++) { 292 if (!grid[i]) 293 break; 294 work_dim++; 295 } 296 297 OUT_PKT0(ring, REG_A4XX_HLSQ_CL_NDRANGE_0, 7); 298 OUT_RING(ring, A4XX_HLSQ_CL_NDRANGE_0_KERNELDIM(work_dim) | 299 A4XX_HLSQ_CL_NDRANGE_0_LOCALSIZEX(local_size[0] - 1) | 300 A4XX_HLSQ_CL_NDRANGE_0_LOCALSIZEY(local_size[1] - 1) | 301 A4XX_HLSQ_CL_NDRANGE_0_LOCALSIZEZ(local_size[2] - 1)); 302 OUT_RING(ring, 303 A4XX_HLSQ_CL_NDRANGE_1_SIZE_X(local_size[0] * num_groups[0])); 304 OUT_RING(ring, 0); /* HLSQ_CL_NDRANGE_2_GLOBALOFF_X */ 305 OUT_RING(ring, 306 A4XX_HLSQ_CL_NDRANGE_3_SIZE_Y(local_size[1] * num_groups[1])); 307 OUT_RING(ring, 0); /* HLSQ_CL_NDRANGE_4_GLOBALOFF_Y */ 308 OUT_RING(ring, 309 A4XX_HLSQ_CL_NDRANGE_5_SIZE_Z(local_size[2] * num_groups[2])); 310 OUT_RING(ring, 0); /* HLSQ_CL_NDRANGE_6_GLOBALOFF_Z */ 311 312#if 1 313 OUT_PKT3(ring, CP_EXEC_CS, 4); 314 OUT_RING(ring, 0x00000000); 315 OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(grid[0])); 316 OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(grid[1])); 317 OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(grid[2])); 318#else 319 OUT_PKT0(ring, REG_A4XX_HLSQ_CL_KERNEL_GROUP_X, 3); 320 OUT_RING(ring, grid[0]); /* HLSQ_CL_KERNEL_GROUP_X */ 321 OUT_RING(ring, grid[1]); /* HLSQ_CL_KERNEL_GROUP_Y */ 322 OUT_RING(ring, grid[2]); /* HLSQ_CL_KERNEL_GROUP_Z */ 323 324 OUT_PKT3(ring, CP_RUN_OPENCL, 1); 325 OUT_RING(ring, 0); 326#endif 327 328 OUT_WFI(ring); 329 330 /* TODO: cache_flush */ 331} 332 333struct backend * 334a4xx_init(struct fd_device *dev, const struct fd_dev_id *dev_id) 335{ 336 struct a4xx_backend *a4xx_backend = calloc(1, sizeof(*a4xx_backend)); 337 338 a4xx_backend->base = (struct backend){ 339 .assemble = a4xx_assemble, 340 .disassemble = a4xx_disassemble, 341 .emit_grid = a4xx_emit_grid, 342 }; 343 344 a4xx_backend->compiler = ir3_compiler_create(dev, dev_id, 345 &(struct ir3_compiler_options) {}); 346 a4xx_backend->dev = dev; 347 348 return &a4xx_backend->base; 349} 350