/*
 * Copyright © 2019 Google LLC
 * SPDX-License-Identifier: MIT
 */

#ifndef TU_CS_H
#define TU_CS_H

#include "tu_common.h"

#include "freedreno_pm4.h"

#include "tu_drm.h"

/* For breadcrumbs we may open a network socket based on an envvar; it's not
 * something that should be enabled by default.
 */
#define TU_BREADCRUMBS_ENABLED 0

enum tu_cs_mode
{

   /*
    * A command stream in TU_CS_MODE_GROW mode grows automatically whenever it
    * is full. tu_cs_begin must be called before command packet emission and
    * tu_cs_end must be called after.
    *
    * This mode may create multiple entries internally. The entries must be
    * submitted together.
    */
   TU_CS_MODE_GROW,

   /*
    * A command stream in TU_CS_MODE_EXTERNAL mode wraps an external,
    * fixed-size buffer. tu_cs_begin and tu_cs_end are optional and have no
    * effect on it.
    *
    * This mode does not create any entry or any BO.
    */
   TU_CS_MODE_EXTERNAL,

   /*
    * A command stream in TU_CS_MODE_SUB_STREAM mode does not support direct
    * command packet emission. tu_cs_begin_sub_stream must be called to get a
    * sub-stream to emit command packets to. When done with the sub-stream,
    * tu_cs_end_sub_stream must be called.
    *
    * This mode does not create any entry internally.
    */
   TU_CS_MODE_SUB_STREAM,
};

struct tu_cs_entry
{
   /* No ownership */
   const struct tu_bo *bo;

   uint32_t size;
   uint32_t offset;
};

struct tu_cs_memory {
   uint32_t *map;
   uint64_t iova;
};

struct tu_draw_state {
   uint64_t iova : 48;
   uint32_t size : 16;
};

#define TU_COND_EXEC_STACK_SIZE 4

struct tu_cs
{
   uint32_t *start;
   uint32_t *cur;
   uint32_t *reserved_end;
   uint32_t *end;

   struct tu_device *device;
   enum tu_cs_mode mode;
   uint32_t next_bo_size;

   struct tu_cs_entry *entries;
   uint32_t entry_count;
   uint32_t entry_capacity;

   struct tu_bo **bos;
   uint32_t bo_count;
   uint32_t bo_capacity;

   /* Optional BO that this CS is sub-allocated from for TU_CS_MODE_SUB_STREAM */
   struct tu_bo *refcount_bo;

   /* state for cond_exec_start/cond_exec_end */
   uint32_t cond_stack_depth;
   uint32_t cond_flags[TU_COND_EXEC_STACK_SIZE];
   uint32_t *cond_dwords[TU_COND_EXEC_STACK_SIZE];

   uint32_t breadcrumb_emit_after;
};

void
tu_breadcrumbs_init(struct tu_device *device);

void
tu_breadcrumbs_finish(struct tu_device *device);

void
tu_cs_init(struct tu_cs *cs,
           struct tu_device *device,
           enum tu_cs_mode mode,
           uint32_t initial_size);

void
tu_cs_init_external(struct tu_cs *cs, struct tu_device *device,
                    uint32_t *start, uint32_t *end);

void
tu_cs_init_suballoc(struct tu_cs *cs, struct tu_device *device,
                    struct tu_suballoc_bo *bo);

void
tu_cs_finish(struct tu_cs *cs);

void
tu_cs_begin(struct tu_cs *cs);

void
tu_cs_end(struct tu_cs *cs);

VkResult
tu_cs_begin_sub_stream(struct tu_cs *cs, uint32_t size, struct tu_cs *sub_cs);

VkResult
tu_cs_alloc(struct tu_cs *cs,
            uint32_t count,
            uint32_t size,
            struct tu_cs_memory *memory);

struct tu_cs_entry
tu_cs_end_sub_stream(struct tu_cs *cs, struct tu_cs *sub_cs);

static inline struct tu_draw_state
tu_cs_end_draw_state(struct tu_cs *cs, struct tu_cs *sub_cs)
{
   struct tu_cs_entry entry = tu_cs_end_sub_stream(cs, sub_cs);
   return (struct tu_draw_state) {
      .iova = entry.bo->iova + entry.offset,
      .size = entry.size / sizeof(uint32_t),
   };
}
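/* An illustrative sketch, not driver code: the typical lifecycle of a
 * TU_CS_MODE_GROW stream, per the mode description above. "device" and the
 * initial size of 4096 are assumed placeholders.
 *
 *    struct tu_cs cs;
 *    tu_cs_init(&cs, device, TU_CS_MODE_GROW, 4096);
 *
 *    tu_cs_begin(&cs);
 *    ...emit command packets; the stream grows, possibly adding entries...
 *    tu_cs_end(&cs);
 *
 *    ...submit all of cs.entries together...
 *
 *    tu_cs_finish(&cs);
 */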
VkResult
tu_cs_reserve_space(struct tu_cs *cs, uint32_t reserved_size);

static inline struct tu_draw_state
tu_cs_draw_state(struct tu_cs *sub_cs, struct tu_cs *cs, uint32_t size)
{
   struct tu_cs_memory memory;

   /* TODO: clean this up */
   tu_cs_alloc(sub_cs, size, 1, &memory);
   tu_cs_init_external(cs, sub_cs->device, memory.map, memory.map + size);
   tu_cs_begin(cs);
   tu_cs_reserve_space(cs, size);

   return (struct tu_draw_state) {
      .iova = memory.iova,
      .size = size,
   };
}

void
tu_cs_reset(struct tu_cs *cs);

VkResult
tu_cs_add_entries(struct tu_cs *cs, struct tu_cs *target);

/**
 * Get the size of the command packets emitted since the last call to
 * tu_cs_add_entry.
 */
static inline uint32_t
tu_cs_get_size(const struct tu_cs *cs)
{
   return cs->cur - cs->start;
}

/**
 * Return true if no command packets have been emitted since the last call to
 * tu_cs_add_entry.
 */
static inline uint32_t
tu_cs_is_empty(const struct tu_cs *cs)
{
   return tu_cs_get_size(cs) == 0;
}

/**
 * Discard all entries. This allows \a cs to be reused while keeping the
 * existing BOs and command packets intact.
 */
static inline void
tu_cs_discard_entries(struct tu_cs *cs)
{
   assert(cs->mode == TU_CS_MODE_GROW);
   cs->entry_count = 0;
}

/**
 * Get the size needed for tu_cs_emit_call.
 */
static inline uint32_t
tu_cs_get_call_size(const struct tu_cs *cs)
{
   assert(cs->mode == TU_CS_MODE_GROW);
   /* each CP_INDIRECT_BUFFER needs 4 dwords */
   return cs->entry_count * 4;
}

/**
 * Assert that we did not exceed the reserved space.
 */
static inline void
tu_cs_sanity_check(const struct tu_cs *cs)
{
   assert(cs->start <= cs->cur);
   assert(cs->cur <= cs->reserved_end);
   assert(cs->reserved_end <= cs->end);
}

void
tu_cs_emit_sync_breadcrumb(struct tu_cs *cs, uint8_t opcode, uint16_t cnt);

/**
 * Emit a uint32_t value into a command stream, without boundary checking.
 */
static inline void
tu_cs_emit(struct tu_cs *cs, uint32_t value)
{
   assert(cs->cur < cs->reserved_end);
   *cs->cur = value;
   ++cs->cur;

#if TU_BREADCRUMBS_ENABLED
   cs->breadcrumb_emit_after--;
   if (cs->breadcrumb_emit_after == 0)
      tu_cs_emit_sync_breadcrumb(cs, -1, 0);
#endif
}

/**
 * Emit an array of uint32_t into a command stream, without boundary checking.
 */
static inline void
tu_cs_emit_array(struct tu_cs *cs, const uint32_t *values, uint32_t length)
{
   assert(cs->cur + length <= cs->reserved_end);
   memcpy(cs->cur, values, sizeof(uint32_t) * length);
   cs->cur += length;
}
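/* An illustrative sketch, not driver code: carving a sub-stream out of a
 * TU_CS_MODE_SUB_STREAM stream and calling into it later. The size of 64 and
 * the names sub_stream_cs/main_cs/reg/value are placeholders;
 * tu_cs_emit_write_reg and tu_cs_emit_ib are defined below.
 *
 *    struct tu_cs sub_cs;
 *    VkResult result = tu_cs_begin_sub_stream(&sub_stream_cs, 64, &sub_cs);
 *    if (result != VK_SUCCESS)
 *       return result;
 *
 *    tu_cs_emit_write_reg(&sub_cs, reg, value);
 *
 *    struct tu_cs_entry entry = tu_cs_end_sub_stream(&sub_stream_cs, &sub_cs);
 *    tu_cs_emit_ib(&main_cs, &entry);
 */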
/**
 * Get the size of the remaining space in the current BO.
 */
static inline uint32_t
tu_cs_get_space(const struct tu_cs *cs)
{
   return cs->end - cs->cur;
}

static inline void
tu_cs_reserve(struct tu_cs *cs, uint32_t reserved_size)
{
   if (cs->mode != TU_CS_MODE_GROW) {
      assert(tu_cs_get_space(cs) >= reserved_size);
      assert(cs->reserved_end == cs->end);
      return;
   }

   if (tu_cs_get_space(cs) >= reserved_size &&
       cs->entry_count < cs->entry_capacity) {
      cs->reserved_end = cs->cur + reserved_size;
      return;
   }

   ASSERTED VkResult result = tu_cs_reserve_space(cs, reserved_size);
   /* TODO: set this error in tu_cs and use it */
   assert(result == VK_SUCCESS);
}

/**
 * Emit a type-4 command packet header into a command stream.
 */
static inline void
tu_cs_emit_pkt4(struct tu_cs *cs, uint16_t regindx, uint16_t cnt)
{
   tu_cs_reserve(cs, cnt + 1);
   tu_cs_emit(cs, pm4_pkt4_hdr(regindx, cnt));
}

/**
 * Emit a type-7 command packet header into a command stream.
 */
static inline void
tu_cs_emit_pkt7(struct tu_cs *cs, uint8_t opcode, uint16_t cnt)
{
#if TU_BREADCRUMBS_ENABLED
   tu_cs_emit_sync_breadcrumb(cs, opcode, cnt + 1);
#endif

   tu_cs_reserve(cs, cnt + 1);
   tu_cs_emit(cs, pm4_pkt7_hdr(opcode, cnt));
}

static inline void
tu_cs_emit_wfi(struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_WAIT_FOR_IDLE, 0);
}

static inline void
tu_cs_emit_qw(struct tu_cs *cs, uint64_t value)
{
   tu_cs_emit(cs, (uint32_t) value);
   tu_cs_emit(cs, (uint32_t) (value >> 32));
}

static inline void
tu_cs_emit_write_reg(struct tu_cs *cs, uint16_t reg, uint32_t value)
{
   tu_cs_emit_pkt4(cs, reg, 1);
   tu_cs_emit(cs, value);
}

/**
 * Emit a CP_INDIRECT_BUFFER command packet.
 */
static inline void
tu_cs_emit_ib(struct tu_cs *cs, const struct tu_cs_entry *entry)
{
   assert(entry->bo);
   assert(entry->size && entry->offset + entry->size <= entry->bo->size);
   assert(entry->size % sizeof(uint32_t) == 0);
   assert(entry->offset % sizeof(uint32_t) == 0);

   tu_cs_emit_pkt7(cs, CP_INDIRECT_BUFFER, 3);
   tu_cs_emit_qw(cs, entry->bo->iova + entry->offset);
   tu_cs_emit(cs, entry->size / sizeof(uint32_t));
}

/* for compute which isn't using SET_DRAW_STATE */
static inline void
tu_cs_emit_state_ib(struct tu_cs *cs, struct tu_draw_state state)
{
   if (state.size) {
      tu_cs_emit_pkt7(cs, CP_INDIRECT_BUFFER, 3);
      tu_cs_emit_qw(cs, state.iova);
      tu_cs_emit(cs, state.size);
   }
}

/**
 * Emit a CP_INDIRECT_BUFFER command packet for each entry in the target
 * command stream.
 */
static inline void
tu_cs_emit_call(struct tu_cs *cs, const struct tu_cs *target)
{
   assert(target->mode == TU_CS_MODE_GROW);
   for (uint32_t i = 0; i < target->entry_count; i++)
      tu_cs_emit_ib(cs, target->entries + i);
}
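/* An illustrative sketch with placeholder names, not driver code: replaying
 * a grown stream from another stream. tu_cs_get_call_size (above) reports
 * how many dwords the CP_INDIRECT_BUFFER packets emitted here will occupy,
 * which can be used to size a fixed-size buffer up front.
 *
 *    uint32_t call_size = tu_cs_get_call_size(&secondary_cs);
 *    ...ensure primary_cs has at least call_size dwords of space...
 *    tu_cs_emit_call(&primary_cs, &secondary_cs);
 */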
/* Helpers for bracketing a large sequence of commands of unknown size inside
 * a CP_COND_REG_EXEC packet.
 */
static inline void
tu_cond_exec_start(struct tu_cs *cs, uint32_t cond_flags)
{
   assert(cs->mode == TU_CS_MODE_GROW);
   assert(cs->cond_stack_depth < TU_COND_EXEC_STACK_SIZE);

   tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
   tu_cs_emit(cs, cond_flags);

   cs->cond_flags[cs->cond_stack_depth] = cond_flags;
   cs->cond_dwords[cs->cond_stack_depth] = cs->cur;

   /* Emit dummy DWORD field here */
   tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(0));

   cs->cond_stack_depth++;
}

#define CP_COND_EXEC_0_RENDER_MODE_GMEM \
   (CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | CP_COND_REG_EXEC_0_GMEM)
#define CP_COND_EXEC_0_RENDER_MODE_SYSMEM \
   (CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | CP_COND_REG_EXEC_0_SYSMEM)

static inline void
tu_cond_exec_end(struct tu_cs *cs)
{
   assert(cs->cond_stack_depth > 0);
   cs->cond_stack_depth--;

   cs->cond_flags[cs->cond_stack_depth] = 0;
   /* Subtract one here to account for the DWORD field itself. */
   *cs->cond_dwords[cs->cond_stack_depth] =
      cs->cur - cs->cond_dwords[cs->cond_stack_depth] - 1;
}
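/* An illustrative sketch, not driver code: bracketing a GMEM-only command
 * sequence of unknown size. The DWORDS field of the CP_COND_REG_EXEC packet
 * is patched by tu_cond_exec_end once the length is known; nesting works up
 * to TU_COND_EXEC_STACK_SIZE levels.
 *
 *    tu_cond_exec_start(&cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
 *    ...emit command packets only executed when rendering to GMEM...
 *    tu_cond_exec_end(&cs);
 */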
/* Temporary struct for tracking a register state to be written, used by
 * a6xx-pack.h and tu_cs_emit_regs()
 */
struct tu_reg_value {
   uint32_t reg;
   uint64_t value;
   bool is_address;
   struct tu_bo *bo;
   bool bo_write;
   uint32_t bo_offset;
   uint32_t bo_shift;
};

#define fd_reg_pair tu_reg_value
#define __bo_type struct tu_bo *

#include "a6xx-pack.xml.h"

#define __assert_eq(a, b)                                               \
   do {                                                                 \
      if ((a) != (b)) {                                                 \
         fprintf(stderr, "assert failed: " #a " (0x%x) != " #b " (0x%x)\n", a, b); \
         assert((a) == (b));                                            \
      }                                                                 \
   } while (0)

#define __ONE_REG(i, regs)                                      \
   do {                                                         \
      if (i < ARRAY_SIZE(regs) && regs[i].reg > 0) {            \
         __assert_eq(regs[0].reg + i, regs[i].reg);             \
         if (regs[i].bo) {                                      \
            uint64_t v = regs[i].bo->iova + regs[i].bo_offset;  \
            v >>= regs[i].bo_shift;                             \
            v |= regs[i].value;                                 \
                                                                \
            *p++ = v;                                           \
            *p++ = v >> 32;                                     \
         } else {                                               \
            *p++ = regs[i].value;                               \
            if (regs[i].is_address)                             \
               *p++ = regs[i].value >> 32;                      \
         }                                                      \
      }                                                         \
   } while (0)

/* Emits a sequence of register writes in order using a pkt4. This will check
 * (at runtime on a !NDEBUG build) that the registers were actually set up in
 * order in the code.
 *
 * Note that references to buffers aren't automatically added to the CS,
 * unlike in freedreno. We are clever in various places to avoid duplicating
 * the reference add work.
 *
 * Also, 64-bit address registers don't currently have a way to set a 64-bit
 * address without having a reference to a BO, since the .dword field in the
 * register's struct is only 32 bits wide. We should fix this in the pack
 * codegen later.
 */
#define tu_cs_emit_regs(cs, ...) do {                    \
      const struct fd_reg_pair regs[] = { __VA_ARGS__ }; \
      unsigned count = ARRAY_SIZE(regs);                 \
                                                         \
      STATIC_ASSERT(ARRAY_SIZE(regs) > 0);               \
      STATIC_ASSERT(ARRAY_SIZE(regs) <= 16);             \
                                                         \
      tu_cs_emit_pkt4((cs), regs[0].reg, count);         \
      uint32_t *p = (cs)->cur;                           \
      __ONE_REG( 0, regs);                               \
      __ONE_REG( 1, regs);                               \
      __ONE_REG( 2, regs);                               \
      __ONE_REG( 3, regs);                               \
      __ONE_REG( 4, regs);                               \
      __ONE_REG( 5, regs);                               \
      __ONE_REG( 6, regs);                               \
      __ONE_REG( 7, regs);                               \
      __ONE_REG( 8, regs);                               \
      __ONE_REG( 9, regs);                               \
      __ONE_REG(10, regs);                               \
      __ONE_REG(11, regs);                               \
      __ONE_REG(12, regs);                               \
      __ONE_REG(13, regs);                               \
      __ONE_REG(14, regs);                               \
      __ONE_REG(15, regs);                               \
      (cs)->cur = p;                                     \
   } while (0)

#endif /* TU_CS_H */