/*
 * Copyright © 2022 Imagination Technologies Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <vulkan/vulkan.h>

#include "hwdef/rogue_hw_defs.h"
#include "hwdef/rogue_hw_utils.h"
#include "pvr_bo.h"
#include "pvr_csb.h"
#include "pvr_csb_enum_helpers.h"
#include "pvr_debug.h"
#include "pvr_job_common.h"
#include "pvr_job_context.h"
#include "pvr_job_render.h"
#include "pvr_pds.h"
#include "pvr_private.h"
#include "pvr_rogue_fw.h"
#include "pvr_types.h"
#include "pvr_winsys.h"
#include "util/compiler.h"
#include "util/macros.h"
#include "util/u_math.h"
#include "vk_alloc.h"
#include "vk_log.h"
#include "vk_util.h"

#define ROGUE_BIF_PM_FREELIST_BASE_ADDR_ALIGNSIZE 16U

/* FIXME: Is there a hardware define we can use instead? */
/* 1 DWord per PM physical page stored in the free list */
#define ROGUE_FREE_LIST_ENTRY_SIZE ((uint32_t)sizeof(uint32_t))

/* FIXME: The three defines below, for the number of PC, PD and PT entries in a
 * 4KB page, come from rgxmmudefs_km.h (meaning they're part of the
 * auto-generated hwdefs). Should these be defined in rogue_mmu.xml? Keeping in
 * mind that we probably only need these three values.
 */
#define ROGUE_NUM_PC_ENTRIES_PER_PAGE 0x400U

#define ROGUE_NUM_PD_ENTRIES_PER_PAGE 0x200U

#define ROGUE_NUM_PT_ENTRIES_PER_PAGE 0x200U

struct pvr_free_list {
   struct pvr_device *device;

   uint64_t size;

   struct pvr_bo *bo;

   struct pvr_winsys_free_list *ws_free_list;
};

/* Macrotile information.
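 *
 * A macrotile is a group of tiles that the parameter management (PM) hardware
 * treats as a unit when allocating and recycling parameter buffer space. The
 * fields below describe the tile and macrotile layout derived from the render
 * size and sample count in pvr_rt_mtile_info_init().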
 */
struct pvr_rt_mtile_info {
   uint32_t tile_size_x;
   uint32_t tile_size_y;

   uint32_t num_tiles_x;
   uint32_t num_tiles_y;

   uint32_t tiles_per_mtile_x;
   uint32_t tiles_per_mtile_y;

   uint32_t x_tile_max;
   uint32_t y_tile_max;

   uint32_t mtiles_x;
   uint32_t mtiles_y;

   uint32_t mtile_x1;
   uint32_t mtile_y1;
   uint32_t mtile_x2;
   uint32_t mtile_y2;
   uint32_t mtile_x3;
   uint32_t mtile_y3;

   uint32_t mtile_stride;
};

struct pvr_rt_dataset {
   struct pvr_device *device;

   /* RT dataset information */
   uint32_t width;
   uint32_t height;
   uint32_t samples;
   uint32_t layers;

   struct pvr_free_list *global_free_list;
   struct pvr_free_list *local_free_list;

   struct pvr_bo *vheap_rtc_bo;
   pvr_dev_addr_t vheap_dev_addr;
   pvr_dev_addr_t rtc_dev_addr;

   struct pvr_bo *tpc_bo;
   uint64_t tpc_stride;
   uint64_t tpc_size;

   struct pvr_winsys_rt_dataset *ws_rt_dataset;

   /* RT data information */
   struct pvr_bo *mta_mlist_bo;

   struct pvr_bo *rgn_headers_bo;
   uint64_t rgn_headers_stride;

   bool need_frag;

   uint8_t rt_data_idx;

   struct {
      pvr_dev_addr_t mta_dev_addr;
      pvr_dev_addr_t mlist_dev_addr;
      pvr_dev_addr_t rgn_headers_dev_addr;
   } rt_datas[ROGUE_NUM_RTDATAS];
};

VkResult pvr_free_list_create(struct pvr_device *device,
                              uint32_t initial_size,
                              uint32_t max_size,
                              uint32_t grow_size,
                              uint32_t grow_threshold,
                              struct pvr_free_list *parent_free_list,
                              struct pvr_free_list **const free_list_out)
{
   struct pvr_winsys_free_list *parent_ws_free_list =
      parent_free_list ? parent_free_list->ws_free_list : NULL;
   const uint64_t bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED |
                             PVR_BO_ALLOC_FLAG_PM_FW_PROTECT;
   struct pvr_free_list *free_list;
   uint32_t cache_line_size;
   uint32_t initial_num_pages;
   uint32_t grow_num_pages;
   uint32_t max_num_pages;
   uint64_t addr_alignment;
   uint64_t size_alignment;
   uint64_t size;
   VkResult result;

   assert((initial_size + grow_size) <= max_size);
   assert(max_size != 0);
   assert(grow_threshold <= 100);

   /* Make sure the free list is created with at least a single page. */
   if (initial_size == 0)
      initial_size = ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE;

   /* The freelist sizes must respect the PM freelist base address alignment
    * requirement. As the freelist entries are cached by the SLC, it's also
    * necessary to ensure the sizes respect the SLC cache line size to avoid
    * invalid entries appearing in the cache, which would be problematic after
    * a grow operation, as the SLC entries aren't invalidated. We do this by
    * making sure the freelist values are appropriately aligned.
    *
    * To calculate the alignment, we first take the largest of the freelist
    * base address alignment and the SLC cache line size. We then divide this
    * by the freelist entry size to determine the number of freelist entries
    * required by the PM. Finally, as each entry holds a single PM physical
    * page, we multiply the number of entries by the page size.
    *
    * As an example, if the base address alignment is 16 bytes, the SLC cache
    * line size is 64 bytes and the freelist entry size is 4 bytes then 16
    * entries are required, as we take the SLC cacheline size (being the larger
    * of the two values) and divide this by 4.
    * If the PM page size is 4096 bytes then we end up with an alignment of
    * 65536 bytes.
    */
   cache_line_size = rogue_get_slc_cache_line_size(&device->pdevice->dev_info);

   addr_alignment =
      MAX2(ROGUE_BIF_PM_FREELIST_BASE_ADDR_ALIGNSIZE, cache_line_size);
   size_alignment = (addr_alignment / ROGUE_FREE_LIST_ENTRY_SIZE) *
                    ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE;

   assert(util_is_power_of_two_nonzero(size_alignment));

   initial_size = align64(initial_size, size_alignment);
   max_size = align64(max_size, size_alignment);
   grow_size = align64(grow_size, size_alignment);

   /* Make sure the 'max' size doesn't exceed what the firmware supports and
    * adjust the other sizes accordingly.
    */
   if (max_size > ROGUE_FREE_LIST_MAX_SIZE) {
      max_size = ROGUE_FREE_LIST_MAX_SIZE;
      assert(align64(max_size, size_alignment) == max_size);
   }

   if (initial_size > max_size)
      initial_size = max_size;

   if (initial_size == max_size)
      grow_size = 0;

   initial_num_pages = initial_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
   max_num_pages = max_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
   grow_num_pages = grow_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;

   /* Calculate the size of the buffer needed to store the free list entries
    * based on the maximum number of pages we can have.
    */
   size = max_num_pages * ROGUE_FREE_LIST_ENTRY_SIZE;
   assert(align64(size, addr_alignment) == size);

   free_list = vk_alloc(&device->vk.alloc,
                        sizeof(*free_list),
                        8,
                        VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!free_list)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* FIXME: The memory is mapped GPU uncached, but this seems to contradict
    * the comment above about aligning to the SLC cache line size.
    */
   result = pvr_bo_alloc(device,
                         device->heaps.general_heap,
                         size,
                         addr_alignment,
                         bo_flags,
                         &free_list->bo);
   if (result != VK_SUCCESS)
      goto err_vk_free_free_list;

   result = device->ws->ops->free_list_create(device->ws,
                                              free_list->bo->vma,
                                              initial_num_pages,
                                              max_num_pages,
                                              grow_num_pages,
                                              grow_threshold,
                                              parent_ws_free_list,
                                              &free_list->ws_free_list);
   if (result != VK_SUCCESS)
      goto err_pvr_bo_free_bo;

   free_list->device = device;
   free_list->size = size;

   *free_list_out = free_list;

   return VK_SUCCESS;

err_pvr_bo_free_bo:
   pvr_bo_free(device, free_list->bo);

err_vk_free_free_list:
   vk_free(&device->vk.alloc, free_list);

   return result;
}

void pvr_free_list_destroy(struct pvr_free_list *free_list)
{
   struct pvr_device *device = free_list->device;

   device->ws->ops->free_list_destroy(free_list->ws_free_list);
   pvr_bo_free(device, free_list->bo);
   vk_free(&device->vk.alloc, free_list);
}

static inline void pvr_get_samples_in_xy(uint32_t samples,
                                         uint32_t *const x_out,
                                         uint32_t *const y_out)
{
   switch (samples) {
   case 1:
      *x_out = 1;
      *y_out = 1;
      break;
   case 2:
      *x_out = 1;
      *y_out = 2;
      break;
   case 4:
      *x_out = 2;
      *y_out = 2;
      break;
   case 8:
      *x_out = 2;
      *y_out = 4;
      break;
   default:
      unreachable("Unsupported number of samples");
   }
}

static void pvr_rt_mtile_info_init(struct pvr_device *device,
                                   struct pvr_rt_mtile_info *info,
                                   uint32_t width,
                                   uint32_t height,
                                   uint32_t samples)
{
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   uint32_t samples_in_x;
   uint32_t samples_in_y;

   pvr_get_samples_in_xy(samples, &samples_in_x, &samples_in_y);

   info->tile_size_x = PVR_GET_FEATURE_VALUE(dev_info, tile_size_x, 1);
   info->tile_size_y = PVR_GET_FEATURE_VALUE(dev_info, tile_size_y, 1);

   info->num_tiles_x = DIV_ROUND_UP(width, info->tile_size_x);
   info->num_tiles_y = DIV_ROUND_UP(height, info->tile_size_y);

   rogue_get_num_macrotiles_xy(dev_info, &info->mtiles_x, &info->mtiles_y);

   if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
      assert(PVR_GET_FEATURE_VALUE(dev_info,
                                   simple_parameter_format_version,
                                   0) == 2);
      /* Set up 16 macrotiles with a multiple of 2x2 tiles per macrotile,
       * which is aligned to a tile group.
       */
      info->mtile_x1 = DIV_ROUND_UP(info->num_tiles_x, 8) * 2;
      info->mtile_y1 = DIV_ROUND_UP(info->num_tiles_y, 8) * 2;
      info->mtile_x2 = 0;
      info->mtile_y2 = 0;
      info->mtile_x3 = 0;
      info->mtile_y3 = 0;
      info->x_tile_max = ALIGN_POT(info->num_tiles_x, 2) - 1;
      info->y_tile_max = ALIGN_POT(info->num_tiles_y, 2) - 1;
   } else {
      /* Set up 16 macrotiles with a multiple of 4x4 tiles per macrotile.
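       *
       * Purely illustrative example (not taken from real hardware values):
       * with a 32x32 tile size and a 1024x768 render, num_tiles_x = 32 and
       * num_tiles_y = 24, giving
       * mtile_x1 = ALIGN_POT(DIV_ROUND_UP(32, 4), 4) = 8 and
       * mtile_y1 = ALIGN_POT(DIV_ROUND_UP(24, 4), 4) = 8, with the x2/y2 and
       * x3/y3 boundaries at two and three times those values.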
       */
      info->mtile_x1 = ALIGN_POT(DIV_ROUND_UP(info->num_tiles_x, 4), 4);
      info->mtile_y1 = ALIGN_POT(DIV_ROUND_UP(info->num_tiles_y, 4), 4);
      info->mtile_x2 = info->mtile_x1 * 2;
      info->mtile_y2 = info->mtile_y1 * 2;
      info->mtile_x3 = info->mtile_x1 * 3;
      info->mtile_y3 = info->mtile_y1 * 3;
      info->x_tile_max = info->num_tiles_x - 1;
      info->y_tile_max = info->num_tiles_y - 1;
   }

   info->tiles_per_mtile_x = info->mtile_x1 * samples_in_x;
   info->tiles_per_mtile_y = info->mtile_y1 * samples_in_y;

   info->mtile_stride = info->mtile_x1 * info->mtile_y1;
}

/* Note that the unit of the return value depends on the GPU. For cores with
 * the simple_internal_parameter_format feature the returned size is
 * interpreted as the number of region headers. For cores without this feature
 * it's interpreted as the size in dwords.
 */
static uint64_t
pvr_rt_get_isp_region_size(struct pvr_device *device,
                           const struct pvr_rt_mtile_info *mtile_info)
{
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   uint64_t rgn_size =
      mtile_info->tiles_per_mtile_x * mtile_info->tiles_per_mtile_y;

   if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
      uint32_t version;

      rgn_size *= mtile_info->mtiles_x * mtile_info->mtiles_y;

      if (PVR_FEATURE_VALUE(dev_info,
                            simple_parameter_format_version,
                            &version)) {
         version = 0;
      }

      if (version == 2) {
         /* One region header per 2x2 tile group. */
         rgn_size /= (2U * 2U);
      }
   } else {
      const uint64_t rgn_header_size = rogue_get_region_header_size(dev_info);

      /* Round up to the next dword to prevent IPF overrun and convert to
       * dwords.
       */
      rgn_size = DIV_ROUND_UP(rgn_size * rgn_header_size, 4);
   }

   return rgn_size;
}

static VkResult pvr_rt_vheap_rtc_data_init(struct pvr_device *device,
                                           struct pvr_rt_dataset *rt_dataset,
                                           uint32_t layers)
{
   const uint64_t bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED |
                             PVR_BO_ALLOC_FLAG_ZERO_ON_ALLOC;
   uint64_t vheap_size;
   uint32_t alignment;
   uint64_t rtc_size;
   VkResult result;

   vheap_size = ROGUE_CR_PM_VHEAP_TABLE_SIZE * ROGUE_PM_VHEAP_ENTRY_SIZE;

   if (layers > 1) {
      uint64_t rtc_entries;

      vheap_size = ALIGN_POT(vheap_size, PVRX(CR_TA_RTC_ADDR_BASE_ALIGNMENT));

      rtc_entries = ROGUE_NUM_TEAC + ROGUE_NUM_TE + ROGUE_NUM_VCE;
      if (PVR_HAS_QUIRK(&device->pdevice->dev_info, 48545))
         rtc_entries += ROGUE_NUM_TE;

      rtc_size = rtc_entries * ROGUE_RTC_SIZE_IN_BYTES;
   } else {
      rtc_size = 0;
   }

   alignment = MAX2(PVRX(CR_PM_VHEAP_TABLE_BASE_ADDR_ALIGNMENT),
                    PVRX(CR_TA_RTC_ADDR_BASE_ALIGNMENT));

   result = pvr_bo_alloc(device,
                         device->heaps.general_heap,
                         vheap_size + rtc_size,
                         alignment,
                         bo_flags,
                         &rt_dataset->vheap_rtc_bo);
   if (result != VK_SUCCESS)
      return result;

   rt_dataset->vheap_dev_addr = rt_dataset->vheap_rtc_bo->vma->dev_addr;

   if (rtc_size > 0) {
      rt_dataset->rtc_dev_addr =
         PVR_DEV_ADDR_OFFSET(rt_dataset->vheap_dev_addr, vheap_size);
   } else {
      rt_dataset->rtc_dev_addr = PVR_DEV_ADDR_INVALID;
   }

   return VK_SUCCESS;
}

static void pvr_rt_vheap_rtc_data_fini(struct pvr_rt_dataset *rt_dataset)
{
   rt_dataset->rtc_dev_addr = PVR_DEV_ADDR_INVALID;

   pvr_bo_free(rt_dataset->device, rt_dataset->vheap_rtc_bo);
   rt_dataset->vheap_rtc_bo = NULL;
}

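/* Compute the per-layer stride (in PM physical pages) and the total size in
 * bytes of the tail pointer cache (TPC) allocation. The entry count is based
 * on the smallest power-of-two square covering the tile grid; on cores with
 * version 2 of the simple parameter format there is one tail pointer per 2x2
 * tile group. The stride is only meaningful for layered rendering.
 */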
static void
pvr_rt_get_tail_ptr_stride_size(const struct pvr_device *device,
                                const struct pvr_rt_mtile_info *mtile_info,
                                uint32_t layers,
                                uint64_t *const stride_out,
                                uint64_t *const size_out)
{
   uint32_t max_num_mtiles;
   uint32_t num_mtiles_x;
   uint32_t num_mtiles_y;
   uint32_t version;
   uint64_t size;

   num_mtiles_x = mtile_info->mtiles_x * mtile_info->tiles_per_mtile_x;
   num_mtiles_y = mtile_info->mtiles_y * mtile_info->tiles_per_mtile_y;

   max_num_mtiles = MAX2(util_next_power_of_two64(num_mtiles_x),
                         util_next_power_of_two64(num_mtiles_y));

   size = max_num_mtiles * max_num_mtiles;

   if (PVR_FEATURE_VALUE(&device->pdevice->dev_info,
                         simple_parameter_format_version,
                         &version)) {
      version = 0;
   }

   if (version == 2) {
      /* One tail pointer cache entry per 2x2 tile group. */
      size /= (2U * 2U);
   }

   size *= ROGUE_TAIL_POINTER_SIZE;

   if (layers > 1) {
      size = ALIGN_POT(size, ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE);

      *stride_out = size / ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE;
      *size_out = size * layers;
   } else {
      *stride_out = 0;
      *size_out = size;
   }
}

static VkResult pvr_rt_tpc_data_init(struct pvr_device *device,
                                     struct pvr_rt_dataset *rt_dataset,
                                     const struct pvr_rt_mtile_info *mtile_info,
                                     uint32_t layers)
{
   const uint64_t bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED |
                             PVR_BO_ALLOC_FLAG_ZERO_ON_ALLOC;
   uint64_t tpc_size;

   pvr_rt_get_tail_ptr_stride_size(device,
                                   mtile_info,
                                   layers,
                                   &rt_dataset->tpc_stride,
                                   &rt_dataset->tpc_size);
   tpc_size = ALIGN_POT(rt_dataset->tpc_size, ROGUE_TE_TPC_CACHE_LINE_SIZE);

   return pvr_bo_alloc(device,
                       device->heaps.general_heap,
                       tpc_size,
                       PVRX(CR_TE_TPC_ADDR_BASE_ALIGNMENT),
                       bo_flags,
                       &rt_dataset->tpc_bo);
}

static void pvr_rt_tpc_data_fini(struct pvr_rt_dataset *rt_dataset)
{
   pvr_bo_free(rt_dataset->device, rt_dataset->tpc_bo);
   rt_dataset->tpc_bo = NULL;
}

static uint32_t
pvr_rt_get_mlist_size(const struct pvr_free_list *global_free_list,
                      const struct pvr_free_list *local_free_list)
{
   uint32_t num_pte_pages;
   uint32_t num_pde_pages;
   uint32_t num_pce_pages;
   uint64_t total_pages;
   uint32_t mlist_size;

   assert(global_free_list->size + local_free_list->size <=
          ROGUE_PM_MAX_PB_VIRT_ADDR_SPACE);

   total_pages = (global_free_list->size + local_free_list->size) >>
                 ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;

   /* Calculate the total number of physical pages required to hold the page
    * table, directory and catalog entries for the freelist pages.
    */
   num_pte_pages = DIV_ROUND_UP(total_pages, ROGUE_NUM_PT_ENTRIES_PER_PAGE);
   num_pde_pages = DIV_ROUND_UP(num_pte_pages, ROGUE_NUM_PD_ENTRIES_PER_PAGE);
   num_pce_pages = DIV_ROUND_UP(num_pde_pages, ROGUE_NUM_PC_ENTRIES_PER_PAGE);

   /* Calculate the MList size considering that the total number of pages in
    * the PB is shared among all the PM address spaces.
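    *
    * Illustrative numbers only: a combined freelist size of 64 MiB is 16384
    * 4KB PM pages, which needs DIV_ROUND_UP(16384, 512) = 32 page table
    * pages, 1 page directory page and 1 page catalog page; the MList then
    * holds one entry per such page for each PM address space.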
    */
   mlist_size = (num_pce_pages + num_pde_pages + num_pte_pages) *
                ROGUE_NUM_PM_ADDRESS_SPACES * ROGUE_MLIST_ENTRY_STRIDE;

   return ALIGN_POT(mlist_size, ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE);
}

static void pvr_rt_get_region_headers_stride_size(
   const struct pvr_device *device,
   const struct pvr_rt_mtile_info *mtile_info,
   uint32_t layers,
   uint64_t *const stride_out,
   uint64_t *const size_out)
{
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   const uint32_t rgn_header_size = rogue_get_region_header_size(dev_info);
   uint32_t rgn_headers_size;
   uint32_t num_tiles_x;
   uint32_t num_tiles_y;
   uint32_t group_size;
   uint32_t version;

   if (PVR_FEATURE_VALUE(dev_info, simple_parameter_format_version, &version))
      version = 0;

   group_size = version == 2 ? 2 : 1;

   num_tiles_x = mtile_info->mtiles_x * mtile_info->tiles_per_mtile_x;
   num_tiles_y = mtile_info->mtiles_y * mtile_info->tiles_per_mtile_y;

   rgn_headers_size =
      (num_tiles_x / group_size) * (num_tiles_y / group_size) * rgn_header_size;

   if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
      rgn_headers_size =
         ALIGN_POT(rgn_headers_size, PVRX(CR_TE_PSGREGION_ADDR_BASE_ALIGNMENT));
   }

   if (layers > 1) {
      rgn_headers_size =
         ALIGN_POT(rgn_headers_size, PVRX(CR_TE_PSG_REGION_STRIDE_UNIT_SIZE));
   }

   *stride_out = rgn_header_size;
   *size_out = rgn_headers_size * layers;
}

static VkResult
pvr_rt_mta_mlist_data_init(struct pvr_device *device,
                           struct pvr_rt_dataset *rt_dataset,
                           const struct pvr_free_list *global_free_list,
                           const struct pvr_free_list *local_free_list,
                           const struct pvr_rt_mtile_info *mtile_info)
{
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   const uint32_t mlist_size =
      pvr_rt_get_mlist_size(global_free_list, local_free_list);
   uint32_t mta_size = rogue_get_macrotile_array_size(dev_info);
   const uint32_t num_rt_datas = ARRAY_SIZE(rt_dataset->rt_datas);
   uint32_t rt_datas_mlist_size;
   uint32_t rt_datas_mta_size;
   pvr_dev_addr_t dev_addr;
   VkResult result;

   /* Allocate memory for macrotile array and Mlist for all RT datas.
    *
    * Allocation layout: MTA[0..N] + Mlist alignment padding + Mlist[0..N].
    *
    * N is number of RT datas.
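    *
    * There is one MTA/Mlist pair per RT data; pvr_render_job_submit() cycles
    * through the RT datas so that the geometry work for one job can overlap
    * the fragment work of the previous one.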
    */
   rt_datas_mta_size = ALIGN_POT(mta_size * num_rt_datas,
                                 PVRX(CR_PM_MLIST0_BASE_ADDR_ALIGNMENT));
   rt_datas_mlist_size = mlist_size * num_rt_datas;

   result = pvr_bo_alloc(device,
                         device->heaps.general_heap,
                         rt_datas_mta_size + rt_datas_mlist_size,
                         PVRX(CR_PM_MTILE_ARRAY_BASE_ADDR_ALIGNMENT),
                         PVR_BO_ALLOC_FLAG_GPU_UNCACHED,
                         &rt_dataset->mta_mlist_bo);
   if (result != VK_SUCCESS)
      return result;

   dev_addr = rt_dataset->mta_mlist_bo->vma->dev_addr;

   for (uint32_t i = 0; i < num_rt_datas; i++) {
      if (mta_size != 0) {
         rt_dataset->rt_datas[i].mta_dev_addr = dev_addr;
         dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, mta_size);
      } else {
         rt_dataset->rt_datas[i].mta_dev_addr = PVR_DEV_ADDR_INVALID;
      }
   }

   dev_addr = PVR_DEV_ADDR_OFFSET(rt_dataset->mta_mlist_bo->vma->dev_addr,
                                  rt_datas_mta_size);

   for (uint32_t i = 0; i < num_rt_datas; i++) {
      if (mlist_size != 0) {
         rt_dataset->rt_datas[i].mlist_dev_addr = dev_addr;
         dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, mlist_size);
      } else {
         rt_dataset->rt_datas[i].mlist_dev_addr = PVR_DEV_ADDR_INVALID;
      }
   }

   return VK_SUCCESS;
}

static void pvr_rt_mta_mlist_data_fini(struct pvr_rt_dataset *rt_dataset)
{
   for (uint32_t i = 0; i < ARRAY_SIZE(rt_dataset->rt_datas); i++) {
      rt_dataset->rt_datas[i].mlist_dev_addr = PVR_DEV_ADDR_INVALID;
      rt_dataset->rt_datas[i].mta_dev_addr = PVR_DEV_ADDR_INVALID;
   }

   pvr_bo_free(rt_dataset->device, rt_dataset->mta_mlist_bo);
   rt_dataset->mta_mlist_bo = NULL;
}

static VkResult
pvr_rt_rgn_headers_data_init(struct pvr_device *device,
                             struct pvr_rt_dataset *rt_dataset,
                             const struct pvr_rt_mtile_info *mtile_info,
                             uint32_t layers)
{
   const uint32_t num_rt_datas = ARRAY_SIZE(rt_dataset->rt_datas);
   uint64_t rgn_headers_size;
   pvr_dev_addr_t dev_addr;
   VkResult result;

   pvr_rt_get_region_headers_stride_size(device,
                                         mtile_info,
                                         layers,
                                         &rt_dataset->rgn_headers_stride,
                                         &rgn_headers_size);

   result = pvr_bo_alloc(device,
                         device->heaps.rgn_hdr_heap,
                         rgn_headers_size * num_rt_datas,
                         PVRX(CR_TE_PSGREGION_ADDR_BASE_ALIGNMENT),
                         PVR_BO_ALLOC_FLAG_GPU_UNCACHED,
                         &rt_dataset->rgn_headers_bo);
   if (result != VK_SUCCESS)
      return result;

   dev_addr = rt_dataset->rgn_headers_bo->vma->dev_addr;

   for (uint32_t i = 0; i < num_rt_datas; i++) {
      rt_dataset->rt_datas[i].rgn_headers_dev_addr = dev_addr;
      dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, rgn_headers_size);
   }

   return VK_SUCCESS;
}

static void pvr_rt_rgn_headers_data_fini(struct pvr_rt_dataset *rt_dataset)
{
   for (uint32_t i = 0; i < ARRAY_SIZE(rt_dataset->rt_datas); i++)
      rt_dataset->rt_datas[i].rgn_headers_dev_addr = PVR_DEV_ADDR_INVALID;

   pvr_bo_free(rt_dataset->device, rt_dataset->rgn_headers_bo);
   rt_dataset->rgn_headers_bo = NULL;
}

static VkResult pvr_rt_datas_init(struct pvr_device *device,
                                  struct pvr_rt_dataset *rt_dataset,
                                  const struct pvr_free_list *global_free_list,
                                  const struct pvr_free_list *local_free_list,
                                  const struct pvr_rt_mtile_info *mtile_info,
                                  uint32_t layers)
{
   VkResult result;

   result = pvr_rt_mta_mlist_data_init(device,
                                       rt_dataset,
                                       global_free_list,
                                       local_free_list,
                                       mtile_info);
   if (result != VK_SUCCESS)
      return result;

   result =
      pvr_rt_rgn_headers_data_init(device, rt_dataset, mtile_info, layers);
   if (result != VK_SUCCESS)
      goto err_pvr_rt_mta_mlist_data_fini;

   return VK_SUCCESS;

err_pvr_rt_mta_mlist_data_fini:
   pvr_rt_mta_mlist_data_fini(rt_dataset);

   return result;
}

static void pvr_rt_datas_fini(struct pvr_rt_dataset *rt_dataset)
{
   pvr_rt_rgn_headers_data_fini(rt_dataset);
   pvr_rt_mta_mlist_data_fini(rt_dataset);
}

static uint32_t
pvr_rogue_get_cr_isp_mtile_size_val(const struct pvr_device_info *dev_info,
                                    uint32_t samples,
                                    const struct pvr_rt_mtile_info *mtile_info)
{
   uint32_t samples_per_pixel =
      PVR_GET_FEATURE_VALUE(dev_info, isp_samples_per_pixel, 0);
   uint32_t isp_mtile_size;

   pvr_csb_pack (&isp_mtile_size, CR_ISP_MTILE_SIZE, value) {
      value.x = mtile_info->mtile_x1;
      value.y = mtile_info->mtile_y1;

      if (samples_per_pixel == 1) {
         if (samples >= 4)
            value.x <<= 1;

         if (samples >= 2)
            value.y <<= 1;
      } else if (samples_per_pixel == 2) {
         if (samples >= 8)
            value.x <<= 1;

         if (samples >= 4)
            value.y <<= 1;
      } else if (samples_per_pixel == 4) {
         if (samples >= 8)
            value.y <<= 1;
      } else {
         assert(!"Unsupported ISP samples per pixel value");
      }
   }

   return isp_mtile_size;
}

static uint64_t pvr_rogue_get_cr_multisamplectl_val(uint32_t samples,
                                                    bool y_flip)
{
   static const struct {
      uint8_t x[8];
      uint8_t y[8];
   } sample_positions[4] = {
      /* 1 sample */
      {
         .x = { 8 },
         .y = { 8 },
      },
      /* 2 samples */
      {
         .x = { 12, 4 },
         .y = { 12, 4 },
      },
      /* 4 samples */
      {
         .x = { 6, 14, 2, 10 },
         .y = { 2, 6, 10, 14 },
      },
      /* 8 samples */
      {
         .x = { 9, 7, 13, 5, 3, 1, 11, 15 },
         .y = { 5, 11, 9, 3, 13, 7, 15, 1 },
      },
   };
   uint64_t multisamplectl;
   uint8_t idx;

   idx = util_fast_log2(samples);
   assert(idx < ARRAY_SIZE(sample_positions));

   pvr_csb_pack (&multisamplectl, CR_PPP_MULTISAMPLECTL, value) {
      switch (samples) {
      case 8:
         value.msaa_x7 = sample_positions[idx].x[7];
         value.msaa_x6 = sample_positions[idx].x[6];
         value.msaa_x5 = sample_positions[idx].x[5];
         value.msaa_x4 = sample_positions[idx].x[4];

         if (y_flip) {
            value.msaa_y7 = 16U - sample_positions[idx].y[7];
            value.msaa_y6 = 16U - sample_positions[idx].y[6];
            value.msaa_y5 = 16U - sample_positions[idx].y[5];
            value.msaa_y4 = 16U - sample_positions[idx].y[4];
         } else {
            value.msaa_y7 = sample_positions[idx].y[7];
            value.msaa_y6 = sample_positions[idx].y[6];
            value.msaa_y5 = sample_positions[idx].y[5];
            value.msaa_y4 = sample_positions[idx].y[4];
         }

         FALLTHROUGH;
      case 4:
         value.msaa_x3 = sample_positions[idx].x[3];
         value.msaa_x2 = sample_positions[idx].x[2];

         if (y_flip) {
            value.msaa_y3 = 16U - sample_positions[idx].y[3];
            value.msaa_y2 = 16U - sample_positions[idx].y[2];
         } else {
            value.msaa_y3 = sample_positions[idx].y[3];
            value.msaa_y2 = sample_positions[idx].y[2];
         }

         FALLTHROUGH;
      case 2:
         value.msaa_x1 = sample_positions[idx].x[1];

         if (y_flip) {
            value.msaa_y1 = 16U - sample_positions[idx].y[1];
         } else {
            value.msaa_y1 = sample_positions[idx].y[1];
         }

         FALLTHROUGH;
      case 1:
         value.msaa_x0 = sample_positions[idx].x[0];

         if (y_flip) {
            value.msaa_y0 = 16U - sample_positions[idx].y[0];
         } else {
            value.msaa_y0 = sample_positions[idx].y[0];
         }

         break;
      default:
unreachable("Unsupported number of samples"); 887 } 888 } 889 890 return multisamplectl; 891} 892 893static uint32_t 894pvr_rogue_get_cr_te_aa_val(const struct pvr_device_info *dev_info, 895 uint32_t samples) 896{ 897 uint32_t samples_per_pixel = 898 PVR_GET_FEATURE_VALUE(dev_info, isp_samples_per_pixel, 0); 899 uint32_t te_aa; 900 901 pvr_csb_pack (&te_aa, CR_TE_AA, value) { 902 if (samples_per_pixel == 1) { 903 if (samples >= 2) 904 value.y = true; 905 if (samples >= 4) 906 value.x = true; 907 } else if (samples_per_pixel == 2) { 908 if (samples >= 2) 909 value.x2 = true; 910 if (samples >= 4) 911 value.y = true; 912 if (samples >= 8) 913 value.x = true; 914 } else if (samples_per_pixel == 4) { 915 if (samples >= 2) 916 value.x2 = true; 917 if (samples >= 4) 918 value.y2 = true; 919 if (samples >= 8) 920 value.y = true; 921 } else { 922 assert(!"Unsupported ISP samples per pixel value"); 923 } 924 } 925 926 return te_aa; 927} 928 929static void pvr_rt_dataset_ws_create_info_init( 930 struct pvr_rt_dataset *rt_dataset, 931 const struct pvr_rt_mtile_info *mtile_info, 932 struct pvr_winsys_rt_dataset_create_info *create_info) 933{ 934 struct pvr_device *device = rt_dataset->device; 935 const struct pvr_device_info *dev_info = &device->pdevice->dev_info; 936 937 memset(create_info, 0, sizeof(*create_info)); 938 939 /* Local freelist. */ 940 create_info->local_free_list = rt_dataset->local_free_list->ws_free_list; 941 942 /* ISP register values. */ 943 if (PVR_HAS_ERN(dev_info, 42307) && 944 !(PVR_HAS_FEATURE(dev_info, roguexe) && mtile_info->tile_size_x == 16)) { 945 float value; 946 947 if (rt_dataset->width != 0) { 948 value = 949 ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR / (float)rt_dataset->width; 950 create_info->isp_merge_lower_x = fui(value); 951 952 value = 953 ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR / (float)rt_dataset->width; 954 create_info->isp_merge_upper_x = fui(value); 955 } 956 957 if (rt_dataset->height != 0) { 958 value = 959 ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR / (float)rt_dataset->height; 960 create_info->isp_merge_lower_y = fui(value); 961 962 value = 963 ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR / (float)rt_dataset->height; 964 create_info->isp_merge_upper_y = fui(value); 965 } 966 967 value = ((float)rt_dataset->width * ROGUE_ISP_MERGE_SCALE_FACTOR) / 968 (ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR - 969 ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR); 970 create_info->isp_merge_scale_x = fui(value); 971 972 value = ((float)rt_dataset->height * ROGUE_ISP_MERGE_SCALE_FACTOR) / 973 (ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR - 974 ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR); 975 create_info->isp_merge_scale_y = fui(value); 976 } 977 978 create_info->isp_mtile_size = 979 pvr_rogue_get_cr_isp_mtile_size_val(dev_info, 980 rt_dataset->samples, 981 mtile_info); 982 983 /* PPP register values. */ 984 create_info->ppp_multi_sample_ctl = 985 pvr_rogue_get_cr_multisamplectl_val(rt_dataset->samples, false); 986 create_info->ppp_multi_sample_ctl_y_flipped = 987 pvr_rogue_get_cr_multisamplectl_val(rt_dataset->samples, true); 988 989 pvr_csb_pack (&create_info->ppp_screen, CR_PPP_SCREEN, value) { 990 value.pixxmax = rt_dataset->width - 1; 991 value.pixymax = rt_dataset->height - 1; 992 } 993 994 /* TE register values. 
    */
   create_info->te_aa =
      pvr_rogue_get_cr_te_aa_val(dev_info, rt_dataset->samples);

   pvr_csb_pack (&create_info->te_mtile1, CR_TE_MTILE1, value) {
      value.x1 = mtile_info->mtile_x1;
      if (!PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
         value.x2 = mtile_info->mtile_x2;
         value.x3 = mtile_info->mtile_x3;
      }
   }

   pvr_csb_pack (&create_info->te_mtile2, CR_TE_MTILE2, value) {
      value.y1 = mtile_info->mtile_y1;
      if (!PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
         value.y2 = mtile_info->mtile_y2;
         value.y3 = mtile_info->mtile_y3;
      }
   }

   pvr_csb_pack (&create_info->te_screen, CR_TE_SCREEN, value) {
      value.xmax = mtile_info->x_tile_max;
      value.ymax = mtile_info->y_tile_max;
   }

   /* Allocations and associated information. */
   create_info->vheap_table_dev_addr = rt_dataset->vheap_dev_addr;
   create_info->rtc_dev_addr = rt_dataset->rtc_dev_addr;

   create_info->tpc_dev_addr = rt_dataset->tpc_bo->vma->dev_addr;
   create_info->tpc_stride = rt_dataset->tpc_stride;
   create_info->tpc_size = rt_dataset->tpc_size;

   STATIC_ASSERT(ARRAY_SIZE(create_info->rt_datas) ==
                 ARRAY_SIZE(rt_dataset->rt_datas));
   for (uint32_t i = 0; i < ARRAY_SIZE(create_info->rt_datas); i++) {
      create_info->rt_datas[i].pm_mlist_dev_addr =
         rt_dataset->rt_datas[i].mlist_dev_addr;
      create_info->rt_datas[i].macrotile_array_dev_addr =
         rt_dataset->rt_datas[i].mta_dev_addr;
      create_info->rt_datas[i].rgn_header_dev_addr =
         rt_dataset->rt_datas[i].rgn_headers_dev_addr;
   }

   create_info->rgn_header_size =
      pvr_rt_get_isp_region_size(device, mtile_info);

   /* Miscellaneous. */
   create_info->mtile_stride = mtile_info->mtile_stride;
   create_info->max_rts = rt_dataset->layers;
}

VkResult
pvr_render_target_dataset_create(struct pvr_device *device,
                                 uint32_t width,
                                 uint32_t height,
                                 uint32_t samples,
                                 uint32_t layers,
                                 struct pvr_rt_dataset **const rt_dataset_out)
{
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   struct pvr_winsys_rt_dataset_create_info rt_dataset_create_info;
   struct pvr_rt_mtile_info mtile_info;
   struct pvr_rt_dataset *rt_dataset;
   VkResult result;

   assert(device->global_free_list);
   assert(width <= rogue_get_render_size_max_x(dev_info));
   assert(height <= rogue_get_render_size_max_y(dev_info));
   assert(layers > 0 && layers <= PVR_MAX_FRAMEBUFFER_LAYERS);

   pvr_rt_mtile_info_init(device, &mtile_info, width, height, samples);

   rt_dataset = vk_zalloc(&device->vk.alloc,
                          sizeof(*rt_dataset),
                          8,
                          VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!rt_dataset)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   rt_dataset->device = device;
   rt_dataset->width = width;
   rt_dataset->height = height;
   rt_dataset->samples = samples;
   rt_dataset->layers = layers;
   rt_dataset->global_free_list = device->global_free_list;

   /* The maximum supported free list size is based on the assumption that this
    * freelist (the "local" freelist) is always the minimum size required by
    * the hardware. See the documentation of ROGUE_FREE_LIST_MAX_SIZE for more
    * details.
    */
   result = pvr_free_list_create(device,
                                 rogue_get_min_free_list_size(dev_info),
                                 rogue_get_min_free_list_size(dev_info),
                                 0 /* grow_size */,
                                 0 /* grow_threshold */,
                                 rt_dataset->global_free_list,
                                 &rt_dataset->local_free_list);
   if (result != VK_SUCCESS)
      goto err_vk_free_rt_dataset;

   result = pvr_rt_vheap_rtc_data_init(device, rt_dataset, layers);
   if (result != VK_SUCCESS)
      goto err_pvr_free_list_destroy;

   result = pvr_rt_tpc_data_init(device, rt_dataset, &mtile_info, layers);
   if (result != VK_SUCCESS)
      goto err_pvr_rt_vheap_rtc_data_fini;

   result = pvr_rt_datas_init(device,
                              rt_dataset,
                              rt_dataset->global_free_list,
                              rt_dataset->local_free_list,
                              &mtile_info,
                              layers);
   if (result != VK_SUCCESS)
      goto err_pvr_rt_tpc_data_fini;

   /* rt_dataset must be fully initialized by this point since
    * pvr_rt_dataset_ws_create_info_init() depends on this.
    */
   pvr_rt_dataset_ws_create_info_init(rt_dataset,
                                      &mtile_info,
                                      &rt_dataset_create_info);

   result =
      device->ws->ops->render_target_dataset_create(device->ws,
                                                     &rt_dataset_create_info,
                                                     &rt_dataset->ws_rt_dataset);
   if (result != VK_SUCCESS)
      goto err_pvr_rt_datas_fini;

   *rt_dataset_out = rt_dataset;

   return VK_SUCCESS;

err_pvr_rt_datas_fini:
   pvr_rt_datas_fini(rt_dataset);

err_pvr_rt_tpc_data_fini:
   pvr_rt_tpc_data_fini(rt_dataset);

err_pvr_rt_vheap_rtc_data_fini:
   pvr_rt_vheap_rtc_data_fini(rt_dataset);

err_pvr_free_list_destroy:
   pvr_free_list_destroy(rt_dataset->local_free_list);

err_vk_free_rt_dataset:
   vk_free(&device->vk.alloc, rt_dataset);

   return result;
}

void pvr_render_target_dataset_destroy(struct pvr_rt_dataset *rt_dataset)
{
   struct pvr_device *device = rt_dataset->device;

   device->ws->ops->render_target_dataset_destroy(rt_dataset->ws_rt_dataset);

   pvr_rt_datas_fini(rt_dataset);
   pvr_rt_tpc_data_fini(rt_dataset);
   pvr_rt_vheap_rtc_data_fini(rt_dataset);

   pvr_free_list_destroy(rt_dataset->local_free_list);

   vk_free(&device->vk.alloc, rt_dataset);
}

static void
pvr_render_job_ws_geometry_state_init(struct pvr_render_ctx *ctx,
                                      struct pvr_render_job *job,
                                      struct pvr_winsys_geometry_state *state)
{
   const struct pvr_device_info *dev_info = &ctx->device->pdevice->dev_info;

   /* FIXME: Should this just be done unconditionally? The firmware will just
    * ignore the value anyway.
    */
   if (PVR_HAS_QUIRK(dev_info, 56279)) {
      pvr_csb_pack (&state->regs.pds_ctrl, CR_PDS_CTRL, value) {
         value.max_num_vdm_tasks = rogue_get_max_num_vdm_pds_tasks(dev_info);
      }
   } else {
      state->regs.pds_ctrl = 0;
   }

   pvr_csb_pack (&state->regs.ppp_ctrl, CR_PPP_CTRL, value) {
      value.wclampen = true;
      value.fixed_point_format = 1;
   }

   pvr_csb_pack (&state->regs.te_psg, CR_TE_PSG, value) {
      value.completeonterminate = job->geometry_terminate;

      value.region_stride = job->rt_dataset->rgn_headers_stride /
                            PVRX(CR_TE_PSG_REGION_STRIDE_UNIT_SIZE);

      value.forcenewstate = PVR_HAS_QUIRK(dev_info, 52942);
   }

   /* The set up of CR_TPU must be identical to
    * pvr_render_job_ws_fragment_state_init().
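    * This is asserted in pvr_render_job_ws_submit_info_init() once both
    * states have been initialized.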
    */
   pvr_csb_pack (&state->regs.tpu, CR_TPU, value) {
      value.tag_cem_4k_face_packing = true;
   }

   pvr_csb_pack (&state->regs.tpu_border_colour_table,
                 CR_TPU_BORDER_COLOUR_TABLE_VDM,
                 value) {
      value.border_colour_table_address = job->border_colour_table_addr;
   }

   pvr_csb_pack (&state->regs.vdm_ctrl_stream_base,
                 CR_VDM_CTRL_STREAM_BASE,
                 value) {
      value.addr = job->ctrl_stream_addr;
   }

   /* Set up the USC common size for the context switch resume/load program
    * (ctx->ctx_switch.programs[i].sr->pds_load_program), which was created
    * as part of the render context.
    */
   pvr_csb_pack (&state->regs.vdm_ctx_resume_task0_size,
                 VDMCTRL_PDS_STATE0,
                 value) {
      /* Calculate the size in bytes. */
      const uint16_t shared_registers_size = job->max_shared_registers * 4;

      value.usc_common_size =
         DIV_ROUND_UP(shared_registers_size,
                      PVRX(VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE));
   };

   state->flags = 0;

   if (!job->rt_dataset->need_frag)
      state->flags |= PVR_WINSYS_GEOM_FLAG_FIRST_GEOMETRY;

   if (job->geometry_terminate)
      state->flags |= PVR_WINSYS_GEOM_FLAG_LAST_GEOMETRY;

   if (job->frag_uses_atomic_ops)
      state->flags |= PVR_WINSYS_GEOM_FLAG_SINGLE_CORE;
}

static inline void
pvr_get_isp_num_tiles_xy(const struct pvr_device_info *dev_info,
                         uint32_t samples,
                         uint32_t width,
                         uint32_t height,
                         uint32_t *const x_out,
                         uint32_t *const y_out)
{
   uint32_t tile_samples_x;
   uint32_t tile_samples_y;
   uint32_t scale_x;
   uint32_t scale_y;

   rogue_get_isp_samples_per_tile_xy(dev_info,
                                     samples,
                                     &tile_samples_x,
                                     &tile_samples_y);

   switch (samples) {
   case 1:
      scale_x = 1;
      scale_y = 1;
      break;
   case 2:
      scale_x = 1;
      scale_y = 2;
      break;
   case 4:
      scale_x = 2;
      scale_y = 2;
      break;
   case 8:
      scale_x = 2;
      scale_y = 4;
      break;
   default:
      unreachable("Unsupported number of samples");
   }

   *x_out = DIV_ROUND_UP(width * scale_x, tile_samples_x);
   *y_out = DIV_ROUND_UP(height * scale_y, tile_samples_y);

   if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
      assert(PVR_GET_FEATURE_VALUE(dev_info,
                                   simple_parameter_format_version,
                                   0U) == 2U);
      /* Align to a 2x2 tile block. */
      *x_out = ALIGN_POT(*x_out, 2);
      *y_out = ALIGN_POT(*y_out, 2);
   }
}

static void
pvr_render_job_ws_fragment_state_init(struct pvr_render_ctx *ctx,
                                      struct pvr_render_job *job,
                                      struct pvr_winsys_fragment_state *state)
{
   const enum PVRX(CR_ISP_AA_MODE_TYPE)
      isp_aa_mode = pvr_cr_isp_aa_mode_type(job->samples);
   const struct pvr_device_runtime_info *dev_runtime_info =
      &ctx->device->pdevice->dev_runtime_info;
   const struct pvr_device_info *dev_info = &ctx->device->pdevice->dev_info;
   uint32_t isp_ctl;

   /* FIXME: what to do when job->run_frag is false? */

   /* FIXME: pass in the number of samples rather than isp_aa_mode?
    */
   pvr_setup_tiles_in_flight(dev_info,
                             dev_runtime_info,
                             isp_aa_mode,
                             job->pixel_output_width,
                             false,
                             job->max_tiles_in_flight,
                             &isp_ctl,
                             &state->regs.usc_pixel_output_ctrl);

   pvr_csb_pack (&state->regs.isp_ctl, CR_ISP_CTL, value) {
      value.sample_pos = true;

      /* FIXME: There are a number of things that cause this to be set, this
       * is just one of them.
       */
      value.process_empty_tiles = job->process_empty_tiles;
   }

   /* FIXME: When pvr_setup_tiles_in_flight() is refactored it might be
    * possible to fully pack CR_ISP_CTL above rather than having to OR in part
    * of the value.
    */
   state->regs.isp_ctl |= isp_ctl;

   pvr_csb_pack (&state->regs.isp_aa, CR_ISP_AA, value) {
      value.mode = isp_aa_mode;
   }

   /* The set up of CR_TPU must be identical to
    * pvr_render_job_ws_geometry_state_init().
    */
   pvr_csb_pack (&state->regs.tpu, CR_TPU, value) {
      value.tag_cem_4k_face_packing = true;
   }

   if (PVR_HAS_FEATURE(dev_info, cluster_grouping) &&
       PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls) &&
       dev_runtime_info->num_phantoms > 1 && job->frag_uses_atomic_ops) {
      /* Each phantom has its own MCU, so atomicity can only be guaranteed
       * when all work items are processed on the same phantom. This means we
       * need to disable all USCs other than those of the first phantom, which
       * has 4 clusters. Note that we only need to do this for atomic
       * operations in fragment shaders, since the hardware prevents the TA
       * from running on more than one phantom anyway.
       */
      state->regs.pixel_phantom = 0xF;
   } else {
      state->regs.pixel_phantom = 0;
   }

   pvr_csb_pack (&state->regs.isp_bgobjvals, CR_ISP_BGOBJVALS, value) {
      value.enablebgtag = job->enable_bg_tag;

      value.mask = true;

      /* FIXME: Hard code this for now as we don't currently support any
       * stencil image formats.
       */
      value.stencil = 0xFF;
   }

   pvr_csb_pack (&state->regs.isp_bgobjdepth, CR_ISP_BGOBJDEPTH, value) {
      /* FIXME: This is suitable for the single depth format the driver
       * currently supports, but may need updating to handle other depth
       * formats.
       */
      value.value = fui(job->depth_clear_value);
   }

   /* FIXME: Some additional set up needed to support depth and stencil
    * load/store operations.
    */
   pvr_csb_pack (&state->regs.isp_zlsctl, CR_ISP_ZLSCTL, value) {
      uint32_t aligned_width =
         ALIGN_POT(job->depth_physical_width, ROGUE_IPF_TILE_SIZE_PIXELS);
      uint32_t aligned_height =
         ALIGN_POT(job->depth_physical_height, ROGUE_IPF_TILE_SIZE_PIXELS);

      pvr_get_isp_num_tiles_xy(dev_info,
                               job->samples,
                               aligned_width,
                               aligned_height,
                               &value.zlsextent_x_z,
                               &value.zlsextent_y_z);
      value.zlsextent_x_z -= 1;
      value.zlsextent_y_z -= 1;

      if (job->depth_memlayout == PVR_MEMLAYOUT_TWIDDLED) {
         value.loadtwiddled = true;
         value.storetwiddled = true;
      }

      /* FIXME: This is suitable for the single depth format the driver
       * currently supports, but may need updating to handle other depth
       * formats.
       */
      assert(job->depth_vk_format == VK_FORMAT_D32_SFLOAT);
      value.zloadformat = PVRX(CR_ZLOADFORMAT_TYPE_F32Z);
      value.zstoreformat = PVRX(CR_ZSTOREFORMAT_TYPE_F32Z);
   }

   if (PVR_HAS_FEATURE(dev_info, zls_subtile)) {
      pvr_csb_pack (&state->regs.isp_zls_pixels, CR_ISP_ZLS_PIXELS, value) {
         value.x = job->depth_stride - 1;
         value.y = job->depth_height - 1;
      }
   } else {
      state->regs.isp_zls_pixels = 0;
   }

   pvr_csb_pack (&state->regs.isp_zload_store_base, CR_ISP_ZLOAD_BASE, value) {
      value.addr = job->depth_addr;
   }

   pvr_csb_pack (&state->regs.isp_stencil_load_store_base,
                 CR_ISP_STENCIL_LOAD_BASE,
                 value) {
      value.addr = job->stencil_addr;

      /* FIXME: May need to set value.enable to true. */
   }

   pvr_csb_pack (&state->regs.tpu_border_colour_table,
                 CR_TPU_BORDER_COLOUR_TABLE_PDM,
                 value) {
      value.border_colour_table_address = job->border_colour_table_addr;
   }

   state->regs.isp_oclqry_base = 0;

   pvr_csb_pack (&state->regs.isp_dbias_base, CR_ISP_DBIAS_BASE, value) {
      value.addr = job->depth_bias_table_addr;
   }

   pvr_csb_pack (&state->regs.isp_scissor_base, CR_ISP_SCISSOR_BASE, value) {
      value.addr = job->scissor_table_addr;
   }

   pvr_csb_pack (&state->regs.event_pixel_pds_info,
                 CR_EVENT_PIXEL_PDS_INFO,
                 value) {
      value.const_size =
         DIV_ROUND_UP(ctx->device->pixel_event_data_size_in_dwords,
                      PVRX(CR_EVENT_PIXEL_PDS_INFO_CONST_SIZE_UNIT_SIZE));
      value.temp_stride = 0;
      value.usc_sr_size =
         DIV_ROUND_UP(PVR_STATE_PBE_DWORDS,
                      PVRX(CR_EVENT_PIXEL_PDS_INFO_USC_SR_SIZE_UNIT_SIZE));
   }

   pvr_csb_pack (&state->regs.event_pixel_pds_data,
                 CR_EVENT_PIXEL_PDS_DATA,
                 value) {
      value.addr = PVR_DEV_ADDR(job->pds_pixel_event_data_offset);
   }

   STATIC_ASSERT(ARRAY_SIZE(state->regs.pbe_word) ==
                 ARRAY_SIZE(job->pbe_reg_words));
   STATIC_ASSERT(ARRAY_SIZE(state->regs.pbe_word[0]) ==
                 ARRAY_SIZE(job->pbe_reg_words[0]));

   for (uint32_t i = 0; i < ARRAY_SIZE(job->pbe_reg_words); i++) {
      state->regs.pbe_word[i][0] = job->pbe_reg_words[i][0];
      state->regs.pbe_word[i][1] = job->pbe_reg_words[i][1];
      state->regs.pbe_word[i][2] = job->pbe_reg_words[i][2];
   }

   STATIC_ASSERT(__same_type(state->regs.pds_bgnd, job->pds_bgnd_reg_values));
   typed_memcpy(state->regs.pds_bgnd,
                job->pds_bgnd_reg_values,
                ARRAY_SIZE(state->regs.pds_bgnd));

   memset(state->regs.pds_pr_bgnd, 0, sizeof(state->regs.pds_pr_bgnd));

   /* FIXME: Merge geometry and fragment flags into a single flags member? */
   /* FIXME: move to its own function?
    */
   state->flags = 0;

   if (job->depth_addr.addr)
      state->flags |= PVR_WINSYS_FRAG_FLAG_DEPTH_BUFFER_PRESENT;

   if (job->stencil_addr.addr)
      state->flags |= PVR_WINSYS_FRAG_FLAG_STENCIL_BUFFER_PRESENT;

   if (job->disable_compute_overlap)
      state->flags |= PVR_WINSYS_FRAG_FLAG_PREVENT_CDM_OVERLAP;

   if (job->frag_uses_atomic_ops)
      state->flags |= PVR_WINSYS_FRAG_FLAG_SINGLE_CORE;

   state->zls_stride = job->depth_layer_size;
   state->sls_stride = job->depth_layer_size;
}

static void pvr_render_job_ws_submit_info_init(
   struct pvr_render_ctx *ctx,
   struct pvr_render_job *job,
   const struct pvr_winsys_job_bo *bos,
   uint32_t bo_count,
   struct vk_sync **waits,
   uint32_t wait_count,
   uint32_t *stage_flags,
   struct pvr_winsys_render_submit_info *submit_info)
{
   memset(submit_info, 0, sizeof(*submit_info));

   submit_info->rt_dataset = job->rt_dataset->ws_rt_dataset;
   submit_info->rt_data_idx = job->rt_dataset->rt_data_idx;

   submit_info->frame_num = ctx->device->global_queue_present_count;
   submit_info->job_num = ctx->device->global_queue_job_count;

   submit_info->run_frag = job->run_frag;

   submit_info->bos = bos;
   submit_info->bo_count = bo_count;

   submit_info->waits = waits;
   submit_info->wait_count = wait_count;
   submit_info->stage_flags = stage_flags;

   /* FIXME: add WSI image bos. */

   pvr_render_job_ws_geometry_state_init(ctx, job, &submit_info->geometry);
   pvr_render_job_ws_fragment_state_init(ctx, job, &submit_info->fragment);

   /* These values are expected to match. */
   assert(submit_info->geometry.regs.tpu == submit_info->fragment.regs.tpu);
}

VkResult pvr_render_job_submit(struct pvr_render_ctx *ctx,
                               struct pvr_render_job *job,
                               const struct pvr_winsys_job_bo *bos,
                               uint32_t bo_count,
                               struct vk_sync **waits,
                               uint32_t wait_count,
                               uint32_t *stage_flags,
                               struct vk_sync *signal_sync_geom,
                               struct vk_sync *signal_sync_frag)
{
   struct pvr_rt_dataset *rt_dataset = job->rt_dataset;
   struct pvr_winsys_render_submit_info submit_info;
   struct pvr_device *device = ctx->device;
   VkResult result;

   pvr_render_job_ws_submit_info_init(ctx,
                                      job,
                                      bos,
                                      bo_count,
                                      waits,
                                      wait_count,
                                      stage_flags,
                                      &submit_info);

   result = device->ws->ops->render_submit(ctx->ws_ctx,
                                           &submit_info,
                                           signal_sync_geom,
                                           signal_sync_frag);
   if (result != VK_SUCCESS)
      return result;

   if (job->run_frag) {
      /* Move to the next render target data now that a fragment job has been
       * successfully submitted. This allows the next geometry job to be
       * submitted and run in parallel with it.
       */
      rt_dataset->rt_data_idx =
         (rt_dataset->rt_data_idx + 1) % ARRAY_SIZE(rt_dataset->rt_datas);

      rt_dataset->need_frag = false;
   } else {
      rt_dataset->need_frag = true;
   }

   return VK_SUCCESS;
}