/*
 * Copyright © 2022 Imagination Technologies Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include "hwdef/rogue_hw_defs.h"
#include "hwdef/rogue_hw_utils.h"
#include "pvr_bo.h"
#include "pvr_csb.h"
#include "pvr_csb_enum_helpers.h"
#include "pvr_debug.h"
#include "pvr_job_common.h"
#include "pvr_job_context.h"
#include "pvr_job_render.h"
#include "pvr_pds.h"
#include "pvr_private.h"
#include "pvr_rogue_fw.h"
#include "pvr_types.h"
#include "pvr_winsys.h"
#include "util/compiler.h"
#include "util/macros.h"
#include "util/u_math.h"
#include "vk_alloc.h"
#include "vk_log.h"
#include "vk_util.h"

#define ROGUE_BIF_PM_FREELIST_BASE_ADDR_ALIGNSIZE 16U

/* FIXME: Is there a hardware define we can use instead? */
/* 1 DWord per PM physical page stored in the free list */
#define ROGUE_FREE_LIST_ENTRY_SIZE ((uint32_t)sizeof(uint32_t))

/* FIXME: The three defines below, for the number of PC, PD and PT entries in a
 * 4KB page, come from rgxmmudefs_km.h (meaning they're part of the
 * auto-generated hwdefs). Should these be defined in rogue_mmu.xml? Keeping in
 * mind that we probably only need these three values.
 */
#define ROGUE_NUM_PC_ENTRIES_PER_PAGE 0x400U
#define ROGUE_NUM_PD_ENTRIES_PER_PAGE 0x200U
#define ROGUE_NUM_PT_ENTRIES_PER_PAGE 0x200U

struct pvr_free_list {
   struct pvr_device *device;

   uint64_t size;

   struct pvr_bo *bo;

   struct pvr_winsys_free_list *ws_free_list;
};

/* Macrotile information.
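 *
 * The fields below are derived in pvr_rt_mtile_info_init() from the render
 * size, sample count and per-core tile dimensions. They are consumed later in
 * this file when sizing the region header and tail pointer allocations and
 * when packing the ISP/TE macrotile registers.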
*/ struct pvr_rt_mtile_info { uint32_t tile_size_x; uint32_t tile_size_y; uint32_t num_tiles_x; uint32_t num_tiles_y; uint32_t tiles_per_mtile_x; uint32_t tiles_per_mtile_y; uint32_t x_tile_max; uint32_t y_tile_max; uint32_t mtiles_x; uint32_t mtiles_y; uint32_t mtile_x1; uint32_t mtile_y1; uint32_t mtile_x2; uint32_t mtile_y2; uint32_t mtile_x3; uint32_t mtile_y3; uint32_t mtile_stride; }; struct pvr_rt_dataset { struct pvr_device *device; /* RT dataset information */ uint32_t width; uint32_t height; uint32_t samples; uint32_t layers; struct pvr_free_list *global_free_list; struct pvr_free_list *local_free_list; struct pvr_bo *vheap_rtc_bo; pvr_dev_addr_t vheap_dev_addr; pvr_dev_addr_t rtc_dev_addr; struct pvr_bo *tpc_bo; uint64_t tpc_stride; uint64_t tpc_size; struct pvr_winsys_rt_dataset *ws_rt_dataset; /* RT data information */ struct pvr_bo *mta_mlist_bo; struct pvr_bo *rgn_headers_bo; uint64_t rgn_headers_stride; bool need_frag; uint8_t rt_data_idx; struct { pvr_dev_addr_t mta_dev_addr; pvr_dev_addr_t mlist_dev_addr; pvr_dev_addr_t rgn_headers_dev_addr; } rt_datas[ROGUE_NUM_RTDATAS]; }; VkResult pvr_free_list_create(struct pvr_device *device, uint32_t initial_size, uint32_t max_size, uint32_t grow_size, uint32_t grow_threshold, struct pvr_free_list *parent_free_list, struct pvr_free_list **const free_list_out) { struct pvr_winsys_free_list *parent_ws_free_list = parent_free_list ? parent_free_list->ws_free_list : NULL; const uint64_t bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED | PVR_BO_ALLOC_FLAG_PM_FW_PROTECT; struct pvr_free_list *free_list; uint32_t cache_line_size; uint32_t initial_num_pages; uint32_t grow_num_pages; uint32_t max_num_pages; uint64_t addr_alignment; uint64_t size_alignment; uint64_t size; VkResult result; assert((initial_size + grow_size) <= max_size); assert(max_size != 0); assert(grow_threshold <= 100); /* Make sure the free list is created with at least a single page. */ if (initial_size == 0) initial_size = ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE; /* The freelists sizes must respect the PM freelist base address alignment * requirement. As the freelist entries are cached by the SLC, it's also * necessary to ensure the sizes respect the SLC cache line size to avoid * invalid entries appearing in the cache, which would be problematic after * a grow operation, as the SLC entries aren't invalidated. We do this by * making sure the freelist values are appropriately aligned. * * To calculate the alignment, we first take the largest of the freelist * base address alignment and the SLC cache line size. We then divide this * by the freelist entry size to determine the number of freelist entries * required by the PM. Finally, as each entry holds a single PM physical * page, we multiple the number of entries by the page size. * * As an example, if the base address alignment is 16 bytes, the SLC cache * line size is 64 bytes and the freelist entry size is 4 bytes then 16 * entries are required, as we take the SLC cacheline size (being the larger * of the two values) and divide this by 4. If the PM page size is 4096 * bytes then we end up with an alignment of 65536 bytes. 
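    *
    * As a quick sanity check against the code below, using the example values
    * above:
    *
    *   addr_alignment = MAX2(16, 64)                                = 64 bytes
    *   entries        = addr_alignment / ROGUE_FREE_LIST_ENTRY_SIZE = 64 / 4
    *                                                                = 16
    *   size_alignment = entries * ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE   = 16 * 4096
    *                                                                = 65536 bytes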
*/ cache_line_size = rogue_get_slc_cache_line_size(&device->pdevice->dev_info); addr_alignment = MAX2(ROGUE_BIF_PM_FREELIST_BASE_ADDR_ALIGNSIZE, cache_line_size); size_alignment = (addr_alignment / ROGUE_FREE_LIST_ENTRY_SIZE) * ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE; assert(util_is_power_of_two_nonzero(size_alignment)); initial_size = align64(initial_size, size_alignment); max_size = align64(max_size, size_alignment); grow_size = align64(grow_size, size_alignment); /* Make sure the 'max' size doesn't exceed what the firmware supports and * adjust the other sizes accordingly. */ if (max_size > ROGUE_FREE_LIST_MAX_SIZE) { max_size = ROGUE_FREE_LIST_MAX_SIZE; assert(align64(max_size, size_alignment) == max_size); } if (initial_size > max_size) initial_size = max_size; if (initial_size == max_size) grow_size = 0; initial_num_pages = initial_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT; max_num_pages = max_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT; grow_num_pages = grow_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT; /* Calculate the size of the buffer needed to store the free list entries * based on the maximum number of pages we can have. */ size = max_num_pages * ROGUE_FREE_LIST_ENTRY_SIZE; assert(align64(size, addr_alignment) == size); free_list = vk_alloc(&device->vk.alloc, sizeof(*free_list), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); if (!free_list) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); /* FIXME: The memory is mapped GPU uncached, but this seems to contradict * the comment above about aligning to the SLC cache line size. */ result = pvr_bo_alloc(device, device->heaps.general_heap, size, addr_alignment, bo_flags, &free_list->bo); if (result != VK_SUCCESS) goto err_vk_free_free_list; result = device->ws->ops->free_list_create(device->ws, free_list->bo->vma, initial_num_pages, max_num_pages, grow_num_pages, grow_threshold, parent_ws_free_list, &free_list->ws_free_list); if (result != VK_SUCCESS) goto err_pvr_bo_free_bo; free_list->device = device; free_list->size = size; *free_list_out = free_list; return VK_SUCCESS; err_pvr_bo_free_bo: pvr_bo_free(device, free_list->bo); err_vk_free_free_list: vk_free(&device->vk.alloc, free_list); return result; } void pvr_free_list_destroy(struct pvr_free_list *free_list) { struct pvr_device *device = free_list->device; device->ws->ops->free_list_destroy(free_list->ws_free_list); pvr_bo_free(device, free_list->bo); vk_free(&device->vk.alloc, free_list); } static inline void pvr_get_samples_in_xy(uint32_t samples, uint32_t *const x_out, uint32_t *const y_out) { switch (samples) { case 1: *x_out = 1; *y_out = 1; break; case 2: *x_out = 1; *y_out = 2; break; case 4: *x_out = 2; *y_out = 2; break; case 8: *x_out = 2; *y_out = 4; break; default: unreachable("Unsupported number of samples"); } } static void pvr_rt_mtile_info_init(struct pvr_device *device, struct pvr_rt_mtile_info *info, uint32_t width, uint32_t height, uint32_t samples) { const struct pvr_device_info *dev_info = &device->pdevice->dev_info; uint32_t samples_in_x; uint32_t samples_in_y; pvr_get_samples_in_xy(samples, &samples_in_x, &samples_in_y); info->tile_size_x = PVR_GET_FEATURE_VALUE(dev_info, tile_size_x, 1); info->tile_size_y = PVR_GET_FEATURE_VALUE(dev_info, tile_size_y, 1); info->num_tiles_x = DIV_ROUND_UP(width, info->tile_size_x); info->num_tiles_y = DIV_ROUND_UP(height, info->tile_size_y); rogue_get_num_macrotiles_xy(dev_info, &info->mtiles_x, &info->mtiles_y); if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) { assert(PVR_GET_FEATURE_VALUE(dev_info, 
simple_parameter_format_version, 0) == 2); /* Set up 16 macrotiles with a multiple of 2x2 tiles per macrotile, * which is aligned to a tile group. */ info->mtile_x1 = DIV_ROUND_UP(info->num_tiles_x, 8) * 2; info->mtile_y1 = DIV_ROUND_UP(info->num_tiles_y, 8) * 2; info->mtile_x2 = 0; info->mtile_y2 = 0; info->mtile_x3 = 0; info->mtile_y3 = 0; info->x_tile_max = ALIGN_POT(info->num_tiles_x, 2) - 1; info->y_tile_max = ALIGN_POT(info->num_tiles_y, 2) - 1; } else { /* Set up 16 macrotiles with a multiple of 4x4 tiles per macrotile. */ info->mtile_x1 = ALIGN_POT(DIV_ROUND_UP(info->num_tiles_x, 4), 4); info->mtile_y1 = ALIGN_POT(DIV_ROUND_UP(info->num_tiles_y, 4), 4); info->mtile_x2 = info->mtile_x1 * 2; info->mtile_y2 = info->mtile_y1 * 2; info->mtile_x3 = info->mtile_x1 * 3; info->mtile_y3 = info->mtile_y1 * 3; info->x_tile_max = info->num_tiles_x - 1; info->y_tile_max = info->num_tiles_y - 1; } info->tiles_per_mtile_x = info->mtile_x1 * samples_in_x; info->tiles_per_mtile_y = info->mtile_y1 * samples_in_y; info->mtile_stride = info->mtile_x1 * info->mtile_y1; } /* Note that the unit of the return value depends on the GPU. For cores with the * simple_internal_parameter_format feature the returned size is interpreted as * the number of region headers. For cores without this feature its interpreted * as the size in dwords. */ static uint64_t pvr_rt_get_isp_region_size(struct pvr_device *device, const struct pvr_rt_mtile_info *mtile_info) { const struct pvr_device_info *dev_info = &device->pdevice->dev_info; uint64_t rgn_size = mtile_info->tiles_per_mtile_x * mtile_info->tiles_per_mtile_y; if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) { uint32_t version; rgn_size *= mtile_info->mtiles_x * mtile_info->mtiles_y; if (PVR_FEATURE_VALUE(dev_info, simple_parameter_format_version, &version)) { version = 0; } if (version == 2) { /* One region header per 2x2 tile group. */ rgn_size /= (2U * 2U); } } else { const uint64_t rgn_header_size = rogue_get_region_header_size(dev_info); /* Round up to next dword to prevent IPF overrun and convert to bytes. 
*/ rgn_size = DIV_ROUND_UP(rgn_size * rgn_header_size, 4); } return rgn_size; } static VkResult pvr_rt_vheap_rtc_data_init(struct pvr_device *device, struct pvr_rt_dataset *rt_dataset, uint32_t layers) { const uint64_t bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED | PVR_BO_ALLOC_FLAG_ZERO_ON_ALLOC; uint64_t vheap_size; uint32_t alignment; uint64_t rtc_size; VkResult result; vheap_size = ROGUE_CR_PM_VHEAP_TABLE_SIZE * ROGUE_PM_VHEAP_ENTRY_SIZE; if (layers > 1) { uint64_t rtc_entries; vheap_size = ALIGN_POT(vheap_size, PVRX(CR_TA_RTC_ADDR_BASE_ALIGNMENT)); rtc_entries = ROGUE_NUM_TEAC + ROGUE_NUM_TE + ROGUE_NUM_VCE; if (PVR_HAS_QUIRK(&device->pdevice->dev_info, 48545)) rtc_entries += ROGUE_NUM_TE; rtc_size = rtc_entries * ROGUE_RTC_SIZE_IN_BYTES; } else { rtc_size = 0; } alignment = MAX2(PVRX(CR_PM_VHEAP_TABLE_BASE_ADDR_ALIGNMENT), PVRX(CR_TA_RTC_ADDR_BASE_ALIGNMENT)); result = pvr_bo_alloc(device, device->heaps.general_heap, vheap_size + rtc_size, alignment, bo_flags, &rt_dataset->vheap_rtc_bo); if (result != VK_SUCCESS) return result; rt_dataset->vheap_dev_addr = rt_dataset->vheap_rtc_bo->vma->dev_addr; if (rtc_size > 0) { rt_dataset->rtc_dev_addr = PVR_DEV_ADDR_OFFSET(rt_dataset->vheap_dev_addr, vheap_size); } else { rt_dataset->rtc_dev_addr = PVR_DEV_ADDR_INVALID; } return VK_SUCCESS; } static void pvr_rt_vheap_rtc_data_fini(struct pvr_rt_dataset *rt_dataset) { rt_dataset->rtc_dev_addr = PVR_DEV_ADDR_INVALID; pvr_bo_free(rt_dataset->device, rt_dataset->vheap_rtc_bo); rt_dataset->vheap_rtc_bo = NULL; } static void pvr_rt_get_tail_ptr_stride_size(const struct pvr_device *device, const struct pvr_rt_mtile_info *mtile_info, uint32_t layers, uint64_t *const stride_out, uint64_t *const size_out) { uint32_t max_num_mtiles; uint32_t num_mtiles_x; uint32_t num_mtiles_y; uint32_t version; uint64_t size; num_mtiles_x = mtile_info->mtiles_x * mtile_info->tiles_per_mtile_x; num_mtiles_y = mtile_info->mtiles_y * mtile_info->tiles_per_mtile_y; max_num_mtiles = MAX2(util_next_power_of_two64(num_mtiles_x), util_next_power_of_two64(num_mtiles_y)); size = max_num_mtiles * max_num_mtiles; if (PVR_FEATURE_VALUE(&device->pdevice->dev_info, simple_parameter_format_version, &version)) { version = 0; } if (version == 2) { /* One tail pointer cache entry per 2x2 tile group. 
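       * This mirrors the region header grouping applied for the same parameter
       * format version in pvr_rt_get_isp_region_size().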
*/ size /= (2U * 2U); } size *= ROGUE_TAIL_POINTER_SIZE; if (layers > 1) { size = ALIGN_POT(size, ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE); *stride_out = size / ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE; *size_out = size * layers; } else { *stride_out = 0; *size_out = size; } } static VkResult pvr_rt_tpc_data_init(struct pvr_device *device, struct pvr_rt_dataset *rt_dataset, const struct pvr_rt_mtile_info *mtile_info, uint32_t layers) { const uint64_t bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED | PVR_BO_ALLOC_FLAG_ZERO_ON_ALLOC; uint64_t tpc_size; pvr_rt_get_tail_ptr_stride_size(device, mtile_info, layers, &rt_dataset->tpc_stride, &rt_dataset->tpc_size); tpc_size = ALIGN_POT(rt_dataset->tpc_size, ROGUE_TE_TPC_CACHE_LINE_SIZE); return pvr_bo_alloc(device, device->heaps.general_heap, tpc_size, PVRX(CR_TE_TPC_ADDR_BASE_ALIGNMENT), bo_flags, &rt_dataset->tpc_bo); } static void pvr_rt_tpc_data_fini(struct pvr_rt_dataset *rt_dataset) { pvr_bo_free(rt_dataset->device, rt_dataset->tpc_bo); rt_dataset->tpc_bo = NULL; } static uint32_t pvr_rt_get_mlist_size(const struct pvr_free_list *global_free_list, const struct pvr_free_list *local_free_list) { uint32_t num_pte_pages; uint32_t num_pde_pages; uint32_t num_pce_pages; uint64_t total_pages; uint32_t mlist_size; assert(global_free_list->size + local_free_list->size <= ROGUE_PM_MAX_PB_VIRT_ADDR_SPACE); total_pages = (global_free_list->size + local_free_list->size) >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT; /* Calculate the total number of physical pages required to hold the page * table, directory and catalog entries for the freelist pages. */ num_pte_pages = DIV_ROUND_UP(total_pages, ROGUE_NUM_PT_ENTRIES_PER_PAGE); num_pde_pages = DIV_ROUND_UP(num_pte_pages, ROGUE_NUM_PD_ENTRIES_PER_PAGE); num_pce_pages = DIV_ROUND_UP(num_pde_pages, ROGUE_NUM_PC_ENTRIES_PER_PAGE); /* Calculate the MList size considering the total number of pages in the PB * are shared among all the PM address spaces. */ mlist_size = (num_pce_pages + num_pde_pages + num_pte_pages) * ROGUE_NUM_PM_ADDRESS_SPACES * ROGUE_MLIST_ENTRY_STRIDE; return ALIGN_POT(mlist_size, ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE); } static void pvr_rt_get_region_headers_stride_size( const struct pvr_device *device, const struct pvr_rt_mtile_info *mtile_info, uint32_t layers, uint64_t *const stride_out, uint64_t *const size_out) { const struct pvr_device_info *dev_info = &device->pdevice->dev_info; const uint32_t rgn_header_size = rogue_get_region_header_size(dev_info); uint32_t rgn_headers_size; uint32_t num_tiles_x; uint32_t num_tiles_y; uint32_t group_size; uint32_t version; if (PVR_FEATURE_VALUE(dev_info, simple_parameter_format_version, &version)) version = 0; group_size = version == 2 ? 
2 : 1; num_tiles_x = mtile_info->mtiles_x * mtile_info->tiles_per_mtile_x; num_tiles_y = mtile_info->mtiles_y * mtile_info->tiles_per_mtile_y; rgn_headers_size = (num_tiles_x / group_size) * (num_tiles_y / group_size) * rgn_header_size; if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) { rgn_headers_size = ALIGN_POT(rgn_headers_size, PVRX(CR_TE_PSGREGION_ADDR_BASE_ALIGNMENT)); } if (layers > 1) { rgn_headers_size = ALIGN_POT(rgn_headers_size, PVRX(CR_TE_PSG_REGION_STRIDE_UNIT_SIZE)); } *stride_out = rgn_header_size; *size_out = rgn_headers_size * layers; } static VkResult pvr_rt_mta_mlist_data_init(struct pvr_device *device, struct pvr_rt_dataset *rt_dataset, const struct pvr_free_list *global_free_list, const struct pvr_free_list *local_free_list, const struct pvr_rt_mtile_info *mtile_info) { const struct pvr_device_info *dev_info = &device->pdevice->dev_info; const uint32_t mlist_size = pvr_rt_get_mlist_size(global_free_list, local_free_list); uint32_t mta_size = rogue_get_macrotile_array_size(dev_info); const uint32_t num_rt_datas = ARRAY_SIZE(rt_dataset->rt_datas); uint32_t rt_datas_mlist_size; uint32_t rt_datas_mta_size; pvr_dev_addr_t dev_addr; VkResult result; /* Allocate memory for macrotile array and Mlist for all RT datas. * * Allocation layout: MTA[0..N] + Mlist alignment padding + Mlist[0..N]. * * N is number of RT datas. */ rt_datas_mta_size = ALIGN_POT(mta_size * num_rt_datas, PVRX(CR_PM_MLIST0_BASE_ADDR_ALIGNMENT)); rt_datas_mlist_size = mlist_size * num_rt_datas; result = pvr_bo_alloc(device, device->heaps.general_heap, rt_datas_mta_size + rt_datas_mlist_size, PVRX(CR_PM_MTILE_ARRAY_BASE_ADDR_ALIGNMENT), PVR_BO_ALLOC_FLAG_GPU_UNCACHED, &rt_dataset->mta_mlist_bo); if (result != VK_SUCCESS) return result; dev_addr = rt_dataset->mta_mlist_bo->vma->dev_addr; for (uint32_t i = 0; i < num_rt_datas; i++) { if (mta_size != 0) { rt_dataset->rt_datas[i].mta_dev_addr = dev_addr; dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, mta_size); } else { rt_dataset->rt_datas[i].mta_dev_addr = PVR_DEV_ADDR_INVALID; } } dev_addr = PVR_DEV_ADDR_OFFSET(rt_dataset->mta_mlist_bo->vma->dev_addr, rt_datas_mta_size); for (uint32_t i = 0; i < num_rt_datas; i++) { if (mlist_size != 0) { rt_dataset->rt_datas[i].mlist_dev_addr = dev_addr; dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, mlist_size); } else { rt_dataset->rt_datas[i].mlist_dev_addr = PVR_DEV_ADDR_INVALID; } } return VK_SUCCESS; } static void pvr_rt_mta_mlist_data_fini(struct pvr_rt_dataset *rt_dataset) { for (uint32_t i = 0; i < ARRAY_SIZE(rt_dataset->rt_datas); i++) { rt_dataset->rt_datas[i].mlist_dev_addr = PVR_DEV_ADDR_INVALID; rt_dataset->rt_datas[i].mta_dev_addr = PVR_DEV_ADDR_INVALID; } pvr_bo_free(rt_dataset->device, rt_dataset->mta_mlist_bo); rt_dataset->mta_mlist_bo = NULL; } static VkResult pvr_rt_rgn_headers_data_init(struct pvr_device *device, struct pvr_rt_dataset *rt_dataset, const struct pvr_rt_mtile_info *mtile_info, uint32_t layers) { const uint32_t num_rt_datas = ARRAY_SIZE(rt_dataset->rt_datas); uint64_t rgn_headers_size; pvr_dev_addr_t dev_addr; VkResult result; pvr_rt_get_region_headers_stride_size(device, mtile_info, layers, &rt_dataset->rgn_headers_stride, &rgn_headers_size); result = pvr_bo_alloc(device, device->heaps.rgn_hdr_heap, rgn_headers_size * num_rt_datas, PVRX(CR_TE_PSGREGION_ADDR_BASE_ALIGNMENT), PVR_BO_ALLOC_FLAG_GPU_UNCACHED, &rt_dataset->rgn_headers_bo); if (result != VK_SUCCESS) return result; dev_addr = rt_dataset->rgn_headers_bo->vma->dev_addr; for (uint32_t i = 0; i < num_rt_datas; i++) { 
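      /* Hand each RT data its own slice of the region header buffer; the
       * per-RT-data slices are rgn_headers_size bytes apart.
       */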
      rt_dataset->rt_datas[i].rgn_headers_dev_addr = dev_addr;

      dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, rgn_headers_size);
   }

   return VK_SUCCESS;
}

static void pvr_rt_rgn_headers_data_fini(struct pvr_rt_dataset *rt_dataset)
{
   for (uint32_t i = 0; i < ARRAY_SIZE(rt_dataset->rt_datas); i++)
      rt_dataset->rt_datas[i].rgn_headers_dev_addr = PVR_DEV_ADDR_INVALID;

   pvr_bo_free(rt_dataset->device, rt_dataset->rgn_headers_bo);
   rt_dataset->rgn_headers_bo = NULL;
}

static VkResult pvr_rt_datas_init(struct pvr_device *device,
                                  struct pvr_rt_dataset *rt_dataset,
                                  const struct pvr_free_list *global_free_list,
                                  const struct pvr_free_list *local_free_list,
                                  const struct pvr_rt_mtile_info *mtile_info,
                                  uint32_t layers)
{
   VkResult result;

   result = pvr_rt_mta_mlist_data_init(device,
                                       rt_dataset,
                                       global_free_list,
                                       local_free_list,
                                       mtile_info);
   if (result != VK_SUCCESS)
      return result;

   result =
      pvr_rt_rgn_headers_data_init(device, rt_dataset, mtile_info, layers);
   if (result != VK_SUCCESS)
      goto err_pvr_rt_mta_mlist_data_fini;

   return VK_SUCCESS;

err_pvr_rt_mta_mlist_data_fini:
   pvr_rt_mta_mlist_data_fini(rt_dataset);

   return result;
}

static void pvr_rt_datas_fini(struct pvr_rt_dataset *rt_dataset)
{
   pvr_rt_rgn_headers_data_fini(rt_dataset);
   pvr_rt_mta_mlist_data_fini(rt_dataset);
}

static uint32_t
pvr_rogue_get_cr_isp_mtile_size_val(const struct pvr_device_info *dev_info,
                                    uint32_t samples,
                                    const struct pvr_rt_mtile_info *mtile_info)
{
   uint32_t samples_per_pixel =
      PVR_GET_FEATURE_VALUE(dev_info, isp_samples_per_pixel, 0);
   uint32_t isp_mtile_size;

   pvr_csb_pack (&isp_mtile_size, CR_ISP_MTILE_SIZE, value) {
      value.x = mtile_info->mtile_x1;
      value.y = mtile_info->mtile_y1;

      if (samples_per_pixel == 1) {
         if (samples >= 4)
            value.x <<= 1;

         if (samples >= 2)
            value.y <<= 1;
      } else if (samples_per_pixel == 2) {
         if (samples >= 8)
            value.x <<= 1;

         if (samples >= 4)
            value.y <<= 1;
      } else if (samples_per_pixel == 4) {
         if (samples >= 8)
            value.y <<= 1;
      } else {
         assert(!"Unsupported ISP samples per pixel value");
      }
   }

   return isp_mtile_size;
}

static uint64_t pvr_rogue_get_cr_multisamplectl_val(uint32_t samples,
                                                    bool y_flip)
{
   static const struct {
      uint8_t x[8];
      uint8_t y[8];
   } sample_positions[4] = {
      /* 1 sample */
      {
         .x = { 8 },
         .y = { 8 },
      },
      /* 2 samples */
      {
         .x = { 12, 4 },
         .y = { 12, 4 },
      },
      /* 4 samples */
      {
         .x = { 6, 14, 2, 10 },
         .y = { 2, 6, 10, 14 },
      },
      /* 8 samples */
      {
         .x = { 9, 7, 13, 5, 3, 1, 11, 15 },
         .y = { 5, 11, 9, 3, 13, 7, 15, 1 },
      },
   };
   uint64_t multisamplectl;
   uint8_t idx;

   idx = util_fast_log2(samples);
   assert(idx < ARRAY_SIZE(sample_positions));

   pvr_csb_pack (&multisamplectl, CR_PPP_MULTISAMPLECTL, value) {
      switch (samples) {
      case 8:
         value.msaa_x7 = sample_positions[idx].x[7];
         value.msaa_x6 = sample_positions[idx].x[6];
         value.msaa_x5 = sample_positions[idx].x[5];
         value.msaa_x4 = sample_positions[idx].x[4];

         if (y_flip) {
            value.msaa_y7 = 16U - sample_positions[idx].y[7];
            value.msaa_y6 = 16U - sample_positions[idx].y[6];
            value.msaa_y5 = 16U - sample_positions[idx].y[5];
            value.msaa_y4 = 16U - sample_positions[idx].y[4];
         } else {
            value.msaa_y7 = sample_positions[idx].y[7];
            value.msaa_y6 = sample_positions[idx].y[6];
            value.msaa_y5 = sample_positions[idx].y[5];
            value.msaa_y4 = sample_positions[idx].y[4];
         }

         FALLTHROUGH;
      case 4:
         value.msaa_x3 = sample_positions[idx].x[3];
         value.msaa_x2 = sample_positions[idx].x[2];

         if (y_flip) {
            value.msaa_y3 = 16U - sample_positions[idx].y[3];
            value.msaa_y2 = 16U - sample_positions[idx].y[2];
         } else {
            value.msaa_y3 = sample_positions[idx].y[3];
            value.msaa_y2 = sample_positions[idx].y[2];
         }

         FALLTHROUGH;
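      /* Each case deliberately falls through so the lower-numbered sample
       * positions are programmed as well. Positions sit on a 16-unit
       * sub-pixel grid (values 0-15), hence the "16U - y" used for the
       * Y-flipped variant.
       */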
case 2: value.msaa_x1 = sample_positions[idx].x[1]; if (y_flip) { value.msaa_y1 = 16U - sample_positions[idx].y[1]; } else { value.msaa_y1 = sample_positions[idx].y[1]; } FALLTHROUGH; case 1: value.msaa_x0 = sample_positions[idx].x[0]; if (y_flip) { value.msaa_y0 = 16U - sample_positions[idx].y[0]; } else { value.msaa_y0 = sample_positions[idx].y[0]; } break; default: unreachable("Unsupported number of samples"); } } return multisamplectl; } static uint32_t pvr_rogue_get_cr_te_aa_val(const struct pvr_device_info *dev_info, uint32_t samples) { uint32_t samples_per_pixel = PVR_GET_FEATURE_VALUE(dev_info, isp_samples_per_pixel, 0); uint32_t te_aa; pvr_csb_pack (&te_aa, CR_TE_AA, value) { if (samples_per_pixel == 1) { if (samples >= 2) value.y = true; if (samples >= 4) value.x = true; } else if (samples_per_pixel == 2) { if (samples >= 2) value.x2 = true; if (samples >= 4) value.y = true; if (samples >= 8) value.x = true; } else if (samples_per_pixel == 4) { if (samples >= 2) value.x2 = true; if (samples >= 4) value.y2 = true; if (samples >= 8) value.y = true; } else { assert(!"Unsupported ISP samples per pixel value"); } } return te_aa; } static void pvr_rt_dataset_ws_create_info_init( struct pvr_rt_dataset *rt_dataset, const struct pvr_rt_mtile_info *mtile_info, struct pvr_winsys_rt_dataset_create_info *create_info) { struct pvr_device *device = rt_dataset->device; const struct pvr_device_info *dev_info = &device->pdevice->dev_info; memset(create_info, 0, sizeof(*create_info)); /* Local freelist. */ create_info->local_free_list = rt_dataset->local_free_list->ws_free_list; /* ISP register values. */ if (PVR_HAS_ERN(dev_info, 42307) && !(PVR_HAS_FEATURE(dev_info, roguexe) && mtile_info->tile_size_x == 16)) { float value; if (rt_dataset->width != 0) { value = ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR / (float)rt_dataset->width; create_info->isp_merge_lower_x = fui(value); value = ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR / (float)rt_dataset->width; create_info->isp_merge_upper_x = fui(value); } if (rt_dataset->height != 0) { value = ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR / (float)rt_dataset->height; create_info->isp_merge_lower_y = fui(value); value = ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR / (float)rt_dataset->height; create_info->isp_merge_upper_y = fui(value); } value = ((float)rt_dataset->width * ROGUE_ISP_MERGE_SCALE_FACTOR) / (ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR - ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR); create_info->isp_merge_scale_x = fui(value); value = ((float)rt_dataset->height * ROGUE_ISP_MERGE_SCALE_FACTOR) / (ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR - ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR); create_info->isp_merge_scale_y = fui(value); } create_info->isp_mtile_size = pvr_rogue_get_cr_isp_mtile_size_val(dev_info, rt_dataset->samples, mtile_info); /* PPP register values. */ create_info->ppp_multi_sample_ctl = pvr_rogue_get_cr_multisamplectl_val(rt_dataset->samples, false); create_info->ppp_multi_sample_ctl_y_flipped = pvr_rogue_get_cr_multisamplectl_val(rt_dataset->samples, true); pvr_csb_pack (&create_info->ppp_screen, CR_PPP_SCREEN, value) { value.pixxmax = rt_dataset->width - 1; value.pixymax = rt_dataset->height - 1; } /* TE register values. 
*/ create_info->te_aa = pvr_rogue_get_cr_te_aa_val(dev_info, rt_dataset->samples); pvr_csb_pack (&create_info->te_mtile1, CR_TE_MTILE1, value) { value.x1 = mtile_info->mtile_x1; if (!PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) { value.x2 = mtile_info->mtile_x2; value.x3 = mtile_info->mtile_x3; } } pvr_csb_pack (&create_info->te_mtile2, CR_TE_MTILE2, value) { value.y1 = mtile_info->mtile_y1; if (!PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) { value.y2 = mtile_info->mtile_y2; value.y3 = mtile_info->mtile_y3; } } pvr_csb_pack (&create_info->te_screen, CR_TE_SCREEN, value) { value.xmax = mtile_info->x_tile_max; value.ymax = mtile_info->y_tile_max; } /* Allocations and associated information. */ create_info->vheap_table_dev_addr = rt_dataset->vheap_dev_addr; create_info->rtc_dev_addr = rt_dataset->rtc_dev_addr; create_info->tpc_dev_addr = rt_dataset->tpc_bo->vma->dev_addr; create_info->tpc_stride = rt_dataset->tpc_stride; create_info->tpc_size = rt_dataset->tpc_size; STATIC_ASSERT(ARRAY_SIZE(create_info->rt_datas) == ARRAY_SIZE(rt_dataset->rt_datas)); for (uint32_t i = 0; i < ARRAY_SIZE(create_info->rt_datas); i++) { create_info->rt_datas[i].pm_mlist_dev_addr = rt_dataset->rt_datas[i].mlist_dev_addr; create_info->rt_datas[i].macrotile_array_dev_addr = rt_dataset->rt_datas[i].mta_dev_addr; create_info->rt_datas[i].rgn_header_dev_addr = rt_dataset->rt_datas[i].rgn_headers_dev_addr; } create_info->rgn_header_size = pvr_rt_get_isp_region_size(device, mtile_info); /* Miscellaneous. */ create_info->mtile_stride = mtile_info->mtile_stride; create_info->max_rts = rt_dataset->layers; } VkResult pvr_render_target_dataset_create(struct pvr_device *device, uint32_t width, uint32_t height, uint32_t samples, uint32_t layers, struct pvr_rt_dataset **const rt_dataset_out) { const struct pvr_device_info *dev_info = &device->pdevice->dev_info; struct pvr_winsys_rt_dataset_create_info rt_dataset_create_info; struct pvr_rt_mtile_info mtile_info; struct pvr_rt_dataset *rt_dataset; VkResult result; assert(device->global_free_list); assert(width <= rogue_get_render_size_max_x(dev_info)); assert(height <= rogue_get_render_size_max_y(dev_info)); assert(layers > 0 && layers <= PVR_MAX_FRAMEBUFFER_LAYERS); pvr_rt_mtile_info_init(device, &mtile_info, width, height, samples); rt_dataset = vk_zalloc(&device->vk.alloc, sizeof(*rt_dataset), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); if (!rt_dataset) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); rt_dataset->device = device; rt_dataset->width = width; rt_dataset->height = height; rt_dataset->samples = samples; rt_dataset->layers = layers; rt_dataset->global_free_list = device->global_free_list; /* The maximum supported free list size is based on the assumption that this * freelist (the "local" freelist) is always the minimum size required by * the hardware. See the documentation of ROGUE_FREE_LIST_MAX_SIZE for more * details. 
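    *
    * Because the initial and maximum sizes passed below are both the hardware
    * minimum, the grow size and grow threshold are left at zero: this local
    * freelist is never grown.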
*/ result = pvr_free_list_create(device, rogue_get_min_free_list_size(dev_info), rogue_get_min_free_list_size(dev_info), 0 /* grow_size */, 0 /* grow_threshold */, rt_dataset->global_free_list, &rt_dataset->local_free_list); if (result != VK_SUCCESS) goto err_vk_free_rt_dataset; result = pvr_rt_vheap_rtc_data_init(device, rt_dataset, layers); if (result != VK_SUCCESS) goto err_pvr_free_list_destroy; result = pvr_rt_tpc_data_init(device, rt_dataset, &mtile_info, layers); if (result != VK_SUCCESS) goto err_pvr_rt_vheap_rtc_data_fini; result = pvr_rt_datas_init(device, rt_dataset, rt_dataset->global_free_list, rt_dataset->local_free_list, &mtile_info, layers); if (result != VK_SUCCESS) goto err_pvr_rt_tpc_data_fini; /* rt_dataset must be fully initialized by this point since * pvr_rt_dataset_ws_create_info_init() depends on this. */ pvr_rt_dataset_ws_create_info_init(rt_dataset, &mtile_info, &rt_dataset_create_info); result = device->ws->ops->render_target_dataset_create(device->ws, &rt_dataset_create_info, &rt_dataset->ws_rt_dataset); if (result != VK_SUCCESS) goto err_pvr_rt_datas_fini; *rt_dataset_out = rt_dataset; return VK_SUCCESS; err_pvr_rt_datas_fini: pvr_rt_datas_fini(rt_dataset); err_pvr_rt_tpc_data_fini: pvr_rt_tpc_data_fini(rt_dataset); err_pvr_rt_vheap_rtc_data_fini: pvr_rt_vheap_rtc_data_fini(rt_dataset); err_pvr_free_list_destroy: pvr_free_list_destroy(rt_dataset->local_free_list); err_vk_free_rt_dataset: vk_free(&device->vk.alloc, rt_dataset); return result; } void pvr_render_target_dataset_destroy(struct pvr_rt_dataset *rt_dataset) { struct pvr_device *device = rt_dataset->device; device->ws->ops->render_target_dataset_destroy(rt_dataset->ws_rt_dataset); pvr_rt_datas_fini(rt_dataset); pvr_rt_tpc_data_fini(rt_dataset); pvr_rt_vheap_rtc_data_fini(rt_dataset); pvr_free_list_destroy(rt_dataset->local_free_list); vk_free(&device->vk.alloc, rt_dataset); } static void pvr_render_job_ws_geometry_state_init(struct pvr_render_ctx *ctx, struct pvr_render_job *job, struct pvr_winsys_geometry_state *state) { const struct pvr_device_info *dev_info = &ctx->device->pdevice->dev_info; /* FIXME: Should this just be done unconditionally? The firmware will just * ignore the value anyway. */ if (PVR_HAS_QUIRK(dev_info, 56279)) { pvr_csb_pack (&state->regs.pds_ctrl, CR_PDS_CTRL, value) { value.max_num_vdm_tasks = rogue_get_max_num_vdm_pds_tasks(dev_info); } } else { state->regs.pds_ctrl = 0; } pvr_csb_pack (&state->regs.ppp_ctrl, CR_PPP_CTRL, value) { value.wclampen = true; value.fixed_point_format = 1; } pvr_csb_pack (&state->regs.te_psg, CR_TE_PSG, value) { value.completeonterminate = job->geometry_terminate; value.region_stride = job->rt_dataset->rgn_headers_stride / PVRX(CR_TE_PSG_REGION_STRIDE_UNIT_SIZE); value.forcenewstate = PVR_HAS_QUIRK(dev_info, 52942); } /* The set up of CR_TPU must be identical to * pvr_render_job_ws_fragment_state_init(). */ pvr_csb_pack (&state->regs.tpu, CR_TPU, value) { value.tag_cem_4k_face_packing = true; } pvr_csb_pack (&state->regs.tpu_border_colour_table, CR_TPU_BORDER_COLOUR_TABLE_VDM, value) { value.border_colour_table_address = job->border_colour_table_addr; } pvr_csb_pack (&state->regs.vdm_ctrl_stream_base, CR_VDM_CTRL_STREAM_BASE, value) { value.addr = job->ctrl_stream_addr; } /* Set up the USC common size for the context switch resume/load program * (ctx->ctx_switch.programs[i].sr->pds_load_program), which was created * as part of the render context. 
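    *
    * As an illustration (the register count here is just an example value):
    * with max_shared_registers = 128 the byte size below is 128 * 4 = 512,
    * and usc_common_size is 512 divided by
    * VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE, rounded up.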
*/ pvr_csb_pack (&state->regs.vdm_ctx_resume_task0_size, VDMCTRL_PDS_STATE0, value) { /* Calculate the size in bytes. */ const uint16_t shared_registers_size = job->max_shared_registers * 4; value.usc_common_size = DIV_ROUND_UP(shared_registers_size, PVRX(VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE)); }; state->flags = 0; if (!job->rt_dataset->need_frag) state->flags |= PVR_WINSYS_GEOM_FLAG_FIRST_GEOMETRY; if (job->geometry_terminate) state->flags |= PVR_WINSYS_GEOM_FLAG_LAST_GEOMETRY; if (job->frag_uses_atomic_ops) state->flags |= PVR_WINSYS_GEOM_FLAG_SINGLE_CORE; } static inline void pvr_get_isp_num_tiles_xy(const struct pvr_device_info *dev_info, uint32_t samples, uint32_t width, uint32_t height, uint32_t *const x_out, uint32_t *const y_out) { uint32_t tile_samples_x; uint32_t tile_samples_y; uint32_t scale_x; uint32_t scale_y; rogue_get_isp_samples_per_tile_xy(dev_info, samples, &tile_samples_x, &tile_samples_y); switch (samples) { case 1: scale_x = 1; scale_y = 1; break; case 2: scale_x = 1; scale_y = 2; break; case 4: scale_x = 2; scale_y = 2; break; case 8: scale_x = 2; scale_y = 4; break; default: unreachable("Unsupported number of samples"); } *x_out = DIV_ROUND_UP(width * scale_x, tile_samples_x); *y_out = DIV_ROUND_UP(height * scale_y, tile_samples_y); if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) { assert(PVR_GET_FEATURE_VALUE(dev_info, simple_parameter_format_version, 0U) == 2U); /* Align to a 2x2 tile block. */ *x_out = ALIGN_POT(*x_out, 2); *y_out = ALIGN_POT(*y_out, 2); } } static void pvr_render_job_ws_fragment_state_init(struct pvr_render_ctx *ctx, struct pvr_render_job *job, struct pvr_winsys_fragment_state *state) { const enum PVRX(CR_ISP_AA_MODE_TYPE) isp_aa_mode = pvr_cr_isp_aa_mode_type(job->samples); const struct pvr_device_runtime_info *dev_runtime_info = &ctx->device->pdevice->dev_runtime_info; const struct pvr_device_info *dev_info = &ctx->device->pdevice->dev_info; uint32_t isp_ctl; /* FIXME: what to do when job->run_frag is false? */ /* FIXME: pass in the number of samples rather than isp_aa_mode? */ pvr_setup_tiles_in_flight(dev_info, dev_runtime_info, isp_aa_mode, job->pixel_output_width, false, job->max_tiles_in_flight, &isp_ctl, &state->regs.usc_pixel_output_ctrl); pvr_csb_pack (&state->regs.isp_ctl, CR_ISP_CTL, value) { value.sample_pos = true; /* FIXME: There are a number of things that cause this to be set, this * is just one of them. */ value.process_empty_tiles = job->process_empty_tiles; } /* FIXME: When pvr_setup_tiles_in_flight() is refactored it might be * possible to fully pack CR_ISP_CTL above rather than having to OR in part * of the value. */ state->regs.isp_ctl |= isp_ctl; pvr_csb_pack (&state->regs.isp_aa, CR_ISP_AA, value) { value.mode = isp_aa_mode; } /* The set up of CR_TPU must be identical to * pvr_render_job_ws_geometry_state_init(). */ pvr_csb_pack (&state->regs.tpu, CR_TPU, value) { value.tag_cem_4k_face_packing = true; } if (PVR_HAS_FEATURE(dev_info, cluster_grouping) && PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls) && dev_runtime_info->num_phantoms > 1 && job->frag_uses_atomic_ops) { /* Each phantom has its own MCU, so atomicity can only be guaranteed * when all work items are processed on the same phantom. This means we * need to disable all USCs other than those of the first phantom, which * has 4 clusters. Note that we only need to do this for atomic * operations in fragment shaders, since hardware prevents the TA to run * on more than one phantom anyway. 
*/ state->regs.pixel_phantom = 0xF; } else { state->regs.pixel_phantom = 0; } pvr_csb_pack (&state->regs.isp_bgobjvals, CR_ISP_BGOBJVALS, value) { value.enablebgtag = job->enable_bg_tag; value.mask = true; /* FIXME: Hard code this for now as we don't currently support any * stencil image formats. */ value.stencil = 0xFF; } pvr_csb_pack (&state->regs.isp_bgobjdepth, CR_ISP_BGOBJDEPTH, value) { /* FIXME: This is suitable for the single depth format the driver * currently supports, but may need updating to handle other depth * formats. */ value.value = fui(job->depth_clear_value); } /* FIXME: Some additional set up needed to support depth and stencil * load/store operations. */ pvr_csb_pack (&state->regs.isp_zlsctl, CR_ISP_ZLSCTL, value) { uint32_t aligned_width = ALIGN_POT(job->depth_physical_width, ROGUE_IPF_TILE_SIZE_PIXELS); uint32_t aligned_height = ALIGN_POT(job->depth_physical_height, ROGUE_IPF_TILE_SIZE_PIXELS); pvr_get_isp_num_tiles_xy(dev_info, job->samples, aligned_width, aligned_height, &value.zlsextent_x_z, &value.zlsextent_y_z); value.zlsextent_x_z -= 1; value.zlsextent_y_z -= 1; if (job->depth_memlayout == PVR_MEMLAYOUT_TWIDDLED) { value.loadtwiddled = true; value.storetwiddled = true; } /* FIXME: This is suitable for the single depth format the driver * currently supports, but may need updating to handle other depth * formats. */ assert(job->depth_vk_format == VK_FORMAT_D32_SFLOAT); value.zloadformat = PVRX(CR_ZLOADFORMAT_TYPE_F32Z); value.zstoreformat = PVRX(CR_ZSTOREFORMAT_TYPE_F32Z); } if (PVR_HAS_FEATURE(dev_info, zls_subtile)) { pvr_csb_pack (&state->regs.isp_zls_pixels, CR_ISP_ZLS_PIXELS, value) { value.x = job->depth_stride - 1; value.y = job->depth_height - 1; } } else { state->regs.isp_zls_pixels = 0; } pvr_csb_pack (&state->regs.isp_zload_store_base, CR_ISP_ZLOAD_BASE, value) { value.addr = job->depth_addr; } pvr_csb_pack (&state->regs.isp_stencil_load_store_base, CR_ISP_STENCIL_LOAD_BASE, value) { value.addr = job->stencil_addr; /* FIXME: May need to set value.enable to true. 
*/ } pvr_csb_pack (&state->regs.tpu_border_colour_table, CR_TPU_BORDER_COLOUR_TABLE_PDM, value) { value.border_colour_table_address = job->border_colour_table_addr; } state->regs.isp_oclqry_base = 0; pvr_csb_pack (&state->regs.isp_dbias_base, CR_ISP_DBIAS_BASE, value) { value.addr = job->depth_bias_table_addr; } pvr_csb_pack (&state->regs.isp_scissor_base, CR_ISP_SCISSOR_BASE, value) { value.addr = job->scissor_table_addr; } pvr_csb_pack (&state->regs.event_pixel_pds_info, CR_EVENT_PIXEL_PDS_INFO, value) { value.const_size = DIV_ROUND_UP(ctx->device->pixel_event_data_size_in_dwords, PVRX(CR_EVENT_PIXEL_PDS_INFO_CONST_SIZE_UNIT_SIZE)); value.temp_stride = 0; value.usc_sr_size = DIV_ROUND_UP(PVR_STATE_PBE_DWORDS, PVRX(CR_EVENT_PIXEL_PDS_INFO_USC_SR_SIZE_UNIT_SIZE)); } pvr_csb_pack (&state->regs.event_pixel_pds_data, CR_EVENT_PIXEL_PDS_DATA, value) { value.addr = PVR_DEV_ADDR(job->pds_pixel_event_data_offset); } STATIC_ASSERT(ARRAY_SIZE(state->regs.pbe_word) == ARRAY_SIZE(job->pbe_reg_words)); STATIC_ASSERT(ARRAY_SIZE(state->regs.pbe_word[0]) == ARRAY_SIZE(job->pbe_reg_words[0])); for (uint32_t i = 0; i < ARRAY_SIZE(job->pbe_reg_words); i++) { state->regs.pbe_word[i][0] = job->pbe_reg_words[i][0]; state->regs.pbe_word[i][1] = job->pbe_reg_words[i][1]; state->regs.pbe_word[i][2] = job->pbe_reg_words[i][2]; } STATIC_ASSERT(__same_type(state->regs.pds_bgnd, job->pds_bgnd_reg_values)); typed_memcpy(state->regs.pds_bgnd, job->pds_bgnd_reg_values, ARRAY_SIZE(state->regs.pds_bgnd)); memset(state->regs.pds_pr_bgnd, 0, sizeof(state->regs.pds_pr_bgnd)); /* FIXME: Merge geometry and fragment flags into a single flags member? */ /* FIXME: move to its own function? */ state->flags = 0; if (job->depth_addr.addr) state->flags |= PVR_WINSYS_FRAG_FLAG_DEPTH_BUFFER_PRESENT; if (job->stencil_addr.addr) state->flags |= PVR_WINSYS_FRAG_FLAG_STENCIL_BUFFER_PRESENT; if (job->disable_compute_overlap) state->flags |= PVR_WINSYS_FRAG_FLAG_PREVENT_CDM_OVERLAP; if (job->frag_uses_atomic_ops) state->flags |= PVR_WINSYS_FRAG_FLAG_SINGLE_CORE; state->zls_stride = job->depth_layer_size; state->sls_stride = job->depth_layer_size; } static void pvr_render_job_ws_submit_info_init( struct pvr_render_ctx *ctx, struct pvr_render_job *job, const struct pvr_winsys_job_bo *bos, uint32_t bo_count, struct vk_sync **waits, uint32_t wait_count, uint32_t *stage_flags, struct pvr_winsys_render_submit_info *submit_info) { memset(submit_info, 0, sizeof(*submit_info)); submit_info->rt_dataset = job->rt_dataset->ws_rt_dataset; submit_info->rt_data_idx = job->rt_dataset->rt_data_idx; submit_info->frame_num = ctx->device->global_queue_present_count; submit_info->job_num = ctx->device->global_queue_job_count; submit_info->run_frag = job->run_frag; submit_info->bos = bos; submit_info->bo_count = bo_count; submit_info->waits = waits; submit_info->wait_count = wait_count; submit_info->stage_flags = stage_flags; /* FIXME: add WSI image bos. */ pvr_render_job_ws_geometry_state_init(ctx, job, &submit_info->geometry); pvr_render_job_ws_fragment_state_init(ctx, job, &submit_info->fragment); /* These values are expected to match. 
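    * Both state-init helpers above pack CR_TPU with identical settings, so
    * the geometry and fragment copies must agree; the assert below checks
    * this.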
    */
   assert(submit_info->geometry.regs.tpu == submit_info->fragment.regs.tpu);
}

VkResult pvr_render_job_submit(struct pvr_render_ctx *ctx,
                               struct pvr_render_job *job,
                               const struct pvr_winsys_job_bo *bos,
                               uint32_t bo_count,
                               struct vk_sync **waits,
                               uint32_t wait_count,
                               uint32_t *stage_flags,
                               struct vk_sync *signal_sync_geom,
                               struct vk_sync *signal_sync_frag)
{
   struct pvr_rt_dataset *rt_dataset = job->rt_dataset;
   struct pvr_winsys_render_submit_info submit_info;
   struct pvr_device *device = ctx->device;
   VkResult result;

   pvr_render_job_ws_submit_info_init(ctx,
                                      job,
                                      bos,
                                      bo_count,
                                      waits,
                                      wait_count,
                                      stage_flags,
                                      &submit_info);

   result = device->ws->ops->render_submit(ctx->ws_ctx,
                                           &submit_info,
                                           signal_sync_geom,
                                           signal_sync_frag);
   if (result != VK_SUCCESS)
      return result;

   if (job->run_frag) {
      /* Move to the next render target data now that a fragment job has been
       * successfully submitted. This allows the next geometry job to be
       * submitted and run in parallel with it.
       */
      rt_dataset->rt_data_idx =
         (rt_dataset->rt_data_idx + 1) % ARRAY_SIZE(rt_dataset->rt_datas);

      rt_dataset->need_frag = false;
   } else {
      rt_dataset->need_frag = true;
   }

   return VK_SUCCESS;
}
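
/* Illustrative call-flow sketch (not part of the driver): based on the
 * asserts and the rt_data_idx handling above, the expected usage of this
 * file's entry points is roughly the following, where device, ctx, job, the
 * BO list and the sync objects are placeholders for state set up elsewhere:
 *
 *    struct pvr_rt_dataset *rt_dataset;
 *    VkResult result;
 *
 *    // Created once per render-target configuration.
 *    result = pvr_render_target_dataset_create(device, width, height,
 *                                              samples, layers, &rt_dataset);
 *
 *    // One geometry/fragment job pair per render; on a successful submit
 *    // with run_frag set, rt_data_idx advances to the next RT data.
 *    job->rt_dataset = rt_dataset;
 *    result = pvr_render_job_submit(ctx, job, bos, bo_count, waits,
 *                                   wait_count, stage_flags,
 *                                   signal_sync_geom, signal_sync_frag);
 *
 *    pvr_render_target_dataset_destroy(rt_dataset);
 */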