1/*
2 * Copyright © 2022 Imagination Technologies Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to deal
6 * in the Software without restriction, including without limitation the rights
7 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 * copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24#include <assert.h>
25#include <stdbool.h>
26#include <stdint.h>
27#include <vulkan/vulkan.h>
28
29#include "hwdef/rogue_hw_defs.h"
30#include "hwdef/rogue_hw_utils.h"
31#include "pvr_bo.h"
32#include "pvr_csb.h"
33#include "pvr_csb_enum_helpers.h"
34#include "pvr_debug.h"
35#include "pvr_job_common.h"
36#include "pvr_job_context.h"
37#include "pvr_job_render.h"
38#include "pvr_pds.h"
39#include "pvr_private.h"
40#include "pvr_rogue_fw.h"
41#include "pvr_types.h"
42#include "pvr_winsys.h"
43#include "util/compiler.h"
44#include "util/macros.h"
45#include "util/u_math.h"
46#include "vk_alloc.h"
47#include "vk_log.h"
48#include "vk_util.h"
49
50#define ROGUE_BIF_PM_FREELIST_BASE_ADDR_ALIGNSIZE 16U
51
52/* FIXME: Is there a hardware define we can use instead? */
53/* 1 DWord per PM physical page stored in the free list */
54#define ROGUE_FREE_LIST_ENTRY_SIZE ((uint32_t)sizeof(uint32_t))
55
56/* FIXME: The three defines below, for the number of PC, PD and PT entries in a
57 * 4KB page, come from rgxmmudefs_km.h (meaning they're part of the
58 * auto-generated hwdefs). Should these be defined in rogue_mmu.xml? Keeping in
 * mind that we probably only need these three values.
 */
60#define ROGUE_NUM_PC_ENTRIES_PER_PAGE 0x400U
61
62#define ROGUE_NUM_PD_ENTRIES_PER_PAGE 0x200U
63
64#define ROGUE_NUM_PT_ENTRIES_PER_PAGE 0x200U
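
/* These counts are consistent with a 4KB MMU page holding 4-byte page
 * catalogue entries (4096 / 4 = 0x400) and 8-byte page directory/table
 * entries (4096 / 8 = 0x200); the authoritative values remain the
 * rgxmmudefs_km.h hwdefs mentioned above.
 */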
65
66struct pvr_free_list {
67   struct pvr_device *device;
68
69   uint64_t size;
70
71   struct pvr_bo *bo;
72
73   struct pvr_winsys_free_list *ws_free_list;
74};
75
76/* Macrotile information. */
77struct pvr_rt_mtile_info {
78   uint32_t tile_size_x;
79   uint32_t tile_size_y;
80
81   uint32_t num_tiles_x;
82   uint32_t num_tiles_y;
83
84   uint32_t tiles_per_mtile_x;
85   uint32_t tiles_per_mtile_y;
86
87   uint32_t x_tile_max;
88   uint32_t y_tile_max;
89
90   uint32_t mtiles_x;
91   uint32_t mtiles_y;
92
93   uint32_t mtile_x1;
94   uint32_t mtile_y1;
95   uint32_t mtile_x2;
96   uint32_t mtile_y2;
97   uint32_t mtile_x3;
98   uint32_t mtile_y3;
99
100   uint32_t mtile_stride;
101};
102
103struct pvr_rt_dataset {
104   struct pvr_device *device;
105
106   /* RT dataset information */
107   uint32_t width;
108   uint32_t height;
109   uint32_t samples;
110   uint32_t layers;
111
112   struct pvr_free_list *global_free_list;
113   struct pvr_free_list *local_free_list;
114
115   struct pvr_bo *vheap_rtc_bo;
116   pvr_dev_addr_t vheap_dev_addr;
117   pvr_dev_addr_t rtc_dev_addr;
118
119   struct pvr_bo *tpc_bo;
120   uint64_t tpc_stride;
121   uint64_t tpc_size;
122
123   struct pvr_winsys_rt_dataset *ws_rt_dataset;
124
125   /* RT data information */
126   struct pvr_bo *mta_mlist_bo;
127
128   struct pvr_bo *rgn_headers_bo;
129   uint64_t rgn_headers_stride;
130
131   bool need_frag;
132
133   uint8_t rt_data_idx;
134
135   struct {
136      pvr_dev_addr_t mta_dev_addr;
137      pvr_dev_addr_t mlist_dev_addr;
138      pvr_dev_addr_t rgn_headers_dev_addr;
139   } rt_datas[ROGUE_NUM_RTDATAS];
140};
141
142VkResult pvr_free_list_create(struct pvr_device *device,
143                              uint32_t initial_size,
144                              uint32_t max_size,
145                              uint32_t grow_size,
146                              uint32_t grow_threshold,
147                              struct pvr_free_list *parent_free_list,
148                              struct pvr_free_list **const free_list_out)
149{
150   struct pvr_winsys_free_list *parent_ws_free_list =
151      parent_free_list ? parent_free_list->ws_free_list : NULL;
152   const uint64_t bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED |
153                             PVR_BO_ALLOC_FLAG_PM_FW_PROTECT;
154   struct pvr_free_list *free_list;
155   uint32_t cache_line_size;
156   uint32_t initial_num_pages;
157   uint32_t grow_num_pages;
158   uint32_t max_num_pages;
159   uint64_t addr_alignment;
160   uint64_t size_alignment;
161   uint64_t size;
162   VkResult result;
163
164   assert((initial_size + grow_size) <= max_size);
165   assert(max_size != 0);
166   assert(grow_threshold <= 100);
167
168   /* Make sure the free list is created with at least a single page. */
169   if (initial_size == 0)
170      initial_size = ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE;
171
   /* The freelist sizes must respect the PM freelist base address alignment
173    * requirement. As the freelist entries are cached by the SLC, it's also
174    * necessary to ensure the sizes respect the SLC cache line size to avoid
175    * invalid entries appearing in the cache, which would be problematic after
176    * a grow operation, as the SLC entries aren't invalidated. We do this by
    * making sure the freelist sizes are appropriately aligned.
178    *
179    * To calculate the alignment, we first take the largest of the freelist
180    * base address alignment and the SLC cache line size. We then divide this
181    * by the freelist entry size to determine the number of freelist entries
182    * required by the PM. Finally, as each entry holds a single PM physical
    * page, we multiply the number of entries by the page size.
184    *
185    * As an example, if the base address alignment is 16 bytes, the SLC cache
186    * line size is 64 bytes and the freelist entry size is 4 bytes then 16
187    * entries are required, as we take the SLC cacheline size (being the larger
188    * of the two values) and divide this by 4. If the PM page size is 4096
189    * bytes then we end up with an alignment of 65536 bytes.
190    */
191   cache_line_size = rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
192
193   addr_alignment =
194      MAX2(ROGUE_BIF_PM_FREELIST_BASE_ADDR_ALIGNSIZE, cache_line_size);
195   size_alignment = (addr_alignment / ROGUE_FREE_LIST_ENTRY_SIZE) *
196                    ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE;
197
198   assert(util_is_power_of_two_nonzero(size_alignment));
199
200   initial_size = align64(initial_size, size_alignment);
201   max_size = align64(max_size, size_alignment);
202   grow_size = align64(grow_size, size_alignment);
203
204   /* Make sure the 'max' size doesn't exceed what the firmware supports and
205    * adjust the other sizes accordingly.
206    */
207   if (max_size > ROGUE_FREE_LIST_MAX_SIZE) {
208      max_size = ROGUE_FREE_LIST_MAX_SIZE;
209      assert(align64(max_size, size_alignment) == max_size);
210   }
211
212   if (initial_size > max_size)
213      initial_size = max_size;
214
215   if (initial_size == max_size)
216      grow_size = 0;
217
218   initial_num_pages = initial_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
219   max_num_pages = max_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
220   grow_num_pages = grow_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
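   /* For illustration (assuming the typical 4KB PM physical page): a 256KB
    * initial size maps to 64 pages, i.e. 256KB >> 12. The real values come
    * from the ROGUE_BIF_PM_PHYSICAL_PAGE_* hwdefs.
    */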
221
222   /* Calculate the size of the buffer needed to store the free list entries
223    * based on the maximum number of pages we can have.
224    */
225   size = max_num_pages * ROGUE_FREE_LIST_ENTRY_SIZE;
226   assert(align64(size, addr_alignment) == size);
227
228   free_list = vk_alloc(&device->vk.alloc,
229                        sizeof(*free_list),
230                        8,
231                        VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
232   if (!free_list)
233      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
234
235   /* FIXME: The memory is mapped GPU uncached, but this seems to contradict
236    * the comment above about aligning to the SLC cache line size.
237    */
238   result = pvr_bo_alloc(device,
239                         device->heaps.general_heap,
240                         size,
241                         addr_alignment,
242                         bo_flags,
243                         &free_list->bo);
244   if (result != VK_SUCCESS)
245      goto err_vk_free_free_list;
246
247   result = device->ws->ops->free_list_create(device->ws,
248                                              free_list->bo->vma,
249                                              initial_num_pages,
250                                              max_num_pages,
251                                              grow_num_pages,
252                                              grow_threshold,
253                                              parent_ws_free_list,
254                                              &free_list->ws_free_list);
255   if (result != VK_SUCCESS)
256      goto err_pvr_bo_free_bo;
257
258   free_list->device = device;
259   free_list->size = size;
260
261   *free_list_out = free_list;
262
263   return VK_SUCCESS;
264
265err_pvr_bo_free_bo:
266   pvr_bo_free(device, free_list->bo);
267
268err_vk_free_free_list:
269   vk_free(&device->vk.alloc, free_list);
270
271   return result;
272}
273
274void pvr_free_list_destroy(struct pvr_free_list *free_list)
275{
276   struct pvr_device *device = free_list->device;
277
278   device->ws->ops->free_list_destroy(free_list->ws_free_list);
279   pvr_bo_free(device, free_list->bo);
280   vk_free(&device->vk.alloc, free_list);
281}
282
283static inline void pvr_get_samples_in_xy(uint32_t samples,
284                                         uint32_t *const x_out,
285                                         uint32_t *const y_out)
286{
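   /* Decompose the sample count into a 2D arrangement where x * y == samples
    * (e.g. 8 samples -> 2 in x by 4 in y).
    */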
287   switch (samples) {
288   case 1:
289      *x_out = 1;
290      *y_out = 1;
291      break;
292   case 2:
293      *x_out = 1;
294      *y_out = 2;
295      break;
296   case 4:
297      *x_out = 2;
298      *y_out = 2;
299      break;
300   case 8:
301      *x_out = 2;
302      *y_out = 4;
303      break;
304   default:
305      unreachable("Unsupported number of samples");
306   }
307}
308
309static void pvr_rt_mtile_info_init(struct pvr_device *device,
310                                   struct pvr_rt_mtile_info *info,
311                                   uint32_t width,
312                                   uint32_t height,
313                                   uint32_t samples)
314{
315   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
316   uint32_t samples_in_x;
317   uint32_t samples_in_y;
318
319   pvr_get_samples_in_xy(samples, &samples_in_x, &samples_in_y);
320
321   info->tile_size_x = PVR_GET_FEATURE_VALUE(dev_info, tile_size_x, 1);
322   info->tile_size_y = PVR_GET_FEATURE_VALUE(dev_info, tile_size_y, 1);
323
324   info->num_tiles_x = DIV_ROUND_UP(width, info->tile_size_x);
325   info->num_tiles_y = DIV_ROUND_UP(height, info->tile_size_y);
326
327   rogue_get_num_macrotiles_xy(dev_info, &info->mtiles_x, &info->mtiles_y);
328
329   if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
330      assert(PVR_GET_FEATURE_VALUE(dev_info,
331                                   simple_parameter_format_version,
332                                   0) == 2);
333      /* Set up 16 macrotiles with a multiple of 2x2 tiles per macrotile,
334       * which is aligned to a tile group.
335       */
336      info->mtile_x1 = DIV_ROUND_UP(info->num_tiles_x, 8) * 2;
337      info->mtile_y1 = DIV_ROUND_UP(info->num_tiles_y, 8) * 2;
338      info->mtile_x2 = 0;
339      info->mtile_y2 = 0;
340      info->mtile_x3 = 0;
341      info->mtile_y3 = 0;
342      info->x_tile_max = ALIGN_POT(info->num_tiles_x, 2) - 1;
343      info->y_tile_max = ALIGN_POT(info->num_tiles_y, 2) - 1;
344   } else {
345      /* Set up 16 macrotiles with a multiple of 4x4 tiles per macrotile. */
346      info->mtile_x1 = ALIGN_POT(DIV_ROUND_UP(info->num_tiles_x, 4), 4);
347      info->mtile_y1 = ALIGN_POT(DIV_ROUND_UP(info->num_tiles_y, 4), 4);
348      info->mtile_x2 = info->mtile_x1 * 2;
349      info->mtile_y2 = info->mtile_y1 * 2;
350      info->mtile_x3 = info->mtile_x1 * 3;
351      info->mtile_y3 = info->mtile_y1 * 3;
352      info->x_tile_max = info->num_tiles_x - 1;
353      info->y_tile_max = info->num_tiles_y - 1;
354   }
355
356   info->tiles_per_mtile_x = info->mtile_x1 * samples_in_x;
357   info->tiles_per_mtile_y = info->mtile_y1 * samples_in_y;
358
359   info->mtile_stride = info->mtile_x1 * info->mtile_y1;
360}
361
362/* Note that the unit of the return value depends on the GPU. For cores with the
363 * simple_internal_parameter_format feature the returned size is interpreted as
 * the number of region headers. For cores without this feature it's interpreted
365 * as the size in dwords.
366 */
367static uint64_t
368pvr_rt_get_isp_region_size(struct pvr_device *device,
369                           const struct pvr_rt_mtile_info *mtile_info)
370{
371   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
372   uint64_t rgn_size =
373      mtile_info->tiles_per_mtile_x * mtile_info->tiles_per_mtile_y;
374
375   if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
376      uint32_t version;
377
378      rgn_size *= mtile_info->mtiles_x * mtile_info->mtiles_y;
379
380      if (PVR_FEATURE_VALUE(dev_info,
381                            simple_parameter_format_version,
382                            &version)) {
383         version = 0;
384      }
385
386      if (version == 2) {
387         /* One region header per 2x2 tile group. */
388         rgn_size /= (2U * 2U);
389      }
390   } else {
391      const uint64_t rgn_header_size = rogue_get_region_header_size(dev_info);
392
      /* Round up to the next dword to prevent IPF overrun and convert to dwords.
394       */
395      rgn_size = DIV_ROUND_UP(rgn_size * rgn_header_size, 4);
396   }
397
398   return rgn_size;
399}
400
401static VkResult pvr_rt_vheap_rtc_data_init(struct pvr_device *device,
402                                           struct pvr_rt_dataset *rt_dataset,
403                                           uint32_t layers)
404{
405   const uint64_t bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED |
406                             PVR_BO_ALLOC_FLAG_ZERO_ON_ALLOC;
407   uint64_t vheap_size;
408   uint32_t alignment;
409   uint64_t rtc_size;
410   VkResult result;
411
412   vheap_size = ROGUE_CR_PM_VHEAP_TABLE_SIZE * ROGUE_PM_VHEAP_ENTRY_SIZE;
413
414   if (layers > 1) {
415      uint64_t rtc_entries;
416
417      vheap_size = ALIGN_POT(vheap_size, PVRX(CR_TA_RTC_ADDR_BASE_ALIGNMENT));
418
419      rtc_entries = ROGUE_NUM_TEAC + ROGUE_NUM_TE + ROGUE_NUM_VCE;
420      if (PVR_HAS_QUIRK(&device->pdevice->dev_info, 48545))
421         rtc_entries += ROGUE_NUM_TE;
422
423      rtc_size = rtc_entries * ROGUE_RTC_SIZE_IN_BYTES;
424   } else {
425      rtc_size = 0;
426   }
427
428   alignment = MAX2(PVRX(CR_PM_VHEAP_TABLE_BASE_ADDR_ALIGNMENT),
429                    PVRX(CR_TA_RTC_ADDR_BASE_ALIGNMENT));
430
431   result = pvr_bo_alloc(device,
432                         device->heaps.general_heap,
433                         vheap_size + rtc_size,
434                         alignment,
435                         bo_flags,
436                         &rt_dataset->vheap_rtc_bo);
437   if (result != VK_SUCCESS)
438      return result;
439
440   rt_dataset->vheap_dev_addr = rt_dataset->vheap_rtc_bo->vma->dev_addr;
441
442   if (rtc_size > 0) {
443      rt_dataset->rtc_dev_addr =
444         PVR_DEV_ADDR_OFFSET(rt_dataset->vheap_dev_addr, vheap_size);
445   } else {
446      rt_dataset->rtc_dev_addr = PVR_DEV_ADDR_INVALID;
447   }
448
449   return VK_SUCCESS;
450}
451
452static void pvr_rt_vheap_rtc_data_fini(struct pvr_rt_dataset *rt_dataset)
453{
454   rt_dataset->rtc_dev_addr = PVR_DEV_ADDR_INVALID;
455
456   pvr_bo_free(rt_dataset->device, rt_dataset->vheap_rtc_bo);
457   rt_dataset->vheap_rtc_bo = NULL;
458}
459
460static void
461pvr_rt_get_tail_ptr_stride_size(const struct pvr_device *device,
462                                const struct pvr_rt_mtile_info *mtile_info,
463                                uint32_t layers,
464                                uint64_t *const stride_out,
465                                uint64_t *const size_out)
466{
467   uint32_t max_num_mtiles;
468   uint32_t num_mtiles_x;
469   uint32_t num_mtiles_y;
470   uint32_t version;
471   uint64_t size;
472
473   num_mtiles_x = mtile_info->mtiles_x * mtile_info->tiles_per_mtile_x;
474   num_mtiles_y = mtile_info->mtiles_y * mtile_info->tiles_per_mtile_y;
475
476   max_num_mtiles = MAX2(util_next_power_of_two64(num_mtiles_x),
477                         util_next_power_of_two64(num_mtiles_y));
478
479   size = max_num_mtiles * max_num_mtiles;
480
481   if (PVR_FEATURE_VALUE(&device->pdevice->dev_info,
482                         simple_parameter_format_version,
483                         &version)) {
484      version = 0;
485   }
486
487   if (version == 2) {
488      /* One tail pointer cache entry per 2x2 tile group. */
489      size /= (2U * 2U);
490   }
491
492   size *= ROGUE_TAIL_POINTER_SIZE;
493
494   if (layers > 1) {
495      size = ALIGN_POT(size, ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE);
496
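      /* The per-layer stride appears to be expressed in PM physical pages,
       * hence the page alignment above and the division below.
       */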
497      *stride_out = size / ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE;
498      *size_out = size * layers;
499   } else {
500      *stride_out = 0;
501      *size_out = size;
502   }
503}
504
505static VkResult pvr_rt_tpc_data_init(struct pvr_device *device,
506                                     struct pvr_rt_dataset *rt_dataset,
507                                     const struct pvr_rt_mtile_info *mtile_info,
508                                     uint32_t layers)
509{
510   const uint64_t bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED |
511                             PVR_BO_ALLOC_FLAG_ZERO_ON_ALLOC;
512   uint64_t tpc_size;
513
514   pvr_rt_get_tail_ptr_stride_size(device,
515                                   mtile_info,
516                                   layers,
517                                   &rt_dataset->tpc_stride,
518                                   &rt_dataset->tpc_size);
519   tpc_size = ALIGN_POT(rt_dataset->tpc_size, ROGUE_TE_TPC_CACHE_LINE_SIZE);
520
521   return pvr_bo_alloc(device,
522                       device->heaps.general_heap,
523                       tpc_size,
524                       PVRX(CR_TE_TPC_ADDR_BASE_ALIGNMENT),
525                       bo_flags,
526                       &rt_dataset->tpc_bo);
527}
528
529static void pvr_rt_tpc_data_fini(struct pvr_rt_dataset *rt_dataset)
530{
531   pvr_bo_free(rt_dataset->device, rt_dataset->tpc_bo);
532   rt_dataset->tpc_bo = NULL;
533}
534
535static uint32_t
536pvr_rt_get_mlist_size(const struct pvr_free_list *global_free_list,
537                      const struct pvr_free_list *local_free_list)
538{
539   uint32_t num_pte_pages;
540   uint32_t num_pde_pages;
541   uint32_t num_pce_pages;
542   uint64_t total_pages;
543   uint32_t mlist_size;
544
545   assert(global_free_list->size + local_free_list->size <=
546          ROGUE_PM_MAX_PB_VIRT_ADDR_SPACE);
547
548   total_pages = (global_free_list->size + local_free_list->size) >>
549                 ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
550
551   /* Calculate the total number of physical pages required to hold the page
552    * table, directory and catalog entries for the freelist pages.
553    */
554   num_pte_pages = DIV_ROUND_UP(total_pages, ROGUE_NUM_PT_ENTRIES_PER_PAGE);
555   num_pde_pages = DIV_ROUND_UP(num_pte_pages, ROGUE_NUM_PD_ENTRIES_PER_PAGE);
556   num_pce_pages = DIV_ROUND_UP(num_pde_pages, ROGUE_NUM_PC_ENTRIES_PER_PAGE);
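   /* Worked example (illustrative only): a combined 64MB PB is 16384 4KB
    * pages, needing DIV_ROUND_UP(16384, 0x200) = 32 PT pages,
    * DIV_ROUND_UP(32, 0x200) = 1 PD page and DIV_ROUND_UP(1, 0x400) = 1 PC
    * page.
    */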
557
   /* Calculate the MList size, taking into account that the total number of
    * pages in the PB is shared among all the PM address spaces.
560    */
561   mlist_size = (num_pce_pages + num_pde_pages + num_pte_pages) *
562                ROGUE_NUM_PM_ADDRESS_SPACES * ROGUE_MLIST_ENTRY_STRIDE;
563
564   return ALIGN_POT(mlist_size, ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE);
565}
566
567static void pvr_rt_get_region_headers_stride_size(
568   const struct pvr_device *device,
569   const struct pvr_rt_mtile_info *mtile_info,
570   uint32_t layers,
571   uint64_t *const stride_out,
572   uint64_t *const size_out)
573{
574   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
575   const uint32_t rgn_header_size = rogue_get_region_header_size(dev_info);
576   uint32_t rgn_headers_size;
577   uint32_t num_tiles_x;
578   uint32_t num_tiles_y;
579   uint32_t group_size;
580   uint32_t version;
581
582   if (PVR_FEATURE_VALUE(dev_info, simple_parameter_format_version, &version))
583      version = 0;
584
585   group_size = version == 2 ? 2 : 1;
586
587   num_tiles_x = mtile_info->mtiles_x * mtile_info->tiles_per_mtile_x;
588   num_tiles_y = mtile_info->mtiles_y * mtile_info->tiles_per_mtile_y;
589
590   rgn_headers_size =
591      (num_tiles_x / group_size) * (num_tiles_y / group_size) * rgn_header_size;
592
593   if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
594      rgn_headers_size =
595         ALIGN_POT(rgn_headers_size, PVRX(CR_TE_PSGREGION_ADDR_BASE_ALIGNMENT));
596   }
597
598   if (layers > 1) {
599      rgn_headers_size =
600         ALIGN_POT(rgn_headers_size, PVRX(CR_TE_PSG_REGION_STRIDE_UNIT_SIZE));
601   }
602
   *stride_out = rgn_headers_size;
604   *size_out = rgn_headers_size * layers;
605}
606
607static VkResult
608pvr_rt_mta_mlist_data_init(struct pvr_device *device,
609                           struct pvr_rt_dataset *rt_dataset,
610                           const struct pvr_free_list *global_free_list,
611                           const struct pvr_free_list *local_free_list,
612                           const struct pvr_rt_mtile_info *mtile_info)
613{
614   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
615   const uint32_t mlist_size =
616      pvr_rt_get_mlist_size(global_free_list, local_free_list);
617   uint32_t mta_size = rogue_get_macrotile_array_size(dev_info);
618   const uint32_t num_rt_datas = ARRAY_SIZE(rt_dataset->rt_datas);
619   uint32_t rt_datas_mlist_size;
620   uint32_t rt_datas_mta_size;
621   pvr_dev_addr_t dev_addr;
622   VkResult result;
623
624   /* Allocate memory for macrotile array and Mlist for all RT datas.
625    *
626    * Allocation layout: MTA[0..N] + Mlist alignment padding + Mlist[0..N].
627    *
628    * N is number of RT datas.
629    */
630   rt_datas_mta_size = ALIGN_POT(mta_size * num_rt_datas,
631                                 PVRX(CR_PM_MLIST0_BASE_ADDR_ALIGNMENT));
632   rt_datas_mlist_size = mlist_size * num_rt_datas;
633
634   result = pvr_bo_alloc(device,
635                         device->heaps.general_heap,
636                         rt_datas_mta_size + rt_datas_mlist_size,
637                         PVRX(CR_PM_MTILE_ARRAY_BASE_ADDR_ALIGNMENT),
638                         PVR_BO_ALLOC_FLAG_GPU_UNCACHED,
639                         &rt_dataset->mta_mlist_bo);
640   if (result != VK_SUCCESS)
641      return result;
642
643   dev_addr = rt_dataset->mta_mlist_bo->vma->dev_addr;
644
645   for (uint32_t i = 0; i < num_rt_datas; i++) {
646      if (mta_size != 0) {
647         rt_dataset->rt_datas[i].mta_dev_addr = dev_addr;
648         dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, mta_size);
649      } else {
650         rt_dataset->rt_datas[i].mta_dev_addr = PVR_DEV_ADDR_INVALID;
651      }
652   }
653
654   dev_addr = PVR_DEV_ADDR_OFFSET(rt_dataset->mta_mlist_bo->vma->dev_addr,
655                                  rt_datas_mta_size);
656
657   for (uint32_t i = 0; i < num_rt_datas; i++) {
658      if (mlist_size != 0) {
659         rt_dataset->rt_datas[i].mlist_dev_addr = dev_addr;
660         dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, mlist_size);
661      } else {
662         rt_dataset->rt_datas[i].mlist_dev_addr = PVR_DEV_ADDR_INVALID;
663      }
664   }
665
666   return VK_SUCCESS;
667}
668
669static void pvr_rt_mta_mlist_data_fini(struct pvr_rt_dataset *rt_dataset)
670{
671   for (uint32_t i = 0; i < ARRAY_SIZE(rt_dataset->rt_datas); i++) {
672      rt_dataset->rt_datas[i].mlist_dev_addr = PVR_DEV_ADDR_INVALID;
673      rt_dataset->rt_datas[i].mta_dev_addr = PVR_DEV_ADDR_INVALID;
674   }
675
676   pvr_bo_free(rt_dataset->device, rt_dataset->mta_mlist_bo);
677   rt_dataset->mta_mlist_bo = NULL;
678}
679
680static VkResult
681pvr_rt_rgn_headers_data_init(struct pvr_device *device,
682                             struct pvr_rt_dataset *rt_dataset,
683                             const struct pvr_rt_mtile_info *mtile_info,
684                             uint32_t layers)
685{
686   const uint32_t num_rt_datas = ARRAY_SIZE(rt_dataset->rt_datas);
687   uint64_t rgn_headers_size;
688   pvr_dev_addr_t dev_addr;
689   VkResult result;
690
691   pvr_rt_get_region_headers_stride_size(device,
692                                         mtile_info,
693                                         layers,
694                                         &rt_dataset->rgn_headers_stride,
695                                         &rgn_headers_size);
696
697   result = pvr_bo_alloc(device,
698                         device->heaps.rgn_hdr_heap,
699                         rgn_headers_size * num_rt_datas,
700                         PVRX(CR_TE_PSGREGION_ADDR_BASE_ALIGNMENT),
701                         PVR_BO_ALLOC_FLAG_GPU_UNCACHED,
702                         &rt_dataset->rgn_headers_bo);
703   if (result != VK_SUCCESS)
704      return result;
705
706   dev_addr = rt_dataset->rgn_headers_bo->vma->dev_addr;
707
708   for (uint32_t i = 0; i < num_rt_datas; i++) {
709      rt_dataset->rt_datas[i].rgn_headers_dev_addr = dev_addr;
710      dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, rgn_headers_size);
711   }
712
713   return VK_SUCCESS;
714}
715
716static void pvr_rt_rgn_headers_data_fini(struct pvr_rt_dataset *rt_dataset)
717{
718   for (uint32_t i = 0; i < ARRAY_SIZE(rt_dataset->rt_datas); i++)
719      rt_dataset->rt_datas[i].rgn_headers_dev_addr = PVR_DEV_ADDR_INVALID;
720
721   pvr_bo_free(rt_dataset->device, rt_dataset->rgn_headers_bo);
722   rt_dataset->rgn_headers_bo = NULL;
723}
724
725static VkResult pvr_rt_datas_init(struct pvr_device *device,
726                                  struct pvr_rt_dataset *rt_dataset,
727                                  const struct pvr_free_list *global_free_list,
728                                  const struct pvr_free_list *local_free_list,
729                                  const struct pvr_rt_mtile_info *mtile_info,
730                                  uint32_t layers)
731{
732   VkResult result;
733
734   result = pvr_rt_mta_mlist_data_init(device,
735                                       rt_dataset,
736                                       global_free_list,
737                                       local_free_list,
738                                       mtile_info);
739   if (result != VK_SUCCESS)
740      return result;
741
742   result =
743      pvr_rt_rgn_headers_data_init(device, rt_dataset, mtile_info, layers);
744   if (result != VK_SUCCESS)
745      goto err_pvr_rt_mta_mlist_data_fini;
746
747   return VK_SUCCESS;
748
749err_pvr_rt_mta_mlist_data_fini:
750   pvr_rt_mta_mlist_data_fini(rt_dataset);
751
   return result;
753}
754
755static void pvr_rt_datas_fini(struct pvr_rt_dataset *rt_dataset)
756{
757   pvr_rt_rgn_headers_data_fini(rt_dataset);
758   pvr_rt_mta_mlist_data_fini(rt_dataset);
759}
760
761static uint32_t
762pvr_rogue_get_cr_isp_mtile_size_val(const struct pvr_device_info *dev_info,
763                                    uint32_t samples,
764                                    const struct pvr_rt_mtile_info *mtile_info)
765{
766   uint32_t samples_per_pixel =
767      PVR_GET_FEATURE_VALUE(dev_info, isp_samples_per_pixel, 0);
768   uint32_t isp_mtile_size;
769
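   /* mtile_x1/y1 are in tiles. The scaling below appears to account for
    * sample counts above the ISP's per-pixel sample storage
    * (isp_samples_per_pixel), doubling the macrotile dimensions so the extra
    * sample positions still fit.
    */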
770   pvr_csb_pack (&isp_mtile_size, CR_ISP_MTILE_SIZE, value) {
771      value.x = mtile_info->mtile_x1;
772      value.y = mtile_info->mtile_y1;
773
774      if (samples_per_pixel == 1) {
775         if (samples >= 4)
776            value.x <<= 1;
777
778         if (samples >= 2)
779            value.y <<= 1;
780      } else if (samples_per_pixel == 2) {
781         if (samples >= 8)
782            value.x <<= 1;
783
784         if (samples >= 4)
785            value.y <<= 1;
786      } else if (samples_per_pixel == 4) {
787         if (samples >= 8)
788            value.y <<= 1;
789      } else {
790         assert(!"Unsupported ISP samples per pixel value");
791      }
792   }
793
794   return isp_mtile_size;
795}
796
797static uint64_t pvr_rogue_get_cr_multisamplectl_val(uint32_t samples,
798                                                    bool y_flip)
799{
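   /* Sample positions are expressed in 1/16ths of a pixel (0..15 per axis)
    * and match the standard Vulkan sample locations (e.g. 4x sample 0 is
    * x = 6/16 = 0.375, y = 2/16 = 0.125). The table is indexed by
    * log2(samples), and the y_flip path mirrors each position as 16 - y.
    */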
800   static const struct {
801      uint8_t x[8];
802      uint8_t y[8];
803   } sample_positions[4] = {
804      /* 1 sample */
805      {
806         .x = { 8 },
807         .y = { 8 },
808      },
809      /* 2 samples */
810      {
811         .x = { 12, 4 },
812         .y = { 12, 4 },
813      },
814      /* 4 samples */
815      {
816         .x = { 6, 14, 2, 10 },
817         .y = { 2, 6, 10, 14 },
818      },
819      /* 8 samples */
820      {
821         .x = { 9, 7, 13, 5, 3, 1, 11, 15 },
822         .y = { 5, 11, 9, 3, 13, 7, 15, 1 },
823      },
824   };
825   uint64_t multisamplectl;
826   uint8_t idx;
827
828   idx = util_fast_log2(samples);
829   assert(idx < ARRAY_SIZE(sample_positions));
830
831   pvr_csb_pack (&multisamplectl, CR_PPP_MULTISAMPLECTL, value) {
832      switch (samples) {
833      case 8:
834         value.msaa_x7 = sample_positions[idx].x[7];
835         value.msaa_x6 = sample_positions[idx].x[6];
836         value.msaa_x5 = sample_positions[idx].x[5];
837         value.msaa_x4 = sample_positions[idx].x[4];
838
839         if (y_flip) {
840            value.msaa_y7 = 16U - sample_positions[idx].y[7];
841            value.msaa_y6 = 16U - sample_positions[idx].y[6];
842            value.msaa_y5 = 16U - sample_positions[idx].y[5];
843            value.msaa_y4 = 16U - sample_positions[idx].y[4];
844         } else {
845            value.msaa_y7 = sample_positions[idx].y[7];
846            value.msaa_y6 = sample_positions[idx].y[6];
847            value.msaa_y5 = sample_positions[idx].y[5];
848            value.msaa_y4 = sample_positions[idx].y[4];
849         }
850
851         FALLTHROUGH;
852      case 4:
853         value.msaa_x3 = sample_positions[idx].x[3];
854         value.msaa_x2 = sample_positions[idx].x[2];
855
856         if (y_flip) {
857            value.msaa_y3 = 16U - sample_positions[idx].y[3];
858            value.msaa_y2 = 16U - sample_positions[idx].y[2];
859         } else {
860            value.msaa_y3 = sample_positions[idx].y[3];
861            value.msaa_y2 = sample_positions[idx].y[2];
862         }
863
864         FALLTHROUGH;
865      case 2:
866         value.msaa_x1 = sample_positions[idx].x[1];
867
868         if (y_flip) {
869            value.msaa_y1 = 16U - sample_positions[idx].y[1];
870         } else {
871            value.msaa_y1 = sample_positions[idx].y[1];
872         }
873
874         FALLTHROUGH;
875      case 1:
876         value.msaa_x0 = sample_positions[idx].x[0];
877
878         if (y_flip) {
879            value.msaa_y0 = 16U - sample_positions[idx].y[0];
880         } else {
881            value.msaa_y0 = sample_positions[idx].y[0];
882         }
883
884         break;
885      default:
886         unreachable("Unsupported number of samples");
887      }
888   }
889
890   return multisamplectl;
891}
892
893static uint32_t
894pvr_rogue_get_cr_te_aa_val(const struct pvr_device_info *dev_info,
895                           uint32_t samples)
896{
897   uint32_t samples_per_pixel =
898      PVR_GET_FEATURE_VALUE(dev_info, isp_samples_per_pixel, 0);
899   uint32_t te_aa;
900
901   pvr_csb_pack (&te_aa, CR_TE_AA, value) {
902      if (samples_per_pixel == 1) {
903         if (samples >= 2)
904            value.y = true;
905         if (samples >= 4)
906            value.x = true;
907      } else if (samples_per_pixel == 2) {
908         if (samples >= 2)
909            value.x2 = true;
910         if (samples >= 4)
911            value.y = true;
912         if (samples >= 8)
913            value.x = true;
914      } else if (samples_per_pixel == 4) {
915         if (samples >= 2)
916            value.x2 = true;
917         if (samples >= 4)
918            value.y2 = true;
919         if (samples >= 8)
920            value.y = true;
921      } else {
922         assert(!"Unsupported ISP samples per pixel value");
923      }
924   }
925
926   return te_aa;
927}
928
929static void pvr_rt_dataset_ws_create_info_init(
930   struct pvr_rt_dataset *rt_dataset,
931   const struct pvr_rt_mtile_info *mtile_info,
932   struct pvr_winsys_rt_dataset_create_info *create_info)
933{
934   struct pvr_device *device = rt_dataset->device;
935   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
936
937   memset(create_info, 0, sizeof(*create_info));
938
939   /* Local freelist. */
940   create_info->local_free_list = rt_dataset->local_free_list->ws_free_list;
941
942   /* ISP register values. */
943   if (PVR_HAS_ERN(dev_info, 42307) &&
944       !(PVR_HAS_FEATURE(dev_info, roguexe) && mtile_info->tile_size_x == 16)) {
945      float value;
946
947      if (rt_dataset->width != 0) {
948         value =
949            ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR / (float)rt_dataset->width;
950         create_info->isp_merge_lower_x = fui(value);
951
952         value =
953            ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR / (float)rt_dataset->width;
954         create_info->isp_merge_upper_x = fui(value);
955      }
956
957      if (rt_dataset->height != 0) {
958         value =
959            ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR / (float)rt_dataset->height;
960         create_info->isp_merge_lower_y = fui(value);
961
962         value =
963            ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR / (float)rt_dataset->height;
964         create_info->isp_merge_upper_y = fui(value);
965      }
966
967      value = ((float)rt_dataset->width * ROGUE_ISP_MERGE_SCALE_FACTOR) /
968              (ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR -
969               ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR);
970      create_info->isp_merge_scale_x = fui(value);
971
972      value = ((float)rt_dataset->height * ROGUE_ISP_MERGE_SCALE_FACTOR) /
973              (ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR -
974               ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR);
975      create_info->isp_merge_scale_y = fui(value);
976   }
977
978   create_info->isp_mtile_size =
979      pvr_rogue_get_cr_isp_mtile_size_val(dev_info,
980                                          rt_dataset->samples,
981                                          mtile_info);
982
983   /* PPP register values. */
984   create_info->ppp_multi_sample_ctl =
985      pvr_rogue_get_cr_multisamplectl_val(rt_dataset->samples, false);
986   create_info->ppp_multi_sample_ctl_y_flipped =
987      pvr_rogue_get_cr_multisamplectl_val(rt_dataset->samples, true);
988
989   pvr_csb_pack (&create_info->ppp_screen, CR_PPP_SCREEN, value) {
990      value.pixxmax = rt_dataset->width - 1;
991      value.pixymax = rt_dataset->height - 1;
992   }
993
994   /* TE register values. */
995   create_info->te_aa =
996      pvr_rogue_get_cr_te_aa_val(dev_info, rt_dataset->samples);
997
998   pvr_csb_pack (&create_info->te_mtile1, CR_TE_MTILE1, value) {
999      value.x1 = mtile_info->mtile_x1;
1000      if (!PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
1001         value.x2 = mtile_info->mtile_x2;
1002         value.x3 = mtile_info->mtile_x3;
1003      }
1004   }
1005
1006   pvr_csb_pack (&create_info->te_mtile2, CR_TE_MTILE2, value) {
1007      value.y1 = mtile_info->mtile_y1;
1008      if (!PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
1009         value.y2 = mtile_info->mtile_y2;
1010         value.y3 = mtile_info->mtile_y3;
1011      }
1012   }
1013
1014   pvr_csb_pack (&create_info->te_screen, CR_TE_SCREEN, value) {
1015      value.xmax = mtile_info->x_tile_max;
1016      value.ymax = mtile_info->y_tile_max;
1017   }
1018
1019   /* Allocations and associated information. */
1020   create_info->vheap_table_dev_addr = rt_dataset->vheap_dev_addr;
1021   create_info->rtc_dev_addr = rt_dataset->rtc_dev_addr;
1022
1023   create_info->tpc_dev_addr = rt_dataset->tpc_bo->vma->dev_addr;
1024   create_info->tpc_stride = rt_dataset->tpc_stride;
1025   create_info->tpc_size = rt_dataset->tpc_size;
1026
1027   STATIC_ASSERT(ARRAY_SIZE(create_info->rt_datas) ==
1028                 ARRAY_SIZE(rt_dataset->rt_datas));
1029   for (uint32_t i = 0; i < ARRAY_SIZE(create_info->rt_datas); i++) {
1030      create_info->rt_datas[i].pm_mlist_dev_addr =
1031         rt_dataset->rt_datas[i].mlist_dev_addr;
1032      create_info->rt_datas[i].macrotile_array_dev_addr =
1033         rt_dataset->rt_datas[i].mta_dev_addr;
1034      create_info->rt_datas[i].rgn_header_dev_addr =
1035         rt_dataset->rt_datas[i].rgn_headers_dev_addr;
1036   }
1037
1038   create_info->rgn_header_size =
1039      pvr_rt_get_isp_region_size(device, mtile_info);
1040
1041   /* Miscellaneous. */
1042   create_info->mtile_stride = mtile_info->mtile_stride;
1043   create_info->max_rts = rt_dataset->layers;
1044}
1045
1046VkResult
1047pvr_render_target_dataset_create(struct pvr_device *device,
1048                                 uint32_t width,
1049                                 uint32_t height,
1050                                 uint32_t samples,
1051                                 uint32_t layers,
1052                                 struct pvr_rt_dataset **const rt_dataset_out)
1053{
1054   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
1055   struct pvr_winsys_rt_dataset_create_info rt_dataset_create_info;
1056   struct pvr_rt_mtile_info mtile_info;
1057   struct pvr_rt_dataset *rt_dataset;
1058   VkResult result;
1059
1060   assert(device->global_free_list);
1061   assert(width <= rogue_get_render_size_max_x(dev_info));
1062   assert(height <= rogue_get_render_size_max_y(dev_info));
1063   assert(layers > 0 && layers <= PVR_MAX_FRAMEBUFFER_LAYERS);
1064
1065   pvr_rt_mtile_info_init(device, &mtile_info, width, height, samples);
1066
1067   rt_dataset = vk_zalloc(&device->vk.alloc,
1068                          sizeof(*rt_dataset),
1069                          8,
1070                          VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1071   if (!rt_dataset)
1072      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1073
1074   rt_dataset->device = device;
1075   rt_dataset->width = width;
1076   rt_dataset->height = height;
1077   rt_dataset->samples = samples;
1078   rt_dataset->layers = layers;
1079   rt_dataset->global_free_list = device->global_free_list;
1080
1081   /* The maximum supported free list size is based on the assumption that this
1082    * freelist (the "local" freelist) is always the minimum size required by
1083    * the hardware. See the documentation of ROGUE_FREE_LIST_MAX_SIZE for more
1084    * details.
1085    */
1086   result = pvr_free_list_create(device,
1087                                 rogue_get_min_free_list_size(dev_info),
1088                                 rogue_get_min_free_list_size(dev_info),
1089                                 0 /* grow_size */,
1090                                 0 /* grow_threshold */,
1091                                 rt_dataset->global_free_list,
1092                                 &rt_dataset->local_free_list);
1093   if (result != VK_SUCCESS)
1094      goto err_vk_free_rt_dataset;
1095
1096   result = pvr_rt_vheap_rtc_data_init(device, rt_dataset, layers);
1097   if (result != VK_SUCCESS)
1098      goto err_pvr_free_list_destroy;
1099
1100   result = pvr_rt_tpc_data_init(device, rt_dataset, &mtile_info, layers);
1101   if (result != VK_SUCCESS)
1102      goto err_pvr_rt_vheap_rtc_data_fini;
1103
1104   result = pvr_rt_datas_init(device,
1105                              rt_dataset,
1106                              rt_dataset->global_free_list,
1107                              rt_dataset->local_free_list,
1108                              &mtile_info,
1109                              layers);
1110   if (result != VK_SUCCESS)
1111      goto err_pvr_rt_tpc_data_fini;
1112
1113   /* rt_dataset must be fully initialized by this point since
1114    * pvr_rt_dataset_ws_create_info_init() depends on this.
1115    */
1116   pvr_rt_dataset_ws_create_info_init(rt_dataset,
1117                                      &mtile_info,
1118                                      &rt_dataset_create_info);
1119
1120   result =
1121      device->ws->ops->render_target_dataset_create(device->ws,
1122                                                    &rt_dataset_create_info,
1123                                                    &rt_dataset->ws_rt_dataset);
1124   if (result != VK_SUCCESS)
1125      goto err_pvr_rt_datas_fini;
1126
1127   *rt_dataset_out = rt_dataset;
1128
1129   return VK_SUCCESS;
1130
1131err_pvr_rt_datas_fini:
1132   pvr_rt_datas_fini(rt_dataset);
1133
1134err_pvr_rt_tpc_data_fini:
1135   pvr_rt_tpc_data_fini(rt_dataset);
1136
1137err_pvr_rt_vheap_rtc_data_fini:
1138   pvr_rt_vheap_rtc_data_fini(rt_dataset);
1139
1140err_pvr_free_list_destroy:
1141   pvr_free_list_destroy(rt_dataset->local_free_list);
1142
1143err_vk_free_rt_dataset:
1144   vk_free(&device->vk.alloc, rt_dataset);
1145
1146   return result;
1147}
1148
1149void pvr_render_target_dataset_destroy(struct pvr_rt_dataset *rt_dataset)
1150{
1151   struct pvr_device *device = rt_dataset->device;
1152
1153   device->ws->ops->render_target_dataset_destroy(rt_dataset->ws_rt_dataset);
1154
1155   pvr_rt_datas_fini(rt_dataset);
1156   pvr_rt_tpc_data_fini(rt_dataset);
1157   pvr_rt_vheap_rtc_data_fini(rt_dataset);
1158
1159   pvr_free_list_destroy(rt_dataset->local_free_list);
1160
1161   vk_free(&device->vk.alloc, rt_dataset);
1162}
1163
1164static void
1165pvr_render_job_ws_geometry_state_init(struct pvr_render_ctx *ctx,
1166                                      struct pvr_render_job *job,
1167                                      struct pvr_winsys_geometry_state *state)
1168{
1169   const struct pvr_device_info *dev_info = &ctx->device->pdevice->dev_info;
1170
1171   /* FIXME: Should this just be done unconditionally? The firmware will just
1172    * ignore the value anyway.
1173    */
1174   if (PVR_HAS_QUIRK(dev_info, 56279)) {
1175      pvr_csb_pack (&state->regs.pds_ctrl, CR_PDS_CTRL, value) {
1176         value.max_num_vdm_tasks = rogue_get_max_num_vdm_pds_tasks(dev_info);
1177      }
1178   } else {
1179      state->regs.pds_ctrl = 0;
1180   }
1181
1182   pvr_csb_pack (&state->regs.ppp_ctrl, CR_PPP_CTRL, value) {
1183      value.wclampen = true;
1184      value.fixed_point_format = 1;
1185   }
1186
1187   pvr_csb_pack (&state->regs.te_psg, CR_TE_PSG, value) {
1188      value.completeonterminate = job->geometry_terminate;
1189
1190      value.region_stride = job->rt_dataset->rgn_headers_stride /
1191                            PVRX(CR_TE_PSG_REGION_STRIDE_UNIT_SIZE);
1192
1193      value.forcenewstate = PVR_HAS_QUIRK(dev_info, 52942);
1194   }
1195
1196   /* The set up of CR_TPU must be identical to
1197    * pvr_render_job_ws_fragment_state_init().
1198    */
1199   pvr_csb_pack (&state->regs.tpu, CR_TPU, value) {
1200      value.tag_cem_4k_face_packing = true;
1201   }
1202
1203   pvr_csb_pack (&state->regs.tpu_border_colour_table,
1204                 CR_TPU_BORDER_COLOUR_TABLE_VDM,
1205                 value) {
1206      value.border_colour_table_address = job->border_colour_table_addr;
1207   }
1208
1209   pvr_csb_pack (&state->regs.vdm_ctrl_stream_base,
1210                 CR_VDM_CTRL_STREAM_BASE,
1211                 value) {
1212      value.addr = job->ctrl_stream_addr;
1213   }
1214
1215   /* Set up the USC common size for the context switch resume/load program
1216    * (ctx->ctx_switch.programs[i].sr->pds_load_program), which was created
1217    * as part of the render context.
1218    */
1219   pvr_csb_pack (&state->regs.vdm_ctx_resume_task0_size,
1220                 VDMCTRL_PDS_STATE0,
1221                 value) {
1222      /* Calculate the size in bytes. */
1223      const uint16_t shared_registers_size = job->max_shared_registers * 4;
1224
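      /* For example, 64 shared registers -> 256 bytes, which the
       * DIV_ROUND_UP below then expresses in USC_COMMON_SIZE_UNIT_SIZE units.
       */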
1225      value.usc_common_size =
1226         DIV_ROUND_UP(shared_registers_size,
1227                      PVRX(VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE));
1228   };
1229
1230   state->flags = 0;
1231
1232   if (!job->rt_dataset->need_frag)
1233      state->flags |= PVR_WINSYS_GEOM_FLAG_FIRST_GEOMETRY;
1234
1235   if (job->geometry_terminate)
1236      state->flags |= PVR_WINSYS_GEOM_FLAG_LAST_GEOMETRY;
1237
1238   if (job->frag_uses_atomic_ops)
1239      state->flags |= PVR_WINSYS_GEOM_FLAG_SINGLE_CORE;
1240}
1241
1242static inline void
1243pvr_get_isp_num_tiles_xy(const struct pvr_device_info *dev_info,
1244                         uint32_t samples,
1245                         uint32_t width,
1246                         uint32_t height,
1247                         uint32_t *const x_out,
1248                         uint32_t *const y_out)
1249{
1250   uint32_t tile_samples_x;
1251   uint32_t tile_samples_y;
1252   uint32_t scale_x;
1253   uint32_t scale_y;
1254
1255   rogue_get_isp_samples_per_tile_xy(dev_info,
1256                                     samples,
1257                                     &tile_samples_x,
1258                                     &tile_samples_y);
1259
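   /* The x/y scale factors below follow the same per-sample layout as
    * pvr_get_samples_in_xy() (e.g. 8 samples -> 2 in x by 4 in y).
    */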
1260   switch (samples) {
1261   case 1:
1262      scale_x = 1;
1263      scale_y = 1;
1264      break;
1265   case 2:
1266      scale_x = 1;
1267      scale_y = 2;
1268      break;
1269   case 4:
1270      scale_x = 2;
1271      scale_y = 2;
1272      break;
1273   case 8:
1274      scale_x = 2;
1275      scale_y = 4;
1276      break;
1277   default:
1278      unreachable("Unsupported number of samples");
1279   }
1280
1281   *x_out = DIV_ROUND_UP(width * scale_x, tile_samples_x);
1282   *y_out = DIV_ROUND_UP(height * scale_y, tile_samples_y);
1283
1284   if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
1285      assert(PVR_GET_FEATURE_VALUE(dev_info,
1286                                   simple_parameter_format_version,
1287                                   0U) == 2U);
1288      /* Align to a 2x2 tile block. */
1289      *x_out = ALIGN_POT(*x_out, 2);
1290      *y_out = ALIGN_POT(*y_out, 2);
1291   }
1292}
1293
1294static void
1295pvr_render_job_ws_fragment_state_init(struct pvr_render_ctx *ctx,
1296                                      struct pvr_render_job *job,
1297                                      struct pvr_winsys_fragment_state *state)
1298{
1299   const enum PVRX(CR_ISP_AA_MODE_TYPE)
1300      isp_aa_mode = pvr_cr_isp_aa_mode_type(job->samples);
1301   const struct pvr_device_runtime_info *dev_runtime_info =
1302      &ctx->device->pdevice->dev_runtime_info;
1303   const struct pvr_device_info *dev_info = &ctx->device->pdevice->dev_info;
1304   uint32_t isp_ctl;
1305
1306   /* FIXME: what to do when job->run_frag is false? */
1307
1308   /* FIXME: pass in the number of samples rather than isp_aa_mode? */
1309   pvr_setup_tiles_in_flight(dev_info,
1310                             dev_runtime_info,
1311                             isp_aa_mode,
1312                             job->pixel_output_width,
1313                             false,
1314                             job->max_tiles_in_flight,
1315                             &isp_ctl,
1316                             &state->regs.usc_pixel_output_ctrl);
1317
1318   pvr_csb_pack (&state->regs.isp_ctl, CR_ISP_CTL, value) {
1319      value.sample_pos = true;
1320
1321      /* FIXME: There are a number of things that cause this to be set, this
1322       * is just one of them.
1323       */
1324      value.process_empty_tiles = job->process_empty_tiles;
1325   }
1326
1327   /* FIXME: When pvr_setup_tiles_in_flight() is refactored it might be
1328    * possible to fully pack CR_ISP_CTL above rather than having to OR in part
1329    * of the value.
1330    */
1331   state->regs.isp_ctl |= isp_ctl;
1332
1333   pvr_csb_pack (&state->regs.isp_aa, CR_ISP_AA, value) {
1334      value.mode = isp_aa_mode;
1335   }
1336
1337   /* The set up of CR_TPU must be identical to
1338    * pvr_render_job_ws_geometry_state_init().
1339    */
1340   pvr_csb_pack (&state->regs.tpu, CR_TPU, value) {
1341      value.tag_cem_4k_face_packing = true;
1342   }
1343
1344   if (PVR_HAS_FEATURE(dev_info, cluster_grouping) &&
1345       PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls) &&
1346       dev_runtime_info->num_phantoms > 1 && job->frag_uses_atomic_ops) {
1347      /* Each phantom has its own MCU, so atomicity can only be guaranteed
1348       * when all work items are processed on the same phantom. This means we
1349       * need to disable all USCs other than those of the first phantom, which
1350       * has 4 clusters. Note that we only need to do this for atomic
       * operations in fragment shaders, since the hardware prevents the TA
       * from running on more than one phantom anyway.
1353       */
1354      state->regs.pixel_phantom = 0xF;
1355   } else {
1356      state->regs.pixel_phantom = 0;
1357   }
1358
1359   pvr_csb_pack (&state->regs.isp_bgobjvals, CR_ISP_BGOBJVALS, value) {
1360      value.enablebgtag = job->enable_bg_tag;
1361
1362      value.mask = true;
1363
1364      /* FIXME: Hard code this for now as we don't currently support any
1365       * stencil image formats.
1366       */
1367      value.stencil = 0xFF;
1368   }
1369
1370   pvr_csb_pack (&state->regs.isp_bgobjdepth, CR_ISP_BGOBJDEPTH, value) {
1371      /* FIXME: This is suitable for the single depth format the driver
1372       * currently supports, but may need updating to handle other depth
1373       * formats.
1374       */
1375      value.value = fui(job->depth_clear_value);
1376   }
1377
1378   /* FIXME: Some additional set up needed to support depth and stencil
1379    * load/store operations.
1380    */
1381   pvr_csb_pack (&state->regs.isp_zlsctl, CR_ISP_ZLSCTL, value) {
1382      uint32_t aligned_width =
1383         ALIGN_POT(job->depth_physical_width, ROGUE_IPF_TILE_SIZE_PIXELS);
1384      uint32_t aligned_height =
1385         ALIGN_POT(job->depth_physical_height, ROGUE_IPF_TILE_SIZE_PIXELS);
1386
1387      pvr_get_isp_num_tiles_xy(dev_info,
1388                               job->samples,
1389                               aligned_width,
1390                               aligned_height,
1391                               &value.zlsextent_x_z,
1392                               &value.zlsextent_y_z);
1393      value.zlsextent_x_z -= 1;
1394      value.zlsextent_y_z -= 1;
1395
1396      if (job->depth_memlayout == PVR_MEMLAYOUT_TWIDDLED) {
1397         value.loadtwiddled = true;
1398         value.storetwiddled = true;
1399      }
1400
1401      /* FIXME: This is suitable for the single depth format the driver
1402       * currently supports, but may need updating to handle other depth
1403       * formats.
1404       */
1405      assert(job->depth_vk_format == VK_FORMAT_D32_SFLOAT);
1406      value.zloadformat = PVRX(CR_ZLOADFORMAT_TYPE_F32Z);
1407      value.zstoreformat = PVRX(CR_ZSTOREFORMAT_TYPE_F32Z);
1408   }
1409
1410   if (PVR_HAS_FEATURE(dev_info, zls_subtile)) {
1411      pvr_csb_pack (&state->regs.isp_zls_pixels, CR_ISP_ZLS_PIXELS, value) {
1412         value.x = job->depth_stride - 1;
1413         value.y = job->depth_height - 1;
1414      }
1415   } else {
1416      state->regs.isp_zls_pixels = 0;
1417   }
1418
1419   pvr_csb_pack (&state->regs.isp_zload_store_base, CR_ISP_ZLOAD_BASE, value) {
1420      value.addr = job->depth_addr;
1421   }
1422
1423   pvr_csb_pack (&state->regs.isp_stencil_load_store_base,
1424                 CR_ISP_STENCIL_LOAD_BASE,
1425                 value) {
1426      value.addr = job->stencil_addr;
1427
1428      /* FIXME: May need to set value.enable to true. */
1429   }
1430
1431   pvr_csb_pack (&state->regs.tpu_border_colour_table,
1432                 CR_TPU_BORDER_COLOUR_TABLE_PDM,
1433                 value) {
1434      value.border_colour_table_address = job->border_colour_table_addr;
1435   }
1436
1437   state->regs.isp_oclqry_base = 0;
1438
1439   pvr_csb_pack (&state->regs.isp_dbias_base, CR_ISP_DBIAS_BASE, value) {
1440      value.addr = job->depth_bias_table_addr;
1441   }
1442
1443   pvr_csb_pack (&state->regs.isp_scissor_base, CR_ISP_SCISSOR_BASE, value) {
1444      value.addr = job->scissor_table_addr;
1445   }
1446
1447   pvr_csb_pack (&state->regs.event_pixel_pds_info,
1448                 CR_EVENT_PIXEL_PDS_INFO,
1449                 value) {
1450      value.const_size =
1451         DIV_ROUND_UP(ctx->device->pixel_event_data_size_in_dwords,
1452                      PVRX(CR_EVENT_PIXEL_PDS_INFO_CONST_SIZE_UNIT_SIZE));
1453      value.temp_stride = 0;
1454      value.usc_sr_size =
1455         DIV_ROUND_UP(PVR_STATE_PBE_DWORDS,
1456                      PVRX(CR_EVENT_PIXEL_PDS_INFO_USC_SR_SIZE_UNIT_SIZE));
1457   }
1458
1459   pvr_csb_pack (&state->regs.event_pixel_pds_data,
1460                 CR_EVENT_PIXEL_PDS_DATA,
1461                 value) {
1462      value.addr = PVR_DEV_ADDR(job->pds_pixel_event_data_offset);
1463   }
1464
1465   STATIC_ASSERT(ARRAY_SIZE(state->regs.pbe_word) ==
1466                 ARRAY_SIZE(job->pbe_reg_words));
1467   STATIC_ASSERT(ARRAY_SIZE(state->regs.pbe_word[0]) ==
1468                 ARRAY_SIZE(job->pbe_reg_words[0]));
1469
1470   for (uint32_t i = 0; i < ARRAY_SIZE(job->pbe_reg_words); i++) {
1471      state->regs.pbe_word[i][0] = job->pbe_reg_words[i][0];
1472      state->regs.pbe_word[i][1] = job->pbe_reg_words[i][1];
1473      state->regs.pbe_word[i][2] = job->pbe_reg_words[i][2];
1474   }
1475
1476   STATIC_ASSERT(__same_type(state->regs.pds_bgnd, job->pds_bgnd_reg_values));
1477   typed_memcpy(state->regs.pds_bgnd,
1478                job->pds_bgnd_reg_values,
1479                ARRAY_SIZE(state->regs.pds_bgnd));
1480
1481   memset(state->regs.pds_pr_bgnd, 0, sizeof(state->regs.pds_pr_bgnd));
1482
1483   /* FIXME: Merge geometry and fragment flags into a single flags member? */
1484   /* FIXME: move to its own function? */
1485   state->flags = 0;
1486
1487   if (job->depth_addr.addr)
1488      state->flags |= PVR_WINSYS_FRAG_FLAG_DEPTH_BUFFER_PRESENT;
1489
1490   if (job->stencil_addr.addr)
1491      state->flags |= PVR_WINSYS_FRAG_FLAG_STENCIL_BUFFER_PRESENT;
1492
1493   if (job->disable_compute_overlap)
1494      state->flags |= PVR_WINSYS_FRAG_FLAG_PREVENT_CDM_OVERLAP;
1495
1496   if (job->frag_uses_atomic_ops)
1497      state->flags |= PVR_WINSYS_FRAG_FLAG_SINGLE_CORE;
1498
1499   state->zls_stride = job->depth_layer_size;
1500   state->sls_stride = job->depth_layer_size;
1501}
1502
1503static void pvr_render_job_ws_submit_info_init(
1504   struct pvr_render_ctx *ctx,
1505   struct pvr_render_job *job,
1506   const struct pvr_winsys_job_bo *bos,
1507   uint32_t bo_count,
1508   struct vk_sync **waits,
1509   uint32_t wait_count,
1510   uint32_t *stage_flags,
1511   struct pvr_winsys_render_submit_info *submit_info)
1512{
1513   memset(submit_info, 0, sizeof(*submit_info));
1514
1515   submit_info->rt_dataset = job->rt_dataset->ws_rt_dataset;
1516   submit_info->rt_data_idx = job->rt_dataset->rt_data_idx;
1517
1518   submit_info->frame_num = ctx->device->global_queue_present_count;
1519   submit_info->job_num = ctx->device->global_queue_job_count;
1520
1521   submit_info->run_frag = job->run_frag;
1522
1523   submit_info->bos = bos;
1524   submit_info->bo_count = bo_count;
1525
1526   submit_info->waits = waits;
1527   submit_info->wait_count = wait_count;
1528   submit_info->stage_flags = stage_flags;
1529
1530   /* FIXME: add WSI image bos. */
1531
1532   pvr_render_job_ws_geometry_state_init(ctx, job, &submit_info->geometry);
1533   pvr_render_job_ws_fragment_state_init(ctx, job, &submit_info->fragment);
1534
1535   /* These values are expected to match. */
1536   assert(submit_info->geometry.regs.tpu == submit_info->fragment.regs.tpu);
1537}
1538
1539VkResult pvr_render_job_submit(struct pvr_render_ctx *ctx,
1540                               struct pvr_render_job *job,
1541                               const struct pvr_winsys_job_bo *bos,
1542                               uint32_t bo_count,
1543                               struct vk_sync **waits,
1544                               uint32_t wait_count,
1545                               uint32_t *stage_flags,
1546                               struct vk_sync *signal_sync_geom,
1547                               struct vk_sync *signal_sync_frag)
1548{
1549   struct pvr_rt_dataset *rt_dataset = job->rt_dataset;
1550   struct pvr_winsys_render_submit_info submit_info;
1551   struct pvr_device *device = ctx->device;
1552   VkResult result;
1553
1554   pvr_render_job_ws_submit_info_init(ctx,
1555                                      job,
1556                                      bos,
1557                                      bo_count,
1558                                      waits,
1559                                      wait_count,
1560                                      stage_flags,
1561                                      &submit_info);
1562
1563   result = device->ws->ops->render_submit(ctx->ws_ctx,
1564                                           &submit_info,
1565                                           signal_sync_geom,
1566                                           signal_sync_frag);
1567   if (result != VK_SUCCESS)
1568      return result;
1569
1570   if (job->run_frag) {
1571      /* Move to the next render target data now that a fragment job has been
       * successfully submitted. This allows the next geometry job to be
       * submitted and run in parallel with it.
1574       */
1575      rt_dataset->rt_data_idx =
1576         (rt_dataset->rt_data_idx + 1) % ARRAY_SIZE(rt_dataset->rt_datas);
1577
1578      rt_dataset->need_frag = false;
1579   } else {
1580      rt_dataset->need_frag = true;
1581   }
1582
1583   return VK_SUCCESS;
1584}
1585