/*
 * Copyright © 2022 Imagination Technologies Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include "hwdef/rogue_hw_utils.h"
#include "pvr_bo.h"
#include "pvr_device_info.h"
#include "pvr_formats.h"
#include "pvr_hw_pass.h"
#include "pvr_pds.h"
#include "pvr_private.h"
#include "pvr_usc_fragment_shader.h"
#include "rogue/rogue.h"
#include "vk_alloc.h"
#include "vk_format.h"
#include "vk_log.h"

/*****************************************************************************
  PDS pre-baked program generation parameters and variables.
*****************************************************************************/
/* These values would normally be produced by the compiler or other code. We
 * hard-code them here for now just to speed things up. All of them should
 * eventually be removed.
 */

static const struct {
   /* Indicates the number of temporaries used by the shader. */
   uint32_t temp_count;
   enum rogue_msaa_mode msaa_mode;
   /* Indicates the presence of the PHAS instruction. */
   bool has_phase_rate_change;
} pvr_pds_fragment_program_params = {
   .temp_count = 0,
   .msaa_mode = ROGUE_MSAA_MODE_PIXEL,
   .has_phase_rate_change = false,
};

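/* Returns true if any of the subpass's input attachments is multisampled.
 * Used when populating flush_on_dep in pvr_CreateRenderPass2(): a subpass
 * that reads an MSAA input attachment is flushed on its dependencies rather
 * than merged with them.
 */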
static inline bool pvr_subpass_has_msaa_input_attachment(
   struct pvr_render_subpass *subpass,
   const VkRenderPassCreateInfo2 *pCreateInfo)
{
   for (uint32_t i = 0; i < subpass->input_count; i++) {
      const uint32_t attachment = subpass->input_attachments[i];

      /* Skip unused input attachments; they have no description to check. */
      if (attachment == VK_ATTACHMENT_UNUSED)
         continue;

      if (pCreateInfo->pAttachments[attachment].samples > 1)
         return true;
   }

   return false;
}

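/* Total number of attachment references in a subpass description. Used to
 * size the single subpass_attachments allocation in pvr_CreateRenderPass2().
 */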
static inline size_t
pvr_num_subpass_attachments(const VkSubpassDescription2 *desc)
{
   return desc->inputAttachmentCount + desc->colorAttachmentCount +
          (desc->pResolveAttachments ? desc->colorAttachmentCount : 0) +
          (desc->pDepthStencilAttachment != NULL);
}

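/* Decides whether the init (load/clear) ops of a HW render need to be
 * executed as a separate, flushed fragment pass. As explained in the body,
 * leaving them unflushed risks HSR obscuring and eliminating them.
 */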
static bool pvr_is_subpass_initops_flush_needed(
   const struct pvr_render_pass *pass,
   const struct pvr_renderpass_hwsetup_render *hw_render)
{
   struct pvr_render_subpass *subpass = &pass->subpasses[0];
   uint32_t render_loadop_mask = 0;
   uint32_t color_attachment_mask;

   for (uint32_t i = 0; i < hw_render->color_init_count; i++) {
      if (hw_render->color_init[i].op != RENDERPASS_SURFACE_INITOP_NOP)
         render_loadop_mask |= (1 << hw_render->color_init[i].driver_id);
   }

   /* If there are no load ops then there's nothing to flush. */
   if (render_loadop_mask == 0)
      return false;

   /* If the first subpass has any input attachments, they need to be
    * initialized with the result of the load op. Since the input attachment
    * may be read from fragments with an opaque pass type, the load ops must be
    * flushed or else they would be obscured and eliminated by HSR.
    */
   if (subpass->input_count != 0)
      return true;

   color_attachment_mask = 0;

   for (uint32_t i = 0; i < subpass->color_count; i++) {
      const int32_t color_idx = subpass->color_attachments[i];

      if (color_idx != -1)
         color_attachment_mask |= (1 << pass->attachments[color_idx].index);
   }

   /* If the first subpass does not write to all attachments which have a load
    * op then the load ops need to be flushed to ensure they don't get obscured
    * and removed by HSR.
    */
   return (render_loadop_mask & color_attachment_mask) != render_loadop_mask;
}

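/* Assigns each subpass its userpass spawn index: its position within the HW
 * render that contains it, offset by one when that render's init ops must be
 * flushed as an extra pass ahead of the first subpass.
 */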
static void
pvr_init_subpass_userpass_spawn(struct pvr_renderpass_hwsetup *hw_setup,
                                struct pvr_render_pass *pass,
                                struct pvr_render_subpass *subpasses)
{
   uint32_t subpass_idx = 0;

   for (uint32_t i = 0; i < hw_setup->render_count; i++) {
      struct pvr_renderpass_hwsetup_render *hw_render = &hw_setup->renders[i];
      const uint32_t initial_userpass_spawn =
         pvr_is_subpass_initops_flush_needed(pass, hw_render) ? 1 : 0;

      for (uint32_t j = 0; j < hw_render->subpass_count; j++) {
         subpasses[subpass_idx].userpass_spawn = j + initial_userpass_spawn;
         subpass_idx++;
      }
   }

   assert(subpass_idx == pass->subpass_count);
}

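/* Returns true if the initial MRT setup of a HW render writes to at least one
 * USC output register rather than only to other MRT resource types.
 */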
static inline bool pvr_has_output_register_writes(
   const struct pvr_renderpass_hwsetup_render *hw_render)
{
   for (uint32_t i = 0; i < hw_render->init_setup.render_targets_count; i++) {
      struct usc_mrt_resource *mrt_resource =
         &hw_render->init_setup.mrt_resources[i];

      if (mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REGISTER)
         return true;
   }

   return false;
}

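/* Generates and uploads the PDS uniform/texture state ("unitex") code segment
 * for the given numbers of texture and uniform DMA kicks. Only a code segment
 * is uploaded; the program has no data segment.
 */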
VkResult pvr_pds_unitex_state_program_create_and_upload(
   struct pvr_device *device,
   const VkAllocationCallbacks *allocator,
   uint32_t texture_kicks,
   uint32_t uniform_kicks,
   struct pvr_pds_upload *const pds_upload_out)
{
   struct pvr_pds_pixel_shader_sa_program program = {
      .num_texture_dma_kicks = texture_kicks,
      .num_uniform_dma_kicks = uniform_kicks,
   };
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   pvr_pds_set_sizes_pixel_shader_uniform_texture_code(&program);

   staging_buffer_size = program.code_size * sizeof(*staging_buffer);

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8U,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pvr_pds_generate_pixel_shader_sa_code_segment(&program, staging_buffer);

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               NULL,
                               0U,
                               0U,
                               staging_buffer,
                               program.code_size,
                               16U,
                               16U,
                               pds_upload_out);

   /* The upload copies the code, so the staging buffer is freed on both the
    * success and failure paths.
    */
   vk_free2(&device->vk.alloc, allocator, staging_buffer);

   return result;
}

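/* Creates the load op for a HW render: builds the clear mask from the render's
 * color init ops, then uploads the USC fragment shader along with its PDS
 * fragment and texture state programs.
 */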
static VkResult
pvr_load_op_create(struct pvr_device *device,
                   const VkAllocationCallbacks *allocator,
                   struct pvr_renderpass_hwsetup_render *hw_render,
                   struct pvr_load_op **const load_op_out)
{
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   const uint32_t cache_line_size = rogue_get_slc_cache_line_size(dev_info);
   struct pvr_load_op *load_op;
   VkResult result;

   load_op = vk_zalloc2(&device->vk.alloc,
                        allocator,
                        sizeof(*load_op),
                        8,
                        VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!load_op)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   for (uint32_t i = 0; i < hw_render->color_init_count; i++) {
      struct pvr_renderpass_colorinit *color_init = &hw_render->color_init[i];

      if (color_init->op == RENDERPASS_SURFACE_INITOP_CLEAR)
         load_op->clear_mask |= 1U << i;
      else if (color_init->op == RENDERPASS_SURFACE_INITOP_LOAD)
         pvr_finishme("Missing 'load' load op");
   }

   result = pvr_gpu_upload_usc(device,
                               pvr_usc_fragment_shader,
                               sizeof(pvr_usc_fragment_shader),
                               cache_line_size,
                               &load_op->usc_frag_prog_bo);
   if (result != VK_SUCCESS)
      goto err_free_load_op;

   result = pvr_pds_fragment_program_create_and_upload(
      device,
      allocator,
      load_op->usc_frag_prog_bo,
      pvr_pds_fragment_program_params.temp_count,
      pvr_pds_fragment_program_params.msaa_mode,
      pvr_pds_fragment_program_params.has_phase_rate_change,
      &load_op->pds_frag_prog);
   if (result != VK_SUCCESS)
      goto err_free_usc_frag_prog_bo;

   result = pvr_pds_unitex_state_program_create_and_upload(
      device,
      allocator,
      1U,
      0U,
      &load_op->pds_tex_state_prog);
   if (result != VK_SUCCESS)
      goto err_free_pds_frag_prog;

   load_op->is_hw_object = true;
   /* FIXME: These should be based on the USC and PDS programs, but are hard
    * coded for now.
    */
   load_op->const_shareds_count = 1;
   load_op->shareds_dest_offset = 0;
   load_op->shareds_count = 1;
   load_op->temps_count = 1;

   *load_op_out = load_op;

   return VK_SUCCESS;

err_free_pds_frag_prog:
   pvr_bo_free(device, load_op->pds_frag_prog.pvr_bo);

err_free_usc_frag_prog_bo:
   pvr_bo_free(device, load_op->usc_frag_prog_bo);

err_free_load_op:
   vk_free2(&device->vk.alloc, allocator, load_op);

   return result;
}

static void pvr_load_op_destroy(struct pvr_device *device,
                                const VkAllocationCallbacks *allocator,
                                struct pvr_load_op *load_op)
{
   pvr_bo_free(device, load_op->pds_tex_state_prog.pvr_bo);
   pvr_bo_free(device, load_op->pds_frag_prog.pvr_bo);
   pvr_bo_free(device, load_op->usc_frag_prog_bo);
   vk_free2(&device->vk.alloc, allocator, load_op);
}

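/* Number of tile buffers the SPM load programs can read from. Cores with the
 * eight_output_registers feature presumably need fewer in-memory tile buffers
 * since more per-pixel data fits in output registers; both values are
 * hard-coded for now.
 */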
#define PVR_SPM_LOAD_IN_BUFFERS_COUNT(dev_info)              \
   ({                                                        \
      int __ret = 7U;                                        \
      if (PVR_HAS_FEATURE(dev_info, eight_output_registers)) \
         __ret = 3U;                                         \
      __ret;                                                 \
   })

VkResult pvr_CreateRenderPass2(VkDevice _device,
                               const VkRenderPassCreateInfo2 *pCreateInfo,
                               const VkAllocationCallbacks *pAllocator,
                               VkRenderPass *pRenderPass)
{
   struct pvr_render_pass_attachment *attachments;
   PVR_FROM_HANDLE(pvr_device, device, _device);
   struct pvr_render_subpass *subpasses;
   size_t subpass_attachment_count;
   uint32_t *subpass_attachments;
   struct pvr_render_pass *pass;
   uint32_t *dep_list;
   bool *flush_on_dep;
   VkResult result;

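   /* Allocate the pass and all of its trailing arrays as a single block so
    * that everything can be released with one vk_free2() call.
    */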
   VK_MULTIALLOC(ma);
   vk_multialloc_add(&ma, &pass, __typeof__(*pass), 1);
   vk_multialloc_add(&ma,
                     &attachments,
                     __typeof__(*attachments),
                     pCreateInfo->attachmentCount);
   vk_multialloc_add(&ma,
                     &subpasses,
                     __typeof__(*subpasses),
                     pCreateInfo->subpassCount);

   subpass_attachment_count = 0;
   for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
      subpass_attachment_count +=
         pvr_num_subpass_attachments(&pCreateInfo->pSubpasses[i]);
   }

   vk_multialloc_add(&ma,
                     &subpass_attachments,
                     __typeof__(*subpass_attachments),
                     subpass_attachment_count);
   vk_multialloc_add(&ma,
                     &dep_list,
                     __typeof__(*dep_list),
                     pCreateInfo->dependencyCount);
   vk_multialloc_add(&ma,
                     &flush_on_dep,
                     __typeof__(*flush_on_dep),
                     pCreateInfo->dependencyCount);

   if (!vk_multialloc_zalloc2(&ma,
                              &device->vk.alloc,
                              pAllocator,
                              VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) {
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   vk_object_base_init(&device->vk, &pass->base, VK_OBJECT_TYPE_RENDER_PASS);
   pass->attachment_count = pCreateInfo->attachmentCount;
   pass->attachments = attachments;
   pass->subpass_count = pCreateInfo->subpassCount;
   pass->subpasses = subpasses;
   pass->max_sample_count = 1;

   /* Copy attachment descriptions. */
   for (uint32_t i = 0; i < pass->attachment_count; i++) {
      const VkAttachmentDescription2 *desc = &pCreateInfo->pAttachments[i];
      struct pvr_render_pass_attachment *attachment = &pass->attachments[i];

      pvr_assert(!(desc->flags & ~VK_ATTACHMENT_DESCRIPTION_MAY_ALIAS_BIT));

      attachment->load_op = desc->loadOp;
      attachment->store_op = desc->storeOp;

      /* The format must be set before it is used for the stencil check. */
      attachment->vk_format = desc->format;

      attachment->has_stencil = vk_format_has_stencil(attachment->vk_format);
      if (attachment->has_stencil) {
         attachment->stencil_load_op = desc->stencilLoadOp;
         attachment->stencil_store_op = desc->stencilStoreOp;
      }

      attachment->sample_count = desc->samples;
      attachment->initial_layout = desc->initialLayout;
      attachment->is_pbe_downscalable =
         pvr_format_is_pbe_downscalable(attachment->vk_format);
      attachment->index = i;

      if (attachment->sample_count > pass->max_sample_count)
         pass->max_sample_count = attachment->sample_count;
   }

   /* Count how many dependencies each subpass has. */
   for (uint32_t i = 0; i < pCreateInfo->dependencyCount; i++) {
      const VkSubpassDependency2 *dep = &pCreateInfo->pDependencies[i];

      if (dep->srcSubpass != VK_SUBPASS_EXTERNAL &&
          dep->dstSubpass != VK_SUBPASS_EXTERNAL &&
          dep->srcSubpass != dep->dstSubpass) {
         pass->subpasses[dep->dstSubpass].dep_count++;
      }
   }

   /* Assign each subpass its slices of the shared arrays and fill in the
    * attachment lists. The per-subpass dependency lists are filled in later,
    * on a second walk of the dependencies array.
    */
   for (uint32_t i = 0; i < pass->subpass_count; i++) {
      const VkSubpassDescription2 *desc = &pCreateInfo->pSubpasses[i];
      struct pvr_render_subpass *subpass = &pass->subpasses[i];

      subpass->pipeline_bind_point = desc->pipelineBindPoint;
      subpass->sample_count = 1;

      subpass->color_count = desc->colorAttachmentCount;
      if (subpass->color_count > 0) {
         bool has_used_color_attachment = false;
         uint32_t index;

         subpass->color_attachments = subpass_attachments;
         subpass_attachments += subpass->color_count;

         for (uint32_t j = 0; j < subpass->color_count; j++) {
            subpass->color_attachments[j] =
               desc->pColorAttachments[j].attachment;

            if (subpass->color_attachments[j] == VK_ATTACHMENT_UNUSED)
               continue;

            index = subpass->color_attachments[j];
            subpass->sample_count = pass->attachments[index].sample_count;
            has_used_color_attachment = true;
         }

         if (!has_used_color_attachment && desc->pDepthStencilAttachment &&
             desc->pDepthStencilAttachment->attachment !=
                VK_ATTACHMENT_UNUSED) {
            index = desc->pDepthStencilAttachment->attachment;
            subpass->sample_count = pass->attachments[index].sample_count;
         }
      }

      if (desc->pResolveAttachments) {
         subpass->resolve_attachments = subpass_attachments;
         subpass_attachments += subpass->color_count;

         for (uint32_t j = 0; j < subpass->color_count; j++) {
            subpass->resolve_attachments[j] =
               desc->pResolveAttachments[j].attachment;
         }
      }

      subpass->input_count = desc->inputAttachmentCount;
      if (subpass->input_count > 0) {
         subpass->input_attachments = subpass_attachments;
         subpass_attachments += subpass->input_count;

         for (uint32_t j = 0; j < subpass->input_count; j++) {
            subpass->input_attachments[j] =
               desc->pInputAttachments[j].attachment;
         }
      }

      if (desc->pDepthStencilAttachment) {
         subpass->depth_stencil_attachment = subpass_attachments++;
         *subpass->depth_stencil_attachment =
            desc->pDepthStencilAttachment->attachment;
      }

      /* Give the subpass its slices of the dep_list and flush_on_dep
       * arrays.
       */
      subpass->dep_list = dep_list;
      dep_list += subpass->dep_count;
      subpass->flush_on_dep = flush_on_dep;
      flush_on_dep += subpass->dep_count;

      /* Reset the dependency count so the second walk can use it as an index
       * into the dependency arrays.
       */
      subpass->dep_count = 0;
      subpass->index = i;
   }

   /* Compute dependencies and populate dep_list and flush_on_dep. */
   for (uint32_t i = 0; i < pCreateInfo->dependencyCount; i++) {
      const VkSubpassDependency2 *dep = &pCreateInfo->pDependencies[i];

      if (dep->srcSubpass != VK_SUBPASS_EXTERNAL &&
          dep->dstSubpass != VK_SUBPASS_EXTERNAL &&
          dep->srcSubpass != dep->dstSubpass) {
         struct pvr_render_subpass *subpass = &pass->subpasses[dep->dstSubpass];

         subpass->dep_list[subpass->dep_count] = dep->srcSubpass;
         if (pvr_subpass_has_msaa_input_attachment(subpass, pCreateInfo))
            subpass->flush_on_dep[subpass->dep_count] = true;

         subpass->dep_count++;
      }
   }

   pass->max_tilebuffer_count =
      PVR_SPM_LOAD_IN_BUFFERS_COUNT(&device->pdevice->dev_info);

   pass->hw_setup = pvr_create_renderpass_hwsetup(device, pass, false);
   if (!pass->hw_setup) {
      result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto err_free_pass;
   }

   pvr_init_subpass_userpass_spawn(pass->hw_setup, pass, pass->subpasses);

   for (uint32_t i = 0; i < pass->hw_setup->render_count; i++) {
      struct pvr_renderpass_hwsetup_render *hw_render =
         &pass->hw_setup->renders[i];
      struct pvr_load_op *load_op = NULL;

      if (hw_render->tile_buffers_count)
         pvr_finishme("Set up tile buffer table");

      if (!hw_render->color_init_count) {
         assert(!hw_render->client_data);
         continue;
      }

      if (!pvr_has_output_register_writes(hw_render))
         pvr_finishme("Add output register write");

      result = pvr_load_op_create(device, pAllocator, hw_render, &load_op);
      if (result != VK_SUCCESS)
         goto err_load_op_destroy;

      hw_render->client_data = load_op;
   }

   *pRenderPass = pvr_render_pass_to_handle(pass);

   return VK_SUCCESS;

err_load_op_destroy:
   for (uint32_t i = 0; i < pass->hw_setup->render_count; i++) {
      struct pvr_renderpass_hwsetup_render *hw_render =
         &pass->hw_setup->renders[i];

      if (hw_render->client_data)
         pvr_load_op_destroy(device, pAllocator, hw_render->client_data);
   }

   pvr_destroy_renderpass_hwsetup(device, pass->hw_setup);

err_free_pass:
   vk_object_base_finish(&pass->base);
   vk_free2(&device->vk.alloc, pAllocator, pass);

   return result;
}

void pvr_DestroyRenderPass(VkDevice _device,
                           VkRenderPass _pass,
                           const VkAllocationCallbacks *pAllocator)
{
   PVR_FROM_HANDLE(pvr_device, device, _device);
   PVR_FROM_HANDLE(pvr_render_pass, pass, _pass);

   if (!pass)
      return;

   for (uint32_t i = 0; i < pass->hw_setup->render_count; i++) {
      struct pvr_renderpass_hwsetup_render *hw_render =
         &pass->hw_setup->renders[i];

      /* Renders without color init ops never had a load op created. */
      if (hw_render->client_data)
         pvr_load_op_destroy(device, pAllocator, hw_render->client_data);
   }

   pvr_destroy_renderpass_hwsetup(device, pass->hw_setup);
   vk_object_base_finish(&pass->base);
   vk_free2(&device->vk.alloc, pAllocator, pass);
}

void pvr_GetRenderAreaGranularity(VkDevice _device,
                                  VkRenderPass renderPass,
                                  VkExtent2D *pGranularity)
{
   PVR_FROM_HANDLE(pvr_device, device, _device);
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;

   /* Granularity does not depend on any settings in the render pass, so return
    * the tile granularity.
    *
    * The default value is based on the minimum value found in all existing
    * cores.
    */
   pGranularity->width = PVR_GET_FEATURE_VALUE(dev_info, tile_size_x, 16);
   pGranularity->height = PVR_GET_FEATURE_VALUE(dev_info, tile_size_y, 16);
}